
Commit f690372

[Core] Update dtype detection and defaults (#14858)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 8b3e94a commit f690372

File tree: 22 files changed (+175 / -227 lines)

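At its core, this commit stops hard-coding half precision in the test runners and instead resolves dtype="auto" from the model's Hugging Face config. A minimal sketch of that resolution, assuming the _get_and_verify_dtype(config, dtype) helper imported in tests/conftest.py below behaves as its name suggests (the model id is only an example):

from transformers import AutoConfig

from vllm.config import _get_and_verify_dtype

# Load the HF config for a checkpoint (example model; any HF model id works).
config = AutoConfig.from_pretrained("BAAI/bge-multilingual-gemma2",
                                    trust_remote_code=True)

# "auto" defers to the dtype recorded in the checkpoint config; passing an
# explicit string such as "bfloat16" overrides it.
torch_dtype = _get_and_verify_dtype(config, "auto")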

tests/compile/test_basic_correctness.py

Lines changed: 1 addition & 1 deletion

@@ -60,7 +60,7 @@ class TestSetting:
     # embedding model
     TestSetting(
         model="BAAI/bge-multilingual-gemma2",
-        model_args=["--task", "embed"],
+        model_args=["--task", "embed", "--dtype", "bfloat16"],
         pp_size=1,
         tp_size=1,
         attn_backend="FLASH_ATTN",

tests/conftest.py

Lines changed: 63 additions & 53 deletions

@@ -14,16 +14,16 @@
 import torch.nn.functional as F
 from huggingface_hub import snapshot_download
 from PIL import Image
-from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
-                          BatchFeature)
+from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
+                          BatchEncoding, BatchFeature)
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
 from tests.models.utils import (TokensTextLogprobs,
                                 TokensTextLogprobsPromptLogprobs)
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TaskOption, TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                               init_distributed_environment,
@@ -34,8 +34,7 @@
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        identity, is_list_of)
+from vllm.utils import cuda_device_count_stateless, is_list_of
 
 logger = init_logger(__name__)
 
@@ -271,14 +270,18 @@ def video_assets() -> _VideoAssets:
 
 class HfRunner:
 
-    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+    def get_default_device(self):
         from vllm.platforms import current_platform
+
+        return ("cpu" if current_platform.is_cpu()
+                or current_platform.is_openvino() else "cuda")
+
+    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
         if x is None or isinstance(x, (bool, )):
             return x
 
         if device is None:
-            device = "cpu" if current_platform.is_cpu(
-            ) or current_platform.is_openvino() else "cuda"
+            device = self.device
 
         if isinstance(x, dict):
             return {k: self.wrap_device(v, device) for k, v in x.items()}
@@ -291,45 +294,59 @@ def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
     def __init__(
         self,
         model_name: str,
-        dtype: str = "half",
+        dtype: str = "auto",
         *,
         model_kwargs: Optional[dict[str, Any]] = None,
         is_sentence_transformer: bool = False,
         is_cross_encoder: bool = False,
         skip_tokenizer_init: bool = False,
         auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
-        postprocess_inputs: Callable[..., BatchEncoding] = identity,
     ) -> None:
-        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-
         self.model_name = model_name
 
+        self.config = AutoConfig.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+        )
+        self.device = self.get_default_device()
+        self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
+
+        model_kwargs = model_kwargs if model_kwargs is not None else {}
+        model_kwargs.setdefault("torch_dtype", torch_dtype)
+
         if is_sentence_transformer:
             # Lazy init required for AMD CI
             from sentence_transformers import SentenceTransformer
-            self.model = self.wrap_device(
-                SentenceTransformer(
-                    model_name,
-                    device="cpu",
-                    trust_remote_code=True,
-                ).to(dtype=torch_dtype))
+
+            self.model = SentenceTransformer(
+                model_name,
+                device=self.device,
+                model_kwargs=model_kwargs,
+                trust_remote_code=True,
+            )
         elif is_cross_encoder:
             # Lazy init required for AMD CI
             from sentence_transformers import CrossEncoder
-            self.model = CrossEncoder(model_name,
-                                      device="cpu",
-                                      trust_remote_code=True)
-            self.model.model = self.wrap_device(self.model.model)\
-                .to(dtype=torch_dtype)
+
+            self.model = CrossEncoder(
+                model_name,
+                device=self.device,
+                automodel_args=model_kwargs,
+                trust_remote_code=True,
+            )
         else:
-            model_kwargs = model_kwargs if model_kwargs is not None else {}
-            self.model = self.wrap_device(
-                auto_cls.from_pretrained(
-                    model_name,
-                    torch_dtype=torch_dtype,
-                    trust_remote_code=True,
-                    **model_kwargs,
-                ))
+            model = auto_cls.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                **model_kwargs,
+            )
+
+            if (getattr(model, "quantization_method", None) != "bitsandbytes"
+                    and len({p.device
+                             for p in model.parameters()}) < 2):
+                model = model.to(self.device)
+
+            self.model = model
 
         if not skip_tokenizer_init:
             self.tokenizer = AutoTokenizer.from_pretrained(
@@ -349,16 +366,13 @@ def __init__(
         if skip_tokenizer_init:
             self.tokenizer = self.processor.tokenizer
 
-        self.dtype = dtype
-        self.postprocess_inputs = postprocess_inputs
-
     def get_inputs(
         self,
         prompts: list[str],
         images: Optional[PromptImageInput] = None,
         videos: Optional[PromptVideoInput] = None,
         audios: Optional[PromptAudioInput] = None,
-    ) -> list[BatchEncoding]:
+    ) -> list[Union[BatchFeature, BatchEncoding]]:
         if images is not None:
             assert len(prompts) == len(images)
 
@@ -368,7 +382,7 @@ def get_inputs(
         if audios is not None:
             assert len(prompts) == len(audios)
 
-        all_inputs: list[BatchEncoding] = []
+        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
         for i, prompt in enumerate(prompts):
             processor_kwargs: dict[str, Any] = {
                 "text": prompt,
@@ -384,7 +398,8 @@ def get_inputs(
                 processor_kwargs["sampling_rate"] = sr
 
             inputs = self.processor(**processor_kwargs)
-            inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
+            if isinstance(inputs, BatchFeature):
+                inputs = inputs.to(dtype=self.dtype)
 
             all_inputs.append(inputs)
 
@@ -417,7 +432,7 @@ def generate(
         outputs: list[tuple[list[list[int]], list[str]]] = []
         for inputs in all_inputs:
             output_ids = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 **kwargs,
             )
@@ -488,7 +503,7 @@ def generate_greedy_logprobs(
         all_logprobs: list[list[torch.Tensor]] = []
         for inputs in all_inputs:
             output = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
@@ -569,7 +584,7 @@ def generate_greedy_logprobs_limit(
 
         for inputs in all_inputs:
             output = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
@@ -620,19 +635,15 @@ def generate_encoder_decoder_greedy_logprobs_limit(
             if images is not None and images[i] is not None:
                 processor_kwargs["images"] = images[i]
 
-            encoder_inputs = self.wrap_device(
-                self.processor(**processor_kwargs),
-                device=self.model.device.type,
-            )
+            encoder_inputs = self.processor(**processor_kwargs)
+            encoder_inputs = self.wrap_device(encoder_inputs)
 
             if decoder_prompt is None:
                 decoder_input_ids = None
             else:
-                decoder_input_ids = self.wrap_device(
-                    self.tokenizer(decoder_prompt,
-                                   return_tensors="pt").input_ids,
-                    device=self.model.device.type,
-                )
+                decoder_inputs = self.tokenizer(decoder_prompt,
+                                                return_tensors="pt")
+                decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
 
             output = self.model.generate(
                 decoder_input_ids=decoder_input_ids,
@@ -684,6 +695,7 @@ class VllmRunner:
     """
     The default value of some arguments have been modified from
     :class:`~vllm.LLM` as follows:
+
    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
@@ -701,10 +713,8 @@ def __init__(
         tokenizer_mode: str = "auto",
         trust_remote_code: bool = True,
         seed: Optional[int] = 0,
-        # Use smaller max model length, otherwise bigger model cannot run due
-        # to kv cache size limit.
         max_model_len: int = 1024,
-        dtype: str = "half",
+        dtype: str = "auto",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16,
@@ -1110,4 +1120,4 @@ def pytest_collection_modifyitems(config, items):
     skip_optional = pytest.mark.skip(reason="need --optional option to run")
     for item in items:
         if "optional" in item.keywords:
-            item.add_marker(skip_optional)
+            item.add_marker(skip_optional)
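For illustration, a rough sketch of how a test might drive the updated HfRunner after this change, assuming the generate_greedy helper and context-manager usage the rest of the test suite already relies on (the model id is only an example):

from tests.conftest import HfRunner

# dtype now defaults to "auto", so the HF model loads in the dtype declared
# by its checkpoint config instead of being forced to fp16.
with HfRunner("facebook/opt-125m") as hf_model:
    outputs = hf_model.generate_greedy(["Hello, my name is"], max_tokens=8)

# Tests that genuinely need a specific precision can still ask for it.
with HfRunner("facebook/opt-125m", dtype="bfloat16") as hf_model:
    outputs = hf_model.generate_greedy(["Hello, my name is"], max_tokens=8)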

tests/entrypoints/llm/test_chat.py

Lines changed: 0 additions & 1 deletion

@@ -64,7 +64,6 @@ def test_multi_chat():
 def test_chat_multi_image(image_urls: list[str]):
     llm = LLM(
         model="microsoft/Phi-3.5-vision-instruct",
-        dtype="bfloat16",
         max_model_len=4096,
         max_num_seqs=5,
         enforce_eager=True,

tests/entrypoints/openai/test_audio.py

Lines changed: 0 additions & 2 deletions

@@ -18,8 +18,6 @@
 @pytest.fixture(scope="module")
 def server():
     args = [
-        "--dtype",
-        "bfloat16",
         "--max-model-len",
         "2048",
         "--max-num-seqs",

tests/entrypoints/openai/test_video.py

Lines changed: 0 additions & 2 deletions

@@ -24,8 +24,6 @@ def server():
     args = [
         "--task",
         "generate",
-        "--dtype",
-        "bfloat16",
         "--max-model-len",
         "32768",
         "--max-num-seqs",

tests/entrypoints/openai/test_vision.py

Lines changed: 0 additions & 2 deletions

@@ -25,8 +25,6 @@ def server():
     args = [
         "--task",
         "generate",
-        "--dtype",
-        "bfloat16",
         "--max-model-len",
         "2048",
         "--max-num-seqs",

tests/entrypoints/openai/test_vision_embedding.py

Lines changed: 0 additions & 2 deletions

@@ -28,8 +28,6 @@ def server():
     args = [
         "--task",
         "embed",
-        "--dtype",
-        "bfloat16",
         "--max-model-len",
         "2048",
         "--max-num-seqs",

tests/entrypoints/test_chat_utils.py

Lines changed: 3 additions & 3 deletions

@@ -34,7 +34,7 @@ def phi3v_model_config():
        tokenizer=PHI3V_MODEL_ID,
        tokenizer_mode="auto",
        trust_remote_code=True,
-       dtype="bfloat16",
+       dtype="auto",
        seed=0,
        limit_mm_per_prompt={
            "image": 2,
@@ -58,7 +58,7 @@ def mllama_model_config():
        tokenizer=MLLAMA_MODEL_ID,
        tokenizer_mode="auto",
        trust_remote_code=True,
-       dtype="bfloat16",
+       dtype="auto",
        seed=0,
        limit_mm_per_prompt={
            "image": 2,
@@ -669,7 +669,7 @@ def get_conversation(is_hf: bool):
        tokenizer=MLLAMA_MODEL_ID,
        tokenizer_mode="auto",
        trust_remote_code=True,
-       dtype="bfloat16",
+       dtype="auto",
        seed=0,
        limit_mm_per_prompt={
            "image": 2,

tests/models/decoder_only/audio_language/test_ultravox.py

Lines changed: 2 additions & 13 deletions

@@ -5,11 +5,10 @@
 import numpy as np
 import pytest
 import pytest_asyncio
-from transformers import AutoModel, AutoTokenizer, BatchEncoding
+from transformers import AutoModel, AutoTokenizer
 
 from vllm.multimodal.audio import resample_audio
 from vllm.sequence import SampleLogprobs
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
 from ....conftest import HfRunner, VllmRunner
 from ....utils import RemoteOpenAIServer
@@ -107,8 +106,6 @@ def run_test(
     **kwargs,
 ):
     """Inference result should be the same between hf and vllm."""
-    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
     # if we run HF first, the cuda initialization will be done and it
@@ -124,15 +121,7 @@
         for vllm_prompt, _, audio in prompts_and_audios
     ]
 
-    def process(hf_inputs: BatchEncoding, **kwargs):
-        hf_inputs["audio_values"] = hf_inputs["audio_values"] \
-            .to(torch_dtype)  # type: ignore
-        return hf_inputs
-
-    with hf_runner(model,
-                   dtype=dtype,
-                   postprocess_inputs=process,
-                   auto_cls=AutoModel) as hf_model:
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
         hf_outputs_per_audio = [
             hf_model.generate_greedy_logprobs_limit(
                 [hf_prompt],
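The per-test postprocess_inputs hook is no longer needed because HfRunner.get_inputs (see the tests/conftest.py diff above) now casts any BatchFeature returned by the processor to the runner's resolved dtype. A small sketch of that cast using plain transformers and torch, with made-up feature values:

import torch
from transformers import BatchFeature

inputs = BatchFeature({
    "input_ids": torch.tensor([[1, 2, 3]]),  # integer tensors are left as-is
    "audio_values": torch.randn(1, 16000, dtype=torch.float32),
})

# BatchFeature.to(dtype=...) converts only floating-point tensors, which is
# what the removed process() helper above did by hand for "audio_values".
inputs = inputs.to(dtype=torch.bfloat16)
assert inputs["input_ids"].dtype == torch.int64
assert inputs["audio_values"].dtype == torch.bfloat16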
