
Commit f690372

[Core] Update dtype detection and defaults (#14858)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 8b3e94a commit f690372

File tree: 22 files changed (+175 / -227 lines)

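At its core, this commit stops hard-coding half precision in the test runners and instead resolves dtype="auto" from the model's Hugging Face config. A minimal sketch of that resolution, assuming the _get_and_verify_dtype(config, dtype) helper imported in tests/conftest.py below behaves as its name suggests (the model id is only an example):

from transformers import AutoConfig

from vllm.config import _get_and_verify_dtype

# Load the HF config for a checkpoint (example model; any HF model id works).
config = AutoConfig.from_pretrained("BAAI/bge-multilingual-gemma2",
                                    trust_remote_code=True)

# "auto" defers to the dtype recorded in the checkpoint config; passing an
# explicit string such as "bfloat16" overrides it.
torch_dtype = _get_and_verify_dtype(config, "auto")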

tests/compile/test_basic_correctness.py

Lines changed: 1 addition & 1 deletion

@@ -60,7 +60,7 @@ class TestSetting:
     # embedding model
     TestSetting(
         model="BAAI/bge-multilingual-gemma2",
-        model_args=["--task", "embed"],
+        model_args=["--task", "embed", "--dtype", "bfloat16"],
         pp_size=1,
         tp_size=1,
         attn_backend="FLASH_ATTN",

tests/conftest.py

Lines changed: 63 additions & 53 deletions

@@ -14,16 +14,16 @@
 import torch.nn.functional as F
 from huggingface_hub import snapshot_download
 from PIL import Image
-from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
-                          BatchFeature)
+from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
+                          BatchEncoding, BatchFeature)
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
 from tests.models.utils import (TokensTextLogprobs,
                                 TokensTextLogprobsPromptLogprobs)
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TaskOption, TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                               init_distributed_environment,
@@ -34,8 +34,7 @@
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        identity, is_list_of)
+from vllm.utils import cuda_device_count_stateless, is_list_of
 
 logger = init_logger(__name__)
 
@@ -271,14 +270,18 @@ def video_assets() -> _VideoAssets:
 
 class HfRunner:
 
-    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+    def get_default_device(self):
         from vllm.platforms import current_platform
+
+        return ("cpu" if current_platform.is_cpu()
+                or current_platform.is_openvino() else "cuda")
+
+    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
         if x is None or isinstance(x, (bool, )):
             return x
 
         if device is None:
-            device = "cpu" if current_platform.is_cpu(
-            ) or current_platform.is_openvino() else "cuda"
+            device = self.device
 
         if isinstance(x, dict):
             return {k: self.wrap_device(v, device) for k, v in x.items()}
@@ -291,45 +294,59 @@ def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
     def __init__(
         self,
         model_name: str,
-        dtype: str = "half",
+        dtype: str = "auto",
         *,
         model_kwargs: Optional[dict[str, Any]] = None,
         is_sentence_transformer: bool = False,
         is_cross_encoder: bool = False,
         skip_tokenizer_init: bool = False,
         auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
-        postprocess_inputs: Callable[..., BatchEncoding] = identity,
     ) -> None:
-        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-
         self.model_name = model_name
 
+        self.config = AutoConfig.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+        )
+        self.device = self.get_default_device()
+        self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
+
+        model_kwargs = model_kwargs if model_kwargs is not None else {}
+        model_kwargs.setdefault("torch_dtype", torch_dtype)
+
         if is_sentence_transformer:
             # Lazy init required for AMD CI
             from sentence_transformers import SentenceTransformer
-            self.model = self.wrap_device(
-                SentenceTransformer(
-                    model_name,
-                    device="cpu",
-                    trust_remote_code=True,
-                ).to(dtype=torch_dtype))
+
+            self.model = SentenceTransformer(
+                model_name,
+                device=self.device,
+                model_kwargs=model_kwargs,
+                trust_remote_code=True,
+            )
         elif is_cross_encoder:
             # Lazy init required for AMD CI
             from sentence_transformers import CrossEncoder
-            self.model = CrossEncoder(model_name,
-                                      device="cpu",
-                                      trust_remote_code=True)
-            self.model.model = self.wrap_device(self.model.model)\
-                .to(dtype=torch_dtype)
+
+            self.model = CrossEncoder(
+                model_name,
+                device=self.device,
+                automodel_args=model_kwargs,
+                trust_remote_code=True,
+            )
         else:
-            model_kwargs = model_kwargs if model_kwargs is not None else {}
-            self.model = self.wrap_device(
-                auto_cls.from_pretrained(
-                    model_name,
-                    torch_dtype=torch_dtype,
-                    trust_remote_code=True,
-                    **model_kwargs,
-                ))
+            model = auto_cls.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                **model_kwargs,
+            )
+
+            if (getattr(model, "quantization_method", None) != "bitsandbytes"
+                    and len({p.device
+                             for p in model.parameters()}) < 2):
+                model = model.to(self.device)
+
+            self.model = model
 
         if not skip_tokenizer_init:
             self.tokenizer = AutoTokenizer.from_pretrained(
@@ -349,16 +366,13 @@ def __init__(
         if skip_tokenizer_init:
             self.tokenizer = self.processor.tokenizer
 
-        self.dtype = dtype
-        self.postprocess_inputs = postprocess_inputs
-
     def get_inputs(
         self,
         prompts: list[str],
         images: Optional[PromptImageInput] = None,
         videos: Optional[PromptVideoInput] = None,
         audios: Optional[PromptAudioInput] = None,
-    ) -> list[BatchEncoding]:
+    ) -> list[Union[BatchFeature, BatchEncoding]]:
         if images is not None:
             assert len(prompts) == len(images)
 
@@ -368,7 +382,7 @@ def get_inputs(
         if audios is not None:
             assert len(prompts) == len(audios)
 
-        all_inputs: list[BatchEncoding] = []
+        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
         for i, prompt in enumerate(prompts):
             processor_kwargs: dict[str, Any] = {
                 "text": prompt,
@@ -384,7 +398,8 @@ def get_inputs(
                 processor_kwargs["sampling_rate"] = sr
 
             inputs = self.processor(**processor_kwargs)
-            inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
+            if isinstance(inputs, BatchFeature):
+                inputs = inputs.to(dtype=self.dtype)
 
             all_inputs.append(inputs)
 
@@ -417,7 +432,7 @@ def generate(
         outputs: list[tuple[list[list[int]], list[str]]] = []
         for inputs in all_inputs:
             output_ids = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 **kwargs,
             )
@@ -488,7 +503,7 @@ def generate_greedy_logprobs(
         all_logprobs: list[list[torch.Tensor]] = []
         for inputs in all_inputs:
             output = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
@@ -569,7 +584,7 @@ def generate_greedy_logprobs_limit(
 
         for inputs in all_inputs:
             output = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
@@ -620,19 +635,15 @@ def generate_encoder_decoder_greedy_logprobs_limit(
             if images is not None and images[i] is not None:
                 processor_kwargs["images"] = images[i]
 
-            encoder_inputs = self.wrap_device(
-                self.processor(**processor_kwargs),
-                device=self.model.device.type,
-            )
+            encoder_inputs = self.processor(**processor_kwargs)
+            encoder_inputs = self.wrap_device(encoder_inputs)
 
             if decoder_prompt is None:
                 decoder_input_ids = None
             else:
-                decoder_input_ids = self.wrap_device(
-                    self.tokenizer(decoder_prompt,
-                                   return_tensors="pt").input_ids,
-                    device=self.model.device.type,
-                )
+                decoder_inputs = self.tokenizer(decoder_prompt,
+                                                return_tensors="pt")
+                decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
 
             output = self.model.generate(
                 decoder_input_ids=decoder_input_ids,
@@ -684,6 +695,7 @@ class VllmRunner:
     """
     The default value of some arguments have been modified from
     :class:`~vllm.LLM` as follows:
+
    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
@@ -701,10 +713,8 @@ def __init__(
         tokenizer_mode: str = "auto",
         trust_remote_code: bool = True,
         seed: Optional[int] = 0,
-        # Use smaller max model length, otherwise bigger model cannot run due
-        # to kv cache size limit.
         max_model_len: int = 1024,
-        dtype: str = "half",
+        dtype: str = "auto",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16,
@@ -1110,4 +1120,4 @@ def pytest_collection_modifyitems(config, items):
     skip_optional = pytest.mark.skip(reason="need --optional option to run")
     for item in items:
         if "optional" in item.keywords:
-            item.add_marker(skip_optional)
+            item.add_marker(skip_optional)
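For illustration, a rough sketch of how a test might drive the updated HfRunner after this change, assuming the generate_greedy helper and context-manager usage the rest of the test suite already relies on (the model id is only an example):

from tests.conftest import HfRunner

# dtype now defaults to "auto", so the HF model loads in the dtype declared
# by its checkpoint config instead of being forced to fp16.
with HfRunner("facebook/opt-125m") as hf_model:
    outputs = hf_model.generate_greedy(["Hello, my name is"], max_tokens=8)

# Tests that genuinely need a specific precision can still ask for it.
with HfRunner("facebook/opt-125m", dtype="bfloat16") as hf_model:
    outputs = hf_model.generate_greedy(["Hello, my name is"], max_tokens=8)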

tests/entrypoints/llm/test_chat.py

Lines changed: 0 additions & 1 deletion

@@ -64,7 +64,6 @@ def test_multi_chat():
 def test_chat_multi_image(image_urls: list[str]):
     llm = LLM(
         model="microsoft/Phi-3.5-vision-instruct",
-        dtype="bfloat16",
         max_model_len=4096,
         max_num_seqs=5,
         enforce_eager=True,

tests/entrypoints/openai/test_audio.py

Lines changed: 0 additions & 2 deletions

@@ -18,8 +18,6 @@
 @pytest.fixture(scope="module")
 def server():
     args = [
-        "--dtype",
-        "bfloat16",
         "--max-model-len",
         "2048",
         "--max-num-seqs",

tests/entrypoints/openai/test_video.py

Lines changed: 0 additions & 2 deletions

@@ -24,8 +24,6 @@ def server():
     args = [
         "--task",
         "generate",
-        "--dtype",
-        "bfloat16",
         "--max-model-len",
         "32768",
         "--max-num-seqs",

tests/entrypoints/openai/test_vision.py

Lines changed: 0 additions & 2 deletions

@@ -25,8 +25,6 @@ def server():
     args = [
         "--task",
         "generate",
-        "--dtype",
-        "bfloat16",
         "--max-model-len",
         "2048",
         "--max-num-seqs",

tests/entrypoints/openai/test_vision_embedding.py

Lines changed: 0 additions & 2 deletions

@@ -28,8 +28,6 @@ def server():
     args = [
         "--task",
         "embed",
-        "--dtype",
-        "bfloat16",
         "--max-model-len",
         "2048",
         "--max-num-seqs",

tests/entrypoints/test_chat_utils.py

Lines changed: 3 additions & 3 deletions

@@ -34,7 +34,7 @@ def phi3v_model_config():
        tokenizer=PHI3V_MODEL_ID,
        tokenizer_mode="auto",
        trust_remote_code=True,
-       dtype="bfloat16",
+       dtype="auto",
        seed=0,
        limit_mm_per_prompt={
            "image": 2,
@@ -58,7 +58,7 @@ def mllama_model_config():
        tokenizer=MLLAMA_MODEL_ID,
        tokenizer_mode="auto",
        trust_remote_code=True,
-       dtype="bfloat16",
+       dtype="auto",
        seed=0,
        limit_mm_per_prompt={
            "image": 2,
@@ -669,7 +669,7 @@ def get_conversation(is_hf: bool):
        tokenizer=MLLAMA_MODEL_ID,
        tokenizer_mode="auto",
        trust_remote_code=True,
-       dtype="bfloat16",
+       dtype="auto",
        seed=0,
        limit_mm_per_prompt={
            "image": 2,

tests/models/decoder_only/audio_language/test_ultravox.py

Lines changed: 2 additions & 13 deletions

@@ -5,11 +5,10 @@
 import numpy as np
 import pytest
 import pytest_asyncio
-from transformers import AutoModel, AutoTokenizer, BatchEncoding
+from transformers import AutoModel, AutoTokenizer
 
 from vllm.multimodal.audio import resample_audio
 from vllm.sequence import SampleLogprobs
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
 from ....conftest import HfRunner, VllmRunner
 from ....utils import RemoteOpenAIServer
@@ -107,8 +106,6 @@ def run_test(
     **kwargs,
 ):
     """Inference result should be the same between hf and vllm."""
-    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
     # if we run HF first, the cuda initialization will be done and it
@@ -124,15 +121,7 @@
         for vllm_prompt, _, audio in prompts_and_audios
     ]
 
-    def process(hf_inputs: BatchEncoding, **kwargs):
-        hf_inputs["audio_values"] = hf_inputs["audio_values"] \
-            .to(torch_dtype)  # type: ignore
-        return hf_inputs
-
-    with hf_runner(model,
-                   dtype=dtype,
-                   postprocess_inputs=process,
-                   auto_cls=AutoModel) as hf_model:
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
         hf_outputs_per_audio = [
             hf_model.generate_greedy_logprobs_limit(
                 [hf_prompt],
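The per-test postprocess_inputs hook is no longer needed because HfRunner.get_inputs (see the tests/conftest.py diff above) now casts any BatchFeature returned by the processor to the runner's resolved dtype. A small sketch of that cast using plain transformers and torch, with made-up feature values:

import torch
from transformers import BatchFeature

inputs = BatchFeature({
    "input_ids": torch.tensor([[1, 2, 3]]),  # integer tensors are left as-is
    "audio_values": torch.randn(1, 16000, dtype=torch.float32),
})

# BatchFeature.to(dtype=...) converts only floating-point tensors, which is
# what the removed process() helper above did by hand for "audio_values".
inputs = inputs.to(dtype=torch.bfloat16)
assert inputs["input_ids"].dtype == torch.int64
assert inputs["audio_values"].dtype == torch.bfloat16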
