Skip to content

Commit f047de2

Browse files
[Perf]Optimize rotary_emb implementation to use Triton operator for improved inference performance
Signed-off-by: cynthieye <[email protected]> Co-authored-by: MagnetoWang <[email protected]>
2 parents bf7c6fc + 87aaade commit f047de2

File tree

351 files changed

+22758
-5478
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

351 files changed

+22758
-5478
lines changed

.buildkite/release-pipeline.yaml

+15
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,18 @@ steps:
8686
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
8787
env:
8888
DOCKER_BUILDKIT: "1"
89+
90+
- block: "Build Neuron release image"
91+
key: block-neuron-release-image-build
92+
depends_on: ~
93+
94+
- label: "Build and publish Neuron release image"
95+
depends_on: block-neuron-release-image-build
96+
agents:
97+
queue: neuron-postmerge
98+
commands:
99+
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
100+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
101+
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
102+
env:
103+
DOCKER_BUILDKIT: "1"

.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh

+26-2
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,34 @@
55
set -ex
66

77
# Setup cleanup
8-
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
8+
remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
99
trap remove_docker_container EXIT
1010
remove_docker_container
1111

1212
# Try building the docker image
13-
docker build -t cpu-test -f docker/Dockerfile.ppc64le .
13+
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
14+
15+
# Run the image
16+
podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc
17+
18+
function cpu_tests() {
19+
20+
# offline inference
21+
podman exec cpu-test-ubi9-ppc bash -c "
22+
set -e
23+
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
24+
25+
# Run basic model test
26+
podman exec cpu-test-ubi9-ppc bash -c "
27+
set -e
28+
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
29+
pip install sentence-transformers datamodel_code_generator
30+
pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
31+
pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
32+
pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
33+
}
34+
35+
# All of CPU tests are expected to be finished less than 40 mins.
36+
export -f cpu_tests
37+
timeout 40m bash -c cpu_tests
1438

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
2+
3+
# This script builds the CPU docker image and runs the offline inference inside the container.
4+
# It serves as a sanity check for compilation and basic model usage.
5+
set -ex
6+
7+
# Setup cleanup
8+
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
9+
trap remove_docker_container EXIT
10+
remove_docker_container
11+
12+
# Try building the docker image
13+
docker build -t cpu-test -f docker/Dockerfile.s390x .

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

+3-1
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@ source /etc/environment
1717
docker run --privileged --net host --shm-size=16G -it \
1818
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
1919
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
20-
&& python3 -m pip install pytest \
20+
&& python3 -m pip install pytest tpu-info \
2121
&& python3 -m pip install lm_eval[api]==0.4.4 \
2222
&& export VLLM_USE_V1=1 \
2323
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
24+
&& echo HARDWARE \
25+
&& tpu-info \
2426
&& echo TEST_0 \
2527
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
2628
&& echo TEST_1 \

.buildkite/test-pipeline.yaml

+17-2
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ steps:
118118
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
119119
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
120120
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
121-
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
121+
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
122122
- pytest -v -s entrypoints/test_chat_utils.py
123123
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
124124

@@ -341,6 +341,13 @@ steps:
341341
commands:
342342
- bash scripts/run-benchmarks.sh
343343

344+
- label: Benchmarks CLI Test # 10min
345+
source_file_dependencies:
346+
- vllm/
347+
- tests/benchmarks/
348+
commands:
349+
- pytest -v -s benchmarks/
350+
344351
- label: Quantization Test # 33min
345352
source_file_dependencies:
346353
- csrc/
@@ -378,8 +385,10 @@ steps:
378385
source_file_dependencies:
379386
- vllm/
380387
- tests/tool_use
388+
- tests/mistral_tool_use
381389
commands:
382390
- pytest -v -s tool_use
391+
- pytest -v -s mistral_tool_use
383392

384393
##### models test #####
385394

@@ -391,8 +400,9 @@ steps:
391400
- pytest -v -s models/test_transformers.py
392401
- pytest -v -s models/test_registry.py
393402
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
394-
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4'
403+
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
395404
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
405+
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
396406

397407
- label: Language Models Test (Standard) # 32min
398408
#mirror_hardwares: [amd]
@@ -402,6 +412,8 @@ steps:
402412
- tests/models/embedding/language
403413
- tests/models/encoder_decoder/language
404414
commands:
415+
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
416+
- pip install causal-conv1d
405417
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
406418
- pytest -v -s models/embedding/language -m core_model
407419

@@ -413,6 +425,8 @@ steps:
413425
- tests/models/embedding/language
414426
- tests/models/encoder_decoder/language
415427
commands:
428+
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
429+
- pip install causal-conv1d
416430
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
417431
- pytest -v -s models/embedding/language -m 'not core_model'
418432

@@ -538,6 +552,7 @@ steps:
538552
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
539553
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
540554
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
555+
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
541556

542557
- label: Plugin Tests (2 GPUs) # 40min
543558
working_dir: "/vllm-workspace/tests"

.github/ISSUE_TEMPLATE/200-installation.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ body:
1414
description: |
1515
Please run the following and paste the output below.
1616
```sh
17-
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
17+
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
1818
# For security purposes, please feel free to check the contents of collect_env.py before running it.
1919
python collect_env.py
2020
```

.github/ISSUE_TEMPLATE/300-usage.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ body:
1414
description: |
1515
Please run the following and paste the output below.
1616
```sh
17-
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
17+
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
1818
# For security purposes, please feel free to check the contents of collect_env.py before running it.
1919
python collect_env.py
2020
```

.github/ISSUE_TEMPLATE/400-bug-report.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ body:
1414
description: |
1515
Please run the following and paste the output below.
1616
```sh
17-
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
17+
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
1818
# For security purposes, please feel free to check the contents of collect_env.py before running it.
1919
python collect_env.py
2020
```

.github/ISSUE_TEMPLATE/700-performance-discussion.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ body:
3535
description: |
3636
Please run the following and paste the output below.
3737
```sh
38-
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
38+
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
3939
# For security purposes, please feel free to check the contents of collect_env.py before running it.
4040
python collect_env.py
4141
```

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -203,3 +203,6 @@ benchmarks/**/*.json
203203
# Linting
204204
actionlint
205205
shellcheck*/
206+
207+
# Ignore moe/marlin_moe gen code
208+
csrc/moe/marlin_moe_wna16/kernel_*

.pre-commit-config.yaml

-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ repos:
1111
hooks:
1212
- id: yapf
1313
args: [--in-place, --verbose]
14-
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
1514
- repo: https://github.com/astral-sh/ruff-pre-commit
1615
rev: v0.9.3
1716
hooks:

CMakeLists.txt

+42-11
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ set(VLLM_EXT_SRC
230230
"csrc/cache_kernels.cu"
231231
"csrc/attention/paged_attention_v1.cu"
232232
"csrc/attention/paged_attention_v2.cu"
233+
"csrc/attention/merge_attn_states.cu"
233234
"csrc/pos_encoding_kernels.cu"
234235
"csrc/activation_kernels.cu"
235236
"csrc/layernorm_kernels.cu"
@@ -608,21 +609,51 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
608609
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
609610
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
610611
if (MARLIN_MOE_ARCHS)
611-
set(MARLIN_MOE_SRC
612-
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
613-
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
614-
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
615-
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
616-
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
617-
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
618-
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
619-
"csrc/moe/marlin_moe_ops.cu")
620612

613+
#
614+
# For the Marlin MOE kernels we automatically generate sources for various
615+
# preselected input type pairs and schedules.
616+
# Generate sources:
617+
set(MOE_MARLIN_GEN_SCRIPT
618+
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
619+
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
620+
621+
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
622+
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
623+
624+
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
625+
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
626+
execute_process(
627+
COMMAND ${CMAKE_COMMAND} -E env
628+
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
629+
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
630+
RESULT_VARIABLE moe_marlin_generation_result
631+
OUTPUT_VARIABLE moe_marlin_generation_output
632+
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
633+
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
634+
)
635+
636+
if (NOT moe_marlin_generation_result EQUAL 0)
637+
message(FATAL_ERROR "Marlin MOE generation failed."
638+
" Result: \"${moe_marlin_generation_result}\""
639+
"\nCheck the log for details: "
640+
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
641+
else()
642+
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
643+
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
644+
message(STATUS "Marlin MOE generation completed successfully.")
645+
endif()
646+
else()
647+
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
648+
endif()
649+
650+
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
621651
set_gencode_flags_for_srcs(
622-
SRCS "${MARLIN_MOE_SRC}"
652+
SRCS "${MOE_WNAA16_MARLIN_SRC}"
623653
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
624654

625-
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
655+
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
656+
626657
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
627658
else()
628659
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone
1010
</h3>
1111

1212
<p align="center">
13-
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
13+
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
1414
</p>
1515

1616
---

0 commit comments

Comments
 (0)