Commit 926370b

docs: Benchmarking guide updates (#678) (#699)
1 parent b7cd853 · commit 926370b

3 files changed (+46, -46 lines)

examples/llm/benchmarks/README.md

Lines changed: 11 additions & 7 deletions
@@ -26,6 +26,15 @@ This guide provides detailed steps on benchmarking Large Language Models (LLMs)
 
 H100 80GB x8 node(s) are required for benchmarking.
 
+> [!NOTE]
+> This guide was tested on node(s) with the following hardware configuration:
+> * **GPUs**: 8xH100 80GB HBM3 (GPU Memory Bandwidth 3.2 TB/s)
+> * **CPU**: 2x Intel Sapphire Rapids, Intel(R) Xeon(R) Platinum 8480CL E5, 112 cores (56 cores per CPU), 2.00 GHz (Base), 3.8 GHz (Max boost), PCIe Gen5
+> * **NVLink**: NVLink 4th Generation, 900 GB/s (GPU to GPU NVLink bidirectional bandwidth), 18 Links per GPU
+> * **InfiniBand**: 8x 400 Gbit/s (Compute Links), 2x 400 Gbit/s (Storage Links)
+>
+> Benchmarking with a different hardware configuration may yield suboptimal results.
+
 1\. Build benchmarking image
 ```bash
 ./container/build.sh
@@ -43,7 +52,7 @@ docker compose -f deploy/docker_compose.yml up -d
 
 ## Disaggregated Single Node Benchmarking
 
-*One H100 80GB x8 node is required for this setup.*
+One H100 80GB x8 node is required for this setup.
 
 In the following setup we compare Dynamo disaggregated vLLM performance to
 [native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on a single node. These were chosen to optimize
@@ -72,12 +81,7 @@ Collect the performance numbers as shown on the [Collecting Performance Numbers]
 
 ## Disaggregated Multi Node Benchmarking
 
-*Two H100 80GB x8 nodes are required for this setup.*
-
-> [!Note]
-> Nodes used for benchmarking were part of a cluster connected via InfiniBand
-> NDR with 8 connections for compute and 2 for storage. Both fabrics were on
-> their own fat tree non-blocking topology.
+Two H100 80GB x8 nodes are required for this setup.
 
 In the following steps we compare Dynamo disaggregated vLLM performance to
 [native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on two nodes. These were chosen to optimize

examples/llm/benchmarks/disagg.yaml

Lines changed: 14 additions & 16 deletions
@@ -13,6 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+Common:
+  model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+  router: round-robin
+  # Number of tokens in a batch for more efficient chunked transfers to GPUs.
+  block-size: 128
+  max-model-len: 3500
+  max-num-batched-tokens: 3500
+  disable-log-requests: true
+
 Frontend:
   # This model was chosen for its 70B size and FP8 precision, which the TP and
   # DP configurations were tuned for its size, and its precision reduces model
@@ -22,38 +32,26 @@ Frontend:
   port: 8000
 
 Processor:
-  model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-  router: round-robin
+  common-configs: [model, router]
 
 # x1 process with 4 GPUs generating output tokens (the "decode" phase).
 VllmWorker:
-  model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-  # Number of tokens in a batch for more efficient chunked transfers to GPUs.
-  block-size: 128
-  max-model-len: 3500
+  common-configs: [model, kv-transfer-config, router, block-size, max-model-len, disable-log-requests]
   # Enable prefill at different workers.
   remote-prefill: true
   # Disable local prefill so only disaggregated prefill is used.
   conditional-disagg: false
-  tensor-parallel-size: 4
   gpu-memory-utilization: 0.95
-  disable-log-requests: true
+  tensor-parallel-size: 4
   ServiceArgs:
     workers: 1
     resources:
       gpu: 4
 
 # x4 processes each with 1 GPU handling the initial prefill (context embedding) phase.
 PrefillWorker:
-  model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-  block-size: 128
-  max-model-len: 3500
-  max-num-batched-tokens: 3500
+  common-configs: [model, kv-transfer-config, block-size, max-model-len, max-num-batched-tokens, gpu-memory-utilization, disable-log-requests]
   tensor-parallel-size: 1
-  gpu-memory-utilization: 0.95
-  disable-log-requests: true
   ServiceArgs:
     workers: 4
     resources:
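The net effect of the `disagg.yaml` change is that flags shared by several services are declared once under `Common`, and each service opts into the keys it needs through `common-configs`. A minimal sketch of the pattern, using values taken from the hunks above (not the complete file):

```yaml
# Shared flags, declared once.
Common:
  model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
  block-size: 128

# A service lists only the Common keys it wants to inherit; anything else
# (e.g. tensor-parallel-size) is still set locally on the service.
PrefillWorker:
  common-configs: [model, kv-transfer-config, block-size]
  tensor-parallel-size: 1
```

Keys omitted from a service's `common-configs` list are not applied to that service, which is why `Processor` pulls only `[model, router]` while the workers pull the longer lists shown above.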

examples/llm/benchmarks/disagg_multinode.yaml

Lines changed: 21 additions & 23 deletions
@@ -13,15 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-Frontend:
-  served_model_name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-  endpoint: dynamo.Processor.chat/completions
-  port: 8000
-
-Processor:
+Common:
   model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-  block-size: 128
-  max-model-len: 3500
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
   # Routing policy determines how remote workers are selected for processing
   # prefill requests
   # 1. random: randomly select workers for prefill requests
@@ -31,39 +25,43 @@ Processor:
   # 3. kv: finding prefill workers by KV cache is not beneficial when caching is
   # disabled on this setup
   router: round-robin
+  # Number of tokens in a batch for more efficient chunked transfers to GPUs.
+  block-size: 128
+  max-model-len: 3500
+  max-num-batched-tokens: 3500
+  disable-log-requests: true
+
+Frontend:
+  served_model_name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+
+Processor:
+  common-configs: [model, block-size, max-model-len, router]
 
 Router:
-  model-name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
+  common-configs: [model]
   min-workers: 1
 
 VllmWorker:
-  model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-  block-size: 128
-  max-model-len: 3500
+  common-configs: [model, kv-transfer-config, router, block-size, max-model-len, disable-log-requests]
   # Enable prefill at different workers.
   remote-prefill: true
   # Disable local prefill so only disaggregated prefill is used.
   conditional-disagg: false
+  # The GPU memory utilization does not have to match between VllmWorker and PrefillWorker.
+  gpu-memory-utilization: 0.95
   # TP size is doubled from single node setup
   tensor-parallel-size: 8
-  gpu-memory-utilization: 0.95
-  disable-log-requests: true
-  router: round-robin
   ServiceArgs:
     workers: 1
     resources:
       gpu: 8
 
 PrefillWorker:
-  model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
-  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
-  block-size: 128
-  max-model-len: 3500
-  max-num-batched-tokens: 3500
-  tensor-parallel-size: 1
+  common-configs: [model, kv-transfer-config, block-size, max-model-len, max-num-batched-tokens, disable-log-requests]
   gpu-memory-utilization: 0.95
-  disable-log-requests: true
+  tensor-parallel-size: 1
   ServiceArgs:
     # DP size is doubled from single node setup
     workers: 8
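The multi-node file follows the same `Common`/`common-configs` pattern, and it keeps values that may legitimately differ between services out of `Common`; per the comment added above, `gpu-memory-utilization` stays on each worker. A condensed sketch of the worker sections after this change, assembled from the hunks above (nested `ServiceArgs` settings elided):

```yaml
Router:
  common-configs: [model]   # only the model name is shared
  min-workers: 1

VllmWorker:
  common-configs: [model, kv-transfer-config, router, block-size, max-model-len, disable-log-requests]
  gpu-memory-utilization: 0.95   # set locally; need not match PrefillWorker
  tensor-parallel-size: 8        # TP size is doubled from the single node setup

PrefillWorker:
  common-configs: [model, kv-transfer-config, block-size, max-model-len, max-num-batched-tokens, disable-log-requests]
  gpu-memory-utilization: 0.95
  tensor-parallel-size: 1
```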

0 commit comments