Skip to content

Commit 5fe1f8a

Browse files
feat(serve): Enhance multi-node deployment and worker configuration (ai-dynamo#457)
Co-authored-by: hongkuanz <[email protected]>
1 parent 01bad19 commit 5fe1f8a

File tree

4 files changed

+88
-18
lines changed

4 files changed

+88
-18
lines changed

deploy/dynamo/sdk/src/dynamo/sdk/cli/allocator.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def assign_gpus(self, count: float) -> list[int]:
9696
return unassigned[:count]
9797

9898
@inject
99-
def get_worker_env(
99+
def get_resource_envs(
100100
self,
101101
service: Service[Any],
102102
services: dict[str, Any] = Provide[BentoMLContainer.config.services],
@@ -105,30 +105,30 @@ def get_worker_env(
105105

106106
num_gpus = 0
107107
num_workers = 1
108-
worker_env: list[dict[str, str]] = []
108+
resource_envs: list[dict[str, str]] = []
109109
if "gpu" in (config.get("resources") or {}):
110110
num_gpus = config["resources"]["gpu"] # type: ignore
111111
if config.get("workers"):
112112
if (workers := config["workers"]) == "cpu_count":
113113
num_workers = int(self.system_resources["cpu"])
114114
# don't assign gpus to workers
115-
return num_workers, worker_env
115+
return num_workers, resource_envs
116116
else: # workers is a number
117117
num_workers = workers
118118
if num_gpus and DISABLE_GPU_ALLOCATION_ENV not in os.environ:
119119
if os.environ.get(DYNAMO_DEPLOYMENT_ENV):
120120
# K8s replicas: Assumes DYNAMO_DEPLOYMENT_ENV is set
121121
# each pod in replicaset will have separate GPU with same CUDA_VISIBLE_DEVICES
122122
assigned = self.assign_gpus(num_gpus)
123-
worker_env = [
123+
resource_envs = [
124124
{"CUDA_VISIBLE_DEVICES": ",".join(map(str, assigned))}
125125
for _ in range(num_workers)
126126
]
127127
else:
128128
# local deployment where we split all available GPUs across workers
129129
for _ in range(num_workers):
130130
assigned = self.assign_gpus(num_gpus)
131-
worker_env.append(
131+
resource_envs.append(
132132
{"CUDA_VISIBLE_DEVICES": ",".join(map(str, assigned))}
133133
)
134-
return num_workers, worker_env
134+
return num_workers, resource_envs

deploy/dynamo/sdk/src/dynamo/sdk/cli/serve_dynamo.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from typing import Any
2727

2828
import click
29+
import uvloop
2930

3031
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
3132
from dynamo.sdk import dynamo_context
@@ -186,6 +187,7 @@ async def worker(runtime: DistributedRuntime):
186187
logger.error(f"[{run_id}] Error in Dynamo component setup: {str(e)}")
187188
raise
188189

190+
uvloop.install()
189191
asyncio.run(worker())
190192

191193

deploy/dynamo/sdk/src/dynamo/sdk/cli/serving.py

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def create_dependency_watcher(
127127
) -> tuple[Watcher, CircusSocket, str]:
128128
from bentoml.serving import create_watcher
129129

130-
num_workers, worker_envs = scheduler.get_worker_env(svc)
130+
num_workers, resource_envs = scheduler.get_resource_envs(svc)
131131
uri, socket = _get_server_socket(svc, uds_path, port_stack, backlog)
132132
args = [
133133
"-m",
@@ -141,8 +141,8 @@ def create_dependency_watcher(
141141
"$(CIRCUS.WID)",
142142
]
143143

144-
if worker_envs:
145-
args.extend(["--worker-env", json.dumps(worker_envs)])
144+
if resource_envs:
145+
args.extend(["--worker-env", json.dumps(resource_envs)])
146146

147147
watcher = create_watcher(
148148
name=f"service_{svc.name}",
@@ -171,7 +171,7 @@ def create_dynamo_watcher(
171171
uri, socket = _get_server_socket(svc, uds_path, port_stack, backlog)
172172

173173
# Get worker configuration
174-
num_workers, worker_envs = scheduler.get_worker_env(svc)
174+
num_workers, resource_envs = scheduler.get_resource_envs(svc)
175175

176176
# Create Dynamo-specific worker args
177177
args = [
@@ -184,8 +184,8 @@ def create_dynamo_watcher(
184184
"$(CIRCUS.WID)",
185185
]
186186

187-
if worker_envs:
188-
args.extend(["--worker-env", json.dumps(worker_envs)])
187+
if resource_envs:
188+
args.extend(["--worker-env", json.dumps(resource_envs)])
189189

190190
# Update env to include ServiceConfig and service-specific environment variables
191191
worker_env = env.copy() if env else {}
@@ -315,7 +315,7 @@ def serve_http(
315315

316316
if service_name and service_name != svc.name:
317317
svc = svc.find_dependent_by_name(service_name)
318-
num_workers, worker_envs = allocator.get_worker_env(svc)
318+
num_workers, resource_envs = allocator.get_resource_envs(svc)
319319
server_on_deployment(svc)
320320
uds_path = tempfile.mkdtemp(prefix="bentoml-uds-")
321321
try:
@@ -419,8 +419,8 @@ def serve_http(
419419
*timeouts_args,
420420
*timeout_args,
421421
]
422-
if worker_envs:
423-
server_args.extend(["--worker-env", json.dumps(worker_envs)])
422+
if resource_envs:
423+
server_args.extend(["--worker-env", json.dumps(resource_envs)])
424424
if development_mode:
425425
server_args.append("--development-mode")
426426

@@ -438,13 +438,37 @@ def serve_http(
438438
"--worker-id",
439439
"$(CIRCUS.WID)",
440440
]
441+
# resource_envs is the resource allocation (ie CUDA_VISIBLE_DEVICES) for each worker created by the allocator
442+
# these resource_envs are passed to each individual worker's environment which is set in serve_dynamo
443+
if resource_envs:
444+
args.extend(["--worker-env", json.dumps(resource_envs)])
445+
# env is the base bentoml environment variables. We make a copy and update it to add any service configurations and additional env vars
446+
worker_env = env.copy() if env else {}
447+
448+
# Pass through the main service config
449+
if "DYNAMO_SERVICE_CONFIG" in os.environ:
450+
worker_env["DYNAMO_SERVICE_CONFIG"] = os.environ[
451+
"DYNAMO_SERVICE_CONFIG"
452+
]
453+
454+
# Get service-specific environment variables from DYNAMO_SERVICE_ENVS
455+
if "DYNAMO_SERVICE_ENVS" in os.environ:
456+
try:
457+
service_envs = json.loads(os.environ["DYNAMO_SERVICE_ENVS"])
458+
if svc.name in service_envs:
459+
service_args = service_envs[svc.name].get("ServiceArgs", {})
460+
if "envs" in service_args:
461+
worker_env.update(service_args["envs"])
462+
except json.JSONDecodeError as e:
463+
logger.warning(f"Failed to parse DYNAMO_SERVICE_ENVS: {e}")
464+
441465
watcher = create_watcher(
442466
name=f"dynamo_service_{svc.name}",
443467
args=args,
444468
numprocesses=num_workers,
445469
working_dir=str(bento_path.absolute()),
446470
close_child_stdin=not development_mode,
447-
env=env, # Dependency map will be injected by serve_http
471+
env=worker_env, # Dependency map will be injected by serve_http
448472
)
449473
watchers.append(watcher)
450474
print(f"dynamo_service_{svc.name} entrypoint created")
@@ -495,15 +519,15 @@ def serve_http(
495519
arbiter.start(
496520
cb=lambda _: logger.info( # type: ignore
497521
(
498-
"Starting Dynamo Service %s (%s/%s) listening on %s://%s:%d (Press CTRL+C to quit)"
522+
"Starting Dynamo Service %s (Press CTRL+C to quit)"
499523
if (
500524
hasattr(svc, "is_dynamo_component")
501525
and svc.is_dynamo_component()
502526
)
503527
else "Starting %s (Press CTRL+C to quit)"
504528
),
505529
*(
506-
(svc.name, *svc.dynamo_address(), scheme, log_host, port)
530+
(svc.name,)
507531
if (
508532
hasattr(svc, "is_dynamo_component")
509533
and svc.is_dynamo_component()

examples/llm/README.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,50 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
150150

151151
```
152152

153+
### Multinode Examples
154+
155+
#### Single node sized models
156+
You can deploy our example architectures on multiple nodes via NATS/ETCD based discovery and communication. Here's an example of deploying disaggregated serving on 2 nodes.
157+
158+
##### Disaggregated Deployment with KV Routing
159+
Node 1: Frontend, Processor, Router, 8 Decode
160+
Node 2: 8 Prefill
161+
162+
**Step 1**: Start NATS/ETCD on your head node. Ensure you have the correct firewall rules to allow communication between the nodes as you will need the NATS/ETCD endpoints to be accessible by node 2.
163+
```bash
164+
# node 1
165+
docker compose -f deploy/docker-compose.yml up -d
166+
```
167+
168+
**Step 2**: Create the inference graph for this deployment. The easiest way to do this is to remove the `.link(PrefillWorker)` from the `disagg_router.py` file.
169+
170+
```python
171+
# graphs/disagg_router.py
172+
# imports...
173+
Frontend.link(Processor).link(Router).link(VllmWorker)
174+
```
175+
176+
**Step 3**: Start the frontend, processor, router, and 8 VllmWorkers on node 1.
177+
```bash
178+
# node 1
179+
cd /workspace/examples/llm
180+
dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml --VllmWorker.ServiceArgs.workers=8
181+
```
182+
183+
**Step 4**: Start 8 PrefillWorkers on node 2.
184+
Since we only want to start the `PrefillWorker` on node 2, you can run just the PrefillWorker component directly.
185+
186+
```bash
187+
# node 2
188+
export NATS_SERVER='<your-nats-server-address>' # note this should start with nats://...
189+
export ETCD_ENDPOINTS='<your-etcd-endpoints-address>'
190+
191+
cd /workspace/examples/llm
192+
dynamo serve components.prefill_worker:PrefillWorker -f ./configs/disagg_router.yaml --PrefillWorker.ServiceArgs.workers=8
193+
```
194+
195+
You can now use the same curl request from above to interact with your deployment!
196+
153197
### Close deployment
154198

155199
Kill all dynamo processes managed by circusd.

0 commit comments

Comments
 (0)