-
Notifications
You must be signed in to change notification settings - Fork 635
/
Copy pathgemma3.yaml
35 lines (31 loc) · 883 Bytes
/
gemma3.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
resources:
  # Candidate accelerators — SkyPilot picks the cheapest available one.
  # A list of quoted strings is used instead of a flow mapping: the original
  # {L4:1, L4:4, ...} form repeats the key `L4`, which is invalid YAML
  # (duplicate key; most parsers silently keep only the last value, dropping
  # the L4:1 candidate). Quoting also keeps `name:count` as plain strings.
  accelerators: ['L4:1', 'L4:4', 'A100:1', 'A100-80GB:1']
  ports:
    - 8000
  disk_tier: best
  use_spot: true
envs:
  # Intentionally empty: must be supplied at launch, e.g.
  # `sky launch gemma3.yaml --env HF_TOKEN=...` (required to pull the weights).
  HF_TOKEN:
  MODEL_NAME: google/gemma-3-4b-it
  MAX_MODEL_LEN: 4096
# Pins transformers to a specific commit and installs a vLLM wheel from a
# pinned nightly index for Gemma 3 support.
setup: |
  uv pip install git+https://github.com/huggingface/transformers.git@0013ba6
  uv pip install vllm --extra-index-url https://wheels.vllm.ai/c0c25e25fa93ee7c3f279abbba5597c0fafa74ee
run: |
  echo 'Starting vllm openai api server...'
  python -m vllm.entrypoints.openai.api_server \
    --host 0.0.0.0 \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --model $MODEL_NAME \
    --max-model-len $MAX_MODEL_LEN
service:
  replicas: 1
  # An actual request for readiness probe.
  readiness_probe:
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1