-
Notifications
You must be signed in to change notification settings - Fork 114
/
Copy pathtriton_trt-llm.containerfile
86 lines (72 loc) · 2.95 KB
/
triton_trt-llm.containerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Build arguments. They are declared ahead of FROM so that BASE_CONTAINER_IMAGE
# can select the base image. An ARG declared before FROM is visible only to FROM
# lines; any other instruction that needs it must redeclare it inside the stage.
ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.05-trtllm-python-py3
ARG ENGINE_DEST_PATH=/var/run/models/engine
ARG HF_HOME=/var/run/hugging_face
ARG MODEL_DEST_PATH=/var/run/models/model
FROM ${BASE_CONTAINER_IMAGE}
# Re-import the global default into this stage; without this redeclaration the
# "base" label below would expand to an empty string.
ARG BASE_CONTAINER_IMAGE
# Record the base image and the role of this image as labels.
LABEL "base"="${BASE_CONTAINER_IMAGE}"
LABEL "role"="server"
# Stop APT (Debian package manager) from prompting during the build.
# Declared as ARG rather than ENV: the value is still exported to the RUN steps
# that follow, but it is not persisted into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
# Set additional environment values that make usage more pleasant.
ENV TERM=xterm-256color
# Install the OS packages needed by the later build steps (curl and gnupg are
# used below to fetch and de-armor the Kubernetes APT key).
# - apt-get is used instead of apt, whose command-line interface is not stable
#   for scripted use.
# - --no-install-recommends keeps optional packages out of the image.
# - The package index is removed in the same layer so it never reaches the image.
RUN apt-get update \
 && apt-get install --yes --no-install-recommends \
      apt-transport-https \
      ca-certificates \
      cgroup-tools \
      curl \
      gnupg \
 && rm -rf /var/lib/apt/lists/*
# Install kubectl because server.py script depends on it.
# Step 1: acquire the Kubernetes APT GPG key.
# Run RUN steps under bash with pipefail so that a failed download fails the
# build instead of being masked by the exit status of gpg at the end of the pipe.
# This SHELL setting also applies to every RUN instruction that follows.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Create the keyring directory first; it is not guaranteed to exist on every
# base distribution release.
RUN mkdir -p -m 755 /etc/apt/keyrings \
 && curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.30/deb/Release.key \
    | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg \
 && chmod 644 /etc/apt/keyrings/kubernetes-apt-keyring.gpg
# Step 2: register the Kubernetes APT repository.
# The entry is tied to the keyring file via signed-by, and the resulting list
# file is made world-readable.
RUN printf '%s\n' \
      'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.30/deb/ /' \
    | tee /etc/apt/sources.list.d/kubernetes.list \
 && chmod 644 /etc/apt/sources.list.d/kubernetes.list
# Step 3: Install kubectl.
# - apt-get is used instead of apt, whose command-line interface is not stable
#   for scripted use.
# - The original bare `apt purge` named no package; the purge is folded into
#   autoremove so that packages it removes also lose their configuration files.
# - The package index is removed in the same layer so it never reaches the image.
RUN apt-get update \
 && apt-get install --yes --no-install-recommends \
      kubectl \
 && apt-get autoremove --yes --purge \
 && rm -rf /var/lib/apt/lists/*
# Set the Triton CLI environment variables that control where the TRT-LLM engine
# and model files are downloaded to, and where the Hugging Face cache lives.
# The defaults come from the ARGs declared ahead of FROM. Those are out of scope
# inside the build stage until redeclared, so re-import them here; otherwise the
# ${...} references below would not see the declared defaults.
ARG ENGINE_DEST_PATH
ARG HF_HOME
ARG MODEL_DEST_PATH
ENV ENGINE_DEST_PATH="${ENGINE_DEST_PATH}" \
    HF_HOME="${HF_HOME}" \
    MODEL_DEST_PATH="${MODEL_DEST_PATH}"
# Set the active working directory.
WORKDIR /workspace
# Install a custom build of Triton CLI that supports tensor parallelism and
# the 70B variants of the Llama models.
# NOTE(review): the ref after '@' looks like a branch name, which can move
# between builds; consider pinning a commit for reproducibility - confirm.
RUN pip install \
      --verbose \
      --no-cache-dir \
      --no-color \
      --no-input \
      "git+https://github.com/triton-inference-server/triton_cli.git@jwyman/aslb-mn"
# Copy the kubessh script with read+execute permission for all users (mode 555),
# so it can be executed regardless of which user the container runs as.
# Setting the mode here also works around the file losing its executable bit
# when the container is built on a Windows host.
COPY --chmod=555 kubessh ./
# Copy the server script into the working directory.
COPY server.py ./
# Print the installed APT and pip packages into the build output.
# NOTE(review): `--version` on `pip list` looks unintended - confirm whether a
# plain `pip list` was meant.
RUN apt list --installed && pip list --version
# Start a bash shell by default.
ENTRYPOINT ["/bin/bash"]