Deploy vLLM using Docker for a production-ready, containerized LLM inference and serving solution.
# Ubuntu/Debian
# Install the NVIDIA Container Toolkit so Docker containers can access host GPUs.
# 1) Add NVIDIA's GPG key so apt can verify the toolkit packages.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit.gpg
# 2) Register the toolkit apt repository, pinned to the key installed above.
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
# 3) Install the toolkit and restart Docker so it picks up the NVIDIA runtime.
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
| Property | Value |
|---|---|
| Image Name | vllm/vllm-openai |
| Registry | Docker Hub |
| Pulls | 10M+ |
| Size | ~9.5 GB |
| Base | Ubuntu + PyTorch + CUDA |
| Tag | Description |
|---|---|
| `latest` | Latest stable release |
| `nightly` | Nightly build (latest features) |
| `cpu` | CPU-only build |
| `<version>` | Specific version (e.g., `0.16.0`) |
# Quick start: serve a Hugging Face model on port 8000 with all host GPUs
# exposed to the container (model is downloaded inside the container).
docker run --gpus all -p 8000:8000 vllm/vllm-openai:latest \
    --model meta-llama/Llama-2-7b-chat-hf
# Variant: load the model from a named Docker volume mounted at /models.
docker run --gpus all \
  -p 8000:8000 \
  -v vllm-models:/models \
  vllm/vllm-openai:latest \
  --model /models/Llama-2-7b-chat-hf
# Variant: bind-mount a host directory that already contains the weights.
docker run --gpus all \
  -p 8000:8000 \
  -v /path/to/models:/models \
  vllm/vllm-openai:latest \
  --model /models/Llama-2-7b-chat-hf
Create a `docker-compose.yml` file:
# Basic single-node deployment of the vLLM OpenAI-compatible server.
version: '3.8'
services:
  vllm:
    image: vllm/vllm-openai:latest
    container_name: vllm-server
    ports:
      # Host port 8000 -> container port 8000 (OpenAI-compatible API).
      - "8000:8000"
    volumes:
      # Bind-mount the local model directory into the container.
      - ./models:/models
      # Named volume so Hugging Face downloads persist across restarts.
      - vllm-cache:/root/.cache/huggingface
    environment:
      # Optional HF token for gated models; empty default keeps startup working.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all host NVIDIA GPUs for this service (requires the
            # NVIDIA Container Toolkit on the host).
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Arguments passed to the vLLM server entrypoint.
    command:
      - --model
      - /models/Llama-2-7b-chat-hf
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
volumes:
  vllm-cache:
Start the service:
docker compose up -d
# Multi-GPU deployment: shards a 70B model across two GPUs via tensor
# parallelism.
version: '3.8'
services:
  vllm:
    image: vllm/vllm-openai:latest
    container_name: vllm-multi-gpu
    ports:
      - "8000:8000"
    volumes:
      - ./models:/models
    deploy:
      resources:
        reservations:
          devices:
            # Reserve exactly two NVIDIA GPUs for this container.
            - driver: nvidia
              count: 2
              capabilities: [gpu]
    command:
      - --model
      - /models/Llama-2-70b-chat-hf
      # Split the model weights across the 2 reserved GPUs.
      - --tensor-parallel-size
      - "2"
      # Allow up to 95% of each GPU's memory for weights + KV cache.
      - --gpu-memory-utilization
      - "0.95"
# Secured deployment: vLLM's OpenAI-compatible API protected by an API key.
# Note: the obsolete top-level `version` attribute has been removed — Compose
# v2 ignores it and prints a warning when it is present.
services:
  vllm:
    image: vllm/vllm-openai:latest
    container_name: vllm-secure
    ports:
      - "8000:8000"
    environment:
      # Clients must send "Authorization: Bearer <key>" matching this value.
      - VLLM_API_KEY=${VLLM_API_KEY:-sk-your-api-key}
      # Only needed when pulling gated models from the Hugging Face Hub.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
    volumes:
      - ./models:/models
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    command:
      - --model
      - /models/Llama-2-7b-chat-hf
      - --api-key
      # Quoted so YAML cannot misparse the substituted value (e.g. if it
      # contained ": " or " #").
      - "${VLLM_API_KEY:-sk-your-api-key}"
| Variable | Description | Example |
|---|---|---|
| `HUGGING_FACE_HUB_TOKEN` | HF Hub authentication token | `hf_xxx` |
| `VLLM_API_KEY` | API key for authentication | `sk-xxx` |
| `CUDA_VISIBLE_DEVICES` | Select specific GPUs | `0,1` |
| `VLLM_ALLOW_LONG_MAX_MODEL_LEN` | Allow longer model lengths | `1` |
| `VLLM_TEST_FORCE_FP8` | Force FP8 quantization | `1` |
| Argument | Description | Example |
|---|---|---|
| `--model` | Model name or path | `meta-llama/Llama-2-7b-chat-hf` |
| `--tensor-parallel-size` | Number of GPUs | `2` |
| `--max-model-len` | Max sequence length | `4096` |
| `--gpu-memory-utilization` | GPU memory fraction | `0.9` |
| `--quantization` | Quantization method | `awq`, `gptq`, `fp8` |
| `--enable-prefix-caching` | Enable prefix caching | (flag) |
| `--enable-chunked-prefill` | Enable chunked prefill | (flag) |
| `--max-num-seqs` | Max concurrent sequences | `256` |
| `--served-model-name` | Custom model name in API | `my-llama-model` |
# Health-check fragment: Compose marks the container unhealthy after three
# consecutive failed probes of vLLM's /health endpoint.
services:
  vllm:
    image: vllm/vllm-openai:latest
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Grace period for model loading before failures start counting.
      start_period: 60s
# Resource-control fragment: hard caps (limits) and guaranteed minimums
# (reservations) for CPU, memory, and GPU.
services:
  vllm:
    deploy:
      resources:
        limits:
          cpus: '8'
          memory: 64G
        reservations:
          cpus: '4'
          memory: 32G
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
# Log-rotation fragment: keep at most three 100 MB JSON log files per
# container so disk usage stays bounded.
services:
  vllm:
    logging:
      driver: json-file
      options:
        max-size: "100m"
        max-file: "3"
# Production-grade stack: read-only model mount, persistent HF cache,
# resource limits, health checking, restart policy, and log rotation.
# Note: the obsolete top-level `version` attribute has been removed — Compose
# v2 ignores it and prints a warning when it is present.
services:
  vllm:
    image: vllm/vllm-openai:latest
    container_name: vllm-production
    hostname: vllm-server
    ports:
      - "8000:8000"
    volumes:
      # Models are mounted read-only so the container cannot modify them.
      - /data/models:/models:ro
      # Named volume keeps downloaded HF artifacts across container restarts.
      - vllm-cache:/root/.cache/huggingface
      - ./logs:/var/log/vllm
    environment:
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
      - VLLM_API_KEY=${VLLM_API_KEY}
      # Pin the container to the first GPU only.
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        limits:
          cpus: '8'
          memory: 64G
        reservations:
          cpus: '4'
          memory: 32G
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command:
      - --model
      - /models/Llama-2-7b-chat-hf
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
      - --tensor-parallel-size
      - "1"
      - --max-model-len
      - "4096"
      - --gpu-memory-utilization
      - "0.9"
      - --enable-prefix-caching
      - --served-model-name
      - "llama-2-7b"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Model loading can take minutes; don't count failures during startup.
      start_period: 120s
    restart: unless-stopped
    logging:
      driver: json-file
      options:
        max-size: "100m"
        max-file: "5"
volumes:
  vllm-cache:
# Liveness probe: returns HTTP 200 once the server is ready.
curl http://localhost:8000/health
# Text completion via the OpenAI-compatible /v1/completions endpoint.
# The Authorization header is only enforced when the server was started with
# an API key; "not-needed" is a harmless placeholder otherwise.
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer ${VLLM_API_KEY:-not-needed}" \
  -d '{
    "model": "llama-2-7b",
    "prompt": "Explain quantum computing in simple terms:",
    "max_tokens": 200,
    "temperature": 0.7
  }'
# Chat completion with a system prompt plus a user message.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer ${VLLM_API_KEY:-not-needed}" \
  -d '{
    "model": "llama-2-7b",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "What is the capital of France?"}
    ],
    "max_tokens": 100
  }'
# List the models the server is currently serving.
curl http://localhost:8000/v1/models \
  -H "Authorization: Bearer ${VLLM_API_KEY:-not-needed}"
# Verify NVIDIA Container Toolkit from a throwaway CUDA container.
# Fix: the bare "nvidia/cuda:12.0-base" tag is not available on Docker Hub;
# CUDA image tags need the full version plus an OS suffix. --rm cleans up
# the test container afterwards.
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
# Check GPU access from inside the running vLLM container.
docker exec vllm-server nvidia-smi
# Pre-download model on host
# Downloading once on the host means the container needs neither Hub
# credentials nor network access at startup.
huggingface-cli download meta-llama/Llama-2-7b-chat-hf \
  --local-dir /path/to/models/Llama-2-7b-chat-hf
# Then mount in Docker
docker run -v /path/to/models:/models ...
# Reduce GPU memory utilization
# Out-of-memory mitigation: lower the fraction of GPU memory vLLM may use and
# shorten the maximum sequence length to shrink the KV cache.
command:
  - --gpu-memory-utilization
  - "0.8"
  - --max-model-len
  - "2048"
Any questions?
Feel free to contact us. Find all contact information on our contact page.