Security hardening guide for production vLLM deployments covering authentication, network security, access control, and best practices.
Enable API key authentication to restrict access:
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Llama-2-7b-chat-hf \
--api-key sk-your-secret-api-key
version: '3.8'
services:
vllm:
image: vllm/vllm-openai:latest
environment:
- VLLM_API_KEY=${VLLM_API_KEY}
command:
- --api-key
- ${VLLM_API_KEY}
# vault.yml (encrypted)
---
vllm_api_key: "sk-prod-secret-key-xxx"
huggingface_token: "hf_xxx"
# Create encrypted vault
ansible-vault create vault.yml
# Use in playbook
ansible-playbook playbook.yml --ask-vault-pass
For multiple clients, use a reverse proxy with key management:
# nginx.conf
http {
map $http_authorization $api_key {
"Bearer sk-client1-key" "client1";
"Bearer sk-client2-key" "client2";
default "";
}
server {
location /v1/ {
if ($api_key = "") {
return 401;
}
proxy_pass http://vllm:8000;
}
}
}
# Allow only specific IPs
sudo ufw allow from 192.168.1.0/24 to any port 8000 proto tcp
# Or allow only localhost (for reverse proxy setup)
sudo ufw allow from 127.0.0.1 to any port 8000 proto tcp
# Enable firewall
sudo ufw enable
# Allow specific subnet
sudo iptables -A INPUT -p tcp -s 192.168.1.0/24 --dport 8000 -j ACCEPT
# Drop all other traffic to port 8000
sudo iptables -A INPUT -p tcp --dport 8000 -j DROP
For reverse proxy setups, bind to localhost:
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Llama-2-7b-chat-hf \
--host 127.0.0.1 \
--port 8000
version: '3.8'
services:
vllm:
image: vllm/vllm-openai:latest
networks:
- vllm-internal
# No port exposure - only accessible within network
expose:
- "8000"
nginx:
image: nginx:alpine
networks:
- vllm-internal
- public
ports:
- "443:443"
depends_on:
- vllm
networks:
vllm-internal:
driver: bridge
internal: true
public:
driver: bridge
server {
listen 443 ssl;
http2 on;  # nginx >= 1.25.1; on older versions use "listen 443 ssl http2;"
server_name vllm.example.com;
ssl_certificate /etc/ssl/certs/vllm.crt;
ssl_certificate_key /etc/ssl/private/vllm.key;
# Modern SSL configuration
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256;
ssl_prefer_server_ciphers off;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 1d;
location /v1/ {
proxy_pass http://127.0.0.1:8000;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# Rate limiting
limit_req zone=vllm_limit burst=20 nodelay;
}
location /health {
proxy_pass http://127.0.0.1:8000;
access_log off;
}
}
# HTTP redirect
server {
listen 80;
server_name vllm.example.com;
return 301 https://$server_name$request_uri;
}
# Install Certbot
sudo apt install certbot python3-certbot-nginx
# Obtain certificate
sudo certbot --nginx -d vllm.example.com
# Auto-renewal (cron)
0 3 * * * certbot renew --quiet
http {
# Define rate limit zone
limit_req_zone $binary_remote_addr zone=vllm_limit:10m rate=10r/s;
server {
location /v1/ {
limit_req zone=vllm_limit burst=20 nodelay;
limit_req_status 429;
proxy_pass http://vllm:8000;
}
}
}
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded

# Key rate limits by client IP address.
limiter = Limiter(key_func=get_remote_address)

# The limiter must be attached to the app, and individual routes must be
# decorated (e.g. @limiter.limit("10/second")) — without this wiring,
# RateLimitExceeded is never raised and the middleware below never fires.
app.state.limiter = limiter


@app.middleware("http")
async def rate_limit_middleware(request: Request, call_next):
    """Translate RateLimitExceeded from rate-limited routes into a 429 reply.

    Args:
        request: incoming HTTP request (passed through unchanged).
        call_next: downstream ASGI handler for the request.

    Returns:
        The downstream response, or a 429 JSONResponse when the limit is hit.
    """
    try:
        return await call_next(request)
    except RateLimitExceeded:
        return JSONResponse(
            status_code=429,
            content={"detail": "Rate limit exceeded"}
        )
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Llama-2-7b-chat-hf \
--allowed-origins '["https://app.example.com"]' \
--allowed-methods '["POST","GET","OPTIONS"]' \
--allowed-headers '["Authorization","Content-Type"]' \
--allow-credentials
location /v1/ {
allow 192.168.1.0/24;
allow 10.0.0.0/8;
deny all;
proxy_pass http://vllm:8000;
}
# Create password file
sudo htpasswd -c /etc/nginx/.htpasswd vllmuser
location /v1/ {
auth_basic "vLLM API";
auth_basic_user_file /etc/nginx/.htpasswd;
proxy_pass http://vllm:8000;
}
version: '3.8'
services:
vllm:
image: vllm/vllm-openai:latest@sha256:<digest> # Pin by digest
container_name: vllm-secure
read_only: true # Read-only root filesystem
tmpfs:
- /tmp
# Note: /root/.cache is served by the vllm-cache named volume below;
# mounting a tmpfs at the same path would shadow that volume.
security_opt:
- no-new-privileges:true
cap_drop:
- ALL
cap_add:
- NET_BIND_SERVICE
user: "1000:1000" # Non-root user
volumes:
- /data/models:/models:ro # Read-only model mount
- vllm-cache:/root/.cache
deploy:
resources:
limits:
cpus: '8'
memory: 64G
# Run Docker security audit
docker run --rm \
--net host \
--pid host \
--userns host \
--cap-add audit_control \
-v /var/lib:/var/lib:ro \
-v /var/run/docker.sock:/var/run/docker.sock:ro \
docker/docker-bench-security
Only use models from trusted sources:
# Allowed models list
vllm_allowed_models:
- "meta-llama/Llama-2-*"
- "mistralai/Mistral-*"
- "Qwen/Qwen-*"
# Verify model checksum
sha256sum /models/model.safetensors
# Compare with expected hash
# Limit output tokens to prevent large extractions
{
"model": "llama-2-7b",
"prompt": "...",
"max_tokens": 512
}
log_format vllm '$remote_addr - $remote_user [$time_local] '
'"$request" $status $body_bytes_sent '
'"$http_referer" "$http_user_agent" '
'rt=$request_time';
access_log /var/log/nginx/vllm_access.log vllm;
# Log configuration
vllm_log_level: "info"
vllm_log_requests: true
vllm_log_stats: true
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Llama-2-7b-chat-hf
# Prometheus metrics are exposed on the serving port at /metrics
# (e.g. http://localhost:8000/metrics); no separate metrics flag or port is needed.
# Prometheus alerting rules
groups:
- name: vllm-alerts
rules:
- alert: HighErrorRate
expr: rate(vllm_request_errors_total[5m]) > 0.1
for: 5m
- alert: HighLatency
expr: histogram_quantile(0.99, vllm_request_duration_seconds_bucket) > 5
for: 5m
- alert: HighMemoryUsage
expr: vllm_gpu_memory_usage_bytes / vllm_gpu_memory_total_bytes > 0.95
for: 2m
# Check current connections
netstat -an | grep :8000 | wc -l
# Block offending IP
sudo ufw deny from <IP_ADDRESS>
# Review logs
tail -f /var/log/nginx/vllm_access.log | grep <IP_ADDRESS>
Running vLLM in regulated environments? We assist with security hardening, compliance reviews, and audit preparation for production deployments.
Secure your deployment: office@linux-server-admin.com | Contact Page