Deploy vLLM using Ansible for automated, reproducible infrastructure. This guide provides playbooks for both Docker-based and bare-metal deployments.
Create an inventory file inventory.yml:
---
# inventory.yml — static inventory for the vLLM fleet.
# Per-host vars here override the defaults in group_vars/vllm_servers.yml.
all:
  children:
    vllm_servers:
      hosts:
        vllm-prod-01:
          ansible_host: 192.168.1.100
          ansible_user: ubuntu
          ansible_ssh_private_key_file: ~/.ssh/id_ed25519
          # vLLM specific variables
          vllm_model: meta-llama/Llama-2-7b-chat-hf
          vllm_port: 8000
          vllm_gpu_count: 1
        vllm-prod-02:
          ansible_host: 192.168.1.101
          ansible_user: ubuntu
          ansible_ssh_private_key_file: ~/.ssh/id_ed25519
          vllm_model: meta-llama/Llama-2-70b-chat-hf
          vllm_port: 8000
          vllm_gpu_count: 2
          # FIX: was `vllm_tensor_parallel`, which nothing reads — group_vars
          # and the compose template consume `vllm_tensor_parallel_size`, so
          # the 70B host would have silently run with tensor parallelism 1.
          vllm_tensor_parallel_size: 2
Create group_vars/vllm_servers.yml:
---
# group_vars/vllm_servers.yml — defaults for every host in the vllm_servers
# group. Per-host inventory vars (vllm_model, vllm_gpu_count, ...) override
# these values.

# vLLM Configuration
# NOTE(review): "latest" is not reproducible — consider pinning a release tag.
vllm_version: "latest"
vllm_docker_image: "vllm/vllm-openai:{{ vllm_version }}"
vllm_container_name: "vllm-server"
vllm_network: "vllm-network"

# Model Configuration
vllm_model: "meta-llama/Llama-2-7b-chat-hf"
vllm_max_model_len: 4096
# Fraction of each GPU's memory vLLM may claim (0.0–1.0).
vllm_gpu_memory_utilization: 0.9
vllm_enable_prefix_caching: true
vllm_enable_chunked_prefill: false

# API Configuration
vllm_host: "0.0.0.0"
vllm_port: 8000
vllm_api_key: ""  # Set via Ansible Vault for production

# Resource Configuration
vllm_tensor_parallel_size: 1
vllm_max_num_seqs: 256

# Volume Configuration
vllm_models_path: "/data/models"
vllm_cache_path: "/data/vllm-cache"
vllm_logs_path: "/var/log/vllm"

# HuggingFace Token (define via Ansible Vault, never committed in plain text)
# huggingface_token: "hf_xxx"

# System Configuration (consumed by templates/docker-compose.yml.j2)
vllm_restart_policy: "unless-stopped"
vllm_healthcheck_enabled: true
vllm_healthcheck_interval: "30s"
vllm_healthcheck_timeout: "10s"
vllm_healthcheck_retries: 3
Create playbook.yml:
---
# playbook.yml — entry point: optional Docker setup, vLLM deployment, verify.
- name: Deploy vLLM LLM Serving Engine
  hosts: vllm_servers
  become: true
  gather_facts: true

  vars:
    # Prerequisites for adding the Docker apt repository on Debian/Ubuntu.
    required_packages:
      - apt-transport-https
      - ca-certificates
      - curl
      - gnupg
      - lsb-release

  pre_tasks:
    - name: Gather facts if not already gathered
      ansible.builtin.setup:
      when: ansible_facts | length == 0

    # Probe for a working NVIDIA driver; `nvidia_check` is consumed by the
    # included docker_setup tasks. failed_when: false keeps CPU-only hosts
    # from aborting the play.
    - name: Validate GPU presence
      ansible.builtin.command: nvidia-smi
      register: nvidia_check
      changed_when: false
      failed_when: false

    - name: Report GPU status
      ansible.builtin.debug:
        # default(1) guards against Ansible versions where a missing binary
        # leaves `rc` undefined on the result.
        msg: "NVIDIA GPU detected: {{ 'Yes' if nvidia_check.rc | default(1) == 0 else 'No - CPU mode only' }}"

  tasks:
    - name: Include Docker setup
      ansible.builtin.include_tasks: tasks/docker_setup.yml
      when: vllm_deploy_method | default('docker') == 'docker'

    - name: Include vLLM Docker deployment
      ansible.builtin.include_tasks: tasks/vllm_docker.yml

    - name: Verify deployment
      ansible.builtin.include_tasks: tasks/verify.yml
---
# tasks/docker_setup.yml — install Docker Engine and the NVIDIA container
# stack. Expects `required_packages` and `nvidia_check` from the playbook.

- name: Install Docker prerequisites
  ansible.builtin.apt:
    name: "{{ required_packages }}"
    state: present
    update_cache: true
  when: ansible_os_family == "Debian"

# NOTE(review): apt_key is deprecated (apt-key has been removed from newer
# Debian/Ubuntu releases). Consider ansible.builtin.deb822_repository instead.
- name: Add Docker GPG key
  ansible.builtin.apt_key:
    url: https://download.docker.com/linux/ubuntu/gpg
    state: present
  when: ansible_os_family == "Debian"

- name: Add Docker repository
  ansible.builtin.apt_repository:
    repo: "deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
    state: present
  when: ansible_os_family == "Debian"

- name: Install Docker
  ansible.builtin.apt:
    name:
      - docker-ce
      - docker-ce-cli
      - containerd.io
      - docker-compose-plugin
    state: present
    update_cache: true
  # Guard added for consistency with the other apt tasks in this file.
  when: ansible_os_family == "Debian"

# default(1) guards against Ansible versions where a missing nvidia-smi
# binary leaves `rc` undefined on the registered result.
- name: Install NVIDIA Container Toolkit
  ansible.builtin.apt:
    name:
      - nvidia-container-toolkit
    state: present
  when: nvidia_check.rc | default(1) == 0

- name: Configure Docker for NVIDIA GPU
  ansible.builtin.copy:
    dest: /etc/docker/daemon.json
    content: |
      {
        "runtimes": {
          "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
          }
        }
      }
    owner: root
    group: root
    mode: "0644"
  notify: Restart Docker
  when: nvidia_check.rc | default(1) == 0

- name: Ensure Docker service is running
  ansible.builtin.systemd:
    name: docker
    state: started
    enabled: true

- name: Add user to docker group
  ansible.builtin.user:
    name: "{{ ansible_user }}"
    groups: docker
    append: true

# Idempotent by hand: `docker network create` exits non-zero with an
# "already exists" message when the network is present, which we tolerate.
- name: Create vLLM network
  ansible.builtin.command:
    cmd: "docker network create {{ vllm_network }}"
  register: network_create
  changed_when: network_create.rc == 0
  failed_when: network_create.rc != 0 and 'already exists' not in network_create.stderr
---
# tasks/vllm_docker.yml — create data dirs, render the compose file,
# pull the image, start the stack, and wait for the health endpoint.

# One loop instead of three copy-pasted tasks (models, HF cache, logs).
- name: Create vLLM data directories
  ansible.builtin.file:
    path: "{{ item }}"
    state: directory
    owner: "{{ ansible_user }}"
    group: "{{ ansible_user }}"
    mode: "0755"
  loop:
    - "{{ vllm_models_path }}"
    - "{{ vllm_cache_path }}"
    - "{{ vllm_logs_path }}"

- name: Generate Docker Compose file
  ansible.builtin.template:
    src: templates/docker-compose.yml.j2
    dest: "/home/{{ ansible_user }}/docker-compose.yml"
    owner: "{{ ansible_user }}"
    group: "{{ ansible_user }}"
    mode: "0644"

# FIX: the old changed_when matched 'Pulling', which docker prints even when
# the image is already current — the task always reported "changed". The
# "Downloaded newer image" status line only appears on an actual update.
- name: Pull vLLM Docker image
  ansible.builtin.command:
    cmd: "docker pull {{ vllm_docker_image }}"
  register: docker_pull
  changed_when: "'Status: Downloaded newer image' in docker_pull.stdout"

- name: Deploy vLLM with Docker Compose
  ansible.builtin.command:
    cmd: docker compose up -d
    chdir: "/home/{{ ansible_user }}"
  register: compose_up
  changed_when: compose_up.rc == 0

# Best-effort wait (ignore_errors) — large models can take minutes to load;
# verify.yml reports the final state either way.
- name: Wait for vLLM to be healthy
  ansible.builtin.uri:
    url: "http://localhost:{{ vllm_port }}/health"
    method: GET
    status_code: 200
  register: health_check
  retries: "{{ vllm_healthcheck_retries | int + 5 }}"
  delay: 10
  until: health_check.status == 200
  ignore_errors: true
---
# tasks/verify.yml — post-deploy smoke checks. All checks are best-effort
# (changed_when: false / ignore_errors) so verification never fails a host.

- name: Check container status
  ansible.builtin.command:
    # {{ '{{' }} / {{ '}}' }} escape the Go-template braces that `docker ps
    # --format` needs, so Jinja passes them through literally.
    cmd: docker ps -a --filter name={{ vllm_container_name }} --format "{{ '{{' }}.Status{{ '}}' }}"
  register: container_status
  changed_when: false

- name: Display container status
  ansible.builtin.debug:
    var: container_status.stdout

- name: Test API health endpoint
  ansible.builtin.uri:
    url: "http://localhost:{{ vllm_port }}/health"
    method: GET
    return_content: true
  register: api_health
  ignore_errors: true

- name: Display API health
  ansible.builtin.debug:
    var: api_health.status

- name: List available models
  ansible.builtin.uri:
    url: "http://localhost:{{ vllm_port }}/v1/models"
    method: GET
    return_content: true
  register: models_list
  ignore_errors: true

- name: Display models
  ansible.builtin.debug:
    var: models_list.json
# templates/docker-compose.yml.j2 — rendered per-host by Ansible.
# All templated scalars are quoted so empty/boolean-looking expansions
# cannot change the YAML type. The obsolete top-level `version:` key was
# dropped: Compose v2 ignores it and warns.
services:
  vllm:
    image: "{{ vllm_docker_image }}"
    container_name: "{{ vllm_container_name }}"
    hostname: "{{ inventory_hostname }}"
    ports:
      - "{{ vllm_port }}:8000"
    volumes:
      - "{{ vllm_models_path }}:/models"
      - "{{ vllm_cache_path }}:/root/.cache/huggingface"
      - "{{ vllm_logs_path }}:/var/log/vllm"
    environment:
{% if huggingface_token is defined %}
      - "HUGGING_FACE_HUB_TOKEN={{ huggingface_token }}"
{% endif %}
{% if vllm_api_key | default('') | length > 0 %}
      - "VLLM_API_KEY={{ vllm_api_key }}"
{% endif %}
      # e.g. vllm_gpu_count=2 -> "0,1" (simpler than the manual loop.last loop)
      - "CUDA_VISIBLE_DEVICES={{ range(vllm_gpu_count | default(1)) | join(',') }}"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: {{ vllm_gpu_count | default(1) }}
              capabilities: [gpu]
    command:
      - --model
      - "{{ vllm_model }}"
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
      - --max-model-len
      - "{{ vllm_max_model_len }}"
      - --gpu-memory-utilization
      - "{{ vllm_gpu_memory_utilization }}"
      - --tensor-parallel-size
      - "{{ vllm_tensor_parallel_size }}"
      - --max-num-seqs
      - "{{ vllm_max_num_seqs }}"
{% if vllm_enable_prefix_caching %}
      - --enable-prefix-caching
{% endif %}
{% if vllm_enable_chunked_prefill %}
      - --enable-chunked-prefill
{% endif %}
{% if vllm_served_model_name is defined %}
      - --served-model-name
      - "{{ vllm_served_model_name }}"
{% endif %}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: "{{ vllm_healthcheck_interval }}"
      timeout: "{{ vllm_healthcheck_timeout }}"
      retries: {{ vllm_healthcheck_retries }}
      # Generous grace period: model download/load can take minutes.
      start_period: 120s
    restart: "{{ vllm_restart_policy }}"
    logging:
      driver: json-file
      options:
        max-size: "100m"
        max-file: "5"

networks:
  default:
    name: "{{ vllm_network }}"
    # Created beforehand by tasks/docker_setup.yml.
    external: true
# Deploy to all vLLM servers
ansible-playbook -i inventory.yml playbook.yml
# Deploy with verbose output
ansible-playbook -i inventory.yml playbook.yml -vvv
# Deploy to specific host
ansible-playbook -i inventory.yml playbook.yml --limit vllm-prod-01
# Dry run (check mode)
ansible-playbook -i inventory.yml playbook.yml --check
# Create encrypted vault file
ansible-vault create vault.yml
# Add to vault.yml:
# ---
# huggingface_token: "hf_xxx"
# vllm_api_key: "sk-your-secret-key"
# Deploy with vault
ansible-playbook -i inventory.yml playbook.yml --ask-vault-pass
# Update with rolling restart (one host at a time)
# NOTE: for automated rolling updates, prefer `serial: 1` in the play header
# instead of running --limit once per host by hand.
ansible-playbook -i inventory.yml playbook.yml --limit vllm_servers[0]
ansible-playbook -i inventory.yml playbook.yml --limit vllm_servers[1]
Create handlers/main.yml:
---
# handlers/main.yml — notified by tasks (e.g. the daemon.json change in
# tasks/docker_setup.yml notifies "Restart Docker").

- name: Restart Docker
  ansible.builtin.systemd:
    name: docker
    state: restarted

- name: Restart vLLM
  ansible.builtin.command:
    cmd: docker compose restart
    # The compose file is rendered into the deploy user's home directory.
    chdir: "/home/{{ ansible_user }}"
# group_vars/high_throughput.yml — maximize aggregate throughput:
# big model across 2 GPUs, high memory utilization, many concurrent seqs.
vllm_model: meta-llama/Llama-2-70b-chat-hf
vllm_tensor_parallel_size: 2
vllm_gpu_count: 2
vllm_gpu_memory_utilization: 0.95
vllm_max_num_seqs: 512
vllm_enable_prefix_caching: true
vllm_enable_chunked_prefill: true

# group_vars/low_latency.yml — small model, short context, few concurrent
# sequences to keep per-request latency down.
vllm_model: meta-llama/Llama-2-7b-chat-hf
vllm_tensor_parallel_size: 1
vllm_gpu_count: 1
vllm_gpu_memory_utilization: 0.9
vllm_max_model_len: 2048
vllm_max_num_seqs: 128
vllm_enforce_eager: false

# group_vars/multi_model.yml — one entry per model to serve.
# NOTE(review): the playbook above deploys a single container per host and
# does not consume this list; extra tasks are required to use it.
vllm_models:
  - name: llama-2-7b
    path: meta-llama/Llama-2-7b-chat-hf
  - name: mistral-7b
    path: mistralai/Mistral-7B-Instruct-v0.2
# FIX: ad-hoc commands also need -i inventory.yml — without it Ansible uses
# the default inventory and matches no vllm_servers hosts.
# Tail recent container logs on every host
ansible vllm_servers -i inventory.yml -m command -a "docker logs {{ vllm_container_name }} --tail 100"
# Check GPU visibility from inside the container
ansible vllm_servers -i inventory.yml -m command -a "docker exec {{ vllm_container_name }} nvidia-smi"
# Hit the health endpoint from each host
ansible vllm_servers -i inventory.yml -m uri -a "url=http://localhost:{{ vllm_port }}/health method=GET"
Any questions?
Feel free to contact us. Find all contact information on our contact page.