3 changes: 2 additions & 1 deletion .github/actions/pytest/action.yml
@@ -54,12 +54,13 @@ runs:
# Run pytest with detailed output and JUnit XML
set +e # Don't exit on test failures

# -W ignore::DeprecationWarning suppresses DeprecationWarnings from SWIG/SciPy (vLLM dependency). Remove once vLLM updates to a SciPy version which uses SWIG 4.4.x.
docker run --runtime=nvidia --gpus all -w /workspace \
--cpus=${NUM_CPUS} \
--network host \
--name ${{ env.CONTAINER_ID }}_pytest \
${{ inputs.image_tag }} \
bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -W ignore::DeprecationWarning -m \"${{ inputs.pytest_marks }}\""

TEST_EXIT_CODE=$?
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
17 changes: 9 additions & 8 deletions container/Dockerfile.vllm
@@ -3,21 +3,22 @@
# SPDX-License-Identifier: Apache-2.0

ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# TODO OPS-612: NCCL will hang with 25.03, so use 25.01 for now
# Note: CUDA 12.9 requires NGC 25.04+ (25.01 only has CUDA 12.8)
# TODO OPS-612: NCCL hang issue with 25.03 - verify if resolved in 25.04
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
# for details and reproducer to manually test if the image
# can be updated to later versions.
ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
ARG BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04"
ARG ENABLE_KVBM=false
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.8"
ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.9"

# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.11.0"
ARG VLLM_REF="v0.11.2"
# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
ARG FLASHINF_REF="v0.3.1"
ARG TORCH_BACKEND="cu128"
ARG FLASHINF_REF="v0.5.2"
ARG TORCH_BACKEND="cu129"

# If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF=""
@@ -206,7 +207,7 @@ RUN apt-get update && \
# prometheus dependencies
ca-certificates \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-12-8 && \
cuda-command-line-tools-12-9 && \
rm -rf /var/lib/apt/lists/*

# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from base devel image
5 changes: 3 additions & 2 deletions container/build.sh
@@ -102,11 +102,12 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc2"
TENSORRTLLM_PIP_WHEEL=""

VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# FIXME: NCCL will hang with 25.03, so use 25.01 for now
# Note: CUDA 12.9 requires NGC 25.04+ (25.01 only has CUDA 12.8)
# FIXME: NCCL hang issue with 25.03 - verify if resolved in 25.04
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
# for details and reproducer to manually test if the image
# can be updated to later versions.
VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04"

NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
10 changes: 6 additions & 4 deletions container/deps/vllm/install_vllm.sh
@@ -13,23 +13,23 @@

set -euo pipefail

VLLM_REF="v0.11.0"
VLLM_REF="v0.11.2"

# Basic Configurations
ARCH=$(uname -m)
MAX_JOBS=16
INSTALLATION_DIR=/tmp

# VLLM and Dependency Configurations
TORCH_BACKEND="cu128"
TORCH_BACKEND="cu129"
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels
DEEPGEMM_REF=""
CUDA_VERSION="12.8" # For DEEPGEMM
CUDA_VERSION="12.9" # For DEEPGEMM

# These flags are applicable when installing vLLM from source code
EDITABLE=true
VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
FLASHINF_REF="v0.3.1"
FLASHINF_REF="v0.5.2"

while [[ $# -gt 0 ]]; do
case $1 in
@@ -141,6 +141,8 @@ if [[ $VLLM_REF =~ ^v ]] && { [ "$ARCH" = "amd64" ] || { [ "$ARCH" = "arm64" ] &
echo "Installing vLLM $VLLM_REF from PyPI... (ARCH=$ARCH, TORCH_BACKEND=$TORCH_BACKEND)"

uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=$TORCH_BACKEND
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

else
# VLLM_REF does not start with 'v' or amd64 - use git checkout path
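For clarity, here is a small sketch of what the `cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')` expression in the new `flashinfer-jit-cache` install line evaluates to. The version value is only an example of what the script sets, not something pinned by this change.

```python
# Mirrors the shell pipeline: keep major.minor, drop the dot, prefix with "cu".
cuda_version = "12.9"  # example; install_vllm.sh sets CUDA_VERSION
major_minor = ".".join(cuda_version.split(".")[:2])   # "12.9"
suffix = "cu" + major_minor.replace(".", "")          # "cu129"
index_url = f"https://flashinfer.ai/whl/{suffix}"
print(index_url)  # https://flashinfer.ai/whl/cu129
```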
(file name not shown)
@@ -8,7 +8,6 @@
vLLM OpenAI API server
vllm serve <your_model> \
--swap-space 16 \
--disable-log-requests

(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
1 change: 0 additions & 1 deletion examples/backends/vllm/deploy/agg_kvbm.yaml
@@ -42,7 +42,6 @@ spec:
- Qwen/Qwen3-8B
- --gpu-memory-utilization
- "0.45"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
2 changes: 0 additions & 2 deletions examples/backends/vllm/deploy/disagg_kvbm.yaml
@@ -35,7 +35,6 @@ spec:
- Qwen/Qwen3-8B
- --gpu-memory-utilization
- "0.3"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
@@ -68,7 +67,6 @@ spec:
- --is-prefill-worker
- --gpu-memory-utilization
- "0.3"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
2 changes: 0 additions & 2 deletions examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml
@@ -35,7 +35,6 @@ spec:
- Qwen/Qwen3-8B
- --gpu-memory-utilization
- "0.3"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
@@ -68,7 +67,6 @@ spec:
- --is-prefill-worker
- --gpu-memory-utilization
- "0.3"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
2 changes: 0 additions & 2 deletions examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml
@@ -37,7 +37,6 @@ spec:
- Qwen/Qwen3-8B
- --gpu-memory-utilization
- "0.23"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
@@ -72,7 +71,6 @@ spec:
- --is-prefill-worker
- --gpu-memory-utilization
- "0.23"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
(file name not shown)
@@ -23,6 +23,7 @@
from vllm.config import VllmConfig
from vllm.forward_context import ForwardContext
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request


@@ -40,8 +41,15 @@ def __init__(self, metadata: bytes):


class DynamoConnector(KVConnectorBase_V1):
def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
super().__init__(vllm_config=vllm_config, role=role)
def __init__(
self,
vllm_config: "VllmConfig",
role: KVConnectorRole,
kv_cache_config: Optional["KVCacheConfig"] = None,
):
super().__init__(
vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
)

assert vllm_config.kv_transfer_config is not None
assert vllm_config.kv_transfer_config.engine_id is not None
(file name not shown)
@@ -29,6 +29,7 @@
LMCacheConnectorV1,
)
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request


@@ -46,8 +47,15 @@ class PdConnector(MultiConnector):
- The second connector must be NIXL and will be used by decode worker to get KV blocks from prefill worker.
"""

def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
super().__init__(vllm_config=vllm_config, role=role)
def __init__(
self,
vllm_config: "VllmConfig",
role: KVConnectorRole,
kv_cache_config: "KVCacheConfig",
):
super().__init__(
vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
)
if len(self._connectors) != 2:
raise ValueError(
f"PdConnector requires exactly two connectors (got {len(self._connectors)})"
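Both connector changes above follow the same pattern: newer vLLM constructs KV connectors with a third `kv_cache_config` argument, so subclasses need to accept it and forward it to the base class. Below is a minimal sketch of that pattern; the import paths are taken from the modules referenced in this diff, the base-class signature is assumed to match vLLM v0.11.2, and `ExampleConnector` is a hypothetical name.

```python
from typing import Optional

from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import (  # assumed location
    KVConnectorBase_V1,
    KVConnectorRole,
)
from vllm.v1.kv_cache_interface import KVCacheConfig


class ExampleConnector(KVConnectorBase_V1):
    """Hypothetical connector showing the updated constructor shape."""

    def __init__(
        self,
        vllm_config: "VllmConfig",
        role: KVConnectorRole,
        kv_cache_config: Optional["KVCacheConfig"] = None,
    ):
        # Forward the new argument so the engine can pass it through unchanged.
        super().__init__(
            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
        )
```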
(file name not shown)
@@ -14,7 +14,7 @@
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
from vllm.model_executor.models.utils import extract_layer_index
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata
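The import change above reflects `STR_DTYPE_TO_TORCH_DTYPE` moving between vLLM releases. If this module ever needs to tolerate both layouts, a guarded import along these lines would work; treating `ImportError` as the version boundary is an assumption.

```python
try:
    # Newer vLLM: helper lives in vllm.utils.torch_utils (as imported in this PR).
    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
except ImportError:
    # Older vLLM releases exported it from vllm.utils.
    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
```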
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -56,7 +56,7 @@ trtllm =[
vllm = [
"uvloop",
"nixl[cu12]<=0.7.1",
"vllm[flashinfer]==0.11.0",
"vllm[flashinfer]==0.11.2",
]

sglang = [
2 changes: 1 addition & 1 deletion recipes/llama-3-70b/vllm/agg/deploy.yaml
@@ -43,7 +43,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
4 changes: 2 additions & 2 deletions recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
@@ -43,7 +43,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
@@ -74,7 +74,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
4 changes: 2 additions & 2 deletions recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
@@ -55,7 +55,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
@@ -98,7 +98,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
32 changes: 32 additions & 0 deletions tests/dependencies/test_vllm_imports.py
@@ -0,0 +1,32 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Unit tests to sanity check that required dependencies can be imported."""

import pytest


@pytest.mark.vllm
@pytest.mark.unit
@pytest.mark.gpu_1
def test_import_deep_ep():
"""Test that deep_ep module can be imported."""
try:
import deep_ep

assert deep_ep is not None
except ImportError as e:
pytest.fail(f"Failed to import deep_ep: {e}")


@pytest.mark.vllm
@pytest.mark.unit
@pytest.mark.gpu_1
def test_import_pplx_kernels():
"""Test that pplx_kernels module can be imported."""
try:
import pplx_kernels

assert pplx_kernels is not None
except ImportError as e:
pytest.fail(f"Failed to import pplx_kernels: {e}")
4 changes: 2 additions & 2 deletions tests/fault_tolerance/deploy/container/Dockerfile.local_vllm
@@ -9,7 +9,7 @@
ARG LOCAL_VLLM_IMAGE="vllm-elastic-ep:latest_all2all_buffer_input"
ARG DYNAMO_BASE_IMAGE="dynamo:latest-none"
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"

# Other build arguments
ARG PYTHON_VERSION=3.12
@@ -57,7 +57,7 @@ RUN apt-get update && \
# prometheus dependencies
ca-certificates \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-12-8 && \
cuda-command-line-tools-12-9 && \
rm -rf /var/lib/apt/lists/*

# Copy CUDA development tools from vLLM image (for JIT compilation)
1 change: 0 additions & 1 deletion tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml
@@ -60,7 +60,6 @@ spec:
- --model
- deepseek-ai/DeepSeek-V2-Lite
- --trust-remote-code
- --disable-log-requests
- --tensor-parallel-size
- "1"
- --data-parallel-size
2 changes: 0 additions & 2 deletions tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml
@@ -63,7 +63,6 @@ spec:
- --model
- deepseek-ai/DeepSeek-V2-Lite
- --trust-remote-code
- --disable-log-requests
- --tensor-parallel-size
- "1"
- --data-parallel-size
@@ -130,7 +129,6 @@ spec:
- --model
- deepseek-ai/DeepSeek-V2-Lite
- --trust-remote-code
- --disable-log-requests
- --is-prefill-worker
- --tensor-parallel-size
- "1"