From 1c4e8755738df9721cd8a6ee835d7777955d609b Mon Sep 17 00:00:00 2001 From: karen-sy Date: Wed, 19 Nov 2025 11:12:40 -0800 Subject: [PATCH 01/13] try bumping vllm/flashinfer for v0.11.1 --- container/Dockerfile.vllm | 4 ++-- container/deps/vllm/install_vllm.sh | 4 ++-- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 186b406c38..d99f2f4c5d 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -14,9 +14,9 @@ ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" ARG CUDA_VERSION="12.8" # Make sure to update the dependency version in pyproject.toml when updating this -ARG VLLM_REF="v0.11.0" +ARG VLLM_REF="v0.11.1" # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds -ARG FLASHINF_REF="v0.3.1" +ARG FLASHINF_REF="v0.5.2" ARG TORCH_BACKEND="cu128" # If left blank, then we will fallback to vLLM defaults diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh index 0ebbb58823..97554976c4 100755 --- a/container/deps/vllm/install_vllm.sh +++ b/container/deps/vllm/install_vllm.sh @@ -13,7 +13,7 @@ set -euo pipefail -VLLM_REF="v0.11.0" +VLLM_REF="v0.11.1" # Basic Configurations ARCH=$(uname -m) @@ -29,7 +29,7 @@ CUDA_VERSION="12.8" # For DEEPGEMM # These flags are applicable when installing vLLM from source code EDITABLE=true VLLM_GIT_URL="https://github.com/vllm-project/vllm.git" -FLASHINF_REF="v0.3.1" +FLASHINF_REF="v0.5.2" while [[ $# -gt 0 ]]; do case $1 in diff --git a/pyproject.toml b/pyproject.toml index 675fbd0c70..dca0c6eaee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ trtllm =[ vllm = [ "uvloop", "nixl[cu12]<=0.7.1", - "vllm[flashinfer]==0.10.2", + "vllm[flashinfer]==0.11.1", ] sglang = [ From 53b4826baae9a42306fc1507969894d645c0e6cb Mon Sep 17 00:00:00 2001 From: karen-sy Date: Wed, 19 Nov 2025 21:09:12 -0800 Subject: [PATCH 02/13] upgrade to torch=2.9.0+cu129 --- container/Dockerfile.vllm | 10 +++++----- container/build.sh | 2 +- container/deps/vllm/install_vllm.sh | 4 ++-- .../deploy/container/Dockerfile.local_vllm | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index d99f2f4c5d..348f01a032 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -7,17 +7,17 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. 
-ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.01-cuda12.9-devel-ubuntu24.04" ARG ENABLE_KVBM=false ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" -ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" -ARG CUDA_VERSION="12.8" +ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" +ARG CUDA_VERSION="12.9" # Make sure to update the dependency version in pyproject.toml when updating this ARG VLLM_REF="v0.11.1" # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds ARG FLASHINF_REF="v0.5.2" -ARG TORCH_BACKEND="cu128" +ARG TORCH_BACKEND="cu129" # If left blank, then we will fallback to vLLM defaults ARG DEEPGEMM_REF="" @@ -206,7 +206,7 @@ RUN apt-get update && \ # prometheus dependencies ca-certificates \ # DeepGemm uses 'cuobjdump' which does not come with CUDA image - cuda-command-line-tools-12-8 && \ + cuda-command-line-tools-12-9 && \ rm -rf /var/lib/apt/lists/* # Copy CUDA development tools (nvcc, headers, dependencies, etc.) from base devel image diff --git a/container/build.sh b/container/build.sh index 2bc2327cc0..adc2f16a99 100755 --- a/container/build.sh +++ b/container/build.sh @@ -107,7 +107,7 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" +VLLM_BASE_IMAGE_TAG="25.01-cuda12.9-devel-ubuntu24.04" NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh index 97554976c4..de5460666b 100755 --- a/container/deps/vllm/install_vllm.sh +++ b/container/deps/vllm/install_vllm.sh @@ -21,10 +21,10 @@ MAX_JOBS=16 INSTALLATION_DIR=/tmp # VLLM and Dependency Configurations -TORCH_BACKEND="cu128" +TORCH_BACKEND="cu129" TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels DEEPGEMM_REF="" -CUDA_VERSION="12.8" # For DEEPGEMM +CUDA_VERSION="12.9" # For DEEPGEMM # These flags are applicable when installing vLLM from source code EDITABLE=true diff --git a/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm b/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm index 5a4631f9b7..3b790a3bb8 100644 --- a/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm +++ b/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm @@ -9,7 +9,7 @@ ARG LOCAL_VLLM_IMAGE="vllm-elastic-ep:latest_all2all_buffer_input" ARG DYNAMO_BASE_IMAGE="dynamo:latest-none" ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" -ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" +ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" # Other build arguments ARG PYTHON_VERSION=3.12 @@ -57,7 +57,7 @@ RUN apt-get update && \ # prometheus dependencies ca-certificates \ # DeepGemm uses 'cuobjdump' which does not come with CUDA image - cuda-command-line-tools-12-8 && \ + cuda-command-line-tools-12-9 && \ rm -rf /var/lib/apt/lists/* # Copy CUDA development tools from vLLM image (for JIT compilation) From 3679987bc377456408d031647b9778f56cd66a76 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Wed, 19 Nov 2025 21:26:13 -0800 Subject: [PATCH 03/13] ngc 25.04 --- container/Dockerfile.vllm | 5 +++-- container/build.sh | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 348f01a032..53967b7f78 100644 --- a/container/Dockerfile.vllm +++ 
b/container/Dockerfile.vllm @@ -3,11 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" -# TODO OPS-612: NCCL will hang with 25.03, so use 25.01 for now +# Note: CUDA 12.9 requires NGC 25.04+ (25.01 only has CUDA 12.8) +# TODO OPS-612: NCCL hang issue with 25.03 - verify if resolved in 25.04 # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -ARG BASE_IMAGE_TAG="25.01-cuda12.9-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" ARG ENABLE_KVBM=false ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" diff --git a/container/build.sh b/container/build.sh index adc2f16a99..495f31aaa7 100755 --- a/container/build.sh +++ b/container/build.sh @@ -103,11 +103,12 @@ TENSORRTLLM_PIP_WHEEL="" VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" -# FIXME: NCCL will hang with 25.03, so use 25.01 for now +# Note: CUDA 12.9 requires NGC 25.04+ (25.01 only has CUDA 12.8) +# FIXME: NCCL hang issue with 25.03 - verify if resolved in 25.04 # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -VLLM_BASE_IMAGE_TAG="25.01-cuda12.9-devel-ubuntu24.04" +VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" From b9ea0e902d9e61c8e2f30d75d3ca688abc4710b8 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Thu, 20 Nov 2025 10:52:14 -0800 Subject: [PATCH 04/13] minor up to ngc 25.06 for consistency --- container/Dockerfile.vllm | 2 +- container/build.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 53967b7f78..4d33d02914 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -8,7 +8,7 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -ARG BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" ARG ENABLE_KVBM=false ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" diff --git a/container/build.sh b/container/build.sh index 495f31aaa7..70e86c3bf1 100755 --- a/container/build.sh +++ b/container/build.sh @@ -104,11 +104,11 @@ TENSORRTLLM_PIP_WHEEL="" VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Note: CUDA 12.9 requires NGC 25.04+ (25.01 only has CUDA 12.8) -# FIXME: NCCL hang issue with 25.03 - verify if resolved in 25.04 +# FIXME: NCCL hang issue with 25.03 - verify if resolved in 25.6 # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. 
-VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" +VLLM_BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" From 379a1d2c5fc0fae7276800b1783031efcb9a7bd4 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Thu, 20 Nov 2025 11:20:58 -0800 Subject: [PATCH 05/13] ci seems to fail on 25.06 so revert to 25.04 --- container/Dockerfile.vllm | 2 +- container/build.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 4d33d02914..53967b7f78 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -8,7 +8,7 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" ARG ENABLE_KVBM=false ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" diff --git a/container/build.sh b/container/build.sh index 70e86c3bf1..495f31aaa7 100755 --- a/container/build.sh +++ b/container/build.sh @@ -104,11 +104,11 @@ TENSORRTLLM_PIP_WHEEL="" VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Note: CUDA 12.9 requires NGC 25.04+ (25.01 only has CUDA 12.8) -# FIXME: NCCL hang issue with 25.03 - verify if resolved in 25.6 +# FIXME: NCCL hang issue with 25.03 - verify if resolved in 25.04 # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -VLLM_BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" +VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" From 8858e98823bff3826b8f2e84a8e5e8733e94d816 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Thu, 20 Nov 2025 11:47:44 -0800 Subject: [PATCH 06/13] 0.11.2 --- container/Dockerfile.vllm | 2 +- container/deps/vllm/install_vllm.sh | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 53967b7f78..130d58ff98 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -15,7 +15,7 @@ ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" ARG CUDA_VERSION="12.9" # Make sure to update the dependency version in pyproject.toml when updating this -ARG VLLM_REF="v0.11.1" +ARG VLLM_REF="v0.11.2" # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds ARG FLASHINF_REF="v0.5.2" ARG TORCH_BACKEND="cu129" diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh index de5460666b..e424f3337e 100755 --- a/container/deps/vllm/install_vllm.sh +++ b/container/deps/vllm/install_vllm.sh @@ -13,7 +13,7 @@ set -euo pipefail -VLLM_REF="v0.11.1" +VLLM_REF="v0.11.2" # Basic Configurations ARCH=$(uname -m) diff --git a/pyproject.toml b/pyproject.toml index dca0c6eaee..80cc048c6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ trtllm =[ vllm = [ "uvloop", "nixl[cu12]<=0.7.1", - "vllm[flashinfer]==0.11.1", + "vllm[flashinfer]==0.11.2", ] sglang = [ From 3d138eb9218d5149aec9ab8d61728431d747a688 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Tue, 25 Nov 2025 09:58:14 -0800 Subject: [PATCH 07/13] remove 
--disable-log-requests deprecated flag --- .../sglang/slurm_jobs/scripts/vllm/benchmark_serving.py | 1 - examples/backends/vllm/deploy/agg_kvbm.yaml | 1 - examples/backends/vllm/deploy/disagg_kvbm.yaml | 2 -- examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml | 2 -- examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml | 2 -- recipes/llama-3-70b/vllm/agg/deploy.yaml | 2 +- recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml | 4 ++-- recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml | 4 ++-- tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml | 1 - tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml | 2 -- 10 files changed, 5 insertions(+), 16 deletions(-) diff --git a/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py b/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py index f9c67be7bc..a5962afe17 100644 --- a/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py +++ b/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py @@ -8,7 +8,6 @@ vLLM OpenAI API server vllm serve \ --swap-space 16 \ - --disable-log-requests (TGI backend) ./launch_tgi_server.sh diff --git a/examples/backends/vllm/deploy/agg_kvbm.yaml b/examples/backends/vllm/deploy/agg_kvbm.yaml index 62e28386aa..0c42380979 100644 --- a/examples/backends/vllm/deploy/agg_kvbm.yaml +++ b/examples/backends/vllm/deploy/agg_kvbm.yaml @@ -42,7 +42,6 @@ spec: - Qwen/Qwen3-8B - --gpu-memory-utilization - "0.45" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager diff --git a/examples/backends/vllm/deploy/disagg_kvbm.yaml b/examples/backends/vllm/deploy/disagg_kvbm.yaml index f4315a13cd..77c357ba38 100644 --- a/examples/backends/vllm/deploy/disagg_kvbm.yaml +++ b/examples/backends/vllm/deploy/disagg_kvbm.yaml @@ -35,7 +35,6 @@ spec: - Qwen/Qwen3-8B - --gpu-memory-utilization - "0.3" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager @@ -68,7 +67,6 @@ spec: - --is-prefill-worker - --gpu-memory-utilization - "0.3" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager diff --git a/examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml b/examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml index 1aa5281d09..d9626fc27a 100644 --- a/examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml +++ b/examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml @@ -35,7 +35,6 @@ spec: - Qwen/Qwen3-8B - --gpu-memory-utilization - "0.3" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager @@ -68,7 +67,6 @@ spec: - --is-prefill-worker - --gpu-memory-utilization - "0.3" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager diff --git a/examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml b/examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml index 439b17a91f..141ca375fa 100644 --- a/examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml +++ b/examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml @@ -37,7 +37,6 @@ spec: - Qwen/Qwen3-8B - --gpu-memory-utilization - "0.23" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager @@ -72,7 +71,6 @@ spec: - --is-prefill-worker - --gpu-memory-utilization - "0.23" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager diff --git a/recipes/llama-3-70b/vllm/agg/deploy.yaml b/recipes/llama-3-70b/vllm/agg/deploy.yaml index 2cca281b96..54078054d2 100644 --- a/recipes/llama-3-70b/vllm/agg/deploy.yaml +++ b/recipes/llama-3-70b/vllm/agg/deploy.yaml @@ -43,7 +43,7 @@ spec: - name: HF_HOME value: /opt/models args: - - "python3 -m 
dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c diff --git a/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml b/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml index 94acf7c846..b66870435a 100644 --- a/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml +++ b/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml @@ -43,7 +43,7 @@ spec: - name: HF_HOME value: /opt/models args: - - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c @@ -74,7 +74,7 @@ spec: - name: HF_HOME value: /opt/models args: - - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c diff --git a/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml b/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml index e67996f06a..7c91aaeda5 100644 --- a/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml +++ b/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml @@ -55,7 +55,7 @@ spec: - name: HF_HOME value: /opt/models args: - - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c @@ -98,7 +98,7 @@ spec: - name: HF_HOME value: /opt/models args: - - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c diff --git a/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml b/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml index fa76acfc2c..9de8641034 100644 --- a/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml +++ b/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml @@ -60,7 +60,6 @@ spec: - --model 
- deepseek-ai/DeepSeek-V2-Lite - --trust-remote-code - - --disable-log-requests - --tensor-parallel-size - "1" - --data-parallel-size diff --git a/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml b/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml index b45bcf97de..6e559164c7 100644 --- a/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml +++ b/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml @@ -63,7 +63,6 @@ spec: - --model - deepseek-ai/DeepSeek-V2-Lite - --trust-remote-code - - --disable-log-requests - --tensor-parallel-size - "1" - --data-parallel-size @@ -130,7 +129,6 @@ spec: - --model - deepseek-ai/DeepSeek-V2-Lite - --trust-remote-code - - --disable-log-requests - --is-prefill-worker - --tensor-parallel-size - "1" From 8aabd9e29e0b3c9ceb6f219ddcdde88fadc236a6 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Tue, 25 Nov 2025 10:34:04 -0800 Subject: [PATCH 08/13] lil syntax change for import STR_DTYPE_TO_TORCH_DTYPE --- .../kvbm/python/kvbm/vllm_integration/connector_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py index 3d2532602d..ef791d36ed 100644 --- a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py +++ b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py @@ -14,7 +14,7 @@ from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata from vllm.model_executor.models.utils import extract_layer_index -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata From 42b5c7ec9172a622b0b76ddb2d491196d327094b Mon Sep 17 00:00:00 2001 From: alec-flowers Date: Tue, 25 Nov 2025 18:57:15 +0000 Subject: [PATCH 09/13] fix KVBM connector API changes Signed-off-by: alec-flowers --- .../vllm_integration/connector/dynamo_connector.py | 12 ++++++++++-- .../kvbm/vllm_integration/connector/pd_connector.py | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py index 8d06db7055..a8bb948c0c 100644 --- a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py +++ b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py @@ -23,6 +23,7 @@ from vllm.config import VllmConfig from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request @@ -40,8 +41,15 @@ def __init__(self, metadata: bytes): class DynamoConnector(KVConnectorBase_V1): - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): - super().__init__(vllm_config=vllm_config, role=role) + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, + ): + super().__init__( + vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config + ) assert vllm_config.kv_transfer_config is not None assert vllm_config.kv_transfer_config.engine_id is not None diff --git a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py 
b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py index ceea2917ba..c89e554ea7 100644 --- a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py +++ b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py @@ -29,6 +29,7 @@ LMCacheConnectorV1, ) from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request @@ -46,8 +47,15 @@ class PdConnector(MultiConnector): - The second connector must be NIXL and will be used by decode worker to get KV blocks from prefill worker. """ - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): - super().__init__(vllm_config=vllm_config, role=role) + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: "KVCacheConfig", + ): + super().__init__( + vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config + ) if len(self._connectors) != 2: raise ValueError( f"PdConnector requires exactly two connectors (got {len(self._connectors)})" From 9b2a6944c6540e0df9602425b0794945a749fe33 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Tue, 25 Nov 2025 13:21:57 -0800 Subject: [PATCH 10/13] -W ignore::DeprecationWarning for SWIG/Scipy --- .github/actions/pytest/action.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actions/pytest/action.yml b/.github/actions/pytest/action.yml index cca684695b..5daef6da97 100644 --- a/.github/actions/pytest/action.yml +++ b/.github/actions/pytest/action.yml @@ -54,12 +54,13 @@ runs: # Run pytest with detailed output and JUnit XML set +e # Don't exit on test failures + # -W ignore::DeprecationWarning suppresses DeprecationWarnings from SWIG/SciPy (vLLM dependency). Remove once vLLM updates to a SciPy version which uses SWIG 4.4.x. docker run --runtime=nvidia --gpus all -w /workspace \ --cpus=${NUM_CPUS} \ --network host \ --name ${{ env.CONTAINER_ID }}_pytest \ ${{ inputs.image_tag }} \ - bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\"" + bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -W ignore::DeprecationWarning -m \"${{ inputs.pytest_marks }}\"" TEST_EXIT_CODE=$? echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV From f13bb23aec2f203feb91940f90fbd05cdd0f58d5 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Tue, 25 Nov 2025 16:27:17 -0800 Subject: [PATCH 11/13] add flashinfer-cubin, flashinfer-jit-cache --- container/deps/vllm/install_vllm.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh index e424f3337e..abaa0702a1 100755 --- a/container/deps/vllm/install_vllm.sh +++ b/container/deps/vllm/install_vllm.sh @@ -141,6 +141,8 @@ if [[ $VLLM_REF =~ ^v ]] && { [ "$ARCH" = "amd64" ] || { [ "$ARCH" = "arm64" ] & echo "Installing vLLM $VLLM_REF from PyPI... (ARCH=$ARCH, TORCH_BACKEND=$TORCH_BACKEND)" uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=$TORCH_BACKEND + uv pip install flashinfer-cubin==$FLASHINF_REF + uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') else # VLLM_REF does not start with 'v' or amd64 - use git checkout path From 8acd4c10c521dedff2c15df96a904684867a3886 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Tue, 25 Nov 2025 22:07:55 -0800 Subject: [PATCH 12/13] deepep pplx import tests --- tests/dependencies/test_vllm_imports.py | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/dependencies/test_vllm_imports.py diff --git a/tests/dependencies/test_vllm_imports.py b/tests/dependencies/test_vllm_imports.py new file mode 100644 index 0000000000..d26aedfec8 --- /dev/null +++ b/tests/dependencies/test_vllm_imports.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Unit tests to sanity check that required dependencies can be imported.""" + +import pytest + + +@pytest.mark.vllm +@pytest.mark.unit +@pytest.mark.gpu_1 +def test_import_deep_ep(): + """Test that deep_ep module can be imported.""" + try: + import deep_ep + + assert deep_ep is not None + except ImportError as e: + pytest.fail(f"Failed to import deep_ep: {e}") + + +@pytest.mark.vllm +@pytest.mark.unit +@pytest.mark.gpu_1 +def test_import_pplx_kernels(): + """Test that pplx_kernels module can be imported.""" + try: + import pplx_kernels + + assert pplx_kernels is not None + except ImportError as e: + pytest.fail(f"Failed to import pplx_kernels: {e}") From c75b0fa9833bd5b1bfd943f179914ddea04594f0 Mon Sep 17 00:00:00 2001 From: alec-flowers Date: Thu, 27 Nov 2025 00:13:35 +0000 Subject: [PATCH 13/13] fix for hang Signed-off-by: alec-flowers --- components/src/dynamo/vllm/args.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/components/src/dynamo/vllm/args.py b/components/src/dynamo/vllm/args.py index 4c0d0a0988..59155b5698 100644 --- a/components/src/dynamo/vllm/args.py +++ b/components/src/dynamo/vllm/args.py @@ -198,6 +198,20 @@ def parse_args() -> Config: args = parser.parse_args() engine_args = AsyncEngineArgs.from_cli_args(args) + # Workaround for vLLM GIL contention bug with NIXL connector when using UniProcExecutor. + # With TP=1, vLLM defaults to UniProcExecutor which runs scheduler and worker in the same + # process. This causes a hot loop in _process_engine_step that doesn't release the GIL, + # blocking NIXL's add_remote_agent from completing. Using "mp" backend forces separate + # processes, avoiding the GIL contention. + # See: https://github.com/vllm-project/vllm/issues/29369 + tp_size = getattr(engine_args, "tensor_parallel_size", None) or 1 + if tp_size == 1 and engine_args.distributed_executor_backend is None: + logger.info( + "Setting --distributed-executor-backend=mp for TP=1 to avoid " + "UniProcExecutor GIL contention with NIXL connector" + ) + engine_args.distributed_executor_backend = "mp" + if engine_args.enable_prefix_caching is None: logger.debug( "--enable-prefix-caching or --no-enable-prefix-caching not specified. Defaulting to True (vLLM v1 default behavior)"
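
A note on the TP=1 workaround in PATCH 13/13: the override only fires when no executor backend was chosen on the command line (engine_args.distributed_executor_backend is None), so an explicit flag always wins. A minimal sketch of exercising the same code path by hand, assuming dynamo.vllm forwards the standard vLLM engine flags as in the recipes above (the model name is illustrative):

    # Explicitly select the multiprocessing executor; the patch merely makes
    # this the default for TP=1 workers using the NIXL connector.
    python3 -m dynamo.vllm --model Qwen/Qwen3-8B \
        --tensor-parallel-size 1 \
        --distributed-executor-backend mp

Because the override checks for None first, users who set the flag explicitly are unaffected, and the default in args.py could presumably be dropped once the upstream GIL issue (vllm-project/vllm#29369) is resolved.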