From 1c4e8755738df9721cd8a6ee835d7777955d609b Mon Sep 17 00:00:00 2001 From: karen-sy Date: Wed, 19 Nov 2025 11:12:40 -0800 Subject: [PATCH 01/13] try bumping vllm/flashinfer for v0.11.1 --- container/Dockerfile.vllm | 4 ++-- container/deps/vllm/install_vllm.sh | 4 ++-- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 186b406c38..d99f2f4c5d 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -14,9 +14,9 @@ ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" ARG CUDA_VERSION="12.8" # Make sure to update the dependency version in pyproject.toml when updating this -ARG VLLM_REF="v0.11.0" +ARG VLLM_REF="v0.11.1" # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds -ARG FLASHINF_REF="v0.3.1" +ARG FLASHINF_REF="v0.5.2" ARG TORCH_BACKEND="cu128" # If left blank, then we will fallback to vLLM defaults diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh index 0ebbb58823..97554976c4 100755 --- a/container/deps/vllm/install_vllm.sh +++ b/container/deps/vllm/install_vllm.sh @@ -13,7 +13,7 @@ set -euo pipefail -VLLM_REF="v0.11.0" +VLLM_REF="v0.11.1" # Basic Configurations ARCH=$(uname -m) @@ -29,7 +29,7 @@ CUDA_VERSION="12.8" # For DEEPGEMM # These flags are applicable when installing vLLM from source code EDITABLE=true VLLM_GIT_URL="https://github.com/vllm-project/vllm.git" -FLASHINF_REF="v0.3.1" +FLASHINF_REF="v0.5.2" while [[ $# -gt 0 ]]; do case $1 in diff --git a/pyproject.toml b/pyproject.toml index 675fbd0c70..dca0c6eaee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ trtllm =[ vllm = [ "uvloop", "nixl[cu12]<=0.7.1", - "vllm[flashinfer]==0.10.2", + "vllm[flashinfer]==0.11.1", ] sglang = [ From 53b4826baae9a42306fc1507969894d645c0e6cb Mon Sep 17 00:00:00 2001 From: karen-sy Date: Wed, 19 Nov 2025 21:09:12 -0800 Subject: [PATCH 02/13] upgrade to torch=2.9.0+cu129 --- container/Dockerfile.vllm | 10 +++++----- container/build.sh | 2 +- container/deps/vllm/install_vllm.sh | 4 ++-- .../deploy/container/Dockerfile.local_vllm | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index d99f2f4c5d..348f01a032 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -7,17 +7,17 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. 
-ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.01-cuda12.9-devel-ubuntu24.04" ARG ENABLE_KVBM=false ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" -ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" -ARG CUDA_VERSION="12.8" +ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" +ARG CUDA_VERSION="12.9" # Make sure to update the dependency version in pyproject.toml when updating this ARG VLLM_REF="v0.11.1" # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds ARG FLASHINF_REF="v0.5.2" -ARG TORCH_BACKEND="cu128" +ARG TORCH_BACKEND="cu129" # If left blank, then we will fallback to vLLM defaults ARG DEEPGEMM_REF="" @@ -206,7 +206,7 @@ RUN apt-get update && \ # prometheus dependencies ca-certificates \ # DeepGemm uses 'cuobjdump' which does not come with CUDA image - cuda-command-line-tools-12-8 && \ + cuda-command-line-tools-12-9 && \ rm -rf /var/lib/apt/lists/* # Copy CUDA development tools (nvcc, headers, dependencies, etc.) from base devel image diff --git a/container/build.sh b/container/build.sh index 2bc2327cc0..adc2f16a99 100755 --- a/container/build.sh +++ b/container/build.sh @@ -107,7 +107,7 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" +VLLM_BASE_IMAGE_TAG="25.01-cuda12.9-devel-ubuntu24.04" NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh index 97554976c4..de5460666b 100755 --- a/container/deps/vllm/install_vllm.sh +++ b/container/deps/vllm/install_vllm.sh @@ -21,10 +21,10 @@ MAX_JOBS=16 INSTALLATION_DIR=/tmp # VLLM and Dependency Configurations -TORCH_BACKEND="cu128" +TORCH_BACKEND="cu129" TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels DEEPGEMM_REF="" -CUDA_VERSION="12.8" # For DEEPGEMM +CUDA_VERSION="12.9" # For DEEPGEMM # These flags are applicable when installing vLLM from source code EDITABLE=true diff --git a/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm b/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm index 5a4631f9b7..3b790a3bb8 100644 --- a/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm +++ b/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm @@ -9,7 +9,7 @@ ARG LOCAL_VLLM_IMAGE="vllm-elastic-ep:latest_all2all_buffer_input" ARG DYNAMO_BASE_IMAGE="dynamo:latest-none" ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" -ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" +ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" # Other build arguments ARG PYTHON_VERSION=3.12 @@ -57,7 +57,7 @@ RUN apt-get update && \ # prometheus dependencies ca-certificates \ # DeepGemm uses 'cuobjdump' which does not come with CUDA image - cuda-command-line-tools-12-8 && \ + cuda-command-line-tools-12-9 && \ rm -rf /var/lib/apt/lists/* # Copy CUDA development tools from vLLM image (for JIT compilation) From 3679987bc377456408d031647b9778f56cd66a76 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Wed, 19 Nov 2025 21:26:13 -0800 Subject: [PATCH 03/13] ngc 25.04 --- container/Dockerfile.vllm | 5 +++-- container/build.sh | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 348f01a032..53967b7f78 100644 --- a/container/Dockerfile.vllm +++ 
b/container/Dockerfile.vllm @@ -3,11 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" -# TODO OPS-612: NCCL will hang with 25.03, so use 25.01 for now +# Note: CUDA 12.9 requires NGC 25.04+ (25.01 only has CUDA 12.8) +# TODO OPS-612: NCCL hang issue with 25.03 - verify if resolved in 25.04 # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -ARG BASE_IMAGE_TAG="25.01-cuda12.9-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" ARG ENABLE_KVBM=false ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" diff --git a/container/build.sh b/container/build.sh index adc2f16a99..495f31aaa7 100755 --- a/container/build.sh +++ b/container/build.sh @@ -103,11 +103,12 @@ TENSORRTLLM_PIP_WHEEL="" VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" -# FIXME: NCCL will hang with 25.03, so use 25.01 for now +# Note: CUDA 12.9 requires NGC 25.04+ (25.01 only has CUDA 12.8) +# FIXME: NCCL hang issue with 25.03 - verify if resolved in 25.04 # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -VLLM_BASE_IMAGE_TAG="25.01-cuda12.9-devel-ubuntu24.04" +VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" From b9ea0e902d9e61c8e2f30d75d3ca688abc4710b8 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Thu, 20 Nov 2025 10:52:14 -0800 Subject: [PATCH 04/13] minor up to ngc 25.06 for consistency --- container/Dockerfile.vllm | 2 +- container/build.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 53967b7f78..4d33d02914 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -8,7 +8,7 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -ARG BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" ARG ENABLE_KVBM=false ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" diff --git a/container/build.sh b/container/build.sh index 495f31aaa7..70e86c3bf1 100755 --- a/container/build.sh +++ b/container/build.sh @@ -104,11 +104,11 @@ TENSORRTLLM_PIP_WHEEL="" VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Note: CUDA 12.9 requires NGC 25.04+ (25.01 only has CUDA 12.8) -# FIXME: NCCL hang issue with 25.03 - verify if resolved in 25.04 +# FIXME: NCCL hang issue with 25.03 - verify if resolved in 25.6 # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. 
-VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" +VLLM_BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" From 379a1d2c5fc0fae7276800b1783031efcb9a7bd4 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Thu, 20 Nov 2025 11:20:58 -0800 Subject: [PATCH 05/13] ci seems to fail on 25.06 so revert to 25.04 --- container/Dockerfile.vllm | 2 +- container/build.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 4d33d02914..53967b7f78 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -8,7 +8,7 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" +ARG BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" ARG ENABLE_KVBM=false ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" diff --git a/container/build.sh b/container/build.sh index 70e86c3bf1..495f31aaa7 100755 --- a/container/build.sh +++ b/container/build.sh @@ -104,11 +104,11 @@ TENSORRTLLM_PIP_WHEEL="" VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # Note: CUDA 12.9 requires NGC 25.04+ (25.01 only has CUDA 12.8) -# FIXME: NCCL hang issue with 25.03 - verify if resolved in 25.6 +# FIXME: NCCL hang issue with 25.03 - verify if resolved in 25.04 # Please check https://github.com/ai-dynamo/dynamo/pull/1065 # for details and reproducer to manually test if the image # can be updated to later versions. -VLLM_BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" +VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" From 8858e98823bff3826b8f2e84a8e5e8733e94d816 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Thu, 20 Nov 2025 11:47:44 -0800 Subject: [PATCH 06/13] 0.11.2 --- container/Dockerfile.vllm | 2 +- container/deps/vllm/install_vllm.sh | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 53967b7f78..130d58ff98 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -15,7 +15,7 @@ ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04" ARG CUDA_VERSION="12.9" # Make sure to update the dependency version in pyproject.toml when updating this -ARG VLLM_REF="v0.11.1" +ARG VLLM_REF="v0.11.2" # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds ARG FLASHINF_REF="v0.5.2" ARG TORCH_BACKEND="cu129" diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh index de5460666b..e424f3337e 100755 --- a/container/deps/vllm/install_vllm.sh +++ b/container/deps/vllm/install_vllm.sh @@ -13,7 +13,7 @@ set -euo pipefail -VLLM_REF="v0.11.1" +VLLM_REF="v0.11.2" # Basic Configurations ARCH=$(uname -m) diff --git a/pyproject.toml b/pyproject.toml index dca0c6eaee..80cc048c6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ trtllm =[ vllm = [ "uvloop", "nixl[cu12]<=0.7.1", - "vllm[flashinfer]==0.11.1", + "vllm[flashinfer]==0.11.2", ] sglang = [ From 3d138eb9218d5149aec9ab8d61728431d747a688 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Tue, 25 Nov 2025 09:58:14 -0800 Subject: [PATCH 07/13] remove 
--disable-log-requests deprecated flag --- .../sglang/slurm_jobs/scripts/vllm/benchmark_serving.py | 1 - examples/backends/vllm/deploy/agg_kvbm.yaml | 1 - examples/backends/vllm/deploy/disagg_kvbm.yaml | 2 -- examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml | 2 -- examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml | 2 -- recipes/llama-3-70b/vllm/agg/deploy.yaml | 2 +- recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml | 4 ++-- recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml | 4 ++-- tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml | 1 - tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml | 2 -- 10 files changed, 5 insertions(+), 16 deletions(-) diff --git a/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py b/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py index f9c67be7bc..a5962afe17 100644 --- a/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py +++ b/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py @@ -8,7 +8,6 @@ vLLM OpenAI API server vllm serve \ --swap-space 16 \ - --disable-log-requests (TGI backend) ./launch_tgi_server.sh diff --git a/examples/backends/vllm/deploy/agg_kvbm.yaml b/examples/backends/vllm/deploy/agg_kvbm.yaml index 62e28386aa..0c42380979 100644 --- a/examples/backends/vllm/deploy/agg_kvbm.yaml +++ b/examples/backends/vllm/deploy/agg_kvbm.yaml @@ -42,7 +42,6 @@ spec: - Qwen/Qwen3-8B - --gpu-memory-utilization - "0.45" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager diff --git a/examples/backends/vllm/deploy/disagg_kvbm.yaml b/examples/backends/vllm/deploy/disagg_kvbm.yaml index f4315a13cd..77c357ba38 100644 --- a/examples/backends/vllm/deploy/disagg_kvbm.yaml +++ b/examples/backends/vllm/deploy/disagg_kvbm.yaml @@ -35,7 +35,6 @@ spec: - Qwen/Qwen3-8B - --gpu-memory-utilization - "0.3" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager @@ -68,7 +67,6 @@ spec: - --is-prefill-worker - --gpu-memory-utilization - "0.3" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager diff --git a/examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml b/examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml index 1aa5281d09..d9626fc27a 100644 --- a/examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml +++ b/examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml @@ -35,7 +35,6 @@ spec: - Qwen/Qwen3-8B - --gpu-memory-utilization - "0.3" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager @@ -68,7 +67,6 @@ spec: - --is-prefill-worker - --gpu-memory-utilization - "0.3" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager diff --git a/examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml b/examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml index 439b17a91f..141ca375fa 100644 --- a/examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml +++ b/examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml @@ -37,7 +37,6 @@ spec: - Qwen/Qwen3-8B - --gpu-memory-utilization - "0.23" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager @@ -72,7 +71,6 @@ spec: - --is-prefill-worker - --gpu-memory-utilization - "0.23" - - --disable-log-requests - --max-model-len - "32000" - --enforce-eager diff --git a/recipes/llama-3-70b/vllm/agg/deploy.yaml b/recipes/llama-3-70b/vllm/agg/deploy.yaml index 2cca281b96..54078054d2 100644 --- a/recipes/llama-3-70b/vllm/agg/deploy.yaml +++ b/recipes/llama-3-70b/vllm/agg/deploy.yaml @@ -43,7 +43,7 @@ spec: - name: HF_HOME value: /opt/models args: - - "python3 -m 
dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c diff --git a/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml b/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml index 94acf7c846..b66870435a 100644 --- a/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml +++ b/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml @@ -43,7 +43,7 @@ spec: - name: HF_HOME value: /opt/models args: - - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c @@ -74,7 +74,7 @@ spec: - name: HF_HOME value: /opt/models args: - - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c diff --git a/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml b/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml index e67996f06a..7c91aaeda5 100644 --- a/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml +++ b/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml @@ -55,7 +55,7 @@ spec: - name: HF_HOME value: /opt/models args: - - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c @@ -98,7 +98,7 @@ spec: - name: HF_HOME value: /opt/models args: - - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c diff --git a/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml b/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml index fa76acfc2c..9de8641034 100644 --- a/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml +++ b/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml @@ -60,7 +60,6 @@ spec: - --model 
- deepseek-ai/DeepSeek-V2-Lite - --trust-remote-code - - --disable-log-requests - --tensor-parallel-size - "1" - --data-parallel-size diff --git a/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml b/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml index b45bcf97de..6e559164c7 100644 --- a/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml +++ b/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml @@ -63,7 +63,6 @@ spec: - --model - deepseek-ai/DeepSeek-V2-Lite - --trust-remote-code - - --disable-log-requests - --tensor-parallel-size - "1" - --data-parallel-size @@ -130,7 +129,6 @@ spec: - --model - deepseek-ai/DeepSeek-V2-Lite - --trust-remote-code - - --disable-log-requests - --is-prefill-worker - --tensor-parallel-size - "1" From 8aabd9e29e0b3c9ceb6f219ddcdde88fadc236a6 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Tue, 25 Nov 2025 10:34:04 -0800 Subject: [PATCH 08/13] lil syntax change for import STR_DTYPE_TO_TORCH_DTYPE --- .../kvbm/python/kvbm/vllm_integration/connector_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py index 3d2532602d..ef791d36ed 100644 --- a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py +++ b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py @@ -14,7 +14,7 @@ from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata from vllm.model_executor.models.utils import extract_layer_index -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata From 42b5c7ec9172a622b0b76ddb2d491196d327094b Mon Sep 17 00:00:00 2001 From: alec-flowers Date: Tue, 25 Nov 2025 18:57:15 +0000 Subject: [PATCH 09/13] fix KVBM connector API changes Signed-off-by: alec-flowers --- .../vllm_integration/connector/dynamo_connector.py | 12 ++++++++++-- .../kvbm/vllm_integration/connector/pd_connector.py | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py index 8d06db7055..a8bb948c0c 100644 --- a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py +++ b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py @@ -23,6 +23,7 @@ from vllm.config import VllmConfig from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request @@ -40,8 +41,15 @@ def __init__(self, metadata: bytes): class DynamoConnector(KVConnectorBase_V1): - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): - super().__init__(vllm_config=vllm_config, role=role) + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, + ): + super().__init__( + vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config + ) assert vllm_config.kv_transfer_config is not None assert vllm_config.kv_transfer_config.engine_id is not None diff --git a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py 
b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py index ceea2917ba..c89e554ea7 100644 --- a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py +++ b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py @@ -29,6 +29,7 @@ LMCacheConnectorV1, ) from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request @@ -46,8 +47,15 @@ class PdConnector(MultiConnector): - The second connector must be NIXL and will be used by decode worker to get KV blocks from prefill worker. """ - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): - super().__init__(vllm_config=vllm_config, role=role) + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: "KVCacheConfig", + ): + super().__init__( + vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config + ) if len(self._connectors) != 2: raise ValueError( f"PdConnector requires exactly two connectors (got {len(self._connectors)})" From 9b2a6944c6540e0df9602425b0794945a749fe33 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Tue, 25 Nov 2025 13:21:57 -0800 Subject: [PATCH 10/13] -W ignore::DeprecationWarning for SWIG/Scipy --- .github/actions/pytest/action.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actions/pytest/action.yml b/.github/actions/pytest/action.yml index cca684695b..5daef6da97 100644 --- a/.github/actions/pytest/action.yml +++ b/.github/actions/pytest/action.yml @@ -54,12 +54,13 @@ runs: # Run pytest with detailed output and JUnit XML set +e # Don't exit on test failures + # -W ignore::DeprecationWarning suppresses DeprecationWarnings from SWIG/SciPy (vLLM dependency). Remove once vLLM updates to a SciPy version which uses SWIG 4.4.x. docker run --runtime=nvidia --gpus all -w /workspace \ --cpus=${NUM_CPUS} \ --network host \ --name ${{ env.CONTAINER_ID }}_pytest \ ${{ inputs.image_tag }} \ - bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\"" + bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -W ignore::DeprecationWarning -m \"${{ inputs.pytest_marks }}\"" TEST_EXIT_CODE=$? echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV From f13bb23aec2f203feb91940f90fbd05cdd0f58d5 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Tue, 25 Nov 2025 16:27:17 -0800 Subject: [PATCH 11/13] add flashinfer-cubin, flashinfer-jit-cache --- container/deps/vllm/install_vllm.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh index e424f3337e..abaa0702a1 100755 --- a/container/deps/vllm/install_vllm.sh +++ b/container/deps/vllm/install_vllm.sh @@ -141,6 +141,8 @@ if [[ $VLLM_REF =~ ^v ]] && { [ "$ARCH" = "amd64" ] || { [ "$ARCH" = "arm64" ] & echo "Installing vLLM $VLLM_REF from PyPI... (ARCH=$ARCH, TORCH_BACKEND=$TORCH_BACKEND)" uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=$TORCH_BACKEND + uv pip install flashinfer-cubin==$FLASHINF_REF + uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') else # VLLM_REF does not start with 'v' or amd64 - use git checkout path From 8acd4c10c521dedff2c15df96a904684867a3886 Mon Sep 17 00:00:00 2001 From: karen-sy Date: Tue, 25 Nov 2025 22:07:55 -0800 Subject: [PATCH 12/13] deepep pplx import tests --- tests/dependencies/test_vllm_imports.py | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/dependencies/test_vllm_imports.py diff --git a/tests/dependencies/test_vllm_imports.py b/tests/dependencies/test_vllm_imports.py new file mode 100644 index 0000000000..d26aedfec8 --- /dev/null +++ b/tests/dependencies/test_vllm_imports.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Unit tests to sanity check that required dependencies can be imported.""" + +import pytest + + +@pytest.mark.vllm +@pytest.mark.unit +@pytest.mark.gpu_1 +def test_import_deep_ep(): + """Test that deep_ep module can be imported.""" + try: + import deep_ep + + assert deep_ep is not None + except ImportError as e: + pytest.fail(f"Failed to import deep_ep: {e}") + + +@pytest.mark.vllm +@pytest.mark.unit +@pytest.mark.gpu_1 +def test_import_pplx_kernels(): + """Test that pplx_kernels module can be imported.""" + try: + import pplx_kernels + + assert pplx_kernels is not None + except ImportError as e: + pytest.fail(f"Failed to import pplx_kernels: {e}") From c75b0fa9833bd5b1bfd943f179914ddea04594f0 Mon Sep 17 00:00:00 2001 From: alec-flowers Date: Thu, 27 Nov 2025 00:13:35 +0000 Subject: [PATCH 13/13] fix for hang Signed-off-by: alec-flowers --- components/src/dynamo/vllm/args.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/components/src/dynamo/vllm/args.py b/components/src/dynamo/vllm/args.py index 4c0d0a0988..59155b5698 100644 --- a/components/src/dynamo/vllm/args.py +++ b/components/src/dynamo/vllm/args.py @@ -198,6 +198,20 @@ def parse_args() -> Config: args = parser.parse_args() engine_args = AsyncEngineArgs.from_cli_args(args) + # Workaround for vLLM GIL contention bug with NIXL connector when using UniProcExecutor. + # With TP=1, vLLM defaults to UniProcExecutor which runs scheduler and worker in the same + # process. This causes a hot loop in _process_engine_step that doesn't release the GIL, + # blocking NIXL's add_remote_agent from completing. Using "mp" backend forces separate + # processes, avoiding the GIL contention. + # See: https://github.com/vllm-project/vllm/issues/29369 + tp_size = getattr(engine_args, "tensor_parallel_size", None) or 1 + if tp_size == 1 and engine_args.distributed_executor_backend is None: + logger.info( + "Setting --distributed-executor-backend=mp for TP=1 to avoid " + "UniProcExecutor GIL contention with NIXL connector" + ) + engine_args.distributed_executor_backend = "mp" + if engine_args.enable_prefix_caching is None: logger.debug( "--enable-prefix-caching or --no-enable-prefix-caching not specified. Defaulting to True (vLLM v1 default behavior)"
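
A note on the TP=1 workaround in PATCH 13/13: the override only fires when no executor backend was chosen on the command line (engine_args.distributed_executor_backend is None), so an explicit flag always wins. A minimal sketch of exercising the same code path by hand, assuming dynamo.vllm forwards the standard vLLM engine flags as in the recipes above (the model name is illustrative):

    # Explicitly select the multiprocessing executor; the patch merely makes
    # this the default for TP=1 workers using the NIXL connector.
    python3 -m dynamo.vllm --model Qwen/Qwen3-8B \
        --tensor-parallel-size 1 \
        --distributed-executor-backend mp

Because the override checks for None first, users who set the flag explicitly are unaffected, and the default in args.py could presumably be dropped once the upstream GIL issue (vllm-project/vllm#29369) is resolved.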