# Voxtral Realtime: enable CUDA backend with int4 quantization #17798
@@ -88,8 +88,43 @@ python export_voxtral_rt.py \
|---------|---------|-----------|--------------|
| `xnnpack` | ✓ | ✓ | `4w`, `8w`, `8da4w`, `8da8w` |
| `metal` | ✓ | ✓ | none (fp32) or `fpa4w` (Metal-specific 4-bit) |
| `cuda` | ✓ | ✓ | `4w`, `8w`, `8da4w`, `8da8w` |

**Contributor:** Does CUDA support 8da4w/8da8w? Related, I'm pretty sure xnnpack does not support 4w/8w.

**Contributor (Author):** Good catch, will fix. xnnpack supports per-channel 4w and 8w. For example, we use 8w for token embeddings.

**Contributor:** ET's embedding CPU op supports weight-only schemes, but I don't think xnnpack supports weight-only quantization for linear layers. That said, 4w/8da4w and 8w/8da8w quantize weight data the same; the only difference is that the 8da variants add fake activation quantization in front.

**Contributor:** @manuelcandales, is there any plan for Metal AOTI to use int4/int8 for a more uniform experience? The kernel should support it, because I'm using int4/int8 with MLX.
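
To make the 4w vs. 8da4w distinction above concrete, here is a hedged sketch of the two export invocations being compared. The flag names come from the options table later in this README; whether every backend accepts the weight-only variants is exactly what this thread is questioning.

```bash
# Sketch only: per the thread above, both commands produce the same int4
# weight data; the 8da4w variant additionally inserts dynamic int8
# activation (fake-)quantization in front of each linear layer.

# Weight-only int4 (backend support under discussion):
python export_voxtral_rt.py \
  --model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
  --backend xnnpack \
  --qlinear 4w \
  --qlinear-group-size 32

# Dynamic int8 activations + int4 weights:
python export_voxtral_rt.py \
  --model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
  --backend xnnpack \
  --qlinear 8da4w \
  --qlinear-group-size 32
```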

Metal backend provides Apple GPU acceleration. CUDA backend provides NVIDIA GPU
acceleration via AOTInductor.

#### CUDA export examples

Offline with int4 quantization:

```bash
python export_voxtral_rt.py \
  --model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
  --backend cuda \
  --dtype bf16 \
  --output-dir ./voxtral_rt_exports \
  --qlinear-encoder 4w \
  --qlinear-encoder-packing-format tile_packed_to_4d \
  --qlinear 4w \
  --qlinear-packing-format tile_packed_to_4d \
  --qembedding 8w
```
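
A quick sanity check after the offline export; this assumes the CUDA export writes the program and delegate data files under the names used in the runner examples below:

```bash
# Both files should exist after a successful CUDA export.
ls -lh voxtral_rt_exports/model.pte voxtral_rt_exports/aoti_cuda_blob.ptd
```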

Streaming with int4 quantization:

```bash
python export_voxtral_rt.py \
  --model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
  --backend cuda \
  --dtype bf16 \
  --streaming \
  --output-dir ./voxtral_rt_exports \
  --qlinear-encoder 4w \
  --qlinear-encoder-packing-format tile_packed_to_4d \
  --qlinear 4w \
  --qlinear-packing-format tile_packed_to_4d \
  --qembedding 8w
```

**Contributor** (on `--streaming`): If this is supported, then why not test it in CI?

#### Metal export examples
@@ -133,14 +168,17 @@ EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_ex
| Flag | Default | Description |
|------|---------|-------------|
| `--model-path` | (required) | Directory with `params.json` + `consolidated.safetensors` |
| `--backend` | `xnnpack` | `xnnpack`, `metal`, `cuda`, or `portable` |
| `--dtype` | `fp32` | Model dtype: `fp32` or `bf16` |
| `--output-dir` | `./voxtral_rt_exports` | Output directory |
| `--max-seq-len` | `4096` | KV cache length |
| `--delay-tokens` | `6` | Transcription delay in tokens (6 = 480ms) |
| `--qlinear` | (none) | Decoder linear layer quantization (`4w`, `8w`, `8da4w`, `8da8w`, `fpa4w`) |
| `--qlinear-group-size` | `32` | Group size for decoder linear quantization |
| `--qlinear-packing-format` | (none) | Packing format for decoder 4w quantization (`tile_packed_to_4d` for CUDA) |
| `--qlinear-encoder` | (none) | Encoder linear layer quantization (`4w`, `8w`, `8da4w`, `8da8w`, `fpa4w`) |
| `--qlinear-encoder-group-size` | `32` | Group size for encoder linear quantization |
| `--qlinear-encoder-packing-format` | (none) | Packing format for encoder 4w quantization (`tile_packed_to_4d` for CUDA) |
| `--qembedding` | (none) | Embedding layer quantization (`8w`) |
| `--streaming` | off | Export streaming encoder with KV cache |
| `--max-enc-len` | `750` | Encoder sliding window size (streaming only) |
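
As a further illustration of how these flags combine, a hypothetical streaming export on the default XNNPACK backend; every value below is taken from the table above, though which quantization schemes each backend accepts is still being clarified in the review thread:

```bash
python export_voxtral_rt.py \
  --model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
  --backend xnnpack \
  --dtype fp32 \
  --streaming \
  --max-enc-len 750 \
  --max-seq-len 4096 \
  --delay-tokens 6 \
  --qlinear 8da4w \
  --qembedding 8w \
  --output-dir ./voxtral_rt_exports
```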
@@ -164,6 +202,15 @@ make voxtral_realtime-cpu
This builds ExecuTorch core libraries with XNNPACK, then the runner binary
at `cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner`.

### CUDA (NVIDIA GPU)

```bash
make voxtral_realtime-cuda
```

This builds ExecuTorch with CUDA backend support. The runner binary is at
the same path as above. Requires an NVIDIA GPU with the CUDA toolkit installed.
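
Before building, it can help to confirm the toolchain is visible; these are standard CUDA utilities, not part of this repo:

```bash
# Check that the CUDA compiler and a usable GPU are present.
nvcc --version
nvidia-smi
```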
|
|
||
| ### Metal (Apple GPU) | ||
|
|
||
| ```bash | ||
|
|
@@ -180,10 +227,22 @@ The runner requires:
- `tekken.json` — tokenizer from the model weights directory
- `preprocessor.pte` — mel spectrogram preprocessor (see [Preprocessor](#preprocessor))
- A 16kHz mono WAV audio file (or live audio via `--mic`)
- For CUDA: `aoti_cuda_blob.ptd` — delegate data file (pass via `--data_path`)

```bash
cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
  --model_path voxtral_rt_exports/model.pte \
  --tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \
  --preprocessor_path voxtral_rt_exports/preprocessor.pte \
  --audio_path input.wav
```

For CUDA, include the `.ptd` data file:

```bash
cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
  --model_path voxtral_rt_exports/model.pte \
  --data_path voxtral_rt_exports/aoti_cuda_blob.ptd \
  --tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \
  --preprocessor_path voxtral_rt_exports/preprocessor.pte \
  --audio_path input.wav
```
@@ -218,9 +277,13 @@ ffmpeg -f avfoundation -i ":0" -ar 16000 -ac 1 -f f32le -nostats -loglevel error

Ctrl+C stops recording and flushes remaining text.

**CUDA:** Add `--data_path voxtral_rt_exports/aoti_cuda_blob.ptd` to all run commands above when using the CUDA backend.
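
For example, a live-microphone run on CUDA might look like the sketch below; it assumes the `--mic` flag listed under the runner requirements works unchanged with the CUDA build:

```bash
cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
  --model_path voxtral_rt_exports/model.pte \
  --data_path voxtral_rt_exports/aoti_cuda_blob.ptd \
  --tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \
  --preprocessor_path voxtral_rt_exports/preprocessor.pte \
  --mic
```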

| Flag | Default | Description |
|------|---------|-------------|
| `--model_path` | `model.pte` | Path to exported model |
| `--data_path` | (none) | Path to delegate data file (`.ptd`, required for CUDA) |
| `--tokenizer_path` | `tekken.json` | Path to Tekken tokenizer |
| `--preprocessor_path` | (none) | Path to mel preprocessor `.pte` |
| `--audio_path` | (none) | Path to 16kHz mono WAV file |

**Review comment:** Why not streaming?