Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .ci/scripts/export_model_artifact.sh
Original file line number Diff line number Diff line change
Expand Up @@ -257,10 +257,14 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then

# Map QUANT_NAME to the export flags for this model.
#   VR_QUANT_ARGS - quantization options for the export command
#   VR_DTYPE_ARGS - dtype option; only the tile-packed int4 config sets it
# Any QUANT_NAME not listed below leaves both variables empty.
VR_QUANT_ARGS=""
VR_DTYPE_ARGS=""
case "$QUANT_NAME" in
  quantized-8da4w)
    VR_QUANT_ARGS="--qlinear-encoder 8da4w --qlinear 8da4w --qlinear-group-size 32 --qembedding 8w"
    ;;
  quantized-int4-metal)
    VR_QUANT_ARGS="--qlinear-encoder fpa4w --qlinear fpa4w"
    ;;
  quantized-int4-tile-packed)
    VR_QUANT_ARGS="--qlinear-encoder 4w --qlinear-encoder-packing-format tile_packed_to_4d --qlinear 4w --qlinear-packing-format tile_packed_to_4d --qembedding 8w"
    VR_DTYPE_ARGS="--dtype bf16"
    ;;
esac

# Determine streaming mode based on MODE parameter
Expand All @@ -284,7 +288,8 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
--backend "$DEVICE" \
${STREAMING_ARG} \
--output-dir "${OUTPUT_DIR}" \
${VR_QUANT_ARGS}
${VR_QUANT_ARGS} \
${VR_DTYPE_ARGS}

# Export preprocessor
python -m executorch.extension.audio.mel_spectrogram ${PREPROCESSOR_ARGS}
Expand Down
4 changes: 4 additions & 0 deletions .ci/scripts/test_model_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,10 @@ EOF
;;
voxtral_realtime)
# Runner arguments shared by every device: model, tokenizer, preprocessor, audio input.
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
# On CUDA, also pass the delegate data blob, but only if it exists in MODEL_DIR.
case "$DEVICE" in
  cuda)
    if [ -f "${MODEL_DIR}/aoti_cuda_blob.ptd" ]; then
      RUNNER_ARGS="${RUNNER_ARGS} --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
    fi
    ;;
esac
# Determine streaming mode based on MODE parameter
USE_STREAMING="true"
if [ "$MODE" = "vr-offline" ]; then
Expand Down
36 changes: 34 additions & 2 deletions .github/workflows/cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ jobs:
model:
- repo: "mistralai"
name: "Voxtral-Mini-3B-2507"
- repo: "mistralai"
name: "Voxtral-Mini-4B-Realtime-2602"
- repo: "openai"
name: "whisper-small"
- repo: "openai"
Expand All @@ -152,6 +154,15 @@ jobs:
repo: "google"
name: "gemma-3-4b-it"
quant: "quantized-int4-weight-only"
# Voxtral Realtime only supports int4-tile-packed on CUDA (offline mode)
- model:
repo: "mistralai"
name: "Voxtral-Mini-4B-Realtime-2602"
quant: "non-quantized"
- model:
repo: "mistralai"
name: "Voxtral-Mini-4B-Realtime-2602"
quant: "quantized-int4-weight-only"
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
Expand Down Expand Up @@ -181,7 +192,12 @@ jobs:
echo "::endgroup::"
fi

source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
# Voxtral Realtime uses offline mode for CUDA CI (not streaming)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not streaming?

VR_MODE=""
if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
VR_MODE="vr-offline"
fi
source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "$VR_MODE"

test-model-cuda-e2e:
name: test-model-cuda-e2e
Expand All @@ -196,6 +212,8 @@ jobs:
model:
- repo: "mistralai"
name: "Voxtral-Mini-3B-2507"
- repo: "mistralai"
name: "Voxtral-Mini-4B-Realtime-2602"
- repo: "openai"
name: "whisper-small"
- repo: "openai"
Expand All @@ -214,6 +232,15 @@ jobs:
repo: "google"
name: "gemma-3-4b-it"
quant: "quantized-int4-weight-only"
# Voxtral Realtime only supports int4-tile-packed on CUDA (offline mode)
- model:
repo: "mistralai"
name: "Voxtral-Mini-4B-Realtime-2602"
quant: "non-quantized"
- model:
repo: "mistralai"
name: "Voxtral-Mini-4B-Realtime-2602"
quant: "quantized-int4-weight-only"
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
Expand All @@ -224,7 +251,12 @@ jobs:
download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
# Voxtral Realtime uses offline mode for CUDA CI (not streaming)
VR_MODE=""
if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
VR_MODE="vr-offline"
fi
source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "$VR_MODE"

test-cuda-pybind:
name: test-cuda-pybind
Expand Down
14 changes: 12 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# SUPPORTED MODELS:
# -----------------
# - voxtral: Multimodal voice + text model (CPU, CUDA, Metal)
# - voxtral_realtime: Realtime speech-to-text model (CPU)
# - voxtral_realtime: Realtime speech-to-text model (CPU, CUDA, Metal)
# - whisper: Speech recognition model (CPU, CUDA, Metal)
# - parakeet: Speech recognition model (CPU, CUDA, Metal)
# - sortformer: Speaker diarization model (CPU)
Expand Down Expand Up @@ -91,13 +91,14 @@
#
# ==============================================================================

.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help

help:
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@echo " voxtral-cuda - Build Voxtral runner with CUDA backend"
@echo " voxtral-cpu - Build Voxtral runner with CPU backend"
@echo " voxtral-metal - Build Voxtral runner with Metal backend (macOS only)"
@echo " voxtral_realtime-cuda - Build Voxtral Realtime runner with CUDA backend"
@echo " voxtral_realtime-cpu - Build Voxtral Realtime runner with CPU backend"
@echo " voxtral_realtime-metal - Build Voxtral Realtime runner with Metal backend (macOS only)"
@echo " whisper-cuda - Build Whisper runner with CUDA backend"
Expand Down Expand Up @@ -244,6 +245,15 @@ voxtral_realtime-metal:
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"

# voxtral_realtime-cuda: build the Voxtral Realtime runner for the CUDA backend.
# Runs two CMake workflow presets in order:
#   1. `llm-release-cuda` from the repository root (ExecuTorch with CUDA),
#   2. `voxtral-realtime-cuda` from examples/models/voxtral_realtime (the runner).
# The final message reports the runner binary under
# cmake-out/examples/models/voxtral_realtime/.
voxtral_realtime-cuda:
@echo "==> Building and installing ExecuTorch with CUDA..."
cmake --workflow --preset llm-release-cuda
@echo "==> Building Voxtral Realtime runner with CUDA..."
cd examples/models/voxtral_realtime && cmake --workflow --preset voxtral-realtime-cuda
@echo ""
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"

silero-vad-cpu:
@echo "==> Building and installing ExecuTorch..."
cmake --workflow --preset llm-release
Expand Down
33 changes: 33 additions & 0 deletions examples/models/voxtral_realtime/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,19 @@
"type": "equals",
"rhs": "Darwin"
}
},
{
"name": "voxtral-realtime-cuda",
"displayName": "Voxtral Realtime runner (CUDA)",
"inherits": ["voxtral-realtime-base"],
"cacheVariables": {
"EXECUTORCH_BUILD_CUDA": "ON"
},
"condition": {
"type": "inList",
"string": "${hostSystemName}",
"list": ["Linux", "Windows"]
}
}
],
"buildPresets": [
Expand All @@ -43,6 +56,12 @@
"configurePreset": "voxtral-realtime-metal",
"configuration": "Release",
"targets": ["voxtral_realtime_runner"]
},
{
  "name": "voxtral-realtime-cuda",
  "displayName": "Build Voxtral Realtime runner (CUDA)",
  "configurePreset": "voxtral-realtime-cuda",
  "configuration": "Release",
  "targets": ["voxtral_realtime_runner"]
}
],
"workflowPresets": [
Expand Down Expand Up @@ -73,6 +92,20 @@
"name": "voxtral-realtime-metal"
}
]
},
{
"name": "voxtral-realtime-cuda",
"displayName": "Configure and build Voxtral Realtime runner (CUDA)",
"steps": [
{
"type": "configure",
"name": "voxtral-realtime-cuda"
},
{
"type": "build",
"name": "voxtral-realtime-cuda"
}
]
}
]
}
67 changes: 65 additions & 2 deletions examples/models/voxtral_realtime/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,43 @@ python export_voxtral_rt.py \
|---------|---------|-----------|--------------|
| `xnnpack` | ✓ | ✓ | `4w`, `8w`, `8da4w`, `8da8w` |
| `metal` | ✓ | ✓ | none (fp32) or `fpa4w` (Metal-specific 4-bit) |
| `cuda` | ✓ | ✓ | `4w`, `8w` |
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does Cuda support 8da4w/8da8w?

Related, I'm pretty sure xnnpack does not support 4w/8w.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@metascroy

Does Cuda support 8da4w/8da8w?

Good catch, will fix.

Related, I'm pretty sure xnnpack does not support 4w/8w.

xnnpack supports per-channel 4w and 8w. For example, we use 8w for token embeddings.

Copy link
Contributor

@metascroy metascroy Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ET's embedding CPU op supports weight only schemes, but I don't think xnnpack supports weight-only quantization for linear layers.

With that said, 4w/8da4w and 8w/8da8w quantize weight data the same. The only difference is the 8da variants add fake activation quantization in front.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@manuelcandales is there any plan for metal aoti to use int4/int8 for a more uniform experience.

The kernel should support it because I'm using int4/int8 with MLX.


Metal backend provides Apple GPU acceleration.
Metal backend provides Apple GPU acceleration. CUDA backend provides NVIDIA GPU
acceleration via AOTInductor.

#### CUDA export examples

Offline with int4 quantization:

```bash
python export_voxtral_rt.py \
--model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
--backend cuda \
--dtype bf16 \
--output-dir ./voxtral_rt_exports \
--qlinear-encoder 4w \
--qlinear-encoder-packing-format tile_packed_to_4d \
--qlinear 4w \
--qlinear-packing-format tile_packed_to_4d \
--qembedding 8w
```

Streaming with int4 quantization:

```bash
python export_voxtral_rt.py \
--model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
--backend cuda \
--dtype bf16 \
--streaming \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if this is supported, then why not test it in CI?

--output-dir ./voxtral_rt_exports \
--qlinear-encoder 4w \
--qlinear-encoder-packing-format tile_packed_to_4d \
--qlinear 4w \
--qlinear-packing-format tile_packed_to_4d \
--qembedding 8w
```

#### Metal export examples

Expand Down Expand Up @@ -133,14 +168,17 @@ EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_ex
| Flag | Default | Description |
|------|---------|-------------|
| `--model-path` | (required) | Directory with `params.json` + `consolidated.safetensors` |
| `--backend` | `xnnpack` | `xnnpack`, `metal`, or `portable` |
| `--backend` | `xnnpack` | `xnnpack`, `metal`, `cuda`, or `portable` |
| `--dtype` | `fp32` | Model dtype: `fp32` or `bf16` |
| `--output-dir` | `./voxtral_rt_exports` | Output directory |
| `--max-seq-len` | `4096` | KV cache length |
| `--delay-tokens` | `6` | Transcription delay in tokens (6 = 480ms) |
| `--qlinear` | (none) | Decoder linear layer quantization (`4w`, `8w`, `8da4w`, `8da8w`, `fpa4w`) |
| `--qlinear-group-size` | `32` | Group size for decoder linear quantization |
| `--qlinear-packing-format` | (none) | Packing format for decoder 4w quantization (`tile_packed_to_4d` for CUDA) |
| `--qlinear-encoder` | (none) | Encoder linear layer quantization (`4w`, `8w`, `8da4w`, `8da8w`, `fpa4w`) |
| `--qlinear-encoder-group-size` | `32` | Group size for encoder linear quantization |
| `--qlinear-encoder-packing-format` | (none) | Packing format for encoder 4w quantization (`tile_packed_to_4d` for CUDA) |
| `--qembedding` | (none) | Embedding layer quantization (`8w`) |
| `--streaming` | off | Export streaming encoder with KV cache |
| `--max-enc-len` | `750` | Encoder sliding window size (streaming only) |
Expand All @@ -164,6 +202,15 @@ make voxtral_realtime-cpu
This builds ExecuTorch core libraries with XNNPACK, then the runner binary
at `cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner`.

### CUDA (NVIDIA GPU)

```bash
make voxtral_realtime-cuda
```

This builds ExecuTorch with CUDA backend support. The runner binary is at
the same path as above. Requires an NVIDIA GPU with the CUDA toolkit installed.

### Metal (Apple GPU)

```bash
Expand All @@ -180,10 +227,22 @@ The runner requires:
- `tekken.json` — tokenizer from the model weights directory
- `preprocessor.pte` — mel spectrogram preprocessor (see [Preprocessor](#preprocessor))
- A 16kHz mono WAV audio file (or live audio via `--mic`)
- For CUDA: `aoti_cuda_blob.ptd` — delegate data file (pass via `--data_path`)

```bash
cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
--model_path voxtral_rt_exports/model.pte \
--tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \
--preprocessor_path voxtral_rt_exports/preprocessor.pte \
--audio_path input.wav
```

For CUDA, include the `.ptd` data file:

```bash
cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
--model_path voxtral_rt_exports/model.pte \
--data_path voxtral_rt_exports/aoti_cuda_blob.ptd \
--tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \
--preprocessor_path voxtral_rt_exports/preprocessor.pte \
--audio_path input.wav
Expand Down Expand Up @@ -218,9 +277,13 @@ ffmpeg -f avfoundation -i ":0" -ar 16000 -ac 1 -f f32le -nostats -loglevel error

Ctrl+C stops recording and flushes remaining text.

**CUDA:** Add `--data_path voxtral_rt_exports/aoti_cuda_blob.ptd` to all
run commands above when using the CUDA backend.

| Flag | Default | Description |
|------|---------|-------------|
| `--model_path` | `model.pte` | Path to exported model |
| `--data_path` | (none) | Path to delegate data file (`.ptd`, required for CUDA) |
| `--tokenizer_path` | `tekken.json` | Path to Tekken tokenizer |
| `--preprocessor_path` | (none) | Path to mel preprocessor `.pte` |
| `--audio_path` | (none) | Path to 16kHz mono WAV file |
Expand Down
Loading
Loading