From 1bdc2319ff4d7e7e30ba21d35868abfd74572c40 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Tue, 14 Apr 2026 22:08:03 +0000 Subject: [PATCH 1/7] Add Ministral-3-3B VLM recipe: hybrid Olive + Mobius export Complete olive recipe for Ministral-3-3B-Instruct-2512 VLM using: - Text decoder: Olive/ModelBuilder (INT4 for both CPU and CUDA) - Vision encoder + embedding: Mobius (dynamo-free ONNX construction) - Vision INT4 quantization: Olive post-export (CPU only) - context_length=32768, Permute3D transform in processor_config Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../builtin/.gitignore | 9 + .../builtin/README.md | 182 +++++++ .../builtin/cpu_and_mobile/text.json | 18 + .../builtin/cpu_and_mobile/vision.json | 18 + .../builtin/cuda/text.json | 31 ++ .../builtin/eval.py | 501 ++++++++++++++++++ .../builtin/inference.py | 163 ++++++ .../builtin/info.yml | 9 + .../builtin/optimize.py | 405 ++++++++++++++ .../builtin/requirements.txt | 4 + 10 files changed, 1340 insertions(+) create mode 100644 mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore create mode 100644 mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md create mode 100644 mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json create mode 100644 mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json create mode 100644 mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json create mode 100644 mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py create mode 100644 mistralai-Ministral-3-3B-Instruct-2512/builtin/inference.py create mode 100644 mistralai-Ministral-3-3B-Instruct-2512/builtin/info.yml create mode 100644 mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py create mode 100644 mistralai-Ministral-3-3B-Instruct-2512/builtin/requirements.txt diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore b/mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore new file mode 100644 index 
00000000..3b614474 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore @@ -0,0 +1,9 @@ +# Generated model artifacts +models/ + +# Python bytecode +__pycache__/ +*.pyc + +# Olive cache +.olive-cache/ diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md b/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md new file mode 100644 index 00000000..179fd84d --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md @@ -0,0 +1,182 @@ +# Ministral-3-3B ONNX Runtime GenAI Example + +This example demonstrates how to convert [Ministral-3-3B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) vision-language model to ONNX format using Olive and run inference with ONNX Runtime GenAI. + +Ministral-3-3B is a multimodal (VLM) model combining a Pixtral vision encoder with a Mistral text decoder using YaRN RoPE for extended context. The pipeline exports three sub-models: +- **Vision encoder** and **embedding** via [mobius](https://github.com/onnxruntime/mobius) (declarative ONNX graph construction); vision optionally INT4-quantized via Olive for CPU +- **Text decoder** via Olive/ModelBuilder (GQA + INT4/FP16 quantization) + +## Prerequisites + +```bash +pip install -r requirements.txt +``` + +Install ONNX Runtime GenAI: + +| Device | Install Command | +|--------|-----------------| +| CPU | `pip install onnxruntime-genai --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple` | +| GPU (CUDA) | `pip install onnxruntime-genai-cuda --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple` | + +## Steps + +### 1. 
Export & Optimize Models + +**CPU (INT4 all models):** + +```bash +python optimize.py --config-dir cpu_and_mobile --device cpu +``` + +**CUDA (INT4 text decoder, FP16 vision/embedding):** + +```bash +python optimize.py --config-dir cuda --device gpu +``` + +**With local dequantized checkpoint (skips FP8 dequant):** + +```bash +python optimize.py --config-dir cpu_and_mobile --device cpu --model-path /path/to/Ministral-3-3B-dequantized +``` + +This runs: +- **Olive/ModelBuilder** for text decoder (GQA attention, YaRN RoPE, INT4/FP16) +- **Mobius** for vision encoder (Pixtral, dynamic H×W, 2D RoPE) and embedding (token + image fusion) +- **Olive INT4 quantization** on vision (cpu_and_mobile only; embedding stays FP16) + +Then generates `genai_config.json` and `processor_config.json` for the ORT GenAI runtime. + +### 2. Output Structure + +``` +cpu_and_mobile/models/ # or cuda/models/ +├── decoder/ +│ ├── model.onnx # Text decoder (Mistral + YaRN) +│ └── model.onnx.data +├── vision/ +│ ├── model.onnx # Pixtral vision encoder (FP16) +│ └── model.onnx.data +├── embedding/ +│ ├── model.onnx # Embedding fusion model (FP16) +│ └── model.onnx.data +├── genai_config.json # Runtime configuration +├── processor_config.json # Pixtral image preprocessing +├── tokenizer.json +└── tokenizer_config.json +``` + +### 3. Run Inference + +```bash +# Text-only +python inference.py --prompt "What is the capital of France?" + +# Image + text +python inference.py --image photo.jpg --prompt "Describe this image" + +# Interactive mode +python inference.py --interactive + +# CUDA model +python inference.py --model_path cuda/models --prompt "Hello" +``` + +Alternatively, use the built-in GenAI multimodal demo: + +```bash +python -m onnxruntime_genai.models.model_mm -m cpu_and_mobile/models --max_length 4096 +``` + +### 4. 
Evaluate + +Run the AI2D science diagram QA benchmark: + +```bash +# ONNX only (CPU INT4) +python eval.py --device cpu --model_path cpu_and_mobile/models + +# ONNX only (CUDA FP16) +python eval.py --device cuda --model_path cuda/models + +# Compare ONNX vs PyTorch reference +python eval.py --pytorch_model mistralai/Ministral-3-3B-Instruct-2512 --num_samples 100 +``` + +Expected precision gaps (ONNX vs PyTorch): +- **FP32**: ~0 pp (exact parity) +- **FP16**: <2 pp (precision loss) +- **INT4**: <5 pp (quantization loss) + +## Directory Structure + +``` +mistralai-Ministral-3-3B-Instruct-2512/builtin/ +├── cpu_and_mobile/ +│ ├── text.json # INT4 text decoder config (Olive/ModelBuilder) +│ └── vision.json # INT4 vision quantization (Olive, post-mobius) +├── cuda/ +│ └── text.json # INT4 text decoder config (Olive/ModelBuilder) +├── optimize.py # Export orchestrator (Olive + Mobius) +├── inference.py # ORT GenAI inference (text + VLM) +├── eval.py # AI2D benchmark evaluation +├── requirements.txt +├── info.yml +└── README.md +``` + +> **Note:** Unlike Qwen VLM recipes (which use Olive for all 3 sub-models end-to-end), +> Ministral uses **mobius** for vision and embedding ONNX export, then **Olive** for +> INT4 quantization (cpu_and_mobile only). The CUDA target uses FP16 from mobius directly. + +## Differences from Qwen VLM Recipes + +Qwen VLM recipes export all three sub-models through Olive using JSON configs +(`text.json`, `vision.json`, `embedding.json`). Each JSON defines a multi-pass +pipeline: PyTorch export → graph surgery → ORT fusion → quantization/FP16. 
+ +This recipe takes a different approach for **vision and embedding**: + +| Component | Qwen | Ministral | Why | +|-----------|------|-----------|-----| +| Text decoder | Olive/ModelBuilder (`text.json`) | Olive/ModelBuilder (`text.json`) | Same — ModelBuilder handles GQA + quantization | +| Vision encoder | Olive: PyTorch export + 5-6 passes | **Mobius** export + Olive INT4 (`vision.json`) | Pixtral's dynamic image dims break `torch.onnx.export` | +| Embedding | Olive: PyTorch export + 5 passes | **Mobius** export (FP16, no INT4) | INT4 breaks embedding's Equal/Gather logic | + +**Why does Ministral use mobius instead of Olive for export?** Mobius constructs +the ONNX graph declaratively rather than tracing through PyTorch. The resulting +models already contain the graph optimizations that Qwen's Olive passes spend +5-6 steps creating: + +- **Fused operators:** `MultiHeadAttention`, `SkipSimplifiedLayerNormalization`, + `RotaryEmbedding` — already present in mobius output (Qwen achieves these via + `OrtTransformersOptimization`) +- **FP16 weights:** all 840M vision params exported as FP16 directly (Qwen + converts from FP32 via `OnnxFloatToFloat16`) +- **Clean graph:** 0 Gemm nodes, 0 redundant Cast chains (Qwen cleans these + via `GemmToMatMulAdd` and `OnnxPeepholeOptimizer`) +- **No PyTorch export artifacts:** no `PackedAttentionToLoopMHA` surgery needed + since mobius doesn't go through dynamo + +**What Olive still handles:** For `cpu_and_mobile`, `vision.json` applies +`OnnxBlockWiseRtnQuantization` (INT4) to the mobius-exported FP16 vision model. +For `cuda`, no additional Olive passes are needed — FP16 is optimal for GPU. 
+ +**Why optimize.py has more lines (~400) than Qwen (~170):** + +| Code section | Lines | Why it can't be JSON-driven | +|---|---|---| +| `export_vision_and_embedding()` | ~55 | Olive has no mobius integration; Pixtral's dynamic dims cause dynamo failures | +| `update_genai_config()` | ~150 | Olive generates decoder config only; VLM 3-model config + transforms-based processor_config has no Olive pass | +| `quantize_vision_and_embedding()` | ~25 | Post-export INT4 on pre-built ONNX (Olive JSON-driven, but needs orchestration) | +| `fix_tokenizer()` | ~15 | No Olive tokenizer patching pass | + +The text decoder export (`text.json`) and INT4 quantization (`vision.json`) ARE Olive JSON-driven — identical to Qwen. + +## Notes + +- The HuggingFace checkpoint uses FP8 quantized weights. The export pipeline dequantizes these automatically (`weight * weight_scale_inv`). +- The tokenizer uses `TokenizersBackend` class which genai doesn't support. The optimize script fixes this to `LlamaTokenizer`. +- Pixtral vision supports dynamic image sizes (multiples of 28, up to 1540×1540). +- The text decoder includes `llama_4_attn_scale` for long-context attention (>16K tokens). 
diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json new file mode 100644 index 00000000..df60e9c8 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json @@ -0,0 +1,18 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "mistralai/Ministral-3-3B-Instruct-2512" + }, + "passes": { + "convert": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "extra_options": { + "filename": "model.onnx" + } + } + }, + "no_artifacts": true, + "output_dir": "cpu_and_mobile/models/decoder" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json new file mode 100644 index 00000000..6aeaea2e --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json @@ -0,0 +1,18 @@ +{ + "input_model": { + "type": "ONNXModel", + "model_path": "cpu_and_mobile/models/vision/model.onnx" + }, + "passes": { + "int4": { + "type": "OnnxBlockWiseRtnQuantization", + "block_size": 128, + "is_symmetric": true, + "accuracy_level": 4, + "save_as_external_data": true, + "external_data_name": "model.onnx.data" + } + }, + "no_artifacts": true, + "output_dir": "cpu_and_mobile/models/vision" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json new file mode 100644 index 00000000..a445f666 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json @@ -0,0 +1,31 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "mistralai/Ministral-3-3B-Instruct-2512" + }, + "passes": { + "convert": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "extra_options": { + "filename": "model.onnx" + } + } + }, + "engine": { + "target": { + "type": "LocalSystem", + 
"accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "CUDAExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "cuda/models/decoder" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py new file mode 100644 index 00000000..f962f5f1 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py @@ -0,0 +1,501 @@ +"""Evaluate Ministral-3-3B VLM (ONNX) vs PyTorch on AI2D (diagram understanding). + +AI2D is a multiple-choice visual QA benchmark on scientific diagrams. +Each sample has an image, a question, four answer options, and a ground-truth answer. +Accuracy is the fraction of questions answered with the correct option letter. + +Expected precision gaps (ONNX vs PyTorch reference): + CPU + FP32 → expect ~0 pp gap (exact parity) + CUDA + FP16 → expect <2 pp gap (FP16 precision loss) + CPU + INT4 → expect <5 pp gap (quantization loss) + +Usage: + # CPU INT4 model (default) + python eval.py --device cpu --model_path cpu_and_mobile/models + + # CUDA FP16 model + python eval.py --device cuda --model_path cuda/models + + # Compare ONNX vs PyTorch reference + python eval.py --pytorch_model mistralai/Ministral-3-3B-Instruct-2512 + + # Larger sample + python eval.py --num_samples 200 +""" + +import argparse +import io +import json +import os +import re +import tempfile +import time + +import onnxruntime_genai as og +from datasets import load_dataset +from PIL import Image + +NUMBERS = ["1", "2", "3", "4"] + +# Expected accuracy gap thresholds (percentage points) by precision. +# These help users quickly assess whether a model export is healthy. +EXPECTED_GAP_PP = { + "fp32": 0.0, + "fp16": 2.0, + "int4": 5.0, +} + +DEFAULT_SYSTEM_PROMPT = ( + "You are a concise multiple-choice answering assistant. " + "When given a question with numbered options, respond with ONLY a single digit (1, 2, 3, or 4). 
" + "Do not include any explanation, reasoning, or other text — just the digit." +) + + +# --------------------------------------------------------------------------- +# Prompt helpers +# --------------------------------------------------------------------------- + + +def build_messages(question: str, options: list[str], system_prompt: str = "") -> str: + """Return a JSON-encoded chat messages list (for apply_chat_template).""" + option_text = "\n".join(f"{N}. {o}" for N, o in zip(NUMBERS, options)) + content = ( + f"Look at the diagram and answer the multiple-choice question.\n\n" + f"Question: {question}\n\n" + f"Options:\n{option_text}\n\n" + f"Reply with the number only (1, 2, 3, or 4)." + ) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append( + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": content}], + } + ) + return json.dumps(messages) + + +def parse_answer(text: str) -> str | None: + """Extract the first 1/2/3/4 digit from a model response.""" + text = text.strip() + m = re.search(r"\b([1-4])\b", text) + if m: + return m.group(1) + for ch in text: + if ch in NUMBERS: + return ch + return None + + +def ground_truth_number(sample: dict) -> str | None: + """Normalise the dataset's answer field to a 1-based number string. + + AI2D stores answer as a 0-based integer index into the options list. + We map: index 0 → '1', 1 → '2', 2 → '3', 3 → '4'. + """ + answer = sample.get("answer", "") + try: + idx = int(answer) + if 0 <= idx < 4: + return NUMBERS[idx] + except (ValueError, TypeError): + pass + return None + + +# --------------------------------------------------------------------------- +# Precision detection +# --------------------------------------------------------------------------- + + +def detect_onnx_precision(model_path: str) -> str: + """Infer ONNX model precision from the model directory or genai_config. + + Heuristics (in order): + 1. 
If genai_config.json exists and contains model builder metadata → use it + 2. If path contains 'int4' → 'int4' + 3. If path contains 'cpu_and_mobile' → 'int4' (default for CPU target) + 4. If path contains 'cuda' or 'fp16' → 'fp16' + 5. Fallback → 'fp16' + """ + path_lower = model_path.lower() + + # Check genai_config for precision hints + config_path = os.path.join(model_path, "genai_config.json") + if os.path.exists(config_path): + try: + with open(config_path) as f: + config = json.load(f) + # ModelBuilder writes precision into the config + decoder_cfg = config.get("model", {}).get("decoder", {}) + if "int4" in json.dumps(decoder_cfg).lower(): + return "int4" + except (json.JSONDecodeError, OSError): + pass + + if "int4" in path_lower: + return "int4" + if "cpu_and_mobile" in path_lower: + return "int4" + if "fp16" in path_lower or "cuda" in path_lower: + return "fp16" + return "fp16" + + +# --------------------------------------------------------------------------- +# Dataset helpers +# --------------------------------------------------------------------------- + + +def pil_from_sample(sample: dict) -> Image.Image | None: + """Return PIL image from a dataset sample regardless of field format.""" + img = sample.get("image") + if img is None: + return None + if isinstance(img, Image.Image): + return img.convert("RGB") + if isinstance(img, bytes): + return Image.open(io.BytesIO(img)).convert("RGB") + if isinstance(img, dict) and "bytes" in img: + return Image.open(io.BytesIO(img["bytes"])).convert("RGB") + return None + + +def load_ai2d(num_samples: int): + """Load a deterministic subset of AI2D test samples.""" + print(f"Loading AI2D dataset ({num_samples} samples)…") + ds = load_dataset("lmms-lab/ai2d", split="test") + ds = ds.select(range(min(num_samples, len(ds)))) + print(f" Loaded {len(ds)} samples.") + return ds + + +# --------------------------------------------------------------------------- +# ONNX inference +# 
--------------------------------------------------------------------------- + + +def build_onnx_runner(model_path: str): + """Load ONNX model with ORT GenAI.""" + print(f"\nLoading ONNX model from: {model_path}") + model = og.Model(model_path) + processor = model.create_multimodal_processor() + tokenizer = og.Tokenizer(model) + print(" ONNX model loaded.") + return model, processor, tokenizer + + +def run_onnx( + model, processor, tokenizer, pil_image: Image.Image, messages_json: str +) -> str: + """Run a single inference with the ONNX GenAI model.""" + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + pil_image.save(f, format="PNG") + tmp_path = f.name + + try: + images = og.Images.open(tmp_path) + prompt = tokenizer.apply_chat_template( + messages_json, add_generation_prompt=True + ) + inputs = processor(prompt, images=images) + + params = og.GeneratorParams(model) + params.set_search_options(max_length=2000, do_sample=False) + + generator = og.Generator(model, params) + generator.set_inputs(inputs) + + tokens = [] + while not generator.is_done(): + generator.generate_next_token() + tokens.append(generator.get_next_tokens()[0]) + del generator + + return tokenizer.decode(tokens) + finally: + os.unlink(tmp_path) + + +# --------------------------------------------------------------------------- +# PyTorch inference +# --------------------------------------------------------------------------- + + +def build_pytorch_runner(model_id: str, device: str = "auto"): + """Load HuggingFace PyTorch model for comparison. + + Args: + model_id: HuggingFace model ID or local path. + device: 'cpu', 'cuda', or 'auto' (auto-detect). 
+ """ + print(f"\nLoading PyTorch model: {model_id}") + import torch + from transformers import AutoProcessor, Mistral3ForConditionalGeneration + + if device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + dtype = torch.float16 if device == "cuda" else torch.float32 + precision_label = "fp16" if device == "cuda" else "fp32" + print(f" Device: {device}, dtype: {dtype} ({precision_label})") + + pt_model = Mistral3ForConditionalGeneration.from_pretrained( + model_id, dtype=dtype, trust_remote_code=True + ).to(device) + pt_proc = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) + print(" PyTorch model loaded.") + return pt_model, pt_proc, device, precision_label + + +def run_pytorch( + pt_model, + pt_proc, + pil_image: Image.Image, + question: str, + options: list[str], + device: str, + system_prompt: str = "", +) -> str: + """Run a single inference with the HuggingFace PyTorch model.""" + import torch + + option_text = "\n".join(f"{N}. {o}" for N, o in zip(NUMBERS, options)) + content = ( + f"Look at the diagram and answer the multiple-choice question.\n\n" + f"Question: {question}\n\n" + f"Options:\n{option_text}\n\n" + f"Reply with the number only (1, 2, 3, or 4)." 
+ ) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append( + { + "role": "user", + "content": [ + {"type": "image", "image": pil_image}, + {"type": "text", "text": content}, + ], + } + ) + text = pt_proc.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + inputs = pt_proc( + text=[text], images=[pil_image], padding=True, return_tensors="pt" + ).to(device) + + with torch.no_grad(): + out = pt_model.generate(**inputs, max_new_tokens=8, do_sample=False) + + out_ids = out[0][inputs["input_ids"].shape[-1] :] + return pt_proc.decode(out_ids, skip_special_tokens=True) + + +# --------------------------------------------------------------------------- +# Evaluation loop +# --------------------------------------------------------------------------- + + +def evaluate(dataset, runner_fn, label: str) -> dict: + """Run evaluation on a dataset with the given runner function.""" + correct = 0 + skipped = 0 + total = len(dataset) + latencies = [] + + print(f"\n{'=' * 60}") + print(f" Evaluating: {label} ({total} samples)") + print(f"{'=' * 60}") + + for i, sample in enumerate(dataset): + gt = ground_truth_number(sample) + if gt is None: + skipped += 1 + continue + + pil_image = pil_from_sample(sample) + if pil_image is None: + skipped += 1 + continue + + question = sample.get("question", "") + options = sample.get("options", []) + if len(options) < 2: + skipped += 1 + continue + + try: + t0 = time.perf_counter() + raw = runner_fn(pil_image, question, options) + elapsed = time.perf_counter() - t0 + latencies.append(elapsed) + except Exception as e: + print(f" [WARN] sample {i}: {e}") + skipped += 1 + continue + + pred = parse_answer(raw) + hit = pred == gt + + if (i + 1) % 10 == 0 or i == 0: + print( + f" [{i + 1:4d}/{total}] gt={gt} pred={pred} raw={raw.strip()!r:20} " + f"{'✓' if hit else '✗'} running_acc={correct / (i + 1 - skipped + 1e-9):.3f}" + ) + + if hit: + correct += 1 + + 
evaluated = total - skipped + accuracy = correct / evaluated if evaluated > 0 else 0.0 + avg_lat = sum(latencies) / len(latencies) if latencies else 0.0 + + print( + f"\n {label}: {correct}/{evaluated} correct | " + f"accuracy = {accuracy:.4f} ({accuracy * 100:.2f}%)" + ) + print(f" avg latency per sample: {avg_lat:.2f}s | skipped: {skipped}") + return { + "label": label, + "accuracy": accuracy, + "correct": correct, + "evaluated": evaluated, + "avg_latency_s": avg_lat, + "skipped": skipped, + } + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser( + description="Eval ONNX vs PyTorch Ministral-3-3B VLM on AI2D" + ) + parser.add_argument( + "--model_path", + default="cpu_and_mobile/models", + help="Path to ONNX model dir (default: cpu_and_mobile/models/)", + ) + parser.add_argument( + "--pytorch_model", + default=None, + help="HuggingFace model ID for PyTorch comparison", + ) + parser.add_argument( + "--num_samples", + type=int, + default=100, + help="Number of AI2D test samples to evaluate (default: 100)", + ) + parser.add_argument( + "--device", + choices=["cpu", "cuda", "auto"], + default="auto", + help="Device for inference: cpu, cuda, or auto-detect (default: auto)", + ) + parser.add_argument( + "--skip_onnx", + action="store_true", + help="Skip ONNX evaluation", + ) + parser.add_argument( + "--system_prompt", + default=DEFAULT_SYSTEM_PROMPT, + help="System prompt to suppress chain-of-thought. 
Pass empty string to disable.", + ) + args = parser.parse_args() + + ds = load_ai2d(args.num_samples) + results = [] + + sys_prompt = args.system_prompt + if sys_prompt: + print(f"\nSystem prompt: {sys_prompt!r}") + else: + print("\nSystem prompt: (none)") + + # Detect ONNX precision from model path + onnx_precision = detect_onnx_precision(args.model_path) + + # ---- ONNX ---- + if not args.skip_onnx: + onnx_model, onnx_proc, onnx_tok = build_onnx_runner(args.model_path) + + def onnx_runner(pil_image, question, options): + msgs = build_messages(question, options, sys_prompt) + return run_onnx(onnx_model, onnx_proc, onnx_tok, pil_image, msgs) + + onnx_label = f"ONNX ({onnx_precision.upper()}) @ {args.model_path}" + results.append(evaluate(ds, onnx_runner, onnx_label)) + + # ---- PyTorch (optional) ---- + pt_precision = None + if args.pytorch_model: + pt_model, pt_proc, pt_device, pt_precision = build_pytorch_runner( + args.pytorch_model, device=args.device + ) + + def pt_runner(pil_image, question, options): + return run_pytorch( + pt_model, pt_proc, pil_image, question, options, pt_device, sys_prompt + ) + + pt_label = f"PyTorch ({pt_precision.upper()}) @ {args.pytorch_model}" + results.append(evaluate(ds, pt_runner, pt_label)) + + # ---- Summary ---- + print(f"\n{'=' * 60}") + print(" EVALUATION SUMMARY") + print(f"{'=' * 60}") + print(" Model : Ministral-3-3B-Instruct-2512 (VLM)") + print(" Dataset : AI2D (science diagram QA, multiple choice)") + print(f" Samples : {args.num_samples}") + print(f" ONNX prec : {onnx_precision.upper()}") + if pt_precision: + print(f" PyTorch prec: {pt_precision.upper()}") + print( + f" System prompt: " + f"{'(none)' if not sys_prompt else sys_prompt[:80] + ('...' 
if len(sys_prompt) > 80 else '')}" + ) + print() + for r in results: + print(f" {r['label']}") + print( + f" Accuracy : {r['accuracy'] * 100:.2f}% ({r['correct']}/{r['evaluated']})" + ) + print(f" Avg lat : {r['avg_latency_s']:.2f}s/sample") + print() + + if len(results) == 2: + delta = results[0]["accuracy"] - results[1]["accuracy"] + abs_delta = abs(delta) * 100 + print(f" Accuracy delta (ONNX - PyTorch): {delta * 100:+.2f} pp") + print( + f" Speedup (PyTorch lat / ONNX lat): " + f"{results[1]['avg_latency_s'] / max(results[0]['avg_latency_s'], 1e-9):.2f}x" + ) + + # Precision gap assessment + expected_gap = EXPECTED_GAP_PP.get(onnx_precision, 5.0) + print() + print(f" Expected gap for {onnx_precision.upper()}: <{expected_gap:.0f} pp") + if abs_delta <= expected_gap: + print(f" ✓ PASS — {abs_delta:.2f} pp gap is within expected range") + else: + print( + f" ✗ WARN — {abs_delta:.2f} pp gap exceeds expected {expected_gap:.0f} pp for {onnx_precision.upper()}" + ) + print( + " This may indicate a quality regression in the export pipeline." + ) + + +if __name__ == "__main__": + main() diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/inference.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/inference.py new file mode 100644 index 00000000..1b799c1b --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/inference.py @@ -0,0 +1,163 @@ +"""ONNX Runtime GenAI inference for Ministral-3-3B vision-language model. + +Usage: + python inference.py --prompt "What is the capital of France?" 
+ python inference.py --image photo.jpg --prompt "Describe this image" + python inference.py --interactive + python inference.py --model_path cuda/models --prompt "Hello" +""" + +import argparse +import json +import time + +import onnxruntime_genai as og + + +def main(): + parser = argparse.ArgumentParser( + description="ONNX Runtime GenAI inference for Ministral-3-3B" + ) + parser.add_argument( + "--model_path", + type=str, + default="cpu_and_mobile/models", + help="Path to model directory containing genai_config.json and ONNX models", + ) + parser.add_argument("--image", type=str, default=None, help="Path to image file") + parser.add_argument("--prompt", type=str, default=None, help="Text prompt") + parser.add_argument( + "--max_length", type=int, default=4096, help="Maximum total tokens" + ) + parser.add_argument( + "--interactive", action="store_true", help="Run in interactive mode" + ) + args = parser.parse_args() + + print(f"Loading model from: {args.model_path}") + model = og.Model(args.model_path) + processor = model.create_multimodal_processor() + tokenizer = og.Tokenizer(model) + tokenizer_stream = processor.create_stream() + + if args.interactive: + interactive_mode(model, processor, tokenizer, tokenizer_stream, args) + elif args.prompt: + generate_response( + model, + processor, + tokenizer, + tokenizer_stream, + args.prompt, + args.image, + args.max_length, + ) + else: + print("Please provide --prompt or --interactive") + parser.print_help() + + +def generate_response( + model, processor, tokenizer, tokenizer_stream, prompt, image_path, max_length=4096 +): + """Run a single generation.""" + images = None + if image_path: + print(f"Loading image: {image_path}") + images = og.Images.open(image_path) + messages = [ + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": prompt}], + } + ] + else: + messages = [{"role": "user", "content": prompt}] + + full_prompt = tokenizer.apply_chat_template( + json.dumps(messages), 
add_generation_prompt=True + ) + print(f"\nPrompt: {prompt}") + if image_path: + print(f"Image: {image_path}") + print("\nGenerating response...") + + inputs = processor(full_prompt, images=images) + params = og.GeneratorParams(model) + params.set_search_options(max_length=max_length) + + generator = og.Generator(model, params) + generator.set_inputs(inputs) + + token_count = 0 + ttft = None + t_start = time.perf_counter() + + print("\nResponse: ", end="", flush=True) + while not generator.is_done(): + generator.generate_next_token() + if ttft is None: + ttft = time.perf_counter() - t_start + token_count += 1 + new_token = generator.get_next_tokens()[0] + print(tokenizer_stream.decode(new_token), end="", flush=True) + + t_total = time.perf_counter() - t_start + print() + del generator + + decode_tokens = max(token_count - 1, 1) + decode_time = t_total - (ttft or 0) + tps = decode_tokens / decode_time if decode_time > 0 else 0 + + print(f"\n Tokens generated : {token_count}") + print(f" TTFT : {(ttft or 0) * 1000:.1f} ms") + print(f" Decode TPS : {tps:.1f} tokens/sec") + print(f" Total time : {t_total:.2f} s") + + +def interactive_mode(model, processor, tokenizer, tokenizer_stream, args): + """Run in interactive mode.""" + print("\n" + "=" * 50) + print("Interactive Mode - Enter 'quit' to stop") + print("To include an image: image:/path/to/image.jpg your prompt") + print("=" * 50 + "\n") + + while True: + try: + user_input = input("You: ").strip() + except EOFError: + break + + if user_input.lower() in ("quit", "exit"): + break + if not user_input: + continue + + image_path = None + prompt = user_input + if user_input.startswith("image:"): + parts = user_input.split(" ", 1) + image_path = parts[0][6:] + prompt = parts[1] if len(parts) > 1 else "Describe this image" + + try: + generate_response( + model, + processor, + tokenizer, + tokenizer_stream, + prompt, + image_path, + args.max_length, + ) + except Exception as e: + print(f"Error: {e}") + + print("-" * 50 + 
"\n") + + print("Goodbye!") + + +if __name__ == "__main__": + main() diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/info.yml b/mistralai-Ministral-3-3B-Instruct-2512/builtin/info.yml new file mode 100644 index 00000000..a7b84368 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/info.yml @@ -0,0 +1,9 @@ +keywords: + olive-ai +ep: + CPUExecutionProvider + CUDAExecutionProvider +device: + cpu + gpu +name: ministral_3_3b diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py new file mode 100644 index 00000000..c1c59428 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py @@ -0,0 +1,405 @@ +"""End-to-end optimization pipeline for Ministral-3-3B ONNX models. + +Uses mobius for vision and embedding export (reliable dynamo-free ONNX +construction), and Olive/ModelBuilder for text decoder export (GQA + INT4). + +Pipeline: + cpu_and_mobile: Olive/ModelBuilder(INT4) → mobius(FP16) → Olive INT4 quant + cuda: Olive/ModelBuilder(FP16) → mobius(FP16) + +Architecture difference from Qwen VLM recipes: + Qwen uses Olive passes for all 3 sub-models (export + optimization). + Ministral uses mobius for vision/embedding because Pixtral's dynamic + image dimensions cause torch.onnx.export/dynamo failures. Mobius + produces already-optimized graphs (fused MHA, SkipLayerNorm, FP16). + For cpu_and_mobile, Olive then applies INT4 quantization post-export. 
+ +Usage: + python optimize.py --config-dir cpu_and_mobile --device cpu + python optimize.py --config-dir cuda --device gpu + python optimize.py --config-dir cpu_and_mobile --device cpu --skip-export + python optimize.py --config-dir cpu_and_mobile --device cpu --model-path /local/dequantized/checkpoint +""" + +import argparse +import json +import logging +import os +import shutil +from pathlib import Path + +logging.getLogger("onnxscript").setLevel(logging.WARNING) +logging.getLogger("onnx_ir").setLevel(logging.WARNING) + +MODELS_DIR = "models" +MODEL_NAME = "mistralai/Ministral-3-3B-Instruct-2512" + +# Lazy-loaded HuggingFace config (avoids import-time network access) +_HF_CONFIG = None + + +def _get_hf_config(): + """Load and cache the HuggingFace model config.""" + global _HF_CONFIG + if _HF_CONFIG is None: + from transformers import Mistral3Config + + _HF_CONFIG = Mistral3Config.from_pretrained(MODEL_NAME) + return _HF_CONFIG + + +def export_text_decoder(config_dir: str): + """Export text decoder using Olive/ModelBuilder (GQA + quantization). + + text.json outputs directly to /models/decoder/model.onnx. + ModelBuilder also generates genai_config.json, tokenizer, and chat_template + inside decoder/ — we move them to the models root where the VLM pipeline + expects them. 
+ """ + try: + from olive import run + except ImportError: + from olive.workflows import run + + config_path = Path(config_dir) / "text.json" + if config_path.exists(): + print(f" [Olive] Exporting text decoder from {config_path}...") + run(str(config_path)) + else: + raise FileNotFoundError(f"Text config not found: {config_path}") + + # Move shared configs from decoder/ to models root for VLM pipeline + decoder_dir = Path(config_dir) / MODELS_DIR / "decoder" + models_root = Path(config_dir) / MODELS_DIR + for filename in ( + "genai_config.json", + "tokenizer.json", + "tokenizer_config.json", + "chat_template.jinja", + ): + src = decoder_dir / filename + if src.exists(): + shutil.move(str(src), str(models_root / filename)) + + +def export_vision_and_embedding( + output_dir: str, + model_path: str, + dtype: str = "f16", +): + """Export vision encoder and embedding using mobius. + + Mobius constructs the ONNX graph declaratively and applies pretrained + weights, avoiding torch.onnx.export dynamo issues with Pixtral's + dynamic image dimensions. + """ + from mobius import build + + print(f" [Mobius] Building VLM from {model_path} (dtype={dtype})...") + # mobius.build() accepts dtype as a string (e.g. 
"f16", "f32", "bf16") + # and resolves it internally — pass the CLI string directly + pkg = build(model_path, dtype=dtype, load_weights=True) + + os.makedirs(output_dir, exist_ok=True) + + required_components = ("vision", "embedding") + missing_components = [] + + for component in required_components: + if component in pkg: + component_dir = os.path.join(output_dir, component) + os.makedirs(component_dir, exist_ok=True) + try: + pkg.save( + component_dir, + components=lambda name, c=component: name == c, + check_weights=True, + ) + # mobius saves as model.onnx directly in component_dir + expected_onnx = os.path.join(component_dir, "model.onnx") + if not os.path.exists(expected_onnx): + raise FileNotFoundError( + f"Mobius export did not produce expected ONNX file for '{component}': {expected_onnx}" + ) + print(f" [Mobius] Saved {expected_onnx}") + except Exception: + shutil.rmtree(component_dir, ignore_errors=True) + raise + else: + missing_components.append(component) + + if missing_components: + raise ValueError( + "Mobius package is missing required component(s): " + + ", ".join(missing_components) + ) + + print(" [Mobius] Vision and embedding export complete") + + +def quantize_vision_and_embedding(config_dir: str): + """Apply INT4 quantization to mobius-exported vision and embedding models. + + Runs Olive passes defined in vision.json / embedding.json (if they exist + in config_dir). These configs use ONNXModel input type and write quantized + output directly to the component directory, replacing the FP16 model. + + For cpu_and_mobile: INT4 quantization reduces model size ~73%. + For cuda: no vision/embedding configs exist (FP16 is optimal for GPU). 
+ """ + try: + from olive import run + except ImportError: + from olive.workflows import run + + output_dir = str(Path(config_dir) / MODELS_DIR) + + for component in ("vision", "embedding"): + config_path = Path(config_dir) / f"{component}.json" + if not config_path.exists(): + continue + + mobius_onnx = os.path.join(output_dir, component, "model.onnx") + if not os.path.exists(mobius_onnx): + print( + f" [WARN] {mobius_onnx} not found, skipping {component} quantization" + ) + continue + + print(f" [Olive] Quantizing {component} from {config_path}...") + run(str(config_path)) + + +def export_models(config_dir: str, model_path: str, dtype: str = "f16"): + """Export all 3 sub-models: text (Olive), vision + embedding (mobius). + + For cpu_and_mobile, also applies INT4 quantization to vision and embedding + via Olive passes (vision.json / embedding.json). + """ + output_dir = str(Path(config_dir) / MODELS_DIR) + + print("=== Exporting models ===") + + # Text decoder via Olive/ModelBuilder (outputs to decoder/model.onnx directly) + export_text_decoder(config_dir) + + # Vision + embedding via mobius (FP16) + export_vision_and_embedding(output_dir, model_path, dtype) + + # INT4 quantization of vision + embedding (cpu_and_mobile only) + quantize_vision_and_embedding(config_dir) + + print() + + +def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): + """Patch genai_config.json with embedding/vision sections and processor_config. + + Derives model-specific values from the HuggingFace config (lazily loaded) + to avoid hardcoded constants drifting from the actual checkpoint. 
+ """ + config_path = Path(output_dir) / "genai_config.json" + + with open(config_path) as f: + config = json.load(f) + + if device == "gpu": + provider_options = [ + { + "cuda": { + "enable_cuda_graph": "1", + "enable_skip_layer_norm_strict_mode": "1", + } + } + ] + vision_provider_options = [ + { + "cuda": { + "enable_cuda_graph": "0", + "enable_skip_layer_norm_strict_mode": "1", + } + } + ] + else: + provider_options = [] + vision_provider_options = [] + + session_options = { + "log_id": "onnxruntime-genai", + "provider_options": provider_options, + } + vision_session_options = { + "log_id": "onnxruntime-genai", + "provider_options": vision_provider_options, + } + + config["model"]["decoder"]["session_options"] = session_options + config["model"]["decoder"]["filename"] = "decoder/model.onnx" + + # Sync position_ids with what the decoder ONNX model actually supports + decoder_onnx = Path(output_dir) / "decoder" / "model.onnx" + if decoder_onnx.exists(): + import onnx + + decoder_model = onnx.load(str(decoder_onnx), load_external_data=False) + onnx_input_names = {inp.name for inp in decoder_model.graph.input} + if "position_ids" in onnx_input_names: + config["model"]["decoder"].setdefault("inputs", {})["position_ids"] = ( + "position_ids" + ) + else: + config["model"]["decoder"].get("inputs", {}).pop("position_ids", None) + + config["model"]["embedding"] = { + "filename": "embedding/model.onnx", + "inputs": {"input_ids": "input_ids", "image_features": "image_features"}, + "outputs": {"inputs_embeds": "inputs_embeds"}, + "session_options": vision_session_options, + } + + # Vision config — values derived from HF config to stay in sync with checkpoint + hf_config = _get_hf_config() + config["model"]["vision"] = { + "filename": "vision/model.onnx", + "config_filename": "processor_config.json", + "spatial_merge_size": hf_config.spatial_merge_size, + "patch_size": hf_config.vision_config.patch_size, + "inputs": {"pixel_values": "pixel_values"}, + "outputs": 
{"image_features": "image_features"}, + "session_options": vision_session_options, + } + + config["model"]["bos_token_id"] = hf_config.text_config.bos_token_id or 1 + config["model"]["context_length"] = 32768 + config["model"]["eos_token_id"] = hf_config.text_config.eos_token_id or 2 + config["model"]["pad_token_id"] = hf_config.text_config.pad_token_id or 11 + config["model"]["image_token_id"] = hf_config.image_token_index + config["model"]["type"] = "mistral3" + + config["search"]["max_length"] = 32768 + config["search"]["top_k"] = 1 + config["search"]["past_present_share_buffer"] = False + if config["search"].get("top_p") is None: + config["search"]["top_p"] = 1.0 + + with open(config_path, "w") as f: + json.dump(config, f, indent=4) + print(f" Updated {config_path}") + + # Transforms-based processor config (matches ORT GenAI's image preprocessor format) + processor_config = { + "processor": { + "name": "pixtral_image_processor", + "transforms": [ + { + "operation": { + "name": "decode_image", + "type": "DecodeImage", + "attrs": {"color_space": "RGB"}, + } + }, + { + "operation": { + "name": "convert_to_rgb", + "type": "ConvertRGB", + } + }, + { + "operation": { + "name": "resize", + "type": "Resize", + "attrs": { + "height": 1540, + "width": 1540, + "smart_resize": 1, + "min_pixels": 784, + "max_pixels": 2371600, + "patch_size": hf_config.vision_config.patch_size, + "merge_size": hf_config.spatial_merge_size, + }, + } + }, + { + "operation": { + "name": "rescale", + "type": "Rescale", + "attrs": {"rescale_factor": 0.00392156862745098}, + } + }, + { + "operation": { + "name": "normalize", + "type": "Normalize", + "attrs": { + "mean": [0.48145466, 0.4578275, 0.40821073], + "std": [0.26862954, 0.26130258, 0.27577711], + }, + } + }, + { + "operation": { + "name": "permute", + "type": "Permute3D", + "attrs": {"dims": [2, 0, 1]}, + } + }, + ], + } + } + + processor_path = Path(output_dir) / "processor_config.json" + with open(processor_path, "w") as f: + 
json.dump(processor_config, f, indent=2) + print(f" Created {processor_path}") + + +def fix_tokenizer(output_dir: str = MODELS_DIR): + """Fix tokenizer_config.json for onnxruntime-genai compatibility. + + Ministral3's tokenizer uses 'TokenizersBackend' class which isn't supported + by genai's ort-extensions tokenizer. Change to 'LlamaTokenizer'. + """ + tc_path = Path(output_dir) / "tokenizer_config.json" + if tc_path.exists(): + tc = json.loads(tc_path.read_text(encoding="utf-8")) + if tc.get("tokenizer_class") == "TokenizersBackend": + tc["tokenizer_class"] = "LlamaTokenizer" + tc_path.write_text( + json.dumps(tc, indent=2, ensure_ascii=False), encoding="utf-8" + ) + print(" Fixed tokenizer_class to LlamaTokenizer") + + +def main(): + parser = argparse.ArgumentParser(description="Optimize Ministral-3-3B ONNX models") + parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu") + parser.add_argument("--config-dir", default="cpu_and_mobile") + parser.add_argument("--skip-export", action="store_true") + parser.add_argument("--models-dir", default=None) + parser.add_argument( + "--model-path", + default=MODEL_NAME, + help="HuggingFace model ID or local path to dequantized checkpoint", + ) + parser.add_argument( + "--dtype", + default="f16", + choices=["f16", "f32", "bf16"], + help="Dtype for mobius vision/embedding export (default: f16)", + ) + args = parser.parse_args() + + models_dir = args.models_dir or str(Path(args.config_dir) / MODELS_DIR) + + if not args.skip_export: + export_models(args.config_dir, args.model_path, args.dtype) + + print("=== Generating configs ===") + update_genai_config(output_dir=models_dir, device=args.device) + fix_tokenizer(output_dir=models_dir) + print() + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/requirements.txt b/mistralai-Ministral-3-3B-Instruct-2512/builtin/requirements.txt new file mode 100644 index 00000000..0a184051 --- /dev/null +++ 
b/mistralai-Ministral-3-3B-Instruct-2512/builtin/requirements.txt @@ -0,0 +1,4 @@ +git+https://github.com/microsoft/Olive.git@main +git+https://github.com/onnxruntime/mobius.git@3777c18 +torch>=2.10.0,<2.11.0 +transformers>=4.57.0 From c29131f97721a380ca0ae0d1c05ac69a42810461 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 15 Apr 2026 22:03:46 +0000 Subject: [PATCH 2/7] feat: finalize INT4 quantization pipeline with eval benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add _strip_unused_initializers to reduce INT4 model size (1.7GB→220MB) - Add _fix_gather_block_quantized for RoPE cache preservation - CUDA: INT4 text + FP16 vision (71.65% AI2D) - CPU: INT4 text + INT4 vision (69.07% AI2D) - Remove unnecessary genai_config overrides (trust ModelBuilder) - Add comprehensive README with benchmark results - Fix eval.py build_messages for Jinja sort compatibility Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../builtin/README.md | 50 +++++- .../builtin/cpu_and_mobile/vision.json | 4 + .../builtin/eval.py | 16 +- .../builtin/optimize.py | 155 +++++++++++++++++- 4 files changed, 200 insertions(+), 25 deletions(-) diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md b/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md index 179fd84d..12bdc14c 100644 --- a/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md @@ -6,6 +6,31 @@ Ministral-3-3B is a multimodal (VLM) model combining a Pixtral vision encoder wi - **Vision encoder** and **embedding** via [mobius](https://github.com/onnxruntime/mobius) (declarative ONNX graph construction); vision optionally INT4-quantized via Olive for CPU - **Text decoder** via Olive/ModelBuilder (GQA + INT4/FP16 quantization) +## Exported Configurations + +| Component | CUDA | CPU | +|-----------|------|-----| +| Text decoder | INT4 (`MatMulNBits`) | INT4 (`MatMulNBits`) 
| +| Vision encoder | FP16 | INT4 (`MatMulNBits` via Olive) | +| Embedding | FP16 | FP32 | + +- **CUDA**: INT4 text decoder + FP16 vision/embedding. Optimized for throughput on NVIDIA GPUs. +- **CPU**: INT4 text decoder + INT4 vision + FP32 embedding. Fully quantized for deployment on CPU-only machines. Embedding stays FP32 because INT4 breaks its `Equal`/`Gather` logic. + +## Benchmark Results + +Evaluated on [AI2D](https://allenai.org/data/diagrams) (science diagram multiple-choice QA, 4 options per question). + +| Configuration | Accuracy | Samples | Latency (s/sample) | Gap vs PyTorch | +|---------------|----------|---------|---------------------|----------------| +| PyTorch FP32 (CPU) | 72.00% | 100 | 21.66 | — baseline — | +| PyTorch FP16 (CUDA) | 73.00% | 200 | 0.20 | — baseline — | +| ONNX CUDA (INT4 text + FP16 vision) | 71.65% | 200 | 0.11 | −1.35 pp | +| ONNX CPU (INT4 text + INT4 vision) | 69.07% | 194 | 33.28 | −2.93 pp | + +All ONNX configurations are within the expected precision gap for INT4 quantization (<5 pp). +The CUDA ONNX model achieves **~300× speedup** over CPU ONNX (33.28 → 0.11 s/sample) and **~2× speedup** over PyTorch CUDA FP16. + ## Prerequisites ```bash @@ -91,23 +116,24 @@ python -m onnxruntime_genai.models.model_mm -m cpu_and_mobile/models --max_lengt ### 4.
Evaluate -Run the AI2D science diagram QA benchmark: +Run the AI2D science diagram QA benchmark (see [Benchmark Results](#benchmark-results) for expected accuracy): ```bash # ONNX only (CPU INT4) python eval.py --device cpu --model_path cpu_and_mobile/models -# ONNX only (CUDA FP16) +# ONNX only (CUDA) python eval.py --device cuda --model_path cuda/models -# Compare ONNX vs PyTorch reference -python eval.py --pytorch_model mistralai/Ministral-3-3B-Instruct-2512 --num_samples 100 +# PyTorch baseline (BF16 variant avoids FP8 kernel requirement) +python eval.py --skip_onnx --pytorch_model mistralai/Ministral-3-3B-Instruct-2512-BF16 --device cpu --num_samples 100 + +# Compare ONNX vs PyTorch side-by-side +python eval.py --model_path cuda/models --pytorch_model mistralai/Ministral-3-3B-Instruct-2512-BF16 --num_samples 100 ``` -Expected precision gaps (ONNX vs PyTorch): -- **FP32**: ~0 pp (exact parity) -- **FP16**: <2 pp (precision loss) -- **INT4**: <5 pp (quantization loss) +> **Note:** The default HuggingFace checkpoint (`Ministral-3-3B-Instruct-2512`) uses FP8 weights, +> which require a specific CUDA kernel build. Use the `-BF16` variant for PyTorch baselines. ## Directory Structure @@ -174,8 +200,16 @@ For `cuda`, no additional Olive passes are needed — FP16 is optimal for GPU. The text decoder export (`text.json`) and INT4 quantization (`vision.json`) ARE Olive JSON-driven — identical to Qwen. +## Known Limitations + +- **CPU INT4 vision: language drift on some images.** The INT4-quantized vision encoder (CPU) occasionally produces embeddings that cause the text decoder to respond in the wrong language (e.g., Chinese instead of English). This has been observed on specific test images (e.g., `challenge.jpg`) and is a known artifact of aggressive vision quantization via the mobius export pipeline. The CUDA FP16 vision model does not exhibit this issue. +- **FP8 checkpoint requires special kernels.** The default HuggingFace checkpoint uses FP8 weights. 
Use the `-BF16` variant for PyTorch evaluation on machines without `kernels-community/finegrained-fp8`. +- **Single-image only.** Multi-image inputs are not yet supported; the runtime rejects prompts with more than one `[IMG]` token. + ## Notes +- **CPU INT4 pipeline**: Mobius exports FP16 as an intermediate format. Olive then quantizes to INT4 for CPU deployment. The final model uses INT4 `MatMulNBits` which runs natively on CPU. FP16 is never used at runtime — it is only the input format required by Olive's `OnnxBlockWiseRtnQuantization` pass. +- **CUDA pipeline**: Mobius exports FP16 directly for vision/embedding. Text decoder uses INT4 via ModelBuilder. No additional quantization pass needed. - The HuggingFace checkpoint uses FP8 quantized weights. The export pipeline dequantizes these automatically (`weight * weight_scale_inv`). - The tokenizer uses `TokenizersBackend` class which genai doesn't support. The optimize script fixes this to `LlamaTokenizer`. - Pixtral vision supports dynamic image sizes (multiples of 28, up to 1540×1540). 
diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json index 6aeaea2e..22427895 100644 --- a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json @@ -9,6 +9,10 @@ "block_size": 128, "is_symmetric": true, "accuracy_level": 4, + "nodes_to_exclude": [ + "vision_encoder/vision_tower/rope/Gather_node_23", + "vision_encoder/vision_tower/rope/Gather_node_24" + ], "save_as_external_data": true, "external_data_name": "model.onnx.data" } diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py index f962f5f1..720a5e75 100644 --- a/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py @@ -58,10 +58,15 @@ def build_messages(question: str, options: list[str], system_prompt: str = "") -> str: - """Return a JSON-encoded chat messages list (for apply_chat_template).""" + """Return a JSON-encoded chat messages list (for apply_chat_template). + + Uses string content with [IMG] prefix instead of structured content + because ORT GenAI's Jinja does not support the sort() filter needed + by Mistral3's structured-content template path. + """ option_text = "\n".join(f"{N}. {o}" for N, o in zip(NUMBERS, options)) content = ( - f"Look at the diagram and answer the multiple-choice question.\n\n" + f"[IMG]Look at the diagram and answer the multiple-choice question.\n\n" f"Question: {question}\n\n" f"Options:\n{option_text}\n\n" f"Reply with the number only (1, 2, 3, or 4)." 
@@ -69,12 +74,7 @@ def build_messages(question: str, options: list[str], system_prompt: str = "") - messages = [] if system_prompt: messages.append({"role": "system", "content": system_prompt}) - messages.append( - { - "role": "user", - "content": [{"type": "image"}, {"type": "text", "text": content}], - } - ) + messages.append({"role": "user", "content": content}) return json.dumps(messages) diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py index c1c59428..06630eca 100644 --- a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py @@ -144,7 +144,11 @@ def quantize_vision_and_embedding(config_dir: str): in config_dir). These configs use ONNXModel input type and write quantized output directly to the component directory, replacing the FP16 model. - For cpu_and_mobile: INT4 quantization reduces model size ~73%. + After quantization, strips unused initializers (original FP16 weights kept + by Olive alongside INT4) and replaces any GatherBlockQuantized nodes with + plain Gather (ORT can't load quantized Gather on RoPE caches). + + For cpu_and_mobile: INT4 quantization reduces model size ~87%. For cuda: no vision/embedding configs exist (FP16 is optimal for GPU). """ try: @@ -166,9 +170,146 @@ def quantize_vision_and_embedding(config_dir: str): ) continue + # Save FP16 Gather data before quantization (for GatherBlockQuantized fix) + pre_quant_gathers = _save_gather_data(mobius_onnx) + print(f" [Olive] Quantizing {component} from {config_path}...") run(str(config_path)) + _fix_gather_block_quantized(mobius_onnx, pre_quant_gathers) + _strip_unused_initializers(mobius_onnx) + + +def _save_gather_data(onnx_path: str) -> dict: + """Save Gather node initializer data before quantization. + + Returns a dict mapping initializer names to numpy arrays for any + initializers used by Gather nodes (e.g., RoPE cos/sin caches). 
+ """ + import onnx + from onnx import numpy_helper + + model = onnx.load(onnx_path) + gather_inputs = set() + for node in model.graph.node: + if node.op_type == "Gather": + gather_inputs.add(node.input[0]) + + result = {} + for init in model.graph.initializer: + if init.name in gather_inputs: + result[init.name] = numpy_helper.to_array(init) + return result + + +def _fix_gather_block_quantized(onnx_path: str, pre_quant_gathers: dict): + """Replace GatherBlockQuantized nodes with plain Gather + original data. + + Olive's INT4 quantization may quantize Gather nodes (e.g., RoPE cos/sin + caches) into GatherBlockQuantized, which ORT can't load. This restores + the original FP16/FP32 data and replaces with plain Gather. + """ + import onnx + from onnx import helper, numpy_helper + + model = onnx.load(onnx_path, load_external_data=False) + gbq_nodes = [n for n in model.graph.node if n.op_type == "GatherBlockQuantized"] + if not gbq_nodes: + return + + model = onnx.load(onnx_path) + inits = {i.name: i for i in model.graph.initializer} + + for node in list(model.graph.node): + if node.op_type != "GatherBlockQuantized": + continue + + q_name = node.input[0] + idx_name = node.input[1] + orig_name = q_name.replace("_Q4", "") + + if orig_name in pre_quant_gathers: + new_init = numpy_helper.from_array( + pre_quant_gathers[orig_name], name=orig_name + ) + model.graph.initializer.append(new_init) + + new_node = helper.make_node( + "Gather", + [orig_name, idx_name], + list(node.output), + name=node.name.replace("_Q4", ""), + ) + + idx = list(model.graph.node).index(node) + model.graph.node.remove(node) + model.graph.node.insert(idx, new_node) + + for inp in node.input: + if inp != idx_name and inp in inits: + init = inits[inp] + if init in model.graph.initializer: + model.graph.initializer.remove(init) + + onnx.save( + model, + onnx_path, + save_as_external_data=True, + all_tensors_to_one_file=True, + location="model.onnx.data", + size_threshold=1024, + ) + print(f" [PostProcess] 
Replaced {len(gbq_nodes)} GatherBlockQuantized → Gather") + + +def _strip_unused_initializers(onnx_path: str): + """Remove unused initializers and re-save to shrink the external data file. + + Olive's OnnxBlockWiseRtnQuantization keeps original weights alongside + the new INT4 (UINT8) weights. Stripping the unused originals typically + reduces the data file by ~87% (e.g., 1.7GB → 220MB for the vision model). + """ + if not os.path.exists(onnx_path): + return + + import onnx + from onnx.external_data_helper import convert_model_to_external_data + + model = onnx.load(onnx_path) + + used_names = set() + for node in model.graph.node: + for inp in node.input: + used_names.add(inp) + for inp in model.graph.input: + used_names.add(inp.name) + + before = len(model.graph.initializer) + new_inits = [init for init in model.graph.initializer if init.name in used_names] + removed = before - len(new_inits) + + if removed == 0: + return + + del model.graph.initializer[:] + model.graph.initializer.extend(new_inits) + + for init in model.graph.initializer: + del init.external_data[:] + + data_name = "model.onnx.data" + data_path = os.path.join(os.path.dirname(onnx_path), data_name) + if os.path.exists(data_path): + os.remove(data_path) + + convert_model_to_external_data( + model, all_tensors_to_one_file=True, location=data_name, size_threshold=1024 + ) + onnx.save(model, onnx_path) + + data_mb = os.path.getsize(data_path) / 1e6 if os.path.exists(data_path) else 0 + print(f" [Cleanup] Stripped {removed} unused initializers → {data_mb:.0f} MB") + def export_models(config_dir: str, model_path: str, dtype: str = "f16"): """Export all 3 sub-models: text (Olive), vision + embedding (mobius). 
@@ -269,18 +410,14 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): "session_options": vision_session_options, } - config["model"]["bos_token_id"] = hf_config.text_config.bos_token_id or 1 - config["model"]["context_length"] = 32768 - config["model"]["eos_token_id"] = hf_config.text_config.eos_token_id or 2 - config["model"]["pad_token_id"] = hf_config.text_config.pad_token_id or 11 + # Add VLM-specific fields not generated by ModelBuilder. + # Don't override context_length or max_length — PR #2077's ModelBuilder + # sets these correctly (context_length=262144, max_length=32768). config["model"]["image_token_id"] = hf_config.image_token_index - config["model"]["type"] = "mistral3" - config["search"]["max_length"] = 32768 + # Override search defaults for VLM: greedy decoding, no KV sharing config["search"]["top_k"] = 1 config["search"]["past_present_share_buffer"] = False - if config["search"].get("top_p") is None: - config["search"]["top_p"] = 1.0 with open(config_path, "w") as f: json.dump(config, f, indent=4) From 3acaa7f6ea5b3951a9369349b10776f5bbd86858 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 15 Apr 2026 22:39:33 +0000 Subject: [PATCH 3/7] Address 8 review comments: docs, code quality, consistency - eval.py: Add explanatory comments to except-pass clauses - optimize.py: Update docstring to match INT4 shipping config - optimize.py: Document _get_hf_config MODEL_NAME usage - optimize.py: Improve --dtype help text - README.md: Fix precision labels (CUDA=INT4 text, CPU embedding=FP16) - README.md: Remove stale FP32 embedding references Note: eval.py dtype= kwarg is valid in transformers >=5.0 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../builtin/README.md | 12 ++++++------ .../builtin/eval.py | 4 ++-- .../builtin/optimize.py | 18 +++++++++++++----- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md 
b/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md index 12bdc14c..44fb3d02 100644 --- a/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md @@ -3,8 +3,8 @@ This example demonstrates how to convert [Ministral-3-3B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) vision-language model to ONNX format using Olive and run inference with ONNX Runtime GenAI. Ministral-3-3B is a multimodal (VLM) model combining a Pixtral vision encoder with a Mistral text decoder using YaRN RoPE for extended context. The pipeline exports three sub-models: -- **Vision encoder** and **embedding** via [mobius](https://github.com/onnxruntime/mobius) (declarative ONNX graph construction); vision optionally INT4-quantized via Olive for CPU -- **Text decoder** via Olive/ModelBuilder (GQA + INT4/FP16 quantization) +- **Vision encoder** and **embedding** via [mobius](https://github.com/onnxruntime/mobius) (declarative ONNX graph construction); vision INT4-quantized via Olive for CPU +- **Text decoder** via Olive/ModelBuilder (GQA + INT4 quantization) ## Exported Configurations @@ -12,10 +12,10 @@ Ministral-3-3B is a multimodal (VLM) model combining a Pixtral vision encoder wi |-----------|------|-----| | Text decoder | INT4 (`MatMulNBits`) | INT4 (`MatMulNBits`) | | Vision encoder | FP16 | INT4 (`MatMulNBits` via Olive) | -| Embedding | FP16 | FP32 | +| Embedding | FP16 | FP16 | - **CUDA**: INT4 text decoder + FP16 vision/embedding. Optimized for throughput on NVIDIA GPUs. -- **CPU**: INT4 text decoder + INT4 vision + FP32 embedding. Fully quantized for deployment on CPU-only machines. Embedding stays FP32 because INT4 breaks its `Equal`/`Gather` logic. +- **CPU**: INT4 text decoder + INT4 vision + FP16 embedding. Fully quantized where possible. Embedding stays FP16 because INT4 breaks its `Equal`/`Gather` logic. ## Benchmark Results @@ -48,13 +48,13 @@ Install ONNX Runtime GenAI: ### 1. 
Export & Optimize Models -**CPU (INT4 all models):** +**CPU (INT4 text + INT4 vision + FP16 embedding):** ```bash python optimize.py --config-dir cpu_and_mobile --device cpu ``` -**CUDA (FP16 all models):** +**CUDA (INT4 text + FP16 vision/embedding):** ```bash python optimize.py --config-dir cuda --device gpu diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py index 720a5e75..27986655 100644 --- a/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py @@ -102,7 +102,7 @@ def ground_truth_number(sample: dict) -> str | None: if 0 <= idx < 4: return NUMBERS[idx] except (ValueError, TypeError): - pass + pass # answer is not a valid integer index — return None below return None @@ -134,7 +134,7 @@ def detect_onnx_precision(model_path: str) -> str: if "int4" in json.dumps(decoder_cfg).lower(): return "int4" except (json.JSONDecodeError, OSError): - pass + pass # config unreadable — fall through to path-based heuristic if "int4" in path_lower: return "int4" diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py index 06630eca..8b694d62 100644 --- a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py @@ -4,15 +4,16 @@ construction), and Olive/ModelBuilder for text decoder export (GQA + INT4). Pipeline: - cpu_and_mobile: Olive/ModelBuilder(INT4) → mobius(FP16) → Olive INT4 quant - cuda: Olive/ModelBuilder(FP16) → mobius(FP16) + cpu_and_mobile: Olive/ModelBuilder(INT4) → mobius(FP16) → Olive INT4 vision + cuda: Olive/ModelBuilder(INT4) → mobius(FP16) Architecture difference from Qwen VLM recipes: Qwen uses Olive passes for all 3 sub-models (export + optimization). Ministral uses mobius for vision/embedding because Pixtral's dynamic image dimensions cause torch.onnx.export/dynamo failures. 
Mobius produces already-optimized graphs (fused MHA, SkipLayerNorm, FP16). - For cpu_and_mobile, Olive then applies INT4 quantization post-export. + For cpu_and_mobile, Olive then applies INT4 quantization to the vision + model post-export. Usage: python optimize.py --config-dir cpu_and_mobile --device cpu @@ -39,7 +40,12 @@ def _get_hf_config(): - """Load and cache the HuggingFace model config.""" + """Load and cache the HuggingFace model config. + + Always loads from MODEL_NAME (the canonical HF model ID) rather than + --model-path, because the config values (image_token_id, patch_size, etc.) + are architecture constants that don't change between checkpoints. + """ global _HF_CONFIG if _HF_CONFIG is None: from transformers import Mistral3Config @@ -522,7 +528,9 @@ def main(): "--dtype", default="f16", choices=["f16", "f32", "bf16"], - help="Dtype for mobius vision/embedding export (default: f16)", + help="Dtype for mobius vision/embedding export. FP16 is recommended: it serves " + "as the intermediate format for Olive INT4 quantization on CPU, and as " + "the final format on CUDA. (default: f16)", ) args = parser.parse_args() From 25d460e50ba94aaee276f8ebb1682bfafe49e682 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 15 Apr 2026 22:52:49 +0000 Subject: [PATCH 4/7] Fix --models-dir path: copy exports to custom dir before config generation When --models-dir differs from the default (/models/), text.json output_dir is hardcoded so exports go to the default location. Copy the entire export tree to --models-dir after export so that update_genai_config() and fix_tokenizer() find the files. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../builtin/optimize.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py index 8b694d62..97a1273c 100644 --- a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py @@ -539,6 +539,18 @@ def main(): if not args.skip_export: export_models(args.config_dir, args.model_path, args.dtype) + # If --models-dir is specified and differs from the default export location, + # copy the exported models there. text.json output_dir is hardcoded in the + # JSON config so we can't redirect it — copy after export instead. + default_dir = str(Path(args.config_dir) / MODELS_DIR) + if models_dir != default_dir: + import shutil + + if os.path.exists(models_dir): + shutil.rmtree(models_dir) + shutil.copytree(default_dir, models_dir) + print(f" Copied models to {models_dir}") + print("=== Generating configs ===") update_genai_config(output_dir=models_dir, device=args.device) fix_tokenizer(output_dir=models_dir) From d1648900b67634f634a490b5099e1c540887ca73 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Wed, 15 Apr 2026 23:21:23 +0000 Subject: [PATCH 5/7] Disable CUDA graph capture for VLM (matches Qwen convention) CUDA graph capture is unsupported for VLMs with dynamic image sizes. Set enable_cuda_graph=0 for ALL models (decoder, vision, embedding), matching the Qwen VLM recipe convention. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../builtin/optimize.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py index 97a1273c..86da2708 100644 --- a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py @@ -351,15 +351,9 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): config = json.load(f) if device == "gpu": + # CUDA graph capture is unsupported for VLMs with dynamic image sizes. + # Disable for all models (matches Qwen VLM recipe convention). provider_options = [ - { - "cuda": { - "enable_cuda_graph": "1", - "enable_skip_layer_norm_strict_mode": "1", - } - } - ] - vision_provider_options = [ { "cuda": { "enable_cuda_graph": "0", @@ -367,6 +361,7 @@ def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): } } ] + vision_provider_options = provider_options else: provider_options = [] vision_provider_options = [] From 860675cf403a23d6f678b10134dee6dc9eef0956 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 16 Apr 2026 17:51:35 +0000 Subject: [PATCH 6/7] Fix: clear Olive cache before quantization to prevent stale output paths Olive caches the full resolved config including absolute output_dir. On re-runs with different --models-dir, the stale cache writes to the old path, creating unexpected directories (e.g., ministral3-cpu-int4-test). Clear the cache before each quantization run. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py index 86da2708..39b1ce66 100644 --- a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py @@ -164,6 +164,13 @@ def quantize_vision_and_embedding(config_dir: str): output_dir = str(Path(config_dir) / MODELS_DIR) + # Clear Olive cache to prevent stale output paths from previous runs. + # Olive caches the full resolved config (including absolute output_dir), + # which can cause writes to unexpected directories on re-runs. + olive_cache = Path(config_dir) / ".olive-cache" + if olive_cache.exists(): + shutil.rmtree(olive_cache, ignore_errors=True) + for component in ("vision", "embedding"): config_path = Path(config_dir) / f"{component}.json" if not config_path.exists(): From 25009c2ea4a72c47cf8ce729a0888a6a9b2069c2 Mon Sep 17 00:00:00 2001 From: Ti-Tai Wang Date: Thu, 16 Apr 2026 17:59:33 +0000 Subject: [PATCH 7/7] Refactor: write exports directly to --models-dir, eliminate copy - export_text_decoder: Load text.json as dict, override output_dir - export_vision_and_embedding: Already accepts output_dir parameter - quantize_vision_and_embedding: Load vision.json as dict, override model_path and output_dir - Remove shutil.copytree post-export step from main() - Remove .olive-cache clear (no longer needed) - Pass models_dir through export_models() pipeline This eliminates duplicate directories, copy overhead for multi-GB files, and ghost directories from stale Olive cache paths. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../builtin/optimize.py | 103 ++++++++---------- 1 file changed, 47 insertions(+), 56 deletions(-) diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py index 39b1ce66..8036bee9 100644 --- a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py @@ -54,13 +54,13 @@ def _get_hf_config(): return _HF_CONFIG -def export_text_decoder(config_dir: str): +def export_text_decoder(config_dir: str, models_dir: str): """Export text decoder using Olive/ModelBuilder (GQA + quantization). - text.json outputs directly to /models/decoder/model.onnx. - ModelBuilder also generates genai_config.json, tokenizer, and chat_template - inside decoder/ — we move them to the models root where the VLM pipeline - expects them. + Loads text.json as a dict and overrides output_dir to write directly + to <models_dir>/decoder. ModelBuilder also generates genai_config.json, + tokenizer, and chat_template inside decoder/ — we move them to the + models root where the VLM pipeline expects them. 
""" try: from olive import run @@ -68,15 +68,19 @@ def export_text_decoder(config_dir: str): from olive.workflows import run config_path = Path(config_dir) / "text.json" - if config_path.exists(): - print(f" [Olive] Exporting text decoder from {config_path}...") - run(str(config_path)) - else: + if not config_path.exists(): raise FileNotFoundError(f"Text config not found: {config_path}") + # Load config as dict and override output_dir to write directly to models_dir + with open(config_path) as f: + config = json.load(f) + config["output_dir"] = os.path.join(models_dir, "decoder") + + print(f" [Olive] Exporting text decoder from {config_path}...") + run(config) + # Move shared configs from decoder/ to models root for VLM pipeline - decoder_dir = Path(config_dir) / MODELS_DIR / "decoder" - models_root = Path(config_dir) / MODELS_DIR + decoder_dir = Path(models_dir) / "decoder" for filename in ( "genai_config.json", "tokenizer.json", @@ -85,7 +89,7 @@ def export_text_decoder(config_dir: str): ): src = decoder_dir / filename if src.exists(): - shutil.move(str(src), str(models_root / filename)) + shutil.move(str(src), str(Path(models_dir) / filename)) def export_vision_and_embedding( @@ -143,16 +147,15 @@ def export_vision_and_embedding( print(" [Mobius] Vision and embedding export complete") -def quantize_vision_and_embedding(config_dir: str): +def quantize_vision_and_embedding(config_dir: str, models_dir: str): """Apply INT4 quantization to mobius-exported vision and embedding models. - Runs Olive passes defined in vision.json / embedding.json (if they exist - in config_dir). These configs use ONNXModel input type and write quantized - output directly to the component directory, replacing the FP16 model. + Loads vision.json / embedding.json as dicts and overrides model_path + and output_dir to point directly to //. + This avoids writing to the JSON config's hardcoded relative paths. 
- After quantization, strips unused initializers (original FP16 weights kept - by Olive alongside INT4) and replaces any GatherBlockQuantized nodes with - plain Gather (ORT can't load quantized Gather on RoPE caches). + After quantization, replaces any GatherBlockQuantized nodes (ORT can't + load quantized Gather on RoPE caches) and strips unused initializers. For cpu_and_mobile: INT4 quantization reduces model size ~87%. For cuda: no vision/embedding configs exist (FP16 is optimal for GPU). @@ -162,35 +165,32 @@ def quantize_vision_and_embedding(config_dir: str): except ImportError: from olive.workflows import run - output_dir = str(Path(config_dir) / MODELS_DIR) - - # Clear Olive cache to prevent stale output paths from previous runs. - # Olive caches the full resolved config (including absolute output_dir), - # which can cause writes to unexpected directories on re-runs. - olive_cache = Path(config_dir) / ".olive-cache" - if olive_cache.exists(): - shutil.rmtree(olive_cache, ignore_errors=True) - for component in ("vision", "embedding"): config_path = Path(config_dir) / f"{component}.json" if not config_path.exists(): continue - mobius_onnx = os.path.join(output_dir, component, "model.onnx") - if not os.path.exists(mobius_onnx): + component_onnx = os.path.join(models_dir, component, "model.onnx") + if not os.path.exists(component_onnx): print( - f" [WARN] {mobius_onnx} not found, skipping {component} quantization" + f" [WARN] {component_onnx} not found, skipping {component} quantization" ) continue # Save FP16 Gather data before quantization (for GatherBlockQuantized fix) - pre_quant_gathers = _save_gather_data(mobius_onnx) + pre_quant_gathers = _save_gather_data(component_onnx) + + # Load config as dict and override paths to target models_dir directly + with open(config_path) as f: + config = json.load(f) + config["input_model"]["model_path"] = component_onnx + config["output_dir"] = os.path.join(models_dir, component) print(f" [Olive] Quantizing {component} 
from {config_path}...") - run(str(config_path)) + run(config) - _fix_gather_block_quantized(mobius_onnx, pre_quant_gathers) - _strip_unused_initializers(mobius_onnx) + _fix_gather_block_quantized(component_onnx, pre_quant_gathers) + _strip_unused_initializers(component_onnx) def _save_gather_data(onnx_path: str) -> dict: @@ -324,24 +324,27 @@ def _strip_unused_initializers(onnx_path: str): print(f" [Cleanup] Stripped {removed} unused initializers → {data_mb:.0f} MB") -def export_models(config_dir: str, model_path: str, dtype: str = "f16"): +def export_models( + config_dir: str, model_path: str, dtype: str = "f16", models_dir: str | None = None +): """Export all 3 sub-models: text (Olive), vision + embedding (mobius). - For cpu_and_mobile, also applies INT4 quantization to vision and embedding - via Olive passes (vision.json / embedding.json). + All outputs go directly to models_dir. No post-export copy needed. + For cpu_and_mobile, also applies INT4 quantization to vision. """ - output_dir = str(Path(config_dir) / MODELS_DIR) + if models_dir is None: + models_dir = str(Path(config_dir) / MODELS_DIR) print("=== Exporting models ===") - # Text decoder via Olive/ModelBuilder (outputs to decoder/model.onnx directly) - export_text_decoder(config_dir) + # Text decoder via Olive/ModelBuilder + export_text_decoder(config_dir, models_dir) # Vision + embedding via mobius (FP16) - export_vision_and_embedding(output_dir, model_path, dtype) + export_vision_and_embedding(models_dir, model_path, dtype) # INT4 quantization of vision + embedding (cpu_and_mobile only) - quantize_vision_and_embedding(config_dir) + quantize_vision_and_embedding(config_dir, models_dir) print() @@ -539,19 +542,7 @@ def main(): models_dir = args.models_dir or str(Path(args.config_dir) / MODELS_DIR) if not args.skip_export: - export_models(args.config_dir, args.model_path, args.dtype) - - # If --models-dir is specified and differs from the default export location, - # copy the exported models there. 
text.json output_dir is hardcoded in the - # JSON config so we can't redirect it — copy after export instead. - default_dir = str(Path(args.config_dir) / MODELS_DIR) - if models_dir != default_dir: - import shutil - - if os.path.exists(models_dir): - shutil.rmtree(models_dir) - shutil.copytree(default_dir, models_dir) - print(f" Copied models to {models_dir}") + export_models(args.config_dir, args.model_path, args.dtype, models_dir) print("=== Generating configs ===") update_genai_config(output_dir=models_dir, device=args.device)