diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore b/mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore new file mode 100644 index 00000000..3b614474 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore @@ -0,0 +1,9 @@ +# Generated model artifacts +models/ + +# Python bytecode +__pycache__/ +*.pyc + +# Olive cache +.olive-cache/ diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md b/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md new file mode 100644 index 00000000..44fb3d02 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md @@ -0,0 +1,216 @@ +# Ministral-3-3B ONNX Runtime GenAI Example + +This example demonstrates how to convert [Ministral-3-3B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) vision-language model to ONNX format using Olive and run inference with ONNX Runtime GenAI. + +Ministral-3-3B is a multimodal (VLM) model combining a Pixtral vision encoder with a Mistral text decoder using YaRN RoPE for extended context. The pipeline exports three sub-models: +- **Vision encoder** and **embedding** via [mobius](https://github.com/onnxruntime/mobius) (declarative ONNX graph construction); vision INT4-quantized via Olive for CPU +- **Text decoder** via Olive/ModelBuilder (GQA + INT4 quantization) + +## Exported Configurations + +| Component | CUDA | CPU | +|-----------|------|-----| +| Text decoder | INT4 (`MatMulNBits`) | INT4 (`MatMulNBits`) | +| Vision encoder | FP16 | INT4 (`MatMulNBits` via Olive) | +| Embedding | FP16 | FP16 | + +- **CUDA**: INT4 text decoder + FP16 vision/embedding. Optimized for throughput on NVIDIA GPUs. +- **CPU**: INT4 text decoder + INT4 vision + FP16 embedding. Fully quantized where possible. Embedding stays FP16 because INT4 breaks its `Equal`/`Gather` logic. 
 + +## Benchmark Results + +Evaluated on [AI2D](https://allenai.org/data/diagrams) (science diagram multiple-choice QA, 4 options per question). + +| Configuration | Accuracy | Samples | Latency (s/sample) | Gap vs PyTorch | +|---------------|----------|---------|---------------------|----------------| +| PyTorch FP32 (CPU) | 72.00% | 100 | 21.66 | — baseline — | +| PyTorch FP16 (CUDA) | 73.00% | 200 | 0.20 | — baseline — | +| ONNX CUDA (INT4 text + FP16 vision) | 71.65% | 200 | 0.11 | −1.35 pp | +| ONNX CPU (INT4 text + INT4 vision) | 69.07% | 194 | 33.28 | −2.93 pp | + +All ONNX configurations are within the expected precision gap for INT4 quantization (<5 pp). +The CUDA ONNX model achieves a **~300× speedup** (33.28 s → 0.11 s per sample) over CPU ONNX and **~2× speedup** over PyTorch CUDA FP16. + +## Prerequisites + +```bash +pip install -r requirements.txt +``` + +Install ONNX Runtime GenAI: + +| Device | Install Command | +|--------|-----------------| +| CPU | `pip install onnxruntime-genai --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple` | +| GPU (CUDA) | `pip install onnxruntime-genai-cuda --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple` | + +## Steps + +### 1. 
Export & Optimize Models + +**CPU (INT4 text + INT4 vision + FP16 embedding):** + +```bash +python optimize.py --config-dir cpu_and_mobile --device cpu +``` + +**CUDA (INT4 text + FP16 vision/embedding):** + +```bash +python optimize.py --config-dir cuda --device gpu +``` + +**With local dequantized checkpoint (skips FP8 dequant):** + +```bash +python optimize.py --config-dir cpu_and_mobile --device cpu --model-path /path/to/Ministral-3-3B-dequantized +``` + +This runs: +- **Olive/ModelBuilder** for text decoder (GQA attention, YaRN RoPE, INT4/FP16) +- **Mobius** for vision encoder (Pixtral, dynamic H×W, 2D RoPE) and embedding (token + image fusion) +- **Olive INT4 quantization** on vision (cpu_and_mobile only; embedding stays FP16) + +Then generates `genai_config.json` and `processor_config.json` for the ORT GenAI runtime. + +### 2. Output Structure + +``` +cpu_and_mobile/models/ # or cuda/models/ +├── decoder/ +│ ├── model.onnx # Text decoder (Mistral + YaRN) +│ └── model.onnx.data +├── vision/ +│ ├── model.onnx # Pixtral vision encoder (FP16) +│ └── model.onnx.data +├── embedding/ +│ ├── model.onnx # Embedding fusion model (FP16) +│ └── model.onnx.data +├── genai_config.json # Runtime configuration +├── processor_config.json # Pixtral image preprocessing +├── tokenizer.json +└── tokenizer_config.json +``` + +### 3. Run Inference + +```bash +# Text-only +python inference.py --prompt "What is the capital of France?" + +# Image + text +python inference.py --image photo.jpg --prompt "Describe this image" + +# Interactive mode +python inference.py --interactive + +# CUDA model +python inference.py --model_path cuda/models --prompt "Hello" +``` + +Alternatively, use the built-in GenAI multimodal demo: + +```bash +python -m onnxruntime_genai.models.model_mm -m cpu_and_mobile/models --max_length 4096 +``` + +### 4. 
Evaluate + +Run the AI2D science diagram QA benchmark (see [Benchmark Results](#benchmark-results) for expected accuracy): + +```bash +# ONNX only (CPU INT4) +python eval.py --device cpu --model_path cpu_and_mobile/models + +# ONNX only (CUDA) +python eval.py --device cuda --model_path cuda/models + +# PyTorch baseline (BF16 variant avoids FP8 kernel requirement) +python eval.py --skip_onnx --pytorch_model mistralai/Ministral-3-3B-Instruct-2512-BF16 --device cpu --num_samples 100 + +# Compare ONNX vs PyTorch side-by-side +python eval.py --model_path cuda/models --pytorch_model mistralai/Ministral-3-3B-Instruct-2512-BF16 --num_samples 100 +``` + +> **Note:** The default HuggingFace checkpoint (`Ministral-3-3B-Instruct-2512`) uses FP8 weights, +> which require a specific CUDA kernel build. Use the `-BF16` variant for PyTorch baselines. + +## Directory Structure + +``` +mistralai-Ministral-3-3B-Instruct-2512/builtin/ +├── cpu_and_mobile/ +│ ├── text.json # INT4 text decoder config (Olive/ModelBuilder) +│ └── vision.json # INT4 vision quantization (Olive, post-mobius) +├── cuda/ +│ └── text.json # FP16 text decoder config (Olive/ModelBuilder) +├── optimize.py # Export orchestrator (Olive + Mobius) +├── inference.py # ORT GenAI inference (text + VLM) +├── eval.py # AI2D benchmark evaluation +├── requirements.txt +├── info.yml +└── README.md +``` + +> **Note:** Unlike Qwen VLM recipes (which use Olive for all 3 sub-models end-to-end), +> Ministral uses **mobius** for vision and embedding ONNX export, then **Olive** for +> INT4 quantization (cpu_and_mobile only). The CUDA target uses FP16 from mobius directly. + +## Differences from Qwen VLM Recipes + +Qwen VLM recipes export all three sub-models through Olive using JSON configs +(`text.json`, `vision.json`, `embedding.json`). Each JSON defines a multi-pass +pipeline: PyTorch export → graph surgery → ORT fusion → quantization/FP16. 
+ +This recipe takes a different approach for **vision and embedding**: + +| Component | Qwen | Ministral | Why | +|-----------|------|-----------|-----| +| Text decoder | Olive/ModelBuilder (`text.json`) | Olive/ModelBuilder (`text.json`) | Same — ModelBuilder handles GQA + quantization | +| Vision encoder | Olive: PyTorch export + 5-6 passes | **Mobius** export + Olive INT4 (`vision.json`) | Pixtral's dynamic image dims break `torch.onnx.export` | +| Embedding | Olive: PyTorch export + 5 passes | **Mobius** export (FP16, no INT4) | INT4 breaks embedding's Equal/Gather logic | + +**Why does Ministral use mobius instead of Olive for export?** Mobius constructs +the ONNX graph declaratively rather than tracing through PyTorch. The resulting +models already contain the graph optimizations that Qwen's Olive passes spend +5-6 steps creating: + +- **Fused operators:** `MultiHeadAttention`, `SkipSimplifiedLayerNormalization`, + `RotaryEmbedding` — already present in mobius output (Qwen achieves these via + `OrtTransformersOptimization`) +- **FP16 weights:** all 840M vision params exported as FP16 directly (Qwen + converts from FP32 via `OnnxFloatToFloat16`) +- **Clean graph:** 0 Gemm nodes, 0 redundant Cast chains (Qwen cleans these + via `GemmToMatMulAdd` and `OnnxPeepholeOptimizer`) +- **No PyTorch export artifacts:** no `PackedAttentionToLoopMHA` surgery needed + since mobius doesn't go through dynamo + +**What Olive still handles:** For `cpu_and_mobile`, `vision.json` applies +`OnnxBlockWiseRtnQuantization` (INT4) to the mobius-exported FP16 vision model. +For `cuda`, no additional Olive passes are needed — FP16 is optimal for GPU. 
+ +**Why optimize.py has more lines (~400) than Qwen (~170):** + +| Code section | Lines | Why it can't be JSON-driven | +|---|---|---| +| `export_vision_and_embedding()` | ~55 | Olive has no mobius integration; Pixtral's dynamic dims cause dynamo failures | +| `update_genai_config()` | ~150 | Olive generates decoder config only; VLM 3-model config + transforms-based processor_config has no Olive pass | +| `quantize_vision_and_embedding()` | ~25 | Post-export INT4 on pre-built ONNX (Olive JSON-driven, but needs orchestration) | +| `fix_tokenizer()` | ~15 | No Olive tokenizer patching pass | + +The text decoder export (`text.json`) and INT4 quantization (`vision.json`) ARE Olive JSON-driven — identical to Qwen. + +## Known Limitations + +- **CPU INT4 vision: language drift on some images.** The INT4-quantized vision encoder (CPU) occasionally produces embeddings that cause the text decoder to respond in the wrong language (e.g., Chinese instead of English). This has been observed on specific test images (e.g., `challenge.jpg`) and is a known artifact of aggressive vision quantization via the mobius export pipeline. The CUDA FP16 vision model does not exhibit this issue. +- **FP8 checkpoint requires special kernels.** The default HuggingFace checkpoint uses FP8 weights. Use the `-BF16` variant for PyTorch evaluation on machines without `kernels-community/finegrained-fp8`. +- **Single-image only.** Multi-image inputs are not yet supported; the runtime rejects prompts with more than one `[IMG]` token. + +## Notes + +- **CPU INT4 pipeline**: Mobius exports FP16 as an intermediate format. Olive then quantizes to INT4 for CPU deployment. The final model uses INT4 `MatMulNBits` which runs natively on CPU. FP16 is never used at runtime — it is only the input format required by Olive's `OnnxBlockWiseRtnQuantization` pass. +- **CUDA pipeline**: Mobius exports FP16 directly for vision/embedding. Text decoder uses INT4 via ModelBuilder. No additional quantization pass needed. 
+- The HuggingFace checkpoint uses FP8 quantized weights. The export pipeline dequantizes these automatically (`weight * weight_scale_inv`). +- The tokenizer uses `TokenizersBackend` class which genai doesn't support. The optimize script fixes this to `LlamaTokenizer`. +- Pixtral vision supports dynamic image sizes (multiples of 28, up to 1540×1540). +- The text decoder includes `llama_4_attn_scale` for long-context attention (>16K tokens). diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json new file mode 100644 index 00000000..df60e9c8 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json @@ -0,0 +1,18 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "mistralai/Ministral-3-3B-Instruct-2512" + }, + "passes": { + "convert": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "extra_options": { + "filename": "model.onnx" + } + } + }, + "no_artifacts": true, + "output_dir": "cpu_and_mobile/models/decoder" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json new file mode 100644 index 00000000..22427895 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json @@ -0,0 +1,22 @@ +{ + "input_model": { + "type": "ONNXModel", + "model_path": "cpu_and_mobile/models/vision/model.onnx" + }, + "passes": { + "int4": { + "type": "OnnxBlockWiseRtnQuantization", + "block_size": 128, + "is_symmetric": true, + "accuracy_level": 4, + "nodes_to_exclude": [ + "vision_encoder/vision_tower/rope/Gather_node_23", + "vision_encoder/vision_tower/rope/Gather_node_24" + ], + "save_as_external_data": true, + "external_data_name": "model.onnx.data" + } + }, + "no_artifacts": true, + "output_dir": "cpu_and_mobile/models/vision" +} diff --git 
a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json new file mode 100644 index 00000000..a445f666 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json @@ -0,0 +1,31 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "mistralai/Ministral-3-3B-Instruct-2512" + }, + "passes": { + "convert": { + "type": "ModelBuilder", + "precision": "int4", + "int4_accuracy_level": 4, + "extra_options": { + "filename": "model.onnx" + } + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "CUDAExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "cuda/models/decoder" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py new file mode 100644 index 00000000..27986655 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py @@ -0,0 +1,501 @@ +"""Evaluate Ministral-3-3B VLM (ONNX) vs PyTorch on AI2D (diagram understanding). + +AI2D is a multiple-choice visual QA benchmark on scientific diagrams. +Each sample has an image, a question, four answer options, and a ground-truth answer. +Accuracy is the fraction of questions answered with the correct option letter. 
+ +Expected precision gaps (ONNX vs PyTorch reference): + CPU + FP32 → expect ~0 pp gap (exact parity) + CUDA + FP16 → expect <2 pp gap (FP16 precision loss) + CPU + INT4 → expect <5 pp gap (quantization loss) + +Usage: + # CPU INT4 model (default) + python eval.py --device cpu --model_path cpu_and_mobile/models + + # CUDA FP16 model + python eval.py --device cuda --model_path cuda/models + + # Compare ONNX vs PyTorch reference + python eval.py --pytorch_model mistralai/Ministral-3-3B-Instruct-2512 + + # Larger sample + python eval.py --num_samples 200 +""" + +import argparse +import io +import json +import os +import re +import tempfile +import time + +import onnxruntime_genai as og +from datasets import load_dataset +from PIL import Image + +NUMBERS = ["1", "2", "3", "4"] + +# Expected accuracy gap thresholds (percentage points) by precision. +# These help users quickly assess whether a model export is healthy. +EXPECTED_GAP_PP = { + "fp32": 0.0, + "fp16": 2.0, + "int4": 5.0, +} + +DEFAULT_SYSTEM_PROMPT = ( + "You are a concise multiple-choice answering assistant. " + "When given a question with numbered options, respond with ONLY a single digit (1, 2, 3, or 4). " + "Do not include any explanation, reasoning, or other text — just the digit." +) + + +# --------------------------------------------------------------------------- +# Prompt helpers +# --------------------------------------------------------------------------- + + +def build_messages(question: str, options: list[str], system_prompt: str = "") -> str: + """Return a JSON-encoded chat messages list (for apply_chat_template). + + Uses string content with [IMG] prefix instead of structured content + because ORT GenAI's Jinja does not support the sort() filter needed + by Mistral3's structured-content template path. + """ + option_text = "\n".join(f"{N}. 
{o}" for N, o in zip(NUMBERS, options)) + content = ( + f"[IMG]Look at the diagram and answer the multiple-choice question.\n\n" + f"Question: {question}\n\n" + f"Options:\n{option_text}\n\n" + f"Reply with the number only (1, 2, 3, or 4)." + ) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": content}) + return json.dumps(messages) + + +def parse_answer(text: str) -> str | None: + """Extract the first 1/2/3/4 digit from a model response.""" + text = text.strip() + m = re.search(r"\b([1-4])\b", text) + if m: + return m.group(1) + for ch in text: + if ch in NUMBERS: + return ch + return None + + +def ground_truth_number(sample: dict) -> str | None: + """Normalise the dataset's answer field to a 1-based number string. + + AI2D stores answer as a 0-based integer index into the options list. + We map: index 0 → '1', 1 → '2', 2 → '3', 3 → '4'. + """ + answer = sample.get("answer", "") + try: + idx = int(answer) + if 0 <= idx < 4: + return NUMBERS[idx] + except (ValueError, TypeError): + pass # answer is not a valid integer index — return None below + return None + + +# --------------------------------------------------------------------------- +# Precision detection +# --------------------------------------------------------------------------- + + +def detect_onnx_precision(model_path: str) -> str: + """Infer ONNX model precision from the model directory or genai_config. + + Heuristics (in order): + 1. If genai_config.json exists and contains model builder metadata → use it + 2. If path contains 'int4' → 'int4' + 3. If path contains 'cpu_and_mobile' → 'int4' (default for CPU target) + 4. If path contains 'cuda' or 'fp16' → 'fp16' + 5. 
Fallback → 'fp16' + """ + path_lower = model_path.lower() + + # Check genai_config for precision hints + config_path = os.path.join(model_path, "genai_config.json") + if os.path.exists(config_path): + try: + with open(config_path) as f: + config = json.load(f) + # ModelBuilder writes precision into the config + decoder_cfg = config.get("model", {}).get("decoder", {}) + if "int4" in json.dumps(decoder_cfg).lower(): + return "int4" + except (json.JSONDecodeError, OSError): + pass # config unreadable — fall through to path-based heuristic + + if "int4" in path_lower: + return "int4" + if "cpu_and_mobile" in path_lower: + return "int4" + if "fp16" in path_lower or "cuda" in path_lower: + return "fp16" + return "fp16" + + +# --------------------------------------------------------------------------- +# Dataset helpers +# --------------------------------------------------------------------------- + + +def pil_from_sample(sample: dict) -> Image.Image | None: + """Return PIL image from a dataset sample regardless of field format.""" + img = sample.get("image") + if img is None: + return None + if isinstance(img, Image.Image): + return img.convert("RGB") + if isinstance(img, bytes): + return Image.open(io.BytesIO(img)).convert("RGB") + if isinstance(img, dict) and "bytes" in img: + return Image.open(io.BytesIO(img["bytes"])).convert("RGB") + return None + + +def load_ai2d(num_samples: int): + """Load a deterministic subset of AI2D test samples.""" + print(f"Loading AI2D dataset ({num_samples} samples)…") + ds = load_dataset("lmms-lab/ai2d", split="test") + ds = ds.select(range(min(num_samples, len(ds)))) + print(f" Loaded {len(ds)} samples.") + return ds + + +# --------------------------------------------------------------------------- +# ONNX inference +# --------------------------------------------------------------------------- + + +def build_onnx_runner(model_path: str): + """Load ONNX model with ORT GenAI.""" + print(f"\nLoading ONNX model from: {model_path}") + model 
= og.Model(model_path) + processor = model.create_multimodal_processor() + tokenizer = og.Tokenizer(model) + print(" ONNX model loaded.") + return model, processor, tokenizer + + +def run_onnx( + model, processor, tokenizer, pil_image: Image.Image, messages_json: str +) -> str: + """Run a single inference with the ONNX GenAI model.""" + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + pil_image.save(f, format="PNG") + tmp_path = f.name + + try: + images = og.Images.open(tmp_path) + prompt = tokenizer.apply_chat_template( + messages_json, add_generation_prompt=True + ) + inputs = processor(prompt, images=images) + + params = og.GeneratorParams(model) + params.set_search_options(max_length=2000, do_sample=False) + + generator = og.Generator(model, params) + generator.set_inputs(inputs) + + tokens = [] + while not generator.is_done(): + generator.generate_next_token() + tokens.append(generator.get_next_tokens()[0]) + del generator + + return tokenizer.decode(tokens) + finally: + os.unlink(tmp_path) + + +# --------------------------------------------------------------------------- +# PyTorch inference +# --------------------------------------------------------------------------- + + +def build_pytorch_runner(model_id: str, device: str = "auto"): + """Load HuggingFace PyTorch model for comparison. + + Args: + model_id: HuggingFace model ID or local path. + device: 'cpu', 'cuda', or 'auto' (auto-detect). 
+ """ + print(f"\nLoading PyTorch model: {model_id}") + import torch + from transformers import AutoProcessor, Mistral3ForConditionalGeneration + + if device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + dtype = torch.float16 if device == "cuda" else torch.float32 + precision_label = "fp16" if device == "cuda" else "fp32" + print(f" Device: {device}, dtype: {dtype} ({precision_label})") + + pt_model = Mistral3ForConditionalGeneration.from_pretrained( + model_id, dtype=dtype, trust_remote_code=True + ).to(device) + pt_proc = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) + print(" PyTorch model loaded.") + return pt_model, pt_proc, device, precision_label + + +def run_pytorch( + pt_model, + pt_proc, + pil_image: Image.Image, + question: str, + options: list[str], + device: str, + system_prompt: str = "", +) -> str: + """Run a single inference with the HuggingFace PyTorch model.""" + import torch + + option_text = "\n".join(f"{N}. {o}" for N, o in zip(NUMBERS, options)) + content = ( + f"Look at the diagram and answer the multiple-choice question.\n\n" + f"Question: {question}\n\n" + f"Options:\n{option_text}\n\n" + f"Reply with the number only (1, 2, 3, or 4)." 
+ ) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append( + { + "role": "user", + "content": [ + {"type": "image", "image": pil_image}, + {"type": "text", "text": content}, + ], + } + ) + text = pt_proc.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + inputs = pt_proc( + text=[text], images=[pil_image], padding=True, return_tensors="pt" + ).to(device) + + with torch.no_grad(): + out = pt_model.generate(**inputs, max_new_tokens=8, do_sample=False) + + out_ids = out[0][inputs["input_ids"].shape[-1] :] + return pt_proc.decode(out_ids, skip_special_tokens=True) + + +# --------------------------------------------------------------------------- +# Evaluation loop +# --------------------------------------------------------------------------- + + +def evaluate(dataset, runner_fn, label: str) -> dict: + """Run evaluation on a dataset with the given runner function.""" + correct = 0 + skipped = 0 + total = len(dataset) + latencies = [] + + print(f"\n{'=' * 60}") + print(f" Evaluating: {label} ({total} samples)") + print(f"{'=' * 60}") + + for i, sample in enumerate(dataset): + gt = ground_truth_number(sample) + if gt is None: + skipped += 1 + continue + + pil_image = pil_from_sample(sample) + if pil_image is None: + skipped += 1 + continue + + question = sample.get("question", "") + options = sample.get("options", []) + if len(options) < 2: + skipped += 1 + continue + + try: + t0 = time.perf_counter() + raw = runner_fn(pil_image, question, options) + elapsed = time.perf_counter() - t0 + latencies.append(elapsed) + except Exception as e: + print(f" [WARN] sample {i}: {e}") + skipped += 1 + continue + + pred = parse_answer(raw) + hit = pred == gt + + if (i + 1) % 10 == 0 or i == 0: + print( + f" [{i + 1:4d}/{total}] gt={gt} pred={pred} raw={raw.strip()!r:20} " + f"{'✓' if hit else '✗'} running_acc={correct / (i + 1 - skipped + 1e-9):.3f}" + ) + + if hit: + correct += 1 + + 
evaluated = total - skipped + accuracy = correct / evaluated if evaluated > 0 else 0.0 + avg_lat = sum(latencies) / len(latencies) if latencies else 0.0 + + print( + f"\n {label}: {correct}/{evaluated} correct | " + f"accuracy = {accuracy:.4f} ({accuracy * 100:.2f}%)" + ) + print(f" avg latency per sample: {avg_lat:.2f}s | skipped: {skipped}") + return { + "label": label, + "accuracy": accuracy, + "correct": correct, + "evaluated": evaluated, + "avg_latency_s": avg_lat, + "skipped": skipped, + } + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser( + description="Eval ONNX vs PyTorch Ministral-3-3B VLM on AI2D" + ) + parser.add_argument( + "--model_path", + default="cpu_and_mobile/models", + help="Path to ONNX model dir (default: cpu_and_mobile/models/)", + ) + parser.add_argument( + "--pytorch_model", + default=None, + help="HuggingFace model ID for PyTorch comparison", + ) + parser.add_argument( + "--num_samples", + type=int, + default=100, + help="Number of AI2D test samples to evaluate (default: 100)", + ) + parser.add_argument( + "--device", + choices=["cpu", "cuda", "auto"], + default="auto", + help="Device for inference: cpu, cuda, or auto-detect (default: auto)", + ) + parser.add_argument( + "--skip_onnx", + action="store_true", + help="Skip ONNX evaluation", + ) + parser.add_argument( + "--system_prompt", + default=DEFAULT_SYSTEM_PROMPT, + help="System prompt to suppress chain-of-thought. 
Pass empty string to disable.", + ) + args = parser.parse_args() + + ds = load_ai2d(args.num_samples) + results = [] + + sys_prompt = args.system_prompt + if sys_prompt: + print(f"\nSystem prompt: {sys_prompt!r}") + else: + print("\nSystem prompt: (none)") + + # Detect ONNX precision from model path + onnx_precision = detect_onnx_precision(args.model_path) + + # ---- ONNX ---- + if not args.skip_onnx: + onnx_model, onnx_proc, onnx_tok = build_onnx_runner(args.model_path) + + def onnx_runner(pil_image, question, options): + msgs = build_messages(question, options, sys_prompt) + return run_onnx(onnx_model, onnx_proc, onnx_tok, pil_image, msgs) + + onnx_label = f"ONNX ({onnx_precision.upper()}) @ {args.model_path}" + results.append(evaluate(ds, onnx_runner, onnx_label)) + + # ---- PyTorch (optional) ---- + pt_precision = None + if args.pytorch_model: + pt_model, pt_proc, pt_device, pt_precision = build_pytorch_runner( + args.pytorch_model, device=args.device + ) + + def pt_runner(pil_image, question, options): + return run_pytorch( + pt_model, pt_proc, pil_image, question, options, pt_device, sys_prompt + ) + + pt_label = f"PyTorch ({pt_precision.upper()}) @ {args.pytorch_model}" + results.append(evaluate(ds, pt_runner, pt_label)) + + # ---- Summary ---- + print(f"\n{'=' * 60}") + print(" EVALUATION SUMMARY") + print(f"{'=' * 60}") + print(" Model : Ministral-3-3B-Instruct-2512 (VLM)") + print(" Dataset : AI2D (science diagram QA, multiple choice)") + print(f" Samples : {args.num_samples}") + print(f" ONNX prec : {onnx_precision.upper()}") + if pt_precision: + print(f" PyTorch prec: {pt_precision.upper()}") + print( + f" System prompt: " + f"{'(none)' if not sys_prompt else sys_prompt[:80] + ('...' 
if len(sys_prompt) > 80 else '')}" + ) + print() + for r in results: + print(f" {r['label']}") + print( + f" Accuracy : {r['accuracy'] * 100:.2f}% ({r['correct']}/{r['evaluated']})" + ) + print(f" Avg lat : {r['avg_latency_s']:.2f}s/sample") + print() + + if len(results) == 2: + delta = results[0]["accuracy"] - results[1]["accuracy"] + abs_delta = abs(delta) * 100 + print(f" Accuracy delta (ONNX - PyTorch): {delta * 100:+.2f} pp") + print( + f" Speedup (PyTorch lat / ONNX lat): " + f"{results[1]['avg_latency_s'] / max(results[0]['avg_latency_s'], 1e-9):.2f}x" + ) + + # Precision gap assessment + expected_gap = EXPECTED_GAP_PP.get(onnx_precision, 5.0) + print() + print(f" Expected gap for {onnx_precision.upper()}: <{expected_gap:.0f} pp") + if abs_delta <= expected_gap: + print(f" ✓ PASS — {abs_delta:.2f} pp gap is within expected range") + else: + print( + f" ✗ WARN — {abs_delta:.2f} pp gap exceeds expected {expected_gap:.0f} pp for {onnx_precision.upper()}" + ) + print( + " This may indicate a quality regression in the export pipeline." + ) + + +if __name__ == "__main__": + main() diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/inference.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/inference.py new file mode 100644 index 00000000..1b799c1b --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/inference.py @@ -0,0 +1,163 @@ +"""ONNX Runtime GenAI inference for Ministral-3-3B vision-language model. + +Usage: + python inference.py --prompt "What is the capital of France?" 
    python inference.py --image photo.jpg --prompt "Describe this image"
    python inference.py --interactive
    python inference.py --model_path cuda/models --prompt "Hello"
"""

import argparse
import json
import time

import onnxruntime_genai as og


def main() -> None:
    """CLI entry point: load the model once, then run one-shot or interactive."""
    parser = argparse.ArgumentParser(
        description="ONNX Runtime GenAI inference for Ministral-3-3B"
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default="cpu_and_mobile/models",
        help="Path to model directory containing genai_config.json and ONNX models",
    )
    parser.add_argument("--image", type=str, default=None, help="Path to image file")
    parser.add_argument("--prompt", type=str, default=None, help="Text prompt")
    parser.add_argument(
        "--max_length", type=int, default=4096, help="Maximum total tokens"
    )
    parser.add_argument(
        "--interactive", action="store_true", help="Run in interactive mode"
    )
    args = parser.parse_args()

    print(f"Loading model from: {args.model_path}")
    model = og.Model(args.model_path)
    processor = model.create_multimodal_processor()
    tokenizer = og.Tokenizer(model)
    # Streaming detokenizer: decodes one token at a time as generation proceeds.
    tokenizer_stream = processor.create_stream()

    if args.interactive:
        interactive_mode(model, processor, tokenizer, tokenizer_stream, args)
    elif args.prompt:
        generate_response(
            model,
            processor,
            tokenizer,
            tokenizer_stream,
            args.prompt,
            args.image,
            args.max_length,
        )
    else:
        print("Please provide --prompt or --interactive")
        parser.print_help()


def generate_response(
    model, processor, tokenizer, tokenizer_stream, prompt, image_path, max_length=4096
):
    """Run a single generation.

    Streams decoded tokens to stdout and prints TTFT / decode-TPS stats.
    If image_path is given, the prompt is wrapped in an image+text message;
    otherwise a plain text message is sent.
    """
    images = None
    if image_path:
        print(f"Loading image: {image_path}")
        images = og.Images.open(image_path)
        messages = [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": prompt}],
            }
        ]
    else:
        messages = [{"role": "user", "content": prompt}]

    # The chat template consumes the messages list serialized as a JSON string.
    full_prompt = tokenizer.apply_chat_template(
        json.dumps(messages), add_generation_prompt=True
    )
    print(f"\nPrompt: {prompt}")
    if image_path:
        print(f"Image: {image_path}")
    print("\nGenerating response...")

    inputs = processor(full_prompt, images=images)
    params = og.GeneratorParams(model)
    params.set_search_options(max_length=max_length)

    generator = og.Generator(model, params)
    generator.set_inputs(inputs)

    token_count = 0
    ttft = None  # time-to-first-token; set once after the first generated token
    t_start = time.perf_counter()

    print("\nResponse: ", end="", flush=True)
    while not generator.is_done():
        generator.generate_next_token()
        if ttft is None:
            # Measured after the first token, so it includes prefill time.
            ttft = time.perf_counter() - t_start
        token_count += 1
        new_token = generator.get_next_tokens()[0]
        print(tokenizer_stream.decode(new_token), end="", flush=True)

    t_total = time.perf_counter() - t_start
    print()
    del generator

    # Exclude the first token (prefill-dominated) from decode throughput.
    decode_tokens = max(token_count - 1, 1)
    decode_time = t_total - (ttft or 0)
    tps = decode_tokens / decode_time if decode_time > 0 else 0

    print(f"\n Tokens generated : {token_count}")
    print(f" TTFT : {(ttft or 0) * 1000:.1f} ms")
    print(f" Decode TPS : {tps:.1f} tokens/sec")
    print(f" Total time : {t_total:.2f} s")


def interactive_mode(model, processor, tokenizer, tokenizer_stream, args):
    """Run in interactive mode.

    Reads prompts from stdin until 'quit'/'exit' or EOF. A prompt of the form
    'image:/path/to/img.jpg text...' attaches the image to the request.
    """
    print("\n" + "=" * 50)
    print("Interactive Mode - Enter 'quit' to stop")
    print("To include an image: image:/path/to/image.jpg your prompt")
    print("=" * 50 + "\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except EOFError:
            break

        if user_input.lower() in ("quit", "exit"):
            break
        if not user_input:
            continue

        image_path = None
        prompt = user_input
        if user_input.startswith("image:"):
            parts = user_input.split(" ", 1)
            # Drop the 6-char "image:" prefix; the rest of the first word is the path.
            image_path = parts[0][6:]
            prompt = parts[1] if len(parts) > 1 else "Describe this image"

        try:
            generate_response(
                model,
                processor,
                tokenizer,
                tokenizer_stream,
                prompt,
                image_path,
                args.max_length,
            )
        except Exception as e:
            # Keep the REPL alive on per-request failures (bad path, decode error).
            print(f"Error: {e}")

        print("-" * 50 + "\n")

    print("Goodbye!")


if __name__ == "__main__":
    main()
diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/info.yml b/mistralai-Ministral-3-3B-Instruct-2512/builtin/info.yml
new file mode 100644
index 00000000..a7b84368
--- /dev/null
+++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/info.yml
@@ -0,0 +1,9 @@
+keywords:
+  olive-ai
+ep:
+  CPUExecutionProvider
+  CUDAExecutionProvider
+device:
+  cpu
+  gpu
+name: ministral_3_3b
diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py
new file mode 100644
index 00000000..8036bee9
--- /dev/null
+++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/optimize.py
@@ -0,0 +1,555 @@
+"""End-to-end optimization pipeline for Ministral-3-3B ONNX models.

Uses mobius for vision and embedding export (reliable dynamo-free ONNX
construction), and Olive/ModelBuilder for text decoder export (GQA + INT4).

Pipeline:
    cpu_and_mobile: Olive/ModelBuilder(INT4) → mobius(FP16) → Olive INT4 vision
    cuda: Olive/ModelBuilder(INT4) → mobius(FP16)

Architecture difference from Qwen VLM recipes:
    Qwen uses Olive passes for all 3 sub-models (export + optimization).
    Ministral uses mobius for vision/embedding because Pixtral's dynamic
    image dimensions cause torch.onnx.export/dynamo failures. Mobius
    produces already-optimized graphs (fused MHA, SkipLayerNorm, FP16).
    For cpu_and_mobile, Olive then applies INT4 quantization to the vision
    model post-export.

Usage:
    python optimize.py --config-dir cpu_and_mobile --device cpu
    python optimize.py --config-dir cuda --device gpu
    python optimize.py --config-dir cpu_and_mobile --device cpu --skip-export
    python optimize.py --config-dir cpu_and_mobile --device cpu --model-path /local/dequantized/checkpoint
"""

import argparse
import json
import logging
import os
import shutil
from pathlib import Path

# Quiet noisy dependency loggers during export.
logging.getLogger("onnxscript").setLevel(logging.WARNING)
logging.getLogger("onnx_ir").setLevel(logging.WARNING)

MODELS_DIR = "models"
MODEL_NAME = "mistralai/Ministral-3-3B-Instruct-2512"

# Lazy-loaded HuggingFace config (avoids import-time network access)
_HF_CONFIG = None


def _get_hf_config():
    """Load and cache the HuggingFace model config.

    Always loads from MODEL_NAME (the canonical HF model ID) rather than
    --model-path, because the config values (image_token_id, patch_size, etc.)
    are architecture constants that don't change between checkpoints.
    """
    global _HF_CONFIG
    if _HF_CONFIG is None:
        from transformers import Mistral3Config

        _HF_CONFIG = Mistral3Config.from_pretrained(MODEL_NAME)
    return _HF_CONFIG


def export_text_decoder(config_dir: str, models_dir: str):
    """Export text decoder using Olive/ModelBuilder (GQA + quantization).

    Loads text.json as a dict and overrides output_dir to write directly
    to <models_dir>/decoder. ModelBuilder also generates genai_config.json,
    tokenizer, and chat_template inside decoder/ — we move them to the
    models root where the VLM pipeline expects them.
    """
    # `run` moved to the top-level olive package; fall back for older layouts.
    try:
        from olive import run
    except ImportError:
        from olive.workflows import run

    config_path = Path(config_dir) / "text.json"
    if not config_path.exists():
        raise FileNotFoundError(f"Text config not found: {config_path}")

    # Load config as dict and override output_dir to write directly to models_dir
    with open(config_path) as f:
        config = json.load(f)
    config["output_dir"] = os.path.join(models_dir, "decoder")

    print(f" [Olive] Exporting text decoder from {config_path}...")
    run(config)

    # Move shared configs from decoder/ to models root for VLM pipeline
    decoder_dir = Path(models_dir) / "decoder"
    for filename in (
        "genai_config.json",
        "tokenizer.json",
        "tokenizer_config.json",
        "chat_template.jinja",
    ):
        src = decoder_dir / filename
        if src.exists():
            shutil.move(str(src), str(Path(models_dir) / filename))


def export_vision_and_embedding(
    output_dir: str,
    model_path: str,
    dtype: str = "f16",
):
    """Export vision encoder and embedding using mobius.

    Mobius constructs the ONNX graph declaratively and applies pretrained
    weights, avoiding torch.onnx.export dynamo issues with Pixtral's
    dynamic image dimensions.
    """
    from mobius import build

    print(f" [Mobius] Building VLM from {model_path} (dtype={dtype})...")
    # mobius.build() accepts dtype as a string (e.g. "f16", "f32", "bf16")
    # and resolves it internally — pass the CLI string directly
    pkg = build(model_path, dtype=dtype, load_weights=True)

    os.makedirs(output_dir, exist_ok=True)

    required_components = ("vision", "embedding")
    missing_components = []

    for component in required_components:
        if component in pkg:
            component_dir = os.path.join(output_dir, component)
            os.makedirs(component_dir, exist_ok=True)
            try:
                pkg.save(
                    component_dir,
                    # Default arg pins the current component name (avoids the
                    # late-binding closure pitfall in loops).
                    components=lambda name, c=component: name == c,
                    check_weights=True,
                )
                # mobius saves as model.onnx directly in component_dir
                expected_onnx = os.path.join(component_dir, "model.onnx")
                if not os.path.exists(expected_onnx):
                    raise FileNotFoundError(
                        f"Mobius export did not produce expected ONNX file for '{component}': {expected_onnx}"
                    )
                print(f" [Mobius] Saved {expected_onnx}")
            except Exception:
                # Don't leave a half-written component directory behind.
                shutil.rmtree(component_dir, ignore_errors=True)
                raise
        else:
            missing_components.append(component)

    if missing_components:
        raise ValueError(
            "Mobius package is missing required component(s): "
            + ", ".join(missing_components)
        )

    print(" [Mobius] Vision and embedding export complete")


def quantize_vision_and_embedding(config_dir: str, models_dir: str):
    """Apply INT4 quantization to mobius-exported vision and embedding models.

    Loads vision.json / embedding.json as dicts and overrides model_path
    and output_dir to point directly to <models_dir>/<component>/.
    This avoids writing to the JSON config's hardcoded relative paths.

    After quantization, replaces any GatherBlockQuantized nodes (ORT can't
    load quantized Gather on RoPE caches) and strips unused initializers.

    For cpu_and_mobile: INT4 quantization reduces model size ~87%.
    For cuda: no vision/embedding configs exist (FP16 is optimal for GPU).
    """
    # `run` moved to the top-level olive package; fall back for older layouts.
    try:
        from olive import run
    except ImportError:
        from olive.workflows import run

    for component in ("vision", "embedding"):
        config_path = Path(config_dir) / f"{component}.json"
        # Absent config means this component stays FP16 for this target.
        if not config_path.exists():
            continue

        component_onnx = os.path.join(models_dir, component, "model.onnx")
        if not os.path.exists(component_onnx):
            print(
                f" [WARN] {component_onnx} not found, skipping {component} quantization"
            )
            continue

        # Save FP16 Gather data before quantization (for GatherBlockQuantized fix)
        pre_quant_gathers = _save_gather_data(component_onnx)

        # Load config as dict and override paths to target models_dir directly
        with open(config_path) as f:
            config = json.load(f)
        config["input_model"]["model_path"] = component_onnx
        config["output_dir"] = os.path.join(models_dir, component)

        print(f" [Olive] Quantizing {component} from {config_path}...")
        run(config)

        _fix_gather_block_quantized(component_onnx, pre_quant_gathers)
        _strip_unused_initializers(component_onnx)


def _save_gather_data(onnx_path: str) -> dict:
    """Save Gather node initializer data before quantization.

    Returns a dict mapping initializer names to numpy arrays for any
    initializers used by Gather nodes (e.g., RoPE cos/sin caches).
    """
    import onnx
    from onnx import numpy_helper

    model = onnx.load(onnx_path)
    gather_inputs = set()
    for node in model.graph.node:
        if node.op_type == "Gather":
            gather_inputs.add(node.input[0])

    result = {}
    for init in model.graph.initializer:
        if init.name in gather_inputs:
            result[init.name] = numpy_helper.to_array(init)
    return result


def _fix_gather_block_quantized(onnx_path: str, pre_quant_gathers: dict):
    """Replace GatherBlockQuantized nodes with plain Gather + original data.

    Olive's INT4 quantization may quantize Gather nodes (e.g., RoPE cos/sin
    caches) into GatherBlockQuantized, which ORT can't load. This restores
    the original FP16/FP32 data and replaces with plain Gather.
+ """ + import onnx + from onnx import helper, numpy_helper + + model = onnx.load(onnx_path, load_external_data=False) + gbq_nodes = [n for n in model.graph.node if n.op_type == "GatherBlockQuantized"] + if not gbq_nodes: + return + + model = onnx.load(onnx_path) + inits = {i.name: i for i in model.graph.initializer} + + for node in list(model.graph.node): + if node.op_type != "GatherBlockQuantized": + continue + + q_name = node.input[0] + idx_name = node.input[1] + orig_name = q_name.replace("_Q4", "") + + if orig_name in pre_quant_gathers: + new_init = numpy_helper.from_array( + pre_quant_gathers[orig_name], name=orig_name + ) + model.graph.initializer.append(new_init) + + new_node = helper.make_node( + "Gather", + [orig_name, idx_name], + list(node.output), + name=node.name.replace("_Q4", ""), + ) + + idx = list(model.graph.node).index(node) + model.graph.node.remove(node) + model.graph.node.insert(idx, new_node) + + for inp in node.input: + if inp != idx_name and inp in inits: + init = inits[inp] + if init in model.graph.initializer: + model.graph.initializer.remove(init) + + onnx.save( + model, + onnx_path, + save_as_external_data=True, + all_tensors_to_one_file=True, + location="model.onnx.data", + size_threshold=1024, + ) + print(f" [PostProcess] Replaced {len(gbq_nodes)} GatherBlockQuantized → Gather") + + +def _strip_unused_initializers(onnx_path: str): + """Remove unused initializers and re-save to shrink the external data file. + + Olive's OnnxBlockWiseRtnQuantization keeps original weights alongside + the new INT4 (UINT8) weights. Stripping the unused originals typically + reduces the data file by ~87% (e.g., 1.7GB → 220MB for the vision model). 
+ """ + if not os.path.exists(onnx_path): + return + + import onnx + from onnx.external_data_helper import convert_model_to_external_data + + model = onnx.load(onnx_path) + + used_names = set() + for node in model.graph.node: + for inp in node.input: + used_names.add(inp) + for inp in model.graph.input: + used_names.add(inp.name) + + before = len(model.graph.initializer) + new_inits = [init for init in model.graph.initializer if init.name in used_names] + removed = before - len(new_inits) + + if removed == 0: + return + + del model.graph.initializer[:] + model.graph.initializer.extend(new_inits) + + for init in model.graph.initializer: + del init.external_data[:] + + data_name = "model.onnx.data" + data_path = os.path.join(os.path.dirname(onnx_path), data_name) + if os.path.exists(data_path): + os.remove(data_path) + + convert_model_to_external_data( + model, all_tensors_to_one_file=True, location=data_name, size_threshold=1024 + ) + onnx.save(model, onnx_path) + + data_mb = os.path.getsize(data_path) / 1e6 if os.path.exists(data_path) else 0 + print(f" [Cleanup] Stripped {removed} unused initializers → {data_mb:.0f} MB") + + +def export_models( + config_dir: str, model_path: str, dtype: str = "f16", models_dir: str | None = None +): + """Export all 3 sub-models: text (Olive), vision + embedding (mobius). + + All outputs go directly to models_dir. No post-export copy needed. + For cpu_and_mobile, also applies INT4 quantization to vision. 
+ """ + if models_dir is None: + models_dir = str(Path(config_dir) / MODELS_DIR) + + print("=== Exporting models ===") + + # Text decoder via Olive/ModelBuilder + export_text_decoder(config_dir, models_dir) + + # Vision + embedding via mobius (FP16) + export_vision_and_embedding(models_dir, model_path, dtype) + + # INT4 quantization of vision + embedding (cpu_and_mobile only) + quantize_vision_and_embedding(config_dir, models_dir) + + print() + + +def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"): + """Patch genai_config.json with embedding/vision sections and processor_config. + + Derives model-specific values from the HuggingFace config (lazily loaded) + to avoid hardcoded constants drifting from the actual checkpoint. + """ + config_path = Path(output_dir) / "genai_config.json" + + with open(config_path) as f: + config = json.load(f) + + if device == "gpu": + # CUDA graph capture is unsupported for VLMs with dynamic image sizes. + # Disable for all models (matches Qwen VLM recipe convention). 
+ provider_options = [ + { + "cuda": { + "enable_cuda_graph": "0", + "enable_skip_layer_norm_strict_mode": "1", + } + } + ] + vision_provider_options = provider_options + else: + provider_options = [] + vision_provider_options = [] + + session_options = { + "log_id": "onnxruntime-genai", + "provider_options": provider_options, + } + vision_session_options = { + "log_id": "onnxruntime-genai", + "provider_options": vision_provider_options, + } + + config["model"]["decoder"]["session_options"] = session_options + config["model"]["decoder"]["filename"] = "decoder/model.onnx" + + # Sync position_ids with what the decoder ONNX model actually supports + decoder_onnx = Path(output_dir) / "decoder" / "model.onnx" + if decoder_onnx.exists(): + import onnx + + decoder_model = onnx.load(str(decoder_onnx), load_external_data=False) + onnx_input_names = {inp.name for inp in decoder_model.graph.input} + if "position_ids" in onnx_input_names: + config["model"]["decoder"].setdefault("inputs", {})["position_ids"] = ( + "position_ids" + ) + else: + config["model"]["decoder"].get("inputs", {}).pop("position_ids", None) + + config["model"]["embedding"] = { + "filename": "embedding/model.onnx", + "inputs": {"input_ids": "input_ids", "image_features": "image_features"}, + "outputs": {"inputs_embeds": "inputs_embeds"}, + "session_options": vision_session_options, + } + + # Vision config — values derived from HF config to stay in sync with checkpoint + hf_config = _get_hf_config() + config["model"]["vision"] = { + "filename": "vision/model.onnx", + "config_filename": "processor_config.json", + "spatial_merge_size": hf_config.spatial_merge_size, + "patch_size": hf_config.vision_config.patch_size, + "inputs": {"pixel_values": "pixel_values"}, + "outputs": {"image_features": "image_features"}, + "session_options": vision_session_options, + } + + # Add VLM-specific fields not generated by ModelBuilder. 
+ # Don't override context_length or max_length — PR #2077's ModelBuilder + # sets these correctly (context_length=262144, max_length=32768). + config["model"]["image_token_id"] = hf_config.image_token_index + + # Override search defaults for VLM: greedy decoding, no KV sharing + config["search"]["top_k"] = 1 + config["search"]["past_present_share_buffer"] = False + + with open(config_path, "w") as f: + json.dump(config, f, indent=4) + print(f" Updated {config_path}") + + # Transforms-based processor config (matches ORT GenAI's image preprocessor format) + processor_config = { + "processor": { + "name": "pixtral_image_processor", + "transforms": [ + { + "operation": { + "name": "decode_image", + "type": "DecodeImage", + "attrs": {"color_space": "RGB"}, + } + }, + { + "operation": { + "name": "convert_to_rgb", + "type": "ConvertRGB", + } + }, + { + "operation": { + "name": "resize", + "type": "Resize", + "attrs": { + "height": 1540, + "width": 1540, + "smart_resize": 1, + "min_pixels": 784, + "max_pixels": 2371600, + "patch_size": hf_config.vision_config.patch_size, + "merge_size": hf_config.spatial_merge_size, + }, + } + }, + { + "operation": { + "name": "rescale", + "type": "Rescale", + "attrs": {"rescale_factor": 0.00392156862745098}, + } + }, + { + "operation": { + "name": "normalize", + "type": "Normalize", + "attrs": { + "mean": [0.48145466, 0.4578275, 0.40821073], + "std": [0.26862954, 0.26130258, 0.27577711], + }, + } + }, + { + "operation": { + "name": "permute", + "type": "Permute3D", + "attrs": {"dims": [2, 0, 1]}, + } + }, + ], + } + } + + processor_path = Path(output_dir) / "processor_config.json" + with open(processor_path, "w") as f: + json.dump(processor_config, f, indent=2) + print(f" Created {processor_path}") + + +def fix_tokenizer(output_dir: str = MODELS_DIR): + """Fix tokenizer_config.json for onnxruntime-genai compatibility. + + Ministral3's tokenizer uses 'TokenizersBackend' class which isn't supported + by genai's ort-extensions tokenizer. 
Change to 'LlamaTokenizer'.
    """
    tc_path = Path(output_dir) / "tokenizer_config.json"
    if tc_path.exists():
        tc = json.loads(tc_path.read_text(encoding="utf-8"))
        # Only rewrite when the unsupported class is present; leave any other
        # tokenizer_class (and the rest of the config) untouched.
        if tc.get("tokenizer_class") == "TokenizersBackend":
            tc["tokenizer_class"] = "LlamaTokenizer"
            tc_path.write_text(
                json.dumps(tc, indent=2, ensure_ascii=False), encoding="utf-8"
            )
            print(" Fixed tokenizer_class to LlamaTokenizer")


def main():
    """CLI entry point: export (unless skipped), then patch runtime configs."""
    parser = argparse.ArgumentParser(description="Optimize Ministral-3-3B ONNX models")
    parser.add_argument("--device", choices=["gpu", "cpu"], default="cpu")
    parser.add_argument("--config-dir", default="cpu_and_mobile")
    parser.add_argument("--skip-export", action="store_true")
    parser.add_argument("--models-dir", default=None)
    parser.add_argument(
        "--model-path",
        default=MODEL_NAME,
        help="HuggingFace model ID or local path to dequantized checkpoint",
    )
    parser.add_argument(
        "--dtype",
        default="f16",
        choices=["f16", "f32", "bf16"],
        help="Dtype for mobius vision/embedding export. FP16 is recommended: it serves "
        "as the intermediate format for Olive INT4 quantization on CPU, and as "
        "the final format on CUDA. (default: f16)",
    )
    args = parser.parse_args()

    # Default layout: <config-dir>/models (matches the JSON configs' outputs).
    models_dir = args.models_dir or str(Path(args.config_dir) / MODELS_DIR)

    if not args.skip_export:
        export_models(args.config_dir, args.model_path, args.dtype, models_dir)

    print("=== Generating configs ===")
    update_genai_config(output_dir=models_dir, device=args.device)
    fix_tokenizer(output_dir=models_dir)
    print()
    print("Done.")


if __name__ == "__main__":
    main()
diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/requirements.txt b/mistralai-Ministral-3-3B-Instruct-2512/builtin/requirements.txt
new file mode 100644
index 00000000..0a184051
--- /dev/null
+++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/requirements.txt
@@ -0,0 +1,4 @@
+git+https://github.com/microsoft/Olive.git@main
+git+https://github.com/onnxruntime/mobius.git@3777c18
+torch>=2.10.0,<2.11.0
+transformers>=4.57.0