Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions crates/multimodal/src/registry/llama4.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,8 @@ impl ModelProcessorSpec for Llama4Spec {
}

fn matches(&self, metadata: &ModelMetadata) -> bool {
let id = metadata.model_id.to_ascii_lowercase();
// Match "llama-4", "llama4", "Llama-4-Maverick", "Llama-4-Scout", etc.
id.contains("llama-4") || id.contains("llama4")
metadata.matches_model_type_hints(&["llama-4"])
|| metadata.matches_model_type_hints(&["llama4"])
}

fn placeholder_token(&self, _metadata: &ModelMetadata) -> RegistryResult<String> {
Expand Down Expand Up @@ -265,4 +264,29 @@ mod tests {
// Position: 1 + 290 + 290 = 581
assert_eq!(replacements[0].tokens[581], 200090); // <|image|>
}

#[test]
fn llama4_matches_alias_via_model_type() {
let tokenizer = TestTokenizer::new(&[
("<|image|>", 200090),
("<|image_start|>", 200088),
("<|image_end|>", 200089),
("<|patch|>", 200092),
("<|tile_x_separator|>", 200093),
("<|tile_y_separator|>", 200094),
]);
let config = json!({
"model_type": "llama4",
"image_token_index": 200092,
"vision_config": {"image_size": 336, "patch_size": 14}
});
let metadata = ModelMetadata {
model_id: "custom-model",
tokenizer: &tokenizer,
config: &config,
};
let registry = ModelRegistry::new();
let spec = registry.lookup(&metadata).expect("llama4 alias");
assert_eq!(spec.name(), "llama4");
}
}
20 changes: 19 additions & 1 deletion crates/multimodal/src/registry/llava.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ impl ModelProcessorSpec for LlavaSpec {
}

fn matches(&self, metadata: &ModelMetadata) -> bool {
metadata.model_id.to_ascii_lowercase().contains("llava")
metadata.matches_model_type_hints(&["llava"])
}

fn placeholder_token(&self, _metadata: &ModelMetadata) -> RegistryResult<String> {
Expand Down Expand Up @@ -103,4 +103,22 @@ mod tests {
.unwrap();
assert_eq!(replacements[0].tokens.len(), 576);
}

#[test]
fn llava_matches_alias_via_model_type() {
let tokenizer = TestTokenizer::new(&[("<image>", 32000)]);
let config = json!({
"model_type": "llava",
"image_token_index": 32000,
"vision_config": {"patch_size": 14}
});
let metadata = ModelMetadata {
model_id: "custom-model",
tokenizer: &tokenizer,
config: &config,
};
let registry = ModelRegistry::new();
let spec = registry.lookup(&metadata).expect("llava alias");
assert_eq!(spec.name(), "llava");
}
}
21 changes: 19 additions & 2 deletions crates/multimodal/src/registry/phi3_v.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ impl ModelProcessorSpec for Phi3VisionSpec {
}

fn matches(&self, metadata: &ModelMetadata) -> bool {
let id = metadata.model_id.to_ascii_lowercase();
id.contains("phi") && id.contains("vision")
metadata.matches_model_type_hints(&["phi", "vision"])
|| metadata.matches_model_type_hints(&["phi3_v"])
}

fn placeholder_token(&self, _metadata: &ModelMetadata) -> RegistryResult<String> {
Expand Down Expand Up @@ -92,4 +92,21 @@ mod tests {
assert_eq!(replacements[0].tokens.len(), 144);
assert_eq!(replacements[0].tokens[0], 555);
}

#[test]
fn phi3_matches_alias_via_model_type() {
let tokenizer = TestTokenizer::new(&[("<image>", 555)]);
let config = json!({
"model_type": "phi3_v",
"img_processor": {"num_img_tokens": 144}
});
let metadata = ModelMetadata {
model_id: "custom-model",
tokenizer: &tokenizer,
config: &config,
};
let registry = ModelRegistry::new();
let spec = registry.lookup(&metadata).expect("phi3 alias");
assert_eq!(spec.name(), "phi3_v");
}
}
24 changes: 22 additions & 2 deletions crates/multimodal/src/registry/qwen3_vl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ impl ModelProcessorSpec for Qwen3VLVisionSpec {
}

fn matches(&self, metadata: &ModelMetadata) -> bool {
let id = metadata.model_id.to_ascii_lowercase();
id.contains("qwen3") && id.contains("vl")
metadata.matches_model_type_hints(&["qwen3", "vl"])
}

fn placeholder_token(&self, metadata: &ModelMetadata) -> RegistryResult<String> {
Expand Down Expand Up @@ -176,4 +175,25 @@ mod tests {
// Must match qwen3_vl spec, not qwen_vl
assert_eq!(spec.name(), "qwen3_vl");
}

#[test]
fn qwen3_vl_matches_alias_via_model_type() {
let tokenizer = TestTokenizer::new(&[("<|image_pad|>", 151655)]);
let config = json!({
"model_type": "qwen3_vl",
"vision_start_token_id": 151652,
"image_token_id": 151655,
"vision_end_token_id": 151653
});
let metadata = ModelMetadata {
model_id: "custom-model",
tokenizer: &tokenizer,
config: &config,
};
let registry = ModelRegistry::new();
let spec = registry
.lookup(&metadata)
.expect("should match qwen3 alias");
assert_eq!(spec.name(), "qwen3_vl");
}
}
22 changes: 20 additions & 2 deletions crates/multimodal/src/registry/qwen_vl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ impl ModelProcessorSpec for QwenVLVisionSpec {
}

fn matches(&self, metadata: &ModelMetadata) -> bool {
metadata.model_id.to_ascii_lowercase().contains("qwen")
&& metadata.model_id.to_ascii_lowercase().contains("vl")
metadata.matches_model_type_hints(&["qwen", "vl"])
}

fn placeholder_token(&self, _metadata: &ModelMetadata) -> RegistryResult<String> {
Expand Down Expand Up @@ -138,4 +137,23 @@ mod tests {
assert_eq!(replacements[0].tokens[0], 151652);
assert_eq!(replacements[0].tokens[1], 151654);
}

#[test]
fn qwen_vl_matches_alias_via_model_type() {
let tokenizer = TestTokenizer::new(&[("<image>", 999)]);
let config = json!({
"model_type": "qwen2_vl",
"vision_start_token_id": 151652,
"vision_token_id": 151654,
"image_token_id": 151655
});
let metadata = ModelMetadata {
model_id: "custom-model",
tokenizer: &tokenizer,
config: &config,
};
let registry = ModelRegistry::new();
let spec = registry.lookup(&metadata).expect("should match qwen alias");
assert_eq!(spec.name(), "qwen_vl");
}
}
18 changes: 18 additions & 0 deletions crates/multimodal/src/registry/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,24 @@ impl<'a> ModelMetadata<'a> {
Self::find_value(self.config, path).and_then(|value| value.as_u64().map(|v| v as u32))
}

pub fn config_model_type(&self) -> Option<&str> {
Self::find_value(self.config, &["model_type"]).and_then(Value::as_str)
}

pub fn matches_model_type_hints(&self, hints: &[&str]) -> bool {
Self::matches_all_hints(self.model_id, hints)
|| self
.config_model_type()
.is_some_and(|model_type| Self::matches_all_hints(model_type, hints))
}

fn matches_all_hints(candidate: &str, hints: &[&str]) -> bool {
let candidate = candidate.to_ascii_lowercase();
hints
.iter()
.all(|hint| candidate.contains(&hint.to_ascii_lowercase()))
}

fn find_value<'v>(value: &'v Value, path: &[&str]) -> Option<&'v Value> {
let mut current = value;
for key in path {
Expand Down
63 changes: 52 additions & 11 deletions crates/multimodal/src/vision/image_processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -337,28 +337,33 @@ impl ImageProcessorRegistry {
self.processors.insert(pattern.into(), processor);
}

/// Find a processor for the given model ID.
/// Find a processor for the given model ID, falling back to model_type.
///
/// Matches by substring containment (case-insensitive).
pub fn find(&self, model_id: &str) -> Option<&dyn ImagePreProcessor> {
let model_lower = model_id.to_lowercase();
for (pattern, processor) in &self.processors {
if model_lower.contains(&pattern.to_lowercase()) {
return Some(processor.as_ref());
}
}
None
pub fn find(&self, model_id: &str, model_type: Option<&str>) -> Option<&dyn ImagePreProcessor> {
self.find_in_candidate(model_id)
.or_else(|| model_type.and_then(|model_type| self.find_in_candidate(model_type)))
Comment thread
YouNeedCryDear marked this conversation as resolved.
Comment on lines +344 to +345
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Normalize model_type separators before fallback lookup

ImageProcessorRegistry::find now falls back to model_type, but it feeds the raw string into substring matching without normalizing separators. Because built-in keys include hyphenated patterns like llava-next/llava-v1.6, an underscore-form config value such as llava_next will miss the intended specific match and can fall through to the generic llava processor, causing aliased LLaVA-NeXT requests to be preprocessed with the wrong pipeline. Please normalize model_type (e.g., _/-) or add equivalent aliases before matching.

Useful? React with 👍 / 👎.

}

/// Check if a model has a registered processor.
pub fn has_processor(&self, model_id: &str) -> bool {
self.find(model_id).is_some()
self.find(model_id, None).is_some()
}

/// Get list of supported model patterns.
pub fn supported_patterns(&self) -> Vec<&str> {
self.processors.keys().map(|s| s.as_str()).collect()
}

fn find_in_candidate(&self, candidate: &str) -> Option<&dyn ImagePreProcessor> {
let candidate = candidate.to_ascii_lowercase();
for (pattern, processor) in &self.processors {
if candidate.contains(&pattern.to_ascii_lowercase()) {
return Some(processor.as_ref());
}
}
Comment on lines +358 to +364
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

FILE="$(fd -p 'image_processor.rs' | head -n1)"
echo "Inspecting: $FILE"

echo
echo "[1] Confirm HashMap + first-match iteration:"
rg -n -C2 'processors:\s*HashMap|for \(pattern, processor\) in &self\.processors|return Some\(processor\.as_ref\(\)\)' "$FILE"

echo
echo "[2] List registered patterns and detect substring overlaps:"
python - <<'PY'
import re
from pathlib import Path

p = next(Path('.').rglob('image_processor.rs'))
text = p.read_text()

patterns = re.findall(r'registry\.register\(\s*"([^"]+)"', text)
print("patterns:", patterns)

overlaps = []
for a in patterns:
    for b in patterns:
        if a != b and b in a:
            overlaps.append((a, b))

print("overlaps:")
for a, b in sorted(set(overlaps)):
    print(f"  '{a}' contains '{b}'")
PY

Repository: lightseekorg/smg

Length of output: 1156


Deterministic processor selection is not guaranteed.

Line 360 returns the first substring match from a HashMap, but iteration order is non-deterministic. With overlapping patterns (for example llava-next and llava), this can select the wrong processor unpredictably. The patterns registered include both llava-next and llava, where one is a substring of the other, making the issue concrete.

🔧 Proposed fix (choose most specific match deterministically)
 fn find_in_candidate(&self, candidate: &str) -> Option<&dyn ImagePreProcessor> {
     let candidate = candidate.to_ascii_lowercase();
-    for (pattern, processor) in &self.processors {
-        if candidate.contains(&pattern.to_ascii_lowercase()) {
-            return Some(processor.as_ref());
-        }
-    }
-    None
+    self.processors
+        .iter()
+        .filter(|(pattern, _)| candidate.contains(&pattern.to_ascii_lowercase()))
+        .max_by(|(a, _), (b, _)| {
+            a.len()
+                .cmp(&b.len())
+                .then_with(|| a.as_str().cmp(b.as_str()))
+        })
+        .map(|(_, processor)| processor.as_ref())
 }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@crates/multimodal/src/vision/image_processor.rs` around lines 358 - 364, The
current find_in_candidate method returns the first substring match from
self.processors (a HashMap), which is non-deterministic and can pick
shorter/overlapping patterns like "llava" before "llava-next"; change it to
collect all processors whose pattern (case-insensitive) is contained in the
candidate, then choose the most specific match deterministically by sorting
matches by descending pattern length and then by a stable tie-breaker (e.g.,
lexicographic order) and returning the first entry. Update find_in_candidate to
(1) lowercase candidate once, (2) iterate through self.processors to collect
matching (pattern, processor) pairs, (3) sort the collected matches by
(-pattern.len(), pattern) and (4) return the processor reference of the top
match or None if empty, ensuring case-insensitive comparisons and deterministic
selection.

None
}
}

impl Default for ImageProcessorRegistry {
Expand Down Expand Up @@ -436,6 +441,10 @@ impl ImageProcessorRegistry {
"phi3-vision",
Box::new(super::processors::Phi3VisionProcessor::new()),
);
registry.register(
"phi3_v",
Box::new(super::processors::Phi3VisionProcessor::new()),
);

// Register LLaMA 4 Vision
registry.register(
Expand Down Expand Up @@ -551,7 +560,7 @@ mod tests {
assert!(registry.has_processor("lmms-lab/llava-next-interleave-qwen-7b"));

// Get the processor and check model name
let processor = registry.find("llava-hf/llava-1.5-7b-hf").unwrap();
let processor = registry.find("llava-hf/llava-1.5-7b-hf", None).unwrap();
assert_eq!(processor.model_name(), "llava");
}

Expand All @@ -566,4 +575,36 @@ mod tests {
assert!(registry.has_processor("TEST-MODEL"));
assert!(!registry.has_processor("other-model"));
}

#[test]
fn test_registry_find_falls_back_to_model_type() {
let registry = ImageProcessorRegistry::with_defaults();

assert!(registry.find("custom-model", None).is_none());

let processor = registry
.find("custom-model", Some("qwen3_vl"))
.expect("qwen3 processor by model_type");
assert_eq!(processor.model_name(), "qwen3-vl");
}

#[test]
fn test_registry_find_preserves_fast_path() {
let registry = ImageProcessorRegistry::with_defaults();

let processor = registry
.find("Qwen3-VL-30B-A3B-Instruct", Some("qwen2_vl"))
.expect("qwen3 processor by model_id");
assert_eq!(processor.model_name(), "qwen3-vl");
}

#[test]
fn test_registry_find_phi3_model_type_fallback() {
let registry = ImageProcessorRegistry::with_defaults();

let processor = registry
.find("custom-model", Some("phi3_v"))
.expect("phi3 processor by model_type");
assert_eq!(processor.model_name(), "phi3-vision");
}
}
6 changes: 5 additions & 1 deletion model_gateway/src/routers/grpc/multimodal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,10 @@ pub(crate) async fn process_multimodal(

// Step 2: Resolve model spec and preprocess images
let model_config = components.get_or_load_config(model_id, tokenizer_source)?;
let model_type = model_config
.config
.get("model_type")
.and_then(|value| value.as_str());
let metadata = ModelMetadata {
model_id,
tokenizer,
Expand All @@ -276,7 +280,7 @@ pub(crate) async fn process_multimodal(

let image_processor = components
.image_processor_registry
.find(model_id)
.find(model_id, model_type)
.ok_or_else(|| anyhow::anyhow!("No image processor found for model: {model_id}"))?;

// ImagePreProcessor::preprocess takes &[DynamicImage]; images are behind Arc<ImageFrame>.
Expand Down