fix: handle invalid UTF-8 in Ruby and Vue preprocessors

khasinski · khasinski · commit 73128a2b5af9 · 2026-01-21T23:52:07.000+01:00
The Ruby and Vue preprocessors called `std::str::from_utf8(content).unwrap()`,
which panics when the content being processed contains invalid UTF-8 bytes.
This can happen when:
- Binary files are inadvertently scanned
- Files are truncated at multi-byte character boundaries
- Files use non-UTF-8 encodings

This change wraps the UTF-8 conversion in `if let Ok(...)` so that the
regex-based template extraction is skipped when the conversion fails. In
Ruby's case the byte-level processing still runs afterwards; in Vue's case
the content is returned unchanged.

Fixes panic: `thread panicked at crates/oxide/src/extractor/pre_processors/ruby.rs:37:59`
diff --git a/crates/oxide/src/extractor/pre_processors/ruby.rs b/crates/oxide/src/extractor/pre_processors/ruby.rs
@@ -34,44 +34,46 @@ impl PreProcessor for Ruby {
 
         // Extract embedded template languages
         // https://viewcomponent.org/guide/templates.html#interpolations
-        let content_as_str = std::str::from_utf8(content).unwrap();
-
-        let starts = TEMPLATE_START_REGEX
-            .captures_iter(content_as_str)
-            .collect::<Vec<_>>();
-        let ends = TEMPLATE_END_REGEX
-            .captures_iter(content_as_str)
-            .collect::<Vec<_>>();
-
-        for start in starts.iter() {
-            // The language for this block
-            let lang = start.get(1).unwrap().as_str();
-
-            // The HEREDOC delimiter
-            let delimiter_start = start.get(2).unwrap().as_str();
-
-            // Where the "body" starts for the HEREDOC block
-            let body_start = start.get(0).unwrap().end();
-
-            // Look through all of the ends to find a matching language
-            for end in ends.iter() {
-                // 1. This must appear after the start
-                let body_end = end.get(0).unwrap().start();
-                if body_end < body_start {
-                    continue;
-                }
+        // Only process if content is valid UTF-8, otherwise skip HEREDOC extraction
+        // but still perform the byte-level Ruby processing below
+        if let Ok(content_as_str) = std::str::from_utf8(content) {
+            let starts = TEMPLATE_START_REGEX
+                .captures_iter(content_as_str)
+                .collect::<Vec<_>>();
+            let ends = TEMPLATE_END_REGEX
+                .captures_iter(content_as_str)
+                .collect::<Vec<_>>();
+
+            for start in starts.iter() {
+                // The language for this block
+                let lang = start.get(1).unwrap().as_str();
+
+                // The HEREDOC delimiter
+                let delimiter_start = start.get(2).unwrap().as_str();
+
+                // Where the "body" starts for the HEREDOC block
+                let body_start = start.get(0).unwrap().end();
+
+                // Look through all of the ends to find the one with a matching HEREDOC delimiter
+                for end in ends.iter() {
+                    // 1. This must appear after the start
+                    let body_end = end.get(0).unwrap().start();
+                    if body_end < body_start {
+                        continue;
+                    }
 
-                // The languages must match otherwise we haven't found the end
-                let delimiter_end = end.get(1).unwrap().as_str();
-                if delimiter_end != delimiter_start {
-                    continue;
-                }
+                    // The HEREDOC delimiters must match otherwise we haven't found the end
+                    let delimiter_end = end.get(1).unwrap().as_str();
+                    if delimiter_end != delimiter_start {
+                        continue;
+                    }
 
-                let body = &content_as_str[body_start..body_end];
-                let replaced = pre_process_input(body.as_bytes(), &lang.to_ascii_lowercase());
+                    let body = &content_as_str[body_start..body_end];
+                    let replaced = pre_process_input(body.as_bytes(), &lang.to_ascii_lowercase());
 
-                result.replace_range(body_start..body_end, replaced);
-                break;
+                    result.replace_range(body_start..body_end, replaced);
+                    break;
+                }
             }
         }
 
@@ -427,4 +429,26 @@ mod tests {
             vec!["text-amber-600", "text-sky-500", "text-green-500"],
         );
     }
+
+    #[test]
+    fn test_invalid_utf8_does_not_panic() {
+        use crate::extractor::pre_processors::pre_processor::PreProcessor;
+
+        // Not valid UTF-8: 0x80, 0x81 and 0x82 are continuation bytes with no leading byte
+        let invalid_utf8: &[u8] = &[0x80, 0x81, 0x82];
+
+        let processor = Ruby::default();
+
+        // Must not panic: HEREDOC extraction is skipped and these bytes come back unchanged
+        let result = processor.process(invalid_utf8);
+        assert_eq!(result, invalid_utf8);
+    }
+
+    #[test]
+    fn test_valid_utf8_with_multibyte_chars() {
+        // Valid UTF-8 containing a multi-byte character (the em dash) must still yield candidates
+        let input = "# Comment with em—dash\n%w[flex px-2.5]";
+
+        Ruby::test_extract_contains(input, vec!["flex", "px-2.5"]);
+    }
 }
diff --git a/crates/oxide/src/extractor/pre_processors/vue.rs b/crates/oxide/src/extractor/pre_processors/vue.rs
@@ -15,13 +15,15 @@ impl PreProcessor for Vue {
     fn process(&self, content: &[u8]) -> Vec<u8> {
         let mut result = content.to_vec();
 
-        let content_as_str = std::str::from_utf8(content).unwrap();
-        for (_, [lang, body]) in TEMPLATE_REGEX
-            .captures_iter(content_as_str)
-            .map(|c| c.extract())
-        {
-            let replaced = pre_process_input(body.as_bytes(), lang);
-            result = result.replace(body, replaced);
+        // Only process template tags if content is valid UTF-8
+        if let Ok(content_as_str) = std::str::from_utf8(content) {
+            for (_, [lang, body]) in TEMPLATE_REGEX
+                .captures_iter(content_as_str)
+                .map(|c| c.extract())
+            {
+                let replaced = pre_process_input(body.as_bytes(), lang);
+                result = result.replace(body, replaced);
+            }
         }
 
         result
@@ -43,4 +45,16 @@ mod tests {
 
         Vue::test_extract_contains(input, vec!["bg-neutral-900", "text-red-500"]);
     }
+
+    #[test]
+    fn test_invalid_utf8_does_not_panic() {
+        // Not valid UTF-8: 0x80, 0x81 and 0x82 are continuation bytes with no leading byte
+        let invalid_utf8: &[u8] = &[0x80, 0x81, 0x82];
+
+        let processor = Vue::default();
+
+        // Must not panic: template extraction is skipped and the input is returned unchanged
+        let result = processor.process(invalid_utf8);
+        assert_eq!(result, invalid_utf8);
+    }
 }