Skip to content

Commit 73128a2

Browse files
committed
fix: handle invalid UTF-8 in Ruby and Vue preprocessors
The Ruby and Vue preprocessors were using `from_utf8().unwrap()`, which panics when processing files containing invalid UTF-8 bytes. This can happen when:

- Binary files are inadvertently scanned
- Files are truncated in the middle of a multi-byte character
- Files use non-UTF-8 encodings

This change wraps the UTF-8 conversion in `if let Ok(...)` to gracefully skip the regex-based template extraction when UTF-8 conversion fails, while still allowing the byte-level processing to continue (in Ruby's case).

Fixes panic: `thread panicked at crates/oxide/src/extractor/pre_processors/ruby.rs:37:59`
1 parent 7971167 commit 73128a2

2 files changed

Lines changed: 80 additions & 42 deletions

File tree

crates/oxide/src/extractor/pre_processors/ruby.rs

Lines changed: 59 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -34,44 +34,46 @@ impl PreProcessor for Ruby {
3434

3535
// Extract embedded template languages
3636
// https://viewcomponent.org/guide/templates.html#interpolations
37-
let content_as_str = std::str::from_utf8(content).unwrap();
38-
39-
let starts = TEMPLATE_START_REGEX
40-
.captures_iter(content_as_str)
41-
.collect::<Vec<_>>();
42-
let ends = TEMPLATE_END_REGEX
43-
.captures_iter(content_as_str)
44-
.collect::<Vec<_>>();
45-
46-
for start in starts.iter() {
47-
// The language for this block
48-
let lang = start.get(1).unwrap().as_str();
49-
50-
// The HEREDOC delimiter
51-
let delimiter_start = start.get(2).unwrap().as_str();
52-
53-
// Where the "body" starts for the HEREDOC block
54-
let body_start = start.get(0).unwrap().end();
55-
56-
// Look through all of the ends to find a matching language
57-
for end in ends.iter() {
58-
// 1. This must appear after the start
59-
let body_end = end.get(0).unwrap().start();
60-
if body_end < body_start {
61-
continue;
62-
}
37+
// Only process if content is valid UTF-8, otherwise skip HEREDOC extraction
38+
// but still perform the byte-level Ruby processing below
39+
if let Ok(content_as_str) = std::str::from_utf8(content) {
40+
let starts = TEMPLATE_START_REGEX
41+
.captures_iter(content_as_str)
42+
.collect::<Vec<_>>();
43+
let ends = TEMPLATE_END_REGEX
44+
.captures_iter(content_as_str)
45+
.collect::<Vec<_>>();
46+
47+
for start in starts.iter() {
48+
// The language for this block
49+
let lang = start.get(1).unwrap().as_str();
50+
51+
// The HEREDOC delimiter
52+
let delimiter_start = start.get(2).unwrap().as_str();
53+
54+
// Where the "body" starts for the HEREDOC block
55+
let body_start = start.get(0).unwrap().end();
56+
57+
// Look through all of the ends to find a matching language
58+
for end in ends.iter() {
59+
// 1. This must appear after the start
60+
let body_end = end.get(0).unwrap().start();
61+
if body_end < body_start {
62+
continue;
63+
}
6364

64-
// The languages must match otherwise we haven't found the end
65-
let delimiter_end = end.get(1).unwrap().as_str();
66-
if delimiter_end != delimiter_start {
67-
continue;
68-
}
65+
// The languages must match otherwise we haven't found the end
66+
let delimiter_end = end.get(1).unwrap().as_str();
67+
if delimiter_end != delimiter_start {
68+
continue;
69+
}
6970

70-
let body = &content_as_str[body_start..body_end];
71-
let replaced = pre_process_input(body.as_bytes(), &lang.to_ascii_lowercase());
71+
let body = &content_as_str[body_start..body_end];
72+
let replaced = pre_process_input(body.as_bytes(), &lang.to_ascii_lowercase());
7273

73-
result.replace_range(body_start..body_end, replaced);
74-
break;
74+
result.replace_range(body_start..body_end, replaced);
75+
break;
76+
}
7577
}
7678
}
7779

@@ -427,4 +429,26 @@ mod tests {
427429
vec!["text-amber-600", "text-sky-500", "text-green-500"],
428430
);
429431
}
432+
433+
#[test]
434+
fn test_invalid_utf8_does_not_panic() {
435+
use crate::extractor::pre_processors::pre_processor::PreProcessor;
436+
437+
// Invalid UTF-8 sequence: 0x80 is a continuation byte without a leading byte
438+
let invalid_utf8: &[u8] = &[0x80, 0x81, 0x82];
439+
440+
let processor = Ruby::default();
441+
442+
// Should not panic, just return the input unchanged
443+
let result = processor.process(invalid_utf8);
444+
assert_eq!(result, invalid_utf8);
445+
}
446+
447+
#[test]
448+
fn test_valid_utf8_with_multibyte_chars() {
449+
// Test that valid UTF-8 with multi-byte characters (like em-dashes) works
450+
let input = "# Comment with em—dash\n%w[flex px-2.5]";
451+
452+
Ruby::test_extract_contains(input, vec!["flex", "px-2.5"]);
453+
}
430454
}

crates/oxide/src/extractor/pre_processors/vue.rs

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@ impl PreProcessor for Vue {
1515
fn process(&self, content: &[u8]) -> Vec<u8> {
1616
let mut result = content.to_vec();
1717

18-
let content_as_str = std::str::from_utf8(content).unwrap();
19-
for (_, [lang, body]) in TEMPLATE_REGEX
20-
.captures_iter(content_as_str)
21-
.map(|c| c.extract())
22-
{
23-
let replaced = pre_process_input(body.as_bytes(), lang);
24-
result = result.replace(body, replaced);
18+
// Only process template tags if content is valid UTF-8
19+
if let Ok(content_as_str) = std::str::from_utf8(content) {
20+
for (_, [lang, body]) in TEMPLATE_REGEX
21+
.captures_iter(content_as_str)
22+
.map(|c| c.extract())
23+
{
24+
let replaced = pre_process_input(body.as_bytes(), lang);
25+
result = result.replace(body, replaced);
26+
}
2527
}
2628

2729
result
@@ -43,4 +45,16 @@ mod tests {
4345

4446
Vue::test_extract_contains(input, vec!["bg-neutral-900", "text-red-500"]);
4547
}
48+
49+
#[test]
50+
fn test_invalid_utf8_does_not_panic() {
51+
// Invalid UTF-8 sequence: 0x80 is a continuation byte without a leading byte
52+
let invalid_utf8: &[u8] = &[0x80, 0x81, 0x82];
53+
54+
let processor = Vue::default();
55+
56+
// Should not panic, just return the input unchanged
57+
let result = processor.process(invalid_utf8);
58+
assert_eq!(result, invalid_utf8);
59+
}
4660
}

0 commit comments

Comments
 (0)