diff --git a/Cargo.lock b/Cargo.lock index f69bb03c..035493b4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -932,6 +932,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "sliceslice" version = "0.4.3" @@ -1280,6 +1286,7 @@ dependencies = [ "regex-automata", "serde", "serde_json", + "simdutf8", "sliceslice", "thiserror", "wildcard", diff --git a/Cargo.toml b/Cargo.toml index 655019b7..20cb6b80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ regex-automata = "0.4.14" serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.149" serde-wasm-bindgen = "0.6.5" +simdutf8 = "0.1.5" sliceslice = "0.4.3" thiserror = "2.0.18" wasm-bindgen = { version = "0.2.108", features = ["serde-serialize"] } diff --git a/engine/Cargo.toml b/engine/Cargo.toml index bf7b9b00..798a9982 100644 --- a/engine/Cargo.toml +++ b/engine/Cargo.toml @@ -30,6 +30,7 @@ memchr.workspace = true rand.workspace = true regex-automata = { workspace = true, optional = true } serde.workspace = true +simdutf8.workspace = true sliceslice.workspace = true thiserror.workspace = true wildcard.workspace = true diff --git a/engine/src/ast/index_expr.rs b/engine/src/ast/index_expr.rs index f1f8443c..b3ef7474 100644 --- a/engine/src/ast/index_expr.rs +++ b/engine/src/ast/index_expr.rs @@ -936,7 +936,10 @@ mod tests { LhsValue::Bytes(bytes) => bytes, _ => unreachable!(), }; - assert_eq!(std::str::from_utf8(&bytes).unwrap(), format!("[{i}][{j}]")); + assert_eq!( + simdutf8::basic::from_utf8(&bytes).unwrap(), + format!("[{i}][{j}]") + ); } let indexes = [FieldIndex::MapEach, FieldIndex::ArrayIndex(i)]; @@ -948,7 +951,10 @@ mod tests { LhsValue::Bytes(bytes) => bytes, _ => unreachable!(), }; - assert_eq!(std::str::from_utf8(&bytes).unwrap(), format!("[{j}][{i}]")); + assert_eq!( + simdutf8::basic::from_utf8(&bytes).unwrap(), + format!("[{j}][{i}]") + ); } } @@ -963,7 +969,10 @@ mod tests { LhsValue::Bytes(bytes) => bytes, _ => unreachable!(), }; - assert_eq!(std::str::from_utf8(&bytes).unwrap(), format!("[{i}][{j}]")); + assert_eq!( + simdutf8::basic::from_utf8(&bytes).unwrap(), + format!("[{i}][{j}]") + ); j = (j + 1) % 10; i += (j == 0) as u32; } diff --git a/engine/src/lhs_types/bytes.rs b/engine/src/lhs_types/bytes.rs index 0ebf1f69..59ca8fc8 100644 --- a/engine/src/lhs_types/bytes.rs +++ b/engine/src/lhs_types/bytes.rs @@ -254,7 +254,7 @@ impl Serialize for Bytes<'_> { where S: Serializer, { - if let Ok(s) = std::str::from_utf8(self) { + if let Ok(s) = simdutf8::basic::from_utf8(self) { serializer.serialize_str(s) } else { serializer.serialize_bytes(self) diff --git a/engine/src/lhs_types/map.rs b/engine/src/lhs_types/map.rs index 1a26b499..bbe48e7b 100644 --- a/engine/src/lhs_types/map.rs +++ b/engine/src/lhs_types/map.rs @@ -339,12 +339,12 @@ impl Serialize for Map<'_> { let to_map = self .data .iter() - .all(|(key, _)| std::str::from_utf8(key).is_ok()); + .all(|(key, _)| simdutf8::basic::from_utf8(key).is_ok()); if to_map { let mut map = serializer.serialize_map(Some(self.len()))?; for (k, v) in self.data.iter() { - map.serialize_entry(std::str::from_utf8(k).unwrap(), v)?; + map.serialize_entry(simdutf8::basic::from_utf8(k).unwrap(), v)?; } map.end() } else { diff --git a/engine/src/rhs_types/bytes.rs b/engine/src/rhs_types/bytes.rs index 45e51b59..a4a3ac01 100644 --- a/engine/src/rhs_types/bytes.rs +++ b/engine/src/rhs_types/bytes.rs @@ -4,7 +4,6 @@ use serde::{Serialize, Serializer}; use std::fmt::{self, Debug, Formatter}; use std::hash::{Hash, Hasher}; use std::ops::Deref; -use std::str; /// BytesFormat describes the format in which the string was expressed #[derive(PartialEq, Eq, Copy, Clone)] @@ -48,10 +47,12 @@ impl Serialize for BytesExpr { S: Serializer, { match self.format() { - BytesFormat::Quoted | BytesFormat::Raw(_) => match std::str::from_utf8(&self.data) { - Ok(s) => s.serialize(serializer), - Err(_) => self.data.serialize(serializer), - }, + BytesFormat::Quoted | BytesFormat::Raw(_) => { + match simdutf8::basic::from_utf8(&self.data) { + Ok(s) => s.serialize(serializer), + Err(_) => self.data.serialize(serializer), + } + } BytesFormat::Byte => self.data.serialize(serializer), } } @@ -117,10 +118,12 @@ impl Debug for BytesExpr { } match self.format { - BytesFormat::Quoted | BytesFormat::Raw(_) => match std::str::from_utf8(&self.data) { - Ok(s) => s.fmt(f), - Err(_) => fmt_raw(&self.data, f), - }, + BytesFormat::Quoted | BytesFormat::Raw(_) => { + match simdutf8::basic::from_utf8(&self.data) { + Ok(s) => s.fmt(f), + Err(_) => fmt_raw(&self.data, f), + } + } BytesFormat::Byte => fmt_raw(&self.data, f), } } diff --git a/engine/src/scheme.rs b/engine/src/scheme.rs index db21743f..b6c9f7bf 100644 --- a/engine/src/scheme.rs +++ b/engine/src/scheme.rs @@ -76,10 +76,16 @@ impl<'i> Lex<'i> for FieldIndex { input, )), }, - RhsValue::Bytes(b) => match String::from_utf8(b.into()) { - Ok(s) => Ok((FieldIndex::MapKey(s), rest)), - Err(_) => Err((LexErrorKind::ExpectedLiteral("expected utf8 string"), input)), - }, + RhsValue::Bytes(b) => { + match simdutf8::basic::from_utf8(&b) { + Ok(_) => { + // SAFETY: simdutf8 just validated the bytes as valid UTF-8. + let s = unsafe { String::from_utf8_unchecked(b.into()) }; + Ok((FieldIndex::MapKey(s), rest)) + } + Err(_) => Err((LexErrorKind::ExpectedLiteral("expected utf8 string"), input)), + } + } _ => unreachable!(), } } diff --git a/engine/src/types.rs b/engine/src/types.rs index d6466481..68db2403 100644 --- a/engine/src/types.rs +++ b/engine/src/types.rs @@ -785,7 +785,7 @@ impl Serialize for LhsValue<'_> { match self { LhsValue::Ip(ip) => ip.serialize(serializer), LhsValue::Bytes(bytes) => { - if let Ok(s) = std::str::from_utf8(bytes) { + if let Ok(s) = simdutf8::basic::from_utf8(bytes) { serializer.serialize_str(s) } else { serializer.serialize_bytes(bytes)