diff --git a/core/parser/src/lexer/cursor.rs b/core/parser/src/lexer/cursor.rs index a0253ab7c70..72ac1129073 100644 --- a/core/parser/src/lexer/cursor.rs +++ b/core/parser/src/lexer/cursor.rs @@ -162,16 +162,28 @@ impl Cursor { /// It also stops when the next character is not an ascii or there is no next character. /// /// Note that all characters up until the stop character are added to the buffer, including the character right before. - pub(super) fn take_while_ascii_pred(&mut self, buf: &mut Vec, pred: &F) -> io::Result<()> + #[allow(clippy::cast_possible_truncation)] + #[inline] + pub(super) fn take_while_ascii_pred<'a, F>( + &mut self, + buf: &'a mut [u8], + pred: &F, + ) -> io::Result<&'a [u8]> where F: Fn(char) -> bool, { + let mut count = 0; loop { if !self.next_is_ascii_pred(pred)? { - return Ok(()); + return Ok(&buf[..count]); } else if let Some(byte) = self.next_char()? { - #[allow(clippy::cast_possible_truncation)] - buf.push(byte as u8); + buf[count] = byte as u8; + count += 1; + } else if count >= buf.len() { + return Err(Error::new( + ErrorKind::UnexpectedEof, + "Unexpected end of buffer while taking characters", + )); } else { // next_is_pred will return false if the next value is None so the None case should already be handled. unreachable!(); diff --git a/core/parser/src/lexer/regex.rs b/core/parser/src/lexer/regex.rs index e95798e1a62..387d929a9c6 100644 --- a/core/parser/src/lexer/regex.rs +++ b/core/parser/src/lexer/regex.rs @@ -3,8 +3,8 @@ use crate::lexer::{Cursor, Error, Token, TokenKind, Tokenizer}; use crate::source::ReadChar; use bitflags::bitflags; -use boa_ast::{Position, PositionGroup}; -use boa_interner::{Interner, Sym}; +use boa_ast::PositionGroup; +use boa_interner::Interner; use regress::{Flags, Regex}; use std::fmt::{Display, Write}; use std::str::{self, FromStr}; @@ -114,13 +114,17 @@ impl Tokenizer for RegexLiteral { } } - let mut flags = Vec::new(); + let mut flags: [u8; 8] = [0; 8]; let flags_start = cursor.pos(); - cursor.take_while_ascii_pred(&mut flags, &char::is_alphabetic)?; + let flags_slice = cursor.take_while_ascii_pred(&mut flags, &char::is_alphabetic)?; - // SAFETY: We have already checked that the bytes are valid UTF-8. - let flags_str = unsafe { str::from_utf8_unchecked(flags.as_slice()) }; + // TODO: Change this to if err() then convert flags_slice to str + let flags_string = match RegExpFlags::from_bytes(flags_slice) { + Err(message) => return Err(Error::Syntax(message.into(), flags_start)), + Ok(regex_flags) => regex_flags.to_string(), + }; + let flags_str = flags_string.as_str(); let mut body_utf16 = Vec::new(); // We convert the body to UTF-16 since it may contain code points that are not valid UTF-8. @@ -149,7 +153,7 @@ impl Tokenizer for RegexLiteral { Ok(Token::new_by_position_group( TokenKind::regular_expression_literal( interner.get_or_intern(body_utf16.as_slice()), - parse_regex_flags(flags_str, flags_start, interner)?, + interner.get_or_intern(flags_str.to_string().as_str()), ), start_pos, cursor.pos_group(), @@ -189,6 +193,45 @@ bitflags! { } } +impl RegExpFlags { + fn from_bytes(bytes: &[u8]) -> Result { + let mut flags = Self::default(); + for c in bytes { + let new_flag = match c { + b'g' => Self::GLOBAL, + b'i' => Self::IGNORE_CASE, + b'm' => Self::MULTILINE, + b's' => Self::DOT_ALL, + b'u' => Self::UNICODE, + b'y' => Self::STICKY, + b'd' => Self::HAS_INDICES, + b'v' => Self::UNICODE_SETS, + 0x00 => continue, + _ => { + return Err(format!( + "invalid regular expression flag {}", + char::from(c.to_owned()) + )); + } + }; + + if flags.contains(new_flag) { + return Err(format!( + "repeated regular expression flag {}", + char::from(c.to_owned()) + )); + } + flags.insert(new_flag); + } + + if flags.contains(Self::UNICODE) && flags.contains(Self::UNICODE_SETS) { + return Err("cannot use both 'u' and 'v' flags".into()); + } + + Ok(flags) + } +} + impl FromStr for RegExpFlags { type Err = String; @@ -224,13 +267,6 @@ impl FromStr for RegExpFlags { } } -fn parse_regex_flags(s: &str, start: Position, interner: &mut Interner) -> Result { - match RegExpFlags::from_str(s) { - Err(message) => Err(Error::Syntax(message.into(), start)), - Ok(flags) => Ok(interner.get_or_intern(flags.to_string().as_str())), - } -} - impl Display for RegExpFlags { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if self.contains(Self::HAS_INDICES) { diff --git a/core/parser/src/lexer/tests.rs b/core/parser/src/lexer/tests.rs index 29a8b80cd8e..2e04a5b4ba3 100644 --- a/core/parser/src/lexer/tests.rs +++ b/core/parser/src/lexer/tests.rs @@ -866,45 +866,46 @@ fn addition_no_spaces_e_number() { fn take_while_ascii_pred_simple() { let mut cur = Cursor::from(&b"abcdefghijk"[..]); - let mut buf: Vec = Vec::new(); + let mut buf: [u8; 8] = [0; 8]; - cur.take_while_ascii_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c') + let slice = cur + .take_while_ascii_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c') .unwrap(); - assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc"); + assert_eq!(str::from_utf8(slice).unwrap(), "abc"); } #[test] fn take_while_ascii_pred_immediate_stop() { let mut cur = Cursor::from(&b"abcdefghijk"[..]); - let mut buf: Vec = Vec::new(); + let mut buf: [u8; 8] = [0; 8]; - cur.take_while_ascii_pred(&mut buf, &|_| false).unwrap(); + let slice = cur.take_while_ascii_pred(&mut buf, &|_| false).unwrap(); - assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), ""); + assert_eq!(str::from_utf8(slice).unwrap(), ""); } #[test] fn take_while_ascii_pred_entire_str() { let mut cur = Cursor::from(&b"abcdefghijk"[..]); - let mut buf: Vec = Vec::new(); + let mut buf: [u8; 11] = [0; 11]; - cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap(); + let slice = cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap(); - assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcdefghijk"); + assert_eq!(str::from_utf8(slice).unwrap(), "abcdefghijk"); } #[test] fn take_while_ascii_pred_non_ascii_stop() { let mut cur = Cursor::from("abcde😀fghijk".as_bytes()); - let mut buf: Vec = Vec::new(); + let mut buf: [u8; 12] = [0; 12]; - cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap(); + let slice = cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap(); - assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcde"); + assert_eq!(str::from_utf8(slice).unwrap(), "abcde"); } #[test]