From 4bdd6e81defcc7379825c3f7ab370057a6490058 Mon Sep 17 00:00:00 2001 From: Allan Simon Date: Tue, 9 Sep 2025 15:11:20 +0200 Subject: [PATCH 1/4] Update tokenizer to understand that the WITH in "WITH TIME ZONE" does not require a new line; otherwise it creates broken output like ```sql CREATE TABLE whatever ( id INT NOT NULL, last_modification_datetime TIMESTAMP(0) WITH TIME ZONE NOT NULL, creation_datetime TIMESTAMP(0) WITH TIME ZONE NOT NULL, hashed_token VARCHAR(512) DEFAULT NULL, PRIMARY KEY(id) ) ``` instead of ```sql CREATE TABLE whatever ( id INT NOT NULL, last_modification_datetime TIMESTAMP(0) WITH TIME ZONE NOT NULL, creation_datetime TIMESTAMP(0) WITH TIME ZONE NOT NULL, hashed_token VARCHAR(512) DEFAULT NULL, PRIMARY KEY(id) ) ``` --- src/Tokenizer.php | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index f8f35a6..64b6499 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -722,6 +722,12 @@ final class Tokenizer 'YEARWEEK', ]; + /** @var list */ + private array $dataTypeModifiers = [ + 'WITH TIME ZONE', + 'WITHOUT TIME ZONE', + ]; + /** Regular expression for tokenizing. 
*/ private readonly string $tokenizeRegex; @@ -834,11 +840,13 @@ private function makeRegexFromList(array $values, bool $sorted = false): string private function makeTokenizeRegexes(): array { // Set up regular expressions + $regexBoundaries = $this->makeRegexFromList($this->boundaries); $regexReserved = $this->makeRegexFromList($this->reserved); $regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel)); $regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline)); $regexFunction = $this->makeRegexFromList($this->functions); + $regexDataTypeModifiers = str_replace(' ', '\s+', $this->makeRegexFromList($this->dataTypeModifiers)); return [ Token::TOKEN_TYPE_WHITESPACE => '\s+', @@ -866,6 +874,10 @@ private function makeTokenizeRegexes(): array Token::TOKEN_TYPE_NUMBER => '(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')', // punctuation and symbols Token::TOKEN_TYPE_BOUNDARY => $regexBoundaries, + // data type modifiers: this makes 'WITH TIME ZONE' a different token from the CTE 'WITH' + Token::TOKEN_TYPE_RESERVED => '(? '(? 
Date: Tue, 9 Sep 2025 15:17:38 +0200 Subject: [PATCH 2/4] add test --- tests/TokenizerTest.php | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/TokenizerTest.php b/tests/TokenizerTest.php index 1e8af7c..112cc84 100644 --- a/tests/TokenizerTest.php +++ b/tests/TokenizerTest.php @@ -1658,6 +1658,38 @@ public static function tokenizeData(): Generator ], '/* foo...', ]; + + yield 'WITH TIME ZONE as single token' => [ + [ + new Token(Token::TOKEN_TYPE_RESERVED, 'TIMESTAMP'), + new Token(Token::TOKEN_TYPE_BOUNDARY, '('), + new Token(Token::TOKEN_TYPE_NUMBER, '0'), + new Token(Token::TOKEN_TYPE_BOUNDARY, ')'), + new Token(Token::TOKEN_TYPE_WHITESPACE, ' '), + new Token(Token::TOKEN_TYPE_RESERVED, 'WITH TIME ZONE'), + ], + 'TIMESTAMP(0) WITH TIME ZONE', + ]; + + yield 'WITHOUT TIME ZONE as single token' => [ + [ + new Token(Token::TOKEN_TYPE_RESERVED, 'TIME'), + new Token(Token::TOKEN_TYPE_WHITESPACE, ' '), + new Token(Token::TOKEN_TYPE_RESERVED, 'WITHOUT TIME ZONE'), + ], + 'TIME WITHOUT TIME ZONE', + ]; + + yield 'CTE WITH still works' => [ + [ + new Token(Token::TOKEN_TYPE_RESERVED_TOPLEVEL, 'WITH'), + new Token(Token::TOKEN_TYPE_WHITESPACE, ' '), + new Token(Token::TOKEN_TYPE_WORD, 'cte'), + new Token(Token::TOKEN_TYPE_WHITESPACE, ' '), + new Token(Token::TOKEN_TYPE_RESERVED, 'AS'), + ], + 'WITH cte AS', + ]; } public function testTokenizeLongConcat(): void From b33bcd05fb3748bbdc211eea8f7f00da5d22f0b0 Mon Sep 17 00:00:00 2001 From: Allan Simon Date: Wed, 10 Sep 2025 12:00:56 +0200 Subject: [PATCH 3/4] coding style Tokenizer.php --- src/Tokenizer.php | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 64b6499..b974b69 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -841,11 +841,11 @@ private function makeTokenizeRegexes(): array { // Set up regular expressions - $regexBoundaries = $this->makeRegexFromList($this->boundaries); - $regexReserved = 
$this->makeRegexFromList($this->reserved); - $regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel)); - $regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline)); - $regexFunction = $this->makeRegexFromList($this->functions); + $regexBoundaries = $this->makeRegexFromList($this->boundaries); + $regexReserved = $this->makeRegexFromList($this->reserved); + $regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel)); + $regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline)); + $regexFunction = $this->makeRegexFromList($this->functions); $regexDataTypeModifiers = str_replace(' ', '\s+', $this->makeRegexFromList($this->dataTypeModifiers)); return [ From 3eeb3c611a3ea7c2a2797b6c2aeacf643ba9cef3 Mon Sep 17 00:00:00 2001 From: Allan Simon Date: Wed, 10 Sep 2025 12:03:31 +0200 Subject: [PATCH 4/4] coding style TokenizerTest.php --- tests/TokenizerTest.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/TokenizerTest.php b/tests/TokenizerTest.php index 112cc84..8cfbcb2 100644 --- a/tests/TokenizerTest.php +++ b/tests/TokenizerTest.php @@ -1670,7 +1670,7 @@ public static function tokenizeData(): Generator ], 'TIMESTAMP(0) WITH TIME ZONE', ]; - + yield 'WITHOUT TIME ZONE as single token' => [ [ new Token(Token::TOKEN_TYPE_RESERVED, 'TIME'), @@ -1679,7 +1679,7 @@ public static function tokenizeData(): Generator ], 'TIME WITHOUT TIME ZONE', ]; - + yield 'CTE WITH still works' => [ [ new Token(Token::TOKEN_TYPE_RESERVED_TOPLEVEL, 'WITH'),