From dcf03b508c5a249771059892debcb29784a0e459 Mon Sep 17 00:00:00 2001 From: Peter Blackson Date: Thu, 18 Sep 2025 10:14:32 +0200 Subject: [PATCH] Begin implementing antlr translation --- .gitmodules | 3 + gearley-example-grammars/Cargo.toml | 4 + gearley-example-grammars/build.rs | 3 + gearley-example-grammars/grammars-v4 | 1 + gearley-example-grammars/src/antlr4.rs | 1 + gearley-example-grammars/src/g4.lalrpop | 409 ++++++++++++++++++++++++ gearley-example-grammars/src/lib.rs | 1 + 7 files changed, 422 insertions(+) create mode 100644 gearley-example-grammars/build.rs create mode 160000 gearley-example-grammars/grammars-v4 create mode 100644 gearley-example-grammars/src/antlr4.rs create mode 100644 gearley-example-grammars/src/g4.lalrpop diff --git a/.gitmodules b/.gitmodules index 7bfb816..1e76ad3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "gearley-wasm/vite/src/assets/ace-builds"] path = gearley-wasm/vite/src/assets/ace-builds url = git@github.com:ajaxorg/ace-builds.git +[submodule "gearley-example-grammars/grammars-v4"] + path = gearley-example-grammars/grammars-v4 + url = git@github.com:antlr/grammars-v4.git diff --git a/gearley-example-grammars/Cargo.toml b/gearley-example-grammars/Cargo.toml index e626cae..7c74d49 100644 --- a/gearley-example-grammars/Cargo.toml +++ b/gearley-example-grammars/Cargo.toml @@ -8,3 +8,7 @@ gearley-forest = { path = "../gearley-forest/" } cfg-load = { path = "../../cfg/cfg-load/" } cfg = { path = "../../cfg/cfg/" } cfg-examples = { path = "../../cfg/cfg-examples/" } +lalrpop-util = { version = "0.22.2", features = ["lexer", "unicode"] } + +[build-dependencies] +lalrpop = "0.22.2" diff --git a/gearley-example-grammars/build.rs b/gearley-example-grammars/build.rs new file mode 100644 index 0000000..ca5c283 --- /dev/null +++ b/gearley-example-grammars/build.rs @@ -0,0 +1,3 @@ +fn main() { + lalrpop::process_root().unwrap(); +} diff --git a/gearley-example-grammars/grammars-v4 
b/gearley-example-grammars/grammars-v4 new file mode 160000 index 0000000..a5754dd --- /dev/null +++ b/gearley-example-grammars/grammars-v4 @@ -0,0 +1 @@ +Subproject commit a5754dd907b434d161213b729178c8fbbd128c2c diff --git a/gearley-example-grammars/src/antlr4.rs b/gearley-example-grammars/src/antlr4.rs new file mode 100644 index 0000000..7a17eca --- /dev/null +++ b/gearley-example-grammars/src/antlr4.rs @@ -0,0 +1 @@ +lalrpop_mod!(g4); diff --git a/gearley-example-grammars/src/g4.lalrpop b/gearley-example-grammars/src/g4.lalrpop new file mode 100644 index 0000000..24f1e6c --- /dev/null +++ b/gearley-example-grammars/src/g4.lalrpop @@ -0,0 +1,409 @@ +use std::str::FromStr; + +grammar; + +pub GrammarSpec: x = { GrammarDecl PrequelConstruct* Rules ModeSpec* }; + +pub GrammarDecl: GrammarDeclType = { + GRAMMAR => GrammarDeclType::Grammar, + LEXER GRAMMAR => GrammarDeclType::Lexer, + PARSER GRAMMAR => GrammarDeclType::Parser, +}; + +pub PrequelConstruct: PrequelConstructType = { + OptionsSpec DelegateGrammars TokensSpec ChannelsSpec Action => +}; + +pub OptionsSpec = { + OPTIONS (option SEMI)* RBRACE +}; + +pub Action = { + AT (actionScopeName COLONCOLON)? Identifier actionBlock +}; + +pub Identifier = { + TOKEN_REF, + RULE_REF +} + +GRAMMAR = "grammar"; +LEXER = "lexer"; +PARSER = "parser"; +OPTIONS = "options"; +AT = "@"; +COLONCOLON = "::"; + + +parser grammar ANTLRv4Parser; + +options { + tokenVocab = ANTLRv4Lexer; +} + +// The main entry point for parsing a v4 grammar. +grammarSpec + : grammarDecl prequelConstruct* rules modeSpec* EOF + ; + +grammarDecl + : grammarType identifier SEMI + ; + +grammarType + : LEXER GRAMMAR + | PARSER GRAMMAR + | GRAMMAR + ; + +// This is the list of all constructs that can be declared before +// the set of rules that compose the grammar, and is invoked 0..n +// times by the grammarPrequel rule. 
+ +prequelConstruct + : optionsSpec + | delegateGrammars + | tokensSpec + | channelsSpec + | action_ + ; + +// ------------ +// Options - things that affect analysis and/or code generation + +optionsSpec + : OPTIONS (option SEMI)* RBRACE + ; + +option + : identifier ASSIGN optionValue + ; + +optionValue + : identifier (DOT identifier)* + | STRING_LITERAL + | actionBlock + | INT + ; + +// ------------ +// Delegates + +delegateGrammars + : IMPORT delegateGrammar (COMMA delegateGrammar)* SEMI + ; + +delegateGrammar + : identifier ASSIGN identifier + | identifier + ; + +// ------------ +// Tokens & Channels + +tokensSpec + : TOKENS idList? RBRACE + ; + +channelsSpec + : CHANNELS idList? RBRACE + ; + +idList + : identifier (COMMA identifier)* COMMA? + ; + +// Match stuff like @parser::members {int i;} + +action_ + : AT (actionScopeName COLONCOLON)? identifier actionBlock + ; + +// Scope names could collide with keywords; allow them as ids for action scopes + +actionScopeName + : identifier + | LEXER + | PARSER + ; + +actionBlock + : ACTION + ; + +argActionBlock + : BEGIN_ARGUMENT ARGUMENT_CONTENT*? END_ARGUMENT + ; + +modeSpec + : MODE identifier SEMI lexerRuleSpec* + ; + +rules + : ruleSpec* + ; + +ruleSpec + : parserRuleSpec + | lexerRuleSpec + ; + +parserRuleSpec + : ruleModifiers? RULE_REF argActionBlock? ruleReturns? throwsSpec? localsSpec? rulePrequel* COLON ruleBlock SEMI + exceptionGroup + ; + +exceptionGroup + : exceptionHandler* finallyClause? 
+ ; + +exceptionHandler + : CATCH argActionBlock actionBlock + ; + +finallyClause + : FINALLY actionBlock + ; + +rulePrequel + : optionsSpec + | ruleAction + ; + +ruleReturns + : RETURNS argActionBlock + ; + +// -------------- +// Exception spec +throwsSpec + : THROWS qualifiedIdentifier (COMMA qualifiedIdentifier)* + ; + +localsSpec + : LOCALS argActionBlock + ; + +/** Match stuff like @init {int i;} */ +ruleAction + : AT identifier actionBlock + ; + +ruleModifiers + : ruleModifier+ + ; + +// An individual access modifier for a rule. The 'fragment' modifier +// is an internal indication for lexer rules that they do not match +// from the input but are like subroutines for other lexer rules to +// reuse for certain lexical patterns. The other modifiers are passed +// to the code generation templates and may be ignored by the template +// if they are of no use in that language. + +ruleModifier + : PUBLIC + | PRIVATE + | PROTECTED + | FRAGMENT + ; + +ruleBlock + : ruleAltList + ; + +ruleAltList + : labeledAlt (OR labeledAlt)* + ; + +labeledAlt + : alternative (POUND identifier)? + ; + +// -------------------- +// Lexer rules + +lexerRuleSpec + : FRAGMENT? TOKEN_REF optionsSpec? COLON lexerRuleBlock SEMI + ; + +lexerRuleBlock + : lexerAltList + ; + +lexerAltList + : lexerAlt (OR lexerAlt)* + ; + +lexerAlt + : lexerElements lexerCommands? + | + // explicitly allow empty alts + ; + +lexerElements + : lexerElement+ + | + ; + +lexerElement + : lexerAtom ebnfSuffix? + | lexerBlock ebnfSuffix? + | actionBlock QUESTION? 
+ ; + +// but preds can be anywhere + +lexerBlock + : LPAREN lexerAltList RPAREN + ; + +// E.g., channel(HIDDEN), skip, more, mode(INSIDE), push(INSIDE), pop + +lexerCommands + : RARROW lexerCommand (COMMA lexerCommand)* + ; + +lexerCommand + : lexerCommandName LPAREN lexerCommandExpr RPAREN + | lexerCommandName + ; + +lexerCommandName + : identifier + | MODE + ; + +lexerCommandExpr + : identifier + | INT + ; + +// -------------------- +// Rule Alts + +altList + : alternative (OR alternative)* + ; + +alternative + : elementOptions? element+ + | + // explicitly allow empty alts + ; + +element + : labeledElement (ebnfSuffix |) + | atom (ebnfSuffix |) + | ebnf + | actionBlock QUESTION? predicateOptions? + ; + +predicateOptions + : LT predicateOption (COMMA predicateOption)* GT + ; + +predicateOption + : elementOption + | identifier ASSIGN (actionBlock | INT | STRING_LITERAL) + ; + +labeledElement + : identifier (ASSIGN | PLUS_ASSIGN) (atom | block) + ; + +// -------------------- +// EBNF and blocks + +ebnf + : block blockSuffix? + ; + +blockSuffix + : ebnfSuffix + ; + +ebnfSuffix + : QUESTION QUESTION? + | STAR QUESTION? + | PLUS QUESTION? + ; + +lexerAtom + : characterRange + | terminalDef + | notSet + | LEXER_CHAR_SET + | wildcard + ; + +atom + : terminalDef + | ruleref + | notSet + | wildcard + ; + +wildcard + : DOT elementOptions? + ; + +// -------------------- +// Inverted element set +notSet + : NOT setElement + | NOT blockSet + ; + +blockSet + : LPAREN setElement (OR setElement)* RPAREN + ; + +setElement + : TOKEN_REF elementOptions? + | STRING_LITERAL elementOptions? + | characterRange + | LEXER_CHAR_SET + ; + +// ------------- +// Grammar Block +block + : LPAREN (optionsSpec? ruleAction* COLON)? altList RPAREN + ; + +// ---------------- +// Parser rule ref +ruleref + : RULE_REF argActionBlock? elementOptions? + ; + +// --------------- +// Character Range +characterRange + : STRING_LITERAL RANGE STRING_LITERAL + ; + +terminalDef + : TOKEN_REF elementOptions? 
+ | STRING_LITERAL elementOptions? + ; + +// Terminals may be adorned with certain options when +// referenced in the grammar: TOK<,,,> +elementOptions + : LT elementOption (COMMA elementOption)* GT + ; + +elementOption + : qualifiedIdentifier + | identifier ASSIGN (qualifiedIdentifier | STRING_LITERAL | INT) + ; + +identifier + : RULE_REF + | TOKEN_REF + ; + +qualifiedIdentifier + : identifier (DOT identifier)* + ; \ No newline at end of file diff --git a/gearley-example-grammars/src/lib.rs b/gearley-example-grammars/src/lib.rs index 8135721..949afb1 100644 --- a/gearley-example-grammars/src/lib.rs +++ b/gearley-example-grammars/src/lib.rs @@ -1,6 +1,7 @@ pub mod ambiguous_math; pub mod c; pub mod precedenced_math; +mod antlr4; pub static BNFS: &'static [[&'static str; 4]] = &[ [