diff --git a/build.sc b/build.sc
index 23a2c215fa..c41e7c4664 100644
--- a/build.sc
+++ b/build.sc
@@ -513,6 +513,40 @@ object vercors extends Module {
     }
   }
 
+  object textMateGenerator extends VercorsModule { // build module for the TextMate grammar generator (sources: src/textMateGenerator)
+
+    def base = T {
+      settings.src / "parsers" / "antlr4"
+    }
+
+    override def key: String = "textMateGenerator"
+
+    override def deps: T[Agg[Dep]] = Agg(ivy"org.antlr:antlr4-runtime:4.8", ivy"com.lihaoyi::upickle:3.1.3")
+
+    override def bareResources =
+      T.sources(
+        base() / "SpecLexer.g4",
+        base() / "LangPVLLexer.g4",
+        base() / "LexerAdaptor.java", // NOTE(review): this change adds LexerAdaptor.java under src/textMateGenerator/vct/antlr4/generated; confirm a copy exists under src/parsers/antlr4
+      )
+
+    override def moduleDeps = Seq(hre) // gen.Main imports hre.io.{RWFile, Readable}
+
+    object antlrGrammarParser extends parsers.GenModule { // generates the ANTLRv4 lexer/parser that gen.Main uses to read .g4 files
+      override def base = T {
+        settings.src / "textMateGenerator" / "antlr4"
+      }
+
+      override def lexer: String = "ANTLRv4Lexer.g4"
+
+      override def parser: String = "ANTLRv4Parser.g4"
+
+      override def deps: Seq[String] = Seq("LexBasic.g4") // ANTLRv4Lexer.g4 does `import LexBasic;`
+    }
+
+    override def generatedSources = T { Seq(antlrGrammarParser.generate()) }
+  }
+
   object rewrite extends VercorsModule {
     def key = "rewrite"
     def deps = Agg(
diff --git a/src/parsers/antlr4/LLVMSpecParser.g4 b/src/parsers/antlr4/LLVMSpecParser.g4
index 0fbbc09311..ff639c954e 100644
--- a/src/parsers/antlr4/LLVMSpecParser.g4
+++ b/src/parsers/antlr4/LLVMSpecParser.g4
@@ -9,7 +9,7 @@ import LangLLVMSpecParser, SpecParser;
 
 langExpr: expression;
 langId: Identifier;
-langConstInt: Constant;
+langConstInt: IntegerConstant;
 langType: type;
 langStatement: EOF EOF;
 langStatic: EOF EOF;
diff --git a/src/parsers/antlr4/LangPVLLexer.g4 b/src/parsers/antlr4/LangPVLLexer.g4
index e3331e00e4..8a44c90bbe 100644
--- a/src/parsers/antlr4/LangPVLLexer.g4
+++ b/src/parsers/antlr4/LangPVLLexer.g4
@@ -144,7 +144,7 @@ HEX_DIGIT
     ;
 
 mode DEFAULT_MODE;
-Identifier : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*;
+Identifier : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*;
 
 COMMENT : '/*' .*? '*/' -> skip;
 LINE_COMMENT : '//' .*? ('\n'|EOF) -> skip;
diff --git a/src/parsers/antlr4/SpecLexer.g4 b/src/parsers/antlr4/SpecLexer.g4
index dbe017a447..c98a2e070f 100644
--- a/src/parsers/antlr4/SpecLexer.g4
+++ b/src/parsers/antlr4/SpecLexer.g4
@@ -29,7 +29,7 @@ VAL_FALSE: 'false';
 VAL_PACKAGE: 'package';
 */
 
-NEVER: EOF '=';
+NEVER: EOF '=' ;
 
 // Must be able to contain identifiers from any frontend, so it's fine to over-approximate valid identifiers a bit.
 LANG_ID_ESCAPE: '`' ~[`]+ '`';
@@ -46,7 +46,7 @@ VAL_SET: 'set';
 VAL_BAG: 'bag';
 VAL_POINTER: 'pointer';
 VAL_MAP: 'map';
-VAL_OPTION: 'option';
+VAL_OPTION options { category=spec_type_cons; } : 'option';
 VAL_EITHER: 'either';
 VAL_TUPLE: 'tuple';
 VAL_TYPE: 'type';
@@ -122,7 +122,7 @@ VAL_PERM_VAL: 'perm';
 VAL_PERM: 'Perm';
 VAL_POINTS_TO: 'PointsTo';
 VAL_RUNNING: 'running';
-VAL_SOME: 'Some';
+VAL_SOME: 'Some';
 VAL_LEFT: 'Left';
 VAL_RIGHT: 'Right';
 VAL_VALUE: 'Value';
diff --git a/src/textMateGenerator/antlr4/ANTLRv4Lexer.g4 b/src/textMateGenerator/antlr4/ANTLRv4Lexer.g4
new file mode 100644
index 0000000000..4e8efe5d0e
--- /dev/null
+++ b/src/textMateGenerator/antlr4/ANTLRv4Lexer.g4
@@ -0,0 +1,395 @@
+/*
+ * [The "BSD license"]
+ *  Copyright (c) 2012-2015 Terence Parr
+ *  Copyright (c) 2012-2015 Sam Harwell
+ *  Copyright (c) 2015 Gerald Rosenberg
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. The name of the author may not be used to endorse or promote products
+ *     derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// ======================================================
+// Lexer specification
+// ======================================================
+
+lexer grammar ANTLRv4Lexer;
+
+options {
+    superClass = LexerAdaptor; // supplies handleBeginArgument(), handleEndArgument() and handleEndAction() used in the actions below
+}
+
+import LexBasic;
+
+// Standard set of fragments
+tokens {
+    TOKEN_REF,
+    RULE_REF,
+    LEXER_CHAR_SET
+}
+
+channels {
+    OFF_CHANNEL,
+    COMMENT
+}
+
+// -------------------------
+// Comments
+DOC_COMMENT
+    : DocComment -> channel (COMMENT)
+    ;
+
+BLOCK_COMMENT
+    : BlockComment -> channel (COMMENT)
+    ;
+
+LINE_COMMENT
+    : LineComment -> channel (COMMENT)
+    ;
+
+// -------------------------
+// Integer
+
+INT
+    : DecimalNumeral
+    ;
+
+// -------------------------
+// Literal string
+//
+// ANTLR makes no distinction between a single character literal and a
+// multi-character string. All literals are single quote delimited and
+// may contain unicode escape sequences of the form \uxxxx, where x
+// is a valid hexadecimal number (per Unicode standard).
+STRING_LITERAL
+    : SQuoteLiteral
+    ;
+
+UNTERMINATED_STRING_LITERAL
+    : USQuoteLiteral
+    ;
+
+// -------------------------
+// Arguments
+//
+// Certain argument lists, such as those specifying call parameters
+// to a rule invocation, or input parameters to a rule specification
+// are contained within square brackets.
+BEGIN_ARGUMENT
+    : LBrack { this.handleBeginArgument(); }
+    ;
+
+// -------------------------
+// Target Language Actions
+BEGIN_ACTION
+    : LBrace -> pushMode (TargetLanguageAction)
+    ;
+
+// -------------------------
+// Keywords
+//
+// 'options', 'tokens', and 'channels' are considered keywords
+// but only when followed by '{', and considered as a single token.
+// Otherwise, the symbols are tokenized as RULE_REF and allowed as
+// an identifier in a labeledElement.
+OPTIONS
+    : 'options' WSNLCHARS* '{'
+    ;
+
+TOKENS
+    : 'tokens' WSNLCHARS* '{'
+    ;
+
+CHANNELS
+    : 'channels' WSNLCHARS* '{'
+    ;
+
+fragment WSNLCHARS
+    : ' '
+    | '\t'
+    | '\f'
+    | '\n'
+    | '\r'
+    ;
+
+IMPORT
+    : 'import'
+    ;
+
+FRAGMENT
+    : 'fragment'
+    ;
+
+LEXER
+    : 'lexer'
+    ;
+
+GRAMMAR
+    : 'grammar'
+    ;
+
+LOCALS
+    : 'locals'
+    ;
+
+MODE
+    : 'mode'
+    ;
+
+// -------------------------
+// Punctuation
+
+COLON
+    : Colon
+    ;
+
+COLONCOLON
+    : DColon
+    ;
+
+COMMA
+    : Comma
+    ;
+
+SEMI
+    : Semi
+    ;
+
+LPAREN
+    : LParen
+    ;
+
+RPAREN
+    : RParen
+    ;
+
+LBRACE
+    : LBrace
+    ;
+
+RBRACE
+    : RBrace
+    ;
+
+RARROW
+    : RArrow
+    ;
+
+LT
+    : Lt
+    ;
+
+GT
+    : Gt
+    ;
+
+ASSIGN
+    : Equal
+    ;
+
+QUESTION
+    : Question
+    ;
+
+STAR
+    : Star
+    ;
+
+PLUS_ASSIGN
+    : PlusAssign
+    ;
+
+PLUS
+    : Plus
+    ;
+
+OR
+    : Pipe
+    ;
+
+DOLLAR
+    : Dollar
+    ;
+
+RANGE
+    : Range
+    ;
+
+DOT
+    : Dot
+    ;
+
+AT
+    : At
+    ;
+
+POUND
+    : Pound
+    ;
+
+NOT
+    : Tilde
+    ;
+
+// -------------------------
+// Identifiers - allows unicode rule/token names
+
+ID
+    : Id
+    ;
+
+// -------------------------
+// Whitespace
+
+WS
+    : Ws+ -> channel (OFF_CHANNEL)
+    ;
+
+// -------------------------
+// Illegal Characters
+//
+// This is an illegal character trap which is always the last rule in the
+// lexer specification. It matches a single character of any value and being
+// the last rule in the file will match when no other rule knows what to do
+// about the character. It is reported as an error but is not passed on to the
+// parser. This means that the parser has to deal with the grammar file anyway
+// but we will not try to analyse or code generate from a file with lexical
+// errors.
+
+// Comment this rule out to allow the error to be propagated to the parser
+ERRCHAR
+    : . -> channel (HIDDEN)
+    ;
+
+// ======================================================
+// Lexer modes
+// -------------------------
+// Arguments
+mode Argument;
+
+// E.g., [int x, List a[]]
+NESTED_ARGUMENT
+    : LBrack -> type (ARGUMENT_CONTENT), pushMode (Argument)
+    ;
+
+ARGUMENT_ESCAPE
+    : EscAny -> type (ARGUMENT_CONTENT)
+    ;
+
+ARGUMENT_STRING_LITERAL
+    : DQuoteLiteral -> type (ARGUMENT_CONTENT)
+    ;
+
+ARGUMENT_CHAR_LITERAL
+    : SQuoteLiteral -> type (ARGUMENT_CONTENT)
+    ;
+
+END_ARGUMENT
+    : RBrack { this.handleEndArgument(); }
+    ;
+
+// added this to return non-EOF token type here. EOF does something weird
+UNTERMINATED_ARGUMENT
+    : EOF -> popMode
+    ;
+
+ARGUMENT_CONTENT
+    : .
+    ;
+
+// TODO: This grammar and the one used in the Intellij Antlr4 plugin differ
+// for "actions". This needs to be resolved at some point.
+// The Intellij Antlr4 grammar is here:
+// https://github.com/antlr/intellij-plugin-v4/blob/1f36fde17f7fa63cb18d7eeb9cb213815ac658fb/src/main/antlr/org/antlr/intellij/plugin/parser/ANTLRv4Lexer.g4#L587
+
+// -------------------------
+// Target Language Actions
+//
+// Many language targets use {} as block delimiters and so we
+// must recursively match {} delimited blocks to balance the
+// braces. Additionally, we must make some assumptions about
+// literal string representation in the target language. We assume
+// that they are delimited by ' or " and so consume these
+// in their own alts so as not to inadvertently match {}.
+mode TargetLanguageAction;
+
+NESTED_ACTION
+    : LBrace -> type (ACTION_CONTENT), pushMode (TargetLanguageAction)
+    ;
+
+ACTION_ESCAPE
+    : EscAny -> type (ACTION_CONTENT)
+    ;
+
+ACTION_STRING_LITERAL
+    : DQuoteLiteral -> type (ACTION_CONTENT)
+    ;
+
+ACTION_CHAR_LITERAL
+    : SQuoteLiteral -> type (ACTION_CONTENT)
+    ;
+
+ACTION_DOC_COMMENT
+    : DocComment -> type (ACTION_CONTENT)
+    ;
+
+ACTION_BLOCK_COMMENT
+    : BlockComment -> type (ACTION_CONTENT)
+    ;
+
+ACTION_LINE_COMMENT
+    : LineComment -> type (ACTION_CONTENT)
+    ;
+
+END_ACTION
+    : RBrace { this.handleEndAction(); }
+    ;
+
+UNTERMINATED_ACTION
+    : EOF -> popMode
+    ;
+
+ACTION_CONTENT
+    : .
+    ;
+
+// -------------------------
+mode LexerCharSet;
+
+LEXER_CHAR_SET_BODY
+    : (~ [\]\\] | EscAny)+ -> more
+    ;
+
+LEXER_CHAR_SET
+    : RBrack -> popMode
+    ;
+
+UNTERMINATED_CHAR_SET
+    : EOF -> popMode
+    ;
+
+// ------------------------------------------------------------------------------
+// Grammar specific Keywords, Punctuation, etc.
+fragment Id
+    : NameStartChar NameChar*
+    ;
diff --git a/src/textMateGenerator/antlr4/ANTLRv4Parser.g4 b/src/textMateGenerator/antlr4/ANTLRv4Parser.g4
new file mode 100644
index 0000000000..a2d02df1ae
--- /dev/null
+++ b/src/textMateGenerator/antlr4/ANTLRv4Parser.g4
@@ -0,0 +1,252 @@
+/*
+ * [The "BSD license"]
+ *  Copyright (c) 2012-2014 Terence Parr
+ *  Copyright (c) 2012-2014 Sam Harwell
+ *  Copyright (c) 2015 Gerald Rosenberg
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. The name of the author may not be used to endorse or promote products
+ *     derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+parser grammar ANTLRv4Parser;
+
+options {
+    tokenVocab = ANTLRv4Lexer;
+}
+
+// The main entry point for parsing a v4 grammar.
+grammarSpec
+    : grammarDecl prequelConstruct* rules modeSpec* EOF
+    ;
+
+grammarDecl
+    : grammarType identifier SEMI
+    ;
+
+grammarType
+    : LEXER GRAMMAR // this trimmed-down parser only accepts `lexer grammar` declarations
+    ;
+
+prequelConstruct
+    : optionsSpec
+    | delegateGrammars
+    | tokensSpec
+    | channelsSpec
+    | action_
+    ;
+
+// ------------
+// Options - things that affect analysis and/or code generation
+
+optionsSpec
+    : OPTIONS (option SEMI)* RBRACE
+    ;
+
+option
+    : identifier ASSIGN optionValue
+    ;
+
+optionValue
+    : identifier (DOT identifier)*
+    | STRING_LITERAL
+    | actionBlock
+    | INT
+    ;
+
+// ------------
+// Delegates
+
+delegateGrammars
+    : IMPORT delegateGrammar (COMMA delegateGrammar)* SEMI
+    ;
+
+delegateGrammar
+    : identifier ASSIGN identifier
+    | identifier
+    ;
+
+// ------------
+// Tokens & Channels
+
+tokensSpec
+    : TOKENS idList? RBRACE
+    ;
+
+channelsSpec
+    : CHANNELS idList? RBRACE
+    ;
+
+idList
+    : identifier (COMMA identifier)* COMMA?
+    ;
+
+// Match stuff like @parser::members {int i;}
+
+action_
+    : AT (actionScopeName COLONCOLON)? identifier actionBlock
+    ;
+
+// Scope names could collide with keywords; allow them as ids for action scopes
+
+actionScopeName
+    : identifier
+    | LEXER
+    ;
+
+actionBlock
+    : BEGIN_ACTION ACTION_CONTENT* END_ACTION
+    ;
+
+modeSpec
+    : MODE identifier SEMI lexerRuleSpec*
+    ;
+
+rules
+    : ruleSpec*
+    ;
+
+ruleSpec
+    : lexerRuleSpec // only lexer rules are accepted; this grammar defines no parser-rule alternative
+    ;
+
+// --------------------
+// Lexer rules
+
+lexerRuleSpec
+    : FRAGMENT? TOKEN_REF optionsSpec? COLON lexerRuleBlock SEMI
+    ;
+
+lexerRuleBlock
+    : lexerAltList
+    ;
+
+lexerAltList
+    : lexerAlt (OR lexerAlt)*
+    ;
+
+lexerAlt
+    : lexerElements lexerCommands?
+    |
+    // explicitly allow empty alts
+    ;
+
+lexerElements
+    : lexerElement+
+    |
+    ;
+
+lexerElement
+    : lexerAtom ebnfSuffix?
+    | lexerBlock ebnfSuffix?
+    | actionBlock QUESTION?
+    ;
+
+// but preds can be anywhere
+
+lexerBlock
+    : LPAREN lexerAltList RPAREN
+    ;
+
+// E.g., channel(HIDDEN), skip, more, mode(INSIDE), push(INSIDE), pop
+
+lexerCommands
+    : RARROW lexerCommand (COMMA lexerCommand)*
+    ;
+
+lexerCommand
+    : lexerCommandName LPAREN lexerCommandExpr RPAREN
+    | lexerCommandName
+    ;
+
+lexerCommandName
+    : identifier
+    | MODE
+    ;
+
+lexerCommandExpr
+    : identifier
+    | INT
+    ;
+
+// --------------------
+// EBNF and blocks
+
+ebnfSuffix
+    : QUESTION QUESTION?
+    | STAR QUESTION?
+    | PLUS QUESTION?
+    ;
+
+lexerAtom
+    : characterRange
+    | terminalDef
+    | notSet
+    | LEXER_CHAR_SET
+    | DOT elementOptions?
+    ;
+
+// --------------------
+// Inverted element set
+notSet
+    : NOT setElement
+    | NOT blockSet
+    ;
+
+blockSet
+    : LPAREN setElement (OR setElement)* RPAREN
+    ;
+
+setElement
+    : TOKEN_REF elementOptions?
+    | STRING_LITERAL elementOptions?
+    | characterRange
+    | LEXER_CHAR_SET
+    ;
+
+// ---------------
+// Character Range
+characterRange
+    : STRING_LITERAL RANGE STRING_LITERAL
+    ;
+
+terminalDef
+    : TOKEN_REF elementOptions?
+    | STRING_LITERAL elementOptions?
+    ;
+
+// Terminals may be adorned with certain options when
+// referenced in the grammar: TOK<,,,>
+elementOptions
+    : LT elementOption (COMMA elementOption)* GT
+    ;
+
+elementOption
+    : identifier
+    | identifier ASSIGN (identifier | STRING_LITERAL)
+    ;
+
+identifier
+    : RULE_REF
+    | TOKEN_REF
+    ;
diff --git a/src/textMateGenerator/antlr4/LexBasic.g4 b/src/textMateGenerator/antlr4/LexBasic.g4
new file mode 100644
index 0000000000..1939630ece
--- /dev/null
+++ b/src/textMateGenerator/antlr4/LexBasic.g4
@@ -0,0 +1,286 @@
+/*
+ * [The "BSD license"]
+ *  Copyright (c) 2014-2015 Gerald Rosenberg
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. The name of the author may not be used to endorse or promote products
+ *     derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/**
+ * A generally reusable set of fragments for import into Lexer grammars.
+ *
+ *	Modified 2015.06.16 gbr -
+ *	-- generalized for inclusion into the ANTLRv4 grammar distribution
+ *
+ */
+
+// $antlr-format alignTrailingComments on, columnLimit 130, minEmptyLines 1, maxEmptyLinesToKeep 1, reflowComments off
+// $antlr-format useTab off, allowShortRulesOnASingleLine off, allowShortBlocksOnASingleLine on, alignSemicolons hanging
+// $antlr-format alignColons hanging
+
+lexer grammar LexBasic;
+
+// ======================================================
+// Lexer fragments
+//
+// -----------------------------------
+// Whitespace & Comments
+
+fragment Ws
+    : Hws
+    | Vws
+    ;
+
+fragment Hws
+    : [ \t]
+    ;
+
+fragment Vws
+    : [\r\n\f]
+    ;
+
+fragment BlockComment // also matches a comment left unterminated at end of input
+    : '/*' .*? ('*/' | EOF)
+    ;
+
+fragment DocComment // also matches a comment left unterminated at end of input
+    : '/**' .*? ('*/' | EOF)
+    ;
+
+fragment LineComment
+    : '//' ~ [\r\n]*
+    ;
+
+// -----------------------------------
+// Escapes
+// Any kind of escaped character that we can embed within ANTLR literal strings.
+
+fragment EscSeq
+    : Esc ([btnfr"'\\] | UnicodeEsc | . | EOF)
+    ;
+
+fragment EscAny
+    : Esc .
+    ;
+
+fragment UnicodeEsc // 'u' followed by zero to four hex digits, so incomplete escapes still match
+    : 'u' (HexDigit (HexDigit (HexDigit HexDigit?)?)?)?
+    ;
+
+// -----------------------------------
+// Numerals
+
+fragment DecimalNumeral // either a lone '0' or a number without leading zeros
+    : '0'
+    | [1-9] DecDigit*
+    ;
+
+// -----------------------------------
+// Digits
+
+fragment HexDigit
+    : [0-9a-fA-F]
+    ;
+
+fragment DecDigit
+    : [0-9]
+    ;
+
+// -----------------------------------
+// Literals
+
+fragment BoolLiteral
+    : 'true'
+    | 'false'
+    ;
+
+fragment CharLiteral
+    : SQuote (EscSeq | ~ ['\r\n\\]) SQuote
+    ;
+
+fragment SQuoteLiteral
+    : SQuote (EscSeq | ~ ['\r\n\\])* SQuote
+    ;
+
+fragment DQuoteLiteral
+    : DQuote (EscSeq | ~ ["\r\n\\])* DQuote
+    ;
+
+fragment USQuoteLiteral // same as SQuoteLiteral but without the closing quote
+    : SQuote (EscSeq | ~ ['\r\n\\])*
+    ;
+
+// -----------------------------------
+// Character ranges
+
+fragment NameChar
+    : NameStartChar
+    | '0' .. '9'
+    | Underscore
+    | '\u00B7'
+    | '\u0300' .. '\u036F'
+    | '\u203F' .. '\u2040'
+    ;
+
+fragment NameStartChar
+    : 'A' .. 'Z'
+    | 'a' .. 'z'
+    | '\u00C0' .. '\u00D6'
+    | '\u00D8' .. '\u00F6'
+    | '\u00F8' .. '\u02FF'
+    | '\u0370' .. '\u037D'
+    | '\u037F' .. '\u1FFF'
+    | '\u200C' .. '\u200D'
+    | '\u2070' .. '\u218F'
+    | '\u2C00' .. '\u2FEF'
+    | '\u3001' .. '\uD7FF'
+    | '\uF900' .. '\uFDCF'
+    | '\uFDF0' .. '\uFFFD'
+    // ignores | ['\u10000-'\uEFFFF]
+    ;
+
+// -----------------------------------
+// Types
+
+fragment Int
+    : 'int'
+    ;
+
+// -----------------------------------
+// Symbols
+
+fragment Esc
+    : '\\'
+    ;
+
+fragment Colon
+    : ':'
+    ;
+
+fragment DColon
+    : '::'
+    ;
+
+fragment SQuote
+    : '\''
+    ;
+
+fragment DQuote
+    : '"'
+    ;
+
+fragment LParen
+    : '('
+    ;
+
+fragment RParen
+    : ')'
+    ;
+
+fragment LBrace
+    : '{'
+    ;
+
+fragment RBrace
+    : '}'
+    ;
+
+fragment LBrack
+    : '['
+    ;
+
+fragment RBrack
+    : ']'
+    ;
+
+fragment RArrow
+    : '->'
+    ;
+
+fragment Lt
+    : '<'
+    ;
+
+fragment Gt
+    : '>'
+    ;
+
+fragment Equal
+    : '='
+    ;
+
+fragment Question
+    : '?'
+    ;
+
+fragment Star
+    : '*'
+    ;
+
+fragment Plus
+    : '+'
+    ;
+
+fragment PlusAssign
+    : '+='
+    ;
+
+fragment Underscore
+    : '_'
+    ;
+
+fragment Pipe
+    : '|'
+    ;
+
+fragment Dollar
+    : '$'
+    ;
+
+fragment Comma
+    : ','
+    ;
+
+fragment Semi
+    : ';'
+    ;
+
+fragment Dot
+    : '.'
+    ;
+
+fragment Range
+    : '..'
+    ;
+
+fragment At
+    : '@'
+    ;
+
+fragment Pound
+    : '#'
+    ;
+
+fragment Tilde
+    : '~'
+    ;
diff --git a/src/textMateGenerator/gen/CGL.scala b/src/textMateGenerator/gen/CGL.scala
new file mode 100644
index 0000000000..964be0b8c6
--- /dev/null
+++ b/src/textMateGenerator/gen/CGL.scala
@@ -0,0 +1,21 @@
+package gen
+import upickle.default.{ReadWriter => RW, macroRW}
+
+/**
+ * Common Grammar Language
+ */
+case class CGL(scopeName: String, fileTypes: String, patterns: Seq[MatchPattern]) {
+  def addPattern(mp: MatchPattern): CGL = {
+    CGL(scopeName, fileTypes, patterns = patterns :+ mp)
+  }
+}
+object CGL{
+  implicit val rw: RW[CGL] = macroRW
+}
+
+
+
+case class MatchPattern(name: String, `match`: String)
+object MatchPattern {
+  implicit val rw: RW[MatchPattern] = macroRW
+}
\ No newline at end of file
diff --git a/src/textMateGenerator/gen/Main.scala b/src/textMateGenerator/gen/Main.scala
new file mode 100644
index 0000000000..23b926a5e2
--- /dev/null
+++ b/src/textMateGenerator/gen/Main.scala
@@ -0,0 +1,101 @@
+package gen
+
+import hre.io.{RWFile, Readable}
+import upickle.default.{macroRW, ReadWriter => RW}
+import upickle._
+
+import java.nio.file.Paths
+import org.antlr.v4.runtime
+import org.antlr.v4.runtime.misc.IntervalSet
+import org.antlr.v4.runtime.{CharStreams, CommonTokenStream}
+import vct.antlr4.generated.ANTLRv4Parser.{LexerAltContext, LexerAltListContext, LexerRuleSpecContext, Rules0Context}
+import vct.antlr4.generated.ANTLRv4ParserPatterns._
+import vct.antlr4.generated.{ANTLRv4Lexer, ANTLRv4Parser}
+
+import java.io.FileNotFoundException
+import scala.jdk.CollectionConverters.CollectionHasAsScala
+
+/**
+ * Work-in-progress TextMate grammar generator: parses an ANTLR lexer grammar
+ * and, for now, prints the name of every lexer rule it contains.
+ */
+case object Main {
+
+  /**
+   * Lexes and parses the ANTLR lexer grammar stored at `path`.
+   * Exceptions raised while reading the file are not handled here.
+   *
+   * @return the parse tree produced by the `grammarSpec` entry rule
+   */
+  def parse(path: String) = {
+    val readable: Readable = RWFile(Paths.get(path))
+    readable.read { reader =>
+      val stream: runtime.CharStream = CharStreams.fromReader(reader, readable.fileName)
+      val lexer = new ANTLRv4Lexer(stream)
+      val tokens = new CommonTokenStream(lexer)
+      val parser = new ANTLRv4Parser(tokens)
+      parser.grammarSpec()
+    }
+  }
+
+  /**
+   * @param args scope name, file types and the path of the lexer grammar, in that order
+   */
+  def main(args: Array[String]): Unit = {
+    require(args.length >= 3, "usage: <scopeName> <fileTypes> <lexerGrammarPath>")
+    // Not emitted yet; see the commented-out print at the end of this method.
+    var textMateGrammar = CGL(args(0), args(1), Nil)
+    val tree = parse(args(2))
+    val GrammarSpec0(
+      GrammarDecl0(_, grammarName, _),
+      _,
+      Rules0(modelessRules),
+      modedRules,
+      _
+    ) = tree
+
+    // Rules before the first `mode` declaration, followed by the rules of every mode.
+    val rules: Seq[LexerRuleSpecContext] =
+      modelessRules.map { case RuleSpec0(rule) => rule } ++
+        modedRules.flatMap { case ModeSpec0(_, _, _, modeRules) => modeRules }
+
+    rules.foreach(processRule)
+
+    // print(upickle.default.write(textMateGrammar, indent = 2))
+  }
+
+  /** Prints the name of `rule`; its alternatives are gathered but not translated yet. */
+  def processRule(rule: LexerRuleSpecContext): Unit = {
+    val LexerRuleSpec0(_, name, _, _, LexerRuleBlock0(altsRule), _) = rule
+    // TODO: feed these alternatives to asRegLang once it is implemented.
+    val alts = altsRule.children.asScala.collect { case alt: LexerAltContext => alt }
+    println(name)
+  }
+
+  /** Regular-language AST that lexer rules are meant to be translated into. */
+  sealed trait RegLang
+  case class CharInSet(chars: IntervalSet) extends RegLang
+  case class Alts(langs: Seq[RegLang]) extends RegLang
+  case class Seqn(langs: Seq[RegLang]) extends RegLang
+  case class Star(lang: RegLang, greedy: Boolean) extends RegLang
+  case class Plus(lang: RegLang, greedy: Boolean) extends RegLang
+  case class QMark(lang: RegLang, greedy: Boolean) extends RegLang
+
+  /** Translation of a lexer alternative list into a [[RegLang]]; not implemented yet. */
+  def asRegLang(alts: LexerAltListContext): RegLang = ???
+
+  /**
+   * Backslash-escapes every character of `text` that is neither a word character nor a
+   * backslash, wraps each run of word characters in `\b(?:...)\b`, and finally rewrites
+   * each sequence of five consecutive backslashes to three.
+   */
+  def escapeAndEncloseWords(text: String): String = {
+    val stringWithEscapedSymbols = text.replaceAll("([^\\w\\\\])", "\\\\$1")
+    val wordsWithBoundaries = stringWithEscapedSymbols.replaceAll("\\b(\\w+)\\b", "\\\\b(?:$1)\\\\b")
+    val finalString = wordsWithBoundaries.replaceAll("""\\\\\\\\\\""", """\\\\\\""")
+
+    finalString
+  }
+
+}
+
diff --git a/src/textMateGenerator/vct/antlr4/generated/LexerAdaptor.java b/src/textMateGenerator/vct/antlr4/generated/LexerAdaptor.java
new file mode 100644
index 0000000000..2a656729d2
--- /dev/null
+++ b/src/textMateGenerator/vct/antlr4/generated/LexerAdaptor.java
@@ -0,0 +1,152 @@
+/*
+ [The "BSD licence"]
+ Copyright (c) 2005-2007 Terence Parr
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 1. Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+ 3. The name of the author may not be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+//package org.antlr.parser.antlr4;
+package vct.antlr4.generated;
+
+import org.antlr.v4.runtime.CharStream;
+import org.antlr.v4.runtime.Lexer;
+import org.antlr.v4.runtime.Token;
+import org.antlr.v4.runtime.misc.Interval;
+
+public abstract class LexerAdaptor extends Lexer { // base class of the generated ANTLRv4Lexer (referenced by its constants below)
+
+    /**
+     *  Generic type for OPTIONS, TOKENS and CHANNELS
+     */
+    private static final int PREQUEL_CONSTRUCT = -10;
+    private static final int OPTIONS_CONSTRUCT = -11; // rule type while inside an options block that follows a token rule name
+
+    public LexerAdaptor(CharStream input) {
+        super(input);
+    }
+
+    /**
+     * Track whether we are inside of a rule and whether it is lexical parser. _currentRuleType==Token.INVALID_TYPE
+     * means that we are outside of a rule. At the first sign of a rule name reference and _currentRuleType==invalid, we
+     * can assume that we are starting a parser rule. Similarly, seeing a token reference when not already in rule means
+     * starting a token rule. The terminating ';' of a rule, flips this back to invalid type.
+     *
+     * This is not perfect logic but works. For example, "grammar T;" means that we start and stop a lexical rule for
+     * the "T;". Dangerous but works.
+     *
+     * The whole point of this state information is to distinguish between [..arg actions..] and [charsets]. Char sets
+     * can only occur in lexical rules and arg actions cannot occur.
+     */
+    private int _currentRuleType = Token.INVALID_TYPE;
+
+    private boolean insideOptionsBlock = false; // only ever assigned (here and in reset()); never read within this class
+
+    public int getCurrentRuleType() {
+        return _currentRuleType;
+    }
+
+    public void setCurrentRuleType(int ruleType) {
+        this._currentRuleType = ruleType;
+    }
+
+    protected void handleBeginArgument() { // '[' opens a char set inside a lexer rule, an argument block otherwise
+        if (inLexerRule()) {
+            pushMode(ANTLRv4Lexer.LexerCharSet);
+            more();
+        } else {
+            pushMode(ANTLRv4Lexer.Argument);
+        }
+    }
+
+    protected void handleEndArgument() { // ']' is retyped to ARGUMENT_CONTENT when modes remain on the stack after popping
+        popMode();
+        if (_modeStack.size() > 0) {
+            setType(ANTLRv4Lexer.ARGUMENT_CONTENT);
+        }
+    }
+
+    protected void handleEndAction() { // '}' is retyped to ACTION_CONTENT when it only closes a nested action
+        int oldMode = _mode;
+        int newMode = popMode();
+        boolean isActionWithinAction = _modeStack.size() > 0
+            && newMode == ANTLRv4Lexer.TargetLanguageAction
+            && oldMode == newMode;
+
+        if (isActionWithinAction) {
+            setType(ANTLRv4Lexer.ACTION_CONTENT);
+        }
+    }
+
+    @Override
+    public Token emit() { // updates the rule-type state from the token about to be emitted, then delegates to Lexer.emit()
+        if ((_type == ANTLRv4Lexer.OPTIONS || _type == ANTLRv4Lexer.TOKENS || _type == ANTLRv4Lexer.CHANNELS)
+                && getCurrentRuleType() == Token.INVALID_TYPE) { // enter prequel construct ending with an RBRACE
+            setCurrentRuleType(PREQUEL_CONSTRUCT);
+        } else if (_type == ANTLRv4Lexer.OPTIONS && getCurrentRuleType() == ANTLRv4Lexer.TOKEN_REF)
+        { // enter options block attached to a lexer rule
+            setCurrentRuleType(OPTIONS_CONSTRUCT);
+        } else if (_type == ANTLRv4Lexer.RBRACE && getCurrentRuleType() == PREQUEL_CONSTRUCT) { // exit prequel construct
+            setCurrentRuleType(Token.INVALID_TYPE);
+        } else if (_type == ANTLRv4Lexer.RBRACE && getCurrentRuleType() == OPTIONS_CONSTRUCT)
+        { // exit options
+            setCurrentRuleType(ANTLRv4Lexer.TOKEN_REF);
+        } else if (_type == ANTLRv4Lexer.AT && getCurrentRuleType() == Token.INVALID_TYPE) { // enter action
+            setCurrentRuleType(ANTLRv4Lexer.AT);
+        } else if (_type == ANTLRv4Lexer.SEMI && getCurrentRuleType() == OPTIONS_CONSTRUCT)
+        { // ';' in options { .... }. Don't change anything.
+        } else if (_type == ANTLRv4Lexer.END_ACTION && getCurrentRuleType() == ANTLRv4Lexer.AT) { // exit action
+            setCurrentRuleType(Token.INVALID_TYPE);
+        } else if (_type == ANTLRv4Lexer.ID) { // split ID into TOKEN_REF / RULE_REF by the case of its first character
+            String firstChar = _input.getText(Interval.of(_tokenStartCharIndex, _tokenStartCharIndex));
+            if (Character.isUpperCase(firstChar.charAt(0))) {
+                _type = ANTLRv4Lexer.TOKEN_REF;
+            } else {
+                _type = ANTLRv4Lexer.RULE_REF;
+            }
+
+            if (getCurrentRuleType() == Token.INVALID_TYPE) { // if outside of rule def
+                setCurrentRuleType(_type); // set to inside lexer or parser rule
+            }
+        } else if (_type == ANTLRv4Lexer.SEMI) { // exit rule def
+            setCurrentRuleType(Token.INVALID_TYPE);
+        }
+
+        return super.emit();
+    }
+
+    private boolean inLexerRule() {
+        return getCurrentRuleType() == ANTLRv4Lexer.TOKEN_REF;
+    }
+
+    @SuppressWarnings("unused")
+    private boolean inParserRule() { // not used, but added for clarity
+        return getCurrentRuleType() == ANTLRv4Lexer.RULE_REF;
+    }
+
+    @Override
+    public void reset() { // returns the rule-type state to "outside of any rule" before the base reset
+        setCurrentRuleType(Token.INVALID_TYPE);
+        insideOptionsBlock = false;
+        super.reset();
+    }
+}