/*
 * [The "BSD license"]
 * Copyright (c) 2013 Terence Parr
 * Copyright (c) 2013 Sam Harwell
 * Copyright (c) 2017 Egbert Voigt
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 * derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

module antlr.v4.runtime.tree.pattern.ParseTreePatternMatcher;

import std.uni;
import std.conv;
import std.string;
import antlr.v4.runtime.Parser;
import antlr.v4.runtime.ANTLRInputStream;
import antlr.v4.runtime.BailErrorStrategy;
import antlr.v4.runtime.Lexer;
import antlr.v4.runtime.IllegalArgumentException;
import antlr.v4.runtime.ListTokenSource;
import antlr.v4.runtime.ParserInterpreter;
import antlr.v4.runtime.ParserRuleContext;
import antlr.v4.runtime.RuleContext;
import antlr.v4.runtime.RecognitionException;
import antlr.v4.runtime.Token;
import antlr.v4.runtime.TokenConstantDefinition;
import antlr.v4.runtime.CommonTokenStream;
import antlr.v4.runtime.atn.ATNSimulator;
import antlr.v4.runtime.tree.ParseTree;
import antlr.v4.runtime.tree.CannotInvokeStartRule;
import antlr.v4.runtime.tree.StartRuleDoesNotConsumeFullPattern;
import antlr.v4.runtime.tree.TerminalNode;
import antlr.v4.runtime.tree.RuleNode;
import antlr.v4.runtime.tree.pattern.Chunk;
import antlr.v4.runtime.tree.pattern.TagChunk;
import antlr.v4.runtime.tree.pattern.TextChunk;
import antlr.v4.runtime.tree.pattern.TokenTagToken;
import antlr.v4.runtime.tree.pattern.ParseTreePattern;
import antlr.v4.runtime.tree.pattern.ParseTreeMatch;
import antlr.v4.runtime.tree.pattern.RuleTagToken;
import antlr.v4.runtime.misc.ParseCancellationException;

/**
 * @uml
 * A tree pattern matching mechanism for ANTLR {@link ParseTree}s.
 *
 * <p>Patterns are strings of source input text with special tags representing
 * token or rule references such as:</p>
 *
 * <p>{@code <ID> = <expr>;}</p>
 *
 * <p>Given a pattern start rule such as {@code statement}, this object constructs
 * a {@link ParseTree} with placeholders for the {@code ID} and {@code expr}
 * subtree.
 * Then the {@link #match} routines can compare an actual
 * {@link ParseTree} from a parse with this pattern. Tag {@code <ID>} matches
 * any {@code ID} token and tag {@code <expr>} references the result of the
 * {@code expr} rule (generally an instance of {@code ExprContext}).</p>
 *
 * <p>Pattern {@code x = 0;} is a similar pattern that matches the same syntax
 * except that it requires the identifier to be {@code x} and the expression to
 * be {@code 0}.</p>
 *
 * <p>The {@link #matches} routines return {@code true} or {@code false} based
 * upon a match for the tree rooted at the parameter sent in. The
 * {@link #match} routines return a {@link ParseTreeMatch} object that
 * contains the parse tree, the parse tree pattern, and a map from tag name to
 * matched nodes (more below). A subtree that fails to match returns with
 * {@link ParseTreeMatch#mismatchedNode} set to the first tree node that did not
 * match.</p>
 *
 * <p>For efficiency, you can compile a tree pattern in string form to a
 * {@link ParseTreePattern} object.</p>
 *
 * <p>See {@code TestParseTreeMatcher} for lots of examples.
 * {@link ParseTreePattern} has two static helper methods:
 * {@link ParseTreePattern#findAll} and {@link ParseTreePattern#match} that
 * are easy to use but not super efficient because they create new
 * {@link ParseTreePatternMatcher} objects each time and have to compile the
 * pattern in string form before using it.</p>
 *
 * <p>The lexer and parser that you pass into the {@link ParseTreePatternMatcher}
 * constructor are used to parse the pattern in string form. The lexer converts
 * the {@code <ID> = <expr>;} into a sequence of four tokens (assuming the lexer
 * throws out whitespace or puts it on a hidden channel). Be aware that the
 * input stream is reset for the lexer (but not the parser; a
 * {@link ParserInterpreter} is created to parse the input). Any user-defined
 * fields you have put into the lexer might get changed when this mechanism asks
 * it to scan the pattern string.</p>
 *
 * <p>Normally a parser does not accept token {@code <expr>} as a valid
 * {@code expr} but, from the parser passed in, we create a special version of
 * the underlying grammar representation (an {@link ATN}) that allows imaginary
 * tokens representing rules ({@code <expr>}) to match entire rules. We call
 * these <em>bypass alternatives</em>.</p>
 *
 * <p>Delimiters are {@code <} and {@code >}, with {@code \} as the escape string
 * by default, but you can set them to whatever you want using
 * {@link #setDelimiters}. You must escape both start and stop strings
 * {@code \<} and {@code \>}.</p>
 */
class ParseTreePatternMatcher
{

    /**
     * @uml
     * This is the backing field for {@link #getLexer()}.
     */
    private Lexer lexer;

    /**
     * @uml
     * This is the backing field for {@link #getParser()}.
     */
    private Parser parser;

    protected string start = "<";

    protected string stop = ">";

    /**
     * @uml
     * e.g., \< and \> must escape BOTH!
     */
    protected string escape = "\\";

    /**
     * @uml
     * Constructs a {@link ParseTreePatternMatcher} from a {@link Lexer} and
     * {@link Parser} object. The lexer input stream is altered for tokenizing
     * the tree patterns. The parser is used as a convenient mechanism to get
     * the grammar name, plus token and rule names.
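     *
     * <p>A minimal usage sketch; the {@code lexer}, {@code parser}, and
     * {@code tree} variables and the {@code "statement"} rule name are
     * assumptions about the target grammar, not part of this API:</p>
     *
     * <pre>
     * // lexer and parser are instances generated for your grammar;
     * // tree is an existing ParseTree produced by that parser
     * auto matcher = new ParseTreePatternMatcher(lexer, parser);
     * bool ok = matcher.matches(tree, "<ID> = <expr>;", parser.getRuleIndex("statement"));
     * </pre>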
     */
    public this(Lexer lexer, Parser parser)
    {
        this.lexer = lexer;
        this.parser = parser;
    }

    /**
     * @uml
     * Set the delimiters used for marking rule and token tags within concrete
     * syntax used by the tree pattern parser.
     *
     * @param start The start delimiter.
     * @param stop The stop delimiter.
     * @param escapeLeft The escape sequence to use for escaping a start or stop delimiter.
     *
     * @exception IllegalArgumentException if {@code start} is {@code null} or empty.
     * @exception IllegalArgumentException if {@code stop} is {@code null} or empty.
     */
    public void setDelimiters(string start, string stop, string escapeLeft)
    {
        if (start is null || start.length == 0) {
            throw new IllegalArgumentException("start cannot be null or empty");
        }

        if (stop is null || stop.length == 0) {
            throw new IllegalArgumentException("stop cannot be null or empty");
        }

        this.start = start;
        this.stop = stop;
        this.escape = escapeLeft;
    }

    /**
     * @uml
     * Does {@code pattern}, matched as rule {@code patternRuleIndex}, match {@code tree}?
     */
    public bool matches(ParseTree tree, string pattern, int patternRuleIndex)
    {
        ParseTreePattern p = compile(pattern, patternRuleIndex);
        return matches(tree, p);
    }

    /**
     * @uml
     * Does {@code pattern}, matched as rule {@code patternRuleIndex}, match {@code tree}?
     * Pass in a compiled pattern instead of a string representation of a tree pattern.
     */
    public bool matches(ParseTree tree, ParseTreePattern pattern)
    {
        ParseTree[][string] labels;
        ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
        return mismatchedNode is null;
    }

    /**
     * @uml
     * Compare {@code pattern} matched as rule {@code patternRuleIndex} against
     * {@code tree} and return a {@link ParseTreeMatch} object that contains the
     * matched elements, or the node at which the match failed.
     */
    public ParseTreeMatch match(ParseTree tree, string pattern, int patternRuleIndex)
    {
        ParseTreePattern p = compile(pattern, patternRuleIndex);
        return match(tree, p);
    }

    /**
     * @uml
     * Compare {@code pattern} matched against {@code tree} and return a
     * {@link ParseTreeMatch} object that contains the matched elements, or the
     * node at which the match failed. Pass in a compiled pattern instead of a
     * string representation of a tree pattern.
     */
    public ParseTreeMatch match(ParseTree tree, ParseTreePattern pattern)
    {
        ParseTree[][string] labels;
        ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
        return new ParseTreeMatch(tree, pattern, labels, mismatchedNode);
    }

    /**
     * @uml
     * For repeated use of a tree pattern, compile it to a
     * {@link ParseTreePattern} using this method.
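     *
     * <p>A sketch of the intended use; the {@code "statement"} rule name is an
     * assumption about the target grammar:</p>
     *
     * <pre>
     * ParseTreePattern p = matcher.compile("<ID> = <expr>;", parser.getRuleIndex("statement"));
     * ParseTreeMatch m = matcher.match(tree, p);
     * bool ok = matcher.matches(tree, p);
     * </pre>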
     */
    public ParseTreePattern compile(string pattern, int patternRuleIndex)
    {
        auto tokenList = tokenize(pattern);
        ListTokenSource tokenSrc = new ListTokenSource(tokenList);
        CommonTokenStream tokens = new CommonTokenStream(tokenSrc);

        ParserInterpreter parserInterp = new ParserInterpreter(parser.getGrammarFileName(),
                                                               parser.getVocabulary(),
                                                               parser.getRuleNames(),
                                                               parser.getATNWithBypassAlts(),
                                                               tokens);

        ParseTree tree = null;
        try {
            parserInterp.setErrorHandler(new BailErrorStrategy());
            tree = parserInterp.parse(patternRuleIndex);
            // System.out.println("pattern tree = "+tree.toStringTree(parserInterp));
        }
        catch (ParseCancellationException e) {
            throw cast(RecognitionException)e.getCause();
        }
        catch (RecognitionException re) {
            throw re;
        }
        catch (Exception e) {
            throw new CannotInvokeStartRule(e);
        }

        // Make sure tree pattern compilation checks for a complete parse
        if (tokens.LA(1) != TokenConstantDefinition.EOF) {
            throw new StartRuleDoesNotConsumeFullPattern();
        }

        return new ParseTreePattern(this, pattern, patternRuleIndex, tree);
    }

    /**
     * @uml
     * Used to convert the tree pattern string into a series of tokens. The
     * input stream is reset.
     */
    public Lexer getLexer()
    {
        return lexer;
    }

    /**
     * @uml
     * Used to collect the grammar file name, token names, and rule names
     * needed to parse the pattern into a parse tree.
     */
    public Parser getParser()
    {
        return parser;
    }

    /**
     * @uml
     * Recursively walk {@code tree} against {@code patternTree}, filling
     * {@code match.}{@link ParseTreeMatch#labels labels}.
     *
     * @return the first node encountered in {@code tree} which does not match
     * a corresponding node in {@code patternTree}, or {@code null} if the match
     * was successful. The specific node returned depends on the matching
     * algorithm used by the implementation, and may be overridden.
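     *
     * <p>For example, matching the pattern {@code <ID> = <expr>;} against the
     * input {@code x = 0;} records the {@code x} terminal under the
     * {@code "ID"} key and the {@code 0} expression subtree under the
     * {@code "expr"} key of {@code labels}.</p>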
     */
    protected ParseTree matchImpl(ParseTree tree, ParseTree patternTree, ref ParseTree[][string] labels)
    {
        if (tree is null) {
            throw new IllegalArgumentException("tree cannot be null");
        }

        if (patternTree is null) {
            throw new IllegalArgumentException("patternTree cannot be null");
        }

        // x and <ID>, x and y, or x and x; or could be mismatched types
        if (tree.classinfo == TerminalNode.classinfo && patternTree.classinfo == TerminalNode.classinfo) {
            TerminalNode t1 = cast(TerminalNode)tree;
            TerminalNode t2 = cast(TerminalNode)patternTree;
            ParseTree mismatchedNode = null;
            // both are tokens and they have same type
            if (t1.getSymbol().getType() == t2.getSymbol().getType()) {
                if (t2.getSymbol().classinfo == TokenTagToken.classinfo) { // x and <ID>
                    TokenTagToken tokenTagToken = cast(TokenTagToken)t2.getSymbol();
                    // track label->list-of-nodes for both token name and label (if any)
                    labels[tokenTagToken.getTokenName] ~= tree;
                    if (tokenTagToken.getLabel() !is null) {
                        labels[tokenTagToken.getLabel] ~= tree;
                    }
                }
                else if (t1.getText == t2.getText) {
                    // x and x
                }
                else {
                    // x and y
                    if (mismatchedNode is null) {
                        mismatchedNode = t1;
                    }
                }
            }
            else {
                if (mismatchedNode is null) {
                    mismatchedNode = t1;
                }
            }

            return mismatchedNode;
        }

        if (tree.classinfo == ParserRuleContext.classinfo && patternTree.classinfo == ParserRuleContext.classinfo) {
            ParserRuleContext r1 = cast(ParserRuleContext)tree;
            ParserRuleContext r2 = cast(ParserRuleContext)patternTree;
            ParseTree mismatchedNode = null;
            // (expr ...) and <expr>
            RuleTagToken ruleTagToken = getRuleTagToken(r2);
            if (ruleTagToken !is null) {
                if (r1.getRuleIndex() == r2.getRuleIndex()) {
                    // track label->list-of-nodes for both rule name and label (if any)
                    labels[ruleTagToken.getRuleName] ~= tree;
                    if (ruleTagToken.getLabel() !is null) {
                        labels[ruleTagToken.getLabel] ~= tree;
                    }
                }
                else {
                    if (mismatchedNode is null) {
                        mismatchedNode = r1;
                    }
                }

                return mismatchedNode;
            }

            // (expr ...) and (expr ...)
            if (r1.getChildCount() != r2.getChildCount()) {
                if (mismatchedNode is null) {
                    mismatchedNode = r1;
                }

                return mismatchedNode;
            }

            int n = r1.getChildCount();
            for (int i = 0; i < n; i++) {
                ParseTree childMatch = matchImpl(r1.getChild(i), patternTree.getChild(i), labels);
                if (childMatch !is null) {
                    return childMatch;
                }
            }

            return mismatchedNode;
        }

        // if nodes aren't both tokens or both rule nodes, can't match
        return tree;
    }

    public RuleTagToken getRuleTagToken(ParseTree t)
    {
        if (t.classinfo == RuleNode.classinfo) {
            RuleNode r = cast(RuleNode)t;
            if (r.getChildCount == 1 && r.getChild(0).classinfo == TerminalNode.classinfo) {
                TerminalNode c = cast(TerminalNode)r.getChild(0);
                if (c.getSymbol().classinfo == RuleTagToken.classinfo) {
                    // System.out.println("rule tag subtree "+t.toStringTree(parser));
                    return cast(RuleTagToken)c.getSymbol();
                }
            }
        }
        return null;
    }

    public Token[] tokenize(string pattern)
    {
        // split pattern into chunks: sea (raw input) and islands (<ID>, <expr>)
        Chunk[] chunks = split(pattern);

        // create token stream from text and tags
        Token[] tokens;
        foreach (Chunk chunk; chunks) {
            if (chunk.classinfo == TagChunk.classinfo) {
                TagChunk tagChunk = cast(TagChunk)chunk;
                // add special rule token or conjure up new token from name
                if (isUpper(tagChunk.getTag()[0])) {
                    int ttype = parser.getTokenType(tagChunk.getTag());
                    if (ttype == TokenConstantDefinition.INVALID_TYPE) {
                        throw new IllegalArgumentException("Unknown token " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
                    }
                    TokenTagToken t = new TokenTagToken(tagChunk.getTag(), ttype, tagChunk.getLabel());
                    tokens ~= t;
                }
                else if (isLower(tagChunk.getTag()[0])) {
                    int ruleIndex = parser.getRuleIndex(tagChunk.getTag());
                    if (ruleIndex == -1) {
                        throw new IllegalArgumentException("Unknown rule " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
                    }
                    int ruleImaginaryTokenType = parser.getATNWithBypassAlts().ruleToTokenType[ruleIndex];
                    tokens ~= new RuleTagToken(tagChunk.getTag(), ruleImaginaryTokenType, tagChunk.getLabel());
                }
                else {
                    throw new IllegalArgumentException("invalid tag: " ~ tagChunk.getTag ~ " in pattern: " ~ pattern);
                }
            }
            else {
                TextChunk textChunk = cast(TextChunk)chunk;
                ANTLRInputStream ins = new ANTLRInputStream(textChunk.getText());
                lexer.setInputStream(ins);
                Token t = lexer.nextToken();
                while (t.getType() != TokenConstantDefinition.EOF) {
                    tokens ~= t;
                    t = lexer.nextToken();
                }
            }
        }

        // System.out.println("tokens="+tokens);
        return tokens;
    }

    /**
     * @uml
     * Split {@code <ID> = <e:expr> ;} into 4 chunks for tokenizing by {@link #tokenize}.
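     *
     * <p>For the example above the result is, in order: a {@link TagChunk} for
     * {@code ID}, a {@link TextChunk} for {@code " = "}, a {@link TagChunk} for
     * {@code expr} carrying the label {@code e}, and a {@link TextChunk} for
     * {@code " ;"}.</p>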
     */
    public Chunk[] split(string pattern)
    {
        int p = 0;
        int n = to!int(pattern.length);
        Chunk[] chunks;
        // find all start and stop indexes first, then collect
        int[] starts;
        int[] stops;
        while (p < n) {
            if (p == pattern.indexOf(escape ~ start, p)) {
                p += to!int(escape.length + start.length);
            }
            else if (p == pattern.indexOf(escape ~ stop, p)) {
                p += to!int(escape.length + stop.length);
            }
            else if (p == pattern.indexOf(start, p)) {
                starts ~= p;
                p += to!int(start.length);
            }
            else if (p == pattern.indexOf(stop, p)) {
                stops ~= p;
                p += to!int(stop.length);
            }
            else {
                p++;
            }
        }

        // System.out.println("");
        // System.out.println(starts);
        // System.out.println(stops);
        if (starts.length > stops.length) {
            throw new IllegalArgumentException("unterminated tag in pattern: " ~ pattern);
        }

        if (starts.length < stops.length) {
            throw new IllegalArgumentException("missing start tag in pattern: " ~ pattern);
        }

        int ntags = to!int(starts.length);
        for (int i = 0; i < ntags; i++) {
            if (starts[i] >= stops[i]) {
                throw new IllegalArgumentException("tag delimiters out of order in pattern: " ~ pattern);
            }
        }

        // collect into chunks now
        if (ntags == 0) {
            string text = pattern[0..n];
            chunks ~= new TextChunk(text);
        }

        if (ntags > 0 && starts[0] > 0) { // copy text up to first tag into chunks
            string text = pattern[0..starts[0]];
            chunks ~= new TextChunk(text);
        }

        for (int i = 0; i < ntags; i++) {
            // copy inside of <tag>
            string tag = pattern[starts[i] + start.length..stops[i]];
            string ruleOrToken = tag;
            string label = null;
            int colon = to!int(tag.indexOf(':'));
            if (colon >= 0) {
                label = tag[0..colon];
                ruleOrToken = tag[colon+1..tag.length];
            }
            chunks ~= new TagChunk(label, ruleOrToken);
            if (i + 1 < ntags) {
                // copy from end of <tag> to start of next
                string text = pattern[stops[i] + to!int(stop.length)..starts[i + 1]];
                chunks ~= new TextChunk(text);
            }
        }

        if (ntags > 0) {
            int afterLastTag = to!int(stops[ntags - 1] + stop.length);
            if (afterLastTag < n) { // copy text from end of last tag to end
                string text = pattern[afterLastTag..n];
                chunks ~= new TextChunk(text);
            }
        }

        // strip out the escape sequences from text chunks but not tags
        for (int i = 0; i < chunks.length; i++) {
            Chunk c = chunks[i];
            if (c.classinfo == TextChunk.classinfo) {
                TextChunk tc = cast(TextChunk)c;
                string unescaped = tc.getText().replace(escape, "");
                if (unescaped.length < tc.getText().length) {
                    chunks[i] = new TextChunk(unescaped);
                }
            }
        }

        return chunks;
    }

}