/*
 * [The "BSD license"]
 * Copyright (c) 2013 Terence Parr
 * Copyright (c) 2013 Sam Harwell
 * Copyright (c) 2017 Egbert Voigt
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

module antlr.v4.runtime.tree.pattern.ParseTreePatternMatcher;

import std.uni;
import std.conv;
import std.string;
import antlr.v4.runtime.Parser;
import antlr.v4.runtime.ANTLRInputStream;
import antlr.v4.runtime.BailErrorStrategy;
import antlr.v4.runtime.Lexer;
import antlr.v4.runtime.IllegalArgumentException;
import antlr.v4.runtime.ListTokenSource;
import antlr.v4.runtime.ParserInterpreter;
import antlr.v4.runtime.ParserRuleContext;
import antlr.v4.runtime.RuleContext;
import antlr.v4.runtime.RecognitionException;
import antlr.v4.runtime.Token;
import antlr.v4.runtime.TokenConstantDefinition;
import antlr.v4.runtime.CommonTokenStream;
import antlr.v4.runtime.atn.ATNSimulator;
import antlr.v4.runtime.tree.ParseTree;
import antlr.v4.runtime.tree.CannotInvokeStartRule;
import antlr.v4.runtime.tree.StartRuleDoesNotConsumeFullPattern;
import antlr.v4.runtime.tree.TerminalNode;
import antlr.v4.runtime.tree.RuleNode;
import antlr.v4.runtime.tree.pattern.Chunk;
import antlr.v4.runtime.tree.pattern.TagChunk;
import antlr.v4.runtime.tree.pattern.TextChunk;
import antlr.v4.runtime.tree.pattern.TokenTagToken;
import antlr.v4.runtime.tree.pattern.ParseTreePattern;
import antlr.v4.runtime.tree.pattern.ParseTreeMatch;
import antlr.v4.runtime.tree.pattern.RuleTagToken;
import antlr.v4.runtime.misc.ParseCancellationException;

// Class ParseTreePatternMatcher
/**
 * @uml
 * A tree pattern matching mechanism for ANTLR {@link ParseTree}s.
 *
 * <p>Patterns are strings of source input text with special tags representing
 * token or rule references such as:</p>
 *
 * <p>{@code <ID> = <expr>;}</p>
 *
 * <p>Given a pattern start rule such as {@code statement}, this object constructs
 * a {@link ParseTree} with placeholders for the {@code ID} and {@code expr}
 * subtree.
 * Then the {@link #match} routines can compare an actual
 * {@link ParseTree} from a parse with this pattern. Tag {@code <ID>} matches
 * any {@code ID} token and tag {@code <expr>} references the result of the
 * {@code expr} rule (generally an instance of {@code ExprContext}).</p>
 *
 * <p>Pattern {@code x = 0;} is a similar pattern that matches the same syntax
 * except that it requires the identifier to be {@code x} and the expression to
 * be {@code 0}.</p>
 *
 * <p>The {@link #matches} routines return {@code true} or {@code false} based
 * upon a match for the tree rooted at the parameter sent in. The
 * {@link #match} routines return a {@link ParseTreeMatch} object that
 * contains the parse tree, the parse tree pattern, and a map from tag name to
 * matched nodes (more below). A subtree that fails to match returns with
 * {@link ParseTreeMatch#mismatchedNode} set to the first tree node that did not
 * match.</p>
 *
 * <p>For efficiency, you can compile a tree pattern in string form to a
 * {@link ParseTreePattern} object.</p>
 *
 * <p>See {@code TestParseTreeMatcher} for lots of examples.
 * {@link ParseTreePattern} has two static helper methods:
 * {@link ParseTreePattern#findAll} and {@link ParseTreePattern#match} that
 * are easy to use but not super efficient because they create new
 * {@link ParseTreePatternMatcher} objects each time and have to compile the
 * pattern in string form before using it.</p>
 *
 * <p>The lexer and parser that you pass into the {@link ParseTreePatternMatcher}
 * constructor are used to parse the pattern in string form. The lexer converts
 * the {@code <ID> = <expr>;} into a sequence of four tokens (assuming the lexer
 * throws out whitespace or puts it on a hidden channel). Be aware that the
 * input stream is reset for the lexer (but not the parser; a
 * {@link ParserInterpreter} is created to parse the input). Any user-defined
 * fields you have put into the lexer might get changed when this mechanism asks
 * it to scan the pattern string.</p>
 *
 * <p>Normally a parser does not accept token {@code <expr>} as a valid
 * {@code expr} but, from the parser passed in, we create a special version of
 * the underlying grammar representation (an {@link ATN}) that allows imaginary
 * tokens representing rules ({@code <expr>}) to match entire rules. We call
 * these <em>bypass alternatives</em>.</p>
 *
 * <p>Delimiters are {@code <} and {@code >}, with {@code \} as the escape string
 * by default, but you can set them to whatever you want using
 * {@link #setDelimiters}. You must escape both start and stop strings
 * {@code \<} and {@code \>}.</p>
 */
class ParseTreePatternMatcher
{

    /**
     * @uml
     * This is the backing field for {@link #getLexer()}.
     */
    private Lexer lexer;

    /**
     * @uml
     * This is the backing field for {@link #getParser()}.
     */
    private Parser parser;

    protected string start = "<";

    protected string stop = ">";

    /**
     * @uml
     * e.g., \< and \> must escape BOTH!
     */
    protected string escape = "\\";

    /**
     * @uml
     * Constructs a {@link ParseTreePatternMatcher} from a {@link Lexer} and
     * {@link Parser} object. The lexer input stream is altered for tokenizing
     * the tree patterns. The parser is used as a convenient mechanism to get
     * the grammar name, plus token and rule names.
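     *
     * <p>A minimal usage sketch; {@code MyLexer}, {@code MyParser}, and
     * {@code RULE_statement} stand in for the classes and constants generated
     * from your own grammar:</p>
     *
     * <pre>{@code
     * auto lexer = new MyLexer(new ANTLRInputStream(input));
     * auto parser = new MyParser(new CommonTokenStream(lexer));
     * auto matcher = new ParseTreePatternMatcher(lexer, parser);
     * ParseTreePattern pattern = matcher.compile("<ID> = <expr>;", MyParser.RULE_statement);
     * // tree: a ParseTree produced by parsing some actual input
     * ParseTreeMatch result = pattern.match(tree);
     * if (result.succeeded()) {
     *     // tree has the shape "<ID> = <expr>;"
     * }
     * }</pre>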
     */
    public this(Lexer lexer, Parser parser)
    {
        this.lexer = lexer;
        this.parser = parser;
    }

    /**
     * @uml
     * Set the delimiters used for marking rule and token tags within concrete
     * syntax used by the tree pattern parser.
     *
     * @param start The start delimiter.
     * @param stop The stop delimiter.
     * @param escapeLeft The escape sequence to use for escaping a start or stop delimiter.
     *
     * @exception IllegalArgumentException if {@code start} is {@code null} or empty.
     * @exception IllegalArgumentException if {@code stop} is {@code null} or empty.
     */
    public void setDelimiters(string start, string stop, string escapeLeft)
    {
        if (start is null || start.length == 0) {
            throw new IllegalArgumentException("start cannot be null or empty");
        }

        if (stop is null || stop.length == 0) {
            throw new IllegalArgumentException("stop cannot be null or empty");
        }

        this.start = start;
        this.stop = stop;
        this.escape = escapeLeft;
    }

    /**
     * @uml
     * Does {@code pattern}, matched as rule {@code patternRuleIndex}, match {@code tree}?
     */
    public bool matches(ParseTree tree, string pattern, int patternRuleIndex)
    {
        ParseTreePattern p = compile(pattern, patternRuleIndex);
        return matches(tree, p);
    }

    /**
     * @uml
     * Does {@code pattern}, matched as rule {@code patternRuleIndex}, match {@code tree}?
     * Pass in a compiled pattern instead of a string representation of a tree pattern.
     */
    public bool matches(ParseTree tree, ParseTreePattern pattern)
    {
        ParseTree[][string] labels;
        ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
        return mismatchedNode is null;
    }

    /**
     * @uml
     * Compare {@code pattern} matched as rule {@code patternRuleIndex} against
     * {@code tree} and return a {@link ParseTreeMatch} object that contains the
     * matched elements, or the node at which the match failed.
     */
    public ParseTreeMatch match(ParseTree tree, string pattern, int patternRuleIndex)
    {
        ParseTreePattern p = compile(pattern, patternRuleIndex);
        return match(tree, p);
    }

    /**
     * @uml
     * Compare {@code pattern} matched against {@code tree} and return a
     * {@link ParseTreeMatch} object that contains the matched elements, or the
     * node at which the match failed. Pass in a compiled pattern instead of a
     * string representation of a tree pattern.
     */
    public ParseTreeMatch match(ParseTree tree, ParseTreePattern pattern)
    {
        ParseTree[][string] labels;
        ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
        return new ParseTreeMatch(tree, pattern, labels, mismatchedNode);
    }

    /**
     * @uml
     * For repeated use of a tree pattern, compile it to a
     * {@link ParseTreePattern} using this method.
     */
    public ParseTreePattern compile(string pattern, int patternRuleIndex)
    {
        auto tokenList = tokenize(pattern);
        ListTokenSource tokenSrc = new ListTokenSource(tokenList);
        CommonTokenStream tokens = new CommonTokenStream(tokenSrc);

        ParserInterpreter parserInterp = new ParserInterpreter(parser.getGrammarFileName(),
                                                               parser.getVocabulary(),
                                                               parser.getRuleNames(),
                                                               parser.getATNWithBypassAlts(),
                                                               tokens);

        ParseTree tree = null;
        try {
            parserInterp.setErrorHandler(new BailErrorStrategy());
            tree = parserInterp.parse(patternRuleIndex);
            // System.out.println("pattern tree = "+tree.toStringTree(parserInterp));
        }
        catch (ParseCancellationException e) {
            throw cast(RecognitionException)e.getCause();
        }
        catch (RecognitionException re) {
            throw re;
        }
        catch (Exception e) {
            throw new CannotInvokeStartRule(e);
        }

        // Make sure tree pattern compilation checks for a complete parse
        if (tokens.LA(1) != TokenConstantDefinition.EOF) {
            throw new StartRuleDoesNotConsumeFullPattern();
        }
        return new ParseTreePattern(this, pattern, patternRuleIndex, tree);
    }

    /**
     * @uml
     * Used to convert the tree pattern string into a series of tokens. The
     * input stream is reset.
     */
    public Lexer getLexer()
    {
        return lexer;
    }

    /**
     * @uml
     * Used to collect the grammar file name, token names, and rule names
     * used to parse the pattern into a parse tree.
     */
    public Parser getParser()
    {
        return parser;
    }

    /**
     * @uml
     * Recursively walk {@code tree} against {@code patternTree}, filling
     * {@code match.}{@link ParseTreeMatch#labels labels}.
     *
     * @return the first node encountered in {@code tree} which does not match
     * a corresponding node in {@code patternTree}, or {@code null} if the match
     * was successful. The specific node returned depends on the matching
     * algorithm used by the implementation, and may be overridden.
     */
    protected ParseTree matchImpl(ParseTree tree, ParseTree patternTree, ref ParseTree[][string] labels)
    {
        if (tree is null) {
            throw new IllegalArgumentException("tree cannot be null");
        }

        if (patternTree is null) {
            throw new IllegalArgumentException("patternTree cannot be null");
        }

        // x and <ID>, x and y, or x and x; or could be mismatched types
        if (tree.classinfo == TerminalNode.classinfo && patternTree.classinfo == TerminalNode.classinfo) {
            TerminalNode t1 = cast(TerminalNode)tree;
            TerminalNode t2 = cast(TerminalNode)patternTree;
            ParseTree mismatchedNode = null;
            // both are tokens and they have same type
            if (t1.getSymbol().getType() == t2.getSymbol().getType()) {
                if (t2.getSymbol().classinfo == TokenTagToken.classinfo) { // x and <ID>
                    TokenTagToken tokenTagToken = cast(TokenTagToken)t2.getSymbol();
                    // track label->list-of-nodes for both token name and label (if any)
                    labels[tokenTagToken.getTokenName] ~= tree;
                    if (tokenTagToken.getLabel() !is null) {
                        labels[tokenTagToken.getLabel] ~= tree;
                    }
                }
                else if (t1.getText == t2.getText) {
                    // x and x
                }
                else {
                    // x and y
                    if (mismatchedNode is null) {
                        mismatchedNode = t1;
                    }
                }
            }
            else {
                if (mismatchedNode is null) {
                    mismatchedNode = t1;
                }
            }

            return mismatchedNode;
        }

        if (tree.classinfo == ParserRuleContext.classinfo && patternTree.classinfo == ParserRuleContext.classinfo) {
            ParserRuleContext r1 = cast(ParserRuleContext)tree;
            ParserRuleContext r2 = cast(ParserRuleContext)patternTree;
            ParseTree mismatchedNode = null;
            // (expr ...) and <expr>
            RuleTagToken ruleTagToken = getRuleTagToken(r2);
            if (ruleTagToken !is null) {
                if (r1.getRuleIndex() == r2.getRuleIndex()) {
                    // track label->list-of-nodes for both rule name and label (if any)
                    labels[ruleTagToken.getRuleName] ~= tree;
                    if (ruleTagToken.getLabel() !is null) {
                        labels[ruleTagToken.getLabel] ~= tree;
                    }
                }
                else {
                    if (mismatchedNode is null) {
                        mismatchedNode = r1;
                    }
                }

                return mismatchedNode;
            }

            // (expr ...) and (expr ...)
            if (r1.getChildCount() != r2.getChildCount()) {
                if (mismatchedNode is null) {
                    mismatchedNode = r1;
                }

                return mismatchedNode;
            }

            int n = r1.getChildCount();
            for (int i = 0; i < n; i++) {
                ParseTree childMatch = matchImpl(r1.getChild(i), patternTree.getChild(i), labels);
                if (childMatch !is null) {
                    return childMatch;
                }
            }

            return mismatchedNode;
        }

        // if nodes aren't both tokens or both rule nodes, can't match
        return tree;
    }

    public RuleTagToken getRuleTagToken(ParseTree t)
    {
        if (t.classinfo == RuleNode.classinfo) {
            RuleNode r = cast(RuleNode)t;
            if (r.getChildCount == 1 && r.getChild(0).classinfo == TerminalNode.classinfo) {
                TerminalNode c = cast(TerminalNode)r.getChild(0);
                if (c.getSymbol().classinfo == RuleTagToken.classinfo) {
                    // System.out.println("rule tag subtree "+t.toStringTree(parser));
                    return cast(RuleTagToken)c.getSymbol();
                }
            }
        }
        return null;
    }

    public Token[] tokenize(string pattern)
    {
        // split pattern into chunks: sea (raw input) and islands (<ID>, <expr>)
        Chunk[] chunks = split(pattern);

        // create token stream from text and tags
        Token[] tokens;
        foreach (Chunk chunk; chunks) {
            if (chunk.classinfo == TagChunk.classinfo) {
                TagChunk tagChunk = cast(TagChunk)chunk;
                // add special rule token or conjure up new token from name
                if (isUpper(tagChunk.getTag()[0])) {
                    int ttype = parser.getTokenType(tagChunk.getTag());
                    if (ttype == TokenConstantDefinition.INVALID_TYPE) {
                        throw new IllegalArgumentException("Unknown token " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
                    }
                    TokenTagToken t = new TokenTagToken(tagChunk.getTag(), ttype, tagChunk.getLabel());
                    tokens ~= t;
                }
                else if (isLower(tagChunk.getTag()[0])) {
                    int ruleIndex = parser.getRuleIndex(tagChunk.getTag());
                    if (ruleIndex == -1) {
                        throw new IllegalArgumentException("Unknown rule " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
                    }
                    int ruleImaginaryTokenType = parser.getATNWithBypassAlts().ruleToTokenType[ruleIndex];
                    tokens ~= new RuleTagToken(tagChunk.getTag(), ruleImaginaryTokenType, tagChunk.getLabel());
                }
                else {
                    throw new IllegalArgumentException("invalid tag: " ~ tagChunk.getTag ~ " in pattern: " ~ pattern);
                }
            }
            else {
                TextChunk textChunk = cast(TextChunk)chunk;
                ANTLRInputStream ins = new ANTLRInputStream(textChunk.getText());
                lexer.setInputStream(ins);
                Token t = lexer.nextToken();
                while (t.getType() != TokenConstantDefinition.EOF) {
                    tokens ~= t;
                    t = lexer.nextToken();
                }
            }
        }

        // System.out.println("tokens="+tokens);
        return tokens;
    }

    /**
     * @uml
     * Split {@code <ID> = <e:expr> ;} into 4 chunks for tokenizing by {@link #tokenize}.
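     *
     * <p>With the default delimiters the example above yields, in order, a tag
     * chunk for {@code ID}, a text chunk for {@code " = "}, a tag chunk for
     * {@code expr} labeled {@code e}, and a text chunk for {@code " ;"}.</p>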
     */
    public Chunk[] split(string pattern)
    {
        int p = 0;
        int n = to!int(pattern.length);
        Chunk[] chunks;
        // find all start and stop indexes first, then collect
        int[] starts;
        int[] stops;
        while (p < n) {
            if (p == pattern.indexOf(escape ~ start, p)) {
                p += to!int(escape.length + start.length);
            }
            else if (p == pattern.indexOf(escape ~ stop, p)) {
                p += to!int(escape.length + stop.length);
            }
            else if (p == pattern.indexOf(start, p)) {
                starts ~= p;
                p += to!int(start.length);
            }
            else if (p == pattern.indexOf(stop, p)) {
                stops ~= p;
                p += to!int(stop.length);
            }
            else {
                p++;
            }
        }

        // System.out.println("");
        // System.out.println(starts);
        // System.out.println(stops);
        if (starts.length > stops.length) {
            throw new IllegalArgumentException("unterminated tag in pattern: " ~ pattern);
        }

        if (starts.length < stops.length) {
            throw new IllegalArgumentException("missing start tag in pattern: " ~ pattern);
        }

        int ntags = to!int(starts.length);
        for (int i = 0; i < ntags; i++) {
            if (starts[i] >= stops[i]) {
                throw new IllegalArgumentException("tag delimiters out of order in pattern: " ~ pattern);
            }
        }

        // collect into chunks now
        if (ntags == 0) {
            string text = pattern[0 .. n];
            chunks ~= new TextChunk(text);
        }

        if (ntags > 0 && starts[0] > 0) { // copy text up to first tag into chunks
            string text = pattern[0 .. starts[0]];
            chunks ~= new TextChunk(text);
        }
        for (int i = 0; i < ntags; i++) {
            // copy inside of <tag>
            string tag = pattern[starts[i] + start.length .. stops[i]];
            string ruleOrToken = tag;
            string label = null;
            int colon = to!int(tag.indexOf(':'));
            if (colon >= 0) {
                label = tag[0 .. colon];
                ruleOrToken = tag[colon + 1 .. tag.length];
            }
            chunks ~= new TagChunk(label, ruleOrToken);
            if (i + 1 < ntags) {
                // copy from end of <tag> to start of next
                string text = pattern[stops[i] + to!int(stop.length) .. starts[i + 1]];
                chunks ~= new TextChunk(text);
            }
        }
        if (ntags > 0) {
            int afterLastTag = to!int(stops[ntags - 1] + stop.length);
            if (afterLastTag < n) { // copy text from end of last tag to end
                string text = pattern[afterLastTag .. n];
                chunks ~= new TextChunk(text);
            }
        }

        // strip out the escape sequences from text chunks but not tags
        for (int i = 0; i < chunks.length; i++) {
            Chunk c = chunks[i];
            if (c.classinfo == TextChunk.classinfo) {
                TextChunk tc = cast(TextChunk)c;
                string unescaped = tc.getText().replace(escape, "");
                if (unescaped.length < tc.getText().length) {
                    chunks[i] = new TextChunk(unescaped);
                }
            }
        }

        return chunks;
    }

}