/*
 * Copyright (c) 2012-2019 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

module antlr.v4.runtime.tree.pattern.ParseTreePatternMatcher;

import std.uni;
import std.conv;
import std.string;
import antlr.v4.runtime.Parser;
import antlr.v4.runtime.ANTLRInputStream;
import antlr.v4.runtime.BailErrorStrategy;
import antlr.v4.runtime.Lexer;
import antlr.v4.runtime.IllegalArgumentException;
import antlr.v4.runtime.ListTokenSource;
import antlr.v4.runtime.ParserInterpreter;
import antlr.v4.runtime.ParserRuleContext;
import antlr.v4.runtime.RuleContext;
import antlr.v4.runtime.RecognitionException;
import antlr.v4.runtime.Token;
import antlr.v4.runtime.TokenConstantDefinition;
import antlr.v4.runtime.CommonTokenStream;
import antlr.v4.runtime.atn.ATNSimulator;
import antlr.v4.runtime.tree.ParseTree;
import antlr.v4.runtime.tree.CannotInvokeStartRule;
import antlr.v4.runtime.tree.StartRuleDoesNotConsumeFullPattern;
import antlr.v4.runtime.tree.TerminalNode;
import antlr.v4.runtime.tree.RuleNode;
import antlr.v4.runtime.tree.pattern.Chunk;
import antlr.v4.runtime.tree.pattern.TagChunk;
import antlr.v4.runtime.tree.pattern.TextChunk;
import antlr.v4.runtime.tree.pattern.TokenTagToken;
import antlr.v4.runtime.tree.pattern.ParseTreePattern;
import antlr.v4.runtime.tree.pattern.ParseTreeMatch;
import antlr.v4.runtime.tree.pattern.RuleTagToken;
import antlr.v4.runtime.misc.ParseCancellationException;

/**
 * A tree pattern matching mechanism for ANTLR {@link ParseTree}s.
 *
 * Patterns are strings of source input text with special tags representing
 * token or rule references such as:
 *
 * <p>{@code <ID> = <expr>;}</p>
 *
 * Given a pattern start rule such as {@code statement}, this object constructs
 * a {@link ParseTree} with placeholders for the {@code ID} and {@code expr}
 * subtrees. Then the {@link #match} routines can compare an actual
 * {@link ParseTree} from a parse with this pattern. Tag {@code <ID>} matches
 * any {@code ID} token and tag {@code <expr>} references the result of the
 * {@code expr} rule (generally an instance of {@code ExprContext}).
 *
 * Pattern {@code x = 0;} is a similar pattern that matches the same syntax
 * except that it requires the identifier to be {@code x} and the expression to
 * be {@code 0}.
 *
 * The {@link #matches} routines return {@code true} or {@code false} based
 * upon a match for the tree rooted at the parameter sent in. The
 * {@link #match} routines return a {@link ParseTreeMatch} object that
 * contains the parse tree, the parse tree pattern, and a map from tag name to
 * matched nodes (more below). A subtree that fails to match returns with
 * {@link ParseTreeMatch#mismatchedNode} set to the first tree node that did not
 * match.
 *
 * For efficiency, you can compile a tree pattern in string form to a
 * {@link ParseTreePattern} object.
 *
 * See {@code TestParseTreeMatcher} for lots of examples.
 * {@link ParseTreePattern} has two static helper methods:
 * {@link ParseTreePattern#findAll} and {@link ParseTreePattern#match} that
 * are easy to use but not super efficient because they create new
 * {@link ParseTreePatternMatcher} objects each time and have to compile the
 * pattern in string form before using it.
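 *
 * A minimal usage sketch follows; the grammar-specific names ({@code MyLexer},
 * {@code MyParser}, {@code statement}, {@code RULE_statement}) are assumptions
 * for illustration only and are not part of this runtime:
 *
 * <pre>
 * {@code
 * auto lexer   = new MyLexer(new ANTLRInputStream(sourceText));
 * auto parser  = new MyParser(new CommonTokenStream(lexer));
 * ParseTree tree = parser.statement();
 *
 * auto matcher = new ParseTreePatternMatcher(lexer, parser);
 * ParseTreePattern p = matcher.compile("<ID> = <expr>;", MyParser.RULE_statement);
 * ParseTreeMatch m = matcher.match(tree, p);   // detailed result with labels
 * bool ok = matcher.matches(tree, p);          // simple yes/no answer
 * }
 * </pre>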
 *
 * The lexer and parser that you pass into the {@link ParseTreePatternMatcher}
 * constructor are used to parse the pattern in string form. The lexer converts
 * the {@code <ID> = <expr>;} into a sequence of four tokens (assuming the lexer
 * throws out whitespace or puts it on a hidden channel). Be aware that the
 * input stream is reset for the lexer (but not the parser; a
 * {@link ParserInterpreter} is created to parse the input). Any user-defined
 * fields you have put into the lexer might get changed when this mechanism asks
 * it to scan the pattern string.
 *
 * Normally a parser does not accept token {@code <expr>} as a valid
 * {@code expr} but, from the parser passed in, we create a special version of
 * the underlying grammar representation (an {@link ATN}) that allows imaginary
 * tokens representing rules ({@code <expr>}) to match entire rules. We call
 * these <em>bypass alternatives</em>.
 *
 * Delimiters are {@code <} and {@code >}, with {@code \} as the escape string
 * by default, but you can set them to whatever you want using
 * {@link #setDelimiters}. You must escape both start and stop strings
 * {@code \<} and {@code \>}.
 */
class ParseTreePatternMatcher
{

    /**
     * This is the backing field for {@link #getLexer()}.
     */
    private Lexer lexer;

    /**
     * This is the backing field for {@link #getParser()}.
     */
    private Parser parser;

    protected string start = "<";

    protected string stop = ">";

    /**
     * e.g., \< and \> must escape BOTH!
     */
    protected string escape = "\\";

    /**
     * Constructs a {@link ParseTreePatternMatcher} from a {@link Lexer} and
     * {@link Parser} object. The lexer input stream is altered for tokenizing
     * the tree patterns. The parser is used as a convenient mechanism to get
     * the grammar name, plus token and rule names.
     */
    public this(Lexer lexer, Parser parser)
    {
        this.lexer = lexer;
        this.parser = parser;
    }

    /**
     * Set the delimiters used for marking rule and token tags within concrete
     * syntax used by the tree pattern parser.
     *
     * @param start The start delimiter.
     * @param stop The stop delimiter.
     * @param escapeLeft The escape sequence to use for escaping a start or stop delimiter.
     *
     * @exception IllegalArgumentException if {@code start} is {@code null} or empty.
     * @exception IllegalArgumentException if {@code stop} is {@code null} or empty.
     */
    public void setDelimiters(string start, string stop, string escapeLeft)
    {
        if (start is null || start.length == 0) {
            throw new IllegalArgumentException("start cannot be null or empty");
        }

        if (stop is null || stop.length == 0) {
            throw new IllegalArgumentException("stop cannot be null or empty");
        }

        this.start = start;
        this.stop = stop;
        this.escape = escapeLeft;
    }

    /**
     * Does {@code pattern} matched as rule {@code patternRuleIndex} match {@code tree}?
     */
    public bool matches(ParseTree tree, string pattern, int patternRuleIndex)
    {
        ParseTreePattern p = compile(pattern, patternRuleIndex);
        return matches(tree, p);
    }

    /**
     * Does {@code pattern} matched as rule {@code patternRuleIndex} match {@code tree}?
     * Pass in a compiled pattern instead of a string representation of a tree pattern.
     */
    public bool matches(ParseTree tree, ParseTreePattern pattern)
    {
        ParseTree[][string] labels;
        ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
        return mismatchedNode is null;
    }

    /**
     * Compare {@code pattern} matched as rule {@code patternRuleIndex} against
     * {@code tree} and return a {@link ParseTreeMatch} object that contains the
     * matched elements, or the node at which the match failed.
     */
    public ParseTreeMatch match(ParseTree tree, string pattern, int patternRuleIndex)
    {
        ParseTreePattern p = compile(pattern, patternRuleIndex);
        return match(tree, p);
    }

    /**
     * Compare {@code pattern} matched against {@code tree} and return a
     * {@link ParseTreeMatch} object that contains the matched elements, or the
     * node at which the match failed. Pass in a compiled pattern instead of a
     * string representation of a tree pattern.
     */
    public ParseTreeMatch match(ParseTree tree, ParseTreePattern pattern)
    {
        ParseTree[][string] labels;
        ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
        return new ParseTreeMatch(tree, pattern, labels, mismatchedNode);
    }

    /**
     * For repeated use of a tree pattern, compile it to a
     * {@link ParseTreePattern} using this method.
     */
    public ParseTreePattern compile(string pattern, int patternRuleIndex)
    {
        auto tokenList = tokenize(pattern);
        ListTokenSource tokenSrc = new ListTokenSource(tokenList);
        CommonTokenStream tokens = new CommonTokenStream(tokenSrc);

        ParserInterpreter parserInterp = new ParserInterpreter(parser.getGrammarFileName(),
                                                               parser.getVocabulary(),
                                                               parser.getRuleNames(),
                                                               parser.getATNWithBypassAlts(),
                                                               tokens);

        ParseTree tree = null;
        try {
            parserInterp.setErrorHandler(new BailErrorStrategy());
            tree = parserInterp.parse(patternRuleIndex);
            // System.out.println("pattern tree = "+tree.toStringTree(parserInterp));
        }
        catch (ParseCancellationException e) {
            throw cast(RecognitionException)e.getCause();
        }
        catch (RecognitionException re) {
            throw re;
        }
        catch (Exception e) {
            throw new CannotInvokeStartRule(e);
        }

        // Make sure tree pattern compilation checks for a complete parse
        if (tokens.LA(1) != TokenConstantDefinition.EOF) {
            throw new StartRuleDoesNotConsumeFullPattern();
        }
        return new ParseTreePattern(this, pattern, patternRuleIndex, tree);
    }

    /**
     * Used to convert the tree pattern string into a series of tokens. The
     * input stream is reset.
     */
    public Lexer getLexer()
    {
        return lexer;
    }

    /**
     * Used to collect the grammar file name, token names, and rule names
     * needed to parse the pattern into a parse tree.
     */
    public Parser getParser()
    {
        return parser;
    }

    /**
     * Recursively walk {@code tree} against {@code patternTree}, filling
     * {@code match.}{@link ParseTreeMatch#labels labels}.
     *
     * @return the first node encountered in {@code tree} which does not match
     * a corresponding node in {@code patternTree}, or {@code null} if the match
     * was successful. The specific node returned depends on the matching
     * algorithm used by the implementation, and may be overridden.
     */
    protected ParseTree matchImpl(ParseTree tree, ParseTree patternTree, ParseTree[][string] labels)
    {
        if (tree is null) {
            throw new IllegalArgumentException("tree cannot be null");
        }

        if (patternTree is null) {
            throw new IllegalArgumentException("patternTree cannot be null");
        }

        // x and <ID>, x and y, or x and x; or could be mismatched types
        if (tree.classinfo == TerminalNode.classinfo && patternTree.classinfo == TerminalNode.classinfo) {
            TerminalNode t1 = cast(TerminalNode)tree;
            TerminalNode t2 = cast(TerminalNode)patternTree;
            ParseTree mismatchedNode = null;
            // both are tokens and they have same type
            if (t1.getSymbol().getType() == t2.getSymbol().getType()) {
                if (t2.getSymbol().classinfo == TokenTagToken.classinfo) { // x and <ID>
                    TokenTagToken tokenTagToken = cast(TokenTagToken)t2.getSymbol();
                    // track label->list-of-nodes for both token name and label (if any)
                    labels[tokenTagToken.getTokenName] ~= tree;
                    if (tokenTagToken.getLabel() !is null) {
                        labels[tokenTagToken.getLabel] ~= tree;
                    }
                }
                else if (t1.getText == t2.getText) {
                    // x and x
                }
                else {
                    // x and y
                    if (mismatchedNode is null) {
                        mismatchedNode = t1;
                    }
                }
            }
            else {
                if (mismatchedNode is null) {
                    mismatchedNode = t1;
                }
            }

            return mismatchedNode;
        }

        if (tree.classinfo == ParserRuleContext.classinfo && patternTree.classinfo == ParserRuleContext.classinfo) {
            ParserRuleContext r1 = cast(ParserRuleContext)tree;
            ParserRuleContext r2 = cast(ParserRuleContext)patternTree;
            ParseTree mismatchedNode = null;
            // (expr ...) and <expr>
            RuleTagToken ruleTagToken = getRuleTagToken(r2);
            if (ruleTagToken !is null) {
                ParseTreeMatch m = null;
                if (r1.getRuleIndex() == r2.getRuleIndex()) {
                    // track label->list-of-nodes for both rule name and label (if any)
                    labels[ruleTagToken.getRuleName] ~= tree;
                    if (ruleTagToken.getLabel() !is null) {
                        labels[ruleTagToken.getLabel] ~= tree;
                    }
                }
                else {
                    if (mismatchedNode is null) {
                        mismatchedNode = r1;
                    }
                }

                return mismatchedNode;
            }

            // (expr ...) and (expr ...)
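            // neither node is a rule tag here, so both contexts must have the
            // same shape: equal child counts and recursively matching children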
            if (r1.getChildCount() != r2.getChildCount()) {
                if (mismatchedNode is null) {
                    mismatchedNode = r1;
                }

                return mismatchedNode;
            }

            int n = r1.getChildCount();
            for (int i = 0; i < n; i++) {
                ParseTree childMatch = matchImpl(r1.getChild(i), patternTree.getChild(i), labels);
                if (childMatch !is null) {
                    return childMatch;
                }
            }

            return mismatchedNode;
        }

        // if nodes aren't both tokens or both rule nodes, can't match
        return tree;
    }

    public RuleTagToken getRuleTagToken(ParseTree t)
    {
        if (t.classinfo == RuleNode.classinfo) {
            RuleNode r = cast(RuleNode)t;
            if (r.getChildCount == 1 && r.getChild(0).classinfo == TerminalNode.classinfo) {
                TerminalNode c = cast(TerminalNode)r.getChild(0);
                if (c.getSymbol().classinfo == RuleTagToken.classinfo) {
                    // System.out.println("rule tag subtree "+t.toStringTree(parser));
                    return cast(RuleTagToken)c.getSymbol();
                }
            }
        }
        return null;
    }

    public Token[] tokenize(string pattern)
    {
        // split pattern into chunks: sea (raw input) and islands (<ID>, <expr>)
        Chunk[] chunks = split(pattern);

        // create token stream from text and tags
        Token[] tokens;
        foreach (Chunk chunk; chunks) {
            if (chunk.classinfo == TagChunk.classinfo) {
                TagChunk tagChunk = cast(TagChunk)chunk;
                // add special rule token or conjure up new token from name
                if (isUpper(tagChunk.getTag()[0])) {
                    int ttype = parser.getTokenType(tagChunk.getTag());
                    if (ttype == TokenConstantDefinition.INVALID_TYPE) {
                        throw new IllegalArgumentException("Unknown token " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
                    }
                    TokenTagToken t = new TokenTagToken(tagChunk.getTag(), ttype, tagChunk.getLabel());
                    tokens ~= t;
                }
                else if (isLower(tagChunk.getTag()[0])) {
                    int ruleIndex = parser.getRuleIndex(tagChunk.getTag());
                    if (ruleIndex == -1) {
                        throw new IllegalArgumentException("Unknown rule " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
                    }
                    int ruleImaginaryTokenType = parser.getATNWithBypassAlts().ruleToTokenType[ruleIndex];
                    tokens ~= new RuleTagToken(tagChunk.getTag(), ruleImaginaryTokenType, tagChunk.getLabel());
                }
                else {
                    throw new IllegalArgumentException("invalid tag: " ~ tagChunk.getTag ~ " in pattern: " ~ pattern);
                }
            }
            else {
                TextChunk textChunk = cast(TextChunk)chunk;
                ANTLRInputStream ins = new ANTLRInputStream(textChunk.getText());
                lexer.setInputStream(ins);
                Token t = lexer.nextToken();
                while (t.getType() != TokenConstantDefinition.EOF) {
                    tokens ~= t;
                    t = lexer.nextToken();
                }
            }
        }

        // System.out.println("tokens="+tokens);
        return tokens;
    }

    /**
     * Split {@code <ID> = <e:expr> ;} into 4 chunks for tokenizing by {@link #tokenize}.
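     *
     * With the default delimiters that example yields, in order: a
     * {@link TagChunk} for {@code ID}, a {@link TextChunk} for {@code " = "},
     * a {@link TagChunk} with label {@code e} and rule {@code expr}, and a
     * {@link TextChunk} for {@code " ;"}.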
     */
    public Chunk[] split(string pattern)
    {
        int p = 0;
        int n = to!int(pattern.length);
        Chunk[] chunks;
        // find all start and stop indexes first, then collect
        int[] starts;
        int[] stops;
        while (p < n) {
            if (p == pattern.indexOf(escape ~ start, p)) {
                p += to!int(escape.length + start.length);
            }
            else if (p == pattern.indexOf(escape ~ stop, p)) {
                p += to!int(escape.length + stop.length);
            }
            else if (p == pattern.indexOf(start, p)) {
                starts ~= p;
                p += to!int(start.length);
            }
            else if (p == pattern.indexOf(stop, p)) {
                stops ~= p;
                p += to!int(stop.length);
            }
            else {
                p++;
            }
        }

        // System.out.println("");
        // System.out.println(starts);
        // System.out.println(stops);
        if (starts.length > stops.length) {
            throw new IllegalArgumentException("unterminated tag in pattern: " ~ pattern);
        }

        if (starts.length < stops.length) {
            throw new IllegalArgumentException("missing start tag in pattern: " ~ pattern);
        }

        int ntags = to!int(starts.length);
        for (int i = 0; i < ntags; i++) {
            if (starts[i] >= stops[i]) {
                throw new IllegalArgumentException("tag delimiters out of order in pattern: " ~ pattern);
            }
        }

        // collect into chunks now
        if (ntags == 0) {
            string text = pattern[0..n];
            chunks ~= new TextChunk(text);
        }

        if (ntags > 0 && starts[0] > 0) { // copy text up to first tag into chunks
            string text = pattern[0..starts[0]];
            chunks ~= new TextChunk(text);
        }
        for (int i = 0; i < ntags; i++) {
            // copy inside of <tag>
            string tag = pattern[starts[i] + start.length..stops[i]];
            string ruleOrToken = tag;
            string label = null;
            int colon = to!int(tag.indexOf(':'));
            if (colon >= 0) {
                label = tag[0..colon];
                ruleOrToken = tag[colon+1..tag.length];
            }
            chunks ~= new TagChunk(label, ruleOrToken);
            if (i + 1 < ntags) {
                // copy from end of <tag> to start of next
                string text = pattern[stops[i] + to!int(stop.length)..starts[i + 1]];
                chunks ~= new TextChunk(text);
            }
        }
        if (ntags > 0) {
            int afterLastTag = to!int(stops[ntags - 1] + stop.length);
            if (afterLastTag < n) { // copy text from end of last tag to end
                string text = pattern[afterLastTag..n];
                chunks ~= new TextChunk(text);
            }
        }

        // strip out the escape sequences from text chunks but not tags
        for (int i = 0; i < chunks.length; i++) {
            Chunk c = chunks[i];
            if (c.classinfo == TextChunk.classinfo) {
                TextChunk tc = cast(TextChunk)c;
                string unescaped = tc.getText().replace(escape, "");
                if (unescaped.length < tc.getText().length) {
                    chunks[i] = new TextChunk(unescaped);
                }
            }
        }

        return chunks;
    }

}
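
// What follows is a minimal, illustrative sketch of matching with custom tag
// delimiters; it is excluded from compilation on purpose, and the
// grammar-specific names (MyLexer, MyParser, statement, RULE_statement) are
// assumptions for illustration only, not part of this runtime.
version (none) unittest
{
    auto lexer = new MyLexer(new ANTLRInputStream("x = 0;"));
    auto parser = new MyParser(new CommonTokenStream(lexer));
    ParseTree tree = parser.statement();

    auto matcher = new ParseTreePatternMatcher(lexer, parser);
    // use << and >> as tag delimiters and $ as the escape string
    matcher.setDelimiters("<<", ">>", "$");
    assert(matcher.matches(tree, "<<ID>> = <<expr>>;", MyParser.RULE_statement));
}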