antlr.v4.runtime.tree.pattern.ParseTreePatternMatcher source code

1 /*
2  * Copyright (c) 2012-2019 The ANTLR Project. All rights reserved.
3  * Use of this file is governed by the BSD 3-clause license that
4  * can be found in the LICENSE.txt file in the project root.
5  */
6 
7 module antlr.v4.runtime.tree.pattern.ParseTreePatternMatcher;
8 
9 import std.uni;
10 import std.conv;
11 import std.string;
12 import antlr.v4.runtime.Parser;
13 import antlr.v4.runtime.ANTLRInputStream;
14 import antlr.v4.runtime.BailErrorStrategy;
15 import antlr.v4.runtime.Lexer;
16 import antlr.v4.runtime.IllegalArgumentException;
17 import antlr.v4.runtime.ListTokenSource;
18 import antlr.v4.runtime.ParserInterpreter;
19 import antlr.v4.runtime.ParserRuleContext;
20 import antlr.v4.runtime.RuleContext;
21 import antlr.v4.runtime.RecognitionException;
22 import antlr.v4.runtime.Token;
23 import antlr.v4.runtime.TokenConstantDefinition;
24 import antlr.v4.runtime.CommonTokenStream;
25 import antlr.v4.runtime.atn.ATNSimulator;
26 import antlr.v4.runtime.tree.ParseTree;
27 import antlr.v4.runtime.tree.CannotInvokeStartRule;
28 import antlr.v4.runtime.tree.StartRuleDoesNotConsumeFullPattern;
29 import antlr.v4.runtime.tree.TerminalNode;
30 import antlr.v4.runtime.tree.RuleNode;
31 import antlr.v4.runtime.tree.pattern.Chunk;
32 import antlr.v4.runtime.tree.pattern.TagChunk;
33 import antlr.v4.runtime.tree.pattern.TextChunk;
34 import antlr.v4.runtime.tree.pattern.TokenTagToken;
35 import antlr.v4.runtime.tree.pattern.ParseTreePattern;
36 import antlr.v4.runtime.tree.pattern.ParseTreeMatch;
37 import antlr.v4.runtime.tree.pattern.RuleTagToken;
38 import antlr.v4.runtime.misc.ParseCancellationException;
39 
40 /**
41  * A tree pattern matching mechanism for ANTLR {@link ParseTree}s.
42  *
43  * Patterns are strings of source input text with special tags representing
44  * token or rule references such as:
45  *
46  * <p>{@code <ID> = <expr>;}</p>
47  *
48  * Given a pattern start rule such as {@code statement}, this object constructs
49  * a {@link ParseTree} with placeholders for the {@code ID} and {@code expr}
50  * subtree. Then the {@link #match} routines can compare an actual
51  * {@link ParseTree} from a parse with this pattern. Tag {@code <ID>} matches
52  * any {@code ID} token and tag {@code <expr>} references the result of the
53  * {@code expr} rule (generally an instance of {@code ExprContext}).
54  *
55  * Pattern {@code x = 0;} is a similar pattern that matches the same pattern
56  * except that it requires the identifier to be {@code x} and the expression to
57  * be {@code 0}.
58  *
59  * The {@link #matches} routines return {@code true} or {@code false} based
60  * upon a match for the tree rooted at the parameter sent in. The
61  * {@link #match} routines return a {@link ParseTreeMatch} object that
62  * contains the parse tree, the parse tree pattern, and a map from tag name to
63  * matched nodes (more below). A subtree that fails to match, returns with
64  * {@link ParseTreeMatch#mismatchedNode} set to the first tree node that did not
65  * match.
66  *
67  * For efficiency, you can compile a tree pattern in string form to a
68  * {@link ParseTreePattern} object.
69  *
70  * See {@code TestParseTreeMatcher} for lots of examples.
71  * {@link ParseTreePattern} has two static helper methods:
72  * {@link ParseTreePattern#findAll} and {@link ParseTreePattern#match} that
73  * are easy to use but not super efficient because they create new
74  * {@link ParseTreePatternMatcher} objects each time and have to compile the
75  * pattern in string form before using it.
76  *
77  * The lexer and parser that you pass into the {@link ParseTreePatternMatcher}
78  * constructor are used to parse the pattern in string form. The lexer converts
79  * the {@code <ID> = <expr>;} into a sequence of four tokens (assuming lexer
80  * throws out whitespace or puts it on a hidden channel). Be aware that the
81  * input stream is reset for the lexer (but not the parser; a
82  * {@link ParserInterpreter} is created to parse the input.). Any user-defined
83  * fields you have put into the lexer might get changed when this mechanism asks
84  * it to scan the pattern string.
85  *
86  * Normally a parser does not accept token {@code <expr>} as a valid
87  * {@code expr} but, from the parser passed in, we create a special version of
88  * the underlying grammar representation (an {@link ATN}) that allows imaginary
89  * tokens representing rules ({@code <expr>}) to match entire rules. We call
90  * these <em>bypass alternatives</em>.
91  *
92  * Delimiters are {@code <} and {@code >}, with {@code \} as the escape string
93  * by default, but you can set them to whatever you want using
94  * {@link #setDelimiters}. You must escape both start and stop strings
95  * {@code \<} and {@code \>}.
96  */
97 class ParseTreePatternMatcher
98 {
99 
100     /**
101      * This is the backing field for {@link #getLexer()}.
102      */
103     private Lexer lexer;
104 
105     /**
106      * This is the backing field for {@link #getParser()}.
107      */
108     private Parser parser;
109 
110     protected string start = "<";
111 
112     protected string stop = ">";
113 
114     /**
115      * e.g., \< and \> must escape BOTH!
116      */
117     protected string escape = "\\";
118 
119     /**
120      * Constructs a {@link ParseTreePatternMatcher} or from a {@link Lexer} and
121      * {@link Parser} object. The lexer input stream is altered for tokenizing
122      * the tree patterns. The parser is used as a convenient mechanism to get
123      * the grammar name, plus token, rule names.
124      */
125     public this(Lexer lexer, Parser parser)
126     {
127         this.lexer = lexer;
128         this.parser = parser;
129     }
130 
131     /**
132      * Set the delimiters used for marking rule and token tags within concrete
133      * syntax used by the tree pattern parser.
134      *
135      *  @param start The start delimiter.
136      *  @param stop The stop delimiter.
137      *  @param escapeLeft The escape sequence to use for escaping a start or stop delimiter.
138      *
139      *  @exception IllegalArgumentException if {@code start} is {@code null} or empty.
140      *  @exception IllegalArgumentException if {@code stop} is {@code null} or empty.
141      */
142     public void setDelimiters(string start, string stop, string escapeLeft)
143     {
144         if (start is null || start.length) {
145             throw new IllegalArgumentException("start cannot be null or empty");
146         }
147 
148         if (stop is null || stop.length) {
149             throw new IllegalArgumentException("stop cannot be null or empty");
150         }
151 
152         this.start = start;
153         this.stop = stop;
154         this.escape = escapeLeft;
155     }
156 
157     /**
158      * Does {@code pattern} matched as rule {@code patternRuleIndex} match {@code tree}?
159      */
160     public bool matches(ParseTree tree, string pattern, int patternRuleIndex)
161     {
162         ParseTreePattern p = compile(pattern, patternRuleIndex);
163         return matches(tree, p);
164     }
165 
166     /**
167      * Does {@code pattern} matched as rule patternRuleIndex match tree? Pass in a
168      * compiled pattern instead of a string representation of a tree pattern.
169      */
170     public bool matches(ParseTree tree, ParseTreePattern pattern)
171     {
172         ParseTree[][string] labels;
173         ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
174         return mismatchedNode is null;
175     }
176 
177     /**
178      * Compare {@code pattern} matched as rule {@code patternRuleIndex} against
179      * {@code tree} and return a {@link ParseTreeMatch} object that contains the
180      * matched elements, or the node at which the match failed.
181      */
182     public ParseTreeMatch match(ParseTree tree, string pattern, int patternRuleIndex)
183     {
184         ParseTreePattern p = compile(pattern, patternRuleIndex);
185         return match(tree, p);
186     }
187 
188     /**
189      * Compare {@code pattern} matched against {@code tree} and return a
190      * {@link ParseTreeMatch} object that contains the matched elements, or thenode at which the match failed. Pass in a compiled pattern instead of a
191      * string representation of a tree pattern.
192      */
193     public ParseTreeMatch match(ParseTree tree, ParseTreePattern pattern)
194     {
195         ParseTree[][string] labels;
196         ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
197         return new ParseTreeMatch(tree, pattern, labels, mismatchedNode);
198     }
199 
200     /**
201      * For repeated use of a tree pattern, compile it to a
202      * {@link ParseTreePattern} using this method.
203      */
204     public ParseTreePattern compile(string pattern, int patternRuleIndex)
205     {
206 	auto tokenList = tokenize(pattern);
207         ListTokenSource tokenSrc = new ListTokenSource(tokenList);
208         CommonTokenStream tokens = new CommonTokenStream(tokenSrc);
209 
210         ParserInterpreter parserInterp = new ParserInterpreter(parser.getGrammarFileName(),
211                                                                parser.getVocabulary(),
212                                                                parser.getRuleNames(),
213                                                                parser.getATNWithBypassAlts(),
214                                                                tokens);
215 
216         ParseTree tree = null;
217         try {
218             parserInterp.setErrorHandler(new BailErrorStrategy());
219             tree = parserInterp.parse(patternRuleIndex);
220             //			System.out.println("pattern tree = "+tree.toStringTree(parserInterp));
221         }
222         catch (ParseCancellationException e) {
223             throw cast(RecognitionException)e.getCause();
224         }
225         catch (RecognitionException re) {
226             throw re;
227         }
228         catch (Exception e) {
229             throw new CannotInvokeStartRule(e);
230         }
231 
232         // Make sure tree pattern compilation checks for a complete parse
233         if ( tokens.LA(1)!=TokenConstantDefinition.EOF ) {
234             throw new StartRuleDoesNotConsumeFullPattern();
235         }
236         return new ParseTreePattern(this, pattern, patternRuleIndex, tree);
237     }
238 
239     /**
240      * Used to convert the tree pattern string into a series of tokens. The
241      * input stream is reset.
242      */
243     public Lexer getLexer()
244     {
245         return lexer;
246     }
247 
248     /**
249      * Used to collect to the grammar file name, token names, rule names for
250      * used to parse the pattern into a parse tree.
251      */
252     public Parser getParser()
253     {
254         return parser;
255     }
256 
257     /**
258      * Recursively walk {@code tree} against {@code patternTree}, filling
259      * {@code match.}{@link ParseTreeMatch#labels labels}.
260      *
261      *  @return the first node encountered in {@code tree} which does not match
262      * a corresponding node in {@code patternTree}, or {@code null} if the match
263      * was successful. The specific node returned depends on the matching
264      * algorithm used by the implementation, and may be overridden.
265      */
266     protected ParseTree matchImpl(ParseTree tree, ParseTree patternTree, ParseTree[][string] labels)
267     {
268 	if (tree is null) {
269             throw new IllegalArgumentException("tree cannot be null");
270         }
271 
272         if (patternTree is null) {
273             throw new IllegalArgumentException("patternTree cannot be null");
274         }
275 
276         // x and <ID>, x and y, or x and x; or could be mismatched types
277         if (tree.classinfo == TerminalNode.classinfo && patternTree.classinfo == TerminalNode.classinfo) {
278             TerminalNode t1 = cast(TerminalNode)tree;
279             TerminalNode t2 = cast(TerminalNode)patternTree;
280             ParseTree mismatchedNode = null;
281             // both are tokens and they have same type
282             if (t1.getSymbol().getType() == t2.getSymbol().getType() ) {
283                 if (t2.getSymbol().classinfo == TokenTagToken.classinfo) { // x and <ID>
284                     TokenTagToken tokenTagToken = cast(TokenTagToken)t2.getSymbol();
285                     // track label->list-of-nodes for both token name and label (if any)
286                     labels[tokenTagToken.getTokenName] ~= tree;
287                     if (tokenTagToken.getLabel() !is null) {
288                         labels[tokenTagToken.getLabel] ~= tree;
289                     }
290                 }
291                 else if (t1.getText == t2.getText) {
292                     // x and x
293                 }
294                 else {
295                     // x and y
296                     if (mismatchedNode is null) {
297                         mismatchedNode = t1;
298                     }
299                 }
300             }
301             else {
302                 if (mismatchedNode is null) {
303                     mismatchedNode = t1;
304                 }
305             }
306 
307             return mismatchedNode;
308         }
309         if (tree.classinfo == ParserRuleContext.classinfo && patternTree.classinfo == ParserRuleContext.classinfo) {
310             ParserRuleContext r1 = cast(ParserRuleContext)tree;
311             ParserRuleContext r2 = cast(ParserRuleContext)patternTree;
312             ParseTree mismatchedNode = null;
313             // (expr ...) and <expr>
314             RuleTagToken ruleTagToken = getRuleTagToken(r2);
315             if ( ruleTagToken !is null ) {
316                 ParseTreeMatch m = null;
317                 if (r1.getRuleIndex() == r2.getRuleIndex()) {
318                     // track label->list-of-nodes for both rule name and label (if any)
319                     labels[ruleTagToken.getRuleName] ~= tree;
320                     if ( ruleTagToken.getLabel() !is null ) {
321                         labels[ruleTagToken.getLabel] ~= tree;
322                     }
323                 }
324                 else {
325                     if (mismatchedNode is null) {
326                         mismatchedNode = r1;
327                     }
328                 }
329 
330                 return mismatchedNode;
331             }
332 
333             // (expr ...) and (expr ...)
334             if (r1.getChildCount() != r2.getChildCount()) {
335                 if (mismatchedNode is null) {
336                     mismatchedNode = r1;
337                 }
338 
339                 return mismatchedNode;
340             }
341 
342             int n = r1.getChildCount();
343             for (int i = 0; i<n; i++) {
344                 ParseTree childMatch = matchImpl(r1.getChild(i), patternTree.getChild(i), labels);
345                 if ( childMatch !is null ) {
346                     return childMatch;
347                 }
348             }
349 
350             return mismatchedNode;
351         }
352 
353         // if nodes aren't both tokens or both rule nodes, can't match
354         return tree;
355     }
356 
357     public RuleTagToken getRuleTagToken(ParseTree t)
358     {
359         if (t.classinfo == RuleNode.classinfo) {
360             RuleNode r = cast(RuleNode)t;
361             if (r.getChildCount == 1 && r.getChild(0).classinfo == TerminalNode.classinfo) {
362                 TerminalNode c = cast(TerminalNode)r.getChild(0);
363                 if (c.getSymbol().classinfo == RuleTagToken.classinfo) {
364                     //	System.out.println("rule tag subtree "+t.toStringTree(parser));
365                     return cast(RuleTagToken)c.getSymbol();
366                 }
367             }
368         }
369         return null;
370     }
371 
372     public Token[] tokenize(string pattern)
373     {
374 	// split pattern into chunks: sea (raw input) and islands (<ID>, <expr>)
375         Chunk[] chunks = split(pattern);
376 
377         // create token stream from text and tags
378         Token[] tokens;
379         foreach (Chunk chunk; chunks) {
380             if (chunk.classinfo == TagChunk.classinfo) {
381                 TagChunk tagChunk = cast(TagChunk)chunk;
382                 // add special rule token or conjure up new token from name
383                 if (isUpper(tagChunk.getTag()[0])) {
384                     int ttype = parser.getTokenType(tagChunk.getTag());
385                     if (ttype == TokenConstantDefinition.INVALID_TYPE ) {
386                         throw new IllegalArgumentException("Unknown token " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
387                     }
388                     TokenTagToken t = new TokenTagToken(tagChunk.getTag(), ttype, tagChunk.getLabel());
389                     tokens ~= t;
390                 }
391                 else if (isLower(tagChunk.getTag()[0]) ) {
392                     int ruleIndex = parser.getRuleIndex(tagChunk.getTag());
393                     if (ruleIndex == -1) {
394                         throw new IllegalArgumentException("Unknown rule " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
395                     }
396                     int ruleImaginaryTokenType = parser.getATNWithBypassAlts().ruleToTokenType[ruleIndex];
397                     tokens ~= new RuleTagToken(tagChunk.getTag(), ruleImaginaryTokenType, tagChunk.getLabel());
398                 }
399                 else {
400                     throw new IllegalArgumentException("invalid tag: " ~ tagChunk.getTag ~ " in pattern: " ~ pattern);
401                 }
402             }
403             else {
404                 TextChunk textChunk = cast(TextChunk)chunk;
405                 ANTLRInputStream ins = new ANTLRInputStream(textChunk.getText());
406                 lexer.setInputStream(ins);
407                 Token t = lexer.nextToken();
408                 while (t.getType() != TokenConstantDefinition.EOF) {
409                     tokens ~= t;
410                     t = lexer.nextToken();
411                 }
412             }
413         }
414 
415         //		System.out.println("tokens="+tokens);
416         return tokens;
417     }
418 
419     /**
420      * Split {@code <ID> = <e:expr> ;} into 4 chunks for tokenizing by {@link #tokenize}.
421      */
422     public Chunk[] split(string pattern)
423     {
424         int p = 0;
425         int n = to!int(pattern.length);
426         Chunk[] chunks;
427         // find all start and stop indexes first, then collect
428         int[] starts;
429         int[] stops;
430         while (p < n) {
431             if (p == pattern.indexOf(escape ~ start, p) ) {
432                 p += to!int(escape.length + start.length);
433             }
434             else if ( p == pattern.indexOf(escape ~ stop, p) ) {
435                 p += to!int(escape.length + stop.length);
436             }
437             else if ( p == pattern.indexOf(start,p) ) {
438                 starts ~= p;
439                 p += to!int(start.length);
440             }
441             else if ( p == pattern.indexOf(stop,p) ) {
442                 stops ~= p;
443                 p += to!int(stop.length);
444             }
445             else {
446                 p++;
447             }
448         }
449 
450         //		System.out.println("");
451         //		System.out.println(starts);
452         //		System.out.println(stops);
453         if (starts.length > stops.length) {
454             throw new IllegalArgumentException("unterminated tag in pattern: " ~ pattern);
455         }
456 
457         if ( starts.length < stops.length) {
458             throw new IllegalArgumentException("missing start tag in pattern: " ~ pattern);
459         }
460 
461         int ntags = to!int(starts.length);
462         for (int i=0; i<ntags; i++) {
463             if ( starts[i] >= stops[i] ) {
464                 throw new IllegalArgumentException("tag delimiters out of order in pattern: " ~ pattern);
465             }
466         }
467 
468         // collect into chunks now
469         if (ntags == 0) {
470             string text = pattern[0.. n];
471             chunks ~= new TextChunk(text);
472         }
473 
474         if (ntags>0 && starts[0] > 0) { // copy text up to first tag into chunks
475             string text = pattern[0..starts[0]];
476             chunks ~= new TextChunk(text);
477         }
478         for (int i=0; i<ntags; i++) {
479             // copy inside of <tag>
480             string tag = pattern[starts[i] + start.length..stops[i]];
481             string ruleOrToken = tag;
482             string label = null;
483             int colon = to!int(tag.indexOf(':'));
484             if (colon >= 0) {
485                 label = tag[0..colon];
486                 ruleOrToken = tag[colon+1..tag.length];
487             }
488             chunks ~= new TagChunk(label, ruleOrToken);
489             if (i + 1 < ntags) {
490                 // copy from end of <tag> to start of next
491                 string text = pattern[stops[i] + to!int(stop.length)..starts[i + 1]];
492                 chunks ~= new TextChunk(text);
493             }
494         }
495         if ( ntags > 0 ) {
496             int afterLastTag = to!int(stops[ntags - 1] + stop.length);
497             if ( afterLastTag < n ) { // copy text from end of last tag to end
498                 string text = pattern[afterLastTag..n];
499                 chunks ~= new TextChunk(text);
500             }
501         }
502 
503         // strip out the escape sequences from text chunks but not tags
504         for (int i = 0; i < chunks.length; i++) {
505             Chunk c = chunks[i];
506             if (c.classinfo == TextChunk.classinfo) {
507                 TextChunk tc = cast(TextChunk)c;
508                 string unescaped = tc.getText().replace(escape, "");
509                 if (unescaped.length < tc.getText().length) {
510                     chunks[i] = new TextChunk(unescaped);
511                 }
512             }
513         }
514 
515         return chunks;
516     }
517 
518 }