/*
 * [The "BSD license"]
 * Copyright (c) 2013 Terence Parr
 * Copyright (c) 2013 Sam Harwell
 * Copyright (c) 2017 Egbert Voigt
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

module antlr.v4.runtime.tree.pattern.ParseTreePatternMatcher;

import std.uni;
import std.conv;
import std.string;
import antlr.v4.runtime.Parser;
import antlr.v4.runtime.ANTLRInputStream;
import antlr.v4.runtime.BailErrorStrategy;
import antlr.v4.runtime.Lexer;
import antlr.v4.runtime.IllegalArgumentException;
import antlr.v4.runtime.ListTokenSource;
import antlr.v4.runtime.ParserInterpreter;
import antlr.v4.runtime.ParserRuleContext;
import antlr.v4.runtime.RuleContext;
import antlr.v4.runtime.RecognitionException;
import antlr.v4.runtime.Token;
import antlr.v4.runtime.TokenConstantDefinition;
import antlr.v4.runtime.CommonTokenStream;
import antlr.v4.runtime.atn.ATNSimulator;
import antlr.v4.runtime.tree.ParseTree;
import antlr.v4.runtime.tree.CannotInvokeStartRule;
import antlr.v4.runtime.tree.StartRuleDoesNotConsumeFullPattern;
import antlr.v4.runtime.tree.TerminalNode;
import antlr.v4.runtime.tree.RuleNode;
import antlr.v4.runtime.tree.pattern.Chunk;
import antlr.v4.runtime.tree.pattern.TagChunk;
import antlr.v4.runtime.tree.pattern.TextChunk;
import antlr.v4.runtime.tree.pattern.TokenTagToken;
import antlr.v4.runtime.tree.pattern.ParseTreePattern;
import antlr.v4.runtime.tree.pattern.ParseTreeMatch;
import antlr.v4.runtime.tree.pattern.RuleTagToken;
import antlr.v4.runtime.misc.ParseCancellationException;

// Class ParseTreePatternMatcher
/**
 * @uml
 * A tree pattern matching mechanism for ANTLR {@link ParseTree}s.
 *
 * <p>Patterns are strings of source input text with special tags representing
 * token or rule references such as:</p>
 *
 * <p>{@code <ID> = <expr>;}</p>
 *
 * <p>Given a pattern start rule such as {@code statement}, this object constructs
 * a {@link ParseTree} with placeholders for the {@code ID} and {@code expr}
 * subtree.
 * Then the {@link #match} routines can compare an actual
 * {@link ParseTree} from a parse with this pattern. Tag {@code <ID>} matches
 * any {@code ID} token and tag {@code <expr>} references the result of the
 * {@code expr} rule (generally an instance of {@code ExprContext}).</p>
 *
 * <p>Pattern {@code x = 0;} is a similar pattern that matches the same syntax
 * except that it requires the identifier to be {@code x} and the expression to
 * be {@code 0}.</p>
 *
 * <p>The {@link #matches} routines return {@code true} or {@code false} based
 * upon a match for the tree rooted at the parameter sent in. The
 * {@link #match} routines return a {@link ParseTreeMatch} object that
 * contains the parse tree, the parse tree pattern, and a map from tag name to
 * matched nodes (more below). A subtree that fails to match returns with
 * {@link ParseTreeMatch#mismatchedNode} set to the first tree node that did not
 * match.</p>
 *
 * <p>For efficiency, you can compile a tree pattern in string form to a
 * {@link ParseTreePattern} object.</p>
 *
 * <p>See {@code TestParseTreeMatcher} for lots of examples.
 * {@link ParseTreePattern} has two static helper methods:
 * {@link ParseTreePattern#findAll} and {@link ParseTreePattern#match} that
 * are easy to use but not super efficient because they create new
 * {@link ParseTreePatternMatcher} objects each time and have to compile the
 * pattern in string form before using it.</p>
 *
 * <p>The lexer and parser that you pass into the {@link ParseTreePatternMatcher}
 * constructor are used to parse the pattern in string form. The lexer converts
 * the {@code <ID> = <expr>;} into a sequence of four tokens (assuming the lexer
 * throws out whitespace or puts it on a hidden channel). Be aware that the
 * input stream is reset for the lexer (but not the parser; a
 * {@link ParserInterpreter} is created to parse the input). Any user-defined
 * fields you have put into the lexer might get changed when this mechanism asks
 * it to scan the pattern string.</p>
 *
 * <p>Normally a parser does not accept token {@code <expr>} as a valid
 * {@code expr} but, from the parser passed in, we create a special version of
 * the underlying grammar representation (an {@link ATN}) that allows imaginary
 * tokens representing rules ({@code <expr>}) to match entire rules. We call
 * these <em>bypass alternatives</em>.</p>
 *
 * <p>Delimiters are {@code <} and {@code >}, with {@code \} as the escape string
 * by default, but you can set them to whatever you want using
 * {@link #setDelimiters}. You must escape both start and stop strings
 * {@code \<} and {@code \>}.</p>
 */
class ParseTreePatternMatcher
{

    /**
     * @uml
     * This is the backing field for {@link #getLexer()}.
     */
    private Lexer lexer;

    /**
     * @uml
     * This is the backing field for {@link #getParser()}.
     */
    private Parser parser;

    protected string start = "<";

    protected string stop = ">";

    /**
     * @uml
     * e.g., \< and \> must escape BOTH!
     */
    protected string escape = "\\";

    /**
     * @uml
     * Constructs a {@link ParseTreePatternMatcher} from a {@link Lexer} and
     * {@link Parser} object. The lexer input stream is altered for tokenizing
     * the tree patterns. The parser is used as a convenient mechanism to get
     * the grammar name, plus token and rule names.
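     *
     * <p>A minimal usage sketch; {@code MyLexer}, {@code MyParser}, and
     * {@code RULE_statement} stand in for the classes and constants generated
     * from your own grammar:</p>
     *
     * <pre>{@code
     * auto lexer = new MyLexer(new ANTLRInputStream(input));
     * auto parser = new MyParser(new CommonTokenStream(lexer));
     * auto matcher = new ParseTreePatternMatcher(lexer, parser);
     * ParseTreePattern pattern = matcher.compile("<ID> = <expr>;", MyParser.RULE_statement);
     * // tree: a ParseTree produced by parsing some actual input
     * ParseTreeMatch result = pattern.match(tree);
     * if (result.succeeded()) {
     *     // tree has the shape "<ID> = <expr>;"
     * }
     * }</pre>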
     */
    public this(Lexer lexer, Parser parser)
    {
        this.lexer = lexer;
        this.parser = parser;
    }

    /**
     * @uml
     * Set the delimiters used for marking rule and token tags within concrete
     * syntax used by the tree pattern parser.
     *
     * @param start The start delimiter.
     * @param stop The stop delimiter.
     * @param escapeLeft The escape sequence to use for escaping a start or stop delimiter.
     *
     * @exception IllegalArgumentException if {@code start} is {@code null} or empty.
     * @exception IllegalArgumentException if {@code stop} is {@code null} or empty.
     */
    public void setDelimiters(string start, string stop, string escapeLeft)
    {
        if (start is null || start.length == 0) {
            throw new IllegalArgumentException("start cannot be null or empty");
        }

        if (stop is null || stop.length == 0) {
            throw new IllegalArgumentException("stop cannot be null or empty");
        }

        this.start = start;
        this.stop = stop;
        this.escape = escapeLeft;
    }

    /**
     * @uml
     * Does {@code pattern}, matched as rule {@code patternRuleIndex}, match {@code tree}?
     */
    public bool matches(ParseTree tree, string pattern, int patternRuleIndex)
    {
        ParseTreePattern p = compile(pattern, patternRuleIndex);
        return matches(tree, p);
    }

    /**
     * @uml
     * Does {@code pattern}, matched as rule {@code patternRuleIndex}, match {@code tree}?
     * Pass in a compiled pattern instead of a string representation of a tree pattern.
     */
    public bool matches(ParseTree tree, ParseTreePattern pattern)
    {
        ParseTree[][string] labels;
        ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
        return mismatchedNode is null;
    }

    /**
     * @uml
     * Compare {@code pattern} matched as rule {@code patternRuleIndex} against
     * {@code tree} and return a {@link ParseTreeMatch} object that contains the
     * matched elements, or the node at which the match failed.
     */
    public ParseTreeMatch match(ParseTree tree, string pattern, int patternRuleIndex)
    {
        ParseTreePattern p = compile(pattern, patternRuleIndex);
        return match(tree, p);
    }

    /**
     * @uml
     * Compare {@code pattern} matched against {@code tree} and return a
     * {@link ParseTreeMatch} object that contains the matched elements, or the
     * node at which the match failed. Pass in a compiled pattern instead of a
     * string representation of a tree pattern.
     */
    public ParseTreeMatch match(ParseTree tree, ParseTreePattern pattern)
    {
        ParseTree[][string] labels;
        ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
        return new ParseTreeMatch(tree, pattern, labels, mismatchedNode);
    }

    /**
     * @uml
     * For repeated use of a tree pattern, compile it to a
     * {@link ParseTreePattern} using this method.
     */
    public ParseTreePattern compile(string pattern, int patternRuleIndex)
    {
        auto tokenList = tokenize(pattern);
        ListTokenSource tokenSrc = new ListTokenSource(tokenList);
        CommonTokenStream tokens = new CommonTokenStream(tokenSrc);

        ParserInterpreter parserInterp = new ParserInterpreter(parser.getGrammarFileName(),
                                                               parser.getVocabulary(),
                                                               parser.getRuleNames(),
                                                               parser.getATNWithBypassAlts(),
                                                               tokens);

        ParseTree tree = null;
        try {
            parserInterp.setErrorHandler(new BailErrorStrategy());
            tree = parserInterp.parse(patternRuleIndex);
            // System.out.println("pattern tree = "+tree.toStringTree(parserInterp));
        }
        catch (ParseCancellationException e) {
            throw cast(RecognitionException)e.getCause();
        }
        catch (RecognitionException re) {
            throw re;
        }
        catch (Exception e) {
            throw new CannotInvokeStartRule(e);
        }

        // Make sure tree pattern compilation checks for a complete parse
        if (tokens.LA(1) != TokenConstantDefinition.EOF) {
            throw new StartRuleDoesNotConsumeFullPattern();
        }
        return new ParseTreePattern(this, pattern, patternRuleIndex, tree);
    }

    /**
     * @uml
     * Used to convert the tree pattern string into a series of tokens. The
     * input stream is reset.
     */
    public Lexer getLexer()
    {
        return lexer;
    }

    /**
     * @uml
     * Used to collect the grammar file name, token names, and rule names
     * used to parse the pattern into a parse tree.
     */
    public Parser getParser()
    {
        return parser;
    }

    /**
     * @uml
     * Recursively walk {@code tree} against {@code patternTree}, filling
     * {@code match.}{@link ParseTreeMatch#labels labels}.
     *
     * @return the first node encountered in {@code tree} which does not match
     * a corresponding node in {@code patternTree}, or {@code null} if the match
     * was successful. The specific node returned depends on the matching
     * algorithm used by the implementation, and may be overridden.
     */
    protected ParseTree matchImpl(ParseTree tree, ParseTree patternTree, ref ParseTree[][string] labels)
    {
        if (tree is null) {
            throw new IllegalArgumentException("tree cannot be null");
        }

        if (patternTree is null) {
            throw new IllegalArgumentException("patternTree cannot be null");
        }

        // x and <ID>, x and y, or x and x; or could be mismatched types
        if (tree.classinfo == TerminalNode.classinfo && patternTree.classinfo == TerminalNode.classinfo) {
            TerminalNode t1 = cast(TerminalNode)tree;
            TerminalNode t2 = cast(TerminalNode)patternTree;
            ParseTree mismatchedNode = null;
            // both are tokens and they have same type
            if (t1.getSymbol().getType() == t2.getSymbol().getType()) {
                if (t2.getSymbol().classinfo == TokenTagToken.classinfo) { // x and <ID>
                    TokenTagToken tokenTagToken = cast(TokenTagToken)t2.getSymbol();
                    // track label->list-of-nodes for both token name and label (if any)
                    labels[tokenTagToken.getTokenName] ~= tree;
                    if (tokenTagToken.getLabel() !is null) {
                        labels[tokenTagToken.getLabel] ~= tree;
                    }
                }
                else if (t1.getText == t2.getText) {
                    // x and x
                }
                else {
                    // x and y
                    if (mismatchedNode is null) {
                        mismatchedNode = t1;
                    }
                }
            }
            else {
                if (mismatchedNode is null) {
                    mismatchedNode = t1;
                }
            }

            return mismatchedNode;
        }

        if (tree.classinfo == ParserRuleContext.classinfo && patternTree.classinfo == ParserRuleContext.classinfo) {
            ParserRuleContext r1 = cast(ParserRuleContext)tree;
            ParserRuleContext r2 = cast(ParserRuleContext)patternTree;
            ParseTree mismatchedNode = null;
            // (expr ...) and <expr>
            RuleTagToken ruleTagToken = getRuleTagToken(r2);
            if (ruleTagToken !is null) {
                if (r1.getRuleIndex() == r2.getRuleIndex()) {
                    // track label->list-of-nodes for both rule name and label (if any)
                    labels[ruleTagToken.getRuleName] ~= tree;
                    if (ruleTagToken.getLabel() !is null) {
                        labels[ruleTagToken.getLabel] ~= tree;
                    }
                }
                else {
                    if (mismatchedNode is null) {
                        mismatchedNode = r1;
                    }
                }

                return mismatchedNode;
            }

            // (expr ...) and (expr ...)
            if (r1.getChildCount() != r2.getChildCount()) {
                if (mismatchedNode is null) {
                    mismatchedNode = r1;
                }

                return mismatchedNode;
            }

            int n = r1.getChildCount();
            for (int i = 0; i < n; i++) {
                ParseTree childMatch = matchImpl(r1.getChild(i), patternTree.getChild(i), labels);
                if (childMatch !is null) {
                    return childMatch;
                }
            }

            return mismatchedNode;
        }

        // if nodes aren't both tokens or both rule nodes, can't match
        return tree;
    }

    public RuleTagToken getRuleTagToken(ParseTree t)
    {
        if (t.classinfo == RuleNode.classinfo) {
            RuleNode r = cast(RuleNode)t;
            if (r.getChildCount == 1 && r.getChild(0).classinfo == TerminalNode.classinfo) {
                TerminalNode c = cast(TerminalNode)r.getChild(0);
                if (c.getSymbol().classinfo == RuleTagToken.classinfo) {
                    // System.out.println("rule tag subtree "+t.toStringTree(parser));
                    return cast(RuleTagToken)c.getSymbol();
                }
            }
        }
        return null;
    }

    public Token[] tokenize(string pattern)
    {
        // split pattern into chunks: sea (raw input) and islands (<ID>, <expr>)
        Chunk[] chunks = split(pattern);

        // create token stream from text and tags
        Token[] tokens;
        foreach (Chunk chunk; chunks) {
            if (chunk.classinfo == TagChunk.classinfo) {
                TagChunk tagChunk = cast(TagChunk)chunk;
                // add special rule token or conjure up new token from name
                if (isUpper(tagChunk.getTag()[0])) {
                    int ttype = parser.getTokenType(tagChunk.getTag());
                    if (ttype == TokenConstantDefinition.INVALID_TYPE) {
                        throw new IllegalArgumentException("Unknown token " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
                    }
                    TokenTagToken t = new TokenTagToken(tagChunk.getTag(), ttype, tagChunk.getLabel());
                    tokens ~= t;
                }
                else if (isLower(tagChunk.getTag()[0])) {
                    int ruleIndex = parser.getRuleIndex(tagChunk.getTag());
                    if (ruleIndex == -1) {
                        throw new IllegalArgumentException("Unknown rule " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
                    }
                    int ruleImaginaryTokenType = parser.getATNWithBypassAlts().ruleToTokenType[ruleIndex];
                    tokens ~= new RuleTagToken(tagChunk.getTag(), ruleImaginaryTokenType, tagChunk.getLabel());
                }
                else {
                    throw new IllegalArgumentException("invalid tag: " ~ tagChunk.getTag ~ " in pattern: " ~ pattern);
                }
            }
            else {
                TextChunk textChunk = cast(TextChunk)chunk;
                ANTLRInputStream ins = new ANTLRInputStream(textChunk.getText());
                lexer.setInputStream(ins);
                Token t = lexer.nextToken();
                while (t.getType() != TokenConstantDefinition.EOF) {
                    tokens ~= t;
                    t = lexer.nextToken();
                }
            }
        }

        // System.out.println("tokens="+tokens);
        return tokens;
    }

    /**
     * @uml
     * Split {@code <ID> = <e:expr> ;} into 4 chunks for tokenizing by {@link #tokenize}.
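     *
     * <p>With the default delimiters the example above yields, in order, a tag
     * chunk for {@code ID}, a text chunk for {@code " = "}, a tag chunk for
     * {@code expr} labeled {@code e}, and a text chunk for {@code " ;"}.</p>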
     */
    public Chunk[] split(string pattern)
    {
        int p = 0;
        int n = to!int(pattern.length);
        Chunk[] chunks;
        // find all start and stop indexes first, then collect
        int[] starts;
        int[] stops;
        while (p < n) {
            if (p == pattern.indexOf(escape ~ start, p)) {
                p += to!int(escape.length + start.length);
            }
            else if (p == pattern.indexOf(escape ~ stop, p)) {
                p += to!int(escape.length + stop.length);
            }
            else if (p == pattern.indexOf(start, p)) {
                starts ~= p;
                p += to!int(start.length);
            }
            else if (p == pattern.indexOf(stop, p)) {
                stops ~= p;
                p += to!int(stop.length);
            }
            else {
                p++;
            }
        }

        // System.out.println("");
        // System.out.println(starts);
        // System.out.println(stops);
        if (starts.length > stops.length) {
            throw new IllegalArgumentException("unterminated tag in pattern: " ~ pattern);
        }

        if (starts.length < stops.length) {
            throw new IllegalArgumentException("missing start tag in pattern: " ~ pattern);
        }

        int ntags = to!int(starts.length);
        for (int i = 0; i < ntags; i++) {
            if (starts[i] >= stops[i]) {
                throw new IllegalArgumentException("tag delimiters out of order in pattern: " ~ pattern);
            }
        }

        // collect into chunks now
        if (ntags == 0) {
            string text = pattern[0 .. n];
            chunks ~= new TextChunk(text);
        }

        if (ntags > 0 && starts[0] > 0) { // copy text up to first tag into chunks
            string text = pattern[0 .. starts[0]];
            chunks ~= new TextChunk(text);
        }
        for (int i = 0; i < ntags; i++) {
            // copy inside of <tag>
            string tag = pattern[starts[i] + start.length .. stops[i]];
            string ruleOrToken = tag;
            string label = null;
            int colon = to!int(tag.indexOf(':'));
            if (colon >= 0) {
                label = tag[0 .. colon];
                ruleOrToken = tag[colon + 1 .. tag.length];
            }
            chunks ~= new TagChunk(label, ruleOrToken);
            if (i + 1 < ntags) {
                // copy from end of <tag> to start of next
                string text = pattern[stops[i] + to!int(stop.length) .. starts[i + 1]];
                chunks ~= new TextChunk(text);
            }
        }
        if (ntags > 0) {
            int afterLastTag = to!int(stops[ntags - 1] + stop.length);
            if (afterLastTag < n) { // copy text from end of last tag to end
                string text = pattern[afterLastTag .. n];
                chunks ~= new TextChunk(text);
            }
        }

        // strip out the escape sequences from text chunks but not tags
        for (int i = 0; i < chunks.length; i++) {
            Chunk c = chunks[i];
            if (c.classinfo == TextChunk.classinfo) {
                TextChunk tc = cast(TextChunk)c;
                string unescaped = tc.getText().replace(escape, "");
                if (unescaped.length < tc.getText().length) {
                    chunks[i] = new TextChunk(unescaped);
                }
            }
        }

        return chunks;
    }

}