/*
 * [The "BSD license"]
 * Copyright (c) 2013 Terence Parr
 * Copyright (c) 2013 Sam Harwell
 * Copyright (c) 2017 Egbert Voigt
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

module antlr.v4.runtime.tree.pattern.ParseTreePatternMatcher;

import std.uni;
import std.conv;
import std.string;
import antlr.v4.runtime.Parser;
import antlr.v4.runtime.ANTLRInputStream;
import antlr.v4.runtime.BailErrorStrategy;
import antlr.v4.runtime.Lexer;
import antlr.v4.runtime.IllegalArgumentException;
import antlr.v4.runtime.ListTokenSource;
import antlr.v4.runtime.ParserInterpreter;
import antlr.v4.runtime.ParserRuleContext;
import antlr.v4.runtime.RuleContext;
import antlr.v4.runtime.RecognitionException;
import antlr.v4.runtime.Token;
import antlr.v4.runtime.TokenConstantDefinition;
import antlr.v4.runtime.CommonTokenStream;
import antlr.v4.runtime.atn.ATNSimulator;
import antlr.v4.runtime.tree.ParseTree;
import antlr.v4.runtime.tree.CannotInvokeStartRule;
import antlr.v4.runtime.tree.StartRuleDoesNotConsumeFullPattern;
import antlr.v4.runtime.tree.TerminalNode;
import antlr.v4.runtime.tree.RuleNode;
import antlr.v4.runtime.tree.pattern.Chunk;
import antlr.v4.runtime.tree.pattern.TagChunk;
import antlr.v4.runtime.tree.pattern.TextChunk;
import antlr.v4.runtime.tree.pattern.TokenTagToken;
import antlr.v4.runtime.tree.pattern.ParseTreePattern;
import antlr.v4.runtime.tree.pattern.ParseTreeMatch;
import antlr.v4.runtime.tree.pattern.RuleTagToken;
import antlr.v4.runtime.misc.ParseCancellationException;

/**
 * @uml
 * A tree pattern matching mechanism for ANTLR {@link ParseTree}s.
 *
 * <p>Patterns are strings of source input text with special tags representing
 * token or rule references such as:</p>
 *
 * <p>{@code <ID> = <expr>;}</p>
 *
 * <p>Given a pattern start rule such as {@code statement}, this object constructs
 * a {@link ParseTree} with placeholders for the {@code ID} and {@code expr}
 * subtree. Then the {@link #match} routines can compare an actual
 * {@link ParseTree} from a parse with this pattern. Tag {@code <ID>} matches
 * any {@code ID} token and tag {@code <expr>} references the result of the
 * {@code expr} rule (generally an instance of {@code ExprContext}).</p>
 *
 * <p>Pattern {@code x = 0;} is similar, except that it requires the identifier
 * to be {@code x} and the expression to be {@code 0}.</p>
 *
 * <p>The {@link #matches} routines return {@code true} or {@code false} based
 * upon a match for the tree rooted at the parameter sent in. The
 * {@link #match} routines return a {@link ParseTreeMatch} object that
 * contains the parse tree, the parse tree pattern, and a map from tag name to
 * matched nodes (more below). A subtree that fails to match returns with
 * {@link ParseTreeMatch#mismatchedNode} set to the first tree node that did not
 * match.</p>
 *
 * <p>For efficiency, you can compile a tree pattern in string form to a
 * {@link ParseTreePattern} object.</p>
 *
 * <p>See {@code TestParseTreeMatcher} for lots of examples.
 * {@link ParseTreePattern} has two static helper methods:
 * {@link ParseTreePattern#findAll} and {@link ParseTreePattern#match} that
 * are easy to use but not super efficient because they create new
 * {@link ParseTreePatternMatcher} objects each time and have to compile the
 * pattern in string form before using it.</p>
 *
 * <p>The lexer and parser that you pass into the {@link ParseTreePatternMatcher}
 * constructor are used to parse the pattern in string form. The lexer converts
 * the {@code <ID> = <expr>;} into a sequence of four tokens (assuming the lexer
 * throws out whitespace or puts it on a hidden channel). Be aware that the
 * input stream is reset for the lexer (but not the parser; a
 * {@link ParserInterpreter} is created to parse the input). Any user-defined
 * fields you have put into the lexer might get changed when this mechanism asks
 * it to scan the pattern string.</p>
 *
 * <p>Normally a parser does not accept token {@code <expr>} as a valid
 * {@code expr} but, from the parser passed in, we create a special version of
 * the underlying grammar representation (an {@link ATN}) that allows imaginary
 * tokens representing rules ({@code <expr>}) to match entire rules. We call
 * these <em>bypass alternatives</em>.</p>
 *
 * <p>Delimiters are {@code <} and {@code >}, with {@code \} as the escape string
 * by default, but you can set them to whatever you want using
 * {@link #setDelimiters}. You must escape both start and stop strings
 * {@code \<} and {@code \>}.</p>
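 *
 * <p>A minimal usage sketch; {@code XLexer}, {@code XParser}, and
 * {@code RULE_statement} stand for a hypothetical generated lexer/parser pair
 * and are not part of this runtime:</p>
 *
 * <pre>{@code
 * auto lexer = new XLexer(new ANTLRInputStream("x = 0;"));
 * auto parser = new XParser(new CommonTokenStream(lexer));
 * ParseTree tree = parser.statement();
 *
 * auto matcher = new ParseTreePatternMatcher(lexer, parser);
 * if (matcher.matches(tree, "<ID> = <expr>;", XParser.RULE_statement)) {
 *     // the statement assigns some expression to an identifier
 * }
 * }</pre>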
 */
class ParseTreePatternMatcher
{

    /**
     * @uml
     * This is the backing field for {@link #getLexer()}.
     */
    private Lexer lexer;

    /**
     * @uml
     * This is the backing field for {@link #getParser()}.
     */
    private Parser parser;

    protected string start = "<";

    protected string stop = ">";

    /**
     * @uml
     * The escape string; both the start and stop delimiters must be escaped,
     * e.g., {@code \<} and {@code \>}.
     */
    protected string escape = "\\";

    /**
     * @uml
     * Constructs a {@link ParseTreePatternMatcher} from a {@link Lexer} and
     * {@link Parser} object. The lexer input stream is altered for tokenizing
     * the tree patterns. The parser is used as a convenient mechanism to get
     * the grammar name, plus token and rule names.
     */
    public this(Lexer lexer, Parser parser)
    {
        this.lexer = lexer;
        this.parser = parser;
    }

    /**
     * @uml
     * Set the delimiters used for marking rule and token tags within concrete
     * syntax used by the tree pattern parser.
     *
     *  @param start The start delimiter.
     *  @param stop The stop delimiter.
     *  @param escapeLeft The escape sequence to use for escaping a start or stop delimiter.
     *
     *  @exception IllegalArgumentException if {@code start} is {@code null} or empty.
     *  @exception IllegalArgumentException if {@code stop} is {@code null} or empty.
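     *
     *  <p>A sketch; the replacement delimiter strings below are arbitrary
     *  choices, not defaults:</p>
     *  <pre>{@code
     *  matcher.setDelimiters("<<", ">>", "$");
     *  // tags are now written <<ID>>; a literal << in a pattern is written $<<
     *  }</pre>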
     */
    public void setDelimiters(string start, string stop, string escapeLeft)
    {
        if (start is null || start.length == 0) {
            throw new IllegalArgumentException("start cannot be null or empty");
        }

        if (stop is null || stop.length == 0) {
            throw new IllegalArgumentException("stop cannot be null or empty");
        }

        this.start = start;
        this.stop = stop;
        this.escape = escapeLeft;
    }

    /**
     * @uml
     * Does {@code pattern}, matched as rule {@code patternRuleIndex}, match {@code tree}?
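     *
     * <p>A sketch ({@code XParser.RULE_statement} stands for a generated rule
     * index constant and is not part of this runtime):</p>
     * <pre>{@code
     * bool ok = matcher.matches(tree, "<ID> = <expr>;", XParser.RULE_statement);
     * }</pre>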
     */
    public bool matches(ParseTree tree, string pattern, int patternRuleIndex)
    {
        ParseTreePattern p = compile(pattern, patternRuleIndex);
        return matches(tree, p);
    }

    /**
     * @uml
     * Does {@code pattern}, matched as rule {@code patternRuleIndex}, match
     * {@code tree}? Pass in a compiled pattern instead of a string representation
     * of a tree pattern.
     */
    public bool matches(ParseTree tree, ParseTreePattern pattern)
    {
        ParseTree[][string] labels;
        ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
        return mismatchedNode is null;
    }

    /**
     * @uml
     * Compare {@code pattern} matched as rule {@code patternRuleIndex} against
     * {@code tree} and return a {@link ParseTreeMatch} object that contains the
     * matched elements, or the node at which the match failed.
     */
    public ParseTreeMatch match(ParseTree tree, string pattern, int patternRuleIndex)
    {
        ParseTreePattern p = compile(pattern, patternRuleIndex);
        return match(tree, p);
    }

    /**
     * @uml
     * Compare {@code pattern} matched against {@code tree} and return a
     * {@link ParseTreeMatch} object that contains the matched elements, or the
     * node at which the match failed. Pass in a compiled pattern instead of a
     * string representation of a tree pattern.
     */
    public ParseTreeMatch match(ParseTree tree, ParseTreePattern pattern)
    {
        ParseTree[][string] labels;
        ParseTree mismatchedNode = matchImpl(tree, pattern.getPatternTree(), labels);
        return new ParseTreeMatch(tree, pattern, labels, mismatchedNode);
    }

    /**
     * @uml
     * For repeated use of a tree pattern, compile it to a
     * {@link ParseTreePattern} using this method.
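     *
     * <p>A sketch of compiling once and reusing the result; the rule index
     * constant and the {@code candidateSubtrees} range are illustrative:</p>
     * <pre>{@code
     * ParseTreePattern p = matcher.compile("<ID> = <expr>;", XParser.RULE_statement);
     * foreach (ParseTree t; candidateSubtrees) {
     *     if (matcher.matches(t, p)) {
     *         // handle the matching subtree
     *     }
     * }
     * }</pre>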
     */
    public ParseTreePattern compile(string pattern, int patternRuleIndex)
    {
        auto tokenList = tokenize(pattern);
        ListTokenSource tokenSrc = new ListTokenSource(tokenList);
        CommonTokenStream tokens = new CommonTokenStream(tokenSrc);

        ParserInterpreter parserInterp = new ParserInterpreter(parser.getGrammarFileName(),
                                                               parser.getVocabulary(),
                                                               parser.getRuleNames(),
                                                               parser.getATNWithBypassAlts(),
                                                               tokens);

        ParseTree tree = null;
        try {
            parserInterp.setErrorHandler(new BailErrorStrategy());
            tree = parserInterp.parse(patternRuleIndex);
            //			System.out.println("pattern tree = "+tree.toStringTree(parserInterp));
        }
        catch (ParseCancellationException e) {
            throw cast(RecognitionException)e.getCause();
        }
        catch (RecognitionException re) {
            throw re;
        }
        catch (Exception e) {
            throw new CannotInvokeStartRule(e);
        }

        // Make sure tree pattern compilation checks for a complete parse
        if ( tokens.LA(1)!=TokenConstantDefinition.EOF ) {
            throw new StartRuleDoesNotConsumeFullPattern();
        }
        return new ParseTreePattern(this, pattern, patternRuleIndex, tree);
    }

    /**
     * @uml
     * Used to convert the tree pattern string into a series of tokens. The
     * input stream is reset.
     */
    public Lexer getLexer()
    {
        return lexer;
    }

    /**
     * @uml
     * Used to collect the grammar file name, token names, and rule names
     * needed to parse the pattern into a parse tree.
     */
    public Parser getParser()
    {
        return parser;
    }

    /**
     * @uml
     * Recursively walk {@code tree} against {@code patternTree}, filling
     * {@code match.}{@link ParseTreeMatch#labels labels}.
     *
     *  @return the first node encountered in {@code tree} which does not match
     * a corresponding node in {@code patternTree}, or {@code null} if the match
     * was successful. The specific node returned depends on the matching
     * algorithm used by the implementation, and may be overridden.
     */
    protected ParseTree matchImpl(ParseTree tree, ParseTree patternTree, ParseTree[][string] labels)
    {
        if (tree is null) {
            throw new IllegalArgumentException("tree cannot be null");
        }

        if (patternTree is null) {
            throw new IllegalArgumentException("patternTree cannot be null");
        }

        // x and <ID>, x and y, or x and x; or could be mismatched types
        if (tree.classinfo == TerminalNode.classinfo && patternTree.classinfo == TerminalNode.classinfo) {
            TerminalNode t1 = cast(TerminalNode)tree;
            TerminalNode t2 = cast(TerminalNode)patternTree;
            ParseTree mismatchedNode = null;
            // both are tokens and they have same type
            if (t1.getSymbol().getType() == t2.getSymbol().getType() ) {
                if (t2.getSymbol().classinfo == TokenTagToken.classinfo) { // x and <ID>
                    TokenTagToken tokenTagToken = cast(TokenTagToken)t2.getSymbol();
                    // track label->list-of-nodes for both token name and label (if any)
                    labels[tokenTagToken.getTokenName] ~= tree;
                    if (tokenTagToken.getLabel() !is null) {
                        labels[tokenTagToken.getLabel] ~= tree;
                    }
                }
                else if (t1.getText == t2.getText) {
                    // x and x
                }
                else {
                    // x and y
                    if (mismatchedNode is null) {
                        mismatchedNode = t1;
                    }
                }
            }
            else {
                if (mismatchedNode is null) {
                    mismatchedNode = t1;
                }
            }

            return mismatchedNode;
        }
        if (tree.classinfo == ParserRuleContext.classinfo && patternTree.classinfo == ParserRuleContext.classinfo) {
            ParserRuleContext r1 = cast(ParserRuleContext)tree;
            ParserRuleContext r2 = cast(ParserRuleContext)patternTree;
            ParseTree mismatchedNode = null;
            // (expr ...) and <expr>
            RuleTagToken ruleTagToken = getRuleTagToken(r2);
            if ( ruleTagToken !is null ) {
                if (r1.getRuleIndex() == r2.getRuleIndex()) {
                    // track label->list-of-nodes for both rule name and label (if any)
                    labels[ruleTagToken.getRuleName] ~= tree;
                    if ( ruleTagToken.getLabel() !is null ) {
                        labels[ruleTagToken.getLabel] ~= tree;
                    }
                }
                else {
                    if (mismatchedNode is null) {
                        mismatchedNode = r1;
                    }
                }

                return mismatchedNode;
            }

            // (expr ...) and (expr ...)
            if (r1.getChildCount() != r2.getChildCount()) {
                if (mismatchedNode is null) {
                    mismatchedNode = r1;
                }

                return mismatchedNode;
            }

            int n = r1.getChildCount();
            for (int i = 0; i<n; i++) {
                ParseTree childMatch = matchImpl(r1.getChild(i), patternTree.getChild(i), labels);
                if ( childMatch !is null ) {
                    return childMatch;
                }
            }

            return mismatchedNode;
        }

        // if nodes aren't both tokens or both rule nodes, can't match
        return tree;
    }

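    /**
     * @uml
     * Is {@code t} a {@code (expr <expr>)} subtree, i.e. a rule node whose only
     * child is a terminal carrying a {@link RuleTagToken}?
     */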
    public RuleTagToken getRuleTagToken(ParseTree t)
    {
        if (t.classinfo == RuleNode.classinfo) {
            RuleNode r = cast(RuleNode)t;
            if (r.getChildCount == 1 && r.getChild(0).classinfo == TerminalNode.classinfo) {
                TerminalNode c = cast(TerminalNode)r.getChild(0);
                if (c.getSymbol().classinfo == RuleTagToken.classinfo) {
                    //	System.out.println("rule tag subtree "+t.toStringTree(parser));
                    return cast(RuleTagToken)c.getSymbol();
                }
            }
        }
        return null;
    }

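    /**
     * @uml
     * Convert the tree pattern string into a list of tokens: text chunks are
     * run through the lexer, while tag chunks become {@link TokenTagToken} or
     * {@link RuleTagToken} instances.
     */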
    public Token[] tokenize(string pattern)
    {
        // split pattern into chunks: sea (raw input) and islands (<ID>, <expr>)
        Chunk[] chunks = split(pattern);

        // create token stream from text and tags
        Token[] tokens;
        foreach (Chunk chunk; chunks) {
            if (chunk.classinfo == TagChunk.classinfo) {
                TagChunk tagChunk = cast(TagChunk)chunk;
                // add special rule token or conjure up new token from name
                if (isUpper(tagChunk.getTag()[0])) {
                    int ttype = parser.getTokenType(tagChunk.getTag());
                    if (ttype == TokenConstantDefinition.INVALID_TYPE ) {
                        throw new IllegalArgumentException("Unknown token " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
                    }
                    TokenTagToken t = new TokenTagToken(tagChunk.getTag(), ttype, tagChunk.getLabel());
                    tokens ~= t;
                }
                else if (isLower(tagChunk.getTag()[0]) ) {
                    int ruleIndex = parser.getRuleIndex(tagChunk.getTag());
                    if (ruleIndex == -1) {
                        throw new IllegalArgumentException("Unknown rule " ~ tagChunk.getTag() ~ " in pattern: " ~ pattern);
                    }
                    int ruleImaginaryTokenType = parser.getATNWithBypassAlts().ruleToTokenType[ruleIndex];
                    tokens ~= new RuleTagToken(tagChunk.getTag(), ruleImaginaryTokenType, tagChunk.getLabel());
                }
                else {
                    throw new IllegalArgumentException("invalid tag: " ~ tagChunk.getTag ~ " in pattern: " ~ pattern);
                }
            }
            else {
                TextChunk textChunk = cast(TextChunk)chunk;
                ANTLRInputStream ins = new ANTLRInputStream(textChunk.getText());
                lexer.setInputStream(ins);
                Token t = lexer.nextToken();
                while (t.getType() != TokenConstantDefinition.EOF) {
                    tokens ~= t;
                    t = lexer.nextToken();
                }
            }
        }

        //		System.out.println("tokens="+tokens);
        return tokens;
    }

    /**
     * @uml
     * Split {@code <ID> = <e:expr> ;} into 4 chunks for tokenizing by {@link #tokenize}.
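     *
     * <p>With the default delimiters, that example yields, in order,
     * {@code TagChunk(null, "ID")}, {@code TextChunk(" = ")},
     * {@code TagChunk("e", "expr")}, and {@code TextChunk(" ;")}.</p>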
     */
    public Chunk[] split(string pattern)
    {
        int p = 0;
        int n = to!int(pattern.length);
        Chunk[] chunks;
        // find all start and stop indexes first, then collect
        int[] starts;
        int[] stops;
        while (p < n) {
            if (p == pattern.indexOf(escape ~ start, p) ) {
                p += to!int(escape.length + start.length);
            }
            else if ( p == pattern.indexOf(escape ~ stop, p) ) {
                p += to!int(escape.length + stop.length);
            }
            else if ( p == pattern.indexOf(start,p) ) {
                starts ~= p;
                p += to!int(start.length);
            }
            else if ( p == pattern.indexOf(stop,p) ) {
                stops ~= p;
                p += to!int(stop.length);
            }
            else {
                p++;
            }
        }

        //		System.out.println("");
        //		System.out.println(starts);
        //		System.out.println(stops);
        if (starts.length > stops.length) {
            throw new IllegalArgumentException("unterminated tag in pattern: " ~ pattern);
        }

        if ( starts.length < stops.length) {
            throw new IllegalArgumentException("missing start tag in pattern: " ~ pattern);
        }

        int ntags = to!int(starts.length);
        for (int i=0; i<ntags; i++) {
            if ( starts[i] >= stops[i] ) {
                throw new IllegalArgumentException("tag delimiters out of order in pattern: " ~ pattern);
            }
        }

        // collect into chunks now
        if (ntags == 0) {
            string text = pattern[0..n];
            chunks ~= new TextChunk(text);
        }

        if (ntags>0 && starts[0] > 0) { // copy text up to first tag into chunks
            string text = pattern[0..starts[0]];
            chunks ~= new TextChunk(text);
        }
        for (int i=0; i<ntags; i++) {
            // copy inside of <tag>
            string tag = pattern[starts[i] + start.length..stops[i]];
            string ruleOrToken = tag;
            string label = null;
            int colon = to!int(tag.indexOf(':'));
            if (colon >= 0) {
                label = tag[0..colon];
                ruleOrToken = tag[colon+1..tag.length];
            }
            chunks ~= new TagChunk(label, ruleOrToken);
            if (i + 1 < ntags) {
                // copy from end of <tag> to start of next
                string text = pattern[stops[i] + to!int(stop.length)..starts[i + 1]];
                chunks ~= new TextChunk(text);
            }
        }
        if ( ntags > 0 ) {
            int afterLastTag = to!int(stops[ntags - 1] + stop.length);
            if ( afterLastTag < n ) { // copy text from end of last tag to end
                string text = pattern[afterLastTag..n];
                chunks ~= new TextChunk(text);
            }
        }

        // strip out the escape sequences from text chunks but not tags
        for (int i = 0; i < chunks.length; i++) {
            Chunk c = chunks[i];
            if (c.classinfo == TextChunk.classinfo) {
                TextChunk tc = cast(TextChunk)c;
                string unescaped = tc.getText().replace(escape, "");
                if (unescaped.length < tc.getText().length) {
                    chunks[i] = new TextChunk(unescaped);
                }
            }
        }

        return chunks;
    }

}