1 /*
2  * Copyright (c) 2012-2020 The ANTLR Project. All rights reserved.
3  * Use of this file is governed by the BSD 3-clause license that
4  * can be found in the LICENSE.txt file in the project root.
5  */
6 
7 module antlr.v4.runtime.BufferedTokenStream;
8 
9 import antlr.v4.runtime.IllegalStateException;
10 import antlr.v4.runtime.Lexer;
11 import antlr.v4.runtime.RuleContext;
12 import antlr.v4.runtime.Token;
13 import antlr.v4.runtime.TokenConstantDefinition;
14 import antlr.v4.runtime.TokenSource;
15 import antlr.v4.runtime.TokenStream;
16 import antlr.v4.runtime.WritableToken;
17 import antlr.v4.runtime.misc.Interval;
18 import std.algorithm: canFind;
19 import std.array;
20 import std.conv;
21 import std.format;
22 import std.variant;
23 import std.algorithm;
24 
25 /**
26  * This implementation of {@link TokenStream} loads tokens from a
27  * {@link TokenSource} on-demand, and places the tokens in a buffer to provide
28  * access to any previous token by index.
29  *
30  * <p>
31  * This token stream ignores the value of {@link Token#getChannel}. If your
32  * parser requires the token stream filter tokens to only those on a particular
33  * channel, such as {@link Token#DEFAULT_CHANNEL} or
 * {@link Token#HIDDEN_CHANNEL}, use a filtering token stream such as a
35  * {@link CommonTokenStream}.</p>
36  */
class BufferedTokenStream : TokenStream
{

    /**
     * The {@link TokenSource} from which tokens for this stream are fetched.
     */
    protected TokenSource tokenSource;

    /**
     * A collection of all tokens fetched from the token source. The list is
     * considered a complete view of the input once {@link #fetchedEOF} is set
     * to {@code true}.
     */
    protected Token[] tokens;

    /**
     * The index into {@link #tokens} of the current token (next token to
     * {@link #consume}). {@link #tokens}{@code [}{@link #p}{@code ]} should be
     * {@link #LT LT(1)}.
     *
     * <p>This field is set to {@code size_t.max} (the unsigned stand-in for
     * Java's -1) when the stream is first constructed or when
     * {@link #setTokenSource} is called, indicating that the first token has
     * not yet been fetched from the token source. For additional information,
     * see the documentation of {@link IntStream} for a description of
     * Initializing Methods.</p>
     * @uml
     * @read
     */
    private size_t index_ = size_t.max;

    /**
     * Indicates whether the {@link Token#EOF} token has been fetched from
     * {@link #tokenSource} and added to {@link #tokens}. This field improves
     * performance for the following cases:
     *
     * <ul>
     * <li>{@link #consume}: The lookahead check in {@link #consume} to prevent
     * consuming the EOF symbol is optimized by checking the values of
     * {@link #fetchedEOF} and {@link #p} instead of calling {@link #LA}.</li>
     * <li>{@link #fetch}: The check to prevent adding multiple EOF symbols into
     * {@link #tokens} is trivial with this field.</li>
     * </ul>
     */
    protected bool fetchedEOF;

    public this(TokenSource tokenSource)
    in
    {
        assert (tokenSource !is null, "tokenSource cannot be null");
    }
    do
    {
        this.tokenSource = tokenSource;
    }

    /**
     * @uml
     * @override
     */
    public override TokenSource getTokenSource()
    {
        return tokenSource;
    }

    /**
     * A buffered stream never releases tokens, so marks are free.
     */
    public int mark()
    {
        return 0;
    }

    public void release(int marker)
    {
        // no resources to release
    }

    public void reset()
    {
        seek(0);
    }

    public void seek(size_t index)
    {
        lazyInit;
        index_ = adjustSeekIndex(index);
    }

    public size_t size()
    {
        return tokens.length;
    }

    /**
     * Advance the stream by one token. Throws {@link IllegalStateException}
     * when the stream is already positioned at EOF.
     */
    public void consume()
    {
        bool skipEofCheck;
        if (index_ != size_t.max) {
            if (fetchedEOF) {
                // the last token in tokens is EOF. skip check if p indexes any
                // fetched token except the last.
                skipEofCheck = index_ + 1 < tokens.length;
            }
            else {
                // no EOF token in tokens. skip check if p indexes a fetched token.
                skipEofCheck = index_ < tokens.length;
            }
        }
        else {
            // not yet initialized
            skipEofCheck = false;
        }

        if (!skipEofCheck && LA(1) == TokenConstantDefinition.EOF) {
            throw new IllegalStateException("cannot consume EOF");
        }

        if (sync(index_ + 1)) {
            index_ = adjustSeekIndex(index_ + 1);
        }
    }

    /**
     * Make sure index {@code i} in tokens has a token.
     *
     * @return {@code true} if a token is located at index {@code i}, otherwise
     *    {@code false}.
     * @see #get(int i)
     */
    protected bool sync(size_t i)
    {
        // How many more elements do we need? Computed without signed casts so
        // it stays correct for any size_t value.
        if (i + 1 > tokens.length) {
            auto n = i + 1 - tokens.length;
            auto fetched = fetch(n);
            return fetched >= n;
        }
        return true;
    }

    /**
     * Add {@code n} elements to buffer; stops early when EOF is produced.
     *
     *  @return The actual number of elements added to the buffer.
     */
    protected size_t fetch(size_t n)
    {
        if (fetchedEOF) {
            return 0;
        }
        foreach (i; 0 .. n) {
            Token t = tokenSource.nextToken();
            // Record the token's position in the buffer when the token
            // implementation allows it.
            if (auto wt = cast(WritableToken)t) {
                wt.setTokenIndex(to!int(tokens.length));
            }
            tokens ~= t;
            if (t.getType == TokenConstantDefinition.EOF) {
                fetchedEOF = true;
                return i + 1;
            }
        }
        return n;
    }

    /**
     * @uml
     * @override
     */
    public override Token get(size_t i)
    in
    {
        assert(i < tokens.length, format("token index %1$s out of range 0..%2$s", i, tokens.length-1));
    }
    do
    {
        return tokens[i];
    }

    /**
     * Get all tokens from start..stop inclusively. The slice is truncated at
     * the first EOF token encountered.
     */
    public Token[] get(size_t start, size_t stop)
    {
        lazyInit;
        // lazyInit guarantees at least one fetched token, so clamping to the
        // last index is safe.
        if (stop >= tokens.length)
            stop = tokens.length - 1;
        Token[] subset;
        for (auto i = start; i <= stop; i++) {
            Token t = tokens[i];
            if (t.getType == TokenConstantDefinition.EOF)
                break;
            subset ~= t;
        }
        return subset;
    }

    public dchar LA(int i)
    {
        return LT(i).getType;
    }

    /**
     * Look back {@code k} tokens; null when that would run off the front.
     */
    public Token LB(int k)
    {
        if ((cast(int)index_ - k) < 0)
            return null;
        return tokens[index_ - k];
    }

    /**
     * @uml
     * @override
     */
    public override Token LT(int k)
    {
        lazyInit();
        if (k == 0)
            return null;
        if (k < 0)
            return LB(-k);
        auto i = cast(int)index_ + k - 1;
        sync(i);
        if ( i >= tokens.length ) { // return EOF token
            // EOF must be last token
            return tokens[$-1];
        }
        return tokens[i];
    }

    /**
     * Allowed derived classes to modify the behavior of operations which change
     * the current stream position by adjusting the target token index of a seek
     * operation. The default implementation simply returns {@code i}. If an
     * exception is thrown in this method, the current stream index should not be
     * changed.
     *
     * <p>For example, {@link CommonTokenStream} overrides this method to ensure that
     * the seek target is always an on-channel token.</p>
     *
     *  @param i The target token index.
     *  @return The adjusted target token index.
     */
    protected size_t adjustSeekIndex(size_t i)
    {
        return i;
    }

    protected void lazyInit()
    {
        if (index_ == size_t.max) {
            setup;
        }
    }

    protected void setup()
    {
        sync(0);
        index_ = adjustSeekIndex(0);
    }

    /**
     * Reset this token stream by setting its token source.
     */
    public void setTokenSource(TokenSource tokenSource)
    {
        this.tokenSource = tokenSource;
        tokens.length = 0;
        index_ = size_t.max;
        fetchedEOF = false;
    }

    public Token[] getTokens()
    {
        return tokens;
    }

    public Token[] getTokens(size_t start, size_t stop)
    {
        return getTokens(start, stop, null);
    }

    /**
     * Given a start and stop index, return a List of all tokens in
     * the token type BitSet.  Return null if no tokens were found.  This
     * method looks at both on and off channel tokens.
     */
    public Token[] getTokens(size_t start, size_t stop, int[] types)
    in
    {
        lazyInit();
        // size_t arguments cannot be negative; only the upper bounds need
        // checking. Note: `stop == 0` is a legal request (the previous
        // `stop > 0` check wrongly rejected it).
        assert(start < tokens.length && stop < tokens.length,
               format("start %1$s or stop %2$s not in 0..%3$s", start, stop, tokens.length - 1));
    }
    do
    {
        if (start > stop)
            return null;

        // list = tokens[start:stop]:{T t, t.getType() in types}
        Token[] filteredTokens;
        foreach (i; start .. stop + 1) {
            Token t = tokens[i];
            if (types is null || types.canFind(t.getType())) {
                filteredTokens ~= t;
            }
        }
        if (filteredTokens.length == 0) {
            filteredTokens = null;
        }
        return filteredTokens;
    }

    public Token[] getTokens(size_t start, size_t stop, int ttype)
    {
        int[] s;
        s ~= ttype;
        return getTokens(start, stop, s);
    }

    /**
     * Given a starting index, return the index of the next token on channel.
     * Return {@code i} if {@code tokens[i]} is on channel. Return the index of
     * the EOF token if there are no tokens on channel between {@code i} and
     * EOF.
     */
    protected size_t nextTokenOnChannel(size_t i, int channel)
    {
        sync(i);
        if (i >= size) {
            return size - 1;
        }

        Token token = tokens[i];
        while (token.getChannel != channel) {
            if (token.getType == TokenConstantDefinition.EOF) {
                return i;
            }

            i++;
            sync(i);
            token = tokens[i];
        }

        return i;
    }

    /**
     * Given a starting index, return the index of the previous token on
     * channel. Return {@code i} if {@code tokens[i]} is on channel. Return
     * {@code size_t.max} (the unsigned stand-in for Java's -1) if there are
     * no tokens on channel between {@code i} and 0.
     *
     * <p>
     * If {@code i} specifies an index at or after the EOF token, the EOF token
     * index is returned. This is due to the fact that the EOF token is treated
     * as though it were on every channel.</p>
     */
    protected size_t previousTokenOnChannel(size_t i, int channel)
    {
        sync(i);
        if (i >= size) {
            // the EOF token is on every channel
            return size() - 1;
        }
        // Walk backwards. `while (i >= 0)` is always true for an unsigned
        // index and `i--` at 0 wraps to size_t.max, which would index out of
        // range; test for 0 before decrementing instead.
        while (true) {
            Token token = tokens[i];
            if (token.getType() == TokenConstantDefinition.EOF || token.getChannel == channel) {
                return i;
            }
            if (i == 0)
                break;
            i--;
        }
        // No on-channel token found. Callers compute `prevOnChannel + 1`,
        // which wraps to 0 here -- matching the Java runtime's -1 semantics.
        return size_t.max;
    }

    /**
     * Collect all tokens on specified channel to the right of
     * the current token up until we see a token on DEFAULT_TOKEN_CHANNEL or
     * EOF. If channel is -1, find any non default channel token.
     */
    public Token[] getHiddenTokensToRight(size_t tokenIndex, int channel)
    in
    {
        lazyInit();
        assert(tokenIndex < tokens.length, format("%1$s not in 0..%2$s", tokenIndex, tokens.length-1));
    }
    do
    {
        // nextTokenOnChannel never reports "not found": it clamps to the EOF
        // index, so its result is always a valid inclusive upper bound.
        auto nextOnChannel =
            nextTokenOnChannel(tokenIndex + 1, Lexer.DEFAULT_TOKEN_CHANNEL);
        auto from = tokenIndex + 1;
        return filterForChannel(from, nextOnChannel, channel);
    }

    /**
     * Collect all hidden tokens (any off-default channel) to the right of
     * the current token up until we see a token on DEFAULT_TOKEN_CHANNEL
     * or EOF.
     */
    public Token[] getHiddenTokensToRight(size_t tokenIndex)
    {
        return getHiddenTokensToRight(tokenIndex, -1);
    }

    /**
     * Collect all tokens on specified channel to the left of
     * the current token up until we see a token on DEFAULT_TOKEN_CHANNEL.
     * If channel is -1, find any non default channel token.
     */
    public Token[] getHiddenTokensToLeft(size_t tokenIndex, int channel)
    in
    {
        lazyInit();
        assert(tokenIndex < tokens.length, format("%1$s not in 0..%2$s", tokenIndex, tokens.length-1));
    }
    do
    {
        if (tokenIndex == 0) {
            // obviously no tokens can appear before the first token
            return null;
        }
        auto prevOnChannel =
            previousTokenOnChannel(tokenIndex - 1, Lexer.DEFAULT_TOKEN_CHANNEL);
        if ( prevOnChannel == tokenIndex - 1 ) return null;
        // if none onchannel to left, prevOnChannel=size_t.max and
        // prevOnChannel+1 wraps to 0, so the scan starts at the first token
        auto from = prevOnChannel + 1;
        auto to = tokenIndex - 1;
        return filterForChannel(from, to, channel);
    }

    /**
     * Collect all hidden tokens (any off-default channel) to the left of
     * the current token up until we see a token on DEFAULT_TOKEN_CHANNEL.
     */
    public Token[] getHiddenTokensToLeft(size_t tokenIndex)
    {
        return getHiddenTokensToLeft(tokenIndex, -1);
    }

    /**
     * Return the tokens in the inclusive range from..to that are on
     * {@code channel}, or -- when channel is -1 -- those on any channel other
     * than DEFAULT_TOKEN_CHANNEL. Returns null when nothing matches.
     */
    public Token[] filterForChannel(size_t from, size_t to, int channel)
    {
        Token[] hidden;
        for (auto i = from; i <= to; i++) {
            Token t = tokens[i];
            if (channel == -1) {
                if (t.getChannel != Lexer.DEFAULT_TOKEN_CHANNEL) hidden ~= t;
            }
            else {
                if (t.getChannel == channel)
                    hidden ~= t;
            }
        }
        if (hidden.length == 0) return null;
        return hidden;
    }

    public string getSourceName()
    {
        return tokenSource.getSourceName();
    }

    /**
     * @uml
     * @override
     */
    public override Variant getText()
    {
        lazyInit;
        fill;
        return getText(Interval.of(0, to!int(size) - 1));
    }

    /**
     * @uml
     * @override
     */
    public override Variant getText(Interval interval)
    {
        int start = interval.a;
        int stop = interval.b;
        if (start < 0 || stop < 0) {
            // empty/invalid interval -> uninitialized Variant, as before
            Variant v;
            return v;
        }
        fill;
        if (stop >= tokens.length)
            stop = to!int(tokens.length) - 1;

        string buf;
        foreach (t; tokens[start..stop+1]) {
            if (t.getType == TokenConstantDefinition.EOF)
                break;
            buf ~= t.getText.get!string;
        }
        return Variant(buf);
    }

    /**
     * @uml
     * @override
     */
    public override Variant getText(RuleContext ctx)
    {
        return getText(ctx.getSourceInterval());
    }

    /**
     * @uml
     * @override
     */
    public override Variant getText(Token start, Token stop)
    {
        if (start !is null && stop !is null) {
            return getText(Interval.of(to!int(start.getTokenIndex()), to!int(stop.getTokenIndex())));
        }
        Variant v = "";
        return v;
    }

    /**
     * Get all tokens from lexer until EOF.
     */
    public void fill()
    {
        lazyInit;
        const int blockSize = 1000;
        while (true) {
            auto fetched = fetch(blockSize);
            if (fetched < blockSize) {
                return;
            }
        }
    }

    public final size_t index()
    {
        return this.index_;
    }

}