antlr.v4.runtime.Lexer source code

1 /*
2  * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
3  * Use of this file is governed by the BSD 3-clause license that
4  * can be found in the LICENSE.txt file in the project root.
5  */
6 
7 module antlr.v4.runtime.Lexer;
8 
9 import std.stdio;
10 import std.typecons;
11 import std.array;
12 import std.conv;
13 import std.variant;
14 import antlr.v4.runtime.ANTLRErrorListener;
15 import antlr.v4.runtime.Recognizer;
16 import antlr.v4.runtime.RecognitionException;
17 import antlr.v4.runtime.atn.LexerATNSimulator;
18 import antlr.v4.runtime.Token;
19 import antlr.v4.runtime.TokenConstantDefinition;
20 import antlr.v4.runtime.TokenSource;
21 import antlr.v4.runtime.InterfaceLexer;
22 import antlr.v4.runtime.TokenFactory;
23 import antlr.v4.runtime.CharStream;
24 import antlr.v4.runtime.IntStream;
25 import antlr.v4.runtime.IntStreamConstant;
26 import antlr.v4.runtime.CommonToken;
27 import antlr.v4.runtime.CommonTokenFactory;
28 import antlr.v4.runtime.IllegalStateException;
29 import antlr.v4.runtime.LexerNoViableAltException;
30 import antlr.v4.runtime.misc;
31 import antlr.v4.runtime.InterfaceRuleContext;
32 
33 alias TokenFactorySourcePair = Tuple!(TokenSource, "a", CharStream, "b");
34 
35 /**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
38  * uses simplified match() and error recovery mechanisms in the interest
39  * of speed.
40  */
41 abstract class Lexer : Recognizer!(int, LexerATNSimulator), TokenSource, InterfaceLexer
42 {
43 
44     public static immutable int DEFAULT_MODE = 0;
45 
46     public static immutable int MORE = -2;
47 
48     public static immutable int SKIP = -3;
49 
50     public static immutable int DEFAULT_TOKEN_CHANNEL = TokenConstantDefinition.DEFAULT_CHANNEL;
51 
52     public static immutable int HIDDEN = TokenConstantDefinition.HIDDEN_CHANNEL;
53 
54     public static immutable int MIN_CHAR_VALUE = char.min;
55 
56     public static immutable int MAX_CHAR_VALUE = char.max;
57 
58     public CharStream _input;
59 
60     protected TokenFactorySourcePair _tokenFactorySourcePair;
61 
62     /**
63      * How to create token objects
64      * @uml
65      * @read
66      * @write
67      * @override
68      */
69     public TokenFactory!CommonToken tokenFactory_;
70 
71     /**
72      * The goal of all lexer rules/methods is to create a token object.
73      * This is an instance variable as multiple rules may collaborate to
74      * create a single token.  nextToken will return this object after
75      * matching lexer rule(s).  If you subclass to allow multiple token
76      * emissions, then set this to the last token to be matched or
77      * something nonnull so that the auto token emit mechanism will not
78      * emit another token.
79      */
80     public Token _token;
81 
82     public IntegerStack _modeStack;
83 
84     /**
85      * What character index in the stream did the current token start at?
86      * Needed, for example, to get the text for current token.  Set at
87      * the start of nextToken.
88      */
89     public int _tokenStartCharIndex = -1;
90 
91     /**
92      * The line on which the first character of the token resides
93      */
94     public int _tokenStartLine;
95 
96     /**
97      * The character position of first character within the line
98      */
99     public int _tokenStartCharPositionInLine;
100 
101     public bool _hitEOF;
102 
103     /**
104      * The channel number for the current token
105      */
106     public int _channel;
107 
108     /**
109      * The token type for the current token
110      */
111     public int _type;
112 
113     public int _mode;
114 
115     /**
116      * You can set the text for the current token to override what is in
117      * the input char buffer.  Use setText() or can set this instance var.
118      */
119     public Variant _text;
120 
121     public this()
122     {
123     }
124 
125     public this(CharStream input)
126     {
127         tokenFactory_ = CommonTokenFactory.DEFAULT;
128         this._input = input;
129         this._tokenFactorySourcePair = tuple(this, input);
130         _modeStack = new IntegerStack();
131     }
132 
133     public void reset()
134     {
135     // wack Lexer state variables
136         if (_input !is null) {
137             _input.seek(0); // rewind the input
138         }
139         _token = null;
140         _type = TokenConstantDefinition.INVALID_TYPE;
141         _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
142         _tokenStartCharIndex = -1;
143         _tokenStartCharPositionInLine = -1;
144         _tokenStartLine = -1;
145         _text.init;
146         _hitEOF = false;
147         _mode = Lexer.DEFAULT_MODE;
148         _modeStack.clear();
149         getInterpreter().reset();
150     }
151 
152     /**
153      * Return a token from this source; i.e., match a token on the char
154      * stream.
155      */
156     public Token nextToken()
157     {
158     if (_input is null) {
159             throw new IllegalStateException("nextToken requires a non-null input stream.");
160         }
161         // Mark start location in char stream so unbuffered streams are
162         // guaranteed at least have text of current token
163         int tokenStartMarker = _input.mark();
164         try{
165         outer:
166             while (true) {
167                 if (_hitEOF) {
168                     emitEOF();
169                     return _token;
170                 }
171                 _token = null;
172                 _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
173                 _tokenStartCharIndex = _input.index;
174                 _tokenStartCharPositionInLine = getInterpreter.getCharPositionInLine();
175                 _tokenStartLine = getInterpreter.getLine;
176                 _text.init;
177                 do {
178                     _type = TokenConstantDefinition.INVALID_TYPE;
179                     debug(Lexer) {
180                         import std.stdio;
181                         writefln("nextToken line = %s at %s in mode %s at index %s",
182                                  _tokenStartLine,
183                                  cast(char)_input.LA(1),
184                                  _mode,
185                                  _input.index);
186                     }
187                     int ttype;
188                     try {
189                         ttype = getInterpreter.match(_input, _mode);
190                     }
191                     catch (LexerNoViableAltException e) {
192                         notifyListeners(e);     // report error
193                         recover(e);
194                         ttype = SKIP;
195                     }
196                     if (_input.LA(1) == IntStreamConstant.EOF) {
197                         _hitEOF = true;
198                     }
199                     if (_type == TokenConstantDefinition.INVALID_TYPE) _type = ttype;
200                     if (_type == SKIP) {
201                         continue outer;
202                     }
203                 }
204                 while (_type == MORE);
205 
206                 if (_token is null) {
207                     emit();
208                 }
209                 return _token;
210             }
211         }
212         finally {
213             // make sure we release marker after match or
214             // unbuffered char stream will keep buffering
215             _input.release(tokenStartMarker);
216         }
217         assert(0);
218     }
219 
220     /**
221      * Instruct the lexer to skip creating a token for current lexer rule
222      * and look for another token.  nextToken() knows to keep looking when
     * a lexer rule finishes with the token type set to SKIP.  Recall that
224      * if token==null at end of any token rule, it creates one for you
225      * and emits it.
226      */
227     public void skip()
228     {
229         _type = SKIP;
230     }
231 
232     public void more()
233     {
234         _type = MORE;
235     }
236 
237     public void mode(int m)
238     {
239         _mode = m;
240     }
241 
242     public void pushMode(int m)
243     {
244         debug(LexerATNSimulator)
245             writefln("pushMode %s %s", m, _modeStack);
246         _modeStack.push(_mode);
247         mode(m);
248     }
249 
250     public int popMode()
251     {
252         assert (!_modeStack.isEmpty, "Empty stack");
253         debug(LexerATNSimulator)
254             writefln("popMode back to %s", _modeStack.peek);
255         mode(_modeStack.pop);
256         return _mode;
257     }
258 
259     /**
260      * Set the char stream and reset the lexer
261      * @uml
262      * @override
263      */
264     public override void setInputStream(IntStream input)
265     {
266         this._input = null;
267         this._tokenFactorySourcePair = tuple(this, _input);
268         reset();
269         this._input = cast(CharStream)input;
270         this._tokenFactorySourcePair = tuple(this, _input);
271     }
272 
273     public string getSourceName()
274     {
275         return _input.getSourceName();
276     }
277 
278     /**
279      * @uml
280      * @override
281      */
282     public override CharStream getInputStream()
283     {
284         return _input;
285     }
286 
287     /**
288      * By default does not support multiple emits per nextToken invocation
289      * for efficiency reasons.  Subclass and override this method, nextToken,
290      * and getToken (to push tokens into a list and pull from that list
291      * rather than a single variable as this implementation does).
292      */
293     public void emit(Token token)
294     {
295         this._token = token;
296     }
297 
298     /**
299      * The standard method called to automatically emit a token at the
300      * outermost lexical rule.  The token object should point into the
301      * char buffer start..stop.  If there is a text override in 'text',
302      * use that to set the token's text.  Override this method to emit
303      * custom Token objects or provide a new factory.
304      */
305     public Token emit()
306     {
307         Variant v = _text;
308         Token t = tokenFactory_.create(_tokenFactorySourcePair, _type,
309                                        v, _channel, _tokenStartCharIndex,
310                                        getCharIndex()-1, _tokenStartLine,
311                                        _tokenStartCharPositionInLine);
312         emit(t);
313         return t;
314     }
315 
316     public Token emitEOF()
317     {
318         int cpos = getCharPositionInLine();
319         int line = getLine();
320         Variant Null;
321         Token eof = tokenFactory_.create(_tokenFactorySourcePair, TokenConstantDefinition.EOF, Null, TokenConstantDefinition.DEFAULT_CHANNEL,
322                                          _input.index(), _input.index()-1,
323                                          line, cpos);
324         emit(eof);
325         return eof;
326     }
327 
328     public int getLine()
329     {
330         return getInterpreter().getLine();
331     }
332 
333     public int getCharPositionInLine()
334     {
335         return getInterpreter().getCharPositionInLine();
336     }
337 
338     public void setLine(int line)
339     {
340         getInterpreter().setLine(line);
341     }
342 
343     public void setCharPositionInLine(int charPositionInLine)
344     {
345         getInterpreter().setCharPositionInLine(charPositionInLine);
346     }
347 
348     /**
349      * What is the index of the current character of lookahead?
350      */
351     public int getCharIndex()
352     {
353         return _input.index();
354     }
355 
356     /**
357      * Return the text matched so far for the current token or any
358      * text override.
359      */
360     public Variant getText()
361     {
362         Variant Null;
363         if (_text !is Null) {
364             return _text;
365         }
366         Variant v = getInterpreter().getText(_input);
367         return v;
368     }
369 
370     /**
371      * Set the complete text of this token; it wipes any previous
372      * changes to the text.
373      */
374     public void setText(Variant text)
375     {
376         this._text = text;
377     }
378 
379     /**
380      * Override if emitting multiple tokens.
381      */
382     public Token getToken()
383     {
384         return _token;
385     }
386 
387     public void setToken(Token token)
388     {
389         this._token = token;
390     }
391 
392     public void setType(int ttype)
393     {
394         _type = ttype;
395     }
396 
397     public int getType()
398     {
399         return _type;
400     }
401 
402     public void setChannel(int channel)
403     {
404     _channel = channel;
405     }
406 
407     public int getChannel()
408     {
409         return _channel;
410     }
411 
412     public string[] getChannelNames()
413     {
414         return null;
415     }
416 
417     public string[] getModeNames()
418     {
419         return null;
420     }
421 
422     /**
423      * Used to print out token names like ID during debugging and
424      * error reporting.  The generated parsers implement a method
425      * that overrides this to point to their String[] tokenNames
426      * @uml
427      * @override
428      */
429     public override string[] getTokenNames()
430     {
431         return null;
432     }
433 
434     /**
435      * Return a list of all Token objects in input char stream.
436      * Forces load of all tokens. Does not include EOF token.
437      */
438     public Token[] getAllTokens()
439     {
440     Token[] tokens;
441         Token t = nextToken();
442         while (t.getType() != TokenConstantDefinition.EOF) {
443             tokens ~= t;
444             t = nextToken();
445         }
446         return tokens;
447     }
448 
449     public void recover(LexerNoViableAltException e)
450     {
451     if (_input.LA(1) != IntStreamConstant.EOF) {
452             // skip a char and try again
453             getInterpreter().consume(_input);
454         }
455     }
456 
457     public void notifyListeners(LexerNoViableAltException e)
458     {
459         string text = _input.getText(Interval.of(_tokenStartCharIndex, _input.index()));
460         string msg = "token recognition error at: '" ~ getErrorDisplay(text) ~ "'";
461 
462         ANTLRErrorListener!(int, LexerATNSimulator) listener = getErrorListenerDispatch();
463         listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
464     }
465 
466     public string getErrorDisplay(string s)
467     {
468         auto buf = appender!string;
469         foreach (char c; s) {
470             buf.put(getErrorDisplay(c));
471         }
472         return buf.data;
473     }
474 
475     public string getErrorDisplay(int c)
476     {
477         string s;
478         switch ( c ) {
479         case TokenConstantDefinition.EOF :
480             s = "<EOF>";
481             break;
482         case '\n' :
483             s = "\\n";
484             break;
485         case '\t' :
486             s = "\\t";
487             break;
488         case '\r' :
489             s = "\\r";
490             break;
491         default:
492             s ~= cast(wchar)c;
493             break;
494         }
495         return s;
496     }
497 
498     public string getCharErrorDisplay(int c)
499     {
500         string s = getErrorDisplay(c);
501         return "'" ~ s ~ "'";
502     }
503 
504     /**
     * Lexers can normally match any char in its vocabulary after matching
506      * a token, so do the easy thing and just kill a character and hope
507      * it all works out.  You can instead use the rule invocation stack
508      * to do sophisticated error recovery if you are in a fragment rule.
509      */
510     public void recover(RecognitionException re)
511     {
512     //System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
513         //re.printStackTrace();
514         // TODO: Do we lose character or line position information?
515         _input.consume();
516     }
517 
518     /**
519      * @uml
520      * @override
521      */
522     public override void action(InterfaceRuleContext interfaceRuleContext, int ruleIndex,
523                                 int actionIndex)
524     {
525     }
526 
527     public override final TokenFactory!CommonToken tokenFactory()
528     {
529         return this.tokenFactory_;
530     }
531 
532     public override final void tokenFactory(TokenFactory!CommonToken tokenFactory)
533     {
534         this.tokenFactory_ = tokenFactory;
535     }
536 
537 }