antlr.v4.runtime.Lexer source code

1 /*
2  * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
3  * Use of this file is governed by the BSD 3-clause license that
4  * can be found in the LICENSE.txt file in the project root.
5  */
6 
7 module antlr.v4.runtime.Lexer;
8 
9 import std.stdio;
10 import std.typecons;
11 import std.array;
12 import std.conv;
13 import std.variant;
14 import antlr.v4.runtime.ANTLRErrorListener;
15 import antlr.v4.runtime.Recognizer;
16 import antlr.v4.runtime.RecognitionException;
17 import antlr.v4.runtime.atn.LexerATNSimulator;
18 import antlr.v4.runtime.Token;
19 import antlr.v4.runtime.TokenConstantDefinition;
20 import antlr.v4.runtime.TokenSource;
21 import antlr.v4.runtime.InterfaceLexer;
22 import antlr.v4.runtime.TokenFactory;
23 import antlr.v4.runtime.CharStream;
24 import antlr.v4.runtime.IntStream;
25 import antlr.v4.runtime.IntStreamConstant;
26 import antlr.v4.runtime.CommonToken;
27 import antlr.v4.runtime.CommonTokenFactory;
28 import antlr.v4.runtime.IllegalStateException;
29 import antlr.v4.runtime.LexerNoViableAltException;
30 import antlr.v4.runtime.misc;
31 import antlr.v4.runtime.InterfaceRuleContext;
32 
// Named tuple pairing a token source (field "a") with a character stream (field "b");
// the Lexer below builds it from itself and its input and passes it to the token factory.
33 alias TokenFactorySourcePair = Tuple!(TokenSource, "a", CharStream, "b");
34 
35 /**
36  * A lexer is a recognizer that draws input symbols from a character stream.
37  * Lexer grammars result in a subclass of this object. A Lexer object
38  * uses simplified match() and error recovery mechanisms in the interest
39  * of speed.
40  */
abstract class Lexer : Recognizer!(int, LexerATNSimulator), TokenSource, InterfaceLexer
{

    /// Mode the lexer is put into by reset().
    public static immutable int DEFAULT_MODE = 0;

    /// Pseudo token type stored by more(); nextToken() keeps matching while it is set.
    public static immutable int MORE = -2;

    /// Pseudo token type stored by skip(); nextToken() drops the match and starts over.
    public static immutable int SKIP = -3;

    public static immutable int DEFAULT_TOKEN_CHANNEL = TokenConstantDefinition.DEFAULT_CHANNEL;

    public static immutable int HIDDEN = TokenConstantDefinition.HIDDEN_CHANNEL;

    public static immutable int MIN_CHAR_VALUE = 0;

    /// 0x10FFFF is the highest Unicode code point.
    public static immutable int MAX_CHAR_VALUE = 0x10FFFF;

    /// Character stream the lexer reads from; null until an input is supplied.
    public CharStream _input;

    /// (this lexer, _input) pair handed to the token factory on every create().
    protected TokenFactorySourcePair _tokenFactorySourcePair;

    /**
     * How to create token objects
     * @uml
     * @read
     * @write
     * @override
     */
    public TokenFactory!CommonToken tokenFactory_;

    /**
     * The goal of all lexer rules/methods is to create a token object.
     * This is an instance variable as multiple rules may collaborate to
     * create a single token.  nextToken will return this object after
     * matching lexer rule(s).  If you subclass to allow multiple token
     * emissions, then set this to the last token to be matched or
     * something nonnull so that the auto token emit mechanism will not
     * emit another token.
     */
    public Token _token;

    /// Modes saved by pushMode() and restored by popMode().
    public IntegerStack _modeStack;

    /**
     * What character index in the stream did the current token start at?
     * Needed, for example, to get the text for current token.  Set at
     * the start of nextToken.
     */
    public size_t _tokenStartCharIndex;

    /**
     * The line on which the first character of the token resides
     */
    public int _tokenStartLine;

    /**
     * The character position of first character within the line
     */
    public int _tokenStartCharPositionInLine;

    /// Set by nextToken() once the lookahead is EOF; the next pass emits the EOF token.
    public bool _hitEOF;

    /**
     * The channel number for the current token
     */
    public int _channel;

    /**
     * The token type for the current token
     */
    public int _type;

    /// Current lexer mode; passed to the interpreter's match().
    public int _mode;

    /**
     * You can set the text for the current token to override what is in
     * the input char buffer.  Use setText() or can set this instance var.
     * An empty Variant (hasValue == false) means "no override".
     */
    public Variant _text;

    /**
     * Create a lexer that has no input stream yet.
     *
     * The token factory and the mode stack are set up here as well, so
     * that a later setInputStream() -- which calls reset() -- does not
     * dereference a null mode stack.
     */
    public this()
    {
        tokenFactory_ = CommonTokenFactory.DEFAULT;
        _modeStack = new IntegerStack();
    }

    /**
     * Create a lexer reading from the given character stream.
     */
    public this(CharStream input)
    {
        this();
        this._input = input;
        this._tokenFactorySourcePair = tuple(this, input);
    }

    /**
     * Rewind the input (when there is one) and put every piece of
     * per-token and mode state back to its starting value, then reset
     * the interpreter.
     */
    public void reset()
    {
        // wack Lexer state variables
        if (_input !is null) {
            _input.seek(0); // rewind the input
        }
        _token = null;
        _type = TokenConstantDefinition.INVALID_TYPE;
        _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
        // "no token started" sentinel: all bits set, i.e. -1 converted to size_t
        _tokenStartCharIndex = size_t.max;
        _tokenStartCharPositionInLine = -1;
        _tokenStartLine = -1;
        clearTextOverride();
        _hitEOF = false;
        _mode = Lexer.DEFAULT_MODE;
        _modeStack.clear();
        getInterpreter().reset();
    }

    /**
     * Return a token from this source; i.e., match a token on the char
     * stream.
     *
     * Throws: IllegalStateException when no input stream has been set.
     */
    public Token nextToken()
    {
        if (_input is null) {
            throw new IllegalStateException("nextToken requires a non-null input stream.");
        }
        // Mark start location in char stream so unbuffered streams are
        // guaranteed at least have text of current token
        int tokenStartMarker = _input.mark();
        try {
        outer:
            while (true) {
                if (_hitEOF) {
                    emitEOF();
                    return _token;
                }
                // fresh per-token state
                _token = null;
                _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
                _tokenStartCharIndex = _input.index;
                _tokenStartCharPositionInLine = getInterpreter.getCharPositionInLine();
                _tokenStartLine = getInterpreter.getLine;
                // drop any text override left over from the previous token
                clearTextOverride();
                do {
                    _type = TokenConstantDefinition.INVALID_TYPE;
                    debug(Lexer) {
                        import std.stdio;
                        writefln("nextToken line = %s at %s: %s in mode %s at index %s",
                                 _tokenStartLine,
                                 _tokenStartCharPositionInLine,
                                 _input.LA(1),
                                 _mode,
                                 _input.index);
                    }
                    int ttype;
                    try {
                        ttype = getInterpreter.match(_input, _mode);
                    }
                    catch (LexerNoViableAltException e) {
                        notifyListeners(e);     // report error
                        recover(e);
                        ttype = SKIP;
                    }
                    if (_input.LA(1) == IntStreamConstant.EOF) {
                        _hitEOF = true;
                    }
                    // a type stored during the match (skip(), more(), setType())
                    // takes precedence over the type returned by match()
                    if (_type == TokenConstantDefinition.INVALID_TYPE) {
                        _type = ttype;
                    }
                    if (_type == SKIP) {
                        continue outer;
                    }
                }
                while (_type == MORE);

                if (_token is null) {
                    emit();
                }
                return _token;
            }
        }
        finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            _input.release(tokenStartMarker);
        }
        assert(0);
    }

    /**
     * Instruct the lexer to skip creating a token for current lexer rule
     * and look for another token.  nextToken() knows to keep looking when
     * a lexer rule finishes with the token type set to SKIP.  Recall that
     * if token==null at end of any token rule, it creates one for you
     * and emits it.
     */
    public void skip()
    {
        _type = SKIP;
    }

    /**
     * Mark the current match as partial: nextToken() keeps matching
     * while the type is MORE.
     */
    public void more()
    {
        _type = MORE;
    }

    /**
     * Switch to lexer mode m without touching the mode stack.
     */
    public void mode(int m)
    {
        _mode = m;
    }

    /**
     * Save the current mode on the mode stack and switch to mode m.
     */
    public void pushMode(int m)
    {
        debug(LexerATNSimulator)
            writefln("pushMode %s %s", m, _modeStack);
        _modeStack.push(_mode);
        mode(m);
    }

    /**
     * Switch back to the mode on top of the mode stack and return it.
     * An empty stack trips the assert (asserts are compiled out of
     * -release builds).
     */
    public int popMode()
    {
        assert (!_modeStack.isEmpty, "Empty stack");
        debug(LexerATNSimulator)
            writefln("popMode back to %s", _modeStack.peek);
        mode(_modeStack.pop);
        return _mode;
    }

    /**
     * Set the char stream and reset the lexer
     * @uml
     * @override
     */
    public override void setInputStream(IntStream input)
    {
        // detach the old stream first so that reset() does not seek on it
        this._input = null;
        this._tokenFactorySourcePair = tuple(this, _input);
        reset();
        this._input = cast(CharStream)input;
        this._tokenFactorySourcePair = tuple(this, _input);
    }

    /**
     * Name of the source as reported by the input stream.
     */
    public string getSourceName()
    {
        return _input.getSourceName();
    }

    /**
     * @uml
     * @override
     */
    public override CharStream getInputStream()
    {
        return _input;
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons.  Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    public void emit(Token token)
    {
        this._token = token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule.  The token object should point into the
     * char buffer start..stop.  If there is a text override in 'text',
     * use that to set the token's text.  Override this method to emit
     * custom Token objects or provide a new factory.
     */
    public Token emit()
    {
        Variant v = _text;
        Token t = tokenFactory_.create(_tokenFactorySourcePair, _type,
                                       v, _channel, _tokenStartCharIndex,
                                       getCharIndex()-1, _tokenStartLine,
                                       _tokenStartCharPositionInLine);
        emit(t);
        return t;
    }

    /**
     * Create an EOF token on the default channel at the current input
     * position, emit it and return it.  The token carries no text
     * override.
     */
    public Token emitEOF()
    {
        int cpos = getCharPositionInLine();
        int line = getLine();
        Variant Null;
        Token eof = tokenFactory_.create(_tokenFactorySourcePair, TokenConstantDefinition.EOF, Null, TokenConstantDefinition.DEFAULT_CHANNEL,
                                         _input.index(), _input.index()-1,
                                         line, cpos);
        emit(eof);
        return eof;
    }

    /// Current line as tracked by the interpreter.
    public int getLine()
    {
        return getInterpreter().getLine();
    }

    /// Current character position within the line as tracked by the interpreter.
    public int getCharPositionInLine()
    {
        return getInterpreter().getCharPositionInLine();
    }

    public void setLine(int line)
    {
        getInterpreter().setLine(line);
    }

    public void setCharPositionInLine(int charPositionInLine)
    {
        getInterpreter().setCharPositionInLine(charPositionInLine);
    }

    /**
     * What is the index of the current character of lookahead?
     */
    public size_t getCharIndex()
    {
        return _input.index();
    }

    /**
     * Return the text matched so far for the current token or any
     * text override.
     */
    public Variant getText()
    {
        // hasValue looks at the Variant's type tag; comparing the raw bits
        // against an empty Variant can be fooled by stale storage bytes
        if (_text.hasValue) {
            return _text;
        }
        Variant v = getInterpreter().getText(_input);
        return v;
    }

    /**
     * Set the complete text of this token; it wipes any previous
     * changes to the text.
     */
    public void setText(Variant text)
    {
        this._text = text;
    }

    /**
     * Forget any text override so that the token text comes from the
     * input again.
     *
     * destroy() runs the Variant destructor and copies Variant.init back
     * over the field, so afterwards the field is bit-for-bit an empty
     * Variant (a bare `_text.init;` statement only reads the property and
     * changes nothing).
     */
    private void clearTextOverride()
    {
        destroy(_text);
    }

    /**
     * Override if emitting multiple tokens.
     */
    public Token getToken()
    {
        return _token;
    }

    public void setToken(Token token)
    {
        this._token = token;
    }

    public void setType(int ttype)
    {
        _type = ttype;
    }

    public int getType()
    {
        return _type;
    }

    public void setChannel(int channel)
    {
        _channel = channel;
    }

    public int getChannel()
    {
        return _channel;
    }

    /// This base implementation returns null.
    public string[] getChannelNames()
    {
        return null;
    }

    /// This base implementation returns null.
    public string[] getModeNames()
    {
        return null;
    }

    /**
     * Used to print out token names like ID during debugging and
     * error reporting.  The generated parsers implement a method
     * that overrides this to point to their String[] tokenNames
     * @uml
     * @override
     */
    public override string[] getTokenNames()
    {
        return null;
    }

    /**
     * Return a list of all Token objects in input char stream.
     * Forces load of all tokens. Does not include EOF token.
     */
    public Token[] getAllTokens()
    {
        Token[] tokens;
        Token t = nextToken();
        while (t.getType() != TokenConstantDefinition.EOF) {
            tokens ~= t;
            t = nextToken();
        }
        return tokens;
    }

    /**
     * Recover from a failed match by consuming one character through the
     * interpreter, unless the lookahead is already EOF.
     */
    public void recover(LexerNoViableAltException e)
    {
        if (_input.LA(1) != IntStreamConstant.EOF) {
            // skip a char and try again
            getInterpreter().consume(_input);
        }
    }

    /**
     * Report a token recognition error to the registered error listeners.
     * The message quotes the input from the start of the current token up
     * to the current input index.
     */
    public void notifyListeners(LexerNoViableAltException e)
    {
        auto text = _input.getText(Interval.of(to!int(_tokenStartCharIndex), to!int(_input.index)));
        auto msg = "token recognition error at: '" ~ getErrorDisplay(text) ~ "'";
        ANTLRErrorListener!(int, LexerATNSimulator) listener = getErrorListenerDispatch();
        listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
    }

    /**
     * Return s with every character passed through
     * getErrorDisplay(dchar).
     */
    public string getErrorDisplay(string s)
    {
        auto buf = appender!string;
        foreach (dchar c; s) {
            buf.put(getErrorDisplay(c));
        }
        return buf.data;
    }

    /**
     * Printable form of a single character for error messages: EOF is
     * shown as "<EOF>", newline, tab and carriage return as their
     * backslash escapes, anything else as the character itself.
     */
    public string getErrorDisplay(dchar c)
    {
        string s;
        switch ( c ) {
        case TokenConstantDefinition.EOF :
            s = "<EOF>";
            break;
        case '\n' :
            s = "\\n";
            break;
        case '\t' :
            s = "\\t";
            break;
        case '\r' :
            s = "\\r";
            break;
        default:
            s ~= c;
            break;
        }
        return s;
    }

    /**
     * getErrorDisplay(c) wrapped in single quotes.
     */
    public string getCharErrorDisplay(dchar c)
    {
        string s = getErrorDisplay(c);
        return "'" ~ s ~ "'";
    }

    /**
     * Lexers can normally match any char in its vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out.  You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     */
    public void recover(RecognitionException re)
    {
        //System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
        //re.printStackTrace();
        // TODO: Do we lose character or line position information?
        _input.consume();
    }

    /**
     * No-op in this base class.
     * @uml
     * @override
     */
    public override void action(InterfaceRuleContext interfaceRuleContext, int ruleIndex,
                                int actionIndex)
    {
    }

    /// Getter for the token factory.
    public override final TokenFactory!CommonToken tokenFactory()
    {
        return this.tokenFactory_;
    }

    /// Setter for the token factory.
    public override final void tokenFactory(TokenFactory!CommonToken tokenFactory)
    {
        this.tokenFactory_ = tokenFactory;
    }

}