antlr.v4.runtime.Lexer source code

1 /*
2  * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
3  * Use of this file is governed by the BSD 3-clause license that
4  * can be found in the LICENSE.txt file in the project root.
5  */
6 
7 module antlr.v4.runtime.Lexer;
8 
9 import std.stdio;
10 import std.typecons;
11 import std.array;
12 import std.conv;
13 import antlr.v4.runtime.ANTLRErrorListener;
14 import antlr.v4.runtime.Recognizer;
15 import antlr.v4.runtime.RecognitionException;
16 import antlr.v4.runtime.atn.LexerATNSimulator;
17 import antlr.v4.runtime.Token;
18 import antlr.v4.runtime.TokenConstantDefinition;
19 import antlr.v4.runtime.TokenSource;
20 import antlr.v4.runtime.InterfaceLexer;
21 import antlr.v4.runtime.TokenFactory;
22 import antlr.v4.runtime.CharStream;
23 import antlr.v4.runtime.IntStream;
24 import antlr.v4.runtime.IntStreamConstant;
25 import antlr.v4.runtime.CommonToken;
26 import antlr.v4.runtime.CommonTokenFactory;
27 import antlr.v4.runtime.IllegalStateException;
28 import antlr.v4.runtime.LexerNoViableAltException;
29 import antlr.v4.runtime.misc;
30 import antlr.v4.runtime.InterfaceRuleContext;
31 
32 alias TokenFactorySourcePair = Tuple!(TokenSource, "a", CharStream, "b");
33 
34 // Class Lexer
35 /**
36  * A lexer is a recognizer that draws input symbols from a character stream.
37  * Lexer grammars result in a subclass of this object. A Lexer object
38  * uses simplified match() and error recovery mechanisms in the interest
39  * of speed.
40  */
41 abstract class Lexer : Recognizer!(int, LexerATNSimulator), TokenSource, InterfaceLexer
42 {
43 
44     public static immutable int DEFAULT_MODE = 0;
45 
46     public static immutable int MORE = -2;
47 
48     public static immutable int SKIP = -3;
49 
50     public static immutable int DEFAULT_TOKEN_CHANNEL = TokenConstantDefinition.DEFAULT_CHANNEL;
51 
52     public static immutable int HIDDEN = TokenConstantDefinition.HIDDEN_CHANNEL;
53 
54     public static immutable int MIN_CHAR_VALUE = char.min;
55 
56     public static immutable int MAX_CHAR_VALUE = char.max;
57 
58     public CharStream _input;
59 
60     protected TokenFactorySourcePair _tokenFactorySourcePair;
61 
62     /**
63      * How to create token objects
64      * @uml
65      * @read
66      * @write
67      * @override
68      */
69     public TokenFactory!CommonToken tokenFactory_;
70 
71     /**
72      * The goal of all lexer rules/methods is to create a token object.
73      * This is an instance variable as multiple rules may collaborate to
74      * create a single token.  nextToken will return this object after
75      * matching lexer rule(s).  If you subclass to allow multiple token
76      * emissions, then set this to the last token to be matched or
77      * something nonnull so that the auto token emit mechanism will not
78      * emit another token.
79      */
80     public Token _token;
81 
82     public IntegerStack _modeStack;
83 
84     /**
85      * What character index in the stream did the current token start at?
86      * Needed, for example, to get the text for current token.  Set at
87      * the start of nextToken.
88      */
89     public int _tokenStartCharIndex = -1;
90 
91     /**
92      * The line on which the first character of the token resides
93      */
94     public int _tokenStartLine;
95 
96     /**
97      * The character position of first character within the line
98      */
99     public int _tokenStartCharPositionInLine;
100 
101     public bool _hitEOF;
102 
103     /**
104      * The channel number for the current token
105      */
106     public int _channel;
107 
108     /**
109      * The token type for the current token
110      */
111     public int _type;
112 
113     public int _mode;
114 
115     /**
116      * You can set the text for the current token to override what is in
117      * the input char buffer.  Use setText() or set this instance variable directly.
118      */
119     public string _text;
120 
121     public this()
122     {
123     }
124 
125     public this(CharStream input)
126     {
127         tokenFactory_ = CommonTokenFactory.DEFAULT;
128         this._input = input;
129         this._tokenFactorySourcePair = tuple(this, input);
130         _modeStack = new IntegerStack();
131     }
132 
133     public void reset()
134     {
135 	// wack Lexer state variables
136         if (_input !is null) {
137             _input.seek(0); // rewind the input
138         }
139         _token = null;
140         _type = TokenConstantDefinition.INVALID_TYPE;
141         _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
142         _tokenStartCharIndex = -1;
143         _tokenStartCharPositionInLine = -1;
144         _tokenStartLine = -1;
145         _text = null;
146         _hitEOF = false;
147         _mode = Lexer.DEFAULT_MODE;
148         _modeStack.clear();
149         getInterpreter().reset();
150     }
151 
    /**
     * Return a token from this source; i.e., match a token on the char
     * stream.
     *
     * The method keeps matching until a token can be returned: results of
     * type SKIP restart the search for a new token, results of type MORE
     * continue matching into the same token.  Once the end of the input has
     * been seen, the EOF token produced by emitEOF() is returned.
     *
     * Returns: the token stored in _token (set through emit()/emitEOF())
     * Throws: IllegalStateException when no input stream has been set
     */
    public Token nextToken()
    {
	if (_input is null) {
            throw new IllegalStateException("nextToken requires a non-null input stream.");
        }
        // Mark start location in char stream so unbuffered streams are
        // guaranteed at least have text of current token
        int tokenStartMarker = _input.mark();
        try{
        outer:
            while (true) {
                // End of input was detected by an earlier match: hand out EOF.
                if (_hitEOF) {
                    emitEOF();
                    return _token;
                }
                // Start of a fresh token: clear the per-token state and
                // remember where in the stream (index, line, column) it begins.
                _token = null;
                _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
                _tokenStartCharIndex = _input.index;
                _tokenStartCharPositionInLine = getInterpreter.getCharPositionInLine();
                _tokenStartLine = getInterpreter.getLine;
                _text = null;
                // Repeated for as long as the matched type is MORE.
                do {
                    _type = TokenConstantDefinition.INVALID_TYPE;
                    debug(Lexer) {
                        import std.stdio;
                        writefln("nextToken line = %s at %s in mode %s at index %s",
                                 _tokenStartLine,
                                 cast(char)_input.LA(1),
                                 _mode,
                                 _input.index);
                    }
                    int ttype;
                    try {
                        ttype = getInterpreter.match(_input, _mode);
                    }
                    catch (LexerNoViableAltException e) {
                        notifyListeners(e);		// report error
                        recover(e);
                        // treat the unmatched input like a skipped token
                        ttype = SKIP;
                    }
                    // Remember EOF now; it is emitted on a later pass of the outer loop.
                    if (_input.LA(1) == IntStreamConstant.EOF) {
                        _hitEOF = true;
                    }
                    // Use the matched type only if _type was not assigned during
                    // the match (skip(), more() and setType() all assign it).
                    if (_type == TokenConstantDefinition.INVALID_TYPE) _type = ttype;
                    if (_type == SKIP) {
                        // nothing to return for this match; look for the next token
                        continue outer;
                    }
                }
                while (_type == MORE);

                // Create the token automatically unless one was already emitted.
                if (_token is null) {
                    emit();
                }
                return _token;
            }
        }
        finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            _input.release(tokenStartMarker);
        }
        // The loop above is only left through return or an exception.
        assert(0);
    }
219 
220     /**
221      * Instruct the lexer to skip creating a token for current lexer rule
222      * and look for another token.  nextToken() knows to keep looking when
223      * a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
224      * if token==null at end of any token rule, it creates one for you
225      * and emits it.
226      */
227     public void skip()
228     {
229         _type = SKIP;
230     }
231 
232     public void more()
233     {
234         _type = MORE;
235     }
236 
237     public void mode(int m)
238     {
239         _mode = m;
240     }
241 
242     public void pushMode(int m)
243     {
244         debug(LexerATNSimulator)
245             writefln("pushMode %s %s", m, _modeStack);
246         _modeStack.push(_mode);
247         mode(m);
248     }
249 
250     public int popMode()
251     {
252         assert (!_modeStack.isEmpty, "Empty stack");
253         debug(LexerATNSimulator)
254             writefln("popMode back to %s", _modeStack.peek);
255         mode(_modeStack.pop);
256         return _mode;
257     }
258 
259     /**
260      * Set the char stream and reset the lexer
261      * @uml
262      * @override
263      */
264     public override void setInputStream(IntStream input)
265     {
266         this._input = null;
267         this._tokenFactorySourcePair = tuple(this, _input);
268         reset();
269         this._input = cast(CharStream)input;
270         this._tokenFactorySourcePair = tuple(this, _input);
271     }
272 
273     public string getSourceName()
274     {
275         return _input.getSourceName();
276     }
277 
278     /**
279      * @uml
280      * @override
281      */
282     public override CharStream getInputStream()
283     {
284         return _input;
285     }
286 
287     /**
288      * By default does not support multiple emits per nextToken invocation
289      * for efficiency reasons.  Subclass and override this method, nextToken,
290      * and getToken (to push tokens into a list and pull from that list
291      * rather than a single variable as this implementation does).
292      */
293     public void emit(Token token)
294     {
295         this._token = token;
296     }
297 
298     /**
299      * The standard method called to automatically emit a token at the
300      * outermost lexical rule.  The token object should point into the
301      * char buffer start..stop.  If there is a text override in 'text',
302      * use that to set the token's text.  Override this method to emit
303      * custom Token objects or provide a new factory.
304      */
305     public Token emit()
306     {
307         Token t = tokenFactory_.create(_tokenFactorySourcePair, _type,
308                                        _text, _channel, _tokenStartCharIndex,
309                                        getCharIndex()-1, _tokenStartLine,
310                                        _tokenStartCharPositionInLine);
311         emit(t);
312         return t;
313     }
314 
315     public Token emitEOF()
316     {
317         int cpos = getCharPositionInLine();
318         int line = getLine();
319         Token eof = tokenFactory_.create(_tokenFactorySourcePair, TokenConstantDefinition.EOF, null, TokenConstantDefinition.DEFAULT_CHANNEL,
320                                          _input.index(), _input.index()-1,
321                                          line, cpos);
322         emit(eof);
323         return eof;
324     }
325 
326     public int getLine()
327     {
328         return getInterpreter().getLine();
329     }
330 
331     public int getCharPositionInLine()
332     {
333         return getInterpreter().getCharPositionInLine();
334     }
335 
336     public void setLine(int line)
337     {
338         getInterpreter().setLine(line);
339     }
340 
341     public void setCharPositionInLine(int charPositionInLine)
342     {
343         getInterpreter().setCharPositionInLine(charPositionInLine);
344     }
345 
346     /**
347      * What is the index of the current character of lookahead?
348      */
349     public int getCharIndex()
350     {
351         return _input.index();
352     }
353 
354     /**
355      * Return the text matched so far for the current token or any
356      * text override.
357      */
358     public string getText()
359     {
360         if (_text !is null) {
361             return _text;
362         }
363         return getInterpreter().getText(_input);
364     }
365 
366     /**
367      * Set the complete text of this token; it wipes any previous
368      * changes to the text.
369      */
370     public void setText(string text)
371     {
372         this._text = text;
373     }
374 
375     /**
376      * Override if emitting multiple tokens.
377      */
378     public Token getToken()
379     {
380         return _token;
381     }
382 
383     public void setToken(Token token)
384     {
385         this._token = token;
386     }
387 
388     public void setType(int ttype)
389     {
390         _type = ttype;
391     }
392 
393     public int getType()
394     {
395         return _type;
396     }
397 
398     public void setChannel(int channel)
399     {
400 	_channel = channel;
401     }
402 
403     public int getChannel()
404     {
405         return _channel;
406     }
407 
    /**
     * Channel names of this lexer.
     *
     * Returns: null in this base class
     *          (NOTE(review): presumably overridden by generated lexers - confirm)
     */
    public string[] getChannelNames()
    {
        return null;
    }
412 
    /**
     * Mode names of this lexer.
     *
     * Returns: null in this base class
     *          (NOTE(review): presumably overridden by generated lexers - confirm)
     */
    public string[] getModeNames()
    {
        return null;
    }
417 
    /**
     * Used to print out token names like ID during debugging and
     * error reporting.  Generated recognizers are expected to override
     * this method to return their own array of token names.
     *
     * Returns: null in this base class
     * @uml
     * @override
     */
    public override string[] getTokenNames()
    {
        return null;
    }
429 
430     /**
431      * Return a list of all Token objects in input char stream.
432      * Forces load of all tokens. Does not include EOF token.
433      */
434     public Token[] getAllTokens()
435     {
436 	Token[] tokens;
437         Token t = nextToken();
438         while (t.getType() != TokenConstantDefinition.EOF) {
439             tokens ~= t;
440             t = nextToken();
441         }
442         return tokens;
443     }
444 
445     public void recover(LexerNoViableAltException e)
446     {
447 	if (_input.LA(1) != IntStreamConstant.EOF) {
448             // skip a char and try again
449             getInterpreter().consume(_input);
450         }
451     }
452 
453     public void notifyListeners(LexerNoViableAltException e)
454     {
455         string text = _input.getText(Interval.of(_tokenStartCharIndex, _input.index()));
456         string msg = "token recognition error at: '" ~ getErrorDisplay(text) ~ "'";
457 
458         ANTLRErrorListener!(int, LexerATNSimulator) listener = getErrorListenerDispatch();
459         listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
460     }
461 
462     public string getErrorDisplay(string s)
463     {
464         auto buf = appender!string;
465         foreach (char c; s) {
466             buf.put(getErrorDisplay(c));
467         }
468         return buf.data;
469     }
470 
471     public string getErrorDisplay(int c)
472     {
473         string s = to!string(c);
474         switch ( c ) {
475         case TokenConstantDefinition.EOF :
476             s = "<EOF>";
477             break;
478         case '\n' :
479             s = "\\n";
480             break;
481         case '\t' :
482             s = "\\t";
483             break;
484         case '\r' :
485             s = "\\r";
486             break;
487         default: break;
488         }
489         return s;
490     }
491 
492     public string getCharErrorDisplay(int c)
493     {
494         string s = getErrorDisplay(c);
495         return "'" ~ s ~ "'";
496     }
497 
498     /**
499      * Lexers can normally match any char in it's vocabulary after matching
500      * a token, so do the easy thing and just kill a character and hope
501      * it all works out.  You can instead use the rule invocation stack
502      * to do sophisticated error recovery if you are in a fragment rule.
503      */
504     public void recover(RecognitionException re)
505     {
506 	//System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
507         //re.printStackTrace();
508         // TODO: Do we lose character or line position information?
509         _input.consume();
510     }
511 
    /**
     * Action hook; this base implementation does nothing.
     *
     * Params:
     *     interfaceRuleContext = rule context passed by the caller (unused here)
     *     ruleIndex = rule index passed by the caller (unused here)
     *     actionIndex = action index passed by the caller (unused here)
     * @uml
     * @override
     */
    public override void action(InterfaceRuleContext interfaceRuleContext, int ruleIndex,
                                int actionIndex)
    {
    }
520 
521     public override final TokenFactory!CommonToken tokenFactory()
522     {
523         return this.tokenFactory_;
524     }
525 
526     public override final void tokenFactory(TokenFactory!CommonToken tokenFactory)
527     {
528         this.tokenFactory_ = tokenFactory;
529     }
530 
531 }