antlr.v4.runtime.Lexer source code

1 /*
2  * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
3  * Use of this file is governed by the BSD 3-clause license that
4  * can be found in the LICENSE.txt file in the project root.
5  */
6 
7 module antlr.v4.runtime.Lexer;
8 
9 import std.stdio;
10 import std.typecons;
11 import std.array;
12 import std.conv;
13 import antlr.v4.runtime.ANTLRErrorListener;
14 import antlr.v4.runtime.Recognizer;
15 import antlr.v4.runtime.RecognitionException;
16 import antlr.v4.runtime.atn.LexerATNSimulator;
17 import antlr.v4.runtime.Token;
18 import antlr.v4.runtime.TokenConstantDefinition;
19 import antlr.v4.runtime.TokenSource;
20 import antlr.v4.runtime.InterfaceLexer;
21 import antlr.v4.runtime.TokenFactory;
22 import antlr.v4.runtime.CharStream;
23 import antlr.v4.runtime.IntStream;
24 import antlr.v4.runtime.IntStreamConstant;
25 import antlr.v4.runtime.CommonToken;
26 import antlr.v4.runtime.CommonTokenFactory;
27 import antlr.v4.runtime.IllegalStateException;
28 import antlr.v4.runtime.LexerNoViableAltException;
29 import antlr.v4.runtime.misc;
30 import antlr.v4.runtime.InterfaceRuleContext;
31 
32 alias TokenFactorySourcePair = Tuple!(TokenSource, "a", CharStream, "b");
33 
34 /**
35  * A lexer is a recognizer that draws input symbols from a character stream.
36  * Lexer grammars result in a subclass of this object. A Lexer object
37  * uses simplified match() and error recovery mechanisms in the interest
38  * of speed.
39  */
abstract class Lexer : Recognizer!(int, LexerATNSimulator), TokenSource, InterfaceLexer
{

    /// Mode a lexer is put into by reset().
    public static immutable int DEFAULT_MODE = 0;

    /// Token type sentinel: nextToken() keeps matching without emitting.
    public static immutable int MORE = -2;

    /// Token type sentinel: nextToken() drops the match and starts a new token.
    public static immutable int SKIP = -3;

    /// Channel assigned to a token at the start of every nextToken() round.
    public static immutable int DEFAULT_TOKEN_CHANNEL = TokenConstantDefinition.DEFAULT_CHANNEL;

    /// Alias for the hidden token channel.
    public static immutable int HIDDEN = TokenConstantDefinition.HIDDEN_CHANNEL;

    /// Smallest character value (char.min, i.e. 0).
    public static immutable int MIN_CHAR_VALUE = char.min;

    // NOTE(review): char.max is 0xFF (a single UTF-8 code unit), which is
    // narrower than the Unicode code point range - confirm this is intended.
    /// Largest character value (char.max, i.e. 0xFF).
    public static immutable int MAX_CHAR_VALUE = char.max;

    /// Character stream the lexer reads from; may be null until set.
    public CharStream _input;

    /// (token source, char stream) pair handed to the token factory.
    protected TokenFactorySourcePair _tokenFactorySourcePair;

    /**
     * How to create token objects
     * @uml
     * @read
     * @write
     * @override
     */
    public TokenFactory!CommonToken tokenFactory_;

    /**
     * The goal of all lexer rules/methods is to create a token object.
     * This is an instance variable as multiple rules may collaborate to
     * create a single token.  nextToken will return this object after
     * matching lexer rule(s).  If you subclass to allow multiple token
     * emissions, then set this to the last token to be matched or
     * something nonnull so that the auto token emit mechanism will not
     * emit another token.
     */
    public Token _token;

    /// Modes saved by pushMode() and restored by popMode().
    public IntegerStack _modeStack;

    /**
     * What character index in the stream did the current token start at?
     * Needed, for example, to get the text for current token.  Set at
     * the start of nextToken.
     */
    public int _tokenStartCharIndex = -1;

    /**
     * The line on which the first character of the token resides
     */
    public int _tokenStartLine;

    /**
     * The character position of first character within the line
     */
    public int _tokenStartCharPositionInLine;

    /**
     * Set by nextToken() once the lookahead is EOF; the following round
     * of nextToken() then emits the EOF token.
     */
    public bool _hitEOF;

    /**
     * The channel number for the current token
     */
    public int _channel;

    /**
     * The token type for the current token
     */
    public int _type;

    /// The current lexer mode, passed to the interpreter on every match.
    public int _mode;

    /**
     * You can set the text for the current token to override what is in
     * the input char buffer.  Use setText() or can set this instance var.
     */
    public string _text;

    /**
     * Create a lexer without an input stream.
     *
     * The token factory and the mode stack are initialised here so that a
     * later setInputStream() (which calls reset()) and emit() do not
     * dereference null members.
     */
    public this()
    {
        tokenFactory_ = CommonTokenFactory.DEFAULT;
        _modeStack = new IntegerStack();
    }

    /**
     * Create a lexer reading from the given character stream.
     *
     * Params:
     *     input = character stream to tokenize
     */
    public this(CharStream input)
    {
        this();
        this._input = input;
        this._tokenFactorySourcePair = tuple(this, input);
    }

    /**
     * Rewind the input (if any) and put all lexer state variables, the
     * mode stack and the interpreter back into their initial state.
     */
    public void reset()
    {
        // wack Lexer state variables
        if (_input !is null) {
            _input.seek(0); // rewind the input
        }
        _token = null;
        _type = TokenConstantDefinition.INVALID_TYPE;
        _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
        _tokenStartCharIndex = -1;
        _tokenStartCharPositionInLine = -1;
        _tokenStartLine = -1;
        _text = null;
        _hitEOF = false;
        _mode = Lexer.DEFAULT_MODE;
        _modeStack.clear();
        getInterpreter().reset();
    }

    /**
     * Return a token from this source; i.e., match a token on the char
     * stream.
     *
     * Throws: IllegalStateException when no input stream has been set.
     */
    public Token nextToken()
    {
        if (_input is null) {
            throw new IllegalStateException("nextToken requires a non-null input stream.");
        }
        // Mark start location in char stream so unbuffered streams are
        // guaranteed at least have text of current token
        int tokenStartMarker = _input.mark();
        try{
        outer:
            while (true) {
                if (_hitEOF) {
                    emitEOF();
                    return _token;
                }
                // Start of a fresh token: clear per-token state and
                // remember where it begins.
                _token = null;
                _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
                _tokenStartCharIndex = _input.index;
                _tokenStartCharPositionInLine = getInterpreter.getCharPositionInLine();
                _tokenStartLine = getInterpreter.getLine;
                _text = null;
                do {
                    _type = TokenConstantDefinition.INVALID_TYPE;
                    debug(Lexer) {
                        import std.stdio;
                        writefln("nextToken line = %s at %s in mode %s at index %s",
                                 _tokenStartLine,
                                 cast(char)_input.LA(1),
                                 _mode,
                                 _input.index);
                    }
                    int ttype;
                    try {
                        ttype = getInterpreter.match(_input, _mode);
                    }
                    catch (LexerNoViableAltException e) {
                        notifyListeners(e);		// report error
                        recover(e);
                        ttype = SKIP;
                    }
                    if (_input.LA(1) == IntStreamConstant.EOF) {
                        _hitEOF = true;
                    }
                    // Keep a type set during matching (e.g. via skip(),
                    // more() or setType()); otherwise use the matched type.
                    if (_type == TokenConstantDefinition.INVALID_TYPE) _type = ttype;
                    if (_type == SKIP) {
                        continue outer;
                    }
                }
                while (_type == MORE);

                if (_token is null) {
                    emit();
                }
                return _token;
            }
        }
        finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            _input.release(tokenStartMarker);
        }
        assert(0);
    }

    /**
     * Instruct the lexer to skip creating a token for current lexer rule
     * and look for another token.  nextToken() knows to keep looking when
     * a lexer rule finishes with _type set to SKIP.  Recall that
     * if _token is null at end of any token rule, it creates one for you
     * and emits it.
     */
    public void skip()
    {
        _type = SKIP;
    }

    /**
     * Mark the current match as MORE: nextToken() keeps matching and
     * does not emit a token yet.
     */
    public void more()
    {
        _type = MORE;
    }

    /**
     * Switch the lexer to mode m without touching the mode stack.
     */
    public void mode(int m)
    {
        _mode = m;
    }

    /**
     * Save the current mode on the mode stack and switch to mode m.
     */
    public void pushMode(int m)
    {
        debug(LexerATNSimulator)
            writefln("pushMode %s %s", m, _modeStack);
        _modeStack.push(_mode);
        mode(m);
    }

    /**
     * Restore the mode most recently saved by pushMode().
     *
     * Returns: the mode that is active after the pop.
     */
    public int popMode()
    {
        // NOTE(review): asserts are compiled out with -release, so an empty
        // stack is then only caught by whatever IntegerStack.pop does.
        assert (!_modeStack.isEmpty, "Empty stack");
        debug(LexerATNSimulator)
            writefln("popMode back to %s", _modeStack.peek);
        mode(_modeStack.pop);
        return _mode;
    }

    /**
     * Set the char stream and reset the lexer
     * @uml
     * @override
     */
    public override void setInputStream(IntStream input)
    {
        // Detach the old stream first so that reset() does not rewind it.
        this._input = null;
        this._tokenFactorySourcePair = tuple(this, _input);
        reset();
        // NOTE(review): the class cast yields null when input is not a
        // CharStream; nextToken() will then throw IllegalStateException.
        this._input = cast(CharStream)input;
        this._tokenFactorySourcePair = tuple(this, _input);
    }

    /**
     * Return the source name reported by the current input stream.
     */
    public string getSourceName()
    {
        return _input.getSourceName();
    }

    /**
     * @uml
     * @override
     */
    public override CharStream getInputStream()
    {
        return _input;
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons.  Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    public void emit(Token token)
    {
        this._token = token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule.  The token object should point into the
     * char buffer start..stop.  If there is a text override in 'text',
     * use that to set the token's text.  Override this method to emit
     * custom Token objects or provide a new factory.
     */
    public Token emit()
    {
        Token t = tokenFactory_.create(_tokenFactorySourcePair, _type,
                                       _text, _channel, _tokenStartCharIndex,
                                       getCharIndex()-1, _tokenStartLine,
                                       _tokenStartCharPositionInLine);
        emit(t);
        return t;
    }

    /**
     * Create the EOF token at the current input position, emit it and
     * return it.  Its stop index is one less than its start index.
     */
    public Token emitEOF()
    {
        int cpos = getCharPositionInLine();
        int line = getLine();
        Token eof = tokenFactory_.create(_tokenFactorySourcePair, TokenConstantDefinition.EOF, null, TokenConstantDefinition.DEFAULT_CHANNEL,
                                         _input.index(), _input.index()-1,
                                         line, cpos);
        emit(eof);
        return eof;
    }

    /// Current line as tracked by the interpreter.
    public int getLine()
    {
        return getInterpreter().getLine();
    }

    /// Current character position in the line as tracked by the interpreter.
    public int getCharPositionInLine()
    {
        return getInterpreter().getCharPositionInLine();
    }

    /// Forward the line number to the interpreter.
    public void setLine(int line)
    {
        getInterpreter().setLine(line);
    }

    /// Forward the character position in the line to the interpreter.
    public void setCharPositionInLine(int charPositionInLine)
    {
        getInterpreter().setCharPositionInLine(charPositionInLine);
    }

    /**
     * What is the index of the current character of lookahead?
     */
    public int getCharIndex()
    {
        return _input.index();
    }

    /**
     * Return the text matched so far for the current token or any
     * text override.
     */
    public string getText()
    {
        if (_text !is null) {
            return _text;
        }
        return getInterpreter().getText(_input);
    }

    /**
     * Set the complete text of this token; it wipes any previous
     * changes to the text.
     */
    public void setText(string text)
    {
        this._text = text;
    }

    /**
     * Override if emitting multiple tokens.
     */
    public Token getToken()
    {
        return _token;
    }

    /// Replace the token that nextToken() will return.
    public void setToken(Token token)
    {
        this._token = token;
    }

    /// Set the type of the token currently being matched.
    public void setType(int ttype)
    {
        _type = ttype;
    }

    /// Type of the token currently being matched.
    public int getType()
    {
        return _type;
    }

    /// Set the channel of the token currently being matched.
    public void setChannel(int channel)
    {
        _channel = channel;
    }

    /// Channel of the token currently being matched.
    public int getChannel()
    {
        return _channel;
    }

    /// Base implementation knows no channel names and returns null.
    public string[] getChannelNames()
    {
        return null;
    }

    /// Base implementation knows no mode names and returns null.
    public string[] getModeNames()
    {
        return null;
    }

    /**
     * Used to print out token names like ID during debugging and
     * error reporting.  The generated parsers implement a method
     * that overrides this to point to their String[] tokenNames
     * @uml
     * @override
     */
    public override string[] getTokenNames()
    {
        return null;
    }

    /**
     * Return a list of all Token objects in input char stream.
     * Forces load of all tokens. Does not include EOF token.
     */
    public Token[] getAllTokens()
    {
        Token[] tokens;
        Token t = nextToken();
        while (t.getType() != TokenConstantDefinition.EOF) {
            tokens ~= t;
            t = nextToken();
        }
        return tokens;
    }

    /**
     * Recover from a failed match by consuming one character through the
     * interpreter, unless the lookahead is already EOF.
     */
    public void recover(LexerNoViableAltException e)
    {
        if (_input.LA(1) != IntStreamConstant.EOF) {
            // skip a char and try again
            getInterpreter().consume(_input);
        }
    }

    /**
     * Report a token recognition error to the registered error listeners.
     * The message quotes the input from the start of the current token up
     * to the current input index, escaped via getErrorDisplay().
     */
    public void notifyListeners(LexerNoViableAltException e)
    {
        string text = _input.getText(Interval.of(_tokenStartCharIndex, _input.index()));
        string msg = "token recognition error at: '" ~ getErrorDisplay(text) ~ "'";

        ANTLRErrorListener!(int, LexerATNSimulator) listener = getErrorListenerDispatch();
        listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
    }

    /**
     * Return s with every character replaced by its error display form
     * (see the int overload).
     *
     * The string is decoded code point by code point so that multi-byte
     * UTF-8 characters stay intact; byDchar substitutes U+FFFD for invalid
     * sequences instead of throwing while an error is being reported.
     */
    public string getErrorDisplay(string s)
    {
        import std.utf : byDchar;

        auto buf = appender!string;
        foreach (dchar c; byDchar(s)) {
            buf.put(getErrorDisplay(cast(int) c));
        }
        return buf.data;
    }

    /**
     * Return a printable form of the character value c for error messages:
     * "<EOF>" for the EOF marker, the two-character escapes \n, \t and \r
     * for newline, tab and carriage return, and the character itself for
     * every other valid code point.
     *
     * Values that are not valid Unicode code points are rendered as their
     * decimal number so that this method never throws.
     */
    public string getErrorDisplay(int c)
    {
        import std.utf : isValidDchar;

        switch ( c ) {
        case TokenConstantDefinition.EOF :
            return "<EOF>";
        case '\n' :
            return "\\n";
        case '\t' :
            return "\\t";
        case '\r' :
            return "\\r";
        default: break;
        }
        if (c >= 0 && isValidDchar(cast(dchar) c)) {
            // Convert as a character; to!string on an int would yield the
            // decimal code ("97") instead of the character ("a").
            return to!string(cast(dchar) c);
        }
        return to!string(c);
    }

    /**
     * Return the error display form of c wrapped in single quotes.
     */
    public string getCharErrorDisplay(int c)
    {
        string s = getErrorDisplay(c);
        return "'" ~ s ~ "'";
    }

    /**
     * Lexers can normally match any char in its vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out.  You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     */
    public void recover(RecognitionException re)
    {
        //System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
        //re.printStackTrace();
        // TODO: Do we lose character or line position information?
        _input.consume();
    }

    /**
     * Hook for custom lexer actions; the base implementation does nothing.
     * @uml
     * @override
     */
    public override void action(InterfaceRuleContext interfaceRuleContext, int ruleIndex,
                                int actionIndex)
    {
    }

    /// Token factory currently used by emit() and emitEOF().
    public override final TokenFactory!CommonToken tokenFactory()
    {
        return this.tokenFactory_;
    }

    /// Replace the token factory used by emit() and emitEOF().
    public override final void tokenFactory(TokenFactory!CommonToken tokenFactory)
    {
        this.tokenFactory_ = tokenFactory;
    }

}