1 /*
2 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
3 * Use of this file is governed by the BSD 3-clause license that
4 * can be found in the LICENSE.txt file in the project root.
5 */
6
7 module antlr.v4.runtime.Lexer;
8
9 import std.stdio;
10 import std.typecons;
11 import std.array;
12 import std.conv;
13 import std.variant;
14 import antlr.v4.runtime.ANTLRErrorListener;
15 import antlr.v4.runtime.Recognizer;
16 import antlr.v4.runtime.RecognitionException;
17 import antlr.v4.runtime.atn.LexerATNSimulator;
18 import antlr.v4.runtime.Token;
19 import antlr.v4.runtime.TokenConstantDefinition;
20 import antlr.v4.runtime.TokenSource;
21 import antlr.v4.runtime.InterfaceLexer;
22 import antlr.v4.runtime.TokenFactory;
23 import antlr.v4.runtime.CharStream;
24 import antlr.v4.runtime.IntStream;
25 import antlr.v4.runtime.IntStreamConstant;
26 import antlr.v4.runtime.CommonToken;
27 import antlr.v4.runtime.CommonTokenFactory;
28 import antlr.v4.runtime.IllegalStateException;
29 import antlr.v4.runtime.LexerNoViableAltException;
30 import antlr.v4.runtime.misc;
31 import antlr.v4.runtime.InterfaceRuleContext;
32
/**
 * Named pair of the token source (field "a") and the character stream
 * (field "b") it reads from. The lexer passes this pair to the token
 * factory whenever it creates a token.
 */
alias TokenFactorySourcePair = Tuple!(TokenSource, "a", CharStream, "b");
34
/**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
 * uses simplified match() and error recovery mechanisms in the interest
 * of speed.
 */
abstract class Lexer : Recognizer!(int, LexerATNSimulator), TokenSource, InterfaceLexer
{

    /**
     * The mode the lexer is put into by reset().
     */
    enum int DEFAULT_MODE = 0;

    /**
     * Sentinel token type set by more(); nextToken() keeps matching while
     * the current type equals this value.
     */
    enum int MORE = -2;

    /**
     * Sentinel token type set by skip(); nextToken() discards the current
     * match and starts over when the current type equals this value.
     */
    enum int SKIP = -3;

    /**
     * Channel assigned to a token unless a rule changes it.
     */
    enum int DEFAULT_TOKEN_CHANNEL = TokenConstantDefinition.DEFAULT_CHANNEL;

    /**
     * Alias for the hidden token channel.
     */
    enum int HIDDEN = TokenConstantDefinition.HIDDEN_CHANNEL;

    /**
     * Smallest character value of the lexer vocabulary.
     */
    enum int MIN_CHAR_VALUE = 0;

    /**
     * Largest character value of the lexer vocabulary (the highest
     * Unicode code point).
     */
    enum int MAX_CHAR_VALUE = 0x10FFFF;

    /**
     * The character stream tokens are matched on.
     */
    public CharStream _input;

    /**
     * (this lexer, current input) pair handed to the token factory.
     */
    protected TokenFactorySourcePair _tokenFactorySourcePair;

    /**
     * How to create token objects
     * @uml
     * @read
     * @write
     * @override
     */
    public TokenFactory!CommonToken tokenFactory_;

    /**
     * The goal of all lexer rules/methods is to create a token object.
     * This is an instance variable as multiple rules may collaborate to
     * create a single token. nextToken will return this object after
     * matching lexer rule(s). If you subclass to allow multiple token
     * emissions, then set this to the last token to be matched or
     * something nonnull so that the auto token emit mechanism will not
     * emit another token.
     */
    public Token _token;

    /**
     * Modes saved by pushMode() and restored by popMode().
     */
    public IntegerStack _modeStack;

    /**
     * What character index in the stream did the current token start at?
     * Needed, for example, to get the text for current token. Set at
     * the start of nextToken.
     */
    public size_t _tokenStartCharIndex;

    /**
     * The line on which the first character of the token resides
     */
    public int _tokenStartLine;

    /**
     * The character position of first character within the line
     */
    public int _tokenStartCharPositionInLine;

    /**
     * Set by nextToken() once the lookahead after a match is EOF; the
     * following loop iteration then emits the EOF token.
     */
    public bool _hitEOF;

    /**
     * The channel number for the current token
     */
    public int _channel;

    /**
     * The token type for the current token
     */
    public int _type;

    /**
     * The lexer mode passed to the interpreter when matching.
     */
    public int _mode;

    /**
     * You can set the text for the current token to override what is in
     * the input char buffer. Use setText() or can set this instance var.
     * A Variant without a value means "no override".
     */
    public Variant _text;

    /**
     * Creates a lexer without an input stream; call setInputStream()
     * before requesting tokens.
     *
     * The token factory and the mode stack are initialized here as well,
     * so that reset() (which setInputStream() calls), pushMode() and
     * emit() do not dereference null on a default-constructed lexer.
     */
    public this()
    {
        tokenFactory_ = CommonTokenFactory.DEFAULT;
        _modeStack = new IntegerStack();
    }

    /**
     * Creates a lexer reading from the given character stream.
     *
     * Params:
     *   input = the character stream to tokenize
     */
    public this(CharStream input)
    {
        this();
        this._input = input;
        this._tokenFactorySourcePair = tuple(this, input);
    }

    /**
     * Rewinds the input (if any) and puts all lexer state variables,
     * the mode stack and the interpreter back into their initial state.
     */
    public void reset()
    {
        // reset Lexer state variables
        if (_input !is null) {
            _input.seek(0); // rewind the input
        }
        _token = null;
        _type = TokenConstantDefinition.INVALID_TYPE;
        _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
        // size_t is unsigned, so this stores size_t.max as the "unset" marker
        _tokenStartCharIndex = -1;
        _tokenStartCharPositionInLine = -1;
        _tokenStartLine = -1;
        // A bare `_text.init;` only evaluates the property; the assignment
        // is required to actually drop a previous text override.
        _text = Variant.init;
        _hitEOF = false;
        _mode = Lexer.DEFAULT_MODE;
        _modeStack.clear();
        getInterpreter().reset();
    }

    /**
     * Return a token from this source; i.e., match a token on the char
     * stream.
     *
     * Returns: the next token; the EOF token once the end of input has
     *     been reached.
     * Throws: IllegalStateException when no input stream is set.
     */
    public Token nextToken()
    {
        if (_input is null) {
            throw new IllegalStateException("nextToken requires a non-null input stream.");
        }
        // Mark start location in char stream so unbuffered streams are
        // guaranteed at least have text of current token
        int tokenStartMarker = _input.mark();
        try {
        outer:
            while (true) {
                if (_hitEOF) {
                    emitEOF();
                    return _token;
                }
                _token = null;
                _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
                _tokenStartCharIndex = _input.index;
                _tokenStartCharPositionInLine = getInterpreter.getCharPositionInLine();
                _tokenStartLine = getInterpreter.getLine;
                // Clear any text override left over from the previous token;
                // `_text.init;` alone would be a no-op.
                _text = Variant.init;
                do {
                    _type = TokenConstantDefinition.INVALID_TYPE;
                    debug(Lexer) {
                        import std.stdio;
                        writefln("nextToken line = %s at %s: %s in mode %s at index %s",
                                 _tokenStartLine,
                                 _tokenStartCharPositionInLine,
                                 _input.LA(1),
                                 _mode,
                                 _input.index);
                    }
                    int ttype;
                    try {
                        ttype = getInterpreter.match(_input, _mode);
                    }
                    catch (LexerNoViableAltException e) {
                        notifyListeners(e); // report error
                        recover(e);
                        ttype = SKIP;
                    }
                    if (_input.LA(1) == IntStreamConstant.EOF) {
                        _hitEOF = true;
                    }
                    // A lexer action may already have set _type (e.g. via
                    // skip(), more() or setType()); only fall back to the
                    // matched type when it did not.
                    if (_type == TokenConstantDefinition.INVALID_TYPE)
                        _type = ttype;
                    if (_type == SKIP) {
                        continue outer;
                    }
                }
                while (_type == MORE);

                if (_token is null) {
                    emit();
                }
                return _token;
            }
        }
        finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            _input.release(tokenStartMarker);
        }
        assert(0);
    }

    /**
     * Instruct the lexer to skip creating a token for current lexer rule
     * and look for another token. nextToken() knows to keep looking when
     * a lexer rule finishes with the token type set to SKIP. Recall that
     * if token==null at end of any token rule, it creates one for you
     * and emits it.
     */
    public void skip()
    {
        _type = SKIP;
    }

    /**
     * Marks the current match as incomplete: nextToken() continues
     * matching and the following match is added to the same token.
     */
    public void more()
    {
        _type = MORE;
    }

    /**
     * Switches the lexer to mode m without touching the mode stack.
     */
    public void mode(int m)
    {
        _mode = m;
    }

    /**
     * Saves the current mode on the mode stack and switches to mode m.
     */
    public void pushMode(int m)
    {
        debug(LexerATNSimulator)
            writefln("pushMode %s %s", m, _modeStack);
        _modeStack.push(_mode);
        mode(m);
    }

    /**
     * Restores the mode most recently saved by pushMode().
     *
     * Asserts that the mode stack is not empty.
     *
     * Returns: the mode that is active after popping.
     */
    public int popMode()
    {
        assert (!_modeStack.isEmpty, "Empty stack");
        debug(LexerATNSimulator)
            writefln("popMode back to %s", _modeStack.peek);
        mode(_modeStack.pop);
        return _mode;
    }

    /**
     * Set the char stream and reset the lexer
     * @uml
     * @override
     */
    public override void setInputStream(IntStream input)
    {
        // Detach the old stream first so that reset() does not rewind it.
        this._input = null;
        this._tokenFactorySourcePair = tuple(this, _input);
        reset();
        this._input = cast(CharStream)input;
        this._tokenFactorySourcePair = tuple(this, _input);
    }

    /**
     * Returns the source name reported by the current input stream.
     */
    public string getSourceName()
    {
        return _input.getSourceName();
    }

    /**
     * Returns the character stream this lexer reads from.
     * @uml
     * @override
     */
    public override CharStream getInputStream()
    {
        return _input;
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons. Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    public void emit(Token token)
    {
        this._token = token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule. The token object should point into the
     * char buffer start..stop. If there is a text override in 'text',
     * use that to set the token's text. Override this method to emit
     * custom Token objects or provide a new factory.
     *
     * Returns: the token that was created and emitted.
     */
    public Token emit()
    {
        Variant v = _text;
        Token t = tokenFactory_.create(_tokenFactorySourcePair, _type,
                                       v, _channel, _tokenStartCharIndex,
                                       getCharIndex()-1, _tokenStartLine,
                                       _tokenStartCharPositionInLine);
        emit(t);
        return t;
    }

    /**
     * Creates an EOF token on the default channel at the current input
     * position and emits it.
     *
     * Returns: the EOF token.
     */
    public Token emitEOF()
    {
        int cpos = getCharPositionInLine();
        int line = getLine();
        Variant Null; // no text override for EOF
        Token eof = tokenFactory_.create(_tokenFactorySourcePair, TokenConstantDefinition.EOF, Null, TokenConstantDefinition.DEFAULT_CHANNEL,
                                         _input.index(), _input.index()-1,
                                         line, cpos);
        emit(eof);
        return eof;
    }

    /**
     * Returns the current line as tracked by the interpreter.
     */
    public int getLine()
    {
        return getInterpreter().getLine();
    }

    /**
     * Returns the current position within the line as tracked by the
     * interpreter.
     */
    public int getCharPositionInLine()
    {
        return getInterpreter().getCharPositionInLine();
    }

    /**
     * Sets the interpreter's current line.
     */
    public void setLine(int line)
    {
        getInterpreter().setLine(line);
    }

    /**
     * Sets the interpreter's current position within the line.
     */
    public void setCharPositionInLine(int charPositionInLine)
    {
        getInterpreter().setCharPositionInLine(charPositionInLine);
    }

    /**
     * What is the index of the current character of lookahead?
     */
    public size_t getCharIndex()
    {
        return _input.index();
    }

    /**
     * Return the text matched so far for the current token or any
     * text override.
     */
    public Variant getText()
    {
        Variant Null;
        if (_text !is Null) {
            return _text;
        }
        Variant v = getInterpreter().getText(_input);
        return v;
    }

    /**
     * Set the complete text of this token; it wipes any previous
     * changes to the text.
     */
    public void setText(Variant text)
    {
        this._text = text;
    }

    /**
     * Override if emitting multiple tokens.
     */
    public Token getToken()
    {
        return _token;
    }

    /**
     * Replaces the token that nextToken() is going to return.
     */
    public void setToken(Token token)
    {
        this._token = token;
    }

    /**
     * Sets the token type of the current token.
     */
    public void setType(int ttype)
    {
        _type = ttype;
    }

    /**
     * Returns the token type of the current token.
     */
    public int getType()
    {
        return _type;
    }

    /**
     * Sets the channel of the current token.
     */
    public void setChannel(int channel)
    {
        _channel = channel;
    }

    /**
     * Returns the channel of the current token.
     */
    public int getChannel()
    {
        return _channel;
    }

    /**
     * Channel names of this lexer; the base implementation returns null.
     */
    public string[] getChannelNames()
    {
        return null;
    }

    /**
     * Mode names of this lexer; the base implementation returns null.
     */
    public string[] getModeNames()
    {
        return null;
    }

    /**
     * Used to print out token names like ID during debugging and
     * error reporting. The generated parsers implement a method
     * that overrides this to point to their String[] tokenNames
     * @uml
     * @override
     */
    public override string[] getTokenNames()
    {
        return null;
    }

    /**
     * Return a list of all Token objects in input char stream.
     * Forces load of all tokens. Does not include EOF token.
     */
    public Token[] getAllTokens()
    {
        Token[] tokens;
        Token t = nextToken();
        while (t.getType() != TokenConstantDefinition.EOF) {
            tokens ~= t;
            t = nextToken();
        }
        return tokens;
    }

    /**
     * Recovers from a failed match by consuming one character, unless
     * the lookahead is already EOF.
     */
    public void recover(LexerNoViableAltException e)
    {
        if (_input.LA(1) != IntStreamConstant.EOF) {
            // skip a char and try again
            getInterpreter().consume(_input);
        }
    }

    /**
     * Reports a token recognition error to the registered error
     * listeners. The message quotes the input from the start of the
     * current token up to the current input index.
     */
    public void notifyListeners(LexerNoViableAltException e)
    {
        auto text = _input.getText(Interval.of(to!int(_tokenStartCharIndex), to!int(_input.index)));
        auto msg = "token recognition error at: '" ~ getErrorDisplay(text) ~ "'";
        ANTLRErrorListener listener = getErrorListenerDispatch();
        listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
    }

    /**
     * Returns s with every character replaced by its display form,
     * see getErrorDisplay(dchar).
     */
    public string getErrorDisplay(string s)
    {
        auto buf = appender!string;
        foreach (dchar c; s) {
            buf.put(getErrorDisplay(c));
        }
        return buf.data;
    }

    /**
     * Returns a printable form of c for error messages: EOF becomes
     * "<EOF>", newline, tab and carriage return become their escape
     * sequences, every other character is returned unchanged.
     */
    public string getErrorDisplay(dchar c)
    {
        string s;
        switch ( c ) {
        case TokenConstantDefinition.EOF :
            s = "<EOF>";
            break;
        case '\n' :
            s = "\\n";
            break;
        case '\t' :
            s = "\\t";
            break;
        case '\r' :
            s = "\\r";
            break;
        default:
            s ~= c;
            break;
        }
        return s;
    }

    /**
     * Returns the display form of c wrapped in single quotes.
     */
    public string getCharErrorDisplay(dchar c)
    {
        string s = getErrorDisplay(c);
        return "'" ~ s ~ "'";
    }

    /**
     * Lexers can normally match any char in its vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out. You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     */
    public void recover(RecognitionException re)
    {
        // TODO: Do we lose character or line position information?
        _input.consume();
    }

    /**
     * Hook for lexer actions; the base implementation does nothing.
     * @uml
     * @override
     */
    public override void action(InterfaceRuleContext interfaceRuleContext, int ruleIndex,
                                int actionIndex)
    {
    }

    /**
     * Returns the factory used to create token objects.
     */
    public override final TokenFactory!CommonToken tokenFactory()
    {
        return this.tokenFactory_;
    }

    /**
     * Sets the factory used to create token objects.
     */
    public override final void tokenFactory(TokenFactory!CommonToken tokenFactory)
    {
        this.tokenFactory_ = tokenFactory;
    }

}