/*
 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

module antlr.v4.runtime.Lexer;

import std.stdio;
import std.typecons;
import std.array;
import std.conv;
import std.variant;
import antlr.v4.runtime.ANTLRErrorListener;
import antlr.v4.runtime.Recognizer;
import antlr.v4.runtime.RecognitionException;
import antlr.v4.runtime.atn.LexerATNSimulator;
import antlr.v4.runtime.Token;
import antlr.v4.runtime.TokenConstantDefinition;
import antlr.v4.runtime.TokenSource;
import antlr.v4.runtime.InterfaceLexer;
import antlr.v4.runtime.TokenFactory;
import antlr.v4.runtime.CharStream;
import antlr.v4.runtime.IntStream;
import antlr.v4.runtime.IntStreamConstant;
import antlr.v4.runtime.CommonToken;
import antlr.v4.runtime.CommonTokenFactory;
import antlr.v4.runtime.IllegalStateException;
import antlr.v4.runtime.LexerNoViableAltException;
import antlr.v4.runtime.misc;
import antlr.v4.runtime.InterfaceRuleContext;

/// Pair of (token source, char stream) handed to the token factory.
alias TokenFactorySourcePair = Tuple!(TokenSource, "a", CharStream, "b");

/**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
 * uses simplified match() and error recovery mechanisms in the interest
 * of speed.
 */
abstract class Lexer : Recognizer!(int, LexerATNSimulator), TokenSource, InterfaceLexer
{

    public static immutable int DEFAULT_MODE = 0;

    /// Pseudo token type set by more(): keep matching into the same token.
    public static immutable int MORE = -2;

    /// Pseudo token type set by skip(): discard the match and look for another token.
    public static immutable int SKIP = -3;

    public static immutable int DEFAULT_TOKEN_CHANNEL = TokenConstantDefinition.DEFAULT_CHANNEL;

    public static immutable int HIDDEN = TokenConstantDefinition.HIDDEN_CHANNEL;

    public static immutable int MIN_CHAR_VALUE = char.min;

    // NOTE(review): in D `char.max` is 0xFF (a single UTF-8 code unit), so this
    // is not the full Unicode range. Left unchanged because other code may
    // depend on the value; confirm whether 0x10FFFF was intended.
    public static immutable int MAX_CHAR_VALUE = char.max;

    /// The character stream tokens are matched against.
    public CharStream _input;

    /// (this lexer, current input) pair passed to the token factory on every create().
    protected TokenFactorySourcePair _tokenFactorySourcePair;

    /**
     * How to create token objects
     * @uml
     * @read
     * @write
     * @override
     */
    public TokenFactory!CommonToken tokenFactory_;

    /**
     * The goal of all lexer rules/methods is to create a token object.
     * This is an instance variable as multiple rules may collaborate to
     * create a single token. nextToken will return this object after
     * matching lexer rule(s). If you subclass to allow multiple token
     * emissions, then set this to the last token to be matched or
     * something nonnull so that the auto token emit mechanism will not
     * emit another token.
     */
    public Token _token;

    /// Modes saved by pushMode() and restored by popMode().
    public IntegerStack _modeStack;

    /**
     * What character index in the stream did the current token start at?
     * Needed, for example, to get the text for current token. Set at
     * the start of nextToken.
     */
    public int _tokenStartCharIndex = -1;

    /**
     * The line on which the first character of the token resides
     */
    public int _tokenStartLine;

    /**
     * The character position of first character within the line
     */
    public int _tokenStartCharPositionInLine;

    /// Set by nextToken once the lookahead is EOF; the next iteration emits the EOF token.
    public bool _hitEOF;

    /**
     * The channel number for the current token
     */
    public int _channel;

    /**
     * The token type for the current token
     */
    public int _type;

    /// The lexer mode passed to the interpreter when matching.
    public int _mode;

    /**
     * You can set the text for the current token to override what is in
     * the input char buffer. Use setText() or can set this instance var.
     * An empty Variant (no value) means "no override".
     */
    public Variant _text;

    /**
     * Create a lexer without an input stream.
     *
     * The token factory and the mode stack are initialised here as well, so
     * that a later setInputStream() (which calls reset() and therefore
     * touches the mode stack) works on a default-constructed lexer.
     */
    public this()
    {
        tokenFactory_ = CommonTokenFactory.DEFAULT;
        _modeStack = new IntegerStack();
    }

    /**
     * Create a lexer reading from the given character stream.
     *
     * Params:
     *     input = the character stream to tokenize
     */
    public this(CharStream input)
    {
        this();
        this._input = input;
        this._tokenFactorySourcePair = tuple(this, input);
    }

    /**
     * Rewind the input (when there is one) and wipe all lexer state
     * variables, including the mode stack and the interpreter state.
     */
    public void reset()
    {
        // wipe Lexer state variables
        if (_input !is null) {
            _input.seek(0); // rewind the input
        }
        _token = null;
        _type = TokenConstantDefinition.INVALID_TYPE;
        _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
        _tokenStartCharIndex = -1;
        _tokenStartCharPositionInLine = -1;
        _tokenStartLine = -1;
        // `_text.init;` merely evaluated the type's initializer and left the
        // override in place; destroy() really puts the Variant back into its
        // .init state.
        destroy(_text);
        _hitEOF = false;
        _mode = Lexer.DEFAULT_MODE;
        _modeStack.clear();
        getInterpreter().reset();
    }

    /**
     * Return a token from this source; i.e., match a token on the char
     * stream.
     *
     * Throws: IllegalStateException when no input stream is set.
     */
    public Token nextToken()
    {
        if (_input is null) {
            throw new IllegalStateException("nextToken requires a non-null input stream.");
        }
        // Mark start location in char stream so unbuffered streams are
        // guaranteed at least have text of current token
        int tokenStartMarker = _input.mark();
        try {
        outer:
            while (true) {
                if (_hitEOF) {
                    emitEOF();
                    return _token;
                }
                _token = null;
                _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
                _tokenStartCharIndex = _input.index;
                _tokenStartCharPositionInLine = getInterpreter.getCharPositionInLine();
                _tokenStartLine = getInterpreter.getLine;
                // Drop any text override left over from the previous token
                // (the former `_text.init;` statement had no effect).
                destroy(_text);
                do {
                    _type = TokenConstantDefinition.INVALID_TYPE;
                    debug(Lexer) {
                        import std.stdio;
                        writefln("nextToken line = %s at %s in mode %s at index %s",
                                 _tokenStartLine,
                                 cast(char)_input.LA(1),
                                 _mode,
                                 _input.index);
                    }
                    int ttype;
                    try {
                        ttype = getInterpreter.match(_input, _mode);
                    }
                    catch (LexerNoViableAltException e) {
                        notifyListeners(e); // report error
                        recover(e);
                        ttype = SKIP;
                    }
                    if (_input.LA(1) == IntStreamConstant.EOF) {
                        _hitEOF = true;
                    }
                    // A lexer action may already have set _type (skip(), more(),
                    // setType()); only fall back to the matched type otherwise.
                    if (_type == TokenConstantDefinition.INVALID_TYPE) _type = ttype;
                    if (_type == SKIP) {
                        continue outer;
                    }
                }
                while (_type == MORE);

                if (_token is null) {
                    emit();
                }
                return _token;
            }
        }
        finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            _input.release(tokenStartMarker);
        }
        assert(0);
    }

    /**
     * Instruct the lexer to skip creating a token for current lexer rule
     * and look for another token. nextToken() knows to keep looking when
     * a lexer rule finishes with the type set to SKIP. Recall that
     * if token==null at end of any token rule, it creates one for you
     * and emits it.
     */
    public void skip()
    {
        _type = SKIP;
    }

    /**
     * Mark the current match as MORE so that nextToken() keeps matching
     * before emitting a token.
     */
    public void more()
    {
        _type = MORE;
    }

    /**
     * Switch to lexer mode m without touching the mode stack.
     */
    public void mode(int m)
    {
        _mode = m;
    }

    /**
     * Save the current mode on the mode stack and switch to mode m.
     */
    public void pushMode(int m)
    {
        debug(LexerATNSimulator)
            writefln("pushMode %s %s", m, _modeStack);
        _modeStack.push(_mode);
        mode(m);
    }

    /**
     * Restore the mode most recently saved by pushMode().
     *
     * The mode stack must not be empty; this is checked with an assert.
     *
     * Returns: the mode that is active after the pop.
     */
    public int popMode()
    {
        assert (!_modeStack.isEmpty, "Empty stack");
        debug(LexerATNSimulator)
            writefln("popMode back to %s", _modeStack.peek);
        mode(_modeStack.pop);
        return _mode;
    }

    /**
     * Set the char stream and reset the lexer
     * @uml
     * @override
     */
    public override void setInputStream(IntStream input)
    {
        // Detach the old stream first so that reset() does not rewind it.
        this._input = null;
        this._tokenFactorySourcePair = tuple(this, _input);
        reset();
        this._input = cast(CharStream)input;
        this._tokenFactorySourcePair = tuple(this, _input);
    }

    /**
     * Return the source name reported by the current input stream.
     */
    public string getSourceName()
    {
        return _input.getSourceName();
    }

    /**
     * @uml
     * @override
     */
    public override CharStream getInputStream()
    {
        return _input;
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons. Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    public void emit(Token token)
    {
        this._token = token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule. The token object should point into the
     * char buffer start..stop. If there is a text override in 'text',
     * use that to set the token's text. Override this method to emit
     * custom Token objects or provide a new factory.
     */
    public Token emit()
    {
        Variant v = _text;
        Token t = tokenFactory_.create(_tokenFactorySourcePair, _type,
                                       v, _channel, _tokenStartCharIndex,
                                       getCharIndex()-1, _tokenStartLine,
                                       _tokenStartCharPositionInLine);
        emit(t);
        return t;
    }

    /**
     * Create the EOF token at the current input position, on the default
     * channel and without a text override, and emit it.
     */
    public Token emitEOF()
    {
        int cpos = getCharPositionInLine();
        int line = getLine();
        Variant Null;
        Token eof = tokenFactory_.create(_tokenFactorySourcePair, TokenConstantDefinition.EOF,
                                         Null, TokenConstantDefinition.DEFAULT_CHANNEL,
                                         _input.index(), _input.index()-1,
                                         line, cpos);
        emit(eof);
        return eof;
    }

    /// Current line as tracked by the interpreter.
    public int getLine()
    {
        return getInterpreter().getLine();
    }

    /// Current character position within the line as tracked by the interpreter.
    public int getCharPositionInLine()
    {
        return getInterpreter().getCharPositionInLine();
    }

    public void setLine(int line)
    {
        getInterpreter().setLine(line);
    }

    public void setCharPositionInLine(int charPositionInLine)
    {
        getInterpreter().setCharPositionInLine(charPositionInLine);
    }

    /**
     * What is the index of the current character of lookahead?
     */
    public int getCharIndex()
    {
        return _input.index();
    }

    /**
     * Return the text matched so far for the current token or any
     * text override.
     */
    public Variant getText()
    {
        // hasValue is the documented "holds something" test for Variant; the
        // previous bitwise identity comparison against an empty Variant also
        // compared the internal storage bytes.
        if (_text.hasValue) {
            return _text;
        }
        Variant v = getInterpreter().getText(_input);
        return v;
    }

    /**
     * Set the complete text of this token; it wipes any previous
     * changes to the text.
     */
    public void setText(Variant text)
    {
        this._text = text;
    }

    /**
     * Override if emitting multiple tokens.
     */
    public Token getToken()
    {
        return _token;
    }

    public void setToken(Token token)
    {
        this._token = token;
    }

    public void setType(int ttype)
    {
        _type = ttype;
    }

    public int getType()
    {
        return _type;
    }

    public void setChannel(int channel)
    {
        _channel = channel;
    }

    public int getChannel()
    {
        return _channel;
    }

    /// Returns null in this base class.
    public string[] getChannelNames()
    {
        return null;
    }

    /// Returns null in this base class.
    public string[] getModeNames()
    {
        return null;
    }

    /**
     * Used to print out token names like ID during debugging and
     * error reporting. The generated parsers implement a method
     * that overrides this to point to their String[] tokenNames
     * @uml
     * @override
     */
    public override string[] getTokenNames()
    {
        return null;
    }

    /**
     * Return a list of all Token objects in input char stream.
     * Forces load of all tokens. Does not include EOF token.
     */
    public Token[] getAllTokens()
    {
        Token[] tokens;
        Token t = nextToken();
        while (t.getType() != TokenConstantDefinition.EOF) {
            tokens ~= t;
            t = nextToken();
        }
        return tokens;
    }

    /**
     * Recover from a failed match: unless the lookahead is EOF, consume
     * one character through the interpreter so matching can be retried.
     */
    public void recover(LexerNoViableAltException e)
    {
        if (_input.LA(1) != IntStreamConstant.EOF) {
            // skip a char and try again
            getInterpreter().consume(_input);
        }
    }

    /**
     * Report a token recognition error to the registered error listeners.
     * The message quotes the input from the start of the current token up
     * to the current input index.
     */
    public void notifyListeners(LexerNoViableAltException e)
    {
        string text = _input.getText(Interval.of(_tokenStartCharIndex, _input.index()));
        string msg = "token recognition error at: '" ~ getErrorDisplay(text) ~ "'";

        ANTLRErrorListener!(int, LexerATNSimulator) listener = getErrorListenerDispatch();
        listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
    }

    /**
     * Return s with every character replaced by its error-display form
     * (see the int overload).
     *
     * The string is walked code point by code point rather than byte by
     * byte, so multi-byte UTF-8 characters are kept intact. Invalid UTF-8
     * sequences are decoded as U+FFFD by byDchar.
     */
    public string getErrorDisplay(string s)
    {
        import std.utf : byDchar;

        auto buf = appender!string;
        foreach (dchar c; s.byDchar) {
            buf.put(getErrorDisplay(cast(int) c));
        }
        return buf.data;
    }

    /**
     * Return a printable form of the character c for error messages:
     * "<EOF>" for EOF, escaped forms for newline, tab and carriage return,
     * and the character itself otherwise.
     *
     * Values that are not valid Unicode code points are shown as U+FFFD.
     */
    public string getErrorDisplay(int c)
    {
        import std.utf : isValidDchar;

        switch (c) {
        case TokenConstantDefinition.EOF:
            return "<EOF>";
        case '\n':
            return "\\n";
        case '\t':
            return "\\t";
        case '\r':
            return "\\r";
        default:
            break;
        }
        // Work with a full code point (dchar); a wchar cast would truncate
        // anything above U+FFFF.
        dchar symbol = cast(dchar) c;
        if (!isValidDchar(symbol)) {
            symbol = '\uFFFD';
        }
        return to!string(symbol);
    }

    /**
     * Return the error-display form of c wrapped in single quotes.
     */
    public string getCharErrorDisplay(int c)
    {
        string s = getErrorDisplay(c);
        return "'" ~ s ~ "'";
    }

    /**
     * Lexers can normally match any char in its vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out. You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     */
    public void recover(RecognitionException re)
    {
        // TODO: Do we lose character or line position information?
        _input.consume();
    }

    /**
     * Hook for embedded lexer actions; does nothing in this base class.
     * @uml
     * @override
     */
    public override void action(InterfaceRuleContext interfaceRuleContext, int ruleIndex,
        int actionIndex)
    {
    }

    /// Getter for the token factory.
    public override final TokenFactory!CommonToken tokenFactory()
    {
        return this.tokenFactory_;
    }

    /// Setter for the token factory.
    public override final void tokenFactory(TokenFactory!CommonToken tokenFactory)
    {
        this.tokenFactory_ = tokenFactory;
    }

}