/*
 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

module antlr.v4.runtime.Lexer;

import std.stdio;
import std.typecons;
import std.array;
import std.conv;
import antlr.v4.runtime.ANTLRErrorListener;
import antlr.v4.runtime.Recognizer;
import antlr.v4.runtime.RecognitionException;
import antlr.v4.runtime.atn.LexerATNSimulator;
import antlr.v4.runtime.Token;
import antlr.v4.runtime.TokenConstantDefinition;
import antlr.v4.runtime.TokenSource;
import antlr.v4.runtime.InterfaceLexer;
import antlr.v4.runtime.TokenFactory;
import antlr.v4.runtime.CharStream;
import antlr.v4.runtime.IntStream;
import antlr.v4.runtime.IntStreamConstant;
import antlr.v4.runtime.CommonToken;
import antlr.v4.runtime.CommonTokenFactory;
import antlr.v4.runtime.IllegalStateException;
import antlr.v4.runtime.LexerNoViableAltException;
import antlr.v4.runtime.misc;
import antlr.v4.runtime.InterfaceRuleContext;

/// Pair of (token source, character stream) handed to the token factory.
alias TokenFactorySourcePair = Tuple!(TokenSource, "a", CharStream, "b");

// Class Lexer
/**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
 * uses simplified match() and error recovery mechanisms in the interest
 * of speed.
 */
abstract class Lexer : Recognizer!(int, LexerATNSimulator), TokenSource, InterfaceLexer
{

    /// Mode the lexer is put into by reset().
    public static immutable int DEFAULT_MODE = 0;

    /// Token type set by more(); nextToken() keeps matching while the type is MORE.
    public static immutable int MORE = -2;

    /// Token type set by skip(); nextToken() discards the match and starts over.
    public static immutable int SKIP = -3;

    public static immutable int DEFAULT_TOKEN_CHANNEL = TokenConstantDefinition.DEFAULT_CHANNEL;

    public static immutable int HIDDEN = TokenConstantDefinition.HIDDEN_CHANNEL;

    public static immutable int MIN_CHAR_VALUE = char.min;

    public static immutable int MAX_CHAR_VALUE = char.max;

    /// The character stream tokens are read from; may be null until set.
    public CharStream _input;

    /// (this lexer, current input) pair passed to the token factory on emit.
    protected TokenFactorySourcePair _tokenFactorySourcePair;

    /**
     * How to create token objects
     * @uml
     * @read
     * @write
     * @override
     */
    public TokenFactory!CommonToken tokenFactory_;

    /**
     * The goal of all lexer rules/methods is to create a token object.
     * This is an instance variable as multiple rules may collaborate to
     * create a single token. nextToken will return this object after
     * matching lexer rule(s). If you subclass to allow multiple token
     * emissions, then set this to the last token to be matched or
     * something nonnull so that the auto token emit mechanism will not
     * emit another token.
     */
    public Token _token;

    /// Stack of modes saved by pushMode() and restored by popMode().
    public IntegerStack _modeStack;

    /**
     * What character index in the stream did the current token start at?
     * Needed, for example, to get the text for current token. Set at
     * the start of nextToken.
     */
    public int _tokenStartCharIndex = -1;

    /**
     * The line on which the first character of the token resides
     */
    public int _tokenStartLine;

    /**
     * The character position of first character within the line
     */
    public int _tokenStartCharPositionInLine;

    /// Set by nextToken() once the lookahead is EOF; the next call then emits the EOF token.
    public bool _hitEOF;

    /**
     * The channel number for the current token
     */
    public int _channel;

    /**
     * The token type for the current token
     */
    public int _type;

    /// The current lexer mode, passed to the interpreter's match().
    public int _mode;

    /**
     * You can set the text for the current token to override what is in
     * the input char buffer. Use setText() or can set this instance var.
     */
    public string _text;

    /**
     * Create a lexer without an input stream.
     *
     * The token factory and the mode stack are initialised here as well,
     * so that reset(), pushMode() and emit() do not dereference null when
     * the input is supplied later through setInputStream().
     */
    public this()
    {
        tokenFactory_ = CommonTokenFactory.DEFAULT;
        _modeStack = new IntegerStack();
    }

    /**
     * Create a lexer reading from the given character stream.
     *
     * Params:
     *  input = the character stream to tokenize
     */
    public this(CharStream input)
    {
        this();
        this._input = input;
        this._tokenFactorySourcePair = tuple(this, input);
    }

    /**
     * Rewind the input (if any) to index 0 and restore all per-token
     * state, the mode, the mode stack and the interpreter to their
     * initial values.
     */
    public void reset()
    {
        // wack Lexer state variables
        if (_input !is null) {
            _input.seek(0); // rewind the input
        }
        _token = null;
        _type = TokenConstantDefinition.INVALID_TYPE;
        _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
        _tokenStartCharIndex = -1;
        _tokenStartCharPositionInLine = -1;
        _tokenStartLine = -1;
        _text = null;
        _hitEOF = false;
        _mode = Lexer.DEFAULT_MODE;
        _modeStack.clear();
        getInterpreter().reset();
    }

    /**
     * Return a token from this source; i.e., match a token on the char
     * stream.
     *
     * Throws: IllegalStateException when no input stream has been set.
     */
    public Token nextToken()
    {
        if (_input is null) {
            throw new IllegalStateException("nextToken requires a non-null input stream.");
        }
        // Mark start location in char stream so unbuffered streams are
        // guaranteed at least have text of current token
        int tokenStartMarker = _input.mark();
        try {
        outer:
            while (true) {
                if (_hitEOF) {
                    emitEOF();
                    return _token;
                }
                // Start of a new token: clear per-token state and record
                // where the token begins.
                _token = null;
                _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
                _tokenStartCharIndex = _input.index;
                _tokenStartCharPositionInLine = getInterpreter.getCharPositionInLine();
                _tokenStartLine = getInterpreter.getLine;
                _text = null;
                do {
                    _type = TokenConstantDefinition.INVALID_TYPE;
                    debug(Lexer) {
                        import std.stdio;
                        writefln("nextToken line = %s at %s in mode %s at index %s",
                                 _tokenStartLine,
                                 cast(char)_input.LA(1),
                                 _mode,
                                 _input.index);
                    }
                    int ttype;
                    try {
                        ttype = getInterpreter.match(_input, _mode);
                    }
                    catch (LexerNoViableAltException e) {
                        notifyListeners(e); // report error
                        recover(e);
                        ttype = SKIP;
                    }
                    if (_input.LA(1) == IntStreamConstant.EOF) {
                        _hitEOF = true;
                    }
                    // A lexer action may already have set _type (e.g. via
                    // skip(), more() or setType()); only fall back to the
                    // matched type when it did not.
                    if (_type == TokenConstantDefinition.INVALID_TYPE) _type = ttype;
                    if (_type == SKIP) {
                        continue outer;
                    }
                }
                while (_type == MORE);

                if (_token is null) {
                    emit();
                }
                return _token;
            }
        }
        finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            _input.release(tokenStartMarker);
        }
        assert(0);
    }

    /**
     * Instruct the lexer to skip creating a token for current lexer rule
     * and look for another token. nextToken() knows to keep looking when
     * a lexer rule finishes with the token type set to SKIP. Recall that
     * if token==null at end of any token rule, it creates one for you
     * and emits it.
     */
    public void skip()
    {
        _type = SKIP;
    }

    /**
     * Set the current token type to MORE, which makes nextToken() run
     * another match before emitting a token.
     */
    public void more()
    {
        _type = MORE;
    }

    /**
     * Switch the lexer to mode m without touching the mode stack.
     */
    public void mode(int m)
    {
        _mode = m;
    }

    /**
     * Save the current mode on the mode stack and switch to mode m.
     */
    public void pushMode(int m)
    {
        debug(LexerATNSimulator)
            writefln("pushMode %s %s", m, _modeStack);
        _modeStack.push(_mode);
        mode(m);
    }

    /**
     * Switch back to the mode most recently saved by pushMode().
     *
     * Returns: the mode that is current after the pop.
     */
    public int popMode()
    {
        assert (!_modeStack.isEmpty, "Empty stack");
        debug(LexerATNSimulator)
            writefln("popMode back to %s", _modeStack.peek);
        mode(_modeStack.pop);
        return _mode;
    }

    /**
     * Set the char stream and reset the lexer
     * @uml
     * @override
     */
    public override void setInputStream(IntStream input)
    {
        // Detach the old stream first so that reset() does not seek on it.
        this._input = null;
        this._tokenFactorySourcePair = tuple(this, _input);
        reset();
        this._input = cast(CharStream)input;
        this._tokenFactorySourcePair = tuple(this, _input);
    }

    /**
     * Returns: the source name reported by the current input stream.
     */
    public string getSourceName()
    {
        return _input.getSourceName();
    }

    /**
     * @uml
     * @override
     */
    public override CharStream getInputStream()
    {
        return _input;
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons. Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    public void emit(Token token)
    {
        this._token = token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule. The token object should point into the
     * char buffer start..stop. If there is a text override in 'text',
     * use that to set the token's text. Override this method to emit
     * custom Token objects or provide a new factory.
     */
    public Token emit()
    {
        Token t = tokenFactory_.create(_tokenFactorySourcePair, _type,
                                       _text, _channel, _tokenStartCharIndex,
                                       getCharIndex()-1, _tokenStartLine,
                                       _tokenStartCharPositionInLine);
        emit(t);
        return t;
    }

    /**
     * Create an EOF token on the default channel at the current input
     * position and emit it.
     *
     * Returns: the emitted EOF token.
     */
    public Token emitEOF()
    {
        int cpos = getCharPositionInLine();
        int line = getLine();
        Token eof = tokenFactory_.create(_tokenFactorySourcePair, TokenConstantDefinition.EOF, null, TokenConstantDefinition.DEFAULT_CHANNEL,
                                         _input.index(), _input.index()-1,
                                         line, cpos);
        emit(eof);
        return eof;
    }

    /// Returns: the line reported by the interpreter.
    public int getLine()
    {
        return getInterpreter().getLine();
    }

    /// Returns: the character position in the line reported by the interpreter.
    public int getCharPositionInLine()
    {
        return getInterpreter().getCharPositionInLine();
    }

    /// Forward the line number to the interpreter.
    public void setLine(int line)
    {
        getInterpreter().setLine(line);
    }

    /// Forward the character position in the line to the interpreter.
    public void setCharPositionInLine(int charPositionInLine)
    {
        getInterpreter().setCharPositionInLine(charPositionInLine);
    }

    /**
     * What is the index of the current character of lookahead?
     */
    public int getCharIndex()
    {
        return _input.index();
    }

    /**
     * Return the text matched so far for the current token or any
     * text override.
     */
    public string getText()
    {
        if (_text !is null) {
            return _text;
        }
        return getInterpreter().getText(_input);
    }

    /**
     * Set the complete text of this token; it wipes any previous
     * changes to the text.
     */
    public void setText(string text)
    {
        this._text = text;
    }

    /**
     * Override if emitting multiple tokens.
     */
    public Token getToken()
    {
        return _token;
    }

    /// Replace the token that nextToken() will return.
    public void setToken(Token token)
    {
        this._token = token;
    }

    /// Set the token type for the current token.
    public void setType(int ttype)
    {
        _type = ttype;
    }

    /// Returns: the token type for the current token.
    public int getType()
    {
        return _type;
    }

    /// Set the channel number for the current token.
    public void setChannel(int channel)
    {
        _channel = channel;
    }

    /// Returns: the channel number for the current token.
    public int getChannel()
    {
        return _channel;
    }

    /// Returns: null in this base implementation.
    public string[] getChannelNames()
    {
        return null;
    }

    /// Returns: null in this base implementation.
    public string[] getModeNames()
    {
        return null;
    }

    /**
     * Used to print out token names like ID during debugging and
     * error reporting. The generated parsers implement a method
     * that overrides this to point to their String[] tokenNames
     * @uml
     * @override
     */
    public override string[] getTokenNames()
    {
        return null;
    }

    /**
     * Return a list of all Token objects in input char stream.
     * Forces load of all tokens. Does not include EOF token.
     */
    public Token[] getAllTokens()
    {
        Token[] tokens;
        Token t = nextToken();
        while (t.getType() != TokenConstantDefinition.EOF) {
            tokens ~= t;
            t = nextToken();
        }
        return tokens;
    }

    /**
     * Recover from a failed match by consuming one character, unless the
     * lookahead is already EOF.
     */
    public void recover(LexerNoViableAltException e)
    {
        if (_input.LA(1) != IntStreamConstant.EOF) {
            // skip a char and try again
            getInterpreter().consume(_input);
        }
    }

    /**
     * Report a token recognition error to the error listener dispatch.
     * The message quotes the input text from the start of the current
     * token up to the current input index.
     */
    public void notifyListeners(LexerNoViableAltException e)
    {
        string text = _input.getText(Interval.of(_tokenStartCharIndex, _input.index()));
        string msg = "token recognition error at: '" ~ getErrorDisplay(text) ~ "'";

        ANTLRErrorListener!(int, LexerATNSimulator) listener = getErrorListenerDispatch();
        listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
    }

    /**
     * Build a printable form of s for error messages by applying
     * getErrorDisplay(int) to every character.
     *
     * The string is walked by code point (dchar) so that a multi-byte
     * UTF-8 character is rendered as one character and not as its
     * individual code units.
     */
    public string getErrorDisplay(string s)
    {
        auto buf = appender!string;
        foreach (dchar c; s) {
            buf.put(getErrorDisplay(cast(int) c));
        }
        return buf.data;
    }

    /**
     * Build a printable form of a single character for error messages.
     *
     * EOF is rendered as "<EOF>", newline, tab and carriage return as
     * their backslash escapes, any other valid code point as the
     * character itself. A value that is not a valid code point is
     * rendered as its decimal number.
     */
    public string getErrorDisplay(int c)
    {
        import std.utf : isValidDchar;

        switch ( c ) {
        case TokenConstantDefinition.EOF :
            return "<EOF>";
        case '\n' :
            return "\\n";
        case '\t' :
            return "\\t";
        case '\r' :
            return "\\r";
        default:
            // to!string on an int yields its decimal digits ("97" for 'a'),
            // so convert through dchar to get the character itself.
            immutable dchar ch = cast(dchar) c;
            return isValidDchar(ch) ? to!string(ch) : to!string(c);
        }
    }

    /**
     * Returns: getErrorDisplay(c) wrapped in single quotes.
     */
    public string getCharErrorDisplay(int c)
    {
        string s = getErrorDisplay(c);
        return "'" ~ s ~ "'";
    }

    /**
     * Lexers can normally match any char in its vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out. You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     */
    public void recover(RecognitionException re)
    {
        //System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
        //re.printStackTrace();
        // TODO: Do we lose character or line position information?
        _input.consume();
    }

    /**
     * Hook for grammar actions; does nothing in this base implementation.
     * @uml
     * @override
     */
    public override void action(InterfaceRuleContext interfaceRuleContext, int ruleIndex,
        int actionIndex)
    {
    }

    /// Returns: the factory used to create token objects.
    public override final TokenFactory!CommonToken tokenFactory()
    {
        return this.tokenFactory_;
    }

    /// Replace the factory used to create token objects.
    public override final void tokenFactory(TokenFactory!CommonToken tokenFactory)
    {
        this.tokenFactory_ = tokenFactory;
    }

}