/*
 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

module antlr.v4.runtime.Lexer;

import std.stdio;
import std.typecons;
import std.array;
import std.conv;
import antlr.v4.runtime.ANTLRErrorListener;
import antlr.v4.runtime.Recognizer;
import antlr.v4.runtime.RecognitionException;
import antlr.v4.runtime.atn.LexerATNSimulator;
import antlr.v4.runtime.Token;
import antlr.v4.runtime.TokenConstantDefinition;
import antlr.v4.runtime.TokenSource;
import antlr.v4.runtime.InterfaceLexer;
import antlr.v4.runtime.TokenFactory;
import antlr.v4.runtime.CharStream;
import antlr.v4.runtime.IntStream;
import antlr.v4.runtime.IntStreamConstant;
import antlr.v4.runtime.CommonToken;
import antlr.v4.runtime.CommonTokenFactory;
import antlr.v4.runtime.IllegalStateException;
import antlr.v4.runtime.LexerNoViableAltException;
import antlr.v4.runtime.misc;
import antlr.v4.runtime.InterfaceRuleContext;

alias TokenFactorySourcePair = Tuple!(TokenSource, "a", CharStream, "b");

/**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
 * uses simplified match() and error recovery mechanisms in the interest
 * of speed.
 */
abstract class Lexer : Recognizer!(int, LexerATNSimulator), TokenSource, InterfaceLexer
{

    public static immutable int DEFAULT_MODE = 0;

    public static immutable int MORE = -2;

    public static immutable int SKIP = -3;

    public static immutable int DEFAULT_TOKEN_CHANNEL = TokenConstantDefinition.DEFAULT_CHANNEL;

    public static immutable int HIDDEN = TokenConstantDefinition.HIDDEN_CHANNEL;

    public static immutable int MIN_CHAR_VALUE = char.min;

    public static immutable int MAX_CHAR_VALUE = char.max;

    public CharStream _input;

    protected TokenFactorySourcePair _tokenFactorySourcePair;

    /**
     * How to create token objects
     * @uml
     * @read
     * @write
     * @override
     */
    public TokenFactory!CommonToken tokenFactory_;

    /**
     * The goal of all lexer rules/methods is to create a token object.
     * This is an instance variable as multiple rules may collaborate to
     * create a single token. nextToken will return this object after
     * matching lexer rule(s). If you subclass to allow multiple token
     * emissions, then set this to the last token to be matched or
     * something nonnull so that the auto token emit mechanism will not
     * emit another token.
     */
    public Token _token;

    public IntegerStack _modeStack;

    /**
     * What character index in the stream did the current token start at?
     * Needed, for example, to get the text for current token. Set at
     * the start of nextToken.
     */
    public int _tokenStartCharIndex = -1;

    /**
     * The line on which the first character of the token resides
     */
    public int _tokenStartLine;

    /**
     * The character position of first character within the line
     */
    public int _tokenStartCharPositionInLine;

    public bool _hitEOF;

    /**
     * The channel number for the current token
     */
    public int _channel;

    /**
     * The token type for the current token
     */
    public int _type;

    public int _mode;

    /**
     * You can set the text for the current token to override what is in
     * the input char buffer. Use setText() or can set this instance var.
     */
    public string _text;

    /**
     * Create a lexer without an input stream.
     *
     * The token factory and the mode stack are set up here as well, so that
     * a later setInputStream()/reset() does not touch an unset mode stack.
     */
    public this()
    {
        tokenFactory_ = CommonTokenFactory.DEFAULT;
        _modeStack = new IntegerStack();
    }

    /**
     * Create a lexer reading from the given character stream.
     *
     * Params:
     *     input = the character stream to tokenize
     */
    public this(CharStream input)
    {
        this();
        this._input = input;
        this._tokenFactorySourcePair = tuple(this, input);
    }

    /**
     * Rewind the input (when one is set) and put every piece of
     * per-token and mode state back to its initial value, then reset
     * the interpreter.
     */
    public void reset()
    {
        // wack Lexer state variables
        if (_input !is null) {
            _input.seek(0); // rewind the input
        }
        _token = null;
        _type = TokenConstantDefinition.INVALID_TYPE;
        _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
        _tokenStartCharIndex = -1;
        _tokenStartCharPositionInLine = -1;
        _tokenStartLine = -1;
        _text = null;
        _hitEOF = false;
        _mode = Lexer.DEFAULT_MODE;
        _modeStack.clear();
        getInterpreter().reset();
    }

    /**
     * Return a token from this source; i.e., match a token on the char
     * stream.
     *
     * Throws: IllegalStateException when no input stream has been set.
     */
    public Token nextToken()
    {
        if (_input is null) {
            throw new IllegalStateException("nextToken requires a non-null input stream.");
        }
        // Mark start location in char stream so unbuffered streams are
        // guaranteed at least have text of current token
        int tokenStartMarker = _input.mark();
        try {
        outer:
            while (true) {
                if (_hitEOF) {
                    emitEOF();
                    return _token;
                }
                // Start of a new token: clear the per-token state and
                // remember where the token begins.
                _token = null;
                _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
                _tokenStartCharIndex = _input.index;
                _tokenStartCharPositionInLine = getInterpreter.getCharPositionInLine();
                _tokenStartLine = getInterpreter.getLine;
                _text = null;
                do {
                    _type = TokenConstantDefinition.INVALID_TYPE;
                    debug(Lexer) {
                        import std.stdio;
                        writefln("nextToken line = %s at %s in mode %s at index %s",
                                 _tokenStartLine,
                                 cast(char)_input.LA(1),
                                 _mode,
                                 _input.index);
                    }
                    int ttype;
                    try {
                        ttype = getInterpreter.match(_input, _mode);
                    }
                    catch (LexerNoViableAltException e) {
                        notifyListeners(e); // report error
                        recover(e);
                        ttype = SKIP;
                    }
                    if (_input.LA(1) == IntStreamConstant.EOF) {
                        _hitEOF = true;
                    }
                    // An action may already have set _type (skip(), more(),
                    // setType()); only fall back to the matched type otherwise.
                    if (_type == TokenConstantDefinition.INVALID_TYPE) _type = ttype;
                    if (_type == SKIP) {
                        continue outer;
                    }
                }
                while (_type == MORE);

                if (_token is null) {
                    emit();
                }
                return _token;
            }
        }
        finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            _input.release(tokenStartMarker);
        }
        assert(0);
    }

    /**
     * Instruct the lexer to skip creating a token for current lexer rule
     * and look for another token. nextToken() knows to keep looking when
     * a lexer rule finishes with the token type set to SKIP. Recall that
     * if token==null at end of any token rule, it creates one for you
     * and emits it.
     */
    public void skip()
    {
        _type = SKIP;
    }

    /**
     * Set the current token type to MORE; nextToken() keeps matching
     * while the type is MORE before it emits a token.
     */
    public void more()
    {
        _type = MORE;
    }

    /**
     * Switch the lexer to mode m.
     */
    public void mode(int m)
    {
        _mode = m;
    }

    /**
     * Save the current mode on the mode stack and switch to mode m.
     */
    public void pushMode(int m)
    {
        debug(LexerATNSimulator)
            writefln("pushMode %s %s", m, _modeStack);
        _modeStack.push(_mode);
        mode(m);
    }

    /**
     * Switch back to the mode on top of the mode stack.
     *
     * Returns: the mode that is active after the pop.
     */
    public int popMode()
    {
        assert (!_modeStack.isEmpty, "Empty stack");
        debug(LexerATNSimulator)
            writefln("popMode back to %s", _modeStack.peek);
        mode(_modeStack.pop);
        return _mode;
    }

    /**
     * Set the char stream and reset the lexer
     * @uml
     * @override
     */
    public override void setInputStream(IntStream input)
    {
        // Detach the old stream first so reset() does not rewind it.
        this._input = null;
        this._tokenFactorySourcePair = tuple(this, _input);
        reset();
        this._input = cast(CharStream)input;
        this._tokenFactorySourcePair = tuple(this, _input);
    }

    /**
     * Returns: the source name reported by the current input stream.
     */
    public string getSourceName()
    {
        return _input.getSourceName();
    }

    /**
     * @uml
     * @override
     */
    public override CharStream getInputStream()
    {
        return _input;
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons. Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    public void emit(Token token)
    {
        this._token = token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule. The token object should point into the
     * char buffer start..stop. If there is a text override in 'text',
     * use that to set the token's text. Override this method to emit
     * custom Token objects or provide a new factory.
     */
    public Token emit()
    {
        Token t = tokenFactory_.create(_tokenFactorySourcePair, _type,
                                       _text, _channel, _tokenStartCharIndex,
                                       getCharIndex()-1, _tokenStartLine,
                                       _tokenStartCharPositionInLine);
        emit(t);
        return t;
    }

    /**
     * Create an EOF token on the default channel at the current input
     * position, emit it and return it.
     */
    public Token emitEOF()
    {
        int cpos = getCharPositionInLine();
        int line = getLine();
        Token eof = tokenFactory_.create(_tokenFactorySourcePair, TokenConstantDefinition.EOF, null, TokenConstantDefinition.DEFAULT_CHANNEL,
                                         _input.index(), _input.index()-1,
                                         line, cpos);
        emit(eof);
        return eof;
    }

    public int getLine()
    {
        return getInterpreter().getLine();
    }

    public int getCharPositionInLine()
    {
        return getInterpreter().getCharPositionInLine();
    }

    public void setLine(int line)
    {
        getInterpreter().setLine(line);
    }

    public void setCharPositionInLine(int charPositionInLine)
    {
        getInterpreter().setCharPositionInLine(charPositionInLine);
    }

    /**
     * What is the index of the current character of lookahead?
     */
    public int getCharIndex()
    {
        return _input.index();
    }

    /**
     * Return the text matched so far for the current token or any
     * text override.
     */
    public string getText()
    {
        if (_text !is null) {
            return _text;
        }
        return getInterpreter().getText(_input);
    }

    /**
     * Set the complete text of this token; it wipes any previous
     * changes to the text.
     */
    public void setText(string text)
    {
        this._text = text;
    }

    /**
     * Override if emitting multiple tokens.
     */
    public Token getToken()
    {
        return _token;
    }

    public void setToken(Token token)
    {
        this._token = token;
    }

    public void setType(int ttype)
    {
        _type = ttype;
    }

    public int getType()
    {
        return _type;
    }

    public void setChannel(int channel)
    {
        _channel = channel;
    }

    public int getChannel()
    {
        return _channel;
    }

    public string[] getChannelNames()
    {
        return null;
    }

    public string[] getModeNames()
    {
        return null;
    }

    /**
     * Used to print out token names like ID during debugging and
     * error reporting. The generated parsers implement a method
     * that overrides this to point to their String[] tokenNames
     * @uml
     * @override
     */
    public override string[] getTokenNames()
    {
        return null;
    }

    /**
     * Return a list of all Token objects in input char stream.
     * Forces load of all tokens. Does not include EOF token.
     */
    public Token[] getAllTokens()
    {
        Token[] tokens;
        Token t = nextToken();
        while (t.getType() != TokenConstantDefinition.EOF) {
            tokens ~= t;
            t = nextToken();
        }
        return tokens;
    }

    /**
     * Recover from a failed match by consuming one character, unless
     * the input is already at EOF.
     */
    public void recover(LexerNoViableAltException e)
    {
        if (_input.LA(1) != IntStreamConstant.EOF) {
            // skip a char and try again
            getInterpreter().consume(_input);
        }
    }

    /**
     * Report a token recognition error to the registered error listeners.
     * The message quotes the input text from the start of the current
     * token up to the current input index.
     */
    public void notifyListeners(LexerNoViableAltException e)
    {
        string text = _input.getText(Interval.of(_tokenStartCharIndex, _input.index()));
        string msg = "token recognition error at: '" ~ getErrorDisplay(text) ~ "'";

        ANTLRErrorListener!(int, LexerATNSimulator) listener = getErrorListenerDispatch();
        listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
    }

    /**
     * Build a printable form of s for error messages by mapping every
     * code point through getErrorDisplay(int).
     */
    public string getErrorDisplay(string s)
    {
        import std.utf : byDchar;

        auto buf = appender!string;
        // Iterate by code point (not by UTF-8 code unit) so multi-byte
        // characters stay intact in the message.
        foreach (dchar c; s.byDchar) {
            buf.put(getErrorDisplay(cast(int) c));
        }
        return buf.data;
    }

    /**
     * Return a printable form of the character code c for error messages.
     *
     * EOF becomes "<EOF>"; newline, tab and carriage return become their
     * backslash escapes. Any other valid code point is returned as the
     * character itself. A value that is not a valid code point is returned
     * as its decimal number.
     */
    public string getErrorDisplay(int c)
    {
        import std.utf : isValidDchar;

        switch ( c ) {
        case TokenConstantDefinition.EOF :
            return "<EOF>";
        case '\n' :
            return "\\n";
        case '\t' :
            return "\\t";
        case '\r' :
            return "\\r";
        default:
            break;
        }
        // to!string on the int itself would give the numeric code
        // ("97"), not the character ("a"), so convert through dchar.
        if (c >= 0 && isValidDchar(cast(dchar) c)) {
            return to!string(cast(dchar) c);
        }
        return to!string(c);
    }

    /**
     * Like getErrorDisplay(int), but wrapped in single quotes.
     */
    public string getCharErrorDisplay(int c)
    {
        string s = getErrorDisplay(c);
        return "'" ~ s ~ "'";
    }

    /**
     * Lexers can normally match any char in its vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out. You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     */
    public void recover(RecognitionException re)
    {
        // TODO: Do we lose character or line position information?
        _input.consume();
    }

    /**
     * @uml
     * @override
     */
    public override void action(InterfaceRuleContext interfaceRuleContext, int ruleIndex,
                                int actionIndex)
    {
    }

    public override final TokenFactory!CommonToken tokenFactory()
    {
        return this.tokenFactory_;
    }

    public override final void tokenFactory(TokenFactory!CommonToken tokenFactory)
    {
        this.tokenFactory_ = tokenFactory;
    }

}