/*
 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

module antlr.v4.runtime.Lexer;

import std.stdio;
import std.typecons;
import std.array;
import std.conv;
import std.variant;
import antlr.v4.runtime.ANTLRErrorListener;
import antlr.v4.runtime.Recognizer;
import antlr.v4.runtime.RecognitionException;
import antlr.v4.runtime.atn.LexerATNSimulator;
import antlr.v4.runtime.Token;
import antlr.v4.runtime.TokenConstantDefinition;
import antlr.v4.runtime.TokenSource;
import antlr.v4.runtime.InterfaceLexer;
import antlr.v4.runtime.TokenFactory;
import antlr.v4.runtime.CharStream;
import antlr.v4.runtime.IntStream;
import antlr.v4.runtime.IntStreamConstant;
import antlr.v4.runtime.CommonToken;
import antlr.v4.runtime.CommonTokenFactory;
import antlr.v4.runtime.IllegalStateException;
import antlr.v4.runtime.LexerNoViableAltException;
import antlr.v4.runtime.misc;
import antlr.v4.runtime.InterfaceRuleContext;

/// Pairs the token source (`a`) with the character stream (`b`) it reads from.
alias TokenFactorySourcePair = Tuple!(TokenSource, "a", CharStream, "b");

/**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
 * uses simplified match() and error recovery mechanisms in the interest
 * of speed.
 */
abstract class Lexer : Recognizer!(int, LexerATNSimulator), TokenSource, InterfaceLexer
{

    /// Mode the lexer is put into by reset().
    enum int DEFAULT_MODE = 0;

    /// Token type that makes nextToken() keep matching into the same token.
    enum int MORE = -2;

    /// Token type that makes nextToken() drop the match and start a new token.
    enum int SKIP = -3;

    enum int DEFAULT_TOKEN_CHANNEL = TokenConstantDefinition.DEFAULT_CHANNEL;

    enum int HIDDEN = TokenConstantDefinition.HIDDEN_CHANNEL;

    enum int MIN_CHAR_VALUE = 0;

    enum int MAX_CHAR_VALUE = 0x10FFFF;

    /// The character stream tokens are matched against.
    public CharStream _input;

    /// (this lexer, current input) pair handed to the token factory.
    protected TokenFactorySourcePair _tokenFactorySourcePair;

    /**
     * How to create token objects
     * @uml
     * @read
     * @write
     * @override
     */
    public TokenFactory!CommonToken tokenFactory_;

    /**
     * The goal of all lexer rules/methods is to create a token object.
     * This is an instance variable as multiple rules may collaborate to
     * create a single token. nextToken will return this object after
     * matching lexer rule(s). If you subclass to allow multiple token
     * emissions, then set this to the last token to be matched or
     * something nonnull so that the auto token emit mechanism will not
     * emit another token.
     */
    public Token _token;

    /// Modes saved by pushMode() and restored by popMode().
    public IntegerStack _modeStack;

    /**
     * What character index in the stream did the current token start at?
     * Needed, for example, to get the text for current token. Set at
     * the start of nextToken.
     */
    public size_t _tokenStartCharIndex;

    /**
     * The line on which the first character of the token resides
     */
    public int _tokenStartLine;

    /**
     * The character position of first character within the line
     */
    public int _tokenStartCharPositionInLine;

    /// Set by nextToken() once the lookahead character is EOF.
    public bool _hitEOF;

    /**
     * The channel number for the current token
     */
    public int _channel;

    /**
     * The token type for the current token
     */
    public int _type;

    /// The mode passed to the interpreter when matching.
    public int _mode;

    /**
     * You can set the text for the current token to override what is in
     * the input char buffer. Use setText() or can set this instance var.
     */
    public Variant _text;

    /**
     * Creates a lexer without an input stream.
     *
     * The token factory and the mode stack are initialised here as well, so
     * that a later setInputStream() (which calls reset()) and emit() do not
     * dereference null members.
     */
    public this()
    {
        tokenFactory_ = CommonTokenFactory.DEFAULT;
        _modeStack = new IntegerStack();
    }

    /**
     * Creates a lexer reading from `input`.
     */
    public this(CharStream input)
    {
        this();
        this._input = input;
        this._tokenFactorySourcePair = tuple(this, input);
    }

    /**
     * Rewinds the input (when there is one) and puts every piece of
     * per-token and mode state back to its initial value.
     */
    public void reset()
    {
        // whack Lexer state variables
        if (_input !is null) {
            _input.seek(0); // rewind the input
        }
        _token = null;
        _type = TokenConstantDefinition.INVALID_TYPE;
        _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
        // size_t is unsigned: the "no token started" sentinel is size_t.max.
        _tokenStartCharIndex = size_t.max;
        _tokenStartCharPositionInLine = -1;
        _tokenStartLine = -1;
        // Assign the init value; merely reading `.init` would leave the
        // previous text override in place.
        _text = Variant.init;
        _hitEOF = false;
        _mode = Lexer.DEFAULT_MODE;
        _modeStack.clear();
        getInterpreter().reset();
    }

    /**
     * Return a token from this source; i.e., match a token on the char
     * stream.
     *
     * Throws: IllegalStateException when no input stream has been set.
     */
    public Token nextToken()
    {
        if (_input is null) {
            throw new IllegalStateException("nextToken requires a non-null input stream.");
        }
        // Mark start location in char stream so unbuffered streams are
        // guaranteed to at least have the text of the current token
        int tokenStartMarker = _input.mark();
        try {
        outer:
            while (true) {
                if (_hitEOF) {
                    emitEOF();
                    return _token;
                }
                _token = null;
                _channel = TokenConstantDefinition.DEFAULT_CHANNEL;
                _tokenStartCharIndex = _input.index;
                _tokenStartCharPositionInLine = getInterpreter.getCharPositionInLine();
                _tokenStartLine = getInterpreter.getLine;
                // Drop any text override left over from the previous token.
                _text = Variant.init;
                do {
                    _type = TokenConstantDefinition.INVALID_TYPE;
                    debug(Lexer) {
                        import std.stdio;
                        writefln("nextToken line = %s at %s: %s in mode %s at index %s",
                                 _tokenStartLine,
                                 _tokenStartCharPositionInLine,
                                 _input.LA(1),
                                 _mode,
                                 _input.index);
                    }
                    int ttype;
                    try {
                        ttype = getInterpreter.match(_input, _mode);
                    }
                    catch (LexerNoViableAltException e) {
                        notifyListeners(e); // report error
                        recover(e);
                        ttype = SKIP;
                    }
                    if (_input.LA(1) == IntStreamConstant.EOF) {
                        _hitEOF = true;
                    }
                    // A type set during the match (e.g. via skip(), more() or
                    // setType()) takes precedence over the matched type.
                    if (_type == TokenConstantDefinition.INVALID_TYPE) {
                        _type = ttype;
                    }
                    if (_type == SKIP) {
                        continue outer;
                    }
                }
                while (_type == MORE);

                if (_token is null) {
                    emit();
                }
                return _token;
            }
        }
        finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            _input.release(tokenStartMarker);
        }
        assert(0);
    }

    /**
     * Instruct the lexer to skip creating a token for current lexer rule
     * and look for another token. nextToken() knows to keep looking when
     * a lexer rule finishes with the token type set to SKIP. Recall that
     * if token==null at end of any token rule, it creates one for you
     * and emits it.
     */
    public void skip()
    {
        _type = SKIP;
    }

    /**
     * Sets the current token type to MORE, which makes nextToken() run
     * another match before a token is emitted.
     */
    public void more()
    {
        _type = MORE;
    }

    /**
     * Switches the lexer to mode `m` without touching the mode stack.
     */
    public void mode(int m)
    {
        _mode = m;
    }

    /**
     * Saves the current mode on the mode stack and switches to mode `m`.
     */
    public void pushMode(int m)
    {
        debug(LexerATNSimulator) {
            writefln("pushMode %s %s", m, _modeStack);
        }
        _modeStack.push(_mode);
        mode(m);
    }

    /**
     * Restores the mode most recently saved by pushMode().
     *
     * The stack must not be empty; this is checked with an assert, i.e. only
     * in builds that keep assertions enabled.
     *
     * Returns: the mode that is now active.
     */
    public int popMode()
    {
        assert (!_modeStack.isEmpty, "Empty stack");
        debug(LexerATNSimulator) {
            writefln("popMode back to %s", _modeStack.peek);
        }
        mode(_modeStack.pop);
        return _mode;
    }

    /**
     * Set the char stream and reset the lexer
     * @uml
     * @override
     */
    public override void setInputStream(IntStream input)
    {
        // Detach the old stream first so reset() does not rewind it.
        this._input = null;
        this._tokenFactorySourcePair = tuple(this, _input);
        reset();
        this._input = cast(CharStream)input;
        this._tokenFactorySourcePair = tuple(this, _input);
    }

    /**
     * Returns the source name reported by the current input stream.
     */
    public string getSourceName()
    {
        return _input.getSourceName();
    }

    /**
     * @uml
     * @override
     */
    public override CharStream getInputStream()
    {
        return _input;
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons. Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    public void emit(Token token)
    {
        this._token = token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule. The token object should point into the
     * char buffer start..stop. If there is a text override in 'text',
     * use that to set the token's text. Override this method to emit
     * custom Token objects or provide a new factory.
     *
     * Returns: the token that was created and emitted.
     */
    public Token emit()
    {
        Variant textOverride = _text;
        Token created = tokenFactory_.create(_tokenFactorySourcePair, _type,
                                             textOverride, _channel, _tokenStartCharIndex,
                                             getCharIndex()-1, _tokenStartLine,
                                             _tokenStartCharPositionInLine);
        emit(created);
        return created;
    }

    /**
     * Creates an EOF token at the current input position, on the default
     * channel and without a text override, and emits it.
     *
     * Returns: the EOF token.
     */
    public Token emitEOF()
    {
        int cpos = getCharPositionInLine();
        int line = getLine();
        Variant noText;
        Token eof = tokenFactory_.create(_tokenFactorySourcePair, TokenConstantDefinition.EOF,
                                         noText, TokenConstantDefinition.DEFAULT_CHANNEL,
                                         _input.index(), _input.index()-1,
                                         line, cpos);
        emit(eof);
        return eof;
    }

    /// Current line as tracked by the interpreter.
    public int getLine()
    {
        return getInterpreter().getLine();
    }

    /// Current character position in the line as tracked by the interpreter.
    public int getCharPositionInLine()
    {
        return getInterpreter().getCharPositionInLine();
    }

    /// Forwards `line` to the interpreter.
    public void setLine(int line)
    {
        getInterpreter().setLine(line);
    }

    /// Forwards `charPositionInLine` to the interpreter.
    public void setCharPositionInLine(int charPositionInLine)
    {
        getInterpreter().setCharPositionInLine(charPositionInLine);
    }

    /**
     * What is the index of the current character of lookahead?
     */
    public size_t getCharIndex()
    {
        return _input.index();
    }

    /**
     * Return the text matched so far for the current token or any
     * text override.
     */
    public Variant getText()
    {
        // An override wins over the text the interpreter has matched.
        if (_text.hasValue) {
            return _text;
        }
        Variant matched = getInterpreter().getText(_input);
        return matched;
    }

    /**
     * Set the complete text of this token; it wipes any previous
     * changes to the text.
     */
    public void setText(Variant text)
    {
        this._text = text;
    }

    /**
     * Override if emitting multiple tokens.
     */
    public Token getToken()
    {
        return _token;
    }

    /// Replaces the token that nextToken() will return.
    public void setToken(Token token)
    {
        this._token = token;
    }

    /// Sets the type of the token currently being matched.
    public void setType(int ttype)
    {
        _type = ttype;
    }

    /// Type of the token currently being matched.
    public int getType()
    {
        return _type;
    }

    /// Sets the channel of the token currently being matched.
    public void setChannel(int channel)
    {
        _channel = channel;
    }

    /// Channel of the token currently being matched.
    public int getChannel()
    {
        return _channel;
    }

    /// This base implementation has no channel names and returns null.
    public string[] getChannelNames()
    {
        return null;
    }

    /// This base implementation has no mode names and returns null.
    public string[] getModeNames()
    {
        return null;
    }

    /**
     * Used to print out token names like ID during debugging and
     * error reporting. The generated parsers implement a method
     * that overrides this to point to their String[] tokenNames
     * @uml
     * @override
     */
    public override string[] getTokenNames()
    {
        return null;
    }

    /**
     * Return a list of all Token objects in input char stream.
     * Forces load of all tokens. Does not include EOF token.
     */
    public Token[] getAllTokens()
    {
        Token[] tokens;
        for (Token t = nextToken();
             t.getType() != TokenConstantDefinition.EOF;
             t = nextToken()) {
            tokens ~= t;
        }
        return tokens;
    }

    /**
     * Recovers from a failed match by consuming one character through the
     * interpreter, unless the lookahead is already EOF.
     */
    public void recover(LexerNoViableAltException e)
    {
        if (_input.LA(1) != IntStreamConstant.EOF) {
            // skip a char and try again
            getInterpreter().consume(_input);
        }
    }

    /**
     * Reports a token recognition error, quoting the input from the start of
     * the current token up to the current index, to the error listener
     * dispatch.
     */
    public void notifyListeners(LexerNoViableAltException e)
    {
        auto text = _input.getText(Interval.of(to!int(_tokenStartCharIndex), to!int(_input.index)));
        auto msg = "token recognition error at: '" ~ getErrorDisplay(text) ~ "'";
        ANTLRErrorListener listener = getErrorListenerDispatch();
        listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
    }

    /**
     * Returns `s` with every character replaced by its error-display form
     * (see the `dchar` overload).
     */
    public string getErrorDisplay(string s)
    {
        auto buf = appender!string;
        foreach (dchar c; s) {
            buf.put(getErrorDisplay(c));
        }
        return buf.data;
    }

    /**
     * Returns a printable form of `c`: "<EOF>" for the EOF marker, the
     * two-character escapes \n, \t and \r for newline, tab and carriage
     * return, and the character itself otherwise.
     */
    public string getErrorDisplay(dchar c)
    {
        string s;
        switch ( c ) {
            case TokenConstantDefinition.EOF :
                s = "<EOF>";
                break;
            case '\n' :
                s = "\\n";
                break;
            case '\t' :
                s = "\\t";
                break;
            case '\r' :
                s = "\\r";
                break;
            default:
                s ~= c;
                break;
        }
        return s;
    }

    /**
     * Returns the error-display form of `c` wrapped in single quotes.
     */
    public string getCharErrorDisplay(dchar c)
    {
        string s = getErrorDisplay(c);
        return "'" ~ s ~ "'";
    }

    /**
     * Lexers can normally match any char in its vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out. You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     */
    public void recover(RecognitionException re)
    {
        // TODO: Do we lose character or line position information?
        _input.consume();
    }

    /**
     * Hook for grammar actions; this base implementation does nothing.
     * @uml
     * @override
     */
    public override void action(InterfaceRuleContext interfaceRuleContext, int ruleIndex,
                                int actionIndex)
    {
    }

    /// Returns the factory used to create tokens.
    public override final TokenFactory!CommonToken tokenFactory()
    {
        return this.tokenFactory_;
    }

    /// Replaces the factory used to create tokens.
    public override final void tokenFactory(TokenFactory!CommonToken tokenFactory)
    {
        this.tokenFactory_ = tokenFactory;
    }

}