github.com/NeowayLabs/nash@v0.2.2-0.20200127205349-a227041ffd50/scanner/lex.go (about) 1 // Package scanner is the lexical parser. 2 package scanner 3 4 import ( 5 "fmt" 6 "strings" 7 "unicode" 8 "unicode/utf8" 9 10 "github.com/madlambda/nash/token" 11 ) 12 13 type ( 14 Token struct { 15 typ token.Token 16 token.FileInfo 17 18 val string 19 } 20 21 stateFn func(*Lexer) stateFn 22 23 // Lexer holds the state of the scanner 24 Lexer struct { 25 name string // identify the source, used only for error reports 26 input string // the string being scanned 27 start int // start position of current token 28 29 width int // width of last rune read 30 Tokens chan Token // channel of scanned tokens 31 32 // file positions 33 pos int // file offset 34 line int // current line position 35 lineStart int // line of the symbol's start 36 prevColumn int // previous column value 37 column int // current column position 38 columnStart int // column of the symbol's start 39 40 openParens int 41 42 addSemicolon bool 43 } 44 ) 45 46 const ( 47 eof = -1 48 ) 49 50 func (i Token) Type() token.Token { return i.typ } 51 func (i Token) Value() string { return i.val } 52 53 func (i Token) String() string { 54 switch i.typ { 55 case token.Illegal: 56 return "ERROR: " + i.val 57 case token.EOF: 58 return "EOF" 59 } 60 61 if len(i.typ.String()) > 10 { 62 return fmt.Sprintf("%s...", i.typ.String()[0:10]) 63 } 64 65 return fmt.Sprintf("%s", i.typ) 66 } 67 68 // run lexes the input by executing state functions until the state is nil 69 func (l *Lexer) run() { 70 l.line, l.lineStart, l.column, l.columnStart = 1, 1, 0, 0 71 72 for state := lexStart; state != nil; { 73 state = state(l) 74 } 75 76 l.emit(token.EOF) 77 close(l.Tokens) // No more tokens will be delivered 78 } 79 80 func (l *Lexer) emitVal(t token.Token, val string, line, column int) { 81 l.Tokens <- Token{ 82 FileInfo: token.NewFileInfo(line, column), 83 84 typ: t, 85 val: val, 86 } 87 88 l.start = l.pos 89 l.lineStart = l.line 90 l.columnStart = l.column 91 } 92 93 func (l *Lexer) emit(t token.Token) { 94 l.Tokens <- Token{ 95 FileInfo: token.NewFileInfo(l.lineStart, l.columnStart), 96 97 typ: t, 98 val: l.input[l.start:l.pos], 99 } 100 101 l.start = l.pos 102 l.lineStart = l.line 103 l.columnStart = l.column 104 } 105 106 // peek returns but does not consume the next rune from input 107 func (l *Lexer) peek() rune { 108 rune := l.next() 109 l.backup() 110 return rune 111 } 112 113 // next consumes the next rune from input 114 func (l *Lexer) next() rune { 115 var r rune 116 117 if l.pos >= len(l.input) { 118 l.width = 0 119 return eof 120 } 121 122 r, l.width = utf8.DecodeRuneInString(l.input[l.pos:]) 123 124 l.pos += l.width 125 l.prevColumn = l.column 126 127 if r == '\n' { 128 l.line++ 129 l.column = 0 130 } else { 131 l.column++ 132 } 133 134 return r 135 } 136 137 // ignore skips over the pending input before this point 138 func (l *Lexer) ignore() { 139 l.start = l.pos 140 l.lineStart = l.line 141 l.columnStart = l.column 142 } 143 144 // backup steps back one rune 145 func (l *Lexer) backup() { 146 l.pos -= l.width 147 148 r, _ := utf8.DecodeRuneInString(l.input[l.pos:]) 149 150 l.column = l.prevColumn 151 152 if r == '\n' { 153 l.line-- 154 } 155 } 156 157 // acceptRun consumes a run of runes from the valid setup 158 func (l *Lexer) acceptRun(valid string) { 159 for strings.IndexRune(valid, l.next()) >= 0 { 160 161 } 162 163 l.backup() 164 } 165 166 // errorf emit an error token 167 func (l *Lexer) errorf(format string, args ...interface{}) stateFn { 168 fname := l.name 169 170 if fname == "" { 171 fname = "<none>" 172 } 173 174 errMsg := fmt.Sprintf(format, args...) 175 176 arguments := make([]interface{}, 0, len(args)+2) 177 arguments = append(arguments, fname, l.line, l.column, errMsg) 178 179 l.Tokens <- Token{ 180 FileInfo: token.NewFileInfo(l.line, l.column), 181 182 typ: token.Illegal, 183 val: fmt.Sprintf("%s:%d:%d: %s", arguments...), 184 } 185 186 l.start = len(l.input) 187 l.lineStart = l.line 188 l.columnStart = l.column 189 l.pos = l.start 190 191 return nil // finish the state machine 192 } 193 194 func Lex(name, input string) *Lexer { 195 l := &Lexer{ 196 name: name, 197 input: input, 198 Tokens: make(chan Token), 199 } 200 201 go l.run() // concurrently run state machine 202 203 return l 204 } 205 206 func lexStart(l *Lexer) stateFn { 207 r := l.next() 208 209 switch { 210 case r == eof: 211 if l.addSemicolon { 212 l.emitVal(token.Semicolon, ";", l.line, l.column) 213 } 214 215 l.addSemicolon = false 216 217 return nil 218 case '0' <= r && r <= '9': 219 digits := "0123456789" 220 221 l.acceptRun(digits) 222 223 next := l.peek() 224 225 // >[2=] 226 // cmd[2] 227 if next == '=' || next == ']' || (!isIdentifier(l.peek()) && !isArgument(l.peek())) { 228 l.emit(token.Number) 229 } else if isIdentifier(l.peek()) { 230 absorbIdentifier(l) 231 232 if isArgument(l.peek()) { 233 absorbArgument(l) 234 235 l.emit(token.Arg) 236 } else { 237 l.emit(token.Ident) 238 } 239 } else if isArgument(l.peek()) { 240 absorbArgument(l) 241 l.emit(token.Arg) 242 } 243 244 return lexStart 245 case r == ';': 246 l.emit(token.Semicolon) 247 return lexStart 248 case isSpace(r): 249 return lexSpace 250 251 case isEndOfLine(r): 252 l.ignore() 253 254 if l.addSemicolon && l.openParens == 0 { 255 l.emitVal(token.Semicolon, ";", l.line, l.column) 256 } 257 258 l.addSemicolon = false 259 260 return lexStart 261 case r == '"': 262 l.ignore() 263 264 return lexQuote 265 case r == '#': 266 return lexComment 267 case r == '+': 268 l.emit(token.Plus) 269 return lexStart 270 case r == '>': 271 l.emit(token.Gt) 272 return lexStart 273 case r == '|': 274 l.emit(token.Pipe) 275 return lexStart 276 case r == '$': 277 r = l.next() 278 279 if !isIdentifier(r) { 280 return l.errorf("Expected identifier, but found %q", r) 281 } 282 283 absorbIdentifier(l) 284 285 next := l.peek() 286 if next != eof && !isSpace(next) && 287 !isEndOfLine(next) && next != ';' && 288 next != ')' && next != ',' && next != '+' && 289 next != '[' && next != ']' && next != '(' && 290 next != '.' { 291 l.errorf("Unrecognized character in action: %#U", next) 292 return nil 293 } 294 295 l.emit(token.Variable) 296 return lexStart 297 case r == '=': 298 if l.peek() == '=' { 299 l.next() 300 l.emit(token.Equal) 301 } else { 302 l.emit(token.Assign) 303 } 304 305 return lexStart 306 case r == '!': 307 if l.peek() == '=' { 308 l.next() 309 l.emit(token.NotEqual) 310 } else { 311 l.emit(token.Arg) 312 } 313 314 return lexStart 315 case r == '<': 316 if l.peek() == '=' { 317 l.next() 318 l.emit(token.AssignCmd) 319 } else { 320 l.emit(token.Lt) 321 } 322 323 return lexStart 324 case r == '{': 325 l.addSemicolon = false 326 l.emit(token.LBrace) 327 return lexStart 328 case r == '}': 329 l.emit(token.RBrace) 330 l.addSemicolon = false 331 return lexStart 332 case r == '[': 333 l.emit(token.LBrack) 334 return lexStart 335 case r == ']': 336 l.emit(token.RBrack) 337 return lexStart 338 case r == '(': 339 l.openParens++ 340 341 l.emit(token.LParen) 342 l.addSemicolon = false 343 return lexStart 344 case r == ')': 345 l.openParens-- 346 347 l.emit(token.RParen) 348 l.addSemicolon = true 349 return lexStart 350 case r == ',': 351 l.emit(token.Comma) 352 return lexStart 353 case r == '.': 354 dotLine, dotColumn := l.line, l.column 355 next := l.peek() 356 if next == '.' { 357 l.next() 358 next = l.peek() 359 if next == '.' { 360 l.next() 361 l.emitVal(token.Dotdotdot, "...", dotLine, dotColumn) 362 return lexStart 363 } 364 } 365 absorbArgument(l) 366 l.emit(token.Arg) 367 if next == eof && l.openParens > 0 { 368 l.addSemicolon = false 369 } else { 370 l.addSemicolon = true 371 } 372 return lexStart 373 case isIdentifier(r): 374 // nash literals are lowercase 375 absorbIdentifier(l) 376 377 next := l.peek() 378 379 if isEndOfLine(next) || isSpace(next) || 380 next == '=' || next == '(' || 381 next == ')' || next == ',' || 382 next == '[' || next == eof { 383 lit := scanIdentifier(l) 384 385 if len(lit) > 1 && r >= 'a' && r <= 'z' { 386 l.emit(token.Lookup(lit)) 387 } else { 388 l.emit(token.Ident) 389 } 390 } else if next == '.' { 391 // because of shell idiosyncrasies I've to replicate 392 // almost same dotdotdot lex here... 393 ident := l.input[l.start:l.pos] 394 identLine, identCol := l.lineStart, l.columnStart 395 dotLine, dotColumn := l.line, l.column 396 l.next() 397 next = l.peek() 398 if next == '.' { 399 l.next() 400 next = l.peek() 401 if next == '.' { 402 l.next() 403 l.emitVal(token.Ident, ident, identLine, identCol) 404 l.emitVal(token.Dotdotdot, "...", dotLine, dotColumn) 405 return lexStart 406 } 407 } 408 absorbArgument(l) 409 l.emit(token.Arg) 410 } else { 411 absorbArgument(l) 412 l.emit(token.Arg) 413 } 414 415 if next == eof && l.openParens > 0 { 416 l.addSemicolon = false 417 } else { 418 l.addSemicolon = true 419 } 420 421 return lexStart 422 case isArgument(r): 423 absorbArgument(l) 424 l.emit(token.Arg) 425 l.addSemicolon = true 426 return lexStart 427 } 428 429 return l.errorf("Unrecognized character in action: %#U", r) 430 } 431 432 func absorbIdentifier(l *Lexer) { 433 for { 434 r := l.next() 435 436 if isIdentifier(r) { 437 continue // absorb 438 } 439 440 break 441 } 442 443 l.backup() // pos is now ahead of the alphanum 444 } 445 446 func absorbArgument(l *Lexer) { 447 for { 448 r := l.next() 449 450 if isArgument(r) { 451 continue // absorb 452 } 453 454 break 455 } 456 457 l.backup() // pos is now ahead of the alphanum 458 } 459 460 func scanIdentifier(l *Lexer) string { 461 absorbIdentifier(l) 462 463 return l.input[l.start:l.pos] 464 } 465 466 func lexQuote(l *Lexer) stateFn { 467 var data []rune 468 469 data = make([]rune, 0, 256) 470 471 for { 472 r := l.next() 473 474 if r != '"' && r != eof { 475 if r == '\\' { 476 r = l.next() 477 478 switch r { 479 case 'n': 480 data = append(data, '\n') 481 case 't': 482 data = append(data, '\t') 483 case '\\': 484 data = append(data, '\\') 485 case '"': 486 data = append(data, '"') 487 case 'x', 'u', 'U': 488 return l.errorf("Escape types 'x', 'u' and 'U' aren't implemented yet") 489 case '0', '1', '2', '3', '4', '5', '6', '7': 490 x := r - '0' 491 492 for i := 2; i > 0; i-- { 493 r = l.next() 494 495 if r >= '0' && r <= '7' { 496 x = x*8 + r - '0' 497 continue 498 } 499 500 return l.errorf("non-octal character in escape sequence: %c", r) 501 } 502 503 if x > 255 { 504 return l.errorf("octal escape value > 255: %d", x) 505 } 506 507 data = append(data, x) 508 } 509 } else { 510 data = append(data, r) 511 } 512 513 continue 514 } 515 516 if r == eof { 517 return l.errorf("Quoted string not finished: %s", l.input[l.start:]) 518 } 519 520 l.emitVal(token.String, string(data), l.lineStart, l.columnStart) 521 522 l.ignore() // ignores last quote 523 break 524 } 525 526 return lexStart 527 } 528 529 func lexComment(l *Lexer) stateFn { 530 for { 531 r := l.next() 532 533 if isEndOfLine(r) { 534 l.backup() 535 l.emit(token.Comment) 536 537 break 538 } 539 540 if r == eof { 541 l.backup() 542 l.emit(token.Comment) 543 break 544 } 545 } 546 547 return lexStart 548 } 549 550 func lexSpace(l *Lexer) stateFn { 551 ignoreSpaces(l) 552 return lexStart 553 } 554 555 func ignoreSpaces(l *Lexer) { 556 for { 557 r := l.next() 558 559 if !isSpace(r) { 560 break 561 } 562 } 563 564 l.backup() 565 l.ignore() 566 } 567 568 // isSpace reports whether r is a space character. 569 func isSpace(r rune) bool { 570 return r == ' ' || r == '\t' 571 } 572 573 func isArgument(r rune) bool { 574 isId := isAlpha(r) 575 576 return isId || (r != eof && !isEndOfLine(r) && !isSpace(r) && 577 r != '$' && r != '{' && r != '}' && r != '(' && r != ']' && r != '[' && 578 r != ')' && r != '>' && r != '"' && r != ',' && r != ';' && r != '|') 579 } 580 581 func isIdentifier(r rune) bool { 582 return isAlpha(r) || r == '_' 583 } 584 585 // isIdentifier reports whether r is a valid identifier 586 func isAlpha(r rune) bool { 587 return unicode.IsLetter(r) || unicode.IsDigit(r) 588 } 589 590 // isEndOfLine reports whether r is an end-of-line character. 591 func isEndOfLine(r rune) bool { 592 return r == '\r' || r == '\n' 593 }