git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/toml/lex.go (about) 1 package toml 2 3 import ( 4 "fmt" 5 "reflect" 6 "runtime" 7 "strings" 8 "unicode" 9 "unicode/utf8" 10 ) 11 12 type itemType int 13 14 const ( 15 itemError itemType = iota 16 itemNIL // used in the parser to indicate no type 17 itemEOF 18 itemText 19 itemString 20 itemRawString 21 itemMultilineString 22 itemRawMultilineString 23 itemBool 24 itemInteger 25 itemFloat 26 itemDatetime 27 itemArray // the start of an array 28 itemArrayEnd 29 itemTableStart 30 itemTableEnd 31 itemArrayTableStart 32 itemArrayTableEnd 33 itemKeyStart 34 itemKeyEnd 35 itemCommentStart 36 itemInlineTableStart 37 itemInlineTableEnd 38 ) 39 40 const eof = 0 41 42 type stateFn func(lx *lexer) stateFn 43 44 func (p Position) String() string { 45 return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len) 46 } 47 48 type lexer struct { 49 input string 50 start int 51 pos int 52 line int 53 state stateFn 54 items chan item 55 56 // Allow for backing up up to 4 runes. This is necessary because TOML 57 // contains 3-rune tokens (""" and '''). 58 prevWidths [4]int 59 nprev int // how many of prevWidths are in use 60 atEOF bool // If we emit an eof, we can still back up, but it is not OK to call next again. 61 62 // A stack of state functions used to maintain context. 63 // 64 // The idea is to reuse parts of the state machine in various places. For 65 // example, values can appear at the top level or within arbitrarily nested 66 // arrays. The last state on the stack is used after a value has been lexed. 67 // Similarly for comments. 68 stack []stateFn 69 } 70 71 type item struct { 72 typ itemType 73 val string 74 err error 75 pos Position 76 } 77 78 func (lx *lexer) nextItem() item { 79 for { 80 select { 81 case item := <-lx.items: 82 return item 83 default: 84 lx.state = lx.state(lx) 85 //fmt.Printf(" STATE %-24s current: %-10s stack: %s\n", lx.state, lx.current(), lx.stack) 86 } 87 } 88 } 89 90 func lex(input string) *lexer { 91 lx := &lexer{ 92 input: input, 93 state: lexTop, 94 items: make(chan item, 10), 95 stack: make([]stateFn, 0, 10), 96 line: 1, 97 } 98 return lx 99 } 100 101 func (lx *lexer) push(state stateFn) { 102 lx.stack = append(lx.stack, state) 103 } 104 105 func (lx *lexer) pop() stateFn { 106 if len(lx.stack) == 0 { 107 return lx.errorf("BUG in lexer: no states to pop") 108 } 109 last := lx.stack[len(lx.stack)-1] 110 lx.stack = lx.stack[0 : len(lx.stack)-1] 111 return last 112 } 113 114 func (lx *lexer) current() string { 115 return lx.input[lx.start:lx.pos] 116 } 117 118 func (lx lexer) getPos() Position { 119 p := Position{ 120 Line: lx.line, 121 Start: lx.start, 122 Len: lx.pos - lx.start, 123 } 124 if p.Len <= 0 { 125 p.Len = 1 126 } 127 return p 128 } 129 130 func (lx *lexer) emit(typ itemType) { 131 // Needed for multiline strings ending with an incomplete UTF-8 sequence. 132 if lx.start > lx.pos { 133 lx.error(errLexUTF8{lx.input[lx.pos]}) 134 return 135 } 136 lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()} 137 lx.start = lx.pos 138 } 139 140 func (lx *lexer) emitTrim(typ itemType) { 141 lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())} 142 lx.start = lx.pos 143 } 144 145 func (lx *lexer) next() (r rune) { 146 if lx.atEOF { 147 panic("BUG in lexer: next called after EOF") 148 } 149 if lx.pos >= len(lx.input) { 150 lx.atEOF = true 151 return eof 152 } 153 154 if lx.input[lx.pos] == '\n' { 155 lx.line++ 156 } 157 lx.prevWidths[3] = lx.prevWidths[2] 158 lx.prevWidths[2] = lx.prevWidths[1] 159 lx.prevWidths[1] = lx.prevWidths[0] 160 if lx.nprev < 4 { 161 lx.nprev++ 162 } 163 164 r, w := utf8.DecodeRuneInString(lx.input[lx.pos:]) 165 if r == utf8.RuneError { 166 lx.error(errLexUTF8{lx.input[lx.pos]}) 167 return utf8.RuneError 168 } 169 170 // Note: don't use peek() here, as this calls next(). 171 if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) { 172 lx.errorControlChar(r) 173 return utf8.RuneError 174 } 175 176 lx.prevWidths[0] = w 177 lx.pos += w 178 return r 179 } 180 181 // ignore skips over the pending input before this point. 182 func (lx *lexer) ignore() { 183 lx.start = lx.pos 184 } 185 186 // backup steps back one rune. Can be called 4 times between calls to next. 187 func (lx *lexer) backup() { 188 if lx.atEOF { 189 lx.atEOF = false 190 return 191 } 192 if lx.nprev < 1 { 193 panic("BUG in lexer: backed up too far") 194 } 195 w := lx.prevWidths[0] 196 lx.prevWidths[0] = lx.prevWidths[1] 197 lx.prevWidths[1] = lx.prevWidths[2] 198 lx.prevWidths[2] = lx.prevWidths[3] 199 lx.nprev-- 200 201 lx.pos -= w 202 if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' { 203 lx.line-- 204 } 205 } 206 207 // accept consumes the next rune if it's equal to `valid`. 208 func (lx *lexer) accept(valid rune) bool { 209 if lx.next() == valid { 210 return true 211 } 212 lx.backup() 213 return false 214 } 215 216 // peek returns but does not consume the next rune in the input. 217 func (lx *lexer) peek() rune { 218 r := lx.next() 219 lx.backup() 220 return r 221 } 222 223 // skip ignores all input that matches the given predicate. 224 func (lx *lexer) skip(pred func(rune) bool) { 225 for { 226 r := lx.next() 227 if pred(r) { 228 continue 229 } 230 lx.backup() 231 lx.ignore() 232 return 233 } 234 } 235 236 // error stops all lexing by emitting an error and returning `nil`. 237 // 238 // Note that any value that is a character is escaped if it's a special 239 // character (newlines, tabs, etc.). 240 func (lx *lexer) error(err error) stateFn { 241 if lx.atEOF { 242 return lx.errorPrevLine(err) 243 } 244 lx.items <- item{typ: itemError, pos: lx.getPos(), err: err} 245 return nil 246 } 247 248 // errorfPrevline is like error(), but sets the position to the last column of 249 // the previous line. 250 // 251 // This is so that unexpected EOF or NL errors don't show on a new blank line. 252 func (lx *lexer) errorPrevLine(err error) stateFn { 253 pos := lx.getPos() 254 pos.Line-- 255 pos.Len = 1 256 pos.Start = lx.pos - 1 257 lx.items <- item{typ: itemError, pos: pos, err: err} 258 return nil 259 } 260 261 // errorPos is like error(), but allows explicitly setting the position. 262 func (lx *lexer) errorPos(start, length int, err error) stateFn { 263 pos := lx.getPos() 264 pos.Start = start 265 pos.Len = length 266 lx.items <- item{typ: itemError, pos: pos, err: err} 267 return nil 268 } 269 270 // errorf is like error, and creates a new error. 271 func (lx *lexer) errorf(format string, values ...interface{}) stateFn { 272 if lx.atEOF { 273 pos := lx.getPos() 274 pos.Line-- 275 pos.Len = 1 276 pos.Start = lx.pos - 1 277 lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)} 278 return nil 279 } 280 lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)} 281 return nil 282 } 283 284 func (lx *lexer) errorControlChar(cc rune) stateFn { 285 return lx.errorPos(lx.pos-1, 1, errLexControl{cc}) 286 } 287 288 // lexTop consumes elements at the top level of TOML data. 289 func lexTop(lx *lexer) stateFn { 290 r := lx.next() 291 if isWhitespace(r) || isNL(r) { 292 return lexSkip(lx, lexTop) 293 } 294 switch r { 295 case '#': 296 lx.push(lexTop) 297 return lexCommentStart 298 case '[': 299 return lexTableStart 300 case eof: 301 if lx.pos > lx.start { 302 return lx.errorf("unexpected EOF") 303 } 304 lx.emit(itemEOF) 305 return nil 306 } 307 308 // At this point, the only valid item can be a key, so we back up 309 // and let the key lexer do the rest. 310 lx.backup() 311 lx.push(lexTopEnd) 312 return lexKeyStart 313 } 314 315 // lexTopEnd is entered whenever a top-level item has been consumed. (A value 316 // or a table.) It must see only whitespace, and will turn back to lexTop 317 // upon a newline. If it sees EOF, it will quit the lexer successfully. 318 func lexTopEnd(lx *lexer) stateFn { 319 r := lx.next() 320 switch { 321 case r == '#': 322 // a comment will read to a newline for us. 323 lx.push(lexTop) 324 return lexCommentStart 325 case isWhitespace(r): 326 return lexTopEnd 327 case isNL(r): 328 lx.ignore() 329 return lexTop 330 case r == eof: 331 lx.emit(itemEOF) 332 return nil 333 } 334 return lx.errorf( 335 "expected a top-level item to end with a newline, comment, or EOF, but got %q instead", 336 r) 337 } 338 339 // lexTable lexes the beginning of a table. Namely, it makes sure that 340 // it starts with a character other than '.' and ']'. 341 // It assumes that '[' has already been consumed. 342 // It also handles the case that this is an item in an array of tables. 343 // e.g., '[[name]]'. 344 func lexTableStart(lx *lexer) stateFn { 345 if lx.peek() == '[' { 346 lx.next() 347 lx.emit(itemArrayTableStart) 348 lx.push(lexArrayTableEnd) 349 } else { 350 lx.emit(itemTableStart) 351 lx.push(lexTableEnd) 352 } 353 return lexTableNameStart 354 } 355 356 func lexTableEnd(lx *lexer) stateFn { 357 lx.emit(itemTableEnd) 358 return lexTopEnd 359 } 360 361 func lexArrayTableEnd(lx *lexer) stateFn { 362 if r := lx.next(); r != ']' { 363 return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r) 364 } 365 lx.emit(itemArrayTableEnd) 366 return lexTopEnd 367 } 368 369 func lexTableNameStart(lx *lexer) stateFn { 370 lx.skip(isWhitespace) 371 switch r := lx.peek(); { 372 case r == ']' || r == eof: 373 return lx.errorf("unexpected end of table name (table names cannot be empty)") 374 case r == '.': 375 return lx.errorf("unexpected table separator (table names cannot be empty)") 376 case r == '"' || r == '\'': 377 lx.ignore() 378 lx.push(lexTableNameEnd) 379 return lexQuotedName 380 default: 381 lx.push(lexTableNameEnd) 382 return lexBareName 383 } 384 } 385 386 // lexTableNameEnd reads the end of a piece of a table name, optionally 387 // consuming whitespace. 388 func lexTableNameEnd(lx *lexer) stateFn { 389 lx.skip(isWhitespace) 390 switch r := lx.next(); { 391 case isWhitespace(r): 392 return lexTableNameEnd 393 case r == '.': 394 lx.ignore() 395 return lexTableNameStart 396 case r == ']': 397 return lx.pop() 398 default: 399 return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r) 400 } 401 } 402 403 // lexBareName lexes one part of a key or table. 404 // 405 // It assumes that at least one valid character for the table has already been 406 // read. 407 // 408 // Lexes only one part, e.g. only 'a' inside 'a.b'. 409 func lexBareName(lx *lexer) stateFn { 410 r := lx.next() 411 if isBareKeyChar(r) { 412 return lexBareName 413 } 414 lx.backup() 415 lx.emit(itemText) 416 return lx.pop() 417 } 418 419 // lexBareName lexes one part of a key or table. 420 // 421 // It assumes that at least one valid character for the table has already been 422 // read. 423 // 424 // Lexes only one part, e.g. only '"a"' inside '"a".b'. 425 func lexQuotedName(lx *lexer) stateFn { 426 r := lx.next() 427 switch { 428 case isWhitespace(r): 429 return lexSkip(lx, lexValue) 430 case r == '"': 431 lx.ignore() // ignore the '"' 432 return lexString 433 case r == '\'': 434 lx.ignore() // ignore the "'" 435 return lexRawString 436 case r == eof: 437 return lx.errorf("unexpected EOF; expected value") 438 default: 439 return lx.errorf("expected value but found %q instead", r) 440 } 441 } 442 443 // lexKeyStart consumes all key parts until a '='. 444 func lexKeyStart(lx *lexer) stateFn { 445 lx.skip(isWhitespace) 446 switch r := lx.peek(); { 447 case r == '=' || r == eof: 448 return lx.errorf("unexpected '=': key name appears blank") 449 case r == '.': 450 return lx.errorf("unexpected '.': keys cannot start with a '.'") 451 case r == '"' || r == '\'': 452 lx.ignore() 453 fallthrough 454 default: // Bare key 455 lx.emit(itemKeyStart) 456 return lexKeyNameStart 457 } 458 } 459 460 func lexKeyNameStart(lx *lexer) stateFn { 461 lx.skip(isWhitespace) 462 switch r := lx.peek(); { 463 case r == '=' || r == eof: 464 return lx.errorf("unexpected '='") 465 case r == '.': 466 return lx.errorf("unexpected '.'") 467 case r == '"' || r == '\'': 468 lx.ignore() 469 lx.push(lexKeyEnd) 470 return lexQuotedName 471 default: 472 lx.push(lexKeyEnd) 473 return lexBareName 474 } 475 } 476 477 // lexKeyEnd consumes the end of a key and trims whitespace (up to the key 478 // separator). 479 func lexKeyEnd(lx *lexer) stateFn { 480 lx.skip(isWhitespace) 481 switch r := lx.next(); { 482 case isWhitespace(r): 483 return lexSkip(lx, lexKeyEnd) 484 case r == eof: 485 return lx.errorf("unexpected EOF; expected key separator '='") 486 case r == '.': 487 lx.ignore() 488 return lexKeyNameStart 489 case r == '=': 490 lx.emit(itemKeyEnd) 491 return lexSkip(lx, lexValue) 492 default: 493 return lx.errorf("expected '.' or '=', but got %q instead", r) 494 } 495 } 496 497 // lexValue starts the consumption of a value anywhere a value is expected. 498 // lexValue will ignore whitespace. 499 // After a value is lexed, the last state on the next is popped and returned. 500 func lexValue(lx *lexer) stateFn { 501 // We allow whitespace to precede a value, but NOT newlines. 502 // In array syntax, the array states are responsible for ignoring newlines. 503 r := lx.next() 504 switch { 505 case isWhitespace(r): 506 return lexSkip(lx, lexValue) 507 case isDigit(r): 508 lx.backup() // avoid an extra state and use the same as above 509 return lexNumberOrDateStart 510 } 511 switch r { 512 case '[': 513 lx.ignore() 514 lx.emit(itemArray) 515 return lexArrayValue 516 case '{': 517 lx.ignore() 518 lx.emit(itemInlineTableStart) 519 return lexInlineTableValue 520 case '"': 521 if lx.accept('"') { 522 if lx.accept('"') { 523 lx.ignore() // Ignore """ 524 return lexMultilineString 525 } 526 lx.backup() 527 } 528 lx.ignore() // ignore the '"' 529 return lexString 530 case '\'': 531 if lx.accept('\'') { 532 if lx.accept('\'') { 533 lx.ignore() // Ignore """ 534 return lexMultilineRawString 535 } 536 lx.backup() 537 } 538 lx.ignore() // ignore the "'" 539 return lexRawString 540 case '.': // special error case, be kind to users 541 return lx.errorf("floats must start with a digit, not '.'") 542 case 'i', 'n': 543 if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) { 544 lx.emit(itemFloat) 545 return lx.pop() 546 } 547 case '-', '+': 548 return lexDecimalNumberStart 549 } 550 if unicode.IsLetter(r) { 551 // Be permissive here; lexBool will give a nice error if the 552 // user wrote something like 553 // x = foo 554 // (i.e. not 'true' or 'false' but is something else word-like.) 555 lx.backup() 556 return lexBool 557 } 558 if r == eof { 559 return lx.errorf("unexpected EOF; expected value") 560 } 561 return lx.errorf("expected value but found %q instead", r) 562 } 563 564 // lexArrayValue consumes one value in an array. It assumes that '[' or ',' 565 // have already been consumed. All whitespace and newlines are ignored. 566 func lexArrayValue(lx *lexer) stateFn { 567 r := lx.next() 568 switch { 569 case isWhitespace(r) || isNL(r): 570 return lexSkip(lx, lexArrayValue) 571 case r == '#': 572 lx.push(lexArrayValue) 573 return lexCommentStart 574 case r == ',': 575 return lx.errorf("unexpected comma") 576 case r == ']': 577 return lexArrayEnd 578 } 579 580 lx.backup() 581 lx.push(lexArrayValueEnd) 582 return lexValue 583 } 584 585 // lexArrayValueEnd consumes everything between the end of an array value and 586 // the next value (or the end of the array): it ignores whitespace and newlines 587 // and expects either a ',' or a ']'. 588 func lexArrayValueEnd(lx *lexer) stateFn { 589 switch r := lx.next(); { 590 case isWhitespace(r) || isNL(r): 591 return lexSkip(lx, lexArrayValueEnd) 592 case r == '#': 593 lx.push(lexArrayValueEnd) 594 return lexCommentStart 595 case r == ',': 596 lx.ignore() 597 return lexArrayValue // move on to the next value 598 case r == ']': 599 return lexArrayEnd 600 default: 601 return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r)) 602 } 603 } 604 605 // lexArrayEnd finishes the lexing of an array. 606 // It assumes that a ']' has just been consumed. 607 func lexArrayEnd(lx *lexer) stateFn { 608 lx.ignore() 609 lx.emit(itemArrayEnd) 610 return lx.pop() 611 } 612 613 // lexInlineTableValue consumes one key/value pair in an inline table. 614 // It assumes that '{' or ',' have already been consumed. Whitespace is ignored. 615 func lexInlineTableValue(lx *lexer) stateFn { 616 r := lx.next() 617 switch { 618 case isWhitespace(r): 619 return lexSkip(lx, lexInlineTableValue) 620 case isNL(r): 621 return lx.errorPrevLine(errLexInlineTableNL{}) 622 case r == '#': 623 lx.push(lexInlineTableValue) 624 return lexCommentStart 625 case r == ',': 626 return lx.errorf("unexpected comma") 627 case r == '}': 628 return lexInlineTableEnd 629 } 630 lx.backup() 631 lx.push(lexInlineTableValueEnd) 632 return lexKeyStart 633 } 634 635 // lexInlineTableValueEnd consumes everything between the end of an inline table 636 // key/value pair and the next pair (or the end of the table): 637 // it ignores whitespace and expects either a ',' or a '}'. 638 func lexInlineTableValueEnd(lx *lexer) stateFn { 639 switch r := lx.next(); { 640 case isWhitespace(r): 641 return lexSkip(lx, lexInlineTableValueEnd) 642 case isNL(r): 643 return lx.errorPrevLine(errLexInlineTableNL{}) 644 case r == '#': 645 lx.push(lexInlineTableValueEnd) 646 return lexCommentStart 647 case r == ',': 648 lx.ignore() 649 lx.skip(isWhitespace) 650 if lx.peek() == '}' { 651 return lx.errorf("trailing comma not allowed in inline tables") 652 } 653 return lexInlineTableValue 654 case r == '}': 655 return lexInlineTableEnd 656 default: 657 return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r)) 658 } 659 } 660 661 func runeOrEOF(r rune) string { 662 if r == eof { 663 return "end of file" 664 } 665 return "'" + string(r) + "'" 666 } 667 668 // lexInlineTableEnd finishes the lexing of an inline table. 669 // It assumes that a '}' has just been consumed. 670 func lexInlineTableEnd(lx *lexer) stateFn { 671 lx.ignore() 672 lx.emit(itemInlineTableEnd) 673 return lx.pop() 674 } 675 676 // lexString consumes the inner contents of a string. It assumes that the 677 // beginning '"' has already been consumed and ignored. 678 func lexString(lx *lexer) stateFn { 679 r := lx.next() 680 switch { 681 case r == eof: 682 return lx.errorf(`unexpected EOF; expected '"'`) 683 case isNL(r): 684 return lx.errorPrevLine(errLexStringNL{}) 685 case r == '\\': 686 lx.push(lexString) 687 return lexStringEscape 688 case r == '"': 689 lx.backup() 690 lx.emit(itemString) 691 lx.next() 692 lx.ignore() 693 return lx.pop() 694 } 695 return lexString 696 } 697 698 // lexMultilineString consumes the inner contents of a string. It assumes that 699 // the beginning '"""' has already been consumed and ignored. 700 func lexMultilineString(lx *lexer) stateFn { 701 r := lx.next() 702 switch r { 703 default: 704 return lexMultilineString 705 case eof: 706 return lx.errorf(`unexpected EOF; expected '"""'`) 707 case '\\': 708 return lexMultilineStringEscape 709 case '"': 710 /// Found " → try to read two more "". 711 if lx.accept('"') { 712 if lx.accept('"') { 713 /// Peek ahead: the string can contain " and "", including at the 714 /// end: """str""""" 715 /// 6 or more at the end, however, is an error. 716 if lx.peek() == '"' { 717 /// Check if we already lexed 5 's; if so we have 6 now, and 718 /// that's just too many man! 719 /// 720 /// Second check is for the edge case: 721 /// 722 /// two quotes allowed. 723 /// vv 724 /// """lol \"""""" 725 /// ^^ ^^^---- closing three 726 /// escaped 727 /// 728 /// But ugly, but it works 729 if strings.HasSuffix(lx.current(), `"""""`) && !strings.HasSuffix(lx.current(), `\"""""`) { 730 return lx.errorf(`unexpected '""""""'`) 731 } 732 lx.backup() 733 lx.backup() 734 return lexMultilineString 735 } 736 737 lx.backup() /// backup: don't include the """ in the item. 738 lx.backup() 739 lx.backup() 740 lx.emit(itemMultilineString) 741 lx.next() /// Read over ''' again and discard it. 742 lx.next() 743 lx.next() 744 lx.ignore() 745 return lx.pop() 746 } 747 lx.backup() 748 } 749 return lexMultilineString 750 } 751 } 752 753 // lexRawString consumes a raw string. Nothing can be escaped in such a string. 754 // It assumes that the beginning "'" has already been consumed and ignored. 755 func lexRawString(lx *lexer) stateFn { 756 r := lx.next() 757 switch { 758 default: 759 return lexRawString 760 case r == eof: 761 return lx.errorf(`unexpected EOF; expected "'"`) 762 case isNL(r): 763 return lx.errorPrevLine(errLexStringNL{}) 764 case r == '\'': 765 lx.backup() 766 lx.emit(itemRawString) 767 lx.next() 768 lx.ignore() 769 return lx.pop() 770 } 771 } 772 773 // lexMultilineRawString consumes a raw string. Nothing can be escaped in such 774 // a string. It assumes that the beginning "”'" has already been consumed and 775 // ignored. 776 func lexMultilineRawString(lx *lexer) stateFn { 777 r := lx.next() 778 switch r { 779 default: 780 return lexMultilineRawString 781 case eof: 782 return lx.errorf(`unexpected EOF; expected "'''"`) 783 case '\'': 784 /// Found ' → try to read two more ''. 785 if lx.accept('\'') { 786 if lx.accept('\'') { 787 /// Peek ahead: the string can contain ' and '', including at the 788 /// end: '''str''''' 789 /// 6 or more at the end, however, is an error. 790 if lx.peek() == '\'' { 791 /// Check if we already lexed 5 's; if so we have 6 now, and 792 /// that's just too many man! 793 if strings.HasSuffix(lx.current(), "'''''") { 794 return lx.errorf(`unexpected "''''''"`) 795 } 796 lx.backup() 797 lx.backup() 798 return lexMultilineRawString 799 } 800 801 lx.backup() /// backup: don't include the ''' in the item. 802 lx.backup() 803 lx.backup() 804 lx.emit(itemRawMultilineString) 805 lx.next() /// Read over ''' again and discard it. 806 lx.next() 807 lx.next() 808 lx.ignore() 809 return lx.pop() 810 } 811 lx.backup() 812 } 813 return lexMultilineRawString 814 } 815 } 816 817 // lexMultilineStringEscape consumes an escaped character. It assumes that the 818 // preceding '\\' has already been consumed. 819 func lexMultilineStringEscape(lx *lexer) stateFn { 820 if isNL(lx.next()) { /// \ escaping newline. 821 return lexMultilineString 822 } 823 lx.backup() 824 lx.push(lexMultilineString) 825 return lexStringEscape(lx) 826 } 827 828 func lexStringEscape(lx *lexer) stateFn { 829 r := lx.next() 830 switch r { 831 case 'b': 832 fallthrough 833 case 't': 834 fallthrough 835 case 'n': 836 fallthrough 837 case 'f': 838 fallthrough 839 case 'r': 840 fallthrough 841 case '"': 842 fallthrough 843 case ' ', '\t': 844 // Inside """ .. """ strings you can use \ to escape newlines, and any 845 // amount of whitespace can be between the \ and \n. 846 fallthrough 847 case '\\': 848 return lx.pop() 849 case 'u': 850 return lexShortUnicodeEscape 851 case 'U': 852 return lexLongUnicodeEscape 853 } 854 return lx.error(errLexEscape{r}) 855 } 856 857 func lexShortUnicodeEscape(lx *lexer) stateFn { 858 var r rune 859 for i := 0; i < 4; i++ { 860 r = lx.next() 861 if !isHexadecimal(r) { 862 return lx.errorf( 863 `expected four hexadecimal digits after '\u', but got %q instead`, 864 lx.current()) 865 } 866 } 867 return lx.pop() 868 } 869 870 func lexLongUnicodeEscape(lx *lexer) stateFn { 871 var r rune 872 for i := 0; i < 8; i++ { 873 r = lx.next() 874 if !isHexadecimal(r) { 875 return lx.errorf( 876 `expected eight hexadecimal digits after '\U', but got %q instead`, 877 lx.current()) 878 } 879 } 880 return lx.pop() 881 } 882 883 // lexNumberOrDateStart processes the first character of a value which begins 884 // with a digit. It exists to catch values starting with '0', so that 885 // lexBaseNumberOrDate can differentiate base prefixed integers from other 886 // types. 887 func lexNumberOrDateStart(lx *lexer) stateFn { 888 r := lx.next() 889 switch r { 890 case '0': 891 return lexBaseNumberOrDate 892 } 893 894 if !isDigit(r) { 895 // The only way to reach this state is if the value starts 896 // with a digit, so specifically treat anything else as an 897 // error. 898 return lx.errorf("expected a digit but got %q", r) 899 } 900 901 return lexNumberOrDate 902 } 903 904 // lexNumberOrDate consumes either an integer, float or datetime. 905 func lexNumberOrDate(lx *lexer) stateFn { 906 r := lx.next() 907 if isDigit(r) { 908 return lexNumberOrDate 909 } 910 switch r { 911 case '-', ':': 912 return lexDatetime 913 case '_': 914 return lexDecimalNumber 915 case '.', 'e', 'E': 916 return lexFloat 917 } 918 919 lx.backup() 920 lx.emit(itemInteger) 921 return lx.pop() 922 } 923 924 // lexDatetime consumes a Datetime, to a first approximation. 925 // The parser validates that it matches one of the accepted formats. 926 func lexDatetime(lx *lexer) stateFn { 927 r := lx.next() 928 if isDigit(r) { 929 return lexDatetime 930 } 931 switch r { 932 case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+': 933 return lexDatetime 934 } 935 936 lx.backup() 937 lx.emitTrim(itemDatetime) 938 return lx.pop() 939 } 940 941 // lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix. 942 func lexHexInteger(lx *lexer) stateFn { 943 r := lx.next() 944 if isHexadecimal(r) { 945 return lexHexInteger 946 } 947 switch r { 948 case '_': 949 return lexHexInteger 950 } 951 952 lx.backup() 953 lx.emit(itemInteger) 954 return lx.pop() 955 } 956 957 // lexOctalInteger consumes an octal integer after seeing the '0o' prefix. 958 func lexOctalInteger(lx *lexer) stateFn { 959 r := lx.next() 960 if isOctal(r) { 961 return lexOctalInteger 962 } 963 switch r { 964 case '_': 965 return lexOctalInteger 966 } 967 968 lx.backup() 969 lx.emit(itemInteger) 970 return lx.pop() 971 } 972 973 // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix. 974 func lexBinaryInteger(lx *lexer) stateFn { 975 r := lx.next() 976 if isBinary(r) { 977 return lexBinaryInteger 978 } 979 switch r { 980 case '_': 981 return lexBinaryInteger 982 } 983 984 lx.backup() 985 lx.emit(itemInteger) 986 return lx.pop() 987 } 988 989 // lexDecimalNumber consumes a decimal float or integer. 990 func lexDecimalNumber(lx *lexer) stateFn { 991 r := lx.next() 992 if isDigit(r) { 993 return lexDecimalNumber 994 } 995 switch r { 996 case '.', 'e', 'E': 997 return lexFloat 998 case '_': 999 return lexDecimalNumber 1000 } 1001 1002 lx.backup() 1003 lx.emit(itemInteger) 1004 return lx.pop() 1005 } 1006 1007 // lexDecimalNumber consumes the first digit of a number beginning with a sign. 1008 // It assumes the sign has already been consumed. Values which start with a sign 1009 // are only allowed to be decimal integers or floats. 1010 // 1011 // The special "nan" and "inf" values are also recognized. 1012 func lexDecimalNumberStart(lx *lexer) stateFn { 1013 r := lx.next() 1014 1015 // Special error cases to give users better error messages 1016 switch r { 1017 case 'i': 1018 if !lx.accept('n') || !lx.accept('f') { 1019 return lx.errorf("invalid float: '%s'", lx.current()) 1020 } 1021 lx.emit(itemFloat) 1022 return lx.pop() 1023 case 'n': 1024 if !lx.accept('a') || !lx.accept('n') { 1025 return lx.errorf("invalid float: '%s'", lx.current()) 1026 } 1027 lx.emit(itemFloat) 1028 return lx.pop() 1029 case '0': 1030 p := lx.peek() 1031 switch p { 1032 case 'b', 'o', 'x': 1033 return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p) 1034 } 1035 case '.': 1036 return lx.errorf("floats must start with a digit, not '.'") 1037 } 1038 1039 if isDigit(r) { 1040 return lexDecimalNumber 1041 } 1042 1043 return lx.errorf("expected a digit but got %q", r) 1044 } 1045 1046 // lexBaseNumberOrDate differentiates between the possible values which 1047 // start with '0'. It assumes that before reaching this state, the initial '0' 1048 // has been consumed. 1049 func lexBaseNumberOrDate(lx *lexer) stateFn { 1050 r := lx.next() 1051 // Note: All datetimes start with at least two digits, so we don't 1052 // handle date characters (':', '-', etc.) here. 1053 if isDigit(r) { 1054 return lexNumberOrDate 1055 } 1056 switch r { 1057 case '_': 1058 // Can only be decimal, because there can't be an underscore 1059 // between the '0' and the base designator, and dates can't 1060 // contain underscores. 1061 return lexDecimalNumber 1062 case '.', 'e', 'E': 1063 return lexFloat 1064 case 'b': 1065 r = lx.peek() 1066 if !isBinary(r) { 1067 lx.errorf("not a binary number: '%s%c'", lx.current(), r) 1068 } 1069 return lexBinaryInteger 1070 case 'o': 1071 r = lx.peek() 1072 if !isOctal(r) { 1073 lx.errorf("not an octal number: '%s%c'", lx.current(), r) 1074 } 1075 return lexOctalInteger 1076 case 'x': 1077 r = lx.peek() 1078 if !isHexadecimal(r) { 1079 lx.errorf("not a hexidecimal number: '%s%c'", lx.current(), r) 1080 } 1081 return lexHexInteger 1082 } 1083 1084 lx.backup() 1085 lx.emit(itemInteger) 1086 return lx.pop() 1087 } 1088 1089 // lexFloat consumes the elements of a float. It allows any sequence of 1090 // float-like characters, so floats emitted by the lexer are only a first 1091 // approximation and must be validated by the parser. 1092 func lexFloat(lx *lexer) stateFn { 1093 r := lx.next() 1094 if isDigit(r) { 1095 return lexFloat 1096 } 1097 switch r { 1098 case '_', '.', '-', '+', 'e', 'E': 1099 return lexFloat 1100 } 1101 1102 lx.backup() 1103 lx.emit(itemFloat) 1104 return lx.pop() 1105 } 1106 1107 // lexBool consumes a bool string: 'true' or 'false. 1108 func lexBool(lx *lexer) stateFn { 1109 var rs []rune 1110 for { 1111 r := lx.next() 1112 if !unicode.IsLetter(r) { 1113 lx.backup() 1114 break 1115 } 1116 rs = append(rs, r) 1117 } 1118 s := string(rs) 1119 switch s { 1120 case "true", "false": 1121 lx.emit(itemBool) 1122 return lx.pop() 1123 } 1124 return lx.errorf("expected value but found %q instead", s) 1125 } 1126 1127 // lexCommentStart begins the lexing of a comment. It will emit 1128 // itemCommentStart and consume no characters, passing control to lexComment. 1129 func lexCommentStart(lx *lexer) stateFn { 1130 lx.ignore() 1131 lx.emit(itemCommentStart) 1132 return lexComment 1133 } 1134 1135 // lexComment lexes an entire comment. It assumes that '#' has been consumed. 1136 // It will consume *up to* the first newline character, and pass control 1137 // back to the last state on the stack. 1138 func lexComment(lx *lexer) stateFn { 1139 switch r := lx.next(); { 1140 case isNL(r) || r == eof: 1141 lx.backup() 1142 lx.emit(itemText) 1143 return lx.pop() 1144 default: 1145 return lexComment 1146 } 1147 } 1148 1149 // lexSkip ignores all slurped input and moves on to the next state. 1150 func lexSkip(lx *lexer, nextState stateFn) stateFn { 1151 lx.ignore() 1152 return nextState 1153 } 1154 1155 func (s stateFn) String() string { 1156 name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name() 1157 if i := strings.LastIndexByte(name, '.'); i > -1 { 1158 name = name[i+1:] 1159 } 1160 if s == nil { 1161 name = "<nil>" 1162 } 1163 return name + "()" 1164 } 1165 1166 func (itype itemType) String() string { 1167 switch itype { 1168 case itemError: 1169 return "Error" 1170 case itemNIL: 1171 return "NIL" 1172 case itemEOF: 1173 return "EOF" 1174 case itemText: 1175 return "Text" 1176 case itemString, itemRawString, itemMultilineString, itemRawMultilineString: 1177 return "String" 1178 case itemBool: 1179 return "Bool" 1180 case itemInteger: 1181 return "Integer" 1182 case itemFloat: 1183 return "Float" 1184 case itemDatetime: 1185 return "DateTime" 1186 case itemTableStart: 1187 return "TableStart" 1188 case itemTableEnd: 1189 return "TableEnd" 1190 case itemKeyStart: 1191 return "KeyStart" 1192 case itemKeyEnd: 1193 return "KeyEnd" 1194 case itemArray: 1195 return "Array" 1196 case itemArrayEnd: 1197 return "ArrayEnd" 1198 case itemCommentStart: 1199 return "CommentStart" 1200 case itemInlineTableStart: 1201 return "InlineTableStart" 1202 case itemInlineTableEnd: 1203 return "InlineTableEnd" 1204 } 1205 panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype))) 1206 } 1207 1208 func (item item) String() string { 1209 return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val) 1210 } 1211 1212 func isWhitespace(r rune) bool { return r == '\t' || r == ' ' } 1213 func isNL(r rune) bool { return r == '\n' || r == '\r' } 1214 func isControl(r rune) bool { // Control characters except \t, \r, \n 1215 switch r { 1216 case '\t', '\r', '\n': 1217 return false 1218 default: 1219 return (r >= 0x00 && r <= 0x1f) || r == 0x7f 1220 } 1221 } 1222 func isDigit(r rune) bool { return r >= '0' && r <= '9' } 1223 func isBinary(r rune) bool { return r == '0' || r == '1' } 1224 func isOctal(r rune) bool { return r >= '0' && r <= '7' } 1225 func isHexadecimal(r rune) bool { 1226 return (r >= '0' && r <= '9') || (r >= 'a' && r <= 'f') || (r >= 'A' && r <= 'F') 1227 } 1228 func isBareKeyChar(r rune) bool { 1229 return (r >= 'A' && r <= 'Z') || 1230 (r >= 'a' && r <= 'z') || 1231 (r >= '0' && r <= '9') || 1232 r == '_' || r == '-' 1233 }