git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/toml/lex.go

git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/toml/lex.go (about)

     1  package toml
     2  
     3  import (
     4  	"fmt"
     5  	"reflect"
     6  	"runtime"
     7  	"strings"
     8  	"unicode"
     9  	"unicode/utf8"
    10  )
    11  
// itemType identifies the kind of an item (token) produced by the lexer.
type itemType int

// The item types the lexer can emit. The numeric values are assigned with
// iota, so the declaration order determines each value.
const (
	itemError itemType = iota // lexing failed; the item carries the error
	itemNIL            // used in the parser to indicate no type
	itemEOF            // end of input was reached
	itemText           // a bare name (one part of a key or table name)
	itemString
	itemRawString
	itemMultilineString
	itemRawMultilineString
	itemBool
	itemInteger
	itemFloat
	itemDatetime
	itemArray // the start of an array
	itemArrayEnd
	itemTableStart
	itemTableEnd
	itemArrayTableStart
	itemArrayTableEnd
	itemKeyStart
	itemKeyEnd
	itemCommentStart
	itemInlineTableStart
	itemInlineTableEnd
)
    39  
// eof is the rune value lexer.next returns once the input is exhausted.
const eof = 0

// stateFn is one state of the lexer's state machine. It consumes some input
// from lx and returns the state to run next; nextItem keeps running the
// returned states until an item is available.
type stateFn func(lx *lexer) stateFn
    43  
    44  func (p Position) String() string {
    45  	return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len)
    46  }
    47  
// lexer holds the scanning state: the input, the extent of the pending item,
// the state machine's current state, and the queue of emitted items.
type lexer struct {
	input string    // the complete text being lexed
	start int       // byte offset where the pending (not yet emitted) item begins
	pos   int       // byte offset of the next rune to read
	line  int       // line counter; lex starts it at 1 and next bumps it on '\n'
	state stateFn   // the state nextItem runs next
	items chan item // buffered queue of emitted items, drained by nextItem

	// Allow for backing up up to 4 runes. This is necessary because TOML
	// contains 3-rune tokens (""" and ''').
	prevWidths [4]int
	nprev      int  // how many of prevWidths are in use
	atEOF      bool // If we emit an eof, we can still back up, but it is not OK to call next again.

	// A stack of state functions used to maintain context.
	//
	// The idea is to reuse parts of the state machine in various places. For
	// example, values can appear at the top level or within arbitrarily nested
	// arrays. The last state on the stack is used after a value has been lexed.
	// Similarly for comments.
	stack []stateFn
}
    70  
// item is a single token produced by the lexer and handed out by nextItem.
type item struct {
	typ itemType // kind of token
	val string   // token text; filled in by emit/emitTrim, left empty on error items
	err error    // the failure, set by the lexer's error helpers on itemError items
	pos Position // location of the token in the input
}
    77  
    78  func (lx *lexer) nextItem() item {
    79  	for {
    80  		select {
    81  		case item := <-lx.items:
    82  			return item
    83  		default:
    84  			lx.state = lx.state(lx)
    85  			//fmt.Printf("     STATE %-24s  current: %-10s	stack: %s\n", lx.state, lx.current(), lx.stack)
    86  		}
    87  	}
    88  }
    89  
    90  func lex(input string) *lexer {
    91  	lx := &lexer{
    92  		input: input,
    93  		state: lexTop,
    94  		items: make(chan item, 10),
    95  		stack: make([]stateFn, 0, 10),
    96  		line:  1,
    97  	}
    98  	return lx
    99  }
   100  
// push places state on top of the state stack so that a later pop returns it.
func (lx *lexer) push(state stateFn) {
	lx.stack = append(lx.stack, state)
}
   104  
   105  func (lx *lexer) pop() stateFn {
   106  	if len(lx.stack) == 0 {
   107  		return lx.errorf("BUG in lexer: no states to pop")
   108  	}
   109  	last := lx.stack[len(lx.stack)-1]
   110  	lx.stack = lx.stack[0 : len(lx.stack)-1]
   111  	return last
   112  }
   113  
   114  func (lx *lexer) current() string {
   115  	return lx.input[lx.start:lx.pos]
   116  }
   117  
   118  func (lx lexer) getPos() Position {
   119  	p := Position{
   120  		Line:  lx.line,
   121  		Start: lx.start,
   122  		Len:   lx.pos - lx.start,
   123  	}
   124  	if p.Len <= 0 {
   125  		p.Len = 1
   126  	}
   127  	return p
   128  }
   129  
   130  func (lx *lexer) emit(typ itemType) {
   131  	// Needed for multiline strings ending with an incomplete UTF-8 sequence.
   132  	if lx.start > lx.pos {
   133  		lx.error(errLexUTF8{lx.input[lx.pos]})
   134  		return
   135  	}
   136  	lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()}
   137  	lx.start = lx.pos
   138  }
   139  
   140  func (lx *lexer) emitTrim(typ itemType) {
   141  	lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())}
   142  	lx.start = lx.pos
   143  }
   144  
   145  func (lx *lexer) next() (r rune) {
   146  	if lx.atEOF {
   147  		panic("BUG in lexer: next called after EOF")
   148  	}
   149  	if lx.pos >= len(lx.input) {
   150  		lx.atEOF = true
   151  		return eof
   152  	}
   153  
   154  	if lx.input[lx.pos] == '\n' {
   155  		lx.line++
   156  	}
   157  	lx.prevWidths[3] = lx.prevWidths[2]
   158  	lx.prevWidths[2] = lx.prevWidths[1]
   159  	lx.prevWidths[1] = lx.prevWidths[0]
   160  	if lx.nprev < 4 {
   161  		lx.nprev++
   162  	}
   163  
   164  	r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
   165  	if r == utf8.RuneError {
   166  		lx.error(errLexUTF8{lx.input[lx.pos]})
   167  		return utf8.RuneError
   168  	}
   169  
   170  	// Note: don't use peek() here, as this calls next().
   171  	if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) {
   172  		lx.errorControlChar(r)
   173  		return utf8.RuneError
   174  	}
   175  
   176  	lx.prevWidths[0] = w
   177  	lx.pos += w
   178  	return r
   179  }
   180  
// ignore discards the pending input by moving start up to the current
// position, so the skipped text is not part of the next emitted item.
func (lx *lexer) ignore() {
	lx.start = lx.pos
}
   185  
   186  // backup steps back one rune. Can be called 4 times between calls to next.
   187  func (lx *lexer) backup() {
   188  	if lx.atEOF {
   189  		lx.atEOF = false
   190  		return
   191  	}
   192  	if lx.nprev < 1 {
   193  		panic("BUG in lexer: backed up too far")
   194  	}
   195  	w := lx.prevWidths[0]
   196  	lx.prevWidths[0] = lx.prevWidths[1]
   197  	lx.prevWidths[1] = lx.prevWidths[2]
   198  	lx.prevWidths[2] = lx.prevWidths[3]
   199  	lx.nprev--
   200  
   201  	lx.pos -= w
   202  	if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
   203  		lx.line--
   204  	}
   205  }
   206  
   207  // accept consumes the next rune if it's equal to `valid`.
   208  func (lx *lexer) accept(valid rune) bool {
   209  	if lx.next() == valid {
   210  		return true
   211  	}
   212  	lx.backup()
   213  	return false
   214  }
   215  
   216  // peek returns but does not consume the next rune in the input.
   217  func (lx *lexer) peek() rune {
   218  	r := lx.next()
   219  	lx.backup()
   220  	return r
   221  }
   222  
   223  // skip ignores all input that matches the given predicate.
   224  func (lx *lexer) skip(pred func(rune) bool) {
   225  	for {
   226  		r := lx.next()
   227  		if pred(r) {
   228  			continue
   229  		}
   230  		lx.backup()
   231  		lx.ignore()
   232  		return
   233  	}
   234  }
   235  
   236  // error stops all lexing by emitting an error and returning `nil`.
   237  //
   238  // Note that any value that is a character is escaped if it's a special
   239  // character (newlines, tabs, etc.).
   240  func (lx *lexer) error(err error) stateFn {
   241  	if lx.atEOF {
   242  		return lx.errorPrevLine(err)
   243  	}
   244  	lx.items <- item{typ: itemError, pos: lx.getPos(), err: err}
   245  	return nil
   246  }
   247  
   248  // errorfPrevline is like error(), but sets the position to the last column of
   249  // the previous line.
   250  //
   251  // This is so that unexpected EOF or NL errors don't show on a new blank line.
   252  func (lx *lexer) errorPrevLine(err error) stateFn {
   253  	pos := lx.getPos()
   254  	pos.Line--
   255  	pos.Len = 1
   256  	pos.Start = lx.pos - 1
   257  	lx.items <- item{typ: itemError, pos: pos, err: err}
   258  	return nil
   259  }
   260  
   261  // errorPos is like error(), but allows explicitly setting the position.
   262  func (lx *lexer) errorPos(start, length int, err error) stateFn {
   263  	pos := lx.getPos()
   264  	pos.Start = start
   265  	pos.Len = length
   266  	lx.items <- item{typ: itemError, pos: pos, err: err}
   267  	return nil
   268  }
   269  
   270  // errorf is like error, and creates a new error.
   271  func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
   272  	if lx.atEOF {
   273  		pos := lx.getPos()
   274  		pos.Line--
   275  		pos.Len = 1
   276  		pos.Start = lx.pos - 1
   277  		lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)}
   278  		return nil
   279  	}
   280  	lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)}
   281  	return nil
   282  }
   283  
   284  func (lx *lexer) errorControlChar(cc rune) stateFn {
   285  	return lx.errorPos(lx.pos-1, 1, errLexControl{cc})
   286  }
   287  
   288  // lexTop consumes elements at the top level of TOML data.
   289  func lexTop(lx *lexer) stateFn {
   290  	r := lx.next()
   291  	if isWhitespace(r) || isNL(r) {
   292  		return lexSkip(lx, lexTop)
   293  	}
   294  	switch r {
   295  	case '#':
   296  		lx.push(lexTop)
   297  		return lexCommentStart
   298  	case '[':
   299  		return lexTableStart
   300  	case eof:
   301  		if lx.pos > lx.start {
   302  			return lx.errorf("unexpected EOF")
   303  		}
   304  		lx.emit(itemEOF)
   305  		return nil
   306  	}
   307  
   308  	// At this point, the only valid item can be a key, so we back up
   309  	// and let the key lexer do the rest.
   310  	lx.backup()
   311  	lx.push(lexTopEnd)
   312  	return lexKeyStart
   313  }
   314  
   315  // lexTopEnd is entered whenever a top-level item has been consumed. (A value
   316  // or a table.) It must see only whitespace, and will turn back to lexTop
   317  // upon a newline. If it sees EOF, it will quit the lexer successfully.
   318  func lexTopEnd(lx *lexer) stateFn {
   319  	r := lx.next()
   320  	switch {
   321  	case r == '#':
   322  		// a comment will read to a newline for us.
   323  		lx.push(lexTop)
   324  		return lexCommentStart
   325  	case isWhitespace(r):
   326  		return lexTopEnd
   327  	case isNL(r):
   328  		lx.ignore()
   329  		return lexTop
   330  	case r == eof:
   331  		lx.emit(itemEOF)
   332  		return nil
   333  	}
   334  	return lx.errorf(
   335  		"expected a top-level item to end with a newline, comment, or EOF, but got %q instead",
   336  		r)
   337  }
   338  
   339  // lexTable lexes the beginning of a table. Namely, it makes sure that
   340  // it starts with a character other than '.' and ']'.
   341  // It assumes that '[' has already been consumed.
   342  // It also handles the case that this is an item in an array of tables.
   343  // e.g., '[[name]]'.
   344  func lexTableStart(lx *lexer) stateFn {
   345  	if lx.peek() == '[' {
   346  		lx.next()
   347  		lx.emit(itemArrayTableStart)
   348  		lx.push(lexArrayTableEnd)
   349  	} else {
   350  		lx.emit(itemTableStart)
   351  		lx.push(lexTableEnd)
   352  	}
   353  	return lexTableNameStart
   354  }
   355  
// lexTableEnd emits itemTableEnd and continues with lexTopEnd. It is pushed
// by lexTableStart and is reached when lexTableNameEnd pops it after
// consuming the closing ']'.
func lexTableEnd(lx *lexer) stateFn {
	lx.emit(itemTableEnd)
	return lexTopEnd
}
   360  
   361  func lexArrayTableEnd(lx *lexer) stateFn {
   362  	if r := lx.next(); r != ']' {
   363  		return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r)
   364  	}
   365  	lx.emit(itemArrayTableEnd)
   366  	return lexTopEnd
   367  }
   368  
   369  func lexTableNameStart(lx *lexer) stateFn {
   370  	lx.skip(isWhitespace)
   371  	switch r := lx.peek(); {
   372  	case r == ']' || r == eof:
   373  		return lx.errorf("unexpected end of table name (table names cannot be empty)")
   374  	case r == '.':
   375  		return lx.errorf("unexpected table separator (table names cannot be empty)")
   376  	case r == '"' || r == '\'':
   377  		lx.ignore()
   378  		lx.push(lexTableNameEnd)
   379  		return lexQuotedName
   380  	default:
   381  		lx.push(lexTableNameEnd)
   382  		return lexBareName
   383  	}
   384  }
   385  
   386  // lexTableNameEnd reads the end of a piece of a table name, optionally
   387  // consuming whitespace.
   388  func lexTableNameEnd(lx *lexer) stateFn {
   389  	lx.skip(isWhitespace)
   390  	switch r := lx.next(); {
   391  	case isWhitespace(r):
   392  		return lexTableNameEnd
   393  	case r == '.':
   394  		lx.ignore()
   395  		return lexTableNameStart
   396  	case r == ']':
   397  		return lx.pop()
   398  	default:
   399  		return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r)
   400  	}
   401  }
   402  
   403  // lexBareName lexes one part of a key or table.
   404  //
   405  // It assumes that at least one valid character for the table has already been
   406  // read.
   407  //
   408  // Lexes only one part, e.g. only 'a' inside 'a.b'.
   409  func lexBareName(lx *lexer) stateFn {
   410  	r := lx.next()
   411  	if isBareKeyChar(r) {
   412  		return lexBareName
   413  	}
   414  	lx.backup()
   415  	lx.emit(itemText)
   416  	return lx.pop()
   417  }
   418  
   419  // lexBareName lexes one part of a key or table.
   420  //
   421  // It assumes that at least one valid character for the table has already been
   422  // read.
   423  //
   424  // Lexes only one part, e.g. only '"a"' inside '"a".b'.
   425  func lexQuotedName(lx *lexer) stateFn {
   426  	r := lx.next()
   427  	switch {
   428  	case isWhitespace(r):
   429  		return lexSkip(lx, lexValue)
   430  	case r == '"':
   431  		lx.ignore() // ignore the '"'
   432  		return lexString
   433  	case r == '\'':
   434  		lx.ignore() // ignore the "'"
   435  		return lexRawString
   436  	case r == eof:
   437  		return lx.errorf("unexpected EOF; expected value")
   438  	default:
   439  		return lx.errorf("expected value but found %q instead", r)
   440  	}
   441  }
   442  
   443  // lexKeyStart consumes all key parts until a '='.
   444  func lexKeyStart(lx *lexer) stateFn {
   445  	lx.skip(isWhitespace)
   446  	switch r := lx.peek(); {
   447  	case r == '=' || r == eof:
   448  		return lx.errorf("unexpected '=': key name appears blank")
   449  	case r == '.':
   450  		return lx.errorf("unexpected '.': keys cannot start with a '.'")
   451  	case r == '"' || r == '\'':
   452  		lx.ignore()
   453  		fallthrough
   454  	default: // Bare key
   455  		lx.emit(itemKeyStart)
   456  		return lexKeyNameStart
   457  	}
   458  }
   459  
   460  func lexKeyNameStart(lx *lexer) stateFn {
   461  	lx.skip(isWhitespace)
   462  	switch r := lx.peek(); {
   463  	case r == '=' || r == eof:
   464  		return lx.errorf("unexpected '='")
   465  	case r == '.':
   466  		return lx.errorf("unexpected '.'")
   467  	case r == '"' || r == '\'':
   468  		lx.ignore()
   469  		lx.push(lexKeyEnd)
   470  		return lexQuotedName
   471  	default:
   472  		lx.push(lexKeyEnd)
   473  		return lexBareName
   474  	}
   475  }
   476  
   477  // lexKeyEnd consumes the end of a key and trims whitespace (up to the key
   478  // separator).
   479  func lexKeyEnd(lx *lexer) stateFn {
   480  	lx.skip(isWhitespace)
   481  	switch r := lx.next(); {
   482  	case isWhitespace(r):
   483  		return lexSkip(lx, lexKeyEnd)
   484  	case r == eof:
   485  		return lx.errorf("unexpected EOF; expected key separator '='")
   486  	case r == '.':
   487  		lx.ignore()
   488  		return lexKeyNameStart
   489  	case r == '=':
   490  		lx.emit(itemKeyEnd)
   491  		return lexSkip(lx, lexValue)
   492  	default:
   493  		return lx.errorf("expected '.' or '=', but got %q instead", r)
   494  	}
   495  }
   496  
   497  // lexValue starts the consumption of a value anywhere a value is expected.
   498  // lexValue will ignore whitespace.
   499  // After a value is lexed, the last state on the next is popped and returned.
   500  func lexValue(lx *lexer) stateFn {
   501  	// We allow whitespace to precede a value, but NOT newlines.
   502  	// In array syntax, the array states are responsible for ignoring newlines.
   503  	r := lx.next()
   504  	switch {
   505  	case isWhitespace(r):
   506  		return lexSkip(lx, lexValue)
   507  	case isDigit(r):
   508  		lx.backup() // avoid an extra state and use the same as above
   509  		return lexNumberOrDateStart
   510  	}
   511  	switch r {
   512  	case '[':
   513  		lx.ignore()
   514  		lx.emit(itemArray)
   515  		return lexArrayValue
   516  	case '{':
   517  		lx.ignore()
   518  		lx.emit(itemInlineTableStart)
   519  		return lexInlineTableValue
   520  	case '"':
   521  		if lx.accept('"') {
   522  			if lx.accept('"') {
   523  				lx.ignore() // Ignore """
   524  				return lexMultilineString
   525  			}
   526  			lx.backup()
   527  		}
   528  		lx.ignore() // ignore the '"'
   529  		return lexString
   530  	case '\'':
   531  		if lx.accept('\'') {
   532  			if lx.accept('\'') {
   533  				lx.ignore() // Ignore """
   534  				return lexMultilineRawString
   535  			}
   536  			lx.backup()
   537  		}
   538  		lx.ignore() // ignore the "'"
   539  		return lexRawString
   540  	case '.': // special error case, be kind to users
   541  		return lx.errorf("floats must start with a digit, not '.'")
   542  	case 'i', 'n':
   543  		if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) {
   544  			lx.emit(itemFloat)
   545  			return lx.pop()
   546  		}
   547  	case '-', '+':
   548  		return lexDecimalNumberStart
   549  	}
   550  	if unicode.IsLetter(r) {
   551  		// Be permissive here; lexBool will give a nice error if the
   552  		// user wrote something like
   553  		//   x = foo
   554  		// (i.e. not 'true' or 'false' but is something else word-like.)
   555  		lx.backup()
   556  		return lexBool
   557  	}
   558  	if r == eof {
   559  		return lx.errorf("unexpected EOF; expected value")
   560  	}
   561  	return lx.errorf("expected value but found %q instead", r)
   562  }
   563  
   564  // lexArrayValue consumes one value in an array. It assumes that '[' or ','
   565  // have already been consumed. All whitespace and newlines are ignored.
   566  func lexArrayValue(lx *lexer) stateFn {
   567  	r := lx.next()
   568  	switch {
   569  	case isWhitespace(r) || isNL(r):
   570  		return lexSkip(lx, lexArrayValue)
   571  	case r == '#':
   572  		lx.push(lexArrayValue)
   573  		return lexCommentStart
   574  	case r == ',':
   575  		return lx.errorf("unexpected comma")
   576  	case r == ']':
   577  		return lexArrayEnd
   578  	}
   579  
   580  	lx.backup()
   581  	lx.push(lexArrayValueEnd)
   582  	return lexValue
   583  }
   584  
   585  // lexArrayValueEnd consumes everything between the end of an array value and
   586  // the next value (or the end of the array): it ignores whitespace and newlines
   587  // and expects either a ',' or a ']'.
   588  func lexArrayValueEnd(lx *lexer) stateFn {
   589  	switch r := lx.next(); {
   590  	case isWhitespace(r) || isNL(r):
   591  		return lexSkip(lx, lexArrayValueEnd)
   592  	case r == '#':
   593  		lx.push(lexArrayValueEnd)
   594  		return lexCommentStart
   595  	case r == ',':
   596  		lx.ignore()
   597  		return lexArrayValue // move on to the next value
   598  	case r == ']':
   599  		return lexArrayEnd
   600  	default:
   601  		return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r))
   602  	}
   603  }
   604  
// lexArrayEnd finishes the lexing of an array.
// It assumes that a ']' has just been consumed. The pending input is
// discarded before itemArrayEnd is emitted, and the state that was pushed
// before the array began is resumed.
func lexArrayEnd(lx *lexer) stateFn {
	lx.ignore()
	lx.emit(itemArrayEnd)
	return lx.pop()
}
   612  
   613  // lexInlineTableValue consumes one key/value pair in an inline table.
   614  // It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
   615  func lexInlineTableValue(lx *lexer) stateFn {
   616  	r := lx.next()
   617  	switch {
   618  	case isWhitespace(r):
   619  		return lexSkip(lx, lexInlineTableValue)
   620  	case isNL(r):
   621  		return lx.errorPrevLine(errLexInlineTableNL{})
   622  	case r == '#':
   623  		lx.push(lexInlineTableValue)
   624  		return lexCommentStart
   625  	case r == ',':
   626  		return lx.errorf("unexpected comma")
   627  	case r == '}':
   628  		return lexInlineTableEnd
   629  	}
   630  	lx.backup()
   631  	lx.push(lexInlineTableValueEnd)
   632  	return lexKeyStart
   633  }
   634  
   635  // lexInlineTableValueEnd consumes everything between the end of an inline table
   636  // key/value pair and the next pair (or the end of the table):
   637  // it ignores whitespace and expects either a ',' or a '}'.
   638  func lexInlineTableValueEnd(lx *lexer) stateFn {
   639  	switch r := lx.next(); {
   640  	case isWhitespace(r):
   641  		return lexSkip(lx, lexInlineTableValueEnd)
   642  	case isNL(r):
   643  		return lx.errorPrevLine(errLexInlineTableNL{})
   644  	case r == '#':
   645  		lx.push(lexInlineTableValueEnd)
   646  		return lexCommentStart
   647  	case r == ',':
   648  		lx.ignore()
   649  		lx.skip(isWhitespace)
   650  		if lx.peek() == '}' {
   651  			return lx.errorf("trailing comma not allowed in inline tables")
   652  		}
   653  		return lexInlineTableValue
   654  	case r == '}':
   655  		return lexInlineTableEnd
   656  	default:
   657  		return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r))
   658  	}
   659  }
   660  
   661  func runeOrEOF(r rune) string {
   662  	if r == eof {
   663  		return "end of file"
   664  	}
   665  	return "'" + string(r) + "'"
   666  }
   667  
// lexInlineTableEnd finishes the lexing of an inline table.
// It assumes that a '}' has just been consumed. The pending input is
// discarded before itemInlineTableEnd is emitted, and the state that was
// pushed before the table began is resumed.
func lexInlineTableEnd(lx *lexer) stateFn {
	lx.ignore()
	lx.emit(itemInlineTableEnd)
	return lx.pop()
}
   675  
   676  // lexString consumes the inner contents of a string. It assumes that the
   677  // beginning '"' has already been consumed and ignored.
   678  func lexString(lx *lexer) stateFn {
   679  	r := lx.next()
   680  	switch {
   681  	case r == eof:
   682  		return lx.errorf(`unexpected EOF; expected '"'`)
   683  	case isNL(r):
   684  		return lx.errorPrevLine(errLexStringNL{})
   685  	case r == '\\':
   686  		lx.push(lexString)
   687  		return lexStringEscape
   688  	case r == '"':
   689  		lx.backup()
   690  		lx.emit(itemString)
   691  		lx.next()
   692  		lx.ignore()
   693  		return lx.pop()
   694  	}
   695  	return lexString
   696  }
   697  
   698  // lexMultilineString consumes the inner contents of a string. It assumes that
   699  // the beginning '"""' has already been consumed and ignored.
   700  func lexMultilineString(lx *lexer) stateFn {
   701  	r := lx.next()
   702  	switch r {
   703  	default:
   704  		return lexMultilineString
   705  	case eof:
   706  		return lx.errorf(`unexpected EOF; expected '"""'`)
   707  	case '\\':
   708  		return lexMultilineStringEscape
   709  	case '"':
   710  		/// Found " → try to read two more "".
   711  		if lx.accept('"') {
   712  			if lx.accept('"') {
   713  				/// Peek ahead: the string can contain " and "", including at the
   714  				/// end: """str"""""
   715  				/// 6 or more at the end, however, is an error.
   716  				if lx.peek() == '"' {
   717  					/// Check if we already lexed 5 's; if so we have 6 now, and
   718  					/// that's just too many man!
   719  					///
   720  					/// Second check is for the edge case:
   721  					///
   722  					///            two quotes allowed.
   723  					///            vv
   724  					///   """lol \""""""
   725  					///          ^^  ^^^---- closing three
   726  					///     escaped
   727  					///
   728  					/// But ugly, but it works
   729  					if strings.HasSuffix(lx.current(), `"""""`) && !strings.HasSuffix(lx.current(), `\"""""`) {
   730  						return lx.errorf(`unexpected '""""""'`)
   731  					}
   732  					lx.backup()
   733  					lx.backup()
   734  					return lexMultilineString
   735  				}
   736  
   737  				lx.backup() /// backup: don't include the """ in the item.
   738  				lx.backup()
   739  				lx.backup()
   740  				lx.emit(itemMultilineString)
   741  				lx.next() /// Read over ''' again and discard it.
   742  				lx.next()
   743  				lx.next()
   744  				lx.ignore()
   745  				return lx.pop()
   746  			}
   747  			lx.backup()
   748  		}
   749  		return lexMultilineString
   750  	}
   751  }
   752  
   753  // lexRawString consumes a raw string. Nothing can be escaped in such a string.
   754  // It assumes that the beginning "'" has already been consumed and ignored.
   755  func lexRawString(lx *lexer) stateFn {
   756  	r := lx.next()
   757  	switch {
   758  	default:
   759  		return lexRawString
   760  	case r == eof:
   761  		return lx.errorf(`unexpected EOF; expected "'"`)
   762  	case isNL(r):
   763  		return lx.errorPrevLine(errLexStringNL{})
   764  	case r == '\'':
   765  		lx.backup()
   766  		lx.emit(itemRawString)
   767  		lx.next()
   768  		lx.ignore()
   769  		return lx.pop()
   770  	}
   771  }
   772  
   773  // lexMultilineRawString consumes a raw string. Nothing can be escaped in such
   774  // a string. It assumes that the beginning "”'" has already been consumed and
   775  // ignored.
   776  func lexMultilineRawString(lx *lexer) stateFn {
   777  	r := lx.next()
   778  	switch r {
   779  	default:
   780  		return lexMultilineRawString
   781  	case eof:
   782  		return lx.errorf(`unexpected EOF; expected "'''"`)
   783  	case '\'':
   784  		/// Found ' → try to read two more ''.
   785  		if lx.accept('\'') {
   786  			if lx.accept('\'') {
   787  				/// Peek ahead: the string can contain ' and '', including at the
   788  				/// end: '''str'''''
   789  				/// 6 or more at the end, however, is an error.
   790  				if lx.peek() == '\'' {
   791  					/// Check if we already lexed 5 's; if so we have 6 now, and
   792  					/// that's just too many man!
   793  					if strings.HasSuffix(lx.current(), "'''''") {
   794  						return lx.errorf(`unexpected "''''''"`)
   795  					}
   796  					lx.backup()
   797  					lx.backup()
   798  					return lexMultilineRawString
   799  				}
   800  
   801  				lx.backup() /// backup: don't include the ''' in the item.
   802  				lx.backup()
   803  				lx.backup()
   804  				lx.emit(itemRawMultilineString)
   805  				lx.next() /// Read over ''' again and discard it.
   806  				lx.next()
   807  				lx.next()
   808  				lx.ignore()
   809  				return lx.pop()
   810  			}
   811  			lx.backup()
   812  		}
   813  		return lexMultilineRawString
   814  	}
   815  }
   816  
   817  // lexMultilineStringEscape consumes an escaped character. It assumes that the
   818  // preceding '\\' has already been consumed.
   819  func lexMultilineStringEscape(lx *lexer) stateFn {
   820  	if isNL(lx.next()) { /// \ escaping newline.
   821  		return lexMultilineString
   822  	}
   823  	lx.backup()
   824  	lx.push(lexMultilineString)
   825  	return lexStringEscape(lx)
   826  }
   827  
   828  func lexStringEscape(lx *lexer) stateFn {
   829  	r := lx.next()
   830  	switch r {
   831  	case 'b':
   832  		fallthrough
   833  	case 't':
   834  		fallthrough
   835  	case 'n':
   836  		fallthrough
   837  	case 'f':
   838  		fallthrough
   839  	case 'r':
   840  		fallthrough
   841  	case '"':
   842  		fallthrough
   843  	case ' ', '\t':
   844  		// Inside """ .. """ strings you can use \ to escape newlines, and any
   845  		// amount of whitespace can be between the \ and \n.
   846  		fallthrough
   847  	case '\\':
   848  		return lx.pop()
   849  	case 'u':
   850  		return lexShortUnicodeEscape
   851  	case 'U':
   852  		return lexLongUnicodeEscape
   853  	}
   854  	return lx.error(errLexEscape{r})
   855  }
   856  
   857  func lexShortUnicodeEscape(lx *lexer) stateFn {
   858  	var r rune
   859  	for i := 0; i < 4; i++ {
   860  		r = lx.next()
   861  		if !isHexadecimal(r) {
   862  			return lx.errorf(
   863  				`expected four hexadecimal digits after '\u', but got %q instead`,
   864  				lx.current())
   865  		}
   866  	}
   867  	return lx.pop()
   868  }
   869  
   870  func lexLongUnicodeEscape(lx *lexer) stateFn {
   871  	var r rune
   872  	for i := 0; i < 8; i++ {
   873  		r = lx.next()
   874  		if !isHexadecimal(r) {
   875  			return lx.errorf(
   876  				`expected eight hexadecimal digits after '\U', but got %q instead`,
   877  				lx.current())
   878  		}
   879  	}
   880  	return lx.pop()
   881  }
   882  
   883  // lexNumberOrDateStart processes the first character of a value which begins
   884  // with a digit. It exists to catch values starting with '0', so that
   885  // lexBaseNumberOrDate can differentiate base prefixed integers from other
   886  // types.
   887  func lexNumberOrDateStart(lx *lexer) stateFn {
   888  	r := lx.next()
   889  	switch r {
   890  	case '0':
   891  		return lexBaseNumberOrDate
   892  	}
   893  
   894  	if !isDigit(r) {
   895  		// The only way to reach this state is if the value starts
   896  		// with a digit, so specifically treat anything else as an
   897  		// error.
   898  		return lx.errorf("expected a digit but got %q", r)
   899  	}
   900  
   901  	return lexNumberOrDate
   902  }
   903  
   904  // lexNumberOrDate consumes either an integer, float or datetime.
   905  func lexNumberOrDate(lx *lexer) stateFn {
   906  	r := lx.next()
   907  	if isDigit(r) {
   908  		return lexNumberOrDate
   909  	}
   910  	switch r {
   911  	case '-', ':':
   912  		return lexDatetime
   913  	case '_':
   914  		return lexDecimalNumber
   915  	case '.', 'e', 'E':
   916  		return lexFloat
   917  	}
   918  
   919  	lx.backup()
   920  	lx.emit(itemInteger)
   921  	return lx.pop()
   922  }
   923  
   924  // lexDatetime consumes a Datetime, to a first approximation.
   925  // The parser validates that it matches one of the accepted formats.
   926  func lexDatetime(lx *lexer) stateFn {
   927  	r := lx.next()
   928  	if isDigit(r) {
   929  		return lexDatetime
   930  	}
   931  	switch r {
   932  	case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+':
   933  		return lexDatetime
   934  	}
   935  
   936  	lx.backup()
   937  	lx.emitTrim(itemDatetime)
   938  	return lx.pop()
   939  }
   940  
   941  // lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix.
   942  func lexHexInteger(lx *lexer) stateFn {
   943  	r := lx.next()
   944  	if isHexadecimal(r) {
   945  		return lexHexInteger
   946  	}
   947  	switch r {
   948  	case '_':
   949  		return lexHexInteger
   950  	}
   951  
   952  	lx.backup()
   953  	lx.emit(itemInteger)
   954  	return lx.pop()
   955  }
   956  
   957  // lexOctalInteger consumes an octal integer after seeing the '0o' prefix.
   958  func lexOctalInteger(lx *lexer) stateFn {
   959  	r := lx.next()
   960  	if isOctal(r) {
   961  		return lexOctalInteger
   962  	}
   963  	switch r {
   964  	case '_':
   965  		return lexOctalInteger
   966  	}
   967  
   968  	lx.backup()
   969  	lx.emit(itemInteger)
   970  	return lx.pop()
   971  }
   972  
   973  // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix.
   974  func lexBinaryInteger(lx *lexer) stateFn {
   975  	r := lx.next()
   976  	if isBinary(r) {
   977  		return lexBinaryInteger
   978  	}
   979  	switch r {
   980  	case '_':
   981  		return lexBinaryInteger
   982  	}
   983  
   984  	lx.backup()
   985  	lx.emit(itemInteger)
   986  	return lx.pop()
   987  }
   988  
   989  // lexDecimalNumber consumes a decimal float or integer.
   990  func lexDecimalNumber(lx *lexer) stateFn {
   991  	r := lx.next()
   992  	if isDigit(r) {
   993  		return lexDecimalNumber
   994  	}
   995  	switch r {
   996  	case '.', 'e', 'E':
   997  		return lexFloat
   998  	case '_':
   999  		return lexDecimalNumber
  1000  	}
  1001  
  1002  	lx.backup()
  1003  	lx.emit(itemInteger)
  1004  	return lx.pop()
  1005  }
  1006  
  1007  // lexDecimalNumber consumes the first digit of a number beginning with a sign.
  1008  // It assumes the sign has already been consumed. Values which start with a sign
  1009  // are only allowed to be decimal integers or floats.
  1010  //
  1011  // The special "nan" and "inf" values are also recognized.
  1012  func lexDecimalNumberStart(lx *lexer) stateFn {
  1013  	r := lx.next()
  1014  
  1015  	// Special error cases to give users better error messages
  1016  	switch r {
  1017  	case 'i':
  1018  		if !lx.accept('n') || !lx.accept('f') {
  1019  			return lx.errorf("invalid float: '%s'", lx.current())
  1020  		}
  1021  		lx.emit(itemFloat)
  1022  		return lx.pop()
  1023  	case 'n':
  1024  		if !lx.accept('a') || !lx.accept('n') {
  1025  			return lx.errorf("invalid float: '%s'", lx.current())
  1026  		}
  1027  		lx.emit(itemFloat)
  1028  		return lx.pop()
  1029  	case '0':
  1030  		p := lx.peek()
  1031  		switch p {
  1032  		case 'b', 'o', 'x':
  1033  			return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p)
  1034  		}
  1035  	case '.':
  1036  		return lx.errorf("floats must start with a digit, not '.'")
  1037  	}
  1038  
  1039  	if isDigit(r) {
  1040  		return lexDecimalNumber
  1041  	}
  1042  
  1043  	return lx.errorf("expected a digit but got %q", r)
  1044  }
  1045  
  1046  // lexBaseNumberOrDate differentiates between the possible values which
  1047  // start with '0'. It assumes that before reaching this state, the initial '0'
  1048  // has been consumed.
  1049  func lexBaseNumberOrDate(lx *lexer) stateFn {
  1050  	r := lx.next()
  1051  	// Note: All datetimes start with at least two digits, so we don't
  1052  	// handle date characters (':', '-', etc.) here.
  1053  	if isDigit(r) {
  1054  		return lexNumberOrDate
  1055  	}
  1056  	switch r {
  1057  	case '_':
  1058  		// Can only be decimal, because there can't be an underscore
  1059  		// between the '0' and the base designator, and dates can't
  1060  		// contain underscores.
  1061  		return lexDecimalNumber
  1062  	case '.', 'e', 'E':
  1063  		return lexFloat
  1064  	case 'b':
  1065  		r = lx.peek()
  1066  		if !isBinary(r) {
  1067  			lx.errorf("not a binary number: '%s%c'", lx.current(), r)
  1068  		}
  1069  		return lexBinaryInteger
  1070  	case 'o':
  1071  		r = lx.peek()
  1072  		if !isOctal(r) {
  1073  			lx.errorf("not an octal number: '%s%c'", lx.current(), r)
  1074  		}
  1075  		return lexOctalInteger
  1076  	case 'x':
  1077  		r = lx.peek()
  1078  		if !isHexadecimal(r) {
  1079  			lx.errorf("not a hexidecimal number: '%s%c'", lx.current(), r)
  1080  		}
  1081  		return lexHexInteger
  1082  	}
  1083  
  1084  	lx.backup()
  1085  	lx.emit(itemInteger)
  1086  	return lx.pop()
  1087  }
  1088  
  1089  // lexFloat consumes the elements of a float. It allows any sequence of
  1090  // float-like characters, so floats emitted by the lexer are only a first
  1091  // approximation and must be validated by the parser.
  1092  func lexFloat(lx *lexer) stateFn {
  1093  	r := lx.next()
  1094  	if isDigit(r) {
  1095  		return lexFloat
  1096  	}
  1097  	switch r {
  1098  	case '_', '.', '-', '+', 'e', 'E':
  1099  		return lexFloat
  1100  	}
  1101  
  1102  	lx.backup()
  1103  	lx.emit(itemFloat)
  1104  	return lx.pop()
  1105  }
  1106  
  1107  // lexBool consumes a bool string: 'true' or 'false.
  1108  func lexBool(lx *lexer) stateFn {
  1109  	var rs []rune
  1110  	for {
  1111  		r := lx.next()
  1112  		if !unicode.IsLetter(r) {
  1113  			lx.backup()
  1114  			break
  1115  		}
  1116  		rs = append(rs, r)
  1117  	}
  1118  	s := string(rs)
  1119  	switch s {
  1120  	case "true", "false":
  1121  		lx.emit(itemBool)
  1122  		return lx.pop()
  1123  	}
  1124  	return lx.errorf("expected value but found %q instead", s)
  1125  }
  1126  
  1127  // lexCommentStart begins the lexing of a comment. It will emit
  1128  // itemCommentStart and consume no characters, passing control to lexComment.
  1129  func lexCommentStart(lx *lexer) stateFn {
  1130  	lx.ignore()
  1131  	lx.emit(itemCommentStart)
  1132  	return lexComment
  1133  }
  1134  
  1135  // lexComment lexes an entire comment. It assumes that '#' has been consumed.
  1136  // It will consume *up to* the first newline character, and pass control
  1137  // back to the last state on the stack.
  1138  func lexComment(lx *lexer) stateFn {
  1139  	switch r := lx.next(); {
  1140  	case isNL(r) || r == eof:
  1141  		lx.backup()
  1142  		lx.emit(itemText)
  1143  		return lx.pop()
  1144  	default:
  1145  		return lexComment
  1146  	}
  1147  }
  1148  
  1149  // lexSkip ignores all slurped input and moves on to the next state.
  1150  func lexSkip(lx *lexer, nextState stateFn) stateFn {
  1151  	lx.ignore()
  1152  	return nextState
  1153  }
  1154  
  1155  func (s stateFn) String() string {
  1156  	name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name()
  1157  	if i := strings.LastIndexByte(name, '.'); i > -1 {
  1158  		name = name[i+1:]
  1159  	}
  1160  	if s == nil {
  1161  		name = "<nil>"
  1162  	}
  1163  	return name + "()"
  1164  }
  1165  
  1166  func (itype itemType) String() string {
  1167  	switch itype {
  1168  	case itemError:
  1169  		return "Error"
  1170  	case itemNIL:
  1171  		return "NIL"
  1172  	case itemEOF:
  1173  		return "EOF"
  1174  	case itemText:
  1175  		return "Text"
  1176  	case itemString, itemRawString, itemMultilineString, itemRawMultilineString:
  1177  		return "String"
  1178  	case itemBool:
  1179  		return "Bool"
  1180  	case itemInteger:
  1181  		return "Integer"
  1182  	case itemFloat:
  1183  		return "Float"
  1184  	case itemDatetime:
  1185  		return "DateTime"
  1186  	case itemTableStart:
  1187  		return "TableStart"
  1188  	case itemTableEnd:
  1189  		return "TableEnd"
  1190  	case itemKeyStart:
  1191  		return "KeyStart"
  1192  	case itemKeyEnd:
  1193  		return "KeyEnd"
  1194  	case itemArray:
  1195  		return "Array"
  1196  	case itemArrayEnd:
  1197  		return "ArrayEnd"
  1198  	case itemCommentStart:
  1199  		return "CommentStart"
  1200  	case itemInlineTableStart:
  1201  		return "InlineTableStart"
  1202  	case itemInlineTableEnd:
  1203  		return "InlineTableEnd"
  1204  	}
  1205  	panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
  1206  }
  1207  
  1208  func (item item) String() string {
  1209  	return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val)
  1210  }
  1211  
  1212  func isWhitespace(r rune) bool { return r == '\t' || r == ' ' }
  1213  func isNL(r rune) bool         { return r == '\n' || r == '\r' }
  1214  func isControl(r rune) bool { // Control characters except \t, \r, \n
  1215  	switch r {
  1216  	case '\t', '\r', '\n':
  1217  		return false
  1218  	default:
  1219  		return (r >= 0x00 && r <= 0x1f) || r == 0x7f
  1220  	}
  1221  }
  1222  func isDigit(r rune) bool  { return r >= '0' && r <= '9' }
  1223  func isBinary(r rune) bool { return r == '0' || r == '1' }
  1224  func isOctal(r rune) bool  { return r >= '0' && r <= '7' }
  1225  func isHexadecimal(r rune) bool {
  1226  	return (r >= '0' && r <= '9') || (r >= 'a' && r <= 'f') || (r >= 'A' && r <= 'F')
  1227  }
  1228  func isBareKeyChar(r rune) bool {
  1229  	return (r >= 'A' && r <= 'Z') ||
  1230  		(r >= 'a' && r <= 'z') ||
  1231  		(r >= '0' && r <= '9') ||
  1232  		r == '_' || r == '-'
  1233  }