cuelang.org/go@v0.10.1/cue/scanner/scanner.go

     1  // Copyright 2018 The CUE Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package scanner implements a scanner for CUE source text. It takes a []byte
    16  // as source which can then be tokenized through repeated calls to the Scan
    17  // method.
    18  package scanner
    19  
    20  import (
    21  	"bytes"
    22  	"fmt"
    23  	"path/filepath"
    24  	"strconv"
    25  	"unicode"
    26  	"unicode/utf8"
    27  
    28  	"cuelang.org/go/cue/token"
    29  )
    30  
    31  // An ErrorHandler is a generic error handler used throughout CUE packages.
    32  //
    33  // The position points to the beginning of the offending value.
    34  type ErrorHandler func(pos token.Pos, msg string, args []interface{})
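
// The sketch below is not part of the original file; it illustrates one way to
// satisfy the ErrorHandler signature with a closure that records every report.
// The helper name collectErrors is hypothetical. Note that errf (further down)
// passes the raw format string in msg and its arguments in args, so the
// handler is expected to do the formatting itself.
func collectErrors(errs *[]string) ErrorHandler {
	return func(pos token.Pos, msg string, args []interface{}) {
		// Format the message and prefix it with the reported position.
		*errs = append(*errs, fmt.Sprintf("%v: %s", pos, fmt.Sprintf(msg, args...)))
	}
}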
    35  
    36  // A Scanner holds the scanner's internal state while processing
    37  // a given text. It can be allocated as part of another data
    38  // structure but must be initialized via Init before use.
    39  type Scanner struct {
    40  	// immutable state
    41  	file *token.File  // source file handle
    42  	dir  string       // directory portion of file.Name()
    43  	src  []byte       // source
    44  	errh ErrorHandler // error reporting; or nil
    45  	mode Mode         // scanning mode
    46  
    47  	// scanning state
    48  	ch              rune // current character
    49  	offset          int  // character offset
    50  	rdOffset        int  // reading offset (position after current character)
    51  	lineOffset      int  // current line offset
    52  	linesSinceLast  int // newlines seen since the last token
    53  	spacesSinceLast int // spaces and tabs seen since the last token
    54  	insertEOL       bool // insert a comma before next newline
    55  
    56  	quoteStack []quoteInfo
    57  
    58  	// public state - ok to modify
    59  	ErrorCount int // number of errors encountered
    60  }
    61  
    62  type quoteInfo struct {
    63  	char    rune
    64  	numChar int
    65  	numHash int
    66  }
    67  
    68  const bom = 0xFEFF // byte order mark, only permitted as very first character
    69  
    70  // Read the next Unicode char into s.ch.
    71  // s.ch < 0 means end-of-file.
    72  func (s *Scanner) next() {
    73  	if s.rdOffset < len(s.src) {
    74  		s.offset = s.rdOffset
    75  		if s.ch == '\n' {
    76  			s.lineOffset = s.offset
    77  			s.file.AddLine(s.offset)
    78  		}
    79  		r, w := rune(s.src[s.rdOffset]), 1
    80  		switch {
    81  		case r == 0:
    82  			s.errf(s.offset, "illegal character NUL")
    83  		case r >= utf8.RuneSelf:
    84  			// not ASCII
    85  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
    86  			if r == utf8.RuneError && w == 1 {
    87  				s.errf(s.offset, "illegal UTF-8 encoding")
    88  			} else if r == bom && s.offset > 0 {
    89  				s.errf(s.offset, "illegal byte order mark")
    90  			}
    91  		}
    92  		s.rdOffset += w
    93  		s.ch = r
    94  	} else {
    95  		s.offset = len(s.src)
    96  		if s.ch == '\n' {
    97  			s.lineOffset = s.offset
    98  			s.file.AddLine(s.offset)
    99  		}
   100  		s.ch = -1 // eof
   101  	}
   102  }
   103  
   104  // A Mode value is a set of flags (or 0).
   105  // They control scanner behavior.
   106  type Mode uint
   107  
   108  // These constants are options to the Init function.
   109  const (
   110  	ScanComments     Mode = 1 << iota // return comments as COMMENT tokens
   111  	DontInsertCommas                  // do not automatically insert commas
   112  )
   113  
   114  // Init prepares the scanner s to tokenize the text src by setting the
   115  // scanner at the beginning of src. The scanner uses the file set file
   116  // for position information and it adds line information for each line.
   117  // It is ok to re-use the same file when re-scanning the same source, as
   118  // line information that is already present is ignored. Init causes a
   119  // panic if the file size does not match the src size.
   120  //
   121  // Calls to Scan will invoke the error handler eh if they encounter a
   122  // syntax error and eh is not nil. Also, for each error encountered,
   123  // the Scanner field ErrorCount is incremented by one. The mode parameter
   124  // determines how comments are handled.
   125  //
   126  // Note that Init may call eh if there is an error in the first character
   127  // of the file.
   128  func (s *Scanner) Init(file *token.File, src []byte, eh ErrorHandler, mode Mode) {
   129  	// Explicitly initialize all fields since a scanner may be reused.
   130  	if file.Size() != len(src) {
   131  		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
   132  	}
   133  	s.file = file
   134  	s.dir, _ = filepath.Split(file.Name())
   135  	s.src = src
   136  	s.errh = eh
   137  	s.mode = mode
   138  
   139  	s.ch = ' '
   140  	s.offset = 0
   141  	s.rdOffset = 0
   142  	s.lineOffset = 0
   143  	s.insertEOL = false
   144  	s.ErrorCount = 0
   145  
   146  	s.next()
   147  	if s.ch == bom {
   148  		s.next() // ignore BOM at file beginning
   149  	}
   150  }
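
// The example below is not part of the original file; it sketches typical use
// of Init and Scan. The file handle is created with token.NewFile(name, base,
// size) from cuelang.org/go/cue/token (an assumption based on how the rest of
// the module constructs files); the size must equal len(src) or Init panics.
// ScanComments keeps comments in the token stream, and or'ing in
// DontInsertCommas would suppress automatic comma insertion. The function name
// scanAll is hypothetical.
func scanAll(src []byte) int {
	var s Scanner
	f := token.NewFile("example.cue", -1, len(src))
	eh := func(pos token.Pos, msg string, args []interface{}) {
		// Report each scan error with its position.
		fmt.Printf("%v: %s\n", pos, fmt.Sprintf(msg, args...))
	}
	s.Init(f, src, eh, ScanComments)
	for {
		_, tok, lit := s.Scan()
		if tok == token.EOF {
			break
		}
		fmt.Println(tok, lit)
	}
	return s.ErrorCount
}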
   151  
   152  func (s *Scanner) errf(offs int, msg string, args ...interface{}) {
   153  	if s.errh != nil {
   154  		s.errh(s.file.Pos(offs, 0), msg, args)
   155  	}
   156  	s.ErrorCount++
   157  }
   158  
   159  var prefix = []byte("//line ")
   160  
   161  func (s *Scanner) interpretLineComment(text []byte) {
   162  	if bytes.HasPrefix(text, prefix) {
   163  		// get filename and line number, if any
   164  		if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
   165  			if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
   166  				// valid //line filename:line comment
   167  				filename := string(bytes.TrimSpace(text[len(prefix):i]))
   168  				if filename != "" {
   169  					filename = filepath.Clean(filename)
   170  					if !filepath.IsAbs(filename) {
   171  						// make filename relative to current directory
   172  						filename = filepath.Join(s.dir, filename)
   173  					}
   174  				}
   175  				// update scanner position
   176  				s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
   177  			}
   178  		}
   179  	}
   180  }
   181  
   182  func (s *Scanner) scanComment() string {
   183  	// initial '/' already consumed; s.ch == '/'
   184  	offs := s.offset - 1 // position of initial '/'
   185  	hasCR := false
   186  
   187  	if s.ch == '/' {
   188  		//-style comment
   189  		s.next()
   190  		for s.ch != '\n' && s.ch >= 0 {
   191  			if s.ch == '\r' {
   192  				hasCR = true
   193  			}
   194  			s.next()
   195  		}
   196  		if offs == s.lineOffset {
   197  			// comment starts at the beginning of the current line
   198  			s.interpretLineComment(s.src[offs:s.offset])
   199  		}
   200  		goto exit
   201  	}
   202  
   203  	s.errf(offs, "comment not terminated")
   204  
   205  exit:
   206  	lit := s.src[offs:s.offset]
   207  	if hasCR {
   208  		// TODO: preserve \r\n
   209  		lit = stripCR(lit)
   210  	}
   211  
   212  	return string(lit)
   213  }
   214  
   215  func isLetter(ch rune) bool {
   216  	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
   217  }
   218  
   219  func isDigit(ch rune) bool {
   220  	// TODO(mpvl): Is this correct?
   221  	return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
   222  }
   223  
   224  func (s *Scanner) scanFieldIdentifier() string {
   225  	offs := s.offset
   226  	if s.ch == '_' {
   227  		s.next()
   228  	}
   229  	if s.ch == '#' {
   230  		s.next()
   231  		// TODO: remove this block to allow #<num>
   232  		if isDigit(s.ch) {
   233  			return string(s.src[offs:s.offset])
   234  		}
   235  	}
   236  	for isLetter(s.ch) || isDigit(s.ch) || s.ch == '_' || s.ch == '$' {
   237  		s.next()
   238  	}
   239  	return string(s.src[offs:s.offset])
   240  }
   241  
   242  func (s *Scanner) scanIdentifier() string {
   243  	offs := s.offset
   244  	for isLetter(s.ch) || isDigit(s.ch) || s.ch == '_' || s.ch == '$' {
   245  		s.next()
   246  	}
   247  	return string(s.src[offs:s.offset])
   248  }
   249  
   250  func digitVal(ch rune) int {
   251  	switch {
   252  	case '0' <= ch && ch <= '9':
   253  		return int(ch - '0')
   254  	case ch == '_':
   255  		return 0
   256  	case 'a' <= ch && ch <= 'f':
   257  		return int(ch - 'a' + 10)
   258  	case 'A' <= ch && ch <= 'F':
   259  		return int(ch - 'A' + 10)
   260  	}
   261  	return 16 // larger than any legal digit val
   262  }
   263  
   264  func (s *Scanner) scanMantissa(base int) {
   265  	var last rune
   266  	for digitVal(s.ch) < base {
   267  		if last == '_' && s.ch == '_' {
   268  			s.errf(s.offset, "illegal '_' in number")
   269  		}
   270  		last = s.ch
   271  		s.next()
   272  	}
   273  	if last == '_' {
   274  		s.errf(s.offset-1, "illegal '_' in number")
   275  	}
   276  }
   277  
   278  func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
   279  	// digitVal(s.ch) < 10
   280  	offs := s.offset
   281  	tok := token.INT
   282  
   283  	if seenDecimalPoint {
   284  		offs--
   285  		tok = token.FLOAT
   286  		s.scanMantissa(10)
   287  		goto exponent
   288  	}
   289  
   290  	if s.ch == '0' {
   291  		// int or float
   292  		offs := s.offset
   293  		s.next()
   294  		if s.ch == 'x' || s.ch == 'X' {
   295  			// hexadecimal int
   296  			s.next()
   297  			s.scanMantissa(16)
   298  			if s.offset-offs <= 2 {
   299  				// only scanned "0x" or "0X"
   300  				s.errf(offs, "illegal hexadecimal number")
   301  			}
   302  		} else if s.ch == 'b' {
   303  			// binary int
   304  			s.next()
   305  			s.scanMantissa(2)
   306  			if s.offset-offs <= 2 {
   307  				// only scanned "0b"
   308  				s.errf(offs, "illegal binary number")
   309  			}
   310  		} else if s.ch == 'o' {
   311  			// octal int
   312  			s.next()
   313  			s.scanMantissa(8)
   314  			if s.offset-offs <= 2 {
   315  				// only scanned "0o"
   316  				s.errf(offs, "illegal octal number")
   317  			}
   318  		} else {
   319  			// 0 or float
   320  			seenDigits := false
   321  			if s.ch >= '0' && s.ch <= '9' {
   322  				seenDigits = true
   323  				s.scanMantissa(10)
   324  			}
   325  			if s.ch == '.' || s.ch == 'e' || s.ch == 'E' {
   326  				goto fraction
   327  			}
   328  			if seenDigits {
   329  				// integer other than 0 may not start with 0
   330  				s.errf(offs, "illegal integer number")
   331  			}
   332  		}
   333  		goto exit
   334  	}
   335  
   336  	// decimal int or float
   337  	s.scanMantissa(10)
   338  
   339  	// TODO: allow 3h4s, etc.
   340  	// switch s.ch {
   341  	// case 'h', 'm', 's', "µ"[0], 'u', 'n':
   342  	// }
   343  
   344  fraction:
   345  	if s.ch == '.' {
   346  		if p := s.offset + 1; p < len(s.src) && s.src[p] == '.' {
   347  			// interpret dot as part of a range.
   348  			goto exit
   349  		}
   350  		tok = token.FLOAT
   351  		s.next()
   352  		s.scanMantissa(10)
   353  	}
   354  
   355  exponent:
   356  	switch s.ch {
   357  	case 'K', 'M', 'G', 'T', 'P':
   358  		tok = token.INT // TODO: Or should we allow this to be a float?
   359  		s.next()
   360  		if s.ch == 'i' {
   361  			s.next()
   362  		}
   363  		goto exit
   364  	}
   365  
   366  	if s.ch == 'e' || s.ch == 'E' {
   367  		tok = token.FLOAT
   368  		s.next()
   369  		if s.ch == '-' || s.ch == '+' {
   370  			s.next()
   371  		}
   372  		s.scanMantissa(10)
   373  	}
   374  
   375  exit:
   376  	return tok, string(s.src[offs:s.offset])
   377  }
   378  
   379  // scanEscape parses an escape sequence, where quote describes the accepted
   380  // escaped quote. In case of a syntax error, it stops at the offending
   381  // character (without consuming it) and returns false. Otherwise
   382  // it returns true.
   383  //
   384  // Must be compliant with https://tools.ietf.org/html/rfc4627.
   385  func (s *Scanner) scanEscape(quote quoteInfo) (ok, interpolation bool) {
   386  	for range quote.numHash {
   387  		if s.ch != '#' {
   388  			return true, false
   389  		}
   390  		s.next()
   391  	}
   392  
   393  	offs := s.offset
   394  
   395  	var n int
   396  	var base, max uint32
   397  	switch s.ch {
   398  	case '(':
   399  		return true, true
   400  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '/', quote.char:
   401  		s.next()
   402  		return true, false
   403  	case '0', '1', '2', '3', '4', '5', '6', '7':
   404  		n, base, max = 3, 8, 255
   405  	case 'x':
   406  		s.next()
   407  		n, base, max = 2, 16, 255
   408  	case 'u':
   409  		s.next()
   410  		n, base, max = 4, 16, unicode.MaxRune
   411  	case 'U':
   412  		s.next()
   413  		n, base, max = 8, 16, unicode.MaxRune
   414  	default:
   415  		msg := "unknown escape sequence"
   416  		if s.ch < 0 {
   417  			msg = "escape sequence not terminated"
   418  		}
   419  		s.errf(offs, msg)
   420  		return false, false
   421  	}
   422  
   423  	var x uint32
   424  	for n > 0 {
   425  		d := uint32(digitVal(s.ch))
   426  		if d >= base {
   427  			if s.ch < 0 {
   428  				s.errf(s.offset, "escape sequence not terminated")
   429  			} else {
   430  				s.errf(s.offset, "illegal character %#U in escape sequence", s.ch)
   431  			}
   432  			return false, false
   433  		}
   434  		x = x*base + d
   435  		s.next()
   436  		n--
   437  	}
   438  
   439  	// TODO: this is valid JSON, so remove, but normalize and report an error
   440  	// for unmatched surrogate pairs.
   441  	if x > max {
   442  		s.errf(offs, "escape sequence is invalid Unicode code point")
   443  		return false, false
   444  	}
   445  
   446  	return true, false
   447  }
   448  
   449  func (s *Scanner) scanString(offs int, quote quoteInfo) (token.Token, string) {
   450  	// ", """, ', or ''' opening already consumed
   451  
   452  	tok := token.STRING
   453  
   454  	hasCR := false
   455  	extra := 0
   456  	for {
   457  		ch := s.ch
   458  		if (quote.numChar != 3 && ch == '\n') || ch < 0 {
   459  			s.errf(offs, "string literal not terminated")
   460  			lit := s.src[offs:s.offset]
   461  			if hasCR {
   462  				lit = stripCR(lit)
   463  			}
   464  			return tok, string(lit)
   465  		}
   466  
   467  		s.next()
   468  		ch, ok := s.consumeStringClose(ch, quote)
   469  		if ok {
   470  			break
   471  		}
   472  		if ch == '\r' && quote.numChar == 3 {
   473  			hasCR = true
   474  		}
   475  		if ch == '\\' {
   476  			if _, interpolation := s.scanEscape(quote); interpolation {
   477  				tok = token.INTERPOLATION
   478  				extra = 1
   479  				s.quoteStack = append(s.quoteStack, quote)
   480  				break
   481  			}
   482  		}
   483  	}
   484  	lit := s.src[offs : s.offset+extra]
   485  	if hasCR {
   486  		lit = stripCR(lit)
   487  	}
   488  	return tok, string(lit)
   489  }
   490  
   491  func (s *Scanner) consumeQuotes(quote rune, max int) (next rune, n int) {
   492  	for ; n < max; n++ {
   493  		if s.ch != quote {
   494  			return s.ch, n
   495  		}
   496  		s.next()
   497  	}
   498  	return s.ch, n
   499  }
   500  
   501  func (s *Scanner) consumeStringClose(ch rune, quote quoteInfo) (next rune, atEnd bool) {
   502  	if quote.char != ch {
   503  		return ch, false
   504  	}
   505  	numChar := quote.numChar
   506  	n := numChar + quote.numHash
   507  	want := quote.char
   508  	for i := 1; i < n; i++ {
   509  		if i == numChar {
   510  			want = '#'
   511  		}
   512  		if want != s.ch {
   513  			return ch, false
   514  		}
   515  		ch = s.ch
   516  		s.next()
   517  	}
   518  	return s.ch, true
   519  }
   520  
   521  func (s *Scanner) scanHashes(maxHash int) int {
   522  	for i := range maxHash {
   523  		if s.ch != '#' {
   524  			return i
   525  		}
   526  		s.next()
   527  	}
   528  	return maxHash
   529  }
   530  
   531  func stripCR(b []byte) []byte {
   532  	c := make([]byte, len(b))
   533  	i := 0
   534  	for _, ch := range b {
   535  		if ch != '\r' {
   536  			c[i] = ch
   537  			i++
   538  		}
   539  	}
   540  	return c[:i]
   541  }
   542  
   543  // scanAttribute scans a full attribute of the form @foo(str). An attribute
   544  // is a single lexical entry, so whitespace within it is treated as ordinary
   545  // characters rather than as token separators.
   546  func (s *Scanner) scanAttribute() (tok token.Token, lit string) {
   547  	offs := s.offset - 1 // @ already consumed
   548  
   549  	s.scanIdentifier()
   550  
   551  	if _, tok, _ := s.Scan(); tok == token.LPAREN {
   552  		s.scanAttributeTokens(token.RPAREN)
   553  	} else {
   554  		s.errf(s.offset, "invalid attribute: expected '('")
   555  	}
   556  	return token.ATTRIBUTE, string(s.src[offs:s.offset])
   557  }
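
// Not part of the original file: a small illustration of the attribute
// contract described above. The whole attribute, from '@' through the closing
// ')', is returned as a single ATTRIBUTE token whose literal preserves any
// interior whitespace verbatim. The function name and the attribute contents
// are made up for the example.
func exampleAttribute() {
	src := []byte("@go(Name, type=string)")
	var s Scanner
	s.Init(token.NewFile("example.cue", -1, len(src)), src, nil, 0)
	if _, tok, lit := s.Scan(); tok == token.ATTRIBUTE {
		fmt.Printf("%q\n", lit) // "@go(Name, type=string)"
	}
}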
   558  
   559  func (s *Scanner) scanAttributeTokens(close token.Token) {
   560  	for {
   561  		switch _, tok, _ := s.Scan(); tok {
   562  		case close:
   563  			return
   564  		case token.EOF:
   565  			s.errf(s.offset, "attribute missing '%s'", close)
   566  			return
   567  
   568  		case token.INTERPOLATION:
   569  			s.errf(s.offset, "interpolation not allowed in attribute")
   570  			s.popInterpolation()
   571  			s.recoverParen(1)
   572  		case token.LPAREN:
   573  			s.scanAttributeTokens(token.RPAREN)
   574  		case token.LBRACE:
   575  			s.scanAttributeTokens(token.RBRACE)
   576  		case token.LBRACK:
   577  			s.scanAttributeTokens(token.RBRACK)
   578  		case token.RPAREN, token.RBRACK, token.RBRACE:
   579  			s.errf(s.offset, "unexpected '%s'", tok)
   580  		}
   581  	}
   582  }
   583  
   584  // recoverParen is an approximate recovery mechanism to recover from invalid
   585  // attributes.
   586  func (s *Scanner) recoverParen(open int) {
   587  	for {
   588  		switch s.ch {
   589  		case '\n', -1:
   590  			return
   591  		case '(':
   592  			open++
   593  		case ')':
   594  			if open--; open == 0 {
   595  				return
   596  			}
   597  		}
   598  		s.next()
   599  	}
   600  }
   601  
   602  func (s *Scanner) skipWhitespace(inc int) {
   603  	for {
   604  		switch s.ch {
   605  		case ' ', '\t':
   606  			s.spacesSinceLast += inc
   607  		case '\n':
   608  			s.linesSinceLast += inc
   609  			if s.insertEOL {
   610  				return
   611  			}
   612  		case '\r':
   613  		default:
   614  			return
   615  		}
   616  		s.next()
   617  	}
   618  }
   619  
   620  // switch2 is a helper for scanning two-character operator tokens
   621  // such as >= and !=, where the first character has already been
   622  // consumed. If the current character is '=', it is consumed and the
   623  // result is tok1. Otherwise, no further character is consumed and
   624  // the result is tok0.
   625  
   626  func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
   627  	if s.ch == '=' {
   628  		s.next()
   629  		return tok1
   630  	}
   631  	return tok0
   632  }
   633  
   634  func (s *Scanner) popInterpolation() quoteInfo {
   635  	quote := s.quoteStack[len(s.quoteStack)-1]
   636  	s.quoteStack = s.quoteStack[:len(s.quoteStack)-1]
   637  	return quote
   638  }
   639  
   640  // ResumeInterpolation resumes scanning of a string interpolation.
   641  func (s *Scanner) ResumeInterpolation() string {
   642  	quote := s.popInterpolation()
   643  	_, str := s.scanString(s.offset-1, quote)
   644  	return str
   645  }
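
// Not part of the original file: a sketch of the interpolation protocol as a
// parser would drive it. Scan reports token.INTERPOLATION with a literal that
// runs up to and including the opening '(', the caller then scans the embedded
// expression as ordinary tokens, and once it has seen the matching
// token.RPAREN it calls ResumeInterpolation to obtain the next segment of the
// string. The function name is hypothetical.
func exampleInterpolation() {
	src := []byte(`"a\(x)b"`)
	var s Scanner
	s.Init(token.NewFile("example.cue", -1, len(src)), src, nil, 0)
	_, _, lit := s.Scan() // token.INTERPOLATION; lit is `"a\(`
	_, _, _ = s.Scan()    // token.LPAREN for the '(' already included in lit
	_, _, x := s.Scan()   // token.IDENT with literal "x"
	_, tok, _ := s.Scan() // token.RPAREN ends the embedded expression
	if tok == token.RPAREN {
		rest := s.ResumeInterpolation() // `)b"`, the remainder of the string
		fmt.Println(lit, x, rest)
	}
}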
   646  
   647  // Offset returns the current position offset.
   648  func (s *Scanner) Offset() int {
   649  	return s.offset
   650  }
   651  
   652  // Scan scans the next token and returns the token position, the token,
   653  // and its literal string if applicable. The source end is indicated by
   654  // EOF.
   655  //
   656  // If the returned token is a literal (IDENT, INT, FLOAT,
   657  // STRING, INTERPOLATION) or COMMENT, the literal string
   658  // has the corresponding value.
   659  //
   660  // If the returned token is a keyword, the literal string is the keyword.
   661  //
   662  // If the returned token is COMMA, the corresponding
   663  // literal string is "," if the comma was present in the source,
   664  // and "\n" if the comma was inserted because of a newline or
   665  // at EOF.
   666  //
   667  // If the returned token is ILLEGAL, the literal string is the
   668  // offending character.
   669  //
   670  // In all other cases, Scan returns an empty literal string.
   671  //
   672  // For more tolerant parsing, Scan will return a valid token if
   673  // possible even if a syntax error was encountered. Thus, even
   674  // if the resulting token sequence contains no illegal tokens,
   675  // a client may not assume that no error occurred. Instead it
   676  // must check the scanner's ErrorCount or the number of calls
   677  // of the error handler, if there was one installed.
   678  //
   679  // Scan adds line information to the file added to the file
   680  // set with Init. Token positions are relative to that file
   681  // and thus relative to the file set.
   682  func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
   683  scanAgain:
   684  	s.skipWhitespace(1)
   685  
   686  	var rel token.RelPos
   687  	switch {
   688  	case s.linesSinceLast > 1:
   689  		rel = token.NewSection
   690  	case s.linesSinceLast == 1:
   691  		rel = token.Newline
   692  	case s.spacesSinceLast > 0:
   693  		rel = token.Blank
   694  	default:
   695  		rel = token.NoSpace
   696  	}
   697  	// current token start
   698  	offset := s.offset
   699  	pos = s.file.Pos(offset, rel)
   700  
   701  	// determine token value
   702  	insertEOL := false
   703  	var quote quoteInfo
   704  	switch ch := s.ch; {
   705  	case '0' <= ch && ch <= '9':
   706  		insertEOL = true
   707  		tok, lit = s.scanNumber(false)
   708  	case isLetter(ch), ch == '$', ch == '#':
   709  		lit = s.scanFieldIdentifier()
   710  		if len(lit) > 1 {
   711  			// keywords are longer than one letter - avoid lookup otherwise
   712  			tok = token.Lookup(lit)
   713  			insertEOL = true
   714  			break
   715  		}
   716  		if ch != '#' || (s.ch != '\'' && s.ch != '"' && s.ch != '#') {
   717  			tok = token.IDENT
   718  			insertEOL = true
   719  			break
   720  		}
   721  		quote.numHash = 1
   722  		ch = s.ch
   723  		fallthrough
   724  	default:
   725  		s.next() // always make progress
   726  		switch ch {
   727  		case -1:
   728  			if s.insertEOL {
   729  				s.insertEOL = false // EOF consumed
   730  				return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
   731  			}
   732  			tok = token.EOF
   733  		case '_':
   734  			if s.ch == '|' {
   735  				// Unconditionally require this to be followed by another
   736  				// underscore to avoid needing an extra lookahead.
   737  				// Note that `_|x` is always equal to _.
   738  				s.next()
   739  				if s.ch != '_' {
   740  					s.errf(s.file.Offset(pos), "illegal token '_|'; expected '_'")
   741  					insertEOL = s.insertEOL // preserve insertEOL info
   742  					tok = token.ILLEGAL
   743  					lit = "_|"
   744  					break
   745  				}
   746  				s.next()
   747  				tok = token.BOTTOM
   748  				lit = "_|_"
   749  			} else {
   750  				tok = token.IDENT
   751  				lit = "_" + s.scanFieldIdentifier()
   752  			}
   753  			insertEOL = true
   754  
   755  		case '\n':
   756  			// we only reach here if s.insertEOL was set in the first
   757  			// place, causing s.skipWhitespace() above to exit early
   758  			// at the newline
   759  			s.insertEOL = false // newline consumed
   760  			p := s.file.Pos(offset, token.Elided)
   761  			s.skipWhitespace(1)
   762  			// Don't elide comma before a ',' or ':' to ensure JSON
   763  			// conformance. Note that cue fmt should immediately undo those.
   764  			if s.ch == ',' || s.ch == ':' {
   765  				return s.Scan()
   766  			}
   767  			return p, token.COMMA, "\n"
   768  
   769  		case '#':
   770  			for quote.numHash++; s.ch == '#'; quote.numHash++ {
   771  				s.next()
   772  			}
   773  			ch = s.ch
   774  			if ch != '\'' && ch != '"' {
   775  				break
   776  			}
   777  			s.next()
   778  			fallthrough
   779  		case '"', '\'':
   780  			insertEOL = true
   781  			quote.char = ch
   782  			quote.numChar = 1
   783  			offs := s.offset - 1 - quote.numHash
   784  			switch _, n := s.consumeQuotes(ch, 2); n {
   785  			case 0:
   786  				quote.numChar = 1
   787  				tok, lit = s.scanString(offs, quote)
   788  			case 1:
   789  				// When the string is surrounded by hashes,
   790  				// a single leading quote is OK (and part of the string)
   791  				// e.g. #""hello""#
   792  				// unless it's succeeded by the correct number of terminating
   793  				// hash characters
   794  				// e.g. ##""##
   795  				if n := s.scanHashes(quote.numHash); n == quote.numHash {
   796  					// It's the empty string.
   797  					tok, lit = token.STRING, string(s.src[offs:s.offset])
   798  				} else {
   799  					tok, lit = s.scanString(offs, quote)
   800  				}
   801  			case 2:
   802  				quote.numChar = 3
   803  				switch s.ch {
   804  				case '\n':
   805  					s.next()
   806  					tok, lit = s.scanString(offs, quote)
   807  				case '\r':
   808  					s.next()
   809  					if s.ch == '\n' {
   810  						s.next()
   811  						tok, lit = s.scanString(offs, quote)
   812  						break
   813  					}
   814  					fallthrough
   815  				default:
   816  					s.errf(offs, "expected newline after multiline quote %s",
   817  						s.src[offs:s.offset])
   818  					tok, lit = token.STRING, string(s.src[offs:s.offset])
   819  				}
   820  			}
   821  		case '@':
   822  			insertEOL = true
   823  			tok, lit = s.scanAttribute()
   824  		case ':':
   825  			tok = token.COLON
   826  		case ';':
   827  			tok = token.SEMICOLON
   828  			insertEOL = true
   829  		case '?':
   830  			tok = token.OPTION
   831  			insertEOL = true
   832  		case '.':
   833  			if '0' <= s.ch && s.ch <= '9' {
   834  				insertEOL = true
   835  				tok, lit = s.scanNumber(true)
   836  			} else if s.ch == '.' {
   837  				s.next()
   838  				if s.ch == '.' {
   839  					s.next()
   840  					tok = token.ELLIPSIS
   841  					insertEOL = true
   842  				} else {
   843  					s.errf(s.file.Offset(pos), "illegal token '..'; expected '.'")
   844  				}
   845  			} else {
   846  				tok = token.PERIOD
   847  			}
   848  		case ',':
   849  			tok = token.COMMA
   850  			lit = ","
   851  		case '(':
   852  			tok = token.LPAREN
   853  		case ')':
   854  			insertEOL = true
   855  			tok = token.RPAREN
   856  		case '[':
   857  			tok = token.LBRACK
   858  		case ']':
   859  			insertEOL = true
   860  			tok = token.RBRACK
   861  		case '{':
   862  			tok = token.LBRACE
   863  		case '}':
   864  			insertEOL = true
   865  			tok = token.RBRACE
   866  		case '+':
   867  			tok = token.ADD // Consider ++ for list concatenation.
   868  		case '-':
   869  			tok = token.SUB
   870  		case '*':
   871  			tok = token.MUL
   872  		case '/':
   873  			if s.ch == '/' {
   874  				// comment
   875  				if s.insertEOL {
   876  					// reset position to the beginning of the comment
   877  					s.ch = '/'
   878  					s.offset = s.file.Offset(pos)
   879  					s.rdOffset = s.offset + 1
   880  					s.insertEOL = false // newline consumed
   881  					return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
   882  				}
   883  				comment := s.scanComment()
   884  				if s.mode&ScanComments == 0 {
   885  					// skip comment
   886  					s.insertEOL = false // newline consumed
   887  					goto scanAgain
   888  				}
   889  				tok = token.COMMENT
   890  				lit = comment
   891  			} else {
   892  				tok = token.QUO
   893  			}
   894  		// We no longer use %, but it seems like a useful token to reserve for
   895  		// something else at some point.
   896  		// case '%':
   897  		case '<':
   898  			if s.ch == '-' {
   899  				s.next()
   900  				tok = token.ARROW
   901  			} else {
   902  				tok = s.switch2(token.LSS, token.LEQ)
   903  			}
   904  		case '>':
   905  			tok = s.switch2(token.GTR, token.GEQ)
   906  		case '=':
   907  			if s.ch == '~' {
   908  				s.next()
   909  				tok = token.MAT
   910  			} else {
   911  				tok = s.switch2(token.BIND, token.EQL)
   912  			}
   913  		case '!':
   914  			if s.ch == '~' {
   915  				s.next()
   916  				tok = token.NMAT
   917  			} else {
   918  				tok = s.switch2(token.NOT, token.NEQ)
   919  			}
   920  		case '&':
   921  			switch s.ch {
   922  			case '&':
   923  				s.next()
   924  				tok = token.LAND
   925  			default:
   926  				tok = token.AND
   927  			}
   928  		case '|':
   929  			if s.ch == '|' {
   930  				s.next()
   931  				tok = token.LOR
   932  			} else {
   933  				tok = token.OR
   934  			}
   935  		default:
   936  			// next reports unexpected BOMs - don't repeat
   937  			if ch != bom {
   938  				s.errf(s.file.Offset(pos), "illegal character %#U", ch)
   939  			}
   940  			insertEOL = s.insertEOL // preserve insertEOL info
   941  			tok = token.ILLEGAL
   942  			lit = string(ch)
   943  		}
   944  	}
   945  	if s.mode&DontInsertCommas == 0 {
   946  		s.insertEOL = insertEOL
   947  	}
   948  
   949  	s.linesSinceLast = 0
   950  	s.spacesSinceLast = 0
   951  	return
   952  }
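
// Not part of the original file: an illustration of the automatic comma
// insertion documented on Scan. A newline that follows a comma-eliciting token
// (an identifier, a literal, a closing bracket, and so on) is reported as a
// COMMA token whose literal is "\n"; passing DontInsertCommas to Init disables
// this. The function name and the input are made up for the example.
func exampleCommaInsertion() {
	src := []byte("a: 1\nb: 2\n")
	var s Scanner
	s.Init(token.NewFile("example.cue", -1, len(src)), src, nil, 0)
	// The resulting stream is: IDENT "a", COLON, INT "1", COMMA "\n",
	// IDENT "b", COLON, INT "2", COMMA "\n", EOF.
	for {
		_, tok, lit := s.Scan()
		if tok == token.EOF {
			break
		}
		fmt.Printf("%v %q\n", tok, lit)
	}
}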