cuelang.org/go@v0.13.0/cue/scanner/scanner.go (about)

     1  // Copyright 2018 The CUE Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package scanner implements a scanner for CUE source text. It takes a []byte
    16  // as source which can then be tokenized through repeated calls to the Scan
    17  // method.
    18  package scanner
    19  
    20  import (
    21  	"fmt"
    22  	"path/filepath"
    23  	"unicode"
    24  	"unicode/utf8"
    25  
    26  	"cuelang.org/go/cue/token"
    27  )
    28  
// An ErrorHandler is a generic error handler used throughout CUE packages.
//
// The position points to the beginning of the offending value. msg may be
// a format string, in which case args holds its operands (the scanner
// passes format verbs such as %#U through msg).
type ErrorHandler func(pos token.Pos, msg string, args []interface{})
    33  
// A Scanner holds the Scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	errh ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch              rune // current character
	offset          int  // character offset
	rdOffset        int  // reading offset (position after current character)
	linesSinceLast  int  // newlines skipped since the last token was returned
	spacesSinceLast int  // spaces/tabs skipped since the last token was returned
	insertEOL       bool // insert a comma before next newline

	// quoteStack records the quoting of string literals whose scanning was
	// suspended at an interpolation, so ResumeInterpolation can continue
	// with the matching quote information.
	quoteStack []quoteInfo

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}

// quoteInfo describes the opening delimiter of a string literal,
// e.g. for #"""... it is char '"', numChar 3, numHash 1.
type quoteInfo struct {
	char    rune // quote character: '"' or '\''
	numChar int  // number of quote characters (1, or 3 for multiline)
	numHash int  // number of surrounding '#' characters
}

const bom = 0xFEFF // byte order mark, only permitted as very first character
    66  
// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
//
// next also registers a line start with the token file whenever the
// previous character was a newline, and reports illegal NUL bytes,
// invalid UTF-8, and misplaced byte order marks via errf.
func (s *Scanner) next() {
	if s.rdOffset < len(s.src) {
		s.offset = s.rdOffset
		if s.ch == '\n' {
			s.file.AddLine(s.offset)
		}
		r, w := rune(s.src[s.rdOffset]), 1
		switch {
		case r == 0:
			s.errf(s.offset, "illegal character NUL")
		case r >= utf8.RuneSelf:
			// not ASCII
			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
			if r == utf8.RuneError && w == 1 {
				s.errf(s.offset, "illegal UTF-8 encoding")
			} else if r == bom && s.offset > 0 {
				// A BOM is only permitted as the very first character.
				s.errf(s.offset, "illegal byte order mark")
			}
		}
		s.rdOffset += w
		s.ch = r
	} else {
		s.offset = len(s.src)
		if s.ch == '\n' {
			s.file.AddLine(s.offset)
		}
		s.ch = -1 // eof
	}
}
    98  
// A Mode value is a set of flags (or 0).
// They control scanner behavior.
type Mode uint

// These constants are options to the Init function.
const (
	ScanComments     Mode = 1 << iota // return comments as COMMENT tokens
	DontInsertCommas                  // do not automatically insert commas
)
   108  
// Init prepares the scanner s to tokenize the text src by setting the
// scanner at the beginning of src. The scanner uses the file set file
// for position information and it adds line information for each line.
// It is ok to re-use the same file when re-scanning the same file as
// line information which is already present is ignored. Init causes a
// panic if the file size does not match the src size.
//
// Calls to Scan will invoke the error handler eh if they encounter a
// syntax error and eh is not nil. Also, for each error encountered,
// the Scanner field ErrorCount is incremented by one. The mode parameter
// determines how comments are handled.
//
// Note that Init may call eh if there is an error in the first character
// of the file.
func (s *Scanner) Init(file *token.File, src []byte, eh ErrorHandler, mode Mode) {
	// Explicitly initialize all fields since a scanner may be reused.
	if file.Size() != len(src) {
		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
	}
	s.file = file
	s.dir, _ = filepath.Split(file.Name())
	s.src = src
	s.errh = eh
	s.mode = mode

	s.ch = ' '
	s.offset = 0
	s.rdOffset = 0
	s.insertEOL = false
	s.ErrorCount = 0

	// Load the first character; this may already report an error.
	s.next()
	if s.ch == bom {
		s.next() // ignore BOM at file beginning
	}
}
   145  
// errf reports an error at the given byte offset. msg may be a format
// string with args as its operands; both are forwarded to the installed
// error handler, if any. ErrorCount is always incremented.
func (s *Scanner) errf(offs int, msg string, args ...interface{}) {
	if s.errh != nil {
		s.errh(s.file.Pos(offs, 0), msg, args)
	}
	s.ErrorCount++
}
   152  
   153  func (s *Scanner) scanComment() string {
   154  	// initial '/' already consumed; s.ch == '/'
   155  	offs := s.offset - 1 // position of initial '/'
   156  	hasCR := false
   157  
   158  	if s.ch == '/' {
   159  		//-style comment
   160  		s.next()
   161  		for s.ch != '\n' && s.ch >= 0 {
   162  			if s.ch == '\r' {
   163  				hasCR = true
   164  			}
   165  			s.next()
   166  		}
   167  		goto exit
   168  	}
   169  
   170  	s.errf(offs, "comment not terminated")
   171  
   172  exit:
   173  	lit := s.src[offs:s.offset]
   174  	if hasCR {
   175  		// TODO: preserve /r/n
   176  		lit = stripCR(lit)
   177  	}
   178  
   179  	return string(lit)
   180  }
   181  
   182  func isLetter(ch rune) bool {
   183  	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
   184  }
   185  
   186  func isDigit(ch rune) bool {
   187  	// TODO(mpvl): Is this correct?
   188  	return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
   189  }
   190  
// scanFieldIdentifier scans an identifier as it may appear in a field
// name: an optional leading '_', an optional '#', then a run of letters,
// digits, '_', and '$'. It returns the consumed text. When the '#' is
// immediately followed by a digit, scanning stops right after the '#'.
func (s *Scanner) scanFieldIdentifier() string {
	offs := s.offset
	if s.ch == '_' {
		s.next()
	}
	if s.ch == '#' {
		s.next()
		// TODO: remove this block to allow #<num>
		if isDigit(s.ch) {
			return string(s.src[offs:s.offset])
		}
	}
	for isLetter(s.ch) || isDigit(s.ch) || s.ch == '_' || s.ch == '$' {
		s.next()
	}
	return string(s.src[offs:s.offset])
}
   208  
// scanIdentifier consumes a run of letters, digits, '_', and '$'
// starting at the current character and returns the consumed text.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset
	for isLetter(s.ch) || isDigit(s.ch) || s.ch == '_' || s.ch == '$' {
		s.next()
	}
	return string(s.src[offs:s.offset])
}
   216  
   217  func digitVal(ch rune) int {
   218  	switch {
   219  	case '0' <= ch && ch <= '9':
   220  		return int(ch - '0')
   221  	case ch == '_':
   222  		return 0
   223  	case 'a' <= ch && ch <= 'f':
   224  		return int(ch - 'a' + 10)
   225  	case 'A' <= ch && ch <= 'F':
   226  		return int(ch - 'A' + 10)
   227  	}
   228  	return 16 // larger than any legal digit val
   229  }
   230  
// scanMantissa consumes a run of digits in the given base, allowing '_'
// as a digit separator. Consecutive separators ("__") and a trailing
// separator are reported as errors.
func (s *Scanner) scanMantissa(base int) {
	var last rune
	for digitVal(s.ch) < base {
		if last == '_' && s.ch == '_' {
			s.errf(s.offset, "illegal '_' in number")
		}
		last = s.ch
		s.next()
	}
	if last == '_' {
		s.errf(s.offset-1, "illegal '_' in number")
	}
}
   244  
// scanNumber scans an integer or floating-point literal starting at the
// current character and returns its token type (INT or FLOAT) and text.
// If seenDecimalPoint is true, a leading '.' has already been consumed
// and the literal continues with its fractional part.
//
// Handled forms: decimal, hexadecimal (0x/0X), binary (0b), and octal
// (0o) integers; multiplier suffixes K, M, G, T, P optionally followed
// by 'i'; floats with fraction and/or exponent.
func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
	// digitVal(s.ch) < 10
	offs := s.offset
	tok := token.INT

	if seenDecimalPoint {
		offs-- // include the already-consumed '.'
		tok = token.FLOAT
		s.scanMantissa(10)
		goto exponent
	}

	if s.ch == '0' {
		// int or float
		offs := s.offset
		s.next()
		if s.ch == 'x' || s.ch == 'X' {
			// hexadecimal int
			s.next()
			s.scanMantissa(16)
			if s.offset-offs <= 2 {
				// only scanned "0x" or "0X"
				s.errf(offs, "illegal hexadecimal number")
			}
		} else if s.ch == 'b' {
			// binary int
			s.next()
			s.scanMantissa(2)
			if s.offset-offs <= 2 {
				// only scanned "0b"
				s.errf(offs, "illegal binary number")
			}
		} else if s.ch == 'o' {
			// octal int
			s.next()
			s.scanMantissa(8)
			if s.offset-offs <= 2 {
				// only scanned "0o"
				s.errf(offs, "illegal octal number")
			}
		} else {
			// 0 or float
			seenDigits := false
			if s.ch >= '0' && s.ch <= '9' {
				seenDigits = true
				s.scanMantissa(10)
			}
			if s.ch == '.' || s.ch == 'e' || s.ch == 'E' {
				goto fraction
			}
			if seenDigits {
				// integer other than 0 may not start with 0
				s.errf(offs, "illegal integer number")
			}
		}
		goto exit
	}

	// decimal int or float
	s.scanMantissa(10)

	// TODO: allow 3h4s, etc.
	// switch s.ch {
	// case 'h', 'm', 's', "µ"[0], 'u', 'n':
	// }

fraction:
	if s.ch == '.' {
		if p := s.offset + 1; p < len(s.src) && s.src[p] == '.' {
			// interpret dot as part of a range.
			goto exit
		}
		tok = token.FLOAT
		s.next()
		s.scanMantissa(10)
	}

exponent:
	switch s.ch {
	case 'K', 'M', 'G', 'T', 'P':
		// multiplier suffix, optionally followed by 'i' (e.g. Ki, Mi)
		tok = token.INT // TODO: Or should we allow this to be a float?
		s.next()
		if s.ch == 'i' {
			s.next()
		}
		goto exit
	}

	if s.ch == 'e' || s.ch == 'E' {
		tok = token.FLOAT
		s.next()
		if s.ch == '-' || s.ch == '+' {
			s.next()
		}
		s.scanMantissa(10)
	}

exit:
	return tok, string(s.src[offs:s.offset])
}
   345  
// scanEscape parses an escape sequence; quote describes the enclosing
// string literal, whose quote character is an accepted escape. In case
// of a syntax error, it stops at the offending character (without
// consuming it) and returns ok == false. interpolation reports whether
// the escape starts a string interpolation, i.e. a '(' preceded by the
// required number of hashes.
//
// Must be compliant with https://tools.ietf.org/html/rfc4627.
func (s *Scanner) scanEscape(quote quoteInfo) (ok, interpolation bool) {
	// In a #-quoted string, the backslash must be followed by the same
	// number of hashes to count as an escape at all; otherwise the
	// backslash is literal text.
	for range quote.numHash {
		if s.ch != '#' {
			return true, false
		}
		s.next()
	}

	offs := s.offset

	var n int
	var base, max uint32
	switch s.ch {
	case '(':
		return true, true
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '/', quote.char:
		s.next()
		return true, false
	case '0', '1', '2', '3', '4', '5', '6', '7':
		n, base, max = 3, 8, 255
	case 'x':
		s.next()
		n, base, max = 2, 16, 255
	case 'u':
		s.next()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		s.next()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		msg := "unknown escape sequence"
		if s.ch < 0 {
			msg = "escape sequence not terminated"
		}
		s.errf(offs, msg)
		return false, false
	}

	// Consume exactly n digits in the given base, accumulating the value.
	var x uint32
	for n > 0 {
		d := uint32(digitVal(s.ch))
		if d >= base {
			if s.ch < 0 {
				s.errf(s.offset, "escape sequence not terminated")
			} else {
				s.errf(s.offset, "illegal character %#U in escape sequence", s.ch)
			}
			return false, false
		}
		x = x*base + d
		s.next()
		n--
	}

	// TODO: this is valid JSON, so remove, but normalize and report an error
	// if for unmatched surrogate pairs .
	if x > max {
		s.errf(offs, "escape sequence is invalid Unicode code point")
		return false, false
	}

	return true, false
}
   415  
// scanString scans the remainder of a string literal whose opening
// delimiter has already been consumed; offs is the offset of that
// opening delimiter. It returns STRING, or INTERPOLATION if scanning
// stopped at the start of a \( interpolation, together with the raw
// literal text (carriage returns stripped in multiline strings).
func (s *Scanner) scanString(offs int, quote quoteInfo) (token.Token, string) {
	// ", """, ', or ''' opening already consumed

	tok := token.STRING

	hasCR := false
	extra := 0
	for {
		ch := s.ch
		if (quote.numChar != 3 && ch == '\n') || ch < 0 {
			// Only triple-quoted (multiline) strings may span lines.
			s.errf(offs, "string literal not terminated")
			lit := s.src[offs:s.offset]
			if hasCR {
				lit = stripCR(lit)
			}
			return tok, string(lit)
		}

		s.next()
		ch, ok := s.consumeStringClose(ch, quote)
		if ok {
			break
		}
		if ch == '\r' && quote.numChar == 3 {
			hasCR = true
		}
		if ch == '\\' {
			if _, interpolation := s.scanEscape(quote); interpolation {
				// Suspend scanning at the '(' of \(...); the quote info is
				// pushed so ResumeInterpolation can continue afterwards.
				tok = token.INTERPOLATION
				extra = 1 // include the '(' in the returned literal
				s.quoteStack = append(s.quoteStack, quote)
				break
			}
		}
	}
	lit := s.src[offs : s.offset+extra]
	if hasCR {
		lit = stripCR(lit)
	}
	return tok, string(lit)
}
   457  
   458  func (s *Scanner) consumeQuotes(quote rune, max int) (next rune, n int) {
   459  	for ; n < max; n++ {
   460  		if s.ch != quote {
   461  			return s.ch, n
   462  		}
   463  		s.next()
   464  	}
   465  	return s.ch, n
   466  }
   467  
// consumeStringClose checks whether ch, the character just read, begins
// the closing delimiter of the literal described by quote: numChar quote
// characters followed by numHash '#' characters. If so, the remainder of
// the delimiter is consumed and atEnd is true. Note that characters
// matched in earlier iterations stay consumed even when a later
// character fails to match.
func (s *Scanner) consumeStringClose(ch rune, quote quoteInfo) (next rune, atEnd bool) {
	if quote.char != ch {
		return ch, false
	}
	numChar := quote.numChar
	n := numChar + quote.numHash
	want := quote.char
	for i := 1; i < n; i++ {
		if i == numChar {
			// After the quote characters, the remaining delimiter
			// characters must be hashes.
			want = '#'
		}
		if want != s.ch {
			return ch, false
		}
		ch = s.ch
		s.next()
	}
	return s.ch, true
}
   487  
   488  func (s *Scanner) scanHashes(maxHash int) int {
   489  	for i := range maxHash {
   490  		if s.ch != '#' {
   491  			return i
   492  		}
   493  		s.next()
   494  	}
   495  	return maxHash
   496  }
   497  
   498  func stripCR(b []byte) []byte {
   499  	c := make([]byte, len(b))
   500  	i := 0
   501  	for _, ch := range b {
   502  		if ch != '\r' {
   503  			c[i] = ch
   504  			i++
   505  		}
   506  	}
   507  	return c[:i]
   508  }
   509  
// scanAttribute scans a full attribute of the form @foo(str). An attribute
// is a lexical entry and as such whitespace is treated as normal characters
// within the attribute. The returned literal spans from the '@' to the
// closing parenthesis.
func (s *Scanner) scanAttribute() (tok token.Token, lit string) {
	offs := s.offset - 1 // @ already consumed

	s.scanIdentifier()

	if _, tok, _ := s.Scan(); tok == token.LPAREN {
		s.scanAttributeTokens(token.RPAREN)
	} else {
		s.errf(s.offset, "invalid attribute: expected '('")
	}
	return token.ATTRIBUTE, string(s.src[offs:s.offset])
}
   525  
// scanAttributeTokens consumes tokens until the matching close token is
// found, recursing for nested (), [], and {} groups. Interpolations and
// mismatched closing tokens inside an attribute are reported as errors.
func (s *Scanner) scanAttributeTokens(close token.Token) {
	for {
		switch _, tok, _ := s.Scan(); tok {
		case close:
			return
		case token.EOF:
			s.errf(s.offset, "attribute missing '%s'", close)
			return

		case token.INTERPOLATION:
			// Discard the suspended string state and skip past the
			// offending interpolation.
			s.errf(s.offset, "interpolation not allowed in attribute")
			s.popInterpolation()
			s.recoverParen(1)
		case token.LPAREN:
			s.scanAttributeTokens(token.RPAREN)
		case token.LBRACE:
			s.scanAttributeTokens(token.RBRACE)
		case token.LBRACK:
			s.scanAttributeTokens(token.RBRACK)
		case token.RPAREN, token.RBRACK, token.RBRACE:
			s.errf(s.offset, "unexpected '%s'", tok)
		}
	}
}
   550  
   551  // recoverParen is an approximate recovery mechanism to recover from invalid
   552  // attributes.
   553  func (s *Scanner) recoverParen(open int) {
   554  	for {
   555  		switch s.ch {
   556  		case '\n', -1:
   557  			return
   558  		case '(':
   559  			open++
   560  		case ')':
   561  			if open--; open == 0 {
   562  				return
   563  			}
   564  		}
   565  		s.next()
   566  	}
   567  }
   568  
// skipWhitespace skips spaces, tabs, carriage returns, and newlines,
// adding inc to spacesSinceLast and linesSinceLast as it goes. When a
// comma must be inserted (s.insertEOL), it stops at the newline so Scan
// can turn it into a COMMA token.
func (s *Scanner) skipWhitespace(inc int) {
	for {
		switch s.ch {
		case ' ', '\t':
			s.spacesSinceLast += inc
		case '\n':
			s.linesSinceLast += inc
			if s.insertEOL {
				return
			}
		case '\r':
		default:
			return
		}
		s.next()
	}
}
   586  
// switch2 is a helper for scanning two-character tokens: if the current
// character is '=', it is consumed and tok1 is returned (e.g. "<=");
// otherwise tok0 is returned (e.g. "<").
func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	return tok0
}
   600  
   601  func (s *Scanner) popInterpolation() quoteInfo {
   602  	quote := s.quoteStack[len(s.quoteStack)-1]
   603  	s.quoteStack = s.quoteStack[:len(s.quoteStack)-1]
   604  	return quote
   605  }
   606  
// ResumeInterpolation resumes scanning of a string interpolation using
// the quote information saved when scanning was suspended, and returns
// the literal text scanned up to the next interpolation or the end of
// the string.
func (s *Scanner) ResumeInterpolation() string {
	quote := s.popInterpolation()
	_, str := s.scanString(s.offset-1, quote)
	return str
}
   613  
// Offset returns the current position offset, i.e. the byte offset of
// the current character within the source.
func (s *Scanner) Offset() int {
	return s.offset
}
   618  
// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// EOF.
//
// If the returned token is a literal (IDENT, INT, FLOAT,
// IMAG, CHAR, STRING) or COMMENT, the literal string
// has the corresponding value.
//
// If the returned token is a keyword, the literal string is the keyword.
//
// If the returned token is Comma, the corresponding
// literal string is "," if the comma was present in the source,
// and "\n" if the comma was inserted because of a newline or
// at EOF.
//
// If the returned token is ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace(1)

	// Derive the token's relative position from the whitespace that was
	// just skipped.
	var rel token.RelPos
	switch {
	case s.linesSinceLast > 1:
		rel = token.NewSection
	case s.linesSinceLast == 1:
		rel = token.Newline
	case s.spacesSinceLast > 0:
		rel = token.Blank
	default:
		rel = token.NoSpace
	}
	// current token start
	offset := s.offset
	pos = s.file.Pos(offset, rel)

	// determine token value
	insertEOL := false
	var quote quoteInfo
	switch ch := s.ch; {
	case '0' <= ch && ch <= '9':
		insertEOL = true
		tok, lit = s.scanNumber(false)
	case isLetter(ch), ch == '$', ch == '#':
		lit = s.scanFieldIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			insertEOL = true
			break
		}
		if ch != '#' || (s.ch != '\'' && s.ch != '"' && s.ch != '#') {
			tok = token.IDENT
			insertEOL = true
			break
		}
		// A lone '#' directly followed by a quote or another '#': the
		// start of a hash-guarded string literal such as #"..."#.
		quote.numHash = 1
		ch = s.ch
		fallthrough
	default:
		s.next() // always make progress
		switch ch {
		case -1:
			if s.insertEOL {
				s.insertEOL = false // EOF consumed
				return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
			}
			tok = token.EOF
		case '_':
			if s.ch == '|' {
				// Unconditionally require this to be followed by another
				// underscore to avoid needing an extra lookahead.
				// Note that `_|x` is always equal to _.
				s.next()
				if s.ch != '_' {
					s.errf(s.file.Offset(pos), "illegal token '_|'; expected '_'")
					insertEOL = s.insertEOL // preserve insertEOL info
					tok = token.ILLEGAL
					lit = "_|"
					break
				}
				s.next()
				tok = token.BOTTOM
				lit = "_|_"
			} else {
				tok = token.IDENT
				lit = "_" + s.scanFieldIdentifier()
			}
			insertEOL = true

		case '\n':
			// we only reach here if s.insertEOL was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertEOL = false // newline consumed
			p := s.file.Pos(offset, token.Elided)
			s.skipWhitespace(1)
			// Don't elide comma before a ',' or ':' to ensure JSON
			// conformance. Note that cue fmt should immediately undo those.
			if s.ch == ',' || s.ch == ':' {
				return s.Scan()
			}
			return p, token.COMMA, "\n"

		case '#':
			// Count the leading hashes of a hash-guarded string literal.
			for quote.numHash++; s.ch == '#'; quote.numHash++ {
				s.next()
			}
			ch = s.ch
			if ch != '\'' && ch != '"' {
				break
			}
			s.next()
			fallthrough
		case '"', '\'':
			insertEOL = true
			quote.char = ch
			quote.numChar = 1
			offs := s.offset - 1 - quote.numHash
			switch _, n := s.consumeQuotes(ch, 2); n {
			case 0:
				quote.numChar = 1
				tok, lit = s.scanString(offs, quote)
			case 1:
				// When the string is surrounded by hashes,
				// a single leading quote is OK (and part of the string)
				// e.g. #""hello""#
				// unless it's succeeded by the correct number of terminating
				// hash characters
				// e.g. ##""##
				if n := s.scanHashes(quote.numHash); n == quote.numHash {
					// It's the empty string.
					tok, lit = token.STRING, string(s.src[offs:s.offset])
				} else {
					tok, lit = s.scanString(offs, quote)
				}
			case 2:
				// Multiline string; it must start on a fresh line.
				quote.numChar = 3
				switch s.ch {
				case '\n':
					s.next()
					tok, lit = s.scanString(offs, quote)
				case '\r':
					s.next()
					if s.ch == '\n' {
						s.next()
						tok, lit = s.scanString(offs, quote)
						break
					}
					fallthrough
				default:
					s.errf(offs, "expected newline after multiline quote %s",
						s.src[offs:s.offset])
					tok, lit = token.STRING, string(s.src[offs:s.offset])
				}
			}
		case '@':
			insertEOL = true
			tok, lit = s.scanAttribute()
		case ':':
			tok = token.COLON
		case ';':
			tok = token.SEMICOLON
			insertEOL = true
		case '?':
			tok = token.OPTION
			insertEOL = true
		case '.':
			if '0' <= s.ch && s.ch <= '9' {
				insertEOL = true
				tok, lit = s.scanNumber(true)
			} else if s.ch == '.' {
				s.next()
				if s.ch == '.' {
					s.next()
					tok = token.ELLIPSIS
					insertEOL = true
				} else {
					s.errf(s.file.Offset(pos), "illegal token '..'; expected '.'")
				}
			} else {
				tok = token.PERIOD
			}
		case ',':
			tok = token.COMMA
			lit = ","
		case '(':
			tok = token.LPAREN
		case ')':
			insertEOL = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertEOL = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertEOL = true
			tok = token.RBRACE
		case '+':
			tok = token.ADD // Consider ++ for list concatenate.
		case '-':
			tok = token.SUB
		case '*':
			tok = token.MUL
		case '/':
			if s.ch == '/' {
				// comment
				if s.insertEOL {
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertEOL = false // newline consumed
					return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertEOL = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				tok = token.QUO
			}
		// We no longer use %, but seems like a useful token to use for
		// something else at some point.
		// case '%':
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch2(token.LSS, token.LEQ)
			}
		case '>':
			tok = s.switch2(token.GTR, token.GEQ)
		case '=':
			if s.ch == '~' {
				s.next()
				tok = token.MAT
			} else {
				tok = s.switch2(token.BIND, token.EQL)
			}
		case '!':
			if s.ch == '~' {
				s.next()
				tok = token.NMAT
			} else {
				tok = s.switch2(token.NOT, token.NEQ)
			}
		case '&':
			switch s.ch {
			case '&':
				s.next()
				tok = token.LAND
			default:
				tok = token.AND
			}
		case '|':
			if s.ch == '|' {
				s.next()
				tok = token.LOR
			} else {
				tok = token.OR
			}
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.errf(s.file.Offset(pos), "illegal character %#U", ch)
			}
			insertEOL = s.insertEOL // preserve insertEOL info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	if s.mode&DontInsertCommas == 0 {
		s.insertEOL = insertEOL
	}

	s.linesSinceLast = 0
	s.spacesSinceLast = 0
	return
}