github.com/joomcode/cue@v0.4.4-0.20221111115225-539fe3512047/cue/scanner/scanner.go

     1  // Copyright 2018 The CUE Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package scanner implements a scanner for CUE source text. It takes a []byte
    16  // as source which can then be tokenized through repeated calls to the Scan
    17  // method.
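         //
         // A minimal usage sketch (token.NewFile is assumed here as the constructor
         // for *token.File; adapt it to the actual token API):
         //
         //	var s scanner.Scanner
         //	src := []byte("a: 1\n") // the CUE source to tokenize
         //	file := token.NewFile("example.cue", -1, len(src)) // assumed constructor
         //	s.Init(file, src, nil, scanner.ScanComments)
         //	for {
         //		_, tok, lit := s.Scan()
         //		if tok == token.EOF {
         //			break
         //		}
         //		fmt.Println(tok, lit)
         //	}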
    18  package scanner // import "github.com/joomcode/cue/cue/scanner"
    19  
    20  import (
    21  	"bytes"
    22  	"fmt"
    23  	"path/filepath"
    24  	"strconv"
    25  	"strings"
    26  	"unicode"
    27  	"unicode/utf8"
    28  
    29  	"github.com/joomcode/cue/cue/token"
    30  )
    31  
    32  // An ErrorHandler is a generic error handler used throughout CUE packages.
    33  //
    34  // The position points to the beginning of the offending value.
    35  type ErrorHandler func(pos token.Pos, msg string, args []interface{})
    36  
     37  // A Scanner holds the scanner's internal state while processing
    38  // a given text. It can be allocated as part of another data
    39  // structure but must be initialized via Init before use.
    40  type Scanner struct {
    41  	// immutable state
    42  	file *token.File  // source file handle
    43  	dir  string       // directory portion of file.Name()
    44  	src  []byte       // source
    45  	errh ErrorHandler // error reporting; or nil
    46  	mode Mode         // scanning mode
    47  
    48  	// scanning state
    49  	ch              rune // current character
    50  	offset          int  // character offset
    51  	rdOffset        int  // reading offset (position after current character)
    52  	lineOffset      int  // current line offset
    53  	linesSinceLast  int
    54  	spacesSinceLast int
    55  	insertEOL       bool // insert a comma before next newline
    56  
    57  	quoteStack []quoteInfo
    58  
    59  	// public state - ok to modify
    60  	ErrorCount int // number of errors encountered
    61  }
    62  
    63  type quoteInfo struct {
    64  	char    rune
    65  	numChar int
    66  	numHash int
    67  }
    68  
    69  const bom = 0xFEFF // byte order mark, only permitted as very first character
    70  
    71  // Read the next Unicode char into s.ch.
    72  // s.ch < 0 means end-of-file.
    73  func (s *Scanner) next() {
    74  	if s.rdOffset < len(s.src) {
    75  		s.offset = s.rdOffset
    76  		if s.ch == '\n' {
    77  			s.lineOffset = s.offset
    78  			s.file.AddLine(s.offset)
    79  		}
    80  		r, w := rune(s.src[s.rdOffset]), 1
    81  		switch {
    82  		case r == 0:
    83  			s.errf(s.offset, "illegal character NUL")
    84  		case r >= utf8.RuneSelf:
    85  			// not ASCII
    86  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
    87  			if r == utf8.RuneError && w == 1 {
    88  				s.errf(s.offset, "illegal UTF-8 encoding")
    89  			} else if r == bom && s.offset > 0 {
    90  				s.errf(s.offset, "illegal byte order mark")
    91  			}
    92  		}
    93  		s.rdOffset += w
    94  		s.ch = r
    95  	} else {
    96  		s.offset = len(s.src)
    97  		if s.ch == '\n' {
    98  			s.lineOffset = s.offset
    99  			s.file.AddLine(s.offset)
   100  		}
   101  		s.ch = -1 // eof
   102  	}
   103  }
   104  
   105  // A Mode value is a set of flags (or 0).
   106  // They control scanner behavior.
   107  type Mode uint
   108  
   109  // These constants are options to the Init function.
   110  const (
   111  	ScanComments     Mode = 1 << iota // return comments as COMMENT tokens
   112  	dontInsertCommas                  // do not automatically insert commas - for testing only
   113  )
   114  
    115  // Init prepares the scanner s to tokenize the text src by setting the
    116  // scanner at the beginning of src. The scanner uses the file handle file
    117  // for position information and it adds line information for each line.
    118  // It is ok to re-use the same file when re-scanning the same source, as
    119  // line information which is already present is ignored. Init causes a
    120  // panic if the file size does not match the src size.
    121  //
    122  // Calls to Scan will invoke the error handler eh if they encounter a
    123  // syntax error and eh is not nil. Also, for each error encountered,
    124  // the Scanner field ErrorCount is incremented by one. The mode parameter
    125  // determines how comments are handled.
    126  //
    127  // Note that Init may call eh if there is an error in the first character
    128  // of the file.
   129  func (s *Scanner) Init(file *token.File, src []byte, eh ErrorHandler, mode Mode) {
   130  	// Explicitly initialize all fields since a scanner may be reused.
   131  	if file.Size() != len(src) {
   132  		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
   133  	}
   134  	s.file = file
   135  	s.dir, _ = filepath.Split(file.Name())
   136  	s.src = src
   137  	s.errh = eh
   138  	s.mode = mode
   139  
   140  	s.ch = ' '
   141  	s.offset = 0
   142  	s.rdOffset = 0
   143  	s.lineOffset = 0
   144  	s.insertEOL = false
   145  	s.ErrorCount = 0
   146  
   147  	s.next()
   148  	if s.ch == bom {
   149  		s.next() // ignore BOM at file beginning
   150  	}
   151  }
   152  
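         // errf reports an error at the given offset to the installed error
         // handler, if any, and increments ErrorCount.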
   153  func (s *Scanner) errf(offs int, msg string, args ...interface{}) {
   154  	if s.errh != nil {
   155  		s.errh(s.file.Pos(offs, 0), msg, args)
   156  	}
   157  	s.ErrorCount++
   158  }
   159  
   160  var prefix = []byte("//line ")
   161  
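         // interpretLineComment updates the file's position information if text is
         // a "//line filename:line" comment, so that subsequent positions report
         // the given filename and line number.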
   162  func (s *Scanner) interpretLineComment(text []byte) {
   163  	if bytes.HasPrefix(text, prefix) {
   164  		// get filename and line number, if any
   165  		if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
   166  			if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
   167  				// valid //line filename:line comment
   168  				filename := string(bytes.TrimSpace(text[len(prefix):i]))
   169  				if filename != "" {
   170  					filename = filepath.Clean(filename)
   171  					if !filepath.IsAbs(filename) {
   172  						// make filename relative to current directory
   173  						filename = filepath.Join(s.dir, filename)
   174  					}
   175  				}
   176  				// update scanner position
   177  				s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
   178  			}
   179  		}
   180  	}
   181  }
   182  
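         // scanComment scans a //-style comment. The initial '/' has already been
         // consumed. Carriage returns are stripped from the returned text, and a
         // //line comment starting at the beginning of a line updates the position
         // information of the file.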
   183  func (s *Scanner) scanComment() string {
   184  	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
   185  	offs := s.offset - 1 // position of initial '/'
   186  	hasCR := false
   187  
   188  	if s.ch == '/' {
   189  		//-style comment
   190  		s.next()
   191  		for s.ch != '\n' && s.ch >= 0 {
   192  			if s.ch == '\r' {
   193  				hasCR = true
   194  			}
   195  			s.next()
   196  		}
   197  		if offs == s.lineOffset {
   198  			// comment starts at the beginning of the current line
   199  			s.interpretLineComment(s.src[offs:s.offset])
   200  		}
   201  		goto exit
   202  	}
   203  
   204  	s.errf(offs, "comment not terminated")
   205  
   206  exit:
   207  	lit := s.src[offs:s.offset]
   208  	if hasCR {
    209  		// TODO: preserve \r\n
   210  		lit = stripCR(lit)
   211  	}
   212  
   213  	return string(lit)
   214  }
   215  
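         // findLineEnd reports whether the comment starting at the already consumed
         // '/' is followed by a newline (or EOF) before the next non-comment token,
         // in which case a comma has to be inserted. The scanner state is restored
         // before returning.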
   216  func (s *Scanner) findLineEnd() bool {
   217  	// initial '/' already consumed
   218  
   219  	defer func(offs int) {
   220  		// reset scanner state to where it was upon calling findLineEnd
   221  		s.ch = '/'
   222  		s.offset = offs
   223  		s.rdOffset = offs + 1
   224  		s.next() // consume initial '/' again
   225  	}(s.offset - 1)
   226  
   227  	// read ahead until a newline, EOF, or non-comment token is found
   228  	for s.ch == '/' || s.ch == '*' {
   229  		if s.ch == '/' {
   230  			//-style comment always contains a newline
   231  			return true
   232  		}
   233  		/*-style comment: look for newline */
   234  		s.next()
   235  		for s.ch >= 0 {
   236  			ch := s.ch
   237  			if ch == '\n' {
   238  				return true
   239  			}
   240  			s.next()
   241  			if ch == '*' && s.ch == '/' {
   242  				s.next()
   243  				break
   244  			}
   245  		}
    246  		s.skipWhitespace(0) // s.insertEOL is set
   247  		if s.ch < 0 || s.ch == '\n' {
   248  			return true
   249  		}
   250  		if s.ch != '/' {
   251  			// non-comment token
   252  			return false
   253  		}
   254  		s.next() // consume '/'
   255  	}
   256  
   257  	return false
   258  }
   259  
   260  func isLetter(ch rune) bool {
   261  	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
   262  }
   263  
   264  func isDigit(ch rune) bool {
   265  	// TODO(mpvl): Is this correct?
   266  	return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
   267  }
   268  
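         // scanFieldIdentifier scans an identifier as it may appear in a field
         // label, allowing an optional leading '_' and '#'.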
   269  func (s *Scanner) scanFieldIdentifier() string {
   270  	offs := s.offset
   271  	if s.ch == '_' {
   272  		s.next()
   273  	}
   274  	if s.ch == '#' {
   275  		s.next()
   276  		// TODO: remove this block to allow #<num>
   277  		if isDigit(s.ch) {
   278  			return string(s.src[offs:s.offset])
   279  		}
   280  	}
   281  	for isLetter(s.ch) || isDigit(s.ch) || s.ch == '_' || s.ch == '$' {
   282  		s.next()
   283  	}
   284  	return string(s.src[offs:s.offset])
   285  }
   286  
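         // scanIdentifier scans an identifier consisting of letters, digits, '_',
         // and '$'.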
   287  func (s *Scanner) scanIdentifier() string {
   288  	offs := s.offset
   289  	for isLetter(s.ch) || isDigit(s.ch) || s.ch == '_' || s.ch == '$' {
   290  		s.next()
   291  	}
   292  	return string(s.src[offs:s.offset])
   293  }
   294  
   295  func isExtendedIdent(r rune) bool {
   296  	return strings.IndexRune("-_#$%. ", r) >= 0
   297  }
   298  
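         // digitVal returns the value of ch interpreted as a digit, treating '_'
         // as 0. It returns 16 for any rune that is not a valid digit.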
   299  func digitVal(ch rune) int {
   300  	switch {
   301  	case '0' <= ch && ch <= '9':
   302  		return int(ch - '0')
   303  	case ch == '_':
   304  		return 0
   305  	case 'a' <= ch && ch <= 'f':
   306  		return int(ch - 'a' + 10)
   307  	case 'A' <= ch && ch <= 'F':
   308  		return int(ch - 'A' + 10)
   309  	}
   310  	return 16 // larger than any legal digit val
   311  }
   312  
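         // scanMantissa consumes a run of digits in the given base, allowing '_'
         // as a separator but reporting an error for consecutive or trailing '_'.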
   313  func (s *Scanner) scanMantissa(base int) {
   314  	var last rune
   315  	for digitVal(s.ch) < base {
   316  		if last == '_' && s.ch == '_' {
   317  			s.errf(s.offset, "illegal '_' in number")
   318  		}
   319  		last = s.ch
   320  		s.next()
   321  	}
   322  	if last == '_' {
   323  		s.errf(s.offset-1, "illegal '_' in number")
   324  	}
   325  }
   326  
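         // scanNumber scans an integer or floating-point literal in any of the
         // supported bases, including multipliers such as K, M, G, T, and P
         // (optionally followed by 'i'). seenDecimalPoint indicates that a leading
         // '.' has already been consumed.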
   327  func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
   328  	// digitVal(s.ch) < 10
   329  	offs := s.offset
   330  	tok := token.INT
   331  
   332  	if seenDecimalPoint {
   333  		offs--
   334  		tok = token.FLOAT
   335  		s.scanMantissa(10)
   336  		goto exponent
   337  	}
   338  
   339  	if s.ch == '0' {
   340  		// int or float
   341  		offs := s.offset
   342  		s.next()
   343  		if s.ch == 'x' || s.ch == 'X' {
   344  			// hexadecimal int
   345  			s.next()
   346  			s.scanMantissa(16)
   347  			if s.offset-offs <= 2 {
   348  				// only scanned "0x" or "0X"
   349  				s.errf(offs, "illegal hexadecimal number")
   350  			}
   351  		} else if s.ch == 'b' {
   352  			// binary int
   353  			s.next()
   354  			s.scanMantissa(2)
   355  			if s.offset-offs <= 2 {
   356  				// only scanned "0b"
   357  				s.errf(offs, "illegal binary number")
   358  			}
   359  		} else if s.ch == 'o' {
   360  			// octal int
   361  			s.next()
   362  			s.scanMantissa(8)
   363  			if s.offset-offs <= 2 {
   364  				// only scanned "0o"
   365  				s.errf(offs, "illegal octal number")
   366  			}
   367  		} else {
   368  			// 0 or float
   369  			seenDigits := false
   370  			if s.ch >= '0' && s.ch <= '9' {
   371  				seenDigits = true
   372  				s.scanMantissa(10)
   373  			}
   374  			if s.ch == '.' || s.ch == 'e' || s.ch == 'E' {
   375  				goto fraction
   376  			}
   377  			if seenDigits {
   378  				// integer other than 0 may not start with 0
   379  				s.errf(offs, "illegal integer number")
   380  			}
   381  		}
   382  		goto exit
   383  	}
   384  
   385  	// decimal int or float
   386  	s.scanMantissa(10)
   387  
   388  	// TODO: allow 3h4s, etc.
   389  	// switch s.ch {
    390  	// case 'h', 'm', 's', "µ"[0], 'u', 'n':
   391  	// }
   392  
   393  fraction:
   394  	if s.ch == '.' {
   395  		if p := s.offset + 1; p < len(s.src) && s.src[p] == '.' {
   396  			// interpret dot as part of a range.
   397  			goto exit
   398  		}
   399  		tok = token.FLOAT
   400  		s.next()
   401  		s.scanMantissa(10)
   402  	}
   403  
   404  exponent:
   405  	switch s.ch {
   406  	case 'K', 'M', 'G', 'T', 'P':
   407  		tok = token.INT // TODO: Or should we allow this to be a float?
   408  		s.next()
   409  		if s.ch == 'i' {
   410  			s.next()
   411  		}
   412  		goto exit
   413  	}
   414  
   415  	if s.ch == 'e' || s.ch == 'E' {
   416  		tok = token.FLOAT
   417  		s.next()
   418  		if s.ch == '-' || s.ch == '+' {
   419  			s.next()
   420  		}
   421  		s.scanMantissa(10)
   422  	}
   423  
   424  exit:
   425  	return tok, string(s.src[offs:s.offset])
   426  }
   427  
    428  // scanEscape parses an escape sequence, where quote describes the accepted
    429  // escaped quote. In case of a syntax error, it stops at the offending
   430  // character (without consuming it) and returns false. Otherwise
   431  // it returns true.
   432  //
   433  // Must be compliant with https://tools.ietf.org/html/rfc4627.
   434  func (s *Scanner) scanEscape(quote quoteInfo) (ok, interpolation bool) {
   435  	for i := 0; i < quote.numHash; i++ {
   436  		if s.ch != '#' {
   437  			return true, false
   438  		}
   439  		s.next()
   440  	}
   441  
   442  	offs := s.offset
   443  
   444  	var n int
   445  	var base, max uint32
   446  	switch s.ch {
   447  	case '(':
   448  		return true, true
   449  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '/', quote.char:
   450  		s.next()
   451  		return true, false
   452  	case '0', '1', '2', '3', '4', '5', '6', '7':
   453  		n, base, max = 3, 8, 255
   454  	case 'x':
   455  		s.next()
   456  		n, base, max = 2, 16, 255
   457  	case 'u':
   458  		s.next()
   459  		n, base, max = 4, 16, unicode.MaxRune
   460  	case 'U':
   461  		s.next()
   462  		n, base, max = 8, 16, unicode.MaxRune
   463  	default:
   464  		msg := "unknown escape sequence"
   465  		if s.ch < 0 {
   466  			msg = "escape sequence not terminated"
   467  		}
   468  		s.errf(offs, msg)
   469  		return false, false
   470  	}
   471  
   472  	var x uint32
   473  	for n > 0 {
   474  		d := uint32(digitVal(s.ch))
   475  		if d >= base {
   476  			if s.ch < 0 {
   477  				s.errf(s.offset, "escape sequence not terminated")
   478  			} else {
   479  				s.errf(s.offset, "illegal character %#U in escape sequence", s.ch)
   480  			}
   481  			return false, false
   482  		}
   483  		x = x*base + d
   484  		s.next()
   485  		n--
   486  	}
   487  
   488  	// TODO: this is valid JSON, so remove, but normalize and report an error
    489  	// for unmatched surrogate pairs.
   490  	if x > max {
   491  		s.errf(offs, "escape sequence is invalid Unicode code point")
   492  		return false, false
   493  	}
   494  
   495  	return true, false
   496  }
   497  
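         // scanString scans the remainder of a string literal whose opening quotes
         // (and leading '#'s, if any) have already been consumed; offs marks the
         // start of the literal. It returns token.INTERPOLATION if scanning stops
         // at the start of an interpolation.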
   498  func (s *Scanner) scanString(offs int, quote quoteInfo) (token.Token, string) {
   499  	// ", """, ', or ''' opening already consumed
   500  
   501  	tok := token.STRING
   502  
   503  	hasCR := false
   504  	extra := 0
   505  	for {
   506  		ch := s.ch
   507  		if (quote.numChar != 3 && ch == '\n') || ch < 0 {
   508  			s.errf(offs, "string literal not terminated")
   509  			lit := s.src[offs:s.offset]
   510  			if hasCR {
   511  				lit = stripCR(lit)
   512  			}
   513  			return tok, string(lit)
   514  		}
   515  
   516  		s.next()
   517  		ch, ok := s.consumeStringClose(ch, quote)
   518  		if ok {
   519  			break
   520  		}
   521  		if ch == '\r' && quote.numChar == 3 {
   522  			hasCR = true
   523  		}
   524  		if ch == '\\' {
   525  			if _, interpolation := s.scanEscape(quote); interpolation {
   526  				tok = token.INTERPOLATION
   527  				extra = 1
   528  				s.quoteStack = append(s.quoteStack, quote)
   529  				break
   530  			}
   531  		}
   532  	}
   533  	lit := s.src[offs : s.offset+extra]
   534  	if hasCR {
   535  		lit = stripCR(lit)
   536  	}
   537  	return tok, string(lit)
   538  }
   539  
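         // consumeQuotes consumes up to max consecutive occurrences of quote and
         // returns the following character together with the number consumed.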
   540  func (s *Scanner) consumeQuotes(quote rune, max int) (next rune, n int) {
   541  	for ; n < max; n++ {
   542  		if s.ch != quote {
   543  			return s.ch, n
   544  		}
   545  		s.next()
   546  	}
   547  	return s.ch, n
   548  }
   549  
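         // consumeStringClose reports whether ch, together with the characters
         // that follow, closes the string literal described by quote, consuming
         // the remaining closing quotes and '#'s if so.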
   550  func (s *Scanner) consumeStringClose(ch rune, quote quoteInfo) (next rune, atEnd bool) {
   551  	if quote.char != ch {
   552  		return ch, false
   553  	}
   554  	numChar := quote.numChar
   555  	n := numChar + quote.numHash
   556  	want := quote.char
   557  	for i := 1; i < n; i++ {
   558  		if i == numChar {
   559  			want = '#'
   560  		}
   561  		if want != s.ch {
   562  			return ch, false
   563  		}
   564  		ch = s.ch
   565  		s.next()
   566  	}
   567  	return s.ch, true
   568  }
   569  
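         // checkHashCount consumes the '#'s that must follow the closing quotes of
         // a string literal and reports an error if any are missing.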
   570  func (s *Scanner) checkHashCount(offs int, quote quoteInfo) {
   571  	for i := 0; i < quote.numHash; i++ {
   572  		if s.ch != '#' {
   573  			s.errf(offs, "string literal not terminated")
   574  			return
   575  		}
   576  		s.next()
   577  	}
   578  }
   579  
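         // stripCR returns a copy of b with all carriage returns removed.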
   580  func stripCR(b []byte) []byte {
   581  	c := make([]byte, len(b))
   582  	i := 0
   583  	for _, ch := range b {
   584  		if ch != '\r' {
   585  			c[i] = ch
   586  			i++
   587  		}
   588  	}
   589  	return c[:i]
   590  }
   591  
    592  // scanAttribute scans a full attribute of the form @foo(str). An attribute
    593  // is a single lexical entry, so whitespace within it is treated as ordinary
    594  // characters rather than as token separators.
   595  func (s *Scanner) scanAttribute() (tok token.Token, lit string) {
   596  	offs := s.offset - 1 // @ already consumed
   597  
   598  	s.scanIdentifier()
   599  
   600  	if _, tok, _ := s.Scan(); tok == token.LPAREN {
   601  		s.scanAttributeTokens(token.RPAREN)
   602  	} else {
   603  		s.errf(s.offset, "invalid attribute: expected '('")
   604  	}
   605  	return token.ATTRIBUTE, string(s.src[offs:s.offset])
   606  }
   607  
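         // scanAttributeTokens scans tokens up to and including the given closing
         // token, recursing for nested parentheses, brackets, and braces and
         // reporting an error for EOF, interpolations, and mismatched closers.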
   608  func (s *Scanner) scanAttributeTokens(close token.Token) {
   609  	for {
   610  		switch _, tok, _ := s.Scan(); tok {
   611  		case close:
   612  			return
   613  		case token.EOF:
   614  			s.errf(s.offset, "attribute missing '%s'", close)
   615  			return
   616  
   617  		case token.INTERPOLATION:
   618  			s.errf(s.offset, "interpolation not allowed in attribute")
   619  			s.popInterpolation()
   620  			s.recoverParen(1)
   621  		case token.LPAREN:
   622  			s.scanAttributeTokens(token.RPAREN)
   623  		case token.LBRACE:
   624  			s.scanAttributeTokens(token.RBRACE)
   625  		case token.LBRACK:
   626  			s.scanAttributeTokens(token.RBRACK)
   627  		case token.RPAREN, token.RBRACK, token.RBRACE:
   628  			s.errf(s.offset, "unexpected '%s'", tok)
   629  		}
   630  	}
   631  }
   632  
   633  // recoverParen is an approximate recovery mechanism to recover from invalid
   634  // attributes.
   635  func (s *Scanner) recoverParen(open int) {
   636  	for {
   637  		switch s.ch {
   638  		case '\n', -1:
   639  			return
   640  		case '(':
   641  			open++
   642  		case ')':
   643  			if open--; open == 0 {
   644  				return
   645  			}
   646  		}
   647  		s.next()
   648  	}
   649  }
   650  
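         // skipWhitespace skips spaces, tabs, carriage returns and, unless a comma
         // must be inserted, newlines, counting the newlines and blanks seen
         // (weighted by inc).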
   651  func (s *Scanner) skipWhitespace(inc int) {
   652  	for {
   653  		switch s.ch {
   654  		case ' ', '\t':
   655  			s.spacesSinceLast += inc
   656  		case '\n':
   657  			s.linesSinceLast += inc
   658  			if s.insertEOL {
   659  				return
   660  			}
   661  		case '\r':
   662  		default:
   663  			return
   664  		}
   665  		s.next()
   666  	}
   667  }
   668  
    669  // switch2 is a helper for scanning two-character tokens such as >= and ==.
    670  // If the current character is '=', it is consumed and the result is tok1;
    671  // otherwise the result is tok0.
   674  
   675  func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
   676  	if s.ch == '=' {
   677  		s.next()
   678  		return tok1
   679  	}
   680  	return tok0
   681  }
   682  
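         // popInterpolation pops the quote information of the innermost string
         // interpolation currently being scanned.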
   683  func (s *Scanner) popInterpolation() quoteInfo {
   684  	quote := s.quoteStack[len(s.quoteStack)-1]
   685  	s.quoteStack = s.quoteStack[:len(s.quoteStack)-1]
   686  	return quote
   687  }
   688  
   689  // ResumeInterpolation resumes scanning of a string interpolation.
   690  func (s *Scanner) ResumeInterpolation() string {
   691  	quote := s.popInterpolation()
   692  	_, str := s.scanString(s.offset-1, quote)
   693  	return str
   694  }
   695  
   696  // Scan scans the next token and returns the token position, the token,
   697  // and its literal string if applicable. The source end is indicated by
   698  // EOF.
   699  //
    700  // If the returned token is a literal (such as IDENT, INT,
    701  // FLOAT, or STRING) or COMMENT, the literal string
    702  // has the corresponding value.
   703  //
   704  // If the returned token is a keyword, the literal string is the keyword.
   705  //
    706  // If the returned token is COMMA, the corresponding
    707  // literal string is "," if the comma was present in the source,
    708  // and "\n" if the comma was inserted because of a newline or
    709  // at EOF.
   710  //
   711  // If the returned token is ILLEGAL, the literal string is the
   712  // offending character.
   713  //
   714  // In all other cases, Scan returns an empty literal string.
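         //
         // For example (an informal sketch, not normative output), scanning the
         // input `a: 1` followed by a newline yields IDENT "a", COLON, INT "1",
         // and an inserted COMMA with literal "\n".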
   715  //
   716  // For more tolerant parsing, Scan will return a valid token if
   717  // possible even if a syntax error was encountered. Thus, even
   718  // if the resulting token sequence contains no illegal tokens,
   719  // a client may not assume that no error occurred. Instead it
   720  // must check the scanner's ErrorCount or the number of calls
   721  // of the error handler, if there was one installed.
   722  //
    723  // Scan adds line information to the file passed to Init.
    724  // Token positions are relative to that file.
   726  func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
   727  scanAgain:
   728  	s.skipWhitespace(1)
   729  
   730  	var rel token.RelPos
   731  	switch {
   732  	case s.linesSinceLast > 1:
   733  		rel = token.NewSection
   734  	case s.linesSinceLast == 1:
   735  		rel = token.Newline
   736  	case s.spacesSinceLast > 0:
   737  		rel = token.Blank
   738  	default:
   739  		rel = token.NoSpace
   740  	}
   741  	// current token start
   742  	offset := s.offset
   743  	pos = s.file.Pos(offset, rel)
   744  
   745  	// determine token value
   746  	insertEOL := false
   747  	var quote quoteInfo
   748  	switch ch := s.ch; {
   749  	case '0' <= ch && ch <= '9':
   750  		insertEOL = true
   751  		tok, lit = s.scanNumber(false)
   752  	case isLetter(ch), ch == '$', ch == '#':
   753  		lit = s.scanFieldIdentifier()
   754  		if len(lit) > 1 {
   755  			// keywords are longer than one letter - avoid lookup otherwise
   756  			tok = token.Lookup(lit)
   757  			insertEOL = true
   758  			break
   759  		}
   760  		if ch != '#' || (s.ch != '\'' && s.ch != '"' && s.ch != '#') {
   761  			tok = token.IDENT
   762  			insertEOL = true
   763  			break
   764  		}
   765  		quote.numHash = 1
   766  		ch = s.ch
   767  		fallthrough
   768  	default:
   769  		s.next() // always make progress
   770  		switch ch {
   771  		case -1:
   772  			if s.insertEOL {
   773  				s.insertEOL = false // EOF consumed
   774  				return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
   775  			}
   776  			tok = token.EOF
   777  		case '_':
   778  			if s.ch == '|' {
   779  				// Unconditionally require this to be followed by another
   780  				// underscore to avoid needing an extra lookahead.
   781  				// Note that `_|x` is always equal to _.
   782  				s.next()
   783  				if s.ch != '_' {
   784  					s.errf(s.file.Offset(pos), "illegal token '_|'; expected '_'")
    785  					insertEOL = s.insertEOL // preserve insertEOL info
   786  					tok = token.ILLEGAL
   787  					lit = "_|"
   788  					break
   789  				}
   790  				s.next()
   791  				tok = token.BOTTOM
   792  				lit = "_|_"
   793  			} else {
   794  				tok = token.IDENT
   795  				lit = "_" + s.scanFieldIdentifier()
   796  			}
   797  			insertEOL = true
   798  
   799  		case '\n':
    800  			// we only reach here if s.insertEOL was
    801  			// set in the first place and we exited early
    802  			// from s.skipWhitespace()
   803  			s.insertEOL = false // newline consumed
   804  			p := s.file.Pos(offset, token.Elided)
   805  			s.skipWhitespace(1)
   806  			// Don't elide comma before a ',' or ':' to ensure JSON
   807  			// conformance. Note that cue fmt should immediately undo those.
   808  			if s.ch == ',' || s.ch == ':' {
   809  				return s.Scan()
   810  			}
   811  			return p, token.COMMA, "\n"
   812  
   813  		case '#':
   814  			for quote.numHash++; s.ch == '#'; quote.numHash++ {
   815  				s.next()
   816  			}
   817  			ch = s.ch
   818  			if ch != '\'' && ch != '"' {
   819  				break
   820  			}
   821  			s.next()
   822  			fallthrough
   823  		case '"', '\'':
   824  			insertEOL = true
   825  			quote.char = ch
   826  			quote.numChar = 1
   827  			offs := s.offset - 1 - quote.numHash
   828  			switch _, n := s.consumeQuotes(ch, 2); n {
   829  			case 0:
   830  				quote.numChar = 1
   831  				tok, lit = s.scanString(offs, quote)
   832  			case 1:
   833  				s.checkHashCount(offs, quote)
   834  				tok, lit = token.STRING, string(s.src[offs:s.offset])
   835  			case 2:
   836  				quote.numChar = 3
   837  				switch s.ch {
   838  				case '\n':
   839  					s.next()
   840  					tok, lit = s.scanString(offs, quote)
   841  				case '\r':
   842  					s.next()
   843  					if s.ch == '\n' {
   844  						s.next()
   845  						tok, lit = s.scanString(offs, quote)
   846  						break
   847  					}
   848  					fallthrough
   849  				default:
   850  					s.errf(offs, "expected newline after multiline quote %s",
   851  						s.src[offs:s.offset])
   852  					tok, lit = token.STRING, string(s.src[offs:s.offset])
   853  				}
   854  			}
   855  		case '@':
   856  			insertEOL = true
   857  			tok, lit = s.scanAttribute()
   858  		case ':':
   859  			if s.ch == ':' {
   860  				s.next()
   861  				tok = token.ISA
   862  			} else {
   863  				tok = token.COLON
   864  			}
   865  		case ';':
   866  			tok = token.SEMICOLON
   867  			insertEOL = true
   868  		case '?':
   869  			tok = token.OPTION
   870  			insertEOL = true
   871  		case '.':
   872  			if '0' <= s.ch && s.ch <= '9' {
   873  				insertEOL = true
   874  				tok, lit = s.scanNumber(true)
   875  			} else if s.ch == '.' {
   876  				s.next()
   877  				if s.ch == '.' {
   878  					s.next()
   879  					tok = token.ELLIPSIS
   880  					insertEOL = true
   881  				} else {
   882  					s.errf(s.file.Offset(pos), "illegal token '..'; expected '.'")
   883  				}
   884  			} else {
   885  				tok = token.PERIOD
   886  			}
   887  		case ',':
   888  			tok = token.COMMA
   889  			lit = ","
   890  		case '(':
   891  			tok = token.LPAREN
   892  		case ')':
   893  			insertEOL = true
   894  			tok = token.RPAREN
   895  		case '[':
   896  			tok = token.LBRACK
   897  		case ']':
   898  			insertEOL = true
   899  			tok = token.RBRACK
   900  		case '{':
   901  			tok = token.LBRACE
   902  		case '}':
   903  			insertEOL = true
   904  			tok = token.RBRACE
   905  		case '+':
   906  			tok = token.ADD // Consider ++ for list concatenate.
   907  		case '-':
   908  			tok = token.SUB
   909  		case '*':
   910  			tok = token.MUL
   911  		case '/':
   912  			if s.ch == '/' {
   913  				// comment
   914  				if s.insertEOL && s.findLineEnd() {
   915  					// reset position to the beginning of the comment
   916  					s.ch = '/'
   917  					s.offset = s.file.Offset(pos)
   918  					s.rdOffset = s.offset + 1
   919  					s.insertEOL = false // newline consumed
   920  					return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
   921  				}
   922  				comment := s.scanComment()
   923  				if s.mode&ScanComments == 0 {
   924  					// skip comment
   925  					s.insertEOL = false // newline consumed
   926  					goto scanAgain
   927  				}
   928  				tok = token.COMMENT
   929  				lit = comment
   930  			} else {
   931  				tok = token.QUO
   932  			}
    933  		// We no longer use %, but it seems like a useful token to reserve for
    934  		// something else at some point.
   935  		// case '%':
   936  		case '<':
   937  			if s.ch == '-' {
   938  				s.next()
   939  				tok = token.ARROW
   940  			} else {
   941  				tok = s.switch2(token.LSS, token.LEQ)
   942  			}
   943  		case '>':
   944  			tok = s.switch2(token.GTR, token.GEQ)
   945  		case '=':
   946  			if s.ch == '~' {
   947  				s.next()
   948  				tok = token.MAT
   949  			} else {
   950  				tok = s.switch2(token.BIND, token.EQL)
   951  			}
   952  		case '!':
   953  			if s.ch == '~' {
   954  				s.next()
   955  				tok = token.NMAT
   956  			} else {
   957  				tok = s.switch2(token.NOT, token.NEQ)
   958  			}
   959  		case '&':
   960  			switch s.ch {
   961  			case '&':
   962  				s.next()
   963  				tok = token.LAND
   964  			default:
   965  				tok = token.AND
   966  			}
   967  		case '|':
   968  			if s.ch == '|' {
   969  				s.next()
   970  				tok = token.LOR
   971  			} else {
   972  				tok = token.OR
   973  			}
   974  		default:
   975  			// next reports unexpected BOMs - don't repeat
   976  			if ch != bom {
   977  				s.errf(s.file.Offset(pos), "illegal character %#U", ch)
   978  			}
    979  			insertEOL = s.insertEOL // preserve insertEOL info
   980  			tok = token.ILLEGAL
   981  			lit = string(ch)
   982  		}
   983  	}
   984  	if s.mode&dontInsertCommas == 0 {
   985  		s.insertEOL = insertEOL
   986  	}
   987  
   988  	s.linesSinceLast = 0
   989  	s.spacesSinceLast = 0
   990  	return
   991  }