go.starlark.net@v0.0.0-20231101134539-556fd59b42f6/syntax/scan.go (about)

     1  // Copyright 2017 The Bazel Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // A lexical scanner for Starlark.
     8  
     9  import (
    10  	"fmt"
    11  	"io"
    12  	"log"
    13  	"math/big"
    14  	"os"
    15  	"strconv"
    16  	"strings"
    17  	"unicode"
    18  	"unicode/utf8"
    19  )
    20  
    21  // A Token represents a Starlark lexical token.
    22  type Token int8
    23  
    24  const (
    25  	ILLEGAL Token = iota
    26  	EOF
    27  
    28  	NEWLINE
    29  	INDENT
    30  	OUTDENT
    31  
    32  	// Tokens with values
    33  	IDENT  // x
    34  	INT    // 123
    35  	FLOAT  // 1.23e45
    36  	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
    37  	BYTES  // b"foo", etc
    38  
    39  	// Punctuation
    40  	PLUS          // +
    41  	MINUS         // -
    42  	STAR          // *
    43  	SLASH         // /
    44  	SLASHSLASH    // //
    45  	PERCENT       // %
    46  	AMP           // &
    47  	PIPE          // |
    48  	CIRCUMFLEX    // ^
    49  	LTLT          // <<
    50  	GTGT          // >>
    51  	TILDE         // ~
    52  	DOT           // .
    53  	COMMA         // ,
    54  	EQ            // =
    55  	SEMI          // ;
    56  	COLON         // :
    57  	LPAREN        // (
    58  	RPAREN        // )
    59  	LBRACK        // [
    60  	RBRACK        // ]
    61  	LBRACE        // {
    62  	RBRACE        // }
    63  	LT            // <
    64  	GT            // >
    65  	GE            // >=
    66  	LE            // <=
    67  	EQL           // ==
    68  	NEQ           // !=
    69  	PLUS_EQ       // +=    (keep order consistent with PLUS..GTGT)
    70  	MINUS_EQ      // -=
    71  	STAR_EQ       // *=
    72  	SLASH_EQ      // /=
    73  	SLASHSLASH_EQ // //=
    74  	PERCENT_EQ    // %=
    75  	AMP_EQ        // &=
    76  	PIPE_EQ       // |=
    77  	CIRCUMFLEX_EQ // ^=
    78  	LTLT_EQ       // <<=
    79  	GTGT_EQ       // >>=
    80  	STARSTAR      // **
    81  
    82  	// Keywords
    83  	AND
    84  	BREAK
    85  	CONTINUE
    86  	DEF
    87  	ELIF
    88  	ELSE
    89  	FOR
    90  	IF
    91  	IN
    92  	LAMBDA
    93  	LOAD
    94  	NOT
    95  	NOT_IN // synthesized by parser from NOT IN
    96  	OR
    97  	PASS
    98  	RETURN
    99  	WHILE
   100  
   101  	maxToken
   102  )
   103  
   104  func (tok Token) String() string { return tokenNames[tok] }
   105  
   106  // GoString is like String but quotes punctuation tokens.
   107  // Use Sprintf("%#v", tok) when constructing error messages.
   108  func (tok Token) GoString() string {
   109  	if tok >= PLUS && tok <= STARSTAR {
   110  		return "'" + tokenNames[tok] + "'"
   111  	}
   112  	return tokenNames[tok]
   113  }
   114  
   115  var tokenNames = [...]string{
   116  	ILLEGAL:       "illegal token",
   117  	EOF:           "end of file",
   118  	NEWLINE:       "newline",
   119  	INDENT:        "indent",
   120  	OUTDENT:       "outdent",
   121  	IDENT:         "identifier",
   122  	INT:           "int literal",
   123  	FLOAT:         "float literal",
   124  	STRING:        "string literal",
   125  	PLUS:          "+",
   126  	MINUS:         "-",
   127  	STAR:          "*",
   128  	SLASH:         "/",
   129  	SLASHSLASH:    "//",
   130  	PERCENT:       "%",
   131  	AMP:           "&",
   132  	PIPE:          "|",
   133  	CIRCUMFLEX:    "^",
   134  	LTLT:          "<<",
   135  	GTGT:          ">>",
   136  	TILDE:         "~",
   137  	DOT:           ".",
   138  	COMMA:         ",",
   139  	EQ:            "=",
   140  	SEMI:          ";",
   141  	COLON:         ":",
   142  	LPAREN:        "(",
   143  	RPAREN:        ")",
   144  	LBRACK:        "[",
   145  	RBRACK:        "]",
   146  	LBRACE:        "{",
   147  	RBRACE:        "}",
   148  	LT:            "<",
   149  	GT:            ">",
   150  	GE:            ">=",
   151  	LE:            "<=",
   152  	EQL:           "==",
   153  	NEQ:           "!=",
   154  	PLUS_EQ:       "+=",
   155  	MINUS_EQ:      "-=",
   156  	STAR_EQ:       "*=",
   157  	SLASH_EQ:      "/=",
   158  	SLASHSLASH_EQ: "//=",
   159  	PERCENT_EQ:    "%=",
   160  	AMP_EQ:        "&=",
   161  	PIPE_EQ:       "|=",
   162  	CIRCUMFLEX_EQ: "^=",
   163  	LTLT_EQ:       "<<=",
   164  	GTGT_EQ:       ">>=",
   165  	STARSTAR:      "**",
   166  	AND:           "and",
   167  	BREAK:         "break",
   168  	CONTINUE:      "continue",
   169  	DEF:           "def",
   170  	ELIF:          "elif",
   171  	ELSE:          "else",
   172  	FOR:           "for",
   173  	IF:            "if",
   174  	IN:            "in",
   175  	LAMBDA:        "lambda",
   176  	LOAD:          "load",
   177  	NOT:           "not",
   178  	NOT_IN:        "not in",
   179  	OR:            "or",
   180  	PASS:          "pass",
   181  	RETURN:        "return",
   182  	WHILE:         "while",
   183  }
   184  
   185  // A FilePortion describes the content of a portion of a file.
   186  // Callers may provide a FilePortion for the src argument of Parse
   187  // when the desired initial line and column numbers are not (1, 1),
   188  // such as when an expression is parsed from within larger file.
   189  type FilePortion struct {
   190  	Content             []byte
   191  	FirstLine, FirstCol int32
   192  }
   193  
   194  // A Position describes the location of a rune of input.
   195  type Position struct {
   196  	file *string // filename (indirect for compactness)
   197  	Line int32   // 1-based line number; 0 if line unknown
   198  	Col  int32   // 1-based column (rune) number; 0 if column unknown
   199  }
   200  
   201  // IsValid reports whether the position is valid.
   202  func (p Position) IsValid() bool { return p.file != nil }
   203  
   204  // Filename returns the name of the file containing this position.
   205  func (p Position) Filename() string {
   206  	if p.file != nil {
   207  		return *p.file
   208  	}
   209  	return "<invalid>"
   210  }
   211  
   212  // MakePosition returns position with the specified components.
   213  func MakePosition(file *string, line, col int32) Position { return Position{file, line, col} }
   214  
   215  // add returns the position at the end of s, assuming it starts at p.
   216  func (p Position) add(s string) Position {
   217  	if n := strings.Count(s, "\n"); n > 0 {
   218  		p.Line += int32(n)
   219  		s = s[strings.LastIndex(s, "\n")+1:]
   220  		p.Col = 1
   221  	}
   222  	p.Col += int32(utf8.RuneCountInString(s))
   223  	return p
   224  }
   225  
   226  func (p Position) String() string {
   227  	file := p.Filename()
   228  	if p.Line > 0 {
   229  		if p.Col > 0 {
   230  			return fmt.Sprintf("%s:%d:%d", file, p.Line, p.Col)
   231  		}
   232  		return fmt.Sprintf("%s:%d", file, p.Line)
   233  	}
   234  	return file
   235  }
   236  
   237  func (p Position) isBefore(q Position) bool {
   238  	if p.Line != q.Line {
   239  		return p.Line < q.Line
   240  	}
   241  	return p.Col < q.Col
   242  }
   243  
   244  // An scanner represents a single input file being parsed.
   245  type scanner struct {
   246  	rest           []byte    // rest of input (in REPL, a line of input)
   247  	token          []byte    // token being scanned
   248  	pos            Position  // current input position
   249  	depth          int       // nesting of [ ] { } ( )
   250  	indentstk      []int     // stack of indentation levels
   251  	dents          int       // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
   252  	lineStart      bool      // after NEWLINE; convert spaces to indentation tokens
   253  	keepComments   bool      // accumulate comments in slice
   254  	lineComments   []Comment // list of full line comments (if keepComments)
   255  	suffixComments []Comment // list of suffix comments (if keepComments)
   256  
   257  	readline func() ([]byte, error) // read next line of input (REPL only)
   258  }
   259  
   260  func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
   261  	var firstLine, firstCol int32 = 1, 1
   262  	if portion, ok := src.(FilePortion); ok {
   263  		firstLine, firstCol = portion.FirstLine, portion.FirstCol
   264  	}
   265  	sc := &scanner{
   266  		pos:          MakePosition(&filename, firstLine, firstCol),
   267  		indentstk:    make([]int, 1, 10), // []int{0} + spare capacity
   268  		lineStart:    true,
   269  		keepComments: keepComments,
   270  	}
   271  	sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only
   272  	if sc.readline == nil {
   273  		data, err := readSource(filename, src)
   274  		if err != nil {
   275  			return nil, err
   276  		}
   277  		sc.rest = data
   278  	}
   279  	return sc, nil
   280  }
   281  
   282  func readSource(filename string, src interface{}) ([]byte, error) {
   283  	switch src := src.(type) {
   284  	case string:
   285  		return []byte(src), nil
   286  	case []byte:
   287  		return src, nil
   288  	case io.Reader:
   289  		data, err := io.ReadAll(src)
   290  		if err != nil {
   291  			err = &os.PathError{Op: "read", Path: filename, Err: err}
   292  			return nil, err
   293  		}
   294  		return data, nil
   295  	case FilePortion:
   296  		return src.Content, nil
   297  	case nil:
   298  		return os.ReadFile(filename)
   299  	default:
   300  		return nil, fmt.Errorf("invalid source: %T", src)
   301  	}
   302  }
   303  
   304  // An Error describes the nature and position of a scanner or parser error.
   305  type Error struct {
   306  	Pos Position
   307  	Msg string
   308  }
   309  
   310  func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg }
   311  
   312  // errorf is called to report an error.
   313  // errorf does not return: it panics.
   314  func (sc *scanner) error(pos Position, s string) {
   315  	panic(Error{pos, s})
   316  }
   317  
   318  func (sc *scanner) errorf(pos Position, format string, args ...interface{}) {
   319  	sc.error(pos, fmt.Sprintf(format, args...))
   320  }
   321  
   322  func (sc *scanner) recover(err *error) {
   323  	// The scanner and parser panic both for routine errors like
   324  	// syntax errors and for programmer bugs like array index
   325  	// errors.  Turn both into error returns.  Catching bug panics
   326  	// is especially important when processing many files.
   327  	switch e := recover().(type) {
   328  	case nil:
   329  		// no panic
   330  	case Error:
   331  		*err = e
   332  	default:
   333  		*err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)}
   334  		if debug {
   335  			log.Fatal(*err)
   336  		}
   337  	}
   338  }
   339  
   340  // eof reports whether the input has reached end of file.
   341  func (sc *scanner) eof() bool {
   342  	return len(sc.rest) == 0 && !sc.readLine()
   343  }
   344  
   345  // readLine attempts to read another line of input.
   346  // Precondition: len(sc.rest)==0.
   347  func (sc *scanner) readLine() bool {
   348  	if sc.readline != nil {
   349  		var err error
   350  		sc.rest, err = sc.readline()
   351  		if err != nil {
   352  			sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
   353  		}
   354  		return len(sc.rest) > 0
   355  	}
   356  	return false
   357  }
   358  
   359  // peekRune returns the next rune in the input without consuming it.
   360  // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
   361  func (sc *scanner) peekRune() rune {
   362  	// TODO(adonovan): opt: measure and perhaps inline eof.
   363  	if sc.eof() {
   364  		return 0
   365  	}
   366  
   367  	// fast path: ASCII
   368  	if b := sc.rest[0]; b < utf8.RuneSelf {
   369  		if b == '\r' {
   370  			return '\n'
   371  		}
   372  		return rune(b)
   373  	}
   374  
   375  	r, _ := utf8.DecodeRune(sc.rest)
   376  	return r
   377  }
   378  
   379  // readRune consumes and returns the next rune in the input.
   380  // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
   381  func (sc *scanner) readRune() rune {
   382  	// eof() has been inlined here, both to avoid a call
   383  	// and to establish len(rest)>0 to avoid a bounds check.
   384  	if len(sc.rest) == 0 {
   385  		if !sc.readLine() {
   386  			sc.error(sc.pos, "internal scanner error: readRune at EOF")
   387  		}
   388  		// Redundant, but eliminates the bounds-check below.
   389  		if len(sc.rest) == 0 {
   390  			return 0
   391  		}
   392  	}
   393  
   394  	// fast path: ASCII
   395  	if b := sc.rest[0]; b < utf8.RuneSelf {
   396  		r := rune(b)
   397  		sc.rest = sc.rest[1:]
   398  		if r == '\r' {
   399  			if len(sc.rest) > 0 && sc.rest[0] == '\n' {
   400  				sc.rest = sc.rest[1:]
   401  			}
   402  			r = '\n'
   403  		}
   404  		if r == '\n' {
   405  			sc.pos.Line++
   406  			sc.pos.Col = 1
   407  		} else {
   408  			sc.pos.Col++
   409  		}
   410  		return r
   411  	}
   412  
   413  	r, size := utf8.DecodeRune(sc.rest)
   414  	sc.rest = sc.rest[size:]
   415  	sc.pos.Col++
   416  	return r
   417  }
   418  
   419  // tokenValue records the position and value associated with each token.
   420  type tokenValue struct {
   421  	raw    string   // raw text of token
   422  	int    int64    // decoded int
   423  	bigInt *big.Int // decoded integers > int64
   424  	float  float64  // decoded float
   425  	string string   // decoded string or bytes
   426  	pos    Position // start position of token
   427  }
   428  
   429  // startToken marks the beginning of the next input token.
   430  // It must be followed by a call to endToken once the token has
   431  // been consumed using readRune.
   432  func (sc *scanner) startToken(val *tokenValue) {
   433  	sc.token = sc.rest
   434  	val.raw = ""
   435  	val.pos = sc.pos
   436  }
   437  
   438  // endToken marks the end of an input token.
   439  // It records the actual token string in val.raw if the caller
   440  // has not done that already.
   441  func (sc *scanner) endToken(val *tokenValue) {
   442  	if val.raw == "" {
   443  		val.raw = string(sc.token[:len(sc.token)-len(sc.rest)])
   444  	}
   445  }
   446  
   447  // nextToken is called by the parser to obtain the next input token.
   448  // It returns the token value and sets val to the data associated with
   449  // the token.
   450  //
   451  // For all our input tokens, the associated data is val.pos (the
   452  // position where the token begins), val.raw (the input string
   453  // corresponding to the token).  For string and int tokens, the string
   454  // and int fields additionally contain the token's interpreted value.
   455  func (sc *scanner) nextToken(val *tokenValue) Token {
   456  
   457  	// The following distribution of tokens guides case ordering:
   458  	//
   459  	//      COMMA          27   %
   460  	//      STRING         23   %
   461  	//      IDENT          15   %
   462  	//      EQL            11   %
   463  	//      LBRACK          5.5 %
   464  	//      RBRACK          5.5 %
   465  	//      NEWLINE         3   %
   466  	//      LPAREN          2.9 %
   467  	//      RPAREN          2.9 %
   468  	//      INT             2   %
   469  	//      others        < 1   %
   470  	//
   471  	// Although NEWLINE tokens are infrequent, and lineStart is
   472  	// usually (~97%) false on entry, skipped newlines account for
   473  	// about 50% of all iterations of the 'start' loop.
   474  
   475  start:
   476  	var c rune
   477  
   478  	// Deal with leading spaces and indentation.
   479  	blank := false
   480  	savedLineStart := sc.lineStart
   481  	if sc.lineStart {
   482  		sc.lineStart = false
   483  		col := 0
   484  		for {
   485  			c = sc.peekRune()
   486  			if c == ' ' {
   487  				col++
   488  				sc.readRune()
   489  			} else if c == '\t' {
   490  				const tab = 8
   491  				col += int(tab - (sc.pos.Col-1)%tab)
   492  				sc.readRune()
   493  			} else {
   494  				break
   495  			}
   496  		}
   497  
   498  		// The third clause matches EOF.
   499  		if c == '#' || c == '\n' || c == 0 {
   500  			blank = true
   501  		}
   502  
   503  		// Compute indentation level for non-blank lines not
   504  		// inside an expression.  This is not the common case.
   505  		if !blank && sc.depth == 0 {
   506  			cur := sc.indentstk[len(sc.indentstk)-1]
   507  			if col > cur {
   508  				// indent
   509  				sc.dents++
   510  				sc.indentstk = append(sc.indentstk, col)
   511  			} else if col < cur {
   512  				// outdent(s)
   513  				for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
   514  					sc.dents--
   515  					sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
   516  				}
   517  				if col != sc.indentstk[len(sc.indentstk)-1] {
   518  					sc.error(sc.pos, "unindent does not match any outer indentation level")
   519  				}
   520  			}
   521  		}
   522  	}
   523  
   524  	// Return saved indentation tokens.
   525  	if sc.dents != 0 {
   526  		sc.startToken(val)
   527  		sc.endToken(val)
   528  		if sc.dents < 0 {
   529  			sc.dents++
   530  			return OUTDENT
   531  		} else {
   532  			sc.dents--
   533  			return INDENT
   534  		}
   535  	}
   536  
   537  	// start of line proper
   538  	c = sc.peekRune()
   539  
   540  	// Skip spaces.
   541  	for c == ' ' || c == '\t' {
   542  		sc.readRune()
   543  		c = sc.peekRune()
   544  	}
   545  
   546  	// comment
   547  	if c == '#' {
   548  		if sc.keepComments {
   549  			sc.startToken(val)
   550  		}
   551  		// Consume up to newline (included).
   552  		for c != 0 && c != '\n' {
   553  			sc.readRune()
   554  			c = sc.peekRune()
   555  		}
   556  		if sc.keepComments {
   557  			sc.endToken(val)
   558  			if blank {
   559  				sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
   560  			} else {
   561  				sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
   562  			}
   563  		}
   564  	}
   565  
   566  	// newline
   567  	if c == '\n' {
   568  		sc.lineStart = true
   569  
   570  		// Ignore newlines within expressions (common case).
   571  		if sc.depth > 0 {
   572  			sc.readRune()
   573  			goto start
   574  		}
   575  
   576  		// Ignore blank lines, except in the REPL,
   577  		// where they emit OUTDENTs and NEWLINE.
   578  		if blank {
   579  			if sc.readline == nil {
   580  				sc.readRune()
   581  				goto start
   582  			} else if len(sc.indentstk) > 1 {
   583  				sc.dents = 1 - len(sc.indentstk)
   584  				sc.indentstk = sc.indentstk[:1]
   585  				goto start
   586  			}
   587  		}
   588  
   589  		// At top-level (not in an expression).
   590  		sc.startToken(val)
   591  		sc.readRune()
   592  		val.raw = "\n"
   593  		return NEWLINE
   594  	}
   595  
   596  	// end of file
   597  	if c == 0 {
   598  		// Emit OUTDENTs for unfinished indentation,
   599  		// preceded by a NEWLINE if we haven't just emitted one.
   600  		if len(sc.indentstk) > 1 {
   601  			if savedLineStart {
   602  				sc.dents = 1 - len(sc.indentstk)
   603  				sc.indentstk = sc.indentstk[:1]
   604  				goto start
   605  			} else {
   606  				sc.lineStart = true
   607  				sc.startToken(val)
   608  				val.raw = "\n"
   609  				return NEWLINE
   610  			}
   611  		}
   612  
   613  		sc.startToken(val)
   614  		sc.endToken(val)
   615  		return EOF
   616  	}
   617  
   618  	// line continuation
   619  	if c == '\\' {
   620  		sc.readRune()
   621  		if sc.peekRune() != '\n' {
   622  			sc.errorf(sc.pos, "stray backslash in program")
   623  		}
   624  		sc.readRune()
   625  		goto start
   626  	}
   627  
   628  	// start of the next token
   629  	sc.startToken(val)
   630  
   631  	// comma (common case)
   632  	if c == ',' {
   633  		sc.readRune()
   634  		sc.endToken(val)
   635  		return COMMA
   636  	}
   637  
   638  	// string literal
   639  	if c == '"' || c == '\'' {
   640  		return sc.scanString(val, c)
   641  	}
   642  
   643  	// identifier or keyword
   644  	if isIdentStart(c) {
   645  		if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
   646  			//  r"..."
   647  			//  b"..."
   648  			sc.readRune()
   649  			c = sc.peekRune()
   650  			return sc.scanString(val, c)
   651  		} else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
   652  			// rb"..."
   653  			sc.readRune()
   654  			sc.readRune()
   655  			c = sc.peekRune()
   656  			return sc.scanString(val, c)
   657  		}
   658  
   659  		for isIdent(c) {
   660  			sc.readRune()
   661  			c = sc.peekRune()
   662  		}
   663  		sc.endToken(val)
   664  		if k, ok := keywordToken[val.raw]; ok {
   665  			return k
   666  		}
   667  
   668  		return IDENT
   669  	}
   670  
   671  	// brackets
   672  	switch c {
   673  	case '[', '(', '{':
   674  		sc.depth++
   675  		sc.readRune()
   676  		sc.endToken(val)
   677  		switch c {
   678  		case '[':
   679  			return LBRACK
   680  		case '(':
   681  			return LPAREN
   682  		case '{':
   683  			return LBRACE
   684  		}
   685  		panic("unreachable")
   686  
   687  	case ']', ')', '}':
   688  		if sc.depth == 0 {
   689  			sc.errorf(sc.pos, "unexpected %q", c)
   690  		} else {
   691  			sc.depth--
   692  		}
   693  		sc.readRune()
   694  		sc.endToken(val)
   695  		switch c {
   696  		case ']':
   697  			return RBRACK
   698  		case ')':
   699  			return RPAREN
   700  		case '}':
   701  			return RBRACE
   702  		}
   703  		panic("unreachable")
   704  	}
   705  
   706  	// int or float literal, or period
   707  	if isdigit(c) || c == '.' {
   708  		return sc.scanNumber(val, c)
   709  	}
   710  
   711  	// other punctuation
   712  	defer sc.endToken(val)
   713  	switch c {
   714  	case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
   715  		start := sc.pos
   716  		sc.readRune()
   717  		if sc.peekRune() == '=' {
   718  			sc.readRune()
   719  			switch c {
   720  			case '<':
   721  				return LE
   722  			case '>':
   723  				return GE
   724  			case '=':
   725  				return EQL
   726  			case '!':
   727  				return NEQ
   728  			case '+':
   729  				return PLUS_EQ
   730  			case '-':
   731  				return MINUS_EQ
   732  			case '/':
   733  				return SLASH_EQ
   734  			case '%':
   735  				return PERCENT_EQ
   736  			case '&':
   737  				return AMP_EQ
   738  			case '|':
   739  				return PIPE_EQ
   740  			case '^':
   741  				return CIRCUMFLEX_EQ
   742  			}
   743  		}
   744  		switch c {
   745  		case '=':
   746  			return EQ
   747  		case '<':
   748  			if sc.peekRune() == '<' {
   749  				sc.readRune()
   750  				if sc.peekRune() == '=' {
   751  					sc.readRune()
   752  					return LTLT_EQ
   753  				} else {
   754  					return LTLT
   755  				}
   756  			}
   757  			return LT
   758  		case '>':
   759  			if sc.peekRune() == '>' {
   760  				sc.readRune()
   761  				if sc.peekRune() == '=' {
   762  					sc.readRune()
   763  					return GTGT_EQ
   764  				} else {
   765  					return GTGT
   766  				}
   767  			}
   768  			return GT
   769  		case '!':
   770  			sc.error(start, "unexpected input character '!'")
   771  		case '+':
   772  			return PLUS
   773  		case '-':
   774  			return MINUS
   775  		case '/':
   776  			if sc.peekRune() == '/' {
   777  				sc.readRune()
   778  				if sc.peekRune() == '=' {
   779  					sc.readRune()
   780  					return SLASHSLASH_EQ
   781  				} else {
   782  					return SLASHSLASH
   783  				}
   784  			}
   785  			return SLASH
   786  		case '%':
   787  			return PERCENT
   788  		case '&':
   789  			return AMP
   790  		case '|':
   791  			return PIPE
   792  		case '^':
   793  			return CIRCUMFLEX
   794  		}
   795  		panic("unreachable")
   796  
   797  	case ':', ';', '~': // single-char tokens (except comma)
   798  		sc.readRune()
   799  		switch c {
   800  		case ':':
   801  			return COLON
   802  		case ';':
   803  			return SEMI
   804  		case '~':
   805  			return TILDE
   806  		}
   807  		panic("unreachable")
   808  
   809  	case '*': // possibly followed by '*' or '='
   810  		sc.readRune()
   811  		switch sc.peekRune() {
   812  		case '*':
   813  			sc.readRune()
   814  			return STARSTAR
   815  		case '=':
   816  			sc.readRune()
   817  			return STAR_EQ
   818  		}
   819  		return STAR
   820  	}
   821  
   822  	sc.errorf(sc.pos, "unexpected input character %#q", c)
   823  	panic("unreachable")
   824  }
   825  
   826  func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
   827  	start := sc.pos
   828  	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
   829  	sc.readRune()
   830  
   831  	// String literals may contain escaped or unescaped newlines,
   832  	// causing them to span multiple lines (gulps) of REPL input;
   833  	// they are the only such token. Thus we cannot call endToken,
   834  	// as it assumes sc.rest is unchanged since startToken.
   835  	// Instead, buffer the token here.
   836  	// TODO(adonovan): opt: buffer only if we encounter a newline.
   837  	raw := new(strings.Builder)
   838  
   839  	// Copy the prefix, e.g. r' or " (see startToken).
   840  	raw.Write(sc.token[:len(sc.token)-len(sc.rest)])
   841  
   842  	if !triple {
   843  		// single-quoted string literal
   844  		for {
   845  			if sc.eof() {
   846  				sc.error(val.pos, "unexpected EOF in string")
   847  			}
   848  			c := sc.readRune()
   849  			raw.WriteRune(c)
   850  			if c == quote {
   851  				break
   852  			}
   853  			if c == '\n' {
   854  				sc.error(val.pos, "unexpected newline in string")
   855  			}
   856  			if c == '\\' {
   857  				if sc.eof() {
   858  					sc.error(val.pos, "unexpected EOF in string")
   859  				}
   860  				c = sc.readRune()
   861  				raw.WriteRune(c)
   862  			}
   863  		}
   864  	} else {
   865  		// triple-quoted string literal
   866  		sc.readRune()
   867  		raw.WriteRune(quote)
   868  		sc.readRune()
   869  		raw.WriteRune(quote)
   870  
   871  		quoteCount := 0
   872  		for {
   873  			if sc.eof() {
   874  				sc.error(val.pos, "unexpected EOF in string")
   875  			}
   876  			c := sc.readRune()
   877  			raw.WriteRune(c)
   878  			if c == quote {
   879  				quoteCount++
   880  				if quoteCount == 3 {
   881  					break
   882  				}
   883  			} else {
   884  				quoteCount = 0
   885  			}
   886  			if c == '\\' {
   887  				if sc.eof() {
   888  					sc.error(val.pos, "unexpected EOF in string")
   889  				}
   890  				c = sc.readRune()
   891  				raw.WriteRune(c)
   892  			}
   893  		}
   894  	}
   895  	val.raw = raw.String()
   896  
   897  	s, _, isByte, err := unquote(val.raw)
   898  	if err != nil {
   899  		sc.error(start, err.Error())
   900  	}
   901  	val.string = s
   902  	if isByte {
   903  		return BYTES
   904  	} else {
   905  		return STRING
   906  	}
   907  }
   908  
   909  func (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
   910  	// https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements
   911  	//
   912  	// Python features not supported:
   913  	// - integer literals of >64 bits of precision
   914  	// - 123L or 123l long suffix
   915  	// - traditional octal: 0755
   916  	// https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals
   917  
   918  	start := sc.pos
   919  	fraction, exponent := false, false
   920  
   921  	if c == '.' {
   922  		// dot or start of fraction
   923  		sc.readRune()
   924  		c = sc.peekRune()
   925  		if !isdigit(c) {
   926  			sc.endToken(val)
   927  			return DOT
   928  		}
   929  		fraction = true
   930  	} else if c == '0' {
   931  		// hex, octal, binary or float
   932  		sc.readRune()
   933  		c = sc.peekRune()
   934  
   935  		if c == '.' {
   936  			fraction = true
   937  		} else if c == 'x' || c == 'X' {
   938  			// hex
   939  			sc.readRune()
   940  			c = sc.peekRune()
   941  			if !isxdigit(c) {
   942  				sc.error(start, "invalid hex literal")
   943  			}
   944  			for isxdigit(c) {
   945  				sc.readRune()
   946  				c = sc.peekRune()
   947  			}
   948  		} else if c == 'o' || c == 'O' {
   949  			// octal
   950  			sc.readRune()
   951  			c = sc.peekRune()
   952  			if !isodigit(c) {
   953  				sc.error(sc.pos, "invalid octal literal")
   954  			}
   955  			for isodigit(c) {
   956  				sc.readRune()
   957  				c = sc.peekRune()
   958  			}
   959  		} else if c == 'b' || c == 'B' {
   960  			// binary
   961  			sc.readRune()
   962  			c = sc.peekRune()
   963  			if !isbdigit(c) {
   964  				sc.error(sc.pos, "invalid binary literal")
   965  			}
   966  			for isbdigit(c) {
   967  				sc.readRune()
   968  				c = sc.peekRune()
   969  			}
   970  		} else {
   971  			// float (or obsolete octal "0755")
   972  			allzeros, octal := true, true
   973  			for isdigit(c) {
   974  				if c != '0' {
   975  					allzeros = false
   976  				}
   977  				if c > '7' {
   978  					octal = false
   979  				}
   980  				sc.readRune()
   981  				c = sc.peekRune()
   982  			}
   983  			if c == '.' {
   984  				fraction = true
   985  			} else if c == 'e' || c == 'E' {
   986  				exponent = true
   987  			} else if octal && !allzeros {
   988  				sc.endToken(val)
   989  				sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:])
   990  			}
   991  		}
   992  	} else {
   993  		// decimal
   994  		for isdigit(c) {
   995  			sc.readRune()
   996  			c = sc.peekRune()
   997  		}
   998  
   999  		if c == '.' {
  1000  			fraction = true
  1001  		} else if c == 'e' || c == 'E' {
  1002  			exponent = true
  1003  		}
  1004  	}
  1005  
  1006  	if fraction {
  1007  		sc.readRune() // consume '.'
  1008  		c = sc.peekRune()
  1009  		for isdigit(c) {
  1010  			sc.readRune()
  1011  			c = sc.peekRune()
  1012  		}
  1013  
  1014  		if c == 'e' || c == 'E' {
  1015  			exponent = true
  1016  		}
  1017  	}
  1018  
  1019  	if exponent {
  1020  		sc.readRune() // consume [eE]
  1021  		c = sc.peekRune()
  1022  		if c == '+' || c == '-' {
  1023  			sc.readRune()
  1024  			c = sc.peekRune()
  1025  			if !isdigit(c) {
  1026  				sc.error(sc.pos, "invalid float literal")
  1027  			}
  1028  		}
  1029  		for isdigit(c) {
  1030  			sc.readRune()
  1031  			c = sc.peekRune()
  1032  		}
  1033  	}
  1034  
  1035  	sc.endToken(val)
  1036  	if fraction || exponent {
  1037  		var err error
  1038  		val.float, err = strconv.ParseFloat(val.raw, 64)
  1039  		if err != nil {
  1040  			sc.error(sc.pos, "invalid float literal")
  1041  		}
  1042  		return FLOAT
  1043  	} else {
  1044  		var err error
  1045  		s := val.raw
  1046  		val.bigInt = nil
  1047  		if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') {
  1048  			val.int, err = strconv.ParseInt(s[2:], 8, 64)
  1049  		} else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') {
  1050  			val.int, err = strconv.ParseInt(s[2:], 2, 64)
  1051  		} else {
  1052  			val.int, err = strconv.ParseInt(s, 0, 64)
  1053  			if err != nil {
  1054  				num := new(big.Int)
  1055  				var ok bool
  1056  				val.bigInt, ok = num.SetString(s, 0)
  1057  				if ok {
  1058  					err = nil
  1059  				}
  1060  			}
  1061  		}
  1062  		if err != nil {
  1063  			sc.error(start, "invalid int literal")
  1064  		}
  1065  		return INT
  1066  	}
  1067  }
  1068  
  1069  // isIdent reports whether c is an identifier rune.
  1070  func isIdent(c rune) bool {
  1071  	return isdigit(c) || isIdentStart(c)
  1072  }
  1073  
  1074  func isIdentStart(c rune) bool {
  1075  	return 'a' <= c && c <= 'z' ||
  1076  		'A' <= c && c <= 'Z' ||
  1077  		c == '_' ||
  1078  		unicode.IsLetter(c)
  1079  }
  1080  
  1081  func isdigit(c rune) bool  { return '0' <= c && c <= '9' }
  1082  func isodigit(c rune) bool { return '0' <= c && c <= '7' }
  1083  func isxdigit(c rune) bool { return isdigit(c) || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' }
  1084  func isbdigit(c rune) bool { return '0' == c || c == '1' }
  1085  
  1086  // keywordToken records the special tokens for
  1087  // strings that should not be treated as ordinary identifiers.
  1088  var keywordToken = map[string]Token{
  1089  	"and":      AND,
  1090  	"break":    BREAK,
  1091  	"continue": CONTINUE,
  1092  	"def":      DEF,
  1093  	"elif":     ELIF,
  1094  	"else":     ELSE,
  1095  	"for":      FOR,
  1096  	"if":       IF,
  1097  	"in":       IN,
  1098  	"lambda":   LAMBDA,
  1099  	"load":     LOAD,
  1100  	"not":      NOT,
  1101  	"or":       OR,
  1102  	"pass":     PASS,
  1103  	"return":   RETURN,
  1104  	"while":    WHILE,
  1105  
  1106  	// reserved words:
  1107  	"as": ILLEGAL,
  1108  	// "assert":   ILLEGAL, // heavily used by our tests
  1109  	"async":    ILLEGAL,
  1110  	"await":    ILLEGAL,
  1111  	"class":    ILLEGAL,
  1112  	"del":      ILLEGAL,
  1113  	"except":   ILLEGAL,
  1114  	"finally":  ILLEGAL,
  1115  	"from":     ILLEGAL,
  1116  	"global":   ILLEGAL,
  1117  	"import":   ILLEGAL,
  1118  	"is":       ILLEGAL,
  1119  	"nonlocal": ILLEGAL,
  1120  	"raise":    ILLEGAL,
  1121  	"try":      ILLEGAL,
  1122  	"with":     ILLEGAL,
  1123  	"yield":    ILLEGAL,
  1124  }