github.com/k14s/starlark-go@v0.0.0-20200720175618-3a5c849cc368/syntax/scan.go (about)

     1  // Copyright 2017 The Bazel Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // A lexical scanner for Starlark.
     8  
     9  import (
    10  	"fmt"
    11  	"io"
    12  	"io/ioutil"
    13  	"log"
    14  	"math/big"
    15  	"os"
    16  	"strconv"
    17  	"strings"
    18  	"unicode"
    19  	"unicode/utf8"
    20  )
    21  
// A Token represents a Starlark lexical token.
//
// The zero value is ILLEGAL.  The declaration order of the constants
// below is significant: tokenNames is indexed by Token value, and
// GoString treats the contiguous range PLUS..STARSTAR as punctuation.
type Token int8

const (
	ILLEGAL Token = iota
	EOF

	NEWLINE
	INDENT
	OUTDENT

	// Tokens with values
	IDENT  // x
	INT    // 123
	FLOAT  // 1.23e45
	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"

	// Punctuation
	PLUS          // +
	MINUS         // -
	STAR          // *
	SLASH         // /
	SLASHSLASH    // //
	PERCENT       // %
	AMP           // &
	PIPE          // |
	CIRCUMFLEX    // ^
	LTLT          // <<
	GTGT          // >>
	TILDE         // ~
	DOT           // .
	COMMA         // ,
	EQ            // =
	SEMI          // ;
	COLON         // :
	LPAREN        // (
	RPAREN        // )
	LBRACK        // [
	RBRACK        // ]
	LBRACE        // {
	RBRACE        // }
	LT            // <
	GT            // >
	GE            // >=
	LE            // <=
	EQL           // ==
	NEQ           // !=
	PLUS_EQ       // +=    (keep order consistent with PLUS..GTGT)
	MINUS_EQ      // -=
	STAR_EQ       // *=
	SLASH_EQ      // /=
	SLASHSLASH_EQ // //=
	PERCENT_EQ    // %=
	AMP_EQ        // &=
	PIPE_EQ       // |=
	CIRCUMFLEX_EQ // ^=
	LTLT_EQ       // <<=
	GTGT_EQ       // >>=
	STARSTAR      // **

	// Keywords
	AND
	BREAK
	CONTINUE
	DEF
	ELIF
	ELSE
	FOR
	IF
	IN
	LAMBDA
	LOAD
	NOT
	NOT_IN // synthesized by parser from NOT IN
	OR
	PASS
	RETURN
	WHILE

	// maxToken is one greater than the last token declared above (WHILE).
	maxToken
)
   103  
   104  func (tok Token) String() string { return tokenNames[tok] }
   105  
   106  // GoString is like String but quotes punctuation tokens.
   107  // Use Sprintf("%#v", tok) when constructing error messages.
   108  func (tok Token) GoString() string {
   109  	if tok >= PLUS && tok <= STARSTAR {
   110  		return "'" + tokenNames[tok] + "'"
   111  	}
   112  	return tokenNames[tok]
   113  }
   114  
// tokenNames holds the display string of each Token, indexed by the
// token's value.  Punctuation and keyword tokens map to their source
// spelling; the remaining tokens map to a short descriptive phrase.
// It is the table consulted by Token.String and Token.GoString.
var tokenNames = [...]string{
	ILLEGAL:       "illegal token",
	EOF:           "end of file",
	NEWLINE:       "newline",
	INDENT:        "indent",
	OUTDENT:       "outdent",
	IDENT:         "identifier",
	INT:           "int literal",
	FLOAT:         "float literal",
	STRING:        "string literal",
	PLUS:          "+",
	MINUS:         "-",
	STAR:          "*",
	SLASH:         "/",
	SLASHSLASH:    "//",
	PERCENT:       "%",
	AMP:           "&",
	PIPE:          "|",
	CIRCUMFLEX:    "^",
	LTLT:          "<<",
	GTGT:          ">>",
	TILDE:         "~",
	DOT:           ".",
	COMMA:         ",",
	EQ:            "=",
	SEMI:          ";",
	COLON:         ":",
	LPAREN:        "(",
	RPAREN:        ")",
	LBRACK:        "[",
	RBRACK:        "]",
	LBRACE:        "{",
	RBRACE:        "}",
	LT:            "<",
	GT:            ">",
	GE:            ">=",
	LE:            "<=",
	EQL:           "==",
	NEQ:           "!=",
	PLUS_EQ:       "+=",
	MINUS_EQ:      "-=",
	STAR_EQ:       "*=",
	SLASH_EQ:      "/=",
	SLASHSLASH_EQ: "//=",
	PERCENT_EQ:    "%=",
	AMP_EQ:        "&=",
	PIPE_EQ:       "|=",
	CIRCUMFLEX_EQ: "^=",
	LTLT_EQ:       "<<=",
	GTGT_EQ:       ">>=",
	STARSTAR:      "**",
	AND:           "and",
	BREAK:         "break",
	CONTINUE:      "continue",
	DEF:           "def",
	ELIF:          "elif",
	ELSE:          "else",
	FOR:           "for",
	IF:            "if",
	IN:            "in",
	LAMBDA:        "lambda",
	LOAD:          "load",
	NOT:           "not",
	NOT_IN:        "not in",
	OR:            "or",
	PASS:          "pass",
	RETURN:        "return",
	WHILE:         "while",
}
   184  
   185  // A Position describes the location of a rune of input.
   186  type Position struct {
   187  	file *string // filename (indirect for compactness)
   188  	Line int32   // 1-based line number; 0 if line unknown
   189  	Col  int32   // 1-based column (rune) number; 0 if column unknown
   190  }
   191  
   192  // IsValid reports whether the position is valid.
   193  func (p Position) IsValid() bool { return p.file != nil }
   194  
   195  // Filename returns the name of the file containing this position.
   196  func (p Position) Filename() string {
   197  	if p.file != nil {
   198  		return *p.file
   199  	}
   200  	return "<invalid>"
   201  }
   202  
   203  // MakePosition returns position with the specified components.
   204  func MakePosition(file *string, line, col int32) Position { return Position{file, line, col} }
   205  
   206  // add returns the position at the end of s, assuming it starts at p.
   207  func (p Position) add(s string) Position {
   208  	if n := strings.Count(s, "\n"); n > 0 {
   209  		p.Line += int32(n)
   210  		s = s[strings.LastIndex(s, "\n")+1:]
   211  		p.Col = 1
   212  	}
   213  	p.Col += int32(utf8.RuneCountInString(s))
   214  	return p
   215  }
   216  
   217  func (p Position) String() string {
   218  	file := p.Filename()
   219  	if p.Line > 0 {
   220  		if p.Col > 0 {
   221  			return fmt.Sprintf("%s:%d:%d", file, p.Line, p.Col)
   222  		}
   223  		return fmt.Sprintf("%s:%d", file, p.Line)
   224  	}
   225  	return file
   226  }
   227  
   228  func (p Position) isBefore(q Position) bool {
   229  	if p.Line != q.Line {
   230  		return p.Line < q.Line
   231  	}
   232  	return p.Col < q.Col
   233  }
   234  
// A scanner represents a single input file being parsed.
//
// Input is either held entirely in rest, or, when readline is
// non-nil (the REPL), fetched one line at a time on demand.
type scanner struct {
	rest           []byte    // rest of input (in REPL, a line of input)
	token          []byte    // token being scanned
	pos            Position  // current input position
	depth          int       // nesting of [ ] { } ( )
	indentstk      []int     // stack of indentation levels
	dents          int       // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
	lineStart      bool      // after NEWLINE; convert spaces to indentation tokens
	keepComments   bool      // accumulate comments in slice
	lineComments   []Comment // list of full line comments (if keepComments)
	suffixComments []Comment // list of suffix comments (if keepComments)

	readline func() ([]byte, error) // read next line of input (REPL only)
}
   250  
   251  func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
   252  	sc := &scanner{
   253  		pos:          Position{file: &filename, Line: 1, Col: 1},
   254  		indentstk:    make([]int, 1, 10), // []int{0} + spare capacity
   255  		lineStart:    true,
   256  		keepComments: keepComments,
   257  	}
   258  	sc.readline, _ = src.(func() ([]byte, error)) // REPL only
   259  	if sc.readline == nil {
   260  		data, err := readSource(filename, src)
   261  		if err != nil {
   262  			return nil, err
   263  		}
   264  		sc.rest = data
   265  	}
   266  	return sc, nil
   267  }
   268  
   269  func readSource(filename string, src interface{}) ([]byte, error) {
   270  	switch src := src.(type) {
   271  	case string:
   272  		return []byte(src), nil
   273  	case []byte:
   274  		return src, nil
   275  	case io.Reader:
   276  		data, err := ioutil.ReadAll(src)
   277  		if err != nil {
   278  			err = &os.PathError{Op: "read", Path: filename, Err: err}
   279  		}
   280  		return data, nil
   281  	case nil:
   282  		return ioutil.ReadFile(filename)
   283  	default:
   284  		return nil, fmt.Errorf("invalid source: %T", src)
   285  	}
   286  }
   287  
   288  // An Error describes the nature and position of a scanner or parser error.
   289  type Error struct {
   290  	Pos Position
   291  	Msg string
   292  }
   293  
   294  func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg }
   295  
   296  // errorf is called to report an error.
   297  // errorf does not return: it panics.
   298  func (sc *scanner) error(pos Position, s string) {
   299  	panic(Error{pos, s})
   300  }
   301  
   302  func (sc *scanner) errorf(pos Position, format string, args ...interface{}) {
   303  	sc.error(pos, fmt.Sprintf(format, args...))
   304  }
   305  
   306  func (sc *scanner) recover(err *error) {
   307  	// The scanner and parser panic both for routine errors like
   308  	// syntax errors and for programmer bugs like array index
   309  	// errors.  Turn both into error returns.  Catching bug panics
   310  	// is especially important when processing many files.
   311  	switch e := recover().(type) {
   312  	case nil:
   313  		// no panic
   314  	case Error:
   315  		*err = e
   316  	default:
   317  		*err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)}
   318  		if debug {
   319  			log.Fatal(*err)
   320  		}
   321  	}
   322  }
   323  
   324  // eof reports whether the input has reached end of file.
   325  func (sc *scanner) eof() bool {
   326  	return len(sc.rest) == 0 && !sc.readLine()
   327  }
   328  
   329  // readLine attempts to read another line of input.
   330  // Precondition: len(sc.rest)==0.
   331  func (sc *scanner) readLine() bool {
   332  	if sc.readline != nil {
   333  		var err error
   334  		sc.rest, err = sc.readline()
   335  		if err != nil {
   336  			sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
   337  		}
   338  		return len(sc.rest) > 0
   339  	}
   340  	return false
   341  }
   342  
   343  // peekRune returns the next rune in the input without consuming it.
   344  // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
   345  func (sc *scanner) peekRune() rune {
   346  	// TODO(adonovan): opt: measure and perhaps inline eof.
   347  	if sc.eof() {
   348  		return 0
   349  	}
   350  
   351  	// fast path: ASCII
   352  	if b := sc.rest[0]; b < utf8.RuneSelf {
   353  		if b == '\r' {
   354  			return '\n'
   355  		}
   356  		return rune(b)
   357  	}
   358  
   359  	r, _ := utf8.DecodeRune(sc.rest)
   360  	return r
   361  }
   362  
// readRune consumes and returns the next rune in the input,
// advancing sc.pos past it.
// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
//
// Calling readRune at end of file is an internal error, reported
// through sc.error (which panics).
func (sc *scanner) readRune() rune {
	// eof() has been inlined here, both to avoid a call
	// and to establish len(rest)>0 to avoid a bounds check.
	if len(sc.rest) == 0 {
		if !sc.readLine() {
			sc.error(sc.pos, "internal scanner error: readRune at EOF")
		}
		// Redundant, but eliminates the bounds-check below.
		if len(sc.rest) == 0 {
			return 0
		}
	}

	// fast path: ASCII
	if b := sc.rest[0]; b < utf8.RuneSelf {
		r := rune(b)
		sc.rest = sc.rest[1:]
		if r == '\r' {
			// "\r\n" (DOS) is consumed as a unit; a lone "\r" (Mac)
			// is also reported as '\n'.
			if len(sc.rest) > 0 && sc.rest[0] == '\n' {
				sc.rest = sc.rest[1:]
			}
			r = '\n'
		}
		if r == '\n' {
			sc.pos.Line++
			sc.pos.Col = 1
		} else {
			sc.pos.Col++
		}
		return r
	}

	// Non-ASCII: decode one UTF-8 sequence; it counts as a single column.
	r, size := utf8.DecodeRune(sc.rest)
	sc.rest = sc.rest[size:]
	sc.pos.Col++
	return r
}
   402  
// tokenValue records the position and value associated with each token.
// The raw and pos fields are set by startToken/endToken; the decoded
// fields are filled in by the scanning routine for the token's kind.
type tokenValue struct {
	raw    string   // raw text of token
	int    int64    // decoded int
	bigInt *big.Int // decoded integers > int64
	float  float64  // decoded float
	string string   // decoded string
	pos    Position // start position of token
}
   412  
   413  // startToken marks the beginning of the next input token.
   414  // It must be followed by a call to endToken once the token has
   415  // been consumed using readRune.
   416  func (sc *scanner) startToken(val *tokenValue) {
   417  	sc.token = sc.rest
   418  	val.raw = ""
   419  	val.pos = sc.pos
   420  }
   421  
   422  // endToken marks the end of an input token.
   423  // It records the actual token string in val.raw if the caller
   424  // has not done that already.
   425  func (sc *scanner) endToken(val *tokenValue) {
   426  	if val.raw == "" {
   427  		val.raw = string(sc.token[:len(sc.token)-len(sc.rest)])
   428  	}
   429  }
   430  
// nextToken is called by the parser to obtain the next input token.
// It returns the token value and sets val to the data associated with
// the token.
//
// For all our input tokens, the associated data is val.pos (the
// position where the token begins), val.raw (the input string
// corresponding to the token).  For string and int tokens, the string
// and int fields additionally contain the token's interpreted value.
//
// Malformed input is reported through sc.error/sc.errorf, which panic.
func (sc *scanner) nextToken(val *tokenValue) Token {

	// The following distribution of tokens guides case ordering:
	//
	//      COMMA          27   %
	//      STRING         23   %
	//      IDENT          15   %
	//      EQL            11   %
	//      LBRACK          5.5 %
	//      RBRACK          5.5 %
	//      NEWLINE         3   %
	//      LPAREN          2.9 %
	//      RPAREN          2.9 %
	//      INT             2   %
	//      others        < 1   %
	//
	// Although NEWLINE tokens are infrequent, and lineStart is
	// usually (~97%) false on entry, skipped newlines account for
	// about 50% of all iterations of the 'start' loop.

start:
	var c rune

	// Deal with leading spaces and indentation.
	blank := false
	savedLineStart := sc.lineStart
	if sc.lineStart {
		sc.lineStart = false
		col := 0
		for {
			c = sc.peekRune()
			if c == ' ' {
				col++
				sc.readRune()
			} else if c == '\t' {
				// A tab advances by the distance from the current
				// input column to the next multiple-of-8 tab stop.
				const tab = 8
				col += int(tab - (sc.pos.Col-1)%tab)
				sc.readRune()
			} else {
				break
			}
		}

		// The third clause matches EOF.
		if c == '#' || c == '\n' || c == 0 {
			blank = true
		}

		// Compute indentation level for non-blank lines not
		// inside an expression.  This is not the common case.
		if !blank && sc.depth == 0 {
			cur := sc.indentstk[len(sc.indentstk)-1]
			if col > cur {
				// indent
				sc.dents++
				sc.indentstk = append(sc.indentstk, col)
			} else if col < cur {
				// outdent(s)
				for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
					sc.dents--
					sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
				}
				if col != sc.indentstk[len(sc.indentstk)-1] {
					sc.error(sc.pos, "unindent does not match any outer indentation level")
				}
			}
		}
	}

	// Return saved indentation tokens.
	// These consume no input, so their raw text is empty.
	if sc.dents != 0 {
		sc.startToken(val)
		sc.endToken(val)
		if sc.dents < 0 {
			sc.dents++
			return OUTDENT
		} else {
			sc.dents--
			return INDENT
		}
	}

	// start of line proper
	c = sc.peekRune()

	// Skip spaces.
	for c == ' ' || c == '\t' {
		sc.readRune()
		c = sc.peekRune()
	}

	// comment
	if c == '#' {
		if sc.keepComments {
			sc.startToken(val)
		}
		// Consume up to newline (included).
		for c != 0 && c != '\n' {
			sc.readRune()
			c = sc.peekRune()
		}
		if sc.keepComments {
			sc.endToken(val)
			// A comment on an otherwise blank line is a line comment;
			// one that follows other tokens is a suffix comment.
			if blank {
				sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
			} else {
				sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
			}
		}
	}

	// newline
	if c == '\n' {
		sc.lineStart = true

		// Ignore newlines within expressions (common case).
		if sc.depth > 0 {
			sc.readRune()
			goto start
		}

		// Ignore blank lines, except in the REPL,
		// where they emit OUTDENTs and NEWLINE.
		if blank {
			if sc.readline == nil {
				sc.readRune()
				goto start
			} else if len(sc.indentstk) > 1 {
				// Pop every open indentation level; the newline is
				// left unread so it is seen again after the OUTDENTs.
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			}
		}

		// At top-level (not in an expression).
		sc.startToken(val)
		sc.readRune()
		val.raw = "\n"
		return NEWLINE
	}

	// end of file
	if c == 0 {
		// Emit OUTDENTs for unfinished indentation,
		// preceded by a NEWLINE if we haven't just emitted one.
		if len(sc.indentstk) > 1 {
			if savedLineStart {
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			} else {
				sc.lineStart = true
				sc.startToken(val)
				val.raw = "\n"
				return NEWLINE
			}
		}

		sc.startToken(val)
		sc.endToken(val)
		return EOF
	}

	// line continuation
	if c == '\\' {
		sc.readRune()
		if sc.peekRune() != '\n' {
			sc.errorf(sc.pos, "stray backslash in program")
		}
		sc.readRune()
		goto start
	}

	// start of the next token
	sc.startToken(val)

	// comma (common case)
	if c == ',' {
		sc.readRune()
		sc.endToken(val)
		return COMMA
	}

	// string literal
	if c == '"' || c == '\'' {
		return sc.scanString(val, c)
	}

	// identifier or keyword
	if isIdentStart(c) {
		// raw string literal
		if c == 'r' && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
			sc.readRune()
			c = sc.peekRune()
			return sc.scanString(val, c)
		}

		for isIdent(c) {
			sc.readRune()
			c = sc.peekRune()
		}
		sc.endToken(val)
		// Keywords and reserved words share the identifier syntax;
		// keywordToken distinguishes them.
		if k, ok := keywordToken[val.raw]; ok {
			return k
		}

		return IDENT
	}

	// brackets
	switch c {
	case '[', '(', '{':
		sc.depth++
		sc.readRune()
		sc.endToken(val)
		switch c {
		case '[':
			return LBRACK
		case '(':
			return LPAREN
		case '{':
			return LBRACE
		}
		panic("unreachable")

	case ']', ')', '}':
		if sc.depth == 0 {
			sc.errorf(sc.pos, "unexpected %q", c)
		} else {
			sc.depth--
		}
		sc.readRune()
		sc.endToken(val)
		switch c {
		case ']':
			return RBRACK
		case ')':
			return RPAREN
		case '}':
			return RBRACE
		}
		panic("unreachable")
	}

	// int or float literal, or period
	if isdigit(c) || c == '.' {
		return sc.scanNumber(val, c)
	}

	// other punctuation
	// The deferred call records the token's raw text on every return below.
	defer sc.endToken(val)
	switch c {
	case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
		start := sc.pos
		sc.readRune()
		if sc.peekRune() == '=' {
			sc.readRune()
			switch c {
			case '<':
				return LE
			case '>':
				return GE
			case '=':
				return EQL
			case '!':
				return NEQ
			case '+':
				return PLUS_EQ
			case '-':
				return MINUS_EQ
			case '/':
				return SLASH_EQ
			case '%':
				return PERCENT_EQ
			case '&':
				return AMP_EQ
			case '|':
				return PIPE_EQ
			case '^':
				return CIRCUMFLEX_EQ
			}
		}
		switch c {
		case '=':
			return EQ
		case '<':
			if sc.peekRune() == '<' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return LTLT_EQ
				} else {
					return LTLT
				}
			}
			return LT
		case '>':
			if sc.peekRune() == '>' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return GTGT_EQ
				} else {
					return GTGT
				}
			}
			return GT
		case '!':
			// '!' is accepted only as the first half of "!=", handled above.
			sc.error(start, "unexpected input character '!'")
		case '+':
			return PLUS
		case '-':
			return MINUS
		case '/':
			if sc.peekRune() == '/' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return SLASHSLASH_EQ
				} else {
					return SLASHSLASH
				}
			}
			return SLASH
		case '%':
			return PERCENT
		case '&':
			return AMP
		case '|':
			return PIPE
		case '^':
			return CIRCUMFLEX
		}
		panic("unreachable")

	case ':', ';', '~': // single-char tokens (except comma)
		sc.readRune()
		switch c {
		case ':':
			return COLON
		case ';':
			return SEMI
		case '~':
			return TILDE
		}
		panic("unreachable")

	case '*': // possibly followed by '*' or '='
		sc.readRune()
		switch sc.peekRune() {
		case '*':
			sc.readRune()
			return STARSTAR
		case '=':
			sc.readRune()
			return STAR_EQ
		}
		return STAR
	}

	sc.errorf(sc.pos, "unexpected input character %#q", c)
	panic("unreachable")
}
   802  
// scanString scans a string literal whose opening quote rune, quote,
// is the next rune of input.  For a raw string literal, the caller
// has already consumed the 'r' prefix.  It sets val.raw to the text
// of the literal and val.string to the result of unquoting it, and
// returns STRING.
//
// An unterminated literal, a newline inside a single-quoted literal,
// or an unquote failure is reported through sc.error, which panics.
func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
	start := sc.pos
	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
	sc.readRune()
	if !triple {
		// Precondition: startToken was already called.
		for {
			if sc.eof() {
				sc.error(val.pos, "unexpected EOF in string")
			}
			c := sc.readRune()
			if c == quote {
				break
			}
			if c == '\n' {
				sc.error(val.pos, "unexpected newline in string")
			}
			if c == '\\' {
				if sc.eof() {
					sc.error(val.pos, "unexpected EOF in string")
				}
				// Consume the escaped rune so that an escaped quote
				// does not terminate the literal.
				sc.readRune()
			}
		}
		sc.endToken(val)
	} else {
		// triple-quoted string literal
		sc.readRune()
		sc.readRune()

		// A triple-quoted string literal may span multiple
		// gulps of REPL input; it is the only such token.
		// Thus we must avoid {start,end}Token.
		raw := new(strings.Builder)

		// Copy the prefix, e.g. r''' or """ (see startToken).
		raw.Write(sc.token[:len(sc.token)-len(sc.rest)])

		// quoteCount is the length of the current run of
		// consecutive quote runes; three in a row end the literal.
		quoteCount := 0
		for {
			if sc.eof() {
				sc.error(val.pos, "unexpected EOF in string")
			}
			c := sc.readRune()
			raw.WriteRune(c)
			if c == quote {
				quoteCount++
				if quoteCount == 3 {
					break
				}
			} else {
				quoteCount = 0
			}
			if c == '\\' {
				if sc.eof() {
					sc.error(val.pos, "unexpected EOF in string")
				}
				c = sc.readRune()
				raw.WriteRune(c)
			}
		}
		val.raw = raw.String()
	}

	s, _, err := unquote(val.raw)
	if err != nil {
		sc.error(start, err.Error())
	}
	val.string = s
	return STRING
}
   874  
// scanNumber scans an int or float literal, or a lone period.
// c is the first rune of the token, which has been peeked but not
// yet consumed.  The result is DOT, FLOAT (with val.float set), or
// INT (with val.int set, or val.bigInt set when the base-0 parse
// does not fit in an int64).  It calls endToken on every normal
// return, so val.raw holds the token text.
//
// Malformed literals are reported through sc.error/sc.errorf,
// which panic.
func (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
	// https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements
	//
	// Python features not supported:
	// - integer literals of >64 bits of precision
	// - 123L or 123l long suffix
	// - traditional octal: 0755
	// https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals
	//
	// NOTE(review): the first item above looks stale.  The int branch
	// at the end of this function falls back to math/big when the
	// base-0 ParseInt fails; only the 0o and 0b branches have no such
	// fallback.  Confirm and update the list.

	start := sc.pos
	fraction, exponent := false, false

	if c == '.' {
		// dot or start of fraction
		sc.readRune()
		c = sc.peekRune()
		if !isdigit(c) {
			sc.endToken(val)
			return DOT
		}
		fraction = true
	} else if c == '0' {
		// hex, octal, binary or float
		sc.readRune()
		c = sc.peekRune()

		if c == '.' {
			fraction = true
		} else if c == 'x' || c == 'X' {
			// hex
			sc.readRune()
			c = sc.peekRune()
			if !isxdigit(c) {
				sc.error(start, "invalid hex literal")
			}
			for isxdigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else if c == 'o' || c == 'O' {
			// octal
			sc.readRune()
			c = sc.peekRune()
			if !isodigit(c) {
				sc.error(sc.pos, "invalid octal literal")
			}
			for isodigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else if c == 'b' || c == 'B' {
			// binary
			sc.readRune()
			c = sc.peekRune()
			if !isbdigit(c) {
				sc.error(sc.pos, "invalid binary literal")
			}
			for isbdigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else {
			// float (or obsolete octal "0755")
			allzeros, octal := true, true
			for isdigit(c) {
				if c != '0' {
					allzeros = false
				}
				if c > '7' {
					octal = false
				}
				sc.readRune()
				c = sc.peekRune()
			}
			if c == '.' {
				fraction = true
			} else if c == 'e' || c == 'E' {
				exponent = true
			} else if octal && !allzeros {
				// endToken is called first so that val.raw is
				// available for the suggestion in the message.
				sc.endToken(val)
				sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:])
			}
		}
	} else {
		// decimal
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}

		if c == '.' {
			fraction = true
		} else if c == 'e' || c == 'E' {
			exponent = true
		}
	}

	if fraction {
		sc.readRune() // consume '.'
		c = sc.peekRune()
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}

		if c == 'e' || c == 'E' {
			exponent = true
		}
	}

	if exponent {
		sc.readRune() // consume [eE]
		c = sc.peekRune()
		if c == '+' || c == '-' {
			sc.readRune()
			c = sc.peekRune()
			if !isdigit(c) {
				sc.error(sc.pos, "invalid float literal")
			}
		}
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}
	}

	sc.endToken(val)
	if fraction || exponent {
		var err error
		val.float, err = strconv.ParseFloat(val.raw, 64)
		if err != nil {
			sc.error(sc.pos, "invalid float literal")
		}
		return FLOAT
	} else {
		var err error
		s := val.raw
		val.bigInt = nil
		// The 0o and 0b prefixes are stripped and parsed with an
		// explicit base; everything else is parsed with base 0, which
		// lets strconv infer the base from the literal's prefix.
		if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') {
			val.int, err = strconv.ParseInt(s[2:], 8, 64)
		} else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') {
			val.int, err = strconv.ParseInt(s[2:], 2, 64)
		} else {
			val.int, err = strconv.ParseInt(s, 0, 64)
			if err != nil {
				// Retry with arbitrary precision; success clears the error.
				num := new(big.Int)
				var ok bool = true
				val.bigInt, ok = num.SetString(s, 0)
				if ok {
					err = nil
				}
			}
		}
		if err != nil {
			sc.error(start, "invalid int literal")
		}
		return INT
	}
}
  1034  
  1035  // isIdent reports whether c is an identifier rune.
  1036  func isIdent(c rune) bool {
  1037  	return isdigit(c) || isIdentStart(c)
  1038  }
  1039  
  1040  func isIdentStart(c rune) bool {
  1041  	return 'a' <= c && c <= 'z' ||
  1042  		'A' <= c && c <= 'Z' ||
  1043  		c == '_' ||
  1044  		unicode.IsLetter(c)
  1045  }
  1046  
  1047  func isdigit(c rune) bool  { return '0' <= c && c <= '9' }
  1048  func isodigit(c rune) bool { return '0' <= c && c <= '7' }
  1049  func isxdigit(c rune) bool { return isdigit(c) || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' }
  1050  func isbdigit(c rune) bool { return '0' == c || c == '1' }
  1051  
// keywordToken records the special tokens for
// strings that should not be treated as ordinary identifiers.
// Keywords map to their own token; reserved words, which have no
// token of their own, map to ILLEGAL.
var keywordToken = map[string]Token{
	"and":      AND,
	"break":    BREAK,
	"continue": CONTINUE,
	"def":      DEF,
	"elif":     ELIF,
	"else":     ELSE,
	"for":      FOR,
	"if":       IF,
	"in":       IN,
	"lambda":   LAMBDA,
	"load":     LOAD,
	"not":      NOT,
	"or":       OR,
	"pass":     PASS,
	"return":   RETURN,
	"while":    WHILE,

	// reserved words:
	"as": ILLEGAL,
	// "assert":   ILLEGAL, // heavily used by our tests
	"class":    ILLEGAL,
	"del":      ILLEGAL,
	"except":   ILLEGAL,
	"finally":  ILLEGAL,
	"from":     ILLEGAL,
	"global":   ILLEGAL,
	"import":   ILLEGAL,
	"is":       ILLEGAL,
	"nonlocal": ILLEGAL,
	"raise":    ILLEGAL,
	"try":      ILLEGAL,
	"with":     ILLEGAL,
	"yield":    ILLEGAL,
}