github.com/lab47/exprcore@v0.0.0-20210525052339-fb7d6bd9331e/syntax/scan.go (about)

     1  // Copyright 2017 The Bazel Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // A lexical scanner for exprcore.
     8  
     9  import (
    10  	"fmt"
    11  	"io"
    12  	"io/ioutil"
    13  	"log"
    14  	"math/big"
    15  	"os"
    16  	"strconv"
    17  	"strings"
    18  	"unicode"
    19  	"unicode/utf8"
    20  )
    21  
    22  // A Token represents an exprcore lexical token.
        // Token values index the tokenNames table (see String and GoString).
    23  type Token int8
    24  
    25  const (
    26  	ILLEGAL Token = iota
    27  	EOF
    28  
        	// Layout tokens. nextToken, as written in this file, does not return
        	// these: its indentation handling is commented out and a significant
        	// line end is reported as SEMI instead.
    29  	NEWLINE
    30  	INDENT
    31  	OUTDENT
    32  
    33  	// Tokens with values
    34  	IDENT  // x
    35  	INT    // 123
    36  	FLOAT  // 1.23e45
    37  	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
    38  
    39  	SHELL        //  $ foo bar
    40  	DSHELL_START // the start of a shell expression containing a ${...} expansion
    41  	DSHELL_PART  // shell text between two expansions within a shell expression
    42  	DSHELL_END   // the end of a DSHELL
    43  
    44  	// Punctuation
    45  	PLUS          // +
    46  	MINUS         // -
    47  	STAR          // *
    48  	SLASH         // /
    49  	SLASHSLASH    // //
    50  	PERCENT       // %
    51  	AMP           // &
    52  	PIPE          // |
    53  	CIRCUMFLEX    // ^
    54  	LTLT          // <<
    55  	GTGT          // >>
    56  	TILDE         // ~
    57  	DOT           // .
    58  	COMMA         // ,
    59  	EQ            // =
    60  	SEMI          // ;
    61  	COLON         // :
    62  	LPAREN        // (
    63  	RPAREN        // )
    64  	LBRACK        // [
    65  	RBRACK        // ]
    66  	LBRACE        // {
    67  	RBRACE        // }
    68  	LT            // <
    69  	GT            // >
    70  	GE            // >=
    71  	LE            // <=
    72  	EQL           // ==
    73  	NEQ           // !=
    74  	PLUS_EQ       // +=    (keep order consistent with PLUS..GTGT)
    75  	MINUS_EQ      // -=
    76  	STAR_EQ       // *=
    77  	SLASH_EQ      // /=
    78  	SLASHSLASH_EQ // //=
    79  	PERCENT_EQ    // %=
    80  	AMP_EQ        // &=
    81  	PIPE_EQ       // |=
    82  	CIRCUMFLEX_EQ // ^=
    83  	LTLT_EQ       // <<=
    84  	GTGT_EQ       // >>=
    85  	ARROW         // =>
    86  	AT            // @
    87  	PERCENT_BRACE // %{
    88  	STARSTAR      // **  (last token that GoString quotes)
    89  
    90  	// Keywords; their spellings are listed in tokenNames.
    91  	AND
    92  	BREAK
    93  	CONTINUE
    94  	DEF
    95  	ELIF
    96  	ELSE
    97  	FOR
    98  	IF
    99  	IN
   100  	LAMBDA
   101  	LOAD
   102  	IMPORT
   103  	AS
   104  	USING
   105  	NOT
   106  	NOT_IN // synthesized by parser from NOT IN
   107  	OR
   108  	PASS
   109  	RETURN
   110  	WHILE
   111  
   112  	maxToken // one past the last token; it has no entry in tokenNames
   113  )
   114  
   115  func (tok Token) String() string { return tokenNames[tok] }
   116  
   117  // GoString is like String but quotes punctuation tokens.
   118  // Use Sprintf("%#v", tok) when constructing error messages.
   119  func (tok Token) GoString() string {
   120  	if tok >= PLUS && tok <= STARSTAR {
   121  		return "'" + tokenNames[tok] + "'"
   122  	}
   123  	return tokenNames[tok]
   124  }
   125  
        // tokenNames holds the display name of every Token and is what
        // Token.String and Token.GoString return. The literal is keyed by
        // constant, so the order of the entries below is free to differ from
        // the declaration order of the constants (as it does for
        // PERCENT_BRACE, STARSTAR, ARROW and AT). maxToken has no entry.
   126  var tokenNames = [...]string{
   127  	ILLEGAL:       "illegal token",
   128  	EOF:           "end of file",
   129  	NEWLINE:       "newline",
   130  	INDENT:        "indent",
   131  	OUTDENT:       "outdent",
   132  	IDENT:         "identifier",
   133  	INT:           "int literal",
   134  	FLOAT:         "float literal",
   135  	STRING:        "string literal",
   136  	SHELL:         "a shell expression",
   137  	DSHELL_START:  "the start of a dynamic shell expression",
   138  	DSHELL_PART:   "part of a dynamic shell expression",
   139  	DSHELL_END:    "the end of a dynamic shell expression",
   140  	PLUS:          "+",
   141  	MINUS:         "-",
   142  	STAR:          "*",
   143  	SLASH:         "/",
   144  	SLASHSLASH:    "//",
   145  	PERCENT:       "%",
   146  	PERCENT_BRACE: "%{",
   147  	AMP:           "&",
   148  	PIPE:          "|",
   149  	CIRCUMFLEX:    "^",
   150  	LTLT:          "<<",
   151  	GTGT:          ">>",
   152  	TILDE:         "~",
   153  	DOT:           ".",
   154  	COMMA:         ",",
   155  	EQ:            "=",
   156  	SEMI:          ";",
   157  	COLON:         ":",
   158  	LPAREN:        "(",
   159  	RPAREN:        ")",
   160  	LBRACK:        "[",
   161  	RBRACK:        "]",
   162  	LBRACE:        "{",
   163  	RBRACE:        "}",
   164  	LT:            "<",
   165  	GT:            ">",
   166  	GE:            ">=",
   167  	LE:            "<=",
   168  	EQL:           "==",
   169  	NEQ:           "!=",
   170  	PLUS_EQ:       "+=",
   171  	MINUS_EQ:      "-=",
   172  	STAR_EQ:       "*=",
   173  	SLASH_EQ:      "/=",
   174  	SLASHSLASH_EQ: "//=",
   175  	PERCENT_EQ:    "%=",
   176  	AMP_EQ:        "&=",
   177  	PIPE_EQ:       "|=",
   178  	CIRCUMFLEX_EQ: "^=",
   179  	LTLT_EQ:       "<<=",
   180  	GTGT_EQ:       ">>=",
   181  	STARSTAR:      "**",
   182  	ARROW:         "=>",
   183  	AT:            "@",
   184  	AND:           "and",
   185  	BREAK:         "break",
   186  	CONTINUE:      "continue",
   187  	DEF:           "def",
   188  	ELIF:          "elif",
   189  	ELSE:          "else",
   190  	FOR:           "for",
   191  	IF:            "if",
   192  	IN:            "in",
   193  	LAMBDA:        "lambda",
   194  	LOAD:          "load",
   195  	IMPORT:        "import",
   196  	AS:            "as",
   197  	USING:         "using",
   198  	NOT:           "not",
   199  	NOT_IN:        "not in",
   200  	OR:            "or",
   201  	PASS:          "pass",
   202  	RETURN:        "return",
   203  	WHILE:         "while",
   204  }
   205  
   206  // A Position describes the location of a rune of input.
   207  type Position struct {
   208  	file *string // filename (indirect for compactness)
   209  	Line int32   // 1-based line number; 0 if line unknown
   210  	Col  int32   // 1-based column (rune) number; 0 if column unknown
   211  }
   212  
   213  // IsValid reports whether the position is valid.
   214  func (p Position) IsValid() bool { return p.file != nil }
   215  
   216  // Filename returns the name of the file containing this position.
   217  func (p Position) Filename() string {
   218  	if p.file != nil {
   219  		return *p.file
   220  	}
   221  	return "<invalid>"
   222  }
   223  
   224  // MakePosition returns position with the specified components.
   225  func MakePosition(file *string, line, col int32) Position { return Position{file, line, col} }
   226  
   227  // add returns the position at the end of s, assuming it starts at p.
   228  func (p Position) add(s string) Position {
   229  	if n := strings.Count(s, "\n"); n > 0 {
   230  		p.Line += int32(n)
   231  		s = s[strings.LastIndex(s, "\n")+1:]
   232  		p.Col = 1
   233  	}
   234  	p.Col += int32(utf8.RuneCountInString(s))
   235  	return p
   236  }
   237  
   238  func (p Position) String() string {
   239  	file := p.Filename()
   240  	if p.Line > 0 {
   241  		if p.Col > 0 {
   242  			return fmt.Sprintf("%s:%d:%d", file, p.Line, p.Col)
   243  		}
   244  		return fmt.Sprintf("%s:%d", file, p.Line)
   245  	}
   246  	return file
   247  }
   248  
   249  func (p Position) isBefore(q Position) bool {
   250  	if p.Line != q.Line {
   251  		return p.Line < q.Line
   252  	}
   253  	return p.Col < q.Col
   254  }
   255  
   256  // A scanner represents a single input file being parsed.
        // It is created by newScanner and driven by repeated calls to nextToken.
   257  type scanner struct {
   258  	rest           []byte    // rest of input (in REPL, a line of input)
   259  	token          []byte    // token being scanned: the value of rest when startToken was called
   260  	pos            Position  // current input position
   261  	depth          int       // nesting of [ ] { } ( ); also incremented by %{
   262  	indentstk      []int     // stack of indentation levels
   263  	dents          int       // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
   264  	lineStart      bool      // after NEWLINE; convert spaces to indentation tokens
   265  	keepComments   bool      // accumulate comments in slice
   266  	lineComments   []Comment // list of full line comments (if keepComments)
   267  	suffixComments []Comment // list of suffix comments (if keepComments)
   268  
   269  	insertSemi bool // when set, nextToken reports the next newline (or end of file) as SEMI
   270  
   271  	interpDepth     int // how far inside ${'s we are; incremented by scanShell and scanMoreShell
   272  	interpExprDepth int // how far inside `'s we are; incremented by scanShellExpr and scanMoreShellExpr at a ${
   273  
   274  	readline func() ([]byte, error) // read next line of input (REPL only)
   275  }
   276  
   277  func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
   278  	sc := &scanner{
   279  		pos:          Position{file: &filename, Line: 1, Col: 1},
   280  		indentstk:    make([]int, 1, 10), // []int{0} + spare capacity
   281  		lineStart:    true,
   282  		keepComments: keepComments,
   283  	}
   284  	sc.readline, _ = src.(func() ([]byte, error)) // REPL only
   285  	if sc.readline == nil {
   286  		data, err := readSource(filename, src)
   287  		if err != nil {
   288  			return nil, err
   289  		}
   290  		sc.rest = data
   291  	}
   292  	return sc, nil
   293  }
   294  
   295  func readSource(filename string, src interface{}) ([]byte, error) {
   296  	switch src := src.(type) {
   297  	case string:
   298  		return []byte(src), nil
   299  	case []byte:
   300  		return src, nil
   301  	case io.Reader:
   302  		data, err := ioutil.ReadAll(src)
   303  		if err != nil {
   304  			err = &os.PathError{Op: "read", Path: filename, Err: err}
   305  			return nil, err
   306  		}
   307  		return data, nil
   308  	case nil:
   309  		return ioutil.ReadFile(filename)
   310  	default:
   311  		return nil, fmt.Errorf("invalid source: %T", src)
   312  	}
   313  }
   314  
   315  // An Error describes the nature and position of a scanner or parser error.
   316  type Error struct {
   317  	Pos Position
   318  	Msg string
   319  }
   320  
   321  func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg }
   322  
   323  // errorf is called to report an error.
   324  // errorf does not return: it panics.
   325  func (sc *scanner) error(pos Position, s string) {
   326  	panic(Error{pos, s})
   327  }
   328  
   329  func (sc *scanner) errorf(pos Position, format string, args ...interface{}) {
   330  	sc.error(pos, fmt.Sprintf(format, args...))
   331  }
   332  
   333  func (sc *scanner) recover(err *error) {
   334  	// The scanner and parser panic both for routine errors like
   335  	// syntax errors and for programmer bugs like array index
   336  	// errors.  Turn both into error returns.  Catching bug panics
   337  	// is especially important when processing many files.
   338  	switch e := recover().(type) {
   339  	case nil:
   340  		// no panic
   341  	case Error:
   342  		*err = e
   343  	default:
   344  		*err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)}
   345  		if debug {
   346  			log.Fatal(*err)
   347  		}
   348  	}
   349  }
   350  
   351  // eof reports whether the input has reached end of file.
   352  func (sc *scanner) eof() bool {
   353  	return len(sc.rest) == 0 && !sc.readLine()
   354  }
   355  
   356  // readLine attempts to read another line of input.
   357  // Precondition: len(sc.rest)==0.
   358  func (sc *scanner) readLine() bool {
   359  	if sc.readline != nil {
   360  		var err error
   361  		sc.rest, err = sc.readline()
   362  		if err != nil {
   363  			sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
   364  		}
   365  		return len(sc.rest) > 0
   366  	}
   367  	return false
   368  }
   369  
   370  // peekRune returns the next rune in the input without consuming it.
   371  // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
   372  func (sc *scanner) peekRune() rune {
   373  	// TODO(adonovan): opt: measure and perhaps inline eof.
   374  	if sc.eof() {
   375  		return 0
   376  	}
   377  
   378  	// fast path: ASCII
   379  	if b := sc.rest[0]; b < utf8.RuneSelf {
   380  		if b == '\r' {
   381  			return '\n'
   382  		}
   383  		return rune(b)
   384  	}
   385  
   386  	r, _ := utf8.DecodeRune(sc.rest)
   387  	return r
   388  }
   389  
   390  // readRune consumes and returns the next rune in the input.
   391  // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
   392  func (sc *scanner) readRune() rune {
   393  	// eof() has been inlined here, both to avoid a call
   394  	// and to establish len(rest)>0 to avoid a bounds check.
   395  	if len(sc.rest) == 0 {
   396  		if !sc.readLine() {
   397  			sc.error(sc.pos, "internal scanner error: readRune at EOF")
   398  		}
   399  		// Redundant, but eliminates the bounds-check below.
   400  		if len(sc.rest) == 0 {
   401  			return 0
   402  		}
   403  	}
   404  
   405  	// fast path: ASCII
   406  	if b := sc.rest[0]; b < utf8.RuneSelf {
   407  		r := rune(b)
   408  		sc.rest = sc.rest[1:]
   409  		if r == '\r' {
   410  			if len(sc.rest) > 0 && sc.rest[0] == '\n' {
   411  				sc.rest = sc.rest[1:]
   412  			}
   413  			r = '\n'
   414  		}
   415  		if r == '\n' {
   416  			sc.pos.Line++
   417  			sc.pos.Col = 1
   418  		} else {
   419  			sc.pos.Col++
   420  		}
   421  		return r
   422  	}
   423  
   424  	r, size := utf8.DecodeRune(sc.rest)
   425  	sc.rest = sc.rest[size:]
   426  	sc.pos.Col++
   427  	return r
   428  }
   429  
   430  // tokenValue records the position and value associated with each token.
        // nextToken fills one in for every token it returns.
   431  type tokenValue struct {
   432  	raw    string   // raw text of token; cleared by startToken, set by endToken or scanString
   433  	int    int64    // decoded int
   434  	bigInt *big.Int // decoded integers > int64
   435  	float  float64  // decoded float
   436  	string string   // decoded string; for shell tokens (SHELL, DSHELL_*), the shell text
   437  	pos    Position // start position of token
   438  }
   439  
   440  // startToken marks the beginning of the next input token.
   441  // It must be followed by a call to endToken once the token has
   442  // been consumed using readRune.
   443  func (sc *scanner) startToken(val *tokenValue) {
   444  	sc.token = sc.rest
   445  	val.raw = ""
   446  	val.pos = sc.pos
   447  }
   448  
   449  // endToken marks the end of an input token.
   450  // It records the actual token string in val.raw if the caller
   451  // has not done that already.
   452  func (sc *scanner) endToken(val *tokenValue) {
   453  	if val.raw == "" {
   454  		val.raw = string(sc.token[:len(sc.token)-len(sc.rest)])
   455  	}
   456  }
   457  
   458  // nextToken is called by the parser to obtain the next input token.
   459  // It returns the token value and sets val to the data associated with
   460  // the token.
   461  //
   462  // For all our input tokens, the associated data is val.pos (the
   463  // position where the token begins), val.raw (the input string
   464  // corresponding to the token).  For string and int tokens, the string
   465  // and int fields additionally contain the token's interpreted value.
        //
        // Line ends are skipped like other white space unless sc.insertSemi is
        // set, in which case the newline (or end of file) is returned as a SEMI
        // token. Every call recomputes sc.insertSemi: it is set after tokens
        // that may end a statement (identifiers, literals, complete shell
        // text, closing brackets, and the keywords break, continue, pass and
        // return) and cleared after all others.
        //
        // Errors are reported by panicking through sc.error and sc.errorf.
   466  func (sc *scanner) nextToken(val *tokenValue) Token {
   467  
   468  	// The following distribution of tokens guides case ordering:
   469  	//
   470  	//      COMMA          27   %
   471  	//      STRING         23   %
   472  	//      IDENT          15   %
   473  	//      EQL            11   %
   474  	//      LBRACK          5.5 %
   475  	//      RBRACK          5.5 %
   476  	//      NEWLINE         3   %
   477  	//      LPAREN          2.9 %
   478  	//      RPAREN          2.9 %
   479  	//      INT             2   %
   480  	//      others        < 1   %
   481  	//
   482  	// Although NEWLINE tokens are infrequent, and lineStart is
   483  	// usually (~97%) false on entry, skipped newlines account for
   484  	// about 50% of all iterations of the 'start' loop.
   485  
   486  	insertSemi := false
   487  
   488  	// Publish the updated value to sc.insertSemi on every return path.
        	// Until then sc.insertSemi still holds the value left by the
        	// previous call, which the code below consults.
   489  	defer func() {
   490  		sc.insertSemi = insertSemi
   491  	}()
   492  
   493  start:
   494  	var c rune
   495  
   496  	// Deal with leading spaces and indentation.
   497  	blank := false // never set to true: the code that did so is commented out below
   498  	/*
   499  		savedLineStart := sc.lineStart
   500  		if sc.lineStart {
   501  			sc.lineStart = false
   502  			col := 0
   503  			for {
   504  				c = sc.peekRune()
   505  				if c == ' ' {
   506  					col++
   507  					sc.readRune()
   508  				} else if c == '\t' {
   509  					const tab = 8
   510  					col += int(tab - (sc.pos.Col-1)%tab)
   511  					sc.readRune()
   512  				} else {
   513  					break
   514  				}
   515  			}
   516  
   517  			// The third clause matches EOF.
   518  			if c == '#' || c == '\n' || c == 0 {
   519  				blank = true
   520  			}
   521  
   522  			// Compute indentation level for non-blank lines not
   523  			// inside an expression.  This is not the common case.
   524  			if false { // !blank { //  && sc.depth == 0 {
   525  				cur := sc.indentstk[len(sc.indentstk)-1]
   526  				if col > cur {
   527  					// indent
   528  					sc.dents++
   529  					sc.indentstk = append(sc.indentstk, col)
   530  				} else if col < cur {
   531  					// outdent(s)
   532  					for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
   533  						sc.dents--
   534  						sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
   535  					}
   536  					if col != sc.indentstk[len(sc.indentstk)-1] {
   537  						sc.error(sc.pos, "unindent does not match any outer indentation level")
   538  					}
   539  				}
   540  			}
   541  	*/
   542  
   543  	// Return saved indentation tokens.
   544  	/*
   545  		if sc.dents != 0 {
   546  			sc.startToken(val)
   547  			sc.endToken(val)
   548  			if sc.dents < 0 {
   549  				sc.dents++
   550  				return OUTDENT
   551  			} else {
   552  				sc.dents--
   553  				return INDENT
   554  			}
   555  		}
   556  	*/
   557  
   558  	// start of line proper
   559  	c = sc.peekRune()
   560  
   561  	// Skip spaces. A newline is skipped too, unless the previous token
        	// asked for a semicolon. peekRune reports '\r' as '\n', so the
        	// final clause never matches.
   562  	for c == ' ' || c == '\t' || (c == '\n' && !sc.insertSemi) || c == '\r' {
   563  		sc.readRune()
   564  		c = sc.peekRune()
   565  	}
   566  
   567  	// comment
   568  	if c == '#' {
   569  		if sc.keepComments {
   570  			sc.startToken(val)
   571  		}
   572  		// Consume up to, but not including, the newline.
   573  		for c != 0 && c != '\n' {
   574  			sc.readRune()
   575  			c = sc.peekRune()
   576  		}
   577  
   578  		if sc.keepComments {
   579  			sc.endToken(val)
        			// blank is always false here, so every comment is
        			// recorded as a suffix comment.
   580  			if blank {
   581  				sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
   582  			} else {
   583  				sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
   584  			}
   585  		}
   586  
   587  		goto start
   588  	}
   589  
   590  	// newline
   591  	if c == '\n' {
   592  		// Only seen if insertSemi was true because otherwise the loop above will eat newlines
   593  		sc.lineStart = true
   594  
   595  		// Ignore newlines within expressions (common case).
   596  		/*
   597  			if sc.depth > 0 {
   598  				sc.readRune()
   599  				goto start
   600  			}
   601  		*/
   602  
   603  		// Ignore blank lines, except in the REPL,
   604  		// where they emit OUTDENTs and NEWLINE.
   605  		/*
   606  			if blank {
   607  				if sc.readline == nil {
   608  					sc.readRune()
   609  					goto start
   610  				} else if len(sc.indentstk) > 1 {
   611  					sc.dents = 1 - len(sc.indentstk)
   612  					sc.indentstk = sc.indentstk[:1]
   613  					goto start
   614  				}
   615  			}
   616  		*/
   617  
   618  		// At top-level (not in an expression).
        		// The newline is reported as a SEMI token whose raw text is "\n".
   619  		sc.startToken(val)
   620  		sc.readRune()
   621  		val.raw = "\n"
   622  		return SEMI
   623  	}
   624  
   625  	// end of file
   626  	if c == 0 {
   627  		// Emit OUTDENTs for unfinished indentation,
   628  		// preceded by a NEWLINE if we haven't just emitted one.
   629  		/*
   630  			if len(sc.indentstk) > 1 {
   631  				if savedLineStart {
   632  					sc.dents = 1 - len(sc.indentstk)
   633  					sc.indentstk = sc.indentstk[:1]
   634  					goto start
   635  				} else {
   636  					sc.lineStart = true
   637  					sc.startToken(val)
   638  					val.raw = "\n"
   639  					return NEWLINE
   640  				}
   641  			}
   642  		*/
   643  
        		// A pending semicolon is delivered before EOF. The deferred
        		// update then clears sc.insertSemi, so the next call returns EOF.
   644  		if sc.insertSemi {
   645  			sc.startToken(val)
   646  			sc.endToken(val)
   647  			return SEMI
   648  		}
   649  
   650  		sc.startToken(val)
   651  		sc.endToken(val)
   652  		return EOF
   653  	}
   654  
   655  	// line continuation
   656  	if c == '\\' {
   657  		sc.readRune()
   658  		if sc.peekRune() != '\n' {
   659  			sc.errorf(sc.pos, "stray backslash in program")
   660  		}
   661  		sc.readRune()
   662  		goto start
   663  	}
   664  
   665  	// start of the next token
   666  	sc.startToken(val)
   667  
   668  	// comma (common case)
   669  	if c == ',' {
   670  		sc.readRune()
   671  		sc.endToken(val)
   672  		return COMMA
   673  	}
   674  
   675  	// string literal
   676  	if c == '"' || c == '\'' {
   677  		insertSemi = true
   678  		return sc.scanString(val, c)
   679  	}
   680  
        	// backquoted shell expression
   681  	if c == '`' {
   682  		tok := sc.scanShellExpr(val)
   683  		if tok == SHELL {
   684  			insertSemi = true
   685  		}
   686  
   687  		return tok
   688  	}
   689  
        	// shell text introduced by '$', running to the end of the line
   690  	if c == '$' {
   691  		tok := sc.scanShell(val)
   692  		if tok == SHELL {
   693  			insertSemi = true
   694  		}
   695  
   696  		return tok
   697  	}
   698  
   699  	// identifier or keyword
   700  	if isIdentStart(c) {
   701  		// raw string literal
   702  		if c == 'r' && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
   703  			insertSemi = true
   704  			sc.readRune()
   705  			c = sc.peekRune()
   706  			return sc.scanString(val, c)
   707  		}
   708  
   709  		for isIdent(c) {
   710  			sc.readRune()
   711  			c = sc.peekRune()
   712  		}
   713  		sc.endToken(val)
   714  		if k, ok := keywordToken[val.raw]; ok {
        			// Only keywords that can end a statement request a semicolon.
   715  			switch k {
   716  			case BREAK, CONTINUE, PASS, RETURN:
   717  				insertSemi = true
   718  			}
   719  
   720  			return k
   721  		}
   722  
   723  		insertSemi = true
   724  		return IDENT
   725  	}
   726  
   727  	// brackets
   728  	switch c {
   729  	case '[', '(', '{':
   730  		sc.depth++
   731  		sc.readRune()
   732  		sc.endToken(val)
   733  		switch c {
   734  		case '[':
   735  			return LBRACK
   736  		case '(':
   737  			return LPAREN
   738  		case '{':
   739  			return LBRACE
   740  		}
   741  		panic("unreachable")
   742  
   743  	case ']', ')', '}':
        		// While a ${...} interpolation is open, a '}' ends it and the
        		// scanner resumes reading the surrounding shell text.
   744  		if c == '}' {
   745  			if sc.interpDepth > 0 {
   746  				tok := sc.scanMoreShell(val)
   747  				if tok == DSHELL_END {
   748  					insertSemi = true
   749  				}
   750  
   751  				return tok
   752  			}
   753  
   754  			if sc.interpExprDepth > 0 {
   755  				tok := sc.scanMoreShellExpr(val)
   756  				if tok == DSHELL_END {
   757  					insertSemi = true
   758  				}
   759  
   760  				return tok
   761  			}
   762  		}
   763  
   764  		if sc.depth == 0 {
   765  			sc.errorf(sc.pos, "unexpected %q", c)
   766  		} else {
   767  			sc.depth--
   768  		}
   769  		sc.readRune()
   770  		sc.endToken(val)
   771  
   772  		insertSemi = true
   773  		switch c {
   774  		case ']':
   775  			return RBRACK
   776  		case ')':
   777  			return RPAREN
   778  		case '}':
   779  			return RBRACE
   780  		}
   781  		panic("unreachable")
   782  	}
   783  
   784  	// int or float literal, or period
   785  	if isdigit(c) || c == '.' {
   786  		insertSemi = true
   787  		return sc.scanNumber(val, c)
   788  	}
   789  
   790  	// other punctuation
        	// endToken is deferred so that val.raw is captured only after all
        	// runes of the operator have been consumed.
   791  	defer sc.endToken(val)
   792  	switch c {
   793  	case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
   794  		start := sc.pos
   795  		sc.readRune()
   796  		if sc.peekRune() == '=' {
   797  			sc.readRune()
   798  			switch c {
   799  			case '<':
   800  				return LE
   801  			case '>':
   802  				return GE
   803  			case '=':
   804  				return EQL
   805  			case '!':
   806  				return NEQ
   807  			case '+':
   808  				return PLUS_EQ
   809  			case '-':
   810  				return MINUS_EQ
   811  			case '/':
   812  				return SLASH_EQ
   813  			case '%':
   814  				return PERCENT_EQ
   815  			case '&':
   816  				return AMP_EQ
   817  			case '|':
   818  				return PIPE_EQ
   819  			case '^':
   820  				return CIRCUMFLEX_EQ
   821  			}
   822  		}
   823  		switch c {
   824  		case '=':
   825  			if sc.peekRune() == '>' {
   826  				sc.readRune()
   827  				return ARROW
   828  			}
   829  
   830  			return EQ
   831  		case '<':
   832  			if sc.peekRune() == '<' {
   833  				sc.readRune()
   834  				if sc.peekRune() == '=' {
   835  					sc.readRune()
   836  					return LTLT_EQ
   837  				} else {
   838  					return LTLT
   839  				}
   840  			}
   841  			return LT
   842  		case '>':
   843  			if sc.peekRune() == '>' {
   844  				sc.readRune()
   845  				if sc.peekRune() == '=' {
   846  					sc.readRune()
   847  					return GTGT_EQ
   848  				} else {
   849  					return GTGT
   850  				}
   851  			}
   852  			return GT
   853  		case '!':
        			// '!' is only accepted as part of "!="; sc.error panics.
   854  			sc.error(start, "unexpected input character '!'")
   855  		case '+':
   856  			return PLUS
   857  		case '-':
   858  			return MINUS
   859  		case '/':
   860  			if sc.peekRune() == '/' {
   861  				sc.readRune()
   862  				if sc.peekRune() == '=' {
   863  					sc.readRune()
   864  					return SLASHSLASH_EQ
   865  				} else {
   866  					return SLASHSLASH
   867  				}
   868  			}
   869  			return SLASH
   870  		case '%':
   871  			if sc.peekRune() == '{' {
   872  				sc.readRune()
        				// "%{" opens a bracket: count it so that the
        				// matching '}' is accepted.
   873  				sc.depth++
   874  				return PERCENT_BRACE
   875  			}
   876  
   877  			return PERCENT
   878  		case '&':
   879  			return AMP
   880  		case '|':
   881  			return PIPE
   882  		case '^':
   883  			return CIRCUMFLEX
   884  		}
   885  		panic("unreachable")
   886  
   887  	case ':', ';', '~', '@': // single-char tokens (except comma)
   888  		sc.readRune()
   889  		switch c {
   890  		case ':':
   891  			return COLON
   892  		case ';':
   893  			return SEMI
   894  		case '~':
   895  			return TILDE
   896  		case '@':
   897  			return AT
   898  		}
   899  		panic("unreachable")
   900  
   901  	case '*': // possibly followed by '*' or '='
   902  		sc.readRune()
   903  		switch sc.peekRune() {
   904  		case '*':
   905  			sc.readRune()
   906  			return STARSTAR
   907  		case '=':
   908  			sc.readRune()
   909  			return STAR_EQ
   910  		}
   911  		return STAR
   912  	}
   913  
   914  	sc.errorf(sc.pos, "unexpected input character %#q", c)
   915  	panic("unreachable")
   916  }
   917  
   918  func (sc *scanner) scanShellExpr(val *tokenValue) Token {
   919  	sc.readRune()
   920  
   921  	var (
   922  		raw       strings.Builder
   923  		hasExpand bool
   924  	)
   925  
   926  	for sc.peekRune() == ' ' {
   927  		sc.readRune()
   928  	}
   929  
   930  	for {
   931  		if sc.eof() {
   932  			break
   933  		}
   934  
   935  		c := sc.readRune()
   936  		if c == '`' {
   937  			break
   938  		}
   939  
   940  		if c == '$' {
   941  			nc := sc.peekRune()
   942  			if nc == '{' {
   943  				sc.readRune()
   944  				sc.interpExprDepth++
   945  				hasExpand = true
   946  				break
   947  			}
   948  		} else if c == '\\' {
   949  			if sc.eof() {
   950  				sc.error(val.pos, "unexpected EOF in string")
   951  			}
   952  			c = sc.readRune()
   953  		}
   954  
   955  		raw.WriteRune(c)
   956  	}
   957  
   958  	val.string = raw.String()
   959  
   960  	if hasExpand {
   961  		return DSHELL_START
   962  	} else {
   963  		return SHELL
   964  	}
   965  }
   966  
   967  func (sc *scanner) scanShell(val *tokenValue) Token {
   968  	sc.readRune()
   969  
   970  	var (
   971  		raw       strings.Builder
   972  		hasExpand bool
   973  	)
   974  
   975  	for sc.peekRune() == ' ' {
   976  		sc.readRune()
   977  	}
   978  
   979  	for {
   980  		if sc.eof() {
   981  			break
   982  		}
   983  
   984  		c := sc.peekRune()
   985  		if c == '\n' {
   986  			break
   987  		}
   988  
   989  		sc.readRune()
   990  		if c == '$' {
   991  			nc := sc.peekRune()
   992  			if nc == '{' {
   993  				sc.readRune()
   994  				sc.interpDepth++
   995  				hasExpand = true
   996  				break
   997  			}
   998  		} else if c == '\\' {
   999  			if sc.eof() {
  1000  				sc.error(val.pos, "unexpected EOF in string")
  1001  			}
  1002  			c = sc.readRune()
  1003  		}
  1004  
  1005  		raw.WriteRune(c)
  1006  	}
  1007  
  1008  	val.string = raw.String()
  1009  
  1010  	if hasExpand {
  1011  		return DSHELL_START
  1012  	} else {
  1013  		return SHELL
  1014  	}
  1015  }
  1016  
  1017  func (sc *scanner) scanMoreShell(val *tokenValue) Token {
  1018  	sc.interpDepth--
  1019  
  1020  	sc.readRune()
  1021  
  1022  	var (
  1023  		raw       strings.Builder
  1024  		hasExpand bool
  1025  	)
  1026  
  1027  	for {
  1028  		if sc.eof() {
  1029  			break
  1030  		}
  1031  
  1032  		c := sc.peekRune()
  1033  		if c == '\n' {
  1034  			break
  1035  		}
  1036  
  1037  		sc.readRune()
  1038  
  1039  		if c == '$' {
  1040  			nc := sc.peekRune()
  1041  			if nc == '{' {
  1042  				sc.readRune()
  1043  				sc.interpDepth++
  1044  				hasExpand = true
  1045  				break
  1046  			}
  1047  		} else if c == '\\' {
  1048  			if sc.eof() {
  1049  				sc.error(val.pos, "unexpected EOF in string")
  1050  			}
  1051  			c = sc.readRune()
  1052  		}
  1053  
  1054  		raw.WriteRune(c)
  1055  	}
  1056  
  1057  	val.string = raw.String()
  1058  
  1059  	if !hasExpand {
  1060  		return DSHELL_END
  1061  	} else {
  1062  		return DSHELL_PART
  1063  	}
  1064  }
  1065  
  1066  func (sc *scanner) scanMoreShellExpr(val *tokenValue) Token {
  1067  	sc.interpDepth--
  1068  
  1069  	sc.readRune()
  1070  
  1071  	var (
  1072  		raw       strings.Builder
  1073  		hasExpand bool
  1074  	)
  1075  
  1076  	for {
  1077  		if sc.eof() {
  1078  			sc.error(val.pos, "unexpected EOF in string")
  1079  		}
  1080  
  1081  		c := sc.readRune()
  1082  		if c == '`' {
  1083  			break
  1084  		}
  1085  
  1086  		if c == '$' {
  1087  			nc := sc.peekRune()
  1088  			if nc == '{' {
  1089  				sc.readRune()
  1090  				sc.interpExprDepth++
  1091  				hasExpand = true
  1092  				break
  1093  			}
  1094  		} else if c == '\\' {
  1095  			if sc.eof() {
  1096  				sc.error(val.pos, "unexpected EOF in string")
  1097  			}
  1098  			c = sc.readRune()
  1099  		}
  1100  
  1101  		raw.WriteRune(c)
  1102  	}
  1103  
  1104  	val.string = raw.String()
  1105  
  1106  	if !hasExpand {
  1107  		return DSHELL_END
  1108  	} else {
  1109  		return DSHELL_PART
  1110  	}
  1111  }
  1112  
  1113  func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
  1114  	start := sc.pos
  1115  	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
  1116  	sc.readRune()
  1117  
  1118  	// String literals may contain escaped or unescaped newlines,
  1119  	// causing them to span multiple lines (gulps) of REPL input;
  1120  	// they are the only such token. Thus we cannot call endToken,
  1121  	// as it assumes sc.rest is unchanged since startToken.
  1122  	// Instead, buffer the token here.
  1123  	// TODO(adonovan): opt: buffer only if we encounter a newline.
  1124  	raw := new(strings.Builder)
  1125  
  1126  	// Copy the prefix, e.g. r' or " (see startToken).
  1127  	raw.Write(sc.token[:len(sc.token)-len(sc.rest)])
  1128  
  1129  	if !triple {
  1130  		// single-quoted string literal
  1131  		for {
  1132  			if sc.eof() {
  1133  				sc.error(val.pos, "unexpected EOF in string")
  1134  			}
  1135  			c := sc.readRune()
  1136  			raw.WriteRune(c)
  1137  			if c == quote {
  1138  				break
  1139  			}
  1140  			if c == '\n' {
  1141  				sc.error(val.pos, "unexpected newline in string")
  1142  			}
  1143  			if c == '\\' {
  1144  				if sc.eof() {
  1145  					sc.error(val.pos, "unexpected EOF in string")
  1146  				}
  1147  				c = sc.readRune()
  1148  				raw.WriteRune(c)
  1149  			}
  1150  		}
  1151  	} else {
  1152  		// triple-quoted string literal
  1153  		sc.readRune()
  1154  		raw.WriteRune(quote)
  1155  		sc.readRune()
  1156  		raw.WriteRune(quote)
  1157  
  1158  		quoteCount := 0
  1159  		for {
  1160  			if sc.eof() {
  1161  				sc.error(val.pos, "unexpected EOF in string")
  1162  			}
  1163  			c := sc.readRune()
  1164  			raw.WriteRune(c)
  1165  			if c == quote {
  1166  				quoteCount++
  1167  				if quoteCount == 3 {
  1168  					break
  1169  				}
  1170  			} else {
  1171  				quoteCount = 0
  1172  			}
  1173  			if c == '\\' {
  1174  				if sc.eof() {
  1175  					sc.error(val.pos, "unexpected EOF in string")
  1176  				}
  1177  				c = sc.readRune()
  1178  				raw.WriteRune(c)
  1179  			}
  1180  		}
  1181  	}
  1182  	val.raw = raw.String()
  1183  
  1184  	s, _, err := unquote(val.raw)
  1185  	if err != nil {
  1186  		sc.error(start, err.Error())
  1187  	}
  1188  	val.string = s
  1189  	return STRING
  1190  }
  1191  
  1192  func (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
  1193  	// https://github.com/google/exprcore-go/blob/master/doc/spec.md#lexical-elements
  1194  	//
  1195  	// Python features not supported:
  1196  	// - integer literals of >64 bits of precision
  1197  	// - 123L or 123l long suffix
  1198  	// - traditional octal: 0755
  1199  	// https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals
  1200  
  1201  	start := sc.pos
  1202  	fraction, exponent := false, false
  1203  
  1204  	if c == '.' {
  1205  		// dot or start of fraction
  1206  		sc.readRune()
  1207  		c = sc.peekRune()
  1208  		if !isdigit(c) {
  1209  			sc.endToken(val)
  1210  			return DOT
  1211  		}
  1212  		fraction = true
  1213  	} else if c == '0' {
  1214  		// hex, octal, binary or float
  1215  		sc.readRune()
  1216  		c = sc.peekRune()
  1217  
  1218  		if c == '.' {
  1219  			fraction = true
  1220  		} else if c == 'x' || c == 'X' {
  1221  			// hex
  1222  			sc.readRune()
  1223  			c = sc.peekRune()
  1224  			if !isxdigit(c) {
  1225  				sc.error(start, "invalid hex literal")
  1226  			}
  1227  			for isxdigit(c) {
  1228  				sc.readRune()
  1229  				c = sc.peekRune()
  1230  			}
  1231  		} else if c == 'o' || c == 'O' {
  1232  			// octal
  1233  			sc.readRune()
  1234  			c = sc.peekRune()
  1235  			if !isodigit(c) {
  1236  				sc.error(sc.pos, "invalid octal literal")
  1237  			}
  1238  			for isodigit(c) {
  1239  				sc.readRune()
  1240  				c = sc.peekRune()
  1241  			}
  1242  		} else if c == 'b' || c == 'B' {
  1243  			// binary
  1244  			sc.readRune()
  1245  			c = sc.peekRune()
  1246  			if !isbdigit(c) {
  1247  				sc.error(sc.pos, "invalid binary literal")
  1248  			}
  1249  			for isbdigit(c) {
  1250  				sc.readRune()
  1251  				c = sc.peekRune()
  1252  			}
  1253  		} else {
  1254  			// float (or obsolete octal "0755")
  1255  			allzeros, octal := true, true
  1256  			for isdigit(c) {
  1257  				if c != '0' {
  1258  					allzeros = false
  1259  				}
  1260  				if c > '7' {
  1261  					octal = false
  1262  				}
  1263  				sc.readRune()
  1264  				c = sc.peekRune()
  1265  			}
  1266  			if c == '.' {
  1267  				fraction = true
  1268  			} else if c == 'e' || c == 'E' {
  1269  				exponent = true
  1270  			} else if octal && !allzeros {
  1271  				sc.endToken(val)
  1272  				sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:])
  1273  			}
  1274  		}
  1275  	} else {
  1276  		// decimal
  1277  		for isdigit(c) {
  1278  			sc.readRune()
  1279  			c = sc.peekRune()
  1280  		}
  1281  
  1282  		if c == '.' {
  1283  			fraction = true
  1284  		} else if c == 'e' || c == 'E' {
  1285  			exponent = true
  1286  		}
  1287  	}
  1288  
  1289  	if fraction {
  1290  		sc.readRune() // consume '.'
  1291  		c = sc.peekRune()
  1292  		for isdigit(c) {
  1293  			sc.readRune()
  1294  			c = sc.peekRune()
  1295  		}
  1296  
  1297  		if c == 'e' || c == 'E' {
  1298  			exponent = true
  1299  		}
  1300  	}
  1301  
  1302  	if exponent {
  1303  		sc.readRune() // consume [eE]
  1304  		c = sc.peekRune()
  1305  		if c == '+' || c == '-' {
  1306  			sc.readRune()
  1307  			c = sc.peekRune()
  1308  			if !isdigit(c) {
  1309  				sc.error(sc.pos, "invalid float literal")
  1310  			}
  1311  		}
  1312  		for isdigit(c) {
  1313  			sc.readRune()
  1314  			c = sc.peekRune()
  1315  		}
  1316  	}
  1317  
  1318  	sc.endToken(val)
  1319  	if fraction || exponent {
  1320  		var err error
  1321  		val.float, err = strconv.ParseFloat(val.raw, 64)
  1322  		if err != nil {
  1323  			sc.error(sc.pos, "invalid float literal")
  1324  		}
  1325  		return FLOAT
  1326  	} else {
  1327  		var err error
  1328  		s := val.raw
  1329  		val.bigInt = nil
  1330  		if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') {
  1331  			val.int, err = strconv.ParseInt(s[2:], 8, 64)
  1332  		} else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') {
  1333  			val.int, err = strconv.ParseInt(s[2:], 2, 64)
  1334  		} else {
  1335  			val.int, err = strconv.ParseInt(s, 0, 64)
  1336  			if err != nil {
  1337  				num := new(big.Int)
  1338  				var ok bool
  1339  				val.bigInt, ok = num.SetString(s, 0)
  1340  				if ok {
  1341  					err = nil
  1342  				}
  1343  			}
  1344  		}
  1345  		if err != nil {
  1346  			sc.error(start, "invalid int literal")
  1347  		}
  1348  		return INT
  1349  	}
  1350  }
  1351  
  1352  // isIdent reports whether c is an identifier rune.
  1353  func isIdent(c rune) bool {
  1354  	return isdigit(c) || isIdentStart(c)
  1355  }
  1356  
  1357  func isIdentStart(c rune) bool {
  1358  	return 'a' <= c && c <= 'z' ||
  1359  		'A' <= c && c <= 'Z' ||
  1360  		c == '_' ||
  1361  		unicode.IsLetter(c)
  1362  }
  1363  
  1364  func isdigit(c rune) bool  { return '0' <= c && c <= '9' }
  1365  func isodigit(c rune) bool { return '0' <= c && c <= '7' }
  1366  func isxdigit(c rune) bool { return isdigit(c) || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' }
  1367  func isbdigit(c rune) bool { return '0' == c || c == '1' }
  1368  
  1369  // keywordToken records the special tokens for
  1370  // strings that should not be treated as ordinary identifiers.
  1371  var keywordToken = map[string]Token{
  1372  	"and":      AND,
  1373  	"break":    BREAK,
  1374  	"continue": CONTINUE,
  1375  	"def":      DEF,
  1376  	"elif":     ELIF,
  1377  	"else":     ELSE,
  1378  	"for":      FOR,
  1379  	"if":       IF,
  1380  	"in":       IN,
  1381  	"lambda":   LAMBDA,
  1382  	"load":     LOAD,
  1383  	"import":   IMPORT,
  1384  	"as":       AS,
  1385  	"using":    USING,
  1386  	"not":      NOT,
  1387  	"or":       OR,
  1388  	"pass":     PASS,
  1389  	"return":   RETURN,
  1390  	"while":    WHILE,
  1391  
  1392  	// reserved words:
  1393  	// "assert":   ILLEGAL, // heavily used by our tests
  1394  	"class":    ILLEGAL,
  1395  	"del":      ILLEGAL,
  1396  	"except":   ILLEGAL,
  1397  	"finally":  ILLEGAL,
  1398  	"from":     ILLEGAL,
  1399  	"global":   ILLEGAL,
  1400  	"is":       ILLEGAL,
  1401  	"nonlocal": ILLEGAL,
  1402  	"raise":    ILLEGAL,
  1403  	"try":      ILLEGAL,
  1404  	"with":     ILLEGAL,
  1405  	"yield":    ILLEGAL,
  1406  }