github.com/likebike/go--@v0.0.0-20190911215757-0bd925d16e96/go/src/cmd/compile/internal/syntax/scanner.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file implements scanner, a lexical tokenizer for
     6  // Go source. After initialization, consecutive calls of
     7  // next advance the scanner one token at a time.
     8  //
     9  // This file, source.go, and tokens.go are self-contained
    10  // (go tool compile scanner.go source.go tokens.go compiles)
    11  // and thus could be made into its own package.
    12  
    13  package syntax
    14  
    15  import (
    16  	"fmt"
    17  	"io"
    18  	"unicode"
    19  	"unicode/utf8"
    20  )
    21  
    22  type scanner struct {
    23  	source
    24  	pragh  func(line, col uint, msg string)
    25  	nlsemi bool // if set '\n' and EOF translate to ';'
    26  
    27  	// current token, valid after calling next()
    28  	line, col uint
    29  	tok       token
    30  	lit       string   // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF")
    31  	kind      LitKind  // valid if tok is _Literal
    32  	op        Operator // valid if tok is _Operator, _AssignOp, or _IncOp
    33  	prec      int      // valid if tok is _Operator, _AssignOp, or _IncOp
    34  }
    35  
    36  func (s *scanner) init(src io.Reader, errh, pragh func(line, col uint, msg string)) {
    37  	s.source.init(src, errh)
    38  	s.pragh = pragh
    39  	s.nlsemi = false
    40  }
    41  
// next advances the scanner by reading the next token. The result is
// left in s.tok; s.line and s.col are set to the token's start position,
// and s.lit, s.kind, s.op, and s.prec are set where they apply to the
// token (see the scanner struct).
//
// If a read, source encoding, or lexical error occurs, next
// calls the error handler installed with init. The handler
// must exist.
//
// If a //line or //go: directive is encountered at the start
// of a line, next calls the directive handler pragh installed
// with init, if not nil.
//
// The (line, col) position passed to the error and directive
// handler is always at or after the current source reading
// position.
//
// Go-- extension: a line starting with "#!" at the very beginning of
// the source is skipped.
func (s *scanner) next() {
	// Remember whether the previous token asked for newline/EOF to be
	// turned into ';', then clear the flag; tokens that may end a
	// statement set it again below.
	nlsemi := s.nlsemi
	s.nlsemi = false

    // Go-- support for 'shebang' lines:
    // This only runs while the last read position (line0, col0) is still
    // at or before the base position, i.e. before scanning has moved past
    // the first character. One rune is read; if it is '#' and the next
    // buffered byte is '!', the remainder of the line is skipped (the
    // newline itself is left unread by skipLine). Otherwise the rune is
    // pushed back so that regular scanning sees it.
    // NOTE(review): peeking at s.source.buf[s.source.r] relies on the
    // buffer layout in source.go - confirm that index is always valid
    // and never holds stale data at this point.
    if s.source.line0<=linebase && s.source.col0<=colbase {  // BOM does not affect 'col'.
        if s.getr()=='#' && s.source.r<len(s.source.buf) && s.source.buf[s.source.r]=='!' { s.skipLine('#')
        } else { s.ungetr() }
    }

redo:
	// skip white space
	// (a '\n' is white space only if it does not have to become a ';';
	// note that && binds tighter than ||)
	c := s.getr()
	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
		c = s.getr()
	}

	// token start
	s.line, s.col = s.source.line0, s.source.col0

	// identifiers and keywords: ASCII letters are tested first, other
	// runes go through isIdentRune (which may report an error)
	if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) {
		s.ident()
		return
	}

	switch c {
	case -1:
		// end of input; reported as a ';' first if nlsemi was set
		if nlsemi {
			s.lit = "EOF"
			s.tok = _Semi
			break
		}
		s.tok = _EOF

	case '\n':
		// only reached if nlsemi was set (otherwise skipped as white space)
		s.lit = "newline"
		s.tok = _Semi

	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		s.number(c)

	case '"':
		s.stdString()

	case '`':
		s.rawString()

	case '\'':
		s.rune()

	case '(':
		s.tok = _Lparen

	case '[':
		s.tok = _Lbrack

	case '{':
		s.tok = _Lbrace

	case ',':
		s.tok = _Comma

	case ';':
		s.lit = "semicolon"
		s.tok = _Semi

	// closing brackets may end a statement: request newline -> ';'
	case ')':
		s.nlsemi = true
		s.tok = _Rparen

	case ']':
		s.nlsemi = true
		s.tok = _Rbrack

	case '}':
		s.nlsemi = true
		s.tok = _Rbrace

	case ':':
		if s.getr() == '=' {
			s.tok = _Define
			break
		}
		s.ungetr()
		s.tok = _Colon

	case '.':
		// one of: a number starting with '.', "...", or a plain '.'
		c = s.getr()
		if isDigit(c) {
			// NOTE(review): ungetr2 presumably un-reads the digit while
			// keeping the '.' available to number - confirm in source.go
			s.ungetr2()
			s.number('.')
			break
		}
		if c == '.' {
			c = s.getr()
			if c == '.' {
				s.tok = _DotDotDot
				break
			}
			// ".." followed by something else: un-read that rune here,
			// and the second '.' via the ungetr below
			s.ungetr2()
		}
		s.ungetr()
		s.tok = _Dot

	case '+':
		// "++", "+=", or "+"
		s.op, s.prec = Add, precAdd
		c = s.getr()
		if c != '+' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '-':
		// "--", "-=", or "-"
		s.op, s.prec = Sub, precAdd
		c = s.getr()
		if c != '-' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '*':
		s.op, s.prec = Mul, precMul
		// don't goto assignop - want _Star token
		if s.getr() == '=' {
			s.tok = _AssignOp
			break
		}
		s.ungetr()
		s.tok = _Star

	case '/':
		// line comment, general comment, "/=", or "/"
		c = s.getr()
		if c == '/' {
			s.lineComment()
			goto redo
		}
		if c == '*' {
			s.fullComment()
			if s.source.line > s.line && nlsemi {
				// A multi-line comment acts like a newline;
				// it translates to a ';' if nlsemi is set.
				s.lit = "newline"
				s.tok = _Semi
				break
			}
			goto redo
		}
		// c already holds the rune after '/', as assignop expects
		s.op, s.prec = Div, precMul
		goto assignop

	case '%':
		s.op, s.prec = Rem, precMul
		c = s.getr()
		goto assignop

	case '&':
		// "&&", "&^", "&^=", "&=", or "&"
		c = s.getr()
		if c == '&' {
			s.op, s.prec = AndAnd, precAndAnd
			s.tok = _Operator
			break
		}
		s.op, s.prec = And, precMul
		if c == '^' {
			s.op = AndNot
			c = s.getr()
		}
		goto assignop

	case '|':
		// "||", "|=", or "|"
		c = s.getr()
		if c == '|' {
			s.op, s.prec = OrOr, precOrOr
			s.tok = _Operator
			break
		}
		s.op, s.prec = Or, precAdd
		goto assignop

	case '~':
		// report the error, then continue as if '^' had been written
		s.error("bitwise complement operator is ^")
		fallthrough

	case '^':
		s.op, s.prec = Xor, precAdd
		c = s.getr()
		goto assignop

	case '<':
		// "<=", "<<", "<<=", "<-", or "<"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Leq, precCmp
			s.tok = _Operator
			break
		}
		if c == '<' {
			s.op, s.prec = Shl, precMul
			c = s.getr()
			goto assignop
		}
		if c == '-' {
			s.tok = _Arrow
			break
		}
		s.ungetr()
		s.op, s.prec = Lss, precCmp
		s.tok = _Operator

	case '>':
		// ">=", ">>", ">>=", or ">"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Geq, precCmp
			s.tok = _Operator
			break
		}
		if c == '>' {
			s.op, s.prec = Shr, precMul
			c = s.getr()
			goto assignop
		}
		s.ungetr()
		s.op, s.prec = Gtr, precCmp
		s.tok = _Operator

	case '=':
		if s.getr() == '=' {
			s.op, s.prec = Eql, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.tok = _Assign

	case '!':
		if s.getr() == '=' {
			s.op, s.prec = Neq, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.op, s.prec = Not, 0
		s.tok = _Operator

	default:
		// not the start of any token: report it and scan again
		s.tok = 0
		s.error(fmt.Sprintf("invalid character %#U", c))
		goto redo
	}

	return

	// assignop expects s.op and s.prec to be set and c to hold the rune
	// read after the operator: '=' makes it an assignment operator,
	// anything else is pushed back and the plain operator is reported.
assignop:
	if c == '=' {
		s.tok = _AssignOp
		return
	}
	s.ungetr()
	s.tok = _Operator
}
   316  
   317  func isLetter(c rune) bool {
   318  	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
   319  }
   320  
   321  func isDigit(c rune) bool {
   322  	return '0' <= c && c <= '9'
   323  }
   324  
   325  func (s *scanner) ident() {
   326  	s.startLit()
   327  
   328  	// accelerate common case (7bit ASCII)
   329  	c := s.getr()
   330  	for isLetter(c) || isDigit(c) {
   331  		c = s.getr()
   332  	}
   333  
   334  	// general case
   335  	if c >= utf8.RuneSelf {
   336  		for s.isIdentRune(c, false) {
   337  			c = s.getr()
   338  		}
   339  	}
   340  	s.ungetr()
   341  
   342  	lit := s.stopLit()
   343  
   344  	// possibly a keyword
   345  	if len(lit) >= 2 {
   346  		if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) {
   347  			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
   348  			s.tok = tok
   349  			return
   350  		}
   351  	}
   352  
   353  	s.nlsemi = true
   354  	s.lit = string(lit)
   355  	s.tok = _Name
   356  }
   357  
   358  func (s *scanner) isIdentRune(c rune, first bool) bool {
   359  	switch {
   360  	case unicode.IsLetter(c) || c == '_':
   361  		// ok
   362  	case unicode.IsDigit(c):
   363  		if first {
   364  			s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
   365  		}
   366  	case c >= utf8.RuneSelf:
   367  		s.error(fmt.Sprintf("invalid identifier character %#U", c))
   368  	default:
   369  		return false
   370  	}
   371  	return true
   372  }
   373  
   374  // hash is a perfect hash function for keywords.
   375  // It assumes that s has at least length 2.
   376  func hash(s []byte) uint {
   377  	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
   378  }
   379  
// keywordMap maps the hash of a keyword (see hash) to the keyword's
// token. Entries without a keyword hold the zero token. The table is
// filled in by init; hash masks with len(keywordMap)-1, hence the size
// restriction.
var keywordMap [1 << 6]token // size must be power of two
   381  
   382  func init() {
   383  	// populate keywordMap
   384  	for tok := _Break; tok <= _Var; tok++ {
   385  		h := hash([]byte(tokstrings[tok]))
   386  		if keywordMap[h] != 0 {
   387  			panic("imperfect hash")
   388  		}
   389  		keywordMap[h] = tok
   390  	}
   391  }
   392  
   393  func (s *scanner) number(c rune) {
   394  	s.startLit()
   395  
   396  	if c != '.' {
   397  		s.kind = IntLit // until proven otherwise
   398  		if c == '0' {
   399  			c = s.getr()
   400  			if c == 'x' || c == 'X' {
   401  				// hex
   402  				c = s.getr()
   403  				hasDigit := false
   404  				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   405  					c = s.getr()
   406  					hasDigit = true
   407  				}
   408  				if !hasDigit {
   409  					s.error("malformed hex constant")
   410  				}
   411  				goto done
   412  			}
   413  
   414  			// decimal 0, octal, or float
   415  			has8or9 := false
   416  			for isDigit(c) {
   417  				if c > '7' {
   418  					has8or9 = true
   419  				}
   420  				c = s.getr()
   421  			}
   422  			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
   423  				// octal
   424  				if has8or9 {
   425  					s.error("malformed octal constant")
   426  				}
   427  				goto done
   428  			}
   429  
   430  		} else {
   431  			// decimal or float
   432  			for isDigit(c) {
   433  				c = s.getr()
   434  			}
   435  		}
   436  	}
   437  
   438  	// float
   439  	if c == '.' {
   440  		s.kind = FloatLit
   441  		c = s.getr()
   442  		for isDigit(c) {
   443  			c = s.getr()
   444  		}
   445  	}
   446  
   447  	// exponent
   448  	if c == 'e' || c == 'E' {
   449  		s.kind = FloatLit
   450  		c = s.getr()
   451  		if c == '-' || c == '+' {
   452  			c = s.getr()
   453  		}
   454  		if !isDigit(c) {
   455  			s.error("malformed floating-point constant exponent")
   456  		}
   457  		for isDigit(c) {
   458  			c = s.getr()
   459  		}
   460  	}
   461  
   462  	// complex
   463  	if c == 'i' {
   464  		s.kind = ImagLit
   465  		s.getr()
   466  	}
   467  
   468  done:
   469  	s.ungetr()
   470  	s.nlsemi = true
   471  	s.lit = string(s.stopLit())
   472  	s.tok = _Literal
   473  }
   474  
   475  func (s *scanner) rune() {
   476  	s.startLit()
   477  
   478  	ok := true // only report errors if we're ok so far
   479  	n := 0
   480  	for ; ; n++ {
   481  		r := s.getr()
   482  		if r == '\'' {
   483  			break
   484  		}
   485  		if r == '\\' {
   486  			if !s.escape('\'') {
   487  				ok = false
   488  			}
   489  			continue
   490  		}
   491  		if r == '\n' {
   492  			s.ungetr() // assume newline is not part of literal
   493  			if ok {
   494  				s.error("newline in character literal")
   495  				ok = false
   496  			}
   497  			break
   498  		}
   499  		if r < 0 {
   500  			if ok {
   501  				s.errh(s.line, s.col, "invalid character literal (missing closing ')")
   502  				ok = false
   503  			}
   504  			break
   505  		}
   506  	}
   507  
   508  	if ok {
   509  		if n == 0 {
   510  			s.error("empty character literal or unescaped ' in character literal")
   511  		} else if n != 1 {
   512  			s.errh(s.line, s.col, "invalid character literal (more than one character)")
   513  		}
   514  	}
   515  
   516  	s.nlsemi = true
   517  	s.lit = string(s.stopLit())
   518  	s.kind = RuneLit
   519  	s.tok = _Literal
   520  }
   521  
   522  func (s *scanner) stdString() {
   523  	s.startLit()
   524  
   525  	for {
   526  		r := s.getr()
   527  		if r == '"' {
   528  			break
   529  		}
   530  		if r == '\\' {
   531  			s.escape('"')
   532  			continue
   533  		}
   534  		if r == '\n' {
   535  			s.ungetr() // assume newline is not part of literal
   536  			s.error("newline in string")
   537  			break
   538  		}
   539  		if r < 0 {
   540  			s.errh(s.line, s.col, "string not terminated")
   541  			break
   542  		}
   543  	}
   544  
   545  	s.nlsemi = true
   546  	s.lit = string(s.stopLit())
   547  	s.kind = StringLit
   548  	s.tok = _Literal
   549  }
   550  
   551  func (s *scanner) rawString() {
   552  	s.startLit()
   553  
   554  	for {
   555  		r := s.getr()
   556  		if r == '`' {
   557  			break
   558  		}
   559  		if r < 0 {
   560  			s.errh(s.line, s.col, "string not terminated")
   561  			break
   562  		}
   563  	}
   564  	// We leave CRs in the string since they are part of the
   565  	// literal (even though they are not part of the literal
   566  	// value).
   567  
   568  	s.nlsemi = true
   569  	s.lit = string(s.stopLit())
   570  	s.kind = StringLit
   571  	s.tok = _Literal
   572  }
   573  
   574  func (s *scanner) skipLine(r rune) {
   575  	for r >= 0 {
   576  		if r == '\n' {
   577  			s.ungetr() // don't consume '\n' - needed for nlsemi logic
   578  			break
   579  		}
   580  		r = s.getr()
   581  	}
   582  }
   583  
   584  func (s *scanner) lineComment() {
   585  	r := s.getr()
   586  	// directives must start at the beginning of the line (s.col == colbase)
   587  	if s.col != colbase || s.pragh == nil || (r != 'g' && r != 'l') {
   588  		s.skipLine(r)
   589  		return
   590  	}
   591  	// s.col == colbase && s.pragh != nil && (r == 'g' || r == 'l')
   592  
   593  	// recognize directives
   594  	prefix := "go:"
   595  	if r == 'l' {
   596  		prefix = "line "
   597  	}
   598  	for _, m := range prefix {
   599  		if r != m {
   600  			s.skipLine(r)
   601  			return
   602  		}
   603  		r = s.getr()
   604  	}
   605  
   606  	// directive text without line ending (which may be "\r\n" if Windows),
   607  	s.startLit()
   608  	s.skipLine(r)
   609  	text := s.stopLit()
   610  	if i := len(text) - 1; i >= 0 && text[i] == '\r' {
   611  		text = text[:i]
   612  	}
   613  
   614  	s.pragh(s.line, s.col+2, prefix+string(text)) // +2 since directive text starts after //
   615  }
   616  
   617  func (s *scanner) fullComment() {
   618  	for {
   619  		r := s.getr()
   620  		for r == '*' {
   621  			r = s.getr()
   622  			if r == '/' {
   623  				return
   624  			}
   625  		}
   626  		if r < 0 {
   627  			s.errh(s.line, s.col, "comment not terminated")
   628  			return
   629  		}
   630  	}
   631  }
   632  
   633  func (s *scanner) escape(quote rune) bool {
   634  	var n int
   635  	var base, max uint32
   636  
   637  	c := s.getr()
   638  	switch c {
   639  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   640  		return true
   641  	case '0', '1', '2', '3', '4', '5', '6', '7':
   642  		n, base, max = 3, 8, 255
   643  	case 'x':
   644  		c = s.getr()
   645  		n, base, max = 2, 16, 255
   646  	case 'u':
   647  		c = s.getr()
   648  		n, base, max = 4, 16, unicode.MaxRune
   649  	case 'U':
   650  		c = s.getr()
   651  		n, base, max = 8, 16, unicode.MaxRune
   652  	default:
   653  		if c < 0 {
   654  			return true // complain in caller about EOF
   655  		}
   656  		s.error("unknown escape sequence")
   657  		return false
   658  	}
   659  
   660  	var x uint32
   661  	for i := n; i > 0; i-- {
   662  		d := base
   663  		switch {
   664  		case isDigit(c):
   665  			d = uint32(c) - '0'
   666  		case 'a' <= c && c <= 'f':
   667  			d = uint32(c) - ('a' - 10)
   668  		case 'A' <= c && c <= 'F':
   669  			d = uint32(c) - ('A' - 10)
   670  		}
   671  		if d >= base {
   672  			if c < 0 {
   673  				return true // complain in caller about EOF
   674  			}
   675  			kind := "hex"
   676  			if base == 8 {
   677  				kind = "octal"
   678  			}
   679  			s.error(fmt.Sprintf("non-%s character in escape sequence: %c", kind, c))
   680  			s.ungetr()
   681  			return false
   682  		}
   683  		// d < base
   684  		x = x*base + d
   685  		c = s.getr()
   686  	}
   687  	s.ungetr()
   688  
   689  	if x > max && base == 8 {
   690  		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
   691  		return false
   692  	}
   693  
   694  	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
   695  		s.error("escape sequence is invalid Unicode code point")
   696  		return false
   697  	}
   698  
   699  	return true
   700  }