github.com/epfl-dcsl/gotee@v0.0.0-20200909122901-014b35f5e5e9/src/cmd/compile/internal/syntax/scanner.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file implements scanner, a lexical tokenizer for
     6  // Go source. After initialization, consecutive calls of
     7  // next advance the scanner one token at a time.
     8  //
     9  // This file, source.go, and tokens.go are self-contained
    10  // (go tool compile scanner.go source.go tokens.go compiles)
    11  // and thus could be made into its own package.
    12  
    13  package syntax
    14  
    15  import (
    16  	"fmt"
    17  	"io"
    18  	"unicode"
    19  	"unicode/utf8"
    20  )
    21  
    22  type scanner struct {
    23  	source
    24  	pragh  func(line, col uint, msg string)
    25  	nlsemi bool // if set '\n' and EOF translate to ';'
    26  
    27  	// current token, valid after calling next()
    28  	line, col uint
    29  	tok       token
    30  	lit       string   // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF")
    31  	kind      LitKind  // valid if tok is _Literal
    32  	op        Operator // valid if tok is _Operator, _AssignOp, or _IncOp
    33  	prec      int      // valid if tok is _Operator, _AssignOp, or _IncOp
    34  }
    35  
    36  func (s *scanner) init(src io.Reader, errh, pragh func(line, col uint, msg string)) {
    37  	s.source.init(src, errh)
    38  	s.pragh = pragh
    39  	s.nlsemi = false
    40  }
    41  
// next advances the scanner by reading the next token.
//
// The result is stored in the scanner's "current token" fields:
// line and col hold the token's start position, tok its class, and
// lit, kind, op, and prec are set as documented on the scanner type.
//
// If a read, source encoding, or lexical error occurs, next
// calls the error handler installed with init. The handler
// must exist.
//
// If a //line or //go: directive is encountered at the start
// of a line, next calls the directive handler pragh installed
// with init, if not nil.
//
// The (line, col) position passed to the error and directive
// handler is always at or after the current source reading
// position.
func (s *scanner) next() {
	// Remember whether the previous token asked for '\n'/EOF to be
	// reported as ';', then clear the request. The cases below (and the
	// literal/identifier helpers) set s.nlsemi again for tokens that
	// may end a statement.
	nlsemi := s.nlsemi
	s.nlsemi = false

redo:
	// skip white space
	// ('\n' is skipped only if it does not have to become a ';')
	c := s.getr()
	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
		c = s.getr()
	}

	// token start
	// NOTE(review): line0/col0 are maintained by source; presumably the
	// position of the character just read - confirm in source.go.
	s.line, s.col = s.source.line0, s.source.col0

	// Identifiers and keywords. ASCII letters take the fast path; other
	// runes are classified (and possibly reported) by isIdentRune.
	if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) {
		s.ident()
		return
	}

	switch c {
	case -1:
		// End of input: yields a final ';' first if one is pending.
		if nlsemi {
			s.lit = "EOF"
			s.tok = _Semi
			break
		}
		s.tok = _EOF

	case '\n':
		// Only reached when nlsemi is set; otherwise the newline was
		// skipped as white space above.
		s.lit = "newline"
		s.tok = _Semi

	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		s.number(c)

	case '"':
		s.stdString()

	case '`':
		s.rawString()

	case '\'':
		s.rune()

	case '(':
		s.tok = _Lparen

	case '[':
		s.tok = _Lbrack

	case '{':
		s.tok = _Lbrace

	case ',':
		s.tok = _Comma

	case ';':
		s.lit = "semicolon"
		s.tok = _Semi

	case ')':
		s.nlsemi = true
		s.tok = _Rparen

	case ']':
		s.nlsemi = true
		s.tok = _Rbrack

	case '}':
		s.nlsemi = true
		s.tok = _Rbrace

	case ':':
		// ":=" or ":"
		if s.getr() == '=' {
			s.tok = _Define
			break
		}
		s.ungetr()
		s.tok = _Colon

	case '.':
		// ".5" (number), "..." or "."
		c = s.getr()
		if isDigit(c) {
			// NOTE(review): ungetr2 is defined in source.go; it is used
			// here where more than one character of lookahead may have
			// to be given back - confirm its exact contract there.
			s.ungetr2()
			s.number('.')
			break
		}
		if c == '.' {
			c = s.getr()
			if c == '.' {
				s.tok = _DotDotDot
				break
			}
			s.ungetr2()
		}
		s.ungetr()
		s.tok = _Dot

	case '+':
		// "++", "+=" or "+"
		s.op, s.prec = Add, precAdd
		c = s.getr()
		if c != '+' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '-':
		// "--", "-=" or "-"
		s.op, s.prec = Sub, precAdd
		c = s.getr()
		if c != '-' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '*':
		s.op, s.prec = Mul, precMul
		// don't goto assignop - want _Star token
		if s.getr() == '=' {
			s.tok = _AssignOp
			break
		}
		s.ungetr()
		s.tok = _Star

	case '/':
		// line comment, general comment, "/=" or "/"
		c = s.getr()
		if c == '/' {
			s.lineComment()
			goto redo
		}
		if c == '*' {
			s.fullComment()
			if s.source.line > s.line && nlsemi {
				// A multi-line comment acts like a newline;
				// it translates to a ';' if nlsemi is set.
				s.lit = "newline"
				s.tok = _Semi
				break
			}
			goto redo
		}
		s.op, s.prec = Div, precMul
		goto assignop

	case '%':
		s.op, s.prec = Rem, precMul
		c = s.getr()
		goto assignop

	case '&':
		// "&&", "&^", "&^=", "&=" or "&"
		c = s.getr()
		if c == '&' {
			s.op, s.prec = AndAnd, precAndAnd
			s.tok = _Operator
			break
		}
		s.op, s.prec = And, precMul
		if c == '^' {
			s.op = AndNot
			c = s.getr()
		}
		goto assignop

	case '|':
		// "||", "|=" or "|"
		c = s.getr()
		if c == '|' {
			s.op, s.prec = OrOr, precOrOr
			s.tok = _Operator
			break
		}
		s.op, s.prec = Or, precAdd
		goto assignop

	case '~':
		// Not a Go operator: report it, then scan it as if it were '^'.
		s.error("bitwise complement operator is ^")
		fallthrough

	case '^':
		s.op, s.prec = Xor, precAdd
		c = s.getr()
		goto assignop

	case '<':
		// "<=", "<<", "<<=", "<-" or "<"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Leq, precCmp
			s.tok = _Operator
			break
		}
		if c == '<' {
			s.op, s.prec = Shl, precMul
			c = s.getr()
			goto assignop
		}
		if c == '-' {
			s.tok = _Arrow
			break
		}
		s.ungetr()
		s.op, s.prec = Lss, precCmp
		s.tok = _Operator

	case '>':
		// ">=", ">>", ">>=" or ">"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Geq, precCmp
			s.tok = _Operator
			break
		}
		if c == '>' {
			s.op, s.prec = Shr, precMul
			c = s.getr()
			goto assignop
		}
		s.ungetr()
		s.op, s.prec = Gtr, precCmp
		s.tok = _Operator

	case '=':
		// "==" or "="
		if s.getr() == '=' {
			s.op, s.prec = Eql, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.tok = _Assign

	case '!':
		// "!=" or "!"
		if s.getr() == '=' {
			s.op, s.prec = Neq, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.op, s.prec = Not, 0
		s.tok = _Operator

	default:
		// Unknown character: report it and scan again from the
		// following character.
		s.tok = 0
		s.error(fmt.Sprintf("invalid character %#U", c))
		goto redo
	}

	return

assignop:
	// Shared tail for binary operators that have an "op=" form. On entry
	// s.op and s.prec are set and c is the character following the
	// operator: '=' completes an assignment operator, anything else is
	// pushed back and the plain operator is returned.
	if c == '=' {
		s.tok = _AssignOp
		return
	}
	s.ungetr()
	s.tok = _Operator
}
   310  
   311  func isLetter(c rune) bool {
   312  	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
   313  }
   314  
   315  func isDigit(c rune) bool {
   316  	return '0' <= c && c <= '9'
   317  }
   318  
   319  func (s *scanner) ident() {
   320  	s.startLit()
   321  
   322  	// accelerate common case (7bit ASCII)
   323  	c := s.getr()
   324  	for isLetter(c) || isDigit(c) {
   325  		c = s.getr()
   326  	}
   327  
   328  	// general case
   329  	if c >= utf8.RuneSelf {
   330  		for s.isIdentRune(c, false) {
   331  			c = s.getr()
   332  		}
   333  	}
   334  	s.ungetr()
   335  
   336  	lit := s.stopLit()
   337  
   338  	// possibly a keyword
   339  	if len(lit) >= 2 {
   340  		if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) {
   341  			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
   342  			s.tok = tok
   343  			return
   344  		}
   345  	}
   346  
   347  	s.nlsemi = true
   348  	s.lit = string(lit)
   349  	s.tok = _Name
   350  }
   351  
   352  func (s *scanner) isIdentRune(c rune, first bool) bool {
   353  	switch {
   354  	case unicode.IsLetter(c) || c == '_':
   355  		// ok
   356  	case unicode.IsDigit(c):
   357  		if first {
   358  			s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
   359  		}
   360  	case c >= utf8.RuneSelf:
   361  		s.error(fmt.Sprintf("invalid identifier character %#U", c))
   362  	default:
   363  		return false
   364  	}
   365  	return true
   366  }
   367  
   368  // hash is a perfect hash function for keywords.
   369  // It assumes that s has at least length 2.
   370  func hash(s []byte) uint {
   371  	if v, ok := hashing[string(s)]; ok {
   372  		return v
   373  	}
   374  	return 0
   375  	//return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
   376  }
   377  
   378  var keywordMap [1 << 6]token // size must be power of two
   379  var hashing map[string]uint
   380  
   381  func init() {
   382  	hashing = make(map[string]uint)
   383  
   384  	// populate keywordMap
   385  	for tok := _Break; tok <= _Var; tok++ {
   386  		//TODO aghosn hack to fix the hash function:
   387  		hashing[tokstrings[tok]] = uint(tok)
   388  		h := hash([]byte(tokstrings[tok]))
   389  		if keywordMap[h] != 0 {
   390  			panic("imperfect hash")
   391  		}
   392  		keywordMap[h] = tok
   393  	}
   394  }
   395  
   396  func (s *scanner) number(c rune) {
   397  	s.startLit()
   398  
   399  	if c != '.' {
   400  		s.kind = IntLit // until proven otherwise
   401  		if c == '0' {
   402  			c = s.getr()
   403  			if c == 'x' || c == 'X' {
   404  				// hex
   405  				c = s.getr()
   406  				hasDigit := false
   407  				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   408  					c = s.getr()
   409  					hasDigit = true
   410  				}
   411  				if !hasDigit {
   412  					s.error("malformed hex constant")
   413  				}
   414  				goto done
   415  			}
   416  
   417  			// decimal 0, octal, or float
   418  			has8or9 := false
   419  			for isDigit(c) {
   420  				if c > '7' {
   421  					has8or9 = true
   422  				}
   423  				c = s.getr()
   424  			}
   425  			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
   426  				// octal
   427  				if has8or9 {
   428  					s.error("malformed octal constant")
   429  				}
   430  				goto done
   431  			}
   432  
   433  		} else {
   434  			// decimal or float
   435  			for isDigit(c) {
   436  				c = s.getr()
   437  			}
   438  		}
   439  	}
   440  
   441  	// float
   442  	if c == '.' {
   443  		s.kind = FloatLit
   444  		c = s.getr()
   445  		for isDigit(c) {
   446  			c = s.getr()
   447  		}
   448  	}
   449  
   450  	// exponent
   451  	if c == 'e' || c == 'E' {
   452  		s.kind = FloatLit
   453  		c = s.getr()
   454  		if c == '-' || c == '+' {
   455  			c = s.getr()
   456  		}
   457  		if !isDigit(c) {
   458  			s.error("malformed floating-point constant exponent")
   459  		}
   460  		for isDigit(c) {
   461  			c = s.getr()
   462  		}
   463  	}
   464  
   465  	// complex
   466  	if c == 'i' {
   467  		s.kind = ImagLit
   468  		s.getr()
   469  	}
   470  
   471  done:
   472  	s.ungetr()
   473  	s.nlsemi = true
   474  	s.lit = string(s.stopLit())
   475  	s.tok = _Literal
   476  }
   477  
   478  func (s *scanner) rune() {
   479  	s.startLit()
   480  
   481  	ok := true // only report errors if we're ok so far
   482  	n := 0
   483  	for ; ; n++ {
   484  		r := s.getr()
   485  		if r == '\'' {
   486  			break
   487  		}
   488  		if r == '\\' {
   489  			if !s.escape('\'') {
   490  				ok = false
   491  			}
   492  			continue
   493  		}
   494  		if r == '\n' {
   495  			s.ungetr() // assume newline is not part of literal
   496  			if ok {
   497  				s.error("newline in character literal")
   498  				ok = false
   499  			}
   500  			break
   501  		}
   502  		if r < 0 {
   503  			if ok {
   504  				s.errh(s.line, s.col, "invalid character literal (missing closing ')")
   505  				ok = false
   506  			}
   507  			break
   508  		}
   509  	}
   510  
   511  	if ok {
   512  		if n == 0 {
   513  			s.error("empty character literal or unescaped ' in character literal")
   514  		} else if n != 1 {
   515  			s.errh(s.line, s.col, "invalid character literal (more than one character)")
   516  		}
   517  	}
   518  
   519  	s.nlsemi = true
   520  	s.lit = string(s.stopLit())
   521  	s.kind = RuneLit
   522  	s.tok = _Literal
   523  }
   524  
   525  func (s *scanner) stdString() {
   526  	s.startLit()
   527  
   528  	for {
   529  		r := s.getr()
   530  		if r == '"' {
   531  			break
   532  		}
   533  		if r == '\\' {
   534  			s.escape('"')
   535  			continue
   536  		}
   537  		if r == '\n' {
   538  			s.ungetr() // assume newline is not part of literal
   539  			s.error("newline in string")
   540  			break
   541  		}
   542  		if r < 0 {
   543  			s.errh(s.line, s.col, "string not terminated")
   544  			break
   545  		}
   546  	}
   547  
   548  	s.nlsemi = true
   549  	s.lit = string(s.stopLit())
   550  	s.kind = StringLit
   551  	s.tok = _Literal
   552  }
   553  
   554  func (s *scanner) rawString() {
   555  	s.startLit()
   556  
   557  	for {
   558  		r := s.getr()
   559  		if r == '`' {
   560  			break
   561  		}
   562  		if r < 0 {
   563  			s.errh(s.line, s.col, "string not terminated")
   564  			break
   565  		}
   566  	}
   567  	// We leave CRs in the string since they are part of the
   568  	// literal (even though they are not part of the literal
   569  	// value).
   570  
   571  	s.nlsemi = true
   572  	s.lit = string(s.stopLit())
   573  	s.kind = StringLit
   574  	s.tok = _Literal
   575  }
   576  
   577  func (s *scanner) skipLine(r rune) {
   578  	for r >= 0 {
   579  		if r == '\n' {
   580  			s.ungetr() // don't consume '\n' - needed for nlsemi logic
   581  			break
   582  		}
   583  		r = s.getr()
   584  	}
   585  }
   586  
   587  func (s *scanner) lineComment() {
   588  	r := s.getr()
   589  	// directives must start at the beginning of the line (s.col == colbase)
   590  	if s.col != colbase || s.pragh == nil || (r != 'g' && r != 'l') {
   591  		s.skipLine(r)
   592  		return
   593  	}
   594  	// s.col == colbase && s.pragh != nil && (r == 'g' || r == 'l')
   595  
   596  	// recognize directives
   597  	prefix := "go:"
   598  	if r == 'l' {
   599  		prefix = "line "
   600  	}
   601  	for _, m := range prefix {
   602  		if r != m {
   603  			s.skipLine(r)
   604  			return
   605  		}
   606  		r = s.getr()
   607  	}
   608  
   609  	// directive text without line ending (which may be "\r\n" if Windows),
   610  	s.startLit()
   611  	s.skipLine(r)
   612  	text := s.stopLit()
   613  	if i := len(text) - 1; i >= 0 && text[i] == '\r' {
   614  		text = text[:i]
   615  	}
   616  
   617  	s.pragh(s.line, s.col+2, prefix+string(text)) // +2 since directive text starts after //
   618  }
   619  
   620  func (s *scanner) fullComment() {
   621  	for {
   622  		r := s.getr()
   623  		for r == '*' {
   624  			r = s.getr()
   625  			if r == '/' {
   626  				return
   627  			}
   628  		}
   629  		if r < 0 {
   630  			s.errh(s.line, s.col, "comment not terminated")
   631  			return
   632  		}
   633  	}
   634  }
   635  
   636  func (s *scanner) escape(quote rune) bool {
   637  	var n int
   638  	var base, max uint32
   639  
   640  	c := s.getr()
   641  	switch c {
   642  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   643  		return true
   644  	case '0', '1', '2', '3', '4', '5', '6', '7':
   645  		n, base, max = 3, 8, 255
   646  	case 'x':
   647  		c = s.getr()
   648  		n, base, max = 2, 16, 255
   649  	case 'u':
   650  		c = s.getr()
   651  		n, base, max = 4, 16, unicode.MaxRune
   652  	case 'U':
   653  		c = s.getr()
   654  		n, base, max = 8, 16, unicode.MaxRune
   655  	default:
   656  		if c < 0 {
   657  			return true // complain in caller about EOF
   658  		}
   659  		s.error("unknown escape sequence")
   660  		return false
   661  	}
   662  
   663  	var x uint32
   664  	for i := n; i > 0; i-- {
   665  		d := base
   666  		switch {
   667  		case isDigit(c):
   668  			d = uint32(c) - '0'
   669  		case 'a' <= c && c <= 'f':
   670  			d = uint32(c) - ('a' - 10)
   671  		case 'A' <= c && c <= 'F':
   672  			d = uint32(c) - ('A' - 10)
   673  		}
   674  		if d >= base {
   675  			if c < 0 {
   676  				return true // complain in caller about EOF
   677  			}
   678  			kind := "hex"
   679  			if base == 8 {
   680  				kind = "octal"
   681  			}
   682  			s.error(fmt.Sprintf("non-%s character in escape sequence: %c", kind, c))
   683  			s.ungetr()
   684  			return false
   685  		}
   686  		// d < base
   687  		x = x*base + d
   688  		c = s.getr()
   689  	}
   690  	s.ungetr()
   691  
   692  	if x > max && base == 8 {
   693  		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
   694  		return false
   695  	}
   696  
   697  	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
   698  		s.error("escape sequence is invalid Unicode code point")
   699  		return false
   700  	}
   701  
   702  	return true
   703  }