github.com/riscv/riscv-go@v0.0.0-20200123204226-124ebd6fcc8e/src/cmd/compile/internal/syntax/scanner.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file implements scanner, a lexical tokenizer for
     6  // Go source. After initialization, consecutive calls of
     7  // next advance the scanner one token at a time.
     8  //
     9  // This file, source.go, and tokens.go are self-contained
    10  // (go tool compile scanner.go source.go tokens.go compiles)
    11  // and thus could be made into its own package.
    12  
    13  package syntax
    14  
    15  import (
    16  	"fmt"
    17  	"io"
    18  	"unicode"
    19  	"unicode/utf8"
    20  )
    21  
    22  type scanner struct {
    23  	source
    24  	pragh  func(line, col uint, msg string)
    25  	nlsemi bool // if set '\n' and EOF translate to ';'
    26  
    27  	// current token, valid after calling next()
    28  	line, col uint
    29  	tok       token
    30  	lit       string   // valid if tok is _Name or _Literal
    31  	kind      LitKind  // valid if tok is _Literal
    32  	op        Operator // valid if tok is _Operator, _AssignOp, or _IncOp
    33  	prec      int      // valid if tok is _Operator, _AssignOp, or _IncOp
    34  }
    35  
    36  func (s *scanner) init(src io.Reader, errh, pragh func(line, col uint, msg string)) {
    37  	s.source.init(src, errh)
    38  	s.pragh = pragh
    39  	s.nlsemi = false
    40  }
    41  
    42  // next advances the scanner by reading the next token.
    43  //
    44  // If a read, source encoding, or lexical error occurs, next
    45  // calls the error handler installed with init. The handler
    46  // must exist.
    47  //
    48  // If a //line or //go: directive is encountered, next
    49  // calls the pragma handler installed with init, if not nil.
    50  //
    51  // The (line, col) position passed to the error and pragma
    52  // handler is always at or after the current source reading
    53  // position.
    54  func (s *scanner) next() {
    55  	nlsemi := s.nlsemi
    56  	s.nlsemi = false
    57  
    58  redo:
    59  	// skip white space
    60  	c := s.getr()
    61  	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
    62  		c = s.getr()
    63  	}
    64  
    65  	// token start
    66  	s.line, s.col = s.source.line0, s.source.col0
    67  
    68  	if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) {
    69  		s.ident()
    70  		return
    71  	}
    72  
    73  	switch c {
    74  	case -1:
    75  		if nlsemi {
    76  			s.tok = _Semi
    77  			break
    78  		}
    79  		s.tok = _EOF
    80  
    81  	case '\n':
    82  		s.tok = _Semi
    83  
    84  	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
    85  		s.number(c)
    86  
    87  	case '"':
    88  		s.stdString()
    89  
    90  	case '`':
    91  		s.rawString()
    92  
    93  	case '\'':
    94  		s.rune()
    95  
    96  	case '(':
    97  		s.tok = _Lparen
    98  
    99  	case '[':
   100  		s.tok = _Lbrack
   101  
   102  	case '{':
   103  		s.tok = _Lbrace
   104  
   105  	case ',':
   106  		s.tok = _Comma
   107  
   108  	case ';':
   109  		s.tok = _Semi
   110  
   111  	case ')':
   112  		s.nlsemi = true
   113  		s.tok = _Rparen
   114  
   115  	case ']':
   116  		s.nlsemi = true
   117  		s.tok = _Rbrack
   118  
   119  	case '}':
   120  		s.nlsemi = true
   121  		s.tok = _Rbrace
   122  
   123  	case ':':
   124  		if s.getr() == '=' {
   125  			s.tok = _Define
   126  			break
   127  		}
   128  		s.ungetr()
   129  		s.tok = _Colon
   130  
   131  	case '.':
   132  		c = s.getr()
   133  		if isDigit(c) {
   134  			s.ungetr2()
   135  			s.number('.')
   136  			break
   137  		}
   138  		if c == '.' {
   139  			c = s.getr()
   140  			if c == '.' {
   141  				s.tok = _DotDotDot
   142  				break
   143  			}
   144  			s.ungetr2()
   145  		}
   146  		s.ungetr()
   147  		s.tok = _Dot
   148  
   149  	case '+':
   150  		s.op, s.prec = Add, precAdd
   151  		c = s.getr()
   152  		if c != '+' {
   153  			goto assignop
   154  		}
   155  		s.nlsemi = true
   156  		s.tok = _IncOp
   157  
   158  	case '-':
   159  		s.op, s.prec = Sub, precAdd
   160  		c = s.getr()
   161  		if c != '-' {
   162  			goto assignop
   163  		}
   164  		s.nlsemi = true
   165  		s.tok = _IncOp
   166  
   167  	case '*':
   168  		s.op, s.prec = Mul, precMul
   169  		// don't goto assignop - want _Star token
   170  		if s.getr() == '=' {
   171  			s.tok = _AssignOp
   172  			break
   173  		}
   174  		s.ungetr()
   175  		s.tok = _Star
   176  
   177  	case '/':
   178  		c = s.getr()
   179  		if c == '/' {
   180  			s.lineComment()
   181  			goto redo
   182  		}
   183  		if c == '*' {
   184  			s.fullComment()
   185  			if s.source.line > s.line && nlsemi {
   186  				// A multi-line comment acts like a newline;
   187  				// it translates to a ';' if nlsemi is set.
   188  				s.tok = _Semi
   189  				break
   190  			}
   191  			goto redo
   192  		}
   193  		s.op, s.prec = Div, precMul
   194  		goto assignop
   195  
   196  	case '%':
   197  		s.op, s.prec = Rem, precMul
   198  		c = s.getr()
   199  		goto assignop
   200  
   201  	case '&':
   202  		c = s.getr()
   203  		if c == '&' {
   204  			s.op, s.prec = AndAnd, precAndAnd
   205  			s.tok = _Operator
   206  			break
   207  		}
   208  		s.op, s.prec = And, precMul
   209  		if c == '^' {
   210  			s.op = AndNot
   211  			c = s.getr()
   212  		}
   213  		goto assignop
   214  
   215  	case '|':
   216  		c = s.getr()
   217  		if c == '|' {
   218  			s.op, s.prec = OrOr, precOrOr
   219  			s.tok = _Operator
   220  			break
   221  		}
   222  		s.op, s.prec = Or, precAdd
   223  		goto assignop
   224  
   225  	case '~':
   226  		s.error("bitwise complement operator is ^")
   227  		fallthrough
   228  
   229  	case '^':
   230  		s.op, s.prec = Xor, precAdd
   231  		c = s.getr()
   232  		goto assignop
   233  
   234  	case '<':
   235  		c = s.getr()
   236  		if c == '=' {
   237  			s.op, s.prec = Leq, precCmp
   238  			s.tok = _Operator
   239  			break
   240  		}
   241  		if c == '<' {
   242  			s.op, s.prec = Shl, precMul
   243  			c = s.getr()
   244  			goto assignop
   245  		}
   246  		if c == '-' {
   247  			s.tok = _Arrow
   248  			break
   249  		}
   250  		s.ungetr()
   251  		s.op, s.prec = Lss, precCmp
   252  		s.tok = _Operator
   253  
   254  	case '>':
   255  		c = s.getr()
   256  		if c == '=' {
   257  			s.op, s.prec = Geq, precCmp
   258  			s.tok = _Operator
   259  			break
   260  		}
   261  		if c == '>' {
   262  			s.op, s.prec = Shr, precMul
   263  			c = s.getr()
   264  			goto assignop
   265  		}
   266  		s.ungetr()
   267  		s.op, s.prec = Gtr, precCmp
   268  		s.tok = _Operator
   269  
   270  	case '=':
   271  		if s.getr() == '=' {
   272  			s.op, s.prec = Eql, precCmp
   273  			s.tok = _Operator
   274  			break
   275  		}
   276  		s.ungetr()
   277  		s.tok = _Assign
   278  
   279  	case '!':
   280  		if s.getr() == '=' {
   281  			s.op, s.prec = Neq, precCmp
   282  			s.tok = _Operator
   283  			break
   284  		}
   285  		s.ungetr()
   286  		s.op, s.prec = Not, 0
   287  		s.tok = _Operator
   288  
   289  	default:
   290  		s.tok = 0
   291  		s.error(fmt.Sprintf("invalid character %#U", c))
   292  		goto redo
   293  	}
   294  
   295  	return
   296  
   297  assignop:
   298  	if c == '=' {
   299  		s.tok = _AssignOp
   300  		return
   301  	}
   302  	s.ungetr()
   303  	s.tok = _Operator
   304  }
   305  
   306  func isLetter(c rune) bool {
   307  	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
   308  }
   309  
   310  func isDigit(c rune) bool {
   311  	return '0' <= c && c <= '9'
   312  }
   313  
   314  func (s *scanner) ident() {
   315  	s.startLit()
   316  
   317  	// accelerate common case (7bit ASCII)
   318  	c := s.getr()
   319  	for isLetter(c) || isDigit(c) {
   320  		c = s.getr()
   321  	}
   322  
   323  	// general case
   324  	if c >= utf8.RuneSelf {
   325  		for s.isIdentRune(c, false) {
   326  			c = s.getr()
   327  		}
   328  	}
   329  	s.ungetr()
   330  
   331  	lit := s.stopLit()
   332  
   333  	// possibly a keyword
   334  	if len(lit) >= 2 {
   335  		if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) {
   336  			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
   337  			s.tok = tok
   338  			return
   339  		}
   340  	}
   341  
   342  	s.nlsemi = true
   343  	s.lit = string(lit)
   344  	s.tok = _Name
   345  }
   346  
   347  func (s *scanner) isIdentRune(c rune, first bool) bool {
   348  	switch {
   349  	case unicode.IsLetter(c) || c == '_':
   350  		// ok
   351  	case unicode.IsDigit(c):
   352  		if first {
   353  			s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
   354  		}
   355  	case c >= utf8.RuneSelf:
   356  		s.error(fmt.Sprintf("invalid identifier character %#U", c))
   357  	default:
   358  		return false
   359  	}
   360  	return true
   361  }
   362  
   363  // hash is a perfect hash function for keywords.
   364  // It assumes that s has at least length 2.
   365  func hash(s []byte) uint {
   366  	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
   367  }
   368  
   369  var keywordMap [1 << 6]token // size must be power of two
   370  
   371  func init() {
   372  	// populate keywordMap
   373  	for tok := _Break; tok <= _Var; tok++ {
   374  		h := hash([]byte(tokstrings[tok]))
   375  		if keywordMap[h] != 0 {
   376  			panic("imperfect hash")
   377  		}
   378  		keywordMap[h] = tok
   379  	}
   380  }
   381  
   382  func (s *scanner) number(c rune) {
   383  	s.startLit()
   384  
   385  	if c != '.' {
   386  		s.kind = IntLit // until proven otherwise
   387  		if c == '0' {
   388  			c = s.getr()
   389  			if c == 'x' || c == 'X' {
   390  				// hex
   391  				c = s.getr()
   392  				hasDigit := false
   393  				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   394  					c = s.getr()
   395  					hasDigit = true
   396  				}
   397  				if !hasDigit {
   398  					s.error("malformed hex constant")
   399  				}
   400  				goto done
   401  			}
   402  
   403  			// decimal 0, octal, or float
   404  			has8or9 := false
   405  			for isDigit(c) {
   406  				if c > '7' {
   407  					has8or9 = true
   408  				}
   409  				c = s.getr()
   410  			}
   411  			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
   412  				// octal
   413  				if has8or9 {
   414  					s.error("malformed octal constant")
   415  				}
   416  				goto done
   417  			}
   418  
   419  		} else {
   420  			// decimal or float
   421  			for isDigit(c) {
   422  				c = s.getr()
   423  			}
   424  		}
   425  	}
   426  
   427  	// float
   428  	if c == '.' {
   429  		s.kind = FloatLit
   430  		c = s.getr()
   431  		for isDigit(c) {
   432  			c = s.getr()
   433  		}
   434  	}
   435  
   436  	// exponent
   437  	if c == 'e' || c == 'E' {
   438  		s.kind = FloatLit
   439  		c = s.getr()
   440  		if c == '-' || c == '+' {
   441  			c = s.getr()
   442  		}
   443  		if !isDigit(c) {
   444  			s.error("malformed floating-point constant exponent")
   445  		}
   446  		for isDigit(c) {
   447  			c = s.getr()
   448  		}
   449  	}
   450  
   451  	// complex
   452  	if c == 'i' {
   453  		s.kind = ImagLit
   454  		s.getr()
   455  	}
   456  
   457  done:
   458  	s.ungetr()
   459  	s.nlsemi = true
   460  	s.lit = string(s.stopLit())
   461  	s.tok = _Literal
   462  }
   463  
   464  func (s *scanner) stdString() {
   465  	s.startLit()
   466  
   467  	for {
   468  		r := s.getr()
   469  		if r == '"' {
   470  			break
   471  		}
   472  		if r == '\\' {
   473  			s.escape('"')
   474  			continue
   475  		}
   476  		if r == '\n' {
   477  			s.ungetr() // assume newline is not part of literal
   478  			s.error("newline in string")
   479  			break
   480  		}
   481  		if r < 0 {
   482  			s.errh(s.line, s.col, "string not terminated")
   483  			break
   484  		}
   485  	}
   486  
   487  	s.nlsemi = true
   488  	s.lit = string(s.stopLit())
   489  	s.kind = StringLit
   490  	s.tok = _Literal
   491  }
   492  
   493  func (s *scanner) rawString() {
   494  	s.startLit()
   495  
   496  	for {
   497  		r := s.getr()
   498  		if r == '`' {
   499  			break
   500  		}
   501  		if r < 0 {
   502  			s.errh(s.line, s.col, "string not terminated")
   503  			break
   504  		}
   505  	}
   506  	// We leave CRs in the string since they are part of the
   507  	// literal (even though they are not part of the literal
   508  	// value).
   509  
   510  	s.nlsemi = true
   511  	s.lit = string(s.stopLit())
   512  	s.kind = StringLit
   513  	s.tok = _Literal
   514  }
   515  
   516  func (s *scanner) rune() {
   517  	s.startLit()
   518  
   519  	r := s.getr()
   520  	ok := false
   521  	if r == '\'' {
   522  		s.error("empty character literal or unescaped ' in character literal")
   523  	} else if r == '\n' {
   524  		s.ungetr() // assume newline is not part of literal
   525  		s.error("newline in character literal")
   526  	} else {
   527  		ok = true
   528  		if r == '\\' {
   529  			ok = s.escape('\'')
   530  		}
   531  	}
   532  
   533  	r = s.getr()
   534  	if r != '\'' {
   535  		// only report error if we're ok so far
   536  		if ok {
   537  			s.error("missing '")
   538  		}
   539  		s.ungetr()
   540  	}
   541  
   542  	s.nlsemi = true
   543  	s.lit = string(s.stopLit())
   544  	s.kind = RuneLit
   545  	s.tok = _Literal
   546  }
   547  
   548  func (s *scanner) skipLine(r rune) {
   549  	for r >= 0 {
   550  		if r == '\n' {
   551  			s.ungetr() // don't consume '\n' - needed for nlsemi logic
   552  			break
   553  		}
   554  		r = s.getr()
   555  	}
   556  }
   557  
   558  func (s *scanner) lineComment() {
   559  	r := s.getr()
   560  	if s.pragh == nil || (r != 'g' && r != 'l') {
   561  		s.skipLine(r)
   562  		return
   563  	}
   564  	// s.pragh != nil && (r == 'g' || r == 'l')
   565  
   566  	// recognize pragmas
   567  	prefix := "go:"
   568  	if r == 'l' {
   569  		prefix = "line "
   570  	}
   571  	for _, m := range prefix {
   572  		if r != m {
   573  			s.skipLine(r)
   574  			return
   575  		}
   576  		r = s.getr()
   577  	}
   578  
   579  	// pragma text without line ending (which may be "\r\n" if Windows),
   580  	s.startLit()
   581  	s.skipLine(r)
   582  	text := s.stopLit()
   583  	if i := len(text) - 1; i >= 0 && text[i] == '\r' {
   584  		text = text[:i]
   585  	}
   586  
   587  	s.pragh(s.line, s.col+2, prefix+string(text)) // +2 since pragma text starts after //
   588  }
   589  
   590  func (s *scanner) fullComment() {
   591  	for {
   592  		r := s.getr()
   593  		for r == '*' {
   594  			r = s.getr()
   595  			if r == '/' {
   596  				return
   597  			}
   598  		}
   599  		if r < 0 {
   600  			s.errh(s.line, s.col, "comment not terminated")
   601  			return
   602  		}
   603  	}
   604  }
   605  
   606  func (s *scanner) escape(quote rune) bool {
   607  	var n int
   608  	var base, max uint32
   609  
   610  	c := s.getr()
   611  	switch c {
   612  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   613  		return true
   614  	case '0', '1', '2', '3', '4', '5', '6', '7':
   615  		n, base, max = 3, 8, 255
   616  	case 'x':
   617  		c = s.getr()
   618  		n, base, max = 2, 16, 255
   619  	case 'u':
   620  		c = s.getr()
   621  		n, base, max = 4, 16, unicode.MaxRune
   622  	case 'U':
   623  		c = s.getr()
   624  		n, base, max = 8, 16, unicode.MaxRune
   625  	default:
   626  		if c < 0 {
   627  			return true // complain in caller about EOF
   628  		}
   629  		s.error("unknown escape sequence")
   630  		return false
   631  	}
   632  
   633  	var x uint32
   634  	for i := n; i > 0; i-- {
   635  		d := base
   636  		switch {
   637  		case isDigit(c):
   638  			d = uint32(c) - '0'
   639  		case 'a' <= c && c <= 'f':
   640  			d = uint32(c) - ('a' - 10)
   641  		case 'A' <= c && c <= 'F':
   642  			d = uint32(c) - ('A' - 10)
   643  		}
   644  		if d >= base {
   645  			if c < 0 {
   646  				return true // complain in caller about EOF
   647  			}
   648  			kind := "hex"
   649  			if base == 8 {
   650  				kind = "octal"
   651  			}
   652  			s.error(fmt.Sprintf("non-%s character in escape sequence: %c", kind, c))
   653  			s.ungetr()
   654  			return false
   655  		}
   656  		// d < base
   657  		x = x*base + d
   658  		c = s.getr()
   659  	}
   660  	s.ungetr()
   661  
   662  	if x > max && base == 8 {
   663  		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
   664  		return false
   665  	}
   666  
   667  	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
   668  		s.error("escape sequence is invalid Unicode code point")
   669  		return false
   670  	}
   671  
   672  	return true
   673  }