github.com/corona10/go@v0.0.0-20180224231303-7a218942be57/src/cmd/compile/internal/syntax/scanner.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file implements scanner, a lexical tokenizer for
     6  // Go source. After initialization, consecutive calls of
     7  // next advance the scanner one token at a time.
     8  //
     9  // This file, source.go, and tokens.go are self-contained
    10  // (go tool compile scanner.go source.go tokens.go compiles)
    11  // and thus could be made into its own package.
    12  
    13  package syntax
    14  
    15  import (
    16  	"fmt"
    17  	"io"
    18  	"unicode"
    19  	"unicode/utf8"
    20  )
    21  
    22  // The mode flags below control which comments are reported
    23  // by calling the error handler. If no flag is set, comments
    24  // are ignored.
    25  const (
    26  	comments   uint = 1 << iota // call handler for all comments
    27  	directives                  // call handler for directives only
    28  )
    29  
    30  type scanner struct {
    31  	source
    32  	mode   uint
    33  	nlsemi bool // if set '\n' and EOF translate to ';'
    34  
    35  	// current token, valid after calling next()
    36  	line, col uint
    37  	tok       token
    38  	lit       string   // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF")
    39  	kind      LitKind  // valid if tok is _Literal
    40  	op        Operator // valid if tok is _Operator, _AssignOp, or _IncOp
    41  	prec      int      // valid if tok is _Operator, _AssignOp, or _IncOp
    42  }
    43  
    44  func (s *scanner) init(src io.Reader, errh func(line, col uint, msg string), mode uint) {
    45  	s.source.init(src, errh)
    46  	s.mode = mode
    47  	s.nlsemi = false
    48  }
    49  
    50  // next advances the scanner by reading the next token.
    51  //
    52  // If a read, source encoding, or lexical error occurs, next calls
    53  // the installed error handler with the respective error position
    54  // and message. The error message is guaranteed to be non-empty and
    55  // never starts with a '/'. The error handler must exist.
    56  //
    57  // If the scanner mode includes the comments flag and a comment
    58  // (including comments containing directives) is encountered, the
    59  // error handler is also called with each comment position and text
    60  // (including opening /* or // and closing */, but without a newline
    61  // at the end of line comments). Comment text always starts with a /
    62  // which can be used to distinguish these handler calls from errors.
    63  //
    64  // If the scanner mode includes the directives (but not the comments)
    65  // flag, only comments containing a //line, /*line, or //go: directive
    66  // are reported, in the same way as regular comments. Directives in
    67  // //-style comments are only recognized if they are at the beginning
    68  // of a line.
    69  //
    70  func (s *scanner) next() {
    71  	nlsemi := s.nlsemi
    72  	s.nlsemi = false
    73  
    74  redo:
    75  	// skip white space
    76  	c := s.getr()
    77  	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
    78  		c = s.getr()
    79  	}
    80  
    81  	// token start
    82  	s.line, s.col = s.source.line0, s.source.col0
    83  
    84  	if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) {
    85  		s.ident()
    86  		return
    87  	}
    88  
    89  	switch c {
    90  	case -1:
    91  		if nlsemi {
    92  			s.lit = "EOF"
    93  			s.tok = _Semi
    94  			break
    95  		}
    96  		s.tok = _EOF
    97  
    98  	case '\n':
    99  		s.lit = "newline"
   100  		s.tok = _Semi
   101  
   102  	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   103  		s.number(c)
   104  
   105  	case '"':
   106  		s.stdString()
   107  
   108  	case '`':
   109  		s.rawString()
   110  
   111  	case '\'':
   112  		s.rune()
   113  
   114  	case '(':
   115  		s.tok = _Lparen
   116  
   117  	case '[':
   118  		s.tok = _Lbrack
   119  
   120  	case '{':
   121  		s.tok = _Lbrace
   122  
   123  	case ',':
   124  		s.tok = _Comma
   125  
   126  	case ';':
   127  		s.lit = "semicolon"
   128  		s.tok = _Semi
   129  
   130  	case ')':
   131  		s.nlsemi = true
   132  		s.tok = _Rparen
   133  
   134  	case ']':
   135  		s.nlsemi = true
   136  		s.tok = _Rbrack
   137  
   138  	case '}':
   139  		s.nlsemi = true
   140  		s.tok = _Rbrace
   141  
   142  	case ':':
   143  		if s.getr() == '=' {
   144  			s.tok = _Define
   145  			break
   146  		}
   147  		s.ungetr()
   148  		s.tok = _Colon
   149  
   150  	case '.':
   151  		c = s.getr()
   152  		if isDigit(c) {
   153  			s.ungetr2()
   154  			s.number('.')
   155  			break
   156  		}
   157  		if c == '.' {
   158  			c = s.getr()
   159  			if c == '.' {
   160  				s.tok = _DotDotDot
   161  				break
   162  			}
   163  			s.ungetr2()
   164  		}
   165  		s.ungetr()
   166  		s.tok = _Dot
   167  
   168  	case '+':
   169  		s.op, s.prec = Add, precAdd
   170  		c = s.getr()
   171  		if c != '+' {
   172  			goto assignop
   173  		}
   174  		s.nlsemi = true
   175  		s.tok = _IncOp
   176  
   177  	case '-':
   178  		s.op, s.prec = Sub, precAdd
   179  		c = s.getr()
   180  		if c != '-' {
   181  			goto assignop
   182  		}
   183  		s.nlsemi = true
   184  		s.tok = _IncOp
   185  
   186  	case '*':
   187  		s.op, s.prec = Mul, precMul
   188  		// don't goto assignop - want _Star token
   189  		if s.getr() == '=' {
   190  			s.tok = _AssignOp
   191  			break
   192  		}
   193  		s.ungetr()
   194  		s.tok = _Star
   195  
   196  	case '/':
   197  		c = s.getr()
   198  		if c == '/' {
   199  			s.lineComment()
   200  			goto redo
   201  		}
   202  		if c == '*' {
   203  			s.fullComment()
   204  			if s.source.line > s.line && nlsemi {
   205  				// A multi-line comment acts like a newline;
   206  				// it translates to a ';' if nlsemi is set.
   207  				s.lit = "newline"
   208  				s.tok = _Semi
   209  				break
   210  			}
   211  			goto redo
   212  		}
   213  		s.op, s.prec = Div, precMul
   214  		goto assignop
   215  
   216  	case '%':
   217  		s.op, s.prec = Rem, precMul
   218  		c = s.getr()
   219  		goto assignop
   220  
   221  	case '&':
   222  		c = s.getr()
   223  		if c == '&' {
   224  			s.op, s.prec = AndAnd, precAndAnd
   225  			s.tok = _Operator
   226  			break
   227  		}
   228  		s.op, s.prec = And, precMul
   229  		if c == '^' {
   230  			s.op = AndNot
   231  			c = s.getr()
   232  		}
   233  		goto assignop
   234  
   235  	case '|':
   236  		c = s.getr()
   237  		if c == '|' {
   238  			s.op, s.prec = OrOr, precOrOr
   239  			s.tok = _Operator
   240  			break
   241  		}
   242  		s.op, s.prec = Or, precAdd
   243  		goto assignop
   244  
   245  	case '^':
   246  		s.op, s.prec = Xor, precAdd
   247  		c = s.getr()
   248  		goto assignop
   249  
   250  	case '<':
   251  		c = s.getr()
   252  		if c == '=' {
   253  			s.op, s.prec = Leq, precCmp
   254  			s.tok = _Operator
   255  			break
   256  		}
   257  		if c == '<' {
   258  			s.op, s.prec = Shl, precMul
   259  			c = s.getr()
   260  			goto assignop
   261  		}
   262  		if c == '-' {
   263  			s.tok = _Arrow
   264  			break
   265  		}
   266  		s.ungetr()
   267  		s.op, s.prec = Lss, precCmp
   268  		s.tok = _Operator
   269  
   270  	case '>':
   271  		c = s.getr()
   272  		if c == '=' {
   273  			s.op, s.prec = Geq, precCmp
   274  			s.tok = _Operator
   275  			break
   276  		}
   277  		if c == '>' {
   278  			s.op, s.prec = Shr, precMul
   279  			c = s.getr()
   280  			goto assignop
   281  		}
   282  		s.ungetr()
   283  		s.op, s.prec = Gtr, precCmp
   284  		s.tok = _Operator
   285  
   286  	case '=':
   287  		if s.getr() == '=' {
   288  			s.op, s.prec = Eql, precCmp
   289  			s.tok = _Operator
   290  			break
   291  		}
   292  		s.ungetr()
   293  		s.tok = _Assign
   294  
   295  	case '!':
   296  		if s.getr() == '=' {
   297  			s.op, s.prec = Neq, precCmp
   298  			s.tok = _Operator
   299  			break
   300  		}
   301  		s.ungetr()
   302  		s.op, s.prec = Not, 0
   303  		s.tok = _Operator
   304  
   305  	default:
   306  		s.tok = 0
   307  		s.error(fmt.Sprintf("invalid character %#U", c))
   308  		goto redo
   309  	}
   310  
   311  	return
   312  
   313  assignop:
   314  	if c == '=' {
   315  		s.tok = _AssignOp
   316  		return
   317  	}
   318  	s.ungetr()
   319  	s.tok = _Operator
   320  }
   321  
   322  func isLetter(c rune) bool {
   323  	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
   324  }
   325  
   326  func isDigit(c rune) bool {
   327  	return '0' <= c && c <= '9'
   328  }
   329  
   330  func (s *scanner) ident() {
   331  	s.startLit()
   332  
   333  	// accelerate common case (7bit ASCII)
   334  	c := s.getr()
   335  	for isLetter(c) || isDigit(c) {
   336  		c = s.getr()
   337  	}
   338  
   339  	// general case
   340  	if c >= utf8.RuneSelf {
   341  		for s.isIdentRune(c, false) {
   342  			c = s.getr()
   343  		}
   344  	}
   345  	s.ungetr()
   346  
   347  	lit := s.stopLit()
   348  
   349  	// possibly a keyword
   350  	if len(lit) >= 2 {
   351  		if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) {
   352  			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
   353  			s.tok = tok
   354  			return
   355  		}
   356  	}
   357  
   358  	s.nlsemi = true
   359  	s.lit = string(lit)
   360  	s.tok = _Name
   361  }
   362  
   363  // tokStrFast is a faster version of token.String, which assumes that tok
   364  // is one of the valid tokens - and can thus skip bounds checks.
   365  func tokStrFast(tok token) string {
   366  	return _token_name[_token_index[tok-1]:_token_index[tok]]
   367  }
   368  
   369  func (s *scanner) isIdentRune(c rune, first bool) bool {
   370  	switch {
   371  	case unicode.IsLetter(c) || c == '_':
   372  		// ok
   373  	case unicode.IsDigit(c):
   374  		if first {
   375  			s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
   376  		}
   377  	case c >= utf8.RuneSelf:
   378  		s.error(fmt.Sprintf("invalid identifier character %#U", c))
   379  	default:
   380  		return false
   381  	}
   382  	return true
   383  }
   384  
   385  // hash is a perfect hash function for keywords.
   386  // It assumes that s has at least length 2.
   387  func hash(s []byte) uint {
   388  	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
   389  }
   390  
   391  var keywordMap [1 << 6]token // size must be power of two
   392  
   393  func init() {
   394  	// populate keywordMap
   395  	for tok := _Break; tok <= _Var; tok++ {
   396  		h := hash([]byte(tok.String()))
   397  		if keywordMap[h] != 0 {
   398  			panic("imperfect hash")
   399  		}
   400  		keywordMap[h] = tok
   401  	}
   402  }
   403  
   404  func (s *scanner) number(c rune) {
   405  	s.startLit()
   406  
   407  	if c != '.' {
   408  		s.kind = IntLit // until proven otherwise
   409  		if c == '0' {
   410  			c = s.getr()
   411  			if c == 'x' || c == 'X' {
   412  				// hex
   413  				c = s.getr()
   414  				hasDigit := false
   415  				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   416  					c = s.getr()
   417  					hasDigit = true
   418  				}
   419  				if !hasDigit {
   420  					s.error("malformed hex constant")
   421  				}
   422  				goto done
   423  			}
   424  
   425  			// decimal 0, octal, or float
   426  			has8or9 := false
   427  			for isDigit(c) {
   428  				if c > '7' {
   429  					has8or9 = true
   430  				}
   431  				c = s.getr()
   432  			}
   433  			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
   434  				// octal
   435  				if has8or9 {
   436  					s.error("malformed octal constant")
   437  				}
   438  				goto done
   439  			}
   440  
   441  		} else {
   442  			// decimal or float
   443  			for isDigit(c) {
   444  				c = s.getr()
   445  			}
   446  		}
   447  	}
   448  
   449  	// float
   450  	if c == '.' {
   451  		s.kind = FloatLit
   452  		c = s.getr()
   453  		for isDigit(c) {
   454  			c = s.getr()
   455  		}
   456  	}
   457  
   458  	// exponent
   459  	if c == 'e' || c == 'E' {
   460  		s.kind = FloatLit
   461  		c = s.getr()
   462  		if c == '-' || c == '+' {
   463  			c = s.getr()
   464  		}
   465  		if !isDigit(c) {
   466  			s.error("malformed floating-point constant exponent")
   467  		}
   468  		for isDigit(c) {
   469  			c = s.getr()
   470  		}
   471  	}
   472  
   473  	// complex
   474  	if c == 'i' {
   475  		s.kind = ImagLit
   476  		s.getr()
   477  	}
   478  
   479  done:
   480  	s.ungetr()
   481  	s.nlsemi = true
   482  	s.lit = string(s.stopLit())
   483  	s.tok = _Literal
   484  }
   485  
   486  func (s *scanner) rune() {
   487  	s.startLit()
   488  
   489  	ok := true // only report errors if we're ok so far
   490  	n := 0
   491  	for ; ; n++ {
   492  		r := s.getr()
   493  		if r == '\'' {
   494  			break
   495  		}
   496  		if r == '\\' {
   497  			if !s.escape('\'') {
   498  				ok = false
   499  			}
   500  			continue
   501  		}
   502  		if r == '\n' {
   503  			s.ungetr() // assume newline is not part of literal
   504  			if ok {
   505  				s.error("newline in character literal")
   506  				ok = false
   507  			}
   508  			break
   509  		}
   510  		if r < 0 {
   511  			if ok {
   512  				s.errh(s.line, s.col, "invalid character literal (missing closing ')")
   513  				ok = false
   514  			}
   515  			break
   516  		}
   517  	}
   518  
   519  	if ok {
   520  		if n == 0 {
   521  			s.error("empty character literal or unescaped ' in character literal")
   522  		} else if n != 1 {
   523  			s.errh(s.line, s.col, "invalid character literal (more than one character)")
   524  		}
   525  	}
   526  
   527  	s.nlsemi = true
   528  	s.lit = string(s.stopLit())
   529  	s.kind = RuneLit
   530  	s.tok = _Literal
   531  }
   532  
   533  func (s *scanner) stdString() {
   534  	s.startLit()
   535  
   536  	for {
   537  		r := s.getr()
   538  		if r == '"' {
   539  			break
   540  		}
   541  		if r == '\\' {
   542  			s.escape('"')
   543  			continue
   544  		}
   545  		if r == '\n' {
   546  			s.ungetr() // assume newline is not part of literal
   547  			s.error("newline in string")
   548  			break
   549  		}
   550  		if r < 0 {
   551  			s.errh(s.line, s.col, "string not terminated")
   552  			break
   553  		}
   554  	}
   555  
   556  	s.nlsemi = true
   557  	s.lit = string(s.stopLit())
   558  	s.kind = StringLit
   559  	s.tok = _Literal
   560  }
   561  
   562  func (s *scanner) rawString() {
   563  	s.startLit()
   564  
   565  	for {
   566  		r := s.getr()
   567  		if r == '`' {
   568  			break
   569  		}
   570  		if r < 0 {
   571  			s.errh(s.line, s.col, "string not terminated")
   572  			break
   573  		}
   574  	}
   575  	// We leave CRs in the string since they are part of the
   576  	// literal (even though they are not part of the literal
   577  	// value).
   578  
   579  	s.nlsemi = true
   580  	s.lit = string(s.stopLit())
   581  	s.kind = StringLit
   582  	s.tok = _Literal
   583  }
   584  
   585  func (s *scanner) comment(text string) {
   586  	s.errh(s.line, s.col, text)
   587  }
   588  
   589  func (s *scanner) skipLine(r rune) {
   590  	for r >= 0 {
   591  		if r == '\n' {
   592  			s.ungetr() // don't consume '\n' - needed for nlsemi logic
   593  			break
   594  		}
   595  		r = s.getr()
   596  	}
   597  }
   598  
   599  func (s *scanner) lineComment() {
   600  	r := s.getr()
   601  
   602  	if s.mode&comments != 0 {
   603  		s.startLit()
   604  		s.skipLine(r)
   605  		s.comment("//" + string(s.stopLit()))
   606  		return
   607  	}
   608  
   609  	// directives must start at the beginning of the line (s.col == colbase)
   610  	if s.mode&directives == 0 || s.col != colbase || (r != 'g' && r != 'l') {
   611  		s.skipLine(r)
   612  		return
   613  	}
   614  
   615  	// recognize go: or line directives
   616  	prefix := "go:"
   617  	if r == 'l' {
   618  		prefix = "line "
   619  	}
   620  	for _, m := range prefix {
   621  		if r != m {
   622  			s.skipLine(r)
   623  			return
   624  		}
   625  		r = s.getr()
   626  	}
   627  
   628  	// directive text
   629  	s.startLit()
   630  	s.skipLine(r)
   631  	s.comment("//" + prefix + string(s.stopLit()))
   632  }
   633  
   634  func (s *scanner) skipComment(r rune) bool {
   635  	for r >= 0 {
   636  		for r == '*' {
   637  			r = s.getr()
   638  			if r == '/' {
   639  				return true
   640  			}
   641  		}
   642  		r = s.getr()
   643  	}
   644  	s.errh(s.line, s.col, "comment not terminated")
   645  	return false
   646  }
   647  
   648  func (s *scanner) fullComment() {
   649  	r := s.getr()
   650  
   651  	if s.mode&comments != 0 {
   652  		s.startLit()
   653  		if s.skipComment(r) {
   654  			s.comment("/*" + string(s.stopLit()))
   655  		} else {
   656  			s.killLit() // not a complete comment - ignore
   657  		}
   658  		return
   659  	}
   660  
   661  	if s.mode&directives == 0 || r != 'l' {
   662  		s.skipComment(r)
   663  		return
   664  	}
   665  
   666  	// recognize line directive
   667  	const prefix = "line "
   668  	for _, m := range prefix {
   669  		if r != m {
   670  			s.skipComment(r)
   671  			return
   672  		}
   673  		r = s.getr()
   674  	}
   675  
   676  	// directive text
   677  	s.startLit()
   678  	if s.skipComment(r) {
   679  		s.comment("/*" + prefix + string(s.stopLit()))
   680  	} else {
   681  		s.killLit() // not a complete comment - ignore
   682  	}
   683  }
   684  
   685  func (s *scanner) escape(quote rune) bool {
   686  	var n int
   687  	var base, max uint32
   688  
   689  	c := s.getr()
   690  	switch c {
   691  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   692  		return true
   693  	case '0', '1', '2', '3', '4', '5', '6', '7':
   694  		n, base, max = 3, 8, 255
   695  	case 'x':
   696  		c = s.getr()
   697  		n, base, max = 2, 16, 255
   698  	case 'u':
   699  		c = s.getr()
   700  		n, base, max = 4, 16, unicode.MaxRune
   701  	case 'U':
   702  		c = s.getr()
   703  		n, base, max = 8, 16, unicode.MaxRune
   704  	default:
   705  		if c < 0 {
   706  			return true // complain in caller about EOF
   707  		}
   708  		s.error("unknown escape sequence")
   709  		return false
   710  	}
   711  
   712  	var x uint32
   713  	for i := n; i > 0; i-- {
   714  		d := base
   715  		switch {
   716  		case isDigit(c):
   717  			d = uint32(c) - '0'
   718  		case 'a' <= c && c <= 'f':
   719  			d = uint32(c) - ('a' - 10)
   720  		case 'A' <= c && c <= 'F':
   721  			d = uint32(c) - ('A' - 10)
   722  		}
   723  		if d >= base {
   724  			if c < 0 {
   725  				return true // complain in caller about EOF
   726  			}
   727  			kind := "hex"
   728  			if base == 8 {
   729  				kind = "octal"
   730  			}
   731  			s.error(fmt.Sprintf("non-%s character in escape sequence: %c", kind, c))
   732  			s.ungetr()
   733  			return false
   734  		}
   735  		// d < base
   736  		x = x*base + d
   737  		c = s.getr()
   738  	}
   739  	s.ungetr()
   740  
   741  	if x > max && base == 8 {
   742  		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
   743  		return false
   744  	}
   745  
   746  	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
   747  		s.error("escape sequence is invalid Unicode code point")
   748  		return false
   749  	}
   750  
   751  	return true
   752  }