github.com/FenixAra/go@v0.0.0-20170127160404-96ea0918e670/src/cmd/compile/internal/syntax/scanner.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"strings"
    11  	"unicode"
    12  	"unicode/utf8"
    13  )
    14  
    15  type scanner struct {
    16  	source
    17  	nlsemi bool // if set '\n' and EOF translate to ';'
    18  	pragma Pragma
    19  
    20  	// current token, valid after calling next()
    21  	pos, line int
    22  	tok       token
    23  	lit       string   // valid if tok is _Name or _Literal
    24  	kind      LitKind  // valid if tok is _Literal
    25  	op        Operator // valid if tok is _Operator, _AssignOp, or _IncOp
    26  	prec      int      // valid if tok is _Operator, _AssignOp, or _IncOp
    27  
    28  	pragh PragmaHandler
    29  }
    30  
    31  func (s *scanner) init(src io.Reader, errh ErrorHandler, pragh PragmaHandler) {
    32  	s.source.init(src, errh)
    33  	s.nlsemi = false
    34  	s.pragh = pragh
    35  }
    36  
    37  func (s *scanner) next() {
    38  	nlsemi := s.nlsemi
    39  	s.nlsemi = false
    40  
    41  redo:
    42  	// skip white space
    43  	c := s.getr()
    44  	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
    45  		c = s.getr()
    46  	}
    47  
    48  	// token start
    49  	s.pos, s.line = s.source.pos0(), s.source.line0
    50  
    51  	if isLetter(c) || c >= utf8.RuneSelf && (unicode.IsLetter(c) || s.isCompatRune(c, true)) {
    52  		s.ident()
    53  		return
    54  	}
    55  
    56  	switch c {
    57  	case -1:
    58  		if nlsemi {
    59  			s.tok = _Semi
    60  			break
    61  		}
    62  		s.tok = _EOF
    63  
    64  	case '\n':
    65  		s.tok = _Semi
    66  
    67  	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
    68  		s.number(c)
    69  
    70  	case '"':
    71  		s.stdString()
    72  
    73  	case '`':
    74  		s.rawString()
    75  
    76  	case '\'':
    77  		s.rune()
    78  
    79  	case '(':
    80  		s.tok = _Lparen
    81  
    82  	case '[':
    83  		s.tok = _Lbrack
    84  
    85  	case '{':
    86  		s.tok = _Lbrace
    87  
    88  	case ',':
    89  		s.tok = _Comma
    90  
    91  	case ';':
    92  		s.tok = _Semi
    93  
    94  	case ')':
    95  		s.nlsemi = true
    96  		s.tok = _Rparen
    97  
    98  	case ']':
    99  		s.nlsemi = true
   100  		s.tok = _Rbrack
   101  
   102  	case '}':
   103  		s.nlsemi = true
   104  		s.tok = _Rbrace
   105  
   106  	case ':':
   107  		if s.getr() == '=' {
   108  			s.tok = _Define
   109  			break
   110  		}
   111  		s.ungetr()
   112  		s.tok = _Colon
   113  
   114  	case '.':
   115  		c = s.getr()
   116  		if isDigit(c) {
   117  			s.ungetr()
   118  			s.source.r0-- // make sure '.' is part of literal (line cannot have changed)
   119  			s.number('.')
   120  			break
   121  		}
   122  		if c == '.' {
   123  			c = s.getr()
   124  			if c == '.' {
   125  				s.tok = _DotDotDot
   126  				break
   127  			}
   128  			s.ungetr()
   129  			s.source.r0-- // make next ungetr work (line cannot have changed)
   130  		}
   131  		s.ungetr()
   132  		s.tok = _Dot
   133  
   134  	case '+':
   135  		s.op, s.prec = Add, precAdd
   136  		c = s.getr()
   137  		if c != '+' {
   138  			goto assignop
   139  		}
   140  		s.nlsemi = true
   141  		s.tok = _IncOp
   142  
   143  	case '-':
   144  		s.op, s.prec = Sub, precAdd
   145  		c = s.getr()
   146  		if c != '-' {
   147  			goto assignop
   148  		}
   149  		s.nlsemi = true
   150  		s.tok = _IncOp
   151  
   152  	case '*':
   153  		s.op, s.prec = Mul, precMul
   154  		// don't goto assignop - want _Star token
   155  		if s.getr() == '=' {
   156  			s.tok = _AssignOp
   157  			break
   158  		}
   159  		s.ungetr()
   160  		s.tok = _Star
   161  
   162  	case '/':
   163  		c = s.getr()
   164  		if c == '/' {
   165  			s.lineComment()
   166  			goto redo
   167  		}
   168  		if c == '*' {
   169  			s.fullComment()
   170  			if s.source.line > s.line && nlsemi {
   171  				// A multi-line comment acts like a newline;
   172  				// it translates to a ';' if nlsemi is set.
   173  				s.tok = _Semi
   174  				break
   175  			}
   176  			goto redo
   177  		}
   178  		s.op, s.prec = Div, precMul
   179  		goto assignop
   180  
   181  	case '%':
   182  		s.op, s.prec = Rem, precMul
   183  		c = s.getr()
   184  		goto assignop
   185  
   186  	case '&':
   187  		c = s.getr()
   188  		if c == '&' {
   189  			s.op, s.prec = AndAnd, precAndAnd
   190  			s.tok = _Operator
   191  			break
   192  		}
   193  		s.op, s.prec = And, precMul
   194  		if c == '^' {
   195  			s.op = AndNot
   196  			c = s.getr()
   197  		}
   198  		goto assignop
   199  
   200  	case '|':
   201  		c = s.getr()
   202  		if c == '|' {
   203  			s.op, s.prec = OrOr, precOrOr
   204  			s.tok = _Operator
   205  			break
   206  		}
   207  		s.op, s.prec = Or, precAdd
   208  		goto assignop
   209  
   210  	case '~':
   211  		s.error("bitwise complement operator is ^")
   212  		fallthrough
   213  
   214  	case '^':
   215  		s.op, s.prec = Xor, precAdd
   216  		c = s.getr()
   217  		goto assignop
   218  
   219  	case '<':
   220  		c = s.getr()
   221  		if c == '=' {
   222  			s.op, s.prec = Leq, precCmp
   223  			s.tok = _Operator
   224  			break
   225  		}
   226  		if c == '<' {
   227  			s.op, s.prec = Shl, precMul
   228  			c = s.getr()
   229  			goto assignop
   230  		}
   231  		if c == '-' {
   232  			s.tok = _Arrow
   233  			break
   234  		}
   235  		s.ungetr()
   236  		s.op, s.prec = Lss, precCmp
   237  		s.tok = _Operator
   238  
   239  	case '>':
   240  		c = s.getr()
   241  		if c == '=' {
   242  			s.op, s.prec = Geq, precCmp
   243  			s.tok = _Operator
   244  			break
   245  		}
   246  		if c == '>' {
   247  			s.op, s.prec = Shr, precMul
   248  			c = s.getr()
   249  			goto assignop
   250  		}
   251  		s.ungetr()
   252  		s.op, s.prec = Gtr, precCmp
   253  		s.tok = _Operator
   254  
   255  	case '=':
   256  		if s.getr() == '=' {
   257  			s.op, s.prec = Eql, precCmp
   258  			s.tok = _Operator
   259  			break
   260  		}
   261  		s.ungetr()
   262  		s.tok = _Assign
   263  
   264  	case '!':
   265  		if s.getr() == '=' {
   266  			s.op, s.prec = Neq, precCmp
   267  			s.tok = _Operator
   268  			break
   269  		}
   270  		s.ungetr()
   271  		s.op, s.prec = Not, 0
   272  		s.tok = _Operator
   273  
   274  	default:
   275  		s.tok = 0
   276  		s.error(fmt.Sprintf("illegal character %#U", c))
   277  		goto redo
   278  	}
   279  
   280  	return
   281  
   282  assignop:
   283  	if c == '=' {
   284  		s.tok = _AssignOp
   285  		return
   286  	}
   287  	s.ungetr()
   288  	s.tok = _Operator
   289  }
   290  
   291  func isLetter(c rune) bool {
   292  	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
   293  }
   294  
   295  func isDigit(c rune) bool {
   296  	return '0' <= c && c <= '9'
   297  }
   298  
   299  func (s *scanner) ident() {
   300  	s.startLit()
   301  
   302  	// accelerate common case (7bit ASCII)
   303  	c := s.getr()
   304  	for isLetter(c) || isDigit(c) {
   305  		c = s.getr()
   306  	}
   307  
   308  	// general case
   309  	if c >= utf8.RuneSelf {
   310  		for unicode.IsLetter(c) || c == '_' || unicode.IsDigit(c) || s.isCompatRune(c, false) {
   311  			c = s.getr()
   312  		}
   313  	}
   314  	s.ungetr()
   315  
   316  	lit := s.stopLit()
   317  
   318  	// possibly a keyword
   319  	if len(lit) >= 2 {
   320  		if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) {
   321  			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
   322  			s.tok = tok
   323  			return
   324  		}
   325  	}
   326  
   327  	s.nlsemi = true
   328  	s.lit = string(lit)
   329  	s.tok = _Name
   330  }
   331  
   332  func (s *scanner) isCompatRune(c rune, start bool) bool {
   333  	if !gcCompat || c < utf8.RuneSelf {
   334  		return false
   335  	}
   336  	if start && unicode.IsNumber(c) {
   337  		s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
   338  	} else {
   339  		s.error(fmt.Sprintf("invalid identifier character %#U", c))
   340  	}
   341  	return true
   342  }
   343  
   344  // hash is a perfect hash function for keywords.
   345  // It assumes that s has at least length 2.
   346  func hash(s []byte) uint {
   347  	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
   348  }
   349  
   350  var keywordMap [1 << 6]token // size must be power of two
   351  
   352  func init() {
   353  	// populate keywordMap
   354  	for tok := _Break; tok <= _Var; tok++ {
   355  		h := hash([]byte(tokstrings[tok]))
   356  		if keywordMap[h] != 0 {
   357  			panic("imperfect hash")
   358  		}
   359  		keywordMap[h] = tok
   360  	}
   361  }
   362  
   363  func (s *scanner) number(c rune) {
   364  	s.startLit()
   365  
   366  	if c != '.' {
   367  		s.kind = IntLit // until proven otherwise
   368  		if c == '0' {
   369  			c = s.getr()
   370  			if c == 'x' || c == 'X' {
   371  				// hex
   372  				c = s.getr()
   373  				hasDigit := false
   374  				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   375  					c = s.getr()
   376  					hasDigit = true
   377  				}
   378  				if !hasDigit {
   379  					s.error("malformed hex constant")
   380  				}
   381  				goto done
   382  			}
   383  
   384  			// decimal 0, octal, or float
   385  			has8or9 := false
   386  			for isDigit(c) {
   387  				if c > '7' {
   388  					has8or9 = true
   389  				}
   390  				c = s.getr()
   391  			}
   392  			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
   393  				// octal
   394  				if has8or9 {
   395  					s.error("malformed octal constant")
   396  				}
   397  				goto done
   398  			}
   399  
   400  		} else {
   401  			// decimal or float
   402  			for isDigit(c) {
   403  				c = s.getr()
   404  			}
   405  		}
   406  	}
   407  
   408  	// float
   409  	if c == '.' {
   410  		s.kind = FloatLit
   411  		c = s.getr()
   412  		for isDigit(c) {
   413  			c = s.getr()
   414  		}
   415  	}
   416  
   417  	// exponent
   418  	if c == 'e' || c == 'E' {
   419  		s.kind = FloatLit
   420  		c = s.getr()
   421  		if c == '-' || c == '+' {
   422  			c = s.getr()
   423  		}
   424  		if !isDigit(c) {
   425  			s.error("malformed floating-point constant exponent")
   426  		}
   427  		for isDigit(c) {
   428  			c = s.getr()
   429  		}
   430  	}
   431  
   432  	// complex
   433  	if c == 'i' {
   434  		s.kind = ImagLit
   435  		s.getr()
   436  	}
   437  
   438  done:
   439  	s.ungetr()
   440  	s.nlsemi = true
   441  	s.lit = string(s.stopLit())
   442  	s.tok = _Literal
   443  }
   444  
   445  func (s *scanner) stdString() {
   446  	s.startLit()
   447  
   448  	for {
   449  		r := s.getr()
   450  		if r == '"' {
   451  			break
   452  		}
   453  		if r == '\\' {
   454  			s.escape('"')
   455  			continue
   456  		}
   457  		if r == '\n' {
   458  			s.ungetr() // assume newline is not part of literal
   459  			s.error("newline in string")
   460  			break
   461  		}
   462  		if r < 0 {
   463  			s.error_at(s.pos, s.line, "string not terminated")
   464  			break
   465  		}
   466  	}
   467  
   468  	s.nlsemi = true
   469  	s.lit = string(s.stopLit())
   470  	s.kind = StringLit
   471  	s.tok = _Literal
   472  }
   473  
   474  func (s *scanner) rawString() {
   475  	s.startLit()
   476  
   477  	for {
   478  		r := s.getr()
   479  		if r == '`' {
   480  			break
   481  		}
   482  		if r < 0 {
   483  			s.error_at(s.pos, s.line, "string not terminated")
   484  			break
   485  		}
   486  	}
   487  	// We leave CRs in the string since they are part of the
   488  	// literal (even though they are not part of the literal
   489  	// value).
   490  
   491  	s.nlsemi = true
   492  	s.lit = string(s.stopLit())
   493  	s.kind = StringLit
   494  	s.tok = _Literal
   495  }
   496  
   497  func (s *scanner) rune() {
   498  	s.startLit()
   499  
   500  	r := s.getr()
   501  	ok := false
   502  	if r == '\'' {
   503  		s.error("empty character literal or unescaped ' in character literal")
   504  	} else if r == '\n' {
   505  		s.ungetr() // assume newline is not part of literal
   506  		s.error("newline in character literal")
   507  	} else {
   508  		ok = true
   509  		if r == '\\' {
   510  			ok = s.escape('\'')
   511  		}
   512  	}
   513  
   514  	r = s.getr()
   515  	if r != '\'' {
   516  		// only report error if we're ok so far
   517  		if ok {
   518  			s.error("missing '")
   519  		}
   520  		s.ungetr()
   521  	}
   522  
   523  	s.nlsemi = true
   524  	s.lit = string(s.stopLit())
   525  	s.kind = RuneLit
   526  	s.tok = _Literal
   527  }
   528  
   529  func (s *scanner) lineComment() {
   530  	// recognize pragmas
   531  	var prefix string
   532  	r := s.getr()
   533  	if s.pragh == nil {
   534  		goto skip
   535  	}
   536  
   537  	switch r {
   538  	case 'g':
   539  		prefix = "go:"
   540  	case 'l':
   541  		prefix = "line "
   542  	default:
   543  		goto skip
   544  	}
   545  
   546  	s.startLit()
   547  	for _, m := range prefix {
   548  		if r != m {
   549  			s.stopLit()
   550  			goto skip
   551  		}
   552  		r = s.getr()
   553  	}
   554  
   555  	for r >= 0 {
   556  		if r == '\n' {
   557  			s.ungetr()
   558  			break
   559  		}
   560  		r = s.getr()
   561  	}
   562  	s.pragma |= s.pragh(0, s.line, strings.TrimSuffix(string(s.stopLit()), "\r"))
   563  	return
   564  
   565  skip:
   566  	// consume line
   567  	for r != '\n' && r >= 0 {
   568  		r = s.getr()
   569  	}
   570  	s.ungetr() // don't consume '\n' - needed for nlsemi logic
   571  }
   572  
   573  func (s *scanner) fullComment() {
   574  	for {
   575  		r := s.getr()
   576  		for r == '*' {
   577  			r = s.getr()
   578  			if r == '/' {
   579  				return
   580  			}
   581  		}
   582  		if r < 0 {
   583  			s.error_at(s.pos, s.line, "comment not terminated")
   584  			return
   585  		}
   586  	}
   587  }
   588  
   589  func (s *scanner) escape(quote rune) bool {
   590  	var n int
   591  	var base, max uint32
   592  
   593  	c := s.getr()
   594  	switch c {
   595  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   596  		return true
   597  	case '0', '1', '2', '3', '4', '5', '6', '7':
   598  		n, base, max = 3, 8, 255
   599  	case 'x':
   600  		c = s.getr()
   601  		n, base, max = 2, 16, 255
   602  	case 'u':
   603  		c = s.getr()
   604  		n, base, max = 4, 16, unicode.MaxRune
   605  	case 'U':
   606  		c = s.getr()
   607  		n, base, max = 8, 16, unicode.MaxRune
   608  	default:
   609  		if c < 0 {
   610  			return true // complain in caller about EOF
   611  		}
   612  		s.error("unknown escape sequence")
   613  		return false
   614  	}
   615  
   616  	var x uint32
   617  	for i := n; i > 0; i-- {
   618  		d := base
   619  		switch {
   620  		case isDigit(c):
   621  			d = uint32(c) - '0'
   622  		case 'a' <= c && c <= 'f':
   623  			d = uint32(c) - ('a' - 10)
   624  		case 'A' <= c && c <= 'F':
   625  			d = uint32(c) - ('A' - 10)
   626  		}
   627  		if d >= base {
   628  			if c < 0 {
   629  				return true // complain in caller about EOF
   630  			}
   631  			if gcCompat {
   632  				name := "hex"
   633  				if base == 8 {
   634  					name = "octal"
   635  				}
   636  				s.error(fmt.Sprintf("non-%s character in escape sequence: %c", name, c))
   637  			} else {
   638  				if c != quote {
   639  					s.error(fmt.Sprintf("illegal character %#U in escape sequence", c))
   640  				} else {
   641  					s.error("escape sequence incomplete")
   642  				}
   643  			}
   644  			s.ungetr()
   645  			return false
   646  		}
   647  		// d < base
   648  		x = x*base + d
   649  		c = s.getr()
   650  	}
   651  	s.ungetr()
   652  
   653  	if x > max && base == 8 {
   654  		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
   655  		return false
   656  	}
   657  
   658  	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
   659  		s.error("escape sequence is invalid Unicode code point")
   660  		return false
   661  	}
   662  
   663  	return true
   664  }