github.com/euank/go@v0.0.0-20160829210321-495514729181/src/cmd/compile/internal/syntax/scanner.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"strings"
    11  	"unicode"
    12  	"unicode/utf8"
    13  )
    14  
// scanner turns source text into a stream of tokens. It embeds source,
// which presumably supplies the character-level primitives used by the
// methods in this file (getr, ungetr, startLit, stopLit, error, ...);
// confirm against the definition of source.
//
// After each call to next, the fields below describe the token that was
// just scanned.
type scanner struct {
	source
	nlsemi bool // if set '\n' and EOF translate to ';'

	// current token, valid after calling next()
	// (pos and line are taken from source.pos0() and source.line0 at token start)
	pos, line int
	tok       token
	lit       string   // valid if tok is _Name or _Literal
	kind      LitKind  // valid if tok is _Literal
	op        Operator // valid if tok is _Operator, _AssignOp, or _IncOp
	prec      int      // valid if tok is _Operator, _AssignOp, or _IncOp

	// pragmas accumulates the "//go:" and "//line " comments recognized by
	// lineComment, in the order they were encountered.
	pragmas []Pragma
}
    29  
    30  func (s *scanner) init(src io.Reader, errh ErrorHandler) {
    31  	s.source.init(src, errh)
    32  	s.nlsemi = false
    33  }
    34  
// next scans the next token and records it in s.tok. The token's start is
// stored in s.pos and s.line; depending on the token, s.lit, s.kind, s.op
// and s.prec are set as well (see the field comments on scanner).
//
// Automatic semicolon insertion is controlled by s.nlsemi: if the previous
// token set it, a newline, end of input, or a comment spanning several
// lines is returned as _Semi instead of being skipped. The flag is cleared
// on entry and set again by the tokens after which a ';' may be inserted.
//
// Illegal characters are reported via s.error and then skipped.
func (s *scanner) next() {
	// Remember whether the previous token allows an implicit ';' and reset
	// the flag for the token about to be scanned.
	nlsemi := s.nlsemi
	s.nlsemi = false

redo:
	// skip white space
	// (a newline is only white space if no implicit ';' is pending)
	c := s.getr()
	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
		c = s.getr()
	}

	// token start
	s.pos, s.line = s.source.pos0(), s.source.line0

	// Identifiers and keywords. ASCII letters and '_' are caught by
	// isLetter; runes at or above utf8.RuneSelf must be Unicode letters, or
	// be accepted (with an error reported) by isCompatRune.
	if isLetter(c) || c >= utf8.RuneSelf && (unicode.IsLetter(c) || s.isCompatRune(c, true)) {
		s.ident()
		return
	}

	switch c {
	case -1:
		// A negative rune is treated as end of input.
		if nlsemi {
			s.tok = _Semi
			break
		}
		s.tok = _EOF

	case '\n':
		// Only reached when nlsemi is set; otherwise the newline was
		// skipped as white space above.
		s.tok = _Semi

	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		s.number(c)

	case '"':
		s.stdString()

	case '`':
		s.rawString()

	case '\'':
		s.rune()

	case '(':
		s.tok = _Lparen

	case '[':
		s.tok = _Lbrack

	case '{':
		s.tok = _Lbrace

	case ',':
		s.tok = _Comma

	case ';':
		s.tok = _Semi

	case ')':
		s.nlsemi = true
		s.tok = _Rparen

	case ']':
		s.nlsemi = true
		s.tok = _Rbrack

	case '}':
		s.nlsemi = true
		s.tok = _Rbrace

	case ':':
		// ":=" or ":"
		if s.getr() == '=' {
			s.tok = _Define
			break
		}
		s.ungetr()
		s.tok = _Colon

	case '.':
		// ".5"-style number, "...", or "."
		c = s.getr()
		if isDigit(c) {
			s.ungetr()
			s.source.r0-- // make sure '.' is part of literal (line cannot have changed)
			s.number('.')
			break
		}
		if c == '.' {
			c = s.getr()
			if c == '.' {
				s.tok = _DotDotDot
				break
			}
			// Only two dots: push back both runes read after the first '.'.
			s.ungetr()
			s.source.r0-- // make next ungetr work (line cannot have changed)
		}
		s.ungetr()
		s.tok = _Dot

	case '+':
		// "++", "+=", or "+"
		s.op, s.prec = Add, precAdd
		c = s.getr()
		if c != '+' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '-':
		// "--", "-=", or "-"
		s.op, s.prec = Sub, precAdd
		c = s.getr()
		if c != '-' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '*':
		s.op, s.prec = Mul, precMul
		// don't goto assignop - want _Star token
		if s.getr() == '=' {
			s.tok = _AssignOp
			break
		}
		s.ungetr()
		s.tok = _Star

	case '/':
		// line comment, general comment, "/=", or "/"
		c = s.getr()
		if c == '/' {
			// lineComment leaves the terminating '\n' unread so that the
			// rescan below can turn it into a ';' if required.
			s.lineComment()
			goto redo
		}
		if c == '*' {
			s.fullComment()
			if s.source.line > s.line && nlsemi {
				// A multi-line comment acts like a newline;
				// it translates to a ';' if nlsemi is set.
				s.tok = _Semi
				break
			}
			goto redo
		}
		s.op, s.prec = Div, precMul
		goto assignop

	case '%':
		// "%=" or "%"
		s.op, s.prec = Rem, precMul
		c = s.getr()
		goto assignop

	case '&':
		// "&&", "&^=", "&^", "&=", or "&"
		c = s.getr()
		if c == '&' {
			s.op, s.prec = AndAnd, precAndAnd
			s.tok = _Operator
			break
		}
		s.op, s.prec = And, precMul
		if c == '^' {
			s.op = AndNot
			c = s.getr()
		}
		goto assignop

	case '|':
		// "||", "|=", or "|"
		c = s.getr()
		if c == '|' {
			s.op, s.prec = OrOr, precOrOr
			s.tok = _Operator
			break
		}
		s.op, s.prec = Or, precAdd
		goto assignop

	case '~':
		// Report the mistake, then continue as if '^' had been written.
		s.error("bitwise complement operator is ^")
		fallthrough

	case '^':
		// "^=" or "^"
		s.op, s.prec = Xor, precAdd
		c = s.getr()
		goto assignop

	case '<':
		// "<=", "<<=", "<<", "<-", or "<"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Leq, precCmp
			s.tok = _Operator
			break
		}
		if c == '<' {
			s.op, s.prec = Shl, precMul
			c = s.getr()
			goto assignop
		}
		if c == '-' {
			s.tok = _Arrow
			break
		}
		s.ungetr()
		s.op, s.prec = Lss, precCmp
		s.tok = _Operator

	case '>':
		// ">=", ">>=", ">>", or ">"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Geq, precCmp
			s.tok = _Operator
			break
		}
		if c == '>' {
			s.op, s.prec = Shr, precMul
			c = s.getr()
			goto assignop
		}
		s.ungetr()
		s.op, s.prec = Gtr, precCmp
		s.tok = _Operator

	case '=':
		// "==" or "="
		if s.getr() == '=' {
			s.op, s.prec = Eql, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.tok = _Assign

	case '!':
		// "!=" or "!"
		if s.getr() == '=' {
			s.op, s.prec = Neq, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.op, s.prec = Not, 0
		s.tok = _Operator

	default:
		// Not the start of any token: report it and scan on from the
		// following character.
		s.tok = 0
		s.error(fmt.Sprintf("illegal character %#U", c))
		goto redo
	}

	return

assignop:
	// Shared tail for binary operators that have an "op=" form. s.op and
	// s.prec are already set and c is the rune read after the operator:
	// '=' completes an assignment operator, anything else is pushed back.
	if c == '=' {
		s.tok = _AssignOp
		return
	}
	s.ungetr()
	s.tok = _Operator
}
   288  
   289  func isLetter(c rune) bool {
   290  	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
   291  }
   292  
   293  func isDigit(c rune) bool {
   294  	return '0' <= c && c <= '9'
   295  }
   296  
   297  func (s *scanner) ident() {
   298  	s.startLit()
   299  
   300  	// accelerate common case (7bit ASCII)
   301  	c := s.getr()
   302  	for isLetter(c) || isDigit(c) {
   303  		c = s.getr()
   304  	}
   305  
   306  	// general case
   307  	if c >= utf8.RuneSelf {
   308  		for unicode.IsLetter(c) || c == '_' || unicode.IsDigit(c) || s.isCompatRune(c, false) {
   309  			c = s.getr()
   310  		}
   311  	}
   312  	s.ungetr()
   313  
   314  	lit := s.stopLit()
   315  
   316  	// possibly a keyword
   317  	if len(lit) >= 2 {
   318  		if tok := keywordMap[hash(lit)]; tok != 0 && strbyteseql(tokstrings[tok], lit) {
   319  			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
   320  			s.tok = tok
   321  			return
   322  		}
   323  	}
   324  
   325  	s.nlsemi = true
   326  	s.lit = string(lit)
   327  	s.tok = _Name
   328  }
   329  
   330  func (s *scanner) isCompatRune(c rune, start bool) bool {
   331  	if !gcCompat || c < utf8.RuneSelf {
   332  		return false
   333  	}
   334  	if start && unicode.IsNumber(c) {
   335  		s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
   336  	} else {
   337  		s.error(fmt.Sprintf("invalid identifier character %#U", c))
   338  	}
   339  	return true
   340  }
   341  
   342  // hash is a perfect hash function for keywords.
   343  // It assumes that s has at least length 2.
   344  func hash(s []byte) uint {
   345  	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
   346  }
   347  
   348  func strbyteseql(s string, b []byte) bool {
   349  	if len(s) == len(b) {
   350  		for i, b := range b {
   351  			if s[i] != b {
   352  				return false
   353  			}
   354  		}
   355  		return true
   356  	}
   357  	return false
   358  }
   359  
   360  var keywordMap [1 << 6]token // size must be power of two
   361  
   362  func init() {
   363  	// populate keywordMap
   364  	for tok := _Break; tok <= _Var; tok++ {
   365  		h := hash([]byte(tokstrings[tok]))
   366  		if keywordMap[h] != 0 {
   367  			panic("imperfect hash")
   368  		}
   369  		keywordMap[h] = tok
   370  	}
   371  }
   372  
// number scans a numeric literal. c is the literal's first character, which
// the caller has already consumed: either a decimal digit, or '.' when the
// literal starts with a decimal point.
//
// On return s.tok is _Literal, s.lit holds the literal's text, s.kind is
// IntLit, FloatLit or ImagLit, and s.nlsemi is set. Malformed hex, octal
// and exponent forms are reported via s.error; a literal token is produced
// regardless.
func (s *scanner) number(c rune) {
	s.startLit()

	if c != '.' {
		s.kind = IntLit // until proven otherwise
		if c == '0' {
			c = s.getr()
			if c == 'x' || c == 'X' {
				// hex
				c = s.getr()
				hasDigit := false
				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
					c = s.getr()
					hasDigit = true
				}
				if !hasDigit {
					s.error("malformed hex constant")
				}
				// Hex literals take no fraction, exponent or 'i' suffix here.
				goto done
			}

			// decimal 0, octal, or float
			has8or9 := false
			for isDigit(c) {
				if c > '7' {
					has8or9 = true
				}
				c = s.getr()
			}
			// Digits 8 and 9 after a leading 0 are only an error if the
			// literal turns out to be an integer; they are fine in a float
			// or imaginary literal, which is handled below.
			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
				// octal
				if has8or9 {
					s.error("malformed octal constant")
				}
				goto done
			}

		} else {
			// decimal or float
			for isDigit(c) {
				c = s.getr()
			}
		}
	}

	// float
	if c == '.' {
		s.kind = FloatLit
		c = s.getr()
		for isDigit(c) {
			c = s.getr()
		}
	}

	// exponent
	if c == 'e' || c == 'E' {
		s.kind = FloatLit
		c = s.getr()
		if c == '-' || c == '+' {
			c = s.getr()
		}
		if !isDigit(c) {
			s.error("malformed floating-point constant exponent")
		}
		for isDigit(c) {
			c = s.getr()
		}
	}

	// complex
	if c == 'i' {
		s.kind = ImagLit
		// Read one rune past the 'i' so that the ungetr below, which every
		// other path relies on, is balanced here as well.
		s.getr()
	}

done:
	// Every path arrives here having read one rune beyond the literal.
	s.ungetr()
	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.tok = _Literal
}
   454  
   455  func (s *scanner) stdString() {
   456  	s.startLit()
   457  
   458  	for {
   459  		r := s.getr()
   460  		if r == '"' {
   461  			break
   462  		}
   463  		if r == '\\' {
   464  			s.escape('"')
   465  			continue
   466  		}
   467  		if r == '\n' {
   468  			s.ungetr() // assume newline is not part of literal
   469  			s.error("newline in string")
   470  			break
   471  		}
   472  		if r < 0 {
   473  			s.error_at(s.pos, s.line, "string not terminated")
   474  			break
   475  		}
   476  	}
   477  
   478  	s.nlsemi = true
   479  	s.lit = string(s.stopLit())
   480  	s.kind = StringLit
   481  	s.tok = _Literal
   482  }
   483  
   484  func (s *scanner) rawString() {
   485  	s.startLit()
   486  
   487  	for {
   488  		r := s.getr()
   489  		if r == '`' {
   490  			break
   491  		}
   492  		if r < 0 {
   493  			s.error_at(s.pos, s.line, "string not terminated")
   494  			break
   495  		}
   496  	}
   497  	// We leave CRs in the string since they are part of the
   498  	// literal (even though they are not part of the literal
   499  	// value).
   500  
   501  	s.nlsemi = true
   502  	s.lit = string(s.stopLit())
   503  	s.kind = StringLit
   504  	s.tok = _Literal
   505  }
   506  
   507  func (s *scanner) rune() {
   508  	s.startLit()
   509  
   510  	r := s.getr()
   511  	ok := false
   512  	if r == '\'' {
   513  		s.error("empty character literal or unescaped ' in character literal")
   514  	} else if r == '\n' {
   515  		s.ungetr() // assume newline is not part of literal
   516  		s.error("newline in character literal")
   517  	} else {
   518  		ok = true
   519  		if r == '\\' {
   520  			ok = s.escape('\'')
   521  		}
   522  	}
   523  
   524  	r = s.getr()
   525  	if r != '\'' {
   526  		// only report error if we're ok so far
   527  		if ok {
   528  			s.error("missing '")
   529  		}
   530  		s.ungetr()
   531  	}
   532  
   533  	s.nlsemi = true
   534  	s.lit = string(s.stopLit())
   535  	s.kind = RuneLit
   536  	s.tok = _Literal
   537  }
   538  
   539  func (s *scanner) lineComment() {
   540  	// recognize pragmas
   541  	var prefix string
   542  	r := s.getr()
   543  	switch r {
   544  	case 'g':
   545  		prefix = "go:"
   546  	case 'l':
   547  		prefix = "line "
   548  	default:
   549  		goto skip
   550  	}
   551  
   552  	s.startLit()
   553  	for _, m := range prefix {
   554  		if r != m {
   555  			s.stopLit()
   556  			goto skip
   557  		}
   558  		r = s.getr()
   559  	}
   560  
   561  	for r >= 0 {
   562  		if r == '\n' {
   563  			s.ungetr()
   564  			break
   565  		}
   566  		r = s.getr()
   567  	}
   568  	s.pragmas = append(s.pragmas, Pragma{
   569  		Line: s.line,
   570  		Text: strings.TrimSuffix(string(s.stopLit()), "\r"),
   571  	})
   572  	return
   573  
   574  skip:
   575  	// consume line
   576  	for r != '\n' && r >= 0 {
   577  		r = s.getr()
   578  	}
   579  	s.ungetr() // don't consume '\n' - needed for nlsemi logic
   580  }
   581  
   582  func (s *scanner) fullComment() {
   583  	for {
   584  		r := s.getr()
   585  		for r == '*' {
   586  			r = s.getr()
   587  			if r == '/' {
   588  				return
   589  			}
   590  		}
   591  		if r < 0 {
   592  			s.error_at(s.pos, s.line, "comment not terminated")
   593  			return
   594  		}
   595  	}
   596  }
   597  
   598  func (s *scanner) escape(quote rune) bool {
   599  	var n int
   600  	var base, max uint32
   601  
   602  	c := s.getr()
   603  	switch c {
   604  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   605  		return true
   606  	case '0', '1', '2', '3', '4', '5', '6', '7':
   607  		n, base, max = 3, 8, 255
   608  	case 'x':
   609  		c = s.getr()
   610  		n, base, max = 2, 16, 255
   611  	case 'u':
   612  		c = s.getr()
   613  		n, base, max = 4, 16, unicode.MaxRune
   614  	case 'U':
   615  		c = s.getr()
   616  		n, base, max = 8, 16, unicode.MaxRune
   617  	default:
   618  		if c < 0 {
   619  			return true // complain in caller about EOF
   620  		}
   621  		s.error("unknown escape sequence")
   622  		return false
   623  	}
   624  
   625  	var x uint32
   626  	for i := n; i > 0; i-- {
   627  		d := base
   628  		switch {
   629  		case isDigit(c):
   630  			d = uint32(c) - '0'
   631  		case 'a' <= c && c <= 'f':
   632  			d = uint32(c) - ('a' - 10)
   633  		case 'A' <= c && c <= 'F':
   634  			d = uint32(c) - ('A' - 10)
   635  		}
   636  		if d >= base {
   637  			if c < 0 {
   638  				return true // complain in caller about EOF
   639  			}
   640  			if gcCompat {
   641  				name := "hex"
   642  				if base == 8 {
   643  					name = "octal"
   644  				}
   645  				s.error(fmt.Sprintf("non-%s character in escape sequence: %c", name, c))
   646  			} else {
   647  				if c != quote {
   648  					s.error(fmt.Sprintf("illegal character %#U in escape sequence", c))
   649  				} else {
   650  					s.error("escape sequence incomplete")
   651  				}
   652  			}
   653  			s.ungetr()
   654  			return false
   655  		}
   656  		// d < base
   657  		x = x*base + d
   658  		c = s.getr()
   659  	}
   660  	s.ungetr()
   661  
   662  	if x > max && base == 8 {
   663  		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
   664  		return false
   665  	}
   666  
   667  	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
   668  		s.error("escape sequence is invalid Unicode code point")
   669  		return false
   670  	}
   671  
   672  	return true
   673  }