github.com/dannin/go@v0.0.0-20161031215817-d35dfd405eaa/src/cmd/compile/internal/syntax/scanner.go (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"strings"
    11  	"unicode"
    12  	"unicode/utf8"
    13  )
    14  
    15  type scanner struct {
    16  	source
    17  	nlsemi bool // if set '\n' and EOF translate to ';'
    18  	pragma Pragma
    19  
    20  	// current token, valid after calling next()
    21  	pos, line int
    22  	tok       token
    23  	lit       string   // valid if tok is _Name or _Literal
    24  	kind      LitKind  // valid if tok is _Literal
    25  	op        Operator // valid if tok is _Operator, _AssignOp, or _IncOp
    26  	prec      int      // valid if tok is _Operator, _AssignOp, or _IncOp
    27  
    28  	pragh PragmaHandler
    29  }
    30  
    31  func (s *scanner) init(src io.Reader, errh ErrorHandler, pragh PragmaHandler) {
    32  	s.source.init(src, errh)
    33  	s.nlsemi = false
    34  	s.pragh = pragh
    35  }
    36  
        // next scans the next token and records it in s.tok, with the token's
        // start in s.pos and s.line. Depending on the token, s.lit, s.kind, s.op,
        // and s.prec are set as well (see the field comments on scanner).
        // s.nlsemi is updated so that the following call knows whether a newline
        // or EOF must be reported as a ';'. White space and comments are skipped;
        // an illegal character is reported via s.error and scanning resumes with
        // the character after it.
    37  func (s *scanner) next() {
        	// Remember what the previous token requested and reset the flag;
        	// the cases below set it again for tokens that may end a statement.
    38  	nlsemi := s.nlsemi
    39  	s.nlsemi = false
    40  
    41  redo:
    42  	// skip white space
        	// (a '\n' is only skipped if it does not have to become a ';')
    43  	c := s.getr()
    44  	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
    45  		c = s.getr()
    46  	}
    47  
    48  	// token start
    49  	s.pos, s.line = s.source.pos0(), s.source.line0
    50  
        	// identifiers and keywords: ASCII letters and '_' are tested first,
        	// the unicode checks only run for c >= utf8.RuneSelf
    51  	if isLetter(c) || c >= utf8.RuneSelf && (unicode.IsLetter(c) || s.isCompatRune(c, true)) {
    52  		s.ident()
    53  		return
    54  	}
    55  
    56  	switch c {
        	// EOF: acts like a newline if a ';' is pending
    57  	case -1:
    58  		if nlsemi {
    59  			s.tok = _Semi
    60  			break
    61  		}
    62  		s.tok = _EOF
    63  
        	// only reached if nlsemi is set; otherwise '\n' was skipped above
    64  	case '\n':
    65  		s.tok = _Semi
    66  
    67  	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
    68  		s.number(c)
    69  
    70  	case '"':
    71  		s.stdString()
    72  
    73  	case '`':
    74  		s.rawString()
    75  
    76  	case '\'':
    77  		s.rune()
    78  
    79  	case '(':
    80  		s.tok = _Lparen
    81  
    82  	case '[':
    83  		s.tok = _Lbrack
    84  
    85  	case '{':
    86  		s.tok = _Lbrace
    87  
    88  	case ',':
    89  		s.tok = _Comma
    90  
    91  	case ';':
    92  		s.tok = _Semi
    93  
        	// closing delimiters request newline-to-';' translation
    94  	case ')':
    95  		s.nlsemi = true
    96  		s.tok = _Rparen
    97  
    98  	case ']':
    99  		s.nlsemi = true
   100  		s.tok = _Rbrack
   101  
   102  	case '}':
   103  		s.nlsemi = true
   104  		s.tok = _Rbrace
   105  
        	// ':' or ":="
   106  	case ':':
   107  		if s.getr() == '=' {
   108  			s.tok = _Define
   109  			break
   110  		}
   111  		s.ungetr()
   112  		s.tok = _Colon
   113  
        	// '.', "...", or a number literal that starts with its fraction
   114  	case '.':
   115  		c = s.getr()
   116  		if isDigit(c) {
   117  			s.ungetr()
   118  			s.source.r0-- // make sure '.' is part of literal (line cannot have changed)
   119  			s.number('.')
   120  			break
   121  		}
   122  		if c == '.' {
   123  			c = s.getr()
   124  			if c == '.' {
   125  				s.tok = _DotDotDot
   126  				break
   127  			}
        			// ".." followed by something else: push back two characters
        			// (here and below) so that only the first '.' is consumed
   128  			s.ungetr()
   129  			s.source.r0-- // make next ungetr work (line cannot have changed)
   130  		}
   131  		s.ungetr()
   132  		s.tok = _Dot
   133  
        	// "++", "+=", or '+'
   134  	case '+':
   135  		s.op, s.prec = Add, precAdd
   136  		c = s.getr()
   137  		if c != '+' {
   138  			goto assignop
   139  		}
   140  		s.nlsemi = true
   141  		s.tok = _IncOp
   142  
        	// "--", "-=", or '-'
   143  	case '-':
   144  		s.op, s.prec = Sub, precAdd
   145  		c = s.getr()
   146  		if c != '-' {
   147  			goto assignop
   148  		}
   149  		s.nlsemi = true
   150  		s.tok = _IncOp
   151  
   152  	case '*':
   153  		s.op, s.prec = Mul, precMul
   154  		// don't goto assignop - want _Star token
   155  		if s.getr() == '=' {
   156  			s.tok = _AssignOp
   157  			break
   158  		}
   159  		s.ungetr()
   160  		s.tok = _Star
   161  
        	// line comment, general comment, "/=", or '/'
   162  	case '/':
   163  		c = s.getr()
   164  		if c == '/' {
        			// lineComment leaves the terminating '\n' unread, so the
        			// rescan applies the nlsemi logic to it
   165  			s.lineComment()
   166  			goto redo
   167  		}
   168  		if c == '*' {
   169  			s.fullComment()
   170  			if s.source.line > s.line && nlsemi {
   171  				// A multi-line comment acts like a newline;
   172  				// it translates to a ';' if nlsemi is set.
   173  				s.tok = _Semi
   174  				break
   175  			}
   176  			goto redo
   177  		}
   178  		s.op, s.prec = Div, precMul
   179  		goto assignop
   180  
   181  	case '%':
   182  		s.op, s.prec = Rem, precMul
   183  		c = s.getr()
   184  		goto assignop
   185  
        	// "&&", "&^", "&^=", "&=", or '&'
   186  	case '&':
   187  		c = s.getr()
   188  		if c == '&' {
   189  			s.op, s.prec = AndAnd, precAndAnd
   190  			s.tok = _Operator
   191  			break
   192  		}
   193  		s.op, s.prec = And, precMul
   194  		if c == '^' {
   195  			s.op = AndNot
   196  			c = s.getr()
   197  		}
   198  		goto assignop
   199  
        	// "||", "|=", or '|'
   200  	case '|':
   201  		c = s.getr()
   202  		if c == '|' {
   203  			s.op, s.prec = OrOr, precOrOr
   204  			s.tok = _Operator
   205  			break
   206  		}
   207  		s.op, s.prec = Or, precAdd
   208  		goto assignop
   209  
        	// '~' is reported as an error and then scanned as if it were '^'
   210  	case '~':
   211  		s.error("bitwise complement operator is ^")
   212  		fallthrough
   213  
   214  	case '^':
   215  		s.op, s.prec = Xor, precAdd
   216  		c = s.getr()
   217  		goto assignop
   218  
        	// "<=", "<<", "<<=", "<-", or '<'
   219  	case '<':
   220  		c = s.getr()
   221  		if c == '=' {
   222  			s.op, s.prec = Leq, precCmp
   223  			s.tok = _Operator
   224  			break
   225  		}
   226  		if c == '<' {
   227  			s.op, s.prec = Shl, precMul
   228  			c = s.getr()
   229  			goto assignop
   230  		}
   231  		if c == '-' {
   232  			s.tok = _Larrow
   233  			break
   234  		}
   235  		s.ungetr()
   236  		s.op, s.prec = Lss, precCmp
   237  		s.tok = _Operator
   238  
        	// ">=", ">>", ">>=", or '>'
   239  	case '>':
   240  		c = s.getr()
   241  		if c == '=' {
   242  			s.op, s.prec = Geq, precCmp
   243  			s.tok = _Operator
   244  			break
   245  		}
   246  		if c == '>' {
   247  			s.op, s.prec = Shr, precMul
   248  			c = s.getr()
   249  			goto assignop
   250  		}
   251  		s.ungetr()
   252  		s.op, s.prec = Gtr, precCmp
   253  		s.tok = _Operator
   254  
        	// "==", "=>" (scanned as _Rarrow), or '='
   255  	case '=':
   256  		c = s.getr()
   257  		if c == '=' {
   258  			s.op, s.prec = Eql, precCmp
   259  			s.tok = _Operator
   260  			break
   261  		}
   262  		if c == '>' {
   263  			s.tok = _Rarrow
   264  			break
   265  		}
   266  		s.ungetr()
   267  		s.tok = _Assign
   268  
        	// "!=" or '!'
   269  	case '!':
   270  		if s.getr() == '=' {
   271  			s.op, s.prec = Neq, precCmp
   272  			s.tok = _Operator
   273  			break
   274  		}
   275  		s.ungetr()
   276  		s.op, s.prec = Not, 0
   277  		s.tok = _Operator
   278  
        	// anything else: report the character and scan again from the
        	// character that follows it
   279  	default:
   280  		s.tok = 0
   281  		s.error(fmt.Sprintf("illegal character %#U", c))
   282  		goto redo
   283  	}
   284  
   285  	return
   286  
        	// Shared tail for binary operators; s.op and s.prec are already set
        	// and c is the character after the operator. A '=' turns the operator
        	// into an assignment operator; any other character is pushed back.
   287  assignop:
   288  	if c == '=' {
   289  		s.tok = _AssignOp
   290  		return
   291  	}
   292  	s.ungetr()
   293  	s.tok = _Operator
   294  }
   295  
   296  func isLetter(c rune) bool {
   297  	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
   298  }
   299  
   300  func isDigit(c rune) bool {
   301  	return '0' <= c && c <= '9'
   302  }
   303  
   304  func (s *scanner) ident() {
   305  	s.startLit()
   306  
   307  	// accelerate common case (7bit ASCII)
   308  	c := s.getr()
   309  	for isLetter(c) || isDigit(c) {
   310  		c = s.getr()
   311  	}
   312  
   313  	// general case
   314  	if c >= utf8.RuneSelf {
   315  		for unicode.IsLetter(c) || c == '_' || unicode.IsDigit(c) || s.isCompatRune(c, false) {
   316  			c = s.getr()
   317  		}
   318  	}
   319  	s.ungetr()
   320  
   321  	lit := s.stopLit()
   322  
   323  	// possibly a keyword
   324  	if len(lit) >= 2 {
   325  		if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) {
   326  			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
   327  			s.tok = tok
   328  			return
   329  		}
   330  	}
   331  
   332  	s.nlsemi = true
   333  	s.lit = string(lit)
   334  	s.tok = _Name
   335  }
   336  
   337  func (s *scanner) isCompatRune(c rune, start bool) bool {
   338  	if !gcCompat || c < utf8.RuneSelf {
   339  		return false
   340  	}
   341  	if start && unicode.IsNumber(c) {
   342  		s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
   343  	} else {
   344  		s.error(fmt.Sprintf("invalid identifier character %#U", c))
   345  	}
   346  	return true
   347  }
   348  
   349  // hash is a perfect hash function for keywords.
   350  // It assumes that s has at least length 2.
   351  func hash(s []byte) uint {
   352  	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
   353  }
   354  
   355  var keywordMap [1 << 6]token // size must be power of two
   356  
   357  func init() {
   358  	// populate keywordMap
   359  	for tok := _Break; tok <= _Var; tok++ {
   360  		h := hash([]byte(tokstrings[tok]))
   361  		if keywordMap[h] != 0 {
   362  			panic("imperfect hash")
   363  		}
   364  		keywordMap[h] = tok
   365  	}
   366  }
   367  
   368  func (s *scanner) number(c rune) {
   369  	s.startLit()
   370  
   371  	if c != '.' {
   372  		s.kind = IntLit // until proven otherwise
   373  		if c == '0' {
   374  			c = s.getr()
   375  			if c == 'x' || c == 'X' {
   376  				// hex
   377  				c = s.getr()
   378  				hasDigit := false
   379  				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   380  					c = s.getr()
   381  					hasDigit = true
   382  				}
   383  				if !hasDigit {
   384  					s.error("malformed hex constant")
   385  				}
   386  				goto done
   387  			}
   388  
   389  			// decimal 0, octal, or float
   390  			has8or9 := false
   391  			for isDigit(c) {
   392  				if c > '7' {
   393  					has8or9 = true
   394  				}
   395  				c = s.getr()
   396  			}
   397  			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
   398  				// octal
   399  				if has8or9 {
   400  					s.error("malformed octal constant")
   401  				}
   402  				goto done
   403  			}
   404  
   405  		} else {
   406  			// decimal or float
   407  			for isDigit(c) {
   408  				c = s.getr()
   409  			}
   410  		}
   411  	}
   412  
   413  	// float
   414  	if c == '.' {
   415  		s.kind = FloatLit
   416  		c = s.getr()
   417  		for isDigit(c) {
   418  			c = s.getr()
   419  		}
   420  	}
   421  
   422  	// exponent
   423  	if c == 'e' || c == 'E' {
   424  		s.kind = FloatLit
   425  		c = s.getr()
   426  		if c == '-' || c == '+' {
   427  			c = s.getr()
   428  		}
   429  		if !isDigit(c) {
   430  			s.error("malformed floating-point constant exponent")
   431  		}
   432  		for isDigit(c) {
   433  			c = s.getr()
   434  		}
   435  	}
   436  
   437  	// complex
   438  	if c == 'i' {
   439  		s.kind = ImagLit
   440  		s.getr()
   441  	}
   442  
   443  done:
   444  	s.ungetr()
   445  	s.nlsemi = true
   446  	s.lit = string(s.stopLit())
   447  	s.tok = _Literal
   448  }
   449  
   450  func (s *scanner) stdString() {
   451  	s.startLit()
   452  
   453  	for {
   454  		r := s.getr()
   455  		if r == '"' {
   456  			break
   457  		}
   458  		if r == '\\' {
   459  			s.escape('"')
   460  			continue
   461  		}
   462  		if r == '\n' {
   463  			s.ungetr() // assume newline is not part of literal
   464  			s.error("newline in string")
   465  			break
   466  		}
   467  		if r < 0 {
   468  			s.error_at(s.pos, s.line, "string not terminated")
   469  			break
   470  		}
   471  	}
   472  
   473  	s.nlsemi = true
   474  	s.lit = string(s.stopLit())
   475  	s.kind = StringLit
   476  	s.tok = _Literal
   477  }
   478  
   479  func (s *scanner) rawString() {
   480  	s.startLit()
   481  
   482  	for {
   483  		r := s.getr()
   484  		if r == '`' {
   485  			break
   486  		}
   487  		if r < 0 {
   488  			s.error_at(s.pos, s.line, "string not terminated")
   489  			break
   490  		}
   491  	}
   492  	// We leave CRs in the string since they are part of the
   493  	// literal (even though they are not part of the literal
   494  	// value).
   495  
   496  	s.nlsemi = true
   497  	s.lit = string(s.stopLit())
   498  	s.kind = StringLit
   499  	s.tok = _Literal
   500  }
   501  
   502  func (s *scanner) rune() {
   503  	s.startLit()
   504  
   505  	r := s.getr()
   506  	ok := false
   507  	if r == '\'' {
   508  		s.error("empty character literal or unescaped ' in character literal")
   509  	} else if r == '\n' {
   510  		s.ungetr() // assume newline is not part of literal
   511  		s.error("newline in character literal")
   512  	} else {
   513  		ok = true
   514  		if r == '\\' {
   515  			ok = s.escape('\'')
   516  		}
   517  	}
   518  
   519  	r = s.getr()
   520  	if r != '\'' {
   521  		// only report error if we're ok so far
   522  		if ok {
   523  			s.error("missing '")
   524  		}
   525  		s.ungetr()
   526  	}
   527  
   528  	s.nlsemi = true
   529  	s.lit = string(s.stopLit())
   530  	s.kind = RuneLit
   531  	s.tok = _Literal
   532  }
   533  
   534  func (s *scanner) lineComment() {
   535  	// recognize pragmas
   536  	var prefix string
   537  	r := s.getr()
   538  	if s.pragh == nil {
   539  		goto skip
   540  	}
   541  
   542  	switch r {
   543  	case 'g':
   544  		prefix = "go:"
   545  	case 'l':
   546  		prefix = "line "
   547  	default:
   548  		goto skip
   549  	}
   550  
   551  	s.startLit()
   552  	for _, m := range prefix {
   553  		if r != m {
   554  			s.stopLit()
   555  			goto skip
   556  		}
   557  		r = s.getr()
   558  	}
   559  
   560  	for r >= 0 {
   561  		if r == '\n' {
   562  			s.ungetr()
   563  			break
   564  		}
   565  		r = s.getr()
   566  	}
   567  	s.pragma |= s.pragh(0, s.line, strings.TrimSuffix(string(s.stopLit()), "\r"))
   568  	return
   569  
   570  skip:
   571  	// consume line
   572  	for r != '\n' && r >= 0 {
   573  		r = s.getr()
   574  	}
   575  	s.ungetr() // don't consume '\n' - needed for nlsemi logic
   576  }
   577  
   578  func (s *scanner) fullComment() {
   579  	for {
   580  		r := s.getr()
   581  		for r == '*' {
   582  			r = s.getr()
   583  			if r == '/' {
   584  				return
   585  			}
   586  		}
   587  		if r < 0 {
   588  			s.error_at(s.pos, s.line, "comment not terminated")
   589  			return
   590  		}
   591  	}
   592  }
   593  
   594  func (s *scanner) escape(quote rune) bool {
   595  	var n int
   596  	var base, max uint32
   597  
   598  	c := s.getr()
   599  	switch c {
   600  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   601  		return true
   602  	case '0', '1', '2', '3', '4', '5', '6', '7':
   603  		n, base, max = 3, 8, 255
   604  	case 'x':
   605  		c = s.getr()
   606  		n, base, max = 2, 16, 255
   607  	case 'u':
   608  		c = s.getr()
   609  		n, base, max = 4, 16, unicode.MaxRune
   610  	case 'U':
   611  		c = s.getr()
   612  		n, base, max = 8, 16, unicode.MaxRune
   613  	default:
   614  		if c < 0 {
   615  			return true // complain in caller about EOF
   616  		}
   617  		s.error("unknown escape sequence")
   618  		return false
   619  	}
   620  
   621  	var x uint32
   622  	for i := n; i > 0; i-- {
   623  		d := base
   624  		switch {
   625  		case isDigit(c):
   626  			d = uint32(c) - '0'
   627  		case 'a' <= c && c <= 'f':
   628  			d = uint32(c) - ('a' - 10)
   629  		case 'A' <= c && c <= 'F':
   630  			d = uint32(c) - ('A' - 10)
   631  		}
   632  		if d >= base {
   633  			if c < 0 {
   634  				return true // complain in caller about EOF
   635  			}
   636  			if gcCompat {
   637  				name := "hex"
   638  				if base == 8 {
   639  					name = "octal"
   640  				}
   641  				s.error(fmt.Sprintf("non-%s character in escape sequence: %c", name, c))
   642  			} else {
   643  				if c != quote {
   644  					s.error(fmt.Sprintf("illegal character %#U in escape sequence", c))
   645  				} else {
   646  					s.error("escape sequence incomplete")
   647  				}
   648  			}
   649  			s.ungetr()
   650  			return false
   651  		}
   652  		// d < base
   653  		x = x*base + d
   654  		c = s.getr()
   655  	}
   656  	s.ungetr()
   657  
   658  	if x > max && base == 8 {
   659  		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
   660  		return false
   661  	}
   662  
   663  	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
   664  		s.error("escape sequence is invalid Unicode code point")
   665  		return false
   666  	}
   667  
   668  	return true
   669  }