github.com/goplus/llgo@v0.8.3/xtool/clang/types/scanner/scanner.go (about)

     1  /*
     2   * Copyright (c) 2022 The GoPlus Authors (goplus.org). All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package scanner
    18  
    19  import (
    20  	"fmt"
    21  	"go/token"
    22  	"unicode"
    23  	"unicode/utf8"
    24  )
    25  
// ErrorHandler is a callback type that receives the position of a syntax
// error together with an error message. The position is intended to point to
// the beginning of the offending token.
//
// NOTE(review): in this file Scanner.Init accepts only the source text, and
// Scanner reports errors through its OnErr field (func(msg string)), so
// nothing visible here invokes an ErrorHandler. Confirm whether other code in
// the package still relies on this type.
type ErrorHandler func(pos token.Position, msg string)
    31  
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state
	src string // text being scanned, as handed to Init

	// scanning state
	ch       rune // current character; eof once the end of src has been reached
	offset   int  // character offset (byte index of ch within src)
	rdOffset int  // reading offset (position after current character)

	// public state - ok to modify
	ErrorCount int              // number of errors encountered; reset to 0 by Init
	OnErr      func(msg string) // if non-nil, called with the message of each reported error
}
    48  
// Special rune values recognized or produced by the scanner.
const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file; the value stored in Scanner.ch once src is exhausted
)
    53  
    54  // Read the next Unicode char into s.ch.
    55  // s.ch < 0 means end-of-file.
    56  //
    57  // For optimization, there is some overlap between this method and
    58  // s.scanIdentifier.
    59  func (s *Scanner) next() {
    60  	if s.rdOffset < len(s.src) {
    61  		s.offset = s.rdOffset
    62  		r, w := rune(s.src[s.rdOffset]), 1
    63  		switch {
    64  		case r == 0:
    65  			s.error("illegal character NUL")
    66  		case r >= utf8.RuneSelf:
    67  			// not ASCII
    68  			r, w = utf8.DecodeRuneInString(s.src[s.rdOffset:])
    69  			if r == utf8.RuneError && w == 1 {
    70  				s.error("illegal UTF-8 encoding")
    71  			} else if r == bom && s.offset > 0 {
    72  				s.error("illegal byte order mark")
    73  			}
    74  		}
    75  		s.rdOffset += w
    76  		s.ch = r
    77  	} else {
    78  		s.offset = len(s.src)
    79  		s.ch = eof
    80  	}
    81  }
    82  
    83  // peek returns the byte following the most recently read character without
    84  // advancing the scanner. If the scanner is at EOF, peek returns 0.
    85  func (s *Scanner) peek() byte {
    86  	if s.rdOffset < len(s.src) {
    87  		return s.src[s.rdOffset]
    88  	}
    89  	return 0
    90  }
    91  
    92  func (s *Scanner) Init(src string) {
    93  	s.src = src
    94  	s.ch = ' '
    95  	s.offset = 0
    96  	s.rdOffset = 0
    97  	s.ErrorCount = 0
    98  
    99  	s.next()
   100  	if s.ch == bom {
   101  		s.next() // ignore BOM at file beginning
   102  	}
   103  }
   104  
// Source returns the text held in s.src, i.e. the string most recently
// passed to Init.
func (s *Scanner) Source() string {
	return s.src
}
   108  
   109  func (s *Scanner) error(msg string) {
   110  	if s.OnErr != nil {
   111  		s.OnErr(msg)
   112  	}
   113  	s.ErrorCount++
   114  }
   115  
   116  func (s *Scanner) errorf(format string, args ...interface{}) {
   117  	s.error(fmt.Sprintf(format, args...))
   118  }
   119  
   120  func isLetter(ch rune) bool {
   121  	return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
   122  }
   123  
   124  func isDigit(ch rune) bool {
   125  	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
   126  }
   127  
   128  // scanIdentifier reads the string of valid identifier characters at s.offset.
   129  // It must only be called when s.ch is known to be a valid letter.
   130  //
   131  // Be careful when making changes to this function: it is optimized and affects
   132  // scanning performance significantly.
   133  func (s *Scanner) scanIdentifier() string {
   134  	offs := s.offset
   135  
   136  	// Optimize for the common case of an ASCII identifier.
   137  	//
   138  	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
   139  	// avoids conversions to runes.
   140  	//
   141  	// In case we encounter a non-ASCII character, fall back on the slower path
   142  	// of calling into s.next().
   143  	for rdOffset, b := range s.src[s.rdOffset:] {
   144  		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
   145  			// Avoid assigning a rune for the common case of an ascii character.
   146  			continue
   147  		}
   148  		s.rdOffset += rdOffset
   149  		if 0 < b && b < utf8.RuneSelf {
   150  			// Optimization: we've encountered an ASCII character that's not a letter
   151  			// or number. Avoid the call into s.next() and corresponding set up.
   152  			//
   153  			// Note that s.next() does some line accounting if s.ch is '\n', so this
   154  			// shortcut is only possible because we know that the preceding character
   155  			// is not '\n'.
   156  			s.ch = rune(b)
   157  			s.offset = s.rdOffset
   158  			s.rdOffset++
   159  			goto exit
   160  		}
   161  		// We know that the preceding character is valid for an identifier because
   162  		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
   163  		// at s.rdOffset resets the scanner state.
   164  		s.next()
   165  		for isLetter(s.ch) || isDigit(s.ch) {
   166  			s.next()
   167  		}
   168  		goto exit
   169  	}
   170  	s.offset = len(s.src)
   171  	s.rdOffset = len(s.src)
   172  	s.ch = eof
   173  
   174  exit:
   175  	return string(s.src[offs:s.offset])
   176  }
   177  
   178  func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
   179  func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
   180  func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
   181  
   182  func (s *Scanner) digits(base int, invalid *int) (digsep int) {
   183  	if base <= 10 {
   184  		max := rune('0' + base)
   185  		for isDecimal(s.ch) {
   186  			if s.ch >= max && *invalid < 0 {
   187  				*invalid = s.offset // record invalid rune offset
   188  			}
   189  			digsep = 1
   190  			s.next()
   191  		}
   192  	} else {
   193  		for isHex(s.ch) {
   194  			digsep = 1
   195  			s.next()
   196  		}
   197  	}
   198  	return
   199  }
   200  
   201  func (s *Scanner) scanNumber() (token.Token, string) {
   202  	offs := s.offset
   203  
   204  	base := 10        // number base
   205  	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
   206  	digsep := 0       // bit 0: digit present, bit 1: '_' present
   207  	invalid := -1     // index of invalid digit in literal, or < 0
   208  
   209  	if s.ch == '0' {
   210  		s.next()
   211  		switch lower(s.ch) {
   212  		case 'x':
   213  			s.next()
   214  			base, prefix = 16, 'x'
   215  		case 'o':
   216  			s.next()
   217  			base, prefix = 8, 'o'
   218  		case 'b':
   219  			s.next()
   220  			base, prefix = 2, 'b'
   221  		default:
   222  			base, prefix = 8, '0'
   223  			digsep = 1 // leading 0
   224  		}
   225  	}
   226  	digsep |= s.digits(base, &invalid)
   227  	if digsep&1 == 0 {
   228  		s.error(litname(prefix) + " has no digits")
   229  	}
   230  
   231  	lit := string(s.src[offs:s.offset])
   232  	if invalid >= 0 {
   233  		s.errorf("invalid digit %q in %s", lit[invalid-offs], litname(prefix))
   234  	}
   235  	return token.INT, lit
   236  }
   237  
   238  func litname(prefix rune) string {
   239  	switch prefix {
   240  	case 'x':
   241  		return "hexadecimal literal"
   242  	case 'o', '0':
   243  		return "octal literal"
   244  	case 'b':
   245  		return "binary literal"
   246  	}
   247  	return "decimal literal"
   248  }
   249  
   250  func (s *Scanner) skipWhitespace() {
   251  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' || s.ch == '\r' {
   252  		s.next()
   253  	}
   254  }
   255  
   256  func (s *Scanner) Scan() (tok token.Token, lit string) {
   257  	s.skipWhitespace()
   258  
   259  	// determine token value
   260  	switch ch := s.ch; {
   261  	case isLetter(ch):
   262  		lit = s.scanIdentifier()
   263  		tok = token.IDENT
   264  	case isDecimal(ch):
   265  		tok, lit = s.scanNumber()
   266  	default:
   267  		s.next() // always make progress
   268  		switch ch {
   269  		case -1:
   270  			tok = token.EOF
   271  		case '.':
   272  			// fractions starting with a '.' are handled by outer switch
   273  			tok = token.PERIOD
   274  			if s.ch == '.' && s.peek() == '.' {
   275  				s.next()
   276  				s.next() // consume last '.'
   277  				tok = token.ELLIPSIS
   278  			}
   279  		case ',':
   280  			tok = token.COMMA
   281  		case '(':
   282  			tok = token.LPAREN
   283  		case ')':
   284  			tok = token.RPAREN
   285  		case '[':
   286  			tok = token.LBRACK
   287  		case ']':
   288  			tok = token.RBRACK
   289  		case '*':
   290  			tok = token.MUL
   291  		case '^':
   292  			tok = token.XOR
   293  		default:
   294  			// next reports unexpected BOMs - don't repeat
   295  			if ch != bom {
   296  				s.errorf("illegal character %#U", ch)
   297  			}
   298  			tok = token.ILLEGAL
   299  			lit = string(ch)
   300  		}
   301  	}
   302  	return
   303  }