// Source: github.com/goplus/llgo@v0.8.3/xtool/clang/types/scanner/scanner.go

/*
 * Copyright (c) 2022 The GoPlus Authors (goplus.org). All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package scanner

import (
	"fmt"
	"go/token"
	"unicode"
	"unicode/utf8"
)

// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
//
// NOTE(review): in the visible code Scanner.Init accepts only the source
// string and errors are delivered through Scanner.OnErr, which has a
// different signature; this type does not appear to be used by Scanner.
// Confirm whether it is still needed by callers.
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state
	src string // text being scanned, set by Init

	// scanning state
	ch       rune // current character; eof (< 0) once the input is exhausted
	offset   int  // byte offset of ch within src
	rdOffset int  // reading offset (byte position after current character)

	// public state - ok to modify
	ErrorCount int              // number of errors encountered
	OnErr      func(msg string) // if non-nil, called with the message of every error
}

const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file
)

// next reads the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
//
// For optimization, there is some overlap between this method and
// s.scanIdentifier.
59 func (s *Scanner) next() { 60 if s.rdOffset < len(s.src) { 61 s.offset = s.rdOffset 62 r, w := rune(s.src[s.rdOffset]), 1 63 switch { 64 case r == 0: 65 s.error("illegal character NUL") 66 case r >= utf8.RuneSelf: 67 // not ASCII 68 r, w = utf8.DecodeRuneInString(s.src[s.rdOffset:]) 69 if r == utf8.RuneError && w == 1 { 70 s.error("illegal UTF-8 encoding") 71 } else if r == bom && s.offset > 0 { 72 s.error("illegal byte order mark") 73 } 74 } 75 s.rdOffset += w 76 s.ch = r 77 } else { 78 s.offset = len(s.src) 79 s.ch = eof 80 } 81 } 82 83 // peek returns the byte following the most recently read character without 84 // advancing the scanner. If the scanner is at EOF, peek returns 0. 85 func (s *Scanner) peek() byte { 86 if s.rdOffset < len(s.src) { 87 return s.src[s.rdOffset] 88 } 89 return 0 90 } 91 92 func (s *Scanner) Init(src string) { 93 s.src = src 94 s.ch = ' ' 95 s.offset = 0 96 s.rdOffset = 0 97 s.ErrorCount = 0 98 99 s.next() 100 if s.ch == bom { 101 s.next() // ignore BOM at file beginning 102 } 103 } 104 105 func (s *Scanner) Source() string { 106 return s.src 107 } 108 109 func (s *Scanner) error(msg string) { 110 if s.OnErr != nil { 111 s.OnErr(msg) 112 } 113 s.ErrorCount++ 114 } 115 116 func (s *Scanner) errorf(format string, args ...interface{}) { 117 s.error(fmt.Sprintf(format, args...)) 118 } 119 120 func isLetter(ch rune) bool { 121 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) 122 } 123 124 func isDigit(ch rune) bool { 125 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch) 126 } 127 128 // scanIdentifier reads the string of valid identifier characters at s.offset. 129 // It must only be called when s.ch is known to be a valid letter. 130 // 131 // Be careful when making changes to this function: it is optimized and affects 132 // scanning performance significantly. 
133 func (s *Scanner) scanIdentifier() string { 134 offs := s.offset 135 136 // Optimize for the common case of an ASCII identifier. 137 // 138 // Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and 139 // avoids conversions to runes. 140 // 141 // In case we encounter a non-ASCII character, fall back on the slower path 142 // of calling into s.next(). 143 for rdOffset, b := range s.src[s.rdOffset:] { 144 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' { 145 // Avoid assigning a rune for the common case of an ascii character. 146 continue 147 } 148 s.rdOffset += rdOffset 149 if 0 < b && b < utf8.RuneSelf { 150 // Optimization: we've encountered an ASCII character that's not a letter 151 // or number. Avoid the call into s.next() and corresponding set up. 152 // 153 // Note that s.next() does some line accounting if s.ch is '\n', so this 154 // shortcut is only possible because we know that the preceding character 155 // is not '\n'. 156 s.ch = rune(b) 157 s.offset = s.rdOffset 158 s.rdOffset++ 159 goto exit 160 } 161 // We know that the preceding character is valid for an identifier because 162 // scanIdentifier is only called when s.ch is a letter, so calling s.next() 163 // at s.rdOffset resets the scanner state. 
164 s.next() 165 for isLetter(s.ch) || isDigit(s.ch) { 166 s.next() 167 } 168 goto exit 169 } 170 s.offset = len(s.src) 171 s.rdOffset = len(s.src) 172 s.ch = eof 173 174 exit: 175 return string(s.src[offs:s.offset]) 176 } 177 178 func lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter 179 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } 180 func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' } 181 182 func (s *Scanner) digits(base int, invalid *int) (digsep int) { 183 if base <= 10 { 184 max := rune('0' + base) 185 for isDecimal(s.ch) { 186 if s.ch >= max && *invalid < 0 { 187 *invalid = s.offset // record invalid rune offset 188 } 189 digsep = 1 190 s.next() 191 } 192 } else { 193 for isHex(s.ch) { 194 digsep = 1 195 s.next() 196 } 197 } 198 return 199 } 200 201 func (s *Scanner) scanNumber() (token.Token, string) { 202 offs := s.offset 203 204 base := 10 // number base 205 prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b' 206 digsep := 0 // bit 0: digit present, bit 1: '_' present 207 invalid := -1 // index of invalid digit in literal, or < 0 208 209 if s.ch == '0' { 210 s.next() 211 switch lower(s.ch) { 212 case 'x': 213 s.next() 214 base, prefix = 16, 'x' 215 case 'o': 216 s.next() 217 base, prefix = 8, 'o' 218 case 'b': 219 s.next() 220 base, prefix = 2, 'b' 221 default: 222 base, prefix = 8, '0' 223 digsep = 1 // leading 0 224 } 225 } 226 digsep |= s.digits(base, &invalid) 227 if digsep&1 == 0 { 228 s.error(litname(prefix) + " has no digits") 229 } 230 231 lit := string(s.src[offs:s.offset]) 232 if invalid >= 0 { 233 s.errorf("invalid digit %q in %s", lit[invalid-offs], litname(prefix)) 234 } 235 return token.INT, lit 236 } 237 238 func litname(prefix rune) string { 239 switch prefix { 240 case 'x': 241 return "hexadecimal literal" 242 case 'o', '0': 243 return "octal literal" 244 case 'b': 245 return "binary literal" 246 } 247 
return "decimal literal" 248 } 249 250 func (s *Scanner) skipWhitespace() { 251 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' || s.ch == '\r' { 252 s.next() 253 } 254 } 255 256 func (s *Scanner) Scan() (tok token.Token, lit string) { 257 s.skipWhitespace() 258 259 // determine token value 260 switch ch := s.ch; { 261 case isLetter(ch): 262 lit = s.scanIdentifier() 263 tok = token.IDENT 264 case isDecimal(ch): 265 tok, lit = s.scanNumber() 266 default: 267 s.next() // always make progress 268 switch ch { 269 case -1: 270 tok = token.EOF 271 case '.': 272 // fractions starting with a '.' are handled by outer switch 273 tok = token.PERIOD 274 if s.ch == '.' && s.peek() == '.' { 275 s.next() 276 s.next() // consume last '.' 277 tok = token.ELLIPSIS 278 } 279 case ',': 280 tok = token.COMMA 281 case '(': 282 tok = token.LPAREN 283 case ')': 284 tok = token.RPAREN 285 case '[': 286 tok = token.LBRACK 287 case ']': 288 tok = token.RBRACK 289 case '*': 290 tok = token.MUL 291 case '^': 292 tok = token.XOR 293 default: 294 // next reports unexpected BOMs - don't repeat 295 if ch != bom { 296 s.errorf("illegal character %#U", ch) 297 } 298 tok = token.ILLEGAL 299 lit = string(ch) 300 } 301 } 302 return 303 }