// Source: github.com/searKing/golang/go@v1.2.117/go/scanner/scanner.go

// Copyright 2020 The searKing Author. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"bufio"
	"bytes"
	"fmt"
	"go/token"
	"path/filepath"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// A Mode value is a set of flags (or 0).
// They control scanner behavior.
type Mode uint

const (
	// ModeCaseSensitive is consulted by PeekString when it compares the
	// source against its literal candidates.
	// NOTE(review): PeekString accepts a case-folded (strings.EqualFold)
	// match only when this flag is set, which reads as the opposite of the
	// flag's name — confirm the intended polarity.
	ModeCaseSensitive Mode = 1 << iota
	// ModeRegexpPerl makes PeekRegexpAny treat its arguments as regular
	// expressions in the default syntax of package regexp.
	ModeRegexpPerl
	// ModeRegexpPosix makes PeekRegexpAny treat its arguments as POSIX
	// regular expressions. It is checked before ModeRegexpPerl, so it wins
	// when both flags are set.
	ModeRegexpPosix
)

// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
38 type Scanner struct { 39 // immutable state 40 file *token.File // source file handle 41 dir string // directory portion of file.Name() 42 src []byte // source 43 err ErrorHandler // error reporting; or nil 44 mode Mode // scanning mode 45 46 // scanning state 47 offset int // character offset 48 rdOffset int // reading offset (position after current character) 49 lineOffset int // current line offset 50 51 // public state - ok to modify 52 ErrorCount int // number of errors encountered 53 } 54 55 const bom = 0xFEFF // byte order mark, only permitted as very first character 56 57 func (s *Scanner) AtEOF() bool { 58 return s.rdOffset >= len(s.src) 59 } 60 61 func (s *Scanner) CurrentBytes() []byte { 62 return s.src[s.offset:s.rdOffset] 63 } 64 65 func (s *Scanner) CurrentString() string { 66 return string(s.CurrentBytes()) 67 } 68 69 func (s *Scanner) CurrentRunes() []rune { 70 return []rune(s.CurrentString()) 71 } 72 73 func (s *Scanner) CurrentRune() rune { 74 runes := s.CurrentRunes() 75 if len(runes) > 0 { 76 return runes[0] 77 } 78 return -1 79 } 80 81 func (s *Scanner) CurrentLength() int { 82 return s.rdOffset - s.offset 83 } 84 85 // walk until current is consumed 86 func (s *Scanner) Consume() { 87 chars := s.CurrentBytes() 88 if len(chars) == 0 { 89 return 90 } 91 92 lines := bytes.Split(chars, []byte{'\n'}) 93 var hasCL bool 94 if len(lines) > 1 { 95 hasCL = true 96 } 97 98 for _, line := range lines { 99 lineLen := len(line) 100 if hasCL { 101 lineLen++ 102 s.lineOffset = s.offset 103 s.file.AddLine(s.offset) 104 } 105 106 s.offset = s.offset + lineLen 107 } 108 s.offset = s.rdOffset 109 } 110 111 func (s *Scanner) NextByte() { 112 s.NextBytesN(1) 113 } 114 115 func (s *Scanner) NextBytesN(n int) { 116 s.Consume() 117 if s.rdOffset+n <= len(s.src) { 118 s.rdOffset += n 119 } else { 120 s.offset = len(s.src) 121 } 122 } 123 124 // Read the NextRune Unicode char into s.ch. 125 // s.AtEOF() == true means end-of-file. 
126 func (s *Scanner) NextRune() { 127 if s.rdOffset < len(s.src) { 128 s.Consume() 129 r, w := rune(s.src[s.rdOffset]), 1 130 switch { 131 case r == 0: 132 s.error(s.offset, "illegal character NUL") 133 case r >= utf8.RuneSelf: 134 // not ASCII 135 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 136 if r == utf8.RuneError && w == 1 { 137 s.error(s.offset, "illegal UTF-8 encoding") 138 } else if r == bom && s.offset > 0 { 139 s.error(s.offset, "illegal byte order mark") 140 } 141 } 142 s.rdOffset += w 143 } else { 144 s.Consume() 145 s.offset = len(s.src) 146 } 147 } 148 149 func (s *Scanner) PeekRune() rune { 150 if s.rdOffset < len(s.src) { 151 r, w := rune(s.src[s.rdOffset]), 1 152 switch { 153 case r == 0: 154 s.error(s.offset, "illegal character NUL") 155 case r >= utf8.RuneSelf: 156 // not ASCII 157 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 158 if r == utf8.RuneError && w == 1 { 159 s.error(s.offset, "illegal UTF-8 encoding") 160 } else if r == bom && s.offset > 0 { 161 s.error(s.offset, "illegal byte order mark") 162 } 163 } 164 return r 165 } 166 return -1 167 } 168 169 // PeekByte returns the byte following the most recently read character without 170 // advancing the scanner. If the scanner is at EOF, PeekByte returns 0. 171 func (s *Scanner) PeekByte() byte { 172 if s.rdOffset < len(s.src) { 173 return s.src[s.rdOffset] 174 } 175 return 0 176 } 177 178 // Read the NextRune Unicode chars into s.ch. 179 // s.ch < 0 means end-of-file. 180 func (s *Scanner) NextRunesN(n int) { 181 offsetBegin := s.rdOffset 182 183 for i := 0; i < n; i++ { 184 s.NextRune() 185 } 186 s.offset = offsetBegin 187 } 188 189 // Read the NextRune Unicode chars into s.ch. 190 // s.ch < 0 means end-of-file. 
191 func (s *Scanner) NextRegexp(expectStrs ...string) { 192 match := s.PeekRegexpAny() 193 if match == "" { 194 return 195 } 196 offsetBegin := s.rdOffset 197 198 for range match { 199 s.NextRune() 200 } 201 s.offset = offsetBegin 202 } 203 204 // PeekRegexpAny returns the string following the most recently read character which matches the regexp case without 205 // advancing the scanner. If the scanner is at EOF or regexp unmatched, PeekRegexpAny returns nil. 206 func (s *Scanner) PeekRegexpAny(expectStrs ...string) string { 207 if s.AtEOF() { 208 return "" 209 } 210 if s.mode&ModeRegexpPosix != 0 { 211 return s.peekRegexpPosix(expectStrs...) 212 } else if s.mode&ModeRegexpPerl != 0 { 213 return s.peekRegexpPerl(expectStrs...) 214 } 215 216 return s.PeekString(expectStrs...) 217 } 218 219 func (s *Scanner) PeekString(expectStrs ...string) string { 220 if s.AtEOF() { 221 return "" 222 } 223 224 // regex mode 225 for _, expect := range expectStrs { 226 endPos := s.rdOffset + len(expect) 227 if endPos > len(s.src) { 228 continue 229 } 230 selected := s.src[s.rdOffset:endPos] 231 if string(selected) == expect { 232 return string(selected) 233 } 234 235 if ((s.mode&ModeCaseSensitive != 0) && strings.EqualFold(string(selected), expect)) || 236 string(selected) == expect { 237 return string(selected) 238 } 239 } 240 return "" 241 } 242 243 func (s *Scanner) peekRegexpPosix(expectStrs ...string) string { 244 if s.AtEOF() { 245 return "" 246 } 247 248 // regex mode 249 for _, expect := range expectStrs { 250 expect = "^" + strings.TrimPrefix(expect, "^") 251 252 reg := regexp.MustCompilePOSIX(expect) 253 matches := reg.FindStringSubmatch(string(s.src[s.rdOffset:])) 254 if len(matches) == 0 { 255 continue 256 } 257 258 return matches[0] 259 } 260 return "" 261 } 262 263 func (s *Scanner) peekRegexpPerl(expectStrs ...string) string { 264 if s.AtEOF() { 265 return "" 266 } 267 268 // regex mode 269 for _, expect := range expectStrs { 270 expect = "^" + 
strings.TrimPrefix(expect, "^") 271 272 reg := regexp.MustCompile(expect) 273 matches := reg.FindStringSubmatch(string(s.src[s.rdOffset:])) 274 if len(matches) == 0 { 275 continue 276 } 277 278 return matches[0] 279 } 280 return "" 281 } 282 283 // Init prepares the scanner s to tokenize the text src by setting the 284 // scanner at the beginning of src. The scanner uses the file set file 285 // for position information and it adds line information for each line. 286 // It is ok to re-use the same file when re-scanning the same file as 287 // line information which is already present is ignored. Init causes a 288 // panic if the file size does not match the src size. 289 // 290 // Calls to Scan will invoke the error handler err if they encounter a 291 // syntax error and err is not nil. Also, for each error encountered, 292 // the Scanner field ErrorCount is incremented by one. The mode parameter 293 // determines how comments are handled. 294 // 295 // Note that Init may call err if there is an error in the first character 296 // of the file. 297 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { 298 // Explicitly initialize all fields since a scanner may be reused. 
299 if file.Size() != len(src) { 300 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 301 } 302 s.file = file 303 s.dir, _ = filepath.Split(file.Name()) 304 s.src = src 305 s.err = err 306 s.mode = mode 307 308 s.offset = 0 309 s.rdOffset = 0 310 s.lineOffset = 0 311 s.ErrorCount = 0 312 313 if s.PeekRune() == bom { 314 s.NextRune() // ignore BOM at file beginning 315 } 316 } 317 318 func (s *Scanner) error(offs int, msg string) { 319 if s.err != nil { 320 s.err(s.file.Position(s.file.Pos(offs)), msg) 321 } 322 s.ErrorCount++ 323 } 324 325 func digitVal(ch rune) int { 326 switch { 327 case '0' <= ch && ch <= '9': 328 return int(ch - '0') 329 case 'a' <= ch && ch <= 'f': 330 return int(ch - 'a' + 10) 331 case 'A' <= ch && ch <= 'F': 332 return int(ch - 'A' + 10) 333 } 334 return 16 // larger than any legal digit val 335 } 336 337 // ScanEscape parses an escape sequence where rune is the accepted 338 // escaped quote. In case of a syntax error, it stops at the offending 339 // character (without consuming it) and returns false. Otherwise 340 // it returns true. 
341 func (s *Scanner) ScanEscape(quote rune) bool { 342 offs := s.offset 343 344 var ch = s.CurrentRune() 345 346 var n int 347 var base, max uint32 348 switch ch { 349 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 350 s.NextRune() 351 return true 352 case '0', '1', '2', '3', '4', '5', '6', '7': 353 n, base, max = 3, 8, 255 354 case 'x': 355 s.NextRune() 356 n, base, max = 2, 16, 255 357 case 'u': 358 s.NextRune() 359 n, base, max = 4, 16, unicode.MaxRune 360 case 'U': 361 s.NextRune() 362 n, base, max = 8, 16, unicode.MaxRune 363 default: 364 msg := "unknown escape sequence" 365 if ch < 0 { 366 msg = "escape sequence not terminated" 367 } 368 s.error(offs, msg) 369 return false 370 } 371 372 var x uint32 373 for n > 0 { 374 d := uint32(digitVal(ch)) 375 if d >= base { 376 msg := fmt.Sprintf("illegal character %#U in escape sequence", ch) 377 if ch < 0 { 378 msg = "escape sequence not terminated" 379 } 380 s.error(s.offset, msg) 381 return false 382 } 383 x = x*base + d 384 s.NextRune() 385 n-- 386 } 387 388 if x > max || 0xD800 <= x && x < 0xE000 { 389 s.error(offs, "escape sequence is invalid Unicode code point") 390 return false 391 } 392 393 return true 394 } 395 396 func (s *Scanner) ScanRune() string { 397 // '\'' opening already consumed 398 offs := s.offset - 1 399 400 valid := true 401 n := 0 402 for { 403 var ch = s.CurrentRune() 404 405 if ch == '\n' || ch < 0 { 406 // only report error if we don't have one already 407 if valid { 408 s.error(offs, "rune literal not terminated") 409 valid = false 410 } 411 break 412 } 413 s.NextRune() 414 if ch == '\'' { 415 break 416 } 417 n++ 418 if ch == '\\' { 419 if !s.ScanEscape('\'') { 420 valid = false 421 } 422 // continue to read to closing quote 423 } 424 } 425 426 if valid && n != 1 { 427 s.error(offs, "illegal rune literal") 428 } 429 430 return string(s.src[offs:s.offset]) 431 } 432 433 func (s *Scanner) ScanString() string { 434 // '"' opening already consumed 435 offs := s.offset - 1 436 437 for { 
438 var ch = s.CurrentRune() 439 if ch == '\n' || ch < 0 { 440 s.error(offs, "string literal not terminated") 441 break 442 } 443 s.NextRune() 444 if ch == '"' { 445 break 446 } 447 if ch == '\\' { 448 s.ScanEscape('"') 449 } 450 } 451 452 return string(s.src[offs:s.offset]) 453 } 454 455 func stripCR(b []byte, comment bool) []byte { 456 c := make([]byte, len(b)) 457 i := 0 458 for j, ch := range b { 459 // In a /*-style comment, don't strip \r from *\r/ (incl. 460 // sequences of \r from *\r\r...\r/) since the resulting 461 // */ would terminate the comment too early unless the \r 462 // is immediately following the opening /* in which case 463 // it's ok because /*/ is not closed yet (issue #11151). 464 if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' { 465 c[i] = ch 466 i++ 467 } 468 } 469 return c[:i] 470 } 471 472 func (s *Scanner) ScanRawString() string { 473 // '`' opening already consumed 474 offs := s.offset - 1 475 476 hasCR := false 477 for { 478 var ch = s.CurrentRune() 479 if ch < 0 { 480 s.error(offs, "raw string literal not terminated") 481 break 482 } 483 s.NextRune() 484 if ch == '`' { 485 break 486 } 487 if ch == '\r' { 488 hasCR = true 489 } 490 } 491 492 lit := s.src[offs:s.offset] 493 if hasCR { 494 lit = stripCR(lit, false) 495 } 496 497 return string(lit) 498 } 499 500 func (s *Scanner) ScanLine() string { 501 // '"' opening already consumed 502 offs := s.offset 503 504 for { 505 var ch = s.CurrentRune() 506 if ch < 0 { 507 s.error(offs, "string literal not terminated") 508 break 509 } 510 s.NextRune() 511 if ch == '\n' { 512 break 513 } 514 } 515 516 return string(s.src[offs:s.offset]) 517 } 518 519 // ScanSplits advances the Scanner to the next token by splits when first meet, which will then be 520 // available through the Bytes or Text method. It returns false when the 521 // scan stops, either by reaching the end of the input or an error. 
522 // After Scan returns false, the Err method will return any error that 523 // occurred during scanning, except that if it was io.EOF, Err 524 // will return nil. 525 func (s *Scanner) ScanSplits(splits ...bufio.SplitFunc) ([]byte, bool) { 526 s.Consume() 527 528 for _, split := range splits { 529 if split == nil { 530 continue 531 } 532 // See if we can get a token with what we already have. 533 // If we've run out of data but have an error, give the split function 534 // a chance to recover any remaining, possibly empty token. 535 // atEOF is true always, for we consume by a byte slice 536 advance, token, err := split(s.src[s.rdOffset:], true) 537 if err != nil && err != bufio.ErrFinalToken { 538 s.error(s.offset, err.Error()) 539 return nil, false 540 } 541 s.NextBytesN(advance) 542 if len(token) != 0 { 543 return token, true 544 } 545 } 546 return nil, false 547 }