// Source: github.com/searKing/golang/go@v1.2.74/go/scanner/scanner.go

// Copyright 2020 The searKing Author. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"bufio"
	"bytes"
	"fmt"
	"go/token"
	"path/filepath"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// A Mode value is a set of flags (or 0).
// They control how the scanner matches expectation strings in
// PeekRegexpAny, PeekString and NextRegexp.
type Mode uint

const (
	// ModeCaseSensitive is consulted by PeekString.
	// NOTE(review): as implemented, setting this flag makes PeekString
	// additionally accept matches that differ only in case
	// (strings.EqualFold); without it only exact matches are accepted.
	// The name suggests the opposite - confirm the intent with callers.
	ModeCaseSensitive Mode = 1 << iota
	// ModeRegexpPerl makes PeekRegexpAny treat expectations as regular
	// expressions compiled with regexp.MustCompile.
	ModeRegexpPerl
	// ModeRegexpPosix makes PeekRegexpAny treat expectations as regular
	// expressions compiled with regexp.MustCompilePOSIX. It is checked
	// before ModeRegexpPerl, so it wins when both flags are set.
	ModeRegexpPosix
)

// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
40 // 41 type Scanner struct { 42 // immutable state 43 file *token.File // source file handle 44 dir string // directory portion of file.Name() 45 src []byte // source 46 err ErrorHandler // error reporting; or nil 47 mode Mode // scanning mode 48 49 // scanning state 50 offset int // character offset 51 rdOffset int // reading offset (position after current character) 52 lineOffset int // current line offset 53 54 // public state - ok to modify 55 ErrorCount int // number of errors encountered 56 } 57 58 const bom = 0xFEFF // byte order mark, only permitted as very first character 59 60 func (s *Scanner) AtEOF() bool { 61 return s.rdOffset >= len(s.src) 62 } 63 64 func (s *Scanner) CurrentBytes() []byte { 65 return s.src[s.offset:s.rdOffset] 66 } 67 68 func (s *Scanner) CurrentString() string { 69 return string(s.CurrentBytes()) 70 } 71 72 func (s *Scanner) CurrentRunes() []rune { 73 return []rune(s.CurrentString()) 74 } 75 76 func (s *Scanner) CurrentRune() rune { 77 runes := s.CurrentRunes() 78 if len(runes) > 0 { 79 return runes[0] 80 } 81 return -1 82 } 83 84 func (s *Scanner) CurrentLength() int { 85 return s.rdOffset - s.offset 86 } 87 88 // walk until current is consumed 89 func (s *Scanner) Consume() { 90 chars := s.CurrentBytes() 91 if len(chars) == 0 { 92 return 93 } 94 95 lines := bytes.Split(chars, []byte{'\n'}) 96 var hasCL bool 97 if len(lines) > 1 { 98 hasCL = true 99 } 100 101 for _, line := range lines { 102 lineLen := len(line) 103 if hasCL { 104 lineLen++ 105 s.lineOffset = s.offset 106 s.file.AddLine(s.offset) 107 } 108 109 s.offset = s.offset + lineLen 110 } 111 s.offset = s.rdOffset 112 } 113 114 func (s *Scanner) NextByte() { 115 s.NextBytesN(1) 116 } 117 118 func (s *Scanner) NextBytesN(n int) { 119 s.Consume() 120 if s.rdOffset+n <= len(s.src) { 121 s.rdOffset += n 122 } else { 123 s.offset = len(s.src) 124 } 125 } 126 127 // Read the NextRune Unicode char into s.ch. 128 // s.AtEOF() == true means end-of-file. 
129 func (s *Scanner) NextRune() { 130 if s.rdOffset < len(s.src) { 131 s.Consume() 132 r, w := rune(s.src[s.rdOffset]), 1 133 switch { 134 case r == 0: 135 s.error(s.offset, "illegal character NUL") 136 case r >= utf8.RuneSelf: 137 // not ASCII 138 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 139 if r == utf8.RuneError && w == 1 { 140 s.error(s.offset, "illegal UTF-8 encoding") 141 } else if r == bom && s.offset > 0 { 142 s.error(s.offset, "illegal byte order mark") 143 } 144 } 145 s.rdOffset += w 146 } else { 147 s.Consume() 148 s.offset = len(s.src) 149 } 150 } 151 152 func (s *Scanner) PeekRune() rune { 153 if s.rdOffset < len(s.src) { 154 r, w := rune(s.src[s.rdOffset]), 1 155 switch { 156 case r == 0: 157 s.error(s.offset, "illegal character NUL") 158 case r >= utf8.RuneSelf: 159 // not ASCII 160 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 161 if r == utf8.RuneError && w == 1 { 162 s.error(s.offset, "illegal UTF-8 encoding") 163 } else if r == bom && s.offset > 0 { 164 s.error(s.offset, "illegal byte order mark") 165 } 166 } 167 return r 168 } 169 return -1 170 } 171 172 // PeekByte returns the byte following the most recently read character without 173 // advancing the scanner. If the scanner is at EOF, PeekByte returns 0. 174 func (s *Scanner) PeekByte() byte { 175 if s.rdOffset < len(s.src) { 176 return s.src[s.rdOffset] 177 } 178 return 0 179 } 180 181 // Read the NextRune Unicode chars into s.ch. 182 // s.ch < 0 means end-of-file. 183 // 184 func (s *Scanner) NextRunesN(n int) { 185 offsetBegin := s.rdOffset 186 187 for i := 0; i < n; i++ { 188 s.NextRune() 189 } 190 s.offset = offsetBegin 191 } 192 193 // Read the NextRune Unicode chars into s.ch. 194 // s.ch < 0 means end-of-file. 
195 // 196 func (s *Scanner) NextRegexp(expectStrs ...string) { 197 match := s.PeekRegexpAny() 198 if match == "" { 199 return 200 } 201 offsetBegin := s.rdOffset 202 203 for range match { 204 s.NextRune() 205 } 206 s.offset = offsetBegin 207 } 208 209 // PeekRegexpAny returns the string following the most recently read character which matches the regexp case without 210 // advancing the scanner. If the scanner is at EOF or regexp unmatched, PeekRegexpAny returns nil. 211 func (s *Scanner) PeekRegexpAny(expectStrs ...string) string { 212 if s.AtEOF() { 213 return "" 214 } 215 if s.mode&ModeRegexpPosix != 0 { 216 return s.peekRegexpPosix(expectStrs...) 217 } else if s.mode&ModeRegexpPerl != 0 { 218 return s.peekRegexpPerl(expectStrs...) 219 } 220 221 return s.PeekString(expectStrs...) 222 } 223 224 func (s *Scanner) PeekString(expectStrs ...string) string { 225 if s.AtEOF() { 226 return "" 227 } 228 229 // regex mode 230 for _, expect := range expectStrs { 231 endPos := s.rdOffset + len(expect) 232 if endPos > len(s.src) { 233 continue 234 } 235 selected := s.src[s.rdOffset:endPos] 236 if string(selected) == expect { 237 return string(selected) 238 } 239 240 if ((s.mode&ModeCaseSensitive != 0) && strings.EqualFold(string(selected), expect)) || 241 string(selected) == expect { 242 return string(selected) 243 } 244 } 245 return "" 246 } 247 248 func (s *Scanner) peekRegexpPosix(expectStrs ...string) string { 249 if s.AtEOF() { 250 return "" 251 } 252 253 // regex mode 254 for _, expect := range expectStrs { 255 expect = "^" + strings.TrimPrefix(expect, "^") 256 257 reg := regexp.MustCompilePOSIX(expect) 258 matches := reg.FindStringSubmatch(string(s.src[s.rdOffset:])) 259 if len(matches) == 0 { 260 continue 261 } 262 263 return matches[0] 264 } 265 return "" 266 } 267 268 func (s *Scanner) peekRegexpPerl(expectStrs ...string) string { 269 if s.AtEOF() { 270 return "" 271 } 272 273 // regex mode 274 for _, expect := range expectStrs { 275 expect = "^" + 
strings.TrimPrefix(expect, "^") 276 277 reg := regexp.MustCompile(expect) 278 matches := reg.FindStringSubmatch(string(s.src[s.rdOffset:])) 279 if len(matches) == 0 { 280 continue 281 } 282 283 return matches[0] 284 } 285 return "" 286 } 287 288 // Init prepares the scanner s to tokenize the text src by setting the 289 // scanner at the beginning of src. The scanner uses the file set file 290 // for position information and it adds line information for each line. 291 // It is ok to re-use the same file when re-scanning the same file as 292 // line information which is already present is ignored. Init causes a 293 // panic if the file size does not match the src size. 294 // 295 // Calls to Scan will invoke the error handler err if they encounter a 296 // syntax error and err is not nil. Also, for each error encountered, 297 // the Scanner field ErrorCount is incremented by one. The mode parameter 298 // determines how comments are handled. 299 // 300 // Note that Init may call err if there is an error in the first character 301 // of the file. 302 // 303 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { 304 // Explicitly initialize all fields since a scanner may be reused. 
305 if file.Size() != len(src) { 306 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 307 } 308 s.file = file 309 s.dir, _ = filepath.Split(file.Name()) 310 s.src = src 311 s.err = err 312 s.mode = mode 313 314 s.offset = 0 315 s.rdOffset = 0 316 s.lineOffset = 0 317 s.ErrorCount = 0 318 319 if s.PeekRune() == bom { 320 s.NextRune() // ignore BOM at file beginning 321 } 322 } 323 324 func (s *Scanner) error(offs int, msg string) { 325 if s.err != nil { 326 s.err(s.file.Position(s.file.Pos(offs)), msg) 327 } 328 s.ErrorCount++ 329 } 330 331 func digitVal(ch rune) int { 332 switch { 333 case '0' <= ch && ch <= '9': 334 return int(ch - '0') 335 case 'a' <= ch && ch <= 'f': 336 return int(ch - 'a' + 10) 337 case 'A' <= ch && ch <= 'F': 338 return int(ch - 'A' + 10) 339 } 340 return 16 // larger than any legal digit val 341 } 342 343 // ScanEscape parses an escape sequence where rune is the accepted 344 // escaped quote. In case of a syntax error, it stops at the offending 345 // character (without consuming it) and returns false. Otherwise 346 // it returns true. 
347 func (s *Scanner) ScanEscape(quote rune) bool { 348 offs := s.offset 349 350 var ch = s.CurrentRune() 351 352 var n int 353 var base, max uint32 354 switch ch { 355 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 356 s.NextRune() 357 return true 358 case '0', '1', '2', '3', '4', '5', '6', '7': 359 n, base, max = 3, 8, 255 360 case 'x': 361 s.NextRune() 362 n, base, max = 2, 16, 255 363 case 'u': 364 s.NextRune() 365 n, base, max = 4, 16, unicode.MaxRune 366 case 'U': 367 s.NextRune() 368 n, base, max = 8, 16, unicode.MaxRune 369 default: 370 msg := "unknown escape sequence" 371 if ch < 0 { 372 msg = "escape sequence not terminated" 373 } 374 s.error(offs, msg) 375 return false 376 } 377 378 var x uint32 379 for n > 0 { 380 d := uint32(digitVal(ch)) 381 if d >= base { 382 msg := fmt.Sprintf("illegal character %#U in escape sequence", ch) 383 if ch < 0 { 384 msg = "escape sequence not terminated" 385 } 386 s.error(s.offset, msg) 387 return false 388 } 389 x = x*base + d 390 s.NextRune() 391 n-- 392 } 393 394 if x > max || 0xD800 <= x && x < 0xE000 { 395 s.error(offs, "escape sequence is invalid Unicode code point") 396 return false 397 } 398 399 return true 400 } 401 402 func (s *Scanner) ScanRune() string { 403 // '\'' opening already consumed 404 offs := s.offset - 1 405 406 valid := true 407 n := 0 408 for { 409 var ch = s.CurrentRune() 410 411 if ch == '\n' || ch < 0 { 412 // only report error if we don't have one already 413 if valid { 414 s.error(offs, "rune literal not terminated") 415 valid = false 416 } 417 break 418 } 419 s.NextRune() 420 if ch == '\'' { 421 break 422 } 423 n++ 424 if ch == '\\' { 425 if !s.ScanEscape('\'') { 426 valid = false 427 } 428 // continue to read to closing quote 429 } 430 } 431 432 if valid && n != 1 { 433 s.error(offs, "illegal rune literal") 434 } 435 436 return string(s.src[offs:s.offset]) 437 } 438 439 func (s *Scanner) ScanString() string { 440 // '"' opening already consumed 441 offs := s.offset - 1 442 443 for { 
444 var ch = s.CurrentRune() 445 if ch == '\n' || ch < 0 { 446 s.error(offs, "string literal not terminated") 447 break 448 } 449 s.NextRune() 450 if ch == '"' { 451 break 452 } 453 if ch == '\\' { 454 s.ScanEscape('"') 455 } 456 } 457 458 return string(s.src[offs:s.offset]) 459 } 460 461 func stripCR(b []byte, comment bool) []byte { 462 c := make([]byte, len(b)) 463 i := 0 464 for j, ch := range b { 465 // In a /*-style comment, don't strip \r from *\r/ (incl. 466 // sequences of \r from *\r\r...\r/) since the resulting 467 // */ would terminate the comment too early unless the \r 468 // is immediately following the opening /* in which case 469 // it's ok because /*/ is not closed yet (issue #11151). 470 if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' { 471 c[i] = ch 472 i++ 473 } 474 } 475 return c[:i] 476 } 477 478 func (s *Scanner) ScanRawString() string { 479 // '`' opening already consumed 480 offs := s.offset - 1 481 482 hasCR := false 483 for { 484 var ch = s.CurrentRune() 485 if ch < 0 { 486 s.error(offs, "raw string literal not terminated") 487 break 488 } 489 s.NextRune() 490 if ch == '`' { 491 break 492 } 493 if ch == '\r' { 494 hasCR = true 495 } 496 } 497 498 lit := s.src[offs:s.offset] 499 if hasCR { 500 lit = stripCR(lit, false) 501 } 502 503 return string(lit) 504 } 505 506 func (s *Scanner) ScanLine() string { 507 // '"' opening already consumed 508 offs := s.offset 509 510 for { 511 var ch = s.CurrentRune() 512 if ch < 0 { 513 s.error(offs, "string literal not terminated") 514 break 515 } 516 s.NextRune() 517 if ch == '\n' { 518 break 519 } 520 } 521 522 return string(s.src[offs:s.offset]) 523 } 524 525 // ScanSplits advances the Scanner to the next token by splits when first meet, which will then be 526 // available through the Bytes or Text method. It returns false when the 527 // scan stops, either by reaching the end of the input or an error. 
528 // After Scan returns false, the Err method will return any error that 529 // occurred during scanning, except that if it was io.EOF, Err 530 // will return nil. 531 func (s *Scanner) ScanSplits(splits ...bufio.SplitFunc) ([]byte, bool) { 532 s.Consume() 533 534 for _, split := range splits { 535 if split == nil { 536 continue 537 } 538 // See if we can get a token with what we already have. 539 // If we've run out of data but have an error, give the split function 540 // a chance to recover any remaining, possibly empty token. 541 // atEOF is true always, for we consume by a byte slice 542 advance, token, err := split(s.src[s.rdOffset:], true) 543 if err != nil && err != bufio.ErrFinalToken { 544 s.error(s.offset, err.Error()) 545 return nil, false 546 } 547 s.NextBytesN(advance) 548 if len(token) != 0 { 549 return token, true 550 } 551 } 552 return nil, false 553 }