// Source: github.com/searKing/golang/go@v1.2.117/go/scanner/split.go

// Copyright 2020 The searKing Author. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// Split functions re-exported from package bufio so that callers can use them
// without importing bufio themselves.
var (
	// ScanBytes is a split function for a Scanner that returns each byte as a token.
	ScanBytes = bufio.ScanBytes

	// ScanRunes is a split function for a Scanner that returns each
	// UTF-8-encoded rune as a token. The sequence of runes returned is
	// equivalent to that from a range loop over the input as a string, which
	// means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd".
	// Because of the Scan interface, this makes it impossible for the client to
	// distinguish correctly encoded replacement runes from encoding errors.
	ScanRunes = bufio.ScanRunes

	// ScanWords is a split function for a Scanner that returns each
	// space-separated word of text, with surrounding spaces deleted. It will
	// never return an empty string. The definition of space is set by
	// unicode.IsSpace.
	ScanWords = bufio.ScanWords

	// ScanLines is a split function for a Scanner that returns each line of
	// text, stripped of any trailing end-of-line marker. The returned line may
	// be empty. The end-of-line marker is one optional carriage return followed
	// by one mandatory newline. In regular expression notation, it is `\r?\n`.
	// The last non-empty line of input will be returned even if it has no
	// newline.
	ScanLines = bufio.ScanLines
)

// ScanRawStrings is a split function for a Scanner that returns each
// back-quoted (`) string of text, quotes included. It delegates to scanStrings
// with the back quote as delimiter. The returned token may be empty. Escape
// sequences are not interpreted.
//
// From the Go specification:
// Raw string literals are character sequences between back quotes, as in `foo`.
// Within the quotes, any character may appear except back quote.
// The value of a raw string literal is the string composed of the uninterpreted (implicitly UTF-8-encoded) characters
// between the quotes; in particular, backslashes have no special meaning and the string may contain newlines.
// Carriage return characters ('\r') inside raw string literals are discarded from the raw string value.
// https://golang.org/ref/spec#String_literals
// raw_string_lit = "`" { unicode_char | newline } "`" .
func ScanRawStrings(data []byte, atEOF bool) (advance int, token []byte, err error) {
	return scanStrings(data, atEOF, '`')
}

// ScanInterpretedStrings is a split function for a Scanner that returns each
// double-quoted (") string of text, quotes included. The returned token may be
// empty.
//
// From the Go specification:
// Interpreted string literals are character sequences between double quotes, as in "bar".
// Within the quotes, any character may appear except newline and unescaped double quote.
// The text between the quotes forms the value of the literal,
// with backslash escapes interpreted as they are in rune literals
// (except that \' is illegal and \" is legal), with the same restrictions.
// The three-digit octal (\nnn) and two-digit hexadecimal (\xnn)
// escapes represent individual bytes of the resulting string;
// all other escapes represent the (possibly multi-byte) UTF-8 encoding of individual characters. Thus inside a string
// literal \377 and \xFF represent a single byte of value 0xFF=255, while ÿ, \u00FF, \U000000FF and \xc3\xbf represent
// the two bytes 0xc3 0xbf of the UTF-8 encoding of character U+00FF.
// https://golang.org/ref/spec#String_literals
// interpreted_string_lit = `"` { unicode_value | byte_value } `"` .
73 func ScanInterpretedStrings(data []byte, atEOF bool) (advance int, token []byte, err error) { 74 return scanStrings(data, atEOF, '"') 75 } 76 77 // ScanEscapes is a split function wrapper for a Scanner that returns each string which is an escape format of 78 // text. The returned line may be empty. 79 func ScanEscapes(quote rune) func(data []byte, atEOF bool) (advance int, token []byte, err error) { 80 return func(data []byte, atEOF bool) (advance int, token []byte, err error) { 81 return scanEscapes(data, atEOF, quote) 82 } 83 } 84 85 // ScanMantissas is a split function wrapper for a Scanner that returns each string which is an n-base number format of 86 // text. The returned line may be empty. 87 func ScanMantissas(base int) func(data []byte, atEOF bool) (advance int, token []byte, err error) { 88 return ScanWhile(func(r rune) bool { 89 return digitVal(r) < base 90 }) 91 } 92 93 // https://golang.org/ref/spec#Integer_literals 94 // https://golang.org/ref/spec#Floating-point_literals 95 // https://golang.org/ref/spec#Imaginary_literals 96 // ScanNumbers is a split function wrapper for a Scanner that returns each string which is an integer, floating-point 97 // or imaginary format of text. The returned line may be empty. 98 func ScanNumbers(data []byte, atEOF bool) (advance int, token []byte, err error) { 99 if atEOF && len(data) == 0 { 100 return needMoreData() 101 } 102 var off int 103 var seenSign bool 104 var seenDecimalPoint bool 105 var seenDecimalNumber bool 106 107 var lookforFraction bool 108 var lookforExponent bool 109 // First character 1: digitVal(ch) < 10. 110 // Handle .989 or 0x888 111 for { 112 // read a rune 113 advance, token, err := handleSplitError(ScanRunes(data[off:], atEOF)) 114 off = off + advance 115 if err != nil || len(token) == 0 { 116 return advance, token, err 117 } 118 ch := bytes.Runes(token)[0] 119 if ch == '.' { 120 // . 
can be seen once only 121 if seenDecimalPoint { 122 off-- 123 return off, data[:off], nil 124 } 125 seenDecimalPoint = true 126 continue 127 } 128 129 // sign can be seen leading or after E or e 130 if ch == '+' || ch == '-' { 131 // sign can be seen once only, and can never be after "." 132 if seenSign || seenDecimalPoint { 133 off-- 134 return off, data[:off], nil 135 } 136 seenSign = true 137 continue 138 } 139 140 // number must be leading with "." "+" "-" or "0-9" 141 if !seenDecimalNumber && digitVal(ch) > 10 { 142 msg := fmt.Sprintf("illegal character %#U leading escape sequence, expect \\", token) 143 return 0, nil, errors.New(msg) 144 } 145 seenDecimalNumber = true 146 147 // .989777 148 if seenDecimalPoint { 149 advance, token, err := handleSplitError(ScanMantissas(10)(data[off:], atEOF)) 150 off = off + advance 151 if err != nil || len(token) == 0 { 152 return advance, token, err 153 } 154 // look for "E" or "e" 155 lookforExponent = true 156 break 157 } 158 159 // 0x12 160 if ch == '0' { 161 // int or float 162 advance, token, err := handleSplitError(ScanRunes(data[off:], atEOF)) 163 off = off + advance 164 if err != nil { 165 return advance, token, err 166 } 167 if len(token) == 0 { 168 return off, data[:off], nil 169 } 170 ch = bytes.Runes(token)[0] 171 172 if ch == 'x' || ch == 'X' { 173 // hexadecimal int 174 advance, token, err := handleSplitError(ScanMantissas(16)(data[off:], atEOF)) 175 off = off + advance 176 if err != nil || len(token) == 0 { 177 return advance, token, err 178 } 179 if len(token) <= 0 { 180 // only scanned "0x" or "0X" 181 return 0, nil, errors.New("illegal hexadecimal number") 182 } 183 return off, data[:off], nil 184 } else { 185 // octal int or float 186 seenDecimalDigit := false 187 advance, token, err := handleSplitError(ScanMantissas(8)(data[off:], atEOF)) 188 off = off + advance 189 if err != nil { 190 return advance, token, err 191 } 192 193 // read new rune 194 advance, token, err = 
handleSplitError(ScanRunes(data[off:], atEOF)) 195 off = off + advance 196 if err != nil { 197 return advance, token, err 198 } 199 if len(token) == 0 { 200 return off, data[:off], nil 201 } 202 ch = bytes.Runes(token)[0] 203 204 if ch == '8' || ch == '9' { 205 // illegal octal int or float 206 seenDecimalDigit = true 207 advance, token, err := handleSplitError(ScanMantissas(10)(data[off:], atEOF)) 208 off = off + advance 209 if err != nil || len(token) == 0 { 210 return advance, token, err 211 } 212 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 213 off = off + advance 214 if err != nil || len(token) == 0 { 215 return advance, token, err 216 } 217 ch = bytes.Runes(token)[0] 218 } 219 if ch == '.' || ch == 'e' || ch == 'E' || ch == 'i' { 220 off-- //backward for fraction "." "e" "E" or "i" 221 lookforFraction = true 222 break 223 } 224 // octal int 225 if seenDecimalDigit { 226 return 0, nil, errors.New("illegal octal number") 227 } 228 229 off-- //backward for exit 230 231 } 232 return off, data[:off], nil 233 } 234 235 // decimal int or float 236 advance, token, err = handleSplitError(ScanMantissas(10)(data[off:], atEOF)) 237 off = off + advance 238 if err != nil || len(token) == 0 { 239 return advance, token, err 240 } 241 lookforFraction = true 242 break 243 } 244 245 // read a rune 246 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 247 off = off + advance 248 if err != nil { 249 return advance, token, err 250 } 251 if len(token) == 0 { 252 return off, data[:off], nil 253 } 254 ch := bytes.Runes(token)[0] 255 256 if lookforFraction && ch == '.' 
{ 257 advance, token, err := handleSplitError(ScanMantissas(10)(data[off:], atEOF)) 258 off = off + advance 259 if err != nil { 260 return advance, token, err 261 } 262 if len(token) == 0 { 263 return off, data[:off], nil 264 } 265 lookforExponent = true 266 267 // read new rune 268 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 269 off = off + advance 270 if err != nil { 271 return advance, token, err 272 } 273 if len(token) == 0 { 274 return off, data[:off], nil 275 } 276 ch = bytes.Runes(token)[0] 277 } 278 279 if lookforExponent && (ch == 'e' || ch == 'E') { 280 advance, token, err := handleSplitError(ScanRunes(data[off:], atEOF)) 281 off = off + advance 282 if err != nil { 283 return advance, token, err 284 } 285 if len(token) == 0 { 286 return off, data[:off], nil 287 } 288 ch = bytes.Runes(token)[0] 289 290 if ch == '-' || ch == '+' { 291 advance, token, err := handleSplitError(ScanRunes(data[off:], atEOF)) 292 off = off + advance 293 if err != nil { 294 return advance, token, err 295 } 296 if len(token) == 0 { 297 return off, data[:off], nil 298 } 299 ch = bytes.Runes(token)[0] 300 } 301 if digitVal(ch) < 10 { 302 advance, token, err := handleSplitError(ScanMantissas(10)(data[off:], atEOF)) 303 off = off + advance 304 if err != nil { 305 return advance, token, err 306 } 307 if len(token) == 0 { 308 return off, data[:off], nil 309 } 310 311 // read new rune 312 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 313 off = off + advance 314 if err != nil { 315 return advance, token, err 316 } 317 if len(token) == 0 { 318 return off, data[:off], nil 319 } 320 } else { 321 return 0, nil, errors.New("illegal floating-point exponent") 322 } 323 } 324 325 if ch != 'i' { 326 // backward 327 off = off - utf8.RuneLen(ch) 328 } 329 return off, data[:off], nil 330 } 331 332 // https://golang.org/ref/spec#Identifiers 333 // ScanIdentifier is a split function wrapper for a Scanner that returns each string which is an identifier 
format of text. 334 // The returned line may be empty. 335 // identifier = letter { letter | unicode_digit } . 336 func ScanIdentifier(data []byte, atEOF bool) (advance int, token []byte, err error) { 337 if atEOF && len(data) == 0 { 338 return needMoreData() 339 } 340 var off int 341 342 // First character 1: \. 343 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 344 off = off + advance 345 if err != nil || len(token) == 0 { 346 return advance, token, err 347 } 348 ch := bytes.Runes(token)[0] 349 350 if isLetter(ch) { 351 for isLetter(ch) || isDigit(ch) { 352 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 353 off = off + advance 354 if err != nil { 355 return advance, token, err 356 } 357 if token == nil { 358 return off, data[:off], nil 359 } 360 ch = bytes.Runes(token)[0] 361 } 362 } 363 off -= utf8.RuneLen(ch) // backward 364 return off, data[:off], nil 365 } 366 367 // ScanUntil is a split function wrapper for a Scanner that returns each string until filter case is meet. 368 // The returned line may be empty. 369 func ScanUntil(filter func(r rune) bool) func(data []byte, atEOF bool) (advance int, token []byte, err error) { 370 return ScanWhile(func(r rune) bool { 371 if filter == nil { 372 return false 373 } 374 return !filter(r) 375 }) 376 } 377 378 // ScanUntil is a split function wrapper for a Scanner that returns each string until filter case is not meet. 379 // The returned line may be empty. 380 func ScanWhile(filter func(r rune) bool) func(data []byte, atEOF bool) (advance int, token []byte, err error) { 381 return func(data []byte, atEOF bool) (advance int, token []byte, err error) { 382 if filter == nil || atEOF && len(data) == 0 { 383 return needMoreData() 384 } 385 var off int 386 387 // First character 1: \. 
388 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 389 off = off + advance 390 if err != nil || len(token) == 0 { 391 return advance, token, err 392 } 393 ch := bytes.Runes(token)[0] 394 395 for filter(ch) { 396 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 397 off = off + advance 398 if err != nil { 399 return advance, token, err 400 } 401 if token == nil { 402 return off, data[:off], nil 403 } 404 ch = bytes.Runes(token)[0] 405 } 406 off -= utf8.RuneLen(ch) // backward 407 408 return off, data[:off], nil 409 } 410 } 411 412 // ScanRegexp is a split function wrapper for a Scanner that returns each string until regexp case is not meet. 413 // The returned line may be empty. 414 func ScanRegexp(regs ...*regexp.Regexp) func(data []byte, atEOF bool) (advance int, token []byte, err error) { 415 return func(data []byte, atEOF bool) (advance int, token []byte, err error) { 416 if atEOF && len(data) == 0 { 417 return needMoreData() 418 } 419 var off int 420 421 // First character 1: \. 422 // regex mode 423 for _, reg := range regs { 424 if reg == nil { 425 continue 426 } 427 428 locs := reg.FindStringIndex(string(data[off:])) 429 if len(locs) == 0 { 430 continue 431 } 432 off = locs[1] 433 return off, data[locs[0]:off], nil 434 } 435 436 return off, data[:off], nil 437 } 438 } 439 440 // ScanRegexpPerl is a split function wrapper for a Scanner that returns each string until regexp case is not meet. 441 // The returned line may be empty. 442 // This so-called leftmost-first matching is the same semantics 443 // that Perl, Python, and other implementations use, although this 444 // package implements it without the expense of backtracking. 445 // For POSIX leftmost-longest matching, see ScanRegexpPosix. 
446 func ScanRegexpPerl(expectStrs ...string) func(data []byte, atEOF bool) (advance int, token []byte, err error) { 447 var regs []*regexp.Regexp 448 for _, expect := range expectStrs { 449 expect = "^" + strings.TrimPrefix(expect, "^") 450 451 regs = append(regs, regexp.MustCompile(expect)) 452 } 453 return ScanRegexp(regs...) 454 } 455 456 // ScanRegexpPosix is a split function wrapper for a Scanner that returns each string until regexp case is not meet. 457 // The returned line may be empty. 458 // ScanRegexpPosix is like ScanRegexpPerl but restricts the regular expression 459 // to POSIX ERE (egrep) syntax and changes the match semantics to 460 // leftmost-longest. 461 func ScanRegexpPosix(expectStrs ...string) func(data []byte, atEOF bool) (advance int, token []byte, err error) { 462 var regs []*regexp.Regexp 463 for _, expect := range expectStrs { 464 expect = "^" + strings.TrimPrefix(expect, "^") 465 466 regs = append(regs, regexp.MustCompilePOSIX(expect)) 467 } 468 return ScanRegexp(regs...) 469 } 470 471 // https://golang.org/ref/spec#String_literals 472 // string_lit = raw_string_lit | interpreted_string_lit . 473 // raw_string_lit = "`" { unicode_char | newline } "`" . 474 // interpreted_string_lit = `"` { unicode_value | byte_value } `"` . 475 func scanStrings(data []byte, atEOF bool, quote rune) (advance int, token []byte, err error) { 476 if atEOF && len(data) == 0 { 477 return needMoreData() 478 } 479 var off int 480 481 // First character 1: ". 
482 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 483 off = off + advance 484 if err != nil || len(token) == 0 { 485 return advance, token, err 486 } 487 if !bytes.ContainsRune(token, quote) { 488 msg := fmt.Sprintf("illegal character %#U leading escape sequence, expect \\", token) 489 return 0, nil, errors.New(msg) 490 } 491 492 var allowEscape bool 493 if quote == '"' { 494 allowEscape = true 495 } 496 // '"' opening already consumed 497 for _, ch := range data[off:] { 498 off++ 499 if ch == '\n' || ch < 0 { 500 return 0, nil, errors.New("string literal not terminated") 501 } 502 503 if rune(ch) == quote { 504 break 505 } 506 507 if allowEscape && ch == '\\' { 508 // backward 509 off-- 510 advance, token, err = handleSplitError(ScanEscapes(quote)(data[off:], atEOF)) 511 off = off + advance 512 if err != nil || len(token) == 0 { 513 return advance, token, err 514 } 515 516 } 517 } 518 return off, data[:off], nil 519 } 520 521 func scanEscapes(data []byte, atEOF bool, quote rune) (advance int, token []byte, err error) { 522 if atEOF && len(data) == 0 { 523 return needMoreData() 524 } 525 var off int 526 527 // First character 1: \. 528 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 529 off = off + advance 530 if err != nil || len(token) == 0 { 531 return advance, token, err 532 } 533 534 if !bytes.ContainsRune(token, '\\') { 535 msg := fmt.Sprintf("illegal character %#U leading escape sequence, expect \\", token) 536 return 0, nil, errors.New(msg) 537 } 538 539 // Second character 2: char. 
540 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 541 off = off + advance 542 if err != nil || len(token) == 0 { 543 return advance, token, err 544 } 545 546 ch := bytes.Runes(token)[0] 547 548 var n int 549 var base, max uint32 550 switch ch { 551 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 552 return off, data[0:off], nil 553 case '0', '1', '2', '3', '4', '5', '6', '7': 554 n, base, max = 3, 8, 255 555 case 'x': 556 n, base, max = 2, 16, 255 557 case 'u': 558 n, base, max = 4, 16, unicode.MaxRune 559 case 'U': 560 n, base, max = 8, 16, unicode.MaxRune 561 default: 562 msg := "unknown escape sequence" 563 if ch < 0 { 564 msg = "escape sequence not terminated" 565 } 566 return 0, nil, errors.New(msg) 567 } 568 569 switch ch { 570 case 'x', 'u', 'U': 571 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 572 off = off + advance 573 if err != nil || len(token) == 0 { 574 return advance, token, err 575 } 576 577 ch = bytes.Runes(token)[0] 578 } 579 580 var x uint32 581 for n > 0 { 582 d := uint32(digitVal(ch)) 583 if d >= base { 584 msg := fmt.Sprintf("illegal character %#U in escape sequence", ch) 585 if ch < 0 { 586 msg = "escape sequence not terminated" 587 } 588 return 0, nil, errors.New(msg) 589 } 590 x = x*base + d 591 592 advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF)) 593 off = off + advance 594 if err != nil || len(token) == 0 { 595 return advance, token, err 596 } 597 ch = bytes.Runes(token)[0] 598 599 n-- 600 } 601 602 if x > max || 0xD800 <= x && x < 0xE000 { 603 return 0, nil, errors.New("escape sequence is invalid Unicode code point") 604 } 605 return off, data[:off], nil 606 } 607 608 func isLetter(ch rune) bool { 609 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) 610 } 611 612 func isDigit(ch rune) bool { 613 return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch) 614 } 615 616 func 
needMoreData() (advance int, token []byte, err error) { 617 return 0, nil, nil 618 } 619 620 func handleSplitError(advance int, token []byte, err error) (int, []byte, error) { 621 if err != nil { 622 if err == bufio.ErrFinalToken { 623 return 0, nil, nil 624 } 625 return 0, nil, err 626 } 627 628 if len(token) == 0 { 629 // needMoreData 630 return 0, nil, nil 631 } 632 633 return advance, token, nil 634 }