github.com/x04/go/src@v0.0.0-20200202162449-3d481ceb3525/bytes/bytes.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package bytes implements functions for the manipulation of byte slices. 6 // It is analogous to the facilities of the strings package. 7 package bytes 8 9 import ( 10 "github.com/x04/go/src/internal/bytealg" 11 "github.com/x04/go/src/unicode" 12 "github.com/x04/go/src/unicode/utf8" 13 ) 14 15 // Equal reports whether a and b 16 // are the same length and contain the same bytes. 17 // A nil argument is equivalent to an empty slice. 18 func Equal(a, b []byte) bool { 19 // Neither cmd/compile nor gccgo allocates for these string conversions. 20 return string(a) == string(b) 21 } 22 23 // Compare returns an integer comparing two byte slices lexicographically. 24 // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. 25 // A nil argument is equivalent to an empty slice. 26 func Compare(a, b []byte) int { 27 return bytealg.Compare(a, b) 28 } 29 30 // explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes), 31 // up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes. 32 func explode(s []byte, n int) [][]byte { 33 if n <= 0 { 34 n = len(s) 35 } 36 a := make([][]byte, n) 37 var size int 38 na := 0 39 for len(s) > 0 { 40 if na+1 >= n { 41 a[na] = s 42 na++ 43 break 44 } 45 _, size = utf8.DecodeRune(s) 46 a[na] = s[0:size:size] 47 s = s[size:] 48 na++ 49 } 50 return a[0:na] 51 } 52 53 // Count counts the number of non-overlapping instances of sep in s. 54 // If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s. 55 func Count(s, sep []byte) int { 56 // special case 57 if len(sep) == 0 { 58 return utf8.RuneCount(s) + 1 59 } 60 if len(sep) == 1 { 61 return bytealg.Count(s, sep[0]) 62 } 63 n := 0 64 for { 65 i := Index(s, sep) 66 if i == -1 { 67 return n 68 } 69 n++ 70 s = s[i+len(sep):] 71 } 72 } 73 74 // Contains reports whether subslice is within b. 75 func Contains(b, subslice []byte) bool { 76 return Index(b, subslice) != -1 77 } 78 79 // ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b. 80 func ContainsAny(b []byte, chars string) bool { 81 return IndexAny(b, chars) >= 0 82 } 83 84 // ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b. 85 func ContainsRune(b []byte, r rune) bool { 86 return IndexRune(b, r) >= 0 87 } 88 89 // IndexByte returns the index of the first instance of c in b, or -1 if c is not present in b. 90 func IndexByte(b []byte, c byte) int { 91 return bytealg.IndexByte(b, c) 92 } 93 94 func indexBytePortable(s []byte, c byte) int { 95 for i, b := range s { 96 if b == c { 97 return i 98 } 99 } 100 return -1 101 } 102 103 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s. 104 func LastIndex(s, sep []byte) int { 105 n := len(sep) 106 switch { 107 case n == 0: 108 return len(s) 109 case n == 1: 110 return LastIndexByte(s, sep[0]) 111 case n == len(s): 112 if Equal(s, sep) { 113 return 0 114 } 115 return -1 116 case n > len(s): 117 return -1 118 } 119 // Rabin-Karp search from the end of the string 120 hashss, pow := hashStrRev(sep) 121 last := len(s) - n 122 var h uint32 123 for i := len(s) - 1; i >= last; i-- { 124 h = h*primeRK + uint32(s[i]) 125 } 126 if h == hashss && Equal(s[last:], sep) { 127 return last 128 } 129 for i := last - 1; i >= 0; i-- { 130 h *= primeRK 131 h += uint32(s[i]) 132 h -= pow * uint32(s[i+n]) 133 if h == hashss && Equal(s[i:i+n], sep) { 134 return i 135 } 136 } 137 return -1 138 } 139 140 // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s. 141 func LastIndexByte(s []byte, c byte) int { 142 for i := len(s) - 1; i >= 0; i-- { 143 if s[i] == c { 144 return i 145 } 146 } 147 return -1 148 } 149 150 // IndexRune interprets s as a sequence of UTF-8-encoded code points. 151 // It returns the byte index of the first occurrence in s of the given rune. 152 // It returns -1 if rune is not present in s. 153 // If r is utf8.RuneError, it returns the first instance of any 154 // invalid UTF-8 byte sequence. 155 func IndexRune(s []byte, r rune) int { 156 switch { 157 case 0 <= r && r < utf8.RuneSelf: 158 return IndexByte(s, byte(r)) 159 case r == utf8.RuneError: 160 for i := 0; i < len(s); { 161 r1, n := utf8.DecodeRune(s[i:]) 162 if r1 == utf8.RuneError { 163 return i 164 } 165 i += n 166 } 167 return -1 168 case !utf8.ValidRune(r): 169 return -1 170 default: 171 var b [utf8.UTFMax]byte 172 n := utf8.EncodeRune(b[:], r) 173 return Index(s, b[:n]) 174 } 175 } 176 177 // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points. 178 // It returns the byte index of the first occurrence in s of any of the Unicode 179 // code points in chars. It returns -1 if chars is empty or if there is no code 180 // point in common. 181 func IndexAny(s []byte, chars string) int { 182 if chars == "" { 183 // Avoid scanning all of s. 184 return -1 185 } 186 if len(s) > 8 { 187 if as, isASCII := makeASCIISet(chars); isASCII { 188 for i, c := range s { 189 if as.contains(c) { 190 return i 191 } 192 } 193 return -1 194 } 195 } 196 var width int 197 for i := 0; i < len(s); i += width { 198 r := rune(s[i]) 199 if r < utf8.RuneSelf { 200 width = 1 201 } else { 202 r, width = utf8.DecodeRune(s[i:]) 203 } 204 for _, ch := range chars { 205 if r == ch { 206 return i 207 } 208 } 209 } 210 return -1 211 } 212 213 // LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code 214 // points. It returns the byte index of the last occurrence in s of any of 215 // the Unicode code points in chars. It returns -1 if chars is empty or if 216 // there is no code point in common. 217 func LastIndexAny(s []byte, chars string) int { 218 if chars == "" { 219 // Avoid scanning all of s. 220 return -1 221 } 222 if len(s) > 8 { 223 if as, isASCII := makeASCIISet(chars); isASCII { 224 for i := len(s) - 1; i >= 0; i-- { 225 if as.contains(s[i]) { 226 return i 227 } 228 } 229 return -1 230 } 231 } 232 for i := len(s); i > 0; { 233 r, size := utf8.DecodeLastRune(s[:i]) 234 i -= size 235 for _, c := range chars { 236 if r == c { 237 return i 238 } 239 } 240 } 241 return -1 242 } 243 244 // Generic split: splits after each instance of sep, 245 // including sepSave bytes of sep in the subslices. 246 func genSplit(s, sep []byte, sepSave, n int) [][]byte { 247 if n == 0 { 248 return nil 249 } 250 if len(sep) == 0 { 251 return explode(s, n) 252 } 253 if n < 0 { 254 n = Count(s, sep) + 1 255 } 256 257 a := make([][]byte, n) 258 n-- 259 i := 0 260 for i < n { 261 m := Index(s, sep) 262 if m < 0 { 263 break 264 } 265 a[i] = s[: m+sepSave : m+sepSave] 266 s = s[m+len(sep):] 267 i++ 268 } 269 a[i] = s 270 return a[:i+1] 271 } 272 273 // SplitN slices s into subslices separated by sep and returns a slice of 274 // the subslices between those separators. 275 // If sep is empty, SplitN splits after each UTF-8 sequence. 276 // The count determines the number of subslices to return: 277 // n > 0: at most n subslices; the last subslice will be the unsplit remainder. 278 // n == 0: the result is nil (zero subslices) 279 // n < 0: all subslices 280 func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) } 281 282 // SplitAfterN slices s into subslices after each instance of sep and 283 // returns a slice of those subslices. 284 // If sep is empty, SplitAfterN splits after each UTF-8 sequence. 285 // The count determines the number of subslices to return: 286 // n > 0: at most n subslices; the last subslice will be the unsplit remainder. 287 // n == 0: the result is nil (zero subslices) 288 // n < 0: all subslices 289 func SplitAfterN(s, sep []byte, n int) [][]byte { 290 return genSplit(s, sep, len(sep), n) 291 } 292 293 // Split slices s into all subslices separated by sep and returns a slice of 294 // the subslices between those separators. 295 // If sep is empty, Split splits after each UTF-8 sequence. 296 // It is equivalent to SplitN with a count of -1. 297 func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) } 298 299 // SplitAfter slices s into all subslices after each instance of sep and 300 // returns a slice of those subslices. 301 // If sep is empty, SplitAfter splits after each UTF-8 sequence. 302 // It is equivalent to SplitAfterN with a count of -1. 303 func SplitAfter(s, sep []byte) [][]byte { 304 return genSplit(s, sep, len(sep), -1) 305 } 306 307 var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} 308 309 // Fields interprets s as a sequence of UTF-8-encoded code points. 310 // It splits the slice s around each instance of one or more consecutive white space 311 // characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an 312 // empty slice if s contains only white space. 313 func Fields(s []byte) [][]byte { 314 // First count the fields. 315 // This is an exact count if s is ASCII, otherwise it is an approximation. 316 n := 0 317 wasSpace := 1 318 // setBits is used to track which bits are set in the bytes of s. 319 setBits := uint8(0) 320 for i := 0; i < len(s); i++ { 321 r := s[i] 322 setBits |= r 323 isSpace := int(asciiSpace[r]) 324 n += wasSpace & ^isSpace 325 wasSpace = isSpace 326 } 327 328 if setBits >= utf8.RuneSelf { 329 // Some runes in the input slice are not ASCII. 330 return FieldsFunc(s, unicode.IsSpace) 331 } 332 333 // ASCII fast path 334 a := make([][]byte, n) 335 na := 0 336 fieldStart := 0 337 i := 0 338 // Skip spaces in the front of the input. 339 for i < len(s) && asciiSpace[s[i]] != 0 { 340 i++ 341 } 342 fieldStart = i 343 for i < len(s) { 344 if asciiSpace[s[i]] == 0 { 345 i++ 346 continue 347 } 348 a[na] = s[fieldStart:i:i] 349 na++ 350 i++ 351 // Skip spaces in between fields. 352 for i < len(s) && asciiSpace[s[i]] != 0 { 353 i++ 354 } 355 fieldStart = i 356 } 357 if fieldStart < len(s) { // Last field might end at EOF. 358 a[na] = s[fieldStart:len(s):len(s)] 359 } 360 return a 361 } 362 363 // FieldsFunc interprets s as a sequence of UTF-8-encoded code points. 364 // It splits the slice s at each run of code points c satisfying f(c) and 365 // returns a slice of subslices of s. If all code points in s satisfy f(c), or 366 // len(s) == 0, an empty slice is returned. 367 // FieldsFunc makes no guarantees about the order in which it calls f(c). 368 // If f does not return consistent results for a given c, FieldsFunc may crash. 369 func FieldsFunc(s []byte, f func(rune) bool) [][]byte { 370 // A span is used to record a slice of s of the form s[start:end]. 371 // The start index is inclusive and the end index is exclusive. 372 type span struct { 373 start int 374 end int 375 } 376 spans := make([]span, 0, 32) 377 378 // Find the field start and end indices. 379 wasField := false 380 fromIndex := 0 381 for i := 0; i < len(s); { 382 size := 1 383 r := rune(s[i]) 384 if r >= utf8.RuneSelf { 385 r, size = utf8.DecodeRune(s[i:]) 386 } 387 if f(r) { 388 if wasField { 389 spans = append(spans, span{start: fromIndex, end: i}) 390 wasField = false 391 } 392 } else { 393 if !wasField { 394 fromIndex = i 395 wasField = true 396 } 397 } 398 i += size 399 } 400 401 // Last field might end at EOF. 402 if wasField { 403 spans = append(spans, span{fromIndex, len(s)}) 404 } 405 406 // Create subslices from recorded field indices. 407 a := make([][]byte, len(spans)) 408 for i, span := range spans { 409 a[i] = s[span.start:span.end:span.end] 410 } 411 412 return a 413 } 414 415 // Join concatenates the elements of s to create a new byte slice. The separator 416 // sep is placed between elements in the resulting slice. 417 func Join(s [][]byte, sep []byte) []byte { 418 if len(s) == 0 { 419 return []byte{} 420 } 421 if len(s) == 1 { 422 // Just return a copy. 423 return append([]byte(nil), s[0]...) 424 } 425 n := len(sep) * (len(s) - 1) 426 for _, v := range s { 427 n += len(v) 428 } 429 430 b := make([]byte, n) 431 bp := copy(b, s[0]) 432 for _, v := range s[1:] { 433 bp += copy(b[bp:], sep) 434 bp += copy(b[bp:], v) 435 } 436 return b 437 } 438 439 // HasPrefix tests whether the byte slice s begins with prefix. 440 func HasPrefix(s, prefix []byte) bool { 441 return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix) 442 } 443 444 // HasSuffix tests whether the byte slice s ends with suffix. 445 func HasSuffix(s, suffix []byte) bool { 446 return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix) 447 } 448 449 // Map returns a copy of the byte slice s with all its characters modified 450 // according to the mapping function. If mapping returns a negative value, the character is 451 // dropped from the byte slice with no replacement. The characters in s and the 452 // output are interpreted as UTF-8-encoded code points. 453 func Map(mapping func(r rune) rune, s []byte) []byte { 454 // In the worst case, the slice can grow when mapped, making 455 // things unpleasant. But it's so rare we barge in assuming it's 456 // fine. It could also shrink but that falls out naturally. 457 maxbytes := len(s) // length of b 458 nbytes := 0 // number of bytes encoded in b 459 b := make([]byte, maxbytes) 460 for i := 0; i < len(s); { 461 wid := 1 462 r := rune(s[i]) 463 if r >= utf8.RuneSelf { 464 r, wid = utf8.DecodeRune(s[i:]) 465 } 466 r = mapping(r) 467 if r >= 0 { 468 rl := utf8.RuneLen(r) 469 if rl < 0 { 470 rl = len(string(utf8.RuneError)) 471 } 472 if nbytes+rl > maxbytes { 473 // Grow the buffer. 474 maxbytes = maxbytes*2 + utf8.UTFMax 475 nb := make([]byte, maxbytes) 476 copy(nb, b[0:nbytes]) 477 b = nb 478 } 479 nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r) 480 } 481 i += wid 482 } 483 return b[0:nbytes] 484 } 485 486 // Repeat returns a new byte slice consisting of count copies of b. 487 // 488 // It panics if count is negative or if 489 // the result of (len(b) * count) overflows. 490 func Repeat(b []byte, count int) []byte { 491 if count == 0 { 492 return []byte{} 493 } 494 // Since we cannot return an error on overflow, 495 // we should panic if the repeat will generate 496 // an overflow. 497 // See Issue golang.org/issue/16237. 498 if count < 0 { 499 panic("bytes: negative Repeat count") 500 } else if len(b)*count/count != len(b) { 501 panic("bytes: Repeat count causes overflow") 502 } 503 504 nb := make([]byte, len(b)*count) 505 bp := copy(nb, b) 506 for bp < len(nb) { 507 copy(nb[bp:], nb[:bp]) 508 bp *= 2 509 } 510 return nb 511 } 512 513 // ToUpper returns a copy of the byte slice s with all Unicode letters mapped to 514 // their upper case. 515 func ToUpper(s []byte) []byte { 516 isASCII, hasLower := true, false 517 for i := 0; i < len(s); i++ { 518 c := s[i] 519 if c >= utf8.RuneSelf { 520 isASCII = false 521 break 522 } 523 hasLower = hasLower || ('a' <= c && c <= 'z') 524 } 525 526 if isASCII { // optimize for ASCII-only byte slices. 527 if !hasLower { 528 // Just return a copy. 529 return append([]byte(""), s...) 530 } 531 b := make([]byte, len(s)) 532 for i := 0; i < len(s); i++ { 533 c := s[i] 534 if 'a' <= c && c <= 'z' { 535 c -= 'a' - 'A' 536 } 537 b[i] = c 538 } 539 return b 540 } 541 return Map(unicode.ToUpper, s) 542 } 543 544 // ToLower returns a copy of the byte slice s with all Unicode letters mapped to 545 // their lower case. 546 func ToLower(s []byte) []byte { 547 isASCII, hasUpper := true, false 548 for i := 0; i < len(s); i++ { 549 c := s[i] 550 if c >= utf8.RuneSelf { 551 isASCII = false 552 break 553 } 554 hasUpper = hasUpper || ('A' <= c && c <= 'Z') 555 } 556 557 if isASCII { // optimize for ASCII-only byte slices. 558 if !hasUpper { 559 return append([]byte(""), s...) 560 } 561 b := make([]byte, len(s)) 562 for i := 0; i < len(s); i++ { 563 c := s[i] 564 if 'A' <= c && c <= 'Z' { 565 c += 'a' - 'A' 566 } 567 b[i] = c 568 } 569 return b 570 } 571 return Map(unicode.ToLower, s) 572 } 573 574 // ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case. 575 func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) } 576 577 // ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their 578 // upper case, giving priority to the special casing rules. 579 func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte { 580 return Map(c.ToUpper, s) 581 } 582 583 // ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their 584 // lower case, giving priority to the special casing rules. 585 func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte { 586 return Map(c.ToLower, s) 587 } 588 589 // ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their 590 // title case, giving priority to the special casing rules. 591 func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte { 592 return Map(c.ToTitle, s) 593 } 594 595 // ToValidUTF8 treats s as UTF-8-encoded bytes and returns a copy with each run of bytes 596 // representing invalid UTF-8 replaced with the bytes in replacement, which may be empty. 597 func ToValidUTF8(s, replacement []byte) []byte { 598 b := make([]byte, 0, len(s)+len(replacement)) 599 invalid := false // previous byte was from an invalid UTF-8 sequence 600 for i := 0; i < len(s); { 601 c := s[i] 602 if c < utf8.RuneSelf { 603 i++ 604 invalid = false 605 b = append(b, byte(c)) 606 continue 607 } 608 _, wid := utf8.DecodeRune(s[i:]) 609 if wid == 1 { 610 i++ 611 if !invalid { 612 invalid = true 613 b = append(b, replacement...) 614 } 615 continue 616 } 617 invalid = false 618 b = append(b, s[i:i+wid]...) 619 i += wid 620 } 621 return b 622 } 623 624 // isSeparator reports whether the rune could mark a word boundary. 625 // TODO: update when package unicode captures more of the properties. 626 func isSeparator(r rune) bool { 627 // ASCII alphanumerics and underscore are not separators 628 if r <= 0x7F { 629 switch { 630 case '0' <= r && r <= '9': 631 return false 632 case 'a' <= r && r <= 'z': 633 return false 634 case 'A' <= r && r <= 'Z': 635 return false 636 case r == '_': 637 return false 638 } 639 return true 640 } 641 // Letters and digits are not separators 642 if unicode.IsLetter(r) || unicode.IsDigit(r) { 643 return false 644 } 645 // Otherwise, all we can do for now is treat spaces as separators. 646 return unicode.IsSpace(r) 647 } 648 649 // Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin 650 // words mapped to their title case. 651 // 652 // BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly. 653 func Title(s []byte) []byte { 654 // Use a closure here to remember state. 655 // Hackish but effective. Depends on Map scanning in order and calling 656 // the closure once per rune. 657 prev := ' ' 658 return Map( 659 func(r rune) rune { 660 if isSeparator(prev) { 661 prev = r 662 return unicode.ToTitle(r) 663 } 664 prev = r 665 return r 666 }, 667 s) 668 } 669 670 // TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off 671 // all leading UTF-8-encoded code points c that satisfy f(c). 672 func TrimLeftFunc(s []byte, f func(r rune) bool) []byte { 673 i := indexFunc(s, f, false) 674 if i == -1 { 675 return nil 676 } 677 return s[i:] 678 } 679 680 // TrimRightFunc returns a subslice of s by slicing off all trailing 681 // UTF-8-encoded code points c that satisfy f(c). 682 func TrimRightFunc(s []byte, f func(r rune) bool) []byte { 683 i := lastIndexFunc(s, f, false) 684 if i >= 0 && s[i] >= utf8.RuneSelf { 685 _, wid := utf8.DecodeRune(s[i:]) 686 i += wid 687 } else { 688 i++ 689 } 690 return s[0:i] 691 } 692 693 // TrimFunc returns a subslice of s by slicing off all leading and trailing 694 // UTF-8-encoded code points c that satisfy f(c). 695 func TrimFunc(s []byte, f func(r rune) bool) []byte { 696 return TrimRightFunc(TrimLeftFunc(s, f), f) 697 } 698 699 // TrimPrefix returns s without the provided leading prefix string. 700 // If s doesn't start with prefix, s is returned unchanged. 701 func TrimPrefix(s, prefix []byte) []byte { 702 if HasPrefix(s, prefix) { 703 return s[len(prefix):] 704 } 705 return s 706 } 707 708 // TrimSuffix returns s without the provided trailing suffix string. 709 // If s doesn't end with suffix, s is returned unchanged. 710 func TrimSuffix(s, suffix []byte) []byte { 711 if HasSuffix(s, suffix) { 712 return s[:len(s)-len(suffix)] 713 } 714 return s 715 } 716 717 // IndexFunc interprets s as a sequence of UTF-8-encoded code points. 718 // It returns the byte index in s of the first Unicode 719 // code point satisfying f(c), or -1 if none do. 720 func IndexFunc(s []byte, f func(r rune) bool) int { 721 return indexFunc(s, f, true) 722 } 723 724 // LastIndexFunc interprets s as a sequence of UTF-8-encoded code points. 725 // It returns the byte index in s of the last Unicode 726 // code point satisfying f(c), or -1 if none do. 727 func LastIndexFunc(s []byte, f func(r rune) bool) int { 728 return lastIndexFunc(s, f, true) 729 } 730 731 // indexFunc is the same as IndexFunc except that if 732 // truth==false, the sense of the predicate function is 733 // inverted. 734 func indexFunc(s []byte, f func(r rune) bool, truth bool) int { 735 start := 0 736 for start < len(s) { 737 wid := 1 738 r := rune(s[start]) 739 if r >= utf8.RuneSelf { 740 r, wid = utf8.DecodeRune(s[start:]) 741 } 742 if f(r) == truth { 743 return start 744 } 745 start += wid 746 } 747 return -1 748 } 749 750 // lastIndexFunc is the same as LastIndexFunc except that if 751 // truth==false, the sense of the predicate function is 752 // inverted. 753 func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int { 754 for i := len(s); i > 0; { 755 r, size := rune(s[i-1]), 1 756 if r >= utf8.RuneSelf { 757 r, size = utf8.DecodeLastRune(s[0:i]) 758 } 759 i -= size 760 if f(r) == truth { 761 return i 762 } 763 } 764 return -1 765 } 766 767 // asciiSet is a 32-byte value, where each bit represents the presence of a 768 // given ASCII character in the set. The 128-bits of the lower 16 bytes, 769 // starting with the least-significant bit of the lowest word to the 770 // most-significant bit of the highest word, map to the full range of all 771 // 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed, 772 // ensuring that any non-ASCII character will be reported as not in the set. 773 type asciiSet [8]uint32 774 775 // makeASCIISet creates a set of ASCII characters and reports whether all 776 // characters in chars are ASCII. 777 func makeASCIISet(chars string) (as asciiSet, ok bool) { 778 for i := 0; i < len(chars); i++ { 779 c := chars[i] 780 if c >= utf8.RuneSelf { 781 return as, false 782 } 783 as[c>>5] |= 1 << uint(c&31) 784 } 785 return as, true 786 } 787 788 // contains reports whether c is inside the set. 789 func (as *asciiSet) contains(c byte) bool { 790 return (as[c>>5] & (1 << uint(c&31))) != 0 791 } 792 793 func makeCutsetFunc(cutset string) func(r rune) bool { 794 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { 795 return func(r rune) bool { 796 return r == rune(cutset[0]) 797 } 798 } 799 if as, isASCII := makeASCIISet(cutset); isASCII { 800 return func(r rune) bool { 801 return r < utf8.RuneSelf && as.contains(byte(r)) 802 } 803 } 804 return func(r rune) bool { 805 for _, c := range cutset { 806 if c == r { 807 return true 808 } 809 } 810 return false 811 } 812 } 813 814 // Trim returns a subslice of s by slicing off all leading and 815 // trailing UTF-8-encoded code points contained in cutset. 816 func Trim(s []byte, cutset string) []byte { 817 return TrimFunc(s, makeCutsetFunc(cutset)) 818 } 819 820 // TrimLeft returns a subslice of s by slicing off all leading 821 // UTF-8-encoded code points contained in cutset. 822 func TrimLeft(s []byte, cutset string) []byte { 823 return TrimLeftFunc(s, makeCutsetFunc(cutset)) 824 } 825 826 // TrimRight returns a subslice of s by slicing off all trailing 827 // UTF-8-encoded code points that are contained in cutset. 828 func TrimRight(s []byte, cutset string) []byte { 829 return TrimRightFunc(s, makeCutsetFunc(cutset)) 830 } 831 832 // TrimSpace returns a subslice of s by slicing off all leading and 833 // trailing white space, as defined by Unicode. 834 func TrimSpace(s []byte) []byte { 835 // Fast path for ASCII: look for the first ASCII non-space byte 836 start := 0 837 for ; start < len(s); start++ { 838 c := s[start] 839 if c >= utf8.RuneSelf { 840 // If we run into a non-ASCII byte, fall back to the 841 // slower unicode-aware method on the remaining bytes 842 return TrimFunc(s[start:], unicode.IsSpace) 843 } 844 if asciiSpace[c] == 0 { 845 break 846 } 847 } 848 849 // Now look for the first ASCII non-space byte from the end 850 stop := len(s) 851 for ; stop > start; stop-- { 852 c := s[stop-1] 853 if c >= utf8.RuneSelf { 854 return TrimFunc(s[start:stop], unicode.IsSpace) 855 } 856 if asciiSpace[c] == 0 { 857 break 858 } 859 } 860 861 // At this point s[start:stop] starts and ends with an ASCII 862 // non-space bytes, so we're done. Non-ASCII cases have already 863 // been handled above. 864 if start == stop { 865 // Special case to preserve previous TrimLeftFunc behavior, 866 // returning nil instead of empty slice if all spaces. 867 return nil 868 } 869 return s[start:stop] 870 } 871 872 // Runes interprets s as a sequence of UTF-8-encoded code points. 873 // It returns a slice of runes (Unicode code points) equivalent to s. 874 func Runes(s []byte) []rune { 875 t := make([]rune, utf8.RuneCount(s)) 876 i := 0 877 for len(s) > 0 { 878 r, l := utf8.DecodeRune(s) 879 t[i] = r 880 i++ 881 s = s[l:] 882 } 883 return t 884 } 885 886 // Replace returns a copy of the slice s with the first n 887 // non-overlapping instances of old replaced by new. 888 // If old is empty, it matches at the beginning of the slice 889 // and after each UTF-8 sequence, yielding up to k+1 replacements 890 // for a k-rune slice. 891 // If n < 0, there is no limit on the number of replacements. 892 func Replace(s, old, new []byte, n int) []byte { 893 m := 0 894 if n != 0 { 895 // Compute number of replacements. 896 m = Count(s, old) 897 } 898 if m == 0 { 899 // Just return a copy. 900 return append([]byte(nil), s...) 901 } 902 if n < 0 || m < n { 903 n = m 904 } 905 906 // Apply replacements to buffer. 907 t := make([]byte, len(s)+n*(len(new)-len(old))) 908 w := 0 909 start := 0 910 for i := 0; i < n; i++ { 911 j := start 912 if len(old) == 0 { 913 if i > 0 { 914 _, wid := utf8.DecodeRune(s[start:]) 915 j += wid 916 } 917 } else { 918 j += Index(s[start:], old) 919 } 920 w += copy(t[w:], s[start:j]) 921 w += copy(t[w:], new) 922 start = j + len(old) 923 } 924 w += copy(t[w:], s[start:]) 925 return t[0:w] 926 } 927 928 // ReplaceAll returns a copy of the slice s with all 929 // non-overlapping instances of old replaced by new. 930 // If old is empty, it matches at the beginning of the slice 931 // and after each UTF-8 sequence, yielding up to k+1 replacements 932 // for a k-rune slice. 933 func ReplaceAll(s, old, new []byte) []byte { 934 return Replace(s, old, new, -1) 935 } 936 937 // EqualFold reports whether s and t, interpreted as UTF-8 strings, 938 // are equal under Unicode case-folding, which is a more general 939 // form of case-insensitivity. 940 func EqualFold(s, t []byte) bool { 941 for len(s) != 0 && len(t) != 0 { 942 // Extract first rune from each. 943 var sr, tr rune 944 if s[0] < utf8.RuneSelf { 945 sr, s = rune(s[0]), s[1:] 946 } else { 947 r, size := utf8.DecodeRune(s) 948 sr, s = r, s[size:] 949 } 950 if t[0] < utf8.RuneSelf { 951 tr, t = rune(t[0]), t[1:] 952 } else { 953 r, size := utf8.DecodeRune(t) 954 tr, t = r, t[size:] 955 } 956 957 // If they match, keep going; if not, return false. 958 959 // Easy case. 960 if tr == sr { 961 continue 962 } 963 964 // Make sr < tr to simplify what follows. 965 if tr < sr { 966 tr, sr = sr, tr 967 } 968 // Fast check for ASCII. 969 if tr < utf8.RuneSelf { 970 // ASCII only, sr/tr must be upper/lower case 971 if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' { 972 continue 973 } 974 return false 975 } 976 977 // General case. SimpleFold(x) returns the next equivalent rune > x 978 // or wraps around to smaller values. 979 r := unicode.SimpleFold(sr) 980 for r != sr && r < tr { 981 r = unicode.SimpleFold(r) 982 } 983 if r == tr { 984 continue 985 } 986 return false 987 } 988 989 // One string is empty. Are both? 990 return len(s) == len(t) 991 } 992 993 // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. 994 func Index(s, sep []byte) int { 995 n := len(sep) 996 switch { 997 case n == 0: 998 return 0 999 case n == 1: 1000 return IndexByte(s, sep[0]) 1001 case n == len(s): 1002 if Equal(sep, s) { 1003 return 0 1004 } 1005 return -1 1006 case n > len(s): 1007 return -1 1008 case n <= bytealg.MaxLen: 1009 // Use brute force when s and sep both are small 1010 if len(s) <= bytealg.MaxBruteForce { 1011 return bytealg.Index(s, sep) 1012 } 1013 c0 := sep[0] 1014 c1 := sep[1] 1015 i := 0 1016 t := len(s) - n + 1 1017 fails := 0 1018 for i < t { 1019 if s[i] != c0 { 1020 // IndexByte is faster than bytealg.Index, so use it as long as 1021 // we're not getting lots of false positives. 1022 o := IndexByte(s[i:t], c0) 1023 if o < 0 { 1024 return -1 1025 } 1026 i += o 1027 } 1028 if s[i+1] == c1 && Equal(s[i:i+n], sep) { 1029 return i 1030 } 1031 fails++ 1032 i++ 1033 // Switch to bytealg.Index when IndexByte produces too many false positives. 1034 if fails > bytealg.Cutover(i) { 1035 r := bytealg.Index(s[i:], sep) 1036 if r >= 0 { 1037 return r + i 1038 } 1039 return -1 1040 } 1041 } 1042 return -1 1043 } 1044 c0 := sep[0] 1045 c1 := sep[1] 1046 i := 0 1047 fails := 0 1048 t := len(s) - n + 1 1049 for i < t { 1050 if s[i] != c0 { 1051 o := IndexByte(s[i:t], c0) 1052 if o < 0 { 1053 break 1054 } 1055 i += o 1056 } 1057 if s[i+1] == c1 && Equal(s[i:i+n], sep) { 1058 return i 1059 } 1060 i++ 1061 fails++ 1062 if fails >= 4+i>>4 && i < t { 1063 // Give up on IndexByte, it isn't skipping ahead 1064 // far enough to be better than Rabin-Karp. 1065 // Experiments (using IndexPeriodic) suggest 1066 // the cutover is about 16 byte skips. 1067 // TODO: if large prefixes of sep are matching 1068 // we should cutover at even larger average skips, 1069 // because Equal becomes that much more expensive. 1070 // This code does not take that effect into account. 1071 j := indexRabinKarp(s[i:], sep) 1072 if j < 0 { 1073 return -1 1074 } 1075 return i + j 1076 } 1077 } 1078 return -1 1079 } 1080 1081 func indexRabinKarp(s, sep []byte) int { 1082 // Rabin-Karp search 1083 hashsep, pow := hashStr(sep) 1084 n := len(sep) 1085 var h uint32 1086 for i := 0; i < n; i++ { 1087 h = h*primeRK + uint32(s[i]) 1088 } 1089 if h == hashsep && Equal(s[:n], sep) { 1090 return 0 1091 } 1092 for i := n; i < len(s); { 1093 h *= primeRK 1094 h += uint32(s[i]) 1095 h -= pow * uint32(s[i-n]) 1096 i++ 1097 if h == hashsep && Equal(s[i-n:i], sep) { 1098 return i - n 1099 } 1100 } 1101 return -1 1102 } 1103 1104 // primeRK is the prime base used in Rabin-Karp algorithm. 1105 const primeRK = 16777619 1106 1107 // hashStr returns the hash and the appropriate multiplicative 1108 // factor for use in Rabin-Karp algorithm. 1109 func hashStr(sep []byte) (uint32, uint32) { 1110 hash := uint32(0) 1111 for i := 0; i < len(sep); i++ { 1112 hash = hash*primeRK + uint32(sep[i]) 1113 } 1114 var pow, sq uint32 = 1, primeRK 1115 for i := len(sep); i > 0; i >>= 1 { 1116 if i&1 != 0 { 1117 pow *= sq 1118 } 1119 sq *= sq 1120 } 1121 return hash, pow 1122 } 1123 1124 // hashStrRev returns the hash of the reverse of sep and the 1125 // appropriate multiplicative factor for use in Rabin-Karp algorithm. 1126 func hashStrRev(sep []byte) (uint32, uint32) { 1127 hash := uint32(0) 1128 for i := len(sep) - 1; i >= 0; i-- { 1129 hash = hash*primeRK + uint32(sep[i]) 1130 } 1131 var pow, sq uint32 = 1, primeRK 1132 for i := len(sep); i > 0; i >>= 1 { 1133 if i&1 != 0 { 1134 pow *= sq 1135 } 1136 sq *= sq 1137 } 1138 return hash, pow 1139 }