// Source: github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/strings/strings.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package strings implements simple functions to manipulate UTF-8 encoded strings.
//
// For information about UTF-8 strings in Go, see https://blog.golang.org/strings.
package strings

import (
	"internal/bytealg"
	"unicode"
	"unicode/utf8"
)

// explode splits s into a slice of UTF-8 strings,
// one string per Unicode character up to a maximum of n (n < 0 means no limit).
// Invalid UTF-8 bytes are sliced individually.
func explode(s string, n int) []string {
	l := utf8.RuneCountInString(s)
	// Clamp n to the number of runes; a negative n means "all of them".
	if n < 0 || n > l {
		n = l
	}
	a := make([]string, n)
	// Peel off one rune at a time for the first n-1 elements.
	for i := 0; i < n-1; i++ {
		_, size := utf8.DecodeRuneInString(s)
		a[i] = s[:size]
		s = s[size:]
	}
	// The final element receives whatever is left of s, unsplit.
	if n > 0 {
		a[n-1] = s
	}
	return a
}

// Count counts the number of non-overlapping instances of substr in s.
// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
func Count(s, substr string) int {
	// special case
	if len(substr) == 0 {
		return utf8.RuneCountInString(s) + 1
	}
	// A single-byte substr is counted directly by bytealg.
	if len(substr) == 1 {
		return bytealg.CountString(s, substr[0])
	}
	n := 0
	for {
		i := Index(s, substr)
		if i == -1 {
			return n
		}
		n++
		// Resume after the whole match so that matches never overlap.
		s = s[i+len(substr):]
	}
}

// Contains reports whether substr is within s.
func Contains(s, substr string) bool {
	return Index(s, substr) >= 0
}

// ContainsAny reports whether any Unicode code points in chars are within s.
func ContainsAny(s, chars string) bool {
	return IndexAny(s, chars) >= 0
}

// ContainsRune reports whether the Unicode code point r is within s.
func ContainsRune(s string, r rune) bool {
	return IndexRune(s, r) >= 0
}

// LastIndex returns the index of the last instance of substr in s, or -1 if substr is not present in s.
func LastIndex(s, substr string) int {
	n := len(substr)
	// Cheap cases first: empty, single-byte, same-length and too-long needles.
	switch {
	case n == 0:
		return len(s)
	case n == 1:
		return LastIndexByte(s, substr[0])
	case n == len(s):
		if substr == s {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	}
	// Rabin-Karp search from the end of the string
	hashss, pow := bytealg.HashStrRev(substr)
	last := len(s) - n
	// Hash the final n bytes of s, walking backwards.
	var h uint32
	for i := len(s) - 1; i >= last; i-- {
		h = h*bytealg.PrimeRK + uint32(s[i])
	}
	// A hash match is only a candidate; the string comparison confirms it.
	if h == hashss && s[last:] == substr {
		return last
	}
	// Slide the window one byte to the left per iteration: bring in s[i]
	// and remove the contribution of s[i+n], which has left the window.
	for i := last - 1; i >= 0; i-- {
		h *= bytealg.PrimeRK
		h += uint32(s[i])
		h -= pow * uint32(s[i+n])
		if h == hashss && s[i:i+n] == substr {
			return i
		}
	}
	return -1
}

// IndexByte returns the index of the first instance of c in s, or -1 if c is not present in s.
func IndexByte(s string, c byte) int {
	return bytealg.IndexByteString(s, c)
}

// IndexRune returns the index of the first instance of the Unicode code point
// r, or -1 if r is not present in s.
// If r is utf8.RuneError, it returns the first instance of any
// invalid UTF-8 byte sequence.
func IndexRune(s string, r rune) int {
	switch {
	case 0 <= r && r < utf8.RuneSelf:
		// ASCII: a plain byte search suffices.
		return IndexByte(s, byte(r))
	case r == utf8.RuneError:
		// Ranging over s yields utf8.RuneError for invalid sequences
		// (and for a literal U+FFFD), so report the first such position.
		for i, r := range s {
			if r == utf8.RuneError {
				return i
			}
		}
		return -1
	case !utf8.ValidRune(r):
		return -1
	default:
		return Index(s, string(r))
	}
}

// IndexAny returns the index of the first instance of any Unicode code point
// from chars in s, or -1 if no Unicode code point from chars is present in s.
func IndexAny(s, chars string) int {
	if chars == "" {
		// Avoid scanning all of s.
		return -1
	}
	if len(chars) == 1 {
		// Avoid scanning all of s.
		// A lone byte >= utf8.RuneSelf is treated as utf8.RuneError.
		r := rune(chars[0])
		if r >= utf8.RuneSelf {
			r = utf8.RuneError
		}
		return IndexRune(s, r)
	}
	// For longer inputs with an all-ASCII chars, test bytes against a bitset.
	if len(s) > 8 {
		if as, isASCII := makeASCIISet(chars); isASCII {
			for i := 0; i < len(s); i++ {
				if as.contains(s[i]) {
					return i
				}
			}
			return -1
		}
	}
	// General case: look up each rune of s in chars.
	for i, c := range s {
		if IndexRune(chars, c) >= 0 {
			return i
		}
	}
	return -1
}

// LastIndexAny returns the index of the last instance of any Unicode code
// point from chars in s, or -1 if no Unicode code point from chars is
// present in s.
func LastIndexAny(s, chars string) int {
	if chars == "" {
		// Avoid scanning all of s.
		return -1
	}
	if len(s) == 1 {
		// Single-byte s: a byte >= utf8.RuneSelf is treated as utf8.RuneError.
		rc := rune(s[0])
		if rc >= utf8.RuneSelf {
			rc = utf8.RuneError
		}
		if IndexRune(chars, rc) >= 0 {
			return 0
		}
		return -1
	}
	// For longer inputs with an all-ASCII chars, test bytes against a bitset,
	// scanning from the end.
	if len(s) > 8 {
		if as, isASCII := makeASCIISet(chars); isASCII {
			for i := len(s) - 1; i >= 0; i-- {
				if as.contains(s[i]) {
					return i
				}
			}
			return -1
		}
	}
	if len(chars) == 1 {
		// Single-byte chars: compare each rune of s, decoded from the
		// end, against that one rune.
		rc := rune(chars[0])
		if rc >= utf8.RuneSelf {
			rc = utf8.RuneError
		}
		for i := len(s); i > 0; {
			r, size := utf8.DecodeLastRuneInString(s[:i])
			i -= size
			if rc == r {
				return i
			}
		}
		return -1
	}
	// General case: decode s backwards and look up each rune in chars.
	for i := len(s); i > 0; {
		r, size := utf8.DecodeLastRuneInString(s[:i])
		i -= size
		if IndexRune(chars, r) >= 0 {
			return i
		}
	}
	return -1
}

// LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
func LastIndexByte(s string, c byte) int {
	for i := len(s) - 1; i >= 0; i-- {
		if s[i] == c {
			return i
		}
	}
	return -1
}

// Generic split: splits after each instance of sep,
// including sepSave bytes of sep in the subarrays.
233 func genSplit(s, sep string, sepSave, n int) []string { 234 if n == 0 { 235 return nil 236 } 237 if sep == "" { 238 return explode(s, n) 239 } 240 if n < 0 { 241 n = Count(s, sep) + 1 242 } 243 244 if n > len(s)+1 { 245 n = len(s) + 1 246 } 247 a := make([]string, n) 248 n-- 249 i := 0 250 for i < n { 251 m := Index(s, sep) 252 if m < 0 { 253 break 254 } 255 a[i] = s[:m+sepSave] 256 s = s[m+len(sep):] 257 i++ 258 } 259 a[i] = s 260 return a[:i+1] 261 } 262 263 // SplitN slices s into substrings separated by sep and returns a slice of 264 // the substrings between those separators. 265 // 266 // The count determines the number of substrings to return: 267 // 268 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 269 // n == 0: the result is nil (zero substrings) 270 // n < 0: all substrings 271 // 272 // Edge cases for s and sep (for example, empty strings) are handled 273 // as described in the documentation for Split. 274 // 275 // To split around the first instance of a separator, see Cut. 276 func SplitN(s, sep string, n int) []string { return genSplit(s, sep, 0, n) } 277 278 // SplitAfterN slices s into substrings after each instance of sep and 279 // returns a slice of those substrings. 280 // 281 // The count determines the number of substrings to return: 282 // 283 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 284 // n == 0: the result is nil (zero substrings) 285 // n < 0: all substrings 286 // 287 // Edge cases for s and sep (for example, empty strings) are handled 288 // as described in the documentation for SplitAfter. 289 func SplitAfterN(s, sep string, n int) []string { 290 return genSplit(s, sep, len(sep), n) 291 } 292 293 // Split slices s into all substrings separated by sep and returns a slice of 294 // the substrings between those separators. 295 // 296 // If s does not contain sep and sep is not empty, Split returns a 297 // slice of length 1 whose only element is s. 
//
// If sep is empty, Split splits after each UTF-8 sequence. If both s
// and sep are empty, Split returns an empty slice.
//
// It is equivalent to SplitN with a count of -1.
//
// To split around the first instance of a separator, see Cut.
func Split(s, sep string) []string { return genSplit(s, sep, 0, -1) }

// SplitAfter slices s into all substrings after each instance of sep and
// returns a slice of those substrings.
//
// If s does not contain sep and sep is not empty, SplitAfter returns
// a slice of length 1 whose only element is s.
//
// If sep is empty, SplitAfter splits after each UTF-8 sequence. If
// both s and sep are empty, SplitAfter returns an empty slice.
//
// It is equivalent to SplitAfterN with a count of -1.
func SplitAfter(s, sep string) []string {
	return genSplit(s, sep, len(sep), -1)
}

// asciiSpace is a byte-indexed lookup table holding 1 for each ASCII
// white space byte and 0 for every other byte value.
var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}

// Fields splits the string s around each instance of one or more consecutive white space
// characters, as defined by unicode.IsSpace, returning a slice of substrings of s or an
// empty slice if s contains only white space.
func Fields(s string) []string {
	// First count the fields.
	// This is an exact count if s is ASCII, otherwise it is an approximation.
	n := 0
	wasSpace := 1
	// setBits is used to track which bits are set in the bytes of s.
	setBits := uint8(0)
	for i := 0; i < len(s); i++ {
		r := s[i]
		setBits |= r
		isSpace := int(asciiSpace[r])
		// Count a field start: previous byte was a space, this one is not.
		n += wasSpace & ^isSpace
		wasSpace = isSpace
	}

	if setBits >= utf8.RuneSelf {
		// Some runes in the input string are not ASCII.
		return FieldsFunc(s, unicode.IsSpace)
	}
	// ASCII fast path
	a := make([]string, n)
	na := 0
	fieldStart := 0
	i := 0
	// Skip spaces in the front of the input.
	for i < len(s) && asciiSpace[s[i]] != 0 {
		i++
	}
	fieldStart = i
	for i < len(s) {
		if asciiSpace[s[i]] == 0 {
			i++
			continue
		}
		a[na] = s[fieldStart:i]
		na++
		i++
		// Skip spaces in between fields.
		for i < len(s) && asciiSpace[s[i]] != 0 {
			i++
		}
		fieldStart = i
	}
	if fieldStart < len(s) { // Last field might end at EOF.
		a[na] = s[fieldStart:]
	}
	return a
}

// FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
// and returns an array of slices of s. If all code points in s satisfy f(c) or the
// string is empty, an empty slice is returned.
//
// FieldsFunc makes no guarantees about the order in which it calls f(c)
// and assumes that f always returns the same value for a given c.
func FieldsFunc(s string, f func(rune) bool) []string {
	// A span is used to record a slice of s of the form s[start:end].
	// The start index is inclusive and the end index is exclusive.
	type span struct {
		start int
		end   int
	}
	spans := make([]span, 0, 32)

	// Find the field start and end indices.
	// Doing this in a separate pass (rather than slicing the string s
	// and collecting the result substrings right away) is significantly
	// more efficient, possibly due to cache effects.
	start := -1 // valid span start if >= 0
	for end, rune := range s {
		if f(rune) {
			if start >= 0 {
				spans = append(spans, span{start, end})
				// Set start to a negative value.
				// Note: using -1 here consistently and reproducibly
				// slows down this code by several percent on amd64.
				start = ^start
			}
		} else {
			if start < 0 {
				start = end
			}
		}
	}

	// Last field might end at EOF.
	if start >= 0 {
		spans = append(spans, span{start, len(s)})
	}

	// Create strings from recorded field indices.
	a := make([]string, len(spans))
	for i, span := range spans {
		a[i] = s[span.start:span.end]
	}

	return a
}

// Join concatenates the elements of its first argument to create a single string. The separator
// string sep is placed between elements in the resulting string.
func Join(elems []string, sep string) string {
	switch len(elems) {
	case 0:
		return ""
	case 1:
		return elems[0]
	}
	// Compute the exact output length so the builder grows only once.
	n := len(sep) * (len(elems) - 1)
	for i := 0; i < len(elems); i++ {
		n += len(elems[i])
	}

	var b Builder
	b.Grow(n)
	b.WriteString(elems[0])
	for _, s := range elems[1:] {
		b.WriteString(sep)
		b.WriteString(s)
	}
	return b.String()
}

// HasPrefix tests whether the string s begins with prefix.
func HasPrefix(s, prefix string) bool {
	return len(s) >= len(prefix) && s[0:len(prefix)] == prefix
}

// HasSuffix tests whether the string s ends with suffix.
func HasSuffix(s, suffix string) bool {
	return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix
}

// Map returns a copy of the string s with all its characters modified
// according to the mapping function. If mapping returns a negative value, the character is
// dropped from the string with no replacement.
func Map(mapping func(rune) rune, s string) string {
	// In the worst case, the string can grow when mapped, making
	// things unpleasant. But it's so rare we barge in assuming it's
	// fine. It could also shrink but that falls out naturally.

	// The output buffer b is initialized on demand, the first
	// time a character differs.
	var b Builder

	for i, c := range s {
		r := mapping(c)
		if r == c && c != utf8.RuneError {
			continue
		}

		var width int
		if c == utf8.RuneError {
			// Re-decode to tell a genuine U+FFFD (width != 1) apart from
			// an invalid byte (width == 1), which must always be rewritten.
			c, width = utf8.DecodeRuneInString(s[i:])
			if width != 1 && r == c {
				continue
			}
		} else {
			width = utf8.RuneLen(c)
		}

		// First difference found: copy the unchanged prefix, emit the
		// mapped rune, and leave the rest of s for the loop below.
		b.Grow(len(s) + utf8.UTFMax)
		b.WriteString(s[:i])
		if r >= 0 {
			b.WriteRune(r)
		}

		s = s[i+width:]
		break
	}

	// Fast path for unchanged input
	if b.Cap() == 0 { // didn't call b.Grow above
		return s
	}

	for _, c := range s {
		r := mapping(c)

		if r >= 0 {
			// common case
			// Due to inlining, it is more performant to determine if WriteByte should be
			// invoked rather than always call WriteRune
			if r < utf8.RuneSelf {
				b.WriteByte(byte(r))
			} else {
				// r is not an ASCII rune.
				b.WriteRune(r)
			}
		}
	}

	return b.String()
}

// Repeat returns a new string consisting of count copies of the string s.
//
// It panics if count is negative or if the result of (len(s) * count)
// overflows.
func Repeat(s string, count int) string {
	switch count {
	case 0:
		return ""
	case 1:
		return s
	}

	// Since we cannot return an error on overflow,
	// we should panic if the repeat will generate
	// an overflow.
	// See golang.org/issue/16237.
	if count < 0 {
		panic("strings: negative Repeat count")
	} else if len(s)*count/count != len(s) {
		panic("strings: Repeat count causes overflow")
	}

	if len(s) == 0 {
		return ""
	}

	n := len(s) * count

	// Past a certain chunk size it is counterproductive to use
	// larger chunks as the source of the write, as when the source
	// is too large we are basically just thrashing the CPU D-cache.
	// So if the result length is larger than an empirically-found
	// limit (8KB), we stop growing the source string once the limit
	// is reached and keep reusing the same source string - that
	// should therefore be always resident in the L1 cache - until we
	// have completed the construction of the result.
	// This yields significant speedups (up to +100%) in cases where
	// the result length is large (roughly, over L2 cache size).
	const chunkLimit = 8 * 1024
	chunkMax := n
	if n > chunkLimit {
		// Largest multiple of len(s) not exceeding chunkLimit, or a
		// single copy of s when s alone is longer than chunkLimit.
		chunkMax = chunkLimit / len(s) * len(s)
		if chunkMax == 0 {
			chunkMax = len(s)
		}
	}

	var b Builder
	b.Grow(n)
	b.WriteString(s)
	// Append a prefix of what has been written so far: at most the bytes
	// still missing, the bytes already written, and chunkMax.
	for b.Len() < n {
		chunk := n - b.Len()
		if chunk > b.Len() {
			chunk = b.Len()
		}
		if chunk > chunkMax {
			chunk = chunkMax
		}
		b.WriteString(b.String()[:chunk])
	}
	return b.String()
}

// ToUpper returns s with all Unicode letters mapped to their upper case.
func ToUpper(s string) string {
	// Single pass to learn whether s is pure ASCII and, if so, whether it
	// contains any lower-case letter at all.
	isASCII, hasLower := true, false
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c >= utf8.RuneSelf {
			isASCII = false
			break
		}
		hasLower = hasLower || ('a' <= c && c <= 'z')
	}

	if isASCII { // optimize for ASCII-only strings.
		if !hasLower {
			return s
		}
		var (
			b   Builder
			pos int
		)
		b.Grow(len(s))
		// pos marks the start of the pending run of unchanged bytes.
		for i := 0; i < len(s); i++ {
			c := s[i]
			if 'a' <= c && c <= 'z' {
				c -= 'a' - 'A'
				if pos < i {
					b.WriteString(s[pos:i])
				}
				b.WriteByte(c)
				pos = i + 1
			}
		}
		if pos < len(s) {
			b.WriteString(s[pos:])
		}
		return b.String()
	}
	return Map(unicode.ToUpper, s)
}

// ToLower returns s with all Unicode letters mapped to their lower case.
625 func ToLower(s string) string { 626 isASCII, hasUpper := true, false 627 for i := 0; i < len(s); i++ { 628 c := s[i] 629 if c >= utf8.RuneSelf { 630 isASCII = false 631 break 632 } 633 hasUpper = hasUpper || ('A' <= c && c <= 'Z') 634 } 635 636 if isASCII { // optimize for ASCII-only strings. 637 if !hasUpper { 638 return s 639 } 640 var ( 641 b Builder 642 pos int 643 ) 644 b.Grow(len(s)) 645 for i := 0; i < len(s); i++ { 646 c := s[i] 647 if 'A' <= c && c <= 'Z' { 648 c += 'a' - 'A' 649 if pos < i { 650 b.WriteString(s[pos:i]) 651 } 652 b.WriteByte(c) 653 pos = i + 1 654 } 655 } 656 if pos < len(s) { 657 b.WriteString(s[pos:]) 658 } 659 return b.String() 660 } 661 return Map(unicode.ToLower, s) 662 } 663 664 // ToTitle returns a copy of the string s with all Unicode letters mapped to 665 // their Unicode title case. 666 func ToTitle(s string) string { return Map(unicode.ToTitle, s) } 667 668 // ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their 669 // upper case using the case mapping specified by c. 670 func ToUpperSpecial(c unicode.SpecialCase, s string) string { 671 return Map(c.ToUpper, s) 672 } 673 674 // ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their 675 // lower case using the case mapping specified by c. 676 func ToLowerSpecial(c unicode.SpecialCase, s string) string { 677 return Map(c.ToLower, s) 678 } 679 680 // ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their 681 // Unicode title case, giving priority to the special casing rules. 682 func ToTitleSpecial(c unicode.SpecialCase, s string) string { 683 return Map(c.ToTitle, s) 684 } 685 686 // ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences 687 // replaced by the replacement string, which may be empty. 
func ToValidUTF8(s, replacement string) string {
	var b Builder

	// Look for the first invalid byte. The builder is only set up once one
	// is found, so fully valid input is returned without copying.
	for i, c := range s {
		if c != utf8.RuneError {
			continue
		}

		// Width 1 means an invalid byte rather than an encoded U+FFFD.
		_, wid := utf8.DecodeRuneInString(s[i:])
		if wid == 1 {
			b.Grow(len(s) + len(replacement))
			b.WriteString(s[:i])
			s = s[i:]
			break
		}
	}

	// Fast path for unchanged input
	if b.Cap() == 0 { // didn't call b.Grow above
		return s
	}

	invalid := false // previous byte was from an invalid UTF-8 sequence
	for i := 0; i < len(s); {
		c := s[i]
		if c < utf8.RuneSelf {
			i++
			invalid = false
			b.WriteByte(c)
			continue
		}
		_, wid := utf8.DecodeRuneInString(s[i:])
		if wid == 1 {
			i++
			// Emit the replacement once per run of invalid bytes.
			if !invalid {
				invalid = true
				b.WriteString(replacement)
			}
			continue
		}
		invalid = false
		b.WriteString(s[i : i+wid])
		i += wid
	}

	return b.String()
}

// isSeparator reports whether the rune could mark a word boundary.
// TODO: update when package unicode captures more of the properties.
func isSeparator(r rune) bool {
	// ASCII alphanumerics and underscore are not separators
	if r <= 0x7F {
		switch {
		case '0' <= r && r <= '9':
			return false
		case 'a' <= r && r <= 'z':
			return false
		case 'A' <= r && r <= 'Z':
			return false
		case r == '_':
			return false
		}
		return true
	}
	// Letters and digits are not separators
	if unicode.IsLetter(r) || unicode.IsDigit(r) {
		return false
	}
	// Otherwise, all we can do for now is treat spaces as separators.
	return unicode.IsSpace(r)
}

// Title returns a copy of the string s with all Unicode letters that begin words
// mapped to their Unicode title case.
//
// Deprecated: The rule Title uses for word boundaries does not handle Unicode
// punctuation properly. Use golang.org/x/text/cases instead.
func Title(s string) string {
	// Use a closure here to remember state.
	// Hackish but effective. Depends on Map scanning in order and calling
	// the closure once per rune.
	prev := ' ' // starts as a separator so the first rune is title-cased
	return Map(
		func(r rune) rune {
			if isSeparator(prev) {
				prev = r
				return unicode.ToTitle(r)
			}
			prev = r
			return r
		},
		s)
}

// TrimLeftFunc returns a slice of the string s with all leading
// Unicode code points c satisfying f(c) removed.
func TrimLeftFunc(s string, f func(rune) bool) string {
	// Find the first rune that does not satisfy f.
	i := indexFunc(s, f, false)
	if i == -1 {
		return ""
	}
	return s[i:]
}

// TrimRightFunc returns a slice of the string s with all trailing
// Unicode code points c satisfying f(c) removed.
func TrimRightFunc(s string, f func(rune) bool) string {
	// i is the start of the last rune that does not satisfy f, or -1.
	i := lastIndexFunc(s, f, false)
	if i >= 0 && s[i] >= utf8.RuneSelf {
		// Multi-byte rune: step past its full encoding.
		_, wid := utf8.DecodeRuneInString(s[i:])
		i += wid
	} else {
		// Single-byte rune, or i == -1, which becomes 0 (empty result).
		i++
	}
	return s[0:i]
}

// TrimFunc returns a slice of the string s with all leading
// and trailing Unicode code points c satisfying f(c) removed.
func TrimFunc(s string, f func(rune) bool) string {
	return TrimRightFunc(TrimLeftFunc(s, f), f)
}

// IndexFunc returns the index into s of the first Unicode
// code point satisfying f(c), or -1 if none do.
func IndexFunc(s string, f func(rune) bool) int {
	return indexFunc(s, f, true)
}

// LastIndexFunc returns the index into s of the last
// Unicode code point satisfying f(c), or -1 if none do.
func LastIndexFunc(s string, f func(rune) bool) int {
	return lastIndexFunc(s, f, true)
}

// indexFunc is the same as IndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
827 func indexFunc(s string, f func(rune) bool, truth bool) int { 828 for i, r := range s { 829 if f(r) == truth { 830 return i 831 } 832 } 833 return -1 834 } 835 836 // lastIndexFunc is the same as LastIndexFunc except that if 837 // truth==false, the sense of the predicate function is 838 // inverted. 839 func lastIndexFunc(s string, f func(rune) bool, truth bool) int { 840 for i := len(s); i > 0; { 841 r, size := utf8.DecodeLastRuneInString(s[0:i]) 842 i -= size 843 if f(r) == truth { 844 return i 845 } 846 } 847 return -1 848 } 849 850 // asciiSet is a 32-byte value, where each bit represents the presence of a 851 // given ASCII character in the set. The 128-bits of the lower 16 bytes, 852 // starting with the least-significant bit of the lowest word to the 853 // most-significant bit of the highest word, map to the full range of all 854 // 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed, 855 // ensuring that any non-ASCII character will be reported as not in the set. 856 // This allocates a total of 32 bytes even though the upper half 857 // is unused to avoid bounds checks in asciiSet.contains. 858 type asciiSet [8]uint32 859 860 // makeASCIISet creates a set of ASCII characters and reports whether all 861 // characters in chars are ASCII. 862 func makeASCIISet(chars string) (as asciiSet, ok bool) { 863 for i := 0; i < len(chars); i++ { 864 c := chars[i] 865 if c >= utf8.RuneSelf { 866 return as, false 867 } 868 as[c/32] |= 1 << (c % 32) 869 } 870 return as, true 871 } 872 873 // contains reports whether c is inside the set. 874 func (as *asciiSet) contains(c byte) bool { 875 return (as[c/32] & (1 << (c % 32))) != 0 876 } 877 878 // Trim returns a slice of the string s with all leading and 879 // trailing Unicode code points contained in cutset removed. 
880 func Trim(s, cutset string) string { 881 if s == "" || cutset == "" { 882 return s 883 } 884 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { 885 return trimLeftByte(trimRightByte(s, cutset[0]), cutset[0]) 886 } 887 if as, ok := makeASCIISet(cutset); ok { 888 return trimLeftASCII(trimRightASCII(s, &as), &as) 889 } 890 return trimLeftUnicode(trimRightUnicode(s, cutset), cutset) 891 } 892 893 // TrimLeft returns a slice of the string s with all leading 894 // Unicode code points contained in cutset removed. 895 // 896 // To remove a prefix, use TrimPrefix instead. 897 func TrimLeft(s, cutset string) string { 898 if s == "" || cutset == "" { 899 return s 900 } 901 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { 902 return trimLeftByte(s, cutset[0]) 903 } 904 if as, ok := makeASCIISet(cutset); ok { 905 return trimLeftASCII(s, &as) 906 } 907 return trimLeftUnicode(s, cutset) 908 } 909 910 func trimLeftByte(s string, c byte) string { 911 for len(s) > 0 && s[0] == c { 912 s = s[1:] 913 } 914 return s 915 } 916 917 func trimLeftASCII(s string, as *asciiSet) string { 918 for len(s) > 0 { 919 if !as.contains(s[0]) { 920 break 921 } 922 s = s[1:] 923 } 924 return s 925 } 926 927 func trimLeftUnicode(s, cutset string) string { 928 for len(s) > 0 { 929 r, n := rune(s[0]), 1 930 if r >= utf8.RuneSelf { 931 r, n = utf8.DecodeRuneInString(s) 932 } 933 if !ContainsRune(cutset, r) { 934 break 935 } 936 s = s[n:] 937 } 938 return s 939 } 940 941 // TrimRight returns a slice of the string s, with all trailing 942 // Unicode code points contained in cutset removed. 943 // 944 // To remove a suffix, use TrimSuffix instead. 
func TrimRight(s, cutset string) string {
	if s == "" || cutset == "" {
		return s
	}
	// Cheapest first: a single ASCII byte, then an all-ASCII set, then
	// full rune decoding.
	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
		return trimRightByte(s, cutset[0])
	}
	if as, ok := makeASCIISet(cutset); ok {
		return trimRightASCII(s, &as)
	}
	return trimRightUnicode(s, cutset)
}

// trimRightByte removes every trailing occurrence of the byte c from s.
func trimRightByte(s string, c byte) string {
	for len(s) > 0 && s[len(s)-1] == c {
		s = s[:len(s)-1]
	}
	return s
}

// trimRightASCII removes every trailing byte of s that is a member of as.
func trimRightASCII(s string, as *asciiSet) string {
	for len(s) > 0 {
		if !as.contains(s[len(s)-1]) {
			break
		}
		s = s[:len(s)-1]
	}
	return s
}

// trimRightUnicode removes every trailing rune of s that occurs in cutset.
func trimRightUnicode(s, cutset string) string {
	for len(s) > 0 {
		// Treat the last byte as a one-byte rune unless it is >= utf8.RuneSelf,
		// in which case the last rune is decoded properly.
		r, n := rune(s[len(s)-1]), 1
		if r >= utf8.RuneSelf {
			r, n = utf8.DecodeLastRuneInString(s)
		}
		if !ContainsRune(cutset, r) {
			break
		}
		s = s[:len(s)-n]
	}
	return s
}

// TrimSpace returns a slice of the string s, with all leading
// and trailing white space removed, as defined by Unicode.
func TrimSpace(s string) string {
	// Fast path for ASCII: look for the first ASCII non-space byte
	start := 0
	for ; start < len(s); start++ {
		c := s[start]
		if c >= utf8.RuneSelf {
			// If we run into a non-ASCII byte, fall back to the
			// slower unicode-aware method on the remaining bytes
			return TrimFunc(s[start:], unicode.IsSpace)
		}
		if asciiSpace[c] == 0 {
			break
		}
	}

	// Now look for the first ASCII non-space byte from the end
	stop := len(s)
	for ; stop > start; stop-- {
		c := s[stop-1]
		if c >= utf8.RuneSelf {
			// start has been already trimmed above, should trim end only
			return TrimRightFunc(s[start:stop], unicode.IsSpace)
		}
		if asciiSpace[c] == 0 {
			break
		}
	}

	// At this point s[start:stop] starts and ends with an ASCII
	// non-space byte, so we're done. Non-ASCII cases have already
	// been handled above.
	return s[start:stop]
}

// TrimPrefix returns s without the provided leading prefix string.
// If s doesn't start with prefix, s is returned unchanged.
func TrimPrefix(s, prefix string) string {
	if HasPrefix(s, prefix) {
		return s[len(prefix):]
	}
	return s
}

// TrimSuffix returns s without the provided trailing suffix string.
// If s doesn't end with suffix, s is returned unchanged.
func TrimSuffix(s, suffix string) string {
	if HasSuffix(s, suffix) {
		return s[:len(s)-len(suffix)]
	}
	return s
}

// Replace returns a copy of the string s with the first n
// non-overlapping instances of old replaced by new.
// If old is empty, it matches at the beginning of the string
// and after each UTF-8 sequence, yielding up to k+1 replacements
// for a k-rune string.
// If n < 0, there is no limit on the number of replacements.
func Replace(s, old, new string, n int) string {
	if old == new || n == 0 {
		return s // avoid allocation
	}

	// Compute number of replacements.
	if m := Count(s, old); m == 0 {
		return s // avoid allocation
	} else if n < 0 || m < n {
		n = m
	}

	// Apply replacements to buffer.
	var b Builder
	b.Grow(len(s) + n*(len(new)-len(old)))
	start := 0
	for i := 0; i < n; i++ {
		// j is the position of the next match at or after start.
		j := start
		if len(old) == 0 {
			// Empty old: match before the first rune, then after each rune.
			if i > 0 {
				_, wid := utf8.DecodeRuneInString(s[start:])
				j += wid
			}
		} else {
			j += Index(s[start:], old)
		}
		b.WriteString(s[start:j])
		b.WriteString(new)
		start = j + len(old)
	}
	b.WriteString(s[start:])
	return b.String()
}

// ReplaceAll returns a copy of the string s with all
// non-overlapping instances of old replaced by new.
// If old is empty, it matches at the beginning of the string
// and after each UTF-8 sequence, yielding up to k+1 replacements
// for a k-rune string.
func ReplaceAll(s, old, new string) string {
	return Replace(s, old, new, -1)
}

// EqualFold reports whether s and t, interpreted as UTF-8 strings,
// are equal under simple Unicode case-folding, which is a more general
// form of case-insensitivity.
func EqualFold(s, t string) bool {
	// ASCII fast path
	i := 0
	for ; i < len(s) && i < len(t); i++ {
		sr := s[i]
		tr := t[i]
		// A set high bit in either byte means non-ASCII: switch to the
		// rune-based comparison from this position on.
		if sr|tr >= utf8.RuneSelf {
			goto hasUnicode
		}

		// Easy case.
		if tr == sr {
			continue
		}

		// Make sr < tr to simplify what follows.
		if tr < sr {
			tr, sr = sr, tr
		}
		// ASCII only, sr/tr must be upper/lower case
		if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
			continue
		}
		return false
	}
	// Check if we've exhausted both strings.
	return len(s) == len(t)

hasUnicode:
	// Everything before index i has already been matched.
	s = s[i:]
	t = t[i:]
	for _, sr := range s {
		// If t is exhausted the strings are not equal.
		if len(t) == 0 {
			return false
		}

		// Extract first rune from second string.
		var tr rune
		if t[0] < utf8.RuneSelf {
			tr, t = rune(t[0]), t[1:]
		} else {
			r, size := utf8.DecodeRuneInString(t)
			tr, t = r, t[size:]
		}

		// If they match, keep going; if not, return false.

		// Easy case.
		if tr == sr {
			continue
		}

		// Make sr < tr to simplify what follows.
		if tr < sr {
			tr, sr = sr, tr
		}
		// Fast check for ASCII.
		if tr < utf8.RuneSelf {
			// ASCII only, sr/tr must be upper/lower case
			if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
				continue
			}
			return false
		}

		// General case. SimpleFold(x) returns the next equivalent rune > x
		// or wraps around to smaller values.
		r := unicode.SimpleFold(sr)
		for r != sr && r < tr {
			r = unicode.SimpleFold(r)
		}
		if r == tr {
			continue
		}
		return false
	}

	// First string is empty, so check if the second one is also empty.
	return len(t) == 0
}

// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
func Index(s, substr string) int {
	n := len(substr)
	switch {
	case n == 0:
		return 0
	case n == 1:
		return IndexByte(s, substr[0])
	case n == len(s):
		if substr == s {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	case n <= bytealg.MaxLen:
		// Use brute force when s and substr both are small
		if len(s) <= bytealg.MaxBruteForce {
			return bytealg.IndexString(s, substr)
		}
		c0 := substr[0]
		c1 := substr[1]
		i := 0
		t := len(s) - n + 1 // one past the last index where a match can start
		fails := 0
		for i < t {
			if s[i] != c0 {
				// IndexByte is faster than bytealg.IndexString, so use it as long as
				// we're not getting lots of false positives.
				o := IndexByte(s[i+1:t], c0)
				if o < 0 {
					return -1
				}
				i += o + 1
			}
			// Check the second byte before paying for the full comparison.
			if s[i+1] == c1 && s[i:i+n] == substr {
				return i
			}
			fails++
			i++
			// Switch to bytealg.IndexString when IndexByte produces too many false positives.
			if fails > bytealg.Cutover(i) {
				r := bytealg.IndexString(s[i:], substr)
				if r >= 0 {
					return r + i
				}
				return -1
			}
		}
		return -1
	}
	// Remaining case: substr is longer than bytealg.MaxLen and shorter than s.
	c0 := substr[0]
	c1 := substr[1]
	i := 0
	t := len(s) - n + 1 // one past the last index where a match can start
	fails := 0
	for i < t {
		if s[i] != c0 {
			o := IndexByte(s[i+1:t], c0)
			if o < 0 {
				return -1
			}
			i += o + 1
		}
		if s[i+1] == c1 && s[i:i+n] == substr {
			return i
		}
		i++
		fails++
		// After too many false positives relative to the distance covered,
		// hand the rest of the search to Rabin-Karp.
		if fails >= 4+i>>4 && i < t {
			// See comment in ../bytes/bytes.go.
			j := bytealg.IndexRabinKarp(s[i:], substr)
			if j < 0 {
				return -1
			}
			return i + j
		}
	}
	return -1
}

// Cut slices s around the first instance of sep,
// returning the text before and after sep.
// The found result reports whether sep appears in s.
// If sep does not appear in s, Cut returns s, "", false.
func Cut(s, sep string) (before, after string, found bool) {
	if i := Index(s, sep); i >= 0 {
		return s[:i], s[i+len(sep):], true
	}
	return s, "", false
}

// CutPrefix returns s without the provided leading prefix string
// and reports whether it found the prefix.
// If s doesn't start with prefix, CutPrefix returns s, false.
// If prefix is the empty string, CutPrefix returns s, true.
func CutPrefix(s, prefix string) (after string, found bool) {
	if !HasPrefix(s, prefix) {
		return s, false
	}
	return s[len(prefix):], true
}

// CutSuffix returns s without the provided ending suffix string
// and reports whether it found the suffix.
// If s doesn't end with suffix, CutSuffix returns s, false.
// If suffix is the empty string, CutSuffix returns s, true.
func CutSuffix(s, suffix string) (before string, found bool) {
	if !HasSuffix(s, suffix) {
		return s, false
	}
	return s[:len(s)-len(suffix)], true
}