github.com/twelsh-aw/go/src@v0.0.0-20230516233729-a56fe86a7c81/strings/strings.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package strings implements simple functions to manipulate UTF-8 encoded strings. 6 // 7 // For information about UTF-8 strings in Go, see https://blog.golang.org/strings. 8 package strings 9 10 import ( 11 "internal/bytealg" 12 "unicode" 13 "unicode/utf8" 14 ) 15 16 const maxInt = int(^uint(0) >> 1) 17 18 // explode splits s into a slice of UTF-8 strings, 19 // one string per Unicode character up to a maximum of n (n < 0 means no limit). 20 // Invalid UTF-8 bytes are sliced individually. 21 func explode(s string, n int) []string { 22 l := utf8.RuneCountInString(s) 23 if n < 0 || n > l { 24 n = l 25 } 26 a := make([]string, n) 27 for i := 0; i < n-1; i++ { 28 _, size := utf8.DecodeRuneInString(s) 29 a[i] = s[:size] 30 s = s[size:] 31 } 32 if n > 0 { 33 a[n-1] = s 34 } 35 return a 36 } 37 38 // Count counts the number of non-overlapping instances of substr in s. 39 // If substr is an empty string, Count returns 1 + the number of Unicode code points in s. 40 func Count(s, substr string) int { 41 // special case 42 if len(substr) == 0 { 43 return utf8.RuneCountInString(s) + 1 44 } 45 if len(substr) == 1 { 46 return bytealg.CountString(s, substr[0]) 47 } 48 n := 0 49 for { 50 i := Index(s, substr) 51 if i == -1 { 52 return n 53 } 54 n++ 55 s = s[i+len(substr):] 56 } 57 } 58 59 // Contains reports whether substr is within s. 60 func Contains(s, substr string) bool { 61 return Index(s, substr) >= 0 62 } 63 64 // ContainsAny reports whether any Unicode code points in chars are within s. 65 func ContainsAny(s, chars string) bool { 66 return IndexAny(s, chars) >= 0 67 } 68 69 // ContainsRune reports whether the Unicode code point r is within s. 
70 func ContainsRune(s string, r rune) bool { 71 return IndexRune(s, r) >= 0 72 } 73 74 // ContainsFunc reports whether any Unicode code points r within s satisfy f(r). 75 func ContainsFunc(s string, f func(rune) bool) bool { 76 return IndexFunc(s, f) >= 0 77 } 78 79 // LastIndex returns the index of the last instance of substr in s, or -1 if substr is not present in s. 80 func LastIndex(s, substr string) int { 81 n := len(substr) 82 switch { 83 case n == 0: 84 return len(s) 85 case n == 1: 86 return LastIndexByte(s, substr[0]) 87 case n == len(s): 88 if substr == s { 89 return 0 90 } 91 return -1 92 case n > len(s): 93 return -1 94 } 95 // Rabin-Karp search from the end of the string 96 hashss, pow := bytealg.HashStrRev(substr) 97 last := len(s) - n 98 var h uint32 99 for i := len(s) - 1; i >= last; i-- { 100 h = h*bytealg.PrimeRK + uint32(s[i]) 101 } 102 if h == hashss && s[last:] == substr { 103 return last 104 } 105 for i := last - 1; i >= 0; i-- { 106 h *= bytealg.PrimeRK 107 h += uint32(s[i]) 108 h -= pow * uint32(s[i+n]) 109 if h == hashss && s[i:i+n] == substr { 110 return i 111 } 112 } 113 return -1 114 } 115 116 // IndexByte returns the index of the first instance of c in s, or -1 if c is not present in s. 117 func IndexByte(s string, c byte) int { 118 return bytealg.IndexByteString(s, c) 119 } 120 121 // IndexRune returns the index of the first instance of the Unicode code point 122 // r, or -1 if rune is not present in s. 123 // If r is utf8.RuneError, it returns the first instance of any 124 // invalid UTF-8 byte sequence. 
125 func IndexRune(s string, r rune) int { 126 switch { 127 case 0 <= r && r < utf8.RuneSelf: 128 return IndexByte(s, byte(r)) 129 case r == utf8.RuneError: 130 for i, r := range s { 131 if r == utf8.RuneError { 132 return i 133 } 134 } 135 return -1 136 case !utf8.ValidRune(r): 137 return -1 138 default: 139 return Index(s, string(r)) 140 } 141 } 142 143 // IndexAny returns the index of the first instance of any Unicode code point 144 // from chars in s, or -1 if no Unicode code point from chars is present in s. 145 func IndexAny(s, chars string) int { 146 if chars == "" { 147 // Avoid scanning all of s. 148 return -1 149 } 150 if len(chars) == 1 { 151 // Avoid scanning all of s. 152 r := rune(chars[0]) 153 if r >= utf8.RuneSelf { 154 r = utf8.RuneError 155 } 156 return IndexRune(s, r) 157 } 158 if len(s) > 8 { 159 if as, isASCII := makeASCIISet(chars); isASCII { 160 for i := 0; i < len(s); i++ { 161 if as.contains(s[i]) { 162 return i 163 } 164 } 165 return -1 166 } 167 } 168 for i, c := range s { 169 if IndexRune(chars, c) >= 0 { 170 return i 171 } 172 } 173 return -1 174 } 175 176 // LastIndexAny returns the index of the last instance of any Unicode code 177 // point from chars in s, or -1 if no Unicode code point from chars is 178 // present in s. 179 func LastIndexAny(s, chars string) int { 180 if chars == "" { 181 // Avoid scanning all of s. 
182 return -1 183 } 184 if len(s) == 1 { 185 rc := rune(s[0]) 186 if rc >= utf8.RuneSelf { 187 rc = utf8.RuneError 188 } 189 if IndexRune(chars, rc) >= 0 { 190 return 0 191 } 192 return -1 193 } 194 if len(s) > 8 { 195 if as, isASCII := makeASCIISet(chars); isASCII { 196 for i := len(s) - 1; i >= 0; i-- { 197 if as.contains(s[i]) { 198 return i 199 } 200 } 201 return -1 202 } 203 } 204 if len(chars) == 1 { 205 rc := rune(chars[0]) 206 if rc >= utf8.RuneSelf { 207 rc = utf8.RuneError 208 } 209 for i := len(s); i > 0; { 210 r, size := utf8.DecodeLastRuneInString(s[:i]) 211 i -= size 212 if rc == r { 213 return i 214 } 215 } 216 return -1 217 } 218 for i := len(s); i > 0; { 219 r, size := utf8.DecodeLastRuneInString(s[:i]) 220 i -= size 221 if IndexRune(chars, r) >= 0 { 222 return i 223 } 224 } 225 return -1 226 } 227 228 // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s. 229 func LastIndexByte(s string, c byte) int { 230 for i := len(s) - 1; i >= 0; i-- { 231 if s[i] == c { 232 return i 233 } 234 } 235 return -1 236 } 237 238 // Generic split: splits after each instance of sep, 239 // including sepSave bytes of sep in the subarrays. 240 func genSplit(s, sep string, sepSave, n int) []string { 241 if n == 0 { 242 return nil 243 } 244 if sep == "" { 245 return explode(s, n) 246 } 247 if n < 0 { 248 n = Count(s, sep) + 1 249 } 250 251 if n > len(s)+1 { 252 n = len(s) + 1 253 } 254 a := make([]string, n) 255 n-- 256 i := 0 257 for i < n { 258 m := Index(s, sep) 259 if m < 0 { 260 break 261 } 262 a[i] = s[:m+sepSave] 263 s = s[m+len(sep):] 264 i++ 265 } 266 a[i] = s 267 return a[:i+1] 268 } 269 270 // SplitN slices s into substrings separated by sep and returns a slice of 271 // the substrings between those separators. 272 // 273 // The count determines the number of substrings to return: 274 // 275 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 
276 // n == 0: the result is nil (zero substrings) 277 // n < 0: all substrings 278 // 279 // Edge cases for s and sep (for example, empty strings) are handled 280 // as described in the documentation for Split. 281 // 282 // To split around the first instance of a separator, see Cut. 283 func SplitN(s, sep string, n int) []string { return genSplit(s, sep, 0, n) } 284 285 // SplitAfterN slices s into substrings after each instance of sep and 286 // returns a slice of those substrings. 287 // 288 // The count determines the number of substrings to return: 289 // 290 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 291 // n == 0: the result is nil (zero substrings) 292 // n < 0: all substrings 293 // 294 // Edge cases for s and sep (for example, empty strings) are handled 295 // as described in the documentation for SplitAfter. 296 func SplitAfterN(s, sep string, n int) []string { 297 return genSplit(s, sep, len(sep), n) 298 } 299 300 // Split slices s into all substrings separated by sep and returns a slice of 301 // the substrings between those separators. 302 // 303 // If s does not contain sep and sep is not empty, Split returns a 304 // slice of length 1 whose only element is s. 305 // 306 // If sep is empty, Split splits after each UTF-8 sequence. If both s 307 // and sep are empty, Split returns an empty slice. 308 // 309 // It is equivalent to SplitN with a count of -1. 310 // 311 // To split around the first instance of a separator, see Cut. 312 func Split(s, sep string) []string { return genSplit(s, sep, 0, -1) } 313 314 // SplitAfter slices s into all substrings after each instance of sep and 315 // returns a slice of those substrings. 316 // 317 // If s does not contain sep and sep is not empty, SplitAfter returns 318 // a slice of length 1 whose only element is s. 319 // 320 // If sep is empty, SplitAfter splits after each UTF-8 sequence. If 321 // both s and sep are empty, SplitAfter returns an empty slice. 
322 // 323 // It is equivalent to SplitAfterN with a count of -1. 324 func SplitAfter(s, sep string) []string { 325 return genSplit(s, sep, len(sep), -1) 326 } 327 328 var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} 329 330 // Fields splits the string s around each instance of one or more consecutive white space 331 // characters, as defined by unicode.IsSpace, returning a slice of substrings of s or an 332 // empty slice if s contains only white space. 333 func Fields(s string) []string { 334 // First count the fields. 335 // This is an exact count if s is ASCII, otherwise it is an approximation. 336 n := 0 337 wasSpace := 1 338 // setBits is used to track which bits are set in the bytes of s. 339 setBits := uint8(0) 340 for i := 0; i < len(s); i++ { 341 r := s[i] 342 setBits |= r 343 isSpace := int(asciiSpace[r]) 344 n += wasSpace & ^isSpace 345 wasSpace = isSpace 346 } 347 348 if setBits >= utf8.RuneSelf { 349 // Some runes in the input string are not ASCII. 350 return FieldsFunc(s, unicode.IsSpace) 351 } 352 // ASCII fast path 353 a := make([]string, n) 354 na := 0 355 fieldStart := 0 356 i := 0 357 // Skip spaces in the front of the input. 358 for i < len(s) && asciiSpace[s[i]] != 0 { 359 i++ 360 } 361 fieldStart = i 362 for i < len(s) { 363 if asciiSpace[s[i]] == 0 { 364 i++ 365 continue 366 } 367 a[na] = s[fieldStart:i] 368 na++ 369 i++ 370 // Skip spaces in between fields. 371 for i < len(s) && asciiSpace[s[i]] != 0 { 372 i++ 373 } 374 fieldStart = i 375 } 376 if fieldStart < len(s) { // Last field might end at EOF. 377 a[na] = s[fieldStart:] 378 } 379 return a 380 } 381 382 // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c) 383 // and returns an array of slices of s. If all code points in s satisfy f(c) or the 384 // string is empty, an empty slice is returned. 
//
// FieldsFunc makes no guarantees about the order in which it calls f(c)
// and assumes that f always returns the same value for a given c.
func FieldsFunc(s string, f func(rune) bool) []string {
	// A span is used to record a slice of s of the form s[start:end].
	// The start index is inclusive and the end index is exclusive.
	type span struct {
		start int
		end   int
	}
	spans := make([]span, 0, 32)

	// Find the field start and end indices.
	// Doing this in a separate pass (rather than slicing the string s
	// and collecting the result substrings right away) is significantly
	// more efficient, possibly due to cache effects.
	start := -1 // valid span start if >= 0
	for end, r := range s {
		if f(r) {
			if start >= 0 {
				spans = append(spans, span{start, end})
				// Set start to a negative value.
				// Note: using -1 here consistently and reproducibly
				// slows down this code by several percent on amd64.
				start = ^start
			}
		} else {
			if start < 0 {
				start = end
			}
		}
	}

	// Last field might end at EOF.
	if start >= 0 {
		spans = append(spans, span{start, len(s)})
	}

	// Create strings from recorded field indices.
	a := make([]string, len(spans))
	for i, sp := range spans {
		a[i] = s[sp.start:sp.end]
	}

	return a
}

// Join concatenates the elements of its first argument to create a single string. The separator
// string sep is placed between elements in the resulting string.
434 func Join(elems []string, sep string) string { 435 switch len(elems) { 436 case 0: 437 return "" 438 case 1: 439 return elems[0] 440 } 441 442 var n int 443 if len(sep) > 0 { 444 if len(sep) >= maxInt/(len(elems)-1) { 445 panic("strings: Join output length overflow") 446 } 447 n += len(sep) * (len(elems) - 1) 448 } 449 for _, elem := range elems { 450 if len(elem) > maxInt-n { 451 panic("strings: Join output length overflow") 452 } 453 n += len(elem) 454 } 455 456 var b Builder 457 b.Grow(n) 458 b.WriteString(elems[0]) 459 for _, s := range elems[1:] { 460 b.WriteString(sep) 461 b.WriteString(s) 462 } 463 return b.String() 464 } 465 466 // HasPrefix tests whether the string s begins with prefix. 467 func HasPrefix(s, prefix string) bool { 468 return len(s) >= len(prefix) && s[0:len(prefix)] == prefix 469 } 470 471 // HasSuffix tests whether the string s ends with suffix. 472 func HasSuffix(s, suffix string) bool { 473 return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix 474 } 475 476 // Map returns a copy of the string s with all its characters modified 477 // according to the mapping function. If mapping returns a negative value, the character is 478 // dropped from the string with no replacement. 479 func Map(mapping func(rune) rune, s string) string { 480 // In the worst case, the string can grow when mapped, making 481 // things unpleasant. But it's so rare we barge in assuming it's 482 // fine. It could also shrink but that falls out naturally. 483 484 // The output buffer b is initialized on demand, the first 485 // time a character differs. 
486 var b Builder 487 488 for i, c := range s { 489 r := mapping(c) 490 if r == c && c != utf8.RuneError { 491 continue 492 } 493 494 var width int 495 if c == utf8.RuneError { 496 c, width = utf8.DecodeRuneInString(s[i:]) 497 if width != 1 && r == c { 498 continue 499 } 500 } else { 501 width = utf8.RuneLen(c) 502 } 503 504 b.Grow(len(s) + utf8.UTFMax) 505 b.WriteString(s[:i]) 506 if r >= 0 { 507 b.WriteRune(r) 508 } 509 510 s = s[i+width:] 511 break 512 } 513 514 // Fast path for unchanged input 515 if b.Cap() == 0 { // didn't call b.Grow above 516 return s 517 } 518 519 for _, c := range s { 520 r := mapping(c) 521 522 if r >= 0 { 523 // common case 524 // Due to inlining, it is more performant to determine if WriteByte should be 525 // invoked rather than always call WriteRune 526 if r < utf8.RuneSelf { 527 b.WriteByte(byte(r)) 528 } else { 529 // r is not a ASCII rune. 530 b.WriteRune(r) 531 } 532 } 533 } 534 535 return b.String() 536 } 537 538 // Repeat returns a new string consisting of count copies of the string s. 539 // 540 // It panics if count is negative or if the result of (len(s) * count) 541 // overflows. 542 func Repeat(s string, count int) string { 543 switch count { 544 case 0: 545 return "" 546 case 1: 547 return s 548 } 549 550 // Since we cannot return an error on overflow, 551 // we should panic if the repeat will generate an overflow. 552 // See golang.org/issue/16237. 553 if count < 0 { 554 panic("strings: negative Repeat count") 555 } 556 if len(s) >= maxInt/count { 557 panic("strings: Repeat output length overflow") 558 } 559 n := len(s) * count 560 561 if len(s) == 0 { 562 return "" 563 } 564 565 // Past a certain chunk size it is counterproductive to use 566 // larger chunks as the source of the write, as when the source 567 // is too large we are basically just thrashing the CPU D-cache. 
568 // So if the result length is larger than an empirically-found 569 // limit (8KB), we stop growing the source string once the limit 570 // is reached and keep reusing the same source string - that 571 // should therefore be always resident in the L1 cache - until we 572 // have completed the construction of the result. 573 // This yields significant speedups (up to +100%) in cases where 574 // the result length is large (roughly, over L2 cache size). 575 const chunkLimit = 8 * 1024 576 chunkMax := n 577 if n > chunkLimit { 578 chunkMax = chunkLimit / len(s) * len(s) 579 if chunkMax == 0 { 580 chunkMax = len(s) 581 } 582 } 583 584 var b Builder 585 b.Grow(n) 586 b.WriteString(s) 587 for b.Len() < n { 588 chunk := n - b.Len() 589 if chunk > b.Len() { 590 chunk = b.Len() 591 } 592 if chunk > chunkMax { 593 chunk = chunkMax 594 } 595 b.WriteString(b.String()[:chunk]) 596 } 597 return b.String() 598 } 599 600 // ToUpper returns s with all Unicode letters mapped to their upper case. 601 func ToUpper(s string) string { 602 isASCII, hasLower := true, false 603 for i := 0; i < len(s); i++ { 604 c := s[i] 605 if c >= utf8.RuneSelf { 606 isASCII = false 607 break 608 } 609 hasLower = hasLower || ('a' <= c && c <= 'z') 610 } 611 612 if isASCII { // optimize for ASCII-only strings. 613 if !hasLower { 614 return s 615 } 616 var ( 617 b Builder 618 pos int 619 ) 620 b.Grow(len(s)) 621 for i := 0; i < len(s); i++ { 622 c := s[i] 623 if 'a' <= c && c <= 'z' { 624 c -= 'a' - 'A' 625 if pos < i { 626 b.WriteString(s[pos:i]) 627 } 628 b.WriteByte(c) 629 pos = i + 1 630 } 631 } 632 if pos < len(s) { 633 b.WriteString(s[pos:]) 634 } 635 return b.String() 636 } 637 return Map(unicode.ToUpper, s) 638 } 639 640 // ToLower returns s with all Unicode letters mapped to their lower case. 
641 func ToLower(s string) string { 642 isASCII, hasUpper := true, false 643 for i := 0; i < len(s); i++ { 644 c := s[i] 645 if c >= utf8.RuneSelf { 646 isASCII = false 647 break 648 } 649 hasUpper = hasUpper || ('A' <= c && c <= 'Z') 650 } 651 652 if isASCII { // optimize for ASCII-only strings. 653 if !hasUpper { 654 return s 655 } 656 var ( 657 b Builder 658 pos int 659 ) 660 b.Grow(len(s)) 661 for i := 0; i < len(s); i++ { 662 c := s[i] 663 if 'A' <= c && c <= 'Z' { 664 c += 'a' - 'A' 665 if pos < i { 666 b.WriteString(s[pos:i]) 667 } 668 b.WriteByte(c) 669 pos = i + 1 670 } 671 } 672 if pos < len(s) { 673 b.WriteString(s[pos:]) 674 } 675 return b.String() 676 } 677 return Map(unicode.ToLower, s) 678 } 679 680 // ToTitle returns a copy of the string s with all Unicode letters mapped to 681 // their Unicode title case. 682 func ToTitle(s string) string { return Map(unicode.ToTitle, s) } 683 684 // ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their 685 // upper case using the case mapping specified by c. 686 func ToUpperSpecial(c unicode.SpecialCase, s string) string { 687 return Map(c.ToUpper, s) 688 } 689 690 // ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their 691 // lower case using the case mapping specified by c. 692 func ToLowerSpecial(c unicode.SpecialCase, s string) string { 693 return Map(c.ToLower, s) 694 } 695 696 // ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their 697 // Unicode title case, giving priority to the special casing rules. 698 func ToTitleSpecial(c unicode.SpecialCase, s string) string { 699 return Map(c.ToTitle, s) 700 } 701 702 // ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences 703 // replaced by the replacement string, which may be empty. 
704 func ToValidUTF8(s, replacement string) string { 705 var b Builder 706 707 for i, c := range s { 708 if c != utf8.RuneError { 709 continue 710 } 711 712 _, wid := utf8.DecodeRuneInString(s[i:]) 713 if wid == 1 { 714 b.Grow(len(s) + len(replacement)) 715 b.WriteString(s[:i]) 716 s = s[i:] 717 break 718 } 719 } 720 721 // Fast path for unchanged input 722 if b.Cap() == 0 { // didn't call b.Grow above 723 return s 724 } 725 726 invalid := false // previous byte was from an invalid UTF-8 sequence 727 for i := 0; i < len(s); { 728 c := s[i] 729 if c < utf8.RuneSelf { 730 i++ 731 invalid = false 732 b.WriteByte(c) 733 continue 734 } 735 _, wid := utf8.DecodeRuneInString(s[i:]) 736 if wid == 1 { 737 i++ 738 if !invalid { 739 invalid = true 740 b.WriteString(replacement) 741 } 742 continue 743 } 744 invalid = false 745 b.WriteString(s[i : i+wid]) 746 i += wid 747 } 748 749 return b.String() 750 } 751 752 // isSeparator reports whether the rune could mark a word boundary. 753 // TODO: update when package unicode captures more of the properties. 754 func isSeparator(r rune) bool { 755 // ASCII alphanumerics and underscore are not separators 756 if r <= 0x7F { 757 switch { 758 case '0' <= r && r <= '9': 759 return false 760 case 'a' <= r && r <= 'z': 761 return false 762 case 'A' <= r && r <= 'Z': 763 return false 764 case r == '_': 765 return false 766 } 767 return true 768 } 769 // Letters and digits are not separators 770 if unicode.IsLetter(r) || unicode.IsDigit(r) { 771 return false 772 } 773 // Otherwise, all we can do for now is treat spaces as separators. 774 return unicode.IsSpace(r) 775 } 776 777 // Title returns a copy of the string s with all Unicode letters that begin words 778 // mapped to their Unicode title case. 779 // 780 // Deprecated: The rule Title uses for word boundaries does not handle Unicode 781 // punctuation properly. Use golang.org/x/text/cases instead. 782 func Title(s string) string { 783 // Use a closure here to remember state. 
784 // Hackish but effective. Depends on Map scanning in order and calling 785 // the closure once per rune. 786 prev := ' ' 787 return Map( 788 func(r rune) rune { 789 if isSeparator(prev) { 790 prev = r 791 return unicode.ToTitle(r) 792 } 793 prev = r 794 return r 795 }, 796 s) 797 } 798 799 // TrimLeftFunc returns a slice of the string s with all leading 800 // Unicode code points c satisfying f(c) removed. 801 func TrimLeftFunc(s string, f func(rune) bool) string { 802 i := indexFunc(s, f, false) 803 if i == -1 { 804 return "" 805 } 806 return s[i:] 807 } 808 809 // TrimRightFunc returns a slice of the string s with all trailing 810 // Unicode code points c satisfying f(c) removed. 811 func TrimRightFunc(s string, f func(rune) bool) string { 812 i := lastIndexFunc(s, f, false) 813 if i >= 0 && s[i] >= utf8.RuneSelf { 814 _, wid := utf8.DecodeRuneInString(s[i:]) 815 i += wid 816 } else { 817 i++ 818 } 819 return s[0:i] 820 } 821 822 // TrimFunc returns a slice of the string s with all leading 823 // and trailing Unicode code points c satisfying f(c) removed. 824 func TrimFunc(s string, f func(rune) bool) string { 825 return TrimRightFunc(TrimLeftFunc(s, f), f) 826 } 827 828 // IndexFunc returns the index into s of the first Unicode 829 // code point satisfying f(c), or -1 if none do. 830 func IndexFunc(s string, f func(rune) bool) int { 831 return indexFunc(s, f, true) 832 } 833 834 // LastIndexFunc returns the index into s of the last 835 // Unicode code point satisfying f(c), or -1 if none do. 836 func LastIndexFunc(s string, f func(rune) bool) int { 837 return lastIndexFunc(s, f, true) 838 } 839 840 // indexFunc is the same as IndexFunc except that if 841 // truth==false, the sense of the predicate function is 842 // inverted. 
func indexFunc(s string, f func(rune) bool, truth bool) int {
	for i, r := range s {
		if f(r) == truth {
			return i
		}
	}
	return -1
}

// lastIndexFunc is the same as LastIndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
func lastIndexFunc(s string, f func(rune) bool, truth bool) int {
	// Walk s backwards one rune at a time; i is the byte offset of the
	// rune just decoded once size has been subtracted.
	for i := len(s); i > 0; {
		r, size := utf8.DecodeLastRuneInString(s[0:i])
		i -= size
		if f(r) == truth {
			return i
		}
	}
	return -1
}

// asciiSet is a 32-byte value, where each bit represents the presence of a
// given ASCII character in the set. The 128-bits of the lower 16 bytes,
// starting with the least-significant bit of the lowest word to the
// most-significant bit of the highest word, map to the full range of all
// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
// ensuring that any non-ASCII character will be reported as not in the set.
// This allocates a total of 32 bytes even though the upper half
// is unused to avoid bounds checks in asciiSet.contains.
type asciiSet [8]uint32

// makeASCIISet creates a set of ASCII characters and reports whether all
// characters in chars are ASCII.
func makeASCIISet(chars string) (as asciiSet, ok bool) {
	for i := 0; i < len(chars); i++ {
		c := chars[i]
		if c >= utf8.RuneSelf {
			return as, false
		}
		as[c/32] |= 1 << (c % 32)
	}
	return as, true
}

// contains reports whether c is inside the set.
func (as *asciiSet) contains(c byte) bool {
	return (as[c/32] & (1 << (c % 32))) != 0
}

// Trim returns a slice of the string s with all leading and
// trailing Unicode code points contained in cutset removed.
896 func Trim(s, cutset string) string { 897 if s == "" || cutset == "" { 898 return s 899 } 900 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { 901 return trimLeftByte(trimRightByte(s, cutset[0]), cutset[0]) 902 } 903 if as, ok := makeASCIISet(cutset); ok { 904 return trimLeftASCII(trimRightASCII(s, &as), &as) 905 } 906 return trimLeftUnicode(trimRightUnicode(s, cutset), cutset) 907 } 908 909 // TrimLeft returns a slice of the string s with all leading 910 // Unicode code points contained in cutset removed. 911 // 912 // To remove a prefix, use TrimPrefix instead. 913 func TrimLeft(s, cutset string) string { 914 if s == "" || cutset == "" { 915 return s 916 } 917 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { 918 return trimLeftByte(s, cutset[0]) 919 } 920 if as, ok := makeASCIISet(cutset); ok { 921 return trimLeftASCII(s, &as) 922 } 923 return trimLeftUnicode(s, cutset) 924 } 925 926 func trimLeftByte(s string, c byte) string { 927 for len(s) > 0 && s[0] == c { 928 s = s[1:] 929 } 930 return s 931 } 932 933 func trimLeftASCII(s string, as *asciiSet) string { 934 for len(s) > 0 { 935 if !as.contains(s[0]) { 936 break 937 } 938 s = s[1:] 939 } 940 return s 941 } 942 943 func trimLeftUnicode(s, cutset string) string { 944 for len(s) > 0 { 945 r, n := rune(s[0]), 1 946 if r >= utf8.RuneSelf { 947 r, n = utf8.DecodeRuneInString(s) 948 } 949 if !ContainsRune(cutset, r) { 950 break 951 } 952 s = s[n:] 953 } 954 return s 955 } 956 957 // TrimRight returns a slice of the string s, with all trailing 958 // Unicode code points contained in cutset removed. 959 // 960 // To remove a suffix, use TrimSuffix instead. 
961 func TrimRight(s, cutset string) string { 962 if s == "" || cutset == "" { 963 return s 964 } 965 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { 966 return trimRightByte(s, cutset[0]) 967 } 968 if as, ok := makeASCIISet(cutset); ok { 969 return trimRightASCII(s, &as) 970 } 971 return trimRightUnicode(s, cutset) 972 } 973 974 func trimRightByte(s string, c byte) string { 975 for len(s) > 0 && s[len(s)-1] == c { 976 s = s[:len(s)-1] 977 } 978 return s 979 } 980 981 func trimRightASCII(s string, as *asciiSet) string { 982 for len(s) > 0 { 983 if !as.contains(s[len(s)-1]) { 984 break 985 } 986 s = s[:len(s)-1] 987 } 988 return s 989 } 990 991 func trimRightUnicode(s, cutset string) string { 992 for len(s) > 0 { 993 r, n := rune(s[len(s)-1]), 1 994 if r >= utf8.RuneSelf { 995 r, n = utf8.DecodeLastRuneInString(s) 996 } 997 if !ContainsRune(cutset, r) { 998 break 999 } 1000 s = s[:len(s)-n] 1001 } 1002 return s 1003 } 1004 1005 // TrimSpace returns a slice of the string s, with all leading 1006 // and trailing white space removed, as defined by Unicode. 1007 func TrimSpace(s string) string { 1008 // Fast path for ASCII: look for the first ASCII non-space byte 1009 start := 0 1010 for ; start < len(s); start++ { 1011 c := s[start] 1012 if c >= utf8.RuneSelf { 1013 // If we run into a non-ASCII byte, fall back to the 1014 // slower unicode-aware method on the remaining bytes 1015 return TrimFunc(s[start:], unicode.IsSpace) 1016 } 1017 if asciiSpace[c] == 0 { 1018 break 1019 } 1020 } 1021 1022 // Now look for the first ASCII non-space byte from the end 1023 stop := len(s) 1024 for ; stop > start; stop-- { 1025 c := s[stop-1] 1026 if c >= utf8.RuneSelf { 1027 // start has been already trimmed above, should trim end only 1028 return TrimRightFunc(s[start:stop], unicode.IsSpace) 1029 } 1030 if asciiSpace[c] == 0 { 1031 break 1032 } 1033 } 1034 1035 // At this point s[start:stop] starts and ends with an ASCII 1036 // non-space bytes, so we're done. 
Non-ASCII cases have already 1037 // been handled above. 1038 return s[start:stop] 1039 } 1040 1041 // TrimPrefix returns s without the provided leading prefix string. 1042 // If s doesn't start with prefix, s is returned unchanged. 1043 func TrimPrefix(s, prefix string) string { 1044 if HasPrefix(s, prefix) { 1045 return s[len(prefix):] 1046 } 1047 return s 1048 } 1049 1050 // TrimSuffix returns s without the provided trailing suffix string. 1051 // If s doesn't end with suffix, s is returned unchanged. 1052 func TrimSuffix(s, suffix string) string { 1053 if HasSuffix(s, suffix) { 1054 return s[:len(s)-len(suffix)] 1055 } 1056 return s 1057 } 1058 1059 // Replace returns a copy of the string s with the first n 1060 // non-overlapping instances of old replaced by new. 1061 // If old is empty, it matches at the beginning of the string 1062 // and after each UTF-8 sequence, yielding up to k+1 replacements 1063 // for a k-rune string. 1064 // If n < 0, there is no limit on the number of replacements. 1065 func Replace(s, old, new string, n int) string { 1066 if old == new || n == 0 { 1067 return s // avoid allocation 1068 } 1069 1070 // Compute number of replacements. 1071 if m := Count(s, old); m == 0 { 1072 return s // avoid allocation 1073 } else if n < 0 || m < n { 1074 n = m 1075 } 1076 1077 // Apply replacements to buffer. 1078 var b Builder 1079 b.Grow(len(s) + n*(len(new)-len(old))) 1080 start := 0 1081 for i := 0; i < n; i++ { 1082 j := start 1083 if len(old) == 0 { 1084 if i > 0 { 1085 _, wid := utf8.DecodeRuneInString(s[start:]) 1086 j += wid 1087 } 1088 } else { 1089 j += Index(s[start:], old) 1090 } 1091 b.WriteString(s[start:j]) 1092 b.WriteString(new) 1093 start = j + len(old) 1094 } 1095 b.WriteString(s[start:]) 1096 return b.String() 1097 } 1098 1099 // ReplaceAll returns a copy of the string s with all 1100 // non-overlapping instances of old replaced by new. 
1101 // If old is empty, it matches at the beginning of the string 1102 // and after each UTF-8 sequence, yielding up to k+1 replacements 1103 // for a k-rune string. 1104 func ReplaceAll(s, old, new string) string { 1105 return Replace(s, old, new, -1) 1106 } 1107 1108 // EqualFold reports whether s and t, interpreted as UTF-8 strings, 1109 // are equal under simple Unicode case-folding, which is a more general 1110 // form of case-insensitivity. 1111 func EqualFold(s, t string) bool { 1112 // ASCII fast path 1113 i := 0 1114 for ; i < len(s) && i < len(t); i++ { 1115 sr := s[i] 1116 tr := t[i] 1117 if sr|tr >= utf8.RuneSelf { 1118 goto hasUnicode 1119 } 1120 1121 // Easy case. 1122 if tr == sr { 1123 continue 1124 } 1125 1126 // Make sr < tr to simplify what follows. 1127 if tr < sr { 1128 tr, sr = sr, tr 1129 } 1130 // ASCII only, sr/tr must be upper/lower case 1131 if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' { 1132 continue 1133 } 1134 return false 1135 } 1136 // Check if we've exhausted both strings. 1137 return len(s) == len(t) 1138 1139 hasUnicode: 1140 s = s[i:] 1141 t = t[i:] 1142 for _, sr := range s { 1143 // If t is exhausted the strings are not equal. 1144 if len(t) == 0 { 1145 return false 1146 } 1147 1148 // Extract first rune from second string. 1149 var tr rune 1150 if t[0] < utf8.RuneSelf { 1151 tr, t = rune(t[0]), t[1:] 1152 } else { 1153 r, size := utf8.DecodeRuneInString(t) 1154 tr, t = r, t[size:] 1155 } 1156 1157 // If they match, keep going; if not, return false. 1158 1159 // Easy case. 1160 if tr == sr { 1161 continue 1162 } 1163 1164 // Make sr < tr to simplify what follows. 1165 if tr < sr { 1166 tr, sr = sr, tr 1167 } 1168 // Fast check for ASCII. 1169 if tr < utf8.RuneSelf { 1170 // ASCII only, sr/tr must be upper/lower case 1171 if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' { 1172 continue 1173 } 1174 return false 1175 } 1176 1177 // General case. 
SimpleFold(x) returns the next equivalent rune > x 1178 // or wraps around to smaller values. 1179 r := unicode.SimpleFold(sr) 1180 for r != sr && r < tr { 1181 r = unicode.SimpleFold(r) 1182 } 1183 if r == tr { 1184 continue 1185 } 1186 return false 1187 } 1188 1189 // First string is empty, so check if the second one is also empty. 1190 return len(t) == 0 1191 } 1192 1193 // Index returns the index of the first instance of substr in s, or -1 if substr is not present in s. 1194 func Index(s, substr string) int { 1195 n := len(substr) 1196 switch { 1197 case n == 0: 1198 return 0 1199 case n == 1: 1200 return IndexByte(s, substr[0]) 1201 case n == len(s): 1202 if substr == s { 1203 return 0 1204 } 1205 return -1 1206 case n > len(s): 1207 return -1 1208 case n <= bytealg.MaxLen: 1209 // Use brute force when s and substr both are small 1210 if len(s) <= bytealg.MaxBruteForce { 1211 return bytealg.IndexString(s, substr) 1212 } 1213 c0 := substr[0] 1214 c1 := substr[1] 1215 i := 0 1216 t := len(s) - n + 1 1217 fails := 0 1218 for i < t { 1219 if s[i] != c0 { 1220 // IndexByte is faster than bytealg.IndexString, so use it as long as 1221 // we're not getting lots of false positives. 1222 o := IndexByte(s[i+1:t], c0) 1223 if o < 0 { 1224 return -1 1225 } 1226 i += o + 1 1227 } 1228 if s[i+1] == c1 && s[i:i+n] == substr { 1229 return i 1230 } 1231 fails++ 1232 i++ 1233 // Switch to bytealg.IndexString when IndexByte produces too many false positives. 
1234 if fails > bytealg.Cutover(i) { 1235 r := bytealg.IndexString(s[i:], substr) 1236 if r >= 0 { 1237 return r + i 1238 } 1239 return -1 1240 } 1241 } 1242 return -1 1243 } 1244 c0 := substr[0] 1245 c1 := substr[1] 1246 i := 0 1247 t := len(s) - n + 1 1248 fails := 0 1249 for i < t { 1250 if s[i] != c0 { 1251 o := IndexByte(s[i+1:t], c0) 1252 if o < 0 { 1253 return -1 1254 } 1255 i += o + 1 1256 } 1257 if s[i+1] == c1 && s[i:i+n] == substr { 1258 return i 1259 } 1260 i++ 1261 fails++ 1262 if fails >= 4+i>>4 && i < t { 1263 // See comment in ../bytes/bytes.go. 1264 j := bytealg.IndexRabinKarp(s[i:], substr) 1265 if j < 0 { 1266 return -1 1267 } 1268 return i + j 1269 } 1270 } 1271 return -1 1272 } 1273 1274 // Cut slices s around the first instance of sep, 1275 // returning the text before and after sep. 1276 // The found result reports whether sep appears in s. 1277 // If sep does not appear in s, cut returns s, "", false. 1278 func Cut(s, sep string) (before, after string, found bool) { 1279 if i := Index(s, sep); i >= 0 { 1280 return s[:i], s[i+len(sep):], true 1281 } 1282 return s, "", false 1283 } 1284 1285 // CutPrefix returns s without the provided leading prefix string 1286 // and reports whether it found the prefix. 1287 // If s doesn't start with prefix, CutPrefix returns s, false. 1288 // If prefix is the empty string, CutPrefix returns s, true. 1289 func CutPrefix(s, prefix string) (after string, found bool) { 1290 if !HasPrefix(s, prefix) { 1291 return s, false 1292 } 1293 return s[len(prefix):], true 1294 } 1295 1296 // CutSuffix returns s without the provided ending suffix string 1297 // and reports whether it found the suffix. 1298 // If s doesn't end with suffix, CutSuffix returns s, false. 1299 // If suffix is the empty string, CutSuffix returns s, true. 1300 func CutSuffix(s, suffix string) (before string, found bool) { 1301 if !HasSuffix(s, suffix) { 1302 return s, false 1303 } 1304 return s[:len(s)-len(suffix)], true 1305 }