github.com/pgavlin/text@v0.0.0-20240419000839-8438d0a47805/strings.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package strings implements simple functions to manipulate UTF-8 encoded strings. 6 // 7 // For information about UTF-8 strings in Go, see https://blog.golang.org/strings. 8 package text 9 10 import ( 11 "reflect" 12 "unicode" 13 "unsafe" 14 15 "github.com/pgavlin/text/internal/bytealg" 16 "github.com/pgavlin/text/utf8" 17 ) 18 19 type String interface { 20 ~string | ~[]byte 21 } 22 23 func Empty[S String]() S { 24 var s S 25 return s 26 } 27 28 func IsEmpty[S String](s S) bool { 29 return len(s) == 0 30 } 31 32 func ToRunes[S String](s S) []rune { 33 return []rune(bytealg.AsString(s)) 34 } 35 36 func ToString[S String](r []rune) S { 37 s := string(r) 38 if isString[S]() { 39 return S(s) 40 } 41 return S(unsafe.Slice(unsafe.StringData(s), len(s))) 42 } 43 44 func isString[S String]() bool { 45 var s S 46 return reflect.TypeOf(s).Kind() == reflect.String 47 } 48 49 const maxInt = int(^uint(0) >> 1) 50 51 // explode splits s into a slice of UTF-8 strings, 52 // one string per Unicode character up to a maximum of n (n < 0 means no limit). 53 // Invalid UTF-8 bytes are sliced individually. 54 func explode[S String](s S, n int) []S { 55 l := utf8.RuneCount(s) 56 if n < 0 || n > l { 57 n = l 58 } 59 a := make([]S, n) 60 for i := 0; i < n-1; i++ { 61 _, size := utf8.DecodeRune(s) 62 a[i] = s[:size] 63 s = s[size:] 64 } 65 if n > 0 { 66 a[n-1] = s 67 } 68 return a 69 } 70 71 // Count counts the number of non-overlapping instances of substr in s. 72 // If substr is an empty string, Count returns 1 + the number of Unicode code points in s. 
73 func Count[S1, S2 String](s S1, substr S2) int { 74 // special case 75 if len(substr) == 0 { 76 return utf8.RuneCount(s) + 1 77 } 78 if len(substr) == 1 { 79 return bytealg.CountString(s, substr[0]) 80 } 81 n := 0 82 for { 83 i := Index(s, substr) 84 if i == -1 { 85 return n 86 } 87 n++ 88 s = s[i+len(substr):] 89 } 90 } 91 92 // Contains reports whether substr is within s. 93 func Contains[S1, S2 String](s S1, substr S2) bool { 94 return Index(s, substr) >= 0 95 } 96 97 // ContainsAny reports whether any Unicode code points in chars are within s. 98 func ContainsAny[S1, S2 String](s S1, chars S2) bool { 99 return IndexAny(s, chars) >= 0 100 } 101 102 // ContainsRune reports whether the Unicode code point r is within s. 103 func ContainsRune[S String](s S, r rune) bool { 104 return IndexRune(s, r) >= 0 105 } 106 107 // ContainsFunc reports whether any Unicode code points r within s satisfy f(r). 108 func ContainsFunc[S String](s S, f func(rune) bool) bool { 109 return IndexFunc(s, f) >= 0 110 } 111 112 // LastIndex returns the index of the last instance of substr in s, or -1 if substr is not present in s. 
func LastIndex[S1, S2 String](s S1, substr S2) int {
	n := len(substr)
	switch {
	case n == 0:
		// The empty substr is reported at the very end of s.
		return len(s)
	case n == 1:
		// Single-byte needle: a backwards byte scan suffices.
		return LastIndexByte(s, substr[0])
	case n == len(s):
		// Same length: the only possible match position is 0.
		if Equal(substr, s) {
			return 0
		}
		return -1
	case n > len(s):
		// substr is longer than s and therefore cannot occur in it.
		return -1
	}
	// Rabin-Karp search from the end of the string
	hashss, pow := bytealg.HashStrRev(substr)
	last := len(s) - n
	// Hash the final n bytes of s, consuming them back to front.
	var h uint32
	for i := len(s) - 1; i >= last; i-- {
		h = h*bytealg.PrimeRK + uint32(s[i])
	}
	// A matching hash is only a candidate; Equal confirms the match.
	if h == hashss && Equal(s[last:], substr) {
		return last
	}
	// Slide the n-byte window one byte to the left per iteration: mix in
	// s[i] and subtract the contribution of s[i+n], which leaves the window.
	for i := last - 1; i >= 0; i-- {
		h *= bytealg.PrimeRK
		h += uint32(s[i])
		h -= pow * uint32(s[i+n])
		if h == hashss && Equal(s[i:i+n], substr) {
			return i
		}
	}
	return -1
}

// IndexByte returns the index of the first instance of c in s, or -1 if c is not present in s.
// The search is delegated to bytealg.IndexByteString.
func IndexByte[S String](s S, c byte) int {
	return bytealg.IndexByteString(s, c)
}

// IndexRune returns the index of the first instance of the Unicode code point
// r, or -1 if rune is not present in s.
// If r is utf8.RuneError, it returns the first instance of any
// invalid UTF-8 byte sequence.
func IndexRune[S String](s S, r rune) int {
	switch {
	case 0 <= r && r < utf8.RuneSelf:
		// r fits in a single byte, so a byte search is enough.
		return IndexByte(s, byte(r))
	case r == utf8.RuneError:
		// Walk s rune by rune and report the first position that decodes
		// to utf8.RuneError.
		for i, r := range bytealg.AsString(s) {
			if r == utf8.RuneError {
				return i
			}
		}
		return -1
	case !utf8.ValidRune(r):
		// A rune that utf8.ValidRune rejects is never reported as present.
		return -1
	default:
		// Multi-byte rune: search for its string conversion as a substring.
		return Index(s, string(r))
	}
}

// IndexAny returns the index of the first instance of any Unicode code point
// from chars in s, or -1 if no Unicode code point from chars is present in s.
func IndexAny[S1, S2 String](s S1, chars S2) int {
	if IsEmpty(chars) {
		// Avoid scanning all of s.
		return -1
	}
	if len(chars) == 1 {
		// Avoid scanning all of s.
		// A lone byte >= utf8.RuneSelf is searched for as utf8.RuneError.
		r := rune(chars[0])
		if r >= utf8.RuneSelf {
			r = utf8.RuneError
		}
		return IndexRune(s, r)
	}
	if len(s) > 8 {
		// For longer s, try the ASCII bit set: when every byte of chars is
		// ASCII, s can be scanned byte by byte with a set lookup.
		if as, isASCII := makeASCIISet(chars); isASCII {
			for i := 0; i < len(s); i++ {
				if as.contains(s[i]) {
					return i
				}
			}
			return -1
		}
	}
	// General case: test each rune of s for membership in chars.
	for i, c := range bytealg.AsString(s) {
		if IndexRune(chars, c) >= 0 {
			return i
		}
	}
	return -1
}

// LastIndexAny returns the index of the last instance of any Unicode code
// point from chars in s, or -1 if no Unicode code point from chars is
// present in s.
func LastIndexAny[S1, S2 String](s S1, chars S2) int {
	if IsEmpty(chars) {
		// Avoid scanning all of s.
		return -1
	}
	if len(s) == 1 {
		// One-byte s: look that byte up in chars, treating a byte
		// >= utf8.RuneSelf as utf8.RuneError.
		rc := rune(s[0])
		if rc >= utf8.RuneSelf {
			rc = utf8.RuneError
		}
		if IndexRune(chars, rc) >= 0 {
			return 0
		}
		return -1
	}
	if len(s) > 8 {
		// For longer s, try the ASCII bit set and scan s backwards by byte.
		if as, isASCII := makeASCIISet(chars); isASCII {
			for i := len(s) - 1; i >= 0; i-- {
				if as.contains(s[i]) {
					return i
				}
			}
			return -1
		}
	}
	if len(chars) == 1 {
		// One-byte chars: compare each rune of s, decoded from the end,
		// against that single rune (utf8.RuneError if the byte is >= utf8.RuneSelf).
		rc := rune(chars[0])
		if rc >= utf8.RuneSelf {
			rc = utf8.RuneError
		}
		for i := len(s); i > 0; {
			r, size := utf8.DecodeLastRune(s[:i])
			i -= size
			if rc == r {
				return i
			}
		}
		return -1
	}
	// General case: decode s from the end and test each rune against chars.
	for i := len(s); i > 0; {
		r, size := utf8.DecodeLastRune(s[:i])
		i -= size
		if IndexRune(chars, r) >= 0 {
			return i
		}
	}
	return -1
}

// LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
func LastIndexByte[S String](s S, c byte) int {
	// Plain backwards scan over the bytes of s.
	for i := len(s) - 1; i >= 0; i-- {
		if s[i] == c {
			return i
		}
	}
	return -1
}

// Generic split: splits after each instance of sep,
// including sepSave bytes of sep in the subarrays.
273 func genSplit[S1, S2 String](s S1, sep S2, sepSave, n int) []S1 { 274 if n == 0 { 275 return nil 276 } 277 if IsEmpty(sep) { 278 return explode(s, n) 279 } 280 if n < 0 { 281 n = Count(s, sep) + 1 282 } 283 284 if n > len(s)+1 { 285 n = len(s) + 1 286 } 287 a := make([]S1, n) 288 n-- 289 i := 0 290 for i < n { 291 m := Index(s, sep) 292 if m < 0 { 293 break 294 } 295 a[i] = s[:m+sepSave] 296 s = s[m+len(sep):] 297 i++ 298 } 299 a[i] = s 300 return a[:i+1] 301 } 302 303 // SplitN slices s into substrings separated by sep and returns a slice of 304 // the substrings between those separators. 305 // 306 // The count determines the number of substrings to return: 307 // 308 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 309 // n == 0: the result is nil (zero substrings) 310 // n < 0: all substrings 311 // 312 // Edge cases for s and sep (for example, empty strings) are handled 313 // as described in the documentation for Split. 314 // 315 // To split around the first instance of a separator, see Cut. 316 func SplitN[S1, S2 String](s S1, sep S2, n int) []S1 { return genSplit(s, sep, 0, n) } 317 318 // SplitAfterN slices s into substrings after each instance of sep and 319 // returns a slice of those substrings. 320 // 321 // The count determines the number of substrings to return: 322 // 323 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 324 // n == 0: the result is nil (zero substrings) 325 // n < 0: all substrings 326 // 327 // Edge cases for s and sep (for example, empty strings) are handled 328 // as described in the documentation for SplitAfter. 329 func SplitAfterN[S1, S2 String](s S1, sep S2, n int) []S1 { 330 return genSplit(s, sep, len(sep), n) 331 } 332 333 // Split slices s into all substrings separated by sep and returns a slice of 334 // the substrings between those separators. 
335 // 336 // If s does not contain sep and sep is not empty, Split returns a 337 // slice of length 1 whose only element is s. 338 // 339 // If sep is empty, Split splits after each UTF-8 sequence. If both s 340 // and sep are empty, Split returns an empty slice. 341 // 342 // It is equivalent to SplitN with a count of -1. 343 // 344 // To split around the first instance of a separator, see Cut. 345 func Split[S1, S2 String](s S1, sep S2) []S1 { return genSplit(s, sep, 0, -1) } 346 347 // SplitAfter slices s into all substrings after each instance of sep and 348 // returns a slice of those substrings. 349 // 350 // If s does not contain sep and sep is not empty, SplitAfter returns 351 // a slice of length 1 whose only element is s. 352 // 353 // If sep is empty, SplitAfter splits after each UTF-8 sequence. If 354 // both s and sep are empty, SplitAfter returns an empty slice. 355 // 356 // It is equivalent to SplitAfterN with a count of -1. 357 func SplitAfter[S1, S2 String](s S1, sep S2) []S1 { 358 return genSplit(s, sep, len(sep), -1) 359 } 360 361 var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} 362 363 // Fields splits the string s around each instance of one or more consecutive white space 364 // characters, as defined by unicode.IsSpace, returning a slice of substrings of s or an 365 // empty slice if s contains only white space. 366 func Fields[S String](s S) []S { 367 // First count the fields. 368 // This is an exact count if s is ASCII, otherwise it is an approximation. 369 n := 0 370 wasSpace := 1 371 // setBits is used to track which bits are set in the bytes of s. 372 setBits := uint8(0) 373 for i := 0; i < len(s); i++ { 374 r := s[i] 375 setBits |= r 376 isSpace := int(asciiSpace[r]) 377 n += wasSpace & ^isSpace 378 wasSpace = isSpace 379 } 380 381 if setBits >= utf8.RuneSelf { 382 // Some runes in the input string are not ASCII. 
383 return FieldsFunc(s, unicode.IsSpace) 384 } 385 // ASCII fast path 386 a := make([]S, n) 387 na := 0 388 fieldStart := 0 389 i := 0 390 // Skip spaces in the front of the input. 391 for i < len(s) && asciiSpace[s[i]] != 0 { 392 i++ 393 } 394 fieldStart = i 395 for i < len(s) { 396 if asciiSpace[s[i]] == 0 { 397 i++ 398 continue 399 } 400 a[na] = s[fieldStart:i] 401 na++ 402 i++ 403 // Skip spaces in between fields. 404 for i < len(s) && asciiSpace[s[i]] != 0 { 405 i++ 406 } 407 fieldStart = i 408 } 409 if fieldStart < len(s) { // Last field might end at EOF. 410 a[na] = s[fieldStart:] 411 } 412 return a 413 } 414 415 // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c) 416 // and returns an array of slices of s. If all code points in s satisfy f(c) or the 417 // string is empty, an empty slice is returned. 418 // 419 // FieldsFunc makes no guarantees about the order in which it calls f(c) 420 // and assumes that f always returns the same value for a given c. 421 func FieldsFunc[S String](s S, f func(rune) bool) []S { 422 // A span is used to record a slice of s of the form s[start:end]. 423 // The start index is inclusive and the end index is exclusive. 424 type span struct { 425 start int 426 end int 427 } 428 spans := make([]span, 0, 32) 429 430 // Find the field start and end indices. 431 // Doing this in a separate pass (rather than slicing the string s 432 // and collecting the result substrings right away) is significantly 433 // more efficient, possibly due to cache effects. 434 start := -1 // valid span start if >= 0 435 for end, rune := range bytealg.AsString(s) { 436 if f(rune) { 437 if start >= 0 { 438 spans = append(spans, span{start, end}) 439 // Set start to a negative value. 440 // Note: using -1 here consistently and reproducibly 441 // slows down this code by a several percent on amd64. 
442 start = ^start 443 } 444 } else { 445 if start < 0 { 446 start = end 447 } 448 } 449 } 450 451 // Last field might end at EOF. 452 if start >= 0 { 453 spans = append(spans, span{start, len(s)}) 454 } 455 456 // Create strings from recorded field indices. 457 a := make([]S, len(spans)) 458 for i, span := range spans { 459 a[i] = s[span.start:span.end] 460 } 461 462 return a 463 } 464 465 // Concat concatenates its inputs to create a single string. 466 func Concat[S1, S2 String](a S1, b S2) S1 { 467 c := make([]byte, len(a)+len(b)) 468 copy(c, a) 469 copy(c[len(a):], b) 470 return S1(c) 471 } 472 473 // Join concatenates the elements of its first argument to create a single string. The separator 474 // string sep is placed between elements in the resulting string. 475 func Join[S1, S2 String](elems []S1, sep S2) S1 { 476 switch len(elems) { 477 case 0: 478 return Empty[S1]() 479 case 1: 480 return elems[0] 481 } 482 483 var n int 484 if len(sep) > 0 { 485 if len(sep) >= maxInt/(len(elems)-1) { 486 panic("strings: Join output length overflow") 487 } 488 n += len(sep) * (len(elems) - 1) 489 } 490 for _, elem := range elems { 491 if len(elem) > maxInt-n { 492 panic("strings: Join output length overflow") 493 } 494 n += len(elem) 495 } 496 497 var b Builder[S1] 498 b.Grow(n) 499 b.WriteText(elems[0]) 500 for _, s := range elems[1:] { 501 WriteString(&b, sep) 502 b.WriteText(s) 503 } 504 return b.Text() 505 } 506 507 // HasPrefix tests whether the string s begins with prefix. 508 func HasPrefix[S1, S2 String](s S1, prefix S2) bool { 509 return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix) 510 } 511 512 // HasSuffix tests whether the string s ends with suffix. 513 func HasSuffix[S1, S2 String](s S1, suffix S2) bool { 514 return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix) 515 } 516 517 // Map returns a copy of the string s with all its characters modified 518 // according to the mapping function. 
// If mapping returns a negative value, the character is
// dropped from the string with no replacement.
func Map[S String](mapping func(rune) rune, s S) S {
	// In the worst case, the string can grow when mapped, making
	// things unpleasant. But it's so rare we barge in assuming it's
	// fine. It could also shrink but that falls out naturally.

	// The output buffer b is initialized on demand, the first
	// time a character differs.
	var b Builder[S]

	// Phase 1: find the first rune that the mapping changes. Everything
	// before it is copied verbatim and s is advanced past it.
	for i, c := range bytealg.AsString(s) {
		r := mapping(c)
		if r == c && c != utf8.RuneError {
			continue
		}

		var width int
		if c == utf8.RuneError {
			// Re-decode to learn the width of the sequence at i. Only a
			// width-1 sequence is treated as changed when the mapping
			// returned the decoded rune itself.
			c, width = utf8.DecodeRune(s[i:])
			if width != 1 && r == c {
				continue
			}
		} else {
			width = utf8.RuneLen(c)
		}

		b.Grow(len(s) + utf8.UTFMax)
		b.WriteText(s[:i])
		if r >= 0 {
			b.WriteRune(r)
		}

		// Leave only the unprocessed tail in s for phase 2.
		s = s[i+width:]
		break
	}

	// Fast path for unchanged input
	if b.Cap() == 0 { // didn't call b.Grow above
		return s
	}

	// Phase 2: map every remaining rune, dropping those mapped to a
	// negative value.
	for _, c := range bytealg.AsString(s) {
		r := mapping(c)

		if r >= 0 {
			// common case
			// Due to inlining, it is more performant to determine if WriteByte should be
			// invoked rather than always call WriteRune
			if r < utf8.RuneSelf {
				b.WriteByte(byte(r))
			} else {
				// r is not an ASCII rune.
				b.WriteRune(r)
			}
		}
	}

	return b.Text()
}

// Repeat returns a new string consisting of count copies of the string s.
//
// It panics if count is negative or if the result of (len(s) * count)
// overflows.
func Repeat[S String](s S, count int) S {
	switch count {
	case 0:
		return Empty[S]()
	case 1:
		return s
	}

	// Since we cannot return an error on overflow,
	// we should panic if the repeat will generate an overflow.
	// See golang.org/issue/16237.
	if count < 0 {
		panic("strings: negative Repeat count")
	}
	if len(s) >= maxInt/count {
		panic("strings: Repeat output length overflow")
	}
	n := len(s) * count

	if IsEmpty(s) {
		return Empty[S]()
	}

	// Past a certain chunk size it is counterproductive to use
	// larger chunks as the source of the write, as when the source
	// is too large we are basically just thrashing the CPU D-cache.
	// So if the result length is larger than an empirically-found
	// limit (8KB), we stop growing the source string once the limit
	// is reached and keep reusing the same source string - that
	// should therefore be always resident in the L1 cache - until we
	// have completed the construction of the result.
	// This yields significant speedups (up to +100%) in cases where
	// the result length is large (roughly, over L2 cache size).
	const chunkLimit = 8 * 1024
	chunkMax := n
	if n > chunkLimit {
		// Largest multiple of len(s) that fits in chunkLimit, or a single
		// copy of s when s alone is longer than chunkLimit.
		chunkMax = chunkLimit / len(s) * len(s)
		if chunkMax == 0 {
			chunkMax = len(s)
		}
	}

	var b Builder[S]
	b.Grow(n)
	b.WriteText(s)
	// Grow the output by appending a prefix of what has been written so far:
	// at most what is still missing, at most the current length, and at most
	// chunkMax bytes per step.
	for b.Len() < n {
		chunk := n - b.Len()
		if chunk > b.Len() {
			chunk = b.Len()
		}
		if chunk > chunkMax {
			chunk = chunkMax
		}
		b.WriteText(b.Text()[:chunk])
	}
	return b.Text()
}

// ToUpper returns s with all Unicode letters mapped to their upper case.
func ToUpper[S String](s S) S {
	// Scan once to learn whether s is pure ASCII and whether it contains
	// any lower-case ASCII letter at all.
	isASCII, hasLower := true, false
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c >= utf8.RuneSelf {
			isASCII = false
			break
		}
		hasLower = hasLower || ('a' <= c && c <= 'z')
	}

	if isASCII { // optimize for ASCII-only strings.
		if !hasLower {
			// Nothing to change: return s itself.
			return s
		}
		var (
			b   Builder[S]
			pos int // start of the run of unchanged bytes not yet written
		)
		b.Grow(len(s))
		for i := 0; i < len(s); i++ {
			c := s[i]
			if 'a' <= c && c <= 'z' {
				c -= 'a' - 'A'
				// Flush the unchanged run before the converted byte.
				if pos < i {
					b.WriteText(s[pos:i])
				}
				b.WriteByte(c)
				pos = i + 1
			}
		}
		if pos < len(s) {
			b.WriteText(s[pos:])
		}
		return b.Text()
	}
	return Map(unicode.ToUpper, s)
}

// ToLower returns s with all Unicode letters mapped to their lower case.
func ToLower[S String](s S) S {
	// Scan once to learn whether s is pure ASCII and whether it contains
	// any upper-case ASCII letter at all.
	isASCII, hasUpper := true, false
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c >= utf8.RuneSelf {
			isASCII = false
			break
		}
		hasUpper = hasUpper || ('A' <= c && c <= 'Z')
	}

	if isASCII { // optimize for ASCII-only strings.
		if !hasUpper {
			// Nothing to change: return s itself.
			return s
		}
		var (
			b   Builder[S]
			pos int // start of the run of unchanged bytes not yet written
		)
		b.Grow(len(s))
		for i := 0; i < len(s); i++ {
			c := s[i]
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
				// Flush the unchanged run before the converted byte.
				if pos < i {
					b.WriteText(s[pos:i])
				}
				b.WriteByte(c)
				pos = i + 1
			}
		}
		if pos < len(s) {
			b.WriteText(s[pos:])
		}
		return b.Text()
	}
	return Map(unicode.ToLower, s)
}

// ToTitle returns a copy of the string s with all Unicode letters mapped to
// their Unicode title case.
func ToTitle[S String](s S) S { return Map(unicode.ToTitle, s) }

// ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their
// upper case using the case mapping specified by c.
func ToUpperSpecial[S String](c unicode.SpecialCase, s S) S {
	return Map(c.ToUpper, s)
}

// ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their
// lower case using the case mapping specified by c.
733 func ToLowerSpecial[S String](c unicode.SpecialCase, s S) S { 734 return Map(c.ToLower, s) 735 } 736 737 // ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their 738 // Unicode title case, giving priority to the special casing rules. 739 func ToTitleSpecial[S String](c unicode.SpecialCase, s S) S { 740 return Map(c.ToTitle, s) 741 } 742 743 // ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences 744 // replaced by the replacement string, which may be empty. 745 func ToValidUTF8[S1, S2 String](s S1, replacement S2) S1 { 746 var b Builder[S1] 747 748 for i, c := range bytealg.AsString(s) { 749 if c != utf8.RuneError { 750 continue 751 } 752 753 _, wid := utf8.DecodeRune(s[i:]) 754 if wid == 1 { 755 b.Grow(len(s) + len(replacement)) 756 b.WriteText(s[:i]) 757 s = s[i:] 758 break 759 } 760 } 761 762 // Fast path for unchanged input 763 if b.Cap() == 0 { // didn't call b.Grow above 764 return s 765 } 766 767 invalid := false // previous byte was from an invalid UTF-8 sequence 768 for i := 0; i < len(s); { 769 c := s[i] 770 if c < utf8.RuneSelf { 771 i++ 772 invalid = false 773 b.WriteByte(c) 774 continue 775 } 776 _, wid := utf8.DecodeRune(s[i:]) 777 if wid == 1 { 778 i++ 779 if !invalid { 780 invalid = true 781 WriteString(&b, replacement) 782 } 783 continue 784 } 785 invalid = false 786 b.WriteText(s[i : i+wid]) 787 i += wid 788 } 789 790 return b.Text() 791 } 792 793 // isSeparator reports whether the rune could mark a word boundary. 794 // TODO: update when package unicode captures more of the properties. 
795 func isSeparator(r rune) bool { 796 // ASCII alphanumerics and underscore are not separators 797 if r <= 0x7F { 798 switch { 799 case '0' <= r && r <= '9': 800 return false 801 case 'a' <= r && r <= 'z': 802 return false 803 case 'A' <= r && r <= 'Z': 804 return false 805 case r == '_': 806 return false 807 } 808 return true 809 } 810 // Letters and digits are not separators 811 if unicode.IsLetter(r) || unicode.IsDigit(r) { 812 return false 813 } 814 // Otherwise, all we can do for now is treat spaces as separators. 815 return unicode.IsSpace(r) 816 } 817 818 // Title returns a copy of the string s with all Unicode letters that begin words 819 // mapped to their Unicode title case. 820 // 821 // Deprecated: The rule Title uses for word boundaries does not handle Unicode 822 // punctuation properly. Use golang.org/x/text/cases instead. 823 func Title[S String](s S) S { 824 // Use a closure here to remember state. 825 // Hackish but effective. Depends on Map scanning in order and calling 826 // the closure once per rune. 827 prev := ' ' 828 return Map( 829 func(r rune) rune { 830 if isSeparator(prev) { 831 prev = r 832 return unicode.ToTitle(r) 833 } 834 prev = r 835 return r 836 }, 837 s) 838 } 839 840 // TrimLeftFunc returns a slice of the string s with all leading 841 // Unicode code points c satisfying f(c) removed. 842 func TrimLeftFunc[S String](s S, f func(rune) bool) S { 843 i := indexFunc(s, f, false) 844 if i == -1 { 845 return Empty[S]() 846 } 847 return s[i:] 848 } 849 850 // TrimRightFunc returns a slice of the string s with all trailing 851 // Unicode code points c satisfying f(c) removed. 
852 func TrimRightFunc[S String](s S, f func(rune) bool) S { 853 i := lastIndexFunc(s, f, false) 854 if i >= 0 && s[i] >= utf8.RuneSelf { 855 _, wid := utf8.DecodeRune(s[i:]) 856 i += wid 857 } else { 858 i++ 859 } 860 return s[0:i] 861 } 862 863 // TrimFunc returns a slice of the string s with all leading 864 // and trailing Unicode code points c satisfying f(c) removed. 865 func TrimFunc[S String](s S, f func(rune) bool) S { 866 return TrimRightFunc(TrimLeftFunc(s, f), f) 867 } 868 869 // IndexFunc returns the index into s of the first Unicode 870 // code point satisfying f(c), or -1 if none do. 871 func IndexFunc[S String](s S, f func(rune) bool) int { 872 return indexFunc(s, f, true) 873 } 874 875 // LastIndexFunc returns the index into s of the last 876 // Unicode code point satisfying f(c), or -1 if none do. 877 func LastIndexFunc[S String](s S, f func(rune) bool) int { 878 return lastIndexFunc(s, f, true) 879 } 880 881 // indexFunc is the same as IndexFunc except that if 882 // truth==false, the sense of the predicate function is 883 // inverted. 884 func indexFunc[S String](s S, f func(rune) bool, truth bool) int { 885 for i, r := range bytealg.AsString(s) { 886 if f(r) == truth { 887 return i 888 } 889 } 890 return -1 891 } 892 893 // lastIndexFunc is the same as LastIndexFunc except that if 894 // truth==false, the sense of the predicate function is 895 // inverted. 896 func lastIndexFunc[S String](s S, f func(rune) bool, truth bool) int { 897 for i := len(s); i > 0; { 898 r, size := utf8.DecodeLastRune(s[0:i]) 899 i -= size 900 if f(r) == truth { 901 return i 902 } 903 } 904 return -1 905 } 906 907 // asciiSet is a 32-byte value, where each bit represents the presence of a 908 // given ASCII character in the set. The 128-bits of the lower 16 bytes, 909 // starting with the least-significant bit of the lowest word to the 910 // most-significant bit of the highest word, map to the full range of all 911 // 128 ASCII characters. 
The 128-bits of the upper 16 bytes will be zeroed, 912 // ensuring that any non-ASCII character will be reported as not in the set. 913 // This allocates a total of 32 bytes even though the upper half 914 // is unused to avoid bounds checks in asciiSet.contains. 915 type asciiSet [8]uint32 916 917 // makeASCIISet creates a set of ASCII characters and reports whether all 918 // characters in chars are ASCII. 919 func makeASCIISet[S String](chars S) (as asciiSet, ok bool) { 920 for i := 0; i < len(chars); i++ { 921 c := chars[i] 922 if c >= utf8.RuneSelf { 923 return as, false 924 } 925 as[c/32] |= 1 << (c % 32) 926 } 927 return as, true 928 } 929 930 // contains reports whether c is inside the set. 931 func (as *asciiSet) contains(c byte) bool { 932 return (as[c/32] & (1 << (c % 32))) != 0 933 } 934 935 // Trim returns a slice of the string s with all leading and 936 // trailing Unicode code points contained in cutset removed. 937 func Trim[S1, S2 String](s S1, cutset S2) S1 { 938 if IsEmpty(s) || IsEmpty(cutset) { 939 return s 940 } 941 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { 942 return trimLeftByte(trimRightByte(s, cutset[0]), cutset[0]) 943 } 944 if as, ok := makeASCIISet(cutset); ok { 945 return trimLeftASCII(trimRightASCII(s, &as), &as) 946 } 947 return trimLeftUnicode(trimRightUnicode(s, cutset), cutset) 948 } 949 950 // TrimLeft returns a slice of the string s with all leading 951 // Unicode code points contained in cutset removed. 952 // 953 // To remove a prefix, use TrimPrefix instead. 
954 func TrimLeft[S1, S2 String](s S1, cutset S2) S1 { 955 if IsEmpty(s) || IsEmpty(cutset) { 956 return s 957 } 958 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { 959 return trimLeftByte(s, cutset[0]) 960 } 961 if as, ok := makeASCIISet(cutset); ok { 962 return trimLeftASCII(s, &as) 963 } 964 return trimLeftUnicode(s, cutset) 965 } 966 967 func trimLeftByte[S String](s S, c byte) S { 968 for len(s) > 0 && s[0] == c { 969 s = s[1:] 970 } 971 return s 972 } 973 974 func trimLeftASCII[S String](s S, as *asciiSet) S { 975 for len(s) > 0 { 976 if !as.contains(s[0]) { 977 break 978 } 979 s = s[1:] 980 } 981 return s 982 } 983 984 func trimLeftUnicode[S1, S2 String](s S1, cutset S2) S1 { 985 for len(s) > 0 { 986 r, n := rune(s[0]), 1 987 if r >= utf8.RuneSelf { 988 r, n = utf8.DecodeRune(s) 989 } 990 if !ContainsRune(cutset, r) { 991 break 992 } 993 s = s[n:] 994 } 995 return s 996 } 997 998 // TrimRight returns a slice of the string s, with all trailing 999 // Unicode code points contained in cutset removed. 1000 // 1001 // To remove a suffix, use TrimSuffix instead. 
1002 func TrimRight[S1, S2 String](s S1, cutset S2) S1 { 1003 if IsEmpty(s) || IsEmpty(cutset) { 1004 return s 1005 } 1006 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { 1007 return trimRightByte(s, cutset[0]) 1008 } 1009 if as, ok := makeASCIISet(cutset); ok { 1010 return trimRightASCII(s, &as) 1011 } 1012 return trimRightUnicode(s, cutset) 1013 } 1014 1015 func trimRightByte[S String](s S, c byte) S { 1016 for len(s) > 0 && s[len(s)-1] == c { 1017 s = s[:len(s)-1] 1018 } 1019 return s 1020 } 1021 1022 func trimRightASCII[S String](s S, as *asciiSet) S { 1023 for len(s) > 0 { 1024 if !as.contains(s[len(s)-1]) { 1025 break 1026 } 1027 s = s[:len(s)-1] 1028 } 1029 return s 1030 } 1031 1032 func trimRightUnicode[S1, S2 String](s S1, cutset S2) S1 { 1033 for len(s) > 0 { 1034 r, n := rune(s[len(s)-1]), 1 1035 if r >= utf8.RuneSelf { 1036 r, n = utf8.DecodeLastRune(s) 1037 } 1038 if !ContainsRune(cutset, r) { 1039 break 1040 } 1041 s = s[:len(s)-n] 1042 } 1043 return s 1044 } 1045 1046 // TrimSpace returns a slice of the string s, with all leading 1047 // and trailing white space removed, as defined by Unicode. 
1048 func TrimSpace[S String](s S) S { 1049 // Fast path for ASCII: look for the first ASCII non-space byte 1050 start := 0 1051 for ; start < len(s); start++ { 1052 c := s[start] 1053 if c >= utf8.RuneSelf { 1054 // If we run into a non-ASCII byte, fall back to the 1055 // slower unicode-aware method on the remaining bytes 1056 return TrimFunc(s[start:], unicode.IsSpace) 1057 } 1058 if asciiSpace[c] == 0 { 1059 break 1060 } 1061 } 1062 1063 // Now look for the first ASCII non-space byte from the end 1064 stop := len(s) 1065 for ; stop > start; stop-- { 1066 c := s[stop-1] 1067 if c >= utf8.RuneSelf { 1068 // start has been already trimmed above, should trim end only 1069 return TrimRightFunc(s[start:stop], unicode.IsSpace) 1070 } 1071 if asciiSpace[c] == 0 { 1072 break 1073 } 1074 } 1075 1076 // At this point s[start:stop] starts and ends with an ASCII 1077 // non-space bytes, so we're done. Non-ASCII cases have already 1078 // been handled above. 1079 return s[start:stop] 1080 } 1081 1082 // TrimPrefix returns s without the provided leading prefix string. 1083 // If s doesn't start with prefix, s is returned unchanged. 1084 func TrimPrefix[S1, S2 String](s S1, prefix S2) S1 { 1085 if HasPrefix(s, prefix) { 1086 return s[len(prefix):] 1087 } 1088 return s 1089 } 1090 1091 // TrimSuffix returns s without the provided trailing suffix string. 1092 // If s doesn't end with suffix, s is returned unchanged. 1093 func TrimSuffix[S1, S2 String](s S1, suffix S2) S1 { 1094 if HasSuffix(s, suffix) { 1095 return s[:len(s)-len(suffix)] 1096 } 1097 return s 1098 } 1099 1100 // Replace returns a copy of the string s with the first n 1101 // non-overlapping instances of old replaced by new. 1102 // If old is empty, it matches at the beginning of the string 1103 // and after each UTF-8 sequence, yielding up to k+1 replacements 1104 // for a k-rune string. 1105 // If n < 0, there is no limit on the number of replacements. 
func Replace[S1, S2, S3 String](s S1, old S2, new S3, n int) S1 {
	if Equal(old, new) || n == 0 {
		return s // avoid allocation
	}

	// Compute number of replacements.
	if m := Count(s, old); m == 0 {
		return s // avoid allocation
	} else if n < 0 || m < n {
		n = m
	}

	// Apply replacements to buffer.
	var b Builder[S1]
	// Reserve the final size: each replacement changes the length by
	// len(new)-len(old).
	b.Grow(len(s) + n*(len(new)-len(old)))
	start := 0 // index in s of the first byte not yet copied
	for i := 0; i < n; i++ {
		j := start // index in s where this replacement is inserted
		if len(old) == 0 {
			// Empty old: the first match is at the start of s; every later
			// one is one UTF-8 sequence further on.
			if i > 0 {
				_, wid := utf8.DecodeRune(s[start:])
				j += wid
			}
		} else {
			j += Index(s[start:], old)
		}
		b.WriteText(s[start:j])
		WriteString(&b, new)
		start = j + len(old)
	}
	// Copy whatever follows the last replacement.
	b.WriteText(s[start:])
	return b.Text()
}

// ReplaceAll returns a copy of the string s with all
// non-overlapping instances of old replaced by new.
// If old is empty, it matches at the beginning of the string
// and after each UTF-8 sequence, yielding up to k+1 replacements
// for a k-rune string.
func ReplaceAll[S1, S2, S3 String](s S1, old S2, new S3) S1 {
	return Replace(s, old, new, -1)
}

// EqualFold reports whether s and t, interpreted as UTF-8 strings,
// are equal under simple Unicode case-folding, which is a more general
// form of case-insensitivity.
func EqualFold[S1, S2 String](s S1, t S2) bool {
	// ASCII fast path
	i := 0
	for ; i < len(s) && i < len(t); i++ {
		sr := s[i]
		tr := t[i]
		// If either byte is non-ASCII, switch to the rune-based comparison
		// starting at this position.
		if sr|tr >= utf8.RuneSelf {
			goto hasUnicode
		}

		// Easy case.
		if tr == sr {
			continue
		}

		// Make sr < tr to simplify what follows.
		if tr < sr {
			tr, sr = sr, tr
		}
		// ASCII only, sr/tr must be upper/lower case
		if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
			continue
		}
		return false
	}
	// Check if we've exhausted both strings.
	return len(s) == len(t)

hasUnicode:
	// Drop the prefix already compared by the ASCII fast path.
	s = s[i:]
	t = t[i:]
	for _, sr := range bytealg.AsString(s) {
		// If t is exhausted the strings are not equal.
		if len(t) == 0 {
			return false
		}

		// Extract first rune from second string.
		var tr rune
		if t[0] < utf8.RuneSelf {
			tr, t = rune(t[0]), t[1:]
		} else {
			r, size := utf8.DecodeRune(t)
			tr, t = r, t[size:]
		}

		// If they match, keep going; if not, return false.

		// Easy case.
		if tr == sr {
			continue
		}

		// Make sr < tr to simplify what follows.
		if tr < sr {
			tr, sr = sr, tr
		}
		// Fast check for ASCII.
		if tr < utf8.RuneSelf {
			// ASCII only, sr/tr must be upper/lower case
			if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
				continue
			}
			return false
		}

		// General case. SimpleFold(x) returns the next equivalent rune > x
		// or wraps around to smaller values.
		r := unicode.SimpleFold(sr)
		for r != sr && r < tr {
			r = unicode.SimpleFold(r)
		}
		if r == tr {
			continue
		}
		return false
	}

	// First string is empty, so check if the second one is also empty.
	return len(t) == 0
}

// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
1235 func Index[S1, S2 String](s S1, substr S2) int { 1236 n := len(substr) 1237 switch { 1238 case n == 0: 1239 return 0 1240 case n == 1: 1241 return IndexByte(s, substr[0]) 1242 case n == len(s): 1243 if Equal(substr, s) { 1244 return 0 1245 } 1246 return -1 1247 case n > len(s): 1248 return -1 1249 case n <= bytealg.MaxLen: 1250 // Use brute force when s and substr both are small 1251 if len(s) <= bytealg.MaxBruteForce { 1252 return bytealg.IndexString(s, substr) 1253 } 1254 c0 := substr[0] 1255 c1 := substr[1] 1256 i := 0 1257 t := len(s) - n + 1 1258 fails := 0 1259 for i < t { 1260 if s[i] != c0 { 1261 // IndexByte is faster than bytealg.IndexString, so use it as long as 1262 // we're not getting lots of false positives. 1263 o := IndexByte(s[i+1:t], c0) 1264 if o < 0 { 1265 return -1 1266 } 1267 i += o + 1 1268 } 1269 if s[i+1] == c1 && Equal(s[i:i+n], substr) { 1270 return i 1271 } 1272 fails++ 1273 i++ 1274 // Switch to bytealg.IndexString when IndexByte produces too many false positives. 1275 if fails > bytealg.Cutover(i) { 1276 r := bytealg.IndexString(s[i:], substr) 1277 if r >= 0 { 1278 return r + i 1279 } 1280 return -1 1281 } 1282 } 1283 return -1 1284 } 1285 c0 := substr[0] 1286 c1 := substr[1] 1287 i := 0 1288 t := len(s) - n + 1 1289 fails := 0 1290 for i < t { 1291 if s[i] != c0 { 1292 o := IndexByte(s[i+1:t], c0) 1293 if o < 0 { 1294 return -1 1295 } 1296 i += o + 1 1297 } 1298 if s[i+1] == c1 && Equal(s[i:i+n], substr) { 1299 return i 1300 } 1301 i++ 1302 fails++ 1303 if fails >= 4+i>>4 && i < t { 1304 // See comment in ../bytes/bytes.go. 1305 j := bytealg.IndexRabinKarp(s[i:], substr) 1306 if j < 0 { 1307 return -1 1308 } 1309 return i + j 1310 } 1311 } 1312 return -1 1313 } 1314 1315 // Cut slices s around the first instance of sep, 1316 // returning the text before and after sep. 1317 // The found result reports whether sep appears in s. 1318 // If sep does not appear in s, cut returns s, "", false. 
1319 func Cut[S1, S2 String](s S1, sep S2) (before, after S1, found bool) { 1320 if i := Index(s, sep); i >= 0 { 1321 return s[:i], s[i+len(sep):], true 1322 } 1323 return s, Empty[S1](), false 1324 } 1325 1326 // CutPrefix returns s without the provided leading prefix string 1327 // and reports whether it found the prefix. 1328 // If s doesn't start with prefix, CutPrefix returns s, false. 1329 // If prefix is the empty string, CutPrefix returns s, true. 1330 func CutPrefix[S1, S2 String](s S1, prefix S2) (after S1, found bool) { 1331 if !HasPrefix(s, prefix) { 1332 return s, false 1333 } 1334 return s[len(prefix):], true 1335 } 1336 1337 // CutSuffix returns s without the provided ending suffix string 1338 // and reports whether it found the suffix. 1339 // If s doesn't end with suffix, CutSuffix returns s, false. 1340 // If suffix is the empty string, CutSuffix returns s, true. 1341 func CutSuffix[S1, S2 String](s S1, suffix S2) (before S1, found bool) { 1342 if !HasSuffix(s, suffix) { 1343 return s, false 1344 } 1345 return s[:len(s)-len(suffix)], true 1346 }