github.com/mh-cbon/go@v0.0.0-20160603070303-9e112a3fe4c0/src/strings/strings.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package strings implements simple functions to manipulate UTF-8 encoded strings. 6 // 7 // For information about UTF-8 strings in Go, see https://blog.golang.org/strings. 8 package strings 9 10 import ( 11 "unicode" 12 "unicode/utf8" 13 ) 14 15 // explode splits s into a slice of UTF-8 strings, 16 // one string per Unicode character up to a maximum of n (n < 0 means no limit). 17 // Invalid UTF-8 sequences become correct encodings of U+FFFD. 18 func explode(s string, n int) []string { 19 l := utf8.RuneCountInString(s) 20 if n < 0 || n > l { 21 n = l 22 } 23 a := make([]string, n) 24 for i := 0; i < n-1; i++ { 25 ch, size := utf8.DecodeRuneInString(s) 26 a[i] = s[:size] 27 s = s[size:] 28 if ch == utf8.RuneError { 29 a[i] = string(utf8.RuneError) 30 } 31 } 32 if n > 0 { 33 a[n-1] = s 34 } 35 return a 36 } 37 38 // primeRK is the prime base used in Rabin-Karp algorithm. 39 const primeRK = 16777619 40 41 // hashStr returns the hash and the appropriate multiplicative 42 // factor for use in Rabin-Karp algorithm. 43 func hashStr(sep string) (uint32, uint32) { 44 hash := uint32(0) 45 for i := 0; i < len(sep); i++ { 46 hash = hash*primeRK + uint32(sep[i]) 47 } 48 var pow, sq uint32 = 1, primeRK 49 for i := len(sep); i > 0; i >>= 1 { 50 if i&1 != 0 { 51 pow *= sq 52 } 53 sq *= sq 54 } 55 return hash, pow 56 } 57 58 // hashStrRev returns the hash of the reverse of sep and the 59 // appropriate multiplicative factor for use in Rabin-Karp algorithm. 60 func hashStrRev(sep string) (uint32, uint32) { 61 hash := uint32(0) 62 for i := len(sep) - 1; i >= 0; i-- { 63 hash = hash*primeRK + uint32(sep[i]) 64 } 65 var pow, sq uint32 = 1, primeRK 66 for i := len(sep); i > 0; i >>= 1 { 67 if i&1 != 0 { 68 pow *= sq 69 } 70 sq *= sq 71 } 72 return hash, pow 73 } 74 75 // Count counts the number of non-overlapping instances of sep in s. 76 // If sep is an empty string, Count returns 1 + the number of Unicode code points in s. 77 func Count(s, sep string) int { 78 n := 0 79 // special cases 80 switch { 81 case len(sep) == 0: 82 return utf8.RuneCountInString(s) + 1 83 case len(sep) == 1: 84 // special case worth making fast 85 c := sep[0] 86 for i := 0; i < len(s); i++ { 87 if s[i] == c { 88 n++ 89 } 90 } 91 return n 92 case len(sep) > len(s): 93 return 0 94 case len(sep) == len(s): 95 if sep == s { 96 return 1 97 } 98 return 0 99 } 100 // Rabin-Karp search 101 hashsep, pow := hashStr(sep) 102 h := uint32(0) 103 for i := 0; i < len(sep); i++ { 104 h = h*primeRK + uint32(s[i]) 105 } 106 lastmatch := 0 107 if h == hashsep && s[:len(sep)] == sep { 108 n++ 109 lastmatch = len(sep) 110 } 111 for i := len(sep); i < len(s); { 112 h *= primeRK 113 h += uint32(s[i]) 114 h -= pow * uint32(s[i-len(sep)]) 115 i++ 116 if h == hashsep && lastmatch <= i-len(sep) && s[i-len(sep):i] == sep { 117 n++ 118 lastmatch = i 119 } 120 } 121 return n 122 } 123 124 // Contains reports whether substr is within s. 125 func Contains(s, substr string) bool { 126 return Index(s, substr) >= 0 127 } 128 129 // ContainsAny reports whether any Unicode code points in chars are within s. 130 func ContainsAny(s, chars string) bool { 131 return IndexAny(s, chars) >= 0 132 } 133 134 // ContainsRune reports whether the Unicode code point r is within s. 135 func ContainsRune(s string, r rune) bool { 136 return IndexRune(s, r) >= 0 137 } 138 139 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s. 140 func LastIndex(s, sep string) int { 141 n := len(sep) 142 switch { 143 case n == 0: 144 return len(s) 145 case n == 1: 146 return LastIndexByte(s, sep[0]) 147 case n == len(s): 148 if sep == s { 149 return 0 150 } 151 return -1 152 case n > len(s): 153 return -1 154 } 155 // Rabin-Karp search from the end of the string 156 hashsep, pow := hashStrRev(sep) 157 last := len(s) - n 158 var h uint32 159 for i := len(s) - 1; i >= last; i-- { 160 h = h*primeRK + uint32(s[i]) 161 } 162 if h == hashsep && s[last:] == sep { 163 return last 164 } 165 for i := last - 1; i >= 0; i-- { 166 h *= primeRK 167 h += uint32(s[i]) 168 h -= pow * uint32(s[i+n]) 169 if h == hashsep && s[i:i+n] == sep { 170 return i 171 } 172 } 173 return -1 174 } 175 176 // IndexRune returns the index of the first instance of the Unicode code point 177 // r, or -1 if rune is not present in s. 178 func IndexRune(s string, r rune) int { 179 switch { 180 case r < utf8.RuneSelf: 181 return IndexByte(s, byte(r)) 182 default: 183 for i, c := range s { 184 if c == r { 185 return i 186 } 187 } 188 } 189 return -1 190 } 191 192 // IndexAny returns the index of the first instance of any Unicode code point 193 // from chars in s, or -1 if no Unicode code point from chars is present in s. 194 func IndexAny(s, chars string) int { 195 if len(chars) > 0 { 196 for i, c := range s { 197 for _, m := range chars { 198 if c == m { 199 return i 200 } 201 } 202 } 203 } 204 return -1 205 } 206 207 // LastIndexAny returns the index of the last instance of any Unicode code 208 // point from chars in s, or -1 if no Unicode code point from chars is 209 // present in s. 210 func LastIndexAny(s, chars string) int { 211 if len(chars) > 0 { 212 for i := len(s); i > 0; { 213 rune, size := utf8.DecodeLastRuneInString(s[0:i]) 214 i -= size 215 for _, m := range chars { 216 if rune == m { 217 return i 218 } 219 } 220 } 221 } 222 return -1 223 } 224 225 // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s. 226 func LastIndexByte(s string, c byte) int { 227 for i := len(s) - 1; i >= 0; i-- { 228 if s[i] == c { 229 return i 230 } 231 } 232 return -1 233 } 234 235 // Generic split: splits after each instance of sep, 236 // including sepSave bytes of sep in the subarrays. 237 func genSplit(s, sep string, sepSave, n int) []string { 238 if n == 0 { 239 return nil 240 } 241 if sep == "" { 242 return explode(s, n) 243 } 244 if n < 0 { 245 n = Count(s, sep) + 1 246 } 247 c := sep[0] 248 start := 0 249 a := make([]string, n) 250 na := 0 251 for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ { 252 if s[i] == c && (len(sep) == 1 || s[i:i+len(sep)] == sep) { 253 a[na] = s[start : i+sepSave] 254 na++ 255 start = i + len(sep) 256 i += len(sep) - 1 257 } 258 } 259 a[na] = s[start:] 260 return a[0 : na+1] 261 } 262 263 // SplitN slices s into substrings separated by sep and returns a slice of 264 // the substrings between those separators. 265 // If sep is empty, SplitN splits after each UTF-8 sequence. 266 // The count determines the number of substrings to return: 267 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 268 // n == 0: the result is nil (zero substrings) 269 // n < 0: all substrings 270 func SplitN(s, sep string, n int) []string { return genSplit(s, sep, 0, n) } 271 272 // SplitAfterN slices s into substrings after each instance of sep and 273 // returns a slice of those substrings. 274 // If sep is empty, SplitAfterN splits after each UTF-8 sequence. 275 // The count determines the number of substrings to return: 276 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 277 // n == 0: the result is nil (zero substrings) 278 // n < 0: all substrings 279 func SplitAfterN(s, sep string, n int) []string { 280 return genSplit(s, sep, len(sep), n) 281 } 282 283 // Split slices s into all substrings separated by sep and returns a slice of 284 // the substrings between those separators. 285 // If sep is empty, Split splits after each UTF-8 sequence. 286 // It is equivalent to SplitN with a count of -1. 287 func Split(s, sep string) []string { return genSplit(s, sep, 0, -1) } 288 289 // SplitAfter slices s into all substrings after each instance of sep and 290 // returns a slice of those substrings. 291 // If sep is empty, SplitAfter splits after each UTF-8 sequence. 292 // It is equivalent to SplitAfterN with a count of -1. 293 func SplitAfter(s, sep string) []string { 294 return genSplit(s, sep, len(sep), -1) 295 } 296 297 // Fields splits the string s around each instance of one or more consecutive white space 298 // characters, as defined by unicode.IsSpace, returning an array of substrings of s or an 299 // empty list if s contains only white space. 300 func Fields(s string) []string { 301 return FieldsFunc(s, unicode.IsSpace) 302 } 303 304 // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c) 305 // and returns an array of slices of s. If all code points in s satisfy f(c) or the 306 // string is empty, an empty slice is returned. 307 // FieldsFunc makes no guarantees about the order in which it calls f(c). 308 // If f does not return consistent results for a given c, FieldsFunc may crash. 309 func FieldsFunc(s string, f func(rune) bool) []string { 310 // First count the fields. 311 n := 0 312 inField := false 313 for _, rune := range s { 314 wasInField := inField 315 inField = !f(rune) 316 if inField && !wasInField { 317 n++ 318 } 319 } 320 321 // Now create them. 322 a := make([]string, n) 323 na := 0 324 fieldStart := -1 // Set to -1 when looking for start of field. 325 for i, rune := range s { 326 if f(rune) { 327 if fieldStart >= 0 { 328 a[na] = s[fieldStart:i] 329 na++ 330 fieldStart = -1 331 } 332 } else if fieldStart == -1 { 333 fieldStart = i 334 } 335 } 336 if fieldStart >= 0 { // Last field might end at EOF. 337 a[na] = s[fieldStart:] 338 } 339 return a 340 } 341 342 // Join concatenates the elements of a to create a single string. The separator string 343 // sep is placed between elements in the resulting string. 344 func Join(a []string, sep string) string { 345 if len(a) == 0 { 346 return "" 347 } 348 if len(a) == 1 { 349 return a[0] 350 } 351 n := len(sep) * (len(a) - 1) 352 for i := 0; i < len(a); i++ { 353 n += len(a[i]) 354 } 355 356 b := make([]byte, n) 357 bp := copy(b, a[0]) 358 for _, s := range a[1:] { 359 bp += copy(b[bp:], sep) 360 bp += copy(b[bp:], s) 361 } 362 return string(b) 363 } 364 365 // HasPrefix tests whether the string s begins with prefix. 366 func HasPrefix(s, prefix string) bool { 367 return len(s) >= len(prefix) && s[0:len(prefix)] == prefix 368 } 369 370 // HasSuffix tests whether the string s ends with suffix. 371 func HasSuffix(s, suffix string) bool { 372 return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix 373 } 374 375 // Map returns a copy of the string s with all its characters modified 376 // according to the mapping function. If mapping returns a negative value, the character is 377 // dropped from the string with no replacement. 378 func Map(mapping func(rune) rune, s string) string { 379 // In the worst case, the string can grow when mapped, making 380 // things unpleasant. But it's so rare we barge in assuming it's 381 // fine. It could also shrink but that falls out naturally. 382 maxbytes := len(s) // length of b 383 nbytes := 0 // number of bytes encoded in b 384 // The output buffer b is initialized on demand, the first 385 // time a character differs. 386 var b []byte 387 388 for i, c := range s { 389 r := mapping(c) 390 if b == nil { 391 if r == c { 392 continue 393 } 394 b = make([]byte, maxbytes) 395 nbytes = copy(b, s[:i]) 396 } 397 if r >= 0 { 398 wid := 1 399 if r >= utf8.RuneSelf { 400 wid = utf8.RuneLen(r) 401 } 402 if nbytes+wid > maxbytes { 403 // Grow the buffer. 404 maxbytes = maxbytes*2 + utf8.UTFMax 405 nb := make([]byte, maxbytes) 406 copy(nb, b[0:nbytes]) 407 b = nb 408 } 409 nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r) 410 } 411 } 412 if b == nil { 413 return s 414 } 415 return string(b[0:nbytes]) 416 } 417 418 // Repeat returns a new string consisting of count copies of the string s. 419 func Repeat(s string, count int) string { 420 b := make([]byte, len(s)*count) 421 bp := copy(b, s) 422 for bp < len(b) { 423 copy(b[bp:], b[:bp]) 424 bp *= 2 425 } 426 return string(b) 427 } 428 429 // ToUpper returns a copy of the string s with all Unicode letters mapped to their upper case. 430 func ToUpper(s string) string { return Map(unicode.ToUpper, s) } 431 432 // ToLower returns a copy of the string s with all Unicode letters mapped to their lower case. 433 func ToLower(s string) string { return Map(unicode.ToLower, s) } 434 435 // ToTitle returns a copy of the string s with all Unicode letters mapped to their title case. 436 func ToTitle(s string) string { return Map(unicode.ToTitle, s) } 437 438 // ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their 439 // upper case, giving priority to the special casing rules. 440 func ToUpperSpecial(_case unicode.SpecialCase, s string) string { 441 return Map(func(r rune) rune { return _case.ToUpper(r) }, s) 442 } 443 444 // ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their 445 // lower case, giving priority to the special casing rules. 446 func ToLowerSpecial(_case unicode.SpecialCase, s string) string { 447 return Map(func(r rune) rune { return _case.ToLower(r) }, s) 448 } 449 450 // ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their 451 // title case, giving priority to the special casing rules. 452 func ToTitleSpecial(_case unicode.SpecialCase, s string) string { 453 return Map(func(r rune) rune { return _case.ToTitle(r) }, s) 454 } 455 456 // isSeparator reports whether the rune could mark a word boundary. 457 // TODO: update when package unicode captures more of the properties. 458 func isSeparator(r rune) bool { 459 // ASCII alphanumerics and underscore are not separators 460 if r <= 0x7F { 461 switch { 462 case '0' <= r && r <= '9': 463 return false 464 case 'a' <= r && r <= 'z': 465 return false 466 case 'A' <= r && r <= 'Z': 467 return false 468 case r == '_': 469 return false 470 } 471 return true 472 } 473 // Letters and digits are not separators 474 if unicode.IsLetter(r) || unicode.IsDigit(r) { 475 return false 476 } 477 // Otherwise, all we can do for now is treat spaces as separators. 478 return unicode.IsSpace(r) 479 } 480 481 // Title returns a copy of the string s with all Unicode letters that begin words 482 // mapped to their title case. 483 // 484 // BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly. 485 func Title(s string) string { 486 // Use a closure here to remember state. 487 // Hackish but effective. Depends on Map scanning in order and calling 488 // the closure once per rune. 489 prev := ' ' 490 return Map( 491 func(r rune) rune { 492 if isSeparator(prev) { 493 prev = r 494 return unicode.ToTitle(r) 495 } 496 prev = r 497 return r 498 }, 499 s) 500 } 501 502 // TrimLeftFunc returns a slice of the string s with all leading 503 // Unicode code points c satisfying f(c) removed. 504 func TrimLeftFunc(s string, f func(rune) bool) string { 505 i := indexFunc(s, f, false) 506 if i == -1 { 507 return "" 508 } 509 return s[i:] 510 } 511 512 // TrimRightFunc returns a slice of the string s with all trailing 513 // Unicode code points c satisfying f(c) removed. 514 func TrimRightFunc(s string, f func(rune) bool) string { 515 i := lastIndexFunc(s, f, false) 516 if i >= 0 && s[i] >= utf8.RuneSelf { 517 _, wid := utf8.DecodeRuneInString(s[i:]) 518 i += wid 519 } else { 520 i++ 521 } 522 return s[0:i] 523 } 524 525 // TrimFunc returns a slice of the string s with all leading 526 // and trailing Unicode code points c satisfying f(c) removed. 527 func TrimFunc(s string, f func(rune) bool) string { 528 return TrimRightFunc(TrimLeftFunc(s, f), f) 529 } 530 531 // IndexFunc returns the index into s of the first Unicode 532 // code point satisfying f(c), or -1 if none do. 533 func IndexFunc(s string, f func(rune) bool) int { 534 return indexFunc(s, f, true) 535 } 536 537 // LastIndexFunc returns the index into s of the last 538 // Unicode code point satisfying f(c), or -1 if none do. 539 func LastIndexFunc(s string, f func(rune) bool) int { 540 return lastIndexFunc(s, f, true) 541 } 542 543 // indexFunc is the same as IndexFunc except that if 544 // truth==false, the sense of the predicate function is 545 // inverted. 546 func indexFunc(s string, f func(rune) bool, truth bool) int { 547 start := 0 548 for start < len(s) { 549 wid := 1 550 r := rune(s[start]) 551 if r >= utf8.RuneSelf { 552 r, wid = utf8.DecodeRuneInString(s[start:]) 553 } 554 if f(r) == truth { 555 return start 556 } 557 start += wid 558 } 559 return -1 560 } 561 562 // lastIndexFunc is the same as LastIndexFunc except that if 563 // truth==false, the sense of the predicate function is 564 // inverted. 565 func lastIndexFunc(s string, f func(rune) bool, truth bool) int { 566 for i := len(s); i > 0; { 567 r, size := utf8.DecodeLastRuneInString(s[0:i]) 568 i -= size 569 if f(r) == truth { 570 return i 571 } 572 } 573 return -1 574 } 575 576 func makeCutsetFunc(cutset string) func(rune) bool { 577 return func(r rune) bool { return IndexRune(cutset, r) >= 0 } 578 } 579 580 // Trim returns a slice of the string s with all leading and 581 // trailing Unicode code points contained in cutset removed. 582 func Trim(s string, cutset string) string { 583 if s == "" || cutset == "" { 584 return s 585 } 586 return TrimFunc(s, makeCutsetFunc(cutset)) 587 } 588 589 // TrimLeft returns a slice of the string s with all leading 590 // Unicode code points contained in cutset removed. 591 func TrimLeft(s string, cutset string) string { 592 if s == "" || cutset == "" { 593 return s 594 } 595 return TrimLeftFunc(s, makeCutsetFunc(cutset)) 596 } 597 598 // TrimRight returns a slice of the string s, with all trailing 599 // Unicode code points contained in cutset removed. 600 func TrimRight(s string, cutset string) string { 601 if s == "" || cutset == "" { 602 return s 603 } 604 return TrimRightFunc(s, makeCutsetFunc(cutset)) 605 } 606 607 // TrimSpace returns a slice of the string s, with all leading 608 // and trailing white space removed, as defined by Unicode. 609 func TrimSpace(s string) string { 610 return TrimFunc(s, unicode.IsSpace) 611 } 612 613 // TrimPrefix returns s without the provided leading prefix string. 614 // If s doesn't start with prefix, s is returned unchanged. 615 func TrimPrefix(s, prefix string) string { 616 if HasPrefix(s, prefix) { 617 return s[len(prefix):] 618 } 619 return s 620 } 621 622 // TrimSuffix returns s without the provided trailing suffix string. 623 // If s doesn't end with suffix, s is returned unchanged. 624 func TrimSuffix(s, suffix string) string { 625 if HasSuffix(s, suffix) { 626 return s[:len(s)-len(suffix)] 627 } 628 return s 629 } 630 631 // Replace returns a copy of the string s with the first n 632 // non-overlapping instances of old replaced by new. 633 // If old is empty, it matches at the beginning of the string 634 // and after each UTF-8 sequence, yielding up to k+1 replacements 635 // for a k-rune string. 636 // If n < 0, there is no limit on the number of replacements. 637 func Replace(s, old, new string, n int) string { 638 if old == new || n == 0 { 639 return s // avoid allocation 640 } 641 642 // Compute number of replacements. 643 if m := Count(s, old); m == 0 { 644 return s // avoid allocation 645 } else if n < 0 || m < n { 646 n = m 647 } 648 649 // Apply replacements to buffer. 650 t := make([]byte, len(s)+n*(len(new)-len(old))) 651 w := 0 652 start := 0 653 for i := 0; i < n; i++ { 654 j := start 655 if len(old) == 0 { 656 if i > 0 { 657 _, wid := utf8.DecodeRuneInString(s[start:]) 658 j += wid 659 } 660 } else { 661 j += Index(s[start:], old) 662 } 663 w += copy(t[w:], s[start:j]) 664 w += copy(t[w:], new) 665 start = j + len(old) 666 } 667 w += copy(t[w:], s[start:]) 668 return string(t[0:w]) 669 } 670 671 // EqualFold reports whether s and t, interpreted as UTF-8 strings, 672 // are equal under Unicode case-folding. 673 func EqualFold(s, t string) bool { 674 for s != "" && t != "" { 675 // Extract first rune from each string. 676 var sr, tr rune 677 if s[0] < utf8.RuneSelf { 678 sr, s = rune(s[0]), s[1:] 679 } else { 680 r, size := utf8.DecodeRuneInString(s) 681 sr, s = r, s[size:] 682 } 683 if t[0] < utf8.RuneSelf { 684 tr, t = rune(t[0]), t[1:] 685 } else { 686 r, size := utf8.DecodeRuneInString(t) 687 tr, t = r, t[size:] 688 } 689 690 // If they match, keep going; if not, return false. 691 692 // Easy case. 693 if tr == sr { 694 continue 695 } 696 697 // Make sr < tr to simplify what follows. 698 if tr < sr { 699 tr, sr = sr, tr 700 } 701 // Fast check for ASCII. 702 if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' { 703 // ASCII, and sr is upper case. tr must be lower case. 704 if tr == sr+'a'-'A' { 705 continue 706 } 707 return false 708 } 709 710 // General case. SimpleFold(x) returns the next equivalent rune > x 711 // or wraps around to smaller values. 712 r := unicode.SimpleFold(sr) 713 for r != sr && r < tr { 714 r = unicode.SimpleFold(r) 715 } 716 if r == tr { 717 continue 718 } 719 return false 720 } 721 722 // One string is empty. Are both? 723 return s == t 724 }