// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Compare returns an integer comparing two byte slices lexicographically.
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
// A nil argument is equivalent to an empty slice.
func Compare(a, b []byte) int {
	// Only the common prefix can be compared byte by byte.
	m := len(a)
	if m > len(b) {
		m = len(b)
	}
	for i, ac := range a[0:m] {
		bc := b[i]
		switch {
		case ac > bc:
			return 1
		case ac < bc:
			return -1
		}
	}
	// The common prefix is identical, so the shorter slice sorts first.
	switch {
	case len(a) < len(b):
		return -1
	case len(a) > len(b):
		return 1
	}
	return 0
}

// equalPortable reports whether a and b have the same length and hold the
// same bytes. Nil and empty slices compare equal.
func equalPortable(a, b []byte) bool {
	if len(a) != len(b) {
		return false
	}
	for i, c := range a {
		if c != b[i] {
			return false
		}
	}
	return true
}

// explode splits s into a slice of UTF-8 sequences, one per Unicode character (still slices of bytes),
// up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
// A non-positive n means "no limit".
func explode(s []byte, n int) [][]byte {
	// s can never produce more than len(s) pieces, so both "no limit" and
	// any larger limit need at most len(s) result slots. Clamping keeps a
	// huge caller-supplied n from turning into a huge (or panicking)
	// allocation.
	if n <= 0 || n > len(s) {
		n = len(s)
	}
	a := make([][]byte, n)
	var size int
	na := 0
	for len(s) > 0 {
		if na+1 >= n {
			// Last permitted slot: it receives the whole unsplit remainder.
			a[na] = s
			na++
			break
		}
		_, size = utf8.DecodeRune(s)
		a[na] = s[0:size]
		s = s[size:]
		na++
	}
	return a[0:na]
}

// Count counts the number of non-overlapping instances of sep in s.
76 func Count(s, sep []byte) int { 77 n := len(sep) 78 if n == 0 { 79 return utf8.RuneCount(s) + 1 80 } 81 if n > len(s) { 82 return 0 83 } 84 count := 0 85 c := sep[0] 86 i := 0 87 t := s[:len(s)-n+1] 88 for i < len(t) { 89 if t[i] != c { 90 o := IndexByte(t[i:], c) 91 if o < 0 { 92 break 93 } 94 i += o 95 } 96 if n == 1 || Equal(s[i:i+n], sep) { 97 count++ 98 i += n 99 continue 100 } 101 i++ 102 } 103 return count 104 } 105 106 // Contains returns whether subslice is within b. 107 func Contains(b, subslice []byte) bool { 108 return Index(b, subslice) != -1 109 } 110 111 // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. 112 func Index(s, sep []byte) int { 113 n := len(sep) 114 if n == 0 { 115 return 0 116 } 117 if n > len(s) { 118 return -1 119 } 120 c := sep[0] 121 if n == 1 { 122 return IndexByte(s, c) 123 } 124 i := 0 125 t := s[:len(s)-n+1] 126 for i < len(t) { 127 if t[i] != c { 128 o := IndexByte(t[i:], c) 129 if o < 0 { 130 break 131 } 132 i += o 133 } 134 if Equal(s[i:i+n], sep) { 135 return i 136 } 137 i++ 138 } 139 return -1 140 } 141 142 func indexBytePortable(s []byte, c byte) int { 143 for i, b := range s { 144 if b == c { 145 return i 146 } 147 } 148 return -1 149 } 150 151 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s. 152 func LastIndex(s, sep []byte) int { 153 n := len(sep) 154 if n == 0 { 155 return len(s) 156 } 157 c := sep[0] 158 for i := len(s) - n; i >= 0; i-- { 159 if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) { 160 return i 161 } 162 } 163 return -1 164 } 165 166 // IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points. 167 // It returns the byte index of the first occurrence in s of the given rune. 168 // It returns -1 if rune is not present in s. 
func IndexRune(s []byte, r rune) int {
	offset := 0
	for rest := s; len(rest) > 0; {
		decoded, width := utf8.DecodeRune(rest)
		if decoded == r {
			return offset
		}
		offset += width
		rest = rest[width:]
	}
	return -1
}

// IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
// It returns the byte index of the first code point in s that also appears
// in chars, or -1 if chars is empty or the two share no code point.
func IndexAny(s []byte, chars string) int {
	if len(chars) == 0 {
		return -1
	}
	for pos := 0; pos < len(s); {
		// Single-byte (ASCII) values need no decoding.
		cur, width := rune(s[pos]), 1
		if cur >= utf8.RuneSelf {
			cur, width = utf8.DecodeRune(s[pos:])
		}
		for _, candidate := range chars {
			if candidate == cur {
				return pos
			}
		}
		pos += width
	}
	return -1
}

// LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
// points. It returns the byte index of the last code point in s that also
// appears in chars, or -1 if chars is empty or the two share no code point.
func LastIndexAny(s []byte, chars string) int {
	if len(chars) == 0 {
		return -1
	}
	// end always sits on a code point boundary, moving towards the front.
	end := len(s)
	for end > 0 {
		last, width := utf8.DecodeLastRune(s[:end])
		end -= width
		for _, candidate := range chars {
			if candidate == last {
				return end
			}
		}
	}
	return -1
}

// Generic split: splits after each instance of sep,
// including sepSave bytes of sep in the subslices.
226 func genSplit(s, sep []byte, sepSave, n int) [][]byte { 227 if n == 0 { 228 return nil 229 } 230 if len(sep) == 0 { 231 return explode(s, n) 232 } 233 if n < 0 { 234 n = Count(s, sep) + 1 235 } 236 c := sep[0] 237 start := 0 238 a := make([][]byte, n) 239 na := 0 240 for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ { 241 if s[i] == c && (len(sep) == 1 || Equal(s[i:i+len(sep)], sep)) { 242 a[na] = s[start : i+sepSave] 243 na++ 244 start = i + len(sep) 245 i += len(sep) - 1 246 } 247 } 248 a[na] = s[start:] 249 return a[0 : na+1] 250 } 251 252 // SplitN slices s into subslices separated by sep and returns a slice of 253 // the subslices between those separators. 254 // If sep is empty, SplitN splits after each UTF-8 sequence. 255 // The count determines the number of subslices to return: 256 // n > 0: at most n subslices; the last subslice will be the unsplit remainder. 257 // n == 0: the result is nil (zero subslices) 258 // n < 0: all subslices 259 func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) } 260 261 // SplitAfterN slices s into subslices after each instance of sep and 262 // returns a slice of those subslices. 263 // If sep is empty, SplitAfterN splits after each UTF-8 sequence. 264 // The count determines the number of subslices to return: 265 // n > 0: at most n subslices; the last subslice will be the unsplit remainder. 266 // n == 0: the result is nil (zero subslices) 267 // n < 0: all subslices 268 func SplitAfterN(s, sep []byte, n int) [][]byte { 269 return genSplit(s, sep, len(sep), n) 270 } 271 272 // Split slices s into all subslices separated by sep and returns a slice of 273 // the subslices between those separators. 274 // If sep is empty, Split splits after each UTF-8 sequence. 275 // It is equivalent to SplitN with a count of -1. 
276 func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) } 277 278 // SplitAfter slices s into all subslices after each instance of sep and 279 // returns a slice of those subslices. 280 // If sep is empty, SplitAfter splits after each UTF-8 sequence. 281 // It is equivalent to SplitAfterN with a count of -1. 282 func SplitAfter(s, sep []byte) [][]byte { 283 return genSplit(s, sep, len(sep), -1) 284 } 285 286 // Fields splits the slice s around each instance of one or more consecutive white space 287 // characters, returning a slice of subslices of s or an empty list if s contains only white space. 288 func Fields(s []byte) [][]byte { 289 return FieldsFunc(s, unicode.IsSpace) 290 } 291 292 // FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points. 293 // It splits the slice s at each run of code points c satisfying f(c) and 294 // returns a slice of subslices of s. If no code points in s satisfy f(c), an 295 // empty slice is returned. 296 func FieldsFunc(s []byte, f func(rune) bool) [][]byte { 297 n := 0 298 inField := false 299 for i := 0; i < len(s); { 300 r, size := utf8.DecodeRune(s[i:]) 301 wasInField := inField 302 inField = !f(r) 303 if inField && !wasInField { 304 n++ 305 } 306 i += size 307 } 308 309 a := make([][]byte, n) 310 na := 0 311 fieldStart := -1 312 for i := 0; i <= len(s) && na < n; { 313 r, size := utf8.DecodeRune(s[i:]) 314 if fieldStart < 0 && size > 0 && !f(r) { 315 fieldStart = i 316 i += size 317 continue 318 } 319 if fieldStart >= 0 && (size == 0 || f(r)) { 320 a[na] = s[fieldStart:i] 321 na++ 322 fieldStart = -1 323 } 324 if size == 0 { 325 break 326 } 327 i += size 328 } 329 return a[0:na] 330 } 331 332 // Join concatenates the elements of s to create a new byte slice. The separator 333 // sep is placed between elements in the resulting slice. 334 func Join(s [][]byte, sep []byte) []byte { 335 if len(s) == 0 { 336 return []byte{} 337 } 338 if len(s) == 1 { 339 // Just return a copy. 
340 return append([]byte(nil), s[0]...) 341 } 342 n := len(sep) * (len(s) - 1) 343 for _, v := range s { 344 n += len(v) 345 } 346 347 b := make([]byte, n) 348 bp := copy(b, s[0]) 349 for _, v := range s[1:] { 350 bp += copy(b[bp:], sep) 351 bp += copy(b[bp:], v) 352 } 353 return b 354 } 355 356 // HasPrefix tests whether the byte slice s begins with prefix. 357 func HasPrefix(s, prefix []byte) bool { 358 return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix) 359 } 360 361 // HasSuffix tests whether the byte slice s ends with suffix. 362 func HasSuffix(s, suffix []byte) bool { 363 return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix) 364 } 365 366 // Map returns a copy of the byte slice s with all its characters modified 367 // according to the mapping function. If mapping returns a negative value, the character is 368 // dropped from the string with no replacement. The characters in s and the 369 // output are interpreted as UTF-8-encoded Unicode code points. 370 func Map(mapping func(r rune) rune, s []byte) []byte { 371 // In the worst case, the slice can grow when mapped, making 372 // things unpleasant. But it's so rare we barge in assuming it's 373 // fine. It could also shrink but that falls out naturally. 374 maxbytes := len(s) // length of b 375 nbytes := 0 // number of bytes encoded in b 376 b := make([]byte, maxbytes) 377 for i := 0; i < len(s); { 378 wid := 1 379 r := rune(s[i]) 380 if r >= utf8.RuneSelf { 381 r, wid = utf8.DecodeRune(s[i:]) 382 } 383 r = mapping(r) 384 if r >= 0 { 385 if nbytes+utf8.RuneLen(r) > maxbytes { 386 // Grow the buffer. 387 maxbytes = maxbytes*2 + utf8.UTFMax 388 nb := make([]byte, maxbytes) 389 copy(nb, b[0:nbytes]) 390 b = nb 391 } 392 nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r) 393 } 394 i += wid 395 } 396 return b[0:nbytes] 397 } 398 399 // Repeat returns a new byte slice consisting of count copies of b. 
400 func Repeat(b []byte, count int) []byte { 401 nb := make([]byte, len(b)*count) 402 bp := 0 403 for i := 0; i < count; i++ { 404 for j := 0; j < len(b); j++ { 405 nb[bp] = b[j] 406 bp++ 407 } 408 } 409 return nb 410 } 411 412 // ToUpper returns a copy of the byte slice s with all Unicode letters mapped to their upper case. 413 func ToUpper(s []byte) []byte { return Map(unicode.ToUpper, s) } 414 415 // ToLower returns a copy of the byte slice s with all Unicode letters mapped to their lower case. 416 func ToLower(s []byte) []byte { return Map(unicode.ToLower, s) } 417 418 // ToTitle returns a copy of the byte slice s with all Unicode letters mapped to their title case. 419 func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) } 420 421 // ToUpperSpecial returns a copy of the byte slice s with all Unicode letters mapped to their 422 // upper case, giving priority to the special casing rules. 423 func ToUpperSpecial(_case unicode.SpecialCase, s []byte) []byte { 424 return Map(func(r rune) rune { return _case.ToUpper(r) }, s) 425 } 426 427 // ToLowerSpecial returns a copy of the byte slice s with all Unicode letters mapped to their 428 // lower case, giving priority to the special casing rules. 429 func ToLowerSpecial(_case unicode.SpecialCase, s []byte) []byte { 430 return Map(func(r rune) rune { return _case.ToLower(r) }, s) 431 } 432 433 // ToTitleSpecial returns a copy of the byte slice s with all Unicode letters mapped to their 434 // title case, giving priority to the special casing rules. 435 func ToTitleSpecial(_case unicode.SpecialCase, s []byte) []byte { 436 return Map(func(r rune) rune { return _case.ToTitle(r) }, s) 437 } 438 439 // isSeparator reports whether the rune could mark a word boundary. 440 // TODO: update when package unicode captures more of the properties. 
441 func isSeparator(r rune) bool { 442 // ASCII alphanumerics and underscore are not separators 443 if r <= 0x7F { 444 switch { 445 case '0' <= r && r <= '9': 446 return false 447 case 'a' <= r && r <= 'z': 448 return false 449 case 'A' <= r && r <= 'Z': 450 return false 451 case r == '_': 452 return false 453 } 454 return true 455 } 456 // Letters and digits are not separators 457 if unicode.IsLetter(r) || unicode.IsDigit(r) { 458 return false 459 } 460 // Otherwise, all we can do for now is treat spaces as separators. 461 return unicode.IsSpace(r) 462 } 463 464 // Title returns a copy of s with all Unicode letters that begin words 465 // mapped to their title case. 466 // 467 // BUG: The rule Title uses for word boundaries does not handle Unicode punctuation properly. 468 func Title(s []byte) []byte { 469 // Use a closure here to remember state. 470 // Hackish but effective. Depends on Map scanning in order and calling 471 // the closure once per rune. 472 prev := ' ' 473 return Map( 474 func(r rune) rune { 475 if isSeparator(prev) { 476 prev = r 477 return unicode.ToTitle(r) 478 } 479 prev = r 480 return r 481 }, 482 s) 483 } 484 485 // TrimLeftFunc returns a subslice of s by slicing off all leading UTF-8-encoded 486 // Unicode code points c that satisfy f(c). 487 func TrimLeftFunc(s []byte, f func(r rune) bool) []byte { 488 i := indexFunc(s, f, false) 489 if i == -1 { 490 return nil 491 } 492 return s[i:] 493 } 494 495 // TrimRightFunc returns a subslice of s by slicing off all trailing UTF-8 496 // encoded Unicode code points c that satisfy f(c). 497 func TrimRightFunc(s []byte, f func(r rune) bool) []byte { 498 i := lastIndexFunc(s, f, false) 499 if i >= 0 && s[i] >= utf8.RuneSelf { 500 _, wid := utf8.DecodeRune(s[i:]) 501 i += wid 502 } else { 503 i++ 504 } 505 return s[0:i] 506 } 507 508 // TrimFunc returns a subslice of s by slicing off all leading and trailing 509 // UTF-8-encoded Unicode code points c that satisfy f(c). 
510 func TrimFunc(s []byte, f func(r rune) bool) []byte { 511 return TrimRightFunc(TrimLeftFunc(s, f), f) 512 } 513 514 // TrimPrefix returns s without the provided leading prefix string. 515 // If s doesn't start with prefix, s is returned unchanged. 516 func TrimPrefix(s, prefix []byte) []byte { 517 if HasPrefix(s, prefix) { 518 return s[len(prefix):] 519 } 520 return s 521 } 522 523 // TrimSuffix returns s without the provided trailing suffix string. 524 // If s doesn't end with suffix, s is returned unchanged. 525 func TrimSuffix(s, suffix []byte) []byte { 526 if HasSuffix(s, suffix) { 527 return s[:len(s)-len(suffix)] 528 } 529 return s 530 } 531 532 // IndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points. 533 // It returns the byte index in s of the first Unicode 534 // code point satisfying f(c), or -1 if none do. 535 func IndexFunc(s []byte, f func(r rune) bool) int { 536 return indexFunc(s, f, true) 537 } 538 539 // LastIndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points. 540 // It returns the byte index in s of the last Unicode 541 // code point satisfying f(c), or -1 if none do. 542 func LastIndexFunc(s []byte, f func(r rune) bool) int { 543 return lastIndexFunc(s, f, true) 544 } 545 546 // indexFunc is the same as IndexFunc except that if 547 // truth==false, the sense of the predicate function is 548 // inverted. 549 func indexFunc(s []byte, f func(r rune) bool, truth bool) int { 550 start := 0 551 for start < len(s) { 552 wid := 1 553 r := rune(s[start]) 554 if r >= utf8.RuneSelf { 555 r, wid = utf8.DecodeRune(s[start:]) 556 } 557 if f(r) == truth { 558 return start 559 } 560 start += wid 561 } 562 return -1 563 } 564 565 // lastIndexFunc is the same as LastIndexFunc except that if 566 // truth==false, the sense of the predicate function is 567 // inverted. 
568 func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int { 569 for i := len(s); i > 0; { 570 r, size := rune(s[i-1]), 1 571 if r >= utf8.RuneSelf { 572 r, size = utf8.DecodeLastRune(s[0:i]) 573 } 574 i -= size 575 if f(r) == truth { 576 return i 577 } 578 } 579 return -1 580 } 581 582 func makeCutsetFunc(cutset string) func(r rune) bool { 583 return func(r rune) bool { 584 for _, c := range cutset { 585 if c == r { 586 return true 587 } 588 } 589 return false 590 } 591 } 592 593 // Trim returns a subslice of s by slicing off all leading and 594 // trailing UTF-8-encoded Unicode code points contained in cutset. 595 func Trim(s []byte, cutset string) []byte { 596 return TrimFunc(s, makeCutsetFunc(cutset)) 597 } 598 599 // TrimLeft returns a subslice of s by slicing off all leading 600 // UTF-8-encoded Unicode code points contained in cutset. 601 func TrimLeft(s []byte, cutset string) []byte { 602 return TrimLeftFunc(s, makeCutsetFunc(cutset)) 603 } 604 605 // TrimRight returns a subslice of s by slicing off all trailing 606 // UTF-8-encoded Unicode code points that are contained in cutset. 607 func TrimRight(s []byte, cutset string) []byte { 608 return TrimRightFunc(s, makeCutsetFunc(cutset)) 609 } 610 611 // TrimSpace returns a subslice of s by slicing off all leading and 612 // trailing white space, as defined by Unicode. 613 func TrimSpace(s []byte) []byte { 614 return TrimFunc(s, unicode.IsSpace) 615 } 616 617 // Runes returns a slice of runes (Unicode code points) equivalent to s. 618 func Runes(s []byte) []rune { 619 t := make([]rune, utf8.RuneCount(s)) 620 i := 0 621 for len(s) > 0 { 622 r, l := utf8.DecodeRune(s) 623 t[i] = r 624 i++ 625 s = s[l:] 626 } 627 return t 628 } 629 630 // Replace returns a copy of the slice s with the first n 631 // non-overlapping instances of old replaced by new. 632 // If n < 0, there is no limit on the number of replacements. 
633 func Replace(s, old, new []byte, n int) []byte { 634 m := 0 635 if n != 0 { 636 // Compute number of replacements. 637 m = Count(s, old) 638 } 639 if m == 0 { 640 // Just return a copy. 641 return append([]byte(nil), s...) 642 } 643 if n < 0 || m < n { 644 n = m 645 } 646 647 // Apply replacements to buffer. 648 t := make([]byte, len(s)+n*(len(new)-len(old))) 649 w := 0 650 start := 0 651 for i := 0; i < n; i++ { 652 j := start 653 if len(old) == 0 { 654 if i > 0 { 655 _, wid := utf8.DecodeRune(s[start:]) 656 j += wid 657 } 658 } else { 659 j += Index(s[start:], old) 660 } 661 w += copy(t[w:], s[start:j]) 662 w += copy(t[w:], new) 663 start = j + len(old) 664 } 665 w += copy(t[w:], s[start:]) 666 return t[0:w] 667 } 668 669 // EqualFold reports whether s and t, interpreted as UTF-8 strings, 670 // are equal under Unicode case-folding. 671 func EqualFold(s, t []byte) bool { 672 for len(s) != 0 && len(t) != 0 { 673 // Extract first rune from each. 674 var sr, tr rune 675 if s[0] < utf8.RuneSelf { 676 sr, s = rune(s[0]), s[1:] 677 } else { 678 r, size := utf8.DecodeRune(s) 679 sr, s = r, s[size:] 680 } 681 if t[0] < utf8.RuneSelf { 682 tr, t = rune(t[0]), t[1:] 683 } else { 684 r, size := utf8.DecodeRune(t) 685 tr, t = r, t[size:] 686 } 687 688 // If they match, keep going; if not, return false. 689 690 // Easy case. 691 if tr == sr { 692 continue 693 } 694 695 // Make sr < tr to simplify what follows. 696 if tr < sr { 697 tr, sr = sr, tr 698 } 699 // Fast check for ASCII. 700 if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' { 701 // ASCII, and sr is upper case. tr must be lower case. 702 if tr == sr+'a'-'A' { 703 continue 704 } 705 return false 706 } 707 708 // General case. SimpleFold(x) returns the next equivalent rune > x 709 // or wraps around to smaller values. 710 r := unicode.SimpleFold(sr) 711 for r != sr && r < tr { 712 r = unicode.SimpleFold(r) 713 } 714 if r == tr { 715 continue 716 } 717 return false 718 } 719 720 // One string is empty. Are both? 
721 return len(s) == len(t) 722 }