github.com/mitranim/gg@v0.1.17/text.go (about) 1 package gg 2 3 import ( 4 "regexp" 5 "strings" 6 "unicode" 7 "unicode/utf8" 8 u "unsafe" 9 ) 10 11 /* 12 Same as `len`. Limited to `Text` types but can be passed to higher-order 13 functions. 14 */ 15 func TextLen[A Text](val A) int { return len(val) } 16 17 // True if len <= 0. Inverse of `IsTextNotEmpty`. 18 func IsTextEmpty[A Text](val A) bool { return len(val) <= 0 } 19 20 // True if len > 0. Inverse of `IsTextEmpty`. 21 func IsTextNotEmpty[A Text](val A) bool { return len(val) > 0 } 22 23 // Returns the first byte or 0. 24 func TextHeadByte[A Text](val A) byte { 25 if len(val) > 0 { 26 return val[0] 27 } 28 return 0 29 } 30 31 // Returns the last byte or 0. 32 func TextLastByte[A Text](val A) byte { 33 if len(val) > 0 { 34 return val[len(val)-1] 35 } 36 return 0 37 } 38 39 /* 40 Like `utf8.DecodeRuneInString`, but faster at the time of writing, and without 41 `utf8.RuneError`. On decoding error, the result is `(0, 0)`. 42 */ 43 func TextHeadChar[A Text](src A) (char rune, size int) { 44 for ind, val := range ToText[string](src) { 45 if ind == 0 { 46 char = val 47 size = len(src) 48 } else { 49 size = ind 50 break 51 } 52 } 53 return 54 } 55 56 /* 57 True if the inputs would be `==` if compared as strings. When used on typedefs 58 of `[]byte`, this is the same as `bytes.Equal`. 59 */ 60 func TextEq[A Text](one, two A) bool { return ToString(one) == ToString(two) } 61 62 /* 63 Similar to `unsafe.StringData`, but takes arbitrary text as input. Returns the 64 pointer to the first byte of the underlying data array for the given string or 65 byte slice. Use caution. Mutating the underlying data may trigger segfaults or 66 cause undefined behavior. 67 */ 68 func TextDat[A Text](val A) *byte { return CastUnsafe[*byte](val) } 69 70 /* 71 Implementation note. 
We could write `TextDat` as follows, but it would not be
an improvement, because it still makes assumptions about the underlying
structure of the data, specifically it assumes that strings and byte slices
have a different width. At the time of writing, Go doesn't seem to provide a
safe and free way to detect if we have `~string` or `~[]byte`. A type switch on
`any(src)` works only for core types such as `string`, but not for typedefs
conforming to `~string` and `~[]byte`. Alternatives involve overheads such as
calling interface methods of `reflect.Type`, which would stop this function
from being a free cast.

	func TextDat[A Text](src A) *byte {
		if u.Sizeof(src) == SizeofString {
			return u.StringData(string(src))
		}
		if u.Sizeof(src) == SizeofSlice {
			return u.SliceData([]byte(src))
		}
		panic(`unreachable`)
	}
*/

/*
Converts between two types conforming to the `Text` constraint, typically
variants of `string` and/or `[]byte`, by reinterpreting the value rather than
copying its content.
*/
func ToText[Out, Src Text](src Src) Out {
	out := CastUnsafe[Out](src)

	/**
	Implementation note. We could also write the condition as shown below:

		Kind[Src]() == r.String && Kind[Out]() == r.Slice

	But the above would be measurably slower than the unsafe trick.
	In addition, sizeof lets us ensure that the target can be cast into
	`SliceHeader` without affecting other memory.
	*/
	fromString := u.Sizeof(src) == SizeofString
	intoSlice := u.Sizeof(out) == SizeofSliceHeader

	// A string-sized source has no capacity field; use the length instead.
	if fromString && intoSlice {
		head := CastUnsafe[*SliceHeader](&out)
		head.Cap = len(out)
	}

	return out
}

/*
Reinterprets arbitrary text as a string without allocating. If the string is
used with an API that relies on string immutability, for example as a map key,
the source memory must not be mutated afterwards.
*/
func ToString[A Text](val A) string {
	return CastUnsafe[string](val)
}

/*
Implementation note.
`ToString` could be written as shown below. This passes our
test, but runs marginally slower than our current implementation, and does not
improve correctness, because `TextDat` also makes assumptions about the
underlying structure of the string header.

	func ToString[A Text](val A) string { return u.String(TextDat(val), len(val)) }
*/

/*
Reinterprets arbitrary text as bytes without allocating. If the source was a
string, the output must NOT be mutated. Mutating memory that belongs to a
string may produce segfaults or undefined behavior.
*/
func ToBytes[A Text](val A) []byte {
	return u.Slice(TextDat(val), len(val))
}

/*
Converts arguments to strings and concatenates the results without a
separator. See `StringCatch` for the encoding rules. Also see `JoinDense` for a
simpler version that doesn't involve `any`.
*/
func Str(src ...any) string {
	return JoinAny(src, ``)
}

/*
Similar to `Str`. Concatenates string representations of the input values.
Additionally, if the output is non-empty and doesn't end with a newline
character, appends `Newline` at the end.
*/
func Strln(src ...any) string {
	if len(src) == 0 {
		return ``
	}

	// A single value needs no buffer: encode it and terminate the line.
	if len(src) == 1 {
		return AppendNewlineOpt(String(src[0]))
	}

	var buf Buf
	buf.AppendAnysln(src...)
	return buf.String()
}

/*
Converts arguments to strings and joins the results with `Space`. See
`StringCatch` for encoding rules. Also see `JoinSpaced` for a more limited but
more efficient version that doesn't involve `any`.
*/
func Spaced(src ...any) string {
	return JoinAny(src, Space)
}

/*
Converts arguments to strings and joins the results with a single space,
ignoring empty strings. See `StringCatch` for the encoding rules. Also see
`JoinSpacedOpt` for a more limited but more efficient version that doesn't
involve `any`.
177 */ 178 func SpacedOpt(src ...any) string { return JoinAnyOpt(src, Space) } 179 180 /* 181 Similar to `strings.Join` but takes `[]any`, converting elements to strings. See 182 `StringCatch` for the encoding rules. Also see `Join`, `JoinOpt`, 183 `JoinAnyOpt`. 184 */ 185 func JoinAny(src []any, sep string) string { 186 switch len(src) { 187 case 0: 188 return `` 189 190 case 1: 191 return String(src[0]) 192 193 default: 194 var buf Buf 195 for ind, src := range src { 196 if ind > 0 { 197 buf.AppendString(sep) 198 } 199 buf.AppendAny(src) 200 } 201 return buf.String() 202 } 203 } 204 205 // Like `JoinAny` but ignores empty strings. 206 func JoinAnyOpt(src []any, sep string) string { 207 switch len(src) { 208 case 0: 209 return `` 210 211 case 1: 212 return String(src[0]) 213 214 default: 215 var buf Buf 216 217 for ind, src := range src { 218 len0 := buf.Len() 219 if ind > 0 { 220 buf.AppendString(sep) 221 } 222 len1 := buf.Len() 223 224 buf.AppendAny(src) 225 226 if ind > 0 && buf.Len() == len1 { 227 buf.TruncLen(len0) 228 } 229 } 230 231 return buf.String() 232 } 233 } 234 235 // Concatenates the given text without any separators. 236 func JoinDense[A Text](val ...A) string { return Join(val, ``) } 237 238 // Joins the given strings with a space. 239 func JoinSpaced[A Text](val ...A) string { return Join(val, Space) } 240 241 // Joins non-empty strings with a space. 242 func JoinSpacedOpt[A Text](val ...A) string { return JoinOpt(val, Space) } 243 244 // Joins the given strings with newlines. 245 func JoinLines[A Text](val ...A) string { return Join(val, Newline) } 246 247 // Joins non-empty strings with newlines. 248 func JoinLinesOpt[A Text](val ...A) string { return JoinOpt(val, Newline) } 249 250 /* 251 Similar to `strings.Join` but works on any input compatible with the `Text` 252 interface. Also see `JoinOpt`, `JoinAny`, `JoinAnyOpt`. 
253 */ 254 func Join[A Text](src []A, sep string) string { 255 switch len(src) { 256 case 0: 257 return `` 258 259 case 1: 260 return ToString(src[0]) 261 262 default: 263 var buf Buf 264 buf.GrowCap(Sum(src, TextLen[A]) + (len(sep) * (len(src) - 1))) 265 266 buf.AppendString(ToString(src[0])) 267 for _, src := range src[1:] { 268 buf.AppendString(sep) 269 buf.AppendString(ToString(src)) 270 } 271 return buf.String() 272 } 273 } 274 275 /* 276 Similar to `strings.Join` but works for any input compatible with the `Text` 277 interface and ignores empty strings. 278 */ 279 func JoinOpt[A Text](src []A, sep string) string { 280 switch len(src) { 281 case 0: 282 return `` 283 284 case 1: 285 return ToString(src[0]) 286 287 default: 288 var size int 289 for _, src := range src { 290 wid := len(src) 291 if wid > 0 { 292 size = size + wid + len(sep) 293 } 294 } 295 296 var buf Buf 297 buf.GrowCap(size) 298 299 var found bool 300 for _, src := range src { 301 if len(src) > 0 { 302 if found { 303 buf.AppendString(sep) 304 } 305 found = true 306 buf = append(buf, src...) 307 } 308 } 309 return buf.String() 310 } 311 } 312 313 /* 314 Similar to `strings.SplitN` for N = 1. More efficient: returns a tuple instead 315 of allocating a slice. Safer: returns zero values if split doesn't succeed. 316 */ 317 func Split2[A Text](src A, sep string) (A, A) { 318 ind := strings.Index(ToString(src), sep) 319 if ind >= 0 { 320 return src[:ind], src[ind+len(sep):] 321 } 322 return src, Zero[A]() 323 } 324 325 /* 326 Splits the given text into lines. The resulting strings do not contain any 327 newline characters. If the input is empty, the output is empty. Avoids 328 information loss: preserves empty lines, allowing the caller to transform and 329 join the lines without losing blanks. The following sequences are considered 330 newlines: "\r\n", "\r", "\n". 
331 */ 332 func SplitLines[A Text](src A) []A { 333 /** 334 In our benchmark in Go 1.20.2, this runs about 20-30 times faster than the 335 equivalent regexp-based implementation. 336 337 It would be much simpler to use `strings.FieldsFunc` and `bytes.FieldsFunc`, 338 but they would elide empty lines, losing information and making this 339 non-reversible. They would also be about 2 times slower. 340 341 TODO simpler implementation. 342 */ 343 344 var out []A 345 var prev int 346 var next int 347 max := len(src) 348 349 /** 350 Iterating bytes is significantly faster than runes, and in valid UTF-8 it's 351 not possible to encounter '\r' or '\n' in multi-byte characters, making this 352 safe for valid text. 353 */ 354 for next < max { 355 char := src[next] 356 357 if char == '\r' && next < len(src)-1 && src[next+1] == '\n' { 358 out = append(out, src[prev:next]) 359 next = next + 2 360 prev = next 361 continue 362 } 363 364 if char == '\n' || char == '\r' { 365 out = append(out, src[prev:next]) 366 next++ 367 prev = next 368 continue 369 } 370 371 next++ 372 } 373 374 if next > 0 { 375 out = append(out, src[prev:next]) 376 } 377 return out 378 } 379 380 /* 381 Similar to `SplitLines`, but splits only on the first newline occurrence, 382 returning the first line and the remainder, plus the number of bytes in the 383 elided line separator. The following sequences are considered newlines: 384 "\r\n", "\r", "\n". 
385 */ 386 func SplitLines2[A Text](src A) (A, A, int) { 387 size := len(src) 388 limit := size - 1 389 390 for ind, char := range ToString(src) { 391 if char == '\r' { 392 if ind < limit && src[ind+1] == '\n' { 393 return src[:ind], src[ind+2:], 2 394 } 395 return src[:ind], src[ind+1:], 1 396 } 397 if char == '\n' { 398 return src[:ind], src[ind+1:], 1 399 } 400 } 401 return src, Zero[A](), 0 402 } 403 404 /* 405 Searches for the given separator and returns the part of the text before the 406 separator, removing that prefix from the original text referenced by the 407 pointer. The separator is excluded from both chunks. As a special case, if the 408 separator is empty, pops the entire source text. 409 */ 410 func TextPop[Src, Sep Text](ptr *Src, sep Sep) Src { 411 if ptr == nil { 412 return Zero[Src]() 413 } 414 415 src := *ptr 416 417 if len(sep) == 0 { 418 PtrClear(ptr) 419 return src 420 } 421 422 ind := strings.Index(ToString(src), ToString(sep)) 423 if !(ind >= 0 && ind < len(src)) { 424 PtrClear(ptr) 425 return src 426 } 427 428 *ptr = src[ind+len(sep):] 429 return src[:ind] 430 } 431 432 // True if the string ends with a line feed or carriage return. 433 func HasNewlineSuffix[A Text](src A) bool { 434 return isByteNewline(TextLastByte(src)) 435 } 436 437 /* 438 If the given text is non-empty and does not end with a newline character, 439 appends `Newline` and returns the result. Otherwise returns the text unchanged. 440 If the input type is a typedef of `[]byte` and has enough capacity, it's 441 mutated. In other cases, the text is reallocated. Also see 442 `Buf.AppendNewlineOpt` and `Strln`. 443 */ 444 func AppendNewlineOpt[A Text](val A) A { 445 if len(val) > 0 && !HasNewlineSuffix(val) { 446 return ToText[A](append([]byte(val), Newline...)) 447 } 448 return val 449 } 450 451 // Missing/private half of `strings.TrimSpace`. Trims only the prefix. 
452 func TrimSpacePrefix[A Text](src A) A { 453 return ToText[A](strings.TrimLeftFunc(ToString(src), unicode.IsSpace)) 454 } 455 456 // Missing/private half of `strings.TrimSpace`. Trims only the suffix. 457 func TrimSpaceSuffix[A Text](src A) A { 458 return ToText[A](strings.TrimRightFunc(ToString(src), unicode.IsSpace)) 459 } 460 461 /* 462 Regexp for splitting arbitrary text into words, Unicode-aware. Used by 463 `ToWords`. 464 */ 465 var ReWord = NewLazy(func() *regexp.Regexp { 466 return regexp.MustCompile(`\p{Lu}+[\p{Ll}\d]*|[\p{Ll}\d]+`) 467 }) 468 469 /* 470 Splits arbitrary text into words, Unicode-aware. Suitable for conversion between 471 typographic cases such as `camelCase` and `snake_case`. 472 */ 473 func ToWords[A Text](val A) Words { 474 return ReWord.Get().FindAllString(ToString(val), -1) 475 } 476 477 /* 478 Tool for converting between typographic cases such as `camelCase` and 479 `snake_case`. 480 */ 481 type Words []string 482 483 // Combines the words via "". 484 func (self Words) Dense() string { return self.Join(``) } 485 486 // Combines the words via " ". 487 func (self Words) Spaced() string { return self.Join(` `) } 488 489 // Combines the words via "_". 490 func (self Words) Snake() string { return self.Join(`_`) } 491 492 // Combines the words via "-". 493 func (self Words) Kebab() string { return self.Join(`-`) } 494 495 // Combines the words via ",". 496 func (self Words) Comma() string { return self.Join(`,`) } 497 498 // Combines the words via "|". 499 func (self Words) Piped() string { return self.Join(`|`) } 500 501 // Converts each word to lowercase. Mutates and returns the receiver. 502 func (self Words) Lower() Words { return MapMut(self, strings.ToLower) } 503 504 // Converts each word to UPPERCASE. Mutates and returns the receiver. 505 func (self Words) Upper() Words { return MapMut(self, strings.ToUpper) } 506 507 // Converts each word to Titlecase. Mutates and returns the receiver. 
508 func (self Words) Title() Words { 509 //nolint:staticcheck 510 return MapMut(self, strings.Title) 511 } 512 513 /* 514 Converts the first word to Titlecase and each other word to lowercase. Mutates 515 and returns the receiver. 516 */ 517 func (self Words) Sentence() Words { 518 //nolint:staticcheck 519 return self.MapHead(strings.Title).MapTail(strings.ToLower) 520 } 521 522 /* 523 Converts the first word to lowercase and each other word to Titlecase. Mutates 524 and returns the receiver. 525 */ 526 func (self Words) Camel() Words { 527 //nolint:staticcheck 528 return self.MapHead(strings.ToLower).MapTail(strings.Title) 529 } 530 531 // Same as `strings.Join`. 532 func (self Words) Join(val string) string { return strings.Join(self, val) } 533 534 /* 535 Mutates the receiver by replacing the first element with the result of calling 536 the given function on that element. If the receiver is empty, this is a nop. 537 */ 538 func (self Words) MapHead(fun func(string) string) Words { 539 if fun != nil && len(self) > 0 { 540 self[0] = fun(self[0]) 541 } 542 return self 543 } 544 545 /* 546 Mutates the receiver by replacing elements, other than the first, with the 547 results of the given function. 548 */ 549 func (self Words) MapTail(fun func(string) string) Words { 550 if len(self) > 0 { 551 MapMut(self[1:], fun) 552 } 553 return self 554 } 555 556 // Uses `utf8.RuneCountInString` to count chars in arbitrary text. 557 func CharCount[A Text](val A) int { 558 return utf8.RuneCountInString(ToString(val)) 559 } 560 561 /* 562 Similar to `src[start:end]`, but instead of slicing text at byte positions, 563 slices text at character positions. Similar to `string([]rune(src)[start:end])`, 564 but slightly more performant and more permissive. 
565 */ 566 func TextCut[A Text](src A, start, end int) (_ A) { 567 if !(end > start) { 568 return 569 } 570 571 startInd := 0 572 endInd := len(src) 573 charInd := 0 574 575 for byteInd := range ToString(src) { 576 if charInd == start { 577 startInd = byteInd 578 } 579 if charInd == end { 580 endInd = byteInd 581 break 582 } 583 charInd++ 584 } 585 586 return src[startInd:endInd] 587 } 588 589 /* 590 Truncates text to the given count of Unicode characters (not bytes). The limit 591 can't exceed `math.MaxInt`. Also see `TextTruncWith` which is more general. 592 */ 593 func TextTrunc[A Text](src A, limit uint) (_ A) { 594 return TextTruncWith(src, Zero[A](), limit) 595 } 596 597 /* 598 Shortcut for `TextTruncWith(src, "…")`. Truncates the given text to the given total 599 count of Unicode characters with an ellipsis. 600 */ 601 func TextEllipsis[A Text](src A, limit uint) A { 602 return TextTruncWith(src, ToText[A](`…`), limit) 603 } 604 605 /* 606 Truncates the given text to the given total count of Unicode characters 607 (not bytes) with a suffix. If the text is under the limit, it's returned 608 unchanged, otherwise it's truncated and the given suffix is appended. The total 609 count includes the character count of the given suffix string. The limit can't 610 exceed `math.MaxInt`. Also see shortcut `TextEllipsis` which uses this with the 611 ellipsis character '…'. 612 */ 613 func TextTruncWith[A Text](src, suf A, limit uint) A { 614 if limit == 0 { 615 return Zero[A]() 616 } 617 618 lim := safeUintToInt(limit) 619 sufCharLen := CharCount(suf) 620 str := ToString(src) 621 prevInd := 0 622 nextInd := 0 623 charInd := 0 624 625 for nextInd = range str { 626 if charInd+sufCharLen > lim { 627 return ToText[A](str[:prevInd] + ToString(suf)) 628 } 629 prevInd = nextInd 630 charInd++ 631 } 632 return src 633 }