github.com/mitranim/gg@v0.1.17/text.go

github.com/mitranim/gg@v0.1.17/text.go (about)

     1  package gg
     2  
     3  import (
     4  	"regexp"
     5  	"strings"
     6  	"unicode"
     7  	"unicode/utf8"
     8  	u "unsafe"
     9  )
    10  
    11  /*
    12  Same as `len`. Limited to `Text` types but can be passed to higher-order
    13  functions.
    14  */
    15  func TextLen[A Text](val A) int { return len(val) }
    16  
    17  // True if len <= 0. Inverse of `IsTextNotEmpty`.
    18  func IsTextEmpty[A Text](val A) bool { return len(val) <= 0 }
    19  
    20  // True if len > 0. Inverse of `IsTextEmpty`.
    21  func IsTextNotEmpty[A Text](val A) bool { return len(val) > 0 }
    22  
    23  // Returns the first byte or 0.
    24  func TextHeadByte[A Text](val A) byte {
    25  	if len(val) > 0 {
    26  		return val[0]
    27  	}
    28  	return 0
    29  }
    30  
    31  // Returns the last byte or 0.
    32  func TextLastByte[A Text](val A) byte {
    33  	if len(val) > 0 {
    34  		return val[len(val)-1]
    35  	}
    36  	return 0
    37  }
    38  
    39  /*
    40  Like `utf8.DecodeRuneInString`, but faster at the time of writing, and without
    41  `utf8.RuneError`. On decoding error, the result is `(0, 0)`.
    42  */
    43  func TextHeadChar[A Text](src A) (char rune, size int) {
    44  	for ind, val := range ToText[string](src) {
    45  		if ind == 0 {
    46  			char = val
    47  			size = len(src)
    48  		} else {
    49  			size = ind
    50  			break
    51  		}
    52  	}
    53  	return
    54  }
    55  
    56  /*
    57  True if the inputs would be `==` if compared as strings. When used on typedefs
    58  of `[]byte`, this is the same as `bytes.Equal`.
    59  */
    60  func TextEq[A Text](one, two A) bool { return ToString(one) == ToString(two) }
    61  
    62  /*
    63  Similar to `unsafe.StringData`, but takes arbitrary text as input. Returns the
    64  pointer to the first byte of the underlying data array for the given string or
    65  byte slice. Use caution. Mutating the underlying data may trigger segfaults or
    66  cause undefined behavior.
    67  */
    68  func TextDat[A Text](val A) *byte { return CastUnsafe[*byte](val) }
    69  
    70  /*
    71  Implementation note. We could write `TextDat` as following, but it would not be
    72  an improvement, because it still makes assumptions about the underlying
    73  structure of the data, specifically it assumes that strings and byte slices
    74  have a different width. At the time of writing, Go doesn't seem to provide a
    75  safe and free way to detect if we have `~string` or `~[]byte`. A type switch on
    76  `any(src)` works only for core types such as `string`, but not for typedefs
    77  conforming to `~string` and `~[]byte`. Alternatives involve overheads such as
    78  calling interface methods of `reflect.Type`, which would stop this function
    79  from being a free cast.
    80  
    81  	func TextDat[A Text](src A) *byte {
    82  		if u.Sizeof(src) == SizeofString {
    83  			return u.StringData(string(src))
    84  		}
    85  		if u.Sizeof(src) == SizeofSlice {
    86  			return u.SliceData([]byte(src))
    87  		}
    88  		panic(`unreachable`)
    89  	}
    90  */
    91  
/*
Allocation-free conversion between two types conforming to the `Text`
constraint, typically variants of `string` and/or `[]byte`. The output is
produced by reinterpreting the input via `CastUnsafe` rather than by copying.
The caveats documented on `ToString` and `ToBytes` presumably apply here as
well: bytes obtained from a string must not be mutated, and a string obtained
from bytes is affected by later mutation of those bytes.
*/
func ToText[Out, Src Text](src Src) Out {
	// Reinterpret the source header as the output type, without copying data.
	out := CastUnsafe[Out](src)

	/**
	Implementation note. We could also write the condition as shown below:

		Kind[Src]() == r.String && Kind[Out]() == r.Slice

	But the above would be measurably slower than the unsafe trick.
	In addition, sizeof lets us ensure that the target can be cast into
	`SliceHeader` without affecting other memory.
	*/
	// When a string-sized source was cast into a slice-header-sized output, set
	// the capacity explicitly to the length. NOTE(review): presumably because a
	// string header carries no capacity, so the value found there after the cast
	// is not meaningful; confirm against `CastUnsafe` and `SliceHeader`.
	if u.Sizeof(src) == SizeofString && u.Sizeof(out) == SizeofSliceHeader {
		CastUnsafe[*SliceHeader](&out).Cap = len(out)
	}

	return out
}
   114  
   115  /*
   116  Allocation-free conversion. Reinterprets arbitrary text as a string. If the
   117  string is used with an API that relies on string immutability, for example as a
   118  map key, the source memory must not be mutated afterwards.
   119  */
   120  func ToString[A Text](val A) string { return CastUnsafe[string](val) }
   121  
   122  /*
   123  Implementation note. `ToString` could be written as shown below. This passes our
   124  test, but runs marginally slower than our current implementation, and does not
   125  improve correctness, because `TextDat` also makes assumptions about the
   126  underlying structure of the string header.
   127  
   128  	func ToString[A Text](val A) string { return u.String(TextDat(val), len(val)) }
   129  */
   130  
   131  /*
   132  Allocation-free conversion. Reinterprets arbitrary text as bytes. If the source
   133  was a string, the output must NOT be mutated. Mutating memory that belongs to a
   134  string may produce segfaults or undefined behavior.
   135  */
   136  func ToBytes[A Text](val A) []byte { return u.Slice(TextDat(val), len(val)) }
   137  
   138  /*
   139  Converts arguments to strings and concatenates the results. See `StringCatch`
   140  for the encoding rules. Also see `JoinDense` for a simpler version that doesn't
   141  involve `any`.
   142  */
   143  func Str(src ...any) string { return JoinAny(src, ``) }
   144  
   145  /*
   146  Similar to `Str`. Concatenates string representations of the input values.
   147  Additionally, if the output is non-empty and doesn't end with a newline
   148  character, appends `Newline` at the end.
   149  */
   150  func Strln(src ...any) string {
   151  	switch len(src) {
   152  	case 0:
   153  		return ``
   154  
   155  	case 1:
   156  		return AppendNewlineOpt(String(src[0]))
   157  
   158  	default:
   159  		var buf Buf
   160  		buf.AppendAnysln(src...)
   161  		return buf.String()
   162  	}
   163  }
   164  
   165  /*
   166  Converts arguments to strings and joins the results with a single space. See
   167  `StringCatch` for encoding rules. Also see `JoinSpaced` for a more limited but
   168  more efficient version that doesn't involve `any`.
   169  */
   170  func Spaced(src ...any) string { return JoinAny(src, Space) }
   171  
   172  /*
   173  Converts arguments to strings and joins the results with a single space,
   174  ignoring empty strings. See `StringCatch` for the encoding rules. Also see
   175  `JoinSpacedOpt` for a more limited but more efficient version that doesn't
   176  involve `any`.
   177  */
   178  func SpacedOpt(src ...any) string { return JoinAnyOpt(src, Space) }
   179  
   180  /*
   181  Similar to `strings.Join` but takes `[]any`, converting elements to strings. See
   182  `StringCatch` for the encoding rules. Also see `Join`, `JoinOpt`,
   183  `JoinAnyOpt`.
   184  */
   185  func JoinAny(src []any, sep string) string {
   186  	switch len(src) {
   187  	case 0:
   188  		return ``
   189  
   190  	case 1:
   191  		return String(src[0])
   192  
   193  	default:
   194  		var buf Buf
   195  		for ind, src := range src {
   196  			if ind > 0 {
   197  				buf.AppendString(sep)
   198  			}
   199  			buf.AppendAny(src)
   200  		}
   201  		return buf.String()
   202  	}
   203  }
   204  
   205  // Like `JoinAny` but ignores empty strings.
   206  func JoinAnyOpt(src []any, sep string) string {
   207  	switch len(src) {
   208  	case 0:
   209  		return ``
   210  
   211  	case 1:
   212  		return String(src[0])
   213  
   214  	default:
   215  		var buf Buf
   216  
   217  		for ind, src := range src {
   218  			len0 := buf.Len()
   219  			if ind > 0 {
   220  				buf.AppendString(sep)
   221  			}
   222  			len1 := buf.Len()
   223  
   224  			buf.AppendAny(src)
   225  
   226  			if ind > 0 && buf.Len() == len1 {
   227  				buf.TruncLen(len0)
   228  			}
   229  		}
   230  
   231  		return buf.String()
   232  	}
   233  }
   234  
   235  // Concatenates the given text without any separators.
   236  func JoinDense[A Text](val ...A) string { return Join(val, ``) }
   237  
   238  // Joins the given strings with a space.
   239  func JoinSpaced[A Text](val ...A) string { return Join(val, Space) }
   240  
   241  // Joins non-empty strings with a space.
   242  func JoinSpacedOpt[A Text](val ...A) string { return JoinOpt(val, Space) }
   243  
   244  // Joins the given strings with newlines.
   245  func JoinLines[A Text](val ...A) string { return Join(val, Newline) }
   246  
   247  // Joins non-empty strings with newlines.
   248  func JoinLinesOpt[A Text](val ...A) string { return JoinOpt(val, Newline) }
   249  
   250  /*
   251  Similar to `strings.Join` but works on any input compatible with the `Text`
   252  interface. Also see `JoinOpt`, `JoinAny`, `JoinAnyOpt`.
   253  */
   254  func Join[A Text](src []A, sep string) string {
   255  	switch len(src) {
   256  	case 0:
   257  		return ``
   258  
   259  	case 1:
   260  		return ToString(src[0])
   261  
   262  	default:
   263  		var buf Buf
   264  		buf.GrowCap(Sum(src, TextLen[A]) + (len(sep) * (len(src) - 1)))
   265  
   266  		buf.AppendString(ToString(src[0]))
   267  		for _, src := range src[1:] {
   268  			buf.AppendString(sep)
   269  			buf.AppendString(ToString(src))
   270  		}
   271  		return buf.String()
   272  	}
   273  }
   274  
   275  /*
   276  Similar to `strings.Join` but works for any input compatible with the `Text`
   277  interface and ignores empty strings.
   278  */
   279  func JoinOpt[A Text](src []A, sep string) string {
   280  	switch len(src) {
   281  	case 0:
   282  		return ``
   283  
   284  	case 1:
   285  		return ToString(src[0])
   286  
   287  	default:
   288  		var size int
   289  		for _, src := range src {
   290  			wid := len(src)
   291  			if wid > 0 {
   292  				size = size + wid + len(sep)
   293  			}
   294  		}
   295  
   296  		var buf Buf
   297  		buf.GrowCap(size)
   298  
   299  		var found bool
   300  		for _, src := range src {
   301  			if len(src) > 0 {
   302  				if found {
   303  					buf.AppendString(sep)
   304  				}
   305  				found = true
   306  				buf = append(buf, src...)
   307  			}
   308  		}
   309  		return buf.String()
   310  	}
   311  }
   312  
   313  /*
   314  Similar to `strings.SplitN` for N = 1. More efficient: returns a tuple instead
   315  of allocating a slice. Safer: returns zero values if split doesn't succeed.
   316  */
   317  func Split2[A Text](src A, sep string) (A, A) {
   318  	ind := strings.Index(ToString(src), sep)
   319  	if ind >= 0 {
   320  		return src[:ind], src[ind+len(sep):]
   321  	}
   322  	return src, Zero[A]()
   323  }
   324  
   325  /*
   326  Splits the given text into lines. The resulting strings do not contain any
   327  newline characters. If the input is empty, the output is empty. Avoids
   328  information loss: preserves empty lines, allowing the caller to transform and
   329  join the lines without losing blanks. The following sequences are considered
   330  newlines: "\r\n", "\r", "\n".
   331  */
   332  func SplitLines[A Text](src A) []A {
   333  	/**
   334  	In our benchmark in Go 1.20.2, this runs about 20-30 times faster than the
   335  	equivalent regexp-based implementation.
   336  
   337  	It would be much simpler to use `strings.FieldsFunc` and `bytes.FieldsFunc`,
   338  	but they would elide empty lines, losing information and making this
   339  	non-reversible. They would also be about 2 times slower.
   340  
   341  	TODO simpler implementation.
   342  	*/
   343  
   344  	var out []A
   345  	var prev int
   346  	var next int
   347  	max := len(src)
   348  
   349  	/**
   350  	Iterating bytes is significantly faster than runes, and in valid UTF-8 it's
   351  	not possible to encounter '\r' or '\n' in multi-byte characters, making this
   352  	safe for valid text.
   353  	*/
   354  	for next < max {
   355  		char := src[next]
   356  
   357  		if char == '\r' && next < len(src)-1 && src[next+1] == '\n' {
   358  			out = append(out, src[prev:next])
   359  			next = next + 2
   360  			prev = next
   361  			continue
   362  		}
   363  
   364  		if char == '\n' || char == '\r' {
   365  			out = append(out, src[prev:next])
   366  			next++
   367  			prev = next
   368  			continue
   369  		}
   370  
   371  		next++
   372  	}
   373  
   374  	if next > 0 {
   375  		out = append(out, src[prev:next])
   376  	}
   377  	return out
   378  }
   379  
   380  /*
   381  Similar to `SplitLines`, but splits only on the first newline occurrence,
   382  returning the first line and the remainder, plus the number of bytes in the
   383  elided line separator. The following sequences are considered newlines:
   384  "\r\n", "\r", "\n".
   385  */
   386  func SplitLines2[A Text](src A) (A, A, int) {
   387  	size := len(src)
   388  	limit := size - 1
   389  
   390  	for ind, char := range ToString(src) {
   391  		if char == '\r' {
   392  			if ind < limit && src[ind+1] == '\n' {
   393  				return src[:ind], src[ind+2:], 2
   394  			}
   395  			return src[:ind], src[ind+1:], 1
   396  		}
   397  		if char == '\n' {
   398  			return src[:ind], src[ind+1:], 1
   399  		}
   400  	}
   401  	return src, Zero[A](), 0
   402  }
   403  
   404  /*
   405  Searches for the given separator and returns the part of the text before the
   406  separator, removing that prefix from the original text referenced by the
   407  pointer. The separator is excluded from both chunks. As a special case, if the
   408  separator is empty, pops the entire source text.
   409  */
   410  func TextPop[Src, Sep Text](ptr *Src, sep Sep) Src {
   411  	if ptr == nil {
   412  		return Zero[Src]()
   413  	}
   414  
   415  	src := *ptr
   416  
   417  	if len(sep) == 0 {
   418  		PtrClear(ptr)
   419  		return src
   420  	}
   421  
   422  	ind := strings.Index(ToString(src), ToString(sep))
   423  	if !(ind >= 0 && ind < len(src)) {
   424  		PtrClear(ptr)
   425  		return src
   426  	}
   427  
   428  	*ptr = src[ind+len(sep):]
   429  	return src[:ind]
   430  }
   431  
   432  // True if the string ends with a line feed or carriage return.
   433  func HasNewlineSuffix[A Text](src A) bool {
   434  	return isByteNewline(TextLastByte(src))
   435  }
   436  
   437  /*
   438  If the given text is non-empty and does not end with a newline character,
   439  appends `Newline` and returns the result. Otherwise returns the text unchanged.
   440  If the input type is a typedef of `[]byte` and has enough capacity, it's
   441  mutated. In other cases, the text is reallocated. Also see
   442  `Buf.AppendNewlineOpt` and `Strln`.
   443  */
   444  func AppendNewlineOpt[A Text](val A) A {
   445  	if len(val) > 0 && !HasNewlineSuffix(val) {
   446  		return ToText[A](append([]byte(val), Newline...))
   447  	}
   448  	return val
   449  }
   450  
   451  // Missing/private half of `strings.TrimSpace`. Trims only the prefix.
   452  func TrimSpacePrefix[A Text](src A) A {
   453  	return ToText[A](strings.TrimLeftFunc(ToString(src), unicode.IsSpace))
   454  }
   455  
   456  // Missing/private half of `strings.TrimSpace`. Trims only the suffix.
   457  func TrimSpaceSuffix[A Text](src A) A {
   458  	return ToText[A](strings.TrimRightFunc(ToString(src), unicode.IsSpace))
   459  }
   460  
   461  /*
   462  Regexp for splitting arbitrary text into words, Unicode-aware. Used by
   463  `ToWords`.
   464  */
   465  var ReWord = NewLazy(func() *regexp.Regexp {
   466  	return regexp.MustCompile(`\p{Lu}+[\p{Ll}\d]*|[\p{Ll}\d]+`)
   467  })
   468  
   469  /*
   470  Splits arbitrary text into words, Unicode-aware. Suitable for conversion between
   471  typographic cases such as `camelCase` and `snake_case`.
   472  */
   473  func ToWords[A Text](val A) Words {
   474  	return ReWord.Get().FindAllString(ToString(val), -1)
   475  }
   476  
   477  /*
   478  Tool for converting between typographic cases such as `camelCase` and
   479  `snake_case`.
   480  */
   481  type Words []string
   482  
   483  // Combines the words via "".
   484  func (self Words) Dense() string { return self.Join(``) }
   485  
   486  // Combines the words via " ".
   487  func (self Words) Spaced() string { return self.Join(` `) }
   488  
   489  // Combines the words via "_".
   490  func (self Words) Snake() string { return self.Join(`_`) }
   491  
   492  // Combines the words via "-".
   493  func (self Words) Kebab() string { return self.Join(`-`) }
   494  
   495  // Combines the words via ",".
   496  func (self Words) Comma() string { return self.Join(`,`) }
   497  
   498  // Combines the words via "|".
   499  func (self Words) Piped() string { return self.Join(`|`) }
   500  
   501  // Converts each word to lowercase. Mutates and returns the receiver.
   502  func (self Words) Lower() Words { return MapMut(self, strings.ToLower) }
   503  
   504  // Converts each word to UPPERCASE. Mutates and returns the receiver.
   505  func (self Words) Upper() Words { return MapMut(self, strings.ToUpper) }
   506  
   507  // Converts each word to Titlecase. Mutates and returns the receiver.
   508  func (self Words) Title() Words {
   509  	//nolint:staticcheck
   510  	return MapMut(self, strings.Title)
   511  }
   512  
   513  /*
   514  Converts the first word to Titlecase and each other word to lowercase. Mutates
   515  and returns the receiver.
   516  */
   517  func (self Words) Sentence() Words {
   518  	//nolint:staticcheck
   519  	return self.MapHead(strings.Title).MapTail(strings.ToLower)
   520  }
   521  
   522  /*
   523  Converts the first word to lowercase and each other word to Titlecase. Mutates
   524  and returns the receiver.
   525  */
   526  func (self Words) Camel() Words {
   527  	//nolint:staticcheck
   528  	return self.MapHead(strings.ToLower).MapTail(strings.Title)
   529  }
   530  
   531  // Same as `strings.Join`.
   532  func (self Words) Join(val string) string { return strings.Join(self, val) }
   533  
   534  /*
   535  Mutates the receiver by replacing the first element with the result of calling
   536  the given function on that element. If the receiver is empty, this is a nop.
   537  */
   538  func (self Words) MapHead(fun func(string) string) Words {
   539  	if fun != nil && len(self) > 0 {
   540  		self[0] = fun(self[0])
   541  	}
   542  	return self
   543  }
   544  
   545  /*
   546  Mutates the receiver by replacing elements, other than the first, with the
   547  results of the given function.
   548  */
   549  func (self Words) MapTail(fun func(string) string) Words {
   550  	if len(self) > 0 {
   551  		MapMut(self[1:], fun)
   552  	}
   553  	return self
   554  }
   555  
   556  // Uses `utf8.RuneCountInString` to count chars in arbitrary text.
   557  func CharCount[A Text](val A) int {
   558  	return utf8.RuneCountInString(ToString(val))
   559  }
   560  
   561  /*
   562  Similar to `src[start:end]`, but instead of slicing text at byte positions,
   563  slices text at character positions. Similar to `string([]rune(src)[start:end])`,
   564  but slightly more performant and more permissive.
   565  */
   566  func TextCut[A Text](src A, start, end int) (_ A) {
   567  	if !(end > start) {
   568  		return
   569  	}
   570  
   571  	startInd := 0
   572  	endInd := len(src)
   573  	charInd := 0
   574  
   575  	for byteInd := range ToString(src) {
   576  		if charInd == start {
   577  			startInd = byteInd
   578  		}
   579  		if charInd == end {
   580  			endInd = byteInd
   581  			break
   582  		}
   583  		charInd++
   584  	}
   585  
   586  	return src[startInd:endInd]
   587  }
   588  
   589  /*
   590  Truncates text to the given count of Unicode characters (not bytes). The limit
   591  can't exceed `math.MaxInt`. Also see `TextTruncWith` which is more general.
   592  */
   593  func TextTrunc[A Text](src A, limit uint) (_ A) {
   594  	return TextTruncWith(src, Zero[A](), limit)
   595  }
   596  
   597  /*
   598  Shortcut for `TextTruncWith(src, "…")`. Truncates the given text to the given total
   599  count of Unicode characters with an ellipsis.
   600  */
   601  func TextEllipsis[A Text](src A, limit uint) A {
   602  	return TextTruncWith(src, ToText[A](`…`), limit)
   603  }
   604  
   605  /*
   606  Truncates the given text to the given total count of Unicode characters
   607  (not bytes) with a suffix. If the text is under the limit, it's returned
   608  unchanged, otherwise it's truncated and the given suffix is appended. The total
   609  count includes the character count of the given suffix string. The limit can't
   610  exceed `math.MaxInt`. Also see shortcut `TextEllipsis` which uses this with the
   611  ellipsis character '…'.
   612  */
   613  func TextTruncWith[A Text](src, suf A, limit uint) A {
   614  	if limit == 0 {
   615  		return Zero[A]()
   616  	}
   617  
   618  	lim := safeUintToInt(limit)
   619  	sufCharLen := CharCount(suf)
   620  	str := ToString(src)
   621  	prevInd := 0
   622  	nextInd := 0
   623  	charInd := 0
   624  
   625  	for nextInd = range str {
   626  		if charInd+sufCharLen > lim {
   627  			return ToText[A](str[:prevInd] + ToString(suf))
   628  		}
   629  		prevInd = nextInd
   630  		charInd++
   631  	}
   632  	return src
   633  }