github.com/pgavlin/text@v0.0.0-20240419000839-8438d0a47805/utf8/utf8.go (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package utf8 implements functions and constants to support text encoded in
     6  // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
     7  // See https://en.wikipedia.org/wiki/UTF-8
     8  package utf8
     9  
    10  import (
    11  	"unicode/utf8"
    12  
    13  	"github.com/pgavlin/text/internal/bytealg"
    14  )
    15  
    16  // Numbers fundamental to the encoding.
    17  const (
    18  	RuneError = utf8.RuneError // the "error" Rune or "Unicode replacement character"
    19  	RuneSelf  = utf8.RuneSelf  // characters below RuneSelf are represented as themselves in a single byte.
    20  	MaxRune   = utf8.MaxRune   // Maximum valid Unicode code point.
    21  	UTFMax    = utf8.UTFMax    // maximum number of bytes of a UTF-8 encoded Unicode character.
    22  )
    23  
    24  // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
    25  // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
    26  func FullRune[S ~string | ~[]byte](p S) bool {
    27  	return utf8.FullRuneInString(bytealg.AsString(p))
    28  }
    29  
    30  // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
    31  // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
    32  // the encoding is invalid, it returns (RuneError, 1). Both are impossible
    33  // results for correct, non-empty UTF-8.
    34  //
    35  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    36  // out of range, or is not the shortest possible UTF-8 encoding for the
    37  // value. No other validation is performed.
    38  func DecodeRune[S ~string | ~[]byte](s S) (r rune, size int) {
    39  	return utf8.DecodeRuneInString(bytealg.AsString(s))
    40  }
    41  
    42  // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
    43  // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
    44  // the encoding is invalid, it returns (RuneError, 1). Both are impossible
    45  // results for correct, non-empty UTF-8.
    46  //
    47  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    48  // out of range, or is not the shortest possible UTF-8 encoding for the
    49  // value. No other validation is performed.
    50  func DecodeLastRune[S ~string | ~[]byte](s S) (r rune, size int) {
    51  	return utf8.DecodeLastRuneInString(bytealg.AsString(s))
    52  }
    53  
    54  // RuneLen returns the number of bytes required to encode the rune.
    55  // It returns -1 if the rune is not a valid value to encode in UTF-8.
    56  func RuneLen(r rune) int {
    57  	return utf8.RuneLen(r)
    58  }
    59  
    60  // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
    61  // If the rune is out of range, it writes the encoding of RuneError.
    62  // It returns the number of bytes written.
    63  func EncodeRune(p []byte, r rune) int {
    64  	return utf8.EncodeRune(p, r)
    65  }
    66  
    67  // AppendRune appends the UTF-8 encoding of r to the end of p and
    68  // returns the extended buffer. If the rune is out of range,
    69  // it appends the encoding of RuneError.
    70  func AppendRune(p []byte, r rune) []byte {
    71  	return utf8.AppendRune(p, r)
    72  }
    73  
    74  // RuneCount returns the number of runes in p. Erroneous and short
    75  // encodings are treated as single runes of width 1 byte.
    76  func RuneCount[S ~string | ~[]byte](s S) int {
    77  	return utf8.RuneCountInString(bytealg.AsString(s))
    78  }
    79  
    80  // RuneStart reports whether the byte could be the first byte of an encoded,
    81  // possibly invalid rune. Second and subsequent bytes always have the top two
    82  // bits set to 10.
    83  func RuneStart(b byte) bool {
    84  	return utf8.RuneStart(b)
    85  }
    86  
    87  // Valid reports whether p consists entirely of valid UTF-8-encoded runes.
    88  func Valid[S ~string | ~[]byte](s S) bool {
    89  	return utf8.ValidString(bytealg.AsString(s))
    90  }
    91  
    92  // ValidRune reports whether r can be legally encoded as UTF-8.
    93  // Code points that are out of range or a surrogate half are illegal.
    94  func ValidRune(r rune) bool {
    95  	return utf8.ValidRune(r)
    96  }