github.com/pgavlin/text@v0.0.0-20240419000839-8438d0a47805/utf8/utf8.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package utf8 implements functions and constants to support text encoded in 6 // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences. 7 // See https://en.wikipedia.org/wiki/UTF-8 8 package utf8 9 10 import ( 11 "unicode/utf8" 12 13 "github.com/pgavlin/text/internal/bytealg" 14 ) 15 16 // Numbers fundamental to the encoding. 17 const ( 18 RuneError = utf8.RuneError // the "error" Rune or "Unicode replacement character" 19 RuneSelf = utf8.RuneSelf // characters below RuneSelf are represented as themselves in a single byte. 20 MaxRune = utf8.MaxRune // Maximum valid Unicode code point. 21 UTFMax = utf8.UTFMax // maximum number of bytes of a UTF-8 encoded Unicode character. 22 ) 23 24 // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune. 25 // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune. 26 func FullRune[S ~string | ~[]byte](p S) bool { 27 return utf8.FullRuneInString(bytealg.AsString(p)) 28 } 29 30 // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and 31 // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if 32 // the encoding is invalid, it returns (RuneError, 1). Both are impossible 33 // results for correct, non-empty UTF-8. 34 // 35 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 36 // out of range, or is not the shortest possible UTF-8 encoding for the 37 // value. No other validation is performed. 38 func DecodeRune[S ~string | ~[]byte](s S) (r rune, size int) { 39 return utf8.DecodeRuneInString(bytealg.AsString(s)) 40 } 41 42 // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and 43 // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if 44 // the encoding is invalid, it returns (RuneError, 1). Both are impossible 45 // results for correct, non-empty UTF-8. 46 // 47 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 48 // out of range, or is not the shortest possible UTF-8 encoding for the 49 // value. No other validation is performed. 50 func DecodeLastRune[S ~string | ~[]byte](s S) (r rune, size int) { 51 return utf8.DecodeLastRuneInString(bytealg.AsString(s)) 52 } 53 54 // RuneLen returns the number of bytes required to encode the rune. 55 // It returns -1 if the rune is not a valid value to encode in UTF-8. 56 func RuneLen(r rune) int { 57 return utf8.RuneLen(r) 58 } 59 60 // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune. 61 // If the rune is out of range, it writes the encoding of RuneError. 62 // It returns the number of bytes written. 63 func EncodeRune(p []byte, r rune) int { 64 return utf8.EncodeRune(p, r) 65 } 66 67 // AppendRune appends the UTF-8 encoding of r to the end of p and 68 // returns the extended buffer. If the rune is out of range, 69 // it appends the encoding of RuneError. 70 func AppendRune(p []byte, r rune) []byte { 71 return utf8.AppendRune(p, r) 72 } 73 74 // RuneCount returns the number of runes in p. Erroneous and short 75 // encodings are treated as single runes of width 1 byte. 76 func RuneCount[S ~string | ~[]byte](s S) int { 77 return utf8.RuneCountInString(bytealg.AsString(s)) 78 } 79 80 // RuneStart reports whether the byte could be the first byte of an encoded, 81 // possibly invalid rune. Second and subsequent bytes always have the top two 82 // bits set to 10. 83 func RuneStart(b byte) bool { 84 return utf8.RuneStart(b) 85 } 86 87 // Valid reports whether p consists entirely of valid UTF-8-encoded runes. 88 func Valid[S ~string | ~[]byte](s S) bool { 89 return utf8.ValidString(bytealg.AsString(s)) 90 } 91 92 // ValidRune reports whether r can be legally encoded as UTF-8. 93 // Code points that are out of range or a surrogate half are illegal. 94 func ValidRune(r rune) bool { 95 return utf8.ValidRune(r) 96 }