github.com/primecitizens/pcz/std@v0.2.1/text/unicode/utf8/decode.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright 2023 The Prime Citizens 3 // 4 // Copyright 2009 The Go Authors. All rights reserved. 5 // Use of this source code is governed by a BSD-style 6 // license that can be found in the LICENSE file. 7 8 // Package utf8 implements functions and constants to support text encoded in 9 // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences. 10 // See https://en.wikipedia.org/wiki/UTF-8 11 package utf8 12 13 import ( 14 . "github.com/primecitizens/pcz/std/text/unicode/common" 15 ) 16 17 // Valid reports whether s consists entirely of valid UTF-8-encoded runes. 18 func Valid(s string) bool { 19 // Fast path. Check for and skip 8 bytes of ASCII characters per iteration. 20 for len(s) >= 8 { 21 // Combining two 32 bit loads allows the same code to be used 22 // for 32 and 64 bit platforms. 23 // The compiler can generate a 32bit load for first32 and second32 24 // on many platforms. See test/codegen/memcombine.go. 25 first32 := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24 26 second32 := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24 27 if (first32|second32)&0x80808080 != 0 { 28 // Found a non ASCII byte (>= RuneSelf). 29 break 30 } 31 s = s[8:] 32 } 33 34 for i, n := 0, len(s); i < n; { 35 si := s[i] 36 if si < RuneSelf { 37 i++ 38 continue 39 } 40 x := first[si] 41 if x == xx { 42 return false // Illegal starter byte. 43 } 44 size := int(x & 7) 45 if i+size > n { 46 return false // Short or invalid. 47 } 48 accept := acceptRanges[x>>4] 49 if c := s[i+1]; c < accept.lo || accept.hi < c { 50 return false 51 } else if size == 2 { 52 } else if c := s[i+2]; c < locb || hicb < c { 53 return false 54 } else if size == 3 { 55 } else if c := s[i+3]; c < locb || hicb < c { 56 return false 57 } 58 i += size 59 } 60 return true 61 } 62 63 // Count returns the number of runes in s. Erroneous and short 64 // encodings are treated as single runes of width 1 byte. 65 func Count(s string) (n int) { 66 for i := 0; i < len(s); n++ { 67 c := s[i] 68 if c < RuneSelf { 69 // ASCII fast path 70 i++ 71 continue 72 } 73 x := first[c] 74 if x == xx { 75 i++ // invalid. 76 continue 77 } 78 size := int(x & 7) 79 if i+size > len(s) { 80 i++ // Short or invalid. 81 continue 82 } 83 accept := acceptRanges[x>>4] 84 if c := s[i+1]; c < accept.lo || accept.hi < c { 85 size = 1 86 } else if size == 2 { 87 } else if c := s[i+2]; c < locb || hicb < c { 88 size = 1 89 } else if size == 3 { 90 } else if c := s[i+3]; c < locb || hicb < c { 91 size = 1 92 } 93 i += size 94 } 95 return n 96 } 97 98 // First unpacks the first UTF-8 encoding in s and returns the rune and 99 // its width in bytes. If p is empty it returns (RuneError, 0). 100 // 101 // Otherwise, if the encoding is invalid, it returns 102 // (RuneError, 1). Both are impossible results for correct, 103 // non-empty UTF-8. 104 // 105 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 106 // out of range, or is not the shortest possible UTF-8 encoding for the 107 // value. No other validation is performed. 108 func First(s string) (r rune, size int) { 109 if len(s) == 0 { 110 return RuneError, 0 111 } 112 s0 := s[0] 113 x := first[s0] 114 if x >= as { 115 // The following code simulates an additional check for x == xx and 116 // handling the ASCII and invalid cases accordingly. This mask-and-or 117 // approach prevents an additional branch. 118 mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF. 119 return rune(s[0])&^mask | RuneError&mask, 1 120 } 121 sz := int(x & 7) 122 accept := acceptRanges[x>>4] 123 if len(s) < sz { 124 return RuneError, 1 125 } 126 s1 := s[1] 127 if s1 < accept.lo || accept.hi < s1 { 128 return RuneError, 1 129 } 130 if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks 131 return rune(s0&mask2)<<6 | rune(s1&maskx), 2 132 } 133 s2 := s[2] 134 if s2 < locb || hicb < s2 { 135 return RuneError, 1 136 } 137 if sz <= 3 { 138 return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3 139 } 140 s3 := s[3] 141 if s3 < locb || hicb < s3 { 142 return RuneError, 1 143 } 144 return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4 145 } 146 147 // Last unpacks the last UTF-8 encoding in p and returns the rune and 148 // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if 149 // the encoding is invalid, it returns (RuneError, 1). Both are impossible 150 // results for correct, non-empty UTF-8. 151 // 152 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 153 // out of range, or is not the shortest possible UTF-8 encoding for the 154 // value. No other validation is performed. 155 func Last(s string) (r rune, size int) { 156 end := len(s) 157 if end == 0 { 158 return RuneError, 0 159 } 160 start := end - 1 161 r = rune(s[start]) 162 if r < RuneSelf { 163 return r, 1 164 } 165 // guard against O(n^2) behavior when traversing 166 // backwards through strings with long sequences of 167 // invalid UTF-8. 168 lim := end - MaxRuneLen 169 if lim < 0 { 170 lim = 0 171 } 172 for start--; start >= lim; start-- { 173 if RuneStart(s[start]) { 174 break 175 } 176 } 177 if start < 0 { 178 start = 0 179 } 180 r, size = First(s[start:end]) 181 if start+size != end { 182 return RuneError, 1 183 } 184 return r, size 185 } 186 187 // FullRune reports whether the bytes in s begin with a full UTF-8 encoding of a rune. 188 // 189 // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune. 190 func FullRune(s string) bool { 191 if len(s) == 0 { 192 return false 193 } 194 x := first[s[0]] 195 if len(s) >= int(x&7) { 196 return true // ASCII, invalid, or valid. 197 } 198 // Must be short or invalid. 199 accept := acceptRanges[x>>4] 200 if len(s) > 1 && (s[1] < accept.lo || accept.hi < s[1]) { 201 return true 202 } else if len(s) > 2 && (s[2] < locb || hicb < s[2]) { 203 return true 204 } 205 return false 206 } 207 208 // RuneStart reports whether the byte could be the first byte of an encoded, 209 // possibly invalid rune. Second and subsequent bytes always have the top two 210 // bits set to 10. 211 func RuneStart(b byte) bool { 212 return b&0xC0 != 0x80 213 } 214 215 // RuneValid reports whether r can be legally encoded as UTF-8. 216 // Code points that are out of range or a surrogate half are illegal. 217 func RuneValid(r rune) bool { 218 switch { 219 case 0 <= r && r < SurrogateMin, 220 SurrogateMax < r && r <= MaxRune: 221 return true 222 } 223 return false 224 }