github.com/primecitizens/pcz/std@v0.2.1/text/unicode/letter.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright 2023 The Prime Citizens 3 // 4 // Copyright 2009 The Go Authors. All rights reserved. 5 // Use of this source code is governed by a BSD-style 6 // license that can be found in the LICENSE file. 7 8 // Package unicode provides data and functions to test some properties of 9 // Unicode code points. 10 package unicode 11 12 import ( 13 "github.com/primecitizens/pcz/std/text/unicode/common" 14 ) 15 16 const ( 17 MaxRune = common.MaxRune // Maximum valid Unicode code point. 18 ReplacementChar = common.RuneError // Represents invalid code points. 19 MaxASCII = '\u007F' // maximum ASCII value. 20 MaxLatin1 = '\u00FF' // maximum Latin-1 value. 21 ) 22 23 // RangeTable defines a set of Unicode code points by listing the ranges of 24 // code points within the set. The ranges are listed in two slices 25 // to save space: a slice of 16-bit ranges and a slice of 32-bit ranges. 26 // The two slices must be in sorted order and non-overlapping. 27 // Also, R32 should contain only values >= 0x10000 (1<<16). 28 type RangeTable struct { 29 R16 []Range16 30 R32 []Range32 31 LatinOffset int // number of entries in R16 with Hi <= MaxLatin1 32 } 33 34 // Range16 represents of a range of 16-bit Unicode code points. The range runs from Lo to Hi 35 // inclusive and has the specified stride. 36 type Range16 struct { 37 Lo uint16 38 Hi uint16 39 Stride uint16 40 } 41 42 // Range32 represents of a range of Unicode code points and is used when one or 43 // more of the values will not fit in 16 bits. The range runs from Lo to Hi 44 // inclusive and has the specified stride. Lo and Hi must always be >= 1<<16. 45 type Range32 struct { 46 Lo uint32 47 Hi uint32 48 Stride uint32 49 } 50 51 // CaseRange represents a range of Unicode code points for simple (one 52 // code point to one code point) case conversion. 53 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas 54 // are the number to add to the code point to reach the code point for a 55 // different case for that character. They may be negative. If zero, it 56 // means the character is in the corresponding case. There is a special 57 // case representing sequences of alternating corresponding Upper and Lower 58 // pairs. It appears with a fixed Delta of 59 // 60 // {UpperLower, UpperLower, UpperLower} 61 // 62 // The constant UpperLower has an otherwise impossible delta value. 63 type CaseRange struct { 64 Lo uint32 65 Hi uint32 66 Delta d 67 } 68 69 // SpecialCase represents language-specific case mappings such as Turkish. 70 // Methods of SpecialCase customize (by overriding) the standard mappings. 71 type SpecialCase []CaseRange 72 73 // BUG(r): There is no mechanism for full case folding, that is, for 74 // characters that involve multiple runes in the input or output. 75 76 // Indices into the Delta arrays inside CaseRanges for case mapping. 77 const ( 78 UpperCase = iota 79 LowerCase 80 TitleCase 81 MaxCase 82 ) 83 84 type d [MaxCase]rune // to make the CaseRanges text shorter 85 86 // If the Delta field of a CaseRange is UpperLower, it means 87 // this CaseRange represents a sequence of the form (say) 88 // Upper Lower Upper Lower. 89 const ( 90 UpperLower = MaxRune + 1 // (Cannot be a valid delta.) 91 ) 92 93 // linearMax is the maximum size table for linear search for non-Latin1 rune. 94 // Derived by running 'go test -calibrate'. 95 const linearMax = 18 96 97 // is16 reports whether r is in the sorted slice of 16-bit ranges. 98 func is16(ranges []Range16, r uint16) bool { 99 if len(ranges) <= linearMax || r <= MaxLatin1 { 100 for i := range ranges { 101 range_ := &ranges[i] 102 if r < range_.Lo { 103 return false 104 } 105 if r <= range_.Hi { 106 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 107 } 108 } 109 return false 110 } 111 112 // binary search over ranges 113 lo := 0 114 hi := len(ranges) 115 for lo < hi { 116 m := lo + (hi-lo)/2 117 range_ := &ranges[m] 118 if range_.Lo <= r && r <= range_.Hi { 119 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 120 } 121 if r < range_.Lo { 122 hi = m 123 } else { 124 lo = m + 1 125 } 126 } 127 return false 128 } 129 130 // is32 reports whether r is in the sorted slice of 32-bit ranges. 131 func is32(ranges []Range32, r uint32) bool { 132 if len(ranges) <= linearMax { 133 for i := range ranges { 134 range_ := &ranges[i] 135 if r < range_.Lo { 136 return false 137 } 138 if r <= range_.Hi { 139 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 140 } 141 } 142 return false 143 } 144 145 // binary search over ranges 146 lo := 0 147 hi := len(ranges) 148 for lo < hi { 149 m := lo + (hi-lo)/2 150 range_ := ranges[m] 151 if range_.Lo <= r && r <= range_.Hi { 152 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 153 } 154 if r < range_.Lo { 155 hi = m 156 } else { 157 lo = m + 1 158 } 159 } 160 return false 161 } 162 163 // Is reports whether the rune is in the specified table of ranges. 164 func Is(rangeTab *RangeTable, r rune) bool { 165 r16 := rangeTab.R16 166 // Compare as uint32 to correctly handle negative runes. 167 if len(r16) > 0 && uint32(r) <= uint32(r16[len(r16)-1].Hi) { 168 return is16(r16, uint16(r)) 169 } 170 r32 := rangeTab.R32 171 if len(r32) > 0 && r >= rune(r32[0].Lo) { 172 return is32(r32, uint32(r)) 173 } 174 return false 175 } 176 177 func isExcludingLatin(rangeTab *RangeTable, r rune) bool { 178 r16 := rangeTab.R16 179 // Compare as uint32 to correctly handle negative runes. 180 if off := rangeTab.LatinOffset; len(r16) > off && uint32(r) <= uint32(r16[len(r16)-1].Hi) { 181 return is16(r16[off:], uint16(r)) 182 } 183 r32 := rangeTab.R32 184 if len(r32) > 0 && r >= rune(r32[0].Lo) { 185 return is32(r32, uint32(r)) 186 } 187 return false 188 } 189 190 // IsUpper reports whether the rune is an upper case letter. 191 func IsUpper(r rune) bool { 192 // See comment in IsGraphic. 193 if uint32(r) <= MaxLatin1 { 194 return properties[uint8(r)]&pLmask == pLu 195 } 196 return isExcludingLatin(Upper, r) 197 } 198 199 // IsLower reports whether the rune is a lower case letter. 200 func IsLower(r rune) bool { 201 // See comment in IsGraphic. 202 if uint32(r) <= MaxLatin1 { 203 return properties[uint8(r)]&pLmask == pLl 204 } 205 return isExcludingLatin(Lower, r) 206 } 207 208 // IsTitle reports whether the rune is a title case letter. 209 func IsTitle(r rune) bool { 210 if r <= MaxLatin1 { 211 return false 212 } 213 return isExcludingLatin(Title, r) 214 } 215 216 // to maps the rune using the specified case mapping. 217 // It additionally reports whether caseRange contained a mapping for r. 218 func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) { 219 if _case < 0 || MaxCase <= _case { 220 return ReplacementChar, false // as reasonable an error as any 221 } 222 // binary search over ranges 223 lo := 0 224 hi := len(caseRange) 225 for lo < hi { 226 m := lo + (hi-lo)/2 227 cr := caseRange[m] 228 if rune(cr.Lo) <= r && r <= rune(cr.Hi) { 229 delta := cr.Delta[_case] 230 if delta > MaxRune { 231 // In an Upper-Lower sequence, which always starts with 232 // an UpperCase letter, the real deltas always look like: 233 // {0, 1, 0} UpperCase (Lower is next) 234 // {-1, 0, -1} LowerCase (Upper, Title are previous) 235 // The characters at even offsets from the beginning of the 236 // sequence are upper case; the ones at odd offsets are lower. 237 // The correct mapping can be done by clearing or setting the low 238 // bit in the sequence offset. 239 // The constants UpperCase and TitleCase are even while LowerCase 240 // is odd so we take the low bit from _case. 241 return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1)), true 242 } 243 return r + delta, true 244 } 245 if r < rune(cr.Lo) { 246 hi = m 247 } else { 248 lo = m + 1 249 } 250 } 251 return r, false 252 } 253 254 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase. 255 func To(_case int, r rune) rune { 256 r, _ = to(_case, r, CaseRanges) 257 return r 258 } 259 260 // ToUpper maps the rune to upper case. 261 func ToUpper(r rune) rune { 262 if r <= MaxASCII { 263 if 'a' <= r && r <= 'z' { 264 r -= 'a' - 'A' 265 } 266 return r 267 } 268 return To(UpperCase, r) 269 } 270 271 // ToLower maps the rune to lower case. 272 func ToLower(r rune) rune { 273 if r <= MaxASCII { 274 if 'A' <= r && r <= 'Z' { 275 r += 'a' - 'A' 276 } 277 return r 278 } 279 return To(LowerCase, r) 280 } 281 282 // ToTitle maps the rune to title case. 283 func ToTitle(r rune) rune { 284 if r <= MaxASCII { 285 if 'a' <= r && r <= 'z' { // title case is upper case for ASCII 286 r -= 'a' - 'A' 287 } 288 return r 289 } 290 return To(TitleCase, r) 291 } 292 293 // ToUpper maps the rune to upper case giving priority to the special mapping. 294 func (special SpecialCase) ToUpper(r rune) rune { 295 r1, hadMapping := to(UpperCase, r, []CaseRange(special)) 296 if r1 == r && !hadMapping { 297 r1 = ToUpper(r) 298 } 299 return r1 300 } 301 302 // ToTitle maps the rune to title case giving priority to the special mapping. 303 func (special SpecialCase) ToTitle(r rune) rune { 304 r1, hadMapping := to(TitleCase, r, []CaseRange(special)) 305 if r1 == r && !hadMapping { 306 r1 = ToTitle(r) 307 } 308 return r1 309 } 310 311 // ToLower maps the rune to lower case giving priority to the special mapping. 312 func (special SpecialCase) ToLower(r rune) rune { 313 r1, hadMapping := to(LowerCase, r, []CaseRange(special)) 314 if r1 == r && !hadMapping { 315 r1 = ToLower(r) 316 } 317 return r1 318 } 319 320 // caseOrbit is defined in tables.go as []foldPair. Right now all the 321 // entries fit in uint16, so use uint16. If that changes, compilation 322 // will fail (the constants in the composite literal will not fit in uint16) 323 // and the types here can change to uint32. 324 type foldPair struct { 325 From uint16 326 To uint16 327 } 328 329 // SimpleFold iterates over Unicode code points equivalent under 330 // the Unicode-defined simple case folding. Among the code points 331 // equivalent to rune (including rune itself), SimpleFold returns the 332 // smallest rune > r if one exists, or else the smallest rune >= 0. 333 // If r is not a valid Unicode code point, SimpleFold(r) returns r. 334 // 335 // For example: 336 // 337 // SimpleFold('A') = 'a' 338 // SimpleFold('a') = 'A' 339 // 340 // SimpleFold('K') = 'k' 341 // SimpleFold('k') = '\u212A' (Kelvin symbol, K) 342 // SimpleFold('\u212A') = 'K' 343 // 344 // SimpleFold('1') = '1' 345 // 346 // SimpleFold(-2) = -2 347 func SimpleFold(r rune) rune { 348 if r < 0 || r > MaxRune { 349 return r 350 } 351 352 if int(r) < len(asciiFold) { 353 return rune(asciiFold[r]) 354 } 355 356 // Consult caseOrbit table for special cases. 357 lo := 0 358 hi := len(caseOrbit) 359 for lo < hi { 360 m := lo + (hi-lo)/2 361 if rune(caseOrbit[m].From) < r { 362 lo = m + 1 363 } else { 364 hi = m 365 } 366 } 367 if lo < len(caseOrbit) && rune(caseOrbit[lo].From) == r { 368 return rune(caseOrbit[lo].To) 369 } 370 371 // No folding specified. This is a one- or two-element 372 // equivalence class containing rune and ToLower(rune) 373 // and ToUpper(rune) if they are different from rune. 374 if l := ToLower(r); l != r { 375 return l 376 } 377 return ToUpper(r) 378 }