github.com/ice-blockchain/go/src@v0.0.0-20240403114104-1564d284e521/unicode/letter.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package unicode provides data and functions to test some properties of 6 // Unicode code points. 7 package unicode 8 9 const ( 10 MaxRune = '\U0010FFFF' // Maximum valid Unicode code point. 11 ReplacementChar = '\uFFFD' // Represents invalid code points. 12 MaxASCII = '\u007F' // maximum ASCII value. 13 MaxLatin1 = '\u00FF' // maximum Latin-1 value. 14 ) 15 16 // RangeTable defines a set of Unicode code points by listing the ranges of 17 // code points within the set. The ranges are listed in two slices 18 // to save space: a slice of 16-bit ranges and a slice of 32-bit ranges. 19 // The two slices must be in sorted order and non-overlapping. 20 // Also, R32 should contain only values >= 0x10000 (1<<16). 21 type RangeTable struct { 22 R16 []Range16 23 R32 []Range32 24 LatinOffset int // number of entries in R16 with Hi <= MaxLatin1 25 } 26 27 // Range16 represents of a range of 16-bit Unicode code points. The range runs from Lo to Hi 28 // inclusive and has the specified stride. 29 type Range16 struct { 30 Lo uint16 31 Hi uint16 32 Stride uint16 33 } 34 35 // Range32 represents of a range of Unicode code points and is used when one or 36 // more of the values will not fit in 16 bits. The range runs from Lo to Hi 37 // inclusive and has the specified stride. Lo and Hi must always be >= 1<<16. 38 type Range32 struct { 39 Lo uint32 40 Hi uint32 41 Stride uint32 42 } 43 44 // CaseRange represents a range of Unicode code points for simple (one 45 // code point to one code point) case conversion. 46 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas 47 // are the number to add to the code point to reach the code point for a 48 // different case for that character. They may be negative. If zero, it 49 // means the character is in the corresponding case. There is a special 50 // case representing sequences of alternating corresponding Upper and Lower 51 // pairs. It appears with a fixed Delta of 52 // 53 // {UpperLower, UpperLower, UpperLower} 54 // 55 // The constant UpperLower has an otherwise impossible delta value. 56 type CaseRange struct { 57 Lo uint32 58 Hi uint32 59 Delta d 60 } 61 62 // SpecialCase represents language-specific case mappings such as Turkish. 63 // Methods of SpecialCase customize (by overriding) the standard mappings. 64 type SpecialCase []CaseRange 65 66 // BUG(r): There is no mechanism for full case folding, that is, for 67 // characters that involve multiple runes in the input or output. 68 69 // Indices into the Delta arrays inside CaseRanges for case mapping. 70 const ( 71 UpperCase = iota 72 LowerCase 73 TitleCase 74 MaxCase 75 ) 76 77 type d [MaxCase]rune // to make the CaseRanges text shorter 78 79 // If the Delta field of a [CaseRange] is UpperLower, it means 80 // this CaseRange represents a sequence of the form (say) 81 // [Upper] [Lower] [Upper] [Lower]. 82 const ( 83 UpperLower = MaxRune + 1 // (Cannot be a valid delta.) 84 ) 85 86 // linearMax is the maximum size table for linear search for non-Latin1 rune. 87 // Derived by running 'go test -calibrate'. 88 const linearMax = 18 89 90 // is16 reports whether r is in the sorted slice of 16-bit ranges. 91 func is16(ranges []Range16, r uint16) bool { 92 if len(ranges) <= linearMax || r <= MaxLatin1 { 93 for i := range ranges { 94 range_ := &ranges[i] 95 if r < range_.Lo { 96 return false 97 } 98 if r <= range_.Hi { 99 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 100 } 101 } 102 return false 103 } 104 105 // binary search over ranges 106 lo := 0 107 hi := len(ranges) 108 for lo < hi { 109 m := int(uint(lo+hi) >> 1) 110 range_ := &ranges[m] 111 if range_.Lo <= r && r <= range_.Hi { 112 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 113 } 114 if r < range_.Lo { 115 hi = m 116 } else { 117 lo = m + 1 118 } 119 } 120 return false 121 } 122 123 // is32 reports whether r is in the sorted slice of 32-bit ranges. 124 func is32(ranges []Range32, r uint32) bool { 125 if len(ranges) <= linearMax { 126 for i := range ranges { 127 range_ := &ranges[i] 128 if r < range_.Lo { 129 return false 130 } 131 if r <= range_.Hi { 132 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 133 } 134 } 135 return false 136 } 137 138 // binary search over ranges 139 lo := 0 140 hi := len(ranges) 141 for lo < hi { 142 m := int(uint(lo+hi) >> 1) 143 range_ := ranges[m] 144 if range_.Lo <= r && r <= range_.Hi { 145 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 146 } 147 if r < range_.Lo { 148 hi = m 149 } else { 150 lo = m + 1 151 } 152 } 153 return false 154 } 155 156 // Is reports whether the rune is in the specified table of ranges. 157 func Is(rangeTab *RangeTable, r rune) bool { 158 r16 := rangeTab.R16 159 // Compare as uint32 to correctly handle negative runes. 160 if len(r16) > 0 && uint32(r) <= uint32(r16[len(r16)-1].Hi) { 161 return is16(r16, uint16(r)) 162 } 163 r32 := rangeTab.R32 164 if len(r32) > 0 && r >= rune(r32[0].Lo) { 165 return is32(r32, uint32(r)) 166 } 167 return false 168 } 169 170 func isExcludingLatin(rangeTab *RangeTable, r rune) bool { 171 r16 := rangeTab.R16 172 // Compare as uint32 to correctly handle negative runes. 173 if off := rangeTab.LatinOffset; len(r16) > off && uint32(r) <= uint32(r16[len(r16)-1].Hi) { 174 return is16(r16[off:], uint16(r)) 175 } 176 r32 := rangeTab.R32 177 if len(r32) > 0 && r >= rune(r32[0].Lo) { 178 return is32(r32, uint32(r)) 179 } 180 return false 181 } 182 183 // IsUpper reports whether the rune is an upper case letter. 184 func IsUpper(r rune) bool { 185 // See comment in IsGraphic. 186 if uint32(r) <= MaxLatin1 { 187 return properties[uint8(r)]&pLmask == pLu 188 } 189 return isExcludingLatin(Upper, r) 190 } 191 192 // IsLower reports whether the rune is a lower case letter. 193 func IsLower(r rune) bool { 194 // See comment in IsGraphic. 195 if uint32(r) <= MaxLatin1 { 196 return properties[uint8(r)]&pLmask == pLl 197 } 198 return isExcludingLatin(Lower, r) 199 } 200 201 // IsTitle reports whether the rune is a title case letter. 202 func IsTitle(r rune) bool { 203 if r <= MaxLatin1 { 204 return false 205 } 206 return isExcludingLatin(Title, r) 207 } 208 209 // to maps the rune using the specified case mapping. 210 // It additionally reports whether caseRange contained a mapping for r. 211 func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) { 212 if _case < 0 || MaxCase <= _case { 213 return ReplacementChar, false // as reasonable an error as any 214 } 215 // binary search over ranges 216 lo := 0 217 hi := len(caseRange) 218 for lo < hi { 219 m := int(uint(lo+hi) >> 1) 220 cr := caseRange[m] 221 if rune(cr.Lo) <= r && r <= rune(cr.Hi) { 222 delta := cr.Delta[_case] 223 if delta > MaxRune { 224 // In an Upper-Lower sequence, which always starts with 225 // an UpperCase letter, the real deltas always look like: 226 // {0, 1, 0} UpperCase (Lower is next) 227 // {-1, 0, -1} LowerCase (Upper, Title are previous) 228 // The characters at even offsets from the beginning of the 229 // sequence are upper case; the ones at odd offsets are lower. 230 // The correct mapping can be done by clearing or setting the low 231 // bit in the sequence offset. 232 // The constants UpperCase and TitleCase are even while LowerCase 233 // is odd so we take the low bit from _case. 234 return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1)), true 235 } 236 return r + delta, true 237 } 238 if r < rune(cr.Lo) { 239 hi = m 240 } else { 241 lo = m + 1 242 } 243 } 244 return r, false 245 } 246 247 // To maps the rune to the specified case: [UpperCase], [LowerCase], or [TitleCase]. 248 func To(_case int, r rune) rune { 249 r, _ = to(_case, r, CaseRanges) 250 return r 251 } 252 253 // ToUpper maps the rune to upper case. 254 func ToUpper(r rune) rune { 255 if r <= MaxASCII { 256 if 'a' <= r && r <= 'z' { 257 r -= 'a' - 'A' 258 } 259 return r 260 } 261 return To(UpperCase, r) 262 } 263 264 // ToLower maps the rune to lower case. 265 func ToLower(r rune) rune { 266 if r <= MaxASCII { 267 if 'A' <= r && r <= 'Z' { 268 r += 'a' - 'A' 269 } 270 return r 271 } 272 return To(LowerCase, r) 273 } 274 275 // ToTitle maps the rune to title case. 276 func ToTitle(r rune) rune { 277 if r <= MaxASCII { 278 if 'a' <= r && r <= 'z' { // title case is upper case for ASCII 279 r -= 'a' - 'A' 280 } 281 return r 282 } 283 return To(TitleCase, r) 284 } 285 286 // ToUpper maps the rune to upper case giving priority to the special mapping. 287 func (special SpecialCase) ToUpper(r rune) rune { 288 r1, hadMapping := to(UpperCase, r, []CaseRange(special)) 289 if r1 == r && !hadMapping { 290 r1 = ToUpper(r) 291 } 292 return r1 293 } 294 295 // ToTitle maps the rune to title case giving priority to the special mapping. 296 func (special SpecialCase) ToTitle(r rune) rune { 297 r1, hadMapping := to(TitleCase, r, []CaseRange(special)) 298 if r1 == r && !hadMapping { 299 r1 = ToTitle(r) 300 } 301 return r1 302 } 303 304 // ToLower maps the rune to lower case giving priority to the special mapping. 305 func (special SpecialCase) ToLower(r rune) rune { 306 r1, hadMapping := to(LowerCase, r, []CaseRange(special)) 307 if r1 == r && !hadMapping { 308 r1 = ToLower(r) 309 } 310 return r1 311 } 312 313 // caseOrbit is defined in tables.go as []foldPair. Right now all the 314 // entries fit in uint16, so use uint16. If that changes, compilation 315 // will fail (the constants in the composite literal will not fit in uint16) 316 // and the types here can change to uint32. 317 type foldPair struct { 318 From uint16 319 To uint16 320 } 321 322 // SimpleFold iterates over Unicode code points equivalent under 323 // the Unicode-defined simple case folding. Among the code points 324 // equivalent to rune (including rune itself), SimpleFold returns the 325 // smallest rune > r if one exists, or else the smallest rune >= 0. 326 // If r is not a valid Unicode code point, SimpleFold(r) returns r. 327 // 328 // For example: 329 // 330 // SimpleFold('A') = 'a' 331 // SimpleFold('a') = 'A' 332 // 333 // SimpleFold('K') = 'k' 334 // SimpleFold('k') = '\u212A' (Kelvin symbol, K) 335 // SimpleFold('\u212A') = 'K' 336 // 337 // SimpleFold('1') = '1' 338 // 339 // SimpleFold(-2) = -2 340 func SimpleFold(r rune) rune { 341 if r < 0 || r > MaxRune { 342 return r 343 } 344 345 if int(r) < len(asciiFold) { 346 return rune(asciiFold[r]) 347 } 348 349 // Consult caseOrbit table for special cases. 350 lo := 0 351 hi := len(caseOrbit) 352 for lo < hi { 353 m := int(uint(lo+hi) >> 1) 354 if rune(caseOrbit[m].From) < r { 355 lo = m + 1 356 } else { 357 hi = m 358 } 359 } 360 if lo < len(caseOrbit) && rune(caseOrbit[lo].From) == r { 361 return rune(caseOrbit[lo].To) 362 } 363 364 // No folding specified. This is a one- or two-element 365 // equivalence class containing rune and ToLower(rune) 366 // and ToUpper(rune) if they are different from rune. 367 if l := ToLower(r); l != r { 368 return l 369 } 370 return ToUpper(r) 371 }