github.com/mdaxf/iac@v0.0.0-20240519030858-58a061660378/vendor_skip/golang.org/x/text/internal/colltab/collelem.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package colltab 6 7 import ( 8 "fmt" 9 "unicode" 10 ) 11 12 // Level identifies the collation comparison level. 13 // The primary level corresponds to the basic sorting of text. 14 // The secondary level corresponds to accents and related linguistic elements. 15 // The tertiary level corresponds to casing and related concepts. 16 // The quaternary level is derived from the other levels by the 17 // various algorithms for handling variable elements. 18 type Level int 19 20 const ( 21 Primary Level = iota 22 Secondary 23 Tertiary 24 Quaternary 25 Identity 26 27 NumLevels 28 ) 29 30 const ( 31 defaultSecondary = 0x20 32 defaultTertiary = 0x2 33 maxTertiary = 0x1F 34 MaxQuaternary = 0x1FFFFF // 21 bits. 35 ) 36 37 // Elem is a representation of a collation element. This API provides ways to encode 38 // and decode Elems. Implementations of collation tables may use values greater 39 // or equal to PrivateUse for their own purposes. However, these should never be 40 // returned by AppendNext. 
type Elem uint32

const (
	maxCE       Elem = 0xAFFFFFFF
	PrivateUse       = minContract
	minContract      = 0xC0000000
	maxContract      = 0xDFFFFFFF
	minExpand        = 0xE0000000
	maxExpand        = 0xEFFFFFFF
	minDecomp        = 0xF0000000
)

type ceType int

const (
	ceNormal           ceType = iota // ceNormal includes implicits (ce == 0)
	ceContractionIndex               // rune can be a start of a contraction
	ceExpansionIndex                 // rune expands into a sequence of collation elements
	ceDecompose                      // rune expands using NFKC decomposition
)

// ctype classifies ce by the numeric range it falls in. The ranges are
// checked in ascending order, so each case only needs an upper bound:
//
//	ce <= maxCE        ceNormal
//	ce <= maxContract  ceContractionIndex
//	ce <= maxExpand    ceExpansionIndex
//	otherwise          ceDecompose
//
// Values above maxCE but below minContract are reported as
// ceContractionIndex, because only the upper bounds are tested.
func (ce Elem) ctype() ceType {
	switch {
	case ce <= maxCE:
		return ceNormal
	case ce <= maxContract:
		return ceContractionIndex
	case ce <= maxExpand:
		return ceExpansionIndex
	}
	return ceDecompose
}

// For normal collation elements, we assume that a collation element either has
// a primary or non-default secondary value, not both.
// Collation elements with a primary value are of the form
//
//	01pppppp pppppppp ppppppp0 ssssssss
//	  - p* is primary collation value
//	  - s* is the secondary collation value
//	00pppppp pppppppp ppppppps sssttttt, where
//	  - p* is primary collation value
//	  - s* offset of secondary from default value.
88 // - t* is the tertiary collation value 89 // 100ttttt cccccccc pppppppp pppppppp 90 // - t* is the tertiar collation value 91 // - c* is the canonical combining class 92 // - p* is the primary collation value 93 // 94 // Collation elements with a secondary value are of the form 95 // 96 // 1010cccc ccccssss ssssssss tttttttt, where 97 // - c* is the canonical combining class 98 // - s* is the secondary collation value 99 // - t* is the tertiary collation value 100 // 11qqqqqq qqqqqqqq qqqqqqq0 00000000 101 // - q* quaternary value 102 const ( 103 ceTypeMask = 0xC0000000 104 ceTypeMaskExt = 0xE0000000 105 ceIgnoreMask = 0xF00FFFFF 106 ceType1 = 0x40000000 107 ceType2 = 0x00000000 108 ceType3or4 = 0x80000000 109 ceType4 = 0xA0000000 110 ceTypeQ = 0xC0000000 111 Ignore = ceType4 112 firstNonPrimary = 0x80000000 113 lastSpecialPrimary = 0xA0000000 114 secondaryMask = 0x80000000 115 hasTertiaryMask = 0x40000000 116 primaryValueMask = 0x3FFFFE00 117 maxPrimaryBits = 21 118 compactPrimaryBits = 16 119 maxSecondaryBits = 12 120 maxTertiaryBits = 8 121 maxCCCBits = 8 122 maxSecondaryCompactBits = 8 123 maxSecondaryDiffBits = 4 124 maxTertiaryCompactBits = 5 125 primaryShift = 9 126 compactSecondaryShift = 5 127 minCompactSecondary = defaultSecondary - 4 128 ) 129 130 func makeImplicitCE(primary int) Elem { 131 return ceType1 | Elem(primary<<primaryShift) | defaultSecondary 132 } 133 134 // MakeElem returns an Elem for the given values. It will return an error 135 // if the given combination of values is invalid. 
136 func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) { 137 if w := primary; w >= 1<<maxPrimaryBits || w < 0 { 138 return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits) 139 } 140 if w := secondary; w >= 1<<maxSecondaryBits || w < 0 { 141 return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits) 142 } 143 if w := tertiary; w >= 1<<maxTertiaryBits || w < 0 { 144 return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits) 145 } 146 ce := Elem(0) 147 if primary != 0 { 148 if ccc != 0 { 149 if primary >= 1<<compactPrimaryBits { 150 return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", primary, 1<<compactPrimaryBits) 151 } 152 if secondary != defaultSecondary { 153 return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", secondary, ccc) 154 } 155 ce = Elem(tertiary << (compactPrimaryBits + maxCCCBits)) 156 ce |= Elem(ccc) << compactPrimaryBits 157 ce |= Elem(primary) 158 ce |= ceType3or4 159 } else if tertiary == defaultTertiary { 160 if secondary >= 1<<maxSecondaryCompactBits { 161 return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", secondary, 1<<maxSecondaryCompactBits) 162 } 163 ce = Elem(primary<<(maxSecondaryCompactBits+1) + secondary) 164 ce |= ceType1 165 } else { 166 d := secondary - defaultSecondary + maxSecondaryDiffBits 167 if d >= 1<<maxSecondaryDiffBits || d < 0 { 168 return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits) 169 } 170 if tertiary >= 1<<maxTertiaryCompactBits { 171 return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x", tertiary, 1<<maxTertiaryCompactBits) 172 } 173 ce = Elem(primary<<maxSecondaryDiffBits + d) 174 ce = ce<<maxTertiaryCompactBits + Elem(tertiary) 175 } 176 } else { 177 ce = 
Elem(secondary<<maxTertiaryBits + tertiary) 178 ce += Elem(ccc) << (maxSecondaryBits + maxTertiaryBits) 179 ce |= ceType4 180 } 181 return ce, nil 182 } 183 184 // MakeQuaternary returns an Elem with the given quaternary value. 185 func MakeQuaternary(v int) Elem { 186 return ceTypeQ | Elem(v<<primaryShift) 187 } 188 189 // Mask sets weights for any level smaller than l to 0. 190 // The resulting Elem can be used to test for equality with 191 // other Elems to which the same mask has been applied. 192 func (ce Elem) Mask(l Level) uint32 { 193 return 0 194 } 195 196 // CCC returns the canonical combining class associated with the underlying character, 197 // if applicable, or 0 otherwise. 198 func (ce Elem) CCC() uint8 { 199 if ce&ceType3or4 != 0 { 200 if ce&ceType4 == ceType3or4 { 201 return uint8(ce >> 16) 202 } 203 return uint8(ce >> 20) 204 } 205 return 0 206 } 207 208 // Primary returns the primary collation weight for ce. 209 func (ce Elem) Primary() int { 210 if ce >= firstNonPrimary { 211 if ce > lastSpecialPrimary { 212 return 0 213 } 214 return int(uint16(ce)) 215 } 216 return int(ce&primaryValueMask) >> primaryShift 217 } 218 219 // Secondary returns the secondary collation weight for ce. 220 func (ce Elem) Secondary() int { 221 switch ce & ceTypeMask { 222 case ceType1: 223 return int(uint8(ce)) 224 case ceType2: 225 return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF) 226 case ceType3or4: 227 if ce < ceType4 { 228 return defaultSecondary 229 } 230 return int(ce>>8) & 0xFFF 231 case ceTypeQ: 232 return 0 233 } 234 panic("should not reach here") 235 } 236 237 // Tertiary returns the tertiary collation weight for ce. 
238 func (ce Elem) Tertiary() uint8 { 239 if ce&hasTertiaryMask == 0 { 240 if ce&ceType3or4 == 0 { 241 return uint8(ce & 0x1F) 242 } 243 if ce&ceType4 == ceType4 { 244 return uint8(ce) 245 } 246 return uint8(ce>>24) & 0x1F // type 2 247 } else if ce&ceTypeMask == ceType1 { 248 return defaultTertiary 249 } 250 // ce is a quaternary value. 251 return 0 252 } 253 254 func (ce Elem) updateTertiary(t uint8) Elem { 255 if ce&ceTypeMask == ceType1 { 256 // convert to type 4 257 nce := ce & primaryValueMask 258 nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift 259 ce = nce 260 } else if ce&ceTypeMaskExt == ceType3or4 { 261 ce &= ^Elem(maxTertiary << 24) 262 return ce | (Elem(t) << 24) 263 } else { 264 // type 2 or 4 265 ce &= ^Elem(maxTertiary) 266 } 267 return ce | Elem(t) 268 } 269 270 // Quaternary returns the quaternary value if explicitly specified, 271 // 0 if ce == Ignore, or MaxQuaternary otherwise. 272 // Quaternary values are used only for shifted variants. 273 func (ce Elem) Quaternary() int { 274 if ce&ceTypeMask == ceTypeQ { 275 return int(ce&primaryValueMask) >> primaryShift 276 } else if ce&ceIgnoreMask == Ignore { 277 return 0 278 } 279 return MaxQuaternary 280 } 281 282 // Weight returns the collation weight for the given level. 283 func (ce Elem) Weight(l Level) int { 284 switch l { 285 case Primary: 286 return ce.Primary() 287 case Secondary: 288 return ce.Secondary() 289 case Tertiary: 290 return int(ce.Tertiary()) 291 case Quaternary: 292 return ce.Quaternary() 293 } 294 return 0 // return 0 (ignore) for undefined levels. 295 } 296 297 // For contractions, collation elements are of the form 298 // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where 299 // - n* is the size of the first node in the contraction trie. 300 // - i* is the index of the first node in the contraction trie. 301 // - b* is the offset into the contraction collation element table. 302 // 303 // See contract.go for details on the contraction trie. 
304 const ( 305 maxNBits = 4 306 maxTrieIndexBits = 12 307 maxContractOffsetBits = 13 308 ) 309 310 func splitContractIndex(ce Elem) (index, n, offset int) { 311 n = int(ce & (1<<maxNBits - 1)) 312 ce >>= maxNBits 313 index = int(ce & (1<<maxTrieIndexBits - 1)) 314 ce >>= maxTrieIndexBits 315 offset = int(ce & (1<<maxContractOffsetBits - 1)) 316 return 317 } 318 319 // For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb, 320 // where b* is the index into the expansion sequence table. 321 const maxExpandIndexBits = 16 322 323 func splitExpandIndex(ce Elem) (index int) { 324 return int(uint16(ce)) 325 } 326 327 // Some runes can be expanded using NFKD decomposition. Instead of storing the full 328 // sequence of collation elements, we decompose the rune and lookup the collation 329 // elements for each rune in the decomposition and modify the tertiary weights. 330 // The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where 331 // - v* is the replacement tertiary weight for the first rune, 332 // - w* is the replacement tertiary weight for the second rune, 333 // 334 // Tertiary weights of subsequent runes should be replaced with maxTertiary. 335 // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details. 336 func splitDecompose(ce Elem) (t1, t2 uint8) { 337 return uint8(ce), uint8(ce >> 8) 338 } 339 340 const ( 341 // These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. 
342 minUnified rune = 0x4E00 343 maxUnified = 0x9FFF 344 minCompatibility = 0xF900 345 maxCompatibility = 0xFAFF 346 minRare = 0x3400 347 maxRare = 0x4DBF 348 ) 349 const ( 350 commonUnifiedOffset = 0x10000 351 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF 352 otherOffset = 0x50000 // largest rune in rare is U+2FA1D 353 illegalOffset = otherOffset + int(unicode.MaxRune) 354 maxPrimary = illegalOffset + 1 355 ) 356 357 // implicitPrimary returns the primary weight for the a rune 358 // for which there is no entry for the rune in the collation table. 359 // We take a different approach from the one specified in 360 // https://unicode.org/reports/tr10/#Implicit_Weights, 361 // but preserve the resulting relative ordering of the runes. 362 func implicitPrimary(r rune) int { 363 if unicode.Is(unicode.Ideographic, r) { 364 if r >= minUnified && r <= maxUnified { 365 // The most common case for CJK. 366 return int(r) + commonUnifiedOffset 367 } 368 if r >= minCompatibility && r <= maxCompatibility { 369 // This will typically not hit. The DUCET explicitly specifies mappings 370 // for all characters that do not decompose. 371 return int(r) + commonUnifiedOffset 372 } 373 return int(r) + rareUnifiedOffset 374 } 375 return int(r) + otherOffset 376 }