github.com/go-xe2/third@v1.0.3/golang.org/x/text/internal/colltab/collelem.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package colltab 6 7 import ( 8 "fmt" 9 "unicode" 10 ) 11 12 // Level identifies the collation comparison level. 13 // The primary level corresponds to the basic sorting of text. 14 // The secondary level corresponds to accents and related linguistic elements. 15 // The tertiary level corresponds to casing and related concepts. 16 // The quaternary level is derived from the other levels by the 17 // various algorithms for handling variable elements. 18 type Level int 19 20 const ( 21 Primary Level = iota 22 Secondary 23 Tertiary 24 Quaternary 25 Identity 26 27 NumLevels 28 ) 29 30 const ( 31 defaultSecondary = 0x20 32 defaultTertiary = 0x2 33 maxTertiary = 0x1F 34 MaxQuaternary = 0x1FFFFF // 21 bits. 35 ) 36 37 // Elem is a representation of a collation element. This API provides ways to encode 38 // and decode Elems. Implementations of collation tables may use values greater 39 // or equal to PrivateUse for their own purposes. However, these should never be 40 // returned by AppendNext. 41 type Elem uint32 42 43 const ( 44 maxCE Elem = 0xAFFFFFFF 45 PrivateUse = minContract 46 minContract = 0xC0000000 47 maxContract = 0xDFFFFFFF 48 minExpand = 0xE0000000 49 maxExpand = 0xEFFFFFFF 50 minDecomp = 0xF0000000 51 ) 52 53 type ceType int 54 55 const ( 56 ceNormal ceType = iota // ceNormal includes implicits (ce == 0) 57 ceContractionIndex // rune can be a start of a contraction 58 ceExpansionIndex // rune expands into a sequence of collation elements 59 ceDecompose // rune expands using NFKC decomposition 60 ) 61 62 func (ce Elem) ctype() ceType { 63 if ce <= maxCE { 64 return ceNormal 65 } 66 if ce <= maxContract { 67 return ceContractionIndex 68 } else { 69 if ce <= maxExpand { 70 return ceExpansionIndex 71 } 72 return ceDecompose 73 } 74 panic("should not reach here") 75 return ceType(-1) 76 } 77 78 // For normal collation elements, we assume that a collation element either has 79 // a primary or non-default secondary value, not both. 80 // Collation elements with a primary value are of the form 81 // 01pppppp pppppppp ppppppp0 ssssssss 82 // - p* is primary collation value 83 // - s* is the secondary collation value 84 // 00pppppp pppppppp ppppppps sssttttt, where 85 // - p* is primary collation value 86 // - s* offset of secondary from default value. 87 // - t* is the tertiary collation value 88 // 100ttttt cccccccc pppppppp pppppppp 89 // - t* is the tertiar collation value 90 // - c* is the canonical combining class 91 // - p* is the primary collation value 92 // Collation elements with a secondary value are of the form 93 // 1010cccc ccccssss ssssssss tttttttt, where 94 // - c* is the canonical combining class 95 // - s* is the secondary collation value 96 // - t* is the tertiary collation value 97 // 11qqqqqq qqqqqqqq qqqqqqq0 00000000 98 // - q* quaternary value 99 const ( 100 ceTypeMask = 0xC0000000 101 ceTypeMaskExt = 0xE0000000 102 ceIgnoreMask = 0xF00FFFFF 103 ceType1 = 0x40000000 104 ceType2 = 0x00000000 105 ceType3or4 = 0x80000000 106 ceType4 = 0xA0000000 107 ceTypeQ = 0xC0000000 108 Ignore = ceType4 109 firstNonPrimary = 0x80000000 110 lastSpecialPrimary = 0xA0000000 111 secondaryMask = 0x80000000 112 hasTertiaryMask = 0x40000000 113 primaryValueMask = 0x3FFFFE00 114 maxPrimaryBits = 21 115 compactPrimaryBits = 16 116 maxSecondaryBits = 12 117 maxTertiaryBits = 8 118 maxCCCBits = 8 119 maxSecondaryCompactBits = 8 120 maxSecondaryDiffBits = 4 121 maxTertiaryCompactBits = 5 122 primaryShift = 9 123 compactSecondaryShift = 5 124 minCompactSecondary = defaultSecondary - 4 125 ) 126 127 func makeImplicitCE(primary int) Elem { 128 return ceType1 | Elem(primary<<primaryShift) | defaultSecondary 129 } 130 131 // MakeElem returns an Elem for the given values. It will return an error 132 // if the given combination of values is invalid. 133 func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) { 134 if w := primary; w >= 1<<maxPrimaryBits || w < 0 { 135 return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits) 136 } 137 if w := secondary; w >= 1<<maxSecondaryBits || w < 0 { 138 return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits) 139 } 140 if w := tertiary; w >= 1<<maxTertiaryBits || w < 0 { 141 return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits) 142 } 143 ce := Elem(0) 144 if primary != 0 { 145 if ccc != 0 { 146 if primary >= 1<<compactPrimaryBits { 147 return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", primary, 1<<compactPrimaryBits) 148 } 149 if secondary != defaultSecondary { 150 return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", secondary, ccc) 151 } 152 ce = Elem(tertiary << (compactPrimaryBits + maxCCCBits)) 153 ce |= Elem(ccc) << compactPrimaryBits 154 ce |= Elem(primary) 155 ce |= ceType3or4 156 } else if tertiary == defaultTertiary { 157 if secondary >= 1<<maxSecondaryCompactBits { 158 return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", secondary, 1<<maxSecondaryCompactBits) 159 } 160 ce = Elem(primary<<(maxSecondaryCompactBits+1) + secondary) 161 ce |= ceType1 162 } else { 163 d := secondary - defaultSecondary + maxSecondaryDiffBits 164 if d >= 1<<maxSecondaryDiffBits || d < 0 { 165 return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits) 166 } 167 if tertiary >= 1<<maxTertiaryCompactBits { 168 return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x", tertiary, 1<<maxTertiaryCompactBits) 169 } 170 ce = Elem(primary<<maxSecondaryDiffBits + d) 171 ce = ce<<maxTertiaryCompactBits + Elem(tertiary) 172 } 173 } else { 174 ce = Elem(secondary<<maxTertiaryBits + tertiary) 175 ce += Elem(ccc) << (maxSecondaryBits + maxTertiaryBits) 176 ce |= ceType4 177 } 178 return ce, nil 179 } 180 181 // MakeQuaternary returns an Elem with the given quaternary value. 182 func MakeQuaternary(v int) Elem { 183 return ceTypeQ | Elem(v<<primaryShift) 184 } 185 186 // Mask sets weights for any level smaller than l to 0. 187 // The resulting Elem can be used to test for equality with 188 // other Elems to which the same mask has been applied. 189 func (ce Elem) Mask(l Level) uint32 { 190 return 0 191 } 192 193 // CCC returns the canonical combining class associated with the underlying character, 194 // if applicable, or 0 otherwise. 195 func (ce Elem) CCC() uint8 { 196 if ce&ceType3or4 != 0 { 197 if ce&ceType4 == ceType3or4 { 198 return uint8(ce >> 16) 199 } 200 return uint8(ce >> 20) 201 } 202 return 0 203 } 204 205 // Primary returns the primary collation weight for ce. 206 func (ce Elem) Primary() int { 207 if ce >= firstNonPrimary { 208 if ce > lastSpecialPrimary { 209 return 0 210 } 211 return int(uint16(ce)) 212 } 213 return int(ce&primaryValueMask) >> primaryShift 214 } 215 216 // Secondary returns the secondary collation weight for ce. 217 func (ce Elem) Secondary() int { 218 switch ce & ceTypeMask { 219 case ceType1: 220 return int(uint8(ce)) 221 case ceType2: 222 return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF) 223 case ceType3or4: 224 if ce < ceType4 { 225 return defaultSecondary 226 } 227 return int(ce>>8) & 0xFFF 228 case ceTypeQ: 229 return 0 230 } 231 panic("should not reach here") 232 } 233 234 // Tertiary returns the tertiary collation weight for ce. 235 func (ce Elem) Tertiary() uint8 { 236 if ce&hasTertiaryMask == 0 { 237 if ce&ceType3or4 == 0 { 238 return uint8(ce & 0x1F) 239 } 240 if ce&ceType4 == ceType4 { 241 return uint8(ce) 242 } 243 return uint8(ce>>24) & 0x1F // type 2 244 } else if ce&ceTypeMask == ceType1 { 245 return defaultTertiary 246 } 247 // ce is a quaternary value. 248 return 0 249 } 250 251 func (ce Elem) updateTertiary(t uint8) Elem { 252 if ce&ceTypeMask == ceType1 { 253 // convert to type 4 254 nce := ce & primaryValueMask 255 nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift 256 ce = nce 257 } else if ce&ceTypeMaskExt == ceType3or4 { 258 ce &= ^Elem(maxTertiary << 24) 259 return ce | (Elem(t) << 24) 260 } else { 261 // type 2 or 4 262 ce &= ^Elem(maxTertiary) 263 } 264 return ce | Elem(t) 265 } 266 267 // Quaternary returns the quaternary value if explicitly specified, 268 // 0 if ce == Ignore, or MaxQuaternary otherwise. 269 // Quaternary values are used only for shifted variants. 270 func (ce Elem) Quaternary() int { 271 if ce&ceTypeMask == ceTypeQ { 272 return int(ce&primaryValueMask) >> primaryShift 273 } else if ce&ceIgnoreMask == Ignore { 274 return 0 275 } 276 return MaxQuaternary 277 } 278 279 // Weight returns the collation weight for the given level. 280 func (ce Elem) Weight(l Level) int { 281 switch l { 282 case Primary: 283 return ce.Primary() 284 case Secondary: 285 return ce.Secondary() 286 case Tertiary: 287 return int(ce.Tertiary()) 288 case Quaternary: 289 return ce.Quaternary() 290 } 291 return 0 // return 0 (ignore) for undefined levels. 292 } 293 294 // For contractions, collation elements are of the form 295 // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where 296 // - n* is the size of the first node in the contraction trie. 297 // - i* is the index of the first node in the contraction trie. 298 // - b* is the offset into the contraction collation element table. 299 // See contract.go for details on the contraction trie. 300 const ( 301 maxNBits = 4 302 maxTrieIndexBits = 12 303 maxContractOffsetBits = 13 304 ) 305 306 func splitContractIndex(ce Elem) (index, n, offset int) { 307 n = int(ce & (1<<maxNBits - 1)) 308 ce >>= maxNBits 309 index = int(ce & (1<<maxTrieIndexBits - 1)) 310 ce >>= maxTrieIndexBits 311 offset = int(ce & (1<<maxContractOffsetBits - 1)) 312 return 313 } 314 315 // For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb, 316 // where b* is the index into the expansion sequence table. 317 const maxExpandIndexBits = 16 318 319 func splitExpandIndex(ce Elem) (index int) { 320 return int(uint16(ce)) 321 } 322 323 // Some runes can be expanded using NFKD decomposition. Instead of storing the full 324 // sequence of collation elements, we decompose the rune and lookup the collation 325 // elements for each rune in the decomposition and modify the tertiary weights. 326 // The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where 327 // - v* is the replacement tertiary weight for the first rune, 328 // - w* is the replacement tertiary weight for the second rune, 329 // Tertiary weights of subsequent runes should be replaced with maxTertiary. 330 // See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details. 331 func splitDecompose(ce Elem) (t1, t2 uint8) { 332 return uint8(ce), uint8(ce >> 8) 333 } 334 335 const ( 336 // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. 337 minUnified rune = 0x4E00 338 maxUnified = 0x9FFF 339 minCompatibility = 0xF900 340 maxCompatibility = 0xFAFF 341 minRare = 0x3400 342 maxRare = 0x4DBF 343 ) 344 const ( 345 commonUnifiedOffset = 0x10000 346 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF 347 otherOffset = 0x50000 // largest rune in rare is U+2FA1D 348 illegalOffset = otherOffset + int(unicode.MaxRune) 349 maxPrimary = illegalOffset + 1 350 ) 351 352 // implicitPrimary returns the primary weight for the a rune 353 // for which there is no entry for the rune in the collation table. 354 // We take a different approach from the one specified in 355 // http://unicode.org/reports/tr10/#Implicit_Weights, 356 // but preserve the resulting relative ordering of the runes. 357 func implicitPrimary(r rune) int { 358 if unicode.Is(unicode.Ideographic, r) { 359 if r >= minUnified && r <= maxUnified { 360 // The most common case for CJK. 361 return int(r) + commonUnifiedOffset 362 } 363 if r >= minCompatibility && r <= maxCompatibility { 364 // This will typically not hit. The DUCET explicitly specifies mappings 365 // for all characters that do not decompose. 366 return int(r) + commonUnifiedOffset 367 } 368 return int(r) + rareUnifiedOffset 369 } 370 return int(r) + otherOffset 371 }