golang.org/x/text@v0.14.0/collate/build/colelem.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package build 6 7 import ( 8 "fmt" 9 "unicode" 10 11 "golang.org/x/text/internal/colltab" 12 ) 13 14 const ( 15 defaultSecondary = 0x20 16 defaultTertiary = 0x2 17 maxTertiary = 0x1F 18 ) 19 20 type rawCE struct { 21 w []int 22 ccc uint8 23 } 24 25 func makeRawCE(w []int, ccc uint8) rawCE { 26 ce := rawCE{w: make([]int, 4), ccc: ccc} 27 copy(ce.w, w) 28 return ce 29 } 30 31 // A collation element is represented as an uint32. 32 // In the typical case, a rune maps to a single collation element. If a rune 33 // can be the start of a contraction or expands into multiple collation elements, 34 // then the collation element that is associated with a rune will have a special 35 // form to represent such m to n mappings. Such special collation elements 36 // have a value >= 0x80000000. 37 38 const ( 39 maxPrimaryBits = 21 40 maxSecondaryBits = 12 41 maxTertiaryBits = 8 42 ) 43 44 func makeCE(ce rawCE) (uint32, error) { 45 v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc) 46 return uint32(v), e 47 } 48 49 // For contractions, collation elements are of the form 50 // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where 51 // - n* is the size of the first node in the contraction trie. 52 // - i* is the index of the first node in the contraction trie. 53 // - b* is the offset into the contraction collation element table. 54 // 55 // See contract.go for details on the contraction trie. 56 const ( 57 contractID = 0xC0000000 58 maxNBits = 4 59 maxTrieIndexBits = 12 60 maxContractOffsetBits = 13 61 ) 62 63 func makeContractIndex(h ctHandle, offset int) (uint32, error) { 64 if h.n >= 1<<maxNBits { 65 return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits) 66 } 67 if h.index >= 1<<maxTrieIndexBits { 68 return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits) 69 } 70 if offset >= 1<<maxContractOffsetBits { 71 return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits) 72 } 73 ce := uint32(contractID) 74 ce += uint32(offset << (maxNBits + maxTrieIndexBits)) 75 ce += uint32(h.index << maxNBits) 76 ce += uint32(h.n) 77 return ce, nil 78 } 79 80 // For expansions, collation elements are of the form 81 // 11100000 00000000 bbbbbbbb bbbbbbbb, 82 // where b* is the index into the expansion sequence table. 83 const ( 84 expandID = 0xE0000000 85 maxExpandIndexBits = 16 86 ) 87 88 func makeExpandIndex(index int) (uint32, error) { 89 if index >= 1<<maxExpandIndexBits { 90 return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits) 91 } 92 return expandID + uint32(index), nil 93 } 94 95 // Each list of collation elements corresponding to an expansion starts with 96 // a header indicating the length of the sequence. 97 func makeExpansionHeader(n int) (uint32, error) { 98 return uint32(n), nil 99 } 100 101 // Some runes can be expanded using NFKD decomposition. Instead of storing the full 102 // sequence of collation elements, we decompose the rune and lookup the collation 103 // elements for each rune in the decomposition and modify the tertiary weights. 104 // The collation element, in this case, is of the form 105 // 11110000 00000000 wwwwwwww vvvvvvvv, where 106 // - v* is the replacement tertiary weight for the first rune, 107 // - w* is the replacement tertiary weight for the second rune. 108 // 109 // Tertiary weights of subsequent runes should be replaced with maxTertiary. 110 // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details. 111 const ( 112 decompID = 0xF0000000 113 ) 114 115 func makeDecompose(t1, t2 int) (uint32, error) { 116 if t1 >= 256 || t1 < 0 { 117 return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1) 118 } 119 if t2 >= 256 || t2 < 0 { 120 return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2) 121 } 122 return uint32(t2<<8+t1) + decompID, nil 123 } 124 125 const ( 126 // These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. 127 minUnified rune = 0x4E00 128 maxUnified = 0x9FFF 129 minCompatibility = 0xF900 130 maxCompatibility = 0xFAFF 131 minRare = 0x3400 132 maxRare = 0x4DBF 133 ) 134 const ( 135 commonUnifiedOffset = 0x10000 136 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF 137 otherOffset = 0x50000 // largest rune in rare is U+2FA1D 138 illegalOffset = otherOffset + int(unicode.MaxRune) 139 maxPrimary = illegalOffset + 1 140 ) 141 142 // implicitPrimary returns the primary weight for the a rune 143 // for which there is no entry for the rune in the collation table. 144 // We take a different approach from the one specified in 145 // https://unicode.org/reports/tr10/#Implicit_Weights, 146 // but preserve the resulting relative ordering of the runes. 147 func implicitPrimary(r rune) int { 148 if unicode.Is(unicode.Ideographic, r) { 149 if r >= minUnified && r <= maxUnified { 150 // The most common case for CJK. 151 return int(r) + commonUnifiedOffset 152 } 153 if r >= minCompatibility && r <= maxCompatibility { 154 // This will typically not hit. The DUCET explicitly specifies mappings 155 // for all characters that do not decompose. 156 return int(r) + commonUnifiedOffset 157 } 158 return int(r) + rareUnifiedOffset 159 } 160 return int(r) + otherOffset 161 } 162 163 // convertLargeWeights converts collation elements with large 164 // primaries (either double primaries or for illegal runes) 165 // to our own representation. 166 // A CJK character C is represented in the DUCET as 167 // 168 // [.FBxx.0020.0002.C][.BBBB.0000.0000.C] 169 // 170 // We will rewrite these characters to a single CE. 171 // We assume the CJK values start at 0x8000. 172 // See https://unicode.org/reports/tr10/#Implicit_Weights 173 func convertLargeWeights(elems []rawCE) (res []rawCE, err error) { 174 const ( 175 cjkPrimaryStart = 0xFB40 176 rarePrimaryStart = 0xFB80 177 otherPrimaryStart = 0xFBC0 178 illegalPrimary = 0xFFFE 179 highBitsMask = 0x3F 180 lowBitsMask = 0x7FFF 181 lowBitsFlag = 0x8000 182 shiftBits = 15 183 ) 184 for i := 0; i < len(elems); i++ { 185 ce := elems[i].w 186 p := ce[0] 187 if p < cjkPrimaryStart { 188 continue 189 } 190 if p > 0xFFFF { 191 return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p) 192 } 193 if p >= illegalPrimary { 194 ce[0] = illegalOffset + p - illegalPrimary 195 } else { 196 if i+1 >= len(elems) { 197 return elems, fmt.Errorf("second part of double primary weight missing: %v", elems) 198 } 199 if elems[i+1].w[0]&lowBitsFlag == 0 { 200 return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems) 201 } 202 np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask 203 switch { 204 case p < rarePrimaryStart: 205 np += commonUnifiedOffset 206 case p < otherPrimaryStart: 207 np += rareUnifiedOffset 208 default: 209 p += otherOffset 210 } 211 ce[0] = np 212 for j := i + 1; j+1 < len(elems); j++ { 213 elems[j] = elems[j+1] 214 } 215 elems = elems[:len(elems)-1] 216 } 217 } 218 return elems, nil 219 } 220 221 // nextWeight computes the first possible collation weights following elems 222 // for the given level. 223 func nextWeight(level colltab.Level, elems []rawCE) []rawCE { 224 if level == colltab.Identity { 225 next := make([]rawCE, len(elems)) 226 copy(next, elems) 227 return next 228 } 229 next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)} 230 next[0].w[level]++ 231 if level < colltab.Secondary { 232 next[0].w[colltab.Secondary] = defaultSecondary 233 } 234 if level < colltab.Tertiary { 235 next[0].w[colltab.Tertiary] = defaultTertiary 236 } 237 // Filter entries that cannot influence ordering. 238 for _, ce := range elems[1:] { 239 skip := true 240 for i := colltab.Primary; i < level; i++ { 241 skip = skip && ce.w[i] == 0 242 } 243 if !skip { 244 next = append(next, ce) 245 } 246 } 247 return next 248 } 249 250 func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) { 251 for ; i < len(elems) && elems[i].w[level] == 0; i++ { 252 } 253 if i < len(elems) { 254 return i, elems[i].w[level] 255 } 256 return i, 0 257 } 258 259 // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise. 260 // It also returns the collation level at which the difference is found. 261 func compareWeights(a, b []rawCE) (result int, level colltab.Level) { 262 for level := colltab.Primary; level < colltab.Identity; level++ { 263 var va, vb int 264 for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 { 265 ia, va = nextVal(a, ia, level) 266 ib, vb = nextVal(b, ib, level) 267 if va != vb { 268 if va < vb { 269 return -1, level 270 } else { 271 return 1, level 272 } 273 } 274 } 275 } 276 return 0, colltab.Identity 277 } 278 279 func equalCE(a, b rawCE) bool { 280 for i := 0; i < 3; i++ { 281 if b.w[i] != a.w[i] { 282 return false 283 } 284 } 285 return true 286 } 287 288 func equalCEArrays(a, b []rawCE) bool { 289 if len(a) != len(b) { 290 return false 291 } 292 for i := range a { 293 if !equalCE(a[i], b[i]) { 294 return false 295 } 296 } 297 return true 298 }