golang.org/x/text@v0.14.0/collate/build/order.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package build 6 7 import ( 8 "fmt" 9 "log" 10 "sort" 11 "strings" 12 "unicode" 13 14 "golang.org/x/text/internal/colltab" 15 "golang.org/x/text/unicode/norm" 16 ) 17 18 type logicalAnchor int 19 20 const ( 21 firstAnchor logicalAnchor = -1 22 noAnchor = 0 23 lastAnchor = 1 24 ) 25 26 // entry is used to keep track of a single entry in the collation element table 27 // during building. Examples of entries can be found in the Default Unicode 28 // Collation Element Table. 29 // See https://www.unicode.org/Public/UCA/6.0.0/allkeys.txt. 30 type entry struct { 31 str string // same as string(runes) 32 runes []rune 33 elems []rawCE // the collation elements 34 extend string // weights of extend to be appended to elems 35 before bool // weights relative to next instead of previous. 36 lock bool // entry is used in extension and can no longer be moved. 37 38 // prev, next, and level are used to keep track of tailorings. 39 prev, next *entry 40 level colltab.Level // next differs at this level 41 skipRemove bool // do not unlink when removed 42 43 decompose bool // can use NFKD decomposition to generate elems 44 exclude bool // do not include in table 45 implicit bool // derived, is not included in the list 46 modified bool // entry was modified in tailoring 47 logical logicalAnchor 48 49 expansionIndex int // used to store index into expansion table 50 contractionHandle ctHandle 51 contractionIndex int // index into contraction elements 52 } 53 54 func (e *entry) String() string { 55 return fmt.Sprintf("%X (%q) -> %X (ch:%x; ci:%d, ei:%d)", 56 e.runes, e.str, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex) 57 } 58 59 func (e *entry) skip() bool { 60 return e.contraction() 61 } 62 63 func (e *entry) expansion() bool { 64 return !e.decompose && len(e.elems) > 1 65 } 66 67 func (e *entry) contraction() bool { 68 return len(e.runes) > 1 69 } 70 71 func (e *entry) contractionStarter() bool { 72 return e.contractionHandle.n != 0 73 } 74 75 // nextIndexed gets the next entry that needs to be stored in the table. 76 // It returns the entry and the collation level at which the next entry differs 77 // from the current entry. 78 // Entries that can be explicitly derived and logical reset positions are 79 // examples of entries that will not be indexed. 80 func (e *entry) nextIndexed() (*entry, colltab.Level) { 81 level := e.level 82 for e = e.next; e != nil && (e.exclude || len(e.elems) == 0); e = e.next { 83 if e.level < level { 84 level = e.level 85 } 86 } 87 return e, level 88 } 89 90 // remove unlinks entry e from the sorted chain and clears the collation 91 // elements. e may not be at the front or end of the list. This should always 92 // be the case, as the front and end of the list are always logical anchors, 93 // which may not be removed. 94 func (e *entry) remove() { 95 if e.logical != noAnchor { 96 log.Fatalf("may not remove anchor %q", e.str) 97 } 98 // TODO: need to set e.prev.level to e.level if e.level is smaller? 99 e.elems = nil 100 if !e.skipRemove { 101 if e.prev != nil { 102 e.prev.next = e.next 103 } 104 if e.next != nil { 105 e.next.prev = e.prev 106 } 107 } 108 e.skipRemove = false 109 } 110 111 // insertAfter inserts n after e. 112 func (e *entry) insertAfter(n *entry) { 113 if e == n { 114 panic("e == anchor") 115 } 116 if e == nil { 117 panic("unexpected nil anchor") 118 } 119 n.remove() 120 n.decompose = false // redo decomposition test 121 122 n.next = e.next 123 n.prev = e 124 if e.next != nil { 125 e.next.prev = n 126 } 127 e.next = n 128 } 129 130 // insertBefore inserts n before e. 131 func (e *entry) insertBefore(n *entry) { 132 if e == n { 133 panic("e == anchor") 134 } 135 if e == nil { 136 panic("unexpected nil anchor") 137 } 138 n.remove() 139 n.decompose = false // redo decomposition test 140 141 n.prev = e.prev 142 n.next = e 143 if e.prev != nil { 144 e.prev.next = n 145 } 146 e.prev = n 147 } 148 149 func (e *entry) encodeBase() (ce uint32, err error) { 150 switch { 151 case e.expansion(): 152 ce, err = makeExpandIndex(e.expansionIndex) 153 default: 154 if e.decompose { 155 log.Fatal("decompose should be handled elsewhere") 156 } 157 ce, err = makeCE(e.elems[0]) 158 } 159 return 160 } 161 162 func (e *entry) encode() (ce uint32, err error) { 163 if e.skip() { 164 log.Fatal("cannot build colElem for entry that should be skipped") 165 } 166 switch { 167 case e.decompose: 168 t1 := e.elems[0].w[2] 169 t2 := 0 170 if len(e.elems) > 1 { 171 t2 = e.elems[1].w[2] 172 } 173 ce, err = makeDecompose(t1, t2) 174 case e.contractionStarter(): 175 ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex) 176 default: 177 if len(e.runes) > 1 { 178 log.Fatal("colElem: contractions are handled in contraction trie") 179 } 180 ce, err = e.encodeBase() 181 } 182 return 183 } 184 185 // entryLess returns true if a sorts before b and false otherwise. 186 func entryLess(a, b *entry) bool { 187 if res, _ := compareWeights(a.elems, b.elems); res != 0 { 188 return res == -1 189 } 190 if a.logical != noAnchor { 191 return a.logical == firstAnchor 192 } 193 if b.logical != noAnchor { 194 return b.logical == lastAnchor 195 } 196 return a.str < b.str 197 } 198 199 type sortedEntries []*entry 200 201 func (s sortedEntries) Len() int { 202 return len(s) 203 } 204 205 func (s sortedEntries) Swap(i, j int) { 206 s[i], s[j] = s[j], s[i] 207 } 208 209 func (s sortedEntries) Less(i, j int) bool { 210 return entryLess(s[i], s[j]) 211 } 212 213 type ordering struct { 214 id string 215 entryMap map[string]*entry 216 ordered []*entry 217 handle *trieHandle 218 } 219 220 // insert inserts e into both entryMap and ordered. 221 // Note that insert simply appends e to ordered. To reattain a sorted 222 // order, o.sort() should be called. 223 func (o *ordering) insert(e *entry) { 224 if e.logical == noAnchor { 225 o.entryMap[e.str] = e 226 } else { 227 // Use key format as used in UCA rules. 228 o.entryMap[fmt.Sprintf("[%s]", e.str)] = e 229 // Also add index entry for XML format. 230 o.entryMap[fmt.Sprintf("<%s/>", strings.Replace(e.str, " ", "_", -1))] = e 231 } 232 o.ordered = append(o.ordered, e) 233 } 234 235 // newEntry creates a new entry for the given info and inserts it into 236 // the index. 237 func (o *ordering) newEntry(s string, ces []rawCE) *entry { 238 e := &entry{ 239 runes: []rune(s), 240 elems: ces, 241 str: s, 242 } 243 o.insert(e) 244 return e 245 } 246 247 // find looks up and returns the entry for the given string. 248 // It returns nil if str is not in the index and if an implicit value 249 // cannot be derived, that is, if str represents more than one rune. 250 func (o *ordering) find(str string) *entry { 251 e := o.entryMap[str] 252 if e == nil { 253 r := []rune(str) 254 if len(r) == 1 { 255 const ( 256 firstHangul = 0xAC00 257 lastHangul = 0xD7A3 258 ) 259 if r[0] >= firstHangul && r[0] <= lastHangul { 260 ce := []rawCE{} 261 nfd := norm.NFD.String(str) 262 for _, r := range nfd { 263 ce = append(ce, o.find(string(r)).elems...) 264 } 265 e = o.newEntry(nfd, ce) 266 } else { 267 e = o.newEntry(string(r[0]), []rawCE{ 268 {w: []int{ 269 implicitPrimary(r[0]), 270 defaultSecondary, 271 defaultTertiary, 272 int(r[0]), 273 }, 274 }, 275 }) 276 e.modified = true 277 } 278 e.exclude = true // do not index implicits 279 } 280 } 281 return e 282 } 283 284 // makeRootOrdering returns a newly initialized ordering value and populates 285 // it with a set of logical reset points that can be used as anchors. 286 // The anchors first_tertiary_ignorable and __END__ will always sort at 287 // the beginning and end, respectively. This means that prev and next are non-nil 288 // for any indexed entry. 289 func makeRootOrdering() ordering { 290 const max = unicode.MaxRune 291 o := ordering{ 292 entryMap: make(map[string]*entry), 293 } 294 insert := func(typ logicalAnchor, s string, ce []int) { 295 e := &entry{ 296 elems: []rawCE{{w: ce}}, 297 str: s, 298 exclude: true, 299 logical: typ, 300 } 301 o.insert(e) 302 } 303 insert(firstAnchor, "first tertiary ignorable", []int{0, 0, 0, 0}) 304 insert(lastAnchor, "last tertiary ignorable", []int{0, 0, 0, max}) 305 insert(lastAnchor, "last primary ignorable", []int{0, defaultSecondary, defaultTertiary, max}) 306 insert(lastAnchor, "last non ignorable", []int{maxPrimary, defaultSecondary, defaultTertiary, max}) 307 insert(lastAnchor, "__END__", []int{1 << maxPrimaryBits, defaultSecondary, defaultTertiary, max}) 308 return o 309 } 310 311 // patchForInsert eliminates entries from the list with more than one collation element. 312 // The next and prev fields of the eliminated entries still point to appropriate 313 // values in the newly created list. 314 // It requires that sort has been called. 315 func (o *ordering) patchForInsert() { 316 for i := 0; i < len(o.ordered)-1; { 317 e := o.ordered[i] 318 lev := e.level 319 n := e.next 320 for ; n != nil && len(n.elems) > 1; n = n.next { 321 if n.level < lev { 322 lev = n.level 323 } 324 n.skipRemove = true 325 } 326 for ; o.ordered[i] != n; i++ { 327 o.ordered[i].level = lev 328 o.ordered[i].next = n 329 o.ordered[i+1].prev = e 330 } 331 } 332 } 333 334 // clone copies all ordering of es into a new ordering value. 335 func (o *ordering) clone() *ordering { 336 o.sort() 337 oo := ordering{ 338 entryMap: make(map[string]*entry), 339 } 340 for _, e := range o.ordered { 341 ne := &entry{ 342 runes: e.runes, 343 elems: e.elems, 344 str: e.str, 345 decompose: e.decompose, 346 exclude: e.exclude, 347 logical: e.logical, 348 } 349 oo.insert(ne) 350 } 351 oo.sort() // link all ordering. 352 oo.patchForInsert() 353 return &oo 354 } 355 356 // front returns the first entry to be indexed. 357 // It assumes that sort() has been called. 358 func (o *ordering) front() *entry { 359 e := o.ordered[0] 360 if e.prev != nil { 361 log.Panicf("unexpected first entry: %v", e) 362 } 363 // The first entry is always a logical position, which should not be indexed. 364 e, _ = e.nextIndexed() 365 return e 366 } 367 368 // sort sorts all ordering based on their collation elements and initializes 369 // the prev, next, and level fields accordingly. 370 func (o *ordering) sort() { 371 sort.Sort(sortedEntries(o.ordered)) 372 l := o.ordered 373 for i := 1; i < len(l); i++ { 374 k := i - 1 375 l[k].next = l[i] 376 _, l[k].level = compareWeights(l[k].elems, l[i].elems) 377 l[i].prev = l[k] 378 } 379 } 380 381 // genColElems generates a collation element array from the runes in str. This 382 // assumes that all collation elements have already been added to the Builder. 383 func (o *ordering) genColElems(str string) []rawCE { 384 elems := []rawCE{} 385 for _, r := range []rune(str) { 386 for _, ce := range o.find(string(r)).elems { 387 if ce.w[0] != 0 || ce.w[1] != 0 || ce.w[2] != 0 { 388 elems = append(elems, ce) 389 } 390 } 391 } 392 return elems 393 }