golang.org/x/text@v0.14.0/collate/build/builder.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package build // import "golang.org/x/text/collate/build" 6 7 import ( 8 "fmt" 9 "io" 10 "log" 11 "sort" 12 "strings" 13 "unicode/utf8" 14 15 "golang.org/x/text/internal/colltab" 16 "golang.org/x/text/language" 17 "golang.org/x/text/unicode/norm" 18 ) 19 20 // TODO: optimizations: 21 // - expandElem is currently 20K. By putting unique colElems in a separate 22 // table and having a byte array of indexes into this table, we can reduce 23 // the total size to about 7K. By also factoring out the length bytes, we 24 // can reduce this to about 6K. 25 // - trie valueBlocks are currently 100K. There are a lot of sparse blocks 26 // and many consecutive values with the same stride. This can be further 27 // compacted. 28 // - Compress secondary weights into 8 bits. 29 // - Some LDML specs specify a context element. Currently we simply concatenate 30 // those. Context can be implemented using the contraction trie. If Builder 31 // could analyze and detect when using a context makes sense, there is no 32 // need to expose this construct in the API. 33 34 // A Builder builds a root collation table. The user must specify the 35 // collation elements for each entry. A common use will be to base the weights 36 // on those specified in the allkeys* file as provided by the UCA or CLDR. 37 type Builder struct { 38 index *trieBuilder 39 root ordering 40 locale []*Tailoring 41 t *table 42 err error 43 built bool 44 45 minNonVar int // lowest primary recorded for a variable 46 varTop int // highest primary recorded for a non-variable 47 48 // indexes used for reusing expansions and contractions 49 expIndex map[string]int // positions of expansions keyed by their string representation 50 ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes 51 ctElem map[string]int // contraction elements keyed by their string representation 52 } 53 54 // A Tailoring builds a collation table based on another collation table. 55 // The table is defined by specifying tailorings to the underlying table. 56 // See https://unicode.org/reports/tr35/ for an overview of tailoring 57 // collation tables. The CLDR contains pre-defined tailorings for a variety 58 // of languages (See https://www.unicode.org/Public/cldr/<version>/core.zip.) 59 type Tailoring struct { 60 id string 61 builder *Builder 62 index *ordering 63 64 anchor *entry 65 before bool 66 } 67 68 // NewBuilder returns a new Builder. 69 func NewBuilder() *Builder { 70 return &Builder{ 71 index: newTrieBuilder(), 72 root: makeRootOrdering(), 73 expIndex: make(map[string]int), 74 ctHandle: make(map[string]ctHandle), 75 ctElem: make(map[string]int), 76 } 77 } 78 79 // Tailoring returns a Tailoring for the given locale. One should 80 // have completed all calls to Add before calling Tailoring. 81 func (b *Builder) Tailoring(loc language.Tag) *Tailoring { 82 t := &Tailoring{ 83 id: loc.String(), 84 builder: b, 85 index: b.root.clone(), 86 } 87 t.index.id = t.id 88 b.locale = append(b.locale, t) 89 return t 90 } 91 92 // Add adds an entry to the collation element table, mapping 93 // a slice of runes to a sequence of collation elements. 94 // A collation element is specified as list of weights: []int{primary, secondary, ...}. 95 // The entries are typically obtained from a collation element table 96 // as defined in https://www.unicode.org/reports/tr10/#Data_Table_Format. 97 // Note that the collation elements specified by colelems are only used 98 // as a guide. The actual weights generated by Builder may differ. 99 // The argument variables is a list of indices into colelems that should contain 100 // a value for each colelem that is a variable. (See the reference above.) 101 func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error { 102 str := string(runes) 103 elems := make([]rawCE, len(colelems)) 104 for i, ce := range colelems { 105 if len(ce) == 0 { 106 break 107 } 108 elems[i] = makeRawCE(ce, 0) 109 if len(ce) == 1 { 110 elems[i].w[1] = defaultSecondary 111 } 112 if len(ce) <= 2 { 113 elems[i].w[2] = defaultTertiary 114 } 115 if len(ce) <= 3 { 116 elems[i].w[3] = ce[0] 117 } 118 } 119 for i, ce := range elems { 120 p := ce.w[0] 121 isvar := false 122 for _, j := range variables { 123 if i == j { 124 isvar = true 125 } 126 } 127 if isvar { 128 if p >= b.minNonVar && b.minNonVar > 0 { 129 return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar) 130 } 131 if p > b.varTop { 132 b.varTop = p 133 } 134 } else if p > 1 { // 1 is a special primary value reserved for FFFE 135 if p <= b.varTop { 136 return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop) 137 } 138 if b.minNonVar == 0 || p < b.minNonVar { 139 b.minNonVar = p 140 } 141 } 142 } 143 elems, err := convertLargeWeights(elems) 144 if err != nil { 145 return err 146 } 147 cccs := []uint8{} 148 nfd := norm.NFD.String(str) 149 for i := range nfd { 150 cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC()) 151 } 152 if len(cccs) < len(elems) { 153 if len(cccs) > 2 { 154 return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems)) 155 } 156 p := len(elems) - 1 157 for ; p > 0 && elems[p].w[0] == 0; p-- { 158 elems[p].ccc = cccs[len(cccs)-1] 159 } 160 for ; p >= 0; p-- { 161 elems[p].ccc = cccs[0] 162 } 163 } else { 164 for i := range elems { 165 elems[i].ccc = cccs[i] 166 } 167 } 168 // doNorm in collate.go assumes that the following conditions hold. 169 if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] { 170 return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs) 171 } 172 b.root.newEntry(str, elems) 173 return nil 174 } 175 176 func (t *Tailoring) setAnchor(anchor string) error { 177 anchor = norm.NFC.String(anchor) 178 a := t.index.find(anchor) 179 if a == nil { 180 a = t.index.newEntry(anchor, nil) 181 a.implicit = true 182 a.modified = true 183 for _, r := range []rune(anchor) { 184 e := t.index.find(string(r)) 185 e.lock = true 186 } 187 } 188 t.anchor = a 189 return nil 190 } 191 192 // SetAnchor sets the point after which elements passed in subsequent calls to 193 // Insert will be inserted. It is equivalent to the reset directive in an LDML 194 // specification. See Insert for an example. 195 // SetAnchor supports the following logical reset positions: 196 // <first_tertiary_ignorable/>, <last_teriary_ignorable/>, <first_primary_ignorable/>, 197 // and <last_non_ignorable/>. 198 func (t *Tailoring) SetAnchor(anchor string) error { 199 if err := t.setAnchor(anchor); err != nil { 200 return err 201 } 202 t.before = false 203 return nil 204 } 205 206 // SetAnchorBefore is similar to SetAnchor, except that subsequent calls to 207 // Insert will insert entries before the anchor. 208 func (t *Tailoring) SetAnchorBefore(anchor string) error { 209 if err := t.setAnchor(anchor); err != nil { 210 return err 211 } 212 t.before = true 213 return nil 214 } 215 216 // Insert sets the ordering of str relative to the entry set by the previous 217 // call to SetAnchor or Insert. The argument extend corresponds 218 // to the extend elements as defined in LDML. A non-empty value for extend 219 // will cause the collation elements corresponding to extend to be appended 220 // to the collation elements generated for the entry added by Insert. 221 // This has the same net effect as sorting str after the string anchor+extend. 222 // See https://www.unicode.org/reports/tr10/#Tailoring_Example for details 223 // on parametric tailoring and https://unicode.org/reports/tr35/#Collation_Elements 224 // for full details on LDML. 225 // 226 // Examples: create a tailoring for Swedish, where "ä" is ordered after "z" 227 // at the primary sorting level: 228 // 229 // t := b.Tailoring("se") 230 // t.SetAnchor("z") 231 // t.Insert(colltab.Primary, "ä", "") 232 // 233 // Order "ü" after "ue" at the secondary sorting level: 234 // 235 // t.SetAnchor("ue") 236 // t.Insert(colltab.Secondary, "ü","") 237 // 238 // or 239 // 240 // t.SetAnchor("u") 241 // t.Insert(colltab.Secondary, "ü", "e") 242 // 243 // Order "q" after "ab" at the secondary level and "Q" after "q" 244 // at the tertiary level: 245 // 246 // t.SetAnchor("ab") 247 // t.Insert(colltab.Secondary, "q", "") 248 // t.Insert(colltab.Tertiary, "Q", "") 249 // 250 // Order "b" before "a": 251 // 252 // t.SetAnchorBefore("a") 253 // t.Insert(colltab.Primary, "b", "") 254 // 255 // Order "0" after the last primary ignorable: 256 // 257 // t.SetAnchor("<last_primary_ignorable/>") 258 // t.Insert(colltab.Primary, "0", "") 259 func (t *Tailoring) Insert(level colltab.Level, str, extend string) error { 260 if t.anchor == nil { 261 return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str) 262 } 263 str = norm.NFC.String(str) 264 e := t.index.find(str) 265 if e == nil { 266 e = t.index.newEntry(str, nil) 267 } else if e.logical != noAnchor { 268 return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str) 269 } 270 if e.lock { 271 return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str) 272 } 273 a := t.anchor 274 // Find the first element after the anchor which differs at a level smaller or 275 // equal to the given level. Then insert at this position. 276 // See https://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details. 277 e.before = t.before 278 if t.before { 279 t.before = false 280 if a.prev == nil { 281 a.insertBefore(e) 282 } else { 283 for a = a.prev; a.level > level; a = a.prev { 284 } 285 a.insertAfter(e) 286 } 287 e.level = level 288 } else { 289 for ; a.level > level; a = a.next { 290 } 291 e.level = a.level 292 if a != e { 293 a.insertAfter(e) 294 a.level = level 295 } else { 296 // We don't set a to prev itself. This has the effect of the entry 297 // getting new collation elements that are an increment of itself. 298 // This is intentional. 299 a.prev.level = level 300 } 301 } 302 e.extend = norm.NFD.String(extend) 303 e.exclude = false 304 e.modified = true 305 e.elems = nil 306 t.anchor = e 307 return nil 308 } 309 310 func (o *ordering) getWeight(e *entry) []rawCE { 311 if len(e.elems) == 0 && e.logical == noAnchor { 312 if e.implicit { 313 for _, r := range e.runes { 314 e.elems = append(e.elems, o.getWeight(o.find(string(r)))...) 315 } 316 } else if e.before { 317 count := [colltab.Identity + 1]int{} 318 a := e 319 for ; a.elems == nil && !a.implicit; a = a.next { 320 count[a.level]++ 321 } 322 e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)} 323 for i := colltab.Primary; i < colltab.Quaternary; i++ { 324 if count[i] != 0 { 325 e.elems[0].w[i] -= count[i] 326 break 327 } 328 } 329 if e.prev != nil { 330 o.verifyWeights(e.prev, e, e.prev.level) 331 } 332 } else { 333 prev := e.prev 334 e.elems = nextWeight(prev.level, o.getWeight(prev)) 335 o.verifyWeights(e, e.next, e.level) 336 } 337 } 338 return e.elems 339 } 340 341 func (o *ordering) addExtension(e *entry) { 342 if ex := o.find(e.extend); ex != nil { 343 e.elems = append(e.elems, ex.elems...) 344 } else { 345 for _, r := range []rune(e.extend) { 346 e.elems = append(e.elems, o.find(string(r)).elems...) 347 } 348 } 349 e.extend = "" 350 } 351 352 func (o *ordering) verifyWeights(a, b *entry, level colltab.Level) error { 353 if level == colltab.Identity || b == nil || b.elems == nil || a.elems == nil { 354 return nil 355 } 356 for i := colltab.Primary; i < level; i++ { 357 if a.elems[0].w[i] < b.elems[0].w[i] { 358 return nil 359 } 360 } 361 if a.elems[0].w[level] >= b.elems[0].w[level] { 362 err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems) 363 log.Println(err) 364 // TODO: return the error instead, or better, fix the conflicting entry by making room. 365 } 366 return nil 367 } 368 369 func (b *Builder) error(e error) { 370 if e != nil { 371 b.err = e 372 } 373 } 374 375 func (b *Builder) errorID(locale string, e error) { 376 if e != nil { 377 b.err = fmt.Errorf("%s:%v", locale, e) 378 } 379 } 380 381 // patchNorm ensures that NFC and NFD counterparts are consistent. 382 func (o *ordering) patchNorm() { 383 // Insert the NFD counterparts, if necessary. 384 for _, e := range o.ordered { 385 nfd := norm.NFD.String(e.str) 386 if nfd != e.str { 387 if e0 := o.find(nfd); e0 != nil && !e0.modified { 388 e0.elems = e.elems 389 } else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) { 390 e := o.newEntry(nfd, e.elems) 391 e.modified = true 392 } 393 } 394 } 395 // Update unchanged composed forms if one of their parts changed. 396 for _, e := range o.ordered { 397 nfd := norm.NFD.String(e.str) 398 if e.modified || nfd == e.str { 399 continue 400 } 401 if e0 := o.find(nfd); e0 != nil { 402 e.elems = e0.elems 403 } else { 404 e.elems = o.genColElems(nfd) 405 if norm.NFD.LastBoundary([]byte(nfd)) == 0 { 406 r := []rune(nfd) 407 head := string(r[0]) 408 tail := "" 409 for i := 1; i < len(r); i++ { 410 s := norm.NFC.String(head + string(r[i])) 411 if e0 := o.find(s); e0 != nil && e0.modified { 412 head = s 413 } else { 414 tail += string(r[i]) 415 } 416 } 417 e.elems = append(o.genColElems(head), o.genColElems(tail)...) 418 } 419 } 420 } 421 // Exclude entries for which the individual runes generate the same collation elements. 422 for _, e := range o.ordered { 423 if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) { 424 e.exclude = true 425 } 426 } 427 } 428 429 func (b *Builder) buildOrdering(o *ordering) { 430 for _, e := range o.ordered { 431 o.getWeight(e) 432 } 433 for _, e := range o.ordered { 434 o.addExtension(e) 435 } 436 o.patchNorm() 437 o.sort() 438 simplify(o) 439 b.processExpansions(o) // requires simplify 440 b.processContractions(o) // requires simplify 441 442 t := newNode() 443 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 444 if !e.skip() { 445 ce, err := e.encode() 446 b.errorID(o.id, err) 447 t.insert(e.runes[0], ce) 448 } 449 } 450 o.handle = b.index.addTrie(t) 451 } 452 453 func (b *Builder) build() (*table, error) { 454 if b.built { 455 return b.t, b.err 456 } 457 b.built = true 458 b.t = &table{ 459 Table: colltab.Table{ 460 MaxContractLen: utf8.UTFMax, 461 VariableTop: uint32(b.varTop), 462 }, 463 } 464 465 b.buildOrdering(&b.root) 466 b.t.root = b.root.handle 467 for _, t := range b.locale { 468 b.buildOrdering(t.index) 469 if b.err != nil { 470 break 471 } 472 } 473 i, err := b.index.generate() 474 b.t.trie = *i 475 b.t.Index = colltab.Trie{ 476 Index: i.index, 477 Values: i.values, 478 Index0: i.index[blockSize*b.t.root.lookupStart:], 479 Values0: i.values[blockSize*b.t.root.valueStart:], 480 } 481 b.error(err) 482 return b.t, b.err 483 } 484 485 // Build builds the root Collator. 486 func (b *Builder) Build() (colltab.Weighter, error) { 487 table, err := b.build() 488 if err != nil { 489 return nil, err 490 } 491 return table, nil 492 } 493 494 // Build builds a Collator for Tailoring t. 495 func (t *Tailoring) Build() (colltab.Weighter, error) { 496 // TODO: implement. 497 return nil, nil 498 } 499 500 // Print prints the tables for b and all its Tailorings as a Go file 501 // that can be included in the Collate package. 502 func (b *Builder) Print(w io.Writer) (n int, err error) { 503 p := func(nn int, e error) { 504 n += nn 505 if err == nil { 506 err = e 507 } 508 } 509 t, err := b.build() 510 if err != nil { 511 return 0, err 512 } 513 p(fmt.Fprintf(w, `var availableLocales = "und`)) 514 for _, loc := range b.locale { 515 if loc.id != "und" { 516 p(fmt.Fprintf(w, ",%s", loc.id)) 517 } 518 } 519 p(fmt.Fprint(w, "\"\n\n")) 520 p(fmt.Fprintf(w, "const varTop = 0x%x\n\n", b.varTop)) 521 p(fmt.Fprintln(w, "var locales = [...]tableIndex{")) 522 for _, loc := range b.locale { 523 if loc.id == "und" { 524 p(t.fprintIndex(w, loc.index.handle, loc.id)) 525 } 526 } 527 for _, loc := range b.locale { 528 if loc.id != "und" { 529 p(t.fprintIndex(w, loc.index.handle, loc.id)) 530 } 531 } 532 p(fmt.Fprint(w, "}\n\n")) 533 n, _, err = t.fprint(w, "main") 534 return 535 } 536 537 // reproducibleFromNFKD checks whether the given expansion could be generated 538 // from an NFKD expansion. 539 func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool { 540 // Length must be equal. 541 if len(exp) != len(nfkd) { 542 return false 543 } 544 for i, ce := range exp { 545 // Primary and secondary values should be equal. 546 if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] { 547 return false 548 } 549 // Tertiary values should be equal to maxTertiary for third element onwards. 550 // TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can 551 // simply be dropped. Try this out by dropping the following code. 552 if i >= 2 && ce.w[2] != maxTertiary { 553 return false 554 } 555 if _, err := makeCE(ce); err != nil { 556 // Simply return false. The error will be caught elsewhere. 557 return false 558 } 559 } 560 return true 561 } 562 563 func simplify(o *ordering) { 564 // Runes that are a starter of a contraction should not be removed. 565 // (To date, there is only Kannada character 0CCA.) 566 keep := make(map[rune]bool) 567 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 568 if len(e.runes) > 1 { 569 keep[e.runes[0]] = true 570 } 571 } 572 // Tag entries for which the runes NFKD decompose to identical values. 573 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 574 s := e.str 575 nfkd := norm.NFKD.String(s) 576 nfd := norm.NFD.String(s) 577 if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd { 578 continue 579 } 580 if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) { 581 e.decompose = true 582 } 583 } 584 } 585 586 // appendExpansion converts the given collation sequence to 587 // collation elements and adds them to the expansion table. 588 // It returns an index to the expansion table. 589 func (b *Builder) appendExpansion(e *entry) int { 590 t := b.t 591 i := len(t.ExpandElem) 592 ce := uint32(len(e.elems)) 593 t.ExpandElem = append(t.ExpandElem, ce) 594 for _, w := range e.elems { 595 ce, err := makeCE(w) 596 if err != nil { 597 b.error(err) 598 return -1 599 } 600 t.ExpandElem = append(t.ExpandElem, ce) 601 } 602 return i 603 } 604 605 // processExpansions extracts data necessary to generate 606 // the extraction tables. 607 func (b *Builder) processExpansions(o *ordering) { 608 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 609 if !e.expansion() { 610 continue 611 } 612 key := fmt.Sprintf("%v", e.elems) 613 i, ok := b.expIndex[key] 614 if !ok { 615 i = b.appendExpansion(e) 616 b.expIndex[key] = i 617 } 618 e.expansionIndex = i 619 } 620 } 621 622 func (b *Builder) processContractions(o *ordering) { 623 // Collate contractions per starter rune. 624 starters := []rune{} 625 cm := make(map[rune][]*entry) 626 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 627 if e.contraction() { 628 if len(e.str) > b.t.MaxContractLen { 629 b.t.MaxContractLen = len(e.str) 630 } 631 r := e.runes[0] 632 if _, ok := cm[r]; !ok { 633 starters = append(starters, r) 634 } 635 cm[r] = append(cm[r], e) 636 } 637 } 638 // Add entries of single runes that are at a start of a contraction. 639 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 640 if !e.contraction() { 641 r := e.runes[0] 642 if _, ok := cm[r]; ok { 643 cm[r] = append(cm[r], e) 644 } 645 } 646 } 647 // Build the tries for the contractions. 648 t := b.t 649 for _, r := range starters { 650 l := cm[r] 651 // Compute suffix strings. There are 31 different contraction suffix 652 // sets for 715 contractions and 82 contraction starter runes as of 653 // version 6.0.0. 654 sufx := []string{} 655 hasSingle := false 656 for _, e := range l { 657 if len(e.runes) > 1 { 658 sufx = append(sufx, string(e.runes[1:])) 659 } else { 660 hasSingle = true 661 } 662 } 663 if !hasSingle { 664 b.error(fmt.Errorf("no single entry for starter rune %U found", r)) 665 continue 666 } 667 // Unique the suffix set. 668 sort.Strings(sufx) 669 key := strings.Join(sufx, "\n") 670 handle, ok := b.ctHandle[key] 671 if !ok { 672 var err error 673 handle, err = appendTrie(&t.ContractTries, sufx) 674 if err != nil { 675 b.error(err) 676 } 677 b.ctHandle[key] = handle 678 } 679 // Bucket sort entries in index order. 680 es := make([]*entry, len(l)) 681 for _, e := range l { 682 var p, sn int 683 if len(e.runes) > 1 { 684 str := []byte(string(e.runes[1:])) 685 p, sn = lookup(&t.ContractTries, handle, str) 686 if sn != len(str) { 687 log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str)) 688 } 689 } 690 if es[p] != nil { 691 log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0]) 692 } 693 es[p] = e 694 } 695 // Create collation elements for contractions. 696 elems := []uint32{} 697 for _, e := range es { 698 ce, err := e.encodeBase() 699 b.errorID(o.id, err) 700 elems = append(elems, ce) 701 } 702 key = fmt.Sprintf("%v", elems) 703 i, ok := b.ctElem[key] 704 if !ok { 705 i = len(t.ContractElem) 706 b.ctElem[key] = i 707 t.ContractElem = append(t.ContractElem, elems...) 708 } 709 // Store info in entry for starter rune. 710 es[0].contractionIndex = i 711 es[0].contractionHandle = handle 712 } 713 }