// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package build // import "github.com/liquid-dev/text/collate/build"

import (
	"fmt"
	"io"
	"log"
	"sort"
	"strings"
	"unicode/utf8"

	"github.com/liquid-dev/text/internal/colltab"
	"github.com/liquid-dev/text/language"
	"github.com/liquid-dev/text/unicode/norm"
)

// TODO: optimizations:
// - expandElem is currently 20K. By putting unique colElems in a separate
//   table and having a byte array of indexes into this table, we can reduce
//   the total size to about 7K. By also factoring out the length bytes, we
//   can reduce this to about 6K.
// - trie valueBlocks are currently 100K. There are a lot of sparse blocks
//   and many consecutive values with the same stride. This can be further
//   compacted.
// - Compress secondary weights into 8 bits.
// - Some LDML specs specify a context element. Currently we simply concatenate
//   those. Context can be implemented using the contraction trie. If Builder
//   could analyze and detect when using a context makes sense, there is no
//   need to expose this construct in the API.

// A Builder builds a root collation table. The user must specify the
// collation elements for each entry. A common use will be to base the weights
// on those specified in the allkeys* file as provided by the UCA or CLDR.
37 type Builder struct { 38 index *trieBuilder 39 root ordering 40 locale []*Tailoring 41 t *table 42 err error 43 built bool 44 45 minNonVar int // lowest primary recorded for a variable 46 varTop int // highest primary recorded for a non-variable 47 48 // indexes used for reusing expansions and contractions 49 expIndex map[string]int // positions of expansions keyed by their string representation 50 ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes 51 ctElem map[string]int // contraction elements keyed by their string representation 52 } 53 54 // A Tailoring builds a collation table based on another collation table. 55 // The table is defined by specifying tailorings to the underlying table. 56 // See https://unicode.org/reports/tr35/ for an overview of tailoring 57 // collation tables. The CLDR contains pre-defined tailorings for a variety 58 // of languages (See https://www.unicode.org/Public/cldr/<version>/core.zip.) 59 type Tailoring struct { 60 id string 61 builder *Builder 62 index *ordering 63 64 anchor *entry 65 before bool 66 } 67 68 // NewBuilder returns a new Builder. 69 func NewBuilder() *Builder { 70 return &Builder{ 71 index: newTrieBuilder(), 72 root: makeRootOrdering(), 73 expIndex: make(map[string]int), 74 ctHandle: make(map[string]ctHandle), 75 ctElem: make(map[string]int), 76 } 77 } 78 79 // Tailoring returns a Tailoring for the given locale. One should 80 // have completed all calls to Add before calling Tailoring. 81 func (b *Builder) Tailoring(loc language.Tag) *Tailoring { 82 t := &Tailoring{ 83 id: loc.String(), 84 builder: b, 85 index: b.root.clone(), 86 } 87 t.index.id = t.id 88 b.locale = append(b.locale, t) 89 return t 90 } 91 92 // Add adds an entry to the collation element table, mapping 93 // a slice of runes to a sequence of collation elements. 94 // A collation element is specified as list of weights: []int{primary, secondary, ...}. 
95 // The entries are typically obtained from a collation element table 96 // as defined in https://www.unicode.org/reports/tr10/#Data_Table_Format. 97 // Note that the collation elements specified by colelems are only used 98 // as a guide. The actual weights generated by Builder may differ. 99 // The argument variables is a list of indices into colelems that should contain 100 // a value for each colelem that is a variable. (See the reference above.) 101 func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error { 102 str := string(runes) 103 elems := make([]rawCE, len(colelems)) 104 for i, ce := range colelems { 105 if len(ce) == 0 { 106 break 107 } 108 elems[i] = makeRawCE(ce, 0) 109 if len(ce) == 1 { 110 elems[i].w[1] = defaultSecondary 111 } 112 if len(ce) <= 2 { 113 elems[i].w[2] = defaultTertiary 114 } 115 if len(ce) <= 3 { 116 elems[i].w[3] = ce[0] 117 } 118 } 119 for i, ce := range elems { 120 p := ce.w[0] 121 isvar := false 122 for _, j := range variables { 123 if i == j { 124 isvar = true 125 } 126 } 127 if isvar { 128 if p >= b.minNonVar && b.minNonVar > 0 { 129 return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar) 130 } 131 if p > b.varTop { 132 b.varTop = p 133 } 134 } else if p > 1 { // 1 is a special primary value reserved for FFFE 135 if p <= b.varTop { 136 return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop) 137 } 138 if b.minNonVar == 0 || p < b.minNonVar { 139 b.minNonVar = p 140 } 141 } 142 } 143 elems, err := convertLargeWeights(elems) 144 if err != nil { 145 return err 146 } 147 cccs := []uint8{} 148 nfd := norm.NFD.String(str) 149 for i := range nfd { 150 cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC()) 151 } 152 if len(cccs) < len(elems) { 153 if len(cccs) > 2 { 154 return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 
(%d < %d)", len(cccs), len(elems)) 155 } 156 p := len(elems) - 1 157 for ; p > 0 && elems[p].w[0] == 0; p-- { 158 elems[p].ccc = cccs[len(cccs)-1] 159 } 160 for ; p >= 0; p-- { 161 elems[p].ccc = cccs[0] 162 } 163 } else { 164 for i := range elems { 165 elems[i].ccc = cccs[i] 166 } 167 } 168 // doNorm in collate.go assumes that the following conditions hold. 169 if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] { 170 return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs) 171 } 172 b.root.newEntry(str, elems) 173 return nil 174 } 175 176 func (t *Tailoring) setAnchor(anchor string) error { 177 anchor = norm.NFC.String(anchor) 178 a := t.index.find(anchor) 179 if a == nil { 180 a = t.index.newEntry(anchor, nil) 181 a.implicit = true 182 a.modified = true 183 for _, r := range []rune(anchor) { 184 e := t.index.find(string(r)) 185 e.lock = true 186 } 187 } 188 t.anchor = a 189 return nil 190 } 191 192 // SetAnchor sets the point after which elements passed in subsequent calls to 193 // Insert will be inserted. It is equivalent to the reset directive in an LDML 194 // specification. See Insert for an example. 195 // SetAnchor supports the following logical reset positions: 196 // <first_tertiary_ignorable/>, <last_teriary_ignorable/>, <first_primary_ignorable/>, 197 // and <last_non_ignorable/>. 198 func (t *Tailoring) SetAnchor(anchor string) error { 199 if err := t.setAnchor(anchor); err != nil { 200 return err 201 } 202 t.before = false 203 return nil 204 } 205 206 // SetAnchorBefore is similar to SetAnchor, except that subsequent calls to 207 // Insert will insert entries before the anchor. 208 func (t *Tailoring) SetAnchorBefore(anchor string) error { 209 if err := t.setAnchor(anchor); err != nil { 210 return err 211 } 212 t.before = true 213 return nil 214 } 215 216 // Insert sets the ordering of str relative to the entry set by the previous 217 // call to SetAnchor or Insert. 
The argument extend corresponds 218 // to the extend elements as defined in LDML. A non-empty value for extend 219 // will cause the collation elements corresponding to extend to be appended 220 // to the collation elements generated for the entry added by Insert. 221 // This has the same net effect as sorting str after the string anchor+extend. 222 // See https://www.unicode.org/reports/tr10/#Tailoring_Example for details 223 // on parametric tailoring and https://unicode.org/reports/tr35/#Collation_Elements 224 // for full details on LDML. 225 // 226 // Examples: create a tailoring for Swedish, where "ä" is ordered after "z" 227 // at the primary sorting level: 228 // t := b.Tailoring("se") 229 // t.SetAnchor("z") 230 // t.Insert(colltab.Primary, "ä", "") 231 // Order "ü" after "ue" at the secondary sorting level: 232 // t.SetAnchor("ue") 233 // t.Insert(colltab.Secondary, "ü","") 234 // or 235 // t.SetAnchor("u") 236 // t.Insert(colltab.Secondary, "ü", "e") 237 // Order "q" afer "ab" at the secondary level and "Q" after "q" 238 // at the tertiary level: 239 // t.SetAnchor("ab") 240 // t.Insert(colltab.Secondary, "q", "") 241 // t.Insert(colltab.Tertiary, "Q", "") 242 // Order "b" before "a": 243 // t.SetAnchorBefore("a") 244 // t.Insert(colltab.Primary, "b", "") 245 // Order "0" after the last primary ignorable: 246 // t.SetAnchor("<last_primary_ignorable/>") 247 // t.Insert(colltab.Primary, "0", "") 248 func (t *Tailoring) Insert(level colltab.Level, str, extend string) error { 249 if t.anchor == nil { 250 return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str) 251 } 252 str = norm.NFC.String(str) 253 e := t.index.find(str) 254 if e == nil { 255 e = t.index.newEntry(str, nil) 256 } else if e.logical != noAnchor { 257 return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str) 258 } 259 if e.lock { 260 return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str) 261 } 262 a := t.anchor 263 // Find 
the first element after the anchor which differs at a level smaller or 264 // equal to the given level. Then insert at this position. 265 // See https://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details. 266 e.before = t.before 267 if t.before { 268 t.before = false 269 if a.prev == nil { 270 a.insertBefore(e) 271 } else { 272 for a = a.prev; a.level > level; a = a.prev { 273 } 274 a.insertAfter(e) 275 } 276 e.level = level 277 } else { 278 for ; a.level > level; a = a.next { 279 } 280 e.level = a.level 281 if a != e { 282 a.insertAfter(e) 283 a.level = level 284 } else { 285 // We don't set a to prev itself. This has the effect of the entry 286 // getting new collation elements that are an increment of itself. 287 // This is intentional. 288 a.prev.level = level 289 } 290 } 291 e.extend = norm.NFD.String(extend) 292 e.exclude = false 293 e.modified = true 294 e.elems = nil 295 t.anchor = e 296 return nil 297 } 298 299 func (o *ordering) getWeight(e *entry) []rawCE { 300 if len(e.elems) == 0 && e.logical == noAnchor { 301 if e.implicit { 302 for _, r := range e.runes { 303 e.elems = append(e.elems, o.getWeight(o.find(string(r)))...) 304 } 305 } else if e.before { 306 count := [colltab.Identity + 1]int{} 307 a := e 308 for ; a.elems == nil && !a.implicit; a = a.next { 309 count[a.level]++ 310 } 311 e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)} 312 for i := colltab.Primary; i < colltab.Quaternary; i++ { 313 if count[i] != 0 { 314 e.elems[0].w[i] -= count[i] 315 break 316 } 317 } 318 if e.prev != nil { 319 o.verifyWeights(e.prev, e, e.prev.level) 320 } 321 } else { 322 prev := e.prev 323 e.elems = nextWeight(prev.level, o.getWeight(prev)) 324 o.verifyWeights(e, e.next, e.level) 325 } 326 } 327 return e.elems 328 } 329 330 func (o *ordering) addExtension(e *entry) { 331 if ex := o.find(e.extend); ex != nil { 332 e.elems = append(e.elems, ex.elems...) 
333 } else { 334 for _, r := range []rune(e.extend) { 335 e.elems = append(e.elems, o.find(string(r)).elems...) 336 } 337 } 338 e.extend = "" 339 } 340 341 func (o *ordering) verifyWeights(a, b *entry, level colltab.Level) error { 342 if level == colltab.Identity || b == nil || b.elems == nil || a.elems == nil { 343 return nil 344 } 345 for i := colltab.Primary; i < level; i++ { 346 if a.elems[0].w[i] < b.elems[0].w[i] { 347 return nil 348 } 349 } 350 if a.elems[0].w[level] >= b.elems[0].w[level] { 351 err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems) 352 log.Println(err) 353 // TODO: return the error instead, or better, fix the conflicting entry by making room. 354 } 355 return nil 356 } 357 358 func (b *Builder) error(e error) { 359 if e != nil { 360 b.err = e 361 } 362 } 363 364 func (b *Builder) errorID(locale string, e error) { 365 if e != nil { 366 b.err = fmt.Errorf("%s:%v", locale, e) 367 } 368 } 369 370 // patchNorm ensures that NFC and NFD counterparts are consistent. 371 func (o *ordering) patchNorm() { 372 // Insert the NFD counterparts, if necessary. 373 for _, e := range o.ordered { 374 nfd := norm.NFD.String(e.str) 375 if nfd != e.str { 376 if e0 := o.find(nfd); e0 != nil && !e0.modified { 377 e0.elems = e.elems 378 } else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) { 379 e := o.newEntry(nfd, e.elems) 380 e.modified = true 381 } 382 } 383 } 384 // Update unchanged composed forms if one of their parts changed. 
385 for _, e := range o.ordered { 386 nfd := norm.NFD.String(e.str) 387 if e.modified || nfd == e.str { 388 continue 389 } 390 if e0 := o.find(nfd); e0 != nil { 391 e.elems = e0.elems 392 } else { 393 e.elems = o.genColElems(nfd) 394 if norm.NFD.LastBoundary([]byte(nfd)) == 0 { 395 r := []rune(nfd) 396 head := string(r[0]) 397 tail := "" 398 for i := 1; i < len(r); i++ { 399 s := norm.NFC.String(head + string(r[i])) 400 if e0 := o.find(s); e0 != nil && e0.modified { 401 head = s 402 } else { 403 tail += string(r[i]) 404 } 405 } 406 e.elems = append(o.genColElems(head), o.genColElems(tail)...) 407 } 408 } 409 } 410 // Exclude entries for which the individual runes generate the same collation elements. 411 for _, e := range o.ordered { 412 if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) { 413 e.exclude = true 414 } 415 } 416 } 417 418 func (b *Builder) buildOrdering(o *ordering) { 419 for _, e := range o.ordered { 420 o.getWeight(e) 421 } 422 for _, e := range o.ordered { 423 o.addExtension(e) 424 } 425 o.patchNorm() 426 o.sort() 427 simplify(o) 428 b.processExpansions(o) // requires simplify 429 b.processContractions(o) // requires simplify 430 431 t := newNode() 432 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 433 if !e.skip() { 434 ce, err := e.encode() 435 b.errorID(o.id, err) 436 t.insert(e.runes[0], ce) 437 } 438 } 439 o.handle = b.index.addTrie(t) 440 } 441 442 func (b *Builder) build() (*table, error) { 443 if b.built { 444 return b.t, b.err 445 } 446 b.built = true 447 b.t = &table{ 448 Table: colltab.Table{ 449 MaxContractLen: utf8.UTFMax, 450 VariableTop: uint32(b.varTop), 451 }, 452 } 453 454 b.buildOrdering(&b.root) 455 b.t.root = b.root.handle 456 for _, t := range b.locale { 457 b.buildOrdering(t.index) 458 if b.err != nil { 459 break 460 } 461 } 462 i, err := b.index.generate() 463 b.t.trie = *i 464 b.t.Index = colltab.Trie{ 465 Index: i.index, 466 Values: i.values, 467 Index0: i.index[blockSize*b.t.root.lookupStart:], 
468 Values0: i.values[blockSize*b.t.root.valueStart:], 469 } 470 b.error(err) 471 return b.t, b.err 472 } 473 474 // Build builds the root Collator. 475 func (b *Builder) Build() (colltab.Weighter, error) { 476 table, err := b.build() 477 if err != nil { 478 return nil, err 479 } 480 return table, nil 481 } 482 483 // Build builds a Collator for Tailoring t. 484 func (t *Tailoring) Build() (colltab.Weighter, error) { 485 // TODO: implement. 486 return nil, nil 487 } 488 489 // Print prints the tables for b and all its Tailorings as a Go file 490 // that can be included in the Collate package. 491 func (b *Builder) Print(w io.Writer) (n int, err error) { 492 p := func(nn int, e error) { 493 n += nn 494 if err == nil { 495 err = e 496 } 497 } 498 t, err := b.build() 499 if err != nil { 500 return 0, err 501 } 502 p(fmt.Fprintf(w, `var availableLocales = "und`)) 503 for _, loc := range b.locale { 504 if loc.id != "und" { 505 p(fmt.Fprintf(w, ",%s", loc.id)) 506 } 507 } 508 p(fmt.Fprint(w, "\"\n\n")) 509 p(fmt.Fprintf(w, "const varTop = 0x%x\n\n", b.varTop)) 510 p(fmt.Fprintln(w, "var locales = [...]tableIndex{")) 511 for _, loc := range b.locale { 512 if loc.id == "und" { 513 p(t.fprintIndex(w, loc.index.handle, loc.id)) 514 } 515 } 516 for _, loc := range b.locale { 517 if loc.id != "und" { 518 p(t.fprintIndex(w, loc.index.handle, loc.id)) 519 } 520 } 521 p(fmt.Fprint(w, "}\n\n")) 522 n, _, err = t.fprint(w, "main") 523 return 524 } 525 526 // reproducibleFromNFKD checks whether the given expansion could be generated 527 // from an NFKD expansion. 528 func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool { 529 // Length must be equal. 530 if len(exp) != len(nfkd) { 531 return false 532 } 533 for i, ce := range exp { 534 // Primary and secondary values should be equal. 535 if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] { 536 return false 537 } 538 // Tertiary values should be equal to maxTertiary for third element onwards. 
539 // TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can 540 // simply be dropped. Try this out by dropping the following code. 541 if i >= 2 && ce.w[2] != maxTertiary { 542 return false 543 } 544 if _, err := makeCE(ce); err != nil { 545 // Simply return false. The error will be caught elsewhere. 546 return false 547 } 548 } 549 return true 550 } 551 552 func simplify(o *ordering) { 553 // Runes that are a starter of a contraction should not be removed. 554 // (To date, there is only Kannada character 0CCA.) 555 keep := make(map[rune]bool) 556 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 557 if len(e.runes) > 1 { 558 keep[e.runes[0]] = true 559 } 560 } 561 // Tag entries for which the runes NFKD decompose to identical values. 562 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 563 s := e.str 564 nfkd := norm.NFKD.String(s) 565 nfd := norm.NFD.String(s) 566 if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd { 567 continue 568 } 569 if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) { 570 e.decompose = true 571 } 572 } 573 } 574 575 // appendExpansion converts the given collation sequence to 576 // collation elements and adds them to the expansion table. 577 // It returns an index to the expansion table. 578 func (b *Builder) appendExpansion(e *entry) int { 579 t := b.t 580 i := len(t.ExpandElem) 581 ce := uint32(len(e.elems)) 582 t.ExpandElem = append(t.ExpandElem, ce) 583 for _, w := range e.elems { 584 ce, err := makeCE(w) 585 if err != nil { 586 b.error(err) 587 return -1 588 } 589 t.ExpandElem = append(t.ExpandElem, ce) 590 } 591 return i 592 } 593 594 // processExpansions extracts data necessary to generate 595 // the extraction tables. 
596 func (b *Builder) processExpansions(o *ordering) { 597 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 598 if !e.expansion() { 599 continue 600 } 601 key := fmt.Sprintf("%v", e.elems) 602 i, ok := b.expIndex[key] 603 if !ok { 604 i = b.appendExpansion(e) 605 b.expIndex[key] = i 606 } 607 e.expansionIndex = i 608 } 609 } 610 611 func (b *Builder) processContractions(o *ordering) { 612 // Collate contractions per starter rune. 613 starters := []rune{} 614 cm := make(map[rune][]*entry) 615 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 616 if e.contraction() { 617 if len(e.str) > b.t.MaxContractLen { 618 b.t.MaxContractLen = len(e.str) 619 } 620 r := e.runes[0] 621 if _, ok := cm[r]; !ok { 622 starters = append(starters, r) 623 } 624 cm[r] = append(cm[r], e) 625 } 626 } 627 // Add entries of single runes that are at a start of a contraction. 628 for e := o.front(); e != nil; e, _ = e.nextIndexed() { 629 if !e.contraction() { 630 r := e.runes[0] 631 if _, ok := cm[r]; ok { 632 cm[r] = append(cm[r], e) 633 } 634 } 635 } 636 // Build the tries for the contractions. 637 t := b.t 638 for _, r := range starters { 639 l := cm[r] 640 // Compute suffix strings. There are 31 different contraction suffix 641 // sets for 715 contractions and 82 contraction starter runes as of 642 // version 6.0.0. 643 sufx := []string{} 644 hasSingle := false 645 for _, e := range l { 646 if len(e.runes) > 1 { 647 sufx = append(sufx, string(e.runes[1:])) 648 } else { 649 hasSingle = true 650 } 651 } 652 if !hasSingle { 653 b.error(fmt.Errorf("no single entry for starter rune %U found", r)) 654 continue 655 } 656 // Unique the suffix set. 657 sort.Strings(sufx) 658 key := strings.Join(sufx, "\n") 659 handle, ok := b.ctHandle[key] 660 if !ok { 661 var err error 662 handle, err = appendTrie(&t.ContractTries, sufx) 663 if err != nil { 664 b.error(err) 665 } 666 b.ctHandle[key] = handle 667 } 668 // Bucket sort entries in index order. 
669 es := make([]*entry, len(l)) 670 for _, e := range l { 671 var p, sn int 672 if len(e.runes) > 1 { 673 str := []byte(string(e.runes[1:])) 674 p, sn = lookup(&t.ContractTries, handle, str) 675 if sn != len(str) { 676 log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str)) 677 } 678 } 679 if es[p] != nil { 680 log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0]) 681 } 682 es[p] = e 683 } 684 // Create collation elements for contractions. 685 elems := []uint32{} 686 for _, e := range es { 687 ce, err := e.encodeBase() 688 b.errorID(o.id, err) 689 elems = append(elems, ce) 690 } 691 key = fmt.Sprintf("%v", elems) 692 i, ok := b.ctElem[key] 693 if !ok { 694 i = len(t.ContractElem) 695 b.ctElem[key] = i 696 t.ContractElem = append(t.ContractElem, elems...) 697 } 698 // Store info in entry for starter rune. 699 es[0].contractionIndex = i 700 es[0].contractionHandle = handle 701 } 702 }