github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/language/match.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package language 6 7 import "errors" 8 9 // Matcher is the interface that wraps the Match method. 10 // 11 // Match returns the best match for any of the given tags, along with 12 // a unique index associated with the returned tag and a confidence 13 // score. 14 type Matcher interface { 15 Match(t ...Tag) (tag Tag, index int, c Confidence) 16 } 17 18 // Comprehends reports the confidence score for a speaker of a given language 19 // to being able to comprehend the written form of an alternative language. 20 func Comprehends(speaker, alternative Tag) Confidence { 21 _, _, c := NewMatcher([]Tag{alternative}).Match(speaker) 22 return c 23 } 24 25 // NewMatcher returns a Matcher that matches an ordered list of preferred tags 26 // against a list of supported tags based on written intelligibility, closeness 27 // of dialect, equivalence of subtags and various other rules. It is initialized 28 // with the list of supported tags. The first element is used as the default 29 // value in case no match is found. 30 // 31 // Its Match method matches the first of the given Tags to reach a certain 32 // confidence threshold. The tags passed to Match should therefore be specified 33 // in order of preference. Extensions are ignored for matching. 34 // 35 // The index returned by the Match method corresponds to the index of the 36 // matched tag in t, but is augmented with the Unicode extension ('u')of the 37 // corresponding preferred tag. This allows user locale options to be passed 38 // transparently. 39 func NewMatcher(t []Tag) Matcher { 40 return newMatcher(t) 41 } 42 43 func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) { 44 match, w, c := m.getBest(want...) 
45 if match == nil { 46 t = m.default_.tag 47 } else { 48 t, index = match.tag, match.index 49 } 50 // Copy options from the user-provided tag into the result tag. This is hard 51 // to do after the fact, so we do it here. 52 // TODO: consider also adding in variants that are compatible with the 53 // matched language. 54 // TODO: Add back region if it is non-ambiguous? Or create another tag to 55 // preserve the region? 56 if u, ok := w.Extension('u'); ok { 57 t, _ = Raw.Compose(t, u) 58 } 59 return t, index, c 60 } 61 62 type scriptRegionFlags uint8 63 64 const ( 65 isList = 1 << iota 66 scriptInFrom 67 regionInFrom 68 ) 69 70 func (t *Tag) setUndefinedLang(id langID) { 71 if t.lang == 0 { 72 t.lang = id 73 } 74 } 75 76 func (t *Tag) setUndefinedScript(id scriptID) { 77 if t.script == 0 { 78 t.script = id 79 } 80 } 81 82 func (t *Tag) setUndefinedRegion(id regionID) { 83 if t.region == 0 || t.region.contains(id) { 84 t.region = id 85 } 86 } 87 88 // ErrMissingLikelyTagsData indicates no information was available 89 // to compute likely values of missing tags. 90 var ErrMissingLikelyTagsData = errors.New("missing likely tags data") 91 92 // addLikelySubtags sets subtags to their most likely value, given the locale. 93 // In most cases this means setting fields for unknown values, but in some 94 // cases it may alter a value. It returns a ErrMissingLikelyTagsData error 95 // if the given locale cannot be expanded. 96 func (t Tag) addLikelySubtags() (Tag, error) { 97 id, err := addTags(t) 98 if err != nil { 99 return t, err 100 } else if id.equalTags(t) { 101 return t, nil 102 } 103 id.remakeString() 104 return id, nil 105 } 106 107 // specializeRegion attempts to specialize a group region. 
108 func specializeRegion(t *Tag) bool { 109 if i := regionInclusion[t.region]; i < nRegionGroups { 110 x := likelyRegionGroup[i] 111 if langID(x.lang) == t.lang && scriptID(x.script) == t.script { 112 t.region = regionID(x.region) 113 } 114 return true 115 } 116 return false 117 } 118 119 func addTags(t Tag) (Tag, error) { 120 // We leave private use identifiers alone. 121 if t.private() { 122 return t, nil 123 } 124 if t.script != 0 && t.region != 0 { 125 if t.lang != 0 { 126 // already fully specified 127 specializeRegion(&t) 128 return t, nil 129 } 130 // Search matches for und-script-region. Note that for these cases 131 // region will never be a group so there is no need to check for this. 132 list := likelyRegion[t.region : t.region+1] 133 if x := list[0]; x.flags&isList != 0 { 134 list = likelyRegionList[x.lang : x.lang+uint16(x.script)] 135 } 136 for _, x := range list { 137 // Deviating from the spec. See match_test.go for details. 138 if scriptID(x.script) == t.script { 139 t.setUndefinedLang(langID(x.lang)) 140 return t, nil 141 } 142 } 143 } 144 if t.lang != 0 { 145 // Search matches for lang-script and lang-region, where lang != und. 146 if t.lang < langNoIndexOffset { 147 x := likelyLang[t.lang] 148 if x.flags&isList != 0 { 149 list := likelyLangList[x.region : x.region+uint16(x.script)] 150 if t.script != 0 { 151 for _, x := range list { 152 if scriptID(x.script) == t.script && x.flags&scriptInFrom != 0 { 153 t.setUndefinedRegion(regionID(x.region)) 154 return t, nil 155 } 156 } 157 } else if t.region != 0 { 158 count := 0 159 goodScript := true 160 tt := t 161 for _, x := range list { 162 // We visit all entries for which the script was not 163 // defined, including the ones where the region was not 164 // defined. This allows for proper disambiguation within 165 // regions. 
166 if x.flags&scriptInFrom == 0 && t.region.contains(regionID(x.region)) { 167 tt.region = regionID(x.region) 168 tt.setUndefinedScript(scriptID(x.script)) 169 goodScript = goodScript && tt.script == scriptID(x.script) 170 count++ 171 } 172 } 173 if count == 1 { 174 return tt, nil 175 } 176 // Even if we fail to find a unique Region, we might have 177 // an unambiguous script. 178 if goodScript { 179 t.script = tt.script 180 } 181 } 182 } 183 } 184 } else { 185 // Search matches for und-script. 186 if t.script != 0 { 187 x := likelyScript[t.script] 188 if x.region != 0 { 189 t.setUndefinedRegion(regionID(x.region)) 190 t.setUndefinedLang(langID(x.lang)) 191 return t, nil 192 } 193 } 194 // Search matches for und-region. If und-script-region exists, it would 195 // have been found earlier. 196 if t.region != 0 { 197 if i := regionInclusion[t.region]; i < nRegionGroups { 198 x := likelyRegionGroup[i] 199 if x.region != 0 { 200 t.setUndefinedLang(langID(x.lang)) 201 t.setUndefinedScript(scriptID(x.script)) 202 t.region = regionID(x.region) 203 } 204 } else { 205 x := likelyRegion[t.region] 206 if x.flags&isList != 0 { 207 x = likelyRegionList[x.lang] 208 } 209 if x.script != 0 && x.flags != scriptInFrom { 210 t.setUndefinedLang(langID(x.lang)) 211 t.setUndefinedScript(scriptID(x.script)) 212 return t, nil 213 } 214 } 215 } 216 } 217 218 // Search matches for lang. 
219 if t.lang < langNoIndexOffset { 220 x := likelyLang[t.lang] 221 if x.flags&isList != 0 { 222 x = likelyLangList[x.region] 223 } 224 if x.region != 0 { 225 t.setUndefinedScript(scriptID(x.script)) 226 t.setUndefinedRegion(regionID(x.region)) 227 } 228 specializeRegion(&t) 229 if t.lang == 0 { 230 t.lang = _en // default language 231 } 232 return t, nil 233 } 234 return t, ErrMissingLikelyTagsData 235 } 236 237 func (t *Tag) setTagsFrom(id Tag) { 238 t.lang = id.lang 239 t.script = id.script 240 t.region = id.region 241 } 242 243 // minimize removes the region or script subtags from t such that 244 // t.addLikelySubtags() == t.minimize().addLikelySubtags(). 245 func (t Tag) minimize() (Tag, error) { 246 t, err := minimizeTags(t) 247 if err != nil { 248 return t, err 249 } 250 t.remakeString() 251 return t, nil 252 } 253 254 // minimizeTags mimics the behavior of the ICU 51 C implementation. 255 func minimizeTags(t Tag) (Tag, error) { 256 if t.equalTags(und) { 257 return t, nil 258 } 259 max, err := addTags(t) 260 if err != nil { 261 return t, err 262 } 263 for _, id := range [...]Tag{ 264 {lang: t.lang}, 265 {lang: t.lang, region: t.region}, 266 {lang: t.lang, script: t.script}, 267 } { 268 if x, err := addTags(id); err == nil && max.equalTags(x) { 269 t.setTagsFrom(id) 270 break 271 } 272 } 273 return t, nil 274 } 275 276 // Tag Matching 277 // CLDR defines an algorithm for finding the best match between two sets of language 278 // tags. The basic algorithm defines how to score a possible match and then find 279 // the match with the best score 280 // (see http://www.unicode.org/reports/tr35/#LanguageMatching). 281 // Using scoring has several disadvantages. The scoring obfuscates the importance of 282 // the various factors considered, making the algorithm harder to understand. Using 283 // scoring also requires the full score to be computed for each pair of tags. 
//
// We will use a different algorithm which aims to have the following properties:
// - clarity on the precedence of the various selection factors, and
// - improved performance by allowing early termination of a comparison.
//
// Matching algorithm (overview)
// Input:
//   - supported: a set of supported tags
//   - default:   the default tag to return in case there is no match
//   - desired:   list of desired tags, ordered by preference, starting with
//                the most-preferred.
//
// Algorithm:
//   1) Set the best match to the lowest confidence level
//   2) For each tag in "desired":
//     a) For each tag in "supported":
//        1) compute the match between the two tags.
//        2) if the match is better than the previous best match, replace it
//           with the new match. (see next section)
//     b) if the current best match is above a certain threshold, return this
//        match without proceeding to the next tag in "desired". [See Note 1]
//   3) If the best match so far is below a certain threshold, return "default".
//
// Ranking:
// We use two phases to determine whether one pair of tags are a better match
// than another pair of tags. First, we determine a rough confidence level. If the
// levels are different, the one with the highest confidence wins.
// Second, if the rough confidence levels are identical, we use a set of tie-breaker
// rules.
//
// The confidence level of matching a pair of tags is determined by finding the
// lowest confidence level of any matches of the corresponding subtags (the
// result is deemed as good as its weakest link).
// We define the following levels:
//   Exact    - An exact match of a subtag, before adding likely subtags.
//   MaxExact - An exact match of a subtag, after adding likely subtags.
//              [See Note 2].
//   High     - High level of mutual intelligibility between different subtag
//              variants.
//   Low      - Low level of mutual intelligibility between different subtag
//              variants.
//   No       - No mutual intelligibility.
//
// The following levels can occur for each type of subtag:
//   Base:    Exact, MaxExact, High, Low, No
//   Script:  Exact, MaxExact [see Note 3], Low, No
//   Region:  Exact, MaxExact, High
//   Variant: Exact, High
//   Private: Exact, No
//
// Any result with a confidence level of Low or higher is deemed a possible match.
// Once a desired tag matches any of the supported tags with a level of MaxExact
// or higher, the next desired tag is not considered (see Step 2.b).
// Note that CLDR provides languageMatching data that defines close equivalence
// classes for base languages, scripts and regions.
//
// Tie-breaking
// If we get the same confidence level for two matches, we apply a sequence of
// tie-breaking rules. The first that succeeds defines the result. The rules are
// applied in the following order.
//   1) Original language was defined and was identical.
//   2) Original region was defined and was identical.
//   3) Distance between two maximized regions was the smallest.
//   4) Original script was defined and was identical.
//   5) Distance from want tag to have tag using the parent relation [see Note 5].
// If there is still no winner after these rules are applied, the first match
// found wins.
//
// Notes:
// [1] Note that even if we may not have a perfect match, if a match is above a
//     certain threshold, it is considered a better match than any other match
//     to a tag later in the list of preferred language tags.
// [2] In practice, as matching of Exact is done in a separate phase from
//     matching the other levels, we reuse the Exact level to mean MaxExact in
//     the second phase. As a consequence, we only need the levels defined by
//     the Confidence type. The MaxExact confidence level is mapped to High in
//     the public API.
// [3] We do not differentiate between maximized script values that were derived
//     from suppressScript versus most likely tag data. We determined that in
//     ranking the two, one ranks just after the other. Moreover, the two cannot
//     occur concurrently. As a consequence, they are identical for practical
//     purposes.
// [4] In case of deprecated, macro-equivalents and legacy mappings, we assign
//     the MaxExact level to allow iw vs he to still be a closer match than
//     en-AU vs en-US, for example.
// [5] In CLDR a locale inherits fields that are unspecified for this locale
//     from its parent. Therefore, if a locale is a parent of another locale,
//     it is a strong measure for closeness, especially when no other tie
//     breaker rule applies. One could also argue it is inconsistent, for
//     example, when pt-AO matches pt (which CLDR equates with pt-BR), even
//     though its parent is pt-PT according to the inheritance rules.
//
// Implementation Details:
// There are several performance considerations worth pointing out. Most notably,
// we preprocess as much as possible (within reason) at the time of creation of a
// matcher. This includes:
//   - creating a per-language map, which includes data for the raw base language
//     and its canonicalized variant (if applicable),
//   - expanding entries for the equivalence classes defined in CLDR's
//     languageMatch data.
// The per-language map ensures that typically only a very small number of tags
// need to be considered. The pre-expansion of canonicalized subtags and
// equivalence classes reduces the amount of map lookups that need to be done at
// runtime.

// matcher keeps a set of supported language tags, indexed by language.
390 type matcher struct { 391 default_ *haveTag 392 index map[langID]*matchHeader 393 passSettings bool 394 } 395 396 // matchHeader has the lists of tags for exact matches and matches based on 397 // maximized and canonicalized tags for a given language. 398 type matchHeader struct { 399 exact []haveTag 400 max []haveTag 401 } 402 403 // haveTag holds a supported Tag and its maximized script and region. The maximized 404 // or canonicalized language is not stored as it is not needed during matching. 405 type haveTag struct { 406 tag Tag 407 408 // index of this tag in the original list of supported tags. 409 index int 410 411 // conf is the maximum confidence that can result from matching this haveTag. 412 // When conf < Exact this means it was inserted after applying a CLDR equivalence rule. 413 conf Confidence 414 415 // Maximized region and script. 416 maxRegion regionID 417 maxScript scriptID 418 419 // altScript may be checked as an alternative match to maxScript. If altScript 420 // matches, the confidence level for this match is Low. Theoretically there 421 // could be multiple alternative scripts. This does not occur in practice. 422 altScript scriptID 423 424 // nextMax is the index of the next haveTag with the same maximized tags. 425 nextMax uint16 426 } 427 428 func makeHaveTag(tag Tag, index int) (haveTag, langID) { 429 max := tag 430 if tag.lang != 0 { 431 max, _ = max.canonicalize(All) 432 max, _ = addTags(max) 433 max.remakeString() 434 } 435 return haveTag{tag, index, Exact, max.region, max.script, altScript(max.lang, max.script), 0}, max.lang 436 } 437 438 // altScript returns an alternative script that may match the given script with 439 // a low confidence. At the moment, the langMatch data allows for at most one 440 // script to map to another and we rely on this to keep the code simple. 
441 func altScript(l langID, s scriptID) scriptID { 442 for _, alt := range matchScript { 443 if (alt.lang == 0 || langID(alt.lang) == l) && scriptID(alt.have) == s { 444 return scriptID(alt.want) 445 } 446 } 447 return 0 448 } 449 450 // addIfNew adds a haveTag to the list of tags only if it is a unique tag. 451 // Tags that have the same maximized values are linked by index. 452 func (h *matchHeader) addIfNew(n haveTag, exact bool) { 453 // Don't add new exact matches. 454 for _, v := range h.exact { 455 if v.tag.equalsRest(n.tag) { 456 return 457 } 458 } 459 if exact { 460 h.exact = append(h.exact, n) 461 } 462 // Allow duplicate maximized tags, but create a linked list to allow quickly 463 // comparing the equivalents and bail out. 464 for i, v := range h.max { 465 if v.maxScript == n.maxScript && 466 v.maxRegion == n.maxRegion && 467 v.tag.variantOrPrivateTagStr() == n.tag.variantOrPrivateTagStr() { 468 for h.max[i].nextMax != 0 { 469 i = int(h.max[i].nextMax) 470 } 471 h.max[i].nextMax = uint16(len(h.max)) 472 break 473 } 474 } 475 h.max = append(h.max, n) 476 } 477 478 // header returns the matchHeader for the given language. It creates one if 479 // it doesn't already exist. 480 func (m *matcher) header(l langID) *matchHeader { 481 if h := m.index[l]; h != nil { 482 return h 483 } 484 h := &matchHeader{} 485 m.index[l] = h 486 return h 487 } 488 489 // newMatcher builds an index for the given supported tags and returns it as 490 // a matcher. It also expands the index by considering various equivalence classes 491 // for a given tag. 492 func newMatcher(supported []Tag) *matcher { 493 m := &matcher{ 494 index: make(map[langID]*matchHeader), 495 } 496 if len(supported) == 0 { 497 m.default_ = &haveTag{} 498 return m 499 } 500 // Add supported languages to the index. Add exact matches first to give 501 // them precedence. 
502 for i, tag := range supported { 503 pair, _ := makeHaveTag(tag, i) 504 m.header(tag.lang).addIfNew(pair, true) 505 } 506 m.default_ = &m.header(supported[0].lang).exact[0] 507 for i, tag := range supported { 508 pair, max := makeHaveTag(tag, i) 509 if max != tag.lang { 510 m.header(max).addIfNew(pair, false) 511 } 512 } 513 514 // update is used to add indexes in the map for equivalent languages. 515 // If force is true, the update will also apply to derived entries. To 516 // avoid applying a "transitive closure", use false. 517 update := func(want, have uint16, conf Confidence, force bool) { 518 if hh := m.index[langID(have)]; hh != nil { 519 if !force && len(hh.exact) == 0 { 520 return 521 } 522 hw := m.header(langID(want)) 523 for _, v := range hh.max { 524 if conf < v.conf { 525 v.conf = conf 526 } 527 v.nextMax = 0 // this value needs to be recomputed 528 if v.altScript != 0 { 529 v.altScript = altScript(langID(want), v.maxScript) 530 } 531 hw.addIfNew(v, conf == Exact && len(hh.exact) > 0) 532 } 533 } 534 } 535 536 // Add entries for languages with mutual intelligibility as defined by CLDR's 537 // languageMatch data. 538 for _, ml := range matchLang { 539 update(ml.want, ml.have, Confidence(ml.conf), false) 540 if !ml.oneway { 541 update(ml.have, ml.want, Confidence(ml.conf), false) 542 } 543 } 544 545 // Add entries for possible canonicalizations. This is an optimization to 546 // ensure that only one map lookup needs to be done at runtime per desired tag. 547 // First we match deprecated equivalents. If they are perfect equivalents 548 // (their canonicalization simply substitutes a different language code, but 549 // nothing else), the match confidence is Exact, otherwise it is High. 550 for i, lm := range langAliasMap { 551 if lm.from == _sh { 552 continue 553 } 554 555 // If deprecated codes match and there is no fiddling with the script or 556 // or region, we consider it an exact match. 
557 conf := Exact 558 if langAliasTypes[i] != langMacro { 559 if !isExactEquivalent(langID(lm.from)) { 560 conf = High 561 } 562 update(lm.to, lm.from, conf, true) 563 } 564 update(lm.from, lm.to, conf, true) 565 } 566 return m 567 } 568 569 // getBest gets the best matching tag in m for any of the given tags, taking into 570 // account the order of preference of the given tags. 571 func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) { 572 best := bestMatch{} 573 for _, w := range want { 574 var max Tag 575 // Check for exact match first. 576 h := m.index[w.lang] 577 if w.lang != 0 { 578 // Base language is defined. 579 if h == nil { 580 continue 581 } 582 for i := range h.exact { 583 have := &h.exact[i] 584 if have.tag.equalsRest(w) { 585 return have, w, Exact 586 } 587 } 588 max, _ = w.canonicalize(Legacy | Deprecated) 589 max, _ = addTags(max) 590 } else { 591 // Base language is not defined. 592 if h != nil { 593 for i := range h.exact { 594 have := &h.exact[i] 595 if have.tag.equalsRest(w) { 596 return have, w, Exact 597 } 598 } 599 } 600 if w.script == 0 && w.region == 0 { 601 // We skip all tags matching und for approximate matching, including 602 // private tags. 603 continue 604 } 605 max, _ = addTags(w) 606 if h = m.index[max.lang]; h == nil { 607 continue 608 } 609 } 610 // Check for match based on maximized tag. 611 for i := range h.max { 612 have := &h.max[i] 613 best.update(have, w, max.script, max.region) 614 if best.conf == Exact { 615 for have.nextMax != 0 { 616 have = &h.max[have.nextMax] 617 best.update(have, w, max.script, max.region) 618 } 619 return best.have, best.want, High 620 } 621 } 622 } 623 if best.conf <= No { 624 if len(want) != 0 { 625 return nil, want[0], No 626 } 627 return nil, Tag{}, No 628 } 629 return best.have, best.want, best.conf 630 } 631 632 // bestMatch accumulates the best match so far. 
633 type bestMatch struct { 634 have *haveTag 635 want Tag 636 conf Confidence 637 // Cached results from applying tie-breaking rules. 638 origLang bool 639 origReg bool 640 regDist uint8 641 origScript bool 642 parentDist uint8 // 255 if have is not an ancestor of want tag. 643 } 644 645 // update updates the existing best match if the new pair is considered to be a 646 // better match. 647 // To determine if the given pair is a better match, it first computes the rough 648 // confidence level. If this surpasses the current match, it will replace it and 649 // update the tie-breaker rule cache. If there is a tie, it proceeds with applying 650 // a series of tie-breaker rules. If there is no conclusive winner after applying 651 // the tie-breaker rules, it leaves the current match as the preferred match. 652 func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion regionID) { 653 // Bail if the maximum attainable confidence is below that of the current best match. 654 c := have.conf 655 if c < m.conf { 656 return 657 } 658 if have.maxScript != maxScript { 659 // There is usually very little comprehension between different scripts. 660 // In a few cases there may still be Low comprehension. This possibility is 661 // pre-computed and stored in have.altScript. 662 if Low < m.conf || have.altScript != maxScript { 663 return 664 } 665 c = Low 666 } else if have.maxRegion != maxRegion { 667 // There is usually a small difference between languages across regions. 668 // We use the region distance (below) to disambiguate between equal matches. 669 if High < c { 670 c = High 671 } 672 } 673 674 // We store the results of the computations of the tie-breaker rules along 675 // with the best match. There is no need to do the checks once we determine 676 // we have a winner, but we do still need to do the tie-breaker computations. 677 // We use "beaten" to keep track if we still need to do the checks. 
678 beaten := false // true if the new pair defeats the current one. 679 if c != m.conf { 680 if c < m.conf { 681 return 682 } 683 beaten = true 684 } 685 686 // Tie-breaker rules: 687 // We prefer if the pre-maximized language was specified and identical. 688 origLang := have.tag.lang == tag.lang && tag.lang != 0 689 if !beaten && m.origLang != origLang { 690 if m.origLang { 691 return 692 } 693 beaten = true 694 } 695 696 // We prefer if the pre-maximized region was specified and identical. 697 origReg := have.tag.region == tag.region && tag.region != 0 698 if !beaten && m.origReg != origReg { 699 if m.origReg { 700 return 701 } 702 beaten = true 703 } 704 705 // Next we prefer smaller distances between regions, as defined by regionDist. 706 regDist := regionDist(have.maxRegion, maxRegion, tag.lang) 707 if !beaten && m.regDist != regDist { 708 if regDist > m.regDist { 709 return 710 } 711 beaten = true 712 } 713 714 // Next we prefer if the pre-maximized script was specified and identical. 715 origScript := have.tag.script == tag.script && tag.script != 0 716 if !beaten && m.origScript != origScript { 717 if m.origScript { 718 return 719 } 720 beaten = true 721 } 722 723 // Finally we prefer tags which have a closer parent relationship. 724 parentDist := parentDistance(have.tag.region, tag) 725 if !beaten && m.parentDist != parentDist { 726 if parentDist > m.parentDist { 727 return 728 } 729 beaten = true 730 } 731 732 // Update m to the newly found best match. 733 if beaten { 734 m.have = have 735 m.want = tag 736 m.conf = c 737 m.origLang = origLang 738 m.origReg = origReg 739 m.origScript = origScript 740 m.regDist = regDist 741 m.parentDist = parentDist 742 } 743 } 744 745 // parentDistance returns the number of times Parent must be called before the 746 // regions match. It is assumed that it has already been checked that lang and 747 // script are identical. If haveRegion does not occur in the ancestor chain of 748 // tag, it returns 255. 
749 func parentDistance(haveRegion regionID, tag Tag) uint8 { 750 p := tag.Parent() 751 d := uint8(1) 752 for haveRegion != p.region { 753 if p.region == 0 { 754 return 255 755 } 756 p = p.Parent() 757 d++ 758 } 759 return d 760 } 761 762 // regionDist wraps regionDistance with some exceptions to the algorithmic distance. 763 func regionDist(a, b regionID, lang langID) uint8 { 764 if lang == _en { 765 // Two variants of non-US English are close to each other, regardless of distance. 766 if a != _US && b != _US { 767 return 2 768 } 769 } 770 return uint8(regionDistance(a, b)) 771 } 772 773 // regionDistance computes the distance between two regions based on the 774 // distance in the graph of region containments as defined in CLDR. It iterates 775 // over increasingly inclusive sets of groups, represented as bit vectors, until 776 // the source bit vector has bits in common with the destination vector. 777 func regionDistance(a, b regionID) int { 778 if a == b { 779 return 0 780 } 781 p, q := regionInclusion[a], regionInclusion[b] 782 if p < nRegionGroups { 783 p, q = q, p 784 } 785 set := regionInclusionBits 786 if q < nRegionGroups && set[p]&(1<<q) != 0 { 787 return 1 788 } 789 d := 2 790 for goal := set[q]; set[p]&goal == 0; p = regionInclusionNext[p] { 791 d++ 792 } 793 return d 794 } 795 796 func (t Tag) variants() string { 797 if t.pVariant == 0 { 798 return "" 799 } 800 return t.str[t.pVariant:t.pExt] 801 } 802 803 // variantOrPrivateTagStr returns variants or private use tags. 804 func (t Tag) variantOrPrivateTagStr() string { 805 if t.pExt > 0 { 806 return t.str[t.pVariant:t.pExt] 807 } 808 return t.str[t.pVariant:] 809 } 810 811 // equalsRest compares everything except the language. 812 func (a Tag) equalsRest(b Tag) bool { 813 // TODO: don't include extensions in this comparison. To do this efficiently, 814 // though, we should handle private tags separately. 
815 return a.script == b.script && a.region == b.region && a.variantOrPrivateTagStr() == b.variantOrPrivateTagStr() 816 } 817 818 // isExactEquivalent returns true if canonicalizing the language will not alter 819 // the script or region of a tag. 820 func isExactEquivalent(l langID) bool { 821 for _, o := range notEquivalent { 822 if o == l { 823 return false 824 } 825 } 826 return true 827 } 828 829 var notEquivalent []langID 830 831 func init() { 832 // Create a list of all languages for which canonicalization may alter the 833 // script or region. 834 for _, lm := range langAliasMap { 835 tag := Tag{lang: langID(lm.from)} 836 if tag, _ = tag.canonicalize(All); tag.script != 0 || tag.region != 0 { 837 notEquivalent = append(notEquivalent, langID(lm.from)) 838 } 839 } 840 }