// github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/golang/text/language/language.go (about)

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:generate go run maketables.go gen_common.go -output tables.go
//go:generate go run gen_index.go

// Package language implements BCP 47 language tags and related functionality.
//
// The Tag type, which is used to represent languages, is agnostic to the
// meaning of its subtags. Tags are not fully canonicalized, so as to preserve
// information that may be valuable in certain contexts. As a consequence, two
// different tags may represent identical languages.
//
// Initializing language- or locale-specific components usually consists of
// two steps. The first step is to select a display language based on the
// preferred languages of the user and the languages supported by an application.
// The second step is to create the language-specific services based on
// this selection. Each is discussed in more detail below.
//
// Matching preferred against supported languages
//
// An application may support various languages. This list is typically limited
// by the languages for which there exist translations of the user interface.
// Similarly, a user may provide a list of preferred languages which is limited
// by the languages understood by this user.
// An application should use a Matcher to find the best supported language based
// on the user's preferred list.
// Matchers are aware of the intricacies of equivalence between languages.
// The default Matcher implementation takes into account things such as
// deprecated subtags, legacy tags, and mutual intelligibility between scripts
// and languages.
//
// A Matcher for English, Australian English, Danish, and standard Mandarin can
// be defined as follows:
//
//	var matcher = language.NewMatcher([]language.Tag{
//		language.English, // The first language is used as fallback.
//		language.MustParse("en-AU"),
//		language.Danish,
//		language.Chinese,
//	})
//
// The following code selects the best match for someone speaking Spanish and
// Norwegian:
//
//	preferred := []language.Tag{ language.Spanish, language.Norwegian }
//	tag, _, _ := matcher.Match(preferred...)
//
// In this case, the best match is Danish, as Danish is a sufficiently close
// match to Norwegian that there is no need to fall back to the default.
// See ParseAcceptLanguage for how to handle the Accept-Language HTTP header.
//
// Selecting language-specific services
//
// One should always use the Tag returned by the Matcher to create an instance
// of any of the language-specific services provided by the text repository.
// This prevents the mixing of languages, such as having a different language for
// messages and display names, as well as improper casing or sorting order for
// the selected language.
// Using the returned Tag also allows user-defined settings, such as collation
// order or numbering system to be transparently passed as options.
//
// If you have language-specific data in your application, however, it will in
// most cases suffice to use the index returned by the matcher to identify
// the user language.
// The following loop provides an alternative in case this is not sufficient:
//
//	supported := map[language.Tag]data{
//		language.English:            enData,
//		language.MustParse("en-AU"): enAUData,
//		language.Danish:             daData,
//		language.Chinese:            zhData,
//	}
//	tag, _, _ := matcher.Match(preferred...)
//	for ; tag != language.Und; tag = tag.Parent() {
//		if v, ok := supported[tag]; ok {
//			return v
//		}
//	}
//	return enData // should not reach here
//
// Repeatedly taking the Parent of the tag returned by Match will eventually
// match one of the tags used to initialize the Matcher.
//
// Canonicalization
//
// By default, only legacy and deprecated tags are converted into their
// canonical equivalent. All other information is preserved. This approach makes
// the confidence scores more accurate and allows matchers to distinguish
// between variants that are otherwise lost.
//
// As a consequence, two tags that should be treated as identical according to
// BCP 47 or CLDR, like "en-Latn" and "en", will be represented differently. The
// Matchers will handle such distinctions, though, and are aware of the
// equivalence relations. The CanonType type can be used to alter the
// canonicalization form.
//
// References
//
// BCP 47 - Tags for Identifying Languages
// http://tools.ietf.org/html/bcp47
package language // import "github.com/insionng/yougam/libraries/x/text/language"

// TODO: Remove above NOTE after:
//   - verifying that tables are dropped correctly (most notably matcher tables).

import (
	"errors"
	"fmt"
	"strings"
)

// Buffer sizes used when formatting tags into fixed-size stack arrays.
const (
	// maxCoreSize is the maximum size of a BCP 47 tag without variants and
	// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
	// It sizes the buffer in Tag.String and bounds what genCoreBytes writes.
	maxCoreSize = 12

	// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
	// is large enough to hold at least 99% of the BCP 47 tags. It sizes the
	// scratch buffer in remakeString; longer tags still work because that
	// buffer is extended with append.
	max99thPercentileSize = 32

	// maxSimpleUExtensionSize is the maximum size of a -u extension with one
	// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
	// SetTypeForKey adds it to maxCoreSize to size its scratch buffer.
	maxSimpleUExtensionSize = 14
)

// Tag represents a BCP 47 language tag.
It is used to specify an instance of a 129 // specific language or locale. All language tag values are guaranteed to be 130 // well-formed. 131 type Tag struct { 132 lang langID 133 region regionID 134 script scriptID 135 pVariant byte // offset in str, includes preceding '-' 136 pExt uint16 // offset of first extension, includes preceding '-' 137 138 // str is the string representation of the Tag. It will only be used if the 139 // tag has variants or extensions. 140 str string 141 } 142 143 // Make is a convenience wrapper for Parse that omits the error. 144 // In case of an error, a sensible default is returned. 145 func Make(s string) Tag { 146 return Default.Make(s) 147 } 148 149 // Make is a convenience wrapper for c.Parse that omits the error. 150 // In case of an error, a sensible default is returned. 151 func (c CanonType) Make(s string) Tag { 152 t, _ := c.Parse(s) 153 return t 154 } 155 156 // Raw returns the raw base language, script and region, without making an 157 // attempt to infer their values. 158 func (t Tag) Raw() (b Base, s Script, r Region) { 159 return Base{t.lang}, Script{t.script}, Region{t.region} 160 } 161 162 // equalTags compares language, script and region subtags only. 163 func (t Tag) equalTags(a Tag) bool { 164 return t.lang == a.lang && t.script == a.script && t.region == a.region 165 } 166 167 // IsRoot returns true if t is equal to language "und". 168 func (t Tag) IsRoot() bool { 169 if int(t.pVariant) < len(t.str) { 170 return false 171 } 172 return t.equalTags(und) 173 } 174 175 // private reports whether the Tag consists solely of a private use tag. 176 func (t Tag) private() bool { 177 return t.str != "" && t.pVariant == 0 178 } 179 180 // CanonType can be used to enable or disable various types of canonicalization. 181 type CanonType int 182 183 const ( 184 // Replace deprecated base languages with their preferred replacements. 
185 DeprecatedBase CanonType = 1 << iota 186 // Replace deprecated scripts with their preferred replacements. 187 DeprecatedScript 188 // Replace deprecated regions with their preferred replacements. 189 DeprecatedRegion 190 // Remove redundant scripts. 191 SuppressScript 192 // Normalize legacy encodings. This includes legacy languages defined in 193 // CLDR as well as bibliographic codes defined in ISO-639. 194 Legacy 195 // Map the dominant language of a macro language group to the macro language 196 // subtag. For example cmn -> zh. 197 Macro 198 // The CLDR flag should be used if full compatibility with CLDR is required. 199 // There are a few cases where language.Tag may differ from CLDR. To follow all 200 // of CLDR's suggestions, use All|CLDR. 201 CLDR 202 203 // Raw can be used to Compose or Parse without Canonicalization. 204 Raw CanonType = 0 205 206 // Replace all deprecated tags with their preferred replacements. 207 Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion 208 209 // All canonicalizations recommended by BCP 47. 210 BCP47 = Deprecated | SuppressScript 211 212 // All canonicalizations. 213 All = BCP47 | Legacy | Macro 214 215 // Default is the canonicalization used by Parse, Make and Compose. To 216 // preserve as much information as possible, canonicalizations that remove 217 // potentially valuable information are not included. The Matcher is 218 // designed to recognize similar tags that would be the same if 219 // they were canonicalized using All. 220 Default = Deprecated | Legacy 221 222 canonLang = DeprecatedBase | Legacy | Macro 223 224 // TODO: LikelyScript, LikelyRegion: suppress similar to ICU. 225 ) 226 227 // canonicalize returns the canonicalized equivalent of the tag and 228 // whether there was any change. 
// It works on a copy of t (value receiver) and only rewrites the lang, script
// and region fields; t.str is left untouched, so a caller that needs the
// string form to match must rebuild it (Canonicalize does so through
// remakeString when a change is reported).
func (t Tag) canonicalize(c CanonType) (Tag, bool) {
	// Raw has no flag bits set: nothing to do.
	if c == Raw {
		return t, false
	}
	changed := false
	if c&SuppressScript != 0 {
		// Drop the script when it equals the suppress-script entry recorded
		// for this language. Only languages below langNoIndexOffset have an
		// entry in the suppressScript table.
		if t.lang < langNoIndexOffset && uint8(t.script) == suppressScript[t.lang] {
			t.script = 0
			changed = true
		}
	}
	if c&canonLang != 0 {
		// The loop body normally runs once. It only repeats (via continue)
		// after a deprecated base language was replaced, so that the
		// replacement can itself be subject to legacy or macro mapping.
		for {
			if l, aliasType := normLang(t.lang); l != t.lang {
				switch aliasType {
				case langLegacy:
					if c&Legacy != 0 {
						// Legacy "sh" without an explicit script is given
						// Latn before the language is replaced.
						if t.lang == _sh && t.script == 0 {
							t.script = _Latn
						}
						t.lang = l
						changed = true
					}
				case langMacro:
					if c&Macro != 0 {
						// We deviate here from CLDR. The mapping "nb" -> "no"
						// qualifies as a typical Macro language mapping.  However,
						// for legacy reasons, CLDR maps "no", the macro language
						// code for Norwegian, to the dominant variant "nb". This
						// change is currently under consideration for CLDR as well.
						// See http://unicode.org/cldr/trac/ticket/2698 and also
						// http://unicode.org/cldr/trac/ticket/1790 for some of the
						// practical implications. TODO: this check could be removed
						// if CLDR adopts this change.
						if c&CLDR == 0 || t.lang != _nb {
							changed = true
							t.lang = l
						}
					}
				case langDeprecated:
					if c&DeprecatedBase != 0 {
						// Deprecated "mo" without an explicit region is given
						// region MD before the language is replaced.
						if t.lang == _mo && t.region == 0 {
							t.region = _MD
						}
						t.lang = l
						changed = true
						// Other canonicalization types may still apply.
						continue
					}
				}
			} else if c&Legacy != 0 && t.lang == _no && c&CLDR != 0 {
				// CLDR compatibility: "no" becomes "nb" when both Legacy and
				// CLDR are requested.
				t.lang = _nb
				changed = true
			}
			break
		}
	}
	if c&DeprecatedScript != 0 {
		// Qaai is the only script replaced here; it maps to Zinh.
		if t.script == _Qaai {
			changed = true
			t.script = _Zinh
		}
	}
	if c&DeprecatedRegion != 0 {
		// normRegion yields 0 when there is no replacement for the region.
		if r := normRegion(t.region); r != 0 {
			changed = true
			t.region = r
		}
	}
	return t, changed
}

// Canonicalize returns the canonicalized equivalent of the tag.
302 func (c CanonType) Canonicalize(t Tag) (Tag, error) { 303 t, changed := t.canonicalize(c) 304 if changed { 305 t.remakeString() 306 } 307 return t, nil 308 } 309 310 // Confidence indicates the level of certainty for a given return value. 311 // For example, Serbian may be written in Cyrillic or Latin script. 312 // The confidence level indicates whether a value was explicitly specified, 313 // whether it is typically the only possible value, or whether there is 314 // an ambiguity. 315 type Confidence int 316 317 const ( 318 No Confidence = iota // full confidence that there was no match 319 Low // most likely value picked out of a set of alternatives 320 High // value is generally assumed to be the correct match 321 Exact // exact match or explicitly specified value 322 ) 323 324 var confName = []string{"No", "Low", "High", "Exact"} 325 326 func (c Confidence) String() string { 327 return confName[c] 328 } 329 330 // remakeString is used to update t.str in case lang, script or region changed. 331 // It is assumed that pExt and pVariant still point to the start of the 332 // respective parts. 333 func (t *Tag) remakeString() { 334 if t.str == "" { 335 return 336 } 337 extra := t.str[t.pVariant:] 338 if t.pVariant > 0 { 339 extra = extra[1:] 340 } 341 if t.equalTags(und) && strings.HasPrefix(extra, "x-") { 342 t.str = extra 343 t.pVariant = 0 344 t.pExt = 0 345 return 346 } 347 var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases. 348 b := buf[:t.genCoreBytes(buf[:])] 349 if extra != "" { 350 diff := uint8(len(b)) - t.pVariant 351 b = append(b, '-') 352 b = append(b, extra...) 353 t.pVariant += diff 354 t.pExt += uint16(diff) 355 } else { 356 t.pVariant = uint8(len(b)) 357 t.pExt = uint16(len(b)) 358 } 359 t.str = string(b) 360 } 361 362 // genCoreBytes writes a string for the base languages, script and region tags 363 // to the given buffer and returns the number of bytes written. 
It will never 364 // write more than maxCoreSize bytes. 365 func (t *Tag) genCoreBytes(buf []byte) int { 366 n := t.lang.stringToBuf(buf[:]) 367 if t.script != 0 { 368 n += copy(buf[n:], "-") 369 n += copy(buf[n:], t.script.String()) 370 } 371 if t.region != 0 { 372 n += copy(buf[n:], "-") 373 n += copy(buf[n:], t.region.String()) 374 } 375 return n 376 } 377 378 // String returns the canonical string representation of the language tag. 379 func (t Tag) String() string { 380 if t.str != "" { 381 return t.str 382 } 383 if t.script == 0 && t.region == 0 { 384 return t.lang.String() 385 } 386 buf := [maxCoreSize]byte{} 387 return string(buf[:t.genCoreBytes(buf[:])]) 388 } 389 390 // Base returns the base language of the language tag. If the base language is 391 // unspecified, an attempt will be made to infer it from the context. 392 // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change. 393 func (t Tag) Base() (Base, Confidence) { 394 if t.lang != 0 { 395 return Base{t.lang}, Exact 396 } 397 c := High 398 if t.script == 0 && !(Region{t.region}).IsCountry() { 399 c = Low 400 } 401 if tag, err := addTags(t); err == nil && tag.lang != 0 { 402 return Base{tag.lang}, c 403 } 404 return Base{0}, No 405 } 406 407 // Script infers the script for the language tag. If it was not explicitly given, it will infer 408 // a most likely candidate. 409 // If more than one script is commonly used for a language, the most likely one 410 // is returned with a low confidence indication. For example, it returns (Cyrl, Low) 411 // for Serbian. 412 // If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined) 413 // as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks 414 // common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts. 415 // See http://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for 416 // unknown value in CLDR. 
(Zzzz, Exact) is returned if Zzzz was explicitly specified. 417 // Note that an inferred script is never guaranteed to be the correct one. Latin is 418 // almost exclusively used for Afrikaans, but Arabic has been used for some texts 419 // in the past. Also, the script that is commonly used may change over time. 420 // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change. 421 func (t Tag) Script() (Script, Confidence) { 422 if t.script != 0 { 423 return Script{t.script}, Exact 424 } 425 sc, c := scriptID(_Zzzz), No 426 if t.lang < langNoIndexOffset { 427 if scr := scriptID(suppressScript[t.lang]); scr != 0 { 428 // Note: it is not always the case that a language with a suppress 429 // script value is only written in one script (e.g. kk, ms, pa). 430 if t.region == 0 { 431 return Script{scriptID(scr)}, High 432 } 433 sc, c = scr, High 434 } 435 } 436 if tag, err := addTags(t); err == nil { 437 if tag.script != sc { 438 sc, c = tag.script, Low 439 } 440 } else { 441 t, _ = (Deprecated | Macro).Canonicalize(t) 442 if tag, err := addTags(t); err == nil && tag.script != sc { 443 sc, c = tag.script, Low 444 } 445 } 446 return Script{sc}, c 447 } 448 449 // Region returns the region for the language tag. If it was not explicitly given, it will 450 // infer a most likely candidate from the context. 451 // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change. 452 func (t Tag) Region() (Region, Confidence) { 453 if t.region != 0 { 454 return Region{t.region}, Exact 455 } 456 if t, err := addTags(t); err == nil { 457 return Region{t.region}, Low // TODO: differentiate between high and low. 458 } 459 t, _ = (Deprecated | Macro).Canonicalize(t) 460 if tag, err := addTags(t); err == nil { 461 return Region{tag.region}, Low 462 } 463 return Region{_ZZ}, No // TODO: return world instead of undetermined? 464 } 465 466 // Variant returns the variants specified explicitly for this language tag. 
467 // or nil if no variant was specified. 468 func (t Tag) Variants() []Variant { 469 v := []Variant{} 470 if int(t.pVariant) < int(t.pExt) { 471 for x, str := "", t.str[t.pVariant:t.pExt]; str != ""; { 472 x, str = nextToken(str) 473 v = append(v, Variant{x}) 474 } 475 } 476 return v 477 } 478 479 // Parent returns the CLDR parent of t. In CLDR, missing fields in data for a 480 // specific language are substituted with fields from the parent language. 481 // The parent for a language may change for newer versions of CLDR. 482 func (t Tag) Parent() Tag { 483 if t.str != "" { 484 // Strip the variants and extensions. 485 t, _ = Raw.Compose(t.Raw()) 486 if t.region == 0 && t.script != 0 && t.lang != 0 { 487 base, _ := addTags(Tag{lang: t.lang}) 488 if base.script == t.script { 489 return Tag{lang: t.lang} 490 } 491 } 492 return t 493 } 494 if t.lang != 0 { 495 if t.region != 0 { 496 maxScript := t.script 497 if maxScript == 0 { 498 max, _ := addTags(t) 499 maxScript = max.script 500 } 501 502 for i := range parents { 503 if langID(parents[i].lang) == t.lang && scriptID(parents[i].maxScript) == maxScript { 504 for _, r := range parents[i].fromRegion { 505 if regionID(r) == t.region { 506 return Tag{ 507 lang: t.lang, 508 script: scriptID(parents[i].script), 509 region: regionID(parents[i].toRegion), 510 } 511 } 512 } 513 } 514 } 515 516 // Strip the script if it is the default one. 517 base, _ := addTags(Tag{lang: t.lang}) 518 if base.script != maxScript { 519 return Tag{lang: t.lang, script: maxScript} 520 } 521 return Tag{lang: t.lang} 522 } else if t.script != 0 { 523 // The parent for an base-script pair with a non-default script is 524 // "und" instead of the base language. 525 base, _ := addTags(Tag{lang: t.lang}) 526 if base.script != t.script { 527 return und 528 } 529 return Tag{lang: t.lang} 530 } 531 } 532 return und 533 } 534 535 // returns token t and the rest of the string. 
536 func nextToken(s string) (t, tail string) { 537 p := strings.Index(s[1:], "-") 538 if p == -1 { 539 return s[1:], "" 540 } 541 p++ 542 return s[1:p], s[p:] 543 } 544 545 // Extension is a single BCP 47 extension. 546 type Extension struct { 547 s string 548 } 549 550 // String returns the string representation of the extension, including the 551 // type tag. 552 func (e Extension) String() string { 553 return e.s 554 } 555 556 // ParseExtension parses s as an extension and returns it on success. 557 func ParseExtension(s string) (e Extension, err error) { 558 scan := makeScannerString(s) 559 var end int 560 if n := len(scan.token); n != 1 { 561 return Extension{}, errSyntax 562 } 563 scan.toLower(0, len(scan.b)) 564 end = parseExtension(&scan) 565 if end != len(s) { 566 return Extension{}, errSyntax 567 } 568 return Extension{string(scan.b)}, nil 569 } 570 571 // Type returns the one-byte extension type of e. It returns 0 for the zero 572 // exception. 573 func (e Extension) Type() byte { 574 if e.s == "" { 575 return 0 576 } 577 return e.s[0] 578 } 579 580 // Tokens returns the list of tokens of e. 581 func (e Extension) Tokens() []string { 582 return strings.Split(e.s, "-") 583 } 584 585 // Extension returns the extension of type x for tag t. It will return 586 // false for ok if t does not have the requested extension. The returned 587 // extension will be invalid in this case. 588 func (t Tag) Extension(x byte) (ext Extension, ok bool) { 589 for i := int(t.pExt); i < len(t.str)-1; { 590 var ext string 591 i, ext = getExtension(t.str, i) 592 if ext[0] == x { 593 return Extension{ext}, true 594 } 595 } 596 return Extension{string(x)}, false 597 } 598 599 // Extensions returns all extensions of t. 
// The result is a non-nil, possibly empty, slice. Extensions are read from
// t.str starting at offset pExt.
func (t Tag) Extensions() []Extension {
	e := []Extension{}
	for i := int(t.pExt); i < len(t.str)-1; {
		var ext string
		i, ext = getExtension(t.str, i)
		e = append(e, Extension{ext})
	}
	return e
}

// TypeForKey returns the type associated with the given key, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// TypeForKey will traverse the inheritance chain to get the correct value.
// It returns the empty string if the key is not present.
//
// NOTE(review): the implementation visible here only inspects the -u
// extension of t itself; no inheritance traversal happens in this function.
func (t Tag) TypeForKey(key string) string {
	if start, end, _ := t.findTypeForKey(key); end != start {
		return t.str[start:end]
	}
	return ""
}

// Errors returned by SetTypeForKey.
var (
	errPrivateUse       = errors.New("cannot set a key on a private use tag")
	errInvalidArguments = errors.New("invalid key or type")
)

// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
//
// It returns errPrivateUse for a tag that consists solely of private use
// subtags, errInvalidArguments if key is not 2 bytes long or a non-empty value
// is not 3 to 8 bytes long, and the scanner's error if the generated key-type
// pair does not parse as an extension. On error the original tag is returned.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
	if t.private() {
		return t, errPrivateUse
	}
	if len(key) != 2 {
		return t, errInvalidArguments
	}

	// Remove the setting if value is "".
	if value == "" {
		start, end, _ := t.findTypeForKey(key)
		if start != end {
			// Remove key tag and leading '-'.
			start -= 4

			// Remove a possible empty extension.
			// That is the case when the removed pair is followed by the end
			// of the string or by another extension, and is preceded
			// directly by the "-u" singleton.
			if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
				start -= 2
			}
			if start == int(t.pVariant) && end == len(t.str) {
				// Nothing but the core subtags is left, so the string
				// representation is no longer needed.
				t.str = ""
				t.pVariant, t.pExt = 0, 0
			} else {
				t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
			}
		}
		return t, nil
	}

	if len(value) < 3 || len(value) > 8 {
		return t, errInvalidArguments
	}

	var (
		buf    [maxCoreSize + maxSimpleUExtensionSize]byte
		uStart int // start of the -u extension.
	)

	// Generate the tag string if needed.
	if t.str == "" {
		uStart = t.genCoreBytes(buf[:])
		buf[uStart] = '-'
		uStart++
	}

	// Create new key-type pair and parse it to verify.
	// b has the form "u-<key>-<value>".
	b := buf[uStart:]
	copy(b, "u-")
	copy(b[2:], key)
	b[4] = '-'
	b = b[:5+copy(b[5:], value)]
	scan := makeScanner(b)
	if parseExtensions(&scan); scan.err != nil {
		return t, scan.err
	}

	// Assemble the replacement string.
	if t.str == "" {
		// The tag had no variants or extensions: the -u extension follows the
		// core directly, so both offsets point at its leading '-'.
		t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
		t.str = string(buf[:uStart+len(b)])
	} else {
		s := t.str
		start, end, hasExt := t.findTypeForKey(key)
		if start == end {
			// Key not present: insert the pair at the returned position,
			// omitting the "u-" prefix when a -u extension already exists.
			if hasExt {
				b = b[2:]
			}
			t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
		} else {
			// Key present: replace only its type.
			t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
		}
	}
	return t, nil
}

// findTypeForKey returns the start and end position for the type corresponding
// to key or the point at which to insert the key-value pair if the type
// wasn't found. The hasExt return value reports whether an -u extension was present.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
	p := int(t.pExt)
	// Nothing to search for an invalid key, a tag without extensions
	// (pExt == len(str)), or pExt == 0.
	if len(key) != 2 || p == len(t.str) || p == 0 {
		return p, p, false
	}
	s := t.str

	// Find the correct extension.
	for p++; s[p] != 'u'; p++ {
		// A singleton greater than 'u' means there is no -u extension; the
		// insertion point is the hyphen before this singleton.
		if s[p] > 'u' {
			p--
			return p, p, false
		}
		if p = nextExtension(s, p); p == len(s) {
			return len(s), len(s), false
		}
	}
	// Proceed to the hyphen following the extension name.
	p++

	// curKey is the key currently being processed.
	curKey := ""

	// Iterate over keys until we get the end of a section.
	for {
		// p points to the hyphen preceding the current token.
		// A hyphen three positions further means the token is 2 bytes long,
		// i.e. a key.
		if p3 := p + 3; s[p3] == '-' {
			// Found a key.
			// Check whether we just processed the key that was requested.
			if curKey == key {
				return start, p, true
			}
			// Set to the next key and continue scanning type tokens.
			curKey = s[p+1 : p3]
			// Stop at the first key greater than the requested one; p is the
			// insertion point.
			if curKey > key {
				return p, p, true
			}
			// Start of the type token sequence.
			start = p + 4
			// A type is at least 3 characters long.
			p += 7 // 4 + 3
		} else {
			// Attribute or type, which is at least 3 characters long.
			p += 4
		}
		// p points past the third character of a type or attribute.
		max := p + 5 // maximum length of token plus hyphen.
		if len(s) < max {
			max = len(s)
		}
		// Advance p to the next hyphen or the end of the string.
		for ; p < max && s[p] != '-'; p++ {
		}
		// Bail if we have exhausted all tokens or if the next token starts
		// a new extension.
		if p == len(s) || s[p+2] == '-' {
			if curKey == key {
				return start, p, true
			}
			return p, p, true
		}
	}
}

// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
// for which data exists in the text repository. The index will change over time
// and should not be stored in persistent storage. Extensions, except for the
// 'va' type of the 'u' extension, are ignored. It will return 0, false if no
// compact tag exists, where 0 is the index for the root language (Und).
func CompactIndex(t Tag) (index int, ok bool) {
	// TODO: perhaps give more frequent tags a lower index.
	// TODO: we could make the indexes stable. This will exclude some
	//       possibilities for optimization, so don't do this quite yet.
	b, s, r := t.Raw()
	if len(t.str) > 0 {
		if strings.HasPrefix(t.str, "x-") {
			// We have no entries for user-defined tags.
			return 0, false
		}
		if uint16(t.pVariant) != t.pExt {
			// There are no tags with variants and an u-va type.
			if t.TypeForKey("va") != "" {
				return 0, false
			}
			// Rebuild the tag from its core and variants, dropping all
			// extensions.
			t, _ = Raw.Compose(b, s, r, t.Variants())
		} else if _, ok := t.Extension('u'); ok {
			// Strip all but the 'va' entry.
			variant := t.TypeForKey("va")
			t, _ = Raw.Compose(b, s, r)
			t, _ = t.SetTypeForKey("va", variant)
		}
		if len(t.str) > 0 {
			// We have some variants.
			// Index 0 is not used for special tags, hence i + 1.
			for i, s := range specialTags {
				if s == t {
					return i + 1, true
				}
			}
			return 0, false
		}
	}
	// No variants specified: just compare core components.
	// The key has the form lllssrrr, where l, s, and r are nibbles for
	// respectively the langID, scriptID, and regionID.
	key := uint32(b.langID) << (8 + 12)
	key |= uint32(s.scriptID) << 12
	key |= uint32(r.regionID)
	x, ok := coreTags[key]
	return int(x), ok
}

// Base is an ISO 639 language code, used for encoding the base language
// of a language tag.
type Base struct {
	langID
}

// ParseBase parses a 2- or 3-letter ISO 639 code.
// It returns a ValueError if s is a well-formed but unknown language identifier
// or another error if another error occurred.
func ParseBase(s string) (Base, error) {
	if n := len(s); n < 2 || 3 < n {
		return Base{}, errSyntax
	}
	// Copy s into a stack buffer so it can be handed over as a byte slice.
	var buf [3]byte
	l, err := getLangID(buf[:copy(buf[:], s)])
	return Base{l}, err
}

// Script is a 4-letter ISO 15924 code for representing scripts.
// It is idiomatically represented in title case.
type Script struct {
	scriptID
}

// ParseScript parses a 4-letter ISO 15924 code.
845 // It returns a ValueError if s is a well-formed but unknown script identifier 846 // or another error if another error occurred. 847 func ParseScript(s string) (Script, error) { 848 if len(s) != 4 { 849 return Script{}, errSyntax 850 } 851 var buf [4]byte 852 sc, err := getScriptID(script, buf[:copy(buf[:], s)]) 853 return Script{sc}, err 854 } 855 856 // Region is an ISO 3166-1 or UN M.49 code for representing countries and regions. 857 type Region struct { 858 regionID 859 } 860 861 // EncodeM49 returns the Region for the given UN M.49 code. 862 // It returns an error if r is not a valid code. 863 func EncodeM49(r int) (Region, error) { 864 rid, err := getRegionM49(r) 865 return Region{rid}, err 866 } 867 868 // ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code. 869 // It returns a ValueError if s is a well-formed but unknown region identifier 870 // or another error if another error occurred. 871 func ParseRegion(s string) (Region, error) { 872 if n := len(s); n < 2 || 3 < n { 873 return Region{}, errSyntax 874 } 875 var buf [3]byte 876 r, err := getRegionID(buf[:copy(buf[:], s)]) 877 return Region{r}, err 878 } 879 880 // IsCountry returns whether this region is a country or autonomous area. This 881 // includes non-standard definitions from CLDR. 882 func (r Region) IsCountry() bool { 883 if r.regionID == 0 || r.IsGroup() || r.IsPrivateUse() && r.regionID != _XK { 884 return false 885 } 886 return true 887 } 888 889 // IsGroup returns whether this region defines a collection of regions. This 890 // includes non-standard definitions from CLDR. 891 func (r Region) IsGroup() bool { 892 if r.regionID == 0 { 893 return false 894 } 895 return int(regionInclusion[r.regionID]) < len(regionContainment) 896 } 897 898 // Contains returns whether Region c is contained by Region r. It returns true 899 // if c == r. 
900 func (r Region) Contains(c Region) bool { 901 return r.regionID.contains(c.regionID) 902 } 903 904 func (r regionID) contains(c regionID) bool { 905 if r == c { 906 return true 907 } 908 g := regionInclusion[r] 909 if g >= nRegionGroups { 910 return false 911 } 912 m := regionContainment[g] 913 914 d := regionInclusion[c] 915 b := regionInclusionBits[d] 916 917 // A contained country may belong to multiple disjoint groups. Matching any 918 // of these indicates containment. If the contained region is a group, it 919 // must strictly be a subset. 920 if d >= nRegionGroups { 921 return b&m != 0 922 } 923 return b&^m == 0 924 } 925 926 var errNoTLD = errors.New("language: region is not a valid ccTLD") 927 928 // TLD returns the country code top-level domain (ccTLD). UK is returned for GB. 929 // In all other cases it returns either the region itself or an error. 930 // 931 // This method may return an error for a region for which there exists a 932 // canonical form with a ccTLD. To get that ccTLD canonicalize r first. The 933 // region will already be canonicalized it was obtained from a Tag that was 934 // obtained using any of the default methods. 935 func (r Region) TLD() (Region, error) { 936 // See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the 937 // difference between ISO 3166-1 and IANA ccTLD. 938 if r.regionID == _GB { 939 r = Region{_UK} 940 } 941 if (r.typ() & ccTLD) == 0 { 942 return Region{}, errNoTLD 943 } 944 return r, nil 945 } 946 947 // Canonicalize returns the region or a possible replacement if the region is 948 // deprecated. It will not return a replacement for deprecated regions that 949 // are split into multiple regions. 950 func (r Region) Canonicalize() Region { 951 if cr := normRegion(r.regionID); cr != 0 { 952 return Region{cr} 953 } 954 return r 955 } 956 957 // Variant represents a registered variant of a language as defined by BCP 47. 
958 type Variant struct { 959 variant string 960 } 961 962 // ParseVariant parses and returns a Variant. An error is returned if s is not 963 // a valid variant. 964 func ParseVariant(s string) (Variant, error) { 965 s = strings.ToLower(s) 966 if _, ok := variantIndex[s]; ok { 967 return Variant{s}, nil 968 } 969 return Variant{}, mkErrInvalid([]byte(s)) 970 } 971 972 // String returns the string representation of the variant. 973 func (v Variant) String() string { 974 return v.variant 975 }