github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/golang/text/language/language.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run maketables.go gen_common.go -output tables.go
     6  //go:generate go run gen_index.go
     7  
     8  // Package language implements BCP 47 language tags and related functionality.
     9  //
    10  // The Tag type, which is used to represent languages, is agnostic to the
    11  // meaning of its subtags. Tags are not fully canonicalized to preserve
    12  // information that may be valuable in certain contexts. As a consequence, two
    13  // different tags may represent identical languages.
    14  //
    15  // Initializing language- or locale-specific components usually consists of
    16  // two steps. The first step is to select a display language based on the
    17  // preferred languages of the user and the languages supported by an application.
    18  // The second step is to create the language-specific services based on
    19  // this selection. Each is discussed in more detail below.
    20  //
    21  // Matching preferred against supported languages
    22  //
    23  // An application may support various languages. This list is typically limited
    24  // by the languages for which there exist translations of the user interface.
    25  // Similarly, a user may provide a list of preferred languages which is limited
    26  // by the languages understood by this user.
    27  // An application should use a Matcher to find the best supported language based
    28  // on the user's preferred list.
    29  // Matchers are aware of the intricacies of equivalence between languages.
    30  // The default Matcher implementation takes into account things such as
    31  // deprecated subtags, legacy tags, and mutual intelligibility between scripts
    32  // and languages.
    33  //
    34  // A Matcher for English, Australian English, Danish, and standard Mandarin can
    35  // be defined as follows:
    36  //
    37  //		var matcher = language.NewMatcher([]language.Tag{
    38  //			language.English,   // The first language is used as fallback.
    39  // 			language.MustParse("en-AU"),
    40  //			language.Danish,
    41  //			language.Chinese,
    42  //		})
    43  //
    44  // The following code selects the best match for someone speaking Spanish and
    45  // Norwegian:
    46  //
    47  // 		preferred := []language.Tag{ language.Spanish, language.Norwegian }
    48  //		tag, _, _ := matcher.Match(preferred...)
    49  //
    50  // In this case, the best match is Danish, as Danish is sufficiently a match to
    51  // Norwegian to not have to fall back to the default.
    52  // See ParseAcceptLanguage on how to handle the Accept-Language HTTP header.
    53  //
    54  // Selecting language-specific services
    55  //
    56  // One should always use the Tag returned by the Matcher to create an instance
    57  // of any of the language-specific services provided by the text repository.
    58  // This prevents the mixing of languages, such as having a different language for
    59  // messages and display names, as well as improper casing or sorting order for
    60  // the selected language.
    61  // Using the returned Tag also allows user-defined settings, such as collation
    62  // order or numbering system to be transparently passed as options.
    63  //
    64  // If you have language-specific data in your application, however, it will in
    65  // most cases suffice to use the index returned by the matcher to identify
    66  // the user language.
    67  // The following loop provides an alternative in case this is not sufficient:
    68  //
    69  // 		supported := map[language.Tag]data{
    70  //			language.English:            enData,
    71  // 			language.MustParse("en-AU"): enAUData,
    72  //			language.Danish:             daData,
    73  //			language.Chinese:            zhData,
    74  // 		}
    75  //		tag, _, _ := matcher.Match(preferred...)
    76  //		for ; tag != language.Und; tag = tag.Parent() {
    77  //			if v, ok := supported[tag]; ok {
    78  //				return v
    79  //			}
    80  //		}
    81  // 		return enData // should not reach here
    82  //
    83  // Repeatedly taking the Parent of the tag returned by Match will eventually
    84  // match one of the tags used to initialize the Matcher.
    85  //
    86  // Canonicalization
    87  //
    88  // By default, only legacy and deprecated tags are converted into their
    89  // canonical equivalent. All other information is preserved. This approach makes
    90  // the confidence scores more accurate and allows matchers to distinguish
    91  // between variants that are otherwise lost.
    92  //
    93  // As a consequence, two tags that should be treated as identical according to
    94  // BCP 47 or CLDR, like "en-Latn" and "en", will be represented differently. The
    95  // Matchers will handle such distinctions, though, and are aware of the
    96  // equivalence relations. The CanonType type can be used to alter the
    97  // canonicalization form.
    98  //
    99  // References
   100  //
   101  // BCP 47 - Tags for Identifying Languages
   102  // http://tools.ietf.org/html/bcp47
   103  package language // import "github.com/insionng/yougam/libraries/x/text/language"
   104  
   105  // TODO: Remove above NOTE after:
   106  // - verifying that tables are dropped correctly (most notably matcher tables).
   107  
   108  import (
   109  	"errors"
   110  	"fmt"
   111  	"strings"
   112  )
   113  
   114  const (
   115  	// maxCoreSize is the maximum size of a BCP 47 tag without variants and
   116  	// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
   117  	maxCoreSize = 12
   118  
   119  	// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
   120  	// is large enough to hold at least 99% of the BCP 47 tags.
   121  	max99thPercentileSize = 32
   122  
   123  	// maxSimpleUExtensionSize is the maximum size of a -u extension with one
   124  	// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
   125  	maxSimpleUExtensionSize = 14
   126  )
   127  
   128  // Tag represents a BCP 47 language tag. It is used to specify an instance of a
   129  // specific language or locale. All language tag values are guaranteed to be
   130  // well-formed.
        //
        // Tag values are compared with == elsewhere in this file (see CompactIndex),
        // so two Tags are equal only if every field, including str, is equal.
   131  type Tag struct {
        	// lang, region and script hold the IDs of the core subtags. A zero ID is
        	// treated as "not explicitly specified" by Base, Script and Region.
   132  	lang     langID
   133  	region   regionID
   134  	script   scriptID
   135  	pVariant byte   // offset in str, includes preceding '-'
   136  	pExt     uint16 // offset of first extension, includes preceding '-'
   137  
   138  	// str is the string representation of the Tag. It will only be used if the
   139  	// tag has variants or extensions.
        	// A non-empty str combined with pVariant == 0 marks a tag that consists
        	// solely of a private use subtag (see the private method).
   140  	str string
   141  }
   142  
   143  // Make is a convenience wrapper for Parse that omits the error.
   144  // In case of an error, a sensible default is returned.
   145  func Make(s string) Tag {
   146  	return Default.Make(s)
   147  }
   148  
   149  // Make is a convenience wrapper for c.Parse that omits the error.
   150  // In case of an error, a sensible default is returned.
   151  func (c CanonType) Make(s string) Tag {
   152  	t, _ := c.Parse(s)
   153  	return t
   154  }
   155  
   156  // Raw returns the raw base language, script and region, without making an
   157  // attempt to infer their values.
   158  func (t Tag) Raw() (b Base, s Script, r Region) {
   159  	return Base{t.lang}, Script{t.script}, Region{t.region}
   160  }
   161  
   162  // equalTags compares language, script and region subtags only.
   163  func (t Tag) equalTags(a Tag) bool {
   164  	return t.lang == a.lang && t.script == a.script && t.region == a.region
   165  }
   166  
   167  // IsRoot returns true if t is equal to language "und".
   168  func (t Tag) IsRoot() bool {
   169  	if int(t.pVariant) < len(t.str) {
   170  		return false
   171  	}
   172  	return t.equalTags(und)
   173  }
   174  
   175  // private reports whether the Tag consists solely of a private use tag.
   176  func (t Tag) private() bool {
   177  	return t.str != "" && t.pVariant == 0
   178  }
   179  
   180  // CanonType can be used to enable or disable various types of canonicalization.
        //
        // The values are bit flags: the first seven constants below each occupy one
        // bit (1 << iota) and the composite values (Deprecated, BCP47, All, Default)
        // are formed by OR-ing them together. Tag.canonicalize tests individual bits
        // with c&Flag != 0.
   181  type CanonType int
   182  
   183  const (
   184  	// Replace deprecated base languages with their preferred replacements.
   185  	DeprecatedBase CanonType = 1 << iota
   186  	// Replace deprecated scripts with their preferred replacements.
   187  	DeprecatedScript
   188  	// Replace deprecated regions with their preferred replacements.
   189  	DeprecatedRegion
   190  	// Remove redundant scripts.
   191  	SuppressScript
   192  	// Normalize legacy encodings. This includes legacy languages defined in
   193  	// CLDR as well as bibliographic codes defined in ISO-639.
   194  	Legacy
   195  	// Map the dominant language of a macro language group to the macro language
   196  	// subtag. For example cmn -> zh.
   197  	Macro
   198  	// The CLDR flag should be used if full compatibility with CLDR is required.
   199  	// There are a few cases where language.Tag may differ from CLDR. To follow all
   200  	// of CLDR's suggestions, use All|CLDR.
   201  	CLDR
   202  
   203  	// Raw can be used to Compose or Parse without Canonicalization.
   204  	Raw CanonType = 0
   205  
   206  	// Replace all deprecated tags with their preferred replacements.
   207  	Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion
   208  
   209  	// All canonicalizations recommended by BCP 47.
   210  	BCP47 = Deprecated | SuppressScript
   211  
   212  	// All canonicalizations.
        	// Note that the CLDR flag is not part of All; it has to be added
        	// explicitly (All|CLDR).
   213  	All = BCP47 | Legacy | Macro
   214  
   215  	// Default is the canonicalization used by Parse, Make and Compose. To
   216  	// preserve as much information as possible, canonicalizations that remove
   217  	// potentially valuable information are not included. The Matcher is
   218  	// designed to recognize similar tags that would be the same if
   219  	// they were canonicalized using All.
   220  	Default = Deprecated | Legacy
   221  
        	// canonLang is the set of flags that affect the base language; the
        	// language-normalization loop in canonicalize only runs when at least
        	// one of them is set.
   222  	canonLang = DeprecatedBase | Legacy | Macro
   223  
   224  	// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
   225  )
   226  
   227  // canonicalize returns the canonicalized equivalent of the tag and
   228  // whether there was any change.
        //
        // c is a bit set of CanonType flags. The steps are applied in a fixed order:
        // script suppression, base-language normalization, deprecated script, and
        // deprecated region. t is received by value, so the caller's Tag is left
        // untouched. t.str is NOT updated here; the exported Canonicalize calls
        // remakeString when the returned flag is true.
   229  func (t Tag) canonicalize(c CanonType) (Tag, bool) {
   230  	if c == Raw {
   231  		return t, false
   232  	}
   233  	changed := false
   234  	if c&SuppressScript != 0 {
        		// Drop the script when it matches the suppress-script table entry
        		// for the language.
        		// NOTE(review): if t.script is already 0 and the table entry is 0,
        		// this still reports changed = true — confirm that is harmless.
   235  		if t.lang < langNoIndexOffset && uint8(t.script) == suppressScript[t.lang] {
   236  			t.script = 0
   237  			changed = true
   238  		}
   239  	}
   240  	if c&canonLang != 0 {
        		// The loop body runs again only after a deprecated-base replacement
        		// (the continue below); every other path falls through to the break.
   241  		for {
   242  			if l, aliasType := normLang(t.lang); l != t.lang {
   243  				switch aliasType {
   244  				case langLegacy:
   245  					if c&Legacy != 0 {
        						// "sh" without a script gets an explicit Latn script
        						// before the language is replaced.
   246  						if t.lang == _sh && t.script == 0 {
   247  							t.script = _Latn
   248  						}
   249  						t.lang = l
   250  						changed = true
   251  					}
   252  				case langMacro:
   253  					if c&Macro != 0 {
   254  						// We deviate here from CLDR. The mapping "nb" -> "no"
   255  						// qualifies as a typical Macro language mapping.  However,
   256  						// for legacy reasons, CLDR maps "no", the macro language
   257  						// code for Norwegian, to the dominant variant "nb". This
   258  						// change is currently under consideration for CLDR as well.
   259  						// See http://unicode.org/cldr/trac/ticket/2698 and also
   260  						// http://unicode.org/cldr/trac/ticket/1790 for some of the
   261  						// practical implications. TODO: this check could be removed
   262  						// if CLDR adopts this change.
   263  						if c&CLDR == 0 || t.lang != _nb {
   264  							changed = true
   265  							t.lang = l
   266  						}
   267  					}
   268  				case langDeprecated:
   269  					if c&DeprecatedBase != 0 {
        						// "mo" without a region gets region MD before the
        						// language is replaced.
   270  						if t.lang == _mo && t.region == 0 {
   271  							t.region = _MD
   272  						}
   273  						t.lang = l
   274  						changed = true
   275  						// Other canonicalization types may still apply.
   276  						continue
   277  					}
   278  				}
        			// normLang left the language unchanged: with both Legacy and CLDR
        			// requested, "no" is rewritten to "nb".
   279  			} else if c&Legacy != 0 && t.lang == _no && c&CLDR != 0 {
   280  				t.lang = _nb
   281  				changed = true
   282  			}
   283  			break
   284  		}
   285  	}
   286  	if c&DeprecatedScript != 0 {
        		// The only script replacement handled here is Qaai -> Zinh.
   287  		if t.script == _Qaai {
   288  			changed = true
   289  			t.script = _Zinh
   290  		}
   291  	}
   292  	if c&DeprecatedRegion != 0 {
        		// A non-zero result from normRegion replaces the region; presumably 0
        		// means "no replacement" — confirm against normRegion.
   293  		if r := normRegion(t.region); r != 0 {
   294  			changed = true
   295  			t.region = r
   296  		}
   297  	}
   298  	return t, changed
   299  }
   300  
   301  // Canonicalize returns the canonicalized equivalent of the tag.
   302  func (c CanonType) Canonicalize(t Tag) (Tag, error) {
   303  	t, changed := t.canonicalize(c)
   304  	if changed {
   305  		t.remakeString()
   306  	}
   307  	return t, nil
   308  }
   309  
   310  // Confidence indicates the level of certainty for a given return value.
   311  // For example, Serbian may be written in Cyrillic or Latin script.
   312  // The confidence level indicates whether a value was explicitly specified,
   313  // whether it is typically the only possible value, or whether there is
   314  // an ambiguity.
   315  type Confidence int
   316  
   317  const (
   318  	No    Confidence = iota // full confidence that there was no match
   319  	Low                     // most likely value picked out of a set of alternatives
   320  	High                    // value is generally assumed to be the correct match
   321  	Exact                   // exact match or explicitly specified value
   322  )
   323  
   324  var confName = []string{"No", "Low", "High", "Exact"}
   325  
   326  func (c Confidence) String() string {
   327  	return confName[c]
   328  }
   329  
   330  // remakeString is used to update t.str in case lang, script or region changed.
   331  // It is assumed that pExt and pVariant still point to the start of the
   332  // respective parts.
   333  func (t *Tag) remakeString() {
   334  	if t.str == "" {
   335  		return
   336  	}
   337  	extra := t.str[t.pVariant:]
   338  	if t.pVariant > 0 {
   339  		extra = extra[1:]
   340  	}
   341  	if t.equalTags(und) && strings.HasPrefix(extra, "x-") {
   342  		t.str = extra
   343  		t.pVariant = 0
   344  		t.pExt = 0
   345  		return
   346  	}
   347  	var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
   348  	b := buf[:t.genCoreBytes(buf[:])]
   349  	if extra != "" {
   350  		diff := uint8(len(b)) - t.pVariant
   351  		b = append(b, '-')
   352  		b = append(b, extra...)
   353  		t.pVariant += diff
   354  		t.pExt += uint16(diff)
   355  	} else {
   356  		t.pVariant = uint8(len(b))
   357  		t.pExt = uint16(len(b))
   358  	}
   359  	t.str = string(b)
   360  }
   361  
   362  // genCoreBytes writes a string for the base languages, script and region tags
   363  // to the given buffer and returns the number of bytes written. It will never
   364  // write more than maxCoreSize bytes.
   365  func (t *Tag) genCoreBytes(buf []byte) int {
   366  	n := t.lang.stringToBuf(buf[:])
   367  	if t.script != 0 {
   368  		n += copy(buf[n:], "-")
   369  		n += copy(buf[n:], t.script.String())
   370  	}
   371  	if t.region != 0 {
   372  		n += copy(buf[n:], "-")
   373  		n += copy(buf[n:], t.region.String())
   374  	}
   375  	return n
   376  }
   377  
   378  // String returns the canonical string representation of the language tag.
   379  func (t Tag) String() string {
   380  	if t.str != "" {
   381  		return t.str
   382  	}
   383  	if t.script == 0 && t.region == 0 {
   384  		return t.lang.String()
   385  	}
   386  	buf := [maxCoreSize]byte{}
   387  	return string(buf[:t.genCoreBytes(buf[:])])
   388  }
   389  
   390  // Base returns the base language of the language tag. If the base language is
   391  // unspecified, an attempt will be made to infer it from the context.
   392  // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
   393  func (t Tag) Base() (Base, Confidence) {
   394  	if t.lang != 0 {
   395  		return Base{t.lang}, Exact
   396  	}
   397  	c := High
   398  	if t.script == 0 && !(Region{t.region}).IsCountry() {
   399  		c = Low
   400  	}
   401  	if tag, err := addTags(t); err == nil && tag.lang != 0 {
   402  		return Base{tag.lang}, c
   403  	}
   404  	return Base{0}, No
   405  }
   406  
   407  // Script infers the script for the language tag. If it was not explicitly given, it will infer
   408  // a most likely candidate.
   409  // If more than one script is commonly used for a language, the most likely one
   410  // is returned with a low confidence indication. For example, it returns (Cyrl, Low)
   411  // for Serbian.
   412  // If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
   413  // as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
   414  // common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
   415  // See http://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
   416  // unknown value in CLDR.  (Zzzz, Exact) is returned if Zzzz was explicitly specified.
   417  // Note that an inferred script is never guaranteed to be the correct one. Latin is
   418  // almost exclusively used for Afrikaans, but Arabic has been used for some texts
   419  // in the past.  Also, the script that is commonly used may change over time.
   420  // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
   421  func (t Tag) Script() (Script, Confidence) {
   422  	if t.script != 0 {
   423  		return Script{t.script}, Exact
   424  	}
   425  	sc, c := scriptID(_Zzzz), No
   426  	if t.lang < langNoIndexOffset {
   427  		if scr := scriptID(suppressScript[t.lang]); scr != 0 {
   428  			// Note: it is not always the case that a language with a suppress
   429  			// script value is only written in one script (e.g. kk, ms, pa).
   430  			if t.region == 0 {
   431  				return Script{scriptID(scr)}, High
   432  			}
   433  			sc, c = scr, High
   434  		}
   435  	}
   436  	if tag, err := addTags(t); err == nil {
   437  		if tag.script != sc {
   438  			sc, c = tag.script, Low
   439  		}
   440  	} else {
   441  		t, _ = (Deprecated | Macro).Canonicalize(t)
   442  		if tag, err := addTags(t); err == nil && tag.script != sc {
   443  			sc, c = tag.script, Low
   444  		}
   445  	}
   446  	return Script{sc}, c
   447  }
   448  
   449  // Region returns the region for the language tag. If it was not explicitly given, it will
   450  // infer a most likely candidate from the context.
   451  // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
   452  func (t Tag) Region() (Region, Confidence) {
   453  	if t.region != 0 {
   454  		return Region{t.region}, Exact
   455  	}
   456  	if t, err := addTags(t); err == nil {
   457  		return Region{t.region}, Low // TODO: differentiate between high and low.
   458  	}
   459  	t, _ = (Deprecated | Macro).Canonicalize(t)
   460  	if tag, err := addTags(t); err == nil {
   461  		return Region{tag.region}, Low
   462  	}
   463  	return Region{_ZZ}, No // TODO: return world instead of undetermined?
   464  }
   465  
   466  // Variant returns the variants specified explicitly for this language tag.
   467  // or nil if no variant was specified.
   468  func (t Tag) Variants() []Variant {
   469  	v := []Variant{}
   470  	if int(t.pVariant) < int(t.pExt) {
   471  		for x, str := "", t.str[t.pVariant:t.pExt]; str != ""; {
   472  			x, str = nextToken(str)
   473  			v = append(v, Variant{x})
   474  		}
   475  	}
   476  	return v
   477  }
   478  
   479  // Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
   480  // specific language are substituted with fields from the parent language.
   481  // The parent for a language may change for newer versions of CLDR.
   482  func (t Tag) Parent() Tag {
   483  	if t.str != "" {
   484  		// Strip the variants and extensions.
   485  		t, _ = Raw.Compose(t.Raw())
   486  		if t.region == 0 && t.script != 0 && t.lang != 0 {
   487  			base, _ := addTags(Tag{lang: t.lang})
   488  			if base.script == t.script {
   489  				return Tag{lang: t.lang}
   490  			}
   491  		}
   492  		return t
   493  	}
   494  	if t.lang != 0 {
   495  		if t.region != 0 {
   496  			maxScript := t.script
   497  			if maxScript == 0 {
   498  				max, _ := addTags(t)
   499  				maxScript = max.script
   500  			}
   501  
   502  			for i := range parents {
   503  				if langID(parents[i].lang) == t.lang && scriptID(parents[i].maxScript) == maxScript {
   504  					for _, r := range parents[i].fromRegion {
   505  						if regionID(r) == t.region {
   506  							return Tag{
   507  								lang:   t.lang,
   508  								script: scriptID(parents[i].script),
   509  								region: regionID(parents[i].toRegion),
   510  							}
   511  						}
   512  					}
   513  				}
   514  			}
   515  
   516  			// Strip the script if it is the default one.
   517  			base, _ := addTags(Tag{lang: t.lang})
   518  			if base.script != maxScript {
   519  				return Tag{lang: t.lang, script: maxScript}
   520  			}
   521  			return Tag{lang: t.lang}
   522  		} else if t.script != 0 {
   523  			// The parent for an base-script pair with a non-default script is
   524  			// "und" instead of the base language.
   525  			base, _ := addTags(Tag{lang: t.lang})
   526  			if base.script != t.script {
   527  				return und
   528  			}
   529  			return Tag{lang: t.lang}
   530  		}
   531  	}
   532  	return und
   533  }
   534  
   535  // returns token t and the rest of the string.
   536  func nextToken(s string) (t, tail string) {
   537  	p := strings.Index(s[1:], "-")
   538  	if p == -1 {
   539  		return s[1:], ""
   540  	}
   541  	p++
   542  	return s[1:p], s[p:]
   543  }
   544  
   545  // Extension is a single BCP 47 extension.
   546  type Extension struct {
   547  	s string
   548  }
   549  
   550  // String returns the string representation of the extension, including the
   551  // type tag.
   552  func (e Extension) String() string {
   553  	return e.s
   554  }
   555  
   556  // ParseExtension parses s as an extension and returns it on success.
   557  func ParseExtension(s string) (e Extension, err error) {
   558  	scan := makeScannerString(s)
   559  	var end int
   560  	if n := len(scan.token); n != 1 {
   561  		return Extension{}, errSyntax
   562  	}
   563  	scan.toLower(0, len(scan.b))
   564  	end = parseExtension(&scan)
   565  	if end != len(s) {
   566  		return Extension{}, errSyntax
   567  	}
   568  	return Extension{string(scan.b)}, nil
   569  }
   570  
   571  // Type returns the one-byte extension type of e. It returns 0 for the zero
   572  // exception.
   573  func (e Extension) Type() byte {
   574  	if e.s == "" {
   575  		return 0
   576  	}
   577  	return e.s[0]
   578  }
   579  
   580  // Tokens returns the list of tokens of e.
   581  func (e Extension) Tokens() []string {
   582  	return strings.Split(e.s, "-")
   583  }
   584  
   585  // Extension returns the extension of type x for tag t. It will return
   586  // false for ok if t does not have the requested extension. The returned
   587  // extension will be invalid in this case.
   588  func (t Tag) Extension(x byte) (ext Extension, ok bool) {
   589  	for i := int(t.pExt); i < len(t.str)-1; {
   590  		var ext string
   591  		i, ext = getExtension(t.str, i)
   592  		if ext[0] == x {
   593  			return Extension{ext}, true
   594  		}
   595  	}
   596  	return Extension{string(x)}, false
   597  }
   598  
   599  // Extensions returns all extensions of t.
   600  func (t Tag) Extensions() []Extension {
   601  	e := []Extension{}
   602  	for i := int(t.pExt); i < len(t.str)-1; {
   603  		var ext string
   604  		i, ext = getExtension(t.str, i)
   605  		e = append(e, Extension{ext})
   606  	}
   607  	return e
   608  }
   609  
   610  // TypeForKey returns the type associated with the given key, where key and type
   611  // are of the allowed values defined for the Unicode locale extension ('u') in
   612  // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
   613  // TypeForKey will traverse the inheritance chain to get the correct value.
   614  func (t Tag) TypeForKey(key string) string {
   615  	if start, end, _ := t.findTypeForKey(key); end != start {
   616  		return t.str[start:end]
   617  	}
   618  	return ""
   619  }
   620  
   621  var (
   622  	errPrivateUse       = errors.New("cannot set a key on a private use tag")
   623  	errInvalidArguments = errors.New("invalid key or type")
   624  )
   625  
   626  // SetTypeForKey returns a new Tag with the key set to type, where key and type
   627  // are of the allowed values defined for the Unicode locale extension ('u') in
   628  // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
   629  // An empty value removes an existing pair with the same key.
        //
        // Errors:
        //   - errPrivateUse if t consists solely of a private use tag;
        //   - errInvalidArguments if key is not exactly 2 bytes long, or if value is
        //     non-empty and not between 3 and 8 bytes long;
        //   - the scanner's error if the generated "u-<key>-<value>" extension does
        //     not parse.
        //
        // In every error case the receiver t is returned unmodified.
   630  func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
   631  	if t.private() {
   632  		return t, errPrivateUse
   633  	}
   634  	if len(key) != 2 {
   635  		return t, errInvalidArguments
   636  	}
   637  
   638  	// Remove the setting if value is "".
   639  	if value == "" {
   640  		start, end, _ := t.findTypeForKey(key)
        		// start == end means the key is not present: nothing to remove.
   641  		if start != end {
   642  			// Remove key tag and leading '-'.
   643  			start -= 4
   644  
   645  			// Remove a possible empty extension.
   646  			if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
   647  				start -= 2
   648  			}
        			// If what is removed is everything after the core subtags, the
        			// tag no longer needs a stored string at all.
   649  			if start == int(t.pVariant) && end == len(t.str) {
   650  				t.str = ""
   651  				t.pVariant, t.pExt = 0, 0
   652  			} else {
   653  				t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
   654  			}
   655  		}
   656  		return t, nil
   657  	}
   658  
   659  	if len(value) < 3 || len(value) > 8 {
   660  		return t, errInvalidArguments
   661  	}
   662  
   663  	var (
        		// buf is sized for the largest core tag plus one -u key-type pair.
   664  		buf    [maxCoreSize + maxSimpleUExtensionSize]byte
   665  		uStart int // start of the -u extension.
   666  	)
   667  
   668  	// Generate the tag string if needed.
   669  	if t.str == "" {
   670  		uStart = t.genCoreBytes(buf[:])
   671  		buf[uStart] = '-'
   672  		uStart++
   673  	}
   674  
   675  	// Create new key-type pair and parse it to verify.
   676  	b := buf[uStart:]
   677  	copy(b, "u-")
   678  	copy(b[2:], key)
   679  	b[4] = '-'
   680  	b = b[:5+copy(b[5:], value)]
   681  	scan := makeScanner(b)
   682  	if parseExtensions(&scan); scan.err != nil {
   683  		return t, scan.err
   684  	}
   685  
   686  	// Assemble the replacement string.
   687  	if t.str == "" {
        		// No stored string yet: buf holds "<core>-u-<key>-<value>" and both
        		// offsets point at the '-' that precedes the new extension.
   688  		t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
   689  		t.str = string(buf[:uStart+len(b)])
   690  	} else {
   691  		s := t.str
   692  		start, end, hasExt := t.findTypeForKey(key)
   693  		if start == end {
        			// Key not present: insert at the reported position. When a -u
        			// extension already exists, drop the "u-" prefix from b so only
        			// "<key>-<value>" is inserted.
   694  			if hasExt {
   695  				b = b[2:]
   696  			}
   697  			t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
   698  		} else {
        			// Key present: replace just its type with value.
   699  			t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
   700  		}
   701  	}
   702  	return t, nil
   703  }
   704  
   705  // findTypeForKey returns the start and end position for the type corresponding
   706  // to key or the point at which to insert the key-value pair if the type
   707  // wasn't found. The hasExt return value reports whether an -u extension was present.
   708  // Note: the extensions are typically very small and are likely to contain
   709  // only one key-type pair.
        //
        // start == end signals "not found"; in that case the common value is the
        // insertion point. The positions are byte offsets into t.str.
   710  func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
   711  	p := int(t.pExt)
        	// Nothing to search if the key has the wrong size or the tag has no
        	// extensions (pExt is 0 or points at the end of str).
   712  	if len(key) != 2 || p == len(t.str) || p == 0 {
   713  		return p, p, false
   714  	}
   715  	s := t.str
   716  
   717  	// Find the correct extension.
        	// p is advanced past the '-' so that s[p] is an extension's type byte.
        	// NOTE(review): the early exit below presumably relies on extensions
        	// being stored sorted by type byte — confirm against the parser.
   718  	for p++; s[p] != 'u'; p++ {
   719  		if s[p] > 'u' {
        			// An extension sorting after 'u': back up to its leading '-'
        			// and report that as the insertion point.
   720  			p--
   721  			return p, p, false
   722  		}
   723  		if p = nextExtension(s, p); p == len(s) {
   724  			return len(s), len(s), false
   725  		}
   726  	}
   727  	// Proceed to the hyphen following the extension name.
   728  	p++
   729  
   730  	// curKey is the key currently being processed.
   731  	curKey := ""
   732  
   733  	// Iterate over keys until we get the end of a section.
   734  	for {
   735  		// p points to the hyphen preceding the current token.
        		// A '-' three bytes further on means the current token is exactly
        		// two bytes long, which identifies it as a key.
   736  		if p3 := p + 3; s[p3] == '-' {
   737  			// Found a key.
   738  			// Check whether we just processed the key that was requested.
   739  			if curKey == key {
   740  				return start, p, true
   741  			}
   742  			// Set to the next key and continue scanning type tokens.
   743  			curKey = s[p+1 : p3]
        			// The string comparison stops the scan once a key greater than
        			// the requested one is seen; p is then the insertion point.
   744  			if curKey > key {
   745  				return p, p, true
   746  			}
   747  			// Start of the type token sequence.
   748  			start = p + 4
   749  			// A type is at least 3 characters long.
   750  			p += 7 // 4 + 3
   751  		} else {
   752  			// Attribute or type, which is at least 3 characters long.
   753  			p += 4
   754  		}
   755  		// p points past the third character of a type or attribute.
   756  		max := p + 5 // maximum length of token plus hyphen.
   757  		if len(s) < max {
   758  			max = len(s)
   759  		}
        		// Advance p to the next '-' or to max, whichever comes first.
   760  		for ; p < max && s[p] != '-'; p++ {
   761  		}
   762  		// Bail if we have exhausted all tokens or if the next token starts
   763  		// a new extension.
   764  		if p == len(s) || s[p+2] == '-' {
   765  			if curKey == key {
   766  				return start, p, true
   767  			}
   768  			return p, p, true
   769  		}
   770  	}
   771  }
   772  
   773  // CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
   774  // for which data exists in the text repository. The index will change over time
   775  // and should not be stored in persistent storage. Extensions, except for the
   776  // 'va' type of the 'u' extension, are ignored. It will return 0, false if no
   777  // compact tag exists, where 0 is the index for the root language (Und).
   778  func CompactIndex(t Tag) (index int, ok bool) {
   779  	// TODO: perhaps give more frequent tags a lower index.
   780  	// TODO: we could make the indexes stable. This will excluded some
   781  	//       possibilities for optimization, so don't do this quite yet.
   782  	b, s, r := t.Raw()
   783  	if len(t.str) > 0 {
   784  		if strings.HasPrefix(t.str, "x-") {
   785  			// We have no entries for user-defined tags.
   786  			return 0, false
   787  		}
   788  		if uint16(t.pVariant) != t.pExt {
   789  			// There are no tags with variants and an u-va type.
   790  			if t.TypeForKey("va") != "" {
   791  				return 0, false
   792  			}
   793  			t, _ = Raw.Compose(b, s, r, t.Variants())
   794  		} else if _, ok := t.Extension('u'); ok {
   795  			// Strip all but the 'va' entry.
   796  			variant := t.TypeForKey("va")
   797  			t, _ = Raw.Compose(b, s, r)
   798  			t, _ = t.SetTypeForKey("va", variant)
   799  		}
   800  		if len(t.str) > 0 {
   801  			// We have some variants.
   802  			for i, s := range specialTags {
   803  				if s == t {
   804  					return i + 1, true
   805  				}
   806  			}
   807  			return 0, false
   808  		}
   809  	}
   810  	// No variants specified: just compare core components.
   811  	// The key has the form lllssrrr, where l, s, and r are nibbles for
   812  	// respectively the langID, scriptID, and regionID.
   813  	key := uint32(b.langID) << (8 + 12)
   814  	key |= uint32(s.scriptID) << 12
   815  	key |= uint32(r.regionID)
   816  	x, ok := coreTags[key]
   817  	return int(x), ok
   818  }
   819  
// Base is an ISO 639 language code, used for encoding the base language
// of a language tag.
//
// It wraps the package-internal langID by embedding it.
type Base struct {
	// langID is the internal identifier for the language.
	langID
}
   825  
   826  // ParseBase parses a 2- or 3-letter ISO 639 code.
   827  // It returns a ValueError if s is a well-formed but unknown language identifier
   828  // or another error if another error occurred.
   829  func ParseBase(s string) (Base, error) {
   830  	if n := len(s); n < 2 || 3 < n {
   831  		return Base{}, errSyntax
   832  	}
   833  	var buf [3]byte
   834  	l, err := getLangID(buf[:copy(buf[:], s)])
   835  	return Base{l}, err
   836  }
   837  
// Script is a 4-letter ISO 15924 code for representing scripts.
// It is idiomatically represented in title case.
//
// It wraps the package-internal scriptID by embedding it.
type Script struct {
	// scriptID is the internal identifier for the script.
	scriptID
}
   843  
   844  // ParseScript parses a 4-letter ISO 15924 code.
   845  // It returns a ValueError if s is a well-formed but unknown script identifier
   846  // or another error if another error occurred.
   847  func ParseScript(s string) (Script, error) {
   848  	if len(s) != 4 {
   849  		return Script{}, errSyntax
   850  	}
   851  	var buf [4]byte
   852  	sc, err := getScriptID(script, buf[:copy(buf[:], s)])
   853  	return Script{sc}, err
   854  }
   855  
// Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
//
// It wraps the package-internal regionID by embedding it.
type Region struct {
	// regionID is the internal identifier for the region.
	regionID
}
   860  
   861  // EncodeM49 returns the Region for the given UN M.49 code.
   862  // It returns an error if r is not a valid code.
   863  func EncodeM49(r int) (Region, error) {
   864  	rid, err := getRegionM49(r)
   865  	return Region{rid}, err
   866  }
   867  
   868  // ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
   869  // It returns a ValueError if s is a well-formed but unknown region identifier
   870  // or another error if another error occurred.
   871  func ParseRegion(s string) (Region, error) {
   872  	if n := len(s); n < 2 || 3 < n {
   873  		return Region{}, errSyntax
   874  	}
   875  	var buf [3]byte
   876  	r, err := getRegionID(buf[:copy(buf[:], s)])
   877  	return Region{r}, err
   878  }
   879  
   880  // IsCountry returns whether this region is a country or autonomous area. This
   881  // includes non-standard definitions from CLDR.
   882  func (r Region) IsCountry() bool {
   883  	if r.regionID == 0 || r.IsGroup() || r.IsPrivateUse() && r.regionID != _XK {
   884  		return false
   885  	}
   886  	return true
   887  }
   888  
   889  // IsGroup returns whether this region defines a collection of regions. This
   890  // includes non-standard definitions from CLDR.
   891  func (r Region) IsGroup() bool {
   892  	if r.regionID == 0 {
   893  		return false
   894  	}
   895  	return int(regionInclusion[r.regionID]) < len(regionContainment)
   896  }
   897  
   898  // Contains returns whether Region c is contained by Region r. It returns true
   899  // if c == r.
   900  func (r Region) Contains(c Region) bool {
   901  	return r.regionID.contains(c.regionID)
   902  }
   903  
   904  func (r regionID) contains(c regionID) bool {
   905  	if r == c {
   906  		return true
   907  	}
   908  	g := regionInclusion[r]
   909  	if g >= nRegionGroups {
   910  		return false
   911  	}
   912  	m := regionContainment[g]
   913  
   914  	d := regionInclusion[c]
   915  	b := regionInclusionBits[d]
   916  
   917  	// A contained country may belong to multiple disjoint groups. Matching any
   918  	// of these indicates containment. If the contained region is a group, it
   919  	// must strictly be a subset.
   920  	if d >= nRegionGroups {
   921  		return b&m != 0
   922  	}
   923  	return b&^m == 0
   924  }
   925  
// errNoTLD is the error Region.TLD returns for a region that is not marked as
// a country code top-level domain.
var errNoTLD = errors.New("language: region is not a valid ccTLD")
   927  
   928  // TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
   929  // In all other cases it returns either the region itself or an error.
   930  //
   931  // This method may return an error for a region for which there exists a
   932  // canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
   933  // region will already be canonicalized it was obtained from a Tag that was
   934  // obtained using any of the default methods.
   935  func (r Region) TLD() (Region, error) {
   936  	// See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
   937  	// difference between ISO 3166-1 and IANA ccTLD.
   938  	if r.regionID == _GB {
   939  		r = Region{_UK}
   940  	}
   941  	if (r.typ() & ccTLD) == 0 {
   942  		return Region{}, errNoTLD
   943  	}
   944  	return r, nil
   945  }
   946  
   947  // Canonicalize returns the region or a possible replacement if the region is
   948  // deprecated. It will not return a replacement for deprecated regions that
   949  // are split into multiple regions.
   950  func (r Region) Canonicalize() Region {
   951  	if cr := normRegion(r.regionID); cr != 0 {
   952  		return Region{cr}
   953  	}
   954  	return r
   955  }
   956  
// Variant represents a registered variant of a language as defined by BCP 47.
type Variant struct {
	// variant is the variant subtag as a string; it is what String returns.
	variant string
}
   961  
   962  // ParseVariant parses and returns a Variant. An error is returned if s is not
   963  // a valid variant.
   964  func ParseVariant(s string) (Variant, error) {
   965  	s = strings.ToLower(s)
   966  	if _, ok := variantIndex[s]; ok {
   967  		return Variant{s}, nil
   968  	}
   969  	return Variant{}, mkErrInvalid([]byte(s))
   970  }
   971  
// String returns the string representation of the variant, which is the
// stored variant subtag. The zero Variant yields the empty string.
func (v Variant) String() string {
	return v.variant
}