github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/language/lookup.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package language
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"sort"
    11  	"strconv"
    12  
    13  	"golang.org/x/text/internal/tag"
    14  )
    15  
    16  // findIndex tries to find the given tag in idx and returns a standardized error
    17  // if it could not be found.
    18  func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
    19  	if !tag.FixCase(form, key) {
    20  		return 0, errSyntax
    21  	}
    22  	i := idx.Index(key)
    23  	if i == -1 {
    24  		return 0, mkErrInvalid(key)
    25  	}
    26  	return i, nil
    27  }
    28  
    29  func searchUint(imap []uint16, key uint16) int {
    30  	return sort.Search(len(imap), func(i int) bool {
    31  		return imap[i] >= key
    32  	})
    33  }
    34  
    35  type langID uint16
    36  
    37  // getLangID returns the langID of s if s is a canonical subtag
    38  // or langUnknown if s is not a canonical subtag.
    39  func getLangID(s []byte) (langID, error) {
    40  	if len(s) == 2 {
    41  		return getLangISO2(s)
    42  	}
    43  	return getLangISO3(s)
    44  }
    45  
    46  // mapLang returns the mapped langID of id according to mapping m.
    47  func normLang(id langID) (langID, langAliasType) {
    48  	k := sort.Search(len(langAliasMap), func(i int) bool {
    49  		return langAliasMap[i].from >= uint16(id)
    50  	})
    51  	if k < len(langAliasMap) && langAliasMap[k].from == uint16(id) {
    52  		return langID(langAliasMap[k].to), langAliasTypes[k]
    53  	}
    54  	return id, langAliasTypeUnknown
    55  }
    56  
    57  // getLangISO2 returns the langID for the given 2-letter ISO language code
    58  // or unknownLang if this does not exist.
    59  func getLangISO2(s []byte) (langID, error) {
    60  	if !tag.FixCase("zz", s) {
    61  		return 0, errSyntax
    62  	}
    63  	if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
    64  		return langID(i), nil
    65  	}
    66  	return 0, mkErrInvalid(s)
    67  }
    68  
    69  const base = 'z' - 'a' + 1
    70  
    71  func strToInt(s []byte) uint {
    72  	v := uint(0)
    73  	for i := 0; i < len(s); i++ {
    74  		v *= base
    75  		v += uint(s[i] - 'a')
    76  	}
    77  	return v
    78  }
    79  
    80  // converts the given integer to the original ASCII string passed to strToInt.
    81  // len(s) must match the number of characters obtained.
    82  func intToStr(v uint, s []byte) {
    83  	for i := len(s) - 1; i >= 0; i-- {
    84  		s[i] = byte(v%base) + 'a'
    85  		v /= base
    86  	}
    87  }
    88  
    89  // getLangISO3 returns the langID for the given 3-letter ISO language code
    90  // or unknownLang if this does not exist.
    91  func getLangISO3(s []byte) (langID, error) {
    92  	if tag.FixCase("und", s) {
    93  		// first try to match canonical 3-letter entries
    94  		for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
    95  			if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {
    96  				// We treat "und" as special and always translate it to "unspecified".
    97  				// Note that ZZ and Zzzz are private use and are not treated as
    98  				// unspecified by default.
    99  				id := langID(i)
   100  				if id == nonCanonicalUnd {
   101  					return 0, nil
   102  				}
   103  				return id, nil
   104  			}
   105  		}
   106  		if i := altLangISO3.Index(s); i != -1 {
   107  			return langID(altLangIndex[altLangISO3.Elem(i)[3]]), nil
   108  		}
   109  		n := strToInt(s)
   110  		if langNoIndex[n/8]&(1<<(n%8)) != 0 {
   111  			return langID(n) + langNoIndexOffset, nil
   112  		}
   113  		// Check for non-canonical uses of ISO3.
   114  		for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
   115  			if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
   116  				return langID(i), nil
   117  			}
   118  		}
   119  		return 0, mkErrInvalid(s)
   120  	}
   121  	return 0, errSyntax
   122  }
   123  
   124  // stringToBuf writes the string to b and returns the number of bytes
   125  // written.  cap(b) must be >= 3.
   126  func (id langID) stringToBuf(b []byte) int {
   127  	if id >= langNoIndexOffset {
   128  		intToStr(uint(id)-langNoIndexOffset, b[:3])
   129  		return 3
   130  	} else if id == 0 {
   131  		return copy(b, "und")
   132  	}
   133  	l := lang[id<<2:]
   134  	if l[3] == 0 {
   135  		return copy(b, l[:3])
   136  	}
   137  	return copy(b, l[:2])
   138  }
   139  
   140  // String returns the BCP 47 representation of the langID.
   141  // Use b as variable name, instead of id, to ensure the variable
   142  // used is consistent with that of Base in which this type is embedded.
   143  func (b langID) String() string {
   144  	if b == 0 {
   145  		return "und"
   146  	} else if b >= langNoIndexOffset {
   147  		b -= langNoIndexOffset
   148  		buf := [3]byte{}
   149  		intToStr(uint(b), buf[:])
   150  		return string(buf[:])
   151  	}
   152  	l := lang.Elem(int(b))
   153  	if l[3] == 0 {
   154  		return l[:3]
   155  	}
   156  	return l[:2]
   157  }
   158  
   159  // ISO3 returns the ISO 639-3 language code.
   160  func (b langID) ISO3() string {
   161  	if b == 0 || b >= langNoIndexOffset {
   162  		return b.String()
   163  	}
   164  	l := lang.Elem(int(b))
   165  	if l[3] == 0 {
   166  		return l[:3]
   167  	} else if l[2] == 0 {
   168  		return altLangISO3.Elem(int(l[3]))[:3]
   169  	}
   170  	// This allocation will only happen for 3-letter ISO codes
   171  	// that are non-canonical BCP 47 language identifiers.
   172  	return l[0:1] + l[2:4]
   173  }
   174  
   175  // IsPrivateUse reports whether this language code is reserved for private use.
   176  func (b langID) IsPrivateUse() bool {
   177  	return langPrivateStart <= b && b <= langPrivateEnd
   178  }
   179  
   180  type regionID uint16
   181  
   182  // getRegionID returns the region id for s if s is a valid 2-letter region code
   183  // or unknownRegion.
   184  func getRegionID(s []byte) (regionID, error) {
   185  	if len(s) == 3 {
   186  		if isAlpha(s[0]) {
   187  			return getRegionISO3(s)
   188  		}
   189  		if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {
   190  			return getRegionM49(int(i))
   191  		}
   192  	}
   193  	return getRegionISO2(s)
   194  }
   195  
   196  // getRegionISO2 returns the regionID for the given 2-letter ISO country code
   197  // or unknownRegion if this does not exist.
   198  func getRegionISO2(s []byte) (regionID, error) {
   199  	i, err := findIndex(regionISO, s, "ZZ")
   200  	if err != nil {
   201  		return 0, err
   202  	}
   203  	return regionID(i) + isoRegionOffset, nil
   204  }
   205  
   206  // getRegionISO3 returns the regionID for the given 3-letter ISO country code
   207  // or unknownRegion if this does not exist.
   208  func getRegionISO3(s []byte) (regionID, error) {
   209  	if tag.FixCase("ZZZ", s) {
   210  		for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
   211  			if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
   212  				return regionID(i) + isoRegionOffset, nil
   213  			}
   214  		}
   215  		for i := 0; i < len(altRegionISO3); i += 3 {
   216  			if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
   217  				return regionID(altRegionIDs[i/3]), nil
   218  			}
   219  		}
   220  		return 0, mkErrInvalid(s)
   221  	}
   222  	return 0, errSyntax
   223  }
   224  
   225  func getRegionM49(n int) (regionID, error) {
   226  	if 0 < n && n <= 999 {
   227  		const (
   228  			searchBits = 7
   229  			regionBits = 9
   230  			regionMask = 1<<regionBits - 1
   231  		)
   232  		idx := n >> searchBits
   233  		buf := fromM49[m49Index[idx]:m49Index[idx+1]]
   234  		val := uint16(n) << regionBits // we rely on bits shifting out
   235  		i := sort.Search(len(buf), func(i int) bool {
   236  			return buf[i] >= val
   237  		})
   238  		if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
   239  			return regionID(r & regionMask), nil
   240  		}
   241  	}
   242  	var e ValueError
   243  	fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)
   244  	return 0, e
   245  }
   246  
   247  // normRegion returns a region if r is deprecated or 0 otherwise.
   248  // TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
   249  // TODO: consider mapping split up regions to new most populous one (like CLDR).
   250  func normRegion(r regionID) regionID {
   251  	m := regionOldMap
   252  	k := sort.Search(len(m), func(i int) bool {
   253  		return m[i].from >= uint16(r)
   254  	})
   255  	if k < len(m) && m[k].from == uint16(r) {
   256  		return regionID(m[k].to)
   257  	}
   258  	return 0
   259  }
   260  
   261  const (
   262  	iso3166UserAssigned = 1 << iota
   263  	ccTLD
   264  	bcp47Region
   265  )
   266  
   267  func (r regionID) typ() byte {
   268  	return regionTypes[r]
   269  }
   270  
   271  // String returns the BCP 47 representation for the region.
   272  // It returns "ZZ" for an unspecified region.
   273  func (r regionID) String() string {
   274  	if r < isoRegionOffset {
   275  		if r == 0 {
   276  			return "ZZ"
   277  		}
   278  		return fmt.Sprintf("%03d", r.M49())
   279  	}
   280  	r -= isoRegionOffset
   281  	return regionISO.Elem(int(r))[:2]
   282  }
   283  
   284  // ISO3 returns the 3-letter ISO code of r.
   285  // Note that not all regions have a 3-letter ISO code.
   286  // In such cases this method returns "ZZZ".
   287  func (r regionID) ISO3() string {
   288  	if r < isoRegionOffset {
   289  		return "ZZZ"
   290  	}
   291  	r -= isoRegionOffset
   292  	reg := regionISO.Elem(int(r))
   293  	switch reg[2] {
   294  	case 0:
   295  		return altRegionISO3[reg[3]:][:3]
   296  	case ' ':
   297  		return "ZZZ"
   298  	}
   299  	return reg[0:1] + reg[2:4]
   300  }
   301  
   302  // M49 returns the UN M.49 encoding of r, or 0 if this encoding
   303  // is not defined for r.
   304  func (r regionID) M49() int {
   305  	return int(m49[r])
   306  }
   307  
   308  // IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
   309  // may include private-use tags that are assigned by CLDR and used in this
   310  // implementation. So IsPrivateUse and IsCountry can be simultaneously true.
   311  func (r regionID) IsPrivateUse() bool {
   312  	return r.typ()&iso3166UserAssigned != 0
   313  }
   314  
   315  type scriptID uint8
   316  
   317  // getScriptID returns the script id for string s. It assumes that s
   318  // is of the format [A-Z][a-z]{3}.
   319  func getScriptID(idx tag.Index, s []byte) (scriptID, error) {
   320  	i, err := findIndex(idx, s, "Zzzz")
   321  	return scriptID(i), err
   322  }
   323  
   324  // String returns the script code in title case.
   325  // It returns "Zzzz" for an unspecified script.
   326  func (s scriptID) String() string {
   327  	if s == 0 {
   328  		return "Zzzz"
   329  	}
   330  	return script.Elem(int(s))
   331  }
   332  
   333  // IsPrivateUse reports whether this script code is reserved for private use.
   334  func (s scriptID) IsPrivateUse() bool {
   335  	return _Qaaa <= s && s <= _Qabx
   336  }
   337  
   338  const (
   339  	maxAltTaglen = len("en-US-POSIX")
   340  	maxLen       = maxAltTaglen
   341  )
   342  
   343  var (
   344  	// grandfatheredMap holds a mapping from legacy and grandfathered tags to
   345  	// their base language or index to more elaborate tag.
   346  	grandfatheredMap = map[[maxLen]byte]int16{
   347  		[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
   348  		[maxLen]byte{'i', '-', 'a', 'm', 'i'}:                          _ami, // i-ami
   349  		[maxLen]byte{'i', '-', 'b', 'n', 'n'}:                          _bnn, // i-bnn
   350  		[maxLen]byte{'i', '-', 'h', 'a', 'k'}:                          _hak, // i-hak
   351  		[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}:      _tlh, // i-klingon
   352  		[maxLen]byte{'i', '-', 'l', 'u', 'x'}:                          _lb,  // i-lux
   353  		[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}:           _nv,  // i-navajo
   354  		[maxLen]byte{'i', '-', 'p', 'w', 'n'}:                          _pwn, // i-pwn
   355  		[maxLen]byte{'i', '-', 't', 'a', 'o'}:                          _tao, // i-tao
   356  		[maxLen]byte{'i', '-', 't', 'a', 'y'}:                          _tay, // i-tay
   357  		[maxLen]byte{'i', '-', 't', 's', 'u'}:                          _tsu, // i-tsu
   358  		[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}:                     _nb,  // no-bok
   359  		[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}:                     _nn,  // no-nyn
   360  		[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}:      _sfb, // sgn-BE-FR
   361  		[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}:      _vgt, // sgn-BE-NL
   362  		[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}:      _sgg, // sgn-CH-DE
   363  		[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}:           _cmn, // zh-guoyu
   364  		[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}:           _hak, // zh-hakka
   365  		[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
   366  		[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}:           _hsn, // zh-xiang
   367  
   368  		// Grandfathered tags with no modern replacement will be converted as
   369  		// follows:
   370  		[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
   371  		[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}:           -2, // en-GB-oed
   372  		[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}:           -3, // i-default
   373  		[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}:      -4, // i-enochian
   374  		[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}:                     -5, // i-mingo
   375  		[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}:                          -6, // zh-min
   376  
   377  		// CLDR-specific tag.
   378  		[maxLen]byte{'r', 'o', 'o', 't'}:                                    0,  // root
   379  		[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX"
   380  	}
   381  
   382  	altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}
   383  
   384  	altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
   385  )
   386  
   387  func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
   388  	if v, ok := grandfatheredMap[s]; ok {
   389  		if v < 0 {
   390  			return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
   391  		}
   392  		t.lang = langID(v)
   393  		return t, true
   394  	}
   395  	return t, false
   396  }