github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/language/match_test.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package language
     6  
     7  import (
     8  	"bytes"
     9  	"flag"
    10  	"fmt"
    11  	"strings"
    12  	"testing"
    13  )
    14  
    15  var verbose = flag.Bool("verbose", false, "set to true to print the internal tables of matchers")
    16  
    17  func TestAddLikelySubtags(t *testing.T) {
    18  	tests := []struct{ in, out string }{
    19  		{"aa", "aa-Latn-ET"},
    20  		{"aa-Latn", "aa-Latn-ET"},
    21  		{"aa-Arab", "aa-Arab-ET"},
    22  		{"aa-Arab-ER", "aa-Arab-ER"},
    23  		{"kk", "kk-Cyrl-KZ"},
    24  		{"kk-CN", "kk-Arab-CN"},
    25  		{"cmn", "cmn"},
    26  		{"zh-AU", "zh-Hant-AU"},
    27  		{"zh-VN", "zh-Hant-VN"},
    28  		{"zh-SG", "zh-Hans-SG"},
    29  		{"zh-Hant", "zh-Hant-TW"},
    30  		{"zh-Hani", "zh-Hani-CN"},
    31  		{"und-Hani", "zh-Hani-CN"},
    32  		{"und", "en-Latn-US"},
    33  		{"und-GB", "en-Latn-GB"},
    34  		{"und-CW", "pap-Latn-CW"},
    35  		{"und-YT", "fr-Latn-YT"},
    36  		{"und-Arab", "ar-Arab-EG"},
    37  		{"und-AM", "hy-Armn-AM"},
    38  		{"und-002", "en-Latn-NG"},
    39  		{"und-Latn-002", "en-Latn-NG"},
    40  		{"en-Latn-002", "en-Latn-NG"},
    41  		{"en-002", "en-Latn-NG"},
    42  		{"en-001", "en-Latn-US"},
    43  		{"und-003", "en-Latn-US"},
    44  		{"und-GB", "en-Latn-GB"},
    45  		{"Latn-001", "en-Latn-US"},
    46  		{"en-001", "en-Latn-US"},
    47  		{"es-419", "es-Latn-419"},
    48  		{"he-145", "he-Hebr-IL"},
    49  		{"ky-145", "ky-Latn-TR"},
    50  		{"kk", "kk-Cyrl-KZ"},
    51  		// Don't specialize duplicate and ambiguous matches.
    52  		{"kk-034", "kk-Arab-034"}, // Matches IR and AF. Both are Arab.
    53  		{"ku-145", "ku-Latn-TR"},  // Matches IQ, TR, and LB, but kk -> TR.
    54  		{"und-Arab-CC", "ms-Arab-CC"},
    55  		{"und-Arab-GB", "ks-Arab-GB"},
    56  		{"und-Hans-CC", "zh-Hans-CC"},
    57  		{"und-CC", "en-Latn-CC"},
    58  		{"sr", "sr-Cyrl-RS"},
    59  		{"sr-151", "sr-Latn-151"}, // Matches RO and RU.
    60  		// We would like addLikelySubtags to generate the same results if the input
    61  		// only changes by adding tags that would otherwise have been added
    62  		// by the expansion.
    63  		// In other words:
    64  		//     und-AA -> xx-Scrp-AA   implies und-Scrp-AA -> xx-Scrp-AA
    65  		//     und-AA -> xx-Scrp-AA   implies xx-AA -> xx-Scrp-AA
    66  		//     und-Scrp -> xx-Scrp-AA implies und-Scrp-AA -> xx-Scrp-AA
    67  		//     und-Scrp -> xx-Scrp-AA implies xx-Scrp -> xx-Scrp-AA
    68  		//     xx -> xx-Scrp-AA       implies xx-Scrp -> xx-Scrp-AA
    69  		//     xx -> xx-Scrp-AA       implies xx-AA -> xx-Scrp-AA
    70  		//
    71  		// The algorithm specified in
    72  		//   http://unicode.org/reports/tr35/tr35-9.html#Supplemental_Data,
    73  		// Section C.10, does not handle the first case. For example,
    74  		// the CLDR data contains an entry und-BJ -> fr-Latn-BJ, but not
    75  		// there is no rule for und-Latn-BJ.  According to spec, und-Latn-BJ
    76  		// would expand to en-Latn-BJ, violating the aforementioned principle.
    77  		// We deviate from the spec by letting und-Scrp-AA expand to xx-Scrp-AA
    78  		// if a rule of the form und-AA -> xx-Scrp-AA is defined.
    79  		// Note that as of version 23, CLDR has some explicitly specified
    80  		// entries that do not conform to these rules. The implementation
    81  		// will not correct these explicit inconsistencies. A later versions of CLDR
    82  		// is supposed to fix this.
    83  		{"und-Latn-BJ", "fr-Latn-BJ"},
    84  		{"und-Bugi-ID", "bug-Bugi-ID"},
    85  		// regions, scripts and languages without definitions
    86  		{"und-Arab-AA", "ar-Arab-AA"},
    87  		{"und-Afak-RE", "fr-Afak-RE"},
    88  		{"und-Arab-GB", "ks-Arab-GB"},
    89  		{"abp-Arab-GB", "abp-Arab-GB"},
    90  		// script has preference over region
    91  		{"und-Arab-NL", "ar-Arab-NL"},
    92  		{"zza", "zza-Latn-TR"},
    93  		// preserve variants and extensions
    94  		{"de-1901", "de-Latn-DE-1901"},
    95  		{"de-x-abc", "de-Latn-DE-x-abc"},
    96  		{"de-1901-x-abc", "de-Latn-DE-1901-x-abc"},
    97  		{"x-abc", "x-abc"}, // TODO: is this the desired behavior?
    98  	}
    99  	for i, tt := range tests {
   100  		in, _ := Parse(tt.in)
   101  		out, _ := Parse(tt.out)
   102  		in, _ = in.addLikelySubtags()
   103  		if in.String() != out.String() {
   104  			t.Errorf("%d: add(%s) was %s; want %s", i, tt.in, in, tt.out)
   105  		}
   106  	}
   107  }
   108  func TestMinimize(t *testing.T) {
   109  	tests := []struct{ in, out string }{
   110  		{"aa", "aa"},
   111  		{"aa-Latn", "aa"},
   112  		{"aa-Latn-ET", "aa"},
   113  		{"aa-ET", "aa"},
   114  		{"aa-Arab", "aa-Arab"},
   115  		{"aa-Arab-ER", "aa-Arab-ER"},
   116  		{"aa-Arab-ET", "aa-Arab"},
   117  		{"und", "und"},
   118  		{"und-Latn", "und"},
   119  		{"und-Latn-US", "und"},
   120  		{"en-Latn-US", "en"},
   121  		{"cmn", "cmn"},
   122  		{"cmn-Hans", "cmn-Hans"},
   123  		{"cmn-Hant", "cmn-Hant"},
   124  		{"zh-AU", "zh-AU"},
   125  		{"zh-VN", "zh-VN"},
   126  		{"zh-SG", "zh-SG"},
   127  		{"zh-Hant", "zh-Hant"},
   128  		{"zh-Hant-TW", "zh-TW"},
   129  		{"zh-Hans", "zh"},
   130  		{"zh-Hani", "zh-Hani"},
   131  		{"und-Hans", "und-Hans"},
   132  		{"und-Hani", "und-Hani"},
   133  
   134  		{"und-CW", "und-CW"},
   135  		{"und-YT", "und-YT"},
   136  		{"und-Arab", "und-Arab"},
   137  		{"und-AM", "und-AM"},
   138  		{"und-Arab-CC", "und-Arab-CC"},
   139  		{"und-CC", "und-CC"},
   140  		{"und-Latn-BJ", "und-BJ"},
   141  		{"und-Bugi-ID", "und-Bugi"},
   142  		{"bug-Bugi-ID", "bug-Bugi"},
   143  		// regions, scripts and languages without definitions
   144  		{"und-Arab-AA", "und-Arab-AA"},
   145  		// preserve variants and extensions
   146  		{"de-Latn-1901", "de-1901"},
   147  		{"de-Latn-x-abc", "de-x-abc"},
   148  		{"de-DE-1901-x-abc", "de-1901-x-abc"},
   149  		{"x-abc", "x-abc"}, // TODO: is this the desired behavior?
   150  	}
   151  	for i, tt := range tests {
   152  		in, _ := Parse(tt.in)
   153  		out, _ := Parse(tt.out)
   154  		min, _ := in.minimize()
   155  		if min.String() != out.String() {
   156  			t.Errorf("%d: min(%s) was %s; want %s", i, tt.in, min, tt.out)
   157  		}
   158  		max, _ := min.addLikelySubtags()
   159  		if x, _ := in.addLikelySubtags(); x.String() != max.String() {
   160  			t.Errorf("%d: max(min(%s)) = %s; want %s", i, tt.in, max, x)
   161  		}
   162  	}
   163  }
   164  
   165  func TestRegionDistance(t *testing.T) {
   166  	tests := []struct {
   167  		a, b string
   168  		d    int
   169  	}{
   170  		{"NL", "NL", 0},
   171  		{"NL", "EU", 1},
   172  		{"EU", "NL", 1},
   173  		{"005", "005", 0},
   174  		{"NL", "BE", 2},
   175  		{"CO", "005", 1},
   176  		{"005", "CO", 1},
   177  		{"CO", "419", 2},
   178  		{"419", "CO", 2},
   179  		{"005", "419", 1},
   180  		{"419", "005", 1},
   181  		{"001", "013", 2},
   182  		{"013", "001", 2},
   183  		{"CO", "CW", 4},
   184  		{"CO", "PW", 6},
   185  		{"CO", "BV", 6},
   186  		{"ZZ", "QQ", 2},
   187  	}
   188  	for i, tt := range tests {
   189  		ra, _ := getRegionID([]byte(tt.a))
   190  		rb, _ := getRegionID([]byte(tt.b))
   191  		if d := regionDistance(ra, rb); d != tt.d {
   192  			t.Errorf("%d: d(%s, %s) = %v; want %v", i, tt.a, tt.b, d, tt.d)
   193  		}
   194  	}
   195  }
   196  
   197  func TestParentDistance(t *testing.T) {
   198  	tests := []struct {
   199  		parent string
   200  		tag    string
   201  		d      uint8
   202  	}{
   203  		{"en-001", "en-AU", 1},
   204  		{"pt-PT", "pt-AO", 1},
   205  		{"pt", "pt-AO", 2},
   206  		{"en-AU", "en-GB", 255},
   207  		{"en-NL", "en-AU", 255},
   208  		// Note that pt-BR and en-US are not automatically minimized.
   209  		{"pt-BR", "pt-AO", 255},
   210  		{"en-US", "en-AU", 255},
   211  	}
   212  	for _, tt := range tests {
   213  		r := Raw.MustParse(tt.parent).region
   214  		tag := Raw.MustParse(tt.tag)
   215  		if d := parentDistance(r, tag); d != tt.d {
   216  			t.Errorf("d(%s, %s) was %d; want %d", r, tag, d, tt.d)
   217  		}
   218  	}
   219  }
   220  
   221  // Implementation of String methods for various types for debugging purposes.
   222  
   223  func (m *matcher) String() string {
   224  	w := &bytes.Buffer{}
   225  	fmt.Fprintln(w, "Default:", m.default_)
   226  	for tag, h := range m.index {
   227  		fmt.Fprintf(w, "  %s: %v\n", tag, h)
   228  	}
   229  	return w.String()
   230  }
   231  
   232  func (h *matchHeader) String() string {
   233  	w := &bytes.Buffer{}
   234  	fmt.Fprintf(w, "exact: ")
   235  	for _, h := range h.exact {
   236  		fmt.Fprintf(w, "%v, ", h)
   237  	}
   238  	fmt.Fprint(w, "; max: ")
   239  	for _, h := range h.max {
   240  		fmt.Fprintf(w, "%v, ", h)
   241  	}
   242  	return w.String()
   243  }
   244  
   245  func (t haveTag) String() string {
   246  	return fmt.Sprintf("%v:%d:%v:%v-%v|%v", t.tag, t.index, t.conf, t.maxRegion, t.maxScript, t.altScript)
   247  }
   248  
   249  // The test set for TestBestMatch is defined in data_test.go.
   250  func TestBestMatch(t *testing.T) {
   251  	parse := func(list string) (out []Tag) {
   252  		for _, s := range strings.Split(list, ",") {
   253  			out = append(out, mk(strings.TrimSpace(s)))
   254  		}
   255  		return out
   256  	}
   257  	for i, tt := range matchTests {
   258  		supported := parse(tt.supported)
   259  		m := newMatcher(supported)
   260  		if *verbose {
   261  			fmt.Printf("%s:\n%v\n", tt.comment, m)
   262  		}
   263  		for _, tm := range tt.test {
   264  			tag, _, conf := m.Match(parse(tm.desired)...)
   265  			if tag.String() != tm.match {
   266  				t.Errorf("%d:%s: find %s in %q: have %s; want %s (%v)\n", i, tt.comment, tm.desired, tt.supported, tag, tm.match, conf)
   267  			}
   268  		}
   269  	}
   270  }
   271  
   272  var benchHave = []Tag{
   273  	mk("en"),
   274  	mk("en-GB"),
   275  	mk("za"),
   276  	mk("zh-Hant"),
   277  	mk("zh-Hans-CN"),
   278  	mk("zh"),
   279  	mk("zh-HK"),
   280  	mk("ar-MK"),
   281  	mk("en-CA"),
   282  	mk("fr-CA"),
   283  	mk("fr-US"),
   284  	mk("fr-CH"),
   285  	mk("fr"),
   286  	mk("lt"),
   287  	mk("lv"),
   288  	mk("iw"),
   289  	mk("iw-NL"),
   290  	mk("he"),
   291  	mk("he-IT"),
   292  	mk("tlh"),
   293  	mk("ja"),
   294  	mk("ja-Jpan"),
   295  	mk("ja-Jpan-JP"),
   296  	mk("de"),
   297  	mk("de-CH"),
   298  	mk("de-AT"),
   299  	mk("de-DE"),
   300  	mk("sr"),
   301  	mk("sr-Latn"),
   302  	mk("sr-Cyrl"),
   303  	mk("sr-ME"),
   304  }
   305  
   306  var benchWant = [][]Tag{
   307  	[]Tag{
   308  		mk("en"),
   309  	},
   310  	[]Tag{
   311  		mk("en-AU"),
   312  		mk("de-HK"),
   313  		mk("nl"),
   314  		mk("fy"),
   315  		mk("lv"),
   316  	},
   317  	[]Tag{
   318  		mk("en-AU"),
   319  		mk("de-HK"),
   320  		mk("nl"),
   321  		mk("fy"),
   322  	},
   323  	[]Tag{
   324  		mk("ja-Hant"),
   325  		mk("da-HK"),
   326  		mk("nl"),
   327  		mk("zh-TW"),
   328  	},
   329  	[]Tag{
   330  		mk("ja-Hant"),
   331  		mk("da-HK"),
   332  		mk("nl"),
   333  		mk("hr"),
   334  	},
   335  }
   336  
   337  func BenchmarkMatch(b *testing.B) {
   338  	m := newMatcher(benchHave)
   339  	for i := 0; i < b.N; i++ {
   340  		for _, want := range benchWant {
   341  			m.getBest(want...)
   342  		}
   343  	}
   344  }
   345  
   346  func BenchmarkMatchExact(b *testing.B) {
   347  	want := mk("en")
   348  	m := newMatcher(benchHave)
   349  	for i := 0; i < b.N; i++ {
   350  		m.getBest(want)
   351  	}
   352  }
   353  
   354  func BenchmarkMatchAltLanguagePresent(b *testing.B) {
   355  	want := mk("hr")
   356  	m := newMatcher(benchHave)
   357  	for i := 0; i < b.N; i++ {
   358  		m.getBest(want)
   359  	}
   360  }
   361  
   362  func BenchmarkMatchAltLanguageNotPresent(b *testing.B) {
   363  	want := mk("nn")
   364  	m := newMatcher(benchHave)
   365  	for i := 0; i < b.N; i++ {
   366  		m.getBest(want)
   367  	}
   368  }
   369  
   370  func BenchmarkMatchAltScriptPresent(b *testing.B) {
   371  	want := mk("zh-Hant-CN")
   372  	m := newMatcher(benchHave)
   373  	for i := 0; i < b.N; i++ {
   374  		m.getBest(want)
   375  	}
   376  }
   377  
   378  func BenchmarkMatchAltScriptNotPresent(b *testing.B) {
   379  	want := mk("fr-Cyrl")
   380  	m := newMatcher(benchHave)
   381  	for i := 0; i < b.N; i++ {
   382  		m.getBest(want)
   383  	}
   384  }
   385  
   386  func BenchmarkMatchLimitedExact(b *testing.B) {
   387  	want := []Tag{mk("he-NL"), mk("iw-NL")}
   388  	m := newMatcher(benchHave)
   389  	for i := 0; i < b.N; i++ {
   390  		m.getBest(want...)
   391  	}
   392  }