golang.org/x/text@v0.14.0/collate/reg_test.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package collate 6 7 import ( 8 "archive/zip" 9 "bufio" 10 "bytes" 11 "flag" 12 "io" 13 "log" 14 "path" 15 "regexp" 16 "strconv" 17 "strings" 18 "testing" 19 "unicode/utf8" 20 21 "golang.org/x/text/collate/build" 22 "golang.org/x/text/internal/gen" 23 "golang.org/x/text/language" 24 ) 25 26 var long = flag.Bool("long", false, 27 "run time-consuming tests, such as tests that fetch data online") 28 29 // This regression test runs tests for the test files in CollationTest.zip 30 // (taken from https://www.unicode.org/Public/UCA/<gen.UnicodeVersion()>/). 31 // 32 // The test files have the following form: 33 // # header 34 // 0009 0021; # ('\u0009') <CHARACTER TABULATION> [| | | 0201 025E] 35 // 0009 003F; # ('\u0009') <CHARACTER TABULATION> [| | | 0201 0263] 36 // 000A 0021; # ('\u000A') <LINE FEED (LF)> [| | | 0202 025E] 37 // 000A 003F; # ('\u000A') <LINE FEED (LF)> [| | | 0202 0263] 38 // 39 // The part before the semicolon is the hex representation of a sequence 40 // of runes. After the hash mark is a comment. The strings 41 // represented by rune sequence are in the file in sorted order, as 42 // defined by the DUCET. 43 44 type Test struct { 45 name string 46 str [][]byte 47 comment []string 48 } 49 50 var versionRe = regexp.MustCompile(`# UCA Version: (.*)\n?$`) 51 var testRe = regexp.MustCompile(`^([\dA-F ]+);.*# (.*)\n?$`) 52 53 func TestCollation(t *testing.T) { 54 if !gen.IsLocal() && !*long { 55 t.Skip("skipping test to prevent downloading; to run use -long or use -local to specify a local source") 56 } 57 t.Skip("must first update to new file format to support test") 58 for _, test := range loadTestData() { 59 doTest(t, test) 60 } 61 } 62 63 func Error(e error) { 64 if e != nil { 65 log.Fatal(e) 66 } 67 } 68 69 // parseUCA parses a Default Unicode Collation Element Table of the format 70 // specified in https://www.unicode.org/reports/tr10/#File_Format. 71 // It returns the variable top. 72 func parseUCA(builder *build.Builder) { 73 r := gen.OpenUnicodeFile("UCA", "", "allkeys.txt") 74 defer r.Close() 75 input := bufio.NewReader(r) 76 colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`) 77 for i := 1; true; i++ { 78 l, prefix, err := input.ReadLine() 79 if err == io.EOF { 80 break 81 } 82 Error(err) 83 line := string(l) 84 if prefix { 85 log.Fatalf("%d: buffer overflow", i) 86 } 87 if len(line) == 0 || line[0] == '#' { 88 continue 89 } 90 if line[0] == '@' { 91 if strings.HasPrefix(line[1:], "version ") { 92 if v := strings.Split(line[1:], " ")[1]; v != gen.UnicodeVersion() { 93 log.Fatalf("incompatible version %s; want %s", v, gen.UnicodeVersion()) 94 } 95 } 96 } else { 97 // parse entries 98 part := strings.Split(line, " ; ") 99 if len(part) != 2 { 100 log.Fatalf("%d: production rule without ';': %v", i, line) 101 } 102 lhs := []rune{} 103 for _, v := range strings.Split(part[0], " ") { 104 if v != "" { 105 lhs = append(lhs, rune(convHex(i, v))) 106 } 107 } 108 vars := []int{} 109 rhs := [][]int{} 110 for i, m := range colelem.FindAllStringSubmatch(part[1], -1) { 111 if m[1] == "*" { 112 vars = append(vars, i) 113 } 114 elem := []int{} 115 for _, h := range strings.Split(m[2], ".") { 116 elem = append(elem, convHex(i, h)) 117 } 118 rhs = append(rhs, elem) 119 } 120 builder.Add(lhs, rhs, vars) 121 } 122 } 123 } 124 125 func convHex(line int, s string) int { 126 r, e := strconv.ParseInt(s, 16, 32) 127 if e != nil { 128 log.Fatalf("%d: %v", line, e) 129 } 130 return int(r) 131 } 132 133 func loadTestData() []Test { 134 f := gen.OpenUnicodeFile("UCA", "", "CollationTest.zip") 135 buffer, err := io.ReadAll(f) 136 f.Close() 137 Error(err) 138 archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer))) 139 Error(err) 140 tests := []Test{} 141 for _, f := range archive.File { 142 // Skip the short versions, which are simply duplicates of the long versions. 143 if strings.Contains(f.Name, "SHORT") || f.FileInfo().IsDir() { 144 continue 145 } 146 ff, err := f.Open() 147 Error(err) 148 defer ff.Close() 149 scanner := bufio.NewScanner(ff) 150 test := Test{name: path.Base(f.Name)} 151 for scanner.Scan() { 152 line := scanner.Text() 153 if len(line) <= 1 || line[0] == '#' { 154 if m := versionRe.FindStringSubmatch(line); m != nil { 155 if m[1] != gen.UnicodeVersion() { 156 log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], gen.UnicodeVersion()) 157 } 158 } 159 continue 160 } 161 m := testRe.FindStringSubmatch(line) 162 if m == nil || len(m) < 3 { 163 log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m) 164 } 165 str := []byte{} 166 // In the regression test data (unpaired) surrogates are assigned a weight 167 // corresponding to their code point value. However, utf8.DecodeRune, 168 // which is used to compute the implicit weight, assigns FFFD to surrogates. 169 // We therefore skip tests with surrogates. This skips about 35 entries 170 // per test. 171 valid := true 172 for _, split := range strings.Split(m[1], " ") { 173 r, err := strconv.ParseUint(split, 16, 64) 174 Error(err) 175 valid = valid && utf8.ValidRune(rune(r)) 176 str = append(str, string(rune(r))...) 177 } 178 if valid { 179 test.str = append(test.str, str) 180 test.comment = append(test.comment, m[2]) 181 } 182 } 183 if scanner.Err() != nil { 184 log.Fatal(scanner.Err()) 185 } 186 tests = append(tests, test) 187 } 188 return tests 189 } 190 191 var errorCount int 192 193 func runes(b []byte) []rune { 194 return []rune(string(b)) 195 } 196 197 var shifted = language.MustParse("und-u-ka-shifted-ks-level4") 198 199 func doTest(t *testing.T, tc Test) { 200 bld := build.NewBuilder() 201 parseUCA(bld) 202 w, err := bld.Build() 203 Error(err) 204 var tag language.Tag 205 if !strings.Contains(tc.name, "NON_IGNOR") { 206 tag = shifted 207 } 208 c := NewFromTable(w, OptionsFromTag(tag)) 209 b := &Buffer{} 210 prev := tc.str[0] 211 for i := 1; i < len(tc.str); i++ { 212 b.Reset() 213 s := tc.str[i] 214 ka := c.Key(b, prev) 215 kb := c.Key(b, s) 216 if r := bytes.Compare(ka, kb); r == 1 { 217 t.Errorf("%s:%d: Key(%.4X) < Key(%.4X) (%X < %X) == %d; want -1 or 0", tc.name, i, []rune(string(prev)), []rune(string(s)), ka, kb, r) 218 prev = s 219 continue 220 } 221 if r := c.Compare(prev, s); r == 1 { 222 t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want -1 or 0", tc.name, i, runes(prev), runes(s), r) 223 } 224 if r := c.Compare(s, prev); r == -1 { 225 t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want 1 or 0", tc.name, i, runes(s), runes(prev), r) 226 } 227 prev = s 228 } 229 }