golang.org/x/text@v0.14.0/collate/maketables.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build ignore 6 7 // Collation table generator. 8 // Data read from the web. 9 10 package main 11 12 import ( 13 "archive/zip" 14 "bufio" 15 "bytes" 16 "flag" 17 "fmt" 18 "io" 19 "log" 20 "os" 21 "regexp" 22 "sort" 23 "strconv" 24 "strings" 25 "unicode/utf8" 26 27 "golang.org/x/text/collate" 28 "golang.org/x/text/collate/build" 29 "golang.org/x/text/internal/colltab" 30 "golang.org/x/text/internal/gen" 31 "golang.org/x/text/language" 32 "golang.org/x/text/unicode/cldr" 33 ) 34 35 var ( 36 test = flag.Bool("test", false, 37 "test existing tables; can be used to compare web data with package data.") 38 short = flag.Bool("short", false, `Use "short" alternatives, when available.`) 39 draft = flag.Bool("draft", false, `Use draft versions, when available.`) 40 tags = flag.String("tags", "", "build tags to be included after go:build directive") 41 pkg = flag.String("package", "collate", 42 "the name of the package in which the generated file is to be included") 43 44 tables = flagStringSetAllowAll("tables", "collate", "collate,chars", 45 "comma-spearated list of tables to generate.") 46 exclude = flagStringSet("exclude", "zh2", "", 47 "comma-separated list of languages to exclude.") 48 include = flagStringSet("include", "", "", 49 "comma-separated list of languages to include. Include trumps exclude.") 50 // TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons) 51 // TODO: Not included: traditional (buggy for Bengali) 52 types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "", 53 "comma-separated list of types that should be included.") 54 ) 55 56 // stringSet implements an ordered set based on a list. It implements flag.Value 57 // to allow a set to be specified as a comma-separated list. 58 type stringSet struct { 59 s []string 60 allowed *stringSet 61 dirty bool // needs compaction if true 62 all bool 63 allowAll bool 64 } 65 66 func flagStringSet(name, def, allowed, usage string) *stringSet { 67 ss := &stringSet{} 68 if allowed != "" { 69 usage += fmt.Sprintf(" (allowed values: any of %s)", allowed) 70 ss.allowed = &stringSet{} 71 failOnError(ss.allowed.Set(allowed)) 72 } 73 ss.Set(def) 74 flag.Var(ss, name, usage) 75 return ss 76 } 77 78 func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet { 79 ss := &stringSet{allowAll: true} 80 if allowed == "" { 81 flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`)) 82 } else { 83 ss.allowed = &stringSet{} 84 failOnError(ss.allowed.Set(allowed)) 85 flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed)) 86 } 87 ss.Set(def) 88 return ss 89 } 90 91 func (ss stringSet) Len() int { 92 return len(ss.s) 93 } 94 95 func (ss stringSet) String() string { 96 return strings.Join(ss.s, ",") 97 } 98 99 func (ss *stringSet) Set(s string) error { 100 if ss.allowAll && s == "all" { 101 ss.s = nil 102 ss.all = true 103 return nil 104 } 105 ss.s = ss.s[:0] 106 for _, s := range strings.Split(s, ",") { 107 if s := strings.TrimSpace(s); s != "" { 108 if ss.allowed != nil && !ss.allowed.contains(s) { 109 return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed) 110 } 111 ss.add(s) 112 } 113 } 114 ss.compact() 115 return nil 116 } 117 118 func (ss *stringSet) add(s string) { 119 ss.s = append(ss.s, s) 120 ss.dirty = true 121 } 122 123 func (ss *stringSet) values() []string { 124 ss.compact() 125 return ss.s 126 } 127 128 func (ss *stringSet) contains(s string) bool { 129 if ss.all { 130 return true 131 } 132 for _, v := range ss.s { 133 if v == s { 134 return true 135 } 136 } 137 return false 138 } 139 140 func (ss *stringSet) compact() { 141 if !ss.dirty { 142 return 143 } 144 a := ss.s 145 sort.Strings(a) 146 k := 0 147 for i := 1; i < len(a); i++ { 148 if a[k] != a[i] { 149 a[k+1] = a[i] 150 k++ 151 } 152 } 153 ss.s = a[:k+1] 154 ss.dirty = false 155 } 156 157 func skipLang(l string) bool { 158 if include.Len() > 0 { 159 return !include.contains(l) 160 } 161 return exclude.contains(l) 162 } 163 164 // altInclude returns a list of alternatives (for the LDML alt attribute) 165 // in order of preference. An empty string in this list indicates the 166 // default entry. 167 func altInclude() []string { 168 l := []string{} 169 if *short { 170 l = append(l, "short") 171 } 172 l = append(l, "") 173 // TODO: handle draft using cldr.SetDraftLevel 174 if *draft { 175 l = append(l, "proposed") 176 } 177 return l 178 } 179 180 func failOnError(e error) { 181 if e != nil { 182 log.Panic(e) 183 } 184 } 185 186 func openArchive() *zip.Reader { 187 f := gen.OpenCLDRCoreZip() 188 buffer, err := io.ReadAll(f) 189 f.Close() 190 failOnError(err) 191 archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer))) 192 failOnError(err) 193 return archive 194 } 195 196 // parseUCA parses a Default Unicode Collation Element Table of the format 197 // specified in https://www.unicode.org/reports/tr10/#File_Format. 198 // It returns the variable top. 199 func parseUCA(builder *build.Builder) { 200 var r io.ReadCloser 201 var err error 202 for _, f := range openArchive().File { 203 if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") { 204 r, err = f.Open() 205 } 206 } 207 if r == nil { 208 log.Fatal("File allkeys_CLDR.txt not found in archive.") 209 } 210 failOnError(err) 211 defer r.Close() 212 scanner := bufio.NewScanner(r) 213 colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`) 214 for i := 1; scanner.Scan(); i++ { 215 line := scanner.Text() 216 if len(line) == 0 || line[0] == '#' { 217 continue 218 } 219 if line[0] == '@' { 220 // parse properties 221 switch { 222 case strings.HasPrefix(line[1:], "version "): 223 a := strings.Split(line[1:], " ") 224 if a[1] != gen.UnicodeVersion() { 225 log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion()) 226 } 227 case strings.HasPrefix(line[1:], "backwards "): 228 log.Fatalf("%d: unsupported option backwards", i) 229 default: 230 log.Printf("%d: unknown option %s", i, line[1:]) 231 } 232 } else { 233 // parse entries 234 part := strings.Split(line, " ; ") 235 if len(part) != 2 { 236 log.Fatalf("%d: production rule without ';': %v", i, line) 237 } 238 lhs := []rune{} 239 for _, v := range strings.Split(part[0], " ") { 240 if v == "" { 241 continue 242 } 243 lhs = append(lhs, rune(convHex(i, v))) 244 } 245 var n int 246 var vars []int 247 rhs := [][]int{} 248 for i, m := range colelem.FindAllStringSubmatch(part[1], -1) { 249 n += len(m[0]) 250 elem := []int{} 251 for _, h := range strings.Split(m[2], ".") { 252 elem = append(elem, convHex(i, h)) 253 } 254 if m[1] == "*" { 255 vars = append(vars, i) 256 } 257 rhs = append(rhs, elem) 258 } 259 if len(part[1]) < n+3 || part[1][n+1] != '#' { 260 log.Fatalf("%d: expected comment; found %s", i, part[1][n:]) 261 } 262 if *test { 263 testInput.add(string(lhs)) 264 } 265 failOnError(builder.Add(lhs, rhs, vars)) 266 } 267 } 268 if scanner.Err() != nil { 269 log.Fatal(scanner.Err()) 270 } 271 } 272 273 func convHex(line int, s string) int { 274 r, e := strconv.ParseInt(s, 16, 32) 275 if e != nil { 276 log.Fatalf("%d: %v", line, e) 277 } 278 return int(r) 279 } 280 281 var testInput = stringSet{} 282 283 var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`) 284 var tagRe = regexp.MustCompile(`<([a-z_]*) */>`) 285 286 var mainLocales = []string{} 287 288 // charSets holds a list of exemplar characters per category. 289 type charSets map[string][]string 290 291 func (p charSets) fprint(w io.Writer) { 292 fmt.Fprintln(w, "[exN]string{") 293 for i, k := range []string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"} { 294 if set := p[k]; len(set) != 0 { 295 fmt.Fprintf(w, "\t\t%d: %q,\n", i, strings.Join(set, " ")) 296 } 297 } 298 fmt.Fprintln(w, "\t},") 299 } 300 301 var localeChars = make(map[string]charSets) 302 303 const exemplarHeader = ` 304 type exemplarType int 305 const ( 306 exCharacters exemplarType = iota 307 exContractions 308 exPunctuation 309 exAuxiliary 310 exCurrency 311 exIndex 312 exN 313 ) 314 ` 315 316 func printExemplarCharacters(w io.Writer) { 317 fmt.Fprintln(w, exemplarHeader) 318 fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{") 319 for _, loc := range mainLocales { 320 fmt.Fprintf(w, "\t%q: ", loc) 321 localeChars[loc].fprint(w) 322 } 323 fmt.Fprintln(w, "}") 324 } 325 326 func decodeCLDR(d *cldr.Decoder) *cldr.CLDR { 327 r := gen.OpenCLDRCoreZip() 328 data, err := d.DecodeZip(r) 329 failOnError(err) 330 return data 331 } 332 333 // parseMain parses XML files in the main directory of the CLDR core.zip file. 334 func parseMain() { 335 d := &cldr.Decoder{} 336 d.SetDirFilter("main") 337 d.SetSectionFilter("characters") 338 data := decodeCLDR(d) 339 for _, loc := range data.Locales() { 340 x := data.RawLDML(loc) 341 if skipLang(x.Identity.Language.Type) { 342 continue 343 } 344 if x.Characters != nil { 345 x, _ = data.LDML(loc) 346 loc = language.Make(loc).String() 347 for _, ec := range x.Characters.ExemplarCharacters { 348 if ec.Draft != "" { 349 continue 350 } 351 if _, ok := localeChars[loc]; !ok { 352 mainLocales = append(mainLocales, loc) 353 localeChars[loc] = make(charSets) 354 } 355 localeChars[loc][ec.Type] = parseCharacters(ec.Data()) 356 } 357 } 358 } 359 } 360 361 func parseCharacters(chars string) []string { 362 parseSingle := func(s string) (r rune, tail string, escaped bool) { 363 if s[0] == '\\' { 364 return rune(s[1]), s[2:], true 365 } 366 r, sz := utf8.DecodeRuneInString(s) 367 return r, s[sz:], false 368 } 369 chars = strings.TrimSpace(chars) 370 if n := len(chars) - 1; chars[n] == ']' && chars[0] == '[' { 371 chars = chars[1:n] 372 } 373 list := []string{} 374 var r, last, end rune 375 for len(chars) > 0 { 376 if chars[0] == '{' { // character sequence 377 buf := []rune{} 378 for chars = chars[1:]; len(chars) > 0; { 379 r, chars, _ = parseSingle(chars) 380 if r == '}' { 381 break 382 } 383 if r == ' ' { 384 log.Fatalf("space not supported in sequence %q", chars) 385 } 386 buf = append(buf, r) 387 } 388 list = append(list, string(buf)) 389 last = 0 390 } else { // single character 391 escaped := false 392 r, chars, escaped = parseSingle(chars) 393 if r != ' ' { 394 if r == '-' && !escaped { 395 if last == 0 { 396 log.Fatal("'-' should be preceded by a character") 397 } 398 end, chars, _ = parseSingle(chars) 399 for ; last <= end; last++ { 400 list = append(list, string(last)) 401 } 402 last = 0 403 } else { 404 list = append(list, string(r)) 405 last = r 406 } 407 } 408 } 409 } 410 return list 411 } 412 413 var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`) 414 415 // typeMap translates legacy type keys to their BCP47 equivalent. 416 var typeMap = map[string]string{ 417 "phonebook": "phonebk", 418 "traditional": "trad", 419 } 420 421 // parseCollation parses XML files in the collation directory of the CLDR core.zip file. 422 func parseCollation(b *build.Builder) { 423 d := &cldr.Decoder{} 424 d.SetDirFilter("collation") 425 data := decodeCLDR(d) 426 for _, loc := range data.Locales() { 427 x, err := data.LDML(loc) 428 failOnError(err) 429 if skipLang(x.Identity.Language.Type) { 430 continue 431 } 432 cs := x.Collations.Collation 433 sl := cldr.MakeSlice(&cs) 434 if len(types.s) == 0 { 435 sl.SelectAnyOf("type", x.Collations.Default()) 436 } else if !types.all { 437 sl.SelectAnyOf("type", types.s...) 438 } 439 sl.SelectOnePerGroup("alt", altInclude()) 440 441 for _, c := range cs { 442 id, err := language.Parse(loc) 443 if err != nil { 444 fmt.Fprintf(os.Stderr, "invalid locale: %q", err) 445 continue 446 } 447 // Support both old- and new-style defaults. 448 d := c.Type 449 if x.Collations.DefaultCollation == nil { 450 d = x.Collations.Default() 451 } else { 452 d = x.Collations.DefaultCollation.Data() 453 } 454 // We assume tables are being built either for search or collation, 455 // but not both. For search the default is always "search". 456 if d != c.Type && c.Type != "search" { 457 typ := c.Type 458 if len(c.Type) > 8 { 459 typ = typeMap[c.Type] 460 } 461 id, err = id.SetTypeForKey("co", typ) 462 failOnError(err) 463 } 464 t := b.Tailoring(id) 465 c.Process(processor{t}) 466 } 467 } 468 } 469 470 type processor struct { 471 t *build.Tailoring 472 } 473 474 func (p processor) Reset(anchor string, before int) (err error) { 475 if before != 0 { 476 err = p.t.SetAnchorBefore(anchor) 477 } else { 478 err = p.t.SetAnchor(anchor) 479 } 480 failOnError(err) 481 return nil 482 } 483 484 func (p processor) Insert(level int, str, context, extend string) error { 485 str = context + str 486 if *test { 487 testInput.add(str) 488 } 489 // TODO: mimic bug in old maketables: remove. 490 err := p.t.Insert(colltab.Level(level-1), str, context+extend) 491 failOnError(err) 492 return nil 493 } 494 495 func (p processor) Index(id string) { 496 } 497 498 func testCollator(c *collate.Collator) { 499 c0 := collate.New(language.Und) 500 501 // iterator over all characters for all locales and check 502 // whether Key is equal. 503 buf := collate.Buffer{} 504 505 // Add all common and not too uncommon runes to the test set. 506 for i := rune(0); i < 0x30000; i++ { 507 testInput.add(string(i)) 508 } 509 for i := rune(0xE0000); i < 0xF0000; i++ { 510 testInput.add(string(i)) 511 } 512 for _, str := range testInput.values() { 513 k0 := c0.KeyFromString(&buf, str) 514 k := c.KeyFromString(&buf, str) 515 if !bytes.Equal(k0, k) { 516 failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k)) 517 } 518 buf.Reset() 519 } 520 fmt.Println("PASS") 521 } 522 523 func main() { 524 gen.Init() 525 b := build.NewBuilder() 526 parseUCA(b) 527 if tables.contains("chars") { 528 parseMain() 529 } 530 parseCollation(b) 531 532 c, err := b.Build() 533 failOnError(err) 534 535 if *test { 536 testCollator(collate.NewFromTable(c)) 537 } else { 538 w := &bytes.Buffer{} 539 540 gen.WriteUnicodeVersion(w) 541 gen.WriteCLDRVersion(w) 542 543 if tables.contains("collate") { 544 _, err = b.Print(w) 545 failOnError(err) 546 } 547 if tables.contains("chars") { 548 printExemplarCharacters(w) 549 } 550 gen.WriteGoFile("tables.go", *pkg, w.Bytes()) 551 } 552 }