github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/collate/maketables.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 // Collation table generator. 8 // Data read from the web. 9 10 package main 11 12 import ( 13 "archive/zip" 14 "bufio" 15 "bytes" 16 "flag" 17 "fmt" 18 "io" 19 "io/ioutil" 20 "log" 21 "os" 22 "regexp" 23 "sort" 24 "strconv" 25 "strings" 26 "unicode/utf8" 27 28 "golang.org/x/text/collate" 29 "golang.org/x/text/collate/build" 30 "golang.org/x/text/collate/colltab" 31 "golang.org/x/text/internal/gen" 32 "golang.org/x/text/language" 33 "golang.org/x/text/unicode/cldr" 34 ) 35 36 var ( 37 test = flag.Bool("test", false, 38 "test existing tables; can be used to compare web data with package data.") 39 short = flag.Bool("short", false, `Use "short" alternatives, when available.`) 40 draft = flag.Bool("draft", false, `Use draft versions, when available.`) 41 tags = flag.String("tags", "", "build tags to be included after +build directive") 42 pkg = flag.String("package", "collate", 43 "the name of the package in which the generated file is to be included") 44 45 tables = flagStringSetAllowAll("tables", "collate", "collate,chars", 46 "comma-spearated list of tables to generate.") 47 exclude = flagStringSet("exclude", "zh2", "", 48 "comma-separated list of languages to exclude.") 49 include = flagStringSet("include", "", "", 50 "comma-separated list of languages to include. Include trumps exclude.") 51 types = flagStringSetAllowAll("types", "", "", 52 "comma-separated list of types that should be included.") 53 ) 54 55 // stringSet implements an ordered set based on a list. It implements flag.Value 56 // to allow a set to be specified as a comma-separated list. 57 type stringSet struct { 58 s []string 59 allowed *stringSet 60 dirty bool // needs compaction if true 61 all bool 62 allowAll bool 63 } 64 65 func flagStringSet(name, def, allowed, usage string) *stringSet { 66 ss := &stringSet{} 67 if allowed != "" { 68 usage += fmt.Sprintf(" (allowed values: any of %s)", allowed) 69 ss.allowed = &stringSet{} 70 failOnError(ss.allowed.Set(allowed)) 71 } 72 ss.Set(def) 73 flag.Var(ss, name, usage) 74 return ss 75 } 76 77 func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet { 78 ss := &stringSet{allowAll: true} 79 if allowed == "" { 80 flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`)) 81 } else { 82 ss.allowed = &stringSet{} 83 failOnError(ss.allowed.Set(allowed)) 84 flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed)) 85 } 86 ss.Set(def) 87 return ss 88 } 89 90 func (ss stringSet) Len() int { 91 return len(ss.s) 92 } 93 94 func (ss stringSet) String() string { 95 return strings.Join(ss.s, ",") 96 } 97 98 func (ss *stringSet) Set(s string) error { 99 if ss.allowAll && s == "all" { 100 ss.s = nil 101 ss.all = true 102 return nil 103 } 104 ss.s = ss.s[:0] 105 for _, s := range strings.Split(s, ",") { 106 if s := strings.TrimSpace(s); s != "" { 107 if ss.allowed != nil && !ss.allowed.contains(s) { 108 return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed) 109 } 110 ss.add(s) 111 } 112 } 113 ss.compact() 114 return nil 115 } 116 117 func (ss *stringSet) add(s string) { 118 ss.s = append(ss.s, s) 119 ss.dirty = true 120 } 121 122 func (ss *stringSet) values() []string { 123 ss.compact() 124 return ss.s 125 } 126 127 func (ss *stringSet) contains(s string) bool { 128 if ss.all { 129 return true 130 } 131 for _, v := range ss.s { 132 if v == s { 133 return true 134 } 135 } 136 return false 137 } 138 139 func (ss *stringSet) compact() { 140 if !ss.dirty { 141 return 142 } 143 a := ss.s 144 sort.Strings(a) 145 k := 0 146 for i := 1; i < len(a); i++ { 147 if a[k] != a[i] { 148 a[k+1] = a[i] 149 k++ 150 } 151 } 152 ss.s = a[:k+1] 153 ss.dirty = false 154 } 155 156 func skipLang(l string) bool { 157 if include.Len() > 0 { 158 return !include.contains(l) 159 } 160 return exclude.contains(l) 161 } 162 163 // altInclude returns a list of alternatives (for the LDML alt attribute) 164 // in order of preference. An empty string in this list indicates the 165 // default entry. 166 func altInclude() []string { 167 l := []string{} 168 if *short { 169 l = append(l, "short") 170 } 171 l = append(l, "") 172 // TODO: handle draft using cldr.SetDraftLevel 173 if *draft { 174 l = append(l, "proposed") 175 } 176 return l 177 } 178 179 func failOnError(e error) { 180 if e != nil { 181 log.Panic(e) 182 } 183 } 184 185 func openArchive() *zip.Reader { 186 f := gen.OpenCLDRCoreZip() 187 buffer, err := ioutil.ReadAll(f) 188 f.Close() 189 failOnError(err) 190 archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer))) 191 failOnError(err) 192 return archive 193 } 194 195 // parseUCA parses a Default Unicode Collation Element Table of the format 196 // specified in http://www.unicode.org/reports/tr10/#File_Format. 197 // It returns the variable top. 198 func parseUCA(builder *build.Builder) { 199 var r io.ReadCloser 200 var err error 201 for _, f := range openArchive().File { 202 if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") { 203 r, err = f.Open() 204 } 205 } 206 if r == nil { 207 log.Fatal("File allkeys_CLDR.txt not found in archive.") 208 } 209 failOnError(err) 210 defer r.Close() 211 scanner := bufio.NewScanner(r) 212 colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`) 213 for i := 1; scanner.Scan(); i++ { 214 line := scanner.Text() 215 if len(line) == 0 || line[0] == '#' { 216 continue 217 } 218 if line[0] == '@' { 219 // parse properties 220 switch { 221 case strings.HasPrefix(line[1:], "version "): 222 a := strings.Split(line[1:], " ") 223 if a[1] != gen.UnicodeVersion() { 224 log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion()) 225 } 226 case strings.HasPrefix(line[1:], "backwards "): 227 log.Fatalf("%d: unsupported option backwards", i) 228 default: 229 log.Printf("%d: unknown option %s", i, line[1:]) 230 } 231 } else { 232 // parse entries 233 part := strings.Split(line, " ; ") 234 if len(part) != 2 { 235 log.Fatalf("%d: production rule without ';': %v", i, line) 236 } 237 lhs := []rune{} 238 for _, v := range strings.Split(part[0], " ") { 239 if v == "" { 240 continue 241 } 242 lhs = append(lhs, rune(convHex(i, v))) 243 } 244 var n int 245 var vars []int 246 rhs := [][]int{} 247 for i, m := range colelem.FindAllStringSubmatch(part[1], -1) { 248 n += len(m[0]) 249 elem := []int{} 250 for _, h := range strings.Split(m[2], ".") { 251 elem = append(elem, convHex(i, h)) 252 } 253 if m[1] == "*" { 254 vars = append(vars, i) 255 } 256 rhs = append(rhs, elem) 257 } 258 if len(part[1]) < n+3 || part[1][n+1] != '#' { 259 log.Fatalf("%d: expected comment; found %s", i, part[1][n:]) 260 } 261 if *test { 262 testInput.add(string(lhs)) 263 } 264 failOnError(builder.Add(lhs, rhs, vars)) 265 } 266 } 267 if scanner.Err() != nil { 268 log.Fatal(scanner.Err()) 269 } 270 } 271 272 func convHex(line int, s string) int { 273 r, e := strconv.ParseInt(s, 16, 32) 274 if e != nil { 275 log.Fatalf("%d: %v", line, e) 276 } 277 return int(r) 278 } 279 280 var testInput = stringSet{} 281 282 var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`) 283 var tagRe = regexp.MustCompile(`<([a-z_]*) */>`) 284 285 var mainLocales = []string{} 286 287 // charsets holds a list of exemplar characters per category. 288 type charSets map[string][]string 289 290 func (p charSets) fprint(w io.Writer) { 291 fmt.Fprintln(w, "[exN]string{") 292 for i, k := range []string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"} { 293 if set := p[k]; len(set) != 0 { 294 fmt.Fprintf(w, "\t\t%d: %q,\n", i, strings.Join(set, " ")) 295 } 296 } 297 fmt.Fprintln(w, "\t},") 298 } 299 300 var localeChars = make(map[string]charSets) 301 302 const exemplarHeader = ` 303 type exemplarType int 304 const ( 305 exCharacters exemplarType = iota 306 exContractions 307 exPunctuation 308 exAuxiliary 309 exCurrency 310 exIndex 311 exN 312 ) 313 ` 314 315 func printExemplarCharacters(w io.Writer) { 316 fmt.Fprintln(w, exemplarHeader) 317 fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{") 318 for _, loc := range mainLocales { 319 fmt.Fprintf(w, "\t%q: ", loc) 320 localeChars[loc].fprint(w) 321 } 322 fmt.Fprintln(w, "}") 323 } 324 325 func decodeCLDR(d *cldr.Decoder) *cldr.CLDR { 326 r := gen.OpenCLDRCoreZip() 327 data, err := d.DecodeZip(r) 328 failOnError(err) 329 return data 330 } 331 332 // parseMain parses XML files in the main directory of the CLDR core.zip file. 333 func parseMain() { 334 d := &cldr.Decoder{} 335 d.SetDirFilter("main") 336 d.SetSectionFilter("characters") 337 data := decodeCLDR(d) 338 for _, loc := range data.Locales() { 339 x := data.RawLDML(loc) 340 if skipLang(x.Identity.Language.Type) { 341 continue 342 } 343 if x.Characters != nil { 344 x, _ = data.LDML(loc) 345 loc = language.Make(loc).String() 346 for _, ec := range x.Characters.ExemplarCharacters { 347 if ec.Draft != "" { 348 continue 349 } 350 if _, ok := localeChars[loc]; !ok { 351 mainLocales = append(mainLocales, loc) 352 localeChars[loc] = make(charSets) 353 } 354 localeChars[loc][ec.Type] = parseCharacters(ec.Data()) 355 } 356 } 357 } 358 } 359 360 func parseCharacters(chars string) []string { 361 parseSingle := func(s string) (r rune, tail string, escaped bool) { 362 if s[0] == '\\' { 363 return rune(s[1]), s[2:], true 364 } 365 r, sz := utf8.DecodeRuneInString(s) 366 return r, s[sz:], false 367 } 368 chars = strings.TrimSpace(chars) 369 if n := len(chars) - 1; chars[n] == ']' && chars[0] == '[' { 370 chars = chars[1:n] 371 } 372 list := []string{} 373 var r, last, end rune 374 for len(chars) > 0 { 375 if chars[0] == '{' { // character sequence 376 buf := []rune{} 377 for chars = chars[1:]; len(chars) > 0; { 378 r, chars, _ = parseSingle(chars) 379 if r == '}' { 380 break 381 } 382 if r == ' ' { 383 log.Fatalf("space not supported in sequence %q", chars) 384 } 385 buf = append(buf, r) 386 } 387 list = append(list, string(buf)) 388 last = 0 389 } else { // single character 390 escaped := false 391 r, chars, escaped = parseSingle(chars) 392 if r != ' ' { 393 if r == '-' && !escaped { 394 if last == 0 { 395 log.Fatal("'-' should be preceded by a character") 396 } 397 end, chars, _ = parseSingle(chars) 398 for ; last <= end; last++ { 399 list = append(list, string(last)) 400 } 401 last = 0 402 } else { 403 list = append(list, string(r)) 404 last = r 405 } 406 } 407 } 408 } 409 return list 410 } 411 412 var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`) 413 414 // parseCollation parses XML files in the collation directory of the CLDR core.zip file. 415 func parseCollation(b *build.Builder) { 416 d := &cldr.Decoder{} 417 d.SetDirFilter("collation") 418 data := decodeCLDR(d) 419 for _, loc := range data.Locales() { 420 x, err := data.LDML(loc) 421 failOnError(err) 422 if skipLang(x.Identity.Language.Type) { 423 continue 424 } 425 cs := x.Collations.Collation 426 sl := cldr.MakeSlice(&cs) 427 if len(types.s) == 0 { 428 sl.SelectAnyOf("type", x.Collations.Default()) 429 } else if !types.all { 430 sl.SelectAnyOf("type", types.s...) 431 } 432 sl.SelectOnePerGroup("alt", altInclude()) 433 434 for _, c := range cs { 435 id, err := language.Parse(loc) 436 if err != nil { 437 fmt.Fprintf(os.Stderr, "invalid locale: %q", err) 438 continue 439 } 440 // Support both old- and new-style defaults. 441 d := c.Type 442 if x.Collations.DefaultCollation == nil { 443 d = x.Collations.Default() 444 } else { 445 d = x.Collations.DefaultCollation.Data() 446 } 447 // We assume tables are being built either for search or collation, 448 // but not both. For search the default is always "search". 449 if d != c.Type && c.Type != "search" { 450 id, err = id.SetTypeForKey("co", c.Type) 451 failOnError(err) 452 } 453 t := b.Tailoring(id) 454 c.Process(processor{t}) 455 } 456 } 457 } 458 459 type processor struct { 460 t *build.Tailoring 461 } 462 463 func (p processor) Reset(anchor string, before int) (err error) { 464 if before != 0 { 465 err = p.t.SetAnchorBefore(anchor) 466 } else { 467 err = p.t.SetAnchor(anchor) 468 } 469 failOnError(err) 470 return nil 471 } 472 473 func (p processor) Insert(level int, str, context, extend string) error { 474 str = context + str 475 if *test { 476 testInput.add(str) 477 } 478 // TODO: mimic bug in old maketables: remove. 479 err := p.t.Insert(colltab.Level(level-1), str, context+extend) 480 failOnError(err) 481 return nil 482 } 483 484 func (p processor) Index(id string) { 485 } 486 487 func testCollator(c *collate.Collator) { 488 c0 := collate.New(language.Und) 489 490 // iterator over all characters for all locales and check 491 // whether Key is equal. 492 buf := collate.Buffer{} 493 494 // Add all common and not too uncommon runes to the test set. 495 for i := rune(0); i < 0x30000; i++ { 496 testInput.add(string(i)) 497 } 498 for i := rune(0xE0000); i < 0xF0000; i++ { 499 testInput.add(string(i)) 500 } 501 for _, str := range testInput.values() { 502 k0 := c0.KeyFromString(&buf, str) 503 k := c.KeyFromString(&buf, str) 504 if !bytes.Equal(k0, k) { 505 failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k)) 506 } 507 buf.Reset() 508 } 509 fmt.Println("PASS") 510 } 511 512 func main() { 513 gen.Init() 514 b := build.NewBuilder() 515 parseUCA(b) 516 if tables.contains("chars") { 517 parseMain() 518 } 519 parseCollation(b) 520 521 c, err := b.Build() 522 failOnError(err) 523 524 if *test { 525 testCollator(collate.NewFromTable(c)) 526 } else { 527 w := &bytes.Buffer{} 528 529 gen.WriteUnicodeVersion(w) 530 gen.WriteCLDRVersion(w) 531 532 if tables.contains("collate") { 533 _, err = b.Print(w) 534 failOnError(err) 535 } 536 if tables.contains("chars") { 537 printExemplarCharacters(w) 538 } 539 gen.WriteGoFile("tables.go", *pkg, w.Bytes()) 540 } 541 }