golang.org/x/text@v0.14.0/language/gen.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build ignore 6 7 // Language tag table generator. 8 // Data read from the web. 9 10 package main 11 12 import ( 13 "flag" 14 "fmt" 15 "io" 16 "log" 17 "sort" 18 "strconv" 19 "strings" 20 21 "golang.org/x/text/internal/gen" 22 "golang.org/x/text/internal/language" 23 "golang.org/x/text/unicode/cldr" 24 ) 25 26 var ( 27 test = flag.Bool("test", 28 false, 29 "test existing tables; can be used to compare web data with package data.") 30 outputFile = flag.String("output", 31 "tables.go", 32 "output file for generated tables") 33 ) 34 35 func main() { 36 gen.Init() 37 38 w := gen.NewCodeWriter() 39 defer w.WriteGoFile("tables.go", "language") 40 41 b := newBuilder(w) 42 gen.WriteCLDRVersion(w) 43 44 b.writeConstants() 45 b.writeMatchData() 46 } 47 48 type builder struct { 49 w *gen.CodeWriter 50 hw io.Writer // MultiWriter for w and w.Hash 51 data *cldr.CLDR 52 supp *cldr.SupplementalData 53 } 54 55 func (b *builder) langIndex(s string) uint16 { 56 return uint16(language.MustParseBase(s)) 57 } 58 59 func (b *builder) regionIndex(s string) int { 60 return int(language.MustParseRegion(s)) 61 } 62 63 func (b *builder) scriptIndex(s string) int { 64 return int(language.MustParseScript(s)) 65 } 66 67 func newBuilder(w *gen.CodeWriter) *builder { 68 r := gen.OpenCLDRCoreZip() 69 defer r.Close() 70 d := &cldr.Decoder{} 71 data, err := d.DecodeZip(r) 72 if err != nil { 73 log.Fatal(err) 74 } 75 b := builder{ 76 w: w, 77 hw: io.MultiWriter(w, w.Hash), 78 data: data, 79 supp: data.Supplemental(), 80 } 81 return &b 82 } 83 84 // writeConsts computes f(v) for all v in values and writes the results 85 // as constants named _v to a single constant block. 86 func (b *builder) writeConsts(f func(string) int, values ...string) { 87 fmt.Fprintln(b.w, "const (") 88 for _, v := range values { 89 fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v)) 90 } 91 fmt.Fprintln(b.w, ")") 92 } 93 94 // TODO: region inclusion data will probably not be use used in future matchers. 95 96 var langConsts = []string{ 97 "de", "en", "fr", "it", "mo", "no", "nb", "pt", "sh", "mul", "und", 98 } 99 100 var scriptConsts = []string{ 101 "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy", 102 "Zzzz", 103 } 104 105 var regionConsts = []string{ 106 "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US", 107 "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo. 108 } 109 110 func (b *builder) writeConstants() { 111 b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...) 112 b.writeConsts(b.regionIndex, regionConsts...) 113 b.writeConsts(b.scriptIndex, scriptConsts...) 114 } 115 116 type mutualIntelligibility struct { 117 want, have uint16 118 distance uint8 119 oneway bool 120 } 121 122 type scriptIntelligibility struct { 123 wantLang, haveLang uint16 124 wantScript, haveScript uint8 125 distance uint8 126 // Always oneway 127 } 128 129 type regionIntelligibility struct { 130 lang uint16 // compact language id 131 script uint8 // 0 means any 132 group uint8 // 0 means any; if bit 7 is set it means inverse 133 distance uint8 134 // Always twoway. 135 } 136 137 // writeMatchData writes tables with languages and scripts for which there is 138 // mutual intelligibility. The data is based on CLDR's languageMatching data. 139 // Note that we use a different algorithm than the one defined by CLDR and that 140 // we slightly modify the data. For example, we convert scores to confidence levels. 141 // We also drop all region-related data as we use a different algorithm to 142 // determine region equivalence. 143 func (b *builder) writeMatchData() { 144 lm := b.supp.LanguageMatching.LanguageMatches 145 cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new") 146 147 regionHierarchy := map[string][]string{} 148 for _, g := range b.supp.TerritoryContainment.Group { 149 regions := strings.Split(g.Contains, " ") 150 regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...) 151 } 152 // Regions start at 1, so the slice must be one larger than the number of 153 // regions. 154 regionToGroups := make([]uint8, language.NumRegions+1) 155 156 idToIndex := map[string]uint8{} 157 for i, mv := range lm[0].MatchVariable { 158 if i > 6 { 159 log.Fatalf("Too many groups: %d", i) 160 } 161 idToIndex[mv.Id] = uint8(i + 1) 162 // TODO: also handle '-' 163 for _, r := range strings.Split(mv.Value, "+") { 164 todo := []string{r} 165 for k := 0; k < len(todo); k++ { 166 r := todo[k] 167 regionToGroups[b.regionIndex(r)] |= 1 << uint8(i) 168 todo = append(todo, regionHierarchy[r]...) 169 } 170 } 171 } 172 b.w.WriteVar("regionToGroups", regionToGroups) 173 174 // maps language id to in- and out-of-group region. 175 paradigmLocales := [][3]uint16{} 176 locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ") 177 for i := 0; i < len(locales); i += 2 { 178 x := [3]uint16{} 179 for j := 0; j < 2; j++ { 180 pc := strings.SplitN(locales[i+j], "-", 2) 181 x[0] = b.langIndex(pc[0]) 182 if len(pc) == 2 { 183 x[1+j] = uint16(b.regionIndex(pc[1])) 184 } 185 } 186 paradigmLocales = append(paradigmLocales, x) 187 } 188 b.w.WriteVar("paradigmLocales", paradigmLocales) 189 190 b.w.WriteType(mutualIntelligibility{}) 191 b.w.WriteType(scriptIntelligibility{}) 192 b.w.WriteType(regionIntelligibility{}) 193 194 matchLang := []mutualIntelligibility{} 195 matchScript := []scriptIntelligibility{} 196 matchRegion := []regionIntelligibility{} 197 // Convert the languageMatch entries in lists keyed by desired language. 198 for _, m := range lm[0].LanguageMatch { 199 // Different versions of CLDR use different separators. 200 desired := strings.Replace(m.Desired, "-", "_", -1) 201 supported := strings.Replace(m.Supported, "-", "_", -1) 202 d := strings.Split(desired, "_") 203 s := strings.Split(supported, "_") 204 if len(d) != len(s) { 205 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 206 continue 207 } 208 distance, _ := strconv.ParseInt(m.Distance, 10, 8) 209 switch len(d) { 210 case 2: 211 if desired == supported && desired == "*_*" { 212 continue 213 } 214 // language-script pair. 215 matchScript = append(matchScript, scriptIntelligibility{ 216 wantLang: uint16(b.langIndex(d[0])), 217 haveLang: uint16(b.langIndex(s[0])), 218 wantScript: uint8(b.scriptIndex(d[1])), 219 haveScript: uint8(b.scriptIndex(s[1])), 220 distance: uint8(distance), 221 }) 222 if m.Oneway != "true" { 223 matchScript = append(matchScript, scriptIntelligibility{ 224 wantLang: uint16(b.langIndex(s[0])), 225 haveLang: uint16(b.langIndex(d[0])), 226 wantScript: uint8(b.scriptIndex(s[1])), 227 haveScript: uint8(b.scriptIndex(d[1])), 228 distance: uint8(distance), 229 }) 230 } 231 case 1: 232 if desired == supported && desired == "*" { 233 continue 234 } 235 if distance == 1 { 236 // nb == no is already handled by macro mapping. Check there 237 // really is only this case. 238 if d[0] != "no" || s[0] != "nb" { 239 log.Fatalf("unhandled equivalence %s == %s", s[0], d[0]) 240 } 241 continue 242 } 243 // TODO: consider dropping oneway field and just doubling the entry. 244 matchLang = append(matchLang, mutualIntelligibility{ 245 want: uint16(b.langIndex(d[0])), 246 have: uint16(b.langIndex(s[0])), 247 distance: uint8(distance), 248 oneway: m.Oneway == "true", 249 }) 250 case 3: 251 if desired == supported && desired == "*_*_*" { 252 continue 253 } 254 if desired != supported { 255 // This is now supported by CLDR, but only one case, which 256 // should already be covered by paradigm locales. For instance, 257 // test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in 258 // testdata/CLDRLocaleMatcherTest.txt tests this. 259 if supported != "en_*_GB" { 260 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 261 } 262 continue 263 } 264 ri := regionIntelligibility{ 265 lang: b.langIndex(d[0]), 266 distance: uint8(distance), 267 } 268 if d[1] != "*" { 269 ri.script = uint8(b.scriptIndex(d[1])) 270 } 271 switch { 272 case d[2] == "*": 273 ri.group = 0x80 // not contained in anything 274 case strings.HasPrefix(d[2], "$!"): 275 ri.group = 0x80 276 d[2] = "$" + d[2][len("$!"):] 277 fallthrough 278 case strings.HasPrefix(d[2], "$"): 279 ri.group |= idToIndex[d[2]] 280 } 281 matchRegion = append(matchRegion, ri) 282 default: 283 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 284 } 285 } 286 sort.SliceStable(matchLang, func(i, j int) bool { 287 return matchLang[i].distance < matchLang[j].distance 288 }) 289 b.w.WriteComment(` 290 matchLang holds pairs of langIDs of base languages that are typically 291 mutually intelligible. Each pair is associated with a confidence and 292 whether the intelligibility goes one or both ways.`) 293 b.w.WriteVar("matchLang", matchLang) 294 295 b.w.WriteComment(` 296 matchScript holds pairs of scriptIDs where readers of one script 297 can typically also read the other. Each is associated with a confidence.`) 298 sort.SliceStable(matchScript, func(i, j int) bool { 299 return matchScript[i].distance < matchScript[j].distance 300 }) 301 b.w.WriteVar("matchScript", matchScript) 302 303 sort.SliceStable(matchRegion, func(i, j int) bool { 304 return matchRegion[i].distance < matchRegion[j].distance 305 }) 306 b.w.WriteVar("matchRegion", matchRegion) 307 }