vitess.io/vitess@v0.16.2/go/mysql/collations/tools/makecolldata/mysqldata.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package main 18 19 import ( 20 "fmt" 21 "log" 22 "path" 23 "sort" 24 "strconv" 25 "strings" 26 27 "github.com/spf13/pflag" 28 29 "vitess.io/vitess/go/mysql/collations/internal/charset" 30 "vitess.io/vitess/go/mysql/collations/internal/uca" 31 "vitess.io/vitess/go/mysql/collations/tools/makecolldata/codegen" 32 ) 33 34 var Print8BitData = pflag.Bool("full8bit", false, "") 35 36 type TableGenerator struct { 37 *codegen.Generator 38 dedup map[string]string 39 40 baseWeightsUca400 TailoringWeights 41 baseWeightsUca520 TailoringWeights 42 baseWeightsUca900 TailoringWeights 43 } 44 45 type Generator struct { 46 *codegen.Generator 47 Tables TableGenerator 48 } 49 50 func diffMaps(orgWeights, modWeights TailoringWeights) (diff []uca.Patch) { 51 if len(modWeights) == 0 { 52 return nil 53 } 54 55 diffMap := make(TailoringWeights) 56 for key, val := range modWeights { 57 if orgVal, ok := orgWeights[key]; !ok || len(orgVal) != len(val) { 58 diffMap[key] = val 59 continue 60 } 61 62 for i, arr := range val { 63 if orgWeights[key][i] != arr { 64 diffMap[key] = val 65 break 66 } 67 } 68 } 69 70 for key, val := range diffMap { 71 cp, err := strconv.ParseInt(key[2:], 16, 32) 72 if err != nil { 73 panic(err) 74 } 75 diff = append(diff, uca.Patch{Codepoint: rune(cp), Patch: val}) 76 } 77 78 sort.Slice(diff, func(i, j int) bool { 79 return diff[i].Codepoint < diff[j].Codepoint 80 }) 81 82 return 83 } 84 85 func (g *TableGenerator) dedupTable(name, coll string, val any) (string, bool) { 86 raw := fmt.Sprintf("%#v", val) 87 if exist, ok := g.dedup[raw]; ok { 88 return exist, true 89 } 90 91 varname := fmt.Sprintf("%s_%s", name, coll) 92 g.dedup[raw] = varname 93 return varname, false 94 } 95 96 func (g *Generator) printCollationUcaLegacy(meta *CollationMetadata) { 97 tableWeightPatches := g.Tables.writeWeightPatches(meta) 98 tableContractions := g.Tables.writeContractions(meta) 99 100 g.P("register(&Collation_uca_legacy{") 101 g.P("name: ", codegen.Quote(meta.Name), ",") 102 g.P("id: ", meta.Number, ",") 103 g.P("charset: ", PkgCharset, ".Charset_", meta.Charset, "{},") 104 g.P("weights: weightTable_uca", meta.UCAVersion, ",") 105 if tableWeightPatches != "" { 106 g.P("tailoring: ", tableWeightPatches, ",") 107 } 108 if tableContractions != "" { 109 g.P("contract: ", tableContractions, "{},") 110 } 111 switch meta.UCAVersion { 112 case 400: 113 g.P("maxCodepoint: 0xFFFF,") 114 case 520: 115 g.P("maxCodepoint: 0x10FFFF,") 116 default: 117 g.Fail("invalid UCAVersion") 118 } 119 g.P("})") 120 } 121 122 func (g *TableGenerator) writeWeightPatches(meta *CollationMetadata) string { 123 var tableWeightPatches string 124 var dedup bool 125 var baseWeights TailoringWeights 126 127 switch meta.UCAVersion { 128 case 400: 129 baseWeights = g.baseWeightsUca400 130 case 520: 131 baseWeights = g.baseWeightsUca520 132 case 900: 133 baseWeights = g.baseWeightsUca900 134 default: 135 g.Fail("invalid UCAVersion") 136 } 137 138 diff := diffMaps(baseWeights, meta.Weights) 139 if len(diff) > 0 { 140 tableWeightPatches, dedup = g.dedupTable("weightTailoring", meta.Name, diff) 141 if !dedup { 142 g.P("var ", tableWeightPatches, " = ", diff) 143 g.P() 144 } 145 } 146 147 return tableWeightPatches 148 } 149 150 func (g *TableGenerator) writeContractions(meta *CollationMetadata) string { 151 var tableContractions string 152 var dedup bool 153 154 if len(meta.Contractions) > 0 { 155 tableContractions, dedup = g.dedupTable("contractor", meta.Name, meta.Contractions) 156 if !dedup { 157 g.printContractionsFast(tableContractions, meta.Contractions) 158 g.P() 159 } 160 } 161 return tableContractions 162 } 163 164 func (g *TableGenerator) writeReorders(meta *CollationMetadata) string { 165 var tableReorder string 166 var dedup bool 167 168 if len(meta.Reorder) > 0 { 169 tableReorder, dedup = g.dedupTable("reorder", meta.Name, meta.Reorder) 170 if !dedup { 171 var reorder []uca.Reorder 172 for _, r := range meta.Reorder { 173 reorder = append(reorder, uca.Reorder{FromMin: r[0], FromMax: r[1], ToMin: r[2], ToMax: r[3]}) 174 } 175 g.P("var ", tableReorder, " = ", reorder) 176 g.P() 177 } 178 } 179 return tableReorder 180 } 181 182 func (g *Generator) printCollationUca900(meta *CollationMetadata) { 183 if meta.UCAVersion != 900 { 184 g.Fail("unexpected UCA version for UCA900 collation") 185 } 186 187 tableWeights := "weightTable_uca900" 188 switch meta.Name { 189 case "utf8mb4_zh_0900_as_cs": 190 // the chinese weights table is large enough that we don't apply weight patches 191 // to it, we generate it as a whole 192 tableWeights = "weightTable_uca900_zh" 193 meta.Weights = nil 194 195 // HACK: Chinese collations are fully reordered on their patched weights. 196 // They do not need manual reordering even if they include reorder ranges 197 // FIXME: Why does this collation have a reorder range that doesn't apply? 198 meta.Reorder = nil 199 200 case "utf8mb4_ja_0900_as_cs", "utf8mb4_ja_0900_as_cs_ks": 201 // the japanese weights table is large enough that we don't apply weight patches 202 // to it, we generate it as a whole 203 tableWeights = "weightTable_uca900_ja" 204 meta.Weights = nil 205 } 206 207 tableWeightPatches := g.Tables.writeWeightPatches(meta) 208 tableContractions := g.Tables.writeContractions(meta) 209 tableReorder := g.Tables.writeReorders(meta) 210 211 g.P("register(&Collation_utf8mb4_uca_0900{") 212 g.P("name: ", codegen.Quote(meta.Name), ",") 213 g.P("id: ", meta.Number, ",") 214 215 var levels int 216 switch { 217 case strings.HasSuffix(meta.Name, "_ai_ci"): 218 levels = 1 219 case strings.HasSuffix(meta.Name, "_as_ci"): 220 levels = 2 221 case strings.HasSuffix(meta.Name, "_as_cs"): 222 levels = 3 223 case strings.HasSuffix(meta.Name, "_as_cs_ks"): 224 levels = 4 225 default: 226 g.Fail(fmt.Sprintf("unknown levelsForCompare: %q", meta.Name)) 227 } 228 229 g.P("levelsForCompare: ", levels, ",") 230 g.P("weights: ", tableWeights, ",") 231 if tableWeightPatches != "" { 232 g.P("tailoring: ", tableWeightPatches, ",") 233 } 234 if tableContractions != "" { 235 g.P("contract: ", tableContractions, "{},") 236 } 237 if tableReorder != "" { 238 g.P("reorder: ", tableReorder, ",") 239 } 240 if meta.UpperCaseFirst { 241 g.P("upperCaseFirst: true,") 242 } 243 g.P("})") 244 } 245 246 func (g *TableGenerator) printSlice(name, coll string, slice any) string { 247 tableName, dedup := g.dedupTable(name, coll, slice) 248 if !dedup { 249 g.P("var ", tableName, " = ", slice) 250 g.P() 251 } 252 return tableName 253 } 254 255 func (g *TableGenerator) printUnicodeMappings(name, coll string, mappings []charset.UnicodeMapping) string { 256 tableName, dedup := g.dedupTable(name, coll, mappings) 257 if !dedup { 258 g.P("var ", tableName, " = ", mappings) 259 g.P() 260 } 261 return tableName 262 } 263 264 func (g *Generator) printCollation8bit(meta *CollationMetadata) { 265 var tableCtype, tableToLower, tableToUpper, tableSortOrder, tableToUnicode, tableFromUnicode string 266 267 if *Print8BitData { 268 tableCtype = g.Tables.printSlice("ctype", meta.Name, codegen.Array8(meta.CType)) 269 tableToLower = g.Tables.printSlice("tolower", meta.Name, codegen.Array8(meta.ToLower)) 270 tableToUpper = g.Tables.printSlice("toupper", meta.Name, codegen.Array8(meta.ToUpper)) 271 } 272 if meta.SortOrder != nil { 273 tableSortOrder = g.Tables.printSlice("sortorder", meta.Name, codegen.Array8(meta.SortOrder)) 274 } 275 if meta.Charset != "latin1" { 276 if meta.TabToUni != nil { 277 tableToUnicode = g.Tables.printSlice("tounicode", meta.Name, codegen.Array16(meta.TabToUni)) 278 } 279 if meta.TabFromUni != nil { 280 tableFromUnicode = g.Tables.printUnicodeMappings("fromunicode", meta.Name, meta.TabFromUni) 281 } 282 } 283 284 var collation string 285 if meta.Flags.Binary { 286 collation = "Collation_8bit_bin" 287 } else { 288 collation = "Collation_8bit_simple_ci" 289 } 290 291 g.P("register(&", collation, "{") 292 g.P("id: ", meta.Number, ",") 293 g.P("name: ", codegen.Quote(meta.Name), ",") 294 295 g.P("simpletables: simpletables{") 296 if *Print8BitData { 297 g.P("ctype: &", tableCtype, ",") 298 g.P("tolower: &", tableToLower, ",") 299 g.P("toupper: &", tableToUpper, ",") 300 } 301 if tableSortOrder != "" { 302 g.P("sort: &", tableSortOrder, ",") 303 } 304 g.P("},") 305 306 // Optimized implementation for latin1 307 if meta.Charset == "latin1" { 308 g.P("charset: ", PkgCharset, ".Charset_latin1{},") 309 } else { 310 g.P("charset: &", PkgCharset, ".Charset_8bit{") 311 g.P("Name_: ", codegen.Quote(meta.Charset), ",") 312 if tableToUnicode != "" { 313 g.P("ToUnicode: &", tableToUnicode, ",") 314 } 315 if tableFromUnicode != "" { 316 g.P("FromUnicode: ", tableFromUnicode, ",") 317 } 318 g.P("},") 319 } 320 g.P("})") 321 } 322 323 func (g *Generator) printCollationUnicode(meta *CollationMetadata) { 324 var collation string 325 if meta.Flags.Binary { 326 collation = "Collation_unicode_bin" 327 } else { 328 collation = "Collation_unicode_general_ci" 329 } 330 g.P("register(&", collation, "{") 331 g.P("id: ", meta.Number, ",") 332 g.P("name: ", strconv.Quote(meta.Name), ",") 333 if !meta.Flags.Binary { 334 g.P("unicase: unicaseInfo_default,") 335 } 336 g.P("charset: ", PkgCharset, ".Charset_", meta.Charset, "{},") 337 g.P("})") 338 } 339 340 func (g *Generator) printCollationMultibyte(meta *CollationMetadata) { 341 var tableSortOrder string 342 if meta.SortOrder != nil { 343 tableSortOrder = g.Tables.printSlice("sortorder", meta.Name, codegen.Array8(meta.SortOrder)) 344 } 345 346 g.P("register(&Collation_multibyte{") 347 g.P("id: ", meta.Number, ",") 348 g.P("name: ", codegen.Quote(meta.Name), ",") 349 if tableSortOrder != "" { 350 g.P("sort: &", tableSortOrder, ",") 351 } 352 g.P("charset: ", PkgCharset, ".Charset_", meta.Charset, "{},") 353 g.P("})") 354 } 355 356 func makemysqldata(output string, metadata AllMetadata) { 357 var unsupportedByCharset = make(map[string][]string) 358 var g = Generator{ 359 Generator: codegen.NewGenerator(PkgCollations), 360 Tables: TableGenerator{ 361 Generator: codegen.NewGenerator(PkgCollations), 362 dedup: make(map[string]string), 363 baseWeightsUca400: metadata.get("utf8mb4_unicode_ci").Weights, 364 baseWeightsUca520: metadata.get("utf8mb4_unicode_520_ci").Weights, 365 baseWeightsUca900: metadata.get("utf8mb4_0900_ai_ci").Weights, 366 }, 367 } 368 369 g.P("func init() {") 370 371 for _, meta := range metadata { 372 switch { 373 case meta.Name == "utf8mb4_0900_bin" || meta.Name == "binary": 374 // hardcoded collations; nothing to export here 375 376 case meta.Name == "tis620_bin": 377 // explicitly unsupported for now because of not accurate results 378 379 case meta.CollationImpl == "any_uca" || 380 meta.CollationImpl == "utf16_uca" || 381 meta.CollationImpl == "utf32_uca" || 382 meta.CollationImpl == "ucs2_uca": 383 g.printCollationUcaLegacy(meta) 384 385 case meta.CollationImpl == "uca_900": 386 g.printCollationUca900(meta) 387 388 case meta.CollationImpl == "8bit_bin" || meta.CollationImpl == "8bit_simple_ci": 389 g.printCollation8bit(meta) 390 391 case meta.Name == "gb18030_unicode_520_ci": 392 g.printCollationUcaLegacy(meta) 393 394 case charset.IsMultibyteByName(meta.Charset): 395 g.printCollationMultibyte(meta) 396 397 case strings.HasSuffix(meta.Name, "_bin") && charset.IsUnicodeByName(meta.Charset): 398 g.printCollationUnicode(meta) 399 400 case strings.HasSuffix(meta.Name, "_general_ci"): 401 g.printCollationUnicode(meta) 402 403 default: 404 unsupportedByCharset[meta.Charset] = append(unsupportedByCharset[meta.Charset], meta.Name) 405 } 406 } 407 408 g.P("}") 409 codegen.Merge(g.Tables.Generator, g.Generator).WriteToFile(path.Join(output, "mysqldata.go")) 410 411 var unhandledCount int 412 for impl, collations := range unsupportedByCharset { 413 log.Printf("unhandled implementation %q: %s", impl, strings.Join(collations, ", ")) 414 unhandledCount += len(collations) 415 } 416 417 log.Printf("mysqldata: %d/%d collations (%.2f%% handled)", 418 len(metadata)-unhandledCount, len(metadata), 419 float64(len(metadata)-unhandledCount)/float64(len(metadata))*100.0, 420 ) 421 }