vitess.io/vitess@v0.16.2/go/mysql/collations/tools/makecolldata/mysqldata.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"fmt"
    21  	"log"
    22  	"path"
    23  	"sort"
    24  	"strconv"
    25  	"strings"
    26  
    27  	"github.com/spf13/pflag"
    28  
    29  	"vitess.io/vitess/go/mysql/collations/internal/charset"
    30  	"vitess.io/vitess/go/mysql/collations/internal/uca"
    31  	"vitess.io/vitess/go/mysql/collations/tools/makecolldata/codegen"
    32  )
    33  
// Print8BitData holds the value of the "full8bit" command-line flag, which
// defaults to false. When it is set, printCollation8bit additionally emits the
// ctype, tolower and toupper tables of every 8-bit collation and references
// them from the generated registration.
var Print8BitData = pflag.Bool("full8bit", false, "")
    35  
// TableGenerator emits the data tables (weight patches, contractions, reorder
// ranges and 8-bit lookup tables) that the collation registrations refer to by
// variable name.
type TableGenerator struct {
	*codegen.Generator
	// dedup maps the "%#v" rendering of an already emitted table to the name
	// of the variable holding it, so identical tables are only written once.
	dedup map[string]string

	// Base weights for each supported UCA version. writeWeightPatches diffs a
	// collation's own weights against the matching base to build its patches.
	baseWeightsUca400 TailoringWeights
	baseWeightsUca520 TailoringWeights
	baseWeightsUca900 TailoringWeights
}
    44  
// Generator emits the collation registration statements. Tables is a separate
// generator that receives the data tables those registrations reference;
// makemysqldata merges the two before writing the output file.
type Generator struct {
	*codegen.Generator
	Tables TableGenerator
}
    49  
    50  func diffMaps(orgWeights, modWeights TailoringWeights) (diff []uca.Patch) {
    51  	if len(modWeights) == 0 {
    52  		return nil
    53  	}
    54  
    55  	diffMap := make(TailoringWeights)
    56  	for key, val := range modWeights {
    57  		if orgVal, ok := orgWeights[key]; !ok || len(orgVal) != len(val) {
    58  			diffMap[key] = val
    59  			continue
    60  		}
    61  
    62  		for i, arr := range val {
    63  			if orgWeights[key][i] != arr {
    64  				diffMap[key] = val
    65  				break
    66  			}
    67  		}
    68  	}
    69  
    70  	for key, val := range diffMap {
    71  		cp, err := strconv.ParseInt(key[2:], 16, 32)
    72  		if err != nil {
    73  			panic(err)
    74  		}
    75  		diff = append(diff, uca.Patch{Codepoint: rune(cp), Patch: val})
    76  	}
    77  
    78  	sort.Slice(diff, func(i, j int) bool {
    79  		return diff[i].Codepoint < diff[j].Codepoint
    80  	})
    81  
    82  	return
    83  }
    84  
    85  func (g *TableGenerator) dedupTable(name, coll string, val any) (string, bool) {
    86  	raw := fmt.Sprintf("%#v", val)
    87  	if exist, ok := g.dedup[raw]; ok {
    88  		return exist, true
    89  	}
    90  
    91  	varname := fmt.Sprintf("%s_%s", name, coll)
    92  	g.dedup[raw] = varname
    93  	return varname, false
    94  }
    95  
    96  func (g *Generator) printCollationUcaLegacy(meta *CollationMetadata) {
    97  	tableWeightPatches := g.Tables.writeWeightPatches(meta)
    98  	tableContractions := g.Tables.writeContractions(meta)
    99  
   100  	g.P("register(&Collation_uca_legacy{")
   101  	g.P("name: ", codegen.Quote(meta.Name), ",")
   102  	g.P("id: ", meta.Number, ",")
   103  	g.P("charset: ", PkgCharset, ".Charset_", meta.Charset, "{},")
   104  	g.P("weights: weightTable_uca", meta.UCAVersion, ",")
   105  	if tableWeightPatches != "" {
   106  		g.P("tailoring: ", tableWeightPatches, ",")
   107  	}
   108  	if tableContractions != "" {
   109  		g.P("contract: ", tableContractions, "{},")
   110  	}
   111  	switch meta.UCAVersion {
   112  	case 400:
   113  		g.P("maxCodepoint: 0xFFFF,")
   114  	case 520:
   115  		g.P("maxCodepoint: 0x10FFFF,")
   116  	default:
   117  		g.Fail("invalid UCAVersion")
   118  	}
   119  	g.P("})")
   120  }
   121  
   122  func (g *TableGenerator) writeWeightPatches(meta *CollationMetadata) string {
   123  	var tableWeightPatches string
   124  	var dedup bool
   125  	var baseWeights TailoringWeights
   126  
   127  	switch meta.UCAVersion {
   128  	case 400:
   129  		baseWeights = g.baseWeightsUca400
   130  	case 520:
   131  		baseWeights = g.baseWeightsUca520
   132  	case 900:
   133  		baseWeights = g.baseWeightsUca900
   134  	default:
   135  		g.Fail("invalid UCAVersion")
   136  	}
   137  
   138  	diff := diffMaps(baseWeights, meta.Weights)
   139  	if len(diff) > 0 {
   140  		tableWeightPatches, dedup = g.dedupTable("weightTailoring", meta.Name, diff)
   141  		if !dedup {
   142  			g.P("var ", tableWeightPatches, " = ", diff)
   143  			g.P()
   144  		}
   145  	}
   146  
   147  	return tableWeightPatches
   148  }
   149  
   150  func (g *TableGenerator) writeContractions(meta *CollationMetadata) string {
   151  	var tableContractions string
   152  	var dedup bool
   153  
   154  	if len(meta.Contractions) > 0 {
   155  		tableContractions, dedup = g.dedupTable("contractor", meta.Name, meta.Contractions)
   156  		if !dedup {
   157  			g.printContractionsFast(tableContractions, meta.Contractions)
   158  			g.P()
   159  		}
   160  	}
   161  	return tableContractions
   162  }
   163  
   164  func (g *TableGenerator) writeReorders(meta *CollationMetadata) string {
   165  	var tableReorder string
   166  	var dedup bool
   167  
   168  	if len(meta.Reorder) > 0 {
   169  		tableReorder, dedup = g.dedupTable("reorder", meta.Name, meta.Reorder)
   170  		if !dedup {
   171  			var reorder []uca.Reorder
   172  			for _, r := range meta.Reorder {
   173  				reorder = append(reorder, uca.Reorder{FromMin: r[0], FromMax: r[1], ToMin: r[2], ToMax: r[3]})
   174  			}
   175  			g.P("var ", tableReorder, " = ", reorder)
   176  			g.P()
   177  		}
   178  	}
   179  	return tableReorder
   180  }
   181  
   182  func (g *Generator) printCollationUca900(meta *CollationMetadata) {
   183  	if meta.UCAVersion != 900 {
   184  		g.Fail("unexpected UCA version for UCA900 collation")
   185  	}
   186  
   187  	tableWeights := "weightTable_uca900"
   188  	switch meta.Name {
   189  	case "utf8mb4_zh_0900_as_cs":
   190  		// the chinese weights table is large enough that we don't apply weight patches
   191  		// to it, we generate it as a whole
   192  		tableWeights = "weightTable_uca900_zh"
   193  		meta.Weights = nil
   194  
   195  		// HACK: Chinese collations are fully reordered on their patched weights.
   196  		// They do not need manual reordering even if they include reorder ranges
   197  		// FIXME: Why does this collation have a reorder range that doesn't apply?
   198  		meta.Reorder = nil
   199  
   200  	case "utf8mb4_ja_0900_as_cs", "utf8mb4_ja_0900_as_cs_ks":
   201  		// the japanese weights table is large enough that we don't apply weight patches
   202  		// to it, we generate it as a whole
   203  		tableWeights = "weightTable_uca900_ja"
   204  		meta.Weights = nil
   205  	}
   206  
   207  	tableWeightPatches := g.Tables.writeWeightPatches(meta)
   208  	tableContractions := g.Tables.writeContractions(meta)
   209  	tableReorder := g.Tables.writeReorders(meta)
   210  
   211  	g.P("register(&Collation_utf8mb4_uca_0900{")
   212  	g.P("name: ", codegen.Quote(meta.Name), ",")
   213  	g.P("id: ", meta.Number, ",")
   214  
   215  	var levels int
   216  	switch {
   217  	case strings.HasSuffix(meta.Name, "_ai_ci"):
   218  		levels = 1
   219  	case strings.HasSuffix(meta.Name, "_as_ci"):
   220  		levels = 2
   221  	case strings.HasSuffix(meta.Name, "_as_cs"):
   222  		levels = 3
   223  	case strings.HasSuffix(meta.Name, "_as_cs_ks"):
   224  		levels = 4
   225  	default:
   226  		g.Fail(fmt.Sprintf("unknown levelsForCompare: %q", meta.Name))
   227  	}
   228  
   229  	g.P("levelsForCompare: ", levels, ",")
   230  	g.P("weights: ", tableWeights, ",")
   231  	if tableWeightPatches != "" {
   232  		g.P("tailoring: ", tableWeightPatches, ",")
   233  	}
   234  	if tableContractions != "" {
   235  		g.P("contract: ", tableContractions, "{},")
   236  	}
   237  	if tableReorder != "" {
   238  		g.P("reorder: ", tableReorder, ",")
   239  	}
   240  	if meta.UpperCaseFirst {
   241  		g.P("upperCaseFirst: true,")
   242  	}
   243  	g.P("})")
   244  }
   245  
   246  func (g *TableGenerator) printSlice(name, coll string, slice any) string {
   247  	tableName, dedup := g.dedupTable(name, coll, slice)
   248  	if !dedup {
   249  		g.P("var ", tableName, " = ", slice)
   250  		g.P()
   251  	}
   252  	return tableName
   253  }
   254  
   255  func (g *TableGenerator) printUnicodeMappings(name, coll string, mappings []charset.UnicodeMapping) string {
   256  	tableName, dedup := g.dedupTable(name, coll, mappings)
   257  	if !dedup {
   258  		g.P("var ", tableName, " = ", mappings)
   259  		g.P()
   260  	}
   261  	return tableName
   262  }
   263  
   264  func (g *Generator) printCollation8bit(meta *CollationMetadata) {
   265  	var tableCtype, tableToLower, tableToUpper, tableSortOrder, tableToUnicode, tableFromUnicode string
   266  
   267  	if *Print8BitData {
   268  		tableCtype = g.Tables.printSlice("ctype", meta.Name, codegen.Array8(meta.CType))
   269  		tableToLower = g.Tables.printSlice("tolower", meta.Name, codegen.Array8(meta.ToLower))
   270  		tableToUpper = g.Tables.printSlice("toupper", meta.Name, codegen.Array8(meta.ToUpper))
   271  	}
   272  	if meta.SortOrder != nil {
   273  		tableSortOrder = g.Tables.printSlice("sortorder", meta.Name, codegen.Array8(meta.SortOrder))
   274  	}
   275  	if meta.Charset != "latin1" {
   276  		if meta.TabToUni != nil {
   277  			tableToUnicode = g.Tables.printSlice("tounicode", meta.Name, codegen.Array16(meta.TabToUni))
   278  		}
   279  		if meta.TabFromUni != nil {
   280  			tableFromUnicode = g.Tables.printUnicodeMappings("fromunicode", meta.Name, meta.TabFromUni)
   281  		}
   282  	}
   283  
   284  	var collation string
   285  	if meta.Flags.Binary {
   286  		collation = "Collation_8bit_bin"
   287  	} else {
   288  		collation = "Collation_8bit_simple_ci"
   289  	}
   290  
   291  	g.P("register(&", collation, "{")
   292  	g.P("id: ", meta.Number, ",")
   293  	g.P("name: ", codegen.Quote(meta.Name), ",")
   294  
   295  	g.P("simpletables: simpletables{")
   296  	if *Print8BitData {
   297  		g.P("ctype: &", tableCtype, ",")
   298  		g.P("tolower: &", tableToLower, ",")
   299  		g.P("toupper: &", tableToUpper, ",")
   300  	}
   301  	if tableSortOrder != "" {
   302  		g.P("sort: &", tableSortOrder, ",")
   303  	}
   304  	g.P("},")
   305  
   306  	// Optimized implementation for latin1
   307  	if meta.Charset == "latin1" {
   308  		g.P("charset: ", PkgCharset, ".Charset_latin1{},")
   309  	} else {
   310  		g.P("charset: &", PkgCharset, ".Charset_8bit{")
   311  		g.P("Name_: ", codegen.Quote(meta.Charset), ",")
   312  		if tableToUnicode != "" {
   313  			g.P("ToUnicode: &", tableToUnicode, ",")
   314  		}
   315  		if tableFromUnicode != "" {
   316  			g.P("FromUnicode: ", tableFromUnicode, ",")
   317  		}
   318  		g.P("},")
   319  	}
   320  	g.P("})")
   321  }
   322  
   323  func (g *Generator) printCollationUnicode(meta *CollationMetadata) {
   324  	var collation string
   325  	if meta.Flags.Binary {
   326  		collation = "Collation_unicode_bin"
   327  	} else {
   328  		collation = "Collation_unicode_general_ci"
   329  	}
   330  	g.P("register(&", collation, "{")
   331  	g.P("id: ", meta.Number, ",")
   332  	g.P("name: ", strconv.Quote(meta.Name), ",")
   333  	if !meta.Flags.Binary {
   334  		g.P("unicase: unicaseInfo_default,")
   335  	}
   336  	g.P("charset: ", PkgCharset, ".Charset_", meta.Charset, "{},")
   337  	g.P("})")
   338  }
   339  
   340  func (g *Generator) printCollationMultibyte(meta *CollationMetadata) {
   341  	var tableSortOrder string
   342  	if meta.SortOrder != nil {
   343  		tableSortOrder = g.Tables.printSlice("sortorder", meta.Name, codegen.Array8(meta.SortOrder))
   344  	}
   345  
   346  	g.P("register(&Collation_multibyte{")
   347  	g.P("id: ", meta.Number, ",")
   348  	g.P("name: ", codegen.Quote(meta.Name), ",")
   349  	if tableSortOrder != "" {
   350  		g.P("sort: &", tableSortOrder, ",")
   351  	}
   352  	g.P("charset: ", PkgCharset, ".Charset_", meta.Charset, "{},")
   353  	g.P("})")
   354  }
   355  
   356  func makemysqldata(output string, metadata AllMetadata) {
   357  	var unsupportedByCharset = make(map[string][]string)
   358  	var g = Generator{
   359  		Generator: codegen.NewGenerator(PkgCollations),
   360  		Tables: TableGenerator{
   361  			Generator:         codegen.NewGenerator(PkgCollations),
   362  			dedup:             make(map[string]string),
   363  			baseWeightsUca400: metadata.get("utf8mb4_unicode_ci").Weights,
   364  			baseWeightsUca520: metadata.get("utf8mb4_unicode_520_ci").Weights,
   365  			baseWeightsUca900: metadata.get("utf8mb4_0900_ai_ci").Weights,
   366  		},
   367  	}
   368  
   369  	g.P("func init() {")
   370  
   371  	for _, meta := range metadata {
   372  		switch {
   373  		case meta.Name == "utf8mb4_0900_bin" || meta.Name == "binary":
   374  			// hardcoded collations; nothing to export here
   375  
   376  		case meta.Name == "tis620_bin":
   377  			// explicitly unsupported for now because of not accurate results
   378  
   379  		case meta.CollationImpl == "any_uca" ||
   380  			meta.CollationImpl == "utf16_uca" ||
   381  			meta.CollationImpl == "utf32_uca" ||
   382  			meta.CollationImpl == "ucs2_uca":
   383  			g.printCollationUcaLegacy(meta)
   384  
   385  		case meta.CollationImpl == "uca_900":
   386  			g.printCollationUca900(meta)
   387  
   388  		case meta.CollationImpl == "8bit_bin" || meta.CollationImpl == "8bit_simple_ci":
   389  			g.printCollation8bit(meta)
   390  
   391  		case meta.Name == "gb18030_unicode_520_ci":
   392  			g.printCollationUcaLegacy(meta)
   393  
   394  		case charset.IsMultibyteByName(meta.Charset):
   395  			g.printCollationMultibyte(meta)
   396  
   397  		case strings.HasSuffix(meta.Name, "_bin") && charset.IsUnicodeByName(meta.Charset):
   398  			g.printCollationUnicode(meta)
   399  
   400  		case strings.HasSuffix(meta.Name, "_general_ci"):
   401  			g.printCollationUnicode(meta)
   402  
   403  		default:
   404  			unsupportedByCharset[meta.Charset] = append(unsupportedByCharset[meta.Charset], meta.Name)
   405  		}
   406  	}
   407  
   408  	g.P("}")
   409  	codegen.Merge(g.Tables.Generator, g.Generator).WriteToFile(path.Join(output, "mysqldata.go"))
   410  
   411  	var unhandledCount int
   412  	for impl, collations := range unsupportedByCharset {
   413  		log.Printf("unhandled implementation %q: %s", impl, strings.Join(collations, ", "))
   414  		unhandledCount += len(collations)
   415  	}
   416  
   417  	log.Printf("mysqldata: %d/%d collations (%.2f%% handled)",
   418  		len(metadata)-unhandledCount, len(metadata),
   419  		float64(len(metadata)-unhandledCount)/float64(len(metadata))*100.0,
   420  	)
   421  }