github.com/dolthub/go-mysql-server@v0.18.0/sql/encodings/generate/main.go (about) 1 // Copyright 2023 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package main 16 17 import ( 18 "encoding/binary" 19 "fmt" 20 "hash/fnv" 21 "os" 22 "sort" 23 "strings" 24 "unsafe" 25 26 "golang.org/x/exp/constraints" 27 ) 28 29 var Header = `// Copyright 2023 Dolthub, Inc. 30 // 31 // Licensed under the Apache License, Version 2.0 (the "License"); 32 // you may not use this file except in compliance with the License. 33 // You may obtain a copy of the License at 34 // 35 // http://www.apache.org/licenses/LICENSE-2.0 36 // 37 // Unless required by applicable law or agreed to in writing, software 38 // distributed under the License is distributed on an "AS IS" BASIS, 39 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 // See the License for the specific language governing permissions and 41 // limitations under the License. 42 43 // THIS FILE IS GENERATED. DO NOT EDIT BY HAND. 44 45 package encodings 46 47 import ( 48 _ "embed" 49 "encoding/binary" 50 "sync" 51 ) 52 53 func loadWeightsMap(m map[rune]int32, bin []byte) { 54 for i := 0; i < len(bin); i += 8 { 55 m[rune(binary.BigEndian.Uint32(bin[i:]))] = int32(binary.BigEndian.Uint32(bin[i+4:])) 56 } 57 } 58 ` 59 60 func main() { 61 // Verify that (sizeof(rune) == sizeof(int32)), just in case a future Go version breaks this assumption 62 if unsafe.Sizeof(rune(0)) != unsafe.Sizeof(int32(0)) { 63 panic("sizeof(rune) != sizeof(int32)") 64 } 65 66 // Hash the contents of all maps 67 for k, v := range WeightMaps { 68 runesInMap := SortedMapKeys(v) 69 hash := fnv.New64a() 70 for _, r := range runesInMap { 71 sortOrder := v[r] 72 _, _ = hash.Write([]byte{byte(r), byte(r >> 8), byte(r >> 16), byte(r >> 24)}) 73 _, _ = hash.Write([]byte{byte(sortOrder), byte(sortOrder >> 8), byte(sortOrder >> 16), byte(sortOrder >> 24)}) 74 } 75 FileContentHashes[k] = hash.Sum64() 76 } 77 78 // Check for duplicate weight maps 79 weightKeys := SortedMapKeys(WeightMaps) 80 allDuplicatedMaps := make(map[string][]string) 81 for i := 0; i < len(weightKeys); i++ { 82 weightKey := weightKeys[i] 83 contentHash := FileContentHashes[weightKey] 84 var duplicateKeyNames []string 85 for j := len(weightKeys) - 1; j > i; j-- { 86 compareWeightKey := weightKeys[j] 87 if contentHash == FileContentHashes[compareWeightKey] { 88 duplicateKeyNames = append(duplicateKeyNames, compareWeightKey) 89 weightKeys = append(weightKeys[:j], weightKeys[j+1:]...) 90 } 91 } 92 sort.Strings(duplicateKeyNames) 93 // Find the common prefix of all names if they exist, else concatenate all names 94 if len(duplicateKeyNames) > 0 { 95 // Grab the duplicated map and delete the first key 96 duplicatedMap := WeightMaps[weightKey] 97 delete(WeightMaps, weightKey) 98 // Find the common prefix and delete the duplicate keys 99 prefix, _ := GetCharacterSet(weightKey) 100 for _, duplicateKeyName := range duplicateKeyNames { 101 delete(WeightMaps, duplicateKeyName) 102 prefix = CommonPrefix(prefix, duplicateKeyName) 103 } 104 // If there is a common prefix then we'll prepend "common_", else concatenate all of the character sets 105 if len(prefix) > 0 { 106 prefix = "common_" + prefix 107 } else { 108 allCharsets := make([]string, 0, len(duplicateKeyNames)) 109 allCharsetsMap := make(map[string]struct{}) 110 firstCharset, _ := GetCharacterSet(weightKey) 111 allCharsets = append(allCharsets, firstCharset) 112 allCharsetsMap[firstCharset] = struct{}{} 113 for _, duplicateKeyName := range duplicateKeyNames { 114 charset, _ := GetCharacterSet(duplicateKeyName) 115 // Some duplicate collations may be in the same character set, so we filter those out too 116 if _, ok := allCharsetsMap[charset]; !ok { 117 allCharsets = append(allCharsets, charset) 118 allCharsetsMap[charset] = struct{}{} 119 } 120 } 121 prefix = "common_" + strings.Join(allCharsets, "_") 122 } 123 // Add the new key to the weight maps 124 _, newKey := GetCharacterSet(weightKey) 125 newKey = prefix + newKey 126 WeightMaps[newKey] = duplicatedMap 127 allDuplicatedMaps[newKey] = append([]string{weightKey}, duplicateKeyNames...) 128 } 129 } 130 weightKeys = SortedMapKeys(WeightMaps) 131 132 // Load the weightmaps file for writing 133 gofile, err := os.OpenFile("../weightmaps.go", os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644) 134 if err != nil { 135 panic(err) 136 } 137 defer gofile.Close() 138 _, err = fmt.Fprintf(gofile, "%s", Header) 139 if err != nil { 140 panic(err) 141 } 142 143 // Write all of the keys and their corresponding weight maps to files 144 for _, k := range weightKeys { 145 v := WeightMaps[k] 146 OutputWeights(k, v) 147 OutputGoForMap(gofile, k) 148 } 149 150 // Display all of the duplicate maps and their new map name 151 duplicates := SortedMapKeys(allDuplicatedMaps) 152 for _, duplicate := range duplicates { 153 fmt.Printf("%s: [%s]\n", duplicate, strings.Join(allDuplicatedMaps[duplicate], ", ")) 154 } 155 } 156 157 func OutputWeights(name string, weights map[rune]int32) { 158 binfile, err := os.OpenFile("../"+name+".bin", os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644) 159 if err != nil { 160 panic(err) 161 } 162 defer binfile.Close() 163 164 keys := SortedMapKeys(weights) 165 for _, k := range keys { 166 v := weights[k] 167 err := binary.Write(binfile, binary.BigEndian, k) 168 if err != nil { 169 panic(err) 170 } 171 err = binary.Write(binfile, binary.BigEndian, v) 172 if err != nil { 173 panic(err) 174 } 175 } 176 } 177 178 func OutputGoForMap(gofile *os.File, name string) { 179 fmt.Fprintln(gofile) 180 fmt.Fprintln(gofile, "//go:embed "+name+".bin") 181 fmt.Fprintln(gofile, "var "+name+"_bin []byte // This is generated using the ./generate package.") 182 fmt.Fprintln(gofile, "var "+name+"_map = make(map[rune]int32)") 183 fmt.Fprintln(gofile, "var "+name+"_once sync.Once") 184 fmt.Fprintln(gofile) 185 fmt.Fprintln(gofile, "func "+name+"() map[rune]int32 {") 186 fmt.Fprintln(gofile, "\t"+name+"_once.Do(func() { loadWeightsMap("+name+"_map, "+name+"_bin) })") 187 fmt.Fprintln(gofile, "\treturn "+name+"_map") 188 fmt.Fprintln(gofile, "}") 189 } 190 191 var WeightMaps = map[string]map[rune]int32{ 192 "utf16_croatian_ci_Weights": utf16_croatian_ci_Weights, 193 "utf16_czech_ci_Weights": utf16_czech_ci_Weights, 194 "utf16_danish_ci_Weights": utf16_danish_ci_Weights, 195 "utf16_esperanto_ci_Weights": utf16_esperanto_ci_Weights, 196 "utf16_estonian_ci_Weights": utf16_estonian_ci_Weights, 197 "utf16_german2_ci_Weights": utf16_german2_ci_Weights, 198 "utf16_hungarian_ci_Weights": utf16_hungarian_ci_Weights, 199 "utf16_icelandic_ci_Weights": utf16_icelandic_ci_Weights, 200 "utf16_latvian_ci_Weights": utf16_latvian_ci_Weights, 201 "utf16_lithuanian_ci_Weights": utf16_lithuanian_ci_Weights, 202 "utf16_persian_ci_Weights": utf16_persian_ci_Weights, 203 "utf16_polish_ci_Weights": utf16_polish_ci_Weights, 204 "utf16_roman_ci_Weights": utf16_roman_ci_Weights, 205 "utf16_romanian_ci_Weights": utf16_romanian_ci_Weights, 206 "utf16_sinhala_ci_Weights": utf16_sinhala_ci_Weights, 207 "utf16_slovak_ci_Weights": utf16_slovak_ci_Weights, 208 "utf16_slovenian_ci_Weights": utf16_slovenian_ci_Weights, 209 "utf16_spanish2_ci_Weights": utf16_spanish2_ci_Weights, 210 "utf16_spanish_ci_Weights": utf16_spanish_ci_Weights, 211 "utf16_swedish_ci_Weights": utf16_swedish_ci_Weights, 212 "utf16_turkish_ci_Weights": utf16_turkish_ci_Weights, 213 "utf16_unicode_520_ci_Weights": utf16_unicode_520_ci_Weights, 214 "utf16_unicode_ci_Weights": utf16_unicode_ci_Weights, 215 "utf16_vietnamese_ci_Weights": utf16_vietnamese_ci_Weights, 216 "utf32_croatian_ci_Weights": utf32_croatian_ci_Weights, 217 "utf32_czech_ci_Weights": utf32_czech_ci_Weights, 218 "utf32_danish_ci_Weights": utf32_danish_ci_Weights, 219 "utf32_esperanto_ci_Weights": utf32_esperanto_ci_Weights, 220 "utf32_estonian_ci_Weights": utf32_estonian_ci_Weights, 221 "utf32_german2_ci_Weights": utf32_german2_ci_Weights, 222 "utf32_hungarian_ci_Weights": utf32_hungarian_ci_Weights, 223 "utf32_icelandic_ci_Weights": utf32_icelandic_ci_Weights, 224 "utf32_latvian_ci_Weights": utf32_latvian_ci_Weights, 225 "utf32_lithuanian_ci_Weights": utf32_lithuanian_ci_Weights, 226 "utf32_persian_ci_Weights": utf32_persian_ci_Weights, 227 "utf32_polish_ci_Weights": utf32_polish_ci_Weights, 228 "utf32_roman_ci_Weights": utf32_roman_ci_Weights, 229 "utf32_romanian_ci_Weights": utf32_romanian_ci_Weights, 230 "utf32_sinhala_ci_Weights": utf32_sinhala_ci_Weights, 231 "utf32_slovak_ci_Weights": utf32_slovak_ci_Weights, 232 "utf32_slovenian_ci_Weights": utf32_slovenian_ci_Weights, 233 "utf32_spanish2_ci_Weights": utf32_spanish2_ci_Weights, 234 "utf32_spanish_ci_Weights": utf32_spanish_ci_Weights, 235 "utf32_swedish_ci_Weights": utf32_swedish_ci_Weights, 236 "utf32_turkish_ci_Weights": utf32_turkish_ci_Weights, 237 "utf32_unicode_520_ci_Weights": utf32_unicode_520_ci_Weights, 238 "utf32_unicode_ci_Weights": utf32_unicode_ci_Weights, 239 "utf32_vietnamese_ci_Weights": utf32_vietnamese_ci_Weights, 240 "utf8mb3_croatian_ci_Weights": utf8mb3_croatian_ci_Weights, 241 "utf8mb3_czech_ci_Weights": utf8mb3_czech_ci_Weights, 242 "utf8mb3_danish_ci_Weights": utf8mb3_danish_ci_Weights, 243 "utf8mb3_esperanto_ci_Weights": utf8mb3_esperanto_ci_Weights, 244 "utf8mb3_estonian_ci_Weights": utf8mb3_estonian_ci_Weights, 245 "utf8mb3_german2_ci_Weights": utf8mb3_german2_ci_Weights, 246 "utf8mb3_hungarian_ci_Weights": utf8mb3_hungarian_ci_Weights, 247 "utf8mb3_icelandic_ci_Weights": utf8mb3_icelandic_ci_Weights, 248 "utf8mb3_latvian_ci_Weights": utf8mb3_latvian_ci_Weights, 249 "utf8mb3_lithuanian_ci_Weights": utf8mb3_lithuanian_ci_Weights, 250 "utf8mb3_persian_ci_Weights": utf8mb3_persian_ci_Weights, 251 "utf8mb3_polish_ci_Weights": utf8mb3_polish_ci_Weights, 252 "utf8mb3_roman_ci_Weights": utf8mb3_roman_ci_Weights, 253 "utf8mb3_romanian_ci_Weights": utf8mb3_romanian_ci_Weights, 254 "utf8mb3_sinhala_ci_Weights": utf8mb3_sinhala_ci_Weights, 255 "utf8mb3_slovak_ci_Weights": utf8mb3_slovak_ci_Weights, 256 "utf8mb3_slovenian_ci_Weights": utf8mb3_slovenian_ci_Weights, 257 "utf8mb3_spanish2_ci_Weights": utf8mb3_spanish2_ci_Weights, 258 "utf8mb3_spanish_ci_Weights": utf8mb3_spanish_ci_Weights, 259 "utf8mb3_swedish_ci_Weights": utf8mb3_swedish_ci_Weights, 260 "utf8mb3_turkish_ci_Weights": utf8mb3_turkish_ci_Weights, 261 "utf8mb3_unicode_520_ci_Weights": utf8mb3_unicode_520_ci_Weights, 262 "utf8mb3_unicode_ci_Weights": utf8mb3_unicode_ci_Weights, 263 "utf8mb3_vietnamese_ci_Weights": utf8mb3_vietnamese_ci_Weights, 264 "utf8mb4_0900_ai_ci_Weights": utf8mb4_0900_ai_ci_Weights, 265 "utf8mb4_0900_as_ci_Weights": utf8mb4_0900_as_ci_Weights, 266 "utf8mb4_0900_as_cs_Weights": utf8mb4_0900_as_cs_Weights, 267 "utf8mb4_croatian_ci_Weights": utf8mb4_croatian_ci_Weights, 268 "utf8mb4_cs_0900_ai_ci_Weights": utf8mb4_cs_0900_ai_ci_Weights, 269 "utf8mb4_cs_0900_as_cs_Weights": utf8mb4_cs_0900_as_cs_Weights, 270 "utf8mb4_czech_ci_Weights": utf8mb4_czech_ci_Weights, 271 "utf8mb4_da_0900_ai_ci_Weights": utf8mb4_da_0900_ai_ci_Weights, 272 "utf8mb4_da_0900_as_cs_Weights": utf8mb4_da_0900_as_cs_Weights, 273 "utf8mb4_danish_ci_Weights": utf8mb4_danish_ci_Weights, 274 "utf8mb4_de_pb_0900_ai_ci_Weights": utf8mb4_de_pb_0900_ai_ci_Weights, 275 "utf8mb4_de_pb_0900_as_cs_Weights": utf8mb4_de_pb_0900_as_cs_Weights, 276 "utf8mb4_eo_0900_ai_ci_Weights": utf8mb4_eo_0900_ai_ci_Weights, 277 "utf8mb4_eo_0900_as_cs_Weights": utf8mb4_eo_0900_as_cs_Weights, 278 "utf8mb4_es_0900_ai_ci_Weights": utf8mb4_es_0900_ai_ci_Weights, 279 "utf8mb4_es_0900_as_cs_Weights": utf8mb4_es_0900_as_cs_Weights, 280 "utf8mb4_es_trad_0900_ai_ci_Weights": utf8mb4_es_trad_0900_ai_ci_Weights, 281 "utf8mb4_es_trad_0900_as_cs_Weights": utf8mb4_es_trad_0900_as_cs_Weights, 282 "utf8mb4_esperanto_ci_Weights": utf8mb4_esperanto_ci_Weights, 283 "utf8mb4_estonian_ci_Weights": utf8mb4_estonian_ci_Weights, 284 "utf8mb4_et_0900_ai_ci_Weights": utf8mb4_et_0900_ai_ci_Weights, 285 "utf8mb4_et_0900_as_cs_Weights": utf8mb4_et_0900_as_cs_Weights, 286 "utf8mb4_german2_ci_Weights": utf8mb4_german2_ci_Weights, 287 "utf8mb4_hr_0900_ai_ci_Weights": utf8mb4_hr_0900_ai_ci_Weights, 288 "utf8mb4_hr_0900_as_cs_Weights": utf8mb4_hr_0900_as_cs_Weights, 289 "utf8mb4_hu_0900_ai_ci_Weights": utf8mb4_hu_0900_ai_ci_Weights, 290 "utf8mb4_hu_0900_as_cs_Weights": utf8mb4_hu_0900_as_cs_Weights, 291 "utf8mb4_hungarian_ci_Weights": utf8mb4_hungarian_ci_Weights, 292 "utf8mb4_icelandic_ci_Weights": utf8mb4_icelandic_ci_Weights, 293 "utf8mb4_is_0900_ai_ci_Weights": utf8mb4_is_0900_ai_ci_Weights, 294 "utf8mb4_is_0900_as_cs_Weights": utf8mb4_is_0900_as_cs_Weights, 295 "utf8mb4_ja_0900_as_cs_Weights": utf8mb4_ja_0900_as_cs_Weights, 296 "utf8mb4_ja_0900_as_cs_ks_Weights": utf8mb4_ja_0900_as_cs_ks_Weights, 297 "utf8mb4_la_0900_ai_ci_Weights": utf8mb4_la_0900_ai_ci_Weights, 298 "utf8mb4_la_0900_as_cs_Weights": utf8mb4_la_0900_as_cs_Weights, 299 "utf8mb4_latvian_ci_Weights": utf8mb4_latvian_ci_Weights, 300 "utf8mb4_lithuanian_ci_Weights": utf8mb4_lithuanian_ci_Weights, 301 "utf8mb4_lt_0900_ai_ci_Weights": utf8mb4_lt_0900_ai_ci_Weights, 302 "utf8mb4_lt_0900_as_cs_Weights": utf8mb4_lt_0900_as_cs_Weights, 303 "utf8mb4_lv_0900_ai_ci_Weights": utf8mb4_lv_0900_ai_ci_Weights, 304 "utf8mb4_lv_0900_as_cs_Weights": utf8mb4_lv_0900_as_cs_Weights, 305 "utf8mb4_persian_ci_Weights": utf8mb4_persian_ci_Weights, 306 "utf8mb4_pl_0900_ai_ci_Weights": utf8mb4_pl_0900_ai_ci_Weights, 307 "utf8mb4_pl_0900_as_cs_Weights": utf8mb4_pl_0900_as_cs_Weights, 308 "utf8mb4_polish_ci_Weights": utf8mb4_polish_ci_Weights, 309 "utf8mb4_ro_0900_ai_ci_Weights": utf8mb4_ro_0900_ai_ci_Weights, 310 "utf8mb4_ro_0900_as_cs_Weights": utf8mb4_ro_0900_as_cs_Weights, 311 "utf8mb4_roman_ci_Weights": utf8mb4_roman_ci_Weights, 312 "utf8mb4_romanian_ci_Weights": utf8mb4_romanian_ci_Weights, 313 "utf8mb4_ru_0900_ai_ci_Weights": utf8mb4_ru_0900_ai_ci_Weights, 314 "utf8mb4_ru_0900_as_cs_Weights": utf8mb4_ru_0900_as_cs_Weights, 315 "utf8mb4_sinhala_ci_Weights": utf8mb4_sinhala_ci_Weights, 316 "utf8mb4_sk_0900_ai_ci_Weights": utf8mb4_sk_0900_ai_ci_Weights, 317 "utf8mb4_sk_0900_as_cs_Weights": utf8mb4_sk_0900_as_cs_Weights, 318 "utf8mb4_sl_0900_ai_ci_Weights": utf8mb4_sl_0900_ai_ci_Weights, 319 "utf8mb4_sl_0900_as_cs_Weights": utf8mb4_sl_0900_as_cs_Weights, 320 "utf8mb4_slovak_ci_Weights": utf8mb4_slovak_ci_Weights, 321 "utf8mb4_slovenian_ci_Weights": utf8mb4_slovenian_ci_Weights, 322 "utf8mb4_spanish2_ci_Weights": utf8mb4_spanish2_ci_Weights, 323 "utf8mb4_spanish_ci_Weights": utf8mb4_spanish_ci_Weights, 324 "utf8mb4_sv_0900_ai_ci_Weights": utf8mb4_sv_0900_ai_ci_Weights, 325 "utf8mb4_sv_0900_as_cs_Weights": utf8mb4_sv_0900_as_cs_Weights, 326 "utf8mb4_swedish_ci_Weights": utf8mb4_swedish_ci_Weights, 327 "utf8mb4_tr_0900_ai_ci_Weights": utf8mb4_tr_0900_ai_ci_Weights, 328 "utf8mb4_tr_0900_as_cs_Weights": utf8mb4_tr_0900_as_cs_Weights, 329 "utf8mb4_turkish_ci_Weights": utf8mb4_turkish_ci_Weights, 330 "utf8mb4_unicode_520_ci_Weights": utf8mb4_unicode_520_ci_Weights, 331 "utf8mb4_unicode_ci_Weights": utf8mb4_unicode_ci_Weights, 332 "utf8mb4_vi_0900_ai_ci_Weights": utf8mb4_vi_0900_ai_ci_Weights, 333 "utf8mb4_vi_0900_as_cs_Weights": utf8mb4_vi_0900_as_cs_Weights, 334 "utf8mb4_vietnamese_ci_Weights": utf8mb4_vietnamese_ci_Weights, 335 "utf8mb4_zh_0900_as_cs_Weights": utf8mb4_zh_0900_as_cs_Weights, 336 } 337 338 var FileContentHashes = map[string]uint64{} 339 340 func SortedMapKeys[K constraints.Ordered, V any](m map[K]V) []K { 341 keys := make([]K, 0, len(m)) 342 for key := range m { 343 keys = append(keys, key) 344 } 345 sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] }) 346 return keys 347 } 348 349 func CommonPrefix(str1 string, str2 string) string { 350 minLen := len(str1) 351 if len(str2) < minLen { 352 minLen = len(str2) 353 } 354 i := 0 355 for ; i < minLen; i++ { 356 if str1[i] != str2[i] { 357 break 358 } 359 } 360 return str1[:i] 361 } 362 363 func GetCharacterSet(str string) (charset string, restOfString string) { 364 index := strings.Index(str, "_") 365 return str[:index], str[index:] 366 }