// Copyright 2022-2023 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sql

import (
	"strings"

	"github.com/dolthub/go-mysql-server/sql/encodings"
)

// CharacterSet represents the character set of a string.
//
// NOTE: the field order is significant. The entries of `characterSetArray` are written as positional (unkeyed)
// struct literals, so reordering or inserting fields here silently changes what each entry means.
type CharacterSet struct {
	// ID identifies the character set and doubles as its index into `characterSetArray`.
	ID CharacterSetID
	// Name is the character set's name as used in SQL (e.g. "utf8mb4").
	Name string
	// DefaultCollation is the collation used when none is given for this character set.
	DefaultCollation CollationID
	// BinaryCollation is the binary collation belonging to this character set.
	BinaryCollation CollationID
	// Description is a plain-English description of the character set.
	Description string
	// MaxLength is the maximum size of a single character in this character set.
	MaxLength uint8
	// Encoder converts to and from this character set. It is nil for character sets that have no encoder yet.
	Encoder encodings.Encoder
}

// CharacterSetsIterator iterates over every character set available.
type CharacterSetsIterator struct {
	// idx is the index of the next `characterSetArray` slot to examine.
	idx int
}

// CharacterSetID represents a character set. Unlike collations, this ID is not intended for storage and may change as
// the default collation changes. It is recommended to use the character set's name if persistence is desired.
type CharacterSetID uint16

// The character sets below are ordered alphabetically to make it easier to visually parse them.
// As each ID acts as an index to the `characterSetArray`, they are explicitly defined.
// A character set's ID is defined as the default collation's ID.
46 47 const ( 48 CharacterSet_armscii8 CharacterSetID = 32 49 CharacterSet_ascii CharacterSetID = 11 50 CharacterSet_big5 CharacterSetID = 1 51 CharacterSet_binary CharacterSetID = 63 52 CharacterSet_cp1250 CharacterSetID = 26 53 CharacterSet_cp1251 CharacterSetID = 51 54 CharacterSet_cp1256 CharacterSetID = 57 55 CharacterSet_cp1257 CharacterSetID = 59 56 CharacterSet_cp850 CharacterSetID = 4 57 CharacterSet_cp852 CharacterSetID = 40 58 CharacterSet_cp866 CharacterSetID = 36 59 CharacterSet_cp932 CharacterSetID = 95 60 CharacterSet_dec8 CharacterSetID = 3 61 CharacterSet_eucjpms CharacterSetID = 97 62 CharacterSet_euckr CharacterSetID = 19 63 CharacterSet_gb18030 CharacterSetID = 248 64 CharacterSet_gb2312 CharacterSetID = 24 65 CharacterSet_gbk CharacterSetID = 28 66 CharacterSet_geostd8 CharacterSetID = 92 67 CharacterSet_greek CharacterSetID = 25 68 CharacterSet_hebrew CharacterSetID = 16 69 CharacterSet_hp8 CharacterSetID = 6 70 CharacterSet_keybcs2 CharacterSetID = 37 71 CharacterSet_koi8r CharacterSetID = 7 72 CharacterSet_koi8u CharacterSetID = 22 73 CharacterSet_latin1 CharacterSetID = 8 74 CharacterSet_latin2 CharacterSetID = 9 75 CharacterSet_latin5 CharacterSetID = 30 76 CharacterSet_latin7 CharacterSetID = 41 77 CharacterSet_macce CharacterSetID = 38 78 CharacterSet_macroman CharacterSetID = 39 79 CharacterSet_sjis CharacterSetID = 13 80 CharacterSet_swe7 CharacterSetID = 10 81 CharacterSet_tis620 CharacterSetID = 18 82 CharacterSet_ucs2 CharacterSetID = 35 83 CharacterSet_ujis CharacterSetID = 12 84 CharacterSet_utf16 CharacterSetID = 54 85 CharacterSet_utf16le CharacterSetID = 56 86 CharacterSet_utf32 CharacterSetID = 60 87 CharacterSet_utf8mb3 CharacterSetID = 33 88 CharacterSet_utf8mb4 CharacterSetID = 255 89 90 CharacterSet_utf8 = CharacterSet_utf8mb3 91 92 // CharacterSet_Unspecified is used when a character set has not been specified, either explicitly or implicitly. 
93 // This is usually used as an intermediate character set to be later replaced by an analyzer pass or a plan, 94 // although it is valid to use it directly. When used, behaves identically to the character set belonging to the 95 // default collation, although it will NOT match the aforementioned character set. 96 CharacterSet_Unspecified CharacterSetID = 0 97 ) 98 99 // characterSetArray contains the details of every character set, indexed by their ID. This allows for character sets to 100 // be efficiently passed around (since only an uint16 is needed), while still being able to quickly access all of their 101 // properties (index lookups are significantly faster than map lookups). 102 var characterSetArray = [256]CharacterSet{ 103 /*000*/ {CharacterSet_Unspecified, "", Collation_Unspecified, Collation_Unspecified, "", 0, nil}, 104 /*001*/ {CharacterSet_big5, "big5", Collation_big5_chinese_ci, Collation_big5_bin, "Big5 Traditional Chinese", 2, nil}, 105 /*002*/ {}, 106 /*003*/ {CharacterSet_dec8, "dec8", Collation_dec8_swedish_ci, Collation_dec8_bin, "DEC West European", 1, encodings.Dec8}, 107 /*004*/ {CharacterSet_cp850, "cp850", Collation_cp850_general_ci, Collation_cp850_bin, "DOS West European", 1, nil}, 108 /*005*/ {}, 109 /*006*/ {CharacterSet_hp8, "hp8", Collation_hp8_english_ci, Collation_hp8_bin, "HP West European", 1, nil}, 110 /*007*/ {CharacterSet_koi8r, "koi8r", Collation_koi8r_general_ci, Collation_koi8r_bin, "KOI8-R Relcom Russian", 1, nil}, 111 /*008*/ {CharacterSet_latin1, "latin1", Collation_latin1_swedish_ci, Collation_latin1_bin, "cp1252 West European", 1, encodings.Latin1}, 112 /*009*/ {CharacterSet_latin2, "latin2", Collation_latin2_general_ci, Collation_latin2_bin, "ISO 8859-2 Central European", 1, nil}, 113 /*010*/ {CharacterSet_swe7, "swe7", Collation_swe7_swedish_ci, Collation_swe7_bin, "7bit Swedish", 1, encodings.Swe7}, 114 /*011*/ {CharacterSet_ascii, "ascii", Collation_ascii_general_ci, Collation_ascii_bin, "US ASCII", 1, 
encodings.Ascii}, 115 /*012*/ {CharacterSet_ujis, "ujis", Collation_ujis_japanese_ci, Collation_ujis_bin, "EUC-JP Japanese", 3, nil}, 116 /*013*/ {CharacterSet_sjis, "sjis", Collation_sjis_japanese_ci, Collation_sjis_bin, "Shift-JIS Japanese", 2, nil}, 117 /*014*/ {}, 118 /*015*/ {}, 119 /*016*/ {CharacterSet_hebrew, "hebrew", Collation_hebrew_general_ci, Collation_hebrew_bin, "ISO 8859-8 Hebrew", 1, nil}, 120 /*017*/ {}, 121 /*018*/ {CharacterSet_tis620, "tis620", Collation_tis620_thai_ci, Collation_tis620_bin, "TIS620 Thai", 1, nil}, 122 /*019*/ {CharacterSet_euckr, "euckr", Collation_euckr_korean_ci, Collation_euckr_bin, "EUC-KR Korean", 2, nil}, 123 /*020*/ {}, 124 /*021*/ {}, 125 /*022*/ {CharacterSet_koi8u, "koi8u", Collation_koi8u_general_ci, Collation_koi8u_bin, "KOI8-U Ukrainian", 1, nil}, 126 /*023*/ {}, 127 /*024*/ {CharacterSet_gb2312, "gb2312", Collation_gb2312_chinese_ci, Collation_gb2312_bin, "GB2312 Simplified Chinese", 2, nil}, 128 /*025*/ {CharacterSet_greek, "greek", Collation_greek_general_ci, Collation_greek_bin, "ISO 8859-7 Greek", 1, nil}, 129 /*026*/ {CharacterSet_cp1250, "cp1250", Collation_cp1250_general_ci, Collation_cp1250_bin, "Windows Central European", 1, nil}, 130 /*027*/ {}, 131 /*028*/ {CharacterSet_gbk, "gbk", Collation_gbk_chinese_ci, Collation_gbk_bin, "GBK Simplified Chinese", 2, nil}, 132 /*029*/ {}, 133 /*030*/ {CharacterSet_latin5, "latin5", Collation_latin5_turkish_ci, Collation_latin5_bin, "ISO 8859-9 Turkish", 1, nil}, 134 /*031*/ {}, 135 /*032*/ {CharacterSet_armscii8, "armscii8", Collation_armscii8_general_ci, Collation_armscii8_bin, "ARMSCII-8 Armenian", 1, nil}, 136 /*033*/ {CharacterSet_utf8mb3, "utf8mb3", Collation_utf8mb3_general_ci, Collation_utf8mb3_bin, "UTF-8 Unicode", 3, encodings.Utf8mb3}, 137 /*034*/ {}, 138 /*035*/ {CharacterSet_ucs2, "ucs2", Collation_ucs2_general_ci, Collation_ucs2_bin, "UCS-2 Unicode", 2, nil}, 139 /*036*/ {CharacterSet_cp866, "cp866", Collation_cp866_general_ci, Collation_cp866_bin, 
"DOS Russian", 1, nil}, 140 /*037*/ {CharacterSet_keybcs2, "keybcs2", Collation_keybcs2_general_ci, Collation_keybcs2_bin, "DOS Kamenicky Czech-Slovak", 1, nil}, 141 /*038*/ {CharacterSet_macce, "macce", Collation_macce_general_ci, Collation_macce_bin, "Mac Central European", 1, nil}, 142 /*039*/ {CharacterSet_macroman, "macroman", Collation_macroman_general_ci, Collation_macroman_bin, "Mac West European", 1, nil}, 143 /*040*/ {CharacterSet_cp852, "cp852", Collation_cp852_general_ci, Collation_cp852_bin, "DOS Central European", 1, nil}, 144 /*041*/ {CharacterSet_latin7, "latin7", Collation_latin7_general_ci, Collation_latin7_bin, "ISO 8859-13 Baltic", 1, encodings.Latin7}, 145 /*042*/ {}, 146 /*043*/ {}, 147 /*044*/ {}, 148 /*045*/ {}, 149 /*046*/ {}, 150 /*047*/ {}, 151 /*048*/ {}, 152 /*049*/ {}, 153 /*050*/ {}, 154 /*051*/ {CharacterSet_cp1251, "cp1251", Collation_cp1251_general_ci, Collation_cp1251_bin, "Windows Cyrillic", 1, nil}, 155 /*052*/ {}, 156 /*053*/ {}, 157 /*054*/ {CharacterSet_utf16, "utf16", Collation_utf16_general_ci, Collation_utf16_bin, "UTF-16 Unicode", 4, encodings.Utf16}, 158 /*055*/ {}, 159 /*056*/ {CharacterSet_utf16le, "utf16le", Collation_utf16le_general_ci, Collation_utf16le_bin, "UTF-16LE Unicode", 4, nil}, 160 /*057*/ {CharacterSet_cp1256, "cp1256", Collation_cp1256_general_ci, Collation_cp1256_bin, "Windows Arabic", 1, encodings.Cp1256}, 161 /*058*/ {}, 162 /*059*/ {CharacterSet_cp1257, "cp1257", Collation_cp1257_general_ci, Collation_cp1257_bin, "Windows Baltic", 1, encodings.Cp1257}, 163 /*060*/ {CharacterSet_utf32, "utf32", Collation_utf32_general_ci, Collation_utf32_bin, "UTF-32 Unicode", 4, encodings.Utf32}, 164 /*061*/ {}, 165 /*062*/ {}, 166 /*063*/ {CharacterSet_binary, "binary", Collation_binary, Collation_binary, "Binary pseudo charset", 1, encodings.Binary}, 167 /*064*/ {}, 168 /*065*/ {}, 169 /*066*/ {}, 170 /*067*/ {}, 171 /*068*/ {}, 172 /*069*/ {}, 173 /*070*/ {}, 174 /*071*/ {}, 175 /*072*/ {}, 176 /*073*/ {}, 177 
/*074*/ {}, 178 /*075*/ {}, 179 /*076*/ {}, 180 /*077*/ {}, 181 /*078*/ {}, 182 /*079*/ {}, 183 /*080*/ {}, 184 /*081*/ {}, 185 /*082*/ {}, 186 /*083*/ {}, 187 /*084*/ {}, 188 /*085*/ {}, 189 /*086*/ {}, 190 /*087*/ {}, 191 /*088*/ {}, 192 /*089*/ {}, 193 /*090*/ {}, 194 /*091*/ {}, 195 /*092*/ {CharacterSet_geostd8, "geostd8", Collation_geostd8_general_ci, Collation_geostd8_bin, "GEOSTD8 Georgian", 1, encodings.Geostd8}, 196 /*093*/ {}, 197 /*094*/ {}, 198 /*095*/ {CharacterSet_cp932, "cp932", Collation_cp932_japanese_ci, Collation_cp932_bin, "SJIS for Windows Japanese", 2, nil}, 199 /*096*/ {}, 200 /*097*/ {CharacterSet_eucjpms, "eucjpms", Collation_eucjpms_japanese_ci, Collation_eucjpms_bin, "UJIS for Windows Japanese", 3, nil}, 201 /*098*/ {}, 202 /*099*/ {}, 203 /*100*/ {}, 204 /*101*/ {}, 205 /*102*/ {}, 206 /*103*/ {}, 207 /*104*/ {}, 208 /*105*/ {}, 209 /*106*/ {}, 210 /*107*/ {}, 211 /*108*/ {}, 212 /*109*/ {}, 213 /*110*/ {}, 214 /*111*/ {}, 215 /*112*/ {}, 216 /*113*/ {}, 217 /*114*/ {}, 218 /*115*/ {}, 219 /*116*/ {}, 220 /*117*/ {}, 221 /*118*/ {}, 222 /*119*/ {}, 223 /*120*/ {}, 224 /*121*/ {}, 225 /*122*/ {}, 226 /*123*/ {}, 227 /*124*/ {}, 228 /*125*/ {}, 229 /*126*/ {}, 230 /*127*/ {}, 231 /*128*/ {}, 232 /*129*/ {}, 233 /*130*/ {}, 234 /*131*/ {}, 235 /*132*/ {}, 236 /*133*/ {}, 237 /*134*/ {}, 238 /*135*/ {}, 239 /*136*/ {}, 240 /*137*/ {}, 241 /*138*/ {}, 242 /*139*/ {}, 243 /*140*/ {}, 244 /*141*/ {}, 245 /*142*/ {}, 246 /*143*/ {}, 247 /*144*/ {}, 248 /*145*/ {}, 249 /*146*/ {}, 250 /*147*/ {}, 251 /*148*/ {}, 252 /*149*/ {}, 253 /*150*/ {}, 254 /*151*/ {}, 255 /*152*/ {}, 256 /*153*/ {}, 257 /*154*/ {}, 258 /*155*/ {}, 259 /*156*/ {}, 260 /*157*/ {}, 261 /*158*/ {}, 262 /*159*/ {}, 263 /*160*/ {}, 264 /*161*/ {}, 265 /*162*/ {}, 266 /*163*/ {}, 267 /*164*/ {}, 268 /*165*/ {}, 269 /*166*/ {}, 270 /*167*/ {}, 271 /*168*/ {}, 272 /*169*/ {}, 273 /*170*/ {}, 274 /*171*/ {}, 275 /*172*/ {}, 276 /*173*/ {}, 277 /*174*/ {}, 278 /*175*/ {}, 279 
/*176*/ {}, 280 /*177*/ {}, 281 /*178*/ {}, 282 /*179*/ {}, 283 /*180*/ {}, 284 /*181*/ {}, 285 /*182*/ {}, 286 /*183*/ {}, 287 /*184*/ {}, 288 /*185*/ {}, 289 /*186*/ {}, 290 /*187*/ {}, 291 /*188*/ {}, 292 /*189*/ {}, 293 /*100*/ {}, 294 /*191*/ {}, 295 /*192*/ {}, 296 /*193*/ {}, 297 /*194*/ {}, 298 /*195*/ {}, 299 /*196*/ {}, 300 /*197*/ {}, 301 /*198*/ {}, 302 /*199*/ {}, 303 /*200*/ {}, 304 /*201*/ {}, 305 /*202*/ {}, 306 /*203*/ {}, 307 /*204*/ {}, 308 /*205*/ {}, 309 /*206*/ {}, 310 /*207*/ {}, 311 /*208*/ {}, 312 /*209*/ {}, 313 /*210*/ {}, 314 /*211*/ {}, 315 /*212*/ {}, 316 /*213*/ {}, 317 /*214*/ {}, 318 /*215*/ {}, 319 /*216*/ {}, 320 /*217*/ {}, 321 /*218*/ {}, 322 /*219*/ {}, 323 /*220*/ {}, 324 /*221*/ {}, 325 /*222*/ {}, 326 /*223*/ {}, 327 /*224*/ {}, 328 /*225*/ {}, 329 /*226*/ {}, 330 /*227*/ {}, 331 /*228*/ {}, 332 /*229*/ {}, 333 /*230*/ {}, 334 /*231*/ {}, 335 /*232*/ {}, 336 /*233*/ {}, 337 /*234*/ {}, 338 /*235*/ {}, 339 /*236*/ {}, 340 /*237*/ {}, 341 /*238*/ {}, 342 /*239*/ {}, 343 /*240*/ {}, 344 /*241*/ {}, 345 /*242*/ {}, 346 /*243*/ {}, 347 /*244*/ {}, 348 /*245*/ {}, 349 /*246*/ {}, 350 /*247*/ {}, 351 /*248*/ {CharacterSet_gb18030, "gb18030", Collation_gb18030_chinese_ci, Collation_gb18030_bin, "China National Standard GB18030", 4, nil}, 352 /*249*/ {}, 353 /*250*/ {}, 354 /*251*/ {}, 355 /*252*/ {}, 356 /*253*/ {}, 357 /*254*/ {}, 358 /*255*/ {CharacterSet_utf8mb4, "utf8mb4", Collation_utf8mb4_0900_ai_ci, Collation_utf8mb4_bin, "UTF-8 Unicode", 4, encodings.Utf8mb4}, 359 } 360 361 // init is used to set the unspecified character set's details to match those of the default collation's character set. 
362 func init() { 363 defaultCharacterSet := characterSetArray[Collation_Default.CharacterSet()] 364 characterSetArray[0].Name = defaultCharacterSet.Name 365 characterSetArray[0].Description = defaultCharacterSet.Description 366 characterSetArray[0].MaxLength = defaultCharacterSet.MaxLength 367 characterSetArray[0].Encoder = defaultCharacterSet.Encoder 368 } 369 370 // characterSetStringToID maps a character set's name to its ID. 371 var characterSetStringToID = map[string]CharacterSetID{ 372 "armscii8": CharacterSet_armscii8, 373 "ascii": CharacterSet_ascii, 374 "big5": CharacterSet_big5, 375 "binary": CharacterSet_binary, 376 "cp1250": CharacterSet_cp1250, 377 "cp1251": CharacterSet_cp1251, 378 "cp1256": CharacterSet_cp1256, 379 "cp1257": CharacterSet_cp1257, 380 "cp850": CharacterSet_cp850, 381 "cp852": CharacterSet_cp852, 382 "cp866": CharacterSet_cp866, 383 "cp932": CharacterSet_cp932, 384 "dec8": CharacterSet_dec8, 385 "eucjpms": CharacterSet_eucjpms, 386 "euckr": CharacterSet_euckr, 387 "gb18030": CharacterSet_gb18030, 388 "gb2312": CharacterSet_gb2312, 389 "gbk": CharacterSet_gbk, 390 "geostd8": CharacterSet_geostd8, 391 "greek": CharacterSet_greek, 392 "hebrew": CharacterSet_hebrew, 393 "hp8": CharacterSet_hp8, 394 "keybcs2": CharacterSet_keybcs2, 395 "koi8r": CharacterSet_koi8r, 396 "koi8u": CharacterSet_koi8u, 397 "latin1": CharacterSet_latin1, 398 "latin2": CharacterSet_latin2, 399 "latin5": CharacterSet_latin5, 400 "latin7": CharacterSet_latin7, 401 "macce": CharacterSet_macce, 402 "macroman": CharacterSet_macroman, 403 "sjis": CharacterSet_sjis, 404 "swe7": CharacterSet_swe7, 405 "tis620": CharacterSet_tis620, 406 "ucs2": CharacterSet_ucs2, 407 "ujis": CharacterSet_ujis, 408 "utf16": CharacterSet_utf16, 409 "utf16le": CharacterSet_utf16le, 410 "utf32": CharacterSet_utf32, 411 "utf8": CharacterSet_utf8mb3, 412 "utf8mb3": CharacterSet_utf8mb3, 413 "utf8mb4": CharacterSet_utf8mb4, 414 } 415 416 // SupportedCharsets contains all non-binary character sets 
that are currently supported. 417 var SupportedCharsets = []CharacterSetID{ 418 CharacterSet_utf8mb4, 419 } 420 421 // ParseCharacterSet takes in a string representing a CharacterSet and returns the result if a match is found, or an 422 // error if not. 423 func ParseCharacterSet(str string) (CharacterSetID, error) { 424 if cs, ok := characterSetStringToID[strings.ToLower(str)]; ok { 425 return cs, nil 426 } 427 // It is valid recognize an empty string as the invalid charset, as some analyzer steps may temporarily use the 428 // invalid charset 429 if str == "" { 430 return CharacterSet_Unspecified, nil 431 } 432 return CharacterSet_Unspecified, ErrCharSetUnknown.New(str) 433 } 434 435 // Name returns the name of this CharacterSet. 436 func (cs CharacterSetID) Name() string { 437 return characterSetArray[cs].Name 438 } 439 440 // DefaultCollation returns the default CollationID for this CharacterSet. 441 func (cs CharacterSetID) DefaultCollation() CollationID { 442 return characterSetArray[cs].DefaultCollation 443 } 444 445 // BinaryCollation returns the binary CollationID for this CharacterSet. 446 func (cs CharacterSetID) BinaryCollation() CollationID { 447 return characterSetArray[cs].BinaryCollation 448 } 449 450 // Description returns the plain-English description of the CharacterSet. 451 func (cs CharacterSetID) Description() string { 452 return characterSetArray[cs].Description 453 } 454 455 // MaxLength returns the maximum size of a single character in the CharacterSet. 456 func (cs CharacterSetID) MaxLength() int64 { 457 return int64(characterSetArray[cs].MaxLength) 458 } 459 460 // String returns the string representation of the CharacterSet. 461 func (cs CharacterSetID) String() string { 462 return characterSetArray[cs].Name 463 } 464 465 // Encoder returns this CharacterSet's encoder. As character sets are a work-in-progress, it is 466 // recommended to check if it is nil before allowing the character set to be set within a table. 
467 func (cs CharacterSetID) Encoder() encodings.Encoder { 468 return characterSetArray[cs].Encoder 469 } 470 471 // NewCharacterSetsIterator returns a new CharacterSetsIterator. 472 func NewCharacterSetsIterator() *CharacterSetsIterator { 473 return &CharacterSetsIterator{0} 474 } 475 476 // Next returns the next character set. If all character sets have been iterated over, returns false. 477 func (csi *CharacterSetsIterator) Next() (CharacterSet, bool) { 478 for ; csi.idx < len(characterSetArray); csi.idx++ { 479 if characterSetArray[csi.idx].ID == 0 { 480 continue 481 } 482 csi.idx++ 483 return characterSetArray[csi.idx-1], true 484 } 485 return CharacterSet{}, false 486 }