github.com/bingoohuang/gg@v0.0.0-20240325092523-45da7dee9335/pkg/sqlparse/tidbparser/dependency/util/charset/charset.go (about)

     1  // Copyright 2015 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package charset
    15  
    16  import (
    17  	"strings"
    18  
    19  	"github.com/bingoohuang/gg/pkg/sqlparse/tidbparser/dependency/mysql"
    20  	"github.com/juju/errors"
    21  )
    22  
// Charset is a charset.
// Now we only support MySQL.
type Charset struct {
	// Name is the charset name; init uses it as the key in the charsets map.
	Name             string
	// DefaultCollation is the collation name reported as this charset's
	// default by GetDefaultCollation, GetCharsetInfo and GetCharsetDesc.
	DefaultCollation string
	// Collations holds the collations of this charset keyed by collation
	// name; init fills it from the collations table.
	Collations       map[string]*Collation
	// Desc is a human-readable description of the charset.
	Desc             string
	// Maxlen is presumably the maximum number of bytes per character
	// (the table uses 3 for utf8 and 4 for utf8mb4) — TODO confirm.
	Maxlen           int
}
    32  
// Collation is a collation.
// Now we only support MySQL.
type Collation struct {
	// ID is the numeric collation ID; GetCharsetInfoByID matches against it.
	ID          int
	// CharsetName is the name of the charset this collation belongs to.
	CharsetName string
	// Name is the collation name.
	Name        string
	// IsDefault is not read by the code visible in this file.
	// NOTE(review): presumably mirrors MySQL's "default collation of the
	// charset" flag — confirm; it is independent of Charset.DefaultCollation.
	IsDefault   bool
}
    41  
    42  var charsets = make(map[string]*Charset)
    43  
    44  // All the supported charsets should be in the following table.
    45  var charsetInfos = []*Charset{
    46  	{CharsetUTF8, CollationUTF8, make(map[string]*Collation), "UTF-8 Unicode", 3},
    47  	{CharsetUTF8MB4, CollationUTF8MB4, make(map[string]*Collation), "UTF-8 Unicode", 4},
    48  	{CharsetASCII, CollationASCII, make(map[string]*Collation), "US ASCII", 1},
    49  	{CharsetLatin1, CollationLatin1, make(map[string]*Collation), "Latin1", 1},
    50  	{CharsetBin, CollationBin, make(map[string]*Collation), "binary", 1},
    51  }
    52  
    53  func init() {
    54  	for _, c := range charsetInfos {
    55  		charsets[c.Name] = c
    56  	}
    57  	for _, c := range collations {
    58  		charset, ok := charsets[c.CharsetName]
    59  		if !ok {
    60  			continue
    61  		}
    62  		charset.Collations[c.Name] = c
    63  	}
    64  }
    65  
// Desc is a charset description.
// It carries the same Name, Desc, DefaultCollation and Maxlen values as the
// Charset it was built from, without the collation map.
type Desc struct {
	// Name is the charset name.
	Name             string
	// Desc is the human-readable description of the charset.
	Desc             string
	// DefaultCollation is the name of the charset's default collation.
	DefaultCollation string
	// Maxlen is copied from Charset.Maxlen.
	Maxlen           int
}
    73  
    74  // GetAllCharsets gets all charset descriptions in the local charsets.
    75  func GetAllCharsets() []*Desc {
    76  	descs := make([]*Desc, 0, len(charsets))
    77  	// The charsetInfos is an array, so the iterate order will be stable.
    78  	for _, ci := range charsetInfos {
    79  		c, ok := charsets[ci.Name]
    80  		if !ok {
    81  			continue
    82  		}
    83  		desc := &Desc{
    84  			Name:             c.Name,
    85  			DefaultCollation: c.DefaultCollation,
    86  			Desc:             c.Desc,
    87  			Maxlen:           c.Maxlen,
    88  		}
    89  		descs = append(descs, desc)
    90  	}
    91  	return descs
    92  }
    93  
    94  // ValidCharsetAndCollation checks the charset and the collation validity
    95  // and returns a boolean.
    96  func ValidCharsetAndCollation(cs string, co string) bool {
    97  	// We will use utf8 as a default charset.
    98  	if cs == "" {
    99  		cs = "utf8"
   100  	}
   101  
   102  	c, ok := charsets[cs]
   103  	if !ok {
   104  		return false
   105  	}
   106  
   107  	if co == "" {
   108  		return true
   109  	}
   110  	_, ok = c.Collations[co]
   111  	if !ok {
   112  		return false
   113  	}
   114  
   115  	return true
   116  }
   117  
   118  // GetDefaultCollation returns the default collation for charset.
   119  func GetDefaultCollation(charset string) (string, error) {
   120  	charset = strings.ToLower(charset)
   121  	if charset == CharsetBin {
   122  		return CollationBin, nil
   123  	}
   124  	c, ok := charsets[charset]
   125  	if !ok {
   126  		return "", errors.Errorf("Unknown charset %s", charset)
   127  	}
   128  	return c.DefaultCollation, nil
   129  }
   130  
   131  // GetCharsetInfo returns charset and collation for cs as name.
   132  func GetCharsetInfo(cs string) (string, string, error) {
   133  	c, ok := charsets[strings.ToLower(cs)]
   134  	if !ok {
   135  		return "", "", errors.Errorf("Unknown charset %s", cs)
   136  	}
   137  	return c.Name, c.DefaultCollation, nil
   138  }
   139  
   140  // GetCharsetDesc gets charset descriptions in the local charsets.
   141  func GetCharsetDesc(cs string) (*Desc, error) {
   142  	c, ok := charsets[strings.ToLower(cs)]
   143  	if !ok {
   144  		return nil, errors.Errorf("Unknown charset %s", cs)
   145  	}
   146  	desc := &Desc{
   147  		Name:             c.Name,
   148  		DefaultCollation: c.DefaultCollation,
   149  		Desc:             c.Desc,
   150  		Maxlen:           c.Maxlen,
   151  	}
   152  	return desc, nil
   153  }
   154  
   155  // GetCharsetInfoByID returns charset and collation for id as cs_number.
   156  func GetCharsetInfoByID(coID int) (string, string, error) {
   157  	if coID == mysql.DefaultCollationID {
   158  		return mysql.DefaultCharset, mysql.DefaultCollationName, nil
   159  	}
   160  	for _, collation := range collations {
   161  		if coID == collation.ID {
   162  			return collation.CharsetName, collation.Name, nil
   163  		}
   164  	}
   165  	return "", "", errors.Errorf("Unknown charset id %d", coID)
   166  }
   167  
// GetCollations returns a list for all collations.
// The returned slice is the package-level collations table itself, not a
// copy, so callers should treat it as read-only.
func GetCollations() []*Collation {
	return collations
}
   172  
// Names of the supported charsets and of the default collation of each.
const (
	// CharsetBin is used for marking binary charset.
	CharsetBin = "binary"
	// CollationBin is the default collation for CharsetBin.
	CollationBin = "binary"
	// CharsetUTF8 is the default charset for string types.
	CharsetUTF8 = "utf8"
	// CollationUTF8 is the default collation for CharsetUTF8.
	CollationUTF8 = "utf8_bin"
	// CharsetUTF8MB4 represents 4 bytes utf8, which works the same way as utf8 in Go.
	CharsetUTF8MB4 = "utf8mb4"
	// CollationUTF8MB4 is the default collation for CharsetUTF8MB4.
	CollationUTF8MB4 = "utf8mb4_bin"
	// CharsetASCII is a subset of UTF8.
	CharsetASCII = "ascii"
	// CollationASCII is the default collation for CharsetASCII.
	CollationASCII = "ascii_bin"
	// CharsetLatin1 is a single byte charset.
	CharsetLatin1 = "latin1"
	// CollationLatin1 is the default collation for CharsetLatin1.
	CollationLatin1 = "latin1_bin"
)
   195  
// collations is the table of known collations. Each entry is a positional
// Collation literal: {ID, CharsetName, Name, IsDefault}.
//
// init attaches an entry to its charset only when that charset is listed in
// charsetInfos; entries for other charsets stay reachable through
// GetCollations and GetCharsetInfoByID.
var collations = []*Collation{
	{1, "big5", "big5_chinese_ci", true},
	{2, "latin2", "latin2_czech_cs", false},
	{3, "dec8", "dec8_swedish_ci", true},
	{4, "cp850", "cp850_general_ci", true},
	{5, "latin1", "latin1_german1_ci", false},
	{6, "hp8", "hp8_english_ci", true},
	{7, "koi8r", "koi8r_general_ci", true},
	{8, "latin1", "latin1_swedish_ci", true},
	{9, "latin2", "latin2_general_ci", true},
	{10, "swe7", "swe7_swedish_ci", true},
	{11, "ascii", "ascii_general_ci", true},
	{12, "ujis", "ujis_japanese_ci", true},
	{13, "sjis", "sjis_japanese_ci", true},
	{14, "cp1251", "cp1251_bulgarian_ci", false},
	{15, "latin1", "latin1_danish_ci", false},
	{16, "hebrew", "hebrew_general_ci", true},
	{18, "tis620", "tis620_thai_ci", true},
	{19, "euckr", "euckr_korean_ci", true},
	{20, "latin7", "latin7_estonian_cs", false},
	{21, "latin2", "latin2_hungarian_ci", false},
	{22, "koi8u", "koi8u_general_ci", true},
	{23, "cp1251", "cp1251_ukrainian_ci", false},
	{24, "gb2312", "gb2312_chinese_ci", true},
	{25, "greek", "greek_general_ci", true},
	{26, "cp1250", "cp1250_general_ci", true},
	{27, "latin2", "latin2_croatian_ci", false},
	{28, "gbk", "gbk_chinese_ci", true},
	{29, "cp1257", "cp1257_lithuanian_ci", false},
	{30, "latin5", "latin5_turkish_ci", true},
	{31, "latin1", "latin1_german2_ci", false},
	{32, "armscii8", "armscii8_general_ci", true},
	{33, "utf8", "utf8_general_ci", true},
	{34, "cp1250", "cp1250_czech_cs", false},
	{35, "ucs2", "ucs2_general_ci", true},
	{36, "cp866", "cp866_general_ci", true},
	{37, "keybcs2", "keybcs2_general_ci", true},
	{38, "macce", "macce_general_ci", true},
	{39, "macroman", "macroman_general_ci", true},
	{40, "cp852", "cp852_general_ci", true},
	{41, "latin7", "latin7_general_ci", true},
	{42, "latin7", "latin7_general_cs", false},
	{43, "macce", "macce_bin", false},
	{44, "cp1250", "cp1250_croatian_ci", false},
	{45, "utf8mb4", "utf8mb4_general_ci", true},
	{46, "utf8mb4", "utf8mb4_bin", false},
	{47, "latin1", "latin1_bin", false},
	{48, "latin1", "latin1_general_ci", false},
	{49, "latin1", "latin1_general_cs", false},
	{50, "cp1251", "cp1251_bin", false},
	{51, "cp1251", "cp1251_general_ci", true},
	{52, "cp1251", "cp1251_general_cs", false},
	{53, "macroman", "macroman_bin", false},
	{54, "utf16", "utf16_general_ci", true},
	{55, "utf16", "utf16_bin", false},
	{56, "utf16le", "utf16le_general_ci", true},
	{57, "cp1256", "cp1256_general_ci", true},
	{58, "cp1257", "cp1257_bin", false},
	{59, "cp1257", "cp1257_general_ci", true},
	{60, "utf32", "utf32_general_ci", true},
	{61, "utf32", "utf32_bin", false},
	{62, "utf16le", "utf16le_bin", false},
	{63, "binary", "binary", true},
	{64, "armscii8", "armscii8_bin", false},
	{65, "ascii", "ascii_bin", false},
	{66, "cp1250", "cp1250_bin", false},
	{67, "cp1256", "cp1256_bin", false},
	{68, "cp866", "cp866_bin", false},
	{69, "dec8", "dec8_bin", false},
	{70, "greek", "greek_bin", false},
	{71, "hebrew", "hebrew_bin", false},
	{72, "hp8", "hp8_bin", false},
	{73, "keybcs2", "keybcs2_bin", false},
	{74, "koi8r", "koi8r_bin", false},
	{75, "koi8u", "koi8u_bin", false},
	{77, "latin2", "latin2_bin", false},
	{78, "latin5", "latin5_bin", false},
	{79, "latin7", "latin7_bin", false},
	{80, "cp850", "cp850_bin", false},
	{81, "cp852", "cp852_bin", false},
	{82, "swe7", "swe7_bin", false},
	{83, "utf8", "utf8_bin", false},
	{84, "big5", "big5_bin", false},
	{85, "euckr", "euckr_bin", false},
	{86, "gb2312", "gb2312_bin", false},
	{87, "gbk", "gbk_bin", false},
	{88, "sjis", "sjis_bin", false},
	{89, "tis620", "tis620_bin", false},
	{90, "ucs2", "ucs2_bin", false},
	{91, "ujis", "ujis_bin", false},
	{92, "geostd8", "geostd8_general_ci", true},
	{93, "geostd8", "geostd8_bin", false},
	{94, "latin1", "latin1_spanish_ci", false},
	{95, "cp932", "cp932_japanese_ci", true},
	{96, "cp932", "cp932_bin", false},
	{97, "eucjpms", "eucjpms_japanese_ci", true},
	{98, "eucjpms", "eucjpms_bin", false},
	{99, "cp1250", "cp1250_polish_ci", false},
	{101, "utf16", "utf16_unicode_ci", false},
	{102, "utf16", "utf16_icelandic_ci", false},
	{103, "utf16", "utf16_latvian_ci", false},
	{104, "utf16", "utf16_romanian_ci", false},
	{105, "utf16", "utf16_slovenian_ci", false},
	{106, "utf16", "utf16_polish_ci", false},
	{107, "utf16", "utf16_estonian_ci", false},
	{108, "utf16", "utf16_spanish_ci", false},
	{109, "utf16", "utf16_swedish_ci", false},
	{110, "utf16", "utf16_turkish_ci", false},
	{111, "utf16", "utf16_czech_ci", false},
	{112, "utf16", "utf16_danish_ci", false},
	{113, "utf16", "utf16_lithuanian_ci", false},
	{114, "utf16", "utf16_slovak_ci", false},
	{115, "utf16", "utf16_spanish2_ci", false},
	{116, "utf16", "utf16_roman_ci", false},
	{117, "utf16", "utf16_persian_ci", false},
	{118, "utf16", "utf16_esperanto_ci", false},
	{119, "utf16", "utf16_hungarian_ci", false},
	{120, "utf16", "utf16_sinhala_ci", false},
	{121, "utf16", "utf16_german2_ci", false},
	{122, "utf16", "utf16_croatian_ci", false},
	{123, "utf16", "utf16_unicode_520_ci", false},
	{124, "utf16", "utf16_vietnamese_ci", false},
	{128, "ucs2", "ucs2_unicode_ci", false},
	{129, "ucs2", "ucs2_icelandic_ci", false},
	{130, "ucs2", "ucs2_latvian_ci", false},
	{131, "ucs2", "ucs2_romanian_ci", false},
	{132, "ucs2", "ucs2_slovenian_ci", false},
	{133, "ucs2", "ucs2_polish_ci", false},
	{134, "ucs2", "ucs2_estonian_ci", false},
	{135, "ucs2", "ucs2_spanish_ci", false},
	{136, "ucs2", "ucs2_swedish_ci", false},
	{137, "ucs2", "ucs2_turkish_ci", false},
	{138, "ucs2", "ucs2_czech_ci", false},
	{139, "ucs2", "ucs2_danish_ci", false},
	{140, "ucs2", "ucs2_lithuanian_ci", false},
	{141, "ucs2", "ucs2_slovak_ci", false},
	{142, "ucs2", "ucs2_spanish2_ci", false},
	{143, "ucs2", "ucs2_roman_ci", false},
	{144, "ucs2", "ucs2_persian_ci", false},
	{145, "ucs2", "ucs2_esperanto_ci", false},
	{146, "ucs2", "ucs2_hungarian_ci", false},
	{147, "ucs2", "ucs2_sinhala_ci", false},
	{148, "ucs2", "ucs2_german2_ci", false},
	{149, "ucs2", "ucs2_croatian_ci", false},
	{150, "ucs2", "ucs2_unicode_520_ci", false},
	{151, "ucs2", "ucs2_vietnamese_ci", false},
	{159, "ucs2", "ucs2_general_mysql500_ci", false},
	{160, "utf32", "utf32_unicode_ci", false},
	{161, "utf32", "utf32_icelandic_ci", false},
	{162, "utf32", "utf32_latvian_ci", false},
	{163, "utf32", "utf32_romanian_ci", false},
	{164, "utf32", "utf32_slovenian_ci", false},
	{165, "utf32", "utf32_polish_ci", false},
	{166, "utf32", "utf32_estonian_ci", false},
	{167, "utf32", "utf32_spanish_ci", false},
	{168, "utf32", "utf32_swedish_ci", false},
	{169, "utf32", "utf32_turkish_ci", false},
	{170, "utf32", "utf32_czech_ci", false},
	{171, "utf32", "utf32_danish_ci", false},
	{172, "utf32", "utf32_lithuanian_ci", false},
	{173, "utf32", "utf32_slovak_ci", false},
	{174, "utf32", "utf32_spanish2_ci", false},
	{175, "utf32", "utf32_roman_ci", false},
	{176, "utf32", "utf32_persian_ci", false},
	{177, "utf32", "utf32_esperanto_ci", false},
	{178, "utf32", "utf32_hungarian_ci", false},
	{179, "utf32", "utf32_sinhala_ci", false},
	{180, "utf32", "utf32_german2_ci", false},
	{181, "utf32", "utf32_croatian_ci", false},
	{182, "utf32", "utf32_unicode_520_ci", false},
	{183, "utf32", "utf32_vietnamese_ci", false},
	{192, "utf8", "utf8_unicode_ci", false},
	{193, "utf8", "utf8_icelandic_ci", false},
	{194, "utf8", "utf8_latvian_ci", false},
	{195, "utf8", "utf8_romanian_ci", false},
	{196, "utf8", "utf8_slovenian_ci", false},
	{197, "utf8", "utf8_polish_ci", false},
	{198, "utf8", "utf8_estonian_ci", false},
	{199, "utf8", "utf8_spanish_ci", false},
	{200, "utf8", "utf8_swedish_ci", false},
	{201, "utf8", "utf8_turkish_ci", false},
	{202, "utf8", "utf8_czech_ci", false},
	{203, "utf8", "utf8_danish_ci", false},
	{204, "utf8", "utf8_lithuanian_ci", false},
	{205, "utf8", "utf8_slovak_ci", false},
	{206, "utf8", "utf8_spanish2_ci", false},
	{207, "utf8", "utf8_roman_ci", false},
	{208, "utf8", "utf8_persian_ci", false},
	{209, "utf8", "utf8_esperanto_ci", false},
	{210, "utf8", "utf8_hungarian_ci", false},
	{211, "utf8", "utf8_sinhala_ci", false},
	{212, "utf8", "utf8_german2_ci", false},
	{213, "utf8", "utf8_croatian_ci", false},
	{214, "utf8", "utf8_unicode_520_ci", false},
	{215, "utf8", "utf8_vietnamese_ci", false},
	{223, "utf8", "utf8_general_mysql500_ci", false},
	{224, "utf8mb4", "utf8mb4_unicode_ci", false},
	{225, "utf8mb4", "utf8mb4_icelandic_ci", false},
	{226, "utf8mb4", "utf8mb4_latvian_ci", false},
	{227, "utf8mb4", "utf8mb4_romanian_ci", false},
	{228, "utf8mb4", "utf8mb4_slovenian_ci", false},
	{229, "utf8mb4", "utf8mb4_polish_ci", false},
	{230, "utf8mb4", "utf8mb4_estonian_ci", false},
	{231, "utf8mb4", "utf8mb4_spanish_ci", false},
	{232, "utf8mb4", "utf8mb4_swedish_ci", false},
	{233, "utf8mb4", "utf8mb4_turkish_ci", false},
	{234, "utf8mb4", "utf8mb4_czech_ci", false},
	{235, "utf8mb4", "utf8mb4_danish_ci", false},
	{236, "utf8mb4", "utf8mb4_lithuanian_ci", false},
	{237, "utf8mb4", "utf8mb4_slovak_ci", false},
	{238, "utf8mb4", "utf8mb4_spanish2_ci", false},
	{239, "utf8mb4", "utf8mb4_roman_ci", false},
	{240, "utf8mb4", "utf8mb4_persian_ci", false},
	{241, "utf8mb4", "utf8mb4_esperanto_ci", false},
	{242, "utf8mb4", "utf8mb4_hungarian_ci", false},
	{243, "utf8mb4", "utf8mb4_sinhala_ci", false},
	{244, "utf8mb4", "utf8mb4_german2_ci", false},
	{245, "utf8mb4", "utf8mb4_croatian_ci", false},
	{246, "utf8mb4", "utf8mb4_unicode_520_ci", false},
	{247, "utf8mb4", "utf8mb4_vietnamese_ci", false},
}