github.com/pingcap/tidb/parser@v0.0.0-20231013125129-93a834a6bf8d/charset/encoding_test.go (about) 1 // Copyright 2021 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package charset_test 15 16 import ( 17 "fmt" 18 "testing" 19 "unicode/utf8" 20 21 "github.com/pingcap/tidb/parser/charset" 22 "github.com/stretchr/testify/require" 23 "golang.org/x/text/transform" 24 ) 25 26 func TestEncoding(t *testing.T) { 27 enc := charset.FindEncoding(charset.CharsetGBK) 28 require.Equal(t, charset.CharsetGBK, enc.Name()) 29 30 txt := []byte("一二三四") 31 e, _ := charset.Lookup("gbk") 32 gbkEncodedTxt, _, err := transform.Bytes(e.NewEncoder(), txt) 33 require.NoError(t, err) 34 result, err := enc.Transform(nil, gbkEncodedTxt, charset.OpDecode) 35 require.NoError(t, err) 36 require.Equal(t, txt, result) 37 38 gbkEncodedTxt2, err := enc.Transform(nil, txt, charset.OpEncode) 39 require.NoError(t, err) 40 require.Equal(t, gbkEncodedTxt2, gbkEncodedTxt) 41 result, err = enc.Transform(nil, gbkEncodedTxt2, charset.OpDecode) 42 require.NoError(t, err) 43 require.Equal(t, txt, result) 44 45 GBKCases := []struct { 46 utf8Str string 47 result string 48 isValid bool 49 }{ 50 {"一二三", "涓?簩涓?", false}, // MySQL reports '涓?簩涓'. 51 {"一二三123", "涓?簩涓?23", false}, 52 {"测试", "娴嬭瘯", true}, 53 {"案1案2", "妗?妗?", false}, 54 {"焊䏷菡釬", "鐒婁彿鑿¢嚞", true}, 55 {"鞍杏以伊位依", "闉嶆潖浠ヤ紛浣嶄緷", true}, 56 {"移維緯胃萎衣謂違", "绉荤董绶?儍钀庤。璎傞仌", false}, 57 {"仆仂仗仞仭仟价伉佚估", "浠嗕粋浠椾粸浠?粺浠蜂級浣氫及", false}, 58 {"佝佗佇佶侈侏侘佻佩佰侑佯", "浣濅綏浣囦蕉渚堜緩渚樹交浣╀桨渚戜蒋", true}, 59 {"\x80", "?", false}, 60 {"\x80a", "?", false}, 61 {"\x80aa", "?a", false}, 62 {"aa\x80ab", "aa?b", false}, 63 {"a你好\x80a测试", "a浣犲ソ?娴嬭瘯", false}, 64 {"aa\x80", "aa?", false}, 65 } 66 for _, tc := range GBKCases { 67 cmt := fmt.Sprintf("%v", tc) 68 result, err := enc.Transform(nil, []byte(tc.utf8Str), charset.OpDecodeReplace) 69 if tc.isValid { 70 require.NoError(t, err, cmt) 71 } else { 72 require.Error(t, err, cmt) 73 } 74 require.Equal(t, tc.result, string(result), cmt) 75 } 76 77 utf8Cases := []struct { 78 utf8Str string 79 result string 80 isValid bool 81 }{ 82 {"一二三", "һ\xb6\xfe\xc8\xfd", true}, 83 {"🀁", "?", false}, 84 {"valid_string_🀁", "valid_string_?", false}, 85 {"€", "?", false}, 86 {"€a", "?a", false}, 87 {"a€aa", "a?aa", false}, 88 {"aaa€", "aaa?", false}, 89 } 90 for _, tc := range utf8Cases { 91 cmt := fmt.Sprintf("%v", tc) 92 result, err := enc.Transform(nil, []byte(tc.utf8Str), charset.OpEncodeReplace) 93 if tc.isValid { 94 require.NoError(t, err, cmt) 95 } else { 96 require.Error(t, err, cmt) 97 } 98 require.Equal(t, tc.result, string(result), cmt) 99 } 100 } 101 102 func TestEncodingValidate(t *testing.T) { 103 oxfffefd := string([]byte{0xff, 0xfe, 0xfd}) 104 testCases := []struct { 105 chs string 106 str string 107 expected string 108 nSrc int 109 ok bool 110 }{ 111 {charset.CharsetASCII, "", "", 0, true}, 112 {charset.CharsetASCII, "qwerty", "qwerty", 6, true}, 113 {charset.CharsetASCII, "qwÊrty", "qw?rty", 2, false}, 114 {charset.CharsetASCII, "中文", "??", 0, false}, 115 {charset.CharsetASCII, "中文?qwert", "???qwert", 0, false}, 116 {charset.CharsetUTF8MB4, "", "", 0, true}, 117 {charset.CharsetUTF8MB4, "qwerty", "qwerty", 6, true}, 118 {charset.CharsetUTF8MB4, "qwÊrty", "qwÊrty", 7, true}, 119 {charset.CharsetUTF8MB4, "qwÊ合法字符串", "qwÊ合法字符串", 19, true}, 120 {charset.CharsetUTF8MB4, "😂", "😂", 4, true}, 121 {charset.CharsetUTF8MB4, oxfffefd, "???", 0, false}, 122 {charset.CharsetUTF8MB4, "中文" + oxfffefd, "中文???", 6, false}, 123 {charset.CharsetUTF8MB4, string(utf8.RuneError), "�", 3, true}, 124 {charset.CharsetUTF8, "", "", 0, true}, 125 {charset.CharsetUTF8, "qwerty", "qwerty", 6, true}, 126 {charset.CharsetUTF8, "qwÊrty", "qwÊrty", 7, true}, 127 {charset.CharsetUTF8, "qwÊ合法字符串", "qwÊ合法字符串", 19, true}, 128 {charset.CharsetUTF8, "😂", "?", 0, false}, 129 {charset.CharsetUTF8, "valid_str😂", "valid_str?", 9, false}, 130 {charset.CharsetUTF8, oxfffefd, "???", 0, false}, 131 {charset.CharsetUTF8, "中文" + oxfffefd, "中文???", 6, false}, 132 {charset.CharsetUTF8, string(utf8.RuneError), "�", 3, true}, 133 {charset.CharsetGBK, "", "", 0, true}, 134 {charset.CharsetGBK, "asdf", "asdf", 4, true}, 135 {charset.CharsetGBK, "中文", "中文", 6, true}, 136 {charset.CharsetGBK, "À", "?", 0, false}, 137 {charset.CharsetGBK, "中文À中文", "中文?中文", 6, false}, 138 {charset.CharsetGBK, "asdfÀ", "asdf?", 4, false}, 139 } 140 for _, tc := range testCases { 141 msg := fmt.Sprintf("%v", tc) 142 enc := charset.FindEncoding(tc.chs) 143 if tc.chs == charset.CharsetUTF8 { 144 enc = charset.EncodingUTF8MB3StrictImpl 145 } 146 strBytes := []byte(tc.str) 147 require.Equal(t, tc.ok, enc.IsValid(strBytes), msg) 148 replace, _ := enc.Transform(nil, strBytes, charset.OpReplaceNoErr) 149 require.Equal(t, tc.expected, string(replace), msg) 150 } 151 }