github.com/pingcap/tidb/parser@v0.0.0-20231013125129-93a834a6bf8d/charset/encoding_test.go (about)

     1  // Copyright 2021 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package charset_test
    15  
    16  import (
    17  	"fmt"
    18  	"testing"
    19  	"unicode/utf8"
    20  
    21  	"github.com/pingcap/tidb/parser/charset"
    22  	"github.com/stretchr/testify/require"
    23  	"golang.org/x/text/transform"
    24  )
    25  
    26  func TestEncoding(t *testing.T) {
    27  	enc := charset.FindEncoding(charset.CharsetGBK)
    28  	require.Equal(t, charset.CharsetGBK, enc.Name())
    29  
    30  	txt := []byte("一二三四")
    31  	e, _ := charset.Lookup("gbk")
    32  	gbkEncodedTxt, _, err := transform.Bytes(e.NewEncoder(), txt)
    33  	require.NoError(t, err)
    34  	result, err := enc.Transform(nil, gbkEncodedTxt, charset.OpDecode)
    35  	require.NoError(t, err)
    36  	require.Equal(t, txt, result)
    37  
    38  	gbkEncodedTxt2, err := enc.Transform(nil, txt, charset.OpEncode)
    39  	require.NoError(t, err)
    40  	require.Equal(t, gbkEncodedTxt2, gbkEncodedTxt)
    41  	result, err = enc.Transform(nil, gbkEncodedTxt2, charset.OpDecode)
    42  	require.NoError(t, err)
    43  	require.Equal(t, txt, result)
    44  
    45  	GBKCases := []struct {
    46  		utf8Str string
    47  		result  string
    48  		isValid bool
    49  	}{
    50  		{"一二三", "涓?簩涓?", false}, // MySQL reports '涓?簩涓'.
    51  		{"一二三123", "涓?簩涓?23", false},
    52  		{"测试", "娴嬭瘯", true},
    53  		{"案1案2", "妗?妗?", false},
    54  		{"焊䏷菡釬", "鐒婁彿鑿¢嚞", true},
    55  		{"鞍杏以伊位依", "闉嶆潖浠ヤ紛浣嶄緷", true},
    56  		{"移維緯胃萎衣謂違", "绉荤董绶?儍钀庤。璎傞仌", false},
    57  		{"仆仂仗仞仭仟价伉佚估", "浠嗕粋浠椾粸浠?粺浠蜂級浣氫及", false},
    58  		{"佝佗佇佶侈侏侘佻佩佰侑佯", "浣濅綏浣囦蕉渚堜緩渚樹交浣╀桨渚戜蒋", true},
    59  		{"\x80", "?", false},
    60  		{"\x80a", "?", false},
    61  		{"\x80aa", "?a", false},
    62  		{"aa\x80ab", "aa?b", false},
    63  		{"a你好\x80a测试", "a浣犲ソ?娴嬭瘯", false},
    64  		{"aa\x80", "aa?", false},
    65  	}
    66  	for _, tc := range GBKCases {
    67  		cmt := fmt.Sprintf("%v", tc)
    68  		result, err := enc.Transform(nil, []byte(tc.utf8Str), charset.OpDecodeReplace)
    69  		if tc.isValid {
    70  			require.NoError(t, err, cmt)
    71  		} else {
    72  			require.Error(t, err, cmt)
    73  		}
    74  		require.Equal(t, tc.result, string(result), cmt)
    75  	}
    76  
    77  	utf8Cases := []struct {
    78  		utf8Str string
    79  		result  string
    80  		isValid bool
    81  	}{
    82  		{"一二三", "һ\xb6\xfe\xc8\xfd", true},
    83  		{"🀁", "?", false},
    84  		{"valid_string_🀁", "valid_string_?", false},
    85  		{"€", "?", false},
    86  		{"€a", "?a", false},
    87  		{"a€aa", "a?aa", false},
    88  		{"aaa€", "aaa?", false},
    89  	}
    90  	for _, tc := range utf8Cases {
    91  		cmt := fmt.Sprintf("%v", tc)
    92  		result, err := enc.Transform(nil, []byte(tc.utf8Str), charset.OpEncodeReplace)
    93  		if tc.isValid {
    94  			require.NoError(t, err, cmt)
    95  		} else {
    96  			require.Error(t, err, cmt)
    97  		}
    98  		require.Equal(t, tc.result, string(result), cmt)
    99  	}
   100  }
   101  
   102  func TestEncodingValidate(t *testing.T) {
   103  	oxfffefd := string([]byte{0xff, 0xfe, 0xfd})
   104  	testCases := []struct {
   105  		chs      string
   106  		str      string
   107  		expected string
   108  		nSrc     int
   109  		ok       bool
   110  	}{
   111  		{charset.CharsetASCII, "", "", 0, true},
   112  		{charset.CharsetASCII, "qwerty", "qwerty", 6, true},
   113  		{charset.CharsetASCII, "qwÊrty", "qw?rty", 2, false},
   114  		{charset.CharsetASCII, "中文", "??", 0, false},
   115  		{charset.CharsetASCII, "中文?qwert", "???qwert", 0, false},
   116  		{charset.CharsetUTF8MB4, "", "", 0, true},
   117  		{charset.CharsetUTF8MB4, "qwerty", "qwerty", 6, true},
   118  		{charset.CharsetUTF8MB4, "qwÊrty", "qwÊrty", 7, true},
   119  		{charset.CharsetUTF8MB4, "qwÊ合法字符串", "qwÊ合法字符串", 19, true},
   120  		{charset.CharsetUTF8MB4, "😂", "😂", 4, true},
   121  		{charset.CharsetUTF8MB4, oxfffefd, "???", 0, false},
   122  		{charset.CharsetUTF8MB4, "中文" + oxfffefd, "中文???", 6, false},
   123  		{charset.CharsetUTF8MB4, string(utf8.RuneError), "�", 3, true},
   124  		{charset.CharsetUTF8, "", "", 0, true},
   125  		{charset.CharsetUTF8, "qwerty", "qwerty", 6, true},
   126  		{charset.CharsetUTF8, "qwÊrty", "qwÊrty", 7, true},
   127  		{charset.CharsetUTF8, "qwÊ合法字符串", "qwÊ合法字符串", 19, true},
   128  		{charset.CharsetUTF8, "😂", "?", 0, false},
   129  		{charset.CharsetUTF8, "valid_str😂", "valid_str?", 9, false},
   130  		{charset.CharsetUTF8, oxfffefd, "???", 0, false},
   131  		{charset.CharsetUTF8, "中文" + oxfffefd, "中文???", 6, false},
   132  		{charset.CharsetUTF8, string(utf8.RuneError), "�", 3, true},
   133  		{charset.CharsetGBK, "", "", 0, true},
   134  		{charset.CharsetGBK, "asdf", "asdf", 4, true},
   135  		{charset.CharsetGBK, "中文", "中文", 6, true},
   136  		{charset.CharsetGBK, "À", "?", 0, false},
   137  		{charset.CharsetGBK, "中文À中文", "中文?中文", 6, false},
   138  		{charset.CharsetGBK, "asdfÀ", "asdf?", 4, false},
   139  	}
   140  	for _, tc := range testCases {
   141  		msg := fmt.Sprintf("%v", tc)
   142  		enc := charset.FindEncoding(tc.chs)
   143  		if tc.chs == charset.CharsetUTF8 {
   144  			enc = charset.EncodingUTF8MB3StrictImpl
   145  		}
   146  		strBytes := []byte(tc.str)
   147  		require.Equal(t, tc.ok, enc.IsValid(strBytes), msg)
   148  		replace, _ := enc.Transform(nil, strBytes, charset.OpReplaceNoErr)
   149  		require.Equal(t, tc.expected, string(replace), msg)
   150  	}
   151  }