github.com/pingcap/tidb/parser@v0.0.0-20231013125129-93a834a6bf8d/charset/encoding.go (about) 1 // Copyright 2021 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package charset 15 16 import "bytes" 17 18 // Make sure all of them implement Encoding interface. 19 var ( 20 _ Encoding = &encodingUTF8{} 21 _ Encoding = &encodingUTF8MB3Strict{} 22 _ Encoding = &encodingASCII{} 23 _ Encoding = &encodingLatin1{} 24 _ Encoding = &encodingBin{} 25 _ Encoding = &encodingGBK{} 26 ) 27 28 // IsSupportedEncoding checks if the charset is fully supported. 29 func IsSupportedEncoding(charset string) bool { 30 _, ok := encodingMap[charset] 31 return ok 32 } 33 34 // FindEncodingTakeUTF8AsNoop finds the encoding according to the charset 35 // except that utf-8 is treated as no-operation encoding. This is used to 36 // reduce the overhead of utf-8 validation in some cases. 37 func FindEncodingTakeUTF8AsNoop(charset string) Encoding { 38 enc := FindEncoding(charset) 39 if enc.Tp() == EncodingTpUTF8 { 40 return EncodingBinImpl 41 } 42 return enc 43 } 44 45 // FindEncoding finds the encoding according to charset. 46 func FindEncoding(charset string) Encoding { 47 if len(charset) == 0 { 48 return EncodingBinImpl 49 } 50 if e, exist := encodingMap[charset]; exist { 51 return e 52 } 53 return EncodingBinImpl 54 } 55 56 var encodingMap = map[string]Encoding{ 57 CharsetUTF8MB4: EncodingUTF8Impl, 58 CharsetUTF8: EncodingUTF8Impl, 59 CharsetGBK: EncodingGBKImpl, 60 CharsetLatin1: EncodingLatin1Impl, 61 CharsetBin: EncodingBinImpl, 62 CharsetASCII: EncodingASCIIImpl, 63 } 64 65 // Encoding provide encode/decode functions for a string with a specific charset. 66 type Encoding interface { 67 // Name is the name of the encoding. 68 Name() string 69 // Tp is the type of the encoding. 70 Tp() EncodingTp 71 // Peek returns the next char. 72 Peek(src []byte) []byte 73 // MbLen returns multiple byte length, if the next character is single byte, return 0. 74 MbLen(string) int 75 // IsValid checks whether the utf-8 bytes can be convert to valid string in current encoding. 76 IsValid(src []byte) bool 77 // Foreach iterates the characters in in current encoding. 78 Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool) 79 // Transform map the bytes in src to dest according to Op. 80 // **the caller should initialize the dest if it wants to avoid memory alloc every time, 81 // or else it will always make a new one** 82 // 83 // **the returned array may be the alias of `src`, edit the returned array on your own risk** 84 Transform(dest *bytes.Buffer, src []byte, op Op) ([]byte, error) 85 // ToUpper change a string to uppercase. 86 ToUpper(src string) string 87 // ToLower change a string to lowercase. 88 ToLower(src string) string 89 } 90 91 // EncodingTp is the type of the encoding. 92 type EncodingTp int8 93 94 //revive:disable 95 const ( 96 EncodingTpNone EncodingTp = iota 97 EncodingTpUTF8 98 EncodingTpUTF8MB3Strict 99 EncodingTpASCII 100 EncodingTpLatin1 101 EncodingTpBin 102 EncodingTpGBK 103 ) 104 105 //revive:enable 106 107 // Op is used by Encoding.Transform. 108 type Op int16 109 110 const ( 111 opFromUTF8 Op = 1 << iota 112 opToUTF8 113 opTruncateTrim 114 opTruncateReplace 115 opCollectFrom 116 opCollectTo 117 opSkipError 118 ) 119 120 //revive:disable 121 const ( 122 // OpReplaceNoErr is used to replace invalid bytes with '?'. 123 OpReplaceNoErr = opFromUTF8 | opTruncateReplace | opCollectFrom | opSkipError 124 OpReplace = opFromUTF8 | opTruncateReplace | opCollectFrom 125 OpEncode = opFromUTF8 | opTruncateTrim | opCollectTo 126 OpEncodeNoErr = OpEncode | opSkipError 127 OpEncodeReplace = opFromUTF8 | opTruncateReplace | opCollectTo 128 OpDecode = opToUTF8 | opTruncateTrim | opCollectTo 129 OpDecodeNoErr = OpDecode | opSkipError 130 OpDecodeReplace = opToUTF8 | opTruncateReplace | opCollectTo 131 ) 132 133 //revive:enable 134 135 // CountValidBytes counts the first valid bytes in src that 136 // can be encoded to the current encoding. 137 func CountValidBytes(e Encoding, src []byte) int { 138 nSrc := 0 139 e.Foreach(src, opFromUTF8, func(from, to []byte, ok bool) bool { 140 if ok { 141 nSrc += len(from) 142 } 143 return ok 144 }) 145 return nSrc 146 } 147 148 // CountValidBytesDecode counts the first valid bytes in src that 149 // can be decoded to utf-8. 150 func CountValidBytesDecode(e Encoding, src []byte) int { 151 nSrc := 0 152 e.Foreach(src, opToUTF8, func(from, to []byte, ok bool) bool { 153 if ok { 154 nSrc += len(from) 155 } 156 return ok 157 }) 158 return nSrc 159 }