github.com/pingcap/tidb/parser@v0.0.0-20231013125129-93a834a6bf8d/charset/encoding.go (about)

     1  // Copyright 2021 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package charset
    15  
    16  import "bytes"
    17  
    18  // Make sure all of them implement Encoding interface.
    19  var (
    20  	_ Encoding = &encodingUTF8{}
    21  	_ Encoding = &encodingUTF8MB3Strict{}
    22  	_ Encoding = &encodingASCII{}
    23  	_ Encoding = &encodingLatin1{}
    24  	_ Encoding = &encodingBin{}
    25  	_ Encoding = &encodingGBK{}
    26  )
    27  
    28  // IsSupportedEncoding checks if the charset is fully supported.
    29  func IsSupportedEncoding(charset string) bool {
    30  	_, ok := encodingMap[charset]
    31  	return ok
    32  }
    33  
    34  // FindEncodingTakeUTF8AsNoop finds the encoding according to the charset
    35  // except that utf-8 is treated as no-operation encoding. This is used to
    36  // reduce the overhead of utf-8 validation in some cases.
    37  func FindEncodingTakeUTF8AsNoop(charset string) Encoding {
    38  	enc := FindEncoding(charset)
    39  	if enc.Tp() == EncodingTpUTF8 {
    40  		return EncodingBinImpl
    41  	}
    42  	return enc
    43  }
    44  
    45  // FindEncoding finds the encoding according to charset.
    46  func FindEncoding(charset string) Encoding {
    47  	if len(charset) == 0 {
    48  		return EncodingBinImpl
    49  	}
    50  	if e, exist := encodingMap[charset]; exist {
    51  		return e
    52  	}
    53  	return EncodingBinImpl
    54  }
    55  
    56  var encodingMap = map[string]Encoding{
    57  	CharsetUTF8MB4: EncodingUTF8Impl,
    58  	CharsetUTF8:    EncodingUTF8Impl,
    59  	CharsetGBK:     EncodingGBKImpl,
    60  	CharsetLatin1:  EncodingLatin1Impl,
    61  	CharsetBin:     EncodingBinImpl,
    62  	CharsetASCII:   EncodingASCIIImpl,
    63  }
    64  
// Encoding provides encode/decode functions for a string with a specific charset.
type Encoding interface {
	// Name returns the name of the encoding.
	Name() string
	// Tp returns the type of the encoding.
	Tp() EncodingTp
	// Peek returns the next char.
	Peek(src []byte) []byte
	// MbLen returns the multi-byte length of the next character; if the next
	// character is a single byte, it returns 0.
	MbLen(string) int
	// IsValid checks whether the utf-8 bytes can be converted to a valid string in the current encoding.
	IsValid(src []byte) bool
	// Foreach iterates over the characters in the current encoding.
	// NOTE(review): the bool returned by fn presumably controls whether
	// iteration continues — confirm against the implementations.
	Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool)
	// Transform maps the bytes in src to dest according to op.
	// **The caller should initialize dest if it wants to avoid a memory allocation on every call;
	//   otherwise a new one is always made.**
	//
	// **The returned slice may alias `src`; edit the returned slice at your own risk.**
	Transform(dest *bytes.Buffer, src []byte, op Op) ([]byte, error)
	// ToUpper changes a string to uppercase.
	ToUpper(src string) string
	// ToLower changes a string to lowercase.
	ToLower(src string) string
}
    90  
    91  // EncodingTp is the type of the encoding.
    92  type EncodingTp int8
    93  
    94  //revive:disable
    95  const (
    96  	EncodingTpNone EncodingTp = iota
    97  	EncodingTpUTF8
    98  	EncodingTpUTF8MB3Strict
    99  	EncodingTpASCII
   100  	EncodingTpLatin1
   101  	EncodingTpBin
   102  	EncodingTpGBK
   103  )
   104  
   105  //revive:enable
   106  
   107  // Op is used by Encoding.Transform.
   108  type Op int16
   109  
   110  const (
   111  	opFromUTF8 Op = 1 << iota
   112  	opToUTF8
   113  	opTruncateTrim
   114  	opTruncateReplace
   115  	opCollectFrom
   116  	opCollectTo
   117  	opSkipError
   118  )
   119  
   120  //revive:disable
   121  const (
   122  	// OpReplaceNoErr is used to replace invalid bytes with '?'.
   123  	OpReplaceNoErr  = opFromUTF8 | opTruncateReplace | opCollectFrom | opSkipError
   124  	OpReplace       = opFromUTF8 | opTruncateReplace | opCollectFrom
   125  	OpEncode        = opFromUTF8 | opTruncateTrim | opCollectTo
   126  	OpEncodeNoErr   = OpEncode | opSkipError
   127  	OpEncodeReplace = opFromUTF8 | opTruncateReplace | opCollectTo
   128  	OpDecode        = opToUTF8 | opTruncateTrim | opCollectTo
   129  	OpDecodeNoErr   = OpDecode | opSkipError
   130  	OpDecodeReplace = opToUTF8 | opTruncateReplace | opCollectTo
   131  )
   132  
   133  //revive:enable
   134  
   135  // CountValidBytes counts the first valid bytes in src that
   136  // can be encoded to the current encoding.
   137  func CountValidBytes(e Encoding, src []byte) int {
   138  	nSrc := 0
   139  	e.Foreach(src, opFromUTF8, func(from, to []byte, ok bool) bool {
   140  		if ok {
   141  			nSrc += len(from)
   142  		}
   143  		return ok
   144  	})
   145  	return nSrc
   146  }
   147  
   148  // CountValidBytesDecode counts the first valid bytes in src that
   149  // can be decoded to utf-8.
   150  func CountValidBytesDecode(e Encoding, src []byte) int {
   151  	nSrc := 0
   152  	e.Foreach(src, opToUTF8, func(from, to []byte, ok bool) bool {
   153  		if ok {
   154  			nSrc += len(from)
   155  		}
   156  		return ok
   157  	})
   158  	return nSrc
   159  }