github.com/matrixorigin/matrixone@v1.2.0/pkg/vectorize/lengthutf8/length_utf8.go (about)

     1  // Copyright 2021 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package lengthutf8
    16  
    17  var (
    18  	table [256]uint8
    19  )
    20  
    21  func init() {
    22  	table = [256]uint8{
    23  		// start byte of 1-byte utf8 char: 0b0000'0000 ~ 0b0111'1111
    24  		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    25  		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    26  		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    27  		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    28  		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    29  		1, 1, 1,
    30  		// continuation byte: 0b1000'0000 ~ 0b1011'1111
    31  		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    32  		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    33  		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    34  		// start byte of 2-byte utf8 char: 0b1100'0000 ~ 0b1101'1111
    35  		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    36  		2, 2, 2, 2, 2, 2, 2,
    37  		// start byte of 3-byte utf8 char: 0b1110'0000 ~ 0b1110'1111
    38  		3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    39  		// start byte of 4-byte utf8 char: 0b1111'0000 ~ 0b1111'0111
    40  		// invalid utf8 byte: 0b1111'1000~ 0b1111'1111
    41  		4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1}
    42  }
    43  
    44  // this countUTF8CodePoints implementation bases on https://github.com/satanson/cpp_etudes/blob/master/include/string_functions.hh
    45  func CountUTF8CodePoints(s []byte) uint64 {
    46  	var charSize uint8
    47  	var count uint64 = 0
    48  	length := len(s)
    49  
    50  	for i := 0; i < length; {
    51  		charSize = table[s[i]]
    52  		count++
    53  		i = i + int(charSize)
    54  	}
    55  	return count
    56  }