github.com/matrixorigin/matrixone@v1.2.0/pkg/vectorize/lengthutf8/length_utf8.go (about) 1 // Copyright 2021 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package lengthutf8 16 17 var ( 18 table [256]uint8 19 ) 20 21 func init() { 22 table = [256]uint8{ 23 // start byte of 1-byte utf8 char: 0b0000'0000 ~ 0b0111'1111 24 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 25 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 26 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 28 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 29 1, 1, 1, 30 // continuation byte: 0b1000'0000 ~ 0b1011'1111 31 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 32 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 33 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 34 // start byte of 2-byte utf8 char: 0b1100'0000 ~ 0b1101'1111 35 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 36 2, 2, 2, 2, 2, 2, 2, 37 // start byte of 3-byte utf8 char: 0b1110'0000 ~ 0b1110'1111 38 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 39 // start byte of 4-byte utf8 char: 0b1111'0000 ~ 0b1111'0111 40 // invalid utf8 byte: 0b1111'1000~ 0b1111'1111 41 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1} 42 } 43 44 // this countUTF8CodePoints implementation bases on https://github.com/satanson/cpp_etudes/blob/master/include/string_functions.hh 45 func CountUTF8CodePoints(s []byte) uint64 { 46 var charSize uint8 47 var count uint64 = 0 48 length := len(s) 49 50 for i := 0; i < length; { 51 charSize = table[s[i]] 52 count++ 53 i = i + int(charSize) 54 } 55 return count 56 }