github.com/cloudwego/dynamicgo@v0.2.6-0.20240519101509-707f41b6b834/native/utf8.c (about) 1 /* 2 * Copyright (c) 2009 The Go Authors. All rights reserved. 3 * Modifications Copyright 2023 CloudWeGo Authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 #include "native.h" 19 20 // ascii: 0x00 ~ 0x7F 21 static inline int _mm_ascii_mask(__m128i vv) 22 { 23 return _mm_movemask_epi8(vv); 24 } 25 26 #if USE_AVX2 27 28 // ascii: 0x00 ~ 0x7F 29 static inline int _mm256_ascii_mask(__m256i vv) 30 { 31 return _mm256_movemask_epi8(vv); 32 } 33 34 #endif 35 36 static inline bool is_ascii(uint8_t ch) 37 { 38 return ch < 0x80; 39 } 40 41 // The default lowest and highest continuation byte. 42 const static uint8_t locb = 0x80; 43 const static uint8_t hicb = 0xBF; 44 const static uint8_t xx = 0xF1; // invalid: size 1 45 const static uint8_t as = 0xF0; // ASCII: size 1 46 const static uint8_t s1 = 0x02; // accept 0, size 2 47 const static uint8_t s2 = 0x13; // accept 1, size 3 48 const static uint8_t s3 = 0x03; // accept 0, size 3 49 const static uint8_t s4 = 0x23; // accept 2, size 3 50 const static uint8_t s5 = 0x34; // accept 3, size 4 51 const static uint8_t s6 = 0x04; // accept 0, size 4 52 const static uint8_t s7 = 0x44; // accept 4, size 4 53 54 // first is information about the first byte in a UTF-8 sequence. 55 static const uint8_t first[256] = { 56 // 1 2 3 4 5 6 7 8 9 A B C D E F 57 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F 58 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F 59 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F 60 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F 61 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F 62 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F 63 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F 64 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F 65 // 1 2 3 4 5 6 7 8 9 A B C D E F 66 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F 67 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F 68 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF 69 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF 70 xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF 71 s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF 72 s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF 73 s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF 74 }; 75 76 // AcceptRange gives the range of valid values for the second byte in a UTF-8 77 // sequence. 78 struct AcceptRange 79 { 80 uint8_t lo; // lowest value for second byte. 81 uint8_t hi; // highest value for second byte. 82 }; 83 84 // ranges has size 16 to avoid bounds checks in the code that uses it. 85 const static struct AcceptRange ranges[5] = { 86 {locb, hicb}, // 0 87 {0xA0, hicb}, // 1 88 {locb, 0x9F}, // 2 89 {0x90, hicb}, // 3 90 {locb, 0x8F}, // 4 91 }; 92 93 // UTF-8 code point | first byte | second byte | third byte | fourth byte 94 // U+0000 - U+007F | 0___ ____ 95 // U+0080 - U+07FF | 110_ ____ | 10__ ____ 96 // U+0800 - U+D7FF | 1110 ____ | 10__ ____ | 10__ ____ 97 // U+D800 - U+DFFF | reserved for UTF-16 surrogate pairs 98 // U+E000 - U+FFFF | 1110 ____ | 10__ ____ | 10__ ____ 99 // U+10000 - U+10FFFF | 1111 0___ | 10__ ____ | 10__ ____ | 10__ ____ 100 // checks non-ascii characters, and returns the utf-8 length 101 static inline ssize_t nonascii_is_utf8(const uint8_t *sp, size_t n) 102 { 103 uint8_t mask = first[sp[0]]; 104 uint8_t size = mask & 7; 105 if (n < size) 106 { 107 return 0; 108 } 109 struct AcceptRange accept = ranges[mask >> 4]; 110 switch (size) 111 { 112 case 4: 113 if (sp[3] < locb || hicb < sp[3]) 114 return 0; 115 case 3: 116 if (sp[2] < locb || hicb < sp[2]) 117 return 0; 118 case 2: 119 if (sp[1] < accept.lo || accept.hi < sp[1]) 120 return 0; 121 break; 122 case 1: 123 return 0; // invalid chars 124 case 0: 125 return 1; // ascii chars 126 default: 127 return 0; 128 } 129 return size; 130 } 131 132 ssize_t find_non_ascii(const uint8_t *sp, ssize_t nb) 133 { 134 const uint8_t *ss = sp; 135 int64_t m; 136 137 #if USE_AVX2 138 while (nb >= 32) 139 { 140 __m256i v = _mm256_loadu_si256((const void *)(sp)); 141 if (unlikely((m = _mm256_ascii_mask(v)) != 0)) 142 { 143 return sp - ss + __builtin_ctzll(m); 144 } 145 nb -= 32; 146 sp += 32; 147 } 148 149 /* clear spper half to avoid AVX-SSE transition penalty */ 150 _mm256_zeroupper(); 151 #endif 152 153 while (nb >= 16) 154 { 155 __m128i v = _mm_loadu_si128((const void *)(sp)); 156 if (unlikely((m = _mm_ascii_mask(v)) != 0)) 157 { 158 return sp - ss + __builtin_ctzll(m); 159 } 160 nb -= 16; 161 sp += 16; 162 } 163 164 /* remaining bytes, do with scalar code */ 165 while (nb-- > 0) 166 { 167 if (is_ascii(*sp)) 168 { 169 sp++; 170 } 171 else 172 { 173 return sp - ss; 174 } 175 } 176 177 /* nothing found */ 178 return -1; 179 } 180 181 // utf8_validate validates whether the JSON string is valid UTF-8. 182 // return -1 if validate, otherwise, return the error postion. 183 ssize_t utf8_validate(const char *sp, ssize_t nb) 184 { 185 const uint8_t *p = (const uint8_t *)sp; 186 const uint8_t *s = (const uint8_t *)sp; 187 ssize_t n; 188 ssize_t b; 189 190 // Optimize for the continuous non-ascii chars */ 191 while (nb > 0 && (n = (!is_ascii(*p) ? 0 : find_non_ascii(p, nb))) != -1) 192 { 193 /* not found non-ascii in string */ 194 if (n >= nb) 195 { 196 return -1; 197 } 198 199 nb -= n; 200 p += n; 201 202 /* validate the non-ascii */ 203 if (unlikely((b = nonascii_is_utf8(p, nb)) == 0)) 204 { 205 return p - s; 206 } 207 208 nb -= b; 209 p += b; 210 } 211 212 return -1; 213 }