github.com/bytedance/sonic@v1.11.7-0.20240517092252-d2edb31b167b/native/fastint.h

/*
 * Copyright 2021 ByteDance Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "native.h"
#include "tab.h"

static const char Vec16xA0[16] __attribute__((aligned(16))) = {
    '0', '0', '0', '0', '0', '0', '0', '0',
    '0', '0', '0', '0', '0', '0', '0', '0',
};

static const uint16_t Vec8x10[8] __attribute__((aligned(16))) = {
    10, 10, 10, 10, 10, 10, 10, 10,
};

static const uint32_t Vec4x10k[4] __attribute__((aligned(16))) = {
    10000, 10000, 10000, 10000,
};

static const uint32_t Vec4xDiv10k[4] __attribute__((aligned(16))) = {
    0xd1b71759, 0xd1b71759, 0xd1b71759, 0xd1b71759,
};

static const uint16_t VecDivPowers[8] __attribute__((aligned(16))) = {
    0x20c5, 0x147b, 0x3334, 0x8000,
    0x20c5, 0x147b, 0x3334, 0x8000,
};

static const uint16_t VecShiftPowers[8] __attribute__((aligned(16))) = {
    0x0080, 0x0800, 0x2000, 0x8000,
    0x0080, 0x0800, 0x2000, 0x8000,
};

static const uint8_t VecShiftShuffles[144] __attribute__((aligned(16))) = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff,
    0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
    0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff,
    0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff,
    0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};

static always_inline int itoa1(char *out, int n, uint32_t v)
{
    out[n++] = (char)v + '0';
    return n;
}

static always_inline int itoa2(char *out, int n, uint32_t v)
{
    out[n++] = Digits[v];
    out[n++] = Digits[v + 1];
    return n;
}
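/*
 * Scalar reference for itoa8_sse2 (added here as an illustration; it is not
 * part of the upstream header, and the name itoa8_scalar is hypothetical).
 * For v < 10^8 the SSE2 routine below produces the eight decimal digits of v
 * as eight uint16_t lanes, most significant digit first; callers then pack
 * the two 8-digit halves into bytes and add '0' (Vec16xA0). The constant
 * 0xd1b71759 with a 45-bit right shift approximates division by 10000, and
 * VecDivPowers/VecShiftPowers play the same role per 16-bit lane for the
 * /1000, /100, /10 and /1 steps.
 */
static always_inline void itoa8_scalar(uint32_t v, uint16_t d[8])
{
    for (int i = 7; i >= 0; i--)
    {
        d[i] = (uint16_t)(v % 10); /* peel digits from the least significant end */
        v /= 10;
    }
}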
static always_inline __m128i itoa8_sse2(uint32_t v)
{
    __m128i v00 = _mm_cvtsi32_si128(v);
    __m128i v01 = _mm_mul_epu32(v00, as_m128v(Vec4xDiv10k));
    __m128i v02 = _mm_srli_epi64(v01, 45);
    __m128i v03 = _mm_mul_epu32(v02, as_m128v(Vec4x10k));
    __m128i v04 = _mm_sub_epi32(v00, v03);
    __m128i v05 = _mm_unpacklo_epi16(v02, v04);
    __m128i v06 = _mm_slli_epi64(v05, 2);
    __m128i v07 = _mm_unpacklo_epi16(v06, v06);
    __m128i v08 = _mm_unpacklo_epi32(v07, v07);
    __m128i v09 = _mm_mulhi_epu16(v08, as_m128v(VecDivPowers));
    __m128i v10 = _mm_mulhi_epu16(v09, as_m128v(VecShiftPowers));
    __m128i v11 = _mm_mullo_epi16(v10, as_m128v(Vec8x10));
    __m128i v12 = _mm_slli_epi64(v11, 16);
    __m128i v13 = _mm_sub_epi16(v10, v12);
    return v13;
}

static always_inline int u32toa_small(char *out, uint32_t val)
{
    int n = 0;
    uint32_t d1 = (val / 100) << 1;
    uint32_t d2 = (val % 100) << 1;

    /* 1000-th digit */
    if (val >= 1000)
    {
        out[n++] = Digits[d1];
    }

    /* 100-th digit */
    if (val >= 100)
    {
        out[n++] = Digits[d1 + 1];
    }

    /* 10-th digit */
    if (val >= 10)
    {
        out[n++] = Digits[d2];
    }

    /* last digit */
    out[n++] = Digits[d2 + 1];
    return n;
}

static always_inline int u32toa_medium(char *out, uint32_t val)
{
    int n = 0;
    uint32_t b = val / 10000;
    uint32_t c = val % 10000;
    uint32_t d1 = (b / 100) << 1;
    uint32_t d2 = (b % 100) << 1;
    uint32_t d3 = (c / 100) << 1;
    uint32_t d4 = (c % 100) << 1;

    /* 10000000-th digit */
    if (val >= 10000000)
    {
        out[n++] = Digits[d1];
    }

    /* 1000000-th digit */
    if (val >= 1000000)
    {
        out[n++] = Digits[d1 + 1];
    }

    /* 100000-th digit */
    if (val >= 100000)
    {
        out[n++] = Digits[d2];
    }

    /* remaining digits */
    out[n++] = Digits[d2 + 1];
    out[n++] = Digits[d3];
    out[n++] = Digits[d3 + 1];
    out[n++] = Digits[d4];
    out[n++] = Digits[d4 + 1];
    return n;
}

static always_inline int u64toa_large_sse2(char *out, uint64_t val)
{
    uint32_t a = (uint32_t)(val / 100000000);
    uint32_t b = (uint32_t)(val % 100000000);

    /* convert to digits */
    __m128i v0 = itoa8_sse2(a);
    __m128i v1 = itoa8_sse2(b);

    /* convert to bytes, add '0' */
    __m128i v2 = _mm_packus_epi16(v0, v1);
    __m128i v3 = _mm_add_epi8(v2, as_m128v(Vec16xA0));

    /* count the number of digits */
    __m128i v4 = _mm_cmpeq_epi8(v3, as_m128v(Vec16xA0));
    uint32_t bm = _mm_movemask_epi8(v4);
    uint32_t nd = __builtin_ctz(~bm | 0x8000);

    /* shift digits to the beginning */
    __m128i p = _mm_loadu_si128(as_m128c(&VecShiftShuffles[nd * 16]));
    __m128i r = _mm_shuffle_epi8(v3, p);

    /* store the result */
    _mm_storeu_si128(as_m128p(out), r);
    return 16 - nd;
}
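/*
 * Sketch of the leading-zero strip used in u64toa_large_sse2 above (an added
 * illustration, not part of the upstream header; the helper name is
 * hypothetical). v3 holds the 16 ASCII digits of val, left-padded with '0'
 * bytes; comparing it against Vec16xA0 marks every '0' byte, and the length
 * of the run of set bits at the bottom of the byte mask is the number of
 * padding bytes to drop via the matching VecShiftShuffles row. The
 * `| 0x8000` caps the count at 15 so at least one byte would always be kept;
 * in practice val >= 10^8 on this path, so at most seven padding bytes occur.
 * A scalar equivalent of the count:
 */
static always_inline int count_zero_padding(const char digits[16])
{
    int nd = 0;
    while (nd < 15 && digits[nd] == '0') /* mirrors ctz(~bm | 0x8000) */
    {
        nd++;
    }
    return nd;
}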
static always_inline int u64toa_xlarge_sse2(char *out, uint64_t val)
{
    int n = 0;
    uint64_t b = val % 10000000000000000;
    uint32_t a = (uint32_t)(val / 10000000000000000);

    /* the highest 4 digits */
    if (a < 10)
    {
        n = itoa1(out, n, a);
    }
    else if (a < 100)
    {
        n = itoa2(out, n, a << 1);
    }
    else if (a < 1000)
    {
        n = itoa1(out, n, a / 100);
        n = itoa2(out, n, (a % 100) << 1);
    }
    else
    {
        n = itoa2(out, n, (a / 100) << 1);
        n = itoa2(out, n, (a % 100) << 1);
    }

    /* remaining digits */
    __m128i v0 = itoa8_sse2((uint32_t)(b / 100000000));
    __m128i v1 = itoa8_sse2((uint32_t)(b % 100000000));

    /* convert to bytes, add '0' */
    __m128i v2 = _mm_packus_epi16(v0, v1);
    __m128i v3 = _mm_add_epi8(v2, as_m128v(Vec16xA0));

    /* store the result */
    _mm_storeu_si128(as_m128p(&out[n]), v3);
    return n + 16;
}

static always_inline int u64toa_1(char *out, uint64_t val)
{
    if (likely(val < 10000))
    {
        return u32toa_small(out, (uint32_t)val);
    }
    else if (likely(val < 100000000))
    {
        return u32toa_medium(out, (uint32_t)val);
    }
    else if (likely(val < 10000000000000000))
    {
        return u64toa_large_sse2(out, val);
    }
    else
    {
        return u64toa_xlarge_sse2(out, val);
    }
}

static always_inline int i64toa_1(char *out, int64_t val)
{
    if (likely(val >= 0))
    {
        return u64toa_1(out, (uint64_t)val);
    }
    else
    {
        *out = '-';
        return u64toa_1(out + 1, (uint64_t)(-val)) + 1;
    }
}
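/*
 * Usage sketch (added as an illustration; not part of the upstream header,
 * and the wrapper name is hypothetical). i64toa_1/u64toa_1 return the number
 * of characters written and do not NUL-terminate. The SSE2 paths store whole
 * 16-byte vectors, so the destination must stay writable past the returned
 * length; a 32-byte buffer covers every case (at most a sign plus 20 digits,
 * plus the 16-byte vector store).
 */
static always_inline int example_format_i64(char buf[32], int64_t v)
{
    int n = i64toa_1(buf, v); /* e.g. v = -1234567890123456789 -> 20 chars */
    buf[n] = '\0';            /* terminate explicitly for C-string use */
    return n;
}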