github.com/goshafaq/sonic@v0.0.0-20231026082336-871835fb94c6/native/fastint.c (about) 1 /* 2 * Copyright 2021 ByteDance Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "native.h" 18 #include "tab.h" 19 20 static const char Vec16xA0[16] __attribute__((aligned(16))) = { 21 '0', '0', '0', '0', '0', '0', '0', '0', 22 '0', '0', '0', '0', '0', '0', '0', '0', 23 }; 24 25 static const uint16_t Vec8x10[8] __attribute__((aligned(16))) = { 26 10, 10, 10, 10, 27 10, 10, 10, 10, 28 }; 29 30 static const uint32_t Vec4x10k[4] __attribute__((aligned(16))) = { 31 10000, 32 10000, 33 10000, 34 10000, 35 }; 36 37 static const uint32_t Vec4xDiv10k[4] __attribute__((aligned(16))) = { 38 0xd1b71759, 39 0xd1b71759, 40 0xd1b71759, 41 0xd1b71759, 42 }; 43 44 static const uint16_t VecDivPowers[8] __attribute__((aligned(16))) = { 45 0x20c5, 0x147b, 46 0x3334, 0x8000, 47 0x20c5, 0x147b, 48 0x3334, 0x8000, 49 }; 50 51 static const uint16_t VecShiftPowers[8] __attribute__((aligned(16))) = { 52 0x0080, 0x0800, 53 0x2000, 0x8000, 54 0x0080, 0x0800, 55 0x2000, 0x8000, 56 }; 57 58 static const uint8_t VecShiftShuffles[144] __attribute__((aligned(16))) = { 59 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 60 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 61 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 62 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 63 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 64 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 65 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 66 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 67 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 68 }; 69 70 static inline int itoa1(char *out, int n, uint32_t v) { 71 out[n++] = (char)v + '0'; 72 return n; 73 } 74 75 static inline int itoa2(char *out, int n, uint32_t v) { 76 out[n++] = Digits[v]; 77 out[n++] = Digits[v + 1]; 78 return n; 79 } 80 81 static inline __m128i itoa8_sse2(uint32_t v) { 82 __m128i v00 = _mm_cvtsi32_si128 (v); 83 __m128i v01 = _mm_mul_epu32 (v00, as_m128v(Vec4xDiv10k)); 84 __m128i v02 = _mm_srli_epi64 (v01, 45); 85 __m128i v03 = _mm_mul_epu32 (v02, as_m128v(Vec4x10k)); 86 __m128i v04 = _mm_sub_epi32 (v00, v03); 87 __m128i v05 = _mm_unpacklo_epi16 (v02, v04); 88 __m128i v06 = _mm_slli_epi64 (v05, 2); 89 __m128i v07 = _mm_unpacklo_epi16 (v06, v06); 90 __m128i v08 = _mm_unpacklo_epi32 (v07, v07); 91 __m128i v09 = _mm_mulhi_epu16 (v08, as_m128v(VecDivPowers)); 92 __m128i v10 = _mm_mulhi_epu16 (v09, as_m128v(VecShiftPowers)); 93 __m128i v11 = _mm_mullo_epi16 (v10, as_m128v(Vec8x10)); 94 __m128i v12 = _mm_slli_epi64 (v11, 16); 95 __m128i v13 = _mm_sub_epi16 (v10, v12); 96 return v13; 97 } 98 99 static inline int u32toa_small(char *out, uint32_t val) { 100 int n = 0; 101 uint32_t d1 = (val / 100) << 1; 102 uint32_t d2 = (val % 100) << 1; 103 104 /* 1000-th digit */ 105 if (val >= 1000) { 106 out[n++] = Digits[d1]; 107 } 108 109 /* 100-th digit */ 110 if (val >= 100) { 111 out[n++] = Digits[d1 + 1]; 112 } 113 114 /* 10-th digit */ 115 if (val >= 10) { 116 out[n++] = Digits[d2]; 117 } 118 119 /* last digit */ 120 out[n++] = Digits[d2 + 1]; 121 return n; 122 } 123 124 static inline int u32toa_medium(char *out, uint32_t val) { 125 int n = 0; 126 uint32_t b = val / 10000; 127 uint32_t c = val % 10000; 128 uint32_t d1 = (b / 100) << 1; 129 uint32_t d2 = (b % 100) << 1; 130 uint32_t d3 = (c / 100) << 1; 131 uint32_t d4 = (c % 100) << 1; 132 133 /* 10000000-th digit */ 134 if (val >= 10000000) { 135 out[n++] = Digits[d1]; 136 } 137 138 /* 1000000-th digit */ 139 if (val >= 1000000) { 140 out[n++] = Digits[d1 + 1]; 141 } 142 143 /* 100000-th digit */ 144 if (val >= 100000) { 145 out[n++] = Digits[d2]; 146 } 147 148 /* remaining digits */ 149 out[n++] = Digits[d2 + 1]; 150 out[n++] = Digits[d3]; 151 out[n++] = Digits[d3 + 1]; 152 out[n++] = Digits[d4]; 153 out[n++] = Digits[d4 + 1]; 154 return n; 155 } 156 157 static inline int u64toa_large_sse2(char *out, uint64_t val) { 158 uint32_t a = (uint32_t)(val / 100000000); 159 uint32_t b = (uint32_t)(val % 100000000); 160 161 /* convert to digits */ 162 __m128i v0 = itoa8_sse2(a); 163 __m128i v1 = itoa8_sse2(b); 164 165 /* convert to bytes, add '0' */ 166 __m128i v2 = _mm_packus_epi16 (v0, v1); 167 __m128i v3 = _mm_add_epi8 (v2, as_m128v(Vec16xA0)); 168 169 /* count number of digit */ 170 __m128i v4 = _mm_cmpeq_epi8 (v3, as_m128v(Vec16xA0)); 171 uint32_t bm = _mm_movemask_epi8 (v4); 172 uint32_t nd = __builtin_ctz (~bm | 0x8000); 173 174 /* shift digits to the beginning */ 175 __m128i p = _mm_loadu_si128 (as_m128c(&VecShiftShuffles[nd * 16])); 176 __m128i r = _mm_shuffle_epi8 (v3, p); 177 178 /* store the result */ 179 _mm_storeu_si128(as_m128p(out), r); 180 return 16 - nd; 181 } 182 183 static inline int u64toa_xlarge_sse2(char *out, uint64_t val) { 184 int n = 0; 185 uint64_t b = val % 10000000000000000; 186 uint32_t a = (uint32_t)(val / 10000000000000000); 187 188 /* the highest 4 digits */ 189 if (a < 10) { 190 n = itoa1(out, n, a); 191 } else if (a < 100) { 192 n = itoa2(out, n, a << 1); 193 } else if (a < 1000) { 194 n = itoa1(out, n, a / 100); 195 n = itoa2(out, n, (a % 100) << 1); 196 } else { 197 n = itoa2(out, n, (a / 100) << 1); 198 n = itoa2(out, n, (a % 100) << 1); 199 } 200 201 /* remaining digits */ 202 __m128i v0 = itoa8_sse2 ((uint32_t)(b / 100000000)); 203 __m128i v1 = itoa8_sse2 ((uint32_t)(b % 100000000)); 204 __m128i v2 = _mm_packus_epi16 (v0, v1); 205 __m128i v3 = _mm_add_epi8 (v2, as_m128v(Vec16xA0)); 206 207 /* convert to bytes, add '0' */ 208 _mm_storeu_si128(as_m128p(&out[n]), v3); 209 return n + 16; 210 } 211 212 int i64toa(char *out, int64_t val) { 213 if (likely(val >= 0)) { 214 return u64toa(out, (uint64_t)val); 215 } else { 216 *out = '-'; 217 return u64toa(out + 1, (uint64_t)(-val)) + 1; 218 } 219 } 220 221 int u64toa(char *out, uint64_t val) { 222 if (likely(val < 10000)) { 223 return u32toa_small(out, (uint32_t)val); 224 } else if (likely(val < 100000000)) { 225 return u32toa_medium(out, (uint32_t)val); 226 } else if (likely(val < 10000000000000000)) { 227 return u64toa_large_sse2(out, val); 228 } else { 229 return u64toa_xlarge_sse2(out, val); 230 } 231 }