github.com/bytedance/sonic@v1.11.7-0.20240517092252-d2edb31b167b/native/fastint.h (about)

     1  /*
     2   * Copyright 2021 ByteDance Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  #pragma once
    17  
    18  #include "native.h"
    19  #include "tab.h"
    20  
    21  static const char Vec16xA0[16] __attribute__((aligned(16))) = {
    22      '0',
    23      '0',
    24      '0',
    25      '0',
    26      '0',
    27      '0',
    28      '0',
    29      '0',
    30      '0',
    31      '0',
    32      '0',
    33      '0',
    34      '0',
    35      '0',
    36      '0',
    37      '0',
    38  };
    39  
    40  static const uint16_t Vec8x10[8] __attribute__((aligned(16))) = {
    41      10,
    42      10,
    43      10,
    44      10,
    45      10,
    46      10,
    47      10,
    48      10,
    49  };
    50  
    51  static const uint32_t Vec4x10k[4] __attribute__((aligned(16))) = {
    52      10000,
    53      10000,
    54      10000,
    55      10000,
    56  };
    57  
    58  static const uint32_t Vec4xDiv10k[4] __attribute__((aligned(16))) = {
    59      0xd1b71759,
    60      0xd1b71759,
    61      0xd1b71759,
    62      0xd1b71759,
    63  };
    64  
    65  static const uint16_t VecDivPowers[8] __attribute__((aligned(16))) = {
    66      0x20c5,
    67      0x147b,
    68      0x3334,
    69      0x8000,
    70      0x20c5,
    71      0x147b,
    72      0x3334,
    73      0x8000,
    74  };
    75  
    76  static const uint16_t VecShiftPowers[8] __attribute__((aligned(16))) = {
    77      0x0080,
    78      0x0800,
    79      0x2000,
    80      0x8000,
    81      0x0080,
    82      0x0800,
    83      0x2000,
    84      0x8000,
    85  };
    86  
    87  static const uint8_t VecShiftShuffles[144] __attribute__((aligned(16))) = {
    88      0x00,
    89      0x01,
    90      0x02,
    91      0x03,
    92      0x04,
    93      0x05,
    94      0x06,
    95      0x07,
    96      0x08,
    97      0x09,
    98      0x0a,
    99      0x0b,
   100      0x0c,
   101      0x0d,
   102      0x0e,
   103      0x0f,
   104      0x01,
   105      0x02,
   106      0x03,
   107      0x04,
   108      0x05,
   109      0x06,
   110      0x07,
   111      0x08,
   112      0x09,
   113      0x0a,
   114      0x0b,
   115      0x0c,
   116      0x0d,
   117      0x0e,
   118      0x0f,
   119      0xff,
   120      0x02,
   121      0x03,
   122      0x04,
   123      0x05,
   124      0x06,
   125      0x07,
   126      0x08,
   127      0x09,
   128      0x0a,
   129      0x0b,
   130      0x0c,
   131      0x0d,
   132      0x0e,
   133      0x0f,
   134      0xff,
   135      0xff,
   136      0x03,
   137      0x04,
   138      0x05,
   139      0x06,
   140      0x07,
   141      0x08,
   142      0x09,
   143      0x0a,
   144      0x0b,
   145      0x0c,
   146      0x0d,
   147      0x0e,
   148      0x0f,
   149      0xff,
   150      0xff,
   151      0xff,
   152      0x04,
   153      0x05,
   154      0x06,
   155      0x07,
   156      0x08,
   157      0x09,
   158      0x0a,
   159      0x0b,
   160      0x0c,
   161      0x0d,
   162      0x0e,
   163      0x0f,
   164      0xff,
   165      0xff,
   166      0xff,
   167      0xff,
   168      0x05,
   169      0x06,
   170      0x07,
   171      0x08,
   172      0x09,
   173      0x0a,
   174      0x0b,
   175      0x0c,
   176      0x0d,
   177      0x0e,
   178      0x0f,
   179      0xff,
   180      0xff,
   181      0xff,
   182      0xff,
   183      0xff,
   184      0x06,
   185      0x07,
   186      0x08,
   187      0x09,
   188      0x0a,
   189      0x0b,
   190      0x0c,
   191      0x0d,
   192      0x0e,
   193      0x0f,
   194      0xff,
   195      0xff,
   196      0xff,
   197      0xff,
   198      0xff,
   199      0xff,
   200      0x07,
   201      0x08,
   202      0x09,
   203      0x0a,
   204      0x0b,
   205      0x0c,
   206      0x0d,
   207      0x0e,
   208      0x0f,
   209      0xff,
   210      0xff,
   211      0xff,
   212      0xff,
   213      0xff,
   214      0xff,
   215      0xff,
   216      0x08,
   217      0x09,
   218      0x0a,
   219      0x0b,
   220      0x0c,
   221      0x0d,
   222      0x0e,
   223      0x0f,
   224      0xff,
   225      0xff,
   226      0xff,
   227      0xff,
   228      0xff,
   229      0xff,
   230      0xff,
   231      0xff,
   232  };
   233  
   234  static always_inline int itoa1(char *out, int n, uint32_t v)
   235  {
   236      out[n++] = (char)v + '0';
   237      return n;
   238  }
   239  
   240  static always_inline int itoa2(char *out, int n, uint32_t v)
   241  {
   242      out[n++] = Digits[v];
   243      out[n++] = Digits[v + 1];
   244      return n;
   245  }
   246  
   247  static always_inline __m128i itoa8_sse2(uint32_t v)
   248  {
   249      __m128i v00 = _mm_cvtsi32_si128(v);
   250      __m128i v01 = _mm_mul_epu32(v00, as_m128v(Vec4xDiv10k));
   251      __m128i v02 = _mm_srli_epi64(v01, 45);
   252      __m128i v03 = _mm_mul_epu32(v02, as_m128v(Vec4x10k));
   253      __m128i v04 = _mm_sub_epi32(v00, v03);
   254      __m128i v05 = _mm_unpacklo_epi16(v02, v04);
   255      __m128i v06 = _mm_slli_epi64(v05, 2);
   256      __m128i v07 = _mm_unpacklo_epi16(v06, v06);
   257      __m128i v08 = _mm_unpacklo_epi32(v07, v07);
   258      __m128i v09 = _mm_mulhi_epu16(v08, as_m128v(VecDivPowers));
   259      __m128i v10 = _mm_mulhi_epu16(v09, as_m128v(VecShiftPowers));
   260      __m128i v11 = _mm_mullo_epi16(v10, as_m128v(Vec8x10));
   261      __m128i v12 = _mm_slli_epi64(v11, 16);
   262      __m128i v13 = _mm_sub_epi16(v10, v12);
   263      return v13;
   264  }
   265  
   266  static always_inline int u32toa_small(char *out, uint32_t val)
   267  {
   268      int n = 0;
   269      uint32_t d1 = (val / 100) << 1;
   270      uint32_t d2 = (val % 100) << 1;
   271  
   272      /* 1000-th digit */
   273      if (val >= 1000)
   274      {
   275          out[n++] = Digits[d1];
   276      }
   277  
   278      /* 100-th digit */
   279      if (val >= 100)
   280      {
   281          out[n++] = Digits[d1 + 1];
   282      }
   283  
   284      /* 10-th digit */
   285      if (val >= 10)
   286      {
   287          out[n++] = Digits[d2];
   288      }
   289  
   290      /* last digit */
   291      out[n++] = Digits[d2 + 1];
   292      return n;
   293  }
   294  
   295  static always_inline int u32toa_medium(char *out, uint32_t val)
   296  {
   297      int n = 0;
   298      uint32_t b = val / 10000;
   299      uint32_t c = val % 10000;
   300      uint32_t d1 = (b / 100) << 1;
   301      uint32_t d2 = (b % 100) << 1;
   302      uint32_t d3 = (c / 100) << 1;
   303      uint32_t d4 = (c % 100) << 1;
   304  
   305      /* 10000000-th digit */
   306      if (val >= 10000000)
   307      {
   308          out[n++] = Digits[d1];
   309      }
   310  
   311      /* 1000000-th digit */
   312      if (val >= 1000000)
   313      {
   314          out[n++] = Digits[d1 + 1];
   315      }
   316  
   317      /* 100000-th digit */
   318      if (val >= 100000)
   319      {
   320          out[n++] = Digits[d2];
   321      }
   322  
   323      /* remaining digits */
   324      out[n++] = Digits[d2 + 1];
   325      out[n++] = Digits[d3];
   326      out[n++] = Digits[d3 + 1];
   327      out[n++] = Digits[d4];
   328      out[n++] = Digits[d4 + 1];
   329      return n;
   330  }
   331  
   332  static always_inline int u64toa_large_sse2(char *out, uint64_t val)
   333  {
   334      uint32_t a = (uint32_t)(val / 100000000);
   335      uint32_t b = (uint32_t)(val % 100000000);
   336  
   337      /* convert to digits */
   338      __m128i v0 = itoa8_sse2(a);
   339      __m128i v1 = itoa8_sse2(b);
   340  
   341      /* convert to bytes, add '0' */
   342      __m128i v2 = _mm_packus_epi16(v0, v1);
   343      __m128i v3 = _mm_add_epi8(v2, as_m128v(Vec16xA0));
   344  
   345      /* count number of digit */
   346      __m128i v4 = _mm_cmpeq_epi8(v3, as_m128v(Vec16xA0));
   347      uint32_t bm = _mm_movemask_epi8(v4);
   348      uint32_t nd = __builtin_ctz(~bm | 0x8000);
   349  
   350      /* shift digits to the beginning */
   351      __m128i p = _mm_loadu_si128(as_m128c(&VecShiftShuffles[nd * 16]));
   352      __m128i r = _mm_shuffle_epi8(v3, p);
   353  
   354      /* store the result */
   355      _mm_storeu_si128(as_m128p(out), r);
   356      return 16 - nd;
   357  }
   358  
   359  static always_inline int u64toa_xlarge_sse2(char *out, uint64_t val)
   360  {
   361      int n = 0;
   362      uint64_t b = val % 10000000000000000;
   363      uint32_t a = (uint32_t)(val / 10000000000000000);
   364  
   365      /* the highest 4 digits */
   366      if (a < 10)
   367      {
   368          n = itoa1(out, n, a);
   369      }
   370      else if (a < 100)
   371      {
   372          n = itoa2(out, n, a << 1);
   373      }
   374      else if (a < 1000)
   375      {
   376          n = itoa1(out, n, a / 100);
   377          n = itoa2(out, n, (a % 100) << 1);
   378      }
   379      else
   380      {
   381          n = itoa2(out, n, (a / 100) << 1);
   382          n = itoa2(out, n, (a % 100) << 1);
   383      }
   384  
   385      /* remaining digits */
   386      __m128i v0 = itoa8_sse2((uint32_t)(b / 100000000));
   387      __m128i v1 = itoa8_sse2((uint32_t)(b % 100000000));
   388      __m128i v2 = _mm_packus_epi16(v0, v1);
   389      __m128i v3 = _mm_add_epi8(v2, as_m128v(Vec16xA0));
   390  
   391      /* convert to bytes, add '0' */
   392      _mm_storeu_si128(as_m128p(&out[n]), v3);
   393      return n + 16;
   394  }
   395  static always_inline int u64toa_1(char *out, uint64_t val)
   396  {
   397      if (likely(val < 10000))
   398      {
   399          return u32toa_small(out, (uint32_t)val);
   400      }
   401      else if (likely(val < 100000000))
   402      {
   403          return u32toa_medium(out, (uint32_t)val);
   404      }
   405      else if (likely(val < 10000000000000000))
   406      {
   407          return u64toa_large_sse2(out, val);
   408      }
   409      else
   410      {
   411          return u64toa_xlarge_sse2(out, val);
   412      }
   413  }
   414  
   415  static always_inline int i64toa_1(char *out, int64_t val)
   416  {
   417      if (likely(val >= 0))
   418      {
   419          return u64toa_1(out, (uint64_t)val);
   420      }
   421      else
   422      {
   423          *out = '-';
   424          return u64toa_1(out + 1, (uint64_t)(-val)) + 1;
   425      }
   426  }