gitee.com/quant1x/gox@v1.7.6/num/asm/_cpp/special.cpp

#include <cmath>
#include <cstddef>
#include <x86intrin.h>

#define MAX_VECTOR_SIZE 256
#include "vectorclass.h"
#include "vectormath_exp.h"

const size_t vsize_float = 8;
const size_t vsize_double = 4;
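// A 256-bit AVX vector holds 8 floats or 4 doubles, hence the two widths above.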

template<typename T>
void Sqrt(T* __restrict x, size_t n) {
    for (size_t i = 0; i < n; i++) {
        x[i] = std::sqrt(x[i]);
    }
}

template<typename T>
void Round(T* __restrict x, size_t n) {
    for (size_t i = 0; i < n; i++) {
        x[i] = std::round(x[i]);
    }
}

template<typename T>
void Floor(T* __restrict x, size_t n) {
    for (size_t i = 0; i < n; i++) {
        x[i] = std::floor(x[i]);
    }
}

template<typename T>
void Ceil(T* __restrict x, size_t n) {
    for (size_t i = 0; i < n; i++) {
        x[i] = std::ceil(x[i]);
    }
}

void Sqrt_F64_V(double* x, size_t n) {
    Sqrt(x, n);
}

void Sqrt_F32_V(float* x, size_t n) {
    Sqrt(x, n);
}

void Round_F64_V(double* x, size_t n) {
    Round(x, n);
}

void Round_F32_V(float* x, size_t n) {
    Round(x, n);
}

void Floor_F64_V(double* x, size_t n) {
    Floor(x, n);
}

void Floor_F32_V(float* x, size_t n) {
    Floor(x, n);
}

void Ceil_F64_V(double* x, size_t n) {
    Ceil(x, n);
}

void Ceil_F32_V(float* x, size_t n) {
    Ceil(x, n);
}

// Note: the tail is computed in Go. Using store_partial would generate a memcpy call, and
// operator[] an indirect jump; neither can be translated to avo.
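// For example, with vsize_double = 4 the mask n & size_t(-4) rounds n down to a
// multiple of 4: for n = 10 the SIMD loops below cover elements 0..7 and the Go
// caller finishes elements 8..9.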

void Pow_4x_F64_V(double* __restrict x, double* __restrict y, size_t n) {
    size_t nsimd = n & size_t(-vsize_double);

    Vec4d vx, vy;
    for (size_t i = 0; i < nsimd; i += vsize_double) {
        vx.load(x + i);
        vy.load(y + i);
        [[clang::always_inline]] vx = pow(vx, vy);
        vx.store(x + i);
    }
}

void Pow_8x_F32_V(float* __restrict x, float* __restrict y, size_t n) {
    size_t nsimd = n & size_t(-vsize_float);

    Vec8f vx, vy;
    for (size_t i = 0; i < nsimd; i += vsize_float) {
        vx.load(x + i);
        vy.load(y + i);
        [[clang::always_inline]] vx = pow(vx, vy);
        vx.store(x + i);
    }
}

void PowNumber_4x_F64_V(double* __restrict x, double y, size_t n) {
    size_t nsimd = n & size_t(-vsize_double);

    Vec4d vx, vy(y);
    for (size_t i = 0; i < nsimd; i += vsize_double) {
        vx.load(x + i);
        [[clang::always_inline]] vx = pow(vx, vy);
        vx.store(x + i);
    }
}

void PowNumber_8x_F32_V(float* __restrict x, float y, size_t n) {
    size_t nsimd = n & size_t(-vsize_float);

    Vec8f vx, vy(y);
    for (size_t i = 0; i < nsimd; i += vsize_float) {
        vx.load(x + i);
        [[clang::always_inline]] vx = pow(vx, vy);
        vx.store(x + i);
    }
}
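
// Illustrative sketch only: a scalar reference for the Pow kernels above, e.g.
// for checking the SIMD bodies against std::pow in a test. PowRef is a
// hypothetical helper, not one of the exported *_V entry points.
template<typename T>
void PowRef(T* __restrict x, const T* __restrict y, size_t n) {
    for (size_t i = 0; i < n; i++) {
        x[i] = std::pow(x[i], y[i]);
    }
}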

/*
 * Adapted from https://github.com/JishinMaster/simd_utils
 *
 * Version : 0.2.2
 * Author  : JishinMaster
 * Licence : BSD-2
 */

/* yes I know, the top of this file is quite ugly */

/* __m256 is ugly to write */
typedef __m256  v8sf; // vector of 8 float (avx)
typedef __m256i v8si; // vector of 8 int   (avx)
typedef __m128i v4si; // vector of 4 int   (sse)

#define ALIGN16_BEG
#define ALIGN16_END __attribute__((aligned(16)))
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#define ALIGN64_BEG
#define ALIGN64_END __attribute__((aligned(64)))

#define AVX_LEN_BYTES 32  // size of an AVX register in bytes
#define AVX_LEN_INT16 16  // number of int16 in an AVX register
#define AVX_LEN_INT32 8   // number of int32 in an AVX register
#define AVX_LEN_FLOAT 8   // number of float in an AVX register
#define AVX_LEN_DOUBLE 4  // number of double in an AVX register

#define _PI32AVX_CONST(Name, Val)                                            \
  static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val }

_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);

/* declare some AVX constants -- why can't I figure a better way to do that? */
#define _PS256_CONST(Name, Val)                                            \
  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PI32_CONST256(Name, Val)                                            \
  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val)                                 \
  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }

#define c_cephes_L102A 3.0078125E-1f
#define c_cephes_L102B 2.48745663981195213739E-4f
#define c_cephes_L10EA 4.3359375E-1f
#define c_cephes_L10EB 7.00731903251827651129E-4f

#define INVLN10 0.4342944819032518f  // 0.4342944819f
#define INVLN2 1.4426950408889634f   // 1.44269504089f
#define LN2 0.6931471805599453094172321214581765680755001343602552541206800094f
#define LN2_DIV_LN10 0.3010299956639811952137388947244930267681898814621085413104274611f

/* For log */
_PS256_CONST(cephes_L102A, 3.0078125E-1f);
_PS256_CONST(cephes_L102B, 2.48745663981195213739E-4f);
_PS256_CONST(cephes_L10EA, 4.3359375E-1f);
_PS256_CONST(cephes_L10EB, 7.00731903251827651129E-4f);

_PS256_CONST(cephes_LOG2EA, 0.44269504088896340735992f);

_PS256_CONST(MAXLOGF, 88.72283905206835f);
_PS256_CONST(MAXLOGFDIV2, 44.361419526034176f);
_PS256_CONST(MINLOGF, -103.278929903431851103f);
_PS256_CONST(cephes_exp_minC1, -0.693359375f);
_PS256_CONST(cephes_exp_minC2, 2.12194440e-4f);
_PS256_CONST(0p625, 0.625f);

_PS256_CONST(MAXNUMF, 3.4028234663852885981170418348451692544e38f);

_PS256_CONST(1  , 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST256(0, 0);
_PI32_CONST256(1, 1);
_PI32_CONST256(inv1, ~1);
_PI32_CONST256(2, 2);
_PI32_CONST256(4, 4);
_PI32_CONST256(0x7f, 0x7f);

_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
_PS256_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
_PS256_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS256_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS256_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS256_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS256_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS256_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS256_CONST(cephes_log_q1, -2.12194440e-4);
_PS256_CONST(cephes_log_q2, 0.693359375);

/* natural logarithm computed for 8 simultaneous floats;
   returns NaN for x <= 0
*/
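/* Commentary on the scheme used below (cephes / sse_mathfun style): the input
   is split as x = m * 2^e with m in [0.5, 1), the mantissa is re-centred
   around sqrt(1/2), and log(m) is approximated by the cephes_log_p0..p8
   polynomial; the e*log(2) term is added back in two pieces
   (cephes_log_q2 + cephes_log_q1 == log(2)) to preserve precision. */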
inline v8sf log256_ps(v8sf x) {
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);  /* cut off denormalized stuff */

  // can be done with AVX2
  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);

  /* keep only the fractional part */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);

  // this is again another AVX2 instruction
  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  v8sf e = _mm256_cvtepi32_ps(imm0);

  e = _mm256_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
  x = _mm256_add_ps(x, tmp);

  v8sf z = _mm256_mul_ps(x,x);

  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
  y = _mm256_add_ps(y, tmp);


  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}

_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);

_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4);

_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);

// rewritten alternate version which properly returns MAXNUMF or 0.0 outside the boundaries
inline v8sf exp256_ps_alternate(v8sf x)
{
    v8sf z_tmp, z, fx;
    v8si n;
    v8sf xsupmaxlogf, xinfminglogf;

    xsupmaxlogf = _mm256_cmp_ps(x, *(v8sf *) _ps256_MAXLOGF, _CMP_GT_OS);
    xinfminglogf = _mm256_cmp_ps(x, *(v8sf *) _ps256_MINLOGF, _CMP_LT_OS);

    /* Express e**x = e**g 2**n
     *   = e**g e**( n loge(2) )
     *   = e**( g + n loge(2) )
     */
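    /* Commentary: below, n = floor(x * log2(e) + 0.5) selects the power of two,
       and the reduced argument g = x - n*log(2) is formed with the split
       constants cephes_exp_minC1 + cephes_exp_minC2 (their sum is -log(2)),
       which loses fewer bits than subtracting n*log(2) in one step. */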
    fx = _mm256_fmadd_ps(*(v8sf *) _ps256_cephes_LOG2EF, x, *(v8sf *) _ps256_0p5);
    z = _mm256_round_ps(fx, _MM_FROUND_FLOOR);

    x = _mm256_fmadd_ps(z, *(v8sf *) _ps256_cephes_exp_minC1, x);
    x = _mm256_fmadd_ps(z, *(v8sf *) _ps256_cephes_exp_minC2, x);

    n = _mm256_cvttps_epi32(z);
    n = _mm256_add_epi32(n, *(v8si *) _pi32_256_0x7f);
    n = _mm256_slli_epi32(n, 23);
    v8sf pow2n = _mm256_castsi256_ps(n);

    z = _mm256_mul_ps(x, x);

    z_tmp = _mm256_fmadd_ps(*(v8sf *) _ps256_cephes_exp_p0, x, *(v8sf *) _ps256_cephes_exp_p1);
    z_tmp = _mm256_fmadd_ps(z_tmp, x, *(v8sf *) _ps256_cephes_exp_p2);
    z_tmp = _mm256_fmadd_ps(z_tmp, x, *(v8sf *) _ps256_cephes_exp_p3);
    z_tmp = _mm256_fmadd_ps(z_tmp, x, *(v8sf *) _ps256_cephes_exp_p4);
    z_tmp = _mm256_fmadd_ps(z_tmp, x, *(v8sf *) _ps256_cephes_exp_p5);
    z_tmp = _mm256_fmadd_ps(z_tmp, z, x);
    z_tmp = _mm256_add_ps(z_tmp, *(v8sf *) _ps256_1);

    /* scale by 2^n */
    z_tmp = _mm256_mul_ps(z_tmp, pow2n);

    z = _mm256_blendv_ps(z_tmp, *(v8sf *) _ps256_MAXNUMF, xsupmaxlogf);
    z = _mm256_blendv_ps(z, _mm256_setzero_ps(), xinfminglogf);

    return z;
}
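
// Hedged sketch (hypothetical helper, not used elsewhere in this file): the
// 8-wide exp kernel above composes with a constant multiply to give other
// bases, e.g. 10^x = e^(x * ln(10)).
static inline v8sf exp10_256_ps_sketch(v8sf x) {
    const v8sf ln10 = _mm256_set1_ps(2.302585092994046f); // ln(10)
    return exp256_ps_alternate(_mm256_mul_ps(x, ln10));
}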

_PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS256_CONST(sincof_p0, -1.9515295891E-4);
_PS256_CONST(sincof_p1,  8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
_PS256_CONST(coscof_p0,  2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
_PS256_CONST(coscof_p2,  4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI

inline void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynomial selection mask for the sine */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);

  //v8sf poly_mask = _mm256_castsi256_ps(imm2);
  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
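  /* Note: minus_cephes_DP1 + DP2 + DP3 == -Pi/4 split across three floats
     (Cody-Waite style reduction), so subtracting the pieces one at a time
     keeps more bits of the reduced argument than a single multiply-subtract. */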
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x,x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynomial  (0 <= x <= Pi/4) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2,ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1,ysin2);
  xmm2 = _mm256_add_ps(y,y2);

  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
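
// Hedged sketch (hypothetical helper, not exported): tangent for one 8-wide
// vector via the sincos kernel above, shown only to illustrate how it composes.
static inline v8sf tan256_ps_sketch(v8sf x) {
    v8sf s, c;
    sincos256_ps(x, &s, &c);
    return _mm256_div_ps(s, c); // tan = sin / cos
}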

/* These are for a 24-bit significand: */
static const float minus_cephes_DP1 = -0.78515625f;
static const float minus_cephes_DP2 = -2.4187564849853515625e-4f;
static const float minus_cephes_DP3 = -3.77489497744594108e-8f;
static const float lossth = 8192.f;

static const float T24M1 = 16777215.f;
static const float FOPI = 1.27323954473516f;
static const float PIO4F = 0.7853981633974483096f;

static const float sincof[] = {-1.9515295891E-4f, 8.3321608736E-3f, -1.6666654611E-1f};
static const float coscof[] = {2.443315711809948E-5f, -1.388731625493765E-3f,
                               4.166664568298827E-2f};

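/* mysincosf is the scalar cephes-style fallback; Sin_F32_V, Cos_F32_V and
   SinCos_F32_V below use it for the remaining elements after the 8-wide
   SIMD loop. */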
inline int mysincosf(float xx, float *s, float *c)
{
    float x, y, y1, y2, z;
    int j, sign_sin, sign_cos;

    sign_sin = 1;
    sign_cos = 1;

    x = xx;
    if (xx < 0) {
        sign_sin = -1;
        x = -xx;
    }
    if (x > T24M1) {
        return 0; /* total loss of precision */
    }
    j = FOPI * x; /* integer part of x/(PI/4) */
    y = j;
    /* map zeros to origin */
    if (j & 1) {
        j += 1;
        y += 1.0f;
    }
    j &= 7; /* octant modulo 360 degrees */
    /* reflect in x axis */
    if (j > 3) {
        sign_sin = -sign_sin;
        sign_cos = -sign_cos;
        j -= 4;
    }

    if (j > 1)
        sign_cos = -sign_cos;

    if (x > lossth) {
        x = x - y * PIO4F;
    } else {
        /* Extended precision modular arithmetic */
        x = ((x + y * minus_cephes_DP1) + y * minus_cephes_DP2) + y * minus_cephes_DP3;
    }
    /*einits();*/
    z = x * x;

    /* measured relative error in +/- pi/4 is 7.8e-8 */
    y1 = ((coscof[0] * z + coscof[1]) * z + coscof[2]) * z * z;
    y1 -= 0.5f * z;
    y1 += 1.0f;

    /* Theoretical relative error = 3.8e-9 in [-pi/4, +pi/4] */
    y2 = ((sincof[0] * z + sincof[1]) * z + sincof[2]) * z * x;
    y2 += x;

    if ((j == 1) || (j == 2)) {
        *s = y1;
        *c = y2;
    } else {
        *s = y2;
        *c = y1;
    }

    /*einitd();*/
    if (sign_sin < 0) {
        *s = -(*s);
    }
    if (sign_cos < 0) {
        *c = -(*c);
    }

    return 0;
}
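// The *_Len8x kernels below step through the array in 8-wide chunks with no
// scalar tail, so (per the tail note near the top of this file) the caller is
// expected to pass a length already truncated to a multiple of AVX_LEN_FLOAT
// and to finish the remainder on the Go side.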
void Log10_Len8x_F32_V(float* x, size_t n)
{
    const v8sf invln10f = _mm256_set1_ps((float) INVLN10);  //_mm256_broadcast_ss(&invln10f_mask);

    for (size_t i = 0; i < n; i += AVX_LEN_FLOAT) {
        v8sf src_tmp = log256_ps(_mm256_loadu_ps(x + i));
        _mm256_storeu_ps(x + i, _mm256_mul_ps(src_tmp, invln10f));
    }
}

void Log2_Len8x_F32_V(float* x, size_t n)
{
    const v8sf invln2f = _mm256_set1_ps((float) INVLN2);

    for (size_t i = 0; i < n; i += AVX_LEN_FLOAT) {
        v8sf src_tmp = log256_ps(_mm256_loadu_ps(x + i));
        _mm256_storeu_ps(x + i, _mm256_mul_ps(src_tmp, invln2f));
    }
}

void Log_Len8x_F32_V(float* x, size_t n)
{
    for (size_t i = 0; i < n; i += AVX_LEN_FLOAT) {
        _mm256_storeu_ps(x + i, log256_ps(_mm256_loadu_ps(x + i)));
    }
}

void Exp_Len8x_F32_V(float* x, size_t n)
{
    for (size_t i = 0; i < n; i += AVX_LEN_FLOAT) {
        _mm256_storeu_ps(x + i, exp256_ps_alternate(_mm256_loadu_ps(x + i)));
    }
}

void Sin_F32_V(float* x, size_t n) {
    size_t stop_len = n / AVX_LEN_FLOAT;
    stop_len *= AVX_LEN_FLOAT;

    for (size_t i = 0; i < stop_len; i += AVX_LEN_FLOAT) {
        v8sf src_tmp = _mm256_loadu_ps(x + i);
        v8sf dst_sin_tmp;
        v8sf dst_cos_tmp;
        sincos256_ps(src_tmp, &dst_sin_tmp, &dst_cos_tmp);
        _mm256_storeu_ps(x + i, dst_sin_tmp);
    }
    for (size_t i = stop_len; i < n; i++) {
        float dst_cos_tmp;
        mysincosf(x[i], x + i, &dst_cos_tmp);
    }
}

void Cos_F32_V(float* x, size_t n) {
    size_t stop_len = n / AVX_LEN_FLOAT;
    stop_len *= AVX_LEN_FLOAT;

    for (size_t i = 0; i < stop_len; i += AVX_LEN_FLOAT) {
        v8sf src_tmp = _mm256_loadu_ps(x + i);
        v8sf dst_sin_tmp;
        v8sf dst_cos_tmp;
        sincos256_ps(src_tmp, &dst_sin_tmp, &dst_cos_tmp);
        _mm256_storeu_ps(x + i, dst_cos_tmp);
    }
    for (size_t i = stop_len; i < n; i++) {
        float dst_sin_tmp;
        mysincosf(x[i], &dst_sin_tmp, x + i);
    }
}

void SinCos_F32_V(float* dst_sin, float* dst_cos, float* x, size_t n) {
    size_t stop_len = n / AVX_LEN_FLOAT;
    stop_len *= AVX_LEN_FLOAT;

    for (size_t i = 0; i < stop_len; i += AVX_LEN_FLOAT) {
        v8sf src_tmp = _mm256_loadu_ps(x + i);
        v8sf dst_sin_tmp;
        v8sf dst_cos_tmp;
        sincos256_ps(src_tmp, &dst_sin_tmp, &dst_cos_tmp);
        _mm256_storeu_ps(dst_sin + i, dst_sin_tmp);
        _mm256_storeu_ps(dst_cos + i, dst_cos_tmp);
    }
    for (size_t i = stop_len; i < n; i++) {
        mysincosf(x[i], dst_sin + i, dst_cos + i);
    }
}
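
// Illustrative usage sketch (not part of this file): Sin_F32_V(buf, n) handles
// any n, doing n/8 iterations of sincos256_ps plus up to 7 scalar mysincosf
// calls, whereas Exp_Len8x_F32_V(buf, n) expects n to already be a multiple of
// 8 (see the note above about the tail being computed in Go).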