gitee.com/quant1x/gox@v1.7.6/num/asm/_cpp/special.cpp

#include <cmath>
#include <cstddef>
#include <x86intrin.h>

#define MAX_VECTOR_SIZE 256
#include "vectorclass.h"
#include "vectormath_exp.h"

const size_t vsize_float = 8;
const size_t vsize_double = 4;

template<typename T>
void Sqrt(T* __restrict x, size_t n) {
    for (size_t i = 0; i < n; i++) {
        x[i] = std::sqrt(x[i]);
    }
}

template<typename T>
void Round(T* __restrict x, size_t n) {
    for (size_t i = 0; i < n; i++) {
        x[i] = std::round(x[i]);
    }
}

template<typename T>
void Floor(T* __restrict x, size_t n) {
    for (size_t i = 0; i < n; i++) {
        x[i] = std::floor(x[i]);
    }
}

template<typename T>
void Ceil(T* __restrict x, size_t n) {
    for (size_t i = 0; i < n; i++) {
        x[i] = std::ceil(x[i]);
    }
}

void Sqrt_F64_V(double* x, size_t n) {
    Sqrt(x, n);
}

void Sqrt_F32_V(float* x, size_t n) {
    Sqrt(x, n);
}

void Round_F64_V(double* x, size_t n) {
    Round(x, n);
}

void Round_F32_V(float* x, size_t n) {
    Round(x, n);
}

void Floor_F64_V(double* x, size_t n) {
    Floor(x, n);
}

void Floor_F32_V(float* x, size_t n) {
    Floor(x, n);
}

void Ceil_F64_V(double* x, size_t n) {
    Ceil(x, n);
}

void Ceil_F32_V(float* x, size_t n) {
    Ceil(x, n);
}

// Note: Tail is computed in Go. Using store_partial would generate a memcpy call and
// operator[] an indirect jump. Neither can be translated to avo.

void Pow_4x_F64_V(double* __restrict x, double* __restrict y, size_t n) {
    size_t nsimd = n & size_t(-vsize_double);

    Vec4d vx, vy;
    for (size_t i = 0; i < nsimd; i += vsize_double) {
        vx.load(x + i);
        vy.load(y + i);
        [[clang::always_inline]] vx = pow(vx, vy);
        vx.store(x + i);
    }
}

void Pow_8x_F32_V(float* __restrict x, float* __restrict y, size_t n) {
    size_t nsimd = n & size_t(-vsize_float);

    Vec8f vx, vy;
    for (size_t i = 0; i < nsimd; i += vsize_float) {
        vx.load(x + i);
        vy.load(y + i);
        [[clang::always_inline]] vx = pow(vx, vy);
        vx.store(x + i);
    }
}

void PowNumber_4x_F64_V(double* __restrict x, double y, size_t n) {
    size_t nsimd = n & size_t(-vsize_double);

    Vec4d vx, vy(y);
    for (size_t i = 0; i < nsimd; i += vsize_double) {
        vx.load(x + i);
        [[clang::always_inline]] vx = pow(vx, vy);
        vx.store(x + i);
    }
}

void PowNumber_8x_F32_V(float* __restrict x, float y, size_t n) {
    size_t nsimd = n & size_t(-vsize_float);

    Vec8f vx, vy(y);
    for (size_t i = 0; i < nsimd; i += vsize_float) {
        vx.load(x + i);
        [[clang::always_inline]] vx = pow(vx, vy);
        vx.store(x + i);
    }
}
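// Illustrative sketch, not part of the original kernels: the mask `n & size_t(-vsize)`
// rounds n down to the nearest multiple of the vector width, so the loops above touch
// only full 8-float / 4-double lanes. The remaining n - nsimd elements (the "tail") are
// computed by the Go caller, as the note above explains. Conceptually the combination
// behaves like the hypothetical helper below (the name is made up for illustration).
#if 0
void PowNumber_F32_with_tail(float* x, float y, size_t n) {
    PowNumber_8x_F32_V(x, y, n);                       // vector body: processes n & ~7 elements
    for (size_t i = n & size_t(-vsize_float); i < n; i++) {
        x[i] = std::pow(x[i], y);                      // scalar tail: at most 7 elements
    }
}
#endif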
/*
 * Adapted from https://github.com/JishinMaster/simd_utils
 *
 * Version : 0.2.2
 * Author  : JishinMaster
 * Licence : BSD-2
 */

/* yes I know, the top of this file is quite ugly */
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))

/* __m128 is ugly to write */
typedef __m256 v8sf;   // vector of 8 float (avx)
typedef __m256i v8si;  // vector of 8 int (avx)
typedef __m128i v4si;  // vector of 4 int (sse)

#define ALIGN16_BEG
#define ALIGN16_END __attribute__((aligned(16)))
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#define ALIGN64_BEG
#define ALIGN64_END __attribute__((aligned(64)))

#define AVX_LEN_BYTES 32   // size of an AVX lane in bytes
#define AVX_LEN_INT16 16   // number of int16 in an AVX lane
#define AVX_LEN_INT32 8    // number of int32 in an AVX lane
#define AVX_LEN_FLOAT 8    // number of float in an AVX lane
#define AVX_LEN_DOUBLE 4   // number of double in an AVX lane

#define _PI32AVX_CONST(Name, Val) \
    static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val }

_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);

/* declare some AVX constants -- why can't I figure a better way to do that? */
#define _PS256_CONST(Name, Val) \
    static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PI32_CONST256(Name, Val) \
    static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val) \
    static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }

#define c_cephes_L102A 3.0078125E-1f
#define c_cephes_L102B 2.48745663981195213739E-4f
#define c_cephes_L10EA 4.3359375E-1f
#define c_cephes_L10EB 7.00731903251827651129E-4f

#define INVLN10 0.4342944819032518f   // 0.4342944819f
#define INVLN2 1.4426950408889634f    // 1.44269504089f
#define LN2 0.6931471805599453094172321214581765680755001343602552541206800094f
#define LN2_DIV_LN10 0.3010299956639811952137388947244930267681898814621085413104274611f

/* For log */
_PS256_CONST(cephes_L102A, 3.0078125E-1f);
_PS256_CONST(cephes_L102B, 2.48745663981195213739E-4f);
_PS256_CONST(cephes_L10EA, 4.3359375E-1f);
_PS256_CONST(cephes_L10EB, 7.00731903251827651129E-4f);

_PS256_CONST(cephes_LOG2EA, 0.44269504088896340735992f);

_PS256_CONST(MAXLOGF, 88.72283905206835f);
_PS256_CONST(MAXLOGFDIV2, 44.361419526034176f);
_PS256_CONST(MINLOGF, -103.278929903431851103f);
_PS256_CONST(cephes_exp_minC1, -0.693359375f);
_PS256_CONST(cephes_exp_minC2, 2.12194440e-4f);
_PS256_CONST(0p625, 0.625f);

_PS256_CONST(MAXNUMF, 3.4028234663852885981170418348451692544e38f);

_PS256_CONST(1, 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST256(0, 0);
_PI32_CONST256(1, 1);
_PI32_CONST256(inv1, ~1);
_PI32_CONST256(2, 2);
_PI32_CONST256(4, 4);
_PI32_CONST256(0x7f, 0x7f);

_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
_PS256_CONST(cephes_log_q1, -2.12194440e-4);
_PS256_CONST(cephes_log_q2, 0.693359375);

/* natural logarithm computed for 8 simultaneous floats
   return NaN for x <= 0
*/
inline v8sf log256_ps(v8sf x) {
    v8si imm0;
    v8sf one = *(v8sf*)_ps256_1;

    //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
    v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

    x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */

    // can be done with AVX2
    imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);

    /* keep only the fractional part */
    x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
    x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);

    // this is again another AVX2 instruction
    imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
    v8sf e = _mm256_cvtepi32_ps(imm0);

    e = _mm256_add_ps(e, one);

    /* part2:
       if( x < SQRTHF ) {
         e -= 1;
         x = x + x - 1.0;
       } else { x = x - 1.0; }
    */
    //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
    v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
    v8sf tmp = _mm256_and_ps(x, mask);
    x = _mm256_sub_ps(x, one);
    e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
    x = _mm256_add_ps(x, tmp);

    v8sf z = _mm256_mul_ps(x, x);

    v8sf y = *(v8sf*)_ps256_cephes_log_p0;
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
    y = _mm256_mul_ps(y, x);

    y = _mm256_mul_ps(y, z);

    tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
    y = _mm256_add_ps(y, tmp);

    tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
    y = _mm256_sub_ps(y, tmp);

    tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
    x = _mm256_add_ps(x, y);
    x = _mm256_add_ps(x, tmp);
    x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
    return x;
}
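// Illustrative scalar mirror of log256_ps, added here only to clarify the vector code; it
// is not part of the original file and the name is hypothetical. It uses std::frexp in
// place of the exponent/mantissa bit manipulation and skips the denormal clamp, but the
// branch on SQRTHF, the p0..p8 polynomial, and the q1/q2 split of ln(2) match the
// constants declared above.
#if 0
float logf_scalar(float x) {
    if (x <= 0.0f) return NAN;                 // vector code ORs in invalid_mask -> NaN
    int e;
    float m = std::frexp(x, &e);               // x = m * 2^e, m in [0.5, 1)
    if (m < 0.707106781186547524f) {           // cephes_SQRTHF branch
        e -= 1;
        m = m + m - 1.0f;
    } else {
        m = m - 1.0f;
    }
    float z = m * m;
    float y = 7.0376836292E-2f;                // cephes_log_p0 .. p8, Horner form
    y = y * m - 1.1514610310E-1f;
    y = y * m + 1.1676998740E-1f;
    y = y * m - 1.2420140846E-1f;
    y = y * m + 1.4249322787E-1f;
    y = y * m - 1.6668057665E-1f;
    y = y * m + 2.0000714765E-1f;
    y = y * m - 2.4999993993E-1f;
    y = y * m + 3.3333331174E-1f;
    y = y * m * z;
    float fe = (float)e;
    y += fe * -2.12194440e-4f;                 // cephes_log_q1
    y -= 0.5f * z;
    return m + y + fe * 0.693359375f;          // cephes_log_q2 (q1 + q2 == ln 2)
}
#endif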
_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);

_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4);

_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);

// rewritten alternate version which properly returns MAXNUMF or 0.0 outside of boundaries
inline v8sf exp256_ps_alternate(v8sf x)
{
    v8sf z_tmp, z, fx;
    v8si n;
    v8sf xsupmaxlogf, xinfminglogf;

    xsupmaxlogf = _mm256_cmp_ps(x, *(v8sf *) _ps256_MAXLOGF, _CMP_GT_OS);
    xinfminglogf = _mm256_cmp_ps(x, *(v8sf *) _ps256_MINLOGF, _CMP_LT_OS);

    /* Express e**x = e**g 2**n
     *             = e**g e**( n loge(2) )
     *             = e**( g + n loge(2) )
     */
    fx = _mm256_fmadd_ps(*(v8sf *) _ps256_cephes_LOG2EF, x, *(v8sf *) _ps256_0p5);
    z = _mm256_round_ps(fx, _MM_FROUND_FLOOR);

    x = _mm256_fmadd_ps(z, *(v8sf *) _ps256_cephes_exp_minC1, x);
    x = _mm256_fmadd_ps(z, *(v8sf *) _ps256_cephes_exp_minC2, x);

    n = _mm256_cvttps_epi32(z);
    n = _mm256_add_epi32(n, *(v8si *) _pi32_256_0x7f);
    n = _mm256_slli_epi32(n, 23);
    v8sf pow2n = _mm256_castsi256_ps(n);

    z = _mm256_mul_ps(x, x);

    z_tmp = _mm256_fmadd_ps(*(v8sf *) _ps256_cephes_exp_p0, x, *(v8sf *) _ps256_cephes_exp_p1);
    z_tmp = _mm256_fmadd_ps(z_tmp, x, *(v8sf *) _ps256_cephes_exp_p2);
    z_tmp = _mm256_fmadd_ps(z_tmp, x, *(v8sf *) _ps256_cephes_exp_p3);
    z_tmp = _mm256_fmadd_ps(z_tmp, x, *(v8sf *) _ps256_cephes_exp_p4);
    z_tmp = _mm256_fmadd_ps(z_tmp, x, *(v8sf *) _ps256_cephes_exp_p5);
    z_tmp = _mm256_fmadd_ps(z_tmp, z, x);
    z_tmp = _mm256_add_ps(z_tmp, *(v8sf *) _ps256_1);

    /* scale by 2^n */
    z_tmp = _mm256_mul_ps(z_tmp, pow2n);

    z = _mm256_blendv_ps(z_tmp, *(v8sf *) _ps256_MAXNUMF, xsupmaxlogf);
    z = _mm256_blendv_ps(z, _mm256_setzero_ps(), xinfminglogf);

    return z;
}
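// Illustrative scalar mirror of exp256_ps_alternate, again not part of the original file
// and with a hypothetical name. It follows the same steps: clamp against MAXLOGF/MINLOGF,
// pick n = round(x / ln 2), subtract n*ln(2) in two pieces (exp_minC1/minC2), evaluate the
// p0..p5 polynomial, and scale by 2^n. std::ldexp stands in for the vector trick of
// writing (n + 127) into the exponent bits.
#if 0
float expf_scalar(float x) {
    if (x > 88.72283905206835f)       return 3.4028234663852886e38f;  // > MAXLOGF -> MAXNUMF
    if (x < -103.278929903431851103f) return 0.0f;                    // < MINLOGF -> 0
    float n = std::floor(1.44269504088896341f * x + 0.5f);            // nearest integer to x/ln(2)
    x += n * -0.693359375f;                                           // Cody-Waite: subtract n*ln(2)
    x += n *  2.12194440e-4f;                                         //   in two pieces
    float z = x * x;
    float p = 1.9875691500E-4f;                                       // cephes_exp_p0 .. p5
    p = p * x + 1.3981999507E-3f;
    p = p * x + 8.3334519073E-3f;
    p = p * x + 4.1665795894E-2f;
    p = p * x + 1.6666665459E-1f;
    p = p * x + 5.0000001201E-1f;
    float r = p * z + x + 1.0f;                                       // e^g ~= 1 + g + g^2*P(g)
    return std::ldexp(r, (int)n);                                     // multiply by 2^n
}
#endif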
_PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS256_CONST(sincof_p0, -1.9515295891E-4);
_PS256_CONST(sincof_p1, 8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
_PS256_CONST(coscof_p0, 2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
_PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI

inline void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

    v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
    v8si imm0, imm2, imm4;

    sign_bit_sin = x;
    /* take the absolute value */
    x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
    /* extract the sign bit (upper one) */
    sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

    /* scale by 4/Pi */
    y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

    /* store the integer part of y in imm2 */
    imm2 = _mm256_cvttps_epi32(y);

    /* j=(j+1) & (~1) (see the cephes sources) */
    imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
    imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

    y = _mm256_cvtepi32_ps(imm2);
    imm4 = imm2;

    /* get the swap sign flag for the sine */
    imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
    imm0 = _mm256_slli_epi32(imm0, 29);
    //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

    /* get the polynomial selection mask for the sine */
    imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
    imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);

    //v8sf poly_mask = _mm256_castsi256_ps(imm2);
    v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
    v8sf poly_mask = _mm256_castsi256_ps(imm2);

    /* The magic pass: "Extended precision modular arithmetic"
       x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
    xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
    xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
    xmm1 = _mm256_mul_ps(y, xmm1);
    xmm2 = _mm256_mul_ps(y, xmm2);
    xmm3 = _mm256_mul_ps(y, xmm3);
    x = _mm256_add_ps(x, xmm1);
    x = _mm256_add_ps(x, xmm2);
    x = _mm256_add_ps(x, xmm3);

    imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
    imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
    imm4 = _mm256_slli_epi32(imm4, 29);

    v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

    sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

    /* Evaluate the first polynomial (the cosine approximation, |x| <= Pi/4) */
    v8sf z = _mm256_mul_ps(x, x);
    y = *(v8sf*)_ps256_coscof_p0;

    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
    y = _mm256_mul_ps(y, z);
    y = _mm256_mul_ps(y, z);
    v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
    y = _mm256_sub_ps(y, tmp);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

    /* Evaluate the second polynomial (the sine approximation, |x| <= Pi/4) */

    v8sf y2 = *(v8sf*)_ps256_sincof_p0;
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_mul_ps(y2, x);
    y2 = _mm256_add_ps(y2, x);

    /* select the correct result from the two polynomials */
    xmm3 = poly_mask;
    v8sf ysin2 = _mm256_and_ps(xmm3, y2);
    v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
    y2 = _mm256_sub_ps(y2, ysin2);
    y = _mm256_sub_ps(y, ysin1);

    xmm1 = _mm256_add_ps(ysin1, ysin2);
    xmm2 = _mm256_add_ps(y, y2);

    /* update the sign */
    *s = _mm256_xor_ps(xmm1, sign_bit_sin);
    *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
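/*
 * sincos256_ps above is the vectorized counterpart of the scalar Cephes routine that
 * follows (mysincosf); both follow the same plan:
 *   1. strip the sign of x and scale |x| by 4/pi to find the octant j, then force j even;
 *   2. subtract j * pi/4 from |x| in three pieces (DP1/DP2/DP3, the "extended precision
 *      modular arithmetic" pass) so the reduced argument keeps full float accuracy;
 *   3. evaluate the cosine polynomial and the sine polynomial on the reduced argument;
 *   4. pick sine and cosine from the right polynomial per octant (poly_mask) and restore
 *      the signs (sign_bit_sin / sign_bit_cos) derived from the octant and the sign of x.
 */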
/* These are for a 24-bit significand: */
static const float minus_cephes_DP1 = -0.78515625f;
static const float minus_cephes_DP2 = -2.4187564849853515625e-4f;
static const float minus_cephes_DP3 = -3.77489497744594108e-8f;
static float lossth = 8192.;

static const float T24M1 = 16777215.f;
static const float FOPI = 1.27323954473516f;
static const float PIO4F = 0.7853981633974483096f;

static const float sincof[] = {-1.9515295891E-4f, 8.3321608736E-3f, -1.6666654611E-1f};
static const float coscof[] = {2.443315711809948E-5f, -1.388731625493765E-3f,
                               4.166664568298827E-2f};

inline int mysincosf(float xx, float *s, float *c)
{
    float x, y, y1, y2, z;
    int j, sign_sin, sign_cos;

    sign_sin = 1;
    sign_cos = 1;

    x = xx;
    if (xx < 0) {
        sign_sin = -1;
        x = -xx;
    }
    if (x > T24M1) {
        return 0; /* total loss of precision; outputs are left untouched */
    }
    j = FOPI * x; /* integer part of x/(PI/4) */
    y = j;
    /* map zeros to origin */
    if (j & 1) {
        j += 1;
        y += 1.0f;
    }
    j &= 7; /* octant modulo 360 degrees */
    /* reflect in x axis */
    if (j > 3) {
        sign_sin = -sign_sin;
        sign_cos = -sign_cos;
        j -= 4;
    }

    if (j > 1)
        sign_cos = -sign_cos;

    if (x > lossth) {
        x = x - y * PIO4F;
    } else {
        /* Extended precision modular arithmetic */
        x = ((x + y * minus_cephes_DP1) + y * minus_cephes_DP2) + y * minus_cephes_DP3;
    }
    /*einits();*/
    z = x * x;

    /* measured relative error in +/- pi/4 is 7.8e-8 */
    y1 = ((coscof[0] * z + coscof[1]) * z + coscof[2]) * z * z;
    y1 -= 0.5f * z;
    y1 += 1.0f;

    /* Theoretical relative error = 3.8e-9 in [-pi/4, +pi/4] */
    y2 = ((sincof[0] * z + sincof[1]) * z + sincof[2]) * z * x;
    y2 += x;

    if ((j == 1) || (j == 2)) {
        *s = y1;
        *c = y2;
    } else {
        *s = y2;
        *c = y1;
    }

    /*einitd();*/
    if (sign_sin < 0) {
        *s = -(*s);
    }
    if (sign_cos < 0) {
        *c = -(*c);
    }

    return 0;
}
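/*
 * mysincosf is the scalar fallback used by Sin_F32_V, Cos_F32_V and SinCos_F32_V below
 * for the last n % 8 elements that do not fill an AVX lane. Note that for |x| > T24M1
 * (2^24 - 1) it returns early without writing *s or *c, since single-precision argument
 * reduction has lost all significance by that point.
 */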
void Log10_Len8x_F32_V(float* x, size_t n)
{
    const v8sf invln10f = _mm256_set1_ps((float) INVLN10); //_mm256_broadcast_ss(&invln10f_mask);

    for (size_t i = 0; i < n; i += AVX_LEN_FLOAT) {
        v8sf src_tmp = log256_ps(_mm256_loadu_ps(x + i));
        _mm256_storeu_ps(x + i, _mm256_mul_ps(src_tmp, invln10f));
    }
}

void Log2_Len8x_F32_V(float* x, size_t n)
{
    const v8sf invln2f = _mm256_set1_ps((float) INVLN2); //_mm256_broadcast_ss(&invln2f_mask);

    for (size_t i = 0; i < n; i += AVX_LEN_FLOAT) {
        v8sf src_tmp = log256_ps(_mm256_loadu_ps(x + i));
        _mm256_storeu_ps(x + i, _mm256_mul_ps(src_tmp, invln2f));
    }
}

void Log_Len8x_F32_V(float* x, size_t n)
{
    for (size_t i = 0; i < n; i += AVX_LEN_FLOAT) {
        _mm256_storeu_ps(x + i, log256_ps(_mm256_loadu_ps(x + i)));
    }
}

void Exp_Len8x_F32_V(float* x, size_t n)
{
    for (size_t i = 0; i < n; i += AVX_LEN_FLOAT) {
        _mm256_storeu_ps(x + i, exp256_ps_alternate(_mm256_loadu_ps(x + i)));
    }
}

void Sin_F32_V(float* x, size_t n) {
    size_t stop_len = n / AVX_LEN_FLOAT;
    stop_len *= AVX_LEN_FLOAT;

    for (size_t i = 0; i < stop_len; i += AVX_LEN_FLOAT) {
        v8sf src_tmp = _mm256_loadu_ps(x + i);
        v8sf dst_sin_tmp;
        v8sf dst_cos_tmp;
        sincos256_ps(src_tmp, &dst_sin_tmp, &dst_cos_tmp);
        _mm256_storeu_ps(x + i, dst_sin_tmp);
    }
    for (size_t i = stop_len; i < n; i++) {
        float dst_cos_tmp;
        mysincosf(x[i], x + i, &dst_cos_tmp);
    }
}

void Cos_F32_V(float* x, size_t n) {
    size_t stop_len = n / AVX_LEN_FLOAT;
    stop_len *= AVX_LEN_FLOAT;

    for (size_t i = 0; i < stop_len; i += AVX_LEN_FLOAT) {
        v8sf src_tmp = _mm256_loadu_ps(x + i);
        v8sf dst_sin_tmp;
        v8sf dst_cos_tmp;
        sincos256_ps(src_tmp, &dst_sin_tmp, &dst_cos_tmp);
        _mm256_storeu_ps(x + i, dst_cos_tmp);
    }
    for (size_t i = stop_len; i < n; i++) {
        float dst_sin_tmp;
        mysincosf(x[i], &dst_sin_tmp, x + i);
    }
}

void SinCos_F32_V(float* dst_sin, float* dst_cos, float* x, size_t n) {
    size_t stop_len = n / AVX_LEN_FLOAT;
    stop_len *= AVX_LEN_FLOAT;

    for (size_t i = 0; i < stop_len; i += AVX_LEN_FLOAT) {
        v8sf src_tmp = _mm256_loadu_ps(x + i);
        v8sf dst_sin_tmp;
        v8sf dst_cos_tmp;
        sincos256_ps(src_tmp, &dst_sin_tmp, &dst_cos_tmp);
        _mm256_storeu_ps(dst_sin + i, dst_sin_tmp);
        _mm256_storeu_ps(dst_cos + i, dst_cos_tmp);
    }
    for (size_t i = stop_len; i < n; i++) {
        mysincosf(x[i], dst_sin + i, dst_cos + i);
    }
}
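// Illustrative usage, not part of the original file; the helper name is hypothetical.
// The *_Len8x_* kernels step through the buffer 8 floats at a time and do not guard the
// tail themselves, so the caller (the Go side in this package) is expected to pass a
// length that is a multiple of 8 and finish the remainder with scalar math. Sin_F32_V,
// Cos_F32_V and SinCos_F32_V handle their own tails via mysincosf.
#if 0
#include <vector>

void exp_inplace(std::vector<float>& v) {
    size_t n8 = v.size() & size_t(-AVX_LEN_FLOAT);  // full AVX lanes only
    Exp_Len8x_F32_V(v.data(), n8);                  // vector body
    for (size_t i = n8; i < v.size(); i++) {
        v[i] = std::exp(v[i]);                      // scalar tail
    }
}
#endif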