github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h (about) 1 #ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ 2 #define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ 3 #ifdef _MSC_VER 4 5 #include <emmintrin.h> 6 #include <immintrin.h> 7 #include <smmintrin.h> 8 9 #endif 10 11 inline int _mm256_extract_epi16_N0(const __m256i X) { 12 return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8); 13 } 14 15 inline int _mm256_extract_epi16_N1(const __m256i X) { 16 return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8); 17 } 18 19 inline int _mm256_extract_epi8_N0(const __m256i X) { 20 return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16); 21 } 22 23 inline int _mm256_extract_epi8_N1(const __m256i X) { 24 return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16); 25 } 26 27 namespace Eigen { 28 namespace internal { 29 30 typedef struct Packet32q8i { 31 __m256i val; 32 operator __m256i() const { return val; } 33 Packet32q8i() : val(_mm256_setzero_si256()){}; 34 Packet32q8i(__m256i val) : val(val) {} 35 } Packet32q8i; 36 37 typedef struct Packet16q16i { 38 __m256i val; 39 operator __m256i() const { return val; } 40 Packet16q16i() : val(_mm256_setzero_si256()){}; 41 Packet16q16i(__m256i val) : val(val) {} 42 } Packet16q16i; 43 44 typedef struct Packet32q8u { 45 __m256i val; 46 operator __m256i() const { return val; } 47 Packet32q8u() : val(_mm256_setzero_si256()){}; 48 Packet32q8u(__m256i val) : val(val) {} 49 } Packet32q8u; 50 51 typedef struct Packet16q8i { 52 __m128i val; 53 operator __m128i() const { return val; } 54 Packet16q8i() : val(_mm_setzero_si128()) {} 55 Packet16q8i(__m128i val) : val(val) {} 56 } Packet16q8i; 57 58 typedef struct Packet16q8u { 59 __m128i val; 60 operator __m128i() const { return val; } 61 Packet16q8u() : val(_mm_setzero_si128()) {} 62 Packet16q8u(__m128i val) : val(val) {} 63 } Packet16q8u; 64 65 typedef struct Packet8q16i { 66 __m128i val; 67 operator __m128i() const { return val; } 68 Packet8q16i() : val(_mm_setzero_si128()) {} 69 Packet8q16i(__m128i val) : val(val) {} 70 } Packet8q16i; 71 72 typedef struct Packet8q32i { 73 __m256i val; 74 operator __m256i() const { return val; } 75 Packet8q32i() : val(_mm256_setzero_si256()){}; 76 Packet8q32i(__m256i val) : val(val) {} 77 } Packet8q32i; 78 79 typedef struct Packet4q32i { 80 __m128i val; 81 operator __m128i() const { return val; } 82 Packet4q32i() : val(_mm_setzero_si128()) {} 83 Packet4q32i(__m128i val) : val(val) {} 84 } Packet4q32i; 85 86 #ifndef EIGEN_VECTORIZE_AVX512 87 template <> 88 struct packet_traits<QInt8> : default_packet_traits { 89 typedef Packet32q8i type; 90 typedef Packet16q8i half; 91 enum { 92 Vectorizable = 1, 93 AlignedOnScalar = 1, 94 size = 32, 95 }; 96 enum { 97 HasAdd = 0, 98 HasSub = 0, 99 HasMul = 0, 100 HasNegate = 0, 101 HasAbs = 0, 102 HasAbs2 = 0, 103 HasMin = 1, 104 HasMax = 1, 105 HasConj = 0, 106 HasSetLinear = 0 107 }; 108 }; 109 template <> 110 struct packet_traits<QUInt8> : default_packet_traits { 111 typedef Packet32q8u type; 112 typedef Packet16q8u half; 113 enum { 114 Vectorizable = 1, 115 AlignedOnScalar = 1, 116 size = 32, 117 }; 118 enum { 119 HasAdd = 0, 120 HasSub = 0, 121 HasMul = 0, 122 HasNegate = 0, 123 HasAbs = 0, 124 HasAbs2 = 0, 125 HasMin = 1, 126 HasMax = 1, 127 HasConj = 0, 128 HasSetLinear = 0 129 }; 130 }; 131 template <> 132 struct packet_traits<QInt16> : default_packet_traits { 133 typedef Packet16q16i type; 134 typedef Packet8q16i half; 135 enum { 136 Vectorizable = 1, 137 AlignedOnScalar = 1, 138 size = 16, 139 }; 140 enum { 141 HasAdd = 0, 142 HasSub = 0, 143 HasMul = 0, 144 HasNegate = 0, 145 HasAbs = 0, 146 HasAbs2 = 0, 147 HasMin = 1, 148 HasMax = 1, 149 HasConj = 0, 150 HasSetLinear = 0 151 }; 152 }; 153 template <> 154 struct packet_traits<QInt32> : default_packet_traits { 155 typedef Packet8q32i type; 156 typedef Packet4q32i half; 157 enum { 158 Vectorizable = 1, 159 AlignedOnScalar = 1, 160 size = 8, 161 }; 162 enum { 163 HasAdd = 1, 164 HasSub = 1, 165 HasMul = 1, 166 HasNegate = 1, 167 HasAbs = 0, 168 HasAbs2 = 0, 169 HasMin = 1, 170 HasMax = 1, 171 HasConj = 0, 172 HasSetLinear = 0 173 }; 174 }; 175 #endif 176 177 template <> 178 struct unpacket_traits<Packet32q8i> { 179 typedef QInt8 type; 180 typedef Packet16q8i half; 181 enum { 182 size = 32, 183 alignment = Aligned32, 184 vectorizable = true, 185 masked_load_available = false, 186 masked_store_available = false 187 }; 188 }; 189 template <> 190 struct unpacket_traits<Packet16q8i> { 191 typedef QInt8 type; 192 typedef Packet16q8i half; 193 enum { 194 size = 16, 195 alignment = Aligned32, 196 vectorizable = true, 197 masked_load_available = false, 198 masked_store_available = false 199 }; 200 }; 201 template <> 202 struct unpacket_traits<Packet16q16i> { 203 typedef QInt16 type; 204 typedef Packet8q16i half; 205 enum { 206 size = 16, 207 alignment = Aligned32, 208 vectorizable = true, 209 masked_load_available = false, 210 masked_store_available = false 211 }; 212 }; 213 template <> 214 struct unpacket_traits<Packet8q16i> { 215 typedef QInt16 type; 216 typedef Packet8q16i half; 217 enum { 218 size = 8, 219 alignment = Aligned32, 220 vectorizable = true, 221 masked_load_available = false, 222 masked_store_available = false 223 }; 224 }; 225 template <> 226 struct unpacket_traits<Packet32q8u> { 227 typedef QUInt8 type; 228 typedef Packet16q8u half; 229 enum { 230 size = 32, 231 alignment = Aligned32, 232 vectorizable = true, 233 masked_load_available = false, 234 masked_store_available = false 235 }; 236 }; 237 template <> 238 struct unpacket_traits<Packet8q32i> { 239 typedef QInt32 type; 240 typedef Packet4q32i half; 241 enum { 242 size = 8, 243 alignment = Aligned32, 244 vectorizable = true, 245 masked_load_available = false, 246 masked_store_available = false 247 }; 248 }; 249 250 // Unaligned load 251 template <> 252 EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) { 253 EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( 254 reinterpret_cast<const __m256i*>(from)); 255 } 256 template <> 257 EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) { 258 EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128( 259 reinterpret_cast<const __m128i*>(from)); 260 } 261 template <> 262 EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) { 263 EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( 264 reinterpret_cast<const __m256i*>(from)); 265 } 266 template <> 267 EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) { 268 EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( 269 reinterpret_cast<const __m256i*>(from)); 270 } 271 template <> 272 EIGEN_STRONG_INLINE Packet8q16i ploadu<Packet8q16i>(const QInt16* from) { 273 EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128( 274 reinterpret_cast<const __m128i*>(from)); 275 } 276 template <> 277 EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) { 278 EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( 279 reinterpret_cast<const __m256i*>(from)); 280 } 281 282 // Aligned load 283 template <> 284 EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) { 285 EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( 286 reinterpret_cast<const __m256i*>(from)); 287 } 288 template <> 289 EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) { 290 EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128( 291 reinterpret_cast<const __m128i*>(from)); 292 } 293 template <> 294 EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) { 295 EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( 296 reinterpret_cast<const __m256i*>(from)); 297 } 298 template <> 299 EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) { 300 EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( 301 reinterpret_cast<const __m256i*>(from)); 302 } 303 template <> 304 EIGEN_STRONG_INLINE Packet8q16i pload<Packet8q16i>(const QInt16* from) { 305 EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128( 306 reinterpret_cast<const __m128i*>(from)); 307 } 308 template <> 309 EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) { 310 EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( 311 reinterpret_cast<const __m256i*>(from)); 312 } 313 314 // Unaligned store 315 template <> 316 EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) { 317 EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( 318 reinterpret_cast<__m256i*>(to), from.val); 319 } 320 template <> 321 EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) { 322 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), 323 from.val); 324 } 325 template <> 326 EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) { 327 EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( 328 reinterpret_cast<__m256i*>(to), from.val); 329 } 330 template <> 331 EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) { 332 EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( 333 reinterpret_cast<__m256i*>(to), from.val); 334 } 335 template <> 336 EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet8q16i& from) { 337 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), 338 from.val); 339 } 340 template <> 341 EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) { 342 EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( 343 reinterpret_cast<__m256i*>(to), from.val); 344 } 345 346 // Aligned store 347 template <> 348 EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) { 349 EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), 350 from.val); 351 } 352 template <> 353 EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) { 354 EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), 355 from.val); 356 } 357 template <> 358 EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet8q16i& from) { 359 EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), 360 from.val); 361 } 362 template <> 363 EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) { 364 EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), 365 from.val); 366 } 367 template <> 368 EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) { 369 EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), 370 from.val); 371 } 372 template <> 373 EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) { 374 EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), 375 from.val); 376 } 377 378 // Extract first element. 379 template <> 380 EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) { 381 return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); 382 } 383 template <> 384 EIGEN_STRONG_INLINE QInt16 pfirst<Packet16q16i>(const Packet16q16i& a) { 385 return _mm256_extract_epi16_N0(a.val); 386 } 387 template <> 388 EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) { 389 return static_cast<uint8_t>(_mm256_extract_epi8_N0(a.val)); 390 } 391 template <> 392 EIGEN_STRONG_INLINE QInt8 pfirst<Packet32q8i>(const Packet32q8i& a) { 393 return _mm256_extract_epi8_N0(a.val); 394 } 395 396 // Initialize to constant value. 397 template <> 398 EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) { 399 return _mm256_set1_epi8(from.value); 400 } 401 template <> 402 EIGEN_STRONG_INLINE Packet32q8u pset1<Packet32q8u>(const QUInt8& from) { 403 return _mm256_set1_epi8(static_cast<uint8_t>(from.value)); 404 } 405 template <> 406 EIGEN_STRONG_INLINE Packet8q32i pset1<Packet8q32i>(const QInt32& from) { 407 return _mm256_set1_epi32(from.value); 408 } 409 410 // Basic arithmetic packet ops for QInt32. 411 template <> 412 EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a, 413 const Packet8q32i& b) { 414 return _mm256_add_epi32(a.val, b.val); 415 } 416 template <> 417 EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) { 418 return _mm256_set1_epi16(from.value); 419 } 420 template <> 421 EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a, 422 const Packet8q32i& b) { 423 return _mm256_sub_epi32(a.val, b.val); 424 } 425 // Note: mullo truncates the result to 32 bits. 426 template <> 427 EIGEN_STRONG_INLINE Packet8q32i pmul<Packet8q32i>(const Packet8q32i& a, 428 const Packet8q32i& b) { 429 return _mm256_mullo_epi32(a.val, b.val); 430 } 431 template <> 432 EIGEN_STRONG_INLINE Packet8q32i pnegate<Packet8q32i>(const Packet8q32i& a) { 433 return _mm256_sub_epi32(_mm256_setzero_si256(), a.val); 434 } 435 436 // Min and max. 437 template <> 438 EIGEN_STRONG_INLINE Packet8q32i pmin<Packet8q32i>(const Packet8q32i& a, 439 const Packet8q32i& b) { 440 return _mm256_min_epi32(a.val, b.val); 441 } 442 template <> 443 EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a, 444 const Packet8q32i& b) { 445 return _mm256_max_epi32(a.val, b.val); 446 } 447 448 template <> 449 EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a, 450 const Packet16q16i& b) { 451 return _mm256_min_epi16(a.val, b.val); 452 } 453 template <> 454 EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a, 455 const Packet16q16i& b) { 456 return _mm256_max_epi16(a.val, b.val); 457 } 458 459 template <> 460 EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a, 461 const Packet32q8u& b) { 462 return _mm256_min_epu8(a.val, b.val); 463 } 464 template <> 465 EIGEN_STRONG_INLINE Packet32q8u pmax<Packet32q8u>(const Packet32q8u& a, 466 const Packet32q8u& b) { 467 return _mm256_max_epu8(a.val, b.val); 468 } 469 470 template <> 471 EIGEN_STRONG_INLINE Packet32q8i pmin<Packet32q8i>(const Packet32q8i& a, 472 const Packet32q8i& b) { 473 return _mm256_min_epi8(a.val, b.val); 474 } 475 template <> 476 EIGEN_STRONG_INLINE Packet32q8i pmax<Packet32q8i>(const Packet32q8i& a, 477 const Packet32q8i& b) { 478 return _mm256_max_epi8(a.val, b.val); 479 } 480 481 // Reductions. 482 template <> 483 EIGEN_STRONG_INLINE QInt32 predux_min<Packet8q32i>(const Packet8q32i& a) { 484 __m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1)); 485 tmp = 486 _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 487 return pfirst<Packet8q32i>( 488 _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); 489 } 490 template <> 491 EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) { 492 __m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1)); 493 tmp = 494 _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 495 return pfirst<Packet8q32i>( 496 _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); 497 } 498 499 template <> 500 EIGEN_STRONG_INLINE QInt16 predux_min<Packet16q16i>(const Packet16q16i& a) { 501 __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1)); 502 tmp = 503 _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 504 tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1)); 505 return std::min(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp)); 506 } 507 template <> 508 EIGEN_STRONG_INLINE QInt16 predux_max<Packet16q16i>(const Packet16q16i& a) { 509 __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1)); 510 tmp = 511 _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 512 tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1)); 513 return std::max(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp)); 514 } 515 516 template <> 517 EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) { 518 __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1)); 519 tmp = 520 _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 521 tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1)); 522 tmp = _mm256_min_epu8(tmp, 523 _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 524 return std::min(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)), 525 static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp))); 526 } 527 template <> 528 EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) { 529 __m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1)); 530 tmp = 531 _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 532 tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1)); 533 tmp = _mm256_max_epu8(tmp, 534 _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 535 return std::max(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)), 536 static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp))); 537 } 538 539 template <> 540 EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) { 541 __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1)); 542 tmp = 543 _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 544 tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); 545 tmp = _mm256_min_epi8(tmp, 546 _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 547 return std::min(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp)); 548 } 549 template <> 550 EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) { 551 __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1)); 552 tmp = 553 _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 554 tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); 555 tmp = _mm256_max_epi8(tmp, 556 _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); 557 return std::max(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp)); 558 } 559 560 // Vectorized scaling of Packet32q8i by float. 561 template <> 562 struct scalar_product_op<QInt32, double> : binary_op_base<QInt32, double> { 563 typedef typename ScalarBinaryOpTraits<QInt32, double>::ReturnType result_type; 564 #ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN 565 EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) 566 #else 567 scalar_product_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN } 568 #endif 569 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type 570 operator()(const QInt32& a, const double& b) const { 571 return a * b; 572 } 573 574 EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a, 575 const double& b) const { 576 __m256d scale = _mm256_set1_pd(b); 577 __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a)); 578 __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo)); 579 __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1)); 580 __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi)); 581 return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 582 1); 583 } 584 }; 585 586 template <> 587 struct functor_traits<scalar_product_op<QInt32, double>> { 588 enum { Cost = 4 * NumTraits<float>::MulCost, PacketAccess = true }; 589 }; 590 591 } // end namespace internal 592 } // end namespace Eigen 593 594 #endif // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_