github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h (about) 1 #ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_ 2 #define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_ 3 4 #include "PacketMathAVX2.h" 5 6 namespace Eigen { 7 namespace internal { 8 9 typedef struct Packet64q8i { 10 __m512i val; 11 operator __m512i() const { return val; } 12 Packet64q8i(); 13 Packet64q8i(__m512i val) : val(val) {} 14 } Packet64q8i; 15 16 typedef struct Packet32q16i { 17 __m512i val; 18 operator __m512i() const { return val; } 19 Packet32q16i(); 20 Packet32q16i(__m512i val) : val(val) {} 21 } Packet32q16i; 22 23 typedef struct Packet64q8u { 24 __m512i val; 25 operator __m512i() const { return val; } 26 Packet64q8u(); 27 Packet64q8u(__m512i val) : val(val) {} 28 } Packet64q8u; 29 30 typedef struct Packet16q32i { 31 __m512i val; 32 operator __m512i() const { return val; } 33 Packet16q32i(); 34 Packet16q32i(__m512i val) : val(val) {} 35 } Packet16q32i; 36 37 template <> 38 struct packet_traits<QInt8> : default_packet_traits { 39 typedef Packet64q8i type; 40 typedef Packet32q8i half; 41 enum { 42 Vectorizable = 1, 43 AlignedOnScalar = 1, 44 size = 64, 45 }; 46 enum { 47 HasAdd = 0, 48 HasSub = 0, 49 HasMul = 0, 50 HasNegate = 0, 51 HasAbs = 0, 52 HasAbs2 = 0, 53 HasMin = 1, 54 HasMax = 1, 55 HasConj = 0, 56 HasSetLinear = 0 57 }; 58 }; 59 template <> 60 struct packet_traits<QUInt8> : default_packet_traits { 61 typedef Packet64q8u type; 62 typedef Packet32q8u half; 63 enum { 64 Vectorizable = 1, 65 AlignedOnScalar = 1, 66 size = 64, 67 }; 68 enum { 69 HasAdd = 0, 70 HasSub = 0, 71 HasMul = 0, 72 HasNegate = 0, 73 HasAbs = 0, 74 HasAbs2 = 0, 75 HasMin = 1, 76 HasMax = 1, 77 HasConj = 0, 78 HasSetLinear = 0 79 }; 80 }; 81 template <> 82 struct packet_traits<QInt16> : default_packet_traits { 83 typedef Packet32q16i type; 84 typedef Packet16q16i half; 85 enum { 86 Vectorizable = 1, 87 AlignedOnScalar = 1, 88 size = 32, 89 }; 90 enum { 91 HasAdd = 0, 92 HasSub = 0, 93 HasMul = 0, 94 HasNegate = 0, 95 HasAbs = 0, 96 HasAbs2 = 0, 97 HasMin = 1, 98 HasMax = 1, 99 HasConj = 0, 100 HasSetLinear = 0 101 }; 102 }; 103 template <> 104 struct packet_traits<QInt32> : default_packet_traits { 105 typedef Packet16q32i type; 106 typedef Packet8q32i half; 107 enum { 108 Vectorizable = 1, 109 AlignedOnScalar = 1, 110 size = 16, 111 }; 112 enum { 113 HasAdd = 1, 114 HasSub = 1, 115 HasMul = 1, 116 HasNegate = 1, 117 HasAbs = 0, 118 HasAbs2 = 0, 119 HasMin = 1, 120 HasMax = 1, 121 HasConj = 0, 122 HasSetLinear = 0 123 }; 124 }; 125 126 template <> 127 struct unpacket_traits<Packet64q8i> { 128 typedef QInt8 type; 129 typedef Packet32q8i half; 130 enum { 131 size = 64, 132 alignment = Aligned64, 133 masked_load_available = false, 134 masked_store_available = false 135 }; 136 }; 137 template <> 138 struct unpacket_traits<Packet32q16i> { 139 typedef QInt16 type; 140 typedef Packet16q16i half; 141 enum { 142 size = 32, 143 alignment = Aligned64, 144 masked_load_available = false, 145 masked_store_available = false 146 }; 147 }; 148 template <> 149 struct unpacket_traits<Packet64q8u> { 150 typedef QUInt8 type; 151 typedef Packet32q8u half; 152 enum { 153 size = 64, 154 alignment = Aligned64, 155 masked_load_available = false, 156 masked_store_available = false 157 }; 158 }; 159 template <> 160 struct unpacket_traits<Packet16q32i> { 161 typedef QInt32 type; 162 typedef Packet8q32i half; 163 enum { 164 size = 16, 165 alignment = Aligned64, 166 masked_load_available = false, 167 masked_store_available = false 168 }; 169 }; 170 171 // Unaligned load 172 template <> 173 EIGEN_STRONG_INLINE Packet64q8i ploadu<Packet64q8i>(const QInt8* from) { 174 EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( 175 reinterpret_cast<const __m512i*>(from)); 176 } 177 template <> 178 EIGEN_STRONG_INLINE Packet32q16i ploadu<Packet32q16i>(const QInt16* from) { 179 EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( 180 reinterpret_cast<const __m512i*>(from)); 181 } 182 template <> 183 EIGEN_STRONG_INLINE Packet64q8u ploadu<Packet64q8u>(const QUInt8* from) { 184 EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( 185 reinterpret_cast<const __m512i*>(from)); 186 } 187 template <> 188 EIGEN_STRONG_INLINE Packet16q32i ploadu<Packet16q32i>(const QInt32* from) { 189 EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( 190 reinterpret_cast<const __m512i*>(from)); 191 } 192 193 // Aligned load 194 template <> 195 EIGEN_STRONG_INLINE Packet64q8i pload<Packet64q8i>(const QInt8* from) { 196 EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( 197 reinterpret_cast<const __m512i*>(from)); 198 } 199 template <> 200 EIGEN_STRONG_INLINE Packet32q16i pload<Packet32q16i>(const QInt16* from) { 201 EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( 202 reinterpret_cast<const __m512i*>(from)); 203 } 204 template <> 205 EIGEN_STRONG_INLINE Packet64q8u pload<Packet64q8u>(const QUInt8* from) { 206 EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( 207 reinterpret_cast<const __m512i*>(from)); 208 } 209 template <> 210 EIGEN_STRONG_INLINE Packet16q32i pload<Packet16q32i>(const QInt32* from) { 211 EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( 212 reinterpret_cast<const __m512i*>(from)); 213 } 214 215 // Unaligned store 216 template <> 217 EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet64q8i& from) { 218 EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( 219 reinterpret_cast<__m512i*>(to), from.val); 220 } 221 template <> 222 EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet32q16i& from) { 223 EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( 224 reinterpret_cast<__m512i*>(to), from.val); 225 } 226 template <> 227 EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet64q8u& from) { 228 EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( 229 reinterpret_cast<__m512i*>(to), from.val); 230 } 231 template <> 232 EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet16q32i& from) { 233 EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( 234 reinterpret_cast<__m512i*>(to), from.val); 235 } 236 237 // Aligned store 238 template <> 239 EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet16q32i& from) { 240 EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), 241 from.val); 242 } 243 template <> 244 EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet64q8u& from) { 245 EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), 246 from.val); 247 } 248 template <> 249 EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet64q8i& from) { 250 EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), 251 from.val); 252 } 253 template <> 254 EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet32q16i& from) { 255 EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), 256 from.val); 257 } 258 259 // Extract first element. 260 template <> 261 EIGEN_STRONG_INLINE QInt32 pfirst<Packet16q32i>(const Packet16q32i& a) { 262 return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a, 0)); 263 } 264 template <> 265 EIGEN_STRONG_INLINE QUInt8 pfirst<Packet64q8u>(const Packet64q8u& a) { 266 return static_cast<uint8_t>( 267 _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0)); 268 } 269 template <> 270 EIGEN_STRONG_INLINE QInt8 pfirst<Packet64q8i>(const Packet64q8i& a) { 271 return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0); 272 } 273 template <> 274 EIGEN_STRONG_INLINE QInt16 pfirst<Packet32q16i>(const Packet32q16i& a) { 275 return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.val, 0), 0); 276 } 277 278 // Initialize to constant value. 279 template <> 280 EIGEN_STRONG_INLINE Packet64q8i pset1<Packet64q8i>(const QInt8& from) { 281 return _mm512_set1_epi8(from.value); 282 } 283 template <> 284 EIGEN_STRONG_INLINE Packet32q16i pset1<Packet32q16i>(const QInt16& from) { 285 return _mm512_set1_epi16(from.value); 286 } 287 template <> 288 EIGEN_STRONG_INLINE Packet64q8u pset1<Packet64q8u>(const QUInt8& from) { 289 return _mm512_set1_epi8(static_cast<uint8_t>(from.value)); 290 } 291 template <> 292 EIGEN_STRONG_INLINE Packet16q32i pset1<Packet16q32i>(const QInt32& from) { 293 return _mm512_set1_epi32(from.value); 294 } 295 296 // Basic arithmetic packet ops for QInt32. 297 template <> 298 EIGEN_STRONG_INLINE Packet16q32i padd<Packet16q32i>(const Packet16q32i& a, 299 const Packet16q32i& b) { 300 return _mm512_add_epi32(a.val, b.val); 301 } 302 template <> 303 EIGEN_STRONG_INLINE Packet16q32i psub<Packet16q32i>(const Packet16q32i& a, 304 const Packet16q32i& b) { 305 return _mm512_sub_epi32(a.val, b.val); 306 } 307 // Note: mullo truncates the result to 32 bits. 308 template <> 309 EIGEN_STRONG_INLINE Packet16q32i pmul<Packet16q32i>(const Packet16q32i& a, 310 const Packet16q32i& b) { 311 return _mm512_mullo_epi32(a.val, b.val); 312 } 313 template <> 314 EIGEN_STRONG_INLINE Packet16q32i pnegate<Packet16q32i>(const Packet16q32i& a) { 315 return _mm512_sub_epi32(_mm512_setzero_si512(), a.val); 316 } 317 318 // Min and max. 319 template <> 320 EIGEN_STRONG_INLINE Packet16q32i pmin<Packet16q32i>(const Packet16q32i& a, 321 const Packet16q32i& b) { 322 return _mm512_min_epi32(a.val, b.val); 323 } 324 template <> 325 EIGEN_STRONG_INLINE Packet16q32i pmax<Packet16q32i>(const Packet16q32i& a, 326 const Packet16q32i& b) { 327 return _mm512_max_epi32(a.val, b.val); 328 } 329 330 template <> 331 EIGEN_STRONG_INLINE Packet64q8u pmin<Packet64q8u>(const Packet64q8u& a, 332 const Packet64q8u& b) { 333 #ifdef EIGEN_VECTORIZE_AVX512BW 334 return _mm512_min_epu8(a.val, b.val); 335 #else 336 __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); 337 __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); 338 __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); 339 __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); 340 __m256i r0 = _mm256_min_epu8(ap0, bp0); 341 __m256i r1 = _mm256_min_epu8(ap1, bp1); 342 return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); 343 #endif 344 } 345 template <> 346 EIGEN_STRONG_INLINE Packet64q8u pmax<Packet64q8u>(const Packet64q8u& a, 347 const Packet64q8u& b) { 348 #ifdef EIGEN_VECTORIZE_AVX512BW 349 return _mm512_max_epu8(a.val, b.val); 350 #else 351 __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); 352 __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); 353 __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); 354 __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); 355 __m256i r0 = _mm256_max_epu8(ap0, bp0); 356 __m256i r1 = _mm256_max_epu8(ap1, bp1); 357 return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); 358 #endif 359 } 360 361 template <> 362 EIGEN_STRONG_INLINE Packet64q8i pmin<Packet64q8i>(const Packet64q8i& a, 363 const Packet64q8i& b) { 364 #ifdef EIGEN_VECTORIZE_AVX512BW 365 return _mm512_min_epi8(a.val, b.val); 366 #else 367 __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); 368 __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); 369 __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); 370 __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); 371 __m256i r0 = _mm256_min_epi8(ap0, bp0); 372 __m256i r1 = _mm256_min_epi8(ap1, bp1); 373 return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); 374 #endif 375 } 376 template <> 377 EIGEN_STRONG_INLINE Packet32q16i pmin<Packet32q16i>(const Packet32q16i& a, 378 const Packet32q16i& b) { 379 #ifdef EIGEN_VECTORIZE_AVX512BW 380 return _mm512_min_epi16(a.val, b.val); 381 #else 382 __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); 383 __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); 384 __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); 385 __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); 386 __m256i r0 = _mm256_min_epi16(ap0, bp0); 387 __m256i r1 = _mm256_min_epi16(ap1, bp1); 388 return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); 389 #endif 390 } 391 template <> 392 EIGEN_STRONG_INLINE Packet64q8i pmax<Packet64q8i>(const Packet64q8i& a, 393 const Packet64q8i& b) { 394 #ifdef EIGEN_VECTORIZE_AVX512BW 395 return _mm512_max_epi8(a.val, b.val); 396 #else 397 __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); 398 __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); 399 __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); 400 __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); 401 __m256i r0 = _mm256_max_epi8(ap0, bp0); 402 __m256i r1 = _mm256_max_epi8(ap1, bp1); 403 return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); 404 #endif 405 } 406 template <> 407 EIGEN_STRONG_INLINE Packet32q16i pmax<Packet32q16i>(const Packet32q16i& a, 408 const Packet32q16i& b) { 409 #ifdef EIGEN_VECTORIZE_AVX512BW 410 return _mm512_max_epi16(a.val, b.val); 411 #else 412 __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); 413 __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); 414 __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); 415 __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); 416 __m256i r0 = _mm256_max_epi16(ap0, bp0); 417 __m256i r1 = _mm256_max_epi16(ap1, bp1); 418 return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); 419 #endif 420 } 421 422 // Reductions. 423 template <> 424 EIGEN_STRONG_INLINE QInt32 predux_min<Packet16q32i>(const Packet16q32i& a) { 425 Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); 426 Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); 427 Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); 428 Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); 429 Packet4i res = 430 _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3)); 431 res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); 432 return pfirst( 433 _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); 434 } 435 template <> 436 EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) { 437 Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); 438 Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); 439 Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); 440 Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); 441 Packet4i res = 442 _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3)); 443 res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); 444 return pfirst( 445 _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); 446 } 447 template <> 448 EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) { 449 Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); 450 Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); 451 Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); 452 Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); 453 Packet4i res = 454 _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3)); 455 res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); 456 std::uint32_t w = pfirst( 457 _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); 458 return std::min( 459 {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)}); 460 } 461 template <> 462 EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) { 463 Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); 464 Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); 465 Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); 466 Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); 467 Packet4i res = 468 _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3)); 469 res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); 470 std::uint32_t w = pfirst( 471 _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); 472 return std::max( 473 {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)}); 474 } 475 template <> 476 EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) { 477 Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); 478 Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); 479 Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); 480 Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); 481 Packet4i res = 482 _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3)); 483 res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); 484 std::uint32_t w = pfirst( 485 _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); 486 return std::min( 487 {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16), 488 static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)}); 489 } 490 template <> 491 EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) { 492 Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); 493 Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); 494 Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); 495 Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); 496 Packet4i res = 497 _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3)); 498 res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); 499 std::uint32_t w = pfirst( 500 _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); 501 return std::max( 502 {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16), 503 static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)}); 504 } 505 template <> 506 EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) { 507 Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); 508 Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); 509 Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); 510 Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); 511 Packet4i res = 512 _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3)); 513 res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); 514 std::uint32_t w = pfirst( 515 _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); 516 return std::min( 517 {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16), 518 static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)}); 519 } 520 template <> 521 EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) { 522 Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); 523 Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); 524 Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); 525 Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); 526 Packet4i res = 527 _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3)); 528 res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); 529 std::uint32_t w = pfirst( 530 _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); 531 return std::min( 532 {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16), 533 static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)}); 534 } 535 536 } // end namespace internal 537 } // end namespace Eigen 538 539 #endif // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_