github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h (about)

     1  #ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
     2  #define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
     3  
     4  #include "PacketMathAVX2.h"
     5  
     6  namespace Eigen {
     7  namespace internal {
     8  
     9  typedef struct Packet64q8i {
    10    __m512i val;
    11    operator __m512i() const { return val; }
    12    Packet64q8i();
    13    Packet64q8i(__m512i val) : val(val) {}
    14  } Packet64q8i;
    15  
    16  typedef struct Packet32q16i {
    17    __m512i val;
    18    operator __m512i() const { return val; }
    19    Packet32q16i();
    20    Packet32q16i(__m512i val) : val(val) {}
    21  } Packet32q16i;
    22  
    23  typedef struct Packet64q8u {
    24    __m512i val;
    25    operator __m512i() const { return val; }
    26    Packet64q8u();
    27    Packet64q8u(__m512i val) : val(val) {}
    28  } Packet64q8u;
    29  
    30  typedef struct Packet16q32i {
    31    __m512i val;
    32    operator __m512i() const { return val; }
    33    Packet16q32i();
    34    Packet16q32i(__m512i val) : val(val) {}
    35  } Packet16q32i;
    36  
template <>
struct packet_traits<QInt8> : default_packet_traits {
  // Full packet: 64 x 8-bit signed values in one 512-bit register.
  typedef Packet64q8i type;
  // Half packet: the 256-bit AVX2 type (from PacketMathAVX2.h).
  typedef Packet32q8i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 64,
  };
  // Only min/max are vectorized for QInt8 below; everything else falls
  // back to scalar code.
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
template <>
struct packet_traits<QUInt8> : default_packet_traits {
  // Full packet: 64 x 8-bit unsigned values in one 512-bit register.
  typedef Packet64q8u type;
  // Half packet: the 256-bit AVX2 type (from PacketMathAVX2.h).
  typedef Packet32q8u half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 64,
  };
  // Only min/max are vectorized for QUInt8 below; everything else falls
  // back to scalar code.
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
template <>
struct packet_traits<QInt16> : default_packet_traits {
  // Full packet: 32 x 16-bit signed values in one 512-bit register.
  typedef Packet32q16i type;
  // Half packet: the 256-bit AVX2 type (from PacketMathAVX2.h).
  typedef Packet16q16i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 32,
  };
  // Only min/max are vectorized for QInt16 below; everything else falls
  // back to scalar code.
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
template <>
struct packet_traits<QInt32> : default_packet_traits {
  // Full packet: 16 x 32-bit signed values in one 512-bit register.
  typedef Packet16q32i type;
  // Half packet: the 256-bit AVX2 type (from PacketMathAVX2.h).
  typedef Packet8q32i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,
  };
  // QInt32 is the only type with vectorized add/sub/mul/negate in this
  // file (see the "Basic arithmetic packet ops" section below).
  enum {
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
   125  
template <>
struct unpacket_traits<Packet64q8i> {
  // Scalar element type held in each of the 64 lanes.
  typedef QInt8 type;
  typedef Packet32q8i half;
  enum {
    size = 64,
    // Full 512-bit packets require Aligned64 (64-byte) alignment for the
    // aligned load/store paths below.
    alignment = Aligned64,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet32q16i> {
  // Scalar element type held in each of the 32 lanes.
  typedef QInt16 type;
  typedef Packet16q16i half;
  enum {
    size = 32,
    // Full 512-bit packets require Aligned64 (64-byte) alignment for the
    // aligned load/store paths below.
    alignment = Aligned64,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet64q8u> {
  // Scalar element type held in each of the 64 lanes.
  typedef QUInt8 type;
  typedef Packet32q8u half;
  enum {
    size = 64,
    // Full 512-bit packets require Aligned64 (64-byte) alignment for the
    // aligned load/store paths below.
    alignment = Aligned64,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet16q32i> {
  // Scalar element type held in each of the 16 lanes.
  typedef QInt32 type;
  typedef Packet8q32i half;
  enum {
    size = 16,
    // Full 512-bit packets require Aligned64 (64-byte) alignment for the
    // aligned load/store paths below.
    alignment = Aligned64,
    masked_load_available = false,
    masked_store_available = false
  };
};
   170  
   171  // Unaligned load
   172  template <>
   173  EIGEN_STRONG_INLINE Packet64q8i ploadu<Packet64q8i>(const QInt8* from) {
   174    EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
   175        reinterpret_cast<const __m512i*>(from));
   176  }
   177  template <>
   178  EIGEN_STRONG_INLINE Packet32q16i ploadu<Packet32q16i>(const QInt16* from) {
   179    EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
   180        reinterpret_cast<const __m512i*>(from));
   181  }
   182  template <>
   183  EIGEN_STRONG_INLINE Packet64q8u ploadu<Packet64q8u>(const QUInt8* from) {
   184    EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
   185        reinterpret_cast<const __m512i*>(from));
   186  }
   187  template <>
   188  EIGEN_STRONG_INLINE Packet16q32i ploadu<Packet16q32i>(const QInt32* from) {
   189    EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
   190        reinterpret_cast<const __m512i*>(from));
   191  }
   192  
   193  // Aligned load
   194  template <>
   195  EIGEN_STRONG_INLINE Packet64q8i pload<Packet64q8i>(const QInt8* from) {
   196    EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
   197        reinterpret_cast<const __m512i*>(from));
   198  }
   199  template <>
   200  EIGEN_STRONG_INLINE Packet32q16i pload<Packet32q16i>(const QInt16* from) {
   201    EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
   202        reinterpret_cast<const __m512i*>(from));
   203  }
   204  template <>
   205  EIGEN_STRONG_INLINE Packet64q8u pload<Packet64q8u>(const QUInt8* from) {
   206    EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
   207        reinterpret_cast<const __m512i*>(from));
   208  }
   209  template <>
   210  EIGEN_STRONG_INLINE Packet16q32i pload<Packet16q32i>(const QInt32* from) {
   211    EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
   212        reinterpret_cast<const __m512i*>(from));
   213  }
   214  
   215  // Unaligned store
   216  template <>
   217  EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet64q8i& from) {
   218    EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
   219        reinterpret_cast<__m512i*>(to), from.val);
   220  }
   221  template <>
   222  EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet32q16i& from) {
   223    EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
   224        reinterpret_cast<__m512i*>(to), from.val);
   225  }
   226  template <>
   227  EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet64q8u& from) {
   228    EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
   229        reinterpret_cast<__m512i*>(to), from.val);
   230  }
   231  template <>
   232  EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet16q32i& from) {
   233    EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
   234        reinterpret_cast<__m512i*>(to), from.val);
   235  }
   236  
   237  // Aligned store
   238  template <>
   239  EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet16q32i& from) {
   240    EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
   241                                                 from.val);
   242  }
   243  template <>
   244  EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet64q8u& from) {
   245    EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
   246                                                 from.val);
   247  }
   248  template <>
   249  EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet64q8i& from) {
   250    EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
   251                                                 from.val);
   252  }
   253  template <>
   254  EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet32q16i& from) {
   255    EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
   256                                                 from.val);
   257  }
   258  
// Extract first element.
template <>
EIGEN_STRONG_INLINE QInt32 pfirst<Packet16q32i>(const Packet16q32i& a) {
  // Pull out the low 128-bit lane, then its low 32-bit element.
  return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a, 0));
}
template <>
EIGEN_STRONG_INLINE QUInt8 pfirst<Packet64q8u>(const Packet64q8u& a) {
  // _mm_extract_epi8 returns an int; cast through uint8_t so the value is
  // interpreted as unsigned before constructing the QUInt8 result.
  return static_cast<uint8_t>(
      _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0));
}
template <>
EIGEN_STRONG_INLINE QInt8 pfirst<Packet64q8i>(const Packet64q8i& a) {
  // Low lane, then byte 0; narrowing to QInt8 keeps the low 8 bits.
  return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0);
}
template <>
EIGEN_STRONG_INLINE QInt16 pfirst<Packet32q16i>(const Packet32q16i& a) {
  // Low lane, then 16-bit element 0.
  return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.val, 0), 0);
}
   277  
   278  // Initialize to constant value.
   279  template <>
   280  EIGEN_STRONG_INLINE Packet64q8i pset1<Packet64q8i>(const QInt8& from) {
   281    return _mm512_set1_epi8(from.value);
   282  }
   283  template <>
   284  EIGEN_STRONG_INLINE Packet32q16i pset1<Packet32q16i>(const QInt16& from) {
   285    return _mm512_set1_epi16(from.value);
   286  }
   287  template <>
   288  EIGEN_STRONG_INLINE Packet64q8u pset1<Packet64q8u>(const QUInt8& from) {
   289    return _mm512_set1_epi8(static_cast<uint8_t>(from.value));
   290  }
   291  template <>
   292  EIGEN_STRONG_INLINE Packet16q32i pset1<Packet16q32i>(const QInt32& from) {
   293    return _mm512_set1_epi32(from.value);
   294  }
   295  
   296  // Basic arithmetic packet ops for QInt32.
   297  template <>
   298  EIGEN_STRONG_INLINE Packet16q32i padd<Packet16q32i>(const Packet16q32i& a,
   299                                                      const Packet16q32i& b) {
   300    return _mm512_add_epi32(a.val, b.val);
   301  }
   302  template <>
   303  EIGEN_STRONG_INLINE Packet16q32i psub<Packet16q32i>(const Packet16q32i& a,
   304                                                      const Packet16q32i& b) {
   305    return _mm512_sub_epi32(a.val, b.val);
   306  }
   307  // Note: mullo truncates the result to 32 bits.
   308  template <>
   309  EIGEN_STRONG_INLINE Packet16q32i pmul<Packet16q32i>(const Packet16q32i& a,
   310                                                      const Packet16q32i& b) {
   311    return _mm512_mullo_epi32(a.val, b.val);
   312  }
   313  template <>
   314  EIGEN_STRONG_INLINE Packet16q32i pnegate<Packet16q32i>(const Packet16q32i& a) {
   315    return _mm512_sub_epi32(_mm512_setzero_si512(), a.val);
   316  }
   317  
   318  // Min and max.
   319  template <>
   320  EIGEN_STRONG_INLINE Packet16q32i pmin<Packet16q32i>(const Packet16q32i& a,
   321                                                      const Packet16q32i& b) {
   322    return _mm512_min_epi32(a.val, b.val);
   323  }
   324  template <>
   325  EIGEN_STRONG_INLINE Packet16q32i pmax<Packet16q32i>(const Packet16q32i& a,
   326                                                      const Packet16q32i& b) {
   327    return _mm512_max_epi32(a.val, b.val);
   328  }
   329  
   330  template <>
   331  EIGEN_STRONG_INLINE Packet64q8u pmin<Packet64q8u>(const Packet64q8u& a,
   332                                                    const Packet64q8u& b) {
   333  #ifdef EIGEN_VECTORIZE_AVX512BW
   334    return _mm512_min_epu8(a.val, b.val);
   335  #else
   336    __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
   337    __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
   338    __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
   339    __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
   340    __m256i r0 = _mm256_min_epu8(ap0, bp0);
   341    __m256i r1 = _mm256_min_epu8(ap1, bp1);
   342    return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
   343  #endif
   344  }
   345  template <>
   346  EIGEN_STRONG_INLINE Packet64q8u pmax<Packet64q8u>(const Packet64q8u& a,
   347                                                    const Packet64q8u& b) {
   348  #ifdef EIGEN_VECTORIZE_AVX512BW
   349    return _mm512_max_epu8(a.val, b.val);
   350  #else
   351    __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
   352    __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
   353    __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
   354    __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
   355    __m256i r0 = _mm256_max_epu8(ap0, bp0);
   356    __m256i r1 = _mm256_max_epu8(ap1, bp1);
   357    return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
   358  #endif
   359  }
   360  
   361  template <>
   362  EIGEN_STRONG_INLINE Packet64q8i pmin<Packet64q8i>(const Packet64q8i& a,
   363                                                    const Packet64q8i& b) {
   364  #ifdef EIGEN_VECTORIZE_AVX512BW
   365    return _mm512_min_epi8(a.val, b.val);
   366  #else
   367    __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
   368    __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
   369    __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
   370    __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
   371    __m256i r0 = _mm256_min_epi8(ap0, bp0);
   372    __m256i r1 = _mm256_min_epi8(ap1, bp1);
   373    return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
   374  #endif
   375  }
   376  template <>
   377  EIGEN_STRONG_INLINE Packet32q16i pmin<Packet32q16i>(const Packet32q16i& a,
   378                                                      const Packet32q16i& b) {
   379  #ifdef EIGEN_VECTORIZE_AVX512BW
   380    return _mm512_min_epi16(a.val, b.val);
   381  #else
   382    __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
   383    __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
   384    __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
   385    __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
   386    __m256i r0 = _mm256_min_epi16(ap0, bp0);
   387    __m256i r1 = _mm256_min_epi16(ap1, bp1);
   388    return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
   389  #endif
   390  }
   391  template <>
   392  EIGEN_STRONG_INLINE Packet64q8i pmax<Packet64q8i>(const Packet64q8i& a,
   393                                                    const Packet64q8i& b) {
   394  #ifdef EIGEN_VECTORIZE_AVX512BW
   395    return _mm512_max_epi8(a.val, b.val);
   396  #else
   397    __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
   398    __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
   399    __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
   400    __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
   401    __m256i r0 = _mm256_max_epi8(ap0, bp0);
   402    __m256i r1 = _mm256_max_epi8(ap1, bp1);
   403    return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
   404  #endif
   405  }
   406  template <>
   407  EIGEN_STRONG_INLINE Packet32q16i pmax<Packet32q16i>(const Packet32q16i& a,
   408                                                      const Packet32q16i& b) {
   409  #ifdef EIGEN_VECTORIZE_AVX512BW
   410    return _mm512_max_epi16(a.val, b.val);
   411  #else
   412    __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
   413    __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
   414    __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
   415    __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
   416    __m256i r0 = _mm256_max_epi16(ap0, bp0);
   417    __m256i r1 = _mm256_max_epi16(ap1, bp1);
   418    return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
   419  #endif
   420  }
   421  
// Reductions.
template <>
EIGEN_STRONG_INLINE QInt32 predux_min<Packet16q32i>(const Packet16q32i& a) {
  // Split the 512-bit register into its four 128-bit lanes.
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
  // Pairwise 32-bit min across lanes, then reduce the surviving 4 elements
  // to element 0 with two shuffle+min steps.
  Packet4i res =
      _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3));
  res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(
      _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
  // Same lane-splitting scheme as predux_min above, with 32-bit max.
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
  Packet4i res =
      _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3));
  res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(
      _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
  // Split into four 128-bit lanes and reduce with 16-bit min; the shuffles
  // operate at 32-bit granularity, so the final 32-bit word still holds
  // two 16-bit candidates.
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
  Packet4i res =
      _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3));
  res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  std::uint32_t w = pfirst(
      _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
  // Resolve the last two 16-bit halves of w in scalar code.
  return std::min(
      {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
}
template <>
EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
  // Mirror of predux_min above with 16-bit max.
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
  Packet4i res =
      _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3));
  res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  std::uint32_t w = pfirst(
      _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
  return std::max(
      {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
}
template <>
EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
  // Split into four 128-bit lanes and reduce with unsigned byte min; the
  // shuffles work on 32-bit words, so four byte candidates survive in w.
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
  Packet4i res =
      _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3));
  res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  std::uint32_t w = pfirst(
      _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
  // Resolve the four remaining bytes of w in scalar code.
  return std::min(
      {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
       static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
}
template <>
EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
  // Mirror of predux_min above with unsigned byte max.
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
  Packet4i res =
      _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3));
  res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  std::uint32_t w = pfirst(
      _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
  return std::max(
      {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
       static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
}
template <>
EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
  // Split into four 128-bit lanes and reduce with signed byte min; the
  // shuffles work on 32-bit words, so four byte candidates survive in w.
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
  Packet4i res =
      _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3));
  res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  std::uint32_t w = pfirst(
      _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
  // Resolve the four remaining bytes of w in scalar code.
  return std::min(
      {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
       static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
}
   520  template <>
   521  EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
   522    Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
   523    Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
   524    Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
   525    Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
   526    Packet4i res =
   527        _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3));
   528    res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
   529    std::uint32_t w = pfirst(
   530        _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
   531    return std::min(
   532        {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
   533         static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
   534  }
   535  
   536  }  // end namespace internal
   537  }  // end namespace Eigen
   538  
   539  #endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_