github.com/johnnyeven/libtools@v0.0.0-20191126065708-61829c1adf46/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h (about)

     1  #ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
     2  #define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
     3  #ifdef _MSC_VER
     4  
     5  #include <emmintrin.h>
     6  #include <immintrin.h>
     7  #include <smmintrin.h>
     8  
     9  #endif
    10  
    11  inline int _mm256_extract_epi16_N0(const __m256i X) {
    12    return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8);
    13  }
    14  
    15  inline int _mm256_extract_epi16_N1(const __m256i X) {
    16    return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8);
    17  }
    18  
    19  inline int _mm256_extract_epi8_N0(const __m256i X) {
    20    return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16);
    21  }
    22  
    23  inline int _mm256_extract_epi8_N1(const __m256i X) {
    24    return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16);
    25  }
    26  
    27  namespace Eigen {
    28  namespace internal {
    29  
    30  typedef struct Packet32q8i {
    31    __m256i val;
    32    operator __m256i() const { return val; }
    33    Packet32q8i() : val(_mm256_setzero_si256()){};
    34    Packet32q8i(__m256i val) : val(val) {}
    35  } Packet32q8i;
    36  
    37  typedef struct Packet16q16i {
    38    __m256i val;
    39    operator __m256i() const { return val; }
    40    Packet16q16i() : val(_mm256_setzero_si256()){};
    41    Packet16q16i(__m256i val) : val(val) {}
    42  } Packet16q16i;
    43  
    44  typedef struct Packet32q8u {
    45    __m256i val;
    46    operator __m256i() const { return val; }
    47    Packet32q8u() : val(_mm256_setzero_si256()){};
    48    Packet32q8u(__m256i val) : val(val) {}
    49  } Packet32q8u;
    50  
    51  typedef struct Packet16q8i {
    52    __m128i val;
    53    operator __m128i() const { return val; }
    54    Packet16q8i() : val(_mm_setzero_si128()) {}
    55    Packet16q8i(__m128i val) : val(val) {}
    56  } Packet16q8i;
    57  
    58  typedef struct Packet16q8u {
    59    __m128i val;
    60    operator __m128i() const { return val; }
    61    Packet16q8u() : val(_mm_setzero_si128()) {}
    62    Packet16q8u(__m128i val) : val(val) {}
    63  } Packet16q8u;
    64  
    65  typedef struct Packet8q16i {
    66    __m128i val;
    67    operator __m128i() const { return val; }
    68    Packet8q16i() : val(_mm_setzero_si128()) {}
    69    Packet8q16i(__m128i val) : val(val) {}
    70  } Packet8q16i;
    71  
    72  typedef struct Packet8q32i {
    73    __m256i val;
    74    operator __m256i() const { return val; }
    75    Packet8q32i() : val(_mm256_setzero_si256()){};
    76    Packet8q32i(__m256i val) : val(val) {}
    77  } Packet8q32i;
    78  
    79  typedef struct Packet4q32i {
    80    __m128i val;
    81    operator __m128i() const { return val; }
    82    Packet4q32i() : val(_mm_setzero_si128()) {}
    83    Packet4q32i(__m128i val) : val(val) {}
    84  } Packet4q32i;
    85  
    86  #ifndef EIGEN_VECTORIZE_AVX512
    87  template <>
    88  struct packet_traits<QInt8> : default_packet_traits {
    89    typedef Packet32q8i type;
    90    typedef Packet16q8i half;
    91    enum {
    92      Vectorizable = 1,
    93      AlignedOnScalar = 1,
    94      size = 32,
    95    };
    96    enum {
    97      HasAdd = 0,
    98      HasSub = 0,
    99      HasMul = 0,
   100      HasNegate = 0,
   101      HasAbs = 0,
   102      HasAbs2 = 0,
   103      HasMin = 1,
   104      HasMax = 1,
   105      HasConj = 0,
   106      HasSetLinear = 0
   107    };
   108  };
   109  template <>
   110  struct packet_traits<QUInt8> : default_packet_traits {
   111    typedef Packet32q8u type;
   112    typedef Packet16q8u half;
   113    enum {
   114      Vectorizable = 1,
   115      AlignedOnScalar = 1,
   116      size = 32,
   117    };
   118    enum {
   119      HasAdd = 0,
   120      HasSub = 0,
   121      HasMul = 0,
   122      HasNegate = 0,
   123      HasAbs = 0,
   124      HasAbs2 = 0,
   125      HasMin = 1,
   126      HasMax = 1,
   127      HasConj = 0,
   128      HasSetLinear = 0
   129    };
   130  };
   131  template <>
   132  struct packet_traits<QInt16> : default_packet_traits {
   133    typedef Packet16q16i type;
   134    typedef Packet8q16i half;
   135    enum {
   136      Vectorizable = 1,
   137      AlignedOnScalar = 1,
   138      size = 16,
   139    };
   140    enum {
   141      HasAdd = 0,
   142      HasSub = 0,
   143      HasMul = 0,
   144      HasNegate = 0,
   145      HasAbs = 0,
   146      HasAbs2 = 0,
   147      HasMin = 1,
   148      HasMax = 1,
   149      HasConj = 0,
   150      HasSetLinear = 0
   151    };
   152  };
   153  template <>
   154  struct packet_traits<QInt32> : default_packet_traits {
   155    typedef Packet8q32i type;
   156    typedef Packet4q32i half;
   157    enum {
   158      Vectorizable = 1,
   159      AlignedOnScalar = 1,
   160      size = 8,
   161    };
   162    enum {
   163      HasAdd = 1,
   164      HasSub = 1,
   165      HasMul = 1,
   166      HasNegate = 1,
   167      HasAbs = 0,
   168      HasAbs2 = 0,
   169      HasMin = 1,
   170      HasMax = 1,
   171      HasConj = 0,
   172      HasSetLinear = 0
   173    };
   174  };
   175  #endif
   176  
   177  template <>
   178  struct unpacket_traits<Packet32q8i> {
   179    typedef QInt8 type;
   180    typedef Packet16q8i half;
   181    enum {
   182      size = 32,
   183      alignment = Aligned32,
   184      vectorizable = true,
   185      masked_load_available = false,
   186      masked_store_available = false
   187    };
   188  };
   189  template <>
   190  struct unpacket_traits<Packet16q8i> {
   191    typedef QInt8 type;
   192    typedef Packet16q8i half;
   193    enum {
   194      size = 16,
   195      alignment = Aligned32,
   196      vectorizable = true,
   197      masked_load_available = false,
   198      masked_store_available = false
   199    };
   200  };
   201  template <>
   202  struct unpacket_traits<Packet16q16i> {
   203    typedef QInt16 type;
   204    typedef Packet8q16i half;
   205    enum {
   206      size = 16,
   207      alignment = Aligned32,
   208      vectorizable = true,
   209      masked_load_available = false,
   210      masked_store_available = false
   211    };
   212  };
   213  template <>
   214  struct unpacket_traits<Packet8q16i> {
   215    typedef QInt16 type;
   216    typedef Packet8q16i half;
   217    enum {
   218      size = 8,
   219      alignment = Aligned32,
   220      vectorizable = true,
   221      masked_load_available = false,
   222      masked_store_available = false
   223    };
   224  };
   225  template <>
   226  struct unpacket_traits<Packet32q8u> {
   227    typedef QUInt8 type;
   228    typedef Packet16q8u half;
   229    enum {
   230      size = 32,
   231      alignment = Aligned32,
   232      vectorizable = true,
   233      masked_load_available = false,
   234      masked_store_available = false
   235    };
   236  };
   237  template <>
   238  struct unpacket_traits<Packet8q32i> {
   239    typedef QInt32 type;
   240    typedef Packet4q32i half;
   241    enum {
   242      size = 8,
   243      alignment = Aligned32,
   244      vectorizable = true,
   245      masked_load_available = false,
   246      masked_store_available = false
   247    };
   248  };
   249  
   250  // Unaligned load
   251  template <>
   252  EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
   253    EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
   254        reinterpret_cast<const __m256i*>(from));
   255  }
   256  template <>
   257  EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) {
   258    EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
   259        reinterpret_cast<const __m128i*>(from));
   260  }
   261  template <>
   262  EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) {
   263    EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
   264        reinterpret_cast<const __m256i*>(from));
   265  }
   266  template <>
   267  EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) {
   268    EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
   269        reinterpret_cast<const __m256i*>(from));
   270  }
   271  template <>
   272  EIGEN_STRONG_INLINE Packet8q16i ploadu<Packet8q16i>(const QInt16* from) {
   273    EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
   274        reinterpret_cast<const __m128i*>(from));
   275  }
   276  template <>
   277  EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) {
   278    EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
   279        reinterpret_cast<const __m256i*>(from));
   280  }
   281  
   282  // Aligned load
   283  template <>
   284  EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
   285    EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
   286        reinterpret_cast<const __m256i*>(from));
   287  }
   288  template <>
   289  EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
   290    EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
   291        reinterpret_cast<const __m128i*>(from));
   292  }
   293  template <>
   294  EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) {
   295    EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
   296        reinterpret_cast<const __m256i*>(from));
   297  }
   298  template <>
   299  EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) {
   300    EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
   301        reinterpret_cast<const __m256i*>(from));
   302  }
   303  template <>
   304  EIGEN_STRONG_INLINE Packet8q16i pload<Packet8q16i>(const QInt16* from) {
   305    EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
   306        reinterpret_cast<const __m128i*>(from));
   307  }
   308  template <>
   309  EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
   310    EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
   311        reinterpret_cast<const __m256i*>(from));
   312  }
   313  
   314  // Unaligned store
   315  template <>
   316  EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
   317    EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
   318        reinterpret_cast<__m256i*>(to), from.val);
   319  }
   320  template <>
   321  EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
   322    EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
   323                                                 from.val);
   324  }
   325  template <>
   326  EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
   327    EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
   328        reinterpret_cast<__m256i*>(to), from.val);
   329  }
   330  template <>
   331  EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
   332    EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
   333        reinterpret_cast<__m256i*>(to), from.val);
   334  }
   335  template <>
   336  EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet8q16i& from) {
   337    EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
   338                                                 from.val);
   339  }
   340  template <>
   341  EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
   342    EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
   343        reinterpret_cast<__m256i*>(to), from.val);
   344  }
   345  
   346  // Aligned store
   347  template <>
   348  EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) {
   349    EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
   350                                                 from.val);
   351  }
   352  template <>
   353  EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
   354    EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
   355                                                 from.val);
   356  }
   357  template <>
   358  EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet8q16i& from) {
   359    EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
   360                                              from.val);
   361  }
   362  template <>
   363  EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
   364    EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
   365                                                 from.val);
   366  }
   367  template <>
   368  EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
   369    EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
   370                                                 from.val);
   371  }
   372  template <>
   373  EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
   374    EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
   375                                              from.val);
   376  }
   377  
   378  // Extract first element.
   379  template <>
   380  EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) {
   381    return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
   382  }
   383  template <>
   384  EIGEN_STRONG_INLINE QInt16 pfirst<Packet16q16i>(const Packet16q16i& a) {
   385    return _mm256_extract_epi16_N0(a.val);
   386  }
   387  template <>
   388  EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) {
   389    return static_cast<uint8_t>(_mm256_extract_epi8_N0(a.val));
   390  }
   391  template <>
   392  EIGEN_STRONG_INLINE QInt8 pfirst<Packet32q8i>(const Packet32q8i& a) {
   393    return _mm256_extract_epi8_N0(a.val);
   394  }
   395  
   396  // Initialize to constant value.
   397  template <>
   398  EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) {
   399    return _mm256_set1_epi8(from.value);
   400  }
   401  template <>
   402  EIGEN_STRONG_INLINE Packet32q8u pset1<Packet32q8u>(const QUInt8& from) {
   403    return _mm256_set1_epi8(static_cast<uint8_t>(from.value));
   404  }
   405  template <>
   406  EIGEN_STRONG_INLINE Packet8q32i pset1<Packet8q32i>(const QInt32& from) {
   407    return _mm256_set1_epi32(from.value);
   408  }
   409  
   410  // Basic arithmetic packet ops for QInt32.
   411  template <>
   412  EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a,
   413                                                    const Packet8q32i& b) {
   414    return _mm256_add_epi32(a.val, b.val);
   415  }
   416  template <>
   417  EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) {
   418    return _mm256_set1_epi16(from.value);
   419  }
   420  template <>
   421  EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a,
   422                                                    const Packet8q32i& b) {
   423    return _mm256_sub_epi32(a.val, b.val);
   424  }
   425  // Note: mullo truncates the result to 32 bits.
   426  template <>
   427  EIGEN_STRONG_INLINE Packet8q32i pmul<Packet8q32i>(const Packet8q32i& a,
   428                                                    const Packet8q32i& b) {
   429    return _mm256_mullo_epi32(a.val, b.val);
   430  }
   431  template <>
   432  EIGEN_STRONG_INLINE Packet8q32i pnegate<Packet8q32i>(const Packet8q32i& a) {
   433    return _mm256_sub_epi32(_mm256_setzero_si256(), a.val);
   434  }
   435  
   436  // Min and max.
   437  template <>
   438  EIGEN_STRONG_INLINE Packet8q32i pmin<Packet8q32i>(const Packet8q32i& a,
   439                                                    const Packet8q32i& b) {
   440    return _mm256_min_epi32(a.val, b.val);
   441  }
   442  template <>
   443  EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a,
   444                                                    const Packet8q32i& b) {
   445    return _mm256_max_epi32(a.val, b.val);
   446  }
   447  
   448  template <>
   449  EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a,
   450                                                      const Packet16q16i& b) {
   451    return _mm256_min_epi16(a.val, b.val);
   452  }
   453  template <>
   454  EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a,
   455                                                      const Packet16q16i& b) {
   456    return _mm256_max_epi16(a.val, b.val);
   457  }
   458  
   459  template <>
   460  EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a,
   461                                                    const Packet32q8u& b) {
   462    return _mm256_min_epu8(a.val, b.val);
   463  }
   464  template <>
   465  EIGEN_STRONG_INLINE Packet32q8u pmax<Packet32q8u>(const Packet32q8u& a,
   466                                                    const Packet32q8u& b) {
   467    return _mm256_max_epu8(a.val, b.val);
   468  }
   469  
   470  template <>
   471  EIGEN_STRONG_INLINE Packet32q8i pmin<Packet32q8i>(const Packet32q8i& a,
   472                                                    const Packet32q8i& b) {
   473    return _mm256_min_epi8(a.val, b.val);
   474  }
   475  template <>
   476  EIGEN_STRONG_INLINE Packet32q8i pmax<Packet32q8i>(const Packet32q8i& a,
   477                                                    const Packet32q8i& b) {
   478    return _mm256_max_epi8(a.val, b.val);
   479  }
   480  
   481  // Reductions.
   482  template <>
   483  EIGEN_STRONG_INLINE QInt32 predux_min<Packet8q32i>(const Packet8q32i& a) {
   484    __m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1));
   485    tmp =
   486        _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   487    return pfirst<Packet8q32i>(
   488        _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
   489  }
   490  template <>
   491  EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) {
   492    __m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1));
   493    tmp =
   494        _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   495    return pfirst<Packet8q32i>(
   496        _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
   497  }
   498  
   499  template <>
   500  EIGEN_STRONG_INLINE QInt16 predux_min<Packet16q16i>(const Packet16q16i& a) {
   501    __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1));
   502    tmp =
   503        _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   504    tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
   505    return std::min(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp));
   506  }
   507  template <>
   508  EIGEN_STRONG_INLINE QInt16 predux_max<Packet16q16i>(const Packet16q16i& a) {
   509    __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1));
   510    tmp =
   511        _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   512    tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
   513    return std::max(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp));
   514  }
   515  
   516  template <>
   517  EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) {
   518    __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1));
   519    tmp =
   520        _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   521    tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
   522    tmp = _mm256_min_epu8(tmp,
   523                          _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   524    return std::min(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)),
   525                    static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp)));
   526  }
   527  template <>
   528  EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) {
   529    __m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1));
   530    tmp =
   531        _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   532    tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
   533    tmp = _mm256_max_epu8(tmp,
   534                          _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   535    return std::max(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)),
   536                    static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp)));
   537  }
   538  
   539  template <>
   540  EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) {
   541    __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1));
   542    tmp =
   543        _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   544    tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
   545    tmp = _mm256_min_epi8(tmp,
   546                          _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   547    return std::min(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
   548  }
   549  template <>
   550  EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) {
   551    __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1));
   552    tmp =
   553        _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   554    tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
   555    tmp = _mm256_max_epi8(tmp,
   556                          _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   557    return std::max(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
   558  }
   559  
// Vectorized scaling of Packet8q32i by double.
   561  template <>
   562  struct scalar_product_op<QInt32, double> : binary_op_base<QInt32, double> {
   563    typedef typename ScalarBinaryOpTraits<QInt32, double>::ReturnType result_type;
   564  #ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
   565    EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
   566  #else
   567    scalar_product_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN }
   568  #endif
   569    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type
   570    operator()(const QInt32& a, const double& b) const {
   571      return a * b;
   572    }
   573  
   574    EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a,
   575                                                   const double& b) const {
   576      __m256d scale = _mm256_set1_pd(b);
   577      __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
   578      __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo));
   579      __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1));
   580      __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi));
   581      return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi,
   582                                     1);
   583    }
   584  };
   585  
   586  template <>
   587  struct functor_traits<scalar_product_op<QInt32, double>> {
   588    enum { Cost = 4 * NumTraits<float>::MulCost, PacketAccess = true };
   589  };
   590  
   591  }  // end namespace internal
   592  }  // end namespace Eigen
   593  
   594  #endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_