gitee.com/quant1x/gox@v1.7.6/num/asm/_cpp/find.cpp (about)

     1  #include <cstddef>
     2  #include <x86intrin.h>
     3  
     4  size_t Find_F64(double* x, double a, size_t n) {
     5      __m256d va = _mm256_set1_pd(a);
     6  
     7      size_t i = 0;
     8      for (; i < (n & size_t(-8)); i += 8) {
     9          __m256d y1 = _mm256_loadu_pd(&x[i]);
    10          __m256d y2 = _mm256_loadu_pd(&x[i + 4]);
    11          __m256i m1 = _mm256_cmpeq_epi64(va, y1);
    12          __m256i m2 = _mm256_cmpeq_epi64(va, y2);
    13          __m256i m = _mm256_or_si256(m1, m2);
    14          if (!_mm256_testz_si256(m, m)) {
    15              int mask = (_mm256_movemask_pd((__m256)m2) << 4) + _mm256_movemask_pd((__m256)m1);
    16              return i + __builtin_ctz(mask);
    17          }
    18      }
    19      for (; i < n; i++) {
    20          if (x[i] == a) {
    21              return i;
    22          }
    23      }
    24  
    25      return i;
    26  }
    27  
    28  size_t Find_F32(float* x, float a, size_t n) {
    29      __m256 va = _mm256_set1_ps(a);
    30  
    31      size_t i = 0;
    32      for (; i < (n & size_t(-16)); i += 16) {
    33          __m256 y1 = _mm256_loadu_ps(&x[i]);
    34          __m256 y2 = _mm256_loadu_ps(&x[i + 8]);
    35          __m256i m1 = _mm256_cmpeq_epi32(va, y1);
    36          __m256i m2 = _mm256_cmpeq_epi32(va, y2);
    37          __m256i m = _mm256_or_si256(m1, m2);
    38          if (!_mm256_testz_si256(m, m)) {
    39              int mask = (_mm256_movemask_ps((__m256)m2) << 8) + _mm256_movemask_ps((__m256)m1);
    40              return i + __builtin_ctz(mask);
    41          }
    42      }
    43      for (; i < n; i++) {
    44          if (x[i] == a) {
    45              return i;
    46          }
    47      }
    48  
    49      return i;
    50  }