github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/distancer/c/l2_arm64.c (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  #include <arm_neon.h>
    13  
    14  // l2 only works with length >= 16
    15  void l2(float *a, float *b, float *res, long *len)
    16  {
    17      int size = *len;
    18  
    19      // use the vectorized version for the first n - (n % 4) elements
    20      int l = size - (size % 4);
    21  
    22      // create 4*4 registers to store the result
    23      float32x4_t res_vec0 = vdupq_n_f32(0);
    24      float32x4_t res_vec1 = vdupq_n_f32(0);
    25      float32x4_t res_vec2 = vdupq_n_f32(0);
    26      float32x4_t res_vec3 = vdupq_n_f32(0);
    27  
    28      int i = 0;
    29  
    30      // load 4*4 floats at a time
    31      while (i + 16 <= l)
    32      {
    33          float32x4x4_t a4 = vld1q_f32_x4(a + i);
    34          float32x4x4_t b4 = vld1q_f32_x4(b + i);
    35  
    36          float32x4_t diff0 = vsubq_f32(a4.val[0], b4.val[0]);
    37          float32x4_t diff1 = vsubq_f32(a4.val[1], b4.val[1]);
    38          float32x4_t diff2 = vsubq_f32(a4.val[2], b4.val[2]);
    39          float32x4_t diff3 = vsubq_f32(a4.val[3], b4.val[3]);
    40          res_vec0 += vmulq_f32(diff0, diff0);
    41          res_vec1 += vmulq_f32(diff1, diff1);
    42          res_vec2 += vmulq_f32(diff2, diff2);
    43          res_vec3 += vmulq_f32(diff3, diff3);
    44  
    45          i += 16;
    46      }
    47  
    48      while (i < l)
    49      {
    50          float32x4_t a_vec = vld1q_f32(a + i);
    51          float32x4_t b_vec = vld1q_f32(b + i);
    52          float32x4_t diff = vsubq_f32(a_vec, b_vec);
    53          res_vec0 += vmulq_f32(diff, diff);
    54  
    55          i += 4;
    56      }
    57  
    58      // convert to scalar
    59      float sum = vaddvq_f32(res_vec0);
    60      sum += vaddvq_f32(res_vec1);
    61      sum += vaddvq_f32(res_vec2);
    62      sum += vaddvq_f32(res_vec3);
    63  
    64      // add the remaining vectors
    65      for (int i = l; i < size; i++)
    66      {
    67          float diff = a[i] - b[i];
    68          float sq = diff * diff;
    69          sum += sq;
    70      }
    71  
    72      res[0] = sum;
    73  }