// Source: github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/distancer/c/l2_arm64.c
//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

#include <arm_neon.h>

// l2 computes the squared Euclidean distance between two float vectors,
// i.e. the sum over i in [0, *len) of (a[i] - b[i])^2, and stores it in
// res[0]. No square root is taken.
//
// Parameters:
//   a, b - input vectors; both are read at indices 0 .. *len - 1
//   res  - output; only res[0] is written
//   len  - pointer to the element count, read once on entry
//
// A length of zero (or a negative one) runs no loop and stores 0.
//
// The original header said "l2 only works with length >= 16".
// NOTE(review): the loops below handle shorter inputs as well, so that limit
// presumably comes from the caller or the build pipeline - confirm before
// relying on either reading.
void l2(float *a, float *b, float *res, long *len)
{
    // Keep the full width of *len. Narrowing it to int would make the loop
    // bounds wrong for any length above INT_MAX.
    const long size = *len;

    // Largest multiple of 4 that does not exceed size; everything below this
    // index is processed four lanes at a time.
    const long simd_end = size - (size % 4);

    // One accumulator per 4-lane group of the 16-float main loop.
    float32x4_t acc0 = vdupq_n_f32(0);
    float32x4_t acc1 = vdupq_n_f32(0);
    float32x4_t acc2 = vdupq_n_f32(0);
    float32x4_t acc3 = vdupq_n_f32(0);

    long i = 0;

    // Main loop: 16 floats (4 registers of 4 lanes) per iteration.
    // Written as a subtraction so the bound check cannot overflow.
    for (; simd_end - i >= 16; i += 16)
    {
        const float32x4x4_t va = vld1q_f32_x4(a + i);
        const float32x4x4_t vb = vld1q_f32_x4(b + i);

        const float32x4_t d0 = vsubq_f32(va.val[0], vb.val[0]);
        const float32x4_t d1 = vsubq_f32(va.val[1], vb.val[1]);
        const float32x4_t d2 = vsubq_f32(va.val[2], vb.val[2]);
        const float32x4_t d3 = vsubq_f32(va.val[3], vb.val[3]);

        // Multiply, then add, as two separate intrinsics (not a fused
        // multiply-add intrinsic).
        acc0 = vaddq_f32(acc0, vmulq_f32(d0, d0));
        acc1 = vaddq_f32(acc1, vmulq_f32(d1, d1));
        acc2 = vaddq_f32(acc2, vmulq_f32(d2, d2));
        acc3 = vaddq_f32(acc3, vmulq_f32(d3, d3));
    }

    // Up to three remaining groups of 4 floats go into the first accumulator.
    for (; i < simd_end; i += 4)
    {
        const float32x4_t d = vsubq_f32(vld1q_f32(a + i), vld1q_f32(b + i));
        acc0 = vaddq_f32(acc0, vmulq_f32(d, d));
    }

    // Horizontal reduction of each accumulator to a scalar, in a fixed order.
    float sum = vaddvq_f32(acc0);
    sum += vaddvq_f32(acc1);
    sum += vaddvq_f32(acc2);
    sum += vaddvq_f32(acc3);

    // Scalar tail: the last size % 4 elements.
    for (long j = simd_end; j < size; j++)
    {
        const float d = a[j] - b[j];
        sum += d * d;
    }

    res[0] = sum;
}