// Source: github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/hnsw/distancer/c/dot_arm64.c
//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

#include <arm_neon.h>

// dot computes the dot product of the float arrays a and b using NEON
// intrinsics and stores the scalar result in res[0].
//
//   a, b : input arrays; elements [0, *len) of each are read
//   res  : output; only res[0] is written
//   len  : pointer to the element count (read once, narrowed to int)
//
// The original note on this function says it only works with length >= 16.
// NOTE(review): the C code below also has paths for shorter inputs (4-wide
// loop and scalar tail); the reason for the restriction is not visible in
// this file -- confirm with callers before relying on shorter lengths.
void dot(float *a, float *b, float *res, long *len)
{
    // the element count is narrowed from long to int here
    const int size = *len;

    // end of the range handled with 4-lane vectors: size rounded toward
    // zero to a multiple of 4 (same value as size - (size % 4))
    const int vec_end = (size / 4) * 4;

    // four independent 4-lane accumulators, all starting at zero
    float32x4_t acc0 = vdupq_n_f32(0);
    float32x4_t acc1 = vdupq_n_f32(0);
    float32x4_t acc2 = vdupq_n_f32(0);
    float32x4_t acc3 = vdupq_n_f32(0);

    int pos = 0;

    // main loop: 16 floats (4 vectors of 4 lanes) from each input per pass
    for (; pos + 16 <= vec_end; pos += 16)
    {
        const float32x4x4_t lhs = vld1q_f32_x4(a + pos);
        const float32x4x4_t rhs = vld1q_f32_x4(b + pos);

        acc0 = vaddq_f32(acc0, vmulq_f32(lhs.val[0], rhs.val[0]));
        acc1 = vaddq_f32(acc1, vmulq_f32(lhs.val[1], rhs.val[1]));
        acc2 = vaddq_f32(acc2, vmulq_f32(lhs.val[2], rhs.val[2]));
        acc3 = vaddq_f32(acc3, vmulq_f32(lhs.val[3], rhs.val[3]));
    }

    // leftover whole vectors: 4 floats per pass, folded into accumulator 0
    for (; pos < vec_end; pos += 4)
    {
        const float32x4_t lhs = vld1q_f32(a + pos);
        const float32x4_t rhs = vld1q_f32(b + pos);
        acc0 = vaddq_f32(acc0, vmulq_f32(lhs, rhs));
    }

    // horizontal reduction; the accumulators are added in index order
    float total = vaddvq_f32(acc0);
    total += vaddvq_f32(acc1);
    total += vaddvq_f32(acc2);
    total += vaddvq_f32(acc3);

    // scalar tail: the last size % 4 elements
    for (int tail = vec_end; tail < size; tail++)
    {
        total += a[tail] * b[tail];
    }

    res[0] = total;
}