// Copyright 2022 gorse Project Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <arm_neon.h>
#include <stdint.h>

// Every kernel below processes the input in groups of four floats with NEON
// intrinsics and then handles the remaining n % 4 elements with scalar code.
// Element counts are kept in `long` (the type of `n`) so that large inputs
// are not truncated. A non-positive `n` makes every loop a no-op.

// Computes c[i] += a[i] * b[0] for i in [0, n).
// `b` points to a single scalar; it is read through the pointer each time.
void vmul_const_add_to(float *a, float *b, float *c, long n) {
    long blocks = n / 4;
    long tail = n % 4;
    for (long k = 0; k < blocks; k++) {
        float32x4_t va = vld1q_f32(a);
        float32x4_t vc = vld1q_f32(c);
        vst1q_f32(c, vmlaq_n_f32(vc, va, *b));
        a += 4;
        c += 4;
    }
    // a and c now point at the first element not covered by a full group.
    for (long k = 0; k < tail; k++) {
        c[k] += a[k] * b[0];
    }
}

// Computes c[i] = a[i] * b[0] for i in [0, n).
// `b` points to a single scalar; it is read through the pointer each time.
void vmul_const_to(float *a, float *b, float *c, long n) {
    long blocks = n / 4;
    long tail = n % 4;
    for (long k = 0; k < blocks; k++) {
        float32x4_t va = vld1q_f32(a);
        vst1q_f32(c, vmulq_n_f32(va, *b));
        a += 4;
        c += 4;
    }
    for (long k = 0; k < tail; k++) {
        c[k] = a[k] * b[0];
    }
}

// Scales a in place: a[i] *= b[0] for i in [0, n).
// `b` points to a single scalar; it is read through the pointer each time.
void vmul_const(float *a, float *b, long n) {
    long blocks = n / 4;
    long tail = n % 4;
    for (long k = 0; k < blocks; k++) {
        float32x4_t va = vld1q_f32(a);
        vst1q_f32(a, vmulq_n_f32(va, *b));
        a += 4;
    }
    for (long k = 0; k < tail; k++) {
        a[k] *= b[0];
    }
}

// Element-wise product: c[i] = a[i] * b[i] for i in [0, n).
void vmul_to(float *a, float *b, float *c, long n) {
    long blocks = n / 4;
    long tail = n % 4;
    for (long k = 0; k < blocks; k++) {
        float32x4_t va = vld1q_f32(a);
        float32x4_t vb = vld1q_f32(b);
        vst1q_f32(c, vmulq_f32(va, vb));
        a += 4;
        b += 4;
        c += 4;
    }
    for (long k = 0; k < tail; k++) {
        c[k] = a[k] * b[k];
    }
}

// Dot product: stores sum(a[i] * b[i] for i in [0, n)) into *ret.
//
// Four per-lane partial sums are accumulated over the full groups, the lanes
// are then added together in order 0..3, and finally the n % 4 leftover
// products are added. For n <= 0 the result is 0.
void vdot(float *a, float *b, long n, float *ret) {
    long blocks = n / 4;
    long tail = n % 4;

    // Start from an all-zero accumulator so the lane reduction below is well
    // defined even when there is no full group of four (n < 4). Previously
    // the accumulator was only assigned when at least one group existed and
    // was read uninitialized otherwise.
    float32x4_t acc = vdupq_n_f32(0.0f);
    for (long k = 0; k < blocks; k++) {
        float32x4_t va = vld1q_f32(a);
        float32x4_t vb = vld1q_f32(b);
        acc = vmlaq_f32(acc, va, vb);
        a += 4;
        b += 4;
    }

    float lanes[4];
    vst1q_f32(lanes, acc);

    *ret = 0;
    for (int lane = 0; lane < 4; lane++) {
        *ret += lanes[lane];
    }
    // a and b now point at the first element not covered by a full group.
    for (long k = 0; k < tail; k++) {
        *ret += a[k] * b[k];
    }
}