gitee.com/quant1x/num@v0.3.2/asm/src/floats_neon.c (about)

     1  // Copyright 2022 gorse Project Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  // http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  #include <arm_neon.h>
    16  #include <stdint.h>
    17  
    18  void vmul_const_add_to(float *a, float *b, float *c, long n) {
    19      int epoch = n / 4;
    20      int remain = n % 4;
    21      for (int i = 0; i < epoch; i++) {
    22          float32x4_t v1 = vld1q_f32(a);
    23          float32x4_t v3 = vld1q_f32(c);
    24          float32x4_t v = vmlaq_n_f32(v3, v1, *b);
    25          vst1q_f32(c, v);
    26          a += 4;
    27          c += 4;
    28      }
    29      for (int i = 0; i < remain; i++) {
    30          c[i] += a[i] * b[0];
    31      }
    32  }
    33  
    34  void vmul_const_to(float *a, float *b, float *c, long n) {
    35      int epoch = n / 4;
    36      int remain = n % 4;
    37      for (int i = 0; i < epoch; i++) {
    38          float32x4_t v1 = vld1q_f32(a);
    39          float32x4_t v = vmulq_n_f32(v1, *b);
    40          vst1q_f32(c, v);
    41          a += 4;
    42          c += 4;
    43      }
    44      for (int i = 0; i < remain; i++) {
    45          c[i] = a[i] * b[0];
    46      }
    47  }
    48  
    49  void vmul_const(float *a, float *b, long n) {
    50      int epoch = n / 4;
    51      int remain = n % 4;
    52      for (int i = 0; i < epoch; i++) {
    53          float32x4_t v1 = vld1q_f32(a);
    54          float32x4_t v = vmulq_n_f32(v1, *b);
    55          vst1q_f32(a, v);
    56          a += 4;
    57      }
    58      for (int i = 0; i < remain; i++) {
    59          a[i] *= b[0];
    60      }
    61  }
    62  
    63  void vmul_to(float *a, float *b, float *c, long n) {
    64      int epoch = n / 4;
    65      int remain = n % 4;
    66      for (int i = 0; i < epoch; i++) {
    67          float32x4_t v1 = vld1q_f32(a);
    68          float32x4_t v2 = vld1q_f32(b);
    69          float32x4_t v = vmulq_f32(v1, v2);
    70          vst1q_f32(c, v);
    71          a += 4;
    72          b += 4;
    73          c += 4;
    74      }
    75      for (int i = 0; i < remain; i++) {
    76          c[i] = a[i] * b[i];
    77      }
    78  }
    79  
    80  void vdot(float *a, float *b, long n, float* ret) {
    81      int epoch = n / 4;
    82      int remain = n % 4;
    83      float32x4_t s;
    84      if (epoch > 0) {
    85          float32x4_t v1 = vld1q_f32(a);
    86          float32x4_t v2 = vld1q_f32(b);
    87          s = vmulq_f32(v1, v2);
    88          a += 4;
    89          b += 4;
    90      }
    91      for (int i = 1; i < epoch; i++) {
    92          float32x4_t v1 = vld1q_f32(a);
    93          float32x4_t v2 = vld1q_f32(b);
    94          s = vmlaq_f32(s, v1, v2);
    95          a += 4;
    96          b += 4;
    97      }
    98      float partial[4];
    99      vst1q_f32(partial, s);
   100      *ret = 0;
   101      for (int i = 0; i < 4; i++) {
   102          *ret += partial[i];
   103      }
   104      for (int i = 0; i < remain; i++) {
   105          *ret += a[i] * b[i];
   106      }
   107  }