// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// Register roles used by DotInc.
#define XP      SI  // cursor into x
#define YP      DI  // cursor into y
#define BLOCKS  CX  // n on entry, afterwards the number of 4-element blocks left
#define REM     BX  // n % 4: elements left for the scalar tail loop
#define STEP_X  R8  // holds ix briefly, then incX scaled to bytes
#define STEP3_X R10 // 3 * STEP_X
#define STEP_Y  R9  // holds iy briefly, then incY scaled to bytes
#define STEP3_Y R11 // 3 * STEP_Y
#define ACC     X0  // primary accumulator and final result
#define ACC_ALT X1  // secondary accumulator, folded into ACC after the unrolled loop
#define LANE0   X2  // per-element products (LANE0 is also the tail scratch)
#define LANE1   X3
#define LANE2   X4
#define LANE3   X5

// func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)
//
// DotInc computes the strided dot product
//
//	sum = x[ix]*y[iy] + x[ix+incX]*y[iy+incY] + ... (n terms)
//
// and returns 0 when n == 0. The routine performs no bounds checks of its
// own. Elements are consumed four at a time with two independent
// accumulators (elements 0 and 2 of each block go to ACC, elements 1 and 3
// to ACC_ALT); the remaining n % 4 elements are added to ACC one by one.
TEXT ·DotInc(SB), NOSPLIT, $0
	MOVQ  x_base+0(FP), XP         // XP = &x[0]
	MOVQ  y_base+24(FP), YP        // YP = &y[0]
	PXOR  ACC, ACC                 // ACC = 0; this is the result if n == 0
	MOVQ  n+48(FP), BLOCKS         // BLOCKS = n
	TESTQ BLOCKS, BLOCKS
	JZ    store_result             // if n == 0 { return 0 }

	// Advance both cursors to their starting elements.
	MOVQ ix+72(FP), STEP_X
	LEAQ (XP)(STEP_X*4), XP        // XP = &x[ix]
	MOVQ iy+80(FP), STEP_Y
	LEAQ (YP)(STEP_Y*4), YP        // YP = &y[iy]

	// Convert the element strides into byte strides.
	MOVQ incX+56(FP), STEP_X
	SHLQ $2, STEP_X                // STEP_X = incX * sizeof(float32)
	MOVQ incY+64(FP), STEP_Y
	SHLQ $2, STEP_Y                // STEP_Y = incY * sizeof(float32)

	// Split n into whole blocks of four and a remainder.
	MOVQ BLOCKS, REM
	ANDQ $3, REM                   // REM = n % 4
	SHRQ $2, BLOCKS                // BLOCKS = n / 4
	JZ   scalar_tail               // fewer than four elements: n != 0, so REM > 0

	PXOR ACC_ALT, ACC_ALT          // ACC_ALT = 0
	LEAQ (STEP_X)(STEP_X*2), STEP3_X // STEP3_X = 3 * STEP_X
	LEAQ (STEP_Y)(STEP_Y*2), STEP3_Y // STEP3_Y = 3 * STEP_Y

unrolled_block: // do {
	// Load four consecutive strided elements of x.
	MOVSS (XP), LANE0
	MOVSS (XP)(STEP_X*1), LANE1
	MOVSS (XP)(STEP_X*2), LANE2
	MOVSS (XP)(STEP3_X*1), LANE3

	// Multiply each by the matching strided element of y.
	MULSS (YP), LANE0
	MULSS (YP)(STEP_Y*1), LANE1
	MULSS (YP)(STEP_Y*2), LANE2
	MULSS (YP)(STEP3_Y*1), LANE3

	// Alternate between the two accumulators.
	ADDSS LANE0, ACC
	ADDSS LANE1, ACC_ALT
	ADDSS LANE2, ACC
	ADDSS LANE3, ACC_ALT

	// Step both cursors past the four elements just consumed.
	LEAQ (XP)(STEP_X*4), XP        // XP += 4 * STEP_X bytes
	LEAQ (YP)(STEP_Y*4), YP        // YP += 4 * STEP_Y bytes

	DECQ BLOCKS
	JNZ  unrolled_block            // } while --BLOCKS != 0

	ADDSS ACC_ALT, ACC             // ACC += ACC_ALT
	TESTQ REM, REM
	JZ    store_result             // no remainder: done

scalar_tail: // do {
	MOVSS (XP), LANE0              // LANE0 = current x element
	MULSS (YP), LANE0              // LANE0 *= current y element
	ADDSS LANE0, ACC               // ACC += LANE0
	ADDQ  STEP_X, XP               // next x element
	ADDQ  STEP_Y, YP               // next y element
	DECQ  REM
	JNZ   scalar_tail              // } while --REM != 0

store_result:
	MOVSS ACC, sum+88(FP)          // return ACC
	RET