// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// Register roles used by DdotInc:
//   X_ADDR, Y_ADDR    address of the next x / y element to read
//   COUNT             n on entry, afterwards the number of 4-element groups left
//   REMAIN            n % 4, the elements handled one at a time after the groups
//   STEP_X, STEP_Y    hold ix / iy briefly, then incX / incY scaled to bytes
//   STEP3_X, STEP3_Y  three times the byte stride (offset of the 4th element)
//   ACC               running float64 sum; also the value that is returned
//   ACC_ALT           second running sum used only inside the unrolled loop
#define X_ADDR  SI
#define Y_ADDR  DI
#define COUNT   CX
#define REMAIN  BX
#define STEP_X  R8
#define STEP3_X R10
#define STEP_Y  R9
#define STEP3_Y R11
#define ACC     X0
#define ACC_ALT X1

// func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
//
// Computes the sum over i in [0, n) of x[ix+i*incX] * y[iy+i*incY].
// Every float32 operand is widened to float64 before it is multiplied,
// and the products are accumulated in float64.
TEXT ·DdotInc(SB), NOSPLIT, $0
	PXOR  ACC, ACC                           // sum = 0
	MOVQ  n+48(FP), COUNT
	TESTQ COUNT, COUNT
	JZ    done                               // n == 0: return the zero sum

	// Point at the first element of each operand: &x[ix] and &y[iy].
	MOVQ x_base+0(FP), X_ADDR
	MOVQ ix+72(FP), STEP_X
	LEAQ (X_ADDR)(STEP_X*4), X_ADDR
	MOVQ y_base+24(FP), Y_ADDR
	MOVQ iy+80(FP), STEP_Y
	LEAQ (Y_ADDR)(STEP_Y*4), Y_ADDR

	// Turn the element increments into byte strides (4 bytes per float32).
	MOVQ incX+56(FP), STEP_X
	SHLQ $2, STEP_X
	MOVQ incY+64(FP), STEP_Y
	SHLQ $2, STEP_Y

	// Split n into whole groups of four plus a remainder.
	MOVQ COUNT, REMAIN
	ANDQ $3, REMAIN                          // REMAIN = n % 4
	SHRQ $2, COUNT                           // COUNT  = n / 4
	JZ   remainder                           // ZF comes from SHRQ: no whole group

	PXOR ACC_ALT, ACC_ALT                    // second accumulator starts at 0
	LEAQ (STEP_X)(STEP_X*2), STEP3_X         // 3 * byte stride of x
	LEAQ (STEP_Y)(STEP_Y*2), STEP3_Y         // 3 * byte stride of y

unrolled:
	// Widen four strided x elements to float64.
	CVTSS2SD (X_ADDR), X2
	CVTSS2SD (X_ADDR)(STEP_X*1), X3
	CVTSS2SD (X_ADDR)(STEP_X*2), X4
	CVTSS2SD (X_ADDR)(STEP3_X*1), X5

	// Widen the four matching y elements.
	CVTSS2SD (Y_ADDR), X6
	CVTSS2SD (Y_ADDR)(STEP_Y*1), X7
	CVTSS2SD (Y_ADDR)(STEP_Y*2), X8
	CVTSS2SD (Y_ADDR)(STEP3_Y*1), X9

	// Element-wise products, left in X2..X5.
	MULSD X6, X2
	MULSD X7, X3
	MULSD X8, X4
	MULSD X9, X5

	// Products 0 and 2 go to ACC, products 1 and 3 go to ACC_ALT.
	ADDSD X2, ACC
	ADDSD X3, ACC_ALT
	ADDSD X4, ACC
	ADDSD X5, ACC_ALT

	// Step both operands forward by four elements.
	LEAQ (X_ADDR)(STEP_X*4), X_ADDR
	LEAQ (Y_ADDR)(STEP_Y*4), Y_ADDR

	DECQ COUNT
	JNZ  unrolled                            // repeat while groups remain

	ADDSD ACC_ALT, ACC                       // fold the second accumulator in
	TESTQ REMAIN, REMAIN
	JZ    done                               // n was a multiple of four

remainder:
	// One element per iteration; REMAIN is non-zero on entry.
	CVTSS2SD (X_ADDR), X2                    // X2 = float64(x element)
	CVTSS2SD (Y_ADDR), X3                    // X3 = float64(y element)
	MULSD    X3, X2                          // X2 *= X3
	ADDSD    X2, ACC                         // sum += X2
	LEAQ     (X_ADDR)(STEP_X*1), X_ADDR      // advance x by one stride
	LEAQ     (Y_ADDR)(STEP_Y*1), Y_ADDR      // advance y by one stride
	DECQ     REMAIN
	JNZ      remainder

done:
	MOVSD ACC, sum+88(FP)                    // store the result
	RET