gonum.org/v1/gonum@v0.14.0/internal/asm/f32/ddotinc_amd64.s (about)

     1  // Copyright ©2017 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !noasm,!gccgo,!safe
     6  
     7  #include "textflag.h"
     8  
     9  #define X_PTR SI      // read cursor into x
    10  #define Y_PTR DI      // read cursor into y
    11  #define LEN CX        // n on entry, later the number of 4-element groups (n / 4)
    12  #define TAIL BX       // leftover element count (n % 4)
    13  #define INC_X R8      // briefly holds ix, then the x stride in bytes (incX * 4)
    14  #define INCx3_X R10   // 3 * INC_X: byte offset of the 4th x element of a group
    15  #define INC_Y R9      // briefly holds iy, then the y stride in bytes (incY * 4)
    16  #define INCx3_Y R11   // 3 * INC_Y: byte offset of the 4th y element of a group
    17  #define SUM X0        // float64 accumulator; holds the value that is returned
    18  #define P_SUM X1      // second float64 accumulator, used only by the unrolled loop
    19  
    20  // func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
        //
        // DdotInc returns the sum over i = 0 .. n-1 of
        //     float64(x[ix + i*incX]) * float64(y[iy + i*incY]).
        // Every float32 operand is widened to float64 (CVTSS2SD) before it is
        // multiplied, and the products are accumulated in float64.
        //
        // n == 0 returns 0 without reading any element. The slice lengths are
        // never read, so nothing here checks that the accessed indices are in range.
        //
        // The main loop handles 4 elements per iteration using two accumulators
        // (SUM takes products 0 and 2 of each group, P_SUM takes 1 and 3) that are
        // added together once the loop finishes; the remaining n % 4 elements are
        // then added to SUM one at a time. The order of additions therefore
        // differs from a strict left-to-right sum.
    21  TEXT ·DdotInc(SB), NOSPLIT, $0
    22  	MOVQ x_base+0(FP), X_PTR  // X_PTR = data pointer of x
    23  	MOVQ y_base+24(FP), Y_PTR // Y_PTR = data pointer of y
    24  	MOVQ n+48(FP), LEN        // LEN = n
    25  	PXOR SUM, SUM             // SUM = 0
    26  	CMPQ LEN, $0              // if n == 0 { return 0 }
    27  	JE   dot_end
    28  
    29  	MOVQ ix+72(FP), INC_X        // INC_X = ix  (stride register used as a temporary)
    30  	MOVQ iy+80(FP), INC_Y        // INC_Y = iy
    31  	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix])  (scale 4 = sizeof(float32))
    32  	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy])
    33  
    34  	MOVQ incX+56(FP), INC_X // INC_X = incX
    35  	SHLQ $2, INC_X          // INC_X *= sizeof(float32): the stride is now in bytes
    36  	MOVQ incY+64(FP), INC_Y // INC_Y = incY
    37  	SHLQ $2, INC_Y          // INC_Y *= sizeof(float32): the stride is now in bytes
    38  
    39  	MOVQ LEN, TAIL
    40  	ANDQ $3, TAIL  // TAIL = LEN % 4
    41  	SHRQ $2, LEN   // LEN = floor( LEN / 4 )
    42  	JZ   dot_tail  // if LEN == 0 { goto dot_tail }  (uses ZF from SHRQ; n is 1..3 here, so TAIL != 0)
    43  
    44  	PXOR P_SUM, P_SUM              // P_SUM = 0  second accumulator, for pipelining
    45  	LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
    46  	LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
    47  
    48  dot_loop: // Loop unrolled 4x  do {
    49  	CVTSS2SD (X_PTR), X2            // X2..X5 = float64 of the next 4 strided x elements
    50  	CVTSS2SD (X_PTR)(INC_X*1), X3
    51  	CVTSS2SD (X_PTR)(INC_X*2), X4
    52  	CVTSS2SD (X_PTR)(INCx3_X*1), X5
    53  
    54  	CVTSS2SD (Y_PTR), X6            // X6..X9 = float64 of the matching 4 strided y elements
    55  	CVTSS2SD (Y_PTR)(INC_Y*1), X7
    56  	CVTSS2SD (Y_PTR)(INC_Y*2), X8
    57  	CVTSS2SD (Y_PTR)(INCx3_Y*1), X9
    58  
    59  	MULSD X6, X2 // X2..X5 = the four products x*y
    60  	MULSD X7, X3
    61  	MULSD X8, X4
    62  	MULSD X9, X5
    63  
    64  	ADDSD X2, SUM   // SUM   += products 0 and 2 of the group
    65  	ADDSD X3, P_SUM // P_SUM += products 1 and 3 of the group
    66  	ADDSD X4, SUM
    67  	ADDSD X5, P_SUM
    68  
    69  	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR += 4 * INC_X bytes (advance by 4 strided elements)
    70  	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR += 4 * INC_Y bytes (advance by 4 strided elements)
    71  
    72  	DECQ LEN
    73  	JNZ  dot_loop // } while --LEN > 0
    74  
    75  	ADDSD P_SUM, SUM // SUM += P_SUM  (merge the two accumulators)
    76  	CMPQ  TAIL, $0   // if TAIL == 0 { return }
    77  	JE    dot_end
    78  
    79  dot_tail: // do {  (only reached with TAIL > 0)
    80  	CVTSS2SD (X_PTR), X2  // X2 = float64(x[i])
    81  	CVTSS2SD (Y_PTR), X3  // X3 = float64(y[i])
    82  	MULSD    X3, X2       // X2 *= X3
    83  	ADDSD    X2, SUM      // SUM += X2
    84  	ADDQ     INC_X, X_PTR // X_PTR += INC_X
    85  	ADDQ     INC_Y, Y_PTR // Y_PTR += INC_Y
    86  	DECQ     TAIL
    87  	JNZ      dot_tail     // } while --TAIL > 0
    88  
    89  dot_end:
    90  	MOVSD SUM, sum+88(FP) // return SUM
    91  	RET