github.com/qiaogw/arrgo@v0.0.8/internal/matmul_amd64.s (about)

     1  // +build !noasm !appengine
     2  
     3  #define NOSPLIT 7 // TEXT flag word used below. NOTE(review): Go's textflag.h has NOPROF=1, DUPOK=2, NOSPLIT=4, so 7 sets all three bits - confirm that is intended.
     4  
     5  // func DotProd(a, b []float64) float64
     6  //
     7  // Returns the sum of a[i]*b[i] for i in [0, n), where n = min(len(a), len(b));
     8  // the result is 0 when n is 0. Elements are consumed two at a time with packed
     9  // SSE2 arithmetic and a trailing odd element is handled as a scalar. When
    10  // ·FmaSupt is 1 the multiply-add is a hand-encoded FMA3 instruction; when
    11  // ·Sse3Supt is 1 the final horizontal add is a hand-encoded HADDPD.
    12  //
    13  // Arguments and result occupy 56 bytes (two slice headers plus one float64);
    14  // no local frame is used.
    15  //
    16  // Register roles:
    17  //	R8 = a data pointer, R9 = b data pointer, DI = element index i
    18  //	SI = n, biased to n-2 for the packed loop and to n-1 for the tail test
    19  //	CX = len(b), used only to clamp n
    20  //	X0 = two-lane accumulator, X1 = scratch
    21  TEXT ·DotProd(SB), NOSPLIT, $0-56
    22  	MOVQ	a_base+0(FP), R8
    23  	MOVQ	a_len+8(FP), SI
    24  	MOVQ	b_base+24(FP), R9
    25  	MOVQ	b_len+32(FP), CX
    26  
    27  	// Clamp n to the shorter slice so neither slice is read past its end.
    28  	// Flags come from CX - SI; the move happens only when len(b) < len(a).
    29  	CMPQ	CX, SI
    30  	CMOVQLT	CX, SI
    31  
    32  	XORQ	DI, DI
    33  	PXOR	X0, X0
    34  
    35  	// n == 0: go straight to the reduction with a zero accumulator.
    36  	TESTQ	SI, SI
    37  	JZ	dotp_end
    38  
    39  	// SI = n-2. A negative result means n == 1, so only the scalar tail runs.
    40  	SUBQ	$2, SI
    41  	JL	dotp_tail
    42  
    43  	CMPB	·FmaSupt(SB), $1
    44  	JE	dotp_fma_loop
    45  
    46  dotp_loop:
    47  	// X0 += a[i:i+2] * b[i:i+2]; repeat while i <= n-2.
    48  	MOVOU	(R8)(DI*8), X1
    49  	MULPD	(R9)(DI*8), X1
    50  	ADDPD	X1, X0
    51  	ADDQ	$2, DI
    52  	CMPQ	DI, SI
    53  	JLE	dotp_loop
    54  dotp_tail:
    55  	// SI = n-1. i == n-1 exactly when one unpaired element is left.
    56  	ADDQ	$1, SI
    57  	CMPQ	DI, SI
    58  	JNE	dotp_end
    59  	MOVSD	(R8)(DI*8), X1
    60  	MULSD	(R9)(DI*8), X1
    61  	ADDSD	X1, X0
    62  	JMP	dotp_end
    63  
    64  dotp_fma_loop:
    65  	// Same loop as dotp_loop with a fused multiply-add.
    66  	MOVOU	(R8)(DI*8), X1
    67  	// VFMADD231PD (R9)(DI*8), X1, X0   (X0 += X1 * b[i:i+2])
    68  	BYTE	$0xC4; BYTE $0xC2; BYTE $0xF1; BYTE $0xB8; BYTE $0x04; BYTE $0xF9
    69  	ADDQ	$2, DI
    70  	CMPQ	DI, SI
    71  	JLE	dotp_fma_loop
    72  dotp_fma_tail:
    73  	// SI = n-1. i == n-1 exactly when one unpaired element is left.
    74  	ADDQ	$1, SI
    75  	CMPQ	DI, SI
    76  	JNE	dotp_end
    77  	MOVSD	(R8)(DI*8), X1
    78  	// VFMADD231SD (R9)(DI*8), X1, X0   (low lane: X0 += X1 * b[i])
    79  	BYTE	$0xC4; BYTE $0xC2; BYTE $0xF1; BYTE $0xB9; BYTE $0x04; BYTE $0xF9
    80  dotp_end:
    81  	// X0 = [lo, hi] partial sums; the result is lo + hi.
    82  	CMPB	·Sse3Supt(SB), $1
    83  	JE	dotp_sse3
    84  	// SSE2 only: X1 = [lo, hi], X0 = [hi, hi], then X0 = [lo+hi, hi+hi].
    85  	MOVAPD	X0, X1
    86  	UNPCKHPD X1, X0
    87  	ADDPD	X1, X0
    88  	MOVSD	X0, ret+48(FP)
    89  	RET
    90  dotp_sse3:
    91  	// HADDPD X0, X0 (both lanes become lo+hi). Byte-encoded because the
    92  	// mnemonic was only added to the assembler in Go 1.6.
    93  	BYTE $0x66; BYTE $0x0F; BYTE $0x7C; BYTE $0xC0
    94  	MOVSD	X0, ret+48(FP)
    95  	RET