// +build !noasm !appengine

// NOTE(review): space-separated build terms are OR'ed, so this file is only
// excluded when BOTH the noasm and appengine tags are set. If the intent was
// "either tag disables the assembly", the constraint would need to be
// "!noasm,!appengine" - confirm against the tags used by any pure-Go fallback.

// Legacy numeric flag word. In textflag.h terms 7 = NOPROF(1)|DUPOK(2)|NOSPLIT(4).
#define NOSPLIT 7

// func DotProd(a, b []float64) float64
//
// DotProd returns the sum of a[i]*b[i] for i in [0, len(a)).
//
// Only len(a) is read; len(b) is never loaded, so b must hold at least
// len(a) elements or the routine reads past the end of b.
// NOTE(review): presumably the Go caller validates the lengths - confirm.
//
// Products are accumulated in the two lanes of X0 (even indices in the low
// lane, odd indices in the high lane, a trailing odd element in the low lane)
// and the lanes are summed at the end.
//
// Package-level bytes consulted at run time:
//   ·FmaSupt  == 1  selects the FMA3 loop (only checked when len(a) >= 2)
//   ·Sse3Supt == 1  selects HADDPD for the final lane reduction
//
// Register use:
//   R8 = &a[0]   R9 = &b[0]   SI = length / loop bound   DI = element index
//   X0 = packed accumulator   X1, X2 = scratch
TEXT ·DotProd(SB), NOSPLIT, $0-56
	MOVQ a_base+0(FP), R8
	MOVQ a_len+8(FP), SI
	MOVQ b_base+24(FP), R9
	XORQ DI, DI
	PXOR X0, X0

	// Empty input: go straight to the reduction, which yields 0.
	CMPQ SI, $0
	JE   dotp_end

	// SI = len-2 is the last index at which a full pair still fits.
	// A single-element input skips the packed loops entirely.
	SUBQ $2, SI
	JL   dotp_tail

	CMPB ·FmaSupt(SB), $1
	JE   dotp_fma_loop

dotp_loop:
	// Legacy-SSE MULPD raises #GP on a memory operand that is not 16-byte
	// aligned, and float64 slice data is only 8-byte aligned (e.g. b[1:]),
	// so b is loaded with an unaligned move before multiplying.
	MOVOU (R8)(DI*8), X1
	MOVOU (R9)(DI*8), X2
	MULPD X2, X1
	ADDPD X1, X0
	ADDQ  $2, DI
	CMPQ  DI, SI
	JLE   dotp_loop

dotp_tail:
	// SI = len-1. DI equals it only when one element is left over.
	ADDQ  $1, SI
	CMPQ  DI, SI
	JNE   dotp_end
	MOVSD (R8)(DI*8), X1
	MULSD (R9)(DI*8), X1
	ADDSD X1, X0
	JMP   dotp_end

dotp_fma_loop:
	MOVOU (R8)(DI*8), X1
	// VFMADD231PD (R9)(DI*8), X1, X0   (X0 += X1 * mem; VEX form, no alignment requirement)
	BYTE $0xC4; BYTE $0xC2; BYTE $0xF1; BYTE $0xB8; BYTE $0x04; BYTE $0xF9
	ADDQ $2, DI
	CMPQ DI, SI
	JLE  dotp_fma_loop

dotp_fma_tail:
	// Same leftover-element test as dotp_tail, using the scalar FMA form.
	ADDQ  $1, SI
	CMPQ  DI, SI
	JNE   dotp_end
	MOVSD (R8)(DI*8), X1
	// VFMADD231SD (R9)(DI*8), X1, X0   (low lane of X0 += X1 * mem)
	BYTE $0xC4; BYTE $0xC2; BYTE $0xF1; BYTE $0xB9; BYTE $0x04; BYTE $0xF9

dotp_end:
	// Horizontal reduction: result = X0.lo + X0.hi.
	CMPB ·Sse3Supt(SB), $1
	JE   dotp_sse3

	// SSE2 fallback: X1 = {lo, hi}; X0 = {hi, hi}; X0.lo = hi + lo.
	MOVAPD   X0, X1
	UNPCKHPD X1, X0
	ADDPD    X1, X0
	MOVSD    X0, ret+48(FP)
	RET

dotp_sse3:
	// HADDPD X0, X0 (hand-encoded; the mnemonic was added to the assembler in Go 1.6)
	BYTE $0x66; BYTE $0x0F; BYTE $0x7C; BYTE $0xC0
	MOVSD X0, ret+48(FP)
	RET