// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// HADDPD X0, X0 emitted as raw opcode bytes (66 0F 7C C0). The operands are
// baked into the encoding, so this macro is only correct while SUM is X0.
#define HADDPD_SUM_SUM    LONG $0xC07C0F66 // @ HADDPD X0, X0

// Register roles:
//   X_PTR  address of x[0]
//   Y_PTR  address of y[0]
//   LEN    elements still to process; reused as an iteration counter by the
//          unrolled loop and by the pair-wise tail loop
//   TAIL   elements left over after the 8x unrolled loop (LEN % 8)
//   IDX    index of the next element to read from x and y
//   SUM    primary accumulator, two float64 lanes
//   P_SUM  secondary accumulator, two float64 lanes
// DX and X2-X9 are used as scratch.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define IDX AX
#define SUM X0
#define P_SUM X1

// func DdotUnitary(x, y []float32) (sum float64)
//
// DdotUnitary computes the dot product of the first min(len(x), len(y))
// elements of x and y. Each float32 element is converted to float64 before
// it is multiplied, and the products are accumulated in float64.
//
// The result is written to sum+48(FP) with MOVSD, i.e. as an 8-byte double,
// so the matching Go declaration has to return float64.
//
// Arguments are read from the stack: x's base and len at 0(FP) and 8(FP),
// y's base and len at 24(FP) and 32(FP).
TEXT ·DdotUnitary(SB), NOSPLIT, $0
	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x[0]
	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y[0]
	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
	CMPQ    y_len+32(FP), LEN
	CMOVQLE y_len+32(FP), LEN
	PXOR    SUM, SUM             // SUM = 0
	TESTQ   LEN, LEN
	JZ      dot_end              // if LEN == 0 { return 0 }

	XORQ IDX, IDX    // IDX = 0
	MOVQ Y_PTR, DX
	ANDQ $0xF, DX    // DX = Y_PTR % 16
	JZ   dot_no_trim // if DX == 0 { goto dot_no_trim }

	// DX becomes the negated byte distance from Y_PTR to the next 16-byte
	// boundary; it is stepped by 4 per element and reaches zero once
	// &y[IDX] is 16-byte aligned.
	// NOTE(review): the packed loads below are 8-byte CVTPS2PD memory
	// operands, so this alignment looks like a performance choice rather
	// than a correctness requirement - confirm before changing it. The
	// trimmed elements are summed into lane 0 of SUM, so removing this
	// loop would also change the order of the float64 additions.
	SUBQ $16, DX

dot_align: // Scalar pre-loop over the leading unaligned elements  do {
	CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
	CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
	MULSD    X3, X2             // X2 *= X3
	ADDSD    X2, SUM            // SUM[0] += X2
	INCQ     IDX                // IDX++
	DECQ     LEN
	JZ       dot_end            // if --LEN == 0 { return }
	ADDQ     $4, DX             // DX += 4 (one float32)
	JNZ      dot_align          // } while DX != 0

dot_no_trim:
	PXOR P_SUM, P_SUM   // P_SUM = 0; second accumulator for pipelining
	MOVQ LEN, TAIL
	ANDQ $0x7, TAIL     // TAIL = LEN % 8
	SHRQ $3, LEN        // LEN = floor( LEN / 8 )
	JZ   dot_tail_start // if LEN == 0 { goto dot_tail_start }

dot_loop: // Loop unrolled 8x  do {
	CVTPS2PD (X_PTR)(IDX*4), X2   // X2 = float64( x[i:i+2] )
	CVTPS2PD 8(X_PTR)(IDX*4), X3  // X3 = float64( x[i+2:i+4] )
	CVTPS2PD 16(X_PTR)(IDX*4), X4 // X4 = float64( x[i+4:i+6] )
	CVTPS2PD 24(X_PTR)(IDX*4), X5 // X5 = float64( x[i+6:i+8] )

	CVTPS2PD (Y_PTR)(IDX*4), X6   // X6 = float64( y[i:i+2] )
	CVTPS2PD 8(Y_PTR)(IDX*4), X7  // X7 = float64( y[i+2:i+4] )
	CVTPS2PD 16(Y_PTR)(IDX*4), X8 // X8 = float64( y[i+4:i+6] )
	CVTPS2PD 24(Y_PTR)(IDX*4), X9 // X9 = float64( y[i+6:i+8] )

	MULPD X6, X2 // X_i *= X_j
	MULPD X7, X3
	MULPD X8, X4
	MULPD X9, X5

	ADDPD X2, SUM   // SUM += X2
	ADDPD X3, P_SUM // P_SUM += X3
	ADDPD X4, SUM   // SUM += X4
	ADDPD X5, P_SUM // P_SUM += X5

	ADDQ $8, IDX  // IDX += 8
	DECQ LEN
	JNZ  dot_loop // } while --LEN > 0

	ADDPD P_SUM, SUM // SUM += P_SUM
	TESTQ TAIL, TAIL // if TAIL == 0 { return }
	JZ    dot_end

dot_tail_start: // 1 <= TAIL <= 7 on every path that reaches this label
	MOVQ TAIL, LEN
	SHRQ $1, LEN      // LEN = floor( TAIL / 2 )
	JZ   dot_tail_one // if LEN == 0 { goto dot_tail_one }  (TAIL == 1)

dot_tail_two: // Two elements per iteration  do {
	CVTPS2PD (X_PTR)(IDX*4), X2 // X2 = float64( x[i:i+2] )
	CVTPS2PD (Y_PTR)(IDX*4), X6 // X6 = float64( y[i:i+2] )
	MULPD    X6, X2             // X2 *= X6
	ADDPD    X2, SUM            // SUM += X2
	ADDQ     $2, IDX            // IDX += 2
	DECQ     LEN
	JNZ      dot_tail_two       // } while --LEN > 0

	ANDQ $1, TAIL
	JZ   dot_end // if TAIL % 2 == 0 { return }

dot_tail_one: // One trailing element
	CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
	CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
	MULSD    X3, X2             // X2 *= X3
	ADDSD    X2, SUM            // SUM[0] += X2

dot_end:
	HADDPD_SUM_SUM        // SUM[0] = SUM[0] + SUM[1]
	MOVSD SUM, sum+48(FP) // return SUM[0] (8-byte float64 store)
	RET