// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// HADDPS X0, X0 emitted as raw opcode bytes (F2 0F 7C C0, little-endian LONG).
#define HADDPS_SUM_SUM    LONG $0xC07C0FF2 // @ HADDPS X0, X0

// Register roles used by DotUnitary:
//   X_PTR  base address of x
//   Y_PTR  base address of y
//   LEN    elements remaining, later reused as a block counter
//   TAIL   elements left over after the 16-wide loop
//   IDX    current element index (scaled by 4 in addressing)
//   SUM    primary accumulator (also holds the result)
//   P_SUM  secondary accumulator used inside the 16-wide loop
// DX and X2-X5 are used as scratch.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define IDX AX
#define SUM X0
#define P_SUM X1

// func DotUnitary(x, y []float32) (sum float32)
//
// DotUnitary returns the sum of x[i]*y[i] for i in [0, min(len(x), len(y))).
// It returns 0 when that length is 0.
//
// Stack layout read/written here: x_base+0, x_len+8, y_base+24, y_len+32,
// sum+48. The routine is NOSPLIT with a zero-size frame and makes no calls.
//
// Structure:
//   1. scalar steps until the address of y[IDX] is 16-byte aligned
//      (MULPS reads y directly as a memory operand; x is loaded with MOVUPS),
//   2. blocks of 16 elements using two accumulators (SUM and P_SUM),
//   3. blocks of 4 elements,
//   4. remaining 1-3 elements one at a time,
//   5. horizontal add of the four lanes of SUM.
// Products are accumulated per lane and combined at the end, so the rounding
// of the result can differ from a strictly left-to-right scalar summation.
TEXT ·DotUnitary(SB), NOSPLIT, $0
	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x[0]
	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y[0]
	PXOR    SUM, SUM             // SUM = 0
	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
	CMPQ    y_len+32(FP), LEN
	CMOVQLE y_len+32(FP), LEN
	TESTQ   LEN, LEN
	JE      dot_end              // if LEN == 0 { return 0 }

	XORQ IDX, IDX    // IDX = 0
	MOVQ Y_PTR, DX
	ANDQ $0xF, DX    // DX = Y_PTR % 16; align on 16-byte boundary for MULPS
	JZ   dot_no_trim // if DX == 0 { goto dot_no_trim }
	SUBQ $16, DX     // DX = -(bytes from Y_PTR up to the next 16-byte boundary)

dot_align: // Trim first value(s) in unaligned buffer  do {
	MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
	MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
	ADDSS X2, SUM            // SUM += X2
	INCQ  IDX                // IDX++
	DECQ  LEN
	JZ    dot_end            // if --LEN == 0 { return }
	ADDQ  $4, DX             // one float32 (4 bytes) closer to the boundary
	JNZ   dot_align          // } while (DX += 4) != 0

dot_no_trim:
	PXOR P_SUM, P_SUM    // P_SUM = 0  for pipelining
	MOVQ LEN, TAIL
	ANDQ $0xF, TAIL      // TAIL = LEN % 16
	SHRQ $4, LEN         // LEN = floor( LEN / 16 )
	JZ   dot_tail4_start // if LEN == 0 { goto dot_tail4_start }

dot_loop: // Loop unrolled 16x  do {
	MOVUPS (X_PTR)(IDX*4), X2   // X2..X5 = x[i:i+16], four floats per register
	MOVUPS 16(X_PTR)(IDX*4), X3
	MOVUPS 32(X_PTR)(IDX*4), X4
	MOVUPS 48(X_PTR)(IDX*4), X5

	MULPS (Y_PTR)(IDX*4), X2   // X2..X5 *= y[i:i+16]
	MULPS 16(Y_PTR)(IDX*4), X3
	MULPS 32(Y_PTR)(IDX*4), X4
	MULPS 48(Y_PTR)(IDX*4), X5

	ADDPS X2, SUM   // alternate between the two accumulators
	ADDPS X3, P_SUM
	ADDPS X4, SUM
	ADDPS X5, P_SUM

	ADDQ $16, IDX // IDX += 16
	DECQ LEN
	JNZ  dot_loop // } while --LEN != 0

	ADDPS P_SUM, SUM // SUM += P_SUM
	TESTQ TAIL, TAIL // if TAIL == 0 { return }
	JE    dot_end

dot_tail4_start: // Reset loop counter for 4-wide tail loop
	MOVQ TAIL, LEN      // LEN = floor( TAIL / 4 )
	SHRQ $2, LEN
	JZ   dot_tail_start // if LEN == 0 { goto dot_tail_start }

dot_tail4_loop: // Loop unrolled 4x  do {
	MOVUPS (X_PTR)(IDX*4), X2 // X2 = x[i:i+4]
	MULPS  (Y_PTR)(IDX*4), X2 // X2 *= y[i:i+4]
	ADDPS  X2, SUM            // SUM += X2
	ADDQ   $4, IDX            // IDX += 4
	DECQ   LEN
	JNZ    dot_tail4_loop     // } while --LEN != 0

dot_tail_start: // Reset loop counter for 1-wide tail loop
	ANDQ $3, TAIL // TAIL = TAIL % 4
	JZ   dot_end  // if TAIL == 0 { return }

dot_tail: // do {
	MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
	MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
	ADDSS X2, SUM            // SUM += X2
	INCQ  IDX                // IDX++
	DECQ  TAIL
	JNZ   dot_tail           // } while --TAIL != 0

dot_end:
	HADDPS_SUM_SUM        // two horizontal adds fold all four lanes:
	HADDPS_SUM_SUM        // SUM[0] = SUM[0] + SUM[1] + SUM[2] + SUM[3]
	MOVSS SUM, sum+48(FP) // return SUM[0]
	RET