github.com/gopherd/gonum@v0.0.4/internal/asm/f32/axpyunitaryto_amd64.s

// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
	MOVQ    dst_base+0(FP), DI // DI = &dst
	MOVQ    x_base+32(FP), SI  // SI = &x
	MOVQ    y_base+56(FP), DX  // DX = &y
	MOVQ    x_len+40(FP), BX   // BX = min( len(x), len(y), len(dst) )
	CMPQ    y_len+64(FP), BX
	CMOVQLE y_len+64(FP), BX
	CMPQ    dst_len+8(FP), BX
	CMOVQLE dst_len+8(FP), BX
	CMPQ    BX, $0             // if BX == 0 { return }
	JE      axpy_end
	MOVSS   alpha+24(FP), X0
	SHUFPS  $0, X0, X0         // X0 = { a, a, a, a }
	XORQ    AX, AX             // i = 0
	MOVQ    DX, CX
	ANDQ    $0xF, CX           // Align y on a 16-byte boundary for ADDPS
	JZ      axpy_no_trim       // if CX == 0 { goto axpy_no_trim }

	XORQ $0xF, CX // CX = 4 - floor( CX / 4 ), values to trim before y is aligned
	INCQ CX
	SHRQ $2, CX

axpy_align: // Trim first value(s) in unaligned buffer  do {
	MOVSS (SI)(AX*4), X2 // X2 = x[i]
	MULSS X0, X2         // X2 *= a
	ADDSS (DX)(AX*4), X2 // X2 += y[i]
	MOVSS X2, (DI)(AX*4) // dst[i] = X2
	INCQ  AX             // i++
	DECQ  BX
	JZ    axpy_end       // if --BX == 0 { return }
	LOOP  axpy_align     // } while --CX > 0

axpy_no_trim:
	MOVUPS X0, X1           // Copy X0 to X1 for pipelining
	MOVQ   BX, CX
	ANDQ   $0xF, BX         // BX = len % 16
	SHRQ   $4, CX           // CX = floor( len / 16 )
	JZ     axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }

axpy_loop: // Loop unrolled 16x  do {
	MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
	MOVUPS 16(SI)(AX*4), X3
	MOVUPS 32(SI)(AX*4), X4
	MOVUPS 48(SI)(AX*4), X5
	MULPS  X0, X2 // X2 *= a
	MULPS  X1, X3
	MULPS  X0, X4
	MULPS  X1, X5
	ADDPS  (DX)(AX*4), X2 // X2 += y[i:i+4]
	ADDPS  16(DX)(AX*4), X3
	ADDPS  32(DX)(AX*4), X4
	ADDPS  48(DX)(AX*4), X5
	MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2
	MOVUPS X3, 16(DI)(AX*4)
	MOVUPS X4, 32(DI)(AX*4)
	MOVUPS X5, 48(DI)(AX*4)
	ADDQ   $16, AX        // i += 16
	LOOP   axpy_loop      // } while --CX > 0
	CMPQ   BX, $0         // if BX == 0 { return }
	JE     axpy_end

axpy_tail4_start: // Reset loop counter for 4-wide tail loop
	MOVQ BX, CX          // CX = floor( BX / 4 )
	SHRQ $2, CX
	JZ   axpy_tail_start // if CX == 0 { goto axpy_tail_start }

axpy_tail4: // Loop unrolled 4x  do {
	MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
	MULPS  X0, X2         // X2 *= a
	ADDPS  (DX)(AX*4), X2 // X2 += y[i:i+4]
	MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2
	ADDQ   $4, AX         // i += 4
	LOOP   axpy_tail4     // } while --CX > 0

axpy_tail_start: // Reset loop counter for 1-wide tail loop
	MOVQ BX, CX   // CX = BX % 4
	ANDQ $3, CX
	JZ   axpy_end // if CX == 0 { return }

axpy_tail: // do {
	MOVSS (SI)(AX*4), X1 // X1 = x[i]
	MULSS X0, X1         // X1 *= a
	ADDSS (DX)(AX*4), X1 // X1 += y[i]
	MOVSS X1, (DI)(AX*4) // dst[i] = X1
	INCQ  AX             // i++
	LOOP  axpy_tail      // } while --CX > 0

axpy_end:
	RET
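
The kernel computes dst[i] = alpha*x[i] + y[i] over the first min(len(x), len(y), len(dst)) elements, peeling scalar iterations until y reaches a 16-byte boundary and then draining 16-wide, 4-wide, and 1-wide loops. Below is a minimal pure-Go sketch of the same semantics; axpyUnitaryToRef and the demo main are illustrative names only, not the package's actual Go fallback (that ships separately for the noasm/safe build tags):

package main

import "fmt"

// axpyUnitaryToRef is a hypothetical scalar reference for the kernel above.
// Like the assembly, it clamps the iteration count to the shortest of the
// three slices (the min computed into BX) rather than assuming equal lengths.
func axpyUnitaryToRef(dst []float32, alpha float32, x, y []float32) {
	n := len(x)
	if len(y) < n {
		n = len(y)
	}
	if len(dst) < n {
		n = len(dst)
	}
	for i := 0; i < n; i++ {
		dst[i] = alpha*x[i] + y[i]
	}
}

func main() {
	x := []float32{1, 2, 3, 4}
	y := []float32{10, 20, 30, 40}
	dst := make([]float32, 4)
	axpyUnitaryToRef(dst, 2, x, y)
	fmt.Println(dst) // [12 24 36 48]
}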