// Origin: gonum.org/v1/gonum@v0.14.0/internal/asm/f32/axpyincto_amd64.s

// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
//
// AxpyIncTo stores, for i = 0 .. n-1:
//
//	dst[idst + i*incDst] = alpha*x[ix + i*incX] + y[iy + i*incY]
//
// Each element is a scalar single-precision multiply (MULSS) followed by a
// scalar add (ADDSS). The slice length fields are never read, so no bounds
// checking happens in this routine.
//
// n is tested with a signed comparison: the routine returns immediately
// when n, viewed as int64, is <= 0.
//
// Register roles:
//	SI, DX, DI    current element address in x, y, dst
//	R8, R9, R10   incX, incY, incDst scaled to bytes (*4)
//	X0, X1        alpha (two copies, alternated in the unrolled loop)
//	X2..X5        per-element temporaries
//	CX            loop counter (n/4 for the unrolled loop, n%4 for the tail)
//	BX            n % 4
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
	MOVQ  n+96(FP), CX       // CX = n
	CMPQ  CX, $0             // if int64(n) <= 0 { return }
	JLE   axpyi_end
	MOVQ  dst_base+0(FP), DI // DI = &dst
	MOVQ  x_base+48(FP), SI  // SI = &x
	MOVQ  y_base+72(FP), DX  // DX = &y
	MOVQ  ix+120(FP), R8     // R8 = ix    // Load the first index
	MOVQ  iy+128(FP), R9     // R9 = iy
	MOVQ  idst+32(FP), R10   // R10 = idst
	LEAQ  (SI)(R8*4), SI     // SI = &(x[ix])
	LEAQ  (DX)(R9*4), DX     // DX = &(y[iy])
	LEAQ  (DI)(R10*4), DI    // DI = &(dst[idst])
	MOVQ  incX+104(FP), R8   // R8 = incX
	SHLQ  $2, R8             // R8 *= sizeof(float32)
	MOVQ  incY+112(FP), R9   // R9 = incY
	SHLQ  $2, R9             // R9 *= sizeof(float32)
	MOVQ  incDst+24(FP), R10 // R10 = incDst
	SHLQ  $2, R10            // R10 *= sizeof(float32)
	MOVSS alpha+40(FP), X0   // X0 = alpha
	MOVSS X0, X1             // X1 = alpha (second copy, alternated with X0 below)
	MOVQ  CX, BX
	ANDQ  $3, BX             // BX = n % 4
	SHRQ  $2, CX             // CX = floor( n / 4 ); sets ZF when the result is 0
	JZ    axpyi_tail_start   // if CX == 0 { goto axpyi_tail_start }

axpyi_loop: // Loop unrolled 4x   do {
	MOVSS (SI), X2           // X_i = x[i]
	MOVSS (SI)(R8*1), X3
	LEAQ  (SI)(R8*2), SI     // SI = &(SI[incX*2])
	MOVSS (SI), X4
	MOVSS (SI)(R8*1), X5
	MULSS X1, X2             // X_i *= a
	MULSS X0, X3
	MULSS X1, X4
	MULSS X0, X5
	ADDSS (DX), X2           // X_i += y[i]
	ADDSS (DX)(R9*1), X3
	LEAQ  (DX)(R9*2), DX     // DX = &(DX[incY*2])
	ADDSS (DX), X4
	ADDSS (DX)(R9*1), X5
	MOVSS X2, (DI)           // dst[i] = X_i
	MOVSS X3, (DI)(R10*1)
	LEAQ  (DI)(R10*2), DI    // DI = &(DI[incDst*2])
	MOVSS X4, (DI)
	MOVSS X5, (DI)(R10*1)
	LEAQ  (SI)(R8*2), SI     // SI = &(SI[incX*2])    // Increment addresses
	LEAQ  (DX)(R9*2), DX     // DX = &(DX[incY*2])
	LEAQ  (DI)(R10*2), DI    // DI = &(DI[incDst*2])
	DECQ  CX                 // DECQ/JNZ instead of the legacy LOOP instruction;
	JNZ   axpyi_loop         // } while --CX > 0
	CMPQ  BX, $0             // if BX == 0 { return }
	JE    axpyi_end

axpyi_tail_start: // Reset loop registers
	MOVQ  BX, CX             // Loop counter: CX = BX (1..3 here: n > 0 and n/4 == 0, or BX != 0)

axpyi_tail: // do {
	MOVSS (SI), X2           // X2 = x[i]
	MULSS X1, X2             // X2 *= a
	ADDSS (DX), X2           // X2 += y[i]
	MOVSS X2, (DI)           // dst[i] = X2
	ADDQ  R8, SI             // SI = &(SI[incX])
	ADDQ  R9, DX             // DX = &(DX[incY])
	ADDQ  R10, DI            // DI = &(DI[incDst])
	DECQ  CX
	JNZ   axpyi_tail         // } while --CX > 0

axpyi_end:
	RET