// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// func AsmAxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
//
// Strided single-precision AXPY. For k = 0 .. n-1 it performs
//
//	y[iy + k*incY] = alpha*x[ix + k*incX] + y[iy + k*incY]
//
// using a scalar multiply (MULSS) followed by a scalar add (ADDSS) for each
// element. The body is unrolled four elements per iteration, followed by a
// one-element-at-a-time tail for the remaining n % 4 elements.
//
// No bounds checks are performed and the slice lengths are never read: the
// caller must guarantee that every addressed element lies inside x and y.
// n is tested with a signed comparison, so the routine returns immediately
// when n == 0 or when the top bit of n is set.
//
// Argument frame (Go stack ABI, offsets from FP):
//	alpha+0   x_base+8   y_base+32   n+56   incX+64   incY+72   ix+80   iy+88
//
// Register roles:
//	SI     read pointer into x
//	DX     read pointer into y  (main loop only; always equal to DI at the
//	       top of each main-loop iteration)
//	DI     write pointer into y (also the read pointer in the tail)
//	R8     ix during setup, then incX in bytes
//	R9     iy during setup, then incY in bytes
//	CX     loop counter (n/4 in the main loop, n%4 in the tail)
//	BX     n % 4
//	X0,X1  alpha (two copies, used on alternating lanes of the unrolled body)
//	X2-X5  per-element temporaries
TEXT ·AsmAxpyInc(SB), NOSPLIT, $0
	MOVQ  n+56(FP), CX      // CX = n
	TESTQ CX, CX            // if int(n) <= 0 { return }
	JLE   axpyi_end
	MOVQ  x_base+8(FP), SI  // SI = &x
	MOVQ  y_base+32(FP), DI // DI = &y
	MOVQ  ix+80(FP), R8     // R8 = ix
	MOVQ  iy+88(FP), R9     // R9 = iy
	LEAQ  (SI)(R8*4), SI    // SI = &(x[ix])
	LEAQ  (DI)(R9*4), DI    // DI = &(y[iy])
	MOVQ  DI, DX            // DX = DI    Read pointer for y
	MOVQ  incX+64(FP), R8   // R8 = incX
	SHLQ  $2, R8            // R8 *= sizeof(float32)
	MOVQ  incY+72(FP), R9   // R9 = incY
	SHLQ  $2, R9            // R9 *= sizeof(float32)
	MOVSS alpha+0(FP), X0   // X0 = alpha
	MOVSS X0, X1            // X1 = X0    Second copy for pipelining
	MOVQ  CX, BX
	ANDQ  $3, BX            // BX = n % 4
	SHRQ  $2, CX            // CX = floor( n / 4 )
	JZ    axpyi_tail_start  // if CX == 0 { goto axpyi_tail_start }; n > 0 here, so BX != 0

axpyi_loop: // Loop unrolled 4x   do {
	MOVSS (SI), X2          // X_i = x[i]
	MOVSS (SI)(R8*1), X3
	LEAQ  (SI)(R8*2), SI    // SI = &(SI[incX*2])
	MOVSS (SI), X4
	MOVSS (SI)(R8*1), X5
	MULSS X1, X2            // X_i *= alpha
	MULSS X0, X3
	MULSS X1, X4
	MULSS X0, X5
	ADDSS (DX), X2          // X_i += y[i]
	ADDSS (DX)(R9*1), X3
	LEAQ  (DX)(R9*2), DX    // DX = &(DX[incY*2])
	ADDSS (DX), X4
	ADDSS (DX)(R9*1), X5
	MOVSS X2, (DI)          // y[i] = X_i
	MOVSS X3, (DI)(R9*1)
	LEAQ  (DI)(R9*2), DI    // DI = &(DI[incY*2])
	MOVSS X4, (DI)
	MOVSS X5, (DI)(R9*1)
	LEAQ  (SI)(R8*2), SI    // SI = &(SI[incX*2])    Advance past the 4 elements just processed
	LEAQ  (DX)(R9*2), DX    // DX = &(DX[incY*2])
	LEAQ  (DI)(R9*2), DI    // DI = &(DI[incY*2])
	DECQ  CX                // DEC/JNZ instead of the slow, microcoded LOOP instruction
	JNZ   axpyi_loop        // } while --CX != 0
	TESTQ BX, BX            // if BX == 0 { return }
	JZ    axpyi_end

axpyi_tail_start: // Reset loop registers
	MOVQ BX, CX             // Loop counter: CX = BX  (1..3 remaining elements)

axpyi_tail: // do {
	MOVSS (SI), X2          // X2 = x[i]
	MULSS X1, X2            // X2 *= alpha
	ADDSS (DI), X2          // X2 += y[i]
	MOVSS X2, (DI)          // y[i] = X2
	ADDQ  R8, SI            // SI = &(SI[incX])
	ADDQ  R9, DI            // DI = &(DI[incY])
	DECQ  CX
	JNZ   axpyi_tail        // } while --CX != 0

axpyi_end:
	RET