github.com/jingcheng-WU/gonum@v0.9.1-0.20210323123734-f1a2a11a8f7b/internal/asm/f32/axpyunitary_amd64.s

// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// func AxpyUnitary(alpha float32, x, y []float32)
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
	MOVQ    x_base+8(FP), SI  // SI = &x
	MOVQ    y_base+32(FP), DI // DI = &y
	MOVQ    x_len+16(FP), BX  // BX = min( len(x), len(y) )
	CMPQ    y_len+40(FP), BX
	CMOVQLE y_len+40(FP), BX
	CMPQ    BX, $0            // if BX == 0 { return }
	JE      axpy_end
	MOVSS   alpha+0(FP), X0
	SHUFPS  $0, X0, X0        // X0 = { a, a, a, a }
	XORQ    AX, AX            // i = 0
	PXOR    X2, X2            // 2 NOP instructions (PXOR) to align
	PXOR    X3, X3            // loop to cache line
	MOVQ    DI, CX
	ANDQ    $0xF, CX          // Align on 16-byte boundary for ADDPS
	JZ      axpy_no_trim      // if CX == 0 { goto axpy_no_trim }

	XORQ $0xF, CX // CX = 4 - floor( CX % 16 / 4 )
	INCQ CX
	SHRQ $2, CX

axpy_align: // Trim first value(s) in unaligned buffer  do {
	MOVSS (SI)(AX*4), X2 // X2 = x[i]
	MULSS X0, X2         // X2 *= a
	ADDSS (DI)(AX*4), X2 // X2 += y[i]
	MOVSS X2, (DI)(AX*4) // y[i] = X2
	INCQ  AX             // i++
	DECQ  BX
	JZ    axpy_end       // if --BX == 0 { return }
	LOOP  axpy_align     // } while --CX > 0

axpy_no_trim:
	MOVUPS X0, X1           // Copy X0 to X1 for pipelining
	MOVQ   BX, CX
	ANDQ   $0xF, BX         // BX = len % 16
	SHRQ   $4, CX           // CX = int( len / 16 )
	JZ     axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }

axpy_loop: // Loop unrolled 16x  do {
	MOVUPS (SI)(AX*4), X2   // X2 = x[i:i+4]
	MOVUPS 16(SI)(AX*4), X3
	MOVUPS 32(SI)(AX*4), X4
	MOVUPS 48(SI)(AX*4), X5
	MULPS  X0, X2           // X2 *= a
	MULPS  X1, X3
	MULPS  X0, X4
	MULPS  X1, X5
	ADDPS  (DI)(AX*4), X2   // X2 += y[i:i+4]
	ADDPS  16(DI)(AX*4), X3
	ADDPS  32(DI)(AX*4), X4
	ADDPS  48(DI)(AX*4), X5
	MOVUPS X2, (DI)(AX*4)   // y[i:i+4] = X2
	MOVUPS X3, 16(DI)(AX*4)
	MOVUPS X4, 32(DI)(AX*4)
	MOVUPS X5, 48(DI)(AX*4)
	ADDQ   $16, AX          // i += 16
	LOOP   axpy_loop        // } while --CX > 0
	CMPQ   BX, $0           // if BX == 0 { return }
	JE     axpy_end

axpy_tail4_start: // Reset loop counter for 4-wide tail loop
	MOVQ BX, CX          // CX = floor( BX / 4 )
	SHRQ $2, CX
	JZ   axpy_tail_start // if CX == 0 { goto axpy_tail_start }

axpy_tail4: // Loop unrolled 4x  do {
	MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
	MULPS  X0, X2         // X2 *= a
	ADDPS  (DI)(AX*4), X2 // X2 += y[i:i+4]
	MOVUPS X2, (DI)(AX*4) // y[i:i+4] = X2
	ADDQ   $4, AX         // i += 4
	LOOP   axpy_tail4     // } while --CX > 0

axpy_tail_start: // Reset loop counter for 1-wide tail loop
	MOVQ BX, CX   // CX = BX % 4
	ANDQ $3, CX
	JZ   axpy_end // if CX == 0 { return }

axpy_tail:
	MOVSS (SI)(AX*4), X1 // X1 = x[i]
	MULSS X0, X1         // X1 *= a
	ADDSS (DI)(AX*4), X1 // X1 += y[i]
	MOVSS X1, (DI)(AX*4) // y[i] = X1
	INCQ  AX             // i++
	LOOP  axpy_tail      // } while --CX > 0

axpy_end:
	RET
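
What the routine computes: the BLAS-style AXPY update y[i] += alpha * x[i] over the first min(len(x), len(y)) elements, in place on y. The assembly first peels single elements until &y[i] is 16-byte aligned, then processes 16 floats per iteration, and finishes with a 4-wide and a 1-wide tail. The sketch below is a minimal pure-Go equivalent for reference only, assuming nothing beyond the behaviour visible in the assembly; the name axpyUnitaryRef is made up here and is not the package's actual noasm fallback.

package main

import "fmt"

// axpyUnitaryRef mirrors the assembly above in plain Go:
// y[i] += alpha * x[i] for i < min(len(x), len(y)).
// Illustrative only; not an exported gonum API.
func axpyUnitaryRef(alpha float32, x, y []float32) {
	n := len(x)
	if len(y) < n {
		n = len(y)
	}
	for i := 0; i < n; i++ {
		y[i] += alpha * x[i]
	}
}

func main() {
	x := []float32{1, 2, 3, 4}
	y := []float32{10, 20, 30, 40}
	axpyUnitaryRef(2, x, y)
	fmt.Println(y) // [12 24 36 48]
}

Only y needs the alignment trim: the ADDPS instructions take their memory operand from y, which must be 16-byte aligned, while the loads from x use MOVUPS and so tolerate unaligned addresses.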