gonum.org/v1/gonum@v0.14.0/internal/asm/c64/axpyincto_amd64.s

// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// MOVSHDUP X3, X2
#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
// MOVSLDUP X3, X3
#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
// ADDSUBPS X2, X3
#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA

// MOVSHDUP X5, X4
#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
// MOVSLDUP X5, X5
#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
// ADDSUBPS X4, X5
#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC

// MOVSHDUP X7, X6
#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
// MOVSLDUP X7, X7
#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
// ADDSUBPS X6, X7
#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE

// MOVSHDUP X9, X8
#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
// MOVSLDUP X9, X9
#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
// ADDSUBPS X8, X9
#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8

// func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
	MOVQ   dst_base+0(FP), DI // DI = &dst
	MOVQ   x_base+48(FP), SI  // SI = &x
	MOVQ   y_base+72(FP), DX  // DX = &y
	MOVQ   n+96(FP), CX       // CX = n
	CMPQ   CX, $0             // if n==0 { return }
	JE     axpyi_end
	MOVQ   ix+120(FP), R8     // Load the first index
	MOVQ   iy+128(FP), R9
	MOVQ   idst+32(FP), R10
	LEAQ   (SI)(R8*8), SI     // SI = &(x[ix])
	LEAQ   (DX)(R9*8), DX     // DX = &(y[iy])
	LEAQ   (DI)(R10*8), DI    // DI = &(dst[idst])
	MOVQ   incX+104(FP), R8   // Increments*8 for easy iteration (ADDQ)
	SHLQ   $3, R8
	MOVQ   incY+112(FP), R9
	SHLQ   $3, R9
	MOVQ   incDst+24(FP), R10
	SHLQ   $3, R10
	MOVSD  alpha+40(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
	MOVAPS X0, X1
	SHUFPS $0x11, X1, X1      // X1 = { 0, 0, real(a), imag(a) }
	MOVAPS X0, X10            // Copy X0 and X1 for pipelining
	MOVAPS X1, X11
	MOVQ   CX, BX
	ANDQ   $3, CX             // CX = n % 4
	SHRQ   $2, BX             // BX = floor( n / 4 )
	JZ     axpyi_tail         // if BX == 0 { goto axpyi_tail }

axpyi_loop: // do {
	MOVSD (SI), X3        // X_i = { imag(x[i]), real(x[i]) }
	MOVSD (SI)(R8*1), X5
	LEAQ  (SI)(R8*2), SI  // SI = &(SI[incX*2])
	MOVSD (SI), X7
	MOVSD (SI)(R8*1), X9

	// X_(i-1) = { imag(x[i]), imag(x[i]) }
	MOVSHDUP_X3_X2
	MOVSHDUP_X5_X4
	MOVSHDUP_X7_X6
	MOVSHDUP_X9_X8

	// X_i = { real(x[i]), real(x[i]) }
	MOVSLDUP_X3_X3
	MOVSLDUP_X5_X5
	MOVSLDUP_X7_X7
	MOVSLDUP_X9_X9

	// X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	// X_i     = { imag(a) * real(x[i]), real(a) * real(x[i]) }
	MULPS X1, X2
	MULPS X0, X3
	MULPS X11, X4
	MULPS X10, X5
	MULPS X1, X6
	MULPS X0, X7
	MULPS X11, X8
	MULPS X10, X9

	// X_i = {
	//	imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]),
	// }
	ADDSUBPS_X2_X3
	ADDSUBPS_X4_X5
	ADDSUBPS_X6_X7
	ADDSUBPS_X8_X9

	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
	MOVSD (DX), X2
	MOVSD (DX)(R9*1), X4
	LEAQ  (DX)(R9*2), DX  // DX = &(DX[incY*2])
	MOVSD (DX), X6
	MOVSD (DX)(R9*1), X8
	ADDPS X2, X3
	ADDPS X4, X5
	ADDPS X6, X7
	ADDPS X8, X9

	MOVSD X3, (DI)         // dst[i] = X_i
	MOVSD X5, (DI)(R10*1)
	LEAQ  (DI)(R10*2), DI  // DI = &(DI[incDst*2])
	MOVSD X7, (DI)
	MOVSD X9, (DI)(R10*1)
	LEAQ  (SI)(R8*2), SI   // SI = &(SI[incX*2])
	LEAQ  (DX)(R9*2), DX   // DX = &(DX[incY*2])
	LEAQ  (DI)(R10*2), DI  // DI = &(DI[incDst*2])
	DECQ  BX
	JNZ   axpyi_loop       // } while --BX > 0
	CMPQ  CX, $0           // if CX == 0 { return }
	JE    axpyi_end

axpyi_tail: // do {
	MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) }
	MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
	MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }

	// X_i     = { imag(a) * real(x[i]), real(a) * real(x[i]) }
	// X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	MULPS X1, X2
	MULPS X0, X3

	// X_i = {
	//	imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]),
	// }
	ADDSUBPS_X2_X3

	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
	MOVSD (DX), X4
	ADDPS X4, X3
	MOVSD X3, (DI)   // dst[i] = X_i
	ADDQ  R8, SI     // SI += incX
	ADDQ  R9, DX     // DX += incY
	ADDQ  R10, DI    // DI += incDst
	LOOP  axpyi_tail // } while --CX > 0

axpyi_end:
	RET
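
// What follows is an editorial reference sketch, not part of the gonum source:
// a rough Go-equivalent of the computation performed by the kernel above,
// kept inside comments so this file remains pure assembly. The function name
// axpyIncToRef is hypothetical; the signature mirrors AxpyIncTo. The assembly
// unrolls this loop by four and uses MOVSHDUP/MOVSLDUP/ADDSUBPS to form the
// complex product, but the per-element result is the same.
//
//	func axpyIncToRef(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) {
//		for i := uintptr(0); i < n; i++ {
//			// dst[idst] = alpha*x[ix] + y[iy], with strided indices.
//			dst[idst] = alpha*x[ix] + y[iy]
//			ix += incX
//			iy += incY
//			idst += incDst
//		}
//	}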