gonum.org/v1/gonum@v0.14.0/internal/asm/c64/axpyinc_amd64.s

// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// MOVSHDUP X3, X2
#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
// MOVSLDUP X3, X3
#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
// ADDSUBPS X2, X3
#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA

// MOVSHDUP X5, X4
#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
// MOVSLDUP X5, X5
#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
// ADDSUBPS X4, X5
#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC

// MOVSHDUP X7, X6
#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
// MOVSLDUP X7, X7
#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
// ADDSUBPS X6, X7
#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE

// MOVSHDUP X9, X8
#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
// MOVSLDUP X9, X9
#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
// ADDSUBPS X8, X9
#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8

// func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
TEXT ·AxpyInc(SB), NOSPLIT, $0
	MOVQ   x_base+8(FP), SI   // SI = &x
	MOVQ   y_base+32(FP), DI  // DI = &y
	MOVQ   n+56(FP), CX       // CX = n
	CMPQ   CX, $0             // if n==0 { return }
	JE     axpyi_end
	MOVQ   ix+80(FP), R8      // R8 = ix
	MOVQ   iy+88(FP), R9      // R9 = iy
	LEAQ   (SI)(R8*8), SI     // SI = &(x[ix])
	LEAQ   (DI)(R9*8), DI     // DI = &(y[iy])
	MOVQ   DI, DX             // DX = DI    // Read/Write pointers
	MOVQ   incX+64(FP), R8    // R8 = incX
	SHLQ   $3, R8             // R8 *= sizeof(complex64)
	MOVQ   incY+72(FP), R9    // R9 = incY
	SHLQ   $3, R9             // R9 *= sizeof(complex64)
	MOVSD  alpha+0(FP), X0    // X0 = { 0, 0, imag(a), real(a) }
	MOVAPS X0, X1
	SHUFPS $0x11, X1, X1      // X1 = { real(a), imag(a), real(a), imag(a) }
	MOVAPS X0, X10            // Copy X0 and X1 for pipelining
	MOVAPS X1, X11
	MOVQ   CX, BX
	ANDQ   $3, CX             // CX = n % 4
	SHRQ   $2, BX             // BX = floor( n / 4 )
	JZ     axpyi_tail         // if BX == 0 { goto axpyi_tail }

axpyi_loop: // do {
	MOVSD (SI), X3       // X_i = { imag(x[i]), real(x[i]) }
	MOVSD (SI)(R8*1), X5
	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
	MOVSD (SI), X7
	MOVSD (SI)(R8*1), X9

	// X_(i-1) = { imag(x[i]), imag(x[i]) }
	MOVSHDUP_X3_X2
	MOVSHDUP_X5_X4
	MOVSHDUP_X7_X6
	MOVSHDUP_X9_X8

	// X_i = { real(x[i]), real(x[i]) }
	MOVSLDUP_X3_X3
	MOVSLDUP_X5_X5
	MOVSLDUP_X7_X7
	MOVSLDUP_X9_X9

	// X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	// X_i     = { imag(a) * real(x[i]), real(a) * real(x[i]) }
	MULPS X1, X2
	MULPS X0, X3
	MULPS X11, X4
	MULPS X10, X5
	MULPS X1, X6
	MULPS X0, X7
	MULPS X11, X8
	MULPS X10, X9

	// X_i = {
	//	imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]),
	// }
	ADDSUBPS_X2_X3
	ADDSUBPS_X4_X5
	ADDSUBPS_X6_X7
	ADDSUBPS_X8_X9

	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
	MOVSD (DX), X2
	MOVSD (DX)(R9*1), X4
	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
	MOVSD (DX), X6
	MOVSD (DX)(R9*1), X8
	ADDPS X2, X3
	ADDPS X4, X5
	ADDPS X6, X7
	ADDPS X8, X9

	MOVSD X3, (DI)       // y[i] = X_i
	MOVSD X5, (DI)(R9*1)
	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
	MOVSD X7, (DI)
	MOVSD X9, (DI)(R9*1)
	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
	DECQ  BX
	JNZ   axpyi_loop     // } while --BX > 0
	CMPQ  CX, $0         // if CX == 0 { return }
	JE    axpyi_end

axpyi_tail: // do {
	MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) }
	MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
	MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }

	// X_i     = { imag(a) * real(x[i]), real(a) * real(x[i]) }
	// X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	MULPS X1, X2
	MULPS X0, X3

	// X_i = {
	//	imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]),
	// }
	ADDSUBPS_X2_X3 // (ai*x1r+ar*x1i, ar*x1r-ai*x1i)

	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
	MOVSD (DI), X4
	ADDPS X4, X3
	MOVSD X3, (DI)   // y[i] = X_i
	ADDQ  R8, SI     // SI += incX
	ADDQ  R9, DI     // DI += incY
	LOOP  axpyi_tail // } while --CX > 0

axpyi_end:
	RET
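For readers following the assembly: the MOVSHDUP/MOVSLDUP/MULPS/ADDSUBPS sequence is the standard SSE3 complex-multiply idiom, producing real(a)*real(x) - imag(a)*imag(x) in the low lane and imag(a)*real(x) + real(a)*imag(x) in the lane above it, after which the value of y is added and the result stored back. The routine as a whole is a strided complex AXPY: y[iy+k*incY] += alpha * x[ix+k*incX] for k = 0..n-1. The Go sketch below restates those semantics for reference only; axpyIncRef is a hypothetical name, and this is not the package's own noasm fallback, just an illustration using the same signature as the func stub above.

package main

import "fmt"

// axpyIncRef mirrors the semantics of AxpyInc above: for each of the n
// elements it computes y[iy] += alpha * x[ix], stepping ix by incX and
// iy by incY. Indices and increments are element counts, matching the
// uintptr parameters of the assembly routine.
func axpyIncRef(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) {
	for i := 0; i < int(n); i++ {
		y[iy] += alpha * x[ix]
		ix += incX
		iy += incY
	}
}

func main() {
	x := []complex64{1 + 1i, 2 + 2i, 3 + 3i, 4 + 4i}
	y := []complex64{1, 1, 1, 1}
	// Scale every element of x by 2+0i and accumulate into y,
	// with unit strides starting at the first element of each slice.
	axpyIncRef(2, x, y, 4, 1, 1, 0, 0)
	fmt.Println(y) // [(3+2i) (5+4i) (7+6i) (9+8i)]
}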