// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// The macros below emit MOVDDUP and ADDSUBPD as raw opcode bytes.
// Operand order in the macro names follows Go assembler convention
// (source, destination).

// MOVDDUP X2, X3
#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
// MOVDDUP X4, X5
#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
// MOVDDUP X6, X7
#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
// MOVDDUP X8, X9
#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8

// ADDSUBPD X2, X3
#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// ADDSUBPD X4, X5
#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// ADDSUBPD X6, X7
#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// ADDSUBPD X8, X9
#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8

// func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
//
// AxpyIncTo computes, for i = 0 .. n-1,
//
//	dst[idst + i*incDst] = alpha*x[ix + i*incX] + y[iy + i*incY]
//
// and returns immediately when n == 0. Elements are processed four at a
// time in axpyi_loop; the remaining n%4 elements are handled one at a
// time in axpyi_tail.
//
// This routine performs no bounds checks: only the base pointers of the
// slices are read, so every index touched must be valid in the caller.
// All loads and stores use unaligned moves, so the slices need only the
// natural alignment of complex128.
//
// Register use (vector lanes are written { high, low }):
//	DI, SI, DX    current element address in dst, x, y
//	R10, R8, R9   incDst, incX, incY scaled to bytes
//	BX            number of 4-element iterations remaining
//	CX            number of tail elements remaining
//	X0, X10       { imag(alpha), real(alpha) }
//	X1, X11       { real(alpha), imag(alpha) }
//	X2..X9        per-element temporaries
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
	MOVQ   dst_base+0(FP), DI // DI = &dst
	MOVQ   x_base+56(FP), SI  // SI = &x
	MOVQ   y_base+80(FP), DX  // DX = &y
	MOVQ   n+104(FP), CX      // CX = n
	TESTQ  CX, CX             // if n == 0 { return }
	JZ     axpyi_end
	MOVQ   ix+128(FP), R8     // R8 = ix  // Load the first index
	SHLQ   $4, R8             // R8 *= sizeof(complex128)
	MOVQ   iy+136(FP), R9     // R9 = iy
	SHLQ   $4, R9             // R9 *= sizeof(complex128)
	MOVQ   idst+32(FP), R10   // R10 = idst
	SHLQ   $4, R10            // R10 *= sizeof(complex128)
	LEAQ   (SI)(R8*1), SI     // SI = &(x[ix])
	LEAQ   (DX)(R9*1), DX     // DX = &(y[iy])
	LEAQ   (DI)(R10*1), DI    // DI = &(dst[idst])
	MOVQ   incX+112(FP), R8   // R8 = incX
	SHLQ   $4, R8             // R8 *= sizeof(complex128)
	MOVQ   incY+120(FP), R9   // R9 = incY
	SHLQ   $4, R9             // R9 *= sizeof(complex128)
	MOVQ   incDst+24(FP), R10 // R10 = incDst
	SHLQ   $4, R10            // R10 *= sizeof(complex128)
	MOVUPS alpha+40(FP), X0   // X0 = { imag(a), real(a) }
	MOVAPS X0, X1
	SHUFPD $0x1, X1, X1       // X1 = { real(a), imag(a) }
	MOVAPS X0, X10            // Copy X0 and X1 for pipelining
	MOVAPS X1, X11
	MOVQ   CX, BX
	ANDQ   $3, CX             // CX = n % 4
	SHRQ   $2, BX             // BX = floor( n / 4 )
	JZ     axpyi_tail         // if BX == 0 { goto axpyi_tail } (flags from SHRQ)

axpyi_loop: // do {
	MOVUPS (SI), X2       // X_i = { imag(x[i]), real(x[i]) }
	MOVUPS (SI)(R8*1), X4
	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])

	MOVUPS (SI), X6
	MOVUPS (SI)(R8*1), X8

	// X_(i+1) = { real(x[i]), real(x[i]) }
	MOVDDUP_X2_X3
	MOVDDUP_X4_X5
	MOVDDUP_X6_X7
	MOVDDUP_X8_X9

	// X_i = { imag(x[i]), imag(x[i]) }
	SHUFPD $0x3, X2, X2
	SHUFPD $0x3, X4, X4
	SHUFPD $0x3, X6, X6
	SHUFPD $0x3, X8, X8

	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
	MULPD X1, X2
	MULPD X0, X3
	MULPD X11, X4
	MULPD X10, X5
	MULPD X1, X6
	MULPD X0, X7
	MULPD X11, X8
	MULPD X10, X9

	// X_(i+1) = {
	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
	// }
	ADDSUBPD_X2_X3
	ADDSUBPD_X4_X5
	ADDSUBPD_X6_X7
	ADDSUBPD_X8_X9

	// X_i = { imag(y[i]), real(y[i]) }
	// X2, X4, X6 and X8 are dead after ADDSUBPD and are reused here.
	// y is loaded with MOVUPS rather than used as an ADDPD memory
	// operand because that form requires a 16-byte aligned address.
	MOVUPS (DX), X2
	MOVUPS (DX)(R9*1), X4
	LEAQ   (DX)(R9*2), DX // DX = &(DX[incY*2])
	MOVUPS (DX), X6
	MOVUPS (DX)(R9*1), X8

	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
	ADDPD X2, X3
	ADDPD X4, X5
	ADDPD X6, X7
	ADDPD X8, X9

	MOVUPS X3, (DI)         // dst[i] = X_(i+1)
	MOVUPS X5, (DI)(R10*1)
	LEAQ   (DI)(R10*2), DI
	MOVUPS X7, (DI)
	MOVUPS X9, (DI)(R10*1)
	LEAQ   (SI)(R8*2), SI   // SI = &(SI[incX*2])
	LEAQ   (DX)(R9*2), DX   // DX = &(DX[incY*2])
	LEAQ   (DI)(R10*2), DI  // DI = &(DI[incDst*2])
	DECQ   BX
	JNZ    axpyi_loop       // } while --BX > 0
	TESTQ  CX, CX           // if CX == 0 { return }
	JZ     axpyi_end

axpyi_tail: // do {   (CX > 0 on every entry)
	MOVUPS (SI), X2     // X_i     = { imag(x[i]), real(x[i]) }
	MOVDDUP_X2_X3       // X_(i+1) = { real(x[i]), real(x[i]) }
	SHUFPD $0x3, X2, X2 // X_i     = { imag(x[i]), imag(x[i]) }
	MULPD  X1, X2       // X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	MULPD  X0, X3       // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }

	// X_(i+1) = {
	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
	// }
	ADDSUBPD_X2_X3

	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
	MOVUPS (DX), X2   // unaligned-safe load of y[i]; X2 is dead here
	ADDPD  X2, X3
	MOVUPS X3, (DI)   // dst[i] = X_(i+1)
	ADDQ   R8, SI     // SI += incX
	ADDQ   R9, DX     // DX += incY
	ADDQ   R10, DI    // DI += incDst
	DECQ   CX
	JNZ    axpyi_tail // } while --CX > 0

axpyi_end:
	RET