// Origin: github.com/gopherd/gonum@v0.0.4/internal/asm/c128/axpyinc_amd64.s

// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// The SSE3 instructions below are emitted as raw bytes rather than by
// mnemonic. Each macro encodes exactly the register pair named in the comment
// above it.

// MOVDDUP X2, X3
#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
// MOVDDUP X4, X5
#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
// MOVDDUP X6, X7
#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
// MOVDDUP X8, X9
#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8

// ADDSUBPD X2, X3
#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// ADDSUBPD X4, X5
#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// ADDSUBPD X6, X7
#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// ADDSUBPD X8, X9
#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8

// func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
//
// AxpyInc performs, for i = 0 .. n-1:
//
//	y[iy + i*incY] += alpha * x[ix + i*incX]
//
// Arguments are read from the stack frame (FP offsets); nothing is returned.
// Four elements are processed per iteration of the main loop and the
// remaining n%4 elements are handled one at a time in the tail loop.
//
// Register roles:
//	SI      read pointer into x
//	DX      read pointer into y (main loop)
//	DI      write pointer into y (also the read pointer in the tail loop)
//	R8, R9  incX, incY scaled to bytes (16 bytes per complex128)
//	BX      number of 4-element iterations remaining
//	CX      number of tail elements remaining
//	X0, X10 alpha as loaded
//	X1, X11 alpha with its two halves swapped
//	X2-X9   per-element temporaries
//
// In the comments a vector is written { high 64 bits, low 64 bits }.
//
// All loads from x and y go through MOVUPS, so neither slice has to be
// 16-byte aligned. A memory operand on ADDPD would fault on an address that
// is only 8-byte aligned, so y is loaded into a temporary first.
TEXT ·AxpyInc(SB), NOSPLIT, $0
	MOVQ   x_base+16(FP), SI // SI = &x
	MOVQ   y_base+40(FP), DI // DI = &y
	MOVQ   n+64(FP), CX      // CX = n
	CMPQ   CX, $0            // if n==0 { return }
	JE     axpyi_end
	MOVQ   ix+88(FP), R8     // R8 = ix  // Load the first index
	SHLQ   $4, R8            // R8 *= sizeof(complex128)
	MOVQ   iy+96(FP), R9     // R9 = iy
	SHLQ   $4, R9            // R9 *= sizeof(complex128)
	LEAQ   (SI)(R8*1), SI    // SI = &(x[ix])
	LEAQ   (DI)(R9*1), DI    // DI = &(y[iy])
	MOVQ   DI, DX            // DX = DI  // Separate Read/Write pointers
	MOVQ   incX+72(FP), R8   // R8 = incX
	SHLQ   $4, R8            // R8 *= sizeof(complex128)
	MOVQ   incY+80(FP), R9   // R9 = incY
	SHLQ   $4, R9            // R9 *= sizeof(complex128)
	MOVUPS alpha+0(FP), X0   // X0 = { imag(a), real(a) }
	MOVAPS X0, X1
	SHUFPD $0x1, X1, X1      // X1 = { real(a), imag(a) }
	MOVAPS X0, X10           // Copy X0 and X1 for pipelining
	MOVAPS X1, X11
	MOVQ   CX, BX
	ANDQ   $3, CX            // CX = n % 4
	SHRQ   $2, BX            // BX = floor( n / 4 )
	JZ     axpyi_tail        // if BX == 0 { goto axpyi_tail }

axpyi_loop: // do {
	MOVUPS (SI), X2       // X_i = { imag(x[i]), real(x[i]) }
	MOVUPS (SI)(R8*1), X4
	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])
	MOVUPS (SI), X6
	MOVUPS (SI)(R8*1), X8

	// X_(i+1) = { real(x[i]), real(x[i]) }
	MOVDDUP_X2_X3
	MOVDDUP_X4_X5
	MOVDDUP_X6_X7
	MOVDDUP_X8_X9

	// X_i = { imag(x[i]), imag(x[i]) }
	SHUFPD $0x3, X2, X2
	SHUFPD $0x3, X4, X4
	SHUFPD $0x3, X6, X6
	SHUFPD $0x3, X8, X8

	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
	MULPD X1, X2
	MULPD X0, X3
	MULPD X11, X4
	MULPD X10, X5
	MULPD X1, X6
	MULPD X0, X7
	MULPD X11, X8
	MULPD X10, X9

	// X_(i+1) = {
	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
	// }
	ADDSUBPD_X2_X3
	ADDSUBPD_X4_X5
	ADDSUBPD_X6_X7
	ADDSUBPD_X8_X9

	// X_i = { imag(y[i]), real(y[i]) }
	// X2/X4/X6/X8 are dead after ADDSUBPD and are reused to hold y, loaded
	// with MOVUPS so that y need not be 16-byte aligned. All four reads of y
	// complete before any store below.
	MOVUPS (DX), X2
	MOVUPS (DX)(R9*1), X4
	LEAQ   (DX)(R9*2), DX // DX = &(DX[incY*2])
	MOVUPS (DX), X6
	MOVUPS (DX)(R9*1), X8

	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
	ADDPD X2, X3
	ADDPD X4, X5
	ADDPD X6, X7
	ADDPD X8, X9

	MOVUPS X3, (DI)       // dst[i] = X_(i+1)
	MOVUPS X5, (DI)(R9*1)
	LEAQ   (DI)(R9*2), DI
	MOVUPS X7, (DI)
	MOVUPS X9, (DI)(R9*1)
	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])
	LEAQ   (DX)(R9*2), DX // DX = &(DX[incY*2])
	LEAQ   (DI)(R9*2), DI // DI = &(DI[incY*2])
	DECQ   BX
	JNZ    axpyi_loop     // } while --BX > 0
	CMPQ   CX, $0         // if CX == 0 { return }
	JE     axpyi_end

axpyi_tail: // do {
	MOVUPS (SI), X2     // X_i = { imag(x[i]), real(x[i]) }
	MOVDDUP_X2_X3       // X_(i+1) = { real(x[i]), real(x[i]) }
	SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
	MULPD  X1, X2       // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	MULPD  X0, X3       // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }

	// X_(i+1) = {
	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
	// }
	ADDSUBPD_X2_X3

	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
	MOVUPS (DI), X2     // X_i = { imag(y[i]), real(y[i]) } (unaligned-safe load)
	ADDPD  X2, X3
	MOVUPS X3, (DI)     // y[i] = X_(i+1)
	ADDQ   R8, SI       // SI = &(SI[incX])
	ADDQ   R9, DI       // DI = &(DI[incY])
	DECQ   CX
	JNZ    axpyi_tail   // } while --CX > 0

axpyi_end:
	RET