github.com/gopherd/gonum@v0.0.4/internal/asm/c64/axpyunitary_amd64.s

// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// MOVSHDUP X3, X2
#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
// MOVSLDUP X3, X3
#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
// ADDSUBPS X2, X3
#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA

// MOVSHDUP X5, X4
#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
// MOVSLDUP X5, X5
#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
// ADDSUBPS X4, X5
#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC

// MOVSHDUP X7, X6
#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
// MOVSLDUP X7, X7
#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
// ADDSUBPS X6, X7
#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE

// MOVSHDUP X9, X8
#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
// MOVSLDUP X9, X9
#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
// ADDSUBPS X8, X9
#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
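
// The MOVSHDUP, MOVSLDUP, and ADDSUBPS macros above hand-assemble SSE3
// instructions as raw BYTE sequences, since the Go assembler did not
// support these opcodes when this file was written. Each macro is fixed
// to the register pair named in it.
//
// Together the three instructions implement complex multiplication.
// Writing a = real(a) + imag(a)*i and x = real(x) + imag(x)*i:
//
//	a*x = (real(a)*real(x) - imag(a)*imag(x))
//	    + (real(a)*imag(x) + imag(a)*real(x))*i
//
// MOVSLDUP duplicates the real parts of x, MOVSHDUP duplicates the
// imaginary parts, MULPS forms the four partial products, and ADDSUBPS
// subtracts in the even (real) lanes and adds in the odd (imaginary)
// lanes, yielding the interleaved real and imaginary results.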

// func AxpyUnitary(alpha complex64, x, y []complex64)
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
	MOVQ    x_base+8(FP), SI   // SI = &x
	MOVQ    y_base+32(FP), DI  // DI = &y
	MOVQ    x_len+16(FP), CX   // CX = min( len(x), len(y) )
	CMPQ    y_len+40(FP), CX
	CMOVQLE y_len+40(FP), CX
	CMPQ    CX, $0             // if CX == 0 { return }
	JE      caxy_end
	PXOR    X0, X0             // Clear work registers and cache-align loop
	PXOR    X1, X1
	MOVSD   alpha+0(FP), X0    // X0 = { 0, 0, imag(a), real(a) }
	SHUFPD  $0, X0, X0         // X0 = { imag(a), real(a), imag(a), real(a) }
	MOVAPS  X0, X1
	SHUFPS  $0x11, X1, X1      // X1 = { real(a), imag(a), real(a), imag(a) }
	XORQ    AX, AX             // i = 0
	MOVQ    DI, BX             // Align on 16-byte boundary for ADDPS
	ANDQ    $15, BX            // BX = &y & 15
	JZ      caxy_no_trim       // if BX == 0 { goto caxy_no_trim }

	// Trim first value in unaligned buffer
	XORPS X2, X2         // Clear work registers and cache-align loop
	XORPS X3, X3
	XORPS X4, X4
	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }

	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) }
	ADDSUBPS_X2_X3
	MOVSD (DI)(AX*8), X4 // X4 = y[i]
	ADDPS X4, X3         // X3 += y[i]
	MOVSD X3, (DI)(AX*8) // y[i] = X3
	INCQ  AX             // i++
	DECQ  CX             // --CX
	JZ    caxy_end       // if CX == 0 { return }

caxy_no_trim:
	MOVAPS X0, X10   // Copy X0 and X1 for pipelining
	MOVAPS X1, X11
	MOVQ   CX, BX
	ANDQ   $7, CX    // CX = n % 8
	SHRQ   $3, BX    // BX = floor( n / 8 )
	JZ     caxy_tail // if BX == 0 { goto caxy_tail }

caxy_loop: // do {
	// X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) }
	MOVUPS (SI)(AX*8), X3
	MOVUPS 16(SI)(AX*8), X5
	MOVUPS 32(SI)(AX*8), X7
	MOVUPS 48(SI)(AX*8), X9

	// X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
	MOVSHDUP_X3_X2
	MOVSHDUP_X5_X4
	MOVSHDUP_X7_X6
	MOVSHDUP_X9_X8

	// X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
	MOVSLDUP_X3_X3
	MOVSLDUP_X5_X5
	MOVSLDUP_X7_X7
	MOVSLDUP_X9_X9

	// X_i     = { imag(a) * real(x[i]),   real(a) * real(x[i]),
	//             imag(a) * real(x[i+1]), real(a) * real(x[i+1]) }
	// X_(i-1) = { real(a) * imag(x[i]),   imag(a) * imag(x[i]),
	//             real(a) * imag(x[i+1]), imag(a) * imag(x[i+1]) }
	MULPS X1, X2
	MULPS X0, X3
	MULPS X11, X4
	MULPS X10, X5
	MULPS X1, X6
	MULPS X0, X7
	MULPS X11, X8
	MULPS X10, X9

	// X_i = {
	//	imag(result[i]):   imag(a)*real(x[i])   + real(a)*imag(x[i]),
	//	real(result[i]):   real(a)*real(x[i])   - imag(a)*imag(x[i]),
	//	imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]),
	//	real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]),
	// }
	ADDSUBPS_X2_X3
	ADDSUBPS_X4_X5
	ADDSUBPS_X6_X7
	ADDSUBPS_X8_X9

	// X_i = { imag(result[i])   + imag(y[i]),   real(result[i])   + real(y[i]),
	//         imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1]) }
	ADDPS  (DI)(AX*8), X3
	ADDPS  16(DI)(AX*8), X5
	ADDPS  32(DI)(AX*8), X7
	ADDPS  48(DI)(AX*8), X9
	MOVUPS X3, (DI)(AX*8)   // y[i:i+8] = X_i
	MOVUPS X5, 16(DI)(AX*8)
	MOVUPS X7, 32(DI)(AX*8)
	MOVUPS X9, 48(DI)(AX*8)
	ADDQ   $8, AX           // i += 8
	DECQ   BX               // --BX
	JNZ    caxy_loop        // } while BX > 0
	CMPQ   CX, $0           // if CX == 0 { return }
	JE     caxy_end

caxy_tail: // do {
	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }

	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//        real(a)*real(x[i]) - imag(a)*imag(x[i]) }
	ADDSUBPS_X2_X3
	MOVSD (DI)(AX*8), X4 // X4 = y[i]
	ADDPS X4, X3         // X3 += y[i]
	MOVSD X3, (DI)(AX*8) // y[i] = X3
	INCQ  AX             // i++
	LOOP  caxy_tail      // } while --CX > 0

caxy_end:
	RET
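
// For reference, the kernel above is equivalent to this pure-Go sketch
// (mirroring what the non-assembly fallback build computes; a sketch of
// the semantics, not the package's literal source):
//
//	func AxpyUnitary(alpha complex64, x, y []complex64) {
//		n := len(x)
//		if len(y) < n {
//			n = len(y)
//		}
//		for i := 0; i < n; i++ {
//			y[i] += alpha * x[i]
//		}
//	}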