gonum.org/v1/gonum@v0.14.0/internal/asm/c64/axpyunitaryto_amd64.s

// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// MOVSHDUP X3, X2
#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
// MOVSLDUP X3, X3
#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
// ADDSUBPS X2, X3
#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA

// MOVSHDUP X5, X4
#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
// MOVSLDUP X5, X5
#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
// ADDSUBPS X4, X5
#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC

// MOVSHDUP X7, X6
#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
// MOVSLDUP X7, X7
#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
// ADDSUBPS X6, X7
#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE

// MOVSHDUP X9, X8
#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
// MOVSLDUP X9, X9
#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
// ADDSUBPS X8, X9
#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8

// func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64)
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
	MOVQ    dst_base+0(FP), DI // DI = &dst
	MOVQ    x_base+32(FP), SI  // SI = &x
	MOVQ    y_base+56(FP), DX  // DX = &y
	MOVQ    x_len+40(FP), CX
	CMPQ    y_len+64(FP), CX   // CX = min( len(x), len(y), len(dst) )
	CMOVQLE y_len+64(FP), CX
	CMPQ    dst_len+8(FP), CX
	CMOVQLE dst_len+8(FP), CX
	CMPQ    CX, $0             // if CX == 0 { return }
	JE      caxy_end
	MOVSD   alpha+24(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
	SHUFPD  $0, X0, X0         // X0 = { imag(a), real(a), imag(a), real(a) }
	MOVAPS  X0, X1
	SHUFPS  $0x11, X1, X1      // X1 = { real(a), imag(a), real(a), imag(a) }
	XORQ    AX, AX             // i = 0
	MOVQ    DX, BX             // Align on 16-byte boundary for ADDPS
	ANDQ    $15, BX            // BX = &y & 15
	JZ      caxy_no_trim       // if BX == 0 { goto caxy_no_trim }

	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }

	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) }
	ADDSUBPS_X2_X3
	MOVSD (DX)(AX*8), X4 // X3 += y[i]
	ADDPS X4, X3
	MOVSD X3, (DI)(AX*8) // dst[i] = X3
	INCQ  AX             // i++
	DECQ  CX             // --CX
	JZ    caxy_end       // if CX == 0 { return }

caxy_no_trim:
	MOVAPS X0, X10   // Copy X0 and X1 for pipelining
	MOVAPS X1, X11
	MOVQ   CX, BX
	ANDQ   $7, CX    // CX = n % 8
	SHRQ   $3, BX    // BX = floor( n / 8 )
	JZ     caxy_tail // if BX == 0 { goto caxy_tail }

caxy_loop:
	// X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) }
	MOVUPS (SI)(AX*8), X3
	MOVUPS 16(SI)(AX*8), X5
	MOVUPS 32(SI)(AX*8), X7
	MOVUPS 48(SI)(AX*8), X9

	// X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
	MOVSHDUP_X3_X2
	MOVSHDUP_X5_X4
	MOVSHDUP_X7_X6
	MOVSHDUP_X9_X8

	// X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
	MOVSLDUP_X3_X3
	MOVSLDUP_X5_X5
	MOVSLDUP_X7_X7
	MOVSLDUP_X9_X9

	// X_i     = { imag(a) * real(x[i]),   real(a) * real(x[i]),
	//             imag(a) * real(x[i+1]), real(a) * real(x[i+1]) }
	// X_(i-1) = { real(a) * imag(x[i]),   imag(a) * imag(x[i]),
	//             real(a) * imag(x[i+1]), imag(a) * imag(x[i+1]) }
	MULPS X1, X2
	MULPS X0, X3
	MULPS X11, X4
	MULPS X10, X5
	MULPS X1, X6
	MULPS X0, X7
	MULPS X11, X8
	MULPS X10, X9

	// X_i = {
	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
	//	imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]),
	//	real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]),
	// }
	ADDSUBPS_X2_X3
	ADDSUBPS_X4_X5
	ADDSUBPS_X6_X7
	ADDSUBPS_X8_X9

	// X_i = { imag(result[i])   + imag(y[i]),   real(result[i])   + real(y[i]),
	//         imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1]) }
	ADDPS (DX)(AX*8), X3
	ADDPS 16(DX)(AX*8), X5
	ADDPS 32(DX)(AX*8), X7
	ADDPS 48(DX)(AX*8), X9
	MOVUPS X3, (DI)(AX*8)   // dst[i:i+1] = X_i
	MOVUPS X5, 16(DI)(AX*8)
	MOVUPS X7, 32(DI)(AX*8)
	MOVUPS X9, 48(DI)(AX*8)
	ADDQ   $8, AX           // i += 8
	DECQ   BX               // --BX
	JNZ    caxy_loop        // } while BX > 0
	CMPQ   CX, $0           // if CX == 0 { return }
	JE     caxy_end

caxy_tail: // do {
	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }

	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//        real(a)*real(x[i]) - imag(a)*imag(x[i]) }
	ADDSUBPS_X2_X3
	MOVSD (DX)(AX*8), X4 // X3 += y[i]
	ADDPS X4, X3
	MOVSD X3, (DI)(AX*8) // dst[i] = X3
	INCQ  AX             // ++i
	LOOP  caxy_tail      // } while --CX > 0

caxy_end:
	RET
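For reference, the routine above computes dst[i] = alpha*x[i] + y[i] over the first min(len(x), len(y), len(dst)) elements, processing eight complex64 values per unrolled iteration and peeling one element up front when y is not 16-byte aligned so that the ADDPS memory operands are aligned. The pure-Go sketch below spells out those scalar semantics; the function name and the explicit length clamp are illustrative only and are not gonum's actual build-tag-selected fallback source.

func axpyUnitaryToRef(dst []complex64, alpha complex64, x, y []complex64) {
	// Clamp to the shortest slice, mirroring the CX = min(...) setup above.
	n := len(x)
	if len(y) < n {
		n = len(y)
	}
	if len(dst) < n {
		n = len(dst)
	}
	for i := 0; i < n; i++ {
		dst[i] = alpha*x[i] + y[i] // one complex multiply-add per element
	}
}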
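The complex multiply in the kernel uses the SSE3 idiom MOVSLDUP/MOVSHDUP + MULPS + ADDSUBPS: duplicate the real and imaginary parts of x into separate registers, multiply them against { real(a), imag(a) } and { imag(a), real(a) }, and let ADDSUBPS subtract in the even (real) lanes and add in the odd (imaginary) lanes. The Go sketch below mimics one complex64 lane pair to show why this equals a*x; the helper name is made up for illustration.

func complexMulSSE3(a, x complex64) complex64 {
	ar, ai := real(a), imag(a)
	xr, xi := real(x), imag(x)

	lo := [2]float32{xr, xr} // MOVSLDUP: real(x) duplicated into both lanes (X3)
	hi := [2]float32{xi, xi} // MOVSHDUP: imag(x) duplicated into both lanes (X2)

	lo[0] *= ar // MULPS X0, X3: { real(a)*real(x), imag(a)*real(x) }
	lo[1] *= ai
	hi[0] *= ai // MULPS X1, X2: { imag(a)*imag(x), real(a)*imag(x) }
	hi[1] *= ar

	// ADDSUBPS X2, X3: subtract in the real lane, add in the imaginary lane.
	return complex(lo[0]-hi[0], lo[1]+hi[1])
}

For example, complexMulSSE3(2+3i, 4-1i) and (2+3i)*(4-1i) both evaluate to 11+10i.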