github.com/jingcheng-WU/gonum@v0.9.1-0.20210323123734-f1a2a11a8f7b/internal/asm/c128/axpyunitaryto_amd64.s (about) 1 // Copyright ©2016 The Gonum Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build !noasm,!gccgo,!safe 6 7 #include "textflag.h" 8 9 // MOVDDUP X2, X3 10 #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA 11 // MOVDDUP X4, X5 12 #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC 13 // MOVDDUP X6, X7 14 #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE 15 // MOVDDUP X8, X9 16 #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 17 18 // ADDSUBPD X2, X3 19 #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 20 // ADDSUBPD X4, X5 21 #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 // ADDSUBPD X6, X7 23 #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 24 // ADDSUBPD X8, X9 25 #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 26 27 // func AxpyUnitaryTo(dst []complex128, alpha complex64, x, y []complex128) 28 TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 29 MOVQ dst_base+0(FP), DI // DI = &dst 30 MOVQ x_base+40(FP), SI // SI = &x 31 MOVQ y_base+64(FP), DX // DX = &y 32 MOVQ x_len+48(FP), CX // CX = min( len(x), len(y), len(dst) ) 33 CMPQ y_len+72(FP), CX 34 CMOVQLE y_len+72(FP), CX 35 CMPQ dst_len+8(FP), CX 36 CMOVQLE dst_len+8(FP), CX 37 CMPQ CX, $0 // if CX == 0 { return } 38 JE caxy_end 39 MOVUPS alpha+24(FP), X0 // X0 = { imag(a), real(a) } 40 MOVAPS X0, X1 41 SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } 42 XORQ AX, AX // i = 0 43 MOVAPS X0, X10 // Copy X0 and X1 for pipelining 44 MOVAPS X1, X11 45 MOVQ CX, BX 46 ANDQ $3, CX // CX = n % 4 47 SHRQ $2, BX // BX = floor( n / 4 ) 48 JZ caxy_tail // if BX == 0 { goto caxy_tail } 49 50 caxy_loop: // do { 51 MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } 52 MOVUPS 16(SI)(AX*8), X4 53 MOVUPS 32(SI)(AX*8), X6 54 MOVUPS 48(SI)(AX*8), X8 55 56 // X_(i+1) = { real(x[i], real(x[i]) } 57 MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi) 58 MOVDDUP_X4_X5 59 MOVDDUP_X6_X7 60 MOVDDUP_X8_X9 61 62 // X_i = { imag(x[i]), imag(x[i]) } 63 SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr) 64 SHUFPD $0x3, X4, X4 65 SHUFPD $0x3, X6, X6 66 SHUFPD $0x3, X8, X8 67 68 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 69 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 70 MULPD X1, X2 71 MULPD X0, X3 72 MULPD X11, X4 73 MULPD X10, X5 74 MULPD X1, X6 75 MULPD X0, X7 76 MULPD X11, X8 77 MULPD X10, X9 78 79 // X_(i+1) = { 80 // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 81 // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 82 // } 83 ADDSUBPD_X2_X3 84 ADDSUBPD_X4_X5 85 ADDSUBPD_X6_X7 86 ADDSUBPD_X8_X9 87 88 // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 89 ADDPD (DX)(AX*8), X3 90 ADDPD 16(DX)(AX*8), X5 91 ADDPD 32(DX)(AX*8), X7 92 ADDPD 48(DX)(AX*8), X9 93 MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) 94 MOVUPS X5, 16(DI)(AX*8) 95 MOVUPS X7, 32(DI)(AX*8) 96 MOVUPS X9, 48(DI)(AX*8) 97 ADDQ $8, AX // i += 8 98 DECQ BX 99 JNZ caxy_loop // } while --BX > 0 100 CMPQ CX, $0 // if CX == 0 { return } 101 JE caxy_end 102 103 caxy_tail: // Same calculation, but read in values to avoid trampling memory 104 MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } 105 MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } 106 SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } 107 MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 108 MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 109 110 // X_(i+1) = { 111 // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 112 // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 113 // } 114 ADDSUBPD_X2_X3 115 116 // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 117 ADDPD (DX)(AX*8), X3 118 MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) 119 ADDQ $2, AX // i += 2 120 LOOP caxy_tail // } while --CX > 0 121 122 caxy_end: 123 RET