gonum.org/v1/gonum@v0.14.0/internal/asm/c128/axpyincto_amd64.s (about)

     1  // Copyright ©2016 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !noasm,!gccgo,!safe
     6  
     7  #include "textflag.h"
     8  
     9  // MOVDDUP X2, X3 -- duplicate the low float64 of X2 into both lanes of X3. Emitted as raw opcode bytes (F2 0F 12 /r; ModRM 0xDA = reg X3, rm X2), presumably because the assembler lacked the mnemonic when this was written -- confirm before replacing.
    10  #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
    11  // MOVDDUP X4, X5 -- same instruction; ModRM 0xEC = reg X5, rm X4.
    12  #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
    13  // MOVDDUP X6, X7 -- same instruction; ModRM 0xFE = reg X7, rm X6.
    14  #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
    15  // MOVDDUP X8, X9 -- same instruction; the extra 0x45 byte is a REX prefix (R and B bits set) so that ModRM 0xC8 addresses X9 (reg) and X8 (rm) instead of X1 and X0.
    16  #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
    17  
    18  // ADDSUBPD X2, X3 -- X3.lo -= X2.lo, X3.hi += X2.hi. Emitted as raw opcode bytes (66 0F D0 /r; ModRM 0xDA = reg X3, rm X2).
    19  #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
    20  // ADDSUBPD X4, X5 -- same instruction; ModRM 0xEC = reg X5, rm X4.
    21  #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
    22  // ADDSUBPD X6, X7 -- same instruction; ModRM 0xFE = reg X7, rm X6.
    23  #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
    24  // ADDSUBPD X8, X9 -- same instruction; 0x45 is the REX prefix (R and B bits set) that makes ModRM 0xC8 address X9 (reg) and X8 (rm).
    25  #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
    26  
    27  // func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
    28  //
    29  // AxpyIncTo stores alpha*x[ix+i*incX] + y[iy+i*incY] into dst[idst+i*incDst]
    30  // for i = 0 .. n-1. Indices and increments are given in complex128 elements
    31  // and are converted to byte offsets (<<4) below. Nothing here checks bounds.
    32  //
    33  // Register use:
    34  //	DI, SI, DX  = current element of dst, x, y
    35  //	R10, R8, R9 = byte stride of dst, x, y
    36  //	BX, CX      = n/4 unrolled iterations, n%4 tail iterations
    37  //	X0, X10     = { imag(a), real(a) }
    38  //	X1, X11     = { real(a), imag(a) }
    39  //	X2..X9      = per-element temporaries
    40  //
    41  // Every 128-bit access to x, y and dst goes through MOVUPS, so none of the
    42  // three slices has to be 16-byte aligned. (An SSE arithmetic instruction with
    43  // a memory operand, e.g. ADDPD (DX), X3, would fault on an address that is
    44  // only 8-byte aligned, which is all Go guarantees for complex128.)
    45  TEXT ·AxpyIncTo(SB), NOSPLIT, $0
    46  	MOVQ   dst_base+0(FP), DI // DI = &dst[0]
    47  	MOVQ   x_base+56(FP), SI  // SI = &x[0]
    48  	MOVQ   y_base+80(FP), DX  // DX = &y[0]
    49  	MOVQ   n+104(FP), CX      // CX = n
    50  	TESTQ  CX, CX             // if n == 0 { return }
    51  	JZ     axpyi_end
    52  	MOVQ   ix+128(FP), R8     // R8 = ix  // Load the first index
    53  	SHLQ   $4, R8             // R8 *= sizeof(complex128)
    54  	MOVQ   iy+136(FP), R9     // R9 = iy
    55  	SHLQ   $4, R9             // R9 *= sizeof(complex128)
    56  	MOVQ   idst+32(FP), R10   // R10 = idst
    57  	SHLQ   $4, R10            // R10 *= sizeof(complex128)
    58  	LEAQ   (SI)(R8*1), SI     // SI = &(x[ix])
    59  	LEAQ   (DX)(R9*1), DX     // DX = &(y[iy])
    60  	LEAQ   (DI)(R10*1), DI    // DI = &(dst[idst])
    61  	MOVQ   incX+112(FP), R8   // R8 = incX
    62  	SHLQ   $4, R8             // R8 *= sizeof(complex128)
    63  	MOVQ   incY+120(FP), R9   // R9 = incY
    64  	SHLQ   $4, R9             // R9 *= sizeof(complex128)
    65  	MOVQ   incDst+24(FP), R10 // R10 = incDst
    66  	SHLQ   $4, R10            // R10 *= sizeof(complex128)
    67  	MOVUPS alpha+40(FP), X0   // X0 = { imag(a), real(a) }
    68  	MOVAPS X0, X1
    69  	SHUFPD $0x1, X1, X1       // X1 = { real(a), imag(a) }
    70  	MOVAPS X0, X10            // Copy X0 and X1 for pipelining
    71  	MOVAPS X1, X11
    72  	MOVQ   CX, BX
    73  	ANDQ   $3, CX             // CX = n % 4
    74  	SHRQ   $2, BX             // BX = floor( n / 4 ); sets ZF when n < 4
    75  	JZ     axpyi_tail         // if BX == 0 { goto axpyi_tail } (CX != 0 here because n != 0)
    76  
    77  axpyi_loop: // do {
    78  	MOVUPS (SI), X2       // X_i = { imag(x[i]), real(x[i]) }
    79  	MOVUPS (SI)(R8*1), X4
    80  	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])
    81  
    82  	MOVUPS (SI), X6
    83  	MOVUPS (SI)(R8*1), X8
    84  
    85  	// X_(i+1) = { real(x[i]), real(x[i]) }
    86  	MOVDDUP_X2_X3
    87  	MOVDDUP_X4_X5
    88  	MOVDDUP_X6_X7
    89  	MOVDDUP_X8_X9
    90  
    91  	// X_i = { imag(x[i]), imag(x[i]) }
    92  	SHUFPD $0x3, X2, X2
    93  	SHUFPD $0x3, X4, X4
    94  	SHUFPD $0x3, X6, X6
    95  	SHUFPD $0x3, X8, X8
    96  
    97  	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
    98  	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
    99  	MULPD X1, X2
   100  	MULPD X0, X3
   101  	MULPD X11, X4
   102  	MULPD X10, X5
   103  	MULPD X1, X6
   104  	MULPD X0, X7
   105  	MULPD X11, X8
   106  	MULPD X10, X9
   107  
   108  	// X_(i+1) = {
   109  	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
   110  	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
   111  	//  }
   112  	ADDSUBPD_X2_X3
   113  	ADDSUBPD_X4_X5
   114  	ADDSUBPD_X6_X7
   115  	ADDSUBPD_X8_X9
   116  
   117  	// X_i = { imag(y[i]), real(y[i]) }
   118  	// X2/X4/X6/X8 are dead after ADDSUBPD, so they are reused to hold y.
   119  	// y is loaded with MOVUPS because it may be only 8-byte aligned.
   120  	MOVUPS (DX), X2
   121  	MOVUPS (DX)(R9*1), X4
   122  	LEAQ   (DX)(R9*2), DX  // DX = &(DX[incY*2])
   123  	MOVUPS (DX), X6
   124  	MOVUPS (DX)(R9*1), X8
   125  
   126  	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
   127  	ADDPD  X2, X3
   128  	ADDPD  X4, X5
   129  	ADDPD  X6, X7
   130  	ADDPD  X8, X9
   131  
   132  	MOVUPS X3, (DI)        // dst[i] = X_(i+1)
   133  	MOVUPS X5, (DI)(R10*1)
   134  	LEAQ   (DI)(R10*2), DI
   135  	MOVUPS X7, (DI)
   136  	MOVUPS X9, (DI)(R10*1)
   137  	LEAQ   (SI)(R8*2), SI  // SI = &(SI[incX*2])
   138  	LEAQ   (DX)(R9*2), DX  // DX = &(DX[incY*2])
   139  	LEAQ   (DI)(R10*2), DI // DI = &(DI[incDst*2])
   140  	DECQ   BX
   141  	JNZ    axpyi_loop      // } while --BX > 0
   142  	TESTQ  CX, CX          // if CX == 0 { return }
   143  	JZ     axpyi_end
   144  
   145  axpyi_tail: // do {
   146  	MOVUPS (SI), X2     // X_i = { imag(x[i]), real(x[i]) }
   147  	MOVDDUP_X2_X3       // X_(i+1) = { real(x[i]), real(x[i]) }
   148  	SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
   149  	MULPD  X1, X2       // X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
   150  	MULPD  X0, X3       // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
   151  
   152  	// X_(i+1) = {
   153  	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
   154  	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
   155  	//  }
   156  	ADDSUBPD_X2_X3
   157  
   158  	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
   159  	MOVUPS (DX), X2   // X2 is dead after ADDSUBPD; unaligned-safe load of y[i]
   160  	ADDPD  X2, X3
   161  	MOVUPS X3, (DI)   // dst[i] = X_(i+1)
   162  	ADDQ   R8, SI     // SI += incX
   163  	ADDQ   R9, DX     // DX += incY
   164  	ADDQ   R10, DI    // DI += incDst
   165  	DECQ   CX
   166  	JNZ    axpyi_tail // } while --CX > 0
   167  
   168  axpyi_end:
   169  	RET