gonum.org/v1/gonum@v0.14.0/internal/asm/c64/axpyincto_amd64.s (about)

     1  // Copyright ©2016 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !noasm,!gccgo,!safe
     6  
     7  #include "textflag.h"
     8  
     9  // MOVSHDUP X3, X2 (SSE3, emitted as raw bytes): X2 = odd-indexed floats of X3, each duplicated into the lane below
    10  #define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
    11  // MOVSLDUP X3, X3 (SSE3, emitted as raw bytes): X3 = even-indexed floats of X3, each duplicated into the lane above
    12  #define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
    13  // ADDSUBPS X2, X3 (SSE3, emitted as raw bytes): X3 = X3 - X2 in even lanes, X3 + X2 in odd lanes
    14  #define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
    15  
    16  // MOVSHDUP X5, X4 (same operation as above, for the X5/X4 register pair)
    17  #define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
    18  // MOVSLDUP X5, X5
    19  #define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
    20  // ADDSUBPS X4, X5
    21  #define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
    22  
    23  // MOVSHDUP X7, X6 (same operation as above, for the X7/X6 register pair)
    24  #define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
    25  // MOVSLDUP X7, X7
    26  #define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
    27  // ADDSUBPS X6, X7
    28  #define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
    29  
    30  // MOVSHDUP X9, X8 (the extra 0x45 byte is a REX prefix, needed to address registers X8 and above)
    31  #define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
    32  // MOVSLDUP X9, X9
    33  #define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
    34  // ADDSUBPS X8, X9
    35  #define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
    36  
    37  // func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
    38  TEXT ·AxpyIncTo(SB), NOSPLIT, $0 // dst[idst+i*incDst] = alpha*x[ix+i*incX] + y[iy+i*incY] for i in [0, n)
    39  	MOVQ   dst_base+0(FP), DI // DI = &dst[0]
    40  	MOVQ   x_base+48(FP), SI  // SI = &x[0]
    41  	MOVQ   y_base+72(FP), DX  // DX = &y[0]
    42  	MOVQ   n+96(FP), CX       // CX = n
    43  	CMPQ   CX, $0             // if n == 0 { return }
    44  	JE     axpyi_end
    45  	MOVQ   ix+120(FP), R8     // R8 = ix (starting index into x)
    46  	MOVQ   iy+128(FP), R9     // R9 = iy (starting index into y)
    47  	MOVQ   idst+32(FP), R10   // R10 = idst (starting index into dst)
    48  	LEAQ   (SI)(R8*8), SI     // SI = &(x[ix]); the scale of 8 is the size of one complex64
    49  	LEAQ   (DX)(R9*8), DX     // DX = &(y[iy])
    50  	LEAQ   (DI)(R10*8), DI    // DI = &(dst[idst])
    51  	MOVQ   incX+104(FP), R8   // R8 = incX * 8: stride in bytes, so pointers advance with plain ADDQ/LEAQ
    52  	SHLQ   $3, R8
    53  	MOVQ   incY+112(FP), R9   // R9 = incY * 8 (byte stride for y)
    54  	SHLQ   $3, R9
    55  	MOVQ   incDst+24(FP), R10 // R10 = incDst * 8 (byte stride for dst)
    56  	SHLQ   $3, R10
    57  	MOVSD  alpha+40(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
    58  	MOVAPS X0, X1
    59  	SHUFPS $0x11, X1, X1      // X1 = { real(a), imag(a), real(a), imag(a) }; only the low two lanes reach memory
    60  	MOVAPS X0, X10            // Copy X0 and X1 for pipelining
    61  	MOVAPS X1, X11
    62  	MOVQ   CX, BX
    63  	ANDQ   $3, CX             // CX = n % 4 (elements left for the tail loop)
    64  	SHRQ   $2, BX             // BX = floor( n / 4 ) (iterations of the 4-way unrolled loop)
    65  	JZ     axpyi_tail         // if BX == 0 { goto axpyi_tail } (ZF comes from the SHRQ above)
    66  
    67  axpyi_loop: // do {
    68  	MOVSD (SI), X3       // X_i = { imag(x[i]), real(x[i]) }; X_i is one of X3, X5, X7, X9
    69  	MOVSD (SI)(R8*1), X5 // second element, one stride further
    70  	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
    71  	MOVSD (SI), X7       // third element
    72  	MOVSD (SI)(R8*1), X9 // fourth element
    73  
    74  	// X_(i-1) = { imag(x[i]), imag(x[i]) }; X_(i-1) is one of X2, X4, X6, X8
    75  	MOVSHDUP_X3_X2
    76  	MOVSHDUP_X5_X4
    77  	MOVSHDUP_X7_X6
    78  	MOVSHDUP_X9_X8
    79  
    80  	// X_i = { real(x[i]), real(x[i]) }
    81  	MOVSLDUP_X3_X3
    82  	MOVSLDUP_X5_X5
    83  	MOVSLDUP_X7_X7
    84  	MOVSLDUP_X9_X9
    85  
    86  	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]) }
    87  	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i])  }
    88  	MULPS X1, X2
    89  	MULPS X0, X3
    90  	MULPS X11, X4  // X11 holds the same value as X1
    91  	MULPS X10, X5  // X10 holds the same value as X0
    92  	MULPS X1, X6
    93  	MULPS X0, X7
    94  	MULPS X11, X8
    95  	MULPS X10, X9
    96  
    97  	// X_i = {                                  (result[i] = a * x[i])
    98  	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
    99  	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
   100  	//  }
   101  	ADDSUBPS_X2_X3
   102  	ADDSUBPS_X4_X5
   103  	ADDSUBPS_X6_X7
   104  	ADDSUBPS_X8_X9
   105  
   106  	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
   107  	MOVSD (DX), X2       // X2, X4, X6, X8 are reused here to hold the four y elements
   108  	MOVSD (DX)(R9*1), X4
   109  	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
   110  	MOVSD (DX), X6
   111  	MOVSD (DX)(R9*1), X8
   112  	ADDPS X2, X3
   113  	ADDPS X4, X5
   114  	ADDPS X6, X7
   115  	ADDPS X8, X9
   116  
   117  	MOVSD X3, (DI)        // dst[i] = X_i (low 64 bits only)
   118  	MOVSD X5, (DI)(R10*1)
   119  	LEAQ  (DI)(R10*2), DI // DI = &(DI[incDst*2])
   120  	MOVSD X7, (DI)
   121  	MOVSD X9, (DI)(R10*1)
   122  	LEAQ  (SI)(R8*2), SI  // SI = &(SI[incX*2])
   123  	LEAQ  (DX)(R9*2), DX  // DX = &(DX[incY*2])
   124  	LEAQ  (DI)(R10*2), DI // DI = &(DI[incDst*2])
   125  	DECQ  BX
   126  	JNZ   axpyi_loop      // } while --BX > 0
   127  	CMPQ  CX, $0          // if CX == 0 { return }
   128  	JE    axpyi_end
   129  
   130  axpyi_tail: // do { (one element per iteration; CX > 0 whenever this label is reached)
   131  	MOVSD (SI), X3 // X_i     = { imag(x[i]), real(x[i]) }
   132  	MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
   133  	MOVSLDUP_X3_X3 // X_i     = { real(x[i]), real(x[i]) }
   134  
   135  	// X_i     = { imag(a) * real(x[i]),  real(a) * real(x[i]) }
   136  	// X_(i-1) = { real(a) * imag(x[i]),  imag(a) * imag(x[i]) }
   137  	MULPS X1, X2
   138  	MULPS X0, X3
   139  
   140  	// X_i = {
   141  	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
   142  	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
   143  	//  }
   144  	ADDSUBPS_X2_X3
   145  
   146  	// X_i = { imag(result[i]) + imag(y[i]),  real(result[i]) + real(y[i])  }
   147  	MOVSD (DX), X4
   148  	ADDPS X4, X3
   149  	MOVSD X3, (DI)   // dst[i] = X_i
   150  	ADDQ  R8, SI     // SI += incX (R8 is the byte stride)
   151  	ADDQ  R9, DX     // DX += incY (R9 is the byte stride)
   152  	ADDQ  R10, DI    // DI += incDst (R10 is the byte stride)
   153  	LOOP  axpyi_tail // } while --CX > 0
   154  
   155  axpyi_end:
   156  	RET