// Source listing: gonum.org/v1/gonum@v0.14.0/internal/asm/c64/axpyinc_amd64.s

     1  // Copyright ©2016 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !noasm,!gccgo,!safe
     6  
     7  #include "textflag.h"
     8  
// NOTE(review): the SSE3 instructions below (MOVSHDUP, MOVSLDUP, ADDSUBPS)
// are emitted as raw opcode bytes because the Go assembler did not accept
// these mnemonics when this file was written. Each macro hard-codes its
// source/destination XMM registers in the final ModR/M byte (and, for the
// X8/X9 forms, requires the REX.R/REX.B prefix byte $0x45), so the macros
// are usable only with the exact registers named in them.
     9  // MOVSHDUP X3, X2
    10  #define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
    11  // MOVSLDUP X3, X3
    12  #define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
    13  // ADDSUBPS X2, X3
    14  #define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
    15  
    16  // MOVSHDUP X5, X4
    17  #define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
    18  // MOVSLDUP X5, X5
    19  #define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
    20  // ADDSUBPS X4, X5
    21  #define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
    22  
    23  // MOVSHDUP X7, X6
    24  #define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
    25  // MOVSLDUP X7, X7
    26  #define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
    27  // ADDSUBPS X6, X7
    28  #define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
    29  
    30  // MOVSHDUP X9, X8
    31  #define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
    32  // MOVSLDUP X9, X9
    33  #define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
    34  // ADDSUBPS X8, X9
    35  #define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
    36  
    37  // func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
//
// AxpyInc computes the complex64 operation
//	y[iy+i*incY] += alpha * x[ix+i*incX]   for i in [0, n)
// with indices and strides given in elements (scaled to bytes below).
//
// Register roles established by the prologue:
//	SI = read pointer into x;  DI = write pointer into y
//	DX = separate read pointer into y (same stride as DI; see "Read/Write pointers")
//	R8 = incX*8 bytes;  R9 = incY*8 bytes
//	X0 = { imag(a), real(a) };  X1 = lane-swapped copy { real(a), imag(a) }
//	X10/X11 = copies of X0/X1 so the 4x-unrolled loop can pipeline
//	BX = floor(n/4) unrolled-loop counter;  CX = n%4 tail counter
    38  TEXT ·AxpyInc(SB), NOSPLIT, $0
    39  	MOVQ   x_base+8(FP), SI  // SI = &x
    40  	MOVQ   y_base+32(FP), DI // DI = &y
    41  	MOVQ   n+56(FP), CX      // CX = n
    42  	CMPQ   CX, $0            // if n==0 { return }
    43  	JE     axpyi_end
    44  	MOVQ   ix+80(FP), R8     // R8 = ix
    45  	MOVQ   iy+88(FP), R9     // R9 = iy
    46  	LEAQ   (SI)(R8*8), SI    // SI = &(x[ix])
    47  	LEAQ   (DI)(R9*8), DI    // DI = &(y[iy])
    48  	MOVQ   DI, DX            // DX = DI    // Read/Write pointers
    49  	MOVQ   incX+64(FP), R8   // R8 = incX
    50  	SHLQ   $3, R8            // R8 *= sizeof(complex64)
    51  	MOVQ   incY+72(FP), R9   // R9 = incY
    52  	SHLQ   $3, R9            // R9 *= sizeof(complex64)
    53  	MOVSD  alpha+0(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
    54  	MOVAPS X0, X1
    55  	SHUFPS $0x11, X1, X1     // X1 = { 0, 0, real(a), imag(a) }
    56  	MOVAPS X0, X10           // Copy X0 and X1 for pipelining
    57  	MOVAPS X1, X11
    58  	MOVQ   CX, BX
    59  	ANDQ   $3, CX            // CX = n % 4
    60  	SHRQ   $2, BX            // BX = floor( n / 4 )
    61  	JZ     axpyi_tail        // if BX == 0 { goto axpyi_tail }
    62  
    63  axpyi_loop: // do {  4 elements per iteration
    64  	MOVSD (SI), X3       // X_i = { imag(x[i]), real(x[i]) }
    65  	MOVSD (SI)(R8*1), X5
    66  	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
    67  	MOVSD (SI), X7
    68  	MOVSD (SI)(R8*1), X9
    69  
    70  	// X_(i-1) = { imag(x[i]), imag(x[i]) }   (MOVSHDUP duplicates the high lane)
    71  	MOVSHDUP_X3_X2
    72  	MOVSHDUP_X5_X4
    73  	MOVSHDUP_X7_X6
    74  	MOVSHDUP_X9_X8
    75  
    76  	// X_i = { real(x[i]), real(x[i]) }       (MOVSLDUP duplicates the low lane)
    77  	MOVSLDUP_X3_X3
    78  	MOVSLDUP_X5_X5
    79  	MOVSLDUP_X7_X7
    80  	MOVSLDUP_X9_X9
    81  
    82  	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]) }
    83  	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i])  }
    84  	MULPS X1, X2
    85  	MULPS X0, X3
    86  	MULPS X11, X4
    87  	MULPS X10, X5
    88  	MULPS X1, X6
    89  	MULPS X0, X7
    90  	MULPS X11, X8
    91  	MULPS X10, X9
    92  
    93  	// X_i = {
    94  	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
    95  	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
    96  	//  }
    97  	ADDSUBPS_X2_X3
    98  	ADDSUBPS_X4_X5
    99  	ADDSUBPS_X6_X7
   100  	ADDSUBPS_X8_X9
   101  
   102  	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
   103  	MOVSD (DX), X2       // reads go through DX, writes through DI
   104  	MOVSD (DX)(R9*1), X4
   105  	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
   106  	MOVSD (DX), X6
   107  	MOVSD (DX)(R9*1), X8
   108  	ADDPS X2, X3
   109  	ADDPS X4, X5
   110  	ADDPS X6, X7
   111  	ADDPS X8, X9
   112  
   113  	MOVSD X3, (DI)       // y[i] = X_i
   114  	MOVSD X5, (DI)(R9*1)
   115  	LEAQ  (DI)(R9*2), DI // DI = &(DI[incDst])
   116  	MOVSD X7, (DI)
   117  	MOVSD X9, (DI)(R9*1)
   118  	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
   119  	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
   120  	LEAQ  (DI)(R9*2), DI // DI = &(DI[incDst])
   121  	DECQ  BX
   122  	JNZ   axpyi_loop     // }  while --BX > 0
   123  	CMPQ  CX, $0         // if CX == 0 { return }
   124  	JE    axpyi_end
   125  
   126  axpyi_tail: // do {  one element per iteration; CX = n%4 > 0 on entry
   127  	MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) }
   128  	MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
   129  	MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
   130  
   131  	// X_i     = { imag(a) * real(x[i]),  real(a) * real(x[i]) }
   132  	// X_(i-1) = { real(a) * imag(x[i]),  imag(a) * imag(x[i]) }
   133  	MULPS X1, X2
   134  	MULPS X0, X3
   135  
   136  	// X_i = {
   137  	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
   138  	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
   139  	//  }
   140  	ADDSUBPS_X2_X3 // (ai*x1r+ar*x1i, ar*x1r-ai*x1i)
   141  
   142  	// X_i = { imag(result[i]) + imag(y[i]),  real(result[i]) + real(y[i])  }
   143  	MOVSD (DI), X4
   144  	ADDPS X4, X3
   145  	MOVSD X3, (DI)   // y[i] = X_i
   146  	ADDQ  R8, SI     // SI += incX
   147  	ADDQ  R9, DI     // DI += incY
   148  	LOOP  axpyi_tail // } while --CX > 0
   149  
   150  axpyi_end:
   151  	RET