gonum.org/v1/gonum@v0.14.0/internal/asm/c128/axpyinc_amd64.s (about)

     1  // Copyright ©2016 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !noasm,!gccgo,!safe
     6  
     7  #include "textflag.h"
     8  
     9  // MOVDDUP X2, X3 (source, destination): X3 = { X2[0], X2[0] }, i.e. the low double of X2 is duplicated into both halves of X3.
    10  #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
    11  // MOVDDUP X4, X5: X5 = { X4[0], X4[0] }
    12  #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
    13  // MOVDDUP X6, X7: X7 = { X6[0], X6[0] }
    14  #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
    15  // MOVDDUP X8, X9: X9 = { X8[0], X8[0] } (the extra 0x45 byte is the REX prefix that selects X8/X9)
    16  #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
    17  
    18  // ADDSUBPD X2, X3 (source, destination): X3 = { X3[1] + X2[1], X3[0] - X2[0] }, i.e. add in the high half, subtract in the low half.
    19  #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
    20  // ADDSUBPD X4, X5: X5 = { X5[1] + X4[1], X5[0] - X4[0] }
    21  #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
    22  // ADDSUBPD X6, X7: X7 = { X7[1] + X6[1], X7[0] - X6[0] }
    23  #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
    24  // ADDSUBPD X8, X9: X9 = { X9[1] + X8[1], X9[0] - X8[0] } (the extra 0x45 byte is the REX prefix that selects X8/X9)
    25  #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
    26  
    27  // func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
        //
        // AxpyInc performs y[iy + i*incY] += alpha * x[ix + i*incX] for i in [0, n).
        // ix, iy, incX and incY are element counts; they are scaled to byte offsets
        // (16 bytes per complex128) below. Only the base pointers of x and y are
        // read from the slice headers, so no bounds checks are made in this routine.
        // The main loop handles four elements per iteration and the tail loop handles
        // the remaining n % 4 elements one at a time.
        //
        // Vector register comments are written { high 64 bits, low 64 bits }.
        // Register roles:
        //   SI       x read pointer
        //   DX       y read pointer (main loop only)
        //   DI       y write pointer (also the read pointer in the tail loop)
        //   R8, R9   byte strides for x and y
        //   BX, CX   main-loop and tail-loop trip counts
        //   X0, X10  alpha
        //   X1, X11  alpha with its real and imaginary halves swapped
    28  TEXT ·AxpyInc(SB), NOSPLIT, $0
    29  	MOVQ   x_base+16(FP), SI // SI = &x[0]
    30  	MOVQ   y_base+40(FP), DI // DI = &y[0]
    31  	MOVQ   n+64(FP), CX      // CX = n
    32  	CMPQ   CX, $0            // if n==0 { return }
    33  	JE     axpyi_end
    34  	MOVQ   ix+88(FP), R8     // R8 = ix  // Load the first index
    35  	SHLQ   $4, R8            // R8 *= sizeof(complex128)
    36  	MOVQ   iy+96(FP), R9     // R9 = iy
    37  	SHLQ   $4, R9            // R9 *= sizeof(complex128)
    38  	LEAQ   (SI)(R8*1), SI    // SI = &(x[ix])
    39  	LEAQ   (DI)(R9*1), DI    // DI = &(y[iy])
    40  	MOVQ   DI, DX            // DX = DI      // Separate Read/Write pointers
    41  	MOVQ   incX+72(FP), R8   // R8 = incX    // R8/R9 are reused: from here on they hold strides
    42  	SHLQ   $4, R8            // R8 *= sizeof(complex128)
    43  	MOVQ   incY+80(FP), R9   // R9 = incY
    44  	SHLQ   $4, R9            // R9 *= sizeof(complex128)
    45  	MOVUPS alpha+0(FP), X0   // X0 = { imag(a), real(a) }
    46  	MOVAPS X0, X1
    47  	SHUFPD $0x1, X1, X1      // X1 = { real(a), imag(a) }
    48  	MOVAPS X0, X10           // Copy X0 and X1 for pipelining
    49  	MOVAPS X1, X11
    50  	MOVQ   CX, BX
    51  	ANDQ   $3, CX            // CX = n % 4
    52  	SHRQ   $2, BX            // BX = floor( n / 4 )
    53  	JZ     axpyi_tail        // if BX == 0 { goto axpyi_tail }  // ZF comes from SHRQ; n != 0 here, so CX != 0 on this path
    54  
    55  axpyi_loop: // do {
    56  	MOVUPS (SI), X2       // X_i = { imag(x[i]), real(x[i]) }
    57  	MOVUPS (SI)(R8*1), X4
    58  	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])
    59  	MOVUPS (SI), X6
    60  	MOVUPS (SI)(R8*1), X8
    61  
    62  	// X_(i+1) = { real(x[i]), real(x[i]) }
    63  	MOVDDUP_X2_X3
    64  	MOVDDUP_X4_X5
    65  	MOVDDUP_X6_X7
    66  	MOVDDUP_X8_X9
    67  
    68  	// X_i = { imag(x[i]), imag(x[i]) }
    69  	SHUFPD $0x3, X2, X2
    70  	SHUFPD $0x3, X4, X4
    71  	SHUFPD $0x3, X6, X6
    72  	SHUFPD $0x3, X8, X8
    73  
    74  	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
    75  	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
    76  	MULPD X1, X2
    77  	MULPD X0, X3
    78  	MULPD X11, X4
    79  	MULPD X10, X5
    80  	MULPD X1, X6
    81  	MULPD X0, X7
    82  	MULPD X11, X8
    83  	MULPD X10, X9
    84  
    85  	// X_(i+1) = {
    86  	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
    87  	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
    88  	//  }
    89  	ADDSUBPD_X2_X3
    90  	ADDSUBPD_X4_X5
    91  	ADDSUBPD_X6_X7
    92  	ADDSUBPD_X8_X9
    93  
    94  	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
    95  	ADDPD  (DX), X3
    96  	ADDPD  (DX)(R9*1), X5
    97  	LEAQ   (DX)(R9*2), DX // DX = &(DX[incY*2])
    98  	ADDPD  (DX), X7
    99  	ADDPD  (DX)(R9*1), X9
   100  	MOVUPS X3, (DI)       // dst[i] = X_(i+1)
   101  	MOVUPS X5, (DI)(R9*1)
   102  	LEAQ   (DI)(R9*2), DI
   103  	MOVUPS X7, (DI)
   104  	MOVUPS X9, (DI)(R9*1)
   105  	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])  // with the mid-loop LEAQ, 4*incX per iteration
   106  	LEAQ   (DX)(R9*2), DX // DX = &(DX[incY*2])
   107  	LEAQ   (DI)(R9*2), DI // DI = &(DI[incY*2])
   108  	DECQ   BX
   109  	JNZ    axpyi_loop     // } while --BX > 0
   110  	CMPQ   CX, $0         // if CX == 0 { return }
   111  	JE     axpyi_end
   112  
   113  axpyi_tail: // do {
   114  	MOVUPS (SI), X2     // X_i = { imag(x[i]), real(x[i]) }
   115  	MOVDDUP_X2_X3       // X_(i+1) = { real(x[i]), real(x[i]) }
   116  	SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
   117  	MULPD  X1, X2       // X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
   118  	MULPD  X0, X3       // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
   119  
   120  	// X_(i+1) = {
   121  	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
   122  	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
   123  	//  }
   124  	ADDSUBPD_X2_X3
   125  
   126  	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
   127  	ADDPD  (DI), X3
   128  	MOVUPS X3, (DI)   // y[i] = X_(i+1)
   129  	ADDQ   R8, SI     // SI = &(SI[incX])
   130  	ADDQ   R9, DI     // DI = &(DI[incY])
   131  	LOOP   axpyi_tail // } while --CX > 0  // LOOP decrements CX and branches while it is non-zero
   132  
   133  axpyi_end:
   134  	RET