github.com/egonelbre/exp@v0.0.0-20240430123955-ed1d3aa93911/sse/sse_amd64.s (about)

     1  #include "textflag.h"
     2  
     3  // AX = &dst[0], BX = &src[0], CX = min(len(dst), len(src))
     4  #define SetupSlices \
     5  	MOVQ     dst+8(FP),  CX ; \
     6  	MOVQ     src+32(FP), BX ; \
     7  	CMPQ     CX, BX         ; \
     8  	CMOVQGT  BX, CX         ; \
     9  	MOVQ     dst+0(FP),  AX ; \
    10  	MOVQ     src+24(FP), BX ; \
    11  
    12  #define Load4x(FROM, A, B, C, D) \
    13  	MOVOU  +00(FROM), A; \
    14  	MOVOU  +16(FROM), B; \
    15  	MOVOU  +32(FROM), C; \
    16  	MOVOU  +48(FROM), D;
    17  
    18  #define Store4x(INTO, A, B, C, D)  \
    19  	MOVOU  A, +00(INTO); \
    20  	MOVOU  B, +16(INTO); \
    21  	MOVOU  C, +32(INTO); \
    22  	MOVOU  D, +48(INTO);
    23  
    24  #define Apply4x(OP) \
    25  	OP X0, X4;      \
    26  	OP X1, X5;      \
    27  	OP X2, X6;      \
    28  	OP X3, X7;
    29  
    30  
    31  // func AddU32_SSE(dst, src []uint32)
    32  TEXT ·AddU32_SSE(SB),NOSPLIT,$0-48
    33  	SetupSlices
    34  
    35  	vector:
    36  		SUBQ     $16, CX
    37  		JL trailing
    38  
    39  		Load4x(BX, X0, X1, X2, X3)
    40  		Load4x(AX, X4, X5, X6, X7)
    41  		Apply4x(PADDL)
    42  		Store4x(AX, X4, X5, X6, X7)
    43  
    44  		// increment
    45  		ADDQ $64, BX
    46  		ADDQ $64, AX
    47  		JMP vector
    48  
    49  	trailing:
    50  		ADDQ $17, CX
    51  	elem:
    52  		DECQ     CX
    53  		JZ done
    54  		MOVL (BX), DX
    55  		ADDL DX,  (AX)
    56  		ADDQ $4, BX
    57  		ADDQ $4, AX
    58  		JMP elem
    59  	done:
    60  		RET
    61  
    62  // func SubU32_SSE(dst, src []uint32)
    63  TEXT ·SubU32_SSE(SB),NOSPLIT,$0
    64  	SetupSlices
    65  
    66  	vector:
    67  		SUBQ     $16, CX
    68  		JL trailing
    69  
    70  		Load4x(BX, X0, X1, X2, X3)
    71  		Load4x(AX, X4, X5, X6, X7)
    72  		Apply4x(PSUBL)
    73  		Store4x(AX, X4, X5, X6, X7)
    74  
    75  		ADDQ $64, BX
    76  		ADDQ $64, AX
    77  		JMP vector
    78  
    79  	trailing:
    80  		ADDQ $17, CX
    81  	elem:
    82  		DECQ     CX
    83  		JZ done
    84  		MOVL (BX), DX
    85  		// sub
    86  		SUBL DX, (AX)
    87  		// increment
    88  		ADDQ $4, BX
    89  		ADDQ $4, AX
    90  		JMP elem
    91  	done:
    92  		RET
    93  
    94  // func MulU32_SSE(dst, src []uint32)
    95  TEXT ·MulU32_SSE(SB),NOSPLIT,$0
    96  	SetupSlices
    97  
    98  	vector:
    99  		SUBQ     $16, CX
   100  		JL trailing
   101  
   102  		Load4x(BX, X0, X1, X2, X3)
   103  		Load4x(AX, X4, X5, X6, X7)
   104  		Apply4x(PMULLW)
   105  		Store4x(AX, X4, X5, X6, X7)
   106  
   107  		// increment
   108  		ADDQ $64, BX
   109  		ADDQ $64, AX
   110  		JMP vector
   111  
   112  	trailing:
   113  		ADDQ $17, CX
   114  	elem:
   115  		DECQ     CX
   116  		JZ done
   117  		MOVL (BX), DX
   118  		// sub
   119  		IMULL (AX), DX
   120  		MOVL  DX, (AX)
   121  		// increment
   122  		ADDQ $4, BX
   123  		ADDQ $4, AX
   124  		JMP elem
   125  	done:
   126  		RET