// Origin: github.com/egonelbre/exp, sse/sse_amd64.s
//
// Go (Plan 9 syntax) amd64 assembly. Element-wise uint32 slice arithmetic
// using SSE2 instructions only. Every function has the Go prototype
//
//	func Op(dst, src []uint32)
//
// and updates dst[i] = dst[i] OP src[i] for i in [0, min(len(dst), len(src))).
// The argument block is two 24-byte slice headers, hence the $0-48 frames.

#include "textflag.h"

// SetupSlices loads the slice arguments into registers:
//   AX = &dst[0], BX = &src[0], CX = min(len(dst), len(src))
// BX briefly holds len(src) for the comparison before it is overwritten
// with the src base pointer.
#define SetupSlices \
	MOVQ    dst_len+8(FP), CX   ; \
	MOVQ    src_len+32(FP), BX  ; \
	CMPQ    CX, BX              ; \
	CMOVQGT BX, CX              ; \
	MOVQ    dst_base+0(FP), AX  ; \
	MOVQ    src_base+24(FP), BX

// Load4x reads 64 bytes (16 uint32) from FROM into A..D with unaligned loads.
#define Load4x(FROM, A, B, C, D) \
	MOVOU 0(FROM), A  ; \
	MOVOU 16(FROM), B ; \
	MOVOU 32(FROM), C ; \
	MOVOU 48(FROM), D ;

// Store4x writes A..D as 64 bytes to INTO with unaligned stores.
#define Store4x(INTO, A, B, C, D) \
	MOVOU A, 0(INTO)  ; \
	MOVOU B, 16(INTO) ; \
	MOVOU C, 32(INTO) ; \
	MOVOU D, 48(INTO) ;

// Apply4x applies the two-operand instruction OP with X0..X3 as sources
// and X4..X7 as the matching destinations.
#define Apply4x(OP) \
	OP X0, X4 ; \
	OP X1, X5 ; \
	OP X2, X6 ; \
	OP X3, X7 ;

// MulU32x4 sets each 32-bit lane of DST to the low 32 bits of DST*SRC.
// SSE2 has no packed 32-bit low multiply (PMULLW is 16-bit, PMULLD needs
// SSE4.1), so the product is assembled from two PMULULQ (PMULUDQ) multiplies:
// one over the even lanes, one over the odd lanes shifted into even position.
// PSHUFD $0x08 then gathers the low dword of each 64-bit product into lanes
// 0 and 1, and PUNPCKLLQ (PUNPCKLDQ) interleaves even and odd results back
// into lane order. SRC is preserved; T0 and T1 are clobbered.
#define MulU32x4(SRC, DST, T0, T1) \
	MOVOU     DST, T0         ; \
	MOVOU     SRC, T1         ; \
	PSRLQ     $32, T0         ; \
	PSRLQ     $32, T1         ; \
	PMULULQ   SRC, DST        ; \
	PMULULQ   T1, T0          ; \
	PSHUFD    $0x08, DST, DST ; \
	PSHUFD    $0x08, T0, T0   ; \
	PUNPCKLLQ T0, DST         ;

// func AddU32_SSE(dst, src []uint32)
//
// dst[i] += src[i], 16 elements per vector iteration, then one at a time.
TEXT ·AddU32_SSE(SB), NOSPLIT, $0-48
	SetupSlices

vector:
	// CX = elements remaining; leave once fewer than 16 are left.
	SUBQ $16, CX
	JL   trailing

	Load4x(BX, X0, X1, X2, X3)
	Load4x(AX, X4, X5, X6, X7)
	Apply4x(PADDL)
	Store4x(AX, X4, X5, X6, X7)

	// Advance both pointers by 16 elements (64 bytes).
	ADDQ $64, BX
	ADDQ $64, AX
	JMP  vector

trailing:
	// CX is (remaining - 16) here. Adding 17 yields remaining + 1, so the
	// DECQ at the top of the loop doubles as the loop-exit test.
	ADDQ $17, CX

elem:
	DECQ CX
	JZ   done
	MOVL (BX), DX
	ADDL DX, (AX)
	ADDQ $4, BX
	ADDQ $4, AX
	JMP  elem

done:
	RET

// func SubU32_SSE(dst, src []uint32)
//
// dst[i] -= src[i], 16 elements per vector iteration, then one at a time.
TEXT ·SubU32_SSE(SB), NOSPLIT, $0-48
	SetupSlices

vector:
	// CX = elements remaining; leave once fewer than 16 are left.
	SUBQ $16, CX
	JL   trailing

	Load4x(BX, X0, X1, X2, X3)
	Load4x(AX, X4, X5, X6, X7)
	Apply4x(PSUBL)
	Store4x(AX, X4, X5, X6, X7)

	// Advance both pointers by 16 elements (64 bytes).
	ADDQ $64, BX
	ADDQ $64, AX
	JMP  vector

trailing:
	// CX is (remaining - 16) here; +17 biases it for the DECQ/JZ loop test.
	ADDQ $17, CX

elem:
	DECQ CX
	JZ   done
	MOVL (BX), DX
	SUBL DX, (AX)
	ADDQ $4, BX
	ADDQ $4, AX
	JMP  elem

done:
	RET

// func MulU32_SSE(dst, src []uint32)
//
// dst[i] *= src[i] (low 32 bits of the product), 16 elements per vector
// iteration, then one at a time. The vector path uses MulU32x4 so that it
// produces the same 32-bit products as the scalar IMULL tail.
// Uses X8 and X9 as scratch in addition to X0..X7.
TEXT ·MulU32_SSE(SB), NOSPLIT, $0-48
	SetupSlices

vector:
	// CX = elements remaining; leave once fewer than 16 are left.
	SUBQ $16, CX
	JL   trailing

	Load4x(BX, X0, X1, X2, X3)
	Load4x(AX, X4, X5, X6, X7)
	MulU32x4(X0, X4, X8, X9)
	MulU32x4(X1, X5, X8, X9)
	MulU32x4(X2, X6, X8, X9)
	MulU32x4(X3, X7, X8, X9)
	Store4x(AX, X4, X5, X6, X7)

	// Advance both pointers by 16 elements (64 bytes).
	ADDQ $64, BX
	ADDQ $64, AX
	JMP  vector

trailing:
	// CX is (remaining - 16) here; +17 biases it for the DECQ/JZ loop test.
	ADDQ $17, CX

elem:
	DECQ  CX
	JZ    done
	MOVL  (BX), DX
	// DX = src[i] * dst[i]; the low 32 bits are the uint32 product.
	IMULL (AX), DX
	MOVL  DX, (AX)
	ADDQ  $4, BX
	ADDQ  $4, AX
	JMP   elem

done:
	RET