// Copyright 2018 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache-2.0
// license that can be found in the LICENSE file.

// +build amd64,!appengine

// Origin: github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/add_amd64.s
//
// Go (Plan 9 syntax) assembly, stack-based argument passing via FP.
// Every routine below is a leaf with a zero-byte local frame; the TEXT flag
// value 4 is NOSPLIT (spelled numerically, so textflag.h is not included).
//
// Shared idiom: MOVD puts the low 32 bits of val in X0, then PSHUFB with an
// all-zero shuffle mask copies byte 0 of X0 into all 16 byte lanes.
//
// All vector loads/stores are unaligned (MOVOU) and always 16 bytes wide.
// The loops run their body at least once without checking the length first;
// presumably the Go wrappers guarantee suitable buffer sizes -- confirm
// against the callers.

// addConst8TinyInplaceSSSE3Asm(main, val)
// Adds val (mod 256) to each of the 16 bytes starting at main, in place.
TEXT ·addConst8TinyInplaceSSSE3Asm(SB),4,$0-16
        MOVD    val+8(FP), X0
        PXOR    X1, X1
        PSHUFB  X1, X0                  // X0 = val replicated into every byte

        MOVQ    main+0(FP), DI          // DI = main
        MOVOU   (DI), X1
        PADDB   X0, X1
        MOVOU   X1, (DI)
        RET

// addConst8OddInplaceSSSE3Asm(main, val, nByte)
// Adds val (mod 256) to the bytes of main[0:nByte], in place.
// The last two 16-byte vectors are handled together in the tail so that an
// overlap between them is added to only once.
TEXT ·addConst8OddInplaceSSSE3Asm(SB),4,$0-24
        MOVD    val+8(FP), X0
        PXOR    X1, X1
        PSHUFB  X1, X0                  // X0 = val replicated into every byte

        MOVQ    main+0(FP), DI          // DI = cursor into main[]
        MOVQ    nByte+16(FP), SI

        // AX = main + nByte - 32. Skip the loop when the cursor is already
        // at or past it (signed compare; AX may lie below main).
        LEAQ    -32(DI)(SI*1), AX
        CMPQ    DI, AX
        JGE     addOddInplaceTail

addOddInplaceBody:
        // One full vector per iteration while cursor < AX.
        MOVOU   (DI), X1
        PADDB   X0, X1
        MOVOU   X1, (DI)
        ADDQ    $16, DI
        CMPQ    DI, AX
        JLT     addOddInplaceBody

addOddInplaceTail:
        // AX = main + nByte - 16, i.e. the final vector. Both the vector at
        // the cursor and the final vector are loaded before either is stored,
        // so bytes they share are read in their original state.
        ADDQ    $16, AX
        MOVOU   (DI), X1
        MOVOU   (AX), X2
        PADDB   X0, X1
        PADDB   X0, X2
        MOVOU   X1, (DI)
        MOVOU   X2, (AX)
        RET

// addConst8SSSE3Asm(dst, src, val, nByte)
// dst[i] = src[i] + val (mod 256), 16 bytes per iteration, until the source
// cursor reaches src + nByte. The body always executes at least once.
TEXT ·addConst8SSSE3Asm(SB),4,$0-32
        MOVD    val+16(FP), X0
        PXOR    X1, X1
        PSHUFB  X1, X0                  // X0 = val replicated into every byte

        MOVQ    src+8(FP), DI           // DI = source cursor
        MOVQ    dst+0(FP), R8           // R8 = destination cursor
        MOVQ    nByte+24(FP), SI
        ADDQ    DI, SI                  // SI = src + nByte (end of source)

addPlainBody:
        MOVOU   (DI), X1
        PADDB   X0, X1
        MOVOU   X1, (R8)
        ADDQ    $16, DI
        ADDQ    $16, R8
        CMPQ    DI, SI
        JLT     addPlainBody            // continue while cursor < end

        RET

// addConst8OddSSSE3Asm(dst, src, val, nByte)
// dst[i] = src[i] + val (mod 256) for i in [0, nByte). Full vectors are
// processed from the front; the last 16 bytes are then handled by a single
// access positioned at nByte-16, which may overlap the preceding vector.
TEXT ·addConst8OddSSSE3Asm(SB),4,$0-32
        MOVD    val+16(FP), X0
        PXOR    X1, X1
        PSHUFB  X1, X0                  // X0 = val replicated into every byte

        MOVQ    src+8(FP), DI           // DI = source cursor
        MOVQ    dst+0(FP), R8           // R8 = destination cursor
        MOVQ    nByte+24(FP), BX

        SUBQ    $16, BX                 // BX = nByte - 16
        LEAQ    (DI)(BX*1), AX          // AX = src + nByte - 16
        ADDQ    R8, BX                  // BX = dst + nByte - 16

addOddBody:
        MOVOU   (DI), X1
        PADDB   X0, X1
        MOVOU   X1, (R8)
        ADDQ    $16, DI
        ADDQ    $16, R8
        CMPQ    DI, AX
        JLT     addOddBody              // continue while cursor < AX

        // Trailing vector: read from src, so an overlap with the previous
        // store to dst does not matter.
        MOVOU   (AX), X1
        PADDB   X0, X1
        MOVOU   X1, (BX)
        RET

// subtractFromConst8TinyInplaceSSSE3Asm(main, val)
// Replaces each of the 16 bytes at main with val - byte (mod 256).
// Same shape as addConst8TinyInplaceSSSE3Asm, with the operand roles swapped
// so that the broadcast constant is the minuend.
TEXT ·subtractFromConst8TinyInplaceSSSE3Asm(SB),4,$0-16
        MOVD    val+8(FP), X0
        PXOR    X1, X1
        PSHUFB  X1, X0                  // X0 = val replicated into every byte

        MOVQ    main+0(FP), DI          // DI = main
        MOVOU   (DI), X1
        PSUBB   X1, X0                  // X0 = val - main[i]
        MOVOU   X0, (DI)
        RET

// subtractFromConst8OddInplaceSSSE3Asm(main, val, nByte)
// Replaces the bytes of main[0:nByte] with val - byte (mod 256), in place.
// Same structure as addConst8OddInplaceSSSE3Asm; because PSUBB overwrites
// its destination, the constant is copied into a scratch register first.
TEXT ·subtractFromConst8OddInplaceSSSE3Asm(SB),4,$0-24
        MOVD    val+8(FP), X0
        PXOR    X1, X1
        PSHUFB  X1, X0                  // X0 = val replicated into every byte

        MOVQ    main+0(FP), DI          // DI = cursor into main[]
        MOVQ    nByte+16(FP), SI

        // BX = main + nByte - 32. Skip the loop when the cursor is already
        // at or past it (signed compare; BX may lie below main).
        LEAQ    -32(DI)(SI*1), BX
        CMPQ    DI, BX
        JGE     subOddInplaceTail

subOddInplaceBody:
        MOVOU   (DI), X2
        MOVO    X0, X1                  // fresh copy of the constant
        PSUBB   X2, X1                  // X1 = val - main[i]
        MOVOU   X1, (DI)
        ADDQ    $16, DI
        CMPQ    DI, BX
        JLT     subOddInplaceBody

subOddInplaceTail:
        // BX = main + nByte - 16. Load both remaining vectors before storing
        // either one, so shared bytes are read in their original state.
        // X0 is consumed here; it is not needed afterwards.
        ADDQ    $16, BX
        MOVOU   (DI), X2
        MOVOU   (BX), X3
        MOVO    X0, X1
        PSUBB   X2, X0
        PSUBB   X3, X1
        MOVOU   X0, (DI)
        MOVOU   X1, (BX)
        RET

// subtractFromConst8SSSE3Asm(dst, src, val, nByte)
// dst[i] = val - src[i] (mod 256), 16 bytes per iteration, until the source
// cursor reaches src + nByte. The body always executes at least once.
TEXT ·subtractFromConst8SSSE3Asm(SB),4,$0-32
        MOVD    val+16(FP), X0
        PXOR    X1, X1
        PSHUFB  X1, X0                  // X0 = val replicated into every byte

        MOVQ    src+8(FP), DI           // DI = source cursor
        MOVQ    dst+0(FP), R8           // R8 = destination cursor
        MOVQ    nByte+24(FP), SI
        ADDQ    DI, SI                  // SI = src + nByte (end of source)

subPlainBody:
        MOVOU   (DI), X2
        MOVO    X0, X1                  // fresh copy of the constant
        PSUBB   X2, X1                  // X1 = val - src[i]
        MOVOU   X1, (R8)
        ADDQ    $16, DI
        ADDQ    $16, R8
        CMPQ    DI, SI
        JLT     subPlainBody            // continue while cursor < end

        RET

// subtractFromConst8OddSSSE3Asm(dst, src, val, nByte)
// dst[i] = val - src[i] (mod 256) for i in [0, nByte). Same structure as
// addConst8OddSSSE3Asm: full vectors from the front, then one final access
// positioned at nByte-16.
TEXT ·subtractFromConst8OddSSSE3Asm(SB),4,$0-32
        MOVD    val+16(FP), X0
        PXOR    X1, X1
        PSHUFB  X1, X0                  // X0 = val replicated into every byte

        MOVQ    src+8(FP), DI           // DI = source cursor
        MOVQ    dst+0(FP), R8           // R8 = destination cursor
        MOVQ    nByte+24(FP), BX

        SUBQ    $16, BX                 // BX = nByte - 16
        LEAQ    (DI)(BX*1), AX          // AX = src + nByte - 16
        ADDQ    R8, BX                  // BX = dst + nByte - 16

subOddBody:
        MOVOU   (DI), X2
        MOVO    X0, X1                  // fresh copy of the constant
        PSUBB   X2, X1                  // X1 = val - src[i]
        MOVOU   X1, (R8)
        ADDQ    $16, DI
        ADDQ    $16, R8
        CMPQ    DI, AX
        JLT     subOddBody              // continue while cursor < AX

        // Trailing vector. X0 can be consumed directly because nothing uses
        // the constant after this point.
        MOVOU   (AX), X1
        PSUBB   X1, X0
        MOVOU   X0, (BX)
        RET