github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/simd_amd64.s

// Copyright 2018 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache-2.0
// license that can be found in the LICENSE file.

// +build amd64,!appengine

DATA ·Mask0f0f<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA ·Mask0f0f<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
// NOPTR = 16, RODATA = 8
GLOBL ·Mask0f0f<>(SB), 24, $16

DATA ·Reverse8<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA ·Reverse8<>+0x08(SB)/8, $0x0001020304050607
GLOBL ·Reverse8<>(SB), 24, $16

TEXT ·unpackedNibbleLookupTinyInplaceSSSE3Asm(SB),4,$0-16
	// DI = pointer to current main[] element.
	MOVQ main+0(FP), DI
	MOVQ tablePtr+8(FP), SI
	MOVOU (SI), X0
	MOVOU (DI), X1
	PSHUFB X1, X0
	MOVOU X0, (DI)
	RET

TEXT ·unpackedNibbleLookupOddInplaceSSSE3Asm(SB),4,$0-24
	// DI = pointer to current main[] element.
	MOVQ main+0(FP), DI
	MOVQ tablePtr+8(FP), SI
	MOVQ nByte+16(FP), R9

	MOVOU (SI), X0

	// set AX to 32 bytes before end of main[].
	LEAQ -32(DI)(R9*1), AX

	CMPQ AX, DI
	JLE unpackedNibbleLookupOddInplaceSSSE3Finish

	// 'Odd' refers to handling of byte counts which aren't multiples of
	// bytesPerVec.  They don't have to literally be odd (or non-multiples
	// of bytesPerVec, for that matter).
unpackedNibbleLookupOddInplaceSSSE3Loop:
	MOVOU (DI), X1
	MOVO X0, X2
	PSHUFB X1, X2
	MOVOU X2, (DI)
	ADDQ $16, DI
	CMPQ AX, DI
	JG unpackedNibbleLookupOddInplaceSSSE3Loop

unpackedNibbleLookupOddInplaceSSSE3Finish:
	// These loads usually overlap, so they must both occur before the
	// first write-back.
	ADDQ $16, AX
	MOVOU (DI), X1
	MOVO X0, X2
	MOVOU (AX), X3
	PSHUFB X1, X2
	PSHUFB X3, X0
	MOVOU X2, (DI)
	MOVOU X0, (AX)
	RET

TEXT ·unpackedNibbleLookupSSSE3Asm(SB),4,$0-32
	// DI = pointer to current src[] element.
	// R8 = pointer to current dst[] element.
	MOVQ dst+0(FP), R8
	MOVQ src+8(FP), DI
	MOVQ tablePtr+16(FP), SI
	MOVQ nByte+24(FP), R9

	MOVOU (SI), X0

	// R9 = pointer to end of src[].
	ADDQ DI, R9

unpackedNibbleLookupSSSE3Loop:
	MOVOU (DI), X1
	MOVO X0, X2
	PSHUFB X1, X2
	MOVOU X2, (R8)
	ADDQ $16, DI
	ADDQ $16, R8
	CMPQ R9, DI
	JG unpackedNibbleLookupSSSE3Loop

	RET

TEXT ·unpackedNibbleLookupOddSSSE3Asm(SB),4,$0-32
	// DI = pointer to current src[] element.
	// R8 = pointer to current dst[] element.
	MOVQ dst+0(FP), R8
	MOVQ src+8(FP), DI
	MOVQ tablePtr+16(FP), SI
	MOVQ nByte+24(FP), R9

	MOVOU (SI), X0

	// set AX to 16 bytes before end of src[].
	// change R9 to 16 bytes before end of dst[].
	SUBQ $16, R9
	LEAQ 0(DI)(R9*1), AX
	ADDQ R8, R9

unpackedNibbleLookupOddSSSE3Loop:
	MOVOU (DI), X1
	MOVO X0, X2
	PSHUFB X1, X2
	MOVOU X2, (R8)
	ADDQ $16, DI
	ADDQ $16, R8
	CMPQ AX, DI
	JG unpackedNibbleLookupOddSSSE3Loop

	// Final usually-unaligned read and write.
	MOVOU (AX), X1
	PSHUFB X1, X0
	MOVOU X0, (R9)
	RET
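
// A scalar Go sketch of what the unpackedNibbleLookup* kernels above compute;
// the function name is illustrative. It assumes table points at 16 bytes and
// every src byte is below 16 (PSHUFB zeroes an output byte whenever bit 7 of
// the index byte is set, so inputs of 0x80 or above would not follow this
// model):
//
//	func unpackedNibbleLookupScalar(dst, src, table []byte) {
//		for i, b := range src {
//			dst[i] = table[b&0x0f]
//		}
//	}
//
// The Tiny/Odd in-place variants overwrite main[] with the looked-up values
// instead of writing to a separate dst[].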

TEXT ·packedNibbleLookupSSSE3Asm(SB),4,$0-32
	// DI = pointer to current src[] element.
	// R8 = pointer to current dst[] element.
	MOVQ dst+0(FP), R8
	MOVQ src+8(FP), DI
	MOVQ tablePtr+16(FP), SI
	MOVQ nSrcByte+24(FP), R9

	MOVOU (SI), X0
	MOVOU ·Mask0f0f<>(SB), X1

	// AX = pointer to last relevant word of src[].
	// (note that 8 src bytes -> 16 dst bytes)
	LEAQ -8(DI)(R9*1), AX
	CMPQ AX, DI
	JLE packedNibbleLookupSSSE3Final

packedNibbleLookupSSSE3Loop:
	MOVOU (DI), X3
	MOVO X0, X4
	MOVO X0, X5
	// Isolate high and low nibbles, then parallel-lookup.
	MOVO X3, X2
	PSRLQ $4, X3
	PAND X1, X2
	PAND X1, X3
	PSHUFB X2, X4
	PSHUFB X3, X5
	// Use unpacklo/unpackhi to stitch results together.
	// Even bytes (0, 2, 4, ...) are in X2/X4, odd in X5.
	MOVO X4, X2
	PUNPCKLBW X5, X4
	PUNPCKHBW X5, X2
	MOVOU X4, (R8)
	MOVOU X2, 16(R8)
	ADDQ $16, DI
	ADDQ $32, R8
	CMPQ AX, DI
	JG packedNibbleLookupSSSE3Loop
packedNibbleLookupSSSE3Final:
	// Necessary to write one more vector.  We skip unpackhi, but must
	// execute the rest of the loop body.
	MOVOU (DI), X3
	MOVO X0, X4
	MOVO X3, X2
	PSRLQ $4, X3
	PAND X1, X2
	PAND X1, X3
	PSHUFB X2, X4
	PSHUFB X3, X0
	PUNPCKLBW X0, X4
	MOVOU X4, (R8)
	RET

TEXT ·packedNibbleLookupOddSSSE3Asm(SB),4,$0-32
	// DI = pointer to current src[] element.
	// R8 = pointer to current dst[] element.
	MOVQ dst+0(FP), R8
	MOVQ src+8(FP), DI
	MOVQ tablePtr+16(FP), SI
	MOVQ nSrcFullByte+24(FP), R9

	MOVOU (SI), X0
	MOVOU ·Mask0f0f<>(SB), X1

	// set AX to 32 bytes before end of dst[].
	// change R9 to 16 bytes before end of src[].
	SUBQ $16, R9
	LEAQ 0(R8)(R9*2), AX
	ADDQ DI, R9

packedNibbleLookupOddSSSE3Loop:
	MOVOU (DI), X3
	MOVO X0, X4
	MOVO X0, X5
	// Isolate high and low nibbles, then parallel-lookup.
	MOVO X3, X2
	PSRLQ $4, X3
	PAND X1, X2
	PAND X1, X3
	PSHUFB X2, X4
	PSHUFB X3, X5
	// Use unpacklo/unpackhi to stitch results together.
	// Even bytes (0, 2, 4, ...) are in X2/X4, odd in X5.
	MOVO X4, X2
	PUNPCKLBW X5, X4
	PUNPCKHBW X5, X2
	MOVOU X4, (R8)
	MOVOU X2, 16(R8)
	ADDQ $16, DI
	ADDQ $32, R8
	CMPQ R9, DI
	JG packedNibbleLookupOddSSSE3Loop

	// Final usually-unaligned read and write.
	MOVOU (R9), X3
	MOVO X0, X4
	MOVO X3, X2
	PSRLQ $4, X3
	PAND X1, X2
	PAND X1, X3
	PSHUFB X2, X4
	PSHUFB X3, X0
	MOVO X4, X2
	PUNPCKLBW X0, X4
	PUNPCKHBW X0, X2
	MOVOU X4, (AX)
	MOVOU X2, 16(AX)
	RET
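
// A scalar Go sketch of the packedNibbleLookup* kernels above; the name is
// illustrative. Each src byte packs two nibbles, and the unpacklo/unpackhi
// ordering above sends low-nibble lookups to even dst positions and
// high-nibble lookups to odd ones:
//
//	func packedNibbleLookupScalar(dst, src, table []byte) {
//		for i, b := range src {
//			dst[2*i] = table[b&0x0f]
//			dst[2*i+1] = table[b>>4]
//		}
//	}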

TEXT ·interleave8SSE2Asm(SB),4,$0-32
	MOVQ dst+0(FP), R8
	MOVQ even+8(FP), SI
	MOVQ odd+16(FP), DI
	MOVQ nDstByte+24(FP), R9

	// AX = pointer to last vec of dst[].
	LEAQ -16(R8)(R9*1), AX
	CMPQ AX, R8
	JLE interleave8SSE2Final

interleave8SSE2Loop:
	// Read 16 bytes from even[] and odd[], and use _mm_unpacklo_epi8() and
	// _mm_unpackhi_epi8() to interleave and write 32 bytes to dst[].
	MOVOU (SI), X0
	MOVOU (DI), X1
	MOVO X0, X2
	PUNPCKLBW X1, X0
	PUNPCKHBW X1, X2
	MOVOU X0, (R8)
	MOVOU X2, 16(R8)
	ADDQ $16, SI
	ADDQ $32, R8
	ADDQ $16, DI
	CMPQ AX, R8
	JG interleave8SSE2Loop

interleave8SSE2Final:
	MOVOU (SI), X0
	MOVOU (DI), X1
	PUNPCKLBW X1, X0
	MOVOU X0, (R8)
	RET

TEXT ·interleave8OddSSE2Asm(SB),4,$0-32
	MOVQ dst+0(FP), R8
	MOVQ even+8(FP), SI
	MOVQ odd+16(FP), DI
	MOVQ nOddByte+24(FP), DX

	// AX = 16 bytes before end of even[].
	// BX = 16 bytes before end of odd[].
	// R9 = 32 bytes before end of dst[].
	LEAQ -16(SI)(DX*1), AX
	LEAQ -16(DI)(DX*1), BX
	LEAQ -32(R8)(DX*2), R9

interleave8OddSSE2Loop:
	// At least 32 bytes left to write to dst[].
	MOVOU (SI), X0
	MOVOU (DI), X1
	MOVO X0, X2
	PUNPCKLBW X1, X0
	PUNPCKHBW X1, X2
	MOVOU X0, (R8)
	MOVOU X2, 16(R8)
	ADDQ $16, SI
	ADDQ $16, DI
	ADDQ $32, R8
	CMPQ AX, SI
	JG interleave8OddSSE2Loop

	// Final read/write: back up to 16 bytes before end of even[]/odd[] and
	// 32 bytes before end of dst[].  This will usually re-write a bunch of
	// dst[] bytes, but that's okay.
	MOVOU (AX), X0
	MOVOU (BX), X1
	MOVO X0, X2
	PUNPCKLBW X1, X0
	PUNPCKHBW X1, X2
	MOVOU X0, (R9)
	MOVOU X2, 16(R9)
	RET

TEXT ·reverse8InplaceSSSE3Asm(SB),4,$0-16
	MOVQ main+0(FP), SI
	MOVQ nByte+8(FP), AX

	// DI iterates backwards from the end of main[].
	LEAQ -16(SI)(AX*1), DI
	CMPQ SI, DI
	JGE reverse8InplaceSSSE3Final
	MOVOU ·Reverse8<>(SB), X0

reverse8InplaceSSSE3Loop:
	MOVOU (SI), X1
	MOVOU (DI), X2
	PSHUFB X0, X1
	PSHUFB X0, X2
	MOVOU X2, (SI)
	MOVOU X1, (DI)
	ADDQ $16, SI
	SUBQ $16, DI
	CMPQ SI, DI
	JL reverse8InplaceSSSE3Loop

reverse8InplaceSSSE3Final:
	// 16 or fewer bytes left, [SI, DI+16).
	// If 8..16, load two words, bswap64, write back.
	// If 4..7, load two u32s, bswap32, write back.
	// If 2..3, swap two bytes.
	// If <= 1 (can be as small as -16), return immediately.
	SUBQ SI, DI
	// Now DI has remaining byte count - 16.
	CMPQ DI, $-14
	JL reverse8InplaceSSSE3Ret
	CMPQ DI, $-8
	JL reverse8InplaceSSSE3TwoThroughSeven
	LEAQ 8(SI)(DI*1), BX
	MOVQ (SI), R8
	MOVQ (BX), R9
	BSWAPQ R8
	BSWAPQ R9
	MOVQ R9, (SI)
	MOVQ R8, (BX)

reverse8InplaceSSSE3Ret:
	RET

reverse8InplaceSSSE3TwoThroughSeven:
	CMPQ DI, $-12
	JL reverse8InplaceSSSE3TwoOrThree
	LEAQ 12(SI)(DI*1), BX
	MOVL (SI), R8
	MOVL (BX), R9
	BSWAPL R8
	BSWAPL R9
	MOVL R9, (SI)
	MOVL R8, (BX)
	RET

reverse8InplaceSSSE3TwoOrThree:
	LEAQ 15(SI)(DI*1), BX
	MOVB (SI), R8
	MOVB (BX), R9
	MOVB R8, (BX)
	MOVB R9, (SI)
	RET

TEXT ·reverse8SSSE3Asm(SB),4,$0-24
	MOVQ dst+0(FP), DI
	MOVQ src+8(FP), SI
	MOVQ nByte+16(FP), DX

	// R8 iterates backwards from the end of src[].
	LEAQ -16(SI)(DX*1), R8
	MOVOU ·Reverse8<>(SB), X0
	// Save final dst[] pointer for later.
	LEAQ -16(DI)(DX*1), R9

reverse8SSSE3Loop:
	MOVOU (R8), X1
	PSHUFB X0, X1
	MOVOU X1, (DI)
	SUBQ $16, R8
	ADDQ $16, DI
	CMPQ SI, R8
	JL reverse8SSSE3Loop

	MOVOU (SI), X1
	PSHUFB X0, X1
	MOVOU X1, (R9)
	RET
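
// Scalar Go sketches of the interleave8* and reverse8* kernels above; the
// names are illustrative. interleave8 alternates bytes from even[] and odd[]
// into dst[] (PUNPCKLBW/PUNPCKHBW above), and reverse8Inplace reverses main[]
// byte by byte (PSHUFB against the Reverse8 constant above, plus the
// bswap/byte-swap tails):
//
//	func interleave8Scalar(dst, even, odd []byte) {
//		for i := range even {
//			dst[2*i] = even[i]
//			dst[2*i+1] = odd[i]
//		}
//	}
//
//	func reverse8InplaceScalar(main []byte) {
//		for i, j := 0, len(main)-1; i < j; i, j = i+1, j-1 {
//			main[i], main[j] = main[j], main[i]
//		}
//	}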

TEXT ·bitFromEveryByteSSE2Asm(SB),4,$0-32
	// bitFromEveryByteSSE2Asm grabs a single bit from every src[] byte,
	// and packs them into dst[].
	// The implementation is based on the _mm_movemask_epi8() instruction,
	// which grabs the *high* bit from each byte, so this function takes a
	// 'lshift' argument instead of the wrapper's bitIdx.

	// Register allocation:
	//   AX: pointer to start of dst
	//   BX: pointer to start of src
	//   CX: nDstByte (must be even), minus 2 to support 2x unroll
	//       (rule of thumb: if the loop is less than ~10 operations,
	//       unrolling is likely to make a noticeable difference with
	//       minimal effort; otherwise don't bother)
	//   DX: loop counter
	//   SI, DI: intermediate movemask results
	//
	//   X0: lshift
	MOVQ dst+0(FP), AX
	MOVQ src+8(FP), BX
	MOVQ lshift+16(FP), X0

	MOVQ nDstByte+24(FP), CX
	SUBQ $2, CX
	// Compilers emit this instead of XORQ DX,DX since it's smaller and
	// has the same effect.
	XORL DX, DX

	CMPQ CX, DX
	JLE bitFromEveryByteSSE2AsmOdd

bitFromEveryByteSSE2AsmLoop:
	MOVOU (BX)(DX*8), X1
	MOVOU 16(BX)(DX*8), X2
	PSLLQ X0, X1
	PSLLQ X0, X2
	PMOVMSKB X1, SI
	PMOVMSKB X2, DI
	MOVW SI, (AX)(DX*1)
	MOVW DI, 2(AX)(DX*1)
	ADDQ $4, DX
	CMPQ CX, DX
	JG bitFromEveryByteSSE2AsmLoop

	// Flags from the CMPQ above are still set here: CX == DX means one
	// more dst word remains, while CX < DX means we're already done.
	JL bitFromEveryByteSSE2AsmFinish

	// Move this label up one line if we ever need to accept nDstByte == 0.
bitFromEveryByteSSE2AsmOdd:
	MOVOU (BX)(DX*8), X1
	PSLLQ X0, X1
	PMOVMSKB X1, SI
	MOVW SI, (AX)(DX*1)

bitFromEveryByteSSE2AsmFinish:
	RET
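
// A scalar Go sketch of bitFromEveryByteSSE2Asm, phrased in terms of the
// lshift argument this routine receives (the name and signature are
// illustrative). For lshift values 0 through 7, shifting each byte left by
// lshift and keeping its high bit, as PSLLQ + PMOVMSKB do above, extracts
// bit (7 - lshift) of each source byte:
//
//	func bitFromEveryByteScalar(dst, src []byte, lshift uint) {
//		for i := range dst {
//			var cur byte
//			for j := 0; j < 8; j++ {
//				cur |= ((src[8*i+j] >> (7 - lshift)) & 1) << uint(j)
//			}
//			dst[i] = cur
//		}
//	}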