github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/multibyte_amd64.s

// Copyright 2018 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache-2.0
// license that can be found in the LICENSE file.

// +build amd64,!appengine

// Reverse16<> is a PSHUFB mask which reverses the order of the eight
// uint16 lanes in a 128-bit vector; byte order within each uint16 is
// preserved.
DATA ·Reverse16<>+0x00(SB)/8, $0x09080b0a0d0c0f0e
DATA ·Reverse16<>+0x08(SB)/8, $0x0100030205040706
GLOBL ·Reverse16<>(SB), 24, $16
// NOPTR = 16, RODATA = 8

TEXT ·index16SSE2Asm(SB),4,$0-32
	// index16SSE2Asm scans main[], searching for the first instance of
	// val. If no instances are found, it returns -1.
	// It requires nElem >= 8.
	// The implementation is based on a loop which uses _mm_cmpeq_epi16()
	// to scan 8 uint16s in parallel, and _mm_movemask_epi8() to extract
	// the result of that scan. It is similar to firstLeq8 in cmp_amd64.s.

	// There's a ~10% benefit from 2x-unrolling the main loop so that only
	// one test is performed per loop iteration (i.e. just look at the
	// bitwise-or of the comparison results, and backtrack a bit on a hit).
	// I'll leave that on the table for now to keep the logic simpler.

	// Register allocation:
	//   AX: pointer to start of main[]
	//   BX: nElem - 8
	//   CX: current index
	//   X0: vector with 8 copies of val
	MOVQ	main+0(FP), AX

	// clang compiles _mm_set1_epi16() to this, I'll trust it.
	MOVQ	val+8(FP), X0
	PSHUFLW	$0xe0, X0, X0
	PSHUFD	$0, X0, X0

	MOVQ	nElem+16(FP), BX
	SUBQ	$8, BX
	XORL	CX, CX

index16SSE2AsmLoop:
	// Scan 8 elements starting from &(main[CX]).
	MOVOU	(AX)(CX*2), X1
	PCMPEQW	X0, X1
	PMOVMSKB	X1, DX
	// Bits 2k and 2k+1 are now set in DX iff the uint16 at position k
	// compared equal.
	TESTQ	DX, DX
	JNE	index16SSE2AsmFound
	ADDQ	$8, CX
	CMPQ	BX, CX
	JG	index16SSE2AsmLoop

	// Scan the last 8 elements; this may partially overlap with the
	// previous scan.
	MOVQ	BX, CX
	MOVOU	(AX)(CX*2), X1
	PCMPEQW	X0, X1
	PMOVMSKB	X1, DX
	TESTQ	DX, DX
	JNE	index16SSE2AsmFound
	// No match found, return -1.
	MOVQ	$-1, ret+24(FP)
	RET

index16SSE2AsmFound:
	BSFQ	DX, AX
	// AX now has the index of the lowest set bit in DX, i.e. the byte
	// offset of the first match; halve it to get the element offset
	// within this group of 8.
	SHRQ	$1, AX
	ADDQ	CX, AX
	MOVQ	AX, ret+24(FP)
	RET
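// For reference, here's a scalar Go sketch of the behavior index16SSE2Asm
// above is intended to match; the function name is illustrative, not an
// identifier from this package:
//
//	func index16Ref(main []uint16, val uint16) int {
//		for i, v := range main {
//			if v == val {
//				return i
//			}
//		}
//		return -1
//	}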
TEXT ·reverse16InplaceSSSE3Asm(SB),4,$0-16
	// This is only called with nElem > 8. So we can safely divide this
	// into two cases:
	// 1. (nElem+7) % 16 in {0..7}. Execute (nElem+7)/16 normal iterations
	//    and exit. Last two writes usually overlap.
	// 2. (nElem+7) % 16 in {8..15}. Execute (nElem-9)/16 normal
	//    iterations. Then we have between 17 and 24 central elements
	//    left; handle them by processing *three* vectors at once at the
	//    end.
	// Logic is essentially identical to reverseComp4InplaceSSSE3Asm,
	// except we don't need to complement here.
	MOVQ	main+0(FP), SI
	MOVQ	nElem+8(FP), AX

	// DI iterates backwards from the end of main[].
	LEAQ	-16(SI)(AX*2), DI

	MOVOU	·Reverse16<>(SB), X0
	SUBQ	$1, AX
	MOVQ	AX, BX
	ANDQ	$8, BX
	// BX is now 8 when we don't need to process 3 vectors at the end, and
	// 0 when we do. (When BX == 0, the TESTQ below falls through into the
	// three-vector block.)
	LEAQ	0(AX)(BX*2), R9
	// R9 is now nElem+15 when we don't need to process 3 vectors at the
	// end, and nElem-1 when we do.
	LEAQ	-24(SI)(R9*1), AX
	// AX can now be used for the loop termination check:
	//   if nElem == 9, R9 == 24, so AX == uintptr(main) + 0.
	//   if nElem == 16, R9 == 31, so AX == uintptr(main) + 7.
	//   if nElem == 17, R9 == 16, so AX == uintptr(main) - 8.
	//   if nElem == 24, R9 == 23, so AX == uintptr(main) - 1.
	CMPQ	AX, SI
	JL	reverse16InplaceSSSE3LastThree

reverse16InplaceSSSE3Loop:
	// Reverse one vector from each end and swap them.
	MOVOU	(SI), X1
	MOVOU	(DI), X2
	PSHUFB	X0, X1
	PSHUFB	X0, X2
	MOVOU	X2, (SI)
	MOVOU	X1, (DI)
	ADDQ	$16, SI
	SUBQ	$16, DI
	CMPQ	AX, SI
	JGE	reverse16InplaceSSSE3Loop

	TESTQ	BX, BX
	JNE	reverse16InplaceSSSE3Ret
reverse16InplaceSSSE3LastThree:
	// Reverse the remaining 17..24 central elements as three (possibly
	// overlapping) vectors.
	MOVOU	(SI), X1
	MOVOU	16(SI), X2
	MOVOU	(DI), X3
	PSHUFB	X0, X1
	PSHUFB	X0, X2
	PSHUFB	X0, X3
	MOVOU	X3, (SI)
	MOVOU	X2, -16(DI)
	MOVOU	X1, (DI)

reverse16InplaceSSSE3Ret:
	RET

TEXT ·reverse16SSSE3Asm(SB),4,$0-24
	// reverse16SSSE3Asm writes the reverse of src[] to dst[] at uint16
	// granularity. It requires nElem >= 8; dst[] and src[] are assumed
	// not to overlap.
	MOVQ	dst+0(FP), DI
	MOVQ	src+8(FP), SI
	MOVQ	nElem+16(FP), AX

	// R8 iterates backwards from the end of src[].
	LEAQ	-16(SI)(AX*2), R8
	MOVOU	·Reverse16<>(SB), X0
	// Save final dst[] write position for later.
	LEAQ	-16(DI)(AX*2), R9

reverse16SSSE3Loop:
	MOVOU	(R8), X1
	PSHUFB	X0, X1
	MOVOU	X1, (DI)
	SUBQ	$16, R8
	ADDQ	$16, DI
	CMPQ	SI, R8
	JL	reverse16SSSE3Loop

	// Handle the first src vector last; this write may partially overlap
	// the previous one.
	MOVOU	(SI), X1
	PSHUFB	X0, X1
	MOVOU	X1, (R9)
	RET
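// For reference, here's a scalar Go sketch of what the two reversal kernels
// above compute; the names are illustrative, not identifiers from this
// package:
//
//	// Matches reverse16InplaceSSSE3Asm.
//	func reverse16InplaceRef(main []uint16) {
//		for i, j := 0, len(main)-1; i < j; i, j = i+1, j-1 {
//			main[i], main[j] = main[j], main[i]
//		}
//	}
//
//	// Matches reverse16SSSE3Asm; dst and src must not overlap.
//	func reverse16Ref(dst, src []uint16) {
//		for i, v := range src {
//			dst[len(src)-1-i] = v
//		}
//	}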