//go:build !purego

#include "textflag.h"

#define salt0 0x47b6137b
#define salt1 0x44974d91
#define salt2 0x8824ad5b
#define salt3 0xa2b7289d
#define salt4 0x705495c7
#define salt5 0x2df1424b
#define salt6 0x9efc4947
#define salt7 0x5c6bfb31

// See block_amd64.s for a description of this algorithm.
#define generateMask(src, dst) \
    VMOVDQA ones(SB), dst \
    VPMULLD salt(SB), src, src \
    VPSRLD $27, src, src \
    VPSLLVD src, dst, dst

// applyMask ORs the precomputed mask in src into the filter block at dst.
#define applyMask(src, dst) \
    VPOR dst, src, src \
    VMOVDQU src, dst

// fasthash1x64 maps a 64-bit hash to the byte offset of a filter block:
// the block index is ((value >> 32) * scale) >> 32, multiplied by the
// 32-byte block size.
#define fasthash1x64(scale, value) \
    SHRQ $32, value \
    IMULQ scale, value \
    SHRQ $32, value \
    SHLQ $5, value

// fasthash4x64 is the 4-lane vector version of fasthash1x64.
#define fasthash4x64(scale, value) \
    VPSRLQ $32, value, value \
    VPMULUDQ scale, value, value \
    VPSRLQ $32, value, value \
    VPSLLQ $5, value, value

#define extract4x64(srcYMM, srcXMM, tmpXMM, r0, r1, r2, r3) \
    VEXTRACTI128 $1, srcYMM, tmpXMM \
    MOVQ srcXMM, r0 \
    VPEXTRQ $1, srcXMM, r1 \
    MOVQ tmpXMM, r2 \
    VPEXTRQ $1, tmpXMM, r3

// insert sets one bit of the 32-bit word at dst; the bit index is the top
// 5 bits of src*salt.
#define insert(salt, src, dst) \
    MOVL src, CX \
    IMULL salt, CX \
    SHRL $27, CX \
    MOVL $1, DX \
    SHLL CX, DX \
    ORL DX, dst

// check tests the bit that insert would have set in b, jumping to notfound
// when it is clear.
#define check(salt, b, x) \
    MOVL b, CX \
    MOVL x, DX \
    IMULL salt, DX \
    SHRL $27, DX \
    BTL DX, CX \
    JAE notfound

// func filterInsertBulk(f []Block, x []uint64)
TEXT ·filterInsertBulk(SB), NOSPLIT, $0-48
    MOVQ f_base+0(FP), AX
    MOVQ f_len+8(FP), CX
    MOVQ x_base+24(FP), BX
    MOVQ x_len+32(FP), DX
    CMPB ·hasAVX2(SB), $0
    JE fallback
avx2:
    VPBROADCASTQ f_len+8(FP), Y0
    // Loop initialization: SI holds the current index in `x`; DI is the number
    // of elements in `x` rounded down to the nearest multiple of 4.
    XORQ SI, SI
    MOVQ DX, DI
    SHRQ $2, DI
    SHLQ $2, DI
avx2loop4x64:
    CMPQ SI, DI
    JAE avx2loop1x64

    // The masks and indexes for 4 input hashes are computed in each loop
    // iteration. The hashes are loaded in Y1 so we can use vector instructions
    // to compute all 4 indexes in parallel. The lower 32 bits of the hashes are
    // also broadcast to 4 YMM registers to compute the 4 masks that will then
    // be applied to the filter.
    VMOVDQU (BX)(SI*8), Y1
    VPBROADCASTD 0(BX)(SI*8), Y2
    VPBROADCASTD 8(BX)(SI*8), Y3
    VPBROADCASTD 16(BX)(SI*8), Y4
    VPBROADCASTD 24(BX)(SI*8), Y5

    fasthash4x64(Y0, Y1)
    generateMask(Y2, Y6)
    generateMask(Y3, Y7)
    generateMask(Y4, Y8)
    generateMask(Y5, Y9)

    // The next block of instructions moves the indexes from the vector to
    // general purpose registers so they can be used as offsets when applying
    // the masks to the filter.
    extract4x64(Y1, X1, X10, R8, R9, R10, R11)

    // Apply the masks to the filter; this operation is sensitive to aliasing:
    // when blocks overlap, the CPU has to serialize the reads and writes,
    // which has a measurable impact on throughput. This would be frequent for
    // small bloom filters, which may have only a few blocks; the probability
    // of seeing overlapping blocks on large filters should be small enough to
    // make this a non-issue though.
    applyMask(Y6, (AX)(R8*1))
    applyMask(Y7, (AX)(R9*1))
    applyMask(Y8, (AX)(R10*1))
    applyMask(Y9, (AX)(R11*1))

    ADDQ $4, SI
    JMP avx2loop4x64
avx2loop1x64:
    // Compute trailing elements in `x` if the length was not a multiple of 4.
    // This is the same algorithm as in the avx2loop4x64 section, working on a
    // single mask/block pair at a time.
    CMPQ SI, DX
    JE avx2done
    MOVQ (BX)(SI*8), R8
    VPBROADCASTD (BX)(SI*8), Y0
    fasthash1x64(CX, R8)
    generateMask(Y0, Y1)
    applyMask(Y1, (AX)(R8*1))
    INCQ SI
    JMP avx2loop1x64
avx2done:
    VZEROUPPER
    JMP done
fallback:
    XORQ SI, SI
    MOVQ DX, DI
    MOVQ CX, R10
loop:
    CMPQ SI, DI
    JE done
    MOVLQZX (BX)(SI*8), R8
    MOVQ (BX)(SI*8), R9
    fasthash1x64(R10, R9)
    insert($salt0, R8, 0(AX)(R9*1))
    insert($salt1, R8, 4(AX)(R9*1))
    insert($salt2, R8, 8(AX)(R9*1))
    insert($salt3, R8, 12(AX)(R9*1))
    insert($salt4, R8, 16(AX)(R9*1))
    insert($salt5, R8, 20(AX)(R9*1))
    insert($salt6, R8, 24(AX)(R9*1))
    insert($salt7, R8, 28(AX)(R9*1))
    INCQ SI
    JMP loop
done:
    RET

// func filterInsert(f []Block, x uint64)
TEXT ·filterInsert(SB), NOSPLIT, $0-32
    MOVQ f_base+0(FP), AX
    MOVQ f_len+8(FP), BX
    MOVQ x+24(FP), CX
    fasthash1x64(BX, CX)
    CMPB ·hasAVX2(SB), $0
    JE fallback
avx2:
    VPBROADCASTD x+24(FP), Y1
    generateMask(Y1, Y0)
    applyMask(Y0, (AX)(CX*1))
    VZEROUPPER
    RET
fallback:
    ADDQ CX, AX
    MOVL x+24(FP), BX
    insert($salt0, BX, 0(AX))
    insert($salt1, BX, 4(AX))
    insert($salt2, BX, 8(AX))
    insert($salt3, BX, 12(AX))
    insert($salt4, BX, 16(AX))
    insert($salt5, BX, 20(AX))
    insert($salt6, BX, 24(AX))
    insert($salt7, BX, 28(AX))
    RET

// func filterCheck(f []Block, x uint64) bool
TEXT ·filterCheck(SB), NOSPLIT, $0-33
    MOVQ f_base+0(FP), AX
    MOVQ f_len+8(FP), BX
    MOVQ x+24(FP), CX
    fasthash1x64(BX, CX)
    CMPB ·hasAVX2(SB), $0
    JE fallback
avx2:
    VPBROADCASTD x+24(FP), Y1
    generateMask(Y1, Y0)
    // The block matches if every bit of the mask is also set in the block;
    // VPTEST sets CF when Y0 AND NOT Y1 is all zeroes.
    VPAND (AX)(CX*1), Y0, Y1
    VPTEST Y0, Y1
    SETCS ret+32(FP)
    VZEROUPPER
    RET
fallback:
    ADDQ CX, AX
    MOVL x+24(FP), BX
    check($salt0, 0(AX), BX)
    check($salt1, 4(AX), BX)
    check($salt2, 8(AX), BX)
    check($salt3, 12(AX), BX)
    check($salt4, 16(AX), BX)
    check($salt5, 20(AX), BX)
    check($salt6, 24(AX), BX)
    check($salt7, 28(AX), BX)
    MOVB $1, CX
    JMP done
notfound:
    XORB CX, CX
done:
    MOVB CX, ret+32(FP)
    RET
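// As a readability aid, the commented-out Go sketch below shows the scalar
// split-block algorithm that the routines above implement. The Block layout
// and the function names are assumptions made for illustration (the real
// definitions live in the Go sources of this package); the sketch mirrors
// fasthash1x64, insert, and check, and is not assembled or compiled.
//
//	type Block [8]uint32 // assumed layout, matching the 32-byte stride above
//
//	var salts = [8]uint32{
//		0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d,
//		0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31,
//	}
//
//	func filterInsertSketch(f []Block, x uint64) {
//		// fasthash1x64, minus the *32 scaling used to form byte offsets.
//		b := &f[((x >> 32) * uint64(len(f))) >> 32]
//		for i, salt := range salts {
//			// insert: the top 5 bits of the salted product of the
//			// lower 32 bits of x select one bit per word.
//			b[i] |= 1 << ((uint32(x) * salt) >> 27)
//		}
//	}
//
//	func filterCheckSketch(f []Block, x uint64) bool {
//		b := &f[((x >> 32) * uint64(len(f))) >> 32]
//		for i, salt := range salts {
//			if b[i]&(1<<((uint32(x)*salt)>>27)) == 0 {
//				return false // check: a clear bit means x was never inserted
//			}
//		}
//		return true
//	}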