// Origin: github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/bloom/block_amd64.s

//go:build !purego

#include "textflag.h"

// Multipliers used to derive one bit position per 32-bit word of a bloom
// filter block; salt<i> is the multiplier applied for word i.
#define salt0 0x47b6137b
#define salt1 0x44974d91
#define salt2 0x8824ad5b
#define salt3 0xa2b7289d
#define salt4 0x705495c7
#define salt5 0x2df1424b
#define salt6 0x9efc4947
#define salt7 0x5c6bfb31

// ones holds eight 32-bit words, each equal to 1. generateMask loads it as the
// starting value that is then shifted left independently in every lane.
DATA ones+0(SB)/4, $1
DATA ones+4(SB)/4, $1
DATA ones+8(SB)/4, $1
DATA ones+12(SB)/4, $1
DATA ones+16(SB)/4, $1
DATA ones+20(SB)/4, $1
DATA ones+24(SB)/4, $1
DATA ones+28(SB)/4, $1
GLOBL ones(SB), RODATA|NOPTR, $32

// salt holds the eight multipliers as consecutive 32-bit words so that
// generateMask can use them as a single 32-byte memory operand.
DATA salt+0(SB)/4, $salt0
DATA salt+4(SB)/4, $salt1
DATA salt+8(SB)/4, $salt2
DATA salt+12(SB)/4, $salt3
DATA salt+16(SB)/4, $salt4
DATA salt+20(SB)/4, $salt5
DATA salt+24(SB)/4, $salt6
DATA salt+28(SB)/4, $salt7
GLOBL salt(SB), RODATA|NOPTR, $32

// This initial block is a SIMD implementation of the mask function declared in
// block_default.go and block_optimized.go. For each of the eight 32-bit words
// of the bloom filter block, the operation performed is:
//
//	block[i] = 1 << ((x * salt[i]) >> 27)
//
// Arguments
// ---------
//
// * src is a memory location where the value to use when computing the mask is
//   located. The memory location is not modified.
//
// * tmp is a YMM register used as scratch space to hold intermediary results in
//   the algorithm.
//
// * dst is a YMM register where the final mask is written.
//
// Steps: dst is loaded with a 1 in every 32-bit lane, the 32-bit value at src
// is broadcast to every lane of tmp, each lane of tmp is multiplied by its
// salt, the product is reduced to its top 5 bits (>> 27), and each lane of dst
// is shifted left by the matching lane of tmp.
#define generateMask(src, tmp, dst) \
	VMOVDQA ones(SB), dst \
	VPBROADCASTD src, tmp \
	VPMULLD salt(SB), tmp, tmp \
	VPSRLD $27, tmp, tmp \
	VPSLLVD tmp, dst, dst

// insert is the scalar equivalent of generateMask for one 32-bit word of the
// block: it computes 1 << ((src * salt) >> 27) and ORs the result into dst.
//
// * salt is the multiplier for the word being updated.
//
// * src holds the value being inserted; it is only read.
//
// * dst is the memory location of the block word to update.
//
// CX and DX are overwritten (CX carries the shift count, DX the single-bit
// mask).
#define insert(salt, src, dst) \
	MOVL src, CX \
	IMULL salt, CX \
	SHRL $27, CX \
	MOVL $1, DX \
	SHLL CX, DX \
	ORL DX, dst

// check tests one 32-bit word of the block: it jumps to the `notfound` label of
// the enclosing function unless bit ((x * salt) >> 27) is set in b.
//
// * salt is the multiplier for the word being tested.
//
// * b is the memory location of the block word to test.
//
// * x holds the value being looked up; it is only read.
//
// CX and DX are overwritten. BTL copies the selected bit into CF, and JAE
// branches when CF == 0, i.e. when the bit is clear.
#define check(salt, b, x) \
	MOVL b, CX \
	MOVL x, DX \
	IMULL salt, DX \
	SHRL $27, DX \
	BTL DX, CX \
	JAE notfound

// func blockInsert(b *Block, x uint32)
//
// blockInsert sets, in each of the eight 32-bit words of the block at b, the
// bit selected by x and that word's salt. The AVX2 path runs when the byte at
// ·hasAVX2 is non-zero; otherwise the scalar fallback updates the words one at
// a time.
TEXT ·blockInsert(SB), NOSPLIT, $0-16
	MOVQ b+0(FP), AX
	// A zero hasAVX2 flag selects the scalar implementation.
	CMPB ·hasAVX2(SB), $0
	JE fallback
avx2:
	generateMask(x+8(FP), Y1, Y0)
	// Set all 1 bits of the mask in the bloom filter block.
	VPOR (AX), Y0, Y0
	VMOVDQU Y0, (AX)
	// Clear the upper 128 bits of all YMM registers before returning.
	VZEROUPPER
	RET
fallback:
	// BX keeps x across the eight macro expansions, which clobber CX and DX.
	MOVL x+8(FP), BX
	insert($salt0, BX, 0(AX))
	insert($salt1, BX, 4(AX))
	insert($salt2, BX, 8(AX))
	insert($salt3, BX, 12(AX))
	insert($salt4, BX, 16(AX))
	insert($salt5, BX, 20(AX))
	insert($salt6, BX, 24(AX))
	insert($salt7, BX, 28(AX))
	RET

// func blockCheck(b *Block, x uint32) bool
//
// blockCheck reports whether every bit that blockInsert would set for x is
// already set in the block at b. The AVX2 path runs when the byte at ·hasAVX2
// is non-zero; otherwise the scalar fallback tests the words one at a time.
TEXT ·blockCheck(SB), NOSPLIT, $0-17
	MOVQ b+0(FP), AX
	// A zero hasAVX2 flag selects the scalar implementation.
	CMPB ·hasAVX2(SB), $0
	JE fallback
avx2:
	generateMask(x+8(FP), Y1, Y0)
	// Compare the 1 bits of the mask with the bloom filter block, then compare
	// the result with the mask, expecting equality if the value `x` was present
	// in the block.
	VPAND (AX), Y0, Y1 // Y1 = block & mask
	VPTEST Y0, Y1      // CF = 1 if (Y0 & ^Y1) == 0, i.e. no mask bit is missing from Y1
	SETCS ret+16(FP)   // return CF == 1
	// Clear the upper 128 bits of all YMM registers before returning.
	VZEROUPPER
	RET
fallback:
	// BX keeps x across the eight macro expansions, which clobber CX and DX.
	// Each check jumps to notfound as soon as one expected bit is clear.
	MOVL x+8(FP), BX
	check($salt0, 0(AX), BX)
	check($salt1, 4(AX), BX)
	check($salt2, 8(AX), BX)
	check($salt3, 12(AX), BX)
	check($salt4, 16(AX), BX)
	check($salt5, 20(AX), BX)
	check($salt6, 24(AX), BX)
	check($salt7, 28(AX), BX)
	// All eight bits were set.
	MOVB $1, CX
	JMP done
notfound:
	XORB CX, CX
done:
	MOVB CX, ret+16(FP)
	RET