//go:build !purego

#include "textflag.h"

// This version of the probing algorithm for 32 bit keys takes advantage of
// the memory layout of table groups and SIMD instructions to accelerate the
// probing operations.
//
// The first 32 bytes of a table group contain the array of keys, which fits
// into a single vector register (YMM) and can be loaded and tested with a
// single instruction. For 32 bit keys that load also picks up the first
// value of the group; the corresponding lane is discarded when the result is
// combined with the bit mask. The bit mask indicating which slots are in use
// is a separate 32 bit field of the group (at byte offset 56 for 32 bit keys
// and 48 for 64 bit keys).
//
// A first version of the table group used the number of keys held in the
// group instead of a bit mask, which required the probing operation to
// reconstruct the bit mask during the lookup operation in order to identify
// which elements of the VPCMPEQD result should be retained. The extra CPU
// instructions used to reconstruct the bit mask had a measurable overhead.
// By holding the bit mask in the data structure, we can determine the number
// of keys in a group using the POPCNT instruction, and avoid recomputing the
// mask during lookups.
//
// func multiProbe32AVX2(table []table32Group, numKeys int, hashes []uintptr, keys sparse.Uint32Array, values []int32) int
//
// For each i in [0, len(hashes)), looks up the i-th 32 bit key in the table,
// starting at group (hashes[i] & (len(table)-1)). If the key is found, its
// stored value is written to values[i]. Otherwise the key is inserted with
// the current key count as its value, that value is written to values[i], and
// the key count is incremented. Returns the updated key count.
//
// Group layout as accessed by this routine (64 bytes per group):
//   offset  0: keys, 4 bytes each
//   offset 28: values, 4 bytes each
//   offset 56: 32 bit mask of the slots in use
//
// Register usage:
//   AX  = table base            BX  = len(table) - 1
//   CX  = key count             DX  = hashes base
//   DI  = len(hashes)           SI  = loop index i
//   R8  = address of keys[i]    R15 = byte offset between consecutive keys
//   R9  = values base           R10 = hash, incremented on each re-probe
//   R12 = address of the group being probed
//
// NOTE(review): the AND with len(table)-1 presumably requires len(table) to
// be a power of two; keys_array_len and values_len are never read, so the
// caller is assumed to provide at least len(hashes) keys and values. There is
// no termination check if every group is full. Confirm against the Go caller.
TEXT ·multiProbe32AVX2(SB), NOSPLIT, $0-112
    MOVQ table_base+0(FP), AX
    MOVQ table_len+8(FP), BX
    MOVQ numKeys+24(FP), CX
    MOVQ hashes_base+32(FP), DX
    MOVQ hashes_len+40(FP), DI
    MOVQ keys_array_ptr+56(FP), R8
    MOVQ keys_array_off+72(FP), R15
    MOVQ values_base+80(FP), R9
    DECQ BX // modulo = len(table) - 1

    XORQ SI, SI // i = 0
    JMP test
loop:
    MOVQ (DX)(SI*8), R10 // hash
    VPBROADCASTD (R8), Y0 // [key] replicated in all eight 32 bit lanes
probe:
    MOVQ R10, R11
    ANDQ BX, R11 // hash & modulo
    SHLQ $6, R11 // x 64 (size of table32Group)
    LEAQ (AX)(R11*1), R12 // R12 = &table[hash & modulo]

    // Compare the first 32 bytes of the group against the key, one bit of
    // R11 per 32 bit lane, then keep only the lanes flagged as in use.
    VMOVDQU (R12), Y1
    VPCMPEQD Y0, Y1, Y2
    VMOVMSKPS Y2, R11
    MOVL 56(R12), R13 // in-use bit mask of the group
    TESTL R11, R13
    JZ insert // no in-use slot holds the key

    // NOTE(review): R11 is not masked with the in-use bits before TZCNT; this
    // appears to rely on slots being filled from index 0 upward (see insert),
    // so that the lowest matching lane is always an in-use slot.
    TZCNTL R11, R13 // index of the lowest matching lane
    MOVL 28(R12)(R13*4), R14 // value stored for that slot
next:
    MOVL R14, (R9)(SI*4) // values[i] = value
    INCQ SI
    ADDQ R15, R8 // advance to the next key
test:
    CMPQ SI, DI
    JNE loop
    MOVQ CX, ret+104(FP) // return the updated key count
    VZEROUPPER
    RET
insert:
    // All seven slots in use: continue the search in the following group.
    CMPL R13, $0b1111111
    JE probeNextGroup

    MOVL R13, R11
    POPCNTL R13, R13 // number of slots in use = index of the first free slot
    MOVQ X0, R14 // key
    SHLL $1, R11
    ORL $1, R11
    MOVL R11, 56(R12) // group.len = (group.len << 1) | 1
    MOVL R14, (R12)(R13*4) // group.keys[i] = key
    MOVL CX, 28(R12)(R13*4) // group.values[i] = value
    MOVL CX, R14 // the new key's value is the key count before the increment
    INCL CX
    JMP next
probeNextGroup:
    INCQ R10 // next group; wraps around through the AND with modulo
    JMP probe

// func multiProbe64AVX2(table []table64Group, numKeys int, hashes []uintptr, keys sparse.Uint64Array, values []int32) int
//
// 64 bit key variant of the probing routine: for each i in [0, len(hashes)),
// looks up the i-th key starting at group (hashes[i] & (len(table)-1)),
// inserting it with the current key count as its value when absent, and
// writes the key's value to values[i]. Returns the updated key count.
//
// Group layout as accessed by this routine (64 bytes per group):
//   offset  0: keys, 8 bytes each (four keys fill the 32 byte vector load)
//   offset 32: values, 4 bytes each
//   offset 48: 32 bit mask of the slots in use
//
// Register usage:
//   AX  = table base            BX  = len(table) - 1
//   CX  = key count             DX  = hashes base
//   DI  = len(hashes)           SI  = loop index i
//   R8  = address of keys[i]    R15 = byte offset between consecutive keys
//   R9  = values base           R10 = hash, incremented on each re-probe
//   R12 = address of the group being probed
//
// NOTE(review): presumably len(table) must be a power of two and the caller
// provides at least len(hashes) keys and values; neither is checked here.
TEXT ·multiProbe64AVX2(SB), NOSPLIT, $0-112
    MOVQ table_base+0(FP), AX
    MOVQ table_len+8(FP), BX
    MOVQ numKeys+24(FP), CX
    MOVQ hashes_base+32(FP), DX
    MOVQ hashes_len+40(FP), DI
    MOVQ keys_array_ptr+56(FP), R8
    MOVQ keys_array_off+72(FP), R15
    MOVQ values_base+80(FP), R9
    DECQ BX // modulo = len(table) - 1

    XORQ SI, SI // i = 0
    JMP test
loop:
    MOVQ (DX)(SI*8), R10 // hash
    VPBROADCASTQ (R8), Y0 // [key] replicated in all four 64 bit lanes
probe:
    MOVQ R10, R11
    ANDQ BX, R11 // hash & modulo
    SHLQ $6, R11 // x 64 (size of table64Group)
    LEAQ (AX)(R11*1), R12 // R12 = &table[hash & modulo]

    // Compare the four keys of the group against the key, one bit of R11 per
    // 64 bit lane, then keep only the lanes flagged as in use.
    VMOVDQU (R12), Y1
    VPCMPEQQ Y0, Y1, Y2
    VMOVMSKPD Y2, R11
    MOVL 48(R12), R13 // in-use bit mask of the group
    TESTL R11, R13
    JZ insert // no in-use slot holds the key

    // NOTE(review): as in the 32 bit routine, TZCNT runs on the unmasked
    // comparison result; this appears to rely on slots being filled from
    // index 0 upward.
    TZCNTL R11, R13 // index of the lowest matching lane
    MOVL 32(R12)(R13*4), R14 // value stored for that slot
next:
    MOVL R14, (R9)(SI*4) // values[i] = value
    INCQ SI
    ADDQ R15, R8 // advance to the next key
test:
    CMPQ SI, DI
    JNE loop
    MOVQ CX, ret+104(FP) // return the updated key count
    VZEROUPPER
    RET
insert:
    // All four slots in use: continue the search in the following group.
    CMPL R13, $0b1111
    JE probeNextGroup

    MOVL R13, R11
    POPCNTL R13, R13 // number of slots in use = index of the first free slot
    SHLL $1, R11
    ORL $1, R11
    MOVL R11, 48(R12) // group.len = (group.len << 1) | 1
    MOVQ X0, (R12)(R13*8) // group.keys[i] = key
    MOVL CX, 32(R12)(R13*4) // group.values[i] = value
    MOVL CX, R14 // the new key's value is the key count before the increment
    INCL CX
    JMP next
probeNextGroup:
    INCQ R10 // next group; wraps around through the AND with modulo
    JMP probe

// func multiProbe128SSE2(table []byte, tableCap, tableLen int, hashes []uintptr, keys sparse.Uint128Array, values []int32) int
//
// 128 bit key variant of the probing routine. The table is addressed as
// tableCap 16 byte key slots starting at the table base, followed by 4 byte
// value slots starting at base + 16*tableCap. A value slot of zero marks an
// empty slot; occupied slots hold the key's value plus one.
//
// For each i in [0, len(hashes)), probes slots starting at
// (hashes[i] & (tableCap-1)) and moving to the next slot on mismatch. If the
// key is found its value is written to values[i]; if an empty slot is reached
// the key is stored there with value tableLen, and tableLen is incremented.
// Returns the updated tableLen.
//
// Register usage:
//   AX  = table base (keys)     R10 = base of the value slots
//   BX  = tableCap - 1          CX  = tableLen
//   DX  = hashes base           DI  = len(hashes)
//   SI  = loop index i          R8  = address of keys[i]
//   R15 = byte offset between consecutive keys
//   R9  = values base           R11 = hash, incremented on each re-probe
//   R12 = slot index            R14 = stored (biased) value of the slot
//
// NOTE(review): presumably tableCap must be a power of two and the table must
// never be completely full, otherwise the probe loop does not terminate;
// neither is checked here.
TEXT ·multiProbe128SSE2(SB), NOSPLIT, $0-120
    MOVQ table_base+0(FP), AX
    MOVQ tableCap+24(FP), BX
    MOVQ tableLen+32(FP), CX
    MOVQ hashes_base+40(FP), DX
    MOVQ hashes_len+48(FP), DI
    MOVQ keys_array_ptr+64(FP), R8
    MOVQ keys_array_off+80(FP), R15
    MOVQ values_base+88(FP), R9

    // R10 = table base + 16*tableCap, the start of the value slots.
    MOVQ BX, R10
    SHLQ $4, R10
    LEAQ (AX)(R10*1), R10
    DECQ BX // modulo = tableCap - 1

    XORQ SI, SI // i = 0
    JMP test
loop:
    MOVQ (DX)(SI*8), R11 // hash
    MOVOU (R8), X0 // key
probe:
    MOVQ R11, R12
    ANDQ BX, R12 // slot index = hash & modulo

    MOVL (R10)(R12*4), R14 // biased value of the slot, zero if empty
    CMPL R14, $0
    JE insert

    // The slot is occupied: compare its 16 byte key with the probed key, as
    // four 32 bit lanes; all four mask bits set means the keys are equal.
    SHLQ $4, R12
    MOVOU (AX)(R12*1), X1
    PCMPEQL X0, X1
    MOVMSKPS X1, R13
    CMPL R13, $0b1111
    JE next

    INCQ R11 // different key: try the next slot
    JMP probe
next:
    DECL R14 // remove the +1 bias of the stored value
    MOVL R14, (R9)(SI*4) // values[i] = value
    INCQ SI
    ADDQ R15, R8 // advance to the next key
test:
    CMPQ SI, DI
    JNE loop
    MOVQ CX, ret+112(FP) // return the updated tableLen
    RET
insert:
    INCL CX
    MOVL CX, (R10)(R12*4) // store the new tableLen, i.e. the key's value + 1
    MOVL CX, R14 // decremented at next, giving tableLen before the increment
    SHLQ $4, R12
    MOVOU X0, (AX)(R12*1) // store the key in the slot
    JMP next