//go:build !purego

#include "textflag.h"

// func gatherBitsAVX2(dst []byte, src Uint8Array)
//
// Packs bit 0 of eight consecutive src elements into one dst byte (element i
// of a group lands in bit i), for len(src)/8 output bytes.
//
// Each VPGATHERDD lane loads a full 32-bit word starting at the element
// address, so up to three bytes past every addressed element are read; only
// bit 0 of each word is kept.
// NOTE(review): callers must make sure those extra bytes are readable.
//
// The per-lane byte offsets are computed with 32-bit arithmetic (the stride is
// truncated to its low 32 bits by the broadcast and VPGATHERDD indexes are
// signed dwords), so 7*stride has to fit in an int32.
TEXT ·gatherBitsAVX2(SB), NOSPLIT, $0-48
    MOVQ dst_base+0(FP), AX
    MOVQ src_array_ptr+24(FP), BX
    MOVQ src_array_len+32(FP), CX
    MOVQ src_array_off+40(FP), DX
    XORQ SI, SI
    SHRQ $3, CX // CX = number of output bytes (8 source elements each)

    // The loop tests its exit condition at the bottom; without this guard a
    // count of zero would still write to dst and then never match SI == CX.
    CMPQ CX, $0
    JE done

    // Y0 = stride * {0, 1, ..., 7}: byte offset of each lane within a group.
    VPBROADCASTD src_array_off+40(FP), Y0
    VPMULLD range0n7<>(SB), Y0, Y0
    // Y1 is the gather mask (all lanes enabled); Y2 keeps a pristine copy
    // because the gather instruction clears its mask operand.
    VPCMPEQD Y1, Y1, Y1
    VPCMPEQD Y2, Y2, Y2
loop:
    VPGATHERDD Y1, (BX)(Y0*1), Y3
    VMOVDQU Y2, Y1     // re-arm the mask for the next iteration
    VPSLLD $31, Y3, Y3 // move bit 0 of every lane into its sign bit
    VMOVMSKPS Y3, DI   // collect the 8 sign bits into the low byte of DI

    MOVB DI, (AX)(SI*1)

    LEAQ (BX)(DX*8), BX // advance the source by 8 elements
    INCQ SI
    CMPQ SI, CX
    JNE loop
    VZEROUPPER
done:
    RET

// func gatherBitsDefault(dst []byte, src Uint8Array)
//
// Scalar counterpart of gatherBitsAVX2: for each of the len(src)/8 output
// bytes, loads eight single bytes spaced by the array stride, keeps bit 0 of
// each and packs them so that element i of the group lands in bit i.
TEXT ·gatherBitsDefault(SB), NOSPLIT, $0-48
    MOVQ dst_base+0(FP), AX
    MOVQ src_array_ptr+24(FP), BX
    MOVQ src_array_len+32(FP), CX
    MOVQ src_array_off+40(FP), DX
    XORQ SI, SI
    SHRQ $3, CX // CX = number of output bytes (8 source elements each)

    // The loop tests its exit condition at the bottom; bail out early when
    // there is nothing to produce instead of running a first iteration.
    CMPQ CX, $0
    JE done
loop:
    // BX addresses elements 0 and 1 of the group, DI elements 2 and 3; both
    // pointers are then moved forward by 4 elements to reach elements 4..7.
    LEAQ (BX)(DX*2), DI
    MOVBQZX (BX), R8
    MOVBQZX (BX)(DX*1), R9
    MOVBQZX (DI), R10
    MOVBQZX (DI)(DX*1), R11
    LEAQ (BX)(DX*4), BX
    LEAQ (DI)(DX*4), DI
    MOVBQZX (BX), R12
    MOVBQZX (BX)(DX*1), R13
    MOVBQZX (DI), R14
    MOVBQZX (DI)(DX*1), R15
    LEAQ (BX)(DX*4), BX // BX now points at the first element of the next group

    // Keep only bit 0 of every element.
    ANDQ $1, R8
    ANDQ $1, R9
    ANDQ $1, R10
    ANDQ $1, R11
    ANDQ $1, R12
    ANDQ $1, R13
    ANDQ $1, R14
    ANDQ $1, R15

    // Shift element i into bit position i.
    SHLQ $1, R9
    SHLQ $2, R10
    SHLQ $3, R11
    SHLQ $4, R12
    SHLQ $5, R13
    SHLQ $6, R14
    SHLQ $7, R15

    // Merge the eight bits pairwise into R8.
    ORQ R9, R8
    ORQ R11, R10
    ORQ R13, R12
    ORQ R15, R14
    ORQ R10, R8
    ORQ R12, R8
    ORQ R14, R8

    MOVB R8, (AX)(SI*1)

    INCQ SI
    CMPQ SI, CX
    JNE loop
done:
    RET

// func gather32AVX2(dst []uint32, src Uint32Array)
//
// Copies len(dst) 32-bit values from the strided src array into dst, eight
// per iteration. Only len(dst) drives the loop; the length of src is not read.
//
// The loop exits only when the index equals len(dst) exactly, so len(dst)
// must be a multiple of 8. As in gatherBitsAVX2, lane offsets use 32-bit
// arithmetic, so 7*stride has to fit in an int32.
TEXT ·gather32AVX2(SB), NOSPLIT, $0-48
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), CX
    MOVQ src_array_ptr+24(FP), BX
    MOVQ src_array_off+40(FP), DX
    XORQ SI, SI

    // The loop tests its exit condition at the bottom; without this guard an
    // empty dst would still be written to and SI would never equal CX.
    CMPQ CX, $0
    JE done

    // Y0 = stride * {0, 1, ..., 7}: byte offset of each lane within a group.
    VPBROADCASTD src_array_off+40(FP), Y0
    VPMULLD range0n7<>(SB), Y0, Y0
    // Y1 is the gather mask; Y2 keeps an all-ones copy to restore it from.
    VPCMPEQD Y1, Y1, Y1
    VPCMPEQD Y2, Y2, Y2
loop:
    VPGATHERDD Y1, (BX)(Y0*1), Y3
    VMOVDQU Y3, (AX)(SI*4)
    VMOVDQU Y2, Y1 // the gather cleared Y1; re-arm it

    LEAQ (BX)(DX*8), BX // advance the source by 8 elements
    ADDQ $8, SI
    CMPQ SI, CX
    JNE loop
    VZEROUPPER
done:
    RET

// func gather64AVX2(dst []uint64, src Uint64Array)
//
// Copies len(dst) 64-bit values from the strided src array into dst, four per
// iteration. Only len(dst) drives the loop; the length of src is not read.
//
// The loop exits only when the index equals len(dst) exactly, so len(dst)
// must be a multiple of 4.
//
// The lane offsets are produced with VPMULLD, a 32-bit multiply, applied to
// 64-bit lanes: the low dword of each lane becomes the low 32 bits of
// stride_lo32*i and the high dword becomes stride_hi32*0 = 0. The offsets are
// therefore only correct while 3*stride fits in 32 bits.
TEXT ·gather64AVX2(SB), NOSPLIT, $0-48
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), CX
    MOVQ src_array_ptr+24(FP), BX
    MOVQ src_array_off+40(FP), DX
    XORQ SI, SI

    // The loop tests its exit condition at the bottom; without this guard an
    // empty dst would still be written to and SI would never equal CX.
    CMPQ CX, $0
    JE done

    // Y0 = stride * {0, 1, 2, 3} as 64-bit lane offsets (see note above).
    VPBROADCASTQ src_array_off+40(FP), Y0
    VPMULLD range0n3<>(SB), Y0, Y0
    // Y1 is the gather mask; Y2 keeps an all-ones copy to restore it from.
    VPCMPEQQ Y1, Y1, Y1
    VPCMPEQQ Y2, Y2, Y2
loop:
    VPGATHERQQ Y1, (BX)(Y0*1), Y3
    VMOVDQU Y3, (AX)(SI*8)
    VMOVDQU Y2, Y1 // the gather cleared Y1; re-arm it

    LEAQ (BX)(DX*4), BX // advance the source by 4 elements
    ADDQ $4, SI
    CMPQ SI, CX
    JNE loop
    VZEROUPPER
done:
    RET

// func gather128(dst [][16]byte, src Uint128Array) int
//
// Copies n = min(len(dst), len(src)) 16-byte values from the strided src
// array into dst and returns n. Values are copied two per iteration, with a
// single trailing copy when n is odd.
TEXT ·gather128(SB), NOSPLIT, $0-56
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), CX
    MOVQ src_array_ptr+24(FP), BX
    MOVQ src_array_len+32(FP), DI
    MOVQ src_array_off+40(FP), DX

    // CX = min(len(dst), len(src)).
    CMPQ DI, CX
    CMOVQLT DI, CX

    CMPQ CX, $0
    JE done

    // A single value cannot go through the two-at-a-time loop.
    CMPQ CX, $1
    JE tail

    // SI counts copied values; DI = CX rounded down to an even number, which
    // is at least 2 here so the bottom-tested loop is safe to enter.
    XORQ SI, SI
    MOVQ CX, DI
    SHRQ $1, DI
    SHLQ $1, DI
loop:
    MOVOU (BX), X0
    MOVOU (BX)(DX*1), X1

    MOVOU X0, (AX)
    MOVOU X1, 16(AX)

    LEAQ (BX)(DX*2), BX
    ADDQ $32, AX
    ADDQ $2, SI
    CMPQ SI, DI
    JNE loop

    // An even count is fully copied at this point.
    CMPQ SI, CX
    JE done
tail:
    MOVOU (BX), X0
    MOVOU X0, (AX)
done:
    MOVQ CX, ret+48(FP)
    RET

// Four 64-bit lane indexes {0, 1, 2, 3}, multiplied by the stride in
// gather64AVX2.
GLOBL range0n3<>(SB), RODATA|NOPTR, $32
DATA range0n3<>+0(SB)/8,  $0
DATA range0n3<>+8(SB)/8,  $1
DATA range0n3<>+16(SB)/8, $2
DATA range0n3<>+24(SB)/8, $3

// Eight 32-bit lane indexes {0, 1, ..., 7}, multiplied by the stride in
// gatherBitsAVX2 and gather32AVX2.
GLOBL range0n7<>(SB), RODATA|NOPTR, $32
DATA range0n7<>+0(SB)/4,  $0
DATA range0n7<>+4(SB)/4,  $1
DATA range0n7<>+8(SB)/4,  $2
DATA range0n7<>+12(SB)/4, $3
DATA range0n7<>+16(SB)/4, $4
DATA range0n7<>+20(SB)/4, $5
DATA range0n7<>+24(SB)/4, $6
DATA range0n7<>+28(SB)/4, $7