//go:build !purego

// Origin: github.com/parquet-go/parquet-go@v0.20.0/null_amd64.s
//
// Every routine in this file takes the same 32-byte argument frame:
//
//	bits+0(FP)            pointer to an array of 64-bit bitmap words
//	rows_array_ptr+8(FP)  address of the first element
//	rows_array_len+16(FP) number of elements
//	rows_array_off+24(FP) distance in bytes between consecutive elements
//
// For each element index i whose value is non-zero, bit (i % 64) of word
// (i / 64) of the bitmap is set. Bits are only ever ORed in; nothing here
// clears them, so presumably the caller zeroes the bitmap beforehand
// (NOTE(review): confirm against the Go caller).

#include "textflag.h"

// func nullIndex8(bits *uint64, rows sparse.Array)
//
// Scalar scan over 1-byte elements.
//
// Register use:
//	AX  bitmap base          BX  current element address
//	CX  one-bit mask         DX  stride in bytes
//	SI  element index        DI  element count
TEXT ·nullIndex8(SB), NOSPLIT, $0-32
	MOVQ bits+0(FP), AX
	MOVQ rows_array_ptr+8(FP), BX
	MOVQ rows_array_len+16(FP), DI
	MOVQ rows_array_off+24(FP), DX

	MOVQ $1, CX
	XORQ SI, SI

	CMPQ DI, $0
	JE done
loop1x1:
	MOVB (BX), R9
	CMPB R9, $0
	JE next1x1

	// Non-zero element: set its bit in word SI/64.
	MOVQ SI, R10
	SHRQ $6, R10
	ORQ CX, (AX)(R10*8)
next1x1:
	ADDQ DX, BX
	ROLQ $1, CX // rotating keeps the mask at bit SI%64; it wraps to bit 0 every 64 elements
	INCQ SI
	CMPQ SI, DI
	JNE loop1x1
done:
	RET

// func nullIndex32(bits *uint64, rows sparse.Array)
//
// Scan over 4-byte elements. When AVX2 is available and there are at least
// eight elements, groups of eight are fetched with a gather and tested at
// once; any remainder (or the whole input otherwise) goes through the scalar
// loop.
//
// Register use is the same as nullIndex8, plus:
//	R8  element count rounded down to a multiple of 8 (vector loop bound)
//	Y0  per-lane byte offsets (stride times ·range0n8)
//	Y1  gather mask          Y2  all-ones, used to reload Y1
//	Y3  zero                 Y4  gathered values / comparison result
TEXT ·nullIndex32(SB), NOSPLIT, $0-32
	MOVQ bits+0(FP), AX
	MOVQ rows_array_ptr+8(FP), BX
	MOVQ rows_array_len+16(FP), DI
	MOVQ rows_array_off+24(FP), DX

	MOVQ $1, CX
	XORQ SI, SI

	CMPQ DI, $0
	JE done

	CMPQ DI, $8
	JB loop1x4

	CMPB ·hasAVX2(SB), $0
	JE loop1x4

	// The gather uses signed 32-bit lane indices built from the low 32 bits
	// of the stride. Assuming ·range0n8 holds the multipliers 0..7 (it is
	// defined outside this file — TODO confirm), 7*stride must fit in an
	// int32. Strides above this bound take the scalar loop, which uses full
	// 64-bit address arithmetic.
	CMPQ DX, $0x0FFFFFFF
	JA loop1x4

	// R8 = DI with the low three bits cleared.
	MOVQ DI, R8
	SHRQ $3, R8
	SHLQ $3, R8

	VPBROADCASTD rows_array_off+24(FP), Y0
	VPMULLD ·range0n8(SB), Y0, Y0
	VPCMPEQD Y1, Y1, Y1
	VPCMPEQD Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
loop8x4:
	VPGATHERDD Y1, (BX)(Y0*1), Y4
	VPCMPEQD Y3, Y4, Y4 // lanes equal to zero become all-ones
	VMOVMSKPS Y4, R9    // one bit per lane: 1 = element was zero
	VMOVDQU Y2, Y1      // the gather clears its mask register; reload it

	// Invert so that 1 = element was non-zero, keeping only the 8 lane bits.
	NOTQ R9
	ANDQ $0b11111111, R9

	// Shift the 8 bits to position SI%64 inside word SI/64. SI is always a
	// multiple of 8 here, so the group never straddles two words.
	MOVQ SI, CX
	ANDQ $0b111111, CX

	MOVQ SI, R10
	SHRQ $6, R10

	SHLQ CX, R9
	ORQ R9, (AX)(R10*8)

	LEAQ (BX)(DX*8), BX
	ADDQ $8, SI
	CMPQ SI, R8
	JNE loop8x4
	VZEROUPPER

	CMPQ SI, DI
	JE done

	// Rebuild the one-bit mask for the scalar tail: CX = 1 << (SI % 64).
	MOVQ $1, R8
	MOVQ SI, CX
	ANDQ $0b111111, CX
	SHLQ CX, R8
	MOVQ R8, CX

loop1x4:
	MOVL (BX), R8
	CMPL R8, $0
	JE next1x4

	MOVQ SI, R9
	SHRQ $6, R9
	ORQ CX, (AX)(R9*8)
next1x4:
	ADDQ DX, BX
	ROLQ $1, CX
	INCQ SI
	CMPQ SI, DI
	JNE loop1x4
done:
	RET

// func nullIndex64(bits *uint64, rows sparse.Array)
//
// Scan over 8-byte elements. When AVX2 is available and there are at least
// four elements, groups of four are fetched with a gather and tested at once;
// any remainder (or the whole input otherwise) goes through the scalar loop.
//
// Register use is the same as nullIndex8, plus:
//	R8  element count rounded down to a multiple of 4 (vector loop bound)
//	Y0  per-lane byte offsets (stride times 0, 1, 2, 3)
//	Y1  gather mask          Y2  all-ones, used to reload Y1
//	Y3  zero                 Y4  gathered values / comparison result
TEXT ·nullIndex64(SB), NOSPLIT, $0-32
	MOVQ bits+0(FP), AX
	MOVQ rows_array_ptr+8(FP), BX
	MOVQ rows_array_len+16(FP), DI
	MOVQ rows_array_off+24(FP), DX

	MOVQ $1, CX
	XORQ SI, SI

	CMPQ DI, $0
	JE done

	CMPQ DI, $4
	JB loop1x8

	CMPB ·hasAVX2(SB), $0
	JE loop1x8

	// The lane offsets below are produced with VPMULLD, which multiplies
	// 32-bit halves independently: only the low 32 bits of the stride take
	// part and each product is truncated to 32 bits. The offsets are
	// therefore exact only while 3*stride < 2^32. Larger strides take the
	// scalar loop, which uses full 64-bit address arithmetic.
	CMPQ DX, $0x55555555
	JA loop1x8

	// R8 = DI with the low two bits cleared.
	MOVQ DI, R8
	SHRQ $2, R8
	SHLQ $2, R8

	VPBROADCASTQ rows_array_off+24(FP), Y0
	VPMULLD scale4x8<>(SB), Y0, Y0
	VPCMPEQQ Y1, Y1, Y1
	VPCMPEQQ Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
loop4x8:
	VPGATHERQQ Y1, (BX)(Y0*1), Y4
	VPCMPEQQ Y3, Y4, Y4 // lanes equal to zero become all-ones
	VMOVMSKPD Y4, R9    // one bit per lane: 1 = element was zero
	VMOVDQU Y2, Y1      // the gather clears its mask register; reload it

	// Invert so that 1 = element was non-zero, keeping only the 4 lane bits.
	NOTQ R9
	ANDQ $0b1111, R9

	// Shift the 4 bits to position SI%64 inside word SI/64. SI is always a
	// multiple of 4 here, so the group never straddles two words.
	MOVQ SI, CX
	ANDQ $0b111111, CX

	MOVQ SI, R10
	SHRQ $6, R10

	SHLQ CX, R9
	ORQ R9, (AX)(R10*8)

	LEAQ (BX)(DX*4), BX
	ADDQ $4, SI
	CMPQ SI, R8
	JNE loop4x8
	VZEROUPPER

	CMPQ SI, DI
	JE done

	// Rebuild the one-bit mask for the scalar tail: CX = 1 << (SI % 64).
	MOVQ $1, R8
	MOVQ SI, CX
	ANDQ $0b111111, CX
	SHLQ CX, R8
	MOVQ R8, CX

loop1x8:
	MOVQ (BX), R8
	CMPQ R8, $0
	JE next1x8

	MOVQ SI, R9
	SHRQ $6, R9
	ORQ CX, (AX)(R9*8)
next1x8:
	ADDQ DX, BX
	ROLQ $1, CX
	INCQ SI
	CMPQ SI, DI
	JNE loop1x8
done:
	RET

// scale4x8 holds the four 64-bit lane multipliers 0, 1, 2, 3 that nullIndex64
// applies to the stride to form its gather offsets.
GLOBL scale4x8<>(SB), RODATA|NOPTR, $32
DATA scale4x8<>+0(SB)/8, $0
DATA scale4x8<>+8(SB)/8, $1
DATA scale4x8<>+16(SB)/8, $2
DATA scale4x8<>+24(SB)/8, $3

// func nullIndex128(bits *uint64, rows sparse.Array)
//
// Scalar scan over 16-byte elements. Each element is loaded into an XMM
// register and both 64-bit halves are compared against zero; the element
// counts as non-zero unless both halves are zero.
//
// NOTE(review): PCMPEQQ is an SSE4.1 instruction and no CPU feature check is
// visible in this routine — confirm the caller guarantees support.
//
// Register use is the same as nullIndex8, plus:
//	X0  zero                 X1  current element / comparison result
//	R8  two-bit mask: bit set = that half was zero
TEXT ·nullIndex128(SB), NOSPLIT, $0-32
	MOVQ bits+0(FP), AX
	MOVQ rows_array_ptr+8(FP), BX
	MOVQ rows_array_len+16(FP), DI
	MOVQ rows_array_off+24(FP), DX

	CMPQ DI, $0
	JE done

	MOVQ $1, CX
	XORQ SI, SI
	PXOR X0, X0
loop1x16:
	MOVOU (BX), X1
	PCMPEQQ X0, X1
	MOVMSKPD X1, R8
	CMPB R8, $0b11 // both halves zero: leave the bit untouched
	JE next1x16

	MOVQ SI, R9
	SHRQ $6, R9
	ORQ CX, (AX)(R9*8)
next1x16:
	ADDQ DX, BX
	ROLQ $1, CX
	INCQ SI
	CMPQ SI, DI
	JNE loop1x16
done:
	RET