//go:build !purego

// Derived from github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/encoding/rle/rle_amd64.s

#include "textflag.h"

// bitMasks holds eight 64-bit masks. Entry i (0-based) has the low i+1 bits of
// every byte set; it is the PDEP/PEXT mask used to spread or gather values of
// bit width i+1 across the eight bytes of a quadword.
GLOBL bitMasks<>(SB), RODATA|NOPTR, $64
DATA bitMasks<>+0(SB)/8, $0b0000000100000001000000010000000100000001000000010000000100000001
DATA bitMasks<>+8(SB)/8, $0b0000001100000011000000110000001100000011000000110000001100000011
DATA bitMasks<>+16(SB)/8, $0b0000011100000111000001110000011100000111000001110000011100000111
DATA bitMasks<>+24(SB)/8, $0b0000111100001111000011110000111100001111000011110000111100001111
DATA bitMasks<>+32(SB)/8, $0b0001111100011111000111110001111100011111000111110001111100011111
DATA bitMasks<>+40(SB)/8, $0b0011111100111111001111110011111100111111001111110011111100111111
DATA bitMasks<>+48(SB)/8, $0b0111111101111111011111110111111101111111011111110111111101111111
DATA bitMasks<>+56(SB)/8, $0b1111111111111111111111111111111111111111111111111111111111111111

// func decodeBytesBitpackBMI2(dst, src []byte, count, bitWidth uint)
//
// Unpacks count/8 groups of eight bit-packed values from src into dst, one
// output byte per value.
//
// bitWidth selects bitMasks[bitWidth-1], so it must be in 1..8 for the table
// load to stay in bounds. Every iteration loads a full 8 bytes from src while
// advancing src by only bitWidth bytes, and stores 8 bytes to dst.
TEXT ·decodeBytesBitpackBMI2(SB), NOSPLIT, $0-64
	MOVQ dst_base+0(FP), AX
	MOVQ src_base+24(FP), BX
	MOVQ count+48(FP), CX
	MOVQ bitWidth+56(FP), DX
	LEAQ bitMasks<>(SB), DI
	MOVQ -8(DI)(DX*8), DI   // DI = bitMasks[bitWidth-1]
	XORQ SI, SI             // SI = index of the current 8-value group
	SHRQ $3, CX             // CX = count / 8 = number of groups
	JMP test
loop:
	MOVQ (BX), R8
	PDEPQ DI, R8, R8        // scatter the packed low bits into one byte per value
	MOVQ R8, (AX)(SI*8)
	ADDQ DX, BX             // 8 values * bitWidth bits = bitWidth bytes consumed
	INCQ SI
test:
	CMPQ SI, CX
	JNE loop
	RET

// func encodeBytesBitpackBMI2(dst []byte, src []uint64, bitWidth uint) int
//
// Packs each quadword of src (eight byte-sized values) down to bitWidth bits
// per value and writes the result to dst. Returns the number of bytes by which
// the output cursor advanced, i.e. len(src)*bitWidth.
//
// bitWidth selects bitMasks[bitWidth-1], so it must be in 1..8. Every
// iteration stores a full 8 bytes to dst while advancing by only bitWidth
// bytes.
TEXT ·encodeBytesBitpackBMI2(SB), NOSPLIT, $0-64
	MOVQ dst_base+0(FP), AX
	MOVQ src_base+24(FP), BX
	MOVQ src_len+32(FP), CX
	MOVQ bitWidth+48(FP), DX
	LEAQ bitMasks<>(SB), DI
	MOVQ -8(DI)(DX*8), DI   // DI = bitMasks[bitWidth-1]
	XORQ SI, SI             // SI = index into src
	JMP test
loop:
	MOVQ (BX)(SI*8), R8
	PEXTQ DI, R8, R8        // gather the low bitWidth bits of each byte contiguously
	MOVQ R8, (AX)
	ADDQ DX, AX             // 8 values * bitWidth bits = bitWidth bytes produced
	INCQ SI
test:
	CMPQ SI, CX
	JNE loop
done:
	SUBQ dst_base+0(FP), AX // bytes written = cursor - start of dst
	MOVQ AX, ret+56(FP)
	RET

// func encodeInt32IndexEqual8ContiguousAVX2(words [][8]int32) int
//
// Returns the index of the first element of words whose eight int32 values are
// all equal, or len(words) if there is none.
TEXT ·encodeInt32IndexEqual8ContiguousAVX2(SB), NOSPLIT, $0-32
	MOVQ words_base+0(FP), AX
	MOVQ words_len+8(FP), BX
	XORQ SI, SI             // SI = byte offset into words
	SHLQ $5, BX             // BX = len(words) * 32 bytes
	JMP test
loop:
	VMOVDQU (AX)(SI*1), Y0
	// Replicate element 0 into all eight lanes. A 256-bit VPSHUFD $0 must not
	// be used here: it shuffles each 128-bit half independently, which would
	// compare elements 4-7 against element 4 instead of element 0.
	VPBROADCASTD X0, Y1
	VPCMPEQD Y1, Y0, Y0
	VMOVMSKPS Y0, CX        // one mask bit per 32-bit lane
	CMPL CX, $0xFF          // all eight lanes equal to element 0?
	JE done
	ADDQ $32, SI
test:
	CMPQ SI, BX
	JNE loop
done:
	VZEROUPPER
	SHRQ $5, SI             // byte offset -> element index
	MOVQ SI, ret+24(FP)
	RET

// func encodeInt32IndexEqual8ContiguousSSE(words [][8]int32) int
//
// Returns the index of the first element of words whose eight int32 values are
// all equal, or len(words) if there is none. Each element is processed as two
// 128-bit halves, both compared against a broadcast of element 0.
TEXT ·encodeInt32IndexEqual8ContiguousSSE(SB), NOSPLIT, $0-32
	MOVQ words_base+0(FP), AX
	MOVQ words_len+8(FP), BX
	XORQ SI, SI             // SI = byte offset into words
	SHLQ $5, BX             // BX = len(words) * 32 bytes
	JMP test
loop:
	MOVOU (AX)(SI*1), X0    // values 0-3
	MOVOU 16(AX)(SI*1), X1  // values 4-7
	PSHUFD $0, X0, X2       // X2 = value 0 in all four lanes
	PCMPEQL X2, X0
	PCMPEQL X2, X1
	MOVMSKPS X0, CX
	MOVMSKPS X1, DX
	ANDL DX, CX             // a bit survives only if it matched in both halves
	CMPL CX, $0xF
	JE done
	ADDQ $32, SI
test:
	CMPQ SI, BX
	JNE loop
done:
	SHRQ $5, SI             // byte offset -> element index
	MOVQ SI, ret+24(FP)
	RET

// func encodeInt32Bitpack1to16bitsAVX2(dst []byte, src [][8]int32, bitWidth uint) int
//
// Packs every group of eight int32 values in src down to bitWidth bits per
// value and writes the groups back to back into dst. Returns the number of
// bytes by which the output cursor advanced, i.e. len(src)*bitWidth.
//
// Four values are packed into one 64-bit lane, so 4*bitWidth must not exceed
// 64 (hence the 1..16 range in the name). Every iteration stores a full 16
// bytes to dst while advancing by only bitWidth bytes.
TEXT ·encodeInt32Bitpack1to16bitsAVX2(SB), NOSPLIT, $0-64
	MOVQ dst_base+0(FP), AX
	MOVQ src_base+24(FP), BX
	MOVQ src_len+32(FP), CX
	MOVQ bitWidth+48(FP), DX

	MOVQ DX, X0
	VPBROADCASTQ X0, Y6     // [1*bitWidth...]
	VPSLLQ $1, Y6, Y7       // [2*bitWidth...]
	VPADDQ Y6, Y7, Y8       // [3*bitWidth...]
	VPSLLQ $2, Y6, Y9       // [4*bitWidth...]

	MOVQ $64, DI
	MOVQ DI, X1
	VPBROADCASTQ X1, Y10
	VPSUBQ Y6, Y10, Y11     // [64-1*bitWidth...]
	VPSUBQ Y9, Y10, Y12     // [64-4*bitWidth...]
	VPCMPEQQ Y4, Y4, Y4     // all ones
	VPSRLVQ Y11, Y4, Y4     // Y4 = low bitWidth bits set in every quadword

	VPXOR Y5, Y5, Y5        // Y5 = 0
	XORQ SI, SI             // SI = byte offset into src
	SHLQ $5, CX             // CX = len(src) * 32 bytes
	JMP test
loop:
	VMOVDQU (BX)(SI*1), Y0
	// Within each 128-bit half, broadcast value 1, 2 and 3 of that half so
	// each lands in the low dword of a quadword; Y0 already has value 0 there.
	VPSHUFD $0b01010101, Y0, Y1
	VPSHUFD $0b10101010, Y0, Y2
	VPSHUFD $0b11111111, Y0, Y3

	// Keep only the low bitWidth bits of every quadword.
	VPAND Y4, Y0, Y0
	VPAND Y4, Y1, Y1
	VPAND Y4, Y2, Y2
	VPAND Y4, Y3, Y3

	// Move values 1..3 to bit offsets 1..3 * bitWidth.
	VPSLLVQ Y6, Y1, Y1
	VPSLLVQ Y7, Y2, Y2
	VPSLLVQ Y8, Y3, Y3

	// Quadword 0 of Y0 now packs values 0-3, quadword 2 packs values 4-7.
	VPOR Y1, Y0, Y0
	VPOR Y3, Y2, Y2
	VPOR Y2, Y0, Y0

	// Copy the packed upper half (quadword 2) into both quadwords of X1.
	VPERMQ $0b00001010, Y0, Y1

	// Build the 128-bit value (upper << 4*bitWidth): the low quadword takes
	// the left-shifted bits, the high quadword takes the bits carried out.
	VPSLLVQ X9, X1, X2
	VPSRLQ X12, X1, X3
	VBLENDPD $0b10, X3, X2, X1
	VBLENDPD $0b10, X5, X0, X0 // keep the packed lower half, zero the high quadword
	VPOR X1, X0, X0

	VMOVDQU X0, (AX)

	ADDQ DX, AX             // 8 values * bitWidth bits = bitWidth bytes produced
	ADDQ $32, SI
test:
	CMPQ SI, CX
	JNE loop
	VZEROUPPER
	SUBQ dst_base+0(FP), AX // bytes written = cursor - start of dst
	MOVQ AX, ret+56(FP)
	RET