github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/encoding/rle/rle_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  GLOBL bitMasks<>(SB), RODATA|NOPTR, $64
     6  DATA bitMasks<>+0(SB)/8,  $0b0000000100000001000000010000000100000001000000010000000100000001
     7  DATA bitMasks<>+8(SB)/8,  $0b0000001100000011000000110000001100000011000000110000001100000011
     8  DATA bitMasks<>+16(SB)/8, $0b0000011100000111000001110000011100000111000001110000011100000111
     9  DATA bitMasks<>+24(SB)/8, $0b0000111100001111000011110000111100001111000011110000111100001111
    10  DATA bitMasks<>+32(SB)/8, $0b0001111100011111000111110001111100011111000111110001111100011111
    11  DATA bitMasks<>+40(SB)/8, $0b0011111100111111001111110011111100111111001111110011111100111111
    12  DATA bitMasks<>+48(SB)/8, $0b0111111101111111011111110111111101111111011111110111111101111111
    13  DATA bitMasks<>+56(SB)/8, $0b1111111111111111111111111111111111111111111111111111111111111111
    14  
    15  // func decodeBytesBitpackBMI2(dst, src []byte, count, bitWidth uint)
    16  TEXT ·decodeBytesBitpackBMI2(SB), NOSPLIT, $0-64
    17      MOVQ dst_base+0(FP), AX
    18      MOVQ src_base+24(FP), BX
    19      MOVQ count+48(FP), CX
    20      MOVQ bitWidth+56(FP), DX
    21      LEAQ bitMasks<>(SB), DI
    22      MOVQ -8(DI)(DX*8), DI
    23      XORQ SI, SI
    24      SHRQ $3, CX
    25      JMP test
    26  loop:
    27      MOVQ (BX), R8
    28      PDEPQ DI, R8, R8
    29      MOVQ R8, (AX)(SI*8)
    30      ADDQ DX, BX
    31      INCQ SI
    32  test:
    33      CMPQ SI, CX
    34      JNE loop
    35      RET
    36  
    37  // func encodeBytesBitpackBMI2(dst []byte, src []uint64, bitWidth uint) int
    38  TEXT ·encodeBytesBitpackBMI2(SB), NOSPLIT, $0-64
    39      MOVQ dst_base+0(FP), AX
    40      MOVQ src_base+24(FP), BX
    41      MOVQ src_len+32(FP), CX
    42      MOVQ bitWidth+48(FP), DX
    43      LEAQ bitMasks<>(SB), DI
    44      MOVQ -8(DI)(DX*8), DI
    45      XORQ SI, SI
    46      JMP test
    47  loop:
    48      MOVQ (BX)(SI*8), R8
    49      PEXTQ DI, R8, R8
    50      MOVQ R8, (AX)
    51      ADDQ DX, AX
    52      INCQ SI
    53  test:
    54      CMPQ SI, CX
    55      JNE loop
    56  done:
    57      SUBQ dst+0(FP), AX
    58      MOVQ AX, ret+56(FP)
    59      RET
    60  
    61  // func encodeInt32IndexEqual8ContiguousAVX2(words [][8]int32) int
    62  TEXT ·encodeInt32IndexEqual8ContiguousAVX2(SB), NOSPLIT, $0-32
    63      MOVQ words_base+0(FP), AX
    64      MOVQ words_len+8(FP), BX
    65      XORQ SI, SI
    66      SHLQ $5, BX
    67      JMP test
    68  loop:
    69      VMOVDQU (AX)(SI*1), Y0
    70      VPSHUFD $0, Y0, Y1
    71      VPCMPEQD Y1, Y0, Y0
    72      VMOVMSKPS Y0, CX
    73      CMPL CX, $0xFF
    74      JE done
    75      ADDQ $32, SI
    76  test:
    77      CMPQ SI, BX
    78      JNE loop
    79  done:
    80      VZEROUPPER
    81      SHRQ $5, SI
    82      MOVQ SI, ret+24(FP)
    83      RET
    84  
    85  // func encodeInt32IndexEqual8ContiguousSSE(words [][8]int32) int
    86  TEXT ·encodeInt32IndexEqual8ContiguousSSE(SB), NOSPLIT, $0-32
    87      MOVQ words_base+0(FP), AX
    88      MOVQ words_len+8(FP), BX
    89      XORQ SI, SI
    90      SHLQ $5, BX
    91      JMP test
    92  loop:
    93      MOVOU (AX)(SI*1), X0
    94      MOVOU 16(AX)(SI*1), X1
    95      PSHUFD $0, X0, X2
    96      PCMPEQL X2, X0
    97      PCMPEQL X2, X1
    98      MOVMSKPS X0, CX
    99      MOVMSKPS X1, DX
   100      ANDL DX, CX
   101      CMPL CX, $0xF
   102      JE done
   103      ADDQ $32, SI
   104  test:
   105      CMPQ SI, BX
   106      JNE loop
   107  done:
   108      SHRQ $5, SI
   109      MOVQ SI, ret+24(FP)
   110      RET
   111  
   112  // func encodeInt32Bitpack1to16bitsAVX2(dst []byte, src [][8]int32, bitWidth uint) int
   113  TEXT ·encodeInt32Bitpack1to16bitsAVX2(SB), NOSPLIT, $0-64
   114      MOVQ dst_base+0(FP), AX
   115      MOVQ src_base+24(FP), BX
   116      MOVQ src_len+32(FP), CX
   117      MOVQ bitWidth+48(FP), DX
   118  
   119      MOVQ DX, X0
   120      VPBROADCASTQ X0, Y6 // [1*bitWidth...]
   121      VPSLLQ $1, Y6, Y7   // [2*bitWidth...]
   122      VPADDQ Y6, Y7, Y8   // [3*bitWidth...]
   123      VPSLLQ $2, Y6, Y9   // [4*bitWidth...]
   124  
   125      MOVQ $64, DI
   126      MOVQ DI, X1
   127      VPBROADCASTQ X1, Y10
   128      VPSUBQ Y6, Y10, Y11 // [64-1*bitWidth...]
   129      VPSUBQ Y9, Y10, Y12 // [64-4*bitWidth...]
   130      VPCMPEQQ Y4, Y4, Y4
   131      VPSRLVQ Y11, Y4, Y4
   132  
   133      VPXOR Y5, Y5, Y5
   134      XORQ SI, SI
   135      SHLQ $5, CX
   136      JMP test
   137  loop:
   138      VMOVDQU (BX)(SI*1), Y0
   139      VPSHUFD $0b01010101, Y0, Y1
   140      VPSHUFD $0b10101010, Y0, Y2
   141      VPSHUFD $0b11111111, Y0, Y3
   142  
   143      VPAND Y4, Y0, Y0
   144      VPAND Y4, Y1, Y1
   145      VPAND Y4, Y2, Y2
   146      VPAND Y4, Y3, Y3
   147  
   148      VPSLLVQ Y6, Y1, Y1
   149      VPSLLVQ Y7, Y2, Y2
   150      VPSLLVQ Y8, Y3, Y3
   151  
   152      VPOR Y1, Y0, Y0
   153      VPOR Y3, Y2, Y2
   154      VPOR Y2, Y0, Y0
   155  
   156      VPERMQ $0b00001010, Y0, Y1
   157  
   158      VPSLLVQ X9, X1, X2
   159      VPSRLQ X12, X1, X3
   160      VBLENDPD $0b10, X3, X2, X1
   161      VBLENDPD $0b10, X5, X0, X0
   162      VPOR X1, X0, X0
   163  
   164      VMOVDQU X0, (AX)
   165  
   166      ADDQ DX, AX
   167      ADDQ $32, SI
   168  test:
   169      CMPQ SI, CX
   170      JNE loop
   171      VZEROUPPER
   172      SUBQ dst+0(FP), AX
   173      MOVQ AX, ret+56(FP)
   174      RET