github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/internal/bitpack/unpack_int32_amd64.s

//go:build !purego

#include "funcdata.h"
#include "textflag.h"

// func unpackInt32Default(dst []int32, src []byte, bitWidth uint)
TEXT ·unpackInt32Default(SB), NOSPLIT, $0-56
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    MOVQ $1, R8 // bitMask = (1 << bitWidth) - 1
    SHLQ CX, R8
    DECQ R8
    MOVQ CX, R9 // bitWidth

    XORQ DI, DI // bitOffset
    XORQ SI, SI // index
    JMP test
loop:
    MOVQ DI, R10
    MOVQ DI, CX
    SHRQ $5, R10      // i = bitOffset / 32
    ANDQ $0b11111, CX // j = bitOffset % 32

    MOVL (BX)(R10*4), R11
    MOVL R8, R12  // d = bitMask
    SHLL CX, R12  // d = d << j
    ANDL R12, R11 // d = src[i] & d
    SHRL CX, R11  // d = d >> j

    MOVL CX, R13
    ADDL R9, R13
    CMPL R13, $32
    JBE next // j+bitWidth <= 32 ?

    MOVL 4(BX)(R10*4), R14
    MOVL CX, R12
    MOVL $32, CX
    SUBL R12, CX  // k = 32 - j
    MOVL R8, R12  // c = bitMask
    SHRL CX, R12  // c = c >> k
    ANDL R12, R14 // c = src[i+1] & c
    SHLL CX, R14  // c = c << k
    ORL R14, R11  // d = d | c
next:
    MOVL R11, (AX)(SI*4) // dst[n] = d
    ADDQ R9, DI          // bitOffset += bitWidth
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
    RET

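// For readers less fluent in Go assembly, the scalar loop above corresponds to
// the following pure-Go sketch (an illustration only, using a hypothetical
// name; the package's actual purego fallback may differ):
//
//	import "encoding/binary"
//
//	func unpackInt32DefaultGo(dst []int32, src []byte, bitWidth uint) {
//		bitMask := uint32(1)<<bitWidth - 1
//		bitOffset := uint(0)
//		for n := range dst {
//			i := bitOffset / 32 // index of the 32-bit word holding the low bits
//			j := bitOffset % 32 // bit offset within that word
//			d := (binary.LittleEndian.Uint32(src[4*i:]) >> j) & bitMask
//			if j+bitWidth > 32 { // the value continues into the next word
//				k := 32 - j
//				c := binary.LittleEndian.Uint32(src[4*i+4:]) & (bitMask >> k)
//				d |= c << k
//			}
//			dst[n] = int32(d)
//			bitOffset += bitWidth
//		}
//	}
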
// -----------------------------------------------------------------------------
// The unpack* functions below are adaptations of the algorithms described in
// "Decoding billions of integers per second through vectorization" by
// D. Lemire & L. Boytsov, with the following changes:
//
// - The paper describes two methods for decoding integers, called "horizontal"
//   and "vertical". The "horizontal" version is the one that best fits the
//   bit packing done in the Parquet delta encoding; however, our code also
//   differs from the paper in some ways, since many of the compression
//   techniques discussed there are not part of the Parquet format.
//
// - The paper focuses on implementations based on SSE instructions, and
//   describes how to use PMULLD to compensate for the lack of a variable bit
//   shift on packed integers. Our version of the bit unpacking algorithms
//   uses AVX2 and performs variable bit shifts with VPSRLVD, which yields
//   better throughput since that instruction has a latency of a single CPU
//   cycle, versus 10 for VPMULLD.
//
// - The reference implementation at https://github.com/lemire/FastPFor/ uses
//   a specialization for each bit width, resulting in 32 unique functions.
//   Our versions here are more generic: we provide 3 variations of the
//   algorithm, for bit widths 1 to 16, 17 to 26, and 27 to 31 (unpacking
//   32-bit values is a simple copy). In that regard, our implementation is
//   somewhat of an improvement over the reference, since it uses less code
//   and less memory to hold the shuffle masks and shift tables.
//
// Technically, each specialization could be expressed with the algorithm used
// for unpacking values of 27 to 31 bits. However, multiple steps of the main
// loop can be removed for lower bit widths, providing up to ~35% better
// throughput at smaller sizes. Since we expect delta encoding to often pack
// values to small bit widths, the specializations are worth the extra
// complexity.
//
// For more details, see: https://arxiv.org/pdf/1209.2137v5.pdf
// -----------------------------------------------------------------------------

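// To make the bit width ranges concrete, a hypothetical Go-level dispatcher
// over the three kernels below could look like this (the package's actual
// selection logic lives elsewhere and may differ; the 32-bit case is shown
// with encoding/binary for clarity):
//
//	func unpackInt32(dst []int32, src []byte, bitWidth uint) {
//		switch {
//		case bitWidth <= 16:
//			unpackInt32x1to16bitsAVX2(dst, src, bitWidth)
//		case bitWidth <= 26:
//			unpackInt32x17to26bitsAVX2(dst, src, bitWidth)
//		case bitWidth <= 31:
//			unpackInt32x27to31bitsAVX2(dst, src, bitWidth)
//		default: // 32 bits: values are plain little-endian int32s
//			for i := range dst {
//				dst[i] = int32(binary.LittleEndian.Uint32(src[4*i:]))
//			}
//		}
//	}
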
// unpackInt32x1to16bitsAVX2 is the implementation of the bit unpacking
// algorithm for inputs of bit width 1 to 16.
//
// In this version of the algorithm, we can perform a single memory load in
// each loop iteration since we know that 8 bit-packed values fit in a single
// XMM register (8 values × 16 bits = 128 bits at most).
//
// func unpackInt32x1to16bitsAVX2(dst []int32, src []byte, bitWidth uint)
TEXT ·unpackInt32x1to16bitsAVX2(SB), NOSPLIT, $56-56
    NO_LOCAL_POINTERS
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    CMPQ DX, $8
    JB tail

    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI
    XORQ SI, SI

    MOVQ $1, R8
    SHLQ CX, R8
    DECQ R8
    MOVQ R8, X0
    VPBROADCASTD X0, X0 // bitMask = (1 << bitWidth) - 1

    MOVQ CX, R9
    DECQ R9
    SHLQ $5, R9 // 32 * (bitWidth - 1)

    MOVQ CX, R10
    DECQ R10
    SHLQ $5, R10
    ANDQ $0xFF, R10 // (32 * (bitWidth - 1)) % 256

    LEAQ ·shuffleInt32x1to16bits(SB), R11
    VMOVDQA (R11)(R9*1), X1
    VMOVDQA 16(R11)(R9*1), X2

    LEAQ ·shiftRightInt32(SB), R12
    VMOVDQA (R12)(R10*1), X3
    VMOVDQA 16(R12)(R10*1), X4
loop:
    VMOVDQU (BX), X7

    VPSHUFB X1, X7, X5
    VPSHUFB X2, X7, X6

    VPSRLVD X3, X5, X5
    VPSRLVD X4, X6, X6

    VPAND X0, X5, X5
    VPAND X0, X6, X6

    VMOVDQU X5, (AX)(SI*4)
    VMOVDQU X6, 16(AX)(SI*4)

    ADDQ CX, BX
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loop
    VZEROUPPER

    CMPQ SI, DX
    JE done
    LEAQ (AX)(SI*4), AX
    SUBQ SI, DX
tail:
    MOVQ AX, dst_base-56(SP)
    MOVQ DX, dst_len-48(SP)
    MOVQ BX, src_base-32(SP)
    MOVQ CX, bitWidth-8(SP)
    CALL ·unpackInt32Default(SB)
done:
    RET

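// A scalar model of one loop iteration of the kernel above (hypothetical, for
// illustration): each of the 8 output lanes byte-aligns its value with the
// shuffle, modeled here as a little-endian 32-bit window load, then shifts
// right by the in-byte bit offset and masks to bitWidth bits.
//
//	func scalarModel8(dst []int32, src []byte, bitWidth uint) {
//		bitMask := uint32(1)<<bitWidth - 1
//		for lane := uint(0); lane < 8; lane++ {
//			bit := lane * bitWidth
//			var win [4]byte // the shuffle gathers at most 3 bytes per lane
//			copy(win[:], src[bit/8:])
//			w := binary.LittleEndian.Uint32(win[:])
//			dst[lane] = int32((w >> (bit % 8)) & bitMask)
//		}
//	}
//
// For example, with bitWidth = 3 and src = []byte{0x88, 0xC6, 0xFA} (the
// values 0..7 packed at 3 bits each), the model yields dst = [0 1 2 3 4 5 6 7].
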
// unpackInt32x17to26bitsAVX2 is the implementation of the bit unpacking
// algorithm for inputs of bit width 17 to 26.
//
// In this version of the algorithm, we need to load 32 bytes at each loop
// iteration because the 8 bit-packed values span across two XMM registers.
//
// func unpackInt32x17to26bitsAVX2(dst []int32, src []byte, bitWidth uint)
TEXT ·unpackInt32x17to26bitsAVX2(SB), NOSPLIT, $56-56
    NO_LOCAL_POINTERS
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    CMPQ DX, $8
    JB tail

    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI
    XORQ SI, SI

    MOVQ $1, R8
    SHLQ CX, R8
    DECQ R8
    MOVQ R8, X0
    VPBROADCASTD X0, X0

    MOVQ CX, R9
    SUBQ $17, R9
    IMULQ $48, R9 // 48 * (bitWidth - 17)

    MOVQ CX, R10
    DECQ R10
    SHLQ $5, R10
    ANDQ $0xFF, R10 // (32 * (bitWidth - 1)) % 256

    LEAQ ·shuffleInt32x17to26bits(SB), R11
    VMOVDQA (R11)(R9*1), X1
    VMOVDQA 16(R11)(R9*1), X2
    VMOVDQA 32(R11)(R9*1), X3

    LEAQ ·shiftRightInt32(SB), R12
    VMOVDQA (R12)(R10*1), X4
    VMOVDQA 16(R12)(R10*1), X5
loop:
    VMOVDQU (BX), X6
    VMOVDQU 16(BX), X7

    VPSHUFB X1, X6, X8
    VPSHUFB X2, X6, X9
    VPSHUFB X3, X7, X10
    VPOR X10, X9, X9

    VPSRLVD X4, X8, X8
    VPSRLVD X5, X9, X9

    VPAND X0, X8, X8
    VPAND X0, X9, X9

    VMOVDQU X8, (AX)(SI*4)
    VMOVDQU X9, 16(AX)(SI*4)

    ADDQ CX, BX
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loop
    VZEROUPPER

    CMPQ SI, DX
    JE done
    LEAQ (AX)(SI*4), AX
    SUBQ SI, DX
tail:
    MOVQ AX, dst_base-56(SP)
    MOVQ DX, dst_len-48(SP)
    MOVQ BX, src_base-32(SP)
    MOVQ CX, bitWidth-8(SP)
    CALL ·unpackInt32Default(SB)
done:
    RET

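// The extra load and VPOR exist because, past 16 bits per value, the eight
// 32-bit windows cover more than 16 bytes of input. A quick way to see this
// (hypothetical snippet, for illustration): print the source byte range each
// lane's window covers at bitWidth = 20; the last lanes reach past byte 15,
// into the second XMM register.
//
//	for lane := 0; lane < 8; lane++ {
//		bit := lane * 20
//		fmt.Printf("lane %d: bytes %d..%d\n", lane, bit/8, bit/8+3)
//	}
//
// This prints byte offsets 0..3 for lane 0 up to 17..20 for lane 7, so the
// kernel shuffles each 16-byte register separately and merges the results.
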
// unpackInt32x27to31bitsAVX2 is the implementation of the bit unpacking
// algorithm for inputs of bit width 27 to 31.
//
// In this version of the algorithm the bit-packed values may span across up to
// 5 bytes. The simpler approach used for smaller bit widths, where a single
// shuffle + shift unpacks each value, does not work anymore.
//
// Values are unpacked in two steps: the first extracts lower bits which are
// shifted RIGHT to align on the beginning of 32 bit words, the second extracts
// upper bits which are shifted LEFT to be moved to the end of the 32 bit words.
//
// The amount of LEFT shift is always 8 minus the amount of RIGHT shift.
//
// func unpackInt32x27to31bitsAVX2(dst []int32, src []byte, bitWidth uint)
TEXT ·unpackInt32x27to31bitsAVX2(SB), NOSPLIT, $56-56
    NO_LOCAL_POINTERS
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    CMPQ DX, $8
    JB tail

    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI
    XORQ SI, SI

    MOVQ $1, R8
    SHLQ CX, R8
    DECQ R8
    MOVQ R8, X0
    VPBROADCASTD X0, X0

    MOVQ CX, R9
    SUBQ $27, R9
    IMULQ $80, R9 // (80 * (bitWidth - 27))

    MOVQ CX, R10
    DECQ R10
    SHLQ $5, R10
    ANDQ $0xFF, R10 // (32 * (bitWidth - 1)) % 256

    LEAQ ·shuffleInt32x27to31bits(SB), R11
    VMOVDQA (R11)(R9*1), X1
    VMOVDQA 16(R11)(R9*1), X2
    VMOVDQA 32(R11)(R9*1), X3
    VMOVDQA 48(R11)(R9*1), X4
    VMOVDQA 64(R11)(R9*1), X5

    LEAQ ·shiftRightInt32(SB), R12
    LEAQ ·shiftLeftInt32(SB), R13
    VMOVDQA (R12)(R10*1), X6
    VMOVDQA (R13)(R10*1), X7
    VMOVDQA 16(R12)(R10*1), X8
    VMOVDQA 16(R13)(R10*1), X9
loop:
    VMOVDQU (BX), X10
    VMOVDQU 16(BX), X11

    VPSHUFB X1, X10, X12
    VPSHUFB X2, X10, X13
    VPSHUFB X3, X10, X14
    VPSHUFB X4, X11, X15
    VPSHUFB X5, X11, X11

    VPSRLVD X6, X12, X12
    VPSLLVD X7, X13, X13
    VPSRLVD X8, X14, X14
    VPSRLVD X8, X15, X15
    VPSLLVD X9, X11, X11

    VPOR X13, X12, X12
    VPOR X15, X14, X14
    VPOR X11, X14, X14

    VPAND X0, X12, X12
    VPAND X0, X14, X14

    VMOVDQU X12, (AX)(SI*4)
    VMOVDQU X14, 16(AX)(SI*4)

    ADDQ CX, BX
    ADDQ $8, SI
    CMPQ SI, DI
    JNE loop
    VZEROUPPER

    CMPQ SI, DX
    JE done
    LEAQ (AX)(SI*4), AX
    SUBQ SI, DX
tail:
    MOVQ AX, dst_base-56(SP)
    MOVQ DX, dst_len-48(SP)
    MOVQ BX, src_base-32(SP)
    MOVQ CX, bitWidth-8(SP)
    CALL ·unpackInt32Default(SB)
done:
    RET
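
// A scalar model of the two-step extraction above (hypothetical, for
// illustration): for each lane, the lower bits come from a 32-bit window
// shifted RIGHT by the in-byte offset j, the upper bits from the window one
// byte later shifted LEFT by 8-j; OR-ing the two and masking yields the value.
//
//	func scalarModel8TwoStep(dst []int32, src []byte, bitWidth uint) {
//		bitMask := uint32(1)<<bitWidth - 1
//		for lane := uint(0); lane < 8; lane++ {
//			bit := lane * bitWidth
//			i, j := bit/8, bit%8
//			lo := le32(src, i) >> j         // value bits 0..31-j
//			hi := le32(src, i+1) << (8 - j) // value bits 8-j and up, realigned
//			dst[lane] = int32((lo | hi) & bitMask)
//		}
//	}
//
//	// le32 reads 4 little-endian bytes at off, zero-padding past the end.
//	func le32(b []byte, off uint) uint32 {
//		var w [4]byte
//		copy(w[:], b[off:])
//		return binary.LittleEndian.Uint32(w[:])
//	}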