github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/internal/bitpack/unpack_int64_amd64.s

//go:build !purego

#include "funcdata.h"
#include "textflag.h"

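// The assembly routine below (unpackInt64Default) is roughly equivalent to
// the following Go sketch. The sketch is illustrative only: the name
// unpackInt64Ref is made up for this comment and is not part of the package.
//
//      func unpackInt64Ref(dst []int64, src []uint32, bitWidth uint) {
//          bitMask := uint64(1)<<bitWidth - 1
//          bitOffset := uint(0)
//          for n := range dst {
//              i := bitOffset / 32 // 32 bit word holding the first bits of the value
//              j := bitOffset % 32 // bit offset of the value within that word
//              d := (uint64(src[i]) & (bitMask << j)) >> j
//              if j+bitWidth > 32 { // the value continues in src[i+1]
//                  k := 32 - j
//                  d |= (uint64(src[i+1]) & (bitMask >> k)) << k
//                  if j+bitWidth > 64 { // and possibly in src[i+2]
//                      k = 64 - j
//                      d |= (uint64(src[i+2]) & (bitMask >> k)) << k
//                  }
//              }
//              dst[n] = int64(d)
//              bitOffset += bitWidth
//          }
//      }
//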
// func unpackInt64Default(dst []int64, src []uint32, bitWidth uint)
TEXT ·unpackInt64Default(SB), NOSPLIT, $0-56
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    MOVQ $1, R8 // bitMask = (1 << bitWidth) - 1
    SHLQ CX, R8, R8
    DECQ R8
    MOVQ CX, R9 // bitWidth

    XORQ DI, DI // bitOffset
    XORQ SI, SI // index
    XORQ R10, R10
    XORQ R11, R11
    XORQ R14, R14
    JMP test
loop:
    MOVQ DI, R10
    MOVQ DI, CX
    SHRQ $5, R10      // i = bitOffset / 32
    ANDQ $0b11111, CX // j = bitOffset % 32

    MOVLQZX (BX)(R10*4), R11
    MOVQ R8, R12  // d = bitMask
    SHLQ CX, R12  // d = d << j
    ANDQ R12, R11 // d = src[i] & d
    SHRQ CX, R11  // d = d >> j

    MOVQ CX, R13
    ADDQ R9, R13
    CMPQ R13, $32
    JBE next // j+bitWidth <= 32 ?
    MOVQ CX, R15 // j

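    // The value does not fit in the remaining bits of src[i]: OR in the
    // missing high bits from src[i+1].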
    MOVLQZX 4(BX)(R10*4), R14
    MOVQ $32, CX
    SUBQ R15, CX  // k = 32 - j
    MOVQ R8, R12  // c = bitMask
    SHRQ CX, R12  // c = c >> k
    ANDQ R12, R14 // c = src[i+1] & c
    SHLQ CX, R14  // c = c << k
    ORQ R14, R11  // d = d | c

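    // A value can span up to three 32 bit words since bitWidth may be up to
    // 64: if j+bitWidth > 64, OR in the remaining bits from src[i+2].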
    CMPQ R13, $64
    JBE next

    MOVLQZX 8(BX)(R10*4), R14
    MOVQ $64, CX
    SUBQ R15, CX  // k = 64 - j
    MOVQ R8, R12  // c = bitMask
    SHRQ CX, R12  // c = c >> k
    ANDQ R12, R14 // c = src[i+2] & c
    SHLQ CX, R14  // c = c << k
    ORQ R14, R11  // d = d | c
next:
    MOVQ R11, (AX)(SI*8) // dst[n] = d
    ADDQ R9, DI          // bitOffset += bitWidth
    INCQ SI
test:
    CMPQ SI, DX
    JNE loop
    RET

// This bit unpacking function was inspired by the 32 bit version, but
// adapted to account for the fact that eight 64 bit values span two YMM
// registers, and cross the 128 bit lanes within those registers.
//
// Because of the two lanes of YMM registers, we cannot use the VPSHUFB
// instruction to dispatch bytes of the input to the registers. Instead we use
// the VPERMD instruction, which has higher latency but supports dispatching
// bytes across register lanes. Measurable throughput gains remain even though
// the algorithm spends a few more CPU cycles per loop.
//
// The initialization phase of this algorithm generates masks for the
// permutations and shifts used to decode the bit-packed values.
//
// The permutation masks are written to Y7 and Y8, and contain the results
// of this formula:
//
//      temp[i] = (bitWidth * i) / 32
//      mask[i] = temp[i] | ((temp[i] + 1) << 32)
//
// Since VPERMQ only supports reading the permutation from an immediate
// value, we use VPERMD and generate permutations for pairs of consecutive
// 32 bit words, which is why the upper half of each 64 bit word is set to
// (temp[i] + 1) << 32.
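//
// For example, with bitWidth = 20 the generated permutation values would be:
//
//      temp = [0, 0, 1, 1, 2, 3, 3, 4]
//      mask = [0|1<<32, 0|1<<32, 1|2<<32, 1|2<<32, 2|3<<32, 3|4<<32, 3|4<<32, 4|5<<32]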
//
// The masks for right shifts are written to Y5 and Y6, and computed with
// this formula:
//
//      shift[i] = (bitWidth * i) - (32 * ((bitWidth * i) / 32))
//
// The amount to shift by is the number of bits unpacked before value i,
// minus the bit offset of the 32 bit word that its first bits are read
// from; in other words, the position of the value within that word, which
// is (bitWidth * i) % 32.
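//
// For example, with bitWidth = 20 the shift amounts would be:
//
//      shift = [0, 20, 8, 28, 16, 4, 24, 12]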
//
// Technically the masks could be precomputed and declared in global tables;
// however, declaring masks for all bit widths is tedious and makes code
// maintenance more costly for no measurable benefit on production workloads.
//
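// Putting the masks together, each iteration of the vectorized loop is
// roughly equivalent to the following scalar Go sketch (illustrative only;
// the helper name unpack8 and the []uint32 view of the input are assumptions
// made for this comment, and the real code loads a full 32 byte block):
//
//      func unpack8(dst []int64, src []uint32, bitWidth uint) {
//          bitMask := uint64(1)<<bitWidth - 1
//          for i := uint(0); i < 8; i++ {
//              j := (bitWidth * i) / 32 // temp[i]: first 32 bit word of the value
//              k := (bitWidth * i) % 32 // shift[i]: offset within that word
//              hi := uint64(0)
//              if int(j)+1 < len(src) {
//                  hi = uint64(src[j+1]) // high half gathered by VPERMD
//              }
//              w := uint64(src[j]) | hi<<32
//              dst[i] = int64((w >> k) & bitMask) // VPSRLVQ then VPAND
//          }
//      }
//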
// func unpackInt64x1to32bitsAVX2(dst []int64, src []byte, bitWidth uint)
TEXT ·unpackInt64x1to32bitsAVX2(SB), NOSPLIT, $56-56
    NO_LOCAL_POINTERS
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), DX
    MOVQ src_base+24(FP), BX
    MOVQ bitWidth+48(FP), CX

    CMPQ DX, $8
    JB tail

    MOVQ DX, DI
    SHRQ $3, DI
    SHLQ $3, DI
    XORQ SI, SI

    MOVQ $1, R8
    SHLQ CX, R8
    DECQ R8
    MOVQ R8, X0
    VPBROADCASTQ X0, Y0 // bitMask = (1 << bitWidth) - 1

    VPCMPEQQ Y1, Y1, Y1
    VPSRLQ $63, Y1, Y1  // [1,1,1,1]

    MOVQ CX, X2
    VPBROADCASTQ X2, Y2 // [bitWidth]

    VMOVDQU range0n7<>+0(SB), Y3  // [0,1,2,3]
    VMOVDQU range0n7<>+32(SB), Y4 // [4,5,6,7]

    VPMULLD Y2, Y3, Y5 // [bitWidth] * [0,1,2,3]
    VPMULLD Y2, Y4, Y6 // [bitWidth] * [4,5,6,7]

    VPSRLQ $5, Y5, Y7 // ([bitWidth] * [0,1,2,3]) / 32
    VPSRLQ $5, Y6, Y8 // ([bitWidth] * [4,5,6,7]) / 32

    VPSLLQ $5, Y7, Y9  // (([bitWidth] * [0,1,2,3]) / 32) * 32
    VPSLLQ $5, Y8, Y10 // (([bitWidth] * [4,5,6,7]) / 32) * 32

    VPADDQ Y1, Y7, Y11
    VPADDQ Y1, Y8, Y12
    VPSLLQ $32, Y11, Y11
    VPSLLQ $32, Y12, Y12
    VPOR Y11, Y7, Y7 // permutations[i] = [i | ((i + 1) << 32)]
    VPOR Y12, Y8, Y8 // permutations[i] = [i | ((i + 1) << 32)]

    VPSUBQ Y9, Y5, Y5 // shifts
    VPSUBQ Y10, Y6, Y6
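
    // Main loop: each iteration loads 32 bytes of packed input, gathers the
    // pair of 32 bit words holding each of the next 8 values into a 64 bit
    // lane with VPERMD, shifts each value down to bit 0, masks it to bitWidth
    // bits, and stores 8 int64 results. src advances by bitWidth bytes since
    // 8 values of bitWidth bits each occupy exactly bitWidth bytes.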
loop:
    VMOVDQU (BX), Y1

    VPERMD Y1, Y7, Y2
    VPERMD Y1, Y8, Y3

    VPSRLVQ Y5, Y2, Y2
    VPSRLVQ Y6, Y3, Y3

    VPAND Y0, Y2, Y2
    VPAND Y0, Y3, Y3

    VMOVDQU Y2, (AX)(SI*8)
    VMOVDQU Y3, 32(AX)(SI*8)

    ADDQ CX, BX // src += bitWidth bytes
    ADDQ $8, SI // 8 more values were unpacked
    CMPQ SI, DI
    JNE loop
    VZEROUPPER

    CMPQ SI, DX
    JE done
    LEAQ (AX)(SI*8), AX // advance dst past the unpacked values
    SUBQ SI, DX         // number of values left to unpack
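
    // Unpack the remaining (fewer than 8) values with the scalar routine.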
tail:
    MOVQ AX, dst_base-56(SP)
    MOVQ DX, dst_len-48(SP)
    MOVQ BX, src_base-32(SP)
    MOVQ CX, bitWidth-8(SP)
    CALL ·unpackInt64Default(SB)
done:
    RET

GLOBL range0n7<>(SB), RODATA|NOPTR, $64
DATA range0n7<>+0(SB)/8,  $0
DATA range0n7<>+8(SB)/8,  $1
DATA range0n7<>+16(SB)/8, $2
DATA range0n7<>+24(SB)/8, $3
DATA range0n7<>+32(SB)/8, $4
DATA range0n7<>+40(SB)/8, $5
DATA range0n7<>+48(SB)/8, $6
DATA range0n7<>+56(SB)/8, $7