github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/encoding/bytestreamsplit/bytestreamsplit_amd64.s

//go:build !purego

#include "textflag.h"

// This file contains optimizations of the BYTE_STREAM_SPLIT encoding using AVX2
// and AVX512 (when available).
//
// The AVX2/512 instruction sets come with instructions to load memory from, or
// store memory at, sparse locations, called VPGATHER and VPSCATTER. VPGATHER
// was already available in the AVX2 instruction set; VPSCATTER was introduced
// in AVX512 (the AVX512 code paths below also require the AVX512_VBMI
// extension for byte permutations). Gathering bytes from sparse memory
// locations is useful during the decoding process since we are recomposing
// 32 or 64 bit floating point values from 4 or 8 bytes spread across the
// input byte array.
//
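// To make the layout concrete: with n values, byte j of value i is stored at
// position j*n+i of the encoded buffer. A minimal scalar sketch of the float
// encoding (a hypothetical helper, not part of this package's API):
//
//	func encodeFloatScalar(dst, src []byte) {
//		n := len(src) / 4 // number of float32 values
//		for i := 0; i < n; i++ {
//			for j := 0; j < 4; j++ {
//				// byte j of value i goes to position i of byte stream j
//				dst[j*n+i] = src[4*i+j]
//			}
//		}
//	}
//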
// To either deconstruct or reconstruct floating point values, we need to
// reorder the bytes of each value. If we have 4 32 bit floats, we can permute
// their bytes so that the first 4 byte group holds all the first bytes, the
// second group all the second bytes, etc. The VPSHUFB instruction is used to
// perform the byte permutation, or the VPERMB instruction for 64 bit floats.
//
// We use different instructions because VPSHUFB works on two independent
// lanes of 16 bytes when used on YMM registers. 4 32 bit floats take 16
// bytes, so a YMM register can hold two lanes of 4 32 bit floats and VPSHUFB
// can permute the two sets of values in a single invocation. For 64 bit
// floats we need to permute 8 values, which take 64 bytes and therefore must
// be held in a ZMM register, with the permutation applied across the entire
// register, which is only possible using VPERMB (see the sketch below).
//
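// Within one 16 byte lane the permutation is a 4x4 byte transpose, which is
// its own inverse; this is why encoding and decoding share the shuffle8x4
// table defined at the bottom of this file. An illustrative Go sketch of how
// the control bytes are derived (not part of the package):
//
//	var shuffle4x4 [16]byte
//	for k := range shuffle4x4 {
//		// entry k selects input byte (k%4)*4 + k/4, grouping the bytes
//		// of the 4 values by byte index
//		shuffle4x4[k] = byte((k%4)*4 + k/4)
//	}
//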
// Technically we could use ZMM registers for 32 bit floats as well, working
// on 16 values per iteration. However, measurements indicated that the
// latency of VPGATHERDD/VPSCATTERDD on ZMM registers did not provide any
// improvement to the throughput of the algorithms, while working on more
// values increased the code complexity. Using YMM registers offered the best
// balance between performance and maintainability.
//
// At a high level the vectorized algorithms are the following (the offset
// vectors driving the scatter and gather steps are sketched after the lists):
//
// encoding
// --------
//   * Load a vector of data from the input buffer
//   * Permute bytes, grouping bytes by index
//   * Scatter bytes of the register to the output buffer
//
// decoding
// --------
//   * Gather sparse bytes from the input buffer
//   * Permute bytes, reconstructing the original values
//   * Store the vector in the output buffer
//
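// For the 8x4 (float) case, each dword of the YMM register maps to its own
// position in the output/input streams. A sketch of the offset computation
// done by the VPMULLD/VPADDD setup below, with n the number of values
// (illustrative Go, mirroring the scale8x4/offset8x4 tables at the bottom of
// this file):
//
//	var offsets [8]uint32
//	for k := range offsets {
//		// scale8x4[k] selects the byte stream (0-3); offset8x4[k] selects
//		// the position of the 16 byte lane within the stream (0 or 4)
//		offsets[k] = scale8x4[k]*uint32(n) + offset8x4[k]
//	}
//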
// When AVX instructions are not available, the functions fall back to scalar
// implementations of the algorithms. These yield much lower throughput, but
// perform 20-30% better than the code generated by the Go compiler.
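//
// The scalar loops are the assembly equivalent of this kind of sketch
// (hypothetical Go, shown here for the float decoding case):
//
//	func decodeFloatScalar(dst, src []byte) {
//		n := len(dst) / 4
//		for i := 0; i < n; i++ {
//			for j := 0; j < 4; j++ {
//				// byte j of value i comes from position i of byte stream j
//				dst[4*i+j] = src[j*n+i]
//			}
//		}
//	}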

// func encodeFloat(dst, src []byte)
TEXT ·encodeFloat(SB), NOSPLIT, $0-48
    MOVQ src_base+24(FP), AX
    MOVQ src_len+32(FP), BX
    MOVQ dst_base+0(FP), DX

    MOVQ AX, CX
    ADDQ BX, CX // CX = end of src
    SHRQ $2, BX // BX = number of values = stream stride

    CMPQ BX, $0
    JE done

    CMPB ·encodeFloatHasAVX512(SB), $0
    JE loop1x4

    CMPQ BX, $8
    JB loop1x4

    // DI = end of the part of src that can be processed 8 values at a time
    MOVQ CX, DI
    SUBQ AX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    ADDQ AX, DI

    // Y2 = scatter offsets: scale8x4*BX + offset8x4 (see sketch above)
    VMOVDQU32 shuffle8x4<>(SB), Y0
    VPBROADCASTD BX, Y2
    VPMULLD scale8x4<>(SB), Y2, Y2
    VPADDD offset8x4<>(SB), Y2, Y2
loop8x4:
    // K1 = all ones; the scatter consumes (clears) the mask, so it must be
    // reset on every iteration
    KXORQ K1, K1, K1
    KNOTQ K1, K1

    VMOVDQU32 (AX), Y1
    VPSHUFB Y0, Y1, Y1 // 4x4 byte transpose in each 16 byte lane
    VPSCATTERDD Y1, K1, (DX)(Y2*1)

    ADDQ $32, AX
    ADDQ $8, DX
    CMPQ AX, DI
    JNE loop8x4
    VZEROUPPER

    CMPQ AX, CX
    JE done
loop1x4:
    // scalar tail: split the 4 bytes of one value across the 4 byte streams
    MOVL (AX), SI
    MOVQ DX, DI

    MOVB SI, (DI)
    SHRL $8, SI
    ADDQ BX, DI

    MOVB SI, (DI)
    SHRL $8, SI
    ADDQ BX, DI

    MOVB SI, (DI)
    SHRL $8, SI
    ADDQ BX, DI

    MOVB SI, (DI)

    ADDQ $4, AX
    INCQ DX
    CMPQ AX, CX
    JB loop1x4
done:
    RET

// func encodeDouble(dst, src []byte)
TEXT ·encodeDouble(SB), NOSPLIT, $0-48
    MOVQ src_base+24(FP), AX
    MOVQ src_len+32(FP), BX
    MOVQ dst_base+0(FP), DX

    MOVQ AX, CX
    ADDQ BX, CX // CX = end of src
    SHRQ $3, BX // BX = number of values = stream stride

    CMPQ BX, $0
    JE done

    CMPB ·encodeDoubleHasAVX512(SB), $0
    JE loop1x8

    CMPQ BX, $8
    JB loop1x8

    // DI = end of the part of src that can be processed 8 values at a time
    MOVQ CX, DI
    SUBQ AX, DI
    SHRQ $6, DI
    SHLQ $6, DI
    ADDQ AX, DI

    // Z2 = scatter offsets: scale8x8*BX, one qword per byte stream
    VMOVDQU64 shuffle8x8<>(SB), Z0
    VPBROADCASTQ BX, Z2
    VPMULLQ scale8x8<>(SB), Z2, Z2
loop8x8:
    // K1 = all ones; the scatter consumes (clears) the mask, so it must be
    // reset on every iteration
    KXORQ K1, K1, K1
    KNOTQ K1, K1

    VMOVDQU64 (AX), Z1
    VPERMB Z1, Z0, Z1 // 8x8 byte transpose across the full register
    VPSCATTERQQ Z1, K1, (DX)(Z2*1)

    ADDQ $64, AX
    ADDQ $8, DX
    CMPQ AX, DI
    JNE loop8x8
    VZEROUPPER

    CMPQ AX, CX
    JE done
loop1x8:
    // scalar tail: split the 8 bytes of one value across the 8 byte streams
    MOVQ (AX), SI
    MOVQ DX, DI

    MOVB SI, (DI)
    SHRQ $8, SI
    ADDQ BX, DI

    MOVB SI, (DI)
    SHRQ $8, SI
    ADDQ BX, DI

    MOVB SI, (DI)
    SHRQ $8, SI
    ADDQ BX, DI

    MOVB SI, (DI)
    SHRQ $8, SI
    ADDQ BX, DI

    MOVB SI, (DI)
    SHRQ $8, SI
    ADDQ BX, DI

    MOVB SI, (DI)
    SHRQ $8, SI
    ADDQ BX, DI

    MOVB SI, (DI)
    SHRQ $8, SI
    ADDQ BX, DI

    MOVB SI, (DI)

    ADDQ $8, AX
    INCQ DX
    CMPQ AX, CX
    JB loop1x8
done:
    RET

// func decodeFloat(dst, src []byte)
TEXT ·decodeFloat(SB), NOSPLIT, $0-48
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), BX
    MOVQ src_base+24(FP), DX

    MOVQ AX, CX
    ADDQ BX, CX // CX = end of dst
    SHRQ $2, BX // BX = number of values = stream stride

    CMPQ BX, $0
    JE done

    CMPB ·decodeFloatHasAVX2(SB), $0
    JE loop1x4

    CMPQ BX, $8
    JB loop1x4

    // DI = end of the part of dst that can be produced 8 values at a time
    MOVQ CX, DI
    SUBQ AX, DI
    SHRQ $5, DI
    SHLQ $5, DI
    ADDQ AX, DI

    // Y2 = gather offsets: scale8x4*BX + offset8x4; Y3 = all-ones mask kept
    // as a template because VPGATHERDD clears its mask operand
    MOVQ $0xFFFFFFFF, SI
    MOVQ BX, X5
    MOVQ SI, X6
    VMOVDQU shuffle8x4<>(SB), Y0
    VPBROADCASTD X5, Y2
    VPBROADCASTD X6, Y3
    VPMULLD scale8x4<>(SB), Y2, Y2
    VPADDD offset8x4<>(SB), Y2, Y2
    VMOVDQU Y3, Y4
loop8x4:
    VPGATHERDD Y4, (DX)(Y2*1), Y1
    VPSHUFB Y0, Y1, Y1 // 4x4 byte transpose in each 16 byte lane
    VMOVDQU Y1, (AX)
    VMOVDQU Y3, Y4 // reload the mask consumed by the gather

    ADDQ $32, AX
    ADDQ $8, DX
    CMPQ AX, DI
    JNE loop8x4
    VZEROUPPER

    CMPQ AX, CX
    JE done
loop1x4:
    // scalar tail: reassemble one value from the 4 byte streams
    MOVQ DX, DI
    MOVBLZX (DI), R8
    ADDQ BX, DI
    MOVBLZX (DI), R9
    ADDQ BX, DI
    MOVBLZX (DI), R10
    ADDQ BX, DI
    MOVBLZX (DI), R11

    SHLL $8, R9
    SHLL $16, R10
    SHLL $24, R11

    ORL R9, R8
    ORL R10, R8
    ORL R11, R8

    MOVL R8, (AX)

    ADDQ $4, AX
    INCQ DX
    CMPQ AX, CX
    JB loop1x4
done:
    RET

// func decodeDouble(dst, src []byte)
TEXT ·decodeDouble(SB), NOSPLIT, $0-48
    MOVQ dst_base+0(FP), AX
    MOVQ dst_len+8(FP), BX
    MOVQ src_base+24(FP), DX

    MOVQ AX, CX
    ADDQ BX, CX // CX = end of dst
    SHRQ $3, BX // BX = number of values = stream stride

    CMPQ BX, $0
    JE done

    CMPB ·decodeDoubleHasAVX512(SB), $0
    JE loop1x8

    CMPQ BX, $8
    JB loop1x8

    // DI = end of the part of dst that can be produced 8 values at a time
    MOVQ CX, DI
    SUBQ AX, DI
    SHRQ $6, DI
    SHLQ $6, DI
    ADDQ AX, DI

    // Z2 = gather offsets: scale8x8*BX, one qword per byte stream
    VMOVDQU64 shuffle8x8<>(SB), Z0
    VPBROADCASTQ BX, Z2
    VPMULLQ scale8x8<>(SB), Z2, Z2
loop8x8:
    // K1 = all ones; the gather consumes (clears) the mask, so it must be
    // reset on every iteration
    KXORQ K1, K1, K1
    KNOTQ K1, K1

    VPGATHERQQ (DX)(Z2*1), K1, Z1
    VPERMB Z1, Z0, Z1 // 8x8 byte transpose across the full register
    VMOVDQU64 Z1, (AX)

    ADDQ $64, AX
    ADDQ $8, DX
    CMPQ AX, DI
    JNE loop8x8
    VZEROUPPER

    CMPQ AX, CX
    JE done
loop1x8:
    // scalar tail: reassemble one value from the 8 byte streams
    MOVQ DX, DI
    XORQ R12, R12

    MOVBQZX (DI), R8
    ADDQ BX, DI
    MOVBQZX (DI), R9
    ADDQ BX, DI
    MOVBQZX (DI), R10
    ADDQ BX, DI
    MOVBQZX (DI), R11
    ADDQ BX, DI

    SHLQ $8, R9
    SHLQ $16, R10
    SHLQ $24, R11

    ORQ R8, R12
    ORQ R9, R12
    ORQ R10, R12
    ORQ R11, R12

    MOVBQZX (DI), R8
    ADDQ BX, DI
    MOVBQZX (DI), R9
    ADDQ BX, DI
    MOVBQZX (DI), R10
    ADDQ BX, DI
    MOVBQZX (DI), R11

    SHLQ $32, R8
    SHLQ $40, R9
    SHLQ $48, R10
    SHLQ $56, R11

    ORQ R8, R12
    ORQ R9, R12
    ORQ R10, R12
    ORQ R11, R12

    MOVQ R12, (AX)

    ADDQ $8, AX
    INCQ DX
    CMPQ AX, CX
    JB loop1x8
done:
    RET

// scale8x4 multiplies the value count to select one of the 4 byte streams
// for each of the 8 dwords (the pattern repeats for the two 16 byte lanes).
GLOBL scale8x4<>(SB), RODATA|NOPTR, $32
DATA scale8x4<>+0(SB)/4,  $0
DATA scale8x4<>+4(SB)/4,  $1
DATA scale8x4<>+8(SB)/4,  $2
DATA scale8x4<>+12(SB)/4, $3
DATA scale8x4<>+16(SB)/4, $0
DATA scale8x4<>+20(SB)/4, $1
DATA scale8x4<>+24(SB)/4, $2
DATA scale8x4<>+28(SB)/4, $3

// offset8x4 shifts the high lane by 4 positions within each byte stream,
// since the high lane carries values 4-7 of the group of 8.
GLOBL offset8x4<>(SB), RODATA|NOPTR, $32
DATA offset8x4<>+0(SB)/4,  $0
DATA offset8x4<>+4(SB)/4,  $0
DATA offset8x4<>+8(SB)/4,  $0
DATA offset8x4<>+12(SB)/4, $0
DATA offset8x4<>+16(SB)/4, $4
DATA offset8x4<>+20(SB)/4, $4
DATA offset8x4<>+24(SB)/4, $4
DATA offset8x4<>+28(SB)/4, $4

// shuffle8x4 is the VPSHUFB control performing a 4x4 byte transpose within
// each 16 byte lane; the transpose is its own inverse, so encoding and
// decoding share this table.
GLOBL shuffle8x4<>(SB), RODATA|NOPTR, $32
DATA shuffle8x4<>+0(SB)/4,  $0x0C080400
DATA shuffle8x4<>+4(SB)/4,  $0x0D090501
DATA shuffle8x4<>+8(SB)/4,  $0x0E0A0602
DATA shuffle8x4<>+12(SB)/4, $0x0F0B0703
DATA shuffle8x4<>+16(SB)/4, $0x0C080400
DATA shuffle8x4<>+20(SB)/4, $0x0D090501
DATA shuffle8x4<>+24(SB)/4, $0x0E0A0602
DATA shuffle8x4<>+28(SB)/4, $0x0F0B0703

// scale8x8 multiplies the value count to select one of the 8 byte streams
// for each of the 8 qwords.
GLOBL scale8x8<>(SB), RODATA|NOPTR, $64
DATA scale8x8<>+0(SB)/8,  $0
DATA scale8x8<>+8(SB)/8,  $1
DATA scale8x8<>+16(SB)/8, $2
DATA scale8x8<>+24(SB)/8, $3
DATA scale8x8<>+32(SB)/8, $4
DATA scale8x8<>+40(SB)/8, $5
DATA scale8x8<>+48(SB)/8, $6
DATA scale8x8<>+56(SB)/8, $7

// shuffle8x8 is the VPERMB control performing an 8x8 byte transpose across
// the full ZMM register (entry k selects input byte (k%8)*8 + k/8); like the
// 4x4 case it is its own inverse.
GLOBL shuffle8x8<>(SB), RODATA|NOPTR, $64
DATA shuffle8x8<>+0(SB)/8,  $0x3830282018100800
DATA shuffle8x8<>+8(SB)/8,  $0x3931292119110901
DATA shuffle8x8<>+16(SB)/8, $0x3A322A221A120A02
DATA shuffle8x8<>+24(SB)/8, $0x3B332B231B130B03
DATA shuffle8x8<>+32(SB)/8, $0x3C342C241C140C04
DATA shuffle8x8<>+40(SB)/8, $0x3D352D251D150D05
DATA shuffle8x8<>+48(SB)/8, $0x3E362E261E160E06
DATA shuffle8x8<>+56(SB)/8, $0x3F372F271F170F07