github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/page_min_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  // func minInt32(data []int32) int32
        //
        // minInt32 returns the smallest element of data using signed 32-bit
        // comparisons, or 0 when data is empty.
        //
        // When ·hasAVX512VL is non-zero and data holds at least 32 elements, the
        // largest multiple of 32 elements is consumed by the 512-bit vector loop
        // and the remainder by the scalar loop; otherwise only the scalar loop runs.
        //
        // Register usage:
        //   AX = base pointer of data        CX = number of elements
        //   BX = running minimum / result    SI = index of the next element
        //   DI = CX rounded down to a multiple of 32
        //   DX = scratch holding the candidate value
     6  TEXT ·minInt32(SB), NOSPLIT, $-28
     7      MOVQ data_base+0(FP), AX
     8      MOVQ data_len+8(FP), CX
     9      XORQ BX, BX // result stays 0 if the input is empty
    10  
    11      CMPQ CX, $0
    12      JE done
    13      XORQ SI, SI
    14      MOVLQZX (AX), BX // seed the running minimum with data[0]
    15  
    16      CMPB ·hasAVX512VL(SB), $0
    17      JE loop // flag is zero: scalar loop only
    18  
    19      CMPQ CX, $32
    20      JB loop // fewer than 32 elements: scalar loop only
    21  
    22      MOVQ CX, DI
    23      SHRQ $5, DI
    24      SHLQ $5, DI // DI = CX with the low 5 bits cleared
    25      VPBROADCASTD (AX), Z0 // every lane of the accumulator starts at data[0]
    26  loop32:
        // Each iteration folds 2 x 16 int32 lanes (32 elements) into Z0.
    27      VMOVDQU32 (AX)(SI*4), Z1
    28      VMOVDQU32 64(AX)(SI*4), Z2
    29      VPMINSD Z1, Z0, Z0
    30      VPMINSD Z2, Z0, Z0
    31      ADDQ $32, SI
    32      CMPQ SI, DI
    33      JNE loop32
    34  
        // Horizontal reduction of Z0: three permute-and-minimum steps on the
        // Y, X and X views of the registers, using index vectors read from
        // swap32 (defined outside this listing).
        // NOTE(review): presumably those index vectors swap register halves so
        // that the low 64 bits of X0 end up holding the last two candidates —
        // confirm against the swap32 definition.
    35      VMOVDQU32 swap32+0(SB), Z1
    36      VPERMI2D Z0, Z0, Z1
    37      VPMINSD Y1, Y0, Y0
    38  
    39      VMOVDQU32 swap32+32(SB), Y1
    40      VPERMI2D Y0, Y0, Y1
    41      VPMINSD X1, X0, X0
    42  
    43      VMOVDQU32 swap32+48(SB), X1
    44      VPERMI2D X0, X0, X1
    45      VPMINSD X1, X0, X0
    46      VZEROUPPER
    47  
        // Treat the low 64 bits of X0 as two 32-bit candidates and keep the
        // smaller one (signed) in BX.
    48      MOVQ X0, DX
    49      MOVL DX, BX
    50      SHRQ $32, DX
    51      CMPL DX, BX
    52      CMOVLLT DX, BX
    53  
    54      CMPQ SI, CX
    55      JE done // no tail left after the vector loop
    56  loop:
        // Scalar loop over elements SI..CX-1.
    57      MOVLQZX (AX)(SI*4), DX
    58      CMPL DX, BX
    59      CMOVLLT DX, BX // signed: take DX when DX < BX
    60      INCQ SI
    61      CMPQ SI, CX
    62      JNE loop
    63  done:
    64      MOVL BX, ret+24(FP)
    65      RET
    66  
    67  // func minInt64(data []int64) int64
        //
        // minInt64 returns the smallest element of data using signed 64-bit
        // comparisons, or 0 when data is empty.
        //
        // When ·hasAVX512VL is non-zero and data holds at least 32 elements, the
        // largest multiple of 32 elements is consumed by the 512-bit vector loop
        // and the remainder by the scalar loop; otherwise only the scalar loop runs.
        //
        // Register usage:
        //   AX = base pointer of data        CX = number of elements
        //   BX = running minimum / result    SI = index of the next element
        //   DI = CX rounded down to a multiple of 32
        //   DX = scratch holding the candidate value
    68  TEXT ·minInt64(SB), NOSPLIT, $-32
    69      MOVQ data_base+0(FP), AX
    70      MOVQ data_len+8(FP), CX
    71      XORQ BX, BX // result stays 0 if the input is empty
    72  
    73      CMPQ CX, $0
    74      JE done
    75      XORQ SI, SI
    76      MOVQ (AX), BX // seed the running minimum with data[0]
    77  
    78      CMPB ·hasAVX512VL(SB), $0
    79      JE loop // flag is zero: scalar loop only
    80  
    81      CMPQ CX, $32
    82      JB loop // fewer than 32 elements: scalar loop only
    83  
    84      MOVQ CX, DI
    85      SHRQ $5, DI
    86      SHLQ $5, DI // DI = CX with the low 5 bits cleared
    87      VPBROADCASTQ (AX), Z0 // every lane of the accumulator starts at data[0]
    88  loop32:
        // Each iteration loads 4 x 8 int64 lanes (32 elements), combines them
        // pairwise and folds the result into Z0.
    89      VMOVDQU64 (AX)(SI*8), Z1
    90      VMOVDQU64 64(AX)(SI*8), Z2
    91      VMOVDQU64 128(AX)(SI*8), Z3
    92      VMOVDQU64 192(AX)(SI*8), Z4
    93      VPMINSQ Z1, Z2, Z5
    94      VPMINSQ Z3, Z4, Z6
    95      VPMINSQ Z5, Z6, Z1
    96      VPMINSQ Z1, Z0, Z0
    97      ADDQ $32, SI
    98      CMPQ SI, DI
    99      JNE loop32
   100  
        // Horizontal reduction of Z0: three permute-and-minimum steps on the
        // Y, X and X views of the registers, using 32-bit index vectors read
        // from swap32 (defined outside this listing).
        // NOTE(review): presumably those index vectors swap register halves so
        // that the low 64 bits of X0 end up holding the overall minimum —
        // confirm against the swap32 definition.
   101      VMOVDQU32 swap32+0(SB), Z1
   102      VPERMI2D Z0, Z0, Z1
   103      VPMINSQ Y1, Y0, Y0
   104  
   105      VMOVDQU32 swap32+32(SB), Y1
   106      VPERMI2D Y0, Y0, Y1
   107      VPMINSQ X1, X0, X0
   108  
   109      VMOVDQU32 swap32+48(SB), X1
   110      VPERMI2D X0, X0, X1
   111      VPMINSQ X1, X0, X0
   112      VZEROUPPER
   113  
   114      MOVQ X0, BX // low 64 bits of X0 become the running minimum
   115      CMPQ SI, CX
   116      JE done // no tail left after the vector loop
   117  loop:
        // Scalar loop over elements SI..CX-1.
   118      MOVQ (AX)(SI*8), DX
   119      CMPQ DX, BX
   120      CMOVQLT DX, BX // signed: take DX when DX < BX
   121      INCQ SI
   122      CMPQ SI, CX
   123      JNE loop
   124  done:
   125      MOVQ BX, ret+24(FP)
   126      RET
   127  
   128  // func minUint32(data []uint32) uint32
        //
        // NOTE(review): the signature above is written with unsigned types because
        // the routine uses unsigned operations throughout (VPMINUD, CMOVLCS) —
        // confirm against the Go declaration, which is not part of this listing.
        //
        // minUint32 returns the smallest element of data using unsigned 32-bit
        // comparisons, or 0 when data is empty.
        //
        // When ·hasAVX512VL is non-zero and data holds at least 32 elements, the
        // largest multiple of 32 elements is consumed by the 512-bit vector loop
        // and the remainder by the scalar loop; otherwise only the scalar loop runs.
        //
        // Register usage:
        //   AX = base pointer of data        CX = number of elements
        //   BX = running minimum / result    SI = index of the next element
        //   DI = CX rounded down to a multiple of 32
        //   DX = scratch holding the candidate value
   129  TEXT ·minUint32(SB), NOSPLIT, $-28
   130      MOVQ data_base+0(FP), AX
   131      MOVQ data_len+8(FP), CX
   132      XORQ BX, BX // result stays 0 if the input is empty
   133  
   134      CMPQ CX, $0
   135      JE done
   136      XORQ SI, SI
   137      MOVLQZX (AX), BX // seed the running minimum with data[0]
   138  
   139      CMPB ·hasAVX512VL(SB), $0
   140      JE loop // flag is zero: scalar loop only
   141  
   142      CMPQ CX, $32
   143      JB loop // fewer than 32 elements: scalar loop only
   144  
   145      MOVQ CX, DI
   146      SHRQ $5, DI
   147      SHLQ $5, DI // DI = CX with the low 5 bits cleared
   148      VPBROADCASTD (AX), Z0 // every lane of the accumulator starts at data[0]
   149  loop32:
        // Each iteration folds 2 x 16 uint32 lanes (32 elements) into Z0.
   150      VMOVDQU32 (AX)(SI*4), Z1
   151      VMOVDQU32 64(AX)(SI*4), Z2
   152      VPMINUD Z1, Z0, Z0
   153      VPMINUD Z2, Z0, Z0
   154      ADDQ $32, SI
   155      CMPQ SI, DI
   156      JNE loop32
   157  
        // Horizontal reduction of Z0: three permute-and-minimum steps on the
        // Y, X and X views of the registers, using index vectors read from
        // swap32 (defined outside this listing).
        // NOTE(review): presumably those index vectors swap register halves so
        // that the low 64 bits of X0 end up holding the last two candidates —
        // confirm against the swap32 definition.
   158      VMOVDQU32 swap32+0(SB), Z1
   159      VPERMI2D Z0, Z0, Z1
   160      VPMINUD Y1, Y0, Y0
   161  
   162      VMOVDQU32 swap32+32(SB), Y1
   163      VPERMI2D Y0, Y0, Y1
   164      VPMINUD X1, X0, X0
   165  
   166      VMOVDQU32 swap32+48(SB), X1
   167      VPERMI2D X0, X0, X1
   168      VPMINUD X1, X0, X0
   169      VZEROUPPER
   170  
        // Treat the low 64 bits of X0 as two 32-bit candidates and keep the
        // smaller one (unsigned) in BX.
   171      MOVQ X0, DX
   172      MOVL DX, BX
   173      SHRQ $32, DX
   174      CMPL DX, BX
   175      CMOVLCS DX, BX
   176  
   177      CMPQ SI, CX
   178      JE done // no tail left after the vector loop
   179  loop:
        // Scalar loop over elements SI..CX-1.
   180      MOVLQZX (AX)(SI*4), DX
   181      CMPL DX, BX
   182      CMOVLCS DX, BX // unsigned: take DX when the compare sets carry (DX below BX)
   183      INCQ SI
   184      CMPQ SI, CX
   185      JNE loop
   186  done:
   187      MOVL BX, ret+24(FP)
   188      RET
   189  
   190  // func minUint64(data []uint64) uint64
        //
        // minUint64 returns the smallest element of data using unsigned 64-bit
        // comparisons, or 0 when data is empty.
        //
        // When ·hasAVX512VL is non-zero and data holds at least 32 elements, the
        // largest multiple of 32 elements is consumed by the 512-bit vector loop
        // and the remainder by the scalar loop; otherwise only the scalar loop runs.
        //
        // Register usage:
        //   AX = base pointer of data        CX = number of elements
        //   BX = running minimum / result    SI = index of the next element
        //   DI = CX rounded down to a multiple of 32
        //   DX = scratch holding the candidate value
   191  TEXT ·minUint64(SB), NOSPLIT, $-32
   192      MOVQ data_base+0(FP), AX
   193      MOVQ data_len+8(FP), CX
   194      XORQ BX, BX // result stays 0 if the input is empty
   195  
   196      CMPQ CX, $0
   197      JE done
   198      XORQ SI, SI
   199      MOVQ (AX), BX // seed the running minimum with data[0]
   200  
   201      CMPB ·hasAVX512VL(SB), $0
   202      JE loop // flag is zero: scalar loop only
   203  
   204      CMPQ CX, $32
   205      JB loop // fewer than 32 elements: scalar loop only
   206  
   207      MOVQ CX, DI
   208      SHRQ $5, DI
   209      SHLQ $5, DI // DI = CX with the low 5 bits cleared
   210      VPBROADCASTQ (AX), Z0 // every lane of the accumulator starts at data[0]
   211  loop32:
        // Each iteration loads 4 x 8 uint64 lanes (32 elements), combines them
        // pairwise and folds the result into Z0.
   212      VMOVDQU64 (AX)(SI*8), Z1
   213      VMOVDQU64 64(AX)(SI*8), Z2
   214      VMOVDQU64 128(AX)(SI*8), Z3
   215      VMOVDQU64 192(AX)(SI*8), Z4
   216      VPMINUQ Z1, Z2, Z5
   217      VPMINUQ Z3, Z4, Z6
   218      VPMINUQ Z5, Z6, Z1
   219      VPMINUQ Z1, Z0, Z0
   220      ADDQ $32, SI
   221      CMPQ SI, DI
   222      JNE loop32
   223  
        // Horizontal reduction of Z0: three permute-and-minimum steps on the
        // Y, X and X views of the registers, using 32-bit index vectors read
        // from swap32 (defined outside this listing).
        // NOTE(review): presumably those index vectors swap register halves so
        // that the low 64 bits of X0 end up holding the overall minimum —
        // confirm against the swap32 definition.
   224      VMOVDQU32 swap32+0(SB), Z1
   225      VPERMI2D Z0, Z0, Z1
   226      VPMINUQ Y1, Y0, Y0
   227  
   228      VMOVDQU32 swap32+32(SB), Y1
   229      VPERMI2D Y0, Y0, Y1
   230      VPMINUQ X1, X0, X0
   231  
   232      VMOVDQU32 swap32+48(SB), X1
   233      VPERMI2D X0, X0, X1
   234      VPMINUQ X1, X0, X0
   235      VZEROUPPER
   236  
   237      MOVQ X0, BX // low 64 bits of X0 become the running minimum
   238      CMPQ SI, CX
   239      JE done // no tail left after the vector loop
   240  loop:
        // Scalar loop over elements SI..CX-1.
   241      MOVQ (AX)(SI*8), DX
   242      CMPQ DX, BX
   243      CMOVQCS DX, BX // unsigned: take DX when the compare sets carry (DX below BX)
   244      INCQ SI
   245      CMPQ SI, CX
   246      JNE loop
   247  done:
   248      MOVQ BX, ret+24(FP)
   249      RET
   250  
   251  // func minFloat32(data []float32) float32
        //
        // minFloat32 returns the smallest element of data, or the all-zero bit
        // pattern when data is empty.
        //
        // When ·hasAVX512VL is non-zero and data holds at least 64 elements, the
        // largest multiple of 64 elements is consumed by the 512-bit vector loop
        // and the remainder by the scalar loop; otherwise only the scalar loop runs.
        //
        // The scalar code keeps the bit pattern of the current minimum in BX and
        // a copy in X0 so that UCOMISS can compare it with the candidate in X1.
        //
        // NOTE(review): UCOMISS also sets the carry flag for unordered (NaN)
        // operands, so the CMOVLCS below can select a NaN candidate; the vector
        // path uses VMINPS instead. Confirm the intended NaN semantics.
        //
        // Register usage:
        //   AX = base pointer of data        CX = number of elements
        //   BX = bits of the running minimum SI = index of the next element
        //   DI = CX rounded down to a multiple of 64
        //   DX = bits of the candidate value
   252  TEXT ·minFloat32(SB), NOSPLIT, $-28
   253      MOVQ data_base+0(FP), AX
   254      MOVQ data_len+8(FP), CX
   255      XORQ BX, BX // result stays 0 if the input is empty
   256  
   257      CMPQ CX, $0
   258      JE done
   259      XORPS X0, X0
   260      XORPS X1, X1
   261      XORQ SI, SI
   262      MOVLQZX (AX), BX // seed the running minimum with the bits of data[0]
   263      MOVQ BX, X0 // X0 mirrors BX for the floating-point compares
   264  
   265      CMPB ·hasAVX512VL(SB), $0
   266      JE loop // flag is zero: scalar loop only
   267  
   268      CMPQ CX, $64
   269      JB loop // fewer than 64 elements: scalar loop only
   270  
   271      MOVQ CX, DI
   272      SHRQ $6, DI
   273      SHLQ $6, DI // DI = CX with the low 6 bits cleared
   274      VPBROADCASTD (AX), Z0 // every lane of the accumulator starts at data[0]
   275  loop64:
        // Each iteration loads 4 x 16 float32 lanes (64 elements), combines
        // them pairwise and folds the result into Z0.
   276      VMOVDQU32 (AX)(SI*4), Z1
   277      VMOVDQU32 64(AX)(SI*4), Z2
   278      VMOVDQU32 128(AX)(SI*4), Z3
   279      VMOVDQU32 192(AX)(SI*4), Z4
   280      VMINPS Z1, Z2, Z5
   281      VMINPS Z3, Z4, Z6
   282      VMINPS Z5, Z6, Z1
   283      VMINPS Z1, Z0, Z0
   284      ADDQ $64, SI
   285      CMPQ SI, DI
   286      JNE loop64
   287  
        // Horizontal reduction of Z0: three permute-and-minimum steps on the
        // Y, X and X views of the registers, using index vectors read from
        // swap32 (defined outside this listing).
        // NOTE(review): presumably those index vectors swap register halves so
        // that the low 64 bits of X0 end up holding the last two candidates —
        // confirm against the swap32 definition.
   288      VMOVDQU32 swap32+0(SB), Z1
   289      VPERMI2D Z0, Z0, Z1
   290      VMINPS Y1, Y0, Y0
   291  
   292      VMOVDQU32 swap32+32(SB), Y1
   293      VPERMI2D Y0, Y0, Y1
   294      VMINPS X1, X0, X0
   295  
   296      VMOVDQU32 swap32+48(SB), X1
   297      VPERMI2D X0, X0, X1
   298      VMINPS X1, X0, X0
   299      VZEROUPPER
   300  
        // Compare the two float32 values held in the low 64 bits of X0: X1
        // receives the second one shifted down to lane 0, BX/DX hold the bits.
   301      MOVAPS X0, X1
   302      PSRLQ $32, X1
   303      MOVQ X0, BX
   304      MOVQ X1, DX
   305      UCOMISS X0, X1
   306      CMOVLCS DX, BX
   307  
   308      CMPQ SI, CX
   309      JE done // no tail left after the vector loop
   310      MOVQ BX, X0 // reload the current minimum for the scalar loop
   311  loop:
        // Scalar loop over elements SI..CX-1.
   312      MOVLQZX (AX)(SI*4), DX
   313      MOVQ DX, X1
   314      UCOMISS X0, X1
   315      CMOVLCS DX, BX // take the candidate when the compare sets carry
   316      MOVQ BX, X0 // keep X0 in sync with BX
   317      INCQ SI
   318      CMPQ SI, CX
   319      JNE loop
   320  done:
   321      MOVL BX, ret+24(FP)
   322      RET
   323  
   324  // func minFloat64(data []float64) float64
        //
        // minFloat64 returns the smallest element of data, or the all-zero bit
        // pattern when data is empty.
        //
        // When ·hasAVX512VL is non-zero and data holds at least 32 elements, the
        // largest multiple of 32 elements is consumed by the 512-bit vector loop
        // and the remainder by the scalar loop; otherwise only the scalar loop runs.
        //
        // The scalar code keeps the bit pattern of the current minimum in BX and
        // a copy in X0 so that UCOMISD can compare it with the candidate in X1.
        //
        // NOTE(review): UCOMISD also sets the carry flag for unordered (NaN)
        // operands, so the CMOVQCS below can select a NaN candidate; the vector
        // path uses VMINPD instead. Confirm the intended NaN semantics.
        //
        // Register usage:
        //   AX = base pointer of data        CX = number of elements
        //   BX = bits of the running minimum SI = index of the next element
        //   DI = CX rounded down to a multiple of 32
        //   DX = bits of the candidate value
   325  TEXT ·minFloat64(SB), NOSPLIT, $-32
   326      MOVQ data_base+0(FP), AX
   327      MOVQ data_len+8(FP), CX
   328      XORQ BX, BX // result stays 0 if the input is empty
   329  
   330      CMPQ CX, $0
   331      JE done
   332      XORPD X0, X0
   333      XORPD X1, X1
   334      XORQ SI, SI
   335      MOVQ (AX), BX // seed the running minimum with the bits of data[0]
   336      MOVQ BX, X0 // X0 mirrors BX for the floating-point compares
   337  
   338      CMPB ·hasAVX512VL(SB), $0
   339      JE loop // flag is zero: scalar loop only
   340  
   341      CMPQ CX, $32
   342      JB loop // fewer than 32 elements: scalar loop only
   343  
   344      MOVQ CX, DI
   345      SHRQ $5, DI
   346      SHLQ $5, DI // DI = CX with the low 5 bits cleared
   347      VPBROADCASTQ (AX), Z0 // every lane of the accumulator starts at data[0]
   348  loop32:
        // Each iteration loads 4 x 8 float64 lanes (32 elements), combines
        // them pairwise and folds the result into Z0.
   349      VMOVDQU64 (AX)(SI*8), Z1
   350      VMOVDQU64 64(AX)(SI*8), Z2
   351      VMOVDQU64 128(AX)(SI*8), Z3
   352      VMOVDQU64 192(AX)(SI*8), Z4
   353      VMINPD Z1, Z2, Z5
   354      VMINPD Z3, Z4, Z6
   355      VMINPD Z5, Z6, Z1
   356      VMINPD Z1, Z0, Z0
   357      ADDQ $32, SI
   358      CMPQ SI, DI
   359      JNE loop32
   360  
        // Horizontal reduction of Z0: three permute-and-minimum steps on the
        // Y, X and X views of the registers, using 32-bit index vectors read
        // from swap32 (defined outside this listing).
        // NOTE(review): presumably those index vectors swap register halves so
        // that the low 64 bits of X0 end up holding the overall minimum —
        // confirm against the swap32 definition.
   361      VMOVDQU64 swap32+0(SB), Z1
   362      VPERMI2D Z0, Z0, Z1
   363      VMINPD Y1, Y0, Y0
   364  
   365      VMOVDQU64 swap32+32(SB), Y1
   366      VPERMI2D Y0, Y0, Y1
   367      VMINPD X1, X0, X0
   368  
   369      VMOVDQU64 swap32+48(SB), X1
   370      VPERMI2D X0, X0, X1
   371      VMINPD X1, X0, X0
   372      VZEROUPPER
   373  
   374      MOVQ X0, BX // low 64 bits of X0 become the running minimum
   375      CMPQ SI, CX
   376      JE done // no tail left after the vector loop
   377  loop:
        // Scalar loop over elements SI..CX-1.
   378      MOVQ (AX)(SI*8), DX
   379      MOVQ DX, X1
   380      UCOMISD X0, X1
   381      CMOVQCS DX, BX // take the candidate when the compare sets carry
   382      MOVQ BX, X0 // keep X0 in sync with BX
   383      INCQ SI
   384      CMPQ SI, CX
   385      JNE loop
   386  done:
   387      MOVQ BX, ret+24(FP)
   388      RET
   389  
   390  // vpminu128 is a macro comparing unsigned 128 bits values held in the
   391  // `srcValues` and `minValues` vectors. The `srcIndexes` and `minIndexes`
   392  // vectors contain the indexes of elements in the value vectors. Remaining
   393  // K and R arguments are mask and general purpose registers needed to hold
   394  // temporary values during the computation. The last M argument is a mask
   395  // generated by vpminu128mask.
   396  //
   397  // The routine uses AVX-512 instructions (VPCMPUQ, VPBLENDMQ) to implement
   398  // the comparison of 128 bits values. The values are expected to be stored
   399  // in the vectors as a little-endian pair of two consecutive quad words.
   400  //
   401  // The results are written to the `minValues` and `minIndexes` vectors,
   402  // overwriting the inputs. `srcValues` and `srcIndexes` are read-only
   403  // parameters.
   404  //
   405  // At a high level, for two pairs of quad words forming two 128 bits values
   406  // A and B, the test implemented by this macro is:
   407  //
   408  //   A[1] < B[1] || (A[1] == B[1] && A[0] < B[0])
   409  //
   410  // Values in the source vector that evaluate to true on this expression are
   411  // written to the vector of minimum values, and their indexes are written to
   412  // the vector of indexes.
        //
        // The per-quad-word "equal" and "less than" masks are moved to general
        // purpose registers and combined there: the "less than" bit of the low
        // quad word is shifted onto the position of the high quad word, ANDed
        // with the "equal" bit of the high quad word, ORed with the "less than"
        // bit of the high quad word, filtered by M, and finally copied down to
        // the low quad word position so both halves of a pair are blended
        // together.
        //
        // Clobbers: K1, K2, R1, R2, R3 and the flags. M is only read. Only the
        // low 8 bits of the mask and general purpose registers are used.
   413  #define vpminu128(srcValues, srcIndexes, minValues, minIndexes, K1, K2, R1, R2, R3, M) \
   414      VPCMPUQ $0, minValues, srcValues, K1 \
   415      VPCMPUQ $1, minValues, srcValues, K2 \
   416      KMOVB K1, R1 \
   417      KMOVB K2, R2 \
   418      MOVB R2, R3 \
   419      SHLB $1, R3 \
   420      ANDB R3, R1 \
   421      ORB R2, R1 \
   422      ANDB M, R1 \
   423      MOVB R1, R2 \
   424      SHRB $1, R2 \
   425      ORB R2, R1 \
   426      KMOVB R1, K1 \
   427      VPBLENDMQ srcValues, minValues, K1, minValues \
   428      VPBLENDMQ srcIndexes, minIndexes, K1, minIndexes
   429  
   430  // vpminu128mask is a macro used to initialize the mask passed as last argument
   431  // to vpminu128. The argument M is intended to be a general purpose register.
   432  //
   433  // The bit mask is used to merge the results of the "less than" and "equal"
   434  // comparisons that are performed on each lane of the minimum vectors. Only the
   435  // upper bit of each pair of bits is set: it selects the result computed for the
   436  // upper quad word of each 128 bits element, which determines which element of
        // the pair is the minimum.
   437  #define vpminu128mask(M) MOVB $0b10101010, M
   438  
   439  // func minBE128(data [][16]byte) []byte
        //
        // minBE128 returns a slice of length and capacity 16 that points at the
        // smallest element of data, where each element is compared as an unsigned
        // 128 bits big-endian integer. It returns a slice with a zero pointer,
        // length and capacity when data is empty.
        //
        // When the input is at least 256 bytes long and ·hasAVX512MinMaxBE128 is
        // non-zero, the largest multiple of 256 bytes is scanned with AVX-512
        // (16 elements per iteration); the rest is scanned by the scalar loop.
        //
        // Register usage:
        //   AX = pointer to the element being examined
        //   BX = pointer to the current minimum
        //   CX = pointer one past the last element
        //   DX = byte length, then end of the vectorized region, then scratch
        //   DI = scratch, then the mask produced by vpminu128mask
        //   R8-R13, K1-K4 = temporaries (vpminu128 and the scalar compare)
   440  TEXT ·minBE128(SB), NOSPLIT, $-48
   441      MOVQ data_base+0(FP), AX
   442      MOVQ data_len+8(FP), CX
   443      CMPQ CX, $0
   444      JE null
   445  
   446      SHLQ $4, CX // element count -> byte count (16 bytes per element)
   447      MOVQ CX, DX // DX = length of the input in bytes
   448      MOVQ AX, BX // BX = current minimum, starts at the first element
   449      ADDQ AX, CX // CX = end of the input
   450  
   451      CMPQ DX, $256
   452      JB loop // less than one full vector iteration: scalar loop only
   453  
   454      CMPB ·hasAVX512MinMaxBE128(SB), $0
   455      JE loop // flag is zero: scalar loop only
   456  
   457      // Z19 holds a vector of the count by which we increment the vectors of
   458      // indexes at each loop iteration.
   459      MOVQ $16, DI
   460      VPBROADCASTQ DI, Z19
   461  
   462      // Z31 holds the shuffle mask used to convert 128 bits elements from big to
   463      // little endian so we can apply vectorized comparison instructions.
   464      VMOVDQU64 bswap128(SB), Z31
   465  
   466      // These vectors hold four lanes of minimum values found in the input.
   467      VBROADCASTI64X2 (AX), Z0
   468      VPSHUFB Z31, Z0, Z0
   469      VMOVDQU64 Z0, Z5
   470      VMOVDQU64 Z0, Z10
   471      VMOVDQU64 Z0, Z15
   472  
   473      // These vectors hold four lanes of indexes of minimum values.
   474      //
   475      // We initialize them at zero because we broadcast the first value of the
   476      // input in the vectors that track the minimums of each lane; in other
   477      // words, we assume the minimum value is at the first offset and work our
   478      // way up from there.
   479      VPXORQ Z2, Z2, Z2
   480      VPXORQ Z7, Z7, Z7
   481      VPXORQ Z12, Z12, Z12
   482      VPXORQ Z17, Z17, Z17
   483  
   484      // These vectors are used to compute the indexes of minimum values held
   485      // in [Z0, Z5, Z10, Z15]. Each vector holds a contiguous sequence of
   486      // indexes; for example, Z3 is initialized with [0, 1, 2, 3]. At each
   487      // loop iteration, the indexes are incremented by the number of elements
   488      // consumed from the input (4x4=16).
   489      VMOVDQU64 indexes128(SB), Z3
   490      VPXORQ Z8, Z8, Z8
   491      VPXORQ Z13, Z13, Z13
   492      VPXORQ Z18, Z18, Z18
   493      MOVQ $4, DI
   494      VPBROADCASTQ DI, Z1
   495      VPADDQ Z1, Z3, Z8 // Z8 = Z3 + 4
   496      VPADDQ Z1, Z8, Z13 // Z13 = Z8 + 4
   497      VPADDQ Z1, Z13, Z18 // Z18 = Z13 + 4
   498  
   499      vpminu128mask(DI)
   500      SHRQ $8, DX
   501      SHLQ $8, DX // DX = byte length rounded down to a multiple of 256
   502      ADDQ AX, DX // DX = end of the region handled by the vector loop
   503  loop16:
   504      // Compute 4x4 minimum values in vector registers, along with their indexes
   505      // in the input array.
   506      VMOVDQU64 (AX), Z1
   507      VMOVDQU64 64(AX), Z6
   508      VMOVDQU64 128(AX), Z11
   509      VMOVDQU64 192(AX), Z16
   510      VPSHUFB Z31, Z1, Z1
   511      VPSHUFB Z31, Z6, Z6
   512      VPSHUFB Z31, Z11, Z11
   513      VPSHUFB Z31, Z16, Z16
   514      vpminu128(Z1, Z3, Z0, Z2, K1, K2, R8, R9, R10, DI)
   515      vpminu128(Z6, Z8, Z5, Z7, K3, K4, R11, R12, R13, DI)
   516      vpminu128(Z11, Z13, Z10, Z12, K1, K2, R8, R9, R10, DI)
   517      vpminu128(Z16, Z18, Z15, Z17, K3, K4, R11, R12, R13, DI)
   518      VPADDQ Z19, Z3, Z3
   519      VPADDQ Z19, Z8, Z8
   520      VPADDQ Z19, Z13, Z13
   521      VPADDQ Z19, Z18, Z18
   522      ADDQ $256, AX
   523      CMPQ AX, DX
   524      JB loop16
   525  
   526      // After the loop completed, we need to merge the lanes that each contain
   527      // 4 minimum values (so 16 total candidates at this stage). The results are
   528      // reduced into 4 candidates in Z0, with their indexes in Z2.
   529      vpminu128(Z10, Z12, Z0, Z2, K1, K2, R8, R9, R10, DI)
   530      vpminu128(Z15, Z17, Z5, Z7, K3, K4, R11, R12, R13, DI)
   531      vpminu128(Z5, Z7, Z0, Z2, K1, K2, R8, R9, R10, DI)
   532  
   533      // Further reduce the results by swapping the upper and lower parts of the
   534      // vector registers, and comparing them to determine which values are the
   535      // smallest. We compare 2x2 values at this step, then 2x1 values at the next
   536      // to find the index of the minimum.
   537      VMOVDQU64 swap64+0(SB), Z1
   538      VMOVDQU64 swap64+0(SB), Z3
   539      VPERMI2Q Z0, Z0, Z1
   540      VPERMI2Q Z2, Z2, Z3
   541      vpminu128(Y1, Y3, Y0, Y2, K1, K2, R8, R9, R10, DI)
   542  
   543      VMOVDQU64 swap64+32(SB), Y1
   544      VMOVDQU64 swap64+32(SB), Y3
   545      VPERMI2Q Y0, Y0, Y1
   546      VPERMI2Q Y2, Y2, Y3
   547      vpminu128(X1, X3, X0, X2, K1, K2, R8, R9, R10, DI)
   548      VZEROUPPER
   549  
   550      // Extract the index of the minimum value computed in the lower 64 bits of
   551      // X2 and position the BX pointer at the index of the minimum value.
   552      MOVQ X2, DX
   553      SHLQ $4, DX // element index -> byte offset
   554      ADDQ DX, BX
   555      CMPQ AX, CX
   556      JE done
   557  
   558      // Unless the input length was a multiple of 256 bytes, we need to perform
   559      // a few more iterations on the remaining elements.
   560      //
   561      // This loop is also taken if the input is shorter than 256 bytes or if
        // the ·hasAVX512MinMaxBE128 flag is zero.
   562  loop:
        // Compare the element at AX with the current minimum at BX: first the
        // leading 8 bytes (most significant in big-endian order), byte-swapped
        // so that an unsigned integer compare can be used, then the trailing
        // 8 bytes if the leading ones are equal.
   563      MOVQ (AX), R8
   564      MOVQ (BX), R9
   565      BSWAPQ R8
   566      BSWAPQ R9
   567      CMPQ R8, R9
   568      JB less
   569      JA next
   570      MOVQ 8(AX), R8
   571      MOVQ 8(BX), R9
   572      BSWAPQ R8
   573      BSWAPQ R9
   574      CMPQ R8, R9
   575      JAE next // equal or greater: keep the current minimum
   576  less:
   577      MOVQ AX, BX // the element at AX becomes the new minimum
   578  next:
   579      ADDQ $16, AX
   580      CMPQ AX, CX
   581      JB loop
   582  done:
        // Return the 16 bytes at BX as a slice with length and capacity 16.
   583      MOVQ BX, ret_base+24(FP)
   584      MOVQ $16, ret_len+32(FP)
   585      MOVQ $16, ret_cap+40(FP)
   586      RET
   587  null:
        // Empty input: return a slice with zero pointer, length and capacity.
   588      XORQ BX, BX
   589      MOVQ BX, ret_base+24(FP)
   590      MOVQ BX, ret_len+32(FP)
   591      MOVQ BX, ret_cap+40(FP)
   592      RET