github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/page_max_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  // func maxInt32(data []int32) int32
        //
        // maxInt32 returns the largest element of data, comparing elements as signed
        // 32-bit integers. It returns 0 when data is empty.
        //
        // Register roles:
        //   AX = base pointer of data
        //   CX = len(data)
        //   SI = index of the next element to read
        //   BX = running maximum (stored as the return value)
        //   DI = len(data) rounded down to a multiple of 32 (vector loop bound)
        //
        // NOTE(review): the frame specifier is written `$-28` rather than the more
        // common `$0-28` form; confirm this is intentional.
     6  TEXT ·maxInt32(SB), NOSPLIT, $-28
     7      MOVQ data_base+0(FP), AX
     8      MOVQ data_len+8(FP), CX
     9      XORQ BX, BX
    10  
            // Empty input: return the zero that was just placed in BX.
    11      CMPQ CX, $0
    12      JE done
    13      XORQ SI, SI
            // Seed the running maximum with data[0].
    14      MOVLQZX (AX), BX
    15  
            // Use the scalar loop when the AVX-512 flag is clear or when there are
            // fewer than 32 elements.
    16      CMPB ·hasAVX512VL(SB), $0
    17      JE loop
    18  
    19      CMPQ CX, $32
    20      JB loop
    21  
            // DI = CX with its low 5 bits cleared; Z0 starts with data[0] in every
            // 32-bit lane.
    22      MOVQ CX, DI
    23      SHRQ $5, DI
    24      SHLQ $5, DI
    25      VPBROADCASTD (AX), Z0
    26  loop32:
            // Each iteration folds two 64-byte vectors (32 elements) into Z0.
    27      VMOVDQU32 (AX)(SI*4), Z1
    28      VMOVDQU32 64(AX)(SI*4), Z2
    29      VPMAXSD Z1, Z0, Z0
    30      VPMAXSD Z2, Z0, Z0
    31      ADDQ $32, SI
    32      CMPQ SI, DI
    33      JNE loop32
    34  
            // Horizontal reduction of Z0: permute the accumulator with a swap32
            // table, then take the lane-wise maximum at half the previous width.
            // NOTE(review): swap32 is defined outside this listing; presumably each
            // table exchanges the two halves of the vector - confirm.
    35      VMOVDQU32 swap32+0(SB), Z1
    36      VPERMI2D Z0, Z0, Z1
    37      VPMAXSD Y1, Y0, Y0
    38  
    39      VMOVDQU32 swap32+32(SB), Y1
    40      VPERMI2D Y0, Y0, Y1
    41      VPMAXSD X1, X0, X0
    42  
    43      VMOVDQU32 swap32+48(SB), X1
    44      VPERMI2D X0, X0, X1
    45      VPMAXSD X1, X0, X0
    46      VZEROUPPER
    47  
            // The low 64 bits of X0 are treated as the last two 32-bit candidates:
            // BX takes the lower one, DX the upper one, and the signed compare
            // keeps the larger in BX.
    48      MOVQ X0, DX
    49      MOVL DX, BX
    50      SHRQ $32, DX
    51      CMPL DX, BX
    52      CMOVLGT DX, BX
    53  
            // Done if the vector loop consumed every element.
    54      CMPQ SI, CX
    55      JE done
    56  loop:
            // Scalar loop: BX = max(BX, data[SI]) using a signed 32-bit compare.
    57      MOVLQZX (AX)(SI*4), DX
    58      CMPL DX, BX
    59      CMOVLGT DX, BX
    60      INCQ SI
    61      CMPQ SI, CX
    62      JNE loop
    63  done:
    64      MOVL BX, ret+24(FP)
    65      RET
    66  
    67  // func maxInt64(data []int64) int64
        //
        // maxInt64 returns the largest element of data, comparing elements as signed
        // 64-bit integers. It returns 0 when data is empty.
        //
        // Register roles:
        //   AX = base pointer of data
        //   CX = len(data)
        //   SI = index of the next element to read
        //   BX = running maximum (stored as the return value)
        //   DI = len(data) rounded down to a multiple of 32 (vector loop bound)
    68  TEXT ·maxInt64(SB), NOSPLIT, $-32
    69      MOVQ data_base+0(FP), AX
    70      MOVQ data_len+8(FP), CX
    71      XORQ BX, BX
    72  
            // Empty input: return the zero that was just placed in BX.
    73      CMPQ CX, $0
    74      JE done
    75      XORQ SI, SI
            // Seed the running maximum with data[0].
    76      MOVQ (AX), BX
    77  
            // Use the scalar loop when the AVX-512 flag is clear or when there are
            // fewer than 32 elements.
    78      CMPB ·hasAVX512VL(SB), $0
    79      JE loop
    80  
    81      CMPQ CX, $32
    82      JB loop
    83  
            // DI = CX with its low 5 bits cleared; Z0 starts with data[0] in every
            // 64-bit lane.
    84      MOVQ CX, DI
    85      SHRQ $5, DI
    86      SHLQ $5, DI
    87      VPBROADCASTQ (AX), Z0
    88  loop32:
            // Each iteration loads four 64-byte vectors (32 elements), combines them
            // pairwise, and folds the result into Z0.
    89      VMOVDQU64 (AX)(SI*8), Z1
    90      VMOVDQU64 64(AX)(SI*8), Z2
    91      VMOVDQU64 128(AX)(SI*8), Z3
    92      VMOVDQU64 192(AX)(SI*8), Z4
    93      VPMAXSQ Z1, Z2, Z5
    94      VPMAXSQ Z3, Z4, Z6
    95      VPMAXSQ Z5, Z6, Z1
    96      VPMAXSQ Z1, Z0, Z0
    97      ADDQ $32, SI
    98      CMPQ SI, DI
    99      JNE loop32
   100  
            // Horizontal reduction of Z0: permute the accumulator with a swap32
            // table, then take the lane-wise maximum at half the previous width.
            // NOTE(review): the 32-bit swap32 tables (defined outside this listing)
            // are applied to 64-bit lanes here; this presumes every step moves
            // groups of at least 64 bits - confirm against the swap32 definition.
   101      VMOVDQU32 swap32+0(SB), Z1
   102      VPERMI2D Z0, Z0, Z1
   103      VPMAXSQ Y1, Y0, Y0
   104  
   105      VMOVDQU32 swap32+32(SB), Y1
   106      VPERMI2D Y0, Y0, Y1
   107      VPMAXSQ X1, X0, X0
   108  
   109      VMOVDQU32 swap32+48(SB), X1
   110      VPERMI2D X0, X0, X1
   111      VPMAXSQ X1, X0, X0
   112      VZEROUPPER
   113  
            // The low quad word of X0 is taken as the maximum of the vectorized
            // prefix. Done if the vector loop consumed every element.
   114      MOVQ X0, BX
   115      CMPQ SI, CX
   116      JE done
   117  loop:
            // Scalar loop: BX = max(BX, data[SI]) using a signed 64-bit compare.
   118      MOVQ (AX)(SI*8), DX
   119      CMPQ DX, BX
   120      CMOVQGT DX, BX
   121      INCQ SI
   122      CMPQ SI, CX
   123      JNE loop
   124  done:
   125      MOVQ BX, ret+24(FP)
   126      RET
   127  
   128  // func maxUint32(data []uint32) uint32
        //
        // maxUint32 returns the largest element of data, comparing elements as
        // unsigned 32-bit integers. It returns 0 when data is empty.
        //
        // Register roles:
        //   AX = base pointer of data
        //   CX = len(data)
        //   SI = index of the next element to read
        //   BX = running maximum (stored as the return value)
        //   DI = len(data) rounded down to a multiple of 32 (vector loop bound)
   129  TEXT ·maxUint32(SB), NOSPLIT, $-28
   130      MOVQ data_base+0(FP), AX
   131      MOVQ data_len+8(FP), CX
   132      XORQ BX, BX
   133  
            // Empty input: return the zero that was just placed in BX.
   134      CMPQ CX, $0
   135      JE done
   136      XORQ SI, SI
            // Seed the running maximum with data[0].
   137      MOVLQZX (AX), BX
   138  
            // Use the scalar loop when the AVX-512 flag is clear or when there are
            // fewer than 32 elements.
   139      CMPB ·hasAVX512VL(SB), $0
   140      JE loop
   141  
   142      CMPQ CX, $32
   143      JB loop
   144  
            // DI = CX with its low 5 bits cleared; Z0 starts with data[0] in every
            // 32-bit lane.
   145      MOVQ CX, DI
   146      SHRQ $5, DI
   147      SHLQ $5, DI
   148      VPBROADCASTD (AX), Z0
   149  loop32:
            // Each iteration folds two 64-byte vectors (32 elements) into Z0.
   150      VMOVDQU32 (AX)(SI*4), Z1
   151      VMOVDQU32 64(AX)(SI*4), Z2
   152      VPMAXUD Z1, Z0, Z0
   153      VPMAXUD Z2, Z0, Z0
   154      ADDQ $32, SI
   155      CMPQ SI, DI
   156      JNE loop32
   157  
            // Horizontal reduction of Z0: permute the accumulator with a swap32
            // table, then take the lane-wise unsigned maximum at half the previous
            // width.
            // NOTE(review): swap32 is defined outside this listing; presumably each
            // table exchanges the two halves of the vector - confirm.
   158      VMOVDQU32 swap32+0(SB), Z1
   159      VPERMI2D Z0, Z0, Z1
   160      VPMAXUD Y1, Y0, Y0
   161  
   162      VMOVDQU32 swap32+32(SB), Y1
   163      VPERMI2D Y0, Y0, Y1
   164      VPMAXUD X1, X0, X0
   165  
   166      VMOVDQU32 swap32+48(SB), X1
   167      VPERMI2D X0, X0, X1
   168      VPMAXUD X1, X0, X0
   169      VZEROUPPER
   170  
            // The low 64 bits of X0 are treated as the last two 32-bit candidates:
            // BX takes the lower one, DX the upper one, and the unsigned compare
            // keeps the larger in BX.
   171      MOVQ X0, DX
   172      MOVL DX, BX
   173      SHRQ $32, DX
   174      CMPL DX, BX
   175      CMOVLHI DX, BX
   176  
            // Done if the vector loop consumed every element.
   177      CMPQ SI, CX
   178      JE done
   179  loop:
            // Scalar loop: BX = max(BX, data[SI]) using an unsigned 32-bit compare.
   180      MOVLQZX (AX)(SI*4), DX
   181      CMPL DX, BX
   182      CMOVLHI DX, BX
   183      INCQ SI
   184      CMPQ SI, CX
   185      JNE loop
   186  done:
   187      MOVL BX, ret+24(FP)
   188      RET
   189  
   190  // func maxUint64(data []uint64) uint64
        //
        // maxUint64 returns the largest element of data, comparing elements as
        // unsigned 64-bit integers. It returns 0 when data is empty.
        //
        // Register roles:
        //   AX = base pointer of data
        //   CX = len(data)
        //   SI = index of the next element to read
        //   BX = running maximum (stored as the return value)
        //   DI = len(data) rounded down to a multiple of 32 (vector loop bound)
   191  TEXT ·maxUint64(SB), NOSPLIT, $-32
   192      MOVQ data_base+0(FP), AX
   193      MOVQ data_len+8(FP), CX
   194      XORQ BX, BX
   195  
            // Empty input: return the zero that was just placed in BX.
   196      CMPQ CX, $0
   197      JE done
   198      XORQ SI, SI
            // Seed the running maximum with data[0].
   199      MOVQ (AX), BX
   200  
            // Use the scalar loop when the AVX-512 flag is clear or when there are
            // fewer than 32 elements.
   201      CMPB ·hasAVX512VL(SB), $0
   202      JE loop
   203  
   204      CMPQ CX, $32
   205      JB loop
   206  
            // DI = CX with its low 5 bits cleared; Z0 starts with data[0] in every
            // 64-bit lane.
   207      MOVQ CX, DI
   208      SHRQ $5, DI
   209      SHLQ $5, DI
   210      VPBROADCASTQ (AX), Z0
   211  loop32:
            // Each iteration loads four 64-byte vectors (32 elements), combines them
            // pairwise, and folds the result into Z0.
   212      VMOVDQU64 (AX)(SI*8), Z1
   213      VMOVDQU64 64(AX)(SI*8), Z2
   214      VMOVDQU64 128(AX)(SI*8), Z3
   215      VMOVDQU64 192(AX)(SI*8), Z4
   216      VPMAXUQ Z1, Z2, Z5
   217      VPMAXUQ Z3, Z4, Z6
   218      VPMAXUQ Z5, Z6, Z1
   219      VPMAXUQ Z1, Z0, Z0
   220      ADDQ $32, SI
   221      CMPQ SI, DI
   222      JNE loop32
   223  
            // Horizontal reduction of Z0: permute the accumulator with a swap32
            // table, then take the lane-wise unsigned maximum at half the previous
            // width.
            // NOTE(review): the 32-bit swap32 tables (defined outside this listing)
            // are applied to 64-bit lanes here; this presumes every step moves
            // groups of at least 64 bits - confirm against the swap32 definition.
   224      VMOVDQU32 swap32+0(SB), Z1
   225      VPERMI2D Z0, Z0, Z1
   226      VPMAXUQ Y1, Y0, Y0
   227  
   228      VMOVDQU32 swap32+32(SB), Y1
   229      VPERMI2D Y0, Y0, Y1
   230      VPMAXUQ X1, X0, X0
   231  
   232      VMOVDQU32 swap32+48(SB), X1
   233      VPERMI2D X0, X0, X1
   234      VPMAXUQ X1, X0, X0
   235      VZEROUPPER
   236  
            // The low quad word of X0 is taken as the maximum of the vectorized
            // prefix. Done if the vector loop consumed every element.
   237      MOVQ X0, BX
   238      CMPQ SI, CX
   239      JE done
   240  loop:
            // Scalar loop: BX = max(BX, data[SI]) using an unsigned 64-bit compare.
   241      MOVQ (AX)(SI*8), DX
   242      CMPQ DX, BX
   243      CMOVQHI DX, BX
   244      INCQ SI
   245      CMPQ SI, CX
   246      JNE loop
   247  done:
   248      MOVQ BX, ret+24(FP)
   249      RET
   250  
   251  // func maxFloat32(data []float32) float32
        //
        // maxFloat32 returns the largest element of data using single-precision
        // floating point comparisons. It returns 0 when data is empty.
        //
        // Register roles:
        //   AX = base pointer of data
        //   CX = len(data)
        //   SI = index of the next element to read
        //   BX = bit pattern of the running maximum (stored as the return value)
        //   X0 = running maximum as a float operand for UCOMISS
        //   DX/X1 = candidate element (bit pattern / float operand)
        //   DI = len(data) rounded down to a multiple of 64 (vector loop bound)
   252  TEXT ·maxFloat32(SB), NOSPLIT, $-28
   253      MOVQ data_base+0(FP), AX
   254      MOVQ data_len+8(FP), CX
   255      XORQ BX, BX
   256  
            // Empty input: return the zero that was just placed in BX.
   257      CMPQ CX, $0
   258      JE done
   259      XORPS X0, X0
   260      XORPS X1, X1
   261      XORQ SI, SI
            // Seed the running maximum with data[0], both in BX and in X0.
   262      MOVLQZX (AX), BX
   263      MOVQ BX, X0
   264  
            // Use the scalar loop when the AVX-512 flag is clear or when there are
            // fewer than 64 elements.
   265      CMPB ·hasAVX512VL(SB), $0
   266      JE loop
   267  
   268      CMPQ CX, $64
   269      JB loop
   270  
            // DI = CX with its low 6 bits cleared; Z0 starts with data[0] in every
            // 32-bit lane.
   271      MOVQ CX, DI
   272      SHRQ $6, DI
   273      SHLQ $6, DI
   274      VPBROADCASTD (AX), Z0
   275  loop64:
            // Each iteration loads four 64-byte vectors (64 elements), combines them
            // pairwise, and folds the result into Z0.
   276      VMOVDQU32 (AX)(SI*4), Z1
   277      VMOVDQU32 64(AX)(SI*4), Z2
   278      VMOVDQU32 128(AX)(SI*4), Z3
   279      VMOVDQU32 192(AX)(SI*4), Z4
   280      VMAXPS Z1, Z2, Z5
   281      VMAXPS Z3, Z4, Z6
   282      VMAXPS Z5, Z6, Z1
   283      VMAXPS Z1, Z0, Z0
   284      ADDQ $64, SI
   285      CMPQ SI, DI
   286      JNE loop64
   287  
            // Horizontal reduction of Z0: permute the accumulator with a swap32
            // table, then take the lane-wise maximum at half the previous width.
            // NOTE(review): swap32 is defined outside this listing; presumably each
            // table exchanges the two halves of the vector - confirm.
   288      VMOVDQU32 swap32+0(SB), Z1
   289      VPERMI2D Z0, Z0, Z1
   290      VMAXPS Y1, Y0, Y0
   291  
   292      VMOVDQU32 swap32+32(SB), Y1
   293      VPERMI2D Y0, Y0, Y1
   294      VMAXPS X1, X0, X0
   295  
   296      VMOVDQU32 swap32+48(SB), X1
   297      VPERMI2D X0, X0, X1
   298      VMAXPS X1, X0, X0
   299      VZEROUPPER
   300  
            // Compare the two lowest 32-bit lanes of X0: X1 receives a copy shifted
            // right by 32 bits so that lane 1 lines up with lane 0, and BX ends up
            // with the bit pattern of the larger of the two.
   301      MOVAPS X0, X1
   302      PSRLQ $32, X1
   303      MOVQ X0, BX
   304      MOVQ X1, DX
   305      UCOMISS X0, X1
   306      CMOVLHI DX, BX
   307  
            // Done if the vector loop consumed every element; otherwise reload X0
            // from BX so the scalar loop starts from the reduced maximum.
   308      CMPQ SI, CX
   309      JE done
   310      MOVQ BX, X0
   311  loop:
            // Scalar loop: when the candidate in X1 compares above the current
            // maximum in X0, its bit pattern replaces BX; X0 is then refreshed from
            // BX. Unordered comparisons (NaN) do not satisfy HI and leave BX as is.
   312      MOVLQZX (AX)(SI*4), DX
   313      MOVQ DX, X1
   314      UCOMISS X0, X1
   315      CMOVLHI DX, BX
   316      MOVQ BX, X0
   317      INCQ SI
   318      CMPQ SI, CX
   319      JNE loop
   320  done:
   321      MOVL BX, ret+24(FP)
   322      RET
   323  
   324  // func maxFloat64(data []float64) float64
        //
        // maxFloat64 returns the largest element of data using double-precision
        // floating point comparisons. It returns 0 when data is empty.
        //
        // Register roles:
        //   AX = base pointer of data
        //   CX = len(data)
        //   SI = index of the next element to read
        //   BX = bit pattern of the running maximum (stored as the return value)
        //   X0 = running maximum as a float operand for UCOMISD
        //   DX/X1 = candidate element (bit pattern / float operand)
        //   DI = len(data) rounded down to a multiple of 32 (vector loop bound)
   325  TEXT ·maxFloat64(SB), NOSPLIT, $-32
   326      MOVQ data_base+0(FP), AX
   327      MOVQ data_len+8(FP), CX
   328      XORQ BX, BX
   329  
            // Empty input: return the zero that was just placed in BX.
   330      CMPQ CX, $0
   331      JE done
   332      XORPD X0, X0
   333      XORPD X1, X1
   334      XORQ SI, SI
            // Seed the running maximum with data[0], both in BX and in X0.
   335      MOVQ (AX), BX
   336      MOVQ BX, X0
   337  
            // Use the scalar loop when the AVX-512 flag is clear or when there are
            // fewer than 32 elements.
   338      CMPB ·hasAVX512VL(SB), $0
   339      JE loop
   340  
   341      CMPQ CX, $32
   342      JB loop
   343  
            // DI = CX with its low 5 bits cleared; Z0 starts with data[0] in every
            // 64-bit lane.
   344      MOVQ CX, DI
   345      SHRQ $5, DI
   346      SHLQ $5, DI
   347      VPBROADCASTQ (AX), Z0
   348  loop32:
            // Each iteration loads four 64-byte vectors (32 elements), combines them
            // pairwise, and folds the result into Z0.
   349      VMOVDQU64 (AX)(SI*8), Z1
   350      VMOVDQU64 64(AX)(SI*8), Z2
   351      VMOVDQU64 128(AX)(SI*8), Z3
   352      VMOVDQU64 192(AX)(SI*8), Z4
   353      VMAXPD Z1, Z2, Z5
   354      VMAXPD Z3, Z4, Z6
   355      VMAXPD Z5, Z6, Z1
   356      VMAXPD Z1, Z0, Z0
   357      ADDQ $32, SI
   358      CMPQ SI, DI
   359      JNE loop32
   360  
            // Horizontal reduction of Z0: permute the accumulator with a swap32
            // table, then take the lane-wise maximum at half the previous width.
            // NOTE(review): the 32-bit swap32 tables (defined outside this listing)
            // are applied to 64-bit lanes here; this presumes every step moves
            // groups of at least 64 bits - confirm against the swap32 definition.
   361      VMOVDQU64 swap32+0(SB), Z1
   362      VPERMI2D Z0, Z0, Z1
   363      VMAXPD Y1, Y0, Y0
   364  
   365      VMOVDQU64 swap32+32(SB), Y1
   366      VPERMI2D Y0, Y0, Y1
   367      VMAXPD X1, X0, X0
   368  
   369      VMOVDQU64 swap32+48(SB), X1
   370      VPERMI2D X0, X0, X1
   371      VMAXPD X1, X0, X0
   372      VZEROUPPER
   373  
            // The low quad word of X0 is taken as the maximum of the vectorized
            // prefix; X0 itself keeps serving as the float operand of the scalar
            // loop. Done if the vector loop consumed every element.
   374      MOVQ X0, BX
   375      CMPQ SI, CX
   376      JE done
   377  loop:
            // Scalar loop: when the candidate in X1 compares above the current
            // maximum in X0, its bit pattern replaces BX; X0 is then refreshed from
            // BX. Unordered comparisons (NaN) do not satisfy HI and leave BX as is.
   378      MOVQ (AX)(SI*8), DX
   379      MOVQ DX, X1
   380      UCOMISD X0, X1
   381      CMOVQHI DX, BX
   382      MOVQ BX, X0
   383      INCQ SI
   384      CMPQ SI, CX
   385      JNE loop
   386  done:
   387      MOVQ BX, ret+24(FP)
   388      RET
   389  
   390  // vpmaxu128 is a macro comparing unsigned 128 bits values held in the
   391  // `srcValues` and `maxValues` vectors. The `srcIndexes` and `maxIndexes`
   392  // vectors contain the indexes of elements in the value vectors. Remaining
   393  // K and R arguments are mask and general purpose registers needed to hold
   394  // temporary values during the computation. The last M argument is a mask
   395  // generated by vpmaxu128mask.
   396  //
   397  // The routine uses AVX-512 instructions (VPCMPUQ, VPBLENDMQ) to implement
   398  // the comparison of 128 bits values. The values are expected to be stored
   399  // in the vectors as a little-endian pair of two consecutive quad words.
   400  //
   401  // The results are written to the `maxValues` and `maxIndexes` vectors,
   402  // overwriting the inputs. `srcValues` and `srcIndexes` are read-only
   403  // parameters.
   404  //
   405  // At a high level, for two pairs of quad words forming two 128 bits values
   406  // A and B, the test implemented by this macro is:
   407  //
   408  //   A[1] > B[1] || (A[1] == B[1] && A[0] > B[0])
   409  //
   410  // Values in the source vector that evaluate to true on this expression are
   411  // written to the vector of maximum values, and their indexes are written to
   412  // the vector of indexes.
        //
        // Step by step:
        //   - K1/R1 receive the per-quad-word "equal" mask (predicate $0) and K2/R2
        //     the per-quad-word "greater than" mask (predicate $6).
        //   - R3 = R2 << 1 moves the "greater than" bit of each low quad word onto
        //     the bit position of the matching high quad word.
        //   - R1 = ((R1 & R3) | R2) & M keeps, on the high quad word bit of each
        //     element, the result of the expression above.
        //   - R1 |= R1 >> 1 copies that decision onto the low quad word bit so the
        //     blends select both halves of the 128 bits element.
        //
        // Clobbers K1, K2, R1, R2 and R3; M is only read.
   413  #define vpmaxu128(srcValues, srcIndexes, maxValues, maxIndexes, K1, K2, R1, R2, R3, M) \
   414      VPCMPUQ $0, maxValues, srcValues, K1 \
   415      VPCMPUQ $6, maxValues, srcValues, K2 \
   416      KMOVB K1, R1 \
   417      KMOVB K2, R2 \
   418      MOVB R2, R3 \
   419      SHLB $1, R3 \
   420      ANDB R3, R1 \
   421      ORB R2, R1 \
   422      ANDB M, R1 \
   423      MOVB R1, R2 \
   424      SHRB $1, R2 \
   425      ORB R2, R1 \
   426      KMOVB R1, K1 \
   427      VPBLENDMQ srcValues, maxValues, K1, maxValues \
   428      VPBLENDMQ srcIndexes, maxIndexes, K1, maxIndexes
   429  
   430  // vpmaxu128mask is a macro used to initialize the mask passed as last argument
   431  // to vpmaxu128. The argument M is intended to be a general purpose register.
   432  //
   433  // The bit mask is used to merge the results of the "greater than" and "equal"
   434  // comparison that are performed on each lane of maximum vectors. The upper bits
   435  // are used to compute results of the operation to determine which of the pairs
   436  // of quad words representing the 128 bits elements are the maximums.
        //
        // The value 0b10101010 has every odd bit set, i.e. the mask bit that belongs
        // to the upper quad word of each 128 bits element in an 8-lane quad word
        // mask. Only the low byte of M is written (MOVB).
   437  #define vpmaxu128mask(M) MOVB $0b10101010, M
   438  
   439  // func maxBE128(data [][16]byte) []byte
        //
        // maxBE128 locates the largest element of data, comparing elements as
        // unsigned 128 bits integers stored in big-endian byte order. The result is
        // a 16-byte slice (len and cap of 16) that points at the winning element
        // inside data; a nil slice is returned when data is empty.
        //
        // Register roles:
        //   AX = pointer to the element being examined
        //   BX = pointer to the current maximum (stored as ret_base)
        //   CX = len(data), then the end pointer (base + 16*len)
        //   DX = input size in bytes, then the end of the region handled by the
        //        vector loop (base + size rounded down to a multiple of 256)
        //   DI = scratch, then the mask produced by vpmaxu128mask
   440  TEXT ·maxBE128(SB), NOSPLIT, $-48
   441      MOVQ data_base+0(FP), AX
   442      MOVQ data_len+8(FP), CX
   443      CMPQ CX, $0
   444      JE null
   445  
   446      SHLQ $4, CX
   447      MOVQ CX, DX // len
   448      MOVQ AX, BX // max
   449      ADDQ AX, CX // end
   450  
            // Inputs shorter than 256 bytes (16 elements) use the scalar loop, as do
            // all inputs when the AVX-512 flag below is clear.
   451      CMPQ DX, $256
   452      JB loop
   453  
   454      CMPB ·hasAVX512MinMaxBE128(SB), $0
   455      JE loop
   456  
   457      // Z19 holds a vector of the count by which we increment the vectors of
   458      // indexes at each loop iteration.
   459      MOVQ $16, DI
   460      VPBROADCASTQ DI, Z19
   461  
   462      // Z31 holds the shuffle mask used to convert 128 bits elements from big to
   463      // little endian so we can apply vectorized comparison instructions.
   464      VMOVDQU64 bswap128(SB), Z31
   465  
   466      // These vectors hold four lanes of maximum values found in the input.
   467      VBROADCASTI64X2 (AX), Z0
   468      VPSHUFB Z31, Z0, Z0
   469      VMOVDQU64 Z0, Z5
   470      VMOVDQU64 Z0, Z10
   471      VMOVDQU64 Z0, Z15
   472  
   473      // These vectors hold four lanes of indexes of maximum values.
   474      //
   475      // We initialize them at zero because we broadcast the first value of the
   476      // input in the vectors that track the maximums of each lane; in other
   477      // words, we assume the maximum value is at the first offset and work our
   478      // way up from there.
   479      VPXORQ Z2, Z2, Z2
   480      VPXORQ Z7, Z7, Z7
   481      VPXORQ Z12, Z12, Z12
   482      VPXORQ Z17, Z17, Z17
   483  
   484      // These vectors are used to compute the indexes of maximum values held
   485      // in [Z0, Z5, Z10, Z15]. Each vector holds a contiguous sequence of
   486      // indexes; for example, Z3 is initialized with [0, 1, 2, 3]. At each
   487      // loop iteration, the indexes are incremented by the number of elements
   488      // consumed from the input (4x4=16).
   489      VMOVDQU64 indexes128(SB), Z3
   490      VPXORQ Z8, Z8, Z8
   491      VPXORQ Z13, Z13, Z13
   492      VPXORQ Z18, Z18, Z18
   493      MOVQ $4, DI
   494      VPBROADCASTQ DI, Z1
   495      VPADDQ Z1, Z3, Z8
   496      VPADDQ Z1, Z8, Z13
   497      VPADDQ Z1, Z13, Z18
   498  
   499      // This bit mask is used to merge the results of the "greater than" and
   500      // "equal" comparison that we perform on each lane of maximum vectors. We
   501      // use the upper bits to compute four results of the operation which
   502      // determines which of the pair of quad words representing the 128 bits
   503      // elements is the maximum.
   504      vpmaxu128mask(DI)
            // DX = base + (size rounded down to a multiple of 256): loop16 bound.
   505      SHRQ $8, DX
   506      SHLQ $8, DX
   507      ADDQ AX, DX
   508  loop16:
   509      // Compute 4x4 maximum values in vector registers, along with their indexes
   510      // in the input array.
   511      VMOVDQU64 (AX), Z1
   512      VMOVDQU64 64(AX), Z6
   513      VMOVDQU64 128(AX), Z11
   514      VMOVDQU64 192(AX), Z16
   515      VPSHUFB Z31, Z1, Z1
   516      VPSHUFB Z31, Z6, Z6
   517      VPSHUFB Z31, Z11, Z11
   518      VPSHUFB Z31, Z16, Z16
   519      vpmaxu128(Z1, Z3, Z0, Z2, K1, K2, R8, R9, R10, DI)
   520      vpmaxu128(Z6, Z8, Z5, Z7, K3, K4, R11, R12, R13, DI)
   521      vpmaxu128(Z11, Z13, Z10, Z12, K1, K2, R8, R9, R10, DI)
   522      vpmaxu128(Z16, Z18, Z15, Z17, K3, K4, R11, R12, R13, DI)
   523      VPADDQ Z19, Z3, Z3
   524      VPADDQ Z19, Z8, Z8
   525      VPADDQ Z19, Z13, Z13
   526      VPADDQ Z19, Z18, Z18
   527      ADDQ $256, AX
   528      CMPQ AX, DX
   529      JB loop16
   530  
   531      // After the loop completed, we need to merge the lanes that each contain
   532      // 4 maximum values (so 16 total candidate at this stage). The results are
   533      // reduced into 4 candidates in Z0, with their indexes in Z2.
   534      vpmaxu128(Z10, Z12, Z0, Z2, K1, K2, R8, R9, R10, DI)
   535      vpmaxu128(Z15, Z17, Z5, Z7, K3, K4, R11, R12, R13, DI)
   536      vpmaxu128(Z5, Z7, Z0, Z2, K1, K2, R8, R9, R10, DI)
   537  
   538      // Further reduce the results by swapping the upper and lower parts of the
   539      // vector registers, and comparing them to determine which values are the
   540      // largest. We compare 2x2 values at this step, then 2x1 values at the next
   541      // to find the index of the maximum.
            // NOTE(review): swap64 is defined outside this listing; the description
            // above presumes its tables exchange the two halves of the vector.
   542      VMOVDQU64 swap64+0(SB), Z1
   543      VMOVDQU64 swap64+0(SB), Z3
   544      VPERMI2Q Z0, Z0, Z1
   545      VPERMI2Q Z2, Z2, Z3
   546      vpmaxu128(Y1, Y3, Y0, Y2, K1, K2, R8, R9, R10, DI)
   547  
   548      VMOVDQU64 swap64+32(SB), Y1
   549      VMOVDQU64 swap64+32(SB), Y3
   550      VPERMI2Q Y0, Y0, Y1
   551      VPERMI2Q Y2, Y2, Y3
   552      vpmaxu128(X1, X3, X0, X2, K1, K2, R8, R9, R10, DI)
   553      VZEROUPPER
   554  
   555      // Extract the index of the maximum value computed in the lower 64 bits of
   556      // X2 and position the BX pointer at the index of the maximum value.
   557      MOVQ X2, DX
   558      SHLQ $4, DX
   559      ADDQ DX, BX
   560      CMPQ AX, CX
   561      JE done
   562  
   563      // Unless the input length was a multiple of 256 bytes, we need to perform
   564      // a few more iterations on the remaining elements.
   565      //
   566      // This loop is also taken if the CPU has no support for AVX-512.
   567  loop:
            // Compare the first 8 bytes of the candidate (AX) and of the current
            // maximum (BX) as big-endian unsigned integers; only when they are equal
            // do the last 8 bytes decide. The candidate replaces the maximum only
            // when it is strictly greater.
   568      MOVQ (AX), R8
   569      MOVQ (BX), R9
   570      BSWAPQ R8
   571      BSWAPQ R9
   572      CMPQ R8, R9
   573      JA more
   574      JB next
   575      MOVQ 8(AX), R8
   576      MOVQ 8(BX), R9
   577      BSWAPQ R8
   578      BSWAPQ R9
   579      CMPQ R8, R9
   580      JBE next
   581  more:
   582      MOVQ AX, BX
   583  next:
   584      ADDQ $16, AX
   585      CMPQ AX, CX
   586      JB loop
   587  done:
            // Return a 16-byte slice that aliases the maximum element inside data.
   588      MOVQ BX, ret_base+24(FP)
   589      MOVQ $16, ret_len+32(FP)
   590      MOVQ $16, ret_cap+40(FP)
   591      RET
   592  null:
            // Empty input: return a nil slice.
   593      XORQ BX, BX
   594      MOVQ BX, ret_base+24(FP)
   595      MOVQ BX, ret_len+32(FP)
   596      MOVQ BX, ret_cap+40(FP)
   597      RET
   598