github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/page_bounds_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  #define bswap128lo 0x08090A0B0C0D0E0F
     6  #define bswap128hi 0x0001020304050607
     7  // bswap128: one 16-byte pattern repeated four times (64 bytes in total).
        // Stored little-endian, the pattern's bytes are 15, 14, ..., 1, 0, so every
        // index in 0..15 appears exactly once in descending order, which is the
        // control needed to reverse the bytes of a 128-bit value with a byte shuffle.
        // The low half previously read 0x08080A0B0C0D0E0F, which listed index 8 twice
        // and never selected index 9.
        // NOTE(review): the consumers of bswap128 are outside the visible part of
        // this file; confirm they use it as a per-128-bit-lane shuffle mask.
     8  DATA bswap128+0(SB)/8, $bswap128lo
     9  DATA bswap128+8(SB)/8, $bswap128hi
    10  DATA bswap128+16(SB)/8, $bswap128lo
    11  DATA bswap128+24(SB)/8, $bswap128hi
    12  DATA bswap128+32(SB)/8, $bswap128lo
    13  DATA bswap128+40(SB)/8, $bswap128hi
    14  DATA bswap128+48(SB)/8, $bswap128lo
    15  DATA bswap128+56(SB)/8, $bswap128hi
    16  GLOBL bswap128(SB), RODATA|NOPTR, $64 // read-only, pointer-free, 64 bytes
    17  
    18  DATA indexes128+0(SB)/8, $0 // indexes128: eight 64-bit entries 0,0,1,1,2,2,3,3
        // Each value 0..3 is stored in two consecutive quadwords.
        // NOTE(review): no code in the visible part of this file reads indexes128;
        // presumably each pair tags the two halves of one 128-bit element - confirm
        // against the consumer.
    19  DATA indexes128+8(SB)/8, $0
    20  DATA indexes128+16(SB)/8, $1
    21  DATA indexes128+24(SB)/8, $1
    22  DATA indexes128+32(SB)/8, $2
    23  DATA indexes128+40(SB)/8, $2
    24  DATA indexes128+48(SB)/8, $3
    25  DATA indexes128+56(SB)/8, $3
    26  GLOBL indexes128(SB), RODATA|NOPTR, $64 // read-only, pointer-free, 64 bytes
    27  
    28  DATA swap64+0(SB)/8, $4 // swap64: eight 64-bit entries 4,5,6,7,2,3,0,1
        // Same layering as the tail of swap32, but with 64-bit entries: the first
        // four entries name elements 4..7, the next two name elements 2..3, and the
        // last two name elements 0..1.
        // NOTE(review): no code in the visible part of this file reads swap64;
        // presumably it is a 64-bit-lane permutation index table - confirm.
    29  DATA swap64+8(SB)/8, $5
    30  DATA swap64+16(SB)/8, $6
    31  DATA swap64+24(SB)/8, $7
    32  DATA swap64+32(SB)/8, $2
    33  DATA swap64+40(SB)/8, $3
    34  DATA swap64+48(SB)/8, $0
    35  DATA swap64+56(SB)/8, $1
    36  GLOBL swap64(SB), RODATA|NOPTR, $64 // read-only, pointer-free, 64 bytes
    37  
    38  DATA swap32+0(SB)/4, $8 // swap32: sixteen 32-bit entries 8..15, 4..7, 2, 3, 0, 1
        // Index vectors for VPERMI2D, used by the combinedBounds* reductions in this
        // file. The table is read at three offsets, one per reduction step:
        //   +0  as 64 bytes: the low eight entries select dwords 8..15, i.e. the
        //       upper 256 bits of a 512-bit register;
        //   +32 as 32 bytes: the low four entries select dwords 4..7, i.e. the upper
        //       128 bits of a 256-bit register;
        //   +48 as 16 bytes: entries 2,3,0,1 swap the two 64-bit halves of a 128-bit
        //       register.
    39  DATA swap32+4(SB)/4, $9
    40  DATA swap32+8(SB)/4, $10
    41  DATA swap32+12(SB)/4, $11
    42  DATA swap32+16(SB)/4, $12
    43  DATA swap32+20(SB)/4, $13
    44  DATA swap32+24(SB)/4, $14
    45  DATA swap32+28(SB)/4, $15
    46  DATA swap32+32(SB)/4, $4 // offset +32: start of the 256-bit step's indexes
    47  DATA swap32+36(SB)/4, $5
    48  DATA swap32+40(SB)/4, $6
    49  DATA swap32+44(SB)/4, $7
    50  DATA swap32+48(SB)/4, $2 // offset +48: start of the 128-bit step's indexes
    51  DATA swap32+52(SB)/4, $3
    52  DATA swap32+56(SB)/4, $0
    53  DATA swap32+60(SB)/4, $1
    54  GLOBL swap32(SB), RODATA|NOPTR, $64 // read-only, pointer-free, 64 bytes
    55  
    56  // func combinedBoundsInt32(data []int32) (min, max int32)
        //
        // Computes the minimum and maximum of data using signed 32-bit comparisons.
        // An empty slice returns (0, 0). When ·hasAVX512VL is non-zero and the slice
        // holds at least 32 elements, whole groups of 32 values are folded with
        // 512-bit vector min/max; any remaining elements are handled by the scalar
        // loop.
        //
        // Registers: AX = data pointer, CX = element count, SI = element index,
        // DI = count rounded down to a multiple of 32, R8/R9 = running min/max,
        // Z0/Z3 = vector min/max accumulators, Z1/Z2 = loaded data / permute scratch.
    57  TEXT ·combinedBoundsInt32(SB), NOSPLIT, $-32
    58      MOVQ data_base+0(FP), AX
    59      MOVQ data_len+8(FP), CX
    60      XORQ R8, R8 // results default to zero for the empty-slice case
    61      XORQ R9, R9
    62  
    63      CMPQ CX, $0
    64      JE done
    65      XORQ SI, SI
    66      MOVLQZX (AX), R8 // min
    67      MOVLQZX (AX), R9 // max
    68  
    69      CMPB ·hasAVX512VL(SB), $0
    70      JE loop // vector path unavailable: scan everything in the scalar loop
    71  
    72      CMPQ CX, $32
    73      JB loop // not enough elements for one vector iteration
    74  
    75      MOVQ CX, DI
    76      SHRQ $5, DI
    77      SHLQ $5, DI // DI = CX with the low 5 bits cleared (multiple of 32)
    78      VPBROADCASTD (AX), Z0 // every min lane starts at data[0]
    79      VPBROADCASTD (AX), Z3 // every max lane starts at data[0]
    80  loop32:
        // Each iteration consumes 32 int32 values (two 64-byte loads).
    81      VMOVDQU32 (AX)(SI*4), Z1
    82      VMOVDQU32 64(AX)(SI*4), Z2
    83      VPMINSD Z1, Z0, Z0
    84      VPMINSD Z2, Z0, Z0
    85      VPMAXSD Z1, Z3, Z3
    86      VPMAXSD Z2, Z3, Z3
    87      ADDQ $32, SI
    88      CMPQ SI, DI
    89      JNE loop32
    90  
        // Horizontal reduction. Each step loads a swap32 index vector, uses
        // VPERMI2D to bring the upper half of the accumulator into the lower half,
        // then folds the two halves: 512 -> 256 bits.
    91      VMOVDQU32 swap32+0(SB), Z1
    92      VMOVDQU32 swap32+0(SB), Z2
    93      VPERMI2D Z0, Z0, Z1
    94      VPERMI2D Z3, Z3, Z2
    95      VPMINSD Y1, Y0, Y0
    96      VPMAXSD Y2, Y3, Y3
    97  
        // 256 -> 128 bits.
    98      VMOVDQU32 swap32+32(SB), Y1
    99      VMOVDQU32 swap32+32(SB), Y2
   100      VPERMI2D Y0, Y0, Y1
   101      VPERMI2D Y3, Y3, Y2
   102      VPMINSD X1, X0, X0
   103      VPMAXSD X2, X3, X3
   104  
        // 128 -> 64 bits: the two candidates end up in the low two lanes.
   105      VMOVDQU32 swap32+48(SB), X1
   106      VMOVDQU32 swap32+48(SB), X2
   107      VPERMI2D X0, X0, X1
   108      VPERMI2D X3, X3, X2
   109      VPMINSD X1, X0, X0
   110      VPMAXSD X2, X3, X3
   111      VZEROUPPER
   112  
        // Finish in general-purpose registers: the low 64 bits of X0/X3 hold two
        // 32-bit candidates each; keep the signed-smaller / signed-larger one.
   113      MOVQ X0, BX
   114      MOVQ X3, DX
   115      MOVL BX, R8 // lane 0 of the min candidates
   116      MOVL DX, R9 // lane 0 of the max candidates
   117      SHRQ $32, BX // lane 1 of the min candidates
   118      SHRQ $32, DX // lane 1 of the max candidates
   119      CMPL BX, R8
   120      CMOVLLT BX, R8
   121      CMPL DX, R9
   122      CMOVLGT DX, R9
   123  
   124      CMPQ SI, CX
   125      JE done // count was a multiple of 32: no tail left
   126  loop:
        // Scalar loop: starts at SI = 0 when the vector path was skipped, otherwise
        // at the first element the vector loop did not cover.
   127      MOVLQZX (AX)(SI*4), DX
   128      CMPL DX, R8
   129      CMOVLLT DX, R8 // signed less-than: new minimum
   130      CMPL DX, R9
   131      CMOVLGT DX, R9 // signed greater-than: new maximum
   132      INCQ SI
   133      CMPQ SI, CX
   134      JNE loop
   135  done:
   136      MOVL R8, min+24(FP)
   137      MOVL R9, max+28(FP)
   138      RET
   139  
   140  // func combinedBoundsInt64(data []int64) (min, max int64)
        //
        // Computes the minimum and maximum of data using signed 64-bit comparisons.
        // An empty slice returns (0, 0). When ·hasAVX512VL is non-zero and the slice
        // holds at least 16 elements, whole groups of 16 values are folded with
        // 512-bit vector min/max; any remaining elements are handled by the scalar
        // loop.
        //
        // Registers: AX = data pointer, CX = element count, SI = element index,
        // DI = count rounded down to a multiple of 16, R8/R9 = running min/max,
        // Z0/Z3 = vector min/max accumulators, Z1/Z2 = loaded data / permute scratch.
   141  TEXT ·combinedBoundsInt64(SB), NOSPLIT, $-40
   142      MOVQ data_base+0(FP), AX
   143      MOVQ data_len+8(FP), CX
   144      XORQ R8, R8 // results default to zero for the empty-slice case
   145      XORQ R9, R9
   146  
   147      CMPQ CX, $0
   148      JE done
   149      XORQ SI, SI
   150      MOVQ (AX), R8 // min
   151      MOVQ (AX), R9 // max
   152  
   153      CMPB ·hasAVX512VL(SB), $0
   154      JE loop // vector path unavailable: scan everything in the scalar loop
   155  
   156      CMPQ CX, $16
   157      JB loop // not enough elements for one vector iteration
   158  
   159      MOVQ CX, DI
   160      SHRQ $4, DI
   161      SHLQ $4, DI // DI = CX with the low 4 bits cleared (multiple of 16)
   162      VPBROADCASTQ (AX), Z0 // every min lane starts at data[0]
   163      VPBROADCASTQ (AX), Z3 // every max lane starts at data[0]
   164  loop16:
        // Each iteration consumes 16 int64 values (two 64-byte loads).
   165      VMOVDQU64 (AX)(SI*8), Z1
   166      VMOVDQU64 64(AX)(SI*8), Z2
   167      VPMINSQ Z1, Z0, Z0
   168      VPMINSQ Z2, Z0, Z0
   169      VPMAXSQ Z1, Z3, Z3
   170      VPMAXSQ Z2, Z3, Z3
   171      ADDQ $16, SI
   172      CMPQ SI, DI
   173      JNE loop16
   174  
        // Horizontal reduction. The swap32 tables hold 32-bit indexes; consecutive
        // index pairs move each 64-bit lane as two adjacent dwords. Each step brings
        // the upper half of the accumulator down and folds it in: 512 -> 256 bits.
   175      VMOVDQU32 swap32+0(SB), Z1
   176      VMOVDQU32 swap32+0(SB), Z2
   177      VPERMI2D Z0, Z0, Z1
   178      VPERMI2D Z3, Z3, Z2
   179      VPMINSQ Y1, Y0, Y0
   180      VPMAXSQ Y2, Y3, Y3
   181  
        // 256 -> 128 bits.
   182      VMOVDQU32 swap32+32(SB), Y1
   183      VMOVDQU32 swap32+32(SB), Y2
   184      VPERMI2D Y0, Y0, Y1
   185      VPERMI2D Y3, Y3, Y2
   186      VPMINSQ X1, X0, X0
   187      VPMAXSQ X2, X3, X3
   188  
        // 128 -> 64 bits: the final value ends up in the low quadword.
   189      VMOVDQU32 swap32+48(SB), X1
   190      VMOVDQU32 swap32+48(SB), X2
   191      VPERMI2D X0, X0, X1
   192      VPERMI2D X3, X3, X2
   193      VPMINSQ X1, X0, X0
   194      VPMAXSQ X2, X3, X3
   195      VZEROUPPER
   196  
   197      MOVQ X0, R8 // vector minimum
   198      MOVQ X3, R9 // vector maximum
   199      CMPQ SI, CX
   200      JE done // count was a multiple of 16: no tail left
   201  loop:
        // Scalar loop: starts at SI = 0 when the vector path was skipped, otherwise
        // at the first element the vector loop did not cover.
   202      MOVQ (AX)(SI*8), DX
   203      CMPQ DX, R8
   204      CMOVQLT DX, R8 // signed less-than: new minimum
   205      CMPQ DX, R9
   206      CMOVQGT DX, R9 // signed greater-than: new maximum
   207      INCQ SI
   208      CMPQ SI, CX
   209      JNE loop
   210  done:
   211      MOVQ R8, min+24(FP)
   212      MOVQ R9, max+32(FP)
   213      RET
   214  
   215  // func combinedBoundsUint32(data []uint32) (min, max uint32)
        //
        // Computes the minimum and maximum of data using unsigned 32-bit comparisons.
        // An empty slice returns (0, 0). When ·hasAVX512VL is non-zero and the slice
        // holds at least 32 elements, whole groups of 32 values are folded with
        // 512-bit vector min/max; any remaining elements are handled by the scalar
        // loop.
        //
        // Registers: AX = data pointer, CX = element count, SI = element index,
        // DI = count rounded down to a multiple of 32, R8/R9 = running min/max,
        // Z0/Z3 = vector min/max accumulators, Z1/Z2 = loaded data / permute scratch.
   216  TEXT ·combinedBoundsUint32(SB), NOSPLIT, $-32
   217      MOVQ data_base+0(FP), AX
   218      MOVQ data_len+8(FP), CX
   219      XORQ R8, R8 // results default to zero for the empty-slice case
   220      XORQ R9, R9
   221  
   222      CMPQ CX, $0
   223      JE done
   224      XORQ SI, SI
   225      MOVLQZX (AX), R8 // min
   226      MOVLQZX (AX), R9 // max
   227  
   228      CMPB ·hasAVX512VL(SB), $0
   229      JE loop // vector path unavailable: scan everything in the scalar loop
   230  
   231      CMPQ CX, $32
   232      JB loop // not enough elements for one vector iteration
   233  
   234      MOVQ CX, DI
   235      SHRQ $5, DI
   236      SHLQ $5, DI // DI = CX with the low 5 bits cleared (multiple of 32)
   237      VPBROADCASTD (AX), Z0 // every min lane starts at data[0]
   238      VPBROADCASTD (AX), Z3 // every max lane starts at data[0]
   239  loop32:
        // Each iteration consumes 32 uint32 values (two 64-byte loads).
   240      VMOVDQU32 (AX)(SI*4), Z1
   241      VMOVDQU32 64(AX)(SI*4), Z2
   242      VPMINUD Z1, Z0, Z0
   243      VPMINUD Z2, Z0, Z0
   244      VPMAXUD Z1, Z3, Z3
   245      VPMAXUD Z2, Z3, Z3
   246      ADDQ $32, SI
   247      CMPQ SI, DI
   248      JNE loop32
   249  
        // Horizontal reduction. Each step loads a swap32 index vector, uses
        // VPERMI2D to bring the upper half of the accumulator into the lower half,
        // then folds the two halves: 512 -> 256 bits.
   250      VMOVDQU32 swap32+0(SB), Z1
   251      VMOVDQU32 swap32+0(SB), Z2
   252      VPERMI2D Z0, Z0, Z1
   253      VPERMI2D Z3, Z3, Z2
   254      VPMINUD Y1, Y0, Y0
   255      VPMAXUD Y2, Y3, Y3
   256  
        // 256 -> 128 bits.
   257      VMOVDQU32 swap32+32(SB), Y1
   258      VMOVDQU32 swap32+32(SB), Y2
   259      VPERMI2D Y0, Y0, Y1
   260      VPERMI2D Y3, Y3, Y2
   261      VPMINUD X1, X0, X0
   262      VPMAXUD X2, X3, X3
   263  
        // 128 -> 64 bits: the two candidates end up in the low two lanes.
   264      VMOVDQU32 swap32+48(SB), X1
   265      VMOVDQU32 swap32+48(SB), X2
   266      VPERMI2D X0, X0, X1
   267      VPERMI2D X3, X3, X2
   268      VPMINUD X1, X0, X0
   269      VPMAXUD X2, X3, X3
   270      VZEROUPPER
   271  
        // Finish in general-purpose registers: the low 64 bits of X0/X3 hold two
        // 32-bit candidates each; keep the unsigned-smaller / unsigned-larger one.
   272      MOVQ X0, BX
   273      MOVQ X3, DX
   274      MOVL BX, R8 // lane 0 of the min candidates
   275      MOVL DX, R9 // lane 0 of the max candidates
   276      SHRQ $32, BX // lane 1 of the min candidates
   277      SHRQ $32, DX // lane 1 of the max candidates
   278      CMPL BX, R8
   279      CMOVLCS BX, R8
   280      CMPL DX, R9
   281      CMOVLHI DX, R9
   282  
   283      CMPQ SI, CX
   284      JE done // count was a multiple of 32: no tail left
   285  loop:
        // Scalar loop: starts at SI = 0 when the vector path was skipped, otherwise
        // at the first element the vector loop did not cover.
   286      MOVLQZX (AX)(SI*4), DX
   287      CMPL DX, R8
   288      CMOVLCS DX, R8 // carry set = unsigned below: new minimum
   289      CMPL DX, R9
   290      CMOVLHI DX, R9 // unsigned above: new maximum
   291      INCQ SI
   292      CMPQ SI, CX
   293      JNE loop
   294  done:
   295      MOVL R8, min+24(FP)
   296      MOVL R9, max+28(FP)
   297      RET
   298  
   299  // func combinedBoundsUint64(data []uint64) (min, max uint64)
        //
        // Computes the minimum and maximum of data using unsigned 64-bit comparisons.
        // An empty slice returns (0, 0). When ·hasAVX512VL is non-zero and the slice
        // holds at least 16 elements, whole groups of 16 values are folded with
        // 512-bit vector min/max; any remaining elements are handled by the scalar
        // loop.
        //
        // Registers: AX = data pointer, CX = element count, SI = element index,
        // DI = count rounded down to a multiple of 16, R8/R9 = running min/max,
        // Z0/Z3 = vector min/max accumulators, Z1/Z2 = loaded data / permute scratch.
   300  TEXT ·combinedBoundsUint64(SB), NOSPLIT, $-40
   301      MOVQ data_base+0(FP), AX
   302      MOVQ data_len+8(FP), CX
   303      XORQ R8, R8 // results default to zero for the empty-slice case
   304      XORQ R9, R9
   305  
   306      CMPQ CX, $0
   307      JE done
   308      XORQ SI, SI
   309      MOVQ (AX), R8 // min
   310      MOVQ (AX), R9 // max
   311  
   312      CMPB ·hasAVX512VL(SB), $0
   313      JE loop // vector path unavailable: scan everything in the scalar loop
   314  
   315      CMPQ CX, $16
   316      JB loop // not enough elements for one vector iteration
   317  
   318      MOVQ CX, DI
   319      SHRQ $4, DI
   320      SHLQ $4, DI // DI = CX with the low 4 bits cleared (multiple of 16)
   321      VPBROADCASTQ (AX), Z0 // every min lane starts at data[0]
   322      VPBROADCASTQ (AX), Z3 // every max lane starts at data[0]
   323  loop16:
        // Each iteration consumes 16 uint64 values (two 64-byte loads).
   324      VMOVDQU64 (AX)(SI*8), Z1
   325      VMOVDQU64 64(AX)(SI*8), Z2
   326      VPMINUQ Z1, Z0, Z0
   327      VPMINUQ Z2, Z0, Z0
   328      VPMAXUQ Z1, Z3, Z3
   329      VPMAXUQ Z2, Z3, Z3
   330      ADDQ $16, SI
   331      CMPQ SI, DI
   332      JNE loop16
   333  
        // Horizontal reduction. The swap32 tables hold 32-bit indexes; consecutive
        // index pairs move each 64-bit lane as two adjacent dwords. Each step brings
        // the upper half of the accumulator down and folds it in: 512 -> 256 bits.
   334      VMOVDQU32 swap32+0(SB), Z1
   335      VMOVDQU32 swap32+0(SB), Z2
   336      VPERMI2D Z0, Z0, Z1
   337      VPERMI2D Z3, Z3, Z2
   338      VPMINUQ Y1, Y0, Y0
   339      VPMAXUQ Y2, Y3, Y3
   340  
        // 256 -> 128 bits.
   341      VMOVDQU32 swap32+32(SB), Y1
   342      VMOVDQU32 swap32+32(SB), Y2
   343      VPERMI2D Y0, Y0, Y1
   344      VPERMI2D Y3, Y3, Y2
   345      VPMINUQ X1, X0, X0
   346      VPMAXUQ X2, X3, X3
   347  
        // 128 -> 64 bits: the final value ends up in the low quadword.
   348      VMOVDQU32 swap32+48(SB), X1
   349      VMOVDQU32 swap32+48(SB), X2
   350      VPERMI2D X0, X0, X1
   351      VPERMI2D X3, X3, X2
   352      VPMINUQ X1, X0, X0
   353      VPMAXUQ X2, X3, X3
   354      VZEROUPPER
   355  
   356      MOVQ X0, R8 // vector minimum
   357      MOVQ X3, R9 // vector maximum
   358      CMPQ SI, CX
   359      JE done // count was a multiple of 16: no tail left
   360  loop:
        // Scalar loop: starts at SI = 0 when the vector path was skipped, otherwise
        // at the first element the vector loop did not cover.
   361      MOVQ (AX)(SI*8), DX
   362      CMPQ DX, R8
   363      CMOVQCS DX, R8 // carry set = unsigned below: new minimum
   364      CMPQ DX, R9
   365      CMOVQHI DX, R9 // unsigned above: new maximum
   366      INCQ SI
   367      CMPQ SI, CX
   368      JNE loop
   369  done:
   370      MOVQ R8, min+24(FP)
   371      MOVQ R9, max+32(FP)
   372      RET
   373  
   374  // func combinedBoundsFloat32(data []float32) (min, max float32)
        //
        // Computes the minimum and maximum of data using single-precision
        // floating-point comparisons. An empty slice returns the all-zero bit pattern
        // for both results. When ·hasAVX512VL is non-zero and the slice holds at
        // least 32 elements, whole groups of 32 values are folded with 512-bit
        // VMINPS/VMAXPS; any remaining elements are handled by the scalar loop.
        //
        // Registers: AX = data pointer, CX = element count, SI = element index,
        // DI = count rounded down to a multiple of 32, R8/R9 = bit patterns of the
        // running min/max, X0/X1 = the same values for UCOMISS in the scalar loop,
        // Z0/Z3 = vector min/max accumulators.
        //
        // NOTE(review): UCOMISS reports an unordered (NaN) compare through the same
        // carry flag that CMOVLCS tests - confirm the NaN behaviour callers expect.
   375  TEXT ·combinedBoundsFloat32(SB), NOSPLIT, $-32
   376      MOVQ data_base+0(FP), AX
   377      MOVQ data_len+8(FP), CX
   378      XORQ R8, R8 // results default to zero bits for the empty-slice case
   379      XORQ R9, R9
   380  
   381      CMPQ CX, $0
   382      JE done
   383      XORPS X0, X0
   384      XORPS X1, X1
   385      XORQ SI, SI
   386      MOVLQZX (AX), R8 // min
   387      MOVLQZX (AX), R9 // max
   388      MOVQ R8, X0 // X0 = running min for the scalar compares
   389      MOVQ R9, X1 // X1 = running max for the scalar compares
   390  
   391      CMPB ·hasAVX512VL(SB), $0
   392      JE loop // vector path unavailable: scan everything in the scalar loop
   393  
   394      CMPQ CX, $32
   395      JB loop // not enough elements for one vector iteration
   396  
   397      MOVQ CX, DI
   398      SHRQ $5, DI
   399      SHLQ $5, DI // DI = CX with the low 5 bits cleared (multiple of 32)
   400      VPBROADCASTD (AX), Z0 // every min lane starts at data[0]
   401      VPBROADCASTD (AX), Z3 // every max lane starts at data[0]
   402  loop32:
        // Each iteration consumes 32 float32 values (two 64-byte loads).
   403      VMOVDQU32 (AX)(SI*4), Z1
   404      VMOVDQU32 64(AX)(SI*4), Z2
   405      VMINPS Z1, Z0, Z0
   406      VMINPS Z2, Z0, Z0
   407      VMAXPS Z1, Z3, Z3
   408      VMAXPS Z2, Z3, Z3
   409      ADDQ $32, SI
   410      CMPQ SI, DI
   411      JNE loop32
   412  
        // Horizontal reduction. Each step loads a swap32 index vector, uses
        // VPERMI2D to bring the upper half of the accumulator into the lower half,
        // then folds the two halves: 512 -> 256 bits.
   413      VMOVDQU32 swap32+0(SB), Z1
   414      VMOVDQU32 swap32+0(SB), Z2
   415      VPERMI2D Z0, Z0, Z1
   416      VPERMI2D Z3, Z3, Z2
   417      VMINPS Y1, Y0, Y0
   418      VMAXPS Y2, Y3, Y3
   419  
        // 256 -> 128 bits.
   420      VMOVDQU32 swap32+32(SB), Y1
   421      VMOVDQU32 swap32+32(SB), Y2
   422      VPERMI2D Y0, Y0, Y1
   423      VPERMI2D Y3, Y3, Y2
   424      VMINPS X1, X0, X0
   425      VMAXPS X2, X3, X3
   426  
        // 128 -> 64 bits: the two candidates end up in the low two lanes.
   427      VMOVDQU32 swap32+48(SB), X1
   428      VMOVDQU32 swap32+48(SB), X2
   429      VPERMI2D X0, X0, X1
   430      VPERMI2D X3, X3, X2
   431      VMINPS X1, X0, X0
   432      VMAXPS X2, X3, X3
   433      VZEROUPPER
   434  
        // Choose between the two remaining lanes of each accumulator. X1/X2 become
        // copies shifted right by 32 bits so that lane 1 sits in lane 0.
   435      MOVAPS X0, X1
   436      MOVAPS X3, X2
   437  
   438      PSRLQ $32, X1
   439      MOVQ X0, R8 // low 32 bits = lane 0 of the min candidates
   440      MOVQ X1, R10 // low 32 bits = lane 1 of the min candidates
   441      UCOMISS X0, X1
   442      CMOVLCS R10, R8 // take lane 1 when it compares below lane 0
   443  
   444      PSRLQ $32, X2
   445      MOVQ X3, R9 // low 32 bits = lane 0 of the max candidates
   446      MOVQ X2, R11 // low 32 bits = lane 1 of the max candidates
   447      UCOMISS X3, X2
   448      CMOVLHI R11, R9 // take lane 1 when it compares above lane 0
   449  
   450      CMPQ SI, CX
   451      JE done // count was a multiple of 32: no tail left
   452      MOVQ R8, X0 // reload X0/X1 with the vector results for the scalar tail
   453      MOVQ R9, X1
   454  loop:
        // Scalar loop: starts at SI = 0 when the vector path was skipped, otherwise
        // at the first element the vector loop did not cover. DX carries the element
        // bits for the conditional moves, X2 the same bits for the compares.
   455      MOVLQZX (AX)(SI*4), DX
   456      MOVQ DX, X2
   457      UCOMISS X0, X2
   458      CMOVLCS DX, R8 // element below the running min
   459      UCOMISS X1, X2
   460      CMOVLHI DX, R9 // element above the running max
   461      MOVQ R8, X0 // keep X0/X1 in sync with R8/R9
   462      MOVQ R9, X1
   463      INCQ SI
   464      CMPQ SI, CX
   465      JNE loop
   466  done:
   467      MOVL R8, min+24(FP)
   468      MOVL R9, max+28(FP)
   469      RET
   470  
   471  // func combinedBoundsFloat64(data []float64) (min, max float64)
        //
        // Computes the minimum and maximum of data using double-precision
        // floating-point comparisons. An empty slice returns the all-zero bit pattern
        // for both results. When ·hasAVX512VL is non-zero and the slice holds at
        // least 16 elements, whole groups of 16 values are folded with 512-bit
        // VMINPD/VMAXPD; any remaining elements are handled by the scalar loop.
        //
        // Registers: AX = data pointer, CX = element count, SI = element index,
        // DI = count rounded down to a multiple of 16, R8/R9 = bit patterns of the
        // running min/max, X0/X1 = the same values for UCOMISD in the scalar loop,
        // Z0/Z3 = vector min/max accumulators.
        //
        // NOTE(review): UCOMISD reports an unordered (NaN) compare through the same
        // carry flag that CMOVQCS tests - confirm the NaN behaviour callers expect.
   472  TEXT ·combinedBoundsFloat64(SB), NOSPLIT, $-40
   473      MOVQ data_base+0(FP), AX
   474      MOVQ data_len+8(FP), CX
   475      XORQ R8, R8 // results default to zero bits for the empty-slice case
   476      XORQ R9, R9
   477  
   478      CMPQ CX, $0
   479      JE done
   480      XORPD X0, X0
   481      XORPD X1, X1
   482      XORQ SI, SI
   483      MOVQ (AX), R8 // min
   484      MOVQ (AX), R9 // max
   485      MOVQ R8, X0 // X0 = running min for the scalar compares
   486      MOVQ R9, X1 // X1 = running max for the scalar compares
   487  
   488      CMPB ·hasAVX512VL(SB), $0
   489      JE loop // vector path unavailable: scan everything in the scalar loop
   490  
   491      CMPQ CX, $16
   492      JB loop // not enough elements for one vector iteration
   493  
   494      MOVQ CX, DI
   495      SHRQ $4, DI
   496      SHLQ $4, DI // DI = CX with the low 4 bits cleared (multiple of 16)
   497      VPBROADCASTQ (AX), Z0 // every min lane starts at data[0]
   498      VPBROADCASTQ (AX), Z3 // every max lane starts at data[0]
   499  loop16:
        // Each iteration consumes 16 float64 values (two 64-byte loads).
   500      VMOVDQU64 (AX)(SI*8), Z1
   501      VMOVDQU64 64(AX)(SI*8), Z2
   502      VMINPD Z1, Z0, Z0
   503      VMINPD Z2, Z0, Z0
   504      VMAXPD Z1, Z3, Z3
   505      VMAXPD Z2, Z3, Z3
   506      ADDQ $16, SI
   507      CMPQ SI, DI
   508      JNE loop16
   509  
        // Horizontal reduction. The swap32 tables hold 32-bit indexes; consecutive
        // index pairs move each 64-bit lane as two adjacent dwords. Each step brings
        // the upper half of the accumulator down and folds it in: 512 -> 256 bits.
   510      VMOVDQU64 swap32+0(SB), Z1
   511      VMOVDQU64 swap32+0(SB), Z2
   512      VPERMI2D Z0, Z0, Z1
   513      VPERMI2D Z3, Z3, Z2
   514      VMINPD Y1, Y0, Y0
   515      VMAXPD Y2, Y3, Y3
   516  
        // 256 -> 128 bits.
   517      VMOVDQU64 swap32+32(SB), Y1
   518      VMOVDQU64 swap32+32(SB), Y2
   519      VPERMI2D Y0, Y0, Y1
   520      VPERMI2D Y3, Y3, Y2
   521      VMINPD X1, X0, X0
   522      VMAXPD X2, X3, X3
   523  
        // 128 -> 64 bits: the final value ends up in the low quadword.
   524      VMOVDQU64 swap32+48(SB), X1
   525      VMOVDQU64 swap32+48(SB), X2
   526      VPERMI2D X0, X0, X1
   527      VPERMI2D X3, X3, X2
   528      VMINPD X1, X0, X0
   529      VMAXPD X2, X3, X1 // max is written to X1 so X0/X1 = min/max for the tail
   530      VZEROUPPER
   531  
   532      MOVQ X0, R8 // vector minimum bits
   533      MOVQ X1, R9 // vector maximum bits
   534      CMPQ SI, CX
   535      JE done // count was a multiple of 16: no tail left
   536  loop:
        // Scalar loop: starts at SI = 0 when the vector path was skipped, otherwise
        // at the first element the vector loop did not cover. DX carries the element
        // bits for the conditional moves, X2 the same bits for the compares.
   537      MOVQ (AX)(SI*8), DX
   538      MOVQ DX, X2
   539      UCOMISD X0, X2
   540      CMOVQCS DX, R8 // element below the running min
   541      UCOMISD X1, X2
   542      CMOVQHI DX, R9 // element above the running max
   543      MOVQ R8, X0 // keep X0/X1 in sync with R8/R9
   544      MOVQ R9, X1
   545      INCQ SI
   546      CMPQ SI, CX
   547      JNE loop
   548  done:
   549      MOVQ R8, min+24(FP)
   550      MOVQ R9, max+32(FP)
   551      RET