github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/dictionary_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  #define errnoIndexOutOfBounds 1
     6  
     7  // func dictionaryBoundsInt32(dict []int32, indexes []int32) (min, max int32, err errno)
     8  TEXT ·dictionaryBoundsInt32(SB), NOSPLIT, $0-64
     9      MOVQ dict_base+0(FP), AX
    10      MOVQ dict_len+8(FP), BX
    11  
    12      MOVQ indexes_base+24(FP), CX
    13      MOVQ indexes_len+32(FP), DX
    14  
    15      XORQ R10, R10 // min
    16      XORQ R11, R11 // max
    17      XORQ R12, R12 // err
    18      XORQ SI, SI
    19  
    20      CMPQ DX, $0
    21      JE return
    22  
    23      MOVL (CX), DI
    24      CMPL DI, BX
    25      JAE indexOutOfBounds
    26      MOVL (AX)(DI*4), R10
    27      MOVL R10, R11
    28  
    29      CMPQ DX, $8
    30      JB test
    31  
    32      CMPB ·hasAVX512VL(SB), $0
    33      JE test
    34  
    35      MOVQ DX, DI
    36      SHRQ $3, DI
    37      SHLQ $3, DI
    38  
    39      MOVQ $0xFFFF, R8
    40      KMOVW R8, K1
    41  
    42      VPBROADCASTD BX, Y2  // [len(dict)...]
    43      VPBROADCASTD R10, Y3 // [min...]
    44      VMOVDQU32 Y3, Y4     // [max...]
    45  loopAVX512:
    46      VMOVDQU32 (CX)(SI*4), Y0
    47      VPCMPUD $1, Y2, Y0, K2
    48      KMOVW K2, R9
    49      CMPB R9, $0xFF
    50      JNE indexOutOfBounds
    51      VPGATHERDD (AX)(Y0*4), K1, Y1
    52      VPMINSD Y1, Y3, Y3
    53      VPMAXSD Y1, Y4, Y4
    54      KMOVW R8, K1
    55      ADDQ $8, SI
    56      CMPQ SI, DI
    57      JNE loopAVX512
    58  
    59      VPERM2I128 $1, Y3, Y3, Y0
    60      VPERM2I128 $1, Y4, Y4, Y1
    61      VPMINSD Y0, Y3, Y3
    62      VPMAXSD Y1, Y4, Y4
    63  
    64      VPSHUFD $0b1110, Y3, Y0
    65      VPSHUFD $0b1110, Y4, Y1
    66      VPMINSD Y0, Y3, Y3
    67      VPMAXSD Y1, Y4, Y4
    68  
    69      VPSHUFD $1, Y3, Y0
    70      VPSHUFD $1, Y4, Y1
    71      VPMINSD Y0, Y3, Y3
    72      VPMAXSD Y1, Y4, Y4
    73  
    74      MOVQ X3, R10
    75      MOVQ X4, R11
    76      ANDQ $0xFFFFFFFF, R10
    77      ANDQ $0xFFFFFFFF, R11
    78  
    79      VZEROUPPER
    80      JMP test
    81  loop:
    82      MOVL (CX)(SI*4), DI
    83      CMPL DI, BX
    84      JAE indexOutOfBounds
    85      MOVL (AX)(DI*4), DI
    86      CMPL DI, R10
    87      CMOVLLT DI, R10
    88      CMPL DI, R11
    89      CMOVLGT DI, R11
    90      INCQ SI
    91  test:
    92      CMPQ SI, DX
    93      JNE loop
    94  return:
    95      MOVL R10, min+48(FP)
    96      MOVL R11, max+52(FP)
    97      MOVQ R12, err+56(FP)
    98      RET
    99  indexOutOfBounds:
   100      MOVQ $errnoIndexOutOfBounds, R12
   101      JMP return
   102  
   103  // func dictionaryBoundsInt64(dict []int64, indexes []int32) (min, max int64, err errno)
   104  TEXT ·dictionaryBoundsInt64(SB), NOSPLIT, $0-72
   105      MOVQ dict_base+0(FP), AX
   106      MOVQ dict_len+8(FP), BX
   107  
   108      MOVQ indexes_base+24(FP), CX
   109      MOVQ indexes_len+32(FP), DX
   110  
   111      XORQ R10, R10 // min
   112      XORQ R11, R11 // max
   113      XORQ R12, R12 // err
   114      XORQ SI, SI
   115  
   116      CMPQ DX, $0
   117      JE return
   118  
   119      MOVL (CX), DI
   120      CMPL DI, BX
   121      JAE indexOutOfBounds
   122      MOVQ (AX)(DI*8), R10
   123      MOVQ R10, R11
   124  
   125      CMPQ DX, $8
   126      JB test
   127  
   128      CMPB ·hasAVX512VL(SB), $0
   129      JE test
   130  
   131      MOVQ DX, DI
   132      SHRQ $3, DI
   133      SHLQ $3, DI
   134  
   135      MOVQ $0xFFFF, R8
   136      KMOVW R8, K1
   137  
   138      VPBROADCASTD BX, Y2  // [len(dict)...]
   139      VPBROADCASTQ R10, Z3 // [min...]
   140      VMOVDQU64 Z3, Z4     // [max...]
   141  loopAVX512:
   142      VMOVDQU32 (CX)(SI*4), Y0
   143      VPCMPUD $1, Y2, Y0, K2
   144      KMOVW K2, R9
   145      CMPB R9, $0xFF
   146      JNE indexOutOfBounds
   147      VPGATHERDQ (AX)(Y0*8), K1, Z1
   148      VPMINSQ Z1, Z3, Z3
   149      VPMAXSQ Z1, Z4, Z4
   150      KMOVW R8, K1
   151      ADDQ $8, SI
   152      CMPQ SI, DI
   153      JNE loopAVX512
   154  
   155      VPERMQ $0b1110, Z3, Z0
   156      VPERMQ $0b1110, Z4, Z1
   157      VPMINSQ Z0, Z3, Z3
   158      VPMAXSQ Z1, Z4, Z4
   159  
   160      VPERMQ $1, Z3, Z0
   161      VPERMQ $1, Z4, Z1
   162      VPMINSQ Z0, Z3, Z3
   163      VPMAXSQ Z1, Z4, Z4
   164  
   165      VSHUFF64X2 $2, Z3, Z3, Z0
   166      VSHUFF64X2 $2, Z4, Z4, Z1
   167      VPMINSQ Z0, Z3, Z3
   168      VPMAXSQ Z1, Z4, Z4
   169  
   170      MOVQ X3, R10
   171      MOVQ X4, R11
   172  
   173      VZEROUPPER
   174      JMP test
   175  loop:
   176      MOVL (CX)(SI*4), DI
   177      CMPL DI, BX
   178      JAE indexOutOfBounds
   179      MOVQ (AX)(DI*8), DI
   180      CMPQ DI, R10
   181      CMOVQLT DI, R10
   182      CMPQ DI, R11
   183      CMOVQGT DI, R11
   184      INCQ SI
   185  test:
   186      CMPQ SI, DX
   187      JNE loop
   188  return:
   189      MOVQ R10, min+48(FP)
   190      MOVQ R11, max+56(FP)
   191      MOVQ R12, err+64(FP)
   192      RET
   193  indexOutOfBounds:
   194      MOVQ $errnoIndexOutOfBounds, R12
   195      JMP return
   196  
   197  // func dictionaryBoundsFloat32(dict []float32, indexes []int32) (min, max float32, err errno)
   198  TEXT ·dictionaryBoundsFloat32(SB), NOSPLIT, $0-64
   199      MOVQ dict_base+0(FP), AX
   200      MOVQ dict_len+8(FP), BX
   201  
   202      MOVQ indexes_base+24(FP), CX
   203      MOVQ indexes_len+32(FP), DX
   204  
   205      PXOR X3, X3   // min
   206      PXOR X4, X4   // max
   207      XORQ R12, R12 // err
   208      XORQ SI, SI
   209  
   210      CMPQ DX, $0
   211      JE return
   212  
   213      MOVL (CX), DI
   214      CMPL DI, BX
   215      JAE indexOutOfBounds
   216      MOVSS (AX)(DI*4), X3
   217      MOVAPS X3, X4
   218  
   219      CMPQ DX, $8
   220      JB test
   221  
   222      CMPB ·hasAVX512VL(SB), $0
   223      JE test
   224  
   225      MOVQ DX, DI
   226      SHRQ $3, DI
   227      SHLQ $3, DI
   228  
   229      MOVQ $0xFFFF, R8
   230      KMOVW R8, K1
   231  
   232      VPBROADCASTD BX, Y2 // [len(dict)...]
   233      VPBROADCASTD X3, Y3 // [min...]
   234      VMOVDQU32 Y3, Y4    // [max...]
   235  loopAVX512:
   236      VMOVDQU32 (CX)(SI*4), Y0
   237      VPCMPUD $1, Y2, Y0, K2
   238      KMOVW K2, R9
   239      CMPB R9, $0xFF
   240      JNE indexOutOfBounds
   241      VPGATHERDD (AX)(Y0*4), K1, Y1
   242      VMINPS Y1, Y3, Y3
   243      VMAXPS Y1, Y4, Y4
   244      KMOVW R8, K1
   245      ADDQ $8, SI
   246      CMPQ SI, DI
   247      JNE loopAVX512
   248  
   249      VPERM2I128 $1, Y3, Y3, Y0
   250      VPERM2I128 $1, Y4, Y4, Y1
   251      VMINPS Y0, Y3, Y3
   252      VMAXPS Y1, Y4, Y4
   253  
   254      VPSHUFD $0b1110, Y3, Y0
   255      VPSHUFD $0b1110, Y4, Y1
   256      VMINPS Y0, Y3, Y3
   257      VMAXPS Y1, Y4, Y4
   258  
   259      VPSHUFD $1, Y3, Y0
   260      VPSHUFD $1, Y4, Y1
   261      VMINPS Y0, Y3, Y3
   262      VMAXPS Y1, Y4, Y4
   263  
   264      VZEROUPPER
   265      JMP test
   266  loop:
   267      MOVL (CX)(SI*4), DI
   268      CMPL DI, BX
   269      JAE indexOutOfBounds
   270      MOVSS (AX)(DI*4), X1
   271      UCOMISS X3, X1
   272      JAE skipAssignMin
   273      MOVAPS X1, X3
   274  skipAssignMin:
   275      UCOMISS X4, X1
   276      JBE skipAssignMax
   277      MOVAPS X1, X4
   278  skipAssignMax:
   279      INCQ SI
   280  test:
   281      CMPQ SI, DX
   282      JNE loop
   283  return:
   284      MOVSS X3, min+48(FP)
   285      MOVSS X4, max+52(FP)
   286      MOVQ R12, err+56(FP)
   287      RET
   288  indexOutOfBounds:
   289      MOVQ $errnoIndexOutOfBounds, R12
   290      JMP return
   291  
   292  // func dictionaryBoundsFloat64(dict []float64, indexes []int32) (min, max float64, err errno)
   293  TEXT ·dictionaryBoundsFloat64(SB), NOSPLIT, $0-72
   294      MOVQ dict_base+0(FP), AX
   295      MOVQ dict_len+8(FP), BX
   296  
   297      MOVQ indexes_base+24(FP), CX
   298      MOVQ indexes_len+32(FP), DX
   299  
   300      PXOR X3, X3   // min
   301      PXOR X4, X4   // max
   302      XORQ R12, R12 // err
   303      XORQ SI, SI
   304  
   305      CMPQ DX, $0
   306      JE return
   307  
   308      MOVL (CX), DI
   309      CMPL DI, BX
   310      JAE indexOutOfBounds
   311      MOVSD (AX)(DI*8), X3
   312      MOVAPS X3, X4
   313  
   314      CMPQ DX, $8
   315      JB test
   316  
   317      CMPB ·hasAVX512VL(SB), $0
   318      JE test
   319  
   320      MOVQ DX, DI
   321      SHRQ $3, DI
   322      SHLQ $3, DI
   323  
   324      MOVQ $0xFFFF, R8
   325      KMOVW R8, K1
   326  
   327      VPBROADCASTD BX, Y2 // [len(dict)...]
   328      VPBROADCASTQ X3, Z3 // [min...]
   329      VMOVDQU64 Z3, Z4    // [max...]
   330  loopAVX512:
   331      VMOVDQU32 (CX)(SI*4), Y0
   332      VPCMPUD $1, Y2, Y0, K2
   333      KMOVW K2, R9
   334      CMPB R9, $0xFF
   335      JNE indexOutOfBounds
   336      VPGATHERDQ (AX)(Y0*8), K1, Z1
   337      VMINPD Z1, Z3, Z3
   338      VMAXPD Z1, Z4, Z4
   339      KMOVW R8, K1
   340      ADDQ $8, SI
   341      CMPQ SI, DI
   342      JNE loopAVX512
   343  
   344      VPERMQ $0b1110, Z3, Z0
   345      VPERMQ $0b1110, Z4, Z1
   346      VMINPD Z0, Z3, Z3
   347      VMAXPD Z1, Z4, Z4
   348  
   349      VPERMQ $1, Z3, Z0
   350      VPERMQ $1, Z4, Z1
   351      VMINPD Z0, Z3, Z3
   352      VMAXPD Z1, Z4, Z4
   353  
   354      VSHUFF64X2 $2, Z3, Z3, Z0
   355      VSHUFF64X2 $2, Z4, Z4, Z1
   356      VMINPD Z0, Z3, Z3
   357      VMAXPD Z1, Z4, Z4
   358  
   359      VZEROUPPER
   360      JMP test
   361  loop:
   362      MOVL (CX)(SI*4), DI
   363      CMPL DI, BX
   364      JAE indexOutOfBounds
   365      MOVSD (AX)(DI*8), X1
   366      UCOMISD X3, X1
   367      JAE skipAssignMin
   368      MOVAPD X1, X3
   369  skipAssignMin:
   370      UCOMISD X4, X1
   371      JBE skipAssignMax
   372      MOVAPD X1, X4
   373  skipAssignMax:
   374      INCQ SI
   375  test:
   376      CMPQ SI, DX
   377      JNE loop
   378  return:
   379      MOVSD X3, min+48(FP)
   380      MOVSD X4, max+56(FP)
   381      MOVQ R12, err+64(FP)
   382      RET
   383  indexOutOfBounds:
   384      MOVQ $errnoIndexOutOfBounds, R12
   385      JMP return
   386  
   387  // func dictionaryBoundsUint32(dict []uint32, indexes []int32) (min, max uint32, err errno)
   388  TEXT ·dictionaryBoundsUint32(SB), NOSPLIT, $0-64
   389      MOVQ dict_base+0(FP), AX
   390      MOVQ dict_len+8(FP), BX
   391  
   392      MOVQ indexes_base+24(FP), CX
   393      MOVQ indexes_len+32(FP), DX
   394  
   395      XORQ R10, R10 // min
   396      XORQ R11, R11 // max
   397      XORQ R12, R12 // err
   398      XORQ SI, SI
   399  
   400      CMPQ DX, $0
   401      JE return
   402  
   403      MOVL (CX), DI
   404      CMPL DI, BX
   405      JAE indexOutOfBounds
   406      MOVL (AX)(DI*4), R10
   407      MOVL R10, R11
   408  
   409      CMPQ DX, $8
   410      JB test
   411  
   412      CMPB ·hasAVX512VL(SB), $0
   413      JE test
   414  
   415      MOVQ DX, DI
   416      SHRQ $3, DI
   417      SHLQ $3, DI
   418  
   419      MOVQ $0xFFFF, R8
   420      KMOVW R8, K1
   421  
   422      VPBROADCASTD BX, Y2  // [len(dict)...]
   423      VPBROADCASTD R10, Y3 // [min...]
   424      VMOVDQU32 Y3, Y4     // [max...]
   425  loopAVX512:
   426      VMOVDQU32 (CX)(SI*4), Y0
   427      VPCMPUD $1, Y2, Y0, K2
   428      KMOVW K2, R9
   429      CMPB R9, $0xFF
   430      JNE indexOutOfBounds
   431      VPGATHERDD (AX)(Y0*4), K1, Y1
   432      VPMINUD Y1, Y3, Y3
   433      VPMAXUD Y1, Y4, Y4
   434      KMOVW R8, K1
   435      ADDQ $8, SI
   436      CMPQ SI, DI
   437      JNE loopAVX512
   438  
   439      VPERM2I128 $1, Y3, Y3, Y0
   440      VPERM2I128 $1, Y4, Y4, Y1
   441      VPMINUD Y0, Y3, Y3
   442      VPMAXUD Y1, Y4, Y4
   443  
   444      VPSHUFD $0b1110, Y3, Y0
   445      VPSHUFD $0b1110, Y4, Y1
   446      VPMINUD Y0, Y3, Y3
   447      VPMAXUD Y1, Y4, Y4
   448  
   449      VPSHUFD $1, Y3, Y0
   450      VPSHUFD $1, Y4, Y1
   451      VPMINUD Y0, Y3, Y3
   452      VPMAXUD Y1, Y4, Y4
   453  
   454      MOVQ X3, R10
   455      MOVQ X4, R11
   456      ANDQ $0xFFFFFFFF, R10
   457      ANDQ $0xFFFFFFFF, R11
   458  
   459      VZEROUPPER
   460      JMP test
   461  loop:
   462      MOVL (CX)(SI*4), DI
   463      CMPL DI, BX
   464      JAE indexOutOfBounds
   465      MOVL (AX)(DI*4), DI
   466      CMPL DI, R10
   467      CMOVLCS DI, R10
   468      CMPL DI, R11
   469      CMOVLHI DI, R11
   470      INCQ SI
   471  test:
   472      CMPQ SI, DX
   473      JNE loop
   474  return:
   475      MOVL R10, min+48(FP)
   476      MOVL R11, max+52(FP)
   477      MOVQ R12, err+56(FP)
   478      RET
   479  indexOutOfBounds:
   480      MOVQ $errnoIndexOutOfBounds, R12
   481      JMP return
   482  
   483  // func dictionaryBoundsUint64(dict []uint64, indexes []int32) (min, max uint64, err errno)
   484  TEXT ·dictionaryBoundsUint64(SB), NOSPLIT, $0-72
   485      MOVQ dict_base+0(FP), AX
   486      MOVQ dict_len+8(FP), BX
   487  
   488      MOVQ indexes_base+24(FP), CX
   489      MOVQ indexes_len+32(FP), DX
   490  
   491      XORQ R10, R10 // min
   492      XORQ R11, R11 // max
   493      XORQ R12, R12 // err
   494      XORQ SI, SI
   495  
   496      CMPQ DX, $0
   497      JE return
   498  
   499      MOVL (CX)(SI*4), DI
   500      CMPL DI, BX
   501      JAE indexOutOfBounds
   502      MOVQ (AX)(DI*8), R10
   503      MOVQ R10, R11
   504  
   505      CMPQ DX, $8
   506      JB test
   507  
   508      CMPB ·hasAVX512VL(SB), $0
   509      JE test
   510  
   511      MOVQ DX, DI
   512      SHRQ $3, DI
   513      SHLQ $3, DI
   514  
   515      MOVQ $0xFFFF, R8
   516      KMOVW R8, K1
   517  
   518      VPBROADCASTD BX, Y2  // [len(dict)...]
   519      VPBROADCASTQ R10, Z3 // [min...]
   520      VMOVDQU64 Z3, Z4     // [max...]
   521  loopAVX512:
   522      VMOVDQU32 (CX)(SI*4), Y0
   523      VPCMPUD $1, Y2, Y0, K2
   524      KMOVW K2, R9
   525      CMPB R9, $0xFF
   526      JNE indexOutOfBounds
   527      VPGATHERDQ (AX)(Y0*8), K1, Z1
   528      VPMINUQ Z1, Z3, Z3
   529      VPMAXUQ Z1, Z4, Z4
   530      KMOVW R8, K1
   531      ADDQ $8, SI
   532      CMPQ SI, DI
   533      JNE loopAVX512
   534  
   535      VPERMQ $0b1110, Z3, Z0
   536      VPERMQ $0b1110, Z4, Z1
   537      VPMINUQ Z0, Z3, Z3
   538      VPMAXUQ Z1, Z4, Z4
   539  
   540      VPERMQ $1, Z3, Z0
   541      VPERMQ $1, Z4, Z1
   542      VPMINUQ Z0, Z3, Z3
   543      VPMAXUQ Z1, Z4, Z4
   544  
   545      VSHUFF64X2 $2, Z3, Z3, Z0
   546      VSHUFF64X2 $2, Z4, Z4, Z1
   547      VPMINUQ Z0, Z3, Z3
   548      VPMAXUQ Z1, Z4, Z4
   549  
   550      MOVQ X3, R10
   551      MOVQ X4, R11
   552  
   553      VZEROUPPER
   554      JMP test
   555  loop:
   556      MOVL (CX)(SI*4), DI
   557      CMPL DI, BX
   558      JAE indexOutOfBounds
   559      MOVQ (AX)(DI*8), DI
   560      CMPQ DI, R10
   561      CMOVQCS DI, R10
   562      CMPQ DI, R11
   563      CMOVQHI DI, R11
   564      INCQ SI
   565  test:
   566      CMPQ SI, DX
   567      JNE loop
   568  return:
   569      MOVQ R10, min+48(FP)
   570      MOVQ R11, max+56(FP)
   571      MOVQ R12, err+64(FP)
   572      RET
   573  indexOutOfBounds:
   574      MOVQ $errnoIndexOutOfBounds, R12
   575      JMP return
   576  
   577  // func dictionaryBoundsBE128(dict [][16]byte, indexes []int32) (min, max *[16]byte, err errno)
   578  TEXT ·dictionaryBoundsBE128(SB), NOSPLIT, $0-72
   579      MOVQ dict_base+0(FP), AX
   580      MOVQ dict_len+8(FP), BX
   581  
   582      MOVQ indexes_base+24(FP), CX
   583      MOVQ indexes_len+32(FP), DX
   584      SHLQ $2, DX // x 4
   585      ADDQ CX, DX // end
   586  
   587      XORQ R8, R8 // min (pointer)
   588      XORQ R9, R9 // max (pointer)
   589      XORQ SI, SI // err
   590      XORQ DI, DI
   591  
   592      CMPQ DX, $0
   593      JE return
   594  
   595      MOVL (CX), DI
   596      CMPL DI, BX
   597      JAE indexOutOfBounds
   598      SHLQ $4, DI // the dictionary contains 16 byte words
   599      LEAQ (AX)(DI*1), R8
   600      MOVQ R8, R9
   601      MOVQ 0(AX)(DI*1), R10 // min (high)
   602      MOVQ 8(AX)(DI*1), R11 // min (low)
   603      BSWAPQ R10
   604      BSWAPQ R11
   605      MOVQ R10, R12 // max (high)
   606      MOVQ R11, R13 // max (low)
   607  
   608      JMP next
   609  loop:
   610      MOVL (CX), DI
   611      CMPL DI, BX
   612      JAE indexOutOfBounds
   613      SHLQ $4, DI
   614      MOVQ 0(AX)(DI*1), R14
   615      MOVQ 8(AX)(DI*1), R15
   616      BSWAPQ R14
   617      BSWAPQ R15
   618  testLessThan:
   619      CMPQ R14, R10
   620      JA testGreaterThan
   621      JB lessThan
   622      CMPQ R15, R11
   623      JAE testGreaterThan
   624  lessThan:
   625      LEAQ (AX)(DI*1), R8
   626      MOVQ R14, R10
   627      MOVQ R15, R11
   628      JMP next
   629  testGreaterThan:
   630      CMPQ R14, R12
   631      JB next
   632      JA greaterThan
   633      CMPQ R15, R13
   634      JBE next
   635  greaterThan:
   636      LEAQ (AX)(DI*1), R9
   637      MOVQ R14, R12
   638      MOVQ R15, R13
   639  next:
   640      ADDQ $4, CX
   641      CMPQ CX, DX
   642      JNE loop
   643  return:
   644      MOVQ R8, min+48(FP)
   645      MOVQ R9, max+56(FP)
   646      MOVQ SI, err+64(FP)
   647      RET
   648  indexOutOfBounds:
   649      MOVQ $errnoIndexOutOfBounds, SI
   650      JMP return
   651  
   652  // The lookup functions provide optimized versions of the dictionary index
   653  // lookup logic.
   654  //
   655  // When AVX512 is available, the AVX512 versions of the functions are used
   656  // which use the VPGATHER* instructions to perform 8 parallel lookups of the
   657  // values in the dictionary, then VPSCATTER* to do 8 parallel writes to the
   658  // sparse output buffer.
   659  
   660  // func dictionaryLookup32(dict []uint32, indexes []int32, rows sparse.Array) errno
   661  TEXT ·dictionaryLookup32(SB), NOSPLIT, $0-80
   662      MOVQ dict_base+0(FP), AX
   663      MOVQ dict_len+8(FP), BX
   664  
   665      MOVQ indexes_base+24(FP), CX
   666      MOVQ indexes_len+32(FP), DX
   667  
   668      MOVQ rows_array_ptr+48(FP), R8
   669      MOVQ rows_array_off+64(FP), R9
   670  
   671      XORQ SI, SI
   672  
   673      CMPQ DX, $8
   674      JB test
   675  
   676      CMPB ·hasAVX512VL(SB), $0
   677      JE test
   678  
   679      MOVQ DX, DI
   680      SHRQ $3, DI
   681      SHLQ $3, DI
   682  
   683      MOVQ R9, R10
   684      SHLQ $3, R10 // 8 * size
   685  
   686      MOVW $0xFFFF, R11
   687      KMOVW R11, K1
   688      KMOVW R11, K2
   689  
   690      VPBROADCASTD R9, Y2           // [size...]
   691      VPMULLD ·range0n8(SB), Y2, Y2 // [0*size,1*size,...]
   692      VPBROADCASTD BX, Y3           // [len(dict)...]
   693  loopAVX512:
   694      VMOVDQU32 (CX)(SI*4), Y0
   695      VPCMPUD $1, Y3, Y0, K3
   696      KMOVW K3, R11
   697      CMPB R11, $0xFF
   698      JNE indexOutOfBounds
   699      VPGATHERDD (AX)(Y0*4), K1, Y1
   700      VPSCATTERDD Y1, K2, (R8)(Y2*1)
   701      KMOVW R11, K1
   702      KMOVW R11, K2
   703      ADDQ R10, R8
   704      ADDQ $8, SI
   705      CMPQ SI, DI
   706      JNE loopAVX512
   707      VZEROUPPER
   708      JMP test
   709  loop:
   710      MOVL (CX)(SI*4), DI
   711      CMPL DI, BX
   712      JAE indexOutOfBounds
   713      MOVL (AX)(DI*4), DI
   714      MOVL DI, (R8)
   715      ADDQ R9, R8
   716      INCQ SI
   717  test:
   718      CMPQ SI, DX
   719      JNE loop
   720      XORQ AX, AX
   721  return:
   722      MOVQ AX, ret+72(FP)
   723      RET
   724  indexOutOfBounds:
   725      MOVQ $errnoIndexOutOfBounds, AX
   726      JMP return
   727  
   728  // func dictionaryLookup64(dict []uint64, indexes []int32, rows sparse.Array) errno
   729  TEXT ·dictionaryLookup64(SB), NOSPLIT, $0-80
   730      MOVQ dict_base+0(FP), AX
   731      MOVQ dict_len+8(FP), BX
   732  
   733      MOVQ indexes_base+24(FP), CX
   734      MOVQ indexes_len+32(FP), DX
   735  
   736      MOVQ rows_array_ptr+48(FP), R8
   737      MOVQ rows_array_off+64(FP), R9
   738  
   739      XORQ SI, SI
   740  
   741      CMPQ DX, $8
   742      JB test
   743  
   744      CMPB ·hasAVX512VL(SB), $0
   745      JE test
   746  
   747      MOVQ DX, DI
   748      SHRQ $3, DI
   749      SHLQ $3, DI
   750  
   751      MOVQ R9, R10
   752      SHLQ $3, R10 // 8 * size
   753  
   754      MOVW $0xFFFF, R11
   755      KMOVW R11, K1
   756      KMOVW R11, K2
   757  
   758      VPBROADCASTD R9, Y2           // [size...]
   759      VPMULLD ·range0n8(SB), Y2, Y2 // [0*size,1*size,...]
   760      VPBROADCASTD BX, Y3           // [len(dict)...]
   761  loopAVX512:
   762      VMOVDQU32 (CX)(SI*4), Y0
   763      VPCMPUD $1, Y3, Y0, K3
   764      KMOVW K3, R11
   765      CMPB R11, $0xFF
   766      JNE indexOutOfBounds
   767      VPGATHERDQ (AX)(Y0*8), K1, Z1
   768      VPSCATTERDQ Z1, K2, (R8)(Y2*1)
   769      KMOVW R11, K1
   770      KMOVW R11, K2
   771      ADDQ R10, R8
   772      ADDQ $8, SI
   773      CMPQ SI, DI
   774      JNE loopAVX512
   775      VZEROUPPER
   776      JMP test
   777  loop:
   778      MOVL (CX)(SI*4), DI
   779      CMPL DI, BX
   780      JAE indexOutOfBounds
   781      MOVQ (AX)(DI*8), DI
   782      MOVQ DI, (R8)
   783      ADDQ R9, R8
   784      INCQ SI
   785  test:
   786      CMPQ SI, DX
   787      JNE loop
   788      XORQ AX, AX
   789  return:
   790      MOVQ AX, ret+72(FP)
   791      RET
   792  indexOutOfBounds:
   793      MOVQ $errnoIndexOutOfBounds, AX
   794      JMP return
   795  
   796  // func dictionaryLookupByteArrayString(dict []uint32, page []byte, indexes []int32, rows sparse.Array) errno
   797  TEXT ·dictionaryLookupByteArrayString(SB), NOSPLIT, $0-104
   798      MOVQ dict_base+0(FP), AX
   799      MOVQ dict_len+8(FP), BX
   800  
   801      MOVQ page+24(FP), CX
   802  
   803      MOVQ indexes_base+48(FP), R8
   804      MOVQ indexes_len+56(FP), R9
   805  
   806      MOVQ rows_array_ptr+72(FP), R10
   807      MOVQ rows_array_off+88(FP), R11
   808  
   809      XORQ DI, DI
   810      XORQ SI, SI
   811  loop:
   812      // Load the index that we want to read the value from. This may come from
   813      // user input so we must validate that the indexes are within the bounds of
   814      // the dictionary.
   815      MOVL (R8)(SI*4), DI
   816      CMPL DI, BX
   817      JAE indexOutOfBounds
   818  
   819      // Load the offset within the dictionary page where the value is stored.
   820      // We trust the offsets to be correct since they are generated internally by
   821      // the dictionary code, there is no need to check that they are within the
   822      // bounds of the dictionary page.
   823      MOVL (AX)(DI*4), DI
   824  
   825      // Load the value from the dictionary page. The page uses the PLAIN encoding
   826      // where each byte array is prefixed with a 4 bytes little endian length.
   827      LEAQ 4(CX)(DI*1), DX
   828      MOVL (CX)(DI*1), DI
   829  
   830      // Store the length and pointer to the value into the output location.
   831      // The memory layout is expected to hold a pointer and length, which are
   832      // both 64 bits words. This is the layout used by parquet.Value and the Go
   833      // string value type.
   834      MOVQ DX, (R10)
   835      MOVQ DI, 8(R10)
   836  
   837      ADDQ R11, R10
   838      INCQ SI
   839  test:
   840      CMPQ SI, R9
   841      JNE loop
   842      XORQ AX, AX
   843  return:
   844      MOVQ AX, ret+96(FP)
   845      RET
   846  indexOutOfBounds:
   847      MOVQ $errnoIndexOutOfBounds, AX
   848      JMP return
   849  
   850  // func dictionaryLookupFixedLenByteArrayString(dict []byte, len int, indexes []int32, rows sparse.Array) errno
   851  TEXT ·dictionaryLookupFixedLenByteArrayString(SB), NOSPLIT, $0-88
   852      MOVQ dict_base+0(FP), AX
   853      MOVQ dict_len+8(FP), BX
   854  
   855      MOVQ len+24(FP), CX
   856  
   857      MOVQ indexes_base+32(FP), DX
   858      MOVQ indexes_len+40(FP), R8
   859  
   860      MOVQ rows_array_ptr+56(FP), R9
   861      MOVQ rows_array_off+72(FP), R10
   862  
   863      XORQ DI, DI
   864      XORQ SI, SI
   865  loop:
   866      MOVL (DX)(SI*4), DI
   867      IMULQ CX, DI
   868      CMPL DI, BX
   869      JAE indexOutOfBounds
   870  
   871      ADDQ AX, DI
   872      MOVQ DI, (R9)
   873      MOVQ CX, 8(R9)
   874  
   875      ADDQ R10, R9
   876      INCQ SI
   877  test:
   878      CMPQ SI, R8
   879      JNE loop
   880      XORQ AX, AX
   881  return:
   882      MOVQ AX, ret+80(FP)
   883      RET
   884  indexOutOfBounds:
   885      MOVQ $errnoIndexOutOfBounds, AX
   886      JMP return
   887  
   888  // This is the same algorithm as dictionaryLookupFixedLenByteArrayString but we
   889  // only store the pointer to the location holding the value instead of storing
   890  // the pair of pointer and length. Since the length is fixed for this dictionary
   891  // type, the application can assume it at the call site.
   892  //
   893  // func dictionaryLookupFixedLenByteArrayPointer(dict []byte, len int, indexes []int32, rows sparse.Array) errno
   894  TEXT ·dictionaryLookupFixedLenByteArrayPointer(SB), NOSPLIT, $0-88
   895      MOVQ dict_base+0(FP), AX
   896      MOVQ dict_len+8(FP), BX
   897  
   898      MOVQ len+24(FP), CX
   899  
   900      MOVQ indexes_base+32(FP), DX
   901      MOVQ indexes_len+40(FP), R8
   902  
   903      MOVQ rows_array_ptr+56(FP), R9
   904      MOVQ rows_array_off+72(FP), R10
   905  
   906      XORQ DI, DI
   907      XORQ SI, SI
   908  loop:
   909      MOVL (DX)(SI*4), DI
   910      IMULQ CX, DI
   911      CMPL DI, BX
   912      JAE indexOutOfBounds
   913  
   914      ADDQ AX, DI
   915      MOVQ DI, (R9)
   916  
   917      ADDQ R10, R9
   918      INCQ SI
   919  test:
   920      CMPQ SI, R8
   921      JNE loop
   922      XORQ AX, AX
   923  return:
   924      MOVQ AX, ret+80(FP)
   925      RET
   926  indexOutOfBounds:
   927      MOVQ $errnoIndexOutOfBounds, AX
   928      JMP return
   929  
   930  GLOBL ·range0n8(SB), RODATA|NOPTR, $40
   931  DATA ·range0n8+0(SB)/4, $0
   932  DATA ·range0n8+4(SB)/4, $1
   933  DATA ·range0n8+8(SB)/4, $2
   934  DATA ·range0n8+12(SB)/4, $3
   935  DATA ·range0n8+16(SB)/4, $4
   936  DATA ·range0n8+20(SB)/4, $5
   937  DATA ·range0n8+24(SB)/4, $6
   938  DATA ·range0n8+28(SB)/4, $7
   939  DATA ·range0n8+32(SB)/4, $8