github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/bloom/xxhash/sum64uint_amd64.s

//go:build !purego

#include "textflag.h"

/*
The algorithms in this file are assembly versions of the Go functions in the
sum64uint_default.go file.

The implementations are mostly direct translations of the Go code to assembly,
leveraging SIMD instructions to process chunks of the input variables in
parallel at each loop iteration. To maximize utilization of the CPU capacity,
some of the functions unroll two steps of the vectorized loop per iteration,
which yields further throughput because the CPU is able to process some of the
instructions from the two steps in parallel, as there are no data dependencies
between their inputs and outputs.

The use of AVX-512 yields a significant increase in throughput on all the
algorithms, in large part thanks to the VPMULLQ instruction, which computes
8 x 64 bit multiplications. There is no equivalent instruction in AVX2, which
required emulating vector multiplication with a combination of 32 bit
multiplications, additions, shifts, and masks: the number of instructions and
data dependencies resulted in AVX2 code that delivered equivalent performance
at a much higher complexity.

The benchmark results below showcase the improvements that the AVX-512 code
yields on the XXH64 algorithms:

name                   old speed      new speed       delta
MultiSum64Uint8/4KB    4.97GB/s ± 0%  14.59GB/s ± 1%  +193.73%  (p=0.000 n=10+10)
MultiSum64Uint16/4KB   3.55GB/s ± 0%   9.46GB/s ± 0%  +166.20%  (p=0.000 n=10+9)
MultiSum64Uint32/4KB   4.48GB/s ± 0%  13.93GB/s ± 1%  +210.93%  (p=0.000 n=10+10)
MultiSum64Uint64/4KB   3.57GB/s ± 0%  11.12GB/s ± 1%  +211.73%  (p=0.000 n=9+10)
MultiSum64Uint128/4KB  2.54GB/s ± 0%   6.49GB/s ± 1%  +155.69%  (p=0.000 n=10+10)

name                   old hash/s     new hash/s      delta
MultiSum64Uint8/4KB        621M ± 0%      1823M ± 1%  +193.73%  (p=0.000 n=10+10)
MultiSum64Uint16/4KB       444M ± 0%      1182M ± 0%  +166.20%  (p=0.000 n=10+9)
MultiSum64Uint32/4KB       560M ± 0%      1742M ± 1%  +210.93%  (p=0.000 n=10+10)
MultiSum64Uint64/4KB       446M ± 0%      1391M ± 1%  +211.73%  (p=0.000 n=9+10)
MultiSum64Uint128/4KB      317M ± 0%       811M ± 1%  +155.69%  (p=0.000 n=10+10)

The functions perform runtime detection of AVX-512 support by testing the value
of the xxhash.hasAVX512 variable declared and initialized in sum64uint_amd64.go.
Branch mispredictions on those tests are very unlikely since the value is never
modified by the application. The cost of the comparisons is also amortized by
the bulk APIs of the MultiSum64* functions (a single test is required per call).

If a bug is suspected in the vectorized code, compiling the program or running
the tests with -tags=purego can help verify whether the behavior changes when
the program does not use the assembly versions.

Maintenance of these functions can be complex; however, the XXH64 algorithm is
unlikely to evolve, and the implementations are unlikely to change. The tests in
sum64uint_test.go compare the outputs of the MultiSum64* functions with the
reference xxhash.Sum64 function; future maintainers can rely on those tests
passing as a guarantee that they have not introduced regressions.
*/

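// For reference, each MultiSum64* function below computes the equivalent of a
// simple Go loop that hashes every element with the scalar code path. This is
// only a sketch of the intent (the helper name is illustrative; the actual
// scalar implementations live in sum64uint_default.go):
//
//	func MultiSum64Uint64(h []uint64, v []uint64) int {
//		n := len(h)
//		if len(v) < n {
//			n = len(v)
//		}
//		for i := 0; i < n; i++ {
//			h[i] = Sum64Uint64(v[i]) // XXH64 of the 8-byte value
//		}
//		return n
//	}
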
#define PRIME1 0x9E3779B185EBCA87
#define PRIME2 0xC2B2AE3D27D4EB4F
#define PRIME3 0x165667B19E3779F9
#define PRIME4 0x85EBCA77C2B2AE63
#define PRIME5 0x27D4EB2F165667C5

#define prime1 R12
#define prime2 R13
#define prime3 R14
#define prime4 R15
#define prime5 R15 // same as prime4 because they are not used together

#define prime1ZMM Z12
#define prime2ZMM Z13
#define prime3ZMM Z14
#define prime4ZMM Z15
#define prime5ZMM Z15

DATA prime1vec<>+0(SB)/8, $PRIME1
DATA prime1vec<>+8(SB)/8, $PRIME1
DATA prime1vec<>+16(SB)/8, $PRIME1
DATA prime1vec<>+24(SB)/8, $PRIME1
DATA prime1vec<>+32(SB)/8, $PRIME1
DATA prime1vec<>+40(SB)/8, $PRIME1
DATA prime1vec<>+48(SB)/8, $PRIME1
DATA prime1vec<>+56(SB)/8, $PRIME1
GLOBL prime1vec<>(SB), RODATA|NOPTR, $64

DATA prime2vec<>+0(SB)/8, $PRIME2
DATA prime2vec<>+8(SB)/8, $PRIME2
DATA prime2vec<>+16(SB)/8, $PRIME2
DATA prime2vec<>+24(SB)/8, $PRIME2
DATA prime2vec<>+32(SB)/8, $PRIME2
DATA prime2vec<>+40(SB)/8, $PRIME2
DATA prime2vec<>+48(SB)/8, $PRIME2
DATA prime2vec<>+56(SB)/8, $PRIME2
GLOBL prime2vec<>(SB), RODATA|NOPTR, $64

DATA prime3vec<>+0(SB)/8, $PRIME3
DATA prime3vec<>+8(SB)/8, $PRIME3
DATA prime3vec<>+16(SB)/8, $PRIME3
DATA prime3vec<>+24(SB)/8, $PRIME3
DATA prime3vec<>+32(SB)/8, $PRIME3
DATA prime3vec<>+40(SB)/8, $PRIME3
DATA prime3vec<>+48(SB)/8, $PRIME3
DATA prime3vec<>+56(SB)/8, $PRIME3
GLOBL prime3vec<>(SB), RODATA|NOPTR, $64

DATA prime4vec<>+0(SB)/8, $PRIME4
DATA prime4vec<>+8(SB)/8, $PRIME4
DATA prime4vec<>+16(SB)/8, $PRIME4
DATA prime4vec<>+24(SB)/8, $PRIME4
DATA prime4vec<>+32(SB)/8, $PRIME4
DATA prime4vec<>+40(SB)/8, $PRIME4
DATA prime4vec<>+48(SB)/8, $PRIME4
DATA prime4vec<>+56(SB)/8, $PRIME4
GLOBL prime4vec<>(SB), RODATA|NOPTR, $64

DATA prime5vec<>+0(SB)/8, $PRIME5
DATA prime5vec<>+8(SB)/8, $PRIME5
DATA prime5vec<>+16(SB)/8, $PRIME5
DATA prime5vec<>+24(SB)/8, $PRIME5
DATA prime5vec<>+32(SB)/8, $PRIME5
DATA prime5vec<>+40(SB)/8, $PRIME5
DATA prime5vec<>+48(SB)/8, $PRIME5
DATA prime5vec<>+56(SB)/8, $PRIME5
GLOBL prime5vec<>(SB), RODATA|NOPTR, $64

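// The prime5vecN tables below hold PRIME5+N broadcast across the 8 lanes of a
// ZMM register. In XXH64 (with a zero seed), PRIME5+len is the initial
// accumulator value for an input of len bytes, so these tables seed the
// vectorized hashes of 1, 2, 4, 8, and 16 byte inputs respectively.
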
DATA prime5vec1<>+0(SB)/8, $PRIME5+1
DATA prime5vec1<>+8(SB)/8, $PRIME5+1
DATA prime5vec1<>+16(SB)/8, $PRIME5+1
DATA prime5vec1<>+24(SB)/8, $PRIME5+1
DATA prime5vec1<>+32(SB)/8, $PRIME5+1
DATA prime5vec1<>+40(SB)/8, $PRIME5+1
DATA prime5vec1<>+48(SB)/8, $PRIME5+1
DATA prime5vec1<>+56(SB)/8, $PRIME5+1
GLOBL prime5vec1<>(SB), RODATA|NOPTR, $64

DATA prime5vec2<>+0(SB)/8, $PRIME5+2
DATA prime5vec2<>+8(SB)/8, $PRIME5+2
DATA prime5vec2<>+16(SB)/8, $PRIME5+2
DATA prime5vec2<>+24(SB)/8, $PRIME5+2
DATA prime5vec2<>+32(SB)/8, $PRIME5+2
DATA prime5vec2<>+40(SB)/8, $PRIME5+2
DATA prime5vec2<>+48(SB)/8, $PRIME5+2
DATA prime5vec2<>+56(SB)/8, $PRIME5+2
GLOBL prime5vec2<>(SB), RODATA|NOPTR, $64

DATA prime5vec4<>+0(SB)/8, $PRIME5+4
DATA prime5vec4<>+8(SB)/8, $PRIME5+4
DATA prime5vec4<>+16(SB)/8, $PRIME5+4
DATA prime5vec4<>+24(SB)/8, $PRIME5+4
DATA prime5vec4<>+32(SB)/8, $PRIME5+4
DATA prime5vec4<>+40(SB)/8, $PRIME5+4
DATA prime5vec4<>+48(SB)/8, $PRIME5+4
DATA prime5vec4<>+56(SB)/8, $PRIME5+4
GLOBL prime5vec4<>(SB), RODATA|NOPTR, $64

DATA prime5vec8<>+0(SB)/8, $PRIME5+8
DATA prime5vec8<>+8(SB)/8, $PRIME5+8
DATA prime5vec8<>+16(SB)/8, $PRIME5+8
DATA prime5vec8<>+24(SB)/8, $PRIME5+8
DATA prime5vec8<>+32(SB)/8, $PRIME5+8
DATA prime5vec8<>+40(SB)/8, $PRIME5+8
DATA prime5vec8<>+48(SB)/8, $PRIME5+8
DATA prime5vec8<>+56(SB)/8, $PRIME5+8
GLOBL prime5vec8<>(SB), RODATA|NOPTR, $64

DATA prime5vec16<>+0(SB)/8, $PRIME5+16
DATA prime5vec16<>+8(SB)/8, $PRIME5+16
DATA prime5vec16<>+16(SB)/8, $PRIME5+16
DATA prime5vec16<>+24(SB)/8, $PRIME5+16
DATA prime5vec16<>+32(SB)/8, $PRIME5+16
DATA prime5vec16<>+40(SB)/8, $PRIME5+16
DATA prime5vec16<>+48(SB)/8, $PRIME5+16
DATA prime5vec16<>+56(SB)/8, $PRIME5+16
GLOBL prime5vec16<>(SB), RODATA|NOPTR, $64

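// lowbytemask is used by MultiSum64Uint16 to isolate the low byte of each
// 16-bit input, so that the two bytes of every value can be hashed as two
// successive single-byte rounds.
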
DATA lowbytemask<>+0(SB)/8, $0xFF
DATA lowbytemask<>+8(SB)/8, $0xFF
DATA lowbytemask<>+16(SB)/8, $0xFF
DATA lowbytemask<>+24(SB)/8, $0xFF
DATA lowbytemask<>+32(SB)/8, $0xFF
DATA lowbytemask<>+40(SB)/8, $0xFF
DATA lowbytemask<>+48(SB)/8, $0xFF
DATA lowbytemask<>+56(SB)/8, $0xFF
GLOBL lowbytemask<>(SB), RODATA|NOPTR, $64

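// vpermi2qeven and vpermi2qodd are VPERMI2Q index tables used by
// MultiSum64Uint128 to gather the even (low 64 bits) and odd (high 64 bits)
// quadwords of 8 consecutive 128-bit inputs spread across two source vectors;
// bit 3 of each index selects which of the two source registers the quadword
// comes from.
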
DATA vpermi2qeven<>+0(SB)/8, $0
DATA vpermi2qeven<>+8(SB)/8, $2
DATA vpermi2qeven<>+16(SB)/8, $4
DATA vpermi2qeven<>+24(SB)/8, $6
DATA vpermi2qeven<>+32(SB)/8, $(1<<3)|0
DATA vpermi2qeven<>+40(SB)/8, $(1<<3)|2
DATA vpermi2qeven<>+48(SB)/8, $(1<<3)|4
DATA vpermi2qeven<>+56(SB)/8, $(1<<3)|6
GLOBL vpermi2qeven<>(SB), RODATA|NOPTR, $64

DATA vpermi2qodd<>+0(SB)/8, $1
DATA vpermi2qodd<>+8(SB)/8, $3
DATA vpermi2qodd<>+16(SB)/8, $5
DATA vpermi2qodd<>+24(SB)/8, $7
DATA vpermi2qodd<>+32(SB)/8, $(1<<3)|1
DATA vpermi2qodd<>+40(SB)/8, $(1<<3)|3
DATA vpermi2qodd<>+48(SB)/8, $(1<<3)|5
DATA vpermi2qodd<>+56(SB)/8, $(1<<3)|7
GLOBL vpermi2qodd<>(SB), RODATA|NOPTR, $64

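// round is the scalar XXH64 round function; note that it clobbers the input
// register. In Go terms it computes (a sketch of the same arithmetic):
//
//	acc = bits.RotateLeft64(acc+input*PRIME2, 31) * PRIME1
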
#define round(input, acc) \
    IMULQ prime2, input \
    ADDQ  input, acc \
    ROLQ  $31, acc \
    IMULQ prime1, acc

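// avalanche is the XXH64 finalization mix, scrambling the accumulator so that
// all input bits affect all output bits. In Go terms (a sketch):
//
//	acc ^= acc >> 33
//	acc *= PRIME2
//	acc ^= acc >> 29
//	acc *= PRIME3
//	acc ^= acc >> 32
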
#define avalanche(tmp, acc) \
    MOVQ acc, tmp \
    SHRQ $33, tmp \
    XORQ tmp, acc \
    IMULQ prime2, acc \
    MOVQ acc, tmp \
    SHRQ $29, tmp \
    XORQ tmp, acc \
    IMULQ prime3, acc \
    MOVQ acc, tmp \
    SHRQ $32, tmp \
    XORQ tmp, acc

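// round8x64 is the AVX-512 version of the round macro above, applying the same
// formula independently to each of the 8 64-bit lanes of a ZMM register.
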
#define round8x64(input, acc) \
    VPMULLQ prime2ZMM, input, input \
    VPADDQ input, acc, acc \
    VPROLQ $31, acc, acc \
    VPMULLQ prime1ZMM, acc, acc

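// avalanche8x64 is the AVX-512 version of the avalanche macro above, applied
// independently to each of the 8 64-bit lanes of a ZMM register.
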
#define avalanche8x64(tmp, acc) \
    VPSRLQ $33, acc, tmp \
    VPXORQ tmp, acc, acc \
    VPMULLQ prime2ZMM, acc, acc \
    VPSRLQ $29, acc, tmp \
    VPXORQ tmp, acc, acc \
    VPMULLQ prime3ZMM, acc, acc \
    VPSRLQ $32, acc, tmp \
    VPXORQ tmp, acc, acc

// func MultiSum64Uint8(h []uint64, v []uint8) int
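//
// Each output is the XXH64 of a single input byte; per element, the scalar and
// vector paths both compute (a sketch in Go terms):
//
//	acc := PRIME5 + 1
//	acc ^= uint64(v[i]) * PRIME5
//	acc = bits.RotateLeft64(acc, 11) * PRIME1
//	h[i] = avalanche(acc)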
TEXT ·MultiSum64Uint8(SB), NOSPLIT, $0-54
    MOVQ $PRIME1, prime1
    MOVQ $PRIME2, prime2
    MOVQ $PRIME3, prime3
    MOVQ $PRIME5, prime5

    MOVQ h_base+0(FP), AX
    MOVQ h_len+8(FP), CX
    MOVQ v_base+24(FP), BX
    MOVQ v_len+32(FP), DX

    CMPQ CX, DX
    CMOVQGT DX, CX
    MOVQ CX, ret+48(FP)

    XORQ SI, SI
    CMPQ CX, $32
    JB loop
    CMPB ·hasAVX512(SB), $0
    JE loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI

    VMOVDQU64 prime1vec<>(SB), prime1ZMM
    VMOVDQU64 prime2vec<>(SB), prime2ZMM
    VMOVDQU64 prime3vec<>(SB), prime3ZMM
    VMOVDQU64 prime5vec<>(SB), prime5ZMM
    VMOVDQU64 prime5vec1<>(SB), Z6
loop32x64:
    VMOVDQA64 Z6, Z0
    VMOVDQA64 Z6, Z3
    VMOVDQA64 Z6, Z20
    VMOVDQA64 Z6, Z23
    VPMOVZXBQ (BX)(SI*1), Z1
    VPMOVZXBQ 8(BX)(SI*1), Z4
    VPMOVZXBQ 16(BX)(SI*1), Z21
    VPMOVZXBQ 24(BX)(SI*1), Z24

    VPMULLQ prime5ZMM, Z1, Z1
    VPMULLQ prime5ZMM, Z4, Z4
    VPMULLQ prime5ZMM, Z21, Z21
    VPMULLQ prime5ZMM, Z24, Z24
    VPXORQ Z1, Z0, Z0
    VPXORQ Z4, Z3, Z3
    VPXORQ Z21, Z20, Z20
    VPXORQ Z24, Z23, Z23
    VPROLQ $11, Z0, Z0
    VPROLQ $11, Z3, Z3
    VPROLQ $11, Z20, Z20
    VPROLQ $11, Z23, Z23
    VPMULLQ prime1ZMM, Z0, Z0
    VPMULLQ prime1ZMM, Z3, Z3
    VPMULLQ prime1ZMM, Z20, Z20
    VPMULLQ prime1ZMM, Z23, Z23

    avalanche8x64(Z1, Z0)
    avalanche8x64(Z4, Z3)
    avalanche8x64(Z21, Z20)
    avalanche8x64(Z24, Z23)

    VMOVDQU64 Z0, (AX)(SI*8)
    VMOVDQU64 Z3, 64(AX)(SI*8)
    VMOVDQU64 Z20, 128(AX)(SI*8)
    VMOVDQU64 Z23, 192(AX)(SI*8)
    ADDQ $32, SI
    CMPQ SI, DI
    JB loop32x64
    VZEROUPPER
loop:
    CMPQ SI, CX
    JE done
    MOVQ $PRIME5+1, R8
    MOVBQZX (BX)(SI*1), R9

    IMULQ prime5, R9
    XORQ R9, R8
    ROLQ $11, R8
    IMULQ prime1, R8
    avalanche(R9, R8)

    MOVQ R8, (AX)(SI*8)
    INCQ SI
    JMP loop
done:
    RET

// func MultiSum64Uint16(h []uint64, v []uint16) int
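//
// Each output is the XXH64 of a single little-endian 16-bit value, hashed as
// two single-byte rounds; per element (a sketch in Go terms):
//
//	acc := PRIME5 + 2
//	acc ^= uint64(v[i]&0xFF) * PRIME5
//	acc = bits.RotateLeft64(acc, 11) * PRIME1
//	acc ^= uint64(v[i]>>8) * PRIME5
//	acc = bits.RotateLeft64(acc, 11) * PRIME1
//	h[i] = avalanche(acc)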
TEXT ·MultiSum64Uint16(SB), NOSPLIT, $0-54
    MOVQ $PRIME1, prime1
    MOVQ $PRIME2, prime2
    MOVQ $PRIME3, prime3
    MOVQ $PRIME5, prime5

    MOVQ h_base+0(FP), AX
    MOVQ h_len+8(FP), CX
    MOVQ v_base+24(FP), BX
    MOVQ v_len+32(FP), DX

    CMPQ CX, DX
    CMOVQGT DX, CX
    MOVQ CX, ret+48(FP)

    XORQ SI, SI
    CMPQ CX, $16
    JB loop
    CMPB ·hasAVX512(SB), $0
    JE loop

    MOVQ CX, DI
    SHRQ $4, DI
    SHLQ $4, DI

    VMOVDQU64 prime1vec<>(SB), prime1ZMM
    VMOVDQU64 prime2vec<>(SB), prime2ZMM
    VMOVDQU64 prime3vec<>(SB), prime3ZMM
    VMOVDQU64 prime5vec<>(SB), prime5ZMM
    VMOVDQU64 prime5vec2<>(SB), Z6
    VMOVDQU64 lowbytemask<>(SB), Z7
loop16x64:
    VMOVDQA64 Z6, Z0
    VMOVDQA64 Z6, Z3
    VPMOVZXWQ (BX)(SI*2), Z1
    VPMOVZXWQ 16(BX)(SI*2), Z4

    VMOVDQA64 Z1, Z8
    VMOVDQA64 Z4, Z9
    VPSRLQ $8, Z8, Z8
    VPSRLQ $8, Z9, Z9
    VPANDQ Z7, Z1, Z1
    VPANDQ Z7, Z4, Z4

    VPMULLQ prime5ZMM, Z1, Z1
    VPMULLQ prime5ZMM, Z4, Z4
    VPXORQ Z1, Z0, Z0
    VPXORQ Z4, Z3, Z3
    VPROLQ $11, Z0, Z0
    VPROLQ $11, Z3, Z3
    VPMULLQ prime1ZMM, Z0, Z0
    VPMULLQ prime1ZMM, Z3, Z3

    VPMULLQ prime5ZMM, Z8, Z8
    VPMULLQ prime5ZMM, Z9, Z9
    VPXORQ Z8, Z0, Z0
    VPXORQ Z9, Z3, Z3
    VPROLQ $11, Z0, Z0
    VPROLQ $11, Z3, Z3
    VPMULLQ prime1ZMM, Z0, Z0
    VPMULLQ prime1ZMM, Z3, Z3

    avalanche8x64(Z1, Z0)
    avalanche8x64(Z4, Z3)

    VMOVDQU64 Z0, (AX)(SI*8)
    VMOVDQU64 Z3, 64(AX)(SI*8)
    ADDQ $16, SI
    CMPQ SI, DI
    JB loop16x64
    VZEROUPPER
loop:
    CMPQ SI, CX
    JE done
    MOVQ $PRIME5+2, R8
    MOVWQZX (BX)(SI*2), R9

    MOVQ R9, R10
    SHRQ $8, R10
    ANDQ $0xFF, R9

    IMULQ prime5, R9
    XORQ R9, R8
    ROLQ $11, R8
    IMULQ prime1, R8

    IMULQ prime5, R10
    XORQ R10, R8
    ROLQ $11, R8
    IMULQ prime1, R8

    avalanche(R9, R8)

    MOVQ R8, (AX)(SI*8)
    INCQ SI
    JMP loop
done:
    RET

// func MultiSum64Uint32(h []uint64, v []uint32) int
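//
// Each output is the XXH64 of a single little-endian 32-bit value; per element
// (a sketch in Go terms):
//
//	acc := PRIME5 + 4
//	acc ^= uint64(v[i]) * PRIME1
//	acc = bits.RotateLeft64(acc, 23)*PRIME2 + PRIME3
//	h[i] = avalanche(acc)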
TEXT ·MultiSum64Uint32(SB), NOSPLIT, $0-54
    MOVQ $PRIME1, prime1
    MOVQ $PRIME2, prime2
    MOVQ $PRIME3, prime3

    MOVQ h_base+0(FP), AX
    MOVQ h_len+8(FP), CX
    MOVQ v_base+24(FP), BX
    MOVQ v_len+32(FP), DX

    CMPQ CX, DX
    CMOVQGT DX, CX
    MOVQ CX, ret+48(FP)

    XORQ SI, SI
    CMPQ CX, $32
    JB loop
    CMPB ·hasAVX512(SB), $0
    JE loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI

    VMOVDQU64 prime1vec<>(SB), prime1ZMM
    VMOVDQU64 prime2vec<>(SB), prime2ZMM
    VMOVDQU64 prime3vec<>(SB), prime3ZMM
    VMOVDQU64 prime5vec4<>(SB), Z6
loop32x64:
    VMOVDQA64 Z6, Z0
    VMOVDQA64 Z6, Z3
    VMOVDQA64 Z6, Z20
    VMOVDQA64 Z6, Z23
    VPMOVZXDQ (BX)(SI*4), Z1
    VPMOVZXDQ 32(BX)(SI*4), Z4
    VPMOVZXDQ 64(BX)(SI*4), Z21
    VPMOVZXDQ 96(BX)(SI*4), Z24

    VPMULLQ prime1ZMM, Z1, Z1
    VPMULLQ prime1ZMM, Z4, Z4
    VPMULLQ prime1ZMM, Z21, Z21
    VPMULLQ prime1ZMM, Z24, Z24
    VPXORQ Z1, Z0, Z0
    VPXORQ Z4, Z3, Z3
    VPXORQ Z21, Z20, Z20
    VPXORQ Z24, Z23, Z23
    VPROLQ $23, Z0, Z0
    VPROLQ $23, Z3, Z3
    VPROLQ $23, Z20, Z20
    VPROLQ $23, Z23, Z23
    VPMULLQ prime2ZMM, Z0, Z0
    VPMULLQ prime2ZMM, Z3, Z3
    VPMULLQ prime2ZMM, Z20, Z20
    VPMULLQ prime2ZMM, Z23, Z23
    VPADDQ prime3ZMM, Z0, Z0
    VPADDQ prime3ZMM, Z3, Z3
    VPADDQ prime3ZMM, Z20, Z20
    VPADDQ prime3ZMM, Z23, Z23

    avalanche8x64(Z1, Z0)
    avalanche8x64(Z4, Z3)
    avalanche8x64(Z21, Z20)
    avalanche8x64(Z24, Z23)

    VMOVDQU64 Z0, (AX)(SI*8)
    VMOVDQU64 Z3, 64(AX)(SI*8)
    VMOVDQU64 Z20, 128(AX)(SI*8)
    VMOVDQU64 Z23, 192(AX)(SI*8)
    ADDQ $32, SI
    CMPQ SI, DI
    JB loop32x64
    VZEROUPPER
loop:
    CMPQ SI, CX
    JE done
    MOVQ $PRIME5+4, R8
    MOVLQZX (BX)(SI*4), R9

    IMULQ prime1, R9
    XORQ R9, R8
    ROLQ $23, R8
    IMULQ prime2, R8
    ADDQ prime3, R8
    avalanche(R9, R8)

    MOVQ R8, (AX)(SI*8)
    INCQ SI
    JMP loop
done:
    RET

// func MultiSum64Uint64(h []uint64, v []uint64) int
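//
// Each output is the XXH64 of a single little-endian 64-bit value; per element
// (a sketch in Go terms, using the round/avalanche helpers described above):
//
//	acc := PRIME5 + 8
//	acc ^= round(0, v[i])
//	acc = bits.RotateLeft64(acc, 27)*PRIME1 + PRIME4
//	h[i] = avalanche(acc)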
TEXT ·MultiSum64Uint64(SB), NOSPLIT, $0-54
    MOVQ $PRIME1, prime1
    MOVQ $PRIME2, prime2
    MOVQ $PRIME3, prime3
    MOVQ $PRIME4, prime4

    MOVQ h_base+0(FP), AX
    MOVQ h_len+8(FP), CX
    MOVQ v_base+24(FP), BX
    MOVQ v_len+32(FP), DX

    CMPQ CX, DX
    CMOVQGT DX, CX
    MOVQ CX, ret+48(FP)

    XORQ SI, SI
    CMPQ CX, $32
    JB loop
    CMPB ·hasAVX512(SB), $0
    JE loop

    MOVQ CX, DI
    SHRQ $5, DI
    SHLQ $5, DI

    VMOVDQU64 prime1vec<>(SB), prime1ZMM
    VMOVDQU64 prime2vec<>(SB), prime2ZMM
    VMOVDQU64 prime3vec<>(SB), prime3ZMM
    VMOVDQU64 prime4vec<>(SB), prime4ZMM
    VMOVDQU64 prime5vec8<>(SB), Z6
loop32x64:
    VMOVDQA64 Z6, Z0
    VMOVDQA64 Z6, Z3
    VMOVDQA64 Z6, Z20
    VMOVDQA64 Z6, Z23
    VMOVDQU64 (BX)(SI*8), Z1
    VMOVDQU64 64(BX)(SI*8), Z4
    VMOVDQU64 128(BX)(SI*8), Z21
    VMOVDQU64 192(BX)(SI*8), Z24

    VPXORQ Z2, Z2, Z2
    VPXORQ Z5, Z5, Z5
    VPXORQ Z22, Z22, Z22
    VPXORQ Z25, Z25, Z25
    round8x64(Z1, Z2)
    round8x64(Z4, Z5)
    round8x64(Z21, Z22)
    round8x64(Z24, Z25)

    VPXORQ Z2, Z0, Z0
    VPXORQ Z5, Z3, Z3
    VPXORQ Z22, Z20, Z20
    VPXORQ Z25, Z23, Z23
    VPROLQ $27, Z0, Z0
    VPROLQ $27, Z3, Z3
    VPROLQ $27, Z20, Z20
    VPROLQ $27, Z23, Z23
    VPMULLQ prime1ZMM, Z0, Z0
    VPMULLQ prime1ZMM, Z3, Z3
    VPMULLQ prime1ZMM, Z20, Z20
    VPMULLQ prime1ZMM, Z23, Z23
    VPADDQ prime4ZMM, Z0, Z0
    VPADDQ prime4ZMM, Z3, Z3
    VPADDQ prime4ZMM, Z20, Z20
    VPADDQ prime4ZMM, Z23, Z23

    avalanche8x64(Z1, Z0)
    avalanche8x64(Z4, Z3)
    avalanche8x64(Z21, Z20)
    avalanche8x64(Z24, Z23)

    VMOVDQU64 Z0, (AX)(SI*8)
    VMOVDQU64 Z3, 64(AX)(SI*8)
    VMOVDQU64 Z20, 128(AX)(SI*8)
    VMOVDQU64 Z23, 192(AX)(SI*8)
    ADDQ $32, SI
    CMPQ SI, DI
    JB loop32x64
    VZEROUPPER
loop:
    CMPQ SI, CX
    JE done
    MOVQ $PRIME5+8, R8
    MOVQ (BX)(SI*8), R9

    XORQ R10, R10
    round(R9, R10)
    XORQ R10, R8
    ROLQ $27, R8
    IMULQ prime1, R8
    ADDQ prime4, R8
    avalanche(R9, R8)

    MOVQ R8, (AX)(SI*8)
    INCQ SI
    JMP loop
done:
    RET

// func MultiSum64Uint128(h []uint64, v [][16]byte) int
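//
// Each output is the XXH64 of a 16-byte value, processed as two 64-bit halves;
// per element (a sketch in Go terms):
//
//	lo := binary.LittleEndian.Uint64(v[i][:8])
//	hi := binary.LittleEndian.Uint64(v[i][8:])
//	acc := PRIME5 + 16
//	acc ^= round(0, lo)
//	acc = bits.RotateLeft64(acc, 27)*PRIME1 + PRIME4
//	acc ^= round(0, hi)
//	acc = bits.RotateLeft64(acc, 27)*PRIME1 + PRIME4
//	h[i] = avalanche(acc)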
TEXT ·MultiSum64Uint128(SB), NOSPLIT, $0-54
    MOVQ $PRIME1, prime1
    MOVQ $PRIME2, prime2
    MOVQ $PRIME3, prime3
    MOVQ $PRIME4, prime4

    MOVQ h_base+0(FP), AX
    MOVQ h_len+8(FP), CX
    MOVQ v_base+24(FP), BX
    MOVQ v_len+32(FP), DX

    CMPQ CX, DX
    CMOVQGT DX, CX
    MOVQ CX, ret+48(FP)

    XORQ SI, SI
    CMPQ CX, $16
    JB loop
    CMPB ·hasAVX512(SB), $0
    JE loop

    MOVQ CX, DI
    SHRQ $4, DI
    SHLQ $4, DI

    VMOVDQU64 prime1vec<>(SB), prime1ZMM
    VMOVDQU64 prime2vec<>(SB), prime2ZMM
    VMOVDQU64 prime3vec<>(SB), prime3ZMM
    VMOVDQU64 prime4vec<>(SB), prime4ZMM
    VMOVDQU64 prime5vec16<>(SB), Z6
    VMOVDQU64 vpermi2qeven<>(SB), Z7
    VMOVDQU64 vpermi2qodd<>(SB), Z8
loop16x64:
    // This algorithm is slightly different from the other ones, because it is
    // the only case where the input values are larger than the output (128 bits
    // vs 64 bits).
    //
    // Computing the XXH64 of 128 bit values requires doing two passes over the
    // lower and upper 64 bits. The lower and upper quadwords are split into
    // separate vectors: the first pass is applied to the vector holding the
    // lower bits of 8 input values, then the second pass is applied to the
    // vector holding the upper bits.
    //
    // Following the model used in the other functions, we unroll the work of
    // two consecutive groups of 8 values per loop iteration in order to
    // maximize utilization of CPU resources.
    CMPQ SI, DI
    JE loop
    VMOVDQA64 Z6, Z0
    VMOVDQA64 Z6, Z20
    VMOVDQU64 (BX), Z1
    VMOVDQU64 64(BX), Z9
    VMOVDQU64 128(BX), Z21
    VMOVDQU64 192(BX), Z29

    VMOVDQA64 Z7, Z2
    VMOVDQA64 Z8, Z3
    VMOVDQA64 Z7, Z22
    VMOVDQA64 Z8, Z23

    VPERMI2Q Z9, Z1, Z2
    VPERMI2Q Z9, Z1, Z3
    VPERMI2Q Z29, Z21, Z22
    VPERMI2Q Z29, Z21, Z23

    // Compute the rounds on inputs.
    VPXORQ Z4, Z4, Z4
    VPXORQ Z5, Z5, Z5
    VPXORQ Z24, Z24, Z24
    VPXORQ Z25, Z25, Z25
    round8x64(Z2, Z4)
    round8x64(Z3, Z5)
    round8x64(Z22, Z24)
    round8x64(Z23, Z25)

    // Lower 64 bits.
    VPXORQ Z4, Z0, Z0
    VPXORQ Z24, Z20, Z20
    VPROLQ $27, Z0, Z0
    VPROLQ $27, Z20, Z20
    VPMULLQ prime1ZMM, Z0, Z0
    VPMULLQ prime1ZMM, Z20, Z20
    VPADDQ prime4ZMM, Z0, Z0
    VPADDQ prime4ZMM, Z20, Z20

    // Upper 64 bits.
    VPXORQ Z5, Z0, Z0
    VPXORQ Z25, Z20, Z20
    VPROLQ $27, Z0, Z0
    VPROLQ $27, Z20, Z20
    VPMULLQ prime1ZMM, Z0, Z0
    VPMULLQ prime1ZMM, Z20, Z20
    VPADDQ prime4ZMM, Z0, Z0
    VPADDQ prime4ZMM, Z20, Z20

    avalanche8x64(Z1, Z0)
    avalanche8x64(Z21, Z20)
    VMOVDQU64 Z0, (AX)(SI*8)
    VMOVDQU64 Z20, 64(AX)(SI*8)
    ADDQ $256, BX
    ADDQ $16, SI
    JMP loop16x64
    VZEROUPPER
loop:
    CMPQ SI, CX
    JE done
    MOVQ $PRIME5+16, R8
    MOVQ (BX), DX
    MOVQ 8(BX), DI

    XORQ R9, R9
    XORQ R10, R10
    round(DX, R9)
    round(DI, R10)

    XORQ R9, R8
    ROLQ $27, R8
    IMULQ prime1, R8
    ADDQ prime4, R8

    XORQ R10, R8
    ROLQ $27, R8
    IMULQ prime1, R8
    ADDQ prime4, R8

    avalanche(R9, R8)
    MOVQ R8, (AX)(SI*8)
    ADDQ $16, BX
    INCQ SI
    JMP loop
done:
    RET