github.com/bytedance/gopkg@v0.0.0-20240514070511-01b2cbcf35e1/util/xxhash3/avx2_amd64.s (about)

     1  // Copyright 2021 ByteDance Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Code generated by command: go run gen.go -avx2 -out ./avx2.s. DO NOT EDIT.
    16  
    17  #include "textflag.h"
    18  
    19  DATA prime_avx<>+0(SB)/4, $0x9e3779b1
    20  GLOBL prime_avx<>(SB), RODATA|NOPTR, $4
    21  
    22  // func accumAVX2(acc *[8]uint64, xinput *byte, xsecret *byte, len uint64)
    23  // Requires: AVX, AVX2
    24  TEXT ·accumAVX2(SB), NOSPLIT, $0-32
    25  	MOVQ         acc+0(FP), AX
    26  	MOVQ         xinput+8(FP), CX
    27  	MOVQ         xsecret+16(FP), DX
    28  	MOVQ         xsecret+16(FP), BX
    29  	MOVQ         len+24(FP), SI
    30  	VMOVDQU      (AX), Y1
    31  	VMOVDQU      32(AX), Y2
    32  	VPBROADCASTQ prime_avx<>+0(SB), Y0
    33  
    34  accumBlock:
    35  	CMPQ     SI, $0x00000400
    36  	JLE      accumStripe
    37  	VMOVDQU  (CX), Y3
    38  	VMOVDQU  (DX), Y4
    39  	VMOVDQU  32(CX), Y6
    40  	VMOVDQU  32(DX), Y7
    41  	VMOVDQU  64(CX), Y8
    42  	VMOVDQU  8(DX), Y9
    43  	VMOVDQU  96(CX), Y10
    44  	VMOVDQU  40(DX), Y11
    45  	VPXOR    Y3, Y4, Y4
    46  	VPSRLQ   $0x20, Y4, Y5
    47  	VPSHUFD  $0x4e, Y3, Y3
    48  	VPMULUDQ Y4, Y5, Y5
    49  	VPADDQ   Y1, Y3, Y1
    50  	VPADDQ   Y1, Y5, Y1
    51  	VPXOR    Y6, Y7, Y7
    52  	VPSRLQ   $0x20, Y7, Y5
    53  	VPSHUFD  $0x4e, Y6, Y6
    54  	VPMULUDQ Y7, Y5, Y5
    55  	VPADDQ   Y2, Y6, Y2
    56  	VPADDQ   Y2, Y5, Y2
    57  	VPXOR    Y8, Y9, Y9
    58  	VPSRLQ   $0x20, Y9, Y5
    59  	VPSHUFD  $0x4e, Y8, Y8
    60  	VPMULUDQ Y9, Y5, Y5
    61  	VPADDQ   Y1, Y8, Y1
    62  	VPADDQ   Y1, Y5, Y1
    63  	VPXOR    Y10, Y11, Y11
    64  	VPSRLQ   $0x20, Y11, Y5
    65  	VPSHUFD  $0x4e, Y10, Y10
    66  	VPMULUDQ Y11, Y5, Y5
    67  	VPADDQ   Y2, Y10, Y2
    68  	VPADDQ   Y2, Y5, Y2
    69  	VMOVDQU  128(CX), Y3
    70  	VMOVDQU  16(DX), Y4
    71  	VMOVDQU  160(CX), Y6
    72  	VMOVDQU  48(DX), Y7
    73  	VMOVDQU  192(CX), Y8
    74  	VMOVDQU  24(DX), Y9
    75  	VMOVDQU  224(CX), Y10
    76  	VMOVDQU  56(DX), Y11
    77  	VPXOR    Y3, Y4, Y4
    78  	VPSRLQ   $0x20, Y4, Y5
    79  	VPSHUFD  $0x4e, Y3, Y3
    80  	VPMULUDQ Y4, Y5, Y5
    81  	VPADDQ   Y1, Y3, Y1
    82  	VPADDQ   Y1, Y5, Y1
    83  	VPXOR    Y6, Y7, Y7
    84  	VPSRLQ   $0x20, Y7, Y5
    85  	VPSHUFD  $0x4e, Y6, Y6
    86  	VPMULUDQ Y7, Y5, Y5
    87  	VPADDQ   Y2, Y6, Y2
    88  	VPADDQ   Y2, Y5, Y2
    89  	VPXOR    Y8, Y9, Y9
    90  	VPSRLQ   $0x20, Y9, Y5
    91  	VPSHUFD  $0x4e, Y8, Y8
    92  	VPMULUDQ Y9, Y5, Y5
    93  	VPADDQ   Y1, Y8, Y1
    94  	VPADDQ   Y1, Y5, Y1
    95  	VPXOR    Y10, Y11, Y11
    96  	VPSRLQ   $0x20, Y11, Y5
    97  	VPSHUFD  $0x4e, Y10, Y10
    98  	VPMULUDQ Y11, Y5, Y5
    99  	VPADDQ   Y2, Y10, Y2
   100  	VPADDQ   Y2, Y5, Y2
   101  	VMOVDQU  256(CX), Y3
   102  	VMOVDQU  32(DX), Y4
   103  	VMOVDQU  288(CX), Y6
   104  	VMOVDQU  64(DX), Y7
   105  	VMOVDQU  320(CX), Y8
   106  	VMOVDQU  40(DX), Y9
   107  	VMOVDQU  352(CX), Y10
   108  	VMOVDQU  72(DX), Y11
   109  	VPXOR    Y3, Y4, Y4
   110  	VPSRLQ   $0x20, Y4, Y5
   111  	VPSHUFD  $0x4e, Y3, Y3
   112  	VPMULUDQ Y4, Y5, Y5
   113  	VPADDQ   Y1, Y3, Y1
   114  	VPADDQ   Y1, Y5, Y1
   115  	VPXOR    Y6, Y7, Y7
   116  	VPSRLQ   $0x20, Y7, Y5
   117  	VPSHUFD  $0x4e, Y6, Y6
   118  	VPMULUDQ Y7, Y5, Y5
   119  	VPADDQ   Y2, Y6, Y2
   120  	VPADDQ   Y2, Y5, Y2
   121  	VPXOR    Y8, Y9, Y9
   122  	VPSRLQ   $0x20, Y9, Y5
   123  	VPSHUFD  $0x4e, Y8, Y8
   124  	VPMULUDQ Y9, Y5, Y5
   125  	VPADDQ   Y1, Y8, Y1
   126  	VPADDQ   Y1, Y5, Y1
   127  	VPXOR    Y10, Y11, Y11
   128  	VPSRLQ   $0x20, Y11, Y5
   129  	VPSHUFD  $0x4e, Y10, Y10
   130  	VPMULUDQ Y11, Y5, Y5
   131  	VPADDQ   Y2, Y10, Y2
   132  	VPADDQ   Y2, Y5, Y2
   133  	VMOVDQU  384(CX), Y3
   134  	VMOVDQU  48(DX), Y4
   135  	VMOVDQU  416(CX), Y6
   136  	VMOVDQU  80(DX), Y7
   137  	VMOVDQU  448(CX), Y8
   138  	VMOVDQU  56(DX), Y9
   139  	VMOVDQU  480(CX), Y10
   140  	VMOVDQU  88(DX), Y11
   141  	VPXOR    Y3, Y4, Y4
   142  	VPSRLQ   $0x20, Y4, Y5
   143  	VPSHUFD  $0x4e, Y3, Y3
   144  	VPMULUDQ Y4, Y5, Y5
   145  	VPADDQ   Y1, Y3, Y1
   146  	VPADDQ   Y1, Y5, Y1
   147  	VPXOR    Y6, Y7, Y7
   148  	VPSRLQ   $0x20, Y7, Y5
   149  	VPSHUFD  $0x4e, Y6, Y6
   150  	VPMULUDQ Y7, Y5, Y5
   151  	VPADDQ   Y2, Y6, Y2
   152  	VPADDQ   Y2, Y5, Y2
   153  	VPXOR    Y8, Y9, Y9
   154  	VPSRLQ   $0x20, Y9, Y5
   155  	VPSHUFD  $0x4e, Y8, Y8
   156  	VPMULUDQ Y9, Y5, Y5
   157  	VPADDQ   Y1, Y8, Y1
   158  	VPADDQ   Y1, Y5, Y1
   159  	VPXOR    Y10, Y11, Y11
   160  	VPSRLQ   $0x20, Y11, Y5
   161  	VPSHUFD  $0x4e, Y10, Y10
   162  	VPMULUDQ Y11, Y5, Y5
   163  	VPADDQ   Y2, Y10, Y2
   164  	VPADDQ   Y2, Y5, Y2
   165  	VMOVDQU  512(CX), Y3
   166  	VMOVDQU  64(DX), Y4
   167  	VMOVDQU  544(CX), Y6
   168  	VMOVDQU  96(DX), Y7
   169  	VMOVDQU  576(CX), Y8
   170  	VMOVDQU  72(DX), Y9
   171  	VMOVDQU  608(CX), Y10
   172  	VMOVDQU  104(DX), Y11
   173  	VPXOR    Y3, Y4, Y4
   174  	VPSRLQ   $0x20, Y4, Y5
   175  	VPSHUFD  $0x4e, Y3, Y3
   176  	VPMULUDQ Y4, Y5, Y5
   177  	VPADDQ   Y1, Y3, Y1
   178  	VPADDQ   Y1, Y5, Y1
   179  	VPXOR    Y6, Y7, Y7
   180  	VPSRLQ   $0x20, Y7, Y5
   181  	VPSHUFD  $0x4e, Y6, Y6
   182  	VPMULUDQ Y7, Y5, Y5
   183  	VPADDQ   Y2, Y6, Y2
   184  	VPADDQ   Y2, Y5, Y2
   185  	VPXOR    Y8, Y9, Y9
   186  	VPSRLQ   $0x20, Y9, Y5
   187  	VPSHUFD  $0x4e, Y8, Y8
   188  	VPMULUDQ Y9, Y5, Y5
   189  	VPADDQ   Y1, Y8, Y1
   190  	VPADDQ   Y1, Y5, Y1
   191  	VPXOR    Y10, Y11, Y11
   192  	VPSRLQ   $0x20, Y11, Y5
   193  	VPSHUFD  $0x4e, Y10, Y10
   194  	VPMULUDQ Y11, Y5, Y5
   195  	VPADDQ   Y2, Y10, Y2
   196  	VPADDQ   Y2, Y5, Y2
   197  	VMOVDQU  640(CX), Y3
   198  	VMOVDQU  80(DX), Y4
   199  	VMOVDQU  672(CX), Y6
   200  	VMOVDQU  112(DX), Y7
   201  	VMOVDQU  704(CX), Y8
   202  	VMOVDQU  88(DX), Y9
   203  	VMOVDQU  736(CX), Y10
   204  	VMOVDQU  120(DX), Y11
   205  	VPXOR    Y3, Y4, Y4
   206  	VPSRLQ   $0x20, Y4, Y5
   207  	VPSHUFD  $0x4e, Y3, Y3
   208  	VPMULUDQ Y4, Y5, Y5
   209  	VPADDQ   Y1, Y3, Y1
   210  	VPADDQ   Y1, Y5, Y1
   211  	VPXOR    Y6, Y7, Y7
   212  	VPSRLQ   $0x20, Y7, Y5
   213  	VPSHUFD  $0x4e, Y6, Y6
   214  	VPMULUDQ Y7, Y5, Y5
   215  	VPADDQ   Y2, Y6, Y2
   216  	VPADDQ   Y2, Y5, Y2
   217  	VPXOR    Y8, Y9, Y9
   218  	VPSRLQ   $0x20, Y9, Y5
   219  	VPSHUFD  $0x4e, Y8, Y8
   220  	VPMULUDQ Y9, Y5, Y5
   221  	VPADDQ   Y1, Y8, Y1
   222  	VPADDQ   Y1, Y5, Y1
   223  	VPXOR    Y10, Y11, Y11
   224  	VPSRLQ   $0x20, Y11, Y5
   225  	VPSHUFD  $0x4e, Y10, Y10
   226  	VPMULUDQ Y11, Y5, Y5
   227  	VPADDQ   Y2, Y10, Y2
   228  	VPADDQ   Y2, Y5, Y2
   229  	VMOVDQU  768(CX), Y3
   230  	VMOVDQU  96(DX), Y4
   231  	VMOVDQU  800(CX), Y6
   232  	VMOVDQU  128(DX), Y7
   233  	VMOVDQU  832(CX), Y8
   234  	VMOVDQU  104(DX), Y9
   235  	VMOVDQU  864(CX), Y10
   236  	VMOVDQU  136(DX), Y11
   237  	VPXOR    Y3, Y4, Y4
   238  	VPSRLQ   $0x20, Y4, Y5
   239  	VPSHUFD  $0x4e, Y3, Y3
   240  	VPMULUDQ Y4, Y5, Y5
   241  	VPADDQ   Y1, Y3, Y1
   242  	VPADDQ   Y1, Y5, Y1
   243  	VPXOR    Y6, Y7, Y7
   244  	VPSRLQ   $0x20, Y7, Y5
   245  	VPSHUFD  $0x4e, Y6, Y6
   246  	VPMULUDQ Y7, Y5, Y5
   247  	VPADDQ   Y2, Y6, Y2
   248  	VPADDQ   Y2, Y5, Y2
   249  	VPXOR    Y8, Y9, Y9
   250  	VPSRLQ   $0x20, Y9, Y5
   251  	VPSHUFD  $0x4e, Y8, Y8
   252  	VPMULUDQ Y9, Y5, Y5
   253  	VPADDQ   Y1, Y8, Y1
   254  	VPADDQ   Y1, Y5, Y1
   255  	VPXOR    Y10, Y11, Y11
   256  	VPSRLQ   $0x20, Y11, Y5
   257  	VPSHUFD  $0x4e, Y10, Y10
   258  	VPMULUDQ Y11, Y5, Y5
   259  	VPADDQ   Y2, Y10, Y2
   260  	VPADDQ   Y2, Y5, Y2
   261  	VMOVDQU  896(CX), Y3
   262  	VMOVDQU  112(DX), Y4
   263  	VMOVDQU  928(CX), Y6
   264  	VMOVDQU  144(DX), Y7
   265  	VMOVDQU  960(CX), Y8
   266  	VMOVDQU  120(DX), Y9
   267  	VMOVDQU  992(CX), Y10
   268  	VMOVDQU  152(DX), Y11
   269  	VPXOR    Y3, Y4, Y4
   270  	VPSRLQ   $0x20, Y4, Y5
   271  	VPSHUFD  $0x4e, Y3, Y3
   272  	VPMULUDQ Y4, Y5, Y5
   273  	VPADDQ   Y1, Y3, Y1
   274  	VPADDQ   Y1, Y5, Y1
   275  	VPXOR    Y6, Y7, Y7
   276  	VPSRLQ   $0x20, Y7, Y5
   277  	VPSHUFD  $0x4e, Y6, Y6
   278  	VPMULUDQ Y7, Y5, Y5
   279  	VPADDQ   Y2, Y6, Y2
   280  	VPADDQ   Y2, Y5, Y2
   281  	VPXOR    Y8, Y9, Y9
   282  	VPSRLQ   $0x20, Y9, Y5
   283  	VPSHUFD  $0x4e, Y8, Y8
   284  	VPMULUDQ Y9, Y5, Y5
   285  	VPADDQ   Y1, Y8, Y1
   286  	VPADDQ   Y1, Y5, Y1
   287  	VPXOR    Y10, Y11, Y11
   288  	VPSRLQ   $0x20, Y11, Y5
   289  	VPSHUFD  $0x4e, Y10, Y10
   290  	VPMULUDQ Y11, Y5, Y5
   291  	VPADDQ   Y2, Y10, Y2
   292  	VPADDQ   Y2, Y5, Y2
   293  	ADDQ     $0x00000400, CX
   294  	SUBQ     $0x00000400, SI
   295  	VPSRLQ   $0x2f, Y1, Y3
   296  	VPXOR    Y1, Y3, Y3
   297  	VPXOR    128(DX), Y3, Y3
   298  	VPMULUDQ Y0, Y3, Y4
   299  	VPSRLQ   $0x20, Y3, Y3
   300  	VPMULUDQ Y0, Y3, Y3
   301  	VPSLLQ   $0x20, Y3, Y3
   302  	VPADDQ   Y4, Y3, Y1
   303  	VPSRLQ   $0x2f, Y2, Y3
   304  	VPXOR    Y2, Y3, Y3
   305  	VPXOR    160(DX), Y3, Y3
   306  	VPMULUDQ Y0, Y3, Y4
   307  	VPSRLQ   $0x20, Y3, Y3
   308  	VPMULUDQ Y0, Y3, Y3
   309  	VPSLLQ   $0x20, Y3, Y3
   310  	VPADDQ   Y4, Y3, Y2
   311  	JMP      accumBlock
   312  
   313  accumStripe:
   314  	CMPQ     SI, $0x40
   315  	JLE      accumLastStripe
   316  	VMOVDQU  (CX), Y0
   317  	VMOVDQU  (BX), Y3
   318  	VMOVDQU  32(CX), Y5
   319  	VMOVDQU  32(BX), Y6
   320  	VPXOR    Y0, Y3, Y3
   321  	VPSRLQ   $0x20, Y3, Y4
   322  	VPSHUFD  $0x4e, Y0, Y0
   323  	VPMULUDQ Y3, Y4, Y4
   324  	VPADDQ   Y1, Y0, Y1
   325  	VPADDQ   Y1, Y4, Y1
   326  	VPXOR    Y5, Y6, Y6
   327  	VPSRLQ   $0x20, Y6, Y4
   328  	VPMULUDQ Y6, Y4, Y4
   329  	VPSHUFD  $0x4e, Y5, Y5
   330  	VPADDQ   Y2, Y5, Y2
   331  	VPADDQ   Y2, Y4, Y2
   332  	ADDQ     $0x00000040, CX
   333  	SUBQ     $0x00000040, SI
   334  	ADDQ     $0x00000008, BX
   335  	JMP      accumStripe
   336  
   337  accumLastStripe:
   338  	CMPQ     SI, $0x00
   339  	JE       return
   340  	SUBQ     $0x40, CX
   341  	ADDQ     SI, CX
   342  	VMOVDQU  (CX), Y0
   343  	VMOVDQU  121(DX), Y3
   344  	VMOVDQU  32(CX), Y5
   345  	VMOVDQU  153(DX), Y6
   346  	VPXOR    Y0, Y3, Y3
   347  	VPSRLQ   $0x20, Y3, Y4
   348  	VPSHUFD  $0x4e, Y0, Y0
   349  	VPMULUDQ Y3, Y4, Y4
   350  	VPADDQ   Y1, Y0, Y1
   351  	VPADDQ   Y1, Y4, Y1
   352  	VPXOR    Y5, Y6, Y6
   353  	VPSRLQ   $0x20, Y6, Y4
   354  	VPMULUDQ Y6, Y4, Y4
   355  	VPSHUFD  $0x4e, Y5, Y5
   356  	VPADDQ   Y2, Y5, Y2
   357  	VPADDQ   Y2, Y4, Y2
   358  
   359  return:
   360  	VMOVDQU Y1, (AX)
   361  	VMOVDQU Y2, 32(AX)
   362  	RET