github.com/songzhibin97/go-baseutils@v0.0.2-0.20240302024150-487d8ce9c082/sys/xxhash3/avx2_amd64.s (about)

     1  
     2  
     3  // Code generated by command: go run gen.go -avx2 -out ./avx2.s. DO NOT EDIT.
     4  
     5  #include "textflag.h"
     6  
     7  DATA prime_avx<>+0(SB)/4, $0x9e3779b1
     8  GLOBL prime_avx<>(SB), RODATA|NOPTR, $4
     9  
    10  // func accumAVX2(acc *[8]uint64, xinput *byte, xsecret *byte, len uint64)
    11  // Requires: AVX, AVX2
    12  TEXT ·accumAVX2(SB), NOSPLIT, $0-32
    13  	MOVQ         acc+0(FP), AX
    14  	MOVQ         xinput+8(FP), CX
    15  	MOVQ         xsecret+16(FP), DX
    16  	MOVQ         xsecret+16(FP), BX
    17  	MOVQ         len+24(FP), SI
    18  	VMOVDQU      (AX), Y1
    19  	VMOVDQU      32(AX), Y2
    20  	VPBROADCASTQ prime_avx<>+0(SB), Y0
    21  
    22  accumBlock:
    23  	CMPQ     SI, $0x00000400
    24  	JLE      accumStripe
    25  	VMOVDQU  (CX), Y3
    26  	VMOVDQU  (DX), Y4
    27  	VMOVDQU  32(CX), Y6
    28  	VMOVDQU  32(DX), Y7
    29  	VMOVDQU  64(CX), Y8
    30  	VMOVDQU  8(DX), Y9
    31  	VMOVDQU  96(CX), Y10
    32  	VMOVDQU  40(DX), Y11
    33  	VPXOR    Y3, Y4, Y4
    34  	VPSRLQ   $0x20, Y4, Y5
    35  	VPSHUFD  $0x4e, Y3, Y3
    36  	VPMULUDQ Y4, Y5, Y5
    37  	VPADDQ   Y1, Y3, Y1
    38  	VPADDQ   Y1, Y5, Y1
    39  	VPXOR    Y6, Y7, Y7
    40  	VPSRLQ   $0x20, Y7, Y5
    41  	VPSHUFD  $0x4e, Y6, Y6
    42  	VPMULUDQ Y7, Y5, Y5
    43  	VPADDQ   Y2, Y6, Y2
    44  	VPADDQ   Y2, Y5, Y2
    45  	VPXOR    Y8, Y9, Y9
    46  	VPSRLQ   $0x20, Y9, Y5
    47  	VPSHUFD  $0x4e, Y8, Y8
    48  	VPMULUDQ Y9, Y5, Y5
    49  	VPADDQ   Y1, Y8, Y1
    50  	VPADDQ   Y1, Y5, Y1
    51  	VPXOR    Y10, Y11, Y11
    52  	VPSRLQ   $0x20, Y11, Y5
    53  	VPSHUFD  $0x4e, Y10, Y10
    54  	VPMULUDQ Y11, Y5, Y5
    55  	VPADDQ   Y2, Y10, Y2
    56  	VPADDQ   Y2, Y5, Y2
    57  	VMOVDQU  128(CX), Y3
    58  	VMOVDQU  16(DX), Y4
    59  	VMOVDQU  160(CX), Y6
    60  	VMOVDQU  48(DX), Y7
    61  	VMOVDQU  192(CX), Y8
    62  	VMOVDQU  24(DX), Y9
    63  	VMOVDQU  224(CX), Y10
    64  	VMOVDQU  56(DX), Y11
    65  	VPXOR    Y3, Y4, Y4
    66  	VPSRLQ   $0x20, Y4, Y5
    67  	VPSHUFD  $0x4e, Y3, Y3
    68  	VPMULUDQ Y4, Y5, Y5
    69  	VPADDQ   Y1, Y3, Y1
    70  	VPADDQ   Y1, Y5, Y1
    71  	VPXOR    Y6, Y7, Y7
    72  	VPSRLQ   $0x20, Y7, Y5
    73  	VPSHUFD  $0x4e, Y6, Y6
    74  	VPMULUDQ Y7, Y5, Y5
    75  	VPADDQ   Y2, Y6, Y2
    76  	VPADDQ   Y2, Y5, Y2
    77  	VPXOR    Y8, Y9, Y9
    78  	VPSRLQ   $0x20, Y9, Y5
    79  	VPSHUFD  $0x4e, Y8, Y8
    80  	VPMULUDQ Y9, Y5, Y5
    81  	VPADDQ   Y1, Y8, Y1
    82  	VPADDQ   Y1, Y5, Y1
    83  	VPXOR    Y10, Y11, Y11
    84  	VPSRLQ   $0x20, Y11, Y5
    85  	VPSHUFD  $0x4e, Y10, Y10
    86  	VPMULUDQ Y11, Y5, Y5
    87  	VPADDQ   Y2, Y10, Y2
    88  	VPADDQ   Y2, Y5, Y2
    89  	VMOVDQU  256(CX), Y3
    90  	VMOVDQU  32(DX), Y4
    91  	VMOVDQU  288(CX), Y6
    92  	VMOVDQU  64(DX), Y7
    93  	VMOVDQU  320(CX), Y8
    94  	VMOVDQU  40(DX), Y9
    95  	VMOVDQU  352(CX), Y10
    96  	VMOVDQU  72(DX), Y11
    97  	VPXOR    Y3, Y4, Y4
    98  	VPSRLQ   $0x20, Y4, Y5
    99  	VPSHUFD  $0x4e, Y3, Y3
   100  	VPMULUDQ Y4, Y5, Y5
   101  	VPADDQ   Y1, Y3, Y1
   102  	VPADDQ   Y1, Y5, Y1
   103  	VPXOR    Y6, Y7, Y7
   104  	VPSRLQ   $0x20, Y7, Y5
   105  	VPSHUFD  $0x4e, Y6, Y6
   106  	VPMULUDQ Y7, Y5, Y5
   107  	VPADDQ   Y2, Y6, Y2
   108  	VPADDQ   Y2, Y5, Y2
   109  	VPXOR    Y8, Y9, Y9
   110  	VPSRLQ   $0x20, Y9, Y5
   111  	VPSHUFD  $0x4e, Y8, Y8
   112  	VPMULUDQ Y9, Y5, Y5
   113  	VPADDQ   Y1, Y8, Y1
   114  	VPADDQ   Y1, Y5, Y1
   115  	VPXOR    Y10, Y11, Y11
   116  	VPSRLQ   $0x20, Y11, Y5
   117  	VPSHUFD  $0x4e, Y10, Y10
   118  	VPMULUDQ Y11, Y5, Y5
   119  	VPADDQ   Y2, Y10, Y2
   120  	VPADDQ   Y2, Y5, Y2
   121  	VMOVDQU  384(CX), Y3
   122  	VMOVDQU  48(DX), Y4
   123  	VMOVDQU  416(CX), Y6
   124  	VMOVDQU  80(DX), Y7
   125  	VMOVDQU  448(CX), Y8
   126  	VMOVDQU  56(DX), Y9
   127  	VMOVDQU  480(CX), Y10
   128  	VMOVDQU  88(DX), Y11
   129  	VPXOR    Y3, Y4, Y4
   130  	VPSRLQ   $0x20, Y4, Y5
   131  	VPSHUFD  $0x4e, Y3, Y3
   132  	VPMULUDQ Y4, Y5, Y5
   133  	VPADDQ   Y1, Y3, Y1
   134  	VPADDQ   Y1, Y5, Y1
   135  	VPXOR    Y6, Y7, Y7
   136  	VPSRLQ   $0x20, Y7, Y5
   137  	VPSHUFD  $0x4e, Y6, Y6
   138  	VPMULUDQ Y7, Y5, Y5
   139  	VPADDQ   Y2, Y6, Y2
   140  	VPADDQ   Y2, Y5, Y2
   141  	VPXOR    Y8, Y9, Y9
   142  	VPSRLQ   $0x20, Y9, Y5
   143  	VPSHUFD  $0x4e, Y8, Y8
   144  	VPMULUDQ Y9, Y5, Y5
   145  	VPADDQ   Y1, Y8, Y1
   146  	VPADDQ   Y1, Y5, Y1
   147  	VPXOR    Y10, Y11, Y11
   148  	VPSRLQ   $0x20, Y11, Y5
   149  	VPSHUFD  $0x4e, Y10, Y10
   150  	VPMULUDQ Y11, Y5, Y5
   151  	VPADDQ   Y2, Y10, Y2
   152  	VPADDQ   Y2, Y5, Y2
   153  	VMOVDQU  512(CX), Y3
   154  	VMOVDQU  64(DX), Y4
   155  	VMOVDQU  544(CX), Y6
   156  	VMOVDQU  96(DX), Y7
   157  	VMOVDQU  576(CX), Y8
   158  	VMOVDQU  72(DX), Y9
   159  	VMOVDQU  608(CX), Y10
   160  	VMOVDQU  104(DX), Y11
   161  	VPXOR    Y3, Y4, Y4
   162  	VPSRLQ   $0x20, Y4, Y5
   163  	VPSHUFD  $0x4e, Y3, Y3
   164  	VPMULUDQ Y4, Y5, Y5
   165  	VPADDQ   Y1, Y3, Y1
   166  	VPADDQ   Y1, Y5, Y1
   167  	VPXOR    Y6, Y7, Y7
   168  	VPSRLQ   $0x20, Y7, Y5
   169  	VPSHUFD  $0x4e, Y6, Y6
   170  	VPMULUDQ Y7, Y5, Y5
   171  	VPADDQ   Y2, Y6, Y2
   172  	VPADDQ   Y2, Y5, Y2
   173  	VPXOR    Y8, Y9, Y9
   174  	VPSRLQ   $0x20, Y9, Y5
   175  	VPSHUFD  $0x4e, Y8, Y8
   176  	VPMULUDQ Y9, Y5, Y5
   177  	VPADDQ   Y1, Y8, Y1
   178  	VPADDQ   Y1, Y5, Y1
   179  	VPXOR    Y10, Y11, Y11
   180  	VPSRLQ   $0x20, Y11, Y5
   181  	VPSHUFD  $0x4e, Y10, Y10
   182  	VPMULUDQ Y11, Y5, Y5
   183  	VPADDQ   Y2, Y10, Y2
   184  	VPADDQ   Y2, Y5, Y2
   185  	VMOVDQU  640(CX), Y3
   186  	VMOVDQU  80(DX), Y4
   187  	VMOVDQU  672(CX), Y6
   188  	VMOVDQU  112(DX), Y7
   189  	VMOVDQU  704(CX), Y8
   190  	VMOVDQU  88(DX), Y9
   191  	VMOVDQU  736(CX), Y10
   192  	VMOVDQU  120(DX), Y11
   193  	VPXOR    Y3, Y4, Y4
   194  	VPSRLQ   $0x20, Y4, Y5
   195  	VPSHUFD  $0x4e, Y3, Y3
   196  	VPMULUDQ Y4, Y5, Y5
   197  	VPADDQ   Y1, Y3, Y1
   198  	VPADDQ   Y1, Y5, Y1
   199  	VPXOR    Y6, Y7, Y7
   200  	VPSRLQ   $0x20, Y7, Y5
   201  	VPSHUFD  $0x4e, Y6, Y6
   202  	VPMULUDQ Y7, Y5, Y5
   203  	VPADDQ   Y2, Y6, Y2
   204  	VPADDQ   Y2, Y5, Y2
   205  	VPXOR    Y8, Y9, Y9
   206  	VPSRLQ   $0x20, Y9, Y5
   207  	VPSHUFD  $0x4e, Y8, Y8
   208  	VPMULUDQ Y9, Y5, Y5
   209  	VPADDQ   Y1, Y8, Y1
   210  	VPADDQ   Y1, Y5, Y1
   211  	VPXOR    Y10, Y11, Y11
   212  	VPSRLQ   $0x20, Y11, Y5
   213  	VPSHUFD  $0x4e, Y10, Y10
   214  	VPMULUDQ Y11, Y5, Y5
   215  	VPADDQ   Y2, Y10, Y2
   216  	VPADDQ   Y2, Y5, Y2
   217  	VMOVDQU  768(CX), Y3
   218  	VMOVDQU  96(DX), Y4
   219  	VMOVDQU  800(CX), Y6
   220  	VMOVDQU  128(DX), Y7
   221  	VMOVDQU  832(CX), Y8
   222  	VMOVDQU  104(DX), Y9
   223  	VMOVDQU  864(CX), Y10
   224  	VMOVDQU  136(DX), Y11
   225  	VPXOR    Y3, Y4, Y4
   226  	VPSRLQ   $0x20, Y4, Y5
   227  	VPSHUFD  $0x4e, Y3, Y3
   228  	VPMULUDQ Y4, Y5, Y5
   229  	VPADDQ   Y1, Y3, Y1
   230  	VPADDQ   Y1, Y5, Y1
   231  	VPXOR    Y6, Y7, Y7
   232  	VPSRLQ   $0x20, Y7, Y5
   233  	VPSHUFD  $0x4e, Y6, Y6
   234  	VPMULUDQ Y7, Y5, Y5
   235  	VPADDQ   Y2, Y6, Y2
   236  	VPADDQ   Y2, Y5, Y2
   237  	VPXOR    Y8, Y9, Y9
   238  	VPSRLQ   $0x20, Y9, Y5
   239  	VPSHUFD  $0x4e, Y8, Y8
   240  	VPMULUDQ Y9, Y5, Y5
   241  	VPADDQ   Y1, Y8, Y1
   242  	VPADDQ   Y1, Y5, Y1
   243  	VPXOR    Y10, Y11, Y11
   244  	VPSRLQ   $0x20, Y11, Y5
   245  	VPSHUFD  $0x4e, Y10, Y10
   246  	VPMULUDQ Y11, Y5, Y5
   247  	VPADDQ   Y2, Y10, Y2
   248  	VPADDQ   Y2, Y5, Y2
   249  	VMOVDQU  896(CX), Y3
   250  	VMOVDQU  112(DX), Y4
   251  	VMOVDQU  928(CX), Y6
   252  	VMOVDQU  144(DX), Y7
   253  	VMOVDQU  960(CX), Y8
   254  	VMOVDQU  120(DX), Y9
   255  	VMOVDQU  992(CX), Y10
   256  	VMOVDQU  152(DX), Y11
   257  	VPXOR    Y3, Y4, Y4
   258  	VPSRLQ   $0x20, Y4, Y5
   259  	VPSHUFD  $0x4e, Y3, Y3
   260  	VPMULUDQ Y4, Y5, Y5
   261  	VPADDQ   Y1, Y3, Y1
   262  	VPADDQ   Y1, Y5, Y1
   263  	VPXOR    Y6, Y7, Y7
   264  	VPSRLQ   $0x20, Y7, Y5
   265  	VPSHUFD  $0x4e, Y6, Y6
   266  	VPMULUDQ Y7, Y5, Y5
   267  	VPADDQ   Y2, Y6, Y2
   268  	VPADDQ   Y2, Y5, Y2
   269  	VPXOR    Y8, Y9, Y9
   270  	VPSRLQ   $0x20, Y9, Y5
   271  	VPSHUFD  $0x4e, Y8, Y8
   272  	VPMULUDQ Y9, Y5, Y5
   273  	VPADDQ   Y1, Y8, Y1
   274  	VPADDQ   Y1, Y5, Y1
   275  	VPXOR    Y10, Y11, Y11
   276  	VPSRLQ   $0x20, Y11, Y5
   277  	VPSHUFD  $0x4e, Y10, Y10
   278  	VPMULUDQ Y11, Y5, Y5
   279  	VPADDQ   Y2, Y10, Y2
   280  	VPADDQ   Y2, Y5, Y2
   281  	ADDQ     $0x00000400, CX
   282  	SUBQ     $0x00000400, SI
   283  	VPSRLQ   $0x2f, Y1, Y3
   284  	VPXOR    Y1, Y3, Y3
   285  	VPXOR    128(DX), Y3, Y3
   286  	VPMULUDQ Y0, Y3, Y4
   287  	VPSRLQ   $0x20, Y3, Y3
   288  	VPMULUDQ Y0, Y3, Y3
   289  	VPSLLQ   $0x20, Y3, Y3
   290  	VPADDQ   Y4, Y3, Y1
   291  	VPSRLQ   $0x2f, Y2, Y3
   292  	VPXOR    Y2, Y3, Y3
   293  	VPXOR    160(DX), Y3, Y3
   294  	VPMULUDQ Y0, Y3, Y4
   295  	VPSRLQ   $0x20, Y3, Y3
   296  	VPMULUDQ Y0, Y3, Y3
   297  	VPSLLQ   $0x20, Y3, Y3
   298  	VPADDQ   Y4, Y3, Y2
   299  	JMP      accumBlock
   300  
   301  accumStripe:
   302  	CMPQ     SI, $0x40
   303  	JLE      accumLastStripe
   304  	VMOVDQU  (CX), Y0
   305  	VMOVDQU  (BX), Y3
   306  	VMOVDQU  32(CX), Y5
   307  	VMOVDQU  32(BX), Y6
   308  	VPXOR    Y0, Y3, Y3
   309  	VPSRLQ   $0x20, Y3, Y4
   310  	VPSHUFD  $0x4e, Y0, Y0
   311  	VPMULUDQ Y3, Y4, Y4
   312  	VPADDQ   Y1, Y0, Y1
   313  	VPADDQ   Y1, Y4, Y1
   314  	VPXOR    Y5, Y6, Y6
   315  	VPSRLQ   $0x20, Y6, Y4
   316  	VPMULUDQ Y6, Y4, Y4
   317  	VPSHUFD  $0x4e, Y5, Y5
   318  	VPADDQ   Y2, Y5, Y2
   319  	VPADDQ   Y2, Y4, Y2
   320  	ADDQ     $0x00000040, CX
   321  	SUBQ     $0x00000040, SI
   322  	ADDQ     $0x00000008, BX
   323  	JMP      accumStripe
   324  
   325  accumLastStripe:
   326  	CMPQ     SI, $0x00
   327  	JE       return
   328  	SUBQ     $0x40, CX
   329  	ADDQ     SI, CX
   330  	VMOVDQU  (CX), Y0
   331  	VMOVDQU  121(DX), Y3
   332  	VMOVDQU  32(CX), Y5
   333  	VMOVDQU  153(DX), Y6
   334  	VPXOR    Y0, Y3, Y3
   335  	VPSRLQ   $0x20, Y3, Y4
   336  	VPSHUFD  $0x4e, Y0, Y0
   337  	VPMULUDQ Y3, Y4, Y4
   338  	VPADDQ   Y1, Y0, Y1
   339  	VPADDQ   Y1, Y4, Y1
   340  	VPXOR    Y5, Y6, Y6
   341  	VPSRLQ   $0x20, Y6, Y4
   342  	VPMULUDQ Y6, Y4, Y4
   343  	VPSHUFD  $0x4e, Y5, Y5
   344  	VPADDQ   Y2, Y5, Y2
   345  	VPADDQ   Y2, Y4, Y2
   346  
   347  return:
   348  	VMOVDQU Y1, (AX)
   349  	VMOVDQU Y2, 32(AX)
   350  	RET