github.com/cloudflare/circl@v1.5.0/pke/kyber/internal/common/amd64.s (about)

     1  // Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT.
     2  
     3  //go:build amd64 && !purego
     4  
     5  #include "textflag.h"
     6  
     7  // func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
     8  // Requires: AVX, AVX2
     9  TEXT ·addAVX2(SB), NOSPLIT, $0-24 // element-wise p[i] = a[i] + b[i] over 256 int16 values (512 bytes); no stack frame, 24 bytes of pointer arguments
    10  	MOVQ    p+0(FP), AX // AX = p (destination)
    11  	MOVQ    a+8(FP), CX // CX = a (first addend)
    12  	MOVQ    b+16(FP), DX // DX = b (second addend)
    13  	VMOVDQU (CX), Y0 // first half: a[0..127] into the even registers Y0..Y14, 16 words per register
    14  	VMOVDQU 32(CX), Y2
    15  	VMOVDQU 64(CX), Y4
    16  	VMOVDQU 96(CX), Y6
    17  	VMOVDQU 128(CX), Y8
    18  	VMOVDQU 160(CX), Y10
    19  	VMOVDQU 192(CX), Y12
    20  	VMOVDQU 224(CX), Y14
    21  	VMOVDQU (DX), Y1 // b[0..127] into the odd registers Y1..Y15
    22  	VMOVDQU 32(DX), Y3
    23  	VMOVDQU 64(DX), Y5
    24  	VMOVDQU 96(DX), Y7
    25  	VMOVDQU 128(DX), Y9
    26  	VMOVDQU 160(DX), Y11
    27  	VMOVDQU 192(DX), Y13
    28  	VMOVDQU 224(DX), Y15
    29  	VPADDW  Y0, Y1, Y1 // odd register += even register, per 16-bit lane; VPADDW wraps on overflow and no modular reduction is applied
    30  	VPADDW  Y2, Y3, Y3
    31  	VPADDW  Y4, Y5, Y5
    32  	VPADDW  Y6, Y7, Y7
    33  	VPADDW  Y8, Y9, Y9
    34  	VPADDW  Y10, Y11, Y11
    35  	VPADDW  Y12, Y13, Y13
    36  	VPADDW  Y14, Y15, Y15
    37  	VMOVDQU Y1, (AX) // write sums to p[0..127]; every load of this half was issued before this first store
    38  	VMOVDQU Y3, 32(AX)
    39  	VMOVDQU Y5, 64(AX)
    40  	VMOVDQU Y7, 96(AX)
    41  	VMOVDQU Y9, 128(AX)
    42  	VMOVDQU Y11, 160(AX)
    43  	VMOVDQU Y13, 192(AX)
    44  	VMOVDQU Y15, 224(AX)
    45  	VMOVDQU 256(CX), Y0 // second half: identical load/add/store sequence for elements 128..255 (byte offsets 256..511)
    46  	VMOVDQU 288(CX), Y2
    47  	VMOVDQU 320(CX), Y4
    48  	VMOVDQU 352(CX), Y6
    49  	VMOVDQU 384(CX), Y8
    50  	VMOVDQU 416(CX), Y10
    51  	VMOVDQU 448(CX), Y12
    52  	VMOVDQU 480(CX), Y14
    53  	VMOVDQU 256(DX), Y1 // b[128..255]
    54  	VMOVDQU 288(DX), Y3
    55  	VMOVDQU 320(DX), Y5
    56  	VMOVDQU 352(DX), Y7
    57  	VMOVDQU 384(DX), Y9
    58  	VMOVDQU 416(DX), Y11
    59  	VMOVDQU 448(DX), Y13
    60  	VMOVDQU 480(DX), Y15
    61  	VPADDW  Y0, Y1, Y1 // a[128..255] + b[128..255]
    62  	VPADDW  Y2, Y3, Y3
    63  	VPADDW  Y4, Y5, Y5
    64  	VPADDW  Y6, Y7, Y7
    65  	VPADDW  Y8, Y9, Y9
    66  	VPADDW  Y10, Y11, Y11
    67  	VPADDW  Y12, Y13, Y13
    68  	VPADDW  Y14, Y15, Y15
    69  	VMOVDQU Y1, 256(AX) // write sums to p[128..255]
    70  	VMOVDQU Y3, 288(AX)
    71  	VMOVDQU Y5, 320(AX)
    72  	VMOVDQU Y7, 352(AX)
    73  	VMOVDQU Y9, 384(AX)
    74  	VMOVDQU Y11, 416(AX)
    75  	VMOVDQU Y13, 448(AX)
    76  	VMOVDQU Y15, 480(AX)
    77  	RET
    78  
    79  // func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
    80  // Requires: AVX, AVX2
    81  TEXT ·subAVX2(SB), NOSPLIT, $0-24 // element-wise p[i] = a[i] - b[i] over 256 int16 values (512 bytes); no stack frame, 24 bytes of pointer arguments
    82  	MOVQ    p+0(FP), AX // AX = p (destination)
    83  	MOVQ    a+8(FP), CX // CX = a (minuend)
    84  	MOVQ    b+16(FP), DX // DX = b (subtrahend)
    85  	VMOVDQU (CX), Y0 // first half: a[0..127] into the even registers Y0..Y14, 16 words per register
    86  	VMOVDQU 32(CX), Y2
    87  	VMOVDQU 64(CX), Y4
    88  	VMOVDQU 96(CX), Y6
    89  	VMOVDQU 128(CX), Y8
    90  	VMOVDQU 160(CX), Y10
    91  	VMOVDQU 192(CX), Y12
    92  	VMOVDQU 224(CX), Y14
    93  	VMOVDQU (DX), Y1 // b[0..127] into the odd registers Y1..Y15
    94  	VMOVDQU 32(DX), Y3
    95  	VMOVDQU 64(DX), Y5
    96  	VMOVDQU 96(DX), Y7
    97  	VMOVDQU 128(DX), Y9
    98  	VMOVDQU 160(DX), Y11
    99  	VMOVDQU 192(DX), Y13
   100  	VMOVDQU 224(DX), Y15
   101  	VPSUBW  Y1, Y0, Y1 // Y1 = Y0 - Y1, i.e. a - b (first operand is the subtrahend, last is the destination); wraps on overflow, no modular reduction
   102  	VPSUBW  Y3, Y2, Y3
   103  	VPSUBW  Y5, Y4, Y5
   104  	VPSUBW  Y7, Y6, Y7
   105  	VPSUBW  Y9, Y8, Y9
   106  	VPSUBW  Y11, Y10, Y11
   107  	VPSUBW  Y13, Y12, Y13
   108  	VPSUBW  Y15, Y14, Y15
   109  	VMOVDQU Y1, (AX) // write differences to p[0..127]; every load of this half was issued before this first store
   110  	VMOVDQU Y3, 32(AX)
   111  	VMOVDQU Y5, 64(AX)
   112  	VMOVDQU Y7, 96(AX)
   113  	VMOVDQU Y9, 128(AX)
   114  	VMOVDQU Y11, 160(AX)
   115  	VMOVDQU Y13, 192(AX)
   116  	VMOVDQU Y15, 224(AX)
   117  	VMOVDQU 256(CX), Y0 // second half: identical load/subtract/store sequence for elements 128..255 (byte offsets 256..511)
   118  	VMOVDQU 288(CX), Y2
   119  	VMOVDQU 320(CX), Y4
   120  	VMOVDQU 352(CX), Y6
   121  	VMOVDQU 384(CX), Y8
   122  	VMOVDQU 416(CX), Y10
   123  	VMOVDQU 448(CX), Y12
   124  	VMOVDQU 480(CX), Y14
   125  	VMOVDQU 256(DX), Y1 // b[128..255]
   126  	VMOVDQU 288(DX), Y3
   127  	VMOVDQU 320(DX), Y5
   128  	VMOVDQU 352(DX), Y7
   129  	VMOVDQU 384(DX), Y9
   130  	VMOVDQU 416(DX), Y11
   131  	VMOVDQU 448(DX), Y13
   132  	VMOVDQU 480(DX), Y15
   133  	VPSUBW  Y1, Y0, Y1 // a[128..255] - b[128..255]
   134  	VPSUBW  Y3, Y2, Y3
   135  	VPSUBW  Y5, Y4, Y5
   136  	VPSUBW  Y7, Y6, Y7
   137  	VPSUBW  Y9, Y8, Y9
   138  	VPSUBW  Y11, Y10, Y11
   139  	VPSUBW  Y13, Y12, Y13
   140  	VPSUBW  Y15, Y14, Y15
   141  	VMOVDQU Y1, 256(AX) // write differences to p[128..255]
   142  	VMOVDQU Y3, 288(AX)
   143  	VMOVDQU Y5, 320(AX)
   144  	VMOVDQU Y7, 352(AX)
   145  	VMOVDQU Y9, 384(AX)
   146  	VMOVDQU Y11, 416(AX)
   147  	VMOVDQU Y13, 448(AX)
   148  	VMOVDQU Y15, 480(AX)
   149  	RET
   150  
   151  // func nttAVX2(p *[256]int16)
   152  // Requires: AVX, AVX2
   153  TEXT ·nttAVX2(SB), NOSPLIT, $0-8 // in-place transform of the 256 int16 values at p: seven rounds of multiply-by-twiddle + add/sub butterflies, twiddles read from ·ZetasAVX2 (defined outside this file)
   154  	MOVQ         p+0(FP), AX // AX = p
   155  	LEAQ         ·ZetasAVX2+0(SB), CX // CX = base address of the twiddle table
   156  	MOVL         $0x00000d01, DX // 0x0d01 = 3329, the modulus used in every reduction step below (presumably the Kyber prime q — confirm against the Go-side constants)
   157  	VMOVD        DX, X0
   158  	VPBROADCASTW X0, Y15 // Y15 = 3329 in all 16 word lanes; only ever read from here on
   159  	VPBROADCASTW (CX), Y0 // round 1 twiddle pair: Y0 = table word 0 (used with VPMULLW), Y1 = table word 1 (used with VPMULHW)
   160  	VPBROADCASTW 2(CX), Y1 // NOTE(review): Y0 looks like Y1 * q^-1 mod 2^16 for a Montgomery-style multiply — TODO confirm against the table generator
   161  	VMOVDQU      (AX), Y7 // round 1, first group: u = p bytes 0..127 (elements 0..63)
   162  	VMOVDQU      32(AX), Y8
   163  	VMOVDQU      64(AX), Y9
   164  	VMOVDQU      96(AX), Y10
   165  	VMOVDQU      256(AX), Y11 // x = p bytes 256..383 (elements 128..191), i.e. 128 elements away from u
   166  	VMOVDQU      288(AX), Y12
   167  	VMOVDQU      320(AX), Y13
   168  	VMOVDQU      352(AX), Y14
   169  	VPMULLW      Y11, Y0, Y2 // Y2..Y5 = low 16 bits of x * Y0
   170  	VPMULLW      Y12, Y0, Y3
   171  	VPMULLW      Y13, Y0, Y4
   172  	VPMULLW      Y14, Y0, Y5
   173  	VPMULHW      Y11, Y1, Y11 // Y11..Y14 = high 16 bits of the signed product x * Y1
   174  	VPMULHW      Y12, Y1, Y12
   175  	VPMULHW      Y13, Y1, Y13
   176  	VPMULHW      Y14, Y1, Y14
   177  	VPMULHW      Y2, Y15, Y2 // Y2..Y5 = high 16 bits of 3329 * (low product)
   178  	VPMULHW      Y3, Y15, Y3
   179  	VPMULHW      Y4, Y15, Y4
   180  	VPMULHW      Y5, Y15, Y5
   181  	VPSUBW       Y2, Y11, Y2 // t = hi(x*Y1) - hi(3329 * lo(x*Y0)): the reduced twiddle product
   182  	VPSUBW       Y3, Y12, Y3
   183  	VPSUBW       Y4, Y13, Y4
   184  	VPSUBW       Y5, Y14, Y5
   185  	VPSUBW       Y2, Y7, Y11 // butterfly: lower output = u - t
   186  	VPSUBW       Y3, Y8, Y12
   187  	VPSUBW       Y4, Y9, Y13
   188  	VPSUBW       Y5, Y10, Y14
   189  	VPADDW       Y2, Y7, Y7 // butterfly: upper output = u + t
   190  	VPADDW       Y3, Y8, Y8
   191  	VPADDW       Y4, Y9, Y9
   192  	VPADDW       Y5, Y10, Y10
   193  	VMOVDQU      Y7, (AX) // write both outputs back to the positions they were loaded from
   194  	VMOVDQU      Y8, 32(AX)
   195  	VMOVDQU      Y9, 64(AX)
   196  	VMOVDQU      Y10, 96(AX)
   197  	VMOVDQU      Y11, 256(AX)
   198  	VMOVDQU      Y12, 288(AX)
   199  	VMOVDQU      Y13, 320(AX)
   200  	VMOVDQU      Y14, 352(AX)
   201  	VMOVDQU      128(AX), Y7 // round 1, second group: same butterfly for p bytes 128..255 against bytes 384..511, reusing Y0/Y1
   202  	VMOVDQU      160(AX), Y8
   203  	VMOVDQU      192(AX), Y9
   204  	VMOVDQU      224(AX), Y10
   205  	VMOVDQU      384(AX), Y11
   206  	VMOVDQU      416(AX), Y12
   207  	VMOVDQU      448(AX), Y13
   208  	VMOVDQU      480(AX), Y14
   209  	VPMULLW      Y11, Y0, Y2 // reduced twiddle product t, same four-step sequence as above
   210  	VPMULLW      Y12, Y0, Y3
   211  	VPMULLW      Y13, Y0, Y4
   212  	VPMULLW      Y14, Y0, Y5
   213  	VPMULHW      Y11, Y1, Y11
   214  	VPMULHW      Y12, Y1, Y12
   215  	VPMULHW      Y13, Y1, Y13
   216  	VPMULHW      Y14, Y1, Y14
   217  	VPMULHW      Y2, Y15, Y2
   218  	VPMULHW      Y3, Y15, Y3
   219  	VPMULHW      Y4, Y15, Y4
   220  	VPMULHW      Y5, Y15, Y5
   221  	VPSUBW       Y2, Y11, Y2
   222  	VPSUBW       Y3, Y12, Y3
   223  	VPSUBW       Y4, Y13, Y4
   224  	VPSUBW       Y5, Y14, Y5
   225  	VPSUBW       Y2, Y7, Y11 // u - t
   226  	VPSUBW       Y3, Y8, Y12
   227  	VPSUBW       Y4, Y9, Y13
   228  	VPSUBW       Y5, Y10, Y14
   229  	VPADDW       Y2, Y7, Y7 // u + t
   230  	VPADDW       Y3, Y8, Y8
   231  	VPADDW       Y4, Y9, Y9
   232  	VPADDW       Y5, Y10, Y10
   233  	VMOVDQU      Y7, 128(AX)
   234  	VMOVDQU      Y8, 160(AX)
   235  	VMOVDQU      Y9, 192(AX)
   236  	VMOVDQU      Y10, 224(AX)
   237  	VMOVDQU      Y11, 384(AX)
   238  	VMOVDQU      Y12, 416(AX)
   239  	VMOVDQU      Y13, 448(AX)
   240  	VMOVDQU      Y14, 480(AX)
   241  	VPBROADCASTW 4(CX), Y0 // rounds 2..7 for the first 256 bytes of p (elements 0..127), kept in Y7..Y14 throughout; round 2 twiddle pair = table bytes 4 and 6
   242  	VPBROADCASTW 6(CX), Y1
   243  	VMOVDQU      (AX), Y7
   244  	VMOVDQU      32(AX), Y8
   245  	VMOVDQU      64(AX), Y9
   246  	VMOVDQU      96(AX), Y10
   247  	VMOVDQU      128(AX), Y11
   248  	VMOVDQU      160(AX), Y12
   249  	VMOVDQU      192(AX), Y13
   250  	VMOVDQU      224(AX), Y14
   251  	VPMULLW      Y11, Y0, Y2 // round 2: t = reduced product of Y11..Y14 with the twiddle
   252  	VPMULLW      Y12, Y0, Y3
   253  	VPMULLW      Y13, Y0, Y4
   254  	VPMULLW      Y14, Y0, Y5
   255  	VPMULHW      Y11, Y1, Y11
   256  	VPMULHW      Y12, Y1, Y12
   257  	VPMULHW      Y13, Y1, Y13
   258  	VPMULHW      Y14, Y1, Y14
   259  	VPMULHW      Y2, Y15, Y2
   260  	VPMULHW      Y3, Y15, Y3
   261  	VPMULHW      Y4, Y15, Y4
   262  	VPMULHW      Y5, Y15, Y5
   263  	VPSUBW       Y2, Y11, Y2
   264  	VPSUBW       Y3, Y12, Y3
   265  	VPSUBW       Y4, Y13, Y4
   266  	VPSUBW       Y5, Y14, Y5
   267  	VPSUBW       Y2, Y7, Y11 // butterflies pair Y7..Y10 with Y11..Y14 (data 128 bytes apart)
   268  	VPSUBW       Y3, Y8, Y12
   269  	VPSUBW       Y4, Y9, Y13
   270  	VPSUBW       Y5, Y10, Y14
   271  	VPADDW       Y2, Y7, Y7
   272  	VPADDW       Y3, Y8, Y8
   273  	VPADDW       Y4, Y9, Y9
   274  	VPADDW       Y5, Y10, Y10
   275  	VPBROADCASTW 12(CX), Y0 // round 3: two twiddle pairs, (Y0,Y1) for Y9/Y10 and (Y2,Y3) for Y13/Y14
   276  	VPBROADCASTW 14(CX), Y1
   277  	VPBROADCASTW 16(CX), Y2
   278  	VPBROADCASTW 18(CX), Y3
   279  	VPMULLW      Y9, Y0, Y4
   280  	VPMULLW      Y10, Y0, Y5
   281  	VPMULLW      Y13, Y2, Y6
   282  	VPMULLW      Y14, Y2, Y0 // Y0 is recycled as a temporary here; its twiddle value is not read again this round
   283  	VPMULHW      Y9, Y1, Y9
   284  	VPMULHW      Y10, Y1, Y10
   285  	VPMULHW      Y13, Y3, Y13
   286  	VPMULHW      Y14, Y3, Y14
   287  	VPMULHW      Y4, Y15, Y4
   288  	VPMULHW      Y5, Y15, Y5
   289  	VPMULHW      Y6, Y15, Y6
   290  	VPMULHW      Y0, Y15, Y0
   291  	VPSUBW       Y4, Y9, Y4
   292  	VPSUBW       Y5, Y10, Y5
   293  	VPSUBW       Y6, Y13, Y6
   294  	VPSUBW       Y0, Y14, Y0
   295  	VPSUBW       Y4, Y7, Y9 // butterflies pair (Y7,Y9) (Y8,Y10) (Y11,Y13) (Y12,Y14): data 64 bytes apart
   296  	VPSUBW       Y5, Y8, Y10
   297  	VPSUBW       Y6, Y11, Y13
   298  	VPSUBW       Y0, Y12, Y14
   299  	VPADDW       Y4, Y7, Y7
   300  	VPADDW       Y5, Y8, Y8
   301  	VPADDW       Y6, Y11, Y11
   302  	VPADDW       Y0, Y12, Y12
   303  	VMOVDQU      32(CX), Y0 // round 4: twiddles are now full 32-byte vectors (one value per lane), table bytes 32..159
   304  	VMOVDQU      64(CX), Y1
   305  	VMOVDQU      96(CX), Y2
   306  	VMOVDQU      128(CX), Y3
   307  	VPERM2I128   $0x20, Y9, Y7, Y4 // regroup each register pair by 128-bit halves: one register gets both low halves ($0x20), the other both high halves ($0x31)
   308  	VPERM2I128   $0x31, Y9, Y7, Y9
   309  	VMOVDQA      Y4, Y7
   310  	VPERM2I128   $0x20, Y10, Y8, Y4
   311  	VPERM2I128   $0x31, Y10, Y8, Y10
   312  	VMOVDQA      Y4, Y8
   313  	VPERM2I128   $0x20, Y13, Y11, Y4
   314  	VPERM2I128   $0x31, Y13, Y11, Y13
   315  	VMOVDQA      Y4, Y11
   316  	VPERM2I128   $0x20, Y14, Y12, Y4
   317  	VPERM2I128   $0x31, Y14, Y12, Y14
   318  	VMOVDQA      Y4, Y12
   319  	VPMULLW      Y8, Y0, Y4 // reduced twiddle products for Y8, Y10 (pair Y0,Y1) and Y12, Y14 (pair Y2,Y3)
   320  	VPMULLW      Y10, Y0, Y5
   321  	VPMULLW      Y12, Y2, Y6
   322  	VPMULLW      Y14, Y2, Y0
   323  	VPMULHW      Y8, Y1, Y8
   324  	VPMULHW      Y10, Y1, Y10
   325  	VPMULHW      Y12, Y3, Y12
   326  	VPMULHW      Y14, Y3, Y14
   327  	VPMULHW      Y4, Y15, Y4
   328  	VPMULHW      Y5, Y15, Y5
   329  	VPMULHW      Y6, Y15, Y6
   330  	VPMULHW      Y0, Y15, Y0
   331  	VPSUBW       Y4, Y8, Y4
   332  	VPSUBW       Y5, Y10, Y5
   333  	VPSUBW       Y6, Y12, Y6
   334  	VPSUBW       Y0, Y14, Y0
   335  	VPSUBW       Y4, Y7, Y8 // butterflies now pair adjacent registers: (Y7,Y8) (Y9,Y10) (Y11,Y12) (Y13,Y14)
   336  	VPSUBW       Y5, Y9, Y10
   337  	VPSUBW       Y6, Y11, Y12
   338  	VPSUBW       Y0, Y13, Y14
   339  	VPADDW       Y4, Y7, Y7
   340  	VPADDW       Y5, Y9, Y9
   341  	VPADDW       Y6, Y11, Y11
   342  	VPADDW       Y0, Y13, Y13
   343  	VMOVDQU      288(CX), Y0 // round 5: twiddle vectors from table bytes 288..415
   344  	VMOVDQU      320(CX), Y1
   345  	VMOVDQU      352(CX), Y2
   346  	VMOVDQU      384(CX), Y3
   347  	VPUNPCKLQDQ  Y8, Y7, Y4 // regroup each register pair at 64-bit granularity (low quadwords of each lane together, high quadwords together)
   348  	VPUNPCKHQDQ  Y8, Y7, Y8
   349  	VMOVDQA      Y4, Y7
   350  	VPUNPCKLQDQ  Y10, Y9, Y4
   351  	VPUNPCKHQDQ  Y10, Y9, Y10
   352  	VMOVDQA      Y4, Y9
   353  	VPUNPCKLQDQ  Y12, Y11, Y4
   354  	VPUNPCKHQDQ  Y12, Y11, Y12
   355  	VMOVDQA      Y4, Y11
   356  	VPUNPCKLQDQ  Y14, Y13, Y4
   357  	VPUNPCKHQDQ  Y14, Y13, Y14
   358  	VMOVDQA      Y4, Y13
   359  	VPMULLW      Y9, Y0, Y4 // reduced twiddle products for Y9, Y10 (pair Y0,Y1) and Y13, Y14 (pair Y2,Y3)
   360  	VPMULLW      Y10, Y0, Y5
   361  	VPMULLW      Y13, Y2, Y6
   362  	VPMULLW      Y14, Y2, Y0
   363  	VPMULHW      Y9, Y1, Y9
   364  	VPMULHW      Y10, Y1, Y10
   365  	VPMULHW      Y13, Y3, Y13
   366  	VPMULHW      Y14, Y3, Y14
   367  	VPMULHW      Y4, Y15, Y4
   368  	VPMULHW      Y5, Y15, Y5
   369  	VPMULHW      Y6, Y15, Y6
   370  	VPMULHW      Y0, Y15, Y0
   371  	VPSUBW       Y4, Y9, Y4
   372  	VPSUBW       Y5, Y10, Y5
   373  	VPSUBW       Y6, Y13, Y6
   374  	VPSUBW       Y0, Y14, Y0
   375  	VPSUBW       Y4, Y7, Y9 // butterflies pair (Y7,Y9) (Y8,Y10) (Y11,Y13) (Y12,Y14)
   376  	VPSUBW       Y5, Y8, Y10
   377  	VPSUBW       Y6, Y11, Y13
   378  	VPSUBW       Y0, Y12, Y14
   379  	VPADDW       Y4, Y7, Y7
   380  	VPADDW       Y5, Y8, Y8
   381  	VPADDW       Y6, Y11, Y11
   382  	VPADDW       Y0, Y12, Y12
   383  	VMOVDQU      544(CX), Y0 // round 6: twiddle vectors from table bytes 544..671
   384  	VMOVDQU      576(CX), Y1
   385  	VMOVDQU      608(CX), Y2
   386  	VMOVDQU      640(CX), Y3
   387  	VMOVSLDUP    Y9, Y4 // regroup each register pair at 32-bit granularity: even-indexed dwords of both go to one register, odd-indexed dwords to the other
   388  	VPBLENDD     $0xaa, Y4, Y7, Y4
   389  	VPSRLQ       $0x20, Y7, Y7
   390  	VPBLENDD     $0xaa, Y9, Y7, Y9
   391  	VMOVDQA      Y4, Y7
   392  	VMOVSLDUP    Y10, Y4
   393  	VPBLENDD     $0xaa, Y4, Y8, Y4
   394  	VPSRLQ       $0x20, Y8, Y8
   395  	VPBLENDD     $0xaa, Y10, Y8, Y10
   396  	VMOVDQA      Y4, Y8
   397  	VMOVSLDUP    Y13, Y4
   398  	VPBLENDD     $0xaa, Y4, Y11, Y4
   399  	VPSRLQ       $0x20, Y11, Y11
   400  	VPBLENDD     $0xaa, Y13, Y11, Y13
   401  	VMOVDQA      Y4, Y11
   402  	VMOVSLDUP    Y14, Y4
   403  	VPBLENDD     $0xaa, Y4, Y12, Y4
   404  	VPSRLQ       $0x20, Y12, Y12
   405  	VPBLENDD     $0xaa, Y14, Y12, Y14
   406  	VMOVDQA      Y4, Y12
   407  	VPMULLW      Y8, Y0, Y4 // reduced twiddle products for Y8, Y10 (pair Y0,Y1) and Y12, Y14 (pair Y2,Y3)
   408  	VPMULLW      Y10, Y0, Y5
   409  	VPMULLW      Y12, Y2, Y6
   410  	VPMULLW      Y14, Y2, Y0
   411  	VPMULHW      Y8, Y1, Y8
   412  	VPMULHW      Y10, Y1, Y10
   413  	VPMULHW      Y12, Y3, Y12
   414  	VPMULHW      Y14, Y3, Y14
   415  	VPMULHW      Y4, Y15, Y4
   416  	VPMULHW      Y5, Y15, Y5
   417  	VPMULHW      Y6, Y15, Y6
   418  	VPMULHW      Y0, Y15, Y0
   419  	VPSUBW       Y4, Y8, Y4
   420  	VPSUBW       Y5, Y10, Y5
   421  	VPSUBW       Y6, Y12, Y6
   422  	VPSUBW       Y0, Y14, Y0
   423  	VPSUBW       Y4, Y7, Y8 // butterflies pair (Y7,Y8) (Y9,Y10) (Y11,Y12) (Y13,Y14)
   424  	VPSUBW       Y5, Y9, Y10
   425  	VPSUBW       Y6, Y11, Y12
   426  	VPSUBW       Y0, Y13, Y14
   427  	VPADDW       Y4, Y7, Y7
   428  	VPADDW       Y5, Y9, Y9
   429  	VPADDW       Y6, Y11, Y11
   430  	VPADDW       Y0, Y13, Y13
   431  	VMOVDQU      800(CX), Y0 // round 7: twiddle vectors from table bytes 800..927
   432  	VMOVDQU      832(CX), Y1
   433  	VMOVDQU      864(CX), Y2
   434  	VMOVDQU      896(CX), Y3
   435  	VPSLLD       $0x10, Y8, Y4 // regroup each register pair at 16-bit granularity: even-indexed words of both go to one register, odd-indexed words to the other
   436  	VPBLENDW     $0xaa, Y4, Y7, Y4
   437  	VPSRLD       $0x10, Y7, Y7
   438  	VPBLENDW     $0xaa, Y8, Y7, Y8
   439  	VMOVDQA      Y4, Y7
   440  	VPSLLD       $0x10, Y10, Y4
   441  	VPBLENDW     $0xaa, Y4, Y9, Y4
   442  	VPSRLD       $0x10, Y9, Y9
   443  	VPBLENDW     $0xaa, Y10, Y9, Y10
   444  	VMOVDQA      Y4, Y9
   445  	VPSLLD       $0x10, Y12, Y4
   446  	VPBLENDW     $0xaa, Y4, Y11, Y4
   447  	VPSRLD       $0x10, Y11, Y11
   448  	VPBLENDW     $0xaa, Y12, Y11, Y12
   449  	VMOVDQA      Y4, Y11
   450  	VPSLLD       $0x10, Y14, Y4
   451  	VPBLENDW     $0xaa, Y4, Y13, Y4
   452  	VPSRLD       $0x10, Y13, Y13
   453  	VPBLENDW     $0xaa, Y14, Y13, Y14
   454  	VMOVDQA      Y4, Y13
   455  	VPMULLW      Y9, Y0, Y4 // reduced twiddle products for Y9, Y10 (pair Y0,Y1) and Y13, Y14 (pair Y2,Y3)
   456  	VPMULLW      Y10, Y0, Y5
   457  	VPMULLW      Y13, Y2, Y6
   458  	VPMULLW      Y14, Y2, Y0
   459  	VPMULHW      Y9, Y1, Y9
   460  	VPMULHW      Y10, Y1, Y10
   461  	VPMULHW      Y13, Y3, Y13
   462  	VPMULHW      Y14, Y3, Y14
   463  	VPMULHW      Y4, Y15, Y4
   464  	VPMULHW      Y5, Y15, Y5
   465  	VPMULHW      Y6, Y15, Y6
   466  	VPMULHW      Y0, Y15, Y0
   467  	VPSUBW       Y4, Y9, Y4
   468  	VPSUBW       Y5, Y10, Y5
   469  	VPSUBW       Y6, Y13, Y6
   470  	VPSUBW       Y0, Y14, Y0
   471  	VPSUBW       Y4, Y7, Y9 // butterflies pair (Y7,Y9) (Y8,Y10) (Y11,Y13) (Y12,Y14)
   472  	VPSUBW       Y5, Y8, Y10
   473  	VPSUBW       Y6, Y11, Y13
   474  	VPSUBW       Y0, Y12, Y14
   475  	VPADDW       Y4, Y7, Y7
   476  	VPADDW       Y5, Y8, Y8
   477  	VPADDW       Y6, Y11, Y11
   478  	VPADDW       Y0, Y12, Y12
   479  	VMOVDQU      Y7, (AX) // store Y7..Y14 to p bytes 0..255; the regrouping shuffles above are not undone, so the stored element order reflects them
   480  	VMOVDQU      Y8, 32(AX)
   481  	VMOVDQU      Y9, 64(AX)
   482  	VMOVDQU      Y10, 96(AX)
   483  	VMOVDQU      Y11, 128(AX)
   484  	VMOVDQU      Y12, 160(AX)
   485  	VMOVDQU      Y13, 192(AX)
   486  	VMOVDQU      Y14, 224(AX)
   487  	VPBROADCASTW 8(CX), Y0 // rounds 2..7 again, now for p bytes 256..511 (elements 128..255); round 2 twiddle pair = table bytes 8 and 10
   488  	VPBROADCASTW 10(CX), Y1
   489  	VMOVDQU      256(AX), Y7
   490  	VMOVDQU      288(AX), Y8
   491  	VMOVDQU      320(AX), Y9
   492  	VMOVDQU      352(AX), Y10
   493  	VMOVDQU      384(AX), Y11
   494  	VMOVDQU      416(AX), Y12
   495  	VMOVDQU      448(AX), Y13
   496  	VMOVDQU      480(AX), Y14
   497  	VPMULLW      Y11, Y0, Y2 // round 2: t = reduced product of Y11..Y14 with the twiddle
   498  	VPMULLW      Y12, Y0, Y3
   499  	VPMULLW      Y13, Y0, Y4
   500  	VPMULLW      Y14, Y0, Y5
   501  	VPMULHW      Y11, Y1, Y11
   502  	VPMULHW      Y12, Y1, Y12
   503  	VPMULHW      Y13, Y1, Y13
   504  	VPMULHW      Y14, Y1, Y14
   505  	VPMULHW      Y2, Y15, Y2
   506  	VPMULHW      Y3, Y15, Y3
   507  	VPMULHW      Y4, Y15, Y4
   508  	VPMULHW      Y5, Y15, Y5
   509  	VPSUBW       Y2, Y11, Y2
   510  	VPSUBW       Y3, Y12, Y3
   511  	VPSUBW       Y4, Y13, Y4
   512  	VPSUBW       Y5, Y14, Y5
   513  	VPSUBW       Y2, Y7, Y11 // butterflies pair Y7..Y10 with Y11..Y14
   514  	VPSUBW       Y3, Y8, Y12
   515  	VPSUBW       Y4, Y9, Y13
   516  	VPSUBW       Y5, Y10, Y14
   517  	VPADDW       Y2, Y7, Y7
   518  	VPADDW       Y3, Y8, Y8
   519  	VPADDW       Y4, Y9, Y9
   520  	VPADDW       Y5, Y10, Y10
   521  	VPBROADCASTW 20(CX), Y0 // round 3: twiddle pairs from table bytes 20..27
   522  	VPBROADCASTW 22(CX), Y1
   523  	VPBROADCASTW 24(CX), Y2
   524  	VPBROADCASTW 26(CX), Y3
   525  	VPMULLW      Y9, Y0, Y4
   526  	VPMULLW      Y10, Y0, Y5
   527  	VPMULLW      Y13, Y2, Y6
   528  	VPMULLW      Y14, Y2, Y0 // Y0 recycled as a temporary
   529  	VPMULHW      Y9, Y1, Y9
   530  	VPMULHW      Y10, Y1, Y10
   531  	VPMULHW      Y13, Y3, Y13
   532  	VPMULHW      Y14, Y3, Y14
   533  	VPMULHW      Y4, Y15, Y4
   534  	VPMULHW      Y5, Y15, Y5
   535  	VPMULHW      Y6, Y15, Y6
   536  	VPMULHW      Y0, Y15, Y0
   537  	VPSUBW       Y4, Y9, Y4
   538  	VPSUBW       Y5, Y10, Y5
   539  	VPSUBW       Y6, Y13, Y6
   540  	VPSUBW       Y0, Y14, Y0
   541  	VPSUBW       Y4, Y7, Y9 // butterflies pair (Y7,Y9) (Y8,Y10) (Y11,Y13) (Y12,Y14)
   542  	VPSUBW       Y5, Y8, Y10
   543  	VPSUBW       Y6, Y11, Y13
   544  	VPSUBW       Y0, Y12, Y14
   545  	VPADDW       Y4, Y7, Y7
   546  	VPADDW       Y5, Y8, Y8
   547  	VPADDW       Y6, Y11, Y11
   548  	VPADDW       Y0, Y12, Y12
   549  	VMOVDQU      160(CX), Y0 // round 4: twiddle vectors from table bytes 160..287
   550  	VMOVDQU      192(CX), Y1
   551  	VMOVDQU      224(CX), Y2
   552  	VMOVDQU      256(CX), Y3
   553  	VPERM2I128   $0x20, Y9, Y7, Y4 // regroup register pairs by 128-bit halves
   554  	VPERM2I128   $0x31, Y9, Y7, Y9
   555  	VMOVDQA      Y4, Y7
   556  	VPERM2I128   $0x20, Y10, Y8, Y4
   557  	VPERM2I128   $0x31, Y10, Y8, Y10
   558  	VMOVDQA      Y4, Y8
   559  	VPERM2I128   $0x20, Y13, Y11, Y4
   560  	VPERM2I128   $0x31, Y13, Y11, Y13
   561  	VMOVDQA      Y4, Y11
   562  	VPERM2I128   $0x20, Y14, Y12, Y4
   563  	VPERM2I128   $0x31, Y14, Y12, Y14
   564  	VMOVDQA      Y4, Y12
   565  	VPMULLW      Y8, Y0, Y4
   566  	VPMULLW      Y10, Y0, Y5
   567  	VPMULLW      Y12, Y2, Y6
   568  	VPMULLW      Y14, Y2, Y0
   569  	VPMULHW      Y8, Y1, Y8
   570  	VPMULHW      Y10, Y1, Y10
   571  	VPMULHW      Y12, Y3, Y12
   572  	VPMULHW      Y14, Y3, Y14
   573  	VPMULHW      Y4, Y15, Y4
   574  	VPMULHW      Y5, Y15, Y5
   575  	VPMULHW      Y6, Y15, Y6
   576  	VPMULHW      Y0, Y15, Y0
   577  	VPSUBW       Y4, Y8, Y4
   578  	VPSUBW       Y5, Y10, Y5
   579  	VPSUBW       Y6, Y12, Y6
   580  	VPSUBW       Y0, Y14, Y0
   581  	VPSUBW       Y4, Y7, Y8 // butterflies pair (Y7,Y8) (Y9,Y10) (Y11,Y12) (Y13,Y14)
   582  	VPSUBW       Y5, Y9, Y10
   583  	VPSUBW       Y6, Y11, Y12
   584  	VPSUBW       Y0, Y13, Y14
   585  	VPADDW       Y4, Y7, Y7
   586  	VPADDW       Y5, Y9, Y9
   587  	VPADDW       Y6, Y11, Y11
   588  	VPADDW       Y0, Y13, Y13
   589  	VMOVDQU      416(CX), Y0 // round 5: twiddle vectors from table bytes 416..543
   590  	VMOVDQU      448(CX), Y1
   591  	VMOVDQU      480(CX), Y2
   592  	VMOVDQU      512(CX), Y3
   593  	VPUNPCKLQDQ  Y8, Y7, Y4 // regroup register pairs at 64-bit granularity
   594  	VPUNPCKHQDQ  Y8, Y7, Y8
   595  	VMOVDQA      Y4, Y7
   596  	VPUNPCKLQDQ  Y10, Y9, Y4
   597  	VPUNPCKHQDQ  Y10, Y9, Y10
   598  	VMOVDQA      Y4, Y9
   599  	VPUNPCKLQDQ  Y12, Y11, Y4
   600  	VPUNPCKHQDQ  Y12, Y11, Y12
   601  	VMOVDQA      Y4, Y11
   602  	VPUNPCKLQDQ  Y14, Y13, Y4
   603  	VPUNPCKHQDQ  Y14, Y13, Y14
   604  	VMOVDQA      Y4, Y13
   605  	VPMULLW      Y9, Y0, Y4
   606  	VPMULLW      Y10, Y0, Y5
   607  	VPMULLW      Y13, Y2, Y6
   608  	VPMULLW      Y14, Y2, Y0
   609  	VPMULHW      Y9, Y1, Y9
   610  	VPMULHW      Y10, Y1, Y10
   611  	VPMULHW      Y13, Y3, Y13
   612  	VPMULHW      Y14, Y3, Y14
   613  	VPMULHW      Y4, Y15, Y4
   614  	VPMULHW      Y5, Y15, Y5
   615  	VPMULHW      Y6, Y15, Y6
   616  	VPMULHW      Y0, Y15, Y0
   617  	VPSUBW       Y4, Y9, Y4
   618  	VPSUBW       Y5, Y10, Y5
   619  	VPSUBW       Y6, Y13, Y6
   620  	VPSUBW       Y0, Y14, Y0
   621  	VPSUBW       Y4, Y7, Y9 // butterflies pair (Y7,Y9) (Y8,Y10) (Y11,Y13) (Y12,Y14)
   622  	VPSUBW       Y5, Y8, Y10
   623  	VPSUBW       Y6, Y11, Y13
   624  	VPSUBW       Y0, Y12, Y14
   625  	VPADDW       Y4, Y7, Y7
   626  	VPADDW       Y5, Y8, Y8
   627  	VPADDW       Y6, Y11, Y11
   628  	VPADDW       Y0, Y12, Y12
   629  	VMOVDQU      672(CX), Y0 // round 6: twiddle vectors from table bytes 672..799
   630  	VMOVDQU      704(CX), Y1
   631  	VMOVDQU      736(CX), Y2
   632  	VMOVDQU      768(CX), Y3
   633  	VMOVSLDUP    Y9, Y4 // regroup register pairs at 32-bit granularity
   634  	VPBLENDD     $0xaa, Y4, Y7, Y4
   635  	VPSRLQ       $0x20, Y7, Y7
   636  	VPBLENDD     $0xaa, Y9, Y7, Y9
   637  	VMOVDQA      Y4, Y7
   638  	VMOVSLDUP    Y10, Y4
   639  	VPBLENDD     $0xaa, Y4, Y8, Y4
   640  	VPSRLQ       $0x20, Y8, Y8
   641  	VPBLENDD     $0xaa, Y10, Y8, Y10
   642  	VMOVDQA      Y4, Y8
   643  	VMOVSLDUP    Y13, Y4
   644  	VPBLENDD     $0xaa, Y4, Y11, Y4
   645  	VPSRLQ       $0x20, Y11, Y11
   646  	VPBLENDD     $0xaa, Y13, Y11, Y13
   647  	VMOVDQA      Y4, Y11
   648  	VMOVSLDUP    Y14, Y4
   649  	VPBLENDD     $0xaa, Y4, Y12, Y4
   650  	VPSRLQ       $0x20, Y12, Y12
   651  	VPBLENDD     $0xaa, Y14, Y12, Y14
   652  	VMOVDQA      Y4, Y12
   653  	VPMULLW      Y8, Y0, Y4
   654  	VPMULLW      Y10, Y0, Y5
   655  	VPMULLW      Y12, Y2, Y6
   656  	VPMULLW      Y14, Y2, Y0
   657  	VPMULHW      Y8, Y1, Y8
   658  	VPMULHW      Y10, Y1, Y10
   659  	VPMULHW      Y12, Y3, Y12
   660  	VPMULHW      Y14, Y3, Y14
   661  	VPMULHW      Y4, Y15, Y4
   662  	VPMULHW      Y5, Y15, Y5
   663  	VPMULHW      Y6, Y15, Y6
   664  	VPMULHW      Y0, Y15, Y0
   665  	VPSUBW       Y4, Y8, Y4
   666  	VPSUBW       Y5, Y10, Y5
   667  	VPSUBW       Y6, Y12, Y6
   668  	VPSUBW       Y0, Y14, Y0
   669  	VPSUBW       Y4, Y7, Y8 // butterflies pair (Y7,Y8) (Y9,Y10) (Y11,Y12) (Y13,Y14)
   670  	VPSUBW       Y5, Y9, Y10
   671  	VPSUBW       Y6, Y11, Y12
   672  	VPSUBW       Y0, Y13, Y14
   673  	VPADDW       Y4, Y7, Y7
   674  	VPADDW       Y5, Y9, Y9
   675  	VPADDW       Y6, Y11, Y11
   676  	VPADDW       Y0, Y13, Y13
   677  	VMOVDQU      928(CX), Y0 // round 7: twiddle vectors from table bytes 928..1055
   678  	VMOVDQU      960(CX), Y1
   679  	VMOVDQU      992(CX), Y2
   680  	VMOVDQU      1024(CX), Y3
   681  	VPSLLD       $0x10, Y8, Y4 // regroup register pairs at 16-bit granularity
   682  	VPBLENDW     $0xaa, Y4, Y7, Y4
   683  	VPSRLD       $0x10, Y7, Y7
   684  	VPBLENDW     $0xaa, Y8, Y7, Y8
   685  	VMOVDQA      Y4, Y7
   686  	VPSLLD       $0x10, Y10, Y4
   687  	VPBLENDW     $0xaa, Y4, Y9, Y4
   688  	VPSRLD       $0x10, Y9, Y9
   689  	VPBLENDW     $0xaa, Y10, Y9, Y10
   690  	VMOVDQA      Y4, Y9
   691  	VPSLLD       $0x10, Y12, Y4
   692  	VPBLENDW     $0xaa, Y4, Y11, Y4
   693  	VPSRLD       $0x10, Y11, Y11
   694  	VPBLENDW     $0xaa, Y12, Y11, Y12
   695  	VMOVDQA      Y4, Y11
   696  	VPSLLD       $0x10, Y14, Y4
   697  	VPBLENDW     $0xaa, Y4, Y13, Y4
   698  	VPSRLD       $0x10, Y13, Y13
   699  	VPBLENDW     $0xaa, Y14, Y13, Y14
   700  	VMOVDQA      Y4, Y13
   701  	VPMULLW      Y9, Y0, Y4
   702  	VPMULLW      Y10, Y0, Y5
   703  	VPMULLW      Y13, Y2, Y6
   704  	VPMULLW      Y14, Y2, Y0
   705  	VPMULHW      Y9, Y1, Y9
   706  	VPMULHW      Y10, Y1, Y10
   707  	VPMULHW      Y13, Y3, Y13
   708  	VPMULHW      Y14, Y3, Y14
   709  	VPMULHW      Y4, Y15, Y4
   710  	VPMULHW      Y5, Y15, Y5
   711  	VPMULHW      Y6, Y15, Y6
   712  	VPMULHW      Y0, Y15, Y0
   713  	VPSUBW       Y4, Y9, Y4
   714  	VPSUBW       Y5, Y10, Y5
   715  	VPSUBW       Y6, Y13, Y6
   716  	VPSUBW       Y0, Y14, Y0
   717  	VPSUBW       Y4, Y7, Y9 // butterflies pair (Y7,Y9) (Y8,Y10) (Y11,Y13) (Y12,Y14)
   718  	VPSUBW       Y5, Y8, Y10
   719  	VPSUBW       Y6, Y11, Y13
   720  	VPSUBW       Y0, Y12, Y14
   721  	VPADDW       Y4, Y7, Y7
   722  	VPADDW       Y5, Y8, Y8
   723  	VPADDW       Y6, Y11, Y11
   724  	VPADDW       Y0, Y12, Y12
   725  	VMOVDQU      Y7, 256(AX) // store Y7..Y14 to p bytes 256..511, again without undoing the regrouping shuffles
   726  	VMOVDQU      Y8, 288(AX)
   727  	VMOVDQU      Y9, 320(AX)
   728  	VMOVDQU      Y10, 352(AX)
   729  	VMOVDQU      Y11, 384(AX)
   730  	VMOVDQU      Y12, 416(AX)
   731  	VMOVDQU      Y13, 448(AX)
   732  	VMOVDQU      Y14, 480(AX)
   733  	RET
   734  
   735  // func invNttAVX2(p *[256]int16)
   736  // Requires: AVX, AVX2
   737  TEXT ·invNttAVX2(SB), NOSPLIT, $0-8
   738  	MOVQ         p+0(FP), AX
   739  	LEAQ         ·ZetasAVX2+0(SB), CX
   740  	MOVL         $0x00000d01, DX
   741  	VMOVD        DX, X0
   742  	VPBROADCASTW X0, Y15
   743  	VMOVDQU      (AX), Y7
   744  	VMOVDQU      32(AX), Y8
   745  	VMOVDQU      64(AX), Y9
   746  	VMOVDQU      96(AX), Y10
   747  	VMOVDQU      128(AX), Y11
   748  	VMOVDQU      160(AX), Y12
   749  	VMOVDQU      192(AX), Y13
   750  	VMOVDQU      224(AX), Y14
   751  	VMOVDQU      1056(CX), Y0
   752  	VMOVDQU      1088(CX), Y1
   753  	VMOVDQU      1120(CX), Y2
   754  	VMOVDQU      1152(CX), Y3
   755  	VPSUBW       Y7, Y9, Y4
   756  	VPSUBW       Y8, Y10, Y5
   757  	VPSUBW       Y11, Y13, Y6
   758  	VPADDW       Y7, Y9, Y7
   759  	VPADDW       Y8, Y10, Y8
   760  	VPADDW       Y11, Y13, Y11
   761  	VPMULLW      Y4, Y0, Y9
   762  	VPMULLW      Y5, Y0, Y10
   763  	VPSUBW       Y12, Y14, Y0
   764  	VPMULLW      Y6, Y2, Y13
   765  	VPADDW       Y12, Y14, Y12
   766  	VPMULLW      Y0, Y2, Y14
   767  	VPMULHW      Y4, Y1, Y4
   768  	VPMULHW      Y5, Y1, Y5
   769  	VPMULHW      Y6, Y3, Y6
   770  	VPMULHW      Y0, Y3, Y0
   771  	VPMULHW      Y9, Y15, Y9
   772  	VPMULHW      Y10, Y15, Y10
   773  	VPMULHW      Y13, Y15, Y13
   774  	VPMULHW      Y14, Y15, Y14
   775  	VPSUBW       Y9, Y4, Y9
   776  	VPSUBW       Y10, Y5, Y10
   777  	VPSUBW       Y13, Y6, Y13
   778  	VPSUBW       Y14, Y0, Y14
   779  	VMOVDQU      1312(CX), Y0
   780  	VMOVDQU      1344(CX), Y1
   781  	VMOVDQU      1376(CX), Y2
   782  	VMOVDQU      1408(CX), Y3
   783  	VPSLLD       $0x10, Y8, Y4
   784  	VPBLENDW     $0xaa, Y4, Y7, Y4
   785  	VPSRLD       $0x10, Y7, Y7
   786  	VPBLENDW     $0xaa, Y8, Y7, Y8
   787  	VMOVDQA      Y4, Y7
   788  	VPSLLD       $0x10, Y10, Y4
   789  	VPBLENDW     $0xaa, Y4, Y9, Y4
   790  	VPSRLD       $0x10, Y9, Y9
   791  	VPBLENDW     $0xaa, Y10, Y9, Y10
   792  	VMOVDQA      Y4, Y9
   793  	VPSLLD       $0x10, Y12, Y4
   794  	VPBLENDW     $0xaa, Y4, Y11, Y4
   795  	VPSRLD       $0x10, Y11, Y11
   796  	VPBLENDW     $0xaa, Y12, Y11, Y12
   797  	VMOVDQA      Y4, Y11
   798  	VPSLLD       $0x10, Y14, Y4
   799  	VPBLENDW     $0xaa, Y4, Y13, Y4
   800  	VPSRLD       $0x10, Y13, Y13
   801  	VPBLENDW     $0xaa, Y14, Y13, Y14
   802  	VMOVDQA      Y4, Y13
   803  	VPSUBW       Y7, Y8, Y4
   804  	VPSUBW       Y9, Y10, Y5
   805  	VPSUBW       Y11, Y12, Y6
   806  	VPADDW       Y7, Y8, Y7
   807  	VPADDW       Y9, Y10, Y9
   808  	VPADDW       Y11, Y12, Y11
   809  	VPMULLW      Y4, Y0, Y8
   810  	VPMULLW      Y5, Y0, Y10
   811  	VPSUBW       Y13, Y14, Y0
   812  	VPMULLW      Y6, Y2, Y12
   813  	VPADDW       Y13, Y14, Y13
   814  	VPMULLW      Y0, Y2, Y14
   815  	VPMULHW      Y4, Y1, Y4
   816  	VPMULHW      Y5, Y1, Y5
   817  	VPMULHW      Y6, Y3, Y6
   818  	VPMULHW      Y0, Y3, Y0
   819  	VPMULHW      Y8, Y15, Y8
   820  	VPMULHW      Y10, Y15, Y10
   821  	VPMULHW      Y12, Y15, Y12
   822  	VPMULHW      Y14, Y15, Y14
   823  	VPSUBW       Y8, Y4, Y8
   824  	VPSUBW       Y10, Y5, Y10
   825  	VPSUBW       Y12, Y6, Y12
   826  	VPSUBW       Y14, Y0, Y14
   827  	VMOVDQU      1568(CX), Y0
   828  	VMOVDQU      1600(CX), Y1
   829  	VMOVDQU      1632(CX), Y2
   830  	VMOVDQU      1664(CX), Y3
   831  	VMOVSLDUP    Y9, Y4
   832  	VPBLENDD     $0xaa, Y4, Y7, Y4
   833  	VPSRLQ       $0x20, Y7, Y7
   834  	VPBLENDD     $0xaa, Y9, Y7, Y9
   835  	VMOVDQA      Y4, Y7
   836  	VMOVSLDUP    Y10, Y4
   837  	VPBLENDD     $0xaa, Y4, Y8, Y4
   838  	VPSRLQ       $0x20, Y8, Y8
   839  	VPBLENDD     $0xaa, Y10, Y8, Y10
   840  	VMOVDQA      Y4, Y8
   841  	VMOVSLDUP    Y13, Y4
   842  	VPBLENDD     $0xaa, Y4, Y11, Y4
   843  	VPSRLQ       $0x20, Y11, Y11
   844  	VPBLENDD     $0xaa, Y13, Y11, Y13
   845  	VMOVDQA      Y4, Y11
   846  	VMOVSLDUP    Y14, Y4
   847  	VPBLENDD     $0xaa, Y4, Y12, Y4
   848  	VPSRLQ       $0x20, Y12, Y12
   849  	VPBLENDD     $0xaa, Y14, Y12, Y14
   850  	VMOVDQA      Y4, Y12
   851  	VPSUBW       Y7, Y9, Y4
   852  	VPSUBW       Y8, Y10, Y5
   853  	VPSUBW       Y11, Y13, Y6
   854  	VPADDW       Y7, Y9, Y7
   855  	VPADDW       Y8, Y10, Y8
   856  	VPADDW       Y11, Y13, Y11
   857  	VPMULLW      Y4, Y0, Y9
   858  	VPMULLW      Y5, Y0, Y10
   859  	VPSUBW       Y12, Y14, Y0
   860  	VPMULLW      Y6, Y2, Y13
   861  	VPADDW       Y12, Y14, Y12
   862  	VPMULLW      Y0, Y2, Y14
   863  	VPMULHW      Y4, Y1, Y4
   864  	VPMULHW      Y5, Y1, Y5
   865  	VPMULHW      Y6, Y3, Y6
   866  	VPMULHW      Y0, Y3, Y0
   867  	VPMULHW      Y9, Y15, Y9
   868  	VPMULHW      Y10, Y15, Y10
   869  	VPMULHW      Y13, Y15, Y13
   870  	VPMULHW      Y14, Y15, Y14
   871  	VPSUBW       Y9, Y4, Y9
   872  	VPSUBW       Y10, Y5, Y10
   873  	VPSUBW       Y13, Y6, Y13
   874  	VPSUBW       Y14, Y0, Y14
   875  	MOVL         $0x00004ebf, DX
   876  	VMOVD        DX, X0
   877  	VPBROADCASTW X0, Y4
   878  	VPMULHW      Y4, Y7, Y5
   879  	VPSRAW       $0x0a, Y5, Y5
   880  	VPMULLW      Y15, Y5, Y5
   881  	VPSUBW       Y5, Y7, Y7
   882  	VPMULHW      Y4, Y11, Y5
   883  	VPSRAW       $0x0a, Y5, Y5
   884  	VPMULLW      Y15, Y5, Y5
   885  	VPSUBW       Y5, Y11, Y11
   886  	VMOVDQU      1824(CX), Y0
   887  	VMOVDQU      1856(CX), Y1
   888  	VMOVDQU      1888(CX), Y2
   889  	VMOVDQU      1920(CX), Y3
   890  	VPUNPCKLQDQ  Y8, Y7, Y4
   891  	VPUNPCKHQDQ  Y8, Y7, Y8
   892  	VMOVDQA      Y4, Y7
   893  	VPUNPCKLQDQ  Y10, Y9, Y4
   894  	VPUNPCKHQDQ  Y10, Y9, Y10
   895  	VMOVDQA      Y4, Y9
   896  	VPUNPCKLQDQ  Y12, Y11, Y4
   897  	VPUNPCKHQDQ  Y12, Y11, Y12
   898  	VMOVDQA      Y4, Y11
   899  	VPUNPCKLQDQ  Y14, Y13, Y4
   900  	VPUNPCKHQDQ  Y14, Y13, Y14
   901  	VMOVDQA      Y4, Y13
   902  	VPSUBW       Y7, Y8, Y4
   903  	VPSUBW       Y9, Y10, Y5
   904  	VPSUBW       Y11, Y12, Y6
   905  	VPADDW       Y7, Y8, Y7
   906  	VPADDW       Y9, Y10, Y9
   907  	VPADDW       Y11, Y12, Y11
   908  	VPMULLW      Y4, Y0, Y8
   909  	VPMULLW      Y5, Y0, Y10
   910  	VPSUBW       Y13, Y14, Y0
   911  	VPMULLW      Y6, Y2, Y12
   912  	VPADDW       Y13, Y14, Y13
   913  	VPMULLW      Y0, Y2, Y14
   914  	VPMULHW      Y4, Y1, Y4
   915  	VPMULHW      Y5, Y1, Y5
   916  	VPMULHW      Y6, Y3, Y6
   917  	VPMULHW      Y0, Y3, Y0
   918  	VPMULHW      Y8, Y15, Y8
   919  	VPMULHW      Y10, Y15, Y10
   920  	VPMULHW      Y12, Y15, Y12
   921  	VPMULHW      Y14, Y15, Y14
   922  	VPSUBW       Y8, Y4, Y8
   923  	VPSUBW       Y10, Y5, Y10
   924  	VPSUBW       Y12, Y6, Y12
   925  	VPSUBW       Y14, Y0, Y14
   926  	VPBROADCASTW 2080(CX), Y0
   927  	VPBROADCASTW 2082(CX), Y1
   928  	VPBROADCASTW 2084(CX), Y2
   929  	VPBROADCASTW 2086(CX), Y3
   930  	VPERM2I128   $0x20, Y9, Y7, Y4
   931  	VPERM2I128   $0x31, Y9, Y7, Y9
   932  	VMOVDQA      Y4, Y7
   933  	VPERM2I128   $0x20, Y10, Y8, Y4
   934  	VPERM2I128   $0x31, Y10, Y8, Y10
   935  	VMOVDQA      Y4, Y8
   936  	VPERM2I128   $0x20, Y13, Y11, Y4
   937  	VPERM2I128   $0x31, Y13, Y11, Y13
   938  	VMOVDQA      Y4, Y11
   939  	VPERM2I128   $0x20, Y14, Y12, Y4
   940  	VPERM2I128   $0x31, Y14, Y12, Y14
   941  	VMOVDQA      Y4, Y12
   942  	VPSUBW       Y7, Y9, Y4
   943  	VPSUBW       Y8, Y10, Y5
   944  	VPSUBW       Y11, Y13, Y6
   945  	VPADDW       Y7, Y9, Y7
   946  	VPADDW       Y8, Y10, Y8
   947  	VPADDW       Y11, Y13, Y11
   948  	VPMULLW      Y4, Y0, Y9
   949  	VPMULLW      Y5, Y0, Y10
   950  	VPSUBW       Y12, Y14, Y0
   951  	VPMULLW      Y6, Y2, Y13
   952  	VPADDW       Y12, Y14, Y12
   953  	VPMULLW      Y0, Y2, Y14
   954  	VPMULHW      Y4, Y1, Y4
   955  	VPMULHW      Y5, Y1, Y5
   956  	VPMULHW      Y6, Y3, Y6
   957  	VPMULHW      Y0, Y3, Y0
   958  	VPMULHW      Y9, Y15, Y9
   959  	VPMULHW      Y10, Y15, Y10
   960  	VPMULHW      Y13, Y15, Y13
   961  	VPMULHW      Y14, Y15, Y14
   962  	VPSUBW       Y9, Y4, Y9
   963  	VPSUBW       Y10, Y5, Y10
   964  	VPSUBW       Y13, Y6, Y13
   965  	VPSUBW       Y14, Y0, Y14
   966  	MOVL         $0x00004ebf, DX
   967  	VMOVD        DX, X0
   968  	VPBROADCASTW X0, Y4
   969  	VPMULHW      Y4, Y7, Y5
   970  	VPSRAW       $0x0a, Y5, Y5
   971  	VPMULLW      Y15, Y5, Y5
   972  	VPSUBW       Y5, Y7, Y7
   973  	VPMULHW      Y4, Y11, Y5
   974  	VPSRAW       $0x0a, Y5, Y5
   975  	VPMULLW      Y15, Y5, Y5
   976  	VPSUBW       Y5, Y11, Y11
   977  	VPBROADCASTW 2096(CX), Y0
   978  	VPBROADCASTW 2098(CX), Y1
   979  	VPSUBW       Y7, Y11, Y4
   980  	VPSUBW       Y8, Y12, Y5
   981  	VPSUBW       Y9, Y13, Y6
   982  	VPADDW       Y7, Y11, Y7
   983  	VPADDW       Y8, Y12, Y8
   984  	VPADDW       Y9, Y13, Y9
   985  	VPMULLW      Y4, Y0, Y11
   986  	VPMULLW      Y5, Y0, Y12
   987  	VPSUBW       Y10, Y14, Y2
   988  	VPMULLW      Y6, Y0, Y13
   989  	VPADDW       Y10, Y14, Y10
   990  	VPMULLW      Y2, Y0, Y14
   991  	VPMULHW      Y4, Y1, Y4
   992  	VPMULHW      Y5, Y1, Y5
   993  	VPMULHW      Y6, Y1, Y6
   994  	VPMULHW      Y2, Y1, Y2
   995  	VPMULHW      Y11, Y15, Y11
   996  	VPMULHW      Y12, Y15, Y12
   997  	VPMULHW      Y13, Y15, Y13
   998  	VPMULHW      Y14, Y15, Y14
   999  	VPSUBW       Y11, Y4, Y11
  1000  	VPSUBW       Y12, Y5, Y12
  1001  	VPSUBW       Y13, Y6, Y13
  1002  	VPSUBW       Y14, Y2, Y14
  1003  	VMOVDQU      Y7, (AX)
  1004  	VMOVDQU      Y8, 32(AX)
  1005  	VMOVDQU      Y9, 64(AX)
  1006  	VMOVDQU      Y10, 96(AX)
  1007  	VMOVDQU      Y11, 128(AX)
  1008  	VMOVDQU      Y12, 160(AX)
  1009  	VMOVDQU      Y13, 192(AX)
  1010  	VMOVDQU      Y14, 224(AX)
  1011  	VMOVDQU      256(AX), Y7
  1012  	VMOVDQU      288(AX), Y8
  1013  	VMOVDQU      320(AX), Y9
  1014  	VMOVDQU      352(AX), Y10
  1015  	VMOVDQU      384(AX), Y11
  1016  	VMOVDQU      416(AX), Y12
  1017  	VMOVDQU      448(AX), Y13
  1018  	VMOVDQU      480(AX), Y14
  1019  	VMOVDQU      1184(CX), Y0
  1020  	VMOVDQU      1216(CX), Y1
  1021  	VMOVDQU      1248(CX), Y2
  1022  	VMOVDQU      1280(CX), Y3
  1023  	VPSUBW       Y7, Y9, Y4
  1024  	VPSUBW       Y8, Y10, Y5
  1025  	VPSUBW       Y11, Y13, Y6
  1026  	VPADDW       Y7, Y9, Y7
  1027  	VPADDW       Y8, Y10, Y8
  1028  	VPADDW       Y11, Y13, Y11
  1029  	VPMULLW      Y4, Y0, Y9
  1030  	VPMULLW      Y5, Y0, Y10
  1031  	VPSUBW       Y12, Y14, Y0
  1032  	VPMULLW      Y6, Y2, Y13
  1033  	VPADDW       Y12, Y14, Y12
  1034  	VPMULLW      Y0, Y2, Y14
  1035  	VPMULHW      Y4, Y1, Y4
  1036  	VPMULHW      Y5, Y1, Y5
  1037  	VPMULHW      Y6, Y3, Y6
  1038  	VPMULHW      Y0, Y3, Y0
  1039  	VPMULHW      Y9, Y15, Y9
  1040  	VPMULHW      Y10, Y15, Y10
  1041  	VPMULHW      Y13, Y15, Y13
  1042  	VPMULHW      Y14, Y15, Y14
  1043  	VPSUBW       Y9, Y4, Y9
  1044  	VPSUBW       Y10, Y5, Y10
  1045  	VPSUBW       Y13, Y6, Y13
  1046  	VPSUBW       Y14, Y0, Y14
  1047  	VMOVDQU      1440(CX), Y0
  1048  	VMOVDQU      1472(CX), Y1
  1049  	VMOVDQU      1504(CX), Y2
  1050  	VMOVDQU      1536(CX), Y3
  1051  	VPSLLD       $0x10, Y8, Y4
  1052  	VPBLENDW     $0xaa, Y4, Y7, Y4
  1053  	VPSRLD       $0x10, Y7, Y7
  1054  	VPBLENDW     $0xaa, Y8, Y7, Y8
  1055  	VMOVDQA      Y4, Y7
  1056  	VPSLLD       $0x10, Y10, Y4
  1057  	VPBLENDW     $0xaa, Y4, Y9, Y4
  1058  	VPSRLD       $0x10, Y9, Y9
  1059  	VPBLENDW     $0xaa, Y10, Y9, Y10
  1060  	VMOVDQA      Y4, Y9
  1061  	VPSLLD       $0x10, Y12, Y4
  1062  	VPBLENDW     $0xaa, Y4, Y11, Y4
  1063  	VPSRLD       $0x10, Y11, Y11
  1064  	VPBLENDW     $0xaa, Y12, Y11, Y12
  1065  	VMOVDQA      Y4, Y11
  1066  	VPSLLD       $0x10, Y14, Y4
  1067  	VPBLENDW     $0xaa, Y4, Y13, Y4
  1068  	VPSRLD       $0x10, Y13, Y13
  1069  	VPBLENDW     $0xaa, Y14, Y13, Y14
  1070  	VMOVDQA      Y4, Y13
  1071  	VPSUBW       Y7, Y8, Y4
  1072  	VPSUBW       Y9, Y10, Y5
  1073  	VPSUBW       Y11, Y12, Y6
  1074  	VPADDW       Y7, Y8, Y7
  1075  	VPADDW       Y9, Y10, Y9
  1076  	VPADDW       Y11, Y12, Y11
  1077  	VPMULLW      Y4, Y0, Y8
  1078  	VPMULLW      Y5, Y0, Y10
  1079  	VPSUBW       Y13, Y14, Y0
  1080  	VPMULLW      Y6, Y2, Y12
  1081  	VPADDW       Y13, Y14, Y13
  1082  	VPMULLW      Y0, Y2, Y14
  1083  	VPMULHW      Y4, Y1, Y4
  1084  	VPMULHW      Y5, Y1, Y5
  1085  	VPMULHW      Y6, Y3, Y6
  1086  	VPMULHW      Y0, Y3, Y0
  1087  	VPMULHW      Y8, Y15, Y8
  1088  	VPMULHW      Y10, Y15, Y10
  1089  	VPMULHW      Y12, Y15, Y12
  1090  	VPMULHW      Y14, Y15, Y14
  1091  	VPSUBW       Y8, Y4, Y8
  1092  	VPSUBW       Y10, Y5, Y10
  1093  	VPSUBW       Y12, Y6, Y12
  1094  	VPSUBW       Y14, Y0, Y14
  1095  	VMOVDQU      1696(CX), Y0
  1096  	VMOVDQU      1728(CX), Y1
  1097  	VMOVDQU      1760(CX), Y2
  1098  	VMOVDQU      1792(CX), Y3
  1099  	VMOVSLDUP    Y9, Y4
  1100  	VPBLENDD     $0xaa, Y4, Y7, Y4
  1101  	VPSRLQ       $0x20, Y7, Y7
  1102  	VPBLENDD     $0xaa, Y9, Y7, Y9
  1103  	VMOVDQA      Y4, Y7
  1104  	VMOVSLDUP    Y10, Y4
  1105  	VPBLENDD     $0xaa, Y4, Y8, Y4
  1106  	VPSRLQ       $0x20, Y8, Y8
  1107  	VPBLENDD     $0xaa, Y10, Y8, Y10
  1108  	VMOVDQA      Y4, Y8
  1109  	VMOVSLDUP    Y13, Y4
  1110  	VPBLENDD     $0xaa, Y4, Y11, Y4
  1111  	VPSRLQ       $0x20, Y11, Y11
  1112  	VPBLENDD     $0xaa, Y13, Y11, Y13
  1113  	VMOVDQA      Y4, Y11
  1114  	VMOVSLDUP    Y14, Y4
  1115  	VPBLENDD     $0xaa, Y4, Y12, Y4
  1116  	VPSRLQ       $0x20, Y12, Y12
  1117  	VPBLENDD     $0xaa, Y14, Y12, Y14
  1118  	VMOVDQA      Y4, Y12
  1119  	VPSUBW       Y7, Y9, Y4
  1120  	VPSUBW       Y8, Y10, Y5
  1121  	VPSUBW       Y11, Y13, Y6
  1122  	VPADDW       Y7, Y9, Y7
  1123  	VPADDW       Y8, Y10, Y8
  1124  	VPADDW       Y11, Y13, Y11
  1125  	VPMULLW      Y4, Y0, Y9
  1126  	VPMULLW      Y5, Y0, Y10
  1127  	VPSUBW       Y12, Y14, Y0
  1128  	VPMULLW      Y6, Y2, Y13
  1129  	VPADDW       Y12, Y14, Y12
  1130  	VPMULLW      Y0, Y2, Y14
  1131  	VPMULHW      Y4, Y1, Y4
  1132  	VPMULHW      Y5, Y1, Y5
  1133  	VPMULHW      Y6, Y3, Y6
  1134  	VPMULHW      Y0, Y3, Y0
  1135  	VPMULHW      Y9, Y15, Y9
  1136  	VPMULHW      Y10, Y15, Y10
  1137  	VPMULHW      Y13, Y15, Y13
  1138  	VPMULHW      Y14, Y15, Y14
  1139  	VPSUBW       Y9, Y4, Y9
  1140  	VPSUBW       Y10, Y5, Y10
  1141  	VPSUBW       Y13, Y6, Y13
  1142  	VPSUBW       Y14, Y0, Y14
  1143  	MOVL         $0x00004ebf, DX
  1144  	VMOVD        DX, X0
  1145  	VPBROADCASTW X0, Y4
  1146  	VPMULHW      Y4, Y7, Y5
  1147  	VPSRAW       $0x0a, Y5, Y5
  1148  	VPMULLW      Y15, Y5, Y5
  1149  	VPSUBW       Y5, Y7, Y7
  1150  	VPMULHW      Y4, Y11, Y5
  1151  	VPSRAW       $0x0a, Y5, Y5
  1152  	VPMULLW      Y15, Y5, Y5
  1153  	VPSUBW       Y5, Y11, Y11
  1154  	VMOVDQU      1952(CX), Y0
  1155  	VMOVDQU      1984(CX), Y1
  1156  	VMOVDQU      2016(CX), Y2
  1157  	VMOVDQU      2048(CX), Y3
  1158  	VPUNPCKLQDQ  Y8, Y7, Y4
  1159  	VPUNPCKHQDQ  Y8, Y7, Y8
  1160  	VMOVDQA      Y4, Y7
  1161  	VPUNPCKLQDQ  Y10, Y9, Y4
  1162  	VPUNPCKHQDQ  Y10, Y9, Y10
  1163  	VMOVDQA      Y4, Y9
  1164  	VPUNPCKLQDQ  Y12, Y11, Y4
  1165  	VPUNPCKHQDQ  Y12, Y11, Y12
  1166  	VMOVDQA      Y4, Y11
  1167  	VPUNPCKLQDQ  Y14, Y13, Y4
  1168  	VPUNPCKHQDQ  Y14, Y13, Y14
  1169  	VMOVDQA      Y4, Y13
  1170  	VPSUBW       Y7, Y8, Y4
  1171  	VPSUBW       Y9, Y10, Y5
  1172  	VPSUBW       Y11, Y12, Y6
  1173  	VPADDW       Y7, Y8, Y7
  1174  	VPADDW       Y9, Y10, Y9
  1175  	VPADDW       Y11, Y12, Y11
  1176  	VPMULLW      Y4, Y0, Y8
  1177  	VPMULLW      Y5, Y0, Y10
  1178  	VPSUBW       Y13, Y14, Y0
  1179  	VPMULLW      Y6, Y2, Y12
  1180  	VPADDW       Y13, Y14, Y13
  1181  	VPMULLW      Y0, Y2, Y14
  1182  	VPMULHW      Y4, Y1, Y4
  1183  	VPMULHW      Y5, Y1, Y5
  1184  	VPMULHW      Y6, Y3, Y6
  1185  	VPMULHW      Y0, Y3, Y0
  1186  	VPMULHW      Y8, Y15, Y8
  1187  	VPMULHW      Y10, Y15, Y10
  1188  	VPMULHW      Y12, Y15, Y12
  1189  	VPMULHW      Y14, Y15, Y14
  1190  	VPSUBW       Y8, Y4, Y8
  1191  	VPSUBW       Y10, Y5, Y10
  1192  	VPSUBW       Y12, Y6, Y12
  1193  	VPSUBW       Y14, Y0, Y14
  1194  	VPBROADCASTW 2088(CX), Y0
  1195  	VPBROADCASTW 2090(CX), Y1
  1196  	VPBROADCASTW 2092(CX), Y2
  1197  	VPBROADCASTW 2094(CX), Y3
  1198  	VPERM2I128   $0x20, Y9, Y7, Y4
  1199  	VPERM2I128   $0x31, Y9, Y7, Y9
  1200  	VMOVDQA      Y4, Y7
  1201  	VPERM2I128   $0x20, Y10, Y8, Y4
  1202  	VPERM2I128   $0x31, Y10, Y8, Y10
  1203  	VMOVDQA      Y4, Y8
  1204  	VPERM2I128   $0x20, Y13, Y11, Y4
  1205  	VPERM2I128   $0x31, Y13, Y11, Y13
  1206  	VMOVDQA      Y4, Y11
  1207  	VPERM2I128   $0x20, Y14, Y12, Y4
  1208  	VPERM2I128   $0x31, Y14, Y12, Y14
  1209  	VMOVDQA      Y4, Y12
  1210  	VPSUBW       Y7, Y9, Y4
  1211  	VPSUBW       Y8, Y10, Y5
  1212  	VPSUBW       Y11, Y13, Y6
  1213  	VPADDW       Y7, Y9, Y7
  1214  	VPADDW       Y8, Y10, Y8
  1215  	VPADDW       Y11, Y13, Y11
  1216  	VPMULLW      Y4, Y0, Y9
  1217  	VPMULLW      Y5, Y0, Y10
  1218  	VPSUBW       Y12, Y14, Y0
  1219  	VPMULLW      Y6, Y2, Y13
  1220  	VPADDW       Y12, Y14, Y12
  1221  	VPMULLW      Y0, Y2, Y14
  1222  	VPMULHW      Y4, Y1, Y4
  1223  	VPMULHW      Y5, Y1, Y5
  1224  	VPMULHW      Y6, Y3, Y6
  1225  	VPMULHW      Y0, Y3, Y0
  1226  	VPMULHW      Y9, Y15, Y9
  1227  	VPMULHW      Y10, Y15, Y10
  1228  	VPMULHW      Y13, Y15, Y13
  1229  	VPMULHW      Y14, Y15, Y14
  1230  	VPSUBW       Y9, Y4, Y9
  1231  	VPSUBW       Y10, Y5, Y10
  1232  	VPSUBW       Y13, Y6, Y13
  1233  	VPSUBW       Y14, Y0, Y14
  1234  	MOVL         $0x00004ebf, DX
  1235  	VMOVD        DX, X0
  1236  	VPBROADCASTW X0, Y4
  1237  	VPMULHW      Y4, Y7, Y5
  1238  	VPSRAW       $0x0a, Y5, Y5
  1239  	VPMULLW      Y15, Y5, Y5
  1240  	VPSUBW       Y5, Y7, Y7
  1241  	VPMULHW      Y4, Y11, Y5
  1242  	VPSRAW       $0x0a, Y5, Y5
  1243  	VPMULLW      Y15, Y5, Y5
  1244  	VPSUBW       Y5, Y11, Y11
  1245  	VPBROADCASTW 2100(CX), Y0
  1246  	VPBROADCASTW 2102(CX), Y1
  1247  	VPSUBW       Y7, Y11, Y4
  1248  	VPSUBW       Y8, Y12, Y5
  1249  	VPSUBW       Y9, Y13, Y6
  1250  	VPADDW       Y7, Y11, Y7
  1251  	VPADDW       Y8, Y12, Y8
  1252  	VPADDW       Y9, Y13, Y9
  1253  	VPMULLW      Y4, Y0, Y11
  1254  	VPMULLW      Y5, Y0, Y12
  1255  	VPSUBW       Y10, Y14, Y2
  1256  	VPMULLW      Y6, Y0, Y13
  1257  	VPADDW       Y10, Y14, Y10
  1258  	VPMULLW      Y2, Y0, Y14
  1259  	VPMULHW      Y4, Y1, Y4
  1260  	VPMULHW      Y5, Y1, Y5
  1261  	VPMULHW      Y6, Y1, Y6
  1262  	VPMULHW      Y2, Y1, Y2
  1263  	VPMULHW      Y11, Y15, Y11
  1264  	VPMULHW      Y12, Y15, Y12
  1265  	VPMULHW      Y13, Y15, Y13
  1266  	VPMULHW      Y14, Y15, Y14
  1267  	VPSUBW       Y11, Y4, Y11
  1268  	VPSUBW       Y12, Y5, Y12
  1269  	VPSUBW       Y13, Y6, Y13
  1270  	VPSUBW       Y14, Y2, Y14
  1271  	VMOVDQU      Y7, 256(AX)
  1272  	VMOVDQU      Y8, 288(AX)
  1273  	VMOVDQU      Y9, 320(AX)
  1274  	VMOVDQU      Y10, 352(AX)
  1275  	VMOVDQU      Y11, 384(AX)
  1276  	VMOVDQU      Y12, 416(AX)
  1277  	VMOVDQU      Y13, 448(AX)
  1278  	VMOVDQU      Y14, 480(AX)
  1279  	VPBROADCASTW 2104(CX), Y0
  1280  	VPBROADCASTW 2106(CX), Y1
  1281  	VMOVDQU      (AX), Y7
  1282  	VMOVDQU      32(AX), Y8
  1283  	VMOVDQU      64(AX), Y9
  1284  	VMOVDQU      96(AX), Y10
  1285  	VMOVDQU      256(AX), Y11
  1286  	VMOVDQU      288(AX), Y12
  1287  	VMOVDQU      320(AX), Y13
  1288  	VMOVDQU      352(AX), Y14
  1289  	VPSUBW       Y7, Y11, Y2
  1290  	VPSUBW       Y8, Y12, Y3
  1291  	VPSUBW       Y9, Y13, Y4
  1292  	VPADDW       Y7, Y11, Y7
  1293  	VPADDW       Y8, Y12, Y8
  1294  	VPADDW       Y9, Y13, Y9
  1295  	VPMULLW      Y2, Y0, Y11
  1296  	VPMULLW      Y3, Y0, Y12
  1297  	VPSUBW       Y10, Y14, Y5
  1298  	VPMULLW      Y4, Y0, Y13
  1299  	VPADDW       Y10, Y14, Y10
  1300  	VPMULLW      Y5, Y0, Y14
  1301  	VPMULHW      Y2, Y1, Y2
  1302  	VPMULHW      Y3, Y1, Y3
  1303  	VPMULHW      Y4, Y1, Y4
  1304  	VPMULHW      Y5, Y1, Y5
  1305  	VPMULHW      Y11, Y15, Y11
  1306  	VPMULHW      Y12, Y15, Y12
  1307  	VPMULHW      Y13, Y15, Y13
  1308  	VPMULHW      Y14, Y15, Y14
  1309  	VPSUBW       Y11, Y2, Y11
  1310  	VPSUBW       Y12, Y3, Y12
  1311  	VPSUBW       Y13, Y4, Y13
  1312  	VPSUBW       Y14, Y5, Y14
  1313  	MOVL         $0xffffd8a1, DX
  1314  	VMOVD        DX, X0
  1315  	VPBROADCASTW X0, Y0
  1316  	MOVL         $0x000005a1, DX
  1317  	VMOVD        DX, X1
  1318  	VPBROADCASTW X1, Y1
  1319  	VPMULLW      Y7, Y0, Y2
  1320  	VPMULLW      Y8, Y0, Y3
  1321  	VPMULLW      Y9, Y0, Y4
  1322  	VPMULLW      Y10, Y0, Y5
  1323  	VPMULHW      Y7, Y1, Y7
  1324  	VPMULHW      Y8, Y1, Y8
  1325  	VPMULHW      Y9, Y1, Y9
  1326  	VPMULHW      Y10, Y1, Y10
  1327  	VPMULHW      Y2, Y15, Y2
  1328  	VPMULHW      Y3, Y15, Y3
  1329  	VPMULHW      Y4, Y15, Y4
  1330  	VPMULHW      Y5, Y15, Y5
  1331  	VPSUBW       Y2, Y7, Y7
  1332  	VPSUBW       Y3, Y8, Y8
  1333  	VPSUBW       Y4, Y9, Y9
  1334  	VPSUBW       Y5, Y10, Y10
  1335  	VPMULLW      Y11, Y0, Y2
  1336  	VPMULLW      Y12, Y0, Y3
  1337  	VPMULLW      Y13, Y0, Y4
  1338  	VPMULLW      Y14, Y0, Y5
  1339  	VPMULHW      Y11, Y1, Y11
  1340  	VPMULHW      Y12, Y1, Y12
  1341  	VPMULHW      Y13, Y1, Y13
  1342  	VPMULHW      Y14, Y1, Y14
  1343  	VPMULHW      Y2, Y15, Y2
  1344  	VPMULHW      Y3, Y15, Y3
  1345  	VPMULHW      Y4, Y15, Y4
  1346  	VPMULHW      Y5, Y15, Y5
  1347  	VPSUBW       Y2, Y11, Y11
  1348  	VPSUBW       Y3, Y12, Y12
  1349  	VPSUBW       Y4, Y13, Y13
  1350  	VPSUBW       Y5, Y14, Y14
  1351  	VMOVDQU      Y7, (AX)
  1352  	VMOVDQU      Y8, 32(AX)
  1353  	VMOVDQU      Y9, 64(AX)
  1354  	VMOVDQU      Y10, 96(AX)
  1355  	VMOVDQU      Y11, 256(AX)
  1356  	VMOVDQU      Y12, 288(AX)
  1357  	VMOVDQU      Y13, 320(AX)
  1358  	VMOVDQU      Y14, 352(AX)
  1359  	VPBROADCASTW 2104(CX), Y0
  1360  	VPBROADCASTW 2106(CX), Y1
  1361  	VMOVDQU      128(AX), Y7
  1362  	VMOVDQU      160(AX), Y8
  1363  	VMOVDQU      192(AX), Y9
  1364  	VMOVDQU      224(AX), Y10
  1365  	VMOVDQU      384(AX), Y11
  1366  	VMOVDQU      416(AX), Y12
  1367  	VMOVDQU      448(AX), Y13
  1368  	VMOVDQU      480(AX), Y14
  1369  	VPSUBW       Y7, Y11, Y2
  1370  	VPSUBW       Y8, Y12, Y3
  1371  	VPSUBW       Y9, Y13, Y4
  1372  	VPADDW       Y7, Y11, Y7
  1373  	VPADDW       Y8, Y12, Y8
  1374  	VPADDW       Y9, Y13, Y9
  1375  	VPMULLW      Y2, Y0, Y11
  1376  	VPMULLW      Y3, Y0, Y12
  1377  	VPSUBW       Y10, Y14, Y5
  1378  	VPMULLW      Y4, Y0, Y13
  1379  	VPADDW       Y10, Y14, Y10
  1380  	VPMULLW      Y5, Y0, Y14
  1381  	VPMULHW      Y2, Y1, Y2
  1382  	VPMULHW      Y3, Y1, Y3
  1383  	VPMULHW      Y4, Y1, Y4
  1384  	VPMULHW      Y5, Y1, Y5
  1385  	VPMULHW      Y11, Y15, Y11
  1386  	VPMULHW      Y12, Y15, Y12
  1387  	VPMULHW      Y13, Y15, Y13
  1388  	VPMULHW      Y14, Y15, Y14
  1389  	VPSUBW       Y11, Y2, Y11
  1390  	VPSUBW       Y12, Y3, Y12
  1391  	VPSUBW       Y13, Y4, Y13
  1392  	VPSUBW       Y14, Y5, Y14
  1393  	MOVL         $0xffffd8a1, CX
  1394  	VMOVD        CX, X0
  1395  	VPBROADCASTW X0, Y0
  1396  	MOVL         $0x000005a1, CX
  1397  	VMOVD        CX, X1
  1398  	VPBROADCASTW X1, Y1
  1399  	VPMULLW      Y7, Y0, Y2
  1400  	VPMULLW      Y8, Y0, Y3
  1401  	VPMULLW      Y9, Y0, Y4
  1402  	VPMULLW      Y10, Y0, Y5
  1403  	VPMULHW      Y7, Y1, Y7
  1404  	VPMULHW      Y8, Y1, Y8
  1405  	VPMULHW      Y9, Y1, Y9
  1406  	VPMULHW      Y10, Y1, Y10
  1407  	VPMULHW      Y2, Y15, Y2
  1408  	VPMULHW      Y3, Y15, Y3
  1409  	VPMULHW      Y4, Y15, Y4
  1410  	VPMULHW      Y5, Y15, Y5
  1411  	VPSUBW       Y2, Y7, Y7
  1412  	VPSUBW       Y3, Y8, Y8
  1413  	VPSUBW       Y4, Y9, Y9
  1414  	VPSUBW       Y5, Y10, Y10
  1415  	VPMULLW      Y11, Y0, Y2
  1416  	VPMULLW      Y12, Y0, Y3
  1417  	VPMULLW      Y13, Y0, Y4
  1418  	VPMULLW      Y14, Y0, Y5
  1419  	VPMULHW      Y11, Y1, Y11
  1420  	VPMULHW      Y12, Y1, Y12
  1421  	VPMULHW      Y13, Y1, Y13
  1422  	VPMULHW      Y14, Y1, Y14
  1423  	VPMULHW      Y2, Y15, Y2
  1424  	VPMULHW      Y3, Y15, Y3
  1425  	VPMULHW      Y4, Y15, Y4
  1426  	VPMULHW      Y5, Y15, Y5
  1427  	VPSUBW       Y2, Y11, Y11
  1428  	VPSUBW       Y3, Y12, Y12
  1429  	VPSUBW       Y4, Y13, Y13
  1430  	VPSUBW       Y5, Y14, Y14
  1431  	VMOVDQU      Y7, 128(AX)
  1432  	VMOVDQU      Y8, 160(AX)
  1433  	VMOVDQU      Y9, 192(AX)
  1434  	VMOVDQU      Y10, 224(AX)
  1435  	VMOVDQU      Y11, 384(AX)
  1436  	VMOVDQU      Y12, 416(AX)
  1437  	VMOVDQU      Y13, 448(AX)
  1438  	VMOVDQU      Y14, 480(AX)
  1439  	RET
  1440  
  1441  // func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
  1442  // Requires: AVX, AVX2
        //
        // Reads 512 bytes at a and 512 bytes at b and writes 512 bytes at p, in
        // four unrolled rounds of 128 bytes (four 16-lane int16 vectors) each.
        // Writing A0..A3 / B0..B3 for the four vectors loaded from a / b in one
        // round, the four vectors stored to p in that round are:
        //   P0 = red(A0*B0) + redz(red(A1*B1))
        //   P1 = red(A0*B1) + red(A1*B0)
        //   P2 = red(A2*B2) - redz(red(A3*B3))
        //   P3 = red(A2*B3) + red(A3*B2)
        // where, per 16-bit lane (hi16 = signed high half, lo16 = low half),
        //   red(x*y) = hi16(x*y) - hi16(Y15 * lo16(lo16(x*y) * Y14))
        //   redz(x)  = hi16(x*T1) - hi16(Y15 * lo16(x*T0))
        // and T0, T1 are two consecutive 32-byte vectors read from ·ZetasAVX2
        // (byte offsets 800/832, 864/896, 928/960, 992/1024 for rounds 1..4).
        // Y15 = 0x0d01 (3329) and Y14 = 0xf301; 3329 * 0xf301 = 1 (mod 2^16), so
        // red has the shape of a signed Montgomery reduction with R = 2^16.
        // NOTE(review): T0 is presumably T1 pre-multiplied by Y14 -- confirm
        // against whatever generates ·ZetasAVX2.
        // NOTE(review): the TEXT line declares an 8-byte frame, but no
        // instruction below refers to it; there is also no VZEROUPPER before
        // RET -- confirm both are intended.
  1443  TEXT ·mulHatAVX2(SB), NOSPLIT, $8-24
  1444  	MOVQ         p+0(FP), AX // AX = p (destination)
  1445  	MOVQ         a+8(FP), CX // CX = a (first source)
  1446  	MOVQ         b+16(FP), DX // DX = b (second source)
  1447  	LEAQ         ·ZetasAVX2+0(SB), BX // BX = base address of the ·ZetasAVX2 table
  1448  	MOVL         $0xfffff301, SI
  1449  	VMOVD        SI, X0
  1450  	VPBROADCASTW X0, Y14 // Y14 = 0xf301 in every 16-bit lane (kept for the whole function)
  1451  	MOVL         $0x00000d01, SI
  1452  	VMOVD        SI, X0
  1453  	VPBROADCASTW X0, Y15 // Y15 = 0x0d01 (3329) in every 16-bit lane (kept for the whole function)
  1454  	VMOVDQU      (CX), Y0 // round 1: A0..A3 = a bytes 0..127
  1455  	VMOVDQU      32(CX), Y1
  1456  	VMOVDQU      64(CX), Y2
  1457  	VMOVDQU      96(CX), Y3
  1458  	VMOVDQU      (DX), Y4 // round 1: B0..B3 = b bytes 0..127
  1459  	VMOVDQU      32(DX), Y5
  1460  	VMOVDQU      64(DX), Y6
  1461  	VMOVDQU      96(DX), Y7
  1462  	VPMULLW      Y1, Y5, Y8 // low halves of A1*B1, A0*B0, A0*B1, A1*B0
  1463  	VPMULLW      Y0, Y4, Y9
  1464  	VPMULLW      Y0, Y5, Y10
  1465  	VPMULLW      Y1, Y4, Y11
  1466  	VPMULLW      Y8, Y14, Y8 // ... each multiplied (low half) by Y14
  1467  	VPMULLW      Y9, Y14, Y9
  1468  	VPMULLW      Y10, Y14, Y10
  1469  	VPMULLW      Y11, Y14, Y11
  1470  	VPMULHW      Y1, Y5, Y12 // high halves of the same four products
  1471  	VPMULHW      Y0, Y4, Y13
  1472  	VPMULHW      Y0, Y5, Y0
  1473  	VPMULHW      Y1, Y4, Y1
  1474  	VMOVDQA      Y12, Y4 // move out of Y12/Y13, which are reloaded from the table below
  1475  	VMOVDQA      Y13, Y5
  1476  	VPMULHW      Y8, Y15, Y8 // correction terms: hi16(Y15 * t)
  1477  	VPMULHW      Y9, Y15, Y9
  1478  	VPMULHW      Y10, Y15, Y10
  1479  	VPMULHW      Y11, Y15, Y11
  1480  	VPSUBW       Y8, Y4, Y4 // Y4 = red(A1*B1)
  1481  	VPSUBW       Y9, Y5, Y5 // Y5 = red(A0*B0)
  1482  	VPSUBW       Y10, Y0, Y0 // Y0 = red(A0*B1)
  1483  	VPSUBW       Y11, Y1, Y1 // Y1 = red(A1*B0)
  1484  	VMOVDQU      800(BX), Y12 // T0 (used with VPMULLW)
  1485  	VMOVDQU      832(BX), Y13 // T1 (used with VPMULHW)
  1486  	VPMULLW      Y4, Y12, Y8
  1487  	VPMULHW      Y4, Y13, Y4
  1488  	VPMULHW      Y8, Y15, Y8
  1489  	VPSUBW       Y8, Y4, Y4 // Y4 = redz(red(A1*B1))
  1490  	VPADDW       Y4, Y5, Y4 // P0 = red(A0*B0) + redz(red(A1*B1))
  1491  	VPADDW       Y0, Y1, Y5 // P1 = red(A0*B1) + red(A1*B0)
  1492  	VPMULLW      Y3, Y7, Y8 // same sequence for the second pair: A2, A3, B2, B3
  1493  	VPMULLW      Y2, Y6, Y9
  1494  	VPMULLW      Y2, Y7, Y10
  1495  	VPMULLW      Y3, Y6, Y11
  1496  	VPMULLW      Y8, Y14, Y8
  1497  	VPMULLW      Y9, Y14, Y9
  1498  	VPMULLW      Y10, Y14, Y10
  1499  	VPMULLW      Y11, Y14, Y11
  1500  	VPMULHW      Y3, Y7, Y12 // overwrites T0/T1, hence the reload further down
  1501  	VPMULHW      Y2, Y6, Y13
  1502  	VPMULHW      Y2, Y7, Y2
  1503  	VPMULHW      Y3, Y6, Y3
  1504  	VMOVDQA      Y12, Y6
  1505  	VMOVDQA      Y13, Y7
  1506  	VPMULHW      Y8, Y15, Y8
  1507  	VPMULHW      Y9, Y15, Y9
  1508  	VPMULHW      Y10, Y15, Y10
  1509  	VPMULHW      Y11, Y15, Y11
  1510  	VPSUBW       Y8, Y6, Y6 // Y6 = red(A3*B3)
  1511  	VPSUBW       Y9, Y7, Y7 // Y7 = red(A2*B2)
  1512  	VPSUBW       Y10, Y2, Y2 // Y2 = red(A2*B3)
  1513  	VPSUBW       Y11, Y3, Y3 // Y3 = red(A3*B2)
  1514  	VMOVDQU      800(BX), Y12 // same T0/T1 as the first pair of this round
  1515  	VMOVDQU      832(BX), Y13
  1516  	VPMULLW      Y6, Y12, Y8
  1517  	VPMULHW      Y6, Y13, Y6
  1518  	VPMULHW      Y8, Y15, Y8
  1519  	VPSUBW       Y8, Y6, Y6 // Y6 = redz(red(A3*B3))
  1520  	VPSUBW       Y6, Y7, Y6 // P2 = red(A2*B2) - redz(red(A3*B3)); note the subtraction here
  1521  	VPADDW       Y2, Y3, Y7 // P3 = red(A2*B3) + red(A3*B2)
  1522  	VMOVDQU      Y4, (AX) // store round 1 results to p bytes 0..127
  1523  	VMOVDQU      Y5, 32(AX)
  1524  	VMOVDQU      Y6, 64(AX)
  1525  	VMOVDQU      Y7, 96(AX)
  1526  	VMOVDQU      128(CX), Y0 // round 2: a bytes 128..255
  1527  	VMOVDQU      160(CX), Y1
  1528  	VMOVDQU      192(CX), Y2
  1529  	VMOVDQU      224(CX), Y3
  1530  	VMOVDQU      128(DX), Y4 // round 2: b bytes 128..255
  1531  	VMOVDQU      160(DX), Y5
  1532  	VMOVDQU      192(DX), Y6
  1533  	VMOVDQU      224(DX), Y7
  1534  	VPMULLW      Y1, Y5, Y8
  1535  	VPMULLW      Y0, Y4, Y9
  1536  	VPMULLW      Y0, Y5, Y10
  1537  	VPMULLW      Y1, Y4, Y11
  1538  	VPMULLW      Y8, Y14, Y8
  1539  	VPMULLW      Y9, Y14, Y9
  1540  	VPMULLW      Y10, Y14, Y10
  1541  	VPMULLW      Y11, Y14, Y11
  1542  	VPMULHW      Y1, Y5, Y12
  1543  	VPMULHW      Y0, Y4, Y13
  1544  	VPMULHW      Y0, Y5, Y0
  1545  	VPMULHW      Y1, Y4, Y1
  1546  	VMOVDQA      Y12, Y4
  1547  	VMOVDQA      Y13, Y5
  1548  	VPMULHW      Y8, Y15, Y8
  1549  	VPMULHW      Y9, Y15, Y9
  1550  	VPMULHW      Y10, Y15, Y10
  1551  	VPMULHW      Y11, Y15, Y11
  1552  	VPSUBW       Y8, Y4, Y4
  1553  	VPSUBW       Y9, Y5, Y5
  1554  	VPSUBW       Y10, Y0, Y0
  1555  	VPSUBW       Y11, Y1, Y1
  1556  	VMOVDQU      864(BX), Y12 // round 2 table vectors T0/T1
  1557  	VMOVDQU      896(BX), Y13
  1558  	VPMULLW      Y4, Y12, Y8
  1559  	VPMULHW      Y4, Y13, Y4
  1560  	VPMULHW      Y8, Y15, Y8
  1561  	VPSUBW       Y8, Y4, Y4
  1562  	VPADDW       Y4, Y5, Y4 // P0 (sum)
  1563  	VPADDW       Y0, Y1, Y5 // P1
  1564  	VPMULLW      Y3, Y7, Y8
  1565  	VPMULLW      Y2, Y6, Y9
  1566  	VPMULLW      Y2, Y7, Y10
  1567  	VPMULLW      Y3, Y6, Y11
  1568  	VPMULLW      Y8, Y14, Y8
  1569  	VPMULLW      Y9, Y14, Y9
  1570  	VPMULLW      Y10, Y14, Y10
  1571  	VPMULLW      Y11, Y14, Y11
  1572  	VPMULHW      Y3, Y7, Y12
  1573  	VPMULHW      Y2, Y6, Y13
  1574  	VPMULHW      Y2, Y7, Y2
  1575  	VPMULHW      Y3, Y6, Y3
  1576  	VMOVDQA      Y12, Y6
  1577  	VMOVDQA      Y13, Y7
  1578  	VPMULHW      Y8, Y15, Y8
  1579  	VPMULHW      Y9, Y15, Y9
  1580  	VPMULHW      Y10, Y15, Y10
  1581  	VPMULHW      Y11, Y15, Y11
  1582  	VPSUBW       Y8, Y6, Y6
  1583  	VPSUBW       Y9, Y7, Y7
  1584  	VPSUBW       Y10, Y2, Y2
  1585  	VPSUBW       Y11, Y3, Y3
  1586  	VMOVDQU      864(BX), Y12 // reload: Y12/Y13 were used as temporaries above
  1587  	VMOVDQU      896(BX), Y13
  1588  	VPMULLW      Y6, Y12, Y8
  1589  	VPMULHW      Y6, Y13, Y6
  1590  	VPMULHW      Y8, Y15, Y8
  1591  	VPSUBW       Y8, Y6, Y6
  1592  	VPSUBW       Y6, Y7, Y6 // P2 (difference)
  1593  	VPADDW       Y2, Y3, Y7 // P3
  1594  	VMOVDQU      Y4, 128(AX) // store round 2 results to p bytes 128..255
  1595  	VMOVDQU      Y5, 160(AX)
  1596  	VMOVDQU      Y6, 192(AX)
  1597  	VMOVDQU      Y7, 224(AX)
  1598  	VMOVDQU      256(CX), Y0 // round 3: a bytes 256..383
  1599  	VMOVDQU      288(CX), Y1
  1600  	VMOVDQU      320(CX), Y2
  1601  	VMOVDQU      352(CX), Y3
  1602  	VMOVDQU      256(DX), Y4 // round 3: b bytes 256..383
  1603  	VMOVDQU      288(DX), Y5
  1604  	VMOVDQU      320(DX), Y6
  1605  	VMOVDQU      352(DX), Y7
  1606  	VPMULLW      Y1, Y5, Y8
  1607  	VPMULLW      Y0, Y4, Y9
  1608  	VPMULLW      Y0, Y5, Y10
  1609  	VPMULLW      Y1, Y4, Y11
  1610  	VPMULLW      Y8, Y14, Y8
  1611  	VPMULLW      Y9, Y14, Y9
  1612  	VPMULLW      Y10, Y14, Y10
  1613  	VPMULLW      Y11, Y14, Y11
  1614  	VPMULHW      Y1, Y5, Y12
  1615  	VPMULHW      Y0, Y4, Y13
  1616  	VPMULHW      Y0, Y5, Y0
  1617  	VPMULHW      Y1, Y4, Y1
  1618  	VMOVDQA      Y12, Y4
  1619  	VMOVDQA      Y13, Y5
  1620  	VPMULHW      Y8, Y15, Y8
  1621  	VPMULHW      Y9, Y15, Y9
  1622  	VPMULHW      Y10, Y15, Y10
  1623  	VPMULHW      Y11, Y15, Y11
  1624  	VPSUBW       Y8, Y4, Y4
  1625  	VPSUBW       Y9, Y5, Y5
  1626  	VPSUBW       Y10, Y0, Y0
  1627  	VPSUBW       Y11, Y1, Y1
  1628  	VMOVDQU      928(BX), Y12 // round 3 table vectors T0/T1
  1629  	VMOVDQU      960(BX), Y13
  1630  	VPMULLW      Y4, Y12, Y8
  1631  	VPMULHW      Y4, Y13, Y4
  1632  	VPMULHW      Y8, Y15, Y8
  1633  	VPSUBW       Y8, Y4, Y4
  1634  	VPADDW       Y4, Y5, Y4 // P0 (sum)
  1635  	VPADDW       Y0, Y1, Y5 // P1
  1636  	VPMULLW      Y3, Y7, Y8
  1637  	VPMULLW      Y2, Y6, Y9
  1638  	VPMULLW      Y2, Y7, Y10
  1639  	VPMULLW      Y3, Y6, Y11
  1640  	VPMULLW      Y8, Y14, Y8
  1641  	VPMULLW      Y9, Y14, Y9
  1642  	VPMULLW      Y10, Y14, Y10
  1643  	VPMULLW      Y11, Y14, Y11
  1644  	VPMULHW      Y3, Y7, Y12
  1645  	VPMULHW      Y2, Y6, Y13
  1646  	VPMULHW      Y2, Y7, Y2
  1647  	VPMULHW      Y3, Y6, Y3
  1648  	VMOVDQA      Y12, Y6
  1649  	VMOVDQA      Y13, Y7
  1650  	VPMULHW      Y8, Y15, Y8
  1651  	VPMULHW      Y9, Y15, Y9
  1652  	VPMULHW      Y10, Y15, Y10
  1653  	VPMULHW      Y11, Y15, Y11
  1654  	VPSUBW       Y8, Y6, Y6
  1655  	VPSUBW       Y9, Y7, Y7
  1656  	VPSUBW       Y10, Y2, Y2
  1657  	VPSUBW       Y11, Y3, Y3
  1658  	VMOVDQU      928(BX), Y12 // reload: Y12/Y13 were used as temporaries above
  1659  	VMOVDQU      960(BX), Y13
  1660  	VPMULLW      Y6, Y12, Y8
  1661  	VPMULHW      Y6, Y13, Y6
  1662  	VPMULHW      Y8, Y15, Y8
  1663  	VPSUBW       Y8, Y6, Y6
  1664  	VPSUBW       Y6, Y7, Y6 // P2 (difference)
  1665  	VPADDW       Y2, Y3, Y7 // P3
  1666  	VMOVDQU      Y4, 256(AX) // store round 3 results to p bytes 256..383
  1667  	VMOVDQU      Y5, 288(AX)
  1668  	VMOVDQU      Y6, 320(AX)
  1669  	VMOVDQU      Y7, 352(AX)
  1670  	VMOVDQU      384(CX), Y0 // round 4: a bytes 384..511
  1671  	VMOVDQU      416(CX), Y1
  1672  	VMOVDQU      448(CX), Y2
  1673  	VMOVDQU      480(CX), Y3
  1674  	VMOVDQU      384(DX), Y4 // round 4: b bytes 384..511
  1675  	VMOVDQU      416(DX), Y5
  1676  	VMOVDQU      448(DX), Y6
  1677  	VMOVDQU      480(DX), Y7
  1678  	VPMULLW      Y1, Y5, Y8
  1679  	VPMULLW      Y0, Y4, Y9
  1680  	VPMULLW      Y0, Y5, Y10
  1681  	VPMULLW      Y1, Y4, Y11
  1682  	VPMULLW      Y8, Y14, Y8
  1683  	VPMULLW      Y9, Y14, Y9
  1684  	VPMULLW      Y10, Y14, Y10
  1685  	VPMULLW      Y11, Y14, Y11
  1686  	VPMULHW      Y1, Y5, Y12
  1687  	VPMULHW      Y0, Y4, Y13
  1688  	VPMULHW      Y0, Y5, Y0
  1689  	VPMULHW      Y1, Y4, Y1
  1690  	VMOVDQA      Y12, Y4
  1691  	VMOVDQA      Y13, Y5
  1692  	VPMULHW      Y8, Y15, Y8
  1693  	VPMULHW      Y9, Y15, Y9
  1694  	VPMULHW      Y10, Y15, Y10
  1695  	VPMULHW      Y11, Y15, Y11
  1696  	VPSUBW       Y8, Y4, Y4
  1697  	VPSUBW       Y9, Y5, Y5
  1698  	VPSUBW       Y10, Y0, Y0
  1699  	VPSUBW       Y11, Y1, Y1
  1700  	VMOVDQU      992(BX), Y12 // round 4 table vectors T0/T1
  1701  	VMOVDQU      1024(BX), Y13
  1702  	VPMULLW      Y4, Y12, Y8
  1703  	VPMULHW      Y4, Y13, Y4
  1704  	VPMULHW      Y8, Y15, Y8
  1705  	VPSUBW       Y8, Y4, Y4
  1706  	VPADDW       Y4, Y5, Y4 // P0 (sum)
  1707  	VPADDW       Y0, Y1, Y5 // P1
  1708  	VPMULLW      Y3, Y7, Y8
  1709  	VPMULLW      Y2, Y6, Y9
  1710  	VPMULLW      Y2, Y7, Y10
  1711  	VPMULLW      Y3, Y6, Y11
  1712  	VPMULLW      Y8, Y14, Y8
  1713  	VPMULLW      Y9, Y14, Y9
  1714  	VPMULLW      Y10, Y14, Y10
  1715  	VPMULLW      Y11, Y14, Y11
  1716  	VPMULHW      Y3, Y7, Y12
  1717  	VPMULHW      Y2, Y6, Y13
  1718  	VPMULHW      Y2, Y7, Y2
  1719  	VPMULHW      Y3, Y6, Y3
  1720  	VMOVDQA      Y12, Y6
  1721  	VMOVDQA      Y13, Y7
  1722  	VPMULHW      Y8, Y15, Y8
  1723  	VPMULHW      Y9, Y15, Y9
  1724  	VPMULHW      Y10, Y15, Y10
  1725  	VPMULHW      Y11, Y15, Y11
  1726  	VPSUBW       Y8, Y6, Y6
  1727  	VPSUBW       Y9, Y7, Y7
  1728  	VPSUBW       Y10, Y2, Y2
  1729  	VPSUBW       Y11, Y3, Y3
  1730  	VMOVDQU      992(BX), Y12 // reload: Y12/Y13 were used as temporaries above
  1731  	VMOVDQU      1024(BX), Y13
  1732  	VPMULLW      Y6, Y12, Y8
  1733  	VPMULHW      Y6, Y13, Y6
  1734  	VPMULHW      Y8, Y15, Y8
  1735  	VPSUBW       Y8, Y6, Y6
  1736  	VPSUBW       Y6, Y7, Y6 // P2 (difference)
  1737  	VPADDW       Y2, Y3, Y7 // P3
  1738  	VMOVDQU      Y4, 384(AX) // store round 4 results to p bytes 384..511
  1739  	VMOVDQU      Y5, 416(AX)
  1740  	VMOVDQU      Y6, 448(AX)
  1741  	VMOVDQU      Y7, 480(AX)
  1742  	RET
  1743  
  1744  // func detangleAVX2(p *[256]int16)
  1745  // Requires: AVX, AVX2
        //
        // Permutes the 512 bytes at p in place, as two independent 256-byte
        // halves (bytes 0..255, then bytes 256..511). Each half is loaded into
        // Y0..Y7, run through four interleaving stages of growing granularity
        // (16-bit words, 32-bit dwords, 64-bit qwords, 128-bit lanes) and stored
        // back to the addresses it was loaded from. No arithmetic is performed;
        // only element positions change. Y8 is the only scratch register.
        // Every stage follows the same shape for a register pair (L, H):
        //   Y8 = "first" interleave of L and H; H = "second" interleave; L = Y8.
        // NOTE(review): there is no VZEROUPPER before RET -- confirm that is intended.
  1746  TEXT ·detangleAVX2(SB), NOSPLIT, $0-8
  1747  	MOVQ        p+0(FP), AX // AX = p
  1748  	VMOVDQU     (AX), Y0 // first half: Y0..Y7 = p bytes 0..255
  1749  	VMOVDQU     32(AX), Y1
  1750  	VMOVDQU     64(AX), Y2
  1751  	VMOVDQU     96(AX), Y3
  1752  	VMOVDQU     128(AX), Y4
  1753  	VMOVDQU     160(AX), Y5
  1754  	VMOVDQU     192(AX), Y6
  1755  	VMOVDQU     224(AX), Y7
  1756  	VPSLLD      $0x10, Y1, Y8 // stage 1 (16-bit), pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7): move even words of Y1 to odd slots
  1757  	VPBLENDW    $0xaa, Y8, Y0, Y8 // Y8 = even words of Y0 in even slots, even words of Y1 in odd slots
  1758  	VPSRLD      $0x10, Y0, Y0 // move odd words of Y0 to even slots
  1759  	VPBLENDW    $0xaa, Y1, Y0, Y1 // Y1 = odd words of Y0 in even slots, odd words of Y1 in odd slots
  1760  	VMOVDQA     Y8, Y0
  1761  	VPSLLD      $0x10, Y3, Y8
  1762  	VPBLENDW    $0xaa, Y8, Y2, Y8
  1763  	VPSRLD      $0x10, Y2, Y2
  1764  	VPBLENDW    $0xaa, Y3, Y2, Y3
  1765  	VMOVDQA     Y8, Y2
  1766  	VPSLLD      $0x10, Y5, Y8
  1767  	VPBLENDW    $0xaa, Y8, Y4, Y8
  1768  	VPSRLD      $0x10, Y4, Y4
  1769  	VPBLENDW    $0xaa, Y5, Y4, Y5
  1770  	VMOVDQA     Y8, Y4
  1771  	VPSLLD      $0x10, Y7, Y8
  1772  	VPBLENDW    $0xaa, Y8, Y6, Y8
  1773  	VPSRLD      $0x10, Y6, Y6
  1774  	VPBLENDW    $0xaa, Y7, Y6, Y7
  1775  	VMOVDQA     Y8, Y6
  1776  	VMOVSLDUP   Y2, Y8 // stage 2 (32-bit), pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7): copy even dwords of Y2 into odd slots
  1777  	VPBLENDD    $0xaa, Y8, Y0, Y8 // Y8 = even dwords of Y0 in even slots, even dwords of Y2 in odd slots
  1778  	VPSRLQ      $0x20, Y0, Y0 // move odd dwords of Y0 to even slots
  1779  	VPBLENDD    $0xaa, Y2, Y0, Y2 // Y2 = odd dwords of Y0 in even slots, odd dwords of Y2 in odd slots
  1780  	VMOVDQA     Y8, Y0
  1781  	VMOVSLDUP   Y3, Y8
  1782  	VPBLENDD    $0xaa, Y8, Y1, Y8
  1783  	VPSRLQ      $0x20, Y1, Y1
  1784  	VPBLENDD    $0xaa, Y3, Y1, Y3
  1785  	VMOVDQA     Y8, Y1
  1786  	VMOVSLDUP   Y6, Y8
  1787  	VPBLENDD    $0xaa, Y8, Y4, Y8
  1788  	VPSRLQ      $0x20, Y4, Y4
  1789  	VPBLENDD    $0xaa, Y6, Y4, Y6
  1790  	VMOVDQA     Y8, Y4
  1791  	VMOVSLDUP   Y7, Y8
  1792  	VPBLENDD    $0xaa, Y8, Y5, Y8
  1793  	VPSRLQ      $0x20, Y5, Y5
  1794  	VPBLENDD    $0xaa, Y7, Y5, Y7
  1795  	VMOVDQA     Y8, Y5
  1796  	VPUNPCKLQDQ Y1, Y0, Y8 // stage 3 (64-bit), pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7): low qword of each 128-bit lane of Y0, then of Y1
  1797  	VPUNPCKHQDQ Y1, Y0, Y1 // high qword of each 128-bit lane of Y0, then of Y1
  1798  	VMOVDQA     Y8, Y0
  1799  	VPUNPCKLQDQ Y3, Y2, Y8
  1800  	VPUNPCKHQDQ Y3, Y2, Y3
  1801  	VMOVDQA     Y8, Y2
  1802  	VPUNPCKLQDQ Y5, Y4, Y8
  1803  	VPUNPCKHQDQ Y5, Y4, Y5
  1804  	VMOVDQA     Y8, Y4
  1805  	VPUNPCKLQDQ Y7, Y6, Y8
  1806  	VPUNPCKHQDQ Y7, Y6, Y7
  1807  	VMOVDQA     Y8, Y6
  1808  	VPERM2I128  $0x20, Y2, Y0, Y8 // stage 4 (128-bit), pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7): Y8 = low lane of Y0, low lane of Y2
  1809  	VPERM2I128  $0x31, Y2, Y0, Y2 // Y2 = high lane of Y0, high lane of Y2
  1810  	VMOVDQA     Y8, Y0
  1811  	VPERM2I128  $0x20, Y3, Y1, Y8
  1812  	VPERM2I128  $0x31, Y3, Y1, Y3
  1813  	VMOVDQA     Y8, Y1
  1814  	VPERM2I128  $0x20, Y6, Y4, Y8
  1815  	VPERM2I128  $0x31, Y6, Y4, Y6
  1816  	VMOVDQA     Y8, Y4
  1817  	VPERM2I128  $0x20, Y7, Y5, Y8
  1818  	VPERM2I128  $0x31, Y7, Y5, Y7
  1819  	VMOVDQA     Y8, Y5
  1820  	VMOVDQU     Y0, (AX) // write the permuted first half back to p bytes 0..255
  1821  	VMOVDQU     Y1, 32(AX)
  1822  	VMOVDQU     Y2, 64(AX)
  1823  	VMOVDQU     Y3, 96(AX)
  1824  	VMOVDQU     Y4, 128(AX)
  1825  	VMOVDQU     Y5, 160(AX)
  1826  	VMOVDQU     Y6, 192(AX)
  1827  	VMOVDQU     Y7, 224(AX)
  1828  	VMOVDQU     256(AX), Y0 // second half: Y0..Y7 = p bytes 256..511, then the same four stages
  1829  	VMOVDQU     288(AX), Y1
  1830  	VMOVDQU     320(AX), Y2
  1831  	VMOVDQU     352(AX), Y3
  1832  	VMOVDQU     384(AX), Y4
  1833  	VMOVDQU     416(AX), Y5
  1834  	VMOVDQU     448(AX), Y6
  1835  	VMOVDQU     480(AX), Y7
  1836  	VPSLLD      $0x10, Y1, Y8 // stage 1 (16-bit)
  1837  	VPBLENDW    $0xaa, Y8, Y0, Y8
  1838  	VPSRLD      $0x10, Y0, Y0
  1839  	VPBLENDW    $0xaa, Y1, Y0, Y1
  1840  	VMOVDQA     Y8, Y0
  1841  	VPSLLD      $0x10, Y3, Y8
  1842  	VPBLENDW    $0xaa, Y8, Y2, Y8
  1843  	VPSRLD      $0x10, Y2, Y2
  1844  	VPBLENDW    $0xaa, Y3, Y2, Y3
  1845  	VMOVDQA     Y8, Y2
  1846  	VPSLLD      $0x10, Y5, Y8
  1847  	VPBLENDW    $0xaa, Y8, Y4, Y8
  1848  	VPSRLD      $0x10, Y4, Y4
  1849  	VPBLENDW    $0xaa, Y5, Y4, Y5
  1850  	VMOVDQA     Y8, Y4
  1851  	VPSLLD      $0x10, Y7, Y8
  1852  	VPBLENDW    $0xaa, Y8, Y6, Y8
  1853  	VPSRLD      $0x10, Y6, Y6
  1854  	VPBLENDW    $0xaa, Y7, Y6, Y7
  1855  	VMOVDQA     Y8, Y6
  1856  	VMOVSLDUP   Y2, Y8 // stage 2 (32-bit)
  1857  	VPBLENDD    $0xaa, Y8, Y0, Y8
  1858  	VPSRLQ      $0x20, Y0, Y0
  1859  	VPBLENDD    $0xaa, Y2, Y0, Y2
  1860  	VMOVDQA     Y8, Y0
  1861  	VMOVSLDUP   Y3, Y8
  1862  	VPBLENDD    $0xaa, Y8, Y1, Y8
  1863  	VPSRLQ      $0x20, Y1, Y1
  1864  	VPBLENDD    $0xaa, Y3, Y1, Y3
  1865  	VMOVDQA     Y8, Y1
  1866  	VMOVSLDUP   Y6, Y8
  1867  	VPBLENDD    $0xaa, Y8, Y4, Y8
  1868  	VPSRLQ      $0x20, Y4, Y4
  1869  	VPBLENDD    $0xaa, Y6, Y4, Y6
  1870  	VMOVDQA     Y8, Y4
  1871  	VMOVSLDUP   Y7, Y8
  1872  	VPBLENDD    $0xaa, Y8, Y5, Y8
  1873  	VPSRLQ      $0x20, Y5, Y5
  1874  	VPBLENDD    $0xaa, Y7, Y5, Y7
  1875  	VMOVDQA     Y8, Y5
  1876  	VPUNPCKLQDQ Y1, Y0, Y8 // stage 3 (64-bit)
  1877  	VPUNPCKHQDQ Y1, Y0, Y1
  1878  	VMOVDQA     Y8, Y0
  1879  	VPUNPCKLQDQ Y3, Y2, Y8
  1880  	VPUNPCKHQDQ Y3, Y2, Y3
  1881  	VMOVDQA     Y8, Y2
  1882  	VPUNPCKLQDQ Y5, Y4, Y8
  1883  	VPUNPCKHQDQ Y5, Y4, Y5
  1884  	VMOVDQA     Y8, Y4
  1885  	VPUNPCKLQDQ Y7, Y6, Y8
  1886  	VPUNPCKHQDQ Y7, Y6, Y7
  1887  	VMOVDQA     Y8, Y6
  1888  	VPERM2I128  $0x20, Y2, Y0, Y8 // stage 4 (128-bit)
  1889  	VPERM2I128  $0x31, Y2, Y0, Y2
  1890  	VMOVDQA     Y8, Y0
  1891  	VPERM2I128  $0x20, Y3, Y1, Y8
  1892  	VPERM2I128  $0x31, Y3, Y1, Y3
  1893  	VMOVDQA     Y8, Y1
  1894  	VPERM2I128  $0x20, Y6, Y4, Y8
  1895  	VPERM2I128  $0x31, Y6, Y4, Y6
  1896  	VMOVDQA     Y8, Y4
  1897  	VPERM2I128  $0x20, Y7, Y5, Y8
  1898  	VPERM2I128  $0x31, Y7, Y5, Y7
  1899  	VMOVDQA     Y8, Y5
  1900  	VMOVDQU     Y0, 256(AX) // write the permuted second half back to p bytes 256..511
  1901  	VMOVDQU     Y1, 288(AX)
  1902  	VMOVDQU     Y2, 320(AX)
  1903  	VMOVDQU     Y3, 352(AX)
  1904  	VMOVDQU     Y4, 384(AX)
  1905  	VMOVDQU     Y5, 416(AX)
  1906  	VMOVDQU     Y6, 448(AX)
  1907  	VMOVDQU     Y7, 480(AX)
  1908  	RET
  1909  
  1910  // func tangleAVX2(p *[256]int16)
  1911  // Requires: AVX, AVX2
        //
        // tangleAVX2 reorders the 256 int16 values at p in place. No value is
        // modified: every instruction below only moves, shifts or blends lanes,
        // and the zeros introduced by the shifts are always overwritten by the
        // blend that follows.
        //
        // The array is handled as two independent 256-byte halves (byte offsets
        // 0-255, then 256-511). Each half is loaded into Y0..Y7 and run through
        // four interleaving passes of decreasing granularity, with Y8 as scratch:
        //
        //   1. 128-bit lanes (VPERM2I128)          pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7)
        //   2. 64-bit qwords (VPUNPCKLQDQ/HQDQ)    pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7)
        //   3. 32-bit dwords (VMOVSLDUP/VPBLENDD)  pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7)
        //   4. 16-bit words  (VPSLLD/VPBLENDW)     pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7)
        //
        // Operand order is Go assembler order: sources first, destination last.
        // Frame is $0-8: no locals, one 8-byte pointer argument.
        // Clobbers: AX, Y0-Y8.
        //
        // NOTE(review): presumably the result is the "tangled" coefficient order
        // expected by the other AVX2 routines of this package - confirm against
        // the Go-side callers.
  1912  TEXT ·tangleAVX2(SB), NOSPLIT, $0-8
  1913  	MOVQ        p+0(FP), AX // AX = p
        	// First half: load bytes 0-255 of p into Y0..Y7 (16 int16 lanes each).
  1914  	VMOVDQU     (AX), Y0
  1915  	VMOVDQU     32(AX), Y1
  1916  	VMOVDQU     64(AX), Y2
  1917  	VMOVDQU     96(AX), Y3
  1918  	VMOVDQU     128(AX), Y4
  1919  	VMOVDQU     160(AX), Y5
  1920  	VMOVDQU     192(AX), Y6
  1921  	VMOVDQU     224(AX), Y7
        	// Pass 1 (128-bit lanes): for each pair (a,b), a' = [a.lo, b.lo] and
        	// b' = [a.hi, b.hi]. Y8 holds a' until the old a is no longer needed.
  1922  	VPERM2I128  $0x20, Y2, Y0, Y8 // Y8 = [Y0.lo, Y2.lo]
  1923  	VPERM2I128  $0x31, Y2, Y0, Y2 // Y2 = [Y0.hi, Y2.hi]
  1924  	VMOVDQA     Y8, Y0
  1925  	VPERM2I128  $0x20, Y3, Y1, Y8
  1926  	VPERM2I128  $0x31, Y3, Y1, Y3
  1927  	VMOVDQA     Y8, Y1
  1928  	VPERM2I128  $0x20, Y6, Y4, Y8
  1929  	VPERM2I128  $0x31, Y6, Y4, Y6
  1930  	VMOVDQA     Y8, Y4
  1931  	VPERM2I128  $0x20, Y7, Y5, Y8
  1932  	VPERM2I128  $0x31, Y7, Y5, Y7
  1933  	VMOVDQA     Y8, Y5
        	// Pass 2 (64-bit qwords, within each 128-bit lane): a' = low qwords of
        	// a and b interleaved, b' = high qwords of a and b interleaved.
  1934  	VPUNPCKLQDQ Y1, Y0, Y8 // Y8 = [Y0.q0, Y1.q0 | Y0.q2, Y1.q2]
  1935  	VPUNPCKHQDQ Y1, Y0, Y1 // Y1 = [Y0.q1, Y1.q1 | Y0.q3, Y1.q3]
  1936  	VMOVDQA     Y8, Y0
  1937  	VPUNPCKLQDQ Y3, Y2, Y8
  1938  	VPUNPCKHQDQ Y3, Y2, Y3
  1939  	VMOVDQA     Y8, Y2
  1940  	VPUNPCKLQDQ Y5, Y4, Y8
  1941  	VPUNPCKHQDQ Y5, Y4, Y5
  1942  	VMOVDQA     Y8, Y4
  1943  	VPUNPCKLQDQ Y7, Y6, Y8
  1944  	VPUNPCKHQDQ Y7, Y6, Y7
  1945  	VMOVDQA     Y8, Y6
        	// Pass 3 (32-bit dwords): a' = even dwords of a and b interleaved,
        	// b' = odd dwords of a and b interleaved. Blend mask 0xaa takes the odd
        	// dword positions from the first-listed source register.
  1946  	VMOVSLDUP   Y2, Y8 // copy each even dword of Y2 into the odd slot above it
  1947  	VPBLENDD    $0xaa, Y8, Y0, Y8 // Y8 = [Y0.d0, Y2.d0, Y0.d2, Y2.d2, ...]
  1948  	VPSRLQ      $0x20, Y0, Y0 // move each odd dword of Y0 down to the even slot
  1949  	VPBLENDD    $0xaa, Y2, Y0, Y2 // Y2 = [Y0.d1, Y2.d1, Y0.d3, Y2.d3, ...]
  1950  	VMOVDQA     Y8, Y0
  1951  	VMOVSLDUP   Y3, Y8
  1952  	VPBLENDD    $0xaa, Y8, Y1, Y8
  1953  	VPSRLQ      $0x20, Y1, Y1
  1954  	VPBLENDD    $0xaa, Y3, Y1, Y3
  1955  	VMOVDQA     Y8, Y1
  1956  	VMOVSLDUP   Y6, Y8
  1957  	VPBLENDD    $0xaa, Y8, Y4, Y8
  1958  	VPSRLQ      $0x20, Y4, Y4
  1959  	VPBLENDD    $0xaa, Y6, Y4, Y6
  1960  	VMOVDQA     Y8, Y4
  1961  	VMOVSLDUP   Y7, Y8
  1962  	VPBLENDD    $0xaa, Y8, Y5, Y8
  1963  	VPSRLQ      $0x20, Y5, Y5
  1964  	VPBLENDD    $0xaa, Y7, Y5, Y7
  1965  	VMOVDQA     Y8, Y5
        	// Pass 4 (16-bit words): same scheme one level down; a' = even words of
        	// a and b interleaved, b' = odd words of a and b interleaved.
  1966  	VPSLLD      $0x10, Y1, Y8 // move each even word of Y1 up to the odd slot
  1967  	VPBLENDW    $0xaa, Y8, Y0, Y8 // Y8 = [Y0.w0, Y1.w0, Y0.w2, Y1.w2, ...]
  1968  	VPSRLD      $0x10, Y0, Y0 // move each odd word of Y0 down to the even slot
  1969  	VPBLENDW    $0xaa, Y1, Y0, Y1 // Y1 = [Y0.w1, Y1.w1, Y0.w3, Y1.w3, ...]
  1970  	VMOVDQA     Y8, Y0
  1971  	VPSLLD      $0x10, Y3, Y8
  1972  	VPBLENDW    $0xaa, Y8, Y2, Y8
  1973  	VPSRLD      $0x10, Y2, Y2
  1974  	VPBLENDW    $0xaa, Y3, Y2, Y3
  1975  	VMOVDQA     Y8, Y2
  1976  	VPSLLD      $0x10, Y5, Y8
  1977  	VPBLENDW    $0xaa, Y8, Y4, Y8
  1978  	VPSRLD      $0x10, Y4, Y4
  1979  	VPBLENDW    $0xaa, Y5, Y4, Y5
  1980  	VMOVDQA     Y8, Y4
  1981  	VPSLLD      $0x10, Y7, Y8
  1982  	VPBLENDW    $0xaa, Y8, Y6, Y8
  1983  	VPSRLD      $0x10, Y6, Y6
  1984  	VPBLENDW    $0xaa, Y7, Y6, Y7
  1985  	VMOVDQA     Y8, Y6
        	// Write the reordered first half back over bytes 0-255 of p.
  1986  	VMOVDQU     Y0, (AX)
  1987  	VMOVDQU     Y1, 32(AX)
  1988  	VMOVDQU     Y2, 64(AX)
  1989  	VMOVDQU     Y3, 96(AX)
  1990  	VMOVDQU     Y4, 128(AX)
  1991  	VMOVDQU     Y5, 160(AX)
  1992  	VMOVDQU     Y6, 192(AX)
  1993  	VMOVDQU     Y7, 224(AX)
        	// Second half: load bytes 256-511 of p and repeat the same four passes.
  1994  	VMOVDQU     256(AX), Y0
  1995  	VMOVDQU     288(AX), Y1
  1996  	VMOVDQU     320(AX), Y2
  1997  	VMOVDQU     352(AX), Y3
  1998  	VMOVDQU     384(AX), Y4
  1999  	VMOVDQU     416(AX), Y5
  2000  	VMOVDQU     448(AX), Y6
  2001  	VMOVDQU     480(AX), Y7
        	// Pass 1 (128-bit lanes).
  2002  	VPERM2I128  $0x20, Y2, Y0, Y8
  2003  	VPERM2I128  $0x31, Y2, Y0, Y2
  2004  	VMOVDQA     Y8, Y0
  2005  	VPERM2I128  $0x20, Y3, Y1, Y8
  2006  	VPERM2I128  $0x31, Y3, Y1, Y3
  2007  	VMOVDQA     Y8, Y1
  2008  	VPERM2I128  $0x20, Y6, Y4, Y8
  2009  	VPERM2I128  $0x31, Y6, Y4, Y6
  2010  	VMOVDQA     Y8, Y4
  2011  	VPERM2I128  $0x20, Y7, Y5, Y8
  2012  	VPERM2I128  $0x31, Y7, Y5, Y7
  2013  	VMOVDQA     Y8, Y5
        	// Pass 2 (64-bit qwords).
  2014  	VPUNPCKLQDQ Y1, Y0, Y8
  2015  	VPUNPCKHQDQ Y1, Y0, Y1
  2016  	VMOVDQA     Y8, Y0
  2017  	VPUNPCKLQDQ Y3, Y2, Y8
  2018  	VPUNPCKHQDQ Y3, Y2, Y3
  2019  	VMOVDQA     Y8, Y2
  2020  	VPUNPCKLQDQ Y5, Y4, Y8
  2021  	VPUNPCKHQDQ Y5, Y4, Y5
  2022  	VMOVDQA     Y8, Y4
  2023  	VPUNPCKLQDQ Y7, Y6, Y8
  2024  	VPUNPCKHQDQ Y7, Y6, Y7
  2025  	VMOVDQA     Y8, Y6
        	// Pass 3 (32-bit dwords).
  2026  	VMOVSLDUP   Y2, Y8
  2027  	VPBLENDD    $0xaa, Y8, Y0, Y8
  2028  	VPSRLQ      $0x20, Y0, Y0
  2029  	VPBLENDD    $0xaa, Y2, Y0, Y2
  2030  	VMOVDQA     Y8, Y0
  2031  	VMOVSLDUP   Y3, Y8
  2032  	VPBLENDD    $0xaa, Y8, Y1, Y8
  2033  	VPSRLQ      $0x20, Y1, Y1
  2034  	VPBLENDD    $0xaa, Y3, Y1, Y3
  2035  	VMOVDQA     Y8, Y1
  2036  	VMOVSLDUP   Y6, Y8
  2037  	VPBLENDD    $0xaa, Y8, Y4, Y8
  2038  	VPSRLQ      $0x20, Y4, Y4
  2039  	VPBLENDD    $0xaa, Y6, Y4, Y6
  2040  	VMOVDQA     Y8, Y4
  2041  	VMOVSLDUP   Y7, Y8
  2042  	VPBLENDD    $0xaa, Y8, Y5, Y8
  2043  	VPSRLQ      $0x20, Y5, Y5
  2044  	VPBLENDD    $0xaa, Y7, Y5, Y7
  2045  	VMOVDQA     Y8, Y5
        	// Pass 4 (16-bit words).
  2046  	VPSLLD      $0x10, Y1, Y8
  2047  	VPBLENDW    $0xaa, Y8, Y0, Y8
  2048  	VPSRLD      $0x10, Y0, Y0
  2049  	VPBLENDW    $0xaa, Y1, Y0, Y1
  2050  	VMOVDQA     Y8, Y0
  2051  	VPSLLD      $0x10, Y3, Y8
  2052  	VPBLENDW    $0xaa, Y8, Y2, Y8
  2053  	VPSRLD      $0x10, Y2, Y2
  2054  	VPBLENDW    $0xaa, Y3, Y2, Y3
  2055  	VMOVDQA     Y8, Y2
  2056  	VPSLLD      $0x10, Y5, Y8
  2057  	VPBLENDW    $0xaa, Y8, Y4, Y8
  2058  	VPSRLD      $0x10, Y4, Y4
  2059  	VPBLENDW    $0xaa, Y5, Y4, Y5
  2060  	VMOVDQA     Y8, Y4
  2061  	VPSLLD      $0x10, Y7, Y8
  2062  	VPBLENDW    $0xaa, Y8, Y6, Y8
  2063  	VPSRLD      $0x10, Y6, Y6
  2064  	VPBLENDW    $0xaa, Y7, Y6, Y7
  2065  	VMOVDQA     Y8, Y6
        	// Write the reordered second half back over bytes 256-511 of p.
  2066  	VMOVDQU     Y0, 256(AX)
  2067  	VMOVDQU     Y1, 288(AX)
  2068  	VMOVDQU     Y2, 320(AX)
  2069  	VMOVDQU     Y3, 352(AX)
  2070  	VMOVDQU     Y4, 384(AX)
  2071  	VMOVDQU     Y5, 416(AX)
  2072  	VMOVDQU     Y6, 448(AX)
  2073  	VMOVDQU     Y7, 480(AX)
  2074  	RET
  2075  
  2076  // func barrettReduceAVX2(p *[256]int16)
  2077  // Requires: AVX, AVX2
        //
        // barrettReduceAVX2 replaces, in place, every int16 x in p with
        //
        //	x - ((x * 20159) >> 26) * 3329
        //
        // computed in signed 16-bit lanes. 3329 (0x0d01) is the Kyber prime q and
        // 20159 (0x4ebf) is 2^26/3329 rounded to the nearest integer, so the
        // amount subtracted is a multiple of q.
        //
        // The >> 26 is split in two: VPMULHW keeps the high 16 bits of the signed
        // 16x16 product (>> 16) and VPSRAW shifts arithmetically by a further 10.
        //
        // The 256 coefficients are processed in four unrolled batches of 64
        // (128 bytes): Y0..Y3 hold the values, Y4..Y7 the quotient estimates.
        // Y8 = 20159 and Y9 = 3329 stay constant throughout.
        //
        // Operand order is Go assembler order: sources first, destination last.
        // Frame is $0-8: no locals, one 8-byte pointer argument.
        // Clobbers: AX, CX, Y0-Y9.
        //
        // NOTE(review): the exact output range (believed to be 0..q inclusive) is
        // not evident from this code alone - confirm against the Go reference
        // implementation before relying on it.
  2078  TEXT ·barrettReduceAVX2(SB), NOSPLIT, $0-8
  2079  	MOVQ         p+0(FP), AX // AX = p
  2080  	MOVL         $0x00000d01, CX // 3329
  2081  	VMOVD        CX, X0
  2082  	VPBROADCASTW X0, Y9 // Y9 = 3329 in all 16 word lanes
  2083  	MOVL         $0x00004ebf, CX // 20159
  2084  	VMOVD        CX, X0
  2085  	VPBROADCASTW X0, Y8 // Y8 = 20159 in all 16 word lanes
        	// Batch 1: coefficients at bytes 0-127.
  2086  	VMOVDQU      (AX), Y0
  2087  	VMOVDQU      32(AX), Y1
  2088  	VMOVDQU      64(AX), Y2
  2089  	VMOVDQU      96(AX), Y3
  2090  	VPMULHW      Y8, Y0, Y4 // Y4 = (x * 20159) >> 16, signed
  2091  	VPMULHW      Y8, Y1, Y5
  2092  	VPMULHW      Y8, Y2, Y6
  2093  	VPMULHW      Y8, Y3, Y7
  2094  	VPSRAW       $0x0a, Y4, Y4 // Y4 = (x * 20159) >> 26, the quotient estimate
  2095  	VPSRAW       $0x0a, Y5, Y5
  2096  	VPSRAW       $0x0a, Y6, Y6
  2097  	VPSRAW       $0x0a, Y7, Y7
  2098  	VPMULLW      Y9, Y4, Y4 // Y4 = quotient * 3329 (low 16 bits)
  2099  	VPMULLW      Y9, Y5, Y5
  2100  	VPMULLW      Y9, Y6, Y6
  2101  	VPMULLW      Y9, Y7, Y7
  2102  	VPSUBW       Y4, Y0, Y0 // Y0 = x - quotient * 3329
  2103  	VPSUBW       Y5, Y1, Y1
  2104  	VPSUBW       Y6, Y2, Y2
  2105  	VPSUBW       Y7, Y3, Y3
  2106  	VMOVDQU      Y0, (AX)
  2107  	VMOVDQU      Y1, 32(AX)
  2108  	VMOVDQU      Y2, 64(AX)
  2109  	VMOVDQU      Y3, 96(AX)
        	// Batch 2: coefficients at bytes 128-255, same sequence.
  2110  	VMOVDQU      128(AX), Y0
  2111  	VMOVDQU      160(AX), Y1
  2112  	VMOVDQU      192(AX), Y2
  2113  	VMOVDQU      224(AX), Y3
  2114  	VPMULHW      Y8, Y0, Y4
  2115  	VPMULHW      Y8, Y1, Y5
  2116  	VPMULHW      Y8, Y2, Y6
  2117  	VPMULHW      Y8, Y3, Y7
  2118  	VPSRAW       $0x0a, Y4, Y4
  2119  	VPSRAW       $0x0a, Y5, Y5
  2120  	VPSRAW       $0x0a, Y6, Y6
  2121  	VPSRAW       $0x0a, Y7, Y7
  2122  	VPMULLW      Y9, Y4, Y4
  2123  	VPMULLW      Y9, Y5, Y5
  2124  	VPMULLW      Y9, Y6, Y6
  2125  	VPMULLW      Y9, Y7, Y7
  2126  	VPSUBW       Y4, Y0, Y0
  2127  	VPSUBW       Y5, Y1, Y1
  2128  	VPSUBW       Y6, Y2, Y2
  2129  	VPSUBW       Y7, Y3, Y3
  2130  	VMOVDQU      Y0, 128(AX)
  2131  	VMOVDQU      Y1, 160(AX)
  2132  	VMOVDQU      Y2, 192(AX)
  2133  	VMOVDQU      Y3, 224(AX)
        	// Batch 3: coefficients at bytes 256-383, same sequence.
  2134  	VMOVDQU      256(AX), Y0
  2135  	VMOVDQU      288(AX), Y1
  2136  	VMOVDQU      320(AX), Y2
  2137  	VMOVDQU      352(AX), Y3
  2138  	VPMULHW      Y8, Y0, Y4
  2139  	VPMULHW      Y8, Y1, Y5
  2140  	VPMULHW      Y8, Y2, Y6
  2141  	VPMULHW      Y8, Y3, Y7
  2142  	VPSRAW       $0x0a, Y4, Y4
  2143  	VPSRAW       $0x0a, Y5, Y5
  2144  	VPSRAW       $0x0a, Y6, Y6
  2145  	VPSRAW       $0x0a, Y7, Y7
  2146  	VPMULLW      Y9, Y4, Y4
  2147  	VPMULLW      Y9, Y5, Y5
  2148  	VPMULLW      Y9, Y6, Y6
  2149  	VPMULLW      Y9, Y7, Y7
  2150  	VPSUBW       Y4, Y0, Y0
  2151  	VPSUBW       Y5, Y1, Y1
  2152  	VPSUBW       Y6, Y2, Y2
  2153  	VPSUBW       Y7, Y3, Y3
  2154  	VMOVDQU      Y0, 256(AX)
  2155  	VMOVDQU      Y1, 288(AX)
  2156  	VMOVDQU      Y2, 320(AX)
  2157  	VMOVDQU      Y3, 352(AX)
        	// Batch 4: coefficients at bytes 384-511, same sequence.
  2158  	VMOVDQU      384(AX), Y0
  2159  	VMOVDQU      416(AX), Y1
  2160  	VMOVDQU      448(AX), Y2
  2161  	VMOVDQU      480(AX), Y3
  2162  	VPMULHW      Y8, Y0, Y4
  2163  	VPMULHW      Y8, Y1, Y5
  2164  	VPMULHW      Y8, Y2, Y6
  2165  	VPMULHW      Y8, Y3, Y7
  2166  	VPSRAW       $0x0a, Y4, Y4
  2167  	VPSRAW       $0x0a, Y5, Y5
  2168  	VPSRAW       $0x0a, Y6, Y6
  2169  	VPSRAW       $0x0a, Y7, Y7
  2170  	VPMULLW      Y9, Y4, Y4
  2171  	VPMULLW      Y9, Y5, Y5
  2172  	VPMULLW      Y9, Y6, Y6
  2173  	VPMULLW      Y9, Y7, Y7
  2174  	VPSUBW       Y4, Y0, Y0
  2175  	VPSUBW       Y5, Y1, Y1
  2176  	VPSUBW       Y6, Y2, Y2
  2177  	VPSUBW       Y7, Y3, Y3
  2178  	VMOVDQU      Y0, 384(AX)
  2179  	VMOVDQU      Y1, 416(AX)
  2180  	VMOVDQU      Y2, 448(AX)
  2181  	VMOVDQU      Y3, 480(AX)
  2182  	RET
  2183  
  2184  // func normalizeAVX2(p *[256]int16)
  2185  // Requires: AVX, AVX2
        //
        // normalizeAVX2 rewrites, in place, every int16 x in p in three steps, all
        // in signed 16-bit lanes and without branches:
        //
        //	r = x - ((x * 20159) >> 26) * 3329   // Barrett step
        //	r = r - 3329
        //	r = r + (3329 if r < 0 else 0)       // add q back only if negative
        //
        // 3329 (0x0d01) is the Kyber prime q and 20159 (0x4ebf) is 2^26/3329
        // rounded to the nearest integer. The conditional add is done with a
        // mask: VPSRAW $15 turns the sign bit into all-ones or all-zeros, which is
        // then ANDed with q.
        //
        // The 256 coefficients are processed in four unrolled batches of 64
        // (128 bytes): Y0..Y3 hold the values, Y4..Y7 are temporaries.
        // Y8 = 20159 and Y9 = 3329 stay constant throughout.
        //
        // Operand order is Go assembler order: sources first, destination last.
        // Frame is $0-8: no locals, one 8-byte pointer argument.
        // Clobbers: AX, CX, Y0-Y9.
        //
        // NOTE(review): presumably every coefficient ends up in [0, 3329); that
        // relies on the Barrett step yielding a value in [0, 3329] - confirm
        // against the Go reference implementation.
  2186  TEXT ·normalizeAVX2(SB), NOSPLIT, $0-8
  2187  	MOVQ         p+0(FP), AX // AX = p
  2188  	MOVL         $0x00000d01, CX // 3329
  2189  	VMOVD        CX, X0
  2190  	VPBROADCASTW X0, Y9 // Y9 = 3329 in all 16 word lanes
  2191  	MOVL         $0x00004ebf, CX // 20159
  2192  	VMOVD        CX, X0
  2193  	VPBROADCASTW X0, Y8 // Y8 = 20159 in all 16 word lanes
        	// Batch 1: coefficients at bytes 0-127.
  2194  	VMOVDQU      (AX), Y0
  2195  	VMOVDQU      32(AX), Y1
  2196  	VMOVDQU      64(AX), Y2
  2197  	VMOVDQU      96(AX), Y3
  2198  	VPMULHW      Y8, Y0, Y4 // Y4 = (x * 20159) >> 16, signed
  2199  	VPMULHW      Y8, Y1, Y5
  2200  	VPMULHW      Y8, Y2, Y6
  2201  	VPMULHW      Y8, Y3, Y7
  2202  	VPSRAW       $0x0a, Y4, Y4 // Y4 = (x * 20159) >> 26, the quotient estimate
  2203  	VPSRAW       $0x0a, Y5, Y5
  2204  	VPSRAW       $0x0a, Y6, Y6
  2205  	VPSRAW       $0x0a, Y7, Y7
  2206  	VPMULLW      Y9, Y4, Y4 // Y4 = quotient * 3329 (low 16 bits)
  2207  	VPMULLW      Y9, Y5, Y5
  2208  	VPMULLW      Y9, Y6, Y6
  2209  	VPMULLW      Y9, Y7, Y7
  2210  	VPSUBW       Y4, Y0, Y0 // Y0 = x - quotient * 3329
  2211  	VPSUBW       Y5, Y1, Y1
  2212  	VPSUBW       Y6, Y2, Y2
  2213  	VPSUBW       Y7, Y3, Y3
  2214  	VPSUBW       Y9, Y0, Y0 // Y0 -= 3329
  2215  	VPSUBW       Y9, Y1, Y1
  2216  	VPSUBW       Y9, Y2, Y2
  2217  	VPSUBW       Y9, Y3, Y3
  2218  	VPSRAW       $0x0f, Y0, Y4 // Y4 = 0xffff where Y0 is negative, else 0
  2219  	VPSRAW       $0x0f, Y1, Y5
  2220  	VPSRAW       $0x0f, Y2, Y6
  2221  	VPSRAW       $0x0f, Y3, Y7
  2222  	VPAND        Y4, Y9, Y4 // Y4 = 3329 where Y0 is negative, else 0
  2223  	VPAND        Y5, Y9, Y5
  2224  	VPAND        Y6, Y9, Y6
  2225  	VPAND        Y7, Y9, Y7
  2226  	VPADDW       Y0, Y4, Y0 // add 3329 back to the lanes that went negative
  2227  	VPADDW       Y1, Y5, Y1
  2228  	VPADDW       Y2, Y6, Y2
  2229  	VPADDW       Y3, Y7, Y3
  2230  	VMOVDQU      Y0, (AX)
  2231  	VMOVDQU      Y1, 32(AX)
  2232  	VMOVDQU      Y2, 64(AX)
  2233  	VMOVDQU      Y3, 96(AX)
        	// Batch 2: coefficients at bytes 128-255, same sequence.
  2234  	VMOVDQU      128(AX), Y0
  2235  	VMOVDQU      160(AX), Y1
  2236  	VMOVDQU      192(AX), Y2
  2237  	VMOVDQU      224(AX), Y3
  2238  	VPMULHW      Y8, Y0, Y4
  2239  	VPMULHW      Y8, Y1, Y5
  2240  	VPMULHW      Y8, Y2, Y6
  2241  	VPMULHW      Y8, Y3, Y7
  2242  	VPSRAW       $0x0a, Y4, Y4
  2243  	VPSRAW       $0x0a, Y5, Y5
  2244  	VPSRAW       $0x0a, Y6, Y6
  2245  	VPSRAW       $0x0a, Y7, Y7
  2246  	VPMULLW      Y9, Y4, Y4
  2247  	VPMULLW      Y9, Y5, Y5
  2248  	VPMULLW      Y9, Y6, Y6
  2249  	VPMULLW      Y9, Y7, Y7
  2250  	VPSUBW       Y4, Y0, Y0
  2251  	VPSUBW       Y5, Y1, Y1
  2252  	VPSUBW       Y6, Y2, Y2
  2253  	VPSUBW       Y7, Y3, Y3
  2254  	VPSUBW       Y9, Y0, Y0
  2255  	VPSUBW       Y9, Y1, Y1
  2256  	VPSUBW       Y9, Y2, Y2
  2257  	VPSUBW       Y9, Y3, Y3
  2258  	VPSRAW       $0x0f, Y0, Y4
  2259  	VPSRAW       $0x0f, Y1, Y5
  2260  	VPSRAW       $0x0f, Y2, Y6
  2261  	VPSRAW       $0x0f, Y3, Y7
  2262  	VPAND        Y4, Y9, Y4
  2263  	VPAND        Y5, Y9, Y5
  2264  	VPAND        Y6, Y9, Y6
  2265  	VPAND        Y7, Y9, Y7
  2266  	VPADDW       Y0, Y4, Y0
  2267  	VPADDW       Y1, Y5, Y1
  2268  	VPADDW       Y2, Y6, Y2
  2269  	VPADDW       Y3, Y7, Y3
  2270  	VMOVDQU      Y0, 128(AX)
  2271  	VMOVDQU      Y1, 160(AX)
  2272  	VMOVDQU      Y2, 192(AX)
  2273  	VMOVDQU      Y3, 224(AX)
        	// Batch 3: coefficients at bytes 256-383, same sequence.
  2274  	VMOVDQU      256(AX), Y0
  2275  	VMOVDQU      288(AX), Y1
  2276  	VMOVDQU      320(AX), Y2
  2277  	VMOVDQU      352(AX), Y3
  2278  	VPMULHW      Y8, Y0, Y4
  2279  	VPMULHW      Y8, Y1, Y5
  2280  	VPMULHW      Y8, Y2, Y6
  2281  	VPMULHW      Y8, Y3, Y7
  2282  	VPSRAW       $0x0a, Y4, Y4
  2283  	VPSRAW       $0x0a, Y5, Y5
  2284  	VPSRAW       $0x0a, Y6, Y6
  2285  	VPSRAW       $0x0a, Y7, Y7
  2286  	VPMULLW      Y9, Y4, Y4
  2287  	VPMULLW      Y9, Y5, Y5
  2288  	VPMULLW      Y9, Y6, Y6
  2289  	VPMULLW      Y9, Y7, Y7
  2290  	VPSUBW       Y4, Y0, Y0
  2291  	VPSUBW       Y5, Y1, Y1
  2292  	VPSUBW       Y6, Y2, Y2
  2293  	VPSUBW       Y7, Y3, Y3
  2294  	VPSUBW       Y9, Y0, Y0
  2295  	VPSUBW       Y9, Y1, Y1
  2296  	VPSUBW       Y9, Y2, Y2
  2297  	VPSUBW       Y9, Y3, Y3
  2298  	VPSRAW       $0x0f, Y0, Y4
  2299  	VPSRAW       $0x0f, Y1, Y5
  2300  	VPSRAW       $0x0f, Y2, Y6
  2301  	VPSRAW       $0x0f, Y3, Y7
  2302  	VPAND        Y4, Y9, Y4
  2303  	VPAND        Y5, Y9, Y5
  2304  	VPAND        Y6, Y9, Y6
  2305  	VPAND        Y7, Y9, Y7
  2306  	VPADDW       Y0, Y4, Y0
  2307  	VPADDW       Y1, Y5, Y1
  2308  	VPADDW       Y2, Y6, Y2
  2309  	VPADDW       Y3, Y7, Y3
  2310  	VMOVDQU      Y0, 256(AX)
  2311  	VMOVDQU      Y1, 288(AX)
  2312  	VMOVDQU      Y2, 320(AX)
  2313  	VMOVDQU      Y3, 352(AX)
        	// Batch 4: coefficients at bytes 384-511, same sequence.
  2314  	VMOVDQU      384(AX), Y0
  2315  	VMOVDQU      416(AX), Y1
  2316  	VMOVDQU      448(AX), Y2
  2317  	VMOVDQU      480(AX), Y3
  2318  	VPMULHW      Y8, Y0, Y4
  2319  	VPMULHW      Y8, Y1, Y5
  2320  	VPMULHW      Y8, Y2, Y6
  2321  	VPMULHW      Y8, Y3, Y7
  2322  	VPSRAW       $0x0a, Y4, Y4
  2323  	VPSRAW       $0x0a, Y5, Y5
  2324  	VPSRAW       $0x0a, Y6, Y6
  2325  	VPSRAW       $0x0a, Y7, Y7
  2326  	VPMULLW      Y9, Y4, Y4
  2327  	VPMULLW      Y9, Y5, Y5
  2328  	VPMULLW      Y9, Y6, Y6
  2329  	VPMULLW      Y9, Y7, Y7
  2330  	VPSUBW       Y4, Y0, Y0
  2331  	VPSUBW       Y5, Y1, Y1
  2332  	VPSUBW       Y6, Y2, Y2
  2333  	VPSUBW       Y7, Y3, Y3
  2334  	VPSUBW       Y9, Y0, Y0
  2335  	VPSUBW       Y9, Y1, Y1
  2336  	VPSUBW       Y9, Y2, Y2
  2337  	VPSUBW       Y9, Y3, Y3
  2338  	VPSRAW       $0x0f, Y0, Y4
  2339  	VPSRAW       $0x0f, Y1, Y5
  2340  	VPSRAW       $0x0f, Y2, Y6
  2341  	VPSRAW       $0x0f, Y3, Y7
  2342  	VPAND        Y4, Y9, Y4
  2343  	VPAND        Y5, Y9, Y5
  2344  	VPAND        Y6, Y9, Y6
  2345  	VPAND        Y7, Y9, Y7
  2346  	VPADDW       Y0, Y4, Y0
  2347  	VPADDW       Y1, Y5, Y1
  2348  	VPADDW       Y2, Y6, Y2
  2349  	VPADDW       Y3, Y7, Y3
  2350  	VMOVDQU      Y0, 384(AX)
  2351  	VMOVDQU      Y1, 416(AX)
  2352  	VMOVDQU      Y2, 448(AX)
  2353  	VMOVDQU      Y3, 480(AX)
  2354  	RET