git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/crypto/internal/blake3/blake3_amd64.s (about)

     1  // Code generated by command: go run gen.go -out blake3_amd64.s. DO NOT EDIT.
     2  
     3  #include "textflag.h"
     4  
     5  DATA iv<>+0(SB)/4, $0x6a09e667 // iv: four 32-bit initialization constants; word 0 is broadcast into Z16
     6  DATA iv<>+4(SB)/4, $0xbb67ae85 // word 1, broadcast into Z18
     7  DATA iv<>+8(SB)/4, $0x3c6ef372 // word 2, broadcast into Z20
     8  DATA iv<>+12(SB)/4, $0xa54ff53a // word 3, broadcast into Z22
     9  GLOBL iv<>(SB), RODATA|NOPTR, $16 // file-local, read-only, 16 bytes, contains no pointers
    10  
    11  DATA seq<>+0(SB)/4, $0x00000000 // seq: the 32-bit integers 0..15, i.e. one lane index per dword of a ZMM register
    12  DATA seq<>+4(SB)/4, $0x00000001 // seq+4 is also read on its own (broadcast) as the constant 1
    13  DATA seq<>+8(SB)/4, $0x00000002
    14  DATA seq<>+12(SB)/4, $0x00000003
    15  DATA seq<>+16(SB)/4, $0x00000004
    16  DATA seq<>+20(SB)/4, $0x00000005
    17  DATA seq<>+24(SB)/4, $0x00000006
    18  DATA seq<>+28(SB)/4, $0x00000007
    19  DATA seq<>+32(SB)/4, $0x00000008
    20  DATA seq<>+36(SB)/4, $0x00000009
    21  DATA seq<>+40(SB)/4, $0x0000000a
    22  DATA seq<>+44(SB)/4, $0x0000000b
    23  DATA seq<>+48(SB)/4, $0x0000000c
    24  DATA seq<>+52(SB)/4, $0x0000000d
    25  DATA seq<>+56(SB)/4, $0x0000000e
    26  DATA seq<>+60(SB)/4, $0x0000000f
    27  GLOBL seq<>(SB), RODATA|NOPTR, $64 // added to the counter per lane and, after a left shift, used as gather/scatter byte offsets
    28  
    29  DATA seq64<>+0(SB)/8, $0x0000000000000000 // seq64: the 64-bit integers 0..7 (quadword counterpart of seq)
    30  DATA seq64<>+8(SB)/8, $0x0000000000000001
    31  DATA seq64<>+16(SB)/8, $0x0000000000000002
    32  DATA seq64<>+24(SB)/8, $0x0000000000000003
    33  DATA seq64<>+32(SB)/8, $0x0000000000000004
    34  DATA seq64<>+40(SB)/8, $0x0000000000000005
    35  DATA seq64<>+48(SB)/8, $0x0000000000000006
    36  DATA seq64<>+56(SB)/8, $0x0000000000000007
    37  GLOBL seq64<>(SB), RODATA|NOPTR, $64 // NOTE(review): presumably per-lane offsets for a routine that keeps 64-bit counters; confirm at its use sites
    38  
    39  DATA shuffle_rot8<>+0(SB)/4, $0x00030201 // shuffle_rot8: 32 byte indices; in memory each 4-byte group reads base+1, base+2, base+3, base+0
    40  DATA shuffle_rot8<>+4(SB)/4, $0x04070605 // as a byte-shuffle control that pattern rotates each 32-bit word right by 8 bits
    41  DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09
    42  DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d
    43  DATA shuffle_rot8<>+16(SB)/4, $0x10131211
    44  DATA shuffle_rot8<>+20(SB)/4, $0x14171615
    45  DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19
    46  DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d
    47  GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32 // NOTE(review): presumably a VPSHUFB mask for a 256-bit code path; confirm at its use sites
    48  
    49  DATA shuffle_rot16<>+0(SB)/4, $0x01000302 // shuffle_rot16: 32 byte indices; in memory each 4-byte group reads base+2, base+3, base+0, base+1
    50  DATA shuffle_rot16<>+4(SB)/4, $0x05040706 // as a byte-shuffle control that pattern rotates each 32-bit word by 16 bits (swaps its halves)
    51  DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a
    52  DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e
    53  DATA shuffle_rot16<>+16(SB)/4, $0x11101312
    54  DATA shuffle_rot16<>+20(SB)/4, $0x15141716
    55  DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a
    56  DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
    57  GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32 // NOTE(review): presumably a VPSHUFB mask for a 256-bit code path; confirm at its use sites
    58  
    59  // func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
    60  // Requires: AVX512BW, AVX512F
        //
        // Runs 16 compressions at once, one per 32-bit lane of the ZMM registers.
        // Every lane uses the same message block, chaining value, blockLen and flags;
        // lane i uses the 64-bit counter value counter+i. Lane i's 16 result words are
        // stored to out[64*i : 64*i+64]: words 0-7 are v[k]^v[k+8], words 8-15 are
        // v[k+8]^cv[k].
        //
        // Register layout (each register holds one word for all 16 lanes):
        //   Z0, Z2, ..., Z30  state words v0..v15   (v[k] lives in Z(2k))
        //   Z1, Z3, ..., Z31  message words m0..m15 (m[k] lives in Z(2k+1))
        //   K1                counter-carry mask, later the scatter write mask
        //   AX = out, CX = block, DX = cv
        //
        // Go operand order: the last operand is the destination, so
        // "VPADDD Z0, Z8, Z0" computes Z0 = Z8 + Z0. Each 14-instruction group is one
        // G mixing step on (a, b, c, d) with two message words (mx, my); the first
        // group of round 1 is annotated line by line and all others follow the same shape.
    61  TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
    62  	MOVQ out+0(FP), AX
    63  	MOVQ block+8(FP), CX
    64  	MOVQ cv+16(FP), DX
    65  
    66  	// Initialize block vectors: broadcast message word m[k] to all 16 lanes of Z(2k+1)
    67  	VPBROADCASTD (CX), Z1
    68  	VPBROADCASTD 4(CX), Z3
    69  	VPBROADCASTD 8(CX), Z5
    70  	VPBROADCASTD 12(CX), Z7
    71  	VPBROADCASTD 16(CX), Z9
    72  	VPBROADCASTD 20(CX), Z11
    73  	VPBROADCASTD 24(CX), Z13
    74  	VPBROADCASTD 28(CX), Z15
    75  	VPBROADCASTD 32(CX), Z17
    76  	VPBROADCASTD 36(CX), Z19
    77  	VPBROADCASTD 40(CX), Z21
    78  	VPBROADCASTD 44(CX), Z23
    79  	VPBROADCASTD 48(CX), Z25
    80  	VPBROADCASTD 52(CX), Z27
    81  	VPBROADCASTD 56(CX), Z29
    82  	VPBROADCASTD 60(CX), Z31
    83  
    84  	// Initialize state vectors: v0-v7 = cv, v8-v11 = iv, v12/v13 = per-lane counter (low/high), v14 = blockLen, v15 = flags
    85  	VPBROADCASTD (DX), Z0
    86  	VPBROADCASTD 4(DX), Z2
    87  	VPBROADCASTD 8(DX), Z4
    88  	VPBROADCASTD 12(DX), Z6
    89  	VPBROADCASTD 16(DX), Z8
    90  	VPBROADCASTD 20(DX), Z10
    91  	VPBROADCASTD 24(DX), Z12
    92  	VPBROADCASTD 28(DX), Z14
    93  	VPBROADCASTD iv<>+0(SB), Z16
    94  	VPBROADCASTD iv<>+4(SB), Z18
    95  	VPBROADCASTD iv<>+8(SB), Z20
    96  	VPBROADCASTD iv<>+12(SB), Z22
    97  	VPBROADCASTD counter+24(FP), Z24 // low 32 bits of counter in every lane
    98  	VPADDD       seq<>+0(SB), Z24, Z24 // lane i: low word += i (may wrap)
    99  	VPCMPUD      $0x01, seq<>+0(SB), Z24, K1 // K1[i] = (sum < i) unsigned, i.e. the add in lane i wrapped
   100  	VPBROADCASTD counter+28(FP), Z26 // high 32 bits of counter in every lane
   101  	VPADDD.BCST  seq<>+4(SB), Z26, K1, Z26 // add 1 (seq[1]) to the high word only in lanes flagged by K1
   102  	VPBROADCASTD blockLen+32(FP), Z28
   103  	VPBROADCASTD flags+36(FP), Z30
   104  
   105  	// Round 1: columns (v0,v4,v8,v12)..(v3,v7,v11,v15), then diagonals; message order m0..m15
   106  	VPADDD Z0, Z8, Z0 // G(v0,v4,v8,v12; m0,m1): a += b
   107  	VPADDD Z1, Z0, Z0 // a += mx
   108  	VPXORD Z24, Z0, Z24 // d ^= a
   109  	VPRORD $0x10, Z24, Z24 // d = d rotated right by 16
   110  	VPADDD Z16, Z24, Z16 // c += d
   111  	VPXORD Z8, Z16, Z8 // b ^= c
   112  	VPRORD $0x0c, Z8, Z8 // b = b rotated right by 12
   113  	VPADDD Z0, Z8, Z0 // a += b
   114  	VPADDD Z3, Z0, Z0 // a += my
   115  	VPXORD Z24, Z0, Z24 // d ^= a
   116  	VPRORD $0x08, Z24, Z24 // d = d rotated right by 8
   117  	VPADDD Z16, Z24, Z16 // c += d
   118  	VPXORD Z8, Z16, Z8 // b ^= c
   119  	VPRORD $0x07, Z8, Z8 // b = b rotated right by 7
   120  	VPADDD Z2, Z10, Z2 // G(v1,v5,v9,v13; m2,m3)
   121  	VPADDD Z5, Z2, Z2
   122  	VPXORD Z26, Z2, Z26
   123  	VPRORD $0x10, Z26, Z26
   124  	VPADDD Z18, Z26, Z18
   125  	VPXORD Z10, Z18, Z10
   126  	VPRORD $0x0c, Z10, Z10
   127  	VPADDD Z2, Z10, Z2
   128  	VPADDD Z7, Z2, Z2
   129  	VPXORD Z26, Z2, Z26
   130  	VPRORD $0x08, Z26, Z26
   131  	VPADDD Z18, Z26, Z18
   132  	VPXORD Z10, Z18, Z10
   133  	VPRORD $0x07, Z10, Z10
   134  	VPADDD Z4, Z12, Z4 // G(v2,v6,v10,v14; m4,m5)
   135  	VPADDD Z9, Z4, Z4
   136  	VPXORD Z28, Z4, Z28
   137  	VPRORD $0x10, Z28, Z28
   138  	VPADDD Z20, Z28, Z20
   139  	VPXORD Z12, Z20, Z12
   140  	VPRORD $0x0c, Z12, Z12
   141  	VPADDD Z4, Z12, Z4
   142  	VPADDD Z11, Z4, Z4
   143  	VPXORD Z28, Z4, Z28
   144  	VPRORD $0x08, Z28, Z28
   145  	VPADDD Z20, Z28, Z20
   146  	VPXORD Z12, Z20, Z12
   147  	VPRORD $0x07, Z12, Z12
   148  	VPADDD Z6, Z14, Z6 // G(v3,v7,v11,v15; m6,m7)
   149  	VPADDD Z13, Z6, Z6
   150  	VPXORD Z30, Z6, Z30
   151  	VPRORD $0x10, Z30, Z30
   152  	VPADDD Z22, Z30, Z22
   153  	VPXORD Z14, Z22, Z14
   154  	VPRORD $0x0c, Z14, Z14
   155  	VPADDD Z6, Z14, Z6
   156  	VPADDD Z15, Z6, Z6
   157  	VPXORD Z30, Z6, Z30
   158  	VPRORD $0x08, Z30, Z30
   159  	VPADDD Z22, Z30, Z22
   160  	VPXORD Z14, Z22, Z14
   161  	VPRORD $0x07, Z14, Z14
   162  	VPADDD Z0, Z10, Z0 // G(v0,v5,v10,v15; m8,m9)
   163  	VPADDD Z17, Z0, Z0
   164  	VPXORD Z30, Z0, Z30
   165  	VPRORD $0x10, Z30, Z30
   166  	VPADDD Z20, Z30, Z20
   167  	VPXORD Z10, Z20, Z10
   168  	VPRORD $0x0c, Z10, Z10
   169  	VPADDD Z0, Z10, Z0
   170  	VPADDD Z19, Z0, Z0
   171  	VPXORD Z30, Z0, Z30
   172  	VPRORD $0x08, Z30, Z30
   173  	VPADDD Z20, Z30, Z20
   174  	VPXORD Z10, Z20, Z10
   175  	VPRORD $0x07, Z10, Z10
   176  	VPADDD Z2, Z12, Z2 // G(v1,v6,v11,v12; m10,m11)
   177  	VPADDD Z21, Z2, Z2
   178  	VPXORD Z24, Z2, Z24
   179  	VPRORD $0x10, Z24, Z24
   180  	VPADDD Z22, Z24, Z22
   181  	VPXORD Z12, Z22, Z12
   182  	VPRORD $0x0c, Z12, Z12
   183  	VPADDD Z2, Z12, Z2
   184  	VPADDD Z23, Z2, Z2
   185  	VPXORD Z24, Z2, Z24
   186  	VPRORD $0x08, Z24, Z24
   187  	VPADDD Z22, Z24, Z22
   188  	VPXORD Z12, Z22, Z12
   189  	VPRORD $0x07, Z12, Z12
   190  	VPADDD Z4, Z14, Z4 // G(v2,v7,v8,v13; m12,m13)
   191  	VPADDD Z25, Z4, Z4
   192  	VPXORD Z26, Z4, Z26
   193  	VPRORD $0x10, Z26, Z26
   194  	VPADDD Z16, Z26, Z16
   195  	VPXORD Z14, Z16, Z14
   196  	VPRORD $0x0c, Z14, Z14
   197  	VPADDD Z4, Z14, Z4
   198  	VPADDD Z27, Z4, Z4
   199  	VPXORD Z26, Z4, Z26
   200  	VPRORD $0x08, Z26, Z26
   201  	VPADDD Z16, Z26, Z16
   202  	VPXORD Z14, Z16, Z14
   203  	VPRORD $0x07, Z14, Z14
   204  	VPADDD Z6, Z8, Z6 // G(v3,v4,v9,v14; m14,m15)
   205  	VPADDD Z29, Z6, Z6
   206  	VPXORD Z28, Z6, Z28
   207  	VPRORD $0x10, Z28, Z28
   208  	VPADDD Z18, Z28, Z18
   209  	VPXORD Z8, Z18, Z8
   210  	VPRORD $0x0c, Z8, Z8
   211  	VPADDD Z6, Z8, Z6
   212  	VPADDD Z31, Z6, Z6
   213  	VPXORD Z28, Z6, Z28
   214  	VPRORD $0x08, Z28, Z28
   215  	VPADDD Z18, Z28, Z18
   216  	VPXORD Z8, Z18, Z8
   217  	VPRORD $0x07, Z8, Z8
   218  
   219  	// Round 2: same G sequence; message order m2,m6,m3,m10,m7,m0,m4,m13,m1,m11,m12,m5,m9,m14,m15,m8
   220  	VPADDD Z0, Z8, Z0 // G(v0,v4,v8,v12; m2,m6)
   221  	VPADDD Z5, Z0, Z0
   222  	VPXORD Z24, Z0, Z24
   223  	VPRORD $0x10, Z24, Z24
   224  	VPADDD Z16, Z24, Z16
   225  	VPXORD Z8, Z16, Z8
   226  	VPRORD $0x0c, Z8, Z8
   227  	VPADDD Z0, Z8, Z0
   228  	VPADDD Z13, Z0, Z0
   229  	VPXORD Z24, Z0, Z24
   230  	VPRORD $0x08, Z24, Z24
   231  	VPADDD Z16, Z24, Z16
   232  	VPXORD Z8, Z16, Z8
   233  	VPRORD $0x07, Z8, Z8
   234  	VPADDD Z2, Z10, Z2 // G(v1,v5,v9,v13; m3,m10)
   235  	VPADDD Z7, Z2, Z2
   236  	VPXORD Z26, Z2, Z26
   237  	VPRORD $0x10, Z26, Z26
   238  	VPADDD Z18, Z26, Z18
   239  	VPXORD Z10, Z18, Z10
   240  	VPRORD $0x0c, Z10, Z10
   241  	VPADDD Z2, Z10, Z2
   242  	VPADDD Z21, Z2, Z2
   243  	VPXORD Z26, Z2, Z26
   244  	VPRORD $0x08, Z26, Z26
   245  	VPADDD Z18, Z26, Z18
   246  	VPXORD Z10, Z18, Z10
   247  	VPRORD $0x07, Z10, Z10
   248  	VPADDD Z4, Z12, Z4 // G(v2,v6,v10,v14; m7,m0)
   249  	VPADDD Z15, Z4, Z4
   250  	VPXORD Z28, Z4, Z28
   251  	VPRORD $0x10, Z28, Z28
   252  	VPADDD Z20, Z28, Z20
   253  	VPXORD Z12, Z20, Z12
   254  	VPRORD $0x0c, Z12, Z12
   255  	VPADDD Z4, Z12, Z4
   256  	VPADDD Z1, Z4, Z4
   257  	VPXORD Z28, Z4, Z28
   258  	VPRORD $0x08, Z28, Z28
   259  	VPADDD Z20, Z28, Z20
   260  	VPXORD Z12, Z20, Z12
   261  	VPRORD $0x07, Z12, Z12
   262  	VPADDD Z6, Z14, Z6 // G(v3,v7,v11,v15; m4,m13)
   263  	VPADDD Z9, Z6, Z6
   264  	VPXORD Z30, Z6, Z30
   265  	VPRORD $0x10, Z30, Z30
   266  	VPADDD Z22, Z30, Z22
   267  	VPXORD Z14, Z22, Z14
   268  	VPRORD $0x0c, Z14, Z14
   269  	VPADDD Z6, Z14, Z6
   270  	VPADDD Z27, Z6, Z6
   271  	VPXORD Z30, Z6, Z30
   272  	VPRORD $0x08, Z30, Z30
   273  	VPADDD Z22, Z30, Z22
   274  	VPXORD Z14, Z22, Z14
   275  	VPRORD $0x07, Z14, Z14
   276  	VPADDD Z0, Z10, Z0 // G(v0,v5,v10,v15; m1,m11)
   277  	VPADDD Z3, Z0, Z0
   278  	VPXORD Z30, Z0, Z30
   279  	VPRORD $0x10, Z30, Z30
   280  	VPADDD Z20, Z30, Z20
   281  	VPXORD Z10, Z20, Z10
   282  	VPRORD $0x0c, Z10, Z10
   283  	VPADDD Z0, Z10, Z0
   284  	VPADDD Z23, Z0, Z0
   285  	VPXORD Z30, Z0, Z30
   286  	VPRORD $0x08, Z30, Z30
   287  	VPADDD Z20, Z30, Z20
   288  	VPXORD Z10, Z20, Z10
   289  	VPRORD $0x07, Z10, Z10
   290  	VPADDD Z2, Z12, Z2 // G(v1,v6,v11,v12; m12,m5)
   291  	VPADDD Z25, Z2, Z2
   292  	VPXORD Z24, Z2, Z24
   293  	VPRORD $0x10, Z24, Z24
   294  	VPADDD Z22, Z24, Z22
   295  	VPXORD Z12, Z22, Z12
   296  	VPRORD $0x0c, Z12, Z12
   297  	VPADDD Z2, Z12, Z2
   298  	VPADDD Z11, Z2, Z2
   299  	VPXORD Z24, Z2, Z24
   300  	VPRORD $0x08, Z24, Z24
   301  	VPADDD Z22, Z24, Z22
   302  	VPXORD Z12, Z22, Z12
   303  	VPRORD $0x07, Z12, Z12
   304  	VPADDD Z4, Z14, Z4 // G(v2,v7,v8,v13; m9,m14)
   305  	VPADDD Z19, Z4, Z4
   306  	VPXORD Z26, Z4, Z26
   307  	VPRORD $0x10, Z26, Z26
   308  	VPADDD Z16, Z26, Z16
   309  	VPXORD Z14, Z16, Z14
   310  	VPRORD $0x0c, Z14, Z14
   311  	VPADDD Z4, Z14, Z4
   312  	VPADDD Z29, Z4, Z4
   313  	VPXORD Z26, Z4, Z26
   314  	VPRORD $0x08, Z26, Z26
   315  	VPADDD Z16, Z26, Z16
   316  	VPXORD Z14, Z16, Z14
   317  	VPRORD $0x07, Z14, Z14
   318  	VPADDD Z6, Z8, Z6 // G(v3,v4,v9,v14; m15,m8)
   319  	VPADDD Z31, Z6, Z6
   320  	VPXORD Z28, Z6, Z28
   321  	VPRORD $0x10, Z28, Z28
   322  	VPADDD Z18, Z28, Z18
   323  	VPXORD Z8, Z18, Z8
   324  	VPRORD $0x0c, Z8, Z8
   325  	VPADDD Z6, Z8, Z6
   326  	VPADDD Z17, Z6, Z6
   327  	VPXORD Z28, Z6, Z28
   328  	VPRORD $0x08, Z28, Z28
   329  	VPADDD Z18, Z28, Z18
   330  	VPXORD Z8, Z18, Z8
   331  	VPRORD $0x07, Z8, Z8
   332  
   333  	// Round 3: message order m3,m4,m10,m12,m13,m2,m7,m14,m6,m5,m9,m0,m11,m15,m8,m1
   334  	VPADDD Z0, Z8, Z0 // G(v0,v4,v8,v12; m3,m4)
   335  	VPADDD Z7, Z0, Z0
   336  	VPXORD Z24, Z0, Z24
   337  	VPRORD $0x10, Z24, Z24
   338  	VPADDD Z16, Z24, Z16
   339  	VPXORD Z8, Z16, Z8
   340  	VPRORD $0x0c, Z8, Z8
   341  	VPADDD Z0, Z8, Z0
   342  	VPADDD Z9, Z0, Z0
   343  	VPXORD Z24, Z0, Z24
   344  	VPRORD $0x08, Z24, Z24
   345  	VPADDD Z16, Z24, Z16
   346  	VPXORD Z8, Z16, Z8
   347  	VPRORD $0x07, Z8, Z8
   348  	VPADDD Z2, Z10, Z2 // G(v1,v5,v9,v13; m10,m12)
   349  	VPADDD Z21, Z2, Z2
   350  	VPXORD Z26, Z2, Z26
   351  	VPRORD $0x10, Z26, Z26
   352  	VPADDD Z18, Z26, Z18
   353  	VPXORD Z10, Z18, Z10
   354  	VPRORD $0x0c, Z10, Z10
   355  	VPADDD Z2, Z10, Z2
   356  	VPADDD Z25, Z2, Z2
   357  	VPXORD Z26, Z2, Z26
   358  	VPRORD $0x08, Z26, Z26
   359  	VPADDD Z18, Z26, Z18
   360  	VPXORD Z10, Z18, Z10
   361  	VPRORD $0x07, Z10, Z10
   362  	VPADDD Z4, Z12, Z4 // G(v2,v6,v10,v14; m13,m2)
   363  	VPADDD Z27, Z4, Z4
   364  	VPXORD Z28, Z4, Z28
   365  	VPRORD $0x10, Z28, Z28
   366  	VPADDD Z20, Z28, Z20
   367  	VPXORD Z12, Z20, Z12
   368  	VPRORD $0x0c, Z12, Z12
   369  	VPADDD Z4, Z12, Z4
   370  	VPADDD Z5, Z4, Z4
   371  	VPXORD Z28, Z4, Z28
   372  	VPRORD $0x08, Z28, Z28
   373  	VPADDD Z20, Z28, Z20
   374  	VPXORD Z12, Z20, Z12
   375  	VPRORD $0x07, Z12, Z12
   376  	VPADDD Z6, Z14, Z6 // G(v3,v7,v11,v15; m7,m14)
   377  	VPADDD Z15, Z6, Z6
   378  	VPXORD Z30, Z6, Z30
   379  	VPRORD $0x10, Z30, Z30
   380  	VPADDD Z22, Z30, Z22
   381  	VPXORD Z14, Z22, Z14
   382  	VPRORD $0x0c, Z14, Z14
   383  	VPADDD Z6, Z14, Z6
   384  	VPADDD Z29, Z6, Z6
   385  	VPXORD Z30, Z6, Z30
   386  	VPRORD $0x08, Z30, Z30
   387  	VPADDD Z22, Z30, Z22
   388  	VPXORD Z14, Z22, Z14
   389  	VPRORD $0x07, Z14, Z14
   390  	VPADDD Z0, Z10, Z0 // G(v0,v5,v10,v15; m6,m5)
   391  	VPADDD Z13, Z0, Z0
   392  	VPXORD Z30, Z0, Z30
   393  	VPRORD $0x10, Z30, Z30
   394  	VPADDD Z20, Z30, Z20
   395  	VPXORD Z10, Z20, Z10
   396  	VPRORD $0x0c, Z10, Z10
   397  	VPADDD Z0, Z10, Z0
   398  	VPADDD Z11, Z0, Z0
   399  	VPXORD Z30, Z0, Z30
   400  	VPRORD $0x08, Z30, Z30
   401  	VPADDD Z20, Z30, Z20
   402  	VPXORD Z10, Z20, Z10
   403  	VPRORD $0x07, Z10, Z10
   404  	VPADDD Z2, Z12, Z2 // G(v1,v6,v11,v12; m9,m0)
   405  	VPADDD Z19, Z2, Z2
   406  	VPXORD Z24, Z2, Z24
   407  	VPRORD $0x10, Z24, Z24
   408  	VPADDD Z22, Z24, Z22
   409  	VPXORD Z12, Z22, Z12
   410  	VPRORD $0x0c, Z12, Z12
   411  	VPADDD Z2, Z12, Z2
   412  	VPADDD Z1, Z2, Z2
   413  	VPXORD Z24, Z2, Z24
   414  	VPRORD $0x08, Z24, Z24
   415  	VPADDD Z22, Z24, Z22
   416  	VPXORD Z12, Z22, Z12
   417  	VPRORD $0x07, Z12, Z12
   418  	VPADDD Z4, Z14, Z4 // G(v2,v7,v8,v13; m11,m15)
   419  	VPADDD Z23, Z4, Z4
   420  	VPXORD Z26, Z4, Z26
   421  	VPRORD $0x10, Z26, Z26
   422  	VPADDD Z16, Z26, Z16
   423  	VPXORD Z14, Z16, Z14
   424  	VPRORD $0x0c, Z14, Z14
   425  	VPADDD Z4, Z14, Z4
   426  	VPADDD Z31, Z4, Z4
   427  	VPXORD Z26, Z4, Z26
   428  	VPRORD $0x08, Z26, Z26
   429  	VPADDD Z16, Z26, Z16
   430  	VPXORD Z14, Z16, Z14
   431  	VPRORD $0x07, Z14, Z14
   432  	VPADDD Z6, Z8, Z6 // G(v3,v4,v9,v14; m8,m1)
   433  	VPADDD Z17, Z6, Z6
   434  	VPXORD Z28, Z6, Z28
   435  	VPRORD $0x10, Z28, Z28
   436  	VPADDD Z18, Z28, Z18
   437  	VPXORD Z8, Z18, Z8
   438  	VPRORD $0x0c, Z8, Z8
   439  	VPADDD Z6, Z8, Z6
   440  	VPADDD Z3, Z6, Z6
   441  	VPXORD Z28, Z6, Z28
   442  	VPRORD $0x08, Z28, Z28
   443  	VPADDD Z18, Z28, Z18
   444  	VPXORD Z8, Z18, Z8
   445  	VPRORD $0x07, Z8, Z8
   446  
   447  	// Round 4: message order m10,m7,m12,m9,m14,m3,m13,m15,m4,m0,m11,m2,m5,m8,m1,m6
   448  	VPADDD Z0, Z8, Z0 // G(v0,v4,v8,v12; m10,m7)
   449  	VPADDD Z21, Z0, Z0
   450  	VPXORD Z24, Z0, Z24
   451  	VPRORD $0x10, Z24, Z24
   452  	VPADDD Z16, Z24, Z16
   453  	VPXORD Z8, Z16, Z8
   454  	VPRORD $0x0c, Z8, Z8
   455  	VPADDD Z0, Z8, Z0
   456  	VPADDD Z15, Z0, Z0
   457  	VPXORD Z24, Z0, Z24
   458  	VPRORD $0x08, Z24, Z24
   459  	VPADDD Z16, Z24, Z16
   460  	VPXORD Z8, Z16, Z8
   461  	VPRORD $0x07, Z8, Z8
   462  	VPADDD Z2, Z10, Z2 // G(v1,v5,v9,v13; m12,m9)
   463  	VPADDD Z25, Z2, Z2
   464  	VPXORD Z26, Z2, Z26
   465  	VPRORD $0x10, Z26, Z26
   466  	VPADDD Z18, Z26, Z18
   467  	VPXORD Z10, Z18, Z10
   468  	VPRORD $0x0c, Z10, Z10
   469  	VPADDD Z2, Z10, Z2
   470  	VPADDD Z19, Z2, Z2
   471  	VPXORD Z26, Z2, Z26
   472  	VPRORD $0x08, Z26, Z26
   473  	VPADDD Z18, Z26, Z18
   474  	VPXORD Z10, Z18, Z10
   475  	VPRORD $0x07, Z10, Z10
   476  	VPADDD Z4, Z12, Z4 // G(v2,v6,v10,v14; m14,m3)
   477  	VPADDD Z29, Z4, Z4
   478  	VPXORD Z28, Z4, Z28
   479  	VPRORD $0x10, Z28, Z28
   480  	VPADDD Z20, Z28, Z20
   481  	VPXORD Z12, Z20, Z12
   482  	VPRORD $0x0c, Z12, Z12
   483  	VPADDD Z4, Z12, Z4
   484  	VPADDD Z7, Z4, Z4
   485  	VPXORD Z28, Z4, Z28
   486  	VPRORD $0x08, Z28, Z28
   487  	VPADDD Z20, Z28, Z20
   488  	VPXORD Z12, Z20, Z12
   489  	VPRORD $0x07, Z12, Z12
   490  	VPADDD Z6, Z14, Z6 // G(v3,v7,v11,v15; m13,m15)
   491  	VPADDD Z27, Z6, Z6
   492  	VPXORD Z30, Z6, Z30
   493  	VPRORD $0x10, Z30, Z30
   494  	VPADDD Z22, Z30, Z22
   495  	VPXORD Z14, Z22, Z14
   496  	VPRORD $0x0c, Z14, Z14
   497  	VPADDD Z6, Z14, Z6
   498  	VPADDD Z31, Z6, Z6
   499  	VPXORD Z30, Z6, Z30
   500  	VPRORD $0x08, Z30, Z30
   501  	VPADDD Z22, Z30, Z22
   502  	VPXORD Z14, Z22, Z14
   503  	VPRORD $0x07, Z14, Z14
   504  	VPADDD Z0, Z10, Z0 // G(v0,v5,v10,v15; m4,m0)
   505  	VPADDD Z9, Z0, Z0
   506  	VPXORD Z30, Z0, Z30
   507  	VPRORD $0x10, Z30, Z30
   508  	VPADDD Z20, Z30, Z20
   509  	VPXORD Z10, Z20, Z10
   510  	VPRORD $0x0c, Z10, Z10
   511  	VPADDD Z0, Z10, Z0
   512  	VPADDD Z1, Z0, Z0
   513  	VPXORD Z30, Z0, Z30
   514  	VPRORD $0x08, Z30, Z30
   515  	VPADDD Z20, Z30, Z20
   516  	VPXORD Z10, Z20, Z10
   517  	VPRORD $0x07, Z10, Z10
   518  	VPADDD Z2, Z12, Z2 // G(v1,v6,v11,v12; m11,m2)
   519  	VPADDD Z23, Z2, Z2
   520  	VPXORD Z24, Z2, Z24
   521  	VPRORD $0x10, Z24, Z24
   522  	VPADDD Z22, Z24, Z22
   523  	VPXORD Z12, Z22, Z12
   524  	VPRORD $0x0c, Z12, Z12
   525  	VPADDD Z2, Z12, Z2
   526  	VPADDD Z5, Z2, Z2
   527  	VPXORD Z24, Z2, Z24
   528  	VPRORD $0x08, Z24, Z24
   529  	VPADDD Z22, Z24, Z22
   530  	VPXORD Z12, Z22, Z12
   531  	VPRORD $0x07, Z12, Z12
   532  	VPADDD Z4, Z14, Z4 // G(v2,v7,v8,v13; m5,m8)
   533  	VPADDD Z11, Z4, Z4
   534  	VPXORD Z26, Z4, Z26
   535  	VPRORD $0x10, Z26, Z26
   536  	VPADDD Z16, Z26, Z16
   537  	VPXORD Z14, Z16, Z14
   538  	VPRORD $0x0c, Z14, Z14
   539  	VPADDD Z4, Z14, Z4
   540  	VPADDD Z17, Z4, Z4
   541  	VPXORD Z26, Z4, Z26
   542  	VPRORD $0x08, Z26, Z26
   543  	VPADDD Z16, Z26, Z16
   544  	VPXORD Z14, Z16, Z14
   545  	VPRORD $0x07, Z14, Z14
   546  	VPADDD Z6, Z8, Z6 // G(v3,v4,v9,v14; m1,m6)
   547  	VPADDD Z3, Z6, Z6
   548  	VPXORD Z28, Z6, Z28
   549  	VPRORD $0x10, Z28, Z28
   550  	VPADDD Z18, Z28, Z18
   551  	VPXORD Z8, Z18, Z8
   552  	VPRORD $0x0c, Z8, Z8
   553  	VPADDD Z6, Z8, Z6
   554  	VPADDD Z13, Z6, Z6
   555  	VPXORD Z28, Z6, Z28
   556  	VPRORD $0x08, Z28, Z28
   557  	VPADDD Z18, Z28, Z18
   558  	VPXORD Z8, Z18, Z8
   559  	VPRORD $0x07, Z8, Z8
   560  
   561  	// Round 5: message order m12,m13,m9,m11,m15,m10,m14,m8,m7,m2,m5,m3,m0,m1,m6,m4
   562  	VPADDD Z0, Z8, Z0 // G(v0,v4,v8,v12; m12,m13)
   563  	VPADDD Z25, Z0, Z0
   564  	VPXORD Z24, Z0, Z24
   565  	VPRORD $0x10, Z24, Z24
   566  	VPADDD Z16, Z24, Z16
   567  	VPXORD Z8, Z16, Z8
   568  	VPRORD $0x0c, Z8, Z8
   569  	VPADDD Z0, Z8, Z0
   570  	VPADDD Z27, Z0, Z0
   571  	VPXORD Z24, Z0, Z24
   572  	VPRORD $0x08, Z24, Z24
   573  	VPADDD Z16, Z24, Z16
   574  	VPXORD Z8, Z16, Z8
   575  	VPRORD $0x07, Z8, Z8
   576  	VPADDD Z2, Z10, Z2 // G(v1,v5,v9,v13; m9,m11)
   577  	VPADDD Z19, Z2, Z2
   578  	VPXORD Z26, Z2, Z26
   579  	VPRORD $0x10, Z26, Z26
   580  	VPADDD Z18, Z26, Z18
   581  	VPXORD Z10, Z18, Z10
   582  	VPRORD $0x0c, Z10, Z10
   583  	VPADDD Z2, Z10, Z2
   584  	VPADDD Z23, Z2, Z2
   585  	VPXORD Z26, Z2, Z26
   586  	VPRORD $0x08, Z26, Z26
   587  	VPADDD Z18, Z26, Z18
   588  	VPXORD Z10, Z18, Z10
   589  	VPRORD $0x07, Z10, Z10
   590  	VPADDD Z4, Z12, Z4 // G(v2,v6,v10,v14; m15,m10)
   591  	VPADDD Z31, Z4, Z4
   592  	VPXORD Z28, Z4, Z28
   593  	VPRORD $0x10, Z28, Z28
   594  	VPADDD Z20, Z28, Z20
   595  	VPXORD Z12, Z20, Z12
   596  	VPRORD $0x0c, Z12, Z12
   597  	VPADDD Z4, Z12, Z4
   598  	VPADDD Z21, Z4, Z4
   599  	VPXORD Z28, Z4, Z28
   600  	VPRORD $0x08, Z28, Z28
   601  	VPADDD Z20, Z28, Z20
   602  	VPXORD Z12, Z20, Z12
   603  	VPRORD $0x07, Z12, Z12
   604  	VPADDD Z6, Z14, Z6 // G(v3,v7,v11,v15; m14,m8)
   605  	VPADDD Z29, Z6, Z6
   606  	VPXORD Z30, Z6, Z30
   607  	VPRORD $0x10, Z30, Z30
   608  	VPADDD Z22, Z30, Z22
   609  	VPXORD Z14, Z22, Z14
   610  	VPRORD $0x0c, Z14, Z14
   611  	VPADDD Z6, Z14, Z6
   612  	VPADDD Z17, Z6, Z6
   613  	VPXORD Z30, Z6, Z30
   614  	VPRORD $0x08, Z30, Z30
   615  	VPADDD Z22, Z30, Z22
   616  	VPXORD Z14, Z22, Z14
   617  	VPRORD $0x07, Z14, Z14
   618  	VPADDD Z0, Z10, Z0 // G(v0,v5,v10,v15; m7,m2)
   619  	VPADDD Z15, Z0, Z0
   620  	VPXORD Z30, Z0, Z30
   621  	VPRORD $0x10, Z30, Z30
   622  	VPADDD Z20, Z30, Z20
   623  	VPXORD Z10, Z20, Z10
   624  	VPRORD $0x0c, Z10, Z10
   625  	VPADDD Z0, Z10, Z0
   626  	VPADDD Z5, Z0, Z0
   627  	VPXORD Z30, Z0, Z30
   628  	VPRORD $0x08, Z30, Z30
   629  	VPADDD Z20, Z30, Z20
   630  	VPXORD Z10, Z20, Z10
   631  	VPRORD $0x07, Z10, Z10
   632  	VPADDD Z2, Z12, Z2 // G(v1,v6,v11,v12; m5,m3)
   633  	VPADDD Z11, Z2, Z2
   634  	VPXORD Z24, Z2, Z24
   635  	VPRORD $0x10, Z24, Z24
   636  	VPADDD Z22, Z24, Z22
   637  	VPXORD Z12, Z22, Z12
   638  	VPRORD $0x0c, Z12, Z12
   639  	VPADDD Z2, Z12, Z2
   640  	VPADDD Z7, Z2, Z2
   641  	VPXORD Z24, Z2, Z24
   642  	VPRORD $0x08, Z24, Z24
   643  	VPADDD Z22, Z24, Z22
   644  	VPXORD Z12, Z22, Z12
   645  	VPRORD $0x07, Z12, Z12
   646  	VPADDD Z4, Z14, Z4 // G(v2,v7,v8,v13; m0,m1)
   647  	VPADDD Z1, Z4, Z4
   648  	VPXORD Z26, Z4, Z26
   649  	VPRORD $0x10, Z26, Z26
   650  	VPADDD Z16, Z26, Z16
   651  	VPXORD Z14, Z16, Z14
   652  	VPRORD $0x0c, Z14, Z14
   653  	VPADDD Z4, Z14, Z4
   654  	VPADDD Z3, Z4, Z4
   655  	VPXORD Z26, Z4, Z26
   656  	VPRORD $0x08, Z26, Z26
   657  	VPADDD Z16, Z26, Z16
   658  	VPXORD Z14, Z16, Z14
   659  	VPRORD $0x07, Z14, Z14
   660  	VPADDD Z6, Z8, Z6 // G(v3,v4,v9,v14; m6,m4)
   661  	VPADDD Z13, Z6, Z6
   662  	VPXORD Z28, Z6, Z28
   663  	VPRORD $0x10, Z28, Z28
   664  	VPADDD Z18, Z28, Z18
   665  	VPXORD Z8, Z18, Z8
   666  	VPRORD $0x0c, Z8, Z8
   667  	VPADDD Z6, Z8, Z6
   668  	VPADDD Z9, Z6, Z6
   669  	VPXORD Z28, Z6, Z28
   670  	VPRORD $0x08, Z28, Z28
   671  	VPADDD Z18, Z28, Z18
   672  	VPXORD Z8, Z18, Z8
   673  	VPRORD $0x07, Z8, Z8
   674  
   675  	// Round 6: message order m9,m14,m11,m5,m8,m12,m15,m1,m13,m3,m0,m10,m2,m6,m4,m7
   676  	VPADDD Z0, Z8, Z0 // G(v0,v4,v8,v12; m9,m14)
   677  	VPADDD Z19, Z0, Z0
   678  	VPXORD Z24, Z0, Z24
   679  	VPRORD $0x10, Z24, Z24
   680  	VPADDD Z16, Z24, Z16
   681  	VPXORD Z8, Z16, Z8
   682  	VPRORD $0x0c, Z8, Z8
   683  	VPADDD Z0, Z8, Z0
   684  	VPADDD Z29, Z0, Z0
   685  	VPXORD Z24, Z0, Z24
   686  	VPRORD $0x08, Z24, Z24
   687  	VPADDD Z16, Z24, Z16
   688  	VPXORD Z8, Z16, Z8
   689  	VPRORD $0x07, Z8, Z8
   690  	VPADDD Z2, Z10, Z2 // G(v1,v5,v9,v13; m11,m5)
   691  	VPADDD Z23, Z2, Z2
   692  	VPXORD Z26, Z2, Z26
   693  	VPRORD $0x10, Z26, Z26
   694  	VPADDD Z18, Z26, Z18
   695  	VPXORD Z10, Z18, Z10
   696  	VPRORD $0x0c, Z10, Z10
   697  	VPADDD Z2, Z10, Z2
   698  	VPADDD Z11, Z2, Z2
   699  	VPXORD Z26, Z2, Z26
   700  	VPRORD $0x08, Z26, Z26
   701  	VPADDD Z18, Z26, Z18
   702  	VPXORD Z10, Z18, Z10
   703  	VPRORD $0x07, Z10, Z10
   704  	VPADDD Z4, Z12, Z4 // G(v2,v6,v10,v14; m8,m12)
   705  	VPADDD Z17, Z4, Z4
   706  	VPXORD Z28, Z4, Z28
   707  	VPRORD $0x10, Z28, Z28
   708  	VPADDD Z20, Z28, Z20
   709  	VPXORD Z12, Z20, Z12
   710  	VPRORD $0x0c, Z12, Z12
   711  	VPADDD Z4, Z12, Z4
   712  	VPADDD Z25, Z4, Z4
   713  	VPXORD Z28, Z4, Z28
   714  	VPRORD $0x08, Z28, Z28
   715  	VPADDD Z20, Z28, Z20
   716  	VPXORD Z12, Z20, Z12
   717  	VPRORD $0x07, Z12, Z12
   718  	VPADDD Z6, Z14, Z6 // G(v3,v7,v11,v15; m15,m1)
   719  	VPADDD Z31, Z6, Z6
   720  	VPXORD Z30, Z6, Z30
   721  	VPRORD $0x10, Z30, Z30
   722  	VPADDD Z22, Z30, Z22
   723  	VPXORD Z14, Z22, Z14
   724  	VPRORD $0x0c, Z14, Z14
   725  	VPADDD Z6, Z14, Z6
   726  	VPADDD Z3, Z6, Z6
   727  	VPXORD Z30, Z6, Z30
   728  	VPRORD $0x08, Z30, Z30
   729  	VPADDD Z22, Z30, Z22
   730  	VPXORD Z14, Z22, Z14
   731  	VPRORD $0x07, Z14, Z14
   732  	VPADDD Z0, Z10, Z0 // G(v0,v5,v10,v15; m13,m3)
   733  	VPADDD Z27, Z0, Z0
   734  	VPXORD Z30, Z0, Z30
   735  	VPRORD $0x10, Z30, Z30
   736  	VPADDD Z20, Z30, Z20
   737  	VPXORD Z10, Z20, Z10
   738  	VPRORD $0x0c, Z10, Z10
   739  	VPADDD Z0, Z10, Z0
   740  	VPADDD Z7, Z0, Z0
   741  	VPXORD Z30, Z0, Z30
   742  	VPRORD $0x08, Z30, Z30
   743  	VPADDD Z20, Z30, Z20
   744  	VPXORD Z10, Z20, Z10
   745  	VPRORD $0x07, Z10, Z10
   746  	VPADDD Z2, Z12, Z2 // G(v1,v6,v11,v12; m0,m10)
   747  	VPADDD Z1, Z2, Z2
   748  	VPXORD Z24, Z2, Z24
   749  	VPRORD $0x10, Z24, Z24
   750  	VPADDD Z22, Z24, Z22
   751  	VPXORD Z12, Z22, Z12
   752  	VPRORD $0x0c, Z12, Z12
   753  	VPADDD Z2, Z12, Z2
   754  	VPADDD Z21, Z2, Z2
   755  	VPXORD Z24, Z2, Z24
   756  	VPRORD $0x08, Z24, Z24
   757  	VPADDD Z22, Z24, Z22
   758  	VPXORD Z12, Z22, Z12
   759  	VPRORD $0x07, Z12, Z12
   760  	VPADDD Z4, Z14, Z4 // G(v2,v7,v8,v13; m2,m6)
   761  	VPADDD Z5, Z4, Z4
   762  	VPXORD Z26, Z4, Z26
   763  	VPRORD $0x10, Z26, Z26
   764  	VPADDD Z16, Z26, Z16
   765  	VPXORD Z14, Z16, Z14
   766  	VPRORD $0x0c, Z14, Z14
   767  	VPADDD Z4, Z14, Z4
   768  	VPADDD Z13, Z4, Z4
   769  	VPXORD Z26, Z4, Z26
   770  	VPRORD $0x08, Z26, Z26
   771  	VPADDD Z16, Z26, Z16
   772  	VPXORD Z14, Z16, Z14
   773  	VPRORD $0x07, Z14, Z14
   774  	VPADDD Z6, Z8, Z6 // G(v3,v4,v9,v14; m4,m7)
   775  	VPADDD Z9, Z6, Z6
   776  	VPXORD Z28, Z6, Z28
   777  	VPRORD $0x10, Z28, Z28
   778  	VPADDD Z18, Z28, Z18
   779  	VPXORD Z8, Z18, Z8
   780  	VPRORD $0x0c, Z8, Z8
   781  	VPADDD Z6, Z8, Z6
   782  	VPADDD Z15, Z6, Z6
   783  	VPXORD Z28, Z6, Z28
   784  	VPRORD $0x08, Z28, Z28
   785  	VPADDD Z18, Z28, Z18
   786  	VPXORD Z8, Z18, Z8
   787  	VPRORD $0x07, Z8, Z8
   788  
   789  	// Round 7: message order m11,m15,m5,m0,m1,m9,m8,m6,m14,m10,m2,m12,m3,m4,m7,m13
   790  	VPADDD Z0, Z8, Z0 // G(v0,v4,v8,v12; m11,m15)
   791  	VPADDD Z23, Z0, Z0
   792  	VPXORD Z24, Z0, Z24
   793  	VPRORD $0x10, Z24, Z24
   794  	VPADDD Z16, Z24, Z16
   795  	VPXORD Z8, Z16, Z8
   796  	VPRORD $0x0c, Z8, Z8
   797  	VPADDD Z0, Z8, Z0
   798  	VPADDD Z31, Z0, Z0
   799  	VPXORD Z24, Z0, Z24
   800  	VPRORD $0x08, Z24, Z24
   801  	VPADDD Z16, Z24, Z16
   802  	VPXORD Z8, Z16, Z8
   803  	VPRORD $0x07, Z8, Z8
   804  	VPADDD Z2, Z10, Z2 // G(v1,v5,v9,v13; m5,m0)
   805  	VPADDD Z11, Z2, Z2
   806  	VPXORD Z26, Z2, Z26
   807  	VPRORD $0x10, Z26, Z26
   808  	VPADDD Z18, Z26, Z18
   809  	VPXORD Z10, Z18, Z10
   810  	VPRORD $0x0c, Z10, Z10
   811  	VPADDD Z2, Z10, Z2
   812  	VPADDD Z1, Z2, Z2
   813  	VPXORD Z26, Z2, Z26
   814  	VPRORD $0x08, Z26, Z26
   815  	VPADDD Z18, Z26, Z18
   816  	VPXORD Z10, Z18, Z10
   817  	VPRORD $0x07, Z10, Z10
   818  	VPADDD Z4, Z12, Z4 // G(v2,v6,v10,v14; m1,m9)
   819  	VPADDD Z3, Z4, Z4
   820  	VPXORD Z28, Z4, Z28
   821  	VPRORD $0x10, Z28, Z28
   822  	VPADDD Z20, Z28, Z20
   823  	VPXORD Z12, Z20, Z12
   824  	VPRORD $0x0c, Z12, Z12
   825  	VPADDD Z4, Z12, Z4
   826  	VPADDD Z19, Z4, Z4
   827  	VPXORD Z28, Z4, Z28
   828  	VPRORD $0x08, Z28, Z28
   829  	VPADDD Z20, Z28, Z20
   830  	VPXORD Z12, Z20, Z12
   831  	VPRORD $0x07, Z12, Z12
   832  	VPADDD Z6, Z14, Z6 // G(v3,v7,v11,v15; m8,m6)
   833  	VPADDD Z17, Z6, Z6
   834  	VPXORD Z30, Z6, Z30
   835  	VPRORD $0x10, Z30, Z30
   836  	VPADDD Z22, Z30, Z22
   837  	VPXORD Z14, Z22, Z14
   838  	VPRORD $0x0c, Z14, Z14
   839  	VPADDD Z6, Z14, Z6
   840  	VPADDD Z13, Z6, Z6
   841  	VPXORD Z30, Z6, Z30
   842  	VPRORD $0x08, Z30, Z30
   843  	VPADDD Z22, Z30, Z22
   844  	VPXORD Z14, Z22, Z14
   845  	VPRORD $0x07, Z14, Z14
   846  	VPADDD Z0, Z10, Z0 // G(v0,v5,v10,v15; m14,m10)
   847  	VPADDD Z29, Z0, Z0
   848  	VPXORD Z30, Z0, Z30
   849  	VPRORD $0x10, Z30, Z30
   850  	VPADDD Z20, Z30, Z20
   851  	VPXORD Z10, Z20, Z10
   852  	VPRORD $0x0c, Z10, Z10
   853  	VPADDD Z0, Z10, Z0
   854  	VPADDD Z21, Z0, Z0
   855  	VPXORD Z30, Z0, Z30
   856  	VPRORD $0x08, Z30, Z30
   857  	VPADDD Z20, Z30, Z20
   858  	VPXORD Z10, Z20, Z10
   859  	VPRORD $0x07, Z10, Z10
   860  	VPADDD Z2, Z12, Z2 // G(v1,v6,v11,v12; m2,m12)
   861  	VPADDD Z5, Z2, Z2
   862  	VPXORD Z24, Z2, Z24
   863  	VPRORD $0x10, Z24, Z24
   864  	VPADDD Z22, Z24, Z22
   865  	VPXORD Z12, Z22, Z12
   866  	VPRORD $0x0c, Z12, Z12
   867  	VPADDD Z2, Z12, Z2
   868  	VPADDD Z25, Z2, Z2
   869  	VPXORD Z24, Z2, Z24
   870  	VPRORD $0x08, Z24, Z24
   871  	VPADDD Z22, Z24, Z22
   872  	VPXORD Z12, Z22, Z12
   873  	VPRORD $0x07, Z12, Z12
   874  	VPADDD Z4, Z14, Z4 // G(v2,v7,v8,v13; m3,m4)
   875  	VPADDD Z7, Z4, Z4
   876  	VPXORD Z26, Z4, Z26
   877  	VPRORD $0x10, Z26, Z26
   878  	VPADDD Z16, Z26, Z16
   879  	VPXORD Z14, Z16, Z14
   880  	VPRORD $0x0c, Z14, Z14
   881  	VPADDD Z4, Z14, Z4
   882  	VPADDD Z9, Z4, Z4
   883  	VPXORD Z26, Z4, Z26
   884  	VPRORD $0x08, Z26, Z26
   885  	VPADDD Z16, Z26, Z16
   886  	VPXORD Z14, Z16, Z14
   887  	VPRORD $0x07, Z14, Z14
   888  	VPADDD Z6, Z8, Z6 // G(v3,v4,v9,v14; m7,m13)
   889  	VPADDD Z15, Z6, Z6
   890  	VPXORD Z28, Z6, Z28
   891  	VPRORD $0x10, Z28, Z28
   892  	VPADDD Z18, Z28, Z18
   893  	VPXORD Z8, Z18, Z8
   894  	VPRORD $0x0c, Z8, Z8
   895  	VPADDD Z6, Z8, Z6
   896  	VPADDD Z27, Z6, Z6
   897  	VPXORD Z28, Z6, Z28
   898  	VPRORD $0x08, Z28, Z28
   899  	VPADDD Z18, Z28, Z18
   900  	VPXORD Z8, Z18, Z8
   901  	VPRORD $0x07, Z8, Z8
   902  
   903  	// Finalize CVs: v[k] ^= v[k+8] first (needs the unmodified upper half), then v[k+8] ^= cv[k]; finally scatter lane i's 16 words to out+64*i
   904  	VPXORD      Z0, Z16, Z0
   905  	VPXORD      Z2, Z18, Z2
   906  	VPXORD      Z4, Z20, Z4
   907  	VPXORD      Z6, Z22, Z6
   908  	VPXORD      Z8, Z24, Z8
   909  	VPXORD      Z10, Z26, Z10
   910  	VPXORD      Z12, Z28, Z12
   911  	VPXORD      Z14, Z30, Z14
   912  	VPXORD.BCST (DX), Z16, Z16
   913  	VPXORD.BCST 4(DX), Z18, Z18
   914  	VPXORD.BCST 8(DX), Z20, Z20
   915  	VPXORD.BCST 12(DX), Z22, Z22
   916  	VPXORD.BCST 16(DX), Z24, Z24
   917  	VPXORD.BCST 20(DX), Z26, Z26
   918  	VPXORD.BCST 24(DX), Z28, Z28
   919  	VPXORD.BCST 28(DX), Z30, Z30
   920  	VMOVDQU32   seq<>+0(SB), Z1 // Z1 (held m0, no longer needed) = lane indices 0..15
   921  	VPSLLD      $0x06, Z1, Z1 // Z1[i] = 64*i, the byte offset of lane i's 64-byte output
   922  	KXNORD      K1, K1, K1 // K1 = all ones so every lane is stored; rebuilt before each scatter because the scatter instruction clears its mask
   923  	VPSCATTERDD Z0, K1, (AX)(Z1*1) // word 0 of each lane's output: 4 bytes at out + 64*i + 0
   924  	KXNORD      K1, K1, K1
   925  	VPSCATTERDD Z2, K1, 4(AX)(Z1*1)
   926  	KXNORD      K1, K1, K1
   927  	VPSCATTERDD Z4, K1, 8(AX)(Z1*1)
   928  	KXNORD      K1, K1, K1
   929  	VPSCATTERDD Z6, K1, 12(AX)(Z1*1)
   930  	KXNORD      K1, K1, K1
   931  	VPSCATTERDD Z8, K1, 16(AX)(Z1*1)
   932  	KXNORD      K1, K1, K1
   933  	VPSCATTERDD Z10, K1, 20(AX)(Z1*1)
   934  	KXNORD      K1, K1, K1
   935  	VPSCATTERDD Z12, K1, 24(AX)(Z1*1)
   936  	KXNORD      K1, K1, K1
   937  	VPSCATTERDD Z14, K1, 28(AX)(Z1*1)
   938  	KXNORD      K1, K1, K1
   939  	VPSCATTERDD Z16, K1, 32(AX)(Z1*1) // words 8-15 of each lane's output come from the upper state half
   940  	KXNORD      K1, K1, K1
   941  	VPSCATTERDD Z18, K1, 36(AX)(Z1*1)
   942  	KXNORD      K1, K1, K1
   943  	VPSCATTERDD Z20, K1, 40(AX)(Z1*1)
   944  	KXNORD      K1, K1, K1
   945  	VPSCATTERDD Z22, K1, 44(AX)(Z1*1)
   946  	KXNORD      K1, K1, K1
   947  	VPSCATTERDD Z24, K1, 48(AX)(Z1*1)
   948  	KXNORD      K1, K1, K1
   949  	VPSCATTERDD Z26, K1, 52(AX)(Z1*1)
   950  	KXNORD      K1, K1, K1
   951  	VPSCATTERDD Z28, K1, 56(AX)(Z1*1)
   952  	KXNORD      K1, K1, K1
   953  	VPSCATTERDD Z30, K1, 60(AX)(Z1*1)
   954  	RET
   955  
   956  // func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)
   957  // Requires: AVX512BW, AVX512F
   958  TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-36
   959  	MOVQ cvs+0(FP), AX
   960  	MOVQ buf+8(FP), CX
   961  	MOVQ key+16(FP), DX
   962  
   963  	// Initialize counter
   964  	VPBROADCASTD counter+24(FP), Z0
   965  	VPADDD       seq<>+0(SB), Z0, Z0
   966  	VPCMPUD      $0x01, seq<>+0(SB), Z0, K1
   967  	VPBROADCASTD counter+28(FP), Z2
   968  	VPADDD.BCST  seq<>+4(SB), Z2, K1, Z2
   969  	VMOVDQU32    Z0, (SP)
   970  	VMOVDQU32    Z2, 64(SP)
   971  
   972  	// Initialize flags
   973  	VPBROADCASTD flags+32(FP), Z0
   974  	VMOVDQU32    Z0, 128(SP)
   975  	ORL          $0x01, 128(SP)
   976  	ORL          $0x02, 188(SP)
   977  
   978  	// Load key
   979  	VPBROADCASTD (DX), Z0
   980  	VPBROADCASTD 4(DX), Z2
   981  	VPBROADCASTD 8(DX), Z4
   982  	VPBROADCASTD 12(DX), Z6
   983  	VPBROADCASTD 16(DX), Z8
   984  	VPBROADCASTD 20(DX), Z10
   985  	VPBROADCASTD 24(DX), Z12
   986  	VPBROADCASTD 28(DX), Z14
   987  
   988  	// Loop index
   989  	XORQ DX, DX
   990  
   991  loop:
   992  	// Load transposed block
   993  	VMOVDQU32  seq<>+0(SB), Z16
   994  	VPSLLD     $0x0a, Z16, Z16
   995  	KXNORD     K1, K1, K1
   996  	VPGATHERDD (CX)(Z16*1), K1, Z1
   997  	KXNORD     K1, K1, K1
   998  	VPGATHERDD 4(CX)(Z16*1), K1, Z3
   999  	KXNORD     K1, K1, K1
  1000  	VPGATHERDD 8(CX)(Z16*1), K1, Z5
  1001  	KXNORD     K1, K1, K1
  1002  	VPGATHERDD 12(CX)(Z16*1), K1, Z7
  1003  	KXNORD     K1, K1, K1
  1004  	VPGATHERDD 16(CX)(Z16*1), K1, Z9
  1005  	KXNORD     K1, K1, K1
  1006  	VPGATHERDD 20(CX)(Z16*1), K1, Z11
  1007  	KXNORD     K1, K1, K1
  1008  	VPGATHERDD 24(CX)(Z16*1), K1, Z13
  1009  	KXNORD     K1, K1, K1
  1010  	VPGATHERDD 28(CX)(Z16*1), K1, Z15
  1011  	KXNORD     K1, K1, K1
  1012  	VPGATHERDD 32(CX)(Z16*1), K1, Z17
  1013  	KXNORD     K1, K1, K1
  1014  	VPGATHERDD 36(CX)(Z16*1), K1, Z19
  1015  	KXNORD     K1, K1, K1
  1016  	VPGATHERDD 40(CX)(Z16*1), K1, Z21
  1017  	KXNORD     K1, K1, K1
  1018  	VPGATHERDD 44(CX)(Z16*1), K1, Z23
  1019  	KXNORD     K1, K1, K1
  1020  	VPGATHERDD 48(CX)(Z16*1), K1, Z25
  1021  	KXNORD     K1, K1, K1
  1022  	VPGATHERDD 52(CX)(Z16*1), K1, Z27
  1023  	KXNORD     K1, K1, K1
  1024  	VPGATHERDD 56(CX)(Z16*1), K1, Z29
  1025  	KXNORD     K1, K1, K1
  1026  	VPGATHERDD 60(CX)(Z16*1), K1, Z31
  1027  	ADDQ       $0x40, CX
  1028  
  1029  	// Reload state vectors (other than CVs)
  1030  	VPBROADCASTD iv<>+0(SB), Z16
  1031  	VPBROADCASTD iv<>+4(SB), Z18
  1032  	VPBROADCASTD iv<>+8(SB), Z20
  1033  	VPBROADCASTD iv<>+12(SB), Z22
  1034  	VMOVDQU32    (SP), Z24
  1035  	VMOVDQU32    64(SP), Z26
  1036  	VPBROADCASTD seq<>+4(SB), Z28
  1037  	VPSLLD       $0x06, Z28, Z28
  1038  	VPBROADCASTD 128(SP)(DX*4), Z30
  1039  
  1040  	// Round 1
  1041  	VPADDD Z0, Z8, Z0
  1042  	VPADDD Z1, Z0, Z0
  1043  	VPXORD Z24, Z0, Z24
  1044  	VPRORD $0x10, Z24, Z24
  1045  	VPADDD Z16, Z24, Z16
  1046  	VPXORD Z8, Z16, Z8
  1047  	VPRORD $0x0c, Z8, Z8
  1048  	VPADDD Z0, Z8, Z0
  1049  	VPADDD Z3, Z0, Z0
  1050  	VPXORD Z24, Z0, Z24
  1051  	VPRORD $0x08, Z24, Z24
  1052  	VPADDD Z16, Z24, Z16
  1053  	VPXORD Z8, Z16, Z8
  1054  	VPRORD $0x07, Z8, Z8
  1055  	VPADDD Z2, Z10, Z2
  1056  	VPADDD Z5, Z2, Z2
  1057  	VPXORD Z26, Z2, Z26
  1058  	VPRORD $0x10, Z26, Z26
  1059  	VPADDD Z18, Z26, Z18
  1060  	VPXORD Z10, Z18, Z10
  1061  	VPRORD $0x0c, Z10, Z10
  1062  	VPADDD Z2, Z10, Z2
  1063  	VPADDD Z7, Z2, Z2
  1064  	VPXORD Z26, Z2, Z26
  1065  	VPRORD $0x08, Z26, Z26
  1066  	VPADDD Z18, Z26, Z18
  1067  	VPXORD Z10, Z18, Z10
  1068  	VPRORD $0x07, Z10, Z10
  1069  	VPADDD Z4, Z12, Z4
  1070  	VPADDD Z9, Z4, Z4
  1071  	VPXORD Z28, Z4, Z28
  1072  	VPRORD $0x10, Z28, Z28
  1073  	VPADDD Z20, Z28, Z20
  1074  	VPXORD Z12, Z20, Z12
  1075  	VPRORD $0x0c, Z12, Z12
  1076  	VPADDD Z4, Z12, Z4
  1077  	VPADDD Z11, Z4, Z4
  1078  	VPXORD Z28, Z4, Z28
  1079  	VPRORD $0x08, Z28, Z28
  1080  	VPADDD Z20, Z28, Z20
  1081  	VPXORD Z12, Z20, Z12
  1082  	VPRORD $0x07, Z12, Z12
  1083  	VPADDD Z6, Z14, Z6
  1084  	VPADDD Z13, Z6, Z6
  1085  	VPXORD Z30, Z6, Z30
  1086  	VPRORD $0x10, Z30, Z30
  1087  	VPADDD Z22, Z30, Z22
  1088  	VPXORD Z14, Z22, Z14
  1089  	VPRORD $0x0c, Z14, Z14
  1090  	VPADDD Z6, Z14, Z6
  1091  	VPADDD Z15, Z6, Z6
  1092  	VPXORD Z30, Z6, Z30
  1093  	VPRORD $0x08, Z30, Z30
  1094  	VPADDD Z22, Z30, Z22
  1095  	VPXORD Z14, Z22, Z14
  1096  	VPRORD $0x07, Z14, Z14
  1097  	VPADDD Z0, Z10, Z0
  1098  	VPADDD Z17, Z0, Z0
  1099  	VPXORD Z30, Z0, Z30
  1100  	VPRORD $0x10, Z30, Z30
  1101  	VPADDD Z20, Z30, Z20
  1102  	VPXORD Z10, Z20, Z10
  1103  	VPRORD $0x0c, Z10, Z10
  1104  	VPADDD Z0, Z10, Z0
  1105  	VPADDD Z19, Z0, Z0
  1106  	VPXORD Z30, Z0, Z30
  1107  	VPRORD $0x08, Z30, Z30
  1108  	VPADDD Z20, Z30, Z20
  1109  	VPXORD Z10, Z20, Z10
  1110  	VPRORD $0x07, Z10, Z10
  1111  	VPADDD Z2, Z12, Z2
  1112  	VPADDD Z21, Z2, Z2
  1113  	VPXORD Z24, Z2, Z24
  1114  	VPRORD $0x10, Z24, Z24
  1115  	VPADDD Z22, Z24, Z22
  1116  	VPXORD Z12, Z22, Z12
  1117  	VPRORD $0x0c, Z12, Z12
  1118  	VPADDD Z2, Z12, Z2
  1119  	VPADDD Z23, Z2, Z2
  1120  	VPXORD Z24, Z2, Z24
  1121  	VPRORD $0x08, Z24, Z24
  1122  	VPADDD Z22, Z24, Z22
  1123  	VPXORD Z12, Z22, Z12
  1124  	VPRORD $0x07, Z12, Z12
  1125  	VPADDD Z4, Z14, Z4
  1126  	VPADDD Z25, Z4, Z4
  1127  	VPXORD Z26, Z4, Z26
  1128  	VPRORD $0x10, Z26, Z26
  1129  	VPADDD Z16, Z26, Z16
  1130  	VPXORD Z14, Z16, Z14
  1131  	VPRORD $0x0c, Z14, Z14
  1132  	VPADDD Z4, Z14, Z4
  1133  	VPADDD Z27, Z4, Z4
  1134  	VPXORD Z26, Z4, Z26
  1135  	VPRORD $0x08, Z26, Z26
  1136  	VPADDD Z16, Z26, Z16
  1137  	VPXORD Z14, Z16, Z14
  1138  	VPRORD $0x07, Z14, Z14
  1139  	VPADDD Z6, Z8, Z6
  1140  	VPADDD Z29, Z6, Z6
  1141  	VPXORD Z28, Z6, Z28
  1142  	VPRORD $0x10, Z28, Z28
  1143  	VPADDD Z18, Z28, Z18
  1144  	VPXORD Z8, Z18, Z8
  1145  	VPRORD $0x0c, Z8, Z8
  1146  	VPADDD Z6, Z8, Z6
  1147  	VPADDD Z31, Z6, Z6
  1148  	VPXORD Z28, Z6, Z28
  1149  	VPRORD $0x08, Z28, Z28
  1150  	VPADDD Z18, Z28, Z18
  1151  	VPXORD Z8, Z18, Z8
  1152  	VPRORD $0x07, Z8, Z8
  1153  
  1154  	// Round 2
  1155  	VPADDD Z0, Z8, Z0
  1156  	VPADDD Z5, Z0, Z0
  1157  	VPXORD Z24, Z0, Z24
  1158  	VPRORD $0x10, Z24, Z24
  1159  	VPADDD Z16, Z24, Z16
  1160  	VPXORD Z8, Z16, Z8
  1161  	VPRORD $0x0c, Z8, Z8
  1162  	VPADDD Z0, Z8, Z0
  1163  	VPADDD Z13, Z0, Z0
  1164  	VPXORD Z24, Z0, Z24
  1165  	VPRORD $0x08, Z24, Z24
  1166  	VPADDD Z16, Z24, Z16
  1167  	VPXORD Z8, Z16, Z8
  1168  	VPRORD $0x07, Z8, Z8
  1169  	VPADDD Z2, Z10, Z2
  1170  	VPADDD Z7, Z2, Z2
  1171  	VPXORD Z26, Z2, Z26
  1172  	VPRORD $0x10, Z26, Z26
  1173  	VPADDD Z18, Z26, Z18
  1174  	VPXORD Z10, Z18, Z10
  1175  	VPRORD $0x0c, Z10, Z10
  1176  	VPADDD Z2, Z10, Z2
  1177  	VPADDD Z21, Z2, Z2
  1178  	VPXORD Z26, Z2, Z26
  1179  	VPRORD $0x08, Z26, Z26
  1180  	VPADDD Z18, Z26, Z18
  1181  	VPXORD Z10, Z18, Z10
  1182  	VPRORD $0x07, Z10, Z10
  1183  	VPADDD Z4, Z12, Z4
  1184  	VPADDD Z15, Z4, Z4
  1185  	VPXORD Z28, Z4, Z28
  1186  	VPRORD $0x10, Z28, Z28
  1187  	VPADDD Z20, Z28, Z20
  1188  	VPXORD Z12, Z20, Z12
  1189  	VPRORD $0x0c, Z12, Z12
  1190  	VPADDD Z4, Z12, Z4
  1191  	VPADDD Z1, Z4, Z4
  1192  	VPXORD Z28, Z4, Z28
  1193  	VPRORD $0x08, Z28, Z28
  1194  	VPADDD Z20, Z28, Z20
  1195  	VPXORD Z12, Z20, Z12
  1196  	VPRORD $0x07, Z12, Z12
  1197  	VPADDD Z6, Z14, Z6
  1198  	VPADDD Z9, Z6, Z6
  1199  	VPXORD Z30, Z6, Z30
  1200  	VPRORD $0x10, Z30, Z30
  1201  	VPADDD Z22, Z30, Z22
  1202  	VPXORD Z14, Z22, Z14
  1203  	VPRORD $0x0c, Z14, Z14
  1204  	VPADDD Z6, Z14, Z6
  1205  	VPADDD Z27, Z6, Z6
  1206  	VPXORD Z30, Z6, Z30
  1207  	VPRORD $0x08, Z30, Z30
  1208  	VPADDD Z22, Z30, Z22
  1209  	VPXORD Z14, Z22, Z14
  1210  	VPRORD $0x07, Z14, Z14
  1211  	VPADDD Z0, Z10, Z0
  1212  	VPADDD Z3, Z0, Z0
  1213  	VPXORD Z30, Z0, Z30
  1214  	VPRORD $0x10, Z30, Z30
  1215  	VPADDD Z20, Z30, Z20
  1216  	VPXORD Z10, Z20, Z10
  1217  	VPRORD $0x0c, Z10, Z10
  1218  	VPADDD Z0, Z10, Z0
  1219  	VPADDD Z23, Z0, Z0
  1220  	VPXORD Z30, Z0, Z30
  1221  	VPRORD $0x08, Z30, Z30
  1222  	VPADDD Z20, Z30, Z20
  1223  	VPXORD Z10, Z20, Z10
  1224  	VPRORD $0x07, Z10, Z10
  1225  	VPADDD Z2, Z12, Z2
  1226  	VPADDD Z25, Z2, Z2
  1227  	VPXORD Z24, Z2, Z24
  1228  	VPRORD $0x10, Z24, Z24
  1229  	VPADDD Z22, Z24, Z22
  1230  	VPXORD Z12, Z22, Z12
  1231  	VPRORD $0x0c, Z12, Z12
  1232  	VPADDD Z2, Z12, Z2
  1233  	VPADDD Z11, Z2, Z2
  1234  	VPXORD Z24, Z2, Z24
  1235  	VPRORD $0x08, Z24, Z24
  1236  	VPADDD Z22, Z24, Z22
  1237  	VPXORD Z12, Z22, Z12
  1238  	VPRORD $0x07, Z12, Z12
  1239  	VPADDD Z4, Z14, Z4
  1240  	VPADDD Z19, Z4, Z4
  1241  	VPXORD Z26, Z4, Z26
  1242  	VPRORD $0x10, Z26, Z26
  1243  	VPADDD Z16, Z26, Z16
  1244  	VPXORD Z14, Z16, Z14
  1245  	VPRORD $0x0c, Z14, Z14
  1246  	VPADDD Z4, Z14, Z4
  1247  	VPADDD Z29, Z4, Z4
  1248  	VPXORD Z26, Z4, Z26
  1249  	VPRORD $0x08, Z26, Z26
  1250  	VPADDD Z16, Z26, Z16
  1251  	VPXORD Z14, Z16, Z14
  1252  	VPRORD $0x07, Z14, Z14
  1253  	VPADDD Z6, Z8, Z6
  1254  	VPADDD Z31, Z6, Z6
  1255  	VPXORD Z28, Z6, Z28
  1256  	VPRORD $0x10, Z28, Z28
  1257  	VPADDD Z18, Z28, Z18
  1258  	VPXORD Z8, Z18, Z8
  1259  	VPRORD $0x0c, Z8, Z8
  1260  	VPADDD Z6, Z8, Z6
  1261  	VPADDD Z17, Z6, Z6
  1262  	VPXORD Z28, Z6, Z28
  1263  	VPRORD $0x08, Z28, Z28
  1264  	VPADDD Z18, Z28, Z18
  1265  	VPXORD Z8, Z18, Z8
  1266  	VPRORD $0x07, Z8, Z8
  1267  
  1268  	// Round 3
  1269  	VPADDD Z0, Z8, Z0
  1270  	VPADDD Z7, Z0, Z0
  1271  	VPXORD Z24, Z0, Z24
  1272  	VPRORD $0x10, Z24, Z24
  1273  	VPADDD Z16, Z24, Z16
  1274  	VPXORD Z8, Z16, Z8
  1275  	VPRORD $0x0c, Z8, Z8
  1276  	VPADDD Z0, Z8, Z0
  1277  	VPADDD Z9, Z0, Z0
  1278  	VPXORD Z24, Z0, Z24
  1279  	VPRORD $0x08, Z24, Z24
  1280  	VPADDD Z16, Z24, Z16
  1281  	VPXORD Z8, Z16, Z8
  1282  	VPRORD $0x07, Z8, Z8
  1283  	VPADDD Z2, Z10, Z2
  1284  	VPADDD Z21, Z2, Z2
  1285  	VPXORD Z26, Z2, Z26
  1286  	VPRORD $0x10, Z26, Z26
  1287  	VPADDD Z18, Z26, Z18
  1288  	VPXORD Z10, Z18, Z10
  1289  	VPRORD $0x0c, Z10, Z10
  1290  	VPADDD Z2, Z10, Z2
  1291  	VPADDD Z25, Z2, Z2
  1292  	VPXORD Z26, Z2, Z26
  1293  	VPRORD $0x08, Z26, Z26
  1294  	VPADDD Z18, Z26, Z18
  1295  	VPXORD Z10, Z18, Z10
  1296  	VPRORD $0x07, Z10, Z10
  1297  	VPADDD Z4, Z12, Z4
  1298  	VPADDD Z27, Z4, Z4
  1299  	VPXORD Z28, Z4, Z28
  1300  	VPRORD $0x10, Z28, Z28
  1301  	VPADDD Z20, Z28, Z20
  1302  	VPXORD Z12, Z20, Z12
  1303  	VPRORD $0x0c, Z12, Z12
  1304  	VPADDD Z4, Z12, Z4
  1305  	VPADDD Z5, Z4, Z4
  1306  	VPXORD Z28, Z4, Z28
  1307  	VPRORD $0x08, Z28, Z28
  1308  	VPADDD Z20, Z28, Z20
  1309  	VPXORD Z12, Z20, Z12
  1310  	VPRORD $0x07, Z12, Z12
  1311  	VPADDD Z6, Z14, Z6
  1312  	VPADDD Z15, Z6, Z6
  1313  	VPXORD Z30, Z6, Z30
  1314  	VPRORD $0x10, Z30, Z30
  1315  	VPADDD Z22, Z30, Z22
  1316  	VPXORD Z14, Z22, Z14
  1317  	VPRORD $0x0c, Z14, Z14
  1318  	VPADDD Z6, Z14, Z6
  1319  	VPADDD Z29, Z6, Z6
  1320  	VPXORD Z30, Z6, Z30
  1321  	VPRORD $0x08, Z30, Z30
  1322  	VPADDD Z22, Z30, Z22
  1323  	VPXORD Z14, Z22, Z14
  1324  	VPRORD $0x07, Z14, Z14
  1325  	VPADDD Z0, Z10, Z0
  1326  	VPADDD Z13, Z0, Z0
  1327  	VPXORD Z30, Z0, Z30
  1328  	VPRORD $0x10, Z30, Z30
  1329  	VPADDD Z20, Z30, Z20
  1330  	VPXORD Z10, Z20, Z10
  1331  	VPRORD $0x0c, Z10, Z10
  1332  	VPADDD Z0, Z10, Z0
  1333  	VPADDD Z11, Z0, Z0
  1334  	VPXORD Z30, Z0, Z30
  1335  	VPRORD $0x08, Z30, Z30
  1336  	VPADDD Z20, Z30, Z20
  1337  	VPXORD Z10, Z20, Z10
  1338  	VPRORD $0x07, Z10, Z10
  1339  	VPADDD Z2, Z12, Z2
  1340  	VPADDD Z19, Z2, Z2
  1341  	VPXORD Z24, Z2, Z24
  1342  	VPRORD $0x10, Z24, Z24
  1343  	VPADDD Z22, Z24, Z22
  1344  	VPXORD Z12, Z22, Z12
  1345  	VPRORD $0x0c, Z12, Z12
  1346  	VPADDD Z2, Z12, Z2
  1347  	VPADDD Z1, Z2, Z2
  1348  	VPXORD Z24, Z2, Z24
  1349  	VPRORD $0x08, Z24, Z24
  1350  	VPADDD Z22, Z24, Z22
  1351  	VPXORD Z12, Z22, Z12
  1352  	VPRORD $0x07, Z12, Z12
  1353  	VPADDD Z4, Z14, Z4
  1354  	VPADDD Z23, Z4, Z4
  1355  	VPXORD Z26, Z4, Z26
  1356  	VPRORD $0x10, Z26, Z26
  1357  	VPADDD Z16, Z26, Z16
  1358  	VPXORD Z14, Z16, Z14
  1359  	VPRORD $0x0c, Z14, Z14
  1360  	VPADDD Z4, Z14, Z4
  1361  	VPADDD Z31, Z4, Z4
  1362  	VPXORD Z26, Z4, Z26
  1363  	VPRORD $0x08, Z26, Z26
  1364  	VPADDD Z16, Z26, Z16
  1365  	VPXORD Z14, Z16, Z14
  1366  	VPRORD $0x07, Z14, Z14
  1367  	VPADDD Z6, Z8, Z6
  1368  	VPADDD Z17, Z6, Z6
  1369  	VPXORD Z28, Z6, Z28
  1370  	VPRORD $0x10, Z28, Z28
  1371  	VPADDD Z18, Z28, Z18
  1372  	VPXORD Z8, Z18, Z8
  1373  	VPRORD $0x0c, Z8, Z8
  1374  	VPADDD Z6, Z8, Z6
  1375  	VPADDD Z3, Z6, Z6
  1376  	VPXORD Z28, Z6, Z28
  1377  	VPRORD $0x08, Z28, Z28
  1378  	VPADDD Z18, Z28, Z18
  1379  	VPXORD Z8, Z18, Z8
  1380  	VPRORD $0x07, Z8, Z8
  1381  
  1382  	// Round 4
  1383  	VPADDD Z0, Z8, Z0
  1384  	VPADDD Z21, Z0, Z0
  1385  	VPXORD Z24, Z0, Z24
  1386  	VPRORD $0x10, Z24, Z24
  1387  	VPADDD Z16, Z24, Z16
  1388  	VPXORD Z8, Z16, Z8
  1389  	VPRORD $0x0c, Z8, Z8
  1390  	VPADDD Z0, Z8, Z0
  1391  	VPADDD Z15, Z0, Z0
  1392  	VPXORD Z24, Z0, Z24
  1393  	VPRORD $0x08, Z24, Z24
  1394  	VPADDD Z16, Z24, Z16
  1395  	VPXORD Z8, Z16, Z8
  1396  	VPRORD $0x07, Z8, Z8
  1397  	VPADDD Z2, Z10, Z2
  1398  	VPADDD Z25, Z2, Z2
  1399  	VPXORD Z26, Z2, Z26
  1400  	VPRORD $0x10, Z26, Z26
  1401  	VPADDD Z18, Z26, Z18
  1402  	VPXORD Z10, Z18, Z10
  1403  	VPRORD $0x0c, Z10, Z10
  1404  	VPADDD Z2, Z10, Z2
  1405  	VPADDD Z19, Z2, Z2
  1406  	VPXORD Z26, Z2, Z26
  1407  	VPRORD $0x08, Z26, Z26
  1408  	VPADDD Z18, Z26, Z18
  1409  	VPXORD Z10, Z18, Z10
  1410  	VPRORD $0x07, Z10, Z10
  1411  	VPADDD Z4, Z12, Z4
  1412  	VPADDD Z29, Z4, Z4
  1413  	VPXORD Z28, Z4, Z28
  1414  	VPRORD $0x10, Z28, Z28
  1415  	VPADDD Z20, Z28, Z20
  1416  	VPXORD Z12, Z20, Z12
  1417  	VPRORD $0x0c, Z12, Z12
  1418  	VPADDD Z4, Z12, Z4
  1419  	VPADDD Z7, Z4, Z4
  1420  	VPXORD Z28, Z4, Z28
  1421  	VPRORD $0x08, Z28, Z28
  1422  	VPADDD Z20, Z28, Z20
  1423  	VPXORD Z12, Z20, Z12
  1424  	VPRORD $0x07, Z12, Z12
  1425  	VPADDD Z6, Z14, Z6
  1426  	VPADDD Z27, Z6, Z6
  1427  	VPXORD Z30, Z6, Z30
  1428  	VPRORD $0x10, Z30, Z30
  1429  	VPADDD Z22, Z30, Z22
  1430  	VPXORD Z14, Z22, Z14
  1431  	VPRORD $0x0c, Z14, Z14
  1432  	VPADDD Z6, Z14, Z6
  1433  	VPADDD Z31, Z6, Z6
  1434  	VPXORD Z30, Z6, Z30
  1435  	VPRORD $0x08, Z30, Z30
  1436  	VPADDD Z22, Z30, Z22
  1437  	VPXORD Z14, Z22, Z14
  1438  	VPRORD $0x07, Z14, Z14
  1439  	VPADDD Z0, Z10, Z0
  1440  	VPADDD Z9, Z0, Z0
  1441  	VPXORD Z30, Z0, Z30
  1442  	VPRORD $0x10, Z30, Z30
  1443  	VPADDD Z20, Z30, Z20
  1444  	VPXORD Z10, Z20, Z10
  1445  	VPRORD $0x0c, Z10, Z10
  1446  	VPADDD Z0, Z10, Z0
  1447  	VPADDD Z1, Z0, Z0
  1448  	VPXORD Z30, Z0, Z30
  1449  	VPRORD $0x08, Z30, Z30
  1450  	VPADDD Z20, Z30, Z20
  1451  	VPXORD Z10, Z20, Z10
  1452  	VPRORD $0x07, Z10, Z10
  1453  	VPADDD Z2, Z12, Z2
  1454  	VPADDD Z23, Z2, Z2
  1455  	VPXORD Z24, Z2, Z24
  1456  	VPRORD $0x10, Z24, Z24
  1457  	VPADDD Z22, Z24, Z22
  1458  	VPXORD Z12, Z22, Z12
  1459  	VPRORD $0x0c, Z12, Z12
  1460  	VPADDD Z2, Z12, Z2
  1461  	VPADDD Z5, Z2, Z2
  1462  	VPXORD Z24, Z2, Z24
  1463  	VPRORD $0x08, Z24, Z24
  1464  	VPADDD Z22, Z24, Z22
  1465  	VPXORD Z12, Z22, Z12
  1466  	VPRORD $0x07, Z12, Z12
  1467  	VPADDD Z4, Z14, Z4
  1468  	VPADDD Z11, Z4, Z4
  1469  	VPXORD Z26, Z4, Z26
  1470  	VPRORD $0x10, Z26, Z26
  1471  	VPADDD Z16, Z26, Z16
  1472  	VPXORD Z14, Z16, Z14
  1473  	VPRORD $0x0c, Z14, Z14
  1474  	VPADDD Z4, Z14, Z4
  1475  	VPADDD Z17, Z4, Z4
  1476  	VPXORD Z26, Z4, Z26
  1477  	VPRORD $0x08, Z26, Z26
  1478  	VPADDD Z16, Z26, Z16
  1479  	VPXORD Z14, Z16, Z14
  1480  	VPRORD $0x07, Z14, Z14
  1481  	VPADDD Z6, Z8, Z6
  1482  	VPADDD Z3, Z6, Z6
  1483  	VPXORD Z28, Z6, Z28
  1484  	VPRORD $0x10, Z28, Z28
  1485  	VPADDD Z18, Z28, Z18
  1486  	VPXORD Z8, Z18, Z8
  1487  	VPRORD $0x0c, Z8, Z8
  1488  	VPADDD Z6, Z8, Z6
  1489  	VPADDD Z13, Z6, Z6
  1490  	VPXORD Z28, Z6, Z28
  1491  	VPRORD $0x08, Z28, Z28
  1492  	VPADDD Z18, Z28, Z18
  1493  	VPXORD Z8, Z18, Z8
  1494  	VPRORD $0x07, Z8, Z8
  1495  
  1496  	// Round 5
  1497  	VPADDD Z0, Z8, Z0
  1498  	VPADDD Z25, Z0, Z0
  1499  	VPXORD Z24, Z0, Z24
  1500  	VPRORD $0x10, Z24, Z24
  1501  	VPADDD Z16, Z24, Z16
  1502  	VPXORD Z8, Z16, Z8
  1503  	VPRORD $0x0c, Z8, Z8
  1504  	VPADDD Z0, Z8, Z0
  1505  	VPADDD Z27, Z0, Z0
  1506  	VPXORD Z24, Z0, Z24
  1507  	VPRORD $0x08, Z24, Z24
  1508  	VPADDD Z16, Z24, Z16
  1509  	VPXORD Z8, Z16, Z8
  1510  	VPRORD $0x07, Z8, Z8
  1511  	VPADDD Z2, Z10, Z2
  1512  	VPADDD Z19, Z2, Z2
  1513  	VPXORD Z26, Z2, Z26
  1514  	VPRORD $0x10, Z26, Z26
  1515  	VPADDD Z18, Z26, Z18
  1516  	VPXORD Z10, Z18, Z10
  1517  	VPRORD $0x0c, Z10, Z10
  1518  	VPADDD Z2, Z10, Z2
  1519  	VPADDD Z23, Z2, Z2
  1520  	VPXORD Z26, Z2, Z26
  1521  	VPRORD $0x08, Z26, Z26
  1522  	VPADDD Z18, Z26, Z18
  1523  	VPXORD Z10, Z18, Z10
  1524  	VPRORD $0x07, Z10, Z10
  1525  	VPADDD Z4, Z12, Z4
  1526  	VPADDD Z31, Z4, Z4
  1527  	VPXORD Z28, Z4, Z28
  1528  	VPRORD $0x10, Z28, Z28
  1529  	VPADDD Z20, Z28, Z20
  1530  	VPXORD Z12, Z20, Z12
  1531  	VPRORD $0x0c, Z12, Z12
  1532  	VPADDD Z4, Z12, Z4
  1533  	VPADDD Z21, Z4, Z4
  1534  	VPXORD Z28, Z4, Z28
  1535  	VPRORD $0x08, Z28, Z28
  1536  	VPADDD Z20, Z28, Z20
  1537  	VPXORD Z12, Z20, Z12
  1538  	VPRORD $0x07, Z12, Z12
  1539  	VPADDD Z6, Z14, Z6
  1540  	VPADDD Z29, Z6, Z6
  1541  	VPXORD Z30, Z6, Z30
  1542  	VPRORD $0x10, Z30, Z30
  1543  	VPADDD Z22, Z30, Z22
  1544  	VPXORD Z14, Z22, Z14
  1545  	VPRORD $0x0c, Z14, Z14
  1546  	VPADDD Z6, Z14, Z6
  1547  	VPADDD Z17, Z6, Z6
  1548  	VPXORD Z30, Z6, Z30
  1549  	VPRORD $0x08, Z30, Z30
  1550  	VPADDD Z22, Z30, Z22
  1551  	VPXORD Z14, Z22, Z14
  1552  	VPRORD $0x07, Z14, Z14
  1553  	VPADDD Z0, Z10, Z0
  1554  	VPADDD Z15, Z0, Z0
  1555  	VPXORD Z30, Z0, Z30
  1556  	VPRORD $0x10, Z30, Z30
  1557  	VPADDD Z20, Z30, Z20
  1558  	VPXORD Z10, Z20, Z10
  1559  	VPRORD $0x0c, Z10, Z10
  1560  	VPADDD Z0, Z10, Z0
  1561  	VPADDD Z5, Z0, Z0
  1562  	VPXORD Z30, Z0, Z30
  1563  	VPRORD $0x08, Z30, Z30
  1564  	VPADDD Z20, Z30, Z20
  1565  	VPXORD Z10, Z20, Z10
  1566  	VPRORD $0x07, Z10, Z10
  1567  	VPADDD Z2, Z12, Z2
  1568  	VPADDD Z11, Z2, Z2
  1569  	VPXORD Z24, Z2, Z24
  1570  	VPRORD $0x10, Z24, Z24
  1571  	VPADDD Z22, Z24, Z22
  1572  	VPXORD Z12, Z22, Z12
  1573  	VPRORD $0x0c, Z12, Z12
  1574  	VPADDD Z2, Z12, Z2
  1575  	VPADDD Z7, Z2, Z2
  1576  	VPXORD Z24, Z2, Z24
  1577  	VPRORD $0x08, Z24, Z24
  1578  	VPADDD Z22, Z24, Z22
  1579  	VPXORD Z12, Z22, Z12
  1580  	VPRORD $0x07, Z12, Z12
  1581  	VPADDD Z4, Z14, Z4
  1582  	VPADDD Z1, Z4, Z4
  1583  	VPXORD Z26, Z4, Z26
  1584  	VPRORD $0x10, Z26, Z26
  1585  	VPADDD Z16, Z26, Z16
  1586  	VPXORD Z14, Z16, Z14
  1587  	VPRORD $0x0c, Z14, Z14
  1588  	VPADDD Z4, Z14, Z4
  1589  	VPADDD Z3, Z4, Z4
  1590  	VPXORD Z26, Z4, Z26
  1591  	VPRORD $0x08, Z26, Z26
  1592  	VPADDD Z16, Z26, Z16
  1593  	VPXORD Z14, Z16, Z14
  1594  	VPRORD $0x07, Z14, Z14
  1595  	VPADDD Z6, Z8, Z6
  1596  	VPADDD Z13, Z6, Z6
  1597  	VPXORD Z28, Z6, Z28
  1598  	VPRORD $0x10, Z28, Z28
  1599  	VPADDD Z18, Z28, Z18
  1600  	VPXORD Z8, Z18, Z8
  1601  	VPRORD $0x0c, Z8, Z8
  1602  	VPADDD Z6, Z8, Z6
  1603  	VPADDD Z9, Z6, Z6
  1604  	VPXORD Z28, Z6, Z28
  1605  	VPRORD $0x08, Z28, Z28
  1606  	VPADDD Z18, Z28, Z18
  1607  	VPXORD Z8, Z18, Z8
  1608  	VPRORD $0x07, Z8, Z8
  1609  
  1610  	// Round 6
  1611  	VPADDD Z0, Z8, Z0
  1612  	VPADDD Z19, Z0, Z0
  1613  	VPXORD Z24, Z0, Z24
  1614  	VPRORD $0x10, Z24, Z24
  1615  	VPADDD Z16, Z24, Z16
  1616  	VPXORD Z8, Z16, Z8
  1617  	VPRORD $0x0c, Z8, Z8
  1618  	VPADDD Z0, Z8, Z0
  1619  	VPADDD Z29, Z0, Z0
  1620  	VPXORD Z24, Z0, Z24
  1621  	VPRORD $0x08, Z24, Z24
  1622  	VPADDD Z16, Z24, Z16
  1623  	VPXORD Z8, Z16, Z8
  1624  	VPRORD $0x07, Z8, Z8
  1625  	VPADDD Z2, Z10, Z2
  1626  	VPADDD Z23, Z2, Z2
  1627  	VPXORD Z26, Z2, Z26
  1628  	VPRORD $0x10, Z26, Z26
  1629  	VPADDD Z18, Z26, Z18
  1630  	VPXORD Z10, Z18, Z10
  1631  	VPRORD $0x0c, Z10, Z10
  1632  	VPADDD Z2, Z10, Z2
  1633  	VPADDD Z11, Z2, Z2
  1634  	VPXORD Z26, Z2, Z26
  1635  	VPRORD $0x08, Z26, Z26
  1636  	VPADDD Z18, Z26, Z18
  1637  	VPXORD Z10, Z18, Z10
  1638  	VPRORD $0x07, Z10, Z10
  1639  	VPADDD Z4, Z12, Z4
  1640  	VPADDD Z17, Z4, Z4
  1641  	VPXORD Z28, Z4, Z28
  1642  	VPRORD $0x10, Z28, Z28
  1643  	VPADDD Z20, Z28, Z20
  1644  	VPXORD Z12, Z20, Z12
  1645  	VPRORD $0x0c, Z12, Z12
  1646  	VPADDD Z4, Z12, Z4
  1647  	VPADDD Z25, Z4, Z4
  1648  	VPXORD Z28, Z4, Z28
  1649  	VPRORD $0x08, Z28, Z28
  1650  	VPADDD Z20, Z28, Z20
  1651  	VPXORD Z12, Z20, Z12
  1652  	VPRORD $0x07, Z12, Z12
  1653  	VPADDD Z6, Z14, Z6
  1654  	VPADDD Z31, Z6, Z6
  1655  	VPXORD Z30, Z6, Z30
  1656  	VPRORD $0x10, Z30, Z30
  1657  	VPADDD Z22, Z30, Z22
  1658  	VPXORD Z14, Z22, Z14
  1659  	VPRORD $0x0c, Z14, Z14
  1660  	VPADDD Z6, Z14, Z6
  1661  	VPADDD Z3, Z6, Z6
  1662  	VPXORD Z30, Z6, Z30
  1663  	VPRORD $0x08, Z30, Z30
  1664  	VPADDD Z22, Z30, Z22
  1665  	VPXORD Z14, Z22, Z14
  1666  	VPRORD $0x07, Z14, Z14
  1667  	VPADDD Z0, Z10, Z0
  1668  	VPADDD Z27, Z0, Z0
  1669  	VPXORD Z30, Z0, Z30
  1670  	VPRORD $0x10, Z30, Z30
  1671  	VPADDD Z20, Z30, Z20
  1672  	VPXORD Z10, Z20, Z10
  1673  	VPRORD $0x0c, Z10, Z10
  1674  	VPADDD Z0, Z10, Z0
  1675  	VPADDD Z7, Z0, Z0
  1676  	VPXORD Z30, Z0, Z30
  1677  	VPRORD $0x08, Z30, Z30
  1678  	VPADDD Z20, Z30, Z20
  1679  	VPXORD Z10, Z20, Z10
  1680  	VPRORD $0x07, Z10, Z10
  1681  	VPADDD Z2, Z12, Z2
  1682  	VPADDD Z1, Z2, Z2
  1683  	VPXORD Z24, Z2, Z24
  1684  	VPRORD $0x10, Z24, Z24
  1685  	VPADDD Z22, Z24, Z22
  1686  	VPXORD Z12, Z22, Z12
  1687  	VPRORD $0x0c, Z12, Z12
  1688  	VPADDD Z2, Z12, Z2
  1689  	VPADDD Z21, Z2, Z2
  1690  	VPXORD Z24, Z2, Z24
  1691  	VPRORD $0x08, Z24, Z24
  1692  	VPADDD Z22, Z24, Z22
  1693  	VPXORD Z12, Z22, Z12
  1694  	VPRORD $0x07, Z12, Z12
  1695  	VPADDD Z4, Z14, Z4
  1696  	VPADDD Z5, Z4, Z4
  1697  	VPXORD Z26, Z4, Z26
  1698  	VPRORD $0x10, Z26, Z26
  1699  	VPADDD Z16, Z26, Z16
  1700  	VPXORD Z14, Z16, Z14
  1701  	VPRORD $0x0c, Z14, Z14
  1702  	VPADDD Z4, Z14, Z4
  1703  	VPADDD Z13, Z4, Z4
  1704  	VPXORD Z26, Z4, Z26
  1705  	VPRORD $0x08, Z26, Z26
  1706  	VPADDD Z16, Z26, Z16
  1707  	VPXORD Z14, Z16, Z14
  1708  	VPRORD $0x07, Z14, Z14
  1709  	VPADDD Z6, Z8, Z6
  1710  	VPADDD Z9, Z6, Z6
  1711  	VPXORD Z28, Z6, Z28
  1712  	VPRORD $0x10, Z28, Z28
  1713  	VPADDD Z18, Z28, Z18
  1714  	VPXORD Z8, Z18, Z8
  1715  	VPRORD $0x0c, Z8, Z8
  1716  	VPADDD Z6, Z8, Z6
  1717  	VPADDD Z15, Z6, Z6
  1718  	VPXORD Z28, Z6, Z28
  1719  	VPRORD $0x08, Z28, Z28
  1720  	VPADDD Z18, Z28, Z18
  1721  	VPXORD Z8, Z18, Z8
  1722  	VPRORD $0x07, Z8, Z8
  1723  
  1724  	// Round 7
  1725  	VPADDD Z0, Z8, Z0
  1726  	VPADDD Z23, Z0, Z0
  1727  	VPXORD Z24, Z0, Z24
  1728  	VPRORD $0x10, Z24, Z24
  1729  	VPADDD Z16, Z24, Z16
  1730  	VPXORD Z8, Z16, Z8
  1731  	VPRORD $0x0c, Z8, Z8
  1732  	VPADDD Z0, Z8, Z0
  1733  	VPADDD Z31, Z0, Z0
  1734  	VPXORD Z24, Z0, Z24
  1735  	VPRORD $0x08, Z24, Z24
  1736  	VPADDD Z16, Z24, Z16
  1737  	VPXORD Z8, Z16, Z8
  1738  	VPRORD $0x07, Z8, Z8
  1739  	VPADDD Z2, Z10, Z2
  1740  	VPADDD Z11, Z2, Z2
  1741  	VPXORD Z26, Z2, Z26
  1742  	VPRORD $0x10, Z26, Z26
  1743  	VPADDD Z18, Z26, Z18
  1744  	VPXORD Z10, Z18, Z10
  1745  	VPRORD $0x0c, Z10, Z10
  1746  	VPADDD Z2, Z10, Z2
  1747  	VPADDD Z1, Z2, Z2
  1748  	VPXORD Z26, Z2, Z26
  1749  	VPRORD $0x08, Z26, Z26
  1750  	VPADDD Z18, Z26, Z18
  1751  	VPXORD Z10, Z18, Z10
  1752  	VPRORD $0x07, Z10, Z10
  1753  	VPADDD Z4, Z12, Z4
  1754  	VPADDD Z3, Z4, Z4
  1755  	VPXORD Z28, Z4, Z28
  1756  	VPRORD $0x10, Z28, Z28
  1757  	VPADDD Z20, Z28, Z20
  1758  	VPXORD Z12, Z20, Z12
  1759  	VPRORD $0x0c, Z12, Z12
  1760  	VPADDD Z4, Z12, Z4
  1761  	VPADDD Z19, Z4, Z4
  1762  	VPXORD Z28, Z4, Z28
  1763  	VPRORD $0x08, Z28, Z28
  1764  	VPADDD Z20, Z28, Z20
  1765  	VPXORD Z12, Z20, Z12
  1766  	VPRORD $0x07, Z12, Z12
  1767  	VPADDD Z6, Z14, Z6
  1768  	VPADDD Z17, Z6, Z6
  1769  	VPXORD Z30, Z6, Z30
  1770  	VPRORD $0x10, Z30, Z30
  1771  	VPADDD Z22, Z30, Z22
  1772  	VPXORD Z14, Z22, Z14
  1773  	VPRORD $0x0c, Z14, Z14
  1774  	VPADDD Z6, Z14, Z6
  1775  	VPADDD Z13, Z6, Z6
  1776  	VPXORD Z30, Z6, Z30
  1777  	VPRORD $0x08, Z30, Z30
  1778  	VPADDD Z22, Z30, Z22
  1779  	VPXORD Z14, Z22, Z14
  1780  	VPRORD $0x07, Z14, Z14
  1781  	VPADDD Z0, Z10, Z0
  1782  	VPADDD Z29, Z0, Z0
  1783  	VPXORD Z30, Z0, Z30
  1784  	VPRORD $0x10, Z30, Z30
  1785  	VPADDD Z20, Z30, Z20
  1786  	VPXORD Z10, Z20, Z10
  1787  	VPRORD $0x0c, Z10, Z10
  1788  	VPADDD Z0, Z10, Z0
  1789  	VPADDD Z21, Z0, Z0
  1790  	VPXORD Z30, Z0, Z30
  1791  	VPRORD $0x08, Z30, Z30
  1792  	VPADDD Z20, Z30, Z20
  1793  	VPXORD Z10, Z20, Z10
  1794  	VPRORD $0x07, Z10, Z10
  1795  	VPADDD Z2, Z12, Z2
  1796  	VPADDD Z5, Z2, Z2
  1797  	VPXORD Z24, Z2, Z24
  1798  	VPRORD $0x10, Z24, Z24
  1799  	VPADDD Z22, Z24, Z22
  1800  	VPXORD Z12, Z22, Z12
  1801  	VPRORD $0x0c, Z12, Z12
  1802  	VPADDD Z2, Z12, Z2
  1803  	VPADDD Z25, Z2, Z2
  1804  	VPXORD Z24, Z2, Z24
  1805  	VPRORD $0x08, Z24, Z24
  1806  	VPADDD Z22, Z24, Z22
  1807  	VPXORD Z12, Z22, Z12
  1808  	VPRORD $0x07, Z12, Z12
  1809  	VPADDD Z4, Z14, Z4
  1810  	VPADDD Z7, Z4, Z4
  1811  	VPXORD Z26, Z4, Z26
  1812  	VPRORD $0x10, Z26, Z26
  1813  	VPADDD Z16, Z26, Z16
  1814  	VPXORD Z14, Z16, Z14
  1815  	VPRORD $0x0c, Z14, Z14
  1816  	VPADDD Z4, Z14, Z4
  1817  	VPADDD Z9, Z4, Z4
  1818  	VPXORD Z26, Z4, Z26
  1819  	VPRORD $0x08, Z26, Z26
  1820  	VPADDD Z16, Z26, Z16
  1821  	VPXORD Z14, Z16, Z14
  1822  	VPRORD $0x07, Z14, Z14
  1823  	VPADDD Z6, Z8, Z6
  1824  	VPADDD Z15, Z6, Z6
  1825  	VPXORD Z28, Z6, Z28
  1826  	VPRORD $0x10, Z28, Z28
  1827  	VPADDD Z18, Z28, Z18
  1828  	VPXORD Z8, Z18, Z8
  1829  	VPRORD $0x0c, Z8, Z8
  1830  	VPADDD Z6, Z8, Z6
  1831  	VPADDD Z27, Z6, Z6
  1832  	VPXORD Z28, Z6, Z28
  1833  	VPRORD $0x08, Z28, Z28
  1834  	VPADDD Z18, Z28, Z18
  1835  	VPXORD Z8, Z18, Z8
  1836  	VPRORD $0x07, Z8, Z8
  1837  
  1838  	// Finalize CVs
  1839  	VPXORD Z0, Z16, Z0
  1840  	VPXORD Z2, Z18, Z2
  1841  	VPXORD Z4, Z20, Z4
  1842  	VPXORD Z6, Z22, Z6
  1843  	VPXORD Z8, Z24, Z8
  1844  	VPXORD Z10, Z26, Z10
  1845  	VPXORD Z12, Z28, Z12
  1846  	VPXORD Z14, Z30, Z14
  1847  
  1848  	// Loop
  1849  	INCQ DX
  1850  	CMPQ DX, $0x00000010
  1851  	JNE  loop
  1852  
  1853  	// Finished; transpose CVs
  1854  	VMOVDQU32   seq<>+0(SB), Z16
  1855  	VPSLLD      $0x05, Z16, Z16
  1856  	KXNORD      K1, K1, K1
  1857  	VPSCATTERDD Z0, K1, (AX)(Z16*1)
  1858  	KXNORD      K1, K1, K1
  1859  	VPSCATTERDD Z2, K1, 4(AX)(Z16*1)
  1860  	KXNORD      K1, K1, K1
  1861  	VPSCATTERDD Z4, K1, 8(AX)(Z16*1)
  1862  	KXNORD      K1, K1, K1
  1863  	VPSCATTERDD Z6, K1, 12(AX)(Z16*1)
  1864  	KXNORD      K1, K1, K1
  1865  	VPSCATTERDD Z8, K1, 16(AX)(Z16*1)
  1866  	KXNORD      K1, K1, K1
  1867  	VPSCATTERDD Z10, K1, 20(AX)(Z16*1)
  1868  	KXNORD      K1, K1, K1
  1869  	VPSCATTERDD Z12, K1, 24(AX)(Z16*1)
  1870  	KXNORD      K1, K1, K1
  1871  	VPSCATTERDD Z14, K1, 28(AX)(Z16*1)
  1872  	RET
  1873  
  1874  // func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
  1875  // Requires: AVX, AVX2
  1876  TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40
  1877  	MOVQ out+0(FP), AX
  1878  	MOVQ block+8(FP), CX
  1879  	MOVQ cv+16(FP), DX
  1880  
  1881  	// Load block
  1882  	VPBROADCASTD (CX), Y0
  1883  	VMOVDQU      Y0, (SP)
  1884  	VPBROADCASTD 4(CX), Y0
  1885  	VMOVDQU      Y0, 32(SP)
  1886  	VPBROADCASTD 8(CX), Y0
  1887  	VMOVDQU      Y0, 64(SP)
  1888  	VPBROADCASTD 12(CX), Y0
  1889  	VMOVDQU      Y0, 96(SP)
  1890  	VPBROADCASTD 16(CX), Y0
  1891  	VMOVDQU      Y0, 128(SP)
  1892  	VPBROADCASTD 20(CX), Y0
  1893  	VMOVDQU      Y0, 160(SP)
  1894  	VPBROADCASTD 24(CX), Y0
  1895  	VMOVDQU      Y0, 192(SP)
  1896  	VPBROADCASTD 28(CX), Y0
  1897  	VMOVDQU      Y0, 224(SP)
  1898  	VPBROADCASTD 32(CX), Y0
  1899  	VMOVDQU      Y0, 256(SP)
  1900  	VPBROADCASTD 36(CX), Y0
  1901  	VMOVDQU      Y0, 288(SP)
  1902  	VPBROADCASTD 40(CX), Y0
  1903  	VMOVDQU      Y0, 320(SP)
  1904  	VPBROADCASTD 44(CX), Y0
  1905  	VMOVDQU      Y0, 352(SP)
  1906  	VPBROADCASTD 48(CX), Y0
  1907  	VMOVDQU      Y0, 384(SP)
  1908  	VPBROADCASTD 52(CX), Y0
  1909  	VMOVDQU      Y0, 416(SP)
  1910  	VPBROADCASTD 56(CX), Y0
  1911  	VMOVDQU      Y0, 448(SP)
  1912  	VPBROADCASTD 60(CX), Y0
  1913  	VMOVDQU      Y0, 480(SP)
  1914  
  1915  	// Initialize state vectors
  1916  	VPBROADCASTD (DX), Y0
  1917  	VPBROADCASTD 4(DX), Y1
  1918  	VPBROADCASTD 8(DX), Y2
  1919  	VPBROADCASTD 12(DX), Y3
  1920  	VPBROADCASTD 16(DX), Y4
  1921  	VPBROADCASTD 20(DX), Y5
  1922  	VPBROADCASTD 24(DX), Y6
  1923  	VPBROADCASTD 28(DX), Y7
  1924  	VPBROADCASTD iv<>+0(SB), Y8
  1925  	VPBROADCASTD iv<>+4(SB), Y9
  1926  	VPBROADCASTD iv<>+8(SB), Y10
  1927  	VPBROADCASTD iv<>+12(SB), Y11
  1928  	VPBROADCASTQ counter+24(FP), Y12
  1929  	VPBROADCASTQ counter+24(FP), Y13
  1930  	VPADDQ       seq64<>+0(SB), Y12, Y12
  1931  	VPADDQ       seq64<>+32(SB), Y13, Y13
  1932  	VPUNPCKLDQ   Y13, Y12, Y14
  1933  	VPUNPCKHDQ   Y13, Y12, Y15
  1934  	VPUNPCKLDQ   Y15, Y14, Y12
  1935  	VPUNPCKHDQ   Y15, Y14, Y13
  1936  	VPERMQ       $0xd8, Y12, Y12
  1937  	VPERMQ       $0xd8, Y13, Y13
  1938  	VPBROADCASTD blockLen+32(FP), Y14
  1939  	VPBROADCASTD flags+36(FP), Y15
  1940  	VMOVDQU      Y8, 512(SP)
  1941  
  1942  	// Round 1
  1943  	VPADDD  Y0, Y4, Y0
  1944  	VPADDD  (SP), Y0, Y0
  1945  	VPXOR   Y12, Y0, Y12
  1946  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  1947  	VMOVDQU 512(SP), Y8
  1948  	VPADDD  Y8, Y12, Y8
  1949  	VPXOR   Y4, Y8, Y4
  1950  	VMOVDQU Y8, 512(SP)
  1951  	VPSRLD  $0x0c, Y4, Y8
  1952  	VPSLLD  $0x14, Y4, Y4
  1953  	VPOR    Y4, Y8, Y4
  1954  	VPADDD  Y0, Y4, Y0
  1955  	VPADDD  32(SP), Y0, Y0
  1956  	VPXOR   Y12, Y0, Y12
  1957  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  1958  	VMOVDQU 512(SP), Y8
  1959  	VPADDD  Y8, Y12, Y8
  1960  	VPXOR   Y4, Y8, Y4
  1961  	VMOVDQU Y8, 512(SP)
  1962  	VPSRLD  $0x07, Y4, Y8
  1963  	VPSLLD  $0x19, Y4, Y4
  1964  	VPOR    Y4, Y8, Y4
  1965  	VPADDD  Y1, Y5, Y1
  1966  	VPADDD  64(SP), Y1, Y1
  1967  	VPXOR   Y13, Y1, Y13
  1968  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  1969  	VPADDD  Y9, Y13, Y9
  1970  	VPXOR   Y5, Y9, Y5
  1971  	VPSRLD  $0x0c, Y5, Y8
  1972  	VPSLLD  $0x14, Y5, Y5
  1973  	VPOR    Y5, Y8, Y5
  1974  	VPADDD  Y1, Y5, Y1
  1975  	VPADDD  96(SP), Y1, Y1
  1976  	VPXOR   Y13, Y1, Y13
  1977  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  1978  	VPADDD  Y9, Y13, Y9
  1979  	VPXOR   Y5, Y9, Y5
  1980  	VPSRLD  $0x07, Y5, Y8
  1981  	VPSLLD  $0x19, Y5, Y5
  1982  	VPOR    Y5, Y8, Y5
  1983  	VPADDD  Y2, Y6, Y2
  1984  	VPADDD  128(SP), Y2, Y2
  1985  	VPXOR   Y14, Y2, Y14
  1986  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  1987  	VPADDD  Y10, Y14, Y10
  1988  	VPXOR   Y6, Y10, Y6
  1989  	VPSRLD  $0x0c, Y6, Y8
  1990  	VPSLLD  $0x14, Y6, Y6
  1991  	VPOR    Y6, Y8, Y6
  1992  	VPADDD  Y2, Y6, Y2
  1993  	VPADDD  160(SP), Y2, Y2
  1994  	VPXOR   Y14, Y2, Y14
  1995  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  1996  	VPADDD  Y10, Y14, Y10
  1997  	VPXOR   Y6, Y10, Y6
  1998  	VPSRLD  $0x07, Y6, Y8
  1999  	VPSLLD  $0x19, Y6, Y6
  2000  	VPOR    Y6, Y8, Y6
  2001  	VPADDD  Y3, Y7, Y3
  2002  	VPADDD  192(SP), Y3, Y3
  2003  	VPXOR   Y15, Y3, Y15
  2004  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2005  	VPADDD  Y11, Y15, Y11
  2006  	VPXOR   Y7, Y11, Y7
  2007  	VPSRLD  $0x0c, Y7, Y8
  2008  	VPSLLD  $0x14, Y7, Y7
  2009  	VPOR    Y7, Y8, Y7
  2010  	VPADDD  Y3, Y7, Y3
  2011  	VPADDD  224(SP), Y3, Y3
  2012  	VPXOR   Y15, Y3, Y15
  2013  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2014  	VPADDD  Y11, Y15, Y11
  2015  	VPXOR   Y7, Y11, Y7
  2016  	VPSRLD  $0x07, Y7, Y8
  2017  	VPSLLD  $0x19, Y7, Y7
  2018  	VPOR    Y7, Y8, Y7
  2019  	VPADDD  Y0, Y5, Y0
  2020  	VPADDD  256(SP), Y0, Y0
  2021  	VPXOR   Y15, Y0, Y15
  2022  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2023  	VPADDD  Y10, Y15, Y10
  2024  	VPXOR   Y5, Y10, Y5
  2025  	VPSRLD  $0x0c, Y5, Y8
  2026  	VPSLLD  $0x14, Y5, Y5
  2027  	VPOR    Y5, Y8, Y5
  2028  	VPADDD  Y0, Y5, Y0
  2029  	VPADDD  288(SP), Y0, Y0
  2030  	VPXOR   Y15, Y0, Y15
  2031  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2032  	VPADDD  Y10, Y15, Y10
  2033  	VPXOR   Y5, Y10, Y5
  2034  	VPSRLD  $0x07, Y5, Y8
  2035  	VPSLLD  $0x19, Y5, Y5
  2036  	VPOR    Y5, Y8, Y5
  2037  	VPADDD  Y1, Y6, Y1
  2038  	VPADDD  320(SP), Y1, Y1
  2039  	VPXOR   Y12, Y1, Y12
  2040  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2041  	VPADDD  Y11, Y12, Y11
  2042  	VPXOR   Y6, Y11, Y6
  2043  	VPSRLD  $0x0c, Y6, Y8
  2044  	VPSLLD  $0x14, Y6, Y6
  2045  	VPOR    Y6, Y8, Y6
  2046  	VPADDD  Y1, Y6, Y1
  2047  	VPADDD  352(SP), Y1, Y1
  2048  	VPXOR   Y12, Y1, Y12
  2049  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2050  	VPADDD  Y11, Y12, Y11
  2051  	VPXOR   Y6, Y11, Y6
  2052  	VPSRLD  $0x07, Y6, Y8
  2053  	VPSLLD  $0x19, Y6, Y6
  2054  	VPOR    Y6, Y8, Y6
  2055  	VPADDD  Y2, Y7, Y2
  2056  	VPADDD  384(SP), Y2, Y2
  2057  	VPXOR   Y13, Y2, Y13
  2058  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2059  	VMOVDQU 512(SP), Y8
  2060  	VPADDD  Y8, Y13, Y8
  2061  	VPXOR   Y7, Y8, Y7
  2062  	VMOVDQU Y8, 512(SP)
  2063  	VPSRLD  $0x0c, Y7, Y8
  2064  	VPSLLD  $0x14, Y7, Y7
  2065  	VPOR    Y7, Y8, Y7
  2066  	VPADDD  Y2, Y7, Y2
  2067  	VPADDD  416(SP), Y2, Y2
  2068  	VPXOR   Y13, Y2, Y13
  2069  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2070  	VMOVDQU 512(SP), Y8
  2071  	VPADDD  Y8, Y13, Y8
  2072  	VPXOR   Y7, Y8, Y7
  2073  	VMOVDQU Y8, 512(SP)
  2074  	VPSRLD  $0x07, Y7, Y8
  2075  	VPSLLD  $0x19, Y7, Y7
  2076  	VPOR    Y7, Y8, Y7
  2077  	VPADDD  Y3, Y4, Y3
  2078  	VPADDD  448(SP), Y3, Y3
  2079  	VPXOR   Y14, Y3, Y14
  2080  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2081  	VPADDD  Y9, Y14, Y9
  2082  	VPXOR   Y4, Y9, Y4
  2083  	VPSRLD  $0x0c, Y4, Y8
  2084  	VPSLLD  $0x14, Y4, Y4
  2085  	VPOR    Y4, Y8, Y4
  2086  	VPADDD  Y3, Y4, Y3
  2087  	VPADDD  480(SP), Y3, Y3
  2088  	VPXOR   Y14, Y3, Y14
  2089  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2090  	VPADDD  Y9, Y14, Y9
  2091  	VPXOR   Y4, Y9, Y4
  2092  	VPSRLD  $0x07, Y4, Y8
  2093  	VPSLLD  $0x19, Y4, Y4
  2094  	VPOR    Y4, Y8, Y4
  2095  
  2096  	// Round 2
  2097  	VPADDD  Y0, Y4, Y0
  2098  	VPADDD  64(SP), Y0, Y0
  2099  	VPXOR   Y12, Y0, Y12
  2100  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2101  	VMOVDQU 512(SP), Y8
  2102  	VPADDD  Y8, Y12, Y8
  2103  	VPXOR   Y4, Y8, Y4
  2104  	VMOVDQU Y8, 512(SP)
  2105  	VPSRLD  $0x0c, Y4, Y8
  2106  	VPSLLD  $0x14, Y4, Y4
  2107  	VPOR    Y4, Y8, Y4
  2108  	VPADDD  Y0, Y4, Y0
  2109  	VPADDD  192(SP), Y0, Y0
  2110  	VPXOR   Y12, Y0, Y12
  2111  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2112  	VMOVDQU 512(SP), Y8
  2113  	VPADDD  Y8, Y12, Y8
  2114  	VPXOR   Y4, Y8, Y4
  2115  	VMOVDQU Y8, 512(SP)
  2116  	VPSRLD  $0x07, Y4, Y8
  2117  	VPSLLD  $0x19, Y4, Y4
  2118  	VPOR    Y4, Y8, Y4
  2119  	VPADDD  Y1, Y5, Y1
  2120  	VPADDD  96(SP), Y1, Y1
  2121  	VPXOR   Y13, Y1, Y13
  2122  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2123  	VPADDD  Y9, Y13, Y9
  2124  	VPXOR   Y5, Y9, Y5
  2125  	VPSRLD  $0x0c, Y5, Y8
  2126  	VPSLLD  $0x14, Y5, Y5
  2127  	VPOR    Y5, Y8, Y5
  2128  	VPADDD  Y1, Y5, Y1
  2129  	VPADDD  320(SP), Y1, Y1
  2130  	VPXOR   Y13, Y1, Y13
  2131  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2132  	VPADDD  Y9, Y13, Y9
  2133  	VPXOR   Y5, Y9, Y5
  2134  	VPSRLD  $0x07, Y5, Y8
  2135  	VPSLLD  $0x19, Y5, Y5
  2136  	VPOR    Y5, Y8, Y5
  2137  	VPADDD  Y2, Y6, Y2
  2138  	VPADDD  224(SP), Y2, Y2
  2139  	VPXOR   Y14, Y2, Y14
  2140  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2141  	VPADDD  Y10, Y14, Y10
  2142  	VPXOR   Y6, Y10, Y6
  2143  	VPSRLD  $0x0c, Y6, Y8
  2144  	VPSLLD  $0x14, Y6, Y6
  2145  	VPOR    Y6, Y8, Y6
  2146  	VPADDD  Y2, Y6, Y2
  2147  	VPADDD  (SP), Y2, Y2
  2148  	VPXOR   Y14, Y2, Y14
  2149  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2150  	VPADDD  Y10, Y14, Y10
  2151  	VPXOR   Y6, Y10, Y6
  2152  	VPSRLD  $0x07, Y6, Y8
  2153  	VPSLLD  $0x19, Y6, Y6
  2154  	VPOR    Y6, Y8, Y6
  2155  	VPADDD  Y3, Y7, Y3
  2156  	VPADDD  128(SP), Y3, Y3
  2157  	VPXOR   Y15, Y3, Y15
  2158  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2159  	VPADDD  Y11, Y15, Y11
  2160  	VPXOR   Y7, Y11, Y7
  2161  	VPSRLD  $0x0c, Y7, Y8
  2162  	VPSLLD  $0x14, Y7, Y7
  2163  	VPOR    Y7, Y8, Y7
  2164  	VPADDD  Y3, Y7, Y3
  2165  	VPADDD  416(SP), Y3, Y3
  2166  	VPXOR   Y15, Y3, Y15
  2167  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2168  	VPADDD  Y11, Y15, Y11
  2169  	VPXOR   Y7, Y11, Y7
  2170  	VPSRLD  $0x07, Y7, Y8
  2171  	VPSLLD  $0x19, Y7, Y7
  2172  	VPOR    Y7, Y8, Y7
  2173  	VPADDD  Y0, Y5, Y0
  2174  	VPADDD  32(SP), Y0, Y0
  2175  	VPXOR   Y15, Y0, Y15
  2176  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2177  	VPADDD  Y10, Y15, Y10
  2178  	VPXOR   Y5, Y10, Y5
  2179  	VPSRLD  $0x0c, Y5, Y8
  2180  	VPSLLD  $0x14, Y5, Y5
  2181  	VPOR    Y5, Y8, Y5
  2182  	VPADDD  Y0, Y5, Y0
  2183  	VPADDD  352(SP), Y0, Y0
  2184  	VPXOR   Y15, Y0, Y15
  2185  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2186  	VPADDD  Y10, Y15, Y10
  2187  	VPXOR   Y5, Y10, Y5
  2188  	VPSRLD  $0x07, Y5, Y8
  2189  	VPSLLD  $0x19, Y5, Y5
  2190  	VPOR    Y5, Y8, Y5
  2191  	VPADDD  Y1, Y6, Y1
  2192  	VPADDD  384(SP), Y1, Y1
  2193  	VPXOR   Y12, Y1, Y12
  2194  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2195  	VPADDD  Y11, Y12, Y11
  2196  	VPXOR   Y6, Y11, Y6
  2197  	VPSRLD  $0x0c, Y6, Y8
  2198  	VPSLLD  $0x14, Y6, Y6
  2199  	VPOR    Y6, Y8, Y6
  2200  	VPADDD  Y1, Y6, Y1
  2201  	VPADDD  160(SP), Y1, Y1
  2202  	VPXOR   Y12, Y1, Y12
  2203  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2204  	VPADDD  Y11, Y12, Y11
  2205  	VPXOR   Y6, Y11, Y6
  2206  	VPSRLD  $0x07, Y6, Y8
  2207  	VPSLLD  $0x19, Y6, Y6
  2208  	VPOR    Y6, Y8, Y6
  2209  	VPADDD  Y2, Y7, Y2
  2210  	VPADDD  288(SP), Y2, Y2
  2211  	VPXOR   Y13, Y2, Y13
  2212  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2213  	VMOVDQU 512(SP), Y8
  2214  	VPADDD  Y8, Y13, Y8
  2215  	VPXOR   Y7, Y8, Y7
  2216  	VMOVDQU Y8, 512(SP)
  2217  	VPSRLD  $0x0c, Y7, Y8
  2218  	VPSLLD  $0x14, Y7, Y7
  2219  	VPOR    Y7, Y8, Y7
  2220  	VPADDD  Y2, Y7, Y2
  2221  	VPADDD  448(SP), Y2, Y2
  2222  	VPXOR   Y13, Y2, Y13
  2223  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2224  	VMOVDQU 512(SP), Y8
  2225  	VPADDD  Y8, Y13, Y8
  2226  	VPXOR   Y7, Y8, Y7
  2227  	VMOVDQU Y8, 512(SP)
  2228  	VPSRLD  $0x07, Y7, Y8
  2229  	VPSLLD  $0x19, Y7, Y7
  2230  	VPOR    Y7, Y8, Y7
  2231  	VPADDD  Y3, Y4, Y3
  2232  	VPADDD  480(SP), Y3, Y3
  2233  	VPXOR   Y14, Y3, Y14
  2234  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2235  	VPADDD  Y9, Y14, Y9
  2236  	VPXOR   Y4, Y9, Y4
  2237  	VPSRLD  $0x0c, Y4, Y8
  2238  	VPSLLD  $0x14, Y4, Y4
  2239  	VPOR    Y4, Y8, Y4
  2240  	VPADDD  Y3, Y4, Y3
  2241  	VPADDD  256(SP), Y3, Y3
  2242  	VPXOR   Y14, Y3, Y14
  2243  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2244  	VPADDD  Y9, Y14, Y9
  2245  	VPXOR   Y4, Y9, Y4
  2246  	VPSRLD  $0x07, Y4, Y8
  2247  	VPSLLD  $0x19, Y4, Y4
  2248  	VPOR    Y4, Y8, Y4
  2249  
  2250  	// Round 3
  2251  	VPADDD  Y0, Y4, Y0
  2252  	VPADDD  96(SP), Y0, Y0
  2253  	VPXOR   Y12, Y0, Y12
  2254  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2255  	VMOVDQU 512(SP), Y8
  2256  	VPADDD  Y8, Y12, Y8
  2257  	VPXOR   Y4, Y8, Y4
  2258  	VMOVDQU Y8, 512(SP)
  2259  	VPSRLD  $0x0c, Y4, Y8
  2260  	VPSLLD  $0x14, Y4, Y4
  2261  	VPOR    Y4, Y8, Y4
  2262  	VPADDD  Y0, Y4, Y0
  2263  	VPADDD  128(SP), Y0, Y0
  2264  	VPXOR   Y12, Y0, Y12
  2265  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2266  	VMOVDQU 512(SP), Y8
  2267  	VPADDD  Y8, Y12, Y8
  2268  	VPXOR   Y4, Y8, Y4
  2269  	VMOVDQU Y8, 512(SP)
  2270  	VPSRLD  $0x07, Y4, Y8
  2271  	VPSLLD  $0x19, Y4, Y4
  2272  	VPOR    Y4, Y8, Y4
  2273  	VPADDD  Y1, Y5, Y1
  2274  	VPADDD  320(SP), Y1, Y1
  2275  	VPXOR   Y13, Y1, Y13
  2276  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2277  	VPADDD  Y9, Y13, Y9
  2278  	VPXOR   Y5, Y9, Y5
  2279  	VPSRLD  $0x0c, Y5, Y8
  2280  	VPSLLD  $0x14, Y5, Y5
  2281  	VPOR    Y5, Y8, Y5
  2282  	VPADDD  Y1, Y5, Y1
  2283  	VPADDD  384(SP), Y1, Y1
  2284  	VPXOR   Y13, Y1, Y13
  2285  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2286  	VPADDD  Y9, Y13, Y9
  2287  	VPXOR   Y5, Y9, Y5
  2288  	VPSRLD  $0x07, Y5, Y8
  2289  	VPSLLD  $0x19, Y5, Y5
  2290  	VPOR    Y5, Y8, Y5
  2291  	VPADDD  Y2, Y6, Y2
  2292  	VPADDD  416(SP), Y2, Y2
  2293  	VPXOR   Y14, Y2, Y14
  2294  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2295  	VPADDD  Y10, Y14, Y10
  2296  	VPXOR   Y6, Y10, Y6
  2297  	VPSRLD  $0x0c, Y6, Y8
  2298  	VPSLLD  $0x14, Y6, Y6
  2299  	VPOR    Y6, Y8, Y6
  2300  	VPADDD  Y2, Y6, Y2
  2301  	VPADDD  64(SP), Y2, Y2
  2302  	VPXOR   Y14, Y2, Y14
  2303  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2304  	VPADDD  Y10, Y14, Y10
  2305  	VPXOR   Y6, Y10, Y6
  2306  	VPSRLD  $0x07, Y6, Y8
  2307  	VPSLLD  $0x19, Y6, Y6
  2308  	VPOR    Y6, Y8, Y6
  2309  	VPADDD  Y3, Y7, Y3
  2310  	VPADDD  224(SP), Y3, Y3
  2311  	VPXOR   Y15, Y3, Y15
  2312  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2313  	VPADDD  Y11, Y15, Y11
  2314  	VPXOR   Y7, Y11, Y7
  2315  	VPSRLD  $0x0c, Y7, Y8
  2316  	VPSLLD  $0x14, Y7, Y7
  2317  	VPOR    Y7, Y8, Y7
  2318  	VPADDD  Y3, Y7, Y3
  2319  	VPADDD  448(SP), Y3, Y3
  2320  	VPXOR   Y15, Y3, Y15
  2321  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2322  	VPADDD  Y11, Y15, Y11
  2323  	VPXOR   Y7, Y11, Y7
  2324  	VPSRLD  $0x07, Y7, Y8
  2325  	VPSLLD  $0x19, Y7, Y7
  2326  	VPOR    Y7, Y8, Y7
  2327  	VPADDD  Y0, Y5, Y0
  2328  	VPADDD  192(SP), Y0, Y0
  2329  	VPXOR   Y15, Y0, Y15
  2330  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2331  	VPADDD  Y10, Y15, Y10
  2332  	VPXOR   Y5, Y10, Y5
  2333  	VPSRLD  $0x0c, Y5, Y8
  2334  	VPSLLD  $0x14, Y5, Y5
  2335  	VPOR    Y5, Y8, Y5
  2336  	VPADDD  Y0, Y5, Y0
  2337  	VPADDD  160(SP), Y0, Y0
  2338  	VPXOR   Y15, Y0, Y15
  2339  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2340  	VPADDD  Y10, Y15, Y10
  2341  	VPXOR   Y5, Y10, Y5
  2342  	VPSRLD  $0x07, Y5, Y8
  2343  	VPSLLD  $0x19, Y5, Y5
  2344  	VPOR    Y5, Y8, Y5
  2345  	VPADDD  Y1, Y6, Y1
  2346  	VPADDD  288(SP), Y1, Y1
  2347  	VPXOR   Y12, Y1, Y12
  2348  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2349  	VPADDD  Y11, Y12, Y11
  2350  	VPXOR   Y6, Y11, Y6
  2351  	VPSRLD  $0x0c, Y6, Y8
  2352  	VPSLLD  $0x14, Y6, Y6
  2353  	VPOR    Y6, Y8, Y6
  2354  	VPADDD  Y1, Y6, Y1
  2355  	VPADDD  (SP), Y1, Y1
  2356  	VPXOR   Y12, Y1, Y12
  2357  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2358  	VPADDD  Y11, Y12, Y11
  2359  	VPXOR   Y6, Y11, Y6
  2360  	VPSRLD  $0x07, Y6, Y8
  2361  	VPSLLD  $0x19, Y6, Y6
  2362  	VPOR    Y6, Y8, Y6
  2363  	VPADDD  Y2, Y7, Y2
  2364  	VPADDD  352(SP), Y2, Y2
  2365  	VPXOR   Y13, Y2, Y13
  2366  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2367  	VMOVDQU 512(SP), Y8
  2368  	VPADDD  Y8, Y13, Y8
  2369  	VPXOR   Y7, Y8, Y7
  2370  	VMOVDQU Y8, 512(SP)
  2371  	VPSRLD  $0x0c, Y7, Y8
  2372  	VPSLLD  $0x14, Y7, Y7
  2373  	VPOR    Y7, Y8, Y7
  2374  	VPADDD  Y2, Y7, Y2
  2375  	VPADDD  480(SP), Y2, Y2
  2376  	VPXOR   Y13, Y2, Y13
  2377  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2378  	VMOVDQU 512(SP), Y8
  2379  	VPADDD  Y8, Y13, Y8
  2380  	VPXOR   Y7, Y8, Y7
  2381  	VMOVDQU Y8, 512(SP)
  2382  	VPSRLD  $0x07, Y7, Y8
  2383  	VPSLLD  $0x19, Y7, Y7
  2384  	VPOR    Y7, Y8, Y7
  2385  	VPADDD  Y3, Y4, Y3
  2386  	VPADDD  256(SP), Y3, Y3
  2387  	VPXOR   Y14, Y3, Y14
  2388  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2389  	VPADDD  Y9, Y14, Y9
  2390  	VPXOR   Y4, Y9, Y4
  2391  	VPSRLD  $0x0c, Y4, Y8
  2392  	VPSLLD  $0x14, Y4, Y4
  2393  	VPOR    Y4, Y8, Y4
  2394  	VPADDD  Y3, Y4, Y3
  2395  	VPADDD  32(SP), Y3, Y3
  2396  	VPXOR   Y14, Y3, Y14
  2397  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2398  	VPADDD  Y9, Y14, Y9
  2399  	VPXOR   Y4, Y9, Y4
  2400  	VPSRLD  $0x07, Y4, Y8
  2401  	VPSLLD  $0x19, Y4, Y4
  2402  	VPOR    Y4, Y8, Y4
  2403  
  2404  	// Round 4
  2405  	VPADDD  Y0, Y4, Y0
  2406  	VPADDD  320(SP), Y0, Y0
  2407  	VPXOR   Y12, Y0, Y12
  2408  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2409  	VMOVDQU 512(SP), Y8
  2410  	VPADDD  Y8, Y12, Y8
  2411  	VPXOR   Y4, Y8, Y4
  2412  	VMOVDQU Y8, 512(SP)
  2413  	VPSRLD  $0x0c, Y4, Y8
  2414  	VPSLLD  $0x14, Y4, Y4
  2415  	VPOR    Y4, Y8, Y4
  2416  	VPADDD  Y0, Y4, Y0
  2417  	VPADDD  224(SP), Y0, Y0
  2418  	VPXOR   Y12, Y0, Y12
  2419  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2420  	VMOVDQU 512(SP), Y8
  2421  	VPADDD  Y8, Y12, Y8
  2422  	VPXOR   Y4, Y8, Y4
  2423  	VMOVDQU Y8, 512(SP)
  2424  	VPSRLD  $0x07, Y4, Y8
  2425  	VPSLLD  $0x19, Y4, Y4
  2426  	VPOR    Y4, Y8, Y4
  2427  	VPADDD  Y1, Y5, Y1
  2428  	VPADDD  384(SP), Y1, Y1
  2429  	VPXOR   Y13, Y1, Y13
  2430  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2431  	VPADDD  Y9, Y13, Y9
  2432  	VPXOR   Y5, Y9, Y5
  2433  	VPSRLD  $0x0c, Y5, Y8
  2434  	VPSLLD  $0x14, Y5, Y5
  2435  	VPOR    Y5, Y8, Y5
  2436  	VPADDD  Y1, Y5, Y1
  2437  	VPADDD  288(SP), Y1, Y1
  2438  	VPXOR   Y13, Y1, Y13
  2439  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2440  	VPADDD  Y9, Y13, Y9
  2441  	VPXOR   Y5, Y9, Y5
  2442  	VPSRLD  $0x07, Y5, Y8
  2443  	VPSLLD  $0x19, Y5, Y5
  2444  	VPOR    Y5, Y8, Y5
  2445  	VPADDD  Y2, Y6, Y2
  2446  	VPADDD  448(SP), Y2, Y2
  2447  	VPXOR   Y14, Y2, Y14
  2448  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2449  	VPADDD  Y10, Y14, Y10
  2450  	VPXOR   Y6, Y10, Y6
  2451  	VPSRLD  $0x0c, Y6, Y8
  2452  	VPSLLD  $0x14, Y6, Y6
  2453  	VPOR    Y6, Y8, Y6
  2454  	VPADDD  Y2, Y6, Y2
  2455  	VPADDD  96(SP), Y2, Y2
  2456  	VPXOR   Y14, Y2, Y14
  2457  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2458  	VPADDD  Y10, Y14, Y10
  2459  	VPXOR   Y6, Y10, Y6
  2460  	VPSRLD  $0x07, Y6, Y8
  2461  	VPSLLD  $0x19, Y6, Y6
  2462  	VPOR    Y6, Y8, Y6
  2463  	VPADDD  Y3, Y7, Y3
  2464  	VPADDD  416(SP), Y3, Y3
  2465  	VPXOR   Y15, Y3, Y15
  2466  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2467  	VPADDD  Y11, Y15, Y11
  2468  	VPXOR   Y7, Y11, Y7
  2469  	VPSRLD  $0x0c, Y7, Y8
  2470  	VPSLLD  $0x14, Y7, Y7
  2471  	VPOR    Y7, Y8, Y7
  2472  	VPADDD  Y3, Y7, Y3
  2473  	VPADDD  480(SP), Y3, Y3
  2474  	VPXOR   Y15, Y3, Y15
  2475  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2476  	VPADDD  Y11, Y15, Y11
  2477  	VPXOR   Y7, Y11, Y7
  2478  	VPSRLD  $0x07, Y7, Y8
  2479  	VPSLLD  $0x19, Y7, Y7
  2480  	VPOR    Y7, Y8, Y7
  2481  	VPADDD  Y0, Y5, Y0
  2482  	VPADDD  128(SP), Y0, Y0
  2483  	VPXOR   Y15, Y0, Y15
  2484  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2485  	VPADDD  Y10, Y15, Y10
  2486  	VPXOR   Y5, Y10, Y5
  2487  	VPSRLD  $0x0c, Y5, Y8
  2488  	VPSLLD  $0x14, Y5, Y5
  2489  	VPOR    Y5, Y8, Y5
  2490  	VPADDD  Y0, Y5, Y0
  2491  	VPADDD  (SP), Y0, Y0
  2492  	VPXOR   Y15, Y0, Y15
  2493  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2494  	VPADDD  Y10, Y15, Y10
  2495  	VPXOR   Y5, Y10, Y5
  2496  	VPSRLD  $0x07, Y5, Y8
  2497  	VPSLLD  $0x19, Y5, Y5
  2498  	VPOR    Y5, Y8, Y5
  2499  	VPADDD  Y1, Y6, Y1
  2500  	VPADDD  352(SP), Y1, Y1
  2501  	VPXOR   Y12, Y1, Y12
  2502  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2503  	VPADDD  Y11, Y12, Y11
  2504  	VPXOR   Y6, Y11, Y6
  2505  	VPSRLD  $0x0c, Y6, Y8
  2506  	VPSLLD  $0x14, Y6, Y6
  2507  	VPOR    Y6, Y8, Y6
  2508  	VPADDD  Y1, Y6, Y1
  2509  	VPADDD  64(SP), Y1, Y1
  2510  	VPXOR   Y12, Y1, Y12
  2511  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2512  	VPADDD  Y11, Y12, Y11
  2513  	VPXOR   Y6, Y11, Y6
  2514  	VPSRLD  $0x07, Y6, Y8
  2515  	VPSLLD  $0x19, Y6, Y6
  2516  	VPOR    Y6, Y8, Y6
  2517  	VPADDD  Y2, Y7, Y2
  2518  	VPADDD  160(SP), Y2, Y2
  2519  	VPXOR   Y13, Y2, Y13
  2520  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2521  	VMOVDQU 512(SP), Y8
  2522  	VPADDD  Y8, Y13, Y8
  2523  	VPXOR   Y7, Y8, Y7
  2524  	VMOVDQU Y8, 512(SP)
  2525  	VPSRLD  $0x0c, Y7, Y8
  2526  	VPSLLD  $0x14, Y7, Y7
  2527  	VPOR    Y7, Y8, Y7
  2528  	VPADDD  Y2, Y7, Y2
  2529  	VPADDD  256(SP), Y2, Y2
  2530  	VPXOR   Y13, Y2, Y13
  2531  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2532  	VMOVDQU 512(SP), Y8
  2533  	VPADDD  Y8, Y13, Y8
  2534  	VPXOR   Y7, Y8, Y7
  2535  	VMOVDQU Y8, 512(SP)
  2536  	VPSRLD  $0x07, Y7, Y8
  2537  	VPSLLD  $0x19, Y7, Y7
  2538  	VPOR    Y7, Y8, Y7
  2539  	VPADDD  Y3, Y4, Y3
  2540  	VPADDD  32(SP), Y3, Y3
  2541  	VPXOR   Y14, Y3, Y14
  2542  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2543  	VPADDD  Y9, Y14, Y9
  2544  	VPXOR   Y4, Y9, Y4
  2545  	VPSRLD  $0x0c, Y4, Y8
  2546  	VPSLLD  $0x14, Y4, Y4
  2547  	VPOR    Y4, Y8, Y4
  2548  	VPADDD  Y3, Y4, Y3
  2549  	VPADDD  192(SP), Y3, Y3
  2550  	VPXOR   Y14, Y3, Y14
  2551  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2552  	VPADDD  Y9, Y14, Y9
  2553  	VPXOR   Y4, Y9, Y4
  2554  	VPSRLD  $0x07, Y4, Y8
  2555  	VPSLLD  $0x19, Y4, Y4
  2556  	VPOR    Y4, Y8, Y4
  2557  
  2558  	// Round 5
  2559  	VPADDD  Y0, Y4, Y0
  2560  	VPADDD  384(SP), Y0, Y0
  2561  	VPXOR   Y12, Y0, Y12
  2562  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2563  	VMOVDQU 512(SP), Y8
  2564  	VPADDD  Y8, Y12, Y8
  2565  	VPXOR   Y4, Y8, Y4
  2566  	VMOVDQU Y8, 512(SP)
  2567  	VPSRLD  $0x0c, Y4, Y8
  2568  	VPSLLD  $0x14, Y4, Y4
  2569  	VPOR    Y4, Y8, Y4
  2570  	VPADDD  Y0, Y4, Y0
  2571  	VPADDD  416(SP), Y0, Y0
  2572  	VPXOR   Y12, Y0, Y12
  2573  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2574  	VMOVDQU 512(SP), Y8
  2575  	VPADDD  Y8, Y12, Y8
  2576  	VPXOR   Y4, Y8, Y4
  2577  	VMOVDQU Y8, 512(SP)
  2578  	VPSRLD  $0x07, Y4, Y8
  2579  	VPSLLD  $0x19, Y4, Y4
  2580  	VPOR    Y4, Y8, Y4
  2581  	VPADDD  Y1, Y5, Y1
  2582  	VPADDD  288(SP), Y1, Y1
  2583  	VPXOR   Y13, Y1, Y13
  2584  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2585  	VPADDD  Y9, Y13, Y9
  2586  	VPXOR   Y5, Y9, Y5
  2587  	VPSRLD  $0x0c, Y5, Y8
  2588  	VPSLLD  $0x14, Y5, Y5
  2589  	VPOR    Y5, Y8, Y5
  2590  	VPADDD  Y1, Y5, Y1
  2591  	VPADDD  352(SP), Y1, Y1
  2592  	VPXOR   Y13, Y1, Y13
  2593  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2594  	VPADDD  Y9, Y13, Y9
  2595  	VPXOR   Y5, Y9, Y5
  2596  	VPSRLD  $0x07, Y5, Y8
  2597  	VPSLLD  $0x19, Y5, Y5
  2598  	VPOR    Y5, Y8, Y5
  2599  	VPADDD  Y2, Y6, Y2
  2600  	VPADDD  480(SP), Y2, Y2
  2601  	VPXOR   Y14, Y2, Y14
  2602  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2603  	VPADDD  Y10, Y14, Y10
  2604  	VPXOR   Y6, Y10, Y6
  2605  	VPSRLD  $0x0c, Y6, Y8
  2606  	VPSLLD  $0x14, Y6, Y6
  2607  	VPOR    Y6, Y8, Y6
  2608  	VPADDD  Y2, Y6, Y2
  2609  	VPADDD  320(SP), Y2, Y2
  2610  	VPXOR   Y14, Y2, Y14
  2611  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2612  	VPADDD  Y10, Y14, Y10
  2613  	VPXOR   Y6, Y10, Y6
  2614  	VPSRLD  $0x07, Y6, Y8
  2615  	VPSLLD  $0x19, Y6, Y6
  2616  	VPOR    Y6, Y8, Y6
  2617  	VPADDD  Y3, Y7, Y3
  2618  	VPADDD  448(SP), Y3, Y3
  2619  	VPXOR   Y15, Y3, Y15
  2620  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2621  	VPADDD  Y11, Y15, Y11
  2622  	VPXOR   Y7, Y11, Y7
  2623  	VPSRLD  $0x0c, Y7, Y8
  2624  	VPSLLD  $0x14, Y7, Y7
  2625  	VPOR    Y7, Y8, Y7
  2626  	VPADDD  Y3, Y7, Y3
  2627  	VPADDD  256(SP), Y3, Y3
  2628  	VPXOR   Y15, Y3, Y15
  2629  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2630  	VPADDD  Y11, Y15, Y11
  2631  	VPXOR   Y7, Y11, Y7
  2632  	VPSRLD  $0x07, Y7, Y8
  2633  	VPSLLD  $0x19, Y7, Y7
  2634  	VPOR    Y7, Y8, Y7
  2635  	VPADDD  Y0, Y5, Y0
  2636  	VPADDD  224(SP), Y0, Y0
  2637  	VPXOR   Y15, Y0, Y15
  2638  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2639  	VPADDD  Y10, Y15, Y10
  2640  	VPXOR   Y5, Y10, Y5
  2641  	VPSRLD  $0x0c, Y5, Y8
  2642  	VPSLLD  $0x14, Y5, Y5
  2643  	VPOR    Y5, Y8, Y5
  2644  	VPADDD  Y0, Y5, Y0
  2645  	VPADDD  64(SP), Y0, Y0
  2646  	VPXOR   Y15, Y0, Y15
  2647  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2648  	VPADDD  Y10, Y15, Y10
  2649  	VPXOR   Y5, Y10, Y5
  2650  	VPSRLD  $0x07, Y5, Y8
  2651  	VPSLLD  $0x19, Y5, Y5
  2652  	VPOR    Y5, Y8, Y5
  2653  	VPADDD  Y1, Y6, Y1
  2654  	VPADDD  160(SP), Y1, Y1
  2655  	VPXOR   Y12, Y1, Y12
  2656  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2657  	VPADDD  Y11, Y12, Y11
  2658  	VPXOR   Y6, Y11, Y6
  2659  	VPSRLD  $0x0c, Y6, Y8
  2660  	VPSLLD  $0x14, Y6, Y6
  2661  	VPOR    Y6, Y8, Y6
  2662  	VPADDD  Y1, Y6, Y1
  2663  	VPADDD  96(SP), Y1, Y1
  2664  	VPXOR   Y12, Y1, Y12
  2665  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2666  	VPADDD  Y11, Y12, Y11
  2667  	VPXOR   Y6, Y11, Y6
  2668  	VPSRLD  $0x07, Y6, Y8
  2669  	VPSLLD  $0x19, Y6, Y6
  2670  	VPOR    Y6, Y8, Y6
  2671  	VPADDD  Y2, Y7, Y2
  2672  	VPADDD  (SP), Y2, Y2
  2673  	VPXOR   Y13, Y2, Y13
  2674  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2675  	VMOVDQU 512(SP), Y8
  2676  	VPADDD  Y8, Y13, Y8
  2677  	VPXOR   Y7, Y8, Y7
  2678  	VMOVDQU Y8, 512(SP)
  2679  	VPSRLD  $0x0c, Y7, Y8
  2680  	VPSLLD  $0x14, Y7, Y7
  2681  	VPOR    Y7, Y8, Y7
  2682  	VPADDD  Y2, Y7, Y2
  2683  	VPADDD  32(SP), Y2, Y2
  2684  	VPXOR   Y13, Y2, Y13
  2685  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2686  	VMOVDQU 512(SP), Y8
  2687  	VPADDD  Y8, Y13, Y8
  2688  	VPXOR   Y7, Y8, Y7
  2689  	VMOVDQU Y8, 512(SP)
  2690  	VPSRLD  $0x07, Y7, Y8
  2691  	VPSLLD  $0x19, Y7, Y7
  2692  	VPOR    Y7, Y8, Y7
  2693  	VPADDD  Y3, Y4, Y3
  2694  	VPADDD  192(SP), Y3, Y3
  2695  	VPXOR   Y14, Y3, Y14
  2696  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2697  	VPADDD  Y9, Y14, Y9
  2698  	VPXOR   Y4, Y9, Y4
  2699  	VPSRLD  $0x0c, Y4, Y8
  2700  	VPSLLD  $0x14, Y4, Y4
  2701  	VPOR    Y4, Y8, Y4
  2702  	VPADDD  Y3, Y4, Y3
  2703  	VPADDD  128(SP), Y3, Y3
  2704  	VPXOR   Y14, Y3, Y14
  2705  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2706  	VPADDD  Y9, Y14, Y9
  2707  	VPXOR   Y4, Y9, Y4
  2708  	VPSRLD  $0x07, Y4, Y8
  2709  	VPSLLD  $0x19, Y4, Y4
  2710  	VPOR    Y4, Y8, Y4
  2711  
  2712  	// Round 6
  2713  	VPADDD  Y0, Y4, Y0
  2714  	VPADDD  288(SP), Y0, Y0
  2715  	VPXOR   Y12, Y0, Y12
  2716  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2717  	VMOVDQU 512(SP), Y8
  2718  	VPADDD  Y8, Y12, Y8
  2719  	VPXOR   Y4, Y8, Y4
  2720  	VMOVDQU Y8, 512(SP)
  2721  	VPSRLD  $0x0c, Y4, Y8
  2722  	VPSLLD  $0x14, Y4, Y4
  2723  	VPOR    Y4, Y8, Y4
  2724  	VPADDD  Y0, Y4, Y0
  2725  	VPADDD  448(SP), Y0, Y0
  2726  	VPXOR   Y12, Y0, Y12
  2727  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2728  	VMOVDQU 512(SP), Y8
  2729  	VPADDD  Y8, Y12, Y8
  2730  	VPXOR   Y4, Y8, Y4
  2731  	VMOVDQU Y8, 512(SP)
  2732  	VPSRLD  $0x07, Y4, Y8
  2733  	VPSLLD  $0x19, Y4, Y4
  2734  	VPOR    Y4, Y8, Y4
  2735  	VPADDD  Y1, Y5, Y1
  2736  	VPADDD  352(SP), Y1, Y1
  2737  	VPXOR   Y13, Y1, Y13
  2738  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2739  	VPADDD  Y9, Y13, Y9
  2740  	VPXOR   Y5, Y9, Y5
  2741  	VPSRLD  $0x0c, Y5, Y8
  2742  	VPSLLD  $0x14, Y5, Y5
  2743  	VPOR    Y5, Y8, Y5
  2744  	VPADDD  Y1, Y5, Y1
  2745  	VPADDD  160(SP), Y1, Y1
  2746  	VPXOR   Y13, Y1, Y13
  2747  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2748  	VPADDD  Y9, Y13, Y9
  2749  	VPXOR   Y5, Y9, Y5
  2750  	VPSRLD  $0x07, Y5, Y8
  2751  	VPSLLD  $0x19, Y5, Y5
  2752  	VPOR    Y5, Y8, Y5
  2753  	VPADDD  Y2, Y6, Y2
  2754  	VPADDD  256(SP), Y2, Y2
  2755  	VPXOR   Y14, Y2, Y14
  2756  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2757  	VPADDD  Y10, Y14, Y10
  2758  	VPXOR   Y6, Y10, Y6
  2759  	VPSRLD  $0x0c, Y6, Y8
  2760  	VPSLLD  $0x14, Y6, Y6
  2761  	VPOR    Y6, Y8, Y6
  2762  	VPADDD  Y2, Y6, Y2
  2763  	VPADDD  384(SP), Y2, Y2
  2764  	VPXOR   Y14, Y2, Y14
  2765  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2766  	VPADDD  Y10, Y14, Y10
  2767  	VPXOR   Y6, Y10, Y6
  2768  	VPSRLD  $0x07, Y6, Y8
  2769  	VPSLLD  $0x19, Y6, Y6
  2770  	VPOR    Y6, Y8, Y6
  2771  	VPADDD  Y3, Y7, Y3
  2772  	VPADDD  480(SP), Y3, Y3
  2773  	VPXOR   Y15, Y3, Y15
  2774  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2775  	VPADDD  Y11, Y15, Y11
  2776  	VPXOR   Y7, Y11, Y7
  2777  	VPSRLD  $0x0c, Y7, Y8
  2778  	VPSLLD  $0x14, Y7, Y7
  2779  	VPOR    Y7, Y8, Y7
  2780  	VPADDD  Y3, Y7, Y3
  2781  	VPADDD  32(SP), Y3, Y3
  2782  	VPXOR   Y15, Y3, Y15
  2783  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2784  	VPADDD  Y11, Y15, Y11
  2785  	VPXOR   Y7, Y11, Y7
  2786  	VPSRLD  $0x07, Y7, Y8
  2787  	VPSLLD  $0x19, Y7, Y7
  2788  	VPOR    Y7, Y8, Y7
  2789  	VPADDD  Y0, Y5, Y0
  2790  	VPADDD  416(SP), Y0, Y0
  2791  	VPXOR   Y15, Y0, Y15
  2792  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2793  	VPADDD  Y10, Y15, Y10
  2794  	VPXOR   Y5, Y10, Y5
  2795  	VPSRLD  $0x0c, Y5, Y8
  2796  	VPSLLD  $0x14, Y5, Y5
  2797  	VPOR    Y5, Y8, Y5
  2798  	VPADDD  Y0, Y5, Y0
  2799  	VPADDD  96(SP), Y0, Y0
  2800  	VPXOR   Y15, Y0, Y15
  2801  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2802  	VPADDD  Y10, Y15, Y10
  2803  	VPXOR   Y5, Y10, Y5
  2804  	VPSRLD  $0x07, Y5, Y8
  2805  	VPSLLD  $0x19, Y5, Y5
  2806  	VPOR    Y5, Y8, Y5
  2807  	VPADDD  Y1, Y6, Y1
  2808  	VPADDD  (SP), Y1, Y1
  2809  	VPXOR   Y12, Y1, Y12
  2810  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2811  	VPADDD  Y11, Y12, Y11
  2812  	VPXOR   Y6, Y11, Y6
  2813  	VPSRLD  $0x0c, Y6, Y8
  2814  	VPSLLD  $0x14, Y6, Y6
  2815  	VPOR    Y6, Y8, Y6
  2816  	VPADDD  Y1, Y6, Y1
  2817  	VPADDD  320(SP), Y1, Y1
  2818  	VPXOR   Y12, Y1, Y12
  2819  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2820  	VPADDD  Y11, Y12, Y11
  2821  	VPXOR   Y6, Y11, Y6
  2822  	VPSRLD  $0x07, Y6, Y8
  2823  	VPSLLD  $0x19, Y6, Y6
  2824  	VPOR    Y6, Y8, Y6
  2825  	VPADDD  Y2, Y7, Y2
  2826  	VPADDD  64(SP), Y2, Y2
  2827  	VPXOR   Y13, Y2, Y13
  2828  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2829  	VMOVDQU 512(SP), Y8
  2830  	VPADDD  Y8, Y13, Y8
  2831  	VPXOR   Y7, Y8, Y7
  2832  	VMOVDQU Y8, 512(SP)
  2833  	VPSRLD  $0x0c, Y7, Y8
  2834  	VPSLLD  $0x14, Y7, Y7
  2835  	VPOR    Y7, Y8, Y7
  2836  	VPADDD  Y2, Y7, Y2
  2837  	VPADDD  192(SP), Y2, Y2
  2838  	VPXOR   Y13, Y2, Y13
  2839  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2840  	VMOVDQU 512(SP), Y8
  2841  	VPADDD  Y8, Y13, Y8
  2842  	VPXOR   Y7, Y8, Y7
  2843  	VMOVDQU Y8, 512(SP)
  2844  	VPSRLD  $0x07, Y7, Y8
  2845  	VPSLLD  $0x19, Y7, Y7
  2846  	VPOR    Y7, Y8, Y7
  2847  	VPADDD  Y3, Y4, Y3
  2848  	VPADDD  128(SP), Y3, Y3
  2849  	VPXOR   Y14, Y3, Y14
  2850  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2851  	VPADDD  Y9, Y14, Y9
  2852  	VPXOR   Y4, Y9, Y4
  2853  	VPSRLD  $0x0c, Y4, Y8
  2854  	VPSLLD  $0x14, Y4, Y4
  2855  	VPOR    Y4, Y8, Y4
  2856  	VPADDD  Y3, Y4, Y3
  2857  	VPADDD  224(SP), Y3, Y3
  2858  	VPXOR   Y14, Y3, Y14
  2859  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2860  	VPADDD  Y9, Y14, Y9
  2861  	VPXOR   Y4, Y9, Y4
  2862  	VPSRLD  $0x07, Y4, Y8
  2863  	VPSLLD  $0x19, Y4, Y4
  2864  	VPOR    Y4, Y8, Y4
  2865  
  2866  	// Round 7
  2867  	VPADDD  Y0, Y4, Y0
  2868  	VPADDD  352(SP), Y0, Y0
  2869  	VPXOR   Y12, Y0, Y12
  2870  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2871  	VMOVDQU 512(SP), Y8
  2872  	VPADDD  Y8, Y12, Y8
  2873  	VPXOR   Y4, Y8, Y4
  2874  	VMOVDQU Y8, 512(SP)
  2875  	VPSRLD  $0x0c, Y4, Y8
  2876  	VPSLLD  $0x14, Y4, Y4
  2877  	VPOR    Y4, Y8, Y4
  2878  	VPADDD  Y0, Y4, Y0
  2879  	VPADDD  480(SP), Y0, Y0
  2880  	VPXOR   Y12, Y0, Y12
  2881  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2882  	VMOVDQU 512(SP), Y8
  2883  	VPADDD  Y8, Y12, Y8
  2884  	VPXOR   Y4, Y8, Y4
  2885  	VMOVDQU Y8, 512(SP)
  2886  	VPSRLD  $0x07, Y4, Y8
  2887  	VPSLLD  $0x19, Y4, Y4
  2888  	VPOR    Y4, Y8, Y4
  2889  	VPADDD  Y1, Y5, Y1
  2890  	VPADDD  160(SP), Y1, Y1
  2891  	VPXOR   Y13, Y1, Y13
  2892  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2893  	VPADDD  Y9, Y13, Y9
  2894  	VPXOR   Y5, Y9, Y5
  2895  	VPSRLD  $0x0c, Y5, Y8
  2896  	VPSLLD  $0x14, Y5, Y5
  2897  	VPOR    Y5, Y8, Y5
  2898  	VPADDD  Y1, Y5, Y1
  2899  	VPADDD  (SP), Y1, Y1
  2900  	VPXOR   Y13, Y1, Y13
  2901  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2902  	VPADDD  Y9, Y13, Y9
  2903  	VPXOR   Y5, Y9, Y5
  2904  	VPSRLD  $0x07, Y5, Y8
  2905  	VPSLLD  $0x19, Y5, Y5
  2906  	VPOR    Y5, Y8, Y5
  2907  	VPADDD  Y2, Y6, Y2
  2908  	VPADDD  32(SP), Y2, Y2
  2909  	VPXOR   Y14, Y2, Y14
  2910  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2911  	VPADDD  Y10, Y14, Y10
  2912  	VPXOR   Y6, Y10, Y6
  2913  	VPSRLD  $0x0c, Y6, Y8
  2914  	VPSLLD  $0x14, Y6, Y6
  2915  	VPOR    Y6, Y8, Y6
  2916  	VPADDD  Y2, Y6, Y2
  2917  	VPADDD  288(SP), Y2, Y2
  2918  	VPXOR   Y14, Y2, Y14
  2919  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2920  	VPADDD  Y10, Y14, Y10
  2921  	VPXOR   Y6, Y10, Y6
  2922  	VPSRLD  $0x07, Y6, Y8
  2923  	VPSLLD  $0x19, Y6, Y6
  2924  	VPOR    Y6, Y8, Y6
  2925  	VPADDD  Y3, Y7, Y3
  2926  	VPADDD  256(SP), Y3, Y3
  2927  	VPXOR   Y15, Y3, Y15
  2928  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2929  	VPADDD  Y11, Y15, Y11
  2930  	VPXOR   Y7, Y11, Y7
  2931  	VPSRLD  $0x0c, Y7, Y8
  2932  	VPSLLD  $0x14, Y7, Y7
  2933  	VPOR    Y7, Y8, Y7
  2934  	VPADDD  Y3, Y7, Y3
  2935  	VPADDD  192(SP), Y3, Y3
  2936  	VPXOR   Y15, Y3, Y15
  2937  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2938  	VPADDD  Y11, Y15, Y11
  2939  	VPXOR   Y7, Y11, Y7
  2940  	VPSRLD  $0x07, Y7, Y8
  2941  	VPSLLD  $0x19, Y7, Y7
  2942  	VPOR    Y7, Y8, Y7
  2943  	VPADDD  Y0, Y5, Y0
  2944  	VPADDD  448(SP), Y0, Y0
  2945  	VPXOR   Y15, Y0, Y15
  2946  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2947  	VPADDD  Y10, Y15, Y10
  2948  	VPXOR   Y5, Y10, Y5
  2949  	VPSRLD  $0x0c, Y5, Y8
  2950  	VPSLLD  $0x14, Y5, Y5
  2951  	VPOR    Y5, Y8, Y5
  2952  	VPADDD  Y0, Y5, Y0
  2953  	VPADDD  320(SP), Y0, Y0
  2954  	VPXOR   Y15, Y0, Y15
  2955  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2956  	VPADDD  Y10, Y15, Y10
  2957  	VPXOR   Y5, Y10, Y5
  2958  	VPSRLD  $0x07, Y5, Y8
  2959  	VPSLLD  $0x19, Y5, Y5
  2960  	VPOR    Y5, Y8, Y5
  2961  	VPADDD  Y1, Y6, Y1
  2962  	VPADDD  64(SP), Y1, Y1
  2963  	VPXOR   Y12, Y1, Y12
  2964  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2965  	VPADDD  Y11, Y12, Y11
  2966  	VPXOR   Y6, Y11, Y6
  2967  	VPSRLD  $0x0c, Y6, Y8
  2968  	VPSLLD  $0x14, Y6, Y6
  2969  	VPOR    Y6, Y8, Y6
  2970  	VPADDD  Y1, Y6, Y1
  2971  	VPADDD  384(SP), Y1, Y1
  2972  	VPXOR   Y12, Y1, Y12
  2973  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2974  	VPADDD  Y11, Y12, Y11
  2975  	VPXOR   Y6, Y11, Y6
  2976  	VPSRLD  $0x07, Y6, Y8
  2977  	VPSLLD  $0x19, Y6, Y6
  2978  	VPOR    Y6, Y8, Y6
  2979  	VPADDD  Y2, Y7, Y2
  2980  	VPADDD  96(SP), Y2, Y2
  2981  	VPXOR   Y13, Y2, Y13
  2982  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2983  	VMOVDQU 512(SP), Y8
  2984  	VPADDD  Y8, Y13, Y8
  2985  	VPXOR   Y7, Y8, Y7
  2986  	VMOVDQU Y8, 512(SP)
  2987  	VPSRLD  $0x0c, Y7, Y8
  2988  	VPSLLD  $0x14, Y7, Y7
  2989  	VPOR    Y7, Y8, Y7
  2990  	VPADDD  Y2, Y7, Y2
  2991  	VPADDD  128(SP), Y2, Y2
  2992  	VPXOR   Y13, Y2, Y13
  2993  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2994  	VMOVDQU 512(SP), Y8
  2995  	VPADDD  Y8, Y13, Y8
  2996  	VPXOR   Y7, Y8, Y7
  2997  	VMOVDQU Y8, 512(SP)
  2998  	VPSRLD  $0x07, Y7, Y8
  2999  	VPSLLD  $0x19, Y7, Y7
  3000  	VPOR    Y7, Y8, Y7
  3001  	VPADDD  Y3, Y4, Y3
  3002  	VPADDD  224(SP), Y3, Y3
  3003  	VPXOR   Y14, Y3, Y14
  3004  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3005  	VPADDD  Y9, Y14, Y9
  3006  	VPXOR   Y4, Y9, Y4
  3007  	VPSRLD  $0x0c, Y4, Y8
  3008  	VPSLLD  $0x14, Y4, Y4
  3009  	VPOR    Y4, Y8, Y4
  3010  	VPADDD  Y3, Y4, Y3
  3011  	VPADDD  416(SP), Y3, Y3
  3012  	VPXOR   Y14, Y3, Y14
  3013  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3014  	VPADDD  Y9, Y14, Y9
  3015  	VPXOR   Y4, Y9, Y4
  3016  	VPSRLD  $0x07, Y4, Y8
  3017  	VPSLLD  $0x19, Y4, Y4
  3018  	VPOR    Y4, Y8, Y4
  3019  	VMOVDQU 512(SP), Y8
  3020  
  3021  	// Finalize CVs
  3022  	VMOVDQU      Y8, 256(SP)
  3023  	VMOVDQU      Y9, 288(SP)
  3024  	VMOVDQU      Y10, 320(SP)
  3025  	VMOVDQU      Y11, 352(SP)
  3026  	VMOVDQU      Y12, 384(SP)
  3027  	VMOVDQU      Y13, 416(SP)
  3028  	VMOVDQU      Y14, 448(SP)
  3029  	VMOVDQU      Y15, 480(SP)
  3030  	VPXOR        Y0, Y8, Y0
  3031  	VPXOR        Y1, Y9, Y1
  3032  	VPXOR        Y2, Y10, Y2
  3033  	VPXOR        Y3, Y11, Y3
  3034  	VPXOR        Y4, Y12, Y4
  3035  	VPXOR        Y5, Y13, Y5
  3036  	VPXOR        Y6, Y14, Y6
  3037  	VPXOR        Y7, Y15, Y7
  3038  	VPUNPCKLDQ   Y1, Y0, Y8
  3039  	VPUNPCKHDQ   Y1, Y0, Y9
  3040  	VPUNPCKLDQ   Y3, Y2, Y10
  3041  	VPUNPCKHDQ   Y3, Y2, Y11
  3042  	VPUNPCKLDQ   Y5, Y4, Y12
  3043  	VPUNPCKHDQ   Y5, Y4, Y13
  3044  	VPUNPCKLDQ   Y7, Y6, Y14
  3045  	VPUNPCKHDQ   Y7, Y6, Y15
  3046  	VPUNPCKLQDQ  Y10, Y8, Y0
  3047  	VPUNPCKHQDQ  Y10, Y8, Y1
  3048  	VPUNPCKLQDQ  Y11, Y9, Y2
  3049  	VPUNPCKHQDQ  Y11, Y9, Y3
  3050  	VPUNPCKLQDQ  Y14, Y12, Y4
  3051  	VPUNPCKHQDQ  Y14, Y12, Y5
  3052  	VPUNPCKLQDQ  Y15, Y13, Y6
  3053  	VPUNPCKHQDQ  Y15, Y13, Y7
  3054  	VPERM2I128   $0x20, Y4, Y0, Y8
  3055  	VPERM2I128   $0x31, Y4, Y0, Y12
  3056  	VPERM2I128   $0x20, Y5, Y1, Y9
  3057  	VPERM2I128   $0x31, Y5, Y1, Y13
  3058  	VPERM2I128   $0x20, Y6, Y2, Y10
  3059  	VPERM2I128   $0x31, Y6, Y2, Y14
  3060  	VPERM2I128   $0x20, Y7, Y3, Y11
  3061  	VPERM2I128   $0x31, Y7, Y3, Y15
  3062  	VMOVDQU      Y8, (AX)
  3063  	VMOVDQU      Y9, 64(AX)
  3064  	VMOVDQU      Y10, 128(AX)
  3065  	VMOVDQU      Y11, 192(AX)
  3066  	VMOVDQU      Y12, 256(AX)
  3067  	VMOVDQU      Y13, 320(AX)
  3068  	VMOVDQU      Y14, 384(AX)
  3069  	VMOVDQU      Y15, 448(AX)
  3070  	VMOVDQU      256(SP), Y8
  3071  	VMOVDQU      288(SP), Y9
  3072  	VMOVDQU      320(SP), Y10
  3073  	VMOVDQU      352(SP), Y11
  3074  	VMOVDQU      384(SP), Y12
  3075  	VMOVDQU      416(SP), Y13
  3076  	VMOVDQU      448(SP), Y14
  3077  	VMOVDQU      480(SP), Y15
  3078  	VPBROADCASTD (DX), Y0
  3079  	VPXOR        Y0, Y8, Y8
  3080  	VPBROADCASTD 4(DX), Y0
  3081  	VPXOR        Y0, Y9, Y9
  3082  	VPBROADCASTD 8(DX), Y0
  3083  	VPXOR        Y0, Y10, Y10
  3084  	VPBROADCASTD 12(DX), Y0
  3085  	VPXOR        Y0, Y11, Y11
  3086  	VPBROADCASTD 16(DX), Y0
  3087  	VPXOR        Y0, Y12, Y12
  3088  	VPBROADCASTD 20(DX), Y0
  3089  	VPXOR        Y0, Y13, Y13
  3090  	VPBROADCASTD 24(DX), Y0
  3091  	VPXOR        Y0, Y14, Y14
  3092  	VPBROADCASTD 28(DX), Y0
  3093  	VPXOR        Y0, Y15, Y15
  3094  	VPUNPCKLDQ   Y9, Y8, Y0
  3095  	VPUNPCKHDQ   Y9, Y8, Y1
  3096  	VPUNPCKLDQ   Y11, Y10, Y2
  3097  	VPUNPCKHDQ   Y11, Y10, Y3
  3098  	VPUNPCKLDQ   Y13, Y12, Y4
  3099  	VPUNPCKHDQ   Y13, Y12, Y5
  3100  	VPUNPCKLDQ   Y15, Y14, Y6
  3101  	VPUNPCKHDQ   Y15, Y14, Y7
  3102  	VPUNPCKLQDQ  Y2, Y0, Y8
  3103  	VPUNPCKHQDQ  Y2, Y0, Y9
  3104  	VPUNPCKLQDQ  Y3, Y1, Y10
  3105  	VPUNPCKHQDQ  Y3, Y1, Y11
  3106  	VPUNPCKLQDQ  Y6, Y4, Y12
  3107  	VPUNPCKHQDQ  Y6, Y4, Y13
  3108  	VPUNPCKLQDQ  Y7, Y5, Y14
  3109  	VPUNPCKHQDQ  Y7, Y5, Y15
  3110  	VPERM2I128   $0x20, Y12, Y8, Y0
  3111  	VPERM2I128   $0x31, Y12, Y8, Y4
  3112  	VPERM2I128   $0x20, Y13, Y9, Y1
  3113  	VPERM2I128   $0x31, Y13, Y9, Y5
  3114  	VPERM2I128   $0x20, Y14, Y10, Y2
  3115  	VPERM2I128   $0x31, Y14, Y10, Y6
  3116  	VPERM2I128   $0x20, Y15, Y11, Y3
  3117  	VPERM2I128   $0x31, Y15, Y11, Y7
  3118  	VMOVDQU      Y0, 32(AX)
  3119  	VMOVDQU      Y1, 96(AX)
  3120  	VMOVDQU      Y2, 160(AX)
  3121  	VMOVDQU      Y3, 224(AX)
  3122  	VMOVDQU      Y4, 288(AX)
  3123  	VMOVDQU      Y5, 352(AX)
  3124  	VMOVDQU      Y6, 416(AX)
  3125  	VMOVDQU      Y7, 480(AX)
  3126  	RET
  3127  
  3128  // func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
  3129  // Requires: AVX, AVX2
  3130  TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-36
  3131  	MOVQ cvs+0(FP), AX
  3132  	MOVQ buf+8(FP), CX
  3133  	MOVQ key+16(FP), DX
  3134  
  3135  	// Load key
  3136  	VPBROADCASTD (DX), Y0
  3137  	VPBROADCASTD 4(DX), Y1
  3138  	VPBROADCASTD 8(DX), Y2
  3139  	VPBROADCASTD 12(DX), Y3
  3140  	VPBROADCASTD 16(DX), Y4
  3141  	VPBROADCASTD 20(DX), Y5
  3142  	VPBROADCASTD 24(DX), Y6
  3143  	VPBROADCASTD 28(DX), Y7
  3144  
  3145  	// Initialize counter
  3146  	VPBROADCASTQ counter+24(FP), Y12
  3147  	VPBROADCASTQ counter+24(FP), Y13
  3148  	VPADDQ       seq64<>+0(SB), Y12, Y12
  3149  	VPADDQ       seq64<>+32(SB), Y13, Y13
  3150  	VPUNPCKLDQ   Y13, Y12, Y14
  3151  	VPUNPCKHDQ   Y13, Y12, Y15
  3152  	VPUNPCKLDQ   Y15, Y14, Y12
  3153  	VPUNPCKHDQ   Y15, Y14, Y13
  3154  	VPERMQ       $0xd8, Y12, Y12
  3155  	VPERMQ       $0xd8, Y13, Y13
  3156  	VMOVDQU      Y12, 512(SP)
  3157  	VMOVDQU      Y13, 544(SP)
  3158  
  3159  	// Initialize flags
  3160  	VPBROADCASTD flags+32(FP), Y14
  3161  	VMOVDQU      Y14, 576(SP)
  3162  	VMOVDQU      Y14, 608(SP)
  3163  	ORL          $0x01, 576(SP)
  3164  	ORL          $0x02, 636(SP)
  3165  
  3166  	// Loop index
  3167  	XORQ DX, DX
  3168  
  3169  loop:
  3170  	// Load transposed block
  3171  	VMOVDQU    seq<>+0(SB), Y9
  3172  	VPSLLD     $0x0a, Y9, Y9
  3173  	VPCMPEQD   Y8, Y8, Y8
  3174  	VPGATHERDD Y8, (CX)(Y9*1), Y10
  3175  	VMOVDQU    Y10, (SP)
  3176  	VPCMPEQD   Y8, Y8, Y8
  3177  	VPGATHERDD Y8, 4(CX)(Y9*1), Y10
  3178  	VMOVDQU    Y10, 32(SP)
  3179  	VPCMPEQD   Y8, Y8, Y8
  3180  	VPGATHERDD Y8, 8(CX)(Y9*1), Y10
  3181  	VMOVDQU    Y10, 64(SP)
  3182  	VPCMPEQD   Y8, Y8, Y8
  3183  	VPGATHERDD Y8, 12(CX)(Y9*1), Y10
  3184  	VMOVDQU    Y10, 96(SP)
  3185  	VPCMPEQD   Y8, Y8, Y8
  3186  	VPGATHERDD Y8, 16(CX)(Y9*1), Y10
  3187  	VMOVDQU    Y10, 128(SP)
  3188  	VPCMPEQD   Y8, Y8, Y8
  3189  	VPGATHERDD Y8, 20(CX)(Y9*1), Y10
  3190  	VMOVDQU    Y10, 160(SP)
  3191  	VPCMPEQD   Y8, Y8, Y8
  3192  	VPGATHERDD Y8, 24(CX)(Y9*1), Y10
  3193  	VMOVDQU    Y10, 192(SP)
  3194  	VPCMPEQD   Y8, Y8, Y8
  3195  	VPGATHERDD Y8, 28(CX)(Y9*1), Y10
  3196  	VMOVDQU    Y10, 224(SP)
  3197  	VPCMPEQD   Y8, Y8, Y8
  3198  	VPGATHERDD Y8, 32(CX)(Y9*1), Y10
  3199  	VMOVDQU    Y10, 256(SP)
  3200  	VPCMPEQD   Y8, Y8, Y8
  3201  	VPGATHERDD Y8, 36(CX)(Y9*1), Y10
  3202  	VMOVDQU    Y10, 288(SP)
  3203  	VPCMPEQD   Y8, Y8, Y8
  3204  	VPGATHERDD Y8, 40(CX)(Y9*1), Y10
  3205  	VMOVDQU    Y10, 320(SP)
  3206  	VPCMPEQD   Y8, Y8, Y8
  3207  	VPGATHERDD Y8, 44(CX)(Y9*1), Y10
  3208  	VMOVDQU    Y10, 352(SP)
  3209  	VPCMPEQD   Y8, Y8, Y8
  3210  	VPGATHERDD Y8, 48(CX)(Y9*1), Y10
  3211  	VMOVDQU    Y10, 384(SP)
  3212  	VPCMPEQD   Y8, Y8, Y8
  3213  	VPGATHERDD Y8, 52(CX)(Y9*1), Y10
  3214  	VMOVDQU    Y10, 416(SP)
  3215  	VPCMPEQD   Y8, Y8, Y8
  3216  	VPGATHERDD Y8, 56(CX)(Y9*1), Y10
  3217  	VMOVDQU    Y10, 448(SP)
  3218  	VPCMPEQD   Y8, Y8, Y8
  3219  	VPGATHERDD Y8, 60(CX)(Y9*1), Y10
  3220  	VMOVDQU    Y10, 480(SP)
  3221  	ADDQ       $0x40, CX
  3222  
  3223  	// Reload state vectors (other than CVs)
  3224  	VPBROADCASTD iv<>+0(SB), Y8
  3225  	VPBROADCASTD iv<>+4(SB), Y9
  3226  	VPBROADCASTD iv<>+8(SB), Y10
  3227  	VPBROADCASTD iv<>+12(SB), Y11
  3228  	VMOVDQU      512(SP), Y12
  3229  	VMOVDQU      544(SP), Y13
  3230  	VPBROADCASTD seq<>+4(SB), Y14
  3231  	VPSLLD       $0x06, Y14, Y14
  3232  	VPBROADCASTD 576(SP)(DX*4), Y15
  3233  	VMOVDQU      Y8, 640(SP)
  3234  
  3235  	// Round 1
  3236  	VPADDD  Y0, Y4, Y0
  3237  	VPADDD  (SP), Y0, Y0
  3238  	VPXOR   Y12, Y0, Y12
  3239  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3240  	VMOVDQU 640(SP), Y8
  3241  	VPADDD  Y8, Y12, Y8
  3242  	VPXOR   Y4, Y8, Y4
  3243  	VMOVDQU Y8, 640(SP)
  3244  	VPSRLD  $0x0c, Y4, Y8
  3245  	VPSLLD  $0x14, Y4, Y4
  3246  	VPOR    Y4, Y8, Y4
  3247  	VPADDD  Y0, Y4, Y0
  3248  	VPADDD  32(SP), Y0, Y0
  3249  	VPXOR   Y12, Y0, Y12
  3250  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3251  	VMOVDQU 640(SP), Y8
  3252  	VPADDD  Y8, Y12, Y8
  3253  	VPXOR   Y4, Y8, Y4
  3254  	VMOVDQU Y8, 640(SP)
  3255  	VPSRLD  $0x07, Y4, Y8
  3256  	VPSLLD  $0x19, Y4, Y4
  3257  	VPOR    Y4, Y8, Y4
  3258  	VPADDD  Y1, Y5, Y1
  3259  	VPADDD  64(SP), Y1, Y1
  3260  	VPXOR   Y13, Y1, Y13
  3261  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3262  	VPADDD  Y9, Y13, Y9
  3263  	VPXOR   Y5, Y9, Y5
  3264  	VPSRLD  $0x0c, Y5, Y8
  3265  	VPSLLD  $0x14, Y5, Y5
  3266  	VPOR    Y5, Y8, Y5
  3267  	VPADDD  Y1, Y5, Y1
  3268  	VPADDD  96(SP), Y1, Y1
  3269  	VPXOR   Y13, Y1, Y13
  3270  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3271  	VPADDD  Y9, Y13, Y9
  3272  	VPXOR   Y5, Y9, Y5
  3273  	VPSRLD  $0x07, Y5, Y8
  3274  	VPSLLD  $0x19, Y5, Y5
  3275  	VPOR    Y5, Y8, Y5
  3276  	VPADDD  Y2, Y6, Y2
  3277  	VPADDD  128(SP), Y2, Y2
  3278  	VPXOR   Y14, Y2, Y14
  3279  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3280  	VPADDD  Y10, Y14, Y10
  3281  	VPXOR   Y6, Y10, Y6
  3282  	VPSRLD  $0x0c, Y6, Y8
  3283  	VPSLLD  $0x14, Y6, Y6
  3284  	VPOR    Y6, Y8, Y6
  3285  	VPADDD  Y2, Y6, Y2
  3286  	VPADDD  160(SP), Y2, Y2
  3287  	VPXOR   Y14, Y2, Y14
  3288  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3289  	VPADDD  Y10, Y14, Y10
  3290  	VPXOR   Y6, Y10, Y6
  3291  	VPSRLD  $0x07, Y6, Y8
  3292  	VPSLLD  $0x19, Y6, Y6
  3293  	VPOR    Y6, Y8, Y6
  3294  	VPADDD  Y3, Y7, Y3
  3295  	VPADDD  192(SP), Y3, Y3
  3296  	VPXOR   Y15, Y3, Y15
  3297  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3298  	VPADDD  Y11, Y15, Y11
  3299  	VPXOR   Y7, Y11, Y7
  3300  	VPSRLD  $0x0c, Y7, Y8
  3301  	VPSLLD  $0x14, Y7, Y7
  3302  	VPOR    Y7, Y8, Y7
  3303  	VPADDD  Y3, Y7, Y3
  3304  	VPADDD  224(SP), Y3, Y3
  3305  	VPXOR   Y15, Y3, Y15
  3306  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3307  	VPADDD  Y11, Y15, Y11
  3308  	VPXOR   Y7, Y11, Y7
  3309  	VPSRLD  $0x07, Y7, Y8
  3310  	VPSLLD  $0x19, Y7, Y7
  3311  	VPOR    Y7, Y8, Y7
  3312  	VPADDD  Y0, Y5, Y0
  3313  	VPADDD  256(SP), Y0, Y0
  3314  	VPXOR   Y15, Y0, Y15
  3315  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3316  	VPADDD  Y10, Y15, Y10
  3317  	VPXOR   Y5, Y10, Y5
  3318  	VPSRLD  $0x0c, Y5, Y8
  3319  	VPSLLD  $0x14, Y5, Y5
  3320  	VPOR    Y5, Y8, Y5
  3321  	VPADDD  Y0, Y5, Y0
  3322  	VPADDD  288(SP), Y0, Y0
  3323  	VPXOR   Y15, Y0, Y15
  3324  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3325  	VPADDD  Y10, Y15, Y10
  3326  	VPXOR   Y5, Y10, Y5
  3327  	VPSRLD  $0x07, Y5, Y8
  3328  	VPSLLD  $0x19, Y5, Y5
  3329  	VPOR    Y5, Y8, Y5
  3330  	VPADDD  Y1, Y6, Y1
  3331  	VPADDD  320(SP), Y1, Y1
  3332  	VPXOR   Y12, Y1, Y12
  3333  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3334  	VPADDD  Y11, Y12, Y11
  3335  	VPXOR   Y6, Y11, Y6
  3336  	VPSRLD  $0x0c, Y6, Y8
  3337  	VPSLLD  $0x14, Y6, Y6
  3338  	VPOR    Y6, Y8, Y6
  3339  	VPADDD  Y1, Y6, Y1
  3340  	VPADDD  352(SP), Y1, Y1
  3341  	VPXOR   Y12, Y1, Y12
  3342  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3343  	VPADDD  Y11, Y12, Y11
  3344  	VPXOR   Y6, Y11, Y6
  3345  	VPSRLD  $0x07, Y6, Y8
  3346  	VPSLLD  $0x19, Y6, Y6
  3347  	VPOR    Y6, Y8, Y6
  3348  	VPADDD  Y2, Y7, Y2
  3349  	VPADDD  384(SP), Y2, Y2
  3350  	VPXOR   Y13, Y2, Y13
  3351  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3352  	VMOVDQU 640(SP), Y8
  3353  	VPADDD  Y8, Y13, Y8
  3354  	VPXOR   Y7, Y8, Y7
  3355  	VMOVDQU Y8, 640(SP)
  3356  	VPSRLD  $0x0c, Y7, Y8
  3357  	VPSLLD  $0x14, Y7, Y7
  3358  	VPOR    Y7, Y8, Y7
  3359  	VPADDD  Y2, Y7, Y2
  3360  	VPADDD  416(SP), Y2, Y2
  3361  	VPXOR   Y13, Y2, Y13
  3362  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3363  	VMOVDQU 640(SP), Y8
  3364  	VPADDD  Y8, Y13, Y8
  3365  	VPXOR   Y7, Y8, Y7
  3366  	VMOVDQU Y8, 640(SP)
  3367  	VPSRLD  $0x07, Y7, Y8
  3368  	VPSLLD  $0x19, Y7, Y7
  3369  	VPOR    Y7, Y8, Y7
  3370  	VPADDD  Y3, Y4, Y3
  3371  	VPADDD  448(SP), Y3, Y3
  3372  	VPXOR   Y14, Y3, Y14
  3373  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3374  	VPADDD  Y9, Y14, Y9
  3375  	VPXOR   Y4, Y9, Y4
  3376  	VPSRLD  $0x0c, Y4, Y8
  3377  	VPSLLD  $0x14, Y4, Y4
  3378  	VPOR    Y4, Y8, Y4
  3379  	VPADDD  Y3, Y4, Y3
  3380  	VPADDD  480(SP), Y3, Y3
  3381  	VPXOR   Y14, Y3, Y14
  3382  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3383  	VPADDD  Y9, Y14, Y9
  3384  	VPXOR   Y4, Y9, Y4
  3385  	VPSRLD  $0x07, Y4, Y8
  3386  	VPSLLD  $0x19, Y4, Y4
  3387  	VPOR    Y4, Y8, Y4
  3388  
  3389  	// Round 2
  3390  	VPADDD  Y0, Y4, Y0
  3391  	VPADDD  64(SP), Y0, Y0
  3392  	VPXOR   Y12, Y0, Y12
  3393  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3394  	VMOVDQU 640(SP), Y8
  3395  	VPADDD  Y8, Y12, Y8
  3396  	VPXOR   Y4, Y8, Y4
  3397  	VMOVDQU Y8, 640(SP)
  3398  	VPSRLD  $0x0c, Y4, Y8
  3399  	VPSLLD  $0x14, Y4, Y4
  3400  	VPOR    Y4, Y8, Y4
  3401  	VPADDD  Y0, Y4, Y0
  3402  	VPADDD  192(SP), Y0, Y0
  3403  	VPXOR   Y12, Y0, Y12
  3404  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3405  	VMOVDQU 640(SP), Y8
  3406  	VPADDD  Y8, Y12, Y8
  3407  	VPXOR   Y4, Y8, Y4
  3408  	VMOVDQU Y8, 640(SP)
  3409  	VPSRLD  $0x07, Y4, Y8
  3410  	VPSLLD  $0x19, Y4, Y4
  3411  	VPOR    Y4, Y8, Y4
  3412  	VPADDD  Y1, Y5, Y1
  3413  	VPADDD  96(SP), Y1, Y1
  3414  	VPXOR   Y13, Y1, Y13
  3415  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3416  	VPADDD  Y9, Y13, Y9
  3417  	VPXOR   Y5, Y9, Y5
  3418  	VPSRLD  $0x0c, Y5, Y8
  3419  	VPSLLD  $0x14, Y5, Y5
  3420  	VPOR    Y5, Y8, Y5
  3421  	VPADDD  Y1, Y5, Y1
  3422  	VPADDD  320(SP), Y1, Y1
  3423  	VPXOR   Y13, Y1, Y13
  3424  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3425  	VPADDD  Y9, Y13, Y9
  3426  	VPXOR   Y5, Y9, Y5
  3427  	VPSRLD  $0x07, Y5, Y8
  3428  	VPSLLD  $0x19, Y5, Y5
  3429  	VPOR    Y5, Y8, Y5
  3430  	VPADDD  Y2, Y6, Y2
  3431  	VPADDD  224(SP), Y2, Y2
  3432  	VPXOR   Y14, Y2, Y14
  3433  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3434  	VPADDD  Y10, Y14, Y10
  3435  	VPXOR   Y6, Y10, Y6
  3436  	VPSRLD  $0x0c, Y6, Y8
  3437  	VPSLLD  $0x14, Y6, Y6
  3438  	VPOR    Y6, Y8, Y6
  3439  	VPADDD  Y2, Y6, Y2
  3440  	VPADDD  (SP), Y2, Y2
  3441  	VPXOR   Y14, Y2, Y14
  3442  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3443  	VPADDD  Y10, Y14, Y10
  3444  	VPXOR   Y6, Y10, Y6
  3445  	VPSRLD  $0x07, Y6, Y8
  3446  	VPSLLD  $0x19, Y6, Y6
  3447  	VPOR    Y6, Y8, Y6
  3448  	VPADDD  Y3, Y7, Y3
  3449  	VPADDD  128(SP), Y3, Y3
  3450  	VPXOR   Y15, Y3, Y15
  3451  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3452  	VPADDD  Y11, Y15, Y11
  3453  	VPXOR   Y7, Y11, Y7
  3454  	VPSRLD  $0x0c, Y7, Y8
  3455  	VPSLLD  $0x14, Y7, Y7
  3456  	VPOR    Y7, Y8, Y7
  3457  	VPADDD  Y3, Y7, Y3
  3458  	VPADDD  416(SP), Y3, Y3
  3459  	VPXOR   Y15, Y3, Y15
  3460  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3461  	VPADDD  Y11, Y15, Y11
  3462  	VPXOR   Y7, Y11, Y7
  3463  	VPSRLD  $0x07, Y7, Y8
  3464  	VPSLLD  $0x19, Y7, Y7
  3465  	VPOR    Y7, Y8, Y7
  3466  	VPADDD  Y0, Y5, Y0
  3467  	VPADDD  32(SP), Y0, Y0
  3468  	VPXOR   Y15, Y0, Y15
  3469  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3470  	VPADDD  Y10, Y15, Y10
  3471  	VPXOR   Y5, Y10, Y5
  3472  	VPSRLD  $0x0c, Y5, Y8
  3473  	VPSLLD  $0x14, Y5, Y5
  3474  	VPOR    Y5, Y8, Y5
  3475  	VPADDD  Y0, Y5, Y0
  3476  	VPADDD  352(SP), Y0, Y0
  3477  	VPXOR   Y15, Y0, Y15
  3478  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3479  	VPADDD  Y10, Y15, Y10
  3480  	VPXOR   Y5, Y10, Y5
  3481  	VPSRLD  $0x07, Y5, Y8
  3482  	VPSLLD  $0x19, Y5, Y5
  3483  	VPOR    Y5, Y8, Y5
  3484  	VPADDD  Y1, Y6, Y1
  3485  	VPADDD  384(SP), Y1, Y1
  3486  	VPXOR   Y12, Y1, Y12
  3487  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3488  	VPADDD  Y11, Y12, Y11
  3489  	VPXOR   Y6, Y11, Y6
  3490  	VPSRLD  $0x0c, Y6, Y8
  3491  	VPSLLD  $0x14, Y6, Y6
  3492  	VPOR    Y6, Y8, Y6
  3493  	VPADDD  Y1, Y6, Y1
  3494  	VPADDD  160(SP), Y1, Y1
  3495  	VPXOR   Y12, Y1, Y12
  3496  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3497  	VPADDD  Y11, Y12, Y11
  3498  	VPXOR   Y6, Y11, Y6
  3499  	VPSRLD  $0x07, Y6, Y8
  3500  	VPSLLD  $0x19, Y6, Y6
  3501  	VPOR    Y6, Y8, Y6
  3502  	VPADDD  Y2, Y7, Y2
  3503  	VPADDD  288(SP), Y2, Y2
  3504  	VPXOR   Y13, Y2, Y13
  3505  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3506  	VMOVDQU 640(SP), Y8
  3507  	VPADDD  Y8, Y13, Y8
  3508  	VPXOR   Y7, Y8, Y7
  3509  	VMOVDQU Y8, 640(SP)
  3510  	VPSRLD  $0x0c, Y7, Y8
  3511  	VPSLLD  $0x14, Y7, Y7
  3512  	VPOR    Y7, Y8, Y7
  3513  	VPADDD  Y2, Y7, Y2
  3514  	VPADDD  448(SP), Y2, Y2
  3515  	VPXOR   Y13, Y2, Y13
  3516  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3517  	VMOVDQU 640(SP), Y8
  3518  	VPADDD  Y8, Y13, Y8
  3519  	VPXOR   Y7, Y8, Y7
  3520  	VMOVDQU Y8, 640(SP)
  3521  	VPSRLD  $0x07, Y7, Y8
  3522  	VPSLLD  $0x19, Y7, Y7
  3523  	VPOR    Y7, Y8, Y7
  3524  	VPADDD  Y3, Y4, Y3
  3525  	VPADDD  480(SP), Y3, Y3
  3526  	VPXOR   Y14, Y3, Y14
  3527  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3528  	VPADDD  Y9, Y14, Y9
  3529  	VPXOR   Y4, Y9, Y4
  3530  	VPSRLD  $0x0c, Y4, Y8
  3531  	VPSLLD  $0x14, Y4, Y4
  3532  	VPOR    Y4, Y8, Y4
  3533  	VPADDD  Y3, Y4, Y3
  3534  	VPADDD  256(SP), Y3, Y3
  3535  	VPXOR   Y14, Y3, Y14
  3536  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3537  	VPADDD  Y9, Y14, Y9
  3538  	VPXOR   Y4, Y9, Y4
  3539  	VPSRLD  $0x07, Y4, Y8
  3540  	VPSLLD  $0x19, Y4, Y4
  3541  	VPOR    Y4, Y8, Y4
  3542  
  3543  	// Round 3
  3544  	VPADDD  Y0, Y4, Y0
  3545  	VPADDD  96(SP), Y0, Y0
  3546  	VPXOR   Y12, Y0, Y12
  3547  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3548  	VMOVDQU 640(SP), Y8
  3549  	VPADDD  Y8, Y12, Y8
  3550  	VPXOR   Y4, Y8, Y4
  3551  	VMOVDQU Y8, 640(SP)
  3552  	VPSRLD  $0x0c, Y4, Y8
  3553  	VPSLLD  $0x14, Y4, Y4
  3554  	VPOR    Y4, Y8, Y4
  3555  	VPADDD  Y0, Y4, Y0
  3556  	VPADDD  128(SP), Y0, Y0
  3557  	VPXOR   Y12, Y0, Y12
  3558  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3559  	VMOVDQU 640(SP), Y8
  3560  	VPADDD  Y8, Y12, Y8
  3561  	VPXOR   Y4, Y8, Y4
  3562  	VMOVDQU Y8, 640(SP)
  3563  	VPSRLD  $0x07, Y4, Y8
  3564  	VPSLLD  $0x19, Y4, Y4
  3565  	VPOR    Y4, Y8, Y4
  3566  	VPADDD  Y1, Y5, Y1
  3567  	VPADDD  320(SP), Y1, Y1
  3568  	VPXOR   Y13, Y1, Y13
  3569  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3570  	VPADDD  Y9, Y13, Y9
  3571  	VPXOR   Y5, Y9, Y5
  3572  	VPSRLD  $0x0c, Y5, Y8
  3573  	VPSLLD  $0x14, Y5, Y5
  3574  	VPOR    Y5, Y8, Y5
  3575  	VPADDD  Y1, Y5, Y1
  3576  	VPADDD  384(SP), Y1, Y1
  3577  	VPXOR   Y13, Y1, Y13
  3578  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3579  	VPADDD  Y9, Y13, Y9
  3580  	VPXOR   Y5, Y9, Y5
  3581  	VPSRLD  $0x07, Y5, Y8
  3582  	VPSLLD  $0x19, Y5, Y5
  3583  	VPOR    Y5, Y8, Y5
  3584  	VPADDD  Y2, Y6, Y2
  3585  	VPADDD  416(SP), Y2, Y2
  3586  	VPXOR   Y14, Y2, Y14
  3587  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3588  	VPADDD  Y10, Y14, Y10
  3589  	VPXOR   Y6, Y10, Y6
  3590  	VPSRLD  $0x0c, Y6, Y8
  3591  	VPSLLD  $0x14, Y6, Y6
  3592  	VPOR    Y6, Y8, Y6
  3593  	VPADDD  Y2, Y6, Y2
  3594  	VPADDD  64(SP), Y2, Y2
  3595  	VPXOR   Y14, Y2, Y14
  3596  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3597  	VPADDD  Y10, Y14, Y10
  3598  	VPXOR   Y6, Y10, Y6
  3599  	VPSRLD  $0x07, Y6, Y8
  3600  	VPSLLD  $0x19, Y6, Y6
  3601  	VPOR    Y6, Y8, Y6
  3602  	VPADDD  Y3, Y7, Y3
  3603  	VPADDD  224(SP), Y3, Y3
  3604  	VPXOR   Y15, Y3, Y15
  3605  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3606  	VPADDD  Y11, Y15, Y11
  3607  	VPXOR   Y7, Y11, Y7
  3608  	VPSRLD  $0x0c, Y7, Y8
  3609  	VPSLLD  $0x14, Y7, Y7
  3610  	VPOR    Y7, Y8, Y7
  3611  	VPADDD  Y3, Y7, Y3
  3612  	VPADDD  448(SP), Y3, Y3
  3613  	VPXOR   Y15, Y3, Y15
  3614  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3615  	VPADDD  Y11, Y15, Y11
  3616  	VPXOR   Y7, Y11, Y7
  3617  	VPSRLD  $0x07, Y7, Y8
  3618  	VPSLLD  $0x19, Y7, Y7
  3619  	VPOR    Y7, Y8, Y7
  3620  	VPADDD  Y0, Y5, Y0
  3621  	VPADDD  192(SP), Y0, Y0
  3622  	VPXOR   Y15, Y0, Y15
  3623  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3624  	VPADDD  Y10, Y15, Y10
  3625  	VPXOR   Y5, Y10, Y5
  3626  	VPSRLD  $0x0c, Y5, Y8
  3627  	VPSLLD  $0x14, Y5, Y5
  3628  	VPOR    Y5, Y8, Y5
  3629  	VPADDD  Y0, Y5, Y0
  3630  	VPADDD  160(SP), Y0, Y0
  3631  	VPXOR   Y15, Y0, Y15
  3632  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3633  	VPADDD  Y10, Y15, Y10
  3634  	VPXOR   Y5, Y10, Y5
  3635  	VPSRLD  $0x07, Y5, Y8
  3636  	VPSLLD  $0x19, Y5, Y5
  3637  	VPOR    Y5, Y8, Y5
  3638  	VPADDD  Y1, Y6, Y1
  3639  	VPADDD  288(SP), Y1, Y1
  3640  	VPXOR   Y12, Y1, Y12
  3641  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3642  	VPADDD  Y11, Y12, Y11
  3643  	VPXOR   Y6, Y11, Y6
  3644  	VPSRLD  $0x0c, Y6, Y8
  3645  	VPSLLD  $0x14, Y6, Y6
  3646  	VPOR    Y6, Y8, Y6
  3647  	VPADDD  Y1, Y6, Y1
  3648  	VPADDD  (SP), Y1, Y1
  3649  	VPXOR   Y12, Y1, Y12
  3650  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3651  	VPADDD  Y11, Y12, Y11
  3652  	VPXOR   Y6, Y11, Y6
  3653  	VPSRLD  $0x07, Y6, Y8
  3654  	VPSLLD  $0x19, Y6, Y6
  3655  	VPOR    Y6, Y8, Y6
  3656  	VPADDD  Y2, Y7, Y2
  3657  	VPADDD  352(SP), Y2, Y2
  3658  	VPXOR   Y13, Y2, Y13
  3659  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3660  	VMOVDQU 640(SP), Y8
  3661  	VPADDD  Y8, Y13, Y8
  3662  	VPXOR   Y7, Y8, Y7
  3663  	VMOVDQU Y8, 640(SP)
  3664  	VPSRLD  $0x0c, Y7, Y8
  3665  	VPSLLD  $0x14, Y7, Y7
  3666  	VPOR    Y7, Y8, Y7
  3667  	VPADDD  Y2, Y7, Y2
  3668  	VPADDD  480(SP), Y2, Y2
  3669  	VPXOR   Y13, Y2, Y13
  3670  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3671  	VMOVDQU 640(SP), Y8
  3672  	VPADDD  Y8, Y13, Y8
  3673  	VPXOR   Y7, Y8, Y7
  3674  	VMOVDQU Y8, 640(SP)
  3675  	VPSRLD  $0x07, Y7, Y8
  3676  	VPSLLD  $0x19, Y7, Y7
  3677  	VPOR    Y7, Y8, Y7
  3678  	VPADDD  Y3, Y4, Y3
  3679  	VPADDD  256(SP), Y3, Y3
  3680  	VPXOR   Y14, Y3, Y14
  3681  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3682  	VPADDD  Y9, Y14, Y9
  3683  	VPXOR   Y4, Y9, Y4
  3684  	VPSRLD  $0x0c, Y4, Y8
  3685  	VPSLLD  $0x14, Y4, Y4
  3686  	VPOR    Y4, Y8, Y4
  3687  	VPADDD  Y3, Y4, Y3
  3688  	VPADDD  32(SP), Y3, Y3
  3689  	VPXOR   Y14, Y3, Y14
  3690  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3691  	VPADDD  Y9, Y14, Y9
  3692  	VPXOR   Y4, Y9, Y4
  3693  	VPSRLD  $0x07, Y4, Y8
  3694  	VPSLLD  $0x19, Y4, Y4
  3695  	VPOR    Y4, Y8, Y4
  3696  
  3697  	// Round 4
  3698  	VPADDD  Y0, Y4, Y0
  3699  	VPADDD  320(SP), Y0, Y0
  3700  	VPXOR   Y12, Y0, Y12
  3701  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3702  	VMOVDQU 640(SP), Y8
  3703  	VPADDD  Y8, Y12, Y8
  3704  	VPXOR   Y4, Y8, Y4
  3705  	VMOVDQU Y8, 640(SP)
  3706  	VPSRLD  $0x0c, Y4, Y8
  3707  	VPSLLD  $0x14, Y4, Y4
  3708  	VPOR    Y4, Y8, Y4
  3709  	VPADDD  Y0, Y4, Y0
  3710  	VPADDD  224(SP), Y0, Y0
  3711  	VPXOR   Y12, Y0, Y12
  3712  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3713  	VMOVDQU 640(SP), Y8
  3714  	VPADDD  Y8, Y12, Y8
  3715  	VPXOR   Y4, Y8, Y4
  3716  	VMOVDQU Y8, 640(SP)
  3717  	VPSRLD  $0x07, Y4, Y8
  3718  	VPSLLD  $0x19, Y4, Y4
  3719  	VPOR    Y4, Y8, Y4
  3720  	VPADDD  Y1, Y5, Y1
  3721  	VPADDD  384(SP), Y1, Y1
  3722  	VPXOR   Y13, Y1, Y13
  3723  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3724  	VPADDD  Y9, Y13, Y9
  3725  	VPXOR   Y5, Y9, Y5
  3726  	VPSRLD  $0x0c, Y5, Y8
  3727  	VPSLLD  $0x14, Y5, Y5
  3728  	VPOR    Y5, Y8, Y5
  3729  	VPADDD  Y1, Y5, Y1
  3730  	VPADDD  288(SP), Y1, Y1
  3731  	VPXOR   Y13, Y1, Y13
  3732  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3733  	VPADDD  Y9, Y13, Y9
  3734  	VPXOR   Y5, Y9, Y5
  3735  	VPSRLD  $0x07, Y5, Y8
  3736  	VPSLLD  $0x19, Y5, Y5
  3737  	VPOR    Y5, Y8, Y5
  3738  	VPADDD  Y2, Y6, Y2
  3739  	VPADDD  448(SP), Y2, Y2
  3740  	VPXOR   Y14, Y2, Y14
  3741  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3742  	VPADDD  Y10, Y14, Y10
  3743  	VPXOR   Y6, Y10, Y6
  3744  	VPSRLD  $0x0c, Y6, Y8
  3745  	VPSLLD  $0x14, Y6, Y6
  3746  	VPOR    Y6, Y8, Y6
  3747  	VPADDD  Y2, Y6, Y2
  3748  	VPADDD  96(SP), Y2, Y2
  3749  	VPXOR   Y14, Y2, Y14
  3750  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3751  	VPADDD  Y10, Y14, Y10
  3752  	VPXOR   Y6, Y10, Y6
  3753  	VPSRLD  $0x07, Y6, Y8
  3754  	VPSLLD  $0x19, Y6, Y6
  3755  	VPOR    Y6, Y8, Y6
  3756  	VPADDD  Y3, Y7, Y3
  3757  	VPADDD  416(SP), Y3, Y3
  3758  	VPXOR   Y15, Y3, Y15
  3759  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3760  	VPADDD  Y11, Y15, Y11
  3761  	VPXOR   Y7, Y11, Y7
  3762  	VPSRLD  $0x0c, Y7, Y8
  3763  	VPSLLD  $0x14, Y7, Y7
  3764  	VPOR    Y7, Y8, Y7
  3765  	VPADDD  Y3, Y7, Y3
  3766  	VPADDD  480(SP), Y3, Y3
  3767  	VPXOR   Y15, Y3, Y15
  3768  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3769  	VPADDD  Y11, Y15, Y11
  3770  	VPXOR   Y7, Y11, Y7
  3771  	VPSRLD  $0x07, Y7, Y8
  3772  	VPSLLD  $0x19, Y7, Y7
  3773  	VPOR    Y7, Y8, Y7
  3774  	VPADDD  Y0, Y5, Y0
  3775  	VPADDD  128(SP), Y0, Y0
  3776  	VPXOR   Y15, Y0, Y15
  3777  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3778  	VPADDD  Y10, Y15, Y10
  3779  	VPXOR   Y5, Y10, Y5
  3780  	VPSRLD  $0x0c, Y5, Y8
  3781  	VPSLLD  $0x14, Y5, Y5
  3782  	VPOR    Y5, Y8, Y5
  3783  	VPADDD  Y0, Y5, Y0
  3784  	VPADDD  (SP), Y0, Y0
  3785  	VPXOR   Y15, Y0, Y15
  3786  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3787  	VPADDD  Y10, Y15, Y10
  3788  	VPXOR   Y5, Y10, Y5
  3789  	VPSRLD  $0x07, Y5, Y8
  3790  	VPSLLD  $0x19, Y5, Y5
  3791  	VPOR    Y5, Y8, Y5
  3792  	VPADDD  Y1, Y6, Y1
  3793  	VPADDD  352(SP), Y1, Y1
  3794  	VPXOR   Y12, Y1, Y12
  3795  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3796  	VPADDD  Y11, Y12, Y11
  3797  	VPXOR   Y6, Y11, Y6
  3798  	VPSRLD  $0x0c, Y6, Y8
  3799  	VPSLLD  $0x14, Y6, Y6
  3800  	VPOR    Y6, Y8, Y6
  3801  	VPADDD  Y1, Y6, Y1
  3802  	VPADDD  64(SP), Y1, Y1
  3803  	VPXOR   Y12, Y1, Y12
  3804  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3805  	VPADDD  Y11, Y12, Y11
  3806  	VPXOR   Y6, Y11, Y6
  3807  	VPSRLD  $0x07, Y6, Y8
  3808  	VPSLLD  $0x19, Y6, Y6
  3809  	VPOR    Y6, Y8, Y6
  3810  	VPADDD  Y2, Y7, Y2
  3811  	VPADDD  160(SP), Y2, Y2
  3812  	VPXOR   Y13, Y2, Y13
  3813  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3814  	VMOVDQU 640(SP), Y8
  3815  	VPADDD  Y8, Y13, Y8
  3816  	VPXOR   Y7, Y8, Y7
  3817  	VMOVDQU Y8, 640(SP)
  3818  	VPSRLD  $0x0c, Y7, Y8
  3819  	VPSLLD  $0x14, Y7, Y7
  3820  	VPOR    Y7, Y8, Y7
  3821  	VPADDD  Y2, Y7, Y2
  3822  	VPADDD  256(SP), Y2, Y2
  3823  	VPXOR   Y13, Y2, Y13
  3824  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3825  	VMOVDQU 640(SP), Y8
  3826  	VPADDD  Y8, Y13, Y8
  3827  	VPXOR   Y7, Y8, Y7
  3828  	VMOVDQU Y8, 640(SP)
  3829  	VPSRLD  $0x07, Y7, Y8
  3830  	VPSLLD  $0x19, Y7, Y7
  3831  	VPOR    Y7, Y8, Y7
  3832  	VPADDD  Y3, Y4, Y3
  3833  	VPADDD  32(SP), Y3, Y3
  3834  	VPXOR   Y14, Y3, Y14
  3835  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3836  	VPADDD  Y9, Y14, Y9
  3837  	VPXOR   Y4, Y9, Y4
  3838  	VPSRLD  $0x0c, Y4, Y8
  3839  	VPSLLD  $0x14, Y4, Y4
  3840  	VPOR    Y4, Y8, Y4
  3841  	VPADDD  Y3, Y4, Y3
  3842  	VPADDD  192(SP), Y3, Y3
  3843  	VPXOR   Y14, Y3, Y14
  3844  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3845  	VPADDD  Y9, Y14, Y9
  3846  	VPXOR   Y4, Y9, Y4
  3847  	VPSRLD  $0x07, Y4, Y8
  3848  	VPSLLD  $0x19, Y4, Y4
  3849  	VPOR    Y4, Y8, Y4
  3850  
  3851  	// Round 5
  3852  	VPADDD  Y0, Y4, Y0
  3853  	VPADDD  384(SP), Y0, Y0
  3854  	VPXOR   Y12, Y0, Y12
  3855  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3856  	VMOVDQU 640(SP), Y8
  3857  	VPADDD  Y8, Y12, Y8
  3858  	VPXOR   Y4, Y8, Y4
  3859  	VMOVDQU Y8, 640(SP)
  3860  	VPSRLD  $0x0c, Y4, Y8
  3861  	VPSLLD  $0x14, Y4, Y4
  3862  	VPOR    Y4, Y8, Y4
  3863  	VPADDD  Y0, Y4, Y0
  3864  	VPADDD  416(SP), Y0, Y0
  3865  	VPXOR   Y12, Y0, Y12
  3866  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3867  	VMOVDQU 640(SP), Y8
  3868  	VPADDD  Y8, Y12, Y8
  3869  	VPXOR   Y4, Y8, Y4
  3870  	VMOVDQU Y8, 640(SP)
  3871  	VPSRLD  $0x07, Y4, Y8
  3872  	VPSLLD  $0x19, Y4, Y4
  3873  	VPOR    Y4, Y8, Y4
  3874  	VPADDD  Y1, Y5, Y1
  3875  	VPADDD  288(SP), Y1, Y1
  3876  	VPXOR   Y13, Y1, Y13
  3877  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3878  	VPADDD  Y9, Y13, Y9
  3879  	VPXOR   Y5, Y9, Y5
  3880  	VPSRLD  $0x0c, Y5, Y8
  3881  	VPSLLD  $0x14, Y5, Y5
  3882  	VPOR    Y5, Y8, Y5
  3883  	VPADDD  Y1, Y5, Y1
  3884  	VPADDD  352(SP), Y1, Y1
  3885  	VPXOR   Y13, Y1, Y13
  3886  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3887  	VPADDD  Y9, Y13, Y9
  3888  	VPXOR   Y5, Y9, Y5
  3889  	VPSRLD  $0x07, Y5, Y8
  3890  	VPSLLD  $0x19, Y5, Y5
  3891  	VPOR    Y5, Y8, Y5
  3892  	VPADDD  Y2, Y6, Y2
  3893  	VPADDD  480(SP), Y2, Y2
  3894  	VPXOR   Y14, Y2, Y14
  3895  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3896  	VPADDD  Y10, Y14, Y10
  3897  	VPXOR   Y6, Y10, Y6
  3898  	VPSRLD  $0x0c, Y6, Y8
  3899  	VPSLLD  $0x14, Y6, Y6
  3900  	VPOR    Y6, Y8, Y6
  3901  	VPADDD  Y2, Y6, Y2
  3902  	VPADDD  320(SP), Y2, Y2
  3903  	VPXOR   Y14, Y2, Y14
  3904  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3905  	VPADDD  Y10, Y14, Y10
  3906  	VPXOR   Y6, Y10, Y6
  3907  	VPSRLD  $0x07, Y6, Y8
  3908  	VPSLLD  $0x19, Y6, Y6
  3909  	VPOR    Y6, Y8, Y6
  3910  	VPADDD  Y3, Y7, Y3
  3911  	VPADDD  448(SP), Y3, Y3
  3912  	VPXOR   Y15, Y3, Y15
  3913  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3914  	VPADDD  Y11, Y15, Y11
  3915  	VPXOR   Y7, Y11, Y7
  3916  	VPSRLD  $0x0c, Y7, Y8
  3917  	VPSLLD  $0x14, Y7, Y7
  3918  	VPOR    Y7, Y8, Y7
  3919  	VPADDD  Y3, Y7, Y3
  3920  	VPADDD  256(SP), Y3, Y3
  3921  	VPXOR   Y15, Y3, Y15
  3922  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3923  	VPADDD  Y11, Y15, Y11
  3924  	VPXOR   Y7, Y11, Y7
  3925  	VPSRLD  $0x07, Y7, Y8
  3926  	VPSLLD  $0x19, Y7, Y7
  3927  	VPOR    Y7, Y8, Y7
  3928  	VPADDD  Y0, Y5, Y0
  3929  	VPADDD  224(SP), Y0, Y0
  3930  	VPXOR   Y15, Y0, Y15
  3931  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3932  	VPADDD  Y10, Y15, Y10
  3933  	VPXOR   Y5, Y10, Y5
  3934  	VPSRLD  $0x0c, Y5, Y8
  3935  	VPSLLD  $0x14, Y5, Y5
  3936  	VPOR    Y5, Y8, Y5
  3937  	VPADDD  Y0, Y5, Y0
  3938  	VPADDD  64(SP), Y0, Y0
  3939  	VPXOR   Y15, Y0, Y15
  3940  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3941  	VPADDD  Y10, Y15, Y10
  3942  	VPXOR   Y5, Y10, Y5
  3943  	VPSRLD  $0x07, Y5, Y8
  3944  	VPSLLD  $0x19, Y5, Y5
  3945  	VPOR    Y5, Y8, Y5
  3946  	VPADDD  Y1, Y6, Y1
  3947  	VPADDD  160(SP), Y1, Y1
  3948  	VPXOR   Y12, Y1, Y12
  3949  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3950  	VPADDD  Y11, Y12, Y11
  3951  	VPXOR   Y6, Y11, Y6
  3952  	VPSRLD  $0x0c, Y6, Y8
  3953  	VPSLLD  $0x14, Y6, Y6
  3954  	VPOR    Y6, Y8, Y6
  3955  	VPADDD  Y1, Y6, Y1
  3956  	VPADDD  96(SP), Y1, Y1
  3957  	VPXOR   Y12, Y1, Y12
  3958  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3959  	VPADDD  Y11, Y12, Y11
  3960  	VPXOR   Y6, Y11, Y6
  3961  	VPSRLD  $0x07, Y6, Y8
  3962  	VPSLLD  $0x19, Y6, Y6
  3963  	VPOR    Y6, Y8, Y6
  3964  	VPADDD  Y2, Y7, Y2
  3965  	VPADDD  (SP), Y2, Y2
  3966  	VPXOR   Y13, Y2, Y13
  3967  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3968  	VMOVDQU 640(SP), Y8
  3969  	VPADDD  Y8, Y13, Y8
  3970  	VPXOR   Y7, Y8, Y7
  3971  	VMOVDQU Y8, 640(SP)
  3972  	VPSRLD  $0x0c, Y7, Y8
  3973  	VPSLLD  $0x14, Y7, Y7
  3974  	VPOR    Y7, Y8, Y7
  3975  	VPADDD  Y2, Y7, Y2
  3976  	VPADDD  32(SP), Y2, Y2
  3977  	VPXOR   Y13, Y2, Y13
  3978  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3979  	VMOVDQU 640(SP), Y8
  3980  	VPADDD  Y8, Y13, Y8
  3981  	VPXOR   Y7, Y8, Y7
  3982  	VMOVDQU Y8, 640(SP)
  3983  	VPSRLD  $0x07, Y7, Y8
  3984  	VPSLLD  $0x19, Y7, Y7
  3985  	VPOR    Y7, Y8, Y7
  3986  	VPADDD  Y3, Y4, Y3
  3987  	VPADDD  192(SP), Y3, Y3
  3988  	VPXOR   Y14, Y3, Y14
  3989  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3990  	VPADDD  Y9, Y14, Y9
  3991  	VPXOR   Y4, Y9, Y4
  3992  	VPSRLD  $0x0c, Y4, Y8
  3993  	VPSLLD  $0x14, Y4, Y4
  3994  	VPOR    Y4, Y8, Y4
  3995  	VPADDD  Y3, Y4, Y3
  3996  	VPADDD  128(SP), Y3, Y3
  3997  	VPXOR   Y14, Y3, Y14
  3998  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3999  	VPADDD  Y9, Y14, Y9
  4000  	VPXOR   Y4, Y9, Y4
  4001  	VPSRLD  $0x07, Y4, Y8
  4002  	VPSLLD  $0x19, Y4, Y4
  4003  	VPOR    Y4, Y8, Y4
  4004  
  4005  	// Round 6
  4006  	VPADDD  Y0, Y4, Y0
  4007  	VPADDD  288(SP), Y0, Y0
  4008  	VPXOR   Y12, Y0, Y12
  4009  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4010  	VMOVDQU 640(SP), Y8
  4011  	VPADDD  Y8, Y12, Y8
  4012  	VPXOR   Y4, Y8, Y4
  4013  	VMOVDQU Y8, 640(SP)
  4014  	VPSRLD  $0x0c, Y4, Y8
  4015  	VPSLLD  $0x14, Y4, Y4
  4016  	VPOR    Y4, Y8, Y4
  4017  	VPADDD  Y0, Y4, Y0
  4018  	VPADDD  448(SP), Y0, Y0
  4019  	VPXOR   Y12, Y0, Y12
  4020  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4021  	VMOVDQU 640(SP), Y8
  4022  	VPADDD  Y8, Y12, Y8
  4023  	VPXOR   Y4, Y8, Y4
  4024  	VMOVDQU Y8, 640(SP)
  4025  	VPSRLD  $0x07, Y4, Y8
  4026  	VPSLLD  $0x19, Y4, Y4
  4027  	VPOR    Y4, Y8, Y4
  4028  	VPADDD  Y1, Y5, Y1
  4029  	VPADDD  352(SP), Y1, Y1
  4030  	VPXOR   Y13, Y1, Y13
  4031  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4032  	VPADDD  Y9, Y13, Y9
  4033  	VPXOR   Y5, Y9, Y5
  4034  	VPSRLD  $0x0c, Y5, Y8
  4035  	VPSLLD  $0x14, Y5, Y5
  4036  	VPOR    Y5, Y8, Y5
  4037  	VPADDD  Y1, Y5, Y1
  4038  	VPADDD  160(SP), Y1, Y1
  4039  	VPXOR   Y13, Y1, Y13
  4040  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4041  	VPADDD  Y9, Y13, Y9
  4042  	VPXOR   Y5, Y9, Y5
  4043  	VPSRLD  $0x07, Y5, Y8
  4044  	VPSLLD  $0x19, Y5, Y5
  4045  	VPOR    Y5, Y8, Y5
  4046  	VPADDD  Y2, Y6, Y2
  4047  	VPADDD  256(SP), Y2, Y2
  4048  	VPXOR   Y14, Y2, Y14
  4049  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4050  	VPADDD  Y10, Y14, Y10
  4051  	VPXOR   Y6, Y10, Y6
  4052  	VPSRLD  $0x0c, Y6, Y8
  4053  	VPSLLD  $0x14, Y6, Y6
  4054  	VPOR    Y6, Y8, Y6
  4055  	VPADDD  Y2, Y6, Y2
  4056  	VPADDD  384(SP), Y2, Y2
  4057  	VPXOR   Y14, Y2, Y14
  4058  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4059  	VPADDD  Y10, Y14, Y10
  4060  	VPXOR   Y6, Y10, Y6
  4061  	VPSRLD  $0x07, Y6, Y8
  4062  	VPSLLD  $0x19, Y6, Y6
  4063  	VPOR    Y6, Y8, Y6
  4064  	VPADDD  Y3, Y7, Y3
  4065  	VPADDD  480(SP), Y3, Y3
  4066  	VPXOR   Y15, Y3, Y15
  4067  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4068  	VPADDD  Y11, Y15, Y11
  4069  	VPXOR   Y7, Y11, Y7
  4070  	VPSRLD  $0x0c, Y7, Y8
  4071  	VPSLLD  $0x14, Y7, Y7
  4072  	VPOR    Y7, Y8, Y7
  4073  	VPADDD  Y3, Y7, Y3
  4074  	VPADDD  32(SP), Y3, Y3
  4075  	VPXOR   Y15, Y3, Y15
  4076  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4077  	VPADDD  Y11, Y15, Y11
  4078  	VPXOR   Y7, Y11, Y7
  4079  	VPSRLD  $0x07, Y7, Y8
  4080  	VPSLLD  $0x19, Y7, Y7
  4081  	VPOR    Y7, Y8, Y7
  4082  	VPADDD  Y0, Y5, Y0
  4083  	VPADDD  416(SP), Y0, Y0
  4084  	VPXOR   Y15, Y0, Y15
  4085  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4086  	VPADDD  Y10, Y15, Y10
  4087  	VPXOR   Y5, Y10, Y5
  4088  	VPSRLD  $0x0c, Y5, Y8
  4089  	VPSLLD  $0x14, Y5, Y5
  4090  	VPOR    Y5, Y8, Y5
  4091  	VPADDD  Y0, Y5, Y0
  4092  	VPADDD  96(SP), Y0, Y0
  4093  	VPXOR   Y15, Y0, Y15
  4094  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4095  	VPADDD  Y10, Y15, Y10
  4096  	VPXOR   Y5, Y10, Y5
  4097  	VPSRLD  $0x07, Y5, Y8
  4098  	VPSLLD  $0x19, Y5, Y5
  4099  	VPOR    Y5, Y8, Y5
  4100  	VPADDD  Y1, Y6, Y1
  4101  	VPADDD  (SP), Y1, Y1
  4102  	VPXOR   Y12, Y1, Y12
  4103  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4104  	VPADDD  Y11, Y12, Y11
  4105  	VPXOR   Y6, Y11, Y6
  4106  	VPSRLD  $0x0c, Y6, Y8
  4107  	VPSLLD  $0x14, Y6, Y6
  4108  	VPOR    Y6, Y8, Y6
  4109  	VPADDD  Y1, Y6, Y1
  4110  	VPADDD  320(SP), Y1, Y1
  4111  	VPXOR   Y12, Y1, Y12
  4112  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4113  	VPADDD  Y11, Y12, Y11
  4114  	VPXOR   Y6, Y11, Y6
  4115  	VPSRLD  $0x07, Y6, Y8
  4116  	VPSLLD  $0x19, Y6, Y6
  4117  	VPOR    Y6, Y8, Y6
  4118  	VPADDD  Y2, Y7, Y2
  4119  	VPADDD  64(SP), Y2, Y2
  4120  	VPXOR   Y13, Y2, Y13
  4121  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4122  	VMOVDQU 640(SP), Y8
  4123  	VPADDD  Y8, Y13, Y8
  4124  	VPXOR   Y7, Y8, Y7
  4125  	VMOVDQU Y8, 640(SP)
  4126  	VPSRLD  $0x0c, Y7, Y8
  4127  	VPSLLD  $0x14, Y7, Y7
  4128  	VPOR    Y7, Y8, Y7
  4129  	VPADDD  Y2, Y7, Y2
  4130  	VPADDD  192(SP), Y2, Y2
  4131  	VPXOR   Y13, Y2, Y13
  4132  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4133  	VMOVDQU 640(SP), Y8
  4134  	VPADDD  Y8, Y13, Y8
  4135  	VPXOR   Y7, Y8, Y7
  4136  	VMOVDQU Y8, 640(SP)
  4137  	VPSRLD  $0x07, Y7, Y8
  4138  	VPSLLD  $0x19, Y7, Y7
  4139  	VPOR    Y7, Y8, Y7
  4140  	VPADDD  Y3, Y4, Y3
  4141  	VPADDD  128(SP), Y3, Y3
  4142  	VPXOR   Y14, Y3, Y14
  4143  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4144  	VPADDD  Y9, Y14, Y9
  4145  	VPXOR   Y4, Y9, Y4
  4146  	VPSRLD  $0x0c, Y4, Y8
  4147  	VPSLLD  $0x14, Y4, Y4
  4148  	VPOR    Y4, Y8, Y4
  4149  	VPADDD  Y3, Y4, Y3
  4150  	VPADDD  224(SP), Y3, Y3
  4151  	VPXOR   Y14, Y3, Y14
  4152  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4153  	VPADDD  Y9, Y14, Y9
  4154  	VPXOR   Y4, Y9, Y4
  4155  	VPSRLD  $0x07, Y4, Y8
  4156  	VPSLLD  $0x19, Y4, Y4
  4157  	VPOR    Y4, Y8, Y4
  4158  
  4159  	// Round 7
  4160  	VPADDD  Y0, Y4, Y0
  4161  	VPADDD  352(SP), Y0, Y0
  4162  	VPXOR   Y12, Y0, Y12
  4163  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4164  	VMOVDQU 640(SP), Y8
  4165  	VPADDD  Y8, Y12, Y8
  4166  	VPXOR   Y4, Y8, Y4
  4167  	VMOVDQU Y8, 640(SP)
  4168  	VPSRLD  $0x0c, Y4, Y8
  4169  	VPSLLD  $0x14, Y4, Y4
  4170  	VPOR    Y4, Y8, Y4
  4171  	VPADDD  Y0, Y4, Y0
  4172  	VPADDD  480(SP), Y0, Y0
  4173  	VPXOR   Y12, Y0, Y12
  4174  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4175  	VMOVDQU 640(SP), Y8
  4176  	VPADDD  Y8, Y12, Y8
  4177  	VPXOR   Y4, Y8, Y4
  4178  	VMOVDQU Y8, 640(SP)
  4179  	VPSRLD  $0x07, Y4, Y8
  4180  	VPSLLD  $0x19, Y4, Y4
  4181  	VPOR    Y4, Y8, Y4
  4182  	VPADDD  Y1, Y5, Y1
  4183  	VPADDD  160(SP), Y1, Y1
  4184  	VPXOR   Y13, Y1, Y13
  4185  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4186  	VPADDD  Y9, Y13, Y9
  4187  	VPXOR   Y5, Y9, Y5
  4188  	VPSRLD  $0x0c, Y5, Y8
  4189  	VPSLLD  $0x14, Y5, Y5
  4190  	VPOR    Y5, Y8, Y5
  4191  	VPADDD  Y1, Y5, Y1
  4192  	VPADDD  (SP), Y1, Y1
  4193  	VPXOR   Y13, Y1, Y13
  4194  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4195  	VPADDD  Y9, Y13, Y9
  4196  	VPXOR   Y5, Y9, Y5
  4197  	VPSRLD  $0x07, Y5, Y8
  4198  	VPSLLD  $0x19, Y5, Y5
  4199  	VPOR    Y5, Y8, Y5
  4200  	VPADDD  Y2, Y6, Y2
  4201  	VPADDD  32(SP), Y2, Y2
  4202  	VPXOR   Y14, Y2, Y14
  4203  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4204  	VPADDD  Y10, Y14, Y10
  4205  	VPXOR   Y6, Y10, Y6
  4206  	VPSRLD  $0x0c, Y6, Y8
  4207  	VPSLLD  $0x14, Y6, Y6
  4208  	VPOR    Y6, Y8, Y6
  4209  	VPADDD  Y2, Y6, Y2
  4210  	VPADDD  288(SP), Y2, Y2
  4211  	VPXOR   Y14, Y2, Y14
  4212  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4213  	VPADDD  Y10, Y14, Y10
  4214  	VPXOR   Y6, Y10, Y6
  4215  	VPSRLD  $0x07, Y6, Y8
  4216  	VPSLLD  $0x19, Y6, Y6
  4217  	VPOR    Y6, Y8, Y6
  4218  	VPADDD  Y3, Y7, Y3
  4219  	VPADDD  256(SP), Y3, Y3
  4220  	VPXOR   Y15, Y3, Y15
  4221  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4222  	VPADDD  Y11, Y15, Y11
  4223  	VPXOR   Y7, Y11, Y7
  4224  	VPSRLD  $0x0c, Y7, Y8
  4225  	VPSLLD  $0x14, Y7, Y7
  4226  	VPOR    Y7, Y8, Y7
  4227  	VPADDD  Y3, Y7, Y3
  4228  	VPADDD  192(SP), Y3, Y3
  4229  	VPXOR   Y15, Y3, Y15
  4230  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4231  	VPADDD  Y11, Y15, Y11
  4232  	VPXOR   Y7, Y11, Y7
  4233  	VPSRLD  $0x07, Y7, Y8
  4234  	VPSLLD  $0x19, Y7, Y7
  4235  	VPOR    Y7, Y8, Y7
  4236  	VPADDD  Y0, Y5, Y0
  4237  	VPADDD  448(SP), Y0, Y0
  4238  	VPXOR   Y15, Y0, Y15
  4239  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4240  	VPADDD  Y10, Y15, Y10
  4241  	VPXOR   Y5, Y10, Y5
  4242  	VPSRLD  $0x0c, Y5, Y8
  4243  	VPSLLD  $0x14, Y5, Y5
  4244  	VPOR    Y5, Y8, Y5
  4245  	VPADDD  Y0, Y5, Y0
  4246  	VPADDD  320(SP), Y0, Y0
  4247  	VPXOR   Y15, Y0, Y15
  4248  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4249  	VPADDD  Y10, Y15, Y10
  4250  	VPXOR   Y5, Y10, Y5
  4251  	VPSRLD  $0x07, Y5, Y8
  4252  	VPSLLD  $0x19, Y5, Y5
  4253  	VPOR    Y5, Y8, Y5
  4254  	VPADDD  Y1, Y6, Y1
  4255  	VPADDD  64(SP), Y1, Y1
  4256  	VPXOR   Y12, Y1, Y12
  4257  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4258  	VPADDD  Y11, Y12, Y11
  4259  	VPXOR   Y6, Y11, Y6
  4260  	VPSRLD  $0x0c, Y6, Y8
  4261  	VPSLLD  $0x14, Y6, Y6
  4262  	VPOR    Y6, Y8, Y6
  4263  	VPADDD  Y1, Y6, Y1
  4264  	VPADDD  384(SP), Y1, Y1
  4265  	VPXOR   Y12, Y1, Y12
  4266  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4267  	VPADDD  Y11, Y12, Y11
  4268  	VPXOR   Y6, Y11, Y6
  4269  	VPSRLD  $0x07, Y6, Y8
  4270  	VPSLLD  $0x19, Y6, Y6
  4271  	VPOR    Y6, Y8, Y6
  4272  	VPADDD  Y2, Y7, Y2
  4273  	VPADDD  96(SP), Y2, Y2
  4274  	VPXOR   Y13, Y2, Y13
  4275  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4276  	VMOVDQU 640(SP), Y8
  4277  	VPADDD  Y8, Y13, Y8
  4278  	VPXOR   Y7, Y8, Y7
  4279  	VMOVDQU Y8, 640(SP)
  4280  	VPSRLD  $0x0c, Y7, Y8
  4281  	VPSLLD  $0x14, Y7, Y7
  4282  	VPOR    Y7, Y8, Y7
  4283  	VPADDD  Y2, Y7, Y2
  4284  	VPADDD  128(SP), Y2, Y2
  4285  	VPXOR   Y13, Y2, Y13
  4286  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4287  	VMOVDQU 640(SP), Y8
  4288  	VPADDD  Y8, Y13, Y8
  4289  	VPXOR   Y7, Y8, Y7
  4290  	VMOVDQU Y8, 640(SP)
  4291  	VPSRLD  $0x07, Y7, Y8
  4292  	VPSLLD  $0x19, Y7, Y7
  4293  	VPOR    Y7, Y8, Y7
  4294  	VPADDD  Y3, Y4, Y3
  4295  	VPADDD  224(SP), Y3, Y3
  4296  	VPXOR   Y14, Y3, Y14
  4297  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4298  	VPADDD  Y9, Y14, Y9
  4299  	VPXOR   Y4, Y9, Y4
  4300  	VPSRLD  $0x0c, Y4, Y8
  4301  	VPSLLD  $0x14, Y4, Y4
  4302  	VPOR    Y4, Y8, Y4
  4303  	VPADDD  Y3, Y4, Y3
  4304  	VPADDD  416(SP), Y3, Y3
  4305  	VPXOR   Y14, Y3, Y14
  4306  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4307  	VPADDD  Y9, Y14, Y9
  4308  	VPXOR   Y4, Y9, Y4
  4309  	VPSRLD  $0x07, Y4, Y8
  4310  	VPSLLD  $0x19, Y4, Y4
  4311  	VPOR    Y4, Y8, Y4
  4312  	VMOVDQU 640(SP), Y8
  4313  
  4314  	// Finalize CVs
  4315  	VPXOR Y0, Y8, Y0
  4316  	VPXOR Y1, Y9, Y1
  4317  	VPXOR Y2, Y10, Y2
  4318  	VPXOR Y3, Y11, Y3
  4319  	VPXOR Y4, Y12, Y4
  4320  	VPXOR Y5, Y13, Y5
  4321  	VPXOR Y6, Y14, Y6
  4322  	VPXOR Y7, Y15, Y7
  4323  
  4324  	// Loop
  4325  	INCQ DX
  4326  	CMPQ DX, $0x00000010
  4327  	JNE  loop
  4328  
  4329  	// Finished; transpose CVs
  4330  	VPUNPCKLDQ  Y1, Y0, Y8
  4331  	VPUNPCKHDQ  Y1, Y0, Y9
  4332  	VPUNPCKLDQ  Y3, Y2, Y10
  4333  	VPUNPCKHDQ  Y3, Y2, Y11
  4334  	VPUNPCKLDQ  Y5, Y4, Y12
  4335  	VPUNPCKHDQ  Y5, Y4, Y13
  4336  	VPUNPCKLDQ  Y7, Y6, Y14
  4337  	VPUNPCKHDQ  Y7, Y6, Y15
  4338  	VPUNPCKLQDQ Y10, Y8, Y0
  4339  	VPUNPCKHQDQ Y10, Y8, Y1
  4340  	VPUNPCKLQDQ Y11, Y9, Y2
  4341  	VPUNPCKHQDQ Y11, Y9, Y3
  4342  	VPUNPCKLQDQ Y14, Y12, Y4
  4343  	VPUNPCKHQDQ Y14, Y12, Y5
  4344  	VPUNPCKLQDQ Y15, Y13, Y6
  4345  	VPUNPCKHQDQ Y15, Y13, Y7
  4346  	VPERM2I128  $0x20, Y4, Y0, Y8
  4347  	VPERM2I128  $0x31, Y4, Y0, Y12
  4348  	VPERM2I128  $0x20, Y5, Y1, Y9
  4349  	VPERM2I128  $0x31, Y5, Y1, Y13
  4350  	VPERM2I128  $0x20, Y6, Y2, Y10
  4351  	VPERM2I128  $0x31, Y6, Y2, Y14
  4352  	VPERM2I128  $0x20, Y7, Y3, Y11
  4353  	VPERM2I128  $0x31, Y7, Y3, Y15
  4354  	VMOVDQU     Y8, (AX)
  4355  	VMOVDQU     Y9, 32(AX)
  4356  	VMOVDQU     Y10, 64(AX)
  4357  	VMOVDQU     Y11, 96(AX)
  4358  	VMOVDQU     Y12, 128(AX)
  4359  	VMOVDQU     Y13, 160(AX)
  4360  	VMOVDQU     Y14, 192(AX)
  4361  	VMOVDQU     Y15, 224(AX)
  4362  	RET
  4363  
  4364  // func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
  4365  // Requires: AVX, AVX2
  4366  TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-28
  4367  	MOVQ parents+0(FP), AX
  4368  	MOVQ cvs+8(FP), CX
  4369  	MOVQ key+16(FP), DX
  4370  
  4371  	// Load transposed block
  4372  	VMOVDQU    seq<>+0(SB), Y9
  4373  	VPSLLD     $0x06, Y9, Y9
  4374  	VPCMPEQD   Y8, Y8, Y8
  4375  	VPGATHERDD Y8, (CX)(Y9*1), Y10
  4376  	VMOVDQU    Y10, (SP)
  4377  	VPCMPEQD   Y8, Y8, Y8
  4378  	VPGATHERDD Y8, 4(CX)(Y9*1), Y10
  4379  	VMOVDQU    Y10, 32(SP)
  4380  	VPCMPEQD   Y8, Y8, Y8
  4381  	VPGATHERDD Y8, 8(CX)(Y9*1), Y10
  4382  	VMOVDQU    Y10, 64(SP)
  4383  	VPCMPEQD   Y8, Y8, Y8
  4384  	VPGATHERDD Y8, 12(CX)(Y9*1), Y10
  4385  	VMOVDQU    Y10, 96(SP)
  4386  	VPCMPEQD   Y8, Y8, Y8
  4387  	VPGATHERDD Y8, 16(CX)(Y9*1), Y10
  4388  	VMOVDQU    Y10, 128(SP)
  4389  	VPCMPEQD   Y8, Y8, Y8
  4390  	VPGATHERDD Y8, 20(CX)(Y9*1), Y10
  4391  	VMOVDQU    Y10, 160(SP)
  4392  	VPCMPEQD   Y8, Y8, Y8
  4393  	VPGATHERDD Y8, 24(CX)(Y9*1), Y10
  4394  	VMOVDQU    Y10, 192(SP)
  4395  	VPCMPEQD   Y8, Y8, Y8
  4396  	VPGATHERDD Y8, 28(CX)(Y9*1), Y10
  4397  	VMOVDQU    Y10, 224(SP)
  4398  	VPCMPEQD   Y8, Y8, Y8
  4399  	VPGATHERDD Y8, 32(CX)(Y9*1), Y10
  4400  	VMOVDQU    Y10, 256(SP)
  4401  	VPCMPEQD   Y8, Y8, Y8
  4402  	VPGATHERDD Y8, 36(CX)(Y9*1), Y10
  4403  	VMOVDQU    Y10, 288(SP)
  4404  	VPCMPEQD   Y8, Y8, Y8
  4405  	VPGATHERDD Y8, 40(CX)(Y9*1), Y10
  4406  	VMOVDQU    Y10, 320(SP)
  4407  	VPCMPEQD   Y8, Y8, Y8
  4408  	VPGATHERDD Y8, 44(CX)(Y9*1), Y10
  4409  	VMOVDQU    Y10, 352(SP)
  4410  	VPCMPEQD   Y8, Y8, Y8
  4411  	VPGATHERDD Y8, 48(CX)(Y9*1), Y10
  4412  	VMOVDQU    Y10, 384(SP)
  4413  	VPCMPEQD   Y8, Y8, Y8
  4414  	VPGATHERDD Y8, 52(CX)(Y9*1), Y10
  4415  	VMOVDQU    Y10, 416(SP)
  4416  	VPCMPEQD   Y8, Y8, Y8
  4417  	VPGATHERDD Y8, 56(CX)(Y9*1), Y10
  4418  	VMOVDQU    Y10, 448(SP)
  4419  	VPCMPEQD   Y8, Y8, Y8
  4420  	VPGATHERDD Y8, 60(CX)(Y9*1), Y10
  4421  	VMOVDQU    Y10, 480(SP)
  4422  
  4423  	// Initialize state vectors
  4424  	VPBROADCASTD (DX), Y0
  4425  	VPBROADCASTD 4(DX), Y1
  4426  	VPBROADCASTD 8(DX), Y2
  4427  	VPBROADCASTD 12(DX), Y3
  4428  	VPBROADCASTD 16(DX), Y4
  4429  	VPBROADCASTD 20(DX), Y5
  4430  	VPBROADCASTD 24(DX), Y6
  4431  	VPBROADCASTD 28(DX), Y7
  4432  	VPBROADCASTD iv<>+0(SB), Y8
  4433  	VPBROADCASTD iv<>+4(SB), Y9
  4434  	VPBROADCASTD iv<>+8(SB), Y10
  4435  	VPBROADCASTD iv<>+12(SB), Y11
  4436  	VPXOR        Y12, Y12, Y12
  4437  	VPXOR        Y13, Y13, Y13
  4438  	VPBROADCASTD seq<>+4(SB), Y14
  4439  	VPSLLD       $0x06, Y14, Y14
  4440  	ORL          $0x04, flags+24(FP)
  4441  	VPBROADCASTD flags+24(FP), Y15
  4442  	VMOVDQU      Y8, 512(SP)
  4443  
  4444  	// Round 1
  4445  	VPADDD  Y0, Y4, Y0
  4446  	VPADDD  (SP), Y0, Y0
  4447  	VPXOR   Y12, Y0, Y12
  4448  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4449  	VMOVDQU 512(SP), Y8
  4450  	VPADDD  Y8, Y12, Y8
  4451  	VPXOR   Y4, Y8, Y4
  4452  	VMOVDQU Y8, 512(SP)
  4453  	VPSRLD  $0x0c, Y4, Y8
  4454  	VPSLLD  $0x14, Y4, Y4
  4455  	VPOR    Y4, Y8, Y4
  4456  	VPADDD  Y0, Y4, Y0
  4457  	VPADDD  32(SP), Y0, Y0
  4458  	VPXOR   Y12, Y0, Y12
  4459  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4460  	VMOVDQU 512(SP), Y8
  4461  	VPADDD  Y8, Y12, Y8
  4462  	VPXOR   Y4, Y8, Y4
  4463  	VMOVDQU Y8, 512(SP)
  4464  	VPSRLD  $0x07, Y4, Y8
  4465  	VPSLLD  $0x19, Y4, Y4
  4466  	VPOR    Y4, Y8, Y4
  4467  	VPADDD  Y1, Y5, Y1
  4468  	VPADDD  64(SP), Y1, Y1
  4469  	VPXOR   Y13, Y1, Y13
  4470  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4471  	VPADDD  Y9, Y13, Y9
  4472  	VPXOR   Y5, Y9, Y5
  4473  	VPSRLD  $0x0c, Y5, Y8
  4474  	VPSLLD  $0x14, Y5, Y5
  4475  	VPOR    Y5, Y8, Y5
  4476  	VPADDD  Y1, Y5, Y1
  4477  	VPADDD  96(SP), Y1, Y1
  4478  	VPXOR   Y13, Y1, Y13
  4479  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4480  	VPADDD  Y9, Y13, Y9
  4481  	VPXOR   Y5, Y9, Y5
  4482  	VPSRLD  $0x07, Y5, Y8
  4483  	VPSLLD  $0x19, Y5, Y5
  4484  	VPOR    Y5, Y8, Y5
  4485  	VPADDD  Y2, Y6, Y2
  4486  	VPADDD  128(SP), Y2, Y2
  4487  	VPXOR   Y14, Y2, Y14
  4488  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4489  	VPADDD  Y10, Y14, Y10
  4490  	VPXOR   Y6, Y10, Y6
  4491  	VPSRLD  $0x0c, Y6, Y8
  4492  	VPSLLD  $0x14, Y6, Y6
  4493  	VPOR    Y6, Y8, Y6
  4494  	VPADDD  Y2, Y6, Y2
  4495  	VPADDD  160(SP), Y2, Y2
  4496  	VPXOR   Y14, Y2, Y14
  4497  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4498  	VPADDD  Y10, Y14, Y10
  4499  	VPXOR   Y6, Y10, Y6
  4500  	VPSRLD  $0x07, Y6, Y8
  4501  	VPSLLD  $0x19, Y6, Y6
  4502  	VPOR    Y6, Y8, Y6
  4503  	VPADDD  Y3, Y7, Y3
  4504  	VPADDD  192(SP), Y3, Y3
  4505  	VPXOR   Y15, Y3, Y15
  4506  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4507  	VPADDD  Y11, Y15, Y11
  4508  	VPXOR   Y7, Y11, Y7
  4509  	VPSRLD  $0x0c, Y7, Y8
  4510  	VPSLLD  $0x14, Y7, Y7
  4511  	VPOR    Y7, Y8, Y7
  4512  	VPADDD  Y3, Y7, Y3
  4513  	VPADDD  224(SP), Y3, Y3
  4514  	VPXOR   Y15, Y3, Y15
  4515  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4516  	VPADDD  Y11, Y15, Y11
  4517  	VPXOR   Y7, Y11, Y7
  4518  	VPSRLD  $0x07, Y7, Y8
  4519  	VPSLLD  $0x19, Y7, Y7
  4520  	VPOR    Y7, Y8, Y7
  4521  	VPADDD  Y0, Y5, Y0
  4522  	VPADDD  256(SP), Y0, Y0
  4523  	VPXOR   Y15, Y0, Y15
  4524  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4525  	VPADDD  Y10, Y15, Y10
  4526  	VPXOR   Y5, Y10, Y5
  4527  	VPSRLD  $0x0c, Y5, Y8
  4528  	VPSLLD  $0x14, Y5, Y5
  4529  	VPOR    Y5, Y8, Y5
  4530  	VPADDD  Y0, Y5, Y0
  4531  	VPADDD  288(SP), Y0, Y0
  4532  	VPXOR   Y15, Y0, Y15
  4533  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4534  	VPADDD  Y10, Y15, Y10
  4535  	VPXOR   Y5, Y10, Y5
  4536  	VPSRLD  $0x07, Y5, Y8
  4537  	VPSLLD  $0x19, Y5, Y5
  4538  	VPOR    Y5, Y8, Y5
  4539  	VPADDD  Y1, Y6, Y1
  4540  	VPADDD  320(SP), Y1, Y1
  4541  	VPXOR   Y12, Y1, Y12
  4542  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4543  	VPADDD  Y11, Y12, Y11
  4544  	VPXOR   Y6, Y11, Y6
  4545  	VPSRLD  $0x0c, Y6, Y8
  4546  	VPSLLD  $0x14, Y6, Y6
  4547  	VPOR    Y6, Y8, Y6
  4548  	VPADDD  Y1, Y6, Y1
  4549  	VPADDD  352(SP), Y1, Y1
  4550  	VPXOR   Y12, Y1, Y12
  4551  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4552  	VPADDD  Y11, Y12, Y11
  4553  	VPXOR   Y6, Y11, Y6
  4554  	VPSRLD  $0x07, Y6, Y8
  4555  	VPSLLD  $0x19, Y6, Y6
  4556  	VPOR    Y6, Y8, Y6
  4557  	VPADDD  Y2, Y7, Y2
  4558  	VPADDD  384(SP), Y2, Y2
  4559  	VPXOR   Y13, Y2, Y13
  4560  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4561  	VMOVDQU 512(SP), Y8
  4562  	VPADDD  Y8, Y13, Y8
  4563  	VPXOR   Y7, Y8, Y7
  4564  	VMOVDQU Y8, 512(SP)
  4565  	VPSRLD  $0x0c, Y7, Y8
  4566  	VPSLLD  $0x14, Y7, Y7
  4567  	VPOR    Y7, Y8, Y7
  4568  	VPADDD  Y2, Y7, Y2
  4569  	VPADDD  416(SP), Y2, Y2
  4570  	VPXOR   Y13, Y2, Y13
  4571  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4572  	VMOVDQU 512(SP), Y8
  4573  	VPADDD  Y8, Y13, Y8
  4574  	VPXOR   Y7, Y8, Y7
  4575  	VMOVDQU Y8, 512(SP)
  4576  	VPSRLD  $0x07, Y7, Y8
  4577  	VPSLLD  $0x19, Y7, Y7
  4578  	VPOR    Y7, Y8, Y7
  4579  	VPADDD  Y3, Y4, Y3
  4580  	VPADDD  448(SP), Y3, Y3
  4581  	VPXOR   Y14, Y3, Y14
  4582  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4583  	VPADDD  Y9, Y14, Y9
  4584  	VPXOR   Y4, Y9, Y4
  4585  	VPSRLD  $0x0c, Y4, Y8
  4586  	VPSLLD  $0x14, Y4, Y4
  4587  	VPOR    Y4, Y8, Y4
  4588  	VPADDD  Y3, Y4, Y3
  4589  	VPADDD  480(SP), Y3, Y3
  4590  	VPXOR   Y14, Y3, Y14
  4591  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4592  	VPADDD  Y9, Y14, Y9
  4593  	VPXOR   Y4, Y9, Y4
  4594  	VPSRLD  $0x07, Y4, Y8
  4595  	VPSLLD  $0x19, Y4, Y4
  4596  	VPOR    Y4, Y8, Y4
  4597  
  4598  	// Round 2
  4599  	VPADDD  Y0, Y4, Y0
  4600  	VPADDD  64(SP), Y0, Y0
  4601  	VPXOR   Y12, Y0, Y12
  4602  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4603  	VMOVDQU 512(SP), Y8
  4604  	VPADDD  Y8, Y12, Y8
  4605  	VPXOR   Y4, Y8, Y4
  4606  	VMOVDQU Y8, 512(SP)
  4607  	VPSRLD  $0x0c, Y4, Y8
  4608  	VPSLLD  $0x14, Y4, Y4
  4609  	VPOR    Y4, Y8, Y4
  4610  	VPADDD  Y0, Y4, Y0
  4611  	VPADDD  192(SP), Y0, Y0
  4612  	VPXOR   Y12, Y0, Y12
  4613  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4614  	VMOVDQU 512(SP), Y8
  4615  	VPADDD  Y8, Y12, Y8
  4616  	VPXOR   Y4, Y8, Y4
  4617  	VMOVDQU Y8, 512(SP)
  4618  	VPSRLD  $0x07, Y4, Y8
  4619  	VPSLLD  $0x19, Y4, Y4
  4620  	VPOR    Y4, Y8, Y4
  4621  	VPADDD  Y1, Y5, Y1
  4622  	VPADDD  96(SP), Y1, Y1
  4623  	VPXOR   Y13, Y1, Y13
  4624  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4625  	VPADDD  Y9, Y13, Y9
  4626  	VPXOR   Y5, Y9, Y5
  4627  	VPSRLD  $0x0c, Y5, Y8
  4628  	VPSLLD  $0x14, Y5, Y5
  4629  	VPOR    Y5, Y8, Y5
  4630  	VPADDD  Y1, Y5, Y1
  4631  	VPADDD  320(SP), Y1, Y1
  4632  	VPXOR   Y13, Y1, Y13
  4633  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4634  	VPADDD  Y9, Y13, Y9
  4635  	VPXOR   Y5, Y9, Y5
  4636  	VPSRLD  $0x07, Y5, Y8
  4637  	VPSLLD  $0x19, Y5, Y5
  4638  	VPOR    Y5, Y8, Y5
  4639  	VPADDD  Y2, Y6, Y2
  4640  	VPADDD  224(SP), Y2, Y2
  4641  	VPXOR   Y14, Y2, Y14
  4642  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4643  	VPADDD  Y10, Y14, Y10
  4644  	VPXOR   Y6, Y10, Y6
  4645  	VPSRLD  $0x0c, Y6, Y8
  4646  	VPSLLD  $0x14, Y6, Y6
  4647  	VPOR    Y6, Y8, Y6
  4648  	VPADDD  Y2, Y6, Y2
  4649  	VPADDD  (SP), Y2, Y2
  4650  	VPXOR   Y14, Y2, Y14
  4651  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4652  	VPADDD  Y10, Y14, Y10
  4653  	VPXOR   Y6, Y10, Y6
  4654  	VPSRLD  $0x07, Y6, Y8
  4655  	VPSLLD  $0x19, Y6, Y6
  4656  	VPOR    Y6, Y8, Y6
  4657  	VPADDD  Y3, Y7, Y3
  4658  	VPADDD  128(SP), Y3, Y3
  4659  	VPXOR   Y15, Y3, Y15
  4660  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4661  	VPADDD  Y11, Y15, Y11
  4662  	VPXOR   Y7, Y11, Y7
  4663  	VPSRLD  $0x0c, Y7, Y8
  4664  	VPSLLD  $0x14, Y7, Y7
  4665  	VPOR    Y7, Y8, Y7
  4666  	VPADDD  Y3, Y7, Y3
  4667  	VPADDD  416(SP), Y3, Y3
  4668  	VPXOR   Y15, Y3, Y15
  4669  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4670  	VPADDD  Y11, Y15, Y11
  4671  	VPXOR   Y7, Y11, Y7
  4672  	VPSRLD  $0x07, Y7, Y8
  4673  	VPSLLD  $0x19, Y7, Y7
  4674  	VPOR    Y7, Y8, Y7
  4675  	VPADDD  Y0, Y5, Y0
  4676  	VPADDD  32(SP), Y0, Y0
  4677  	VPXOR   Y15, Y0, Y15
  4678  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4679  	VPADDD  Y10, Y15, Y10
  4680  	VPXOR   Y5, Y10, Y5
  4681  	VPSRLD  $0x0c, Y5, Y8
  4682  	VPSLLD  $0x14, Y5, Y5
  4683  	VPOR    Y5, Y8, Y5
  4684  	VPADDD  Y0, Y5, Y0
  4685  	VPADDD  352(SP), Y0, Y0
  4686  	VPXOR   Y15, Y0, Y15
  4687  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4688  	VPADDD  Y10, Y15, Y10
  4689  	VPXOR   Y5, Y10, Y5
  4690  	VPSRLD  $0x07, Y5, Y8
  4691  	VPSLLD  $0x19, Y5, Y5
  4692  	VPOR    Y5, Y8, Y5
  4693  	VPADDD  Y1, Y6, Y1
  4694  	VPADDD  384(SP), Y1, Y1
  4695  	VPXOR   Y12, Y1, Y12
  4696  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4697  	VPADDD  Y11, Y12, Y11
  4698  	VPXOR   Y6, Y11, Y6
  4699  	VPSRLD  $0x0c, Y6, Y8
  4700  	VPSLLD  $0x14, Y6, Y6
  4701  	VPOR    Y6, Y8, Y6
  4702  	VPADDD  Y1, Y6, Y1
  4703  	VPADDD  160(SP), Y1, Y1
  4704  	VPXOR   Y12, Y1, Y12
  4705  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4706  	VPADDD  Y11, Y12, Y11
  4707  	VPXOR   Y6, Y11, Y6
  4708  	VPSRLD  $0x07, Y6, Y8
  4709  	VPSLLD  $0x19, Y6, Y6
  4710  	VPOR    Y6, Y8, Y6
  4711  	VPADDD  Y2, Y7, Y2
  4712  	VPADDD  288(SP), Y2, Y2
  4713  	VPXOR   Y13, Y2, Y13
  4714  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4715  	VMOVDQU 512(SP), Y8
  4716  	VPADDD  Y8, Y13, Y8
  4717  	VPXOR   Y7, Y8, Y7
  4718  	VMOVDQU Y8, 512(SP)
  4719  	VPSRLD  $0x0c, Y7, Y8
  4720  	VPSLLD  $0x14, Y7, Y7
  4721  	VPOR    Y7, Y8, Y7
  4722  	VPADDD  Y2, Y7, Y2
  4723  	VPADDD  448(SP), Y2, Y2
  4724  	VPXOR   Y13, Y2, Y13
  4725  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4726  	VMOVDQU 512(SP), Y8
  4727  	VPADDD  Y8, Y13, Y8
  4728  	VPXOR   Y7, Y8, Y7
  4729  	VMOVDQU Y8, 512(SP)
  4730  	VPSRLD  $0x07, Y7, Y8
  4731  	VPSLLD  $0x19, Y7, Y7
  4732  	VPOR    Y7, Y8, Y7
  4733  	VPADDD  Y3, Y4, Y3
  4734  	VPADDD  480(SP), Y3, Y3
  4735  	VPXOR   Y14, Y3, Y14
  4736  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4737  	VPADDD  Y9, Y14, Y9
  4738  	VPXOR   Y4, Y9, Y4
  4739  	VPSRLD  $0x0c, Y4, Y8
  4740  	VPSLLD  $0x14, Y4, Y4
  4741  	VPOR    Y4, Y8, Y4
  4742  	VPADDD  Y3, Y4, Y3
  4743  	VPADDD  256(SP), Y3, Y3
  4744  	VPXOR   Y14, Y3, Y14
  4745  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4746  	VPADDD  Y9, Y14, Y9
  4747  	VPXOR   Y4, Y9, Y4
  4748  	VPSRLD  $0x07, Y4, Y8
  4749  	VPSLLD  $0x19, Y4, Y4
  4750  	VPOR    Y4, Y8, Y4
  4751  
  4752  	// Round 3
  4753  	VPADDD  Y0, Y4, Y0
  4754  	VPADDD  96(SP), Y0, Y0
  4755  	VPXOR   Y12, Y0, Y12
  4756  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4757  	VMOVDQU 512(SP), Y8
  4758  	VPADDD  Y8, Y12, Y8
  4759  	VPXOR   Y4, Y8, Y4
  4760  	VMOVDQU Y8, 512(SP)
  4761  	VPSRLD  $0x0c, Y4, Y8
  4762  	VPSLLD  $0x14, Y4, Y4
  4763  	VPOR    Y4, Y8, Y4
  4764  	VPADDD  Y0, Y4, Y0
  4765  	VPADDD  128(SP), Y0, Y0
  4766  	VPXOR   Y12, Y0, Y12
  4767  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4768  	VMOVDQU 512(SP), Y8
  4769  	VPADDD  Y8, Y12, Y8
  4770  	VPXOR   Y4, Y8, Y4
  4771  	VMOVDQU Y8, 512(SP)
  4772  	VPSRLD  $0x07, Y4, Y8
  4773  	VPSLLD  $0x19, Y4, Y4
  4774  	VPOR    Y4, Y8, Y4
  4775  	VPADDD  Y1, Y5, Y1
  4776  	VPADDD  320(SP), Y1, Y1
  4777  	VPXOR   Y13, Y1, Y13
  4778  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4779  	VPADDD  Y9, Y13, Y9
  4780  	VPXOR   Y5, Y9, Y5
  4781  	VPSRLD  $0x0c, Y5, Y8
  4782  	VPSLLD  $0x14, Y5, Y5
  4783  	VPOR    Y5, Y8, Y5
  4784  	VPADDD  Y1, Y5, Y1
  4785  	VPADDD  384(SP), Y1, Y1
  4786  	VPXOR   Y13, Y1, Y13
  4787  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4788  	VPADDD  Y9, Y13, Y9
  4789  	VPXOR   Y5, Y9, Y5
  4790  	VPSRLD  $0x07, Y5, Y8
  4791  	VPSLLD  $0x19, Y5, Y5
  4792  	VPOR    Y5, Y8, Y5
  4793  	VPADDD  Y2, Y6, Y2
  4794  	VPADDD  416(SP), Y2, Y2
  4795  	VPXOR   Y14, Y2, Y14
  4796  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4797  	VPADDD  Y10, Y14, Y10
  4798  	VPXOR   Y6, Y10, Y6
  4799  	VPSRLD  $0x0c, Y6, Y8
  4800  	VPSLLD  $0x14, Y6, Y6
  4801  	VPOR    Y6, Y8, Y6
  4802  	VPADDD  Y2, Y6, Y2
  4803  	VPADDD  64(SP), Y2, Y2
  4804  	VPXOR   Y14, Y2, Y14
  4805  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4806  	VPADDD  Y10, Y14, Y10
  4807  	VPXOR   Y6, Y10, Y6
  4808  	VPSRLD  $0x07, Y6, Y8
  4809  	VPSLLD  $0x19, Y6, Y6
  4810  	VPOR    Y6, Y8, Y6
  4811  	VPADDD  Y3, Y7, Y3
  4812  	VPADDD  224(SP), Y3, Y3
  4813  	VPXOR   Y15, Y3, Y15
  4814  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4815  	VPADDD  Y11, Y15, Y11
  4816  	VPXOR   Y7, Y11, Y7
  4817  	VPSRLD  $0x0c, Y7, Y8
  4818  	VPSLLD  $0x14, Y7, Y7
  4819  	VPOR    Y7, Y8, Y7
  4820  	VPADDD  Y3, Y7, Y3
  4821  	VPADDD  448(SP), Y3, Y3
  4822  	VPXOR   Y15, Y3, Y15
  4823  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4824  	VPADDD  Y11, Y15, Y11
  4825  	VPXOR   Y7, Y11, Y7
  4826  	VPSRLD  $0x07, Y7, Y8
  4827  	VPSLLD  $0x19, Y7, Y7
  4828  	VPOR    Y7, Y8, Y7
  4829  	VPADDD  Y0, Y5, Y0
  4830  	VPADDD  192(SP), Y0, Y0
  4831  	VPXOR   Y15, Y0, Y15
  4832  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4833  	VPADDD  Y10, Y15, Y10
  4834  	VPXOR   Y5, Y10, Y5
  4835  	VPSRLD  $0x0c, Y5, Y8
  4836  	VPSLLD  $0x14, Y5, Y5
  4837  	VPOR    Y5, Y8, Y5
  4838  	VPADDD  Y0, Y5, Y0
  4839  	VPADDD  160(SP), Y0, Y0
  4840  	VPXOR   Y15, Y0, Y15
  4841  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4842  	VPADDD  Y10, Y15, Y10
  4843  	VPXOR   Y5, Y10, Y5
  4844  	VPSRLD  $0x07, Y5, Y8
  4845  	VPSLLD  $0x19, Y5, Y5
  4846  	VPOR    Y5, Y8, Y5
  4847  	VPADDD  Y1, Y6, Y1
  4848  	VPADDD  288(SP), Y1, Y1
  4849  	VPXOR   Y12, Y1, Y12
  4850  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4851  	VPADDD  Y11, Y12, Y11
  4852  	VPXOR   Y6, Y11, Y6
  4853  	VPSRLD  $0x0c, Y6, Y8
  4854  	VPSLLD  $0x14, Y6, Y6
  4855  	VPOR    Y6, Y8, Y6
  4856  	VPADDD  Y1, Y6, Y1
  4857  	VPADDD  (SP), Y1, Y1
  4858  	VPXOR   Y12, Y1, Y12
  4859  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4860  	VPADDD  Y11, Y12, Y11
  4861  	VPXOR   Y6, Y11, Y6
  4862  	VPSRLD  $0x07, Y6, Y8
  4863  	VPSLLD  $0x19, Y6, Y6
  4864  	VPOR    Y6, Y8, Y6
  4865  	VPADDD  Y2, Y7, Y2
  4866  	VPADDD  352(SP), Y2, Y2
  4867  	VPXOR   Y13, Y2, Y13
  4868  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4869  	VMOVDQU 512(SP), Y8
  4870  	VPADDD  Y8, Y13, Y8
  4871  	VPXOR   Y7, Y8, Y7
  4872  	VMOVDQU Y8, 512(SP)
  4873  	VPSRLD  $0x0c, Y7, Y8
  4874  	VPSLLD  $0x14, Y7, Y7
  4875  	VPOR    Y7, Y8, Y7
  4876  	VPADDD  Y2, Y7, Y2
  4877  	VPADDD  480(SP), Y2, Y2
  4878  	VPXOR   Y13, Y2, Y13
  4879  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4880  	VMOVDQU 512(SP), Y8
  4881  	VPADDD  Y8, Y13, Y8
  4882  	VPXOR   Y7, Y8, Y7
  4883  	VMOVDQU Y8, 512(SP)
  4884  	VPSRLD  $0x07, Y7, Y8
  4885  	VPSLLD  $0x19, Y7, Y7
  4886  	VPOR    Y7, Y8, Y7
  4887  	VPADDD  Y3, Y4, Y3
  4888  	VPADDD  256(SP), Y3, Y3
  4889  	VPXOR   Y14, Y3, Y14
  4890  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4891  	VPADDD  Y9, Y14, Y9
  4892  	VPXOR   Y4, Y9, Y4
  4893  	VPSRLD  $0x0c, Y4, Y8
  4894  	VPSLLD  $0x14, Y4, Y4
  4895  	VPOR    Y4, Y8, Y4
  4896  	VPADDD  Y3, Y4, Y3
  4897  	VPADDD  32(SP), Y3, Y3
  4898  	VPXOR   Y14, Y3, Y14
  4899  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4900  	VPADDD  Y9, Y14, Y9
  4901  	VPXOR   Y4, Y9, Y4
  4902  	VPSRLD  $0x07, Y4, Y8
  4903  	VPSLLD  $0x19, Y4, Y4
  4904  	VPOR    Y4, Y8, Y4
  4905  
  4906  	// Round 4
  4907  	VPADDD  Y0, Y4, Y0
  4908  	VPADDD  320(SP), Y0, Y0
  4909  	VPXOR   Y12, Y0, Y12
  4910  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4911  	VMOVDQU 512(SP), Y8
  4912  	VPADDD  Y8, Y12, Y8
  4913  	VPXOR   Y4, Y8, Y4
  4914  	VMOVDQU Y8, 512(SP)
  4915  	VPSRLD  $0x0c, Y4, Y8
  4916  	VPSLLD  $0x14, Y4, Y4
  4917  	VPOR    Y4, Y8, Y4
  4918  	VPADDD  Y0, Y4, Y0
  4919  	VPADDD  224(SP), Y0, Y0
  4920  	VPXOR   Y12, Y0, Y12
  4921  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4922  	VMOVDQU 512(SP), Y8
  4923  	VPADDD  Y8, Y12, Y8
  4924  	VPXOR   Y4, Y8, Y4
  4925  	VMOVDQU Y8, 512(SP)
  4926  	VPSRLD  $0x07, Y4, Y8
  4927  	VPSLLD  $0x19, Y4, Y4
  4928  	VPOR    Y4, Y8, Y4
  4929  	VPADDD  Y1, Y5, Y1
  4930  	VPADDD  384(SP), Y1, Y1
  4931  	VPXOR   Y13, Y1, Y13
  4932  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4933  	VPADDD  Y9, Y13, Y9
  4934  	VPXOR   Y5, Y9, Y5
  4935  	VPSRLD  $0x0c, Y5, Y8
  4936  	VPSLLD  $0x14, Y5, Y5
  4937  	VPOR    Y5, Y8, Y5
  4938  	VPADDD  Y1, Y5, Y1
  4939  	VPADDD  288(SP), Y1, Y1
  4940  	VPXOR   Y13, Y1, Y13
  4941  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4942  	VPADDD  Y9, Y13, Y9
  4943  	VPXOR   Y5, Y9, Y5
  4944  	VPSRLD  $0x07, Y5, Y8
  4945  	VPSLLD  $0x19, Y5, Y5
  4946  	VPOR    Y5, Y8, Y5
  4947  	VPADDD  Y2, Y6, Y2
  4948  	VPADDD  448(SP), Y2, Y2
  4949  	VPXOR   Y14, Y2, Y14
  4950  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4951  	VPADDD  Y10, Y14, Y10
  4952  	VPXOR   Y6, Y10, Y6
  4953  	VPSRLD  $0x0c, Y6, Y8
  4954  	VPSLLD  $0x14, Y6, Y6
  4955  	VPOR    Y6, Y8, Y6
  4956  	VPADDD  Y2, Y6, Y2
  4957  	VPADDD  96(SP), Y2, Y2
  4958  	VPXOR   Y14, Y2, Y14
  4959  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4960  	VPADDD  Y10, Y14, Y10
  4961  	VPXOR   Y6, Y10, Y6
  4962  	VPSRLD  $0x07, Y6, Y8
  4963  	VPSLLD  $0x19, Y6, Y6
  4964  	VPOR    Y6, Y8, Y6
  4965  	VPADDD  Y3, Y7, Y3
  4966  	VPADDD  416(SP), Y3, Y3
  4967  	VPXOR   Y15, Y3, Y15
  4968  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4969  	VPADDD  Y11, Y15, Y11
  4970  	VPXOR   Y7, Y11, Y7
  4971  	VPSRLD  $0x0c, Y7, Y8
  4972  	VPSLLD  $0x14, Y7, Y7
  4973  	VPOR    Y7, Y8, Y7
  4974  	VPADDD  Y3, Y7, Y3
  4975  	VPADDD  480(SP), Y3, Y3
  4976  	VPXOR   Y15, Y3, Y15
  4977  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4978  	VPADDD  Y11, Y15, Y11
  4979  	VPXOR   Y7, Y11, Y7
  4980  	VPSRLD  $0x07, Y7, Y8
  4981  	VPSLLD  $0x19, Y7, Y7
  4982  	VPOR    Y7, Y8, Y7
  4983  	VPADDD  Y0, Y5, Y0
  4984  	VPADDD  128(SP), Y0, Y0
  4985  	VPXOR   Y15, Y0, Y15
  4986  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4987  	VPADDD  Y10, Y15, Y10
  4988  	VPXOR   Y5, Y10, Y5
  4989  	VPSRLD  $0x0c, Y5, Y8
  4990  	VPSLLD  $0x14, Y5, Y5
  4991  	VPOR    Y5, Y8, Y5
  4992  	VPADDD  Y0, Y5, Y0
  4993  	VPADDD  (SP), Y0, Y0
  4994  	VPXOR   Y15, Y0, Y15
  4995  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4996  	VPADDD  Y10, Y15, Y10
  4997  	VPXOR   Y5, Y10, Y5
  4998  	VPSRLD  $0x07, Y5, Y8
  4999  	VPSLLD  $0x19, Y5, Y5
  5000  	VPOR    Y5, Y8, Y5
  5001  	VPADDD  Y1, Y6, Y1
  5002  	VPADDD  352(SP), Y1, Y1
  5003  	VPXOR   Y12, Y1, Y12
  5004  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5005  	VPADDD  Y11, Y12, Y11
  5006  	VPXOR   Y6, Y11, Y6
  5007  	VPSRLD  $0x0c, Y6, Y8
  5008  	VPSLLD  $0x14, Y6, Y6
  5009  	VPOR    Y6, Y8, Y6
  5010  	VPADDD  Y1, Y6, Y1
  5011  	VPADDD  64(SP), Y1, Y1
  5012  	VPXOR   Y12, Y1, Y12
  5013  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5014  	VPADDD  Y11, Y12, Y11
  5015  	VPXOR   Y6, Y11, Y6
  5016  	VPSRLD  $0x07, Y6, Y8
  5017  	VPSLLD  $0x19, Y6, Y6
  5018  	VPOR    Y6, Y8, Y6
  5019  	VPADDD  Y2, Y7, Y2
  5020  	VPADDD  160(SP), Y2, Y2
  5021  	VPXOR   Y13, Y2, Y13
  5022  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5023  	VMOVDQU 512(SP), Y8
  5024  	VPADDD  Y8, Y13, Y8
  5025  	VPXOR   Y7, Y8, Y7
  5026  	VMOVDQU Y8, 512(SP)
  5027  	VPSRLD  $0x0c, Y7, Y8
  5028  	VPSLLD  $0x14, Y7, Y7
  5029  	VPOR    Y7, Y8, Y7
  5030  	VPADDD  Y2, Y7, Y2
  5031  	VPADDD  256(SP), Y2, Y2
  5032  	VPXOR   Y13, Y2, Y13
  5033  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5034  	VMOVDQU 512(SP), Y8
  5035  	VPADDD  Y8, Y13, Y8
  5036  	VPXOR   Y7, Y8, Y7
  5037  	VMOVDQU Y8, 512(SP)
  5038  	VPSRLD  $0x07, Y7, Y8
  5039  	VPSLLD  $0x19, Y7, Y7
  5040  	VPOR    Y7, Y8, Y7
  5041  	VPADDD  Y3, Y4, Y3
  5042  	VPADDD  32(SP), Y3, Y3
  5043  	VPXOR   Y14, Y3, Y14
  5044  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5045  	VPADDD  Y9, Y14, Y9
  5046  	VPXOR   Y4, Y9, Y4
  5047  	VPSRLD  $0x0c, Y4, Y8
  5048  	VPSLLD  $0x14, Y4, Y4
  5049  	VPOR    Y4, Y8, Y4
  5050  	VPADDD  Y3, Y4, Y3
  5051  	VPADDD  192(SP), Y3, Y3
  5052  	VPXOR   Y14, Y3, Y14
  5053  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5054  	VPADDD  Y9, Y14, Y9
  5055  	VPXOR   Y4, Y9, Y4
  5056  	VPSRLD  $0x07, Y4, Y8
  5057  	VPSLLD  $0x19, Y4, Y4
  5058  	VPOR    Y4, Y8, Y4
  5059  
  5060  	// Round 5
  5061  	VPADDD  Y0, Y4, Y0
  5062  	VPADDD  384(SP), Y0, Y0
  5063  	VPXOR   Y12, Y0, Y12
  5064  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5065  	VMOVDQU 512(SP), Y8
  5066  	VPADDD  Y8, Y12, Y8
  5067  	VPXOR   Y4, Y8, Y4
  5068  	VMOVDQU Y8, 512(SP)
  5069  	VPSRLD  $0x0c, Y4, Y8
  5070  	VPSLLD  $0x14, Y4, Y4
  5071  	VPOR    Y4, Y8, Y4
  5072  	VPADDD  Y0, Y4, Y0
  5073  	VPADDD  416(SP), Y0, Y0
  5074  	VPXOR   Y12, Y0, Y12
  5075  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5076  	VMOVDQU 512(SP), Y8
  5077  	VPADDD  Y8, Y12, Y8
  5078  	VPXOR   Y4, Y8, Y4
  5079  	VMOVDQU Y8, 512(SP)
  5080  	VPSRLD  $0x07, Y4, Y8
  5081  	VPSLLD  $0x19, Y4, Y4
  5082  	VPOR    Y4, Y8, Y4
  5083  	VPADDD  Y1, Y5, Y1
  5084  	VPADDD  288(SP), Y1, Y1
  5085  	VPXOR   Y13, Y1, Y13
  5086  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5087  	VPADDD  Y9, Y13, Y9
  5088  	VPXOR   Y5, Y9, Y5
  5089  	VPSRLD  $0x0c, Y5, Y8
  5090  	VPSLLD  $0x14, Y5, Y5
  5091  	VPOR    Y5, Y8, Y5
  5092  	VPADDD  Y1, Y5, Y1
  5093  	VPADDD  352(SP), Y1, Y1
  5094  	VPXOR   Y13, Y1, Y13
  5095  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5096  	VPADDD  Y9, Y13, Y9
  5097  	VPXOR   Y5, Y9, Y5
  5098  	VPSRLD  $0x07, Y5, Y8
  5099  	VPSLLD  $0x19, Y5, Y5
  5100  	VPOR    Y5, Y8, Y5
  5101  	VPADDD  Y2, Y6, Y2
  5102  	VPADDD  480(SP), Y2, Y2
  5103  	VPXOR   Y14, Y2, Y14
  5104  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5105  	VPADDD  Y10, Y14, Y10
  5106  	VPXOR   Y6, Y10, Y6
  5107  	VPSRLD  $0x0c, Y6, Y8
  5108  	VPSLLD  $0x14, Y6, Y6
  5109  	VPOR    Y6, Y8, Y6
  5110  	VPADDD  Y2, Y6, Y2
  5111  	VPADDD  320(SP), Y2, Y2
  5112  	VPXOR   Y14, Y2, Y14
  5113  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5114  	VPADDD  Y10, Y14, Y10
  5115  	VPXOR   Y6, Y10, Y6
  5116  	VPSRLD  $0x07, Y6, Y8
  5117  	VPSLLD  $0x19, Y6, Y6
  5118  	VPOR    Y6, Y8, Y6
  5119  	VPADDD  Y3, Y7, Y3
  5120  	VPADDD  448(SP), Y3, Y3
  5121  	VPXOR   Y15, Y3, Y15
  5122  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5123  	VPADDD  Y11, Y15, Y11
  5124  	VPXOR   Y7, Y11, Y7
  5125  	VPSRLD  $0x0c, Y7, Y8
  5126  	VPSLLD  $0x14, Y7, Y7
  5127  	VPOR    Y7, Y8, Y7
  5128  	VPADDD  Y3, Y7, Y3
  5129  	VPADDD  256(SP), Y3, Y3
  5130  	VPXOR   Y15, Y3, Y15
  5131  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5132  	VPADDD  Y11, Y15, Y11
  5133  	VPXOR   Y7, Y11, Y7
  5134  	VPSRLD  $0x07, Y7, Y8
  5135  	VPSLLD  $0x19, Y7, Y7
  5136  	VPOR    Y7, Y8, Y7
  5137  	VPADDD  Y0, Y5, Y0
  5138  	VPADDD  224(SP), Y0, Y0
  5139  	VPXOR   Y15, Y0, Y15
  5140  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5141  	VPADDD  Y10, Y15, Y10
  5142  	VPXOR   Y5, Y10, Y5
  5143  	VPSRLD  $0x0c, Y5, Y8
  5144  	VPSLLD  $0x14, Y5, Y5
  5145  	VPOR    Y5, Y8, Y5
  5146  	VPADDD  Y0, Y5, Y0
  5147  	VPADDD  64(SP), Y0, Y0
  5148  	VPXOR   Y15, Y0, Y15
  5149  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5150  	VPADDD  Y10, Y15, Y10
  5151  	VPXOR   Y5, Y10, Y5
  5152  	VPSRLD  $0x07, Y5, Y8
  5153  	VPSLLD  $0x19, Y5, Y5
  5154  	VPOR    Y5, Y8, Y5
  5155  	VPADDD  Y1, Y6, Y1
  5156  	VPADDD  160(SP), Y1, Y1
  5157  	VPXOR   Y12, Y1, Y12
  5158  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5159  	VPADDD  Y11, Y12, Y11
  5160  	VPXOR   Y6, Y11, Y6
  5161  	VPSRLD  $0x0c, Y6, Y8
  5162  	VPSLLD  $0x14, Y6, Y6
  5163  	VPOR    Y6, Y8, Y6
  5164  	VPADDD  Y1, Y6, Y1
  5165  	VPADDD  96(SP), Y1, Y1
  5166  	VPXOR   Y12, Y1, Y12
  5167  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5168  	VPADDD  Y11, Y12, Y11
  5169  	VPXOR   Y6, Y11, Y6
  5170  	VPSRLD  $0x07, Y6, Y8
  5171  	VPSLLD  $0x19, Y6, Y6
  5172  	VPOR    Y6, Y8, Y6
  5173  	VPADDD  Y2, Y7, Y2
  5174  	VPADDD  (SP), Y2, Y2
  5175  	VPXOR   Y13, Y2, Y13
  5176  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5177  	VMOVDQU 512(SP), Y8
  5178  	VPADDD  Y8, Y13, Y8
  5179  	VPXOR   Y7, Y8, Y7
  5180  	VMOVDQU Y8, 512(SP)
  5181  	VPSRLD  $0x0c, Y7, Y8
  5182  	VPSLLD  $0x14, Y7, Y7
  5183  	VPOR    Y7, Y8, Y7
  5184  	VPADDD  Y2, Y7, Y2
  5185  	VPADDD  32(SP), Y2, Y2
  5186  	VPXOR   Y13, Y2, Y13
  5187  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5188  	VMOVDQU 512(SP), Y8
  5189  	VPADDD  Y8, Y13, Y8
  5190  	VPXOR   Y7, Y8, Y7
  5191  	VMOVDQU Y8, 512(SP)
  5192  	VPSRLD  $0x07, Y7, Y8
  5193  	VPSLLD  $0x19, Y7, Y7
  5194  	VPOR    Y7, Y8, Y7
  5195  	VPADDD  Y3, Y4, Y3
  5196  	VPADDD  192(SP), Y3, Y3
  5197  	VPXOR   Y14, Y3, Y14
  5198  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5199  	VPADDD  Y9, Y14, Y9
  5200  	VPXOR   Y4, Y9, Y4
  5201  	VPSRLD  $0x0c, Y4, Y8
  5202  	VPSLLD  $0x14, Y4, Y4
  5203  	VPOR    Y4, Y8, Y4
  5204  	VPADDD  Y3, Y4, Y3
  5205  	VPADDD  128(SP), Y3, Y3
  5206  	VPXOR   Y14, Y3, Y14
  5207  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5208  	VPADDD  Y9, Y14, Y9
  5209  	VPXOR   Y4, Y9, Y4
  5210  	VPSRLD  $0x07, Y4, Y8
  5211  	VPSLLD  $0x19, Y4, Y4
  5212  	VPOR    Y4, Y8, Y4
  5213  
  5214  	// Round 6
  5215  	VPADDD  Y0, Y4, Y0
  5216  	VPADDD  288(SP), Y0, Y0
  5217  	VPXOR   Y12, Y0, Y12
  5218  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5219  	VMOVDQU 512(SP), Y8
  5220  	VPADDD  Y8, Y12, Y8
  5221  	VPXOR   Y4, Y8, Y4
  5222  	VMOVDQU Y8, 512(SP)
  5223  	VPSRLD  $0x0c, Y4, Y8
  5224  	VPSLLD  $0x14, Y4, Y4
  5225  	VPOR    Y4, Y8, Y4
  5226  	VPADDD  Y0, Y4, Y0
  5227  	VPADDD  448(SP), Y0, Y0
  5228  	VPXOR   Y12, Y0, Y12
  5229  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5230  	VMOVDQU 512(SP), Y8
  5231  	VPADDD  Y8, Y12, Y8
  5232  	VPXOR   Y4, Y8, Y4
  5233  	VMOVDQU Y8, 512(SP)
  5234  	VPSRLD  $0x07, Y4, Y8
  5235  	VPSLLD  $0x19, Y4, Y4
  5236  	VPOR    Y4, Y8, Y4
  5237  	VPADDD  Y1, Y5, Y1
  5238  	VPADDD  352(SP), Y1, Y1
  5239  	VPXOR   Y13, Y1, Y13
  5240  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5241  	VPADDD  Y9, Y13, Y9
  5242  	VPXOR   Y5, Y9, Y5
  5243  	VPSRLD  $0x0c, Y5, Y8
  5244  	VPSLLD  $0x14, Y5, Y5
  5245  	VPOR    Y5, Y8, Y5
  5246  	VPADDD  Y1, Y5, Y1
  5247  	VPADDD  160(SP), Y1, Y1
  5248  	VPXOR   Y13, Y1, Y13
  5249  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5250  	VPADDD  Y9, Y13, Y9
  5251  	VPXOR   Y5, Y9, Y5
  5252  	VPSRLD  $0x07, Y5, Y8
  5253  	VPSLLD  $0x19, Y5, Y5
  5254  	VPOR    Y5, Y8, Y5
  5255  	VPADDD  Y2, Y6, Y2
  5256  	VPADDD  256(SP), Y2, Y2
  5257  	VPXOR   Y14, Y2, Y14
  5258  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5259  	VPADDD  Y10, Y14, Y10
  5260  	VPXOR   Y6, Y10, Y6
  5261  	VPSRLD  $0x0c, Y6, Y8
  5262  	VPSLLD  $0x14, Y6, Y6
  5263  	VPOR    Y6, Y8, Y6
  5264  	VPADDD  Y2, Y6, Y2
  5265  	VPADDD  384(SP), Y2, Y2
  5266  	VPXOR   Y14, Y2, Y14
  5267  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5268  	VPADDD  Y10, Y14, Y10
  5269  	VPXOR   Y6, Y10, Y6
  5270  	VPSRLD  $0x07, Y6, Y8
  5271  	VPSLLD  $0x19, Y6, Y6
  5272  	VPOR    Y6, Y8, Y6
  5273  	VPADDD  Y3, Y7, Y3
  5274  	VPADDD  480(SP), Y3, Y3
  5275  	VPXOR   Y15, Y3, Y15
  5276  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5277  	VPADDD  Y11, Y15, Y11
  5278  	VPXOR   Y7, Y11, Y7
  5279  	VPSRLD  $0x0c, Y7, Y8
  5280  	VPSLLD  $0x14, Y7, Y7
  5281  	VPOR    Y7, Y8, Y7
  5282  	VPADDD  Y3, Y7, Y3
  5283  	VPADDD  32(SP), Y3, Y3
  5284  	VPXOR   Y15, Y3, Y15
  5285  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5286  	VPADDD  Y11, Y15, Y11
  5287  	VPXOR   Y7, Y11, Y7
  5288  	VPSRLD  $0x07, Y7, Y8
  5289  	VPSLLD  $0x19, Y7, Y7
  5290  	VPOR    Y7, Y8, Y7
  5291  	VPADDD  Y0, Y5, Y0
  5292  	VPADDD  416(SP), Y0, Y0
  5293  	VPXOR   Y15, Y0, Y15
  5294  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5295  	VPADDD  Y10, Y15, Y10
  5296  	VPXOR   Y5, Y10, Y5
  5297  	VPSRLD  $0x0c, Y5, Y8
  5298  	VPSLLD  $0x14, Y5, Y5
  5299  	VPOR    Y5, Y8, Y5
  5300  	VPADDD  Y0, Y5, Y0
  5301  	VPADDD  96(SP), Y0, Y0
  5302  	VPXOR   Y15, Y0, Y15
  5303  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5304  	VPADDD  Y10, Y15, Y10
  5305  	VPXOR   Y5, Y10, Y5
  5306  	VPSRLD  $0x07, Y5, Y8
  5307  	VPSLLD  $0x19, Y5, Y5
  5308  	VPOR    Y5, Y8, Y5
  5309  	VPADDD  Y1, Y6, Y1
  5310  	VPADDD  (SP), Y1, Y1
  5311  	VPXOR   Y12, Y1, Y12
  5312  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5313  	VPADDD  Y11, Y12, Y11
  5314  	VPXOR   Y6, Y11, Y6
  5315  	VPSRLD  $0x0c, Y6, Y8
  5316  	VPSLLD  $0x14, Y6, Y6
  5317  	VPOR    Y6, Y8, Y6
  5318  	VPADDD  Y1, Y6, Y1
  5319  	VPADDD  320(SP), Y1, Y1
  5320  	VPXOR   Y12, Y1, Y12
  5321  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5322  	VPADDD  Y11, Y12, Y11
  5323  	VPXOR   Y6, Y11, Y6
  5324  	VPSRLD  $0x07, Y6, Y8
  5325  	VPSLLD  $0x19, Y6, Y6
  5326  	VPOR    Y6, Y8, Y6
  5327  	VPADDD  Y2, Y7, Y2
  5328  	VPADDD  64(SP), Y2, Y2
  5329  	VPXOR   Y13, Y2, Y13
  5330  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5331  	VMOVDQU 512(SP), Y8
  5332  	VPADDD  Y8, Y13, Y8
  5333  	VPXOR   Y7, Y8, Y7
  5334  	VMOVDQU Y8, 512(SP)
  5335  	VPSRLD  $0x0c, Y7, Y8
  5336  	VPSLLD  $0x14, Y7, Y7
  5337  	VPOR    Y7, Y8, Y7
  5338  	VPADDD  Y2, Y7, Y2
  5339  	VPADDD  192(SP), Y2, Y2
  5340  	VPXOR   Y13, Y2, Y13
  5341  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5342  	VMOVDQU 512(SP), Y8
  5343  	VPADDD  Y8, Y13, Y8
  5344  	VPXOR   Y7, Y8, Y7
  5345  	VMOVDQU Y8, 512(SP)
  5346  	VPSRLD  $0x07, Y7, Y8
  5347  	VPSLLD  $0x19, Y7, Y7
  5348  	VPOR    Y7, Y8, Y7
  5349  	VPADDD  Y3, Y4, Y3
  5350  	VPADDD  128(SP), Y3, Y3
  5351  	VPXOR   Y14, Y3, Y14
  5352  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5353  	VPADDD  Y9, Y14, Y9
  5354  	VPXOR   Y4, Y9, Y4
  5355  	VPSRLD  $0x0c, Y4, Y8
  5356  	VPSLLD  $0x14, Y4, Y4
  5357  	VPOR    Y4, Y8, Y4
  5358  	VPADDD  Y3, Y4, Y3
  5359  	VPADDD  224(SP), Y3, Y3
  5360  	VPXOR   Y14, Y3, Y14
  5361  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5362  	VPADDD  Y9, Y14, Y9
  5363  	VPXOR   Y4, Y9, Y4
  5364  	VPSRLD  $0x07, Y4, Y8
  5365  	VPSLLD  $0x19, Y4, Y4
  5366  	VPOR    Y4, Y8, Y4
  5367  
  5368  	// Round 7
  5369  	VPADDD  Y0, Y4, Y0
  5370  	VPADDD  352(SP), Y0, Y0
  5371  	VPXOR   Y12, Y0, Y12
  5372  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5373  	VMOVDQU 512(SP), Y8
  5374  	VPADDD  Y8, Y12, Y8
  5375  	VPXOR   Y4, Y8, Y4
  5376  	VMOVDQU Y8, 512(SP)
  5377  	VPSRLD  $0x0c, Y4, Y8
  5378  	VPSLLD  $0x14, Y4, Y4
  5379  	VPOR    Y4, Y8, Y4
  5380  	VPADDD  Y0, Y4, Y0
  5381  	VPADDD  480(SP), Y0, Y0
  5382  	VPXOR   Y12, Y0, Y12
  5383  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5384  	VMOVDQU 512(SP), Y8
  5385  	VPADDD  Y8, Y12, Y8
  5386  	VPXOR   Y4, Y8, Y4
  5387  	VMOVDQU Y8, 512(SP)
  5388  	VPSRLD  $0x07, Y4, Y8
  5389  	VPSLLD  $0x19, Y4, Y4
  5390  	VPOR    Y4, Y8, Y4
  5391  	VPADDD  Y1, Y5, Y1
  5392  	VPADDD  160(SP), Y1, Y1
  5393  	VPXOR   Y13, Y1, Y13
  5394  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5395  	VPADDD  Y9, Y13, Y9
  5396  	VPXOR   Y5, Y9, Y5
  5397  	VPSRLD  $0x0c, Y5, Y8
  5398  	VPSLLD  $0x14, Y5, Y5
  5399  	VPOR    Y5, Y8, Y5
  5400  	VPADDD  Y1, Y5, Y1
  5401  	VPADDD  (SP), Y1, Y1
  5402  	VPXOR   Y13, Y1, Y13
  5403  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5404  	VPADDD  Y9, Y13, Y9
  5405  	VPXOR   Y5, Y9, Y5
  5406  	VPSRLD  $0x07, Y5, Y8
  5407  	VPSLLD  $0x19, Y5, Y5
  5408  	VPOR    Y5, Y8, Y5
  5409  	VPADDD  Y2, Y6, Y2
  5410  	VPADDD  32(SP), Y2, Y2
  5411  	VPXOR   Y14, Y2, Y14
  5412  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5413  	VPADDD  Y10, Y14, Y10
  5414  	VPXOR   Y6, Y10, Y6
  5415  	VPSRLD  $0x0c, Y6, Y8
  5416  	VPSLLD  $0x14, Y6, Y6
  5417  	VPOR    Y6, Y8, Y6
  5418  	VPADDD  Y2, Y6, Y2
  5419  	VPADDD  288(SP), Y2, Y2
  5420  	VPXOR   Y14, Y2, Y14
  5421  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5422  	VPADDD  Y10, Y14, Y10
  5423  	VPXOR   Y6, Y10, Y6
  5424  	VPSRLD  $0x07, Y6, Y8
  5425  	VPSLLD  $0x19, Y6, Y6
  5426  	VPOR    Y6, Y8, Y6
  5427  	VPADDD  Y3, Y7, Y3
  5428  	VPADDD  256(SP), Y3, Y3
  5429  	VPXOR   Y15, Y3, Y15
  5430  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5431  	VPADDD  Y11, Y15, Y11
  5432  	VPXOR   Y7, Y11, Y7
  5433  	VPSRLD  $0x0c, Y7, Y8
  5434  	VPSLLD  $0x14, Y7, Y7
  5435  	VPOR    Y7, Y8, Y7
  5436  	VPADDD  Y3, Y7, Y3
  5437  	VPADDD  192(SP), Y3, Y3
  5438  	VPXOR   Y15, Y3, Y15
  5439  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5440  	VPADDD  Y11, Y15, Y11
  5441  	VPXOR   Y7, Y11, Y7
  5442  	VPSRLD  $0x07, Y7, Y8
  5443  	VPSLLD  $0x19, Y7, Y7
  5444  	VPOR    Y7, Y8, Y7
  5445  	VPADDD  Y0, Y5, Y0
  5446  	VPADDD  448(SP), Y0, Y0
  5447  	VPXOR   Y15, Y0, Y15
  5448  	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5449  	VPADDD  Y10, Y15, Y10
  5450  	VPXOR   Y5, Y10, Y5
  5451  	VPSRLD  $0x0c, Y5, Y8
  5452  	VPSLLD  $0x14, Y5, Y5
  5453  	VPOR    Y5, Y8, Y5
  5454  	VPADDD  Y0, Y5, Y0
  5455  	VPADDD  320(SP), Y0, Y0
  5456  	VPXOR   Y15, Y0, Y15
  5457  	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5458  	VPADDD  Y10, Y15, Y10
  5459  	VPXOR   Y5, Y10, Y5
  5460  	VPSRLD  $0x07, Y5, Y8
  5461  	VPSLLD  $0x19, Y5, Y5
  5462  	VPOR    Y5, Y8, Y5
  5463  	VPADDD  Y1, Y6, Y1
  5464  	VPADDD  64(SP), Y1, Y1
  5465  	VPXOR   Y12, Y1, Y12
  5466  	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5467  	VPADDD  Y11, Y12, Y11
  5468  	VPXOR   Y6, Y11, Y6
  5469  	VPSRLD  $0x0c, Y6, Y8
  5470  	VPSLLD  $0x14, Y6, Y6
  5471  	VPOR    Y6, Y8, Y6
  5472  	VPADDD  Y1, Y6, Y1
  5473  	VPADDD  384(SP), Y1, Y1
  5474  	VPXOR   Y12, Y1, Y12
  5475  	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5476  	VPADDD  Y11, Y12, Y11
  5477  	VPXOR   Y6, Y11, Y6
  5478  	VPSRLD  $0x07, Y6, Y8
  5479  	VPSLLD  $0x19, Y6, Y6
  5480  	VPOR    Y6, Y8, Y6
  5481  	VPADDD  Y2, Y7, Y2
  5482  	VPADDD  96(SP), Y2, Y2
  5483  	VPXOR   Y13, Y2, Y13
  5484  	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5485  	VMOVDQU 512(SP), Y8
  5486  	VPADDD  Y8, Y13, Y8
  5487  	VPXOR   Y7, Y8, Y7
  5488  	VMOVDQU Y8, 512(SP)
  5489  	VPSRLD  $0x0c, Y7, Y8
  5490  	VPSLLD  $0x14, Y7, Y7
  5491  	VPOR    Y7, Y8, Y7
  5492  	VPADDD  Y2, Y7, Y2
  5493  	VPADDD  128(SP), Y2, Y2
  5494  	VPXOR   Y13, Y2, Y13
  5495  	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5496  	VMOVDQU 512(SP), Y8
  5497  	VPADDD  Y8, Y13, Y8
  5498  	VPXOR   Y7, Y8, Y7
  5499  	VMOVDQU Y8, 512(SP)
  5500  	VPSRLD  $0x07, Y7, Y8
  5501  	VPSLLD  $0x19, Y7, Y7
  5502  	VPOR    Y7, Y8, Y7
  5503  	VPADDD  Y3, Y4, Y3
  5504  	VPADDD  224(SP), Y3, Y3
  5505  	VPXOR   Y14, Y3, Y14
  5506  	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5507  	VPADDD  Y9, Y14, Y9
  5508  	VPXOR   Y4, Y9, Y4
  5509  	VPSRLD  $0x0c, Y4, Y8
  5510  	VPSLLD  $0x14, Y4, Y4
  5511  	VPOR    Y4, Y8, Y4
  5512  	VPADDD  Y3, Y4, Y3
  5513  	VPADDD  416(SP), Y3, Y3
  5514  	VPXOR   Y14, Y3, Y14
  5515  	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5516  	VPADDD  Y9, Y14, Y9
  5517  	VPXOR   Y4, Y9, Y4
  5518  	VPSRLD  $0x07, Y4, Y8
  5519  	VPSLLD  $0x19, Y4, Y4
  5520  	VPOR    Y4, Y8, Y4
  5521  	VMOVDQU 512(SP), Y8
  5522  
  5523  	// Finalize CVs
  5524  	VPXOR       Y0, Y8, Y0
  5525  	VPXOR       Y1, Y9, Y1
  5526  	VPXOR       Y2, Y10, Y2
  5527  	VPXOR       Y3, Y11, Y3
  5528  	VPXOR       Y4, Y12, Y4
  5529  	VPXOR       Y5, Y13, Y5
  5530  	VPXOR       Y6, Y14, Y6
  5531  	VPXOR       Y7, Y15, Y7
  5532  	VPUNPCKLDQ  Y1, Y0, Y8
  5533  	VPUNPCKHDQ  Y1, Y0, Y9
  5534  	VPUNPCKLDQ  Y3, Y2, Y10
  5535  	VPUNPCKHDQ  Y3, Y2, Y11
  5536  	VPUNPCKLDQ  Y5, Y4, Y12
  5537  	VPUNPCKHDQ  Y5, Y4, Y13
  5538  	VPUNPCKLDQ  Y7, Y6, Y14
  5539  	VPUNPCKHDQ  Y7, Y6, Y15
  5540  	VPUNPCKLQDQ Y10, Y8, Y0
  5541  	VPUNPCKHQDQ Y10, Y8, Y1
  5542  	VPUNPCKLQDQ Y11, Y9, Y2
  5543  	VPUNPCKHQDQ Y11, Y9, Y3
  5544  	VPUNPCKLQDQ Y14, Y12, Y4
  5545  	VPUNPCKHQDQ Y14, Y12, Y5
  5546  	VPUNPCKLQDQ Y15, Y13, Y6
  5547  	VPUNPCKHQDQ Y15, Y13, Y7
  5548  	VPERM2I128  $0x20, Y4, Y0, Y8
  5549  	VPERM2I128  $0x31, Y4, Y0, Y12
  5550  	VPERM2I128  $0x20, Y5, Y1, Y9
  5551  	VPERM2I128  $0x31, Y5, Y1, Y13
  5552  	VPERM2I128  $0x20, Y6, Y2, Y10
  5553  	VPERM2I128  $0x31, Y6, Y2, Y14
  5554  	VPERM2I128  $0x20, Y7, Y3, Y11
  5555  	VPERM2I128  $0x31, Y7, Y3, Y15
  5556  	VMOVDQU     Y8, (AX)
  5557  	VMOVDQU     Y9, 32(AX)
  5558  	VMOVDQU     Y10, 64(AX)
  5559  	VMOVDQU     Y11, 96(AX)
  5560  	VMOVDQU     Y12, 128(AX)
  5561  	VMOVDQU     Y13, 160(AX)
  5562  	VMOVDQU     Y14, 192(AX)
  5563  	VMOVDQU     Y15, 224(AX)
  5564  	RET