github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
     6  
     7  // +build go1.7,amd64,!gccgo,!appengine
     8  
     9  #include "textflag.h"
    10  // General register allocation
    11  #define oup DI
    12  #define inp SI
    13  #define inl BX
    14  #define adp CX // free to reuse, after we hash the additional data
    15  #define keyp R8 // free to reuse, when we copy the key to stack
    16  #define itr2 R9 // general iterator
    17  #define itr1 CX // general iterator
    18  #define acc0 R10
    19  #define acc1 R11
    20  #define acc2 R12
    21  #define t0 R13
    22  #define t1 R14
    23  #define t2 R15
    24  #define t3 R8
    25  // Register and stack allocation for the SSE code
    26  #define rStore (0*16)(BP)
    27  #define sStore (1*16)(BP)
    28  #define state1Store (2*16)(BP)
    29  #define state2Store (3*16)(BP)
    30  #define tmpStore (4*16)(BP)
    31  #define ctr0Store (5*16)(BP)
    32  #define ctr1Store (6*16)(BP)
    33  #define ctr2Store (7*16)(BP)
    34  #define ctr3Store (8*16)(BP)
    35  #define A0 X0
    36  #define A1 X1
    37  #define A2 X2
    38  #define B0 X3
    39  #define B1 X4
    40  #define B2 X5
    41  #define C0 X6
    42  #define C1 X7
    43  #define C2 X8
    44  #define D0 X9
    45  #define D1 X10
    46  #define D2 X11
    47  #define T0 X12
    48  #define T1 X13
    49  #define T2 X14
    50  #define T3 X15
    51  #define A3 T0
    52  #define B3 T1
    53  #define C3 T2
    54  #define D3 T3
    55  // Register and stack allocation for the AVX2 code
    56  #define rsStoreAVX2 (0*32)(BP)
    57  #define state1StoreAVX2 (1*32)(BP)
    58  #define state2StoreAVX2 (2*32)(BP)
    59  #define ctr0StoreAVX2 (3*32)(BP)
    60  #define ctr1StoreAVX2 (4*32)(BP)
    61  #define ctr2StoreAVX2 (5*32)(BP)
    62  #define ctr3StoreAVX2 (6*32)(BP)
    63  #define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
    64  #define AA0 Y0
    65  #define AA1 Y5
    66  #define AA2 Y6
    67  #define AA3 Y7
    68  #define BB0 Y14
    69  #define BB1 Y9
    70  #define BB2 Y10
    71  #define BB3 Y11
    72  #define CC0 Y12
    73  #define CC1 Y13
    74  #define CC2 Y8
    75  #define CC3 Y15
    76  #define DD0 Y4
    77  #define DD1 Y1
    78  #define DD2 Y2
    79  #define DD3 Y3
    80  #define TT0 DD3
    81  #define TT1 AA3
    82  #define TT2 BB3
    83  #define TT3 CC3
    84  // ChaCha20 constants
    85  DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
    86  DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
    87  DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
    88  DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
    89  DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
    90  DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
    91  DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
    92  DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
    93  // <<< 16 with PSHUFB
    94  DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
    95  DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
    96  DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
    97  DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
    98  // <<< 8 with PSHUFB
    99  DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
   100  DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
   101  DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
   102  DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
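        // These shuffle masks implement the 16- and 8-bit rotations of the quarter round
        // purely with byte permutes: PSHUFB/VPSHUFB with rol16 (rol8) rotates every 32-bit
        // lane left by 16 (8) bits, since SSE/AVX2 have no vector rotate instruction. The
        // 12- and 7-bit rotations are done with shift pairs in the chachaQR macros below.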
   103  
   104  DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
   105  DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
   106  DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
   107  DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
   108  
   109  DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
   110  DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
   111  DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
   112  DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
   113  // Poly1305 key clamp
   114  DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
   115  DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
   116  DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   117  DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
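        // The low 128 bits clamp the Poly1305 "r" half of the key as RFC 7539 requires
        // (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff); the high 128 bits are all ones so a
        // 256-bit VPAND leaves the "s" half of the key untouched.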
   118  
   119  DATA ·sseIncMask<>+0x00(SB)/8, $0x1
   120  DATA ·sseIncMask<>+0x08(SB)/8, $0x0
   121  // To load/store the last < 16 bytes in a buffer
   122  DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
   123  DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
   124  DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
   125  DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
   126  DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
   127  DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
   128  DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
   129  DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
   130  DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
   131  DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
   132  DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
   133  DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
   134  DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
   135  DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
   136  DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
   137  DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
   138  DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
   139  DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
   140  DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
   141  DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
   142  DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
   143  DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
   144  DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
   145  DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
   146  DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
   147  DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
   148  DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
   149  DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
   150  DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
   151  DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
   152  
   153  GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
   154  GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
   155  GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
   156  GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
   157  GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
   158  GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
   159  GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
   160  GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
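        // andMask holds 15 masks: the entry at offset 16*(n-1) keeps only the low n bytes
        // of a 16-byte vector (1 <= n <= 15). The tail code uses it to zero the bytes of a
        // partial final block that lie past the end of the data.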
   161  // No PALIGNR in Go ASM yet (but VPALIGNR is present).
   162  #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
   163  #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
   164  #define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
   165  #define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
   166  #define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
   167  #define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
   168  #define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
   169  #define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
   170  #define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
   171  #define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
   172  #define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
   173  #define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
   174  #define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
   175  #define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
   176  #define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
   177  #define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
   178  #define shiftC0Right shiftC0Left
   179  #define shiftC1Right shiftC1Left
   180  #define shiftC2Right shiftC2Left
   181  #define shiftC3Right shiftC3Left
   182  #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
   183  #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
   184  #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
   185  #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
   186  // Some macros
   187  #define chachaQR(A, B, C, D, T) \
   188  	PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D                            \
   189  	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
   190  	PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D                             \
   191  	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
   192  
   193  #define chachaQR_AVX2(A, B, C, D, T) \
   194  	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
   195  	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
   196  	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
   197  	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
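        // Both macros above compute one ChaCha20 quarter round; in Go terms (per RFC 7539):
        //
        //	a += b; d ^= a; d = bits.RotateLeft32(d, 16)
        //	c += d; b ^= c; b = bits.RotateLeft32(b, 12)
        //	a += b; d ^= a; d = bits.RotateLeft32(d, 8)
        //	c += d; b ^= c; b = bits.RotateLeft32(b, 7)
        //
        // The SSE macro applies this to the four columns of one block held in A/B/C/D; the
        // AVX2 macro does the same for two blocks at once, one per 128-bit lane.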
   198  
   199  #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
   200  #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
   201  #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
   202  #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
   203  #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
   204  
   205  #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
   206  #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
   207  #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
   208  
   209  #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
   210  #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
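        // polyAdd adds one 16-byte block (plus the 2^128 padding bit) into the ~130-bit
        // accumulator held in acc0:acc1:acc2. polyMul/polyMulAVX2 then multiply the
        // accumulator by r (the 16 bytes at rStore) and reduce modulo 2^130 - 5:
        // polyMulReduceStage folds the product bits at and above 2^130 back in as 5*c
        // (computed as 4*c + c), using the identity 2^130 = 5 (mod 2^130 - 5).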
   211  // ----------------------------------------------------------------------------
   212  TEXT polyHashADInternal<>(SB), NOSPLIT, $0
   213  	// adp points to beginning of additional data
   214  	// itr2 holds ad length
   215  	XORQ acc0, acc0
   216  	XORQ acc1, acc1
   217  	XORQ acc2, acc2
   218  	CMPQ itr2, $13
   219  	JNE  hashADLoop
   220  
   221  openFastTLSAD:
   222  	// Special treatment for the TLS case of 13 bytes
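        	// The 13 bytes fit in two registers: bytes 0-7 go to acc0, and an overlapping
        	// 8-byte load at offset 5, shifted right by 24 bits, leaves bytes 8-12 in acc1.
        	// acc2 = 1 supplies the 2^128 padding bit for the zero-padded block.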
   223  	MOVQ (adp), acc0
   224  	MOVQ 5(adp), acc1
   225  	SHRQ $24, acc1
   226  	MOVQ $1, acc2
   227  	polyMul
   228  	RET
   229  
   230  hashADLoop:
   231  	// Hash in 16 byte chunks
   232  	CMPQ itr2, $16
   233  	JB   hashADTail
   234  	polyAdd(0(adp))
   235  	LEAQ (1*16)(adp), adp
   236  	SUBQ $16, itr2
   237  	polyMul
   238  	JMP  hashADLoop
   239  
   240  hashADTail:
   241  	CMPQ itr2, $0
   242  	JE   hashADDone
   243  
   244  	// Hash last < 16 byte tail
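        	// Build the chunk in t1:t0 by walking backwards from its end: shift the 128-bit
        	// value left one byte and insert the next byte; the result is the chunk
        	// zero-padded to 16 bytes, hashed below with the usual padding bit.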
   245  	XORQ t0, t0
   246  	XORQ t1, t1
   247  	XORQ t2, t2
   248  	ADDQ itr2, adp
   249  
   250  hashADTailLoop:
   251  	SHLQ $8, t1:t0
   252  	SHLQ $8, t0
   253  	MOVB -1(adp), t2
   254  	XORQ t2, t0
   255  	DECQ adp
   256  	DECQ itr2
   257  	JNE  hashADTailLoop
   258  
   259  hashADTailFinish:
   260  	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
   261  	polyMul
   262  
   263  	// Finished AD
   264  hashADDone:
   265  	RET
   266  
   267  // ----------------------------------------------------------------------------
   268  // func chacha20Poly1305Open(dst, key, src, ad []byte) bool
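        // The frame is $288-97: 288 bytes of locals (eight 32-byte spill slots used by the
        // AVX2 code, plus 32 bytes of slack so BP can be aligned to 32 below) and 97 bytes
        // of arguments (four 24-byte slices plus the 1-byte boolean result at ret+96(FP)).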
   269  TEXT ·chacha20Poly1305Open(SB), 0, $288-97
   270  	// For aligned stack access
   271  	MOVQ SP, BP
   272  	ADDQ $32, BP
   273  	ANDQ $-32, BP
   274  	MOVQ dst+0(FP), oup
   275  	MOVQ key+24(FP), keyp
   276  	MOVQ src+48(FP), inp
   277  	MOVQ src_len+56(FP), inl
   278  	MOVQ ad+72(FP), adp
   279  
   280  	// Check for AVX2 support
   281  	CMPB ·useAVX2(SB), $1
   282  	JE   chacha20Poly1305Open_AVX2
   283  
   284  	// Special optimization for very short buffers
   285  	CMPQ inl, $128
   286  	JBE  openSSE128 // About 16% faster
   287  
   288  	// For long buffers, prepare the poly key first
   289  	MOVOU ·chacha20Constants<>(SB), A0
   290  	MOVOU (1*16)(keyp), B0
   291  	MOVOU (2*16)(keyp), C0
   292  	MOVOU (3*16)(keyp), D0
   293  	MOVO  D0, T1
   294  
   295  	// Store state on stack for future use
   296  	MOVO B0, state1Store
   297  	MOVO C0, state2Store
   298  	MOVO D0, ctr3Store
   299  	MOVQ $10, itr2
   300  
   301  openSSEPreparePolyKey:
   302  	chachaQR(A0, B0, C0, D0, T0)
   303  	shiftB0Left;  shiftC0Left; shiftD0Left
   304  	chachaQR(A0, B0, C0, D0, T0)
   305  	shiftB0Right; shiftC0Right; shiftD0Right
   306  	DECQ          itr2
   307  	JNE           openSSEPreparePolyKey
   308  
   309  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   310  	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
   311  
   312  	// Clamp and store the key
   313  	PAND ·polyClampMask<>(SB), A0
   314  	MOVO A0, rStore; MOVO B0, sStore
   315  
   316  	// Hash AAD
   317  	MOVQ ad_len+80(FP), itr2
   318  	CALL polyHashADInternal<>(SB)
   319  
   320  openSSEMainLoop:
   321  	CMPQ inl, $256
   322  	JB   openSSEMainLoopDone
   323  
   324  	// Load state, increment counter blocks
   325  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
   326  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   327  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   328  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
   329  
   330  	// Store counters
   331  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   332  
   333  	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
   334  	MOVQ $4, itr1
   335  	MOVQ inp, itr2
   336  
   337  openSSEInternalLoop:
   338  	MOVO          C3, tmpStore
   339  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   340  	MOVO          tmpStore, C3
   341  	MOVO          C1, tmpStore
   342  	chachaQR(A3, B3, C3, D3, C1)
   343  	MOVO          tmpStore, C1
   344  	polyAdd(0(itr2))
   345  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
   346  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
   347  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
   348  	polyMulStage1
   349  	polyMulStage2
   350  	LEAQ          (2*8)(itr2), itr2
   351  	MOVO          C3, tmpStore
   352  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   353  	MOVO          tmpStore, C3
   354  	MOVO          C1, tmpStore
   355  	polyMulStage3
   356  	chachaQR(A3, B3, C3, D3, C1)
   357  	MOVO          tmpStore, C1
   358  	polyMulReduceStage
   359  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   360  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   361  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   362  	DECQ          itr1
   363  	JGE           openSSEInternalLoop
   364  
   365  	polyAdd(0(itr2))
   366  	polyMul
   367  	LEAQ (2*8)(itr2), itr2
   368  
   369  	CMPQ itr1, $-6
   370  	JG   openSSEInternalLoop
   371  
   372  	// Add in the state
   373  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
   374  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   375  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   376  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   377  
   378  	// Load - xor - store
   379  	MOVO  D3, tmpStore
   380  	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
   381  	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
   382  	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
   383  	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
   384  	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
   385  	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
   386  	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
   387  	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
   388  	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
   389  	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
   390  	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
   391  	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
   392  	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
   393  	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
   394  	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
   395  	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
   396  	LEAQ  256(inp), inp
   397  	LEAQ  256(oup), oup
   398  	SUBQ  $256, inl
   399  	JMP   openSSEMainLoop
   400  
   401  openSSEMainLoopDone:
   402  	// Handle the various tail sizes efficiently
   403  	TESTQ inl, inl
   404  	JE    openSSEFinalize
   405  	CMPQ  inl, $64
   406  	JBE   openSSETail64
   407  	CMPQ  inl, $128
   408  	JBE   openSSETail128
   409  	CMPQ  inl, $192
   410  	JBE   openSSETail192
   411  	JMP   openSSETail256
   412  
   413  openSSEFinalize:
   414  	// Hash in the PT, AAD lengths
   415  	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
   416  	polyMul
   417  
   418  	// Final reduce
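        	// Conditionally subtract p = 2^130 - 5: the SUB/SBB sequence subtracts p's limbs
        	// (0xFFFFFFFFFFFFFFFB, 0xFFFFFFFFFFFFFFFF, 3); if it borrows, the accumulator was
        	// already below p and the saved copy is restored by the CMOVs.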
   419  	MOVQ    acc0, t0
   420  	MOVQ    acc1, t1
   421  	MOVQ    acc2, t2
   422  	SUBQ    $-5, acc0
   423  	SBBQ    $-1, acc1
   424  	SBBQ    $3, acc2
   425  	CMOVQCS t0, acc0
   426  	CMOVQCS t1, acc1
   427  	CMOVQCS t2, acc2
   428  
   429  	// Add in the "s" part of the key
   430  	ADDQ 0+sStore, acc0
   431  	ADCQ 8+sStore, acc1
   432  
   433  	// Finally, constant time compare to the tag at the end of the message
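        	// XOR the computed tag with the 16 bytes that follow the ciphertext and OR the
        	// two halves together; CMOVQEQ sets AX to 1 only if every bit matched, so the
        	// comparison never branches on secret data.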
   434  	XORQ    AX, AX
   435  	MOVQ    $1, DX
   436  	XORQ    (0*8)(inp), acc0
   437  	XORQ    (1*8)(inp), acc1
   438  	ORQ     acc1, acc0
   439  	CMOVQEQ DX, AX
   440  
   441  	// Return true iff tags are equal
   442  	MOVB AX, ret+96(FP)
   443  	RET
   444  
   445  // ----------------------------------------------------------------------------
   446  // Special optimization for buffers smaller than 129 bytes
   447  openSSE128:
   448  	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
   449  	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
   450  	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   451  	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   452  	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
   453  	MOVQ  $10, itr2
   454  
   455  openSSE128InnerCipherLoop:
   456  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   457  	shiftB0Left;  shiftB1Left; shiftB2Left
   458  	shiftC0Left;  shiftC1Left; shiftC2Left
   459  	shiftD0Left;  shiftD1Left; shiftD2Left
   460  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   461  	shiftB0Right; shiftB1Right; shiftB2Right
   462  	shiftC0Right; shiftC1Right; shiftC2Right
   463  	shiftD0Right; shiftD1Right; shiftD2Right
   464  	DECQ          itr2
   465  	JNE           openSSE128InnerCipherLoop
   466  
   467  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   468  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
   469  	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
   470  	PADDL T2, C1; PADDL T2, C2
   471  	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
   472  
   473  	// Clamp and store the key
   474  	PAND  ·polyClampMask<>(SB), A0
   475  	MOVOU A0, rStore; MOVOU B0, sStore
   476  
   477  	// Hash
   478  	MOVQ ad_len+80(FP), itr2
   479  	CALL polyHashADInternal<>(SB)
   480  
   481  openSSE128Open:
   482  	CMPQ inl, $16
   483  	JB   openSSETail16
   484  	SUBQ $16, inl
   485  
   486  	// Load for hashing
   487  	polyAdd(0(inp))
   488  
   489  	// Load for decryption
   490  	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
   491  	LEAQ  (1*16)(inp), inp
   492  	LEAQ  (1*16)(oup), oup
   493  	polyMul
   494  
   495  	// Shift the stream "left"
   496  	MOVO B1, A1
   497  	MOVO C1, B1
   498  	MOVO D1, C1
   499  	MOVO A2, D1
   500  	MOVO B2, A2
   501  	MOVO C2, B2
   502  	MOVO D2, C2
   503  	JMP  openSSE128Open
   504  
   505  openSSETail16:
   506  	TESTQ inl, inl
   507  	JE    openSSEFinalize
   508  
   509  	// We can safely load the CT from the end, because it is padded with the MAC
   510  	MOVQ   inl, itr2
   511  	SHLQ   $4, itr2
   512  	LEAQ   ·andMask<>(SB), t0
   513  	MOVOU  (inp), T0
   514  	ADDQ   inl, inp
   515  	PAND   -16(t0)(itr2*1), T0
   516  	MOVO   T0, 0+tmpStore
   517  	MOVQ   T0, t0
   518  	MOVQ   8+tmpStore, t1
   519  	PXOR   A1, T0
   520  
   521  	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
   522  openSSETail16Store:
   523  	MOVQ T0, t3
   524  	MOVB t3, (oup)
   525  	PSRLDQ $1, T0
   526  	INCQ   oup
   527  	DECQ   inl
   528  	JNE    openSSETail16Store
   529  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
   530  	polyMul
   531  	JMP    openSSEFinalize
   532  
   533  // ----------------------------------------------------------------------------
   534  // Special optimization for the last 64 bytes of ciphertext
   535  openSSETail64:
   536  	// Need to decrypt up to 64 bytes - prepare single block
   537  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
   538  	XORQ itr2, itr2
   539  	MOVQ inl, itr1
   540  	CMPQ itr1, $16
   541  	JB   openSSETail64LoopB
   542  
   543  openSSETail64LoopA:
   544  	// Perform ChaCha rounds, while hashing the remaining input
   545  	polyAdd(0(inp)(itr2*1))
   546  	polyMul
   547  	SUBQ $16, itr1
   548  
   549  openSSETail64LoopB:
   550  	ADDQ          $16, itr2
   551  	chachaQR(A0, B0, C0, D0, T0)
   552  	shiftB0Left;  shiftC0Left; shiftD0Left
   553  	chachaQR(A0, B0, C0, D0, T0)
   554  	shiftB0Right; shiftC0Right; shiftD0Right
   555  
   556  	CMPQ itr1, $16
   557  	JAE  openSSETail64LoopA
   558  
   559  	CMPQ itr2, $160
   560  	JNE  openSSETail64LoopB
   561  
   562  	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
   563  
   564  openSSETail64DecLoop:
   565  	CMPQ  inl, $16
   566  	JB    openSSETail64DecLoopDone
   567  	SUBQ  $16, inl
   568  	MOVOU (inp), T0
   569  	PXOR  T0, A0
   570  	MOVOU A0, (oup)
   571  	LEAQ  16(inp), inp
   572  	LEAQ  16(oup), oup
   573  	MOVO  B0, A0
   574  	MOVO  C0, B0
   575  	MOVO  D0, C0
   576  	JMP   openSSETail64DecLoop
   577  
   578  openSSETail64DecLoopDone:
   579  	MOVO A0, A1
   580  	JMP  openSSETail16
   581  
   582  // ----------------------------------------------------------------------------
   583  // Special optimization for the last 128 bytes of ciphertext
   584  openSSETail128:
   585  	// Need to decrypt up to 128 bytes - prepare two blocks
   586  	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
   587  	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
   588  	XORQ itr2, itr2
   589  	MOVQ inl, itr1
   590  	ANDQ $-16, itr1
   591  
   592  openSSETail128LoopA:
   593  	// Perform ChaCha rounds, while hashing the remaining input
   594  	polyAdd(0(inp)(itr2*1))
   595  	polyMul
   596  
   597  openSSETail128LoopB:
   598  	ADDQ          $16, itr2
   599  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   600  	shiftB0Left;  shiftC0Left; shiftD0Left
   601  	shiftB1Left;  shiftC1Left; shiftD1Left
   602  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   603  	shiftB0Right; shiftC0Right; shiftD0Right
   604  	shiftB1Right; shiftC1Right; shiftD1Right
   605  
   606  	CMPQ itr2, itr1
   607  	JB   openSSETail128LoopA
   608  
   609  	CMPQ itr2, $160
   610  	JNE  openSSETail128LoopB
   611  
   612  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
   613  	PADDL state1Store, B0; PADDL state1Store, B1
   614  	PADDL state2Store, C0; PADDL state2Store, C1
   615  	PADDL ctr1Store, D0; PADDL ctr0Store, D1
   616  
   617  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   618  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
   619  	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
   620  
   621  	SUBQ $64, inl
   622  	LEAQ 64(inp), inp
   623  	LEAQ 64(oup), oup
   624  	JMP  openSSETail64DecLoop
   625  
   626  // ----------------------------------------------------------------------------
   627  // Special optimization for the last 192 bytes of ciphertext
   628  openSSETail192:
   629  	// Need to decrypt up to 192 bytes - prepare three blocks
   630  	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
   631  	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
   632  	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
   633  
   634  	MOVQ    inl, itr1
   635  	MOVQ    $160, itr2
   636  	CMPQ    itr1, $160
   637  	CMOVQGT itr2, itr1
   638  	ANDQ    $-16, itr1
   639  	XORQ    itr2, itr2
   640  
   641  openSSLTail192LoopA:
   642  	// Perform ChaCha rounds, while hashing the remaining input
   643  	polyAdd(0(inp)(itr2*1))
   644  	polyMul
   645  
   646  openSSLTail192LoopB:
   647  	ADDQ         $16, itr2
   648  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   649  	shiftB0Left; shiftC0Left; shiftD0Left
   650  	shiftB1Left; shiftC1Left; shiftD1Left
   651  	shiftB2Left; shiftC2Left; shiftD2Left
   652  
   653  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   654  	shiftB0Right; shiftC0Right; shiftD0Right
   655  	shiftB1Right; shiftC1Right; shiftD1Right
   656  	shiftB2Right; shiftC2Right; shiftD2Right
   657  
   658  	CMPQ itr2, itr1
   659  	JB   openSSLTail192LoopA
   660  
   661  	CMPQ itr2, $160
   662  	JNE  openSSLTail192LoopB
   663  
   664  	CMPQ inl, $176
   665  	JB   openSSLTail192Store
   666  
   667  	polyAdd(160(inp))
   668  	polyMul
   669  
   670  	CMPQ inl, $192
   671  	JB   openSSLTail192Store
   672  
   673  	polyAdd(176(inp))
   674  	polyMul
   675  
   676  openSSLTail192Store:
   677  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
   678  	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
   679  	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
   680  	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
   681  
   682  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   683  	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
   684  	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
   685  
   686  	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
   687  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
   688  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   689  
   690  	SUBQ $128, inl
   691  	LEAQ 128(inp), inp
   692  	LEAQ 128(oup), oup
   693  	JMP  openSSETail64DecLoop
   694  
   695  // ----------------------------------------------------------------------------
   696  // Special optimization for the last 256 bytes of ciphertext
   697  openSSETail256:
   698  	// Need to decrypt up to 256 bytes - prepare four blocks
   699  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
   700  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   701  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   702  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
   703  
   704  	// Store counters
   705  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   706  	XORQ itr2, itr2
   707  
   708  openSSETail256Loop:
   709  	// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
   710  	polyAdd(0(inp)(itr2*1))
   711  	MOVO          C3, tmpStore
   712  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   713  	MOVO          tmpStore, C3
   714  	MOVO          C1, tmpStore
   715  	chachaQR(A3, B3, C3, D3, C1)
   716  	MOVO          tmpStore, C1
   717  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
   718  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
   719  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
   720  	polyMulStage1
   721  	polyMulStage2
   722  	MOVO          C3, tmpStore
   723  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   724  	MOVO          tmpStore, C3
   725  	MOVO          C1, tmpStore
   726  	chachaQR(A3, B3, C3, D3, C1)
   727  	MOVO          tmpStore, C1
   728  	polyMulStage3
   729  	polyMulReduceStage
   730  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   731  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   732  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   733  	ADDQ          $2*8, itr2
   734  	CMPQ          itr2, $160
   735  	JB            openSSETail256Loop
   736  	MOVQ          inl, itr1
   737  	ANDQ          $-16, itr1
   738  
   739  openSSETail256HashLoop:
   740  	polyAdd(0(inp)(itr2*1))
   741  	polyMul
   742  	ADDQ $2*8, itr2
   743  	CMPQ itr2, itr1
   744  	JB   openSSETail256HashLoop
   745  
   746  	// Add in the state
   747  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
   748  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   749  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   750  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   751  	MOVO  D3, tmpStore
   752  
   753  	// Load - xor - store
   754  	MOVOU (0*16)(inp), D3; PXOR D3, A0
   755  	MOVOU (1*16)(inp), D3; PXOR D3, B0
   756  	MOVOU (2*16)(inp), D3; PXOR D3, C0
   757  	MOVOU (3*16)(inp), D3; PXOR D3, D0
   758  	MOVOU A0, (0*16)(oup)
   759  	MOVOU B0, (1*16)(oup)
   760  	MOVOU C0, (2*16)(oup)
   761  	MOVOU D0, (3*16)(oup)
   762  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
   763  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
   764  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   765  	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
   766  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
   767  	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
   768  	LEAQ  192(inp), inp
   769  	LEAQ  192(oup), oup
   770  	SUBQ  $192, inl
   771  	MOVO  A3, A0
   772  	MOVO  B3, B0
   773  	MOVO  C3, C0
   774  	MOVO  tmpStore, D0
   775  
   776  	JMP openSSETail64DecLoop
   777  
   778  // ----------------------------------------------------------------------------
   779  // ------------------------- AVX2 Code ----------------------------------------
   780  chacha20Poly1305Open_AVX2:
   781  	VZEROUPPER
   782  	VMOVDQU ·chacha20Constants<>(SB), AA0
   783  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
   784  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
   785  	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
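        	// The raw byte sequences above encode VBROADCASTI128 loads of the remaining three
        	// 16-byte state rows at 16/32/48(keyp), duplicating each row into both 128-bit
        	// lanes of a YMM register; presumably, like PALIGNR, the instruction was not yet
        	// supported by the assembler.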
   786  	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
   787  
   788  	// Special optimization for very short buffers
   789  	CMPQ inl, $192
   790  	JBE  openAVX2192
   791  	CMPQ inl, $320
   792  	JBE  openAVX2320
   793  
   794  	// For the general case, prepare the Poly1305 key first - as a byproduct we get 64 bytes of cipher stream
   795  	VMOVDQA BB0, state1StoreAVX2
   796  	VMOVDQA CC0, state2StoreAVX2
   797  	VMOVDQA DD0, ctr3StoreAVX2
   798  	MOVQ    $10, itr2
   799  
   800  openAVX2PreparePolyKey:
   801  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   802  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
   803  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   804  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
   805  	DECQ     itr2
   806  	JNE      openAVX2PreparePolyKey
   807  
   808  	VPADDD ·chacha20Constants<>(SB), AA0, AA0
   809  	VPADDD state1StoreAVX2, BB0, BB0
   810  	VPADDD state2StoreAVX2, CC0, CC0
   811  	VPADDD ctr3StoreAVX2, DD0, DD0
   812  
   813  	VPERM2I128 $0x02, AA0, BB0, TT0
   814  
   815  	// Clamp and store poly key
   816  	VPAND   ·polyClampMask<>(SB), TT0, TT0
   817  	VMOVDQA TT0, rsStoreAVX2
   818  
   819  	// Stream for the first 64 bytes
   820  	VPERM2I128 $0x13, AA0, BB0, AA0
   821  	VPERM2I128 $0x13, CC0, DD0, BB0
   822  
   823  	// Hash AD + first 64 bytes
   824  	MOVQ ad_len+80(FP), itr2
   825  	CALL polyHashADInternal<>(SB)
   826  	XORQ itr1, itr1
   827  
   828  openAVX2InitialHash64:
   829  	polyAdd(0(inp)(itr1*1))
   830  	polyMulAVX2
   831  	ADDQ $16, itr1
   832  	CMPQ itr1, $64
   833  	JNE  openAVX2InitialHash64
   834  
   835  	// Decrypt the first 64 bytes
   836  	VPXOR   (0*32)(inp), AA0, AA0
   837  	VPXOR   (1*32)(inp), BB0, BB0
   838  	VMOVDQU AA0, (0*32)(oup)
   839  	VMOVDQU BB0, (1*32)(oup)
   840  	LEAQ    (2*32)(inp), inp
   841  	LEAQ    (2*32)(oup), oup
   842  	SUBQ    $64, inl
   843  
   844  openAVX2MainLoop:
   845  	CMPQ inl, $512
   846  	JB   openAVX2MainLoopDone
   847  
   848  	// Load state, increment counter blocks, store the incremented counters
   849  	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
   850  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
   851  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
   852  	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
   853  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
   854  	XORQ    itr1, itr1
   855  
   856  openAVX2InternalLoop:
   857  	// Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
   858  	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
   859  	polyAdd(0*8(inp)(itr1*1))
   860  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   861  	polyMulStage1_AVX2
   862  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   863  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
   864  	polyMulStage2_AVX2
   865  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   866  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   867  	polyMulStage3_AVX2
   868  	VMOVDQA  CC3, tmpStoreAVX2
   869  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   870  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   871  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   872  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   873  	VMOVDQA  tmpStoreAVX2, CC3
   874  	polyMulReduceStage
   875  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   876  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   877  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
   878  	polyAdd(2*8(inp)(itr1*1))
   879  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   880  	polyMulStage1_AVX2
   881  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   882  	VMOVDQA  CC3, tmpStoreAVX2
   883  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   884  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   885  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   886  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   887  	VMOVDQA  tmpStoreAVX2, CC3
   888  	polyMulStage2_AVX2
   889  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
   890  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   891  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
   892  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   893  	polyMulStage3_AVX2
   894  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   895  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
   896  	polyMulReduceStage
   897  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   898  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   899  	polyAdd(4*8(inp)(itr1*1))
   900  	LEAQ     (6*8)(itr1), itr1
   901  	VMOVDQA  CC3, tmpStoreAVX2
   902  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   903  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   904  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   905  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   906  	VMOVDQA  tmpStoreAVX2, CC3
   907  	polyMulStage1_AVX2
   908  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   909  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   910  	polyMulStage2_AVX2
   911  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
   912  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   913  	polyMulStage3_AVX2
   914  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   915  	VMOVDQA  CC3, tmpStoreAVX2
   916  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   917  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   918  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   919  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   920  	VMOVDQA  tmpStoreAVX2, CC3
   921  	polyMulReduceStage
   922  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
   923  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   924  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
   925  	CMPQ     itr1, $480
   926  	JNE      openAVX2InternalLoop
   927  
   928  	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
   929  	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
   930  	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
   931  	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
   932  	VMOVDQA CC3, tmpStoreAVX2
   933  
   934  	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
   935  	polyAdd(480(inp))
   936  	polyMulAVX2
   937  	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
   938  	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
   939  	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
   940  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
   941  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
   942  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
   943  
   944  	// and here
   945  	polyAdd(496(inp))
   946  	polyMulAVX2
   947  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
   948  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
   949  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
   950  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
   951  	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
   952  	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
   953  	LEAQ       (32*16)(inp), inp
   954  	LEAQ       (32*16)(oup), oup
   955  	SUBQ       $(32*16), inl
   956  	JMP        openAVX2MainLoop
   957  
   958  openAVX2MainLoopDone:
   959  	// Handle the various tail sizes efficiently
   960  	TESTQ inl, inl
   961  	JE    openSSEFinalize
   962  	CMPQ  inl, $128
   963  	JBE   openAVX2Tail128
   964  	CMPQ  inl, $256
   965  	JBE   openAVX2Tail256
   966  	CMPQ  inl, $384
   967  	JBE   openAVX2Tail384
   968  	JMP   openAVX2Tail512
   969  
   970  // ----------------------------------------------------------------------------
   971  // Special optimization for buffers smaller than 193 bytes
   972  openAVX2192:
   973  	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
   974  	VMOVDQA AA0, AA1
   975  	VMOVDQA BB0, BB1
   976  	VMOVDQA CC0, CC1
   977  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
   978  	VMOVDQA AA0, AA2
   979  	VMOVDQA BB0, BB2
   980  	VMOVDQA CC0, CC2
   981  	VMOVDQA DD0, DD2
   982  	VMOVDQA DD1, TT3
   983  	MOVQ    $10, itr2
   984  
   985  openAVX2192InnerCipherLoop:
   986  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   987  	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
   988  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   989  	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
   990  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   991  	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
   992  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   993  	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
   994  	DECQ       itr2
   995  	JNE        openAVX2192InnerCipherLoop
   996  	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
   997  	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
   998  	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
   999  	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  1000  	VPERM2I128 $0x02, AA0, BB0, TT0
  1001  
  1002  	// Clamp and store poly key
  1003  	VPAND   ·polyClampMask<>(SB), TT0, TT0
  1004  	VMOVDQA TT0, rsStoreAVX2
  1005  
  1006  	// Stream for up to 192 bytes
  1007  	VPERM2I128 $0x13, AA0, BB0, AA0
  1008  	VPERM2I128 $0x13, CC0, DD0, BB0
  1009  	VPERM2I128 $0x02, AA1, BB1, CC0
  1010  	VPERM2I128 $0x02, CC1, DD1, DD0
  1011  	VPERM2I128 $0x13, AA1, BB1, AA1
  1012  	VPERM2I128 $0x13, CC1, DD1, BB1
  1013  
  1014  openAVX2ShortOpen:
  1015  	// Hash
  1016  	MOVQ ad_len+80(FP), itr2
  1017  	CALL polyHashADInternal<>(SB)
  1018  
  1019  openAVX2ShortOpenLoop:
  1020  	CMPQ inl, $32
  1021  	JB   openAVX2ShortTail32
  1022  	SUBQ $32, inl
  1023  
  1024  	// Load for hashing
  1025  	polyAdd(0*8(inp))
  1026  	polyMulAVX2
  1027  	polyAdd(2*8(inp))
  1028  	polyMulAVX2
  1029  
  1030  	// Load for decryption
  1031  	VPXOR   (inp), AA0, AA0
  1032  	VMOVDQU AA0, (oup)
  1033  	LEAQ    (1*32)(inp), inp
  1034  	LEAQ    (1*32)(oup), oup
  1035  
  1036  	// Shift stream left
  1037  	VMOVDQA BB0, AA0
  1038  	VMOVDQA CC0, BB0
  1039  	VMOVDQA DD0, CC0
  1040  	VMOVDQA AA1, DD0
  1041  	VMOVDQA BB1, AA1
  1042  	VMOVDQA CC1, BB1
  1043  	VMOVDQA DD1, CC1
  1044  	VMOVDQA AA2, DD1
  1045  	VMOVDQA BB2, AA2
  1046  	JMP     openAVX2ShortOpenLoop
  1047  
  1048  openAVX2ShortTail32:
  1049  	CMPQ    inl, $16
  1050  	VMOVDQA A0, A1
  1051  	JB      openAVX2ShortDone
  1052  
  1053  	SUBQ $16, inl
  1054  
  1055  	// Load for hashing
  1056  	polyAdd(0*8(inp))
  1057  	polyMulAVX2
  1058  
  1059  	// Load for decryption
  1060  	VPXOR      (inp), A0, T0
  1061  	VMOVDQU    T0, (oup)
  1062  	LEAQ       (1*16)(inp), inp
  1063  	LEAQ       (1*16)(oup), oup
  1064  	VPERM2I128 $0x11, AA0, AA0, AA0
  1065  	VMOVDQA    A0, A1
  1066  
  1067  openAVX2ShortDone:
  1068  	VZEROUPPER
  1069  	JMP openSSETail16
  1070  
  1071  // ----------------------------------------------------------------------------
  1072  // Special optimization for buffers smaller than 321 bytes
  1073  openAVX2320:
  1074  	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  1075  	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  1076  	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  1077  	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  1078  	MOVQ    $10, itr2
  1079  
  1080  openAVX2320InnerCipherLoop:
  1081  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1082  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1083  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1084  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1085  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1086  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1087  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1088  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1089  	DECQ     itr2
  1090  	JNE      openAVX2320InnerCipherLoop
  1091  
  1092  	VMOVDQA ·chacha20Constants<>(SB), TT0
  1093  	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  1094  	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  1095  	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  1096  	VMOVDQA ·avx2IncMask<>(SB), TT0
  1097  	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  1098  	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  1099  	VPADDD  TT3, DD2, DD2
  1100  
  1101  	// Clamp and store poly key
  1102  	VPERM2I128 $0x02, AA0, BB0, TT0
  1103  	VPAND      ·polyClampMask<>(SB), TT0, TT0
  1104  	VMOVDQA    TT0, rsStoreAVX2
  1105  
  1106  	// Stream for up to 320 bytes
  1107  	VPERM2I128 $0x13, AA0, BB0, AA0
  1108  	VPERM2I128 $0x13, CC0, DD0, BB0
  1109  	VPERM2I128 $0x02, AA1, BB1, CC0
  1110  	VPERM2I128 $0x02, CC1, DD1, DD0
  1111  	VPERM2I128 $0x13, AA1, BB1, AA1
  1112  	VPERM2I128 $0x13, CC1, DD1, BB1
  1113  	VPERM2I128 $0x02, AA2, BB2, CC1
  1114  	VPERM2I128 $0x02, CC2, DD2, DD1
  1115  	VPERM2I128 $0x13, AA2, BB2, AA2
  1116  	VPERM2I128 $0x13, CC2, DD2, BB2
  1117  	JMP        openAVX2ShortOpen
  1118  
  1119  // ----------------------------------------------------------------------------
  1120  // Special optimization for the last 128 bytes of ciphertext
  1121  openAVX2Tail128:
  1122  	// Need to decrypt up to 128 bytes - prepare two blocks
  1123  	VMOVDQA ·chacha20Constants<>(SB), AA1
  1124  	VMOVDQA state1StoreAVX2, BB1
  1125  	VMOVDQA state2StoreAVX2, CC1
  1126  	VMOVDQA ctr3StoreAVX2, DD1
  1127  	VPADDD  ·avx2IncMask<>(SB), DD1, DD1
  1128  	VMOVDQA DD1, DD0
  1129  
  1130  	XORQ  itr2, itr2
  1131  	MOVQ  inl, itr1
  1132  	ANDQ  $-16, itr1
  1133  	TESTQ itr1, itr1
  1134  	JE    openAVX2Tail128LoopB
  1135  
  1136  openAVX2Tail128LoopA:
  1137  	// Perform ChaCha rounds, while hashing the remaining input
  1138  	polyAdd(0(inp)(itr2*1))
  1139  	polyMulAVX2
  1140  
  1141  openAVX2Tail128LoopB:
  1142  	ADDQ     $16, itr2
  1143  	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1144  	VPALIGNR $4, BB1, BB1, BB1
  1145  	VPALIGNR $8, CC1, CC1, CC1
  1146  	VPALIGNR $12, DD1, DD1, DD1
  1147  	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1148  	VPALIGNR $12, BB1, BB1, BB1
  1149  	VPALIGNR $8, CC1, CC1, CC1
  1150  	VPALIGNR $4, DD1, DD1, DD1
  1151  	CMPQ     itr2, itr1
  1152  	JB       openAVX2Tail128LoopA
  1153  	CMPQ     itr2, $160
  1154  	JNE      openAVX2Tail128LoopB
  1155  
  1156  	VPADDD     ·chacha20Constants<>(SB), AA1, AA1
  1157  	VPADDD     state1StoreAVX2, BB1, BB1
  1158  	VPADDD     state2StoreAVX2, CC1, CC1
  1159  	VPADDD     DD0, DD1, DD1
  1160  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1161  
  1162  openAVX2TailLoop:
  1163  	CMPQ inl, $32
  1164  	JB   openAVX2Tail
  1165  	SUBQ $32, inl
  1166  
  1167  	// Load for decryption
  1168  	VPXOR   (inp), AA0, AA0
  1169  	VMOVDQU AA0, (oup)
  1170  	LEAQ    (1*32)(inp), inp
  1171  	LEAQ    (1*32)(oup), oup
  1172  	VMOVDQA BB0, AA0
  1173  	VMOVDQA CC0, BB0
  1174  	VMOVDQA DD0, CC0
  1175  	JMP     openAVX2TailLoop
  1176  
  1177  openAVX2Tail:
  1178  	CMPQ    inl, $16
  1179  	VMOVDQA A0, A1
  1180  	JB      openAVX2TailDone
  1181  	SUBQ    $16, inl
  1182  
  1183  	// Load for decryption
  1184  	VPXOR      (inp), A0, T0
  1185  	VMOVDQU    T0, (oup)
  1186  	LEAQ       (1*16)(inp), inp
  1187  	LEAQ       (1*16)(oup), oup
  1188  	VPERM2I128 $0x11, AA0, AA0, AA0
  1189  	VMOVDQA    A0, A1
  1190  
  1191  openAVX2TailDone:
  1192  	VZEROUPPER
  1193  	JMP openSSETail16
  1194  
  1195  // ----------------------------------------------------------------------------
  1196  // Special optimization for the last 256 bytes of ciphertext
  1197  openAVX2Tail256:
  1198  	// Need to decrypt up to 256 bytes - prepare four blocks
  1199  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
  1200  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
  1201  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
  1202  	VMOVDQA ctr3StoreAVX2, DD0
  1203  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  1204  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  1205  	VMOVDQA DD0, TT1
  1206  	VMOVDQA DD1, TT2
  1207  
  1208  	// Compute the number of iterations that will hash data
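        	// itr1 = min((inl-128)/16, 10): that many of the ten double rounds below also
        	// absorb one 16-byte block; whatever ciphertext is left over is hashed afterwards
        	// in openAVX2Tail256Hash.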
  1209  	MOVQ    inl, tmpStoreAVX2
  1210  	MOVQ    inl, itr1
  1211  	SUBQ    $128, itr1
  1212  	SHRQ    $4, itr1
  1213  	MOVQ    $10, itr2
  1214  	CMPQ    itr1, $10
  1215  	CMOVQGT itr2, itr1
  1216  	MOVQ    inp, inl
  1217  	XORQ    itr2, itr2
  1218  
  1219  openAVX2Tail256LoopA:
  1220  	polyAdd(0(inl))
  1221  	polyMulAVX2
  1222  	LEAQ 16(inl), inl
  1223  
  1224  	// Perform ChaCha rounds, while hashing the remaining input
  1225  openAVX2Tail256LoopB:
  1226  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1227  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  1228  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1229  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  1230  	INCQ     itr2
  1231  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1232  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  1233  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1234  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  1235  	CMPQ     itr2, itr1
  1236  	JB       openAVX2Tail256LoopA
  1237  
  1238  	CMPQ itr2, $10
  1239  	JNE  openAVX2Tail256LoopB
  1240  
  1241  	MOVQ inl, itr2
  1242  	SUBQ inp, inl
  1243  	MOVQ inl, itr1
  1244  	MOVQ tmpStoreAVX2, inl
  1245  
  1246  	// Hash the remainder of data (if any)
  1247  openAVX2Tail256Hash:
  1248  	ADDQ $16, itr1
  1249  	CMPQ itr1, inl
  1250  	JGT  openAVX2Tail256HashEnd
  1251  	polyAdd(0(itr2))
  1252  	polyMulAVX2
  1253  	LEAQ 16(itr2), itr2
  1254  	JMP  openAVX2Tail256Hash
  1255  
  1256  // Store 128 bytes safely, then go to store loop
  1257  openAVX2Tail256HashEnd:
  1258  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  1259  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  1260  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  1261  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  1262  	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
  1263  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1264  
  1265  	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
  1266  	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
  1267  	LEAQ    (4*32)(inp), inp
  1268  	LEAQ    (4*32)(oup), oup
  1269  	SUBQ    $4*32, inl
  1270  
  1271  	JMP openAVX2TailLoop
  1272  
  1273  // ----------------------------------------------------------------------------
  1274  // Special optimization for the last 384 bytes of ciphertext
  1275  openAVX2Tail384:
  1276  	// Need to decrypt up to 384 bytes - prepare six blocks
  1277  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  1278  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  1279  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  1280  	VMOVDQA ctr3StoreAVX2, DD0
  1281  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  1282  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  1283  	VPADDD  ·avx2IncMask<>(SB), DD1, DD2
  1284  	VMOVDQA DD0, ctr0StoreAVX2
  1285  	VMOVDQA DD1, ctr1StoreAVX2
  1286  	VMOVDQA DD2, ctr2StoreAVX2
  1287  
  1288  	// Compute the number of iterations that will hash two blocks of data
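        	// Here itr1 = min((inl-256)/16 + 6, 10): the number of double rounds that each also hash two 16-byte blocks
        	// of the remaining ciphertext (one at the top of LoopB, one in the middle of LoopA).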
  1289  	MOVQ    inl, tmpStoreAVX2
  1290  	MOVQ    inl, itr1
  1291  	SUBQ    $256, itr1
  1292  	SHRQ    $4, itr1
  1293  	ADDQ    $6, itr1
  1294  	MOVQ    $10, itr2
  1295  	CMPQ    itr1, $10
  1296  	CMOVQGT itr2, itr1
  1297  	MOVQ    inp, inl
  1298  	XORQ    itr2, itr2
  1299  
  1300  	// Perform ChaCha rounds, while hashing the remaining input
  1301  openAVX2Tail384LoopB:
  1302  	polyAdd(0(inl))
  1303  	polyMulAVX2
  1304  	LEAQ 16(inl), inl
  1305  
  1306  openAVX2Tail384LoopA:
  1307  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1308  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1309  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1310  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1311  	polyAdd(0(inl))
  1312  	polyMulAVX2
  1313  	LEAQ     16(inl), inl
  1314  	INCQ     itr2
  1315  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1316  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1317  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1318  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1319  
  1320  	CMPQ itr2, itr1
  1321  	JB   openAVX2Tail384LoopB
  1322  
  1323  	CMPQ itr2, $10
  1324  	JNE  openAVX2Tail384LoopA
  1325  
  1326  	MOVQ inl, itr2
  1327  	SUBQ inp, inl
  1328  	MOVQ inl, itr1
  1329  	MOVQ tmpStoreAVX2, inl
  1330  
  1331  openAVX2Tail384Hash:
  1332  	ADDQ $16, itr1
  1333  	CMPQ itr1, inl
  1334  	JGT  openAVX2Tail384HashEnd
  1335  	polyAdd(0(itr2))
  1336  	polyMulAVX2
  1337  	LEAQ 16(itr2), itr2
  1338  	JMP  openAVX2Tail384Hash
  1339  
  1340  // Store 256 bytes safely, then go to store loop
  1341  openAVX2Tail384HashEnd:
  1342  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  1343  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  1344  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  1345  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
  1346  	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
  1347  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  1348  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  1349  	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
  1350  	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  1351  	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  1352  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1353  	LEAQ       (8*32)(inp), inp
  1354  	LEAQ       (8*32)(oup), oup
  1355  	SUBQ       $8*32, inl
  1356  	JMP        openAVX2TailLoop
  1357  
  1358  // ----------------------------------------------------------------------------
  1359  // Special optimization for the last 512 bytes of ciphertext
  1360  openAVX2Tail512:
  1361  	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1362  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  1363  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  1364  	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  1365  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  1366  	XORQ    itr1, itr1
  1367  	MOVQ    inp, itr2
  1368  
  1369  openAVX2Tail512LoopB:
  1370  	polyAdd(0(itr2))
  1371  	polyMulAVX2
  1372  	LEAQ (2*8)(itr2), itr2
  1373  
  1374  openAVX2Tail512LoopA:
  1375  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1376  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1377  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1378  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1379  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1380  	VMOVDQA  CC3, tmpStoreAVX2
  1381  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1382  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1383  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1384  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1385  	VMOVDQA  tmpStoreAVX2, CC3
  1386  	polyAdd(0*8(itr2))
  1387  	polyMulAVX2
  1388  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1389  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1390  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1391  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1392  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1393  	VMOVDQA  CC3, tmpStoreAVX2
  1394  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1395  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1396  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1397  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1398  	VMOVDQA  tmpStoreAVX2, CC3
  1399  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  1400  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1401  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  1402  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1403  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1404  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1405  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1406  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1407  	polyAdd(2*8(itr2))
  1408  	polyMulAVX2
  1409  	LEAQ     (4*8)(itr2), itr2
  1410  	VMOVDQA  CC3, tmpStoreAVX2
  1411  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1412  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1413  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1414  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1415  	VMOVDQA  tmpStoreAVX2, CC3
  1416  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1417  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1418  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1419  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1420  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1421  	VMOVDQA  CC3, tmpStoreAVX2
  1422  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1423  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1424  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1425  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1426  	VMOVDQA  tmpStoreAVX2, CC3
  1427  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  1428  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1429  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  1430  	INCQ     itr1
  1431  	CMPQ     itr1, $4
  1432  	JLT      openAVX2Tail512LoopB
  1433  
  1434  	CMPQ itr1, $10
  1435  	JNE  openAVX2Tail512LoopA
  1436  
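        	// The 10 double rounds above hashed 384 bytes of ciphertext; itr1 is set to the bytes still to hash,
        	// rounded down to a multiple of 16.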
  1437  	MOVQ inl, itr1
  1438  	SUBQ $384, itr1
  1439  	ANDQ $-16, itr1
  1440  
  1441  openAVX2Tail512HashLoop:
  1442  	TESTQ itr1, itr1
  1443  	JE    openAVX2Tail512HashEnd
  1444  	polyAdd(0(itr2))
  1445  	polyMulAVX2
  1446  	LEAQ  16(itr2), itr2
  1447  	SUBQ  $16, itr1
  1448  	JMP   openAVX2Tail512HashLoop
  1449  
  1450  openAVX2Tail512HashEnd:
  1451  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  1452  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  1453  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  1454  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  1455  	VMOVDQA    CC3, tmpStoreAVX2
  1456  	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  1457  	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  1458  	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  1459  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1460  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  1461  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  1462  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1463  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  1464  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  1465  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  1466  
  1467  	LEAQ (12*32)(inp), inp
  1468  	LEAQ (12*32)(oup), oup
  1469  	SUBQ $12*32, inl
  1470  
  1471  	JMP openAVX2TailLoop
  1472  
  1473  // ----------------------------------------------------------------------------
  1474  // ----------------------------------------------------------------------------
  1475  // func chacha20Poly1305Seal(dst, key, src, ad []byte)
  1476  TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
  1477  	// For aligned stack access
  1478  	MOVQ SP, BP
  1479  	ADDQ $32, BP
  1480  	ANDQ $-32, BP
  1481  	MOVQ dst+0(FP), oup
  1482  	MOVQ key+24(FP), keyp
  1483  	MOVQ src+48(FP), inp
  1484  	MOVQ src_len+56(FP), inl
  1485  	MOVQ ad+72(FP), adp
  1486  
  1487  	CMPB ·useAVX2(SB), $1
  1488  	JE   chacha20Poly1305Seal_AVX2
  1489  
  1490  	// Special optimization for very short buffers
  1491  	CMPQ inl, $128
  1492  	JBE  sealSSE128 // About 15% faster
  1493  
  1494  	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
  1495  	MOVOU ·chacha20Constants<>(SB), A0
  1496  	MOVOU (1*16)(keyp), B0
  1497  	MOVOU (2*16)(keyp), C0
  1498  	MOVOU (3*16)(keyp), D0
  1499  
  1500  	// Store state on stack for future use
  1501  	MOVO B0, state1Store
  1502  	MOVO C0, state2Store
  1503  
  1504  	// Load state, increment counter blocks
  1505  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1506  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1507  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1508  
  1509  	// Store counters
  1510  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1511  	MOVQ $10, itr2
  1512  
  1513  sealSSEIntroLoop:
  1514  	MOVO         C3, tmpStore
  1515  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1516  	MOVO         tmpStore, C3
  1517  	MOVO         C1, tmpStore
  1518  	chachaQR(A3, B3, C3, D3, C1)
  1519  	MOVO         tmpStore, C1
  1520  	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  1521  	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  1522  	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  1523  
  1524  	MOVO          C3, tmpStore
  1525  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1526  	MOVO          tmpStore, C3
  1527  	MOVO          C1, tmpStore
  1528  	chachaQR(A3, B3, C3, D3, C1)
  1529  	MOVO          tmpStore, C1
  1530  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1531  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1532  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1533  	DECQ          itr2
  1534  	JNE           sealSSEIntroLoop
  1535  
  1536  	// Add in the state
  1537  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1538  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1539  	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1540  	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1541  
  1542  	// Clamp and store the key
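        	// polyClampMask clears the bits of "r" that Poly1305 requires to be zero:
        	// r &= 0x0ffffffc0ffffffc0ffffffc0fffffff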
  1543  	PAND ·polyClampMask<>(SB), A0
  1544  	MOVO A0, rStore
  1545  	MOVO B0, sStore
  1546  
  1547  	// Hash AAD
  1548  	MOVQ ad_len+80(FP), itr2
  1549  	CALL polyHashADInternal<>(SB)
  1550  
  1551  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1552  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1553  	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
  1554  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1555  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1556  	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
  1557  
  1558  	MOVQ $128, itr1
  1559  	SUBQ $128, inl
  1560  	LEAQ 128(inp), inp
  1561  
  1562  	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
  1563  
  1564  	CMPQ inl, $64
  1565  	JBE  sealSSE128SealHash
  1566  
  1567  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1568  	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1569  	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
  1570  
  1571  	ADDQ $64, itr1
  1572  	SUBQ $64, inl
  1573  	LEAQ 64(inp), inp
  1574  
  1575  	MOVQ $2, itr1
  1576  	MOVQ $8, itr2
  1577  
  1578  	CMPQ inl, $64
  1579  	JBE  sealSSETail64
  1580  	CMPQ inl, $128
  1581  	JBE  sealSSETail128
  1582  	CMPQ inl, $192
  1583  	JBE  sealSSETail192
  1584  
  1585  sealSSEMainLoop:
  1586  	// Load state, increment counter blocks
  1587  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
  1588  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1589  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1590  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1591  
  1592  	// Store counters
  1593  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1594  
  1595  sealSSEInnerLoop:
  1596  	MOVO          C3, tmpStore
  1597  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1598  	MOVO          tmpStore, C3
  1599  	MOVO          C1, tmpStore
  1600  	chachaQR(A3, B3, C3, D3, C1)
  1601  	MOVO          tmpStore, C1
  1602  	polyAdd(0(oup))
  1603  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
  1604  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
  1605  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
  1606  	polyMulStage1
  1607  	polyMulStage2
  1608  	LEAQ          (2*8)(oup), oup
  1609  	MOVO          C3, tmpStore
  1610  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1611  	MOVO          tmpStore, C3
  1612  	MOVO          C1, tmpStore
  1613  	polyMulStage3
  1614  	chachaQR(A3, B3, C3, D3, C1)
  1615  	MOVO          tmpStore, C1
  1616  	polyMulReduceStage
  1617  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1618  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1619  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1620  	DECQ          itr2
  1621  	JGE           sealSSEInnerLoop
  1622  	polyAdd(0(oup))
  1623  	polyMul
  1624  	LEAQ          (2*8)(oup), oup
  1625  	DECQ          itr1
  1626  	JG            sealSSEInnerLoop
  1627  
  1628  	// Add in the state
  1629  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1630  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1631  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1632  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1633  	MOVO  D3, tmpStore
  1634  
  1635  	// Load - xor - store
  1636  	MOVOU (0*16)(inp), D3; PXOR D3, A0
  1637  	MOVOU (1*16)(inp), D3; PXOR D3, B0
  1638  	MOVOU (2*16)(inp), D3; PXOR D3, C0
  1639  	MOVOU (3*16)(inp), D3; PXOR D3, D0
  1640  	MOVOU A0, (0*16)(oup)
  1641  	MOVOU B0, (1*16)(oup)
  1642  	MOVOU C0, (2*16)(oup)
  1643  	MOVOU D0, (3*16)(oup)
  1644  	MOVO  tmpStore, D3
  1645  
  1646  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1647  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1648  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1649  	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
  1650  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1651  	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
  1652  	ADDQ  $192, inp
  1653  	MOVQ  $192, itr1
  1654  	SUBQ  $192, inl
  1655  	MOVO  A3, A1
  1656  	MOVO  B3, B1
  1657  	MOVO  C3, C1
  1658  	MOVO  D3, D1
  1659  	CMPQ  inl, $64
  1660  	JBE   sealSSE128SealHash
  1661  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1662  	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1663  	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
  1664  	LEAQ  64(inp), inp
  1665  	SUBQ  $64, inl
  1666  	MOVQ  $6, itr1
  1667  	MOVQ  $4, itr2
  1668  	CMPQ  inl, $192
  1669  	JG    sealSSEMainLoop
  1670  
  1671  	MOVQ  inl, itr1
  1672  	TESTQ inl, inl
  1673  	JE    sealSSE128SealHash
  1674  	MOVQ  $6, itr1
  1675  	CMPQ  inl, $64
  1676  	JBE   sealSSETail64
  1677  	CMPQ  inl, $128
  1678  	JBE   sealSSETail128
  1679  	JMP   sealSSETail192
  1680  
  1681  // ----------------------------------------------------------------------------
  1682  // Special optimization for the last 64 bytes of plaintext
  1683  sealSSETail64:
  1684  	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
  1685  	MOVO  ·chacha20Constants<>(SB), A1
  1686  	MOVO  state1Store, B1
  1687  	MOVO  state2Store, C1
  1688  	MOVO  ctr3Store, D1
  1689  	PADDL ·sseIncMask<>(SB), D1
  1690  	MOVO  D1, ctr0Store
  1691  
  1692  sealSSETail64LoopA:
  1693  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1694  	polyAdd(0(oup))
  1695  	polyMul
  1696  	LEAQ 16(oup), oup
  1697  
  1698  sealSSETail64LoopB:
  1699  	chachaQR(A1, B1, C1, D1, T1)
  1700  	shiftB1Left;  shiftC1Left; shiftD1Left
  1701  	chachaQR(A1, B1, C1, D1, T1)
  1702  	shiftB1Right; shiftC1Right; shiftD1Right
  1703  	polyAdd(0(oup))
  1704  	polyMul
  1705  	LEAQ          16(oup), oup
  1706  
  1707  	DECQ itr1
  1708  	JG   sealSSETail64LoopA
  1709  
  1710  	DECQ  itr2
  1711  	JGE   sealSSETail64LoopB
  1712  	PADDL ·chacha20Constants<>(SB), A1
  1713  	PADDL state1Store, B1
  1714  	PADDL state2Store, C1
  1715  	PADDL ctr0Store, D1
  1716  
  1717  	JMP sealSSE128Seal
  1718  
  1719  // ----------------------------------------------------------------------------
  1720  // Special optimization for the last 128 bytes of plaintext
  1721  sealSSETail128:
  1722  	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
  1723  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1724  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1725  
  1726  sealSSETail128LoopA:
  1727  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1728  	polyAdd(0(oup))
  1729  	polyMul
  1730  	LEAQ 16(oup), oup
  1731  
  1732  sealSSETail128LoopB:
  1733  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1734  	shiftB0Left;  shiftC0Left; shiftD0Left
  1735  	shiftB1Left;  shiftC1Left; shiftD1Left
  1736  	polyAdd(0(oup))
  1737  	polyMul
  1738  	LEAQ          16(oup), oup
  1739  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1740  	shiftB0Right; shiftC0Right; shiftD0Right
  1741  	shiftB1Right; shiftC1Right; shiftD1Right
  1742  
  1743  	DECQ itr1
  1744  	JG   sealSSETail128LoopA
  1745  
  1746  	DECQ itr2
  1747  	JGE  sealSSETail128LoopB
  1748  
  1749  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
  1750  	PADDL state1Store, B0; PADDL state1Store, B1
  1751  	PADDL state2Store, C0; PADDL state2Store, C1
  1752  	PADDL ctr0Store, D0; PADDL ctr1Store, D1
  1753  
  1754  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1755  	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1756  	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1757  
  1758  	MOVQ $64, itr1
  1759  	LEAQ 64(inp), inp
  1760  	SUBQ $64, inl
  1761  
  1762  	JMP sealSSE128SealHash
  1763  
  1764  // ----------------------------------------------------------------------------
  1765  // Special optimization for the last 192 bytes of plaintext
  1766  sealSSETail192:
  1767  	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
  1768  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1769  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1770  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
  1771  
  1772  sealSSETail192LoopA:
  1773  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1774  	polyAdd(0(oup))
  1775  	polyMul
  1776  	LEAQ 16(oup), oup
  1777  
  1778  sealSSETail192LoopB:
  1779  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1780  	shiftB0Left; shiftC0Left; shiftD0Left
  1781  	shiftB1Left; shiftC1Left; shiftD1Left
  1782  	shiftB2Left; shiftC2Left; shiftD2Left
  1783  
  1784  	polyAdd(0(oup))
  1785  	polyMul
  1786  	LEAQ 16(oup), oup
  1787  
  1788  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1789  	shiftB0Right; shiftC0Right; shiftD0Right
  1790  	shiftB1Right; shiftC1Right; shiftD1Right
  1791  	shiftB2Right; shiftC2Right; shiftD2Right
  1792  
  1793  	DECQ itr1
  1794  	JG   sealSSETail192LoopA
  1795  
  1796  	DECQ itr2
  1797  	JGE  sealSSETail192LoopB
  1798  
  1799  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1800  	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
  1801  	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
  1802  	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
  1803  
  1804  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1805  	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1806  	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1807  	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
  1808  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
  1809  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1810  
  1811  	MOVO A2, A1
  1812  	MOVO B2, B1
  1813  	MOVO C2, C1
  1814  	MOVO D2, D1
  1815  	MOVQ $128, itr1
  1816  	LEAQ 128(inp), inp
  1817  	SUBQ $128, inl
  1818  
  1819  	JMP sealSSE128SealHash
  1820  
  1821  // ----------------------------------------------------------------------------
  1822  // Special seal optimization for buffers smaller than 129 bytes
  1823  sealSSE128:
  1824  	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
  1825  	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
  1826  	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1827  	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1828  	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
  1829  	MOVQ  $10, itr2
  1830  
  1831  sealSSE128InnerCipherLoop:
  1832  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1833  	shiftB0Left;  shiftB1Left; shiftB2Left
  1834  	shiftC0Left;  shiftC1Left; shiftC2Left
  1835  	shiftD0Left;  shiftD1Left; shiftD2Left
  1836  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1837  	shiftB0Right; shiftB1Right; shiftB2Right
  1838  	shiftC0Right; shiftC1Right; shiftC2Right
  1839  	shiftD0Right; shiftD1Right; shiftD2Right
  1840  	DECQ          itr2
  1841  	JNE           sealSSE128InnerCipherLoop
  1842  
  1843  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  1844  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1845  	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
  1846  	PADDL T2, C1; PADDL T2, C2
  1847  	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
  1848  	PAND  ·polyClampMask<>(SB), A0
  1849  	MOVOU A0, rStore
  1850  	MOVOU B0, sStore
  1851  
  1852  	// Hash
  1853  	MOVQ ad_len+80(FP), itr2
  1854  	CALL polyHashADInternal<>(SB)
  1855  	XORQ itr1, itr1
  1856  
  1857  sealSSE128SealHash:
  1858  	// itr1 holds the number of bytes encrypted but not yet hashed
  1859  	CMPQ itr1, $16
  1860  	JB   sealSSE128Seal
  1861  	polyAdd(0(oup))
  1862  	polyMul
  1863  
  1864  	SUBQ $16, itr1
  1865  	ADDQ $16, oup
  1866  
  1867  	JMP sealSSE128SealHash
  1868  
  1869  sealSSE128Seal:
  1870  	CMPQ inl, $16
  1871  	JB   sealSSETail
  1872  	SUBQ $16, inl
  1873  
  1874  	// Load for encryption
  1875  	MOVOU (inp), T0
  1876  	PXOR  T0, A1
  1877  	MOVOU A1, (oup)
  1878  	LEAQ  (1*16)(inp), inp
  1879  	LEAQ  (1*16)(oup), oup
  1880  
  1881  	// Extract for hashing
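        	// Add the ciphertext block just produced to the Poly1305 accumulator; the trailing ADCQ $1 supplies
        	// the 2^128 bit that marks a full 16-byte block.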
  1882  	MOVQ   A1, t0
  1883  	PSRLDQ $8, A1
  1884  	MOVQ   A1, t1
  1885  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1886  	polyMul
  1887  
  1888  	// Shift the stream "left"
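        	// Rotate the remaining keystream blocks down so that A1 always holds the next 16 bytes of stream
        	// for the following iteration.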
  1889  	MOVO B1, A1
  1890  	MOVO C1, B1
  1891  	MOVO D1, C1
  1892  	MOVO A2, D1
  1893  	MOVO B2, A2
  1894  	MOVO C2, B2
  1895  	MOVO D2, C2
  1896  	JMP  sealSSE128Seal
  1897  
  1898  sealSSETail:
  1899  	TESTQ inl, inl
  1900  	JE    sealSSEFinalize
  1901  
  1902  	// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
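        	// The bytes are read from the last one backwards and shifted into t3:t2, so the final plaintext block
        	// ends up in little-endian order in tmpStore.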
  1903  	MOVQ inl, itr2
  1904  	SHLQ $4, itr2
  1905  	LEAQ ·andMask<>(SB), t0
  1906  	MOVQ inl, itr1
  1907  	LEAQ -1(inp)(inl*1), inp
  1908  	XORQ t2, t2
  1909  	XORQ t3, t3
  1910  	XORQ AX, AX
  1911  
  1912  sealSSETailLoadLoop:
  1913  	SHLQ $8, t2, t3
  1914  	SHLQ $8, t2
  1915  	MOVB (inp), AX
  1916  	XORQ AX, t2
  1917  	LEAQ   -1(inp), inp
  1918  	DECQ   itr1
  1919  	JNE    sealSSETailLoadLoop
  1920  	MOVQ t2, 0+tmpStore
  1921  	MOVQ t3, 8+tmpStore
  1922  	PXOR 0+tmpStore, A1
  1923  	MOVOU  A1, (oup)
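        	// Mask the ciphertext block down to the inl valid bytes, so the Poly1305 input is the ciphertext
        	// zero-padded to 16 bytes, as the AEAD construction requires.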
  1924  	MOVOU  -16(t0)(itr2*1), T0
  1925  	PAND   T0, A1
  1926  	MOVQ   A1, t0
  1927  	PSRLDQ $8, A1
  1928  	MOVQ   A1, t1
  1929  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1930  	polyMul
  1931  
  1932  	ADDQ inl, oup
  1933  
  1934  sealSSEFinalize:
  1935  	// Hash in the buffer lengths
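        	// The last Poly1305 block is the 64-bit additional-data length followed by the 64-bit ciphertext length,
        	// again with the 2^128 bit set by the final ADCQ $1.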
  1936  	ADDQ ad_len+80(FP), acc0
  1937  	ADCQ src_len+56(FP), acc1
  1938  	ADCQ $1, acc2
  1939  	polyMul
  1940  
  1941  	// Final reduce
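        	// Compute acc - (2^130 - 5); the constant's 64-bit limbs are -5, -1 and 3 from low to high.
        	// If the subtraction borrows, acc was already fully reduced, and the CMOVs restore the saved value.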
  1942  	MOVQ    acc0, t0
  1943  	MOVQ    acc1, t1
  1944  	MOVQ    acc2, t2
  1945  	SUBQ    $-5, acc0
  1946  	SBBQ    $-1, acc1
  1947  	SBBQ    $3, acc2
  1948  	CMOVQCS t0, acc0
  1949  	CMOVQCS t1, acc1
  1950  	CMOVQCS t2, acc2
  1951  
  1952  	// Add in the "s" part of the key
  1953  	ADDQ 0+sStore, acc0
  1954  	ADCQ 8+sStore, acc1
  1955  
  1956  	// Finally store the tag at the end of the message
  1957  	MOVQ acc0, (0*8)(oup)
  1958  	MOVQ acc1, (1*8)(oup)
  1959  	RET
  1960  
  1961  // ----------------------------------------------------------------------------
  1962  // ------------------------- AVX2 Code ----------------------------------------
  1963  chacha20Poly1305Seal_AVX2:
  1964  	VZEROUPPER
  1965  	VMOVDQU ·chacha20Constants<>(SB), AA0
  1966  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
  1967  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
  1968  	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
  1969  	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
  1970  
  1971  	// Special optimizations for very short buffers
  1972  	CMPQ inl, $192
  1973  	JBE  seal192AVX2 // 33% faster
  1974  	CMPQ inl, $320
  1975  	JBE  seal320AVX2 // 17% faster
  1976  
  1977  	// For the general case, prepare the poly key first - as a byproduct we have 64 bytes of cipher stream
  1978  	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1979  	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
  1980  	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
  1981  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
  1982  	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
  1983  	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
  1984  	VMOVDQA DD3, ctr3StoreAVX2
  1985  	MOVQ    $10, itr2
  1986  
  1987  sealAVX2IntroLoop:
  1988  	VMOVDQA CC3, tmpStoreAVX2
  1989  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  1990  	VMOVDQA tmpStoreAVX2, CC3
  1991  	VMOVDQA CC1, tmpStoreAVX2
  1992  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  1993  	VMOVDQA tmpStoreAVX2, CC1
  1994  
  1995  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  1996  	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  1997  	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  1998  	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  1999  
  2000  	VMOVDQA CC3, tmpStoreAVX2
  2001  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2002  	VMOVDQA tmpStoreAVX2, CC3
  2003  	VMOVDQA CC1, tmpStoreAVX2
  2004  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2005  	VMOVDQA tmpStoreAVX2, CC1
  2006  
  2007  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  2008  	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  2009  	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  2010  	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  2011  	DECQ     itr2
  2012  	JNE      sealAVX2IntroLoop
  2013  
  2014  	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2015  	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2016  	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2017  	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2018  
  2019  	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
  2020  	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
  2021  	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
  2022  
  2023  	// Clamp and store poly key
  2024  	VPAND   ·polyClampMask<>(SB), DD0, DD0
  2025  	VMOVDQA DD0, rsStoreAVX2
  2026  
  2027  	// Hash AD
  2028  	MOVQ ad_len+80(FP), itr2
  2029  	CALL polyHashADInternal<>(SB)
  2030  
  2031  	// Can store at least 320 bytes
  2032  	VPXOR   (0*32)(inp), AA0, AA0
  2033  	VPXOR   (1*32)(inp), CC0, CC0
  2034  	VMOVDQU AA0, (0*32)(oup)
  2035  	VMOVDQU CC0, (1*32)(oup)
  2036  
  2037  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  2038  	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
  2039  	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
  2040  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  2041  	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
  2042  	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
  2043  
  2044  	MOVQ $320, itr1
  2045  	SUBQ $320, inl
  2046  	LEAQ 320(inp), inp
  2047  
  2048  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
  2049  	CMPQ       inl, $128
  2050  	JBE        sealAVX2SealHash
  2051  
  2052  	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
  2053  	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
  2054  	SUBQ    $128, inl
  2055  	LEAQ    128(inp), inp
  2056  
  2057  	MOVQ $8, itr1
  2058  	MOVQ $2, itr2
  2059  
  2060  	CMPQ inl, $128
  2061  	JBE  sealAVX2Tail128
  2062  	CMPQ inl, $256
  2063  	JBE  sealAVX2Tail256
  2064  	CMPQ inl, $384
  2065  	JBE  sealAVX2Tail384
  2066  	CMPQ inl, $512
  2067  	JBE  sealAVX2Tail512
  2068  
  2069  	// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
  2070  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2071  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2072  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2073  	VMOVDQA ctr3StoreAVX2, DD0
  2074  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2075  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2076  
  2077  	VMOVDQA CC3, tmpStoreAVX2
  2078  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2079  	VMOVDQA tmpStoreAVX2, CC3
  2080  	VMOVDQA CC1, tmpStoreAVX2
  2081  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2082  	VMOVDQA tmpStoreAVX2, CC1
  2083  
  2084  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  2085  	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  2086  	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  2087  	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  2088  
  2089  	VMOVDQA CC3, tmpStoreAVX2
  2090  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2091  	VMOVDQA tmpStoreAVX2, CC3
  2092  	VMOVDQA CC1, tmpStoreAVX2
  2093  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2094  	VMOVDQA tmpStoreAVX2, CC1
  2095  
  2096  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  2097  	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  2098  	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  2099  	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  2100  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2101  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2102  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2103  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2104  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2105  	VMOVDQA  CC3, tmpStoreAVX2
  2106  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2107  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2108  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2109  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2110  	VMOVDQA  tmpStoreAVX2, CC3
  2111  
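        	// Enter the main loop at its midpoint with 9 iterations left (the first half of a double round was
        	// already performed above); oup is backed up 16 bytes so the in-loop polyAdd offsets land on the start
        	// of the not-yet-hashed ciphertext.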
  2112  	SUBQ $16, oup                  // Adjust the pointer
  2113  	MOVQ $9, itr1
  2114  	JMP  sealAVX2InternalLoopStart
  2115  
  2116  sealAVX2MainLoop:
  2117  	// Load state, increment counter blocks, store the incremented counters
  2118  	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2119  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2120  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2121  	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2122  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2123  	MOVQ    $10, itr1
  2124  
  2125  sealAVX2InternalLoop:
  2126  	polyAdd(0*8(oup))
  2127  	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2128  	polyMulStage1_AVX2
  2129  	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2130  	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2131  	polyMulStage2_AVX2
  2132  	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2133  	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2134  	polyMulStage3_AVX2
  2135  	VMOVDQA CC3, tmpStoreAVX2
  2136  	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2137  	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2138  	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2139  	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2140  	VMOVDQA tmpStoreAVX2, CC3
  2141  	polyMulReduceStage
  2142  
  2143  sealAVX2InternalLoopStart:
  2144  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2145  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2146  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2147  	polyAdd(2*8(oup))
  2148  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2149  	polyMulStage1_AVX2
  2150  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2151  	VMOVDQA  CC3, tmpStoreAVX2
  2152  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2153  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2154  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2155  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2156  	VMOVDQA  tmpStoreAVX2, CC3
  2157  	polyMulStage2_AVX2
  2158  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  2159  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2160  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  2161  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2162  	polyMulStage3_AVX2
  2163  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2164  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2165  	polyMulReduceStage
  2166  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2167  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2168  	polyAdd(4*8(oup))
  2169  	LEAQ     (6*8)(oup), oup
  2170  	VMOVDQA  CC3, tmpStoreAVX2
  2171  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2172  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2173  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2174  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2175  	VMOVDQA  tmpStoreAVX2, CC3
  2176  	polyMulStage1_AVX2
  2177  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2178  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2179  	polyMulStage2_AVX2
  2180  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2181  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2182  	polyMulStage3_AVX2
  2183  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2184  	VMOVDQA  CC3, tmpStoreAVX2
  2185  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2186  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2187  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2188  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2189  	VMOVDQA  tmpStoreAVX2, CC3
  2190  	polyMulReduceStage
  2191  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  2192  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2193  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  2194  	DECQ     itr1
  2195  	JNE      sealAVX2InternalLoop
  2196  
  2197  	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2198  	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2199  	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2200  	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2201  	VMOVDQA CC3, tmpStoreAVX2
  2202  
  2203  	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
  2204  	polyAdd(0*8(oup))
  2205  	polyMulAVX2
  2206  	LEAQ       (4*8)(oup), oup
  2207  	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  2208  	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  2209  	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  2210  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  2211  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  2212  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  2213  
  2214  	// and here
  2215  	polyAdd(-2*8(oup))
  2216  	polyMulAVX2
  2217  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  2218  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  2219  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  2220  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  2221  	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
  2222  	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
  2223  	LEAQ       (32*16)(inp), inp
  2224  	SUBQ       $(32*16), inl
  2225  	CMPQ       inl, $512
  2226  	JG         sealAVX2MainLoop
  2227  
  2228  	// Tail can only hash 480 bytes
  2229  	polyAdd(0*8(oup))
  2230  	polyMulAVX2
  2231  	polyAdd(2*8(oup))
  2232  	polyMulAVX2
  2233  	LEAQ 32(oup), oup
  2234  
  2235  	MOVQ $10, itr1
  2236  	MOVQ $0, itr2
  2237  	CMPQ inl, $128
  2238  	JBE  sealAVX2Tail128
  2239  	CMPQ inl, $256
  2240  	JBE  sealAVX2Tail256
  2241  	CMPQ inl, $384
  2242  	JBE  sealAVX2Tail384
  2243  	JMP  sealAVX2Tail512
  2244  
  2245  // ----------------------------------------------------------------------------
  2246  // Special optimization for buffers smaller than 193 bytes
  2247  seal192AVX2:
  2248  	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
  2249  	VMOVDQA AA0, AA1
  2250  	VMOVDQA BB0, BB1
  2251  	VMOVDQA CC0, CC1
  2252  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  2253  	VMOVDQA AA0, AA2
  2254  	VMOVDQA BB0, BB2
  2255  	VMOVDQA CC0, CC2
  2256  	VMOVDQA DD0, DD2
  2257  	VMOVDQA DD1, TT3
  2258  	MOVQ    $10, itr2
  2259  
  2260  sealAVX2192InnerCipherLoop:
  2261  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2262  	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2263  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2264  	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2265  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2266  	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2267  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2268  	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2269  	DECQ       itr2
  2270  	JNE        sealAVX2192InnerCipherLoop
  2271  	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
  2272  	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
  2273  	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  2274  	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  2275  	VPERM2I128 $0x02, AA0, BB0, TT0
  2276  
  2277  	// Clamp and store poly key
  2278  	VPAND   ·polyClampMask<>(SB), TT0, TT0
  2279  	VMOVDQA TT0, rsStoreAVX2
  2280  
  2281  	// Stream for up to 192 bytes
  2282  	VPERM2I128 $0x13, AA0, BB0, AA0
  2283  	VPERM2I128 $0x13, CC0, DD0, BB0
  2284  	VPERM2I128 $0x02, AA1, BB1, CC0
  2285  	VPERM2I128 $0x02, CC1, DD1, DD0
  2286  	VPERM2I128 $0x13, AA1, BB1, AA1
  2287  	VPERM2I128 $0x13, CC1, DD1, BB1
  2288  
  2289  sealAVX2ShortSeal:
  2290  	// Hash aad
  2291  	MOVQ ad_len+80(FP), itr2
  2292  	CALL polyHashADInternal<>(SB)
  2293  	XORQ itr1, itr1
  2294  
  2295  sealAVX2SealHash:
  2296  	// itr1 holds the number of bytes encrypted but not yet hashed
  2297  	CMPQ itr1, $16
  2298  	JB   sealAVX2ShortSealLoop
  2299  	polyAdd(0(oup))
  2300  	polyMul
  2301  	SUBQ $16, itr1
  2302  	ADDQ $16, oup
  2303  	JMP  sealAVX2SealHash
  2304  
  2305  sealAVX2ShortSealLoop:
  2306  	CMPQ inl, $32
  2307  	JB   sealAVX2ShortTail32
  2308  	SUBQ $32, inl
  2309  
  2310  	// Load for encryption
  2311  	VPXOR   (inp), AA0, AA0
  2312  	VMOVDQU AA0, (oup)
  2313  	LEAQ    (1*32)(inp), inp
  2314  
  2315  	// Now we can hash
  2316  	polyAdd(0*8(oup))
  2317  	polyMulAVX2
  2318  	polyAdd(2*8(oup))
  2319  	polyMulAVX2
  2320  	LEAQ (1*32)(oup), oup
  2321  
  2322  	// Shift stream left
  2323  	VMOVDQA BB0, AA0
  2324  	VMOVDQA CC0, BB0
  2325  	VMOVDQA DD0, CC0
  2326  	VMOVDQA AA1, DD0
  2327  	VMOVDQA BB1, AA1
  2328  	VMOVDQA CC1, BB1
  2329  	VMOVDQA DD1, CC1
  2330  	VMOVDQA AA2, DD1
  2331  	VMOVDQA BB2, AA2
  2332  	JMP     sealAVX2ShortSealLoop
  2333  
  2334  sealAVX2ShortTail32:
  2335  	CMPQ    inl, $16
  2336  	VMOVDQA A0, A1
  2337  	JB      sealAVX2ShortDone
  2338  
  2339  	SUBQ $16, inl
  2340  
  2341  	// Load for encryption
  2342  	VPXOR   (inp), A0, T0
  2343  	VMOVDQU T0, (oup)
  2344  	LEAQ    (1*16)(inp), inp
  2345  
  2346  	// Hash
  2347  	polyAdd(0*8(oup))
  2348  	polyMulAVX2
  2349  	LEAQ       (1*16)(oup), oup
  2350  	VPERM2I128 $0x11, AA0, AA0, AA0
  2351  	VMOVDQA    A0, A1
  2352  
  2353  sealAVX2ShortDone:
  2354  	VZEROUPPER
  2355  	JMP sealSSETail
  2356  
  2357  // ----------------------------------------------------------------------------
  2358  // Special optimization for buffers smaller than 321 bytes
  2359  seal320AVX2:
  2360  	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  2361  	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  2362  	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2363  	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  2364  	MOVQ    $10, itr2
  2365  
  2366  sealAVX2320InnerCipherLoop:
  2367  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2368  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2369  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2370  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2371  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2372  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2373  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2374  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2375  	DECQ     itr2
  2376  	JNE      sealAVX2320InnerCipherLoop
  2377  
  2378  	VMOVDQA ·chacha20Constants<>(SB), TT0
  2379  	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  2380  	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  2381  	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  2382  	VMOVDQA ·avx2IncMask<>(SB), TT0
  2383  	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  2384  	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  2385  	VPADDD  TT3, DD2, DD2
  2386  
  2387  	// Clamp and store poly key
  2388  	VPERM2I128 $0x02, AA0, BB0, TT0
  2389  	VPAND      ·polyClampMask<>(SB), TT0, TT0
  2390  	VMOVDQA    TT0, rsStoreAVX2
  2391  
  2392  	// Stream for up to 320 bytes
  2393  	VPERM2I128 $0x13, AA0, BB0, AA0
  2394  	VPERM2I128 $0x13, CC0, DD0, BB0
  2395  	VPERM2I128 $0x02, AA1, BB1, CC0
  2396  	VPERM2I128 $0x02, CC1, DD1, DD0
  2397  	VPERM2I128 $0x13, AA1, BB1, AA1
  2398  	VPERM2I128 $0x13, CC1, DD1, BB1
  2399  	VPERM2I128 $0x02, AA2, BB2, CC1
  2400  	VPERM2I128 $0x02, CC2, DD2, DD1
  2401  	VPERM2I128 $0x13, AA2, BB2, AA2
  2402  	VPERM2I128 $0x13, CC2, DD2, BB2
  2403  	JMP        sealAVX2ShortSeal
  2404  
  2405  // ----------------------------------------------------------------------------
  2406  // Special optimization for the last 128 bytes of plaintext
  2407  sealAVX2Tail128:
  2408  	// Need to encrypt up to 128 bytes - prepare two blocks
  2409  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2410  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2411  	VMOVDQA ·chacha20Constants<>(SB), AA0
  2412  	VMOVDQA state1StoreAVX2, BB0
  2413  	VMOVDQA state2StoreAVX2, CC0
  2414  	VMOVDQA ctr3StoreAVX2, DD0
  2415  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  2416  	VMOVDQA DD0, DD1
  2417  
  2418  sealAVX2Tail128LoopA:
  2419  	polyAdd(0(oup))
  2420  	polyMul
  2421  	LEAQ 16(oup), oup
  2422  
  2423  sealAVX2Tail128LoopB:
  2424  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2425  	polyAdd(0(oup))
  2426  	polyMul
  2427  	VPALIGNR $4, BB0, BB0, BB0
  2428  	VPALIGNR $8, CC0, CC0, CC0
  2429  	VPALIGNR $12, DD0, DD0, DD0
  2430  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2431  	polyAdd(16(oup))
  2432  	polyMul
  2433  	LEAQ     32(oup), oup
  2434  	VPALIGNR $12, BB0, BB0, BB0
  2435  	VPALIGNR $8, CC0, CC0, CC0
  2436  	VPALIGNR $4, DD0, DD0, DD0
  2437  	DECQ     itr1
  2438  	JG       sealAVX2Tail128LoopA
  2439  	DECQ     itr2
  2440  	JGE      sealAVX2Tail128LoopB
  2441  
  2442  	VPADDD ·chacha20Constants<>(SB), AA0, AA1
  2443  	VPADDD state1StoreAVX2, BB0, BB1
  2444  	VPADDD state2StoreAVX2, CC0, CC1
  2445  	VPADDD DD1, DD0, DD1
  2446  
  2447  	VPERM2I128 $0x02, AA1, BB1, AA0
  2448  	VPERM2I128 $0x02, CC1, DD1, BB0
  2449  	VPERM2I128 $0x13, AA1, BB1, CC0
  2450  	VPERM2I128 $0x13, CC1, DD1, DD0
  2451  	JMP        sealAVX2ShortSealLoop
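        	// Reference sketch (not part of the build): each polyAdd/polyMul pair above absorbs
        	// 16 bytes of already-written ciphertext into the Poly1305 accumulator. The assembly
        	// keeps the accumulator in three 64-bit limbs (acc0..acc2) with lazy reduction; the
        	// underlying arithmetic, written with math/big for clarity (reverse is a local helper):
        	//
        	//	var p1305 = new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 130), big.NewInt(5)) // 2^130 - 5
        	//
        	//	func polyStep(acc, r *big.Int, block []byte) { // len(block) == 16
        	//		m := new(big.Int).SetBytes(reverse(block))     // little-endian block as an integer
        	//		m.Add(m, new(big.Int).Lsh(big.NewInt(1), 128)) // the extra high bit added by polyAdd
        	//		acc.Add(acc, m)
        	//		acc.Mul(acc, r)
        	//		acc.Mod(acc, p1305)
        	//	}
        	//
        	//	func reverse(b []byte) []byte {
        	//		out := make([]byte, len(b))
        	//		for i, v := range b {
        	//			out[len(b)-1-i] = v
        	//		}
        	//		return out
        	//	}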
  2452  
  2453  // ----------------------------------------------------------------------------
  2454  // Special optimization for the last 256 bytes of ciphertext
  2455  sealAVX2Tail256:
  2456  	// Need to encrypt up to 256 bytes - prepare four blocks
  2457  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2458  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2459  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
  2460  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
  2461  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
  2462  	VMOVDQA ctr3StoreAVX2, DD0
  2463  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  2464  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  2465  	VMOVDQA DD0, TT1
  2466  	VMOVDQA DD1, TT2
  2467  
  2468  sealAVX2Tail256LoopA:
  2469  	polyAdd(0(oup))
  2470  	polyMul
  2471  	LEAQ 16(oup), oup
  2472  
  2473  sealAVX2Tail256LoopB:
  2474  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2475  	polyAdd(0(oup))
  2476  	polyMul
  2477  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2478  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2479  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2480  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2481  	polyAdd(16(oup))
  2482  	polyMul
  2483  	LEAQ     32(oup), oup
  2484  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2485  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2486  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2487  	DECQ     itr1
  2488  	JG       sealAVX2Tail256LoopA
  2489  	DECQ     itr2
  2490  	JGE      sealAVX2Tail256LoopB
  2491  
  2492  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  2493  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  2494  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  2495  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  2496  	VPERM2I128 $0x02, AA0, BB0, TT0
  2497  	VPERM2I128 $0x02, CC0, DD0, TT1
  2498  	VPERM2I128 $0x13, AA0, BB0, TT2
  2499  	VPERM2I128 $0x13, CC0, DD0, TT3
  2500  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2501  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  2502  	MOVQ       $128, itr1
  2503  	LEAQ       128(inp), inp
  2504  	SUBQ       $128, inl
  2505  	VPERM2I128 $0x02, AA1, BB1, AA0
  2506  	VPERM2I128 $0x02, CC1, DD1, BB0
  2507  	VPERM2I128 $0x13, AA1, BB1, CC0
  2508  	VPERM2I128 $0x13, CC1, DD1, DD0
  2509  
  2510  	JMP sealAVX2SealHash
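        	// Reference sketch (not part of the build): the VPXOR/VMOVDQU pairs above are plain
        	// keystream XOR; itr1 is loaded with 128 so the bytes just written can be hashed by
        	// sealAVX2SealHash. Go equivalent of the stream step:
        	//
        	//	// xorKeyStream writes dst[i] = src[i] ^ keystream[i] for the bytes produced above.
        	//	func xorKeyStream(dst, src, keystream []byte) {
        	//		for i := range src {
        	//			dst[i] = src[i] ^ keystream[i]
        	//		}
        	//	}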
  2511  
  2512  // ----------------------------------------------------------------------------
  2513  // Special optimization for the last 384 bytes of ciphertext
  2514  sealAVX2Tail384:
  2515  	// Need to encrypt up to 384 bytes - prepare six blocks
  2516  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2517  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2518  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  2519  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  2520  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  2521  	VMOVDQA ctr3StoreAVX2, DD0
  2522  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2523  	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
  2524  
  2525  sealAVX2Tail384LoopA:
  2526  	polyAdd(0(oup))
  2527  	polyMul
  2528  	LEAQ 16(oup), oup
  2529  
  2530  sealAVX2Tail384LoopB:
  2531  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2532  	polyAdd(0(oup))
  2533  	polyMul
  2534  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2535  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2536  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2537  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2538  	polyAdd(16(oup))
  2539  	polyMul
  2540  	LEAQ     32(oup), oup
  2541  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2542  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2543  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2544  	DECQ     itr1
  2545  	JG       sealAVX2Tail384LoopA
  2546  	DECQ     itr2
  2547  	JGE      sealAVX2Tail384LoopB
  2548  
  2549  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  2550  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  2551  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  2552  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
  2553  	VPERM2I128 $0x02, AA0, BB0, TT0
  2554  	VPERM2I128 $0x02, CC0, DD0, TT1
  2555  	VPERM2I128 $0x13, AA0, BB0, TT2
  2556  	VPERM2I128 $0x13, CC0, DD0, TT3
  2557  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2558  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  2559  	VPERM2I128 $0x02, AA1, BB1, TT0
  2560  	VPERM2I128 $0x02, CC1, DD1, TT1
  2561  	VPERM2I128 $0x13, AA1, BB1, TT2
  2562  	VPERM2I128 $0x13, CC1, DD1, TT3
  2563  	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  2564  	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  2565  	MOVQ       $256, itr1
  2566  	LEAQ       256(inp), inp
  2567  	SUBQ       $256, inl
  2568  	VPERM2I128 $0x02, AA2, BB2, AA0
  2569  	VPERM2I128 $0x02, CC2, DD2, BB0
  2570  	VPERM2I128 $0x13, AA2, BB2, CC0
  2571  	VPERM2I128 $0x13, CC2, DD2, DD0
  2572  
  2573  	JMP sealAVX2SealHash
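        	// Reference sketch (not part of the build): the VPALIGNR $4/$8/$12 shuffles between
        	// the two chachaQR_AVX2 passes rotate rows 1-3 of the state so the second pass works
        	// on diagonals, then rotate them back. One iteration of the loop above corresponds to
        	// a scalar ChaCha20 double round (quarterRound as sketched earlier):
        	//
        	//	func doubleRound(s *[16]uint32) {
        	//		for i := 0; i < 4; i++ { // column round
        	//			s[i], s[i+4], s[i+8], s[i+12] = quarterRound(s[i], s[i+4], s[i+8], s[i+12])
        	//		}
        	//		// diagonal round, exposed by the VPALIGNR rotations
        	//		s[0], s[5], s[10], s[15] = quarterRound(s[0], s[5], s[10], s[15])
        	//		s[1], s[6], s[11], s[12] = quarterRound(s[1], s[6], s[11], s[12])
        	//		s[2], s[7], s[8], s[13] = quarterRound(s[2], s[7], s[8], s[13])
        	//		s[3], s[4], s[9], s[14] = quarterRound(s[3], s[4], s[9], s[14])
        	//	}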
  2574  
  2575  // ----------------------------------------------------------------------------
  2576  // Special optimization for the last 512 bytes of ciphertext
  2577  sealAVX2Tail512:
  2578  	// Need to encrypt up to 512 bytes - prepare eight blocks
  2579  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2580  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2581  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2582  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2583  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2584  	VMOVDQA ctr3StoreAVX2, DD0
  2585  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2586  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2587  
  2588  sealAVX2Tail512LoopA:
  2589  	polyAdd(0(oup))
  2590  	polyMul
  2591  	LEAQ 16(oup), oup
  2592  
  2593  sealAVX2Tail512LoopB:
  2594  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2595  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2596  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2597  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2598  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2599  	VMOVDQA  CC3, tmpStoreAVX2
  2600  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2601  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2602  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2603  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2604  	VMOVDQA  tmpStoreAVX2, CC3
  2605  	polyAdd(0*8(oup))
  2606  	polyMulAVX2
  2607  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2608  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2609  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2610  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2611  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2612  	VMOVDQA  CC3, tmpStoreAVX2
  2613  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2614  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2615  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2616  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2617  	VMOVDQA  tmpStoreAVX2, CC3
  2618  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  2619  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2620  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  2621  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2622  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2623  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2624  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2625  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2626  	polyAdd(2*8(oup))
  2627  	polyMulAVX2
  2628  	LEAQ     (4*8)(oup), oup
  2629  	VMOVDQA  CC3, tmpStoreAVX2
  2630  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2631  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2632  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2633  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2634  	VMOVDQA  tmpStoreAVX2, CC3
  2635  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2636  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2637  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2638  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2639  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2640  	VMOVDQA  CC3, tmpStoreAVX2
  2641  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2642  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2643  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2644  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2645  	VMOVDQA  tmpStoreAVX2, CC3
  2646  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  2647  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2648  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  2649  
  2650  	DECQ itr1
  2651  	JG   sealAVX2Tail512LoopA
  2652  	DECQ itr2
  2653  	JGE  sealAVX2Tail512LoopB
  2654  
  2655  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2656  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2657  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2658  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2659  	VMOVDQA    CC3, tmpStoreAVX2
  2660  	VPERM2I128 $0x02, AA0, BB0, CC3
  2661  	VPXOR      (0*32)(inp), CC3, CC3
  2662  	VMOVDQU    CC3, (0*32)(oup)
  2663  	VPERM2I128 $0x02, CC0, DD0, CC3
  2664  	VPXOR      (1*32)(inp), CC3, CC3
  2665  	VMOVDQU    CC3, (1*32)(oup)
  2666  	VPERM2I128 $0x13, AA0, BB0, CC3
  2667  	VPXOR      (2*32)(inp), CC3, CC3
  2668  	VMOVDQU    CC3, (2*32)(oup)
  2669  	VPERM2I128 $0x13, CC0, DD0, CC3
  2670  	VPXOR      (3*32)(inp), CC3, CC3
  2671  	VMOVDQU    CC3, (3*32)(oup)
  2672  
  2673  	VPERM2I128 $0x02, AA1, BB1, AA0
  2674  	VPERM2I128 $0x02, CC1, DD1, BB0
  2675  	VPERM2I128 $0x13, AA1, BB1, CC0
  2676  	VPERM2I128 $0x13, CC1, DD1, DD0
  2677  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  2678  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  2679  
  2680  	VPERM2I128 $0x02, AA2, BB2, AA0
  2681  	VPERM2I128 $0x02, CC2, DD2, BB0
  2682  	VPERM2I128 $0x13, AA2, BB2, CC0
  2683  	VPERM2I128 $0x13, CC2, DD2, DD0
  2684  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  2685  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  2686  
  2687  	MOVQ       $384, itr1
  2688  	LEAQ       384(inp), inp
  2689  	SUBQ       $384, inl
  2690  	VPERM2I128 $0x02, AA3, BB3, AA0
  2691  	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
  2692  	VPERM2I128 $0x13, AA3, BB3, CC0
  2693  	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  2694  
  2695  	JMP sealAVX2SealHash
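        	// Reference sketch (not part of the build): ctr0StoreAVX2..ctr3StoreAVX2 hold the
        	// counter rows for the four register sets; each VPADDD with avx2IncMask appears to
        	// advance the 32-bit block counter in both 128-bit lanes by two, so the four sets
        	// cover eight consecutive blocks. A hedged Go picture of that schedule:
        	//
        	//	// counters lists the block counters covered by DD0..DD3, assuming last is the
        	//	// counter of the most recently generated block.
        	//	func counters(last uint32) [4][2]uint32 {
        	//		return [4][2]uint32{
        	//			{last + 1, last + 2}, // DD0: low lane, high lane
        	//			{last + 3, last + 4}, // DD1
        	//			{last + 5, last + 6}, // DD2
        	//			{last + 7, last + 8}, // DD3
        	//		}
        	//	}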