github.com/devops-filetransfer/sshego@v7.0.4+incompatible/_vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
     6  
     7  // +build go1.7,amd64,!gccgo,!appengine
     8  
     9  #include "textflag.h"
    10  // General register allocation
    11  #define oup DI
    12  #define inp SI
    13  #define inl BX
    14  #define adp CX // free to reuse, after we hash the additional data
    15  #define keyp R8 // free to reuse, when we copy the key to stack
    16  #define itr2 R9 // general iterator
    17  #define itr1 CX // general iterator
    18  #define acc0 R10
    19  #define acc1 R11
    20  #define acc2 R12
    21  #define t0 R13
    22  #define t1 R14
    23  #define t2 R15
    24  #define t3 R8
    25  // Register and stack allocation for the SSE code
    26  #define rStore (0*16)(BP)
    27  #define sStore (1*16)(BP)
    28  #define state1Store (2*16)(BP)
    29  #define state2Store (3*16)(BP)
    30  #define tmpStore (4*16)(BP)
    31  #define ctr0Store (5*16)(BP)
    32  #define ctr1Store (6*16)(BP)
    33  #define ctr2Store (7*16)(BP)
    34  #define ctr3Store (8*16)(BP)
    35  #define A0 X0
    36  #define A1 X1
    37  #define A2 X2
    38  #define B0 X3
    39  #define B1 X4
    40  #define B2 X5
    41  #define C0 X6
    42  #define C1 X7
    43  #define C2 X8
    44  #define D0 X9
    45  #define D1 X10
    46  #define D2 X11
    47  #define T0 X12
    48  #define T1 X13
    49  #define T2 X14
    50  #define T3 X15
    51  #define A3 T0
    52  #define B3 T1
    53  #define C3 T2
    54  #define D3 T3
    55  // Register and stack allocation for the AVX2 code
    56  #define rsStoreAVX2 (0*32)(BP)
    57  #define state1StoreAVX2 (1*32)(BP)
    58  #define state2StoreAVX2 (2*32)(BP)
    59  #define ctr0StoreAVX2 (3*32)(BP)
    60  #define ctr1StoreAVX2 (4*32)(BP)
    61  #define ctr2StoreAVX2 (5*32)(BP)
    62  #define ctr3StoreAVX2 (6*32)(BP)
    63  #define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
    64  #define AA0 Y0
    65  #define AA1 Y5
    66  #define AA2 Y6
    67  #define AA3 Y7
    68  #define BB0 Y14
    69  #define BB1 Y9
    70  #define BB2 Y10
    71  #define BB3 Y11
    72  #define CC0 Y12
    73  #define CC1 Y13
    74  #define CC2 Y8
    75  #define CC3 Y15
    76  #define DD0 Y4
    77  #define DD1 Y1
    78  #define DD2 Y2
    79  #define DD3 Y3
    80  #define TT0 DD3
    81  #define TT1 AA3
    82  #define TT2 BB3
    83  #define TT3 CC3
    84  // ChaCha20 constants
    85  DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
    86  DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
    87  DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
    88  DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
    89  DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
    90  DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
    91  DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
    92  DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
     93  // Rotate each 32-bit lane left by 16 bits, implemented as a PSHUFB byte shuffle
    94  DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
    95  DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
    96  DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
    97  DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
     98  // Rotate each 32-bit lane left by 8 bits, implemented as a PSHUFB byte shuffle
    99  DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
   100  DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
   101  DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
   102  DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
   103  
   104  DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
   105  DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
   106  DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
   107  DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
   108  
   109  DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
   110  DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
   111  DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
   112  DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
   113  // Poly1305 key clamp
   114  DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
   115  DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
   116  DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   117  DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   118  
   119  DATA ·sseIncMask<>+0x00(SB)/8, $0x1
   120  DATA ·sseIncMask<>+0x08(SB)/8, $0x0
   121  // To load/store the last < 16 bytes in a buffer
   122  DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
   123  DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
   124  DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
   125  DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
   126  DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
   127  DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
   128  DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
   129  DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
   130  DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
   131  DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
   132  DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
   133  DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
   134  DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
   135  DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
   136  DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
   137  DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
   138  DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
   139  DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
   140  DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
   141  DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
   142  DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
   143  DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
   144  DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
   145  DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
   146  DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
   147  DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
   148  DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
   149  DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
   150  DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
   151  DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
   152  
   153  GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
   154  GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
   155  GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
   156  GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
   157  GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
   158  GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
   159  GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
   160  GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
   161  // No PALIGNR in Go ASM yet (but VPALIGNR is present).
   162  #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
   163  #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
   164  #define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
   165  #define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
   166  #define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
   167  #define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
   168  #define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
   169  #define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
   170  #define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
   171  #define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
   172  #define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
   173  #define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
   174  #define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
   175  #define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
   176  #define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
   177  #define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
   178  #define shiftC0Right shiftC0Left
   179  #define shiftC1Right shiftC1Left
   180  #define shiftC2Right shiftC2Left
   181  #define shiftC3Right shiftC3Left
   182  #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
   183  #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
   184  #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
   185  #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
   186  // Some macros
   187  #define chachaQR(A, B, C, D, T) \
   188  	PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D                            \
   189  	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
   190  	PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D                             \
   191  	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
   192  
   193  #define chachaQR_AVX2(A, B, C, D, T) \
   194  	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
   195  	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
   196  	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
   197  	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
   198  
   199  #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
   200  #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
   201  #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
   202  #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
   203  #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
   204  
   205  #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
   206  #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
   207  #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
   208  
   209  #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
   210  #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
   211  // ----------------------------------------------------------------------------
    212  TEXT polyHashADInternal<>(SB), NOSPLIT, $0
	// polyHashADInternal absorbs the additional data (AAD) into the
	// Poly1305 accumulator acc0:acc1:acc2, which is zeroed on entry.
	// In:       adp  = pointer to the AAD, itr2 = AAD length in bytes.
	// Expects:  the Poly1305 key "r" already stored at 0(BP)/8(BP)
	//           (rStore) — polyMul reads it from there.
	// Clobbers: acc0-acc2, t0-t3, AX, DX, adp, itr2, flags.
    213  	// adp points to beginning of additional data
    214  	// itr2 holds ad length
    215  	XORQ acc0, acc0
    216  	XORQ acc1, acc1
    217  	XORQ acc2, acc2
    218  	CMPQ itr2, $13
    219  	JNE  hashADLoop
    220  
    221  openFastTLSAD:
    222  	// Special treatment for the TLS case of 13 bytes
    223  	MOVQ (adp), acc0               // AAD bytes 0-7
    224  	MOVQ 5(adp), acc1              // load bytes 5-12 ...
    225  	SHRQ $24, acc1                 // ... and drop bytes 5-7, keeping 8-12
    226  	MOVQ $1, acc2                  // pad bit at 2^128 (AAD zero-padded to a full block)
    227  	polyMul
    228  	RET
    229  
    230  hashADLoop:
    231  	// Hash in 16 byte chunks
    232  	CMPQ itr2, $16
    233  	JB   hashADTail
    234  	polyAdd(0(adp))
    235  	LEAQ (1*16)(adp), adp
    236  	SUBQ $16, itr2
    237  	polyMul                        // acc = (acc + block) * r, reduced mod 2^130-5
    238  	JMP  hashADLoop
    239  
    240  hashADTail:
    241  	CMPQ itr2, $0
    242  	JE   hashADDone
    243  
    244  	// Hash last < 16 byte tail
    245  	XORQ t0, t0
    246  	XORQ t1, t1
    247  	XORQ t2, t2
    248  	ADDQ itr2, adp                 // adp = one past the last AAD byte
    249  
	// Build the little-endian tail value in t1:t0 one byte at a time,
	// walking backwards from the end of the AAD.
    250  hashADTailLoop:
    251  	SHLQ $8, t1:t0                 // 128-bit shift: t1 <<= 8, filled from t0's top byte
    252  	SHLQ $8, t0
    253  	MOVB -1(adp), t2
    254  	XORQ t2, t0                    // insert the byte into the low end
    255  	DECQ adp
    256  	DECQ itr2
    257  	JNE  hashADTailLoop
    258  
    259  hashADTailFinish:
    260  	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2   // add tail block with the 2^128 pad bit
    261  	polyMul
    262  
    263  	// Finished AD
    264  hashADDone:
    265  	RET
   266  
   267  // ----------------------------------------------------------------------------
   268  // func chacha20Poly1305Open(dst, key, src, ad []byte) bool
   269  TEXT ·chacha20Poly1305Open(SB), 0, $288-97
   270  	// For aligned stack access
   271  	MOVQ SP, BP
   272  	ADDQ $32, BP
   273  	ANDQ $-32, BP
   274  	MOVQ dst+0(FP), oup
   275  	MOVQ key+24(FP), keyp
   276  	MOVQ src+48(FP), inp
   277  	MOVQ src_len+56(FP), inl
   278  	MOVQ ad+72(FP), adp
   279  
   280  	// Check for AVX2 support
   281  	CMPB runtime·support_avx2(SB), $0
   282  	JE   noavx2bmi2Open
   283  
   284  	// Check BMI2 bit for MULXQ.
   285  	// runtime·cpuid_ebx7 is always available here
   286  	// because it passed avx2 check
   287  	TESTL $(1<<8), runtime·cpuid_ebx7(SB)
   288  	JNE   chacha20Poly1305Open_AVX2
   289  noavx2bmi2Open:
   290  
   291  	// Special optimization, for very short buffers
   292  	CMPQ inl, $128
   293  	JBE  openSSE128 // About 16% faster
   294  
   295  	// For long buffers, prepare the poly key first
   296  	MOVOU ·chacha20Constants<>(SB), A0
   297  	MOVOU (1*16)(keyp), B0
   298  	MOVOU (2*16)(keyp), C0
   299  	MOVOU (3*16)(keyp), D0
   300  	MOVO  D0, T1
   301  
   302  	// Store state on stack for future use
   303  	MOVO B0, state1Store
   304  	MOVO C0, state2Store
   305  	MOVO D0, ctr3Store
   306  	MOVQ $10, itr2
   307  
   308  openSSEPreparePolyKey:
   309  	chachaQR(A0, B0, C0, D0, T0)
   310  	shiftB0Left;  shiftC0Left; shiftD0Left
   311  	chachaQR(A0, B0, C0, D0, T0)
   312  	shiftB0Right; shiftC0Right; shiftD0Right
   313  	DECQ          itr2
   314  	JNE           openSSEPreparePolyKey
   315  
   316  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   317  	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
   318  
   319  	// Clamp and store the key
   320  	PAND ·polyClampMask<>(SB), A0
   321  	MOVO A0, rStore; MOVO B0, sStore
   322  
   323  	// Hash AAD
   324  	MOVQ ad_len+80(FP), itr2
   325  	CALL polyHashADInternal<>(SB)
   326  
   327  openSSEMainLoop:
   328  	CMPQ inl, $256
   329  	JB   openSSEMainLoopDone
   330  
   331  	// Load state, increment counter blocks
   332  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
   333  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   334  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   335  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
   336  
   337  	// Store counters
   338  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   339  
   340  	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
   341  	MOVQ $4, itr1
   342  	MOVQ inp, itr2
   343  
   344  openSSEInternalLoop:
   345  	MOVO          C3, tmpStore
   346  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   347  	MOVO          tmpStore, C3
   348  	MOVO          C1, tmpStore
   349  	chachaQR(A3, B3, C3, D3, C1)
   350  	MOVO          tmpStore, C1
   351  	polyAdd(0(itr2))
   352  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
   353  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
   354  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
   355  	polyMulStage1
   356  	polyMulStage2
   357  	LEAQ          (2*8)(itr2), itr2
   358  	MOVO          C3, tmpStore
   359  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   360  	MOVO          tmpStore, C3
   361  	MOVO          C1, tmpStore
   362  	polyMulStage3
   363  	chachaQR(A3, B3, C3, D3, C1)
   364  	MOVO          tmpStore, C1
   365  	polyMulReduceStage
   366  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   367  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   368  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   369  	DECQ          itr1
   370  	JGE           openSSEInternalLoop
   371  
   372  	polyAdd(0(itr2))
   373  	polyMul
   374  	LEAQ (2*8)(itr2), itr2
   375  
   376  	CMPQ itr1, $-6
   377  	JG   openSSEInternalLoop
   378  
   379  	// Add in the state
   380  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
   381  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   382  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   383  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   384  
   385  	// Load - xor - store
   386  	MOVO  D3, tmpStore
   387  	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
   388  	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
   389  	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
   390  	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
   391  	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
   392  	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
   393  	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
   394  	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
   395  	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
   396  	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
   397  	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
   398  	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
   399  	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
   400  	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
   401  	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
   402  	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
   403  	LEAQ  256(inp), inp
   404  	LEAQ  256(oup), oup
   405  	SUBQ  $256, inl
   406  	JMP   openSSEMainLoop
   407  
   408  openSSEMainLoopDone:
   409  	// Handle the various tail sizes efficiently
   410  	TESTQ inl, inl
   411  	JE    openSSEFinalize
   412  	CMPQ  inl, $64
   413  	JBE   openSSETail64
   414  	CMPQ  inl, $128
   415  	JBE   openSSETail128
   416  	CMPQ  inl, $192
   417  	JBE   openSSETail192
   418  	JMP   openSSETail256
   419  
   420  openSSEFinalize:
   421  	// Hash in the PT, AAD lengths
   422  	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
   423  	polyMul
   424  
   425  	// Final reduce
   426  	MOVQ    acc0, t0
   427  	MOVQ    acc1, t1
   428  	MOVQ    acc2, t2
   429  	SUBQ    $-5, acc0
   430  	SBBQ    $-1, acc1
   431  	SBBQ    $3, acc2
   432  	CMOVQCS t0, acc0
   433  	CMOVQCS t1, acc1
   434  	CMOVQCS t2, acc2
   435  
   436  	// Add in the "s" part of the key
   437  	ADDQ 0+sStore, acc0
   438  	ADCQ 8+sStore, acc1
   439  
   440  	// Finally, constant time compare to the tag at the end of the message
   441  	XORQ    AX, AX
   442  	MOVQ    $1, DX
   443  	XORQ    (0*8)(inp), acc0
   444  	XORQ    (1*8)(inp), acc1
   445  	ORQ     acc1, acc0
   446  	CMOVQEQ DX, AX
   447  
   448  	// Return true iff tags are equal
   449  	MOVB AX, ret+96(FP)
   450  	RET
   451  
   452  // ----------------------------------------------------------------------------
   453  // Special optimization for buffers smaller than 129 bytes
   454  openSSE128:
   455  	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
   456  	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
   457  	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   458  	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   459  	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
   460  	MOVQ  $10, itr2
   461  
   462  openSSE128InnerCipherLoop:
   463  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   464  	shiftB0Left;  shiftB1Left; shiftB2Left
   465  	shiftC0Left;  shiftC1Left; shiftC2Left
   466  	shiftD0Left;  shiftD1Left; shiftD2Left
   467  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   468  	shiftB0Right; shiftB1Right; shiftB2Right
   469  	shiftC0Right; shiftC1Right; shiftC2Right
   470  	shiftD0Right; shiftD1Right; shiftD2Right
   471  	DECQ          itr2
   472  	JNE           openSSE128InnerCipherLoop
   473  
   474  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   475  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
   476  	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
   477  	PADDL T2, C1; PADDL T2, C2
   478  	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
   479  
   480  	// Clamp and store the key
   481  	PAND  ·polyClampMask<>(SB), A0
   482  	MOVOU A0, rStore; MOVOU B0, sStore
   483  
   484  	// Hash
   485  	MOVQ ad_len+80(FP), itr2
   486  	CALL polyHashADInternal<>(SB)
   487  
   488  openSSE128Open:
   489  	CMPQ inl, $16
   490  	JB   openSSETail16
   491  	SUBQ $16, inl
   492  
   493  	// Load for hashing
   494  	polyAdd(0(inp))
   495  
   496  	// Load for decryption
   497  	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
   498  	LEAQ  (1*16)(inp), inp
   499  	LEAQ  (1*16)(oup), oup
   500  	polyMul
   501  
   502  	// Shift the stream "left"
   503  	MOVO B1, A1
   504  	MOVO C1, B1
   505  	MOVO D1, C1
   506  	MOVO A2, D1
   507  	MOVO B2, A2
   508  	MOVO C2, B2
   509  	MOVO D2, C2
   510  	JMP  openSSE128Open
   511  
   512  openSSETail16:
   513  	TESTQ inl, inl
   514  	JE    openSSEFinalize
   515  
   516  	// We can safely load the CT from the end, because it is padded with the MAC
   517  	MOVQ   inl, itr2
   518  	SHLQ   $4, itr2
   519  	LEAQ   ·andMask<>(SB), t0
   520  	MOVOU  (inp), T0
   521  	ADDQ   inl, inp
   522  	PAND   -16(t0)(itr2*1), T0
   523  	MOVO   T0, 0+tmpStore
   524  	MOVQ   T0, t0
   525  	MOVQ   8+tmpStore, t1
   526  	PXOR   A1, T0
   527  
   528  	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
   529  openSSETail16Store:
   530  	MOVQ T0, t3
   531  	MOVB t3, (oup)
   532  	PSRLDQ $1, T0
   533  	INCQ   oup
   534  	DECQ   inl
   535  	JNE    openSSETail16Store
   536  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
   537  	polyMul
   538  	JMP    openSSEFinalize
   539  
   540  // ----------------------------------------------------------------------------
   541  // Special optimization for the last 64 bytes of ciphertext
   542  openSSETail64:
   543  	// Need to decrypt up to 64 bytes - prepare single block
   544  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
   545  	XORQ itr2, itr2
   546  	MOVQ inl, itr1
   547  	CMPQ itr1, $16
   548  	JB   openSSETail64LoopB
   549  
   550  openSSETail64LoopA:
   551  	// Perform ChaCha rounds, while hashing the remaining input
   552  	polyAdd(0(inp)(itr2*1))
   553  	polyMul
   554  	SUBQ $16, itr1
   555  
   556  openSSETail64LoopB:
   557  	ADDQ          $16, itr2
   558  	chachaQR(A0, B0, C0, D0, T0)
   559  	shiftB0Left;  shiftC0Left; shiftD0Left
   560  	chachaQR(A0, B0, C0, D0, T0)
   561  	shiftB0Right; shiftC0Right; shiftD0Right
   562  
   563  	CMPQ itr1, $16
   564  	JAE  openSSETail64LoopA
   565  
   566  	CMPQ itr2, $160
   567  	JNE  openSSETail64LoopB
   568  
   569  	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
   570  
   571  openSSETail64DecLoop:
   572  	CMPQ  inl, $16
   573  	JB    openSSETail64DecLoopDone
   574  	SUBQ  $16, inl
   575  	MOVOU (inp), T0
   576  	PXOR  T0, A0
   577  	MOVOU A0, (oup)
   578  	LEAQ  16(inp), inp
   579  	LEAQ  16(oup), oup
   580  	MOVO  B0, A0
   581  	MOVO  C0, B0
   582  	MOVO  D0, C0
   583  	JMP   openSSETail64DecLoop
   584  
   585  openSSETail64DecLoopDone:
   586  	MOVO A0, A1
   587  	JMP  openSSETail16
   588  
   589  // ----------------------------------------------------------------------------
   590  // Special optimization for the last 128 bytes of ciphertext
   591  openSSETail128:
   592  	// Need to decrypt up to 128 bytes - prepare two blocks
   593  	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
   594  	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
   595  	XORQ itr2, itr2
   596  	MOVQ inl, itr1
   597  	ANDQ $-16, itr1
   598  
   599  openSSETail128LoopA:
   600  	// Perform ChaCha rounds, while hashing the remaining input
   601  	polyAdd(0(inp)(itr2*1))
   602  	polyMul
   603  
   604  openSSETail128LoopB:
   605  	ADDQ          $16, itr2
   606  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   607  	shiftB0Left;  shiftC0Left; shiftD0Left
   608  	shiftB1Left;  shiftC1Left; shiftD1Left
   609  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   610  	shiftB0Right; shiftC0Right; shiftD0Right
   611  	shiftB1Right; shiftC1Right; shiftD1Right
   612  
   613  	CMPQ itr2, itr1
   614  	JB   openSSETail128LoopA
   615  
   616  	CMPQ itr2, $160
   617  	JNE  openSSETail128LoopB
   618  
   619  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
   620  	PADDL state1Store, B0; PADDL state1Store, B1
   621  	PADDL state2Store, C0; PADDL state2Store, C1
   622  	PADDL ctr1Store, D0; PADDL ctr0Store, D1
   623  
   624  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   625  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
   626  	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
   627  
   628  	SUBQ $64, inl
   629  	LEAQ 64(inp), inp
   630  	LEAQ 64(oup), oup
   631  	JMP  openSSETail64DecLoop
   632  
   633  // ----------------------------------------------------------------------------
   634  // Special optimization for the last 192 bytes of ciphertext
   635  openSSETail192:
   636  	// Need to decrypt up to 192 bytes - prepare three blocks
   637  	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
   638  	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
   639  	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
   640  
   641  	MOVQ    inl, itr1
   642  	MOVQ    $160, itr2
   643  	CMPQ    itr1, $160
   644  	CMOVQGT itr2, itr1
   645  	ANDQ    $-16, itr1
   646  	XORQ    itr2, itr2
   647  
   648  openSSLTail192LoopA:
   649  	// Perform ChaCha rounds, while hashing the remaining input
   650  	polyAdd(0(inp)(itr2*1))
   651  	polyMul
   652  
   653  openSSLTail192LoopB:
   654  	ADDQ         $16, itr2
   655  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   656  	shiftB0Left; shiftC0Left; shiftD0Left
   657  	shiftB1Left; shiftC1Left; shiftD1Left
   658  	shiftB2Left; shiftC2Left; shiftD2Left
   659  
   660  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   661  	shiftB0Right; shiftC0Right; shiftD0Right
   662  	shiftB1Right; shiftC1Right; shiftD1Right
   663  	shiftB2Right; shiftC2Right; shiftD2Right
   664  
   665  	CMPQ itr2, itr1
   666  	JB   openSSLTail192LoopA
   667  
   668  	CMPQ itr2, $160
   669  	JNE  openSSLTail192LoopB
   670  
   671  	CMPQ inl, $176
   672  	JB   openSSLTail192Store
   673  
   674  	polyAdd(160(inp))
   675  	polyMul
   676  
   677  	CMPQ inl, $192
   678  	JB   openSSLTail192Store
   679  
   680  	polyAdd(176(inp))
   681  	polyMul
   682  
   683  openSSLTail192Store:
   684  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
   685  	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
   686  	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
   687  	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
   688  
   689  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   690  	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
   691  	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
   692  
   693  	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
   694  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
   695  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   696  
   697  	SUBQ $128, inl
   698  	LEAQ 128(inp), inp
   699  	LEAQ 128(oup), oup
   700  	JMP  openSSETail64DecLoop
   701  
   702  // ----------------------------------------------------------------------------
   703  // Special optimization for the last 256 bytes of ciphertext
   704  openSSETail256:
   705  	// Need to decrypt up to 256 bytes - prepare four blocks
   706  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
   707  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   708  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   709  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
   710  
   711  	// Store counters
   712  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   713  	XORQ itr2, itr2
   714  
   715  openSSETail256Loop:
    716  	// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
   717  	polyAdd(0(inp)(itr2*1))
   718  	MOVO          C3, tmpStore
   719  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   720  	MOVO          tmpStore, C3
   721  	MOVO          C1, tmpStore
   722  	chachaQR(A3, B3, C3, D3, C1)
   723  	MOVO          tmpStore, C1
   724  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
   725  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
   726  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
   727  	polyMulStage1
   728  	polyMulStage2
   729  	MOVO          C3, tmpStore
   730  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   731  	MOVO          tmpStore, C3
   732  	MOVO          C1, tmpStore
   733  	chachaQR(A3, B3, C3, D3, C1)
   734  	MOVO          tmpStore, C1
   735  	polyMulStage3
   736  	polyMulReduceStage
   737  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   738  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   739  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   740  	ADDQ          $2*8, itr2
   741  	CMPQ          itr2, $160
   742  	JB            openSSETail256Loop
   743  	MOVQ          inl, itr1
   744  	ANDQ          $-16, itr1
   745  
   746  openSSETail256HashLoop:
   747  	polyAdd(0(inp)(itr2*1))
   748  	polyMul
   749  	ADDQ $2*8, itr2
   750  	CMPQ itr2, itr1
   751  	JB   openSSETail256HashLoop
   752  
   753  	// Add in the state
   754  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
   755  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   756  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   757  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   758  	MOVO  D3, tmpStore
   759  
   760  	// Load - xor - store
   761  	MOVOU (0*16)(inp), D3; PXOR D3, A0
   762  	MOVOU (1*16)(inp), D3; PXOR D3, B0
   763  	MOVOU (2*16)(inp), D3; PXOR D3, C0
   764  	MOVOU (3*16)(inp), D3; PXOR D3, D0
   765  	MOVOU A0, (0*16)(oup)
   766  	MOVOU B0, (1*16)(oup)
   767  	MOVOU C0, (2*16)(oup)
   768  	MOVOU D0, (3*16)(oup)
   769  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
   770  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
   771  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   772  	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
   773  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
   774  	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
   775  	LEAQ  192(inp), inp
   776  	LEAQ  192(oup), oup
   777  	SUBQ  $192, inl
   778  	MOVO  A3, A0
   779  	MOVO  B3, B0
   780  	MOVO  C3, C0
   781  	MOVO  tmpStore, D0
   782  
   783  	JMP openSSETail64DecLoop
   784  
   785  // ----------------------------------------------------------------------------
   786  // ------------------------- AVX2 Code ----------------------------------------
   787  chacha20Poly1305Open_AVX2:
   788  	VZEROUPPER
   789  	VMOVDQU ·chacha20Constants<>(SB), AA0
   790  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
   791  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
   792  	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
   793  	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
   794  
   795  	// Special optimization, for very short buffers
   796  	CMPQ inl, $192
   797  	JBE  openAVX2192
   798  	CMPQ inl, $320
   799  	JBE  openAVX2320
   800  
   801  	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
   802  	VMOVDQA BB0, state1StoreAVX2
   803  	VMOVDQA CC0, state2StoreAVX2
   804  	VMOVDQA DD0, ctr3StoreAVX2
   805  	MOVQ    $10, itr2
   806  
   807  openAVX2PreparePolyKey:
   808  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   809  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
   810  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   811  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
   812  	DECQ     itr2
   813  	JNE      openAVX2PreparePolyKey
   814  
   815  	VPADDD ·chacha20Constants<>(SB), AA0, AA0
   816  	VPADDD state1StoreAVX2, BB0, BB0
   817  	VPADDD state2StoreAVX2, CC0, CC0
   818  	VPADDD ctr3StoreAVX2, DD0, DD0
   819  
   820  	VPERM2I128 $0x02, AA0, BB0, TT0
   821  
   822  	// Clamp and store poly key
   823  	VPAND   ·polyClampMask<>(SB), TT0, TT0
   824  	VMOVDQA TT0, rsStoreAVX2
   825  
   826  	// Stream for the first 64 bytes
   827  	VPERM2I128 $0x13, AA0, BB0, AA0
   828  	VPERM2I128 $0x13, CC0, DD0, BB0
   829  
   830  	// Hash AD + first 64 bytes
   831  	MOVQ ad_len+80(FP), itr2
   832  	CALL polyHashADInternal<>(SB)
   833  	XORQ itr1, itr1
   834  
   835  openAVX2InitialHash64:
   836  	polyAdd(0(inp)(itr1*1))
   837  	polyMulAVX2
   838  	ADDQ $16, itr1
   839  	CMPQ itr1, $64
   840  	JNE  openAVX2InitialHash64
   841  
   842  	// Decrypt the first 64 bytes
   843  	VPXOR   (0*32)(inp), AA0, AA0
   844  	VPXOR   (1*32)(inp), BB0, BB0
   845  	VMOVDQU AA0, (0*32)(oup)
   846  	VMOVDQU BB0, (1*32)(oup)
   847  	LEAQ    (2*32)(inp), inp
   848  	LEAQ    (2*32)(oup), oup
   849  	SUBQ    $64, inl
   850  
openAVX2MainLoop:
	// Main bulk loop: processes 512 bytes (8 ChaCha blocks in 4 YMM state
	// quadruples) per iteration while hashing the same ciphertext.
	CMPQ inl, $512
	JB   openAVX2MainLoopDone

	// Load state, increment counter blocks, store the incremented counters
	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
	XORQ    itr1, itr1 // itr1 = bytes hashed in this 512-byte chunk

openAVX2InternalLoop:
	// Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
	polyAdd(0*8(inp)(itr1*1))
	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	polyMulStage1_AVX2
	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	polyMulStage2_AVX2
	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyMulStage3_AVX2
	// Rotate B lanes left by 12: shift/or via scratch, CC3 spilled meanwhile.
	VMOVDQA  CC3, tmpStoreAVX2
	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA  tmpStoreAVX2, CC3
	polyMulReduceStage
	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	polyAdd(2*8(inp)(itr1*1))
	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	polyMulStage1_AVX2
	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA  CC3, tmpStoreAVX2
	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA  tmpStoreAVX2, CC3
	polyMulStage2_AVX2
	// Lane rotations: switch from column round to diagonal round.
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	polyMulStage3_AVX2
	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	polyMulReduceStage
	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyAdd(4*8(inp)(itr1*1))
	LEAQ     (6*8)(itr1), itr1 // 3 * 16 bytes hashed per loop iteration
	VMOVDQA  CC3, tmpStoreAVX2
	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA  tmpStoreAVX2, CC3
	polyMulStage1_AVX2
	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	polyMulStage2_AVX2
	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	polyMulStage3_AVX2
	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA  CC3, tmpStoreAVX2
	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA  tmpStoreAVX2, CC3
	polyMulReduceStage
	// Rotate lanes back: diagonal round finished, state re-aligned.
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
	CMPQ     itr1, $480
	JNE      openAVX2InternalLoop

	// Add back the initial state to produce the keystream blocks.
	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
	VMOVDQA CC3, tmpStoreAVX2

	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
	polyAdd(480(inp))
	polyMulAVX2
	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)

	// and here
	polyAdd(496(inp))
	polyMulAVX2
	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
	LEAQ       (32*16)(inp), inp
	LEAQ       (32*16)(oup), oup
	SUBQ       $(32*16), inl
	JMP        openAVX2MainLoop
   964  
openAVX2MainLoopDone:
	// Handle the various tail sizes efficiently
	// Dispatch on remaining length: 0 -> finalize, <=128 / <=256 / <=384 ->
	// dedicated tails, otherwise (385..511 bytes) the 512-byte tail.
	TESTQ inl, inl
	JE    openSSEFinalize
	CMPQ  inl, $128
	JBE   openAVX2Tail128
	CMPQ  inl, $256
	JBE   openAVX2Tail256
	CMPQ  inl, $384
	JBE   openAVX2Tail384
	JMP   openAVX2Tail512
   976  
   977  // ----------------------------------------------------------------------------
   978  // Special optimization for buffers smaller than 193 bytes
   979  openAVX2192:
   980  	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
   981  	VMOVDQA AA0, AA1
   982  	VMOVDQA BB0, BB1
   983  	VMOVDQA CC0, CC1
   984  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
   985  	VMOVDQA AA0, AA2
   986  	VMOVDQA BB0, BB2
   987  	VMOVDQA CC0, CC2
   988  	VMOVDQA DD0, DD2
   989  	VMOVDQA DD1, TT3
   990  	MOVQ    $10, itr2
   991  
   992  openAVX2192InnerCipherLoop:
   993  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   994  	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
   995  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   996  	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
   997  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   998  	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
   999  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1000  	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  1001  	DECQ       itr2
  1002  	JNE        openAVX2192InnerCipherLoop
  1003  	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
  1004  	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
  1005  	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  1006  	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  1007  	VPERM2I128 $0x02, AA0, BB0, TT0
  1008  
  1009  	// Clamp and store poly key
  1010  	VPAND   ·polyClampMask<>(SB), TT0, TT0
  1011  	VMOVDQA TT0, rsStoreAVX2
  1012  
  1013  	// Stream for up to 192 bytes
  1014  	VPERM2I128 $0x13, AA0, BB0, AA0
  1015  	VPERM2I128 $0x13, CC0, DD0, BB0
  1016  	VPERM2I128 $0x02, AA1, BB1, CC0
  1017  	VPERM2I128 $0x02, CC1, DD1, DD0
  1018  	VPERM2I128 $0x13, AA1, BB1, AA1
  1019  	VPERM2I128 $0x13, CC1, DD1, BB1
  1020  
openAVX2ShortOpen:
	// Shared tail for the 192/320-byte fast paths: hash the AD, then
	// hash-and-decrypt the ciphertext 32 bytes at a time from the keystream
	// queued in AA0,BB0,CC0,DD0,AA1,BB1,... (shifted left as it is consumed).
	// Hash
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)

openAVX2ShortOpenLoop:
	CMPQ inl, $32
	JB   openAVX2ShortTail32
	SUBQ $32, inl

	// Load for hashing
	polyAdd(0*8(inp))
	polyMulAVX2
	polyAdd(2*8(inp))
	polyMulAVX2

	// Load for decryption
	VPXOR   (inp), AA0, AA0
	VMOVDQU AA0, (oup)
	LEAQ    (1*32)(inp), inp
	LEAQ    (1*32)(oup), oup

	// Shift stream left
	VMOVDQA BB0, AA0
	VMOVDQA CC0, BB0
	VMOVDQA DD0, CC0
	VMOVDQA AA1, DD0
	VMOVDQA BB1, AA1
	VMOVDQA CC1, BB1
	VMOVDQA DD1, CC1
	VMOVDQA AA2, DD1
	VMOVDQA BB2, AA2
	JMP     openAVX2ShortOpenLoop

openAVX2ShortTail32:
	// Handle one remaining 16-byte block; A0/A1/T0 are the XMM (low-128)
	// views of the YMM registers (A0 = X0 per the defines above).
	CMPQ    inl, $16
	VMOVDQA A0, A1
	JB      openAVX2ShortDone

	SUBQ $16, inl

	// Load for hashing
	polyAdd(0*8(inp))
	polyMulAVX2

	// Load for decryption
	VPXOR      (inp), A0, T0
	VMOVDQU    T0, (oup)
	LEAQ       (1*16)(inp), inp
	LEAQ       (1*16)(oup), oup
	VPERM2I128 $0x11, AA0, AA0, AA0 // move high 128 bits down for the final partial block
	VMOVDQA    A0, A1

openAVX2ShortDone:
	VZEROUPPER
	JMP openSSETail16 // SSE code finishes the last <16 bytes and the tag check
  1077  
  1078  // ----------------------------------------------------------------------------
  1079  // Special optimization for buffers smaller than 321 bytes
  1080  openAVX2320:
  1081  	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  1082  	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  1083  	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  1084  	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  1085  	MOVQ    $10, itr2
  1086  
  1087  openAVX2320InnerCipherLoop:
  1088  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1089  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1090  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1091  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1092  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1093  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1094  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1095  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1096  	DECQ     itr2
  1097  	JNE      openAVX2320InnerCipherLoop
  1098  
  1099  	VMOVDQA ·chacha20Constants<>(SB), TT0
  1100  	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  1101  	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  1102  	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  1103  	VMOVDQA ·avx2IncMask<>(SB), TT0
  1104  	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  1105  	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  1106  	VPADDD  TT3, DD2, DD2
  1107  
  1108  	// Clamp and store poly key
  1109  	VPERM2I128 $0x02, AA0, BB0, TT0
  1110  	VPAND      ·polyClampMask<>(SB), TT0, TT0
  1111  	VMOVDQA    TT0, rsStoreAVX2
  1112  
  1113  	// Stream for up to 320 bytes
  1114  	VPERM2I128 $0x13, AA0, BB0, AA0
  1115  	VPERM2I128 $0x13, CC0, DD0, BB0
  1116  	VPERM2I128 $0x02, AA1, BB1, CC0
  1117  	VPERM2I128 $0x02, CC1, DD1, DD0
  1118  	VPERM2I128 $0x13, AA1, BB1, AA1
  1119  	VPERM2I128 $0x13, CC1, DD1, BB1
  1120  	VPERM2I128 $0x02, AA2, BB2, CC1
  1121  	VPERM2I128 $0x02, CC2, DD2, DD1
  1122  	VPERM2I128 $0x13, AA2, BB2, AA2
  1123  	VPERM2I128 $0x13, CC2, DD2, BB2
  1124  	JMP        openAVX2ShortOpen
  1125  
  1126  // ----------------------------------------------------------------------------
  1127  // Special optimization for the last 128 bytes of ciphertext
  1128  openAVX2Tail128:
  1129  	// Need to decrypt up to 128 bytes - prepare two blocks
  1130  	VMOVDQA ·chacha20Constants<>(SB), AA1
  1131  	VMOVDQA state1StoreAVX2, BB1
  1132  	VMOVDQA state2StoreAVX2, CC1
  1133  	VMOVDQA ctr3StoreAVX2, DD1
  1134  	VPADDD  ·avx2IncMask<>(SB), DD1, DD1
  1135  	VMOVDQA DD1, DD0
  1136  
  1137  	XORQ  itr2, itr2
  1138  	MOVQ  inl, itr1
  1139  	ANDQ  $-16, itr1
  1140  	TESTQ itr1, itr1
  1141  	JE    openAVX2Tail128LoopB
  1142  
  1143  openAVX2Tail128LoopA:
  1144  	// Perform ChaCha rounds, while hashing the remaining input
  1145  	polyAdd(0(inp)(itr2*1))
  1146  	polyMulAVX2
  1147  
  1148  openAVX2Tail128LoopB:
  1149  	ADDQ     $16, itr2
  1150  	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1151  	VPALIGNR $4, BB1, BB1, BB1
  1152  	VPALIGNR $8, CC1, CC1, CC1
  1153  	VPALIGNR $12, DD1, DD1, DD1
  1154  	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1155  	VPALIGNR $12, BB1, BB1, BB1
  1156  	VPALIGNR $8, CC1, CC1, CC1
  1157  	VPALIGNR $4, DD1, DD1, DD1
  1158  	CMPQ     itr2, itr1
  1159  	JB       openAVX2Tail128LoopA
  1160  	CMPQ     itr2, $160
  1161  	JNE      openAVX2Tail128LoopB
  1162  
  1163  	VPADDD     ·chacha20Constants<>(SB), AA1, AA1
  1164  	VPADDD     state1StoreAVX2, BB1, BB1
  1165  	VPADDD     state2StoreAVX2, CC1, CC1
  1166  	VPADDD     DD0, DD1, DD1
  1167  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1168  
  1169  openAVX2TailLoop:
  1170  	CMPQ inl, $32
  1171  	JB   openAVX2Tail
  1172  	SUBQ $32, inl
  1173  
  1174  	// Load for decryption
  1175  	VPXOR   (inp), AA0, AA0
  1176  	VMOVDQU AA0, (oup)
  1177  	LEAQ    (1*32)(inp), inp
  1178  	LEAQ    (1*32)(oup), oup
  1179  	VMOVDQA BB0, AA0
  1180  	VMOVDQA CC0, BB0
  1181  	VMOVDQA DD0, CC0
  1182  	JMP     openAVX2TailLoop
  1183  
  1184  openAVX2Tail:
  1185  	CMPQ    inl, $16
  1186  	VMOVDQA A0, A1
  1187  	JB      openAVX2TailDone
  1188  	SUBQ    $16, inl
  1189  
  1190  	// Load for decryption
  1191  	VPXOR      (inp), A0, T0
  1192  	VMOVDQU    T0, (oup)
  1193  	LEAQ       (1*16)(inp), inp
  1194  	LEAQ       (1*16)(oup), oup
  1195  	VPERM2I128 $0x11, AA0, AA0, AA0
  1196  	VMOVDQA    A0, A1
  1197  
  1198  openAVX2TailDone:
  1199  	VZEROUPPER
  1200  	JMP openSSETail16
  1201  
  1202  // ----------------------------------------------------------------------------
  1203  // Special optimization for the last 256 bytes of ciphertext
  1204  openAVX2Tail256:
  1205  	// Need to decrypt up to 256 bytes - prepare four blocks
  1206  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
  1207  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
  1208  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
  1209  	VMOVDQA ctr3StoreAVX2, DD0
  1210  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  1211  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  1212  	VMOVDQA DD0, TT1
  1213  	VMOVDQA DD1, TT2
  1214  
  1215  	// Compute the number of iterations that will hash data
  1216  	MOVQ    inl, tmpStoreAVX2
  1217  	MOVQ    inl, itr1
  1218  	SUBQ    $128, itr1
  1219  	SHRQ    $4, itr1
  1220  	MOVQ    $10, itr2
  1221  	CMPQ    itr1, $10
  1222  	CMOVQGT itr2, itr1
  1223  	MOVQ    inp, inl
  1224  	XORQ    itr2, itr2
  1225  
  1226  openAVX2Tail256LoopA:
  1227  	polyAdd(0(inl))
  1228  	polyMulAVX2
  1229  	LEAQ 16(inl), inl
  1230  
  1231  	// Perform ChaCha rounds, while hashing the remaining input
  1232  openAVX2Tail256LoopB:
  1233  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1234  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  1235  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1236  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  1237  	INCQ     itr2
  1238  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1239  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  1240  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1241  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  1242  	CMPQ     itr2, itr1
  1243  	JB       openAVX2Tail256LoopA
  1244  
  1245  	CMPQ itr2, $10
  1246  	JNE  openAVX2Tail256LoopB
  1247  
  1248  	MOVQ inl, itr2
  1249  	SUBQ inp, inl
  1250  	MOVQ inl, itr1
  1251  	MOVQ tmpStoreAVX2, inl
  1252  
  1253  	// Hash the remainder of data (if any)
  1254  openAVX2Tail256Hash:
  1255  	ADDQ $16, itr1
  1256  	CMPQ itr1, inl
  1257  	JGT  openAVX2Tail256HashEnd
  1258  	polyAdd (0(itr2))
  1259  	polyMulAVX2
  1260  	LEAQ 16(itr2), itr2
  1261  	JMP  openAVX2Tail256Hash
  1262  
  1263  // Store 128 bytes safely, then go to store loop
  1264  openAVX2Tail256HashEnd:
  1265  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  1266  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  1267  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  1268  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  1269  	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
  1270  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1271  
  1272  	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
  1273  	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
  1274  	LEAQ    (4*32)(inp), inp
  1275  	LEAQ    (4*32)(oup), oup
  1276  	SUBQ    $4*32, inl
  1277  
  1278  	JMP openAVX2TailLoop
  1279  
  1280  // ----------------------------------------------------------------------------
  1281  // Special optimization for the last 384 bytes of ciphertext
  1282  openAVX2Tail384:
  1283  	// Need to decrypt up to 384 bytes - prepare six blocks
  1284  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  1285  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  1286  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  1287  	VMOVDQA ctr3StoreAVX2, DD0
  1288  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  1289  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  1290  	VPADDD  ·avx2IncMask<>(SB), DD1, DD2
  1291  	VMOVDQA DD0, ctr0StoreAVX2
  1292  	VMOVDQA DD1, ctr1StoreAVX2
  1293  	VMOVDQA DD2, ctr2StoreAVX2
  1294  
  1295  	// Compute the number of iterations that will hash two blocks of data
  1296  	MOVQ    inl, tmpStoreAVX2
  1297  	MOVQ    inl, itr1
  1298  	SUBQ    $256, itr1
  1299  	SHRQ    $4, itr1
  1300  	ADDQ    $6, itr1
  1301  	MOVQ    $10, itr2
  1302  	CMPQ    itr1, $10
  1303  	CMOVQGT itr2, itr1
  1304  	MOVQ    inp, inl
  1305  	XORQ    itr2, itr2
  1306  
  1307  	// Perform ChaCha rounds, while hashing the remaining input
  1308  openAVX2Tail384LoopB:
  1309  	polyAdd(0(inl))
  1310  	polyMulAVX2
  1311  	LEAQ 16(inl), inl
  1312  
  1313  openAVX2Tail384LoopA:
  1314  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1315  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1316  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1317  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1318  	polyAdd(0(inl))
  1319  	polyMulAVX2
  1320  	LEAQ     16(inl), inl
  1321  	INCQ     itr2
  1322  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1323  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1324  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1325  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1326  
  1327  	CMPQ itr2, itr1
  1328  	JB   openAVX2Tail384LoopB
  1329  
  1330  	CMPQ itr2, $10
  1331  	JNE  openAVX2Tail384LoopA
  1332  
  1333  	MOVQ inl, itr2
  1334  	SUBQ inp, inl
  1335  	MOVQ inl, itr1
  1336  	MOVQ tmpStoreAVX2, inl
  1337  
  1338  openAVX2Tail384Hash:
  1339  	ADDQ $16, itr1
  1340  	CMPQ itr1, inl
  1341  	JGT  openAVX2Tail384HashEnd
  1342  	polyAdd(0(itr2))
  1343  	polyMulAVX2
  1344  	LEAQ 16(itr2), itr2
  1345  	JMP  openAVX2Tail384Hash
  1346  
  1347  // Store 256 bytes safely, then go to store loop
  1348  openAVX2Tail384HashEnd:
  1349  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  1350  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  1351  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  1352  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
  1353  	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
  1354  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  1355  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  1356  	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
  1357  	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  1358  	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  1359  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1360  	LEAQ       (8*32)(inp), inp
  1361  	LEAQ       (8*32)(oup), oup
  1362  	SUBQ       $8*32, inl
  1363  	JMP        openAVX2TailLoop
  1364  
  1365  // ----------------------------------------------------------------------------
  1366  // Special optimization for the last 512 bytes of ciphertext
  1367  openAVX2Tail512:
  1368  	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1369  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  1370  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  1371  	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  1372  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  1373  	XORQ    itr1, itr1
  1374  	MOVQ    inp, itr2
  1375  
  1376  openAVX2Tail512LoopB:
  1377  	polyAdd(0(itr2))
  1378  	polyMulAVX2
  1379  	LEAQ (2*8)(itr2), itr2
  1380  
  1381  openAVX2Tail512LoopA:
  1382  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1383  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1384  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1385  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1386  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1387  	VMOVDQA  CC3, tmpStoreAVX2
  1388  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1389  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1390  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1391  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1392  	VMOVDQA  tmpStoreAVX2, CC3
  1393  	polyAdd(0*8(itr2))
  1394  	polyMulAVX2
  1395  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1396  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1397  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1398  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1399  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1400  	VMOVDQA  CC3, tmpStoreAVX2
  1401  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1402  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1403  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1404  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1405  	VMOVDQA  tmpStoreAVX2, CC3
  1406  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  1407  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1408  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  1409  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1410  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1411  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1412  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1413  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1414  	polyAdd(2*8(itr2))
  1415  	polyMulAVX2
  1416  	LEAQ     (4*8)(itr2), itr2
  1417  	VMOVDQA  CC3, tmpStoreAVX2
  1418  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1419  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1420  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1421  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1422  	VMOVDQA  tmpStoreAVX2, CC3
  1423  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1424  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1425  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1426  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1427  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1428  	VMOVDQA  CC3, tmpStoreAVX2
  1429  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1430  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1431  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1432  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1433  	VMOVDQA  tmpStoreAVX2, CC3
  1434  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  1435  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1436  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  1437  	INCQ     itr1
  1438  	CMPQ     itr1, $4
  1439  	JLT      openAVX2Tail512LoopB
  1440  
  1441  	CMPQ itr1, $10
  1442  	JNE  openAVX2Tail512LoopA
  1443  
  1444  	MOVQ inl, itr1
  1445  	SUBQ $384, itr1
  1446  	ANDQ $-16, itr1
  1447  
  1448  openAVX2Tail512HashLoop:
  1449  	TESTQ itr1, itr1
  1450  	JE    openAVX2Tail512HashEnd
  1451  	polyAdd(0(itr2))
  1452  	polyMulAVX2
  1453  	LEAQ  16(itr2), itr2
  1454  	SUBQ  $16, itr1
  1455  	JMP   openAVX2Tail512HashLoop
  1456  
  1457  openAVX2Tail512HashEnd:
  1458  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  1459  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  1460  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  1461  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  1462  	VMOVDQA    CC3, tmpStoreAVX2
  1463  	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  1464  	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  1465  	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  1466  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1467  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  1468  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  1469  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1470  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  1471  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  1472  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  1473  
  1474  	LEAQ (12*32)(inp), inp
  1475  	LEAQ (12*32)(oup), oup
  1476  	SUBQ $12*32, inl
  1477  
  1478  	JMP openAVX2TailLoop
  1479  
  1480  // ----------------------------------------------------------------------------
  1481  // ----------------------------------------------------------------------------
  1482  // func chacha20Poly1305Seal(dst, key, src, ad []byte)
  1483  TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
  1484  	// For aligned stack access
  1485  	MOVQ SP, BP
  1486  	ADDQ $32, BP
  1487  	ANDQ $-32, BP
  1488  	MOVQ dst+0(FP), oup
  1489  	MOVQ key+24(FP), keyp
  1490  	MOVQ src+48(FP), inp
  1491  	MOVQ src_len+56(FP), inl
  1492  	MOVQ ad+72(FP), adp
  1493  
  1494  	// Check for AVX2 support
  1495  	CMPB runtime·support_avx2(SB), $0
  1496  	JE   noavx2bmi2Seal
  1497  
  1498  	// Check BMI2 bit for MULXQ.
  1499  	// runtime·cpuid_ebx7 is always available here
  1500  	// because it passed avx2 check
  1501  	TESTL $(1<<8), runtime·cpuid_ebx7(SB)
  1502  	JNE   chacha20Poly1305Seal_AVX2
  1503  noavx2bmi2Seal:
  1504  
  1505  	// Special optimization, for very short buffers
  1506  	CMPQ inl, $128
  1507  	JBE  sealSSE128 // About 15% faster
  1508  
  1509  	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
  1510  	MOVOU ·chacha20Constants<>(SB), A0
  1511  	MOVOU (1*16)(keyp), B0
  1512  	MOVOU (2*16)(keyp), C0
  1513  	MOVOU (3*16)(keyp), D0
  1514  
  1515  	// Store state on stack for future use
  1516  	MOVO B0, state1Store
  1517  	MOVO C0, state2Store
  1518  
  1519  	// Load state, increment counter blocks
  1520  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1521  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1522  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1523  
  1524  	// Store counters
  1525  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1526  	MOVQ $10, itr2
  1527  
  1528  sealSSEIntroLoop:
  1529  	MOVO         C3, tmpStore
  1530  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1531  	MOVO         tmpStore, C3
  1532  	MOVO         C1, tmpStore
  1533  	chachaQR(A3, B3, C3, D3, C1)
  1534  	MOVO         tmpStore, C1
  1535  	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  1536  	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  1537  	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  1538  
  1539  	MOVO          C3, tmpStore
  1540  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1541  	MOVO          tmpStore, C3
  1542  	MOVO          C1, tmpStore
  1543  	chachaQR(A3, B3, C3, D3, C1)
  1544  	MOVO          tmpStore, C1
  1545  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1546  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1547  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1548  	DECQ          itr2
  1549  	JNE           sealSSEIntroLoop
  1550  
  1551  	// Add in the state
  1552  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1553  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1554  	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1555  	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1556  
  1557  	// Clamp and store the key
  1558  	PAND ·polyClampMask<>(SB), A0
  1559  	MOVO A0, rStore
  1560  	MOVO B0, sStore
  1561  
  1562  	// Hash AAD
  1563  	MOVQ ad_len+80(FP), itr2
  1564  	CALL polyHashADInternal<>(SB)
  1565  
  1566  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1567  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1568  	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
  1569  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1570  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1571  	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
  1572  
  1573  	MOVQ $128, itr1
  1574  	SUBQ $128, inl
  1575  	LEAQ 128(inp), inp
  1576  
  1577  	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
  1578  
  1579  	CMPQ inl, $64
  1580  	JBE  sealSSE128SealHash
  1581  
  1582  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1583  	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1584  	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
  1585  
  1586  	ADDQ $64, itr1
  1587  	SUBQ $64, inl
  1588  	LEAQ 64(inp), inp
  1589  
  1590  	MOVQ $2, itr1
  1591  	MOVQ $8, itr2
  1592  
  1593  	CMPQ inl, $64
  1594  	JBE  sealSSETail64
  1595  	CMPQ inl, $128
  1596  	JBE  sealSSETail128
  1597  	CMPQ inl, $192
  1598  	JBE  sealSSETail192
  1599  
  1600  sealSSEMainLoop:
  1601  	// Load state, increment counter blocks
  1602  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
  1603  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1604  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1605  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1606  
  1607  	// Store counters
  1608  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1609  
  1610  sealSSEInnerLoop:
  1611  	MOVO          C3, tmpStore
  1612  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1613  	MOVO          tmpStore, C3
  1614  	MOVO          C1, tmpStore
  1615  	chachaQR(A3, B3, C3, D3, C1)
  1616  	MOVO          tmpStore, C1
  1617  	polyAdd(0(oup))
  1618  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
  1619  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
  1620  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
  1621  	polyMulStage1
  1622  	polyMulStage2
  1623  	LEAQ          (2*8)(oup), oup
  1624  	MOVO          C3, tmpStore
  1625  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1626  	MOVO          tmpStore, C3
  1627  	MOVO          C1, tmpStore
  1628  	polyMulStage3
  1629  	chachaQR(A3, B3, C3, D3, C1)
  1630  	MOVO          tmpStore, C1
  1631  	polyMulReduceStage
  1632  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1633  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1634  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1635  	DECQ          itr2
  1636  	JGE           sealSSEInnerLoop
  1637  	polyAdd(0(oup))
  1638  	polyMul
  1639  	LEAQ          (2*8)(oup), oup
  1640  	DECQ          itr1
  1641  	JG            sealSSEInnerLoop
  1642  
  1643  	// Add in the state
  1644  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1645  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1646  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1647  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1648  	MOVO  D3, tmpStore
  1649  
  1650  	// Load - xor - store
  1651  	MOVOU (0*16)(inp), D3; PXOR D3, A0
  1652  	MOVOU (1*16)(inp), D3; PXOR D3, B0
  1653  	MOVOU (2*16)(inp), D3; PXOR D3, C0
  1654  	MOVOU (3*16)(inp), D3; PXOR D3, D0
  1655  	MOVOU A0, (0*16)(oup)
  1656  	MOVOU B0, (1*16)(oup)
  1657  	MOVOU C0, (2*16)(oup)
  1658  	MOVOU D0, (3*16)(oup)
  1659  	MOVO  tmpStore, D3
  1660  
  1661  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1662  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1663  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1664  	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
  1665  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1666  	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
  1667  	ADDQ  $192, inp
  1668  	MOVQ  $192, itr1
  1669  	SUBQ  $192, inl
  1670  	MOVO  A3, A1
  1671  	MOVO  B3, B1
  1672  	MOVO  C3, C1
  1673  	MOVO  D3, D1
  1674  	CMPQ  inl, $64
  1675  	JBE   sealSSE128SealHash
  1676  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1677  	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1678  	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
  1679  	LEAQ  64(inp), inp
  1680  	SUBQ  $64, inl
  1681  	MOVQ  $6, itr1
  1682  	MOVQ  $4, itr2
  1683  	CMPQ  inl, $192
  1684  	JG    sealSSEMainLoop
  1685  
  1686  	MOVQ  inl, itr1
  1687  	TESTQ inl, inl
  1688  	JE    sealSSE128SealHash
  1689  	MOVQ  $6, itr1
  1690  	CMPQ  inl, $64
  1691  	JBE   sealSSETail64
  1692  	CMPQ  inl, $128
  1693  	JBE   sealSSETail128
  1694  	JMP   sealSSETail192
  1695  
  1696  // ----------------------------------------------------------------------------
  1697  // Special optimization for the last 64 bytes of plaintext
  1698  sealSSETail64:
  1699  	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
  1700  	MOVO  ·chacha20Constants<>(SB), A1
  1701  	MOVO  state1Store, B1
  1702  	MOVO  state2Store, C1
  1703  	MOVO  ctr3Store, D1
  1704  	PADDL ·sseIncMask<>(SB), D1
  1705  	MOVO  D1, ctr0Store
  1706  
  1707  sealSSETail64LoopA:
  1708  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1709  	polyAdd(0(oup))
  1710  	polyMul
  1711  	LEAQ 16(oup), oup
  1712  
  1713  sealSSETail64LoopB:
  1714  	chachaQR(A1, B1, C1, D1, T1)
  1715  	shiftB1Left;  shiftC1Left; shiftD1Left
  1716  	chachaQR(A1, B1, C1, D1, T1)
  1717  	shiftB1Right; shiftC1Right; shiftD1Right
  1718  	polyAdd(0(oup))
  1719  	polyMul
  1720  	LEAQ          16(oup), oup
  1721  
  1722  	DECQ itr1
  1723  	JG   sealSSETail64LoopA
  1724  
  1725  	DECQ  itr2
  1726  	JGE   sealSSETail64LoopB
  1727  	PADDL ·chacha20Constants<>(SB), A1
  1728  	PADDL state1Store, B1
  1729  	PADDL state2Store, C1
  1730  	PADDL ctr0Store, D1
  1731  
  1732  	JMP sealSSE128Seal
  1733  
  1734  // ----------------------------------------------------------------------------
  1735  // Special optimization for the last 128 bytes of plaintext
  1736  sealSSETail128:
  1737  	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
  1738  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1739  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1740  
  1741  sealSSETail128LoopA:
  1742  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1743  	polyAdd(0(oup))
  1744  	polyMul
  1745  	LEAQ 16(oup), oup
  1746  
  1747  sealSSETail128LoopB:
  1748  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1749  	shiftB0Left;  shiftC0Left; shiftD0Left
  1750  	shiftB1Left;  shiftC1Left; shiftD1Left
  1751  	polyAdd(0(oup))
  1752  	polyMul
  1753  	LEAQ          16(oup), oup
  1754  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1755  	shiftB0Right; shiftC0Right; shiftD0Right
  1756  	shiftB1Right; shiftC1Right; shiftD1Right
  1757  
  1758  	DECQ itr1
  1759  	JG   sealSSETail128LoopA
  1760  
  1761  	DECQ itr2
  1762  	JGE  sealSSETail128LoopB
  1763  
  1764  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
  1765  	PADDL state1Store, B0; PADDL state1Store, B1
  1766  	PADDL state2Store, C0; PADDL state2Store, C1
  1767  	PADDL ctr0Store, D0; PADDL ctr1Store, D1
  1768  
  1769  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1770  	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1771  	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1772  
  1773  	MOVQ $64, itr1
  1774  	LEAQ 64(inp), inp
  1775  	SUBQ $64, inl
  1776  
  1777  	JMP sealSSE128SealHash
  1778  
  1779  // ----------------------------------------------------------------------------
  1780  // Special optimization for the last 192 bytes of plaintext
  1781  sealSSETail192:
  1782  	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
  1783  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1784  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1785  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
  1786  
  1787  sealSSETail192LoopA:
  1788  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1789  	polyAdd(0(oup))
  1790  	polyMul
  1791  	LEAQ 16(oup), oup
  1792  
  1793  sealSSETail192LoopB:
  1794  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1795  	shiftB0Left; shiftC0Left; shiftD0Left
  1796  	shiftB1Left; shiftC1Left; shiftD1Left
  1797  	shiftB2Left; shiftC2Left; shiftD2Left
  1798  
  1799  	polyAdd(0(oup))
  1800  	polyMul
  1801  	LEAQ 16(oup), oup
  1802  
  1803  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1804  	shiftB0Right; shiftC0Right; shiftD0Right
  1805  	shiftB1Right; shiftC1Right; shiftD1Right
  1806  	shiftB2Right; shiftC2Right; shiftD2Right
  1807  
  1808  	DECQ itr1
  1809  	JG   sealSSETail192LoopA
  1810  
  1811  	DECQ itr2
  1812  	JGE  sealSSETail192LoopB
  1813  
  1814  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1815  	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
  1816  	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
  1817  	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
  1818  
  1819  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1820  	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1821  	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1822  	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
  1823  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
  1824  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1825  
  1826  	MOVO A2, A1
  1827  	MOVO B2, B1
  1828  	MOVO C2, C1
  1829  	MOVO D2, D1
  1830  	MOVQ $128, itr1
  1831  	LEAQ 128(inp), inp
  1832  	SUBQ $128, inl
  1833  
  1834  	JMP sealSSE128SealHash
  1835  
  1836  // ----------------------------------------------------------------------------
  1837  // Special seal optimization for buffers smaller than 129 bytes
  1838  sealSSE128:
  1839  	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
  1840  	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
  1841  	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1842  	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1843  	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
  1844  	MOVQ  $10, itr2
  1845  
  1846  sealSSE128InnerCipherLoop:
  1847  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1848  	shiftB0Left;  shiftB1Left; shiftB2Left
  1849  	shiftC0Left;  shiftC1Left; shiftC2Left
  1850  	shiftD0Left;  shiftD1Left; shiftD2Left
  1851  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1852  	shiftB0Right; shiftB1Right; shiftB2Right
  1853  	shiftC0Right; shiftC1Right; shiftC2Right
  1854  	shiftD0Right; shiftD1Right; shiftD2Right
  1855  	DECQ          itr2
  1856  	JNE           sealSSE128InnerCipherLoop
  1857  
  1858  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  1859  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1860  	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
  1861  	PADDL T2, C1; PADDL T2, C2
  1862  	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
  1863  	PAND  ·polyClampMask<>(SB), A0
  1864  	MOVOU A0, rStore
  1865  	MOVOU B0, sStore
  1866  
  1867  	// Hash
  1868  	MOVQ ad_len+80(FP), itr2
  1869  	CALL polyHashADInternal<>(SB)
  1870  	XORQ itr1, itr1
  1871  
  1872  sealSSE128SealHash:
  1873  	// itr1 holds the number of bytes encrypted but not yet hashed
  1874  	CMPQ itr1, $16
  1875  	JB   sealSSE128Seal
  1876  	polyAdd(0(oup))
  1877  	polyMul
  1878  
  1879  	SUBQ $16, itr1
  1880  	ADDQ $16, oup
  1881  
  1882  	JMP sealSSE128SealHash
  1883  
  1884  sealSSE128Seal:
  1885  	CMPQ inl, $16
  1886  	JB   sealSSETail
  1887  	SUBQ $16, inl
  1888  
  1889  	// Load plaintext for encryption
  1890  	MOVOU (inp), T0
  1891  	PXOR  T0, A1
  1892  	MOVOU A1, (oup)
  1893  	LEAQ  (1*16)(inp), inp
  1894  	LEAQ  (1*16)(oup), oup
  1895  
  1896  	// Extract for hashing
  1897  	MOVQ   A1, t0
  1898  	PSRLDQ $8, A1
  1899  	MOVQ A1, t1
  1900  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1901  	polyMul
  1902  
  1903  	// Shift the stream "left"
  1904  	MOVO B1, A1
  1905  	MOVO C1, B1
  1906  	MOVO D1, C1
  1907  	MOVO A2, D1
  1908  	MOVO B2, A2
  1909  	MOVO C2, B2
  1910  	MOVO D2, C2
  1911  	JMP  sealSSE128Seal
  1912  
  1913  sealSSETail:
  1914  	TESTQ inl, inl
  1915  	JE    sealSSEFinalize
  1916  
  1917  	// We can only load the PT one byte at a time to avoid read after end of buffer
  1918  	MOVQ inl, itr2
  1919  	SHLQ $4, itr2
  1920  	LEAQ ·andMask<>(SB), t0
  1921  	MOVQ inl, itr1
  1922  	LEAQ -1(inp)(inl*1), inp
  1923  	XORQ t2, t2
  1924  	XORQ t3, t3
  1925  	XORQ AX, AX
  1926  
  1927  sealSSETailLoadLoop:
  1928  	SHLQ $8, t2, t3
  1929  	SHLQ $8, t2
  1930  	MOVB (inp), AX
  1931  	XORQ AX, t2
  1932  	LEAQ   -1(inp), inp
  1933  	DECQ   itr1
  1934  	JNE    sealSSETailLoadLoop
  1935  	MOVQ t2, 0+tmpStore
  1936  	MOVQ t3, 8+tmpStore
  1937  	PXOR 0+tmpStore, A1
  1938  	MOVOU  A1, (oup)
  1939  	MOVOU  -16(t0)(itr2*1), T0
  1940  	PAND   T0, A1
  1941  	MOVQ   A1, t0
  1942  	PSRLDQ $8, A1
  1943  	MOVQ   A1, t1
  1944  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1945  	polyMul
  1946  
  1947  	ADDQ inl, oup
  1948  
  1949  sealSSEFinalize:
  1950  	// Hash in the buffer lengths
  1951  	ADDQ ad_len+80(FP), acc0
  1952  	ADCQ src_len+56(FP), acc1
  1953  	ADCQ $1, acc2
  1954  	polyMul
  1955  
  1956  	// Final reduce
  1957  	MOVQ    acc0, t0
  1958  	MOVQ    acc1, t1
  1959  	MOVQ    acc2, t2
  1960  	SUBQ    $-5, acc0
  1961  	SBBQ    $-1, acc1
  1962  	SBBQ    $3, acc2
  1963  	CMOVQCS t0, acc0
  1964  	CMOVQCS t1, acc1
  1965  	CMOVQCS t2, acc2
  1966  
  1967  	// Add in the "s" part of the key
  1968  	ADDQ 0+sStore, acc0
  1969  	ADCQ 8+sStore, acc1
  1970  
  1971  	// Finally store the tag at the end of the message
  1972  	MOVQ acc0, (0*8)(oup)
  1973  	MOVQ acc1, (1*8)(oup)
  1974  	RET
  1975  
  1976  // ----------------------------------------------------------------------------
  1977  // ------------------------- AVX2 Code ----------------------------------------
  1978  chacha20Poly1305Seal_AVX2:
  1979  	VZEROUPPER
  1980  	VMOVDQU ·chacha20Constants<>(SB), AA0
  1981  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
  1982  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
  1983  	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
  1984  	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
  1985  
  1986  	// Special optimizations, for very short buffers
  1987  	CMPQ inl, $192
  1988  	JBE  seal192AVX2 // 33% faster
  1989  	CMPQ inl, $320
  1990  	JBE  seal320AVX2 // 17% faster
  1991  
  1992  	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
  1993  	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1994  	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
  1995  	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
  1996  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
  1997  	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
  1998  	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
  1999  	VMOVDQA DD3, ctr3StoreAVX2
  2000  	MOVQ    $10, itr2
  2001  
  2002  sealAVX2IntroLoop:
  2003  	VMOVDQA CC3, tmpStoreAVX2
  2004  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2005  	VMOVDQA tmpStoreAVX2, CC3
  2006  	VMOVDQA CC1, tmpStoreAVX2
  2007  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2008  	VMOVDQA tmpStoreAVX2, CC1
  2009  
  2010  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  2011  	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  2012  	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  2013  	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  2014  
  2015  	VMOVDQA CC3, tmpStoreAVX2
  2016  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2017  	VMOVDQA tmpStoreAVX2, CC3
  2018  	VMOVDQA CC1, tmpStoreAVX2
  2019  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2020  	VMOVDQA tmpStoreAVX2, CC1
  2021  
  2022  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  2023  	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  2024  	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  2025  	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  2026  	DECQ     itr2
  2027  	JNE      sealAVX2IntroLoop
  2028  
  2029  	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2030  	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2031  	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2032  	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2033  
  2034  	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
  2035  	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
  2036  	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
  2037  
  2038  	// Clamp and store poly key
  2039  	VPAND   ·polyClampMask<>(SB), DD0, DD0
  2040  	VMOVDQA DD0, rsStoreAVX2
  2041  
  2042  	// Hash AD
  2043  	MOVQ ad_len+80(FP), itr2
  2044  	CALL polyHashADInternal<>(SB)
  2045  
  2046  	// Can store at least 320 bytes
  2047  	VPXOR   (0*32)(inp), AA0, AA0
  2048  	VPXOR   (1*32)(inp), CC0, CC0
  2049  	VMOVDQU AA0, (0*32)(oup)
  2050  	VMOVDQU CC0, (1*32)(oup)
  2051  
  2052  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  2053  	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
  2054  	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
  2055  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  2056  	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
  2057  	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
  2058  
  2059  	MOVQ $320, itr1
  2060  	SUBQ $320, inl
  2061  	LEAQ 320(inp), inp
  2062  
  2063  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
  2064  	CMPQ       inl, $128
  2065  	JBE        sealAVX2SealHash
  2066  
  2067  	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
  2068  	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
  2069  	SUBQ    $128, inl
  2070  	LEAQ    128(inp), inp
  2071  
  2072  	MOVQ $8, itr1
  2073  	MOVQ $2, itr2
  2074  
  2075  	CMPQ inl, $128
  2076  	JBE  sealAVX2Tail128
  2077  	CMPQ inl, $256
  2078  	JBE  sealAVX2Tail256
  2079  	CMPQ inl, $384
  2080  	JBE  sealAVX2Tail384
  2081  	CMPQ inl, $512
  2082  	JBE  sealAVX2Tail512
  2083  
  2084  	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
  2085  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2086  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2087  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2088  	VMOVDQA ctr3StoreAVX2, DD0
  2089  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2090  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2091  
  2092  	VMOVDQA CC3, tmpStoreAVX2
  2093  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2094  	VMOVDQA tmpStoreAVX2, CC3
  2095  	VMOVDQA CC1, tmpStoreAVX2
  2096  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2097  	VMOVDQA tmpStoreAVX2, CC1
  2098  
  2099  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  2100  	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  2101  	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  2102  	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  2103  
  2104  	VMOVDQA CC3, tmpStoreAVX2
  2105  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2106  	VMOVDQA tmpStoreAVX2, CC3
  2107  	VMOVDQA CC1, tmpStoreAVX2
  2108  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2109  	VMOVDQA tmpStoreAVX2, CC1
  2110  
  2111  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  2112  	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  2113  	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  2114  	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  2115  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2116  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2117  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2118  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2119  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2120  	VMOVDQA  CC3, tmpStoreAVX2
  2121  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2122  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2123  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2124  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2125  	VMOVDQA  tmpStoreAVX2, CC3
  2126  
  2127  	SUBQ $16, oup                  // Adjust the pointer
  2128  	MOVQ $9, itr1
  2129  	JMP  sealAVX2InternalLoopStart
  2130  
  2131  sealAVX2MainLoop:
  2132  	// Load state, increment counter blocks, store the incremented counters
  2133  	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2134  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2135  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2136  	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2137  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2138  	MOVQ    $10, itr1
  2139  
  2140  sealAVX2InternalLoop:
  2141  	polyAdd(0*8(oup))
  2142  	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2143  	polyMulStage1_AVX2
  2144  	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2145  	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2146  	polyMulStage2_AVX2
  2147  	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2148  	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2149  	polyMulStage3_AVX2
  2150  	VMOVDQA CC3, tmpStoreAVX2
  2151  	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2152  	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2153  	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2154  	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2155  	VMOVDQA tmpStoreAVX2, CC3
  2156  	polyMulReduceStage
  2157  
  2158  sealAVX2InternalLoopStart:
  2159  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2160  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2161  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2162  	polyAdd(2*8(oup))
  2163  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2164  	polyMulStage1_AVX2
  2165  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2166  	VMOVDQA  CC3, tmpStoreAVX2
  2167  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2168  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2169  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2170  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2171  	VMOVDQA  tmpStoreAVX2, CC3
  2172  	polyMulStage2_AVX2
  2173  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  2174  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2175  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  2176  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2177  	polyMulStage3_AVX2
  2178  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2179  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2180  	polyMulReduceStage
  2181  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2182  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2183  	polyAdd(4*8(oup))
  2184  	LEAQ     (6*8)(oup), oup
  2185  	VMOVDQA  CC3, tmpStoreAVX2
  2186  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2187  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2188  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2189  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2190  	VMOVDQA  tmpStoreAVX2, CC3
  2191  	polyMulStage1_AVX2
  2192  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2193  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2194  	polyMulStage2_AVX2
  2195  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2196  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2197  	polyMulStage3_AVX2
  2198  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2199  	VMOVDQA  CC3, tmpStoreAVX2
  2200  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2201  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2202  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2203  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2204  	VMOVDQA  tmpStoreAVX2, CC3
  2205  	polyMulReduceStage
  2206  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  2207  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2208  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  2209  	DECQ     itr1
  2210  	JNE      sealAVX2InternalLoop
  2211  
  2212  	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2213  	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2214  	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2215  	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2216  	VMOVDQA CC3, tmpStoreAVX2
  2217  
  2218  	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
  2219  	polyAdd(0*8(oup))
  2220  	polyMulAVX2
  2221  	LEAQ       (4*8)(oup), oup
  2222  	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  2223  	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  2224  	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  2225  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  2226  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  2227  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  2228  
  2229  	// and here
  2230  	polyAdd(-2*8(oup))
  2231  	polyMulAVX2
  2232  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  2233  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  2234  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  2235  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  2236  	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
  2237  	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
  2238  	LEAQ       (32*16)(inp), inp
  2239  	SUBQ       $(32*16), inl
  2240  	CMPQ       inl, $512
  2241  	JG         sealAVX2MainLoop
  2242  
  2243  	// Tail can only hash 480 bytes
  2244  	polyAdd(0*8(oup))
  2245  	polyMulAVX2
  2246  	polyAdd(2*8(oup))
  2247  	polyMulAVX2
  2248  	LEAQ 32(oup), oup
  2249  
  2250  	MOVQ $10, itr1
  2251  	MOVQ $0, itr2
  2252  	CMPQ inl, $128
  2253  	JBE  sealAVX2Tail128
  2254  	CMPQ inl, $256
  2255  	JBE  sealAVX2Tail256
  2256  	CMPQ inl, $384
  2257  	JBE  sealAVX2Tail384
  2258  	JMP  sealAVX2Tail512
  2259  
  2260  // ----------------------------------------------------------------------------
  2261  // Special optimization for buffers smaller than 193 bytes
  2262  seal192AVX2:
  2263  	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
  2264  	VMOVDQA AA0, AA1
  2265  	VMOVDQA BB0, BB1
  2266  	VMOVDQA CC0, CC1
  2267  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  2268  	VMOVDQA AA0, AA2
  2269  	VMOVDQA BB0, BB2
  2270  	VMOVDQA CC0, CC2
  2271  	VMOVDQA DD0, DD2
  2272  	VMOVDQA DD1, TT3
  2273  	MOVQ    $10, itr2
  2274  
  2275  sealAVX2192InnerCipherLoop:
  2276  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2277  	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2278  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2279  	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2280  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2281  	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2282  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2283  	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2284  	DECQ       itr2
  2285  	JNE        sealAVX2192InnerCipherLoop
  2286  	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
  2287  	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
  2288  	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  2289  	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  2290  	VPERM2I128 $0x02, AA0, BB0, TT0
  2291  
  2292  	// Clamp and store poly key
  2293  	VPAND   ·polyClampMask<>(SB), TT0, TT0
  2294  	VMOVDQA TT0, rsStoreAVX2
  2295  
  2296  	// Stream for up to 192 bytes
  2297  	VPERM2I128 $0x13, AA0, BB0, AA0
  2298  	VPERM2I128 $0x13, CC0, DD0, BB0
  2299  	VPERM2I128 $0x02, AA1, BB1, CC0
  2300  	VPERM2I128 $0x02, CC1, DD1, DD0
  2301  	VPERM2I128 $0x13, AA1, BB1, AA1
  2302  	VPERM2I128 $0x13, CC1, DD1, BB1
  2303  
  2304  sealAVX2ShortSeal:
  2305  	// Hash aad
  2306  	MOVQ ad_len+80(FP), itr2
  2307  	CALL polyHashADInternal<>(SB)
  2308  	XORQ itr1, itr1
  2309  
  2310  sealAVX2SealHash:
  2311  	// itr1 holds the number of bytes encrypted but not yet hashed
  2312  	CMPQ itr1, $16
  2313  	JB   sealAVX2ShortSealLoop
  2314  	polyAdd(0(oup))
  2315  	polyMul
  2316  	SUBQ $16, itr1
  2317  	ADDQ $16, oup
  2318  	JMP  sealAVX2SealHash
  2319  
  2320  sealAVX2ShortSealLoop:
  2321  	CMPQ inl, $32
  2322  	JB   sealAVX2ShortTail32
  2323  	SUBQ $32, inl
  2324  
  2325  	// Load for encryption
  2326  	VPXOR   (inp), AA0, AA0
  2327  	VMOVDQU AA0, (oup)
  2328  	LEAQ    (1*32)(inp), inp
  2329  
  2330  	// Now can hash
  2331  	polyAdd(0*8(oup))
  2332  	polyMulAVX2
  2333  	polyAdd(2*8(oup))
  2334  	polyMulAVX2
  2335  	LEAQ (1*32)(oup), oup
  2336  
  2337  	// Shift stream left
  2338  	VMOVDQA BB0, AA0
  2339  	VMOVDQA CC0, BB0
  2340  	VMOVDQA DD0, CC0
  2341  	VMOVDQA AA1, DD0
  2342  	VMOVDQA BB1, AA1
  2343  	VMOVDQA CC1, BB1
  2344  	VMOVDQA DD1, CC1
  2345  	VMOVDQA AA2, DD1
  2346  	VMOVDQA BB2, AA2
  2347  	JMP     sealAVX2ShortSealLoop
  2348  
  2349  sealAVX2ShortTail32:
  2350  	CMPQ    inl, $16
  2351  	VMOVDQA A0, A1
  2352  	JB      sealAVX2ShortDone
  2353  
  2354  	SUBQ $16, inl
  2355  
  2356  	// Load for encryption
  2357  	VPXOR   (inp), A0, T0
  2358  	VMOVDQU T0, (oup)
  2359  	LEAQ    (1*16)(inp), inp
  2360  
  2361  	// Hash
  2362  	polyAdd(0*8(oup))
  2363  	polyMulAVX2
  2364  	LEAQ       (1*16)(oup), oup
  2365  	VPERM2I128 $0x11, AA0, AA0, AA0
  2366  	VMOVDQA    A0, A1
  2367  
  2368  sealAVX2ShortDone:
  2369  	VZEROUPPER
  2370  	JMP sealSSETail
  2371  
  2372  // ----------------------------------------------------------------------------
  2373  // Special optimization for buffers smaller than 321 bytes
  2374  seal320AVX2:
  2375  	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  2376  	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  2377  	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2378  	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  2379  	MOVQ    $10, itr2
  2380  
  2381  sealAVX2320InnerCipherLoop:
  2382  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2383  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2384  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2385  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2386  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2387  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2388  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2389  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2390  	DECQ     itr2
  2391  	JNE      sealAVX2320InnerCipherLoop
  2392  
  2393  	VMOVDQA ·chacha20Constants<>(SB), TT0
  2394  	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  2395  	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  2396  	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  2397  	VMOVDQA ·avx2IncMask<>(SB), TT0
  2398  	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  2399  	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  2400  	VPADDD  TT3, DD2, DD2
  2401  
  2402  	// Clamp and store poly key
  2403  	VPERM2I128 $0x02, AA0, BB0, TT0
  2404  	VPAND      ·polyClampMask<>(SB), TT0, TT0
  2405  	VMOVDQA    TT0, rsStoreAVX2
  2406  
  2407  	// Stream for up to 320 bytes
  2408  	VPERM2I128 $0x13, AA0, BB0, AA0
  2409  	VPERM2I128 $0x13, CC0, DD0, BB0
  2410  	VPERM2I128 $0x02, AA1, BB1, CC0
  2411  	VPERM2I128 $0x02, CC1, DD1, DD0
  2412  	VPERM2I128 $0x13, AA1, BB1, AA1
  2413  	VPERM2I128 $0x13, CC1, DD1, BB1
  2414  	VPERM2I128 $0x02, AA2, BB2, CC1
  2415  	VPERM2I128 $0x02, CC2, DD2, DD1
  2416  	VPERM2I128 $0x13, AA2, BB2, AA2
  2417  	VPERM2I128 $0x13, CC2, DD2, BB2
  2418  	JMP        sealAVX2ShortSeal
  2419  
  2420  // ----------------------------------------------------------------------------
  2421  // Special optimization for the last 128 bytes of ciphertext
  2422  sealAVX2Tail128:
  2423  	// Need to encrypt up to 128 bytes - prepare two blocks
  2424  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2425  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2426  	VMOVDQA ·chacha20Constants<>(SB), AA0
  2427  	VMOVDQA state1StoreAVX2, BB0
  2428  	VMOVDQA state2StoreAVX2, CC0
  2429  	VMOVDQA ctr3StoreAVX2, DD0
  2430  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  2431  	VMOVDQA DD0, DD1
  2432  
  2433  sealAVX2Tail128LoopA:
  2434  	polyAdd(0(oup))
  2435  	polyMul
  2436  	LEAQ 16(oup), oup
  2437  
  2438  sealAVX2Tail128LoopB:
  2439  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2440  	polyAdd(0(oup))
  2441  	polyMul
  2442  	VPALIGNR $4, BB0, BB0, BB0
  2443  	VPALIGNR $8, CC0, CC0, CC0
  2444  	VPALIGNR $12, DD0, DD0, DD0
  2445  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2446  	polyAdd(16(oup))
  2447  	polyMul
  2448  	LEAQ     32(oup), oup
  2449  	VPALIGNR $12, BB0, BB0, BB0
  2450  	VPALIGNR $8, CC0, CC0, CC0
  2451  	VPALIGNR $4, DD0, DD0, DD0
  2452  	DECQ     itr1
  2453  	JG       sealAVX2Tail128LoopA
  2454  	DECQ     itr2
  2455  	JGE      sealAVX2Tail128LoopB
  2456  
  2457  	VPADDD ·chacha20Constants<>(SB), AA0, AA1
  2458  	VPADDD state1StoreAVX2, BB0, BB1
  2459  	VPADDD state2StoreAVX2, CC0, CC1
  2460  	VPADDD DD1, DD0, DD1
  2461  
  2462  	VPERM2I128 $0x02, AA1, BB1, AA0
  2463  	VPERM2I128 $0x02, CC1, DD1, BB0
  2464  	VPERM2I128 $0x13, AA1, BB1, CC0
  2465  	VPERM2I128 $0x13, CC1, DD1, DD0
  2466  	JMP        sealAVX2ShortSealLoop
  2467  
  2468  // ----------------------------------------------------------------------------
  2469  // Special optimization for the last 256 bytes of ciphertext
  2470  sealAVX2Tail256:
  2471  	// Need to encrypt up to 256 bytes - prepare four blocks
  2472  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2473  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2474  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
  2475  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
  2476  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
  2477  	VMOVDQA ctr3StoreAVX2, DD0
  2478  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  2479  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  2480  	VMOVDQA DD0, TT1
  2481  	VMOVDQA DD1, TT2
  2482  
  2483  sealAVX2Tail256LoopA:
  2484  	polyAdd(0(oup))
  2485  	polyMul
  2486  	LEAQ 16(oup), oup
  2487  
  2488  sealAVX2Tail256LoopB:
  2489  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2490  	polyAdd(0(oup))
  2491  	polyMul
  2492  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2493  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2494  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2495  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2496  	polyAdd(16(oup))
  2497  	polyMul
  2498  	LEAQ     32(oup), oup
  2499  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2500  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2501  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2502  	DECQ     itr1
  2503  	JG       sealAVX2Tail256LoopA
  2504  	DECQ     itr2
  2505  	JGE      sealAVX2Tail256LoopB
  2506  
  2507  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  2508  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  2509  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  2510  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  2511  	VPERM2I128 $0x02, AA0, BB0, TT0
  2512  	VPERM2I128 $0x02, CC0, DD0, TT1
  2513  	VPERM2I128 $0x13, AA0, BB0, TT2
  2514  	VPERM2I128 $0x13, CC0, DD0, TT3
  2515  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2516  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  2517  	MOVQ       $128, itr1
  2518  	LEAQ       128(inp), inp
  2519  	SUBQ       $128, inl
  2520  	VPERM2I128 $0x02, AA1, BB1, AA0
  2521  	VPERM2I128 $0x02, CC1, DD1, BB0
  2522  	VPERM2I128 $0x13, AA1, BB1, CC0
  2523  	VPERM2I128 $0x13, CC1, DD1, DD0
  2524  
  2525  	JMP sealAVX2SealHash
  2526  
  2527  // ----------------------------------------------------------------------------
  2528  // Special optimization for the last 384 bytes of ciphertext
  2529  sealAVX2Tail384:
  2530  	// Need to encrypt up to 384 bytes - prepare six blocks
  2531  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2532  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2533  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  2534  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  2535  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  2536  	VMOVDQA ctr3StoreAVX2, DD0
  2537  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2538  	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
  2539  
  2540  sealAVX2Tail384LoopA:
  2541  	polyAdd(0(oup))
  2542  	polyMul
  2543  	LEAQ 16(oup), oup
  2544  
  2545  sealAVX2Tail384LoopB:
  2546  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2547  	polyAdd(0(oup))
  2548  	polyMul
  2549  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2550  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2551  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2552  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2553  	polyAdd(16(oup))
  2554  	polyMul
  2555  	LEAQ     32(oup), oup
  2556  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2557  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2558  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2559  	DECQ     itr1
  2560  	JG       sealAVX2Tail384LoopA
  2561  	DECQ     itr2
  2562  	JGE      sealAVX2Tail384LoopB
  2563  
  2564  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  2565  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  2566  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  2567  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
  2568  	VPERM2I128 $0x02, AA0, BB0, TT0
  2569  	VPERM2I128 $0x02, CC0, DD0, TT1
  2570  	VPERM2I128 $0x13, AA0, BB0, TT2
  2571  	VPERM2I128 $0x13, CC0, DD0, TT3
  2572  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2573  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  2574  	VPERM2I128 $0x02, AA1, BB1, TT0
  2575  	VPERM2I128 $0x02, CC1, DD1, TT1
  2576  	VPERM2I128 $0x13, AA1, BB1, TT2
  2577  	VPERM2I128 $0x13, CC1, DD1, TT3
  2578  	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  2579  	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  2580  	MOVQ       $256, itr1
  2581  	LEAQ       256(inp), inp
  2582  	SUBQ       $256, inl
  2583  	VPERM2I128 $0x02, AA2, BB2, AA0
  2584  	VPERM2I128 $0x02, CC2, DD2, BB0
  2585  	VPERM2I128 $0x13, AA2, BB2, CC0
  2586  	VPERM2I128 $0x13, CC2, DD2, DD0
  2587  
  2588  	JMP sealAVX2SealHash
  2589  
  2590  // ----------------------------------------------------------------------------
  2591  // Special optimization for the last 512 bytes of ciphertext
  2592  sealAVX2Tail512:
  2593  	// Need to encrypt up to 512 bytes - prepare eight blocks
  2594  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2595  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2596  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2597  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2598  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2599  	VMOVDQA ctr3StoreAVX2, DD0
  2600  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2601  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2602  
  2603  sealAVX2Tail512LoopA:
  2604  	polyAdd(0(oup))
  2605  	polyMul
  2606  	LEAQ 16(oup), oup
  2607  
  2608  sealAVX2Tail512LoopB:
  2609  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2610  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2611  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2612  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2613  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2614  	VMOVDQA  CC3, tmpStoreAVX2
  2615  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2616  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2617  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2618  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2619  	VMOVDQA  tmpStoreAVX2, CC3
  2620  	polyAdd(0*8(oup))
  2621  	polyMulAVX2
  2622  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2623  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2624  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2625  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2626  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2627  	VMOVDQA  CC3, tmpStoreAVX2
  2628  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2629  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2630  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2631  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2632  	VMOVDQA  tmpStoreAVX2, CC3
  2633  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  2634  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2635  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  2636  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2637  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2638  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2639  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2640  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2641  	polyAdd(2*8(oup))
  2642  	polyMulAVX2
  2643  	LEAQ     (4*8)(oup), oup
  2644  	VMOVDQA  CC3, tmpStoreAVX2
  2645  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2646  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2647  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2648  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2649  	VMOVDQA  tmpStoreAVX2, CC3
  2650  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2651  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2652  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2653  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2654  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2655  	VMOVDQA  CC3, tmpStoreAVX2
  2656  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2657  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2658  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2659  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2660  	VMOVDQA  tmpStoreAVX2, CC3
  2661  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  2662  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2663  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  2664  
  2665  	DECQ itr1
  2666  	JG   sealAVX2Tail512LoopA
  2667  	DECQ itr2
  2668  	JGE  sealAVX2Tail512LoopB
  2669  
  2670  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2671  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2672  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2673  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2674  	VMOVDQA    CC3, tmpStoreAVX2
  2675  	VPERM2I128 $0x02, AA0, BB0, CC3
  2676  	VPXOR      (0*32)(inp), CC3, CC3
  2677  	VMOVDQU    CC3, (0*32)(oup)
  2678  	VPERM2I128 $0x02, CC0, DD0, CC3
  2679  	VPXOR      (1*32)(inp), CC3, CC3
  2680  	VMOVDQU    CC3, (1*32)(oup)
  2681  	VPERM2I128 $0x13, AA0, BB0, CC3
  2682  	VPXOR      (2*32)(inp), CC3, CC3
  2683  	VMOVDQU    CC3, (2*32)(oup)
  2684  	VPERM2I128 $0x13, CC0, DD0, CC3
  2685  	VPXOR      (3*32)(inp), CC3, CC3
  2686  	VMOVDQU    CC3, (3*32)(oup)
  2687  
  2688  	VPERM2I128 $0x02, AA1, BB1, AA0
  2689  	VPERM2I128 $0x02, CC1, DD1, BB0
  2690  	VPERM2I128 $0x13, AA1, BB1, CC0
  2691  	VPERM2I128 $0x13, CC1, DD1, DD0
  2692  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  2693  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  2694  
  2695  	VPERM2I128 $0x02, AA2, BB2, AA0
  2696  	VPERM2I128 $0x02, CC2, DD2, BB0
  2697  	VPERM2I128 $0x13, AA2, BB2, CC0
  2698  	VPERM2I128 $0x13, CC2, DD2, DD0
  2699  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  2700  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  2701  
  2702  	MOVQ       $384, itr1
  2703  	LEAQ       384(inp), inp
  2704  	SUBQ       $384, inl
  2705  	VPERM2I128 $0x02, AA3, BB3, AA0
  2706  	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
  2707  	VPERM2I128 $0x13, AA3, BB3, CC0
  2708  	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  2709  
  2710  	JMP sealAVX2SealHash
  2711  
  2712  // func haveSSSE3() bool
  2713  TEXT ·haveSSSE3(SB), NOSPLIT, $0
  2714  	XORQ AX, AX            // AX = 0
  2715  	INCL AX                // AX = 1: select CPUID leaf 1 (processor info and feature bits)
  2716  	CPUID                  // clobbers AX/BX/CX/DX; feature flags are returned in CX/DX
  2717  	SHRQ $9, CX            // shift ECX bit 9 (the SSSE3 feature flag) down to bit 0
  2718  	ANDQ $1, CX            // isolate that bit: CX is now 0 or 1
  2719  	MOVB CX, ret+0(FP)     // store it as the bool return value
  2720  	RET
  2721