github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. 6 7 // +build go1.7,amd64,!gccgo,!appengine 8 9 #include "textflag.h" 10 // General register allocation 11 #define oup DI 12 #define inp SI 13 #define inl BX 14 #define adp CX // free to reuse, after we hash the additional data 15 #define keyp R8 // free to reuse, when we copy the key to stack 16 #define itr2 R9 // general iterator 17 #define itr1 CX // general iterator 18 #define acc0 R10 19 #define acc1 R11 20 #define acc2 R12 21 #define t0 R13 22 #define t1 R14 23 #define t2 R15 24 #define t3 R8 25 // Register and stack allocation for the SSE code 26 #define rStore (0*16)(BP) 27 #define sStore (1*16)(BP) 28 #define state1Store (2*16)(BP) 29 #define state2Store (3*16)(BP) 30 #define tmpStore (4*16)(BP) 31 #define ctr0Store (5*16)(BP) 32 #define ctr1Store (6*16)(BP) 33 #define ctr2Store (7*16)(BP) 34 #define ctr3Store (8*16)(BP) 35 #define A0 X0 36 #define A1 X1 37 #define A2 X2 38 #define B0 X3 39 #define B1 X4 40 #define B2 X5 41 #define C0 X6 42 #define C1 X7 43 #define C2 X8 44 #define D0 X9 45 #define D1 X10 46 #define D2 X11 47 #define T0 X12 48 #define T1 X13 49 #define T2 X14 50 #define T3 X15 51 #define A3 T0 52 #define B3 T1 53 #define C3 T2 54 #define D3 T3 55 // Register and stack allocation for the AVX2 code 56 #define rsStoreAVX2 (0*32)(BP) 57 #define state1StoreAVX2 (1*32)(BP) 58 #define state2StoreAVX2 (2*32)(BP) 59 #define ctr0StoreAVX2 (3*32)(BP) 60 #define ctr1StoreAVX2 (4*32)(BP) 61 #define ctr2StoreAVX2 (5*32)(BP) 62 #define ctr3StoreAVX2 (6*32)(BP) 63 #define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack 64 #define AA0 Y0 65 #define AA1 Y5 66 #define AA2 Y6 67 #define AA3 Y7 68 #define BB0 Y14 69 #define BB1 Y9 70 #define BB2 Y10 71 #define BB3 Y11 72 #define CC0 Y12 73 #define CC1 Y13 74 #define CC2 Y8 75 #define CC3 Y15 76 #define DD0 Y4 77 #define DD1 Y1 78 #define DD2 Y2 79 #define DD3 Y3 80 #define TT0 DD3 81 #define TT1 AA3 82 #define TT2 BB3 83 #define TT3 CC3 84 // ChaCha20 constants 85 DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 86 DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e 87 DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 88 DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 89 DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 90 DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e 91 DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 92 DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 93 // <<< 16 with PSHUFB 94 DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 95 DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A 96 DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 97 DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A 98 // <<< 8 with PSHUFB 99 DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 100 DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B 101 DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 102 DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B 103 104 DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 105 DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 106 DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 107 DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 108 109 DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 110 DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 111 DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 112 DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 113 // 
Poly1305 key clamp 114 DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF 115 DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC 116 DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF 117 DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF 118 119 DATA ·sseIncMask<>+0x00(SB)/8, $0x1 120 DATA ·sseIncMask<>+0x08(SB)/8, $0x0 121 // To load/store the last < 16 bytes in a buffer 122 DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff 123 DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 124 DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff 125 DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 126 DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff 127 DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000 128 DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff 129 DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 130 DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff 131 DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 132 DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff 133 DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 134 DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff 135 DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 136 DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff 137 DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 138 DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff 139 DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff 140 DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff 141 DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff 142 DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff 143 DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff 144 DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff 145 DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff 146 DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff 147 DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff 148 DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff 149 DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff 150 DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff 151 DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff 152 153 GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 154 GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 155 GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 156 GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 157 GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 158 GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 159 GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 160 GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 161 // No PALIGNR in Go ASM yet (but VPALIGNR is present). 
162 #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 163 #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 164 #define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 165 #define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 166 #define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 167 #define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 168 #define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 169 #define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 170 #define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 171 #define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 172 #define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 173 #define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 174 #define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 175 #define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 176 #define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 177 #define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13 178 #define shiftC0Right shiftC0Left 179 #define shiftC1Right shiftC1Left 180 #define shiftC2Right shiftC2Left 181 #define shiftC3Right shiftC3Left 182 #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 183 #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 184 #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 185 #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 186 // Some macros 187 #define chachaQR(A, B, C, D, T) \ 188 PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \ 189 PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ 190 PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \ 191 PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B 192 193 #define chachaQR_AVX2(A, B, C, D, T) \ 194 VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ 195 VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ 196 VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ 197 VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B 198 199 #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 200 #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; 
ADCQ DX, t2 201 #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX 202 #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 203 #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 204 205 #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 206 #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 207 #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 208 209 #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage 210 #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage 211 // ---------------------------------------------------------------------------- 212 TEXT polyHashADInternal<>(SB), NOSPLIT, $0 213 // adp points to beginning of additional data 214 // itr2 holds ad length 215 XORQ acc0, acc0 216 XORQ acc1, acc1 217 XORQ acc2, acc2 218 CMPQ itr2, $13 219 JNE hashADLoop 220 221 openFastTLSAD: 222 // Special treatment for the TLS case of 13 bytes 223 MOVQ (adp), acc0 224 MOVQ 5(adp), acc1 225 SHRQ $24, acc1 226 MOVQ $1, acc2 227 polyMul 228 RET 229 230 hashADLoop: 231 // Hash in 16 byte chunks 232 CMPQ itr2, $16 233 JB hashADTail 234 polyAdd(0(adp)) 235 LEAQ (1*16)(adp), adp 236 SUBQ $16, itr2 237 polyMul 238 JMP hashADLoop 239 240 hashADTail: 241 CMPQ itr2, $0 242 JE hashADDone 243 244 // Hash last < 16 byte tail 245 XORQ t0, t0 246 XORQ t1, t1 247 XORQ t2, t2 248 ADDQ itr2, adp 249 250 hashADTailLoop: 251 SHLQ $8, t1:t0 252 SHLQ $8, t0 253 MOVB -1(adp), t2 254 XORQ t2, t0 255 DECQ adp 256 DECQ itr2 257 JNE hashADTailLoop 258 259 hashADTailFinish: 260 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 261 polyMul 262 263 // Finished AD 264 hashADDone: 265 RET 266 267 // ---------------------------------------------------------------------------- 268 // func chacha20Poly1305Open(dst, key, src, ad []byte) bool 269 TEXT ·chacha20Poly1305Open(SB), 0, $288-97 270 // For aligned stack access 271 MOVQ SP, BP 272 ADDQ $32, BP 273 ANDQ $-32, BP 274 MOVQ dst+0(FP), oup 275 MOVQ key+24(FP), keyp 276 MOVQ src+48(FP), inp 277 MOVQ src_len+56(FP), inl 278 MOVQ ad+72(FP), adp 279 280 // Check for AVX2 support 281 CMPB ·useAVX2(SB), $1 282 JE chacha20Poly1305Open_AVX2 283 284 // Special optimization, for very short buffers 285 CMPQ inl, $128 286 JBE openSSE128 // About 16% faster 287 288 // For long buffers, prepare the poly key first 289 MOVOU ·chacha20Constants<>(SB), A0 290 MOVOU (1*16)(keyp), B0 291 MOVOU (2*16)(keyp), C0 292 MOVOU (3*16)(keyp), D0 293 MOVO D0, T1 294 295 // Store state on stack for future use 296 MOVO B0, state1Store 297 MOVO C0, state2Store 298 MOVO D0, ctr3Store 299 MOVQ $10, itr2 300 301 openSSEPreparePolyKey: 302 chachaQR(A0, B0, C0, D0, T0) 303 shiftB0Left; shiftC0Left; shiftD0Left 304 chachaQR(A0, B0, C0, D0, T0) 305 shiftB0Right; shiftC0Right; shiftD0Right 306 DECQ itr2 307 JNE openSSEPreparePolyKey 308 309 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 310 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 311 312 // Clamp and store the key 313 PAND ·polyClampMask<>(SB), A0 314 MOVO A0, rStore; MOVO 
B0, sStore 315 316 // Hash AAD 317 MOVQ ad_len+80(FP), itr2 318 CALL polyHashADInternal<>(SB) 319 320 openSSEMainLoop: 321 CMPQ inl, $256 322 JB openSSEMainLoopDone 323 324 // Load state, increment counter blocks 325 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 326 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 327 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 328 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 329 330 // Store counters 331 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store 332 333 // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 334 MOVQ $4, itr1 335 MOVQ inp, itr2 336 337 openSSEInternalLoop: 338 MOVO C3, tmpStore 339 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 340 MOVO tmpStore, C3 341 MOVO C1, tmpStore 342 chachaQR(A3, B3, C3, D3, C1) 343 MOVO tmpStore, C1 344 polyAdd(0(itr2)) 345 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 346 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 347 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left 348 polyMulStage1 349 polyMulStage2 350 LEAQ (2*8)(itr2), itr2 351 MOVO C3, tmpStore 352 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 353 MOVO tmpStore, C3 354 MOVO C1, tmpStore 355 polyMulStage3 356 chachaQR(A3, B3, C3, D3, C1) 357 MOVO tmpStore, C1 358 polyMulReduceStage 359 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right 360 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 361 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right 362 DECQ itr1 363 JGE openSSEInternalLoop 364 365 polyAdd(0(itr2)) 366 polyMul 367 LEAQ (2*8)(itr2), itr2 368 369 CMPQ itr1, $-6 370 JG openSSEInternalLoop 371 372 // Add in the state 373 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 374 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 375 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 376 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 377 378 // Load - xor - store 379 MOVO D3, tmpStore 380 MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) 381 MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) 382 MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) 383 MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) 384 MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) 385 MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) 386 MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) 387 MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) 388 MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) 389 MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) 390 MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) 391 MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) 392 MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) 393 MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) 394 MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) 395 MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) 396 LEAQ 256(inp), inp 397 LEAQ 256(oup), oup 398 SUBQ $256, inl 
399 JMP openSSEMainLoop 400 401 openSSEMainLoopDone: 402 // Handle the various tail sizes efficiently 403 TESTQ inl, inl 404 JE openSSEFinalize 405 CMPQ inl, $64 406 JBE openSSETail64 407 CMPQ inl, $128 408 JBE openSSETail128 409 CMPQ inl, $192 410 JBE openSSETail192 411 JMP openSSETail256 412 413 openSSEFinalize: 414 // Hash in the PT, AAD lengths 415 ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 416 polyMul 417 418 // Final reduce 419 MOVQ acc0, t0 420 MOVQ acc1, t1 421 MOVQ acc2, t2 422 SUBQ $-5, acc0 423 SBBQ $-1, acc1 424 SBBQ $3, acc2 425 CMOVQCS t0, acc0 426 CMOVQCS t1, acc1 427 CMOVQCS t2, acc2 428 429 // Add in the "s" part of the key 430 ADDQ 0+sStore, acc0 431 ADCQ 8+sStore, acc1 432 433 // Finally, constant time compare to the tag at the end of the message 434 XORQ AX, AX 435 MOVQ $1, DX 436 XORQ (0*8)(inp), acc0 437 XORQ (1*8)(inp), acc1 438 ORQ acc1, acc0 439 CMOVQEQ DX, AX 440 441 // Return true iff tags are equal 442 MOVB AX, ret+96(FP) 443 RET 444 445 // ---------------------------------------------------------------------------- 446 // Special optimization for buffers smaller than 129 bytes 447 openSSE128: 448 // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks 449 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 450 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 451 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 452 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 453 MOVQ $10, itr2 454 455 openSSE128InnerCipherLoop: 456 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 457 shiftB0Left; shiftB1Left; shiftB2Left 458 shiftC0Left; shiftC1Left; shiftC2Left 459 shiftD0Left; shiftD1Left; shiftD2Left 460 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 461 shiftB0Right; shiftB1Right; shiftB2Right 462 shiftC0Right; shiftC1Right; shiftC2Right 463 shiftD0Right; shiftD1Right; shiftD2Right 464 DECQ itr2 465 JNE openSSE128InnerCipherLoop 466 467 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 468 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 469 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 470 PADDL T2, C1; PADDL T2, C2 471 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 472 473 // Clamp and store the key 474 PAND ·polyClampMask<>(SB), A0 475 MOVOU A0, rStore; MOVOU B0, sStore 476 477 // Hash 478 MOVQ ad_len+80(FP), itr2 479 CALL polyHashADInternal<>(SB) 480 481 openSSE128Open: 482 CMPQ inl, $16 483 JB openSSETail16 484 SUBQ $16, inl 485 486 // Load for hashing 487 polyAdd(0(inp)) 488 489 // Load for decryption 490 MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) 491 LEAQ (1*16)(inp), inp 492 LEAQ (1*16)(oup), oup 493 polyMul 494 495 // Shift the stream "left" 496 MOVO B1, A1 497 MOVO C1, B1 498 MOVO D1, C1 499 MOVO A2, D1 500 MOVO B2, A2 501 MOVO C2, B2 502 MOVO D2, C2 503 JMP openSSE128Open 504 505 openSSETail16: 506 TESTQ inl, inl 507 JE openSSEFinalize 508 509 // We can safely load the CT from the end, because it is padded with the MAC 510 MOVQ inl, itr2 511 SHLQ $4, itr2 512 LEAQ ·andMask<>(SB), t0 513 MOVOU (inp), T0 514 ADDQ inl, inp 515 PAND -16(t0)(itr2*1), T0 516 MOVO T0, 0+tmpStore 517 MOVQ T0, t0 518 MOVQ 8+tmpStore, t1 519 PXOR A1, T0 520 521 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes 522 
openSSETail16Store: 523 MOVQ T0, t3 524 MOVB t3, (oup) 525 PSRLDQ $1, T0 526 INCQ oup 527 DECQ inl 528 JNE openSSETail16Store 529 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 530 polyMul 531 JMP openSSEFinalize 532 533 // ---------------------------------------------------------------------------- 534 // Special optimization for the last 64 bytes of ciphertext 535 openSSETail64: 536 // Need to decrypt up to 64 bytes - prepare single block 537 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store 538 XORQ itr2, itr2 539 MOVQ inl, itr1 540 CMPQ itr1, $16 541 JB openSSETail64LoopB 542 543 openSSETail64LoopA: 544 // Perform ChaCha rounds, while hashing the remaining input 545 polyAdd(0(inp)(itr2*1)) 546 polyMul 547 SUBQ $16, itr1 548 549 openSSETail64LoopB: 550 ADDQ $16, itr2 551 chachaQR(A0, B0, C0, D0, T0) 552 shiftB0Left; shiftC0Left; shiftD0Left 553 chachaQR(A0, B0, C0, D0, T0) 554 shiftB0Right; shiftC0Right; shiftD0Right 555 556 CMPQ itr1, $16 557 JAE openSSETail64LoopA 558 559 CMPQ itr2, $160 560 JNE openSSETail64LoopB 561 562 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 563 564 openSSETail64DecLoop: 565 CMPQ inl, $16 566 JB openSSETail64DecLoopDone 567 SUBQ $16, inl 568 MOVOU (inp), T0 569 PXOR T0, A0 570 MOVOU A0, (oup) 571 LEAQ 16(inp), inp 572 LEAQ 16(oup), oup 573 MOVO B0, A0 574 MOVO C0, B0 575 MOVO D0, C0 576 JMP openSSETail64DecLoop 577 578 openSSETail64DecLoopDone: 579 MOVO A0, A1 580 JMP openSSETail16 581 582 // ---------------------------------------------------------------------------- 583 // Special optimization for the last 128 bytes of ciphertext 584 openSSETail128: 585 // Need to decrypt up to 128 bytes - prepare two blocks 586 MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store 587 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store 588 XORQ itr2, itr2 589 MOVQ inl, itr1 590 ANDQ $-16, itr1 591 592 openSSETail128LoopA: 593 // Perform ChaCha rounds, while hashing the remaining input 594 polyAdd(0(inp)(itr2*1)) 595 polyMul 596 597 openSSETail128LoopB: 598 ADDQ $16, itr2 599 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 600 shiftB0Left; shiftC0Left; shiftD0Left 601 shiftB1Left; shiftC1Left; shiftD1Left 602 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 603 shiftB0Right; shiftC0Right; shiftD0Right 604 shiftB1Right; shiftC1Right; shiftD1Right 605 606 CMPQ itr2, itr1 607 JB openSSETail128LoopA 608 609 CMPQ itr2, $160 610 JNE openSSETail128LoopB 611 612 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 613 PADDL state1Store, B0; PADDL state1Store, B1 614 PADDL state2Store, C0; PADDL state2Store, C1 615 PADDL ctr1Store, D0; PADDL ctr0Store, D1 616 617 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 618 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 619 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) 620 621 SUBQ $64, inl 622 LEAQ 64(inp), inp 623 LEAQ 64(oup), oup 624 JMP openSSETail64DecLoop 625 626 // ---------------------------------------------------------------------------- 627 // Special optimization for the last 192 bytes of ciphertext 628 openSSETail192: 629 // Need to decrypt up to 192 bytes - prepare three blocks 630 MOVO ·chacha20Constants<>(SB), 
A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store 631 MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store 632 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store 633 634 MOVQ inl, itr1 635 MOVQ $160, itr2 636 CMPQ itr1, $160 637 CMOVQGT itr2, itr1 638 ANDQ $-16, itr1 639 XORQ itr2, itr2 640 641 openSSLTail192LoopA: 642 // Perform ChaCha rounds, while hashing the remaining input 643 polyAdd(0(inp)(itr2*1)) 644 polyMul 645 646 openSSLTail192LoopB: 647 ADDQ $16, itr2 648 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 649 shiftB0Left; shiftC0Left; shiftD0Left 650 shiftB1Left; shiftC1Left; shiftD1Left 651 shiftB2Left; shiftC2Left; shiftD2Left 652 653 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 654 shiftB0Right; shiftC0Right; shiftD0Right 655 shiftB1Right; shiftC1Right; shiftD1Right 656 shiftB2Right; shiftC2Right; shiftD2Right 657 658 CMPQ itr2, itr1 659 JB openSSLTail192LoopA 660 661 CMPQ itr2, $160 662 JNE openSSLTail192LoopB 663 664 CMPQ inl, $176 665 JB openSSLTail192Store 666 667 polyAdd(160(inp)) 668 polyMul 669 670 CMPQ inl, $192 671 JB openSSLTail192Store 672 673 polyAdd(176(inp)) 674 polyMul 675 676 openSSLTail192Store: 677 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 678 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 679 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 680 PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 681 682 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 683 PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 684 MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) 685 686 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 687 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 688 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 689 690 SUBQ $128, inl 691 LEAQ 128(inp), inp 692 LEAQ 128(oup), oup 693 JMP openSSETail64DecLoop 694 695 // ---------------------------------------------------------------------------- 696 // Special optimization for the last 256 bytes of ciphertext 697 openSSETail256: 698 // Need to decrypt up to 256 bytes - prepare four blocks 699 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 700 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 701 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 702 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 703 704 // Store counters 705 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store 706 XORQ itr2, itr2 707 708 openSSETail256Loop: 709 // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication 710 polyAdd(0(inp)(itr2*1)) 711 MOVO C3, tmpStore 712 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 713 MOVO tmpStore, C3 714 MOVO C1, tmpStore 715 chachaQR(A3, B3, C3, D3, C1) 716 MOVO tmpStore, C1 717 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 718 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 719 shiftD0Left; shiftD1Left; shiftD2Left; 
shiftD3Left 720 polyMulStage1 721 polyMulStage2 722 MOVO C3, tmpStore 723 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 724 MOVO tmpStore, C3 725 MOVO C1, tmpStore 726 chachaQR(A3, B3, C3, D3, C1) 727 MOVO tmpStore, C1 728 polyMulStage3 729 polyMulReduceStage 730 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right 731 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 732 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right 733 ADDQ $2*8, itr2 734 CMPQ itr2, $160 735 JB openSSETail256Loop 736 MOVQ inl, itr1 737 ANDQ $-16, itr1 738 739 openSSETail256HashLoop: 740 polyAdd(0(inp)(itr2*1)) 741 polyMul 742 ADDQ $2*8, itr2 743 CMPQ itr2, itr1 744 JB openSSETail256HashLoop 745 746 // Add in the state 747 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 748 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 749 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 750 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 751 MOVO D3, tmpStore 752 753 // Load - xor - store 754 MOVOU (0*16)(inp), D3; PXOR D3, A0 755 MOVOU (1*16)(inp), D3; PXOR D3, B0 756 MOVOU (2*16)(inp), D3; PXOR D3, C0 757 MOVOU (3*16)(inp), D3; PXOR D3, D0 758 MOVOU A0, (0*16)(oup) 759 MOVOU B0, (1*16)(oup) 760 MOVOU C0, (2*16)(oup) 761 MOVOU D0, (3*16)(oup) 762 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 763 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 764 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 765 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 766 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 767 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) 768 LEAQ 192(inp), inp 769 LEAQ 192(oup), oup 770 SUBQ $192, inl 771 MOVO A3, A0 772 MOVO B3, B0 773 MOVO C3, C0 774 MOVO tmpStore, D0 775 776 JMP openSSETail64DecLoop 777 778 // ---------------------------------------------------------------------------- 779 // ------------------------- AVX2 Code ---------------------------------------- 780 chacha20Poly1305Open_AVX2: 781 VZEROUPPER 782 VMOVDQU ·chacha20Constants<>(SB), AA0 783 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 784 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 785 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 786 VPADDD ·avx2InitMask<>(SB), DD0, DD0 787 788 // Special optimization, for very short buffers 789 CMPQ inl, $192 790 JBE openAVX2192 791 CMPQ inl, $320 792 JBE openAVX2320 793 794 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream 795 VMOVDQA BB0, state1StoreAVX2 796 VMOVDQA CC0, state2StoreAVX2 797 VMOVDQA DD0, ctr3StoreAVX2 798 MOVQ $10, itr2 799 800 openAVX2PreparePolyKey: 801 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) 802 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 803 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) 804 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 805 DECQ itr2 806 JNE openAVX2PreparePolyKey 807 808 VPADDD ·chacha20Constants<>(SB), AA0, AA0 809 VPADDD state1StoreAVX2, BB0, BB0 810 
VPADDD state2StoreAVX2, CC0, CC0 811 VPADDD ctr3StoreAVX2, DD0, DD0 812 813 VPERM2I128 $0x02, AA0, BB0, TT0 814 815 // Clamp and store poly key 816 VPAND ·polyClampMask<>(SB), TT0, TT0 817 VMOVDQA TT0, rsStoreAVX2 818 819 // Stream for the first 64 bytes 820 VPERM2I128 $0x13, AA0, BB0, AA0 821 VPERM2I128 $0x13, CC0, DD0, BB0 822 823 // Hash AD + first 64 bytes 824 MOVQ ad_len+80(FP), itr2 825 CALL polyHashADInternal<>(SB) 826 XORQ itr1, itr1 827 828 openAVX2InitialHash64: 829 polyAdd(0(inp)(itr1*1)) 830 polyMulAVX2 831 ADDQ $16, itr1 832 CMPQ itr1, $64 833 JNE openAVX2InitialHash64 834 835 // Decrypt the first 64 bytes 836 VPXOR (0*32)(inp), AA0, AA0 837 VPXOR (1*32)(inp), BB0, BB0 838 VMOVDQU AA0, (0*32)(oup) 839 VMOVDQU BB0, (1*32)(oup) 840 LEAQ (2*32)(inp), inp 841 LEAQ (2*32)(oup), oup 842 SUBQ $64, inl 843 844 openAVX2MainLoop: 845 CMPQ inl, $512 846 JB openAVX2MainLoopDone 847 848 // Load state, increment counter blocks, store the incremented counters 849 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 850 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 851 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 852 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 853 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 854 XORQ itr1, itr1 855 856 openAVX2InternalLoop: 857 // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications 858 // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext 859 polyAdd(0*8(inp)(itr1*1)) 860 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 861 polyMulStage1_AVX2 862 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 863 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 864 polyMulStage2_AVX2 865 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 866 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 867 polyMulStage3_AVX2 868 VMOVDQA CC3, tmpStoreAVX2 869 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 870 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 871 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 872 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 873 VMOVDQA tmpStoreAVX2, CC3 874 polyMulReduceStage 875 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 876 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 877 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 878 polyAdd(2*8(inp)(itr1*1)) 879 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 880 polyMulStage1_AVX2 881 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 882 VMOVDQA CC3, tmpStoreAVX2 883 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 884 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 885 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 886 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 887 VMOVDQA 
tmpStoreAVX2, CC3 888 polyMulStage2_AVX2 889 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 890 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 891 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 892 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 893 polyMulStage3_AVX2 894 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 895 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 896 polyMulReduceStage 897 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 898 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 899 polyAdd(4*8(inp)(itr1*1)) 900 LEAQ (6*8)(itr1), itr1 901 VMOVDQA CC3, tmpStoreAVX2 902 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 903 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 904 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 905 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 906 VMOVDQA tmpStoreAVX2, CC3 907 polyMulStage1_AVX2 908 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 909 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 910 polyMulStage2_AVX2 911 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 912 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 913 polyMulStage3_AVX2 914 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 915 VMOVDQA CC3, tmpStoreAVX2 916 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 917 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 918 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 919 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 920 VMOVDQA tmpStoreAVX2, CC3 921 polyMulReduceStage 922 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 923 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 924 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 925 CMPQ itr1, $480 926 JNE openAVX2InternalLoop 927 928 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 929 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 930 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 931 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 932 VMOVDQA CC3, tmpStoreAVX2 933 934 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here 935 polyAdd(480(inp)) 936 polyMulAVX2 937 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 938 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; 
VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 939 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) 940 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 941 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 942 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) 943 944 // and here 945 polyAdd(496(inp)) 946 polyMulAVX2 947 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 948 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 949 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) 950 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 951 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 952 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) 953 LEAQ (32*16)(inp), inp 954 LEAQ (32*16)(oup), oup 955 SUBQ $(32*16), inl 956 JMP openAVX2MainLoop 957 958 openAVX2MainLoopDone: 959 // Handle the various tail sizes efficiently 960 TESTQ inl, inl 961 JE openSSEFinalize 962 CMPQ inl, $128 963 JBE openAVX2Tail128 964 CMPQ inl, $256 965 JBE openAVX2Tail256 966 CMPQ inl, $384 967 JBE openAVX2Tail384 968 JMP openAVX2Tail512 969 970 // ---------------------------------------------------------------------------- 971 // Special optimization for buffers smaller than 193 bytes 972 openAVX2192: 973 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks 974 VMOVDQA AA0, AA1 975 VMOVDQA BB0, BB1 976 VMOVDQA CC0, CC1 977 VPADDD ·avx2IncMask<>(SB), DD0, DD1 978 VMOVDQA AA0, AA2 979 VMOVDQA BB0, BB2 980 VMOVDQA CC0, CC2 981 VMOVDQA DD0, DD2 982 VMOVDQA DD1, TT3 983 MOVQ $10, itr2 984 985 openAVX2192InnerCipherLoop: 986 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 987 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 988 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 989 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 990 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 991 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 992 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 993 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 994 DECQ itr2 995 JNE openAVX2192InnerCipherLoop 996 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 997 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 998 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 999 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 1000 VPERM2I128 $0x02, AA0, BB0, TT0 1001 1002 // Clamp and store poly key 1003 VPAND ·polyClampMask<>(SB), TT0, TT0 1004 VMOVDQA TT0, rsStoreAVX2 1005 1006 // Stream for up to 192 bytes 1007 VPERM2I128 $0x13, AA0, BB0, AA0 1008 VPERM2I128 $0x13, CC0, DD0, BB0 1009 VPERM2I128 $0x02, AA1, BB1, CC0 1010 VPERM2I128 $0x02, CC1, DD1, DD0 1011 VPERM2I128 $0x13, AA1, BB1, AA1 1012 VPERM2I128 $0x13, CC1, DD1, BB1 1013 1014 openAVX2ShortOpen: 1015 // Hash 1016 MOVQ ad_len+80(FP), itr2 1017 CALL polyHashADInternal<>(SB) 1018 1019 openAVX2ShortOpenLoop: 1020 
CMPQ inl, $32 1021 JB openAVX2ShortTail32 1022 SUBQ $32, inl 1023 1024 // Load for hashing 1025 polyAdd(0*8(inp)) 1026 polyMulAVX2 1027 polyAdd(2*8(inp)) 1028 polyMulAVX2 1029 1030 // Load for decryption 1031 VPXOR (inp), AA0, AA0 1032 VMOVDQU AA0, (oup) 1033 LEAQ (1*32)(inp), inp 1034 LEAQ (1*32)(oup), oup 1035 1036 // Shift stream left 1037 VMOVDQA BB0, AA0 1038 VMOVDQA CC0, BB0 1039 VMOVDQA DD0, CC0 1040 VMOVDQA AA1, DD0 1041 VMOVDQA BB1, AA1 1042 VMOVDQA CC1, BB1 1043 VMOVDQA DD1, CC1 1044 VMOVDQA AA2, DD1 1045 VMOVDQA BB2, AA2 1046 JMP openAVX2ShortOpenLoop 1047 1048 openAVX2ShortTail32: 1049 CMPQ inl, $16 1050 VMOVDQA A0, A1 1051 JB openAVX2ShortDone 1052 1053 SUBQ $16, inl 1054 1055 // Load for hashing 1056 polyAdd(0*8(inp)) 1057 polyMulAVX2 1058 1059 // Load for decryption 1060 VPXOR (inp), A0, T0 1061 VMOVDQU T0, (oup) 1062 LEAQ (1*16)(inp), inp 1063 LEAQ (1*16)(oup), oup 1064 VPERM2I128 $0x11, AA0, AA0, AA0 1065 VMOVDQA A0, A1 1066 1067 openAVX2ShortDone: 1068 VZEROUPPER 1069 JMP openSSETail16 1070 1071 // ---------------------------------------------------------------------------- 1072 // Special optimization for buffers smaller than 321 bytes 1073 openAVX2320: 1074 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks 1075 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 1076 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 1077 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 1078 MOVQ $10, itr2 1079 1080 openAVX2320InnerCipherLoop: 1081 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1082 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 1083 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 1084 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 1085 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1086 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 1087 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 1088 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 1089 DECQ itr2 1090 JNE openAVX2320InnerCipherLoop 1091 1092 VMOVDQA ·chacha20Constants<>(SB), TT0 1093 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 1094 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 1095 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 1096 VMOVDQA ·avx2IncMask<>(SB), TT0 1097 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 1098 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 1099 VPADDD TT3, DD2, DD2 1100 1101 // Clamp and store poly key 1102 VPERM2I128 $0x02, AA0, BB0, TT0 1103 VPAND ·polyClampMask<>(SB), TT0, TT0 1104 VMOVDQA TT0, rsStoreAVX2 1105 1106 // Stream for up to 320 bytes 1107 VPERM2I128 $0x13, AA0, BB0, AA0 1108 VPERM2I128 $0x13, CC0, DD0, BB0 1109 VPERM2I128 $0x02, AA1, BB1, CC0 1110 VPERM2I128 $0x02, CC1, DD1, DD0 1111 VPERM2I128 $0x13, AA1, BB1, AA1 1112 VPERM2I128 $0x13, CC1, DD1, BB1 1113 VPERM2I128 $0x02, AA2, BB2, CC1 1114 VPERM2I128 $0x02, CC2, DD2, DD1 1115 VPERM2I128 $0x13, AA2, BB2, AA2 1116 VPERM2I128 $0x13, CC2, DD2, BB2 1117 JMP openAVX2ShortOpen 1118 1119 // ---------------------------------------------------------------------------- 1120 // Special optimization for the 
last 128 bytes of ciphertext 1121 openAVX2Tail128: 1122 // Need to decrypt up to 128 bytes - prepare two blocks 1123 VMOVDQA ·chacha20Constants<>(SB), AA1 1124 VMOVDQA state1StoreAVX2, BB1 1125 VMOVDQA state2StoreAVX2, CC1 1126 VMOVDQA ctr3StoreAVX2, DD1 1127 VPADDD ·avx2IncMask<>(SB), DD1, DD1 1128 VMOVDQA DD1, DD0 1129 1130 XORQ itr2, itr2 1131 MOVQ inl, itr1 1132 ANDQ $-16, itr1 1133 TESTQ itr1, itr1 1134 JE openAVX2Tail128LoopB 1135 1136 openAVX2Tail128LoopA: 1137 // Perform ChaCha rounds, while hashing the remaining input 1138 polyAdd(0(inp)(itr2*1)) 1139 polyMulAVX2 1140 1141 openAVX2Tail128LoopB: 1142 ADDQ $16, itr2 1143 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1144 VPALIGNR $4, BB1, BB1, BB1 1145 VPALIGNR $8, CC1, CC1, CC1 1146 VPALIGNR $12, DD1, DD1, DD1 1147 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1148 VPALIGNR $12, BB1, BB1, BB1 1149 VPALIGNR $8, CC1, CC1, CC1 1150 VPALIGNR $4, DD1, DD1, DD1 1151 CMPQ itr2, itr1 1152 JB openAVX2Tail128LoopA 1153 CMPQ itr2, $160 1154 JNE openAVX2Tail128LoopB 1155 1156 VPADDD ·chacha20Constants<>(SB), AA1, AA1 1157 VPADDD state1StoreAVX2, BB1, BB1 1158 VPADDD state2StoreAVX2, CC1, CC1 1159 VPADDD DD0, DD1, DD1 1160 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 1161 1162 openAVX2TailLoop: 1163 CMPQ inl, $32 1164 JB openAVX2Tail 1165 SUBQ $32, inl 1166 1167 // Load for decryption 1168 VPXOR (inp), AA0, AA0 1169 VMOVDQU AA0, (oup) 1170 LEAQ (1*32)(inp), inp 1171 LEAQ (1*32)(oup), oup 1172 VMOVDQA BB0, AA0 1173 VMOVDQA CC0, BB0 1174 VMOVDQA DD0, CC0 1175 JMP openAVX2TailLoop 1176 1177 openAVX2Tail: 1178 CMPQ inl, $16 1179 VMOVDQA A0, A1 1180 JB openAVX2TailDone 1181 SUBQ $16, inl 1182 1183 // Load for decryption 1184 VPXOR (inp), A0, T0 1185 VMOVDQU T0, (oup) 1186 LEAQ (1*16)(inp), inp 1187 LEAQ (1*16)(oup), oup 1188 VPERM2I128 $0x11, AA0, AA0, AA0 1189 VMOVDQA A0, A1 1190 1191 openAVX2TailDone: 1192 VZEROUPPER 1193 JMP openSSETail16 1194 1195 // ---------------------------------------------------------------------------- 1196 // Special optimization for the last 256 bytes of ciphertext 1197 openAVX2Tail256: 1198 // Need to decrypt up to 256 bytes - prepare four blocks 1199 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 1200 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 1201 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 1202 VMOVDQA ctr3StoreAVX2, DD0 1203 VPADDD ·avx2IncMask<>(SB), DD0, DD0 1204 VPADDD ·avx2IncMask<>(SB), DD0, DD1 1205 VMOVDQA DD0, TT1 1206 VMOVDQA DD1, TT2 1207 1208 // Compute the number of iterations that will hash data 1209 MOVQ inl, tmpStoreAVX2 1210 MOVQ inl, itr1 1211 SUBQ $128, itr1 1212 SHRQ $4, itr1 1213 MOVQ $10, itr2 1214 CMPQ itr1, $10 1215 CMOVQGT itr2, itr1 1216 MOVQ inp, inl 1217 XORQ itr2, itr2 1218 1219 openAVX2Tail256LoopA: 1220 polyAdd(0(inl)) 1221 polyMulAVX2 1222 LEAQ 16(inl), inl 1223 1224 // Perform ChaCha rounds, while hashing the remaining input 1225 openAVX2Tail256LoopB: 1226 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1227 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 1228 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 1229 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 1230 INCQ itr2 1231 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1232 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 1233 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 1234 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 1235 CMPQ 
itr2, itr1 1236 JB openAVX2Tail256LoopA 1237 1238 CMPQ itr2, $10 1239 JNE openAVX2Tail256LoopB 1240 1241 MOVQ inl, itr2 1242 SUBQ inp, inl 1243 MOVQ inl, itr1 1244 MOVQ tmpStoreAVX2, inl 1245 1246 // Hash the remainder of data (if any) 1247 openAVX2Tail256Hash: 1248 ADDQ $16, itr1 1249 CMPQ itr1, inl 1250 JGT openAVX2Tail256HashEnd 1251 polyAdd (0(itr2)) 1252 polyMulAVX2 1253 LEAQ 16(itr2), itr2 1254 JMP openAVX2Tail256Hash 1255 1256 // Store 128 bytes safely, then go to store loop 1257 openAVX2Tail256HashEnd: 1258 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 1259 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 1260 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 1261 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 1262 VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 1263 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 1264 1265 VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 1266 VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) 1267 LEAQ (4*32)(inp), inp 1268 LEAQ (4*32)(oup), oup 1269 SUBQ $4*32, inl 1270 1271 JMP openAVX2TailLoop 1272 1273 // ---------------------------------------------------------------------------- 1274 // Special optimization for the last 384 bytes of ciphertext 1275 openAVX2Tail384: 1276 // Need to decrypt up to 384 bytes - prepare six blocks 1277 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 1278 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 1279 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 1280 VMOVDQA ctr3StoreAVX2, DD0 1281 VPADDD ·avx2IncMask<>(SB), DD0, DD0 1282 VPADDD ·avx2IncMask<>(SB), DD0, DD1 1283 VPADDD ·avx2IncMask<>(SB), DD1, DD2 1284 VMOVDQA DD0, ctr0StoreAVX2 1285 VMOVDQA DD1, ctr1StoreAVX2 1286 VMOVDQA DD2, ctr2StoreAVX2 1287 1288 // Compute the number of iterations that will hash two blocks of data 1289 MOVQ inl, tmpStoreAVX2 1290 MOVQ inl, itr1 1291 SUBQ $256, itr1 1292 SHRQ $4, itr1 1293 ADDQ $6, itr1 1294 MOVQ $10, itr2 1295 CMPQ itr1, $10 1296 CMOVQGT itr2, itr1 1297 MOVQ inp, inl 1298 XORQ itr2, itr2 1299 1300 // Perform ChaCha rounds, while hashing the remaining input 1301 openAVX2Tail384LoopB: 1302 polyAdd(0(inl)) 1303 polyMulAVX2 1304 LEAQ 16(inl), inl 1305 1306 openAVX2Tail384LoopA: 1307 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1308 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 1309 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 1310 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 1311 polyAdd(0(inl)) 1312 polyMulAVX2 1313 LEAQ 16(inl), inl 1314 INCQ itr2 1315 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1316 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 1317 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 1318 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 1319 1320 CMPQ itr2, itr1 1321 JB openAVX2Tail384LoopB 1322 1323 CMPQ itr2, $10 1324 JNE 
openAVX2Tail384LoopA 1325 1326 MOVQ inl, itr2 1327 SUBQ inp, inl 1328 MOVQ inl, itr1 1329 MOVQ tmpStoreAVX2, inl 1330 1331 openAVX2Tail384Hash: 1332 ADDQ $16, itr1 1333 CMPQ itr1, inl 1334 JGT openAVX2Tail384HashEnd 1335 polyAdd(0(itr2)) 1336 polyMulAVX2 1337 LEAQ 16(itr2), itr2 1338 JMP openAVX2Tail384Hash 1339 1340 // Store 256 bytes safely, then go to store loop 1341 openAVX2Tail384HashEnd: 1342 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 1343 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 1344 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 1345 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 1346 VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 1347 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 1348 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) 1349 VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 1350 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 1351 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) 1352 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 1353 LEAQ (8*32)(inp), inp 1354 LEAQ (8*32)(oup), oup 1355 SUBQ $8*32, inl 1356 JMP openAVX2TailLoop 1357 1358 // ---------------------------------------------------------------------------- 1359 // Special optimization for the last 512 bytes of ciphertext 1360 openAVX2Tail512: 1361 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 1362 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 1363 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 1364 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 1365 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 1366 XORQ itr1, itr1 1367 MOVQ inp, itr2 1368 1369 openAVX2Tail512LoopB: 1370 polyAdd(0(itr2)) 1371 polyMulAVX2 1372 LEAQ (2*8)(itr2), itr2 1373 1374 openAVX2Tail512LoopA: 1375 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1376 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1377 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 1378 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1379 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1380 VMOVDQA CC3, tmpStoreAVX2 1381 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 1382 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 1383 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 1384 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 1385 VMOVDQA tmpStoreAVX2, 
CC3 1386 polyAdd(0*8(itr2)) 1387 polyMulAVX2 1388 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1389 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1390 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 1391 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1392 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1393 VMOVDQA CC3, tmpStoreAVX2 1394 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 1395 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 1396 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 1397 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 1398 VMOVDQA tmpStoreAVX2, CC3 1399 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 1400 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 1401 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 1402 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1403 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1404 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 1405 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1406 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1407 polyAdd(2*8(itr2)) 1408 polyMulAVX2 1409 LEAQ (4*8)(itr2), itr2 1410 VMOVDQA CC3, tmpStoreAVX2 1411 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 1412 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 1413 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 1414 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 1415 VMOVDQA tmpStoreAVX2, CC3 1416 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1417 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1418 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 1419 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1420 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1421 VMOVDQA CC3, tmpStoreAVX2 1422 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 1423 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 1424 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 1425 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 1426 VMOVDQA tmpStoreAVX2, CC3 1427 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 1428 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 1429 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 1430 INCQ itr1 1431 CMPQ itr1, $4 1432 JLT openAVX2Tail512LoopB 1433 1434 CMPQ itr1, $10 1435 JNE openAVX2Tail512LoopA 1436 1437 MOVQ inl, itr1 1438 SUBQ $384, itr1 1439 ANDQ $-16, itr1 1440 1441 openAVX2Tail512HashLoop: 1442 TESTQ itr1, itr1 1443 JE 
openAVX2Tail512HashEnd 1444 polyAdd(0(itr2)) 1445 polyMulAVX2 1446 LEAQ 16(itr2), itr2 1447 SUBQ $16, itr1 1448 JMP openAVX2Tail512HashLoop 1449 1450 openAVX2Tail512HashEnd: 1451 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 1452 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 1453 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 1454 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 1455 VMOVDQA CC3, tmpStoreAVX2 1456 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 1457 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 1458 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) 1459 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 1460 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 1461 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) 1462 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 1463 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 1464 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) 1465 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 1466 1467 LEAQ (12*32)(inp), inp 1468 LEAQ (12*32)(oup), oup 1469 SUBQ $12*32, inl 1470 1471 JMP openAVX2TailLoop 1472 1473 // ---------------------------------------------------------------------------- 1474 // ---------------------------------------------------------------------------- 1475 // func chacha20Poly1305Seal(dst, key, src, ad []byte) 1476 TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 1477 // For aligned stack access 1478 MOVQ SP, BP 1479 ADDQ $32, BP 1480 ANDQ $-32, BP 1481 MOVQ dst+0(FP), oup 1482 MOVQ key+24(FP), keyp 1483 MOVQ src+48(FP), inp 1484 MOVQ src_len+56(FP), inl 1485 MOVQ ad+72(FP), adp 1486 1487 CMPB ·useAVX2(SB), $1 1488 JE chacha20Poly1305Seal_AVX2 1489 1490 // Special optimization, for very short buffers 1491 CMPQ inl, $128 1492 JBE sealSSE128 // About 15% faster 1493 1494 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration 1495 MOVOU ·chacha20Constants<>(SB), A0 1496 MOVOU (1*16)(keyp), B0 1497 MOVOU (2*16)(keyp), C0 1498 MOVOU (3*16)(keyp), D0 1499 1500 // Store state on stack for future use 1501 MOVO B0, state1Store 1502 MOVO C0, state2Store 1503 1504 // Load state, increment counter blocks 1505 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 1506 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 1507 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 1508 1509 // Store counters 1510 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, 
ctr2Store; MOVO D3, ctr3Store 1511 MOVQ $10, itr2 1512 1513 sealSSEIntroLoop: 1514 MOVO C3, tmpStore 1515 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1516 MOVO tmpStore, C3 1517 MOVO C1, tmpStore 1518 chachaQR(A3, B3, C3, D3, C1) 1519 MOVO tmpStore, C1 1520 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 1521 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 1522 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left 1523 1524 MOVO C3, tmpStore 1525 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1526 MOVO tmpStore, C3 1527 MOVO C1, tmpStore 1528 chachaQR(A3, B3, C3, D3, C1) 1529 MOVO tmpStore, C1 1530 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right 1531 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 1532 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right 1533 DECQ itr2 1534 JNE sealSSEIntroLoop 1535 1536 // Add in the state 1537 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 1538 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 1539 PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 1540 PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 1541 1542 // Clamp and store the key 1543 PAND ·polyClampMask<>(SB), A0 1544 MOVO A0, rStore 1545 MOVO B0, sStore 1546 1547 // Hash AAD 1548 MOVQ ad_len+80(FP), itr2 1549 CALL polyHashADInternal<>(SB) 1550 1551 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 1552 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 1553 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) 1554 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 1555 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 1556 MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) 1557 1558 MOVQ $128, itr1 1559 SUBQ $128, inl 1560 LEAQ 128(inp), inp 1561 1562 MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 1563 1564 CMPQ inl, $64 1565 JBE sealSSE128SealHash 1566 1567 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 1568 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 1569 MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) 1570 1571 ADDQ $64, itr1 1572 SUBQ $64, inl 1573 LEAQ 64(inp), inp 1574 1575 MOVQ $2, itr1 1576 MOVQ $8, itr2 1577 1578 CMPQ inl, $64 1579 JBE sealSSETail64 1580 CMPQ inl, $128 1581 JBE sealSSETail128 1582 CMPQ inl, $192 1583 JBE sealSSETail192 1584 1585 sealSSEMainLoop: 1586 // Load state, increment counter blocks 1587 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 1588 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 1589 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 1590 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 1591 1592 // Store counters 1593 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store 1594 1595 sealSSEInnerLoop: 1596 MOVO C3, tmpStore 1597 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1598 MOVO tmpStore, C3 1599 MOVO C1, tmpStore 1600 chachaQR(A3, B3, C3, D3, C1) 1601 MOVO tmpStore, C1 1602 polyAdd(0(oup)) 
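// The polyAdd above and the polyMulStage1/2/3 + polyMulReduceStage macros below
// interleave one 16-byte Poly1305 block (read from the ciphertext already written
// at oup) with the ChaCha20 quarter-round shifts, so the MAC update overlaps the
// keystream generation for the next 256 bytes instead of running after it.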
1603 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 1604 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 1605 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left 1606 polyMulStage1 1607 polyMulStage2 1608 LEAQ (2*8)(oup), oup 1609 MOVO C3, tmpStore 1610 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1611 MOVO tmpStore, C3 1612 MOVO C1, tmpStore 1613 polyMulStage3 1614 chachaQR(A3, B3, C3, D3, C1) 1615 MOVO tmpStore, C1 1616 polyMulReduceStage 1617 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right 1618 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 1619 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right 1620 DECQ itr2 1621 JGE sealSSEInnerLoop 1622 polyAdd(0(oup)) 1623 polyMul 1624 LEAQ (2*8)(oup), oup 1625 DECQ itr1 1626 JG sealSSEInnerLoop 1627 1628 // Add in the state 1629 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 1630 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 1631 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 1632 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 1633 MOVO D3, tmpStore 1634 1635 // Load - xor - store 1636 MOVOU (0*16)(inp), D3; PXOR D3, A0 1637 MOVOU (1*16)(inp), D3; PXOR D3, B0 1638 MOVOU (2*16)(inp), D3; PXOR D3, C0 1639 MOVOU (3*16)(inp), D3; PXOR D3, D0 1640 MOVOU A0, (0*16)(oup) 1641 MOVOU B0, (1*16)(oup) 1642 MOVOU C0, (2*16)(oup) 1643 MOVOU D0, (3*16)(oup) 1644 MOVO tmpStore, D3 1645 1646 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 1647 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 1648 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 1649 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 1650 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 1651 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) 1652 ADDQ $192, inp 1653 MOVQ $192, itr1 1654 SUBQ $192, inl 1655 MOVO A3, A1 1656 MOVO B3, B1 1657 MOVO C3, C1 1658 MOVO D3, D1 1659 CMPQ inl, $64 1660 JBE sealSSE128SealHash 1661 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 1662 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 1663 MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) 1664 LEAQ 64(inp), inp 1665 SUBQ $64, inl 1666 MOVQ $6, itr1 1667 MOVQ $4, itr2 1668 CMPQ inl, $192 1669 JG sealSSEMainLoop 1670 1671 MOVQ inl, itr1 1672 TESTQ inl, inl 1673 JE sealSSE128SealHash 1674 MOVQ $6, itr1 1675 CMPQ inl, $64 1676 JBE sealSSETail64 1677 CMPQ inl, $128 1678 JBE sealSSETail128 1679 JMP sealSSETail192 1680 1681 // ---------------------------------------------------------------------------- 1682 // Special optimization for the last 64 bytes of plaintext 1683 sealSSETail64: 1684 // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes 1685 MOVO ·chacha20Constants<>(SB), A1 1686 MOVO state1Store, B1 1687 MOVO state2Store, C1 1688 MOVO ctr3Store, D1 1689 PADDL ·sseIncMask<>(SB), D1 1690 MOVO D1, ctr0Store 1691 1692 sealSSETail64LoopA: 1693 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext 1694 polyAdd(0(oup)) 1695 polyMul 1696 LEAQ 16(oup), oup 1697 1698 sealSSETail64LoopB: 1699 chachaQR(A1, B1, C1, D1, T1) 1700 
shiftB1Left; shiftC1Left; shiftD1Left 1701 chachaQR(A1, B1, C1, D1, T1) 1702 shiftB1Right; shiftC1Right; shiftD1Right 1703 polyAdd(0(oup)) 1704 polyMul 1705 LEAQ 16(oup), oup 1706 1707 DECQ itr1 1708 JG sealSSETail64LoopA 1709 1710 DECQ itr2 1711 JGE sealSSETail64LoopB 1712 PADDL ·chacha20Constants<>(SB), A1 1713 PADDL state1Store, B1 1714 PADDL state2Store, C1 1715 PADDL ctr0Store, D1 1716 1717 JMP sealSSE128Seal 1718 1719 // ---------------------------------------------------------------------------- 1720 // Special optimization for the last 128 bytes of plaintext 1721 sealSSETail128: 1722 // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes 1723 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store 1724 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store 1725 1726 sealSSETail128LoopA: 1727 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext 1728 polyAdd(0(oup)) 1729 polyMul 1730 LEAQ 16(oup), oup 1731 1732 sealSSETail128LoopB: 1733 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 1734 shiftB0Left; shiftC0Left; shiftD0Left 1735 shiftB1Left; shiftC1Left; shiftD1Left 1736 polyAdd(0(oup)) 1737 polyMul 1738 LEAQ 16(oup), oup 1739 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 1740 shiftB0Right; shiftC0Right; shiftD0Right 1741 shiftB1Right; shiftC1Right; shiftD1Right 1742 1743 DECQ itr1 1744 JG sealSSETail128LoopA 1745 1746 DECQ itr2 1747 JGE sealSSETail128LoopB 1748 1749 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 1750 PADDL state1Store, B0; PADDL state1Store, B1 1751 PADDL state2Store, C0; PADDL state2Store, C1 1752 PADDL ctr0Store, D0; PADDL ctr1Store, D1 1753 1754 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 1755 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 1756 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) 1757 1758 MOVQ $64, itr1 1759 LEAQ 64(inp), inp 1760 SUBQ $64, inl 1761 1762 JMP sealSSE128SealHash 1763 1764 // ---------------------------------------------------------------------------- 1765 // Special optimization for the last 192 bytes of plaintext 1766 sealSSETail192: 1767 // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes 1768 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store 1769 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store 1770 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store 1771 1772 sealSSETail192LoopA: 1773 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext 1774 polyAdd(0(oup)) 1775 polyMul 1776 LEAQ 16(oup), oup 1777 1778 sealSSETail192LoopB: 1779 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 1780 shiftB0Left; shiftC0Left; shiftD0Left 1781 shiftB1Left; shiftC1Left; shiftD1Left 1782 shiftB2Left; shiftC2Left; shiftD2Left 1783 1784 polyAdd(0(oup)) 1785 polyMul 1786 LEAQ 16(oup), oup 1787 1788 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 1789 shiftB0Right; shiftC0Right; shiftD0Right 1790 shiftB1Right; shiftC1Right; shiftD1Right 1791 shiftB2Right; shiftC2Right; shiftD2Right 1792 1793 DECQ itr1 1794 JG 
sealSSETail192LoopA 1795 1796 DECQ itr2 1797 JGE sealSSETail192LoopB 1798 1799 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 1800 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 1801 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 1802 PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 1803 1804 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 1805 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 1806 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) 1807 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 1808 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 1809 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 1810 1811 MOVO A2, A1 1812 MOVO B2, B1 1813 MOVO C2, C1 1814 MOVO D2, D1 1815 MOVQ $128, itr1 1816 LEAQ 128(inp), inp 1817 SUBQ $128, inl 1818 1819 JMP sealSSE128SealHash 1820 1821 // ---------------------------------------------------------------------------- 1822 // Special seal optimization for buffers smaller than 129 bytes 1823 sealSSE128: 1824 // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks 1825 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 1826 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 1827 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 1828 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 1829 MOVQ $10, itr2 1830 1831 sealSSE128InnerCipherLoop: 1832 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 1833 shiftB0Left; shiftB1Left; shiftB2Left 1834 shiftC0Left; shiftC1Left; shiftC2Left 1835 shiftD0Left; shiftD1Left; shiftD2Left 1836 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 1837 shiftB0Right; shiftB1Right; shiftB2Right 1838 shiftC0Right; shiftC1Right; shiftC2Right 1839 shiftD0Right; shiftD1Right; shiftD2Right 1840 DECQ itr2 1841 JNE sealSSE128InnerCipherLoop 1842 1843 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 1844 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 1845 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 1846 PADDL T2, C1; PADDL T2, C2 1847 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 1848 PAND ·polyClampMask<>(SB), A0 1849 MOVOU A0, rStore 1850 MOVOU B0, sStore 1851 1852 // Hash 1853 MOVQ ad_len+80(FP), itr2 1854 CALL polyHashADInternal<>(SB) 1855 XORQ itr1, itr1 1856 1857 sealSSE128SealHash: 1858 // itr1 holds the number of bytes encrypted but not yet hashed 1859 CMPQ itr1, $16 1860 JB sealSSE128Seal 1861 polyAdd(0(oup)) 1862 polyMul 1863 1864 SUBQ $16, itr1 1865 ADDQ $16, oup 1866 1867 JMP sealSSE128SealHash 1868 1869 sealSSE128Seal: 1870 CMPQ inl, $16 1871 JB sealSSETail 1872 SUBQ $16, inl 1873 1874 // Load for decryption 1875 MOVOU (inp), T0 1876 PXOR T0, A1 1877 MOVOU A1, (oup) 1878 LEAQ (1*16)(inp), inp 1879 LEAQ (1*16)(oup), oup 1880 1881 // Extract for hashing 1882 MOVQ A1, t0 1883 PSRLDQ $8, A1 1884 MOVQ A1, t1 1885 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 1886 polyMul 1887 1888 // Shift the stream "left" 1889 MOVO B1, A1 1890 MOVO C1, B1 1891 MOVO D1, C1 1892 MOVO A2, D1 1893 MOVO B2, A2 1894 MOVO C2, B2 1895 MOVO D2, C2 1896 JMP 
sealSSE128Seal 1897 1898 sealSSETail: 1899 TESTQ inl, inl 1900 JE sealSSEFinalize 1901 1902 // We can only load the PT one byte at a time to avoid read after end of buffer 1903 MOVQ inl, itr2 1904 SHLQ $4, itr2 1905 LEAQ ·andMask<>(SB), t0 1906 MOVQ inl, itr1 1907 LEAQ -1(inp)(inl*1), inp 1908 XORQ t2, t2 1909 XORQ t3, t3 1910 XORQ AX, AX 1911 1912 sealSSETailLoadLoop: 1913 SHLQ $8, t2, t3 1914 SHLQ $8, t2 1915 MOVB (inp), AX 1916 XORQ AX, t2 1917 LEAQ -1(inp), inp 1918 DECQ itr1 1919 JNE sealSSETailLoadLoop 1920 MOVQ t2, 0+tmpStore 1921 MOVQ t3, 8+tmpStore 1922 PXOR 0+tmpStore, A1 1923 MOVOU A1, (oup) 1924 MOVOU -16(t0)(itr2*1), T0 1925 PAND T0, A1 1926 MOVQ A1, t0 1927 PSRLDQ $8, A1 1928 MOVQ A1, t1 1929 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 1930 polyMul 1931 1932 ADDQ inl, oup 1933 1934 sealSSEFinalize: 1935 // Hash in the buffer lengths 1936 ADDQ ad_len+80(FP), acc0 1937 ADCQ src_len+56(FP), acc1 1938 ADCQ $1, acc2 1939 polyMul 1940 1941 // Final reduce 1942 MOVQ acc0, t0 1943 MOVQ acc1, t1 1944 MOVQ acc2, t2 1945 SUBQ $-5, acc0 1946 SBBQ $-1, acc1 1947 SBBQ $3, acc2 1948 CMOVQCS t0, acc0 1949 CMOVQCS t1, acc1 1950 CMOVQCS t2, acc2 1951 1952 // Add in the "s" part of the key 1953 ADDQ 0+sStore, acc0 1954 ADCQ 8+sStore, acc1 1955 1956 // Finally store the tag at the end of the message 1957 MOVQ acc0, (0*8)(oup) 1958 MOVQ acc1, (1*8)(oup) 1959 RET 1960 1961 // ---------------------------------------------------------------------------- 1962 // ------------------------- AVX2 Code ---------------------------------------- 1963 chacha20Poly1305Seal_AVX2: 1964 VZEROUPPER 1965 VMOVDQU ·chacha20Constants<>(SB), AA0 1966 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 1967 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 1968 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 1969 VPADDD ·avx2InitMask<>(SB), DD0, DD0 1970 1971 // Special optimizations, for very short buffers 1972 CMPQ inl, $192 1973 JBE seal192AVX2 // 33% faster 1974 CMPQ inl, $320 1975 JBE seal320AVX2 // 17% faster 1976 1977 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream 1978 VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 1979 VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 1980 VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 1981 VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 1982 VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 1983 VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 1984 VMOVDQA DD3, ctr3StoreAVX2 1985 MOVQ $10, itr2 1986 1987 sealAVX2IntroLoop: 1988 VMOVDQA CC3, tmpStoreAVX2 1989 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 1990 VMOVDQA tmpStoreAVX2, CC3 1991 VMOVDQA CC1, tmpStoreAVX2 1992 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 1993 VMOVDQA tmpStoreAVX2, CC1 1994 1995 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 1996 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 1997 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 1998 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 1999 2000 VMOVDQA CC3, tmpStoreAVX2 2001 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); 
chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 2002 VMOVDQA tmpStoreAVX2, CC3 2003 VMOVDQA CC1, tmpStoreAVX2 2004 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 2005 VMOVDQA tmpStoreAVX2, CC1 2006 2007 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 2008 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 2009 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 2010 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 2011 DECQ itr2 2012 JNE sealAVX2IntroLoop 2013 2014 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 2015 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 2016 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 2017 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 2018 2019 VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 2020 VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key 2021 VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 2022 2023 // Clamp and store poly key 2024 VPAND ·polyClampMask<>(SB), DD0, DD0 2025 VMOVDQA DD0, rsStoreAVX2 2026 2027 // Hash AD 2028 MOVQ ad_len+80(FP), itr2 2029 CALL polyHashADInternal<>(SB) 2030 2031 // Can store at least 320 bytes 2032 VPXOR (0*32)(inp), AA0, AA0 2033 VPXOR (1*32)(inp), CC0, CC0 2034 VMOVDQU AA0, (0*32)(oup) 2035 VMOVDQU CC0, (1*32)(oup) 2036 2037 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 2038 VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 2039 VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) 2040 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 2041 VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 2042 VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) 2043 2044 MOVQ $320, itr1 2045 SUBQ $320, inl 2046 LEAQ 320(inp), inp 2047 2048 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 2049 CMPQ inl, $128 2050 JBE sealAVX2SealHash 2051 2052 VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 2053 VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) 2054 SUBQ $128, inl 2055 LEAQ 128(inp), inp 2056 2057 MOVQ $8, itr1 2058 MOVQ $2, itr2 2059 2060 CMPQ inl, $128 2061 JBE sealAVX2Tail128 2062 CMPQ inl, $256 2063 JBE sealAVX2Tail256 2064 CMPQ inl, $384 2065 JBE sealAVX2Tail384 2066 CMPQ inl, $512 2067 JBE sealAVX2Tail512 2068 2069 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop 2070 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 2071 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; 
VMOVDQA BB0, BB3 2072 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 2073 VMOVDQA ctr3StoreAVX2, DD0 2074 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 2075 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 2076 2077 VMOVDQA CC3, tmpStoreAVX2 2078 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 2079 VMOVDQA tmpStoreAVX2, CC3 2080 VMOVDQA CC1, tmpStoreAVX2 2081 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 2082 VMOVDQA tmpStoreAVX2, CC1 2083 2084 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 2085 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 2086 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 2087 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 2088 2089 VMOVDQA CC3, tmpStoreAVX2 2090 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 2091 VMOVDQA tmpStoreAVX2, CC3 2092 VMOVDQA CC1, tmpStoreAVX2 2093 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 2094 VMOVDQA tmpStoreAVX2, CC1 2095 2096 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 2097 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 2098 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 2099 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 2100 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2101 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2102 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 2103 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2104 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2105 VMOVDQA CC3, tmpStoreAVX2 2106 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2107 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2108 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2109 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2110 VMOVDQA tmpStoreAVX2, CC3 2111 2112 SUBQ $16, oup // Adjust the pointer 2113 MOVQ $9, itr1 2114 JMP sealAVX2InternalLoopStart 2115 2116 sealAVX2MainLoop: 2117 // Load state, increment counter blocks, store the incremented counters 2118 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 2119 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 2120 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 2121 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 2122 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 2123 MOVQ $10, itr1 2124 2125 sealAVX2InternalLoop: 2126 polyAdd(0*8(oup)) 2127 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2128 polyMulStage1_AVX2 2129 VPXOR AA0, DD0, DD0; VPXOR AA1, 
DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2130 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 2131 polyMulStage2_AVX2 2132 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2133 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2134 polyMulStage3_AVX2 2135 VMOVDQA CC3, tmpStoreAVX2 2136 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2137 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2138 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2139 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2140 VMOVDQA tmpStoreAVX2, CC3 2141 polyMulReduceStage 2142 2143 sealAVX2InternalLoopStart: 2144 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2145 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2146 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 2147 polyAdd(2*8(oup)) 2148 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2149 polyMulStage1_AVX2 2150 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2151 VMOVDQA CC3, tmpStoreAVX2 2152 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 2153 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 2154 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 2155 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 2156 VMOVDQA tmpStoreAVX2, CC3 2157 polyMulStage2_AVX2 2158 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 2159 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 2160 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 2161 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2162 polyMulStage3_AVX2 2163 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2164 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 2165 polyMulReduceStage 2166 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2167 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2168 polyAdd(4*8(oup)) 2169 LEAQ (6*8)(oup), oup 2170 VMOVDQA CC3, tmpStoreAVX2 2171 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2172 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2173 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2174 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2175 VMOVDQA tmpStoreAVX2, CC3 2176 polyMulStage1_AVX2 2177 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2178 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2179 polyMulStage2_AVX2 2180 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 2181 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2182 polyMulStage3_AVX2 2183 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2184 VMOVDQA 
CC3, tmpStoreAVX2 2185 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 2186 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 2187 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 2188 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 2189 VMOVDQA tmpStoreAVX2, CC3 2190 polyMulReduceStage 2191 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 2192 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 2193 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 2194 DECQ itr1 2195 JNE sealAVX2InternalLoop 2196 2197 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 2198 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 2199 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 2200 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 2201 VMOVDQA CC3, tmpStoreAVX2 2202 2203 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here 2204 polyAdd(0*8(oup)) 2205 polyMulAVX2 2206 LEAQ (4*8)(oup), oup 2207 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 2208 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 2209 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) 2210 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 2211 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 2212 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) 2213 2214 // and here 2215 polyAdd(-2*8(oup)) 2216 polyMulAVX2 2217 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 2218 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 2219 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) 2220 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 2221 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 2222 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) 2223 LEAQ (32*16)(inp), inp 2224 SUBQ $(32*16), inl 2225 CMPQ inl, $512 2226 JG sealAVX2MainLoop 2227 2228 // Tail can only hash 480 bytes 2229 polyAdd(0*8(oup)) 2230 polyMulAVX2 2231 polyAdd(2*8(oup)) 2232 polyMulAVX2 2233 LEAQ 32(oup), oup 2234 2235 MOVQ $10, itr1 2236 MOVQ $0, itr2 2237 CMPQ inl, $128 2238 JBE sealAVX2Tail128 2239 CMPQ inl, $256 2240 JBE sealAVX2Tail256 2241 CMPQ inl, $384 2242 JBE sealAVX2Tail384 2243 JMP sealAVX2Tail512 2244 2245 // 
---------------------------------------------------------------------------- 2246 // Special optimization for buffers smaller than 193 bytes 2247 seal192AVX2: 2248 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks 2249 VMOVDQA AA0, AA1 2250 VMOVDQA BB0, BB1 2251 VMOVDQA CC0, CC1 2252 VPADDD ·avx2IncMask<>(SB), DD0, DD1 2253 VMOVDQA AA0, AA2 2254 VMOVDQA BB0, BB2 2255 VMOVDQA CC0, CC2 2256 VMOVDQA DD0, DD2 2257 VMOVDQA DD1, TT3 2258 MOVQ $10, itr2 2259 2260 sealAVX2192InnerCipherLoop: 2261 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 2262 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 2263 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 2264 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 2265 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 2266 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 2267 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 2268 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 2269 DECQ itr2 2270 JNE sealAVX2192InnerCipherLoop 2271 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 2272 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 2273 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 2274 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 2275 VPERM2I128 $0x02, AA0, BB0, TT0 2276 2277 // Clamp and store poly key 2278 VPAND ·polyClampMask<>(SB), TT0, TT0 2279 VMOVDQA TT0, rsStoreAVX2 2280 2281 // Stream for up to 192 bytes 2282 VPERM2I128 $0x13, AA0, BB0, AA0 2283 VPERM2I128 $0x13, CC0, DD0, BB0 2284 VPERM2I128 $0x02, AA1, BB1, CC0 2285 VPERM2I128 $0x02, CC1, DD1, DD0 2286 VPERM2I128 $0x13, AA1, BB1, AA1 2287 VPERM2I128 $0x13, CC1, DD1, BB1 2288 2289 sealAVX2ShortSeal: 2290 // Hash aad 2291 MOVQ ad_len+80(FP), itr2 2292 CALL polyHashADInternal<>(SB) 2293 XORQ itr1, itr1 2294 2295 sealAVX2SealHash: 2296 // itr1 holds the number of bytes encrypted but not yet hashed 2297 CMPQ itr1, $16 2298 JB sealAVX2ShortSealLoop 2299 polyAdd(0(oup)) 2300 polyMul 2301 SUBQ $16, itr1 2302 ADDQ $16, oup 2303 JMP sealAVX2SealHash 2304 2305 sealAVX2ShortSealLoop: 2306 CMPQ inl, $32 2307 JB sealAVX2ShortTail32 2308 SUBQ $32, inl 2309 2310 // Load for encryption 2311 VPXOR (inp), AA0, AA0 2312 VMOVDQU AA0, (oup) 2313 LEAQ (1*32)(inp), inp 2314 2315 // Now can hash 2316 polyAdd(0*8(oup)) 2317 polyMulAVX2 2318 polyAdd(2*8(oup)) 2319 polyMulAVX2 2320 LEAQ (1*32)(oup), oup 2321 2322 // Shift stream left 2323 VMOVDQA BB0, AA0 2324 VMOVDQA CC0, BB0 2325 VMOVDQA DD0, CC0 2326 VMOVDQA AA1, DD0 2327 VMOVDQA BB1, AA1 2328 VMOVDQA CC1, BB1 2329 VMOVDQA DD1, CC1 2330 VMOVDQA AA2, DD1 2331 VMOVDQA BB2, AA2 2332 JMP sealAVX2ShortSealLoop 2333 2334 sealAVX2ShortTail32: 2335 CMPQ inl, $16 2336 VMOVDQA A0, A1 2337 JB sealAVX2ShortDone 2338 2339 SUBQ $16, inl 2340 2341 // Load for encryption 2342 VPXOR (inp), A0, T0 2343 VMOVDQU T0, (oup) 2344 LEAQ (1*16)(inp), inp 2345 2346 // Hash 2347 polyAdd(0*8(oup)) 2348 polyMulAVX2 2349 LEAQ (1*16)(oup), oup 2350 VPERM2I128 $0x11, AA0, AA0, AA0 2351 VMOVDQA A0, A1 2352 2353 sealAVX2ShortDone: 2354 VZEROUPPER 2355 JMP sealSSETail 2356 2357 // ---------------------------------------------------------------------------- 2358 // Special optimization for buffers smaller than 321 bytes 2359 seal320AVX2: 2360 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks 2361 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 2362 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; 
VPADDD ·avx2IncMask<>(SB), DD1, DD2 2363 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 2364 MOVQ $10, itr2 2365 2366 sealAVX2320InnerCipherLoop: 2367 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 2368 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 2369 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 2370 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 2371 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 2372 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 2373 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 2374 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 2375 DECQ itr2 2376 JNE sealAVX2320InnerCipherLoop 2377 2378 VMOVDQA ·chacha20Constants<>(SB), TT0 2379 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 2380 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 2381 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 2382 VMOVDQA ·avx2IncMask<>(SB), TT0 2383 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 2384 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 2385 VPADDD TT3, DD2, DD2 2386 2387 // Clamp and store poly key 2388 VPERM2I128 $0x02, AA0, BB0, TT0 2389 VPAND ·polyClampMask<>(SB), TT0, TT0 2390 VMOVDQA TT0, rsStoreAVX2 2391 2392 // Stream for up to 320 bytes 2393 VPERM2I128 $0x13, AA0, BB0, AA0 2394 VPERM2I128 $0x13, CC0, DD0, BB0 2395 VPERM2I128 $0x02, AA1, BB1, CC0 2396 VPERM2I128 $0x02, CC1, DD1, DD0 2397 VPERM2I128 $0x13, AA1, BB1, AA1 2398 VPERM2I128 $0x13, CC1, DD1, BB1 2399 VPERM2I128 $0x02, AA2, BB2, CC1 2400 VPERM2I128 $0x02, CC2, DD2, DD1 2401 VPERM2I128 $0x13, AA2, BB2, AA2 2402 VPERM2I128 $0x13, CC2, DD2, BB2 2403 JMP sealAVX2ShortSeal 2404 2405 // ---------------------------------------------------------------------------- 2406 // Special optimization for the last 128 bytes of ciphertext 2407 sealAVX2Tail128: 2408 // Need to decrypt up to 128 bytes - prepare two blocks 2409 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed 2410 // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed 2411 VMOVDQA ·chacha20Constants<>(SB), AA0 2412 VMOVDQA state1StoreAVX2, BB0 2413 VMOVDQA state2StoreAVX2, CC0 2414 VMOVDQA ctr3StoreAVX2, DD0 2415 VPADDD ·avx2IncMask<>(SB), DD0, DD0 2416 VMOVDQA DD0, DD1 2417 2418 sealAVX2Tail128LoopA: 2419 polyAdd(0(oup)) 2420 polyMul 2421 LEAQ 16(oup), oup 2422 2423 sealAVX2Tail128LoopB: 2424 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) 2425 polyAdd(0(oup)) 2426 polyMul 2427 VPALIGNR $4, BB0, BB0, BB0 2428 VPALIGNR $8, CC0, CC0, CC0 2429 VPALIGNR $12, DD0, DD0, DD0 2430 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) 2431 polyAdd(16(oup)) 2432 polyMul 2433 LEAQ 32(oup), oup 2434 VPALIGNR $12, BB0, BB0, BB0 2435 VPALIGNR $8, CC0, CC0, CC0 2436 VPALIGNR $4, DD0, DD0, DD0 2437 DECQ itr1 2438 JG sealAVX2Tail128LoopA 2439 DECQ itr2 2440 JGE sealAVX2Tail128LoopB 2441 2442 VPADDD ·chacha20Constants<>(SB), AA0, AA1 2443 VPADDD state1StoreAVX2, BB0, BB1 2444 VPADDD state2StoreAVX2, CC0, CC1 2445 VPADDD DD1, DD0, DD1 2446 2447 VPERM2I128 $0x02, AA1, BB1, AA0 2448 VPERM2I128 $0x02, CC1, DD1, BB0 2449 VPERM2I128 $0x13, AA1, BB1, CC0 2450 VPERM2I128 $0x13, CC1, DD1, DD0 2451 JMP sealAVX2ShortSealLoop 2452 
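// Note on the sealAVX2Tail* routines (the one above and the ones that follow):
// LoopA hashes one extra 16-byte block of pending ciphertext; LoopB performs one
// ChaCha20 double round on the tail blocks while hashing two more 16-byte blocks.
// itr1 counts the passes that run LoopA+LoopB (48 bytes hashed each), itr2 counts
// the trailing LoopB-only passes (32 bytes each). Both settings add up to ten
// double rounds while draining either the 448 pending bytes when a tail is reached
// before the main loop (itr1=8, itr2=2) or the 512 pending bytes when it is
// reached after it (32 bytes hashed at the branch, then itr1=10, itr2=0).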
2453 // ---------------------------------------------------------------------------- 2454 // Special optimization for the last 256 bytes of ciphertext 2455 sealAVX2Tail256: 2456 // Need to decrypt up to 256 bytes - prepare two blocks 2457 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed 2458 // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed 2459 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 2460 VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 2461 VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 2462 VMOVDQA ctr3StoreAVX2, DD0 2463 VPADDD ·avx2IncMask<>(SB), DD0, DD0 2464 VPADDD ·avx2IncMask<>(SB), DD0, DD1 2465 VMOVDQA DD0, TT1 2466 VMOVDQA DD1, TT2 2467 2468 sealAVX2Tail256LoopA: 2469 polyAdd(0(oup)) 2470 polyMul 2471 LEAQ 16(oup), oup 2472 2473 sealAVX2Tail256LoopB: 2474 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 2475 polyAdd(0(oup)) 2476 polyMul 2477 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 2478 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 2479 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 2480 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 2481 polyAdd(16(oup)) 2482 polyMul 2483 LEAQ 32(oup), oup 2484 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 2485 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 2486 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 2487 DECQ itr1 2488 JG sealAVX2Tail256LoopA 2489 DECQ itr2 2490 JGE sealAVX2Tail256LoopB 2491 2492 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 2493 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 2494 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 2495 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 2496 VPERM2I128 $0x02, AA0, BB0, TT0 2497 VPERM2I128 $0x02, CC0, DD0, TT1 2498 VPERM2I128 $0x13, AA0, BB0, TT2 2499 VPERM2I128 $0x13, CC0, DD0, TT3 2500 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 2501 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) 2502 MOVQ $128, itr1 2503 LEAQ 128(inp), inp 2504 SUBQ $128, inl 2505 VPERM2I128 $0x02, AA1, BB1, AA0 2506 VPERM2I128 $0x02, CC1, DD1, BB0 2507 VPERM2I128 $0x13, AA1, BB1, CC0 2508 VPERM2I128 $0x13, CC1, DD1, DD0 2509 2510 JMP sealAVX2SealHash 2511 2512 // ---------------------------------------------------------------------------- 2513 // Special optimization for the last 384 bytes of ciphertext 2514 sealAVX2Tail384: 2515 // Need to decrypt up to 384 bytes - prepare two blocks 2516 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed 2517 // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed 2518 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 2519 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 2520 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 2521 VMOVDQA ctr3StoreAVX2, DD0 2522 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 2523 VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 2524 2525 sealAVX2Tail384LoopA: 2526 polyAdd(0(oup)) 2527 polyMul 2528 LEAQ 16(oup), oup 2529 2530 sealAVX2Tail384LoopB: 2531 chachaQR_AVX2(AA0, BB0, CC0, DD0, 
TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 2532 polyAdd(0(oup)) 2533 polyMul 2534 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 2535 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 2536 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 2537 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 2538 polyAdd(16(oup)) 2539 polyMul 2540 LEAQ 32(oup), oup 2541 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 2542 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 2543 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 2544 DECQ itr1 2545 JG sealAVX2Tail384LoopA 2546 DECQ itr2 2547 JGE sealAVX2Tail384LoopB 2548 2549 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 2550 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 2551 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 2552 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 2553 VPERM2I128 $0x02, AA0, BB0, TT0 2554 VPERM2I128 $0x02, CC0, DD0, TT1 2555 VPERM2I128 $0x13, AA0, BB0, TT2 2556 VPERM2I128 $0x13, CC0, DD0, TT3 2557 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 2558 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) 2559 VPERM2I128 $0x02, AA1, BB1, TT0 2560 VPERM2I128 $0x02, CC1, DD1, TT1 2561 VPERM2I128 $0x13, AA1, BB1, TT2 2562 VPERM2I128 $0x13, CC1, DD1, TT3 2563 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 2564 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) 2565 MOVQ $256, itr1 2566 LEAQ 256(inp), inp 2567 SUBQ $256, inl 2568 VPERM2I128 $0x02, AA2, BB2, AA0 2569 VPERM2I128 $0x02, CC2, DD2, BB0 2570 VPERM2I128 $0x13, AA2, BB2, CC0 2571 VPERM2I128 $0x13, CC2, DD2, DD0 2572 2573 JMP sealAVX2SealHash 2574 2575 // ---------------------------------------------------------------------------- 2576 // Special optimization for the last 512 bytes of ciphertext 2577 sealAVX2Tail512: 2578 // Need to decrypt up to 512 bytes - prepare two blocks 2579 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed 2580 // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed 2581 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 2582 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 2583 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 2584 VMOVDQA ctr3StoreAVX2, DD0 2585 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 2586 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 2587 2588 sealAVX2Tail512LoopA: 2589 polyAdd(0(oup)) 2590 polyMul 2591 LEAQ 16(oup), oup 2592 2593 sealAVX2Tail512LoopB: 2594 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, 
AA3 2595 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2596 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 2597 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2598 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2599 VMOVDQA CC3, tmpStoreAVX2 2600 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2601 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2602 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2603 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2604 VMOVDQA tmpStoreAVX2, CC3 2605 polyAdd(0*8(oup)) 2606 polyMulAVX2 2607 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2608 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2609 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 2610 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2611 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2612 VMOVDQA CC3, tmpStoreAVX2 2613 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 2614 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 2615 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 2616 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 2617 VMOVDQA tmpStoreAVX2, CC3 2618 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 2619 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 2620 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 2621 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2622 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2623 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 2624 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2625 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2626 polyAdd(2*8(oup)) 2627 polyMulAVX2 2628 LEAQ (4*8)(oup), oup 2629 VMOVDQA CC3, tmpStoreAVX2 2630 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2631 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2632 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2633 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2634 VMOVDQA tmpStoreAVX2, CC3 2635 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2636 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2637 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 2638 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2639 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2640 VMOVDQA CC3, tmpStoreAVX2 2641 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 2642 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 2643 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; 
VPXOR CC3, BB2, BB2 2644 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 2645 VMOVDQA tmpStoreAVX2, CC3 2646 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 2647 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 2648 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 2649 2650 DECQ itr1 2651 JG sealAVX2Tail512LoopA 2652 DECQ itr2 2653 JGE sealAVX2Tail512LoopB 2654 2655 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 2656 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 2657 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 2658 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 2659 VMOVDQA CC3, tmpStoreAVX2 2660 VPERM2I128 $0x02, AA0, BB0, CC3 2661 VPXOR (0*32)(inp), CC3, CC3 2662 VMOVDQU CC3, (0*32)(oup) 2663 VPERM2I128 $0x02, CC0, DD0, CC3 2664 VPXOR (1*32)(inp), CC3, CC3 2665 VMOVDQU CC3, (1*32)(oup) 2666 VPERM2I128 $0x13, AA0, BB0, CC3 2667 VPXOR (2*32)(inp), CC3, CC3 2668 VMOVDQU CC3, (2*32)(oup) 2669 VPERM2I128 $0x13, CC0, DD0, CC3 2670 VPXOR (3*32)(inp), CC3, CC3 2671 VMOVDQU CC3, (3*32)(oup) 2672 2673 VPERM2I128 $0x02, AA1, BB1, AA0 2674 VPERM2I128 $0x02, CC1, DD1, BB0 2675 VPERM2I128 $0x13, AA1, BB1, CC0 2676 VPERM2I128 $0x13, CC1, DD1, DD0 2677 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 2678 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) 2679 2680 VPERM2I128 $0x02, AA2, BB2, AA0 2681 VPERM2I128 $0x02, CC2, DD2, BB0 2682 VPERM2I128 $0x13, AA2, BB2, CC0 2683 VPERM2I128 $0x13, CC2, DD2, DD0 2684 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 2685 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) 2686 2687 MOVQ $384, itr1 2688 LEAQ 384(inp), inp 2689 SUBQ $384, inl 2690 VPERM2I128 $0x02, AA3, BB3, AA0 2691 VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 2692 VPERM2I128 $0x13, AA3, BB3, CC0 2693 VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 2694 2695 JMP sealAVX2SealHash
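// ----------------------------------------------------------------------------
// For orientation, the seal path above implements the RFC 8439 AEAD construction.
// A rough Go-style sketch follows (illustrative names only, not the wrapper's real
// API). The key argument presumably points at a 64-byte ChaCha20 state prepared by
// the Go wrapper: rows 1-3 are loaded from (1*16)(keyp)..(3*16)(keyp), while row 0
// is reloaded from ·chacha20Constants<>. Assuming the counter starts at 0, per the
// RFC:
//
//	block0 := chacha20Block(state)                       // keystream block, counter 0
//	r, s := clamp(block0[0:16]), block0[16:32]           // Poly1305 key halves, cf. polyClampMask
//	ct := plaintext XOR chacha20(state, counter 1, 2, …) // bulk encryption above
//	tag := poly1305(r, s, pad16(ad) || pad16(ct) || le64(len(ad)) || le64(len(ct)))
//	dst := ct || tag                                     // 16-byte tag stored by sealSSEFinalize
//
// The final reduction in sealSSEFinalize selects acc or acc-(2^130-5) via
// SUBQ/SBBQ/CMOVQCS and then adds the "s" half of the key modulo 2^128 to form
// the tag.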