// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI
// The implementation uses some optimization as described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
//     Hardware

#include "textflag.h"

// B0-B7: the eight data/counter blocks processed in parallel.
#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7

// ACC0/ACC1/ACCM: accumulators for the low, high and middle carry-less
// partial products (PCLMULQDQ $0x00, $0x11, and the product of the
// folded halves respectively).
#define ACC0 X8
#define ACC1 X9
#define ACCM X10

// T0-T2: scratch. POLY and BSWAP hold the constants gcmPoly<> and bswapMask<>.
#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15

// PSHUFB control mask that reverses the 16 bytes of an XMM register.
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607

// Reduction constant loaded into POLY.
DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000

// andMask is a table of fifteen 16-byte masks; entry n-1 (byte offset
// 16*n - 16) keeps the low n bytes of a block, for n = 1..15.
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240

// func hasGCMAsm() bool
// returns whether the instruction-set extensions used by this file are
// supported: AES-NI and CLMUL-NI, plus SSSE3 (PSHUFB) and SSE4.1
// (PINSRB/PINSRD/PINSRQ/PEXTRB).
// When runtime·isEnclave is 1 the CPUID probe is skipped and the function
// reports true unconditionally.
TEXT ·hasGCMAsm(SB),NOSPLIT,$0
	// @aghosn we want to avoid the cpuid
	MOVB runtime·isEnclave(SB), R8
	CMPB R8, $1
	JNE normal
	MOVB $1, CX
	JMP end

normal:
	// CPUID leaf 1; the feature flags of interest are returned in ECX.
	XORQ AX, AX
	INCL AX
	CPUID
	// Required ECX bits: 1 (PCLMULQDQ), 9 (SSSE3), 19 (SSE4.1), 25 (AES-NI).
	ANDL $0x02080202, CX
	CMPL CX, $0x02080202
	SETEQ CX
end:
	MOVB CX, ret+0(FP)
	RET

// func aesEncBlock(dst, src *[16]byte, ks []uint32)
// Encrypts the 16-byte block at src with the round keys in ks and stores
// the result at dst.
TEXT ·aesEncBlock(SB),NOSPLIT,$0
	MOVQ dst+0(FP), DI
	MOVQ src+8(FP), SI
	MOVQ ks_base+16(FP), DX
	MOVQ ks_len+24(FP), CX

	// CX = len(ks)/4 - 1; compared against 12 below to decide how many
	// extra rounds to run.
	SHRQ $2, CX
	DECQ CX

	MOVOU (SI), X0
	MOVOU (16*0)(DX), X1
	PXOR X1, X0
	MOVOU (16*1)(DX), X1
	AESENC X1, X0
	MOVOU (16*2)(DX), X1
	AESENC X1, X0
	MOVOU (16*3)(DX), X1
	AESENC X1, X0
	MOVOU (16*4)(DX), X1
	AESENC X1, X0
	MOVOU (16*5)(DX), X1
	AESENC X1, X0
	MOVOU (16*6)(DX), X1
	AESENC X1, X0
	MOVOU (16*7)(DX), X1
	AESENC X1, X0
	MOVOU (16*8)(DX), X1
	AESENC X1, X0
	MOVOU (16*9)(DX), X1
	AESENC X1, X0
	MOVOU (16*10)(DX), X1
	CMPQ CX, $12
	JB encLast
	AESENC X1, X0
	MOVOU (16*11)(DX), X1
	AESENC X1, X0
	MOVOU (16*12)(DX), X1
	// Still testing the flags of the CMPQ above; no instruction in
	// between modifies them.
	JE encLast
	AESENC X1, X0
	MOVOU (16*13)(DX), X1
	AESENC X1, X0
	MOVOU (16*14)(DX), X1

encLast:
	AESENCLAST X1, X0
	MOVOU X0, (DI)

	RET

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
// Folds the bit lengths (pLen*8, dLen*8) into the running hash at T,
// multiplies by the table entries at offsets 16*14 and 16*15, byte-swaps
// the result, XORs it with tagMask and writes it back to T.
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX

	MOVQ productTable+0(FP), pTbl
	MOVQ tagMask+8(FP), tMsk
	MOVQ T+16(FP), tPtr
	MOVQ pLen+24(FP), plen
	MOVQ dLen+32(FP), dlen

	MOVOU (tPtr), ACC0
	MOVOU (tMsk), T2

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Byte counts -> bit counts.
	SHLQ $3, plen
	SHLQ $3, dlen

	MOVQ plen, B0
	PINSRQ $1, dlen, B0

	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	PXOR ACC1, ACC0

	PSHUFB BSWAP, ACC0
	PXOR T2, ACC0
	MOVOU ACC0, (tPtr)

	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
// Derives the hash key H from the key schedule ks and fills productTable
// with eight pairs of 16-byte entries, written from offset 16*14 downwards:
// each pair is a value followed by the XOR of its two 64-bit halves.
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define dst DI
#define KS SI
#define NR DX

	MOVQ productTable+0(FP), dst
	MOVQ ks_base+8(FP), KS
	MOVQ ks_len+16(FP), NR

	// NR = len(ks)/4 - 1
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Encrypt block 0, with the AES key to generate the hash key H
	MOVOU (16*0)(KS), B0
	MOVOU (16*1)(KS), T0
	AESENC T0, B0
	MOVOU (16*2)(KS), T0
	AESENC T0, B0
	MOVOU (16*3)(KS), T0
	AESENC T0, B0
	MOVOU (16*4)(KS), T0
	AESENC T0, B0
	MOVOU (16*5)(KS), T0
	AESENC T0, B0
	MOVOU (16*6)(KS), T0
	AESENC T0, B0
	MOVOU (16*7)(KS), T0
	AESENC T0, B0
	MOVOU (16*8)(KS), T0
	AESENC T0, B0
	MOVOU (16*9)(KS), T0
	AESENC T0, B0
	MOVOU (16*10)(KS), T0
	CMPQ NR, $12
	JB initEncLast
	AESENC T0, B0
	MOVOU (16*11)(KS), T0
	AESENC T0, B0
	MOVOU (16*12)(KS), T0
	JE initEncLast
	AESENC T0, B0
	MOVOU (16*13)(KS), T0
	AESENC T0, B0
	MOVOU (16*14)(KS), T0
initEncLast:
	AESENCLAST T0, B0

	PSHUFB BSWAP, B0
	// H * 2
	PSHUFD $0xff, B0, T0
	MOVOU B0, T1
	PSRAL $31, T0
	PAND POLY, T0
	PSRLL $31, T1
	PSLLDQ $4, T1
	PSLLL $1, B0
	PXOR T0, B0
	PXOR T1, B0
	// Karatsuba pre-computations
	MOVOU B0, (16*14)(dst)
	PSHUFD $78, B0, B1
	PXOR B0, B1
	MOVOU B1, (16*15)(dst)

	MOVOU B0, B2
	MOVOU B1, B3
	// Now prepare powers of H and pre-computations for them
	MOVQ $7, AX

initLoop:
		MOVOU B2, T0
		MOVOU B2, T1
		MOVOU B3, T2
		PCLMULQDQ $0x00, B0, T0
		PCLMULQDQ $0x11, B0, T1
		PCLMULQDQ $0x00, B1, T2

		PXOR T0, T2
		PXOR T1, T2
		MOVOU T2, B4
		PSLLDQ $8, B4
		PSRLDQ $8, T2
		PXOR B4, T0
		PXOR T2, T1

		MOVOU POLY, B2
		PCLMULQDQ $0x01, T0, B2
		PSHUFD $78, T0, T0
		PXOR B2, T0
		MOVOU POLY, B2
		PCLMULQDQ $0x01, T0, B2
		PSHUFD $78, T0, T0
		PXOR T0, B2
		PXOR T1, B2

		MOVOU B2, (16*12)(dst)
		PSHUFD $78, B2, B3
		PXOR B2, B3
		MOVOU B3, (16*13)(dst)

		// LEAQ leaves the flags set by DECQ untouched for the JNE.
		DECQ AX
		LEAQ (-16*2)(dst), dst
	JNE initLoop

	RET
#undef NR
#undef KS
#undef dst

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
// Hashes data starting from a zero accumulator and stores the 16-byte
// result at T. Full 128-byte groups use the eight-block path, remaining
// full blocks are handled one at a time, and a trailing partial block is
// loaded byte by byte into a zeroed register. A length of exactly 13 bytes
// takes a dedicated path.
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl DI
#define aut SI
#define tPtr CX
#define autLen DX

#define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
#define mulRoundAAD(X ,i) \
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, X, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, X, T1;\
	PXOR T1, X;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ data_base+8(FP), aut
	MOVQ data_len+16(FP), autLen
	MOVQ T+32(FP), tPtr

	PXOR ACC0, ACC0
	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	TESTQ autLen, autLen
	JEQ dataBail

	CMPQ autLen, $13	// optimize the TLS case
	JE dataTLS
	CMPQ autLen, $128
	JB startSinglesLoop
	JMP dataOctaLoop

dataTLS:
	// Load exactly 13 bytes (8 + 4 + 1) into a zeroed B0, then clear
	// autLen so that dataMul's loop-back falls out through dataEnd.
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2
	PXOR B0, B0
	MOVQ (aut), B0
	PINSRD $2, 8(aut), B0
	PINSRB $12, 12(aut), B0
	XORQ autLen, autLen
	JMP dataMul

dataOctaLoop:
		CMPQ autLen, $128
		JB startSinglesLoop
		SUBQ $128, autLen

		MOVOU (16*0)(aut), X0
		MOVOU (16*1)(aut), X1
		MOVOU (16*2)(aut), X2
		MOVOU (16*3)(aut), X3
		MOVOU (16*4)(aut), X4
		MOVOU (16*5)(aut), X5
		MOVOU (16*6)(aut), X6
		MOVOU (16*7)(aut), X7
		LEAQ (16*8)(aut), aut
		PSHUFB BSWAP, X0
		PSHUFB BSWAP, X1
		PSHUFB BSWAP, X2
		PSHUFB BSWAP, X3
		PSHUFB BSWAP, X4
		PSHUFB BSWAP, X5
		PSHUFB BSWAP, X6
		PSHUFB BSWAP, X7
		PXOR ACC0, X0

		MOVOU (16*0)(pTbl), ACC0
		MOVOU (16*1)(pTbl), ACCM
		MOVOU ACC0, ACC1
		PSHUFD $78, X0, T1
		PXOR X0, T1
		PCLMULQDQ $0x00, X0, ACC0
		PCLMULQDQ $0x11, X0, ACC1
		PCLMULQDQ $0x00, T1, ACCM

		mulRoundAAD(X1, 1)
		mulRoundAAD(X2, 2)
		mulRoundAAD(X3, 3)
		mulRoundAAD(X4, 4)
		mulRoundAAD(X5, 5)
		mulRoundAAD(X6, 6)
		mulRoundAAD(X7, 7)

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0
		reduceRound(ACC0)
		reduceRound(ACC0)
		PXOR ACC1, ACC0
	JMP dataOctaLoop

startSinglesLoop:
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2

dataSinglesLoop:

		CMPQ autLen, $16
		JB dataEnd
		SUBQ $16, autLen

		MOVOU (aut), B0
dataMul:
		PSHUFB BSWAP, B0
		PXOR ACC0, B0

		MOVOU T1, ACC0
		MOVOU T2, ACCM
		MOVOU T1, ACC1

		PSHUFD $78, B0, T0
		PXOR B0, T0
		PCLMULQDQ $0x00, B0, ACC0
		PCLMULQDQ $0x11, B0, ACC1
		PCLMULQDQ $0x00, T0, ACCM

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		MOVOU POLY, T0
		PCLMULQDQ $0x01, ACC0, T0
		PSHUFD $78, ACC0, ACC0
		PXOR T0, ACC0

		MOVOU POLY, T0
		PCLMULQDQ $0x01, ACC0, T0
		PSHUFD $78, ACC0, ACC0
		PXOR T0, ACC0
		PXOR ACC1, ACC0

		LEAQ 16(aut), aut

	JMP dataSinglesLoop

dataEnd:

	TESTQ autLen, autLen
	JEQ dataBail

	// Partial final block: start at its last byte and shift the bytes
	// into B0 one at a time, so no byte beyond the input is read.
	PXOR B0, B0
	LEAQ -1(aut)(autLen*1), aut

dataLoadLoop:

		PSLLDQ $1, B0
		PINSRB $0, (aut), B0

		LEAQ -1(aut), aut
		DECQ autLen
		JNE dataLoadLoop

	JMP dataMul

dataBail:
	MOVOU ACC0, (tPtr)
	RET
#undef pTbl
#undef aut
#undef tPtr
#undef autLen

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
// Encrypts src into dst in counter mode and folds the produced ciphertext
// into the hash at T.
//
// Stack frame use (256 bytes):
//   (0*16 .. 7*16)(SP)   byte-swapped ciphertext blocks waiting to be hashed
//   (8*16 .. 15*16)(SP)  counter blocks, already XORed with round key 0
//
// The register aliases and the aesRnd/aesRound/aesRndLast macros defined
// here stay defined for gcmAesDec below; only increment is redefined there.
TEXT ·gcmAesEnc(SB),0,$256-96
#define pTbl DI
#define ctx DX
#define ctrPtr CX
#define ptx SI
#define ks AX
#define tPtr R8
#define ptxLen R9
#define aluCTR R10
#define aluTMP R11
#define aluK R12
#define NR R13

#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
#define combinedRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(SP), T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM
#define mulRound(i) \
	MOVOU (16*i)(SP), T0;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, T0, T1;\
	PXOR T1, T0;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ctx
	MOVQ src_base+32(FP), ptx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	// NR = len(ks)/4 - 1
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	// aluCTR: last 32-bit word of the counter block, byte-swapped.
	// aluK:   last 32-bit word of round key 0, byte-swapped.
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block XOR round key 0; saved as counter slot 0.
	PXOR B0, T0
	MOVOU T0, (8*16 + 0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesEncSingles
	SUBQ $128, ptxLen

	// We have at least 8 blocks to encrypt, prepare the rest of the counters
	MOVOU T0, (8*16 + 1*16)(SP)
	increment(1)
	MOVOU T0, (8*16 + 2*16)(SP)
	increment(2)
	MOVOU T0, (8*16 + 3*16)(SP)
	increment(3)
	MOVOU T0, (8*16 + 4*16)(SP)
	increment(4)
	MOVOU T0, (8*16 + 5*16)(SP)
	increment(5)
	MOVOU T0, (8*16 + 6*16)(SP)
	increment(6)
	MOVOU T0, (8*16 + 7*16)(SP)
	increment(7)

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	// First eight blocks: nothing to hash yet, so only the AES rounds
	// are interleaved with preparing the next eight counters.
	aesRound(1)
	increment(0)
	aesRound(2)
	increment(1)
	aesRound(3)
	increment(2)
	aesRound(4)
	increment(3)
	aesRound(5)
	increment(4)
	aesRound(6)
	increment(5)
	aesRound(7)
	increment(6)
	aesRound(8)
	increment(7)
	aesRound(9)
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

gcmAesEncOctetsLoop:

		CMPQ ptxLen, $128
		JB gcmAesEncOctetsEnd
		SUBQ $128, ptxLen

		MOVOU (8*16 + 0*16)(SP), B0
		MOVOU (8*16 + 1*16)(SP), B1
		MOVOU (8*16 + 2*16)(SP), B2
		MOVOU (8*16 + 3*16)(SP), B3
		MOVOU (8*16 + 4*16)(SP), B4
		MOVOU (8*16 + 5*16)(SP), B5
		MOVOU (8*16 + 6*16)(SP), B6
		MOVOU (8*16 + 7*16)(SP), B7

		MOVOU (16*0)(SP), T0
		PSHUFD $78, T0, T1
		PXOR T0, T1

		MOVOU (16*0)(pTbl), ACC0
		MOVOU (16*1)(pTbl), ACCM
		MOVOU ACC0, ACC1

		PCLMULQDQ $0x00, T1, ACCM
		PCLMULQDQ $0x00, T0, ACC0
		PCLMULQDQ $0x11, T0, ACC1

		// AES rounds for the current eight blocks interleaved with
		// hashing of the previous eight ciphertext blocks.
		combinedRound(1)
		increment(0)
		combinedRound(2)
		increment(1)
		combinedRound(3)
		increment(2)
		combinedRound(4)
		increment(3)
		combinedRound(5)
		increment(4)
		combinedRound(6)
		increment(5)
		combinedRound(7)
		increment(6)

		aesRound(8)
		increment(7)

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		aesRound(9)

		reduceRound(ACC0)
		PXOR ACC1, ACC0

		MOVOU (16*10)(ks), T0
		CMPQ NR, $12
		JB encLast2
		aesRnd(T0)
		aesRound(11)
		MOVOU (16*12)(ks), T0
		JE encLast2
		aesRnd(T0)
		aesRound(13)
		MOVOU (16*14)(ks), T0
encLast2:
		aesRndLast(T0)

		MOVOU (16*0)(ptx), T0
		PXOR T0, B0
		MOVOU (16*1)(ptx), T0
		PXOR T0, B1
		MOVOU (16*2)(ptx), T0
		PXOR T0, B2
		MOVOU (16*3)(ptx), T0
		PXOR T0, B3
		MOVOU (16*4)(ptx), T0
		PXOR T0, B4
		MOVOU (16*5)(ptx), T0
		PXOR T0, B5
		MOVOU (16*6)(ptx), T0
		PXOR T0, B6
		MOVOU (16*7)(ptx), T0
		PXOR T0, B7

		MOVOU B0, (16*0)(ctx)
		PSHUFB BSWAP, B0
		PXOR ACC0, B0
		MOVOU B1, (16*1)(ctx)
		PSHUFB BSWAP, B1
		MOVOU B2, (16*2)(ctx)
		PSHUFB BSWAP, B2
		MOVOU B3, (16*3)(ctx)
		PSHUFB BSWAP, B3
		MOVOU B4, (16*4)(ctx)
		PSHUFB BSWAP, B4
		MOVOU B5, (16*5)(ctx)
		PSHUFB BSWAP, B5
		MOVOU B6, (16*6)(ctx)
		PSHUFB BSWAP, B6
		MOVOU B7, (16*7)(ctx)
		PSHUFB BSWAP, B7

		MOVOU B0, (16*0)(SP)
		MOVOU B1, (16*1)(SP)
		MOVOU B2, (16*2)(SP)
		MOVOU B3, (16*3)(SP)
		MOVOU B4, (16*4)(SP)
		MOVOU B5, (16*5)(SP)
		MOVOU B6, (16*6)(SP)
		MOVOU B7, (16*7)(SP)

		LEAQ 128(ptx), ptx
		LEAQ 128(ctx), ctx

		JMP gcmAesEncOctetsLoop

gcmAesEncOctetsEnd:

	// Hash the last eight ciphertext blocks still parked on the stack.
	MOVOU (16*0)(SP), T0
	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, T0, T1
	PXOR T0, T1
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRound(1)
	mulRound(2)
	mulRound(3)
	mulRound(4)
	mulRound(5)
	mulRound(6)
	mulRound(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// Eight counters were prepared ahead but only slot 0 is consumed
	// from here on, so step the running counter back by seven.
	SUBQ $7, aluCTR

gcmAesEncSingles:

	// Keep round keys 1..7 in B1..B7 for the one-block-at-a-time paths.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesEncSinglesLoop:

		CMPQ ptxLen, $16
		JB gcmAesEncTail
		SUBQ $16, ptxLen

		MOVOU (8*16 + 0*16)(SP), B0
		increment(0)

		AESENC B1, B0
		AESENC B2, B0
		AESENC B3, B0
		AESENC B4, B0
		AESENC B5, B0
		AESENC B6, B0
		AESENC B7, B0
		MOVOU (16*8)(ks), T0
		AESENC T0, B0
		MOVOU (16*9)(ks), T0
		AESENC T0, B0
		MOVOU (16*10)(ks), T0
		CMPQ NR, $12
		JB encLast3
		AESENC T0, B0
		MOVOU (16*11)(ks), T0
		AESENC T0, B0
		MOVOU (16*12)(ks), T0
		JE encLast3
		AESENC T0, B0
		MOVOU (16*13)(ks), T0
		AESENC T0, B0
		MOVOU (16*14)(ks), T0
encLast3:
		AESENCLAST T0, B0

		MOVOU (ptx), T0
		PXOR T0, B0
		MOVOU B0, (ctx)

		PSHUFB BSWAP, B0
		PXOR ACC0, B0

		MOVOU T2, ACC0
		MOVOU T2, ACC1
		MOVOU (16*15)(pTbl), ACCM

		PSHUFD $78, B0, T0
		PXOR B0, T0
		PCLMULQDQ $0x00, B0, ACC0
		PCLMULQDQ $0x11, B0, ACC1
		PCLMULQDQ $0x00, T0, ACCM

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		reduceRound(ACC0)
		PXOR ACC1, ACC0

		LEAQ (16*1)(ptx), ptx
		LEAQ (16*1)(ctx), ctx

	JMP gcmAesEncSinglesLoop

gcmAesEncTail:
	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	MOVOU (8*16 + 0*16)(SP), B0
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast4
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast4
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast4:
	AESENCLAST T0, B0
	MOVOU B0, T0

	LEAQ -1(ptx)(ptxLen*1), ptx

	// T1 = andMask entry that keeps the low ptxLen bytes
	// (byte offset 16*ptxLen - 16). aluCTR is reused as a pointer here.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP

	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	// Gather the remaining plaintext bytes, last byte first.
	PXOR B0, B0
ptxLoadLoop:
		PSLLDQ $1, B0
		PINSRB $0, (ptx), B0
		LEAQ -1(ptx), ptx
		DECQ ptxLen
	JNE ptxLoadLoop

	PXOR T0, B0
	PAND T1, B0
	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

gcmAesEncDone:
	MOVOU ACC0, (tPtr)
	RET
#undef increment

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
// Decrypts src into dst in counter mode and folds the ciphertext (src)
// into the hash at T.
//
// Reuses the register aliases from gcmAesEnc: here ptx is the output (dst)
// pointer and ctx is the input (src) pointer. The ciphertext is hashed
// straight from ctx, so the 128-byte frame holds only the eight counter
// blocks at (0*16 .. 7*16)(SP).
TEXT ·gcmAesDec(SB),0,$128-96
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
#define combinedDecRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(ctx), T0;\
	PSHUFB BSWAP, T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ptx
	MOVQ src_base+32(FP), ctx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	// NR = len(ks)/4 - 1
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block XOR round key 0; saved as counter slot 0.
	PXOR B0, T0
	MOVOU T0, (0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesDecSingles

	MOVOU T0, (1*16)(SP)
	increment(1)
	MOVOU T0, (2*16)(SP)
	increment(2)
	MOVOU T0, (3*16)(SP)
	increment(3)
	MOVOU T0, (4*16)(SP)
	increment(4)
	MOVOU T0, (5*16)(SP)
	increment(5)
	MOVOU T0, (6*16)(SP)
	increment(6)
	MOVOU T0, (7*16)(SP)
	increment(7)

gcmAesDecOctetsLoop:

		CMPQ ptxLen, $128
		JB gcmAesDecEndOctets
		SUBQ $128, ptxLen

		MOVOU (0*16)(SP), B0
		MOVOU (1*16)(SP), B1
		MOVOU (2*16)(SP), B2
		MOVOU (3*16)(SP), B3
		MOVOU (4*16)(SP), B4
		MOVOU (5*16)(SP), B5
		MOVOU (6*16)(SP), B6
		MOVOU (7*16)(SP), B7

		MOVOU (16*0)(ctx), T0
		PSHUFB BSWAP, T0
		PXOR ACC0, T0
		PSHUFD $78, T0, T1
		PXOR T0, T1

		MOVOU (16*0)(pTbl), ACC0
		MOVOU (16*1)(pTbl), ACCM
		MOVOU ACC0, ACC1

		PCLMULQDQ $0x00, T1, ACCM
		PCLMULQDQ $0x00, T0, ACC0
		PCLMULQDQ $0x11, T0, ACC1

		combinedDecRound(1)
		increment(0)
		combinedDecRound(2)
		increment(1)
		combinedDecRound(3)
		increment(2)
		combinedDecRound(4)
		increment(3)
		combinedDecRound(5)
		increment(4)
		combinedDecRound(6)
		increment(5)
		combinedDecRound(7)
		increment(6)

		aesRound(8)
		increment(7)

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		aesRound(9)

		reduceRound(ACC0)
		PXOR ACC1, ACC0

		MOVOU (16*10)(ks), T0
		CMPQ NR, $12
		JB decLast1
		aesRnd(T0)
		aesRound(11)
		MOVOU (16*12)(ks), T0
		JE decLast1
		aesRnd(T0)
		aesRound(13)
		MOVOU (16*14)(ks), T0
decLast1:
		aesRndLast(T0)

		MOVOU (16*0)(ctx), T0
		PXOR T0, B0
		MOVOU (16*1)(ctx), T0
		PXOR T0, B1
		MOVOU (16*2)(ctx), T0
		PXOR T0, B2
		MOVOU (16*3)(ctx), T0
		PXOR T0, B3
		MOVOU (16*4)(ctx), T0
		PXOR T0, B4
		MOVOU (16*5)(ctx), T0
		PXOR T0, B5
		MOVOU (16*6)(ctx), T0
		PXOR T0, B6
		MOVOU (16*7)(ctx), T0
		PXOR T0, B7

		MOVOU B0, (16*0)(ptx)
		MOVOU B1, (16*1)(ptx)
		MOVOU B2, (16*2)(ptx)
		MOVOU B3, (16*3)(ptx)
		MOVOU B4, (16*4)(ptx)
		MOVOU B5, (16*5)(ptx)
		MOVOU B6, (16*6)(ptx)
		MOVOU B7, (16*7)(ptx)

		LEAQ 128(ptx), ptx
		LEAQ 128(ctx), ctx

		JMP gcmAesDecOctetsLoop

gcmAesDecEndOctets:

	// Eight counters were prepared ahead but only slot 0 is consumed
	// from here on, so step the running counter back by seven.
	SUBQ $7, aluCTR

gcmAesDecSingles:

	// Keep round keys 1..7 in B1..B7 for the one-block-at-a-time paths.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesDecSinglesLoop:

		CMPQ ptxLen, $16
		JB gcmAesDecTail
		SUBQ $16, ptxLen

		// T1 keeps the raw ciphertext block for the final XOR.
		MOVOU (ctx), B0
		MOVOU B0, T1
		PSHUFB BSWAP, B0
		PXOR ACC0, B0

		MOVOU T2, ACC0
		MOVOU T2, ACC1
		MOVOU (16*15)(pTbl), ACCM

		PCLMULQDQ $0x00, B0, ACC0
		PCLMULQDQ $0x11, B0, ACC1
		PSHUFD $78, B0, T0
		PXOR B0, T0
		PCLMULQDQ $0x00, T0, ACCM

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		reduceRound(ACC0)
		PXOR ACC1, ACC0

		MOVOU (0*16)(SP), B0
		increment(0)
		AESENC B1, B0
		AESENC B2, B0
		AESENC B3, B0
		AESENC B4, B0
		AESENC B5, B0
		AESENC B6, B0
		AESENC B7, B0
		MOVOU (16*8)(ks), T0
		AESENC T0, B0
		MOVOU (16*9)(ks), T0
		AESENC T0, B0
		MOVOU (16*10)(ks), T0
		CMPQ NR, $12
		JB decLast2
		AESENC T0, B0
		MOVOU (16*11)(ks), T0
		AESENC T0, B0
		MOVOU (16*12)(ks), T0
		JE decLast2
		AESENC T0, B0
		MOVOU (16*13)(ks), T0
		AESENC T0, B0
		MOVOU (16*14)(ks), T0
decLast2:
		AESENCLAST T0, B0

		PXOR T1, B0
		MOVOU B0, (ptx)

		LEAQ (16*1)(ptx), ptx
		LEAQ (16*1)(ctx), ctx

	JMP gcmAesDecSinglesLoop

gcmAesDecTail:

	TESTQ ptxLen, ptxLen
	JE gcmAesDecDone

	// T1 = andMask entry that keeps the low ptxLen bytes
	// (byte offset 16*ptxLen - 16). aluCTR is reused as a pointer here.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP
	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
	PAND T1, B0

	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (0*16)(SP), B0
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast3:
	AESENCLAST T0, B0
	PXOR T1, B0

	// Write exactly ptxLen plaintext bytes, one at a time.
ptxStoreLoop:
		PEXTRB $0, B0, (ptx)
		PSRLDQ $1, B0
		LEAQ 1(ptx), ptx
		DECQ ptxLen

	JNE ptxStoreLoop

gcmAesDecDone:

	MOVOU ACC0, (tPtr)
	RET