// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI
// The implementation uses some optimization as described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
//     Hardware

#include "textflag.h"

// B0-B7 hold up to eight AES blocks that are processed in parallel.
#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7

// ACC0/ACC1/ACCM accumulate the low, high and middle Karatsuba products of
// the GHASH multiplication before reduction.
#define ACC0 X8
#define ACC1 X9
#define ACCM X10

// T0-T2 are scratch registers; POLY and BSWAP cache the two constants below.
#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15

// PSHUFB control that reverses the order of the 16 bytes of a register.
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607

// Constant used by the reduction steps (reduceRound) and by the "H * 2"
// fix-up in gcmAesInit.
DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000

// andMask is a table of fifteen 16-byte masks. The entry at byte offset
// 16*(n-1) keeps the low n bytes of a register and clears the rest
// (n = 1..15). It is indexed as -16(base)(n*16) by the tail handling code.
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240

// func hasGCMAsm() bool
// returns whether every instruction set extension used by the routines in
// this file is available. Besides AES-NI and CLMUL-NI the code below also
// executes PSHUFB (SSSE3) and PINSRQ/PINSRD/PINSRB/PEXTRB (SSE4.1), so all
// four CPUID.1:ECX feature bits must be set:
//   bit  1 - PCLMULQDQ
//   bit  9 - SSSE3
//   bit 19 - SSE4.1
//   bit 25 - AES-NI
TEXT ·hasGCMAsm(SB),NOSPLIT,$0
	MOVL $1, AX
	CPUID
	ANDL $0x02080202, CX
	CMPL CX, $0x02080202
	SETEQ ret+0(FP)
	RET

// func aesEncBlock(dst, src *[16]byte, ks []uint32)
// Encrypts the single block at src with the expanded key schedule ks and
// stores the result at dst. The number of rounds is derived from the key
// schedule length as len(ks)/4 - 1.
TEXT ·aesEncBlock(SB),NOSPLIT,$0
	MOVQ dst+0(FP), DI
	MOVQ src+8(FP), SI
	MOVQ ks_base+16(FP), DX
	MOVQ ks_len+24(FP), CX

	SHRQ $2, CX
	DECQ CX

	MOVOU (SI), X0
	MOVOU (16*0)(DX), X1
	PXOR X1, X0
	MOVOU (16*1)(DX), X1
	AESENC X1, X0
	MOVOU (16*2)(DX), X1
	AESENC X1, X0
	MOVOU (16*3)(DX), X1
	AESENC X1, X0
	MOVOU (16*4)(DX), X1
	AESENC X1, X0
	MOVOU (16*5)(DX), X1
	AESENC X1, X0
	MOVOU (16*6)(DX), X1
	AESENC X1, X0
	MOVOU (16*7)(DX), X1
	AESENC X1, X0
	MOVOU (16*8)(DX), X1
	AESENC X1, X0
	MOVOU (16*9)(DX), X1
	AESENC X1, X0
	MOVOU (16*10)(DX), X1
	// The flags set by this CMPQ are consumed by both JB and the later JE;
	// none of the instructions in between modify them.
	CMPQ CX, $12
	JB encLast
	AESENC X1, X0
	MOVOU (16*11)(DX), X1
	AESENC X1, X0
	MOVOU (16*12)(DX), X1
	JE encLast
	AESENC X1, X0
	MOVOU (16*13)(DX), X1
	AESENC X1, X0
	MOVOU (16*14)(DX), X1

encLast:
	AESENCLAST X1, X0
	MOVOU X0, (DI)

	RET

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
// Folds the length block (pLen*8 in the low quadword, dLen*8 in the high
// quadword) into the running hash at T, multiplies by the table entry at
// offsets 16*14/16*15, byte-swaps the result, XORs it with tagMask and
// writes the final value back to T.
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX

	MOVQ productTable+0(FP), pTbl
	MOVQ tagMask+8(FP), tMsk
	MOVQ T+16(FP), tPtr
	MOVQ pLen+24(FP), plen
	MOVQ dLen+32(FP), dlen

	MOVOU (tPtr), ACC0
	MOVOU (tMsk), T2

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Convert byte lengths to bit lengths.
	SHLQ $3, plen
	SHLQ $3, dlen

	MOVQ plen, B0
	PINSRQ $1, dlen, B0

	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	PXOR ACC1, ACC0

	PSHUFB BSWAP, ACC0
	PXOR T2, ACC0
	MOVOU ACC0, (tPtr)

	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
// Derives the hash key by running the key schedule over an all-zero block
// (round key 0 is used directly as the initial state) and fills
// productTable with eight pairs of 16-byte entries. Each pair is a power of
// the hash key followed by the XOR of its two halves (the Karatsuba
// pre-computation). The first power is stored at offsets 16*14/16*15 and
// each following power 32 bytes lower, so the eighth power ends up at
// offsets 16*0/16*1.
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define dst DI
#define KS SI
#define NR DX

	MOVQ productTable+0(FP), dst
	MOVQ ks_base+8(FP), KS
	MOVQ ks_len+16(FP), NR

	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Encrypt block 0, with the AES key to generate the hash key H
	MOVOU (16*0)(KS), B0
	MOVOU (16*1)(KS), T0
	AESENC T0, B0
	MOVOU (16*2)(KS), T0
	AESENC T0, B0
	MOVOU (16*3)(KS), T0
	AESENC T0, B0
	MOVOU (16*4)(KS), T0
	AESENC T0, B0
	MOVOU (16*5)(KS), T0
	AESENC T0, B0
	MOVOU (16*6)(KS), T0
	AESENC T0, B0
	MOVOU (16*7)(KS), T0
	AESENC T0, B0
	MOVOU (16*8)(KS), T0
	AESENC T0, B0
	MOVOU (16*9)(KS), T0
	AESENC T0, B0
	MOVOU (16*10)(KS), T0
	CMPQ NR, $12
	JB initEncLast
	AESENC T0, B0
	MOVOU (16*11)(KS), T0
	AESENC T0, B0
	MOVOU (16*12)(KS), T0
	JE initEncLast
	AESENC T0, B0
	MOVOU (16*13)(KS), T0
	AESENC T0, B0
	MOVOU (16*14)(KS), T0
initEncLast:
	AESENCLAST T0, B0

	PSHUFB BSWAP, B0
	// H * 2
	PSHUFD $0xff, B0, T0
	MOVOU B0, T1
	PSRAL $31, T0
	PAND POLY, T0
	PSRLL $31, T1
	PSLLDQ $4, T1
	PSLLL $1, B0
	PXOR T0, B0
	PXOR T1, B0
	// Karatsuba pre-computations
	MOVOU B0, (16*14)(dst)
	PSHUFD $78, B0, B1
	PXOR B0, B1
	MOVOU B1, (16*15)(dst)

	MOVOU B0, B2
	MOVOU B1, B3
	// Now prepare powers of H and pre-computations for them
	MOVQ $7, AX

initLoop:
	MOVOU B2, T0
	MOVOU B2, T1
	MOVOU B3, T2
	PCLMULQDQ $0x00, B0, T0
	PCLMULQDQ $0x11, B0, T1
	PCLMULQDQ $0x00, B1, T2

	PXOR T0, T2
	PXOR T1, T2
	MOVOU T2, B4
	PSLLDQ $8, B4
	PSRLDQ $8, T2
	PXOR B4, T0
	PXOR T2, T1

	MOVOU POLY, B2
	PCLMULQDQ $0x01, T0, B2
	PSHUFD $78, T0, T0
	PXOR B2, T0
	MOVOU POLY, B2
	PCLMULQDQ $0x01, T0, B2
	PSHUFD $78, T0, T0
	PXOR T0, B2
	PXOR T1, B2

	MOVOU B2, (16*12)(dst)
	PSHUFD $78, B2, B3
	PXOR B2, B3
	MOVOU B3, (16*13)(dst)

	// LEAQ leaves the flags untouched, so JNE still tests the DECQ result.
	DECQ AX
	LEAQ (-16*2)(dst), dst
	JNE initLoop

	RET
#undef NR
#undef KS
#undef dst

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
// Hashes data starting from an all-zero accumulator and stores the result at
// T (the previous contents of T are not read). Input is consumed in 128-byte
// groups, then 16-byte blocks, then a final partial block that is loaded
// byte by byte and implicitly zero padded. A 13-byte input takes a dedicated
// path that loads exactly 13 bytes.
//
// reduceRound and mulRoundAAD are defined here; reduceRound is also used by
// the functions further down in this file.
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl DI
#define aut SI
#define tPtr CX
#define autLen DX

#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
#define mulRoundAAD(X ,i) \
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, X, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, X, T1;\
	PXOR T1, X;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ data_base+8(FP), aut
	MOVQ data_len+16(FP), autLen
	MOVQ T+32(FP), tPtr

	PXOR ACC0, ACC0
	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	TESTQ autLen, autLen
	JEQ dataBail

	CMPQ autLen, $13	// optimize the TLS case
	JE dataTLS
	CMPQ autLen, $128
	JB startSinglesLoop
	JMP dataOctaLoop

dataTLS:
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2
	PXOR B0, B0
	// 8 + 4 + 1 bytes: never reads past the 13th byte of data.
	MOVQ (aut), B0
	PINSRD $2, 8(aut), B0
	PINSRB $12, 12(aut), B0
	XORQ autLen, autLen
	JMP dataMul

dataOctaLoop:
	CMPQ autLen, $128
	JB startSinglesLoop
	SUBQ $128, autLen

	MOVOU (16*0)(aut), X0
	MOVOU (16*1)(aut), X1
	MOVOU (16*2)(aut), X2
	MOVOU (16*3)(aut), X3
	MOVOU (16*4)(aut), X4
	MOVOU (16*5)(aut), X5
	MOVOU (16*6)(aut), X6
	MOVOU (16*7)(aut), X7
	LEAQ (16*8)(aut), aut
	PSHUFB BSWAP, X0
	PSHUFB BSWAP, X1
	PSHUFB BSWAP, X2
	PSHUFB BSWAP, X3
	PSHUFB BSWAP, X4
	PSHUFB BSWAP, X5
	PSHUFB BSWAP, X6
	PSHUFB BSWAP, X7
	PXOR ACC0, X0

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, X0, T1
	PXOR X0, T1
	PCLMULQDQ $0x00, X0, ACC0
	PCLMULQDQ $0x11, X0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRoundAAD(X1, 1)
	mulRoundAAD(X2, 2)
	mulRoundAAD(X3, 3)
	mulRoundAAD(X4, 4)
	mulRoundAAD(X5, 5)
	mulRoundAAD(X6, 6)
	mulRoundAAD(X7, 7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0
	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0
	JMP dataOctaLoop

startSinglesLoop:
	// mulRoundAAD clobbers T1/T2, so the single-block multiplier is
	// (re)loaded here.
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2

dataSinglesLoop:

	CMPQ autLen, $16
	JB dataEnd
	SUBQ $16, autLen

	MOVOU (aut), B0
dataMul:
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T1, ACC0
	MOVOU T2, ACCM
	MOVOU T1, ACC1

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0
	PXOR ACC1, ACC0

	LEAQ 16(aut), aut

	JMP dataSinglesLoop

dataEnd:

	TESTQ autLen, autLen
	JEQ dataBail

	// Load the remaining 1..15 bytes one at a time, last byte first, so
	// that nothing beyond the end of data is read.
	PXOR B0, B0
	LEAQ -1(aut)(autLen*1), aut

dataLoadLoop:

	PSLLDQ $1, B0
	PINSRB $0, (aut), B0

	LEAQ -1(aut), aut
	DECQ autLen
	JNE dataLoadLoop

	JMP dataMul

dataBail:
	MOVOU ACC0, (tPtr)
	RET
#undef pTbl
#undef aut
#undef tPtr
#undef autLen

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
// Encrypts len(src) bytes from src into dst in counter mode and folds the
// produced ciphertext into the running hash at T. The counter block at ctr
// is not written back; the first keystream block is generated from the
// counter value incremented by one.
//
// Stack frame (256 bytes):
//   (16*i)(SP),        i = 0..7: byte-swapped ciphertext blocks of the
//                                previous group, waiting to be hashed
//   (8*16 + 16*i)(SP), i = 0..7: upcoming counter blocks, already XORed
//                                with round key 0
//
// increment(i) bumps the 32-bit counter kept in aluCTR and patches only the
// last 32-bit word of counter slot i. It uses ADDL and therefore changes the
// flags: it must not be placed between a CMPQ NR, $12 and the JB/JE that
// depend on it.
//
// aesRnd, aesRound and aesRndLast are defined here and are also used by
// gcmAesDec.
TEXT ·gcmAesEnc(SB),0,$256-96
#define pTbl DI
#define ctx DX
#define ctrPtr CX
#define ptx SI
#define ks AX
#define tPtr R8
#define ptxLen R9
#define aluCTR R10
#define aluTMP R11
#define aluK R12
#define NR R13

#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
#define combinedRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(SP), T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM
#define mulRound(i) \
	MOVOU (16*i)(SP), T0;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, T0, T1;\
	PXOR T1, T0;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ctx
	MOVQ src_base+32(FP), ptx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block XOR round key 0; only its last word changes
	// between blocks and is patched by increment().
	PXOR B0, T0
	MOVOU T0, (8*16 + 0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesEncSingles
	SUBQ $128, ptxLen

	// We have at least 8 blocks to encrypt, prepare the rest of the counters
	MOVOU T0, (8*16 + 1*16)(SP)
	increment(1)
	MOVOU T0, (8*16 + 2*16)(SP)
	increment(2)
	MOVOU T0, (8*16 + 3*16)(SP)
	increment(3)
	MOVOU T0, (8*16 + 4*16)(SP)
	increment(4)
	MOVOU T0, (8*16 + 5*16)(SP)
	increment(5)
	MOVOU T0, (8*16 + 6*16)(SP)
	increment(6)
	MOVOU T0, (8*16 + 7*16)(SP)
	increment(7)

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	// Encrypt the first group; the interleaved increments prepare the
	// counter slots for the next group.
	aesRound(1)
	increment(0)
	aesRound(2)
	increment(1)
	aesRound(3)
	increment(2)
	aesRound(4)
	increment(3)
	aesRound(5)
	increment(4)
	aesRound(6)
	increment(5)
	aesRound(7)
	increment(6)
	aesRound(8)
	increment(7)
	aesRound(9)
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	// Park the byte-swapped ciphertext; it is hashed while the next group
	// is being encrypted (or in gcmAesEncOctetsEnd).
	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

gcmAesEncOctetsLoop:

	CMPQ ptxLen, $128
	JB gcmAesEncOctetsEnd
	SUBQ $128, ptxLen

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	MOVOU (16*0)(SP), T0
	PSHUFD $78, T0, T1
	PXOR T0, T1

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, T1, ACCM
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1

	// Each combinedRound does one AES round on the current group and one
	// multiplication for the previous group's ciphertext.
	combinedRound(1)
	increment(0)
	combinedRound(2)
	increment(1)
	combinedRound(3)
	increment(2)
	combinedRound(4)
	increment(3)
	combinedRound(5)
	increment(4)
	combinedRound(6)
	increment(5)
	combinedRound(7)
	increment(6)

	aesRound(8)
	increment(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	aesRound(9)

	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast2
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast2
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast2:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

	JMP gcmAesEncOctetsLoop

gcmAesEncOctetsEnd:

	// Hash the last group of eight ciphertext blocks left on the stack.
	MOVOU (16*0)(SP), T0
	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, T0, T1
	PXOR T0, T1
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRound(1)
	mulRound(2)
	mulRound(3)
	mulRound(4)
	mulRound(5)
	mulRound(6)
	mulRound(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// Eight counter slots were prepared ahead; only slot 0 is used from
	// here on, so rewind aluCTR to the value held in slot 0.
	SUBQ $7, aluCTR

gcmAesEncSingles:

	// Keep round keys 1..7 in B1..B7 for the one-block-at-a-time paths.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesEncSinglesLoop:

	CMPQ ptxLen, $16
	JB gcmAesEncTail
	SUBQ $16, ptxLen

	MOVOU (8*16 + 0*16)(SP), B0
	increment(0)

	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast3:
	AESENCLAST T0, B0

	MOVOU (ptx), T0
	PXOR T0, B0
	MOVOU B0, (ctx)

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	LEAQ (16*1)(ptx), ptx
	LEAQ (16*1)(ctx), ctx

	JMP gcmAesEncSinglesLoop

gcmAesEncTail:
	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	MOVOU (8*16 + 0*16)(SP), B0
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast4
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast4
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast4:
	AESENCLAST T0, B0
	MOVOU B0, T0

	LEAQ -1(ptx)(ptxLen*1), ptx

	// T1 = mask keeping the low ptxLen bytes. aluCTR is reused as the
	// table base; the counter is no longer needed at this point.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP

	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	// Load the remaining plaintext byte by byte, last byte first, so that
	// nothing beyond the end of src is read.
	PXOR B0, B0
ptxLoadLoop:
	PSLLDQ $1, B0
	PINSRB $0, (ptx), B0
	LEAQ -1(ptx), ptx
	DECQ ptxLen
	JNE ptxLoadLoop

	PXOR T0, B0
	PAND T1, B0
	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

gcmAesEncDone:
	MOVOU ACC0, (tPtr)
	RET
#undef increment

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
// Folds len(src) bytes of ciphertext from src into the running hash at T and
// writes the decrypted bytes to dst. Unlike gcmAesEnc the ciphertext is
// available up front, so it is hashed directly from src and the 128-byte
// frame only holds the eight counter slots at (16*i)(SP). The counter block
// at ctr is not written back; the first keystream block is generated from
// the counter value incremented by one.
//
// Register names (pTbl, ctx, ptx, ...) and the aesRnd/aesRound/aesRndLast/
// reduceRound macros are inherited from the definitions earlier in the file.
// increment(i) is redefined for the smaller frame; like the gcmAesEnc
// version it modifies the flags.
TEXT ·gcmAesDec(SB),0,$128-96
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
#define combinedDecRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(ctx), T0;\
	PSHUFB BSWAP, T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ptx
	MOVQ src_base+32(FP), ctx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	PXOR B0, T0
	MOVOU T0, (0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesDecSingles

	MOVOU T0, (1*16)(SP)
	increment(1)
	MOVOU T0, (2*16)(SP)
	increment(2)
	MOVOU T0, (3*16)(SP)
	increment(3)
	MOVOU T0, (4*16)(SP)
	increment(4)
	MOVOU T0, (5*16)(SP)
	increment(5)
	MOVOU T0, (6*16)(SP)
	increment(6)
	MOVOU T0, (7*16)(SP)
	increment(7)

gcmAesDecOctetsLoop:

	CMPQ ptxLen, $128
	JB gcmAesDecEndOctets
	SUBQ $128, ptxLen

	MOVOU (0*16)(SP), B0
	MOVOU (1*16)(SP), B1
	MOVOU (2*16)(SP), B2
	MOVOU (3*16)(SP), B3
	MOVOU (4*16)(SP), B4
	MOVOU (5*16)(SP), B5
	MOVOU (6*16)(SP), B6
	MOVOU (7*16)(SP), B7

	MOVOU (16*0)(ctx), T0
	PSHUFB BSWAP, T0
	PXOR ACC0, T0
	PSHUFD $78, T0, T1
	PXOR T0, T1

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, T1, ACCM
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1

	combinedDecRound(1)
	increment(0)
	combinedDecRound(2)
	increment(1)
	combinedDecRound(3)
	increment(2)
	combinedDecRound(4)
	increment(3)
	combinedDecRound(5)
	increment(4)
	combinedDecRound(6)
	increment(5)
	combinedDecRound(7)
	increment(6)

	aesRound(8)
	increment(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	aesRound(9)

	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE decLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
decLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ctx), T0
	PXOR T0, B0
	MOVOU (16*1)(ctx), T0
	PXOR T0, B1
	MOVOU (16*2)(ctx), T0
	PXOR T0, B2
	MOVOU (16*3)(ctx), T0
	PXOR T0, B3
	MOVOU (16*4)(ctx), T0
	PXOR T0, B4
	MOVOU (16*5)(ctx), T0
	PXOR T0, B5
	MOVOU (16*6)(ctx), T0
	PXOR T0, B6
	MOVOU (16*7)(ctx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ptx)
	MOVOU B1, (16*1)(ptx)
	MOVOU B2, (16*2)(ptx)
	MOVOU B3, (16*3)(ptx)
	MOVOU B4, (16*4)(ptx)
	MOVOU B5, (16*5)(ptx)
	MOVOU B6, (16*6)(ptx)
	MOVOU B7, (16*7)(ptx)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

	JMP gcmAesDecOctetsLoop

gcmAesDecEndOctets:

	// Rewind aluCTR to the counter value held in slot 0 (see gcmAesEnc).
	SUBQ $7, aluCTR

gcmAesDecSingles:

	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesDecSinglesLoop:

	CMPQ ptxLen, $16
	JB gcmAesDecTail
	SUBQ $16, ptxLen

	// T1 keeps the raw ciphertext block for the final XOR; B0 is hashed.
	MOVOU (ctx), B0
	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (0*16)(SP), B0
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast2
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast2
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast2:
	AESENCLAST T0, B0

	PXOR T1, B0
	MOVOU B0, (ptx)

	LEAQ (16*1)(ptx), ptx
	LEAQ (16*1)(ctx), ctx

	JMP gcmAesDecSinglesLoop

gcmAesDecTail:

	TESTQ ptxLen, ptxLen
	JE gcmAesDecDone

	// T1 = mask keeping the low ptxLen bytes. aluCTR is reused as the
	// table base from here on.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP
	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
	PAND T1, B0

	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (0*16)(SP), B0
	// NOTE(review): aluCTR holds the andMask address here, so this
	// increment only rewrites counter slot 0 after it has been loaded
	// into B0; nothing in this function reads the slot afterwards.
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast3:
	AESENCLAST T0, B0
	PXOR T1, B0

	// Store only ptxLen bytes so that dst is never written past its end.
ptxStoreLoop:
	PEXTRB $0, B0, (ptx)
	PSRLDQ $1, B0
	LEAQ 1(ptx), ptx
	DECQ ptxLen

	JNE ptxStoreLoop

gcmAesDecDone:

	MOVOU ACC0, (tPtr)
	RET