// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
// The implementation uses some optimizations as described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
//     Hardware

#include "textflag.h"

// B0-B7 hold up to eight AES blocks that are processed in parallel.
#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7

// ACC0/ACC1/ACCM accumulate the low, high and middle partial products of
// the Karatsuba carry-less multiplications.
#define ACC0 X8
#define ACC1 X9
#define ACCM X10

// T0-T2 are scratch registers; POLY and BSWAP cache the constants below.
#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15

// PSHUFB control mask that reverses the order of the 16 bytes of a register.
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607

// Constant used by the two-step reduction after each multiplication and by
// the "H * 2" step in gcmAesInit.
DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000

// andMask holds 15 entries of 16 bytes each. Entry n-1 (at byte offset
// 16*(n-1)) has its n lowest bytes set to 0xff and the rest cleared; it is
// used to keep only the n valid bytes of a partial final block.
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240

// func hasGCMAsm() bool
// returns whether every instruction-set extension used by the routines in
// this file is supported: AES-NI and CLMUL-NI, plus SSSE3 (PSHUFB) and
// SSE4.1 (PINSRB/PINSRD/PINSRQ/PEXTRB).
TEXT ·hasGCMAsm(SB),NOSPLIT,$0
	MOVL $1, AX
	CPUID
	// CPUID.1:ECX feature bits: PCLMULQDQ (1), SSSE3 (9), SSE4.1 (19), AES (25).
	ANDL $0x02080202, CX
	CMPL CX, $0x02080202
	SETEQ ret+0(FP)
	RET

// func aesEncBlock(dst, src *[16]byte, ks []uint32)
// Encrypts the 16-byte block at src with the expanded key schedule ks and
// stores the result at dst. The number of rounds is derived from len(ks)
// as len(ks)/4 - 1.
TEXT ·aesEncBlock(SB),NOSPLIT,$0
	MOVQ dst+0(FP), DI
	MOVQ src+8(FP), SI
	MOVQ ks_base+16(FP), DX
	MOVQ ks_len+24(FP), CX

	SHRQ $2, CX
	DECQ CX

	MOVOU (SI), X0
	MOVOU (16*0)(DX), X1
	PXOR X1, X0
	MOVOU (16*1)(DX), X1
	AESENC X1, X0
	MOVOU (16*2)(DX), X1
	AESENC X1, X0
	MOVOU (16*3)(DX), X1
	AESENC X1, X0
	MOVOU (16*4)(DX), X1
	AESENC X1, X0
	MOVOU (16*5)(DX), X1
	AESENC X1, X0
	MOVOU (16*6)(DX), X1
	AESENC X1, X0
	MOVOU (16*7)(DX), X1
	AESENC X1, X0
	MOVOU (16*8)(DX), X1
	AESENC X1, X0
	MOVOU (16*9)(DX), X1
	AESENC X1, X0
	MOVOU (16*10)(DX), X1
	// The flags set here are reused by the JE below; none of the
	// intervening instructions modify them.
	CMPQ CX, $12
	JB encLast
	AESENC X1, X0
	MOVOU (16*11)(DX), X1
	AESENC X1, X0
	MOVOU (16*12)(DX), X1
	JE encLast
	AESENC X1, X0
	MOVOU (16*13)(DX), X1
	AESENC X1, X0
	MOVOU (16*14)(DX), X1

encLast:
	AESENCLAST X1, X0
	MOVOU X0, (DI)

	RET

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
// Folds the length block (pLen and dLen converted from bytes to bits) into
// the accumulator stored at T, multiplies by the table entry at offset
// 16*14, byte-reverses the result, XORs it with tagMask and writes it back
// to T.
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX

	MOVQ productTable+0(FP), pTbl
	MOVQ tagMask+8(FP), tMsk
	MOVQ T+16(FP), tPtr
	MOVQ pLen+24(FP), plen
	MOVQ dLen+32(FP), dlen

	MOVOU (tPtr), ACC0
	MOVOU (tMsk), T2

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Byte lengths -> bit lengths.
	SHLQ $3, plen
	SHLQ $3, dlen

	MOVQ plen, B0
	PINSRQ $1, dlen, B0

	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	// Karatsuba multiplication: low, high and middle products.
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	// Two-step reduction.
	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	PXOR ACC1, ACC0

	PSHUFB BSWAP, ACC0
	PXOR T2, ACC0
	MOVOU ACC0, (tPtr)

	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
// Fills productTable from the key schedule ks. The hash key H is obtained
// by running the AES rounds on round key 0 (i.e. encrypting the all-zero
// block). The table consists of eight 32-byte entries: entry k (offsets
// 16*2k and 16*2k+1) holds the (8-k)-th power of the doubled H followed by
// the XOR of its two 64-bit halves (the Karatsuba pre-computation).
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define dst DI
#define KS SI
#define NR DX

	MOVQ productTable+0(FP), dst
	MOVQ ks_base+8(FP), KS
	MOVQ ks_len+16(FP), NR

	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Encrypt block 0, with the AES key to generate the hash key H
	MOVOU (16*0)(KS), B0
	MOVOU (16*1)(KS), T0
	AESENC T0, B0
	MOVOU (16*2)(KS), T0
	AESENC T0, B0
	MOVOU (16*3)(KS), T0
	AESENC T0, B0
	MOVOU (16*4)(KS), T0
	AESENC T0, B0
	MOVOU (16*5)(KS), T0
	AESENC T0, B0
	MOVOU (16*6)(KS), T0
	AESENC T0, B0
	MOVOU (16*7)(KS), T0
	AESENC T0, B0
	MOVOU (16*8)(KS), T0
	AESENC T0, B0
	MOVOU (16*9)(KS), T0
	AESENC T0, B0
	MOVOU (16*10)(KS), T0
	CMPQ NR, $12
	JB initEncLast
	AESENC T0, B0
	MOVOU (16*11)(KS), T0
	AESENC T0, B0
	MOVOU (16*12)(KS), T0
	JE initEncLast
	AESENC T0, B0
	MOVOU (16*13)(KS), T0
	AESENC T0, B0
	MOVOU (16*14)(KS), T0
initEncLast:
	AESENCLAST T0, B0

	PSHUFB BSWAP, B0
	// H * 2
	PSHUFD $0xff, B0, T0
	MOVOU B0, T1
	PSRAL $31, T0
	PAND POLY, T0
	PSRLL $31, T1
	PSLLDQ $4, T1
	PSLLL $1, B0
	PXOR T0, B0
	PXOR T1, B0
	// Karatsuba pre-computations
	MOVOU B0, (16*14)(dst)
	PSHUFD $78, B0, B1
	PXOR B0, B1
	MOVOU B1, (16*15)(dst)

	MOVOU B0, B2
	MOVOU B1, B3
	// Now prepare powers of H and pre-computations for them
	MOVQ $7, AX

initLoop:
	MOVOU B2, T0
	MOVOU B2, T1
	MOVOU B3, T2
	PCLMULQDQ $0x00, B0, T0
	PCLMULQDQ $0x11, B0, T1
	PCLMULQDQ $0x00, B1, T2

	PXOR T0, T2
	PXOR T1, T2
	MOVOU T2, B4
	PSLLDQ $8, B4
	PSRLDQ $8, T2
	PXOR B4, T0
	PXOR T2, T1

	MOVOU POLY, B2
	PCLMULQDQ $0x01, T0, B2
	PSHUFD $78, T0, T0
	PXOR B2, T0
	MOVOU POLY, B2
	PCLMULQDQ $0x01, T0, B2
	PSHUFD $78, T0, T0
	PXOR T0, B2
	PXOR T1, B2

	MOVOU B2, (16*12)(dst)
	PSHUFD $78, B2, B3
	PXOR B2, B3
	MOVOU B3, (16*13)(dst)

	// LEAQ does not modify the flags, so JNE tests the result of DECQ.
	DECQ AX
	LEAQ (-16*2)(dst), dst
	JNE initLoop

	RET
#undef NR
#undef KS
#undef dst

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
// Hashes data, 16 bytes at a time, starting from a zero accumulator and
// stores the resulting accumulator at T. A trailing partial block is
// zero-padded. A 13-byte input takes a dedicated load path.
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl DI
#define aut SI
#define tPtr CX
#define autLen DX

	MOVQ productTable+0(FP), pTbl
	MOVQ data_base+8(FP), aut
	MOVQ data_len+16(FP), autLen
	MOVQ T+32(FP), tPtr

	PXOR ACC0, ACC0
	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2

	TESTQ autLen, autLen
	JEQ dataBail

	CMPQ autLen, $13	// optimize the TLS case
	JNE dataSinglesLoop

	// Load exactly 13 bytes: 8 + 4 + 1.
	PXOR B0, B0
	MOVQ (aut), B0
	PINSRD $2, 8(aut), B0
	BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x20; BYTE $0x46; BYTE $0x0c; BYTE $0x0c  //PINSRB $12, 12(aut), B0
	XORQ autLen, autLen
	JMP dataMul

dataSinglesLoop:

	CMPQ autLen, $16
	JB dataEnd
	SUBQ $16, autLen

	MOVOU (aut), B0
dataMul:
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T1, ACC0
	MOVOU T2, ACCM
	MOVOU T1, ACC1

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0
	PXOR ACC1, ACC0

	LEAQ 16(aut), aut

	JMP dataSinglesLoop

dataEnd:

	TESTQ autLen, autLen
	JEQ dataBail

	// Gather the remaining 1..15 bytes one at a time, last byte first, so
	// that no byte beyond the end of the input is read.
	PXOR B0, B0
	LEAQ -1(aut)(autLen*1), aut

dataLoadLoop:

	PSLLDQ $1, B0
	BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x20; BYTE $0x06; BYTE $0x00  //PINSRB $0, (aut), B0

	LEAQ -1(aut), aut
	DECQ autLen
	JNE dataLoadLoop

	JMP dataMul

dataBail:
	MOVOU ACC0, (tPtr)
	RET
#undef pTbl
#undef aut
#undef tPtr
#undef autLen

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
// Encrypts src into dst in counter mode, starting from the counter block at
// ctr, and folds the produced ciphertext into the hash accumulator that is
// loaded from and stored back to T. The counter block at ctr is read but
// not written back by this routine.
//
// Stack frame layout (256 bytes): bytes 0..127 hold the eight most recent
// byte-reversed ciphertext blocks awaiting hashing, bytes 128..255 hold
// eight pre-whitened counter blocks (counter XOR round key 0).
//
// The argument size is 96 bytes: 8 (productTable) + 24 (dst) + 24 (src) +
// 8 (ctr) + 8 (T) + 24 (ks).
TEXT ·gcmAesEnc(SB),0,$256-96
#define pTbl DI
#define ctx DX
#define ctrPtr CX
#define ptx SI
#define ks AX
#define tPtr R8
#define ptxLen R9
#define aluCTR R10
#define aluTMP R11
#define aluK R12
#define NR R13

// increment(i) advances the 32-bit counter and patches the last word of
// counter slot i on the stack. aluK is the byte-swapped last word of round
// key 0, so the stored word is the big-endian counter already XORed with
// that key word.
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
// aesRnd(k) applies one AES round with key register k to B0-B7.
#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
// aesRound(i) loads round key i into T0 and applies it to B0-B7.
#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
// aesRndLast(k) applies the final AES round with key register k to B0-B7.
#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
// reduceRound(a) performs one of the two reduction steps on a.
#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
// combinedRound(i) interleaves AES round i on B0-B7 with the multiplication
// of saved block i (on the stack) by table entry i.
#define combinedRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(SP), T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM
// mulRound(i) multiplies saved block i (on the stack) by table entry i and
// accumulates the partial products.
#define mulRound(i) \
	MOVOU (16*i)(SP), T0;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, T0, T1;\
	PXOR T1, T0;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst_base+8(FP), ctx
	MOVQ src_base+32(FP), ptx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block XOR round key 0; slot 0 holds the first counter.
	PXOR B0, T0
	MOVOU T0, (8*16 + 0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesEncSingles
	SUBQ $128, ptxLen

	// We have at least 8 blocks to encrypt, prepare the rest of the counters
	MOVOU T0, (8*16 + 1*16)(SP)
	increment(1)
	MOVOU T0, (8*16 + 2*16)(SP)
	increment(2)
	MOVOU T0, (8*16 + 3*16)(SP)
	increment(3)
	MOVOU T0, (8*16 + 4*16)(SP)
	increment(4)
	MOVOU T0, (8*16 + 5*16)(SP)
	increment(5)
	MOVOU T0, (8*16 + 6*16)(SP)
	increment(6)
	MOVOU T0, (8*16 + 7*16)(SP)
	increment(7)

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	// First eight blocks: there is no ciphertext to hash yet, so the AES
	// rounds are only interleaved with the counter updates.
	aesRound(1)
	increment(0)
	aesRound(2)
	increment(1)
	aesRound(3)
	increment(2)
	aesRound(4)
	increment(3)
	aesRound(5)
	increment(4)
	aesRound(6)
	increment(5)
	aesRound(7)
	increment(6)
	aesRound(8)
	increment(7)
	aesRound(9)
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	// Store the ciphertext, then keep a byte-reversed copy on the stack
	// for hashing; the running accumulator is folded into block 0.
	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

gcmAesEncOctetsLoop:

	CMPQ ptxLen, $128
	JB gcmAesEncOctetsEnd
	SUBQ $128, ptxLen

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	MOVOU (16*0)(SP), T0
	PSHUFD $78, T0, T1
	PXOR T0, T1

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, T1, ACCM
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1

	// Encrypt the next eight counters while hashing the previous eight
	// ciphertext blocks.
	combinedRound(1)
	increment(0)
	combinedRound(2)
	increment(1)
	combinedRound(3)
	increment(2)
	combinedRound(4)
	increment(3)
	combinedRound(5)
	increment(4)
	combinedRound(6)
	increment(5)
	combinedRound(7)
	increment(6)

	aesRound(8)
	increment(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	aesRound(9)

	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast2
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast2
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast2:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

	JMP gcmAesEncOctetsLoop

gcmAesEncOctetsEnd:

	// Hash the last eight ciphertext blocks still pending on the stack.
	MOVOU (16*0)(SP), T0
	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, T0, T1
	PXOR T0, T1
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRound(1)
	mulRound(2)
	mulRound(3)
	mulRound(4)
	mulRound(5)
	mulRound(6)
	mulRound(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// Eight counters were prepared ahead; only slot 0 is used from here
	// on, so rewind the scalar counter by the seven unused ones.
	SUBQ $7, aluCTR

gcmAesEncSingles:

	// Cache round keys 1..7 in B1-B7 for the one-block-at-a-time path.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesEncSinglesLoop:

	CMPQ ptxLen, $16
	JB gcmAesEncTail
	SUBQ $16, ptxLen

	MOVOU (8*16 + 0*16)(SP), B0
	increment(0)

	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast3:
	AESENCLAST T0, B0

	MOVOU (ptx), T0
	PXOR T0, B0
	MOVOU B0, (ctx)

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	LEAQ (16*1)(ptx), ptx
	LEAQ (16*1)(ctx), ctx

	JMP gcmAesEncSinglesLoop

gcmAesEncTail:
	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	MOVOU (8*16 + 0*16)(SP), B0
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast4
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast4
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast4:
	AESENCLAST T0, B0
	MOVOU B0, T0

	LEAQ -1(ptx)(ptxLen*1), ptx

	// T1 = andMask entry for the number of remaining bytes. aluCTR is
	// reused as a plain pointer here; the counter is no longer needed.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP

	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	// Gather the remaining plaintext bytes one at a time, last byte
	// first, so that no byte beyond the end of src is read.
	PXOR B0, B0
ptxLoadLoop:
	PSLLDQ $1, B0
	BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x20; BYTE $0x06; BYTE $0x00  //PINSRB $0, (ptx), B0
	LEAQ -1(ptx), ptx
	DECQ ptxLen
	JNE ptxLoadLoop

	PXOR T0, B0
	PAND T1, B0
	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

gcmAesEncDone:
	MOVOU ACC0, (tPtr)
	RET
#undef increment

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
// Decrypts src into dst in counter mode, starting from the counter block at
// ctr, and folds the consumed ciphertext into the hash accumulator that is
// loaded from and stored back to T. The counter block at ctr is read but
// not written back by this routine.
//
// Register roles follow the defines made in gcmAesEnc: ctx points at the
// ciphertext (src) and ptx at the plaintext (dst).
//
// Stack frame layout (128 bytes): eight pre-whitened counter blocks
// (counter XOR round key 0). The ciphertext is hashed straight from src,
// so no ciphertext copy is kept on the stack.
//
// The argument size is 96 bytes: 8 (productTable) + 24 (dst) + 24 (src) +
// 8 (ctr) + 8 (T) + 24 (ks).
TEXT ·gcmAesDec(SB),0,$128-96
// increment(i) advances the 32-bit counter and patches the last word of
// counter slot i on the stack (slots start at offset 0 in this frame).
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
// combinedDecRound(i) interleaves AES round i on B0-B7 with the
// multiplication of ciphertext block i (read from ctx and byte-reversed)
// by table entry i.
#define combinedDecRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(ctx), T0;\
	PSHUFB BSWAP, T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst_base+8(FP), ptx
	MOVQ src_base+32(FP), ctx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block XOR round key 0; slot 0 holds the first counter.
	PXOR B0, T0
	MOVOU T0, (0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesDecSingles

	MOVOU T0, (1*16)(SP)
	increment(1)
	MOVOU T0, (2*16)(SP)
	increment(2)
	MOVOU T0, (3*16)(SP)
	increment(3)
	MOVOU T0, (4*16)(SP)
	increment(4)
	MOVOU T0, (5*16)(SP)
	increment(5)
	MOVOU T0, (6*16)(SP)
	increment(6)
	MOVOU T0, (7*16)(SP)
	increment(7)

gcmAesDecOctetsLoop:

	CMPQ ptxLen, $128
	JB gcmAesDecEndOctets
	SUBQ $128, ptxLen

	MOVOU (0*16)(SP), B0
	MOVOU (1*16)(SP), B1
	MOVOU (2*16)(SP), B2
	MOVOU (3*16)(SP), B3
	MOVOU (4*16)(SP), B4
	MOVOU (5*16)(SP), B5
	MOVOU (6*16)(SP), B6
	MOVOU (7*16)(SP), B7

	// Ciphertext block 0 absorbs the running accumulator.
	MOVOU (16*0)(ctx), T0
	PSHUFB BSWAP, T0
	PXOR ACC0, T0
	PSHUFD $78, T0, T1
	PXOR T0, T1

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, T1, ACCM
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1

	combinedDecRound(1)
	increment(0)
	combinedDecRound(2)
	increment(1)
	combinedDecRound(3)
	increment(2)
	combinedDecRound(4)
	increment(3)
	combinedDecRound(5)
	increment(4)
	combinedDecRound(6)
	increment(5)
	combinedDecRound(7)
	increment(6)

	aesRound(8)
	increment(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	aesRound(9)

	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE decLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
decLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ctx), T0
	PXOR T0, B0
	MOVOU (16*1)(ctx), T0
	PXOR T0, B1
	MOVOU (16*2)(ctx), T0
	PXOR T0, B2
	MOVOU (16*3)(ctx), T0
	PXOR T0, B3
	MOVOU (16*4)(ctx), T0
	PXOR T0, B4
	MOVOU (16*5)(ctx), T0
	PXOR T0, B5
	MOVOU (16*6)(ctx), T0
	PXOR T0, B6
	MOVOU (16*7)(ctx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ptx)
	MOVOU B1, (16*1)(ptx)
	MOVOU B2, (16*2)(ptx)
	MOVOU B3, (16*3)(ptx)
	MOVOU B4, (16*4)(ptx)
	MOVOU B5, (16*5)(ptx)
	MOVOU B6, (16*6)(ptx)
	MOVOU B7, (16*7)(ptx)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

	JMP gcmAesDecOctetsLoop

gcmAesDecEndOctets:

	// Eight counters were prepared ahead; only slot 0 is used from here
	// on, so rewind the scalar counter by the seven unused ones.
	SUBQ $7, aluCTR

gcmAesDecSingles:

	// Cache round keys 1..7 in B1-B7 for the one-block-at-a-time path.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesDecSinglesLoop:

	CMPQ ptxLen, $16
	JB gcmAesDecTail
	SUBQ $16, ptxLen

	// T1 keeps the raw ciphertext block for the final XOR.
	MOVOU (ctx), B0
	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (0*16)(SP), B0
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast2
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast2
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast2:
	AESENCLAST T0, B0

	PXOR T1, B0
	MOVOU B0, (ptx)

	LEAQ (16*1)(ptx), ptx
	LEAQ (16*1)(ctx), ctx

	JMP gcmAesDecSinglesLoop

gcmAesDecTail:

	TESTQ ptxLen, ptxLen
	JE gcmAesDecDone

	// T1 = andMask entry for the number of remaining bytes. aluCTR is
	// reused as a plain pointer here.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP
	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
	PAND T1, B0

	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	// The store performed by increment(0) still targets stack slot 0; the
	// scalar register it updates was overwritten above and is not read
	// again before the function returns.
	MOVOU (0*16)(SP), B0
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast3:
	AESENCLAST T0, B0
	PXOR T1, B0

	// Write out only the valid plaintext bytes, one at a time, so that
	// nothing beyond the end of dst is written.
ptxStoreLoop:
	BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x14; BYTE $0x06; BYTE $0x00  // PEXTRB $0, B0, (ptx)
	PSRLDQ $1, B0
	LEAQ 1(ptx), ptx
	DECQ ptxLen

	JNE ptxStoreLoop

gcmAesDecDone:

	MOVOU ACC0, (tPtr)
	RET