github.com/emmansun/gmsm@v0.29.1/sm4/gcm_amd64.s

// This is an optimized implementation of SM4-GCM: the SM4 rounds are computed
// with the AES-NI based macros from aesni_macros_amd64.s and the GHASH
// multiplications use CLMUL-NI.
// The implementation uses several optimizations described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
//     Hardware
//go:build !purego

#include "textflag.h"

#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7

#define DWB0 Y0
#define DWB1 Y2
#define DWB2 Y4
#define DWB3 Y6

#define XDWORD Y1
#define YDWORD Y3
#define XDWTMP0 Y5

#define ACC0 X8
#define ACC1 X9
#define ACCM X10

#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15
#define DWBSWAP Y15
#define NIBBLE_MASK Y7
#define X_NIBBLE_MASK X7

DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000

DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240

#include "aesni_macros_amd64.s"

// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX

	MOVQ productTable+0(FP), pTbl
	MOVQ tagMask+8(FP), tMsk
	MOVQ T+16(FP), tPtr
	MOVQ pLen+24(FP), plen
	MOVQ dLen+32(FP), dlen

	MOVOU (tPtr), ACC0
	MOVOU (tMsk), T2

	MOVOU bswap_mask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	SHLQ $3, plen
	SHLQ $3, dlen

	// Build the GHASH length block from the plaintext and AAD bit lengths
	// and fold in the current hash state.
	MOVQ plen, B0
	PINSRQ $1, dlen, B0

	PXOR ACC0, B0

	// Karatsuba multiply by H ((16*14)(pTbl) holds H, (16*15) the XOR of its
	// two halves).
	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
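	// Together with the PXOR of ACC1 that follows, this turns ACCM into the
	// middle Karatsuba term; it is then split across ACC1:ACC0, reduced
	// modulo the GCM polynomial (POLY), byte-swapped and XORed with tagMask
	// before being written back to T.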
120 PXOR ACC1, ACCM 121 MOVOU ACCM, T0 122 PSRLDQ $8, ACCM 123 PSLLDQ $8, T0 124 PXOR ACCM, ACC1 125 PXOR T0, ACC0 126 127 MOVOU POLY, T0 128 PCLMULQDQ $0x01, ACC0, T0 129 PSHUFD $78, ACC0, ACC0 130 PXOR T0, ACC0 131 132 MOVOU POLY, T0 133 PCLMULQDQ $0x01, ACC0, T0 134 PSHUFD $78, ACC0, ACC0 135 PXOR T0, ACC0 136 137 PXOR ACC1, ACC0 138 139 PSHUFB BSWAP, ACC0 140 PXOR T2, ACC0 141 MOVOU ACC0, (tPtr) 142 143 RET 144 145 #undef pTbl 146 #undef tMsk 147 #undef tPtr 148 #undef plen 149 #undef dlen 150 151 // func gcmSm4Init(productTable *[256]byte, rk []uint32) 152 TEXT ·gcmSm4Init(SB),NOSPLIT,$0 153 #define dst DI 154 #define RK SI 155 156 MOVQ productTable+0(FP), dst 157 MOVQ rk+8(FP), RK 158 159 MOVOU gcmPoly<>(SB), POLY 160 161 // Encrypt block 0, with the sm4 round keys to generate the hash key H 162 PXOR B0, B0 163 PXOR B1, B1 164 PXOR B2, B2 165 PXOR B3, B3 166 XORL CX, CX 167 168 sm4InitEncLoop: 169 MOVUPS (RK)(CX*1), B4 170 MOVOU B4, T0 171 SM4_SINGLE_ROUND(T0, T1, T2, B3, B2, B1, B0) 172 PSHUFD $1, B4, T0 173 SM4_SINGLE_ROUND(T0, T1, T2, B2, B1, B0, B3) 174 PSHUFD $2, B4, T0 175 SM4_SINGLE_ROUND(T0, T1, T2, B1, B0, B3, B2) 176 PSHUFD $3, B4, T0 177 SM4_SINGLE_ROUND(T0, T1, T2, B0, B3, B2, B1) 178 179 ADDL $16, CX 180 CMPL CX, $4*32 181 JB sm4InitEncLoop 182 183 PALIGNR $4, B3, B3 184 PALIGNR $4, B3, B2 185 PALIGNR $4, B2, B1 186 PALIGNR $4, B1, B0 187 188 // H * 2 189 PSHUFD $0xff, B0, T0 190 MOVOU B0, T1 191 PSRAL $31, T0 192 PAND POLY, T0 193 PSRLL $31, T1 194 PSLLDQ $4, T1 195 PSLLL $1, B0 196 PXOR T0, B0 197 PXOR T1, B0 198 // Karatsuba pre-computations 199 MOVOU B0, (16*14)(dst) 200 PSHUFD $78, B0, B1 201 PXOR B0, B1 202 MOVOU B1, (16*15)(dst) 203 204 MOVOU B0, B2 205 MOVOU B1, B3 206 // Now prepare powers of H and pre-computations for them 207 MOVQ $7, AX 208 209 initLoop: 210 MOVOU B2, T0 211 MOVOU B2, T1 212 MOVOU B3, T2 213 PCLMULQDQ $0x00, B0, T0 214 PCLMULQDQ $0x11, B0, T1 215 PCLMULQDQ $0x00, B1, T2 216 217 PXOR T0, T2 218 PXOR T1, T2 219 MOVOU T2, B4 220 PSLLDQ $8, B4 221 PSRLDQ $8, T2 222 PXOR B4, T0 223 PXOR T2, T1 224 225 MOVOU POLY, B2 226 PCLMULQDQ $0x01, T0, B2 227 PSHUFD $78, T0, T0 228 PXOR B2, T0 229 MOVOU POLY, B2 230 PCLMULQDQ $0x01, T0, B2 231 PSHUFD $78, T0, T0 232 PXOR T0, B2 233 PXOR T1, B2 234 235 MOVOU B2, (16*12)(dst) 236 PSHUFD $78, B2, B3 237 PXOR B2, B3 238 MOVOU B3, (16*13)(dst) 239 240 DECQ AX 241 LEAQ (-16*2)(dst), dst 242 JNE initLoop 243 244 RET 245 246 #undef RK 247 #undef dst 248 249 // func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte) 250 TEXT ·gcmSm4Data(SB),NOSPLIT,$0 251 #define pTbl DI 252 #define aut SI 253 #define tPtr CX 254 #define autLen DX 255 256 #define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a 257 #define avxReduceRound(a) VPCLMULQDQ $0x01, a, POLY, T0; VPSHUFD $78, a, a; VPXOR T0, a, a 258 #define mulRoundAAD(X ,i) \ 259 MOVOU (16*(i*2))(pTbl), T1;\ 260 MOVOU T1, T2;\ 261 PCLMULQDQ $0x00, X, T1;\ 262 PXOR T1, ACC0;\ 263 PCLMULQDQ $0x11, X, T2;\ 264 PXOR T2, ACC1;\ 265 PSHUFD $78, X, T1;\ 266 PXOR T1, X;\ 267 MOVOU (16*(i*2+1))(pTbl), T1;\ 268 PCLMULQDQ $0x00, X, T1;\ 269 PXOR T1, ACCM 270 271 MOVQ productTable+0(FP), pTbl 272 MOVQ data_base+8(FP), aut 273 MOVQ data_len+16(FP), autLen 274 MOVQ T+32(FP), tPtr 275 276 PXOR ACC0, ACC0 277 // MOVOU (tPtr), ACC0 // originally we passed in tag initial value 278 MOVOU bswap_mask<>(SB), BSWAP 279 MOVOU gcmPoly<>(SB), POLY 280 281 TESTQ autLen, autLen 282 JEQ dataBail 283 284 CMPQ autLen, $13 // optimize the TLS case 285 JE 
dataTLS 286 CMPQ autLen, $128 287 JB startSinglesLoop 288 JMP dataOctaLoop 289 290 dataTLS: 291 MOVOU (16*14)(pTbl), T1 292 MOVOU (16*15)(pTbl), T2 293 PXOR B0, B0 294 MOVQ (aut), B0 295 PINSRD $2, 8(aut), B0 296 PINSRB $12, 12(aut), B0 297 XORQ autLen, autLen 298 JMP dataMul 299 300 dataOctaLoop: 301 CMPQ autLen, $128 302 JB startSinglesLoop 303 SUBQ $128, autLen 304 305 MOVOU (16*0)(aut), X0 306 MOVOU (16*1)(aut), X1 307 MOVOU (16*2)(aut), X2 308 MOVOU (16*3)(aut), X3 309 MOVOU (16*4)(aut), X4 310 MOVOU (16*5)(aut), X5 311 MOVOU (16*6)(aut), X6 312 MOVOU (16*7)(aut), X7 313 LEAQ (16*8)(aut), aut 314 PSHUFB BSWAP, X0 315 PSHUFB BSWAP, X1 316 PSHUFB BSWAP, X2 317 PSHUFB BSWAP, X3 318 PSHUFB BSWAP, X4 319 PSHUFB BSWAP, X5 320 PSHUFB BSWAP, X6 321 PSHUFB BSWAP, X7 322 PXOR ACC0, X0 323 324 MOVOU (16*0)(pTbl), ACC0 325 MOVOU (16*1)(pTbl), ACCM 326 MOVOU ACC0, ACC1 327 PSHUFD $78, X0, T1 328 PXOR X0, T1 329 PCLMULQDQ $0x00, X0, ACC0 330 PCLMULQDQ $0x11, X0, ACC1 331 PCLMULQDQ $0x00, T1, ACCM 332 333 mulRoundAAD(X1, 1) 334 mulRoundAAD(X2, 2) 335 mulRoundAAD(X3, 3) 336 mulRoundAAD(X4, 4) 337 mulRoundAAD(X5, 5) 338 mulRoundAAD(X6, 6) 339 mulRoundAAD(X7, 7) 340 341 PXOR ACC0, ACCM 342 PXOR ACC1, ACCM 343 MOVOU ACCM, T0 344 PSRLDQ $8, ACCM 345 PSLLDQ $8, T0 346 PXOR ACCM, ACC1 347 PXOR T0, ACC0 348 reduceRound(ACC0) 349 reduceRound(ACC0) 350 PXOR ACC1, ACC0 351 JMP dataOctaLoop 352 353 startSinglesLoop: 354 MOVOU (16*14)(pTbl), T1 355 MOVOU (16*15)(pTbl), T2 356 357 dataSinglesLoop: 358 359 CMPQ autLen, $16 360 JB dataEnd 361 SUBQ $16, autLen 362 363 MOVOU (aut), B0 364 dataMul: 365 PSHUFB BSWAP, B0 366 PXOR ACC0, B0 367 368 MOVOU T1, ACC0 369 MOVOU T2, ACCM 370 MOVOU T1, ACC1 371 372 PSHUFD $78, B0, T0 373 PXOR B0, T0 374 PCLMULQDQ $0x00, B0, ACC0 375 PCLMULQDQ $0x11, B0, ACC1 376 PCLMULQDQ $0x00, T0, ACCM 377 378 PXOR ACC0, ACCM 379 PXOR ACC1, ACCM 380 MOVOU ACCM, T0 381 PSRLDQ $8, ACCM 382 PSLLDQ $8, T0 383 PXOR ACCM, ACC1 384 PXOR T0, ACC0 385 386 MOVOU POLY, T0 387 PCLMULQDQ $0x01, ACC0, T0 388 PSHUFD $78, ACC0, ACC0 389 PXOR T0, ACC0 390 391 MOVOU POLY, T0 392 PCLMULQDQ $0x01, ACC0, T0 393 PSHUFD $78, ACC0, ACC0 394 PXOR T0, ACC0 395 PXOR ACC1, ACC0 396 397 LEAQ 16(aut), aut 398 399 JMP dataSinglesLoop 400 401 dataEnd: 402 403 TESTQ autLen, autLen 404 JEQ dataBail 405 406 PXOR B0, B0 407 LEAQ -1(aut)(autLen*1), aut 408 409 dataLoadLoop: 410 411 PSLLDQ $1, B0 412 PINSRB $0, (aut), B0 413 414 LEAQ -1(aut), aut 415 DECQ autLen 416 JNE dataLoadLoop 417 418 JMP dataMul 419 420 dataBail: 421 MOVOU ACC0, (tPtr) 422 RET 423 424 #undef pTbl 425 #undef aut 426 #undef tPtr 427 #undef autLen 428 429 430 // func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) 431 TEXT ·gcmSm4Enc(SB),0,$256-96 432 #define pTbl DI 433 #define ctx DX 434 #define ctrPtr CX 435 #define ptx SI 436 #define rk AX 437 #define tPtr R8 438 #define ptxLen R9 439 #define aluCTR R10 440 #define aluTMP R11 441 442 #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, (3*4 + 8*16 + i*16)(SP) 443 444 #define mulRound(i) \ 445 MOVOU (16*i)(SP), T0;\ 446 MOVOU (16*(i*2))(pTbl), T1;\ 447 MOVOU T1, T2;\ 448 PCLMULQDQ $0x00, T0, T1;\ 449 PXOR T1, ACC0;\ 450 PCLMULQDQ $0x11, T0, T2;\ 451 PXOR T2, ACC1;\ 452 PSHUFD $78, T0, T1;\ 453 PXOR T1, T0;\ 454 MOVOU (16*(i*2+1))(pTbl), T1;\ 455 PCLMULQDQ $0x00, T0, T1;\ 456 PXOR T1, ACCM 457 458 #define gcmEncDataStep(B) \ 459 PSHUFB BSWAP, B; \ 460 PXOR ACC0, B; \ 461 MOVOU T2, ACC0; \ 462 MOVOU T2, ACC1; \ 463 MOVOU (16*15)(pTbl), ACCM; \ 464 PSHUFD $78, B, T0; \ 465 PXOR 
B, T0; \ 466 PCLMULQDQ $0x00, B, ACC0; \ 467 PCLMULQDQ $0x11, B, ACC1; \ 468 PCLMULQDQ $0x00, T0, ACCM; \ 469 PXOR ACC0, ACCM; \ 470 PXOR ACC1, ACCM; \ 471 MOVOU ACCM, T0; \ 472 PSRLDQ $8, ACCM; \ 473 PSLLDQ $8, T0; \ 474 PXOR ACCM, ACC1; \ 475 PXOR T0, ACC0; \ 476 reduceRound(ACC0); \ 477 reduceRound(ACC0); \ 478 PXOR ACC1, ACC0 479 480 #define avxMulRound(i) \ 481 VMOVDQU (16*i)(SP), T0;\ 482 VMOVDQU (16*(i*2))(pTbl), T2;\ 483 VPCLMULQDQ $0x00, T0, T2, T1;\ 484 VPXOR T1, ACC0, ACC0;\ 485 VPCLMULQDQ $0x11, T0, T2, T2;\ 486 VPXOR T2, ACC1, ACC1;\ 487 VPSHUFD $78, T0, T1;\ 488 VPXOR T1, T0, T0;\ 489 VMOVDQU (16*(i*2+1))(pTbl), T1;\ 490 VPCLMULQDQ $0x00, T0, T1, T1;\ 491 VPXOR T1, ACCM, ACCM 492 493 #define avxGcmEncDataStep(B) \ 494 VPSHUFB BSWAP, B, B; \ 495 VPXOR ACC0, B, B; \ 496 VMOVDQU (16*15)(pTbl), ACCM; \ 497 VPSHUFD $78, B, T0; \ 498 VPXOR B, T0, T0; \ 499 VPCLMULQDQ $0x00, B, T2, ACC0; \ 500 VPCLMULQDQ $0x11, B, T2, ACC1; \ 501 VPCLMULQDQ $0x00, T0, ACCM, ACCM; \ 502 VPXOR ACC0, ACCM, ACCM; \ 503 VPXOR ACC1, ACCM, ACCM; \ 504 VPSLLDQ $8, ACCM, T0; \ 505 VPSRLDQ $8, ACCM, ACCM; \ 506 VPXOR ACCM, ACC1, ACC1; \ 507 VPXOR T0, ACC0, ACC0; \ 508 avxReduceRound(ACC0); \ 509 avxReduceRound(ACC0); \ 510 VPXOR ACC1, ACC0, ACC0 511 512 MOVQ productTable+0(FP), pTbl 513 MOVQ dst+8(FP), ctx 514 MOVQ src_base+32(FP), ptx 515 MOVQ src_len+40(FP), ptxLen 516 MOVQ ctr+56(FP), ctrPtr 517 MOVQ T+64(FP), tPtr 518 MOVQ rk_base+72(FP), rk 519 520 CMPB ·useAVX2(SB), $1 521 JE avx2GcmSm4Enc 522 523 CMPB ·useAVX(SB), $1 524 JE avxGcmSm4Enc 525 526 MOVOU bswap_mask<>(SB), BSWAP 527 MOVOU gcmPoly<>(SB), POLY 528 529 MOVOU (tPtr), ACC0 530 PXOR ACC1, ACC1 531 PXOR ACCM, ACCM 532 MOVOU (ctrPtr), T0 533 PSHUFB flip_mask<>(SB), T0 534 PEXTRD $3, T0, aluCTR 535 536 MOVOU T0, (8*16 + 0*16)(SP) 537 increment(0) 538 MOVOU T0, (8*16 + 1*16)(SP) 539 increment(1) 540 MOVOU T0, (8*16 + 2*16)(SP) 541 increment(2) 542 MOVOU T0, (8*16 + 3*16)(SP) 543 increment(3) 544 545 CMPQ ptxLen, $128 546 JB gcmSm4EncNibbles 547 SUBQ $128, ptxLen 548 549 // We have at least 8 blocks to encrypt, prepare the rest of the counters 550 MOVOU T0, (8*16 + 4*16)(SP) 551 increment(4) 552 MOVOU T0, (8*16 + 5*16)(SP) 553 increment(5) 554 MOVOU T0, (8*16 + 6*16)(SP) 555 increment(6) 556 MOVOU T0, (8*16 + 7*16)(SP) 557 increment(7) 558 559 // load 8 ctrs for encryption 560 MOVOU (8*16 + 0*16)(SP), B0 561 MOVOU (8*16 + 1*16)(SP), B1 562 MOVOU (8*16 + 2*16)(SP), B2 563 MOVOU (8*16 + 3*16)(SP), B3 564 MOVOU (8*16 + 4*16)(SP), B4 565 MOVOU (8*16 + 5*16)(SP), B5 566 MOVOU (8*16 + 6*16)(SP), B6 567 MOVOU (8*16 + 7*16)(SP), B7 568 569 SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) 570 increment(0) 571 572 // XOR plaintext 573 MOVOU (16*0)(ptx), T0 574 PXOR T0, B0 575 increment(1) 576 MOVOU (16*1)(ptx), T0 577 PXOR T0, B1 578 increment(2) 579 MOVOU (16*2)(ptx), T0 580 PXOR T0, B2 581 increment(3) 582 MOVOU (16*3)(ptx), T0 583 PXOR T0, B3 584 increment(4) 585 MOVOU (16*4)(ptx), T0 586 PXOR T0, B4 587 increment(5) 588 MOVOU (16*5)(ptx), T0 589 PXOR T0, B5 590 increment(6) 591 MOVOU (16*6)(ptx), T0 592 PXOR T0, B6 593 increment(7) 594 MOVOU (16*7)(ptx), T0 595 PXOR T0, B7 596 597 // Store ciphertext 598 MOVOU B0, (16*0)(ctx) 599 PSHUFB BSWAP, B0 600 PXOR ACC0, B0 601 MOVOU B1, (16*1)(ctx) 602 PSHUFB BSWAP, B1 603 MOVOU B2, (16*2)(ctx) 604 PSHUFB BSWAP, B2 605 MOVOU B3, (16*3)(ctx) 606 PSHUFB BSWAP, B3 607 MOVOU B4, (16*4)(ctx) 608 PSHUFB BSWAP, B4 609 MOVOU B5, (16*5)(ctx) 610 PSHUFB BSWAP, B5 611 MOVOU B6, (16*6)(ctx) 612 PSHUFB 
BSWAP, B6 613 MOVOU B7, (16*7)(ctx) 614 PSHUFB BSWAP, B7 615 616 MOVOU B0, (16*0)(SP) 617 MOVOU B1, (16*1)(SP) 618 MOVOU B2, (16*2)(SP) 619 MOVOU B3, (16*3)(SP) 620 MOVOU B4, (16*4)(SP) 621 MOVOU B5, (16*5)(SP) 622 MOVOU B6, (16*6)(SP) 623 MOVOU B7, (16*7)(SP) 624 625 LEAQ 128(ptx), ptx 626 LEAQ 128(ctx), ctx 627 628 gcmSm4EncOctetsLoop: 629 CMPQ ptxLen, $128 630 JB gcmSm4EncOctetsEnd 631 SUBQ $128, ptxLen 632 633 MOVOU (8*16 + 0*16)(SP), B0 634 MOVOU (8*16 + 1*16)(SP), B1 635 MOVOU (8*16 + 2*16)(SP), B2 636 MOVOU (8*16 + 3*16)(SP), B3 637 MOVOU (8*16 + 4*16)(SP), B4 638 MOVOU (8*16 + 5*16)(SP), B5 639 MOVOU (8*16 + 6*16)(SP), B6 640 MOVOU (8*16 + 7*16)(SP), B7 641 642 MOVOU (16*0)(SP), T0 643 PSHUFD $78, T0, T1 644 PXOR T0, T1 645 646 MOVOU (16*0)(pTbl), ACC0 647 MOVOU (16*1)(pTbl), ACCM 648 MOVOU ACC0, ACC1 649 650 PCLMULQDQ $0x00, T1, ACCM 651 PCLMULQDQ $0x00, T0, ACC0 652 PCLMULQDQ $0x11, T0, ACC1 653 654 mulRound(1) 655 increment(0) 656 mulRound(2) 657 increment(1) 658 mulRound(3) 659 increment(2) 660 mulRound(4) 661 increment(3) 662 mulRound(5) 663 increment(4) 664 mulRound(6) 665 increment(5) 666 mulRound(7) 667 increment(6) 668 669 PXOR ACC0, ACCM 670 PXOR ACC1, ACCM 671 MOVOU ACCM, T0 672 PSRLDQ $8, ACCM 673 PSLLDQ $8, T0 674 PXOR ACCM, ACC1 675 PXOR T0, ACC0 676 677 increment(7) 678 reduceRound(ACC0) 679 reduceRound(ACC0) 680 PXOR ACC1, ACC0 681 682 SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) 683 684 MOVOU (16*0)(ptx), T0 685 PXOR T0, B0 686 MOVOU (16*1)(ptx), T0 687 PXOR T0, B1 688 MOVOU (16*2)(ptx), T0 689 PXOR T0, B2 690 MOVOU (16*3)(ptx), T0 691 PXOR T0, B3 692 MOVOU (16*4)(ptx), T0 693 PXOR T0, B4 694 MOVOU (16*5)(ptx), T0 695 PXOR T0, B5 696 MOVOU (16*6)(ptx), T0 697 PXOR T0, B6 698 MOVOU (16*7)(ptx), T0 699 PXOR T0, B7 700 701 MOVOU B0, (16*0)(ctx) 702 PSHUFB BSWAP, B0 703 PXOR ACC0, B0 704 MOVOU B1, (16*1)(ctx) 705 PSHUFB BSWAP, B1 706 MOVOU B2, (16*2)(ctx) 707 PSHUFB BSWAP, B2 708 MOVOU B3, (16*3)(ctx) 709 PSHUFB BSWAP, B3 710 MOVOU B4, (16*4)(ctx) 711 PSHUFB BSWAP, B4 712 MOVOU B5, (16*5)(ctx) 713 PSHUFB BSWAP, B5 714 MOVOU B6, (16*6)(ctx) 715 PSHUFB BSWAP, B6 716 MOVOU B7, (16*7)(ctx) 717 PSHUFB BSWAP, B7 718 719 MOVOU B0, (16*0)(SP) 720 MOVOU B1, (16*1)(SP) 721 MOVOU B2, (16*2)(SP) 722 MOVOU B3, (16*3)(SP) 723 MOVOU B4, (16*4)(SP) 724 MOVOU B5, (16*5)(SP) 725 MOVOU B6, (16*6)(SP) 726 MOVOU B7, (16*7)(SP) 727 728 LEAQ 128(ptx), ptx 729 LEAQ 128(ctx), ctx 730 731 JMP gcmSm4EncOctetsLoop 732 733 gcmSm4EncOctetsEnd: 734 MOVOU (16*0)(SP), T0 735 MOVOU (16*0)(pTbl), ACC0 736 MOVOU (16*1)(pTbl), ACCM 737 MOVOU ACC0, ACC1 738 PSHUFD $78, T0, T1 739 PXOR T0, T1 740 PCLMULQDQ $0x00, T0, ACC0 741 PCLMULQDQ $0x11, T0, ACC1 742 PCLMULQDQ $0x00, T1, ACCM 743 744 mulRound(1) 745 mulRound(2) 746 mulRound(3) 747 mulRound(4) 748 mulRound(5) 749 mulRound(6) 750 mulRound(7) 751 752 PXOR ACC0, ACCM 753 PXOR ACC1, ACCM 754 MOVOU ACCM, T0 755 PSRLDQ $8, ACCM 756 PSLLDQ $8, T0 757 PXOR ACCM, ACC1 758 PXOR T0, ACC0 759 760 reduceRound(ACC0) 761 reduceRound(ACC0) 762 PXOR ACC1, ACC0 763 764 TESTQ ptxLen, ptxLen 765 JE gcmSm4EncDone 766 767 SUBQ $4, aluCTR 768 769 gcmSm4EncNibbles: 770 CMPQ ptxLen, $64 771 JBE gcmSm4EncSingles 772 SUBQ $64, ptxLen 773 774 MOVOU (8*16 + 0*16)(SP), B0 775 MOVOU (8*16 + 1*16)(SP), B1 776 MOVOU (8*16 + 2*16)(SP), B2 777 MOVOU (8*16 + 3*16)(SP), B3 778 779 SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3) 780 MOVOU (16*0)(ptx), T0 781 PXOR T0, B0 782 MOVOU (16*1)(ptx), T0 783 PXOR T0, B1 784 MOVOU (16*2)(ptx), T0 785 PXOR T0, B2 786 
MOVOU (16*3)(ptx), T0 787 PXOR T0, B3 788 789 MOVOU B0, (16*0)(ctx) 790 MOVOU B1, (16*1)(ctx) 791 MOVOU B2, (16*2)(ctx) 792 MOVOU B3, (16*3)(ctx) 793 794 MOVOU (16*14)(pTbl), T2 795 increment(0) 796 gcmEncDataStep(B0) 797 increment(1) 798 gcmEncDataStep(B1) 799 increment(2) 800 gcmEncDataStep(B2) 801 increment(3) 802 gcmEncDataStep(B3) 803 804 LEAQ 64(ptx), ptx 805 LEAQ 64(ctx), ctx 806 807 gcmSm4EncSingles: 808 TESTQ ptxLen, ptxLen 809 JE gcmSm4EncDone 810 MOVOU (8*16 + 0*16)(SP), B0 811 MOVOU (8*16 + 1*16)(SP), B1 812 MOVOU (8*16 + 2*16)(SP), B2 813 MOVOU (8*16 + 3*16)(SP), B3 814 815 SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3) 816 MOVOU B0, (16*0)(SP) 817 MOVOU B1, (16*1)(SP) 818 MOVOU B2, (16*2)(SP) 819 MOVOU B3, (16*3)(SP) 820 821 MOVOU (16*14)(pTbl), T2 822 MOVQ SP, BP 823 824 gcmSm4EncSinglesLoop: 825 CMPQ ptxLen, $16 826 JB gcmSm4EncTail 827 SUBQ $16, ptxLen 828 MOVOU (16*0)(BP), B0 829 MOVOU (ptx), T0 830 PXOR T0, B0 831 MOVOU B0, (ctx) 832 gcmEncDataStep(B0) 833 LEAQ (16*1)(ptx), ptx 834 LEAQ (16*1)(ctx), ctx 835 ADDQ $16, BP 836 JMP gcmSm4EncSinglesLoop 837 838 gcmSm4EncTail: 839 TESTQ ptxLen, ptxLen 840 JE gcmSm4EncDone 841 MOVOU (16*0)(BP), B0 842 MOVOU B0, T0 843 844 LEAQ -1(ptx)(ptxLen*1), ptx 845 846 MOVQ ptxLen, aluTMP 847 SHLQ $4, aluTMP 848 849 LEAQ andMask<>(SB), aluCTR 850 MOVOU -16(aluCTR)(aluTMP*1), T1 851 PXOR B0, B0 852 ptxLoadLoop: 853 PSLLDQ $1, B0 854 PINSRB $0, (ptx), B0 855 LEAQ -1(ptx), ptx 856 DECQ ptxLen 857 JNE ptxLoadLoop 858 859 PXOR T0, B0 860 PAND T1, B0 861 MOVOU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT 862 gcmEncDataStep(B0) 863 864 gcmSm4EncDone: 865 MOVOU ACC0, (tPtr) 866 RET 867 868 avxGcmSm4Enc: 869 VMOVDQU bswap_mask<>(SB), BSWAP 870 VMOVDQU gcmPoly<>(SB), POLY 871 872 VMOVDQU (tPtr), ACC0 873 VPXOR ACC1, ACC1, ACC1 874 VPXOR ACCM, ACCM, ACCM 875 VMOVDQU (ctrPtr), T0 876 VPSHUFB flip_mask<>(SB), T0, T0 877 VPEXTRD $3, T0, aluCTR 878 879 VMOVDQU T0, (8*16 + 0*16)(SP) 880 increment(0) 881 VMOVDQU T0, (8*16 + 1*16)(SP) 882 increment(1) 883 VMOVDQU T0, (8*16 + 2*16)(SP) 884 increment(2) 885 VMOVDQU T0, (8*16 + 3*16)(SP) 886 increment(3) 887 888 CMPQ ptxLen, $128 889 JB avxGcmSm4EncNibbles 890 SUBQ $128, ptxLen 891 892 // We have at least 8 blocks to encrypt, prepare the rest of the counters 893 VMOVDQU T0, (8*16 + 4*16)(SP) 894 increment(4) 895 VMOVDQU T0, (8*16 + 5*16)(SP) 896 increment(5) 897 VMOVDQU T0, (8*16 + 6*16)(SP) 898 increment(6) 899 VMOVDQU T0, (8*16 + 7*16)(SP) 900 increment(7) 901 902 // load 8 ctrs for encryption 903 VMOVDQU (8*16 + 0*16)(SP), B0 904 VMOVDQU (8*16 + 1*16)(SP), B1 905 VMOVDQU (8*16 + 2*16)(SP), B2 906 VMOVDQU (8*16 + 3*16)(SP), B3 907 VMOVDQU (8*16 + 4*16)(SP), B4 908 VMOVDQU (8*16 + 5*16)(SP), B5 909 VMOVDQU (8*16 + 6*16)(SP), B6 910 VMOVDQU (8*16 + 7*16)(SP), B7 911 912 AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) 913 increment(0) 914 915 // XOR plaintext 916 VPXOR (16*0)(ptx), B0, B0 917 VPXOR (16*1)(ptx), B1, B1 918 increment(1) 919 VPXOR (16*2)(ptx), B2, B2 920 VPXOR (16*3)(ptx), B3, B3 921 increment(2) 922 VPXOR (16*4)(ptx), B4, B4 923 VPXOR (16*5)(ptx), B5, B5 924 increment(3) 925 VPXOR (16*6)(ptx), B6, B6 926 VPXOR (16*7)(ptx), B7, B7 927 // Store ciphertext 928 VMOVDQU B0, (16*0)(ctx) 929 VPSHUFB BSWAP, B0, B0 930 increment(4) 931 VMOVDQU B1, (16*1)(ctx) 932 VPSHUFB BSWAP, B1, B1 933 increment(5) 934 VMOVDQU B2, (16*2)(ctx) 935 VPSHUFB BSWAP, B2, B2 936 increment(6) 937 VMOVDQU B3, (16*3)(ctx) 938 VPSHUFB BSWAP, B3, B3 939 
increment(7) 940 VMOVDQU B4, (16*4)(ctx) 941 VPSHUFB BSWAP, B4, B4 942 VMOVDQU B5, (16*5)(ctx) 943 VPSHUFB BSWAP, B5, B5 944 VMOVDQU B6, (16*6)(ctx) 945 VPSHUFB BSWAP, B6, B6 946 VMOVDQU B7, (16*7)(ctx) 947 VPSHUFB BSWAP, B7, B7 948 949 VPXOR ACC0, B0, B0 950 951 VMOVDQU B0, (16*0)(SP) 952 VMOVDQU B1, (16*1)(SP) 953 VMOVDQU B2, (16*2)(SP) 954 VMOVDQU B3, (16*3)(SP) 955 VMOVDQU B4, (16*4)(SP) 956 VMOVDQU B5, (16*5)(SP) 957 VMOVDQU B6, (16*6)(SP) 958 VMOVDQU B7, (16*7)(SP) 959 960 LEAQ 128(ptx), ptx 961 LEAQ 128(ctx), ctx 962 963 avxGcmSm4EncOctetsLoop: 964 CMPQ ptxLen, $128 965 JB avxGcmSm4EncOctetsEnd 966 SUBQ $128, ptxLen 967 968 // load 8 ctrs for encryption 969 VMOVDQU (8*16 + 0*16)(SP), B0 970 VMOVDQU (8*16 + 1*16)(SP), B1 971 VMOVDQU (8*16 + 2*16)(SP), B2 972 VMOVDQU (8*16 + 3*16)(SP), B3 973 VMOVDQU (8*16 + 4*16)(SP), B4 974 VMOVDQU (8*16 + 5*16)(SP), B5 975 VMOVDQU (8*16 + 6*16)(SP), B6 976 VMOVDQU (8*16 + 7*16)(SP), B7 977 978 VMOVDQU (16*0)(SP), T0 979 VPSHUFD $78, T0, T1 980 VPXOR T0, T1, T1 981 982 VMOVDQU (16*0)(pTbl), ACC1 983 VMOVDQU (16*1)(pTbl), ACCM 984 985 VPCLMULQDQ $0x00, T1, ACCM, ACCM 986 VPCLMULQDQ $0x00, T0, ACC1, ACC0 987 VPCLMULQDQ $0x11, T0, ACC1, ACC1 988 989 avxMulRound(1) 990 increment(0) 991 avxMulRound(2) 992 increment(1) 993 avxMulRound(3) 994 increment(2) 995 avxMulRound(4) 996 increment(3) 997 avxMulRound(5) 998 increment(4) 999 avxMulRound(6) 1000 increment(5) 1001 avxMulRound(7) 1002 increment(6) 1003 VPXOR ACC0, ACCM, ACCM 1004 VPXOR ACC1, ACCM, ACCM 1005 VPSLLDQ $8, ACCM, T0 1006 VPSRLDQ $8, ACCM, ACCM 1007 1008 VPXOR ACCM, ACC1, ACC1 1009 VPXOR T0, ACC0, ACC0 1010 1011 increment(7) 1012 avxReduceRound(ACC0) 1013 avxReduceRound(ACC0) 1014 VPXOR ACC1, ACC0, ACC0 1015 1016 AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) 1017 // XOR plaintext 1018 VPXOR (16*0)(ptx), B0, B0 1019 VPXOR (16*1)(ptx), B1, B1 1020 VPXOR (16*2)(ptx), B2, B2 1021 VPXOR (16*3)(ptx), B3, B3 1022 VPXOR (16*4)(ptx), B4, B4 1023 VPXOR (16*5)(ptx), B5, B5 1024 VPXOR (16*6)(ptx), B6, B6 1025 VPXOR (16*7)(ptx), B7, B7 1026 1027 // Store ciphertext 1028 VMOVDQU B0, (16*0)(ctx) 1029 VPSHUFB BSWAP, B0, B0 1030 VMOVDQU B1, (16*1)(ctx) 1031 VPSHUFB BSWAP, B1, B1 1032 VMOVDQU B2, (16*2)(ctx) 1033 VPSHUFB BSWAP, B2, B2 1034 VMOVDQU B3, (16*3)(ctx) 1035 VPSHUFB BSWAP, B3, B3 1036 VMOVDQU B4, (16*4)(ctx) 1037 VPSHUFB BSWAP, B4, B4 1038 VMOVDQU B5, (16*5)(ctx) 1039 VPSHUFB BSWAP, B5, B5 1040 VMOVDQU B6, (16*6)(ctx) 1041 VPSHUFB BSWAP, B6, B6 1042 VMOVDQU B7, (16*7)(ctx) 1043 VPSHUFB BSWAP, B7, B7 1044 1045 VPXOR ACC0, B0, B0 1046 VMOVDQU B0, (16*0)(SP) 1047 VMOVDQU B1, (16*1)(SP) 1048 VMOVDQU B2, (16*2)(SP) 1049 VMOVDQU B3, (16*3)(SP) 1050 VMOVDQU B4, (16*4)(SP) 1051 VMOVDQU B5, (16*5)(SP) 1052 VMOVDQU B6, (16*6)(SP) 1053 VMOVDQU B7, (16*7)(SP) 1054 1055 LEAQ 128(ptx), ptx 1056 LEAQ 128(ctx), ctx 1057 1058 JMP avxGcmSm4EncOctetsLoop 1059 1060 avxGcmSm4EncOctetsEnd: 1061 VMOVDQU (16*0)(SP), T0 1062 VMOVDQU (16*0)(pTbl), ACC0 1063 VMOVDQU (16*1)(pTbl), ACCM 1064 VMOVDQU ACC0, ACC1 1065 VPSHUFD $78, T0, T1 1066 VPXOR T0, T1, T1 1067 VPCLMULQDQ $0x00, T0, ACC0, ACC0 1068 VPCLMULQDQ $0x11, T0, ACC1, ACC1 1069 VPCLMULQDQ $0x00, T1, ACCM, ACCM 1070 1071 avxMulRound(1) 1072 avxMulRound(2) 1073 avxMulRound(3) 1074 avxMulRound(4) 1075 avxMulRound(5) 1076 avxMulRound(6) 1077 avxMulRound(7) 1078 1079 VPXOR ACC0, ACCM, ACCM 1080 VPXOR ACC1, ACCM, ACCM 1081 VPSLLDQ $8, ACCM, T0 1082 VPSRLDQ $8, ACCM, ACCM 1083 1084 VPXOR ACCM, ACC1, ACC1 1085 VPXOR T0, ACC0, ACC0 1086 1087 
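
	// The two avxReduceRound calls below perform the standard fast reduction
	// of the 256-bit Karatsuba product in ACC1:ACC0 back to 128 bits: each
	// round folds 64 bits via a carry-less multiply with POLY
	// (0xc2000000000000000000000000000001, the bit-reflected GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1), and the final XOR adds in the high half.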
avxReduceRound(ACC0) 1088 avxReduceRound(ACC0) 1089 VPXOR ACC1, ACC0, ACC0 1090 1091 TESTQ ptxLen, ptxLen 1092 JE avxGcmSm4EncDone 1093 1094 SUBQ $4, aluCTR 1095 1096 avxGcmSm4EncNibbles: 1097 CMPQ ptxLen, $64 1098 JBE avxGcmSm4EncSingles 1099 SUBQ $64, ptxLen 1100 1101 // load 4 ctrs for encryption 1102 VMOVDQU (8*16 + 0*16)(SP), B0 1103 VMOVDQU (8*16 + 1*16)(SP), B1 1104 VMOVDQU (8*16 + 2*16)(SP), B2 1105 VMOVDQU (8*16 + 3*16)(SP), B3 1106 1107 AVX_SM4_4BLOCKS_WO_BS(rk, B6, B7, T1, T2, B0, B1, B2, B3) 1108 // XOR plaintext 1109 VPXOR (16*0)(ptx), B0, B0 1110 VPXOR (16*1)(ptx), B1, B1 1111 VPXOR (16*2)(ptx), B2, B2 1112 VPXOR (16*3)(ptx), B3, B3 1113 1114 // Store ciphertext 1115 VMOVDQU B0, (16*0)(ctx) 1116 VMOVDQU B1, (16*1)(ctx) 1117 VMOVDQU B2, (16*2)(ctx) 1118 VMOVDQU B3, (16*3)(ctx) 1119 1120 VMOVDQU (16*14)(pTbl), T2 1121 increment(0) 1122 avxGcmEncDataStep(B0) 1123 increment(1) 1124 avxGcmEncDataStep(B1) 1125 increment(2) 1126 avxGcmEncDataStep(B2) 1127 increment(3) 1128 avxGcmEncDataStep(B3) 1129 1130 LEAQ 64(ptx), ptx 1131 LEAQ 64(ctx), ctx 1132 1133 avxGcmSm4EncSingles: 1134 TESTQ ptxLen, ptxLen 1135 JE avxGcmSm4EncDone 1136 1137 VMOVDQU (8*16 + 0*16)(SP), B0 1138 VMOVDQU (8*16 + 1*16)(SP), B1 1139 VMOVDQU (8*16 + 2*16)(SP), B2 1140 VMOVDQU (8*16 + 3*16)(SP), B3 1141 1142 AVX_SM4_4BLOCKS_WO_BS(rk, B6, B7, T1, T2, B0, B1, B2, B3) 1143 VMOVDQU B0, (16*0)(SP) 1144 VMOVDQU B1, (16*1)(SP) 1145 VMOVDQU B2, (16*2)(SP) 1146 VMOVDQU B3, (16*3)(SP) 1147 1148 VMOVDQU (16*14)(pTbl), T2 1149 MOVQ SP, BP 1150 1151 avxGcmSm4EncSinglesLoop: 1152 CMPQ ptxLen, $16 1153 JB avxGcmSm4EncTail 1154 SUBQ $16, ptxLen 1155 VMOVDQU (16*0)(BP), B0 1156 VMOVDQU (ptx), T0 1157 VPXOR T0, B0, B0 1158 VMOVDQU B0, (ctx) 1159 avxGcmEncDataStep(B0) 1160 LEAQ (16*1)(ptx), ptx 1161 LEAQ (16*1)(ctx), ctx 1162 ADDQ $16, BP 1163 JMP avxGcmSm4EncSinglesLoop 1164 1165 avxGcmSm4EncTail: 1166 TESTQ ptxLen, ptxLen 1167 JE avxGcmSm4EncDone 1168 VMOVDQU (16*0)(BP), B0 1169 VMOVDQU B0, T0 1170 1171 LEAQ -1(ptx)(ptxLen*1), ptx 1172 1173 MOVQ ptxLen, aluTMP 1174 SHLQ $4, aluTMP 1175 1176 LEAQ andMask<>(SB), aluCTR 1177 VMOVDQU -16(aluCTR)(aluTMP*1), T1 1178 VPXOR B0, B0, B0 1179 1180 avxPtxLoadLoop: 1181 VPSLLDQ $1, B0, B0 1182 VPINSRB $0, (ptx), B0, B0 1183 LEAQ -1(ptx), ptx 1184 DECQ ptxLen 1185 JNE avxPtxLoadLoop 1186 1187 VPXOR T0, B0, B0 1188 VPAND T1, B0, B0 1189 VMOVDQU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT 1190 avxGcmEncDataStep(B0) 1191 1192 avxGcmSm4EncDone: 1193 VMOVDQU ACC0, (tPtr) 1194 RET 1195 1196 avx2GcmSm4Enc: 1197 VMOVDQU bswap_mask<>(SB), BSWAP 1198 VMOVDQU gcmPoly<>(SB), POLY 1199 1200 VMOVDQU (tPtr), ACC0 1201 VPXOR ACC1, ACC1, ACC1 1202 VPXOR ACCM, ACCM, ACCM 1203 VMOVDQU (ctrPtr), T0 1204 VPSHUFB flip_mask<>(SB), T0, T0 1205 VPEXTRD $3, T0, aluCTR 1206 1207 VINSERTI128 $1, T0, Y11, Y11 1208 VMOVDQU Y11, (8*16 + 0*32)(SP) 1209 increment(0) 1210 increment(1) 1211 VMOVDQU Y11, (8*16 + 1*32)(SP) 1212 increment(2) 1213 increment(3) 1214 1215 CMPQ ptxLen, $128 1216 JB avx2GcmSm4EncNibbles 1217 SUBQ $128, ptxLen 1218 1219 // We have at least 8 blocks to encrypt, prepare the rest of the counters 1220 VMOVDQU Y11, (8*16 + 2*32)(SP) 1221 increment(4) 1222 increment(5) 1223 VMOVDQU Y11, (8*16 + 3*32)(SP) 1224 increment(6) 1225 increment(7) 1226 1227 VBROADCASTI128 bswap_mask<>(SB), DWBSWAP 1228 // load 8 ctrs for encryption 1229 VMOVDQU (4*32 + 0*32)(SP), DWB0 1230 VMOVDQU (4*32 + 1*32)(SP), DWB1 1231 VMOVDQU (4*32 + 2*32)(SP), DWB2 1232 VMOVDQU (4*32 + 3*32)(SP), DWB3 1233 
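
	// AVX2_SM4_8BLOCKS operates on a word-sliced state: after the 4x4 dword
	// transpose below, each YMM register holds the same 32-bit SM4 state word
	// of eight counter blocks (four per 128-bit lane); a second transpose
	// restores block order before the byte swap and the XOR with plaintext.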
1234 increment(0) 1235 // Transpose matrix 4 x 4 32bits word 1236 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD) 1237 1238 VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK 1239 increment(1) 1240 AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3) 1241 increment(2) 1242 // Transpose matrix 4 x 4 32bits word 1243 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD) 1244 1245 VPSHUFB DWBSWAP, DWB0, DWB0 1246 VPSHUFB DWBSWAP, DWB1, DWB1 1247 increment(3) 1248 VPSHUFB DWBSWAP, DWB2, DWB2 1249 VPSHUFB DWBSWAP, DWB3, DWB3 1250 increment(4) 1251 1252 // XOR plaintext 1253 VMOVDQU (32*0)(ptx), XDWTMP0 1254 VPXOR XDWTMP0, DWB0, DWB0 1255 VMOVDQU (32*1)(ptx), XDWTMP0 1256 VPXOR XDWTMP0, DWB1, DWB1 1257 increment(5) 1258 VMOVDQU (32*2)(ptx), XDWTMP0 1259 VPXOR XDWTMP0, DWB2, DWB2 1260 VMOVDQU (32*3)(ptx), XDWTMP0 1261 VPXOR XDWTMP0, DWB3, DWB3 1262 increment(6) 1263 1264 // Store ciphertext 1265 VMOVDQU DWB0, (32*0)(ctx) 1266 VPSHUFB DWBSWAP, DWB0, DWB0 1267 VMOVDQU DWB1, (32*1)(ctx) 1268 VPSHUFB DWBSWAP, DWB1, DWB1 1269 VMOVDQU DWB2, (32*2)(ctx) 1270 VPSHUFB DWBSWAP, DWB2, DWB2 1271 VMOVDQU DWB3, (32*3)(ctx) 1272 VPSHUFB DWBSWAP, DWB3, DWB3 1273 increment(7) 1274 //VPXOR XDWTMP0, XDWTMP0, XDWTMP0 1275 //VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0 1276 //VPXOR XDWTMP0, DWB0, DWB0 1277 PXOR ACC0, B0 // Can't call VPXOR here 1278 VMOVDQU DWB0, (32*0)(SP) 1279 VMOVDQU DWB1, (32*1)(SP) 1280 VMOVDQU DWB2, (32*2)(SP) 1281 VMOVDQU DWB3, (32*3)(SP) 1282 1283 LEAQ 128(ptx), ptx 1284 LEAQ 128(ctx), ctx 1285 1286 avx2GcmSm4EncOctetsLoop: 1287 CMPQ ptxLen, $128 1288 JB avx2GcmSm4EncOctetsEnd 1289 SUBQ $128, ptxLen 1290 1291 // load 8 ctrs for encryption 1292 VMOVDQU (4*32 + 0*32)(SP), DWB0 1293 VMOVDQU (4*32 + 1*32)(SP), DWB1 1294 VMOVDQU (4*32 + 2*32)(SP), DWB2 1295 VMOVDQU (4*32 + 3*32)(SP), DWB3 1296 1297 VMOVDQU (16*0)(SP), T0 1298 VPSHUFD $78, T0, T1 1299 VPXOR T0, T1, T1 1300 1301 VMOVDQU (16*0)(pTbl), ACC1 1302 VMOVDQU (16*1)(pTbl), ACCM 1303 1304 VPCLMULQDQ $0x00, T1, ACCM, ACCM 1305 VPCLMULQDQ $0x00, T0, ACC1, ACC0 1306 VPCLMULQDQ $0x11, T0, ACC1, ACC1 1307 1308 // Transpose matrix 4 x 4 32bits word 1309 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD) 1310 1311 AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3) 1312 1313 // Transpose matrix 4 x 4 32bits word 1314 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD) 1315 1316 VPSHUFB DWBSWAP, DWB0, DWB0 1317 VPSHUFB DWBSWAP, DWB1, DWB1 1318 VPSHUFB DWBSWAP, DWB2, DWB2 1319 VPSHUFB DWBSWAP, DWB3, DWB3 1320 1321 avxMulRound(1) 1322 increment(0) 1323 avxMulRound(2) 1324 increment(1) 1325 avxMulRound(3) 1326 increment(2) 1327 avxMulRound(4) 1328 increment(3) 1329 avxMulRound(5) 1330 increment(4) 1331 avxMulRound(6) 1332 increment(5) 1333 avxMulRound(7) 1334 increment(6) 1335 1336 VPXOR ACC0, ACCM, ACCM 1337 VPXOR ACC1, ACCM, ACCM 1338 VPSLLDQ $8, ACCM, T0 1339 VPSRLDQ $8, ACCM, ACCM 1340 1341 VPXOR ACCM, ACC1, ACC1 1342 VPXOR T0, ACC0, ACC0 1343 1344 increment(7) 1345 avxReduceRound(ACC0) 1346 avxReduceRound(ACC0) 1347 VPXOR ACC1, ACC0, ACC0 1348 1349 // XOR plaintext 1350 VPXOR (32*0)(ptx), DWB0, DWB0 1351 VPXOR (32*1)(ptx), DWB1, DWB1 1352 VPXOR (32*2)(ptx), DWB2, DWB2 1353 VPXOR (32*3)(ptx), DWB3, DWB3 1354 1355 // Store ciphertext 1356 VMOVDQU DWB0, (32*0)(ctx) 1357 VPSHUFB DWBSWAP, DWB0, DWB0 1358 VMOVDQU DWB1, (32*1)(ctx) 1359 VPSHUFB DWBSWAP, DWB1, DWB1 1360 VMOVDQU DWB2, (32*2)(ctx) 1361 VPSHUFB DWBSWAP, DWB2, DWB2 1362 VMOVDQU DWB3, (32*3)(ctx) 1363 VPSHUFB DWBSWAP, DWB3, DWB3 
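
	// The running GHASH value in ACC0 only has to be folded into the first
	// byte-swapped ciphertext block before it is staged on the stack for the
	// next GHASH pass. A legacy-SSE PXOR on B0 (the low 128 bits of DWB0) is
	// used below because a VEX-encoded 128-bit VPXOR would zero bits 255:128
	// of DWB0.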
1364 1365 //VPXOR XDWTMP0, XDWTMP0, XDWTMP0 1366 //VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0 1367 //VPXOR XDWTMP0, DWB0, DWB0 1368 PXOR ACC0, B0 // Can't call VPXOR here 1369 VMOVDQU DWB0, (32*0)(SP) 1370 VMOVDQU DWB1, (32*1)(SP) 1371 VMOVDQU DWB2, (32*2)(SP) 1372 VMOVDQU DWB3, (32*3)(SP) 1373 1374 LEAQ 128(ptx), ptx 1375 LEAQ 128(ctx), ctx 1376 1377 JMP avx2GcmSm4EncOctetsLoop 1378 1379 avx2GcmSm4EncOctetsEnd: 1380 VMOVDQU (16*0)(SP), T0 1381 VMOVDQU (16*0)(pTbl), ACC0 1382 VMOVDQU (16*1)(pTbl), ACCM 1383 VMOVDQU ACC0, ACC1 1384 VPSHUFD $78, T0, T1 1385 VPXOR T0, T1, T1 1386 VPCLMULQDQ $0x00, T0, ACC0, ACC0 1387 VPCLMULQDQ $0x11, T0, ACC1, ACC1 1388 VPCLMULQDQ $0x00, T1, ACCM, ACCM 1389 1390 avxMulRound(1) 1391 avxMulRound(2) 1392 avxMulRound(3) 1393 avxMulRound(4) 1394 avxMulRound(5) 1395 avxMulRound(6) 1396 avxMulRound(7) 1397 1398 VPXOR ACC0, ACCM, ACCM 1399 VPXOR ACC1, ACCM, ACCM 1400 VPSLLDQ $8, ACCM, T0 1401 VPSRLDQ $8, ACCM, ACCM 1402 1403 VPXOR ACCM, ACC1, ACC1 1404 VPXOR T0, ACC0, ACC0 1405 1406 avxReduceRound(ACC0) 1407 avxReduceRound(ACC0) 1408 VPXOR ACC1, ACC0, ACC0 1409 1410 TESTQ ptxLen, ptxLen 1411 JE avx2GcmSm4EncDone 1412 1413 SUBQ $4, aluCTR 1414 1415 avx2GcmSm4EncNibbles: 1416 CMPQ ptxLen, $64 1417 JBE avx2GcmSm4EncSingles 1418 SUBQ $64, ptxLen 1419 1420 VMOVDQU (8*16 + 0*16)(SP), B0 1421 VMOVDQU (8*16 + 1*16)(SP), B1 1422 VMOVDQU (8*16 + 2*16)(SP), B2 1423 VMOVDQU (8*16 + 3*16)(SP), B3 1424 1425 AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3) 1426 1427 VPXOR (16*0)(ptx), B0, B0 1428 VPXOR (16*1)(ptx), B1, B1 1429 VPXOR (16*2)(ptx), B2, B2 1430 VPXOR (16*3)(ptx), B3, B3 1431 1432 VMOVDQU B0, (16*0)(ctx) 1433 VMOVDQU B1, (16*1)(ctx) 1434 VMOVDQU B2, (16*2)(ctx) 1435 VMOVDQU B3, (16*3)(ctx) 1436 1437 VMOVDQU (16*14)(pTbl), T2 1438 avxGcmEncDataStep(B0) 1439 increment(0) 1440 avxGcmEncDataStep(B1) 1441 increment(1) 1442 avxGcmEncDataStep(B2) 1443 increment(2) 1444 avxGcmEncDataStep(B3) 1445 increment(3) 1446 1447 LEAQ 64(ptx), ptx 1448 LEAQ 64(ctx), ctx 1449 1450 avx2GcmSm4EncSingles: 1451 TESTQ ptxLen, ptxLen 1452 JE avx2GcmSm4EncDone 1453 1454 VMOVDQU (8*16 + 0*16)(SP), B0 1455 VMOVDQU (8*16 + 1*16)(SP), B1 1456 VMOVDQU (8*16 + 2*16)(SP), B2 1457 VMOVDQU (8*16 + 3*16)(SP), B3 1458 1459 AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3) 1460 1461 VMOVDQU B0, (16*0)(SP) 1462 VMOVDQU B1, (16*1)(SP) 1463 VMOVDQU B2, (16*2)(SP) 1464 VMOVDQU B3, (16*3)(SP) 1465 1466 VMOVDQU (16*14)(pTbl), T2 1467 MOVQ SP, BP 1468 1469 avx2GcmSm4EncSinglesLoop: 1470 CMPQ ptxLen, $16 1471 JB avx2GcmSm4EncTail 1472 SUBQ $16, ptxLen 1473 VMOVDQU (16*0)(BP), B0 1474 VMOVDQU (ptx), T0 1475 VPXOR T0, B0, B0 1476 VMOVDQU B0, (ctx) 1477 avxGcmEncDataStep(B0) 1478 LEAQ (16*1)(ptx), ptx 1479 LEAQ (16*1)(ctx), ctx 1480 ADDQ $16, BP 1481 JMP avx2GcmSm4EncSinglesLoop 1482 1483 avx2GcmSm4EncTail: 1484 TESTQ ptxLen, ptxLen 1485 JE avx2GcmSm4EncDone 1486 VMOVDQU (16*0)(BP), B0 1487 VMOVDQU B0, T0 1488 1489 LEAQ -1(ptx)(ptxLen*1), ptx 1490 1491 MOVQ ptxLen, aluTMP 1492 SHLQ $4, aluTMP 1493 1494 LEAQ andMask<>(SB), aluCTR 1495 VMOVDQU -16(aluCTR)(aluTMP*1), T1 1496 VPXOR B0, B0, B0 1497 1498 avx2PtxLoadLoop: 1499 VPSLLDQ $1, B0, B0 1500 VPINSRB $0, (ptx), B0, B0 1501 LEAQ -1(ptx), ptx 1502 DECQ ptxLen 1503 JNE avx2PtxLoadLoop 1504 1505 VPXOR T0, B0, B0 1506 VPAND T1, B0, B0 1507 VMOVDQU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT 1508 avxGcmEncDataStep(B0) 1509 1510 avx2GcmSm4EncDone: 1511 VMOVDQU ACC0, (tPtr) 1512 VZEROUPPER 1513 RET 1514 1515 #undef increment 1516 
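
// Note on stack frames: gcmSm4Enc above uses a 256-byte frame, staging the
// byte-swapped ciphertext blocks at SP+0..127 (ciphertext produced in one pass
// through the octets loop is folded into GHASH at the start of the next) and
// keeping the counter blocks at SP+128..255. gcmSm4Dec below gets by with a
// 128-byte frame, because on decryption GHASH runs over the ciphertext, which
// can be read directly from the source buffer (see decMulRound/decGhashRound).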
1517 // func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) 1518 TEXT ·gcmSm4Dec(SB),0,$128-96 1519 #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, (3*4 + i*16)(SP) 1520 1521 #define decMulRound(i) \ 1522 MOVOU (16*i)(ctx), T0;\ 1523 PSHUFB BSWAP, T0;\ 1524 internalDecMulRound(i) 1525 1526 #define internalDecMulRound(i) \ 1527 MOVOU (16*(i*2))(pTbl), T1;\ 1528 MOVOU T1, T2;\ 1529 PCLMULQDQ $0x00, T0, T1;\ 1530 PXOR T1, ACC0;\ 1531 PSHUFD $78, T0, T1;\ 1532 PCLMULQDQ $0x11, T0, T2;\ 1533 PXOR T1, T0;\ 1534 PXOR T2, ACC1;\ 1535 MOVOU (16*(i*2+1))(pTbl), T2;\ 1536 PCLMULQDQ $0x00, T2, T0;\ 1537 PXOR T0, ACCM 1538 1539 #define decGhashRound(i) \ 1540 MOVOU (16*i)(ctx), B0; \ 1541 internalDecGhashRound() 1542 1543 #define internalDecGhashRound() \ 1544 PSHUFB BSWAP, B0; \ 1545 PXOR ACC0, B0; \ 1546 MOVOU T2, ACC0; \ 1547 MOVOU T2, ACC1; \ 1548 MOVOU (16*15)(pTbl), ACCM; \ 1549 PCLMULQDQ $0x00, B0, ACC0; \ 1550 PCLMULQDQ $0x11, B0, ACC1; \ 1551 PSHUFD $78, B0, T0; \ 1552 PXOR B0, T0; \ 1553 PCLMULQDQ $0x00, T0, ACCM; \ 1554 PXOR ACC0, ACCM; \ 1555 PXOR ACC1, ACCM; \ 1556 MOVOU ACCM, T0; \ 1557 PSRLDQ $8, ACCM; \ 1558 PSLLDQ $8, T0; \ 1559 PXOR ACCM, ACC1; \ 1560 PXOR T0, ACC0; \ 1561 reduceRound(ACC0); \ 1562 reduceRound(ACC0); \ 1563 PXOR ACC1, ACC0 1564 1565 #define avxDecMulRound(i) \ 1566 VMOVDQU (16*i)(ctx), T0;\ 1567 VPSHUFB BSWAP, T0, T0;\ 1568 VMOVDQU (16*(i*2))(pTbl), T2;\ 1569 VPCLMULQDQ $0x00, T0, T2, T1;\ 1570 VPXOR T1, ACC0, ACC0;\ 1571 VPSHUFD $78, T0, T1;\ 1572 VPCLMULQDQ $0x11, T0, T2, T2;\ 1573 VPXOR T1, T0, T0;\ 1574 VPXOR T2, ACC1, ACC1;\ 1575 VMOVDQU (16*(i*2+1))(pTbl), T2;\ 1576 VPCLMULQDQ $0x00, T2, T0, T0;\ 1577 VPXOR T0, ACCM, ACCM 1578 1579 #define internalAvxDecGhashRound() \ 1580 VPSHUFB BSWAP, B0, B0; \ 1581 VPXOR ACC0, B0, B0; \ 1582 VMOVDQU (16*15)(pTbl), ACCM; \ 1583 VPCLMULQDQ $0x00, B0, T2, ACC0; \ 1584 VPCLMULQDQ $0x11, B0, T2, ACC1; \ 1585 VPSHUFD $78, B0, T0; \ 1586 VPXOR B0, T0, T0; \ 1587 VPCLMULQDQ $0x00, T0, ACCM, ACCM; \ 1588 VPXOR ACC0, ACCM, ACCM; \ 1589 VPXOR ACC1, ACCM, ACCM; \ 1590 VPSLLDQ $8, ACCM, T0; \ 1591 VPSRLDQ $8, ACCM, ACCM; \ 1592 VPXOR ACCM, ACC1, ACC1; \ 1593 VPXOR T0, ACC0, ACC0; \ 1594 avxReduceRound(ACC0); \ 1595 avxReduceRound(ACC0); \ 1596 VPXOR ACC1, ACC0, ACC0 1597 1598 MOVQ productTable+0(FP), pTbl 1599 MOVQ dst+8(FP), ptx 1600 MOVQ src_base+32(FP), ctx 1601 MOVQ src_len+40(FP), ptxLen 1602 MOVQ ctr+56(FP), ctrPtr 1603 MOVQ T+64(FP), tPtr 1604 MOVQ rk_base+72(FP), rk 1605 1606 CMPB ·useAVX2(SB), $1 1607 JE avx2GcmSm4Dec 1608 1609 CMPB ·useAVX(SB), $1 1610 JE avxGcmSm4Dec 1611 1612 MOVOU bswap_mask<>(SB), BSWAP 1613 MOVOU gcmPoly<>(SB), POLY 1614 1615 MOVOU (tPtr), ACC0 1616 PXOR ACC1, ACC1 1617 PXOR ACCM, ACCM 1618 MOVOU (ctrPtr), T0 1619 PSHUFB flip_mask<>(SB), T0 1620 PEXTRD $3, T0, aluCTR 1621 1622 MOVOU T0, (0*16)(SP) 1623 increment(0) 1624 MOVOU T0, (1*16)(SP) 1625 increment(1) 1626 MOVOU T0, (2*16)(SP) 1627 increment(2) 1628 MOVOU T0, (3*16)(SP) 1629 increment(3) 1630 1631 CMPQ ptxLen, $128 1632 JB gcmSm4DecNibbles 1633 1634 // We have at least 8 blocks to dencrypt, prepare the rest of the counters 1635 MOVOU T0, (4*16)(SP) 1636 increment(4) 1637 MOVOU T0, (5*16)(SP) 1638 increment(5) 1639 MOVOU T0, (6*16)(SP) 1640 increment(6) 1641 MOVOU T0, (7*16)(SP) 1642 increment(7) 1643 1644 gcmSm4DecOctetsLoop: 1645 CMPQ ptxLen, $128 1646 JB gcmSm4DecEndOctets 1647 SUBQ $128, ptxLen 1648 1649 MOVOU (0*16)(SP), B0 1650 MOVOU (1*16)(SP), B1 1651 MOVOU (2*16)(SP), B2 1652 MOVOU (3*16)(SP), B3 1653 
MOVOU (4*16)(SP), B4 1654 MOVOU (5*16)(SP), B5 1655 MOVOU (6*16)(SP), B6 1656 MOVOU (7*16)(SP), B7 1657 1658 MOVOU (16*0)(ctx), T0 1659 PSHUFB BSWAP, T0 1660 PXOR ACC0, T0 1661 PSHUFD $78, T0, T1 1662 PXOR T0, T1 1663 1664 MOVOU (16*0)(pTbl), ACC0 1665 MOVOU (16*1)(pTbl), ACCM 1666 MOVOU ACC0, ACC1 1667 1668 PCLMULQDQ $0x00, T1, ACCM 1669 PCLMULQDQ $0x00, T0, ACC0 1670 PCLMULQDQ $0x11, T0, ACC1 1671 1672 decMulRound(1) 1673 increment(0) 1674 decMulRound(2) 1675 increment(1) 1676 decMulRound(3) 1677 increment(2) 1678 decMulRound(4) 1679 increment(3) 1680 decMulRound(5) 1681 increment(4) 1682 decMulRound(6) 1683 increment(5) 1684 decMulRound(7) 1685 increment(6) 1686 increment(7) 1687 1688 PXOR ACC0, ACCM 1689 PXOR ACC1, ACCM 1690 MOVOU ACCM, T0 1691 PSRLDQ $8, ACCM 1692 PSLLDQ $8, T0 1693 PXOR ACCM, ACC1 1694 PXOR T0, ACC0 1695 1696 reduceRound(ACC0) 1697 reduceRound(ACC0) 1698 PXOR ACC1, ACC0 1699 1700 SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) 1701 1702 MOVOU (16*0)(ctx), T0 1703 PXOR T0, B0 1704 MOVOU (16*1)(ctx), T0 1705 PXOR T0, B1 1706 MOVOU (16*2)(ctx), T0 1707 PXOR T0, B2 1708 MOVOU (16*3)(ctx), T0 1709 PXOR T0, B3 1710 MOVOU (16*4)(ctx), T0 1711 PXOR T0, B4 1712 MOVOU (16*5)(ctx), T0 1713 PXOR T0, B5 1714 MOVOU (16*6)(ctx), T0 1715 PXOR T0, B6 1716 MOVOU (16*7)(ctx), T0 1717 PXOR T0, B7 1718 1719 MOVOU B0, (16*0)(ptx) 1720 MOVOU B1, (16*1)(ptx) 1721 MOVOU B2, (16*2)(ptx) 1722 MOVOU B3, (16*3)(ptx) 1723 MOVOU B4, (16*4)(ptx) 1724 MOVOU B5, (16*5)(ptx) 1725 MOVOU B6, (16*6)(ptx) 1726 MOVOU B7, (16*7)(ptx) 1727 1728 LEAQ 128(ptx), ptx 1729 LEAQ 128(ctx), ctx 1730 1731 JMP gcmSm4DecOctetsLoop 1732 1733 gcmSm4DecEndOctets: 1734 SUBQ $4, aluCTR 1735 1736 gcmSm4DecNibbles: 1737 CMPQ ptxLen, $64 1738 JBE gcmSm4DecSingles 1739 SUBQ $64, ptxLen 1740 1741 MOVOU (0*16)(SP), B4 1742 MOVOU (1*16)(SP), B5 1743 MOVOU (2*16)(SP), B6 1744 MOVOU (3*16)(SP), B7 1745 1746 SM4_4BLOCKS_WO_BS(rk, B0, T0, T1, T2, B4, B5, B6, B7) 1747 MOVOU (16*14)(pTbl), T2 1748 1749 MOVOU (16*0)(ctx), B0 1750 PXOR B0, B4 1751 internalDecGhashRound() 1752 increment(0) 1753 MOVOU (16*1)(ctx), B0 1754 PXOR B0, B5 1755 internalDecGhashRound() 1756 increment(1) 1757 MOVOU (16*2)(ctx), B0 1758 PXOR B0, B6 1759 internalDecGhashRound() 1760 increment(2) 1761 MOVOU (16*3)(ctx), B0 1762 PXOR B0, B7 1763 internalDecGhashRound() 1764 increment(3) 1765 1766 MOVOU B4, (16*0)(ptx) 1767 MOVOU B5, (16*1)(ptx) 1768 MOVOU B6, (16*2)(ptx) 1769 MOVOU B7, (16*3)(ptx) 1770 1771 LEAQ 64(ptx), ptx 1772 LEAQ 64(ctx), ctx 1773 1774 gcmSm4DecSingles: 1775 TESTQ ptxLen, ptxLen 1776 JE gcmSm4DecDone 1777 MOVOU (0*16)(SP), B0 1778 MOVOU (1*16)(SP), B1 1779 MOVOU (2*16)(SP), B2 1780 MOVOU (3*16)(SP), B3 1781 1782 SM4_4BLOCKS_WO_BS(rk, B4, T0, T1, T2, B0, B1, B2, B3) 1783 MOVOU B0, (16*4)(SP) 1784 MOVOU B1, (16*5)(SP) 1785 MOVOU B2, (16*6)(SP) 1786 MOVOU B3, (16*7)(SP) 1787 1788 MOVOU (16*14)(pTbl), T2 1789 MOVQ SP, BP 1790 ADDQ $64, BP 1791 1792 gcmSm4DecSinglesLoop: 1793 CMPQ ptxLen, $16 1794 JB gcmSm4DecTail 1795 SUBQ $16, ptxLen 1796 1797 MOVOU (16*0)(BP), B1 1798 MOVOU (ctx), T0 1799 PXOR T0, B1 1800 1801 decGhashRound(0) 1802 MOVOU B1, (ptx) 1803 1804 LEAQ (16*1)(ptx), ptx 1805 LEAQ (16*1)(ctx), ctx 1806 ADDQ $16, BP 1807 JMP gcmSm4DecSinglesLoop 1808 1809 gcmSm4DecTail: 1810 TESTQ ptxLen, ptxLen 1811 JE gcmSm4DecDone 1812 1813 MOVQ ptxLen, aluTMP 1814 SHLQ $4, aluTMP 1815 LEAQ andMask<>(SB), aluCTR 1816 MOVOU -16(aluCTR)(aluTMP*1), T1 1817 1818 MOVOU (ctx), B0 // I assume there is TAG attached to the ctx, and there is 
no read overflow 1819 PAND T1, B0 1820 1821 MOVOU B0, T1 1822 internalDecGhashRound() 1823 1824 MOVOU (16*0)(BP), B0 1825 PXOR T1, B0 1826 1827 ptxStoreLoop: 1828 PEXTRB $0, B0, (ptx) 1829 PSRLDQ $1, B0 1830 LEAQ 1(ptx), ptx 1831 DECQ ptxLen 1832 1833 JNE ptxStoreLoop 1834 1835 gcmSm4DecDone: 1836 MOVOU ACC0, (tPtr) 1837 RET 1838 1839 avxGcmSm4Dec: 1840 VMOVDQU bswap_mask<>(SB), BSWAP 1841 VMOVDQU gcmPoly<>(SB), POLY 1842 1843 VMOVDQU (tPtr), ACC0 1844 VPXOR ACC1, ACC1, ACC1 1845 VPXOR ACCM, ACCM, ACCM 1846 VMOVDQU (ctrPtr), T0 1847 VPSHUFB flip_mask<>(SB), T0, T0 1848 VPEXTRD $3, T0, aluCTR 1849 1850 VMOVDQU T0, (0*16)(SP) 1851 increment(0) 1852 VMOVDQU T0, (1*16)(SP) 1853 increment(1) 1854 VMOVDQU T0, (2*16)(SP) 1855 increment(2) 1856 VMOVDQU T0, (3*16)(SP) 1857 increment(3) 1858 1859 CMPQ ptxLen, $128 1860 JB avxGcmSm4DecNibbles 1861 1862 // We have at least 8 blocks to dencrypt, prepare the rest of the counters 1863 VMOVDQU T0, (4*16)(SP) 1864 increment(4) 1865 VMOVDQU T0, (5*16)(SP) 1866 increment(5) 1867 VMOVDQU T0, (6*16)(SP) 1868 increment(6) 1869 VMOVDQU T0, (7*16)(SP) 1870 increment(7) 1871 1872 avxGcmSm4DecOctetsLoop: 1873 CMPQ ptxLen, $128 1874 JB avxGcmSm4DecEndOctets 1875 SUBQ $128, ptxLen 1876 1877 VMOVDQU (0*16)(SP), B0 1878 VMOVDQU (1*16)(SP), B1 1879 VMOVDQU (2*16)(SP), B2 1880 VMOVDQU (3*16)(SP), B3 1881 VMOVDQU (4*16)(SP), B4 1882 VMOVDQU (5*16)(SP), B5 1883 VMOVDQU (6*16)(SP), B6 1884 VMOVDQU (7*16)(SP), B7 1885 1886 VMOVDQU (16*0)(ctx), T0 1887 VPSHUFB BSWAP, T0, T0 1888 VPXOR ACC0, T0, T0 1889 VPSHUFD $78, T0, T1 1890 VPXOR T0, T1, T1 1891 1892 VMOVDQU (16*0)(pTbl), ACC1 1893 VMOVDQU (16*1)(pTbl), ACCM 1894 1895 VPCLMULQDQ $0x00, T1, ACCM, ACCM 1896 VPCLMULQDQ $0x00, T0, ACC1, ACC0 1897 VPCLMULQDQ $0x11, T0, ACC1, ACC1 1898 1899 avxDecMulRound(1) 1900 increment(0) 1901 avxDecMulRound(2) 1902 increment(1) 1903 avxDecMulRound(3) 1904 increment(2) 1905 avxDecMulRound(4) 1906 increment(3) 1907 avxDecMulRound(5) 1908 increment(4) 1909 avxDecMulRound(6) 1910 increment(5) 1911 avxDecMulRound(7) 1912 increment(6) 1913 1914 VPXOR ACC0, ACCM, ACCM 1915 VPXOR ACC1, ACCM, ACCM 1916 1917 VPSLLDQ $8, ACCM, T0 1918 VPSRLDQ $8, ACCM, ACCM 1919 1920 VPXOR ACCM, ACC1, ACC1 1921 VPXOR T0, ACC0, ACC0 1922 1923 increment(7) 1924 avxReduceRound(ACC0) 1925 avxReduceRound(ACC0) 1926 VPXOR ACC1, ACC0, ACC0 1927 1928 AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) 1929 1930 VPXOR (16*0)(ctx), B0, B0 1931 VPXOR (16*1)(ctx), B1, B1 1932 VPXOR (16*2)(ctx), B2, B2 1933 VPXOR (16*3)(ctx), B3, B3 1934 VPXOR (16*4)(ctx), B4, B4 1935 VPXOR (16*5)(ctx), B5, B5 1936 VPXOR (16*6)(ctx), B6, B6 1937 VPXOR (16*7)(ctx), B7, B7 1938 1939 VMOVDQU B0, (16*0)(ptx) 1940 VMOVDQU B1, (16*1)(ptx) 1941 VMOVDQU B2, (16*2)(ptx) 1942 VMOVDQU B3, (16*3)(ptx) 1943 VMOVDQU B4, (16*4)(ptx) 1944 VMOVDQU B5, (16*5)(ptx) 1945 VMOVDQU B6, (16*6)(ptx) 1946 VMOVDQU B7, (16*7)(ptx) 1947 1948 LEAQ 128(ptx), ptx 1949 LEAQ 128(ctx), ctx 1950 1951 JMP avxGcmSm4DecOctetsLoop 1952 1953 avxGcmSm4DecEndOctets: 1954 SUBQ $4, aluCTR 1955 1956 avxGcmSm4DecNibbles: 1957 CMPQ ptxLen, $64 1958 JBE avxGcmSm4DecSingles 1959 SUBQ $64, ptxLen 1960 1961 VMOVDQU (0*16)(SP), B4 1962 VMOVDQU (1*16)(SP), B5 1963 VMOVDQU (2*16)(SP), B6 1964 VMOVDQU (3*16)(SP), B7 1965 1966 AVX_SM4_4BLOCKS_WO_BS(rk, B0, B1, T1, T2, B4, B5, B6, B7) 1967 1968 VMOVDQU (16*14)(pTbl), T2 1969 VMOVDQU (16*0)(ctx), B0 1970 VPXOR B0, B4, B4 1971 internalAvxDecGhashRound() 1972 increment(0) 1973 1974 VMOVDQU (16*1)(ctx), B0 1975 VPXOR B0, B5, B5 1976 
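	// B0 still holds the raw ciphertext block that was just consumed; the
	// GHASH round below absorbs it, while B5 now carries the plaintext.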
internalAvxDecGhashRound() 1977 increment(1) 1978 1979 VMOVDQU (16*2)(ctx), B0 1980 VPXOR B0, B6, B6 1981 internalAvxDecGhashRound() 1982 increment(2) 1983 1984 VMOVDQU (16*3)(ctx), B0 1985 VPXOR B0, B7, B7 1986 internalAvxDecGhashRound() 1987 increment(3) 1988 1989 VMOVDQU B4, (16*0)(ptx) 1990 VMOVDQU B5, (16*1)(ptx) 1991 VMOVDQU B6, (16*2)(ptx) 1992 VMOVDQU B7, (16*3)(ptx) 1993 1994 LEAQ 64(ptx), ptx 1995 LEAQ 64(ctx), ctx 1996 1997 avxGcmSm4DecSingles: 1998 TESTQ ptxLen, ptxLen 1999 JE avxGcmSm4DecDone 2000 2001 VMOVDQU (0*16)(SP), B0 2002 VMOVDQU (1*16)(SP), B1 2003 VMOVDQU (2*16)(SP), B2 2004 VMOVDQU (3*16)(SP), B3 2005 2006 AVX_SM4_4BLOCKS_WO_BS(rk, B7, B6, B5, B4, B0, B1, B2, B3) 2007 VMOVDQU B0, (16*4)(SP) 2008 VMOVDQU B1, (16*5)(SP) 2009 VMOVDQU B2, (16*6)(SP) 2010 VMOVDQU B3, (16*7)(SP) 2011 2012 VMOVDQU (16*14)(pTbl), T2 2013 MOVQ SP, BP 2014 ADDQ $64, BP 2015 2016 avxGcmSm4DecSinglesLoop: 2017 CMPQ ptxLen, $16 2018 JB avxGcmSm4DecTail 2019 SUBQ $16, ptxLen 2020 2021 VMOVDQU (16*0)(BP), T0 2022 VMOVDQU (ctx), B0 2023 VPXOR T0, B0, T0 2024 VMOVDQU T0, (ptx) 2025 2026 internalAvxDecGhashRound() 2027 2028 LEAQ (16*1)(ptx), ptx 2029 LEAQ (16*1)(ctx), ctx 2030 ADDQ $16, BP 2031 JMP avxGcmSm4DecSinglesLoop 2032 2033 avxGcmSm4DecTail: 2034 TESTQ ptxLen, ptxLen 2035 JE avxGcmSm4DecDone 2036 2037 MOVQ ptxLen, aluTMP 2038 SHLQ $4, aluTMP 2039 LEAQ andMask<>(SB), aluCTR 2040 VMOVDQU -16(aluCTR)(aluTMP*1), T1 2041 2042 VMOVDQU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow 2043 VPAND T1, B0, B0 2044 2045 VMOVDQU B0, T1 2046 internalAvxDecGhashRound() 2047 2048 VMOVDQU (16*0)(BP), B0 2049 VPXOR T1, B0, B0 2050 2051 avxPtxStoreLoop: 2052 VPEXTRB $0, B0, (ptx) 2053 VPSRLDQ $1, B0, B0 2054 LEAQ 1(ptx), ptx 2055 DECQ ptxLen 2056 2057 JNE avxPtxStoreLoop 2058 2059 avxGcmSm4DecDone: 2060 VMOVDQU ACC0, (tPtr) 2061 RET 2062 2063 avx2GcmSm4Dec: 2064 VMOVDQU bswap_mask<>(SB), BSWAP 2065 VMOVDQU gcmPoly<>(SB), POLY 2066 2067 VMOVDQU (tPtr), ACC0 2068 VPXOR ACC1, ACC1, ACC1 2069 VPXOR ACCM, ACCM, ACCM 2070 VMOVDQU (ctrPtr), T0 2071 VPSHUFB flip_mask<>(SB), T0, T0 2072 VPEXTRD $3, T0, aluCTR 2073 2074 VINSERTI128 $1, T0, Y11, Y11 2075 VMOVDQU Y11, (0*32)(SP) 2076 increment(0) 2077 increment(1) 2078 VMOVDQU Y11, (1*32)(SP) 2079 increment(2) 2080 increment(3) 2081 2082 CMPQ ptxLen, $128 2083 JB avx2GcmSm4DecNibbles 2084 2085 // We have at least 8 blocks to dencrypt, prepare the rest of the counters 2086 VMOVDQU Y11, (2*32)(SP) 2087 increment(4) 2088 increment(5) 2089 VMOVDQU Y11, (3*32)(SP) 2090 increment(6) 2091 increment(7) 2092 2093 VBROADCASTI128 bswap_mask<>(SB), DWBSWAP 2094 VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK 2095 2096 avx2GcmSm4DecOctetsLoop: 2097 CMPQ ptxLen, $128 2098 JB avx2GcmSm4DecEndOctets 2099 SUBQ $128, ptxLen 2100 2101 // load 8 ctrs for encryption 2102 VMOVDQU (0*32)(SP), DWB0 2103 VMOVDQU (1*32)(SP), DWB1 2104 VMOVDQU (2*32)(SP), DWB2 2105 VMOVDQU (3*32)(SP), DWB3 2106 2107 VMOVDQU (16*0)(ctx), T0 2108 VPSHUFB BSWAP, T0, T0 2109 VPXOR ACC0, T0, T0 2110 VPSHUFD $78, T0, T1 2111 VPXOR T0, T1, T1 2112 2113 VMOVDQU (16*0)(pTbl), ACC1 2114 VMOVDQU (16*1)(pTbl), ACCM 2115 2116 VPCLMULQDQ $0x00, T1, ACCM, ACCM 2117 VPCLMULQDQ $0x00, T0, ACC1, ACC0 2118 VPCLMULQDQ $0x11, T0, ACC1, ACC1 2119 2120 avxDecMulRound(1) 2121 increment(0) 2122 avxDecMulRound(2) 2123 increment(1) 2124 avxDecMulRound(3) 2125 increment(2) 2126 avxDecMulRound(4) 2127 increment(3) 2128 avxDecMulRound(5) 2129 increment(4) 2130 avxDecMulRound(6) 2131 increment(5) 2132 
avxDecMulRound(7) 2133 increment(6) 2134 2135 VPXOR ACC0, ACCM, ACCM 2136 VPXOR ACC1, ACCM, ACCM 2137 VPSLLDQ $8, ACCM, T0 2138 VPSRLDQ $8, ACCM, ACCM 2139 2140 VPXOR ACCM, ACC1, ACC1 2141 VPXOR T0, ACC0, ACC0 2142 increment(7) 2143 2144 avxReduceRound(ACC0) 2145 avxReduceRound(ACC0) 2146 VPXOR ACC1, ACC0, ACC0 2147 2148 // Transpose matrix 4 x 4 32bits word 2149 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD) 2150 2151 AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3) 2152 2153 // Transpose matrix 4 x 4 32bits word 2154 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD) 2155 2156 VPSHUFB DWBSWAP, DWB0, DWB0 2157 VPSHUFB DWBSWAP, DWB1, DWB1 2158 VPSHUFB DWBSWAP, DWB2, DWB2 2159 VPSHUFB DWBSWAP, DWB3, DWB3 2160 2161 VPXOR (32*0)(ctx), DWB0, DWB0 2162 VPXOR (32*1)(ctx), DWB1, DWB1 2163 VPXOR (32*2)(ctx), DWB2, DWB2 2164 VPXOR (32*3)(ctx), DWB3, DWB3 2165 2166 VMOVDQU DWB0, (32*0)(ptx) 2167 VMOVDQU DWB1, (32*1)(ptx) 2168 VMOVDQU DWB2, (32*2)(ptx) 2169 VMOVDQU DWB3, (32*3)(ptx) 2170 2171 LEAQ 128(ptx), ptx 2172 LEAQ 128(ctx), ctx 2173 2174 JMP avx2GcmSm4DecOctetsLoop 2175 2176 avx2GcmSm4DecEndOctets: 2177 SUBQ $4, aluCTR 2178 2179 avx2GcmSm4DecNibbles: 2180 CMPQ ptxLen, $64 2181 JBE avx2GcmSm4DecSingles 2182 SUBQ $64, ptxLen 2183 2184 VMOVDQU (0*16)(SP), B4 2185 VMOVDQU (1*16)(SP), B1 2186 VMOVDQU (2*16)(SP), B2 2187 VMOVDQU (3*16)(SP), B3 2188 2189 AVX_SM4_4BLOCKS_WO_BS(rk, B0, B5, B6, B7, B4, B1, B2, B3) 2190 2191 VMOVDQU (16*14)(pTbl), T2 2192 VMOVDQU (16*0)(ctx), B0 2193 VPXOR B0, B4, B4 2194 increment(0) 2195 internalAvxDecGhashRound() 2196 2197 VMOVDQU (16*1)(ctx), B0 2198 VPXOR B0, B1, B1 2199 increment(1) 2200 internalAvxDecGhashRound() 2201 2202 VMOVDQU (16*2)(ctx), B0 2203 VPXOR B0, B2, B2 2204 increment(2) 2205 internalAvxDecGhashRound() 2206 2207 VMOVDQU (16*3)(ctx), B0 2208 VPXOR B0, B3, B3 2209 increment(3) 2210 internalAvxDecGhashRound() 2211 2212 VMOVDQU B4, (16*0)(ptx) 2213 VMOVDQU B1, (16*1)(ptx) 2214 VMOVDQU B2, (16*2)(ptx) 2215 VMOVDQU B3, (16*3)(ptx) 2216 2217 LEAQ 64(ptx), ptx 2218 LEAQ 64(ctx), ctx 2219 2220 avx2GcmSm4DecSingles: 2221 TESTQ ptxLen, ptxLen 2222 JE avx2GcmSm4DecDone 2223 2224 VMOVDQU (0*16)(SP), B0 2225 VMOVDQU (1*16)(SP), B1 2226 VMOVDQU (2*16)(SP), B2 2227 VMOVDQU (3*16)(SP), B3 2228 2229 AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3) 2230 2231 VMOVDQU B0, (16*4)(SP) 2232 VMOVDQU B1, (16*5)(SP) 2233 VMOVDQU B2, (16*6)(SP) 2234 VMOVDQU B3, (16*7)(SP) 2235 2236 VMOVDQU (16*14)(pTbl), T2 2237 MOVQ SP, BP 2238 ADDQ $64, BP 2239 2240 avx2GcmSm4DecSinglesLoop: 2241 CMPQ ptxLen, $16 2242 JB avx2GcmSm4DecTail 2243 SUBQ $16, ptxLen 2244 2245 VMOVDQU (16*0)(BP), T0 2246 VMOVDQU (ctx), B0 2247 VPXOR T0, B0, T0 2248 VMOVDQU T0, (ptx) 2249 2250 internalAvxDecGhashRound() 2251 LEAQ (16*1)(ptx), ptx 2252 LEAQ (16*1)(ctx), ctx 2253 ADDQ $16, BP 2254 JMP avx2GcmSm4DecSinglesLoop 2255 2256 avx2GcmSm4DecTail: 2257 TESTQ ptxLen, ptxLen 2258 JE avx2GcmSm4DecDone 2259 2260 MOVQ ptxLen, aluTMP 2261 SHLQ $4, aluTMP 2262 LEAQ andMask<>(SB), aluCTR 2263 VMOVDQU -16(aluCTR)(aluTMP*1), T1 // Fetch and-mask according ptxLen 2264 2265 VMOVDQU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow 2266 VPAND T1, B0, B0 // Just keep ptxLen bytes, others will be zero 2267 2268 VMOVDQU B0, T1 2269 internalAvxDecGhashRound() 2270 VMOVDQU (16*0)(BP), B0 2271 VPXOR T1, B0, B0 2272 2273 avx2PtxStoreLoop: 2274 VPEXTRB $0, B0, (ptx) 2275 VPSRLDQ $1, B0, B0 2276 LEAQ 1(ptx), ptx 2277 DECQ ptxLen 2278 2279 JNE 
avx2PtxStoreLoop 2280 2281 avx2GcmSm4DecDone: 2282 VMOVDQU ACC0, (tPtr) 2283 VZEROUPPER 2284 RET 2285 2286 // func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) 2287 TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0 2288 RET 2289 2290 // func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) 2291 TEXT ·gcmSm4niDec(SB),NOSPLIT,$0 2292 RET
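
// For reference, a sketch of how these routines are typically driven from Go,
// kept as a comment so that this file stays pure assembly. Apart from the
// function names and signatures declared above, the variable names and the way
// counter/tagMask are derived are illustrative assumptions and need not match
// the package's actual Go wrapper.
//
//	var productTable [256]byte
//	gcmSm4Init(&productTable, rk) // H = SM4(rk, 0), doubled, plus powers of H
//
//	var counter, tagMask, tagOut [16]byte
//	// counter and tagMask are assumed to be derived from the nonce by the
//	// caller, e.g. tagMask = SM4(rk, J0) for the 96-bit nonce construction.
//
//	gcmSm4Data(&productTable, aad, &tagOut) // GHASH over the AAD
//	gcmSm4Enc(&productTable, dst, plaintext, &counter, &tagOut, rk)
//	gcmSm4Finish(&productTable, &tagMask, &tagOut,
//		uint64(len(plaintext)), uint64(len(aad)))
//	// tagOut now holds the authentication tag; decryption uses gcmSm4Dec and
//	// then recomputes and compares the tag (assumed caller behavior).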