github.com/emmansun/gmsm@v0.29.1/sm4/xts_amd64.s

//go:build !purego

#include "textflag.h"

#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7

#define TW X10

#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define NIBBLE_MASK Y13
#define X_NIBBLE_MASK X13
#define BSWAP X15
#define DWBSWAP Y15

DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000087
DATA gcmPoly<>+0x08(SB)/8, $0x0000000000000000

DATA gbGcmPoly<>+0x00(SB)/8, $0x0000000000000000
DATA gbGcmPoly<>+0x08(SB)/8, $0xe100000000000000

GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16

#include "aesni_macros_amd64.s"

#define mul2GBInline \
	PSHUFB BSWAP, TW; \
	\// TW * 2
	MOVOU TW, T0; \
	PSHUFD $0, TW, T1; \
	PSRLQ $1, TW; \
	PSLLQ $63, T0; \
	PSRLDQ $8, T0; \
	POR T0, TW; \
	\// reduction
	PSLLL $31, T1; \
	PSRAL $31, T1; \
	PAND POLY, T1; \
	PXOR T1, TW; \
	PSHUFB BSWAP, TW

#define avxMul2GBInline \
	VPSHUFB BSWAP, TW, TW; \
	\// TW * 2
	VPSLLQ $63, TW, T0; \
	VPSHUFD $0, TW, T1; \
	VPSRLQ $1, TW, TW; \
	VPSRLDQ $8, T0, T0; \
	VPOR T0, TW, TW; \
	\// reduction
	VPSLLD $31, T1, T1; \
	VPSRAD $31, T1, T1; \
	VPAND POLY, T1, T1; \
	VPXOR T1, TW, TW; \
	VPSHUFB BSWAP, TW, TW

#define prepareGB4Tweaks \
	MOVOU TW, (16*0)(SP); \
	mul2GBInline; \
	MOVOU TW, (16*1)(SP); \
	mul2GBInline; \
	MOVOU TW, (16*2)(SP); \
	mul2GBInline; \
	MOVOU TW, (16*3)(SP); \
	mul2GBInline

#define prepareGB8Tweaks \
	prepareGB4Tweaks; \
	MOVOU TW, (16*4)(SP); \
	mul2GBInline; \
	MOVOU TW, (16*5)(SP); \
	mul2GBInline; \
	MOVOU TW, (16*6)(SP); \
	mul2GBInline; \
	MOVOU TW, (16*7)(SP); \
	mul2GBInline

#define avxPrepareGB4Tweaks \
	VMOVDQU TW, (16*0)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*1)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*2)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*3)(SP); \
	avxMul2GBInline

#define avxPrepareGB8Tweaks \
	avxPrepareGB4Tweaks; \
	VMOVDQU TW, (16*4)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*5)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*6)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*7)(SP); \
	avxMul2GBInline

#define avxPrepareGB16Tweaks \
	avxPrepareGB8Tweaks; \
	VMOVDQU TW, (16*8)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*9)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*10)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*11)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*12)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*13)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*14)(SP); \
	avxMul2GBInline; \
	VMOVDQU TW, (16*15)(SP); \
	avxMul2GBInline

#define mul2Inline \
	PSHUFD $0xff, TW, T0; \
	MOVOU TW, T1; \
	PSRAL $31, T0; \
	PAND POLY, T0; \
	PSRLL $31, T1; \
	PSLLDQ $4, T1; \
	PSLLL $1, TW; \
	PXOR T0, TW; \
	PXOR T1, TW

#define avxMul2Inline \
	VPSHUFD $0xff, TW, T0; \
	VPSRLD $31, TW, T1; \
	VPSRAD $31, T0, T0; \
	VPAND POLY, T0, T0; \
	VPSLLDQ $4, T1, T1; \
	VPSLLD $1, TW, TW; \
	VPXOR T0, TW, TW; \
	VPXOR T1, TW, TW

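// mul2Inline/avxMul2Inline multiply the tweak by x in GF(2^128) using the
// standard XTS (IEEE P1619) convention and the 0x87 constant in gcmPoly,
// while the *GB* variants above byte-swap the tweak, double it in the
// mirrored (big-endian bit order) representation and reduce with gbGcmPoly,
// as required by the GB variant of XTS. The prepare*Tweaks macros below
// spill the current tweak and its successive doublings into consecutive
// 16-byte stack slots for the multi-block paths.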
#define prepare4Tweaks \
	MOVOU TW, (16*0)(SP); \
	mul2Inline; \
	MOVOU TW, (16*1)(SP); \
	mul2Inline; \
	MOVOU TW, (16*2)(SP); \
	mul2Inline; \
	MOVOU TW, (16*3)(SP); \
	mul2Inline

#define prepare8Tweaks \
	prepare4Tweaks; \
	MOVOU TW, (16*4)(SP); \
	mul2Inline; \
	MOVOU TW, (16*5)(SP); \
	mul2Inline; \
	MOVOU TW, (16*6)(SP); \
	mul2Inline; \
	MOVOU TW, (16*7)(SP); \
	mul2Inline

#define avxPrepare4Tweaks \
	VMOVDQU TW, (16*0)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*1)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*2)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*3)(SP); \
	avxMul2Inline

#define avxPrepare8Tweaks \
	prepare4Tweaks; \
	VMOVDQU TW, (16*4)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*5)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*6)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*7)(SP); \
	avxMul2Inline

#define avxPrepare16Tweaks \
	prepare8Tweaks; \
	VMOVDQU TW, (16*8)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*9)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*10)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*11)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*12)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*13)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*14)(SP); \
	avxMul2Inline; \
	VMOVDQU TW, (16*15)(SP); \
	avxMul2Inline

#define sseLoad4Blocks \
	MOVOU (16*0)(DX), B0; \
	MOVOU (16*0)(SP), T0; \
	PXOR T0, B0; \
	MOVOU (16*1)(DX), B1; \
	MOVOU (16*1)(SP), T0; \
	PXOR T0, B1; \
	MOVOU (16*2)(DX), B2; \
	MOVOU (16*2)(SP), T0; \
	PXOR T0, B2; \
	MOVOU (16*3)(DX), B3; \
	MOVOU (16*3)(SP), T0; \
	PXOR T0, B3

#define sseStore4Blocks \
	MOVOU (16*0)(SP), T0; \
	PXOR T0, B0; \
	MOVOU B0, (16*0)(CX); \
	MOVOU (16*1)(SP), T0; \
	PXOR T0, B1; \
	MOVOU B1, (16*1)(CX); \
	MOVOU (16*2)(SP), T0; \
	PXOR T0, B2; \
	MOVOU B2, (16*2)(CX); \
	MOVOU (16*3)(SP), T0; \
	PXOR T0, B3; \
	MOVOU B3, (16*3)(CX)

#define sseLoad8Blocks \
	sseLoad4Blocks; \
	MOVOU (16*4)(DX), B4; \
	MOVOU (16*4)(SP), T0; \
	PXOR T0, B4; \
	MOVOU (16*5)(DX), B5; \
	MOVOU (16*5)(SP), T0; \
	PXOR T0, B5; \
	MOVOU (16*6)(DX), B6; \
	MOVOU (16*6)(SP), T0; \
	PXOR T0, B6; \
	MOVOU (16*7)(DX), B7; \
	MOVOU (16*7)(SP), T0; \
	PXOR T0, B7

#define sseStore8Blocks \
	sseStore4Blocks; \
	MOVOU (16*4)(SP), T0; \
	PXOR T0, B4; \
	MOVOU B4, (16*4)(CX); \
	MOVOU (16*5)(SP), T0; \
	PXOR T0, B5; \
	MOVOU B5, (16*5)(CX); \
	MOVOU (16*6)(SP), T0; \
	PXOR T0, B6; \
	MOVOU B6, (16*6)(CX); \
	MOVOU (16*7)(SP), T0; \
	PXOR T0, B7; \
	MOVOU B7, (16*7)(CX)

#define avxLoad4Blocks \
	VMOVDQU (16*0)(DX), B0; \
	VPXOR (16*0)(SP), B0, B0; \
	VMOVDQU (16*1)(DX), B1; \
	VPXOR (16*1)(SP), B1, B1; \
	VMOVDQU (16*2)(DX), B2; \
	VPXOR (16*2)(SP), B2, B2; \
	VMOVDQU (16*3)(DX), B3; \
	VPXOR (16*3)(SP), B3, B3

#define avxStore4Blocks \
	VPXOR (16*0)(SP), B0, B0; \
	VMOVDQU B0, (16*0)(CX); \
	VPXOR (16*1)(SP), B1, B1; \
	VMOVDQU B1, (16*1)(CX); \
	VPXOR (16*2)(SP), B2, B2; \
	VMOVDQU B2, (16*2)(CX); \
	VPXOR (16*3)(SP), B3, B3; \
	VMOVDQU B3, (16*3)(CX)

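// The load/store macros implement the XEX wrapping around the cipher: each
// block is XORed with its per-block tweak (read from the stack slots filled
// by prepare*Tweaks) before the SM4 rounds, and again with the same tweak
// when the result is written out.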
#define avxLoad8Blocks \
	avxLoad4Blocks; \
	VMOVDQU (16*4)(DX), B4; \
	VPXOR (16*4)(SP), B4, B4; \
	VMOVDQU (16*5)(DX), B5; \
	VPXOR (16*5)(SP), B5, B5; \
	VMOVDQU (16*6)(DX), B6; \
	VPXOR (16*6)(SP), B6, B6; \
	VMOVDQU (16*7)(DX), B7; \
	VPXOR (16*7)(SP), B7, B7

#define avxStore8Blocks \
	avxStore4Blocks; \
	VPXOR (16*4)(SP), B4, B4; \
	VMOVDQU B4, (16*4)(CX); \
	VPXOR (16*5)(SP), B5, B5; \
	VMOVDQU B5, (16*5)(CX); \
	VPXOR (16*6)(SP), B6, B6; \
	VMOVDQU B6, (16*6)(CX); \
	VPXOR (16*7)(SP), B7, B7; \
	VMOVDQU B7, (16*7)(CX)

#define avx2Load8Blocks \
	VMOVDQU (32*0)(DX), Y0; \
	VPXOR (32*0)(SP), Y0, Y0; \
	VMOVDQU (32*1)(DX), Y1; \
	VPXOR (32*1)(SP), Y1, Y1; \
	VMOVDQU (32*2)(DX), Y2; \
	VPXOR (32*2)(SP), Y2, Y2; \
	VMOVDQU (32*3)(DX), Y3; \
	VPXOR (32*3)(SP), Y3, Y3

#define avx2Load16Blocks \
	avx2Load8Blocks; \
	VMOVDQU (32*4)(DX), Y4; \
	VPXOR (32*4)(SP), Y4, Y4; \
	VMOVDQU (32*5)(DX), Y5; \
	VPXOR (32*5)(SP), Y5, Y5; \
	VMOVDQU (32*6)(DX), Y6; \
	VPXOR (32*6)(SP), Y6, Y6; \
	VMOVDQU (32*7)(DX), Y7; \
	VPXOR (32*7)(SP), Y7, Y7

#define avx2LE2BE8Blocks \
	VBROADCASTI128 flip_mask<>(SB), Y11; \
	VPSHUFB Y11, Y0, Y0; \
	VPSHUFB Y11, Y1, Y1; \
	VPSHUFB Y11, Y2, Y2; \
	VPSHUFB Y11, Y3, Y3; \

#define avx2LE2BE16Blocks \
	avx2LE2BE8Blocks; \
	VPSHUFB Y11, Y4, Y4; \
	VPSHUFB Y11, Y5, Y5; \
	VPSHUFB Y11, Y6, Y6; \
	VPSHUFB Y11, Y7, Y7

#define avx2Store8Blocks \
	VPXOR (32*0)(SP), Y0, Y0; \
	VMOVDQU Y0, (32*0)(CX); \
	VPXOR (32*1)(SP), Y1, Y1; \
	VMOVDQU Y1, (32*1)(CX); \
	VPXOR (32*2)(SP), Y2, Y2; \
	VMOVDQU Y2, (32*2)(CX); \
	VPXOR (32*3)(SP), Y3, Y3; \
	VMOVDQU Y3, (32*3)(CX); \

#define avx2Store16Blocks \
	avx2Store8Blocks; \
	VPXOR (32*4)(SP), Y4, Y4; \
	VMOVDQU Y4, (32*4)(CX); \
	VPXOR (32*5)(SP), Y5, Y5; \
	VMOVDQU Y5, (32*5)(CX); \
	VPXOR (32*6)(SP), Y6, Y6; \
	VMOVDQU Y6, (32*6)(CX); \
	VPXOR (32*7)(SP), Y7, Y7; \
	VMOVDQU Y7, (32*7)(CX)

#define avx2ByteSwap8Blocks \
	VPSHUFB DWBSWAP, Y0, Y0; \
	VPSHUFB DWBSWAP, Y1, Y1; \
	VPSHUFB DWBSWAP, Y2, Y2; \
	VPSHUFB DWBSWAP, Y3, Y3; \

#define avx2ByteSwap16Blocks \
	avx2ByteSwap8Blocks; \
	VPSHUFB DWBSWAP, Y4, Y4; \
	VPSHUFB DWBSWAP, Y5, Y5; \
	VPSHUFB DWBSWAP, Y6, Y6; \
	VPSHUFB DWBSWAP, Y7, Y7

// func encryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·encryptSm4Xts(SB),0,$256-64
	MOVQ xk+0(FP), AX
	MOVQ tweak+8(FP), BX
	MOVQ dst+16(FP), CX
	MOVQ src+40(FP), DX
	MOVQ src_len+48(FP), DI

	CMPB ·useAVX2(SB), $1
	JE avx2XtsSm4Enc

	CMPB ·useAVX(SB), $1
	JE avxXtsSm4Enc

	MOVOU gcmPoly<>(SB), POLY

	MOVOU (0*16)(BX), TW

xtsSm4EncOctets:
	CMPQ DI, $128
	JB xtsSm4EncNibbles
	SUBQ $128, DI

	// prepare tweaks
	prepare8Tweaks
	// load 8 blocks for encryption
	sseLoad8Blocks

	SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

	sseStore8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

	JMP xtsSm4EncOctets

xtsSm4EncNibbles:
	CMPQ DI, $64
	JB xtsSm4EncSingles
	SUBQ $64, DI

	// prepare tweaks
	prepare4Tweaks
	// load 4 blocks for encryption
	sseLoad4Blocks

	SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	sseStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

xtsSm4EncSingles:
	CMPQ DI, $16
	JB xtsSm4EncTail
	SUBQ $16, DI

	// load 1 block for encryption
	MOVOU (16*0)(DX), B0

	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(CX)
	mul2Inline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP xtsSm4EncSingles

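// xtsSm4EncTail handles a final partial block with ciphertext stealing: the
// previously written ciphertext block (at CX-16) is re-read, its leading
// bytes are swapped with the remaining plaintext bytes on the stack, and the
// assembled block is encrypted again with the current tweak.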
xtsSm4EncTail:
	TESTQ DI, DI
	JE xtsSm4EncDone

	LEAQ -16(CX), R8
	MOVOU (16*0)(R8), B0
	MOVOU B0, (16*0)(SP)

	CMPQ DI, $8
	JB loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE xtsSm4EncTailEnc

loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE loop_1b

xtsSm4EncTailEnc:
	MOVOU (16*0)(SP), B0
	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(R8)

xtsSm4EncDone:
	MOVOU TW, (16*0)(BX)
	RET

avxXtsSm4Enc:
	VMOVDQU gcmPoly<>(SB), POLY
	VMOVDQU (0*16)(BX), TW

avxXtsSm4EncOctets:
	CMPQ DI, $128
	JB avxXtsSm4EncNibbles
	SUBQ $128, DI

	// prepare tweaks
	avxPrepare8Tweaks
	// load 8 blocks for encryption
	avxLoad8Blocks

	AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

	avxStore8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

	JMP avxXtsSm4EncOctets

avxXtsSm4EncNibbles:
	CMPQ DI, $64
	JB avxXtsSm4EncSingles
	SUBQ $64, DI

	// prepare tweaks
	avxPrepare4Tweaks
	// load 4 blocks for encryption
	avxLoad4Blocks

	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	avxStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

avxXtsSm4EncSingles:
	CMPQ DI, $16
	JB avxXtsSm4EncTail
	SUBQ $16, DI

	// load 1 block for encryption
	VMOVDQU (16*0)(DX), B0

	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2Inline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP avxXtsSm4EncSingles

avxXtsSm4EncTail:
	TESTQ DI, DI
	JE avxXtsSm4EncDone

	LEAQ -16(CX), R8
	VMOVDQU (16*0)(R8), B0
	VMOVDQU B0, (16*0)(SP)

	CMPQ DI, $8
	JB avx_loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE avxXtsSm4EncTailEnc

avx_loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE avx_loop_1b

avxXtsSm4EncTailEnc:
	VMOVDQU (16*0)(SP), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(R8)

avxXtsSm4EncDone:
	VMOVDQU TW, (16*0)(BX)
	RET

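// In the AVX2 path the blocks are byte-swapped from little-endian to
// big-endian word order and transposed as a 4x4 matrix of 32-bit words so
// that AVX2_SM4_16BLOCKS/AVX2_SM4_8BLOCKS can run the SM4 rounds across all
// lanes at once; the inverse transpose and byte swap restore the layout
// before the results are XORed with the tweaks and stored.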
avx2XtsSm4Enc:
	VMOVDQU gcmPoly<>(SB), POLY
	VMOVDQU (0*16)(BX), TW
	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
	VBROADCASTI128 bswap_mask<>(SB), DWBSWAP

avx2XtsSm4Enc16Blocks:
	CMPQ DI, $256
	JB avx2XtsSm4EncOctets
	SUBQ $256, DI

	// prepare tweaks
	avxPrepare16Tweaks
	// load 16 blocks for encryption
	avx2Load16Blocks
	// Apply Byte Flip Mask: LE -> BE
	avx2LE2BE16Blocks
	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)

	AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7)

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
	avx2ByteSwap16Blocks
	avx2Store16Blocks

	LEAQ 256(DX), DX
	LEAQ 256(CX), CX
	JMP avx2XtsSm4Enc16Blocks

avx2XtsSm4EncOctets:
	CMPQ DI, $128
	JB avx2XtsSm4EncNibbles
	SUBQ $128, DI

	// prepare tweaks
	avxPrepare8Tweaks
	// load 8 blocks for encryption
	avx2Load8Blocks
	// Apply Byte Flip Mask: LE -> BE
	avx2LE2BE8Blocks
	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)

	AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3)

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	avx2ByteSwap8Blocks
	avx2Store8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

avx2XtsSm4EncNibbles:
	CMPQ DI, $64
	JB avx2XtsSm4EncSingles
	SUBQ $64, DI

	// prepare tweaks
	avxPrepare4Tweaks

	// load 4 blocks for encryption
	avxLoad4Blocks

	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	avxStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

avx2XtsSm4EncSingles:
	CMPQ DI, $16
	JB avx2XtsSm4EncTail
	SUBQ $16, DI

	// load 1 block for encryption
	VMOVDQU (16*0)(DX), B0

	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2Inline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP avx2XtsSm4EncSingles

avx2XtsSm4EncTail:
	TESTQ DI, DI
	JE avx2XtsSm4EncDone

	LEAQ -16(CX), R8
	VMOVDQU (16*0)(R8), B0
	VMOVDQU B0, (16*0)(SP)

	CMPQ DI, $8
	JB avx2_loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE avx2XtsSm4EncTailEnc

avx2_loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE avx2_loop_1b

avx2XtsSm4EncTailEnc:
	VMOVDQU (16*0)(SP), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(R8)

avx2XtsSm4EncDone:
	VMOVDQU TW, (16*0)(BX)
	VZEROUPPER
	RET

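// encryptSm4XtsGB has the same structure as encryptSm4Xts, except that the
// tweak is advanced with the GB doubling macros (mul2GBInline and friends),
// which treat the tweak as a big-endian bit string and reduce with the
// mirrored polynomial loaded from gbGcmPoly.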
// func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·encryptSm4XtsGB(SB),0,$256-64
	MOVQ xk+0(FP), AX
	MOVQ tweak+8(FP), BX
	MOVQ dst+16(FP), CX
	MOVQ src+40(FP), DX
	MOVQ src_len+48(FP), DI

	CMPB ·useAVX2(SB), $1
	JE avx2XtsSm4Enc

	CMPB ·useAVX(SB), $1
	JE avxXtsSm4Enc

	MOVOU gbGcmPoly<>(SB), POLY
	MOVOU bswap_mask<>(SB), BSWAP
	MOVOU (0*16)(BX), TW

xtsSm4EncOctets:
	CMPQ DI, $128
	JB xtsSm4EncNibbles
	SUBQ $128, DI

	// prepare tweaks
	prepareGB8Tweaks
	// load 8 blocks for encryption
	sseLoad8Blocks

	SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

	sseStore8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

	JMP xtsSm4EncOctets

xtsSm4EncNibbles:
	CMPQ DI, $64
	JB xtsSm4EncSingles
	SUBQ $64, DI

	// prepare tweaks
	prepareGB4Tweaks
	// load 4 blocks for encryption
	sseLoad4Blocks

	SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	sseStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

xtsSm4EncSingles:
	CMPQ DI, $16
	JB xtsSm4EncTail
	SUBQ $16, DI

	// load 1 block for encryption
	MOVOU (16*0)(DX), B0

	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(CX)
	mul2GBInline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP xtsSm4EncSingles

xtsSm4EncTail:
	TESTQ DI, DI
	JE xtsSm4EncDone

	LEAQ -16(CX), R8
	MOVOU (16*0)(R8), B0
	MOVOU B0, (16*0)(SP)

	CMPQ DI, $8
	JB loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE xtsSm4EncTailEnc

loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE loop_1b

xtsSm4EncTailEnc:
	MOVOU (16*0)(SP), B0
	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(R8)

xtsSm4EncDone:
	MOVOU TW, (16*0)(BX)
	RET

avxXtsSm4Enc:
	VMOVDQU gbGcmPoly<>(SB), POLY
	VMOVDQU bswap_mask<>(SB), BSWAP
	VMOVDQU (0*16)(BX), TW

avxXtsSm4EncOctets:
	CMPQ DI, $128
	JB avxXtsSm4EncNibbles
	SUBQ $128, DI

	// prepare tweaks
	avxPrepareGB8Tweaks
	// load 8 blocks for encryption
	avxLoad8Blocks

	AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

	avxStore8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

	JMP avxXtsSm4EncOctets

avxXtsSm4EncNibbles:
	CMPQ DI, $64
	JB avxXtsSm4EncSingles
	SUBQ $64, DI

	// prepare tweaks
	avxPrepareGB4Tweaks
	// load 4 blocks for encryption
	avxLoad4Blocks

	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	avxStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

avxXtsSm4EncSingles:
	CMPQ DI, $16
	JB avxXtsSm4EncTail
	SUBQ $16, DI

	// load 1 block for encryption
	VMOVDQU (16*0)(DX), B0

	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2GBInline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP avxXtsSm4EncSingles

avxXtsSm4EncTail:
	TESTQ DI, DI
	JE avxXtsSm4EncDone

	LEAQ -16(CX), R8
	VMOVDQU (16*0)(R8), B0
	VMOVDQU B0, (16*0)(SP)

	CMPQ DI, $8
	JB avx_loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE avxXtsSm4EncTailEnc

avx_loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE avx_loop_1b

avxXtsSm4EncTailEnc:
	VMOVDQU (16*0)(SP), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(R8)

avxXtsSm4EncDone:
	VMOVDQU TW, (16*0)(BX)
	RET

avx2XtsSm4Enc:
	VMOVDQU gbGcmPoly<>(SB), POLY
	VMOVDQU (0*16)(BX), TW
	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
	VBROADCASTI128 bswap_mask<>(SB), DWBSWAP

avx2XtsSm4Enc16Blocks:
	CMPQ DI, $256
	JB avx2XtsSm4EncOctets
	SUBQ $256, DI

	// prepare tweaks
	avxPrepareGB16Tweaks
	// load 16 blocks for encryption
	avx2Load16Blocks
	// Apply Byte Flip Mask: LE -> BE
	avx2LE2BE16Blocks
	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)

	AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7)

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
	avx2ByteSwap16Blocks
	avx2Store16Blocks

	LEAQ 256(DX), DX
	LEAQ 256(CX), CX
	JMP avx2XtsSm4Enc16Blocks

avx2XtsSm4EncOctets:
	CMPQ DI, $128
	JB avx2XtsSm4EncNibbles
	SUBQ $128, DI

	// prepare tweaks
	avxPrepareGB8Tweaks
	// load 8 blocks for encryption
	avx2Load8Blocks
	// Apply Byte Flip Mask: LE -> BE
	avx2LE2BE8Blocks
	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)

	AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3)

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	avx2ByteSwap8Blocks
	avx2Store8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

avx2XtsSm4EncNibbles:
	CMPQ DI, $64
	JB avx2XtsSm4EncSingles
	SUBQ $64, DI

	// prepare tweaks
	avxPrepareGB4Tweaks
	// load 4 blocks for encryption
	avxLoad4Blocks

	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	avxStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

avx2XtsSm4EncSingles:
	CMPQ DI, $16
	JB avx2XtsSm4EncTail
	SUBQ $16, DI

	// load 1 block for encryption
	VMOVDQU (16*0)(DX), B0

	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2GBInline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP avx2XtsSm4EncSingles

avx2XtsSm4EncTail:
	TESTQ DI, DI
	JE avx2XtsSm4EncDone

	LEAQ -16(CX), R8
	VMOVDQU (16*0)(R8), B0
	VMOVDQU B0, (16*0)(SP)

	CMPQ DI, $8
	JB avx2_loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE avx2XtsSm4EncTailEnc

avx2_loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE avx2_loop_1b

avx2XtsSm4EncTailEnc:
	VMOVDQU (16*0)(SP), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(R8)

avx2XtsSm4EncDone:
	VMOVDQU TW, (16*0)(BX)
	VZEROUPPER
	RET

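// The decryption functions mirror the structure above. The difference is in
// the tail: the singles loop stops one block early (CMPQ DI, $32) so that,
// when a partial block follows, the last full ciphertext block can be
// decrypted with the next tweak value while the saved current tweak is kept
// for the stolen final block.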
// func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·decryptSm4Xts(SB),0,$256-64
	MOVQ xk+0(FP), AX
	MOVQ tweak+8(FP), BX
	MOVQ dst+16(FP), CX
	MOVQ src+40(FP), DX
	MOVQ src_len+48(FP), DI

	CMPB ·useAVX2(SB), $1
	JE avx2XtsSm4Dec

	CMPB ·useAVX(SB), $1
	JE avxXtsSm4Dec

	MOVOU gcmPoly<>(SB), POLY
	MOVOU (0*16)(BX), TW

xtsSm4DecOctets:
	CMPQ DI, $128
	JB xtsSm4DecNibbles
	SUBQ $128, DI

	// prepare tweaks
	prepare8Tweaks
	// load 8 blocks for decryption
	sseLoad8Blocks

	SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

	sseStore8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

	JMP xtsSm4DecOctets

xtsSm4DecNibbles:
	CMPQ DI, $64
	JB xtsSm4DecSingles
	SUBQ $64, DI

	// prepare tweaks
	prepare4Tweaks
	// load 4 blocks for decryption
	sseLoad4Blocks

	SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	sseStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

xtsSm4DecSingles:
	CMPQ DI, $32
	JB xtsSm4DecTail
	SUBQ $16, DI

	// load 1 block for decryption
	MOVOU (16*0)(DX), B0

	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(CX)
	mul2Inline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP xtsSm4DecSingles

xtsSm4DecTail:
	TESTQ DI, DI
	JE xtsSm4DecDone

	CMPQ DI, $16
	JE xtsSm4DecLastBlock

	// length > 16
	// load 1 block for decryption
	MOVOU (16*0)(DX), B0
	MOVOU TW, B5
	mul2Inline
	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(CX)
	MOVOU B5, TW

	SUBQ $16, DI
	LEAQ 16(DX), DX
	LEAQ 16(CX), CX
	LEAQ -16(CX), R8
	MOVOU B0, (16*0)(SP)

	CMPQ DI, $8
	JB loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE xtsSm4DecTailDec

loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE loop_1b

xtsSm4DecTailDec:
	MOVOU (16*0)(SP), B0
	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(R8)
	JMP xtsSm4DecDone

xtsSm4DecLastBlock:
	MOVOU (16*0)(DX), B0
	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(CX)
	mul2Inline

xtsSm4DecDone:
	MOVOU TW, (16*0)(BX)
	RET

avxXtsSm4Dec:
	VMOVDQU gcmPoly<>(SB), POLY
	VMOVDQU (0*16)(BX), TW

avxXtsSm4DecOctets:
	CMPQ DI, $128
	JB avxXtsSm4DecNibbles
	SUBQ $128, DI

	// prepare tweaks
	avxPrepare8Tweaks

	// load 8 blocks for decryption
	avxLoad8Blocks

	AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

	avxStore8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

	JMP avxXtsSm4DecOctets

avxXtsSm4DecNibbles:
	CMPQ DI, $64
	JB avxXtsSm4DecSingles
	SUBQ $64, DI

	// prepare tweaks
	avxPrepare4Tweaks
	// load 4 blocks for decryption
	avxLoad4Blocks

	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	avxStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

avxXtsSm4DecSingles:
	CMPQ DI, $32
	JB avxXtsSm4DecTail
	SUBQ $16, DI

	// load 1 block for decryption
	VMOVDQU (16*0)(DX), B0

	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2Inline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP avxXtsSm4DecSingles

avxXtsSm4DecTail:
	TESTQ DI, DI
	JE avxXtsSm4DecDone

	CMPQ DI, $16
	JE avxXtsSm4DecLastBlock

	// length > 16
	// load 1 block for decryption
	VMOVDQU (16*0)(DX), B0
	VMOVDQU TW, B5
	avxMul2Inline
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	VMOVDQU B5, TW

	SUBQ $16, DI
	LEAQ 16(DX), DX
	LEAQ 16(CX), CX
	LEAQ -16(CX), R8
	VMOVDQU B0, (16*0)(SP)

	CMPQ DI, $8
	JB avx_loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE avxXtsSm4DecTailDec

avx_loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE avx_loop_1b

avxXtsSm4DecTailDec:
	VMOVDQU (16*0)(SP), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(R8)
	JMP avxXtsSm4DecDone

avxXtsSm4DecLastBlock:
	VMOVDQU (16*0)(DX), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2Inline

avxXtsSm4DecDone:
	VMOVDQU TW, (16*0)(BX)
	RET

avx2XtsSm4Dec:
	VMOVDQU gcmPoly<>(SB), POLY
	VMOVDQU (0*16)(BX), TW
	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
	VBROADCASTI128 bswap_mask<>(SB), DWBSWAP

avx2XtsSm4Dec16Blocks:
	CMPQ DI, $256
	JB avx2XtsSm4DecOctets
	SUBQ $256, DI

	// prepare tweaks
	avxPrepare16Tweaks
	// load 16 blocks for decryption
	avx2Load16Blocks
	// Apply Byte Flip Mask: LE -> BE
	avx2LE2BE16Blocks
	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)

	AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7)

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
	avx2ByteSwap16Blocks
	avx2Store16Blocks

	LEAQ 256(DX), DX
	LEAQ 256(CX), CX

	JMP avx2XtsSm4Dec16Blocks

avx2XtsSm4DecOctets:
	CMPQ DI, $128
	JB avx2XtsSm4DecNibbles
	SUBQ $128, DI

	// prepare tweaks
	avxPrepare8Tweaks
	// load 8 blocks for decryption
	avx2Load8Blocks
	// Apply Byte Flip Mask: LE -> BE
	avx2LE2BE8Blocks
	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)

	AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3)

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	avx2ByteSwap8Blocks
	avx2Store8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

avx2XtsSm4DecNibbles:
	CMPQ DI, $64
	JB avxXtsSm4DecSingles
	SUBQ $64, DI

	// prepare tweaks
	avxPrepare4Tweaks
	// load 4 blocks for decryption
	avxLoad4Blocks

	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	avxStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

avx2XtsSm4DecSingles:
	CMPQ DI, $32
	JB avx2XtsSm4DecTail
	SUBQ $16, DI

	// load 1 block for decryption
	VMOVDQU (16*0)(DX), B0

	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2Inline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP avx2XtsSm4DecSingles

avx2XtsSm4DecTail:
	TESTQ DI, DI
	JE avx2XtsSm4DecDone

	CMPQ DI, $16
	JE avx2XtsSm4DecLastBlock

	// length > 16
	// load 1 block for decryption
	VMOVDQU (16*0)(DX), B0
	VMOVDQU TW, B5
	avxMul2Inline
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	VMOVDQU B5, TW

	SUBQ $16, DI
	LEAQ 16(DX), DX
	LEAQ 16(CX), CX
	LEAQ -16(CX), R8
	VMOVDQU B0, (16*0)(SP)

	CMPQ DI, $8
	JB avx2_loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE avx2XtsSm4DecTailDec

avx2_loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE avx2_loop_1b

avx2XtsSm4DecTailDec:
	VMOVDQU (16*0)(SP), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(R8)
	JMP avx2XtsSm4DecDone

avx2XtsSm4DecLastBlock:
	VMOVDQU (16*0)(DX), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2Inline

avx2XtsSm4DecDone:
	VMOVDQU TW, (16*0)(BX)
	VZEROUPPER
	RET

// func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·decryptSm4XtsGB(SB),0,$256-64
	MOVQ xk+0(FP), AX
	MOVQ tweak+8(FP), BX
	MOVQ dst+16(FP), CX
	MOVQ src+40(FP), DX
	MOVQ src_len+48(FP), DI

	CMPB ·useAVX2(SB), $1
	JE avx2XtsSm4Dec

	CMPB ·useAVX(SB), $1
	JE avxXtsSm4Dec

	MOVOU gbGcmPoly<>(SB), POLY
	MOVOU bswap_mask<>(SB), BSWAP
	MOVOU (0*16)(BX), TW

xtsSm4DecOctets:
	CMPQ DI, $128
	JB xtsSm4DecNibbles
	SUBQ $128, DI

	// prepare tweaks
	prepareGB8Tweaks
	// load 8 blocks for decryption
	sseLoad8Blocks

	SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

	sseStore8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

	JMP xtsSm4DecOctets

xtsSm4DecNibbles:
	CMPQ DI, $64
	JB xtsSm4DecSingles
	SUBQ $64, DI

	// prepare tweaks
	prepareGB4Tweaks
	// load 4 blocks for decryption
	sseLoad4Blocks

	SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	sseStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

xtsSm4DecSingles:
	CMPQ DI, $32
	JB xtsSm4DecTail
	SUBQ $16, DI

	// load 1 block for decryption
	MOVOU (16*0)(DX), B0

	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(CX)
	mul2GBInline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP xtsSm4DecSingles

xtsSm4DecTail:
	TESTQ DI, DI
	JE xtsSm4DecDone

	CMPQ DI, $16
	JE xtsSm4DecLastBlock

	// length > 16
	// load 1 block for decryption
	MOVOU (16*0)(DX), B0
	MOVOU TW, B5
	mul2GBInline
	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(CX)
	MOVOU B5, TW

	SUBQ $16, DI
	LEAQ 16(DX), DX
	LEAQ 16(CX), CX
	LEAQ -16(CX), R8
	MOVOU B0, (16*0)(SP)

	CMPQ DI, $8
	JB loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE xtsSm4DecTailDec

loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE loop_1b

xtsSm4DecTailDec:
	MOVOU (16*0)(SP), B0
	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(R8)
	JMP xtsSm4DecDone

xtsSm4DecLastBlock:
	MOVOU (16*0)(DX), B0
	PXOR TW, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	PXOR TW, B0
	MOVOU B0, (16*0)(CX)
	mul2GBInline

xtsSm4DecDone:
	MOVOU TW, (16*0)(BX)
	RET

avxXtsSm4Dec:
	VMOVDQU gbGcmPoly<>(SB), POLY
	VMOVDQU bswap_mask<>(SB), BSWAP
	VMOVDQU (0*16)(BX), TW

avxXtsSm4DecOctets:
	CMPQ DI, $128
	JB avxXtsSm4DecNibbles
	SUBQ $128, DI

	// prepare tweaks
	avxPrepareGB8Tweaks
	// load 8 blocks for decryption
	avxLoad8Blocks

	AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

	avxStore8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

	JMP avxXtsSm4DecOctets

avxXtsSm4DecNibbles:
	CMPQ DI, $64
	JB avxXtsSm4DecSingles
	SUBQ $64, DI

	// prepare tweaks
	avxPrepareGB4Tweaks
	// load 4 blocks for decryption
	avxLoad4Blocks

	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	avxStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

avxXtsSm4DecSingles:
	CMPQ DI, $32
	JB avxXtsSm4DecTail
	SUBQ $16, DI

	// load 1 block for decryption
	VMOVDQU (16*0)(DX), B0

	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2GBInline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP avxXtsSm4DecSingles

avxXtsSm4DecTail:
	TESTQ DI, DI
	JE avxXtsSm4DecDone

	CMPQ DI, $16
	JE avxXtsSm4DecLastBlock

	// length > 16
	// load 1 block for decryption
	VMOVDQU (16*0)(DX), B0
	VMOVDQU TW, B5
	avxMul2GBInline
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	VMOVDQU B5, TW

	SUBQ $16, DI
	LEAQ 16(DX), DX
	LEAQ 16(CX), CX
	LEAQ -16(CX), R8
	VMOVDQU B0, (16*0)(SP)

	CMPQ DI, $8
	JB avx_loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE avxXtsSm4DecTailDec

avx_loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE avx_loop_1b

avxXtsSm4DecTailDec:
	VMOVDQU (16*0)(SP), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(R8)
	JMP avxXtsSm4DecDone

avxXtsSm4DecLastBlock:
	VMOVDQU (16*0)(DX), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2GBInline

avxXtsSm4DecDone:
	VMOVDQU TW, (16*0)(BX)
	RET

avx2XtsSm4Dec:
	VMOVDQU gbGcmPoly<>(SB), POLY
	VMOVDQU (0*16)(BX), TW
	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
	VBROADCASTI128 bswap_mask<>(SB), DWBSWAP

avx2XtsSm4Dec16Blocks:
	CMPQ DI, $256
	JB avx2XtsSm4DecOctets
	SUBQ $256, DI

	// prepare tweaks
	avxPrepareGB16Tweaks
	// load 16 blocks for decryption
	avx2Load16Blocks
	// Apply Byte Flip Mask: LE -> BE
	avx2LE2BE16Blocks
	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)

	AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7)

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
	avx2ByteSwap16Blocks
	avx2Store16Blocks

	LEAQ 256(DX), DX
	LEAQ 256(CX), CX

	JMP avx2XtsSm4Dec16Blocks

avx2XtsSm4DecOctets:
	CMPQ DI, $128
	JB avx2XtsSm4DecNibbles
	SUBQ $128, DI

	// prepare tweaks
	avxPrepareGB8Tweaks
	// load 8 blocks for decryption
	avx2Load8Blocks
	// Apply Byte Flip Mask: LE -> BE
	avx2LE2BE8Blocks
	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)

	AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3)

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
	avx2ByteSwap8Blocks
	avx2Store8Blocks

	LEAQ 128(DX), DX
	LEAQ 128(CX), CX

avx2XtsSm4DecNibbles:
	CMPQ DI, $64
	JB avxXtsSm4DecSingles
	SUBQ $64, DI

	// prepare tweaks
	avxPrepareGB4Tweaks
	// load 4 blocks for decryption
	avxLoad4Blocks

	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)

	avxStore4Blocks

	LEAQ 64(DX), DX
	LEAQ 64(CX), CX

avx2XtsSm4DecSingles:
	CMPQ DI, $32
	JB avx2XtsSm4DecTail
	SUBQ $16, DI

	// load 1 block for decryption
	VMOVDQU (16*0)(DX), B0

	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2GBInline

	LEAQ 16(DX), DX
	LEAQ 16(CX), CX

	JMP avx2XtsSm4DecSingles

avx2XtsSm4DecTail:
	TESTQ DI, DI
	JE avx2XtsSm4DecDone

	CMPQ DI, $16
	JE avx2XtsSm4DecLastBlock

	// length > 16
	// load 1 block for decryption
	VMOVDQU (16*0)(DX), B0
	VMOVDQU TW, B5
	avxMul2GBInline
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	VMOVDQU B5, TW

	SUBQ $16, DI
	LEAQ 16(DX), DX
	LEAQ 16(CX), CX
	LEAQ -16(CX), R8
	VMOVDQU B0, (16*0)(SP)

	CMPQ DI, $8
	JB avx2_loop_1b
	SUBQ $8, DI
	MOVQ (DX)(DI*1), R9
	MOVQ (SP)(DI*1), R10
	MOVQ R9, (SP)(DI*1)
	MOVQ R10, (CX)(DI*1)

	TESTQ DI, DI
	JE avx2XtsSm4DecTailDec

avx2_loop_1b:
	SUBQ $1, DI
	MOVB (DX)(DI*1), R9
	MOVB (SP)(DI*1), R10
	MOVB R9, (SP)(DI*1)
	MOVB R10, (CX)(DI*1)
	TESTQ DI, DI
	JNE avx2_loop_1b

avx2XtsSm4DecTailDec:
	VMOVDQU (16*0)(SP), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(R8)
	JMP avx2XtsSm4DecDone

avx2XtsSm4DecLastBlock:
	VMOVDQU (16*0)(DX), B0
	VPXOR TW, B0, B0
	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
	VPXOR TW, B0, B0
	VMOVDQU B0, (16*0)(CX)
	avxMul2GBInline

avx2XtsSm4DecDone:
	VMOVDQU TW, (16*0)(BX)
	VZEROUPPER
	RET