//go:build !purego

// Origin: github.com/emmansun/gmsm@v0.29.1/sm4/cbc_amd64.s

#include "textflag.h"
#include "aesni_macros_amd64.s"

// Register aliases shared by the three code paths below (SSE, AVX, AVX2).
// The X* names are the 128-bit views of the Y* registers with the same number.

// Scratch registers handed to the SM4 / transpose macros.
#define XDWTMP0 Y0
#define XDWTMP1 Y1

// Data registers, 256-bit views (two 16-byte blocks each).
#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XDWORD4 Y10
#define XDWORD5 Y11
#define XDWORD6 Y12
#define XDWORD7 Y14

#define XWTMP0 X0
#define XWTMP1 X1
#define XWTMP2 X2

// Data registers, 128-bit views (one 16-byte block each).
#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XWORD4 X10
#define XWORD5 X11
#define XWORD6 X12
#define XWORD7 X14

#define NIBBLE_MASK Y3
#define X_NIBBLE_MASK X3

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE

#define BSWAP_MASK Y2

#define XDWORD Y8
#define YDWORD Y9

#define XWORD X8
#define YWORD X9

// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
//
// Chained (CBC-style) block decryption: every 16-byte block of src is run
// through the SM4 block macros and XORed with the preceding 16 bytes of src;
// the first block is XORed with the 16 bytes at iv. On return the 16 bytes at
// iv are overwritten with the last 16 bytes of src (captured in X15 before any
// output is written).
//
// The buffers are walked from the END towards the start. Within each chunk the
// "previous block" operands are re-read from src after the block function has
// run but before that chunk is stored to dst.
//
// Register roles:
//   AX  = xk, passed unchanged to the SM4_* macros (presumably the expanded
//         round keys - confirm against the Go caller)
//   BX  = dst end pointer, moved down by the chunk size before each bulk chunk
//   DX  = src end pointer, moved in lock-step with BX
//   DI  = number of bytes still to process
//   SI  = iv
//   X15 = copy of the last 16 bytes of src (stored to iv on exit)
//
// Requirements not checked here: src_len must be a multiple of 16, and dst
// must be at least src_len bytes long. An empty src returns immediately and
// leaves iv untouched.
//
// Dispatch: ·useAVX2 selects the AVX2 path, otherwise ·useAVX selects the AVX
// path, otherwise the SSE path runs.
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0-64
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+32(FP), DX
	MOVQ src_len+40(FP), DI
	MOVQ iv+56(FP), SI

	// Empty input: every path below starts by loading -16(DX), which would
	// lie before the start of src, so bail out before touching memory.
	TESTQ DI, DI
	JEQ cbcSm4Return

	// Point DX / BX one past the last byte of src / dst.
	LEAQ (DX)(DI*1), DX
	LEAQ (BX)(DI*1), BX

	CMPB ·useAVX2(SB), $1
	JE avx2Start

	CMPB ·useAVX(SB), $1
	JE avxStart

	// ---------------------------------------------------------------
	// SSE path
	// ---------------------------------------------------------------

	// Save the last 16 bytes of src; they become the new iv on exit.
	MOVOU -16(DX), X15

cbcSm4Octets:
	// Bulk loop: 8 blocks per iteration while MORE than 128 bytes remain,
	// so that the block XORed with iv is always left for the tail code.
	CMPQ DI, $128
	JLE cbcSm4Nibbles
	SUBQ $128, DI
	LEAQ -128(DX), DX
	LEAQ -128(BX), BX

	MOVOU 0(DX), XWORD0
	MOVOU 16(DX), XWORD1
	MOVOU 32(DX), XWORD2
	MOVOU 48(DX), XWORD3
	MOVOU 64(DX), XWORD4
	MOVOU 80(DX), XWORD5
	MOVOU 96(DX), XWORD6
	MOVOU 112(DX), XWORD7

	SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

	// XOR each result with the 16 bytes of src that precede its input block.
	MOVOU -16(DX), XWTMP0
	PXOR XWTMP0, XWORD0
	MOVOU 0(DX), XWTMP0
	PXOR XWTMP0, XWORD1
	MOVOU 16(DX), XWTMP0
	PXOR XWTMP0, XWORD2
	MOVOU 32(DX), XWTMP0
	PXOR XWTMP0, XWORD3
	MOVOU 48(DX), XWTMP0
	PXOR XWTMP0, XWORD4
	MOVOU 64(DX), XWTMP0
	PXOR XWTMP0, XWORD5
	MOVOU 80(DX), XWTMP0
	PXOR XWTMP0, XWORD6
	MOVOU 96(DX), XWTMP0
	PXOR XWTMP0, XWORD7

	MOVOU XWORD0, 0(BX)
	MOVOU XWORD1, 16(BX)
	MOVOU XWORD2, 32(BX)
	MOVOU XWORD3, 48(BX)
	MOVOU XWORD4, 64(BX)
	MOVOU XWORD5, 80(BX)
	MOVOU XWORD6, 96(BX)
	MOVOU XWORD7, 112(BX)

	JMP cbcSm4Octets

cbcSm4Nibbles:
	// At most one 4-block step, taken only when more than 64 bytes remain.
	CMPQ DI, $64
	JLE cbcSm4Single
	SUBQ $64, DI
	LEAQ -64(DX), DX
	LEAQ -64(BX), BX

	MOVOU 0(DX), XWORD0
	MOVOU 16(DX), XWORD1
	MOVOU 32(DX), XWORD2
	MOVOU 48(DX), XWORD3

	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	MOVUPS -16(DX), XWTMP0
	PXOR XWTMP0, XWORD0
	MOVUPS 0(DX), XWTMP0
	PXOR XWTMP0, XWORD1
	MOVUPS 16(DX), XWTMP0
	PXOR XWTMP0, XWORD2
	MOVUPS 32(DX), XWTMP0
	PXOR XWTMP0, XWORD3

	MOVUPS XWORD0, 0(BX)
	MOVUPS XWORD1, 16(BX)
	MOVUPS XWORD2, 32(BX)
	MOVUPS XWORD3, 48(BX)

cbcSm4Single:
	// Tail: DI is expected to be 16, 32, 48 or 64 here. DX / BX still point
	// just past the remaining bytes, hence the negative offsets. Any value
	// other than 16 / 32 / 48 is handled as 64.
	CMPQ DI, $16
	JEQ cbcSm4Single16

	CMPQ DI, $32
	JEQ cbcSm4Single32

	CMPQ DI, $48
	JEQ cbcSm4Single48

	MOVOU -64(DX), XWORD0
	MOVOU -48(DX), XWORD1
	MOVOU -32(DX), XWORD2
	MOVOU -16(DX), XWORD3

	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	// The very first block is chained to iv rather than to src.
	MOVUPS 0(SI), XWTMP0
	PXOR XWTMP0, XWORD0
	MOVUPS -64(DX), XWTMP0
	PXOR XWTMP0, XWORD1
	MOVUPS -48(DX), XWTMP0
	PXOR XWTMP0, XWORD2
	MOVUPS -32(DX), XWTMP0
	PXOR XWTMP0, XWORD3

	MOVUPS XWORD0, -64(BX)
	MOVUPS XWORD1, -48(BX)
	MOVUPS XWORD2, -32(BX)
	MOVUPS XWORD3, -16(BX)

	JMP cbcSm4Done

cbcSm4Single16:
	MOVOU -16(DX), XWORD0

	SM4_SINGLE_BLOCK(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	MOVUPS 0(SI), XWTMP0
	PXOR XWTMP0, XWORD0

	MOVUPS XWORD0, -16(BX)

	JMP cbcSm4Done

cbcSm4Single32:
	// Only XWORD0 / XWORD1 are loaded and stored; the other two inputs of
	// the 4-block macro hold stale values whose results are discarded.
	MOVOU -32(DX), XWORD0
	MOVOU -16(DX), XWORD1

	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	MOVUPS 0(SI), XWTMP0
	PXOR XWTMP0, XWORD0
	MOVUPS -32(DX), XWTMP0
	PXOR XWTMP0, XWORD1

	MOVUPS XWORD0, -32(BX)
	MOVUPS XWORD1, -16(BX)

	JMP cbcSm4Done

cbcSm4Single48:
	// Only XWORD0..XWORD2 are loaded and stored; XWORD3's result is discarded.
	MOVOU -48(DX), XWORD0
	MOVOU -32(DX), XWORD1
	MOVOU -16(DX), XWORD2

	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	MOVUPS 0(SI), XWTMP0
	PXOR XWTMP0, XWORD0
	MOVUPS -48(DX), XWTMP0
	PXOR XWTMP0, XWORD1
	MOVUPS -32(DX), XWTMP0
	PXOR XWTMP0, XWORD2

	MOVUPS XWORD0, -48(BX)
	MOVUPS XWORD1, -32(BX)
	MOVUPS XWORD2, -16(BX)

cbcSm4Done:
	// Publish the saved last 16 bytes of src as the new iv.
	MOVUPS X15, (SI)

cbcSm4Return:
	RET

	// ---------------------------------------------------------------
	// AVX path (128-bit VEX instructions; same chunking as the SSE path)
	// ---------------------------------------------------------------
avxStart:
	// Save the last 16 bytes of src; they become the new iv on exit.
	VMOVDQU -16(DX), X15

avxCbcSm4Octets:
	// Bulk loop: 8 blocks per iteration while MORE than 128 bytes remain.
	CMPQ DI, $128
	JLE avxCbcSm4Nibbles
	SUBQ $128, DI
	LEAQ -128(DX), DX
	LEAQ -128(BX), BX

	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3
	VMOVDQU 64(DX), XWORD4
	VMOVDQU 80(DX), XWORD5
	VMOVDQU 96(DX), XWORD6
	VMOVDQU 112(DX), XWORD7

	AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

	// XOR each result with the 16 bytes of src that precede its input block.
	VPXOR -16(DX), XWORD0, XWORD0
	VPXOR 0(DX), XWORD1, XWORD1
	VPXOR 16(DX), XWORD2, XWORD2
	VPXOR 32(DX), XWORD3, XWORD3
	VPXOR 48(DX), XWORD4, XWORD4
	VPXOR 64(DX), XWORD5, XWORD5
	VPXOR 80(DX), XWORD6, XWORD6
	VPXOR 96(DX), XWORD7, XWORD7

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)
	VMOVDQU XWORD4, 64(BX)
	VMOVDQU XWORD5, 80(BX)
	VMOVDQU XWORD6, 96(BX)
	VMOVDQU XWORD7, 112(BX)

	JMP avxCbcSm4Octets

avxCbcSm4Nibbles:
	// At most one 4-block step, taken only when more than 64 bytes remain.
	CMPQ DI, $64
	JLE avxCbcSm4Single
	SUBQ $64, DI
	LEAQ -64(DX), DX
	LEAQ -64(BX), BX

	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VPXOR -16(DX), XWORD0, XWORD0
	VPXOR 0(DX), XWORD1, XWORD1
	VPXOR 16(DX), XWORD2, XWORD2
	VPXOR 32(DX), XWORD3, XWORD3

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)

avxCbcSm4Single:
	// Tail: DI is expected to be 16, 32, 48 or 64 here; any value other than
	// 16 / 32 / 48 is handled as 64.
	CMPQ DI, $16
	JEQ avxCbcSm4Single16

	CMPQ DI, $32
	JEQ avxCbcSm4Single32

	CMPQ DI, $48
	JEQ avxCbcSm4Single48

	VMOVDQU -64(DX), XWORD0
	VMOVDQU -48(DX), XWORD1
	VMOVDQU -32(DX), XWORD2
	VMOVDQU -16(DX), XWORD3

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	// The very first block is chained to iv rather than to src.
	VPXOR 0(SI), XWORD0, XWORD0
	VPXOR -64(DX), XWORD1, XWORD1
	VPXOR -48(DX), XWORD2, XWORD2
	VPXOR -32(DX), XWORD3, XWORD3

	VMOVDQU XWORD0, -64(BX)
	VMOVDQU XWORD1, -48(BX)
	VMOVDQU XWORD2, -32(BX)
	VMOVDQU XWORD3, -16(BX)

	JMP avxCbcSm4Done

avxCbcSm4Single16:
	// The 4-block macro is used with one live input; XWORD1..XWORD3 hold
	// stale values and their results are never stored.
	VMOVDQU -16(DX), XWORD0

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VPXOR 0(SI), XWORD0, XWORD0

	VMOVDQU XWORD0, -16(BX)

	JMP avxCbcSm4Done

avxCbcSm4Single32:
	VMOVDQU -32(DX), XWORD0
	VMOVDQU -16(DX), XWORD1

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VPXOR 0(SI), XWORD0, XWORD0
	VPXOR -32(DX), XWORD1, XWORD1

	VMOVDQU XWORD0, -32(BX)
	VMOVDQU XWORD1, -16(BX)

	JMP avxCbcSm4Done

avxCbcSm4Single48:
	VMOVDQU -48(DX), XWORD0
	VMOVDQU -32(DX), XWORD1
	VMOVDQU -16(DX), XWORD2

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VPXOR 0(SI), XWORD0, XWORD0
	VPXOR -48(DX), XWORD1, XWORD1
	VPXOR -32(DX), XWORD2, XWORD2

	VMOVDQU XWORD0, -48(BX)
	VMOVDQU XWORD1, -32(BX)
	VMOVDQU XWORD2, -16(BX)

avxCbcSm4Done:
	// Publish the saved last 16 bytes of src as the new iv.
	VMOVDQU X15, (SI)
	RET

	// ---------------------------------------------------------------
	// AVX2 path (256-bit registers for the 16- and 8-block chunks,
	// 128-bit AVX code for the 4-block step and the tail)
	// ---------------------------------------------------------------
avx2Start:
	// Replicate each 128-bit constant into both halves of its 256-bit register.
	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
	VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK

	// Save the last 16 bytes of src; they become the new iv on exit.
	VMOVDQU -16(DX), X15

avx2_16blocks:
	// Bulk loop: 16 blocks per iteration while MORE than 256 bytes remain.
	CMPQ DI, $256
	JLE avx2CbcSm4Octets
	SUBQ $256, DI
	LEAQ -256(DX), DX
	LEAQ -256(BX), BX

	// Each 256-bit register receives two consecutive 16-byte blocks.
	VMOVDQU 0(DX), XDWORD0
	VMOVDQU 32(DX), XDWORD1
	VMOVDQU 64(DX), XDWORD2
	VMOVDQU 96(DX), XDWORD3
	VMOVDQU 128(DX), XDWORD4
	VMOVDQU 160(DX), XDWORD5
	VMOVDQU 192(DX), XDWORD6
	VMOVDQU 224(DX), XDWORD7

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
	VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
	VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
	VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
	VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)

	AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)

	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
	VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4
	VPSHUFB BSWAP_MASK, XDWORD5, XDWORD5
	VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6
	VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7

	// Each 32-byte XOR operand starts 16 bytes before the register's own
	// input, i.e. it covers the "previous block" of both blocks it holds.
	VPXOR -16(DX), XDWORD0, XDWORD0
	VPXOR 16(DX), XDWORD1, XDWORD1
	VPXOR 48(DX), XDWORD2, XDWORD2
	VPXOR 80(DX), XDWORD3, XDWORD3
	VPXOR 112(DX), XDWORD4, XDWORD4
	VPXOR 144(DX), XDWORD5, XDWORD5
	VPXOR 176(DX), XDWORD6, XDWORD6
	VPXOR 208(DX), XDWORD7, XDWORD7

	VMOVDQU XDWORD0, 0(BX)
	VMOVDQU XDWORD1, 32(BX)
	VMOVDQU XDWORD2, 64(BX)
	VMOVDQU XDWORD3, 96(BX)
	VMOVDQU XDWORD4, 128(BX)
	VMOVDQU XDWORD5, 160(BX)
	VMOVDQU XDWORD6, 192(BX)
	VMOVDQU XDWORD7, 224(BX)

	JMP avx2_16blocks

avx2CbcSm4Octets:
	// 8 blocks per iteration while MORE than 128 bytes remain.
	CMPQ DI, $128
	JLE avx2CbcSm4Nibbles
	SUBQ $128, DI
	LEAQ -128(DX), DX
	LEAQ -128(BX), BX

	VMOVDQU 0(DX), XDWORD0
	VMOVDQU 32(DX), XDWORD1
	VMOVDQU 64(DX), XDWORD2
	VMOVDQU 96(DX), XDWORD3

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)

	AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)

	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3

	// 32-byte XOR operands, each starting 16 bytes before the register's input.
	VPXOR -16(DX), XDWORD0, XDWORD0
	VPXOR 16(DX), XDWORD1, XDWORD1
	VPXOR 48(DX), XDWORD2, XDWORD2
	VPXOR 80(DX), XDWORD3, XDWORD3

	VMOVDQU XDWORD0, 0(BX)
	VMOVDQU XDWORD1, 32(BX)
	VMOVDQU XDWORD2, 64(BX)
	VMOVDQU XDWORD3, 96(BX)

	JMP avx2CbcSm4Octets

avx2CbcSm4Nibbles:
	// From here on the AVX2 path uses the same 128-bit code as the AVX path.
	CMPQ DI, $64
	JLE avx2CbcSm4Single
	SUBQ $64, DI
	LEAQ -64(DX), DX
	LEAQ -64(BX), BX

	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VPXOR -16(DX), XWORD0, XWORD0
	VPXOR 0(DX), XWORD1, XWORD1
	VPXOR 16(DX), XWORD2, XWORD2
	VPXOR 32(DX), XWORD3, XWORD3

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)

avx2CbcSm4Single:
	// Tail: DI is expected to be 16, 32, 48 or 64 here; any value other than
	// 16 / 32 / 48 is handled as 64.
	CMPQ DI, $16
	JEQ avx2CbcSm4Single16

	CMPQ DI, $32
	JEQ avx2CbcSm4Single32

	CMPQ DI, $48
	JEQ avx2CbcSm4Single48

	VMOVDQU -64(DX), XWORD0
	VMOVDQU -48(DX), XWORD1
	VMOVDQU -32(DX), XWORD2
	VMOVDQU -16(DX), XWORD3

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	// The very first block is chained to iv rather than to src.
	VPXOR 0(SI), XWORD0, XWORD0
	VPXOR -64(DX), XWORD1, XWORD1
	VPXOR -48(DX), XWORD2, XWORD2
	VPXOR -32(DX), XWORD3, XWORD3

	VMOVDQU XWORD0, -64(BX)
	VMOVDQU XWORD1, -48(BX)
	VMOVDQU XWORD2, -32(BX)
	VMOVDQU XWORD3, -16(BX)

	JMP avx2CbcSm4Done

avx2CbcSm4Single16:
	// The 4-block macro is used with one live input; XWORD1..XWORD3 hold
	// stale values and their results are never stored.
	VMOVDQU -16(DX), XWORD0

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VPXOR 0(SI), XWORD0, XWORD0

	VMOVDQU XWORD0, -16(BX)

	JMP avx2CbcSm4Done

avx2CbcSm4Single32:
	VMOVDQU -32(DX), XWORD0
	VMOVDQU -16(DX), XWORD1

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VPXOR 0(SI), XWORD0, XWORD0
	VPXOR -32(DX), XWORD1, XWORD1

	VMOVDQU XWORD0, -32(BX)
	VMOVDQU XWORD1, -16(BX)

	JMP avx2CbcSm4Done

avx2CbcSm4Single48:
	VMOVDQU -48(DX), XWORD0
	VMOVDQU -32(DX), XWORD1
	VMOVDQU -16(DX), XWORD2

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VPXOR 0(SI), XWORD0, XWORD0
	VPXOR -48(DX), XWORD1, XWORD1
	VPXOR -32(DX), XWORD2, XWORD2

	VMOVDQU XWORD0, -48(BX)
	VMOVDQU XWORD1, -32(BX)
	VMOVDQU XWORD2, -16(BX)

avx2CbcSm4Done:
	// Publish the saved last 16 bytes of src as the new iv, then clear the
	// upper YMM halves before returning to non-AVX code.
	VMOVDQU X15, (SI)
	VZEROUPPER
	RET