// github.com/emmansun/gmsm@v0.29.1/sm4/xts_arm64.s

//go:build !purego

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define T0 V8
#define T1 V9
#define T2 V10
#define T3 V11
#define T4 V12
#define T5 V13
#define T6 V14
#define T7 V15

#define POLY V16
#define ZERO V17
#define TW V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22

#define NIBBLE_MASK V23
#define INVERSE_SHIFT_ROWS V24
#define M1L V25
#define M1H V26
#define M2L V27
#define M2H V28
#define R08_MASK V29

#include "aesni_macros_arm64.s"
#include "xts_macros_arm64.s"

#define load8blocks \
	VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	\
	VLD1.P 64(srcPtr), [B4.S4, B5.S4, B6.S4, B7.S4]; \
	VEOR T4.B16, B4.B16, B4.B16; \
	VEOR T5.B16, B5.B16, B5.B16; \
	VEOR T6.B16, B6.B16, B6.B16; \
	VEOR T7.B16, B7.B16, B7.B16; \
	\
	VREV32 B0.B16, B0.B16; \
	VREV32 B1.B16, B1.B16; \
	VREV32 B2.B16, B2.B16; \
	VREV32 B3.B16, B3.B16; \
	VREV32 B4.B16, B4.B16; \
	VREV32 B5.B16, B5.B16; \
	VREV32 B6.B16, B6.B16; \
	VREV32 B7.B16, B7.B16; \
	\
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3); \
	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)

#define store8blocks \
	VREV32 B0.B16, B0.B16; \
	VREV32 B1.B16, B1.B16; \
	VREV32 B2.B16, B2.B16; \
	VREV32 B3.B16, B3.B16; \
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3); \
	VREV32 B4.B16, B4.B16; \
	VREV32 B5.B16, B5.B16; \
	VREV32 B6.B16, B6.B16; \
	VREV32 B7.B16, B7.B16; \
	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3); \
	\
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	VEOR T4.B16, B4.B16, B4.B16; \
	VEOR T5.B16, B5.B16, B5.B16; \
	VEOR T6.B16, B6.B16, B6.B16; \
	VEOR T7.B16, B7.B16, B7.B16; \
	\
	VST1.P [B0.S4, B1.S4, B2.S4, B3.S4], 64(dstPtr); \
	VST1.P [B4.S4, B5.S4, B6.S4, B7.S4], 64(dstPtr)

#define load4blocks \
	VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	\
	VREV32 B0.B16, B0.B16; \
	VREV32 B1.B16, B1.B16; \
	VREV32 B2.B16, B2.B16; \
	VREV32 B3.B16, B3.B16; \
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

#define store4blocks \
	VREV32 B0.B16, B0.B16; \
	VREV32 B1.B16, B1.B16; \
	VREV32 B2.B16, B2.B16; \
	VREV32 B3.B16, B3.B16; \
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3); \
	\
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	\
	VST1.P [B0.S4, B1.S4, B2.S4, B3.S4], 64(dstPtr)

#define loadOneBlock \
	VLD1.P 16(srcPtr), [B0.S4]; \
	VEOR TW.B16, B0.B16, B0.B16; \
	\
	VREV32 B0.B16, B0.B16; \
	VMOV B0.S[1], B1.S[0]; \
	VMOV B0.S[2], B2.S[0]; \
	VMOV B0.S[3], B3.S[0]

#define storeOneBlock \
	VMOV B2.S[0], B3.S[1]; \
	VMOV B1.S[0], B3.S[2]; \
	VMOV B0.S[0], B3.S[3]; \
	VREV32 B3.B16, B3.B16; \
	\
	VEOR TW.B16, B3.B16, B3.B16; \
	VST1.P [B3.S4], 16(dstPtr)
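
// The load/store macros above set up the data layout used by the SM4 round
// macros: each block is XORed with its tweak (T0..T7, presumably filled by the
// prepare*Tweaks macros, or TW for a single block), byte-swapped with VREV32
// into word order, and then transposed so that the same 32-bit word of all
// four blocks ends up in one register, which is the layout the round macros
// appear to expect. store*blocks undoes the transpose and byte swap and
// applies the tweak again, i.e. the usual XTS composition, roughly:
//
//	C_i = Enc(P_i XOR T_i) XOR T_i    (decryption is the mirror image)
//
// PRE_TRANSPOSE_MATRIX, TRANSPOSE_MATRIX and the SM4 round macros are
// presumably defined in the included aesni_macros_arm64.s; the tweak
// preparation and doubling macros (prepare*Tweaks, mul2Inline, mul2GBInline)
// in xts_macros_arm64.s.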

#define dstPtr R2
#define srcPtr R3
#define rk R0
#define twPtr R1
#define srcPtrLen R4
#define I R5
#define rkSave R6

// func encryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·encryptSm4Xts(SB),0,$128-64
	LOAD_SM4_AESNI_CONSTS()
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD $0x87, I
	VMOV I, POLY.D[0]

	MOVD rk, rkSave
	VLD1 (twPtr), [TW.B16]

xtsSm4EncOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4EncNibbles
	SUB $128, srcPtrLen

	prepare8Tweaks
	load8blocks
	MOVD rkSave, rk
	EOR R13, R13

encOctetsEnc8Blocks:
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

	ADD $1, R13
	CMP $8, R13
	BNE encOctetsEnc8Blocks

	store8blocks
	B xtsSm4EncOctets

xtsSm4EncNibbles:
	CMP $64, srcPtrLen
	BLT xtsSm4EncSingles
	SUB $64, srcPtrLen

	prepare4Tweaks
	load4blocks
	MOVD rkSave, rk
	EOR R13, R13

encNibblesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE encNibblesEnc4Blocks

	store4blocks

xtsSm4EncSingles:
	CMP $16, srcPtrLen
	BLT xtsSm4EncTail
	SUB $16, srcPtrLen

	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

encSinglesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE encSinglesEnc4Blocks

	storeOneBlock
	mul2Inline
	B xtsSm4EncSingles

xtsSm4EncTail:
	CBZ srcPtrLen, xtsSm4EncDone
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8
	VLD1 (R7), [B0.B16]
	VST1 [B0.B16], (R8)

	TBZ $3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

less_than8:
	TBZ $2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

less_than4:
	TBZ $1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

less_than2:
	TBZ $0, srcPtrLen, xtsSm4EncTailEnc
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

xtsSm4EncTailEnc:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	VMOV B0.S[1], B1.S[0]
	VMOV B0.S[2], B2.S[0]
	VMOV B0.S[3], B3.S[0]

	MOVD rkSave, rk
	EOR R13, R13

tailEncLoop:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE tailEncLoop

	VMOV B2.S[0], B3.S[1]
	VMOV B1.S[0], B3.S[2]
	VMOV B0.S[0], B3.S[3]
	VREV32 B3.B16, B3.B16

	VEOR TW.B16, B3.B16, B3.B16
	VST1 [B3.B16], (R9)

xtsSm4EncDone:
	VST1 [TW.B16], (twPtr)
	RET
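
// encryptSm4Xts keeps the running tweak in TW and advances it with mul2Inline
// between single blocks (the 4- and 8-block paths derive their tweaks via
// prepare4Tweaks/prepare8Tweaks). A hedged scalar sketch of the standard
// IEEE P1619 doubling that mul2Inline is assumed to perform, with the tweak
// viewed as a 128-bit little-endian integer and POLY.D[0] = 0x87:
//
//	carry = tweak >> 127
//	tweak = tweak << 1
//	if carry != 0 {
//		tweak ^= 0x87    // x^128 reduced by x^7 + x^2 + x + 1
//	}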

// func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·encryptSm4XtsGB(SB),0,$128-64
	LOAD_SM4_AESNI_CONSTS()
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD $0xE1, I
	LSL $56, I
	VMOV I, POLY.D[1]

	MOVD rk, rkSave
	VLD1 (twPtr), [TW.B16]

xtsSm4EncOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4EncNibbles
	SUB $128, srcPtrLen

	prepareGB8Tweaks
	load8blocks
	MOVD rkSave, rk
	EOR R13, R13

encOctetsEnc8Blocks:
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

	ADD $1, R13
	CMP $8, R13
	BNE encOctetsEnc8Blocks

	store8blocks
	B xtsSm4EncOctets

xtsSm4EncNibbles:
	CMP $64, srcPtrLen
	BLT xtsSm4EncSingles
	SUB $64, srcPtrLen

	prepareGB4Tweaks
	load4blocks
	MOVD rkSave, rk
	EOR R13, R13

encNibblesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE encNibblesEnc4Blocks

	store4blocks

xtsSm4EncSingles:
	CMP $16, srcPtrLen
	BLT xtsSm4EncTail
	SUB $16, srcPtrLen

	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

encSinglesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE encSinglesEnc4Blocks

	storeOneBlock
	mul2GBInline
	B xtsSm4EncSingles

xtsSm4EncTail:
	CBZ srcPtrLen, xtsSm4EncDone
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8
	VLD1 (R7), [B0.B16]
	VST1 [B0.B16], (R8)

	TBZ $3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

less_than8:
	TBZ $2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

less_than4:
	TBZ $1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

less_than2:
	TBZ $0, srcPtrLen, xtsSm4EncTailEnc
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

xtsSm4EncTailEnc:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	VMOV B0.S[1], B1.S[0]
	VMOV B0.S[2], B2.S[0]
	VMOV B0.S[3], B3.S[0]

	MOVD rkSave, rk
	EOR R13, R13

tailEncLoop:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE tailEncLoop

	VMOV B2.S[0], B3.S[1]
	VMOV B1.S[0], B3.S[2]
	VMOV B0.S[0], B3.S[3]
	VREV32 B3.B16, B3.B16

	VEOR TW.B16, B3.B16, B3.B16
	VST1 [B3.B16], (R9)

xtsSm4EncDone:
	VST1 [TW.B16], (twPtr)
	RET
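
// encryptSm4XtsGB differs from encryptSm4Xts only in the tweak arithmetic:
// POLY.D[1] is loaded with 0xE1<<56 and the GB variants of the tweak macros
// (prepareGB*Tweaks, mul2GBInline) are used. A hedged sketch of the reflected
// doubling this constant suggests, with the tweak bits taken in the opposite
// order to the IEEE P1619 convention (0xE1 is the bit-reverse of 0x87):
//
//	carry = tweak & 1
//	tweak = tweak >> 1
//	if carry != 0 {
//		tweak ^= 0xE1 << 120    // reflected form of the same polynomial
//	}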

// func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·decryptSm4Xts(SB),0,$128-64
	LOAD_SM4_AESNI_CONSTS()
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD $0x87, I
	VMOV I, POLY.D[0]

	MOVD rk, rkSave
	VLD1 (twPtr), [TW.B16]

xtsSm4DecOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4DecNibbles
	SUB $128, srcPtrLen

	prepare8Tweaks
	load8blocks
	MOVD rkSave, rk
	EOR R13, R13

decOctetsDec8Blocks:
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

	ADD $1, R13
	CMP $8, R13
	BNE decOctetsDec8Blocks

	store8blocks
	B xtsSm4DecOctets

xtsSm4DecNibbles:
	CMP $64, srcPtrLen
	BLT xtsSm4DecSingles
	SUB $64, srcPtrLen

	prepare4Tweaks
	load4blocks
	MOVD rkSave, rk
	EOR R13, R13

decNibblesDec4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE decNibblesDec4Blocks

	store4blocks

xtsSm4DecSingles:
	CMP $32, srcPtrLen
	BLT xtsSm4DecTail
	SUB $16, srcPtrLen

	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

decSinglesDec4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE decSinglesDec4Blocks

	storeOneBlock
	mul2Inline

	B xtsSm4DecSingles

xtsSm4DecTail:
	CBZ srcPtrLen, xtsSm4DecDone

	CMP $16, srcPtrLen
	BEQ xtsSm4DecLastBlock

	VMOV TW.B16, B4.B16
	mul2Inline
	loadOneBlock
	MOVD rkSave, rk
	EOR R13, R13

decLastCompleteBlockLoop:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE decLastCompleteBlockLoop
	storeOneBlock
	VMOV B4.B16, TW.B16
	VST1 [B3.B16], (RSP)

	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8

	TBZ $3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

less_than8:
	TBZ $2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

less_than4:
	TBZ $1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

less_than2:
	TBZ $0, srcPtrLen, xtsSm4DecTailDec
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

xtsSm4DecTailDec:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	VMOV B0.S[1], B1.S[0]
	VMOV B0.S[2], B2.S[0]
	VMOV B0.S[3], B3.S[0]

	MOVD rkSave, rk
	EOR R13, R13

tailDecLoop:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE tailDecLoop

	VMOV B2.S[0], B3.S[1]
	VMOV B1.S[0], B3.S[2]
	VMOV B0.S[0], B3.S[3]
	VREV32 B3.B16, B3.B16

	VEOR TW.B16, B3.B16, B3.B16
	VST1 [B3.B16], (R9)

	B xtsSm4DecDone

xtsSm4DecLastBlock:
	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

decLastBlockLoop:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE decLastBlockLoop

	storeOneBlock
	mul2Inline

xtsSm4DecDone:
	VST1 [TW.B16], (twPtr)
	RET
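
// Note on the decrypt tail: the singles loop above stops while at least 32
// bytes remain (CMP $32 rather than $16), so a trailing partial block is
// always preceded by one full block. For ciphertext stealing that full block
// must be decrypted with the next tweak while the partial block uses the
// current one, which is why the code saves TW in B4, advances it with
// mul2Inline for the full block, and then restores it from B4 before
// processing the stolen tail.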

// func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·decryptSm4XtsGB(SB),0,$128-64
	LOAD_SM4_AESNI_CONSTS()
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD $0xE1, I
	LSL $56, I
	VMOV I, POLY.D[1]

	MOVD rk, rkSave
	VLD1 (twPtr), [TW.B16]

xtsSm4DecOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4DecNibbles
	SUB $128, srcPtrLen

	prepareGB8Tweaks
	load8blocks
	MOVD rkSave, rk
	EOR R13, R13

decOctetsDec8Blocks:
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

	ADD $1, R13
	CMP $8, R13
	BNE decOctetsDec8Blocks

	store8blocks
	B xtsSm4DecOctets

xtsSm4DecNibbles:
	CMP $64, srcPtrLen
	BLT xtsSm4DecSingles
	SUB $64, srcPtrLen

	prepareGB4Tweaks
	load4blocks
	MOVD rkSave, rk
	EOR R13, R13

decNibblesDec4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE decNibblesDec4Blocks

	store4blocks

xtsSm4DecSingles:
	CMP $32, srcPtrLen
	BLT xtsSm4DecTail
	SUB $16, srcPtrLen

	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

decSinglesDec4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE decSinglesDec4Blocks

	storeOneBlock
	mul2GBInline

	B xtsSm4DecSingles

xtsSm4DecTail:
	CBZ srcPtrLen, xtsSm4DecDone

	CMP $16, srcPtrLen
	BEQ xtsSm4DecLastBlock

	VMOV TW.B16, B4.B16
	mul2GBInline
	loadOneBlock
	MOVD rkSave, rk
	EOR R13, R13

decLastCompleteBlockLoop:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE decLastCompleteBlockLoop
	storeOneBlock
	VMOV B4.B16, TW.B16
	VST1 [B3.B16], (RSP)

	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8

	TBZ $3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

less_than8:
	TBZ $2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

less_than4:
	TBZ $1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

less_than2:
	TBZ $0, srcPtrLen, xtsSm4DecTailDec
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

xtsSm4DecTailDec:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	VMOV B0.S[1], B1.S[0]
	VMOV B0.S[2], B2.S[0]
	VMOV B0.S[3], B3.S[0]

	MOVD rkSave, rk
	EOR R13, R13

tailDecLoop:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE tailDecLoop

	VMOV B2.S[0], B3.S[1]
	VMOV B1.S[0], B3.S[2]
	VMOV B0.S[0], B3.S[3]
	VREV32 B3.B16, B3.B16

	VEOR TW.B16, B3.B16, B3.B16
	VST1 [B3.B16], (R9)

	B xtsSm4DecDone

xtsSm4DecLastBlock:
	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

decLastBlockLoop:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
	ADD $1, R13
	CMP $8, R13
	BNE decLastBlockLoop

	storeOneBlock
	mul2GBInline

xtsSm4DecDone:
	VST1 [TW.B16], (twPtr)
	RET
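
// Each round loop in this file runs 8 iterations of four
// SM4_ROUND/SM4_8BLOCKS_ROUND invocations, i.e. 32 round applications,
// matching SM4's 32 rounds; the block registers are rotated one position per
// call so every register takes each role once per iteration. How the round
// macros consume the expanded key at rk is assumed to be defined in the
// included aesni_macros_arm64.s.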