//go:build !purego

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define NIBBLE_MASK V23
#define INVERSE_SHIFT_ROWS V24
#define M1L V25
#define M1H V26
#define M2L V27
#define M2H V28
#define R08_MASK V29

#define reduce() \
	VEOR ACC0.B16, ACCM.B16, ACCM.B16 \
	VEOR ACC1.B16, ACCM.B16, ACCM.B16 \
	VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \
	VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR ACCM.B16, ACC0.B16, ACC0.B16 \
	VEOR T0.B16, ACC1.B16, ACC1.B16 \
	VPMULL POLY.D1, ACC0.D1, T0.Q1 \
	VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR T0.B16, ACC0.B16, ACC0.B16 \
	VPMULL POLY.D1, ACC0.D1, T0.Q1 \
	VEOR T0.B16, ACC1.B16, ACC1.B16 \
	VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR ACC1.B16, ACC0.B16, ACC0.B16 \

// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	MOVD $0xC2, R1
	LSL $56, R1
	MOVD $1, R0
	VMOV R1, POLY.D[0]
	VMOV R0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD productTable+0(FP), pTbl
	MOVD tagMask+8(FP), tMsk
	MOVD T+16(FP), tPtr
	MOVD pLen+24(FP), plen
	MOVD dLen+32(FP), dlen

	VLD1 (tPtr), [ACC0.B16]
	VLD1 (tMsk), [B1.B16]

	LSL $3, plen
	LSL $3, dlen

	VMOV dlen, B0.D[0]
	VMOV plen, B0.D[1]

	ADD $14*16, pTbl
	VLD1.P (pTbl), [T1.B16, T2.B16]

	VEOR ACC0.B16, B0.B16, B0.B16

	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	reduce()

	VREV64 ACC0.B16, ACC0.B16
	VEOR B1.B16, ACC0.B16, ACC0.B16

	VST1 [ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

#include "aesni_macros_arm64.s"

// func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define pTbl R0
#define RK R1
#define I R2

	MOVD productTable+0(FP), pTbl
	MOVD rk+8(FP), RK
	MOVD inst+16(FP), R5

	MOVD $0xC2, I
	LSL $56, I
	VMOV I, POLY.D[0]
	MOVD $1, I
	VMOV I, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the SM4 keys to generate the hash key H
	CMP $1, R5
	BEQ sm4InitSM4E

	LOAD_SM4_AESNI_CONSTS()
	VEOR B0.B16, B0.B16, B0.B16
	VEOR B1.B16, B1.B16, B1.B16
	VEOR B2.B16, B2.B16, B2.B16
	VEOR B3.B16, B3.B16, B3.B16
	EOR R3, R3

sm4InitEncLoop:
	SM4_ROUND(RK, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(RK, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(RK, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(RK, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R3
	CMP $8, R3
	BNE sm4InitEncLoop

	VMOV B0.S[0], B0.S[2]
	VMOV B1.S[0], B0.S[3]
	VMOV B2.S[0], B0.S[0]
	VMOV B3.S[0], B0.S[1]
	B sm4InitEncDone
sm4InitSM4E:
	VEOR B0.B16, B0.B16, B0.B16
	VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
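	// Note: the WORD directives below hand-encode the Armv8.2 SM4E
	// instruction (SM4E V0.4S, Vn.4S), which the Go assembler cannot emit
	// by name. Each SM4E applies four SM4 rounds to the state in V0 using
	// the four round keys held in Vn, so the eight SM4E instructions below
	// consume all 32 round keys loaded into T0-T3 (V11-V14).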
	WORD $0x6085c0ce //SM4E V0.4S, V11.4S
	WORD $0x8085c0ce //SM4E V0.4S, V12.4S
	WORD $0xa085c0ce //SM4E V0.4S, V13.4S
	WORD $0xc085c0ce //SM4E V0.4S, V14.4S
	VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
	WORD $0x6085c0ce //SM4E V0.4S, V11.4S
	WORD $0x8085c0ce //SM4E V0.4S, V12.4S
	WORD $0xa085c0ce //SM4E V0.4S, V13.4S
	WORD $0xc085c0ce //SM4E V0.4S, V14.4S
	VREV32 B0.B16, B0.B16
	VREV64 B0.B16, B0.B16
sm4InitEncDone:
	// Multiply by 2 modulo P
	VMOV B0.D[0], I
	ASR $63, I
	VMOV I, T1.D[0]
	VMOV I, T1.D[1]
	VAND POLY.B16, T1.B16, T1.B16
	VUSHR $63, B0.D2, T2.D2
	VEXT $8, ZERO.B16, T2.B16, T2.B16
	VSLI $1, B0.D2, T2.D2
	VEOR T1.B16, T2.B16, B0.B16

	// Karatsuba pre-computation
	VEXT $8, B0.B16, B0.B16, B1.B16
	VEOR B0.B16, B1.B16, B1.B16

	ADD $14*16, pTbl

	VST1 [B0.B16, B1.B16], (pTbl)
	SUB $2*16, pTbl

	VMOV B0.B16, B2.B16
	VMOV B1.B16, B3.B16

	MOVD $7, I

initLoop:
	// Compute powers of H
	SUBS $1, I

	VPMULL B0.D1, B2.D1, T1.Q1
	VPMULL2 B0.D2, B2.D2, T0.Q1
	VPMULL B1.D1, B3.D1, T2.Q1
	VEOR T0.B16, T2.B16, T2.B16
	VEOR T1.B16, T2.B16, T2.B16
	VEXT $8, ZERO.B16, T2.B16, T3.B16
	VEXT $8, T2.B16, ZERO.B16, T2.B16
	VEOR T2.B16, T0.B16, T0.B16
	VEOR T3.B16, T1.B16, T1.B16
	VPMULL POLY.D1, T0.D1, T2.Q1
	VEXT $8, T0.B16, T0.B16, T0.B16
	VEOR T2.B16, T0.B16, T0.B16
	VPMULL POLY.D1, T0.D1, T2.Q1
	VEXT $8, T0.B16, T0.B16, T0.B16
	VEOR T2.B16, T0.B16, T0.B16
	VEOR T1.B16, T0.B16, B2.B16
	VMOV B2.B16, B3.B16
	VEXT $8, B2.B16, B2.B16, B2.B16
	VEOR B2.B16, B3.B16, B3.B16

	VST1 [B2.B16, B3.B16], (pTbl)
	SUB $2*16, pTbl

	BNE initLoop
	RET
#undef I
#undef RK
#undef pTbl

// func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmSm4Data(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

#define mulRound(X) \
	VLD1.P 32(pTbl), [T1.B16, T2.B16] \
	VREV64 X.B16, X.B16 \
	VEXT $8, X.B16, X.B16, T0.B16 \
	VEOR X.B16, T0.B16, T0.B16 \
	VPMULL X.D1, T1.D1, T3.Q1 \
	VEOR T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2 X.D2, T1.D2, T3.Q1 \
	VEOR T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL T0.D1, T2.D1, T3.Q1 \
	VEOR T3.B16, ACCM.B16, ACCM.B16

	MOVD productTable+0(FP), pTbl
	MOVD data_base+8(FP), aut
	MOVD data_len+16(FP), autLen
	MOVD T+32(FP), tPtr

	VEOR ACC0.B16, ACC0.B16, ACC0.B16
	//VLD1 (tPtr), [ACC0.B16] // originally we passed in tag initial value
	CBZ autLen, dataBail

	MOVD $0xC2, H0
	LSL $56, H0
	VMOV H0, POLY.D[0]
	MOVD $1, H0
	VMOV H0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	MOVD pTbl, pTblSave

	CMP $13, autLen
	BEQ dataTLS
	CMP $128, autLen
	BLT startSinglesLoop
	B octetsLoop

dataTLS:
	ADD $14*16, pTbl
	VLD1.P (pTbl), [T1.B16, T2.B16]
	VEOR B0.B16, B0.B16, B0.B16

	MOVD (aut), H0
	VMOV H0, B0.D[0]
	MOVW 8(aut), H0
	VMOV H0, B0.S[2]
	MOVB 12(aut), H0
	VMOV H0, B0.B[12]

	MOVD $0, autLen
	B dataMul

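	// Note: octetsLoop hashes the AAD eight blocks at a time with a single
	// deferred reduction: each block is multiplied by the descending power
	// of H stored in productTable (as laid out by gcmSm4Init), the
	// Karatsuba partial products accumulate in ACC0/ACC1/ACCM via
	// mulRound, and reduce() folds the 256-bit sum back to 128 bits once
	// per iteration.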
octetsLoop:
	CMP $128, autLen
	BLT startSinglesLoop
	SUB $128, autLen

	VLD1.P 32(aut), [B0.B16, B1.B16]

	VLD1.P 32(pTbl), [T1.B16, T2.B16]
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16
	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	VLD1.P 32(aut), [B2.B16, B3.B16]
	mulRound(B2)
	mulRound(B3)
	VLD1.P 32(aut), [B4.B16, B5.B16]
	mulRound(B4)
	mulRound(B5)
	VLD1.P 32(aut), [B6.B16, B7.B16]
	mulRound(B6)
	mulRound(B7)

	MOVD pTblSave, pTbl
	reduce()
	B octetsLoop

startSinglesLoop:

	ADD $14*16, pTbl
	VLD1.P (pTbl), [T1.B16, T2.B16]

singlesLoop:

	CMP $16, autLen
	BLT dataEnd
	SUB $16, autLen

	VLD1.P 16(aut), [B0.B16]
dataMul:
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16

	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	reduce()

	B singlesLoop

dataEnd:

	CBZ autLen, dataBail
	VEOR B0.B16, B0.B16, B0.B16
	ADD autLen, aut

dataLoadLoop:
	MOVB.W -1(aut), H0
	VEXT $15, B0.B16, ZERO.B16, B0.B16
	VMOV H0, B0.B[0]
	SUBS $1, autLen
	BNE dataLoadLoop
	B dataMul

dataBail:
	VST1 [ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

// func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
#define mulRoundSingleWithoutRev(X) \
	VEOR ACC0.B16, X.B16, X.B16 \
	VEXT $8, X.B16, X.B16, T0.B16 \
	VEOR X.B16, T0.B16, T0.B16 \
	VPMULL X.D1, T1.D1, ACC1.Q1 \
	VPMULL2 X.D2, T1.D2, ACC0.Q1 \
	VPMULL T0.D1, T2.D1, ACCM.Q1 \
	reduce() \

#define mulRoundSingle(X) \
	VREV64 X.B16, X.B16 \
	mulRoundSingleWithoutRev(X) \

	MOVD productTable+0(FP), pTbl
	MOVD dst+8(FP), dstPtr
	MOVD src_base+32(FP), srcPtr
	MOVD src_len+40(FP), srcPtrLen
	MOVD ctr+56(FP), ctrPtr
	MOVD T+64(FP), tPtr
	MOVD rk_base+72(FP), rk

	MOVD $0xC2, H1
	LSL $56, H1
	MOVD $1, H0
	VMOV H1, POLY.D[0]
	VMOV H0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD pTbl, pTblSave
	MOVD rk, rkSave
	// Current tag, after AAD
	VLD1 (tPtr), [ACC0.B16]
	VEOR ACC1.B16, ACC1.B16, ACC1.B16
	VEOR ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1 (ctrPtr), [CTR.B16]
	VEOR INC.B16, INC.B16, INC.B16
	MOVD $1, H0
	VMOV H0, INC.S[3]
	VREV32 CTR.B16, CTR.B16
	VADD CTR.S4, INC.S4, CTR.S4

	// Skip to <8 blocks loop
	CMP $128, srcPtrLen

	LOAD_SM4_AESNI_CONSTS()

	BLT encNibblesLoop
	// There are at least 8 blocks to encrypt

encOctetsLoop:
	SUB $128, srcPtrLen
	// Prepare 8 counters
	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, B4.S4
	VADD B4.S4, INC.S4, B5.S4
	VADD B5.S4, INC.S4, B6.S4
	VADD B6.S4, INC.S4, B7.S4
	VADD B7.S4, INC.S4, CTR.S4

	// Encrypt the 8 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

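	// Note: the counter blocks were transposed by PRE_TRANSPOSE_MATRIX so
	// that each SM4_8BLOCKS_ROUND (defined in aesni_macros_arm64.s)
	// advances all eight blocks by one round in parallel; eight passes of
	// the four calls below give SM4's 32 rounds, after which the state is
	// byte-swapped and transposed back.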
encOctetsEnc8Blocks:
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

	ADD $1, R13
	CMP $8, R13
	BNE encOctetsEnc8Blocks
	VREV32 B0.B16, B0.B16
	VREV32 B1.B16, B1.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	VREV32 B4.B16, B4.B16
	VREV32 B5.B16, B5.B16
	VREV32 B6.B16, B6.B16
	VREV32 B7.B16, B7.B16
	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)

	// XOR plaintext and store ciphertext
	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B0.B16, T1.B16, B0.B16
	VEOR B1.B16, T2.B16, B1.B16
	VST1.P [B0.B16, B1.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B2.B16, T1.B16, B2.B16
	VEOR B3.B16, T2.B16, B3.B16
	VST1.P [B2.B16, B3.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B4.B16, T1.B16, B4.B16
	VEOR B5.B16, T2.B16, B5.B16
	VST1.P [B4.B16, B5.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B6.B16, T1.B16, B6.B16
	VEOR B7.B16, T2.B16, B7.B16
	VST1.P [B6.B16, B7.B16], 32(dstPtr)

	VLD1.P 32(pTbl), [T1.B16, T2.B16]
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16
	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	mulRound(B2)
	mulRound(B3)
	mulRound(B4)
	mulRound(B5)
	mulRound(B6)
	mulRound(B7)
	MOVD pTblSave, pTbl
	reduce()

	CMP $128, srcPtrLen
	BGE encOctetsLoop

encNibblesLoop:
	CBZ srcPtrLen, encDone
	ADD $14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P (pTbl), [T1.B16, T2.B16]

	CMP $64, srcPtrLen
	BLT encStartSingles
	SUB $64, srcPtrLen

	// Prepare 4 counters
	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, CTR.S4

	// Encrypt the 4 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

encNibblesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE encNibblesEnc4Blocks
	VREV32 B0.B16, B0.B16
	VREV32 B1.B16, B1.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

	// XOR plaintext and store ciphertext
	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
	VEOR B0.B16, K1.B16, B0.B16
	VEOR B1.B16, K2.B16, B1.B16
	VST1.P [B0.B16, B1.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
	VEOR B2.B16, K1.B16, B2.B16
	VEOR B3.B16, K2.B16, B3.B16
	VST1.P [B2.B16, B3.B16], 32(dstPtr)

	mulRoundSingle(B0)
	mulRoundSingle(B1)
	mulRoundSingle(B2)
	mulRoundSingle(B3)

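	// Note: encStartSingles generates one last batch of four keystream
	// blocks and consumes them one at a time for the remaining (<4) full
	// blocks. encTail then gathers the final 1-15 plaintext bytes into T0
	// together with a matching all-ones mask in T3; K0 = (K0 ^ T0) & T3
	// zero-pads the partial ciphertext block so it can be hashed as GHASH
	// requires, and the full 16-byte store presumably relies on the tag
	// space that follows the ciphertext in dst.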
encStartSingles:
	CBZ srcPtrLen, encDone

	// Prepare 4 counters
	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, CTR.S4

	// Encrypt the 4 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

encSinglesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE encSinglesEnc4Blocks
	VREV32 B0.B16, B0.B16
	VREV32 B1.B16, B1.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

	VMOV B0.B16, K0.B16
	CMP $16, srcPtrLen
	BLT encTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingle(K0)

	VMOV B1.B16, K0.B16
	CMP $16, srcPtrLen
	BLT encTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingle(K0)

	VMOV B2.B16, K0.B16
	CMP $16, srcPtrLen
	BLT encTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingle(K0)

	VMOV B3.B16, K0.B16
	CMP $16, srcPtrLen
	BLT encTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingle(K0)

encTail:
	CBZ srcPtrLen, encDone
	VEOR T0.B16, T0.B16, T0.B16
	VEOR T3.B16, T3.B16, T3.B16
	MOVD $0, H1
	SUB $1, H1
	ADD srcPtrLen, srcPtr

	TBZ $3, srcPtrLen, ld4
	MOVD.W -8(srcPtr), H0
	VMOV H0, T0.D[0]
	VMOV H1, T3.D[0]
ld4:
	TBZ $2, srcPtrLen, ld2
	MOVW.W -4(srcPtr), H0
	VEXT $12, T0.B16, ZERO.B16, T0.B16
	VEXT $12, T3.B16, ZERO.B16, T3.B16
	VMOV H0, T0.S[0]
	VMOV H1, T3.S[0]
ld2:
	TBZ $1, srcPtrLen, ld1
	MOVH.W -2(srcPtr), H0
	VEXT $14, T0.B16, ZERO.B16, T0.B16
	VEXT $14, T3.B16, ZERO.B16, T3.B16
	VMOV H0, T0.H[0]
	VMOV H1, T3.H[0]
ld1:
	TBZ $0, srcPtrLen, ld0
	MOVB.W -1(srcPtr), H0
	VEXT $15, T0.B16, ZERO.B16, T0.B16
	VEXT $15, T3.B16, ZERO.B16, T3.B16
	VMOV H0, T0.B[0]
	VMOV H1, T3.B[0]
ld0:
	MOVD ZR, srcPtrLen
	VEOR T0.B16, K0.B16, K0.B16
	VAND T3.B16, K0.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingle(K0)

encDone:
	VST1 [ACC0.B16], (tPtr)
	RET

// func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Dec(SB),NOSPLIT,$0
	MOVD productTable+0(FP), pTbl
	MOVD dst+8(FP), dstPtr
	MOVD src_base+32(FP), srcPtr
	MOVD src_len+40(FP), srcPtrLen
	MOVD ctr+56(FP), ctrPtr
	MOVD T+64(FP), tPtr
	MOVD rk_base+72(FP), rk

	MOVD $0xC2, H1
	LSL $56, H1
	MOVD $1, H0
	VMOV H1, POLY.D[0]
	VMOV H0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD pTbl, pTblSave
	MOVD rk, rkSave
	// Current tag, after AAD
	VLD1 (tPtr), [ACC0.B16]
	VEOR ACC1.B16, ACC1.B16, ACC1.B16
	VEOR ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1 (ctrPtr), [CTR.B16]
	VEOR INC.B16, INC.B16, INC.B16
	MOVD $1, H0
	VMOV H0, INC.S[3]
	VREV32 CTR.B16, CTR.B16
	VADD CTR.S4, INC.S4, CTR.S4

	// Skip to <8 blocks loop
	CMP $128, srcPtrLen

	LOAD_SM4_AESNI_CONSTS()

	BLT decNibblesLoop
	// There are at least 8 blocks to decrypt

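	// Note: the decrypt path mirrors encOctetsLoop, but GHASH must be
	// computed over the ciphertext: each 32-byte load from srcPtr is both
	// XORed with the keystream (producing plaintext in T1/T2 for the
	// store) and fed, byte-reversed, into the mulRound accumulation before
	// the single reduce() at the end of the iteration.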
decOctetsLoop:
	SUB $128, srcPtrLen

	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, B4.S4
	VADD B4.S4, INC.S4, B5.S4
	VADD B5.S4, INC.S4, B6.S4
	VADD B6.S4, INC.S4, B7.S4
	VADD B7.S4, INC.S4, CTR.S4

	// Encrypt the 8 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

decOctetsEnc8Blocks:
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

	ADD $1, R13
	CMP $8, R13
	BNE decOctetsEnc8Blocks
	VREV32 B0.B16, T1.B16
	VREV32 B1.B16, T2.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(T1, T2, B2, B3, K0, K1, K2, K3)
	VREV32 B4.B16, B4.B16
	VREV32 B5.B16, B5.B16
	VREV32 B6.B16, B6.B16
	VREV32 B7.B16, B7.B16
	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B0.B16, T1.B16, T1.B16
	VEOR B1.B16, T2.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)

	VLD1.P 32(pTbl), [T1.B16, T2.B16]
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16
	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1
	mulRound(B1)

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B2.B16, B0.B16, T1.B16
	VEOR B3.B16, B1.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B4.B16, B0.B16, T1.B16
	VEOR B5.B16, B1.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B6.B16, B0.B16, T1.B16
	VEOR B7.B16, B1.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	MOVD pTblSave, pTbl
	reduce()

	CMP $128, srcPtrLen
	BGE decOctetsLoop

decNibblesLoop:
	CBZ srcPtrLen, decDone
	ADD $14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P (pTbl), [T1.B16, T2.B16]
	CMP $64, srcPtrLen
	BLT decStartSingles
	SUB $64, srcPtrLen

	// Prepare 4 counters
	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, CTR.S4

	// Encrypt the 4 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

decNibblesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE decNibblesEnc4Blocks
	VREV32 B0.B16, B0.B16
	VREV32 B1.B16, B1.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

	// XOR ciphertext and store plaintext
	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
	VREV64 K1.B16, B4.B16
	VREV64 K2.B16, B5.B16
	VEOR B0.B16, K1.B16, B0.B16
	VEOR B1.B16, K2.B16, B1.B16
	VST1.P [B0.B16, B1.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
	VREV64 K1.B16, B6.B16
	VREV64 K2.B16, B7.B16
	VEOR B2.B16, K1.B16, B2.B16
	VEOR B3.B16, K2.B16, B3.B16
	VST1.P [B2.B16, B3.B16], 32(dstPtr)
	mulRoundSingleWithoutRev(B4)
	mulRoundSingleWithoutRev(B5)
	mulRoundSingleWithoutRev(B6)
	mulRoundSingleWithoutRev(B7)
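	// Note: decStartSingles handles the remaining (<4) full ciphertext
	// blocks one at a time, hashing the byte-reversed ciphertext (B5)
	// rather than the plaintext. decTail then reads a full 16 bytes from
	// srcPtr (see the comment there about the trailing tag), writes only
	// the valid prefix of the plaintext to dstPtr, masks the loaded
	// ciphertext with T3 down to the message length, and hashes that
	// zero-padded block before the final reduce().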

decStartSingles:
	CBZ srcPtrLen, decDone

	// Prepare 4 counters
	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, CTR.S4

	// Encrypt the 4 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

decSinglesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE decSinglesEnc4Blocks
	VREV32 B0.B16, B0.B16
	VREV32 B1.B16, B1.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

	VMOV B0.B16, K0.B16
	CMP $16, srcPtrLen
	BLT decTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VREV64 K1.B16, B5.B16
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingleWithoutRev(B5)

	VMOV B1.B16, K0.B16
	CMP $16, srcPtrLen
	BLT decTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VREV64 K1.B16, B5.B16
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingleWithoutRev(B5)

	VMOV B2.B16, K0.B16
	CMP $16, srcPtrLen
	BLT decTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VREV64 K1.B16, B5.B16
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingleWithoutRev(B5)

	VMOV B3.B16, K0.B16
	CMP $16, srcPtrLen
	BLT decTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VREV64 K1.B16, B5.B16
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingleWithoutRev(B5)

decTail:
	CBZ srcPtrLen, decDone
	// Assuming it is safe to load past srcPtr due to the presence of the tag
	VLD1 (srcPtr), [B5.B16]

	VEOR B5.B16, K0.B16, B0.B16

	VEOR T3.B16, T3.B16, T3.B16
	MOVD $0, H1
	SUB $1, H1

	TBZ $3, srcPtrLen, decLd4
	VMOV B0.D[0], H0
	MOVD.P H0, 8(dstPtr)
	VMOV H1, T3.D[0]
	VEXT $8, ZERO.B16, B0.B16, B0.B16

decLd4:
	TBZ $2, srcPtrLen, decLd2
	VMOV B0.S[0], H0
	MOVW.P H0, 4(dstPtr)
	VEXT $12, T3.B16, ZERO.B16, T3.B16
	VMOV H1, T3.S[0]
	VEXT $4, ZERO.B16, B0.B16, B0.B16
decLd2:
	TBZ $1, srcPtrLen, decLd1
	VMOV B0.H[0], H0
	MOVH.P H0, 2(dstPtr)
	VEXT $14, T3.B16, ZERO.B16, T3.B16
	VMOV H1, T3.H[0]
	VEXT $2, ZERO.B16, B0.B16, B0.B16
decLd1:
	TBZ $0, srcPtrLen, decLd0
	VMOV B0.B[0], H0
	MOVB.P H0, 1(dstPtr)
	VEXT $15, T3.B16, ZERO.B16, T3.B16
	VMOV H1, T3.B[0]
decLd0:

	VAND T3.B16, B5.B16, B5.B16
	VREV64 B5.B16, B5.B16

	VEOR ACC0.B16, B5.B16, B5.B16
	VEXT $8, B5.B16, B5.B16, T0.B16
	VEOR B5.B16, T0.B16, T0.B16
	VPMULL B5.D1, T1.D1, ACC1.Q1
	VPMULL2 B5.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1
	reduce()

decDone:
	VST1 [ACC0.B16], (tPtr)
	RET