github.com/emmansun/gmsm@v0.29.1/sm4/gcm_sm4ni_arm64.s

//go:build !purego

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26

#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \

#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

#include "sm4ni_macros_arm64.s"

// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT	·gcmSm4niEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	rk_base+72(FP), rk

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	rk, H0
	// For SM4 round keys are stored in: K0 .. K7
	VLD1.P	64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P	64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]

	BLT	startSingles
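	// Main loop: 8 blocks (128 bytes) per iteration. Derive eight counter
	// blocks from CTR, encrypt them with the SM4 NI instructions, XOR the
	// keystream with the plaintext, then fold the eight ciphertext blocks
	// into the GHASH accumulators using the precomputed powers of H from
	// productTable, with a single reduction per iteration.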
octetsLoop:
	SUB	$128, srcPtrLen
	// Prepare 8 counters
	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VADD	B1.S4, INC.S4, B2.S4
	VADD	B2.S4, INC.S4, B3.S4
	VADD	B3.S4, INC.S4, B4.S4
	VADD	B4.S4, INC.S4, B5.S4
	VADD	B5.S4, INC.S4, B6.S4
	VADD	B6.S4, INC.S4, B7.S4
	VADD	B7.S4, INC.S4, CTR.S4

	sm4eEnc8blocks()

	// XOR plaintext and store ciphertext
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B0.B16, T1.B16, B0.B16
	VEOR	B1.B16, T2.B16, B1.B16
	VST1.P	[B0.B16, B1.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B2.B16, T1.B16, B2.B16
	VEOR	B3.B16, T2.B16, B3.B16
	VST1.P	[B2.B16, B3.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B4.B16, T1.B16, B4.B16
	VEOR	B5.B16, T2.B16, B5.B16
	VST1.P	[B4.B16, B5.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B6.B16, T1.B16, B6.B16
	VEOR	B7.B16, T2.B16, B7.B16
	VST1.P	[B6.B16, B7.B16], 32(dstPtr)

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	mulRound(B2)
	mulRound(B3)
	mulRound(B4)
	mulRound(B5)
	mulRound(B6)
	mulRound(B7)
	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	sm4eEnc1block()

singlesLast:
	VLD1.P	16(srcPtr), [T0.B16]
	VEOR	T0.B16, B0.B16, B0.B16

encReduce:
	VST1.P	[B0.B16], 16(dstPtr)

	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr

	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]

ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:
	MOVD	ZR, srcPtrLen
	VMOV	CTR.B16, B0.B16
	sm4eEnc1block()

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

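// The decryption path mirrors gcmSm4niEnc, but GHASH must be computed over
// the ciphertext (the input), so each block is hashed from the data that was
// loaded rather than from the freshly produced output.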
// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT	·gcmSm4niDec(SB),NOSPLIT,$0
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	rk_base+72(FP), rk

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	pTbl, pTblSave
	MOVD	rk, rkSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	rk, H0
	// For SM4 round keys are stored in: K0 .. K7
	VLD1.P	64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P	64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]

	BLT	startSingles
octetsLoop:
	SUB	$128, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VADD	B1.S4, INC.S4, B2.S4
	VADD	B2.S4, INC.S4, B3.S4
	VADD	B3.S4, INC.S4, B4.S4
	VADD	B4.S4, INC.S4, B5.S4
	VADD	B5.S4, INC.S4, B6.S4
	VADD	B6.S4, INC.S4, B7.S4
	VADD	B7.S4, INC.S4, CTR.S4

	sm4eEnc8blocks()

	VMOV	B0.B16, T1.B16
	VMOV	B1.B16, T2.B16
	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B0.B16, T1.B16, T1.B16
	VEOR	B1.B16, T2.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B2.B16, B0.B16, T1.B16
	VEOR	B3.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B4.B16, B0.B16, T1.B16
	VEOR	B5.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B6.B16, B0.B16, T1.B16
	VEOR	B7.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [T0.B16]
	VREV64	T0.B16, B5.B16

	VMOV	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	sm4eEnc1block()

singlesLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VST1.P	[B0.B16], 16(dstPtr)

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done
	VMOV	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	sm4eEnc1block()

tailLast:
	// Assuming it is safe to load past dstPtr due to the presence of the tag
	// B5 stored last ciphertext
	VLD1	(srcPtr), [B5.B16]

	VEOR	B5.B16, B0.B16, B0.B16

	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1

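	// Store the decrypted tail 8/4/2/1 bytes at a time according to the
	// bits of srcPtrLen, while building an all-ones mask in T3; the mask
	// is applied to B5 below so that only the valid ciphertext bytes enter
	// the final GHASH.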
	TBZ	$3, srcPtrLen, ld4 // Test if srcPtrLen < 8, if yes, goto ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ	$2, srcPtrLen, ld2 // Test if srcPtrLen < 4, if yes, goto ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1 // Test if srcPtrLen < 2, if yes, goto ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0 // Test if srcPtrLen < 1, if yes, goto ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:

	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1	[ACC0.B16], (tPtr)

	RET
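The TEXT symbols above are reachable from Go only through prototypes declared in the same package. Below is a minimal sketch of such a companion declaration file, assuming the usual //go:noescape stub pattern; the signatures are taken from the // func comments in the assembly, while the file itself is illustrative and not part of the listing.

//go:build !purego

package sm4

// Hypothetical stub declarations matching the assembly above; the real
// package provides its own equivalents in its Go sources.

//go:noescape
func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)

//go:noescape
func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)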