//go:build !purego

// SM4 in XTS mode for arm64, written in Go assembler syntax.
//
// The SM4 rounds themselves (sm4eEnc1block, sm4eEnc8blocks) and the tweak
// arithmetic (prepare8Tweaks, prepareGB8Tweaks, mul2Inline, mul2GBInline) are
// macros pulled in from the two included .s files; they are not visible here,
// so comments about what they do are marked as assumptions.

#include "textflag.h"

// Data blocks: up to eight 16-byte blocks are processed per iteration.
#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

// POLY holds the tweak-multiplication constant set up by each routine,
// ZERO is cleared to all zero bits, TW holds the running tweak.
#define POLY V8
#define ZERO V9
#define TW V10

// Per-block tweaks XORed into B0..B7 by load8blocks / store8blocks.
// NOTE(review): presumably written by prepare8Tweaks / prepareGB8Tweaks - confirm.
#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14
#define T4 V15
#define T5 V16
#define T6 V17
#define T7 V18

// Round keys: 8 vectors x 4 words, loaded from the xk argument.
#define RK0 V19
#define RK1 V20
#define RK2 V21
#define RK3 V22
#define RK4 V23
#define RK5 V24
#define RK6 V25
#define RK7 V26

// Not referenced directly in this file.
// NOTE(review): presumably scratch registers for the included macros - confirm.
#define K0 V27
#define K1 V28

#include "sm4ni_macros_arm64.s"
#include "xts_macros_arm64.s"

// load8blocks reads 128 bytes from srcPtr (post-incrementing it) into B0..B7,
// XORs block i with tweak Ti, then reverses the byte order inside every
// 32-bit word of each block.
#define load8blocks \
	VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	\
	VLD1.P 64(srcPtr), [B4.S4, B5.S4, B6.S4, B7.S4]; \
	VEOR T4.B16, B4.B16, B4.B16; \
	VEOR T5.B16, B5.B16, B5.B16; \
	VEOR T6.B16, B6.B16, B6.B16; \
	VEOR T7.B16, B7.B16, B7.B16; \
	\
	VREV32 B0.B16, B0.B16; \
	VREV32 B1.B16, B1.B16; \
	VREV32 B2.B16, B2.B16; \
	VREV32 B3.B16, B3.B16; \
	VREV32 B4.B16, B4.B16; \
	VREV32 B5.B16, B5.B16; \
	VREV32 B6.B16, B6.B16; \
	VREV32 B7.B16, B7.B16

// store8blocks XORs block i in B0..B7 with tweak Ti and writes the resulting
// 128 bytes to dstPtr (post-incrementing it).
#define store8blocks \
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	VEOR T4.B16, B4.B16, B4.B16; \
	VEOR T5.B16, B5.B16, B5.B16; \
	VEOR T6.B16, B6.B16, B6.B16; \
	VEOR T7.B16, B7.B16, B7.B16; \
	\
	VST1.P [B0.S4, B1.S4, B2.S4, B3.S4], 64(dstPtr); \
	VST1.P [B4.S4, B5.S4, B6.S4, B7.S4], 64(dstPtr)

// General-purpose register roles shared by all four routines.
#define dstPtr R2
#define srcPtr R3
#define rk R0
#define twPtr R1
#define srcPtrLen R4
#define I R5

// func encryptSm4NiXts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// Processes src into dst in three phases: 128 bytes at a time, then 16 bytes
// at a time, then a final partial block (1..15 bytes) if one remains.
// The tweak is read from *tweak on entry and TW is written back on exit.
// The first 16 bytes of the 128-byte frame are used as a scratch block.
// Additional registers used by the tail: R7, R8, R9, R11, R12.
TEXT ·encryptSm4NiXts(SB),0,$128-64
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	// POLY = 0x87 in the low 64-bit lane, zero in the high lane.
	// NOTE(review): presumably the reduction constant used by
	// prepare8Tweaks / mul2Inline - confirm in xts_macros_arm64.s.
	MOVD $0x87, I
	VMOV I, POLY.D[0]

	// Load 128 bytes of SM4 round keys into RK0 .. RK7.
	VLD1.P 64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
	VLD1.P 64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]

	VLD1 (twPtr), [TW.B16]

	// Phase 1: eight blocks per iteration while at least 128 bytes remain.
xtsSm4EncOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4EncSingles
	SUB $128, srcPtrLen
	// NOTE(review): presumably fills T0..T7 and advances TW - confirm.
	prepare8Tweaks
	load8blocks
	sm4eEnc8blocks()
	store8blocks

	B xtsSm4EncOctets

	// Phase 2: one block per iteration while at least 16 bytes remain.
xtsSm4EncSingles:
	CMP $16, srcPtrLen
	BLT xtsSm4EncTail
	SUB $16, srcPtrLen

	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)

	// NOTE(review): presumably advances TW to the next block's tweak - confirm.
	mul2Inline
	B xtsSm4EncSingles

	// Phase 3: 0..15 bytes remain. Nothing to do when the count is zero.
xtsSm4EncTail:
	CBZ srcPtrLen, xtsSm4EncDone
	// R7 walks the last full output block, R9 remembers where it starts,
	// R8 walks the 16-byte scratch block at RSP.
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8
	// Seed the scratch block with the last full output block.
	VLD1 (R7), [B0.B16]
	VST1 [B0.B16], (R8)

	// Copy the remaining bytes in chunks of 8/4/2/1 selected by the bits of
	// srcPtrLen. Each chunk moves src -> scratch and last output block -> dst.
	TBZ $3, srcPtrLen, copyTail4
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

copyTail4:
	TBZ $2, srcPtrLen, copyTail2
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

copyTail2:
	TBZ $1, srcPtrLen, copyTail1
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

copyTail1:
	TBZ $0, srcPtrLen, xtsSm4EncTailEnc
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

	// Process the merged scratch block with the current TW and store it
	// over the last full output block (address saved in R9).
xtsSm4EncTailEnc:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1 [B0.B16], (R9)

xtsSm4EncDone:
	// Hand the current tweak back to the caller.
	VST1 [TW.B16], (twPtr)
	RET

// func encryptSm4NiXtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// Same control flow as encryptSm4NiXts. The differences visible here are the
// POLY constant (0xE1 shifted into the top byte of the high 64-bit lane) and
// the tweak macros used (prepareGB8Tweaks, mul2GBInline).
// The first 16 bytes of the 128-byte frame are used as a scratch block.
// Additional registers used by the tail: R7, R8, R9, R11, R12.
TEXT ·encryptSm4NiXtsGB(SB),0,$128-64
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	// POLY = 0xE1 << 56 in the high 64-bit lane, zero in the low lane.
	// NOTE(review): presumably the reduction constant used by
	// prepareGB8Tweaks / mul2GBInline - confirm in xts_macros_arm64.s.
	MOVD $0xE1, I
	LSL $56, I
	VMOV I, POLY.D[1]

	// Load 128 bytes of SM4 round keys into RK0 .. RK7.
	VLD1.P 64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
	VLD1.P 64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]

	VLD1 (twPtr), [TW.B16]

	// Phase 1: eight blocks per iteration while at least 128 bytes remain.
xtsSm4EncOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4EncSingles
	SUB $128, srcPtrLen
	// NOTE(review): presumably fills T0..T7 and advances TW - confirm.
	prepareGB8Tweaks
	load8blocks
	sm4eEnc8blocks()
	store8blocks

	B xtsSm4EncOctets

	// Phase 2: one block per iteration while at least 16 bytes remain.
xtsSm4EncSingles:
	CMP $16, srcPtrLen
	BLT xtsSm4EncTail
	SUB $16, srcPtrLen

	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)

	// NOTE(review): presumably advances TW to the next block's tweak - confirm.
	mul2GBInline
	B xtsSm4EncSingles

	// Phase 3: 0..15 bytes remain. Nothing to do when the count is zero.
xtsSm4EncTail:
	CBZ srcPtrLen, xtsSm4EncDone
	// R7 walks the last full output block, R9 remembers where it starts,
	// R8 walks the 16-byte scratch block at RSP.
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8
	// Seed the scratch block with the last full output block.
	VLD1 (R7), [B0.B16]
	VST1 [B0.B16], (R8)

	// Copy the remaining bytes in chunks of 8/4/2/1 selected by the bits of
	// srcPtrLen. Each chunk moves src -> scratch and last output block -> dst.
	TBZ $3, srcPtrLen, copyTail4
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

copyTail4:
	TBZ $2, srcPtrLen, copyTail2
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

copyTail2:
	TBZ $1, srcPtrLen, copyTail1
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

copyTail1:
	TBZ $0, srcPtrLen, xtsSm4EncTailEnc
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

	// Process the merged scratch block with the current TW and store it
	// over the last full output block (address saved in R9).
xtsSm4EncTailEnc:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1 [B0.B16], (R9)

xtsSm4EncDone:
	// Hand the current tweak back to the caller.
	VST1 [TW.B16], (twPtr)
	RET

// func decryptSm4NiXts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// Processes src into dst 128 bytes at a time, then 16 bytes at a time while
// at least 32 bytes remain. What is left (0..31 bytes) is handled as:
//   0 bytes   - nothing;
//   16 bytes  - one ordinary block (xtsSm4DecLastBlock);
//   otherwise - one full block plus a partial block, with the two tweaks
//               applied in swapped order.
// The block transform uses the same sm4eEnc* macros as the encrypt routines.
// NOTE(review): presumably xk holds the round keys in decryption order - confirm
// against the Go caller.
// The first 16 bytes of the 128-byte frame are used as a scratch block.
// Additional registers used by the tail: B4 (saved tweak), R7, R8, R9, R11, R12.
TEXT ·decryptSm4NiXts(SB),0,$128-64
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	// POLY = 0x87 in the low 64-bit lane, zero in the high lane.
	// NOTE(review): presumably the reduction constant used by
	// prepare8Tweaks / mul2Inline - confirm in xts_macros_arm64.s.
	MOVD $0x87, I
	VMOV I, POLY.D[0]

	// Load 128 bytes of SM4 round keys into RK0 .. RK7.
	VLD1.P 64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
	VLD1.P 64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]

	VLD1 (twPtr), [TW.B16]

	// Phase 1: eight blocks per iteration while at least 128 bytes remain.
xtsSm4DecOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4DecSingles
	SUB $128, srcPtrLen

	// NOTE(review): presumably fills T0..T7 and advances TW - confirm.
	prepare8Tweaks
	load8blocks
	sm4eEnc8blocks()
	store8blocks

	B xtsSm4DecOctets

	// Phase 2: one block per iteration while at least 32 bytes remain, so
	// the last full block is always left for the tail logic below.
xtsSm4DecSingles:
	CMP $32, srcPtrLen
	BLT xtsSm4DecTail
	SUB $16, srcPtrLen

	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)

	// NOTE(review): presumably advances TW to the next block's tweak - confirm.
	mul2Inline
	B xtsSm4DecSingles

	// Phase 3: 0..31 bytes remain.
xtsSm4DecTail:
	CBZ srcPtrLen, xtsSm4DecDone

	CMP $16, srcPtrLen
	BEQ xtsSm4DecLastBlock

	// Full block followed by a partial block: keep the current tweak in B4,
	// step TW once, and process the full block with the stepped tweak.
	VMOV TW.B16, B4.B16
	mul2Inline
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	// Restore the saved tweak and keep a copy of the result in the scratch block.
	VMOV B4.B16, TW.B16
	VST1 [B0.B16], (RSP)

	// R7 walks the block just written, R9 remembers where it starts,
	// R8 walks the 16-byte scratch block at RSP.
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8

	// srcPtrLen was not reduced for the block above; only bits 3..0 are
	// tested, which select the 8/4/2/1-byte chunks of the partial block.
	// Each chunk moves src -> scratch and just-written block -> dst.
	TBZ $3, srcPtrLen, copyTail4
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

copyTail4:
	TBZ $2, srcPtrLen, copyTail2
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

copyTail2:
	TBZ $1, srcPtrLen, copyTail1
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

copyTail1:
	TBZ $0, srcPtrLen, xtsSm4DecTailDec
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

	// Process the merged scratch block with the restored TW and store it
	// over the block written above (address saved in R9).
xtsSm4DecTailDec:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1 [B0.B16], (R9)

	B xtsSm4DecDone

	// Exactly one full block remains: process it and step the tweak.
xtsSm4DecLastBlock:
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	mul2Inline

xtsSm4DecDone:
	// Hand the current tweak back to the caller.
	VST1 [TW.B16], (twPtr)
	RET

// func decryptSm4NiXtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// Same control flow as decryptSm4NiXts. The differences visible here are the
// POLY constant (0xE1 shifted into the top byte of the high 64-bit lane) and
// the tweak macros used (prepareGB8Tweaks, mul2GBInline).
// NOTE(review): presumably xk holds the round keys in decryption order - confirm
// against the Go caller.
// The first 16 bytes of the 128-byte frame are used as a scratch block.
// Additional registers used by the tail: B4 (saved tweak), R7, R8, R9, R11, R12.
TEXT ·decryptSm4NiXtsGB(SB),0,$128-64
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR POLY.B16, POLY.B16, POLY.B16
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	// POLY = 0xE1 << 56 in the high 64-bit lane, zero in the low lane.
	// NOTE(review): presumably the reduction constant used by
	// prepareGB8Tweaks / mul2GBInline - confirm in xts_macros_arm64.s.
	MOVD $0xE1, I
	LSL $56, I
	VMOV I, POLY.D[1]

	// Load 128 bytes of SM4 round keys into RK0 .. RK7.
	VLD1.P 64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
	VLD1.P 64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]

	VLD1 (twPtr), [TW.B16]

	// Phase 1: eight blocks per iteration while at least 128 bytes remain.
xtsSm4DecOctets:
	CMP $128, srcPtrLen
	BLT xtsSm4DecSingles
	SUB $128, srcPtrLen

	// NOTE(review): presumably fills T0..T7 and advances TW - confirm.
	prepareGB8Tweaks
	load8blocks
	sm4eEnc8blocks()
	store8blocks

	B xtsSm4DecOctets

	// Phase 2: one block per iteration while at least 32 bytes remain, so
	// the last full block is always left for the tail logic below.
xtsSm4DecSingles:
	CMP $32, srcPtrLen
	BLT xtsSm4DecTail
	SUB $16, srcPtrLen

	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)

	// NOTE(review): presumably advances TW to the next block's tweak - confirm.
	mul2GBInline
	B xtsSm4DecSingles

	// Phase 3: 0..31 bytes remain.
xtsSm4DecTail:
	CBZ srcPtrLen, xtsSm4DecDone

	CMP $16, srcPtrLen
	BEQ xtsSm4DecLastBlock

	// Full block followed by a partial block: keep the current tweak in B4,
	// step TW once, and process the full block with the stepped tweak.
	VMOV TW.B16, B4.B16
	mul2GBInline
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	// Restore the saved tweak and keep a copy of the result in the scratch block.
	VMOV B4.B16, TW.B16
	VST1 [B0.B16], (RSP)

	// R7 walks the block just written, R9 remembers where it starts,
	// R8 walks the 16-byte scratch block at RSP.
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8

	// srcPtrLen was not reduced for the block above; only bits 3..0 are
	// tested, which select the 8/4/2/1-byte chunks of the partial block.
	// Each chunk moves src -> scratch and just-written block -> dst.
	TBZ $3, srcPtrLen, copyTail4
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

copyTail4:
	TBZ $2, srcPtrLen, copyTail2
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

copyTail2:
	TBZ $1, srcPtrLen, copyTail1
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

copyTail1:
	TBZ $0, srcPtrLen, xtsSm4DecTailDec
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

	// Process the merged scratch block with the restored TW and store it
	// over the block written above (address saved in R9).
xtsSm4DecTailDec:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1 [B0.B16], (R9)

	B xtsSm4DecDone

	// Exactly one full block remains: process it and step the tweak.
xtsSm4DecLastBlock:
	VLD1.P 16(srcPtr), [B0.S4]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	sm4eEnc1block()
	VEOR TW.B16, B0.B16, B0.B16
	VST1.P [B0.S4], 16(dstPtr)
	mul2GBInline

xtsSm4DecDone:
	// Hand the current tweak back to the caller.
	VST1 [TW.B16], (twPtr)
	RET