github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/sm4/asm_arm64.s

#include "textflag.h"

#define x V0
#define y V1
#define t0 V2
#define t1 V3
#define t2 V4
#define t3 V5
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define R16_MASK V27
#define R24_MASK V28
#define FK_MASK V29
#define XTMP6 V6
#define XTMP7 V7

// nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16

// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), (NOPTR+RODATA), $16

DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), (NOPTR+RODATA), $16

// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), (NOPTR+RODATA), $16

DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), (NOPTR+RODATA), $16

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (NOPTR+RODATA), $16

DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), (NOPTR+RODATA), $16

DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), (NOPTR+RODATA), $16

DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16

#define SM4_SBOX(x, y) \
	; \ //############################# inner affine ############################//
	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
	VTBL XTMP7.B16, [M1L.B16], y.B16; \
	VUSHR $4, x.D2, x.D2; \
	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
	VTBL XTMP7.B16, [M1H.B16], XTMP7.B16; \
	VEOR y.B16, XTMP7.B16, x.B16; \
	VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
	AESE ZERO.B16, x.B16; \
	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
	VTBL XTMP7.B16, [M2L.B16], y.B16; \
	VUSHR $4, x.D2, x.D2; \
	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
	VTBL XTMP7.B16, [M2H.B16], XTMP7.B16; \
	VEOR y.B16, XTMP7.B16, x.B16

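// The two macros below combine SM4_SBOX with SM4's linear layers. Per the SM4
// specification (GB/T 32907-2016), encryption rounds use
//   L(B)  = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)
// while the key schedule uses
//   L'(B) = B ^ (B <<< 13) ^ (B <<< 23)
// Rotations by whole bytes (<<< 8, 16, 24) are done with VTBL shuffles through
// the r08/r16/r24 masks above; the remaining rotations use VSHL/VUSHR/VORR.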
#define SM4_TAO_L1(x, y) \
	SM4_SBOX(x, y); \
	; \ //#################### 4 parallel L1 linear transforms ##################//
	VTBL R08_MASK.B16, [x.B16], y.B16; \
	VEOR y.B16, x.B16, y.B16; \
	VTBL R16_MASK.B16, [x.B16], XTMP7.B16; \
	VEOR XTMP7.B16, y.B16, y.B16; \
	VSHL $2, y.S4, XTMP7.S4; \
	VUSHR $30, y.S4, y.S4; \
	VORR y.B16, XTMP7.B16, y.B16; \
	VTBL R24_MASK.B16, [x.B16], XTMP7.B16; \
	VEOR XTMP7.B16, x.B16, x.B16; \
	VEOR y.B16, x.B16, x.B16

#define SM4_TAO_L2(x, y) \
	SM4_SBOX(x, y); \
	; \ //#################### 4 parallel L2 linear transforms ##################//
	VSHL $13, x.S4, XTMP6.S4; \
	VUSHR $19, x.S4, y.S4; \
	VORR XTMP6.B16, y.B16, y.B16; \
	VSHL $23, x.S4, XTMP6.S4; \
	VUSHR $9, x.S4, XTMP7.S4; \
	VORR XTMP6.B16, XTMP7.B16, XTMP7.B16; \
	VEOR XTMP7.B16, y.B16, y.B16; \
	VEOR x.B16, y.B16, x.B16

#define SM4_ROUND(RK, x, y, t0, t1, t2, t3) \
	MOVW.P 4(RK), R19; \
	VMOV R19, x.S4; \
	VEOR t1.B16, x.B16, x.B16; \
	VEOR t2.B16, x.B16, x.B16; \
	VEOR t3.B16, x.B16, x.B16; \
	SM4_TAO_L1(x, y); \
	VEOR x.B16, t0.B16, t0.B16

#define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \
	MOVW.P 4(R9), R19; \
	VMOV R19, x.S[0]; \
	VEOR t1.B16, x.B16, x.B16; \
	VEOR t2.B16, x.B16, x.B16; \
	VEOR t3.B16, x.B16, x.B16; \
	SM4_TAO_L2(x, y); \
	VEOR x.B16, t0.B16, t0.B16; \
	VMOV t0.S[0], R2; \
	MOVW.P R2, 4(R10); \
	MOVW.P R2, -4(R11)

#define load_global_data_1() \
	LDP nibble_mask<>(SB), (R0, R1) \
	VMOV R0, NIBBLE_MASK.D[0] \
	VMOV R1, NIBBLE_MASK.D[1] \
	LDP m1_low<>(SB), (R0, R1) \
	VMOV R0, M1L.D[0] \
	VMOV R1, M1L.D[1] \
	LDP m1_high<>(SB), (R0, R1) \
	VMOV R0, M1H.D[0] \
	VMOV R1, M1H.D[1] \
	LDP m2_low<>(SB), (R0, R1) \
	VMOV R0, M2L.D[0] \
	VMOV R1, M2L.D[1] \
	LDP m2_high<>(SB), (R0, R1) \
	VMOV R0, M2H.D[0] \
	VMOV R1, M2H.D[1] \
	LDP fk_mask<>(SB), (R0, R1) \
	VMOV R0, FK_MASK.D[0] \
	VMOV R1, FK_MASK.D[1] \
	LDP inverse_shift_rows<>(SB), (R0, R1) \
	VMOV R0, INVERSE_SHIFT_ROWS.D[0] \
	VMOV R1, INVERSE_SHIFT_ROWS.D[1]

#define load_global_data_2() \
	load_global_data_1() \
	LDP r08_mask<>(SB), (R0, R1) \
	VMOV R0, R08_MASK.D[0] \
	VMOV R1, R08_MASK.D[1] \
	LDP r16_mask<>(SB), (R0, R1) \
	VMOV R0, R16_MASK.D[0] \
	VMOV R1, R16_MASK.D[1] \
	LDP r24_mask<>(SB), (R0, R1) \
	VMOV R0, R24_MASK.D[0] \
	VMOV R1, R24_MASK.D[1]

#define SM4EKEY_EXPORT_KEYS() \
	VMOV V9.S[3], V10.S[0] \
	VMOV V9.S[2], V10.S[1] \
	VMOV V9.S[1], V10.S[2] \
	VMOV V9.S[0], V10.S[3] \
	VMOV V8.S[3], V11.S[0] \
	VMOV V8.S[2], V11.S[1] \
	VMOV V8.S[1], V11.S[2] \
	VMOV V8.S[0], V11.S[3] \
	VST1.P [V8.S4, V9.S4], 32(R10) \
	VST1 [V10.S4, V11.S4], (R11) \
	SUB $32, R11, R11

// SM4E_ROUND encrypts one 16-byte block with the eight hand-encoded SM4E
// instructions below (SM4E V8.4S, V0.4S through SM4E V8.4S, V7.4S); each SM4E
// applies four SM4 rounds using the four round keys held in its source register.
#define SM4E_ROUND() \
	VLD1.P 16(R10), [V8.B16] \
	VREV32 V8.B16, V8.B16 \
	WORD $0x0884c0ce \
	WORD $0x2884c0ce \
	WORD $0x4884c0ce \
	WORD $0x6884c0ce \
	WORD $0x8884c0ce \
	WORD $0xa884c0ce \
	WORD $0xc884c0ce \
	WORD $0xe884c0ce \
	VREV32 V8.B16, V8.B16 \
	VST1.P [V8.B16], 16(R9)

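// The software key schedule below follows the SM4 specification: the 128-bit
// key is XORed with the FK constants, then each of the 32 steps computes
//   rk[i] = K[i+4] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i])
// where T' is the S-box followed by L'. Round keys are stored forward into enc
// (via R10) and backward into dec (via R11), since decryption uses the same
// keys in reverse order. A scalar Go sketch of one step, for reference only
// (tKey is a hypothetical helper naming the S-box plus L' transform; it is not
// part of this package):
//
//	func expandStep(k *[4]uint32, ck uint32) (rk uint32) {
//		rk = k[0] ^ tKey(k[1]^k[2]^k[3]^ck)           // K[i+4]
//		k[0], k[1], k[2], k[3] = k[1], k[2], k[3], rk // slide the window
//		return rk
//	}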
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	MOVD key+0(FP), R8
	MOVD ck+8(FP), R9
	MOVD enc+16(FP), R10
	MOVD dec+24(FP), R11
	MOVD inst+32(FP), R12

	CMP $1, R12
	BEQ sm4ekey

	load_global_data_1()

	VLD1 (R8), [t0.B16]
	VREV32 t0.B16, t0.B16
	VEOR t0.B16, FK_MASK.B16, t0.B16
	VMOV t0.S[1], t1.S[0]
	VMOV t0.S[2], t2.S[0]
	VMOV t0.S[3], t3.S[0]

	EOR R0, R0
	ADD $124, R11
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

ksLoop:
	SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3)
	SM4_EXPANDKEY_ROUND(x, y, t1, t2, t3, t0)
	SM4_EXPANDKEY_ROUND(x, y, t2, t3, t0, t1)
	SM4_EXPANDKEY_ROUND(x, y, t3, t0, t1, t2)

	ADD $16, R0
	CMP $128, R0
	BNE ksLoop
	RET

sm4ekey:
	LDP fk_mask<>(SB), (R0, R1)
	VMOV R0, FK_MASK.D[0]
	VMOV R1, FK_MASK.D[1]
	VLD1 (R8), [V9.B16]
	VREV32 V9.B16, V9.B16
	VEOR FK_MASK.B16, V9.B16, V9.B16
	ADD $96, R11

	VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S
	WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S
	SM4EKEY_EXPORT_KEYS()

	WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
	WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
	SM4EKEY_EXPORT_KEYS()

	VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S
	WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S
	SM4EKEY_EXPORT_KEYS()

	WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
	WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
	SM4EKEY_EXPORT_KEYS()
	RET

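// The software path of encryptBlocksAsm below processes four blocks at once:
// the loaded blocks are transposed so that t0..t3 each hold word j of all four
// blocks, and one SM4_ROUND therefore advances every block by one round. A
// scalar Go sketch of a single round, for reference only (tEnc is a
// hypothetical helper naming the S-box plus L transform; it is not part of
// this package):
//
//	func encRound(x *[4]uint32, rk uint32) {
//		t := x[0] ^ tEnc(x[1]^x[2]^x[3]^rk)          // X[i+4]
//		x[0], x[1], x[2], x[3] = x[1], x[2], x[3], t // slide the window
//	}
//
// After 32 rounds the ciphertext is the state in reverse word order
// (X[35], X[34], X[33], X[32]), which is why the stores below write t3..t0.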
// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
	MOVD xk+0(FP), R8
	MOVD dst+8(FP), R9
	MOVD src+32(FP), R10
	MOVD src_len+40(FP), R12
	MOVD inst+56(FP), R11

	CMP $1, R11
	BEQ sm4niblocks

	VLD1 (R10), [V5.S4, V6.S4, V7.S4, V8.S4]
	VMOV V5.S[0], t0.S[0]
	VMOV V5.S[1], t1.S[0]
	VMOV V5.S[2], t2.S[0]
	VMOV V5.S[3], t3.S[0]

	VMOV V6.S[0], t0.S[1]
	VMOV V6.S[1], t1.S[1]
	VMOV V6.S[2], t2.S[1]
	VMOV V6.S[3], t3.S[1]

	VMOV V7.S[0], t0.S[2]
	VMOV V7.S[1], t1.S[2]
	VMOV V7.S[2], t2.S[2]
	VMOV V7.S[3], t3.S[2]

	VMOV V8.S[0], t0.S[3]
	VMOV V8.S[1], t1.S[3]
	VMOV V8.S[2], t2.S[3]
	VMOV V8.S[3], t3.S[3]

	load_global_data_2()

	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16

	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	EOR R0, R0

encryptBlocksLoop:
	SM4_ROUND(R8, x, y, t0, t1, t2, t3)
	SM4_ROUND(R8, x, y, t1, t2, t3, t0)
	SM4_ROUND(R8, x, y, t2, t3, t0, t1)
	SM4_ROUND(R8, x, y, t3, t0, t1, t2)

	ADD $16, R0
	CMP $128, R0
	BNE encryptBlocksLoop

	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16

	VMOV t3.S[0], V8.S[0]
	VMOV t2.S[0], V8.S[1]
	VMOV t1.S[0], V8.S[2]
	VMOV t0.S[0], V8.S[3]
	VST1.P [V8.B16], 16(R9)

	VMOV t3.S[1], V8.S[0]
	VMOV t2.S[1], V8.S[1]
	VMOV t1.S[1], V8.S[2]
	VMOV t0.S[1], V8.S[3]
	VST1.P [V8.B16], 16(R9)

	VMOV t3.S[2], V8.S[0]
	VMOV t2.S[2], V8.S[1]
	VMOV t1.S[2], V8.S[2]
	VMOV t0.S[2], V8.S[3]
	VST1.P [V8.B16], 16(R9)

	VMOV t3.S[3], V8.S[0]
	VMOV t2.S[3], V8.S[1]
	VMOV t1.S[3], V8.S[2]
	VMOV t0.S[3], V8.S[3]
	VST1 [V8.B16], (R9)
	RET

sm4niblocks:
	VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
	VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
sm4niblockloop:
	SM4E_ROUND()
	SUB $16, R12, R12 // consumed 16 bytes of the message; loop while bytes remain
	CBNZ R12, sm4niblockloop
	RET

// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	MOVD xk+0(FP), R8
	MOVD dst+8(FP), R9
	MOVD src+16(FP), R10
	MOVD inst+24(FP), R11

	CMP $1, R11
	BEQ sm4niblock

	VLD1 (R10), [t0.S4]
	VREV32 t0.B16, t0.B16
	VMOV t0.S[1], t1.S[0]
	VMOV t0.S[2], t2.S[0]
	VMOV t0.S[3], t3.S[0]

	load_global_data_2()

	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	EOR R0, R0

encryptBlockLoop:
	SM4_ROUND(R8, x, y, t0, t1, t2, t3)
	SM4_ROUND(R8, x, y, t1, t2, t3, t0)
	SM4_ROUND(R8, x, y, t2, t3, t0, t1)
	SM4_ROUND(R8, x, y, t3, t0, t1, t2)

	ADD $16, R0
	CMP $128, R0
	BNE encryptBlockLoop

	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16

	VMOV t3.S[0], V8.S[0]
	VMOV t2.S[0], V8.S[1]
	VMOV t1.S[0], V8.S[2]
	VMOV t0.S[0], V8.S[3]
	VST1 [V8.B16], (R9)
	RET

sm4niblock:
	VLD1 (R10), [V8.B16]
	VREV32 V8.B16, V8.B16
	VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD $0x0884c0ce //SM4E V8.4S, V0.4S
	WORD $0x2884c0ce //SM4E V8.4S, V1.4S
	WORD $0x4884c0ce //SM4E V8.4S, V2.4S
	WORD $0x6884c0ce //SM4E V8.4S, V3.4S
	VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD $0x0884c0ce //SM4E V8.4S, V0.4S
	WORD $0x2884c0ce //SM4E V8.4S, V1.4S
	WORD $0x4884c0ce //SM4E V8.4S, V2.4S
	WORD $0x6884c0ce //SM4E V8.4S, V3.4S
	VREV32 V8.B16, V8.B16
	VST1 [V8.B16], (R9)
	RET
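// Note on the hardware paths: the WORD directives above hand-encode the
// ARMv8.2-A SM4 instructions (FEAT_SM4), presumably because this assembler has
// no mnemonics for them. Each SM4E applies four SM4 rounds using the four
// round keys in its source register, so eight SM4E instructions per block
// cover all 32 rounds; each SM4EKEY derives the next four round keys from the
// previous four keys and four CK constants.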