github.com/emmansun/gmsm@v0.29.1/sm4/asm_arm64.s

//go:build !purego

#include "textflag.h"

#define t0 V0
#define t1 V1
#define t2 V2
#define t3 V3
#define t4 V4
#define t5 V5
#define t6 V6
#define t7 V7
#define x V8
#define y V9
#define XTMP6 V10
#define XTMP7 V11
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define R08_MASK V24
#define INVERSE_SHIFT_ROWS V25
#define NIBBLE_MASK V26
#define FK_MASK V27
#define ZERO V28

#include "aesni_macros_arm64.s"

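// Composite T' transform used by the SM4 key schedule: the S-box layer
// (SM4_SBOX) followed by the linear transform L'(B) = B ^ (B <<< 13) ^ (B <<< 23),
// applied to four 32-bit lanes in parallel. Each rotate-left is built from a
// VSHL/VSRI pair (e.g. <<< 13 is SHL $13 followed by SRI $19 into the same register).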
#define SM4_TAO_L2(x, y) \
    SM4_SBOX(x, y, XTMP6);                       \
    ;                                            \ //#################### 4 parallel L2 linear transforms ##################//
    VSHL $13, x.S4, y.S4;                        \
    VSRI $19, x.S4, y.S4;                        \
    VSHL $23, x.S4, XTMP6.S4;                    \
    VSRI $9, x.S4, XTMP6.S4;                     \
    VEOR XTMP6.B16, y.B16, y.B16;                \
    VEOR x.B16, y.B16, x.B16

#define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \
    MOVW.P 4(R9), R19;                           \
    VMOV R19, x.S[0];                            \
    VEOR t1.B16, x.B16, x.B16;                   \
    VEOR t2.B16, x.B16, x.B16;                   \
    VEOR t3.B16, x.B16, x.B16;                   \
    SM4_TAO_L2(x, y);                            \
    VEOR x.B16, t0.B16, t0.B16;                  \
    VMOV t0.S[0], R2;                            \
    MOVW.P R2, 4(R10);                           \
    MOVW.P R2, -4(R11)

#define LOAD_SM4KEY_AESNI_CONSTS() \
    MOVW $0x0F0F0F0F, R0                              \
    VDUP R0, NIBBLE_MASK.S4                           \
    MOVD $m1_2<>(SB), R0                              \
    VLD1 (R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16]   \
    MOVD $fk_mask<>(SB), R0                           \
    VLD1 (R0), [FK_MASK.B16]                          \
    MOVD $inverse_shift_rows<>(SB), R0                \
    VLD1 (R0), [INVERSE_SHIFT_ROWS.B16]

#define SM4EKEY_EXPORT_KEYS() \
    VMOV V9.S[3], V10.S[0]             \
    VMOV V9.S[2], V10.S[1]             \
    VMOV V9.S[1], V10.S[2]             \
    VMOV V9.S[0], V10.S[3]             \
    VMOV V8.S[3], V11.S[0]             \
    VMOV V8.S[2], V11.S[1]             \
    VMOV V8.S[1], V11.S[2]             \
    VMOV V8.S[0], V11.S[3]             \
    VST1.P [V8.S4, V9.S4], 32(R10)     \
    VST1 [V10.S4, V11.S4], (R11)       \
    SUB $32, R11, R11

#define SM4E_ROUND() \
    VLD1.P 16(R10), [V8.B16]           \
    VREV32 V8.B16, V8.B16              \
    WORD $0xcec08408                   \ //SM4E V8.4S, V0.4S
    WORD $0xcec08428                   \ //SM4E V8.4S, V1.4S
    WORD $0xcec08448                   \ //SM4E V8.4S, V2.4S
    WORD $0xcec08468                   \ //SM4E V8.4S, V3.4S
    WORD $0xcec08488                   \ //SM4E V8.4S, V4.4S
    WORD $0xcec084a8                   \ //SM4E V8.4S, V5.4S
    WORD $0xcec084c8                   \ //SM4E V8.4S, V6.4S
    WORD $0xcec084e8                   \ //SM4E V8.4S, V7.4S
    VREV64 V8.B16, V8.B16              \
    VEXT $8, V8.B16, V8.B16, V8.B16    \
    VST1.P [V8.B16], 16(R9)

// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
    MOVD key+0(FP), R8
    MOVD ck+8(FP), R9
    MOVD enc+16(FP), R10
    MOVD dec+24(FP), R11
    MOVD inst+32(FP), R12

    CMP $1, R12
    BEQ sm4ekey

    LOAD_SM4KEY_AESNI_CONSTS()

    VLD1 (R8), [t0.B16]
    VREV32 t0.B16, t0.B16
    VEOR t0.B16, FK_MASK.B16, t0.B16
    VMOV t0.S[1], t1.S[0]
    VMOV t0.S[2], t2.S[0]
    VMOV t0.S[3], t3.S[0]

    EOR R0, R0
    ADD $124, R11
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

ksLoop:
    SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3)
    SM4_EXPANDKEY_ROUND(x, y, t1, t2, t3, t0)
    SM4_EXPANDKEY_ROUND(x, y, t2, t3, t0, t1)
    SM4_EXPANDKEY_ROUND(x, y, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE ksLoop
    RET

sm4ekey:
    MOVD $fk_mask<>(SB), R0
    VLD1 (R0), [FK_MASK.B16]
    VLD1 (R8), [V9.B16]
    VREV32 V9.B16, V9.B16
    VEOR FK_MASK.B16, V9.B16, V9.B16
    ADD $96, R11

    VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
    WORD $0xce60c928 //SM4EKEY V8.4S, V9.4S, V0.4S
    WORD $0xce61c909 //SM4EKEY V9.4S, V8.4S, V1.4S
    SM4EKEY_EXPORT_KEYS()

    WORD $0xce62c928 //SM4EKEY V8.4S, V9.4S, V2.4S
    WORD $0xce63c909 //SM4EKEY V9.4S, V8.4S, V3.4S
    SM4EKEY_EXPORT_KEYS()

    VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
    WORD $0xce60c928 //SM4EKEY V8.4S, V9.4S, V0.4S
    WORD $0xce61c909 //SM4EKEY V9.4S, V8.4S, V1.4S
    SM4EKEY_EXPORT_KEYS()

    WORD $0xce62c928 //SM4EKEY V8.4S, V9.4S, V2.4S
    WORD $0xce63c909 //SM4EKEY V9.4S, V8.4S, V3.4S
    SM4EKEY_EXPORT_KEYS()
    RET

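// When inst == 1 the caller has selected the SM4 hardware-instruction path
// (sm4niblocks, using SM4E emitted as raw instruction words); otherwise the
// blocks are encrypted with the vectorized S-box code from aesni_macros_arm64.s,
// processing four blocks in parallel, or eight when src_len is exactly 128 bytes.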
// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
    MOVD xk+0(FP), R8
    MOVD dst+8(FP), R9
    MOVD src+32(FP), R10
    MOVD src_len+40(FP), R12
    MOVD inst+56(FP), R11

    CMP $1, R11
    BEQ sm4niblocks

    LOAD_SM4_AESNI_CONSTS()

    CMP $128, R12
    BEQ double_enc

    VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

    VEOR ZERO.B16, ZERO.B16, ZERO.B16
    EOR R0, R0

encryptBlocksLoop:
    SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE encryptBlocksLoop

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16

    VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
    RET

double_enc:
    VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
    VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    VREV32 t4.B16, t4.B16
    VREV32 t5.B16, t5.B16
    VREV32 t6.B16, t6.B16
    VREV32 t7.B16, t7.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)

    VEOR ZERO.B16, ZERO.B16, ZERO.B16
    EOR R0, R0

encrypt8BlocksLoop:
    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)

    ADD $16, R0
    CMP $128, R0
    BNE encrypt8BlocksLoop

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    VREV32 t4.B16, t4.B16
    VREV32 t5.B16, t5.B16
    VREV32 t6.B16, t6.B16
    VREV32 t7.B16, t7.B16

    VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9)
    VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9)

    RET

sm4niblocks:
    VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
    VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]

sm4niblockloop:
    SM4E_ROUND()
    SUB $16, R12, R12 // 16 bytes of input consumed; loop until the remaining length is zero
    CBNZ R12, sm4niblockloop
    RET

// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
    MOVD xk+0(FP), R8
    MOVD dst+8(FP), R9
    MOVD src+16(FP), R10
    MOVD inst+24(FP), R11

    CMP $1, R11
    BEQ sm4niblock

    VLD1 (R10), [t0.S4]
    VREV32 t0.B16, t0.B16
    VMOV t0.S[1], t1.S[0]
    VMOV t0.S[2], t2.S[0]
    VMOV t0.S[3], t3.S[0]

    LOAD_SM4_AESNI_CONSTS()

    VEOR ZERO.B16, ZERO.B16, ZERO.B16
    EOR R0, R0

encryptBlockLoop:
    SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE encryptBlockLoop

    VMOV t2.S[0], t3.S[1]
    VMOV t1.S[0], t3.S[2]
    VMOV t0.S[0], t3.S[3]
    VREV32 t3.B16, t3.B16
    VST1 [t3.B16], (R9)
    RET

sm4niblock:
    VLD1 (R10), [V8.B16]
    VREV32 V8.B16, V8.B16
    VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
    WORD $0xcec08408 //SM4E V8.4S, V0.4S
    WORD $0xcec08428 //SM4E V8.4S, V1.4S
    WORD $0xcec08448 //SM4E V8.4S, V2.4S
    WORD $0xcec08468 //SM4E V8.4S, V3.4S
    VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
    WORD $0xcec08408 //SM4E V8.4S, V0.4S
    WORD $0xcec08428 //SM4E V8.4S, V1.4S
    WORD $0xcec08448 //SM4E V8.4S, V2.4S
    WORD $0xcec08468 //SM4E V8.4S, V3.4S
    VREV64 V8.B16, V8.B16
    VEXT $8, V8.B16, V8.B16, V8.B16
    VST1 [V8.B16], (R9)
    RET