github.com/emmansun/gmsm@v0.29.1/sm4/cbc_arm64.s

//go:build !purego

#include "textflag.h"

#define x V0
#define y V1
#define t0 V2
#define t1 V3
#define t2 V4
#define t3 V5
#define XTMP6 V6
#define XTMP7 V7
#define t4 V10
#define t5 V11
#define t6 V12
#define t7 V13
#define IV V18

#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define FK_MASK V27

#include "aesni_macros_arm64.s"

#define dstPtr R1
#define srcPtr R2
#define rk R3
#define rkSave R4
#define srcPtrLen R5

// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
    LOAD_SM4_AESNI_CONSTS()
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD xk+0(FP), rk
    MOVD dst+8(FP), dstPtr
    MOVD src+32(FP), srcPtr
    MOVD src_len+40(FP), srcPtrLen
    MOVD iv+56(FP), R6
    MOVD rk, rkSave
    VLD1 (R6), [IV.B16]

    // Save the last ciphertext block in V15; it is written back as the new IV at the end.
    ADD srcPtr, srcPtrLen, R10
    SUB $16, R10, R10
    VLD1 (R10), [V15.S4]

// Main loop: while more than 128 bytes remain, decrypt the trailing 8 blocks (128 bytes).
cbcSm4Octets:
    CMP $128, srcPtrLen
    BLE cbcSm4Nibbles
    SUB $128, srcPtrLen
    MOVD rkSave, rk
    ADD srcPtr, srcPtrLen, R10
    SUB $16, R10, R11
    ADD dstPtr, srcPtrLen, R12

    VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
    VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    VREV32 t4.B16, t4.B16
    VREV32 t5.B16, t5.B16
    VREV32 t6.B16, t6.B16
    VREV32 t7.B16, t7.B16

    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
    EOR R0, R0

cbc8BlocksLoop:
    SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
    SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
    SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
    SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)

    ADD $16, R0
    CMP $128, R0
    BNE cbc8BlocksLoop

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    VREV32 t4.B16, t4.B16
    VREV32 t5.B16, t5.B16
    VREV32 t6.B16, t6.B16
    VREV32 t7.B16, t7.B16

    // CBC chaining: XOR with the eight preceding ciphertext blocks.
    VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4]
    VEOR V6.B16, t0.B16, t0.B16
    VEOR V7.B16, t1.B16, t1.B16
    VEOR V8.B16, t2.B16, t2.B16
    VEOR V9.B16, t3.B16, t3.B16

    VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4]
    VEOR V6.B16, t4.B16, t4.B16
    VEOR V7.B16, t5.B16, t5.B16
    VEOR V8.B16, t6.B16, t6.B16
    VEOR V9.B16, t7.B16, t7.B16

    VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R12)
    VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R12)

    B cbcSm4Octets

// If more than 64 bytes remain, decrypt the trailing 4 blocks (64 bytes).
cbcSm4Nibbles:
    CMP $64, srcPtrLen
    BLE cbcSm4Single
    SUB $64, srcPtrLen
    MOVD rkSave, rk
    ADD srcPtr, srcPtrLen, R10
    SUB $16, R10, R11
    ADD dstPtr, srcPtrLen, R12

    VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
    VMOV t0.B16, t5.B16
    VMOV t1.B16, t6.B16
    VMOV t2.B16, t7.B16
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

    EOR R0, R0

cbc4BlocksLoop:
    SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE cbc4BlocksLoop

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16

    // CBC chaining: t4 = preceding ciphertext block; t5-t7 hold this chunk's first three ciphertext blocks.
    VLD1 (R11), [t4.S4]
    VEOR t4.B16, t0.B16, t0.B16
    VEOR t5.B16, t1.B16, t1.B16
    VEOR t6.B16, t2.B16, t2.B16
    VEOR t7.B16, t3.B16, t3.B16

    VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R12)

// Decrypt the final 1-4 blocks; the first output block is chained with the caller's IV.
cbcSm4Single:
    MOVD rkSave, rk
    EOR R0, R0

    CMP $16, srcPtrLen
    BEQ cbcSm4Single16

    CMP $32, srcPtrLen
    BEQ cbcSm4Single32

    CMP $48, srcPtrLen
    BEQ cbcSm4Single48

    // 4 blocks
    VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
    VMOV t0.B16, t4.B16
    VMOV t1.B16, t5.B16
    VMOV t2.B16, t6.B16
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

cbc4BlocksLoop64:
    SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE cbc4BlocksLoop64

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16

    VEOR IV.B16, t0.B16, t0.B16
    VEOR t4.B16, t1.B16, t1.B16
    VEOR t5.B16, t2.B16, t2.B16
    VEOR t6.B16, t3.B16, t3.B16

    VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr)

    B cbcSm4Done

cbcSm4Single16:
    VLD1 (srcPtr), [t0.S4]
    VREV32 t0.B16, t0.B16
    VMOV t0.S[1], t1.S[0]
    VMOV t0.S[2], t2.S[0]
    VMOV t0.S[3], t3.S[0]

cbc4BlocksLoop16:
    SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE cbc4BlocksLoop16

    VMOV t2.S[0], t3.S[1]
    VMOV t1.S[0], t3.S[2]
    VMOV t0.S[0], t3.S[3]
    VREV32 t3.B16, t3.B16

    VEOR IV.B16, t3.B16, t3.B16

    VST1 [t3.S4], (dstPtr)

    B cbcSm4Done

cbcSm4Single32:
    VLD1 (srcPtr), [t0.S4, t1.S4]
    VMOV t0.B16, t4.B16
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

cbc4BlocksLoop32:
    SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE cbc4BlocksLoop32

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16

    VEOR IV.B16, t0.B16, t0.B16
    VEOR t4.B16, t1.B16, t1.B16

    VST1 [t0.S4, t1.S4], (dstPtr)
    B cbcSm4Done

cbcSm4Single48:
    VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4]
    VMOV t0.B16, t4.B16
    VMOV t1.B16, t5.B16
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

cbc4BlocksLoop48:
    SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE cbc4BlocksLoop48

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16

    VEOR IV.B16, t0.B16, t0.B16
    VEOR t4.B16, t1.B16, t1.B16
    VEOR t5.B16, t2.B16, t2.B16

    VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)

// Write the saved last ciphertext block back to *iv as the next chaining value.
cbcSm4Done:
    VST1 [V15.S4], (R6)
    RET
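
For readers cross-checking the assembly, below is a minimal Go sketch of the semantics decryptBlocksChain is expected to implement: standard CBC decryption of whole blocks, where each plaintext block is the single-block decryption of the ciphertext XORed with the previous ciphertext block (or the IV for the first block), and iv is overwritten with the last ciphertext block on return. The names decryptBlocksChainRef, decryptBlock, and blockSize are illustrative stand-ins, not the module's real API; the assembly reaches the same result by walking src tail-first in 8- and 4-block chunks so it can operate in place.

// Package sm4ref is an illustrative sketch, not part of the gmsm module.
package sm4ref

// blockSize matches SM4's 16-byte block size.
const blockSize = 16

// decryptBlock is a hypothetical single-block SM4 decryption helper taking the
// expanded key xk; the real cipher would replace this placeholder body.
func decryptBlock(xk []uint32, dst, src []byte) {
	copy(dst, src) // placeholder only
}

// decryptBlocksChainRef mirrors the observable behaviour of decryptBlocksChain:
// CBC-decrypt whole blocks and leave the last ciphertext block in iv so the
// caller can keep chaining.
func decryptBlocksChainRef(xk []uint32, dst, src, iv []byte) {
	prev := make([]byte, blockSize)
	copy(prev, iv)

	// Remember the last ciphertext block before dst (which may alias src) is
	// written; the assembly does the same via V15.
	last := make([]byte, blockSize)
	copy(last, src[len(src)-blockSize:])

	tmp := make([]byte, blockSize)
	cur := make([]byte, blockSize)
	for i := 0; i+blockSize <= len(src); i += blockSize {
		copy(cur, src[i:i+blockSize]) // keep the ciphertext block for the next iteration
		decryptBlock(xk, tmp, cur)
		for j := 0; j < blockSize; j++ {
			dst[i+j] = tmp[j] ^ prev[j] // XOR with previous ciphertext block or IV
		}
		prev, cur = cur, prev
	}
	copy(iv, last) // iv now holds the last ciphertext block
}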