github.com/emmansun/gmsm@v0.29.1/sm4/ecb_arm64.s

//go:build !purego

#include "textflag.h"

#define x V0
#define y V1
#define t0 V2
#define t1 V3
#define t2 V4
#define t3 V5
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define FK_MASK V27
#define XTMP6 V6
#define XTMP7 V7
#define t4 V10
#define t5 V11
#define t6 V12
#define t7 V13

#include "aesni_macros_arm64.s"

// func encryptSm4Ecb(xk *uint32, dst, src []byte)
TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
#define dstPtr R1
#define srcPtr R2
#define rk R3
#define rkSave R4
#define srcPtrLen R5
	LOAD_SM4_AESNI_CONSTS()
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD xk+0(FP), rk
	MOVD dst+8(FP), dstPtr
	MOVD src+32(FP), srcPtr
	MOVD src_len+40(FP), srcPtrLen
	MOVD rk, rkSave // keep xk so each pass can restart from round key 0

ecbSm4Octets: // encrypt 8 blocks (128 bytes) per pass
	CMP $128, srcPtrLen
	BLT ecbSm4Nibbles
	SUB $128, srcPtrLen
	MOVD rkSave, rk

	VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
	VLD1.P 64(srcPtr), [t4.S4, t5.S4, t6.S4, t7.S4]
	VREV32 t0.B16, t0.B16 // byte-swap each 32-bit word (SM4 words are big-endian)
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	VREV32 t4.B16, t4.B16
	VREV32 t5.B16, t5.B16
	VREV32 t6.B16, t6.B16
	VREV32 t7.B16, t7.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)

	EOR R0, R0

ecb8BlocksLoop: // 8 iterations x 4 rounds = 32 SM4 rounds
	SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
	SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
	SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
	SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)

	ADD $16, R0
	CMP $128, R0
	BNE ecb8BlocksLoop

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	VREV32 t4.B16, t4.B16
	VREV32 t5.B16, t5.B16
	VREV32 t6.B16, t6.B16
	VREV32 t7.B16, t7.B16

	VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)
	VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(dstPtr)

	B ecbSm4Octets

ecbSm4Nibbles: // encrypt 4 blocks (64 bytes)
	CMP $64, srcPtrLen
	BLT ecbSm4Single
	SUB $64, srcPtrLen
	MOVD rkSave, rk

	VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

	EOR R0, R0

ecb4BlocksLoop: // 8 iterations x 4 rounds = 32 SM4 rounds
	SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
	SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
	SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
	SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

	ADD $16, R0
	CMP $128, R0
	BNE ecb4BlocksLoop

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)

ecbSm4Single: // 1 to 3 remaining blocks
	CBZ srcPtrLen, ecbSm4Done
	MOVD rkSave, rk
	EOR R0, R0

	CMP $32, srcPtrLen
	BEQ ecbSm4Single32

	CMP $48, srcPtrLen
	BEQ ecbSm4Single48

ecbSm4Single16: // one block: spread its four words across lane 0 of t0..t3
	VLD1.P 16(srcPtr), [t0.S4]
	VREV32 t0.B16, t0.B16
	VMOV t0.S[1], t1.S[0]
	VMOV t0.S[2], t2.S[0]
	VMOV t0.S[3], t3.S[0]

encryptBlocksLoop1:
	SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
	SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
	SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
	SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

	ADD $16, R0
	CMP $128, R0
	BNE encryptBlocksLoop1

	VMOV t2.S[0], t3.S[1]
	VMOV t1.S[0], t3.S[2]
	VMOV t0.S[0], t3.S[3]
	VREV32 t3.B16, t3.B16
	VST1.P [t3.S4], 16(dstPtr)

	B ecbSm4Done

ecbSm4Single32: // two blocks
	VLD1.P 32(srcPtr), [t0.S4, t1.S4]
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

encryptBlocksLoop2:
	SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
	SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
	SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
	SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

	ADD $16, R0
	CMP $128, R0
	BNE encryptBlocksLoop2

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VST1.P [t0.S4, t1.S4], 32(dstPtr)

	B ecbSm4Done

ecbSm4Single48: // three blocks
	VLD1.P 48(srcPtr), [t0.S4, t1.S4, t2.S4]
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

encryptBlocksLoop3:
	SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
	SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
	SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
	SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

	ADD $16, R0
	CMP $128, R0
	BNE encryptBlocksLoop3

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VST1.P [t0.S4, t1.S4, t2.S4], 48(dstPtr)

ecbSm4Done:
	RET
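// A minimal sketch of the Go-side wiring this routine assumes (not part of
// this file): the package declares an assembly stub with the signature from
// the comment above, and the MOVD xk+0(FP)/dst+8(FP)/src+32(FP)/src_len+40(FP)
// loads match that parameter layout. Only the encryptSm4Ecb signature is taken
// from this file; the //go:noescape directive, the ecbEncrypter type, the
// roundKeys field, and the BlockSize check are illustrative assumptions.
//
//	//go:noescape
//	func encryptSm4Ecb(xk *uint32, dst, src []byte)
//
//	// Hypothetical caller inside the package: ECB handles whole blocks only,
//	// so the wrapper validates the length before handing off to assembly.
//	func (e *ecbEncrypter) CryptBlocks(dst, src []byte) {
//		if len(src)%BlockSize != 0 {
//			panic("sm4: input not full blocks")
//		}
//		if len(src) > 0 {
//			encryptSm4Ecb(&e.roundKeys[0], dst, src)
//		}
//	}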