github.com/emmansun/gmsm@v0.29.1/sm4/aesni_macros_arm64.s

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), (16+8), $16

// Affine transform 1 & 2 (low and high nibbles)
DATA m1_2<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_2<>+0x08(SB)/8, $0x3045F98CEF9A2653
DATA m1_2<>+0x10(SB)/8, $0xC35BF46CAF379800
DATA m1_2<>+0x18(SB)/8, $0x68F05FC7049C33AB
DATA m1_2<>+0x20(SB)/8, $0x9A950A05FEF16E61
DATA m1_2<>+0x28(SB)/8, $0x0E019E916A65FAF5
DATA m1_2<>+0x30(SB)/8, $0x892D69CD44E0A400
DATA m1_2<>+0x38(SB)/8, $0x2C88CC68E14501A5
GLOBL m1_2<>(SB), (16+8), $64

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (16+8), $16

DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (16+8), $16

#define LOAD_SM4_AESNI_CONSTS() \
    MOVW $0x0F0F0F0F, R20                                \
    VDUP R20, NIBBLE_MASK.S4                             \
    MOVD $m1_2<>(SB), R20                                \
    VLD1 (R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16]     \
    MOVD $inverse_shift_rows<>(SB), R20                  \
    VLD1 (R20), [INVERSE_SHIFT_ROWS.B16]                 \
    MOVD $r08_mask<>(SB), R20                            \
    VLD1 (R20), [R08_MASK.B16]

// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t3.S0, t2.S0, t1.S0, t0.S0
// t1 = t3.S1, t2.S1, t1.S1, t0.S1
// t2 = t3.S2, t2.S2, t1.S2, t0.S2
// t3 = t3.S3, t2.S3, t1.S3, t0.S3
#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
    VZIP1 t1.S4, t0.S4, RTMP0.S4               \
    VZIP1 t3.S4, t2.S4, RTMP1.S4               \
    VZIP2 t1.S4, t0.S4, RTMP2.S4               \
    VZIP2 t3.S4, t2.S4, RTMP3.S4               \
    VZIP1 RTMP1.D2, RTMP0.D2, t0.D2            \
    VZIP2 RTMP1.D2, RTMP0.D2, t1.D2            \
    VZIP1 RTMP3.D2, RTMP2.D2, t2.D2            \
    VZIP2 RTMP3.D2, RTMP2.D2, t3.D2

// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t0.S0, t1.S0, t2.S0, t3.S0
// t1 = t0.S1, t1.S1, t2.S1, t3.S1
// t2 = t0.S2, t1.S2, t2.S2, t3.S2
// t3 = t0.S3, t1.S3, t2.S3, t3.S3
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
    VZIP1 t0.S4, t1.S4, RTMP0.S4               \
    VZIP2 t0.S4, t1.S4, RTMP1.S4               \
    VZIP1 t2.S4, t3.S4, RTMP2.S4               \
    VZIP2 t2.S4, t3.S4, RTMP3.S4               \
    VZIP1 RTMP0.D2, RTMP2.D2, t0.D2            \
    VZIP2 RTMP0.D2, RTMP2.D2, t1.D2            \
    VZIP1 RTMP1.D2, RTMP3.D2, t2.D2            \
    VZIP2 RTMP1.D2, RTMP3.D2, t3.D2

// Affine Transform
// parameters:
// - L: table low nibbles
// - H: table high nibbles
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define AFFINE_TRANSFORM(L, H, x, y, z)          \
    VAND x.B16, NIBBLE_MASK.B16, z.B16;          \
    VTBL z.B16, [L.B16], y.B16;                  \
    VUSHR $4, x.D2, x.D2;                        \
    VAND x.B16, NIBBLE_MASK.B16, z.B16;          \
    VTBL z.B16, [H.B16], z.B16;                  \
    VEOR y.B16, z.B16, x.B16

// SM4 sbox function
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define SM4_SBOX(x, y, z)                            \
    ;                                                \
    AFFINE_TRANSFORM(M1L, M1H, x, y, z);             \
    VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16;     \
    AESE ZERO.B16, x.B16;                            \
    AFFINE_TRANSFORM(M2L, M2H, x, y, z)
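
// Note on SM4_SBOX: like the AES S-box, the SM4 S-box is built around
// inversion in GF(2^8), which is what lets AESE evaluate it. AESE with an
// all-zero round key applies ShiftRows and then SubBytes; the preceding VTBL
// with INVERSE_SHIFT_ROWS permutes the bytes so that ShiftRows cancels out
// and only SubBytes takes effect. The surrounding AFFINE_TRANSFORM calls
// (nibble tables M1L/M1H before, M2L/M2H after) translate between the SM4
// and AES field representations. A rough per-byte sketch, where affine1,
// affine2 and aesSubByte are illustrative names rather than symbols in this
// file:
//
//    sm4SBox(b) = affine2(aesSubByte(affine1(b)))
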
// SM4 TAO L1 function
// parameters:
// - x: 128 bits register as TAO_L1 input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define SM4_TAO_L1(x, y, z)                           \
    SM4_SBOX(x, y, z);                                \
    VTBL R08_MASK.B16, [x.B16], y.B16;                \ // y = x <<< 8
    VTBL R08_MASK.B16, [y.B16], z.B16;                \ // z = x <<< 16
    VEOR x.B16, y.B16, y.B16;                         \ // y = x ^ (x <<< 8)
    VEOR z.B16, y.B16, y.B16;                         \ // y = x ^ (x <<< 8) ^ (x <<< 16)
    VTBL R08_MASK.B16, [z.B16], z.B16;                \ // z = x <<< 24
    VEOR z.B16, x.B16, x.B16;                         \ // x = x ^ (x <<< 24)
    VSHL $2, y.S4, z.S4;                              \
    VSRI $30, y.S4, z.S4;                             \ // z = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
    VEOR z.B16, x.B16, x.B16

// SM4 round function
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - RK: round key register
// - tmp32: temp 32/64 bits register
// - x: 128 bits temp register
// - y: 128 bits temp register
// - z: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define SM4_ROUND(RK, tmp32, x, y, z, t0, t1, t2, t3) \
    MOVW.P 4(RK), tmp32;                              \
    VDUP tmp32, x.S4;                                 \
    VEOR t1.B16, x.B16, x.B16;                        \
    VEOR t2.B16, x.B16, x.B16;                        \
    VEOR t3.B16, x.B16, x.B16;                        \
    SM4_TAO_L1(x, y, z);                              \
    VEOR x.B16, t0.B16, t0.B16

// SM4 round function for 8 blocks (two groups of four)
// t0 ^= tao_l1(t1^t2^t3^xk)
// t4 ^= tao_l1(t5^t6^t7^xk)
// parameters:
// - RK: round key register
// - tmp32: temp 32/64 bits register
// - x: 128 bits temp register
// - y: 128 bits temp register
// - z: 128 bits temp register
// - tmp: 128 bits temp register holding the broadcast round key
// - t0: 128 bits register for data as result (first group)
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
// - t4: 128 bits register for data as result (second group)
// - t5: 128 bits register for data
// - t6: 128 bits register for data
// - t7: 128 bits register for data
#define SM4_8BLOCKS_ROUND(RK, tmp32, x, y, z, tmp, t0, t1, t2, t3, t4, t5, t6, t7) \
    MOVW.P 4(RK), tmp32;                              \
    VDUP tmp32, tmp.S4;                               \
    VEOR t1.B16, tmp.B16, x.B16;                      \
    VEOR t2.B16, x.B16, x.B16;                        \
    VEOR t3.B16, x.B16, x.B16;                        \
    SM4_TAO_L1(x, y, z);                              \
    VEOR x.B16, t0.B16, t0.B16;                       \
    ;                                                 \
    VEOR t5.B16, tmp.B16, x.B16;                      \
    VEOR t6.B16, x.B16, x.B16;                        \
    VEOR t7.B16, x.B16, x.B16;                        \
    SM4_TAO_L1(x, y, z);                              \
    VEOR x.B16, t4.B16, t4.B16
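
// Note on the round macros: they are meant to run on state laid out by
// PRE_TRANSPOSE_MATRIX, so that lane i of t0..t3 holds the four 32-bit words
// of block i and one macro call advances four blocks by one round. MOVW.P
// loads the next round key and post-increments RK by 4, VDUP broadcasts it
// to all lanes, and SM4_8BLOCKS_ROUND reuses the broadcast key in tmp for a
// second group of four blocks (t4..t7). A rough scalar sketch of one round
// for a single block, with sboxWord (S-box applied to each byte of a word)
// and rotl32 as illustrative helpers rather than symbols in this file:
//
//    func sm4Round(x0, x1, x2, x3, rk uint32) uint32 {
//        b := sboxWord(x1 ^ x2 ^ x3 ^ rk)          // tao: per-byte S-box
//        l := b ^ rotl32(b, 2) ^ rotl32(b, 10) ^
//            rotl32(b, 18) ^ rotl32(b, 24)         // L1 linear transform
//        return x0 ^ l                             // callers rotate x0..x3 between rounds
//    }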