// github.com/emmansun/gmsm@v0.29.1/sm4/aesni_macros_amd64.s

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $16

// shuffle byte and word order
DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
GLOBL bswap_mask<>(SB), 8, $16

// nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), 8, $16

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
DATA inverse_shift_rows<>+0x10(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x18(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), 8, $32

// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
DATA m1_low<>+0x10(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x18(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), 8, $32

DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
DATA m1_high<>+0x10(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x18(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), 8, $32

// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
DATA m2_low<>+0x10(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x18(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), 8, $32

DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
DATA m2_high<>+0x10(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x18(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), 8, $32

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA r08_mask<>+0x10(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), 8, $32
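
// SM4 key-schedule constants FK0..FK3 (0xa3b1bac6, 0x56aa3350, 0x677d9197,
// 0xb27022dc), packed little-endian so that a 128-bit load yields the four
// words in order.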
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), 8, $16

// Transpose matrix with PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions.
// input: from high to low
// r0 = [w3, w2, w1, w0]
// r1 = [w7, w6, w5, w4]
// r2 = [w11, w10, w9, w8]
// r3 = [w15, w14, w13, w12]
// tmp1: 128 bits temp register
// tmp2: 128 bits temp register
//
// output: from high to low
// r0 = [w12, w8, w4, w0]
// r1 = [w13, w9, w5, w1]
// r2 = [w14, w10, w6, w2]
// r3 = [w15, w11, w7, w3]
//
// SSE2/MMX instructions:
// MOVOU r0, tmp2;
// PUNPCKHDQ r1, tmp2;
// PUNPCKLDQ r1, r0;
// MOVOU r2, tmp1;
// PUNPCKLDQ r3, tmp1;
// PUNPCKHDQ r3, r2;
// MOVOU r0, r1;
// PUNPCKHQDQ tmp1, r1;
// PUNPCKLQDQ tmp1, r0;
// MOVOU tmp2, r3;
// PUNPCKHQDQ r2, r3;
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
#define SSE_TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
	MOVOU r0, tmp2; \
	PUNPCKHLQ r1, tmp2; \
	PUNPCKLLQ r1, r0; \
	MOVOU r2, tmp1; \
	PUNPCKLLQ r3, tmp1; \
	PUNPCKHLQ r3, r2; \
	MOVOU r0, r1; \
	PUNPCKHQDQ tmp1, r1; \
	PUNPCKLQDQ tmp1, r0; \
	MOVOU tmp2, r3; \
	PUNPCKHQDQ r2, r3; \
	PUNPCKLQDQ r2, tmp2; \
	MOVOU tmp2, r2

// SM4 sbox function
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define SM4_SBOX(x, y, z) \
	; \ //############################# inner affine ############################//
	MOVOU x, z; \
	PAND nibble_mask<>(SB), z; \ //z = _mm_and_si128(x, c0f);
	MOVOU m1_low<>(SB), y; \
	PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, z);
	PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
	PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
	MOVOU m1_high<>(SB), z; \
	PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x);
	MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x);
	PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
	; \ // inverse ShiftRows
	PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
	AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction
	; \ //############################# outer affine ############################//
	MOVOU x, z; \
	PANDN nibble_mask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f);
	MOVOU m2_low<>(SB), y; \
	PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z)
	PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
	PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
	MOVOU m2_high<>(SB), z; \
	PSHUFB x, z; \
	MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
	PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;

// SM4 TAO L1 function
// parameters:
// - x: 128 bits register as TAO_L1 input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define SM4_TAO_L1(x, y, z) \
	SM4_SBOX(x, y, z); \
	; \ //#################### 4 parallel L1 linear transforms ##################//
	MOVOU x, y; \
	PSHUFB r08_mask<>(SB), y; \ //y = x <<< 8
	MOVOU y, z; \
	PSHUFB r08_mask<>(SB), z; \ //z = x <<< 16
	PXOR x, y; \ //y = x ^ (x <<< 8)
	PXOR z, y; \ //y = x ^ (x <<< 8) ^ (x <<< 16)
	PSHUFB r08_mask<>(SB), z; \ //z = x <<< 24
	PXOR z, x; \ //x = x ^ (x <<< 24)
	MOVOU y, z; \
	PSLLL $2, z; \
	PSRLL $30, y; \
	POR z, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
	PXOR y, x
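
// For reference, the transform computed by SM4_TAO_L1 on each 32-bit word is
// the standard SM4 T transform: the S-box applied per byte, followed by the
// linear map L(b) = b ^ (b <<< 2) ^ (b <<< 10) ^ (b <<< 18) ^ (b <<< 24).
// A plain Go sketch (illustrative only; `sbox` stands for the SM4 S-box table,
// which is not defined in this file; rotations use math/bits):
//
//	func tauL1(w uint32) uint32 {
//		b := uint32(sbox[w>>24])<<24 | uint32(sbox[w>>16&0xff])<<16 |
//			uint32(sbox[w>>8&0xff])<<8 | uint32(sbox[w&0xff])
//		return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
//			bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
//	}
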
// SM4 single round function, handles 16 bytes of data
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - x: 128 bits temp register (also as input RK)
// - y: 128 bits temp register
// - z: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define SM4_SINGLE_ROUND(x, y, z, t0, t1, t2, t3) \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L1(x, y, z); \
	PXOR x, t0

// SM4 round function, handles 64 bytes of data
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - RK: round key register
// - IND: round key index base register
// - x: 128 bits temp register
// - y: 128 bits temp register
// - z: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
	MOVL (index * 4)(RK)(IND*1), x; \
	PSHUFD $0, x, x; \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L1(x, y, z); \
	PXOR x, t0

#define SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3) \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L1(x, y, z); \
	PXOR x, t0 \

#define SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3) \
	PSHUFD $0, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3); \
	PSHUFD $0x55, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0); \
	PSHUFD $0xAA, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1); \
	PSHUFD $0xFF, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \

// Requires: SSSE3
#define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \
	PSHUFB flip_mask<>(SB), t0; \
	PSHUFD $1, t0, t1; \
	PSHUFD $2, t0, t2; \
	PSHUFD $3, t0, t3; \
	MOVOU (0*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (1*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (2*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (3*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (4*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (5*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (6*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (7*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	PALIGNR $4, t3, t3; \
	PALIGNR $4, t3, t2; \
	PALIGNR $4, t2, t1; \
	PALIGNR $4, t1, t0; \
	PSHUFB flip_mask<>(SB), t0

#define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
	PSHUFB flip_mask<>(SB), t0; \
	PSHUFB flip_mask<>(SB), t1; \
	PSHUFB flip_mask<>(SB), t2; \
	PSHUFB flip_mask<>(SB), t3; \
	SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
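
// SM4_4BLOCKS_WO_BS: same as SM4_4BLOCKS but without the input byte-order swap
// ("WO_BS" presumably stands for "without byte swap"). The four states in
// t0..t3 are transposed so each register holds one word position of all four
// blocks, 32 rounds are run as 8 groups of 4 (one 128-bit round-key load per
// group), then the states are transposed back and byte-swapped for output.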
#define SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \
	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
	MOVOU (0*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (1*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (2*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (3*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (4*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (5*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (6*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	MOVOU (7*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
	PSHUFB bswap_mask<>(SB), t3; \
	PSHUFB bswap_mask<>(SB), t2; \
	PSHUFB bswap_mask<>(SB), t1; \
	PSHUFB bswap_mask<>(SB), t0
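
// SM4_8BLOCKS_4ROUNDS: the same four rounds as SM4_4BLOCKS_4ROUNDS, applied to
// two independent groups of four states (t0..t3 and t4..t7). The groups are
// interleaved and the round-key word is re-broadcast from rk128 for the second
// group, so their dependency chains can overlap.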
#define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
	PSHUFD $0, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3); \
	PSHUFD $0, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t4, t5, t6, t7); \
	PSHUFD $0x55, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0); \
	PSHUFD $0x55, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t5, t6, t7, t4); \
	PSHUFD $0xAA, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1); \
	PSHUFD $0xAA, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t6, t7, t4, t5); \
	PSHUFD $0xFF, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \
	PSHUFD $0xFF, rk128, x; \
	SM4_ONE_ROUND_SSE(x, y, z, t7, t4, t5, t6); \

#define SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
	PSHUFB flip_mask<>(SB), t0; \
	PSHUFB flip_mask<>(SB), t1; \
	PSHUFB flip_mask<>(SB), t2; \
	PSHUFB flip_mask<>(SB), t3; \
	PSHUFB flip_mask<>(SB), t4; \
	PSHUFB flip_mask<>(SB), t5; \
	PSHUFB flip_mask<>(SB), t6; \
	PSHUFB flip_mask<>(SB), t7; \
	SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)

#define SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
	SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
	MOVOU (0*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (1*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (2*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (3*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (4*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (5*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (6*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	MOVOU (7*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
	SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
	PSHUFB bswap_mask<>(SB), t3; \
	PSHUFB bswap_mask<>(SB), t2; \
	PSHUFB bswap_mask<>(SB), t1; \
	PSHUFB bswap_mask<>(SB), t0; \
	PSHUFB bswap_mask<>(SB), t7; \
	PSHUFB bswap_mask<>(SB), t6; \
	PSHUFB bswap_mask<>(SB), t5; \
	PSHUFB bswap_mask<>(SB), t4

// SM4 sbox function, AVX version
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - tmp: 128 bits temp register
#define AVX_SM4_SBOX(x, y, tmp) \
	VPAND nibble_mask<>(SB), x, tmp; \
	VMOVDQU m1_low<>(SB), y; \
	VPSHUFB tmp, y, y; \
	VPSRLQ $4, x, x; \
	VPAND nibble_mask<>(SB), x, x; \
	VMOVDQU m1_high<>(SB), tmp; \
	VPSHUFB x, tmp, x; \
	VPXOR y, x, x; \
	VPSHUFB inverse_shift_rows<>(SB), x, x; \
	VAESENCLAST nibble_mask<>(SB), x, x; \
	VPANDN nibble_mask<>(SB), x, tmp; \
	VMOVDQU m2_low<>(SB), y; \
	VPSHUFB tmp, y, y; \
	VPSRLQ $4, x, x; \
	VPAND nibble_mask<>(SB), x, x; \
	VMOVDQU m2_high<>(SB), tmp; \
	VPSHUFB x, tmp, x; \
	VPXOR y, x, x

// SM4 TAO L1 function, AVX version
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - tmp: 128 bits temp register
#define AVX_SM4_TAO_L1(x, y, tmp) \
	AVX_SM4_SBOX(x, y, tmp); \
	VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
	VPSHUFB r08_mask<>(SB), y, tmp; \ // tmp = x <<< 16
	VPXOR x, y, y; \ // y = x ^ (x <<< 8)
	VPXOR tmp, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
	VPSHUFB r08_mask<>(SB), tmp, tmp; \ // tmp = x <<< 24
	VPXOR x, tmp, x; \ // x = x ^ (x <<< 24)
	VPSLLD $2, y, tmp; \
	VPSRLD $30, y, y; \
	VPOR tmp, y, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
	VPXOR y, x, x

// transpose matrix function, AVX/AVX2 version
// parameters:
// - r0: 128/256 bits register as input/output data
// - r1: 128/256 bits register as input/output data
// - r2: 128/256 bits register as input/output data
// - r3: 128/256 bits register as input/output data
// - tmp1: 128/256 bits temp register
// - tmp2: 128/256 bits temp register
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
	VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2]   tmp2 = [w7, w3, w6, w2]
	VPUNPCKLDQ r1, r0, r0; \   // r0 = [w13, w5, w12, w4, w9, w1, w8, w0]        r0 = [w5, w1, w4, w0]
	VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
	VPUNPCKHDQ r3, r2, r2; \   // r2 = [w31, w23, w30, w22, w27, w19, w26, w18]  r2 = [w15, w11, w14, w10]
	VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1]    r1 = [w13, w9, w5, w1]
	VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0]    r0 = [w12, w8, w4, w0]
	VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w23, w15, w7, w27, w19, w11, w3]   r3 = [w15, w11, w7, w3]
	VPUNPCKLQDQ r2, tmp2, r2    // r2 = [w30, w22, w14, w6, w26, w18, w10, w2]   r2 = [w14, w10, w6, w2]

// SM4 round function, AVX version, handles 128 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 128 bits temp register
// - y: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
	MOVL (index * 4)(RK)(IND*1), x; \
	VPSHUFD $0, x, x; \ // Use VBROADCASTSS ?
	VPXOR t1, x, x; \
	VPXOR t2, x, x; \
	VPXOR t3, x, x; \
	AVX_SM4_TAO_L1(x, y, tmp); \
	VPXOR x, t0, t0
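
// SM4_ONE_ROUND_AVX: one SM4 round over four word-sliced states, AVX
// three-operand form; x must already hold the broadcast round-key word and is
// clobbered.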
#define SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3) \
	VPXOR t1, x, x; \
	VPXOR t2, x, x; \
	VPXOR t3, x, x; \
	AVX_SM4_TAO_L1(x, y, z); \
	VPXOR x, t0, t0 \

#define SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3) \
	VPSHUFD $0, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3); \
	VPSHUFD $0x55, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0); \
	VPSHUFD $0xAA, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1); \
	VPSHUFD $0xFF, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \

#define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
	VPSHUFB flip_mask<>(SB), t0, t0 \
	VPSHUFB flip_mask<>(SB), t1, t1 \
	VPSHUFB flip_mask<>(SB), t2, t2 \
	VPSHUFB flip_mask<>(SB), t3, t3 \
	; \
	AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)

#define AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \
	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
	VMOVDQU (0*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
	VMOVDQU (1*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
	VMOVDQU (2*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
	VMOVDQU (3*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
	VMOVDQU (4*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
	VMOVDQU (5*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
	VMOVDQU (6*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
	VMOVDQU (7*16)(RK), rk128; \
	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
	; \ // Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
	VPSHUFB bswap_mask<>(SB), t0, t0 \
	VPSHUFB bswap_mask<>(SB), t1, t1 \
	VPSHUFB bswap_mask<>(SB), t2, t2 \
	VPSHUFB bswap_mask<>(SB), t3, t3 \

#define SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
	VPSHUFD $0, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3); \
	VPSHUFD $0, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t4, t5, t6, t7); \
	VPSHUFD $0x55, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0); \
	VPSHUFD $0x55, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t5, t6, t7, t4); \
	VPSHUFD $0xAA, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1); \
	VPSHUFD $0xAA, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t6, t7, t4, t5); \
	VPSHUFD $0xFF, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \
	VPSHUFD $0xFF, rk128, x; \
	SM4_ONE_ROUND_AVX(x, y, z, t7, t4, t5, t6); \

#define AVX_SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
	VPSHUFB flip_mask<>(SB), t0, t0 \
	VPSHUFB flip_mask<>(SB), t1, t1 \
	VPSHUFB flip_mask<>(SB), t2, t2 \
	VPSHUFB flip_mask<>(SB), t3, t3 \
	VPSHUFB flip_mask<>(SB), t4, t4 \
	VPSHUFB flip_mask<>(SB), t5, t5 \
	VPSHUFB flip_mask<>(SB), t6, t6 \
	VPSHUFB flip_mask<>(SB), t7, t7 \
	; \
	AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
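
// AVX_SM4_8BLOCKS_WO_BS: AVX counterpart of SM4_8BLOCKS_WO_BS, processing two
// groups of four blocks (t0..t3 and t4..t7); the input byte-order swap is left
// to the caller.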
#define AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
	TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
	VMOVDQU (0*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	VMOVDQU (1*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	VMOVDQU (2*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	VMOVDQU (3*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	VMOVDQU (4*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	VMOVDQU (5*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	VMOVDQU (6*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	VMOVDQU (7*16)(RK), rk128; \
	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
	; \ // Transpose matrix 4 x 4 32bits word
	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
	TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
	VPSHUFB bswap_mask<>(SB), t0, t0 \
	VPSHUFB bswap_mask<>(SB), t1, t1 \
	VPSHUFB bswap_mask<>(SB), t2, t2 \
	VPSHUFB bswap_mask<>(SB), t3, t3 \
	VPSHUFB bswap_mask<>(SB), t4, t4 \
	VPSHUFB bswap_mask<>(SB), t5, t5 \
	VPSHUFB bswap_mask<>(SB), t6, t6 \
	VPSHUFB bswap_mask<>(SB), t7, t7 \
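
// The AVX2 macros below keep eight 32-bit words per 256-bit register, so four
// YMM registers carry eight blocks in word-sliced form. AESENCLAST has no
// 256-bit form without the VAES extension, so AVX2_SM4_SBOX drops to two
// 128-bit VAESENCLASTs: it extracts the upper lane into yw, processes both
// halves, and re-inserts the upper lane.
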
// SM4 sbox function, AVX2 version
// parameters:
// - x: 256 bits register as sbox input/output data
// - y: 256 bits temp register
// - z: 256 bits temp register
// - xw: 128 bits temp register
// - yw: 128 bits temp register
// - xNibbleMask: 128 bits register holding the nibble mask, should be loaded earlier.
// - yNibbleMask: 256 bits register holding the nibble mask, should be loaded earlier.
#define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
	VPAND yNibbleMask, x, z; \
	VMOVDQU m1_low<>(SB), y; \
	VPSHUFB z, y, y; \
	VPSRLQ $4, x, x; \
	VPAND yNibbleMask, x, x; \
	VMOVDQU m1_high<>(SB), z; \
	VPSHUFB x, z, x; \
	VPXOR y, x, x; \
	VPSHUFB inverse_shift_rows<>(SB), x, x; \
	VEXTRACTI128 $1, x, yw \
	VAESENCLAST xNibbleMask, xw, xw; \
	VAESENCLAST xNibbleMask, yw, yw; \
	VINSERTI128 $1, yw, x, x; \
	VPANDN yNibbleMask, x, z; \
	VMOVDQU m2_low<>(SB), y; \
	VPSHUFB z, y, y; \
	VPSRLQ $4, x, x; \
	VPAND yNibbleMask, x, x; \
	VMOVDQU m2_high<>(SB), z; \
	VPSHUFB x, z, x; \
	VPXOR y, x, x

// SM4 TAO L1 function, AVX2 version
// parameters:
// - x: 256 bits register as sbox input/output data
// - y: 256 bits temp register
// - z: 256 bits temp register
// - xw: 128 bits temp register, the low 128-bit half of x!
// - yw: 128 bits temp register, the low 128-bit half of y!
// - xNibbleMask: 128 bits register holding the nibble mask, should be loaded earlier.
// - yNibbleMask: 256 bits register holding the nibble mask, should be loaded earlier.
#define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
	AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
	VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
	VPSHUFB r08_mask<>(SB), y, z; \ // z = x <<< 16
	VPXOR x, y, y; \ // y = x ^ (x <<< 8)
	VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
	VPSHUFB r08_mask<>(SB), z, z; \ // z = x <<< 24
	VPXOR x, z, x; \ // x = x ^ (x <<< 24)
	VPSLLD $2, y, z; \
	VPSRLD $30, y, y; \
	VPOR z, y, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
	VPXOR y, x, x

// SM4 round function, AVX2 version, handles 256 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 256 bits temp register, MUST use XDWORD!
// - y: 256 bits temp register, MUST use YDWORD!
// - t0: 256 bits register for data as result
// - t1: 256 bits register for data
// - t2: 256 bits register for data
// - t3: 256 bits register for data
#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \
	VPBROADCASTD (index * 4)(RK)(IND*1), x; \
	VPXOR t1, x, x; \
	VPXOR t2, x, x; \
	VPXOR t3, x, x; \
	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
	VPXOR x, t0, t0

// SM4 round function, AVX2 version, handles 256 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 256 bits temp register, MUST use XDWORD!
// - y: 256 bits temp register, MUST use YDWORD!
// - t0: 256 bits register for data as result
// - t1: 256 bits register for data
// - t2: 256 bits register for data
// - t3: 256 bits register for data
#define AVX2_SM4_ROUND2(index, RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \
	VPBROADCASTD (index * 4)(RK), x; \
	VPXOR t1, x, x; \
	VPXOR t2, x, x; \
	VPXOR t3, x, x; \
	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
	VPXOR x, t0, t0

// SM4 round function, AVX version, handles 128 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 128 bits temp register
// - y: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define AVX2_SM4_ROUND_4BLOCKS(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
	VPBROADCASTD (index * 4)(RK)(IND*1), x; \
	VPXOR t1, x, x; \
	VPXOR t2, x, x; \
	VPXOR t3, x, x; \
	AVX_SM4_TAO_L1(x, y, tmp); \
	VPXOR x, t0, t0
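
// AVX2_SM4_8BLOCKS: all 32 SM4 rounds, fully unrolled, over eight blocks held
// word-sliced in the four 256-bit registers t0..t3; arranging (and restoring)
// that layout is left to the caller.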
#define AVX2_SM4_8BLOCKS(RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \
	AVX2_SM4_ROUND2(0, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
	AVX2_SM4_ROUND2(1, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
	AVX2_SM4_ROUND2(2, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
	AVX2_SM4_ROUND2(3, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
	AVX2_SM4_ROUND2(4, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
	AVX2_SM4_ROUND2(5, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
	AVX2_SM4_ROUND2(6, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
	AVX2_SM4_ROUND2(7, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
	AVX2_SM4_ROUND2(8, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
	AVX2_SM4_ROUND2(9, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
	AVX2_SM4_ROUND2(10, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
	AVX2_SM4_ROUND2(11, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
	AVX2_SM4_ROUND2(12, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
	AVX2_SM4_ROUND2(13, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
	AVX2_SM4_ROUND2(14, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
	AVX2_SM4_ROUND2(15, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
	AVX2_SM4_ROUND2(16, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
	AVX2_SM4_ROUND2(17, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
	AVX2_SM4_ROUND2(18, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
	AVX2_SM4_ROUND2(19, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
	AVX2_SM4_ROUND2(20, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
	AVX2_SM4_ROUND2(21, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
	AVX2_SM4_ROUND2(22, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
	AVX2_SM4_ROUND2(23, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
	AVX2_SM4_ROUND2(24, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
	AVX2_SM4_ROUND2(25, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
	AVX2_SM4_ROUND2(26, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
	AVX2_SM4_ROUND2(27, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
	AVX2_SM4_ROUND2(28, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
	AVX2_SM4_ROUND2(29, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
	AVX2_SM4_ROUND2(30, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
	AVX2_SM4_ROUND2(31, RK, x, y, xw, yw, tmp, t3, t0, t1, t2)

// SM4 round function, AVX2 version, handles 2 x 256 bits
// t0 ^= tao_l1(t1^t2^t3^xk), t4 ^= tao_l1(t5^t6^t7^xk)
// parameters:
// - index: round key index immediate number
// - x: 256 bits temp register, MUST use XDWORD!
// - y: 256 bits temp register, MUST use YDWORD!
// - tmp: 256 bits temp register
// - tmp1: 256 bits temp register
// - t0: 256 bits register for data as result
// - t1: 256 bits register for data
// - t2: 256 bits register for data
// - t3: 256 bits register for data
// - t4: 256 bits register for data as result
// - t5: 256 bits register for data
// - t6: 256 bits register for data
// - t7: 256 bits register for data
#define AVX2_SM4_16BLOCKS_ROUND(index, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \
	VPBROADCASTD (index * 4)(RK), tmp1; \
	VPXOR t1, tmp1, x; \
	VPXOR t2, x, x; \
	VPXOR t3, x, x; \
	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
	VPXOR x, t0, t0; \
	; \
	VPXOR t5, tmp1, x; \
	VPXOR t6, x, x; \
	VPXOR t7, x, x; \
	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
	VPXOR x, t4, t4; \
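
// AVX2_SM4_16BLOCKS: all 32 SM4 rounds, fully unrolled, over sixteen blocks in
// two word-sliced groups (t0..t3 and t4..t7); each broadcast round key is
// shared by both groups via AVX2_SM4_16BLOCKS_ROUND. Layout preparation is
// left to the caller.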
#define AVX2_SM4_16BLOCKS(RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \
	AVX2_SM4_16BLOCKS_ROUND(0, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
	AVX2_SM4_16BLOCKS_ROUND(1, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
	AVX2_SM4_16BLOCKS_ROUND(2, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
	AVX2_SM4_16BLOCKS_ROUND(3, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
	AVX2_SM4_16BLOCKS_ROUND(4, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
	AVX2_SM4_16BLOCKS_ROUND(5, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
	AVX2_SM4_16BLOCKS_ROUND(6, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
	AVX2_SM4_16BLOCKS_ROUND(7, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
	AVX2_SM4_16BLOCKS_ROUND(8, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
	AVX2_SM4_16BLOCKS_ROUND(9, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
	AVX2_SM4_16BLOCKS_ROUND(10, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
	AVX2_SM4_16BLOCKS_ROUND(11, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
	AVX2_SM4_16BLOCKS_ROUND(12, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
	AVX2_SM4_16BLOCKS_ROUND(13, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
	AVX2_SM4_16BLOCKS_ROUND(14, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
	AVX2_SM4_16BLOCKS_ROUND(15, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
	AVX2_SM4_16BLOCKS_ROUND(16, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
	AVX2_SM4_16BLOCKS_ROUND(17, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
	AVX2_SM4_16BLOCKS_ROUND(18, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
	AVX2_SM4_16BLOCKS_ROUND(19, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
	AVX2_SM4_16BLOCKS_ROUND(20, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
	AVX2_SM4_16BLOCKS_ROUND(21, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
	AVX2_SM4_16BLOCKS_ROUND(22, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
	AVX2_SM4_16BLOCKS_ROUND(23, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
	AVX2_SM4_16BLOCKS_ROUND(24, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
	AVX2_SM4_16BLOCKS_ROUND(25, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
	AVX2_SM4_16BLOCKS_ROUND(26, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
	AVX2_SM4_16BLOCKS_ROUND(27, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
	AVX2_SM4_16BLOCKS_ROUND(28, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
	AVX2_SM4_16BLOCKS_ROUND(29, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
	AVX2_SM4_16BLOCKS_ROUND(30, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
	AVX2_SM4_16BLOCKS_ROUND(31, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6)