github.com/emmansun/gmsm@v0.29.1/sm4/aesni_macros_amd64.s

     1  // shuffle byte order from LE to BE
     2  DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
     3  DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
     4  GLOBL flip_mask<>(SB), 8, $16
     5  
     6  // shuffle byte and word order
     7  DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
     8  DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
     9  GLOBL bswap_mask<>(SB), 8, $16
    10  
     11  // nibble mask
    12  DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
    13  DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
    14  GLOBL nibble_mask<>(SB), 8, $16
    15  
    16  // inverse shift rows
    17  DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
    18  DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
    19  DATA inverse_shift_rows<>+0x10(SB)/8, $0x0B0E0104070A0D00
    20  DATA inverse_shift_rows<>+0x18(SB)/8, $0x0306090C0F020508
    21  GLOBL inverse_shift_rows<>(SB), 8, $32
    22  
    23  // Affine transform 1 (low and high nibbles)
    24  DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
    25  DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
    26  DATA m1_low<>+0x10(SB)/8, $0x0A7FC3B6D5A01C69
    27  DATA m1_low<>+0x18(SB)/8, $0x3045F98CEF9A2653
    28  GLOBL m1_low<>(SB), 8, $32
    29  
    30  DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
    31  DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
    32  DATA m1_high<>+0x10(SB)/8, $0xC35BF46CAF379800
    33  DATA m1_high<>+0x18(SB)/8, $0x68F05FC7049C33AB
    34  GLOBL m1_high<>(SB), 8, $32
    35  
    36  // Affine transform 2 (low and high nibbles)
    37  DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
    38  DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
    39  DATA m2_low<>+0x10(SB)/8, $0x9A950A05FEF16E61
    40  DATA m2_low<>+0x18(SB)/8, $0x0E019E916A65FAF5
    41  GLOBL m2_low<>(SB), 8, $32
    42  
    43  DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
    44  DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
    45  DATA m2_high<>+0x10(SB)/8, $0x892D69CD44E0A400
    46  DATA m2_high<>+0x18(SB)/8, $0x2C88CC68E14501A5
    47  GLOBL m2_high<>(SB), 8, $32
    48  
    49  // left rotations of 32-bit words by 8-bit increments
    50  DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
    51  DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
    52  DATA r08_mask<>+0x10(SB)/8, $0x0605040702010003
    53  DATA r08_mask<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
    54  GLOBL r08_mask<>(SB), 8, $32
    55  
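         // The shuffle above rotates every 32-bit word of a 128-bit register left
         // by 8 bits; applying it two or three times gives the <<<16 and <<<24
         // rotations used by the TAO_L1 macros below. A minimal Go sketch of the
         // same per-word operation (illustrative only, not part of this package):
         //
         //	import (
         //		"encoding/binary"
         //		"math/bits"
         //	)
         //
         //	// rotl8Words mimics PSHUFB with r08_mask on a 16-byte lane: each
         //	// little-endian 32-bit word is rotated left by 8 bits.
         //	func rotl8Words(v []byte) {
         //		for i := 0; i+4 <= len(v); i += 4 {
         //			w := binary.LittleEndian.Uint32(v[i:])
         //			binary.LittleEndian.PutUint32(v[i:], bits.RotateLeft32(w, 8))
         //		}
         //	}
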
    56  DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
    57  DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
    58  GLOBL fk_mask<>(SB), 8, $16
    59  
    60  // Transpose matrix with PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions.
    61  // input: from high to low
    62  // r0 = [w3, w2, w1, w0]
    63  // r1 = [w7, w6, w5, w4]
    64  // r2 = [w11, w10, w9, w8]
    65  // r3 = [w15, w14, w13, w12]
     66  // temp registers:
     67  // tmp1: 128-bit temp register
     68  // tmp2: 128-bit temp register
    69  //
    70  // output: from high to low
    71  // r0 = [w12, w8, w4, w0]
    72  // r1 = [w13, w9, w5, w1]
    73  // r2 = [w14, w10, w6, w2]
    74  // r3 = [w15, w11, w7, w3]
    75  //
     76  // The same sequence with Intel mnemonics:
    77  //	MOVOU r0, tmp2;
    78  //	PUNPCKHDQ r1, tmp2;
    79  //	PUNPCKLDQ	r1, r0; 
    80  //	MOVOU r2, tmp1; 
    81  //	PUNPCKLDQ r3, tmp1; 
    82  //	PUNPCKHDQ r3, r2; 
    83  //	MOVOU r0, r1; 
    84  //	PUNPCKHQDQ tmp1, r1; 
    85  //	PUNPCKLQDQ tmp1, r0; 
    86  //	MOVOU tmp2, r3; 
    87  //	PUNPCKHQDQ r2, r3; 
    88  //	PUNPCKLQDQ r2, tmp2; 
    89  //	MOVOU tmp2, r2
    90  #define SSE_TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
    91  	MOVOU r0, tmp2;      \
    92  	PUNPCKHLQ r1, tmp2;  \
    93  	PUNPCKLLQ	r1, r0;  \
    94  	MOVOU r2, tmp1;      \
    95  	PUNPCKLLQ r3, tmp1;  \
    96  	PUNPCKHLQ r3, r2;    \
    97  	MOVOU r0, r1;        \
    98  	PUNPCKHQDQ tmp1, r1; \
    99  	PUNPCKLQDQ tmp1, r0; \
   100  	MOVOU tmp2, r3;      \
   101  	PUNPCKHQDQ r2, r3;   \
   102  	PUNPCKLQDQ r2, tmp2; \
   103  	MOVOU tmp2, r2
   104  
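         // A Go sketch (illustration only) of the 4x4 32-bit word transpose that
         // SSE_TRANSPOSE_MATRIX performs, using the r0..r3 word layout documented
         // above:
         //
         //	// transpose4x4 swaps m[i][j] with m[j][i] for the 4x4 word matrix.
         //	func transpose4x4(m *[4][4]uint32) {
         //		for i := 0; i < 4; i++ {
         //			for j := i + 1; j < 4; j++ {
         //				m[i][j], m[j][i] = m[j][i], m[i][j]
         //			}
         //		}
         //	}
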
   105  // SM4 sbox function
   106  // parameters:
    107  // -  x: 128-bit register holding the sbox input/output data
    108  // -  y: 128-bit temp register
    109  // -  z: 128-bit temp register
   110  #define SM4_SBOX(x, y, z) \
   111  	;                                   \ //#############################  inner affine ############################//
   112  	MOVOU x, z;                         \
    113  	PAND nibble_mask<>(SB), z;          \ //z = _mm_and_si128(x, c0f);
   114  	MOVOU m1_low<>(SB), y;              \
    115  	PSHUFB z, y;                        \ //y = _mm_shuffle_epi8(m1l, z);
   116  	PSRLQ $4, x;                        \ //x = _mm_srli_epi64(x, 4); 
   117  	PAND nibble_mask<>(SB), x;          \ //x = _mm_and_si128(x, c0f);
   118  	MOVOU m1_high<>(SB), z;             \
   119  	PSHUFB x, z;                        \ //x = _mm_shuffle_epi8(m1h, x);
   120  	MOVOU  z, x;                        \ //x = _mm_shuffle_epi8(m1h, x);
   121  	PXOR y, x;                          \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
   122  	;                                   \ // inverse ShiftRows
   123  	PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr); 
   124  	AESENCLAST nibble_mask<>(SB), x;    \ // AESNI instruction
   125  	;                                   \ //#############################  outer affine ############################//
   126  	MOVOU  x, z;                        \
   127  	PANDN nibble_mask<>(SB), z;         \ //z = _mm_andnot_si128(x, c0f);
   128  	MOVOU m2_low<>(SB), y;              \ 
   129  	PSHUFB z, y;                        \ //y = _mm_shuffle_epi8(m2l, z)
   130  	PSRLQ $4, x;                        \ //x = _mm_srli_epi64(x, 4);
   131  	PAND nibble_mask<>(SB), x;          \ //x = _mm_and_si128(x, c0f); 
   132  	MOVOU m2_high<>(SB), z;             \
   133  	PSHUFB x, z;                        \
   134  	MOVOU  z, x;                        \ //x = _mm_shuffle_epi8(m2h, x)
   135  	PXOR y, x                             //x = _mm_shuffle_epi8(m2h, x) ^ y; 
   136  
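         // Per byte, each PAND/PSRLQ/PSHUFB group above performs two 16-entry
         // table lookups whose results are XORed together. A hedged Go sketch of
         // that building block; lo and hi here are placeholders standing in for
         // the m1_*/m2_* constants above:
         //
         //	// affineLookup indexes one table with the low nibble and another
         //	// with the high nibble, then XORs the two bytes.
         //	func affineLookup(lo, hi *[16]byte, b byte) byte {
         //		return lo[b&0x0f] ^ hi[b>>4]
         //	}
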
   137  // SM4 TAO L1 function
   138  // parameters:
    139  // -  x: 128-bit register holding the TAO_L1 input/output data
    140  // -  y: 128-bit temp register
    141  // -  z: 128-bit temp register
   142  #define SM4_TAO_L1(x, y, z)         \
   143  	SM4_SBOX(x, y, z);                  \
   144  	;                                   \ //####################  4 parallel L1 linear transforms ##################//
   145  	MOVOU x, y;                         \
   146  	PSHUFB r08_mask<>(SB), y;           \ //y = x <<< 8
   147  	MOVOU y, z;                         \
   148  	PSHUFB r08_mask<>(SB), z;           \ //z = x <<< 16
   149  	PXOR x, y;                          \ //y = x ^ (x <<< 8)
   150  	PXOR z, y;                          \ //y = x ^ (x <<< 8) ^ (x <<< 16)
   151  	PSHUFB r08_mask<>(SB), z;           \ //z = x <<< 24
   152  	PXOR z, x;                          \ //x = x ^ (x <<< 24)
   153  	MOVOU y, z;                         \
   154  	PSLLL $2, z;                        \
   155  	PSRLL $30, y;                       \
   156  	POR z, y;                           \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
   157  	PXOR y, x
   158  
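         // Per 32-bit word, the shuffle/shift sequence above computes SM4's L1
         // linear transform. A scalar Go sketch of the equivalent (illustration
         // only, assuming math/bits):
         //
         //	// l1 is L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24).
         //	func l1(b uint32) uint32 {
         //		return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
         //			bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
         //	}
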
    159  // SM4 single round function, handles 16 bytes of data
    160  // t0 ^= tao_l1(t1^t2^t3^xk)
    161  // parameters:
    162  // -  x: 128-bit temp register, holds the round key on input
    163  // -  y: 128-bit temp register
    164  // -  z: 128-bit temp register
    165  // - t0: 128-bit data register, receives the result
    166  // - t1: 128-bit data register
    167  // - t2: 128-bit data register
    168  // - t3: 128-bit data register
   169  #define SM4_SINGLE_ROUND(x, y, z, t0, t1, t2, t3)  \ 
   170  	PXOR t1, x;                                       \
   171  	PXOR t2, x;                                       \
   172  	PXOR t3, x;                                       \
   173  	SM4_TAO_L1(x, y, z);                              \
   174  	PXOR x, t0
   175  
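         // A scalar Go sketch of the same round (illustration only; sm4Sbox is
         // the standard SM4 S-box table and l1 the linear transform sketched
         // above, both assumed helpers rather than symbols defined in this file):
         //
         //	// round returns t0 ^ L1(Sbox(t1 ^ t2 ^ t3 ^ rk)).
         //	func round(rk, t0, t1, t2, t3 uint32) uint32 {
         //		x := t1 ^ t2 ^ t3 ^ rk
         //		x = uint32(sm4Sbox[x>>24])<<24 | uint32(sm4Sbox[x>>16&0xff])<<16 |
         //			uint32(sm4Sbox[x>>8&0xff])<<8 | uint32(sm4Sbox[x&0xff])
         //		return t0 ^ l1(x)
         //	}
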
    176  // SM4 round function, handles 64 bytes of data
    177  // t0 ^= tao_l1(t1^t2^t3^xk)
    178  // parameters:
    179  // - index: round key index immediate number
    180  // - RK: round key base address register
    181  // - IND: round key index offset register
    182  // -  x: 128-bit temp register
    183  // -  y: 128-bit temp register
    184  // -  z: 128-bit temp register
    185  // - t0: 128-bit data register, receives the result
    186  // - t1: 128-bit data register
    187  // - t2: 128-bit data register
    188  // - t3: 128-bit data register
   189  #define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3)  \ 
   190  	MOVL (index * 4)(RK)(IND*1), x;                 \
   191  	PSHUFD $0, x, x;                                \
   192  	PXOR t1, x;                                     \
   193  	PXOR t2, x;                                     \
   194  	PXOR t3, x;                                     \
   195  	SM4_TAO_L1(x, y, z);                            \
   196  	PXOR x, t0
   197  
   198  #define SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3)  \
   199  	PXOR t1, x;                                     \
   200  	PXOR t2, x;                                     \
   201  	PXOR t3, x;                                     \
   202  	SM4_TAO_L1(x, y, z);                            \
   203  	PXOR x, t0                                      \
   204  
   205  #define SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3) \
   206  	PSHUFD $0, rk128, x;                                   \
   207  	SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3);            \
   208  	PSHUFD $0x55, rk128, x;                                \
   209  	SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0);            \
   210  	PSHUFD $0xAA, rk128, x;                                \
   211  	SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1);            \
   212  	PSHUFD $0xFF, rk128, x;                                \
   213  	SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2);            \
   214  
   215  // Requires: SSSE3
   216  #define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \
   217  	PSHUFB flip_mask<>(SB), t0;                            \
   218  	PSHUFD $1, t0, t1;                                     \
   219  	PSHUFD $2, t0, t2;                                     \
   220  	PSHUFD $3, t0, t3;                                     \
   221  	MOVOU (0*16)(RK), rk128;                               \
   222  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   223  	MOVOU (1*16)(RK), rk128;                               \
   224  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   225  	MOVOU (2*16)(RK), rk128;                               \
   226  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   227  	MOVOU (3*16)(RK), rk128;                               \
   228  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   229  	MOVOU (4*16)(RK), rk128;                               \
   230  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   231  	MOVOU (5*16)(RK), rk128;                               \
   232  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   233  	MOVOU (6*16)(RK), rk128;                               \
   234  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   235  	MOVOU (7*16)(RK), rk128;                               \
   236  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   237  	PALIGNR $4, t3, t3;                                    \
   238  	PALIGNR $4, t3, t2;                                    \
   239  	PALIGNR $4, t2, t1;                                    \
   240  	PALIGNR $4, t1, t0;                                    \
   241  	PSHUFB flip_mask<>(SB), t0
   242  
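         // The macro above is the vectorized form of the usual scalar flow. A
         // hedged Go sketch (illustration only, assuming encoding/binary and the
         // round helper sketched earlier); the big-endian loads/stores correspond
         // to the flip_mask shuffles and the reversed output order to the final
         // PALIGNR sequence:
         //
         //	func encryptBlockSketch(rk *[32]uint32, dst, src []byte) {
         //		b0 := binary.BigEndian.Uint32(src[0:4])
         //		b1 := binary.BigEndian.Uint32(src[4:8])
         //		b2 := binary.BigEndian.Uint32(src[8:12])
         //		b3 := binary.BigEndian.Uint32(src[12:16])
         //		for i := 0; i < 32; i++ {
         //			b0, b1, b2, b3 = b1, b2, b3, round(rk[i], b0, b1, b2, b3)
         //		}
         //		binary.BigEndian.PutUint32(dst[0:4], b3)
         //		binary.BigEndian.PutUint32(dst[4:8], b2)
         //		binary.BigEndian.PutUint32(dst[8:12], b1)
         //		binary.BigEndian.PutUint32(dst[12:16], b0)
         //	}
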
   243  #define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3)  \ 
   244  	PSHUFB flip_mask<>(SB), t0; \
   245  	PSHUFB flip_mask<>(SB), t1; \
   246  	PSHUFB flip_mask<>(SB), t2; \
   247  	PSHUFB flip_mask<>(SB), t3; \
   248  	SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
   249  
   250  #define SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)  \ 
   251  	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);            \
   252  	MOVOU (0*16)(RK), rk128;                               \
   253  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   254  	MOVOU (1*16)(RK), rk128;                               \
   255  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   256  	MOVOU (2*16)(RK), rk128;                               \
   257  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   258  	MOVOU (3*16)(RK), rk128;                               \
   259  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   260  	MOVOU (4*16)(RK), rk128;                               \
   261  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   262  	MOVOU (5*16)(RK), rk128;                               \
   263  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   264  	MOVOU (6*16)(RK), rk128;                               \
   265  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   266  	MOVOU (7*16)(RK), rk128;                               \
   267  	SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3);   \
   268  	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);            \
   269  	PSHUFB bswap_mask<>(SB), t3;                           \
   270  	PSHUFB bswap_mask<>(SB), t2;                           \
   271  	PSHUFB bswap_mask<>(SB), t1;                           \
   272  	PSHUFB bswap_mask<>(SB), t0
   273  
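         // Note on the 4-block layout: after the first SSE_TRANSPOSE_MATRIX call,
         // t0 holds word 0 of all four blocks, t1 word 1, and so on, so each
         // broadcast round key advances the same round for four independent
         // blocks at once; the second transpose restores the per-block layout
         // before the closing byte swap.
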
   274  #define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
   275  	PSHUFD $0, rk128, x;                                   \
   276  	SM4_ONE_ROUND_SSE(x, y, z, t0, t1, t2, t3);            \
   277  	PSHUFD $0, rk128, x;                                   \
   278  	SM4_ONE_ROUND_SSE(x, y, z, t4, t5, t6, t7);            \
   279  	PSHUFD $0x55, rk128, x;                                \
   280  	SM4_ONE_ROUND_SSE(x, y, z, t1, t2, t3, t0);            \
   281  	PSHUFD $0x55, rk128, x;                                \
   282  	SM4_ONE_ROUND_SSE(x, y, z, t5, t6, t7, t4);            \
   283  	PSHUFD $0xAA, rk128, x;                                \
   284  	SM4_ONE_ROUND_SSE(x, y, z, t2, t3, t0, t1);            \
   285  	PSHUFD $0xAA, rk128, x;                                \
   286  	SM4_ONE_ROUND_SSE(x, y, z, t6, t7, t4, t5);            \
   287  	PSHUFD $0xFF, rk128, x;                                \
   288  	SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2);            \
   289  	PSHUFD $0xFF, rk128, x;                                \
   290  	SM4_ONE_ROUND_SSE(x, y, z, t7, t4, t5, t6);            \
   291  
   292  #define SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)  \ 
   293  	PSHUFB flip_mask<>(SB), t0; \
   294  	PSHUFB flip_mask<>(SB), t1; \
   295  	PSHUFB flip_mask<>(SB), t2; \
   296  	PSHUFB flip_mask<>(SB), t3; \
   297  	PSHUFB flip_mask<>(SB), t4; \
   298  	PSHUFB flip_mask<>(SB), t5; \
   299  	PSHUFB flip_mask<>(SB), t6; \
   300  	PSHUFB flip_mask<>(SB), t7; \	
   301  	SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
   302  
   303  #define SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)  \ 
   304  	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);                           \
   305  	SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y);                           \
   306  	MOVOU (0*16)(RK), rk128;                                              \
   307  	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7);  \
   308  	MOVOU (1*16)(RK), rk128;                                              \
   309  	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7);  \
   310  	MOVOU (2*16)(RK), rk128;                                              \
   311  	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7);  \
   312  	MOVOU (3*16)(RK), rk128;                                              \
   313  	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7);  \
   314  	MOVOU (4*16)(RK), rk128;                                              \
   315  	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7);  \
   316  	MOVOU (5*16)(RK), rk128;                                              \
   317  	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7);  \
   318  	MOVOU (6*16)(RK), rk128;                                              \
   319  	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7);  \
   320  	MOVOU (7*16)(RK), rk128;                                              \
   321  	SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7);  \
   322  	SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y);                           \
   323  	SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y);                           \
   324  	PSHUFB bswap_mask<>(SB), t3;                                          \
   325  	PSHUFB bswap_mask<>(SB), t2;                                          \
   326  	PSHUFB bswap_mask<>(SB), t1;                                          \
   327  	PSHUFB bswap_mask<>(SB), t0;                                          \
   328  	PSHUFB bswap_mask<>(SB), t7;                                          \
   329  	PSHUFB bswap_mask<>(SB), t6;                                          \
   330  	PSHUFB bswap_mask<>(SB), t5;                                          \
   331  	PSHUFB bswap_mask<>(SB), t4
   332  
   333  // SM4 sbox function, AVX version
   334  // parameters:
    335  // -  x: 128-bit register holding the sbox input/output data
    336  // -  y: 128-bit temp register
    337  // - tmp: 128-bit temp register
   338  #define AVX_SM4_SBOX(x, y, tmp) \
   339  	VPAND nibble_mask<>(SB), x, tmp;                   \
   340  	VMOVDQU m1_low<>(SB), y;                           \
   341  	VPSHUFB tmp, y, y;                                 \
   342  	VPSRLQ $4, x, x;                                   \
   343  	VPAND nibble_mask<>(SB), x, x;                     \
   344  	VMOVDQU m1_high<>(SB), tmp;                        \
   345  	VPSHUFB x, tmp, x;                                 \
   346  	VPXOR y, x, x;                                     \
   347  	VPSHUFB inverse_shift_rows<>(SB), x, x;            \
   348  	VAESENCLAST nibble_mask<>(SB), x, x;               \
   349  	VPANDN nibble_mask<>(SB), x, tmp;                  \
   350  	VMOVDQU m2_low<>(SB), y;                           \
   351  	VPSHUFB tmp, y, y;                                 \
   352  	VPSRLQ $4, x, x;                                   \
   353  	VPAND nibble_mask<>(SB), x, x;                     \
   354  	VMOVDQU m2_high<>(SB), tmp;                        \
   355  	VPSHUFB x, tmp, x;                                 \
   356  	VPXOR y, x, x
   357  
   358  // SM4 TAO L1 function, AVX version
   359  // parameters:
    360  // -  x: 128-bit register holding the TAO_L1 input/output data
    361  // -  y: 128-bit temp register
    362  // - tmp: 128-bit temp register
   363  #define AVX_SM4_TAO_L1(x, y, tmp) \
   364  	AVX_SM4_SBOX(x, y, tmp);                \
   365  	VPSHUFB r08_mask<>(SB), x, y;           \ // y = x <<< 8
   366  	VPSHUFB r08_mask<>(SB), y, tmp;         \ // tmp = x <<< 16
   367  	VPXOR x, y, y;                          \ // y = x ^ (x <<< 8)
   368  	VPXOR tmp, y, y;                        \ // y = x ^ (x <<< 8) ^ (x <<< 16)
   369  	VPSHUFB r08_mask<>(SB), tmp, tmp;       \ // tmp = x <<< 24
   370  	VPXOR x, tmp, x;                        \ // x = x ^ (x <<< 24)
   371  	VPSLLD $2, y, tmp;                      \
   372  	VPSRLD $30, y, y;                       \
   373  	VPOR tmp, y, y;                         \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
   374  	VPXOR y, x, x
   375  
   376  // transpose matrix function, AVX/AVX2 version
   377  // parameters:
    378  // - r0: 128/256-bit register as input/output data
    379  // - r1: 128/256-bit register as input/output data
    380  // - r2: 128/256-bit register as input/output data
    381  // - r3: 128/256-bit register as input/output data
    382  // - tmp1: 128/256-bit temp register
    383  // - tmp2: 128/256-bit temp register
   384  #define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
   385  	VPUNPCKHDQ r1, r0, tmp2;                 \ // tmp2 =  [w15, w7, w14, w6, w11, w3, w10, w2]          tmp2 = [w7, w3, w6, w2]
   386  	VPUNPCKLDQ r1, r0, r0;                   \ // r0 =    [w13, w5, w12, w4, w9, w1, w8, w0]              r0 = [w5, w1, w4, w0]
   387  	VPUNPCKLDQ r3, r2, tmp1;                 \ // tmp1 =  [w29, w21, w28, w20, w25, w17, w24, w16]      tmp1 = [w13, w9, w12, w8]
    388  	VPUNPCKHDQ r3, r2, r2;                   \ // r2 =    [w31, w23, w30, w22, w27, w19, w26, w18]        r2 = [w15, w11, w14, w10]
   389  	VPUNPCKHQDQ tmp1, r0, r1;                \ // r1 =    [w29, w21, w13, w5, w25, w17, w9, w1]           r1 = [w13, w9, w5, w1]
   390  	VPUNPCKLQDQ tmp1, r0, r0;                \ // r0 =    [w28, w20, w12, w4, w24, w16, w8, w0]           r0 = [w12, w8, w4, w0]
    391  	VPUNPCKHQDQ r2, tmp2, r3;                \ // r3 =    [w31, w23, w15, w7, w27, w19, w11, w3]          r3 = [w15, w11, w7, w3]
   392  	VPUNPCKLQDQ r2, tmp2, r2                   // r2 =    [w30, w22, w14, w6, w26, w18, w10, w2]          r2 = [w14, w10, w6, w2]
   393  
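         // Note: on 256-bit registers the VPUNPCK* instructions operate within
         // each 128-bit lane independently, so this macro performs two 4x4 word
         // transposes side by side (one per lane) rather than a single 8x8
         // transpose; the paired annotations above give both the 256-bit and the
         // 128-bit views.
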
    394  // SM4 round function, AVX version, handles 128 bits
    395  // t0 ^= tao_l1(t1^t2^t3^xk)
    396  // parameters:
    397  // - index: round key index immediate number
    398  // - RK: round key base address register; IND: round key index offset register
    399  // -  x: 128-bit temp register
    400  // -  y: 128-bit temp register
    401  // - tmp: 128-bit temp register
    402  // - t0: 128-bit data register, receives the result
    403  // - t1, t2, t3: 128-bit data registers
   404  #define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3)  \ 
   405  	MOVL (index * 4)(RK)(IND*1), x;                    \
   406  	VPSHUFD $0, x, x;                                  \ // Use VBROADCASTSS ?
   407  	VPXOR t1, x, x;                                    \
   408  	VPXOR t2, x, x;                                    \
   409  	VPXOR t3, x, x;                                    \
   410  	AVX_SM4_TAO_L1(x, y, tmp);                         \  
   411  	VPXOR x, t0, t0
   412  
   413  
   414  #define SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3)  \
   415  	VPXOR t1, x, x;                                    \
   416  	VPXOR t2, x, x;                                    \
   417  	VPXOR t3, x, x;                                    \
   418  	AVX_SM4_TAO_L1(x, y, z);                           \
   419  	VPXOR x, t0, t0                                    \
   420  
   421  #define SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3) \
   422  	VPSHUFD $0, rk128, x;                                   \
   423  	SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3);             \
   424  	VPSHUFD $0x55, rk128, x;                                \
   425  	SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0);             \
   426  	VPSHUFD $0xAA, rk128, x;                                \
   427  	SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1);             \
   428  	VPSHUFD $0xFF, rk128, x;                                \
   429  	SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2);             \
   430  
   431  #define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
   432  	VPSHUFB flip_mask<>(SB), t0, t0                           \
   433  	VPSHUFB flip_mask<>(SB), t1, t1                           \  
   434  	VPSHUFB flip_mask<>(SB), t2, t2                           \
   435  	VPSHUFB flip_mask<>(SB), t3, t3                           \
   436  	; \
   437  	AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
   438  
   439  #define AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \
   440  	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)                     \
   441  	VMOVDQU (0*16)(RK), rk128;                                 \
   442  	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3);   \
   443  	VMOVDQU (1*16)(RK), rk128;                                 \
   444  	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3);   \
   445  	VMOVDQU (2*16)(RK), rk128;                                 \
   446  	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3);   \
   447  	VMOVDQU (3*16)(RK), rk128;                                 \
   448  	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3);   \
   449  	VMOVDQU (4*16)(RK), rk128;                                 \
   450  	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3);   \
   451  	VMOVDQU (5*16)(RK), rk128;                                 \
   452  	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3);   \
   453  	VMOVDQU (6*16)(RK), rk128;                                 \
   454  	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3);   \
   455  	VMOVDQU (7*16)(RK), rk128;                                 \
   456  	SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3);   \
    457  	; \ // Transpose the 4x4 matrix of 32-bit words
   458  	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)                     \
   459  	VPSHUFB bswap_mask<>(SB), t0, t0                           \
   460  	VPSHUFB bswap_mask<>(SB), t1, t1                           \
   461  	VPSHUFB bswap_mask<>(SB), t2, t2                           \
   462  	VPSHUFB bswap_mask<>(SB), t3, t3                           \
   463  
   464  #define SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
   465  	VPSHUFD $0, rk128, x;                                   \
   466  	SM4_ONE_ROUND_AVX(x, y, z, t0, t1, t2, t3);             \
   467  	VPSHUFD $0, rk128, x;                                   \
   468  	SM4_ONE_ROUND_AVX(x, y, z, t4, t5, t6, t7);             \
   469  	VPSHUFD $0x55, rk128, x;                                \
   470  	SM4_ONE_ROUND_AVX(x, y, z, t1, t2, t3, t0);             \
   471  	VPSHUFD $0x55, rk128, x;                                \
   472  	SM4_ONE_ROUND_AVX(x, y, z, t5, t6, t7, t4);             \
   473  	VPSHUFD $0xAA, rk128, x;                                \
   474  	SM4_ONE_ROUND_AVX(x, y, z, t2, t3, t0, t1);             \
   475  	VPSHUFD $0xAA, rk128, x;                                \
   476  	SM4_ONE_ROUND_AVX(x, y, z, t6, t7, t4, t5);             \
   477  	VPSHUFD $0xFF, rk128, x;                                \
   478  	SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2);             \
   479  	VPSHUFD $0xFF, rk128, x;                                \
   480  	SM4_ONE_ROUND_AVX(x, y, z, t7, t4, t5, t6);             \
   481  
   482  #define AVX_SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
   483  	VPSHUFB flip_mask<>(SB), t0, t0                              \
   484  	VPSHUFB flip_mask<>(SB), t1, t1                              \
   485  	VPSHUFB flip_mask<>(SB), t2, t2                              \
   486  	VPSHUFB flip_mask<>(SB), t3, t3                              \
   487  	VPSHUFB flip_mask<>(SB), t4, t4                              \
   488  	VPSHUFB flip_mask<>(SB), t5, t5                              \
   489  	VPSHUFB flip_mask<>(SB), t6, t6                              \
   490  	VPSHUFB flip_mask<>(SB), t7, t7                              \	
   491  	; \
   492  	AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
   493  
   494  #define AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
   495  	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)                                   \
   496  	TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y)                                   \
   497  	VMOVDQU (0*16)(RK), rk128;                                               \
   498  	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
   499  	VMOVDQU (1*16)(RK), rk128;                                               \
   500  	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
   501  	VMOVDQU (2*16)(RK), rk128;                                               \
   502  	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
   503  	VMOVDQU (3*16)(RK), rk128;                                               \
   504  	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
   505  	VMOVDQU (4*16)(RK), rk128;                                               \
   506  	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
   507  	VMOVDQU (5*16)(RK), rk128;                                               \
   508  	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
   509  	VMOVDQU (6*16)(RK), rk128;                                               \
   510  	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
   511  	VMOVDQU (7*16)(RK), rk128;                                               \
   512  	SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \		
    513  	; \ // Transpose the 4x4 matrix of 32-bit words
   514  	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y)                                   \
   515  	TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y)                                   \
   516  	VPSHUFB bswap_mask<>(SB), t0, t0                                         \
   517  	VPSHUFB bswap_mask<>(SB), t1, t1                                         \
   518  	VPSHUFB bswap_mask<>(SB), t2, t2                                         \
   519  	VPSHUFB bswap_mask<>(SB), t3, t3                                         \
   520  	VPSHUFB bswap_mask<>(SB), t4, t4                                         \
   521  	VPSHUFB bswap_mask<>(SB), t5, t5                                         \
   522  	VPSHUFB bswap_mask<>(SB), t6, t6                                         \
   523  	VPSHUFB bswap_mask<>(SB), t7, t7                                         \
   524  
   525  // SM4 sbox function, AVX2 version
   526  // parameters:
    527  // -  x: 256-bit register holding the sbox input/output data
    528  // -  y: 256-bit temp register
    529  // -  z: 256-bit temp register
    530  // - xw: 128-bit temp register, must be x's low 128-bit register
    531  // - yw: 128-bit temp register, must be y's low 128-bit register
    532  // - xNibbleMask: 128-bit register holding the nibble mask, loaded beforehand
    533  // - yNibbleMask: 256-bit register holding the nibble mask, loaded beforehand
   534  #define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
   535  	VPAND yNibbleMask, x, z;                       \
   536  	VMOVDQU m1_low<>(SB), y;                       \
   537  	VPSHUFB z, y, y;                               \
   538  	VPSRLQ $4, x, x;                               \
   539  	VPAND yNibbleMask, x, x;                       \
   540  	VMOVDQU m1_high<>(SB), z;                      \
   541  	VPSHUFB x, z, x;                               \
   542  	VPXOR y, x, x;                                 \
   543  	VPSHUFB inverse_shift_rows<>(SB), x, x;        \
   544  	VEXTRACTI128 $1, x, yw                         \
   545  	VAESENCLAST xNibbleMask, xw, xw;               \
   546  	VAESENCLAST xNibbleMask, yw, yw;               \
   547  	VINSERTI128 $1, yw, x, x;                      \
   548  	VPANDN yNibbleMask, x, z;                      \
   549  	VMOVDQU m2_low<>(SB), y;                       \
   550  	VPSHUFB z, y, y;                               \
   551  	VPSRLQ $4, x, x;                               \
   552  	VPAND yNibbleMask, x, x;                       \
   553  	VMOVDQU m2_high<>(SB), z;                      \
   554  	VPSHUFB x, z, x;                               \
   555  	VPXOR y, x, x
   556  
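         // Note: AESENCLAST has no 256-bit form unless the VAES extension is
         // available, so the macro extracts the high 128 bits of x into yw, runs
         // VAESENCLAST separately on the two halves through the 128-bit aliases
         // xw/yw, and stitches the result back together with VINSERTI128.
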
   557  // SM4 TAO L1 function, AVX2 version
   558  // parameters:
    559  // -  x: 256-bit register holding the TAO_L1 input/output data
    560  // -  y: 256-bit temp register
    561  // -  z: 256-bit temp register
    562  // - xw: 128-bit temp register, must be x's low 128-bit register
    563  // - yw: 128-bit temp register, must be y's low 128-bit register
    564  // - xNibbleMask: 128-bit register holding the nibble mask, loaded beforehand
    565  // - yNibbleMask: 256-bit register holding the nibble mask, loaded beforehand
   566  #define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
   567  	AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask);      \
   568  	VPSHUFB r08_mask<>(SB), x, y;            \ // y = x <<< 8
   569  	VPSHUFB r08_mask<>(SB), y, z;            \ // z = x <<< 16
   570  	VPXOR x, y, y;                           \ // y = x ^ (x <<< 8)
   571  	VPXOR z, y, y;                           \ // y = x ^ (x <<< 8) ^ (x <<< 16)
   572  	VPSHUFB r08_mask<>(SB), z, z;            \ // z = x <<< 24
   573  	VPXOR x, z, x;                           \ // x = x ^ (x <<< 24)
   574  	VPSLLD $2, y, z;                         \
   575  	VPSRLD $30, y, y;                        \
   576  	VPOR z, y, y;                            \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
   577  	VPXOR y, x, x
   578  
    579  // SM4 round function, AVX2 version, handles 256 bits
    580  // t0 ^= tao_l1(t1^t2^t3^xk)
    581  // parameters:
    582  // - index: round key index immediate number
    583  // - RK: round key base address register; IND: round key index offset register
    584  // - x: 256-bit temp register, MUST use XDWORD!
    585  // - y: 256-bit temp register, MUST use YDWORD!
    586  // - xw, yw: low 128-bit registers of x and y; tmp: 256-bit temp register
    587  // - t0: 256-bit data register, receives the result
    588  // - t1, t2, t3: 256-bit data registers
   589  #define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3)  \ 
   590  	VPBROADCASTD (index * 4)(RK)(IND*1), x;                                  \
   591  	VPXOR t1, x, x;                                                          \
   592  	VPXOR t2, x, x;                                                          \
   593  	VPXOR t3, x, x;                                                          \
   594  	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK);          \  
   595  	VPXOR x, t0, t0
   596  
    597  // SM4 round function, AVX2 version, handles 256 bits
    598  // t0 ^= tao_l1(t1^t2^t3^xk)
    599  // parameters:
    600  // - index: round key index immediate number
    601  // - RK: round key base address register
    602  // - x: 256-bit temp register, MUST use XDWORD!
    603  // - y: 256-bit temp register, MUST use YDWORD!
    604  // - xw, yw: low 128-bit registers of x and y; tmp: 256-bit temp register
    605  // - t0: 256-bit data register, receives the result
    606  // - t1, t2, t3: 256-bit data registers
   607  #define AVX2_SM4_ROUND2(index, RK, x, y, xw, yw, tmp, t0, t1, t2, t3)  \ 
   608  	VPBROADCASTD (index * 4)(RK), x;                                  \
   609  	VPXOR t1, x, x;                                                   \
   610  	VPXOR t2, x, x;                                                   \
   611  	VPXOR t3, x, x;                                                   \
   612  	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK);   \  
   613  	VPXOR x, t0, t0
   614  
    615  // SM4 round function, AVX2 version, handles 128 bits
    616  // t0 ^= tao_l1(t1^t2^t3^xk)
    617  // parameters:
    618  // - index: round key index immediate number
    619  // - RK: round key base address register; IND: round key index offset register
    620  // - x: 128-bit temp register
    621  // - y: 128-bit temp register
    622  // - tmp: 128-bit temp register
    623  // - t0: 128-bit data register, receives the result
    624  // - t1, t2, t3: 128-bit data registers
   625  #define AVX2_SM4_ROUND_4BLOCKS(index, RK, IND, x, y, tmp, t0, t1, t2, t3)  \ 
   626  	VPBROADCASTD (index * 4)(RK)(IND*1), x;            \
   627  	VPXOR t1, x, x;                                    \
   628  	VPXOR t2, x, x;                                    \
   629  	VPXOR t3, x, x;                                    \
   630  	AVX_SM4_TAO_L1(x, y, tmp);                         \  
   631  	VPXOR x, t0, t0
   632  
   633  #define AVX2_SM4_8BLOCKS(RK, x, y, xw, yw, tmp, t0, t1, t2, t3)	\
   634  	AVX2_SM4_ROUND2(0, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
   635  	AVX2_SM4_ROUND2(1, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
   636  	AVX2_SM4_ROUND2(2, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
   637  	AVX2_SM4_ROUND2(3, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
   638  	AVX2_SM4_ROUND2(4, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
   639  	AVX2_SM4_ROUND2(5, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
   640  	AVX2_SM4_ROUND2(6, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
   641  	AVX2_SM4_ROUND2(7, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
   642  	AVX2_SM4_ROUND2(8, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
   643  	AVX2_SM4_ROUND2(9, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
   644  	AVX2_SM4_ROUND2(10, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
   645  	AVX2_SM4_ROUND2(11, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
   646  	AVX2_SM4_ROUND2(12, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
   647  	AVX2_SM4_ROUND2(13, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
   648  	AVX2_SM4_ROUND2(14, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
   649  	AVX2_SM4_ROUND2(15, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
   650  	AVX2_SM4_ROUND2(16, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
   651  	AVX2_SM4_ROUND2(17, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
   652  	AVX2_SM4_ROUND2(18, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
   653  	AVX2_SM4_ROUND2(19, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
   654  	AVX2_SM4_ROUND2(20, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
   655  	AVX2_SM4_ROUND2(21, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
   656  	AVX2_SM4_ROUND2(22, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
   657  	AVX2_SM4_ROUND2(23, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
   658  	AVX2_SM4_ROUND2(24, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
   659  	AVX2_SM4_ROUND2(25, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
   660  	AVX2_SM4_ROUND2(26, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
   661  	AVX2_SM4_ROUND2(27, RK, x, y, xw, yw, tmp, t3, t0, t1, t2); \
   662  	AVX2_SM4_ROUND2(28, RK, x, y, xw, yw, tmp, t0, t1, t2, t3); \
   663  	AVX2_SM4_ROUND2(29, RK, x, y, xw, yw, tmp, t1, t2, t3, t0); \
   664  	AVX2_SM4_ROUND2(30, RK, x, y, xw, yw, tmp, t2, t3, t0, t1); \
   665  	AVX2_SM4_ROUND2(31, RK, x, y, xw, yw, tmp, t3, t0, t1, t2)
   666  
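         // The 32 calls above implement all 32 SM4 rounds by rotating which
         // register receives each round's result, so the state words never move
         // between registers; after round 31, t0..t3 hold the last four round
         // outputs in order (t0 the oldest, t3 the newest).
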
    667  // SM4 round function, AVX2 version, handles two groups of 256-bit registers (16 blocks)
    668  // t0 ^= tao_l1(t1^t2^t3^xk); t4 ^= tao_l1(t5^t6^t7^xk)
    669  // parameters:
    670  // - index: round key index immediate number
    671  // - RK: round key base address register
    672  // - x: 256-bit temp register, MUST use XDWORD!
    673  // - y: 256-bit temp register, MUST use YDWORD!
    674  // - xw, yw: low 128-bit registers of x and y
    675  // - tmp, tmp1: 256-bit temp registers
    676  // - t0, t4: 256-bit data registers, receive the results
    677  // - t1, t2, t3: 256-bit data registers (first group)
    678  // - t5, t6, t7: 256-bit data registers (second group)
   679  #define AVX2_SM4_16BLOCKS_ROUND(index, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7)  \ 
   680  	VPBROADCASTD (index * 4)(RK), tmp1;                                         \
   681  	VPXOR t1, tmp1, x;                                                          \
   682  	VPXOR t2, x, x;                                                             \
   683  	VPXOR t3, x, x;                                                             \
   684  	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK);             \  
   685  	VPXOR x, t0, t0;                                                            \
   686  	;\
   687  	VPXOR t5, tmp1, x;                                                          \
   688  	VPXOR t6, x, x;                                                             \
   689  	VPXOR t7, x, x;                                                             \
   690  	AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK);             \  
   691  	VPXOR x, t4, t4;                                                            \
   692  
   693  #define AVX2_SM4_16BLOCKS(RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7)	\
   694  	AVX2_SM4_16BLOCKS_ROUND(0, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7);  \
   695  	AVX2_SM4_16BLOCKS_ROUND(1, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4);  \
   696  	AVX2_SM4_16BLOCKS_ROUND(2, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5);  \
   697  	AVX2_SM4_16BLOCKS_ROUND(3, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6);  \
   698  	AVX2_SM4_16BLOCKS_ROUND(4, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7);  \
   699  	AVX2_SM4_16BLOCKS_ROUND(5, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4);  \
   700  	AVX2_SM4_16BLOCKS_ROUND(6, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5);  \
   701  	AVX2_SM4_16BLOCKS_ROUND(7, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6);  \
   702  	AVX2_SM4_16BLOCKS_ROUND(8, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7);  \
   703  	AVX2_SM4_16BLOCKS_ROUND(9, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4);  \
   704  	AVX2_SM4_16BLOCKS_ROUND(10, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
   705  	AVX2_SM4_16BLOCKS_ROUND(11, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
   706  	AVX2_SM4_16BLOCKS_ROUND(12, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
   707  	AVX2_SM4_16BLOCKS_ROUND(13, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
   708  	AVX2_SM4_16BLOCKS_ROUND(14, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
   709  	AVX2_SM4_16BLOCKS_ROUND(15, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
   710  	AVX2_SM4_16BLOCKS_ROUND(16, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
   711  	AVX2_SM4_16BLOCKS_ROUND(17, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
   712  	AVX2_SM4_16BLOCKS_ROUND(18, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
   713  	AVX2_SM4_16BLOCKS_ROUND(19, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
   714  	AVX2_SM4_16BLOCKS_ROUND(20, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
   715  	AVX2_SM4_16BLOCKS_ROUND(21, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
   716  	AVX2_SM4_16BLOCKS_ROUND(22, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
   717  	AVX2_SM4_16BLOCKS_ROUND(23, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
   718  	AVX2_SM4_16BLOCKS_ROUND(24, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
   719  	AVX2_SM4_16BLOCKS_ROUND(25, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
   720  	AVX2_SM4_16BLOCKS_ROUND(26, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
   721  	AVX2_SM4_16BLOCKS_ROUND(27, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
   722  	AVX2_SM4_16BLOCKS_ROUND(28, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
   723  	AVX2_SM4_16BLOCKS_ROUND(29, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
   724  	AVX2_SM4_16BLOCKS_ROUND(30, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
   725  	AVX2_SM4_16BLOCKS_ROUND(31, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6)