github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/sm4/asm_amd64.s

     1  // This SM4 implementation references https://github.com/mjosaarinen/sm4ni/blob/master/sm4ni.c
     2  #include "textflag.h"
     3  
     4  #define x X0
     5  #define y X1
     6  #define t0 X2
     7  #define t1 X3
     8  #define t2 X4
     9  #define t3 X5
    10  
    11  #define XTMP6 X6
    12  #define XTMP7 X7
    13  
    14  // shuffle byte order from LE to BE
    15  DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
    16  DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
    17  GLOBL flip_mask<>(SB), RODATA, $16
    18  
    19  // shuffle byte and word order
    20  DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
    21  DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
    22  GLOBL bswap_mask<>(SB), RODATA, $16
    23  
    24  // nibble mask
    25  DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
    26  DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
    27  GLOBL nibble_mask<>(SB), RODATA, $16
    28  
    29  // inverse shift rows
    30  DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
    31  DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508 
    32  GLOBL inverse_shift_rows<>(SB), RODATA, $16
    33  
    34  // Affine transform 1 (low and high nibbles)
    35  DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
    36  DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
    37  GLOBL m1_low<>(SB), RODATA, $16
    38  
    39  DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
    40  DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB  
    41  GLOBL m1_high<>(SB), RODATA, $16
    42  
    43  // Affine transform 2 (low and high nibbles)
    44  DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
    45  DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
    46  GLOBL m2_low<>(SB), RODATA, $16
    47  
    48  DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
    49  DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
    50  GLOBL m2_high<>(SB), RODATA, $16
    51  
    52  // left rotations of 32-bit words by 8-bit increments
    53  DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
    54  DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B  
    55  GLOBL r08_mask<>(SB), RODATA, $16
    56  
    57  DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
    58  DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A   
    59  GLOBL r16_mask<>(SB), RODATA, $16
    60  
    61  DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
    62  DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09  
    63  GLOBL r24_mask<>(SB), RODATA, $16
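        // The r08/r16/r24 masks above are PSHUFB index vectors that rotate every
        // 32-bit word of a register left by 8/16/24 bits, so byte-granular rotations
        // cost a single shuffle instead of a shift/shift/or sequence.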
    64  
    65  DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
    66  DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
    67  GLOBL fk_mask<>(SB), RODATA, $16
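        // fk_mask holds the SM4 system parameters FK0..FK3 (A3B1BAC6, 56AA3350,
        // 677D9197, B27022DC) as four little-endian 32-bit words; key expansion
        // starts from MK ^ FK.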
    68  
    69  #define SM4_SBOX(x, y) \
    70    ;                                   \ //#############################  inner affine ############################//
    71    MOVOU x, XTMP6;                     \
    72    PAND nibble_mask<>(SB), XTMP6;      \ //y = _mm_and_si128(x, c0f); 
    73    MOVOU m1_low<>(SB), y;              \
    74    PSHUFB XTMP6, y;                    \ //y = _mm_shuffle_epi8(m1l, y);
    75    PSRLQ $4, x;                        \ //x = _mm_srli_epi64(x, 4); 
    76    PAND nibble_mask<>(SB), x;          \ //x = _mm_and_si128(x, c0f);
    77    MOVOU m1_high<>(SB), XTMP6;         \
    78    PSHUFB x, XTMP6;                    \ //x = _mm_shuffle_epi8(m1h, x);
    79    MOVOU  XTMP6, x;                    \ //x = _mm_shuffle_epi8(m1h, x);
    80    PXOR y, x;                          \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
    81    ;                                   \ // inverse ShiftRows
    82    PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr); 
    83    AESENCLAST nibble_mask<>(SB), x;    \ // AESNI instruction
    84    ;                                   \ //#############################  outer affine ############################//
    85    MOVOU  x, XTMP6;                    \
    86    PANDN nibble_mask<>(SB), XTMP6;     \ //XTMP6 = _mm_andnot_si128(x, c0f);
    87    MOVOU m2_low<>(SB), y;              \ 
    88    PSHUFB XTMP6, y;                    \ //y = _mm_shuffle_epi8(m2l, XTMP6)
    89    PSRLQ $4, x;                        \ //x = _mm_srli_epi64(x, 4);
    90    PAND nibble_mask<>(SB), x;          \ //x = _mm_and_si128(x, c0f); 
    91    MOVOU m2_high<>(SB), XTMP6;         \
    92    PSHUFB x, XTMP6;                    \
    93    MOVOU  XTMP6, x;                    \ //x = _mm_shuffle_epi8(m2h, x)
    94    PXOR y, x;                          \ //x = _mm_shuffle_epi8(m2h, x) ^ y; 
    95  
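        // SM4_SBOX applies the SM4 S-box to all 16 bytes of x in place, following the
        // sm4ni approach referenced above: a per-nibble input affine transform via
        // PSHUFB lookups into m1_low/m1_high, AESENCLAST to reuse the AES S-box core
        // (inverse_shift_rows pre-compensates its ShiftRows step, and the 0x0F nibble
        // mask doubles as its round key), then an output affine transform via
        // m2_low/m2_high. Clobbers y and XTMP6.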
    96  #define SM4_TAO_L1(x, y)         \
    97    SM4_SBOX(x, y);                     \
    98    ;                                   \ //####################  4 parallel L1 linear transforms ##################//
    99    MOVOU x, y;                         \
   100    PSHUFB r08_mask<>(SB), y;           \ //y = _mm_shuffle_epi8(x, r08)
   101    PXOR x, y;                          \ //y = x xor _mm_shuffle_epi8(x, r08)
   102    MOVOU x, XTMP6;                     \
   103    PSHUFB r16_mask<>(SB), XTMP6;       \ 
   104    PXOR XTMP6, y;                      \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
   105    MOVOU y, XTMP6;                     \
   106    PSLLL $2, XTMP6;                    \
   107    PSRLL $30, y;                       \
   108    POR XTMP6, y;                       \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);  
   109    MOVOU x, XTMP7;                     \
   110    PSHUFB r24_mask<>(SB), XTMP7;       \
   111    PXOR y, x;                          \ //x = x xor y
   112    PXOR XTMP7, x                         //x = x xor y xor _mm_shuffle_epi8(x, r24);
   113  
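        // SM4_TAO_L1 is the encryption T transform: SM4_SBOX followed by the linear
        // transform L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24),
        // computed as ((B ^ B<<<8 ^ B<<<16) <<< 2) ^ (B<<<24) ^ B using the rotation
        // masks above. It works on four 32-bit words in parallel; clobbers XTMP7 too.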
   114  #define SM4_TAO_L2(x, y)         \
   115    SM4_SBOX(x, y);                     \
   116    ;                                   \ //####################  4 parallel L2 linear transforms ##################//
   117    MOVOU x, y;                         \
   118    MOVOU x, XTMP6;                     \
   119    PSLLL $13, XTMP6;                   \
   120    PSRLL $19, y;                       \
   121    POR XTMP6, y;                       \ //y = x <<< 13
   122    PSLLL $10, XTMP6;                   \
   123    MOVOU x, XTMP7;                     \
   124    PSRLL $9, XTMP7;                    \
   125    POR XTMP6, XTMP7;                   \ //XTMP7 = x <<< 23
   126    PXOR XTMP7, y;                      \
   127    PXOR y, x                        
   128  
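        // SM4_TAO_L2 is the key-schedule T' transform: SM4_SBOX followed by
        // L'(B) = B ^ (B <<< 13) ^ (B <<< 23), with the rotations built from shift/or
        // pairs because 13 and 23 are not byte multiples.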
   129  #define SM4_ROUND(index, x, y, t0, t1, t2, t3)  \ 
   130    PINSRD $0, (index * 4)(AX)(CX*1), x;           \
   131    PSHUFD $0, x, x;                               \
   132    PXOR t1, x;                                    \
   133    PXOR t2, x;                                    \
   134    PXOR t3, x;                                    \
   135    SM4_TAO_L1(x, y);                              \
   136    PXOR x, t0
   137  
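        // SM4_ROUND is one encryption round over four blocks held column-wise in
        // t0..t3: it loads a round key from (index * 4)(AX)(CX*1), broadcasts it to
        // all four lanes with PSHUFD $0, and computes t0 ^= T(rk ^ t1 ^ t2 ^ t3).
        // SM4_SINGLE_ROUND below is the same without the broadcast, for one block.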
   138  #define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3)  \ 
   139    PINSRD $0, (index * 4)(AX)(CX*1), x;           \
   140    PXOR t1, x;                                    \
   141    PXOR t2, x;                                    \
   142    PXOR t3, x;                                    \
   143    SM4_TAO_L1(x, y);                              \
   144    PXOR x, t0
   145  
   146  #define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
   147    PINSRD $0, (index * 4)(BX)(CX*1), x;                   \
   148    PXOR t1, x;                                            \
   149    PXOR t2, x;                                            \
   150    PXOR t3, x;                                            \
   151    SM4_TAO_L2(x, y);                                      \
   152    PXOR x, t0;                                            \
   153    PEXTRD $0, t0, R8;                                     \
   154    MOVL R8, (index * 4)(DX)(CX*1);                        \
   155    MOVL R8, (12 - index * 4)(DI)(SI*1)
   156  
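        // SM4_EXPANDKEY_ROUND derives one round key rk = k0 ^ T'(ck ^ k1 ^ k2 ^ k3),
        // with ck read from (index * 4)(BX)(CX*1). The result is stored forward into
        // enc via (DX)(CX*1) and mirrored to the end of dec via (DI)(SI*1), since
        // decryption consumes the round keys in reverse order.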
   157  #define XDWORD0 Y4
   158  #define XDWORD1 Y5
   159  #define XDWORD2 Y6
   160  #define XDWORD3 Y7
   161  
   162  #define XWORD0 X4
   163  #define XWORD1 X5
   164  #define XWORD2 X6
   165  #define XWORD3 X7
   166  
   167  #define XDWTMP0 Y0
   168  #define XDWTMP1 Y1
   169  #define XDWTMP2 Y2
   170  
   171  #define XWTMP0 X0
   172  #define XWTMP1 X1
   173  #define XWTMP2 X2
   174  
   175  #define NIBBLE_MASK Y3
   176  #define X_NIBBLE_MASK X3
   177  
   178  #define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
   179  #define X_BYTE_FLIP_MASK 	X13 // mask to convert LE -> BE
   180  
   181  #define XDWORD Y8
   182  #define YDWORD Y9
   183  
   184  #define XWORD X8
   185  #define YWORD X9
   186  
   187  #define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
   188    VPUNPCKHDQ r1, r0, tmp2;                 \ // tmp2 =  [w15, w7, w14, w6, w11, w3, w10, w2]          tmp2 = [w7, w3, w6, w2]
   189    VPUNPCKLDQ r1, r0, r0;                   \ // r0 =    [w13, w5, w12, w4, w9, w1, w8, w0]              r0 = [w5, w1, w4, w0]
   190    VPUNPCKLDQ r3, r2, tmp1;                 \ // tmp1 =  [w29, w21, w28, w20, w25, w17, w24, w16]      tmp1 = [w13, w9, w12, w8]
   191    VPUNPCKHDQ r3, r2, r2;                   \ // r2 =    [w31, w23, w30, w22, w27, w19, w26, w18]        r2 = [w15, w11, w14, w10]
   192    VPUNPCKHQDQ tmp1, r0, r1;                \ // r1 =    [w29, w21, w13, w5, w25, w17, w9, w1]           r1 = [w13, w9, w5, w1]
   193    VPUNPCKLQDQ tmp1, r0, r0;                \ // r0 =    [w28, w20, w12, w4, w24, w16, w8, w0]           r0 = [w12, w8, w4, w0]
   194    VPUNPCKHQDQ r2, tmp2, r3;                \ // r3 =    [w31, w23, w15, w7, w27, w19, w11, w3]          r3 = [w15, w11, w7, w3]
   195    VPUNPCKLQDQ r2, tmp2, r2                   // r2 =    [w30, w22, w14, w6, w26, w18, w10, w2]          r2 = [w14, w10, w6, w2]
   196  
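        // TRANSPOSE_MATRIX treats r0..r3 as a 4x4 matrix of 32-bit words per 128-bit
        // lane and transposes it, so that afterwards register i holds word i of every
        // block in the group; the bracketed comments trace the 8-block (YMM) and
        // 4-block (XMM) cases.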
   197  // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
   198  #define AVX2_SM4_SBOX(x, y) \
   199    VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK; \
   200    VPAND NIBBLE_MASK, x, XDWTMP1;                   \
   201    VBROADCASTI128 m1_low<>(SB), y;                  \
   202    VPSHUFB XDWTMP1, y, y;                           \
   203    VPSRLQ $4, x, x;                                 \
   204    VPAND NIBBLE_MASK, x, x;                         \
   205    VBROADCASTI128 m1_high<>(SB), XDWTMP1;           \
   206    VPSHUFB x, XDWTMP1, x;                           \
   207    VPXOR y, x, x;                                   \
   208    VBROADCASTI128 inverse_shift_rows<>(SB), XDWTMP1;\
   209    VPSHUFB XDWTMP1, x, x;                           \
   210    VEXTRACTI128 $1, x, YWORD;                       \
   211    VAESENCLAST X_NIBBLE_MASK, XWORD, XWORD;         \
   212    VAESENCLAST X_NIBBLE_MASK, YWORD, YWORD;         \
   213    VINSERTI128 $1, YWORD, x, x;                     \
   214    VPANDN NIBBLE_MASK, x, XDWTMP1;                  \
   215    VBROADCASTI128 m2_low<>(SB), y;                  \
   216    VPSHUFB XDWTMP1, y, y;                           \
   217    VPSRLQ $4, x, x;                                 \
   218    VPAND NIBBLE_MASK, x, x;                         \
   219    VBROADCASTI128 m2_high<>(SB), XDWTMP1;           \
   220    VPSHUFB x, XDWTMP1, x;                           \
   221    VPXOR y, x, x
   222  
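        // AVX2_SM4_SBOX is the 256-bit S-box for 8 blocks. AESENCLAST has no 256-bit
        // form, so the high lane of x is extracted into YWORD, both 128-bit halves go
        // through VAESENCLAST, and VINSERTI128 reassembles them; as used here with
        // x = XDWORD (Y8), XWORD (X8) aliases its low lane and is transformed in place.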
   223  #define AVX2_SM4_TAO_L1(x, y) \
   224    AVX2_SM4_SBOX(x, y);                       \
   225    VBROADCASTI128 r08_mask<>(SB), XDWTMP0;    \
   226    VPSHUFB XDWTMP0, x, y;                     \
   227    VPXOR x, y, y;                             \        
   228    VBROADCASTI128 r16_mask<>(SB), XDWTMP0;    \
   229    VPSHUFB XDWTMP0, x, XDWTMP0;               \
   230    VPXOR XDWTMP0, y, y;                       \
   231    VPSLLD $2, y, XDWTMP1;                     \
   232    VPSRLD $30, y, y;                          \
   233    VPXOR XDWTMP1, y, y;                       \
   234    VBROADCASTI128 r24_mask<>(SB), XDWTMP0;    \
   235    VPSHUFB XDWTMP0, x, XDWTMP0;               \
   236    VPXOR y, x, x;                             \
   237    VPXOR x, XDWTMP0, x
   238  
   239  #define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3)  \ 
   240    VPBROADCASTD (index * 4)(AX)(CX*1), x;             \
   241    VPXOR t1, x, x;                                    \
   242    VPXOR t2, x, x;                                    \
   243    VPXOR t3, x, x;                                    \
   244    AVX2_SM4_TAO_L1(x, y);                             \  
   245    VPXOR x, t0, t0
   246  
   247  #define AVX_SM4_SBOX(x, y) \
   248    VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK;          \
   249    VPAND X_NIBBLE_MASK, x, XWTMP1;                    \
   250    VMOVDQU m1_low<>(SB), y;                           \
   251    VPSHUFB XWTMP1, y, y;                              \
   252    VPSRLQ $4, x, x;                                   \
   253    VPAND X_NIBBLE_MASK, x, x;                         \
   254    VMOVDQU m1_high<>(SB), XWTMP1;                     \
   255    VPSHUFB x, XWTMP1, x;                              \
   256    VPXOR y, x, x;                                     \
   257    VMOVDQU inverse_shift_rows<>(SB), XWTMP1;          \
   258    VPSHUFB XWTMP1, x, x;                              \
   259    VAESENCLAST X_NIBBLE_MASK, x, x;                   \
   260    VPANDN X_NIBBLE_MASK, x, XWTMP1;                   \
   261    VMOVDQU m2_low<>(SB), y;                           \
   262    VPSHUFB XWTMP1, y, y;                              \
   263    VPSRLQ $4, x, x;                                   \
   264    VPAND X_NIBBLE_MASK, x, x;                         \
   265    VMOVDQU m2_high<>(SB), XWTMP1;                     \
   266    VPSHUFB x, XWTMP1, x;                              \
   267    VPXOR y, x, x
   268  
   269  #define AVX_SM4_TAO_L1(x, y) \
   270    AVX_SM4_SBOX(x, y);                     \
   271    VMOVDQU r08_mask<>(SB), XWTMP0;         \
   272    VPSHUFB XWTMP0, x, y;                   \
   273    VPXOR x, y, y;                          \        
   274    VMOVDQU r16_mask<>(SB), XWTMP0;         \
   275    VPSHUFB XWTMP0, x, XWTMP0;              \
   276    VPXOR XWTMP0, y, y;                     \
   277    VPSLLD $2, y, XWTMP1;                   \
   278    VPSRLD $30, y, y;                       \
   279    VPXOR XWTMP1, y, y;                     \
   280    VMOVDQU r24_mask<>(SB), XWTMP0;         \
   281    VPSHUFB XWTMP0, x, XWTMP0;              \
   282    VPXOR y, x, x;                          \
   283    VPXOR x, XWTMP0, x
   284  
   285  #define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3)  \ 
   286    VPBROADCASTD (index * 4)(AX)(CX*1), x;             \
   287    VPXOR t1, x, x;                                    \
   288    VPXOR t2, x, x;                                    \
   289    VPXOR t3, x, x;                                    \
   290    AVX_SM4_TAO_L1(x, y);                              \  
   291    VPXOR x, t0, t0
   292  
   293  // func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
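        // Register use: AX = key, BX = ck, DX = enc, DI = dec. CX advances 16 bytes
        // (4 round keys) per loop iteration up to 128, while SI starts at 112 and
        // counts down so the decryption keys are written back to front. The inst
        // argument is not read here.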
   294  TEXT ·expandKeyAsm(SB),NOSPLIT,$0
   295    MOVQ key+0(FP), AX
   296    MOVQ  ck+8(FP), BX
   297    MOVQ  enc+16(FP), DX
   298    MOVQ  dec+24(FP), DI
   299  
   300    MOVUPS 0(AX), t0
   301    PSHUFB flip_mask<>(SB), t0
   302    PXOR fk_mask<>(SB), t0
   303    PSHUFD $1, t0, t1
   304    PSHUFD $2, t0, t2
   305    PSHUFD $3, t0, t3
   306  
   307    XORL CX, CX
   308    MOVL $112, SI
   309  
   310  loop:
   311    SM4_EXPANDKEY_ROUND(0, x, y, t0, t1, t2, t3)
   312    SM4_EXPANDKEY_ROUND(1, x, y, t1, t2, t3, t0)
   313    SM4_EXPANDKEY_ROUND(2, x, y, t2, t3, t0, t1)
   314    SM4_EXPANDKEY_ROUND(3, x, y, t3, t0, t1, t2)
   315  
   316    ADDL $16, CX
   317    SUBL $16, SI
   318    CMPL CX, $4*32
   319    JB loop
   320  
   321  expand_end:  
   322    RET 
   323  
   324  // func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
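        // Register use: AX = xk, BX = dst, DX = src, DI = len(src). When useAVX2 is
        // set, inputs longer than 64 bytes take the 8-block AVX2 path and inputs of
        // up to 64 bytes the 4-block AVX path; otherwise the SSE path below handles
        // 4 blocks, gathering words with PINSRD and fixing up the output word order
        // with the scalar swaps after the loop.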
   325  TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
   326    MOVQ xk+0(FP), AX
   327    MOVQ dst+8(FP), BX
   328    MOVQ src+32(FP), DX
   329    MOVQ src_len+40(FP), DI
   330    
   331    CMPB ·useAVX2(SB), $1
   332    JE   avx2
   333  
   334  non_avx2_start:
   335    PINSRD $0, 0(DX), t0
   336    PINSRD $1, 16(DX), t0
   337    PINSRD $2, 32(DX), t0
   338    PINSRD $3, 48(DX), t0
   339    PSHUFB flip_mask<>(SB), t0
   340  
   341    PINSRD $0, 4(DX), t1
   342    PINSRD $1, 20(DX), t1
   343    PINSRD $2, 36(DX), t1
   344    PINSRD $3, 52(DX), t1
   345    PSHUFB flip_mask<>(SB), t1
   346  
   347    PINSRD $0, 8(DX), t2
   348    PINSRD $1, 24(DX), t2
   349    PINSRD $2, 40(DX), t2
   350    PINSRD $3, 56(DX), t2
   351    PSHUFB flip_mask<>(SB), t2
   352  
   353    PINSRD $0, 12(DX), t3
   354    PINSRD $1, 28(DX), t3
   355    PINSRD $2, 44(DX), t3
   356    PINSRD $3, 60(DX), t3
   357    PSHUFB flip_mask<>(SB), t3
   358  
   359    XORL CX, CX
   360  
   361  loop:
   362    SM4_ROUND(0, x, y, t0, t1, t2, t3)
   363    SM4_ROUND(1, x, y, t1, t2, t3, t0)
   364    SM4_ROUND(2, x, y, t2, t3, t0, t1)
   365    SM4_ROUND(3, x, y, t3, t0, t1, t2)
   366  
   367    ADDL $16, CX
   368    CMPL CX, $4*32
   369    JB loop
   370  
   371    PSHUFB flip_mask<>(SB), t3
   372    PSHUFB flip_mask<>(SB), t2
   373    PSHUFB flip_mask<>(SB), t1
   374    PSHUFB flip_mask<>(SB), t0
   375    MOVUPS t3, 0(BX)
   376    MOVUPS t2, 16(BX)
   377    MOVUPS t1, 32(BX)
   378    MOVUPS t0, 48(BX)
   379    MOVL  4(BX), R8
   380    MOVL  8(BX), R9
   381    MOVL  12(BX), R10
   382    MOVL  16(BX), R11
   383    MOVL  32(BX), R12
   384    MOVL  48(BX), R13
   385    MOVL  R11, 4(BX)
   386    MOVL  R12, 8(BX)
   387    MOVL  R13, 12(BX)
   388    MOVL  R8, 16(BX)
   389    MOVL  R9, 32(BX)
   390    MOVL  R10, 48(BX)
   391    MOVL  24(BX), R8
   392    MOVL  28(BX), R9
   393    MOVL  36(BX), R10
   394    MOVL  52(BX), R11
   395    MOVL  R10, 24(BX)
   396    MOVL  R11, 28(BX)
   397    MOVL  R8, 36(BX)
   398    MOVL  R9, 52(BX)
   399    MOVL  44(BX), R8
   400    MOVL  56(BX), R9
   401    MOVL  R9, 44(BX)
   402    MOVL  R8, 56(BX)
   403  
   404  done_sm4:
   405    RET
   406  
   407  avx2:
   408    CMPQ DI, $64
   409    JBE   avx2_4blocks
   410  
   411  avx2_8blocks:
   412    VMOVDQU 0(DX), XDWORD0
   413    VMOVDQU 32(DX), XDWORD1
   414    VMOVDQU 64(DX), XDWORD2
   415    VMOVDQU 96(DX), XDWORD3
   416    VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
   417  
   418    // Apply Byte Flip Mask: LE -> BE
   419    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
   420    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
   421    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
   422    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
   423  
   424    // Transpose matrix 4 x 4 32bits word
   425    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
   426  
   427    XORL CX, CX
   428  
   429  avx2_loop:
   430    AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   431    AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   432    AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   433    AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   434  
   435    ADDL $16, CX
   436    CMPL CX, $4*32
   437    JB avx2_loop
   438  
   439    // Transpose matrix 4 x 4 32bits word
   440    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
   441  
   442    VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
   443    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
   444    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
   445    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
   446    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
   447    
   448    VMOVDQU XDWORD0, 0(BX)
   449    VMOVDQU XDWORD1, 32(BX)
   450    VMOVDQU XDWORD2, 64(BX)
   451    VMOVDQU XDWORD3, 96(BX)
   452    JMP avx2_sm4_done
   453  
   454  avx2_4blocks:
   455    VMOVDQU 0(DX), XWORD0
   456    VMOVDQU 16(DX), XWORD1
   457    VMOVDQU 32(DX), XWORD2
   458    VMOVDQU 48(DX), XWORD3
   459  
   460    VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
   461  
   462    VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
   463    VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
   464    VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
   465    VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
   466  
   467    // Transpose matrix 4 x 4 32bits word
   468    TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
   469  
   470    XORL CX, CX
   471  
   472  avx_loop:
   473    AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
   474    AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
   475    AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
   476    AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
   477  
   478    ADDL $16, CX
   479    CMPL CX, $4*32
   480    JB avx_loop
   481  
   482    // Transpose matrix 4 x 4 32bits word
   483    TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)
   484  
   485    VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
   486    VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
   487    VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
   488    VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
   489    VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
   490    
   491    VMOVDQU XWORD0, 0(BX)
   492    VMOVDQU XWORD1, 16(BX)
   493    VMOVDQU XWORD2, 32(BX)
   494    VMOVDQU XWORD3, 48(BX)
   495  
   496  avx2_sm4_done:
   497    VZEROUPPER
   498    RET
   499  
   500  // func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
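        // Single-block SSE path: each PINSRD loads one 32-bit word of the block into
        // lane 0 of t0..t3, the 32 rounds run on lane 0 only, and the final reverse
        // step writes the result back as (t3, t2, t1, t0) word by word.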
   501  TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
   502    MOVQ xk+0(FP), AX
   503    MOVQ dst+8(FP), BX
   504    MOVQ src+16(FP), DX
   505    
   506    PINSRD $0, 0(DX), t0
   507    PSHUFB flip_mask<>(SB), t0
   508  
   509    PINSRD $0, 4(DX), t1
   510    PSHUFB flip_mask<>(SB), t1
   511  
   512    PINSRD $0, 8(DX), t2
   513    PSHUFB flip_mask<>(SB), t2
   514  
   515    PINSRD $0, 12(DX), t3
   516    PSHUFB flip_mask<>(SB), t3
   517  
   518    XORL CX, CX
   519  
   520  loop:
   521    SM4_SINGLE_ROUND(0, x, y, t0, t1, t2, t3)
   522    SM4_SINGLE_ROUND(1, x, y, t1, t2, t3, t0)
   523    SM4_SINGLE_ROUND(2, x, y, t2, t3, t0, t1)
   524    SM4_SINGLE_ROUND(3, x, y, t3, t0, t1, t2)
   525  
   526    ADDL $16, CX
   527    CMPL CX, $4*32
   528    JB loop
   529  
   530    PSHUFB flip_mask<>(SB), t3
   531    PSHUFB flip_mask<>(SB), t2
   532    PSHUFB flip_mask<>(SB), t1
   533    PSHUFB flip_mask<>(SB), t0
   534    MOVUPS t3, 0(BX)
   535    PEXTRD $0, t2, R8
   536    MOVL R8, 4(BX)
   537    PEXTRD $0, t1, R8
   538    MOVL R8, 8(BX)
   539    PEXTRD $0, t0, R8
   540    MOVL R8, 12(BX)
   541  done_sm4:
   542    RET