gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/sm4/gcm_amd64.s

      1  // This is an optimized implementation of SM4-GCM using AES-NI (for the SM4 S-box)
      2  // and CLMUL-NI (for GHASH). The implementation uses optimizations described in:
     3  // [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
     4  //     Instruction and its Usage for Computing the GCM Mode rev. 2.02
     5  // [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
     6  //     Hardware
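        //
        // The code runs SM4 in counter mode (four blocks per pass with SSE/AVX, eight with
        // AVX2) and computes GHASH over GF(2^128) with PCLMULQDQ, using Karatsuba
        // multiplication and eight precomputed powers of the hash key H (see gcmSm4Init).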
     7  
     8  #include "textflag.h"
     9  
    10  #define B0 X0
    11  #define B1 X1
    12  #define B2 X2
    13  #define B3 X3
    14  #define B4 X4
    15  #define B5 X5
    16  #define B6 X6
    17  #define B7 X7
    18  
    19  #define DWB0 Y0
    20  #define DWB1 Y2
    21  #define DWB2 Y4
    22  #define DWB3 Y6
    23  
    24  #define XDWORD Y1
    25  #define YDWORD Y3
    26  #define XDWTMP0 Y5
    27  #define XDWTMP1 Y7
    28  
    29  #define ACC0 X8
    30  #define ACC1 X9
    31  #define ACCM X10
    32  
    33  #define T0 X11
    34  #define T1 X12
    35  #define T2 X13
    36  #define POLY X14
    37  #define BSWAP X15
    38  #define DWBSWAP Y15
    39  #define NIBBLE_MASK Y11
    40  #define X_NIBBLE_MASK X11
    41  
    42  // shuffle byte order from LE to BE
    43  DATA flipMask<>+0x00(SB)/8, $0x0405060700010203
    44  DATA flipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
    45  
     46  // nibble mask
    47  DATA nibbleMask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
    48  DATA nibbleMask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
    49  
    50  // inverse shift rows
    51  DATA inverseShiftRows<>+0x00(SB)/8, $0x0B0E0104070A0D00
    52  DATA inverseShiftRows<>+0x08(SB)/8, $0x0306090C0F020508 
    53  
     54  // Affine transform 1 (low and high nibbles)
    55  DATA m1Low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
    56  DATA m1Low<>+0x08(SB)/8, $0x3045F98CEF9A2653
    57  
    58  DATA m1High<>+0x00(SB)/8, $0xC35BF46CAF379800
    59  DATA m1High<>+0x08(SB)/8, $0x68F05FC7049C33AB  
    60  
     61  // Affine transform 2 (low and high nibbles)
    62  DATA m2Low<>+0x00(SB)/8, $0x9A950A05FEF16E61
    63  DATA m2Low<>+0x08(SB)/8, $0x0E019E916A65FAF5
    64  
    65  DATA m2High<>+0x00(SB)/8, $0x892D69CD44E0A400
    66  DATA m2High<>+0x08(SB)/8, $0x2C88CC68E14501A5
    67  
    68  // left rotations of 32-bit words by 8-bit increments
    69  DATA r08Mask<>+0x00(SB)/8, $0x0605040702010003
    70  DATA r08Mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B 
    71  
    72  DATA r16Mask<>+0x00(SB)/8, $0x0504070601000302
    73  DATA r16Mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A   
    74  
    75  DATA r24Mask<>+0x00(SB)/8, $0x0407060500030201
    76  DATA r24Mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09  
    77  
    78  DATA fkMask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
    79  DATA fkMask<>+0x08(SB)/8, $0xb27022dc677d9197
    80  
    81  DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
    82  DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
    83  
    84  DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    85  DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    86  
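        // andMask<> holds byte masks for a trailing partial block: entry i (counting from 0)
        // keeps the low i+1 bytes of a 16-byte value and clears the rest. The tail code
        // indexes it as andMask + length*16 - 16.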
    87  DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    88  DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    89  DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    90  DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    91  DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    92  DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    93  DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    94  DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    95  DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    96  DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    97  DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    98  DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    99  DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
   100  DATA andMask<>+0x68(SB)/8, $0x0000000000000000
   101  DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
   102  DATA andMask<>+0x78(SB)/8, $0x0000000000000000
   103  DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
   104  DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
   105  DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
   106  DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
   107  DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
   108  DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
   109  DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
   110  DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
   111  DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
   112  DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
   113  DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
   114  DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
   115  DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
   116  DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
   117  
   118  GLOBL flipMask<>(SB), (NOPTR+RODATA), $16
   119  GLOBL nibbleMask<>(SB), (NOPTR+RODATA), $16
   120  GLOBL inverseShiftRows<>(SB), (NOPTR+RODATA), $16
   121  GLOBL m1Low<>(SB), (NOPTR+RODATA), $16
   122  GLOBL m1High<>(SB), (NOPTR+RODATA), $16
   123  GLOBL m2Low<>(SB), (NOPTR+RODATA), $16
   124  GLOBL m2High<>(SB), (NOPTR+RODATA), $16
   125  GLOBL r08Mask<>(SB), (NOPTR+RODATA), $16
   126  GLOBL r16Mask<>(SB), (NOPTR+RODATA), $16
   127  GLOBL r24Mask<>(SB), (NOPTR+RODATA), $16
   128  GLOBL fkMask<>(SB), (NOPTR+RODATA), $16
   129  GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
   130  GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
   131  GLOBL andMask<>(SB), (NOPTR+RODATA), $240
   132  
   133  // func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
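        // gcmSm4Finish folds the bit lengths of the plaintext (pLen) and the additional
        // data (dLen) into the GHASH state held in T, performs the final multiplication by
        // H and reduction, and XORs in tagMask to produce the tag, which is written back to T.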
   134  TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
   135  #define pTbl DI
   136  #define tMsk SI
   137  #define tPtr DX
   138  #define plen AX
   139  #define dlen CX
   140  
   141  	MOVQ productTable+0(FP), pTbl
   142  	MOVQ tagMask+8(FP), tMsk
   143  	MOVQ T+16(FP), tPtr
   144  	MOVQ pLen+24(FP), plen
   145  	MOVQ dLen+32(FP), dlen
   146  
   147  	MOVOU (tPtr), ACC0
   148  	MOVOU (tMsk), T2
   149  
   150  	MOVOU bswapMask<>(SB), BSWAP
   151  	MOVOU gcmPoly<>(SB), POLY
   152  
   153  	SHLQ $3, plen
   154  	SHLQ $3, dlen
   155  
   156  	MOVQ plen, B0
   157  	PINSRQ $1, dlen, B0
   158  
   159  	PXOR ACC0, B0
   160  
   161  	MOVOU (16*14)(pTbl), ACC0
   162  	MOVOU (16*15)(pTbl), ACCM
   163  	MOVOU ACC0, ACC1
   164  
   165  	PCLMULQDQ $0x00, B0, ACC0
   166  	PCLMULQDQ $0x11, B0, ACC1
   167  	PSHUFD $78, B0, T0
   168  	PXOR B0, T0
   169  	PCLMULQDQ $0x00, T0, ACCM
   170  
   171  	PXOR ACC0, ACCM
   172  	PXOR ACC1, ACCM
   173  	MOVOU ACCM, T0
   174  	PSRLDQ $8, ACCM
   175  	PSLLDQ $8, T0
   176  	PXOR ACCM, ACC1
   177  	PXOR T0, ACC0
   178  
   179  	MOVOU POLY, T0
   180  	PCLMULQDQ $0x01, ACC0, T0
   181  	PSHUFD $78, ACC0, ACC0
   182  	PXOR T0, ACC0
   183  
   184  	MOVOU POLY, T0
   185  	PCLMULQDQ $0x01, ACC0, T0
   186  	PSHUFD $78, ACC0, ACC0
   187  	PXOR T0, ACC0
   188  
   189  	PXOR ACC1, ACC0
   190  
   191  	PSHUFB BSWAP, ACC0
   192  	PXOR T2, ACC0
   193  	MOVOU ACC0, (tPtr)
   194  
   195  	RET
   196  
   197  #undef pTbl
   198  #undef tMsk
   199  #undef tPtr
   200  #undef plen
   201  #undef dlen
   202  
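        // SM4_SBOX applies the SM4 S-box to every byte of x, clobbering y and z as
        // temporaries. It uses the AES-NI technique: an input affine transform done as two
        // nibble-indexed table lookups (m1Low/m1High), AESENCLAST for the GF(2^8) inversion
        // shared with the AES S-box (the inverseShiftRows shuffle pre-compensates for the
        // ShiftRows step inside AESENCLAST; nibbleMask doubles as its round key), and an
        // output affine transform (m2Low/m2High).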
   203  #define SM4_SBOX(x, y, z) \
   204    ;                                   \ //#############################  inner affine ############################//
   205    MOVOU x, z;                         \
   206    PAND nibbleMask<>(SB), z;           \ //y = _mm_and_si128(x, c0f); 
   207    MOVOU m1Low<>(SB), y;               \
   208    PSHUFB z, y;                        \ //y = _mm_shuffle_epi8(m1l, y);
   209    PSRLQ $4, x;                        \ //x = _mm_srli_epi64(x, 4); 
   210    PAND nibbleMask<>(SB), x;           \ //x = _mm_and_si128(x, c0f);
   211    MOVOU m1High<>(SB), z;              \
   212    PSHUFB x, z;                        \ //x = _mm_shuffle_epi8(m1h, x);
   213    MOVOU  z, x;                        \ //x = _mm_shuffle_epi8(m1h, x);
   214    PXOR y, x;                          \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
   215    ;                                   \ // inverse ShiftRows
   216    PSHUFB inverseShiftRows<>(SB), x;   \ //x = _mm_shuffle_epi8(x, shr); 
   217    AESENCLAST nibbleMask<>(SB), x;     \ // AESNI instruction
   218    ;                                   \ //#############################  outer affine ############################//
   219    MOVOU  x, z;                        \
   220    PANDN nibbleMask<>(SB), z;          \ //z = _mm_andnot_si128(x, c0f);
   221    MOVOU m2Low<>(SB), y;               \ 
   222    PSHUFB z, y;                        \ //y = _mm_shuffle_epi8(m2l, z)
   223    PSRLQ $4, x;                        \ //x = _mm_srli_epi64(x, 4);
   224    PAND nibbleMask<>(SB), x;           \ //x = _mm_and_si128(x, c0f); 
   225    MOVOU m2High<>(SB), z;              \
   226    PSHUFB x, z;                        \
   227    MOVOU  z, x;                        \ //x = _mm_shuffle_epi8(m2h, x)
   228    PXOR y, x                             //x = _mm_shuffle_epi8(m2h, x) ^ y; 
   229  
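        // SM4_TAO_L1 is the full T transform of the encryption rounds: x = L(Sbox(x)),
        // where L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24). The rotations
        // are built from the byte shuffles r08/r16/r24 plus one 2-bit word rotation.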
   230  #define SM4_TAO_L1(x, y, z)         \
   231    SM4_SBOX(x, y, z);                     \
   232    ;                                   \ //####################  4 parallel L1 linear transforms ##################//
   233    MOVOU x, y;                         \
   234    PSHUFB r08Mask<>(SB), y;            \ //y = _mm_shuffle_epi8(x, r08)
   235    PXOR x, y;                          \ //y = x xor _mm_shuffle_epi8(x, r08)
   236    MOVOU x, z;                         \
   237    PSHUFB r16Mask<>(SB), z;            \ 
   238    PXOR z, y;                          \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
   239    MOVOU y, z;                         \
   240    PSLLL $2, z;                        \
   241    PSRLL $30, y;                       \
   242    POR z, y;                           \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);  
   243    MOVOU x, z;                         \
   244    PSHUFB r24Mask<>(SB), z;            \
   245    PXOR y, x;                          \ //x = x xor y
   246    PXOR z, x                             //x = x xor y xor _mm_shuffle_epi8(x, r24);
   247  
   248  #define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3)  \ 
   249    PINSRD $0, (index * 4)(RK)(IND*1), x;             \
   250    PXOR t1, x;                                       \
   251    PXOR t2, x;                                       \
   252    PXOR t3, x;                                       \
   253    SM4_TAO_L1(x, y, z);                              \
   254    PXOR x, t0
   255  
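        // SM4_ROUND performs one SM4 round on four column-sliced blocks: it broadcasts the
        // 32-bit round key at RK+index*4+IND, XORs in the other three state words, applies
        // T and accumulates the result into t0 (SM4_SINGLE_ROUND above is the same but only
        // uses lane 0). For a single block, the equivalent scalar round (a sketch, assuming
        // rk holds the 32 expanded round keys and t is the T transform) is:
        //
        //	for i := 0; i < 32; i++ {
        //		x0, x1, x2, x3 = x1, x2, x3, x0^t(x1^x2^x3^rk[i])
        //	}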
   256  #define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3)  \ 
   257    PINSRD $0, (index * 4)(RK)(IND*1), x;           \
   258    PSHUFD $0, x, x;                                \
   259    PXOR t1, x;                                     \
   260    PXOR t2, x;                                     \
   261    PXOR t3, x;                                     \
   262    SM4_TAO_L1(x, y, z);                            \
   263    PXOR x, t0
   264  
   265  //	MOVOU r0, tmp2;
   266  //	PUNPCKHDQ r1, tmp2;
   267  //	PUNPCKLDQ	r1, r0; 
   268  //	MOVOU r2, tmp1; 
   269  //	PUNPCKLDQ r3, tmp1; 
   270  //	PUNPCKHDQ r3, r2; 
   271  //	MOVOU r0, r1; 
   272  //	PUNPCKHQDQ tmp1, r1; 
   273  //	PUNPCKLQDQ tmp1, r0; 
   274  //	MOVOU tmp2, r3; 
   275  //	PUNPCKHQDQ r2, r3; 
   276  //	PUNPCKLQDQ r2, tmp2; 
   277  //	MOVOU tmp2, r2
   278  #define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \
   279    PEXTRD $2, r0, r; \
   280    PINSRD $0, r, tmp2;  \
   281    PEXTRD $2, r1, r; \
   282    PINSRD $1, r, tmp2;  \  
   283    ; \
   284    PEXTRD $3, r0, r; \
   285    PINSRD $2, r, tmp2;  \
   286    PEXTRD $3, r1, r; \
   287    PINSRD $3, r, tmp2;  \   // tmp2 = [w7, w3, w6, w2]
   288    ; \
   289    PEXTRD $1, r0, r; \
   290    PINSRD $2, r, r0;  \
   291    PEXTRD $0, r1, r; \
   292    PINSRD $1, r, r0;  \
   293    PEXTRD $1, r1, r; \
   294    PINSRD $3, r, r0;  \ //   r0 = [w5, w1, w4, w0] 
   295    ; \
   296    PEXTRD $0, r2, r; \
   297    PINSRD $0, r, tmp1;  \
   298    PEXTRD $0, r3, r; \
   299    PINSRD $1, r, tmp1;  \
   300    PEXTRD $1, r2, r; \
   301    PINSRD $2, r, tmp1;  \
   302    PEXTRD $1, r3, r; \
   303    PINSRD $3, r, tmp1;  \ // tmp1 = [w13, w9, w12, w8]
   304    ; \
   305    PEXTRD $2, r2, r; \
   306    PINSRD $0, r, r2;  \
   307    PEXTRD $2, r3, r; \
   308    PINSRD $1, r, r2;  \
   309    PEXTRD $3, r2, r; \
   310    PINSRD $2, r, r2;  \
   311    PEXTRD $3, r3, r; \
   312    PINSRD $3, r, r2;  \ //   r2 = [w15, w11, w14, w10] 
   313    ; \
   314  	MOVOU r0, r1; \
   315    PEXTRQ $1, r1, r; \
   316    PINSRQ $0, r, r1; \
   317    PEXTRQ $1, tmp1, r; \ 
   318    PINSRQ $1, r, r1; \ //  r1 = [w13, w9, w5, w1]
   319    ; \
   320    PEXTRQ $0, tmp1, r; \ 
   321    PINSRQ $1, r, r0; \ //  r0 = [w12, w8, w4, w0]
   322    ; \
   323  	MOVOU tmp2, r3; \
   324    PEXTRQ $1, r3, r; \
   325    PINSRQ $0, r, r3; \
   326    PEXTRQ $1, r2, r; \
   327    PINSRQ $1, r, r3; \ //  r3 = [w15, w11, w7, w3]
   328    ; \
   329    PEXTRQ $0, r2, r; \
   330    PINSRQ $1, r, r2; \
   331    PEXTRQ $0, tmp2, r; \
   332    PINSRQ $0, r, r2
   333  
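        // SM4_4BLOCKS encrypts the four 128-bit blocks in t0..t3: swap each 32-bit word to
        // big-endian, transpose so every register holds one state word of all four blocks,
        // run the 32 SM4 rounds (8 groups of 4, IND stepping through the round keys),
        // transpose back and apply the full 16-byte bswap, which performs both SM4's final
        // word reversal and the conversion back to byte order (BSWAP must already hold
        // bswapMask<>).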
   334  #define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3)  \ 
   335  	PSHUFB flipMask<>(SB), t0; \
   336  	PSHUFB flipMask<>(SB), t1; \
   337  	PSHUFB flipMask<>(SB), t2; \
   338  	PSHUFB flipMask<>(SB), t3; \
   339  	SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y);          \
   340  	XORL IND, IND;                                            \
   341  	SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3);           \
   342  	SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0);           \
   343  	SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1);           \
   344  	SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2);           \
   345  	ADDL $16, IND;                                            \
   346  	SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3);           \
   347  	SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0);           \
   348  	SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1);           \
   349  	SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2);           \
   350  	ADDL $16, IND;                                            \
   351  	SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3);           \
   352  	SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0);           \
   353  	SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1);           \
   354  	SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2);           \
   355  	ADDL $16, IND;                                            \
   356  	SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3);           \
   357  	SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0);           \
   358  	SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1);           \
   359  	SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2);           \
   360  	ADDL $16, IND;                                            \
   361  	SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3);           \
   362  	SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0);           \
   363  	SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1);           \
   364  	SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2);           \
   365  	ADDL $16, IND;                                            \
   366  	SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3);           \
   367  	SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0);           \
   368  	SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1);           \
   369  	SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2);           \
   370  	ADDL $16, IND;                                            \
   371  	SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3);           \
   372  	SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0);           \
   373  	SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1);           \
   374  	SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2);           \
   375  	ADDL $16, IND;                                            \
   376  	SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3);           \
   377  	SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0);           \
   378  	SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1);           \
   379  	SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2);           \
   380  	SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y);          \
   381  	PSHUFB BSWAP, t3; \
   382  	PSHUFB BSWAP, t2; \
   383  	PSHUFB BSWAP, t1; \
   384  	PSHUFB BSWAP, t0
   385  
   386  #define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
   387    VPUNPCKHDQ r1, r0, tmp2;                 \ // tmp2 =  [w15, w7, w14, w6, w11, w3, w10, w2]          tmp2 = [w7, w3, w6, w2]
   388    VPUNPCKLDQ r1, r0, r0;                   \ // r0 =    [w13, w5, w12, w4, w9, w1, w8, w0]              r0 = [w5, w1, w4, w0]
   389    VPUNPCKLDQ r3, r2, tmp1;                 \ // tmp1 =  [w29, w21, w28, w20, w25, w17, w24, w16]      tmp1 = [w13, w9, w12, w8]
    390    VPUNPCKHDQ r3, r2, r2;                   \ // r2 =    [w31, w23, w30, w22, w27, w19, w26, w18]        r2 = [w15, w11, w14, w10]
   391    VPUNPCKHQDQ tmp1, r0, r1;                \ // r1 =    [w29, w21, w13, w5, w25, w17, w9, w1]           r1 = [w13, w9, w5, w1]
   392    VPUNPCKLQDQ tmp1, r0, r0;                \ // r0 =    [w28, w20, w12, w4, w24, w16, w8, w0]           r0 = [w12, w8, w4, w0]
    393    VPUNPCKHQDQ r2, tmp2, r3;                \ // r3 =    [w31, w23, w15, w7, w27, w19, w11, w3]          r3 = [w15, w11, w7, w3]
   394    VPUNPCKLQDQ r2, tmp2, r2                   // r2 =    [w30, w22, w14, w6, w26, w18, w10, w2]          r2 = [w14, w10, w6, w2]
   395  
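        // AVX2 variant of SM4_SBOX, operating on eight blocks held in the two 128-bit lanes
        // of a YMM register. AESENCLAST has no 256-bit form under AVX2, so the high lane is
        // extracted into yw, both halves go through VAESENCLAST, and the result is inserted
        // back into x.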
   396  #define AVX2_SM4_SBOX(x, y, xw, yw, tmp) \
   397    VPAND NIBBLE_MASK, x, tmp;                       \
   398    VBROADCASTI128 m1Low<>(SB), y;                   \
   399    VPSHUFB tmp, y, y;                               \
   400    VPSRLQ $4, x, x;                                 \
   401    VPAND NIBBLE_MASK, x, x;                         \
   402    VBROADCASTI128 m1High<>(SB), tmp;                \
   403    VPSHUFB x, tmp, x;                               \
   404    VPXOR y, x, x;                                   \
   405    VBROADCASTI128 inverseShiftRows<>(SB), tmp;      \
   406    VPSHUFB tmp, x, x;                               \
    407    VEXTRACTI128 $1, x, yw;                          \
   408    VAESENCLAST X_NIBBLE_MASK, xw, xw;               \
   409    VAESENCLAST X_NIBBLE_MASK, yw, yw;               \
   410    VINSERTI128 $1, yw, x, x;                        \
   411    VPANDN NIBBLE_MASK, x, tmp;                      \
   412    VBROADCASTI128 m2Low<>(SB), y;                   \
   413    VPSHUFB tmp, y, y;                               \
   414    VPSRLQ $4, x, x;                                 \
   415    VPAND NIBBLE_MASK, x, x;                         \
   416    VBROADCASTI128 m2High<>(SB), tmp;                \
   417    VPSHUFB x, tmp, x;                               \
   418    VPXOR y, x, x
   419  
   420  #define AVX2_SM4_TAO_L1(x, y, xw, yw, tmp) \
   421    AVX2_SM4_SBOX(x, y, xw, yw, tmp);          \
   422    VBROADCASTI128 r08Mask<>(SB), tmp;         \
   423    VPSHUFB tmp, x, y;                         \
   424    VPXOR x, y, y;                             \        
   425    VBROADCASTI128 r16Mask<>(SB), tmp;         \
   426    VPSHUFB tmp, x, tmp;                       \
   427    VPXOR tmp, y, y;                           \
   428    VPSLLD $2, y, tmp;                         \
   429    VPSRLD $30, y, y;                          \
   430    VPXOR tmp, y, y;                           \
   431    VBROADCASTI128 r24Mask<>(SB), tmp;         \
   432    VPSHUFB tmp, x, tmp;                       \
   433    VPXOR y, x, x;                             \
   434    VPXOR x, tmp, x
   435  
   436  #define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3)  \ 
   437    VPBROADCASTD (index * 4)(RK)(IND*1), x;            \
   438    VPXOR t1, x, x;                                    \
   439    VPXOR t2, x, x;                                    \
   440    VPXOR t3, x, x;                                    \
   441    AVX2_SM4_TAO_L1(x, y, xw, yw, tmp);         \  
   442    VPXOR x, t0, t0
   443  
   444  #define AVX_SM4_SBOX(x, y, tmp) \
   445    VPAND X_NIBBLE_MASK, x, tmp;                       \
   446    VMOVDQU m1Low<>(SB), y;                            \
   447    VPSHUFB tmp, y, y;                                 \
   448    VPSRLQ $4, x, x;                                   \
   449    VPAND X_NIBBLE_MASK, x, x;                         \
   450    VMOVDQU m1High<>(SB), tmp;                         \
   451    VPSHUFB x, tmp, x;                                 \
   452    VPXOR y, x, x;                                     \
   453    VMOVDQU inverseShiftRows<>(SB), tmp;               \
   454    VPSHUFB tmp, x, x;                                 \
   455    VAESENCLAST X_NIBBLE_MASK, x, x;                   \
   456    VPANDN X_NIBBLE_MASK, x, tmp;                      \
   457    VMOVDQU m2Low<>(SB), y;                            \
   458    VPSHUFB tmp, y, y;                                 \
   459    VPSRLQ $4, x, x;                                   \
   460    VPAND X_NIBBLE_MASK, x, x;                         \
   461    VMOVDQU m2High<>(SB), tmp;                         \
   462    VPSHUFB x, tmp, x;                                 \
   463    VPXOR y, x, x
   464  
   465  #define AVX_SM4_TAO_L1(x, y, tmp) \
   466    AVX_SM4_SBOX(x, y, tmp);                \
   467    VMOVDQU r08Mask<>(SB), tmp;             \
   468    VPSHUFB tmp, x, y;                      \
   469    VPXOR x, y, y;                          \        
   470    VMOVDQU r16Mask<>(SB), tmp;             \
   471    VPSHUFB tmp, x, tmp;                    \
   472    VPXOR tmp, y, y;                        \
   473    VPSLLD $2, y, tmp;                      \
   474    VPSRLD $30, y, y;                       \
   475    VPXOR tmp, y, y;                        \
   476    VMOVDQU r24Mask<>(SB), tmp;             \
   477    VPSHUFB tmp, x, tmp;                    \
   478    VPXOR y, x, x;                          \
   479    VPXOR x, tmp, x
   480  
   481  #define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3)  \ 
   482    VPBROADCASTD (index * 4)(RK)(IND*1), x;                 \
   483    VPXOR t1, x, x;                                         \
   484    VPXOR t2, x, x;                                         \
   485    VPXOR t3, x, x;                                         \
   486    AVX_SM4_TAO_L1(x, y, tmp);                              \  
   487    VPXOR x, t0, t0
   488  
   489  // func gcmSm4Init(productTable *[256]byte, rk []uint32)
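        // gcmSm4Init derives the hash key H by encrypting the all-zero block with the
        // expanded SM4 round keys in rk, then fills productTable with H^1..H^8 and their
        // Karatsuba folds for use by the GHASH code.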
   490  TEXT ·gcmSm4Init(SB),NOSPLIT,$0
   491  #define dst DI
   492  #define RK SI
   493  
   494  	MOVQ productTable+0(FP), dst
   495  	MOVQ rk+8(FP), RK
   496  
   497  	MOVOU gcmPoly<>(SB), POLY
   498  
    499  	// Encrypt the all-zero block with the SM4 round keys to generate the hash key H
   500  	PXOR B0, B0
   501  	PXOR B1, B1
   502  	PXOR B2, B2
   503  	PXOR B3, B3
   504  	XORL CX, CX
   505  
   506  sm4InitEncLoop:
   507  	SM4_SINGLE_ROUND(0, RK, CX, T0, T1, T2, B0, B1, B2, B3)
   508  	SM4_SINGLE_ROUND(1, RK, CX, T0, T1, T2, B1, B2, B3, B0)
   509  	SM4_SINGLE_ROUND(2, RK, CX, T0, T1, T2, B2, B3, B0, B1)
   510  	SM4_SINGLE_ROUND(3, RK, CX, T0, T1, T2, B3, B0, B1, B2)
   511  
   512  	ADDL $16, CX
   513  	CMPL CX, $4*32
   514  	JB sm4InitEncLoop
   515  
   516  	PEXTRD $0, B1, R8
   517  	PINSRD $1, R8, B0
   518  	PEXTRD $0, B2, R8
   519  	PINSRD $2, R8, B0
   520  	PEXTRD $0, B3, R8
   521  	PINSRD $3, R8, B0
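	// B0 now holds H = SM4(0^128) in the byte-reflected form used by the GHASH code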
   522  
    523  	// H * 2: multiply H by x in GF(2^128), reducing by the GCM polynomial
   524  	PSHUFD $0xff, B0, T0
   525  	MOVOU B0, T1
   526  	PSRAL $31, T0
   527  	PAND POLY, T0
   528  	PSRLL $31, T1
   529  	PSLLDQ $4, T1
   530  	PSLLL $1, B0
   531  	PXOR T0, B0
   532  	PXOR T1, B0
   533  	// Karatsuba pre-computations
   534  	MOVOU B0, (16*14)(dst)
   535  	PSHUFD $78, B0, B1
   536  	PXOR B0, B1
   537  	MOVOU B1, (16*15)(dst)
   538  
   539  	MOVOU B0, B2
   540  	MOVOU B1, B3
   541  	// Now prepare powers of H and pre-computations for them
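	// The 256-byte table is filled from the top down: H at offsets 16*14/15, H^2 at
	// 16*12/13, ..., H^8 at 16*0/1; each power is stored next to its Karatsuba fold
	// (high quadword XOR low quadword).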
   542  	MOVQ $7, AX
   543  
   544  initLoop:
   545  		MOVOU B2, T0
   546  		MOVOU B2, T1
   547  		MOVOU B3, T2
   548  		PCLMULQDQ $0x00, B0, T0
   549  		PCLMULQDQ $0x11, B0, T1
   550  		PCLMULQDQ $0x00, B1, T2
   551  
   552  		PXOR T0, T2
   553  		PXOR T1, T2
   554  		MOVOU T2, B4
   555  		PSLLDQ $8, B4
   556  		PSRLDQ $8, T2
   557  		PXOR B4, T0
   558  		PXOR T2, T1
   559  
   560  		MOVOU POLY, B2
   561  		PCLMULQDQ $0x01, T0, B2
   562  		PSHUFD $78, T0, T0
   563  		PXOR B2, T0
   564  		MOVOU POLY, B2
   565  		PCLMULQDQ $0x01, T0, B2
   566  		PSHUFD $78, T0, T0
   567  		PXOR T0, B2
   568  		PXOR T1, B2
   569  
   570  		MOVOU B2, (16*12)(dst)
   571  		PSHUFD $78, B2, B3
   572  		PXOR B2, B3
   573  		MOVOU B3, (16*13)(dst)
   574  
   575  		DECQ AX
   576  		LEAQ (-16*2)(dst), dst
   577  	JNE initLoop
   578  
   579  	RET
   580  
   581  #undef RK
   582  #undef dst
   583  
   584  // func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
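        // gcmSm4Data absorbs the additional authenticated data into the GHASH state at T.
        // It has an aggregated 8-block path (dataOctaLoop), a fast path for 13-byte TLS
        // additional data, a single-block loop, and a byte-by-byte tail for a final partial
        // block.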
   585  TEXT ·gcmSm4Data(SB),NOSPLIT,$0
   586  #define pTbl DI
   587  #define aut SI
   588  #define tPtr CX
   589  #define autLen DX
   590  
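
        // reduceRound folds the GHASH accumulator once using the reduction constant in
        // POLY; two applications plus a final XOR of ACC1 complete the reduction.
        // mulRoundAAD(X, i) multiplies block X by table entry i (H^(8-i)) and adds the low,
        // high and middle Karatsuba products into ACC0, ACC1 and ACCM.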
   591  #define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
   592  #define mulRoundAAD(X ,i) \
   593  	MOVOU (16*(i*2))(pTbl), T1;\
   594  	MOVOU T1, T2;\
   595  	PCLMULQDQ $0x00, X, T1;\
   596  	PXOR T1, ACC0;\
   597  	PCLMULQDQ $0x11, X, T2;\
   598  	PXOR T2, ACC1;\
   599  	PSHUFD $78, X, T1;\
   600  	PXOR T1, X;\
   601  	MOVOU (16*(i*2+1))(pTbl), T1;\
   602  	PCLMULQDQ $0x00, X, T1;\
   603  	PXOR T1, ACCM
   604  
   605  	MOVQ productTable+0(FP), pTbl
   606  	MOVQ data_base+8(FP), aut
   607  	MOVQ data_len+16(FP), autLen
   608  	MOVQ T+32(FP), tPtr
   609  
   610  	//PXOR ACC0, ACC0
   611  	MOVOU (tPtr), ACC0
   612  	MOVOU bswapMask<>(SB), BSWAP
   613  	MOVOU gcmPoly<>(SB), POLY
   614  
   615  	TESTQ autLen, autLen
   616  	JEQ dataBail
   617  
   618  	CMPQ autLen, $13	// optimize the TLS case
   619  	JE dataTLS
   620  	CMPQ autLen, $128
   621  	JB startSinglesLoop
   622  	JMP dataOctaLoop
   623  
   624  dataTLS:
   625  	MOVOU (16*14)(pTbl), T1
   626  	MOVOU (16*15)(pTbl), T2
   627  	PXOR B0, B0
   628  	MOVQ (aut), B0
   629  	PINSRD $2, 8(aut), B0
   630  	PINSRB $12, 12(aut), B0
   631  	XORQ autLen, autLen
   632  	JMP dataMul
   633  
   634  dataOctaLoop:
   635  		CMPQ autLen, $128
   636  		JB startSinglesLoop
   637  		SUBQ $128, autLen
   638  
   639  		MOVOU (16*0)(aut), X0
   640  		MOVOU (16*1)(aut), X1
   641  		MOVOU (16*2)(aut), X2
   642  		MOVOU (16*3)(aut), X3
   643  		MOVOU (16*4)(aut), X4
   644  		MOVOU (16*5)(aut), X5
   645  		MOVOU (16*6)(aut), X6
   646  		MOVOU (16*7)(aut), X7
   647  		LEAQ (16*8)(aut), aut
   648  		PSHUFB BSWAP, X0
   649  		PSHUFB BSWAP, X1
   650  		PSHUFB BSWAP, X2
   651  		PSHUFB BSWAP, X3
   652  		PSHUFB BSWAP, X4
   653  		PSHUFB BSWAP, X5
   654  		PSHUFB BSWAP, X6
   655  		PSHUFB BSWAP, X7
   656  		PXOR ACC0, X0
   657  
   658  		MOVOU (16*0)(pTbl), ACC0
   659  		MOVOU (16*1)(pTbl), ACCM
   660  		MOVOU ACC0, ACC1
   661  		PSHUFD $78, X0, T1
   662  		PXOR X0, T1
   663  		PCLMULQDQ $0x00, X0, ACC0
   664  		PCLMULQDQ $0x11, X0, ACC1
   665  		PCLMULQDQ $0x00, T1, ACCM
   666  
   667  		mulRoundAAD(X1, 1)
   668  		mulRoundAAD(X2, 2)
   669  		mulRoundAAD(X3, 3)
   670  		mulRoundAAD(X4, 4)
   671  		mulRoundAAD(X5, 5)
   672  		mulRoundAAD(X6, 6)
   673  		mulRoundAAD(X7, 7)
   674  
   675  		PXOR ACC0, ACCM
   676  		PXOR ACC1, ACCM
   677  		MOVOU ACCM, T0
   678  		PSRLDQ $8, ACCM
   679  		PSLLDQ $8, T0
   680  		PXOR ACCM, ACC1
   681  		PXOR T0, ACC0
   682  		reduceRound(ACC0)
   683  		reduceRound(ACC0)
   684  		PXOR ACC1, ACC0
   685  	JMP dataOctaLoop
   686  
   687  startSinglesLoop:
   688  	MOVOU (16*14)(pTbl), T1
   689  	MOVOU (16*15)(pTbl), T2
   690  
   691  dataSinglesLoop:
   692  
   693  		CMPQ autLen, $16
   694  		JB dataEnd
   695  		SUBQ $16, autLen
   696  
   697  		MOVOU (aut), B0
   698  dataMul:
   699  		PSHUFB BSWAP, B0
   700  		PXOR ACC0, B0
   701  
   702  		MOVOU T1, ACC0
   703  		MOVOU T2, ACCM
   704  		MOVOU T1, ACC1
   705  
   706  		PSHUFD $78, B0, T0
   707  		PXOR B0, T0
   708  		PCLMULQDQ $0x00, B0, ACC0
   709  		PCLMULQDQ $0x11, B0, ACC1
   710  		PCLMULQDQ $0x00, T0, ACCM
   711  
   712  		PXOR ACC0, ACCM
   713  		PXOR ACC1, ACCM
   714  		MOVOU ACCM, T0
   715  		PSRLDQ $8, ACCM
   716  		PSLLDQ $8, T0
   717  		PXOR ACCM, ACC1
   718  		PXOR T0, ACC0
   719  
   720  		MOVOU POLY, T0
   721  		PCLMULQDQ $0x01, ACC0, T0
   722  		PSHUFD $78, ACC0, ACC0
   723  		PXOR T0, ACC0
   724  
   725  		MOVOU POLY, T0
   726  		PCLMULQDQ $0x01, ACC0, T0
   727  		PSHUFD $78, ACC0, ACC0
   728  		PXOR T0, ACC0
   729  		PXOR ACC1, ACC0
   730  
   731  		LEAQ 16(aut), aut
   732  
   733  	JMP dataSinglesLoop
   734  
   735  dataEnd:
   736  
   737  	TESTQ autLen, autLen
   738  	JEQ dataBail
   739  
   740  	PXOR B0, B0
   741  	LEAQ -1(aut)(autLen*1), aut
   742  
   743  dataLoadLoop:
   744  
   745  		PSLLDQ $1, B0
   746  		PINSRB $0, (aut), B0
   747  
   748  		LEAQ -1(aut), aut
   749  		DECQ autLen
   750  		JNE dataLoadLoop
   751  
   752  	JMP dataMul
   753  
   754  dataBail:
   755  	MOVOU ACC0, (tPtr)
   756  	RET
   757  
   758  #undef pTbl
   759  #undef aut
   760  #undef tPtr
   761  #undef autLen
   762  
   763  
   764  // func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
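        // gcmSm4Enc encrypts src into dst with SM4 in counter mode starting from ctr and
        // accumulates the produced ciphertext into the GHASH state at T. The 256-byte frame
        // holds 8*16 bytes of byte-swapped ciphertext scratch followed by 8*16 bytes of
        // counter blocks; an AVX2 path is taken when ·useAVX2 is set.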
   765  TEXT ·gcmSm4Enc(SB),0,$256-96
   766  #define pTbl DI
   767  #define ctx DX
   768  #define ctrPtr CX
   769  #define ptx SI
   770  #define rk AX
   771  #define tPtr R8
   772  #define ptxLen R9
   773  #define aluCTR R10
   774  #define aluTMP R11
   775  
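        // increment(i) advances the 32-bit big-endian counter in the i-th pre-built counter
        // block at SP+8*16+i*16.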
   776  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
   777  
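        // mulRound(i) multiplies the saved ciphertext block at (16*i)(SP) by H^(8-i) and
        // accumulates into ACC0/ACC1/ACCM (aggregated 8-block GHASH). gcmEncDataStep(B)
        // hashes a single block: byte-swap, XOR the running hash, Karatsuba-multiply by H
        // (T2 must hold the (16*14) table entry) and reduce.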
   778  #define mulRound(i) \
   779  	MOVOU (16*i)(SP), T0;\
   780  	MOVOU (16*(i*2))(pTbl), T1;\
   781  	MOVOU T1, T2;\
   782  	PCLMULQDQ $0x00, T0, T1;\
   783  	PXOR T1, ACC0;\
   784  	PCLMULQDQ $0x11, T0, T2;\
   785  	PXOR T2, ACC1;\
   786  	PSHUFD $78, T0, T1;\
   787  	PXOR T1, T0;\
   788  	MOVOU (16*(i*2+1))(pTbl), T1;\
   789  	PCLMULQDQ $0x00, T0, T1;\
   790  	PXOR T1, ACCM
   791  
   792  #define gcmEncDataStep(B) \
   793  	PSHUFB BSWAP, B; \
   794  	PXOR ACC0, B; \
   795  	MOVOU T2, ACC0; \
   796  	MOVOU T2, ACC1; \
   797  	MOVOU (16*15)(pTbl), ACCM; \
   798  	PSHUFD $78, B, T0; \
   799  	PXOR B, T0; \
   800  	PCLMULQDQ $0x00, B, ACC0; \
   801  	PCLMULQDQ $0x11, B, ACC1; \
   802  	PCLMULQDQ $0x00, T0, ACCM; \
   803  	PXOR ACC0, ACCM; \
   804  	PXOR ACC1, ACCM; \
   805  	MOVOU ACCM, T0; \
   806  	PSRLDQ $8, ACCM; \
   807  	PSLLDQ $8, T0; \
   808  	PXOR ACCM, ACC1; \
   809  	PXOR T0, ACC0; \
   810  	reduceRound(ACC0); \
   811  	reduceRound(ACC0); \
   812  	PXOR ACC1, ACC0
   813  
   814  	MOVQ productTable+0(FP), pTbl
   815  	MOVQ dst+8(FP), ctx
   816  	MOVQ src_base+32(FP), ptx
   817  	MOVQ src_len+40(FP), ptxLen
   818  	MOVQ ctr+56(FP), ctrPtr
   819  	MOVQ T+64(FP), tPtr
   820  	MOVQ rk_base+72(FP), rk
   821  
   822  	CMPB ·useAVX2(SB), $1
   823  	JE   avx2GcmSm4Enc
   824  
   825  	MOVOU bswapMask<>(SB), BSWAP
   826  	MOVOU gcmPoly<>(SB), POLY
   827  
   828  	MOVOU (tPtr), ACC0
   829  	PXOR ACC1, ACC1
   830  	PXOR ACCM, ACCM
   831  	MOVOU (ctrPtr), T0
   832  	MOVL (3*4)(ctrPtr), aluCTR
   833  	
   834  	BSWAPL aluCTR
   835  	MOVOU T0, (8*16 + 0*16)(SP)
   836  	increment(0)
   837  	MOVOU T0, (8*16 + 1*16)(SP)
   838  	increment(1)
   839  	MOVOU T0, (8*16 + 2*16)(SP)
   840  	increment(2)
   841  	MOVOU T0, (8*16 + 3*16)(SP)
   842  	increment(3)
   843  
   844  	CMPQ ptxLen, $128
   845  	JB gcmSm4EncNibbles
   846  	SUBQ $128, ptxLen
   847  
   848  	// We have at least 8 blocks to encrypt, prepare the rest of the counters
   849  	MOVOU T0, (8*16 + 4*16)(SP)
   850  	increment(4)
   851  	MOVOU T0, (8*16 + 5*16)(SP)
   852  	increment(5)
   853  	MOVOU T0, (8*16 + 6*16)(SP)
   854  	increment(6)
   855  	MOVOU T0, (8*16 + 7*16)(SP)
   856  	increment(7)
   857  
   858  	// load 8 ctrs for encryption
   859  	MOVOU (8*16 + 0*16)(SP), B0
   860  	MOVOU (8*16 + 1*16)(SP), B1
   861  	MOVOU (8*16 + 2*16)(SP), B2
   862  	MOVOU (8*16 + 3*16)(SP), B3
   863  	MOVOU (8*16 + 4*16)(SP), B4
   864  	MOVOU (8*16 + 5*16)(SP), B5
   865  	MOVOU (8*16 + 6*16)(SP), B6
   866  	MOVOU (8*16 + 7*16)(SP), B7
   867  
   868  	SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
   869  	increment(0)
   870  	increment(1)
   871  	increment(2)
   872  	increment(3)
   873  	SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
   874  	increment(4)
   875  	increment(5)
   876  	increment(6)
   877  	increment(7)	
   878  
   879  	// XOR plaintext
   880  	MOVOU (16*0)(ptx), T0
   881  	PXOR T0, B0
   882  	MOVOU (16*1)(ptx), T0
   883  	PXOR T0, B1
   884  	MOVOU (16*2)(ptx), T0
   885  	PXOR T0, B2
   886  	MOVOU (16*3)(ptx), T0
   887  	PXOR T0, B3
   888  	MOVOU (16*4)(ptx), T0
   889  	PXOR T0, B4
   890  	MOVOU (16*5)(ptx), T0
   891  	PXOR T0, B5
   892  	MOVOU (16*6)(ptx), T0
   893  	PXOR T0, B6
   894  	MOVOU (16*7)(ptx), T0
   895  	PXOR T0, B7
   896  
   897  	// Store ciphertext
   898  	MOVOU B0, (16*0)(ctx)
   899  	PSHUFB BSWAP, B0
   900  	PXOR ACC0, B0
   901  	MOVOU B1, (16*1)(ctx)
   902  	PSHUFB BSWAP, B1
   903  	MOVOU B2, (16*2)(ctx)
   904  	PSHUFB BSWAP, B2
   905  	MOVOU B3, (16*3)(ctx)
   906  	PSHUFB BSWAP, B3
   907  	MOVOU B4, (16*4)(ctx)
   908  	PSHUFB BSWAP, B4
   909  	MOVOU B5, (16*5)(ctx)
   910  	PSHUFB BSWAP, B5
   911  	MOVOU B6, (16*6)(ctx)
   912  	PSHUFB BSWAP, B6
   913  	MOVOU B7, (16*7)(ctx)
   914  	PSHUFB BSWAP, B7
   915  
   916  	MOVOU B0, (16*0)(SP)
   917  	MOVOU B1, (16*1)(SP)
   918  	MOVOU B2, (16*2)(SP)
   919  	MOVOU B3, (16*3)(SP)
   920  	MOVOU B4, (16*4)(SP)
   921  	MOVOU B5, (16*5)(SP)
   922  	MOVOU B6, (16*6)(SP)
   923  	MOVOU B7, (16*7)(SP)
   924  
   925  	LEAQ 128(ptx), ptx
   926  	LEAQ 128(ctx), ctx
   927  
   928  gcmSm4EncOctetsLoop:
   929  		CMPQ ptxLen, $128
   930  		JB gcmSm4EncOctetsEnd
   931  		SUBQ $128, ptxLen
   932  
   933  		MOVOU (8*16 + 0*16)(SP), B0
   934  		MOVOU (8*16 + 1*16)(SP), B1
   935  		MOVOU (8*16 + 2*16)(SP), B2
   936  		MOVOU (8*16 + 3*16)(SP), B3
   937  		MOVOU (8*16 + 4*16)(SP), B4
   938  		MOVOU (8*16 + 5*16)(SP), B5
   939  		MOVOU (8*16 + 6*16)(SP), B6
   940  		MOVOU (8*16 + 7*16)(SP), B7
   941  
   942  		MOVOU (16*0)(SP), T0
   943  		PSHUFD $78, T0, T1
   944  		PXOR T0, T1
   945  
   946  		MOVOU (16*0)(pTbl), ACC0
   947  		MOVOU (16*1)(pTbl), ACCM
   948  		MOVOU ACC0, ACC1
   949  
   950  		PCLMULQDQ $0x00, T1, ACCM
   951  		PCLMULQDQ $0x00, T0, ACC0
   952  		PCLMULQDQ $0x11, T0, ACC1
   953  
   954  		SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
   955  		mulRound(1)
   956  		increment(0)
   957  		mulRound(2)
   958  		increment(1)
   959  		mulRound(3)
   960  		increment(2)
   961  	 	mulRound(4)
   962  		increment(3)
   963  		SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
   964  		mulRound(5)
   965  		increment(4)
   966  		mulRound(6)
   967  		increment(5)
   968  	 	mulRound(7)
   969  		increment(6)
   970  		increment(7)
   971  		PXOR ACC0, ACCM
   972  		PXOR ACC1, ACCM
   973  		MOVOU ACCM, T0
   974  		PSRLDQ $8, ACCM
   975  		PSLLDQ $8, T0
   976  		PXOR ACCM, ACC1
   977  		PXOR T0, ACC0
   978  
   979  		reduceRound(ACC0)
   980  		reduceRound(ACC0)
   981  		PXOR ACC1, ACC0
   982  		
   983  		MOVOU (16*0)(ptx), T0
   984  		PXOR T0, B0
   985  		MOVOU (16*1)(ptx), T0
   986  		PXOR T0, B1
   987  		MOVOU (16*2)(ptx), T0
   988  		PXOR T0, B2
   989  		MOVOU (16*3)(ptx), T0
   990  		PXOR T0, B3
   991  		MOVOU (16*4)(ptx), T0
   992  		PXOR T0, B4
   993  		MOVOU (16*5)(ptx), T0
   994  		PXOR T0, B5
   995  		MOVOU (16*6)(ptx), T0
   996  		PXOR T0, B6
   997  		MOVOU (16*7)(ptx), T0
   998  		PXOR T0, B7
   999  
  1000  		MOVOU B0, (16*0)(ctx)
  1001  		PSHUFB BSWAP, B0
  1002  		PXOR ACC0, B0
  1003  		MOVOU B1, (16*1)(ctx)
  1004  		PSHUFB BSWAP, B1
  1005  		MOVOU B2, (16*2)(ctx)
  1006  		PSHUFB BSWAP, B2
  1007  		MOVOU B3, (16*3)(ctx)
  1008  		PSHUFB BSWAP, B3
  1009  		MOVOU B4, (16*4)(ctx)
  1010  		PSHUFB BSWAP, B4
  1011  		MOVOU B5, (16*5)(ctx)
  1012  		PSHUFB BSWAP, B5
  1013  		MOVOU B6, (16*6)(ctx)
  1014  		PSHUFB BSWAP, B6
  1015  		MOVOU B7, (16*7)(ctx)
  1016  		PSHUFB BSWAP, B7
  1017  
  1018  		MOVOU B0, (16*0)(SP)
  1019  		MOVOU B1, (16*1)(SP)
  1020  		MOVOU B2, (16*2)(SP)
  1021  		MOVOU B3, (16*3)(SP)
  1022  		MOVOU B4, (16*4)(SP)
  1023  		MOVOU B5, (16*5)(SP)
  1024  		MOVOU B6, (16*6)(SP)
  1025  		MOVOU B7, (16*7)(SP)
  1026  
  1027  		LEAQ 128(ptx), ptx
  1028  		LEAQ 128(ctx), ctx
  1029  
  1030  		JMP gcmSm4EncOctetsLoop
  1031  
  1032  gcmSm4EncOctetsEnd:
  1033  	MOVOU (16*0)(SP), T0
  1034  	MOVOU (16*0)(pTbl), ACC0
  1035  	MOVOU (16*1)(pTbl), ACCM
  1036  	MOVOU ACC0, ACC1
  1037  	PSHUFD $78, T0, T1
  1038  	PXOR T0, T1
  1039  	PCLMULQDQ $0x00, T0, ACC0
  1040  	PCLMULQDQ $0x11, T0, ACC1
  1041  	PCLMULQDQ $0x00, T1, ACCM
  1042  
  1043  	mulRound(1)
  1044  	mulRound(2)
  1045  	mulRound(3)
  1046  	mulRound(4)
  1047  	mulRound(5)
  1048  	mulRound(6)
  1049  	mulRound(7)
  1050  
  1051  	PXOR ACC0, ACCM
  1052  	PXOR ACC1, ACCM
  1053  	MOVOU ACCM, T0
  1054  	PSRLDQ $8, ACCM
  1055  	PSLLDQ $8, T0
  1056  	PXOR ACCM, ACC1
  1057  	PXOR T0, ACC0
  1058  
  1059  	reduceRound(ACC0)
  1060  	reduceRound(ACC0)
  1061  	PXOR ACC1, ACC0
  1062  
  1063  	TESTQ ptxLen, ptxLen
  1064  	JE gcmSm4EncDone
  1065  
  1066  	SUBQ $4, aluCTR
  1067  
  1068  gcmSm4EncNibbles:
  1069  	CMPQ ptxLen, $64
  1070  	JBE gcmSm4EncSingles
  1071  	SUBQ $64, ptxLen
  1072  
  1073  	MOVOU (8*16 + 0*16)(SP), B0
  1074  	MOVOU (8*16 + 1*16)(SP), B1
  1075  	MOVOU (8*16 + 2*16)(SP), B2
  1076  	MOVOU (8*16 + 3*16)(SP), B3
  1077  	
  1078  	SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
  1079  	MOVOU (16*0)(ptx), T0
  1080  	PXOR T0, B0
  1081  	MOVOU (16*1)(ptx), T0
  1082  	PXOR T0, B1
  1083  	MOVOU (16*2)(ptx), T0
  1084  	PXOR T0, B2
  1085  	MOVOU (16*3)(ptx), T0
  1086  	PXOR T0, B3
  1087  
  1088  	MOVOU B0, (16*0)(ctx)
  1089  	MOVOU B1, (16*1)(ctx)
  1090  	MOVOU B2, (16*2)(ctx)
  1091  	MOVOU B3, (16*3)(ctx)
  1092  
  1093  	MOVOU (16*14)(pTbl), T2
  1094  	gcmEncDataStep(B0)
  1095  	gcmEncDataStep(B1)
  1096  	gcmEncDataStep(B2)
  1097  	gcmEncDataStep(B3)
  1098  	increment(0)
  1099  	increment(1)
  1100  	increment(2)
  1101  	increment(3)
  1102  
  1103  	LEAQ 64(ptx), ptx
  1104  	LEAQ 64(ctx), ctx
  1105  
  1106  gcmSm4EncSingles:
  1107  	TESTQ ptxLen, ptxLen
  1108  	JE gcmSm4EncDone
  1109  	MOVOU (8*16 + 0*16)(SP), B0
  1110  	MOVOU (8*16 + 1*16)(SP), B1
  1111  	MOVOU (8*16 + 2*16)(SP), B2
  1112  	MOVOU (8*16 + 3*16)(SP), B3
  1113  	
  1114  	SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3)
  1115  	MOVOU B0, (16*0)(SP)
  1116  	MOVOU B1, (16*1)(SP)
  1117  	MOVOU B2, (16*2)(SP)
  1118  	MOVOU B3, (16*3)(SP)
  1119  
  1120  	MOVOU (16*14)(pTbl), T2
  1121  	MOVQ SP, BP
  1122  
  1123  gcmSm4EncSinglesLoop:
  1124  		CMPQ ptxLen, $16
  1125  		JB gcmSm4EncTail
  1126  		SUBQ $16, ptxLen
  1127  		MOVOU (16*0)(BP), B0
  1128  		MOVOU (ptx), T0
  1129  		PXOR T0, B0
  1130  		MOVOU B0, (ctx)
  1131  		gcmEncDataStep(B0)
  1132  		LEAQ (16*1)(ptx), ptx
  1133  		LEAQ (16*1)(ctx), ctx
  1134  		ADDQ $16, BP
  1135  	JMP gcmSm4EncSinglesLoop		
  1136  
  1137  gcmSm4EncTail:
  1138  	TESTQ ptxLen, ptxLen
  1139  	JE gcmSm4EncDone
  1140  	MOVOU (16*0)(BP), B0
  1141  	MOVOU B0, T0
  1142  
  1143  	LEAQ -1(ptx)(ptxLen*1), ptx
  1144  
  1145  	MOVQ ptxLen, aluTMP
  1146  	SHLQ $4, aluTMP
  1147  
  1148  	LEAQ andMask<>(SB), aluCTR
  1149  	MOVOU -16(aluCTR)(aluTMP*1), T1
  1150  	PXOR B0, B0
  1151  ptxLoadLoop:
  1152  		PSLLDQ $1, B0
  1153  		PINSRB $0, (ptx), B0
  1154  		LEAQ -1(ptx), ptx
  1155  		DECQ ptxLen
  1156  	JNE ptxLoadLoop
  1157  
  1158  	PXOR T0, B0
  1159  	PAND T1, B0
   1160  	MOVOU B0, (ctx)	// a full 16-byte store is assumed safe: the 16-byte tag follows the ciphertext in dst
  1161  	gcmEncDataStep(B0)
  1162  
  1163  gcmSm4EncDone:
  1164  	MOVOU ACC0, (tPtr)
  1165  	RET
  1166  
  1167  avx2GcmSm4Enc:
  1168  	VMOVDQU bswapMask<>(SB), BSWAP
  1169  	VMOVDQU gcmPoly<>(SB), POLY
  1170  
  1171  	VMOVDQU (tPtr), ACC0
  1172  	VPXOR ACC1, ACC1, ACC1
  1173  	VPXOR ACCM, ACCM, ACCM
  1174  	VMOVDQU (ctrPtr), T0
  1175  	MOVL (3*4)(ctrPtr), aluCTR
  1176  	
  1177  	BSWAPL aluCTR
  1178  	VMOVDQU T0, (8*16 + 0*16)(SP)
  1179  	increment(0)
  1180  	VMOVDQU T0, (8*16 + 1*16)(SP)
  1181  	increment(1)
  1182  	VMOVDQU T0, (8*16 + 2*16)(SP)
  1183  	increment(2)
  1184  	VMOVDQU T0, (8*16 + 3*16)(SP)
  1185  	increment(3)
  1186  
  1187  	CMPQ ptxLen, $128
  1188  	JB avx2GcmSm4EncNibbles
  1189  	SUBQ $128, ptxLen
  1190  
  1191  	// We have at least 8 blocks to encrypt, prepare the rest of the counters
  1192  	VMOVDQU T0, (8*16 + 4*16)(SP)
  1193  	increment(4)
  1194  	VMOVDQU T0, (8*16 + 5*16)(SP)
  1195  	increment(5)
  1196  	VMOVDQU T0, (8*16 + 6*16)(SP)
  1197  	increment(6)
  1198  	VMOVDQU T0, (8*16 + 7*16)(SP)
  1199  	increment(7)
  1200  
  1201  	// load 8 ctrs for encryption
  1202  	VMOVDQU (4*32 + 0*32)(SP), DWB0
  1203  	VMOVDQU (4*32 + 1*32)(SP), DWB1
  1204  	VMOVDQU (4*32 + 2*32)(SP), DWB2
  1205  	VMOVDQU (4*32 + 3*32)(SP), DWB3
  1206  
  1207  	VBROADCASTI128 flipMask<>(SB), XDWTMP0
  1208  	// Apply Byte Flip Mask: LE -> BE
  1209  	VPSHUFB XDWTMP0, DWB0, DWB0
  1210  	VPSHUFB XDWTMP0, DWB1, DWB1
  1211  	VPSHUFB XDWTMP0, DWB2, DWB2
  1212  	VPSHUFB XDWTMP0, DWB3, DWB3
  1213  
   1214  	// Transpose the 4x4 matrix of 32-bit words
  1215  	TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
  1216  	XORL BX, BX
  1217  	VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
  1218  
  1219  avx2GcmSm4Enc8Loop1:
  1220  	AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3) 
  1221  	AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0) 
  1222  	AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1) 
  1223  	AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2) 
  1224  
  1225  	ADDL $16, BX
  1226  	CMPL BX, $4*32
  1227  	JB avx2GcmSm4Enc8Loop1
  1228  
   1229  	// Transpose the 4x4 matrix of 32-bit words
  1230  	TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
  1231  
  1232  	VBROADCASTI128 bswapMask<>(SB), DWBSWAP
  1233  	VPSHUFB DWBSWAP, DWB0, DWB0
  1234  	VPSHUFB DWBSWAP, DWB1, DWB1
  1235  	VPSHUFB DWBSWAP, DWB2, DWB2
  1236  	VPSHUFB DWBSWAP, DWB3, DWB3
  1237  
  1238  	increment(0)
  1239  	increment(1)
  1240  	increment(2)
  1241  	increment(3)
  1242  	increment(4)
  1243  	increment(5)
  1244  	increment(6)
  1245  	increment(7)
  1246  
  1247  	// XOR plaintext
  1248  	VMOVDQU (32*0)(ptx), XDWTMP0
  1249  	VPXOR XDWTMP0, DWB0, DWB0
  1250  	VMOVDQU (32*1)(ptx), XDWTMP0
  1251  	VPXOR XDWTMP0, DWB1, DWB1
  1252  	VMOVDQU (32*2)(ptx), XDWTMP0
  1253  	VPXOR XDWTMP0, DWB2, DWB2
  1254  	VMOVDQU (32*3)(ptx), XDWTMP0
  1255  	VPXOR XDWTMP0, DWB3, DWB3
  1256  
  1257  	// Store ciphertext
  1258  	VMOVDQU DWB0, (32*0)(ctx)
  1259  	VPSHUFB DWBSWAP, DWB0, DWB0
  1260  	VMOVDQU DWB1, (32*1)(ctx)
  1261  	VPSHUFB DWBSWAP, DWB1, DWB1
  1262  	VMOVDQU DWB2, (32*2)(ctx)
  1263  	VPSHUFB DWBSWAP, DWB2, DWB2
  1264  	VMOVDQU DWB3, (32*3)(ctx)
  1265  	VPSHUFB DWBSWAP, DWB3, DWB3
  1266  
  1267  	//VPXOR XDWTMP0, XDWTMP0, XDWTMP0
  1268  	//VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0
  1269  	//VPXOR XDWTMP0, DWB0, DWB0
   1270  	PXOR ACC0, B0  // XOR ACC0 into the low lane of DWB0 (B0/X0 aliases Y0); the VEX form VPXOR would zero Y0's upper half
  1271  	VMOVDQU DWB0, (32*0)(SP)
  1272  	VMOVDQU DWB1, (32*1)(SP)
  1273  	VMOVDQU DWB2, (32*2)(SP)
  1274  	VMOVDQU DWB3, (32*3)(SP)
  1275  
  1276  	LEAQ 128(ptx), ptx
  1277  	LEAQ 128(ctx), ctx
  1278  
  1279  avx2GcmSm4EncOctetsLoop:
  1280  		CMPQ ptxLen, $128
  1281  		JB avx2GcmSm4EncOctetsEnd
  1282  		SUBQ $128, ptxLen
  1283  
  1284  		// load 8 ctrs for encryption
  1285  		VMOVDQU (4*32 + 0*32)(SP), DWB0
  1286  		VMOVDQU (4*32 + 1*32)(SP), DWB1
  1287  		VMOVDQU (4*32 + 2*32)(SP), DWB2
  1288  		VMOVDQU (4*32 + 3*32)(SP), DWB3
  1289  
  1290  		VBROADCASTI128 flipMask<>(SB), XDWTMP0
  1291  		// Apply Byte Flip Mask: LE -> BE
  1292  		VPSHUFB XDWTMP0, DWB0, DWB0
  1293  		VPSHUFB XDWTMP0, DWB1, DWB1
  1294  		VPSHUFB XDWTMP0, DWB2, DWB2
  1295  		VPSHUFB XDWTMP0, DWB3, DWB3
  1296  
  1297  		VMOVDQU (16*0)(SP), T0
  1298  		VPSHUFD $78, T0, T1
  1299  		VPXOR T0, T1, T1
  1300  
  1301  		VMOVDQU (16*0)(pTbl), ACC0
  1302  		VMOVDQU (16*1)(pTbl), ACCM
  1303  		VMOVDQU ACC0, ACC1
  1304  
  1305  		PCLMULQDQ $0x00, T1, ACCM
  1306  		PCLMULQDQ $0x00, T0, ACC0
  1307  		PCLMULQDQ $0x11, T0, ACC1
  1308  
   1309  		// Transpose the 4x4 matrix of 32-bit words
  1310  		TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
  1311  		XORL BX, BX
  1312  		VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
  1313  
  1314  avx2GcmSm4Enc8Loop2:
  1315  			AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3) 
  1316  			AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0) 
  1317  			AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1) 
  1318  			AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2) 
  1319  
  1320    		ADDL $16, BX
  1321    		CMPL BX, $4*32
  1322  		JB avx2GcmSm4Enc8Loop2
  1323  
   1324  		// Transpose the 4x4 matrix of 32-bit words
  1325  		TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
  1326  
  1327  		VBROADCASTI128 bswapMask<>(SB), DWBSWAP
  1328  		VPSHUFB DWBSWAP, DWB0, DWB0
  1329  		VPSHUFB DWBSWAP, DWB1, DWB1
  1330  		VPSHUFB DWBSWAP, DWB2, DWB2
  1331  		VPSHUFB DWBSWAP, DWB3, DWB3
  1332  
  1333  		mulRound(1)
  1334  		increment(0)
  1335  		mulRound(2)
  1336  		increment(1)
  1337  		mulRound(3)
  1338  		increment(2)
  1339  	 	mulRound(4)
  1340  		increment(3)
  1341  		mulRound(5)
  1342  		increment(4)
  1343  		mulRound(6)
  1344  		increment(5)
  1345  	 	mulRound(7)
  1346  		increment(6)
  1347  		increment(7)
  1348  		VPXOR ACC0, ACCM, ACCM
  1349  		VPXOR ACC1, ACCM, ACCM
  1350  		VPSLLDQ $8, ACCM, T0
  1351  		VPSRLDQ $8, ACCM, ACCM
  1352  		
  1353  		VPXOR ACCM, ACC1, ACC1
  1354  		VPXOR T0, ACC0, ACC0
  1355  
  1356  		reduceRound(ACC0)
  1357  		reduceRound(ACC0)
  1358  		VPXOR ACC1, ACC0, ACC0
  1359  
  1360  		// XOR plaintext
  1361  		VMOVDQU (32*0)(ptx), XDWTMP0
  1362  		VPXOR XDWTMP0, DWB0, DWB0
  1363  		VMOVDQU (32*1)(ptx), XDWTMP0
  1364  		VPXOR XDWTMP0, DWB1, DWB1
  1365  		VMOVDQU (32*2)(ptx), XDWTMP0
  1366  		VPXOR XDWTMP0, DWB2, DWB2
  1367  		VMOVDQU (32*3)(ptx), XDWTMP0
  1368  		VPXOR XDWTMP0, DWB3, DWB3
  1369  
  1370  		// Store ciphertext
  1371  		VMOVDQU DWB0, (32*0)(ctx)
  1372  		VPSHUFB DWBSWAP, DWB0, DWB0
  1373  		VMOVDQU DWB1, (32*1)(ctx)
  1374  		VPSHUFB DWBSWAP, DWB1, DWB1
  1375  		VMOVDQU DWB2, (32*2)(ctx)
  1376  		VPSHUFB DWBSWAP, DWB2, DWB2
  1377  		VMOVDQU DWB3, (32*3)(ctx)
  1378  		VPSHUFB DWBSWAP, DWB3, DWB3
  1379  
  1380  		//VPXOR XDWTMP0, XDWTMP0, XDWTMP0
  1381  		//VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0
  1382  		//VPXOR XDWTMP0, DWB0, DWB0
   1383  		PXOR ACC0, B0  // XOR ACC0 into the low lane of DWB0 (B0/X0 aliases Y0); the VEX form VPXOR would zero Y0's upper half
  1384  		VMOVDQU DWB0, (32*0)(SP)
  1385  		VMOVDQU DWB1, (32*1)(SP)
  1386  		VMOVDQU DWB2, (32*2)(SP)
  1387  		VMOVDQU DWB3, (32*3)(SP)
  1388  
  1389  		LEAQ 128(ptx), ptx
  1390  		LEAQ 128(ctx), ctx
  1391  
  1392  		JMP avx2GcmSm4EncOctetsLoop
  1393  
  1394  avx2GcmSm4EncOctetsEnd:
  1395  	VMOVDQU (16*0)(SP), T0
  1396  	VMOVDQU (16*0)(pTbl), ACC0
  1397  	VMOVDQU (16*1)(pTbl), ACCM
  1398  	VMOVDQU ACC0, ACC1
  1399  	VPSHUFD $78, T0, T1
  1400  	VPXOR T0, T1, T1
  1401  	PCLMULQDQ $0x00, T0, ACC0
  1402  	PCLMULQDQ $0x11, T0, ACC1
  1403  	PCLMULQDQ $0x00, T1, ACCM
  1404  
  1405  	mulRound(1)
  1406  	mulRound(2)
  1407  	mulRound(3)
  1408  	mulRound(4)
  1409  	mulRound(5)
  1410  	mulRound(6)
  1411  	mulRound(7)
  1412  
  1413  	VPXOR ACC0, ACCM, ACCM
  1414  	VPXOR ACC1, ACCM, ACCM
  1415  	VPSLLDQ $8, ACCM, T0
  1416  	VPSRLDQ $8, ACCM, ACCM
  1417  	
  1418  	VPXOR ACCM, ACC1, ACC1
  1419  	VPXOR T0, ACC0, ACC0
  1420  
  1421  	reduceRound(ACC0)
  1422  	reduceRound(ACC0)
  1423  	VPXOR ACC1, ACC0, ACC0
  1424  
  1425  	TESTQ ptxLen, ptxLen
  1426  	JE avx2GcmSm4EncDone
  1427  
  1428  	SUBQ $4, aluCTR
  1429  
  1430  avx2GcmSm4EncNibbles:
  1431  	VMOVDQU flipMask<>(SB), B7
  1432  	CMPQ ptxLen, $64
  1433  	JBE avx2GcmSm4EncSingles
  1434  	SUBQ $64, ptxLen
  1435  
  1436  	VMOVDQU (8*16 + 0*16)(SP), B0
  1437  	VMOVDQU (8*16 + 1*16)(SP), B1
  1438  	VMOVDQU (8*16 + 2*16)(SP), B2
  1439  	VMOVDQU (8*16 + 3*16)(SP), B3
  1440  	
  1441  	VPSHUFB B7, B0, B0
  1442  	VPSHUFB B7, B1, B1
  1443  	VPSHUFB B7, B2, B2
  1444  	VPSHUFB B7, B3, B3
  1445  
  1446  	TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
  1447  	XORL BX, BX	
  1448  	VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
  1449  
  1450  avx2GcmSm4Enc4Loop2:
  1451  	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
  1452  	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
  1453  	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
  1454  	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
  1455  
  1456  	ADDL $16, BX
  1457  	CMPL BX, $4*32
  1458  	JB avx2GcmSm4Enc4Loop2
  1459  
   1460  	// Transpose the 4x4 matrix of 32-bit words
  1461  	TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
  1462  	VPSHUFB BSWAP, B0, B0
  1463  	VPSHUFB BSWAP, B1, B1
  1464  	VPSHUFB BSWAP, B2, B2
  1465  	VPSHUFB BSWAP, B3, B3
  1466  
  1467  	VMOVDQU (16*0)(ptx), T0
  1468  	VPXOR T0, B0, B0
  1469  	VMOVDQU (16*1)(ptx), T0
  1470  	VPXOR T0, B1, B1
  1471  	VMOVDQU (16*2)(ptx), T0
  1472  	VPXOR T0, B2, B2
  1473  	VMOVDQU (16*3)(ptx), T0
  1474  	VPXOR T0, B3, B3
  1475  
  1476  	VMOVDQU B0, (16*0)(ctx)
  1477  	VMOVDQU B1, (16*1)(ctx)
  1478  	VMOVDQU B2, (16*2)(ctx)
  1479  	VMOVDQU B3, (16*3)(ctx)
  1480  
  1481  	VMOVDQU (16*14)(pTbl), T2
  1482  	gcmEncDataStep(B0)
  1483  	gcmEncDataStep(B1)
  1484  	gcmEncDataStep(B2)
  1485  	gcmEncDataStep(B3)
  1486  	increment(0)
  1487  	increment(1)
  1488  	increment(2)
  1489  	increment(3)
  1490  
  1491  	LEAQ 64(ptx), ptx
  1492  	LEAQ 64(ctx), ctx
  1493  
  1494  avx2GcmSm4EncSingles:
  1495  	TESTQ ptxLen, ptxLen
  1496  	JE avx2GcmSm4EncDone
  1497  
  1498  	VMOVDQU (8*16 + 0*16)(SP), B0
  1499  	VMOVDQU (8*16 + 1*16)(SP), B1
  1500  	VMOVDQU (8*16 + 2*16)(SP), B2
  1501  	VMOVDQU (8*16 + 3*16)(SP), B3
  1502  
  1503  	VPSHUFB B7, B0, B0
  1504  	VPSHUFB B7, B1, B1
  1505  	VPSHUFB B7, B2, B2
  1506  	VPSHUFB B7, B3, B3
  1507  
  1508  	TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
  1509  	XORL BX, BX
  1510  	VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
  1511  
  1512  avx2GcmSm4Enc4Loop1:
  1513  	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
  1514  	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
  1515  	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
  1516  	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
  1517  
  1518  	ADDL $16, BX
  1519  	CMPL BX, $4*32
  1520  	JB avx2GcmSm4Enc4Loop1
  1521  
   1522  	// Transpose the 4x4 matrix of 32-bit words
  1523  	TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
  1524  	VPSHUFB BSWAP, B0, B0
  1525  	VPSHUFB BSWAP, B1, B1
  1526  	VPSHUFB BSWAP, B2, B2
  1527  	VPSHUFB BSWAP, B3, B3
  1528  
  1529  	VMOVDQU B0, (16*0)(SP)
  1530  	VMOVDQU B1, (16*1)(SP)
  1531  	VMOVDQU B2, (16*2)(SP)
  1532  	VMOVDQU B3, (16*3)(SP)
  1533  
  1534  	VMOVDQU (16*14)(pTbl), T2
  1535  	MOVQ SP, BP
  1536  
  1537  avx2GcmSm4EncSinglesLoop:
  1538  		CMPQ ptxLen, $16
  1539  		JB avx2GcmSm4EncTail
  1540  		SUBQ $16, ptxLen
  1541  		VMOVDQU (16*0)(BP), B0
  1542  		VMOVDQU (ptx), T0
  1543  		VPXOR T0, B0, B0
  1544  		VMOVDQU B0, (ctx)
  1545  		gcmEncDataStep(B0)
  1546  		LEAQ (16*1)(ptx), ptx
  1547  		LEAQ (16*1)(ctx), ctx
  1548  		ADDQ $16, BP
  1549  	JMP avx2GcmSm4EncSinglesLoop
  1550  
  1551  avx2GcmSm4EncTail:
  1552  	TESTQ ptxLen, ptxLen
  1553  	JE avx2GcmSm4EncDone
  1554  	VMOVDQU (16*0)(BP), B0
  1555  	VMOVDQU B0, T0
  1556  
  1557  	LEAQ -1(ptx)(ptxLen*1), ptx
  1558  
  1559  	MOVQ ptxLen, aluTMP
  1560  	SHLQ $4, aluTMP
  1561  
  1562  	LEAQ andMask<>(SB), aluCTR
  1563  	VMOVDQU -16(aluCTR)(aluTMP*1), T1
  1564  	VPXOR B0, B0, B0
  1565  
  1566  avx2PtxLoadLoop:
  1567  		PSLLDQ $1, B0
  1568  		PINSRB $0, (ptx), B0
  1569  		LEAQ -1(ptx), ptx
  1570  		DECQ ptxLen
  1571  	JNE avx2PtxLoadLoop
  1572  
  1573  	VPXOR T0, B0, B0
  1574  	VPAND T1, B0, B0
   1575  	VMOVDQU B0, (ctx)	// a full 16-byte store is assumed safe: the 16-byte tag follows the ciphertext in dst
  1576  	gcmEncDataStep(B0)
  1577  
  1578  avx2GcmSm4EncDone:
  1579  	VMOVDQU ACC0, (tPtr)
  1580  	VZEROUPPER
  1581  	RET
  1582  
  1583  #undef increment
  1584  
  1585  // func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
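        // gcmSm4Dec decrypts src into dst and accumulates the ciphertext into the GHASH
        // state at T. The 128-byte frame holds only the eight counter blocks, so increment
        // is redefined below to write at i*16(SP); ciphertext is hashed directly from src.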
  1586  TEXT ·gcmSm4Dec(SB),0,$128-96
  1587  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
  1588  
  1589  #define decMulRound(i) \
  1590  	MOVOU (16*i)(ctx), T0;\
  1591  	PSHUFB BSWAP, T0;\
  1592  	internalDecMulRound(i)
  1593  
  1594  #define internalDecMulRound(i) \
  1595  	MOVOU (16*(i*2))(pTbl), T1;\
  1596  	MOVOU T1, T2;\
  1597  	PCLMULQDQ $0x00, T0, T1;\
  1598  	PXOR T1, ACC0;\
  1599  	PSHUFD $78, T0, T1;\
  1600  	PCLMULQDQ $0x11, T0, T2;\
  1601  	PXOR T1, T0;\
  1602  	PXOR T2, ACC1;\
  1603  	MOVOU (16*(i*2+1))(pTbl), T2;\
  1604  	PCLMULQDQ $0x00, T2, T0;\
  1605  	PXOR T0, ACCM
  1606  
  1607  #define decGhashRound(i) \
  1608  		MOVOU (16*i)(ctx), B0; \
  1609  		internalDecGhashRound()
  1610  
  1611  #define internalDecGhashRound() \
  1612  		PSHUFB BSWAP, B0; \
  1613  		PXOR ACC0, B0; \
  1614  		MOVOU T2, ACC0; \
  1615  		MOVOU T2, ACC1; \
  1616  		MOVOU (16*15)(pTbl), ACCM; \
  1617  		PCLMULQDQ $0x00, B0, ACC0; \
  1618  		PCLMULQDQ $0x11, B0, ACC1; \
  1619  		PSHUFD $78, B0, T0; \
  1620  		PXOR B0, T0; \
  1621  		PCLMULQDQ $0x00, T0, ACCM; \
  1622  		PXOR ACC0, ACCM; \
  1623  		PXOR ACC1, ACCM; \
  1624  		MOVOU ACCM, T0; \
  1625  		PSRLDQ $8, ACCM; \
  1626  		PSLLDQ $8, T0; \
  1627  		PXOR ACCM, ACC1; \
  1628  		PXOR T0, ACC0; \
  1629  		reduceRound(ACC0); \
  1630  		reduceRound(ACC0); \
  1631  		PXOR ACC1, ACC0
  1632  
  1633  	MOVQ productTable+0(FP), pTbl
  1634  	MOVQ dst+8(FP), ptx
  1635  	MOVQ src_base+32(FP), ctx
  1636  	MOVQ src_len+40(FP), ptxLen
  1637  	MOVQ ctr+56(FP), ctrPtr
  1638  	MOVQ T+64(FP), tPtr
  1639  	MOVQ rk_base+72(FP), rk
  1640  
  1641  	CMPB ·useAVX2(SB), $1
  1642  	JE   avx2GcmSm4Dec
  1643  
  1644  	MOVOU bswapMask<>(SB), BSWAP
  1645  	MOVOU gcmPoly<>(SB), POLY
  1646  
  1647  	MOVOU (tPtr), ACC0
  1648  	PXOR ACC1, ACC1
  1649  	PXOR ACCM, ACCM
  1650  	MOVOU (ctrPtr), T0
  1651  	MOVL (3*4)(ctrPtr), aluCTR
  1652  	BSWAPL aluCTR
  1653  
  1654  	MOVOU T0, (0*16)(SP)
  1655  	increment(0)
  1656  	MOVOU T0, (1*16)(SP)
  1657  	increment(1)
  1658  	MOVOU T0, (2*16)(SP)
  1659  	increment(2)
  1660  	MOVOU T0, (3*16)(SP)
  1661  	increment(3)
  1662  
  1663  	CMPQ ptxLen, $128
  1664  	JB gcmSm4DecNibbles
  1665  
   1666  	// We have at least 8 blocks to decrypt, prepare the rest of the counters
  1667  	MOVOU T0, (4*16)(SP)
  1668  	increment(4)
  1669  	MOVOU T0, (5*16)(SP)
  1670  	increment(5)
  1671  	MOVOU T0, (6*16)(SP)
  1672  	increment(6)
  1673  	MOVOU T0, (7*16)(SP)
  1674  	increment(7)
  1675  
  1676  gcmSm4DecOctetsLoop:
  1677  		CMPQ ptxLen, $128
  1678  		JB gcmSm4DecEndOctets
  1679  		SUBQ $128, ptxLen
  1680  
  1681  		MOVOU (0*16)(SP), B0
  1682  		MOVOU (1*16)(SP), B1
  1683  		MOVOU (2*16)(SP), B2
  1684  		MOVOU (3*16)(SP), B3
  1685  		MOVOU (4*16)(SP), B4
  1686  		MOVOU (5*16)(SP), B5
  1687  		MOVOU (6*16)(SP), B6
  1688  		MOVOU (7*16)(SP), B7
  1689  
  1690  		MOVOU (16*0)(ctx), T0
  1691  		PSHUFB BSWAP, T0
  1692  		PXOR ACC0, T0
  1693  		PSHUFD $78, T0, T1
  1694  		PXOR T0, T1
  1695  
  1696  		MOVOU (16*0)(pTbl), ACC0
  1697  		MOVOU (16*1)(pTbl), ACCM
  1698  		MOVOU ACC0, ACC1
  1699  
  1700  		PCLMULQDQ $0x00, T1, ACCM
  1701  		PCLMULQDQ $0x00, T0, ACC0
  1702  		PCLMULQDQ $0x11, T0, ACC1
  1703  
  1704  		SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
  1705  		decMulRound(1)
  1706  		increment(0)
  1707  		decMulRound(2)
  1708  		increment(1)
  1709  		decMulRound(3)
  1710  		increment(2)
  1711  		decMulRound(4)
  1712  		increment(3)
  1713  		SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
  1714  		decMulRound(5)
  1715  		increment(4)
  1716  		decMulRound(6)
  1717  		increment(5)
  1718  		decMulRound(7)
  1719  		increment(6)
  1720  		increment(7)
  1721  
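		// Combine the three accumulators (Karatsuba fix-up) and reduce the
		// 256-bit product in ACC1:ACC0 modulo the GHASH polynomial.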
  1722  		PXOR ACC0, ACCM
  1723  		PXOR ACC1, ACCM
  1724  		MOVOU ACCM, T0
  1725  		PSRLDQ $8, ACCM
  1726  		PSLLDQ $8, T0
  1727  		PXOR ACCM, ACC1
  1728  		PXOR T0, ACC0
  1729  
  1730  		reduceRound(ACC0)
  1731  		reduceRound(ACC0)
  1732  		PXOR ACC1, ACC0
  1733  
  1734  		MOVOU (16*0)(ctx), T0
  1735  		PXOR T0, B0
  1736  		MOVOU (16*1)(ctx), T0
  1737  		PXOR T0, B1
  1738  		MOVOU (16*2)(ctx), T0
  1739  		PXOR T0, B2
  1740  		MOVOU (16*3)(ctx), T0
  1741  		PXOR T0, B3
  1742  		MOVOU (16*4)(ctx), T0
  1743  		PXOR T0, B4
  1744  		MOVOU (16*5)(ctx), T0
  1745  		PXOR T0, B5
  1746  		MOVOU (16*6)(ctx), T0
  1747  		PXOR T0, B6
  1748  		MOVOU (16*7)(ctx), T0
  1749  		PXOR T0, B7
  1750  
  1751  		MOVOU B0, (16*0)(ptx)
  1752  		MOVOU B1, (16*1)(ptx)
  1753  		MOVOU B2, (16*2)(ptx)
  1754  		MOVOU B3, (16*3)(ptx)
  1755  		MOVOU B4, (16*4)(ptx)
  1756  		MOVOU B5, (16*5)(ptx)
  1757  		MOVOU B6, (16*6)(ptx)
  1758  		MOVOU B7, (16*7)(ptx)
  1759  
  1760  		LEAQ 128(ptx), ptx
  1761  		LEAQ 128(ctx), ctx
  1762  
  1763  		JMP gcmSm4DecOctetsLoop
  1764  
  1765  gcmSm4DecEndOctets:
  1766  	SUBQ $4, aluCTR
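	// The octets loop keeps eight counter blocks in flight, but only blocks 0-3
	// on the stack are reused below; rewind aluCTR by four so the following
	// increments regenerate the counters that were prepared in blocks 4-7 but
	// never consumed.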
  1767  
  1768  gcmSm4DecNibbles:
  1769  	CMPQ ptxLen, $64
  1770  	JBE gcmSm4DecSingles
  1771  	SUBQ $64, ptxLen
  1772  
  1773  	MOVOU (0*16)(SP), B4
  1774  	MOVOU (1*16)(SP), B5
  1775  	MOVOU (2*16)(SP), B6
  1776  	MOVOU (3*16)(SP), B7
  1777  
  1778  	SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7)
  1779  	MOVOU (16*14)(pTbl), T2
  1780  	MOVOU (16*0)(ctx), T0
  1781  	PXOR T0, B4
  1782  	MOVOU (16*1)(ctx), T0
  1783  	PXOR T0, B5
  1784  	MOVOU (16*2)(ctx), T0
  1785  	PXOR T0, B6
  1786  	MOVOU (16*3)(ctx), T0
  1787  	PXOR T0, B7
  1788  
  1789  	decGhashRound(0)
  1790  	increment(0)
  1791  	decGhashRound(1)
  1792  	increment(1)
  1793  	decGhashRound(2)
  1794  	increment(2)
  1795  	decGhashRound(3)
  1796  	increment(3)
  1797  
  1798  	MOVOU B4, (16*0)(ptx)
  1799  	MOVOU B5, (16*1)(ptx)
  1800  	MOVOU B6, (16*2)(ptx)
  1801  	MOVOU B7, (16*3)(ptx)
  1802  
  1803  	LEAQ 64(ptx), ptx
  1804  	LEAQ 64(ctx), ctx
  1805  
  1806  gcmSm4DecSingles:
  1807  	TESTQ ptxLen, ptxLen
  1808  	JE gcmSm4DecDone
  1809  	MOVOU (0*16)(SP), B0
  1810  	MOVOU (1*16)(SP), B1
  1811  	MOVOU (2*16)(SP), B2
  1812  	MOVOU (3*16)(SP), B3
  1813  	
  1814  	SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3)
  1815  	MOVOU B0, (16*4)(SP)
  1816  	MOVOU B1, (16*5)(SP)
  1817  	MOVOU B2, (16*6)(SP)
  1818  	MOVOU B3, (16*7)(SP)
  1819  
  1820  	MOVOU (16*14)(pTbl), T2
  1821  	MOVQ SP, BP
  1822  	ADDQ $64, BP
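	// BP now points at the keystream blocks parked at SP+64; the loop below
	// consumes one of them per remaining 16-byte ciphertext block.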
  1823  
  1824  gcmSm4DecSinglesLoop:
  1825  		CMPQ ptxLen, $16
  1826  		JB gcmSm4DecTail
  1827  		SUBQ $16, ptxLen
  1828  
  1829  		MOVOU (16*0)(BP), B1
  1830  		MOVOU (ctx), T0
  1831  		PXOR T0, B1
  1832  		
  1833  		decGhashRound(0)
  1834  		MOVOU B1, (ptx)
  1835  
  1836  		LEAQ (16*1)(ptx), ptx
  1837  		LEAQ (16*1)(ctx), ctx
  1838  		ADDQ $16, BP
  1839  	JMP gcmSm4DecSinglesLoop		
  1840  
  1841  gcmSm4DecTail:
  1842  	TESTQ ptxLen, ptxLen
  1843  	JE gcmSm4DecDone
  1844  
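	// 0 < ptxLen < 16 here. andMask stores masks whose low 1..15 bytes are 0xff,
	// so the load below (offset (ptxLen-1)*16) keeps exactly ptxLen bytes.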
  1845  	MOVQ ptxLen, aluTMP
  1846  	SHLQ $4, aluTMP
  1847  	LEAQ andMask<>(SB), aluCTR
  1848  	MOVOU -16(aluCTR)(aluTMP*1), T1
  1849  
  1850  	MOVOU (ctx), B0	// the GCM tag is assumed to follow the ciphertext in ctx, so this 16-byte load cannot read past the buffer
  1851  	PAND T1, B0
  1852  
  1853  	MOVOU B0, T1
  1854  	PSHUFB BSWAP, B0
  1855  	PXOR ACC0, B0
  1856  
  1857  	MOVOU (16*14)(pTbl), ACC0
  1858  	MOVOU (16*15)(pTbl), ACCM
  1859  	MOVOU ACC0, ACC1
  1860  
  1861  	PCLMULQDQ $0x00, B0, ACC0
  1862  	PCLMULQDQ $0x11, B0, ACC1
  1863  	PSHUFD $78, B0, T0
  1864  	PXOR B0, T0
  1865  	PCLMULQDQ $0x00, T0, ACCM
  1866  
  1867  	PXOR ACC0, ACCM
  1868  	PXOR ACC1, ACCM
  1869  	MOVOU ACCM, T0
  1870  	PSRLDQ $8, ACCM
  1871  	PSLLDQ $8, T0
  1872  	PXOR ACCM, ACC1
  1873  	PXOR T0, ACC0
  1874  
  1875  	reduceRound(ACC0)
  1876  	reduceRound(ACC0)
  1877  	PXOR ACC1, ACC0
  1878  
  1879  	MOVOU (16*0)(BP), B0
  1880  	PXOR T1, B0
  1881  
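	// B0 = keystream XOR masked ciphertext; store the ptxLen valid plaintext
	// bytes one at a time.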
  1882  ptxStoreLoop:
  1883  		PEXTRB $0, B0, (ptx)
  1884  		PSRLDQ $1, B0
  1885  		LEAQ 1(ptx), ptx
  1886  		DECQ ptxLen
  1887  
  1888  	JNE ptxStoreLoop
  1889  
  1890  gcmSm4DecDone:
  1891  	MOVOU ACC0, (tPtr)
  1892  	RET
  1893  
  1894  avx2GcmSm4Dec:
  1895  	VMOVDQU bswapMask<>(SB), BSWAP
  1896  	VMOVDQU gcmPoly<>(SB), POLY
  1897  
  1898  	VMOVDQU (tPtr), ACC0
  1899  	VPXOR ACC1, ACC1, ACC1
  1900  	VPXOR ACCM, ACCM, ACCM
  1901  	VMOVDQU (ctrPtr), T0
  1902  	MOVL (3*4)(ctrPtr), aluCTR
  1903  	BSWAPL aluCTR
  1904  
  1905  	VMOVDQU T0, (0*16)(SP)
  1906  	increment(0)
  1907  	VMOVDQU T0, (1*16)(SP)
  1908  	increment(1)
  1909  	VMOVDQU T0, (2*16)(SP)
  1910  	increment(2)
  1911  	VMOVDQU T0, (3*16)(SP)
  1912  	increment(3)
  1913  
  1914  	CMPQ ptxLen, $128
  1915  	JB avx2GcmSm4DecNibbles
  1916  
  1917  	// We have at least 8 blocks to decrypt; prepare the rest of the counters
  1918  	VMOVDQU T0, (4*16)(SP)
  1919  	increment(4)
  1920  	VMOVDQU T0, (5*16)(SP)
  1921  	increment(5)
  1922  	VMOVDQU T0, (6*16)(SP)
  1923  	increment(6)
  1924  	VMOVDQU T0, (7*16)(SP)
  1925  	increment(7)
  1926  
  1927  avx2GcmSm4DecOctetsLoop:
  1928  		CMPQ ptxLen, $128
  1929  		JB avx2GcmSm4DecEndOctets
  1930  		SUBQ $128, ptxLen
  1931  
  1932  		// load 8 counter blocks; encrypting them below produces the keystream
  1933  		VMOVDQU (0*32)(SP), DWB0
  1934  		VMOVDQU (1*32)(SP), DWB1
  1935  		VMOVDQU (2*32)(SP), DWB2
  1936  		VMOVDQU (3*32)(SP), DWB3
  1937  
  1938  		VBROADCASTI128 flipMask<>(SB), XDWTMP0
  1939  		// Apply Byte Flip Mask: LE -> BE
  1940  		VPSHUFB XDWTMP0, DWB0, DWB0
  1941  		VPSHUFB XDWTMP0, DWB1, DWB1
  1942  		VPSHUFB XDWTMP0, DWB2, DWB2
  1943  		VPSHUFB XDWTMP0, DWB3, DWB3
  1944  
  1945  		VMOVDQU (16*0)(ctx), T0
  1946  		VPSHUFB BSWAP, T0, T0
  1947  		VPXOR ACC0, T0, T0
  1948  		VPSHUFD $78, T0, T1
  1949  		VPXOR T0, T1, T1
  1950  
  1951  		VMOVDQU (16*0)(pTbl), ACC0
  1952  		VMOVDQU (16*1)(pTbl), ACCM
  1953  		VMOVDQU ACC0, ACC1
  1954  
  1955  		PCLMULQDQ $0x00, T1, ACCM
  1956  		PCLMULQDQ $0x00, T0, ACC0
  1957  		PCLMULQDQ $0x11, T0, ACC1
  1958  
  1959  
  1960  		// Transpose the 4x4 matrix of 32-bit words
  1961  		TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
  1962  		XORL BX, BX
  1963  		VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK
  1964  
  1965  avx2GcmSm4Dec8Loop2:
  1966  			AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3) 
  1967  			AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0) 
  1968  			AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1) 
  1969  			AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2) 
  1970  
  1971  			ADDL $16, BX
  1972  			CMPL BX, $4*32
  1973  		JB avx2GcmSm4Dec8Loop2
  1974  
  1975  		// Transpose the 4x4 matrix of 32-bit words
  1976  		TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1)
  1977  
  1978  		VBROADCASTI128 bswapMask<>(SB), DWBSWAP
  1979  		VPSHUFB DWBSWAP, DWB0, DWB0
  1980  		VPSHUFB DWBSWAP, DWB1, DWB1
  1981  		VPSHUFB DWBSWAP, DWB2, DWB2
  1982  		VPSHUFB DWBSWAP, DWB3, DWB3
  1983  
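		// XOR the keystream into the ciphertext and, in the same pass, feed each
		// byte-swapped 16-byte half into the GHASH accumulators; the low half of
		// the first 32-byte chunk was already absorbed at the top of the loop.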
  1984  		VMOVDQU (32*0)(ctx), XDWTMP0
  1985  		VPXOR XDWTMP0, DWB0, DWB0
  1986  		VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
  1987  		VEXTRACTI128 $1, XDWTMP0, T0
  1988  		internalDecMulRound(1)
  1989  		increment(0)
  1990  
  1991  		VMOVDQU (32*1)(ctx), XDWTMP0
  1992  		VPXOR XDWTMP0, DWB1, DWB1
  1993  		VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
  1994  		VEXTRACTI128 $0, XDWTMP0, T0
  1995  		internalDecMulRound(2)
  1996  		increment(1)
  1997  		VEXTRACTI128 $1, XDWTMP0, T0
  1998  		internalDecMulRound(3)
  1999  		increment(2)
  2000  
  2001  		VMOVDQU (32*2)(ctx), XDWTMP0
  2002  		VPXOR XDWTMP0, DWB2, DWB2
  2003  		VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
  2004  		VEXTRACTI128 $0, XDWTMP0, T0
  2005  		internalDecMulRound(4)
  2006  		increment(3)
  2007  		VEXTRACTI128 $1, XDWTMP0, T0
  2008  		internalDecMulRound(5)
  2009  		increment(4)
  2010  
  2011  		VMOVDQU (32*3)(ctx), XDWTMP0
  2012  		VPXOR XDWTMP0, DWB3, DWB3
  2013  		VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0
  2014  		VEXTRACTI128 $0, XDWTMP0, T0
  2015  		internalDecMulRound(6)
  2016  		increment(5)
  2017  		VEXTRACTI128 $1, XDWTMP0, T0
  2018  		internalDecMulRound(7)
  2019  		increment(6)
  2020  		increment(7)
  2021  
  2022  		VMOVDQU DWB0, (32*0)(ptx)
  2023  		VMOVDQU DWB1, (32*1)(ptx)
  2024  		VMOVDQU DWB2, (32*2)(ptx)
  2025  		VMOVDQU DWB3, (32*3)(ptx)
  2026  
  2027  		VPXOR ACC0, ACCM, ACCM
  2028  		VPXOR ACC1, ACCM, ACCM
  2029  		VPSLLDQ $8, ACCM, T0
  2030  		VPSRLDQ $8, ACCM, ACCM
  2031  		
  2032  		VPXOR ACCM, ACC1, ACC1
  2033  		VPXOR T0, ACC0, ACC0
  2034  
  2035  		reduceRound(ACC0)
  2036  		reduceRound(ACC0)
  2037  		VPXOR ACC1, ACC0, ACC0
  2038  
  2039  		LEAQ 128(ptx), ptx
  2040  		LEAQ 128(ctx), ctx
  2041  
  2042  		JMP avx2GcmSm4DecOctetsLoop
  2043  
  2044  avx2GcmSm4DecEndOctets:
  2045  	SUBQ $4, aluCTR
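	// Same counter rewind as in gcmSm4DecEndOctets above.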
  2046  
  2047  avx2GcmSm4DecNibbles:
  2048  	VMOVDQU flipMask<>(SB), B7 // B7 holds flipMask for the rest of this path; do not clobber it
  2049  	CMPQ ptxLen, $64
  2050  	JBE avx2GcmSm4DecSingles
  2051  	SUBQ $64, ptxLen
  2052  
  2053  	VMOVDQU (0*16)(SP), B0
  2054  	VMOVDQU (1*16)(SP), B1
  2055  	VMOVDQU (2*16)(SP), B2
  2056  	VMOVDQU (3*16)(SP), B3
  2057  	
  2058  	VPSHUFB B7, B0, B0
  2059  	VPSHUFB B7, B1, B1
  2060  	VPSHUFB B7, B2, B2
  2061  	VPSHUFB B7, B3, B3
  2062  
  2063  	TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
  2064  	XORL BX, BX	
  2065  	VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
  2066  
  2067  avx2GcmSm4Dec4Loop2:
  2068  	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
  2069  	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
  2070  	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
  2071  	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
  2072  
  2073  	ADDL $16, BX
  2074  	CMPL BX, $4*32
  2075  	JB avx2GcmSm4Dec4Loop2
  2076  
  2077  	// Transpose the 4x4 matrix of 32-bit words
  2078  	TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
  2079  	VPSHUFB BSWAP, B0, B4
  2080  	VPSHUFB BSWAP, B1, B1
  2081  	VPSHUFB BSWAP, B2, B2
  2082  	VPSHUFB BSWAP, B3, B3
  2083  
  2084  	VMOVDQU (16*14)(pTbl), T2
  2085  	VMOVDQU (16*0)(ctx), B0
  2086  	VPXOR B0, B4, B4
  2087  	internalDecGhashRound()
  2088  
  2089  	VMOVDQU (16*1)(ctx), B0
  2090  	VPXOR B0, B1, B1
  2091  	internalDecGhashRound()
  2092  
  2093  	VMOVDQU (16*2)(ctx), B0
  2094  	VPXOR B0, B2, B2
  2095  	internalDecGhashRound()
  2096  
  2097  	VMOVDQU (16*3)(ctx), B0
  2098  	VPXOR B0, B3, B3
  2099  	internalDecGhashRound()
  2100  
  2101  	VMOVDQU B4, (16*0)(ptx)
  2102  	VMOVDQU B1, (16*1)(ptx)
  2103  	VMOVDQU B2, (16*2)(ptx)
  2104  	VMOVDQU B3, (16*3)(ptx)
  2105  
  2106  	increment(0)
  2107  	increment(1)
  2108  	increment(2)
  2109  	increment(3)
  2110  
  2111  	LEAQ 64(ptx), ptx
  2112  	LEAQ 64(ctx), ctx
  2113  
  2114  avx2GcmSm4DecSingles:
  2115  	TESTQ ptxLen, ptxLen
  2116  	JE avx2GcmSm4DecDone
  2117  
  2118  	VMOVDQU (0*16)(SP), B0
  2119  	VMOVDQU (1*16)(SP), B1
  2120  	VMOVDQU (2*16)(SP), B2
  2121  	VMOVDQU (3*16)(SP), B3
  2122  
  2123  	VPSHUFB B7, B0, B0
  2124  	VPSHUFB B7, B1, B1
  2125  	VPSHUFB B7, B2, B2
  2126  	VPSHUFB B7, B3, B3
  2127  
  2128  	TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1)
  2129  	
  2130  	XORL BX, BX	
  2131  	VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK
  2132  
  2133  avx2GcmSm4Dec4Loop1:
  2134  	AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
  2135  	AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
  2136  	AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
  2137  	AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
  2138  
  2139  	ADDL $16, BX
  2140  	CMPL BX, $4*32
  2141  	JB avx2GcmSm4Dec4Loop1
  2142  
  2143  	// Transpose the 4x4 matrix of 32-bit words
  2144  	TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5)
  2145  	VPSHUFB BSWAP, B0, B0
  2146  	VPSHUFB BSWAP, B1, B1
  2147  	VPSHUFB BSWAP, B2, B2
  2148  	VPSHUFB BSWAP, B3, B3
  2149  
  2150  	VMOVDQU B0, (16*4)(SP)
  2151  	VMOVDQU B1, (16*5)(SP)
  2152  	VMOVDQU B2, (16*6)(SP)
  2153  	VMOVDQU B3, (16*7)(SP)
  2154  
  2155  	VMOVDQU (16*14)(pTbl), T2
  2156  	MOVQ SP, BP
  2157  	ADDQ $64, BP
  2158  
  2159  avx2GcmSm4DecSinglesLoop:
  2160  		CMPQ ptxLen, $16
  2161  		JB avx2GcmSm4DecTail
  2162  		SUBQ $16, ptxLen
  2163  
  2164  		VMOVDQU (16*0)(BP), T0
  2165  		VMOVDQU (ctx), B0
  2166  		VPXOR T0, B0, T0
  2167  		VMOVDQU T0, (ptx)
  2168  
  2169  		internalDecGhashRound()
  2170  		LEAQ (16*1)(ptx), ptx
  2171  		LEAQ (16*1)(ctx), ctx
  2172  		ADDQ $16, BP
  2173  	JMP avx2GcmSm4DecSinglesLoop
  2174  
  2175  avx2GcmSm4DecTail:
  2176  	TESTQ ptxLen, ptxLen
  2177  	JE avx2GcmSm4DecDone
  2178  
  2179  	MOVQ ptxLen, aluTMP
  2180  	SHLQ $4, aluTMP
  2181  	LEAQ andMask<>(SB), aluCTR
  2182  	VMOVDQU -16(aluCTR)(aluTMP*1), T1 // fetch the and-mask that matches ptxLen
  2183  
  2184  	VMOVDQU (ctx), B0	// the GCM tag is assumed to follow the ciphertext in ctx, so this 16-byte load cannot read past the buffer
  2185  	VPAND T1, B0, B0  // keep only the low ptxLen bytes; the rest become zero
  2186  
  2187  	VMOVDQU B0, T1
  2188  	internalDecGhashRound()
  2189  	VMOVDQU (16*0)(BP), B0
  2190  	VPXOR T1, B0, B0
  2191  
  2192  avx2PtxStoreLoop:
  2193  		PEXTRB $0, B0, (ptx)
  2194  		PSRLDQ $1, B0
  2195  		LEAQ 1(ptx), ptx
  2196  		DECQ ptxLen
  2197  
  2198  	JNE avx2PtxStoreLoop
  2199  
  2200  avx2GcmSm4DecDone:
  2201  	VMOVDQU ACC0, (tPtr)
  2202  	VZEROUPPER	
  2203  	RET
  2204  
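// The two SM4-NI entry points below are stubs that satisfy the Go declarations;
// presumably the amd64 code never dispatches to them.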
  2205  // func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
  2206  TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
  2207  	RET
  2208  
  2209  // func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
  2210  TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
  2211  	RET