github.com/emmansun/gmsm@v0.29.1/sm4/gcm_amd64.s

     1  // This is an optimized implementation of SM4-GCM using AES-NI and CLMUL-NI.
     2  // The implementation uses optimizations described in:
     3  // [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
     4  //     Instruction and its Usage for Computing the GCM Mode rev. 2.02
     5  // [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
     6  //     Hardware
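        //
        // GHASH is computed in GF(2^128) with the reduction polynomial
        // x^128 + x^7 + x^2 + x + 1. Each 128-bit block is multiplied into the
        // accumulator with PCLMULQDQ using Karatsuba (three carry-less multiplies
        // per block), and up to eight blocks are aggregated against the
        // precomputed powers H^1..H^8 before a single reduction, following [1], [2].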
     7  //go:build !purego
     8  
     9  #include "textflag.h"
    10  
    11  #define B0 X0
    12  #define B1 X1
    13  #define B2 X2
    14  #define B3 X3
    15  #define B4 X4
    16  #define B5 X5
    17  #define B6 X6
    18  #define B7 X7
    19  
    20  #define DWB0 Y0
    21  #define DWB1 Y2
    22  #define DWB2 Y4
    23  #define DWB3 Y6
    24  
    25  #define XDWORD Y1
    26  #define YDWORD Y3
    27  #define XDWTMP0 Y5
    28  
    29  #define ACC0 X8
    30  #define ACC1 X9
    31  #define ACCM X10
    32  
    33  #define T0 X11
    34  #define T1 X12
    35  #define T2 X13
    36  #define POLY X14
    37  #define BSWAP X15
    38  #define DWBSWAP Y15
    39  #define NIBBLE_MASK Y7
    40  #define X_NIBBLE_MASK X7
    41  
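        // gcmPoly is the GHASH reduction constant: the field polynomial
        // x^128 + x^7 + x^2 + x + 1 in the bit-reflected form used by the
        // PCLMULQDQ-based reduction (reduceRound) below.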
    42  DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    43  DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    44  
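        // andMask[i] keeps the low i+1 bytes of a 16-byte block and clears the
        // rest; it is used to mask the partial final block (1..15 bytes) in the
        // tail paths below.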
    45  DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    46  DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    47  DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    48  DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    49  DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    50  DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    51  DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    52  DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    53  DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    54  DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    55  DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    56  DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    57  DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    58  DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    59  DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    60  DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    61  DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    62  DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    63  DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    64  DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    65  DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    66  DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    67  DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    68  DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    69  DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    70  DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    71  DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    72  DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    73  DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    74  DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    75  
    76  GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    77  GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    78  
    79  #include "aesni_macros_amd64.s"
    80  
    81  // func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
    82  TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
    83  #define pTbl DI
    84  #define tMsk SI
    85  #define tPtr DX
    86  #define plen AX
    87  #define dlen CX
    88  
    89  	MOVQ productTable+0(FP), pTbl
    90  	MOVQ tagMask+8(FP), tMsk
    91  	MOVQ T+16(FP), tPtr
    92  	MOVQ pLen+24(FP), plen
    93  	MOVQ dLen+32(FP), dlen
    94  
    95  	MOVOU (tPtr), ACC0
    96  	MOVOU (tMsk), T2
    97  
    98  	MOVOU bswap_mask<>(SB), BSWAP
    99  	MOVOU gcmPoly<>(SB), POLY
   100  
   101  	SHLQ $3, plen
   102  	SHLQ $3, dlen
   103  
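        	// Both lengths are now bit counts; pack them into the GCM length block
        	// and fold it into the running GHASH state below.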
   104  	MOVQ plen, B0
   105  	PINSRQ $1, dlen, B0
   106  
   107  	PXOR ACC0, B0
   108  
   109  	MOVOU (16*14)(pTbl), ACC0
   110  	MOVOU (16*15)(pTbl), ACCM
   111  	MOVOU ACC0, ACC1
   112  
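        	// Karatsuba multiply of the length block by H: ACC0 = lo*lo,
        	// ACC1 = hi*hi, ACCM = (hi^lo)*(H.hi^H.lo); (16*14)(pTbl) holds H and
        	// (16*15)(pTbl) holds H.hi^H.lo, precomputed by gcmSm4Init.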
   113  	PCLMULQDQ $0x00, B0, ACC0
   114  	PCLMULQDQ $0x11, B0, ACC1
   115  	PSHUFD $78, B0, T0
   116  	PXOR B0, T0
   117  	PCLMULQDQ $0x00, T0, ACCM
   118  
   119  	PXOR ACC0, ACCM
   120  	PXOR ACC1, ACCM
   121  	MOVOU ACCM, T0
   122  	PSRLDQ $8, ACCM
   123  	PSLLDQ $8, T0
   124  	PXOR ACCM, ACC1
   125  	PXOR T0, ACC0
   126  
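        	// Reduce the 256-bit product ACC1:ACC0 modulo the field polynomial:
        	// fold the low half twice with POLY, then add the high half.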
   127  	MOVOU POLY, T0
   128  	PCLMULQDQ $0x01, ACC0, T0
   129  	PSHUFD $78, ACC0, ACC0
   130  	PXOR T0, ACC0
   131  
   132  	MOVOU POLY, T0
   133  	PCLMULQDQ $0x01, ACC0, T0
   134  	PSHUFD $78, ACC0, ACC0
   135  	PXOR T0, ACC0
   136  
   137  	PXOR ACC1, ACC0
   138  
   139  	PSHUFB BSWAP, ACC0
   140  	PXOR T2, ACC0
   141  	MOVOU ACC0, (tPtr)
   142  
   143  	RET
   144  
   145  #undef pTbl
   146  #undef tMsk
   147  #undef tPtr
   148  #undef plen
   149  #undef dlen
   150  
   151  // func gcmSm4Init(productTable *[256]byte, rk []uint32)
   152  TEXT ·gcmSm4Init(SB),NOSPLIT,$0
   153  #define dst DI
   154  #define RK SI
   155  
   156  	MOVQ productTable+0(FP), dst
   157  	MOVQ rk+8(FP), RK
   158  
   159  	MOVOU gcmPoly<>(SB), POLY
   160  
    161  	// Encrypt the all-zero block with the SM4 round keys to generate the hash key H
   162  	PXOR B0, B0
   163  	PXOR B1, B1
   164  	PXOR B2, B2
   165  	PXOR B3, B3
   166  	XORL CX, CX
   167  
   168  sm4InitEncLoop:
   169  		MOVUPS (RK)(CX*1), B4
   170  		MOVOU B4, T0
   171  		SM4_SINGLE_ROUND(T0, T1, T2, B3, B2, B1, B0)
   172  		PSHUFD $1, B4, T0
   173  		SM4_SINGLE_ROUND(T0, T1, T2, B2, B1, B0, B3)
   174  		PSHUFD $2, B4, T0
   175  		SM4_SINGLE_ROUND(T0, T1, T2, B1, B0, B3, B2)
   176  		PSHUFD $3, B4, T0
   177  		SM4_SINGLE_ROUND(T0, T1, T2, B0, B3, B2, B1)
   178  
   179  		ADDL $16, CX
   180  		CMPL CX, $4*32
   181  		JB sm4InitEncLoop
   182  
   183  	PALIGNR $4, B3, B3
   184  	PALIGNR $4, B3, B2
   185  	PALIGNR $4, B2, B1
   186  	PALIGNR $4, B1, B0
   187  
   188  	// H * 2
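        	// (multiply H by x in GF(2^128): shift the 128-bit value left by one
        	// bit and fold in POLY when a bit carries out, compensating for the
        	// bit-reflected representation as described in [1])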
   189  	PSHUFD $0xff, B0, T0
   190  	MOVOU B0, T1
   191  	PSRAL $31, T0
   192  	PAND POLY, T0
   193  	PSRLL $31, T1
   194  	PSLLDQ $4, T1
   195  	PSLLL $1, B0
   196  	PXOR T0, B0
   197  	PXOR T1, B0
   198  	// Karatsuba pre-computations
   199  	MOVOU B0, (16*14)(dst)
   200  	PSHUFD $78, B0, B1
   201  	PXOR B0, B1
   202  	MOVOU B1, (16*15)(dst)
   203  
   204  	MOVOU B0, B2
   205  	MOVOU B1, B3
   206  	// Now prepare powers of H and pre-computations for them
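        	// productTable is filled from the back: pair k (offsets 16*2k and
        	// 16*(2k+1)) ends up holding H^(8-k) and its Karatsuba term
        	// H^(8-k).hi ^ H^(8-k).lo, so pair 0 holds H^8 and pair 7 holds H.
        	// The aggregated loops elsewhere multiply block j of a group of 8 by H^(8-j).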
   207  	MOVQ $7, AX
   208  
   209  initLoop:
   210  		MOVOU B2, T0
   211  		MOVOU B2, T1
   212  		MOVOU B3, T2
   213  		PCLMULQDQ $0x00, B0, T0
   214  		PCLMULQDQ $0x11, B0, T1
   215  		PCLMULQDQ $0x00, B1, T2
   216  
   217  		PXOR T0, T2
   218  		PXOR T1, T2
   219  		MOVOU T2, B4
   220  		PSLLDQ $8, B4
   221  		PSRLDQ $8, T2
   222  		PXOR B4, T0
   223  		PXOR T2, T1
   224  
   225  		MOVOU POLY, B2
   226  		PCLMULQDQ $0x01, T0, B2
   227  		PSHUFD $78, T0, T0
   228  		PXOR B2, T0
   229  		MOVOU POLY, B2
   230  		PCLMULQDQ $0x01, T0, B2
   231  		PSHUFD $78, T0, T0
   232  		PXOR T0, B2
   233  		PXOR T1, B2
   234  
   235  		MOVOU B2, (16*12)(dst)
   236  		PSHUFD $78, B2, B3
   237  		PXOR B2, B3
   238  		MOVOU B3, (16*13)(dst)
   239  
   240  		DECQ AX
   241  		LEAQ (-16*2)(dst), dst
   242  	JNE initLoop
   243  
   244  	RET
   245  
   246  #undef RK
   247  #undef dst
   248  
   249  // func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
   250  TEXT ·gcmSm4Data(SB),NOSPLIT,$0
   251  #define pTbl DI
   252  #define aut SI
   253  #define tPtr CX
   254  #define autLen DX
   255  
   256  #define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
   257  #define avxReduceRound(a) 	VPCLMULQDQ $0x01, a, POLY, T0; VPSHUFD $78, a, a; VPXOR T0, a, a
   258  #define mulRoundAAD(X ,i) \
   259  	MOVOU (16*(i*2))(pTbl), T1;\
   260  	MOVOU T1, T2;\
   261  	PCLMULQDQ $0x00, X, T1;\
   262  	PXOR T1, ACC0;\
   263  	PCLMULQDQ $0x11, X, T2;\
   264  	PXOR T2, ACC1;\
   265  	PSHUFD $78, X, T1;\
   266  	PXOR T1, X;\
   267  	MOVOU (16*(i*2+1))(pTbl), T1;\
   268  	PCLMULQDQ $0x00, X, T1;\
   269  	PXOR T1, ACCM
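        // mulRoundAAD folds additional-data block i into the Karatsuba
        // accumulators using the precomputed pair at (16*2i)(pTbl), i.e. H^(8-i);
        // the eight blocks share a single reduction afterwards.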
   270  
   271  	MOVQ productTable+0(FP), pTbl
   272  	MOVQ data_base+8(FP), aut
   273  	MOVQ data_len+16(FP), autLen
   274  	MOVQ T+32(FP), tPtr
   275  
   276  	PXOR ACC0, ACC0
    277  	// MOVOU (tPtr), ACC0 // originally the tag's initial value was passed in and loaded here
   278  	MOVOU bswap_mask<>(SB), BSWAP
   279  	MOVOU gcmPoly<>(SB), POLY
   280  
   281  	TESTQ autLen, autLen
   282  	JEQ dataBail
   283  
    284  	CMPQ autLen, $13	// fast path for the TLS case (13-byte AAD)
   285  	JE dataTLS
   286  	CMPQ autLen, $128
   287  	JB startSinglesLoop
   288  	JMP dataOctaLoop
   289  
   290  dataTLS:
   291  	MOVOU (16*14)(pTbl), T1
   292  	MOVOU (16*15)(pTbl), T2
   293  	PXOR B0, B0
   294  	MOVQ (aut), B0
   295  	PINSRD $2, 8(aut), B0
   296  	PINSRB $12, 12(aut), B0
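        	// B0 now holds the 13-byte TLS record header (8+4+1 bytes), zero padded
        	// to a full block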
   297  	XORQ autLen, autLen
   298  	JMP dataMul
   299  
   300  dataOctaLoop:
   301  		CMPQ autLen, $128
   302  		JB startSinglesLoop
   303  		SUBQ $128, autLen
   304  
   305  		MOVOU (16*0)(aut), X0
   306  		MOVOU (16*1)(aut), X1
   307  		MOVOU (16*2)(aut), X2
   308  		MOVOU (16*3)(aut), X3
   309  		MOVOU (16*4)(aut), X4
   310  		MOVOU (16*5)(aut), X5
   311  		MOVOU (16*6)(aut), X6
   312  		MOVOU (16*7)(aut), X7
   313  		LEAQ (16*8)(aut), aut
   314  		PSHUFB BSWAP, X0
   315  		PSHUFB BSWAP, X1
   316  		PSHUFB BSWAP, X2
   317  		PSHUFB BSWAP, X3
   318  		PSHUFB BSWAP, X4
   319  		PSHUFB BSWAP, X5
   320  		PSHUFB BSWAP, X6
   321  		PSHUFB BSWAP, X7
   322  		PXOR ACC0, X0
   323  
   324  		MOVOU (16*0)(pTbl), ACC0
   325  		MOVOU (16*1)(pTbl), ACCM
   326  		MOVOU ACC0, ACC1
   327  		PSHUFD $78, X0, T1
   328  		PXOR X0, T1
   329  		PCLMULQDQ $0x00, X0, ACC0
   330  		PCLMULQDQ $0x11, X0, ACC1
   331  		PCLMULQDQ $0x00, T1, ACCM
   332  
   333  		mulRoundAAD(X1, 1)
   334  		mulRoundAAD(X2, 2)
   335  		mulRoundAAD(X3, 3)
   336  		mulRoundAAD(X4, 4)
   337  		mulRoundAAD(X5, 5)
   338  		mulRoundAAD(X6, 6)
   339  		mulRoundAAD(X7, 7)
   340  
   341  		PXOR ACC0, ACCM
   342  		PXOR ACC1, ACCM
   343  		MOVOU ACCM, T0
   344  		PSRLDQ $8, ACCM
   345  		PSLLDQ $8, T0
   346  		PXOR ACCM, ACC1
   347  		PXOR T0, ACC0
   348  		reduceRound(ACC0)
   349  		reduceRound(ACC0)
   350  		PXOR ACC1, ACC0
   351  	JMP dataOctaLoop
   352  
   353  startSinglesLoop:
   354  	MOVOU (16*14)(pTbl), T1
   355  	MOVOU (16*15)(pTbl), T2
   356  
   357  dataSinglesLoop:
   358  
   359  		CMPQ autLen, $16
   360  		JB dataEnd
   361  		SUBQ $16, autLen
   362  
   363  		MOVOU (aut), B0
   364  dataMul:
   365  		PSHUFB BSWAP, B0
   366  		PXOR ACC0, B0
   367  
   368  		MOVOU T1, ACC0
   369  		MOVOU T2, ACCM
   370  		MOVOU T1, ACC1
   371  
   372  		PSHUFD $78, B0, T0
   373  		PXOR B0, T0
   374  		PCLMULQDQ $0x00, B0, ACC0
   375  		PCLMULQDQ $0x11, B0, ACC1
   376  		PCLMULQDQ $0x00, T0, ACCM
   377  
   378  		PXOR ACC0, ACCM
   379  		PXOR ACC1, ACCM
   380  		MOVOU ACCM, T0
   381  		PSRLDQ $8, ACCM
   382  		PSLLDQ $8, T0
   383  		PXOR ACCM, ACC1
   384  		PXOR T0, ACC0
   385  
   386  		MOVOU POLY, T0
   387  		PCLMULQDQ $0x01, ACC0, T0
   388  		PSHUFD $78, ACC0, ACC0
   389  		PXOR T0, ACC0
   390  
   391  		MOVOU POLY, T0
   392  		PCLMULQDQ $0x01, ACC0, T0
   393  		PSHUFD $78, ACC0, ACC0
   394  		PXOR T0, ACC0
   395  		PXOR ACC1, ACC0
   396  
   397  		LEAQ 16(aut), aut
   398  
   399  	JMP dataSinglesLoop
   400  
   401  dataEnd:
   402  
   403  	TESTQ autLen, autLen
   404  	JEQ dataBail
   405  
   406  	PXOR B0, B0
   407  	LEAQ -1(aut)(autLen*1), aut
   408  
   409  dataLoadLoop:
   410  
   411  		PSLLDQ $1, B0
   412  		PINSRB $0, (aut), B0
   413  
   414  		LEAQ -1(aut), aut
   415  		DECQ autLen
   416  		JNE dataLoadLoop
   417  
   418  	JMP dataMul
   419  
   420  dataBail:
   421  	MOVOU ACC0, (tPtr)
   422  	RET
   423  
   424  #undef pTbl
   425  #undef aut
   426  #undef tPtr
   427  #undef autLen
   428  
   429  
   430  // func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
   431  TEXT ·gcmSm4Enc(SB),0,$256-96
   432  #define pTbl DI
   433  #define ctx DX
   434  #define ctrPtr CX
   435  #define ptx SI
   436  #define rk AX
   437  #define tPtr R8
   438  #define ptxLen R9
   439  #define aluCTR R10
   440  #define aluTMP R11
   441  
   442  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, (3*4 + 8*16 + i*16)(SP)
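        // The counter blocks are kept on the stack in byte-reflected form, so the
        // 32-bit counter sits in dword 3 and can be advanced with plain integer
        // arithmetic: increment(i) bumps the shared counter and patches it into
        // the i-th counter block.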
   443  
   444  #define mulRound(i) \
   445  	MOVOU (16*i)(SP), T0;\
   446  	MOVOU (16*(i*2))(pTbl), T1;\
   447  	MOVOU T1, T2;\
   448  	PCLMULQDQ $0x00, T0, T1;\
   449  	PXOR T1, ACC0;\
   450  	PCLMULQDQ $0x11, T0, T2;\
   451  	PXOR T2, ACC1;\
   452  	PSHUFD $78, T0, T1;\
   453  	PXOR T1, T0;\
   454  	MOVOU (16*(i*2+1))(pTbl), T1;\
   455  	PCLMULQDQ $0x00, T0, T1;\
   456  	PXOR T1, ACCM
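        // mulRound(i) folds ciphertext block i (saved byte-reflected on the stack)
        // into the Karatsuba accumulators using the precomputed pair at
        // (16*2i)(pTbl); eight blocks share a single reduction (aggregated GHASH).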
   457  
   458  #define gcmEncDataStep(B) \
   459  	PSHUFB BSWAP, B; \
   460  	PXOR ACC0, B; \
   461  	MOVOU T2, ACC0; \
   462  	MOVOU T2, ACC1; \
   463  	MOVOU (16*15)(pTbl), ACCM; \
   464  	PSHUFD $78, B, T0; \
   465  	PXOR B, T0; \
   466  	PCLMULQDQ $0x00, B, ACC0; \
   467  	PCLMULQDQ $0x11, B, ACC1; \
   468  	PCLMULQDQ $0x00, T0, ACCM; \
   469  	PXOR ACC0, ACCM; \
   470  	PXOR ACC1, ACCM; \
   471  	MOVOU ACCM, T0; \
   472  	PSRLDQ $8, ACCM; \
   473  	PSLLDQ $8, T0; \
   474  	PXOR ACCM, ACC1; \
   475  	PXOR T0, ACC0; \
   476  	reduceRound(ACC0); \
   477  	reduceRound(ACC0); \
   478  	PXOR ACC1, ACC0
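        // gcmEncDataStep folds a single ciphertext block into the GHASH state;
        // callers load H from (16*14)(pTbl) into T2 beforehand.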
   479  
   480  #define avxMulRound(i) \
   481  	VMOVDQU (16*i)(SP), T0;\
   482  	VMOVDQU (16*(i*2))(pTbl), T2;\
   483  	VPCLMULQDQ $0x00, T0, T2, T1;\
   484  	VPXOR T1, ACC0, ACC0;\
   485  	VPCLMULQDQ $0x11, T0, T2, T2;\
   486  	VPXOR T2, ACC1, ACC1;\
   487  	VPSHUFD $78, T0, T1;\
   488  	VPXOR T1, T0, T0;\
   489  	VMOVDQU (16*(i*2+1))(pTbl), T1;\
   490  	VPCLMULQDQ $0x00, T0, T1, T1;\
   491  	VPXOR T1, ACCM, ACCM
   492  
   493  #define avxGcmEncDataStep(B) \
   494  	VPSHUFB BSWAP, B, B; \
   495  	VPXOR ACC0, B, B; \
   496  	VMOVDQU (16*15)(pTbl), ACCM; \
   497  	VPSHUFD $78, B, T0; \
   498  	VPXOR B, T0, T0; \
   499  	VPCLMULQDQ $0x00, B, T2, ACC0; \
   500  	VPCLMULQDQ $0x11, B, T2, ACC1; \
   501  	VPCLMULQDQ $0x00, T0, ACCM, ACCM; \
   502  	VPXOR ACC0, ACCM, ACCM; \
   503  	VPXOR ACC1, ACCM, ACCM; \
   504  	VPSLLDQ $8, ACCM, T0; \
   505  	VPSRLDQ $8, ACCM, ACCM; \
   506  	VPXOR ACCM, ACC1, ACC1; \
   507  	VPXOR T0, ACC0, ACC0; \
   508  	avxReduceRound(ACC0); \
   509  	avxReduceRound(ACC0); \
   510  	VPXOR ACC1, ACC0, ACC0
   511  
   512  	MOVQ productTable+0(FP), pTbl
   513  	MOVQ dst+8(FP), ctx
   514  	MOVQ src_base+32(FP), ptx
   515  	MOVQ src_len+40(FP), ptxLen
   516  	MOVQ ctr+56(FP), ctrPtr
   517  	MOVQ T+64(FP), tPtr
   518  	MOVQ rk_base+72(FP), rk
   519  
   520  	CMPB ·useAVX2(SB), $1
   521  	JE   avx2GcmSm4Enc
   522  
   523  	CMPB ·useAVX(SB), $1
   524  	JE   avxGcmSm4Enc
   525  
   526  	MOVOU bswap_mask<>(SB), BSWAP
   527  	MOVOU gcmPoly<>(SB), POLY
   528  
   529  	MOVOU (tPtr), ACC0
   530  	PXOR ACC1, ACC1
   531  	PXOR ACCM, ACCM
   532  	MOVOU (ctrPtr), T0
   533  	PSHUFB flip_mask<>(SB), T0
   534  	PEXTRD $3, T0, aluCTR
   535  
   536  	MOVOU T0, (8*16 + 0*16)(SP)
   537  	increment(0)
   538  	MOVOU T0, (8*16 + 1*16)(SP)
   539  	increment(1)
   540  	MOVOU T0, (8*16 + 2*16)(SP)
   541  	increment(2)
   542  	MOVOU T0, (8*16 + 3*16)(SP)
   543  	increment(3)
   544  
   545  	CMPQ ptxLen, $128
   546  	JB gcmSm4EncNibbles
   547  	SUBQ $128, ptxLen
   548  
   549  	// We have at least 8 blocks to encrypt, prepare the rest of the counters
   550  	MOVOU T0, (8*16 + 4*16)(SP)
   551  	increment(4)
   552  	MOVOU T0, (8*16 + 5*16)(SP)
   553  	increment(5)
   554  	MOVOU T0, (8*16 + 6*16)(SP)
   555  	increment(6)
   556  	MOVOU T0, (8*16 + 7*16)(SP)
   557  	increment(7)
   558  
   559  	// load 8 ctrs for encryption
   560  	MOVOU (8*16 + 0*16)(SP), B0
   561  	MOVOU (8*16 + 1*16)(SP), B1
   562  	MOVOU (8*16 + 2*16)(SP), B2
   563  	MOVOU (8*16 + 3*16)(SP), B3
   564  	MOVOU (8*16 + 4*16)(SP), B4
   565  	MOVOU (8*16 + 5*16)(SP), B5
   566  	MOVOU (8*16 + 6*16)(SP), B6
   567  	MOVOU (8*16 + 7*16)(SP), B7
   568  
   569  	SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
   570  	increment(0)
   571  
   572  	// XOR plaintext
   573  	MOVOU (16*0)(ptx), T0
   574  	PXOR T0, B0
   575  	increment(1)
   576  	MOVOU (16*1)(ptx), T0
   577  	PXOR T0, B1
   578  	increment(2)
   579  	MOVOU (16*2)(ptx), T0
   580  	PXOR T0, B2
   581  	increment(3)
   582  	MOVOU (16*3)(ptx), T0
   583  	PXOR T0, B3
   584  	increment(4)
   585  	MOVOU (16*4)(ptx), T0
   586  	PXOR T0, B4
   587  	increment(5)
   588  	MOVOU (16*5)(ptx), T0
   589  	PXOR T0, B5
   590  	increment(6)
   591  	MOVOU (16*6)(ptx), T0
   592  	PXOR T0, B6
   593  	increment(7)
   594  	MOVOU (16*7)(ptx), T0
   595  	PXOR T0, B7
   596  
   597  	// Store ciphertext
   598  	MOVOU B0, (16*0)(ctx)
   599  	PSHUFB BSWAP, B0
   600  	PXOR ACC0, B0
   601  	MOVOU B1, (16*1)(ctx)
   602  	PSHUFB BSWAP, B1
   603  	MOVOU B2, (16*2)(ctx)
   604  	PSHUFB BSWAP, B2
   605  	MOVOU B3, (16*3)(ctx)
   606  	PSHUFB BSWAP, B3
   607  	MOVOU B4, (16*4)(ctx)
   608  	PSHUFB BSWAP, B4
   609  	MOVOU B5, (16*5)(ctx)
   610  	PSHUFB BSWAP, B5
   611  	MOVOU B6, (16*6)(ctx)
   612  	PSHUFB BSWAP, B6
   613  	MOVOU B7, (16*7)(ctx)
   614  	PSHUFB BSWAP, B7
   615  
   616  	MOVOU B0, (16*0)(SP)
   617  	MOVOU B1, (16*1)(SP)
   618  	MOVOU B2, (16*2)(SP)
   619  	MOVOU B3, (16*3)(SP)
   620  	MOVOU B4, (16*4)(SP)
   621  	MOVOU B5, (16*5)(SP)
   622  	MOVOU B6, (16*6)(SP)
   623  	MOVOU B7, (16*7)(SP)
   624  
   625  	LEAQ 128(ptx), ptx
   626  	LEAQ 128(ctx), ctx
   627  
   628  gcmSm4EncOctetsLoop:
   629  		CMPQ ptxLen, $128
   630  		JB gcmSm4EncOctetsEnd
   631  		SUBQ $128, ptxLen
   632  
   633  		MOVOU (8*16 + 0*16)(SP), B0
   634  		MOVOU (8*16 + 1*16)(SP), B1
   635  		MOVOU (8*16 + 2*16)(SP), B2
   636  		MOVOU (8*16 + 3*16)(SP), B3
   637  		MOVOU (8*16 + 4*16)(SP), B4
   638  		MOVOU (8*16 + 5*16)(SP), B5
   639  		MOVOU (8*16 + 6*16)(SP), B6
   640  		MOVOU (8*16 + 7*16)(SP), B7
   641  
   642  		MOVOU (16*0)(SP), T0
   643  		PSHUFD $78, T0, T1
   644  		PXOR T0, T1
   645  
   646  		MOVOU (16*0)(pTbl), ACC0
   647  		MOVOU (16*1)(pTbl), ACCM
   648  		MOVOU ACC0, ACC1
   649  
   650  		PCLMULQDQ $0x00, T1, ACCM
   651  		PCLMULQDQ $0x00, T0, ACC0
   652  		PCLMULQDQ $0x11, T0, ACC1
   653  
   654  		mulRound(1)
   655  		increment(0)
   656  		mulRound(2)
   657  		increment(1)
   658  		mulRound(3)
   659  		increment(2)
   660  	 	mulRound(4)
   661  		increment(3)
   662  		mulRound(5)
   663  		increment(4)
   664  		mulRound(6)
   665  		increment(5)
   666  	 	mulRound(7)
   667  		increment(6)
   668  		
   669  		PXOR ACC0, ACCM
   670  		PXOR ACC1, ACCM
   671  		MOVOU ACCM, T0
   672  		PSRLDQ $8, ACCM
   673  		PSLLDQ $8, T0
   674  		PXOR ACCM, ACC1
   675  		PXOR T0, ACC0
   676  		
   677  		increment(7)
   678  		reduceRound(ACC0)
   679  		reduceRound(ACC0)
   680  		PXOR ACC1, ACC0
   681  		
   682  		SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
   683  
   684  		MOVOU (16*0)(ptx), T0 
   685  		PXOR T0, B0
   686  		MOVOU (16*1)(ptx), T0
   687  		PXOR T0, B1
   688  		MOVOU (16*2)(ptx), T0
   689  		PXOR T0, B2
   690  		MOVOU (16*3)(ptx), T0
   691  		PXOR T0, B3
   692  		MOVOU (16*4)(ptx), T0
   693  		PXOR T0, B4
   694  		MOVOU (16*5)(ptx), T0
   695  		PXOR T0, B5
   696  		MOVOU (16*6)(ptx), T0
   697  		PXOR T0, B6
   698  		MOVOU (16*7)(ptx), T0
   699  		PXOR T0, B7
   700  
   701  		MOVOU B0, (16*0)(ctx)
   702  		PSHUFB BSWAP, B0
   703  		PXOR ACC0, B0
   704  		MOVOU B1, (16*1)(ctx)
   705  		PSHUFB BSWAP, B1
   706  		MOVOU B2, (16*2)(ctx)
   707  		PSHUFB BSWAP, B2
   708  		MOVOU B3, (16*3)(ctx)
   709  		PSHUFB BSWAP, B3
   710  		MOVOU B4, (16*4)(ctx)
   711  		PSHUFB BSWAP, B4
   712  		MOVOU B5, (16*5)(ctx)
   713  		PSHUFB BSWAP, B5
   714  		MOVOU B6, (16*6)(ctx)
   715  		PSHUFB BSWAP, B6
   716  		MOVOU B7, (16*7)(ctx)
   717  		PSHUFB BSWAP, B7
   718  
   719  		MOVOU B0, (16*0)(SP)
   720  		MOVOU B1, (16*1)(SP)
   721  		MOVOU B2, (16*2)(SP)
   722  		MOVOU B3, (16*3)(SP)
   723  		MOVOU B4, (16*4)(SP)
   724  		MOVOU B5, (16*5)(SP)
   725  		MOVOU B6, (16*6)(SP)
   726  		MOVOU B7, (16*7)(SP)
   727  
   728  		LEAQ 128(ptx), ptx
   729  		LEAQ 128(ctx), ctx
   730  
   731  		JMP gcmSm4EncOctetsLoop
   732  
   733  gcmSm4EncOctetsEnd:
   734  	MOVOU (16*0)(SP), T0
   735  	MOVOU (16*0)(pTbl), ACC0
   736  	MOVOU (16*1)(pTbl), ACCM
   737  	MOVOU ACC0, ACC1
   738  	PSHUFD $78, T0, T1
   739  	PXOR T0, T1
   740  	PCLMULQDQ $0x00, T0, ACC0
   741  	PCLMULQDQ $0x11, T0, ACC1
   742  	PCLMULQDQ $0x00, T1, ACCM
   743  
   744  	mulRound(1)
   745  	mulRound(2)
   746  	mulRound(3)
   747  	mulRound(4)
   748  	mulRound(5)
   749  	mulRound(6)
   750  	mulRound(7)
   751  
   752  	PXOR ACC0, ACCM
   753  	PXOR ACC1, ACCM
   754  	MOVOU ACCM, T0
   755  	PSRLDQ $8, ACCM
   756  	PSLLDQ $8, T0
   757  	PXOR ACCM, ACC1
   758  	PXOR T0, ACC0
   759  
   760  	reduceRound(ACC0)
   761  	reduceRound(ACC0)
   762  	PXOR ACC1, ACC0
   763  
   764  	TESTQ ptxLen, ptxLen
   765  	JE gcmSm4EncDone
   766  
   767  	SUBQ $4, aluCTR
   768  
   769  gcmSm4EncNibbles:
   770  	CMPQ ptxLen, $64
   771  	JBE gcmSm4EncSingles
   772  	SUBQ $64, ptxLen
   773  
   774  	MOVOU (8*16 + 0*16)(SP), B0
   775  	MOVOU (8*16 + 1*16)(SP), B1
   776  	MOVOU (8*16 + 2*16)(SP), B2
   777  	MOVOU (8*16 + 3*16)(SP), B3
   778  	
   779  	SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   780  	MOVOU (16*0)(ptx), T0
   781  	PXOR T0, B0
   782  	MOVOU (16*1)(ptx), T0
   783  	PXOR T0, B1
   784  	MOVOU (16*2)(ptx), T0
   785  	PXOR T0, B2
   786  	MOVOU (16*3)(ptx), T0
   787  	PXOR T0, B3
   788  
   789  	MOVOU B0, (16*0)(ctx)
   790  	MOVOU B1, (16*1)(ctx)
   791  	MOVOU B2, (16*2)(ctx)
   792  	MOVOU B3, (16*3)(ctx)
   793  
   794  	MOVOU (16*14)(pTbl), T2
   795  	increment(0)
   796  	gcmEncDataStep(B0)
   797  	increment(1)
   798  	gcmEncDataStep(B1)
   799  	increment(2)
   800  	gcmEncDataStep(B2)
   801  	increment(3)
   802  	gcmEncDataStep(B3)
   803  
   804  	LEAQ 64(ptx), ptx
   805  	LEAQ 64(ctx), ctx
   806  
   807  gcmSm4EncSingles:
   808  	TESTQ ptxLen, ptxLen
   809  	JE gcmSm4EncDone
   810  	MOVOU (8*16 + 0*16)(SP), B0
   811  	MOVOU (8*16 + 1*16)(SP), B1
   812  	MOVOU (8*16 + 2*16)(SP), B2
   813  	MOVOU (8*16 + 3*16)(SP), B3
   814  	
   815  	SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   816  	MOVOU B0, (16*0)(SP)
   817  	MOVOU B1, (16*1)(SP)
   818  	MOVOU B2, (16*2)(SP)
   819  	MOVOU B3, (16*3)(SP)
   820  
   821  	MOVOU (16*14)(pTbl), T2
   822  	MOVQ SP, BP
   823  
   824  gcmSm4EncSinglesLoop:
   825  		CMPQ ptxLen, $16
   826  		JB gcmSm4EncTail
   827  		SUBQ $16, ptxLen
   828  		MOVOU (16*0)(BP), B0
   829  		MOVOU (ptx), T0
   830  		PXOR T0, B0
   831  		MOVOU B0, (ctx)
   832  		gcmEncDataStep(B0)
   833  		LEAQ (16*1)(ptx), ptx
   834  		LEAQ (16*1)(ctx), ctx
   835  		ADDQ $16, BP
   836  	JMP gcmSm4EncSinglesLoop		
   837  
   838  gcmSm4EncTail:
   839  	TESTQ ptxLen, ptxLen
   840  	JE gcmSm4EncDone
   841  	MOVOU (16*0)(BP), B0
   842  	MOVOU B0, T0
   843  
   844  	LEAQ -1(ptx)(ptxLen*1), ptx
   845  
   846  	MOVQ ptxLen, aluTMP
   847  	SHLQ $4, aluTMP
   848  
   849  	LEAQ andMask<>(SB), aluCTR
   850  	MOVOU -16(aluCTR)(aluTMP*1), T1
   851  	PXOR B0, B0
   852  ptxLoadLoop:
   853  		PSLLDQ $1, B0
   854  		PINSRB $0, (ptx), B0
   855  		LEAQ -1(ptx), ptx
   856  		DECQ ptxLen
   857  	JNE ptxLoadLoop
   858  
   859  	PXOR T0, B0
   860  	PAND T1, B0
    861  	MOVOU B0, (ctx)	// assumes there is always room here, since the tag follows the ciphertext
   862  	gcmEncDataStep(B0)
   863  
   864  gcmSm4EncDone:
   865  	MOVOU ACC0, (tPtr)
   866  	RET
   867  
   868  avxGcmSm4Enc:
   869  	VMOVDQU bswap_mask<>(SB), BSWAP
   870  	VMOVDQU gcmPoly<>(SB), POLY
   871  
   872  	VMOVDQU (tPtr), ACC0
   873  	VPXOR ACC1, ACC1, ACC1
   874  	VPXOR ACCM, ACCM, ACCM
   875  	VMOVDQU (ctrPtr), T0
   876  	VPSHUFB flip_mask<>(SB), T0, T0
   877  	VPEXTRD $3, T0, aluCTR
   878  
   879  	VMOVDQU T0, (8*16 + 0*16)(SP)
   880  	increment(0)
   881  	VMOVDQU T0, (8*16 + 1*16)(SP)
   882  	increment(1)
   883  	VMOVDQU T0, (8*16 + 2*16)(SP)
   884  	increment(2)
   885  	VMOVDQU T0, (8*16 + 3*16)(SP)
   886  	increment(3)
   887  
   888  	CMPQ ptxLen, $128
   889  	JB avxGcmSm4EncNibbles
   890  	SUBQ $128, ptxLen
   891  
   892  	// We have at least 8 blocks to encrypt, prepare the rest of the counters
   893  	VMOVDQU T0, (8*16 + 4*16)(SP)
   894  	increment(4)
   895  	VMOVDQU T0, (8*16 + 5*16)(SP)
   896  	increment(5)
   897  	VMOVDQU T0, (8*16 + 6*16)(SP)
   898  	increment(6)
   899  	VMOVDQU T0, (8*16 + 7*16)(SP)
   900  	increment(7)
   901  
   902  	// load 8 ctrs for encryption
   903  	VMOVDQU (8*16 + 0*16)(SP), B0
   904  	VMOVDQU (8*16 + 1*16)(SP), B1
   905  	VMOVDQU (8*16 + 2*16)(SP), B2
   906  	VMOVDQU (8*16 + 3*16)(SP), B3
   907  	VMOVDQU (8*16 + 4*16)(SP), B4
   908  	VMOVDQU (8*16 + 5*16)(SP), B5
   909  	VMOVDQU (8*16 + 6*16)(SP), B6
   910  	VMOVDQU (8*16 + 7*16)(SP), B7
   911  
   912  	AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
   913  	increment(0)
   914  	
   915  	// XOR plaintext
   916  	VPXOR (16*0)(ptx), B0, B0
   917  	VPXOR (16*1)(ptx), B1, B1
   918  	increment(1)
   919  	VPXOR (16*2)(ptx), B2, B2
   920  	VPXOR (16*3)(ptx), B3, B3
   921  	increment(2)
   922  	VPXOR (16*4)(ptx), B4, B4
   923  	VPXOR (16*5)(ptx), B5, B5
   924  	increment(3)
   925  	VPXOR (16*6)(ptx), B6, B6
   926  	VPXOR (16*7)(ptx), B7, B7	
   927  	// Store ciphertext
   928  	VMOVDQU B0, (16*0)(ctx)
   929  	VPSHUFB BSWAP, B0, B0
   930  	increment(4)
   931  	VMOVDQU B1, (16*1)(ctx)
   932  	VPSHUFB BSWAP, B1, B1
   933  	increment(5)
   934  	VMOVDQU B2, (16*2)(ctx)
   935  	VPSHUFB BSWAP, B2, B2
   936  	increment(6)
   937  	VMOVDQU B3, (16*3)(ctx)
   938  	VPSHUFB BSWAP, B3, B3
   939  	increment(7)
   940  	VMOVDQU B4, (16*4)(ctx)
   941  	VPSHUFB BSWAP, B4, B4
   942  	VMOVDQU B5, (16*5)(ctx)
   943  	VPSHUFB BSWAP, B5, B5
   944  	VMOVDQU B6, (16*6)(ctx)
   945  	VPSHUFB BSWAP, B6, B6
   946  	VMOVDQU B7, (16*7)(ctx)
   947  	VPSHUFB BSWAP, B7, B7
   948  
   949  	VPXOR ACC0, B0, B0
   950  
   951  	VMOVDQU B0, (16*0)(SP)
   952  	VMOVDQU B1, (16*1)(SP)
   953  	VMOVDQU B2, (16*2)(SP)
   954  	VMOVDQU B3, (16*3)(SP)
   955  	VMOVDQU B4, (16*4)(SP)
   956  	VMOVDQU B5, (16*5)(SP)
   957  	VMOVDQU B6, (16*6)(SP)
   958  	VMOVDQU B7, (16*7)(SP)
   959  
   960  	LEAQ 128(ptx), ptx
   961  	LEAQ 128(ctx), ctx	
   962  
   963  avxGcmSm4EncOctetsLoop:
   964  		CMPQ ptxLen, $128
   965  		JB avxGcmSm4EncOctetsEnd
   966  		SUBQ $128, ptxLen
   967  
   968  		// load 8 ctrs for encryption
   969  		VMOVDQU (8*16 + 0*16)(SP), B0
   970  		VMOVDQU (8*16 + 1*16)(SP), B1
   971  		VMOVDQU (8*16 + 2*16)(SP), B2
   972  		VMOVDQU (8*16 + 3*16)(SP), B3
   973  		VMOVDQU (8*16 + 4*16)(SP), B4
   974  		VMOVDQU (8*16 + 5*16)(SP), B5
   975  		VMOVDQU (8*16 + 6*16)(SP), B6
   976  		VMOVDQU (8*16 + 7*16)(SP), B7
   977  
   978  		VMOVDQU (16*0)(SP), T0
   979  		VPSHUFD $78, T0, T1
   980  		VPXOR T0, T1, T1
   981  
   982  		VMOVDQU (16*0)(pTbl), ACC1
   983  		VMOVDQU (16*1)(pTbl), ACCM
   984  
   985  		VPCLMULQDQ $0x00, T1, ACCM, ACCM
   986  		VPCLMULQDQ $0x00, T0, ACC1, ACC0
   987  		VPCLMULQDQ $0x11, T0, ACC1, ACC1
   988  
   989  		avxMulRound(1)
   990  		increment(0)
   991  		avxMulRound(2)
   992  		increment(1)
   993  		avxMulRound(3)
   994  		increment(2)
   995  	 	avxMulRound(4)
   996  		increment(3)
   997  		avxMulRound(5)
   998  		increment(4)
   999  		avxMulRound(6)
  1000  		increment(5)
  1001  	 	avxMulRound(7)
  1002  		increment(6)
  1003  		VPXOR ACC0, ACCM, ACCM
  1004  		VPXOR ACC1, ACCM, ACCM
  1005  		VPSLLDQ $8, ACCM, T0
  1006  		VPSRLDQ $8, ACCM, ACCM
  1007  		
  1008  		VPXOR ACCM, ACC1, ACC1
  1009  		VPXOR T0, ACC0, ACC0
  1010  
  1011  		increment(7)
  1012  		avxReduceRound(ACC0)
  1013  		avxReduceRound(ACC0)
  1014  		VPXOR ACC1, ACC0, ACC0
  1015  
  1016  		AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
  1017  		// XOR plaintext
  1018  		VPXOR (16*0)(ptx), B0, B0
  1019  		VPXOR (16*1)(ptx), B1, B1
  1020  		VPXOR (16*2)(ptx), B2, B2
  1021  		VPXOR (16*3)(ptx), B3, B3
  1022  		VPXOR (16*4)(ptx), B4, B4
  1023  		VPXOR (16*5)(ptx), B5, B5
  1024  		VPXOR (16*6)(ptx), B6, B6
  1025  		VPXOR (16*7)(ptx), B7, B7
  1026  
  1027  		// Store ciphertext
  1028  		VMOVDQU B0, (16*0)(ctx)
  1029  		VPSHUFB BSWAP, B0, B0
  1030  		VMOVDQU B1, (16*1)(ctx)
  1031  		VPSHUFB BSWAP, B1, B1
  1032  		VMOVDQU B2, (16*2)(ctx)
  1033  		VPSHUFB BSWAP, B2, B2
  1034  		VMOVDQU B3, (16*3)(ctx)
  1035  		VPSHUFB BSWAP, B3, B3
  1036  		VMOVDQU B4, (16*4)(ctx)
  1037  		VPSHUFB BSWAP, B4, B4
  1038  		VMOVDQU B5, (16*5)(ctx)
  1039  		VPSHUFB BSWAP, B5, B5
  1040  		VMOVDQU B6, (16*6)(ctx)
  1041  		VPSHUFB BSWAP, B6, B6
  1042  		VMOVDQU B7, (16*7)(ctx)
  1043  		VPSHUFB BSWAP, B7, B7
  1044  
  1045  		VPXOR ACC0, B0, B0
  1046  		VMOVDQU B0, (16*0)(SP)
  1047  		VMOVDQU B1, (16*1)(SP)
  1048  		VMOVDQU B2, (16*2)(SP)
  1049  		VMOVDQU B3, (16*3)(SP)
  1050  		VMOVDQU B4, (16*4)(SP)
  1051  		VMOVDQU B5, (16*5)(SP)
  1052  		VMOVDQU B6, (16*6)(SP)
  1053  		VMOVDQU B7, (16*7)(SP)
  1054  
  1055  		LEAQ 128(ptx), ptx
  1056  		LEAQ 128(ctx), ctx	
  1057  
  1058  		JMP avxGcmSm4EncOctetsLoop
  1059  
  1060  avxGcmSm4EncOctetsEnd:
  1061  	VMOVDQU (16*0)(SP), T0
  1062  	VMOVDQU (16*0)(pTbl), ACC0
  1063  	VMOVDQU (16*1)(pTbl), ACCM
  1064  	VMOVDQU ACC0, ACC1
  1065  	VPSHUFD $78, T0, T1
  1066  	VPXOR T0, T1, T1
  1067  	VPCLMULQDQ $0x00, T0, ACC0, ACC0
  1068  	VPCLMULQDQ $0x11, T0, ACC1, ACC1
  1069  	VPCLMULQDQ $0x00, T1, ACCM, ACCM
  1070  
  1071  	avxMulRound(1)
  1072  	avxMulRound(2)
  1073  	avxMulRound(3)
  1074  	avxMulRound(4)
  1075  	avxMulRound(5)
  1076  	avxMulRound(6)
  1077  	avxMulRound(7)
  1078  
  1079  	VPXOR ACC0, ACCM, ACCM
  1080  	VPXOR ACC1, ACCM, ACCM
  1081  	VPSLLDQ $8, ACCM, T0
  1082  	VPSRLDQ $8, ACCM, ACCM
  1083  	
  1084  	VPXOR ACCM, ACC1, ACC1
  1085  	VPXOR T0, ACC0, ACC0
  1086  
  1087  	avxReduceRound(ACC0)
  1088  	avxReduceRound(ACC0)
  1089  	VPXOR ACC1, ACC0, ACC0
  1090  
  1091  	TESTQ ptxLen, ptxLen
  1092  	JE avxGcmSm4EncDone
  1093  
  1094  	SUBQ $4, aluCTR
  1095  
  1096  avxGcmSm4EncNibbles:
  1097  	CMPQ ptxLen, $64
  1098  	JBE avxGcmSm4EncSingles
  1099  	SUBQ $64, ptxLen
  1100  	
  1101  	// load 4 ctrs for encryption
  1102  	VMOVDQU (8*16 + 0*16)(SP), B0
  1103  	VMOVDQU (8*16 + 1*16)(SP), B1
  1104  	VMOVDQU (8*16 + 2*16)(SP), B2
  1105  	VMOVDQU (8*16 + 3*16)(SP), B3
  1106  
  1107  	AVX_SM4_4BLOCKS_WO_BS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
  1108  	// XOR plaintext
  1109  	VPXOR (16*0)(ptx), B0, B0
  1110  	VPXOR (16*1)(ptx), B1, B1
  1111  	VPXOR (16*2)(ptx), B2, B2
  1112  	VPXOR (16*3)(ptx), B3, B3	
  1113  
  1114  	// Store ciphertext
  1115  	VMOVDQU B0, (16*0)(ctx)
  1116  	VMOVDQU B1, (16*1)(ctx)
  1117  	VMOVDQU B2, (16*2)(ctx)
  1118  	VMOVDQU B3, (16*3)(ctx)
  1119  
  1120  	VMOVDQU (16*14)(pTbl), T2
  1121  	increment(0)
  1122  	avxGcmEncDataStep(B0)
  1123  	increment(1)
  1124  	avxGcmEncDataStep(B1)
  1125  	increment(2)
  1126  	avxGcmEncDataStep(B2)
  1127  	increment(3)
  1128  	avxGcmEncDataStep(B3)
  1129  	
  1130  	LEAQ 64(ptx), ptx
  1131  	LEAQ 64(ctx), ctx
  1132  
  1133  avxGcmSm4EncSingles:
  1134  	TESTQ ptxLen, ptxLen
  1135  	JE avxGcmSm4EncDone
  1136  
  1137  	VMOVDQU (8*16 + 0*16)(SP), B0
  1138  	VMOVDQU (8*16 + 1*16)(SP), B1
  1139  	VMOVDQU (8*16 + 2*16)(SP), B2
  1140  	VMOVDQU (8*16 + 3*16)(SP), B3
  1141  
  1142  	AVX_SM4_4BLOCKS_WO_BS(rk, B6, B7, T1, T2, B0, B1, B2, B3)
  1143  	VMOVDQU B0, (16*0)(SP)
  1144  	VMOVDQU B1, (16*1)(SP)
  1145  	VMOVDQU B2, (16*2)(SP)
  1146  	VMOVDQU B3, (16*3)(SP)
  1147  
  1148  	VMOVDQU (16*14)(pTbl), T2
  1149  	MOVQ SP, BP
  1150  
  1151  avxGcmSm4EncSinglesLoop:
  1152  		CMPQ ptxLen, $16
  1153  		JB avxGcmSm4EncTail
  1154  		SUBQ $16, ptxLen
  1155  		VMOVDQU (16*0)(BP), B0
  1156  		VMOVDQU (ptx), T0
  1157  		VPXOR T0, B0, B0
  1158  		VMOVDQU B0, (ctx)
  1159  		avxGcmEncDataStep(B0)
  1160  		LEAQ (16*1)(ptx), ptx
  1161  		LEAQ (16*1)(ctx), ctx
  1162  		ADDQ $16, BP
  1163  	JMP avxGcmSm4EncSinglesLoop
  1164  
  1165  avxGcmSm4EncTail:
  1166  	TESTQ ptxLen, ptxLen
  1167  	JE avxGcmSm4EncDone
  1168  	VMOVDQU (16*0)(BP), B0
  1169  	VMOVDQU B0, T0
  1170  
  1171  	LEAQ -1(ptx)(ptxLen*1), ptx
  1172  
  1173  	MOVQ ptxLen, aluTMP
  1174  	SHLQ $4, aluTMP
  1175  
  1176  	LEAQ andMask<>(SB), aluCTR
  1177  	VMOVDQU -16(aluCTR)(aluTMP*1), T1
  1178  	VPXOR B0, B0, B0
  1179  
  1180  avxPtxLoadLoop:
  1181  		VPSLLDQ $1, B0, B0
  1182  		VPINSRB $0, (ptx), B0, B0
  1183  		LEAQ -1(ptx), ptx
  1184  		DECQ ptxLen
  1185  	JNE avxPtxLoadLoop
  1186  
  1187  	VPXOR T0, B0, B0
  1188  	VPAND T1, B0, B0
   1189  	VMOVDQU B0, (ctx)	// assumes there is always room here, since the tag follows the ciphertext
  1190  	avxGcmEncDataStep(B0)
  1191  
  1192  avxGcmSm4EncDone:
  1193  	VMOVDQU ACC0, (tPtr)
  1194  	RET
  1195  
  1196  avx2GcmSm4Enc:
  1197  	VMOVDQU bswap_mask<>(SB), BSWAP
  1198  	VMOVDQU gcmPoly<>(SB), POLY
  1199  
  1200  	VMOVDQU (tPtr), ACC0
  1201  	VPXOR ACC1, ACC1, ACC1
  1202  	VPXOR ACCM, ACCM, ACCM
  1203  	VMOVDQU (ctrPtr), T0
  1204  	VPSHUFB flip_mask<>(SB), T0, T0
  1205  	VPEXTRD $3, T0, aluCTR
  1206  
  1207  	VINSERTI128 $1, T0, Y11, Y11
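        	// Both 128-bit lanes of Y11 now hold the counter template, so each
        	// 32-byte store below seeds two counter blocks at once; increment()
        	// then patches in their individual counter words.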
  1208  	VMOVDQU Y11, (8*16 + 0*32)(SP)
  1209  	increment(0)
  1210  	increment(1)
  1211  	VMOVDQU Y11, (8*16 + 1*32)(SP)
  1212  	increment(2)
  1213  	increment(3)
  1214  
  1215  	CMPQ ptxLen, $128
  1216  	JB avx2GcmSm4EncNibbles
  1217  	SUBQ $128, ptxLen
  1218  
  1219  	// We have at least 8 blocks to encrypt, prepare the rest of the counters
  1220  	VMOVDQU Y11, (8*16 + 2*32)(SP)
  1221  	increment(4)
  1222  	increment(5)
  1223  	VMOVDQU Y11, (8*16 + 3*32)(SP)
  1224  	increment(6)
  1225  	increment(7)
  1226  
  1227  	VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
  1228  	// load 8 ctrs for encryption
  1229  	VMOVDQU (4*32 + 0*32)(SP), DWB0
  1230  	VMOVDQU (4*32 + 1*32)(SP), DWB1
  1231  	VMOVDQU (4*32 + 2*32)(SP), DWB2
  1232  	VMOVDQU (4*32 + 3*32)(SP), DWB3
  1233  
  1234  	increment(0)
   1235  	// Transpose the 4x4 matrix of 32-bit words
  1236  	TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
  1237  	
  1238  	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
  1239  	increment(1)
  1240  	AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
  1241  	increment(2)
   1242  	// Transpose the 4x4 matrix of 32-bit words
  1243  	TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
  1244  	
  1245  	VPSHUFB DWBSWAP, DWB0, DWB0
  1246  	VPSHUFB DWBSWAP, DWB1, DWB1
  1247  	increment(3)
  1248  	VPSHUFB DWBSWAP, DWB2, DWB2
  1249  	VPSHUFB DWBSWAP, DWB3, DWB3
  1250  	increment(4)
  1251  	
  1252  	// XOR plaintext
  1253  	VMOVDQU (32*0)(ptx), XDWTMP0
  1254  	VPXOR XDWTMP0, DWB0, DWB0
  1255  	VMOVDQU (32*1)(ptx), XDWTMP0
  1256  	VPXOR XDWTMP0, DWB1, DWB1
  1257  	increment(5)
  1258  	VMOVDQU (32*2)(ptx), XDWTMP0
  1259  	VPXOR XDWTMP0, DWB2, DWB2
  1260  	VMOVDQU (32*3)(ptx), XDWTMP0
  1261  	VPXOR XDWTMP0, DWB3, DWB3
  1262  	increment(6)
  1263  	
  1264  	// Store ciphertext
  1265  	VMOVDQU DWB0, (32*0)(ctx)
  1266  	VPSHUFB DWBSWAP, DWB0, DWB0
  1267  	VMOVDQU DWB1, (32*1)(ctx)
  1268  	VPSHUFB DWBSWAP, DWB1, DWB1
  1269  	VMOVDQU DWB2, (32*2)(ctx)
  1270  	VPSHUFB DWBSWAP, DWB2, DWB2
  1271  	VMOVDQU DWB3, (32*3)(ctx)
  1272  	VPSHUFB DWBSWAP, DWB3, DWB3
  1273  	increment(7)
  1274  	//VPXOR XDWTMP0, XDWTMP0, XDWTMP0
  1275  	//VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0
  1276  	//VPXOR XDWTMP0, DWB0, DWB0
   1277  	PXOR ACC0, B0  // can't use VPXOR here: its VEX encoding would zero the upper lane of DWB0 (Y0)
  1278  	VMOVDQU DWB0, (32*0)(SP)
  1279  	VMOVDQU DWB1, (32*1)(SP)
  1280  	VMOVDQU DWB2, (32*2)(SP)
  1281  	VMOVDQU DWB3, (32*3)(SP)
  1282  
  1283  	LEAQ 128(ptx), ptx
  1284  	LEAQ 128(ctx), ctx
  1285  
  1286  avx2GcmSm4EncOctetsLoop:
  1287  		CMPQ ptxLen, $128
  1288  		JB avx2GcmSm4EncOctetsEnd
  1289  		SUBQ $128, ptxLen
  1290  
  1291  		// load 8 ctrs for encryption
  1292  		VMOVDQU (4*32 + 0*32)(SP), DWB0
  1293  		VMOVDQU (4*32 + 1*32)(SP), DWB1
  1294  		VMOVDQU (4*32 + 2*32)(SP), DWB2
  1295  		VMOVDQU (4*32 + 3*32)(SP), DWB3
  1296  
  1297  		VMOVDQU (16*0)(SP), T0
  1298  		VPSHUFD $78, T0, T1
  1299  		VPXOR T0, T1, T1
  1300  
  1301  		VMOVDQU (16*0)(pTbl), ACC1
  1302  		VMOVDQU (16*1)(pTbl), ACCM
  1303  
  1304  		VPCLMULQDQ $0x00, T1, ACCM, ACCM
  1305  		VPCLMULQDQ $0x00, T0, ACC1, ACC0
  1306  		VPCLMULQDQ $0x11, T0, ACC1, ACC1
  1307  
   1308  		// Transpose the 4x4 matrix of 32-bit words
  1309  		TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
  1310  
  1311  		AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
  1312  
   1313  		// Transpose the 4x4 matrix of 32-bit words
  1314  		TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
  1315  
  1316  		VPSHUFB DWBSWAP, DWB0, DWB0
  1317  		VPSHUFB DWBSWAP, DWB1, DWB1
  1318  		VPSHUFB DWBSWAP, DWB2, DWB2
  1319  		VPSHUFB DWBSWAP, DWB3, DWB3
  1320  
  1321  		avxMulRound(1)
  1322  		increment(0)
  1323  		avxMulRound(2)
  1324  		increment(1)
  1325  		avxMulRound(3)
  1326  		increment(2)
  1327  	 	avxMulRound(4)
  1328  		increment(3)
  1329  		avxMulRound(5)
  1330  		increment(4)
  1331  		avxMulRound(6)
  1332  		increment(5)
  1333  	 	avxMulRound(7)
  1334  		increment(6)
  1335  		
  1336  		VPXOR ACC0, ACCM, ACCM
  1337  		VPXOR ACC1, ACCM, ACCM
  1338  		VPSLLDQ $8, ACCM, T0
  1339  		VPSRLDQ $8, ACCM, ACCM
  1340  		
  1341  		VPXOR ACCM, ACC1, ACC1
  1342  		VPXOR T0, ACC0, ACC0
  1343  
  1344  		increment(7)
  1345  		avxReduceRound(ACC0)
  1346  		avxReduceRound(ACC0)
  1347  		VPXOR ACC1, ACC0, ACC0
  1348  
  1349  		// XOR plaintext
  1350  		VPXOR (32*0)(ptx), DWB0, DWB0
  1351  		VPXOR (32*1)(ptx), DWB1, DWB1
  1352  		VPXOR (32*2)(ptx), DWB2, DWB2
  1353  		VPXOR (32*3)(ptx), DWB3, DWB3
  1354  
  1355  		// Store ciphertext
  1356  		VMOVDQU DWB0, (32*0)(ctx)
  1357  		VPSHUFB DWBSWAP, DWB0, DWB0
  1358  		VMOVDQU DWB1, (32*1)(ctx)
  1359  		VPSHUFB DWBSWAP, DWB1, DWB1
  1360  		VMOVDQU DWB2, (32*2)(ctx)
  1361  		VPSHUFB DWBSWAP, DWB2, DWB2
  1362  		VMOVDQU DWB3, (32*3)(ctx)
  1363  		VPSHUFB DWBSWAP, DWB3, DWB3
  1364  
  1365  		//VPXOR XDWTMP0, XDWTMP0, XDWTMP0
  1366  		//VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0
  1367  		//VPXOR XDWTMP0, DWB0, DWB0
   1368  		PXOR ACC0, B0  // can't use VPXOR here: its VEX encoding would zero the upper lane of DWB0 (Y0)
  1369  		VMOVDQU DWB0, (32*0)(SP)
  1370  		VMOVDQU DWB1, (32*1)(SP)
  1371  		VMOVDQU DWB2, (32*2)(SP)
  1372  		VMOVDQU DWB3, (32*3)(SP)
  1373  
  1374  		LEAQ 128(ptx), ptx
  1375  		LEAQ 128(ctx), ctx
  1376  
  1377  		JMP avx2GcmSm4EncOctetsLoop
  1378  
  1379  avx2GcmSm4EncOctetsEnd:
  1380  	VMOVDQU (16*0)(SP), T0
  1381  	VMOVDQU (16*0)(pTbl), ACC0
  1382  	VMOVDQU (16*1)(pTbl), ACCM
  1383  	VMOVDQU ACC0, ACC1
  1384  	VPSHUFD $78, T0, T1
  1385  	VPXOR T0, T1, T1
  1386  	VPCLMULQDQ $0x00, T0, ACC0, ACC0
  1387  	VPCLMULQDQ $0x11, T0, ACC1, ACC1
  1388  	VPCLMULQDQ $0x00, T1, ACCM, ACCM
  1389  
  1390  	avxMulRound(1)
  1391  	avxMulRound(2)
  1392  	avxMulRound(3)
  1393  	avxMulRound(4)
  1394  	avxMulRound(5)
  1395  	avxMulRound(6)
  1396  	avxMulRound(7)
  1397  
  1398  	VPXOR ACC0, ACCM, ACCM
  1399  	VPXOR ACC1, ACCM, ACCM
  1400  	VPSLLDQ $8, ACCM, T0
  1401  	VPSRLDQ $8, ACCM, ACCM
  1402  	
  1403  	VPXOR ACCM, ACC1, ACC1
  1404  	VPXOR T0, ACC0, ACC0
  1405  
  1406  	avxReduceRound(ACC0)
  1407  	avxReduceRound(ACC0)
  1408  	VPXOR ACC1, ACC0, ACC0
  1409  
  1410  	TESTQ ptxLen, ptxLen
  1411  	JE avx2GcmSm4EncDone
  1412  
  1413  	SUBQ $4, aluCTR
  1414  
  1415  avx2GcmSm4EncNibbles:
  1416  	CMPQ ptxLen, $64
  1417  	JBE avx2GcmSm4EncSingles
  1418  	SUBQ $64, ptxLen
  1419  
  1420  	VMOVDQU (8*16 + 0*16)(SP), B0
  1421  	VMOVDQU (8*16 + 1*16)(SP), B1
  1422  	VMOVDQU (8*16 + 2*16)(SP), B2
  1423  	VMOVDQU (8*16 + 3*16)(SP), B3
  1424  	
  1425  	AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
  1426  
  1427  	VPXOR (16*0)(ptx), B0, B0
  1428  	VPXOR (16*1)(ptx), B1, B1
  1429  	VPXOR (16*2)(ptx), B2, B2
  1430  	VPXOR (16*3)(ptx), B3, B3
  1431  
  1432  	VMOVDQU B0, (16*0)(ctx)
  1433  	VMOVDQU B1, (16*1)(ctx)
  1434  	VMOVDQU B2, (16*2)(ctx)
  1435  	VMOVDQU B3, (16*3)(ctx)
  1436  
  1437  	VMOVDQU (16*14)(pTbl), T2
  1438  	avxGcmEncDataStep(B0)
  1439  	increment(0)
  1440  	avxGcmEncDataStep(B1)
  1441  	increment(1)
  1442  	avxGcmEncDataStep(B2)
  1443  	increment(2)
  1444  	avxGcmEncDataStep(B3)
  1445  	increment(3)
  1446  
  1447  	LEAQ 64(ptx), ptx
  1448  	LEAQ 64(ctx), ctx
  1449  
  1450  avx2GcmSm4EncSingles:
  1451  	TESTQ ptxLen, ptxLen
  1452  	JE avx2GcmSm4EncDone
  1453  
  1454  	VMOVDQU (8*16 + 0*16)(SP), B0
  1455  	VMOVDQU (8*16 + 1*16)(SP), B1
  1456  	VMOVDQU (8*16 + 2*16)(SP), B2
  1457  	VMOVDQU (8*16 + 3*16)(SP), B3
  1458  
  1459  	AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
  1460  
  1461  	VMOVDQU B0, (16*0)(SP)
  1462  	VMOVDQU B1, (16*1)(SP)
  1463  	VMOVDQU B2, (16*2)(SP)
  1464  	VMOVDQU B3, (16*3)(SP)
  1465  
  1466  	VMOVDQU (16*14)(pTbl), T2
  1467  	MOVQ SP, BP
  1468  
  1469  avx2GcmSm4EncSinglesLoop:
  1470  		CMPQ ptxLen, $16
  1471  		JB avx2GcmSm4EncTail
  1472  		SUBQ $16, ptxLen
  1473  		VMOVDQU (16*0)(BP), B0
  1474  		VMOVDQU (ptx), T0
  1475  		VPXOR T0, B0, B0
  1476  		VMOVDQU B0, (ctx)
  1477  		avxGcmEncDataStep(B0)
  1478  		LEAQ (16*1)(ptx), ptx
  1479  		LEAQ (16*1)(ctx), ctx
  1480  		ADDQ $16, BP
  1481  	JMP avx2GcmSm4EncSinglesLoop
  1482  
  1483  avx2GcmSm4EncTail:
  1484  	TESTQ ptxLen, ptxLen
  1485  	JE avx2GcmSm4EncDone
  1486  	VMOVDQU (16*0)(BP), B0
  1487  	VMOVDQU B0, T0
  1488  
  1489  	LEAQ -1(ptx)(ptxLen*1), ptx
  1490  
  1491  	MOVQ ptxLen, aluTMP
  1492  	SHLQ $4, aluTMP
  1493  
  1494  	LEAQ andMask<>(SB), aluCTR
  1495  	VMOVDQU -16(aluCTR)(aluTMP*1), T1
  1496  	VPXOR B0, B0, B0
  1497  
  1498  avx2PtxLoadLoop:
  1499  		VPSLLDQ $1, B0, B0
  1500  		VPINSRB $0, (ptx), B0, B0
  1501  		LEAQ -1(ptx), ptx
  1502  		DECQ ptxLen
  1503  	JNE avx2PtxLoadLoop
  1504  
  1505  	VPXOR T0, B0, B0
  1506  	VPAND T1, B0, B0
   1507  	VMOVDQU B0, (ctx)	// assumes there is always room here, since the tag follows the ciphertext
  1508  	avxGcmEncDataStep(B0)
  1509  
  1510  avx2GcmSm4EncDone:
  1511  	VMOVDQU ACC0, (tPtr)
  1512  	VZEROUPPER
  1513  	RET
  1514  
  1515  #undef increment
  1516  
  1517  // func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
  1518  TEXT ·gcmSm4Dec(SB),0,$128-96
  1519  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, (3*4 + i*16)(SP)
  1520  
  1521  #define decMulRound(i) \
  1522  	MOVOU (16*i)(ctx), T0;\
  1523  	PSHUFB BSWAP, T0;\
  1524  	internalDecMulRound(i)
  1525  
  1526  #define internalDecMulRound(i) \
  1527  	MOVOU (16*(i*2))(pTbl), T1;\
  1528  	MOVOU T1, T2;\
  1529  	PCLMULQDQ $0x00, T0, T1;\
  1530  	PXOR T1, ACC0;\
  1531  	PSHUFD $78, T0, T1;\
  1532  	PCLMULQDQ $0x11, T0, T2;\
  1533  	PXOR T1, T0;\
  1534  	PXOR T2, ACC1;\
  1535  	MOVOU (16*(i*2+1))(pTbl), T2;\
  1536  	PCLMULQDQ $0x00, T2, T0;\
  1537  	PXOR T0, ACCM
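        // During decryption the ciphertext is hashed straight from ctx, so
        // decMulRound byte-reflects the input block itself instead of reading a
        // saved copy from the stack as the encryption path does.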
  1538  
  1539  #define decGhashRound(i) \
  1540  		MOVOU (16*i)(ctx), B0; \
  1541  		internalDecGhashRound()
  1542  
  1543  #define internalDecGhashRound() \
  1544  		PSHUFB BSWAP, B0; \
  1545  		PXOR ACC0, B0; \
  1546  		MOVOU T2, ACC0; \
  1547  		MOVOU T2, ACC1; \
  1548  		MOVOU (16*15)(pTbl), ACCM; \
  1549  		PCLMULQDQ $0x00, B0, ACC0; \
  1550  		PCLMULQDQ $0x11, B0, ACC1; \
  1551  		PSHUFD $78, B0, T0; \
  1552  		PXOR B0, T0; \
  1553  		PCLMULQDQ $0x00, T0, ACCM; \
  1554  		PXOR ACC0, ACCM; \
  1555  		PXOR ACC1, ACCM; \
  1556  		MOVOU ACCM, T0; \
  1557  		PSRLDQ $8, ACCM; \
  1558  		PSLLDQ $8, T0; \
  1559  		PXOR ACCM, ACC1; \
  1560  		PXOR T0, ACC0; \
  1561  		reduceRound(ACC0); \
  1562  		reduceRound(ACC0); \
  1563  		PXOR ACC1, ACC0
  1564  
  1565  #define avxDecMulRound(i) \
  1566  	VMOVDQU (16*i)(ctx), T0;\
  1567  	VPSHUFB BSWAP, T0, T0;\
  1568  	VMOVDQU (16*(i*2))(pTbl), T2;\
  1569  	VPCLMULQDQ $0x00, T0, T2, T1;\
  1570  	VPXOR T1, ACC0, ACC0;\
  1571  	VPSHUFD $78, T0, T1;\
  1572  	VPCLMULQDQ $0x11, T0, T2, T2;\
  1573  	VPXOR T1, T0, T0;\
  1574  	VPXOR T2, ACC1, ACC1;\
  1575  	VMOVDQU (16*(i*2+1))(pTbl), T2;\
  1576  	VPCLMULQDQ $0x00, T2, T0, T0;\
  1577  	VPXOR T0, ACCM, ACCM
  1578  
  1579  #define internalAvxDecGhashRound() \
  1580  		VPSHUFB BSWAP, B0, B0; \
  1581  		VPXOR ACC0, B0, B0; \
  1582  		VMOVDQU (16*15)(pTbl), ACCM; \
  1583  		VPCLMULQDQ $0x00, B0, T2, ACC0; \
  1584  		VPCLMULQDQ $0x11, B0, T2, ACC1; \
  1585  		VPSHUFD $78, B0, T0; \
  1586  		VPXOR B0, T0, T0; \
  1587  		VPCLMULQDQ $0x00, T0, ACCM, ACCM; \
  1588  		VPXOR ACC0, ACCM, ACCM; \
  1589  		VPXOR ACC1, ACCM, ACCM; \
  1590  		VPSLLDQ $8, ACCM, T0; \
  1591  		VPSRLDQ $8, ACCM, ACCM; \
  1592  		VPXOR ACCM, ACC1, ACC1; \
  1593  		VPXOR T0, ACC0, ACC0; \
  1594  		avxReduceRound(ACC0); \
  1595  		avxReduceRound(ACC0); \
  1596  		VPXOR ACC1, ACC0, ACC0
  1597  
  1598  	MOVQ productTable+0(FP), pTbl
  1599  	MOVQ dst+8(FP), ptx
  1600  	MOVQ src_base+32(FP), ctx
  1601  	MOVQ src_len+40(FP), ptxLen
  1602  	MOVQ ctr+56(FP), ctrPtr
  1603  	MOVQ T+64(FP), tPtr
  1604  	MOVQ rk_base+72(FP), rk
  1605  
  1606  	CMPB ·useAVX2(SB), $1
  1607  	JE   avx2GcmSm4Dec
  1608  
  1609  	CMPB ·useAVX(SB), $1
  1610  	JE   avxGcmSm4Dec
  1611  
  1612  	MOVOU bswap_mask<>(SB), BSWAP
  1613  	MOVOU gcmPoly<>(SB), POLY
  1614  
  1615  	MOVOU (tPtr), ACC0
  1616  	PXOR ACC1, ACC1
  1617  	PXOR ACCM, ACCM
  1618  	MOVOU (ctrPtr), T0
  1619  	PSHUFB flip_mask<>(SB), T0
  1620  	PEXTRD $3, T0, aluCTR
  1621  
  1622  	MOVOU T0, (0*16)(SP)
  1623  	increment(0)
  1624  	MOVOU T0, (1*16)(SP)
  1625  	increment(1)
  1626  	MOVOU T0, (2*16)(SP)
  1627  	increment(2)
  1628  	MOVOU T0, (3*16)(SP)
  1629  	increment(3)
  1630  
  1631  	CMPQ ptxLen, $128
  1632  	JB gcmSm4DecNibbles
  1633  
   1634  	// We have at least 8 blocks to decrypt, prepare the rest of the counters
  1635  	MOVOU T0, (4*16)(SP)
  1636  	increment(4)
  1637  	MOVOU T0, (5*16)(SP)
  1638  	increment(5)
  1639  	MOVOU T0, (6*16)(SP)
  1640  	increment(6)
  1641  	MOVOU T0, (7*16)(SP)
  1642  	increment(7)
  1643  
  1644  gcmSm4DecOctetsLoop:
  1645  		CMPQ ptxLen, $128
  1646  		JB gcmSm4DecEndOctets
  1647  		SUBQ $128, ptxLen
  1648  
  1649  		MOVOU (0*16)(SP), B0
  1650  		MOVOU (1*16)(SP), B1
  1651  		MOVOU (2*16)(SP), B2
  1652  		MOVOU (3*16)(SP), B3
  1653  		MOVOU (4*16)(SP), B4
  1654  		MOVOU (5*16)(SP), B5
  1655  		MOVOU (6*16)(SP), B6
  1656  		MOVOU (7*16)(SP), B7
  1657  
  1658  		MOVOU (16*0)(ctx), T0
  1659  		PSHUFB BSWAP, T0
  1660  		PXOR ACC0, T0
  1661  		PSHUFD $78, T0, T1
  1662  		PXOR T0, T1
  1663  
  1664  		MOVOU (16*0)(pTbl), ACC0
  1665  		MOVOU (16*1)(pTbl), ACCM
  1666  		MOVOU ACC0, ACC1
  1667  
  1668  		PCLMULQDQ $0x00, T1, ACCM
  1669  		PCLMULQDQ $0x00, T0, ACC0
  1670  		PCLMULQDQ $0x11, T0, ACC1
  1671  
  1672  		decMulRound(1)
  1673  		increment(0)
  1674  		decMulRound(2)
  1675  		increment(1)
  1676  		decMulRound(3)
  1677  		increment(2)
  1678  	 	decMulRound(4)
  1679  		increment(3)
  1680  		decMulRound(5)
  1681  		increment(4)
  1682  		decMulRound(6)
  1683  		increment(5)
  1684  	 	decMulRound(7)
  1685  		increment(6)
  1686  		increment(7)
  1687  
  1688  		PXOR ACC0, ACCM
  1689  		PXOR ACC1, ACCM
  1690  		MOVOU ACCM, T0
  1691  		PSRLDQ $8, ACCM
  1692  		PSLLDQ $8, T0
  1693  		PXOR ACCM, ACC1
  1694  		PXOR T0, ACC0
  1695  
  1696  		reduceRound(ACC0)
  1697  		reduceRound(ACC0)
  1698  		PXOR ACC1, ACC0
  1699  
  1700  		SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
  1701  
  1702  		MOVOU (16*0)(ctx), T0 
  1703  		PXOR T0, B0
  1704  		MOVOU (16*1)(ctx), T0
  1705  		PXOR T0, B1
  1706  		MOVOU (16*2)(ctx), T0
  1707  		PXOR T0, B2
  1708  		MOVOU (16*3)(ctx), T0
  1709  		PXOR T0, B3
  1710  		MOVOU (16*4)(ctx), T0
  1711  		PXOR T0, B4
  1712  		MOVOU (16*5)(ctx), T0
  1713  		PXOR T0, B5
  1714  		MOVOU (16*6)(ctx), T0
  1715  		PXOR T0, B6
  1716  		MOVOU (16*7)(ctx), T0
  1717  		PXOR T0, B7
  1718  
  1719  		MOVOU B0, (16*0)(ptx)
  1720  		MOVOU B1, (16*1)(ptx)
  1721  		MOVOU B2, (16*2)(ptx)
  1722  		MOVOU B3, (16*3)(ptx)
  1723  		MOVOU B4, (16*4)(ptx)
  1724  		MOVOU B5, (16*5)(ptx)
  1725  		MOVOU B6, (16*6)(ptx)
  1726  		MOVOU B7, (16*7)(ptx)
  1727  
  1728  		LEAQ 128(ptx), ptx
  1729  		LEAQ 128(ctx), ctx
  1730  
  1731  		JMP gcmSm4DecOctetsLoop
  1732  
  1733  gcmSm4DecEndOctets:
  1734  	SUBQ $4, aluCTR
  1735  
  1736  gcmSm4DecNibbles:
  1737  	CMPQ ptxLen, $64
  1738  	JBE gcmSm4DecSingles
  1739  	SUBQ $64, ptxLen
  1740  
  1741  	MOVOU (0*16)(SP), B4
  1742  	MOVOU (1*16)(SP), B5
  1743  	MOVOU (2*16)(SP), B6
  1744  	MOVOU (3*16)(SP), B7
  1745  
  1746  	SM4_4BLOCKS_WO_BS(rk, B0, T0, T1, T2, B4, B5, B6, B7)
  1747  	MOVOU (16*14)(pTbl), T2
  1748  
  1749  	MOVOU (16*0)(ctx), B0
  1750  	PXOR B0, B4
  1751  	internalDecGhashRound()
  1752  	increment(0)
  1753  	MOVOU (16*1)(ctx), B0
  1754  	PXOR B0, B5
  1755  	internalDecGhashRound()
  1756  	increment(1)
  1757  	MOVOU (16*2)(ctx), B0
  1758  	PXOR B0, B6
  1759  	internalDecGhashRound()
  1760  	increment(2)
  1761  	MOVOU (16*3)(ctx), B0
  1762  	PXOR B0, B7
  1763  	internalDecGhashRound()
  1764  	increment(3)
  1765  
  1766  	MOVOU B4, (16*0)(ptx)
  1767  	MOVOU B5, (16*1)(ptx)
  1768  	MOVOU B6, (16*2)(ptx)
  1769  	MOVOU B7, (16*3)(ptx)
  1770  
  1771  	LEAQ 64(ptx), ptx
  1772  	LEAQ 64(ctx), ctx
  1773  
  1774  gcmSm4DecSingles:
  1775  	TESTQ ptxLen, ptxLen
  1776  	JE gcmSm4DecDone
  1777  	MOVOU (0*16)(SP), B0
  1778  	MOVOU (1*16)(SP), B1
  1779  	MOVOU (2*16)(SP), B2
  1780  	MOVOU (3*16)(SP), B3
  1781  	
  1782  	SM4_4BLOCKS_WO_BS(rk, B4, T0, T1, T2, B0, B1, B2, B3)
  1783  	MOVOU B0, (16*4)(SP)
  1784  	MOVOU B1, (16*5)(SP)
  1785  	MOVOU B2, (16*6)(SP)
  1786  	MOVOU B3, (16*7)(SP)
  1787  
  1788  	MOVOU (16*14)(pTbl), T2
  1789  	MOVQ SP, BP
  1790  	ADDQ $64, BP
  1791  
  1792  gcmSm4DecSinglesLoop:
  1793  		CMPQ ptxLen, $16
  1794  		JB gcmSm4DecTail
  1795  		SUBQ $16, ptxLen
  1796  
  1797  		MOVOU (16*0)(BP), B1
  1798  		MOVOU (ctx), T0
  1799  		PXOR T0, B1
  1800  		
  1801  		decGhashRound(0)
  1802  		MOVOU B1, (ptx)
  1803  
  1804  		LEAQ (16*1)(ptx), ptx
  1805  		LEAQ (16*1)(ctx), ctx
  1806  		ADDQ $16, BP
  1807  	JMP gcmSm4DecSinglesLoop		
  1808  
  1809  gcmSm4DecTail:
  1810  	TESTQ ptxLen, ptxLen
  1811  	JE gcmSm4DecDone
  1812  
  1813  	MOVQ ptxLen, aluTMP
  1814  	SHLQ $4, aluTMP
  1815  	LEAQ andMask<>(SB), aluCTR
  1816  	MOVOU -16(aluCTR)(aluTMP*1), T1
  1817  
   1818  	MOVOU (ctx), B0	// assumes the tag is appended to the ciphertext, so this full-block read does not overflow
  1819  	PAND T1, B0
  1820  
  1821  	MOVOU B0, T1
  1822  	internalDecGhashRound()
  1823  
  1824  	MOVOU (16*0)(BP), B0
  1825  	PXOR T1, B0
  1826  
  1827  ptxStoreLoop:
  1828  		PEXTRB $0, B0, (ptx)
  1829  		PSRLDQ $1, B0
  1830  		LEAQ 1(ptx), ptx
  1831  		DECQ ptxLen
  1832  
  1833  	JNE ptxStoreLoop
  1834  
  1835  gcmSm4DecDone:
  1836  	MOVOU ACC0, (tPtr)
  1837  	RET
  1838  
  1839  avxGcmSm4Dec:
  1840  	VMOVDQU bswap_mask<>(SB), BSWAP
  1841  	VMOVDQU gcmPoly<>(SB), POLY
  1842  
  1843  	VMOVDQU (tPtr), ACC0
  1844  	VPXOR ACC1, ACC1, ACC1
  1845  	VPXOR ACCM, ACCM, ACCM
  1846  	VMOVDQU (ctrPtr), T0
  1847  	VPSHUFB flip_mask<>(SB), T0, T0
  1848  	VPEXTRD $3, T0, aluCTR
  1849  
  1850  	VMOVDQU T0, (0*16)(SP)
  1851  	increment(0)
  1852  	VMOVDQU T0, (1*16)(SP)
  1853  	increment(1)
  1854  	VMOVDQU T0, (2*16)(SP)
  1855  	increment(2)
  1856  	VMOVDQU T0, (3*16)(SP)
  1857  	increment(3)
  1858  
  1859  	CMPQ ptxLen, $128
  1860  	JB avxGcmSm4DecNibbles
  1861  
   1862  	// We have at least 8 blocks to decrypt, prepare the rest of the counters
  1863  	VMOVDQU T0, (4*16)(SP)
  1864  	increment(4)
  1865  	VMOVDQU T0, (5*16)(SP)
  1866  	increment(5)
  1867  	VMOVDQU T0, (6*16)(SP)
  1868  	increment(6)
  1869  	VMOVDQU T0, (7*16)(SP)
  1870  	increment(7)
  1871  
  1872  avxGcmSm4DecOctetsLoop:
  1873  		CMPQ ptxLen, $128
  1874  		JB avxGcmSm4DecEndOctets
  1875  		SUBQ $128, ptxLen
  1876  
  1877  		VMOVDQU (0*16)(SP), B0
  1878  		VMOVDQU (1*16)(SP), B1
  1879  		VMOVDQU (2*16)(SP), B2
  1880  		VMOVDQU (3*16)(SP), B3
  1881  		VMOVDQU (4*16)(SP), B4
  1882  		VMOVDQU (5*16)(SP), B5
  1883  		VMOVDQU (6*16)(SP), B6
  1884  		VMOVDQU (7*16)(SP), B7
  1885  
  1886  		VMOVDQU (16*0)(ctx), T0
  1887  		VPSHUFB BSWAP, T0, T0
  1888  		VPXOR ACC0, T0, T0
  1889  		VPSHUFD $78, T0, T1
  1890  		VPXOR T0, T1, T1
  1891  
  1892  		VMOVDQU (16*0)(pTbl), ACC1
  1893  		VMOVDQU (16*1)(pTbl), ACCM
  1894  
  1895  		VPCLMULQDQ $0x00, T1, ACCM, ACCM
  1896  		VPCLMULQDQ $0x00, T0, ACC1, ACC0
  1897  		VPCLMULQDQ $0x11, T0, ACC1, ACC1
  1898  
  1899  		avxDecMulRound(1)
  1900  		increment(0)
  1901  		avxDecMulRound(2)
  1902  		increment(1)
  1903  		avxDecMulRound(3)
  1904  		increment(2)
  1905  	 	avxDecMulRound(4)
  1906  		increment(3)
  1907  		avxDecMulRound(5)
  1908  		increment(4)
  1909  		avxDecMulRound(6)
  1910  		increment(5)
  1911  	 	avxDecMulRound(7)
  1912  		increment(6)
  1913  		
  1914  		VPXOR ACC0, ACCM, ACCM
  1915  		VPXOR ACC1, ACCM, ACCM
  1916  
  1917  		VPSLLDQ $8, ACCM, T0
  1918  		VPSRLDQ $8, ACCM, ACCM
  1919  
  1920  		VPXOR ACCM, ACC1, ACC1
  1921  		VPXOR T0, ACC0, ACC0
  1922  
  1923  		increment(7)
  1924  		avxReduceRound(ACC0)
  1925  		avxReduceRound(ACC0)
  1926  		VPXOR ACC1, ACC0, ACC0
  1927  
  1928  		AVX_SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
  1929  
  1930  		VPXOR (16*0)(ctx), B0, B0
  1931  		VPXOR (16*1)(ctx), B1, B1
  1932  		VPXOR (16*2)(ctx), B2, B2
  1933  		VPXOR (16*3)(ctx), B3, B3
  1934  		VPXOR (16*4)(ctx), B4, B4
  1935  		VPXOR (16*5)(ctx), B5, B5
  1936  		VPXOR (16*6)(ctx), B6, B6
  1937  		VPXOR (16*7)(ctx), B7, B7
  1938  
  1939  		VMOVDQU B0, (16*0)(ptx)
  1940  		VMOVDQU B1, (16*1)(ptx)
  1941  		VMOVDQU B2, (16*2)(ptx)
  1942  		VMOVDQU B3, (16*3)(ptx)
  1943  		VMOVDQU B4, (16*4)(ptx)
  1944  		VMOVDQU B5, (16*5)(ptx)
  1945  		VMOVDQU B6, (16*6)(ptx)
  1946  		VMOVDQU B7, (16*7)(ptx)
  1947  
  1948  		LEAQ 128(ptx), ptx
  1949  		LEAQ 128(ctx), ctx
  1950  
  1951  		JMP avxGcmSm4DecOctetsLoop
  1952  
  1953  avxGcmSm4DecEndOctets:
  1954  	SUBQ $4, aluCTR
  1955  
  1956  avxGcmSm4DecNibbles:
  1957  	CMPQ ptxLen, $64
  1958  	JBE avxGcmSm4DecSingles
  1959  	SUBQ $64, ptxLen
  1960  
  1961  	VMOVDQU (0*16)(SP), B4
  1962  	VMOVDQU (1*16)(SP), B5
  1963  	VMOVDQU (2*16)(SP), B6
  1964  	VMOVDQU (3*16)(SP), B7
  1965  
  1966  	AVX_SM4_4BLOCKS_WO_BS(rk, B0, B1, T1, T2, B4, B5, B6, B7)
  1967  
  1968  	VMOVDQU (16*14)(pTbl), T2
  1969  	VMOVDQU (16*0)(ctx), B0
  1970  	VPXOR B0, B4, B4
  1971  	internalAvxDecGhashRound()
  1972  	increment(0)
  1973  
  1974  	VMOVDQU (16*1)(ctx), B0
  1975  	VPXOR B0, B5, B5
  1976  	internalAvxDecGhashRound()
  1977  	increment(1)
  1978  
  1979  	VMOVDQU (16*2)(ctx), B0
  1980  	VPXOR B0, B6, B6
  1981  	internalAvxDecGhashRound()
  1982  	increment(2)
  1983  
  1984  	VMOVDQU (16*3)(ctx), B0
  1985  	VPXOR B0, B7, B7
  1986  	internalAvxDecGhashRound()
  1987  	increment(3)
  1988  
  1989  	VMOVDQU B4, (16*0)(ptx)
  1990  	VMOVDQU B5, (16*1)(ptx)
  1991  	VMOVDQU B6, (16*2)(ptx)
  1992  	VMOVDQU B7, (16*3)(ptx)
  1993  
  1994  	LEAQ 64(ptx), ptx
  1995  	LEAQ 64(ctx), ctx
  1996  
  1997  avxGcmSm4DecSingles:
  1998  	TESTQ ptxLen, ptxLen
  1999  	JE avxGcmSm4DecDone
  2000  
  2001  	VMOVDQU (0*16)(SP), B0
  2002  	VMOVDQU (1*16)(SP), B1
  2003  	VMOVDQU (2*16)(SP), B2
  2004  	VMOVDQU (3*16)(SP), B3
  2005  	
  2006  	AVX_SM4_4BLOCKS_WO_BS(rk, B7, B6, B5, B4, B0, B1, B2, B3)
  2007  	VMOVDQU B0, (16*4)(SP)
  2008  	VMOVDQU B1, (16*5)(SP)
  2009  	VMOVDQU B2, (16*6)(SP)
  2010  	VMOVDQU B3, (16*7)(SP)
  2011  
  2012  	VMOVDQU (16*14)(pTbl), T2
  2013  	MOVQ SP, BP
  2014  	ADDQ $64, BP
  2015  
  2016  avxGcmSm4DecSinglesLoop:
  2017  		CMPQ ptxLen, $16
  2018  		JB avxGcmSm4DecTail
  2019  		SUBQ $16, ptxLen
  2020  
  2021  		VMOVDQU (16*0)(BP), T0
  2022  		VMOVDQU (ctx), B0
  2023  		VPXOR T0, B0, T0
  2024  		VMOVDQU T0, (ptx)
  2025  
  2026  		internalAvxDecGhashRound()
  2027  
  2028  		LEAQ (16*1)(ptx), ptx
  2029  		LEAQ (16*1)(ctx), ctx
  2030  		ADDQ $16, BP
  2031  	JMP avxGcmSm4DecSinglesLoop
  2032  
  2033  avxGcmSm4DecTail:
  2034  	TESTQ ptxLen, ptxLen
  2035  	JE avxGcmSm4DecDone
  2036  
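        	// A partial block of 1..15 bytes remains. Index the andMask table by
        	// ptxLen (each 16-byte entry keeps the low N bytes), mask the partial
        	// ciphertext block, fold it into GHASH, then XOR it with the buffered
        	// keystream block.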
  2037  	MOVQ ptxLen, aluTMP
  2038  	SHLQ $4, aluTMP
  2039  	LEAQ andMask<>(SB), aluCTR
  2040  	VMOVDQU -16(aluCTR)(aluTMP*1), T1
  2041  
  2042  	VMOVDQU (ctx), B0	// The tag is assumed to follow the ciphertext in ctx, so a full 16-byte read cannot overflow
  2043  	VPAND T1, B0, B0
  2044  
  2045  	VMOVDQU B0, T1
  2046  	internalAvxDecGhashRound()
  2047  
  2048  	VMOVDQU (16*0)(BP), B0
  2049  	VPXOR T1, B0, B0
  2050  
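        	// Store the recovered tail one byte at a time so no write goes past the
        	// end of the plaintext buffer.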
  2051  avxPtxStoreLoop:
  2052  		VPEXTRB $0, B0, (ptx)
  2053  		VPSRLDQ $1, B0, B0
  2054  		LEAQ 1(ptx), ptx
  2055  		DECQ ptxLen
  2056  
  2057  	JNE avxPtxStoreLoop
  2058  
  2059  avxGcmSm4DecDone:
  2060  	VMOVDQU ACC0, (tPtr)
  2061  	RET
  2062  
  2063  avx2GcmSm4Dec:
  2064  	VMOVDQU bswap_mask<>(SB), BSWAP
  2065  	VMOVDQU gcmPoly<>(SB), POLY
  2066  
  2067  	VMOVDQU (tPtr), ACC0
  2068  	VPXOR ACC1, ACC1, ACC1
  2069  	VPXOR ACCM, ACCM, ACCM
  2070  	VMOVDQU (ctrPtr), T0
  2071  	VPSHUFB flip_mask<>(SB), T0, T0
  2072  	VPEXTRD $3, T0, aluCTR
  2073  
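        	// Spill the (byte-reversed) initial counter block to seed the 32-byte
        	// stack slots; increment(n) appears to update only the 32-bit counter
        	// word of the n-th 16-byte slot.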
  2074  	VINSERTI128 $1, T0, Y11, Y11
  2075  	VMOVDQU Y11, (0*32)(SP)
  2076  	increment(0)
  2077  	increment(1)
  2078  	VMOVDQU Y11, (1*32)(SP)
  2079  	increment(2)
  2080  	increment(3)
  2081  
  2082  	CMPQ ptxLen, $128
  2083  	JB avx2GcmSm4DecNibbles
  2084  
  2085  	// We have at least 8 blocks to decrypt, prepare the rest of the counters
  2086  	VMOVDQU Y11, (2*32)(SP)
  2087  	increment(4)
  2088  	increment(5)
  2089  	VMOVDQU Y11, (3*32)(SP)
  2090  	increment(6)
  2091  	increment(7)
  2092  
  2093  	VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
  2094  	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
  2095  
  2096  avx2GcmSm4DecOctetsLoop:
  2097  		CMPQ ptxLen, $128
  2098  		JB avx2GcmSm4DecEndOctets
  2099  		SUBQ $128, ptxLen
  2100  
  2101  		// Load the 8 staged counter blocks for encryption
  2102  		VMOVDQU (0*32)(SP), DWB0
  2103  		VMOVDQU (1*32)(SP), DWB1
  2104  		VMOVDQU (2*32)(SP), DWB2
  2105  		VMOVDQU (3*32)(SP), DWB3
  2106  
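        		// The GHASH arithmetic below mirrors the AVX octet loop above: one
        		// Karatsuba multiply per ciphertext block, interleaved with counter
        		// increments, followed by recombination and two reduction rounds.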
  2107  		VMOVDQU (16*0)(ctx), T0
  2108  		VPSHUFB BSWAP, T0, T0
  2109  		VPXOR ACC0, T0, T0
  2110  		VPSHUFD $78, T0, T1
  2111  		VPXOR T0, T1, T1
  2112  
  2113  		VMOVDQU (16*0)(pTbl), ACC1
  2114  		VMOVDQU (16*1)(pTbl), ACCM
  2115  
  2116  		VPCLMULQDQ $0x00, T1, ACCM, ACCM
  2117  		VPCLMULQDQ $0x00, T0, ACC1, ACC0
  2118  		VPCLMULQDQ $0x11, T0, ACC1, ACC1
  2119  
  2120  		avxDecMulRound(1)
  2121  		increment(0)
  2122  		avxDecMulRound(2)
  2123  		increment(1)
  2124  		avxDecMulRound(3)
  2125  		increment(2)
  2126  		avxDecMulRound(4)
  2127  		increment(3)
  2128  		avxDecMulRound(5)
  2129  		increment(4)
  2130  		avxDecMulRound(6)
  2131  		increment(5)
  2132  		avxDecMulRound(7)
  2133  		increment(6)
  2134  		
  2135  		VPXOR ACC0, ACCM, ACCM
  2136  		VPXOR ACC1, ACCM, ACCM
  2137  		VPSLLDQ $8, ACCM, T0
  2138  		VPSRLDQ $8, ACCM, ACCM
  2139  		
  2140  		VPXOR ACCM, ACC1, ACC1
  2141  		VPXOR T0, ACC0, ACC0
  2142  		increment(7)
  2143  
  2144  		avxReduceRound(ACC0)
  2145  		avxReduceRound(ACC0)
  2146  		VPXOR ACC1, ACC0, ACC0
  2147  
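        		// The AVX2 SM4 kernel works on 32-bit words in column-major order, so
        		// the counter blocks are transposed into that layout, encrypted,
        		// transposed back and byte-swapped before being XORed with 128 bytes
        		// of ciphertext.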
  2148  		// Transpose the 4x4 matrix of 32-bit words
  2149  		TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
  2150  
  2151  		AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3)
  2152  
  2153  		// Transpose the 4x4 matrix of 32-bit words
  2154  		TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD)
  2155  
  2156  		VPSHUFB DWBSWAP, DWB0, DWB0
  2157  		VPSHUFB DWBSWAP, DWB1, DWB1
  2158  		VPSHUFB DWBSWAP, DWB2, DWB2
  2159  		VPSHUFB DWBSWAP, DWB3, DWB3
  2160  
  2161  		VPXOR (32*0)(ctx), DWB0, DWB0
  2162  		VPXOR (32*1)(ctx), DWB1, DWB1
  2163  		VPXOR (32*2)(ctx), DWB2, DWB2
  2164  		VPXOR (32*3)(ctx), DWB3, DWB3
  2165  
  2166  		VMOVDQU DWB0, (32*0)(ptx)
  2167  		VMOVDQU DWB1, (32*1)(ptx)
  2168  		VMOVDQU DWB2, (32*2)(ptx)
  2169  		VMOVDQU DWB3, (32*3)(ptx)
  2170  		
  2171  		LEAQ 128(ptx), ptx
  2172  		LEAQ 128(ctx), ctx
  2173  
  2174  		JMP avx2GcmSm4DecOctetsLoop
  2175  
  2176  avx2GcmSm4DecEndOctets:
  2177  	SUBQ $4, aluCTR
  2178  
  2179  avx2GcmSm4DecNibbles:
  2180  	CMPQ ptxLen, $64
  2181  	JBE avx2GcmSm4DecSingles
  2182  	SUBQ $64, ptxLen
  2183  
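        	// More than 64 bytes remain: same four-block scheme as
        	// avxGcmSm4DecNibbles, with the counter blocks held in B4, B1, B2 and B3
        	// here.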
  2184  	VMOVDQU (0*16)(SP), B4
  2185  	VMOVDQU (1*16)(SP), B1
  2186  	VMOVDQU (2*16)(SP), B2
  2187  	VMOVDQU (3*16)(SP), B3
  2188  	
  2189  	AVX_SM4_4BLOCKS_WO_BS(rk, B0, B5, B6, B7, B4, B1, B2, B3)
  2190  
  2191  	VMOVDQU (16*14)(pTbl), T2
  2192  	VMOVDQU (16*0)(ctx), B0
  2193  	VPXOR B0, B4, B4
  2194  	increment(0)
  2195  	internalAvxDecGhashRound()
  2196  
  2197  	VMOVDQU (16*1)(ctx), B0
  2198  	VPXOR B0, B1, B1
  2199  	increment(1)
  2200  	internalAvxDecGhashRound()
  2201  
  2202  	VMOVDQU (16*2)(ctx), B0
  2203  	VPXOR B0, B2, B2
  2204  	increment(2)
  2205  	internalAvxDecGhashRound()
  2206  
  2207  	VMOVDQU (16*3)(ctx), B0
  2208  	VPXOR B0, B3, B3
  2209  	increment(3)
  2210  	internalAvxDecGhashRound()
  2211  
  2212  	VMOVDQU B4, (16*0)(ptx)
  2213  	VMOVDQU B1, (16*1)(ptx)
  2214  	VMOVDQU B2, (16*2)(ptx)
  2215  	VMOVDQU B3, (16*3)(ptx)
  2216  
  2217  	LEAQ 64(ptx), ptx
  2218  	LEAQ 64(ctx), ctx
  2219  
  2220  avx2GcmSm4DecSingles:
  2221  	TESTQ ptxLen, ptxLen
  2222  	JE avx2GcmSm4DecDone
  2223  
  2224  	VMOVDQU (0*16)(SP), B0
  2225  	VMOVDQU (1*16)(SP), B1
  2226  	VMOVDQU (2*16)(SP), B2
  2227  	VMOVDQU (3*16)(SP), B3
  2228  
  2229  	AVX_SM4_4BLOCKS_WO_BS(rk, B4, B5, B6, B7, B0, B1, B2, B3)
  2230  
  2231  	VMOVDQU B0, (16*4)(SP)
  2232  	VMOVDQU B1, (16*5)(SP)
  2233  	VMOVDQU B2, (16*6)(SP)
  2234  	VMOVDQU B3, (16*7)(SP)
  2235  
  2236  	VMOVDQU (16*14)(pTbl), T2
  2237  	MOVQ SP, BP
  2238  	ADDQ $64, BP
  2239  
  2240  avx2GcmSm4DecSinglesLoop:
  2241  		CMPQ ptxLen, $16
  2242  		JB avx2GcmSm4DecTail
  2243  		SUBQ $16, ptxLen
  2244  
  2245  		VMOVDQU (16*0)(BP), T0
  2246  		VMOVDQU (ctx), B0
  2247  		VPXOR T0, B0, T0
  2248  		VMOVDQU T0, (ptx)
  2249  
  2250  		internalAvxDecGhashRound()
  2251  		LEAQ (16*1)(ptx), ptx
  2252  		LEAQ (16*1)(ctx), ctx
  2253  		ADDQ $16, BP
  2254  	JMP avx2GcmSm4DecSinglesLoop
  2255  
  2256  avx2GcmSm4DecTail:
  2257  	TESTQ ptxLen, ptxLen
  2258  	JE avx2GcmSm4DecDone
  2259  
  2260  	MOVQ ptxLen, aluTMP
  2261  	SHLQ $4, aluTMP
  2262  	LEAQ andMask<>(SB), aluCTR
  2263  	VMOVDQU -16(aluCTR)(aluTMP*1), T1 // Fetch the and-mask matching ptxLen
  2264  
  2265  	VMOVDQU (ctx), B0	// The tag is assumed to follow the ciphertext in ctx, so a full 16-byte read cannot overflow
  2266  	VPAND T1, B0, B0  // Keep only the low ptxLen bytes; the rest are zeroed
  2267  
  2268  	VMOVDQU B0, T1
  2269  	internalAvxDecGhashRound()
  2270  	VMOVDQU (16*0)(BP), B0
  2271  	VPXOR T1, B0, B0
  2272  
  2273  avx2PtxStoreLoop:
  2274  		VPEXTRB $0, B0, (ptx)
  2275  		VPSRLDQ $1, B0, B0
  2276  		LEAQ 1(ptx), ptx
  2277  		DECQ ptxLen
  2278  
  2279  	JNE avx2PtxStoreLoop
  2280  
  2281  avx2GcmSm4DecDone:
  2282  	VMOVDQU ACC0, (tPtr)
  2283  	VZEROUPPER	
  2284  	RET
  2285  
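        // The SM4-NI entry points below are stubs on amd64: the SM4-NI code path is
        // presumably only selected on CPUs that expose SM4 instructions, so these
        // bodies are never reached in this build.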
  2286  // func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
  2287  TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
  2288  	RET
  2289  
  2290  // func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
  2291  TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
  2292  	RET