github.com/emmansun/gmsm@v0.29.1/sm4/xts_amd64.s

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  #define B0 X0
     6  #define B1 X1
     7  #define B2 X2
     8  #define B3 X3
     9  #define B4 X4
    10  #define B5 X5
    11  #define B6 X6
    12  #define B7 X7
    13  
    14  #define TW X10
    15  
    16  #define T0 X11
    17  #define T1 X12
    18  #define T2 X13
    19  #define POLY X14
    20  #define NIBBLE_MASK Y13
    21  #define X_NIBBLE_MASK X13
    22  #define BSWAP X15
    23  #define DWBSWAP Y15
    24  
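        // gcmPoly is the XTS (IEEE 1619) reduction constant: doubling a tweak, viewed as a
        // little-endian 128-bit value, XORs 0x87 into the low byte whenever the top bit is
        // shifted out (the field polynomial x^128 + x^7 + x^2 + x + 1).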
    25  DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000087
    26  DATA gcmPoly<>+0x08(SB)/8, $0x0000000000000000
    27  
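        // gbGcmPoly is the bit-reflected form of the same polynomial (0x87 reversed is 0xe1,
        // placed in the most significant byte); it is used by the *GB functions below, which
        // shift the tweak right as a big-endian value.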
    28  DATA gbGcmPoly<>+0x00(SB)/8, $0x0000000000000000
    29  DATA gbGcmPoly<>+0x08(SB)/8, $0xe100000000000000
    30  
    31  GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    32  GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16
    33  
    34  #include "aesni_macros_amd64.s"
    35  
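        // mul2GBInline doubles the tweak TW in GF(2^128) using the reflected convention that
        // goes with gbGcmPoly: PSHUFB with BSWAP brings TW into big-endian order, the 128-bit
        // value is shifted right by one bit (PSRLQ/PSLLQ/PSRLDQ stitch the two 64-bit halves
        // together), gbGcmPoly is XORed in when the bit shifted out was set, and the byte
        // order is restored. Clobbers T0 and T1; avxMul2GBInline below is the VEX-encoded,
        // non-destructive form of the same update.
        //
        // For reference, the same doubling written byte by byte in Go (an illustrative sketch,
        // not code from this module; tweak holds the 16 tweak bytes in memory order, tweak[0]
        // being the most significant byte in this convention):
        //
        //	carry := tweak[15] & 1
        //	for i := 15; i > 0; i-- {
        //		tweak[i] = tweak[i]>>1 | tweak[i-1]<<7
        //	}
        //	tweak[0] >>= 1
        //	if carry != 0 {
        //		tweak[0] ^= 0xe1
        //	}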
    36  #define mul2GBInline        \
    37  	PSHUFB BSWAP, TW;       \
    38  	\// TW * 2
    39  	MOVOU TW, T0;           \
    40   	PSHUFD $0, TW, T1;      \
    41  	PSRLQ $1, TW;           \
    42  	PSLLQ $63, T0;          \
    43  	PSRLDQ $8, T0;          \
    44  	POR T0, TW;             \
    45  	\// reduction
    46  	PSLLL $31, T1;          \
    47  	PSRAL $31, T1;          \
    48  	PAND POLY, T1;          \
    49  	PXOR T1, TW;            \
    50  	PSHUFB BSWAP, TW
    51  
    52  #define avxMul2GBInline        \
    53  	VPSHUFB BSWAP, TW, TW;       \
    54  	\// TW * 2
    55  	VPSLLQ $63, TW, T0;     \      
    56   	VPSHUFD $0, TW, T1;     \
    57  	VPSRLQ $1, TW, TW;      \
    58  	VPSRLDQ $8, T0, T0;     \
    59  	VPOR T0, TW, TW;        \
    60  	\// reduction
    61  	VPSLLD $31, T1, T1;     \
    62  	VPSRAD $31, T1, T1;     \
    63  	VPAND POLY, T1, T1;     \
    64  	VPXOR T1, TW, TW;       \
    65  	VPSHUFB BSWAP, TW, TW
    66  
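        // prepareGB4Tweaks / prepareGB8Tweaks (and the avxPrepareGB* variants below) spill the
        // tweak for each block of a batch to consecutive 16-byte slots starting at (SP),
        // doubling TW after every store so that TW ends up holding the tweak for the block
        // following the batch.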
    67  #define prepareGB4Tweaks \
    68  	MOVOU TW, (16*0)(SP); \
    69  	mul2GBInline;           \ 
    70  	MOVOU TW, (16*1)(SP); \ 
    71  	mul2GBInline;           \
    72  	MOVOU TW, (16*2)(SP); \
    73  	mul2GBInline;           \
    74  	MOVOU TW, (16*3)(SP); \
    75  	mul2GBInline
    76  
    77  #define prepareGB8Tweaks \
    78  	prepareGB4Tweaks;       \
    79  	MOVOU TW, (16*4)(SP); \
    80  	mul2GBInline;           \
    81  	MOVOU TW, (16*5)(SP); \
    82  	mul2GBInline;           \
    83  	MOVOU TW, (16*6)(SP); \
    84  	mul2GBInline;           \
    85  	MOVOU TW, (16*7)(SP); \
    86  	mul2GBInline
    87  
    88  #define avxPrepareGB4Tweaks \
    89  	VMOVDQU TW, (16*0)(SP); \
    90  	avxMul2GBInline;           \ 
    91  	VMOVDQU TW, (16*1)(SP); \ 
    92  	avxMul2GBInline;           \
    93  	VMOVDQU TW, (16*2)(SP); \
    94  	avxMul2GBInline;           \
    95  	VMOVDQU TW, (16*3)(SP); \
    96  	avxMul2GBInline
    97  
    98  #define avxPrepareGB8Tweaks \
    99  	avxPrepareGB4Tweaks;       \
   100  	VMOVDQU TW, (16*4)(SP); \
   101  	avxMul2GBInline;           \
   102  	VMOVDQU TW, (16*5)(SP); \
   103  	avxMul2GBInline;           \
   104  	VMOVDQU TW, (16*6)(SP); \
   105  	avxMul2GBInline;           \
   106  	VMOVDQU TW, (16*7)(SP); \
   107  	avxMul2GBInline
   108  
   109  #define avxPrepareGB16Tweaks \
   110  	avxPrepareGB8Tweaks;       \
   111  	VMOVDQU TW, (16*8)(SP); \
   112  	avxMul2GBInline;           \
   113  	VMOVDQU TW, (16*9)(SP); \
   114  	avxMul2GBInline;           \
   115  	VMOVDQU TW, (16*10)(SP); \
   116  	avxMul2GBInline;           \
   117  	VMOVDQU TW, (16*11)(SP); \
   118  	avxMul2GBInline;           \
   119  	VMOVDQU TW, (16*12)(SP); \
   120  	avxMul2GBInline;           \
   121  	VMOVDQU TW, (16*13)(SP); \
   122  	avxMul2GBInline;           \
   123  	VMOVDQU TW, (16*14)(SP); \
   124  	avxMul2GBInline;           \
   125  	VMOVDQU TW, (16*15)(SP); \
   126  	avxMul2GBInline
   127  
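        // mul2Inline doubles TW using the standard XTS (IEEE 1619) convention: the tweak is a
        // little-endian 128-bit value that is shifted left by one bit (32-bit lane shifts plus
        // a PSRLL/PSLLDQ carry fix-up), and gcmPoly (0x87) is XORed into the low byte when bit
        // 127 was set. Clobbers T0 and T1; avxMul2Inline below is the VEX-encoded form.
        //
        // For reference, the same doubling written byte by byte in Go (an illustrative sketch,
        // not code from this module):
        //
        //	carry := byte(0)
        //	for i := 0; i < 16; i++ {
        //		msb := tweak[i] >> 7
        //		tweak[i] = tweak[i]<<1 | carry
        //		carry = msb
        //	}
        //	if carry != 0 {
        //		tweak[0] ^= 0x87
        //	}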
   128  #define mul2Inline        \
   129  	PSHUFD $0xff, TW, T0; \
   130  	MOVOU TW, T1;         \         
   131  	PSRAL $31, T0;        \
   132  	PAND POLY, T0;        \        
   133  	PSRLL $31, T1;        \
   134  	PSLLDQ $4, T1;        \
   135  	PSLLL $1, TW;         \
   136  	PXOR T0, TW;          \
   137  	PXOR T1, TW
   138  
   139  #define avxMul2Inline        \
   140  	VPSHUFD $0xff, TW, T0; \
   141  	VPSRLD $31, TW, T1;    \       
   142  	VPSRAD $31, T0, T0;    \
   143  	VPAND POLY, T0, T0;    \        
   144  	VPSLLDQ $4, T1, T1;    \
   145  	VPSLLD $1, TW, TW;     \
   146  	VPXOR T0, TW, TW;      \
   147  	VPXOR T1, TW, TW
   148  
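        // prepare4Tweaks / prepare8Tweaks and the avxPrepare* variants do the same stack spill
        // as the GB versions above, but advance the tweak with the standard XTS doubling
        // (mul2Inline / avxMul2Inline).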
   149  #define prepare4Tweaks \
   150  	MOVOU TW, (16*0)(SP); \
   151  	mul2Inline;           \ 
   152  	MOVOU TW, (16*1)(SP); \ 
   153  	mul2Inline;           \
   154  	MOVOU TW, (16*2)(SP); \
   155  	mul2Inline;           \
   156  	MOVOU TW, (16*3)(SP); \
   157  	mul2Inline
   158  
   159  #define prepare8Tweaks \
   160  	prepare4Tweaks;       \
   161  	MOVOU TW, (16*4)(SP); \
   162  	mul2Inline;           \
   163  	MOVOU TW, (16*5)(SP); \
   164  	mul2Inline;           \
   165  	MOVOU TW, (16*6)(SP); \
   166  	mul2Inline;           \
   167  	MOVOU TW, (16*7)(SP); \
   168  	mul2Inline
   169  
   170  #define avxPrepare4Tweaks \
   171  	VMOVDQU TW, (16*0)(SP); \
   172  	avxMul2Inline;           \ 
   173  	VMOVDQU TW, (16*1)(SP); \ 
   174  	avxMul2Inline;           \
   175  	VMOVDQU TW, (16*2)(SP); \
   176  	avxMul2Inline;           \
   177  	VMOVDQU TW, (16*3)(SP); \
   178  	avxMul2Inline
   179  
   180  #define avxPrepare8Tweaks \
   181  	avxPrepare4Tweaks;       \
   182  	VMOVDQU TW, (16*4)(SP); \
   183  	avxMul2Inline;           \
   184  	VMOVDQU TW, (16*5)(SP); \
   185  	avxMul2Inline;           \
   186  	VMOVDQU TW, (16*6)(SP); \
   187  	avxMul2Inline;           \
   188  	VMOVDQU TW, (16*7)(SP); \
   189  	avxMul2Inline
   190  
   191  #define avxPrepare16Tweaks \
   192  	avxPrepare8Tweaks;       \
   193  	VMOVDQU TW, (16*8)(SP); \
   194  	avxMul2Inline;           \
   195  	VMOVDQU TW, (16*9)(SP); \
   196  	avxMul2Inline;           \
   197  	VMOVDQU TW, (16*10)(SP); \
   198  	avxMul2Inline;           \
   199  	VMOVDQU TW, (16*11)(SP); \
   200  	avxMul2Inline;           \
   201  	VMOVDQU TW, (16*12)(SP); \
   202  	avxMul2Inline;           \
   203  	VMOVDQU TW, (16*13)(SP); \
   204  	avxMul2Inline;           \
   205  	VMOVDQU TW, (16*14)(SP); \
   206  	avxMul2Inline;           \
   207  	VMOVDQU TW, (16*15)(SP); \
   208  	avxMul2Inline
   209  
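        // sseLoad4Blocks / sseLoad8Blocks read blocks from the source (DX) and XOR each with
        // its precomputed tweak on the stack; sseStore4Blocks / sseStore8Blocks XOR the SM4
        // output with the same tweaks again and write the result to the destination (CX).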
   210  #define sseLoad4Blocks \
   211  	MOVOU (16*0)(DX), B0; \
   212  	MOVOU (16*0)(SP), T0; \ 
   213  	PXOR T0, B0; \
   214  	MOVOU (16*1)(DX), B1; \
   215  	MOVOU (16*1)(SP), T0; \
   216  	PXOR T0, B1; \
   217  	MOVOU (16*2)(DX), B2; \
   218  	MOVOU (16*2)(SP), T0; \
   219  	PXOR T0, B2; \
   220  	MOVOU (16*3)(DX), B3; \
   221  	MOVOU (16*3)(SP), T0; \
   222  	PXOR T0, B3
   223  
   224  #define sseStore4Blocks \
   225  	MOVOU (16*0)(SP), T0; \
   226  	PXOR T0, B0; \
   227  	MOVOU B0, (16*0)(CX); \
   228  	MOVOU (16*1)(SP), T0; \
   229  	PXOR T0, B1; \
   230  	MOVOU B1, (16*1)(CX); \
   231  	MOVOU (16*2)(SP), T0; \
   232  	PXOR T0, B2; \
   233  	MOVOU B2, (16*2)(CX); \
   234  	MOVOU (16*3)(SP), T0; \
   235  	PXOR T0, B3; \
   236  	MOVOU B3, (16*3)(CX)
   237  
   238  #define sseLoad8Blocks \
   239  	sseLoad4Blocks;  \
   240  	MOVOU (16*4)(DX), B4;  \
   241  	MOVOU (16*4)(SP), T0;  \ 
   242  	PXOR T0, B4;           \
   243  	MOVOU (16*5)(DX), B5;  \
   244  	MOVOU (16*5)(SP), T0;  \ 
   245  	PXOR T0, B5;           \
   246  	MOVOU (16*6)(DX), B6;  \
   247  	MOVOU (16*6)(SP), T0;  \ 
   248  	PXOR T0, B6;           \
   249  	MOVOU (16*7)(DX), B7;  \
   250  	MOVOU (16*7)(SP), T0;  \
   251  	PXOR T0, B7
   252  
   253  #define sseStore8Blocks \
   254  	sseStore4Blocks; \
   255  	MOVOU (16*4)(SP), T0; \ 
   256  	PXOR T0, B4; \
   257  	MOVOU B4, (16*4)(CX); \
   258  	MOVOU (16*5)(SP), T0; \
   259  	PXOR T0, B5; \
   260  	MOVOU B5, (16*5)(CX); \
   261  	MOVOU (16*6)(SP), T0; \
   262  	PXOR T0, B6; \
   263  	MOVOU B6, (16*6)(CX); \
   264  	MOVOU (16*7)(SP), T0; \
   265  	PXOR T0, B7; \
   266  	MOVOU B7, (16*7)(CX)
   267  
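        // avxLoad*/avxStore* are the VEX-encoded equivalents, folding the tweak XOR into a
        // memory operand.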
   268  #define avxLoad4Blocks \
   269  	VMOVDQU (16*0)(DX), B0; \
   270  	VPXOR (16*0)(SP), B0, B0; \
   271  	VMOVDQU (16*1)(DX), B1; \
   272  	VPXOR (16*1)(SP), B1, B1; \
   273  	VMOVDQU (16*2)(DX), B2; \
   274  	VPXOR (16*2)(SP), B2, B2; \
   275  	VMOVDQU (16*3)(DX), B3; \
   276  	VPXOR (16*3)(SP), B3, B3
   277  
   278  #define avxStore4Blocks \
   279  	VPXOR (16*0)(SP), B0, B0; \
   280  	VMOVDQU B0, (16*0)(CX); \
   281  	VPXOR (16*1)(SP), B1, B1; \
   282  	VMOVDQU B1, (16*1)(CX); \
   283  	VPXOR (16*2)(SP), B2, B2; \
   284  	VMOVDQU B2, (16*2)(CX); \
   285  	VPXOR (16*3)(SP), B3, B3; \
   286  	VMOVDQU B3, (16*3)(CX)
   287  
   288  #define avxLoad8Blocks \
   289  	avxLoad4Blocks; \
   290  	VMOVDQU (16*4)(DX), B4; \
   291  	VPXOR (16*4)(SP), B4, B4; \
   292  	VMOVDQU (16*5)(DX), B5; \
   293  	VPXOR (16*5)(SP), B5, B5; \
   294  	VMOVDQU (16*6)(DX), B6; \
   295  	VPXOR (16*6)(SP), B6, B6; \
   296  	VMOVDQU (16*7)(DX), B7; \
   297  	VPXOR (16*7)(SP), B7, B7
   298  
   299  #define avxStore8Blocks \
   300  	avxStore4Blocks; \
   301  	VPXOR (16*4)(SP), B4, B4; \
   302  	VMOVDQU B4, (16*4)(CX); \
   303  	VPXOR (16*5)(SP), B5, B5; \
   304  	VMOVDQU B5, (16*5)(CX); \
   305  	VPXOR (16*6)(SP), B6, B6; \
   306  	VMOVDQU B6, (16*6)(CX); \
   307  	VPXOR (16*7)(SP), B7, B7; \
   308  	VMOVDQU B7, (16*7)(CX)
   309  
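        // The avx2Load*/avx2Store* macros pack two 16-byte blocks per Y register, so 8 blocks
        // live in Y0-Y3 and 16 blocks in Y0-Y7; the tweak XORs still come from the stack.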
   310  #define avx2Load8Blocks \
   311  	VMOVDQU (32*0)(DX), Y0; \
   312  	VPXOR (32*0)(SP), Y0, Y0; \
   313  	VMOVDQU (32*1)(DX), Y1; \
   314  	VPXOR (32*1)(SP), Y1, Y1; \
   315  	VMOVDQU (32*2)(DX), Y2; \
   316  	VPXOR (32*2)(SP), Y2, Y2; \
   317  	VMOVDQU (32*3)(DX), Y3; \
   318  	VPXOR (32*3)(SP), Y3, Y3
   319  
   320  #define avx2Load16Blocks \
   321  	avx2Load8Blocks; \
   322  	VMOVDQU (32*4)(DX), Y4; \
   323  	VPXOR (32*4)(SP), Y4, Y4; \
   324  	VMOVDQU (32*5)(DX), Y5; \
   325  	VPXOR (32*5)(SP), Y5, Y5; \
   326  	VMOVDQU (32*6)(DX), Y6; \
   327  	VPXOR (32*6)(SP), Y6, Y6; \
   328  	VMOVDQU (32*7)(DX), Y7; \
   329  	VPXOR (32*7)(SP), Y7, Y7
   330  
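        // The AVX2 SM4 round macros operate on big-endian 32-bit words, so avx2LE2BE*Blocks
        // applies flip_mask before the 4x4 word transpose and avx2ByteSwap*Blocks restores
        // the output byte order with DWBSWAP afterwards.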
   331  #define avx2LE2BE8Blocks \
   332  	VBROADCASTI128 flip_mask<>(SB), Y11; \
   333  	VPSHUFB Y11, Y0, Y0; \
   334  	VPSHUFB Y11, Y1, Y1; \
   335  	VPSHUFB Y11, Y2, Y2; \
   336  	VPSHUFB Y11, Y3, Y3
   337  
   338  #define avx2LE2BE16Blocks \
   339  	avx2LE2BE8Blocks; \
   340  	VPSHUFB Y11, Y4, Y4; \
   341  	VPSHUFB Y11, Y5, Y5; \
   342  	VPSHUFB Y11, Y6, Y6; \
   343  	VPSHUFB Y11, Y7, Y7
   344  
   345  #define avx2Store8Blocks \
   346  	VPXOR (32*0)(SP), Y0, Y0; \
   347  	VMOVDQU Y0, (32*0)(CX); \
   348  	VPXOR (32*1)(SP), Y1, Y1; \
   349  	VMOVDQU Y1, (32*1)(CX); \
   350  	VPXOR (32*2)(SP), Y2, Y2; \
   351  	VMOVDQU Y2, (32*2)(CX); \
   352  	VPXOR (32*3)(SP), Y3, Y3; \
   353  	VMOVDQU Y3, (32*3)(CX)
   354  
   355  #define avx2Store16Blocks \
   356  	avx2Store8Blocks; \
   357  	VPXOR (32*4)(SP), Y4, Y4; \
   358  	VMOVDQU Y4, (32*4)(CX); \
   359  	VPXOR (32*5)(SP), Y5, Y5; \
   360  	VMOVDQU Y5, (32*5)(CX); \
   361  	VPXOR (32*6)(SP), Y6, Y6; \
   362  	VMOVDQU Y6, (32*6)(CX); \
   363  	VPXOR (32*7)(SP), Y7, Y7; \
   364  	VMOVDQU Y7, (32*7)(CX)
   365  
   366  #define avx2ByteSwap8Blocks \
   367  	VPSHUFB DWBSWAP, Y0, Y0; \
   368  	VPSHUFB DWBSWAP, Y1, Y1; \
   369  	VPSHUFB DWBSWAP, Y2, Y2; \
   370  	VPSHUFB DWBSWAP, Y3, Y3
   371  
   372  #define avx2ByteSwap16Blocks \
   373  	avx2ByteSwap8Blocks; \
   374    	VPSHUFB DWBSWAP, Y4, Y4; \
   375  	VPSHUFB DWBSWAP, Y5, Y5; \
   376  	VPSHUFB DWBSWAP, Y6, Y6; \
   377  	VPSHUFB DWBSWAP, Y7, Y7
   378  
   379  // func encryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
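        //
        // AX = round keys, BX = tweak pointer, CX = dst, DX = src, DI = bytes remaining; the
        // 256-byte frame holds the pre-doubled per-block tweaks. Blocks are processed in
        // batches of 16 (AVX2 only), 8, 4, then 1, and a trailing partial block is handled
        // with ciphertext stealing; the final tweak value is written back through BX.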
   380  TEXT ·encryptSm4Xts(SB),0,$256-64
   381  	MOVQ xk+0(FP), AX
   382  	MOVQ tweak+8(FP), BX
   383  	MOVQ dst+16(FP), CX
   384  	MOVQ src+40(FP), DX
   385  	MOVQ src_len+48(FP), DI
   386  
   387  	CMPB ·useAVX2(SB), $1
   388  	JE   avx2XtsSm4Enc
   389  
   390  	CMPB ·useAVX(SB), $1
   391  	JE   avxXtsSm4Enc
   392  
   393  	MOVOU gcmPoly<>(SB), POLY
   394  
   395  	MOVOU (0*16)(BX), TW
   396  
   397  xtsSm4EncOctets:
   398  	CMPQ DI, $128
   399  	JB xtsSm4EncNibbles
   400  	SUBQ $128, DI
   401  
   402  	// prepare tweaks
   403  	prepare8Tweaks
   404  	// load 8 blocks for encryption
   405  	sseLoad8Blocks
   406  
   407  	SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
   408  
   409  	sseStore8Blocks
   410  
   411  	LEAQ 128(DX), DX
   412  	LEAQ 128(CX), CX
   413  
   414  	JMP xtsSm4EncOctets
   415  
   416  xtsSm4EncNibbles:
   417  	CMPQ DI, $64
   418  	JB xtsSm4EncSingles
   419  	SUBQ $64, DI
   420  
   421  	// prepare tweaks
   422  	prepare4Tweaks
   423  	// load 4 blocks for encryption
   424  	sseLoad4Blocks
   425  
   426  	SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   427  
   428  	sseStore4Blocks
   429  
   430  	LEAQ 64(DX), DX
   431  	LEAQ 64(CX), CX
   432  
   433  xtsSm4EncSingles:
   434  	CMPQ DI, $16
   435  	JB xtsSm4EncTail
   436  	SUBQ $16, DI
   437  
   438  	// load 1 block for encryption
   439  	MOVOU (16*0)(DX), B0
   440  	
   441  	PXOR TW, B0
   442  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   443  	PXOR TW, B0
   444  	MOVOU B0, (16*0)(CX)
   445  	mul2Inline
   446  
   447  	LEAQ 16(DX), DX
   448  	LEAQ 16(CX), CX
   449  
   450  	JMP xtsSm4EncSingles
   451  
   452  xtsSm4EncTail:
   453  	TESTQ DI, DI
   454  	JE xtsSm4EncDone
   455  
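        	// Ciphertext stealing: reload the last full ciphertext block (dst-16), swap its
        	// first DI bytes with the DI leftover plaintext bytes (one 8-byte chunk when
        	// possible, then byte by byte), and re-encrypt the merged block into dst-16; the
        	// ciphertext bytes displaced by the swap are written out as the short final block.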
   456  	LEAQ -16(CX), R8
   457  	MOVOU (16*0)(R8), B0
   458  	MOVOU B0, (16*0)(SP)
   459  
   460  	CMPQ DI, $8
   461  	JB   loop_1b
   462  	SUBQ  $8, DI
   463  	MOVQ (DX)(DI*1), R9
   464  	MOVQ (SP)(DI*1), R10
   465  	MOVQ R9, (SP)(DI*1)
   466  	MOVQ R10, (CX)(DI*1)
   467  
   468  	TESTQ DI, DI
   469  	JE xtsSm4EncTailEnc
   470  
   471  loop_1b:
   472  	SUBQ  $1, DI
   473  	MOVB (DX)(DI*1), R9
   474  	MOVB (SP)(DI*1), R10
   475  	MOVB R9, (SP)(DI*1)
   476  	MOVB R10, (CX)(DI*1)
   477  	TESTQ DI, DI
   478  	JNE   loop_1b
   479  
   480  xtsSm4EncTailEnc:
   481  	MOVOU (16*0)(SP), B0
   482  	PXOR TW, B0
   483  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   484  	PXOR TW, B0
   485  	MOVOU B0, (16*0)(R8)
   486  
   487  xtsSm4EncDone:
   488  	MOVOU TW, (16*0)(BX)
   489  	RET
   490  
   491  avxXtsSm4Enc:
   492  	VMOVDQU gcmPoly<>(SB), POLY
   493  	VMOVDQU (0*16)(BX), TW
   494  
   495  avxXtsSm4EncOctets:
   496  	CMPQ DI, $128
   497  	JB avxXtsSm4EncNibbles
   498  	SUBQ $128, DI
   499  
   500  	// prepare tweaks
   501  	avxPrepare8Tweaks
   502  	// load 8 blocks for encryption
   503  	avxLoad8Blocks
   504  
   505  	AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
   506  
   507  	avxStore8Blocks
   508  
   509  	LEAQ 128(DX), DX
   510  	LEAQ 128(CX), CX
   511  
   512  	JMP avxXtsSm4EncOctets
   513  
   514  avxXtsSm4EncNibbles:
   515  	CMPQ DI, $64
   516  	JB avxXtsSm4EncSingles
   517  	SUBQ $64, DI
   518  
   519  	// prepare tweaks
   520  	avxPrepare4Tweaks
   521  	// load 4 blocks for encryption
   522  	avxLoad4Blocks
   523  
   524  	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   525  
   526  	avxStore4Blocks
   527  
   528  	LEAQ 64(DX), DX
   529  	LEAQ 64(CX), CX
   530  
   531  avxXtsSm4EncSingles:
   532  	CMPQ DI, $16
   533  	JB avxXtsSm4EncTail
   534  	SUBQ $16, DI
   535  
   536  	// load 1 block for encryption
   537  	VMOVDQU (16*0)(DX), B0
   538  	
   539  	VPXOR TW, B0, B0
   540  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   541  	VPXOR TW, B0, B0
   542  	VMOVDQU B0, (16*0)(CX)
   543  	avxMul2Inline
   544  
   545  	LEAQ 16(DX), DX
   546  	LEAQ 16(CX), CX
   547  
   548  	JMP avxXtsSm4EncSingles
   549  
   550  avxXtsSm4EncTail:
   551  	TESTQ DI, DI
   552  	JE avxXtsSm4EncDone
   553  
   554  	LEAQ -16(CX), R8
   555  	VMOVDQU (16*0)(R8), B0
   556  	VMOVDQU B0, (16*0)(SP)
   557  
   558  	CMPQ DI, $8
   559  	JB   avx_loop_1b
   560  	SUBQ  $8, DI
   561  	MOVQ (DX)(DI*1), R9
   562  	MOVQ (SP)(DI*1), R10
   563  	MOVQ R9, (SP)(DI*1)
   564  	MOVQ R10, (CX)(DI*1)
   565  
   566  	TESTQ DI, DI
   567  	JE avxXtsSm4EncTailEnc
   568  
   569  avx_loop_1b:
   570  	SUBQ  $1, DI
   571  	MOVB (DX)(DI*1), R9
   572  	MOVB (SP)(DI*1), R10
   573  	MOVB R9, (SP)(DI*1)
   574  	MOVB R10, (CX)(DI*1)
   575  	TESTQ DI, DI
   576  	JNE   avx_loop_1b
   577  
   578  avxXtsSm4EncTailEnc:
   579  	VMOVDQU (16*0)(SP), B0
   580  	VPXOR TW, B0, B0
   581  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   582  	VPXOR TW, B0, B0
   583  	VMOVDQU B0, (16*0)(R8)
   584  
   585  avxXtsSm4EncDone:
   586  	VMOVDQU TW, (16*0)(BX)
   587  	RET
   588  
   589  avx2XtsSm4Enc:
   590  	VMOVDQU gcmPoly<>(SB), POLY
   591  	VMOVDQU (0*16)(BX), TW
   592  	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
   593  	VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
   594  
   595  avx2XtsSm4Enc16Blocks:
   596  	CMPQ DI, $256
   597  	JB avx2XtsSm4EncOctets
   598  	SUBQ $256, DI
   599  
   600  	// prepare tweaks
   601  	avxPrepare16Tweaks
   602  	// load 16 blocks for encryption
   603  	avx2Load16Blocks
   604  	// Apply Byte Flip Mask: LE -> BE
   605  	avx2LE2BE16Blocks
   606  	// Transpose the 4x4 matrix of 32-bit words
   607  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
   608  	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
   609  
   610  	AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7)
   611  
   612  	// Transpose the 4x4 matrix of 32-bit words
   613  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
   614  	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
   615  	avx2ByteSwap16Blocks
   616  	avx2Store16Blocks
   617  
   618  	LEAQ 256(DX), DX
   619  	LEAQ 256(CX), CX
   620  	JMP avx2XtsSm4Enc16Blocks
   621  
   622  avx2XtsSm4EncOctets:
   623  	CMPQ DI, $128
   624  	JB avx2XtsSm4EncNibbles
   625  	SUBQ $128, DI
   626  
   627  	// prepare tweaks
   628  	avxPrepare8Tweaks
   629  	// load 8 blocks for encryption
   630  	avx2Load8Blocks
   631  	// Apply Byte Flip Mask: LE -> BE
   632  	avx2LE2BE8Blocks
   633  	// Transpose the 4x4 matrix of 32-bit words
   634  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
   635  
   636  	AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3)
   637  
   638  	// Transpose the 4x4 matrix of 32-bit words
   639  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
   640  	avx2ByteSwap8Blocks
   641  	avx2Store8Blocks
   642  
   643  	LEAQ 128(DX), DX
   644  	LEAQ 128(CX), CX
   645  
   646  avx2XtsSm4EncNibbles:
   647  	CMPQ DI, $64
   648  	JB avx2XtsSm4EncSingles
   649  	SUBQ $64, DI
   650  
   651  	// prepare tweaks
   652  	avxPrepare4Tweaks
   653  
   654  	// load 4 blocks for encryption
   655  	avxLoad4Blocks
   656  
   657  	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   658  
   659  	avxStore4Blocks
   660  
   661  	LEAQ 64(DX), DX
   662  	LEAQ 64(CX), CX
   663  
   664  avx2XtsSm4EncSingles:
   665  	CMPQ DI, $16
   666  	JB avx2XtsSm4EncTail
   667  	SUBQ $16, DI
   668  
   669  	// load 1 block for encryption
   670  	VMOVDQU (16*0)(DX), B0
   671  	
   672  	VPXOR TW, B0, B0
   673  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   674  	VPXOR TW, B0, B0
   675  	VMOVDQU B0, (16*0)(CX)
   676  	avxMul2Inline
   677  
   678  	LEAQ 16(DX), DX
   679  	LEAQ 16(CX), CX
   680  
   681  	JMP avx2XtsSm4EncSingles
   682  
   683  avx2XtsSm4EncTail:
   684  	TESTQ DI, DI
   685  	JE avx2XtsSm4EncDone
   686  
   687  	LEAQ -16(CX), R8
   688  	VMOVDQU (16*0)(R8), B0
   689  	VMOVDQU B0, (16*0)(SP)
   690  
   691  	CMPQ DI, $8
   692  	JB   avx2_loop_1b
   693  	SUBQ  $8, DI
   694  	MOVQ (DX)(DI*1), R9
   695  	MOVQ (SP)(DI*1), R10
   696  	MOVQ R9, (SP)(DI*1)
   697  	MOVQ R10, (CX)(DI*1)
   698  
   699  	TESTQ DI, DI
   700  	JE avx2XtsSm4EncTailEnc
   701  
   702  avx2_loop_1b:
   703  	SUBQ  $1, DI
   704  	MOVB (DX)(DI*1), R9
   705  	MOVB (SP)(DI*1), R10
   706  	MOVB R9, (SP)(DI*1)
   707  	MOVB R10, (CX)(DI*1)
   708  	TESTQ DI, DI
   709  	JNE   avx2_loop_1b
   710  
   711  avx2XtsSm4EncTailEnc:
   712  	VMOVDQU (16*0)(SP), B0
   713  	VPXOR TW, B0, B0
   714  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   715  	VPXOR TW, B0, B0
   716  	VMOVDQU B0, (16*0)(R8)
   717  
   718  avx2XtsSm4EncDone:
   719  	VMOVDQU TW, (16*0)(BX)
   720  	VZEROUPPER
   721  	RET
   722  
   723  // func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
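        // Same structure as encryptSm4Xts, but the tweak is advanced with the reflected
        // (gbGcmPoly) doubling macros instead of the standard XTS doubling.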
   724  TEXT ·encryptSm4XtsGB(SB),0,$256-64
   725  	MOVQ xk+0(FP), AX
   726  	MOVQ tweak+8(FP), BX
   727  	MOVQ dst+16(FP), CX
   728  	MOVQ src+40(FP), DX
   729  	MOVQ src_len+48(FP), DI
   730  
   731  	CMPB ·useAVX2(SB), $1
   732  	JE   avx2XtsSm4Enc
   733  
   734  	CMPB ·useAVX(SB), $1
   735  	JE   avxXtsSm4Enc
   736  
   737  	MOVOU gbGcmPoly<>(SB), POLY
   738  	MOVOU bswap_mask<>(SB), BSWAP
   739  	MOVOU (0*16)(BX), TW
   740  
   741  xtsSm4EncOctets:
   742  	CMPQ DI, $128
   743  	JB xtsSm4EncNibbles
   744  	SUBQ $128, DI
   745  
   746  	// prepare tweaks
   747  	prepareGB8Tweaks
   748  	// load 8 blocks for encryption
   749  	sseLoad8Blocks
   750  
   751  	SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
   752  
   753  	sseStore8Blocks
   754  
   755  	LEAQ 128(DX), DX
   756  	LEAQ 128(CX), CX
   757  
   758  	JMP xtsSm4EncOctets
   759  
   760  xtsSm4EncNibbles:
   761  	CMPQ DI, $64
   762  	JB xtsSm4EncSingles
   763  	SUBQ $64, DI
   764  
   765  	// prepare tweaks
   766  	prepareGB4Tweaks
   767  	// load 4 blocks for encryption
   768  	sseLoad4Blocks
   769  
   770  	SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   771  
   772  	sseStore4Blocks
   773  
   774  	LEAQ 64(DX), DX
   775  	LEAQ 64(CX), CX
   776  
   777  xtsSm4EncSingles:
   778  	CMPQ DI, $16
   779  	JB xtsSm4EncTail
   780  	SUBQ $16, DI
   781  
   782  	// load 1 block for encryption
   783  	MOVOU (16*0)(DX), B0
   784  	
   785  	PXOR TW, B0
   786  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   787  	PXOR TW, B0
   788  	MOVOU B0, (16*0)(CX)
   789  	mul2GBInline
   790  
   791  	LEAQ 16(DX), DX
   792  	LEAQ 16(CX), CX
   793  
   794  	JMP xtsSm4EncSingles
   795  
   796  xtsSm4EncTail:
   797  	TESTQ DI, DI
   798  	JE xtsSm4EncDone
   799  
   800  	LEAQ -16(CX), R8
   801  	MOVOU (16*0)(R8), B0
   802  	MOVOU B0, (16*0)(SP)
   803  
   804  	CMPQ DI, $8
   805  	JB   loop_1b
   806  	SUBQ  $8, DI
   807  	MOVQ (DX)(DI*1), R9
   808  	MOVQ (SP)(DI*1), R10
   809  	MOVQ R9, (SP)(DI*1)
   810  	MOVQ R10, (CX)(DI*1)
   811  
   812  	TESTQ DI, DI
   813  	JE xtsSm4EncTailEnc
   814  
   815  loop_1b:
   816  	SUBQ  $1, DI
   817  	MOVB (DX)(DI*1), R9
   818  	MOVB (SP)(DI*1), R10
   819  	MOVB R9, (SP)(DI*1)
   820  	MOVB R10, (CX)(DI*1)
   821  	TESTQ DI, DI
   822  	JNE   loop_1b
   823  
   824  xtsSm4EncTailEnc:
   825  	MOVOU (16*0)(SP), B0
   826  	PXOR TW, B0
   827  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   828  	PXOR TW, B0
   829  	MOVOU B0, (16*0)(R8)
   830  
   831  xtsSm4EncDone:
   832  	MOVOU TW, (16*0)(BX)
   833  	RET
   834  
   835  avxXtsSm4Enc:
   836  	VMOVDQU gbGcmPoly<>(SB), POLY
   837  	VMOVDQU bswap_mask<>(SB), BSWAP
   838  	VMOVDQU (0*16)(BX), TW
   839  
   840  avxXtsSm4EncOctets:
   841  	CMPQ DI, $128
   842  	JB avxXtsSm4EncNibbles
   843  	SUBQ $128, DI
   844  
   845  	// prepare tweaks
   846  	avxPrepareGB8Tweaks
   847  	// load 8 blocks for encryption
   848  	avxLoad8Blocks
   849  
   850  	AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
   851  
   852  	avxStore8Blocks
   853  
   854  	LEAQ 128(DX), DX
   855  	LEAQ 128(CX), CX
   856  
   857  	JMP avxXtsSm4EncOctets
   858  
   859  avxXtsSm4EncNibbles:
   860  	CMPQ DI, $64
   861  	JB avxXtsSm4EncSingles
   862  	SUBQ $64, DI
   863  
   864  	// prepare tweaks
   865  	avxPrepareGB4Tweaks
   866  	// load 4 blocks for encryption
   867  	avxLoad4Blocks
   868  
   869  	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   870  
   871  	avxStore4Blocks
   872  
   873  	LEAQ 64(DX), DX
   874  	LEAQ 64(CX), CX
   875  
   876  avxXtsSm4EncSingles:
   877  	CMPQ DI, $16
   878  	JB avxXtsSm4EncTail
   879  	SUBQ $16, DI
   880  
   881  	// load 1 block for encryption
   882  	VMOVDQU (16*0)(DX), B0
   883  	
   884  	VPXOR TW, B0, B0
   885  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   886  	VPXOR TW, B0, B0
   887  	VMOVDQU B0, (16*0)(CX)
   888  	avxMul2GBInline
   889  
   890  	LEAQ 16(DX), DX
   891  	LEAQ 16(CX), CX
   892  
   893  	JMP avxXtsSm4EncSingles
   894  
   895  avxXtsSm4EncTail:
   896  	TESTQ DI, DI
   897  	JE avxXtsSm4EncDone
   898  
   899  	LEAQ -16(CX), R8
   900  	VMOVDQU (16*0)(R8), B0
   901  	VMOVDQU B0, (16*0)(SP)
   902  
   903  	CMPQ DI, $8
   904  	JB   avx_loop_1b
   905  	SUBQ  $8, DI
   906  	MOVQ (DX)(DI*1), R9
   907  	MOVQ (SP)(DI*1), R10
   908  	MOVQ R9, (SP)(DI*1)
   909  	MOVQ R10, (CX)(DI*1)
   910  
   911  	TESTQ DI, DI
   912  	JE avxXtsSm4EncTailEnc
   913  
   914  avx_loop_1b:
   915  	SUBQ  $1, DI
   916  	MOVB (DX)(DI*1), R9
   917  	MOVB (SP)(DI*1), R10
   918  	MOVB R9, (SP)(DI*1)
   919  	MOVB R10, (CX)(DI*1)
   920  	TESTQ DI, DI
   921  	JNE   avx_loop_1b
   922  
   923  avxXtsSm4EncTailEnc:
   924  	VMOVDQU (16*0)(SP), B0
   925  	VPXOR TW, B0, B0
   926  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
   927  	VPXOR TW, B0, B0
   928  	VMOVDQU B0, (16*0)(R8)
   929  
   930  avxXtsSm4EncDone:
   931  	VMOVDQU TW, (16*0)(BX)
   932  	RET
   933  
   934  avx2XtsSm4Enc:
   935  	VMOVDQU gbGcmPoly<>(SB), POLY
   936  	VMOVDQU (0*16)(BX), TW
   937  	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
   938  	VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
   939  
   940  avx2XtsSm4Enc16Blocks:
   941  	CMPQ DI, $256
   942  	JB avx2XtsSm4EncOctets
   943  	SUBQ $256, DI
   944  
   945  	// prepare tweaks
   946  	avxPrepareGB16Tweaks
   947  	// load 16 blocks for encryption
   948  	avx2Load16Blocks
   949  	// Apply Byte Flip Mask: LE -> BE
   950  	avx2LE2BE16Blocks
   951  	// Transpose the 4x4 matrix of 32-bit words
   952  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
   953  	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
   954  
   955  	AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7)
   956  
   957  	// Transpose the 4x4 matrix of 32-bit words
   958  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
   959  	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
   960  	avx2ByteSwap16Blocks
   961  	avx2Store16Blocks
   962  
   963  	LEAQ 256(DX), DX
   964  	LEAQ 256(CX), CX
   965  	JMP avx2XtsSm4Enc16Blocks
   966  
   967  avx2XtsSm4EncOctets:
   968  	CMPQ DI, $128
   969  	JB avx2XtsSm4EncNibbles
   970  	SUBQ $128, DI
   971  
   972  	// prepare tweaks
   973  	avxPrepareGB8Tweaks
   974  	// load 8 blocks for encryption
   975  	avx2Load8Blocks
   976  	// Apply Byte Flip Mask: LE -> BE
   977  	avx2LE2BE8Blocks
   978  	// Transpose the 4x4 matrix of 32-bit words
   979  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
   980  
   981  	AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3)
   982  
   983  	// Transpose the 4x4 matrix of 32-bit words
   984  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
   985  	avx2ByteSwap8Blocks
   986  	avx2Store8Blocks
   987  
   988  	LEAQ 128(DX), DX
   989  	LEAQ 128(CX), CX
   990  
   991  avx2XtsSm4EncNibbles:
   992  	CMPQ DI, $64
   993  	JB avx2XtsSm4EncSingles
   994  	SUBQ $64, DI
   995  
   996  	// prepare tweaks
   997  	avxPrepareGB4Tweaks
   998  	// load 4 blocks for encryption
   999  	avxLoad4Blocks
  1000  
  1001  	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1002  
  1003  	avxStore4Blocks
  1004  
  1005  	LEAQ 64(DX), DX
  1006  	LEAQ 64(CX), CX
  1007  
  1008  avx2XtsSm4EncSingles:
  1009  	CMPQ DI, $16
  1010  	JB avx2XtsSm4EncTail
  1011  	SUBQ $16, DI
  1012  
  1013  	// load 1 block for encryption
  1014  	VMOVDQU (16*0)(DX), B0
  1015  	
  1016  	VPXOR TW, B0, B0
  1017  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1018  	VPXOR TW, B0, B0
  1019  	VMOVDQU B0, (16*0)(CX)
  1020  	avxMul2GBInline
  1021  
  1022  	LEAQ 16(DX), DX
  1023  	LEAQ 16(CX), CX
  1024  
  1025  	JMP avx2XtsSm4EncSingles
  1026  
  1027  avx2XtsSm4EncTail:
  1028  	TESTQ DI, DI
  1029  	JE avx2XtsSm4EncDone
  1030  
  1031  	LEAQ -16(CX), R8
  1032  	VMOVDQU (16*0)(R8), B0
  1033  	VMOVDQU B0, (16*0)(SP)
  1034  
  1035  	CMPQ DI, $8
  1036  	JB   avx2_loop_1b
  1037  	SUBQ  $8, DI
  1038  	MOVQ (DX)(DI*1), R9
  1039  	MOVQ (SP)(DI*1), R10
  1040  	MOVQ R9, (SP)(DI*1)
  1041  	MOVQ R10, (CX)(DI*1)
  1042  
  1043  	TESTQ DI, DI
  1044  	JE avx2XtsSm4EncTailEnc
  1045  
  1046  avx2_loop_1b:
  1047  	SUBQ  $1, DI
  1048  	MOVB (DX)(DI*1), R9
  1049  	MOVB (SP)(DI*1), R10
  1050  	MOVB R9, (SP)(DI*1)
  1051  	MOVB R10, (CX)(DI*1)
  1052  	TESTQ DI, DI
  1053  	JNE   avx2_loop_1b
  1054  
  1055  avx2XtsSm4EncTailEnc:
  1056  	VMOVDQU (16*0)(SP), B0
  1057  	VPXOR TW, B0, B0
  1058  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1059  	VPXOR TW, B0, B0
  1060  	VMOVDQU B0, (16*0)(R8)
  1061  
  1062  avx2XtsSm4EncDone:
  1063  	VMOVDQU TW, (16*0)(BX)
  1064  	VZEROUPPER
  1065  	RET
  1066  
  1067  // func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
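        //
        // Mirrors encryptSm4Xts except in the tail: the single-block loop leaves the last
        // 16-31 bytes to the tail (CMPQ DI, $32), because ciphertext stealing on decryption
        // decrypts the last full block with the next tweak while the short final block uses
        // the current one.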
  1068  TEXT ·decryptSm4Xts(SB),0,$256-64
  1069  	MOVQ xk+0(FP), AX
  1070  	MOVQ tweak+8(FP), BX
  1071  	MOVQ dst+16(FP), CX
  1072  	MOVQ src+40(FP), DX
  1073  	MOVQ src_len+48(FP), DI
  1074  
  1075  	CMPB ·useAVX2(SB), $1
  1076  	JE   avx2XtsSm4Dec
  1077  
  1078  	CMPB ·useAVX(SB), $1
  1079  	JE   avxXtsSm4Dec
  1080  
  1081  	MOVOU gcmPoly<>(SB), POLY
  1082  	MOVOU (0*16)(BX), TW
  1083  
  1084  xtsSm4DecOctets:
  1085  	CMPQ DI, $128
  1086  	JB xtsSm4DecNibbles
  1087  	SUBQ $128, DI
  1088  
  1089  	// prepare tweaks
  1090  	prepare8Tweaks
  1091  	// load 8 blocks for decryption
  1092  	sseLoad8Blocks
  1093  
  1094  	SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
  1095  
  1096  	sseStore8Blocks
  1097  
  1098  	LEAQ 128(DX), DX
  1099  	LEAQ 128(CX), CX
  1100  
  1101  	JMP xtsSm4DecOctets
  1102  
  1103  xtsSm4DecNibbles:
  1104  	CMPQ DI, $64
  1105  	JB xtsSm4DecSingles
  1106  	SUBQ $64, DI
  1107  
  1108  	// prepare tweaks
  1109  	prepare4Tweaks
  1110  	// load 4 blocks for decryption
  1111  	sseLoad4Blocks
  1112  
  1113  	SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1114  
  1115  	sseStore4Blocks
  1116  
  1117  	LEAQ 64(DX), DX
  1118  	LEAQ 64(CX), CX
  1119  
  1120  xtsSm4DecSingles:
  1121  	CMPQ DI, $32
  1122  	JB xtsSm4DecTail
  1123  	SUBQ $16, DI
  1124  
  1125  	// load 1 block for decryption
  1126  	MOVOU (16*0)(DX), B0
  1127  	
  1128  	PXOR TW, B0
  1129  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1130  	PXOR TW, B0
  1131  	MOVOU B0, (16*0)(CX)
  1132  	mul2Inline
  1133  
  1134  	LEAQ 16(DX), DX
  1135  	LEAQ 16(CX), CX
  1136  
  1137  	JMP xtsSm4DecSingles
  1138  
  1139  xtsSm4DecTail:
  1140  	TESTQ DI, DI
  1141  	JE xtsSm4DecDone
  1142  
  1143  	CMPQ DI, $16
  1144  	JE xtsSm4DecLastBlock
  1145  
  1146  	// length > 16
  1147  	// load 1 block for decryption
  1148  	MOVOU (16*0)(DX), B0
  1149  	MOVOU TW, B5
  1150  	mul2Inline
  1151  	PXOR TW, B0
  1152  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1153  	PXOR TW, B0
  1154  	MOVOU B0, (16*0)(CX)
  1155  	MOVOU B5, TW
  1156  
  1157  	SUBQ $16, DI
  1158  	LEAQ 16(DX), DX
  1159  	LEAQ 16(CX), CX
  1160  	LEAQ -16(CX), R8
  1161  	MOVOU B0, (16*0)(SP)
  1162  
  1163  	CMPQ DI, $8
  1164  	JB   loop_1b
  1165  	SUBQ  $8, DI
  1166  	MOVQ (DX)(DI*1), R9
  1167  	MOVQ (SP)(DI*1), R10
  1168  	MOVQ R9, (SP)(DI*1)
  1169  	MOVQ R10, (CX)(DI*1)
  1170  
  1171  	TESTQ DI, DI
  1172  	JE xtsSm4DecTailDec
  1173  
  1174  loop_1b:
  1175  	SUBQ  $1, DI
  1176  	MOVB (DX)(DI*1), R9
  1177  	MOVB (SP)(DI*1), R10
  1178  	MOVB R9, (SP)(DI*1)
  1179  	MOVB R10, (CX)(DI*1)
  1180  	TESTQ DI, DI
  1181  	JNE   loop_1b
  1182  
  1183  xtsSm4DecTailDec:
  1184  	MOVOU (16*0)(SP), B0
  1185  	PXOR TW, B0
  1186  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1187  	PXOR TW, B0
  1188  	MOVOU B0, (16*0)(R8)
  1189  	JMP xtsSm4DecDone
  1190  
  1191  xtsSm4DecLastBlock:
  1192  	MOVOU (16*0)(DX), B0
  1193  	PXOR TW, B0
  1194  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1195  	PXOR TW, B0
  1196  	MOVOU B0, (16*0)(CX)
  1197  	mul2Inline
  1198  
  1199  xtsSm4DecDone:
  1200  	MOVOU TW, (16*0)(BX)
  1201  	RET
  1202  
  1203  avxXtsSm4Dec:
  1204  	VMOVDQU gcmPoly<>(SB), POLY
  1205  	VMOVDQU (0*16)(BX), TW
  1206  
  1207  avxXtsSm4DecOctets:
  1208  	CMPQ DI, $128
  1209  	JB avxXtsSm4DecNibbles
  1210  	SUBQ $128, DI
  1211  
  1212  	// prepare tweaks
  1213  	avxPrepare8Tweaks
  1214  
  1215  	// load 8 blocks for decryption
  1216  	avxLoad8Blocks
  1217  
  1218  	AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
  1219  
  1220  	avxStore8Blocks
  1221  
  1222  	LEAQ 128(DX), DX
  1223  	LEAQ 128(CX), CX
  1224  
  1225  	JMP avxXtsSm4DecOctets
  1226  
  1227  avxXtsSm4DecNibbles:
  1228  	CMPQ DI, $64
  1229  	JB avxXtsSm4DecSingles
  1230  	SUBQ $64, DI
  1231  
  1232  	// prepare tweaks
  1233  	avxPrepare4Tweaks
  1234  	// load 4 blocks for decryption
  1235  	avxLoad4Blocks
  1236  
  1237  	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1238  
  1239  	avxStore4Blocks
  1240  
  1241  	LEAQ 64(DX), DX
  1242  	LEAQ 64(CX), CX
  1243  
  1244  avxXtsSm4DecSingles:
  1245  	CMPQ DI, $32
  1246  	JB avxXtsSm4DecTail
  1247  	SUBQ $16, DI
  1248  
  1249  	// load 1 block for decryption
  1250  	VMOVDQU (16*0)(DX), B0
  1251  	
  1252  	VPXOR TW, B0, B0
  1253  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1254  	VPXOR TW, B0, B0
  1255  	VMOVDQU B0, (16*0)(CX)
  1256  	avxMul2Inline
  1257  
  1258  	LEAQ 16(DX), DX
  1259  	LEAQ 16(CX), CX
  1260  
  1261  	JMP avxXtsSm4DecSingles
  1262  
  1263  avxXtsSm4DecTail:
  1264  	TESTQ DI, DI
  1265  	JE avxXtsSm4DecDone
  1266  
  1267  	CMPQ DI, $16
  1268  	JE avxXtsSm4DecLastBlock
  1269  
  1270  	// length > 16
  1271  	// load 1 block for decryption
  1272  	VMOVDQU (16*0)(DX), B0
  1273  	VMOVDQU TW, B5
  1274  	avxMul2Inline
  1275  	VPXOR TW, B0, B0
  1276  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1277  	VPXOR TW, B0, B0
  1278  	VMOVDQU B0, (16*0)(CX)
  1279  	VMOVDQU B5, TW
  1280  
  1281  	SUBQ $16, DI
  1282  	LEAQ 16(DX), DX
  1283  	LEAQ 16(CX), CX
  1284  	LEAQ -16(CX), R8
  1285  	VMOVDQU B0, (16*0)(SP)
  1286  
  1287  	CMPQ DI, $8
  1288  	JB   avx_loop_1b
  1289  	SUBQ  $8, DI
  1290  	MOVQ (DX)(DI*1), R9
  1291  	MOVQ (SP)(DI*1), R10
  1292  	MOVQ R9, (SP)(DI*1)
  1293  	MOVQ R10, (CX)(DI*1)
  1294  
  1295  	TESTQ DI, DI
  1296  	JE avxXtsSm4DecTailDec
  1297  
  1298  avx_loop_1b:
  1299  	SUBQ  $1, DI
  1300  	MOVB (DX)(DI*1), R9
  1301  	MOVB (SP)(DI*1), R10
  1302  	MOVB R9, (SP)(DI*1)
  1303  	MOVB R10, (CX)(DI*1)
  1304  	TESTQ DI, DI
  1305  	JNE   avx_loop_1b
  1306  
  1307  avxXtsSm4DecTailDec:
  1308  	VMOVDQU (16*0)(SP), B0
  1309  	VPXOR TW, B0, B0
  1310  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1311  	VPXOR TW, B0, B0
  1312  	VMOVDQU B0, (16*0)(R8)
  1313  	JMP avxXtsSm4DecDone
  1314  
  1315  avxXtsSm4DecLastBlock:
  1316  	VMOVDQU (16*0)(DX), B0
  1317  	VPXOR TW, B0, B0
  1318  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1319  	VPXOR TW, B0, B0
  1320  	VMOVDQU B0, (16*0)(CX)
  1321  	avxMul2Inline
  1322  
  1323  avxXtsSm4DecDone:
  1324  	VMOVDQU TW, (16*0)(BX)
  1325  	RET
  1326  
  1327  avx2XtsSm4Dec:
  1328  	VMOVDQU gcmPoly<>(SB), POLY
  1329  	VMOVDQU (0*16)(BX), TW
  1330  	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
  1331  	VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
  1332  
  1333  avx2XtsSm4Dec16Blocks:
  1334  	CMPQ DI, $256
  1335  	JB avx2XtsSm4DecOctets
  1336  	SUBQ $256, DI
  1337  
  1338  	// prepare tweaks
  1339  	avxPrepare16Tweaks
  1340  	// load 16 blocks for decryption
  1341  	avx2Load16Blocks
  1342  	// Apply Byte Flip Mask: LE -> BE
  1343  	avx2LE2BE16Blocks
  1344  	// Transpose the 4x4 matrix of 32-bit words
  1345  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
  1346  	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
  1347  
  1348  	AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7)
  1349  
  1350  	// Transpose the 4x4 matrix of 32-bit words
  1351  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
  1352  	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
  1353  	avx2ByteSwap16Blocks
  1354  	avx2Store16Blocks
  1355  
  1356  	LEAQ 256(DX), DX
  1357  	LEAQ 256(CX), CX
  1358  
  1359  	JMP avx2XtsSm4Dec16Blocks
  1360  
  1361  avx2XtsSm4DecOctets:
  1362  	CMPQ DI, $128
  1363  	JB avx2XtsSm4DecNibbles
  1364  	SUBQ $128, DI
  1365  
  1366  	// prepare tweaks
  1367  	avxPrepare8Tweaks
  1368  	// load 8 blocks for decryption
  1369  	avx2Load8Blocks
  1370  	// Apply Byte Flip Mask: LE -> BE
  1371  	avx2LE2BE8Blocks
  1372  	// Transpose the 4x4 matrix of 32-bit words
  1373  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
  1374  
  1375  	AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3)
  1376  
  1377  	// Transpose the 4x4 matrix of 32-bit words
  1378  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
  1379  	avx2ByteSwap8Blocks
  1380  	avx2Store8Blocks
  1381  
  1382  	LEAQ 128(DX), DX
  1383  	LEAQ 128(CX), CX
  1384  
  1385  avx2XtsSm4DecNibbles:
  1386  	CMPQ DI, $64
  1387  	JB avx2XtsSm4DecSingles
  1388  	SUBQ $64, DI
  1389  
  1390  	// prepare tweaks
  1391  	avxPrepare4Tweaks
  1392  	// load 4 blocks for decryption
  1393  	avxLoad4Blocks
  1394  
  1395  	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1396  
  1397  	avxStore4Blocks
  1398  
  1399  	LEAQ 64(DX), DX
  1400  	LEAQ 64(CX), CX
  1401  
  1402  avx2XtsSm4DecSingles:
  1403  	CMPQ DI, $32
  1404  	JB avx2XtsSm4DecTail
  1405  	SUBQ $16, DI
  1406  
  1407  	// load 1 block for decryption
  1408  	VMOVDQU (16*0)(DX), B0
  1409  	
  1410  	VPXOR TW, B0, B0
  1411  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1412  	VPXOR TW, B0, B0
  1413  	VMOVDQU B0, (16*0)(CX)
  1414  	avxMul2Inline
  1415  
  1416  	LEAQ 16(DX), DX
  1417  	LEAQ 16(CX), CX
  1418  
  1419  	JMP avx2XtsSm4DecSingles
  1420  
  1421  avx2XtsSm4DecTail:
  1422  	TESTQ DI, DI
  1423  	JE avx2XtsSm4DecDone
  1424  
  1425  	CMPQ DI, $16
  1426  	JE avx2XtsSm4DecLastBlock
  1427  
  1428  	// length > 16
  1429  	// load 1 block for decryption
  1430  	VMOVDQU (16*0)(DX), B0
  1431  	VMOVDQU TW, B5
  1432  	avxMul2Inline
  1433  	VPXOR TW, B0, B0
  1434  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1435  	VPXOR TW, B0, B0
  1436  	VMOVDQU B0, (16*0)(CX)
  1437  	VMOVDQU B5, TW
  1438  
  1439  	SUBQ $16, DI
  1440  	LEAQ 16(DX), DX
  1441  	LEAQ 16(CX), CX
  1442  	LEAQ -16(CX), R8
  1443  	VMOVDQU B0, (16*0)(SP)
  1444  
  1445  	CMPQ DI, $8
  1446  	JB   avx2_loop_1b
  1447  	SUBQ  $8, DI
  1448  	MOVQ (DX)(DI*1), R9
  1449  	MOVQ (SP)(DI*1), R10
  1450  	MOVQ R9, (SP)(DI*1)
  1451  	MOVQ R10, (CX)(DI*1)
  1452  
  1453  	TESTQ DI, DI
  1454  	JE avx2XtsSm4DecTailDec
  1455  
  1456  avx2_loop_1b:
  1457  	SUBQ  $1, DI
  1458  	MOVB (DX)(DI*1), R9
  1459  	MOVB (SP)(DI*1), R10
  1460  	MOVB R9, (SP)(DI*1)
  1461  	MOVB R10, (CX)(DI*1)
  1462  	TESTQ DI, DI
  1463  	JNE   avx2_loop_1b
  1464  
  1465  avx2XtsSm4DecTailDec:
  1466  	VMOVDQU (16*0)(SP), B0
  1467  	VPXOR TW, B0, B0
  1468  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1469  	VPXOR TW, B0, B0
  1470  	VMOVDQU B0, (16*0)(R8)
  1471  	JMP avx2XtsSm4DecDone
  1472  
  1473  avx2XtsSm4DecLastBlock:
  1474  	VMOVDQU (16*0)(DX), B0
  1475  	VPXOR TW, B0, B0
  1476  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1477  	VPXOR TW, B0, B0
  1478  	VMOVDQU B0, (16*0)(CX)
  1479  	avxMul2Inline
  1480  
  1481  avx2XtsSm4DecDone:
  1482  	VMOVDQU TW, (16*0)(BX)
  1483  	VZEROUPPER
  1484  	RET
  1485  
  1486  // func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
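        // decryptSm4Xts with the reflected (gbGcmPoly) tweak doubling.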
  1487  TEXT ·decryptSm4XtsGB(SB),0,$256-64
  1488  	MOVQ xk+0(FP), AX
  1489  	MOVQ tweak+8(FP), BX
  1490  	MOVQ dst+16(FP), CX
  1491  	MOVQ src+40(FP), DX
  1492  	MOVQ src_len+48(FP), DI
  1493  
  1494  	CMPB ·useAVX2(SB), $1
  1495  	JE   avx2XtsSm4Dec
  1496  
  1497  	CMPB ·useAVX(SB), $1
  1498  	JE   avxXtsSm4Dec
  1499  
  1500  	MOVOU gbGcmPoly<>(SB), POLY
  1501  	MOVOU bswap_mask<>(SB), BSWAP
  1502  	MOVOU (0*16)(BX), TW
  1503  
  1504  xtsSm4DecOctets:
  1505  	CMPQ DI, $128
  1506  	JB xtsSm4DecNibbles
  1507  	SUBQ $128, DI
  1508  
  1509  	// prepare tweaks
  1510  	prepareGB8Tweaks
  1511  	// load 8 blocks for decryption
  1512  	sseLoad8Blocks
  1513  
  1514  	SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
  1515  
  1516  	sseStore8Blocks
  1517  
  1518  	LEAQ 128(DX), DX
  1519  	LEAQ 128(CX), CX
  1520  
  1521  	JMP xtsSm4DecOctets
  1522  
  1523  xtsSm4DecNibbles:
  1524  	CMPQ DI, $64
  1525  	JB xtsSm4DecSingles
  1526  	SUBQ $64, DI
  1527  
  1528  	// prepare tweaks
  1529  	prepareGB4Tweaks
  1530  	// load 4 blocks for decryption
  1531  	sseLoad4Blocks
  1532  
  1533  	SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1534  
  1535  	sseStore4Blocks
  1536  
  1537  	LEAQ 64(DX), DX
  1538  	LEAQ 64(CX), CX
  1539  
  1540  xtsSm4DecSingles:
  1541  	CMPQ DI, $32
  1542  	JB xtsSm4DecTail
  1543  	SUBQ $16, DI
  1544  
  1545  	// load 1 block for decryption
  1546  	MOVOU (16*0)(DX), B0
  1547  	
  1548  	PXOR TW, B0
  1549  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1550  	PXOR TW, B0
  1551  	MOVOU B0, (16*0)(CX)
  1552  	mul2GBInline
  1553  
  1554  	LEAQ 16(DX), DX
  1555  	LEAQ 16(CX), CX
  1556  
  1557  	JMP xtsSm4DecSingles
  1558  
  1559  xtsSm4DecTail:
  1560  	TESTQ DI, DI
  1561  	JE xtsSm4DecDone
  1562  
  1563  	CMPQ DI, $16
  1564  	JE xtsSm4DecLastBlock
  1565  
  1566  	// length > 16
  1567  	// load 1 block for decryption
  1568  	MOVOU (16*0)(DX), B0
  1569  	MOVOU TW, B5
  1570  	mul2GBInline
  1571  	PXOR TW, B0
  1572  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1573  	PXOR TW, B0
  1574  	MOVOU B0, (16*0)(CX)
  1575  	MOVOU B5, TW
  1576  
  1577  	SUBQ $16, DI
  1578  	LEAQ 16(DX), DX
  1579  	LEAQ 16(CX), CX
  1580  	LEAQ -16(CX), R8
  1581  	MOVOU B0, (16*0)(SP)
  1582  
  1583  	CMPQ DI, $8
  1584  	JB   loop_1b
  1585  	SUBQ  $8, DI
  1586  	MOVQ (DX)(DI*1), R9
  1587  	MOVQ (SP)(DI*1), R10
  1588  	MOVQ R9, (SP)(DI*1)
  1589  	MOVQ R10, (CX)(DI*1)
  1590  
  1591  	TESTQ DI, DI
  1592  	JE xtsSm4DecTailDec
  1593  
  1594  loop_1b:
  1595  	SUBQ  $1, DI
  1596  	MOVB (DX)(DI*1), R9
  1597  	MOVB (SP)(DI*1), R10
  1598  	MOVB R9, (SP)(DI*1)
  1599  	MOVB R10, (CX)(DI*1)
  1600  	TESTQ DI, DI
  1601  	JNE   loop_1b
  1602  
  1603  xtsSm4DecTailDec:
  1604  	MOVOU (16*0)(SP), B0
  1605  	PXOR TW, B0
  1606  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1607  	PXOR TW, B0
  1608  	MOVOU B0, (16*0)(R8)
  1609  	JMP xtsSm4DecDone
  1610  
  1611  xtsSm4DecLastBlock:
  1612  	MOVOU (16*0)(DX), B0
  1613  	PXOR TW, B0
  1614  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1615  	PXOR TW, B0
  1616  	MOVOU B0, (16*0)(CX)
  1617  	mul2GBInline
  1618  
  1619  xtsSm4DecDone:
  1620  	MOVOU TW, (16*0)(BX)
  1621  	RET
  1622  
  1623  avxXtsSm4Dec:
  1624  	VMOVDQU gbGcmPoly<>(SB), POLY
  1625  	VMOVDQU bswap_mask<>(SB), BSWAP	
  1626  	VMOVDQU (0*16)(BX), TW
  1627  
  1628  avxXtsSm4DecOctets:
  1629  	CMPQ DI, $128
  1630  	JB avxXtsSm4DecNibbles
  1631  	SUBQ $128, DI
  1632  
  1633  	// prepare tweaks
  1634  	avxPrepareGB8Tweaks
  1635  	// load 8 blocks for decryption
  1636  	avxLoad8Blocks
  1637  
  1638  	AVX_SM4_8BLOCKS(AX, X8, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)
  1639  
  1640  	avxStore8Blocks
  1641  
  1642  	LEAQ 128(DX), DX
  1643  	LEAQ 128(CX), CX
  1644  
  1645  	JMP avxXtsSm4DecOctets
  1646  
  1647  avxXtsSm4DecNibbles:
  1648  	CMPQ DI, $64
  1649  	JB avxXtsSm4DecSingles
  1650  	SUBQ $64, DI
  1651  
  1652  	// prepare tweaks
  1653  	avxPrepareGB4Tweaks
  1654  	// load 4 blocks for decryption
  1655  	avxLoad4Blocks
  1656  
  1657  	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1658  
  1659  	avxStore4Blocks
  1660  
  1661  	LEAQ 64(DX), DX
  1662  	LEAQ 64(CX), CX
  1663  
  1664  avxXtsSm4DecSingles:
  1665  	CMPQ DI, $32
  1666  	JB avxXtsSm4DecTail
  1667  	SUBQ $16, DI
  1668  
  1669  	// load 1 block for decryption
  1670  	VMOVDQU (16*0)(DX), B0
  1671  	
  1672  	VPXOR TW, B0, B0
  1673  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1674  	VPXOR TW, B0, B0
  1675  	VMOVDQU B0, (16*0)(CX)
  1676  	avxMul2GBInline
  1677  
  1678  	LEAQ 16(DX), DX
  1679  	LEAQ 16(CX), CX
  1680  
  1681  	JMP avxXtsSm4DecSingles
  1682  
  1683  avxXtsSm4DecTail:
  1684  	TESTQ DI, DI
  1685  	JE avxXtsSm4DecDone
  1686  
  1687  	CMPQ DI, $16
  1688  	JE avxXtsSm4DecLastBlock
  1689  
  1690  	// length > 16
  1691  	// load 1 block for decryption
  1692  	VMOVDQU (16*0)(DX), B0
  1693  	VMOVDQU TW, B5
  1694  	avxMul2GBInline
  1695  	VPXOR TW, B0, B0
  1696  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1697  	VPXOR TW, B0, B0
  1698  	VMOVDQU B0, (16*0)(CX)
  1699  	VMOVDQU B5, TW
  1700  
  1701  	SUBQ $16, DI
  1702  	LEAQ 16(DX), DX
  1703  	LEAQ 16(CX), CX
  1704  	LEAQ -16(CX), R8
  1705  	VMOVDQU B0, (16*0)(SP)
  1706  
  1707  	CMPQ DI, $8
  1708  	JB   avx_loop_1b
  1709  	SUBQ  $8, DI
  1710  	MOVQ (DX)(DI*1), R9
  1711  	MOVQ (SP)(DI*1), R10
  1712  	MOVQ R9, (SP)(DI*1)
  1713  	MOVQ R10, (CX)(DI*1)
  1714  
  1715  	TESTQ DI, DI
  1716  	JE avxXtsSm4DecTailDec
  1717  
  1718  avx_loop_1b:
  1719  	SUBQ  $1, DI
  1720  	MOVB (DX)(DI*1), R9
  1721  	MOVB (SP)(DI*1), R10
  1722  	MOVB R9, (SP)(DI*1)
  1723  	MOVB R10, (CX)(DI*1)
  1724  	TESTQ DI, DI
  1725  	JNE   avx_loop_1b
  1726  
  1727  avxXtsSm4DecTailDec:
  1728  	VMOVDQU (16*0)(SP), B0
  1729  	VPXOR TW, B0, B0
  1730  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1731  	VPXOR TW, B0, B0
  1732  	VMOVDQU B0, (16*0)(R8)
  1733  	JMP avxXtsSm4DecDone
  1734  
  1735  avxXtsSm4DecLastBlock:
  1736  	VMOVDQU (16*0)(DX), B0
  1737  	VPXOR TW, B0, B0
  1738  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1739  	VPXOR TW, B0, B0
  1740  	VMOVDQU B0, (16*0)(CX)
  1741  	avxMul2GBInline
  1742  
  1743  avxXtsSm4DecDone:
  1744  	VMOVDQU TW, (16*0)(BX)
  1745  	RET
  1746  
  1747  avx2XtsSm4Dec:
  1748  	VMOVDQU gbGcmPoly<>(SB), POLY
  1749  	VMOVDQU (0*16)(BX), TW
  1750  	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
  1751  	VBROADCASTI128 bswap_mask<>(SB), DWBSWAP
  1752  
  1753  avx2XtsSm4Dec16Blocks:
  1754  	CMPQ DI, $256
  1755  	JB avx2XtsSm4DecOctets
  1756  	SUBQ $256, DI
  1757  
  1758  	// prepare tweaks
  1759  	avxPrepareGB16Tweaks
  1760  	// load 16 blocks for decryption
  1761  	avx2Load16Blocks
  1762  	// Apply Byte Flip Mask: LE -> BE
  1763  	avx2LE2BE16Blocks
  1764  	// Transpose the 4x4 matrix of 32-bit words
  1765  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
  1766  	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
  1767  
  1768  	AVX2_SM4_16BLOCKS(AX, Y8, Y9, X8, X9, Y11, Y12, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7)
  1769  
  1770  	// Transpose the 4x4 matrix of 32-bit words
  1771  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
  1772  	TRANSPOSE_MATRIX(Y4, Y5, Y6, Y7, Y8, Y9)
  1773  	avx2ByteSwap16Blocks
  1774  	avx2Store16Blocks
  1775  
  1776  	LEAQ 256(DX), DX
  1777  	LEAQ 256(CX), CX
  1778  
  1779  	JMP avx2XtsSm4Dec16Blocks
  1780  
  1781  avx2XtsSm4DecOctets:
  1782  	CMPQ DI, $128
  1783  	JB avx2XtsSm4DecNibbles
  1784  	SUBQ $128, DI
  1785  
  1786  	// prepare tweaks
  1787  	avxPrepareGB8Tweaks
  1788  	// load 8 blocks for decryption
  1789  	avx2Load8Blocks
  1790  	// Apply Byte Flip Mask: LE -> BE
  1791  	avx2LE2BE8Blocks
  1792  	// Transpose the 4x4 matrix of 32-bit words
  1793  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
  1794  
  1795  	AVX2_SM4_8BLOCKS(AX, Y8, Y9, X8, X9, Y7, Y0, Y1, Y2, Y3)
  1796  
  1797  	// Transpose the 4x4 matrix of 32-bit words
  1798  	TRANSPOSE_MATRIX(Y0, Y1, Y2, Y3, Y8, Y9)
  1799  	avx2ByteSwap8Blocks
  1800  	avx2Store8Blocks
  1801  
  1802  	LEAQ 128(DX), DX
  1803  	LEAQ 128(CX), CX
  1804  
  1805  avx2XtsSm4DecNibbles:
  1806  	CMPQ DI, $64
  1807  	JB avx2XtsSm4DecSingles
  1808  	SUBQ $64, DI
  1809  
  1810  	// prepare tweaks
  1811  	avxPrepareGB4Tweaks
  1812  	// load 4 blocks for decryption
  1813  	avxLoad4Blocks
  1814  
  1815  	AVX_SM4_4BLOCKS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1816  
  1817  	avxStore4Blocks
  1818  
  1819  	LEAQ 64(DX), DX
  1820  	LEAQ 64(CX), CX
  1821  
  1822  avx2XtsSm4DecSingles:
  1823  	CMPQ DI, $32
  1824  	JB avx2XtsSm4DecTail
  1825  	SUBQ $16, DI
  1826  
  1827  	// load 1 block for decryption
  1828  	VMOVDQU (16*0)(DX), B0
  1829  	
  1830  	VPXOR TW, B0, B0
  1831  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1832  	VPXOR TW, B0, B0
  1833  	VMOVDQU B0, (16*0)(CX)
  1834  	avxMul2GBInline
  1835  
  1836  	LEAQ 16(DX), DX
  1837  	LEAQ 16(CX), CX
  1838  
  1839  	JMP avx2XtsSm4DecSingles
  1840  
  1841  avx2XtsSm4DecTail:
  1842  	TESTQ DI, DI
  1843  	JE avx2XtsSm4DecDone
  1844  
  1845  	CMPQ DI, $16
  1846  	JE avx2XtsSm4DecLastBlock
  1847  
  1848  	// length > 16
  1849  	// load 1 block for decryption
  1850  	VMOVDQU (16*0)(DX), B0
  1851  	VMOVDQU TW, B5
  1852  	avxMul2GBInline
  1853  	VPXOR TW, B0, B0
  1854  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1855  	VPXOR TW, B0, B0
  1856  	VMOVDQU B0, (16*0)(CX)
  1857  	VMOVDQU B5, TW
  1858  
  1859  	SUBQ $16, DI
  1860  	LEAQ 16(DX), DX
  1861  	LEAQ 16(CX), CX
  1862  	LEAQ -16(CX), R8
  1863  	VMOVDQU B0, (16*0)(SP)
  1864  
  1865  	CMPQ DI, $8
  1866  	JB   avx2_loop_1b
  1867  	SUBQ  $8, DI
  1868  	MOVQ (DX)(DI*1), R9
  1869  	MOVQ (SP)(DI*1), R10
  1870  	MOVQ R9, (SP)(DI*1)
  1871  	MOVQ R10, (CX)(DI*1)
  1872  
  1873  	TESTQ DI, DI
  1874  	JE avx2XtsSm4DecTailDec
  1875  
  1876  avx2_loop_1b:
  1877  	SUBQ  $1, DI
  1878  	MOVB (DX)(DI*1), R9
  1879  	MOVB (SP)(DI*1), R10
  1880  	MOVB R9, (SP)(DI*1)
  1881  	MOVB R10, (CX)(DI*1)
  1882  	TESTQ DI, DI
  1883  	JNE   avx2_loop_1b
  1884  
  1885  avx2XtsSm4DecTailDec:
  1886  	VMOVDQU (16*0)(SP), B0
  1887  	VPXOR TW, B0, B0
  1888  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1889  	VPXOR TW, B0, B0
  1890  	VMOVDQU B0, (16*0)(R8)
  1891  	JMP avx2XtsSm4DecDone
  1892  
  1893  avx2XtsSm4DecLastBlock:
  1894  	VMOVDQU (16*0)(DX), B0
  1895  	VPXOR TW, B0, B0
  1896  	SM4_SINGLE_BLOCK(AX, B4, T0, T1, T2, B0, B1, B2, B3)
  1897  	VPXOR TW, B0, B0
  1898  	VMOVDQU B0, (16*0)(CX)
  1899  	avxMul2GBInline
  1900  
  1901  avx2XtsSm4DecDone:
  1902  	VMOVDQU TW, (16*0)(BX)
  1903  	VZEROUPPER
  1904  	RET