github.com/emmansun/gmsm@v0.29.1/sm4/cbc_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  #include "aesni_macros_amd64.s"
     5  
     6  
        // Register aliases used by decryptBlocksChain below and handed to the
        // SM4 / transpose macros it expands. The *DWORD* names are 256-bit Y
        // registers, the *WORD* names are the 128-bit X registers with the same
        // register numbers (so XWORDn is the low half of XDWORDn).
        //
        // 256-bit temporaries passed to the macros.
     7  #define XDWTMP0 Y0
     8  #define XDWTMP1 Y1
     9  
        // 256-bit data registers, first group of four.
    10  #define XDWORD0 Y4
    11  #define XDWORD1 Y5
    12  #define XDWORD2 Y6
    13  #define XDWORD3 Y7
    14  
        // 256-bit data registers, second group of four (note: Y14, not Y13,
        // because Y13 is BYTE_FLIP_MASK).
    15  #define XDWORD4 Y10
    16  #define XDWORD5 Y11
    17  #define XDWORD6 Y12
    18  #define XDWORD7 Y14
    19  
        // 128-bit temporaries. NOTE: XWTMP2 (X2) is the low half of
        // BSWAP_MASK (Y2), and XWTMP0/XWTMP1 are the low halves of
        // XDWTMP0/XDWTMP1.
    20  #define XWTMP0 X0
    21  #define XWTMP1 X1
    22  #define XWTMP2 X2
    23  
        // 128-bit data registers, first group of four.
    24  #define XWORD0 X4
    25  #define XWORD1 X5
    26  #define XWORD2 X6
    27  #define XWORD3 X7
    28  
        // 128-bit data registers, second group of four.
    29  #define XWORD4 X10
    30  #define XWORD5 X11
    31  #define XWORD6 X12
    32  #define XWORD7 X14
    33  
        // Nibble mask register (256-bit and 128-bit views of register 3).
    34  #define NIBBLE_MASK Y3
    35  #define X_NIBBLE_MASK X3
    36  
    37  #define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
    38  #define X_BYTE_FLIP_MASK 	X13 // mask to convert LE -> BE
    39  
        // Byte-swap shuffle mask applied to the cipher output in the AVX2 path.
    40  #define BSWAP_MASK Y2
    41  
        // Extra 256-bit registers passed to the AVX2 SM4 macros.
    42  #define XDWORD Y8
    43  #define YDWORD Y9
    44  
        // Extra 128-bit registers passed to every SM4 macro (low halves of
        // XDWORD / YDWORD).
    45  #define XWORD X8
    46  #define YWORD X9
    47  
    48  // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
        //
        // Runs the SM4 block macros over src (key material pointer xk is passed
        // to them in AX) and writes the result to dst. Every output block is
        // XORed with the 16 source bytes that precede its input block; the
        // first block is XORed with the 16 bytes at iv. On return the 16 bytes
        // at iv have been overwritten with the last 16 bytes of src.
        //
        // The data is walked from the tail towards the head. The last source
        // block is captured in X15 before anything is stored.
        // NOTE(review): presumably this ordering is what allows dst to alias
        // src; confirm against the Go caller.
        //
        // Register use after the prologue:
        //   AX = xk, BX = one past the end of dst, DX = one past the end of src,
        //   DI = bytes still to process, SI = iv, X15 = saved last source block.
        //
        // NOTE(review): no length checks are made here. The code appears to
        // assume src_len is a non-zero multiple of 16 and that dst is at least
        // as long as src; verify against the caller.
        //
        // Dispatches to the AVX2 or AVX implementation when the corresponding
        // package flag byte equals 1; otherwise the SSE path below is used.
    49  TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
    50  	MOVQ xk+0(FP), AX
    51  	MOVQ dst+8(FP), BX
    52  	MOVQ src+32(FP), DX
    53  	MOVQ src_len+40(FP), DI
    54  	MOVQ iv+56(FP), SI
    55  
        	// Point DX / BX one past the end of src / dst (both advanced by src_len).
    56  	LEAQ (DX)(DI*1), DX
    57  	LEAQ (BX)(DI*1), BX
    58  
    59  	CMPB ·useAVX2(SB), $1
    60  	JE   avx2Start
    61  
    62  	CMPB ·useAVX(SB), $1
    63  	JE   avxStart
    64  
        	// Save the last 16 source bytes before any store to dst; they are
        	// written through iv at cbcSm4Done.
    65  	MOVOU -16(DX), X15
    66  
        // SSE path, 8 blocks (128 bytes) per iteration while DI > 128.
    67  cbcSm4Octets:
    68  	CMPQ DI, $128
    69  	JLE cbcSm4Nibbles
    70  	SUBQ $128, DI
    71  	LEAQ -128(DX), DX
    72  	LEAQ -128(BX), BX
    73  
    74  	MOVOU 0(DX), XWORD0
    75  	MOVOU 16(DX), XWORD1
    76  	MOVOU 32(DX), XWORD2
    77  	MOVOU 48(DX), XWORD3
    78  	MOVOU 64(DX), XWORD4
    79  	MOVOU 80(DX), XWORD5
    80  	MOVOU 96(DX), XWORD6
    81  	MOVOU 112(DX), XWORD7
    82  
    83  	SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
    84  
        	// XOR each result with the 16 source bytes preceding its input block.
        	// DI was > 128 before the SUBQ, so source bytes remain in front of
        	// this chunk and -16(DX) is still inside src (given whole blocks).
        	// All of these reads happen before the stores below.
    85  	MOVOU -16(DX), XWTMP0
    86  	PXOR XWTMP0, XWORD0
    87  	MOVOU 0(DX), XWTMP0
    88  	PXOR XWTMP0, XWORD1
    89  	MOVOU 16(DX), XWTMP0
    90  	PXOR XWTMP0, XWORD2
    91  	MOVOU 32(DX), XWTMP0
    92  	PXOR XWTMP0, XWORD3
    93  	MOVOU 48(DX), XWTMP0
    94  	PXOR XWTMP0, XWORD4
    95  	MOVOU 64(DX), XWTMP0
    96  	PXOR XWTMP0, XWORD5
    97  	MOVOU 80(DX), XWTMP0
    98  	PXOR XWTMP0, XWORD6
    99  	MOVOU 96(DX), XWTMP0
   100  	PXOR XWTMP0, XWORD7
   101  
   102  	MOVOU XWORD0, 0(BX)
   103  	MOVOU XWORD1, 16(BX)
   104  	MOVOU XWORD2, 32(BX)
   105  	MOVOU XWORD3, 48(BX)
   106  	MOVOU XWORD4, 64(BX)
   107  	MOVOU XWORD5, 80(BX)
   108  	MOVOU XWORD6, 96(BX)
   109  	MOVOU XWORD7, 112(BX)		
   110  
   111  	JMP cbcSm4Octets
   112  
        // At most 128 bytes remain. If more than 64 remain, handle one group of
        // 4 blocks here and fall through to the tail with 0 < DI <= 64.
   113  cbcSm4Nibbles:
   114  	CMPQ DI, $64
   115  	JLE cbCSm4Single
   116  	SUBQ $64, DI
   117  	LEAQ -64(DX), DX
   118  	LEAQ -64(BX), BX
   119  
   120  	MOVOU 0(DX), XWORD0
   121  	MOVOU 16(DX), XWORD1
   122  	MOVOU 32(DX), XWORD2
   123  	MOVOU 48(DX), XWORD3
   124  
   125  	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   126  
        	// Same chaining XOR as in the 8-block loop: previous 16 source bytes.
   127  	MOVUPS -16(DX), XWTMP0
   128  	PXOR XWTMP0, XWORD0
   129  	MOVUPS 0(DX), XWTMP0
   130  	PXOR XWTMP0, XWORD1
   131  	MOVUPS 16(DX), XWTMP0
   132  	PXOR XWTMP0, XWORD2
   133  	MOVUPS 32(DX), XWTMP0
   134  	PXOR XWTMP0, XWORD3
   135  
   136  	MOVUPS XWORD0, 0(BX)
   137  	MOVUPS XWORD1, 16(BX)
   138  	MOVUPS XWORD2, 32(BX)
   139  	MOVUPS XWORD3, 48(BX)
   140  
        // Tail: the remaining DI bytes are the head of the buffer and end at
        // DX / BX, so they are addressed with negative offsets. The very first
        // block is XORed with the 16 bytes at (SI). DI == 16 / 32 / 48 branch
        // away; any other value falls through to the 64-byte case.
        // NOTE(review): the fall-through assumes DI == 64 — confirm the caller
        // never passes a length that leaves another remainder.
   141  cbCSm4Single:
   142  	CMPQ DI, $16
   143  	JEQ cbcSm4Single16
   144  
   145  	CMPQ DI, $32
   146  	JEQ cbcSm4Single32
   147  
   148  	CMPQ DI, $48
   149  	JEQ cbcSm4Single48
   150  
   151  	MOVOU -64(DX), XWORD0
   152  	MOVOU -48(DX), XWORD1
   153  	MOVOU -32(DX), XWORD2
   154  	MOVOU -16(DX), XWORD3
   155  
   156  	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   157  
   158  	MOVUPS 0(SI), XWTMP0
   159  	PXOR XWTMP0, XWORD0
   160  	MOVUPS -64(DX), XWTMP0
   161  	PXOR XWTMP0, XWORD1
   162  	MOVUPS -48(DX), XWTMP0
   163  	PXOR XWTMP0, XWORD2
   164  	MOVUPS -32(DX), XWTMP0
   165  	PXOR XWTMP0, XWORD3
   166  
   167  	MOVUPS XWORD0, -64(BX)
   168  	MOVUPS XWORD1, -48(BX)
   169  	MOVUPS XWORD2, -32(BX)
   170  	MOVUPS XWORD3, -16(BX)
   171  
   172  	JMP cbcSm4Done
   173  
        // Exactly one block left: single-block macro, XOR with the bytes at (SI).
   174  cbcSm4Single16:
   175  	MOVOU -16(DX), XWORD0
   176  
   177  	SM4_SINGLE_BLOCK(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   178  
   179  	MOVUPS 0(SI), XWTMP0
   180  	PXOR XWTMP0, XWORD0
   181  
   182  	MOVUPS XWORD0, -16(BX)
   183  
   184  	JMP cbcSm4Done
   185  
        // Two blocks left. The 4-block macro is used with only XWORD0/XWORD1
        // loaded; whatever it leaves in XWORD2/XWORD3 is never stored.
   186  cbcSm4Single32:
   187  	MOVOU -32(DX), XWORD0
   188  	MOVOU -16(DX), XWORD1
   189  
   190  	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   191  
   192  	MOVUPS 0(SI), XWTMP0
   193  	PXOR XWTMP0, XWORD0
   194  	MOVUPS -32(DX), XWTMP0
   195  	PXOR XWTMP0, XWORD1
   196  
   197  	MOVUPS XWORD0, -32(BX)
   198  	MOVUPS XWORD1, -16(BX)
   199  
   200  	JMP cbcSm4Done
   201  
        // Three blocks left. Only XWORD0..XWORD2 are loaded and stored; the
        // XWORD3 result of the 4-block macro is discarded.
   202  cbcSm4Single48:
   203  	MOVOU -48(DX), XWORD0
   204  	MOVOU -32(DX), XWORD1
   205  	MOVOU -16(DX), XWORD2
   206  
   207  	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   208  
   209  	MOVUPS 0(SI), XWTMP0
   210  	PXOR XWTMP0, XWORD0
   211  	MOVUPS -48(DX), XWTMP0
   212  	PXOR XWTMP0, XWORD1
   213  	MOVUPS -32(DX), XWTMP0
   214  	PXOR XWTMP0, XWORD2
   215  
   216  	MOVUPS XWORD0, -48(BX)
   217  	MOVUPS XWORD1, -32(BX)
   218  	MOVUPS XWORD2, -16(BX)
   219  	
        // Write the saved last source block (X15) over the 16 bytes at iv.
   220  cbcSm4Done:
   221  	MOVUPS X15, (SI)
   222  	RET
   223  
   224  avxStart:
        	// AVX (128-bit, VEX-encoded) path of decryptBlocksChain, reached when
        	// the useAVX flag byte is 1. On entry: AX = xk, BX / DX = one past the
        	// end of dst / src, DI = byte count, SI = iv.
        	// The structure is the same tail-to-head walk as the SSE path, but the
        	// chaining XOR uses VPXOR with a memory operand instead of a load
        	// into a temporary.
        	//
        	// Save the last 16 source bytes before any store to dst; they are
        	// written through iv at avxCbcSm4Done.
   225  	VMOVDQU -16(DX), X15
   226  
        // 8 blocks (128 bytes) per iteration while DI > 128.
   227  avxCbcSm4Octets:
   228  	CMPQ DI, $128
   229  	JLE avxCbcSm4Nibbles
   230  	SUBQ $128, DI
   231  	LEAQ -128(DX), DX
   232  	LEAQ -128(BX), BX
   233  
   234  	VMOVDQU 0(DX), XWORD0
   235  	VMOVDQU 16(DX), XWORD1
   236  	VMOVDQU 32(DX), XWORD2
   237  	VMOVDQU 48(DX), XWORD3
   238  	VMOVDQU 64(DX), XWORD4
   239  	VMOVDQU 80(DX), XWORD5
   240  	VMOVDQU 96(DX), XWORD6
   241  	VMOVDQU 112(DX), XWORD7
   242  
   243  	AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
   244  	
        	// XOR each result with the 16 source bytes preceding its input block.
        	// DI was > 128 before the SUBQ, so -16(DX) is still inside src (given
        	// whole blocks). The reads happen before the stores below.
   245  	VPXOR -16(DX), XWORD0, XWORD0
   246  	VPXOR 0(DX), XWORD1, XWORD1
   247  	VPXOR 16(DX), XWORD2, XWORD2
   248  	VPXOR 32(DX), XWORD3, XWORD3
   249  	VPXOR 48(DX), XWORD4, XWORD4
   250  	VPXOR 64(DX), XWORD5, XWORD5
   251  	VPXOR 80(DX), XWORD6, XWORD6
   252  	VPXOR 96(DX), XWORD7, XWORD7
   253  
   254  	VMOVDQU XWORD0, 0(BX)
   255  	VMOVDQU XWORD1, 16(BX)
   256  	VMOVDQU XWORD2, 32(BX)
   257  	VMOVDQU XWORD3, 48(BX)
   258  	VMOVDQU XWORD4, 64(BX)
   259  	VMOVDQU XWORD5, 80(BX)
   260  	VMOVDQU XWORD6, 96(BX)
   261  	VMOVDQU XWORD7, 112(BX)	
   262  
   263  	JMP avxCbcSm4Octets
   264  
        // At most 128 bytes remain. If more than 64 remain, handle one group of
        // 4 blocks here and fall through to the tail with 0 < DI <= 64.
   265  avxCbcSm4Nibbles:
   266  	CMPQ DI, $64
   267  	JLE avxCbCSm4Single
   268  	SUBQ $64, DI
   269  	LEAQ -64(DX), DX
   270  	LEAQ -64(BX), BX
   271  
   272  	VMOVDQU 0(DX), XWORD0
   273  	VMOVDQU 16(DX), XWORD1
   274  	VMOVDQU 32(DX), XWORD2
   275  	VMOVDQU 48(DX), XWORD3
   276  
   277  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   278  
   279  	VPXOR -16(DX), XWORD0, XWORD0
   280  	VPXOR 0(DX), XWORD1, XWORD1
   281  	VPXOR 16(DX), XWORD2, XWORD2
   282  	VPXOR 32(DX), XWORD3, XWORD3
   283  
   284  	VMOVDQU XWORD0, 0(BX)
   285  	VMOVDQU XWORD1, 16(BX)
   286  	VMOVDQU XWORD2, 32(BX)
   287  	VMOVDQU XWORD3, 48(BX)
   288  
        // Tail: the remaining DI bytes end at DX / BX and are addressed with
        // negative offsets; the first block is XORed with the 16 bytes at (SI).
        // DI == 16 / 32 / 48 branch away; any other value falls through to the
        // 64-byte case.
        // NOTE(review): the fall-through assumes DI == 64 — confirm with caller.
   289  avxCbCSm4Single:
   290  	CMPQ DI, $16
   291  	JEQ avxCbcSm4Single16
   292  
   293  	CMPQ DI, $32
   294  	JEQ avxCbcSm4Single32
   295  
   296  	CMPQ DI, $48
   297  	JEQ avxCbcSm4Single48
   298  
   299  	VMOVDQU -64(DX), XWORD0
   300  	VMOVDQU -48(DX), XWORD1
   301  	VMOVDQU -32(DX), XWORD2
   302  	VMOVDQU -16(DX), XWORD3
   303  
   304  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   305  
   306  	VPXOR 0(SI), XWORD0, XWORD0
   307  	VPXOR -64(DX), XWORD1, XWORD1
   308  	VPXOR -48(DX), XWORD2, XWORD2
   309  	VPXOR -32(DX), XWORD3, XWORD3
   310  
   311  	VMOVDQU XWORD0, -64(BX)
   312  	VMOVDQU XWORD1, -48(BX)
   313  	VMOVDQU XWORD2, -32(BX)
   314  	VMOVDQU XWORD3, -16(BX)
   315  
   316  	JMP avxCbcSm4Done
   317  
        // One block left. This path still expands the 4-block macro; only the
        // XWORD0 result is stored.
   318  avxCbcSm4Single16:
   319  	VMOVDQU -16(DX), XWORD0
   320  
   321  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   322  
   323  	VPXOR 0(SI), XWORD0, XWORD0
   324  
   325  	VMOVDQU XWORD0, -16(BX)
   326  
   327  	JMP avxCbcSm4Done
   328  
        // Two blocks left; the XWORD2/XWORD3 results are discarded.
   329  avxCbcSm4Single32:
   330  	VMOVDQU -32(DX), XWORD0
   331  	VMOVDQU -16(DX), XWORD1
   332  
   333  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   334  
   335  	VPXOR 0(SI), XWORD0, XWORD0
   336  	VPXOR -32(DX), XWORD1, XWORD1
   337  
   338  	VMOVDQU XWORD0, -32(BX)
   339  	VMOVDQU XWORD1, -16(BX)
   340  
   341  	JMP avxCbcSm4Done
   342  
        // Three blocks left; the XWORD3 result is discarded.
   343  avxCbcSm4Single48:
   344  	VMOVDQU -48(DX), XWORD0
   345  	VMOVDQU -32(DX), XWORD1
   346  	VMOVDQU -16(DX), XWORD2
   347  
   348  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   349  
   350  	VPXOR 0(SI), XWORD0, XWORD0
   351  	VPXOR -48(DX), XWORD1, XWORD1
   352  	VPXOR -32(DX), XWORD2, XWORD2
   353  
   354  	VMOVDQU XWORD0, -48(BX)
   355  	VMOVDQU XWORD1, -32(BX)
   356  	VMOVDQU XWORD2, -16(BX)
   357  	
        // Write the saved last source block (X15) over the 16 bytes at iv.
   358  avxCbcSm4Done:
   359  	VMOVDQU X15, (SI)
   360  	RET
   361  
   362  avx2Start:
        	// AVX2 path of decryptBlocksChain, reached when the useAVX2 flag byte
        	// is 1. On entry: AX = xk, BX / DX = one past the end of dst / src,
        	// DI = byte count, SI = iv.
        	// Large chunks are handled 16 and then 8 blocks at a time in 256-bit
        	// registers (two 16-byte blocks per register); what is left is handled
        	// with the 128-bit AVX macros.
        	//
        	// Replicate the three 128-bit constants into both halves of their
        	// 256-bit registers.
        	// NOTE(review): NIBBLE_MASK is not referenced again in this function;
        	// presumably the AVX2 macros read it implicitly — confirm in
        	// aesni_macros_amd64.s.
   363  	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
   364  	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
   365  	VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
   366  
        	// Save the last 16 source bytes before any store to dst; they are
        	// written through iv at avx2CbcSm4Done.
   367  	VMOVDQU -16(DX), X15
   368  
        // 16 blocks (256 bytes) per iteration while DI > 256.
   369  avx2_16blocks:
   370  	CMPQ DI, $256
   371  	JLE avx2CbcSm4Octets
   372  	SUBQ $256, DI
   373  	LEAQ -256(DX), DX
   374  	LEAQ -256(BX), BX
   375  
   376  	VMOVDQU 0(DX), XDWORD0
   377  	VMOVDQU 32(DX), XDWORD1
   378  	VMOVDQU 64(DX), XDWORD2
   379  	VMOVDQU 96(DX), XDWORD3
   380  	VMOVDQU 128(DX), XDWORD4
   381  	VMOVDQU 160(DX), XDWORD5
   382  	VMOVDQU 192(DX), XDWORD6
   383  	VMOVDQU 224(DX), XDWORD7
   384  
   385  	// Apply Byte Flip Mask: LE -> BE
   386  	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
   387  	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
   388  	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
   389  	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
   390  	VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
   391  	VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
   392  	VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
   393  	VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
   394  
   395  	// Transpose matrix 4 x 4 32bits word
   396  	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
   397  	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
   398  
   399  	AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
   400  
   401  	// Transpose matrix 4 x 4 32bits word
   402  	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
   403  	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
   404  
        	// Shuffle the output bytes with the bswap mask before chaining.
   405  	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
   406  	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
   407  	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
   408  	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
   409    	VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4
   410  	VPSHUFB BSWAP_MASK, XDWORD5, XDWORD5
   411  	VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6
   412  	VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7
   413  
        	// Chaining XOR: each 32-byte register was loaded from offset k and is
        	// XORed with the 32 source bytes at offset k-16, i.e. every block
        	// with the 16 source bytes preceding it. DI was > 256 before the
        	// SUBQ, so -16(DX) is still inside src (given whole blocks). The
        	// reads happen before the stores below.
   414  	VPXOR -16(DX), XDWORD0, XDWORD0
   415  	VPXOR 16(DX), XDWORD1, XDWORD1
   416  	VPXOR 48(DX), XDWORD2, XDWORD2
   417  	VPXOR 80(DX), XDWORD3, XDWORD3
   418  	VPXOR 112(DX), XDWORD4, XDWORD4
   419  	VPXOR 144(DX), XDWORD5, XDWORD5
   420  	VPXOR 176(DX), XDWORD6, XDWORD6
   421  	VPXOR 208(DX), XDWORD7, XDWORD7
   422  
   423  	VMOVDQU XDWORD0, 0(BX)
   424  	VMOVDQU XDWORD1, 32(BX)
   425  	VMOVDQU XDWORD2, 64(BX)
   426  	VMOVDQU XDWORD3, 96(BX)
   427  	VMOVDQU XDWORD4, 128(BX)
   428  	VMOVDQU XDWORD5, 160(BX)
   429  	VMOVDQU XDWORD6, 192(BX)
   430  	VMOVDQU XDWORD7, 224(BX)
   431  
   432  	JMP avx2_16blocks
   433  
        // 8 blocks (128 bytes) per iteration in four 256-bit registers while
        // DI > 128; same flip / transpose / cipher / transpose / bswap / XOR
        // sequence as the 16-block loop.
   434  avx2CbcSm4Octets:
   435  	CMPQ DI, $128
   436  	JLE avx2CbcSm4Nibbles
   437  	SUBQ $128, DI
   438  	LEAQ -128(DX), DX
   439  	LEAQ -128(BX), BX
   440  
   441  	VMOVDQU 0(DX), XDWORD0
   442  	VMOVDQU 32(DX), XDWORD1
   443  	VMOVDQU 64(DX), XDWORD2
   444  	VMOVDQU 96(DX), XDWORD3
   445  
   446  	// Apply Byte Flip Mask: LE -> BE
   447  	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
   448  	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
   449  	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
   450  	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
   451  
   452  	// Transpose matrix 4 x 4 32bits word
   453  	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
   454  
   455  	AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   456  
   457  	// Transpose matrix 4 x 4 32bits word
   458  	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
   459  
   460  	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
   461  	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
   462  	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
   463  	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
   464  	
        	// Chaining XOR with the source shifted back by 16 bytes.
   465  	VPXOR -16(DX), XDWORD0, XDWORD0
   466  	VPXOR 16(DX), XDWORD1, XDWORD1
   467  	VPXOR 48(DX), XDWORD2, XDWORD2
   468  	VPXOR 80(DX), XDWORD3, XDWORD3
   469  
   470  	VMOVDQU XDWORD0, 0(BX)
   471  	VMOVDQU XDWORD1, 32(BX)
   472  	VMOVDQU XDWORD2, 64(BX)
   473  	VMOVDQU XDWORD3, 96(BX)
   474  
   475  	JMP avx2CbcSm4Octets
   476  
        // At most 128 bytes remain. From here on the 128-bit AVX macros are
        // used. If more than 64 bytes remain, handle one group of 4 blocks and
        // fall through to the tail with 0 < DI <= 64.
   477  avx2CbcSm4Nibbles:
   478  	CMPQ DI, $64
   479  	JLE avx2CbCSm4Single
   480  	SUBQ $64, DI
   481  	LEAQ -64(DX), DX
   482  	LEAQ -64(BX), BX
   483  
   484  	VMOVDQU 0(DX), XWORD0
   485  	VMOVDQU 16(DX), XWORD1
   486  	VMOVDQU 32(DX), XWORD2
   487  	VMOVDQU 48(DX), XWORD3
   488  
   489  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   490  
   491  	VPXOR -16(DX), XWORD0, XWORD0
   492  	VPXOR 0(DX), XWORD1, XWORD1
   493  	VPXOR 16(DX), XWORD2, XWORD2
   494  	VPXOR 32(DX), XWORD3, XWORD3
   495  
   496  	VMOVDQU XWORD0, 0(BX)
   497  	VMOVDQU XWORD1, 16(BX)
   498  	VMOVDQU XWORD2, 32(BX)
   499  	VMOVDQU XWORD3, 48(BX)
   500  
        // Tail: the remaining DI bytes end at DX / BX and are addressed with
        // negative offsets; the first block is XORed with the 16 bytes at (SI).
        // DI == 16 / 32 / 48 branch away; any other value falls through to the
        // 64-byte case.
        // NOTE(review): the fall-through assumes DI == 64 — confirm with caller.
   501  avx2CbCSm4Single:
   502  	CMPQ DI, $16
   503  	JEQ avx2CbcSm4Single16
   504  
   505  	CMPQ DI, $32
   506  	JEQ avx2CbcSm4Single32
   507  
   508  	CMPQ DI, $48
   509  	JEQ avx2CbcSm4Single48
   510  
   511  	VMOVDQU -64(DX), XWORD0
   512  	VMOVDQU -48(DX), XWORD1
   513  	VMOVDQU -32(DX), XWORD2
   514  	VMOVDQU -16(DX), XWORD3
   515  
   516  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   517  
   518  	VPXOR 0(SI), XWORD0, XWORD0
   519  	VPXOR -64(DX), XWORD1, XWORD1
   520  	VPXOR -48(DX), XWORD2, XWORD2
   521  	VPXOR -32(DX), XWORD3, XWORD3
   522  
   523  	VMOVDQU XWORD0, -64(BX)
   524  	VMOVDQU XWORD1, -48(BX)
   525  	VMOVDQU XWORD2, -32(BX)
   526  	VMOVDQU XWORD3, -16(BX)
   527  
   528  	JMP avx2CbcSm4Done
   529  
        // One block left. The 4-block macro is still expanded; only the XWORD0
        // result is stored.
   530  avx2CbcSm4Single16:
   531  	VMOVDQU -16(DX), XWORD0
   532  
   533  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   534  
   535  	VPXOR 0(SI), XWORD0, XWORD0
   536  
   537  	VMOVDQU XWORD0, -16(BX)
   538  
   539  	JMP avx2CbcSm4Done
   540  
        // Two blocks left; the XWORD2/XWORD3 results are discarded.
   541  avx2CbcSm4Single32:
   542  	VMOVDQU -32(DX), XWORD0
   543  	VMOVDQU -16(DX), XWORD1
   544  
   545  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   546  
   547  	VPXOR 0(SI), XWORD0, XWORD0
   548  	VPXOR -32(DX), XWORD1, XWORD1
   549  
   550  	VMOVDQU XWORD0, -32(BX)
   551  	VMOVDQU XWORD1, -16(BX)
   552  
   553  	JMP avx2CbcSm4Done
   554  
        // Three blocks left; the XWORD3 result is discarded.
   555  avx2CbcSm4Single48:
   556  	VMOVDQU -48(DX), XWORD0
   557  	VMOVDQU -32(DX), XWORD1
   558  	VMOVDQU -16(DX), XWORD2
   559  
   560  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   561  
   562  	VPXOR 0(SI), XWORD0, XWORD0
   563  	VPXOR -48(DX), XWORD1, XWORD1
   564  	VPXOR -32(DX), XWORD2, XWORD2
   565  
   566  	VMOVDQU XWORD0, -48(BX)
   567  	VMOVDQU XWORD1, -32(BX)
   568  	VMOVDQU XWORD2, -16(BX)
   569  	
        // Write the saved last source block (X15) over the 16 bytes at iv, then
        // zero the upper halves of the YMM registers before returning.
   570  avx2CbcSm4Done:
   571  	VMOVDQU X15, (SI)
   572  	VZEROUPPER
   573  	RET