github.com/emmansun/gmsm@v0.29.1/sm4/xts_arm64.s

//go:build !purego

#include "textflag.h"

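// Vector register usage (aliases for V0-V29), as used by this file:
//   B0-B7   block state, one 128-bit block per register outside the round loops
//   T0-T7   per-block tweak values, set up by the prepare*Tweaks macros
//           (see xts_macros_arm64.s)
//   POLY    reduction constant for the tweak multiplication
//   ZERO    all-zero helper register
//   TW      running XTS tweak, written back through twPtr on return
//   K0-K3   scratch registers for the transpose and SM4 round macros
//   NIBBLE_MASK..R08_MASK  constants for the AES-based SM4 S-box, filled in
//           by LOAD_SM4_AESNI_CONSTS (see aesni_macros_arm64.s)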
#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define T0 V8
#define T1 V9
#define T2 V10
#define T3 V11
#define T4 V12
#define T5 V13
#define T6 V14
#define T7 V15

#define POLY V16
#define ZERO V17
#define TW V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22

#define NIBBLE_MASK V23
#define INVERSE_SHIFT_ROWS V24
#define M1L V25
#define M1H V26
#define M2L V27
#define M2H V28
#define R08_MASK V29

#include "aesni_macros_arm64.s"
#include "xts_macros_arm64.s"

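// load8blocks reads eight 16-byte blocks, XORs each with its tweak (T0-T7),
// byte-swaps the 32-bit words to big-endian, and transposes each group of
// four blocks into the word-sliced layout expected by SM4_8BLOCKS_ROUND.
// store8blocks undoes the transpose and byte swap, applies the same tweaks
// again, and writes the eight blocks to dstPtr.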
#define load8blocks \
	VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	\
	VLD1.P 64(srcPtr), [B4.S4, B5.S4, B6.S4, B7.S4]; \
	VEOR T4.B16, B4.B16, B4.B16; \
	VEOR T5.B16, B5.B16, B5.B16; \
	VEOR T6.B16, B6.B16, B6.B16; \
	VEOR T7.B16, B7.B16, B7.B16; \
	\
	VREV32 B0.B16, B0.B16; \
	VREV32 B1.B16, B1.B16; \
	VREV32 B2.B16, B2.B16; \
	VREV32 B3.B16, B3.B16; \
	VREV32 B4.B16, B4.B16; \
	VREV32 B5.B16, B5.B16; \
	VREV32 B6.B16, B6.B16; \
	VREV32 B7.B16, B7.B16; \
	\
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3); \
	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)

#define store8blocks \
	VREV32 B0.B16, B0.B16; \
	VREV32 B1.B16, B1.B16; \
	VREV32 B2.B16, B2.B16; \
	VREV32 B3.B16, B3.B16; \
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3); \
	VREV32 B4.B16, B4.B16; \
	VREV32 B5.B16, B5.B16; \
	VREV32 B6.B16, B6.B16; \
	VREV32 B7.B16, B7.B16; \
	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3); \
	\
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	VEOR T4.B16, B4.B16, B4.B16; \
	VEOR T5.B16, B5.B16, B5.B16; \
	VEOR T6.B16, B6.B16, B6.B16; \
	VEOR T7.B16, B7.B16, B7.B16; \
	\
	VST1.P [B0.S4, B1.S4, B2.S4, B3.S4], 64(dstPtr); \
	VST1.P [B4.S4, B5.S4, B6.S4, B7.S4], 64(dstPtr)

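// load4blocks/store4blocks are the four-block counterparts of the macros
// above, feeding the SM4_ROUND path with tweaks T0-T3.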
#define load4blocks \
	VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	\
	VREV32 B0.B16, B0.B16; \
	VREV32 B1.B16, B1.B16; \
	VREV32 B2.B16, B2.B16; \
	VREV32 B3.B16, B3.B16; \
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

#define store4blocks \
	VREV32 B0.B16, B0.B16; \
	VREV32 B1.B16, B1.B16; \
	VREV32 B2.B16, B2.B16; \
	VREV32 B3.B16, B3.B16; \
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3); \
	\
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	\
	VST1.P [B0.S4, B1.S4, B2.S4, B3.S4], 64(dstPtr)

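// loadOneBlock reads a single block, XORs it with the running tweak TW,
// byte-swaps it, and scatters its four words into lane 0 of B0-B3 so the
// four-lane SM4_ROUND macros can be reused with one active lane.
// storeOneBlock gathers the output words in reverse order into B3, swaps the
// bytes back, XORs with TW, and stores the block, advancing dstPtr.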
#define loadOneBlock \
	VLD1.P 16(srcPtr), [B0.S4]; \
	VEOR TW.B16, B0.B16, B0.B16; \
	\
	VREV32 B0.B16, B0.B16; \
	VMOV B0.S[1], B1.S[0]; \
	VMOV B0.S[2], B2.S[0]; \
	VMOV B0.S[3], B3.S[0]

#define storeOneBlock \
	VMOV B2.S[0], B3.S[1]; \
	VMOV B1.S[0], B3.S[2]; \
	VMOV B0.S[0], B3.S[3]; \
	VREV32 B3.B16, B3.B16; \
	\
	VEOR TW.B16, B3.B16, B3.B16; \
	VST1.P [B3.S4], 16(dstPtr)

#define dstPtr R2
#define srcPtr R3
#define rk R0
#define twPtr R1
#define srcPtrLen R4
#define I R5
#define rkSave R6

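// encryptSm4Xts processes the input in decreasing chunk sizes: eight blocks,
// four blocks, then single blocks, and finishes any partial block with
// ciphertext stealing. The final tweak value is stored back through twPtr.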
// func encryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·encryptSm4Xts(SB),0,$128-64
	LOAD_SM4_AESNI_CONSTS()
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR	POLY.B16, POLY.B16, POLY.B16
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

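	// Standard XTS tweak arithmetic in GF(2^128) with x^128 + x^7 + x^2 + x + 1:
	// mul2Inline and the prepare*Tweaks macros reduce with the constant 0x87
	// kept in POLY.D[0].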
	MOVD	$0x87, I
	VMOV	I, POLY.D[0]

	MOVD rk, rkSave
	VLD1 (twPtr), [TW.B16]

xtsSm4EncOctets:
	CMP	$128, srcPtrLen
	BLT	xtsSm4EncNibbles
	SUB	$128, srcPtrLen

	prepare8Tweaks
	load8blocks
	MOVD rkSave, rk
	EOR R13, R13

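// 8 loop iterations x 4 round macros per iteration = the full 32 SM4 rounds.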
encOctetsEnc8Blocks:
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

		ADD $1, R13
		CMP $8, R13
		BNE encOctetsEnc8Blocks

	store8blocks
	B	xtsSm4EncOctets

xtsSm4EncNibbles:
	CMP	$64, srcPtrLen
	BLT	xtsSm4EncSingles
	SUB	$64, srcPtrLen

	prepare4Tweaks
	load4blocks
	MOVD rkSave, rk
	EOR R13, R13

encNibblesEnc4Blocks:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

		ADD $1, R13
		CMP $8, R13
		BNE encNibblesEnc4Blocks

	store4blocks

xtsSm4EncSingles:
	CMP	$16, srcPtrLen
	BLT	xtsSm4EncTail
	SUB	$16, srcPtrLen

	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

encSinglesEnc4Blocks:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE encSinglesEnc4Blocks

	storeOneBlock
	mul2Inline
	B	xtsSm4EncSingles

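// Ciphertext stealing for a 1..15 byte tail: copy the previous ciphertext
// block (dstPtr-16) to the stack, overwrite its first srcPtrLen bytes with the
// remaining plaintext while emitting the stolen ciphertext bytes as the short
// final block, then re-encrypt the stack buffer and store it at dstPtr-16.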
xtsSm4EncTail:
	CBZ	srcPtrLen, xtsSm4EncDone
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8
	VLD1 (R7), [B0.B16]
	VST1 [B0.B16], (R8)

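	// Copy srcPtrLen (1..15) bytes by testing the 8/4/2/1 bits of the length:
	// plaintext bytes go into the stack buffer, the displaced ciphertext bytes
	// go to dstPtr.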
	TBZ	$3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

less_than8:
	TBZ	$2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

less_than4:
	TBZ	$1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

less_than2:
	TBZ	$0, srcPtrLen, xtsSm4EncTailEnc
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

xtsSm4EncTailEnc:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	VMOV B0.S[1], B1.S[0]
	VMOV B0.S[2], B2.S[0]
	VMOV B0.S[3], B3.S[0]

	MOVD rkSave, rk
	EOR R13, R13

tailEncLoop:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE tailEncLoop

	VMOV B2.S[0], B3.S[1]
	VMOV B1.S[0], B3.S[2]
	VMOV B0.S[0], B3.S[3]
	VREV32 B3.B16, B3.B16

	VEOR TW.B16, B3.B16, B3.B16
	VST1 [B3.B16], (R9)

xtsSm4EncDone:
	VST1 [TW.B16], (twPtr)
	RET

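// encryptSm4XtsGB follows the same structure as encryptSm4Xts; only the tweak
// arithmetic differs (prepareGB*Tweaks / mul2GBInline and the reduction
// constant loaded below).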
// func encryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·encryptSm4XtsGB(SB),0,$128-64
	LOAD_SM4_AESNI_CONSTS()
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR	POLY.B16, POLY.B16, POLY.B16
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

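	// GB/T 17964 XTS variant: mul2GBInline multiplies the tweak with the bit
	// ordering reversed relative to IEEE XTS, so the reduction constant 0xE1
	// is placed in the top byte of POLY (POLY.D[1]) rather than 0x87 in D[0].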
	MOVD	$0xE1, I
	LSL	$56, I
	VMOV	I, POLY.D[1]

	MOVD rk, rkSave
	VLD1 (twPtr), [TW.B16]

xtsSm4EncOctets:
	CMP	$128, srcPtrLen
	BLT	xtsSm4EncNibbles
	SUB	$128, srcPtrLen

	prepareGB8Tweaks
	load8blocks
	MOVD rkSave, rk
	EOR R13, R13

encOctetsEnc8Blocks:
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

		ADD $1, R13
		CMP $8, R13
		BNE encOctetsEnc8Blocks

	store8blocks
	B	xtsSm4EncOctets

xtsSm4EncNibbles:
	CMP	$64, srcPtrLen
	BLT	xtsSm4EncSingles
	SUB	$64, srcPtrLen

	prepareGB4Tweaks
	load4blocks
	MOVD rkSave, rk
	EOR R13, R13

encNibblesEnc4Blocks:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

		ADD $1, R13
		CMP $8, R13
		BNE encNibblesEnc4Blocks

	store4blocks

xtsSm4EncSingles:
	CMP	$16, srcPtrLen
	BLT	xtsSm4EncTail
	SUB	$16, srcPtrLen

	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

encSinglesEnc4Blocks:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE encSinglesEnc4Blocks

	storeOneBlock
	mul2GBInline
	B	xtsSm4EncSingles

xtsSm4EncTail:
	CBZ	srcPtrLen, xtsSm4EncDone
	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8
	VLD1 (R7), [B0.B16]
	VST1 [B0.B16], (R8)

	TBZ	$3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

less_than8:
	TBZ	$2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

less_than4:
	TBZ	$1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

less_than2:
	TBZ	$0, srcPtrLen, xtsSm4EncTailEnc
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

xtsSm4EncTailEnc:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	VMOV B0.S[1], B1.S[0]
	VMOV B0.S[2], B2.S[0]
	VMOV B0.S[3], B3.S[0]

	MOVD rkSave, rk
	EOR R13, R13

tailEncLoop:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE tailEncLoop

	VMOV B2.S[0], B3.S[1]
	VMOV B1.S[0], B3.S[2]
	VMOV B0.S[0], B3.S[3]
	VREV32 B3.B16, B3.B16

	VEOR TW.B16, B3.B16, B3.B16
	VST1 [B3.B16], (R9)

xtsSm4EncDone:
	VST1 [TW.B16], (twPtr)
	RET

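// decryptSm4Xts mirrors the encrypt path, but the single-block loop stops
// while at least 32 bytes remain so that, when a partial tail follows, the
// last full ciphertext block is kept back for ciphertext stealing.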
// func decryptSm4Xts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·decryptSm4Xts(SB),0,$128-64
	LOAD_SM4_AESNI_CONSTS()
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR	POLY.B16, POLY.B16, POLY.B16
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	$0x87, I
	VMOV	I, POLY.D[0]

	MOVD rk, rkSave
	VLD1 (twPtr), [TW.B16]

xtsSm4DecOctets:
	CMP	$128, srcPtrLen
	BLT	xtsSm4DecNibbles
	SUB	$128, srcPtrLen

	prepare8Tweaks
	load8blocks
	MOVD rkSave, rk
	EOR R13, R13

decOctetsDec8Blocks:
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

		ADD $1, R13
		CMP $8, R13
		BNE decOctetsDec8Blocks

	store8blocks
	B	xtsSm4DecOctets

xtsSm4DecNibbles:
	CMP	$64, srcPtrLen
	BLT	xtsSm4DecSingles
	SUB	$64, srcPtrLen

	prepare4Tweaks
	load4blocks
	MOVD rkSave, rk
	EOR R13, R13

decNibblesDec4Blocks:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

		ADD $1, R13
		CMP $8, R13
		BNE decNibblesDec4Blocks

	store4blocks

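// Decrypt single blocks only while 32 or more bytes remain; 16..31 bytes are
// left for the tail handling below.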
xtsSm4DecSingles:
	CMP	$32, srcPtrLen
	BLT	xtsSm4DecTail
	SUB	$16, srcPtrLen

	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

decSinglesDec4Blocks:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE decSinglesDec4Blocks

	storeOneBlock
	mul2Inline

	B	xtsSm4DecSingles

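// Tail handling: exactly 16 bytes means a plain final block (no stealing).
// Otherwise decrypt the last full ciphertext block with the *next* tweak
// (the current one is saved in B4 and advanced with mul2Inline), swap bytes
// with the partial ciphertext via the stack buffer, and decrypt the
// reassembled block with the original tweak restored from B4.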
xtsSm4DecTail:
	CBZ	srcPtrLen, xtsSm4DecDone

	CMP	$16, srcPtrLen
	BEQ xtsSm4DecLastBlock

	VMOV TW.B16, B4.B16
	mul2Inline
	loadOneBlock
	MOVD rkSave, rk
	EOR R13, R13

decLastCompleteBlockLoop:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE decLastCompleteBlockLoop
	storeOneBlock
	VMOV B4.B16, TW.B16
	VST1 [B3.B16], (RSP)

	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8

	TBZ	$3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

less_than8:
	TBZ	$2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

less_than4:
	TBZ	$1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

less_than2:
	TBZ	$0, srcPtrLen, xtsSm4DecTailDec
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

xtsSm4DecTailDec:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	VMOV B0.S[1], B1.S[0]
	VMOV B0.S[2], B2.S[0]
	VMOV B0.S[3], B3.S[0]

	MOVD rkSave, rk
	EOR R13, R13

tailDecLoop:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE tailDecLoop

	VMOV B2.S[0], B3.S[1]
	VMOV B1.S[0], B3.S[2]
	VMOV B0.S[0], B3.S[3]
	VREV32 B3.B16, B3.B16

	VEOR TW.B16, B3.B16, B3.B16
	VST1 [B3.B16], (R9)

	B xtsSm4DecDone

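// Exactly one full block left: plain XTS decryption, no stealing.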
xtsSm4DecLastBlock:
	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

decLastBlockLoop:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE decLastBlockLoop

	storeOneBlock
	mul2Inline

xtsSm4DecDone:
	VST1 [TW.B16], (twPtr)
	RET

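// decryptSm4XtsGB is decryptSm4Xts with the GB/T 17964 tweak arithmetic
// (prepareGB*Tweaks / mul2GBInline and the reflected reduction constant).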
// func decryptSm4XtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
TEXT ·decryptSm4XtsGB(SB),0,$128-64
	LOAD_SM4_AESNI_CONSTS()
	MOVD xk+0(FP), rk
	MOVD tweak+8(FP), twPtr
	MOVD dst+16(FP), dstPtr
	MOVD src+40(FP), srcPtr
	MOVD src_len+48(FP), srcPtrLen

	VEOR	POLY.B16, POLY.B16, POLY.B16
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	$0xE1, I
	LSL	$56, I
	VMOV	I, POLY.D[1]

	MOVD rk, rkSave
	VLD1 (twPtr), [TW.B16]

xtsSm4DecOctets:
	CMP	$128, srcPtrLen
	BLT	xtsSm4DecNibbles
	SUB	$128, srcPtrLen

	prepareGB8Tweaks
	load8blocks
	MOVD rkSave, rk
	EOR R13, R13

decOctetsDec8Blocks:
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
			SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

		ADD $1, R13
		CMP $8, R13
		BNE decOctetsDec8Blocks

	store8blocks
	B	xtsSm4DecOctets

xtsSm4DecNibbles:
	CMP	$64, srcPtrLen
	BLT	xtsSm4DecSingles
	SUB	$64, srcPtrLen

	prepareGB4Tweaks
	load4blocks
	MOVD rkSave, rk
	EOR R13, R13

decNibblesDec4Blocks:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

		ADD $1, R13
		CMP $8, R13
		BNE decNibblesDec4Blocks

	store4blocks

xtsSm4DecSingles:
	CMP	$32, srcPtrLen
	BLT	xtsSm4DecTail
	SUB	$16, srcPtrLen

	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

decSinglesDec4Blocks:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE decSinglesDec4Blocks

	storeOneBlock
	mul2GBInline

	B	xtsSm4DecSingles

xtsSm4DecTail:
	CBZ	srcPtrLen, xtsSm4DecDone

	CMP	$16, srcPtrLen
	BEQ xtsSm4DecLastBlock

	VMOV TW.B16, B4.B16
	mul2GBInline
	loadOneBlock
	MOVD rkSave, rk
	EOR R13, R13

decLastCompleteBlockLoop:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE decLastCompleteBlockLoop
	storeOneBlock
	VMOV B4.B16, TW.B16
	VST1 [B3.B16], (RSP)

	SUB $16, dstPtr, R7
	MOVD R7, R9
	MOVD RSP, R8

	TBZ	$3, srcPtrLen, less_than8
	MOVD.P 8(srcPtr), R11
	MOVD.P R11, 8(R8)
	MOVD.P 8(R7), R12
	MOVD.P R12, 8(dstPtr)

less_than8:
	TBZ	$2, srcPtrLen, less_than4
	MOVWU.P 4(srcPtr), R11
	MOVWU.P R11, 4(R8)
	MOVWU.P 4(R7), R12
	MOVWU.P R12, 4(dstPtr)

less_than4:
	TBZ	$1, srcPtrLen, less_than2
	MOVHU.P 2(srcPtr), R11
	MOVHU.P R11, 2(R8)
	MOVHU.P 2(R7), R12
	MOVHU.P R12, 2(dstPtr)

less_than2:
	TBZ	$0, srcPtrLen, xtsSm4DecTailDec
	MOVBU (srcPtr), R11
	MOVBU R11, (R8)
	MOVBU (R7), R12
	MOVBU R12, (dstPtr)

xtsSm4DecTailDec:
	VLD1 (RSP), [B0.B16]
	VEOR TW.B16, B0.B16, B0.B16
	VREV32 B0.B16, B0.B16
	VMOV B0.S[1], B1.S[0]
	VMOV B0.S[2], B2.S[0]
	VMOV B0.S[3], B3.S[0]

	MOVD rkSave, rk
	EOR R13, R13

tailDecLoop:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE tailDecLoop

	VMOV B2.S[0], B3.S[1]
	VMOV B1.S[0], B3.S[2]
	VMOV B0.S[0], B3.S[3]
	VREV32 B3.B16, B3.B16

	VEOR TW.B16, B3.B16, B3.B16
	VST1 [B3.B16], (R9)

	B xtsSm4DecDone

xtsSm4DecLastBlock:
	loadOneBlock

	MOVD rkSave, rk
	EOR R13, R13

decLastBlockLoop:
		SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
		SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
		SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
		SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)
		ADD $1, R13
		CMP $8, R13
		BNE decLastBlockLoop

	storeOneBlock
	mul2GBInline

xtsSm4DecDone:
	VST1 [TW.B16], (twPtr)
	RET