github.com/emmansun/gmsm@v0.29.1/sm4/xts_sm4ni_arm64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
// Vector register aliases.
// B0-B7 hold the (up to eight) data blocks currently being processed.
     5  #define B0 V0
     6  #define B1 V1
     7  #define B2 V2
     8  #define B3 V3
     9  #define B4 V4
    10  #define B5 V5
    11  #define B6 V6
    12  #define B7 V7
    13  
// POLY holds the GF(2^128) reduction constant used by the tweak-doubling
// macros (mul2Inline / mul2GBInline); ZERO is an all-zero vector; TW is
// the running XTS tweak.
    14  #define POLY V8
    15  #define ZERO V9
    16  #define TW V10
    17  
// T0-T7 receive the eight per-block tweak values produced by
// prepare8Tweaks / prepareGB8Tweaks (defined in xts_macros_arm64.s).
    18  #define T0 V11
    19  #define T1 V12
    20  #define T2 V13
    21  #define T3 V14
    22  #define T4 V15
    23  #define T5 V16
    24  #define T6 V17
    25  #define T7 V18
    26  
// RK0-RK7 hold the 32 SM4 round keys, 4 per vector register.
    27  #define RK0 V19
    28  #define RK1 V20
    29  #define RK2 V21
    30  #define RK3 V22
    31  #define RK4 V23
    32  #define RK5 V24
    33  #define RK6 V25
    34  #define RK7 V26
    35  
// K0/K1: scratch registers reserved for the included macros; their use is
// not visible in this file — see sm4ni_macros_arm64.s / xts_macros_arm64.s.
    36  #define K0 V27
    37  #define K1 V28
    39  #include "sm4ni_macros_arm64.s"
    40  #include "xts_macros_arm64.s"
    41  
// load8blocks: load eight consecutive 16-byte blocks from srcPtr (advancing
// it by 128), XOR block i with its tweak Ti (the first half of the XTS
// whitening), then VREV32-swap each 32-bit word of every block into the
// byte order expected by the SM4E-based sm4eEnc* macros. Clobbers B0-B7.
    42  #define load8blocks \
    43  	VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \
    44  	VEOR T0.B16, B0.B16, B0.B16; \
    45  	VEOR T1.B16, B1.B16, B1.B16; \
    46  	VEOR T2.B16, B2.B16, B2.B16; \
    47  	VEOR T3.B16, B3.B16, B3.B16; \
    48  	\
    49  	VLD1.P 64(srcPtr), [B4.S4, B5.S4, B6.S4, B7.S4]; \
    50  	VEOR T4.B16, B4.B16, B4.B16; \
    51  	VEOR T5.B16, B5.B16, B5.B16; \
    52  	VEOR T6.B16, B6.B16, B6.B16; \
    53  	VEOR T7.B16, B7.B16, B7.B16; \
    54      \
    55  	VREV32 B0.B16, B0.B16; \
    56  	VREV32 B1.B16, B1.B16; \
    57  	VREV32 B2.B16, B2.B16; \
    58  	VREV32 B3.B16, B3.B16; \
    59  	VREV32 B4.B16, B4.B16; \
    60  	VREV32 B5.B16, B5.B16; \
    61  	VREV32 B6.B16, B6.B16; \
    62  	VREV32 B7.B16, B7.B16
    63  
// store8blocks: second half of the XTS whitening — XOR the eight processed
// blocks with their tweaks T0-T7 again and store them to dstPtr (advancing
// it by 128). Note there is no VREV32 here: sm4eEnc8blocks is presumably
// expected to restore byte order itself — confirm in sm4ni_macros_arm64.s.
    64  #define store8blocks \
    65  	VEOR T0.B16, B0.B16, B0.B16; \
    66  	VEOR T1.B16, B1.B16, B1.B16; \
    67  	VEOR T2.B16, B2.B16, B2.B16; \
    68  	VEOR T3.B16, B3.B16, B3.B16; \
    69  	VEOR T4.B16, B4.B16, B4.B16; \
    70  	VEOR T5.B16, B5.B16, B5.B16; \
    71  	VEOR T6.B16, B6.B16, B6.B16; \
    72  	VEOR T7.B16, B7.B16, B7.B16; \
    73  	\
    74  	VST1.P [B0.S4, B1.S4, B2.S4, B3.S4], 64(dstPtr); \
    75  	VST1.P [B4.S4, B5.S4, B6.S4, B7.S4], 64(dstPtr)
    76  
// General-purpose register roles shared by the four functions below:
//   rk (R0)        - pointer to the 32 SM4 round keys (8 x 16 bytes)
//   twPtr (R1)     - pointer to the 16-byte tweak; read at entry, the
//                    advanced tweak is written back through it on return
//   dstPtr (R2)    - output cursor
//   srcPtr (R3)    - input cursor
//   srcPtrLen (R4) - bytes remaining to process
//   I (R5)         - scratch for building immediates
// R7-R9 and R11/R12 are used unaliased as temporaries in the tail code.
    77  #define dstPtr R2
    78  #define srcPtr R3
    79  #define rk R0
    80  #define twPtr R1
    81  #define srcPtrLen R4
    82  #define I R5
    83  
    84  // func encryptSm4NiXts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// XTS-SM4 encryption with the standard tweak polynomial (POLY = 0x87, as in
// IEEE P1619) using the ARMv8 SM4 crypto extension. A partial final block
// (src_len%16 != 0) is handled with ciphertext stealing. The 128-byte stack
// frame is used as scratch to assemble the stolen block.
    85  TEXT ·encryptSm4NiXts(SB),0,$128-64
// dst and src are slices: data pointers live at +16/+40(FP), the source
// length at +48(FP); dst is assumed at least as long as src (caller checked).
    86  	MOVD xk+0(FP), rk
    87  	MOVD tweak+8(FP), twPtr
    88  	MOVD dst+16(FP), dstPtr
    89  	MOVD src+40(FP), srcPtr
    90  	MOVD src_len+48(FP), srcPtrLen
    91  
    92  	VEOR	POLY.B16, POLY.B16, POLY.B16
    93  	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
    94  
// POLY.D[0] = 0x87: the GF(2^128) reduction constant consumed by
// mul2Inline / prepare8Tweaks when doubling the tweak.
    95  	MOVD	$0x87, I
    96  	VMOV	I, POLY.D[0]
    97  
    98  	// For SM4 round keys are stored in: RK0 .. RK7
    99  	VLD1.P	64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
   100  	VLD1.P	64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]
   101  
   102  	VLD1 (twPtr), [TW.B16]
   103  
// Fast path: eight blocks per iteration. prepare8Tweaks fills T0-T7 and
// advances TW past them.
   104  xtsSm4EncOctets:
   105  	CMP	$128, srcPtrLen
   106  	BLT	xtsSm4EncSingles
   107  	SUB	$128, srcPtrLen
   108  	prepare8Tweaks
   109  	load8blocks
   110  	sm4eEnc8blocks()
   111  	store8blocks
   112  
   113  	B	xtsSm4EncOctets
   114  
// One block at a time while at least one full block remains.
// Per block: whiten with TW, encrypt, whiten again, then double the tweak.
   115  xtsSm4EncSingles:
   116  	CMP	$16, srcPtrLen
   117  	BLT	xtsSm4EncTail
   118  	SUB	$16, srcPtrLen
   119  
   120  	VLD1.P 16(srcPtr), [B0.S4]
   121  	VEOR TW.B16, B0.B16, B0.B16
   122  	VREV32 B0.B16, B0.B16
   123  	sm4eEnc1block()
   124  	VEOR TW.B16, B0.B16, B0.B16
   125  	VST1.P [B0.S4], 16(dstPtr)
   126  
   127  	mul2Inline
   128  	B xtsSm4EncSingles
   129  
// Ciphertext stealing for the final srcPtrLen (1..15) bytes.
// R7/R9 = address of the last full ciphertext block already written
// (dstPtr-16); copy that block to the stack scratch (R8 = RSP), then the
// TBZ ladder below moves srcPtrLen plaintext bytes over the scratch block
// and the first srcPtrLen stolen ciphertext bytes to the output tail.
   130  xtsSm4EncTail:
   131  	CBZ	srcPtrLen, xtsSm4EncDone
   132  	SUB $16, dstPtr, R7
   133  	MOVD R7, R9
   134  	MOVD RSP, R8
   135  	VLD1 (R7), [B0.B16]
   136  	VST1 [B0.B16], (R8)
   137  
// Copy 8/4/2/1 bytes according to the bits of srcPtrLen (< 16).
   138  	TBZ	$3, srcPtrLen, less_than8
   139  	MOVD.P 8(srcPtr), R11
   140  	MOVD.P R11, 8(R8)
   141  	MOVD.P 8(R7), R12
   142  	MOVD.P R12, 8(dstPtr)
   143  
   144  less_than8:
   145  	TBZ	$2, srcPtrLen, less_than4
   146  	MOVWU.P 4(srcPtr), R11
   147  	MOVWU.P R11, 4(R8)
   148  	MOVWU.P 4(R7), R12
   149  	MOVWU.P R12, 4(dstPtr)
   150  
   151  less_than4:
   152  	TBZ	$1, srcPtrLen, less_than2
   153  	MOVHU.P 2(srcPtr), R11
   154  	MOVHU.P R11, 2(R8)
   155  	MOVHU.P 2(R7), R12
   156  	MOVHU.P R12, 2(dstPtr)
   157  
   158  less_than2:
   159  	TBZ	$0, srcPtrLen, xtsSm4EncTailEnc
   160  	MOVBU (srcPtr), R11
   161  	MOVBU R11, (R8)
   162  	MOVBU (R7), R12
   163  	MOVBU R12, (dstPtr)
   164  
// Encrypt the assembled (tail || stolen suffix) block with the current
// tweak and store it back over the previous block position (R9).
   165  xtsSm4EncTailEnc:
   166  	VLD1 (RSP), [B0.B16]
   167  	VEOR TW.B16, B0.B16, B0.B16
   168  	VREV32 B0.B16, B0.B16
   169  	sm4eEnc1block()
   170  	VEOR TW.B16, B0.B16, B0.B16
   171  	VST1 [B0.B16], (R9)
   172  
// Persist the advanced tweak for a possible subsequent call.
   173  xtsSm4EncDone:
   174  	VST1 [TW.B16], (twPtr)
   175  	RET
   176  
   177  // func encryptSm4NiXtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// XTS-SM4 encryption, GB variant: identical structure to encryptSm4NiXts
// but the tweak is doubled with mul2GBInline / prepareGB8Tweaks using
// POLY.D[1] = 0xE1<<56 — presumably the bit-reversed convention of the
// Chinese GB/T standard; confirm against xts_macros_arm64.s.
   178  TEXT ·encryptSm4NiXtsGB(SB),0,$128-64
// dst and src are slices: data pointers at +16/+40(FP), source length at +48(FP).
   179  	MOVD xk+0(FP), rk
   180  	MOVD tweak+8(FP), twPtr
   181  	MOVD dst+16(FP), dstPtr
   182  	MOVD src+40(FP), srcPtr
   183  	MOVD src_len+48(FP), srcPtrLen
   184  
   185  	VEOR	POLY.B16, POLY.B16, POLY.B16
   186  	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   187  
// Reduction constant for the GB tweak doubling (top lane, mirrored bits).
   188  	MOVD	$0xE1, I
   189  	LSL	$56, I
   190  	VMOV	I, POLY.D[1]
   191  
   192  	// For SM4 round keys are stored in: RK0 .. RK7
   193  	VLD1.P	64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
   194  	VLD1.P	64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]
   195  
   196  	VLD1 (twPtr), [TW.B16]
   197  
// Fast path: eight blocks per iteration with precomputed tweaks T0-T7.
   198  xtsSm4EncOctets:
   199  	CMP	$128, srcPtrLen
   200  	BLT	xtsSm4EncSingles
   201  	SUB	$128, srcPtrLen
   202  	prepareGB8Tweaks
   203  	load8blocks
   204  	sm4eEnc8blocks()
   205  	store8blocks
   206  
   207  	B	xtsSm4EncOctets
   208  
// One block at a time while at least one full block remains.
   209  xtsSm4EncSingles:
   210  	CMP	$16, srcPtrLen
   211  	BLT	xtsSm4EncTail
   212  	SUB	$16, srcPtrLen
   213  
   214  	VLD1.P 16(srcPtr), [B0.S4]
   215  	VEOR TW.B16, B0.B16, B0.B16
   216  	VREV32 B0.B16, B0.B16
   217  	sm4eEnc1block()
   218  	VEOR TW.B16, B0.B16, B0.B16
   219  	VST1.P [B0.S4], 16(dstPtr)
   220  
   221  	mul2GBInline
   222  	B xtsSm4EncSingles
   223  
// Ciphertext stealing for the final srcPtrLen (1..15) bytes: steal from the
// last full ciphertext block at dstPtr-16 (R7/R9), assembling the final
// block in the stack scratch (R8 = RSP) via the TBZ ladder below.
   224  xtsSm4EncTail:
   225  	CBZ	srcPtrLen, xtsSm4EncDone
   226  	SUB $16, dstPtr, R7
   227  	MOVD R7, R9
   228  	MOVD RSP, R8
   229  	VLD1 (R7), [B0.B16]
   230  	VST1 [B0.B16], (R8)
   231  
// Copy 8/4/2/1 bytes according to the bits of srcPtrLen (< 16).
   232  	TBZ	$3, srcPtrLen, less_than8
   233  	MOVD.P 8(srcPtr), R11
   234  	MOVD.P R11, 8(R8)
   235  	MOVD.P 8(R7), R12
   236  	MOVD.P R12, 8(dstPtr)
   237  
   238  less_than8:
   239  	TBZ	$2, srcPtrLen, less_than4
   240  	MOVWU.P 4(srcPtr), R11
   241  	MOVWU.P R11, 4(R8)
   242  	MOVWU.P 4(R7), R12
   243  	MOVWU.P R12, 4(dstPtr)
   244  
   245  less_than4:
   246  	TBZ	$1, srcPtrLen, less_than2
   247  	MOVHU.P 2(srcPtr), R11
   248  	MOVHU.P R11, 2(R8)
   249  	MOVHU.P 2(R7), R12
   250  	MOVHU.P R12, 2(dstPtr)
   251  
   252  less_than2:
   253  	TBZ	$0, srcPtrLen, xtsSm4EncTailEnc
   254  	MOVBU (srcPtr), R11
   255  	MOVBU R11, (R8)
   256  	MOVBU (R7), R12
   257  	MOVBU R12, (dstPtr)
   258  
// Encrypt the assembled block with the current tweak and overwrite the
// previous ciphertext block position (R9).
   259  xtsSm4EncTailEnc:
   260  	VLD1 (RSP), [B0.B16]
   261  	VEOR TW.B16, B0.B16, B0.B16
   262  	VREV32 B0.B16, B0.B16
   263  	sm4eEnc1block()
   264  	VEOR TW.B16, B0.B16, B0.B16
   265  	VST1 [B0.B16], (R9)
   266  
// Persist the advanced tweak for a possible subsequent call.
   267  xtsSm4EncDone:
   268  	VST1 [TW.B16], (twPtr)
   269      RET
   270  
   271  // func decryptSm4NiXts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// XTS-SM4 decryption with the standard tweak polynomial (POLY = 0x87, as in
// IEEE P1619). SM4 decryption reuses the encryption data path (sm4eEnc*);
// the caller is expected to pass the round keys in reversed order via xk —
// TODO confirm against the Go wrapper. A partial final block is handled with
// ciphertext stealing, which on the decrypt side requires the LAST FULL
// block to be processed with the tweak that follows the current one — hence
// the loops below must never consume the final full block when a partial
// tail exists.
   272  TEXT ·decryptSm4NiXts(SB),0,$128-64
// dst and src are slices: data pointers at +16/+40(FP), source length at +48(FP).
   273  	MOVD xk+0(FP), rk
   274  	MOVD tweak+8(FP), twPtr
   275  	MOVD dst+16(FP), dstPtr
   276  	MOVD src+40(FP), srcPtr
   277  	MOVD src_len+48(FP), srcPtrLen
   278  
   279  	VEOR	POLY.B16, POLY.B16, POLY.B16
   280  	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   281  
// POLY.D[0] = 0x87: reduction constant for mul2Inline / prepare8Tweaks.
   282  	MOVD	$0x87, I
   283  	VMOV	I, POLY.D[0]
   284  
   285  	// For SM4 round keys are stored in: RK0 .. RK7
   286  	VLD1.P	64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
   287  	VLD1.P	64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]
   288  
   289  	VLD1 (twPtr), [TW.B16]
   290  
   291  xtsSm4DecOctets:
	// BUGFIX: guard with $144 (128+16), not $128. The batch may only run
	// when at least one full block remains after it: if src_len%16 != 0,
	// the ciphertext-stealing tail needs one full block left over (it is
	// decrypted with the NEXT tweak in xtsSm4DecTail). With a $128 guard,
	// a length such as 133 left only the 5 tail bytes after this loop, so
	// the tail path read 16 bytes past the source and used the wrong tweak
	// order. Exact multiples of 128 now fall through to the (correct)
	// singles loop, trading a little speed for correctness.
   292  	CMP	$144, srcPtrLen
   293  	BLT	xtsSm4DecSingles
   294  	SUB	$128, srcPtrLen
   295  
   296  	prepare8Tweaks
   297  	load8blocks
   298  	sm4eEnc8blocks()
   299  	store8blocks
   300  
   301  	B xtsSm4DecOctets
   302  
// One block at a time; the $32 guard keeps one full block in reserve for
// the stealing tail (srcPtrLen reaches the tail as 0, 16, or 16+r, r<16).
   303  xtsSm4DecSingles:
   304  	CMP	$32, srcPtrLen
   305  	BLT	xtsSm4DecTail
   306  	SUB	$16, srcPtrLen
   307  
   308  	VLD1.P 16(srcPtr), [B0.S4]
   309  	VEOR TW.B16, B0.B16, B0.B16
   310  	VREV32 B0.B16, B0.B16
   311  	sm4eEnc1block()
   312  	VEOR TW.B16, B0.B16, B0.B16
   313  	VST1.P [B0.S4], 16(dstPtr)
   314  
   315  	mul2Inline
   316  	B xtsSm4DecSingles
   317  
// Tail: srcPtrLen is 0 (done), 16 (plain last block), or 16+r with
// 1 <= r <= 15 (ciphertext stealing over the last two blocks).
   318  xtsSm4DecTail:
   319  	CBZ	srcPtrLen, xtsSm4DecDone
   320  	
   321  	CMP	$16, srcPtrLen
   322  	BEQ xtsSm4DecLastBlock
   323  
	// Stealing: save tweak m-1 in B4, advance TW to tweak m, and decrypt
	// the last full ciphertext block with tweak m, giving PP. PP is written
	// at the second-to-last block position and also copied to the stack
	// scratch; the TBZ ladder then builds C' = (r tail bytes of src) ||
	// PP[r:] in the scratch and emits PP[:r] as the final partial
	// plaintext.
   324  	VMOV TW.B16, B4.B16
   325  	mul2Inline
   326  	VLD1.P 16(srcPtr), [B0.S4]
   327  	VEOR TW.B16, B0.B16, B0.B16
   328  	VREV32 B0.B16, B0.B16
   329  	sm4eEnc1block()
   330  	VEOR TW.B16, B0.B16, B0.B16
   331  	VST1.P [B0.S4], 16(dstPtr)
   332  	VMOV B4.B16, TW.B16
   333  	VST1 [B0.B16], (RSP)
   334  
   335  	SUB $16, dstPtr, R7
   336  	MOVD R7, R9
   337  	MOVD RSP, R8
   338  
	// Copy 8/4/2/1 bytes according to bits 3..0 of srcPtrLen (= 16+r;
	// bit 4 is ignored, so exactly r bytes are moved).
   339  	TBZ	$3, srcPtrLen, less_than8
   340  	MOVD.P 8(srcPtr), R11
   341  	MOVD.P R11, 8(R8)
   342  	MOVD.P 8(R7), R12
   343  	MOVD.P R12, 8(dstPtr)
   344  
   345  less_than8:
   346  	TBZ	$2, srcPtrLen, less_than4
   347  	MOVWU.P 4(srcPtr), R11
   348  	MOVWU.P R11, 4(R8)
   349  	MOVWU.P 4(R7), R12
   350  	MOVWU.P R12, 4(dstPtr)
   351  
   352  less_than4:
   353  	TBZ	$1, srcPtrLen, less_than2
   354  	MOVHU.P 2(srcPtr), R11
   355  	MOVHU.P R11, 2(R8)
   356  	MOVHU.P 2(R7), R12
   357  	MOVHU.P R12, 2(dstPtr)
   358  
   359  less_than2:
   360  	TBZ	$0, srcPtrLen, xtsSm4DecTailDec
   361  	MOVBU (srcPtr), R11
   362  	MOVBU R11, (R8)
   363  	MOVBU (R7), R12
   364  	MOVBU R12, (dstPtr)
   365  
	// Decrypt the assembled C' with tweak m-1 (restored in TW) and store
	// the second-to-last plaintext block at R9 = dstPtr-16.
   366  xtsSm4DecTailDec:
   367  	VLD1 (RSP), [B0.B16]
   368  	VEOR TW.B16, B0.B16, B0.B16
   369  	VREV32 B0.B16, B0.B16
   370  	sm4eEnc1block()
   371  	VEOR TW.B16, B0.B16, B0.B16
   372  	VST1 [B0.B16], (R9)
   373  
   374  	B xtsSm4DecDone
   375  
// No partial tail: decrypt the single remaining block normally.
   376  xtsSm4DecLastBlock:
   377  	VLD1.P 16(srcPtr), [B0.S4]
   378  	VEOR TW.B16, B0.B16, B0.B16
   379  	VREV32 B0.B16, B0.B16
   380  	sm4eEnc1block()
   381  	VEOR TW.B16, B0.B16, B0.B16
   382  	VST1.P [B0.S4], 16(dstPtr)
   383  	mul2Inline
   384  
// Persist the advanced tweak for a possible subsequent call.
   385  xtsSm4DecDone:
   386  	VST1 [TW.B16], (twPtr)
   387  	RET
   388  
   389  // func decryptSm4NiXtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// XTS-SM4 decryption, GB variant: identical structure to decryptSm4NiXts
// but the tweak is doubled with mul2GBInline / prepareGB8Tweaks using
// POLY.D[1] = 0xE1<<56 — presumably the bit-reversed convention of the
// Chinese GB/T standard; confirm against xts_macros_arm64.s. As on the
// standard path, decrypt-side ciphertext stealing requires the last full
// block to stay in reserve whenever src_len%16 != 0.
   390  TEXT ·decryptSm4NiXtsGB(SB),0,$128-64
// dst and src are slices: data pointers at +16/+40(FP), source length at +48(FP).
   391  	MOVD xk+0(FP), rk
   392  	MOVD tweak+8(FP), twPtr
   393  	MOVD dst+16(FP), dstPtr
   394  	MOVD src+40(FP), srcPtr
   395  	MOVD src_len+48(FP), srcPtrLen
   396  
   397  	VEOR	POLY.B16, POLY.B16, POLY.B16
   398  	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   399  
// Reduction constant for the GB tweak doubling (top lane, mirrored bits).
   400  	MOVD	$0xE1, I
   401  	LSL	$56, I
   402  	VMOV	I, POLY.D[1]
   403  
   404  	// For SM4 round keys are stored in: RK0 .. RK7
   405  	VLD1.P	64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
   406  	VLD1.P	64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]
   407  
   408  	VLD1 (twPtr), [TW.B16]
   409  
   410  xtsSm4DecOctets:
	// BUGFIX: guard with $144 (128+16), not $128, so the 8-block batch
	// never consumes the final full block when a partial tail follows —
	// the stealing path below must decrypt that block with the NEXT tweak.
	// With $128, lengths with len%128 in 1..15 (e.g. 133) left the tail
	// path reading past the source with the wrong tweak order.
   411  	CMP	$144, srcPtrLen
   412  	BLT	xtsSm4DecSingles
   413  	SUB	$128, srcPtrLen
   414  
   415  	prepareGB8Tweaks
   416  	load8blocks
   417  	sm4eEnc8blocks()
   418  	store8blocks
   419  
   420  	B xtsSm4DecOctets
   421  
// One block at a time; the $32 guard keeps one full block in reserve for
// the stealing tail (srcPtrLen reaches the tail as 0, 16, or 16+r, r<16).
   422  xtsSm4DecSingles:
   423  	CMP	$32, srcPtrLen
   424  	BLT	xtsSm4DecTail
   425  	SUB	$16, srcPtrLen
   426  
   427  	VLD1.P 16(srcPtr), [B0.S4]
   428  	VEOR TW.B16, B0.B16, B0.B16
   429  	VREV32 B0.B16, B0.B16
   430  	sm4eEnc1block()
   431  	VEOR TW.B16, B0.B16, B0.B16
   432  	VST1.P [B0.S4], 16(dstPtr)
   433  
   434  	mul2GBInline
   435  	B xtsSm4DecSingles
   436  
// Tail: srcPtrLen is 0 (done), 16 (plain last block), or 16+r with
// 1 <= r <= 15 (ciphertext stealing over the last two blocks).
   437  xtsSm4DecTail:
   438  	CBZ	srcPtrLen, xtsSm4DecDone
   439  	
   440  	CMP	$16, srcPtrLen
   441  	BEQ xtsSm4DecLastBlock
   442  
	// Stealing: save tweak m-1 in B4, advance TW to tweak m, decrypt the
	// last full ciphertext block with tweak m into PP; write PP at the
	// second-to-last block position and copy it to the stack scratch. The
	// TBZ ladder then builds C' = (r tail bytes of src) || PP[r:] in the
	// scratch and emits PP[:r] as the final partial plaintext.
   443  	VMOV TW.B16, B4.B16
   444  	mul2GBInline
   445  	VLD1.P 16(srcPtr), [B0.S4]
   446  	VEOR TW.B16, B0.B16, B0.B16
   447  	VREV32 B0.B16, B0.B16
   448  	sm4eEnc1block()
   449  	VEOR TW.B16, B0.B16, B0.B16
   450  	VST1.P [B0.S4], 16(dstPtr)
   451  	VMOV B4.B16, TW.B16
   452  	VST1 [B0.B16], (RSP)
   453  
   454  	SUB $16, dstPtr, R7
   455  	MOVD R7, R9
   456  	MOVD RSP, R8
   457  
	// Copy 8/4/2/1 bytes according to bits 3..0 of srcPtrLen (= 16+r;
	// bit 4 is ignored, so exactly r bytes are moved).
   458  	TBZ	$3, srcPtrLen, less_than8
   459  	MOVD.P 8(srcPtr), R11
   460  	MOVD.P R11, 8(R8)
   461  	MOVD.P 8(R7), R12
   462  	MOVD.P R12, 8(dstPtr)
   463  
   464  less_than8:
   465  	TBZ	$2, srcPtrLen, less_than4
   466  	MOVWU.P 4(srcPtr), R11
   467  	MOVWU.P R11, 4(R8)
   468  	MOVWU.P 4(R7), R12
   469  	MOVWU.P R12, 4(dstPtr)
   470  
   471  less_than4:
   472  	TBZ	$1, srcPtrLen, less_than2
   473  	MOVHU.P 2(srcPtr), R11
   474  	MOVHU.P R11, 2(R8)
   475  	MOVHU.P 2(R7), R12
   476  	MOVHU.P R12, 2(dstPtr)
   477  
   478  less_than2:
   479  	TBZ	$0, srcPtrLen, xtsSm4DecTailDec
   480  	MOVBU (srcPtr), R11
   481  	MOVBU R11, (R8)
   482  	MOVBU (R7), R12
   483  	MOVBU R12, (dstPtr)
   484  
	// Decrypt the assembled C' with tweak m-1 (restored in TW) and store
	// the second-to-last plaintext block at R9 = dstPtr-16.
   485  xtsSm4DecTailDec:
   486  	VLD1 (RSP), [B0.B16]
   487  	VEOR TW.B16, B0.B16, B0.B16
   488  	VREV32 B0.B16, B0.B16
   489  	sm4eEnc1block()
   490  	VEOR TW.B16, B0.B16, B0.B16
   491  	VST1 [B0.B16], (R9)
   492  
   493  	B xtsSm4DecDone
   494  
// No partial tail: decrypt the single remaining block normally.
   495  xtsSm4DecLastBlock:
   496  	VLD1.P 16(srcPtr), [B0.S4]
   497  	VEOR TW.B16, B0.B16, B0.B16
   498  	VREV32 B0.B16, B0.B16
   499  	sm4eEnc1block()
   500  	VEOR TW.B16, B0.B16, B0.B16
   501  	VST1.P [B0.S4], 16(dstPtr)
   502  	mul2GBInline
   503  
// Persist the advanced tweak for a possible subsequent call.
   504  xtsSm4DecDone:
   505  	VST1 [TW.B16], (twPtr)
   506  	RET