github.com/emmansun/gmsm@v0.29.1/sm4/gcm_sm4ni_arm64.s

//go:build !purego

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26

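// reduce() combines the partial carry-less products accumulated in ACC0,
// ACC1 and ACCM (the high, low and middle Karatsuba terms of a 128x128-bit
// multiplication) and reduces the 256-bit result modulo the GHASH
// polynomial, leaving the new 128-bit state in ACC0. POLY holds the folding
// constant (0xC2 << 56 in its low doubleword, 1 in its high doubleword)
// used by the two VPMULL-by-POLY steps.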
#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \

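// mulRound(X) accumulates one more 16-byte block X into the running GHASH
// accumulators: it loads the next precomputed entry of the H table (value
// plus Karatsuba pre-sum) from pTbl, byte-reverses the block with VREV64,
// and XORs the low, high and middle 64x64-bit carry-less products into
// ACC1, ACC0 and ACCM respectively.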
#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

#include "sm4ni_macros_arm64.s"
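// sm4eEnc1block() and sm4eEnc8blocks(), used below, are defined in
// sm4ni_macros_arm64.s; they encrypt the counter block(s) in B0 (or B0-B7)
// in place using the round keys in K0-K7, presumably via the ARMv8 SM4
// (SM4E) instructions.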

// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
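// productTable holds precomputed multiples of the GHASH key H (each entry
// paired with its Karatsuba pre-sum, as consumed by mulRound), ctr is the
// working counter block, T the running tag already updated with the AAD,
// and rk the 32 SM4 round keys.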
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	rk_base+72(FP), rk

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
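	// CTR is kept byte-reversed within each 32-bit word (VREV32) so that a
	// plain VADD on the S4 lanes, with INC.S[3] = 1, increments the
	// big-endian GCM counter word.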

	// If fewer than 8 blocks (128 bytes) remain, skip to the single-block loop
	CMP	$128, srcPtrLen

	MOVD	rk, H0
	// The 32 SM4 round keys are kept in K0 .. K7 (four per vector)
	VLD1.P	64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P	64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]

	BLT	startSingles
octetsLoop:
		SUB	$128, srcPtrLen
		// Prepare 8 counters
		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VADD	B1.S4, INC.S4, B2.S4
		VADD	B2.S4, INC.S4, B3.S4
		VADD	B3.S4, INC.S4, B4.S4
		VADD	B4.S4, INC.S4, B5.S4
		VADD	B5.S4, INC.S4, B6.S4
		VADD	B6.S4, INC.S4, B7.S4
		VADD	B7.S4, INC.S4, CTR.S4

		sm4eEnc8blocks()

		// XOR plaintext and store ciphertext
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B0.B16, T1.B16, B0.B16
		VEOR	B1.B16, T2.B16, B1.B16
		VST1.P  [B0.B16, B1.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B2.B16, T1.B16, B2.B16
		VEOR	B3.B16, T2.B16, B3.B16
		VST1.P  [B2.B16, B3.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B4.B16, T1.B16, B4.B16
		VEOR	B5.B16, T2.B16, B5.B16
		VST1.P  [B4.B16, B5.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B6.B16, T1.B16, B6.B16
		VEOR	B7.B16, T2.B16, B7.B16
		VST1.P  [B6.B16, B7.B16], 32(dstPtr)

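		// Fold the 8 ciphertext blocks into the GHASH state: the first block
		// is XORed with the running tag (ACC0) and multiplied by the first
		// (highest-power) entry of productTable, then mulRound walks the
		// table for the remaining seven blocks before the final reduce().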
		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		mulRound(B2)
		mulRound(B3)
		mulRound(B4)
		mulRound(B5)
		mulRound(B6)
		mulRound(B7)
		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4
		sm4eEnc1block()

singlesLast:
		VLD1.P	16(srcPtr), [T0.B16]
		VEOR	T0.B16, B0.B16, B0.B16

encReduce:
		VST1.P	[B0.B16], 16(dstPtr)

		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr

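	// Load the final partial block into T0 back to front while building a
	// matching byte mask of 0xFF in T3: srcPtr now points one past the last
	// input byte, and each TBZ below tests one bit of srcPtrLen to decide
	// whether an 8-, 4-, 2- or 1-byte piece remains to be loaded.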
	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]

ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:
	MOVD	ZR, srcPtrLen
	VMOV	CTR.B16, B0.B16
	sm4eEnc1block()

tailLast:
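	// XOR the keystream with the partial plaintext and clear everything past
	// the message end with the T3 mask, so the jump to encReduce stores and
	// hashes a block that is zero-padded beyond the real ciphertext. Note
	// that encReduce writes a full 16 bytes to dst; as with the decrypt tail,
	// this assumes the caller leaves room (e.g. for the tag) past the end of
	// the ciphertext.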
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	rk_base+72(FP), rk

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	pTbl, pTblSave
	MOVD	rk, rkSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	// If fewer than 8 blocks (128 bytes) remain, skip to the single-block loop
	CMP	$128, srcPtrLen

	MOVD	rk, H0
	// The 32 SM4 round keys are kept in K0 .. K7 (four per vector)
	VLD1.P	64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
	VLD1.P	64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]

	BLT	startSingles
octetsLoop:
		SUB	$128, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VADD	B1.S4, INC.S4, B2.S4
		VADD	B2.S4, INC.S4, B3.S4
		VADD	B3.S4, INC.S4, B4.S4
		VADD	B4.S4, INC.S4, B5.S4
		VADD	B5.S4, INC.S4, B6.S4
		VADD	B6.S4, INC.S4, B7.S4
		VADD	B7.S4, INC.S4, CTR.S4

		sm4eEnc8blocks()

		VMOV	B0.B16, T1.B16
		VMOV	B1.B16, T2.B16
		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B0.B16, T1.B16, T1.B16
		VEOR	B1.B16, T2.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)

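		// GHASH is computed over the ciphertext, i.e. over the blocks just
		// loaded from srcPtr (B0, B1 here and the pairs below), not over the
		// keystream.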
		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B2.B16, B0.B16, T1.B16
		VEOR	B3.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B4.B16, B0.B16, T1.B16
		VEOR	B5.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B6.B16, B0.B16, T1.B16
		VEOR	B7.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		VLD1.P	16(srcPtr), [T0.B16]
		VREV64	T0.B16, B5.B16
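		// T0 still holds the ciphertext block for the XOR below, while B5
		// keeps a byte-reversed copy for the GHASH update.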

		VMOV	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4
		sm4eEnc1block()

singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16
		VST1.P	[B0.B16], 16(dstPtr)

		VEOR	ACC0.B16, B5.B16, B5.B16
		VEXT	$8, B5.B16, B5.B16, T0.B16
		VEOR	B5.B16, T0.B16, T0.B16
		VPMULL	B5.D1, T1.D1, ACC1.Q1
		VPMULL2	B5.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done
	VMOV	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	sm4eEnc1block()

tailLast:
	// Assuming it is safe to load 16 bytes from srcPtr past the end of the
	// ciphertext, because the tag is expected to follow it in the buffer.
	// B5 holds the last (partial) ciphertext block.
	VLD1	(srcPtr), [B5.B16]

	VEOR	B5.B16, B0.B16, B0.B16

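	// Store exactly srcPtrLen bytes of plaintext from B0, largest pieces
	// first, while building a byte mask in T3 so that only the real
	// ciphertext bytes of B5 are folded into GHASH below.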
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1

	TBZ	$3, srcPtrLen, ld4 // Skip the 8-byte store if bit 3 of srcPtrLen is clear
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ	$2, srcPtrLen, ld2 // Skip the 4-byte store if bit 2 of srcPtrLen is clear
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1 // Skip the 2-byte store if bit 1 of srcPtrLen is clear
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0 // Skip the 1-byte store if bit 0 of srcPtrLen is clear
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:

	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1	[ACC0.B16], (tPtr)

	RET