//go:build !purego

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define NIBBLE_MASK V23
#define INVERSE_SHIFT_ROWS V24
#define M1L V25
#define M1H V26
#define M2L V27
#define M2H V28
#define R08_MASK V29

#define reduce() \
	VEOR ACC0.B16, ACCM.B16, ACCM.B16 \
	VEOR ACC1.B16, ACCM.B16, ACCM.B16 \
	VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \
	VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR ACCM.B16, ACC0.B16, ACC0.B16 \
	VEOR T0.B16, ACC1.B16, ACC1.B16 \
	VPMULL POLY.D1, ACC0.D1, T0.Q1 \
	VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR T0.B16, ACC0.B16, ACC0.B16 \
	VPMULL POLY.D1, ACC0.D1, T0.Q1 \
	VEOR T0.B16, ACC1.B16, ACC1.B16 \
	VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR ACC1.B16, ACC0.B16, ACC0.B16 \

// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	MOVD $0xC2, R1
	LSL $56, R1
	MOVD $1, R0
	VMOV R1, POLY.D[0]
	VMOV R0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD productTable+0(FP), pTbl
	MOVD tagMask+8(FP), tMsk
	MOVD T+16(FP), tPtr
	MOVD pLen+24(FP), plen
	MOVD dLen+32(FP), dlen

	VLD1 (tPtr), [ACC0.B16]
	VLD1 (tMsk), [B1.B16]

	LSL $3, plen
	LSL $3, dlen

	VMOV dlen, B0.D[0]
	VMOV plen, B0.D[1]

	ADD $14*16, pTbl
	VLD1.P (pTbl), [T1.B16, T2.B16]

	VEOR ACC0.B16, B0.B16, B0.B16

	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	reduce()

	VREV64 ACC0.B16, ACC0.B16
	VEOR B1.B16, ACC0.B16, ACC0.B16

	VST1 [ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

#include "aesni_macros_arm64.s"

// func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define pTbl R0
#define RK R1
#define I R2

	MOVD productTable+0(FP), pTbl
	MOVD rk+8(FP), RK
	MOVD inst+16(FP), R5

	MOVD $0xC2, I
	LSL $56, I
	VMOV I, POLY.D[0]
	MOVD $1, I
	VMOV I, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the SM4 keys to generate the hash key H
	CMP $1, R5
	BEQ sm4InitSM4E

	LOAD_SM4_AESNI_CONSTS()
	VEOR B0.B16, B0.B16, B0.B16
	VEOR B1.B16, B1.B16, B1.B16
	VEOR B2.B16, B2.B16, B2.B16
	VEOR B3.B16, B3.B16, B3.B16
	EOR R3, R3

sm4InitEncLoop:
	SM4_ROUND(RK, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(RK, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(RK, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(RK, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R3
	CMP $8, R3
	BNE sm4InitEncLoop

	VMOV B0.S[0], B0.S[2]
	VMOV B1.S[0], B0.S[3]
	VMOV B2.S[0], B0.S[0]
	VMOV B3.S[0], B0.S[1]
	B sm4InitEncDone
sm4InitSM4E:
	VEOR B0.B16, B0.B16, B0.B16
	VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
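	// Note: the WORD directives below hand-encode the Armv8.2 SM4E
	// instruction (SM4E V0.4S, Vn.4S), which the Go assembler cannot emit
	// by name. Each SM4E applies four SM4 rounds to the state in V0 using
	// the four round keys held in Vn, so the eight SM4E instructions below
	// consume all 32 round keys loaded into T0-T3 (V11-V14).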
	WORD $0x6085c0ce //SM4E V0.4S, V11.4S
	WORD $0x8085c0ce //SM4E V0.4S, V12.4S
	WORD $0xa085c0ce //SM4E V0.4S, V13.4S
	WORD $0xc085c0ce //SM4E V0.4S, V14.4S
	VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
	WORD $0x6085c0ce //SM4E V0.4S, V11.4S
	WORD $0x8085c0ce //SM4E V0.4S, V12.4S
	WORD $0xa085c0ce //SM4E V0.4S, V13.4S
	WORD $0xc085c0ce //SM4E V0.4S, V14.4S
	VREV32 B0.B16, B0.B16
	VREV64 B0.B16, B0.B16
sm4InitEncDone:
	// Multiply by 2 modulo P
	VMOV B0.D[0], I
	ASR $63, I
	VMOV I, T1.D[0]
	VMOV I, T1.D[1]
	VAND POLY.B16, T1.B16, T1.B16
	VUSHR $63, B0.D2, T2.D2
	VEXT $8, ZERO.B16, T2.B16, T2.B16
	VSLI $1, B0.D2, T2.D2
	VEOR T1.B16, T2.B16, B0.B16

	// Karatsuba pre-computation
	VEXT $8, B0.B16, B0.B16, B1.B16
	VEOR B0.B16, B1.B16, B1.B16

	ADD $14*16, pTbl

	VST1 [B0.B16, B1.B16], (pTbl)
	SUB $2*16, pTbl

	VMOV B0.B16, B2.B16
	VMOV B1.B16, B3.B16

	MOVD $7, I

initLoop:
	// Compute powers of H
	SUBS $1, I

	VPMULL B0.D1, B2.D1, T1.Q1
	VPMULL2 B0.D2, B2.D2, T0.Q1
	VPMULL B1.D1, B3.D1, T2.Q1
	VEOR T0.B16, T2.B16, T2.B16
	VEOR T1.B16, T2.B16, T2.B16
	VEXT $8, ZERO.B16, T2.B16, T3.B16
	VEXT $8, T2.B16, ZERO.B16, T2.B16
	VEOR T2.B16, T0.B16, T0.B16
	VEOR T3.B16, T1.B16, T1.B16
	VPMULL POLY.D1, T0.D1, T2.Q1
	VEXT $8, T0.B16, T0.B16, T0.B16
	VEOR T2.B16, T0.B16, T0.B16
	VPMULL POLY.D1, T0.D1, T2.Q1
	VEXT $8, T0.B16, T0.B16, T0.B16
	VEOR T2.B16, T0.B16, T0.B16
	VEOR T1.B16, T0.B16, B2.B16
	VMOV B2.B16, B3.B16
	VEXT $8, B2.B16, B2.B16, B2.B16
	VEOR B2.B16, B3.B16, B3.B16

	VST1 [B2.B16, B3.B16], (pTbl)
	SUB $2*16, pTbl

	BNE initLoop
	RET
#undef I
#undef RK
#undef pTbl

// func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmSm4Data(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

#define mulRound(X) \
	VLD1.P 32(pTbl), [T1.B16, T2.B16] \
	VREV64 X.B16, X.B16 \
	VEXT $8, X.B16, X.B16, T0.B16 \
	VEOR X.B16, T0.B16, T0.B16 \
	VPMULL X.D1, T1.D1, T3.Q1 \
	VEOR T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2 X.D2, T1.D2, T3.Q1 \
	VEOR T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL T0.D1, T2.D1, T3.Q1 \
	VEOR T3.B16, ACCM.B16, ACCM.B16

	MOVD productTable+0(FP), pTbl
	MOVD data_base+8(FP), aut
	MOVD data_len+16(FP), autLen
	MOVD T+32(FP), tPtr

	VEOR ACC0.B16, ACC0.B16, ACC0.B16
	//VLD1 (tPtr), [ACC0.B16] // originally we passed in tag initial value
	CBZ autLen, dataBail

	MOVD $0xC2, H0
	LSL $56, H0
	VMOV H0, POLY.D[0]
	MOVD $1, H0
	VMOV H0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	MOVD pTbl, pTblSave

	CMP $13, autLen
	BEQ dataTLS
	CMP $128, autLen
	BLT startSinglesLoop
	B octetsLoop

dataTLS:
	ADD $14*16, pTbl
	VLD1.P (pTbl), [T1.B16, T2.B16]
	VEOR B0.B16, B0.B16, B0.B16

	MOVD (aut), H0
	VMOV H0, B0.D[0]
	MOVW 8(aut), H0
	VMOV H0, B0.S[2]
	MOVB 12(aut), H0
	VMOV H0, B0.B[12]

	MOVD $0, autLen
	B dataMul

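	// Note: octetsLoop hashes the AAD eight blocks at a time with a single
	// deferred reduction: each block is multiplied by the descending power
	// of H stored in productTable (as laid out by gcmSm4Init), the
	// Karatsuba partial products accumulate in ACC0/ACC1/ACCM via
	// mulRound, and reduce() folds the 256-bit sum back to 128 bits once
	// per iteration.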
octetsLoop:
	CMP $128, autLen
	BLT startSinglesLoop
	SUB $128, autLen

	VLD1.P 32(aut), [B0.B16, B1.B16]

	VLD1.P 32(pTbl), [T1.B16, T2.B16]
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16
	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	VLD1.P 32(aut), [B2.B16, B3.B16]
	mulRound(B2)
	mulRound(B3)
	VLD1.P 32(aut), [B4.B16, B5.B16]
	mulRound(B4)
	mulRound(B5)
	VLD1.P 32(aut), [B6.B16, B7.B16]
	mulRound(B6)
	mulRound(B7)

	MOVD pTblSave, pTbl
	reduce()
	B octetsLoop

startSinglesLoop:

	ADD $14*16, pTbl
	VLD1.P (pTbl), [T1.B16, T2.B16]

singlesLoop:

	CMP $16, autLen
	BLT dataEnd
	SUB $16, autLen

	VLD1.P 16(aut), [B0.B16]
dataMul:
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16

	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	reduce()

	B singlesLoop

dataEnd:

	CBZ autLen, dataBail
	VEOR B0.B16, B0.B16, B0.B16
	ADD autLen, aut

dataLoadLoop:
	MOVB.W -1(aut), H0
	VEXT $15, B0.B16, ZERO.B16, B0.B16
	VMOV H0, B0.B[0]
	SUBS $1, autLen
	BNE dataLoadLoop
	B dataMul

dataBail:
	VST1 [ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

// func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
#define mulRoundSingleWithoutRev(X) \
	VEOR ACC0.B16, X.B16, X.B16 \
	VEXT $8, X.B16, X.B16, T0.B16 \
	VEOR X.B16, T0.B16, T0.B16 \
	VPMULL X.D1, T1.D1, ACC1.Q1 \
	VPMULL2 X.D2, T1.D2, ACC0.Q1 \
	VPMULL T0.D1, T2.D1, ACCM.Q1 \
	reduce() \

#define mulRoundSingle(X) \
	VREV64 X.B16, X.B16 \
	mulRoundSingleWithoutRev(X) \

	MOVD productTable+0(FP), pTbl
	MOVD dst+8(FP), dstPtr
	MOVD src_base+32(FP), srcPtr
	MOVD src_len+40(FP), srcPtrLen
	MOVD ctr+56(FP), ctrPtr
	MOVD T+64(FP), tPtr
	MOVD rk_base+72(FP), rk

	MOVD $0xC2, H1
	LSL $56, H1
	MOVD $1, H0
	VMOV H1, POLY.D[0]
	VMOV H0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD pTbl, pTblSave
	MOVD rk, rkSave
	// Current tag, after AAD
	VLD1 (tPtr), [ACC0.B16]
	VEOR ACC1.B16, ACC1.B16, ACC1.B16
	VEOR ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1 (ctrPtr), [CTR.B16]
	VEOR INC.B16, INC.B16, INC.B16
	MOVD $1, H0
	VMOV H0, INC.S[3]
	VREV32 CTR.B16, CTR.B16
	VADD CTR.S4, INC.S4, CTR.S4

	// Skip to <8 blocks loop
	CMP $128, srcPtrLen

	LOAD_SM4_AESNI_CONSTS()

	BLT encNibblesLoop
	// There are at least 8 blocks to encrypt

encOctetsLoop:
	SUB $128, srcPtrLen
	// Prepare 8 counters
	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, B4.S4
	VADD B4.S4, INC.S4, B5.S4
	VADD B5.S4, INC.S4, B6.S4
	VADD B6.S4, INC.S4, B7.S4
	VADD B7.S4, INC.S4, CTR.S4

	// Encrypt the 8 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

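	// Note: the counter blocks were transposed by PRE_TRANSPOSE_MATRIX so
	// that each SM4_8BLOCKS_ROUND (defined in aesni_macros_arm64.s)
	// advances all eight blocks by one round in parallel; eight passes of
	// the four calls below give SM4's 32 rounds, after which the state is
	// byte-swapped and transposed back.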
encOctetsEnc8Blocks:
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

	ADD $1, R13
	CMP $8, R13
	BNE encOctetsEnc8Blocks
	VREV32 B0.B16, B0.B16
	VREV32 B1.B16, B1.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	VREV32 B4.B16, B4.B16
	VREV32 B5.B16, B5.B16
	VREV32 B6.B16, B6.B16
	VREV32 B7.B16, B7.B16
	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)

	// XOR plaintext and store ciphertext
	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B0.B16, T1.B16, B0.B16
	VEOR B1.B16, T2.B16, B1.B16
	VST1.P [B0.B16, B1.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B2.B16, T1.B16, B2.B16
	VEOR B3.B16, T2.B16, B3.B16
	VST1.P [B2.B16, B3.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B4.B16, T1.B16, B4.B16
	VEOR B5.B16, T2.B16, B5.B16
	VST1.P [B4.B16, B5.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B6.B16, T1.B16, B6.B16
	VEOR B7.B16, T2.B16, B7.B16
	VST1.P [B6.B16, B7.B16], 32(dstPtr)

	VLD1.P 32(pTbl), [T1.B16, T2.B16]
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16
	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	mulRound(B2)
	mulRound(B3)
	mulRound(B4)
	mulRound(B5)
	mulRound(B6)
	mulRound(B7)
	MOVD pTblSave, pTbl
	reduce()

	CMP $128, srcPtrLen
	BGE encOctetsLoop

encNibblesLoop:
	CBZ srcPtrLen, encDone
	ADD $14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P (pTbl), [T1.B16, T2.B16]

	CMP $64, srcPtrLen
	BLT encStartSingles
	SUB $64, srcPtrLen

	// Prepare 4 counters
	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, CTR.S4

	// Encrypt the 4 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

encNibblesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE encNibblesEnc4Blocks
	VREV32 B0.B16, B0.B16
	VREV32 B1.B16, B1.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

	// XOR plaintext and store ciphertext
	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
	VEOR B0.B16, K1.B16, B0.B16
	VEOR B1.B16, K2.B16, B1.B16
	VST1.P [B0.B16, B1.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
	VEOR B2.B16, K1.B16, B2.B16
	VEOR B3.B16, K2.B16, B3.B16
	VST1.P [B2.B16, B3.B16], 32(dstPtr)

	mulRoundSingle(B0)
	mulRoundSingle(B1)
	mulRoundSingle(B2)
	mulRoundSingle(B3)

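	// Note: encStartSingles generates one last batch of four keystream
	// blocks and consumes them one at a time for the remaining (<4) full
	// blocks. encTail then gathers the final 1-15 plaintext bytes into T0
	// together with a matching all-ones mask in T3; K0 = (K0 ^ T0) & T3
	// zero-pads the partial ciphertext block so it can be hashed as GHASH
	// requires, and the full 16-byte store presumably relies on the tag
	// space that follows the ciphertext in dst.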
encStartSingles:
	CBZ srcPtrLen, encDone

	// Prepare 4 counters
	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, CTR.S4

	// Encrypt the 4 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

encSinglesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE encSinglesEnc4Blocks
	VREV32 B0.B16, B0.B16
	VREV32 B1.B16, B1.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

	VMOV B0.B16, K0.B16
	CMP $16, srcPtrLen
	BLT encTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingle(K0)

	VMOV B1.B16, K0.B16
	CMP $16, srcPtrLen
	BLT encTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingle(K0)

	VMOV B2.B16, K0.B16
	CMP $16, srcPtrLen
	BLT encTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingle(K0)

	VMOV B3.B16, K0.B16
	CMP $16, srcPtrLen
	BLT encTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingle(K0)

encTail:
	CBZ srcPtrLen, encDone
	VEOR T0.B16, T0.B16, T0.B16
	VEOR T3.B16, T3.B16, T3.B16
	MOVD $0, H1
	SUB $1, H1
	ADD srcPtrLen, srcPtr

	TBZ $3, srcPtrLen, ld4
	MOVD.W -8(srcPtr), H0
	VMOV H0, T0.D[0]
	VMOV H1, T3.D[0]
ld4:
	TBZ $2, srcPtrLen, ld2
	MOVW.W -4(srcPtr), H0
	VEXT $12, T0.B16, ZERO.B16, T0.B16
	VEXT $12, T3.B16, ZERO.B16, T3.B16
	VMOV H0, T0.S[0]
	VMOV H1, T3.S[0]
ld2:
	TBZ $1, srcPtrLen, ld1
	MOVH.W -2(srcPtr), H0
	VEXT $14, T0.B16, ZERO.B16, T0.B16
	VEXT $14, T3.B16, ZERO.B16, T3.B16
	VMOV H0, T0.H[0]
	VMOV H1, T3.H[0]
ld1:
	TBZ $0, srcPtrLen, ld0
	MOVB.W -1(srcPtr), H0
	VEXT $15, T0.B16, ZERO.B16, T0.B16
	VEXT $15, T3.B16, ZERO.B16, T3.B16
	VMOV H0, T0.B[0]
	VMOV H1, T3.B[0]
ld0:
	MOVD ZR, srcPtrLen
	VEOR T0.B16, K0.B16, K0.B16
	VAND T3.B16, K0.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingle(K0)

encDone:
	VST1 [ACC0.B16], (tPtr)
	RET

// func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Dec(SB),NOSPLIT,$0
	MOVD productTable+0(FP), pTbl
	MOVD dst+8(FP), dstPtr
	MOVD src_base+32(FP), srcPtr
	MOVD src_len+40(FP), srcPtrLen
	MOVD ctr+56(FP), ctrPtr
	MOVD T+64(FP), tPtr
	MOVD rk_base+72(FP), rk

	MOVD $0xC2, H1
	LSL $56, H1
	MOVD $1, H0
	VMOV H1, POLY.D[0]
	VMOV H0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD pTbl, pTblSave
	MOVD rk, rkSave
	// Current tag, after AAD
	VLD1 (tPtr), [ACC0.B16]
	VEOR ACC1.B16, ACC1.B16, ACC1.B16
	VEOR ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1 (ctrPtr), [CTR.B16]
	VEOR INC.B16, INC.B16, INC.B16
	MOVD $1, H0
	VMOV H0, INC.S[3]
	VREV32 CTR.B16, CTR.B16
	VADD CTR.S4, INC.S4, CTR.S4

	// Skip to <8 blocks loop
	CMP $128, srcPtrLen

	LOAD_SM4_AESNI_CONSTS()

	BLT decNibblesLoop
	// There are at least 8 blocks to decrypt

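	// Note: the decrypt path mirrors encOctetsLoop, but GHASH must be
	// computed over the ciphertext: each 32-byte load from srcPtr is both
	// XORed with the keystream (producing plaintext in T1/T2 for the
	// store) and fed, byte-reversed, into the mulRound accumulation before
	// the single reduce() at the end of the iteration.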
decOctetsLoop:
	SUB $128, srcPtrLen

	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, B4.S4
	VADD B4.S4, INC.S4, B5.S4
	VADD B5.S4, INC.S4, B6.S4
	VADD B6.S4, INC.S4, B7.S4
	VADD B7.S4, INC.S4, CTR.S4

	// Encrypt the 8 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

decOctetsEnc8Blocks:
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B0, B1, B2, B3, B4, B5, B6, B7)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B1, B2, B3, B0, B5, B6, B7, B4)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B2, B3, B0, B1, B6, B7, B4, B5)
	SM4_8BLOCKS_ROUND(rk, R19, K0, K1, K2, K3, B3, B0, B1, B2, B7, B4, B5, B6)

	ADD $1, R13
	CMP $8, R13
	BNE decOctetsEnc8Blocks
	VREV32 B0.B16, T1.B16
	VREV32 B1.B16, T2.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(T1, T2, B2, B3, K0, K1, K2, K3)
	VREV32 B4.B16, B4.B16
	VREV32 B5.B16, B5.B16
	VREV32 B6.B16, B6.B16
	VREV32 B7.B16, B7.B16
	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B0.B16, T1.B16, T1.B16
	VEOR B1.B16, T2.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)

	VLD1.P 32(pTbl), [T1.B16, T2.B16]
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16
	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1
	mulRound(B1)

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B2.B16, B0.B16, T1.B16
	VEOR B3.B16, B1.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B4.B16, B0.B16, T1.B16
	VEOR B5.B16, B1.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B6.B16, B0.B16, T1.B16
	VEOR B7.B16, B1.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	MOVD pTblSave, pTbl
	reduce()

	CMP $128, srcPtrLen
	BGE decOctetsLoop

decNibblesLoop:
	CBZ srcPtrLen, decDone
	ADD $14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P (pTbl), [T1.B16, T2.B16]
	CMP $64, srcPtrLen
	BLT decStartSingles
	SUB $64, srcPtrLen

	// Prepare 4 counters
	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, CTR.S4

	// Encrypt the 4 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

decNibblesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE decNibblesEnc4Blocks
	VREV32 B0.B16, B0.B16
	VREV32 B1.B16, B1.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

	// XOR ciphertext and store plaintext
	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
	VREV64 K1.B16, B4.B16
	VREV64 K2.B16, B5.B16
	VEOR B0.B16, K1.B16, B0.B16
	VEOR B1.B16, K2.B16, B1.B16
	VST1.P [B0.B16, B1.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
	VREV64 K1.B16, B6.B16
	VREV64 K2.B16, B7.B16
	VEOR B2.B16, K1.B16, B2.B16
	VEOR B3.B16, K2.B16, B3.B16
	VST1.P [B2.B16, B3.B16], 32(dstPtr)
	mulRoundSingleWithoutRev(B4)
	mulRoundSingleWithoutRev(B5)
	mulRoundSingleWithoutRev(B6)
	mulRoundSingleWithoutRev(B7)
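	// Note: decStartSingles handles the remaining (<4) full ciphertext
	// blocks one at a time, hashing the byte-reversed ciphertext (B5)
	// rather than the plaintext. decTail then reads a full 16 bytes from
	// srcPtr (see the comment there about the trailing tag), writes only
	// the valid prefix of the plaintext to dstPtr, masks the loaded
	// ciphertext with T3 down to the message length, and hashes that
	// zero-padded block before the final reduce().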

decStartSingles:
	CBZ srcPtrLen, decDone

	// Prepare 4 counters
	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VADD B1.S4, INC.S4, B2.S4
	VADD B2.S4, INC.S4, B3.S4
	VADD B3.S4, INC.S4, CTR.S4

	// Encrypt the 4 counter blocks
	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
	EOR R13, R13
	MOVD rkSave, rk

decSinglesEnc4Blocks:
	SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
	SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
	SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
	SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

	ADD $1, R13
	CMP $8, R13
	BNE decSinglesEnc4Blocks
	VREV32 B0.B16, B0.B16
	VREV32 B1.B16, B1.B16
	VREV32 B2.B16, B2.B16
	VREV32 B3.B16, B3.B16
	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

	VMOV B0.B16, K0.B16
	CMP $16, srcPtrLen
	BLT decTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VREV64 K1.B16, B5.B16
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingleWithoutRev(B5)

	VMOV B1.B16, K0.B16
	CMP $16, srcPtrLen
	BLT decTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VREV64 K1.B16, B5.B16
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingleWithoutRev(B5)

	VMOV B2.B16, K0.B16
	CMP $16, srcPtrLen
	BLT decTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VREV64 K1.B16, B5.B16
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingleWithoutRev(B5)

	VMOV B3.B16, K0.B16
	CMP $16, srcPtrLen
	BLT decTail
	SUB $16, srcPtrLen
	VLD1.P 16(srcPtr), [K1.B16]
	VREV64 K1.B16, B5.B16
	VEOR K0.B16, K1.B16, K0.B16
	VST1.P [K0.B16], 16(dstPtr)
	mulRoundSingleWithoutRev(B5)

decTail:
	CBZ srcPtrLen, decDone
	// Assuming it is safe to load past srcPtr due to the presence of the tag
	VLD1 (srcPtr), [B5.B16]

	VEOR B5.B16, K0.B16, B0.B16

	VEOR T3.B16, T3.B16, T3.B16
	MOVD $0, H1
	SUB $1, H1

	TBZ $3, srcPtrLen, decLd4
	VMOV B0.D[0], H0
	MOVD.P H0, 8(dstPtr)
	VMOV H1, T3.D[0]
	VEXT $8, ZERO.B16, B0.B16, B0.B16

decLd4:
	TBZ $2, srcPtrLen, decLd2
	VMOV B0.S[0], H0
	MOVW.P H0, 4(dstPtr)
	VEXT $12, T3.B16, ZERO.B16, T3.B16
	VMOV H1, T3.S[0]
	VEXT $4, ZERO.B16, B0.B16, B0.B16
decLd2:
	TBZ $1, srcPtrLen, decLd1
	VMOV B0.H[0], H0
	MOVH.P H0, 2(dstPtr)
	VEXT $14, T3.B16, ZERO.B16, T3.B16
	VMOV H1, T3.H[0]
	VEXT $2, ZERO.B16, B0.B16, B0.B16
decLd1:
	TBZ $0, srcPtrLen, decLd0
	VMOV B0.B[0], H0
	MOVB.P H0, 1(dstPtr)
	VEXT $15, T3.B16, ZERO.B16, T3.B16
	VMOV H1, T3.B[0]
decLd0:

	VAND T3.B16, B5.B16, B5.B16
	VREV64 B5.B16, B5.B16

	VEOR ACC0.B16, B5.B16, B5.B16
	VEXT $8, B5.B16, B5.B16, T0.B16
	VEOR B5.B16, T0.B16, T0.B16
	VPMULL B5.D1, T1.D1, ACC1.Q1
	VPMULL2 B5.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1
	reduce()

decDone:
	VST1 [ACC0.B16], (tPtr)
	RET