github.com/emmansun/gmsm@v0.29.1/sm4/asm_arm64.s

//go:build !purego

#include "textflag.h"

#define t0 V0
#define t1 V1
#define t2 V2
#define t3 V3
#define t4 V4
#define t5 V5
#define t6 V6
#define t7 V7
#define x V8
#define y V9
#define XTMP6 V10
#define XTMP7 V11
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define R08_MASK V24
#define INVERSE_SHIFT_ROWS V25
#define NIBBLE_MASK V26
#define FK_MASK V27
#define ZERO V28

#include "aesni_macros_arm64.s"

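// SM4_TAO_L2 applies the key-schedule transform T' to the four lanes of x:
// the S-box substitution (SM4_SBOX, from the included aesni_macros_arm64.s,
// which appears to emulate the SM4 S-box with AES instructions plus the
// M1*/M2* affine tables), followed by the linear transform
//   L'(B) = B ^ (B <<< 13) ^ (B <<< 23).
// Each rotation is built from a VSHL/VSRI pair; y and XTMP6 are scratch.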
#define SM4_TAO_L2(x, y)         \
	SM4_SBOX(x, y, XTMP6);                      \
	;                                           \ //####################  4 parallel L2 linear transforms ##################//
	VSHL $13, x.S4, y.S4;                       \
	VSRI $19, x.S4, y.S4;                       \
	VSHL $23, x.S4, XTMP6.S4;                   \
	VSRI $9, x.S4, XTMP6.S4;                    \
	VEOR XTMP6.B16, y.B16, y.B16;               \
	VEOR x.B16, y.B16, x.B16

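// SM4_EXPANDKEY_ROUND computes one key-schedule round:
//   rk[i] = t0 = t0 ^ T'(t1 ^ t2 ^ t3 ^ CK[i])
// CK[i] is read from R9 (post-incremented). The resulting round key is
// written forward through R10 for the encryption schedule and backward
// through R11 for the decryption schedule.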
#define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \
	MOVW.P 4(R9), R19;                               \
	VMOV R19, x.S[0];                                \
	VEOR t1.B16, x.B16, x.B16;                       \
	VEOR t2.B16, x.B16, x.B16;                       \
	VEOR t3.B16, x.B16, x.B16;                       \
	SM4_TAO_L2(x, y);                                \
	VEOR x.B16, t0.B16, t0.B16;                      \
	VMOV t0.S[0], R2;                                \
	MOVW.P R2, 4(R10);                               \
	MOVW.P R2, -4(R11)

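// LOAD_SM4KEY_AESNI_CONSTS loads the constants used by the software key
// schedule: the nibble mask, the m1/m2 affine tables consumed by SM4_SBOX,
// the FK constants, and the inverse-ShiftRows permutation. The backing data
// symbols are expected to be defined alongside the shared macros in
// aesni_macros_arm64.s.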
#define LOAD_SM4KEY_AESNI_CONSTS() \
	MOVW $0x0F0F0F0F, R0                              \
	VDUP R0, NIBBLE_MASK.S4                           \
	MOVD $m1_2<>(SB), R0                              \
	VLD1 (R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16]   \
	MOVD $fk_mask<>(SB), R0                           \
	VLD1 (R0), [FK_MASK.B16]                          \
	MOVD $inverse_shift_rows<>(SB), R0                \
	VLD1 (R0), [INVERSE_SHIFT_ROWS.B16]

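// SM4EKEY_EXPORT_KEYS stores the eight round keys held in V8/V9: in natural
// order through R10 for the encryption schedule, and word-reversed (via
// V10/V11) through R11, walking backward, for the decryption schedule.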
#define SM4EKEY_EXPORT_KEYS() \
	VMOV V9.S[3], V10.S[0]            \
	VMOV V9.S[2], V10.S[1]            \
	VMOV V9.S[1], V10.S[2]            \
	VMOV V9.S[0], V10.S[3]            \
	VMOV V8.S[3], V11.S[0]            \
	VMOV V8.S[2], V11.S[1]            \
	VMOV V8.S[1], V11.S[2]            \
	VMOV V8.S[0], V11.S[3]            \
	VST1.P	[V8.S4, V9.S4], 32(R10)   \
	VST1	[V10.S4, V11.S4], (R11)   \
	SUB  $32, R11, R11

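// SM4E_ROUND encrypts one 16-byte block with the ARMv8 SM4 extension: each
// SM4E instruction performs four rounds using the four round keys in its
// second operand, so the eight SM4E below cover all 32 rounds (round keys
// preloaded in V0-V7). VREV32 converts the loaded bytes to big-endian words;
// the closing VREV64+VEXT pair byte-reverses the whole state to produce the
// standard SM4 output order before it is stored.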
#define SM4E_ROUND() \
	VLD1.P 16(R10), [V8.B16]    \
	VREV32 V8.B16, V8.B16       \
	WORD $0xcec08408            \ //SM4E V8.4S, V0.4S
	WORD $0xcec08428            \ //SM4E V8.4S, V1.4S
	WORD $0xcec08448            \ //SM4E V8.4S, V2.4S
	WORD $0xcec08468            \ //SM4E V8.4S, V3.4S
	WORD $0xcec08488            \ //SM4E V8.4S, V4.4S
	WORD $0xcec084a8            \ //SM4E V8.4S, V5.4S
	WORD $0xcec084c8            \ //SM4E V8.4S, V6.4S
	WORD $0xcec084e8            \ //SM4E V8.4S, V7.4S
	VREV64	V8.B16, V8.B16             \
	VEXT $8, V8.B16, V8.B16, V8.B16    \
	VST1.P  [V8.B16], 16(R9)

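// expandKeyAsm derives the 32 SM4 round keys from the 128-bit key. When
// inst == 1 it uses the SM4EKEY instructions of the ARMv8 SM4 extension;
// otherwise it runs the NEON key schedule built on the macros above. enc
// receives the round keys in encryption order, dec in reverse order for
// decryption.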
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	MOVD key+0(FP), R8
	MOVD ck+8(FP), R9
	MOVD enc+16(FP), R10
	MOVD dec+24(FP), R11
	MOVD inst+32(FP), R12

	CMP $1, R12
	BEQ sm4ekey

	LOAD_SM4KEY_AESNI_CONSTS()

	VLD1 (R8), [t0.B16]
	VREV32 t0.B16, t0.B16
	VEOR t0.B16, FK_MASK.B16, t0.B16
	VMOV t0.S[1], t1.S[0]
	VMOV t0.S[2], t2.S[0]
	VMOV t0.S[3], t3.S[0]

	EOR R0, R0
	ADD $124, R11
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

ksLoop:
		SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3)
		SM4_EXPANDKEY_ROUND(x, y, t1, t2, t3, t0)
		SM4_EXPANDKEY_ROUND(x, y, t2, t3, t0, t1)
		SM4_EXPANDKEY_ROUND(x, y, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE ksLoop
	RET

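// Hardware key schedule: SM4EKEY Vd.4S, Vn.4S, Vm.4S derives four round keys
// from the previous four keys in Vn and four CK constants in Vm, so the
// eight SM4EKEY instructions below produce the full schedule. Each pair is
// followed by SM4EKEY_EXPORT_KEYS to emit the enc and dec key arrays.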
sm4ekey:
	MOVD $fk_mask<>(SB), R0
	VLD1 (R0), [FK_MASK.B16]
	VLD1 (R8), [V9.B16]
	VREV32 V9.B16, V9.B16
	VEOR FK_MASK.B16, V9.B16, V9.B16
	ADD $96, R11

	VLD1.P	64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD $0xce60c928          //SM4EKEY V8.4S, V9.4S, V0.4S
	WORD $0xce61c909          //SM4EKEY V9.4S, V8.4S, V1.4S
	SM4EKEY_EXPORT_KEYS()

	WORD $0xce62c928          //SM4EKEY V8.4S, V9.4S, V2.4S
	WORD $0xce63c909          //SM4EKEY V9.4S, V8.4S, V3.4S
	SM4EKEY_EXPORT_KEYS()

	VLD1.P	64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD $0xce60c928          //SM4EKEY V8.4S, V9.4S, V0.4S
	WORD $0xce61c909          //SM4EKEY V9.4S, V8.4S, V1.4S
	SM4EKEY_EXPORT_KEYS()

	WORD $0xce62c928          //SM4EKEY V8.4S, V9.4S, V2.4S
	WORD $0xce63c909          //SM4EKEY V9.4S, V8.4S, V3.4S
	SM4EKEY_EXPORT_KEYS()
	RET

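// encryptBlocksAsm encrypts multiple blocks with the expanded key xk. When
// inst == 1 the SM4E hardware path is taken; otherwise the NEON path handles
// exactly four blocks (64 bytes), or eight blocks (128 bytes) via double_enc,
// transposing the states so each round operates on all blocks in parallel.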
// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
	MOVD xk+0(FP), R8
	MOVD dst+8(FP), R9
	MOVD src+32(FP), R10
	MOVD src_len+40(FP), R12
	MOVD inst+56(FP), R11

	CMP $1, R11
	BEQ sm4niblocks

	LOAD_SM4_AESNI_CONSTS()

	CMP $128, R12
	BEQ double_enc

	VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	EOR R0, R0

encryptBlocksLoop:
		SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
		SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
		SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
		SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE encryptBlocksLoop

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16

	VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
	RET

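// Eight-block path: two groups of four transposed states are carried through
// the 32 rounds together, presumably to keep more independent work in flight
// per round than the four-block loop above.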
double_enc:
	VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
	VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	VREV32 t4.B16, t4.B16
	VREV32 t5.B16, t5.B16
	VREV32 t6.B16, t6.B16
	VREV32 t7.B16, t7.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)

	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	EOR R0, R0

encrypt8BlocksLoop:
		SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
		SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
		SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
		SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)

		ADD $16, R0
		CMP $128, R0
		BNE encrypt8BlocksLoop

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	VREV32 t4.B16, t4.B16
	VREV32 t5.B16, t5.B16
	VREV32 t6.B16, t6.B16
	VREV32 t7.B16, t7.B16

	VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9)
	VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9)

	RET

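// Hardware path: the 32 round keys are preloaded into V0-V7 once, then
// SM4E_ROUND encrypts one 16-byte block per iteration until the source is
// exhausted.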
sm4niblocks:
	VLD1.P  64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
	VLD1.P  64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]

sm4niblockloop:
		SM4E_ROUND()
		SUB	$16, R12, R12                                  // consume the 16 bytes just processed
		CBNZ	R12, sm4niblockloop                            // loop while data remains
	RET

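// encryptBlockAsm encrypts a single 16-byte block: with inst == 1 it uses
// eight SM4E instructions (four rounds each); otherwise it runs the 32
// rounds with the NEON SM4_ROUND macro on a single state.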
// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	MOVD xk+0(FP), R8
	MOVD dst+8(FP), R9
	MOVD src+16(FP), R10
	MOVD inst+24(FP), R11

	CMP $1, R11
	BEQ sm4niblock

	VLD1 (R10), [t0.S4]
	VREV32 t0.B16, t0.B16
	VMOV t0.S[1], t1.S[0]
	VMOV t0.S[2], t2.S[0]
	VMOV t0.S[3], t3.S[0]

	LOAD_SM4_AESNI_CONSTS()

	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	EOR R0, R0

encryptBlockLoop:
		SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
		SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
		SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
		SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE encryptBlockLoop

	VMOV t2.S[0], t3.S[1]
	VMOV t1.S[0], t3.S[2]
	VMOV t0.S[0], t3.S[3]
	VREV32 t3.B16, t3.B16
	VST1 [t3.B16], (R9)
	RET

sm4niblock:
	VLD1 (R10), [V8.B16]
	VREV32 V8.B16, V8.B16
	VLD1.P	64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD $0xcec08408          //SM4E V8.4S, V0.4S
	WORD $0xcec08428          //SM4E V8.4S, V1.4S
	WORD $0xcec08448          //SM4E V8.4S, V2.4S
	WORD $0xcec08468          //SM4E V8.4S, V3.4S
	VLD1.P	64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD $0xcec08408          //SM4E V8.4S, V0.4S
	WORD $0xcec08428          //SM4E V8.4S, V1.4S
	WORD $0xcec08448          //SM4E V8.4S, V2.4S
	WORD $0xcec08468          //SM4E V8.4S, V3.4S
	VREV64	V8.B16, V8.B16
	VEXT $8, V8.B16, V8.B16, V8.B16
	VST1	[V8.B16], (R9)
	RET