github.com/emmansun/gmsm@v0.29.1/sm4/cbc_arm64.s

//go:build !purego

#include "textflag.h"

#define x V0
#define y V1
#define t0 V2
#define t1 V3
#define t2 V4
#define t3 V5
#define XTMP6 V6
#define XTMP7 V7
#define t4 V10
#define t5 V11
#define t6 V12
#define t7 V13
#define IV V18

#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define FK_MASK V27

#include "aesni_macros_arm64.s"

#define dstPtr R1
#define srcPtr R2
#define rk R3
#define rkSave R4
#define srcPtrLen R5

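// decryptBlocksChain decrypts src into dst with SM4 in CBC mode, working
// from the end of the buffer toward the start in chunks of 8, 4 and then
// 1-4 blocks. The last ciphertext block is saved in V15 and written back
// through iv on return so the caller can continue the chain.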
// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
	LOAD_SM4_AESNI_CONSTS()
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

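	// Load the Go arguments; rkSave keeps the round-key pointer so rk can
	// be rewound before each chunk.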
	MOVD xk+0(FP), rk
	MOVD dst+8(FP), dstPtr
	MOVD src+32(FP), srcPtr
	MOVD src_len+40(FP), srcPtrLen
	MOVD iv+56(FP), R6
	MOVD rk, rkSave
	VLD1 (R6), [IV.B16]

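	// Remember the last ciphertext block in V15; it is written back as the
	// new IV when the function returns.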
	ADD srcPtr, srcPtrLen, R10
	SUB $16, R10, R10
	VLD1 (R10), [V15.S4]

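// While more than 128 bytes remain, decrypt the trailing eight blocks.
// R10 points at the chunk, R11 at the ciphertext block just before it
// (needed for CBC chaining), and R12 at the matching output position.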
cbcSm4Octets:
	CMP	$128, srcPtrLen
	BLE	cbcSm4Nibbles
	SUB	$128, srcPtrLen
	MOVD rkSave, rk
	ADD srcPtr, srcPtrLen, R10
	SUB $16, R10, R11
	ADD dstPtr, srcPtrLen, R12

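	// Load 8 ciphertext blocks, byte-swap each 32-bit word to big-endian
	// and transpose the 4x4 word matrices so each vector register holds
	// one word position from four different blocks.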
	VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
	VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	VREV32 t4.B16, t4.B16
	VREV32 t5.B16, t5.B16
	VREV32 t6.B16, t6.B16
	VREV32 t7.B16, t7.B16

	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
	EOR R0, R0

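	// 32 SM4 rounds, four per iteration, applied to both groups of four blocks.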
cbc8BlocksLoop:
		SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
		SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
		SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
		SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)

		ADD $16, R0
		CMP $128, R0
		BNE cbc8BlocksLoop

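	// Undo the transpose and byte order, then XOR each decrypted block
	// with the ciphertext block that precedes it (CBC chaining) before
	// storing the plaintext.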
	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	VREV32 t4.B16, t4.B16
	VREV32 t5.B16, t5.B16
	VREV32 t6.B16, t6.B16
	VREV32 t7.B16, t7.B16

	VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4]
	VEOR V6.B16, t0.B16, t0.B16
	VEOR V7.B16, t1.B16, t1.B16
	VEOR V8.B16, t2.B16, t2.B16
	VEOR V9.B16, t3.B16, t3.B16

	VLD1.P 64(R11), [V6.S4, V7.S4, V8.S4, V9.S4]
	VEOR V6.B16, t4.B16, t4.B16
	VEOR V7.B16, t5.B16, t5.B16
	VEOR V8.B16, t6.B16, t6.B16
	VEOR V9.B16, t7.B16, t7.B16

	VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R12)
	VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R12)

	B cbcSm4Octets

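// If more than 64 bytes remain, decrypt the trailing four blocks, then
// fall through to handle the final 1-4 blocks at the head of the buffer.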
cbcSm4Nibbles:
	CMP	$64, srcPtrLen
	BLE	cbcSm4Single
	SUB	$64, srcPtrLen
	MOVD rkSave, rk
	ADD srcPtr, srcPtrLen, R10
	SUB $16, R10, R11
	ADD dstPtr, srcPtrLen, R12

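	// Keep the first three ciphertext blocks in t5-t7 for chaining; the
	// block just before this chunk is loaded from R11 further down.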
	VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
	VMOV t0.B16, t5.B16
	VMOV t1.B16, t6.B16
	VMOV t2.B16, t7.B16
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

	EOR R0, R0

cbc4BlocksLoop:
		SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
		SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
		SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
		SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE cbc4BlocksLoop

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16

	VLD1 (R11), [t4.S4]
	VEOR t4.B16, t0.B16, t0.B16
	VEOR t5.B16, t1.B16, t1.B16
	VEOR t6.B16, t2.B16, t2.B16
	VEOR t7.B16, t3.B16, t3.B16

	VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R12)

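// At most 64 bytes remain at the head of the buffer. The first output
// block is chained with the caller-supplied IV.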
cbcSm4Single:
	MOVD rkSave, rk
	EOR R0, R0

	CMP $16, srcPtrLen
	BEQ cbcSm4Single16

	CMP $32, srcPtrLen
	BEQ cbcSm4Single32

	CMP $48, srcPtrLen
	BEQ cbcSm4Single48

	// 4 blocks
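	// Keep the first three ciphertext blocks in t4-t6 to chain blocks 2-4;
	// block 1 is chained with the IV.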
	VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
	VMOV t0.B16, t4.B16
	VMOV t1.B16, t5.B16
	VMOV t2.B16, t6.B16
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

cbc4BlocksLoop64:
		SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
		SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
		SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
		SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE cbc4BlocksLoop64

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16

	VEOR IV.B16, t0.B16, t0.B16
	VEOR t4.B16, t1.B16, t1.B16
	VEOR t5.B16, t2.B16, t2.B16
	VEOR t6.B16, t3.B16, t3.B16

	VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr)

	B cbcSm4Done

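// Exactly one block: spread its four words across lanes of t0-t3 so the
// shared round macro can be reused, then gather the result words back
// into t3 in reversed order before the final byte swap.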
cbcSm4Single16:
	VLD1 (srcPtr), [t0.S4]
	VREV32 t0.B16, t0.B16
	VMOV t0.S[1], t1.S[0]
	VMOV t0.S[2], t2.S[0]
	VMOV t0.S[3], t3.S[0]

cbc4BlocksLoop16:
		SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
		SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
		SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
		SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE cbc4BlocksLoop16

	VMOV t2.S[0], t3.S[1]
	VMOV t1.S[0], t3.S[2]
	VMOV t0.S[0], t3.S[3]
	VREV32 t3.B16, t3.B16

	VEOR IV.B16, t3.B16, t3.B16

	VST1 [t3.S4], (dstPtr)

	B cbcSm4Done

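// Exactly two blocks: keep the first ciphertext block in t4 to chain the
// second output block; the first is chained with the IV.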
cbcSm4Single32:
	VLD1 (srcPtr), [t0.S4, t1.S4]
	VMOV t0.B16, t4.B16
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

cbc4BlocksLoop32:
		SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
		SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
		SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
		SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE cbc4BlocksLoop32

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16

	VEOR IV.B16, t0.B16, t0.B16
	VEOR t4.B16, t1.B16, t1.B16

	VST1 [t0.S4, t1.S4], (dstPtr)
	B cbcSm4Done

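// Exactly three blocks: keep the first two ciphertext blocks in t4/t5 to
// chain blocks 2 and 3; block 1 is chained with the IV.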
cbcSm4Single48:
	VLD1 (srcPtr), [t0.S4, t1.S4, t2.S4]
	VMOV t0.B16, t4.B16
	VMOV t1.B16, t5.B16
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

cbc4BlocksLoop48:
		SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
		SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
		SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
		SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE cbc4BlocksLoop48

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16

	VEOR IV.B16, t0.B16, t0.B16
	VEOR t4.B16, t1.B16, t1.B16
	VEOR t5.B16, t2.B16, t2.B16

	VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)

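// Write the saved last ciphertext block back through iv so the caller can
// continue the chain with the next call.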
cbcSm4Done:
	VST1 [V15.S4], (R6)
	RET