github.com/emmansun/gmsm@v0.29.1/sm4/ecb_arm64.s

//go:build !purego

#include "textflag.h"

#define x V0
#define y V1
#define t0 V2
#define t1 V3
#define t2 V4
#define t3 V5
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define FK_MASK V27
#define XTMP6 V6
#define XTMP7 V7
#define t4 V10
#define t5 V11
#define t6 V12
#define t7 V13

#include "aesni_macros_arm64.s"

// func encryptSm4Ecb(xk *uint32, dst, src []byte)
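// Encrypts src into dst in ECB mode using the expanded round keys xk.
// The input is consumed in chunks: 8 blocks (128 bytes) at a time, then
// 4 blocks (64 bytes), then whatever remains (1-3 blocks).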
TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
#define dstPtr R1
#define srcPtr R2
#define rk R3
#define rkSave R4
#define srcPtrLen R5
	LOAD_SM4_AESNI_CONSTS()
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD xk+0(FP), rk
	MOVD dst+8(FP), dstPtr
	MOVD src+32(FP), srcPtr
	MOVD src_len+40(FP), srcPtrLen
	MOVD rk, rkSave

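// Eight-block path: load 128 bytes, byte-swap each 32-bit word into the
// big-endian order SM4 expects, then transpose the two 4-block groups into
// column-major form so the round macros work on eight blocks in parallel.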
ecbSm4Octets:
	CMP	$128, srcPtrLen
	BLT	ecbSm4Nibbles
	SUB	$128, srcPtrLen
	MOVD rkSave, rk

	VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
	VLD1.P 64(srcPtr), [t4.S4, t5.S4, t6.S4, t7.S4]
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	VREV32 t4.B16, t4.B16
	VREV32 t5.B16, t5.B16
	VREV32 t6.B16, t6.B16
	VREV32 t7.B16, t7.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)

	EOR R0, R0

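// Each SM4_8BLOCKS_ROUND applies one round to all eight states, rotating the
// register roles. R0 advances by 16 per pass and the loop stops at 128, i.e.
// 8 passes of 4 rounds each = 32 rounds.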
ecb8BlocksLoop:
		SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
		SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
		SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
		SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)

		ADD $16, R0
		CMP $128, R0
		BNE ecb8BlocksLoop

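	// Transpose back to row-major block layout, undo the byte swap, and store
	// 128 bytes of ciphertext.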
	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	VREV32 t4.B16, t4.B16
	VREV32 t5.B16, t5.B16
	VREV32 t6.B16, t6.B16
	VREV32 t7.B16, t7.B16

	VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)
	VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(dstPtr)

	B ecbSm4Octets

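// Four-block path: same pattern as above, on a single 64-byte group.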
ecbSm4Nibbles:
	CMP	$64, srcPtrLen
	BLT	ecbSm4Single
	SUB	$64, srcPtrLen
	MOVD rkSave, rk

	VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

	EOR R0, R0

ecb4BlocksLoop:
		SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
		SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
		SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
		SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE ecb4BlocksLoop

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VREV32 t3.B16, t3.B16
	VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)

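// Tail path: handle the remaining 16, 32, or 48 bytes (1-3 blocks).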
ecbSm4Single:
	CBZ	srcPtrLen, ecbSm4Done
	MOVD rkSave, rk
	EOR R0, R0

	CMP $32, srcPtrLen
	BEQ ecbSm4Single32

	CMP $48, srcPtrLen
	BEQ ecbSm4Single48

ecbSm4Single16:
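	// One block: place its four words in lane 0 of t0..t3 so the 4-way round
	// macro can be reused; only lane 0 of each register carries real data.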
	VLD1.P 16(srcPtr), [t0.S4]
	VREV32 t0.B16, t0.B16
	VMOV t0.S[1], t1.S[0]
	VMOV t0.S[2], t2.S[0]
	VMOV t0.S[3], t3.S[0]

encryptBlocksLoop1:
		SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
		SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
		SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
		SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE encryptBlocksLoop1

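	// Gather the four result words into t3 in ciphertext word order, restore
	// byte order, and store the single output block.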
	VMOV t2.S[0], t3.S[1]
	VMOV t1.S[0], t3.S[2]
	VMOV t0.S[0], t3.S[3]
	VREV32 t3.B16, t3.B16
	VST1.P [t3.S4], 16(dstPtr)

	B ecbSm4Done

ecbSm4Single32:
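	// Two blocks: after the transpose only lanes 0-1 of each state register
	// hold real data; the remaining lanes are don't-cares and are not stored.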
	VLD1.P 32(srcPtr), [t0.S4, t1.S4]
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

encryptBlocksLoop2:
		SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
		SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
		SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
		SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE encryptBlocksLoop2

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VST1.P [t0.S4, t1.S4], 32(dstPtr)

	B ecbSm4Done

ecbSm4Single48:
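	// Three blocks: same as above with lanes 0-2 live; only 48 bytes are stored.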
	VLD1.P 48(srcPtr), [t0.S4, t1.S4, t2.S4]
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

encryptBlocksLoop3:
		SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
		SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
		SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
		SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

		ADD $16, R0
		CMP $128, R0
		BNE encryptBlocksLoop3

	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
	VREV32 t0.B16, t0.B16
	VREV32 t1.B16, t1.B16
	VREV32 t2.B16, t2.B16
	VST1.P [t0.S4, t1.S4, t2.S4], 48(dstPtr)
ecbSm4Done:
	RET
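
// Note: a Go assembly body like the one above needs a matching declaration in
// a Go source file of the same package. A minimal sketch of that counterpart
// is shown below; the file placement and the //go:noescape directive are
// assumptions, not taken from this file.
//
//	//go:noescape
//	func encryptSm4Ecb(xk *uint32, dst, src []byte)
//
// The frame offsets used above (xk+0(FP), dst+8(FP), src+32(FP), src_len+40(FP))
// are consistent with that signature: one pointer followed by two slice headers.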