github.com/emmansun/gmsm@v0.29.1/sm4/asm_amd64.s

// This SM4 implementation references https://github.com/mjosaarinen/sm4ni/blob/master/sm4ni.c
//go:build !purego

#include "textflag.h"

#define t0 X0
#define t1 X1
#define t2 X2
#define t3 X3

#define x X8
#define y X9
#define XTMP6 X10
#define XTMP7 X11

#include "aesni_macros_amd64.s"
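// The S-box, round and transpose macros used below (SM4_SBOX, SM4_SINGLE_ROUND,
// SM4_4BLOCKS, SM4_8BLOCKS, the AVX/AVX2 variants, TRANSPOSE_MATRIX) and the
// flip_mask/fk_mask/nibble_mask/bswap_mask tables are expected to be provided by
// aesni_macros_amd64.s; they are not defined in this file.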

// SM4 TAO L2 function, used for key expansion
// parameters:
// -  x: 128-bit register as TAO_L2 input/output data
// -  y: 128-bit temp register
// -  tmp1: 128-bit temp register
// -  tmp2: 128-bit temp register
#define SM4_TAO_L2(x, y, tmp1, tmp2)    \
	SM4_SBOX(x, y, tmp1);              \
	;                                  \ //####################  4 parallel L2 linear transforms ##################//
	MOVOU x, y;                        \
	MOVOU x, tmp1;                     \
	PSLLL $13, tmp1;                   \
	PSRLL $19, y;                      \
	POR tmp1, y;                       \ // y = x <<< 13
	PSLLL $10, tmp1;                   \
	MOVOU x, tmp2;                     \
	PSRLL $9, tmp2;                    \
	POR tmp1, tmp2;                    \ // tmp2 = x <<< 23
	PXOR tmp2, y;                      \
	PXOR y, x
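
// Net effect of SM4_TAO_L2: x = L'(Sbox(x)), the SM4 key-schedule transform T',
// where L'(B) = B ^ (B <<< 13) ^ (B <<< 23). A plain-Go reference sketch of the
// linear part, for documentation only (not assembled):
//
//	func keyScheduleL(b uint32) uint32 {
//		return b ^ bits.RotateLeft32(b, 13) ^ bits.RotateLeft32(b, 23)
//	}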

// SM4 key expansion round function
// t0 ^= tao_l2(t1^t2^t3^ck), storing t0.S[0] to enc/dec
// parameters:
// - index: immediate round key index
// -  x: 128-bit temp register
// -  y: 128-bit temp register
// - t0: 128-bit register for data
// - t1: 128-bit register for data
// - t2: 128-bit register for data
// - t3: 128-bit register for data
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
	MOVL (index * 4)(BX)(CX*1), x;                         \
	PXOR t1, x;                                            \
	PXOR t2, x;                                            \
	PXOR t3, x;                                            \
	SM4_TAO_L2(x, y, XTMP6, XTMP7);                        \
	PXOR x, t0;                                            \
	MOVL t0, R8;                                           \ // _mm_cvtsi128_si32
	MOVL R8, (index * 4)(DX)(CX*1);                        \
	MOVL R8, (12 - index * 4)(DI)(SI*1)
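
// Each expansion round computes one round key: K[i+4] = K[i] ^ T'(K[i+1]^K[i+2]^K[i+3]^CK[i]),
// with rk[i] = K[i+4]. The low 32 bits of t0 are written forward into enc (DX, indexed by CX
// counting up) and into the mirrored slot of dec (DI, indexed by SI counting down from 112),
// so the decryption keys end up in reverse order. A hedged Go sketch of one round, with
// illustrative names only:
//
//	func expandRound(k *[4]uint32, ck uint32, enc, dec []uint32, i int) {
//		rk := k[0] ^ tTick(k[1]^k[2]^k[3]^ck) // tTick: Sbox then L', as in SM4_TAO_L2
//		k[0], k[1], k[2], k[3] = k[1], k[2], k[3], rk
//		enc[i], dec[31-i] = rk, rk
//	}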

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XDWORD4 Y10
#define XDWORD5 Y11
#define XDWORD6 Y12
#define XDWORD7 Y14

#define XWORD4 X10
#define XWORD5 X11
#define XWORD6 X12
#define XWORD7 X14

#define XDWTMP0 Y0
#define XDWTMP1 Y1
#define XDWTMP2 Y2

#define XWTMP0 X0
#define XWTMP1 X1
#define XWTMP2 X2

#define NIBBLE_MASK Y3
#define X_NIBBLE_MASK X3

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE

#define XDWORD Y8
#define YDWORD Y9

#define XWORD X8
#define YWORD X9

// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	MOVQ key+0(FP), AX
	MOVQ ck+8(FP), BX
	MOVQ enc+16(FP), DX
	MOVQ dec+24(FP), DI

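	// Load the 128-bit key, byte-swap each word (LE -> BE, as elsewhere in this file),
	// XOR in the SM4 FK constants (fk_mask), and split the four words K0..K3 across
	// t0..t3; only the low 32 bits of each register are used by the rounds below.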
	MOVUPS 0(AX), t0
	PSHUFB flip_mask<>(SB), t0
	PXOR fk_mask<>(SB), t0
	PSHUFD $1, t0, t1
	PSHUFD $2, t0, t2
	PSHUFD $3, t0, t3

	XORL CX, CX
	MOVL $112, SI

loop:
		SM4_EXPANDKEY_ROUND(0, x, y, t0, t1, t2, t3)
		SM4_EXPANDKEY_ROUND(1, x, y, t1, t2, t3, t0)
		SM4_EXPANDKEY_ROUND(2, x, y, t2, t3, t0, t1)
		SM4_EXPANDKEY_ROUND(3, x, y, t3, t0, t1, t2)

		ADDL $16, CX
		SUBL $16, SI
		CMPL CX, $4*32
		JB loop

expand_end:
	RET
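
// Hedged Go-side usage sketch (the 32-entry sizes follow from the loop above, which
// writes 4*32 bytes of keys; the real caller lives in this package's Go sources and
// may differ, and the inst value shown is illustrative only):
//
//	var enc, dec [32]uint32
//	expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0], 0)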

// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+32(FP), DX
	MOVQ src_len+40(FP), DI

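	// Dispatch on the detected CPU features: the AVX2 path handles 8 or 16 blocks per
	// call, the AVX path 4 or 8 blocks, and the SSE fallback below likewise 4 or 8.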
	CMPB ·useAVX2(SB), $1
	JE   avx2

	CMPB ·useAVX(SB), $1
	JE   avx

non_avx2_start:
	CMPQ DI, $128
	JEQ sse_8blocks

	MOVOU 0(DX), XWORD0
	MOVOU 16(DX), XWORD1
	MOVOU 32(DX), XWORD2
	MOVOU 48(DX), XWORD3

	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	MOVOU XWORD0, 0(BX)
	MOVOU XWORD1, 16(BX)
	MOVOU XWORD2, 32(BX)
	MOVOU XWORD3, 48(BX)

	RET

sse_8blocks:
	MOVOU 0(DX), XWORD0
	MOVOU 16(DX), XWORD1
	MOVOU 32(DX), XWORD2
	MOVOU 48(DX), XWORD3
	MOVOU 64(DX), XWORD4
	MOVOU 80(DX), XWORD5
	MOVOU 96(DX), XWORD6
	MOVOU 112(DX), XWORD7

	SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

	MOVOU XWORD0, 0(BX)
	MOVOU XWORD1, 16(BX)
	MOVOU XWORD2, 32(BX)
	MOVOU XWORD3, 48(BX)
	MOVOU XWORD4, 64(BX)
	MOVOU XWORD5, 80(BX)
	MOVOU XWORD6, 96(BX)
	MOVOU XWORD7, 112(BX)
done_sm4:
	RET

avx:
	CMPQ DI, $128
	JEQ avx_8blocks

	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)

	RET

avx_8blocks:
	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3
	VMOVDQU 64(DX), XWORD4
	VMOVDQU 80(DX), XWORD5
	VMOVDQU 96(DX), XWORD6
	VMOVDQU 112(DX), XWORD7

	AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)
	VMOVDQU XWORD4, 64(BX)
	VMOVDQU XWORD5, 80(BX)
	VMOVDQU XWORD6, 96(BX)
	VMOVDQU XWORD7, 112(BX)

avx_done_sm4:
	RET

avx2:
	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK

	CMPQ DI, $256
	JEQ avx2_16blocks

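// In the AVX2 paths each YMM register initially holds two 16-byte blocks. After the
// byte flip and the 4x4 word transposes, register i holds word i of every block in
// the group, so one SM4 round advances all blocks in parallel.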
avx2_8blocks:
	VMOVDQU 0(DX), XDWORD0
	VMOVDQU 32(DX), XDWORD1
	VMOVDQU 64(DX), XDWORD2
	VMOVDQU 96(DX), XDWORD3
	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	// Transpose the 4x4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

	AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Transpose the 4x4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

	VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	VMOVDQU XDWORD0, 0(BX)
	VMOVDQU XDWORD1, 32(BX)
	VMOVDQU XDWORD2, 64(BX)
	VMOVDQU XDWORD3, 96(BX)

	VZEROUPPER
	RET

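// Same register layout as the 8-block path above, but two groups of four YMM
// registers cover sixteen blocks per call.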
avx2_16blocks:
	VMOVDQU 0(DX), XDWORD0
	VMOVDQU 32(DX), XDWORD1
	VMOVDQU 64(DX), XDWORD2
	VMOVDQU 96(DX), XDWORD3
	VMOVDQU 128(DX), XDWORD4
	VMOVDQU 160(DX), XDWORD5
	VMOVDQU 192(DX), XDWORD6
	VMOVDQU 224(DX), XDWORD7

	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
	VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
	VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
	VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
	VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7

	// Transpose the 4x4 matrices of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)

	AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)

	// Transpose the 4x4 matrices of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)

	VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
	VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
	VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
	VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
	VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7

	VMOVDQU XDWORD0, 0(BX)
	VMOVDQU XDWORD1, 32(BX)
	VMOVDQU XDWORD2, 64(BX)
	VMOVDQU XDWORD3, 96(BX)
	VMOVDQU XDWORD4, 128(BX)
	VMOVDQU XDWORD5, 160(BX)
	VMOVDQU XDWORD6, 192(BX)
	VMOVDQU XDWORD7, 224(BX)

avx2_sm4_done:
	VZEROUPPER
	RET

// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
// Requires: SSSE3
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+16(FP), DX

	MOVUPS (DX), t0
	PSHUFB flip_mask<>(SB), t0
	PSHUFD $1, t0, t1
	PSHUFD $2, t0, t2
	PSHUFD $3, t0, t3

	XORL CX, CX

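	// Each loop iteration loads four round keys from xk, selects them in turn with
	// PSHUFD, and runs four SM4 rounds: X[i+4] = X[i] ^ T(X[i+1]^X[i+2]^X[i+3]^rk[i]),
	// where T is the Sbox followed by L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24).
	// A hedged plain-Go sketch of one round, for documentation only (tEnc is an
	// illustrative name for the Sbox-plus-L transform):
	//
	//	func encRound(x *[4]uint32, rk uint32) {
	//		nx := x[0] ^ tEnc(x[1]^x[2]^x[3]^rk)
	//		x[0], x[1], x[2], x[3] = x[1], x[2], x[3], nx
	//	}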
loop:
		MOVUPS (AX)(CX*1), XTMP7
		MOVOU XTMP7, x
		SM4_SINGLE_ROUND(x, y, XTMP6, t0, t1, t2, t3)
		PSHUFD $1, XTMP7, x
		SM4_SINGLE_ROUND(x, y, XTMP6, t1, t2, t3, t0)
		PSHUFD $2, XTMP7, x
		SM4_SINGLE_ROUND(x, y, XTMP6, t2, t3, t0, t1)
		PSHUFD $3, XTMP7, x
		SM4_SINGLE_ROUND(x, y, XTMP6, t3, t0, t1, t2)

		ADDL $16, CX
		CMPL CX, $4*32
		JB loop

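	// Recombine the low words of t3..t0 into one block (the reverse transform R of the
	// SM4 spec: the output word order is X35, X34, X33, X32) and byte-swap each word
	// back before storing the ciphertext block.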
	PUNPCKLLQ t2, t3
	PUNPCKLLQ t0, t1
	PUNPCKLQDQ t1, t3
	PSHUFB flip_mask<>(SB), t3
	MOVUPS t3, (BX)

done_sm4:
	RET