github.com/emmansun/gmsm@v0.29.1/sm4/ecb_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  #include "aesni_macros_amd64.s"
     6  
     7  #define XDWTMP0 Y0 // 256-bit scratch registers passed to TRANSPOSE_MATRIX and the AVX2_SM4_* macros
     8  #define XDWTMP1 Y1
     9  
        // 256-bit data registers: the first four 32-byte loads of each AVX2 step.
    10  #define XDWORD0 Y4
    11  #define XDWORD1 Y5
    12  #define XDWORD2 Y6
    13  #define XDWORD3 Y7
    14  
        // 256-bit data registers: the second four 32-byte loads. In encryptSm4Ecb
        // they are referenced only by the 256-byte step (avx2_16blocks).
    15  #define XDWORD4 Y10
    16  #define XDWORD5 Y11
    17  #define XDWORD6 Y12
    18  #define XDWORD7 Y14
    19  
        // 128-bit scratch registers (same register numbers as XDWTMP0/XDWTMP1).
        // XWTMP2 is not referenced by encryptSm4Ecb. Note that it names X2, the
        // low half of the register used for BSWAP_MASK (Y2).
    20  #define XWTMP0 X0
    21  #define XWTMP1 X1
    22  #define XWTMP2 X2
    23  
        // 128-bit data registers: the X views of the registers named XDWORD0-3.
    24  #define XWORD0 X4
    25  #define XWORD1 X5
    26  #define XWORD2 X6
    27  #define XWORD3 X7
    28  
        // 128-bit data registers: the X views of the registers named XDWORD4-7.
    29  #define XWORD4 X10
    30  #define XWORD5 X11
    31  #define XWORD6 X12
    32  #define XWORD7 X14
    33  
        // Loaded from nibble_mask<> in avx2_start. Neither name is referenced again
        // in encryptSm4Ecb itself - presumably the included AVX2 macros use
        // NIBBLE_MASK by name (confirm in aesni_macros_amd64.s).
    34  #define NIBBLE_MASK Y3
    35  #define X_NIBBLE_MASK X3
    36  
        // Loaded from flip_mask<> in avx2_start and applied with VPSHUFB right after
        // each 32-byte load. X_BYTE_FLIP_MASK is not referenced in encryptSm4Ecb.
    37  #define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
    38  #define X_BYTE_FLIP_MASK 	X13 // mask to convert LE -> BE
    39  
        // Loaded from bswap_mask<> in avx2_start and applied with VPSHUFB just
        // before each 32-byte store.
    40  #define BSWAP_MASK Y2
    41  
        // Extra 256-bit registers handed to the AVX2_SM4_* macros as their 2nd and
        // 3rd arguments. What the macros do with them is not visible in this file.
    42  #define XDWORD Y8
    43  #define YDWORD Y9
    44  
        // 128-bit views of the two registers above. Passed as the 2nd/3rd arguments
        // of the SM4_* and AVX_SM4_* macros and as the 4th/5th arguments of the
        // AVX2_SM4_* macros.
    45  #define XWORD X8
    46  #define YWORD X9
    47  
    48  // func encryptSm4Ecb(xk *uint32, dst, src []byte)
        //
        // Walks src in 16-byte blocks, runs each group of blocks through one of the
        // SM4_* / AVX_SM4_* / AVX2_SM4_* macros with xk in AX, and stores the
        // resulting registers to dst at the same offsets. The macros,
        // TRANSPOSE_MATRIX and the nibble_mask/flip_mask/bswap_mask constants are
        // not defined in this file (presumably they come from the included
        // aesni_macros_amd64.s - confirm their register clobbers there).
        //
        // Register roles for the whole function:
        //   AX = xk, BX = dst cursor, DX = src cursor, DI = bytes of src left.
        //
        // One of three parallel code paths is selected once at entry:
        //   useAVX2 == 1 -> avx2_start       (256-, 128-, 64-byte steps, then tail)
        //   useAVX  == 1 -> avxEcbSm4Octets  (128-, 64-byte steps, then tail)
        //   otherwise    -> ecbSm4Octets     (same step sizes, SSE moves)
        //
        // The frame size is $0 and nothing is written to the stack here.
        //
        // NOTE(review): only len(src) is read. len(dst) is never checked, and the
        // tail code treats any non-zero remainder other than 32 or 48 as a single
        // 16-byte block. Presumably the Go caller guarantees len(dst) >= len(src)
        // and len(src) % 16 == 0 - verify against the caller.
    49  TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
    50  	MOVQ xk+0(FP), AX // AX = xk
    51  	MOVQ dst+8(FP), BX // BX = dst base pointer
    52  	MOVQ src+32(FP), DX // DX = src base pointer
    53  	MOVQ src_len+40(FP), DI // DI = len(src), counted down as input is consumed
    54  
        	// Dispatch on the package-level feature flags, AVX2 first.
    55  	CMPB ·useAVX2(SB), $1
    56  	JE   avx2_start
    57  
    58  	CMPB ·useAVX(SB), $1
    59  	JE   avxEcbSm4Octets
    60  
        // ---- SSE path -------------------------------------------------------
        // Loop: while at least 128 bytes remain, process eight 16-byte registers.
    61  ecbSm4Octets:
    62  	CMPQ DI, $128
    63  	JB ecbSm4Nibbles // unsigned compare: fewer than 128 bytes left
    64  	SUBQ $128, DI
    65  
    66  	MOVOU 0(DX), XWORD0
    67  	MOVOU 16(DX), XWORD1
    68  	MOVOU 32(DX), XWORD2
    69  	MOVOU 48(DX), XWORD3
    70  	MOVOU 64(DX), XWORD4
    71  	MOVOU 80(DX), XWORD5
    72  	MOVOU 96(DX), XWORD6
    73  	MOVOU 112(DX), XWORD7
    74  
        	// The macro works in place: results are read back from XWORD0-7 below.
    75  	SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
    76  	
    77  	MOVOU XWORD0, 0(BX)
    78  	MOVOU XWORD1, 16(BX)
    79  	MOVOU XWORD2, 32(BX)
    80  	MOVOU XWORD3, 48(BX)
    81  	MOVOU XWORD4, 64(BX)
    82  	MOVOU XWORD5, 80(BX)
    83  	MOVOU XWORD6, 96(BX)
    84  	MOVOU XWORD7, 112(BX)	
    85  
        	// Advance both cursors by the 128 bytes just handled.
    86  	LEAQ 128(BX), BX
    87  	LEAQ 128(DX), DX
    88  	JMP ecbSm4Octets
    89  
        // On entry DI < 128, so at most one 64-byte chunk is handled here. There is
        // no jump back to this label.
    90  ecbSm4Nibbles:
    91  	CMPQ DI, $64
    92  	JB ecbSm4Single
    93  	SUBQ $64, DI
    94  
    95  	MOVOU 0(DX), XWORD0
    96  	MOVOU 16(DX), XWORD1
    97  	MOVOU 32(DX), XWORD2
    98  	MOVOU 48(DX), XWORD3
    99  
   100  	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   101  
   102  	MOVUPS XWORD0, 0(BX)
   103  	MOVUPS XWORD1, 16(BX)
   104  	MOVUPS XWORD2, 32(BX)
   105  	MOVUPS XWORD3, 48(BX)
   106  
   107  	LEAQ 64(BX), BX
   108  	LEAQ 64(DX), DX
   109  
        // Tail, DI < 64 on entry. DI == 0 returns, DI == 32 and DI == 48 branch to
        // dedicated tails, and every other value falls through to the one-block
        // case below.
   110  ecbSm4Single:
   111  	TESTQ DI, DI
   112  	JE ecbSm4Done
   113  
   114  	MOVOU 0(DX), XWORD0 // first remaining block, needed by all three tails
   115  	CMPQ DI, $32
   116  	JEQ ecbSm4Single32
   117  	CMPQ DI, $48
   118  	JEQ ecbSm4Single48
        	// One block left: only the SSE path has a dedicated single-block macro.
   119  	SM4_SINGLE_BLOCK(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   120  	MOVUPS XWORD0, 0(BX)
   121  	JMP ecbSm4Done
   122  
        // Two blocks left. XWORD2 and XWORD3 are not reloaded, so the macro also
        // processes whatever they currently hold. Only XWORD0 and XWORD1 are stored.
   123  ecbSm4Single32:
   124  	MOVOU 16(DX), XWORD1
   125  	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   126  	MOVUPS XWORD0, 0(BX)
   127  	MOVUPS XWORD1, 16(BX)
   128  	JMP ecbSm4Done
   129  
        // Three blocks left. XWORD3 is not reloaded and its result is not stored.
   130  ecbSm4Single48:
   131  	MOVOU 16(DX), XWORD1
   132  	MOVOU 32(DX), XWORD2
   133  	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   134  	MOVUPS XWORD0, 0(BX)
   135  	MOVUPS XWORD1, 16(BX)
   136  	MOVUPS XWORD2, 32(BX)
   137  
   138  ecbSm4Done:
   139  	RET
   140  
        // ---- AVX path -------------------------------------------------------
        // Same structure as the SSE path, using VMOVDQU on X registers and the
        // AVX_SM4_* macros. Loop: 128 bytes per iteration.
   141  avxEcbSm4Octets:
   142  	CMPQ DI, $128
   143  	JB avxEcbSm4Nibbles
   144  	SUBQ $128, DI
   145  
   146  	VMOVDQU 0(DX), XWORD0
   147  	VMOVDQU 16(DX), XWORD1
   148  	VMOVDQU 32(DX), XWORD2
   149  	VMOVDQU 48(DX), XWORD3
   150  	VMOVDQU 64(DX), XWORD4
   151  	VMOVDQU 80(DX), XWORD5
   152  	VMOVDQU 96(DX), XWORD6
   153  	VMOVDQU 112(DX), XWORD7
   154  
   155  	AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
   156  
   157  	VMOVDQU XWORD0, 0(BX)
   158  	VMOVDQU XWORD1, 16(BX)
   159  	VMOVDQU XWORD2, 32(BX)
   160  	VMOVDQU XWORD3, 48(BX)
   161  	VMOVDQU XWORD4, 64(BX)
   162  	VMOVDQU XWORD5, 80(BX)
   163  	VMOVDQU XWORD6, 96(BX)
   164  	VMOVDQU XWORD7, 112(BX)
   165  
   166  	LEAQ 128(BX), BX
   167  	LEAQ 128(DX), DX
   168  	JMP avxEcbSm4Octets
   169  
        // On entry DI < 128: at most one 64-byte chunk, no loop.
   170  avxEcbSm4Nibbles:
   171  	CMPQ DI, $64
   172  	JB avxEcbSm4Single
   173  	SUBQ $64, DI
   174  
   175  	VMOVDQU 0(DX), XWORD0
   176  	VMOVDQU 16(DX), XWORD1
   177  	VMOVDQU 32(DX), XWORD2
   178  	VMOVDQU 48(DX), XWORD3
   179  
   180  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   181  
   182  	VMOVDQU XWORD0, 0(BX)
   183  	VMOVDQU XWORD1, 16(BX)
   184  	VMOVDQU XWORD2, 32(BX)
   185  	VMOVDQU XWORD3, 48(BX)
   186  
   187  	LEAQ 64(BX), BX
   188  	LEAQ 64(DX), DX
   189  
        // Tail, DI < 64 on entry, with the same 0 / 32 / 48 / other dispatch as the
        // SSE tail.
   190  avxEcbSm4Single:
   191  	TESTQ DI, DI
   192  	JE avxEcbSm4Done
   193  
   194  	VMOVDQU 0(DX), XWORD0
   195  	CMPQ DI, $32
   196  	JEQ avxEcbSm4Single32
   197  	CMPQ DI, $48
   198  	JEQ avxEcbSm4Single48
        	// One block left. Unlike the SSE path this still goes through the
        	// four-block macro, with XWORD1-3 holding leftover values. Only XWORD0
        	// is stored.
   199  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   200  	VMOVDQU XWORD0, 0(BX)
   201  	JMP avxEcbSm4Done
   202  
        // Two blocks left. XWORD2/XWORD3 are not reloaded and not stored.
   203  avxEcbSm4Single32:
   204  	VMOVDQU 16(DX), XWORD1
   205  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   206  	VMOVDQU XWORD0, 0(BX)
   207  	VMOVDQU XWORD1, 16(BX)
   208  	JMP avxEcbSm4Done
   209  
        // Three blocks left. XWORD3 is not reloaded and not stored.
   210  avxEcbSm4Single48:
   211  	VMOVDQU 16(DX), XWORD1
   212  	VMOVDQU 32(DX), XWORD2
   213  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   214  	VMOVDQU XWORD0, 0(BX)
   215  	VMOVDQU XWORD1, 16(BX)
   216  	VMOVDQU XWORD2, 32(BX)
   217  
        // No VZEROUPPER on this exit. The instructions visible on the AVX path name
        // only X registers (macro bodies are not visible here).
   218  avxEcbSm4Done:
   219  	RET
   220  
        // ---- AVX2 path ------------------------------------------------------
        // Broadcast the three 128-bit constants into both halves of their Y
        // registers once, before any data is processed. NIBBLE_MASK is not named
        // again in this function - presumably the AVX2_SM4_* macros use it
        // implicitly (confirm in the macro file).
   221  avx2_start:
   222  	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
   223  	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
   224  	VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
   225  
        // Loop: while at least 256 bytes remain, process eight 32-byte registers.
   226  avx2_16blocks:
   227  	CMPQ DI, $256
   228  	JB avx2EcbSm4Octets
   229  	SUBQ $256, DI
   230  
   231  	VMOVDQU 0(DX), XDWORD0
   232  	VMOVDQU 32(DX), XDWORD1
   233  	VMOVDQU 64(DX), XDWORD2
   234  	VMOVDQU 96(DX), XDWORD3
   235  	VMOVDQU 128(DX), XDWORD4
   236  	VMOVDQU 160(DX), XDWORD5
   237  	VMOVDQU 192(DX), XDWORD6
   238  	VMOVDQU 224(DX), XDWORD7
   239  
   240  	// Apply Byte Flip Mask: LE -> BE
   241  	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
   242  	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
   243  	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
   244  	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
   245  	VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
   246  	VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
   247  	VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
   248  	VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
   249  
   250  	// Transpose matrix 4 x 4 32bits word
   251  	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
   252  	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
   253  
        	// Both register groups go through one macro invocation, in place.
   254  	AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
   255  
   256  	// Transpose matrix 4 x 4 32bits word
   257  	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
   258  	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
   259  
        	// Shuffle the output bytes with BSWAP_MASK before storing. Note that
        	// the input side used BYTE_FLIP_MASK instead.
   260  	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
   261  	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
   262  	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
   263  	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
   264    	VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4
   265  	VPSHUFB BSWAP_MASK, XDWORD5, XDWORD5
   266  	VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6
   267  	VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7
   268  
   269  	VMOVDQU XDWORD0, 0(BX)
   270  	VMOVDQU XDWORD1, 32(BX)
   271  	VMOVDQU XDWORD2, 64(BX)
   272  	VMOVDQU XDWORD3, 96(BX)
   273  	VMOVDQU XDWORD4, 128(BX)
   274  	VMOVDQU XDWORD5, 160(BX)
   275  	VMOVDQU XDWORD6, 192(BX)
   276  	VMOVDQU XDWORD7, 224(BX)
   277  
   278  	LEAQ 256(BX), BX
   279  	LEAQ 256(DX), DX
   280  	JMP avx2_16blocks
   281  
        // On entry DI < 256, so although this section jumps back to its own label
        // its body runs at most once (128 bytes in four Y registers).
   282  avx2EcbSm4Octets:
   283  	CMPQ DI, $128
   284  	JB avx2EcbSm4Nibbles
   285  	SUBQ $128, DI
   286  
   287  	VMOVDQU 0(DX), XDWORD0
   288  	VMOVDQU 32(DX), XDWORD1
   289  	VMOVDQU 64(DX), XDWORD2
   290  	VMOVDQU 96(DX), XDWORD3
   291  
   292  	// Apply Byte Flip Mask: LE -> BE
   293  	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
   294  	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
   295  	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
   296  	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
   297  
   298  	// Transpose matrix 4 x 4 32bits word
   299  	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
   300  
        	// This macro is given a single temporary (XDWTMP0), unlike the
        	// 16-block variant which is given XDWTMP0 and XDWTMP1.
   301  	AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   302  
   303  	// Transpose matrix 4 x 4 32bits word
   304  	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
   305  
   306  	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
   307  	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
   308  	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
   309  	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
   310  
   311  	VMOVDQU XDWORD0, 0(BX)
   312  	VMOVDQU XDWORD1, 32(BX)
   313  	VMOVDQU XDWORD2, 64(BX)
   314  	VMOVDQU XDWORD3, 96(BX)
   315  
   316  	LEAQ 128(BX), BX
   317  	LEAQ 128(DX), DX
   318  	JMP avx2EcbSm4Octets
   319  
        // From here on the AVX2 path switches to 128-bit X registers and the
        // AVX_SM4_4BLOCKS macro, mirroring the AVX path. No explicit byte shuffles
        // or transposes are issued in this function for these steps.
        // On entry DI < 128: at most one 64-byte chunk, no loop.
   320  avx2EcbSm4Nibbles:
   321  	CMPQ DI, $64
   322  	JB avx2EcbSm4Single
   323  	SUBQ $64, DI
   324  
   325  	VMOVDQU 0(DX), XWORD0
   326  	VMOVDQU 16(DX), XWORD1
   327  	VMOVDQU 32(DX), XWORD2
   328  	VMOVDQU 48(DX), XWORD3
   329  
   330  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   331  
   332  	VMOVDQU XWORD0, 0(BX)
   333  	VMOVDQU XWORD1, 16(BX)
   334  	VMOVDQU XWORD2, 32(BX)
   335  	VMOVDQU XWORD3, 48(BX)
   336  
   337  	LEAQ 64(BX), BX
   338  	LEAQ 64(DX), DX
   339  
        // Tail, DI < 64 on entry, with the same 0 / 32 / 48 / other dispatch as the
        // other paths.
   340  avx2EcbSm4Single:
   341  	TESTQ DI, DI
   342  	JE avx2EcbSm4Done
   343  
   344  	VMOVDQU 0(DX), XWORD0
   345  	CMPQ DI, $32
   346  	JEQ avx2EcbSm4Single32
   347  	CMPQ DI, $48
   348  	JEQ avx2EcbSm4Single48
        	// One block left: four-block macro, only XWORD0 is stored.
   349  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   350  	VMOVDQU XWORD0, 0(BX)
   351  	JMP avx2EcbSm4Done
   352  
        // Two blocks left. XWORD2/XWORD3 are not reloaded and not stored.
   353  avx2EcbSm4Single32:
   354  	VMOVDQU 16(DX), XWORD1
   355  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   356  	VMOVDQU XWORD0, 0(BX)
   357  	VMOVDQU XWORD1, 16(BX)
   358  	JMP avx2EcbSm4Done
   359  
        // Three blocks left. XWORD3 is not reloaded and not stored.
   360  avx2EcbSm4Single48:
   361  	VMOVDQU 16(DX), XWORD1
   362  	VMOVDQU 32(DX), XWORD2
   363  	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
   364  	VMOVDQU XWORD0, 0(BX)
   365  	VMOVDQU XWORD1, 16(BX)
   366  	VMOVDQU XWORD2, 32(BX)
   367  
        // Every exit of the AVX2 path lands here. The upper halves of the Y
        // registers are cleared before returning to the caller.
   368  avx2EcbSm4Done:	
   369  	VZEROUPPER
   370  	RET