github.com/emmansun/gmsm@v0.29.1/sm4/cbc_ppc64x.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

#define REVERSE_WORDS V19
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define V_FOUR V24
#define M0 V25
#define M1 V26
#define M2 V27
#define M3 V28
#define NIBBLE_MASK V29
#define INVERSE_SHIFT_ROWS V30
// For instruction emulation
#define ESPERMW  V31 // Endian swapping permute into BE

#define TMP0 V10
#define TMP1 V11
#define TMP2 V12
#define TMP3 V13
#define IV V18

#include "aesni_macros_ppc64x.s"

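// On ppc64le (where NEEDS_PERMW is defined), the PPC64X_LXVW4X loads
// permute input into big-endian word order for the cipher rounds.
// REVERSE32LE_8BLOCKS permutes the eight decrypted state vectors back
// to memory byte order, so the plain LXVW4X/STXVW4X used below for the
// ciphertext XOR and stores need no further byte swapping. On ppc64
// the macro expands to nothing.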
#ifdef NEEDS_PERMW
#define REVERSE32LE_8BLOCKS \
	VPERM	V0, V0, ESPERMW, V0 \
	VPERM	V1, V1, ESPERMW, V1 \
	VPERM	V2, V2, ESPERMW, V2 \
	VPERM	V3, V3, ESPERMW, V3 \
	VPERM	V4, V4, ESPERMW, V4 \
	VPERM	V5, V5, ESPERMW, V5 \
	VPERM	V6, V6, ESPERMW, V6 \
	VPERM	V7, V7, ESPERMW, V7
#else
#define REVERSE32LE_8BLOCKS
#endif

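// CBC decryption proceeds back to front: blocks are processed from the
// tail of src toward the head, so each chunk can be XORed with the
// ciphertext block that precedes it while the blocks ahead of the
// current chunk are still untouched ciphertext (allowing dst == src).
// The last ciphertext block is saved in V14 up front and written back
// through the iv pointer once all blocks have been decrypted.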
// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
#define dstPtr R3
#define srcPtr R4
#define rk R5
#define srcLen R6
	// prepare/load constants
	VSPLTISW $4, V_FOUR
#ifdef NEEDS_PERMW
	MOVD	$·rcon(SB), R4
	LVX	(R4), ESPERMW
#endif
	MOVD	$·rcon+0x10(SB), R4
	LOAD_CONSTS(R4, R3)

	// Load IV
	MOVD iv+56(FP), R7
	PPC64X_LXVW4X(R7, R0, IV)

	MOVD xk+0(FP), rk
	MOVD dst+8(FP), dstPtr
	MOVD src+32(FP), srcPtr
	MOVD src_len+40(FP), srcLen

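	// Fixed byte offsets used as index registers for the vector
	// loads/stores below (R0 serves as the zero offset).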
	MOVD $16, R7
	MOVD $32, R8
	MOVD $48, R9
	MOVD $64, R10
	MOVD $80, R11
	MOVD $96, R12
	MOVD $112, R14

	ADD srcPtr, srcLen, R15
	ADD $-16, R15, R15
	LXVD2X (R15)(R0), V14 // Load last 16 bytes of src into V14

	CMP srcLen, $144 // 9 blocks
	BLT lessThan9blocks

	PCALIGN	$16
loop8blocks:
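	// Decrypt the trailing eight blocks of the remaining input.
	// R15 points at the first of the eight ciphertext blocks, R16 one
	// block before it (for the CBC XOR), and R17 at the matching
	// position in dst.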
	ADD	$-128, srcLen
	ADD srcPtr, srcLen, R15
	ADD $-16, R15, R16
	ADD dstPtr, srcLen, R17
	PPC64X_LXVW4X(R15, R0, V0)
	PPC64X_LXVW4X(R15, R7, V1)
	PPC64X_LXVW4X(R15, R8, V2)
	PPC64X_LXVW4X(R15, R9, V3)
	PPC64X_LXVW4X(R15, R10, V4)
	PPC64X_LXVW4X(R15, R11, V5)
	PPC64X_LXVW4X(R15, R12, V6)
	PPC64X_LXVW4X(R15, R14, V7)
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PRE_TRANSPOSE_MATRIX(V4, V5, V6, V7)

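	// 32 SM4 rounds: each LXVW4X pulls four round keys into V8 and
	// each PROCESS_8BLOCKS_4ROUND consumes them, 8 x 4 = 32.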
	LXVW4X (rk)(R0), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_8BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	TRANSPOSE_MATRIX(V4, V5, V6, V7)

	REVERSE32LE_8BLOCKS // for ppc64le

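	// CBC chaining: XOR each decrypted block with the ciphertext
	// block that precedes it in src, loaded raw through R16 (one
	// block behind the chunk).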
	LXVW4X (R16)(R0), TMP0
	LXVW4X (R16)(R7), TMP1
	LXVW4X (R16)(R8), TMP2
	LXVW4X (R16)(R9), TMP3
	VXOR V0, TMP0, V0
	VXOR V1, TMP1, V1
	VXOR V2, TMP2, V2
	VXOR V3, TMP3, V3
	LXVW4X (R16)(R10), TMP0
	LXVW4X (R16)(R11), TMP1
	LXVW4X (R16)(R12), TMP2
	LXVW4X (R16)(R14), TMP3
	VXOR V4, TMP0, V4
	VXOR V5, TMP1, V5
	VXOR V6, TMP2, V6
	VXOR V7, TMP3, V7
	STXVW4X V0, (R17)(R0)
	STXVW4X V1, (R17)(R7)
	STXVW4X V2, (R17)(R8)
	STXVW4X V3, (R17)(R9)
	STXVW4X V4, (R17)(R10)
	STXVW4X V5, (R17)(R11)
	STXVW4X V6, (R17)(R12)
	STXVW4X V7, (R17)(R14)

	CMP srcLen, $144 // 9 blocks
	BGE loop8blocks

lessThan9blocks:
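	// At most eight blocks remain. If more than four, peel four off
	// the tail here; the ciphertext copies kept in V5-V7 plus the
	// block loaded through R16 supply the chaining values.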
	CMP srcLen, $64
	BLE ble4blocks

	ADD	$-64, srcLen
	ADD srcPtr, srcLen, R15
	ADD $-16, R15, R16
	ADD dstPtr, srcLen, R17
	PPC64X_LXVW4X(R15, R0, V0)
	PPC64X_LXVW4X(R15, R7, V1)
	PPC64X_LXVW4X(R15, R8, V2)
	PPC64X_LXVW4X(R15, R9, V3)
	VOR V0, V0, V5
	VOR V1, V1, V6
	VOR V2, V2, V7

	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X (rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PPC64X_LXVW4X(R16, R0, V4)
	VXOR V0, V4, V0
	VXOR V1, V5, V1
	VXOR V2, V6, V2
	VXOR V3, V7, V3
	PPC64X_STXVW4X(V0, R17, R0)
	PPC64X_STXVW4X(V1, R17, R7)
	PPC64X_STXVW4X(V2, R17, R8)
	PPC64X_STXVW4X(V3, R17, R9)

ble4blocks:
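	// Dispatch on the exact number of blocks left (1, 2, 3, or 4);
	// the 4-block case falls through and chains against IV.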
	CMPU srcLen, $48, CR1
	CMPU srcLen, $32, CR2
	CMPU srcLen, $16, CR3
	BEQ CR1, eq3blocks
	BEQ CR2, eq2blocks
	BEQ CR3, eq1block

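	// Exactly four blocks: the first chains against IV, the rest
	// against the ciphertext copies saved in V4-V6.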
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PPC64X_LXVW4X(srcPtr, R8, V2)
	PPC64X_LXVW4X(srcPtr, R9, V3)
	VOR V0, V0, V4
	VOR V1, V1, V5
	VOR V2, V2, V6
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X (rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR V0, IV, V0
	VXOR V1, V4, V1
	VXOR V2, V5, V2
	VXOR V3, V6, V3
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	PPC64X_STXVW4X(V2, dstPtr, R8)
	PPC64X_STXVW4X(V3, dstPtr, R9)
	BR done

eq3blocks:
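	// Three blocks: V3 is never loaded, so the fourth lane of the
	// 4-block pipeline runs on don't-care data and is never stored.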
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PPC64X_LXVW4X(srcPtr, R8, V2)
	VOR V0, V0, V4
	VOR V1, V1, V5
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X (rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR V0, IV, V0
	VXOR V1, V4, V1
	VXOR V2, V5, V2
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	PPC64X_STXVW4X(V2, dstPtr, R8)
	BR done

eq2blocks:
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	VOR V0, V0, V4
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	LXVW4X (rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR V0, IV, V0
	VXOR V1, V4, V1
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	BR done

eq1block:
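	// Single block: the VSLDOI rotations spread the four 32-bit words
	// of the block across V0-V3 for the single-block round macro; the
	// second VSLDOI sequence below gathers them back in output word
	// order.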
	PPC64X_LXVW4X(srcPtr, R0, V0)
	VSLDOI $4, V0, V0, V1
	VSLDOI $4, V1, V1, V2
	VSLDOI $4, V2, V2, V3
	LXVW4X (rk)(R0), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_SINGLEBLOCK_4ROUND
	VSLDOI $4, V3, V3, V3
	VSLDOI $4, V3, V2, V2
	VSLDOI $4, V2, V1, V1
	VSLDOI $4, V1, V0, V0
	VXOR V0, IV, V0
	PPC64X_STXVW4X(V0, dstPtr, R0)

done:
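	// Write the saved last ciphertext block back through iv as the
	// chaining value for the next call.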
	MOVD iv+56(FP), R7
	STXVD2X V14, (R7)(R0)
	RET