github.com/emmansun/gmsm@v0.29.1/sm4/asm_ppc64x.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

DATA ·rcon+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
DATA ·rcon+0x08(SB)/8, $0x0302010007060504
DATA ·rcon+0x10(SB)/8, $0x0001020310111213 // permute masks for matrix transpose
DATA ·rcon+0x18(SB)/8, $0x0405060714151617
DATA ·rcon+0x20(SB)/8, $0x08090a0b18191a1b
DATA ·rcon+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f
DATA ·rcon+0x30(SB)/8, $0x0001020304050607
DATA ·rcon+0x38(SB)/8, $0x1011121314151617
DATA ·rcon+0x40(SB)/8, $0x08090a0b0c0d0e0f
DATA ·rcon+0x48(SB)/8, $0x18191a1b1c1d1e1f
DATA ·rcon+0x50(SB)/8, $0x0c0d0e0f08090a0b // reverse words
DATA ·rcon+0x58(SB)/8, $0x0405060700010203
DATA ·rcon+0x60(SB)/8, $0x0F0F0F0F0F0F0F0F // nibble mask
DATA ·rcon+0x68(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA ·rcon+0x70(SB)/8, $0x000D0A0704010E0B // inverse shift rows
DATA ·rcon+0x78(SB)/8, $0x0805020F0C090603
DATA ·rcon+0x80(SB)/8, $0x691CA0D5B6C37F0A // affine transform matrix m1 low
DATA ·rcon+0x88(SB)/8, $0x53269AEF8CF94530
DATA ·rcon+0x90(SB)/8, $0x009837AF6CF45BC3 // affine transform matrix m1 high
DATA ·rcon+0x98(SB)/8, $0xAB339C04C75FF068
DATA ·rcon+0xa0(SB)/8, $0x616EF1FE050A959A // affine transform matrix m2 low
DATA ·rcon+0xa8(SB)/8, $0xF5FA656A919E010E
DATA ·rcon+0xb0(SB)/8, $0x00A4E044CD692D89 // affine transform matrix m2 high
DATA ·rcon+0xb8(SB)/8, $0xA50145E168CC882C
GLOBL ·rcon(SB), RODATA, $192
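
// Note: the nibble mask, inverse-shift-rows and m1/m2 constants above feed
// SM4_SBOX (defined in aesni_macros_ppc64x.s). In outline, the SM4 S-box is
// evaluated via the hardware AES S-box core: an affine transform by m1
// (applied per nibble under NIBBLE_MASK) maps the input into the AES field,
// the AES S-box is applied, and an affine transform by m2 maps the result
// back, with INVERSE_SHIFT_ROWS undoing the ShiftRows baked into the AES step.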

#define REVERSE_WORDS V19
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define V_FOUR V24
#define M0 V25
#define M1 V26
#define M2 V27
#define M3 V28
#define NIBBLE_MASK V29
#define INVERSE_SHIFT_ROWS V30
// For instruction emulation
#define ESPERMW  V31 // Endian swapping permute into BE

#define TMP0 V10
#define TMP1 V11
#define TMP2 V12
#define TMP3 V13

#include "aesni_macros_ppc64x.s"
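
// aesni_macros_ppc64x.s supplies the shared helpers used below: LOAD_CONSTS,
// SM4_SBOX, the endianness-aware vector load/store wrappers
// PPC64X_LXVW4X/PPC64X_STXVW4X (which permute through ESPERMW on
// little-endian targets, hence NEEDS_PERMW), the
// PRE_TRANSPOSE_MATRIX/TRANSPOSE_MATRIX macros, and the
// PROCESS_SINGLEBLOCK_4ROUND/PROCESS_4BLOCKS_4ROUND/PROCESS_8BLOCKS_4ROUND
// round macros.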

#define SM4_TAO_L2(x, y, z)         \
	SM4_SBOX(x, y, z);                      \
	;                                       \ //####################  4 parallel L2 linear transforms ##################//
	VSPLTISW $13, z;                        \
	VRLW	x, z, y;                        \ // y = x <<< 13
	VXOR x, y, x;                           \
	VSPLTISW $10, z;                        \
	VRLW y, z, y;                           \ // y = x <<< 23
	VXOR x, y, x
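
// SM4_TAO_L2 computes the key-schedule transform T'(x) = L'(Sbox(x)) with
// L'(B) = B ^ (B <<< 13) ^ (B <<< 23). After the first VXOR, y still holds
// B <<< 13, so rotating y by a further 10 bits yields B <<< 23 without
// recomputing from the original input.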

#define SM4_EXPANDKEY_ROUND(CK, x, y, z, t0, t1, t2, t3, target) \
	VXOR t1, CK, x;                      \
	VXOR t2, x, x;                       \
	VXOR t3, x, x;                       \
	SM4_TAO_L2(x, y, z);                 \
	VXOR x, t0, t0;                      \
	VSLDOI $4, target, t0, target
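
// One key-expansion round: K[i+4] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i]).
// Only word 0 of each vector is meaningful; t0..t3 are word-rotated copies of
// the key state, so word 0 of t1..t3 holds K[i+1..i+3]. The new key word lands
// in word 0 of t0, and the final VSLDOI shifts it into target, which collects
// four consecutive round keys across four invocations.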

// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	// prepare/load constants
	VSPLTISW $4, V_FOUR;
#ifdef NEEDS_PERMW
	MOVD	$·rcon(SB), R4
	LVX	(R4), ESPERMW
#endif
	MOVD	$·rcon+0x50(SB), R4
	LXVD2X (R4)(R0), REVERSE_WORDS
	MOVD $16, R3
	LXVD2X (R4)(R3), NIBBLE_MASK
	MOVD $48, R3
	LXVD2X (R4)(R3), M1L
	MOVD $64, R3
	LXVD2X (R4)(R3), M1H
	MOVD $80, R3
	LXVD2X (R4)(R3), M2L
	MOVD $96, R3
	LXVD2X (R4)(R3), M2H

	MOVD key+0(FP), R3
	MOVD ck+8(FP), R4
	MOVD enc+16(FP), R5
	MOVD dec+24(FP), R6

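	// Decryption round keys are the encryption round keys in reverse
	// order: start R6 at the last 16-byte group (7*16 = 112) and walk
	// backwards, reversing the word order within each group below.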
	ADD $112, R6

	// load fk
	MOVD $·fk+0(SB), R7
	LXVW4X (R7), V4

	// load key
	PPC64X_LXVW4X(R3, R0, V0)

	// xor key with fk
	VXOR V0, V4, V0
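	// Build word-rotated copies of the key state so that word 0 of
	// V0..V3 holds K0..K3 respectively, matching SM4_EXPANDKEY_ROUND's
	// t0..t3 convention.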
	VSLDOI $4, V0, V0, V1
	VSLDOI $4, V1, V1, V2
	VSLDOI $4, V2, V2, V3

	// prepare counter
	MOVD $8, R7
	MOVD R7, CTR

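// Each iteration consumes four CK words and emits four round keys: V5
// accumulates rk[i..i+3], stored forward to enc and, word-reversed, backward
// to dec. 8 iterations x 4 keys = all 32 round keys.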
ksLoop:
	LXVW4X (R4), V4
	SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V0, V1, V2, V3, V5)
	VSLDOI $4, V4, V4, V4 // rotate the next CK word into word 0
	SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V1, V2, V3, V0, V5)
	VSLDOI $4, V4, V4, V4
	SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V2, V3, V0, V1, V5)
	VSLDOI $4, V4, V4, V4
	SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V3, V0, V1, V2, V5)
	STXVW4X V5, (R5)
	VPERM V5, V5, REVERSE_WORDS, V5
	STXVW4X V5, (R6)

	ADD $16, R5
	ADD $16, R4
	ADD $-16, R6
	BDNZ	ksLoop

	RET

// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	// prepare/load constants
	VSPLTISW $4, V_FOUR;
#ifdef NEEDS_PERMW
	MOVD	$·rcon(SB), R4
	LVX	(R4), ESPERMW
#endif
	MOVD	$·rcon+0x50(SB), R4
	LXVD2X (R4)(R0), REVERSE_WORDS
	MOVD $16, R3
	LXVD2X (R4)(R3), NIBBLE_MASK
	MOVD $48, R3
	LXVD2X (R4)(R3), M1L
	MOVD $64, R3
	LXVD2X (R4)(R3), M1H
	MOVD $80, R3
	LXVD2X (R4)(R3), M2L
	MOVD $96, R3
	LXVD2X (R4)(R3), M2H

	MOVD xk+0(FP), R3
	MOVD dst+8(FP), R4
	MOVD src+16(FP), R5

	// load src, then build word-rotated copies so that word 0 of
	// V0..V3 holds state words X0..X3
	PPC64X_LXVW4X(R5, R0, V0)
	VSLDOI $4, V0, V0, V1
	VSLDOI $4, V1, V1, V2
	VSLDOI $4, V2, V2, V3

	// prepare counter
	MOVD $8, R7
	MOVD R7, CTR

encryptBlockLoop:
	// load xk
	LXVW4X (R3), V8
	PROCESS_SINGLEBLOCK_4ROUND
	ADD $16, R3
	BDNZ	encryptBlockLoop

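	// Gather word 0 of V3, V2, V1, V0 into V0 as (X35, X34, X33, X32):
	// SM4's final reverse transform R.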
	VSLDOI $4, V3, V3, V3
	VSLDOI $4, V3, V2, V2
	VSLDOI $4, V2, V1, V1
	VSLDOI $4, V1, V0, V0

	PPC64X_STXVW4X(V0, R4, R0)

	RET

// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
	// prepare/load constants
	VSPLTISW $4, V_FOUR;
#ifdef NEEDS_PERMW
	MOVD	$·rcon(SB), R4
	LVX	(R4), ESPERMW
#endif
	MOVD	$·rcon+0x10(SB), R4
	LOAD_CONSTS(R4, R3)

	MOVD xk+0(FP), R3
	MOVD dst+8(FP), R4
	MOVD src+32(FP), R5
	MOVD src_len+40(FP), R6

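	// Dispatch on batch size: exactly 128 bytes (8 blocks) takes the
	// 8-block path; anything else falls through to the 4-block path
	// (the Go caller is expected to pass only 64- or 128-byte batches).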
	CMP	R6, $128
	BEQ enc8blocks

enc4blocks:
	// prepare counter
	MOVD $8, R7
	MOVD R7, CTR

	MOVD $16, R7
	MOVD $32, R8
	MOVD $48, R9
	PPC64X_LXVW4X(R5, R0, V0)
	PPC64X_LXVW4X(R5, R7, V1)
	PPC64X_LXVW4X(R5, R8, V2)
	PPC64X_LXVW4X(R5, R9, V3)
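	// Transpose the four blocks so each of V0..V3 holds the same word
	// position from all four blocks; one round macro then advances four
	// blocks in parallel. TRANSPOSE_MATRIX undoes this before the store.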
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

enc4blocksLoop:
	// load xk
	LXVW4X (R3), V8
	PROCESS_4BLOCKS_4ROUND
	ADD $16, R3
	BDNZ	enc4blocksLoop

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PPC64X_STXVW4X(V0, R4, R0)
	PPC64X_STXVW4X(V1, R4, R7)
	PPC64X_STXVW4X(V2, R4, R8)
	PPC64X_STXVW4X(V3, R4, R9)
	RET

enc8blocks:
	// prepare counter
	MOVD $8, R7
	MOVD R7, CTR

	MOVD $16, R7
	MOVD $32, R8
	MOVD $48, R9
	MOVD $64, R10
	MOVD $80, R11
	MOVD $96, R12
	MOVD $112, R14
	PPC64X_LXVW4X(R5, R0, V0)
	PPC64X_LXVW4X(R5, R7, V1)
	PPC64X_LXVW4X(R5, R8, V2)
	PPC64X_LXVW4X(R5, R9, V3)
	PPC64X_LXVW4X(R5, R10, V4)
	PPC64X_LXVW4X(R5, R11, V5)
	PPC64X_LXVW4X(R5, R12, V6)
	PPC64X_LXVW4X(R5, R14, V7)
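	// Same transpose trick applied to two groups of four blocks;
	// PROCESS_8BLOCKS_4ROUND advances all eight blocks per iteration.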
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PRE_TRANSPOSE_MATRIX(V4, V5, V6, V7)

enc8blocksLoop:
	LXVW4X (R3), V8
	PROCESS_8BLOCKS_4ROUND
	ADD $16, R3
	BDNZ	enc8blocksLoop

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	TRANSPOSE_MATRIX(V4, V5, V6, V7)
	PPC64X_STXVW4X(V0, R4, R0)
	PPC64X_STXVW4X(V1, R4, R7)
	PPC64X_STXVW4X(V2, R4, R8)
	PPC64X_STXVW4X(V3, R4, R9)
	PPC64X_STXVW4X(V4, R4, R10)
	PPC64X_STXVW4X(V5, R4, R11)
	PPC64X_STXVW4X(V6, R4, R12)
	PPC64X_STXVW4X(V7, R4, R14)

	RET