github.com/emmansun/gmsm@v0.29.1/sm4/asm_ppc64x.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

DATA ·rcon+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
DATA ·rcon+0x08(SB)/8, $0x0302010007060504
DATA ·rcon+0x10(SB)/8, $0x0001020310111213 // Permute for transpose matrix
DATA ·rcon+0x18(SB)/8, $0x0405060714151617
DATA ·rcon+0x20(SB)/8, $0x08090a0b18191a1b
DATA ·rcon+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f
DATA ·rcon+0x30(SB)/8, $0x0001020304050607
DATA ·rcon+0x38(SB)/8, $0x1011121314151617
DATA ·rcon+0x40(SB)/8, $0x08090a0b0c0d0e0f
DATA ·rcon+0x48(SB)/8, $0x18191a1b1c1d1e1f
DATA ·rcon+0x50(SB)/8, $0x0c0d0e0f08090a0b // reverse words
DATA ·rcon+0x58(SB)/8, $0x0405060700010203
DATA ·rcon+0x60(SB)/8, $0x0F0F0F0F0F0F0F0F // nibble mask
DATA ·rcon+0x68(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA ·rcon+0x70(SB)/8, $0x000D0A0704010E0B // inverse shift rows
DATA ·rcon+0x78(SB)/8, $0x0805020F0C090603
DATA ·rcon+0x80(SB)/8, $0x691CA0D5B6C37F0A // affine transform matrix m1 low
DATA ·rcon+0x88(SB)/8, $0x53269AEF8CF94530
DATA ·rcon+0x90(SB)/8, $0x009837AF6CF45BC3 // affine transform matrix m1 high
DATA ·rcon+0x98(SB)/8, $0xAB339C04C75FF068
DATA ·rcon+0xa0(SB)/8, $0x616EF1FE050A959A // affine transform matrix m2 low
DATA ·rcon+0xa8(SB)/8, $0xF5FA656A919E010E
DATA ·rcon+0xb0(SB)/8, $0x00A4E044CD692D89 // affine transform matrix m2 high
DATA ·rcon+0xb8(SB)/8, $0xA50145E168CC882C
GLOBL ·rcon(SB), RODATA, $192

#define REVERSE_WORDS V19
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define V_FOUR V24
#define M0 V25
#define M1 V26
#define M2 V27
#define M3 V28
#define NIBBLE_MASK V29
#define INVERSE_SHIFT_ROWS V30
// For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE

#define TMP0 V10
#define TMP1 V11
#define TMP2 V12
#define TMP3 V13

#include "aesni_macros_ppc64x.s"

#define SM4_TAO_L2(x, y, z) \
	SM4_SBOX(x, y, z); \
	; \ //#################### 4 parallel L2 linear transforms ##################//
	VSPLTISW $13, z; \
	VRLW x, z, y; \ // y = x <<< 13
	VXOR x, y, x; \
	VSPLTISW $10, z; \
	VRLW y, z, y; \ // y = x <<< 23
	VXOR x, y, x

#define SM4_EXPANDKEY_ROUND(CK, x, y, z, t0, t1, t2, t3, target) \
	VXOR t1, CK, x; \
	VXOR t2, x, x; \
	VXOR t3, x, x; \
	SM4_TAO_L2(x, y, z); \
	VXOR x, t0, t0; \
	VSLDOI $4, target, t0, target

// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	// prepare/load constants
	VSPLTISW $4, V_FOUR;
#ifdef NEEDS_PERMW
	MOVD $·rcon(SB), R4
	LVX (R4), ESPERMW
#endif
	MOVD $·rcon+0x50(SB), R4
	LXVD2X (R4)(R0), REVERSE_WORDS
	MOVD $16, R3
	LXVD2X (R4)(R3), NIBBLE_MASK
	MOVD $48, R3
	LXVD2X (R4)(R3), M1L
	MOVD $64, R3
	LXVD2X (R4)(R3), M1H
	MOVD $80, R3
	LXVD2X (R4)(R3), M2L
	MOVD $96, R3
	LXVD2X (R4)(R3), M2H

	MOVD key+0(FP), R3
	MOVD ck+8(FP), R4
	MOVD enc+16(FP), R5
	MOVD dec+24(FP), R6

	ADD $112, R6

	// load fk
	MOVD $·fk+0(SB), R7
	LXVW4X (R7), V4

	// load key
	PPC64X_LXVW4X(R3, R0, V0)

	// xor key with fk
	VXOR V0, V4, V0
	VSLDOI $4, V0, V0, V1
	VSLDOI $4, V1, V1, V2
	VSLDOI $4, V2, V2, V3

	// prepare counter
	MOVD $8, R7
	MOVD R7, CTR

ksLoop:
	LXVW4X (R4), V4
	SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V0, V1, V2, V3, V5)
	VSLDOI $4, V4, V4, V4
	SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V1, V2, V3, V0, V5)
	VSLDOI $4, V4, V4, V4
	SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V2, V3, V0, V1, V5)
	VSLDOI $4, V4, V4, V4
	SM4_EXPANDKEY_ROUND(V4, V7, V8, V9, V3, V0, V1, V2, V5)
	STXVW4X V5, (R5)
	VPERM V5, V5, REVERSE_WORDS, V5
	STXVW4X V5, (R6)

	ADD $16, R5
	ADD $16, R4
	ADD $-16, R6
	BDNZ ksLoop

	RET

// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	// prepare/load constants
	VSPLTISW $4, V_FOUR;
#ifdef NEEDS_PERMW
	MOVD $·rcon(SB), R4
	LVX (R4), ESPERMW
#endif
	MOVD $·rcon+0x50(SB), R4
	LXVD2X (R4)(R0), REVERSE_WORDS
	MOVD $16, R3
	LXVD2X (R4)(R3), NIBBLE_MASK
	MOVD $48, R3
	LXVD2X (R4)(R3), M1L
	MOVD $64, R3
	LXVD2X (R4)(R3), M1H
	MOVD $80, R3
	LXVD2X (R4)(R3), M2L
	MOVD $96, R3
	LXVD2X (R4)(R3), M2H

	MOVD xk+0(FP), R3
	MOVD dst+8(FP), R4
	MOVD src+16(FP), R5

	// load src
	PPC64X_LXVW4X(R5, R0, V0)
	VSLDOI $4, V0, V0, V1
	VSLDOI $4, V1, V1, V2
	VSLDOI $4, V2, V2, V3

	// prepare counter
	MOVD $8, R7
	MOVD R7, CTR

encryptBlockLoop:
	// load xk
	LXVW4X (R3), V8
	PROCESS_SINGLEBLOCK_4ROUND
	ADD $16, R3
	BDNZ encryptBlockLoop

	VSLDOI $4, V3, V3, V3
	VSLDOI $4, V3, V2, V2
	VSLDOI $4, V2, V1, V1
	VSLDOI $4, V1, V0, V0

	PPC64X_STXVW4X(V0, R4, R0)

	RET

// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
	// prepare/load constants
	VSPLTISW $4, V_FOUR;
#ifdef NEEDS_PERMW
	MOVD $·rcon(SB), R4
	LVX (R4), ESPERMW
#endif
	MOVD $·rcon+0x10(SB), R4
	LOAD_CONSTS(R4, R3)

	MOVD xk+0(FP), R3
	MOVD dst+8(FP), R4
	MOVD src+32(FP), R5
	MOVD src_len+40(FP), R6

	CMP R6, $128
	BEQ enc8blocks

enc4blocks:
	// prepare counter
	MOVD $8, R7
	MOVD R7, CTR

	MOVD $16, R7
	MOVD $32, R8
	MOVD $48, R9
	PPC64X_LXVW4X(R5, R0, V0)
	PPC64X_LXVW4X(R5, R7, V1)
	PPC64X_LXVW4X(R5, R8, V2)
	PPC64X_LXVW4X(R5, R9, V3)
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

enc4blocksLoop:
	// load xk
	LXVW4X (R3), V8
	PROCESS_4BLOCKS_4ROUND
	ADD $16, R3
	BDNZ enc4blocksLoop

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PPC64X_STXVW4X(V0, R4, R0)
	PPC64X_STXVW4X(V1, R4, R7)
	PPC64X_STXVW4X(V2, R4, R8)
	PPC64X_STXVW4X(V3, R4, R9)
	RET

enc8blocks:
	// prepare counter
	MOVD $8, R7
	MOVD R7, CTR

	MOVD $16, R7
	MOVD $32, R8
	MOVD $48, R9
	MOVD $64, R10
	MOVD $80, R11
	MOVD $96, R12
	MOVD $112, R14
	PPC64X_LXVW4X(R5, R0, V0)
	PPC64X_LXVW4X(R5, R7, V1)
	PPC64X_LXVW4X(R5, R8, V2)
	PPC64X_LXVW4X(R5, R9, V3)
	PPC64X_LXVW4X(R5, R10, V4)
	PPC64X_LXVW4X(R5, R11, V5)
	PPC64X_LXVW4X(R5, R12, V6)
	PPC64X_LXVW4X(R5, R14, V7)
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PRE_TRANSPOSE_MATRIX(V4, V5, V6, V7)

enc8blocksLoop:
	LXVW4X (R3), V8
	PROCESS_8BLOCKS_4ROUND
	ADD $16, R3
	BDNZ enc8blocksLoop

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	TRANSPOSE_MATRIX(V4, V5, V6, V7)
	PPC64X_STXVW4X(V0, R4, R0)
	PPC64X_STXVW4X(V1, R4, R7)
	PPC64X_STXVW4X(V2, R4, R8)
	PPC64X_STXVW4X(V3, R4, R9)
	PPC64X_STXVW4X(V4, R4, R10)
	PPC64X_STXVW4X(V5, R4, R11)
	PPC64X_STXVW4X(V6, R4, R12)
	PPC64X_STXVW4X(V7, R4, R14)

	RET
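
// Reference sketch (Go): a scalar equivalent of the SM4_TAO_L2 macro above,
// following the published SM4 key-schedule definition. The names lPrime and
// sbox32 are illustrative only, not identifiers from this package; sbox32
// stands for the per-byte S-box substitution that SM4_SBOX performs before
// the linear step.
//
//	import "math/bits"
//
//	// L'(b) = b ^ (b <<< 13) ^ (b <<< 23). SM4_TAO_L2 builds this from two
//	// VRLW rotations: y = x <<< 13, x ^= y, then y is rotated by 10 more
//	// (giving x <<< 23) and XORed in again.
//	func lPrime(b uint32) uint32 {
//		return b ^ bits.RotateLeft32(b, 13) ^ bits.RotateLeft32(b, 23)
//	}
//
// In those terms, one SM4_EXPANDKEY_ROUND computes
// k0 ^= lPrime(sbox32(k1 ^ k2 ^ k3 ^ ck)); the trailing VSLDOI shifts the new
// round key into V5 so a single STXVW4X can store four round keys per loop.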