github.com/emmansun/gmsm@v0.29.1/sm4/cbc_ppc64x.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

#define REVERSE_WORDS V19
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define V_FOUR V24
#define M0 V25
#define M1 V26
#define M2 V27
#define M3 V28
#define NIBBLE_MASK V29
#define INVERSE_SHIFT_ROWS V30
// For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE

#define TMP0 V10
#define TMP1 V11
#define TMP2 V12
#define TMP3 V13
#define IV V18

#include "aesni_macros_ppc64x.s"

#ifdef NEEDS_PERMW
#define REVERSE32LE_8BLOCKS \
	VPERM V0, V0, ESPERMW, V0; \
	VPERM V1, V1, ESPERMW, V1; \
	VPERM V2, V2, ESPERMW, V2; \
	VPERM V3, V3, ESPERMW, V3; \
	VPERM V4, V4, ESPERMW, V4; \
	VPERM V5, V5, ESPERMW, V5; \
	VPERM V6, V6, ESPERMW, V6; \
	VPERM V7, V7, ESPERMW, V7
#else
#define REVERSE32LE_8BLOCKS
#endif

// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
//
// CBC decryption, processed from the tail of src toward the head so
// that dst may alias src: each block only needs the ciphertext block
// just before it, which is still intact when walking backwards.
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
#define dstPtr R3
#define srcPtr R4
#define rk R5
#define srcLen R6
	// prepare/load constants
	VSPLTISW $4, V_FOUR;
#ifdef NEEDS_PERMW
	MOVD $·rcon(SB), R4
	LVX (R4), ESPERMW
#endif
	MOVD $·rcon+0x10(SB), R4
	LOAD_CONSTS(R4, R3)

	// Load IV
	MOVD iv+56(FP), R7
	PPC64X_LXVW4X(R7, R0, IV)

	MOVD xk+0(FP), rk
	MOVD dst+8(FP), dstPtr
	MOVD src+32(FP), srcPtr
	MOVD src_len+40(FP), srcLen

	MOVD $16, R7
	MOVD $32, R8
	MOVD $48, R9
	MOVD $64, R10
	MOVD $80, R11
	MOVD $96, R12
	MOVD $112, R14

	// Save the last ciphertext block now; it becomes the chaining IV
	// for the next call and may be overwritten when dst aliases src.
	ADD srcPtr, srcLen, R15
	ADD $-16, R15, R15
	LXVD2X (R15)(R0), V14 // Load last 16 bytes of src into V14

	CMP srcLen, $144 // 9 blocks
	BLT lessThan9blocks

	PCALIGN $16
loop8blocks:
	// Step the window back 8 blocks: R15 = start of the chunk,
	// R16 = ciphertext block just before it (the chaining value for
	// the chunk's first block), R17 = matching position in dst.
	ADD $-128, srcLen
	ADD srcPtr, srcLen, R15
	ADD $-16, R15, R16
	ADD dstPtr, srcLen, R17
	PPC64X_LXVW4X(R15, R0, V0)
	PPC64X_LXVW4X(R15, R7, V1)
	PPC64X_LXVW4X(R15, R8, V2)
	PPC64X_LXVW4X(R15, R9, V3)
	PPC64X_LXVW4X(R15, R10, V4)
	PPC64X_LXVW4X(R15, R11, V5)
	PPC64X_LXVW4X(R15, R12, V6)
	PPC64X_LXVW4X(R15, R14, V7)
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PRE_TRANSPOSE_MATRIX(V4, V5, V6, V7)

	LXVW4X (rk)(R0), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_8BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	TRANSPOSE_MATRIX(V4, V5, V6, V7)

	REVERSE32LE_8BLOCKS // for ppc64le

	// XOR each decrypted block with the ciphertext block before it.
	LXVW4X (R16)(R0), TMP0
	LXVW4X (R16)(R7), TMP1
	LXVW4X (R16)(R8), TMP2
	LXVW4X (R16)(R9), TMP3
	VXOR V0, TMP0, V0
	VXOR V1, TMP1, V1
	VXOR V2, TMP2, V2
	VXOR V3, TMP3, V3
	LXVW4X (R16)(R10), TMP0
	LXVW4X (R16)(R11), TMP1
	LXVW4X (R16)(R12), TMP2
	LXVW4X (R16)(R14), TMP3
	VXOR V4, TMP0, V4
	VXOR V5, TMP1, V5
	VXOR V6, TMP2, V6
	VXOR V7, TMP3, V7
	STXVW4X V0, (R17)(R0)
	STXVW4X V1, (R17)(R7)
	STXVW4X V2, (R17)(R8)
	STXVW4X V3, (R17)(R9)
	STXVW4X V4, (R17)(R10)
	STXVW4X V5, (R17)(R11)
	STXVW4X V6, (R17)(R12)
	STXVW4X V7, (R17)(R14)

	CMP srcLen, $144 // 9 blocks
	BGE loop8blocks
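// Fewer than 9 blocks remain. Peel off one 4-block chunk if more than
// 4 blocks are left, then dispatch on the exact number of final blocks;
// only the first block of the whole buffer chains off the
// caller-supplied IV.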
lessThan9blocks:
	CMP srcLen, $64
	BLE ble4blocks

	ADD $-64, srcLen
	ADD srcPtr, srcLen, R15
	ADD $-16, R15, R16
	ADD dstPtr, srcLen, R17
	PPC64X_LXVW4X(R15, R0, V0)
	PPC64X_LXVW4X(R15, R7, V1)
	PPC64X_LXVW4X(R15, R8, V2)
	PPC64X_LXVW4X(R15, R9, V3)
	// Keep ciphertext copies of blocks 0-2; they are the chaining
	// values for blocks 1-3.
	VOR V0, V0, V5
	VOR V1, V1, V6
	VOR V2, V2, V7

	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X (rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PPC64X_LXVW4X(R16, R0, V4) // ciphertext block preceding the chunk
	VXOR V0, V4, V0
	VXOR V1, V5, V1
	VXOR V2, V6, V2
	VXOR V3, V7, V3
	PPC64X_STXVW4X(V0, R17, R0)
	PPC64X_STXVW4X(V1, R17, R7)
	PPC64X_STXVW4X(V2, R17, R8)
	PPC64X_STXVW4X(V3, R17, R9)

ble4blocks:
	// 1-4 blocks remain; the 64-byte (4-block) case falls through.
	CMPU srcLen, $48, CR1
	CMPU srcLen, $32, CR2
	CMPU srcLen, $16, CR3
	BEQ CR1, eq3blocks
	BEQ CR2, eq2blocks
	BEQ CR3, eq1block

	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PPC64X_LXVW4X(srcPtr, R8, V2)
	PPC64X_LXVW4X(srcPtr, R9, V3)
	VOR V0, V0, V4
	VOR V1, V1, V5
	VOR V2, V2, V6
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X (rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR V0, IV, V0
	VXOR V1, V4, V1
	VXOR V2, V5, V2
	VXOR V3, V6, V3
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	PPC64X_STXVW4X(V2, dstPtr, R8)
	PPC64X_STXVW4X(V3, dstPtr, R9)
	BR done

eq3blocks:
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PPC64X_LXVW4X(srcPtr, R8, V2)
	VOR V0, V0, V4
	VOR V1, V1, V5
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X (rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR V0, IV, V0
	VXOR V1, V4, V1
	VXOR V2, V5, V2
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	PPC64X_STXVW4X(V2, dstPtr, R8)
	BR done

eq2blocks:
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	VOR V0, V0, V4
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	LXVW4X (rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR V0, IV, V0
	VXOR V1, V4, V1
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	BR done
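// A single block: the VSLDOI rotations below spread the block's four
// 32-bit words across V0-V3 so the same 4-lane round macro layout can
// be reused; the mirror-image rotations after the rounds gather the
// updated words back into V0.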
eq1block:
	PPC64X_LXVW4X(srcPtr, R0, V0)
	VSLDOI $4, V0, V0, V1
	VSLDOI $4, V1, V1, V2
	VSLDOI $4, V2, V2, V3
	LXVW4X (rk)(R0), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R9), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_SINGLEBLOCK_4ROUND
	VSLDOI $4, V3, V3, V3
	VSLDOI $4, V3, V2, V2
	VSLDOI $4, V2, V1, V1
	VSLDOI $4, V1, V0, V0
	VXOR V0, IV, V0
	PPC64X_STXVW4X(V0, dstPtr, R0)

done:
	// Write the saved last ciphertext block back as the next IV.
	MOVD iv+56(FP), R7
	STXVD2X V14, (R7)(R0)
	RET
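// For reference, the chaining logic above corresponds to roughly the
// following Go sketch (not part of the build; decryptBlock is a
// hypothetical stand-in for scalar SM4 block decryption with the
// expanded key xk):
//
//	func decryptBlocksChain(xk []uint32, dst, src, iv []byte) {
//		var nextIV [16]byte
//		copy(nextIV[:], src[len(src)-16:]) // saved first: dst may alias src
//		for i := len(src) - 16; i >= 0; i -= 16 {
//			var pt [16]byte
//			decryptBlock(xk, pt[:], src[i:i+16])
//			prev := iv // chaining value for the very first block
//			if i > 0 {
//				prev = src[i-16 : i] // still intact: we walk backwards
//			}
//			for j := 0; j < 16; j++ {
//				dst[i+j] = pt[j] ^ prev[j]
//			}
//		}
//		copy(iv, nextIV[:])
//	}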