github.com/emmansun/gmsm@v0.29.1/sm4/ecb_ppc64x.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

#define REVERSE_WORDS V19
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define V_FOUR V24
#define M0 V25
#define M1 V26
#define M2 V27
#define M3 V28
#define NIBBLE_MASK V29
#define INVERSE_SHIFT_ROWS V30
// For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE

#define TMP0 V10
#define TMP1 V11
#define TMP2 V12
#define TMP3 V13

#include "aesni_macros_ppc64x.s"

// func encryptSm4Ecb(xk *uint32, dst, src []byte)
TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
#define dstPtr R3
#define srcPtr R4
#define rk R5
#define srcLen R6
	// prepare/load constants
	VSPLTISW $4, V_FOUR;
#ifdef NEEDS_PERMW
	MOVD $·rcon(SB), R4
	LVX (R4), ESPERMW
#endif
	MOVD $·rcon+0x10(SB), R4
	LOAD_CONSTS(R4, R3)

	MOVD xk+0(FP), rk
	MOVD dst+8(FP), dstPtr
	MOVD src+32(FP), srcPtr
	MOVD src_len+40(FP), srcLen

	MOVD $16, R7
	MOVD $32, R8
	MOVD $48, R10
	MOVD $64, R11
	MOVD $80, R12
	MOVD $96, R14
	MOVD $112, R15

	CMP srcLen, $128
	BLT block64

preloop128:
	SRD $7, srcLen, R9     // Set up loop counter
	MOVD R9, CTR
	ANDCC $127, srcLen, R9 // Check for trailing bytes for later
	PCALIGN $16

block128:
	// Case for >= 128 bytes
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PPC64X_LXVW4X(srcPtr, R8, V2)
	PPC64X_LXVW4X(srcPtr, R10, V3)
	PPC64X_LXVW4X(srcPtr, R11, V4)
	PPC64X_LXVW4X(srcPtr, R12, V5)
	PPC64X_LXVW4X(srcPtr, R14, V6)
	PPC64X_LXVW4X(srcPtr, R15, V7)
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PRE_TRANSPOSE_MATRIX(V4, V5, V6, V7)

	LXVW4X (rk)(R0), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X (rk)(R15), V8
	PROCESS_8BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	TRANSPOSE_MATRIX(V4, V5, V6, V7)

	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	PPC64X_STXVW4X(V2, dstPtr, R8)
	PPC64X_STXVW4X(V3, dstPtr, R10)
	PPC64X_STXVW4X(V4, dstPtr, R11)
	PPC64X_STXVW4X(V5, dstPtr, R12)
	PPC64X_STXVW4X(V6, dstPtr, R14)
	PPC64X_STXVW4X(V7, dstPtr, R15)

	ADD $128, srcPtr
	ADD $128, dstPtr
	BDNZ block128
	BC 12,2,LR // BEQLR, fast return
	MOVD R9, srcLen

block64:
	CMP srcLen, $64
	BLT lessThan64
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PPC64X_LXVW4X(srcPtr, R8, V2)
	PPC64X_LXVW4X(srcPtr, R10, V3)
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	LXVW4X (rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R15), V8
	PROCESS_4BLOCKS_4ROUND
	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	PPC64X_STXVW4X(V2, dstPtr, R8)
	PPC64X_STXVW4X(V3, dstPtr, R10)
	ADD $64, srcPtr
	ADD $64, dstPtr
	ADD $-64, srcLen

lessThan64:
	CMPU srcLen, $48, CR1
	CMPU srcLen, $32, CR2
	CMPU srcLen, $16, CR3
	BEQ CR1, block48
	BEQ CR2, block32
	BEQ CR3, block16
	RET

block48:
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PPC64X_LXVW4X(srcPtr, R8, V2)
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	LXVW4X (rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R15), V8
	PROCESS_4BLOCKS_4ROUND
	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	PPC64X_STXVW4X(V2, dstPtr, R8)
	RET

block32:
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	LXVW4X (rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X (rk)(R15), V8
	PROCESS_4BLOCKS_4ROUND
	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	RET

block16:
	PPC64X_LXVW4X(srcPtr, R0, V0)
	VSLDOI $4, V0, V0, V1
	VSLDOI $4, V1, V1, V2
	VSLDOI $4, V2, V2, V3
	LXVW4X (rk)(R0), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R7), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R8), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R10), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R11), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R12), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R14), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X (rk)(R15), V8
	PROCESS_SINGLEBLOCK_4ROUND
	VSLDOI $4, V3, V3, V3
	VSLDOI $4, V3, V2, V2
	VSLDOI $4, V2, V1, V1
	VSLDOI $4, V1, V0, V0
	PPC64X_STXVW4X(V0, dstPtr, R0)
	RET
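
// For reference, a minimal sketch of the Go side that would sit in front of this
// routine. The declaration mirrors the signature comment and the FP argument
// offsets above (xk+0, dst+8, src+32, src_len+40); the package layout and the
// encryptBlocksECB wrapper are illustrative assumptions, not the package's
// actual API.

//go:build (ppc64 || ppc64le) && !purego

package sm4

// Declared here, implemented in ecb_ppc64x.s. xk must point at the expanded
// 32-word SM4 round-key schedule; src must be a whole number of 16-byte blocks
// (the assembly dispatches on exact multiples of 16 after its 128- and 64-byte
// fast paths) and dst must be at least as long as src.
//
//go:noescape
func encryptSm4Ecb(xk *uint32, dst, src []byte)

// encryptBlocksECB is a hypothetical wrapper showing the preconditions the
// assembly relies on before it is called.
func encryptBlocksECB(xk []uint32, dst, src []byte) {
	if len(src)%16 != 0 {
		panic("sm4: ECB input must be a multiple of the block size")
	}
	if len(dst) < len(src) {
		panic("sm4: output buffer too small")
	}
	if len(src) == 0 {
		return
	}
	encryptSm4Ecb(&xk[0], dst, src)
}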