// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"
#include "sm3_const_asm.s"

// We could also use MFVSRWZ to extract the first word from a vector register
// and VSLDOI to shift the vector register, and thus avoid going through
// memory (the buffer). Since we have no physical machine on which to measure
// the performance difference, we keep the current implementation for now.

#ifdef GOARCH_ppc64le
#define NEEDS_PERMW

#define PPC64X_LXVW4X(RA,RB,VT) \
	LXVW4X	(RA+RB), VT \
	VPERM	VT, VT, ESPERMW, VT

#else
#define PPC64X_LXVW4X(RA,RB,VT) LXVW4X (RA+RB), VT
#endif // defined(GOARCH_ppc64le)

#define a R7
#define b R8
#define c R9
#define d R10
#define e R11
#define f R12
#define g R14
#define h R15

#define CTX	R3
#define INP	R4
#define LEN	R5
#define BUFFER	R16

#define R_x000	R0
#define R_x010	R17
#define R_x020	R18
#define R_x030	R19

#define y0	R20
#define y1	R21
#define y2	R22
#define TEMP	R6

#define XWORD0 V0
#define XWORD1 V1
#define XWORD2 V2
#define XWORD3 V3

#define XTMP0 V4
#define XTMP1 V5
#define XTMP2 V6
#define XTMP3 V7
#define XTMP4 V8

#define XFER V9

// For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE

// shuffle byte order from LE to BE
DATA ·flip_mask+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
DATA ·flip_mask+0x08(SB)/8, $0x0302010007060504

GLOBL ·flip_mask(SB), RODATA, $16

#define SS12(a, e, const, ss1, ss2) \
	ROTLW	$12, a, ss2; \ // ss2 = a <<< 12
	ADD	$const, e, ss1; \
	ADD	ss2, ss1; \ // ss1 = (a <<< 12) + e + T
	ROTLW	$7, ss1; \ // ss1 = SS1
	XOR	ss1, ss2 // ss2 = SS2

#define P0(tt2, tmp, out) \
	ROTLW	$9, tt2, tmp; \
	ROTLW	$17, tt2, out; \
	XOR	tmp, out; \
	XOR	tt2, out // out = tt2 ^ (tt2 <<< 9) ^ (tt2 <<< 17)

// For rounds [0 - 16)
// addr1 for w, addr2 for w'
#define DO_ROUND_N_0(addr1, addr2, const, a, b, c, d, e, f, g, h) \
	; \
	SS12(a, e, const, y2, y0); \
	MOVWZ	addr1, y1; \
	ADD	y1, y2; \ // y2 = SS1 + W
	ADD	h, y2; \ // y2 = h + SS1 + W
	MOVWZ	addr2, y1; \
	ADD	y1, y0; \ // y0 = SS2 + W'
	ADD	d, y0; \ // y0 = d + SS2 + W'
	; \
	XOR	a, b, h; \
	XOR	c, h; \
	ADD	y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	XOR	e, f, y1; \
	XOR	g, y1; \
	ADD	y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROTLW	$9, b; \
	ROTLW	$19, f; \
	; \
	P0(y2, y0, d)

// For rounds [16 - 64)
// addr1 for w, addr2 for w'
#define DO_ROUND_N_1(addr1, addr2, const, a, b, c, d, e, f, g, h) \
	; \
	SS12(a, e, const, y2, y0); \
	MOVWZ	addr1, y1; \
	ADD	y1, y2; \ // y2 = SS1 + W
	ADD	h, y2; \ // y2 = h + SS1 + W
	MOVWZ	addr2, y1; \
	ADD	y1, y0; \ // y0 = SS2 + W'
	ADD	d, y0; \ // y0 = d + SS2 + W'
	; \
	OR	a, b, y1; \
	AND	a, b, h; \
	AND	c, y1; \
	OR	y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
	ADD	y0, h; \ // h = FF2(a, b, c) + d + SS2 + W' = tt1
	; \
	XOR	f, g, y1; \
	AND	e, y1; \
	XOR	g, y1; \ // y1 = GG2(e, f, g)
	ADD	y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
	; \
	ROTLW	$9, b; \
	ROTLW	$19, f; \
	; \
	P0(y2, y0, d)
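
// For reference, one SM3 round in plain Go (a sketch of what the two macros
// above compute, not part of the build; ff, gg and wp are sketch names, with
// wp[j] = w[j] ^ w[j+4] and Tj the pre-rotated round constant):
//
//	ss2 := bits.RotateLeft32(a, 12)
//	ss1 := bits.RotateLeft32(ss2+e+Tj, 7)
//	ss2 ^= ss1
//	tt1 := ff(a, b, c) + d + ss2 + wp[j] // FF: a^b^c (j<16), (a&b)|(a&c)|(b&c) (j>=16)
//	tt2 := gg(e, f, g) + h + ss1 + w[j]  // GG: e^f^g (j<16), (e&f)|(^e&g) (j>=16)
//	d, c, b, a = c, bits.RotateLeft32(b, 9), a, tt1
//	h, g, f, e = g, bits.RotateLeft32(f, 19), e,
//		tt2^bits.RotateLeft32(tt2, 9)^bits.RotateLeft32(tt2, 17) // e = P0(tt2)
//
// The macros skip the final renaming step: the call sites instead rotate the
// register arguments (a, b, ..., h) one position each round.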
// r = s <<< n
// Due to VSPLTISW's limitation, n must be in [0, 31].
#define PROLD(s, r, n) \
	VSPLTISW	$n, XFER \
	VRLW	s, XFER, r

#define MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) \
	VSLDOI	$12, XWORD0, XWORD1, XTMP0; \ // XTMP0 = W[-13] = {w3, w4, w5, w6}
	PROLD(XTMP0, XTMP1, 7); \ // XTMP1 = W[-13] rol 7
	VSLDOI	$8, XWORD2, XWORD3, XTMP0; \ // XTMP0 = W[-6] = {w10, w11, w12, w13}
	VXOR	XTMP0, XTMP1, XTMP0; \ // XTMP0 = W[-6] xor (W[-13] rol 7)
	; \ // Prepare P1 parameters
	VSLDOI	$12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w7, w8, w9, w10}
	VXOR	XTMP1, XWORD0, XTMP1; \ // XTMP1 = W[-9] xor W[-16]
	VSLDOI	$4, XWORD3, XWORD2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w8}
	PROLD(XTMP3, XTMP2, 15); \ // XTMP2 = W[-3] rol 15
	VXOR	XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABxx}
	; \ // P1
	PROLD(XTMP2, XTMP4, 15); \ // XTMP4 = XTMP2 rol 15 {ABxx}
	PROLD(XTMP4, XTMP3, 8); \ // XTMP3 = XTMP2 rol 23 {ABxx}
	VXOR	XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx})
	VXOR	XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx}) XOR (XTMP2 rol 23 {ABxx})
	; \ // First 2 words of the message schedule result
	VXOR	XTMP4, XTMP0, XTMP2; \ // XTMP2 = {w[0], w[1], ..., ...}
	; \ // Prepare P1 parameters
	VSLDOI	$4, XWORD3, XTMP2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w0}
	PROLD(XTMP3, XTMP4, 15); \ // XTMP4 = W[-3] rol 15
	VXOR	XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABCD}
	; \ // P1
	PROLD(XTMP4, XTMP3, 15); \ // XTMP3 = XTMP4 rol 15 {ABCD}
	PROLD(XTMP3, XTMP1, 8); \ // XTMP1 = XTMP4 rol 23 {ABCD}
	VXOR	XTMP4, XTMP3, XTMP3; \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {ABCD})
	VXOR	XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {ABCD}) XOR (XTMP4 rol 23 {ABCD})
	; \ // All 4 words of the message schedule result
	VXOR	XTMP1, XTMP0, XWORD0 // XWORD0 = {w[0], w[1], w[2], w[3]}
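
// For reference, the scalar recurrence that MESSAGE_SCHEDULE evaluates four
// words at a time, with P1(x) = x ^ (x <<< 15) ^ (x <<< 23) (a Go sketch,
// not part of the build):
//
//	x := w[j-16] ^ w[j-9] ^ bits.RotateLeft32(w[j-3], 15)
//	w[j] = x ^ bits.RotateLeft32(x, 15) ^ bits.RotateLeft32(x, 23) ^
//		bits.RotateLeft32(w[j-13], 7) ^ w[j-6]
//
// Because w[j-3] for the last lane is itself produced within the same vector,
// the macro applies P1 twice: the first pass yields the leading words
// ({ABxx}), and once w[0] is spliced into W[-3] a second pass produces all
// four ({ABCD}).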

// func blockASM(dig *digest, p []byte, buffer *uint32)
TEXT ·blockASM(SB), NOSPLIT, $0
#ifdef NEEDS_PERMW
	MOVD	$·flip_mask(SB), TEMP
	LVX	(TEMP), ESPERMW
#endif

	MOVD	dig+0(FP), CTX
	MOVD	p_base+8(FP), INP
	MOVD	p_len+16(FP), LEN
	MOVD	buffer+32(FP), BUFFER

	// Number of 64-byte blocks; we assume p_len >= 64, and any partial
	// trailing block is ignored here.
	SRD	$6, LEN
	MOVD	LEN, CTR

	MOVD	$16, R_x010
	MOVD	$32, R_x020
	MOVD	$48, R_x030

	// Load initial digest
	MOVWZ	0(CTX), a
	MOVWZ	4(CTX), b
	MOVWZ	8(CTX), c
	MOVWZ	12(CTX), d
	MOVWZ	16(CTX), e
	MOVWZ	20(CTX), f
	MOVWZ	24(CTX), g
	MOVWZ	28(CTX), h

loop:
	PPC64X_LXVW4X(INP, R_x000, XWORD0)
	PPC64X_LXVW4X(INP, R_x010, XWORD1)
	PPC64X_LXVW4X(INP, R_x020, XWORD2)
	PPC64X_LXVW4X(INP, R_x030, XWORD3)

	ADD	$64, INP

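	// Each 4-round group below stores the current W vector at 0(BUFFER)
	// and VXORs it with the next vector to get the four W' words
	// (w'[j] = w[j] ^ w[j+4]) at 16(BUFFER); the scalar rounds then read
	// w[j] and w'[j] back as 32-bit words. The constants T0..T63 come from
	// sm3_const_asm.s and are presumably the pre-rotated SM3 round
	// constants, Tj = 0x79cc4519 <<< j for j < 16 and
	// Tj = 0x7a879d8a <<< (j mod 32) for j >= 16, since SS12 adds them
	// directly before the final <<< 7.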
schedule_compress: // for w0 - w47
	// Do 4 rounds and scheduling
	STXVW4X	XWORD0, (BUFFER)(R_x000)
	VXOR	XWORD0, XWORD1, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_0(0(BUFFER), 16(BUFFER), T0, a, b, c, d, e, f, g, h)
	DO_ROUND_N_0(4(BUFFER), 20(BUFFER), T1, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_0(8(BUFFER), 24(BUFFER), T2, g, h, a, b, c, d, e, f)
	DO_ROUND_N_0(12(BUFFER), 28(BUFFER), T3, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	STXVW4X	XWORD1, (BUFFER)(R_x000)
	VXOR	XWORD1, XWORD2, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_0(0(BUFFER), 16(BUFFER), T4, e, f, g, h, a, b, c, d)
	DO_ROUND_N_0(4(BUFFER), 20(BUFFER), T5, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
	DO_ROUND_N_0(8(BUFFER), 24(BUFFER), T6, c, d, e, f, g, h, a, b)
	DO_ROUND_N_0(12(BUFFER), 28(BUFFER), T7, b, c, d, e, f, g, h, a)

	// Do 4 rounds and scheduling
	STXVW4X	XWORD2, (BUFFER)(R_x000)
	VXOR	XWORD2, XWORD3, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_0(0(BUFFER), 16(BUFFER), T8, a, b, c, d, e, f, g, h)
	DO_ROUND_N_0(4(BUFFER), 20(BUFFER), T9, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
	DO_ROUND_N_0(8(BUFFER), 24(BUFFER), T10, g, h, a, b, c, d, e, f)
	DO_ROUND_N_0(12(BUFFER), 28(BUFFER), T11, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	STXVW4X	XWORD3, (BUFFER)(R_x000)
	VXOR	XWORD3, XWORD0, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_0(0(BUFFER), 16(BUFFER), T12, e, f, g, h, a, b, c, d)
	DO_ROUND_N_0(4(BUFFER), 20(BUFFER), T13, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
	DO_ROUND_N_0(8(BUFFER), 24(BUFFER), T14, c, d, e, f, g, h, a, b)
	DO_ROUND_N_0(12(BUFFER), 28(BUFFER), T15, b, c, d, e, f, g, h, a)

	// Do 4 rounds and scheduling
	STXVW4X	XWORD0, (BUFFER)(R_x000)
	VXOR	XWORD0, XWORD1, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T16, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T17, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T18, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T19, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	STXVW4X	XWORD1, (BUFFER)(R_x000)
	VXOR	XWORD1, XWORD2, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T20, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T21, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T22, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T23, b, c, d, e, f, g, h, a)

	// Do 4 rounds and scheduling
	STXVW4X	XWORD2, (BUFFER)(R_x000)
	VXOR	XWORD2, XWORD3, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T24, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T25, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T26, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T27, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	STXVW4X	XWORD3, (BUFFER)(R_x000)
	VXOR	XWORD3, XWORD0, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T28, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T29, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T30, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T31, b, c, d, e, f, g, h, a)

	// Do 4 rounds and scheduling
	STXVW4X	XWORD0, (BUFFER)(R_x000)
	VXOR	XWORD0, XWORD1, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T32, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T33, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T34, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T35, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	STXVW4X	XWORD1, (BUFFER)(R_x000)
	VXOR	XWORD1, XWORD2, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T36, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T37, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T38, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T39, b, c, d, e, f, g, h, a)

	// Do 4 rounds and scheduling
	STXVW4X	XWORD2, (BUFFER)(R_x000)
	VXOR	XWORD2, XWORD3, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T40, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T41, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T42, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T43, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	STXVW4X	XWORD3, (BUFFER)(R_x000)
	VXOR	XWORD3, XWORD0, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T44, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T45, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T46, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T47, b, c, d, e, f, g, h, a)

	// For w48 - w63 (the last 16 rounds), only one more message schedule
	// is needed.
	// Do 4 rounds
	STXVW4X	XWORD0, (BUFFER)(R_x000)
	VXOR	XWORD0, XWORD1, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T48, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T49, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T50, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T51, f, g, h, a, b, c, d, e)

	STXVW4X	XWORD1, (BUFFER)(R_x000)
	VXOR	XWORD1, XWORD2, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T52, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T53, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T54, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T55, b, c, d, e, f, g, h, a)

	STXVW4X	XWORD2, (BUFFER)(R_x000)
	VXOR	XWORD2, XWORD3, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	// w64 - w67 are still needed for W' in rounds T60 - T63.
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T56, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T57, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T58, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T59, f, g, h, a, b, c, d, e)

	STXVW4X	XWORD3, (BUFFER)(R_x000)
	VXOR	XWORD3, XWORD0, XFER
	STXVW4X	XFER, (BUFFER)(R_x010)
	DO_ROUND_N_1(0(BUFFER), 16(BUFFER), T60, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(4(BUFFER), 20(BUFFER), T61, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(8(BUFFER), 24(BUFFER), T62, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(12(BUFFER), 28(BUFFER), T63, b, c, d, e, f, g, h, a)

	// Feed forward: V(i+1) = V(i) xor ABCDEFGH
	MOVWZ	0(CTX), TEMP
	XOR	TEMP, a
	MOVWZ	a, 0(CTX)

	MOVWZ	4(CTX), TEMP
	XOR	TEMP, b
	MOVWZ	b, 4(CTX)

	MOVWZ	8(CTX), TEMP
	XOR	TEMP, c
	MOVWZ	c, 8(CTX)

	MOVWZ	12(CTX), TEMP
	XOR	TEMP, d
	MOVWZ	d, 12(CTX)

	MOVWZ	16(CTX), TEMP
	XOR	TEMP, e
	MOVWZ	e, 16(CTX)

	MOVWZ	20(CTX), TEMP
	XOR	TEMP, f
	MOVWZ	f, 20(CTX)

	MOVWZ	24(CTX), TEMP
	XOR	TEMP, g
	MOVWZ	g, 24(CTX)

	MOVWZ	28(CTX), TEMP
	XOR	TEMP, h
	MOVWZ	h, 28(CTX)

	BDNZ	loop

end:
	RET
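
// The Go side of this routine (elsewhere in this package) is expected to
// declare it roughly as below; the //go:noescape directive is an assumption
// here, but the signature matches the comment above blockASM:
//
//	//go:noescape
//	func blockASM(dig *digest, p []byte, buffer *uint32)
//
// len(p) must be at least 64, and any partial trailing block is ignored
// (SRD $6 keeps only whole 64-byte blocks); buffer must provide at least
// 32 bytes (eight uint32 words) of scratch space for the W / W' stores above.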