github.com/emmansun/gmsm@v0.29.1/sm3/sm3blocks_ppc64x.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"
#include "sm3_const_asm.s"

#define a V0
#define e V1
#define b V2
#define f V3
#define c V4
#define g V5
#define d V6
#define h V7
#define M0 V8
#define M1 V9
#define M2 V10
#define M3 V11
#define TMP0 V12
#define TMP1 V13
#define TMP2 V14
#define TMP3 V15
#define TMP4 V16
#define TMP5 V17

// For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE

#define R_x08 R15
#define R_x10 R16
#define R_x18 R17
#define R_x20 R18
#define R_x30 R19
#define R_TMP R19

DATA ·mask+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
DATA ·mask+0x08(SB)/8, $0x0302010007060504
DATA ·mask+0x10(SB)/8, $0x0001020310111213 // permute for matrix transpose
DATA ·mask+0x18(SB)/8, $0x0405060714151617
DATA ·mask+0x20(SB)/8, $0x08090a0b18191a1b
DATA ·mask+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f
DATA ·mask+0x30(SB)/8, $0x0001020304050607
DATA ·mask+0x38(SB)/8, $0x1011121314151617
DATA ·mask+0x40(SB)/8, $0x08090a0b0c0d0e0f
DATA ·mask+0x48(SB)/8, $0x18191a1b1c1d1e1f

GLOBL ·mask(SB), RODATA, $80

#ifdef GOARCH_ppc64le
#define NEEDS_PERMW

#define PPC64X_STXVD2X(VS, RA, RB) \
	VPERM VS, VS, ESPERMW, TMP5 \ // byte swap per word
	STXVD2X TMP5, (RA+RB)

#define PPC64X_LXVW4X(RA, RB, VT) \
	LXVW4X (RA+RB), VT \
	VPERM VT, VT, ESPERMW, VT

#else
#define PPC64X_STXVD2X(VS, RA, RB) STXVD2X VS, (RA+RB)
#define PPC64X_LXVW4X(RA, RB, VT) LXVW4X (RA+RB), VT
#endif // defined(GOARCH_ppc64le)

// r = s <<< n
// Due to VSPLTISW's limitation, n MUST be in [0, 31].
#define PROLD(s, r, n) \
	VSPLTISW $n, TMP5 \
	VRLW s, TMP5, r

#define loadWordByIndex(W, i) \
	MOVD $(16*(i)), R_TMP \
	LXVW4X (R_TMP)(statePtr), W

// One "word" slot is 16 bytes: the same message word for all four lanes.
#define prepare4Words \
	PPC64X_LXVW4X(srcPtr1, srcPtrPtr, V16); \
	PPC64X_LXVW4X(srcPtr2, srcPtrPtr, V17); \
	PPC64X_LXVW4X(srcPtr3, srcPtrPtr, V18); \
	PPC64X_LXVW4X(srcPtr4, srcPtrPtr, V19); \
	TRANSPOSE_MATRIX(V16, V17, V18, V19); \
	ADD $16, srcPtrPtr; \
	STXVW4X V16, (wordPtr); \
	ADD $16, wordPtr; \
	STXVW4X V17, (wordPtr); \
	ADD $16, wordPtr; \
	STXVW4X V18, (wordPtr); \
	ADD $16, wordPtr; \
	STXVW4X V19, (wordPtr); \
	ADD $16, wordPtr

#define TRANSPOSE_MATRIX(T0, T1, T2, T3) \
	VPERM T0, T1, M0, TMP0; \
	VPERM T2, T3, M0, TMP1; \
	VPERM T0, T1, M1, TMP2; \
	VPERM T2, T3, M1, TMP3; \
	VPERM TMP0, TMP1, M2, T0; \
	VPERM TMP0, TMP1, M3, T1; \
	VPERM TMP2, TMP3, M2, T2; \
	VPERM TMP2, TMP3, M3, T3

// Load constant T. How can it be simplified?
// Solution 1: a big constant table, like sha256block_ppc64x.s.
// Solution 2: keep 2 constant Ts and rotate them left by one bit every round.
// Solution 1 performs better but uses more memory.
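// LOAD_T below materializes the T_j immediate in a GPR, moves it into the
// vector unit with MTVSRWZ (which lands in word element 1 of the target),
// and splats that element across all four lanes with VSPLTW, so every lane
// sees the same round constant; the index argument is currently unused.
// A minimal Go sketch of the effect (the helper name loadT is hypothetical):
//
//	func loadT(tj uint32) [4]uint32 {
//		return [4]uint32{tj, tj, tj, tj} // one copy per parallel state
//	}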
#define LOAD_T(index, const, target) \
	MOVD $const, R_TMP \
	MTVSRWZ R_TMP, target \
	VSPLTW $1, target, target

#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
	PROLD(a, TMP0, 12) \
	VOR TMP0, TMP0, TMP1 \
	LOAD_T(index, const, TMP2) \
	VADDUWM TMP2, TMP0, TMP0 \
	VADDUWM e, TMP0, TMP0 \
	PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1
	VXOR TMP2, TMP1, TMP0 \ // TMP0 = SS2
	VXOR a, b, TMP1 \
	VXOR c, TMP1, TMP1 \
	VADDUWM TMP1, d, TMP1 \ // TMP1 = (a XOR b XOR c) + d
	loadWordByIndex(TMP3, index) \
	loadWordByIndex(TMP4, index+4) \
	VXOR TMP3, TMP4, TMP4 \
	VADDUWM TMP4, TMP1, TMP1 \ // TMP1 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VADDUWM TMP1, TMP0, TMP1 \ // TMP1 = TT1
	VADDUWM h, TMP3, TMP3 \
	VADDUWM TMP3, TMP2, TMP3 \ // Wt + h + SS1
	VXOR e, f, TMP4 \
	VXOR g, TMP4, TMP4 \
	VADDUWM TMP4, TMP3, TMP3 \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	VOR b, b, TMP4 \
	PROLD(TMP4, b, 9) \ // b = b <<< 9
	VOR TMP1, TMP1, h \ // h = TT1
	PROLD(f, f, 19) \ // f = f <<< 19
	PROLD(TMP3, TMP4, 9) \ // TMP4 = TT2 <<< 9
	PROLD(TMP4, TMP0, 8) \ // TMP0 = TT2 <<< 17
	VXOR TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9)
	VXOR TMP4, TMP0, d // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)

#define MESSAGE_SCHEDULE(index) \
	loadWordByIndex(TMP0, index+1) \ // Wj-3
	PROLD(TMP0, TMP1, 15) \
	loadWordByIndex(TMP0, index-12) \ // Wj-16
	VXOR TMP0, TMP1, TMP0 \
	loadWordByIndex(TMP1, index-5) \ // Wj-9
	VXOR TMP0, TMP1, TMP0 \
	PROLD(TMP0, TMP1, 15) \
	PROLD(TMP1, TMP2, 8) \
	VXOR TMP1, TMP0, TMP0 \
	VXOR TMP2, TMP0, TMP0 \ // P1
	loadWordByIndex(TMP1, index-9) \ // Wj-13
	PROLD(TMP1, TMP2, 7) \
	VXOR TMP2, TMP0, TMP0 \
	loadWordByIndex(TMP1, index-2) \ // Wj-6
	VXOR TMP1, TMP0, TMP1 \
	STXVW4X TMP1, (wordPtr) \
	ADD $16, wordPtr

#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index) \
	ROUND_00_11(index, const, a, b, c, d, e, f, g, h)

#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index) \ // TMP1 is Wt+4 now, please do not use it
	PROLD(a, TMP0, 12) \
	VOR TMP0, TMP0, TMP4 \
	LOAD_T(index, const, TMP2) \
	VADDUWM TMP2, TMP0, TMP0 \
	VADDUWM e, TMP0, TMP0 \
	PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1
	VXOR TMP2, TMP4, TMP0 \ // TMP0 = SS2
	VOR a, b, TMP3 \
	VAND a, b, TMP4 \
	VAND c, TMP3, TMP3 \
	VOR TMP4, TMP3, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c)
	VADDUWM TMP4, d, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c) + d
	loadWordByIndex(TMP3, index) \ // Wj
	VXOR TMP3, TMP1, TMP1 \ // Wj XOR Wj+4
	VADDUWM TMP4, TMP1, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
	VADDUWM TMP4, TMP0, TMP4 \ // TT1
	VADDUWM h, TMP3, TMP3 \ // Wt + h
	VADDUWM TMP2, TMP3, TMP3 \ // Wt + h + SS1
	VXOR f, g, TMP1 \
	VAND TMP1, e, TMP1 \
	VXOR g, TMP1, TMP1 \ // (f XOR g) AND e XOR g
	VADDUWM TMP3, TMP1, TMP3 \ // TT2
	VOR b, b, TMP1 \
	PROLD(TMP1, b, 9) \ // b = b <<< 9
	VOR TMP4, TMP4, h \ // h = TT1
	PROLD(f, f, 19) \ // f = f <<< 19
	PROLD(TMP3, TMP1, 9) \ // TMP1 = TT2 <<< 9
	PROLD(TMP1, TMP0, 8) \ // TMP0 = TT2 <<< 17
	VXOR TMP3, TMP1, TMP1 \ // TMP1 = TT2 XOR (TT2 <<< 9)
	VXOR TMP1, TMP0, d // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
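// For reference, one lane of the round above computes the following scalar
// Go sketch for j >= 16 (a hedged description only, using math/bits; w, tj
// and p0 are assumed to be in scope):
//
//	ss1 := bits.RotateLeft32(bits.RotateLeft32(a, 12)+e+tj, 7)
//	ss2 := ss1 ^ bits.RotateLeft32(a, 12)
//	tt1 := (a&b | a&c | b&c) + d + ss2 + (w[j] ^ w[j+4]) // FF: majority
//	tt2 := ((f^g)&e ^ g) + h + ss1 + w[j]                // GG: choose
//	b, f = bits.RotateLeft32(b, 9), bits.RotateLeft32(f, 19)
//	h, d = tt1, p0(tt2) // p0(x) = x ^ (x <<< 9) ^ (x <<< 17)
//
// For j < 16 both boolean functions degenerate to plain XOR, which is what
// ROUND_00_11 implements. The usual a..h rotation is avoided by permuting
// the register arguments between rounds, so each macro writes only b, d, f, h.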
// Uses general purpose registers R4-R12, R15-R19.
// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
TEXT ·blockMultBy4(SB), NOSPLIT, $0
	MOVD $8, R_x08
	MOVD $16, R_x10
	MOVD $24, R_x18
	MOVD $32, R_x20
	MOVD $48, R_x30
#ifdef NEEDS_PERMW
	MOVD $·mask(SB), R4
	LVX (R4), ESPERMW
	ADD $0x10, R4
#else
	MOVD $·mask+0x10(SB), R4
#endif
	LXVD2X (R0)(R4), M0
	LXVD2X (R_x10)(R4), M1
	LXVD2X (R_x20)(R4), M2
	LXVD2X (R_x30)(R4), M3

#define digPtr R11
#define srcPtrPtr R5
#define statePtr R4
#define blockCount R6
#define srcPtr1 R7
#define srcPtr2 R8
#define srcPtr3 R9
#define srcPtr4 R10
#define wordPtr R12
	MOVD dig+0(FP), digPtr
	MOVD p+8(FP), srcPtrPtr
	MOVD buffer+16(FP), statePtr
	MOVD blocks+24(FP), blockCount

	// load state
	MOVD (R0)(digPtr), R_TMP
	LXVW4X (R0)(R_TMP), a
	LXVW4X (R_x10)(R_TMP), e
	MOVD (R_x08)(digPtr), R_TMP
	LXVW4X (R0)(R_TMP), b
	LXVW4X (R_x10)(R_TMP), f
	MOVD (R_x10)(digPtr), R_TMP
	LXVW4X (R0)(R_TMP), c
	LXVW4X (R_x10)(R_TMP), g
	MOVD (R_x18)(digPtr), R_TMP
	LXVW4X (R0)(R_TMP), d
	LXVW4X (R_x10)(R_TMP), h

	TRANSPOSE_MATRIX(a, b, c, d)
	TRANSPOSE_MATRIX(e, f, g, h)

	MOVD (R0)(srcPtrPtr), srcPtr1
	MOVD (R_x08)(srcPtrPtr), srcPtr2
	MOVD (R_x10)(srcPtrPtr), srcPtr3
	MOVD (R_x18)(srcPtrPtr), srcPtr4
	MOVD $0, srcPtrPtr

	MOVD blockCount, CTR

loop:
	// Offload to VSR24-31 (aka FPR24-31)
	XXLOR V0, V0, VS24
	XXLOR V1, V1, VS25
	XXLOR V2, V2, VS26
	XXLOR V3, V3, VS27
	XXLOR V4, V4, VS28
	XXLOR V5, V5, VS29
	XXLOR V6, V6, VS30
	XXLOR V7, V7, VS31

	// reset wordPtr
	MOVD statePtr, wordPtr

	// load message block
	prepare4Words
	prepare4Words
	prepare4Words
	prepare4Words

	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)

	XXLXOR V0, VS24, V0
	XXLXOR V1, VS25, V1
	XXLXOR V2, VS26, V2
	XXLXOR V3, VS27, V3
	XXLXOR V4, VS28, V4
	XXLXOR V5, VS29, V5
	XXLXOR V6, VS30, V6
	XXLXOR V7, VS31, V7

	BDNZ loop

end:
	TRANSPOSE_MATRIX(a, b, c, d)
	TRANSPOSE_MATRIX(e, f, g, h)

	// save state
	MOVD (R0)(digPtr), R_TMP
	STXVW4X a, (R0)(R_TMP)
	STXVW4X e, (R_x10)(R_TMP)
	MOVD (R_x08)(digPtr), R_TMP
	STXVW4X b, (R0)(R_TMP)
	STXVW4X f, (R_x10)(R_TMP)
	MOVD (R_x10)(digPtr), R_TMP
	STXVW4X c, (R0)(R_TMP)
	STXVW4X g, (R_x10)(R_TMP)
	MOVD (R_x18)(digPtr), R_TMP
	STXVW4X d, (R0)(R_TMP)
	STXVW4X h, (R_x10)(R_TMP)

	RET

// Uses general purpose registers R4-R6, R8-R9, R16-R19.
// func copyResultsBy4(dig *uint32, dst *byte)
TEXT ·copyResultsBy4(SB), NOSPLIT, $0
	MOVD dig+0(FP), R6
	MOVD dst+8(FP), R4

#ifdef NEEDS_PERMW
	MOVD $·mask+0x00(SB), R5
	LVX (R5), ESPERMW
#endif
	MOVD $16, R5
	MOVD $32, R16
	MOVD $48, R17
	MOVD $64, R18
	MOVD $80, R19
	MOVD $96, R8
	MOVD $112, R9

	LXVD2X (R0)(R6), V0
	PPC64X_STXVD2X(V0, R0, R4)

	LXVD2X (R5)(R6), V0
	PPC64X_STXVD2X(V0, R5, R4)

	LXVD2X (R16)(R6), V0
	PPC64X_STXVD2X(V0, R16, R4)

	LXVD2X (R17)(R6), V0
	PPC64X_STXVD2X(V0, R17, R4)

	LXVD2X (R18)(R6), V0
	PPC64X_STXVD2X(V0, R18, R4)

	LXVD2X (R19)(R6), V0
	PPC64X_STXVD2X(V0, R19, R4)

	LXVD2X (R8)(R6), V0
	PPC64X_STXVD2X(V0, R8, R4)

	LXVD2X (R9)(R6), V0
	PPC64X_STXVD2X(V0, R9, R4)

	RET
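// A pure-Go model of copyResultsBy4 (a hedged sketch only; the name
// copyResultsBy4Ref and the fixed-size array types are illustrative, and it
// uses encoding/binary): the source holds 4 states x 8 words = 32 uint32,
// and each word is emitted in big-endian byte order, which is what the
// VPERM byte swap inside PPC64X_STXVD2X achieves on ppc64le.
//
//	func copyResultsBy4Ref(dig *[32]uint32, dst *[128]byte) {
//		for i, w := range dig {
//			binary.BigEndian.PutUint32(dst[i*4:], w)
//		}
//	}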