// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"
#include "sm3_const_asm.s"

// Working state words a..h. They are loaded/stored as one contiguous
// register range (R1..R8) with LMY/STMY.
#define a R1
#define b R2
#define c R3
#define d R4
#define e R5
#define f R6
#define g R7
#define h R8

// Function arguments and loop bound.
#define CTX R9
#define INP R10
#define LEN R11
#define END R12

// Scratch registers used by the round macros.
// NOTE: y0, y1 and y2 alias CTX, LEN and END. Inside the loop END is
// therefore kept on the stack and CTX is reloaded from the frame before
// the final store.
#define y0 R9
#define y1 R11
#define y2 R12

// Four vector registers holding a sliding window of 16 message words.
#define XWORD0 V0
#define XWORD1 V1
#define XWORD2 V2
#define XWORD3 V3

// Vector temporaries for the message schedule.
#define XTMP0 V4
#define XTMP1 V5
#define XTMP2 V6
#define XTMP3 V7
#define XTMP4 V8

// Holds W' (= W[j] xor W[j+4]) for four rounds before it is spilled.
#define XFER V9

// SS12 computes both SS values of one round:
//   ss1 = ((a <<< 12) + e + const) <<< 7
//   ss2 = ss1 xor (a <<< 12)
// a and e are left unchanged.
#define SS12(a, e, const, ss1, ss2) \
	RLL $12, a, ss2;     \ // ss2 = a <<< 12
	ADD $const, e, ss1;  \ // ss1 = e + T
	ADD ss2, ss1;        \ // ss1 = (a <<< 12) + e + T
	RLL $7, ss1;         \ // ss1 = SS1
	XOR ss1, ss2

// P0 computes out = tt2 xor (tt2 <<< 9) xor (tt2 <<< 17).
// tmp is clobbered; tt2 is left unchanged.
#define P0(tt2, tmp, out) \
	RLL $9, tt2, tmp;    \
	RLL $17, tt2, out;   \
	XOR tmp, out;        \
	XOR tt2, out

// For rounds [0 - 16)
// addr1 for w, addr2 for w'
//
// On exit: h holds tt1, d holds P0(tt2), b has been rotated left by 9 and
// f by 19. The caller rotates the register names by one position for the
// next round instead of moving values. Clobbers y0, y1, y2.
#define DO_ROUND_N_0(addr1, addr2, const, a, b, c, d, e, f, g, h) \
	;                                          \
	SS12(a, e, const, y2, y0);                 \
	MOVWZ addr1, y1;                           \
	ADD   y1, y2;                              \ // y2 = SS1 + W
	ADD   h, y2;                               \ // y2 = h + SS1 + W
	MOVWZ addr2, y1;                           \
	ADD   y1, y0;                              \ // y0 = SS2 + W'
	ADD   d, y0;                               \ // y0 = d + SS2 + W'
	;                                          \
	XOR   a, b, h;                             \
	XOR   c, h;                                \ // h = a xor b xor c = FF(a, b, c)
	ADD   y0, h;                               \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	;                                          \
	XOR   e, f, y1;                            \
	XOR   g, y1;                               \ // y1 = e xor f xor g = GG(e, f, g)
	ADD   y1, y2;                              \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	;                                          \
	RLL   $9, b;                               \
	RLL   $19, f;                              \
	;                                          \
	P0(y2, y0, d)

// For rounds [16 - 64)
// addr1 for w, addr2 for w'
//
// Same register contract as DO_ROUND_N_0; only the boolean functions
// differ (majority for FF, choose for GG).
#define DO_ROUND_N_1(addr1, addr2, const, a, b, c, d, e, f, g, h) \
	;                                          \
	SS12(a, e, const, y2, y0);                 \
	MOVWZ addr1, y1;                           \
	ADD   y1, y2;                              \ // y2 = SS1 + W
	ADD   h, y2;                               \ // y2 = h + SS1 + W
	MOVWZ addr2, y1;                           \
	ADD   y1, y0;                              \ // y0 = SS2 + W'
	ADD   d, y0;                               \ // y0 = d + SS2 + W'
	;                                          \
	OR    a, b, y1;                            \
	AND   a, b, h;                             \
	AND   c, y1;                               \
	OR    y1, h;                               \ // h = (a AND b) OR (a AND c) OR (b AND c)
	ADD   y0, h;                               \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	;                                          \
	XOR   f, g, y1;                            \
	AND   e, y1;                               \
	XOR   g, y1;                               \ // y1 = GG2(e, f, g)
	ADD   y1, y2;                              \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
	;                                          \
	RLL   $9, b;                               \
	RLL   $19, f;                              \
	;                                          \
	P0(y2, y0, d)

// r = s <<< n
#define PROLD(s, r, n) \
	VERLLF $n, s, r

// MESSAGE_SCHEDULE derives the next four message words from the 16-word
// window {XWORD0, XWORD1, XWORD2, XWORD3} and writes them over XWORD0.
// In the comments below w0..w15 are the window contents and w[0]..w[3]
// the newly produced words. The fourth new word depends on the first,
// so P1 is evaluated twice: the first pass is only used for lane 0
// (w[0]), which is then fed into the second pass.
// Clobbers XTMP0..XTMP4.
#define MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) \
	VSLDB $12, XWORD0, XWORD1, XTMP0;  \ // XTMP0 = W[-13] = {w3, w4, w5, w6}
	PROLD(XTMP0, XTMP1, 7);            \ // XTMP1 = W[-13] rol 7
	VSLDB $8, XWORD2, XWORD3, XTMP0;   \ // XTMP0 = W[-6] = {w10, w11, w12, w13}
	VX XTMP0, XTMP1, XTMP0;            \ // XTMP0 = W[-6] xor (W[-13] rol 7)
	;                                  \ // Prepare P1 parameters
	VSLDB $12, XWORD1, XWORD2, XTMP1;  \ // XTMP1 = W[-9] = {w7, w8, w9, w10}
	VX XTMP1, XWORD0, XTMP1;           \ // XTMP1 = W[-9] xor W[-16]
	VSLDB $4, XWORD3, XWORD2, XTMP3;   \ // XTMP3 = W[-3] = {w13, w14, w15, w8}
	PROLD(XTMP3, XTMP2, 15);           \ // XTMP2 = W[-3] rol 15
	VX XTMP1, XTMP2, XTMP2;            \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABxx}
	;                                  \ // P1
	PROLD(XTMP2, XTMP4, 15);           \ // XTMP4 = XTMP2 rol 15 {ABxx}
	PROLD(XTMP4, XTMP3, 8);            \ // XTMP3 = XTMP2 rol 23 {ABxx}
	VX XTMP2, XTMP4, XTMP4;            \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx})
	VX XTMP4, XTMP3, XTMP4;            \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {ABxx}) XOR (XTMP2 rol 23 {ABxx})
	;                                  \ // First pass result; only lane 0 (w[0]) is consumed below
	VX XTMP4, XTMP0, XTMP2;            \ // XTMP2 = {w[0], w[1], ..., ...}
	;                                  \ // Prepare P1 parameters
	VSLDB $4, XWORD3, XTMP2, XTMP3;    \ // XTMP3 = W[-3] = {w13, w14, w15, w[0]}
	PROLD(XTMP3, XTMP4, 15);           \ // XTMP4 = W[-3] rol 15
	VX XTMP1, XTMP4, XTMP4;            \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {ABCD}
	;                                  \ // P1
	PROLD(XTMP4, XTMP3, 15);           \ // XTMP3 = XTMP4 rol 15 {ABCD}
	PROLD(XTMP3, XTMP1, 8);            \ // XTMP1 = XTMP4 rol 23 {ABCD}
	VX XTMP4, XTMP3, XTMP3;            \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {ABCD})
	VX XTMP3, XTMP1, XTMP1;            \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {ABCD}) XOR (XTMP4 rol 23 {ABCD})
	;                                  \ // 4 words message schedule result
	VX XTMP1, XTMP0, XWORD0 // XWORD0 = {w[0], w[1], w[2], w[3]}

// For the usage of tmp-xx(SP), I referred to the code of
// https://github.com/golang/go/blob/master/src/crypto/md5/md5block_s390x.s
//
// func block(dig *digest, p []byte)
//
// Folds every complete 64-byte chunk of p into the eight 32-bit state
// words stored at the start of *dig. The length is rounded down to a
// multiple of 64; trailing bytes are ignored. When p contains no complete
// chunk the state is written back unchanged.
//
// Frame layout (72 bytes of locals):
//   tmp-8(SP)               saved END (END's register doubles as y2)
//   tmp-40(SP)..tmp-12(SP)  state a..h on entry to the current chunk
//   tmp-56(SP)              four message words W for the next four rounds
//   tmp-72(SP)              four words W' = W[j] xor W[j+4] for those rounds
TEXT ·block(SB),NOSPLIT,$72-32
	MOVD dig+0(FP), CTX
	MOVD p+8(FP), INP
	MOVD p_len+16(FP), LEN
	AND  $-64, LEN              // whole chunks only
	LAY  (INP)(LEN*1), END      // END = INP + LEN

	// Load the state before the empty-input check: the end label stores
	// a..h back into *dig unconditionally, so they must hold the current
	// state even when the loop is skipped.
	LMY    0(CTX), a, h
	CMPBEQ INP, END, end
	MOVD   END, tmp-8(SP)       // backup END

loop:
	STMY a, h, tmp-40(SP)       // backup state
	VLM  (INP), XWORD0, XWORD3  // load the 16 message words of this chunk

schedule_compress: // for w0 - w47
	// Rounds 0-3; schedule W[16..19] into XWORD0
	VST XWORD0, tmp-56(SP)
	VX  XWORD0, XWORD1, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T0, a, b, c, d, e, f, g, h)
	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T1, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T2, g, h, a, b, c, d, e, f)
	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T3, f, g, h, a, b, c, d, e)

	// Rounds 4-7; schedule W[20..23] into XWORD1
	VST XWORD1, tmp-56(SP)
	VX  XWORD1, XWORD2, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T4, e, f, g, h, a, b, c, d)
	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T5, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T6, c, d, e, f, g, h, a, b)
	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T7, b, c, d, e, f, g, h, a)

	// Rounds 8-11; schedule W[24..27] into XWORD2
	VST XWORD2, tmp-56(SP)
	VX  XWORD2, XWORD3, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T8, a, b, c, d, e, f, g, h)
	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T9, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T10, g, h, a, b, c, d, e, f)
	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T11, f, g, h, a, b, c, d, e)

	// Rounds 12-15; schedule W[28..31] into XWORD3
	VST XWORD3, tmp-56(SP)
	VX  XWORD3, XWORD0, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T12, e, f, g, h, a, b, c, d)
	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T13, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T14, c, d, e, f, g, h, a, b)
	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T15, b, c, d, e, f, g, h, a)

	// Rounds 16-19; schedule W[32..35] into XWORD0
	VST XWORD0, tmp-56(SP)
	VX  XWORD0, XWORD1, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T16, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T17, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T18, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T19, f, g, h, a, b, c, d, e)

	// Rounds 20-23; schedule W[36..39] into XWORD1
	VST XWORD1, tmp-56(SP)
	VX  XWORD1, XWORD2, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T20, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T21, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T22, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T23, b, c, d, e, f, g, h, a)

	// Rounds 24-27; schedule W[40..43] into XWORD2
	VST XWORD2, tmp-56(SP)
	VX  XWORD2, XWORD3, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T24, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T25, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T26, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T27, f, g, h, a, b, c, d, e)

	// Rounds 28-31; schedule W[44..47] into XWORD3
	VST XWORD3, tmp-56(SP)
	VX  XWORD3, XWORD0, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T28, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T29, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T30, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T31, b, c, d, e, f, g, h, a)

	// Rounds 32-35; schedule W[48..51] into XWORD0
	VST XWORD0, tmp-56(SP)
	VX  XWORD0, XWORD1, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T32, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T33, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T34, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T35, f, g, h, a, b, c, d, e)

	// Rounds 36-39; schedule W[52..55] into XWORD1
	VST XWORD1, tmp-56(SP)
	VX  XWORD1, XWORD2, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T36, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T37, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T38, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T39, b, c, d, e, f, g, h, a)

	// Rounds 40-43; schedule W[56..59] into XWORD2
	VST XWORD2, tmp-56(SP)
	VX  XWORD2, XWORD3, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T40, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T41, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T42, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T43, f, g, h, a, b, c, d, e)

	// Rounds 44-47; schedule W[60..63] into XWORD3
	VST XWORD3, tmp-56(SP)
	VX  XWORD3, XWORD0, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T44, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T45, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T46, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T47, b, c, d, e, f, g, h, a)

	// Last 16 rounds: W[48..63] are already in XWORD0..XWORD3, so only
	// one more schedule step is needed (W[64..67], used for W' of
	// rounds 60-63).

	// Rounds 48-51
	VST XWORD0, tmp-56(SP)
	VX  XWORD0, XWORD1, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T48, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T49, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T50, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T51, f, g, h, a, b, c, d, e)

	// Rounds 52-55
	VST XWORD1, tmp-56(SP)
	VX  XWORD1, XWORD2, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T52, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T53, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T54, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T55, b, c, d, e, f, g, h, a)

	// Rounds 56-59; W and W' are spilled first, then XWORD0 is replaced
	// by W[64..67].
	VST XWORD2, tmp-56(SP)
	VX  XWORD2, XWORD3, XFER
	VST XFER, tmp-72(SP)
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T56, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T57, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T58, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T59, f, g, h, a, b, c, d, e)

	// Rounds 60-63
	VST XWORD3, tmp-56(SP)
	VX  XWORD3, XWORD0, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T60, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T61, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T62, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T63, b, c, d, e, f, g, h, a)

	// Feed-forward: xor the state saved at the top of the loop into the
	// new state. END's register is free here and is used as a temporary.
	MOVWZ tmp-40(SP), END
	XOR   END, a
	MOVWZ tmp-36(SP), END
	XOR   END, b
	MOVWZ tmp-32(SP), END
	XOR   END, c
	MOVWZ tmp-28(SP), END
	XOR   END, d
	MOVWZ tmp-24(SP), END
	XOR   END, e
	MOVWZ tmp-20(SP), END
	XOR   END, f
	MOVWZ tmp-16(SP), END
	XOR   END, g
	MOVWZ tmp-12(SP), END
	XOR   END, h

	LA     64(INP), INP         // advance to the next chunk
	MOVD   tmp-8(SP), END       // restore END
	CMPBLT INP, END, loop

end:
	MOVD dig+0(FP), CTX         // CTX's register was reused as y0; reload it
	STMY a, h, 0(CTX)
	RET