// github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/x/crypto/poly1305/sum_vmsl_s390x.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build s390x,go1.11,!gccgo,!appengine

#include "textflag.h"

// Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.

// constants
#define EX0 V1
#define EX1 V2
#define EX2 V3

// temporaries
#define T_0 V4
#define T_1 V5
#define T_2 V6
#define T_3 V7
#define T_4 V8
#define T_5 V9
#define T_6 V10
#define T_7 V11
#define T_8 V12
#define T_9 V13
#define T_10 V14

// r**2 & r**4
#define R_0 V15
#define R_1 V16
#define R_2 V17
#define R5_1 V18
#define R5_2 V19

// key (r)
#define RSAVE_0 R7
#define RSAVE_1 R8
#define RSAVE_2 R9
#define R5SAVE_1 R10
#define R5SAVE_2 R11

// message block
#define M0 V20
#define M1 V21
#define M2 V22
#define M3 V23
#define M4 V24
#define M5 V25

// accumulator
#define H0_0 V26
#define H1_0 V27
#define H2_0 V28
#define H0_1 V29
#define H1_1 V30
#define H2_1 V31

GLOBL ·keyMask<>(SB), RODATA, $16
DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f

GLOBL ·bswapMask<>(SB), RODATA, $16
DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100

GLOBL ·constants<>(SB), RODATA, $48
// EX0
DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+8(SB)/8, $0x0000050403020100
// EX1
DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+24(SB)/8, $0x00000a0908070605
// EX2
DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b

GLOBL ·c<>(SB), RODATA, $48
// EX0
DATA ·c<>+0(SB)/8, $0x0000050403020100
DATA ·c<>+8(SB)/8, $0x0000151413121110
// EX1
DATA ·c<>+16(SB)/8, $0x00000a0908070605
DATA ·c<>+24(SB)/8, $0x00001a1918171615
// EX2
DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b

GLOBL ·reduce<>(SB), RODATA, $32
// 44 bit
DATA ·reduce<>+0(SB)/8, $0x0
DATA ·reduce<>+8(SB)/8, $0xfffffffffff
// 42 bit
DATA ·reduce<>+16(SB)/8, $0x0
DATA ·reduce<>+24(SB)/8, $0x3ffffffffff
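
// Note on the representation used below: the 130-bit accumulator h is kept in
// three limbs of 44, 44 and 42 bits (h = h0 + h1*2**44 + h2*2**88), which is
// what the ·reduce<> masks above (2**44-1 and 2**42-1) correspond to. Each
// vector register carries the same limb for two independent values, one per
// 64-bit half (for example, R_0 ends up holding limb 0 of r**4 on the left
// and of r**2 on the right). VMSLG multiplies the two doubleword pairs of its
// first two operands and adds both 128-bit products plus the third operand
// into a single 128-bit sum, so one instruction accumulates two limb products.
// (This description of VMSLG is informational and slightly simplified; see the
// z14 Principles of Operation for the exact definition.)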

// h = (f*g) % (2**130-5) [partial reduction]
// uses T_0...T_9 temporary registers
// input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
// output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
#define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
	\ // Eliminate the dependency for the last 2 VMSLs
	VMSLG m02_0, r_2, m4_2, m4_2 \
	VMSLG m13_0, r_2, m5_2, m5_2 \ // 8 VMSLs pipelined
	VMSLG m02_0, r_0, m4_0, m4_0 \
	VMSLG m02_1, r5_2, V0, T_0 \
	VMSLG m02_0, r_1, m4_1, m4_1 \
	VMSLG m02_1, r_0, V0, T_1 \
	VMSLG m02_1, r_1, V0, T_2 \
	VMSLG m02_2, r5_1, V0, T_3 \
	VMSLG m02_2, r5_2, V0, T_4 \
	VMSLG m13_0, r_0, m5_0, m5_0 \
	VMSLG m13_1, r5_2, V0, T_5 \
	VMSLG m13_0, r_1, m5_1, m5_1 \
	VMSLG m13_1, r_0, V0, T_6 \
	VMSLG m13_1, r_1, V0, T_7 \
	VMSLG m13_2, r5_1, V0, T_8 \
	VMSLG m13_2, r5_2, V0, T_9 \
	VMSLG m02_2, r_0, m4_2, m4_2 \
	VMSLG m13_2, r_0, m5_2, m5_2 \
	VAQ m4_0, T_0, m02_0 \
	VAQ m4_1, T_1, m02_1 \
	VAQ m5_0, T_5, m13_0 \
	VAQ m5_1, T_6, m13_1 \
	VAQ m02_0, T_3, m02_0 \
	VAQ m02_1, T_4, m02_1 \
	VAQ m13_0, T_8, m13_0 \
	VAQ m13_1, T_9, m13_1 \
	VAQ m4_2, T_2, m02_2 \
	VAQ m5_2, T_7, m13_2 \

// SQUARE uses three limbs of r and r_2*5 to output square of r
// uses T_1, T_5 and T_7 temporary registers
// input: r_0, r_1, r_2, r5_2
// temp: TEMP0, TEMP1, TEMP2
// output: p0, p1, p2
#define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \
	VMSLG r_0, r_0, p0, p0 \
	VMSLG r_1, r5_2, V0, TEMP0 \
	VMSLG r_2, r5_2, p1, p1 \
	VMSLG r_0, r_1, V0, TEMP1 \
	VMSLG r_1, r_1, p2, p2 \
	VMSLG r_0, r_2, V0, TEMP2 \
	VAQ TEMP0, p0, p0 \
	VAQ TEMP1, p1, p1 \
	VAQ TEMP2, p2, p2 \
	VAQ TEMP0, p0, p0 \
	VAQ TEMP1, p1, p1 \
	VAQ TEMP2, p2, p2 \

// carry h0->h1->h2->h0 || h3->h4->h5->h3
// uses T_2, T_4, T_5, T_7, T_8, T_9
// (as t6, t7, t8, t9, t10, t11)
// input: h0, h1, h2, h3, h4, h5
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
// output: h0, h1, h2, h3, h4, h5
#define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
	VLM (R12), t6, t7 \ // 44 and 42 bit clear mask
	VLEIB $7, $0x28, t10 \ // 5 byte shift mask
	VREPIB $4, t8 \ // 4 bit shift mask
	VREPIB $2, t11 \ // 2 bit shift mask
	VSRLB t10, h0, t0 \ // h0 byte shift
	VSRLB t10, h1, t1 \ // h1 byte shift
	VSRLB t10, h2, t2 \ // h2 byte shift
	VSRLB t10, h3, t3 \ // h3 byte shift
	VSRLB t10, h4, t4 \ // h4 byte shift
	VSRLB t10, h5, t5 \ // h5 byte shift
	VSRL t8, t0, t0 \ // h0 bit shift
	VSRL t8, t1, t1 \ // h1 bit shift
	VSRL t11, t2, t2 \ // h2 bit shift
	VSRL t8, t3, t3 \ // h3 bit shift
	VSRL t8, t4, t4 \ // h4 bit shift
	VESLG $2, t2, t9 \ // h2 carry x5
	VSRL t11, t5, t5 \ // h5 bit shift
	VN t6, h0, h0 \ // h0 clear carry
	VAQ t2, t9, t2 \ // h2 carry x5
	VESLG $2, t5, t9 \ // h5 carry x5
	VN t6, h1, h1 \ // h1 clear carry
	VN t7, h2, h2 \ // h2 clear carry
	VAQ t5, t9, t5 \ // h5 carry x5
	VN t6, h3, h3 \ // h3 clear carry
	VN t6, h4, h4 \ // h4 clear carry
	VN t7, h5, h5 \ // h5 clear carry
	VAQ t0, h1, h1 \ // h0->h1
	VAQ t3, h4, h4 \ // h3->h4
	VAQ t1, h2, h2 \ // h1->h2
	VAQ t4, h5, h5 \ // h4->h5
	VAQ t2, h0, h0 \ // h2->h0
	VAQ t5, h3, h3 \ // h5->h3
	VREPG $1, t6, t6 \ // 44 and 42 bit masks across both halves
	VREPG $1, t7, t7 \
	VSLDB $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
	VSLDB $8, h1, h1, h1 \
	VSLDB $8, h2, h2, h2 \
	VO h0, h3, h3 \
	VO h1, h4, h4 \
	VO h2, h5, h5 \
	VESRLG $44, h3, t0 \ // 44 bit shift right
	VESRLG $44, h4, t1 \
	VESRLG $42, h5, t2 \
	VN t6, h3, h3 \ // clear carry bits
	VN t6, h4, h4 \
	VN t7, h5, h5 \
	VESLG $2, t2, t9 \ // multiply carry by 5
	VAQ t9, t2, t2 \
	VAQ t0, h4, h4 \
	VAQ t1, h5, h5 \
	VAQ t2, h3, h3 \
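
// Why the carry out of the top limb is multiplied by 5 above: the top limb is
// 42 bits, so a carry out of it represents a multiple of 2**130, and
// 2**130 = 5 (mod 2**130-5). The multiplication by 5 is done branch-free as
// (carry << 2) + carry, i.e. VESLG $2 followed by VAQ.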

// carry h0->h1->h2->h0
// input: h0, h1, h2
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
// output: h0, h1, h2
#define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
	VLEIB $7, $0x28, t3 \ // 5 byte shift mask
	VREPIB $4, t4 \ // 4 bit shift mask
	VREPIB $2, t7 \ // 2 bit shift mask
	VGBM $0x003F, t5 \ // mask to clear carry bits
	VSRLB t3, h0, t0 \
	VSRLB t3, h1, t1 \
	VSRLB t3, h2, t2 \
	VESRLG $4, t5, t5 \ // 44 bit clear mask
	VSRL t4, t0, t0 \
	VSRL t4, t1, t1 \
	VSRL t7, t2, t2 \
	VESRLG $2, t5, t6 \ // 42 bit clear mask
	VESLG $2, t2, t8 \
	VAQ t8, t2, t2 \
	VN t5, h0, h0 \
	VN t5, h1, h1 \
	VN t6, h2, h2 \
	VAQ t0, h1, h1 \
	VAQ t1, h2, h2 \
	VAQ t2, h0, h0 \
	VSRLB t3, h0, t0 \
	VSRLB t3, h1, t1 \
	VSRLB t3, h2, t2 \
	VSRL t4, t0, t0 \
	VSRL t4, t1, t1 \
	VSRL t7, t2, t2 \
	VN t5, h0, h0 \
	VN t5, h1, h1 \
	VESLG $2, t2, t8 \
	VN t6, h2, h2 \
	VAQ t0, h1, h1 \
	VAQ t8, t2, t2 \
	VAQ t1, h2, h2 \
	VAQ t2, h0, h0 \

// expands two message blocks into the lower halves of the d registers and
// moves the previous contents of the d registers into the upper halves
// input: in1, in2, d0, d1, d2, d3, d4, d5
// temp: TEMP0, TEMP1, TEMP2, TEMP3
// output: d0, d1, d2, d3, d4, d5
#define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
	VGBM $0xff3f, TEMP0 \
	VGBM $0xff1f, TEMP1 \
	VESLG $4, d1, TEMP2 \
	VESLG $4, d4, TEMP3 \
	VESRLG $4, TEMP0, TEMP0 \
	VPERM in1, d0, EX0, d0 \
	VPERM in2, d3, EX0, d3 \
	VPERM in1, d2, EX2, d2 \
	VPERM in2, d5, EX2, d5 \
	VPERM in1, TEMP2, EX1, d1 \
	VPERM in2, TEMP3, EX1, d4 \
	VN TEMP0, d0, d0 \
	VN TEMP0, d3, d3 \
	VESRLG $4, d1, d1 \
	VESRLG $4, d4, d4 \
	VN TEMP1, d2, d2 \
	VN TEMP1, d5, d5 \
	VN TEMP0, d1, d1 \
	VN TEMP0, d4, d4 \

// expands one message block into the lower halves of the d registers and
// moves the previous contents of the d registers into the upper halves
// input: in, d0, d1, d2
// temp: TEMP0, TEMP1, TEMP2
// output: d0, d1, d2
#define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
	VGBM $0xff3f, TEMP0 \
	VESLG $4, d1, TEMP2 \
	VGBM $0xff1f, TEMP1 \
	VPERM in, d0, EX0, d0 \
	VESRLG $4, TEMP0, TEMP0 \
	VPERM in, d2, EX2, d2 \
	VPERM in, TEMP2, EX1, d1 \
	VN TEMP0, d0, d0 \
	VN TEMP1, d2, d2 \
	VESRLG $4, d1, d1 \
	VN TEMP0, d1, d1 \

// pack h2:h0 into h1:h0 (no carry)
// input: h0, h1, h2
// output: h0, h1, h2
#define PACK(h0, h1, h2) \
	VMRLG h1, h2, h2 \ // copy h1 to upper half h2
	VESLG $44, h1, h1 \ // shift limb 1 44 bits, leaving 20
	VO h0, h1, h0 \ // combine h0 with 20 bits from limb 1
	VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1
	VLEIG $1, $0, h1 \ // clear h2 stuff from lower half of h1
	VO h0, h1, h0 \ // h0 now has 88 bits (limb 0 and 1)
	VLEIG $0, $0, h2 \ // clear upper half of h2
	VESRLG $40, h2, h1 \ // h1 now has upper two bits of result
	VLEIB $7, $88, h1 \ // for byte shift (11 bytes)
	VSLB h1, h2, h2 \ // shift h2 11 bytes to the left
	VO h0, h2, h0 \ // combine h0 with the low bits of limb 2
	VLEIG $0, $0, h1 \ // clear upper half of h1

// if h >= 2**130-5 then h -= 2**130-5
// input: h0, h1
// temp: t0, t1, t2
// output: h0
#define MOD(h0, h1, t0, t1, t2) \
	VZERO t0 \
	VLEIG $1, $5, t0 \
	VACCQ h0, t0, t1 \
	VAQ h0, t0, t0 \
	VONE t2 \
	VLEIG $1, $-4, t2 \
	VAQ t2, t1, t1 \
	VACCQ h1, t1, t1 \
	VONE t2 \
	VAQ t2, t1, t1 \
	VN h0, t1, t2 \
	VNC t0, t1, t1 \
	VO t1, t2, h0 \
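
// MOD above is a branch-free conditional subtraction: adding 5 to h and
// dropping bit 130 is equivalent to subtracting 2**130-5, and the VACCQ/VAQ
// chain propagates the carry out of bit 130 into an all-zeros or all-ones
// mask, which the VN/VNC/VO sequence then uses to select either h or h+5.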

// func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
TEXT ·poly1305vmsl(SB), $0-32
	// This code processes 6 blocks up front and then 4 blocks (64 bytes) per
	// loop iteration, using the algorithm described in:
	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
	// and as modified for VMSL as described in
	// Accelerating Poly1305 Cryptographic Message Authentication on the z14
	// O'Farrell et al, CASCON 2017, p48-55
	// https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht

	LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
	VZERO V0 // c

	// load EX0, EX1 and EX2
	MOVD $·constants<>(SB), R5
	VLM (R5), EX0, EX2 // c

	// setup r
	VL (R4), T_0
	MOVD $·keyMask<>(SB), R6
	VL (R6), T_1
	VN T_0, T_1, T_0
	VZERO T_2 // limbs for r
	VZERO T_3
	VZERO T_4
	EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)

	// T_2, T_3, T_4: [0, r]

	// setup r*20
	VLEIG $0, $0, T_0
	VLEIG $1, $20, T_0 // T_0: [0, 20]
	VZERO T_5
	VZERO T_6
	VMSLG T_0, T_3, T_5, T_5
	VMSLG T_0, T_4, T_6, T_6

	// store r for final block in GR
	VLGVG $1, T_2, RSAVE_0 // c
	VLGVG $1, T_3, RSAVE_1 // c
	VLGVG $1, T_4, RSAVE_2 // c
	VLGVG $1, T_5, R5SAVE_1 // c
	VLGVG $1, T_6, R5SAVE_2 // c

	// initialize h
	VZERO H0_0
	VZERO H1_0
	VZERO H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	// initialize pointer for reduce constants
	MOVD $·reduce<>(SB), R12

	// calculate r**2 and 20*(r**2)
	VZERO R_0
	VZERO R_1
	VZERO R_2
	SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
	REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
	VZERO R5_1
	VZERO R5_2
	VMSLG T_0, R_1, R5_1, R5_1
	VMSLG T_0, R_2, R5_2, R5_2

	// skip r**4 calculation if 3 blocks or fewer
	CMPBLE R3, $48, b4

	// calculate r**4 and 20*(r**4)
	VZERO T_8
	VZERO T_9
	VZERO T_10
	SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
	REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
	VZERO T_2
	VZERO T_3
	VMSLG T_0, T_9, T_2, T_2
	VMSLG T_0, T_10, T_3, T_3

	// put r**2 to the right and r**4 to the left of R_0, R_1, R_2
	VSLDB $8, T_8, T_8, T_8
	VSLDB $8, T_9, T_9, T_9
	VSLDB $8, T_10, T_10, T_10
	VSLDB $8, T_2, T_2, T_2
	VSLDB $8, T_3, T_3, T_3

	VO T_8, R_0, R_0
	VO T_9, R_1, R_1
	VO T_10, R_2, R_2
	VO T_2, R5_1, R5_1
	VO T_3, R5_2, R5_2

	CMPBLE R3, $80, load // less than or equal to 5 blocks in message

	// 6 (or 5+1) blocks
	SUB $81, R3
	VLM (R2), M0, M4
	VLL R3, 80(R2), M5
	ADD $1, R3
	MOVBZ $1, R0
	CMPBGE R3, $16, 2(PC)
	VLVGB R3, R0, M5
	MOVD $96(R2), R2
	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
	EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
	VLEIB $2, $1, H2_0
	VLEIB $2, $1, H2_1
	VLEIB $10, $1, H2_0
	VLEIB $10, $1, H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO T_4
	VZERO T_10
	EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
	VLR T_4, M4
	VLEIB $10, $1, M2
	CMPBLT R3, $16, 2(PC)
	VLEIB $10, $1, T_10
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG V0, H0_1, H0_0
	VMRHG V0, H1_1, H1_0
	VMRHG V0, H2_1, H2_0
	VMRLG V0, H0_1, H0_1
	VMRLG V0, H1_1, H1_1
	VMRLG V0, H2_1, H2_1

	SUB $16, R3
	CMPBLE R3, $0, square
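
// Roughly, the prologue above runs only for messages longer than 80 bytes and
// absorbs their first six blocks (the sixth possibly partial): the first four
// blocks are multiplied by [r**4, r**2] while blocks five and six are added in
// by the same MULTIPLY/REDUCE pass. A partial block is padded with a 0x01 byte
// as Poly1305 requires; full blocks instead get their 2**128 bit set via
// VLEIB. R_0..R_2 and R5_1, R5_2 hold [r**4, r**2] in their two doubleword
// halves, so the main loop below advances each accumulator half by two blocks,
// i.e. four blocks (64 bytes) per iteration.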

load:
	// load EX0, EX1 and EX2
	MOVD $·c<>(SB), R5
	VLM (R5), EX0, EX2

loop:
	CMPBLE R3, $64, add // b4 // last 4 or fewer blocks left

	// next 4 full blocks
	VLM (R2), M2, M5
	SUB $64, R3
	MOVD $64(R2), R2
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)

	// expacc in-lined to create [m2, m3] limbs
	VGBM $0x3f3f, T_0 // 44 bit clear mask
	VGBM $0x1f1f, T_1 // 40 bit clear mask
	VPERM M2, M3, EX0, T_3
	VESRLG $4, T_0, T_0 // 44 bit clear mask ready
	VPERM M2, M3, EX1, T_4
	VPERM M2, M3, EX2, T_5
	VN T_0, T_3, T_3
	VESRLG $4, T_4, T_4
	VN T_1, T_5, T_5
	VN T_0, T_4, T_4
	VMRHG H0_1, T_3, H0_0
	VMRHG H1_1, T_4, H1_0
	VMRHG H2_1, T_5, H2_0
	VMRLG H0_1, T_3, H0_1
	VMRLG H1_1, T_4, H1_1
	VMRLG H2_1, T_5, H2_1
	VLEIB $10, $1, H2_0
	VLEIB $10, $1, H2_1
	VPERM M4, M5, EX0, T_3
	VPERM M4, M5, EX1, T_4
	VPERM M4, M5, EX2, T_5
	VN T_0, T_3, T_3
	VESRLG $4, T_4, T_4
	VN T_1, T_5, T_5
	VN T_0, T_4, T_4
	VMRHG V0, T_3, M0
	VMRHG V0, T_4, M1
	VMRHG V0, T_5, M2
	VMRLG V0, T_3, M3
	VMRLG V0, T_4, M4
	VMRLG V0, T_5, M5
	VLEIB $10, $1, M2
	VLEIB $10, $1, M5

	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	CMPBNE R3, $0, loop
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG V0, H0_1, H0_0
	VMRHG V0, H1_1, H1_0
	VMRHG V0, H2_1, H2_0
	VMRLG V0, H0_1, H0_1
	VMRLG V0, H1_1, H1_1
	VMRLG V0, H2_1, H2_1

	// load EX0, EX1, EX2
	MOVD $·constants<>(SB), R5
	VLM (R5), EX0, EX2

	// sum vectors
	VAQ H0_0, H0_1, H0_0
	VAQ H1_0, H1_1, H1_0
	VAQ H2_0, H2_1, H2_0

	// h may be >= 2*(2**130-5) so we need to reduce it again
	// M0...M4 are used as temps here
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
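
// The finishing sequence below ("next:") works on the single 3-limb value left
// after the two halves have been summed: it propagates one more carry from h1
// into h2, packs the limbs into a 130-bit value split across H0_0 (low 128
// bits) and H1_0 (top bits), reduces it modulo 2**130-5 with MOD, adds s (the
// second 16 bytes of the key, with byte swaps around the addition to account
// for endianness), and stores the 16-byte tag.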

next: // carry h1->h2
	VLEIB $7, $0x28, T_1
	VREPIB $4, T_2
	VGBM $0x003F, T_3
	VESRLG $4, T_3

	// byte shift
	VSRLB T_1, H1_0, T_4

	// bit shift
	VSRL T_2, T_4, T_4

	// clear h1 carry bits
	VN T_3, H1_0, H1_0

	// add carry
	VAQ T_4, H2_0, H2_0

	// h is now < 2*(2**130-5)
	// pack h into h1 (hi) and h0 (lo)
	PACK(H0_0, H1_0, H2_0)

	// if h >= 2**130-5 then h -= 2**130-5
	MOD(H0_0, H1_0, T_0, T_1, T_2)

	// h += s
	MOVD $·bswapMask<>(SB), R5
	VL (R5), T_1
	VL 16(R4), T_0
	VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
	VAQ T_0, H0_0, H0_0
	VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little)
	VST H0_0, (R1)
	RET
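
// Everything from "add:" down deals with the last 64 or fewer bytes of the
// message. add: re-reduces and merges the two accumulator halves after the
// main loop; b4 (also the entry point for messages of three blocks or fewer),
// b3, b2 and b1 then absorb the last 4, 3, 2 or 1 (possibly partial) blocks,
// rebuilding [r**2, r] in the vector registers from the scalar copies saved in
// RSAVE_0..RSAVE_2 and R5SAVE_1/R5SAVE_2 where needed, and padding a final
// partial block with a 0x01 byte.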

add:
	// load EX0, EX1, EX2
	MOVD $·constants<>(SB), R5
	VLM (R5), EX0, EX2

	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG V0, H0_1, H0_0
	VMRHG V0, H1_1, H1_0
	VMRHG V0, H2_1, H2_0
	VMRLG V0, H0_1, H0_1
	VMRLG V0, H1_1, H1_1
	VMRLG V0, H2_1, H2_1
	CMPBLE R3, $64, b4

b4:
	CMPBLE R3, $48, b3 // 3 blocks or fewer

	// 4 (3+1) blocks remaining
	SUB $49, R3
	VLM (R2), M0, M2
	VLL R3, 48(R2), M3
	ADD $1, R3
	MOVBZ $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB R3, R0, M3
	MOVD $64(R2), R2
	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
	VLEIB $10, $1, H2_0
	VLEIB $10, $1, H2_1
	VZERO M0
	VZERO M1
	VZERO M4
	VZERO M5
	VZERO T_4
	VZERO T_10
	EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
	VLR T_4, M2
	VLEIB $10, $1, M4
	CMPBNE R3, $16, 2(PC)
	VLEIB $10, $1, T_10
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG V0, H0_1, H0_0
	VMRHG V0, H1_1, H1_0
	VMRHG V0, H2_1, H2_0
	VMRLG V0, H0_1, H0_1
	VMRLG V0, H1_1, H1_1
	VMRLG V0, H2_1, H2_1
	SUB $16, R3
	CMPBLE R3, $0, square // this condition must always hold true!

b3:
	CMPBLE R3, $32, b2

	// 3 blocks remaining

	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO H0_1, H0_0, H0_0
	VO H1_1, H1_0, H1_0
	VO H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// H*[r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)

	SUB $33, R3
	VLM (R2), M0, M1
	VLL R3, 32(R2), M2
	ADD $1, R3
	MOVBZ $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB R3, R0, M2

	// H += m0
	VZERO T_1
	VZERO T_2
	VZERO T_3
	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
	VLEIB $10, $1, T_3
	VAG H0_0, T_1, H0_0
	VAG H1_0, T_2, H1_0
	VAG H2_0, T_3, H2_0

	VZERO M0
	VZERO M3
	VZERO M4
	VZERO M5
	VZERO T_10

	// (H+m0)*r
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)

	// H += m1
	VZERO V0
	VZERO T_1
	VZERO T_2
	VZERO T_3
	EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
	VLEIB $10, $1, T_3
	VAQ H0_0, T_1, H0_0
	VAQ H1_0, T_2, H1_0
	VAQ H2_0, T_3, H2_0
	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)

	// [H, m2] * [r**2, r]
	EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
	CMPBNE R3, $16, 2(PC)
	VLEIB $10, $1, H2_0
	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
	SUB $16, R3
	CMPBLE R3, $0, next // this condition must always hold true!

b2:
	CMPBLE R3, $16, b1

	// 2 blocks remaining

	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO H0_1, H0_0, H0_0
	VO H1_1, H1_0, H1_0
	VO H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// H*[r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG V0, H0_1, H0_0
	VMRHG V0, H1_1, H1_0
	VMRHG V0, H2_1, H2_0
	VMRLG V0, H0_1, H0_1
	VMRLG V0, H1_1, H1_1
	VMRLG V0, H2_1, H2_1

	// move h to the left and 0s at the right
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0

	// get message blocks and append 1 to start
	SUB $17, R3
	VL (R2), M0
	VLL R3, 16(R2), M1
	ADD $1, R3
	MOVBZ $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB R3, R0, M1
	VZERO T_6
	VZERO T_7
	VZERO T_8
	EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
	EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
	VLEIB $2, $1, T_8
	CMPBNE R3, $16, 2(PC)
	VLEIB $10, $1, T_8

	// add [m0, m1] to h
	VAG H0_0, T_6, H0_0
	VAG H1_0, T_7, H1_0
	VAG H2_0, T_8, H2_0

	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5
	VZERO T_10
	VZERO M0

	// at this point R_0 .. R5_2 look like [r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
	SUB $16, R3, R3
	CMPBLE R3, $0, next

b1:
	CMPBLE R3, $0, next

	// 1 block remaining

	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO H0_1, H0_0, H0_0
	VO H1_1, H1_0, H1_0
	VO H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// H*[r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)

	// set up [0, m0] limbs
	SUB $1, R3
	VLL R3, (R2), M0
	ADD $1, R3
	MOVBZ $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB R3, R0, M0
	VZERO T_1
	VZERO T_2
	VZERO T_3
	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6) // limbs: [0, m]
	CMPBNE R3, $16, 2(PC)
	VLEIB $10, $1, T_3

	// h+m0
	VAQ H0_0, T_1, H0_0
	VAQ H1_0, T_2, H1_0
	VAQ H2_0, T_3, H2_0

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)

	BR next
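
// square: is reached when no message bytes remain to be absorbed -- either
// from the 6-block prologue (when the whole message fit into it) or from b4
// after the final blocks have been folded in. All that is left to do is to
// combine the two accumulator halves with one H*[r**2, r] multiplication
// before jumping to the finishing code at "next:".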

square:
	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO H0_1, H0_0, H0_0
	VO H1_1, H1_0, H1_0
	VO H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// (h0*r**2) + (h1*r)
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
	BR next

TEXT ·hasVMSLFacility(SB), NOSPLIT, $24-1
	MOVD $x-24(SP), R1
	XC $24, 0(R1), 0(R1) // clear the storage
	MOVD $2, R0 // R0 is the number of double words stored -1
	WORD $0xB2B01000 // STFLE 0(R1)
	XOR R0, R0 // reset the value of R0
	MOVBZ z-8(SP), R1
	AND $0x01, R1
	BEQ novmsl

vectorinstalled:
	// check if the vector instruction has been enabled
	VLEIB $0, $0xF, V16
	VLGVB $0, V16, R1
	CMPBNE R1, $0xF, novmsl
	MOVB $1, ret+0(FP) // have vx
	RET

novmsl:
	MOVB $0, ret+0(FP) // no vx
	RET
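
// For reference, a minimal sketch of how an assembly routine with the
// poly1305vmsl signature above is typically declared and driven from Go. The
// wrapper name and the nil-pointer handling for an empty message are
// illustrative assumptions, not this package's actual API:
//
//	//go:noescape
//	func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
//
//	func sumVMSL(out *[16]byte, msg []byte, key *[32]byte) {
//		var p *byte
//		if len(msg) > 0 {
//			p = &msg[0]
//		}
//		poly1305vmsl(out, p, uint64(len(msg)), key)
//	}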