dnl  HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C 8000,8200:            6.5
C 8500,8600,8700:       5.625

C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
C  could be saved there per call.

C  DESCRIPTION:
C  The main loop "BIG" is 4-way unrolled, mainly to allow
C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
C  registers to the IU registers, have demanded a deep software pipeline, and
C  a lot of stack slots for partial products in flight.
C
C  CODE STRUCTURE:
C    save-some-registers
C    do 0, 1, 2, or 3 limbs
C    if done, restore-some-regs and return
C    save-many-regs
C    do 4, 8, ... limb
C    restore-all-regs

C  STACK LAYOUT:
C    HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
C    slots marked FREE, as well as some slots in the caller's "frame marker".
C
C      -00 <- r30
C      -08  FREE
C      -10  tmp
C      -18  tmp
C      -20  tmp
C      -28  tmp
C      -30  tmp
C      -38  tmp
C      -40  tmp
C      -48  tmp
C      -50  tmp
C      -58  tmp
C      -60  tmp
C      -68  tmp
C      -70  tmp
C      -78  tmp
C      -80  tmp
C      -88  tmp
C      -90  FREE
C      -98  FREE
C      -a0  FREE
C      -a8  FREE
C      -b0  r13
C      -b8  r12
C      -c0  r11
C      -c8  r10
C      -d0  r9
C      -d8  r8
C      -e0  r7
C      -e8  r6
C      -f0  r5
C      -f8  r4
C     -100  r3
C  Previous frame:
C      [unused area]
C      -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.


C  NOTE(review): config.m4 was already included near the top of this file.
C  The second include is kept exactly as found; confirm whether it is needed.
include(`../config.m4')

C  INTERFACE:
C    mpn_mul_1 multiplies the n-limb vector at up by the single limb vlimb,
C    stores the n low result limbs at rp and returns the final carry limb.
C    The carry is returned in %r28 for the 2.0w ABI; otherwise it is split
C    into %r28 (upper 32 bits) and %r29 (lower 32 bits).
C
C    n = 0 is not handled: when n mod 4 is zero, control goes straight to
C    the BIG code, which always processes at least four limbs.
C
C  METHOD:
C    Each 64x64 product is formed from four 32x32 xmpyu products (low, high
C    and two mid products).  They are stored from the FP registers to the
C    tmp stack slots and reloaded into integer registers, where the two mid
C    products are summed, split at bit 32, and added to the low/high products
C    with a carry chain (ADD,DC).

C INPUT PARAMETERS:
define(`rp',`%r26')     C destination limb vector
define(`up',`%r25')     C source limb vector
define(`n',`%r24')      C limb count
define(`vlimb',`%r23')  C multiplier limb

define(`climb',`%r23')  C carry limb; shares %r23 with vlimb, see the prologue

ifdef(`HAVE_ABI_2_0w',
`       .level  2.0w
',`     .level  2.0
')
PROLOGUE(mpn_mul_1)

C vlimb must be in memory before %r23 is reused as climb: it is loaded into
C %fr8 from its home slot, which is at -0x138 once r30 has been advanced by
C 0x100.
ifdef(`HAVE_ABI_2_0w',
`       std             vlimb, -0x38(%r30)      C store vlimb into "home" slot
')
        std,ma          %r3, 0x100(%r30)        C save r3 and allocate the 0x100 byte frame
        std             %r4, -0xf8(%r30)
        std             %r5, -0xf0(%r30)
        ldo             0(%r0), climb           C clear climb
        fldd            -0x138(%r30), %fr8      C put vlimb in fp register

C Register names for the 0..3 limb code.  Several names share a register:
C m032 and ma064 are both %r20, so the order of the depd/extrd instructions
C that read m032 and write ma064 matters.
define(`p032a1',`%r1')  C first mid product
define(`p032a2',`%r19') C second mid product

define(`m032',`%r20')   C sum of the two mid products
define(`m096',`%r21')   C carry out of that sum

define(`p000a',`%r22')  C low product
define(`p064a',`%r29')  C high product

define(`s000',`%r31')   C result limb being assembled

define(`ma000',`%r4')   C mid sum shifted left 32 (contribution to this limb)
define(`ma064',`%r20')  C mid sum shifted right 32 plus carry (contribution to next limb)

C define(`r000',`%r3')  C FIXME don't save r3 for n < 4.

        extrd,u         n, 63, 2, %r5           C r5 = n mod 4
        cmpb,=          %r5, %r0, L(BIG)        C nothing to peel off if n mod 4 = 0
        nop

C First odd limb: form its four partial products and store them to the stack.
        fldd            0(up), %fr4
        ldo             8(up), up
        xmpyu           %fr8R, %fr4L, %fr22
        xmpyu           %fr8L, %fr4R, %fr23
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
        xmpyu           %fr8R, %fr4R, %fr24
        xmpyu           %fr8L, %fr4L, %fr25
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
        fstd            %fr24, -0x80(%r30)      C low product to  -0x80..-0x79
        addib,<>        -1, %r5, L(two_or_more) C more than one odd limb?
        fstd            %fr25, -0x68(%r30)      C high product to -0x68..-0x61
LDEF(one)
C Exactly one odd limb: reload its products and go to the final wind-down.
        ldd             -0x78(%r30), p032a1
        ldd             -0x70(%r30), p032a2
        ldd             -0x80(%r30), p000a
        b               L(0_one_out)
        ldd             -0x68(%r30), p064a      C (delay slot)

LDEF(two_or_more)
C Second odd limb: start its products while reloading the first limb's
C products.  Each ldd reads a slot just before the fstd that overwrites it.
        fldd            0(up), %fr4
        ldo             8(up), up
        xmpyu           %fr8R, %fr4L, %fr22
        xmpyu           %fr8L, %fr4R, %fr23
        ldd             -0x78(%r30), p032a1
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
        xmpyu           %fr8R, %fr4R, %fr24
        xmpyu           %fr8L, %fr4L, %fr25
        ldd             -0x70(%r30), p032a2
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
        ldd             -0x80(%r30), p000a
        fstd            %fr24, -0x80(%r30)      C low product to  -0x80..-0x79
        ldd             -0x68(%r30), p064a
        addib,<>        -1, %r5, L(three_or_more)
        fstd            %fr25, -0x68(%r30)      C high product to -0x68..-0x61
LDEF(two)
C Exactly two odd limbs: combine the first limb's mid products, then wind down.
        add             p032a1, p032a2, m032    C sum of the mid products
        add,dc          %r0, %r0, m096          C carry out of that sum
        depd,z          m032, 31, 32, ma000     C ma000 = low half of m032, moved to the upper half
        extrd,u         m032, 31, 32, ma064     C ma064 = upper half of m032
        b               L(0_two_out)
        depd            m096, 31, 32, ma064     C (delay slot) carry into upper half of ma064

LDEF(three_or_more)
C r5 holds two bits, so reaching here means exactly three odd limbs.  The
C general loop below (loop0) is therefore disabled and control falls through
C into a single pass of its wind-down copy at 0_out.
        fldd            0(up), %fr4
        add             p032a1, p032a2, m032
        add,dc          %r0, %r0, m096
        depd,z          m032, 31, 32, ma000
        extrd,u         m032, 31, 32, ma064
C       addib,=         -1, %r5, L(0_out)
        depd            m096, 31, 32, ma064
LDEF(loop0)
C       xmpyu           %fr8R, %fr4L, %fr22
C       xmpyu           %fr8L, %fr4R, %fr23
C       ldd             -0x78(%r30), p032a1
C       fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
C
C       xmpyu           %fr8R, %fr4R, %fr24
C       xmpyu           %fr8L, %fr4L, %fr25
C       ldd             -0x70(%r30), p032a2
C       fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
C
C       ldo             8(rp), rp
C       add             climb, p000a, s000
C       ldd             -0x80(%r30), p000a
C       fstd            %fr24, -0x80(%r30)      C low product to  -0x80..-0x79
C
C       add,dc          p064a, %r0, climb
C       ldo             8(up), up
C       ldd             -0x68(%r30), p064a
C       fstd            %fr25, -0x68(%r30)      C high product to -0x68..-0x61
C
C       add             ma000, s000, s000
C       add,dc          ma064, climb, climb
C       fldd            0(up), %fr4
C
C       std             s000, -8(rp)
C
C       add             p032a1, p032a2, m032
C       add,dc          %r0, %r0, m096
C
C       depd,z          m032, 31, 32, ma000
C       extrd,u         m032, 31, 32, ma064
C       addib,<>        -1, %r5, L(loop0)
C       depd            m096, 31, 32, ma064
LDEF(0_out)
C Products for the third limb are started here while the first limb is
C completed and stored.
        ldo             8(up), up
        xmpyu           %fr8R, %fr4L, %fr22
        xmpyu           %fr8L, %fr4R, %fr23
        ldd             -0x78(%r30), p032a1
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
        xmpyu           %fr8R, %fr4R, %fr24
        xmpyu           %fr8L, %fr4L, %fr25
        ldd             -0x70(%r30), p032a2
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
        ldo             8(rp), rp
        add             climb, p000a, s000      C incoming carry + low product
        ldd             -0x80(%r30), p000a
        fstd            %fr24, -0x80(%r30)      C low product to  -0x80..-0x79
        add,dc          p064a, %r0, climb       C high product + carry starts the next carry limb
        ldd             -0x68(%r30), p064a
        fstd            %fr25, -0x68(%r30)      C high product to -0x68..-0x61
        add             ma000, s000, s000       C add shifted mid sum
        add,dc          ma064, climb, climb
        std             s000, -8(rp)            C rp was already advanced, hence -8
        add             p032a1, p032a2, m032
        add,dc          %r0, %r0, m096
        depd,z          m032, 31, 32, ma000
        extrd,u         m032, 31, 32, ma064
        depd            m096, 31, 32, ma064
LDEF(0_two_out)
C No new products are started from here on; finish the limb whose products
C are already in integer registers and reload those of the last limb.
        ldd             -0x78(%r30), p032a1
        ldd             -0x70(%r30), p032a2
        ldo             8(rp), rp
        add             climb, p000a, s000
        ldd             -0x80(%r30), p000a
        add,dc          p064a, %r0, climb
        ldd             -0x68(%r30), p064a
        add             ma000, s000, s000
        add,dc          ma064, climb, climb
        std             s000, -8(rp)
LDEF(0_one_out)
C Last odd limb.
        add             p032a1, p032a2, m032
        add,dc          %r0, %r0, m096
        depd,z          m032, 31, 32, ma000
        extrd,u         m032, 31, 32, ma064
        depd            m096, 31, 32, ma064

        add             climb, p000a, s000
        add,dc          p064a, %r0, climb
        add             ma000, s000, s000
        add,dc          ma064, climb, climb
        std             s000, 0(rp)

C If 4 >= n there are no groups of four limbs left, so return.  n mod 4 is
C nonzero on this path, so this means n < 4.
        cmpib,>=        4, n, L(done)
        ldo             8(rp), rp               C (delay slot)

C 4-way unrolled code.

LDEF(BIG)

C The register names are redefined for the unrolled code.  Names sharing a
C register are never live at the same time: pNNNx1/pNNNx2 hold the two mid
C products of limbs a..d and are consumed into mNNN before the same registers
C are reloaded with the low/high products pNNNx.
define(`p032a1',`%r1')  C
define(`p032a2',`%r19') C
define(`p096b1',`%r20') C
define(`p096b2',`%r21') C
define(`p160c1',`%r22') C
define(`p160c2',`%r29') C
define(`p224d1',`%r31') C
define(`p224d2',`%r3')  C
C
define(`m032',`%r4')    C
define(`m096',`%r5')    C
define(`m160',`%r6')    C
define(`m224',`%r7')    C
define(`m288',`%r8')    C
C
define(`p000a',`%r1')   C
define(`p064a',`%r19')  C
define(`p064b',`%r20')  C
define(`p128b',`%r21')  C
define(`p128c',`%r22')  C
define(`p192c',`%r29')  C
define(`p192d',`%r31')  C
define(`p256d',`%r3')   C
C
define(`s000',`%r10')   C
define(`s064',`%r11')   C
define(`s128',`%r12')   C
define(`s192',`%r13')   C
C
define(`ma000',`%r9')   C
define(`ma064',`%r4')   C
define(`ma128',`%r5')   C
define(`ma192',`%r6')   C
define(`ma256',`%r7')   C

C Save the additional registers used by the unrolled code (r3..r5 were saved
C in the prologue).
        std             %r6, -0xe8(%r30)
        std             %r7, -0xe0(%r30)
        std             %r8, -0xd8(%r30)
        std             %r9, -0xd0(%r30)
        std             %r10, -0xc8(%r30)
        std             %r11, -0xc0(%r30)
        std             %r12, -0xb8(%r30)
        std             %r13, -0xb0(%r30)

C From here on n counts groups of four limbs.
ifdef(`HAVE_ABI_2_0w',
`       extrd,u         n, 61, 62, n            C right shift 2
',`     extrd,u         n, 61, 30, n            C right shift 2, zero extend
')

LDEF(4_or_more)
C Feed-in: load the first four limbs and start all sixteen partial products.
        fldd            0(up), %fr4
        fldd            8(up), %fr5
        fldd            16(up), %fr6
        fldd            24(up), %fr7
        xmpyu           %fr8R, %fr4L, %fr22
        xmpyu           %fr8L, %fr4R, %fr23
        xmpyu           %fr8R, %fr5L, %fr24
        xmpyu           %fr8L, %fr5R, %fr25
        xmpyu           %fr8R, %fr6L, %fr26
        xmpyu           %fr8L, %fr6R, %fr27
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
        xmpyu           %fr8R, %fr7L, %fr28
        xmpyu           %fr8L, %fr7R, %fr29
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
        xmpyu           %fr8R, %fr4R, %fr30
        xmpyu           %fr8L, %fr4L, %fr31
        fstd            %fr24, -0x38(%r30)      C mid product to  -0x38..-0x31
        xmpyu           %fr8R, %fr5R, %fr22
        xmpyu           %fr8L, %fr5L, %fr23
        fstd            %fr25, -0x30(%r30)      C mid product to  -0x30..-0x29
        xmpyu           %fr8R, %fr6R, %fr24
        xmpyu           %fr8L, %fr6L, %fr25
        fstd            %fr26, -0x58(%r30)      C mid product to  -0x58..-0x51
        xmpyu           %fr8R, %fr7R, %fr26
        fstd            %fr27, -0x50(%r30)      C mid product to  -0x50..-0x49
        addib,<>        -1, n, L(8_or_more)     C more than one group of four?
        xmpyu           %fr8L, %fr7L, %fr27     C (delay slot)
C Exactly one group: store the remaining products, reload the mid products
C and finish in end1.
        fstd            %fr28, -0x18(%r30)      C mid product to  -0x18..-0x11
        fstd            %fr29, -0x10(%r30)      C mid product to  -0x10..-0x09
        fstd            %fr30, -0x80(%r30)      C low product to  -0x80..-0x79
        fstd            %fr31, -0x68(%r30)      C high product to -0x68..-0x61
        fstd            %fr22, -0x40(%r30)      C low product to  -0x40..-0x39
        fstd            %fr23, -0x28(%r30)      C high product to -0x28..-0x21
        fstd            %fr24, -0x60(%r30)      C low product to  -0x60..-0x59
        fstd            %fr25, -0x48(%r30)      C high product to -0x48..-0x41
        fstd            %fr26, -0x20(%r30)      C low product to  -0x20..-0x19
        fstd            %fr27, -0x88(%r30)      C high product to -0x88..-0x81
        ldd             -0x78(%r30), p032a1
        ldd             -0x70(%r30), p032a2
        ldd             -0x38(%r30), p096b1
        ldd             -0x30(%r30), p096b2
        ldd             -0x58(%r30), p160c1
        ldd             -0x50(%r30), p160c2
        ldd             -0x18(%r30), p224d1
        ldd             -0x10(%r30), p224d2
        b               L(end1)
        nop

LDEF(8_or_more)
C At least two groups: store the rest of the first group's products, then
C start the second group's products while reloading the first group's mid
C products into integer registers.
        fstd            %fr28, -0x18(%r30)      C mid product to  -0x18..-0x11
        fstd            %fr29, -0x10(%r30)      C mid product to  -0x10..-0x09
        ldo             32(up), up
        fstd            %fr30, -0x80(%r30)      C low product to  -0x80..-0x79
        fstd            %fr31, -0x68(%r30)      C high product to -0x68..-0x61
        fstd            %fr22, -0x40(%r30)      C low product to  -0x40..-0x39
        fstd            %fr23, -0x28(%r30)      C high product to -0x28..-0x21
        fstd            %fr24, -0x60(%r30)      C low product to  -0x60..-0x59
        fstd            %fr25, -0x48(%r30)      C high product to -0x48..-0x41
        fstd            %fr26, -0x20(%r30)      C low product to  -0x20..-0x19
        fstd            %fr27, -0x88(%r30)      C high product to -0x88..-0x81
        fldd            0(up), %fr4
        fldd            8(up), %fr5
        fldd            16(up), %fr6
        fldd            24(up), %fr7
        xmpyu           %fr8R, %fr4L, %fr22
        ldd             -0x78(%r30), p032a1
        xmpyu           %fr8L, %fr4R, %fr23
        xmpyu           %fr8R, %fr5L, %fr24
        ldd             -0x70(%r30), p032a2
        xmpyu           %fr8L, %fr5R, %fr25
        xmpyu           %fr8R, %fr6L, %fr26
        ldd             -0x38(%r30), p096b1
        xmpyu           %fr8L, %fr6R, %fr27
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
        xmpyu           %fr8R, %fr7L, %fr28
        ldd             -0x30(%r30), p096b2
        xmpyu           %fr8L, %fr7R, %fr29
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
        xmpyu           %fr8R, %fr4R, %fr30
        ldd             -0x58(%r30), p160c1
        xmpyu           %fr8L, %fr4L, %fr31
        fstd            %fr24, -0x38(%r30)      C mid product to  -0x38..-0x31
        xmpyu           %fr8R, %fr5R, %fr22
        ldd             -0x50(%r30), p160c2
        xmpyu           %fr8L, %fr5L, %fr23
        fstd            %fr25, -0x30(%r30)      C mid product to  -0x30..-0x29
        xmpyu           %fr8R, %fr6R, %fr24
        ldd             -0x18(%r30), p224d1
        xmpyu           %fr8L, %fr6L, %fr25
        fstd            %fr26, -0x58(%r30)      C mid product to  -0x58..-0x51
        xmpyu           %fr8R, %fr7R, %fr26
        ldd             -0x10(%r30), p224d2
        fstd            %fr27, -0x50(%r30)      C mid product to  -0x50..-0x49
        addib,=         -1, n, L(end2)          C exactly two groups: skip the loop
        xmpyu           %fr8L, %fr7L, %fr27     C (delay slot)
LDEF(loop)
C Main loop.  Each pass completes and stores one group of four result limbs,
C stores the products of the following group to the stack slots, and loads
C and starts multiplying the group after that.
        add             p032a1, p032a2, m032
        ldd             -0x80(%r30), p000a
        add,dc          p096b1, p096b2, m096
        fstd            %fr28, -0x18(%r30)      C mid product to  -0x18..-0x11

        add,dc          p160c1, p160c2, m160
        ldd             -0x68(%r30), p064a
        add,dc          p224d1, p224d2, m224
        fstd            %fr29, -0x10(%r30)      C mid product to  -0x10..-0x09

        add,dc          %r0, %r0, m288
        ldd             -0x40(%r30), p064b
        ldo             32(up), up
        fstd            %fr30, -0x80(%r30)      C low product to  -0x80..-0x79

        depd,z          m032, 31, 32, ma000
        ldd             -0x28(%r30), p128b
        extrd,u         m032, 31, 32, ma064
        fstd            %fr31, -0x68(%r30)      C high product to -0x68..-0x61

        depd            m096, 31, 32, ma064
        ldd             -0x60(%r30), p128c
        extrd,u         m096, 31, 32, ma128
        fstd            %fr22, -0x40(%r30)      C low product to  -0x40..-0x39

        depd            m160, 31, 32, ma128
        ldd             -0x48(%r30), p192c
        extrd,u         m160, 31, 32, ma192
        fstd            %fr23, -0x28(%r30)      C high product to -0x28..-0x21

        depd            m224, 31, 32, ma192
        ldd             -0x20(%r30), p192d
        extrd,u         m224, 31, 32, ma256
        fstd            %fr24, -0x60(%r30)      C low product to  -0x60..-0x59

        depd            m288, 31, 32, ma256
        ldd             -0x88(%r30), p256d
        add             climb, p000a, s000
        fstd            %fr25, -0x48(%r30)      C high product to -0x48..-0x41

        add,dc          p064a, p064b, s064
        add,dc          p128b, p128c, s128
        fstd            %fr26, -0x20(%r30)      C low product to  -0x20..-0x19

        add,dc          p192c, p192d, s192
        add,dc          p256d, %r0, climb
        fstd            %fr27, -0x88(%r30)      C high product to -0x88..-0x81

        add             ma000, s000, s000       C accum mid 0
        fldd            0(up), %fr4
        add,dc          ma064, s064, s064       C accum mid 1
        std             s000, 0(rp)

        add,dc          ma128, s128, s128       C accum mid 2
        fldd            8(up), %fr5
        add,dc          ma192, s192, s192       C accum mid 3
        std             s064, 8(rp)

        add,dc          ma256, climb, climb
        fldd            16(up), %fr6
        std             s128, 16(rp)

        xmpyu           %fr8R, %fr4L, %fr22
        ldd             -0x78(%r30), p032a1
        xmpyu           %fr8L, %fr4R, %fr23
        fldd            24(up), %fr7

        xmpyu           %fr8R, %fr5L, %fr24
        ldd             -0x70(%r30), p032a2
        xmpyu           %fr8L, %fr5R, %fr25
        std             s192, 24(rp)

        xmpyu           %fr8R, %fr6L, %fr26
        ldd             -0x38(%r30), p096b1
        xmpyu           %fr8L, %fr6R, %fr27
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71

        xmpyu           %fr8R, %fr7L, %fr28
        ldd             -0x30(%r30), p096b2
        xmpyu           %fr8L, %fr7R, %fr29
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69

        xmpyu           %fr8R, %fr4R, %fr30
        ldd             -0x58(%r30), p160c1
        xmpyu           %fr8L, %fr4L, %fr31
        fstd            %fr24, -0x38(%r30)      C mid product to  -0x38..-0x31

        xmpyu           %fr8R, %fr5R, %fr22
        ldd             -0x50(%r30), p160c2
        xmpyu           %fr8L, %fr5L, %fr23
        fstd            %fr25, -0x30(%r30)      C mid product to  -0x30..-0x29

        xmpyu           %fr8R, %fr6R, %fr24
        ldd             -0x18(%r30), p224d1
        xmpyu           %fr8L, %fr6L, %fr25
        fstd            %fr26, -0x58(%r30)      C mid product to  -0x58..-0x51

        xmpyu           %fr8R, %fr7R, %fr26
        ldd             -0x10(%r30), p224d2
        fstd            %fr27, -0x50(%r30)      C mid product to  -0x50..-0x49
        xmpyu           %fr8L, %fr7L, %fr27

        addib,<>        -1, n, L(loop)
        ldo             32(rp), rp              C (delay slot) advance rp on every pass

LDEF(end2)
C Wind-down, second to last group: same work as a loop pass, but no further
C limbs are loaded and no new products are started.
        add             p032a1, p032a2, m032
        ldd             -0x80(%r30), p000a
        add,dc          p096b1, p096b2, m096
        fstd            %fr28, -0x18(%r30)      C mid product to  -0x18..-0x11
        add,dc          p160c1, p160c2, m160
        ldd             -0x68(%r30), p064a
        add,dc          p224d1, p224d2, m224
        fstd            %fr29, -0x10(%r30)      C mid product to  -0x10..-0x09
        add,dc          %r0, %r0, m288
        ldd             -0x40(%r30), p064b
        fstd            %fr30, -0x80(%r30)      C low product to  -0x80..-0x79
        depd,z          m032, 31, 32, ma000
        ldd             -0x28(%r30), p128b
        extrd,u         m032, 31, 32, ma064
        fstd            %fr31, -0x68(%r30)      C high product to -0x68..-0x61
        depd            m096, 31, 32, ma064
        ldd             -0x60(%r30), p128c
        extrd,u         m096, 31, 32, ma128
        fstd            %fr22, -0x40(%r30)      C low product to  -0x40..-0x39
        depd            m160, 31, 32, ma128
        ldd             -0x48(%r30), p192c
        extrd,u         m160, 31, 32, ma192
        fstd            %fr23, -0x28(%r30)      C high product to -0x28..-0x21
        depd            m224, 31, 32, ma192
        ldd             -0x20(%r30), p192d
        extrd,u         m224, 31, 32, ma256
        fstd            %fr24, -0x60(%r30)      C low product to  -0x60..-0x59
        depd            m288, 31, 32, ma256
        ldd             -0x88(%r30), p256d
        add             climb, p000a, s000
        fstd            %fr25, -0x48(%r30)      C high product to -0x48..-0x41
        add,dc          p064a, p064b, s064
        add,dc          p128b, p128c, s128
        fstd            %fr26, -0x20(%r30)      C low product to  -0x20..-0x19
        add,dc          p192c, p192d, s192
        add,dc          p256d, %r0, climb
        fstd            %fr27, -0x88(%r30)      C high product to -0x88..-0x81
        add             ma000, s000, s000       C accum mid 0
        add,dc          ma064, s064, s064       C accum mid 1
        add,dc          ma128, s128, s128       C accum mid 2
        add,dc          ma192, s192, s192       C accum mid 3
        add,dc          ma256, climb, climb
        std             s000, 0(rp)
        std             s064, 8(rp)
        ldd             -0x78(%r30), p032a1
        std             s128, 16(rp)
        ldd             -0x70(%r30), p032a2
        std             s192, 24(rp)
        ldd             -0x38(%r30), p096b1
        ldd             -0x30(%r30), p096b2
        ldd             -0x58(%r30), p160c1
        ldd             -0x50(%r30), p160c2
        ldd             -0x18(%r30), p224d1
        ldd             -0x10(%r30), p224d2
        ldo             32(rp), rp

LDEF(end1)
C Wind-down, last group: all products are on the stack and the mid products
C are already in integer registers.
        add             p032a1, p032a2, m032
        ldd             -0x80(%r30), p000a
        add,dc          p096b1, p096b2, m096
        add,dc          p160c1, p160c2, m160
        ldd             -0x68(%r30), p064a
        add,dc          p224d1, p224d2, m224
        add,dc          %r0, %r0, m288
        ldd             -0x40(%r30), p064b
        depd,z          m032, 31, 32, ma000
        ldd             -0x28(%r30), p128b
        extrd,u         m032, 31, 32, ma064
        depd            m096, 31, 32, ma064
        ldd             -0x60(%r30), p128c
        extrd,u         m096, 31, 32, ma128
        depd            m160, 31, 32, ma128
        ldd             -0x48(%r30), p192c
        extrd,u         m160, 31, 32, ma192
        depd            m224, 31, 32, ma192
        ldd             -0x20(%r30), p192d
        extrd,u         m224, 31, 32, ma256
        depd            m288, 31, 32, ma256
        ldd             -0x88(%r30), p256d
        add             climb, p000a, s000
        add,dc          p064a, p064b, s064
        add,dc          p128b, p128c, s128
        add,dc          p192c, p192d, s192
        add,dc          p256d, %r0, climb
        add             ma000, s000, s000       C accum mid 0
        add,dc          ma064, s064, s064       C accum mid 1
        add,dc          ma128, s128, s128       C accum mid 2
        add,dc          ma192, s192, s192       C accum mid 3
        add,dc          ma256, climb, climb
        std             s000, 0(rp)
        std             s064, 8(rp)
        std             s128, 16(rp)
        std             s192, 24(rp)

C Restore the registers saved on entry to the unrolled code.
        ldd             -0xb0(%r30), %r13
        ldd             -0xb8(%r30), %r12
        ldd             -0xc0(%r30), %r11
        ldd             -0xc8(%r30), %r10
        ldd             -0xd0(%r30), %r9
        ldd             -0xd8(%r30), %r8
        ldd             -0xe0(%r30), %r7
        ldd             -0xe8(%r30), %r6
LDEF(done)
C Return the carry limb: whole in %r28 for 2.0w, otherwise lower 32 bits in
C %r29 and upper 32 bits in %r28.  Then restore r3..r5 and release the frame.
ifdef(`HAVE_ABI_2_0w',
`       copy            climb, %r28
',`     extrd,u         climb, 63, 32, %r29
        extrd,u         climb, 31, 32, %r28
')
        ldd             -0xf0(%r30), %r5
        ldd             -0xf8(%r30), %r4
        bve             (%r2)
        ldd,mb          -0x100(%r30), %r3       C (delay slot) restore r3, pop the frame
EPILOGUE(mpn_mul_1)