dnl  SPARC v9 64-bit mpn_addmul_2 -- Multiply an n limb number with a 2-limb
dnl  number and add the result to an n limb vector.

dnl  Copyright 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:      9
C UltraSPARC 3:	      10

C Algorithm: We use 16 floating-point multiplies per limb product, with the
C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
C split into 32-bit pieces.  We sum four 48-bit partial products using
C floating-point add, then convert the resulting four 50-bit quantities and
C transfer them to the integer unit.

C Possible optimizations:
C   1. Align the stack area where we transfer the four 50-bit product-sums
C      to a 32-byte boundary.  That would minimize the cache collision.
C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
C      be to align the area to map to the area immediately before up?)
C   2. Perform two of the fp->int conversions with integer instructions.  We
C      can get almost ten free IEU slots, if we clean up bookkeeping and the
C      silly carry-limb code.
C   3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
C      code.

C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
C FI	= 20
C L	=  9 x un * vn
C WDFI = 10 x vn / 2
C WD	= 4

C Instruction classification (as per UltraSPARC functional units).
C Assuming silly carry code is fixed.  Includes bookkeeping.
C
C               mpn_addmul_X     mpn_mul_X
C                1       2       1       2
C               ==========      ==========
C      FM        8      16       8      16
C      FA       10      18      10      18
C     MEM       12      12      10      10
C  ISHIFT        6       6       6       6
C IADDLOG       11      11      10      10
C  BRANCH        1       1       1       1
C
C TOTAL IEU     17      17      16      16
C TOTAL         48      64      45      61
C
C IEU cycles     8.5     8.5     8       8
C MEM cycles    12      12      10      10
C ISSUE cycles  12      16      11.25   15.25
C FPU cycles    10      18      10      18
C cycles/loop   12      18      12      18
C cycles/limb   12       9      12       9


C INPUT PARAMETERS
C rp[n + 1]	i0
C up[n]		i1
C n		i2
C vp[2]		i3
C
C RESULT
C Result limbs are written to rp as two 32-bit halves each (stw to offset 4
C for the low half, then to offset 0 for the high half).  Wind-down phases 1
C and 2 still read the old rp contents (lduw) before storing; wind-down
C phase 3 stores one further limb without reading rp first.  The return value
C is formed by the add in the delay slot of the final return instruction and
C is written to o0.
C
C NOTE(review): n is presumably required to be >= 1, since up[0] is loaded
C before n is tested -- confirm against the callers.


ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

C Combine registers:
C u00_hi= u32_hi
C u00_lo= u32_lo
C a000  = out000
C a016  = out016
C Free: f52 f54

C REGISTER ROLES (all names below are m4 aliases)
C p000..p112b	fp products of one 32-bit half of a u limb and one 16-bit
C		piece of v
C a000..a080	fp sums of those products
C out000,out016	results of fdtox, stored to the stack scratch area with std
C v000..v112	the eight 16-bit pieces of vp[0] and vp[1], converted to
C		double during initialization
C u00, u32	low and high 32-bit half of the current u limb as doubles
C u00_hi/u00_lo	the two single registers of the f2 pair; the hi word is kept
C		zero and the lo word receives a 32-bit load, after which
C		fxtod converts the whole pair.  Same for u32_hi/u32_lo on f4.
C cy, rlimb	integer carry and running limb sum
C i00, i16	integer copies of the two converted product-sums
C r00, r32	low and high 32 bits of the rp limb being updated
C xffffffff	mask with the low 32 bits set
C xffff		mask with the low 16 bits set (only set up and used in the
C		initialization code that is selected without HAVE_VIS)

define(`p000', `%f8')  define(`p016',`%f10')
define(`p032',`%f12')  define(`p048',`%f14')
define(`p064',`%f16')  define(`p080',`%f18')
define(`p096a',`%f20') define(`p112a',`%f22')
define(`p096b',`%f56') define(`p112b',`%f58')

define(`out000',`%f0') define(`out016',`%f6')

define(`v000',`%f24')  define(`v016',`%f26')
define(`v032',`%f28')  define(`v048',`%f30')
define(`v064',`%f44')  define(`v080',`%f46')
define(`v096',`%f48')  define(`v112',`%f50')

define(`u00',`%f32')    define(`u32', `%f34')

define(`a000',`%f36')  define(`a016',`%f38')
define(`a032',`%f40')  define(`a048',`%f42')
define(`a064',`%f60')  define(`a080',`%f62')

define(`u00_hi',`%f2') define(`u32_hi',`%f4')
define(`u00_lo',`%f3') define(`u32_lo',`%f5')

define(`cy',`%g1')
define(`rlimb',`%g3')
define(`i00',`%l0')    define(`i16',`%l1')
define(`r00',`%l2')    define(`r32',`%l3')
define(`xffffffff',`%l7')
define(`xffff',`%o0')


PROLOGUE(mpn_addmul_2)

C Initialization.  (1) Split v operand into eight 16-bit chunks and store them
C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
C This code could be better scheduled.

C Stack scratch area: the 32 bytes starting at %sp+2223 carry the converted
C product-sums from the fp unit to the integer unit (std, later ldx).  In the
C initialization code selected without HAVE_VIS, the 64 bytes starting at
C %sp+2223 are also used once to carry the 16-bit pieces of v from the integer
C unit to the fp unit (stx, then ldd), and the all-zero upper word of the
C first slot is reloaded to clear u00_hi and u32_hi.
C NOTE(review): 2223 is presumably the 2047 byte V9 stack bias plus the 176
C byte minimum frame -- confirm against the ABI.
C
C With HAVE_VIS the pieces are fetched straight into fp registers with ldda
C through %asi = 0xD2, and u00_hi/u32_hi are cleared with fzero.
C NOTE(review): 0xD2 is presumably the UltraSPARC 16-bit short floating-point
C load ASI -- confirm against the processor manual.

	save	%sp, -256, %sp

ifdef(`HAVE_VIS',
`	mov	-1, %g4
	wr	%g0, 0xD2, %asi
	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
	ldda	[%i3+6] %asi, v000
	ldda	[%i3+4] %asi, v016
	ldda	[%i3+2] %asi, v032
	ldda	[%i3+0] %asi, v048
	fxtod	v000, v000
	ldda	[%i3+14] %asi, v064
	fxtod	v016, v016
	ldda	[%i3+12] %asi, v080
	fxtod	v032, v032
	ldda	[%i3+10] %asi, v096
	fxtod	v048, v048
	ldda	[%i3+8] %asi, v112
	fxtod	v064, v064
	fxtod	v080, v080
	fxtod	v096, v096
	fxtod	v112, v112
	fzero	u00_hi
	fzero	u32_hi
',
`	mov	-1, %g4
	ldx	[%i3+0], %l0		C vp[0]
	srlx	%g4, 48, xffff		C store mask in register `xffff'
	ldx	[%i3+8], %l1		C vp[1]

	and	%l0, xffff, %g2
	stx	%g2, [%sp+2223+0]
	srlx	%l0, 16, %g3
	and	%g3, xffff, %g3
	stx	%g3, [%sp+2223+8]
	srlx	%l0, 32, %g2
	and	%g2, xffff, %g2
	stx	%g2, [%sp+2223+16]
	srlx	%l0, 48, %g3
	stx	%g3, [%sp+2223+24]
	and	%l1, xffff, %g2
	stx	%g2, [%sp+2223+32]
	srlx	%l1, 16, %g3
	and	%g3, xffff, %g3
	stx	%g3, [%sp+2223+40]
	srlx	%l1, 32, %g2
	and	%g2, xffff, %g2
	stx	%g2, [%sp+2223+48]
	srlx	%l1, 48, %g3
	stx	%g3, [%sp+2223+56]

	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'

	ldd	[%sp+2223+0], v000
	ldd	[%sp+2223+8], v016
	ldd	[%sp+2223+16], v032
	ldd	[%sp+2223+24], v048
	fxtod	v000, v000
	ldd	[%sp+2223+32], v064
	fxtod	v016, v016
	ldd	[%sp+2223+40], v080
	fxtod	v032, v032
	ldd	[%sp+2223+48], v096
	fxtod	v048, v048
	ldd	[%sp+2223+56], v112
	fxtod	v064, v064
	ld	[%sp+2223+0], u00_hi	C zero u00_hi
	fxtod	v080, v080
	ld	[%sp+2223+0], u32_hi	C zero u32_hi
	fxtod	v096, v096
	fxtod	v112, v112
')
C Initialization done.
C Clear the integer carry state (%g2, rlimb, %g4) and bias rp by one limb so
C that the stores, which follow an 8 byte increment of %i0, hit the right limb.
	mov	0, %g2
	mov	0, rlimb
	mov	0, %g4
	add	%i0, -8, %i0		C BOOKKEEPING

C Start software pipeline.
C The first u limb is multiplied here; the branch in whose delay slot the last
C multiply sits selects the n = 1 tail (.L1) or the longer paths.

	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
	fxtod	u00_hi, u00
C mid
	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
	fmuld	u00, v000, a000
	fmuld	u00, v016, a016
	fmuld	u00, v032, a032
	fmuld	u00, v048, a048
	add	%i2, -1, %i2		C BOOKKEEPING
	fmuld	u00, v064, p064
	add	%i1, 8, %i1		C BOOKKEEPING
	fxtod	u32_hi, u32
	fmuld	u00, v080, p080
	fmuld	u00, v096, p096a
	brnz,pt	%i2, .L_2_or_more
	fmuld	u00, v112, p112a

C No more u limbs after the first one: finish the fp work for it and join the
C integer wind-down at phase 2.
.L1:	fdtox	a000, out000
	fmuld	u32, v000, p000
	fdtox	a016, out016
	fmuld	u32, v016, p016
	fmovd	p064, a064
	fmuld	u32, v032, p032
	fmovd	p080, a080
	fmuld	u32, v048, p048
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064
	std	out016, [%sp+2223+24]
	fxtod	u00_hi, u00
	faddd	p016, a048, a016
	fmuld	u32, v080, p080
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C mid
	fdtox	a000, out000
	fdtox	a016, out016
	faddd	p064, p096a, a064
	faddd	p080, p112a, a080
	std	out000, [%sp+2223+0]
	b	.L_wd2
	std	out016, [%sp+2223+8]

C At least one more u limb: load it and run one pipeline stage that has fp
C work only (no integer accumulation yet), then either enter the main loop or
C go to wind-down phase 1.
.L_2_or_more:
	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
	fdtox	a000, out000
	fmuld	u32, v000, p000
	fdtox	a016, out016
	fmuld	u32, v016, p016
	fmovd	p064, a064
	fmuld	u32, v032, p032
	fmovd	p080, a080
	fmuld	u32, v048, p048
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064
	std	out016, [%sp+2223+24]
	fxtod	u00_hi, u00
	faddd	p016, a048, a016
	fmuld	u32, v080, p080
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C mid
	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
	fdtox	a000, out000
	fmuld	u00, v000, p000
	fdtox	a016, out016
	fmuld	u00, v016, p016
	faddd	p064, p096a, a064
	fmuld	u00, v032, p032
	faddd	p080, p112a, a080
	fmuld	u00, v048, p048
	add	%i2, -1, %i2		C BOOKKEEPING
	std	out000, [%sp+2223+0]
	faddd	p000, a032, a000
	fmuld	u00, v064, p064
	add	%i1, 8, %i1		C BOOKKEEPING
	std	out016, [%sp+2223+8]
	fxtod	u32_hi, u32
	faddd	p016, a048, a016
	fmuld	u00, v080, p080
	faddd	p032, a064, a032
	fmuld	u00, v096, p096a
	faddd	p048, a080, a048
	brnz,pt	%i2, .L_3_or_more
	fmuld	u00, v112, p112a

	b	.Lend
	nop

C                       64      32       0
C                        .       .       .
C                        .       |__rXXX_|      32
C                        .      |___cy___|      34
C                        .  |_______i00__|      50
C                |_______i16__|  .              50

C Integer side of each half-limb step, as performed in the loop and in the
C wind-down phases:
C   rlimb = i00 + (r00 or r32) + cy
C   %g2   = i16 << 16,  %g4 = i16 >> 16
C   %l5   = %g2 + rlimb, whose low 32 bits are stored to rp with stw
C and at the start of the following step
C   cy    = (((%g2 & xffffffff) + rlimb) >> 32) + %g4

C BEGIN MAIN LOOP
	.align	16
.L_3_or_more:
.Loop:	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	fmuld	u32, v000, p000
C
	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	fmuld	u32, v016, p016
C
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	faddd	p064, p096b, a064
	fmuld	u32, v032, p032
C
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	faddd	p080, p112b, a080
	fmuld	u32, v048, p048
C
	nop
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064
C
	add	i00, r00, rlimb
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	fxtod	u00_hi, u00
C
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	faddd	p016, a048, a016
	fmuld	u32, v080, p080
C
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b
C
	stw	%l5, [%i0+4]
	nop
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C midloop
	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	fmuld	u00, v000, p000
C
	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	fmuld	u00, v016, p016
C
	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], i00
	faddd	p064, p096a, a064
	fmuld	u00, v032, p032
C
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	faddd	p080, p112a, a080
	fmuld	u00, v048, p048
C
	add	%i2, -1, %i2		C BOOKKEEPING
	std	out000, [%sp+2223+0]
	faddd	p000, a032, a000
	fmuld	u00, v064, p064
C
	add	i00, r32, rlimb
	add	%i1, 8, %i1		C BOOKKEEPING
	std	out016, [%sp+2223+8]
	fxtod	u32_hi, u32
C
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	faddd	p016, a048, a016
	fmuld	u00, v080, p080
C
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	faddd	p032, a064, a032
	fmuld	u00, v096, p096a
C
	stw	%l5, [%i0+0]
	faddd	p048, a080, a048
	brnz,pt	%i2, .Loop
	fmuld	u00, v112, p112a
C END MAIN LOOP

C WIND-DOWN PHASE 1
C No further u limbs are loaded.  The first half still issues the multiplies
C by u32; the second half has no multiplies left.
.Lend:	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	fmuld	u32, v000, p000
	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	fmuld	u32, v016, p016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	faddd	p064, p096b, a064
	fmuld	u32, v032, p032
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	faddd	p080, p112b, a080
	fmuld	u32, v048, p048
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064
	add	i00, r00, rlimb
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	faddd	p016, a048, a016
	fmuld	u32, v080, p080
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b
	stw	%l5, [%i0+4]
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C mid
	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], i00
	faddd	p064, p096a, a064
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	faddd	p080, p112a, a080
	std	out000, [%sp+2223+0]
	add	i00, r32, rlimb
	std	out016, [%sp+2223+8]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+0]

C WIND-DOWN PHASE 2
C Only conversions (fdtox of a032, a048, a064, a080) remain on the fp side.
C This is also the entry point for the n = 1 case coming from .L1.
.L_wd2:	and	%g2, xffffffff, %g2
	fdtox	a032, out000
	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a048, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	std	out000, [%sp+2223+16]
	add	i00, r00, rlimb
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+4]
C mid
	and	%g2, xffffffff, %g2
	fdtox	a064, out000
	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a080, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], i00
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	std	out000, [%sp+2223+0]
	add	i00, r32, rlimb
	std	out016, [%sp+2223+8]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+0]

C WIND-DOWN PHASE 3
C The last two products (p096b, p112b) are converted.  rp is no longer read:
C the scratch values are loaded directly into rlimb, so the limb stored here
C is written without adding any previous rp contents.
.L_wd3:	and	%g2, xffffffff, %g2
	fdtox	p096b, out000
	add	%g2, rlimb, %l5
	fdtox	p112b, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], rlimb
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	std	out000, [%sp+2223+16]
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+4]
C mid
	and	%g2, xffffffff, %g2
	add	%g2, rlimb, %l5
	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], rlimb
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+0]

C Form the return limb from the last two converted values plus the carry.
	and	%g2, xffffffff, %g2
	add	%g2, rlimb, %l5
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16

C The return instruction restores the register window, so the add in its delay
C slot runs in the window of the caller.  Both of its sources (%g2 and cy) are
C global registers, and its destination is therefore o0 of the caller.
	sllx	i16, 16, %g2
	add	i00, cy, cy
	return	%i7+8
	add	%g2, cy, %o0
EPILOGUE(mpn_addmul_2)