dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/addmul_1.asm
dnl
dnl  IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl  result to a second limb vector.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000-2005, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    3.0
C Itanium 2:  2.0

C TODO
C  * Further optimize feed-in and wind-down code, both for speed and code size.
C  * Handle low limb input and results specially, using a common stf8 in the
C    epilogue.
C  * Use 1 c/l carry propagation scheme in wind-down code.
C  * Use extra pointer registers for `up' and rp to speed up feed-in loads.
C  * Work out final differences with mul_1.asm.  That function is 300 bytes
C    smaller than this due to better loop scheduling and thus simpler feed-in
C    code.

C mpn_addmul_1 (rp, up, n, vl)
C
C Multiplies the n-limb vector at up by the single limb vl and adds the
C products into the n-limb vector at rp.  Result limbs are written through
C r20, a copy of the incoming rp.  The top high-half product, plus the last
C carry, is left in r8 at return.
C
C Register roles as used below:
C   r32-r35	rp, up, n, vl; rp and up are post-incremented by the ldf8 loads
C   r20		store pointer for result limbs
C   r2		saved ar.lc, restored before each br.ret
C   r14		n & 3 during dispatch, afterwards a sum about to be stored
C   r15, r31	n - 1 and (n - 1) >> 2 during dispatch; r31 is reused later
C   r16		a sum about to be stored
C   f6		vl, moved to the FP register file for xma
C   f7, f8	first limb loaded from up and from rp
C   f32-f35	limbs loaded from up;  f44-f47 limbs loaded from rp
C   f36-f39	xma.l results (low halves of  u * vl + r)
C   f40-f43	xma.hu results (high halves of  u * vl + r)
C   r24-r27	copies of f36-f39;  r28-r31 copies of f40-f43
C   p6/p7	carry / no-carry pair, alternating with p8/p9
C
C Each stored limb is  high half of limb i-1  +  low half of limb i  + carry.
C
C NOTE(review): there is no n == 0 path; the first two loads are
C unconditional, so callers presumably guarantee n >= 1 -- confirm against
C the mpn calling contract.

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')

ASM_START()
PROLOGUE(mpn_addmul_1)
	.prologue
	.save	ar.lc, r2
	.body

C With HAVE_ABI_32 the pointers are converted with addp4 and the count is
C zero-extended from 32 bits before use.
ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
C Save ar.lc, load the first limb pair, and pick a feed-in path from n & 3.
C ar.lc is set to (n - 1) >> 2.  Both predicate pairs start as no-carry
C (p6 = p8 = 0, p7 = p9 = 1) because r0 != r0 is always false.
{.mmi
	adds		r15 = -1, n		C M I
	mov		r20 = rp		C M I
	mov.i		r2 = ar.lc		C I0
}
{.mmi
	ldf8		f7 = [up], 8		C M
	ldf8		f8 = [rp], 8		C M
	and		r14 = 3, n		C M I
	;;
}
{.mmi
	setf.sig	f6 = vl			C M2 M3
	cmp.eq		p10, p0 = 0, r14	C M I
	shr.u		r31 = r15, 2		C I0
}
{.mmi
	cmp.eq		p11, p0 = 2, r14	C M I
	cmp.eq		p12, p0 = 3, r14	C M I
	nop.i		0			C I
	;;
}
{.mii
	cmp.ne		p6, p7 = r0, r0		C M I
	mov.i		ar.lc = r31		C I0
	cmp.ne		p8, p9 = r0, r0		C M I
}
{.bbb
  (p10)	br.dptk		.Lb00			C B
  (p11)	br.dptk		.Lb10			C B
  (p12)	br.dptk		.Lb11			C B
	;;
}

C Feed-in for n & 3 == 1.  The fall-through after br.cloop is the n == 1
C case: one product, store the low half, return the high half in r8.
.Lb01:	br.cloop.dptk	.grt1			C B

	xma.l		f39 = f7, f6, f8	C F
	xma.hu		f43 = f7, f6, f8	C F
	;;
	getf.sig	r8 = f43		C M2
	stf8		[r20] = f39		C M2 M3
	mov.i		ar.lc = r2		C I0
	br.ret.sptk.many b0			C B

.grt1:
	ldf8		f32 = [up], 8
	ldf8		f44 = [rp], 8
	;;
	ldf8		f33 = [up], 8
	ldf8		f45 = [rp], 8
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f7, f6, f8
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f7, f6, f8
	;;
	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt5

	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	stf8		[r20] = f39, 8
	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43
	getf.sig	r24 = f36
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40
	getf.sig	r25 = f37
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41
	getf.sig	r26 = f38
	br		.Lcj5

C r30 = 0 stands in for the missing previous high half of the first limb.
.grt5:
	mov		r30 = 0
	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	getf.sig	r27 = f39
	;;
	getf.sig	r31 = f43
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	getf.sig	r24 = f36
	;;
	getf.sig	r28 = f40
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	getf.sig	r25 = f37
	br.cloop.dptk	.Loop
	br		.Le0


C Feed-in for n & 3 == 2.  The fall-through after br.cloop is n == 2.
.Lb10:	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt2

	xma.l		f38 = f7, f6, f8
	xma.hu		f42 = f7, f6, f8
	;;
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r30 = f42
	stf8		[r20] = f38, 8
	getf.sig	r27 = f39
	getf.sig	r8 = f43
	br		.Lcj2

.grt2:
	ldf8		f32 = [up], 8
	ldf8		f44 = [rp], 8
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f7, f6, f8
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f7, f6, f8
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt6

	stf8		[r20] = f38, 8
	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	getf.sig	r30 = f42
	getf.sig	r27 = f39
	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43
	getf.sig	r24 = f36
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40
	getf.sig	r25 = f37
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	br		.Lcj6

C r29 = 0 stands in for the missing previous high half of the first limb.
.grt6:
	mov		r29 = 0
	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	getf.sig	r26 = f38
	;;
	getf.sig	r30 = f42
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	getf.sig	r27 = f39
	;;
	getf.sig	r31 = f43
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	getf.sig	r24 = f36
	br		.LL10


C Feed-in for n & 3 == 3.  The fall-through after br.cloop is n == 3.
.Lb11:	ldf8		f34 = [up], 8
	ldf8		f46 = [rp], 8
	;;
	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt3
	;;

	xma.l		f37 = f7, f6, f8
	xma.hu		f41 = f7, f6, f8
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41
	stf8		[r20] = f37, 8
	getf.sig	r26 = f38
	getf.sig	r30 = f42
	getf.sig	r27 = f39
	getf.sig	r8 = f43
	br		.Lcj3

.grt3:
	ldf8		f32 = [up], 8
	xma.l		f37 = f7, f6, f8
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f7, f6, f8
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	getf.sig	r25 = f37		C FIXME
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt7

	getf.sig	r29 = f41
	stf8		[r20] = f37, 8		C FIXME
	xma.l		f36 = f32, f6, f44
	getf.sig	r26 = f38
	xma.hu		f40 = f32, f6, f44
	;;
	getf.sig	r30 = f42
	xma.l		f37 = f33, f6, f45
	getf.sig	r27 = f39
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43
	xma.l		f38 = f34, f6, f46
	getf.sig	r24 = f36
	xma.hu		f42 = f34, f6, f46
	br		.Lcj7

C r28 = 0 stands in for the missing previous high half of the first limb.
.grt7:
	getf.sig	r29 = f41
	xma.l		f36 = f32, f6, f44
	mov		r28 = 0
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	getf.sig	r26 = f38
	;;
	getf.sig	r30 = f42
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	getf.sig	r27 = f39
	br		.LL11


C Feed-in for n & 3 == 0.  The fall-through after br.cloop is n == 4.
.Lb00:	ldf8		f33 = [up], 8
	ldf8		f45 = [rp], 8
	;;
	ldf8		f34 = [up], 8
	ldf8		f46 = [rp], 8
	;;
	ldf8		f35 = [up], 8
	xma.l		f36 = f7, f6, f8
	ldf8		f47 = [rp], 8
	xma.hu		f40 = f7, f6, f8
	br.cloop.dptk	.grt4

	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40
	stf8		[r20] = f36, 8
	xma.l		f39 = f35, f6, f47
	getf.sig	r25 = f37
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41
	getf.sig	r26 = f38
	getf.sig	r30 = f42
	getf.sig	r27 = f39
	br		.Lcj4

.grt4:
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	getf.sig	r24 = f36		C FIXME
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	getf.sig	r25 = f37
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt8

	getf.sig	r29 = f41
	stf8		[r20] = f36, 8		C FIXME
	xma.l		f36 = f32, f6, f44
	getf.sig	r26 = f38
	getf.sig	r30 = f42
	xma.hu		f40 = f32, f6, f44
	;;
	xma.l		f37 = f33, f6, f45
	getf.sig	r27 = f39
	xma.hu		f41 = f33, f6, f45
	br		.Lcj8

C r31 = 0 stands in for the missing previous high half of the first limb.
.grt8:
	getf.sig	r29 = f41
	xma.l		f36 = f32, f6, f44
	mov		r31 = 0
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	getf.sig	r26 = f38
	br		.LL00


C Main loop: four limbs per iteration, entered at .Loop, .LL00, .LL11 or
C .LL10 depending on the feed-in path.  Each quarter does the same work:
C   - add the previous high half to the current low half, with (p6 / p8)
C     or without (p7 / p9) an incoming carry;
C   - derive the outgoing carry into the other predicate pair: with a
C     carry-in the sum wrapped iff sum <= addend (cmp.leu), without one
C     iff sum < addend (cmp.ltu);
C   - store the sum through r20 and start the products and loads for limbs
C     further ahead.
C *** MAIN LOOP START ***
	ALIGN(32)				C insn	fed	cycle #
.Loop:
	.pred.rel "mutex", p6, p7		C num	by	i1 i2
	getf.sig	r29 = f41		C 00	16	0   0
	xma.l		f36 = f32, f6, f44	C 01	06,15	0   0
  (p6)	add		r14 = r30, r27, 1	C 02		0   0
	ldf8		f47 = [rp], 8		C 03		0   0
	xma.hu		f40 = f32, f6, f44	C 04	06,15	0   0
  (p7)	add		r14 = r30, r27		C 05		0   0
	;;
	.pred.rel "mutex", p6, p7
	ldf8		f32 = [up], 8		C 06		1   1
  (p6)	cmp.leu		p8, p9 = r14, r27	C 07		1   1
  (p7)	cmp.ltu		p8, p9 = r14, r27	C 08		1   1
	getf.sig	r26 = f38		C 09	25	2   1
	st8		[r20] = r14, 8		C 10		2   1
	nop.b		0			C 11		2   1
	;;
.LL00:
	.pred.rel "mutex", p8, p9
	getf.sig	r30 = f42		C 12	28	3   2
	xma.l		f37 = f33, f6, f45	C 13	18,27	3   2
  (p8)	add		r16 = r31, r24, 1	C 14		3   2
	ldf8		f44 = [rp], 8		C 15		3   2
	xma.hu		f41 = f33, f6, f45	C 16	18,27	3   2
  (p9)	add		r16 = r31, r24		C 17		3   2
	;;
	.pred.rel "mutex", p8, p9
	ldf8		f33 = [up], 8		C 18		4   3
  (p8)	cmp.leu		p6, p7 = r16, r24	C 19		4   3
  (p9)	cmp.ltu		p6, p7 = r16, r24	C 20		4   3
	getf.sig	r27 = f39		C 21	37	5   3
	st8		[r20] = r16, 8		C 22		5   3
	nop.b		0			C 23		5   3
	;;
.LL11:
	.pred.rel "mutex", p6, p7
	getf.sig	r31 = f43		C 24	40	6   4
	xma.l		f38 = f34, f6, f46	C 25	30,39	6   4
  (p6)	add		r14 = r28, r25, 1	C 26		6   4
	ldf8		f45 = [rp], 8		C 27		6   4
	xma.hu		f42 = f34, f6, f46	C 28	30,39	6   4
  (p7)	add		r14 = r28, r25		C 29		6   4
	;;
	.pred.rel "mutex", p6, p7
	ldf8		f34 = [up], 8		C 30		7   5
  (p6)	cmp.leu		p8, p9 = r14, r25	C 31		7   5
  (p7)	cmp.ltu		p8, p9 = r14, r25	C 32		7   5
	getf.sig	r24 = f36		C 33	01	8   5
	st8		[r20] = r14, 8		C 34		8   5
	nop.b		0			C 35		8   5
	;;
.LL10:
	.pred.rel "mutex", p8, p9
	getf.sig	r28 = f40		C 36	04	9   6
	xma.l		f39 = f35, f6, f47	C 37	42,03	9   6
  (p8)	add		r16 = r29, r26, 1	C 38		9   6
	ldf8		f46 = [rp], 8		C 39		9   6
	xma.hu		f43 = f35, f6, f47	C 40	42,03	9   6
  (p9)	add		r16 = r29, r26		C 41		9   6
	;;
	.pred.rel "mutex", p8, p9
	ldf8		f35 = [up], 8		C 42		10  7
  (p8)	cmp.leu		p6, p7 = r16, r26	C 43		10  7
  (p9)	cmp.ltu		p6, p7 = r16, r26	C 44		10  7
	getf.sig	r25 = f37		C 45	13	11  7
	st8		[r20] = r16, 8		C 46		11  7
	br.cloop.dptk	.Loop			C 47		11  7
C *** MAIN LOOP END ***
	;;

C Wind-down: the same add / compare / store steps as the loop with the
C loads removed, and from .Lcj6 on with the products removed as well.  The
C .LcjN labels are the entry points used by the short feed-in paths.
.Le0:
	.pred.rel "mutex", p6, p7
	getf.sig	r29 = f41
	xma.l		f36 = f32, f6, f44
  (p6)	add		r14 = r30, r27, 1
	ldf8		f47 = [rp], 8
	xma.hu		f40 = f32, f6, f44
  (p7)	add		r14 = r30, r27
	;;
	.pred.rel "mutex", p6, p7
  (p6)	cmp.leu		p8, p9 = r14, r27
  (p7)	cmp.ltu		p8, p9 = r14, r27
	getf.sig	r26 = f38
	st8		[r20] = r14, 8
	;;
	.pred.rel "mutex", p8, p9
	getf.sig	r30 = f42
	xma.l		f37 = f33, f6, f45
  (p8)	add		r16 = r31, r24, 1
	xma.hu		f41 = f33, f6, f45
  (p9)	add		r16 = r31, r24
	;;
	.pred.rel "mutex", p8, p9
  (p8)	cmp.leu		p6, p7 = r16, r24
  (p9)	cmp.ltu		p6, p7 = r16, r24
	getf.sig	r27 = f39
	st8		[r20] = r16, 8
	;;
.Lcj8:
	.pred.rel "mutex", p6, p7
	getf.sig	r31 = f43
	xma.l		f38 = f34, f6, f46
  (p6)	add		r14 = r28, r25, 1
	xma.hu		f42 = f34, f6, f46
  (p7)	add		r14 = r28, r25
	;;
	.pred.rel "mutex", p6, p7
  (p6)	cmp.leu		p8, p9 = r14, r25
  (p7)	cmp.ltu		p8, p9 = r14, r25
	getf.sig	r24 = f36
	st8		[r20] = r14, 8
	;;
.Lcj7:
	.pred.rel "mutex", p8, p9
	getf.sig	r28 = f40
	xma.l		f39 = f35, f6, f47
  (p8)	add		r16 = r29, r26, 1
	xma.hu		f43 = f35, f6, f47
  (p9)	add		r16 = r29, r26
	;;
	.pred.rel "mutex", p8, p9
  (p8)	cmp.leu		p6, p7 = r16, r26
  (p9)	cmp.ltu		p6, p7 = r16, r26
	getf.sig	r25 = f37
	st8		[r20] = r16, 8
	;;
.Lcj6:
	.pred.rel "mutex", p6, p7
	getf.sig	r29 = f41
  (p6)	add		r14 = r30, r27, 1
  (p7)	add		r14 = r30, r27
	;;
	.pred.rel "mutex", p6, p7
  (p6)	cmp.leu		p8, p9 = r14, r27
  (p7)	cmp.ltu		p8, p9 = r14, r27
	getf.sig	r26 = f38
	st8		[r20] = r14, 8
	;;
.Lcj5:
	.pred.rel "mutex", p8, p9
	getf.sig	r30 = f42
  (p8)	add		r16 = r31, r24, 1
  (p9)	add		r16 = r31, r24
	;;
	.pred.rel "mutex", p8, p9
  (p8)	cmp.leu		p6, p7 = r16, r24
  (p9)	cmp.ltu		p6, p7 = r16, r24
	getf.sig	r27 = f39
	st8		[r20] = r16, 8
	;;
C The top high half goes to r8 here; the final carry is added further down.
.Lcj4:
	.pred.rel "mutex", p6, p7
	getf.sig	r8 = f43
  (p6)	add		r14 = r28, r25, 1
  (p7)	add		r14 = r28, r25
	;;
	.pred.rel "mutex", p6, p7
	st8		[r20] = r14, 8
  (p6)	cmp.leu		p8, p9 = r14, r25
  (p7)	cmp.ltu		p8, p9 = r14, r25
	;;
.Lcj3:
	.pred.rel "mutex", p8, p9
  (p8)	add		r16 = r29, r26, 1
  (p9)	add		r16 = r29, r26
	;;
	.pred.rel "mutex", p8, p9
	st8		[r20] = r16, 8
  (p8)	cmp.leu		p6, p7 = r16, r26
  (p9)	cmp.ltu		p6, p7 = r16, r26
	;;
C Last limb: store without post-increment, fold the final carry into r8,
C restore ar.lc and return.
.Lcj2:
	.pred.rel "mutex", p6, p7
  (p6)	add		r14 = r30, r27, 1
  (p7)	add		r14 = r30, r27
	;;
	.pred.rel "mutex", p6, p7
	st8		[r20] = r14
  (p6)	cmp.leu		p8, p9 = r14, r27
  (p7)	cmp.ltu		p8, p9 = r14, r27
	;;
  (p8)	add		r8 = 1, r8		C M I
	mov.i		ar.lc = r2		C I0
	br.ret.sptk.many b0			C B
EPILOGUE()
ASM_END()