github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/submul_1.asm (about) 1 dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the 2 dnl result from a second limb vector. 3 4 dnl Contributed to the GNU project by Torbjorn Granlund. 5 6 dnl Copyright 2000-2004 Free Software Foundation, Inc. 7 8 dnl This file is part of the GNU MP Library. 9 dnl 10 dnl The GNU MP Library is free software; you can redistribute it and/or modify 11 dnl it under the terms of either: 12 dnl 13 dnl * the GNU Lesser General Public License as published by the Free 14 dnl Software Foundation; either version 3 of the License, or (at your 15 dnl option) any later version. 16 dnl 17 dnl or 18 dnl 19 dnl * the GNU General Public License as published by the Free Software 20 dnl Foundation; either version 2 of the License, or (at your option) any 21 dnl later version. 22 dnl 23 dnl or both in parallel, as here. 24 dnl 25 dnl The GNU MP Library is distributed in the hope that it will be useful, but 26 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 27 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 28 dnl for more details. 29 dnl 30 dnl You should have received copies of the GNU General Public License and the 31 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 32 dnl see https://www.gnu.org/licenses/. 33 34 include(`../config.m4') 35 36 C cycles/limb 37 C Itanium: 4.0 38 C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l) 39 40 C TODO 41 C * Optimize feed-in and wind-down code, both for speed and code size. 42 C * Handle low limb input and results specially, using a common stf8 in the 43 C epilogue. 44 C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in 45 C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and 46 C save a cycle. 
C mpn_submul_1 (rp, up, n, vl) -- IA-64 implementation.
C
C Register roles, as used by the code below:
C   rp (r32)	advanced by the ldf8 loads of the limbs used as xma addends
C   up (r33)	advanced by the ldf8 loads of the limbs used as multiplicands
C   n  (r34)	limb count, only read during setup
C   vl (r35)	multiplier limb, negated on entry and then copied to f6
C   r9		second copy of up, advanced by the ld8 integer loads
C   r10		second copy of rp, advanced by the st8 result stores
C   r8		running carry limb (cylimb), zero on entry and still live at
C		br.ret (presumably the return value -- confirm against the ABI)
C   r2		saved ar.lc, restored before both returns
C   r14		n mod 4 during dispatch, afterwards the limb about to be stored
C   r19		(n - 1) >> 2, the value written to ar.lc
C   r20-r23	up limbs re-read as integers through r9
C   r24-r27	low product words, r28-r31 high product words (getf.sig)
C   f7, f8	first up limb and first rp limb
C   f32-f35	up limbs, f44-f47 rp limbs
C   f36-f39	xma.l results, f40-f43 xma.hu results
C
C Per limb the code forms lo and hi from up[i] * (-vl) + rp[i] with xma.l and
C xma.hu, and then computes
C	result limb = lo - cy		(cmp.ltu records the borrow in p6)
C	next cy     = up[i] - hi	(plus 1 when p6 is set)
C NOTE(review): this looks like the identity
C	rp[i] - up[i]*vl = rp[i] + up[i]*(2^64 - vl) - up[i]*2^64
C which only holds for vl != 0 (negating 0 gives 0, not 2^64).  That would
C explain the early exit to .Ldone -- confirm.
C NOTE(review): n == 0 is not special-cased anywhere below.  Presumably callers
C guarantee n >= 1 -- confirm against the mpn interface contract.

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')

ASM_START()
PROLOGUE(mpn_submul_1)
	.prologue
	.save	ar.lc, r2
	.body

C With HAVE_ABI_32 the two pointers go through addp4 and the count through
C zxt4 before anything else uses them.
ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
C Setup: take integer-side copies of both pointers, negate vl, start loading
C the first limb pair, save ar.lc, and derive the loop count and n mod 4.
{.mmi
	mov		r10 = rp		C M I
	mov		r9 = up			C M I
	sub		vl = r0, vl		C M I	negate vl
}
{.mmi
	ldf8		f8 = [rp], 8		C M
	ldf8		f7 = [up], 8		C M
	add		r19 = -1, n		C M I	n - 1
	;;
}
{.mmi
	cmp.eq		p6, p0 = 0, vl		C M I
	mov		r8 = 0			C M I	zero cylimb
	mov		r2 = ar.lc		C I0
}
{.mmi
	setf.sig	f6 = vl			C M2 M3
	and		r14 = 3, n		C M I
	shr.u		r19 = r19, 2		C I0
	;;
}
C p6 was computed from the already negated vl, which is zero exactly when the
C original vl was zero.  In that case leave with r8 = 0 and nothing stored.
{.mmb
	nop		0
	cmp.eq		p10, p0 = 0, r14	C M I
   (p6)	br.spnt		.Ldone			C B	vl == 0
}
{.mmi
	cmp.eq		p11, p0 = 2, r14	C M I
	cmp.eq		p12, p0 = 3, r14	C M I
	mov		ar.lc = r19		C I0
}
C Dispatch on n mod 4: 0 -> .Lb00, 2 -> .Lb10, 3 -> .Lb11, and 1 falls
C through into .Lb01.
{.bbb
  (p10)	br.dptk		.Lb00			C B
  (p11)	br.dptk		.Lb10			C B
  (p12)	br.dptk		.Lb11			C B
	;;
}

C Feed-in for n mod 4 == 1.  Each br.cloop in the feed-in code consumes one
C count from ar.lc.  The fall-through path right below handles n == 1.
.Lb01:	br.cloop.dptk	.grt1

	xma.l		f39 = f7, f6, f8
	xma.hu		f43 = f7, f6, f8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj1

C n >= 5 with n mod 4 == 1.
.grt1:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	;;
	ldf8		f46 = [rp], 8
	xma.l		f39 = f7, f6, f8
	ldf8		f34 = [up], 8
	xma.hu		f43 = f7, f6, f8
	;;
	ldf8		f47 = [rp], 8
	xma.l		f36 = f32, f6, f44
	ldf8		f35 = [up], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt5
	;;

C n == 5: no further ldf8 loads, finish through .Lcj5.
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	getf.sig	r26 = f38			C lo
	ld8		r23 = [r9], 8
	br		.Lcj5

C n >= 9 with n mod 4 == 1: fill the pipeline, then enter .Loop at its top,
C or go straight to .Lend when ar.lc is already exhausted (n == 9).
.grt5:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.Loop
	br		.Lend


C Feed-in for n mod 4 == 2.  The fall-through path handles n == 2.
.Lb10:	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt2

	xma.l		f38 = f7, f6, f8
	xma.hu		f42 = f7, f6, f8
	;;
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r26 = f38			C lo
	getf.sig	r30 = f42			C hi
	ld8		r23 = [r9], 8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj2

C n >= 6 with n mod 4 == 2.
.grt2:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	xma.l		f38 = f7, f6, f8
	xma.hu		f42 = f7, f6, f8
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt6

C n == 6: finish through .Lcj6.
	getf.sig	r30 = f42			C hi
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	br		.Lcj6

C n >= 10 with n mod 4 == 2: enter the unrolled loop at .LL10.
.grt6:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	br		.LL10


C Feed-in for n mod 4 == 3.  The fall-through path handles n == 3.
.Lb11:	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt3

	xma.l		f37 = f7, f6, f8
	xma.hu		f41 = f7, f6, f8
	;;
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	ld8		r22 = [r9], 8
	;;
	getf.sig	r26 = f38			C lo
	getf.sig	r30 = f42			C hi
	ld8		r23 = [r9], 8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj3

C n >= 7 with n mod 4 == 3.
.grt3:	ldf8		f44 = [rp], 8
	xma.l		f37 = f7, f6, f8
	ldf8		f32 = [up], 8
	xma.hu		f41 = f7, f6, f8
	;;
	ldf8		f45 = [rp], 8
	xma.l		f38 = f34, f6, f46
	ldf8		f33 = [up], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt7
	;;

C n == 7: finish through .Lcj7.
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	br		.Lcj7

C n >= 11 with n mod 4 == 3: enter the unrolled loop at .LL11.
.grt7:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	br		.LL11


C Feed-in for n mod 4 == 0.  The fall-through path handles n == 4.
.Lb00:	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	ldf8		f47 = [rp], 8
	xma.l		f36 = f7, f6, f8
	ldf8		f35 = [up], 8
	xma.hu		f40 = f7, f6, f8
	br.cloop.dptk	.grt4

	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	xma.l		f39 = f35, f6, f47
	getf.sig	r25 = f37			C lo
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	getf.sig	r26 = f38			C lo
	ld8		r23 = [r9], 8
	;;
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	ld8		r20 = [r9], 8
	br		.Lcj4

C n >= 8 with n mod 4 == 0.
.grt4:	ldf8		f44 = [rp], 8
	xma.l		f37 = f33, f6, f45
	ldf8		f32 = [up], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f46
	getf.sig	r24 = f36			C lo
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	getf.sig	r25 = f37			C lo
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt8
	;;

C n == 8: finish through .Lcj8.
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	br		.Lcj8

C n >= 12 with n mod 4 == 0: enter the unrolled loop at .LL00.
.grt8:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	br		.LL00

C Main loop, unrolled four ways; each pass stores four result limbs.  Every
C quarter (two instruction groups) stores one limb from an earlier product,
C computes the next carry in r8, and issues the loads and xma pair for limbs
C further ahead.  .LL00, .LL11 and .LL10 are the mid-loop entry points used by
C the feed-in code above.  The numbers after the stop bits count the groups.
	ALIGN(32)
.Loop:
{.mmi
	ldf8		f44 = [rp], 8
	cmp.ltu		p6, p0 = r27, r8	C lo cmp
	sub		r14 = r27, r8		C lo sub
}
{.mmi
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	sub		r8 = r20, r31		C hi sub
	;;				C 01
}
{.mmf
	getf.sig	r27 = f39			C lo
	st8		[r10] = r14, 8
	xma.l		f37 = f33, f6, f45
}
{.mfi
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
   (p6)	add		r8 = 1, r8
	;;				C 02
}
{.mmi
.LL00:	ldf8		f45 = [rp], 8
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
}
{.mmi
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	sub		r8 = r21, r28
	;;				C 03
}
{.mmf
	getf.sig	r24 = f36			C lo
	st8		[r10] = r14, 8
	xma.l		f38 = f34, f6, f46
}
{.mfi
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
   (p6)	add		r8 = 1, r8
	;;				C 04
}
{.mmi
.LL11:	ldf8		f46 = [rp], 8
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
}
{.mmi
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	sub		r8 = r22, r29
	;;				C 05
}
{.mmf
	getf.sig	r25 = f37			C lo
	st8		[r10] = r14, 8
	xma.l		f39 = f35, f6, f47
}
{.mfi
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
   (p6)	add		r8 = 1, r8
	;;				C 06
}
{.mmi
.LL10:	ldf8		f47 = [rp], 8
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
}
{.mmi
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	sub		r8 = r23, r30
	;;				C 07
}
{.mmf
	getf.sig	r26 = f38			C lo
	st8		[r10] = r14, 8
	xma.l		f36 = f32, f6, f44
}
{.mfi
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
   (p6)	add		r8 = 1, r8
}
	br.cloop.dptk	.Loop
	;;

C Wind-down: no ldf8 loads are issued from here on.  .Lend stores one limb and
C falls into .Lcj8; each .LcjK label is entered with K limbs left to store.
.Lend:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	getf.sig	r30 = f42
	sub		r8 = r20, r31
	;;
	getf.sig	r27 = f39
	st8		[r10] = r14, 8
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
   (p6)	add		r8 = 1, r8
	;;
.Lcj8:
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
	getf.sig	r31 = f43
	sub		r8 = r21, r28
	;;
	getf.sig	r24 = f36
	st8		[r10] = r14, 8
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
   (p6)	add		r8 = 1, r8
	;;
.Lcj7:
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
	getf.sig	r28 = f40
	sub		r8 = r22, r29
	;;
	getf.sig	r25 = f37
	st8		[r10] = r14, 8
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
   (p6)	add		r8 = 1, r8
	;;
C From .Lcj6 on, all products have been issued; only getf.sig, ld8 and the
C subtract/store chain remain.
.Lcj6:
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
	getf.sig	r29 = f41
	sub		r8 = r23, r30
	;;
	getf.sig	r26 = f38
	st8		[r10] = r14, 8
	ld8		r23 = [r9], 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj5:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	getf.sig	r30 = f42
	sub		r8 = r20, r31
	;;
	getf.sig	r27 = f39
	st8		[r10] = r14, 8
	ld8		r20 = [r9], 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj4:
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
	getf.sig	r31 = f43
	sub		r8 = r21, r28
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj3:
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
	sub		r8 = r22, r29
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj2:
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
	sub		r8 = r23, r30
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
C Last limb: store it, restore the ar.lc value saved in r2, fold the final
C borrow into r8 and return.
.Lcj1:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	sub		r8 = r20, r31
	;;
	st8		[r10] = r14, 8
	mov		ar.lc = r2
   (p6)	add		r8 = 1, r8
	br.ret.sptk.many b0
C Early exit for vl == 0: nothing has been stored and r8 is still zero.
.Ldone:	mov		ar.lc = r2
	br.ret.sptk.many b0
EPILOGUE()
ASM_END()