dnl  IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
dnl  store the result in a second limb vector.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    4.0
C Itanium 2:  2.0

C TODO
C  * Further optimize feed-in and wind-down code, both for speed and code size.
C  * Handle low limb input and results specially, using a common stf8 in the
C    epilogue.
C  * Use 1 c/l carry propagation scheme in wind-down code.
C  * Use extra pointer register for `up' to speed up feed-in loads.
C  * Work out final differences with addmul_1.asm.

C OVERVIEW
C  mpn_mul_1 and mpn_mul_1c share one body.  mpn_mul_1c has its own short
C  feed-in that loads f9 from cy and then branches to .Lcommon; mpn_mul_1
C  copies f0 (which always reads as zero on IA-64) into f9 instead.
C
C  Every limb product is an xma.l/xma.hu pair against f6 (vl).  The product
C  of the first limb (f7) uses f9 as addend, so the carry-in enters there.
C  Later products use f0 as addend, except in the n = 2 feed-in where the
C  high half f42 of the first product is added directly.
C
C  Low product halves (f36-f39) are moved to r16-r19 and high halves
C  (f40-f43) to r20-r23 with getf.sig.  Each result limb r24 is the sum of
C  one low half and the high half of the preceding product.  Carries travel
C  in the predicate pairs p6/p7 and p8/p9, used alternately: the predicated
C  "add ..., 1" form adds an incoming carry, and the cmp.leu/cmp.ltu pair
C  that follows sets the other predicate pair from the carry out of that
C  addition (sum <= addend with carry-in, sum < addend without).
C
C  ar.lc is set to (n-1)/4 and the code dispatches on n mod 4 to one of four
C  feed-in sequences (.Lb01, .Lb10, .Lb11, .Lb00).  Depending on how many
C  br.cloop branches are taken these either return directly (n = 1, n = 2),
C  jump into the wind-down code (.Lcj3 to .Lcj8), or enter the 4-way unrolled
C  loop at .LL01, .LL10, .LL11 or .LL00.
C
C  The high half of the last product plus the final carry is left in r8,
C  the integer return register.  ar.lc is saved in r2 on entry and restored
C  before every br.ret.
C
C  NOTE(review): n = 0 is not handled (n-1 is shifted as an unsigned value
C  to form the loop count); callers presumably always pass n >= 1 -- confirm
C  against the mpn interface documentation.

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')
define(`cy', `r36')     C for mpn_mul_1c

ASM_START()
PROLOGUE(mpn_mul_1)
        .prologue
        .save   ar.lc, r2
        .body

ifdef(`HAVE_ABI_32',
`       addp4           rp = 0, rp              C M I
        addp4           up = 0, up              C M I
        zxt4            n = n                   C I
        ;;
')
{.mfi
        adds            r15 = -1, n             C M I
        mov             f9 = f0                 C F
        mov.i           r2 = ar.lc              C I0
}
{.mmi
        ldf8            f7 = [up], 8            C M
        nop.m           0                       C M
        and             r14 = 3, n              C M I
        ;;
}
C Shared by mpn_mul_1 and mpn_mul_1c.  On entry: f7 = first limb, f9 = addend
C for the first product, r15 = n-1, r14 = n mod 4, r2 = saved ar.lc.
.Lcommon:
{.mii
        setf.sig        f6 = vl                 C M2 M3
        shr.u           r31 = r15, 2            C I0
        cmp.eq          p10, p0 = 0, r14        C M I
}
{.mii
        cmp.eq          p11, p0 = 2, r14        C M I
        cmp.eq          p12, p0 = 3, r14        C M I
        nop.i           0                       C I
        ;;
}
C Clear p6 and p8 (no carry pending) and set their complements p7 and p9.
{.mii
        cmp.ne          p6, p7 = r0, r0         C M I
        mov.i           ar.lc = r31             C I0
        cmp.ne          p8, p9 = r0, r0         C M I
}
{.bbb
  (p10) br.dptk         .Lb00                   C B
  (p11) br.dptk         .Lb10                   C B
  (p12) br.dptk         .Lb11                   C B
        ;;
}

C n mod 4 = 1.  r20 = 0 is the high half added to the first low half when
C the loop is entered at .LL01.
.Lb01:  mov             r20 = 0
        br.cloop.dptk   .grt1                   C B

C n = 1: a single product; store its low half and return its high half.
        xma.l           f39 = f7, f6, f9        C F
        xma.hu          f43 = f7, f6, f9        C F
        ;;
        getf.sig        r8 = f43                C M2
        stf8            [rp] = f39              C M2 M3
        mov.i           ar.lc = r2              C I0
        br.ret.sptk.many b0                     C B

C n >= 5: load the next four limbs into f32-f35.
.grt1:
        ldf8            f32 = [up], 8
        ;;
        ldf8            f33 = [up], 8
        ;;
        ldf8            f34 = [up], 8
        xma.l           f39 = f7, f6, f9
        xma.hu          f43 = f7, f6, f9
        ;;
        ldf8            f35 = [up], 8
        br.cloop.dptk   .grt5

C n = 5: finish all products here and continue in the wind-down at .Lcj5.
        xma.l           f36 = f32, f6, f0
        xma.hu          f40 = f32, f6, f0
        ;;
        stf8            [rp] = f39, 8
        xma.l           f37 = f33, f6, f0
        xma.hu          f41 = f33, f6, f0
        ;;
        getf.sig        r21 = f43
        getf.sig        r18 = f36
        xma.l           f38 = f34, f6, f0
        xma.hu          f42 = f34, f6, f0
        ;;
        getf.sig        r22 = f40
        getf.sig        r19 = f37
        xma.l           f39 = f35, f6, f0
        xma.hu          f43 = f35, f6, f0
        ;;
        getf.sig        r23 = f41
        getf.sig        r16 = f38
        br              .Lcj5

C n >= 9: prime the pipeline and enter the loop at .LL01.
.grt5:
        xma.l           f36 = f32, f6, f0
        xma.hu          f40 = f32, f6, f0
        ;;
        getf.sig        r17 = f39
        ldf8            f32 = [up], 8
        xma.l           f37 = f33, f6, f0
        xma.hu          f41 = f33, f6, f0
        ;;
        getf.sig        r21 = f43
        ldf8            f33 = [up], 8
        xma.l           f38 = f34, f6, f0
        ;;
        getf.sig        r18 = f36
        xma.hu          f42 = f34, f6, f0
        ;;
        getf.sig        r22 = f40
        ldf8            f34 = [up], 8
        xma.l           f39 = f35, f6, f0
        ;;
        getf.sig        r19 = f37
        xma.hu          f43 = f35, f6, f0
        br              .LL01


C n mod 4 = 2.  r23 = 0 is the high half added to the first low half when
C the loop is entered at .LL10.
.Lb10:  ldf8            f35 = [up], 8
        mov             r23 = 0
        br.cloop.dptk   .grt2

C n = 2: the second product adds the high half f42 of the first directly,
C so no integer carry handling is needed.
        xma.l           f38 = f7, f6, f9
        xma.hu          f42 = f7, f6, f9
        ;;
        stf8            [rp] = f38, 8
        xma.l           f39 = f35, f6, f42
        xma.hu          f43 = f35, f6, f42
        ;;
        getf.sig        r8 = f43
        stf8            [rp] = f39
        mov.i           ar.lc = r2
        br.ret.sptk.many b0


C n >= 6.
.grt2:
        ldf8            f32 = [up], 8
        ;;
        ldf8            f33 = [up], 8
        xma.l           f38 = f7, f6, f9
        xma.hu          f42 = f7, f6, f9
        ;;
        ldf8            f34 = [up], 8
        xma.l           f39 = f35, f6, f0
        xma.hu          f43 = f35, f6, f0
        ;;
        ldf8            f35 = [up], 8
        br.cloop.dptk   .grt6

C n = 6: finish all products here and continue in the wind-down at .Lcj6.
        stf8            [rp] = f38, 8
        xma.l           f36 = f32, f6, f0
        xma.hu          f40 = f32, f6, f0
        ;;
        getf.sig        r20 = f42
        getf.sig        r17 = f39
        xma.l           f37 = f33, f6, f0
        xma.hu          f41 = f33, f6, f0
        ;;
        getf.sig        r21 = f43
        getf.sig        r18 = f36
        xma.l           f38 = f34, f6, f0
        xma.hu          f42 = f34, f6, f0
        ;;
        getf.sig        r22 = f40
        getf.sig        r19 = f37
        xma.l           f39 = f35, f6, f0
        xma.hu          f43 = f35, f6, f0
        br              .Lcj6

C n >= 10: prime the pipeline and enter the loop at .LL10.
.grt6:
        getf.sig        r16 = f38
        xma.l           f36 = f32, f6, f0
        xma.hu          f40 = f32, f6, f0
        ;;
        getf.sig        r20 = f42
        ldf8            f32 = [up], 8
        xma.l           f37 = f33, f6, f0
        ;;
        getf.sig        r17 = f39
        xma.hu          f41 = f33, f6, f0
        ;;
        getf.sig        r21 = f43
        ldf8            f33 = [up], 8
        xma.l           f38 = f34, f6, f0
        ;;
        getf.sig        r18 = f36
        xma.hu          f42 = f34, f6, f0
        br              .LL10


C n mod 4 = 3.  r22 = 0 is the high half added to the first low half when
C the loop is entered at .LL11.
.Lb11:  ldf8            f34 = [up], 8
        mov             r22 = 0
        ;;
        ldf8            f35 = [up], 8
        br.cloop.dptk   .grt3
        ;;

C n = 3: all three products are formed here; the sums are done at .Lcj3.
        xma.l           f37 = f7, f6, f9
        xma.hu          f41 = f7, f6, f9
        xma.l           f38 = f34, f6, f0
        xma.hu          f42 = f34, f6, f0
        xma.l           f39 = f35, f6, f0
        xma.hu          f43 = f35, f6, f0
        ;;
        getf.sig        r23 = f41
        stf8            [rp] = f37, 8
        getf.sig        r16 = f38
        getf.sig        r20 = f42
        getf.sig        r17 = f39
        getf.sig        r8 = f43
        br              .Lcj3

C n >= 7.
.grt3:
        ldf8            f32 = [up], 8
        xma.l           f37 = f7, f6, f9
        xma.hu          f41 = f7, f6, f9
        ;;
        ldf8            f33 = [up], 8
        xma.l           f38 = f34, f6, f0
        xma.hu          f42 = f34, f6, f0
        ;;
        getf.sig        r19 = f37
        ldf8            f34 = [up], 8
        xma.l           f39 = f35, f6, f0
        xma.hu          f43 = f35, f6, f0
        ;;
        getf.sig        r23 = f41
        ldf8            f35 = [up], 8
        br.cloop.dptk   .grt7

C n = 7: store the first limb (r19) and continue in the wind-down at .Lcj7.
        getf.sig        r16 = f38
        xma.l           f36 = f32, f6, f0
        getf.sig        r20 = f42
        xma.hu          f40 = f32, f6, f0
        ;;
        getf.sig        r17 = f39
        xma.l           f37 = f33, f6, f0
        getf.sig        r21 = f43
        xma.hu          f41 = f33, f6, f0
        ;;
        getf.sig        r18 = f36
        st8             [rp] = r19, 8
        xma.l           f38 = f34, f6, f0
        xma.hu          f42 = f34, f6, f0
        br              .Lcj7

C n >= 11: prime the pipeline and enter the loop at .LL11.
.grt7:
        getf.sig        r16 = f38
        xma.l           f36 = f32, f6, f0
        xma.hu          f40 = f32, f6, f0
        ;;
        getf.sig        r20 = f42
        ldf8            f32 = [up], 8
        xma.l           f37 = f33, f6, f0
        ;;
        getf.sig        r17 = f39
        xma.hu          f41 = f33, f6, f0
        br              .LL11


C n mod 4 = 0.  r21 = 0 is the high half added to the first low half when
C the loop is entered at .LL00.
.Lb00:  ldf8            f33 = [up], 8
        mov             r21 = 0
        ;;
        ldf8            f34 = [up], 8
        ;;
        ldf8            f35 = [up], 8
        xma.l           f36 = f7, f6, f9
        xma.hu          f40 = f7, f6, f9
        br.cloop.dptk   .grt4

C n = 4: all four products are formed here; the sums are done at .Lcj4.
        xma.l           f37 = f33, f6, f0
        xma.hu          f41 = f33, f6, f0
        xma.l           f38 = f34, f6, f0
        xma.hu          f42 = f34, f6, f0
        ;;
        getf.sig        r22 = f40
        stf8            [rp] = f36, 8
        xma.l           f39 = f35, f6, f0
        getf.sig        r19 = f37
        xma.hu          f43 = f35, f6, f0
        ;;
        getf.sig        r23 = f41
        getf.sig        r16 = f38
        getf.sig        r20 = f42
        getf.sig        r17 = f39
        br              .Lcj4

C n >= 8.
.grt4:
        ldf8            f32 = [up], 8
        xma.l           f37 = f33, f6, f0
        xma.hu          f41 = f33, f6, f0
        ;;
        getf.sig        r18 = f36
        ldf8            f33 = [up], 8
        xma.l           f38 = f34, f6, f0
        xma.hu          f42 = f34, f6, f0
        ;;
        getf.sig        r22 = f40
        ldf8            f34 = [up], 8
        xma.l           f39 = f35, f6, f0
        ;;
        getf.sig        r19 = f37
        getf.sig        r23 = f41
        xma.hu          f43 = f35, f6, f0
        ldf8            f35 = [up], 8
        br.cloop.dptk   .grt8

C n = 8: store the first limb (r18) and continue in the wind-down at .Lcj8.
        getf.sig        r16 = f38
        xma.l           f36 = f32, f6, f0
        getf.sig        r20 = f42
        xma.hu          f40 = f32, f6, f0
        ;;
        getf.sig        r17 = f39
        st8             [rp] = r18, 8
        xma.l           f37 = f33, f6, f0
        xma.hu          f41 = f33, f6, f0
        br              .Lcj8

C n >= 12: enter the loop at .LL00.
.grt8:
        getf.sig        r16 = f38
        xma.l           f36 = f32, f6, f0
        xma.hu          f40 = f32, f6, f0
        br              .LL00


C *** MAIN LOOP START ***
C Four result limbs per iteration.  Each half-step pair does: add low half
C and previous high half (plus carry) into r24, then store r24 while the
C compare derives the next carry predicate, overlapped with the loads and
C multiplies for limbs further ahead.
        ALIGN(32)
.Loop:
        .pred.rel "mutex",p6,p7
        getf.sig        r16 = f38
        xma.l           f36 = f32, f6, f0
  (p6)  cmp.leu         p8, p9 = r24, r17
        st8             [rp] = r24, 8
        xma.hu          f40 = f32, f6, f0
  (p7)  cmp.ltu         p8, p9 = r24, r17
        ;;
.LL00:
        .pred.rel "mutex",p8,p9
        getf.sig        r20 = f42
  (p8)  add             r24 = r18, r21, 1
        nop.b           0
        ldf8            f32 = [up], 8
  (p9)  add             r24 = r18, r21
        nop.b           0
        ;;
        .pred.rel "mutex",p8,p9
        getf.sig        r17 = f39
        xma.l           f37 = f33, f6, f0
  (p8)  cmp.leu         p6, p7 = r24, r18
        st8             [rp] = r24, 8
        xma.hu          f41 = f33, f6, f0
  (p9)  cmp.ltu         p6, p7 = r24, r18
        ;;
.LL11:
        .pred.rel "mutex",p6,p7
        getf.sig        r21 = f43
  (p6)  add             r24 = r19, r22, 1
        nop.b           0
        ldf8            f33 = [up], 8
  (p7)  add             r24 = r19, r22
        nop.b           0
        ;;
        .pred.rel "mutex",p6,p7
        getf.sig        r18 = f36
        xma.l           f38 = f34, f6, f0
  (p6)  cmp.leu         p8, p9 = r24, r19
        st8             [rp] = r24, 8
        xma.hu          f42 = f34, f6, f0
  (p7)  cmp.ltu         p8, p9 = r24, r19
        ;;
.LL10:
        .pred.rel "mutex",p8,p9
        getf.sig        r22 = f40
  (p8)  add             r24 = r16, r23, 1
        nop.b           0
        ldf8            f34 = [up], 8
  (p9)  add             r24 = r16, r23
        nop.b           0
        ;;
        .pred.rel "mutex",p8,p9
        getf.sig        r19 = f37
        xma.l           f39 = f35, f6, f0
  (p8)  cmp.leu         p6, p7 = r24, r16
        st8             [rp] = r24, 8
        xma.hu          f43 = f35, f6, f0
  (p9)  cmp.ltu         p6, p7 = r24, r16
        ;;
.LL01:
        .pred.rel "mutex",p6,p7
        getf.sig        r23 = f41
  (p6)  add             r24 = r17, r20, 1
        nop.b           0
        ldf8            f35 = [up], 8
  (p7)  add             r24 = r17, r20
        br.cloop.dptk   .Loop
C *** MAIN LOOP END ***
        ;;

C Wind-down.  No further limbs are loaded; the products still in flight are
C finished and the pending sums are stored.  The feed-in code for small n
C enters at .Lcj3 through .Lcj8.
.Lcj9:
        .pred.rel "mutex",p6,p7
        getf.sig        r16 = f38
        xma.l           f36 = f32, f6, f0
  (p6)  cmp.leu         p8, p9 = r24, r17
        st8             [rp] = r24, 8
        xma.hu          f40 = f32, f6, f0
  (p7)  cmp.ltu         p8, p9 = r24, r17
        ;;
        .pred.rel "mutex",p8,p9
        getf.sig        r20 = f42
  (p8)  add             r24 = r18, r21, 1
  (p9)  add             r24 = r18, r21
        ;;
        .pred.rel "mutex",p8,p9
        getf.sig        r17 = f39
        xma.l           f37 = f33, f6, f0
  (p8)  cmp.leu         p6, p7 = r24, r18
        st8             [rp] = r24, 8
        xma.hu          f41 = f33, f6, f0
  (p9)  cmp.ltu         p6, p7 = r24, r18
        ;;
.Lcj8:
        .pred.rel "mutex",p6,p7
        getf.sig        r21 = f43
  (p6)  add             r24 = r19, r22, 1
  (p7)  add             r24 = r19, r22
        ;;
        .pred.rel "mutex",p6,p7
        getf.sig        r18 = f36
        xma.l           f38 = f34, f6, f0
  (p6)  cmp.leu         p8, p9 = r24, r19
        st8             [rp] = r24, 8
        xma.hu          f42 = f34, f6, f0
  (p7)  cmp.ltu         p8, p9 = r24, r19
        ;;
.Lcj7:
        .pred.rel "mutex",p8,p9
        getf.sig        r22 = f40
  (p8)  add             r24 = r16, r23, 1
  (p9)  add             r24 = r16, r23
        ;;
        .pred.rel "mutex",p8,p9
        getf.sig        r19 = f37
        xma.l           f39 = f35, f6, f0
  (p8)  cmp.leu         p6, p7 = r24, r16
        st8             [rp] = r24, 8
        xma.hu          f43 = f35, f6, f0
  (p9)  cmp.ltu         p6, p7 = r24, r16
        ;;
.Lcj6:
        .pred.rel "mutex",p6,p7
        getf.sig        r23 = f41
  (p6)  add             r24 = r17, r20, 1
  (p7)  add             r24 = r17, r20
        ;;
        .pred.rel "mutex",p6,p7
  (p6)  cmp.leu         p8, p9 = r24, r17
  (p7)  cmp.ltu         p8, p9 = r24, r17
        getf.sig        r16 = f38
        st8             [rp] = r24, 8
        ;;
.Lcj5:
        .pred.rel "mutex",p8,p9
        getf.sig        r20 = f42
  (p8)  add             r24 = r18, r21, 1
  (p9)  add             r24 = r18, r21
        ;;
        .pred.rel "mutex",p8,p9
  (p8)  cmp.leu         p6, p7 = r24, r18
  (p9)  cmp.ltu         p6, p7 = r24, r18
        getf.sig        r17 = f39
        st8             [rp] = r24, 8
        ;;
C r8 takes the high half of the last product; the final carry is added
C just before returning.
.Lcj4:
        .pred.rel "mutex",p6,p7
        getf.sig        r8 = f43
  (p6)  add             r24 = r19, r22, 1
  (p7)  add             r24 = r19, r22
        ;;
        .pred.rel "mutex",p6,p7
        st8             [rp] = r24, 8
  (p6)  cmp.leu         p8, p9 = r24, r19
  (p7)  cmp.ltu         p8, p9 = r24, r19
        ;;
.Lcj3:
        .pred.rel "mutex",p8,p9
  (p8)  add             r24 = r16, r23, 1
  (p9)  add             r24 = r16, r23
        ;;
        .pred.rel "mutex",p8,p9
        st8             [rp] = r24, 8
  (p8)  cmp.leu         p6, p7 = r24, r16
  (p9)  cmp.ltu         p6, p7 = r24, r16
        ;;
.Lcj2:
        .pred.rel "mutex",p6,p7
  (p6)  add             r24 = r17, r20, 1
  (p7)  add             r24 = r17, r20
        ;;
        .pred.rel "mutex",p6,p7
        st8             [rp] = r24, 8
  (p6)  cmp.leu         p8, p9 = r24, r17
  (p7)  cmp.ltu         p8, p9 = r24, r17
        ;;
C Fold the carry out of the last stored limb into the returned high limb,
C restore the caller loop counter and return.
  (p8)  add             r8 = 1, r8
        mov.i           ar.lc = r2
        br.ret.sptk.many b0
EPILOGUE()

C mpn_mul_1c: same as mpn_mul_1 except that f9 is loaded from cy, so cy is
C the addend of the first product.  Control joins mpn_mul_1 at .Lcommon.
PROLOGUE(mpn_mul_1c)
        .prologue
        .save   ar.lc, r2
        .body

ifdef(`HAVE_ABI_32',
`       addp4           rp = 0, rp              C M I
        addp4           up = 0, up              C M I
        zxt4            n = n                   C I
        ;;
')
{.mmi
        adds            r15 = -1, n             C M I
        setf.sig        f9 = cy                 C M2 M3
        mov.i           r2 = ar.lc              C I0
}
{.mmb
        ldf8            f7 = [up], 8            C M
        and             r14 = 3, n              C M I
        br.sptk         .Lcommon
        ;;
}
EPILOGUE()
ASM_END()