github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/bdiv_dbm1c.asm (about) 1 dnl IA-64 mpn_bdiv_dbm1. 2 3 dnl Contributed to the GNU project by Torbjorn Granlund. 4 5 dnl Copyright 2008, 2009 Free Software Foundation, Inc. 6 7 dnl This file is part of the GNU MP Library. 8 dnl 9 dnl The GNU MP Library is free software; you can redistribute it and/or modify 10 dnl it under the terms of either: 11 dnl 12 dnl * the GNU Lesser General Public License as published by the Free 13 dnl Software Foundation; either version 3 of the License, or (at your 14 dnl option) any later version. 15 dnl 16 dnl or 17 dnl 18 dnl * the GNU General Public License as published by the Free Software 19 dnl Foundation; either version 2 of the License, or (at your option) any 20 dnl later version. 21 dnl 22 dnl or both in parallel, as here. 23 dnl 24 dnl The GNU MP Library is distributed in the hope that it will be useful, but 25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27 dnl for more details. 28 dnl 29 dnl You should have received copies of the GNU General Public License and the 30 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 31 dnl see https://www.gnu.org/licenses/. 32 33 include(`../config.m4') 34 35 C cycles/limb 36 C Itanium: 4 37 C Itanium 2: 2 38 39 C TODO 40 C * Optimize feed-in and wind-down code, both for speed and code size. 
C ---------------------------------------------------------------------------
C mpn_bdiv_dbm1c -- IA-64 routine delimited by PROLOGUE and EPILOGUE below.
C
C Arguments, as named by the define lines that follow:
C   rp = r32   destination pointer; one 8-byte limb is stored per source limb
C   up = r33   source pointer; read with ldf8 and post-incremented by 8
C   n  = r34   limb count
C   bd = r35   multiplier; moved into f6 with setf.sig
C   r36        has no symbolic name; it is copied into r15 and seeds the
C              running value h.  NOTE(review): presumably the fifth C
C              argument, a carry-in -- confirm against the prototype.
C Result:      r8 receives the final value of h.
C
C Per-limb step, as carried out by every cmp.ltu / sub / st8 / sub group:
C   lo     = low  64 bits of up[i] * bd    (xma.l  with f0 as addend)
C   hi     = high 64 bits of up[i] * bd    (xma.hu with f0 as addend)
C   borrow = h < lo, unsigned              (cmp.ltu: p6 if true, else p7)
C   rp[i]  = h - lo
C   h      = rp[i] - hi - borrow           (the p6 form of sub carries the
C                                           extra operand 1; the p7 form
C                                           does not)
C
C Layout of the routine:
C   * Setup.  ar.lc is saved in r2, as announced by the .save directive, and
C     restored just before br.ret.  ar.lc is loaded with (n-1) >> 2 and
C     r14 = n & 3 selects the feed-in path: 0 -> .Lb00, 2 -> .Lb10,
C     3 -> .Lb11, and 1 falls through to .Lb01.  The first limb is always
C     loaded into f9 before the dispatch.
C   * Feed-in (.Lb01/.grt1/.grt5, .Lb10/.grt2/.grt6, .Lb11/.grt3/.grt7,
C     .Lb00/.grt4).  Each path loads further limbs, starts the multiplies,
C     and uses br.cloop to decide whether enough limbs remain to enter the
C     main loop at .LL01, .LL10, .LL11 or .LL00.  Otherwise it branches
C     straight into the wind-down chain.
C   * Main loop (.Ltop).  Unrolled four limbs per iteration -- four ldf8,
C     four xma.l/xma.hu pairs and four st8 -- and counted by br.cloop on
C     ar.lc.  Products rotate through f32..f39 and r20..r27; outputs rotate
C     through r16..r19.
C   * Wind-down (.Lcj8 down to .Lcj1).  Drains the pipeline without further
C     loads.  Entering at .LcjK leaves exactly K limbs still to be stored.
C     The last store, after .Lcj1, has no post-increment, and the final
C     subtract writes r8 instead of r15.
C
C Notes for maintainers:
C   * The .pred.rel mutex directive tells the assembler that p6 and p7 are
C     never both true, so the two predicated writes to r15 that share an
C     instruction group do not conflict.
C   * After the first bundle the body addresses memory through r32 and r33
C     directly; these are the same registers as rp and up.
C   * When HAVE_ABI_32 is defined, rp and up are first passed through addp4
C     and n through zxt4.
C   * The label .Lcommon is not referenced, and p8/p9 are initialised but
C     never read, in the code shown here.
C   * Registers written: r2, r8, r14-r27, r31, f6, f9-f13, f32-f39, p6-p12,
C     ar.lc (restored), plus the pointers r32/r33, which are advanced.
C   * NOTE(review): up[0] is loaded unconditionally and (n-1) >> 2 is an
C     unsigned shift, so the routine appears to require n >= 1 -- confirm
C     with the callers.
C ---------------------------------------------------------------------------
41 42 C INPUT PARAMETERS 43 define(`rp', `r32') 44 define(`up', `r33') 45 define(`n', `r34') 46 define(`bd', `r35') 47 48 ASM_START() 49 PROLOGUE(mpn_bdiv_dbm1c) 50 .prologue 51 .save ar.lc, r2 52 .body 53 54 ifdef(`HAVE_ABI_32', 55 ` addp4 rp = 0, rp C M I 56 addp4 up = 0, up C M I 57 zxt4 n = n C I 58 ;; 59 ') 60 {.mmb 61 mov r15 = r36 C M I 62 ldf8 f9 = [up], 8 C M 63 nop.b 0 C B 64 } 65 .Lcommon: 66 {.mii 67 adds r16 = -1, n C M I 68 mov r2 = ar.lc C I0 69 and r14 = 3, n C M I 70 ;; 71 } 72 {.mii 73 setf.sig f6 = bd C M2 M3 74 shr.u r31 = r16, 2 C I0 75 cmp.eq p10, p0 = 0, r14 C M I 76 } 77 {.mii 78 nop.m 0 C M 79 cmp.eq p11, p0 = 2, r14 C M I 80 cmp.eq p12, p0 = 3, r14 C M I 81 ;; 82 } 83 {.mii 84 cmp.ne p6, p7 = r0, r0 C M I 85 mov.i ar.lc = r31 C I0 86 cmp.ne p8, p9 = r0, r0 C M I 87 } 88 {.bbb 89 (p10) br.dptk .Lb00 C B 90 (p11) br.dptk .Lb10 C B 91 (p12) br.dptk .Lb11 C B 92 ;; 93 } 94 95 .Lb01: br.cloop.dptk .grt1 96 ;; 97 xma.l f38 = f9, f6, f0 98 xma.hu f39 = f9, f6, f0 99 ;; 100 getf.sig r26 = f38 101 getf.sig r27 = f39 102 br .Lcj1 103 104 .grt1: ldf8 f10 = [r33], 8 105 ;; 106 ldf8 f11 = [r33], 8 107 ;; 108 ldf8 f12 = [r33], 8 109 ;; 110 xma.l f38 = f9, f6, f0 111 xma.hu f39 = f9, f6, f0 112 ;; 113 ldf8 f13 = [r33], 8 114 ;; 115 xma.l f32 = f10, f6, f0 116 xma.hu f33 = f10, f6, f0 117 br.cloop.dptk .grt5 118 119 ;; 120 getf.sig r26 = f38 121 xma.l f34 = f11, f6, f0 122 xma.hu f35 = f11, f6, f0 123 ;; 124 getf.sig r27 = f39 125 ;; 126 getf.sig r20 = f32 127 xma.l f36 = f12, f6, f0 128 xma.hu f37 = f12, f6, f0 129 ;; 130 getf.sig r21 = f33 131 ;; 132 getf.sig r22 = f34 133 xma.l f38 = f13, f6, f0 134 xma.hu f39 = f13, f6, f0 135 br .Lcj5 136 137 .grt5: ldf8 f10 = [r33], 8 138 ;; 139 getf.sig r26 = f38 140 xma.l f34 = f11, f6, f0 141 xma.hu f35 = f11, f6, f0 142 ;; 143 getf.sig r27 = f39 144 ldf8 f11 = [r33], 8 145 ;; 146 getf.sig r20 = f32 147 xma.l f36 = f12, f6, f0 148 xma.hu f37 = f12, f6, f0 149 ;; 150 getf.sig r21 = f33 151 ldf8 f12 = [r33], 8 152 
;; 153 getf.sig r22 = f34 154 xma.l f38 = f13, f6, f0 155 xma.hu f39 = f13, f6, f0 156 br .LL01 157 158 .Lb10: ldf8 f13 = [r33], 8 159 br.cloop.dptk .grt2 160 ;; 161 162 xma.l f36 = f9, f6, f0 163 xma.hu f37 = f9, f6, f0 164 ;; 165 xma.l f38 = f13, f6, f0 166 xma.hu f39 = f13, f6, f0 167 ;; 168 getf.sig r24 = f36 169 ;; 170 getf.sig r25 = f37 171 ;; 172 getf.sig r26 = f38 173 ;; 174 getf.sig r27 = f39 175 br .Lcj2 176 177 .grt2: ldf8 f10 = [r33], 8 178 ;; 179 ldf8 f11 = [r33], 8 180 ;; 181 xma.l f36 = f9, f6, f0 182 xma.hu f37 = f9, f6, f0 183 ;; 184 ldf8 f12 = [r33], 8 185 ;; 186 xma.l f38 = f13, f6, f0 187 xma.hu f39 = f13, f6, f0 188 ;; 189 ldf8 f13 = [r33], 8 190 ;; 191 getf.sig r24 = f36 192 xma.l f32 = f10, f6, f0 193 xma.hu f33 = f10, f6, f0 194 br.cloop.dptk .grt6 195 196 getf.sig r25 = f37 197 ;; 198 getf.sig r26 = f38 199 xma.l f34 = f11, f6, f0 200 xma.hu f35 = f11, f6, f0 201 ;; 202 getf.sig r27 = f39 203 ;; 204 getf.sig r20 = f32 205 xma.l f36 = f12, f6, f0 206 xma.hu f37 = f12, f6, f0 207 br .Lcj6 208 209 .grt6: getf.sig r25 = f37 210 ldf8 f10 = [r33], 8 211 ;; 212 getf.sig r26 = f38 213 xma.l f34 = f11, f6, f0 214 xma.hu f35 = f11, f6, f0 215 ;; 216 getf.sig r27 = f39 217 ldf8 f11 = [r33], 8 218 ;; 219 getf.sig r20 = f32 220 xma.l f36 = f12, f6, f0 221 xma.hu f37 = f12, f6, f0 222 br .LL10 223 224 225 .Lb11: ldf8 f12 = [r33], 8 226 ;; 227 ldf8 f13 = [r33], 8 228 br.cloop.dptk .grt3 229 ;; 230 231 xma.l f34 = f9, f6, f0 232 xma.hu f35 = f9, f6, f0 233 ;; 234 xma.l f36 = f12, f6, f0 235 xma.hu f37 = f12, f6, f0 236 ;; 237 getf.sig r22 = f34 238 xma.l f38 = f13, f6, f0 239 xma.hu f39 = f13, f6, f0 240 ;; 241 getf.sig r23 = f35 242 ;; 243 getf.sig r24 = f36 244 ;; 245 getf.sig r25 = f37 246 ;; 247 getf.sig r26 = f38 248 br .Lcj3 249 250 .grt3: ldf8 f10 = [r33], 8 251 ;; 252 xma.l f34 = f9, f6, f0 253 xma.hu f35 = f9, f6, f0 254 ;; 255 ldf8 f11 = [r33], 8 256 ;; 257 xma.l f36 = f12, f6, f0 258 xma.hu f37 = f12, f6, f0 259 ;; 260 ldf8 f12 = [r33], 8 261 ;; 
262 getf.sig r22 = f34 263 xma.l f38 = f13, f6, f0 264 xma.hu f39 = f13, f6, f0 265 ;; 266 getf.sig r23 = f35 267 ldf8 f13 = [r33], 8 268 ;; 269 getf.sig r24 = f36 270 xma.l f32 = f10, f6, f0 271 xma.hu f33 = f10, f6, f0 272 br.cloop.dptk .grt7 273 274 getf.sig r25 = f37 275 ;; 276 getf.sig r26 = f38 277 xma.l f34 = f11, f6, f0 278 xma.hu f35 = f11, f6, f0 279 br .Lcj7 280 281 .grt7: getf.sig r25 = f37 282 ldf8 f10 = [r33], 8 283 ;; 284 getf.sig r26 = f38 285 xma.l f34 = f11, f6, f0 286 xma.hu f35 = f11, f6, f0 287 br .LL11 288 289 290 .Lb00: ldf8 f11 = [r33], 8 291 ;; 292 ldf8 f12 = [r33], 8 293 ;; 294 ldf8 f13 = [r33], 8 295 br.cloop.dptk .grt4 296 ;; 297 298 xma.l f32 = f9, f6, f0 299 xma.hu f33 = f9, f6, f0 300 ;; 301 xma.l f34 = f11, f6, f0 302 xma.hu f35 = f11, f6, f0 303 ;; 304 getf.sig r20 = f32 305 xma.l f36 = f12, f6, f0 306 xma.hu f37 = f12, f6, f0 307 ;; 308 getf.sig r21 = f33 309 ;; 310 getf.sig r22 = f34 311 xma.l f38 = f13, f6, f0 312 xma.hu f39 = f13, f6, f0 313 ;; 314 getf.sig r23 = f35 315 ;; 316 getf.sig r24 = f36 317 br .Lcj4 318 319 .grt4: xma.l f32 = f9, f6, f0 320 xma.hu f33 = f9, f6, f0 321 ;; 322 ldf8 f10 = [r33], 8 323 ;; 324 xma.l f34 = f11, f6, f0 325 xma.hu f35 = f11, f6, f0 326 ;; 327 ldf8 f11 = [r33], 8 328 ;; 329 getf.sig r20 = f32 330 xma.l f36 = f12, f6, f0 331 xma.hu f37 = f12, f6, f0 332 ;; 333 getf.sig r21 = f33 334 ldf8 f12 = [r33], 8 335 ;; 336 getf.sig r22 = f34 337 xma.l f38 = f13, f6, f0 338 xma.hu f39 = f13, f6, f0 339 ;; 340 getf.sig r23 = f35 341 ldf8 f13 = [r33], 8 342 ;; 343 getf.sig r24 = f36 344 xma.l f32 = f10, f6, f0 345 xma.hu f33 = f10, f6, f0 346 br.cloop.dptk .LL00 347 br .Lcj8 348 349 C *** MAIN LOOP START *** 350 ALIGN(32) 351 .Ltop: 352 .pred.rel "mutex",p6,p7 353 C .mfi 354 getf.sig r24 = f36 355 xma.l f32 = f10, f6, f0 356 (p6) sub r15 = r19, r27, 1 357 C .mfi 358 st8 [r32] = r19, 8 359 xma.hu f33 = f10, f6, f0 360 (p7) sub r15 = r19, r27 361 ;; 362 .LL00: 363 C .mfi 364 getf.sig r25 = f37 365 nop.f 0 366 
cmp.ltu p6, p7 = r15, r20 367 C .mib 368 ldf8 f10 = [r33], 8 369 sub r16 = r15, r20 370 nop.b 0 371 ;; 372 373 C .mfi 374 getf.sig r26 = f38 375 xma.l f34 = f11, f6, f0 376 (p6) sub r15 = r16, r21, 1 377 C .mfi 378 st8 [r32] = r16, 8 379 xma.hu f35 = f11, f6, f0 380 (p7) sub r15 = r16, r21 381 ;; 382 .LL11: 383 C .mfi 384 getf.sig r27 = f39 385 nop.f 0 386 cmp.ltu p6, p7 = r15, r22 387 C .mib 388 ldf8 f11 = [r33], 8 389 sub r17 = r15, r22 390 nop.b 0 391 ;; 392 393 C .mfi 394 getf.sig r20 = f32 395 xma.l f36 = f12, f6, f0 396 (p6) sub r15 = r17, r23, 1 397 C .mfi 398 st8 [r32] = r17, 8 399 xma.hu f37 = f12, f6, f0 400 (p7) sub r15 = r17, r23 401 ;; 402 .LL10: 403 C .mfi 404 getf.sig r21 = f33 405 nop.f 0 406 cmp.ltu p6, p7 = r15, r24 407 C .mib 408 ldf8 f12 = [r33], 8 409 sub r18 = r15, r24 410 nop.b 0 411 ;; 412 413 C .mfi 414 getf.sig r22 = f34 415 xma.l f38 = f13, f6, f0 416 (p6) sub r15 = r18, r25, 1 417 C .mfi 418 st8 [r32] = r18, 8 419 xma.hu f39 = f13, f6, f0 420 (p7) sub r15 = r18, r25 421 ;; 422 .LL01: 423 C .mfi 424 getf.sig r23 = f35 425 nop.f 0 426 cmp.ltu p6, p7 = r15, r26 427 C .mib 428 ldf8 f13 = [r33], 8 429 sub r19 = r15, r26 430 br.cloop.sptk.few .Ltop 431 C *** MAIN LOOP END *** 432 ;; 433 434 getf.sig r24 = f36 435 xma.l f32 = f10, f6, f0 436 (p6) sub r15 = r19, r27, 1 437 st8 [r32] = r19, 8 438 xma.hu f33 = f10, f6, f0 439 (p7) sub r15 = r19, r27 440 ;; 441 .Lcj8: getf.sig r25 = f37 442 cmp.ltu p6, p7 = r15, r20 443 sub r16 = r15, r20 444 ;; 445 getf.sig r26 = f38 446 xma.l f34 = f11, f6, f0 447 (p6) sub r15 = r16, r21, 1 448 st8 [r32] = r16, 8 449 xma.hu f35 = f11, f6, f0 450 (p7) sub r15 = r16, r21 451 ;; 452 .Lcj7: getf.sig r27 = f39 453 cmp.ltu p6, p7 = r15, r22 454 sub r17 = r15, r22 455 ;; 456 getf.sig r20 = f32 457 xma.l f36 = f12, f6, f0 458 (p6) sub r15 = r17, r23, 1 459 st8 [r32] = r17, 8 460 xma.hu f37 = f12, f6, f0 461 (p7) sub r15 = r17, r23 462 ;; 463 .Lcj6: getf.sig r21 = f33 464 cmp.ltu p6, p7 = r15, r24 465 sub r18 = r15, r24 
466 ;; 467 getf.sig r22 = f34 468 xma.l f38 = f13, f6, f0 469 (p6) sub r15 = r18, r25, 1 470 st8 [r32] = r18, 8 471 xma.hu f39 = f13, f6, f0 472 (p7) sub r15 = r18, r25 473 ;; 474 .Lcj5: getf.sig r23 = f35 475 cmp.ltu p6, p7 = r15, r26 476 sub r19 = r15, r26 477 ;; 478 getf.sig r24 = f36 479 (p6) sub r15 = r19, r27, 1 480 st8 [r32] = r19, 8 481 (p7) sub r15 = r19, r27 482 ;; 483 .Lcj4: getf.sig r25 = f37 484 cmp.ltu p6, p7 = r15, r20 485 sub r16 = r15, r20 486 ;; 487 getf.sig r26 = f38 488 (p6) sub r15 = r16, r21, 1 489 st8 [r32] = r16, 8 490 (p7) sub r15 = r16, r21 491 ;; 492 .Lcj3: getf.sig r27 = f39 493 cmp.ltu p6, p7 = r15, r22 494 sub r17 = r15, r22 495 ;; 496 (p6) sub r15 = r17, r23, 1 497 st8 [r32] = r17, 8 498 (p7) sub r15 = r17, r23 499 ;; 500 .Lcj2: cmp.ltu p6, p7 = r15, r24 501 sub r18 = r15, r24 502 ;; 503 (p6) sub r15 = r18, r25, 1 504 st8 [r32] = r18, 8 505 (p7) sub r15 = r18, r25 506 ;; 507 .Lcj1: cmp.ltu p6, p7 = r15, r26 508 sub r19 = r15, r26 509 ;; 510 (p6) sub r8 = r19, r27, 1 511 st8 [r32] = r19 512 (p7) sub r8 = r19, r27 513 mov ar.lc = r2 514 br.ret.sptk.many b0 515 EPILOGUE() 516 ASM_END()