dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/divrem_1.asm
dnl
dnl  IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
dnl  unnormalized limb.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2002, 2004, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C         cycles/limb
C Itanium:    40-42
C Itanium 2:  29-30

C This was generated by gcc, then the loops were optimized.  The preinv entry
C point was shoehorned into the file.  Lots of things outside the loops could
C be streamlined.  It would probably be a good idea to merge the loops for
C normalized and unnormalized divisor, since the shifting stuff is done for
C free in parallel with other operations.  It would even be possible to merge
C all loops, if the ld8 were made conditional.
C TODO
C  * For the normalized mpn_divrem_1 entry, consider postponing the inversion
C    until the leading quotient limb has been computed.
C  * Inline the limb inversion code and interleave it with the loop setup.

ASM_START()

C mpn_invert_limb is imported; the HP assembler wants it declared explicitly.
        .global mpn_invert_limb
        .type   mpn_invert_limb,@function

C INPUT PARAMETERS
C   r32 = rp
C   r33 = qxn
C   r34 = up
C   r35 = n
C   r36 = vl
C   r37 = vlinv   (preinv only)
C   r38 = cnt     (preinv only)
C
C REGISTERS USED BY THE CODE SHARED BETWEEN BOTH ENTRY POINTS
C   r8            vlinv on the preinv entry; read right after the calls to
C                 mpn_invert_limb (presumably its return value); holds the
C                 function result on exit
C   r38 / f7      running remainder (nh)
C   r39           dividend limb loaded most recently (unnormalized paths)
C   r40           cnt; the final remainder is moved right by this amount
C   r41 r42 r44   saved b0, ar.pfs and ar.lc
C   f6 f10 f12    copies of r8, of the divisor r36, and the constant 1
C
C Both entry points share the labels below: the preinv entry only does its
C own setup and then branches into the body of mpn_divrem_1.

PROLOGUE(mpn_preinv_divrem_1)
        .prologue
        .save   ar.pfs, r42
        alloc           r42 = ar.pfs, 7, 8, 1, 0
        .save   ar.lc, r44
        mov             r44 = ar.lc
        .save   rp, r41
        mov             r41 = b0
        .body
C With HAVE_ABI_32 the two pointers go through addp4 and the two sizes are
C sign-extended from 32 bits.
ifdef(`HAVE_ABI_32',
`       addp4           r32 = 0, r32
        sxt4            r33 = r33
        addp4           r34 = 0, r34
        sxt4            r35 = r35
        ;;
')
        mov             r40 = r38               C park cnt in r40, r38 becomes r
        shladd          r34 = r35, 3, r34       C r34 = up + n
        ;;
        adds            r34 = -8, r34           C r34 = up + n - 1
        ;;
        ld8             r39 = [r34], -8         C r39 = high limb, r34 steps down
        ;;

        add             r15 = r35, r33          C r15 = n + qxn
        ;;
        mov             r8 = r37                C r8 = vlinv
        shladd          r32 = r15, 3, r32       C r32 = rp + n + qxn
        cmp.le          p8, p0 = 0, r36         C p8: vl >= 0 as a signed value
        ;;
        adds            r32 = -8, r32           C r32 = rp + n + qxn - 1
        cmp.leu         p6, p7 = r36, r39       C p6: vl <= high limb
(p8)    br.cond.dpnt    .Lpre_unnorm
        ;;

C Top bit of vl set: the leading quotient limb is 1 when p6, else 0.
(p6)    addl            r15 = 1, r0
(p7)    mov             r15 = r0
        ;;
(p6)    sub             r38 = r39, r36          C r = high limb - vl
(p7)    mov             r38 = r39               C r = high limb
        st8             [r32] = r15, -8
        adds            r35 = -2, r35           C un -= 2
        br      .Lnorm_go

.Lpre_unnorm:
(p6)    add             r34 = 8, r34            C step back, limb is loaded again later
        mov             r38 = 0                 C r = 0
        shl             r36 = r36, r40          C vl <<= cnt
(p6)    br.cond.dptk    .Lunnorm_go
        ;;
        shl             r38 = r39, r40          C r = ahigh << cnt
        cmp.ne          p8, p0 = 1, r35         C p8: n != 1
        st8             [r32] = r0, -8          C leading quotient limb is 0
        adds            r35 = -1, r35           C un--
(p8)    br.cond.dpnt    .Lunnorm_go

C n was 1: only the qxn fraction limbs remain.
        mov             r23 = 1
        ;;
        setf.sig        f6 = r8
        setf.sig        f12 = r23
        br              .Lfraction
EPILOGUE()


PROLOGUE(mpn_divrem_1)
        .prologue
        .save   ar.pfs, r42
        alloc           r42 = ar.pfs, 5, 8, 1, 0
        .save   ar.lc, r44
        mov             r44 = ar.lc
        .save   rp, r41
        mov             r41 = b0
        .body
C With HAVE_ABI_32 the two pointers go through addp4 and the two sizes are
C sign-extended from 32 bits.
ifdef(`HAVE_ABI_32',
`       addp4           r32 = 0, r32
        sxt4            r33 = r33
        addp4           r34 = 0, r34
        sxt4            r35 = r35
        ;;
')
        mov             r38 = r0                C r = 0
        add             r15 = r35, r33          C r15 = n + qxn
        ;;
        cmp.ne          p6, p7 = 0, r15
        ;;
(p7)    mov             r8 = r0                 C n + qxn == 0: result is 0
(p7)    br.cond.dpnt    .Lret
        shladd          r14 = r15, 3, r32       C r14 = rp + n + qxn
        cmp.le          p6, p7 = 0, r36         C p6: vl >= 0 as a signed value
        ;;
        adds            r32 = -8, r14           C r32 = rp + n + qxn - 1
(p6)    br.cond.dpnt    .Lunnorm
        cmp.eq          p6, p7 = 0, r35         C p6: n == 0
(p6)    br.cond.dpnt    .Linvert
        shladd          r14 = r35, 3, r34       C r14 = up + n
        ;;
        adds            r14 = -8, r14           C r14 = up + n - 1
        adds            r35 = -1, r35
        ;;
        ld8             r38 = [r14]             C r = high limb
        ;;
        cmp.leu         p6, p7 = r36, r38       C p6: vl <= high limb
        ;;
(p6)    addl            r15 = 1, r0
(p7)    mov             r15 = r0
        ;;
        st8             [r32] = r15, -8         C leading quotient limb, 1 or 0
(p6)    sub             r38 = r38, r36          C r -= vl

.Linvert:
        mov             r45 = r36               C r45 is the one output register
        adds            r35 = -1, r35
        br.call.sptk.many b0 = mpn_invert_limb
        ;;
        shladd          r34 = r35, 3, r34       C r34 = up + (adjusted n)
.Lnorm_go:
        mov             r23 = 1
        ;;
        setf.sig        f6 = r8
        setf.sig        f12 = r23
        cmp.le          p6, p7 = 0, r35         C p7: no integer limbs left
        mov             r40 = 0                 C cnt = 0 on this path
(p7)    br.cond.dpnt    .Lfraction
        setf.sig        f10 = r36
        mov             ar.lc = r35
        setf.sig        f7 = r38
        ;;
        sub             r28 = -1, r36           C looks unused: r28 is recomputed in the loop before any read

C Quotient limbs, divisor with top bit set.
C
C Each pass: f11 is the high half of nh * f6, and f8 = f11 + nh is the
C quotient estimate q.  q * d is then formed as the pair f9:f8 and taken
C away from nh:n0, which leaves rH:rL in r16:r15.  Up to three fix-up steps
C follow, each one adding 1 to q and subtracting d from rL.
C
C The bare numbers next to the stops are carried over from the original
C schedule annotations (presumably cycle numbers).
.Lloop_norm:    C 00                            C q=r18 nh=r38/f7
        ld8             r20 = [r34], -8         C n0 = next limb
        xma.hu          f11 = f7, f6, f0
        ;;      C 04
        xma.l           f8 = f11, f12, f7       C q = q + nh
        ;;      C 08
        getf.sig        r18 = f8
        xma.hu          f9 = f8, f10, f0
        xma.l           f8 = f8, f10, f0
        ;;      C 12
        getf.sig        r16 = f9
                C 13
        getf.sig        r15 = f8
        ;;      C 18
        cmp.ltu         p6, p7 = r20, r15       C p6: borrow out of the low word
        sub             r15 = r20, r15
        sub             r16 = r38, r16
        ;;      C 19
(p6)    cmp.ne          p8, p9 = 1, r16         C rH nonzero after the borrow?
(p7)    cmp.ne          p8, p9 = 0, r16         C rH nonzero?
(p6)    add             r16 = -1, r16
(p0)    cmp.ne.unc      p6, p7 = r0, r0
        ;;      C 20
(p8)    cmp.ltu         p6, p7 = r15, r36
(p8)    sub             r15 = r15, r36
(p8)    add             r18 = 1, r18            C q++, taken when rH > 0
        ;;      C 21
        .pred.rel "mutex",p6,p7
(p6)    cmp.ne          p8, p9 = 1, r16         C rH still nonzero?
(p7)    cmp.ne          p8, p9 = 0, r16         C rH still nonzero?
        cmp.ltu         p6, p7 = r15, r36       C speculative
        sub             r28 = r15, r36          C speculative, only feeds the cmp
        ;;      C 22
(p8)    cmp.ltu         p6, p7 = r28, r36       C repeat the compare if needed
(p8)    mov             r15 = r28
(p8)    add             r18 = 1, r18            C q++, taken when rH > 0
        ;;      C 23
(p6)    setf.sig        f7 = r15
(p7)    sub             r15 = r15, r36
(p7)    add             r18 = 1, r18            C q++, taken when rL >= d
        ;;      C 24
(p7)    setf.sig        f7 = r15
        st8             [r32] = r18, -8
        mov             r38 = r15
        br.cloop.dptk   .Lloop_norm
                C 29/30
        br.sptk         .Lfraction
        ;;

C mpn_divrem_1 with the top bit of vl clear.
.Lunnorm:
        mux1            r16 = r36, @rev         C r16 = vl with its bytes reversed
        cmp.eq          p6, p7 = 0, r35         C p6: n == 0
(p6)    br.cond.dpnt    .Lfind_cnt
        shladd          r34 = r35, 3, r34       C r34 = up + n
        ;;
        adds            r34 = -8, r34           C r34 = up + n - 1
        ;;
        ld8             r39 = [r34]             C r39 = high limb
        ;;
        cmp.leu         p6, p7 = r36, r39       C p6: vl <= high limb
(p6)    br.cond.dptk    .Lfind_cnt
        adds            r34 = -8, r34
        ;;
        mov             r38 = r39               C high limb < vl: it becomes r
        ;;
        cmp.ne          p6, p7 = 1, r15         C p7: n + qxn == 1
        st8             [r32] = r0, -8          C leading quotient limb is 0
        ;;
(p7)    mov             r8 = r38
(p7)    br.cond.dpnt    .Lret
        adds            r35 = -1, r35

C Derive cnt in r40.  czx1.l applied to (r16 | -r16) picks a byte position
C and r16 becomes 8 * position - 8; that byte of vl is then narrowed by
C 4, 2 and 1 bit steps.  The outcome appears to be cnt = 63 minus the
C position of the highest set bit of vl.
.Lfind_cnt:
        sub             r14 = r0, r16
        ;;
        or              r14 = r16, r14
        ;;
        mov             r16 = -8
        czx1.l          r14 = r14
        ;;
        shladd          r16 = r14, 3, r16
        ;;
        shr.u           r14 = r36, r16
        ;;
        cmp.geu         p6, p7 = 15, r14
        ;;
(p7)    shr.u           r14 = r14, 4
(p7)    adds            r16 = 4, r16
        ;;
        cmp.geu         p6, p7 = 3, r14
        ;;
(p7)    shr.u           r14 = r14, 2
(p7)    adds            r16 = 2, r16
        ;;
        tbit.nz         p6, p7 = r14, 1
        ;;
        .pred.rel "mutex",p6,p7
(p6)    sub             r40 = 62, r16
(p7)    sub             r40 = 63, r16
        ;;
        shl             r45 = r36, r40          C argument for the call: vl << cnt
        shl             r36 = r36, r40          C vl <<= cnt
        shl             r38 = r38, r40          C r <<= cnt
        br.call.sptk.many b0 = mpn_invert_limb
        ;;
.Lunnorm_go:
        mov             r23 = 1
        ;;
        setf.sig        f6 = r8
        setf.sig        f12 = r23
        cmp.eq          p6, p7 = 0, r35         C p6: no integer limbs left
(p6)    br.cond.dpnt    .Lfraction
        sub             r16 = 64, r40           C r16 = 64 - cnt
        adds            r35 = -2, r35
        ;;
        ld8             r39 = [r34], -8
        cmp.le          p6, p7 = 0, r35         C p7: this is the only limb
        ;;
        shr.u           r14 = r39, r16
        ;;
        or              r38 = r14, r38          C r |= limb >> (64 - cnt)
(p7)    br.cond.dpnt    .Llast_limb
        ;;
        mov             r22 = r16
        setf.sig        f10 = r36
        setf.sig        f7 = r38
        mov             ar.lc = r35
        ;;

C Quotient limbs, divisor with top bit clear.  Same steps as the loop
C above, except that n0 is assembled from two neighbouring limbs:
C n0 = (r39 << cnt) | (r14 >> (64 - cnt)).
.Lloop_unnorm:
        ld8             r14 = [r34], -8         C next lower limb
        xma.hu          f11 = f7, f6, f0
        ;;
        xma.l           f8 = f11, f12, f7       C q = q + nh
        ;;
        getf.sig        r18 = f8
        xma.hu          f9 = f8, f10, f0
        shl             r20 = r39, r40
        xma.l           f8 = f8, f10, f0
        shr.u           r24 = r14, r22
        ;;
        getf.sig        r16 = f9
        getf.sig        r15 = f8
        or              r20 = r24, r20          C r20 = n0
        ;;
        cmp.ltu         p6, p7 = r20, r15       C p6: borrow out of the low word
        sub             r15 = r20, r15
        sub             r16 = r38, r16
        ;;
(p6)    cmp.ne          p8, p9 = 1, r16         C rH nonzero after the borrow?
(p7)    cmp.ne          p8, p9 = 0, r16         C rH nonzero?
(p6)    add             r16 = -1, r16
(p0)    cmp.ne.unc      p6, p7 = r0, r0
        ;;
(p8)    cmp.ltu         p6, p7 = r15, r36
(p8)    sub             r15 = r15, r36
(p8)    add             r18 = 1, r18            C q++, taken when rH > 0
        ;;
        .pred.rel "mutex",p6,p7
(p6)    cmp.ne          p8, p9 = 1, r16         C rH still nonzero?
(p7)    cmp.ne          p8, p9 = 0, r16         C rH still nonzero?
        cmp.ltu         p6, p7 = r15, r36       C speculative
        sub             r28 = r15, r36          C speculative, only feeds the cmp
        ;;
(p8)    cmp.ltu         p6, p7 = r28, r36       C repeat the compare if needed
(p8)    mov             r15 = r28
(p8)    add             r18 = 1, r18            C q++, taken when rH > 0
        ;;
(p6)    setf.sig        f7 = r15
(p7)    sub             r15 = r15, r36
(p7)    add             r18 = 1, r18            C q++, taken when rL >= d
        ;;
(p7)    setf.sig        f7 = r15
        st8             [r32] = r18, -8
        mov             r39 = r14
        mov             r38 = r15
        br.cloop.dptk   .Lloop_unnorm
        ;;

C Lowest integer limb for the divisor with top bit clear: n0 = r39 << cnt,
C with nothing to bring in from below.  The fix-up steps here are written
C without the speculative compare used in the loops.
.Llast_limb:
        setf.sig        f10 = r36
        setf.sig        f7 = r38
        ;;
        xma.hu          f11 = f7, f6, f0
        ;;
        xma.l           f8 = f11, f12, f7       C q = q + nh
        ;;
        getf.sig        r18 = f8
        xma.hu          f9 = f8, f10, f0
        shl             r20 = r39, r40
        xma.l           f8 = f8, f10, f0
        ;;
        getf.sig        r16 = f9
        getf.sig        r15 = f8
        ;;
        cmp.ltu         p6, p7 = r20, r15       C p6: borrow out of the low word
        sub             r15 = r20, r15
        sub             r16 = r38, r16
        ;;
(p6)    cmp.ne          p8, p9 = 1, r16         C rH nonzero after the borrow?
(p7)    cmp.ne          p8, p9 = 0, r16         C rH nonzero?
(p6)    add             r16 = -1, r16
(p0)    cmp.ne.unc      p6, p7 = r0, r0
        ;;
(p8)    cmp.ltu         p6, p7 = r15, r36
(p8)    sub             r15 = r15, r36
(p8)    add             r18 = 1, r18            C q++, taken when rH > 0
        ;;
        .pred.rel "mutex",p6,p7
(p6)    cmp.ne          p8, p9 = 1, r16         C rH still nonzero?
(p7)    cmp.ne          p8, p9 = 0, r16         C rH still nonzero?
        ;;
(p8)    sub             r15 = r15, r36
(p8)    add             r18 = 1, r18            C q++, taken when rH > 0
        ;;
        cmp.ltu         p6, p7 = r15, r36
        ;;
(p7)    sub             r15 = r15, r36
(p7)    add             r18 = 1, r18            C q++, taken when rL >= d
        ;;
        st8             [r32] = r18, -8
        mov             r38 = r15

C Fraction limbs: qxn further quotient limbs are produced with n0 = 0.
.Lfraction:
        adds            r35 = -1, r33           C loop counter = qxn - 1
        cmp.le          p6, p7 = 1, r33         C p7: qxn < 1, nothing to do
(p7)    br.cond.dpnt    .Lfinish
        ;;
        setf.sig        f7 = r38
        setf.sig        f10 = r36
        mov             ar.lc = r35
        ;;
.Lloop_frac:
        xma.hu          f11 = f7, f6, f0
        ;;
        xma.l           f8 = f11, f12, f7       C q = q + nh
        ;;
        getf.sig        r18 = f8
        xma.hu          f9 = f8, f10, f0
        xma.l           f8 = f8, f10, f0
        ;;
        getf.sig        r16 = f9
        getf.sig        r15 = f8
        ;;
        cmp.ltu         p6, p7 = 0, r15         C p6: borrow, low product nonzero
        sub             r15 = 0, r15
        sub             r16 = r38, r16
        ;;
(p6)    cmp.ne          p8, p9 = 1, r16         C rH nonzero after the borrow?
(p7)    cmp.ne          p8, p9 = 0, r16         C rH nonzero?
(p6)    add             r16 = -1, r16
(p0)    cmp.ne.unc      p6, p7 = r0, r0
        ;;
(p8)    cmp.ltu         p6, p7 = r15, r36
(p8)    sub             r15 = r15, r36
(p8)    add             r18 = 1, r18            C q++, taken when rH > 0
        ;;
        .pred.rel "mutex",p6,p7
(p6)    cmp.ne          p8, p9 = 1, r16         C rH still nonzero?
(p7)    cmp.ne          p8, p9 = 0, r16         C rH still nonzero?
        cmp.ltu         p6, p7 = r15, r36       C speculative
        sub             r28 = r15, r36          C speculative, only feeds the cmp
        ;;
(p8)    cmp.ltu         p6, p7 = r28, r36       C repeat the compare if needed
(p8)    mov             r15 = r28
(p8)    add             r18 = 1, r18            C q++, taken when rH > 0
        ;;
(p6)    setf.sig        f7 = r15
(p7)    sub             r15 = r15, r36
(p7)    add             r18 = 1, r18            C q++, taken when rL >= d
        ;;
(p7)    setf.sig        f7 = r15
        st8             [r32] = r18, -8
        mov             r38 = r15
        br.cloop.dptk   .Lloop_frac
        ;;
.Lfinish:
        shr.u           r8 = r38, r40           C result = r >> cnt
.Lret:
        mov             ar.pfs = r42            C restore what the prologue saved
        mov             ar.lc = r44
        mov             b0 = r41
        br.ret.sptk.many b0
EPILOGUE()
ASM_END()