dnl  IA-64 mpn_addmul_2 -- Multiply a n-limb number with a 2-limb number and
dnl  add the result to a (n+1)-limb number.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2004, 2005, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    3.65
C Itanium 2:  1.625

C TODO
C  * Clean up variable names, and try to decrease the number of distinct
C    registers used.
C  * Clean up feed-in code to not require zeroing several registers.
C  * Make sure we don't depend on uninitialised predicate registers.
C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
C    wind-down code.
C  * Ultimately rewrite.  The problem with this code is that it first uses a
C    loaded u value in one xma pair, then leaves it live over several unrelated
C    xma pairs, before it uses it again.  It should actually be quite possible
C    to just swap some aligned xma pairs around.  But we should then schedule
C    u loads further from the first use.

C INPUT PARAMETERS
define(`rp',`r32')
define(`up',`r33')
define(`n',`r34')
define(`vp',`r35')

C srp is the read pointer into the rp operand; it is set to rp + 16 in the
C entry code and runs ahead of rp, which is used for the stores.
define(`srp',`r3')

C The two multiplier limbs, loaded from vp[0] and vp[1].
define(`v0',`f6')
define(`v1',`f7')

C Integer-side sum (s0 is what gets stored) and running accumulator.
define(`s0',`r14')
define(`acc0',`r15')

C Integer copies (via getfsig) of the xma results, four-way rotated:
C   pr0_k  <- fp0b_k,  pr1_k <- fp1b_k,  acc1_k <- fp2a_k
define(`pr0_0',`r16') define(`pr0_1',`r17')
define(`pr0_2',`r18') define(`pr0_3',`r19')

define(`pr1_0',`r20') define(`pr1_1',`r21')
define(`pr1_2',`r22') define(`pr1_3',`r23')

define(`acc1_0',`r24') define(`acc1_1',`r25')
define(`acc1_2',`r26') define(`acc1_3',`r27')

dnl define(`',`r28')
dnl define(`',`r29')
dnl define(`',`r30')
dnl define(`',`r31')

C Floating-point side, four-way rotated by limb index:
C   fp0b_k / fp1a_k  = xma.l / xma.hu of  u_k * v0 + r_k
C   fp1b_k / fp2a_k  = xma.l / xma.hu of  u_k * v1 + fp1a_k
define(`fp0b_0',`f8') define(`fp0b_1',`f9')
define(`fp0b_2',`f10') define(`fp0b_3',`f11')

define(`fp1a_0',`f12') define(`fp1a_1',`f13')
define(`fp1a_2',`f14') define(`fp1a_3',`f15')

define(`fp1b_0',`f32') define(`fp1b_1',`f33')
define(`fp1b_2',`f34') define(`fp1b_3',`f35')

define(`fp2a_0',`f36') define(`fp2a_1',`f37')
define(`fp2a_2',`f38') define(`fp2a_3',`f39')

C Loaded limbs of the rp operand (r_k) and of the up operand (u_k).
define(`r_0',`f40') define(`r_1',`f41')
define(`r_2',`f42') define(`r_3',`f43')

define(`u_0',`f44') define(`u_1',`f45')
define(`u_2',`f46') define(`u_3',`f47')

C First two limbs of each operand, loaded by the entry code.
define(`rx',`f48')
define(`ux',`f49')
define(`ry',`f50')
define(`uy',`f51')

ASM_START()

C mpn_addmul_2s -- alternate entry point.
C
C Performs the same loads and n mod 4 dispatch as mpn_addmul_2 below, but goes
C through the stubs L(x00), L(x01), L(x10), L(x11).  Each stub clears the
C predicate (p6, p14, p11 or p15) that guards the first ux*v1 xma pair of the
C matching feed-in block, sets the fp2a register that pair would have written
C to f0, and then branches into the shared feed-in code of mpn_addmul_2.
C NOTE(review): the intended caller and the exact contract of this variant are
C not visible in this file; confirm before relying on it.
PROLOGUE(mpn_addmul_2s)
	.prologue
	.save	ar.lc, r2
	.body

C For the 32-bit ABI the pointer arguments go through addp4 and n is
C zero-extended from 32 bits.
ifdef(`HAVE_ABI_32',`
{.mmi;		addp4	rp = 0, rp		C M I
		addp4	up = 0, up		C M I
		addp4	vp = 0, vp		C M I
}{.mmi;		nop	1
		nop	1
		zxt4	n = n			C I
	;;
}')

C Load ux, uy, rx, ry, v0, v1; save ar.lc in r2; r14 = n mod 4;
C ar.lc = (n - 2) >> 2; srp = rp + 16 (rp itself is stepped +8 then -8).
{.mmi;		ldf8	ux = [up], 8		C M
		ldf8	v0 = [vp], 8		C M
		mov	r2 = ar.lc		C I0
}{.mmi;		ldf8	rx = [rp], 8		C M
		and	r14 = 3, n		C M I
		add	n = -2, n		C M I
	;;
}{.mmi;		ldf8	uy = [up], 8		C M
		ldf8	v1 = [vp]		C M
		shr.u	n = n, 2		C I0
}{.mmi;		ldf8	ry = [rp], -8		C M
		cmp.eq	p14, p0 = 1, r14	C M I
		cmp.eq	p11, p0 = 2, r14	C M I
	;;
}{.mmi;		add	srp = 16, rp		C M I
		cmp.eq	p15, p0 = 3, r14	C M I
		mov	ar.lc = n		C I0
}{.bbb;	(p14)	br.dptk	L(x01)			C B
	(p11)	br.dptk	L(x10)			C B
	(p15)	br.dptk	L(x11)			C B
	;;
}
L(x00):		cmp.ne	p6, p0 = r0, r0		C suppress initial xma pair
		mov	fp2a_3 = f0
		br	L(b00)
L(x01):		cmp.ne	p14, p0 = r0, r0	C suppress initial xma pair
		mov	fp2a_2 = f0
		br	L(b01)
L(x10):		cmp.ne	p11, p0 = r0, r0	C suppress initial xma pair
		mov	fp2a_1 = f0
		br	L(b10)
L(x11):		cmp.ne	p15, p0 = r0, r0	C suppress initial xma pair
		mov	fp2a_0 = f0
		br	L(b11)

EPILOGUE()

C mpn_addmul_2 (rp, up, n, vp)
C
C Per the file header: multiply the n-limb number at up by the 2-limb number
C at vp and add the result to the number at rp.
C
C In:   r32 = rp, r33 = up, r34 = n, r35 = vp.
C Out:  r8 is written from fp2a_2 and incremented on a final carry just
C       before br.ret.
C Uses: ar.lc as the loop counter; its entry value is kept in r2 and written
C       back before returning.
C
C The entry code computes r14 = n mod 4 and ar.lc = (n - 2) >> 2, then
C dispatches to one of four feed-in blocks: L(b01), L(b10), L(b11), or by
C fall-through L(b00).  Each feed-in block primes the software pipeline and
C either enters the 4-limb main loop or, when br.cloop falls through, jumps to
C the matching point of the wind-down code.
C NOTE(review): the entry code loads two u limbs unconditionally and shifts
C n - 2 as unsigned, so n >= 2 is presumably required; confirm with callers.
PROLOGUE(mpn_addmul_2)
	.prologue
	.save	ar.lc, r2
	.body

ifdef(`HAVE_ABI_32',`
{.mmi;		addp4	rp = 0, rp		C M I
		addp4	up = 0, up		C M I
		addp4	vp = 0, vp		C M I
}{.mmi;		nop	1
		nop	1
		zxt4	n = n			C I
	;;
}')

{.mmi;		ldf8	ux = [up], 8		C M
		ldf8	v0 = [vp], 8		C M
		mov	r2 = ar.lc		C I0
}{.mmi;		ldf8	rx = [rp], 8		C M
		and	r14 = 3, n		C M I
		add	n = -2, n		C M I
	;;
}{.mmi;		ldf8	uy = [up], 8		C M
		ldf8	v1 = [vp]		C M
		shr.u	n = n, 2		C I0
}{.mmi;		ldf8	ry = [rp], -8		C M
		cmp.eq	p14, p0 = 1, r14	C M I
		cmp.eq	p11, p0 = 2, r14	C M I
	;;
C p6 is the complement of p15 here, so it is set whenever L(b00) is reached
C from this entry point; it guards the first ux*v1 xma pair there.
}{.mmi;		add	srp = 16, rp		C M I
		cmp.eq	p15, p6 = 3, r14	C M I
		mov	ar.lc = n		C I0
}{.bbb;	(p14)	br.dptk	L(b01)			C B
	(p11)	br.dptk	L(b10)			C B
	(p15)	br.dptk	L(b11)			C B
	;;
}

C Feed-in for n mod 4 == 0.  Zeroes the accumulator registers the first
C pipeline stages read and clears the carry predicates p8 and p12.
	ALIGN(32)
L(b00):
{.mmi;		ldf8	r_1 = [srp], 8
		ldf8	u_1 = [up], 8
		mov	acc1_2 = 0
}{.mmi;		mov	pr1_2 = 0
		mov	pr0_3 = 0
		cmp.ne	p8, p9 = r0, r0
	;;
}{.mfi;		ldf8	r_2 = [srp], 8
		xma.l	fp0b_3 = ux, v0, rx
		cmp.ne	p12, p13 = r0, r0
}{.mfb;		ldf8	u_2 = [up], 8
		xma.hu	fp1b_3 = ux, v0, rx
		br.cloop.dptk	L(gt4)
}
C Loop count exhausted: finish via L(cj4) without entering the main loop.
		xma.l	fp0b_0 = uy, v0, ry
		xma.hu	fp1a_0 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_3
	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3	C suppressed for addmul_2s
	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3	C suppressed for addmul_2s
	;;
		xma.l	fp0b_1 = u_1, v0, r_1
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;
		getfsig	pr0_0 = fp0b_0
		xma.l	fp1b_0 = uy, v1, fp1a_0
		xma.hu	fp2a_0 = uy, v1, fp1a_0
	;;
		getfsig	pr1_3 = fp1b_3
		getfsig	acc1_3 = fp2a_3
		xma.l	fp0b_2 = u_2, v0, r_2
		xma.hu	fp1a_2 = u_2, v0, r_2
		br	L(cj4)

L(gt4):		xma.l	fp0b_0 = uy, v0, ry
		xma.hu	fp1a_0 = uy, v0, ry
	;;
		ldf8	r_3 = [srp], 8
		getfsig	acc0 = fp0b_3
	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3	C suppressed for addmul_2s
		ldf8	u_3 = [up], 8
	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3	C suppressed for addmul_2s
	;;
		xma.l	fp0b_1 = u_1, v0, r_1
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;
		ldf8	r_0 = [srp], 8
		getfsig	pr0_0 = fp0b_0
		xma.l	fp1b_0 = uy, v1, fp1a_0
		xma.hu	fp2a_0 = uy, v1, fp1a_0
	;;
		ldf8	u_0 = [up], 8
		getfsig	pr1_3 = fp1b_3
		xma.l	fp0b_2 = u_2, v0, r_2
	;;
		getfsig	acc1_3 = fp2a_3
		xma.hu	fp1a_2 = u_2, v0, r_2
		br	L(00)


C Feed-in for n mod 4 == 1.  Clears the carry predicates p6 and p10.
	ALIGN(32)
L(b01):
{.mmi;		ldf8	r_0 = [srp], 8		C M
		ldf8	u_0 = [up], 8		C M
		mov	acc1_1 = 0		C M I
}{.mmi;		mov	pr1_1 = 0		C M I
		mov	pr0_2 = 0		C M I
		cmp.ne	p6, p7 = r0, r0		C M I
	;;
}{.mfi;		ldf8	r_1 = [srp], 8		C M
		xma.l	fp0b_2 = ux, v0, rx	C F
		cmp.ne	p10, p11 = r0, r0	C M I
}{.mfi;		ldf8	u_1 = [up], 8		C M
		xma.hu	fp1b_2 = ux, v0, rx	C F
		nop	1
	;;
}		xma.l	fp0b_3 = uy, v0, ry	C F
		xma.hu	fp1a_3 = uy, v0, ry	C F
	;;
{.mmf;		getfsig	acc0 = fp0b_2		C M
		ldf8	r_2 = [srp], 8		C M
	(p14)	xma.hu	fp2a_2 = ux, v1,fp1b_2	C F suppressed for addmul_2s
}{.mfb;		ldf8	u_2 = [up], 8		C M
	(p14)	xma.l	fp1b_2 = ux, v1,fp1b_2	C F suppressed for addmul_2s
		br.cloop.dptk	L(gt5)
}
C Loop count exhausted: finish via L(cj5) without entering the main loop.
		xma.l	fp0b_0 = u_0, v0, r_0	C F
		xma.hu	fp1a_0 = u_0, v0, r_0	C F
	;;
		getfsig	pr0_3 = fp0b_3		C M
		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
	;;
		getfsig	pr1_2 = fp1b_2		C M
		getfsig	acc1_2 = fp2a_2		C M
		xma.l	fp0b_1 = u_1, v0, r_1	C F
		xma.hu	fp1a_1 = u_1, v0, r_1	C F
		br	L(cj5)

L(gt5):		xma.l	fp0b_0 = u_0, v0, r_0
		xma.hu	fp1a_0 = u_0, v0, r_0
	;;
		getfsig	pr0_3 = fp0b_3
		ldf8	r_3 = [srp], 8
		xma.l	fp1b_3 = uy, v1, fp1a_3
		xma.hu	fp2a_3 = uy, v1, fp1a_3
	;;
		ldf8	u_3 = [up], 8
		getfsig	pr1_2 = fp1b_2
		xma.l	fp0b_1 = u_1, v0, r_1
	;;
		getfsig	acc1_2 = fp2a_2
		xma.hu	fp1a_1 = u_1, v0, r_1
		br	L(01)


C Feed-in for n mod 4 == 2.  When br.cloop falls through, the whole
C computation is done inline here and the function returns directly: the low
C limb is stored straight from fp0b_1, two more limbs are stored with st8, and
C r8 is set from fp2a_2 plus the final carry.
	ALIGN(32)
L(b10):		br.cloop.dptk	L(gt2)
		xma.l	fp0b_1 = ux, v0, rx
		xma.hu	fp1b_1 = ux, v0, rx
	;;
		xma.l	fp0b_2 = uy, v0, ry
		xma.hu	fp1a_2 = uy, v0, ry
	;;
		stf8	[rp] = fp0b_1, 8
	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1	C suppressed for addmul_2s
	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1	C suppressed for addmul_2s
	;;
		getfsig	acc0 = fp0b_2
		xma.l	fp1b_2 = uy, v1, fp1a_2
		xma.hu	fp2a_2 = uy, v1, fp1a_2
	;;
		getfsig	pr1_1 = fp1b_1
		getfsig	acc1_1 = fp2a_1
		mov	ar.lc = r2
		getfsig	pr1_2 = fp1b_2
		getfsig	r8 = fp2a_2
	;;
		add	s0 = pr1_1, acc0
	;;
		st8	[rp] = s0, 8
		cmp.ltu	p8, p9 = s0, pr1_1
		sub	r31 = -1, acc1_1	C r31 = -1 - acc1_1, used to detect carry out
	;;
	.pred.rel "mutex", p8, p9
	(p8)	add	acc0 = pr1_2, acc1_1, 1
	(p9)	add	acc0 = pr1_2, acc1_1
	(p8)	cmp.leu	p10, p0 = r31, pr1_2
	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
	;;
		st8	[rp] = acc0, 8
	(p10)	add	r8 = 1, r8
		br.ret.sptk.many b0


L(gt2):
{.mmi;		ldf8	r_3 = [srp], 8
		ldf8	u_3 = [up], 8
		mov	acc1_0 = 0
	;;
}{.mfi;		ldf8	r_0 = [srp], 8
		xma.l	fp0b_1 = ux, v0, rx
		mov	pr1_0 = 0
}{.mfi;		ldf8	u_0 = [up], 8
		xma.hu	fp1b_1 = ux, v0, rx
		mov	pr0_1 = 0
	;;
}		xma.l	fp0b_2 = uy, v0, ry
		xma.hu	fp1a_2 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_1
		ldf8	r_1 = [srp], 8
	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1	C suppressed for addmul_2s
	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1	C suppressed for addmul_2s
	;;
		ldf8	u_1 = [up], 8
		xma.l	fp0b_3 = u_3, v0, r_3
		xma.hu	fp1a_3 = u_3, v0, r_3
	;;
		getfsig	pr0_2 = fp0b_2
		ldf8	r_2 = [srp], 8
		xma.l	fp1b_2 = uy, v1, fp1a_2
		xma.hu	fp2a_2 = uy, v1, fp1a_2
	;;
		ldf8	u_2 = [up], 8
		getfsig	pr1_1 = fp1b_1
	;;
{.mfi;		getfsig	acc1_1 = fp2a_1
		xma.l	fp0b_0 = u_0, v0, r_0
		cmp.ne	p8, p9 = r0, r0
}{.mfb;		cmp.ne	p12, p13 = r0, r0
		xma.hu	fp1a_0 = u_0, v0, r_0
		br.cloop.sptk.clr L(top)
}
		br.many	L(end)


C Feed-in for n mod 4 == 3.
	ALIGN(32)
L(b11):		ldf8	r_2 = [srp], 8
		mov	pr1_3 = 0
		mov	pr0_0 = 0
	;;
		ldf8	u_2 = [up], 8
		mov	acc1_3 = 0
		br.cloop.dptk	L(gt3)
	;;
C Loop count exhausted: finish via L(cj3) without entering the main loop.
		cmp.ne	p6, p7 = r0, r0
		xma.l	fp0b_0 = ux, v0, rx
		xma.hu	fp1b_0 = ux, v0, rx
	;;
		cmp.ne	p10, p11 = r0, r0
		xma.l	fp0b_1 = uy, v0, ry
		xma.hu	fp1a_1 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_0
	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0	C suppressed for addmul_2s
	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0	C suppressed for addmul_2s
	;;
C Note the operand pairing in the next two xma pairs differs from L(gt3):
C here uy*v1 takes r_2 as addend and u_2*v0 takes fp1a_1.  Both products and
C both addends belong to the same limb position, so the limb sums and the
C high parts passed on are the same in total.
		xma.l	fp0b_2 = uy, v1, r_2
		xma.hu	fp1a_2 = uy, v1, r_2
	;;
		getfsig	pr0_1 = fp0b_1
		xma.l	fp1b_1 = u_2, v0, fp1a_1
		xma.hu	fp2a_1 = u_2, v0, fp1a_1
	;;
		getfsig	pr1_0 = fp1b_0
		getfsig	acc1_0 = fp2a_0
		br	L(cj3)

L(gt3):		ldf8	r_3 = [srp], 8
		xma.l	fp0b_0 = ux, v0, rx
		cmp.ne	p10, p11 = r0, r0
		ldf8	u_3 = [up], 8
		xma.hu	fp1b_0 = ux, v0, rx
		cmp.ne	p6, p7 = r0, r0
	;;
		xma.l	fp0b_1 = uy, v0, ry
		xma.hu	fp1a_1 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_0
		ldf8	r_0 = [srp], 8
	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0	C suppressed for addmul_2s
		ldf8	u_0 = [up], 8
	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0	C suppressed for addmul_2s
	;;
		xma.l	fp0b_2 = u_2, v0, r_2
		xma.hu	fp1a_2 = u_2, v0, r_2
	;;
		getfsig	pr0_1 = fp0b_1
		ldf8	r_1 = [srp], 8
		xma.l	fp1b_1 = uy, v1, fp1a_1
		xma.hu	fp2a_1 = uy, v1, fp1a_1
	;;
		ldf8	u_1 = [up], 8
		getfsig	pr1_0 = fp1b_0
	;;
		getfsig	acc1_0 = fp2a_0
		xma.l	fp0b_3 = u_3, v0, r_3
		xma.hu	fp1a_3 = u_3, v0, r_3
		br	L(11)


C Main loop: four limbs per iteration (four stores of s0 through rp).
C
C For limb index k (mod 4) the floating-point side computes
C   fp0b_k / fp1a_k = low / high of  u_k * v0 + r_k
C   fp1b_k / fp2a_k = low / high of  u_k * v1 + fp1a_k
C and getfsig copies them to pr0_k, pr1_k and acc1_k.  The integer side forms
C   acc0 = pr0_k + acc1_(k-2) + carry
C   s0   = pr1_(k-1) + acc0 + carry
C and stores s0.  Carries are kept in the predicate pairs p6/p7, p8/p9,
C p10/p11 and p12/p13; the carry out is computed with cmp.leu when a carry-in
C of 1 was added and with cmp.ltu otherwise.
C
C L(01), L(00) and L(11) are the entry points used by the feed-in blocks;
C L(gt2) enters at L(top).
C *** MAIN LOOP START ***
	ALIGN(32)
L(top):						C 00
	.pred.rel "mutex", p12, p13
		getfsig	pr0_3 = fp0b_3
		ldf8	r_3 = [srp], 8
		xma.l	fp1b_3 = u_3, v1, fp1a_3
	(p12)	add	s0 = pr1_0, acc0, 1
	(p13)	add	s0 = pr1_0, acc0
		xma.hu	fp2a_3 = u_3, v1, fp1a_3
	;;					C 01
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
		ldf8	u_3 = [up], 8
		getfsig	pr1_2 = fp1b_2
	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
	(p12)	cmp.leu	p10, p11 = s0, pr1_0
	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
	;;					C 02
	.pred.rel "mutex", p6, p7
		getfsig	acc1_2 = fp2a_2
		st8	[rp] = s0, 8
		xma.l	fp0b_1 = u_1, v0, r_1
	(p6)	add	acc0 = pr0_2, acc1_0, 1
	(p7)	add	acc0 = pr0_2, acc1_0
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;					C 03
L(01):
	.pred.rel "mutex", p10, p11
		getfsig	pr0_0 = fp0b_0
		ldf8	r_0 = [srp], 8
		xma.l	fp1b_0 = u_0, v1, fp1a_0
	(p10)	add	s0 = pr1_1, acc0, 1
	(p11)	add	s0 = pr1_1, acc0
		xma.hu	fp2a_0 = u_0, v1, fp1a_0
	;;					C 04
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
		ldf8	u_0 = [up], 8
		getfsig	pr1_3 = fp1b_3
	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
	(p10)	cmp.leu	p12, p13 = s0, pr1_1
	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
	;;					C 05
	.pred.rel "mutex", p8, p9
		getfsig	acc1_3 = fp2a_3
		st8	[rp] = s0, 8
		xma.l	fp0b_2 = u_2, v0, r_2
	(p8)	add	acc0 = pr0_3, acc1_1, 1
	(p9)	add	acc0 = pr0_3, acc1_1
		xma.hu	fp1a_2 = u_2, v0, r_2
	;;					C 06
L(00):
	.pred.rel "mutex", p12, p13
		getfsig	pr0_1 = fp0b_1
		ldf8	r_1 = [srp], 8
		xma.l	fp1b_1 = u_1, v1, fp1a_1
	(p12)	add	s0 = pr1_2, acc0, 1
	(p13)	add	s0 = pr1_2, acc0
		xma.hu	fp2a_1 = u_1, v1, fp1a_1
	;;					C 07
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
		ldf8	u_1 = [up], 8
		getfsig	pr1_0 = fp1b_0
	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
	(p12)	cmp.leu	p10, p11 = s0, pr1_2
	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
	;;					C 08
	.pred.rel "mutex", p6, p7
		getfsig	acc1_0 = fp2a_0
		st8	[rp] = s0, 8
		xma.l	fp0b_3 = u_3, v0, r_3
	(p6)	add	acc0 = pr0_0, acc1_2, 1
	(p7)	add	acc0 = pr0_0, acc1_2
		xma.hu	fp1a_3 = u_3, v0, r_3
	;;					C 09
L(11):
	.pred.rel "mutex", p10, p11
		getfsig	pr0_2 = fp0b_2
		ldf8	r_2 = [srp], 8
		xma.l	fp1b_2 = u_2, v1, fp1a_2
	(p10)	add	s0 = pr1_3, acc0, 1
	(p11)	add	s0 = pr1_3, acc0
		xma.hu	fp2a_2 = u_2, v1, fp1a_2
	;;					C 10
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
		ldf8	u_2 = [up], 8
		getfsig	pr1_1 = fp1b_1
	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
	(p10)	cmp.leu	p12, p13 = s0, pr1_3
	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
	;;					C 11
	.pred.rel "mutex", p8, p9
		getfsig	acc1_1 = fp2a_1
		st8	[rp] = s0, 8
		xma.l	fp0b_0 = u_0, v0, r_0
	(p8)	add	acc0 = pr0_1, acc1_3, 1
	(p9)	add	acc0 = pr0_1, acc1_3
		xma.hu	fp1a_0 = u_0, v0, r_0
L(10):		br.cloop.sptk.clr L(top)	C 12
	;;
C *** MAIN LOOP END ***

C Wind-down: same stage sequence as the loop body but without further loads,
C draining the pipeline.  L(cj5), L(cj4) and L(cj3) are the entry points used
C by the short paths of L(b01), L(b00) and L(b11) respectively.
L(end):
	.pred.rel "mutex", p12, p13
{.mfi;		getfsig	pr0_3 = fp0b_3
		xma.l	fp1b_3 = u_3, v1, fp1a_3
	(p12)	add	s0 = pr1_0, acc0, 1
}{.mfi;	(p13)	add	s0 = pr1_0, acc0
		xma.hu	fp2a_3 = u_3, v1, fp1a_3
		nop	1
	;;
}	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
{.mmi;		getfsig	pr1_2 = fp1b_2
		st8	[rp] = s0, 8
	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
	(p12)	cmp.leu	p10, p11 = s0, pr1_0
	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
	;;
}	.pred.rel "mutex", p6, p7
{.mfi;		getfsig	acc1_2 = fp2a_2
		xma.l	fp0b_1 = u_1, v0, r_1
		nop	1
}{.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
	(p7)	add	acc0 = pr0_2, acc1_0
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;
}
L(cj5):
	.pred.rel "mutex", p10, p11
{.mfi;		getfsig	pr0_0 = fp0b_0
		xma.l	fp1b_0 = u_0, v1, fp1a_0
	(p10)	add	s0 = pr1_1, acc0, 1
}{.mfi;	(p11)	add	s0 = pr1_1, acc0
		xma.hu	fp2a_0 = u_0, v1, fp1a_0
		nop	1
	;;
}	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
{.mmi;		getfsig	pr1_3 = fp1b_3
		st8	[rp] = s0, 8
	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
	(p10)	cmp.leu	p12, p13 = s0, pr1_1
	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
	;;
}	.pred.rel "mutex", p8, p9
{.mfi;		getfsig	acc1_3 = fp2a_3
		xma.l	fp0b_2 = u_2, v0, r_2
		nop	1
}{.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
	(p9)	add	acc0 = pr0_3, acc1_1
		xma.hu	fp1a_2 = u_2, v0, r_2
	;;
}
L(cj4):
	.pred.rel "mutex", p12, p13
{.mfi;		getfsig	pr0_1 = fp0b_1
		xma.l	fp1b_1 = u_1, v1, fp1a_1
	(p12)	add	s0 = pr1_2, acc0, 1
}{.mfi;	(p13)	add	s0 = pr1_2, acc0
		xma.hu	fp2a_1 = u_1, v1, fp1a_1
		nop	1
	;;
}	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
{.mmi;		getfsig	pr1_0 = fp1b_0
		st8	[rp] = s0, 8
	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
	(p12)	cmp.leu	p10, p11 = s0, pr1_2
	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
	;;
}	.pred.rel "mutex", p6, p7
{.mmi;		getfsig	acc1_0 = fp2a_0
	(p6)	add	acc0 = pr0_0, acc1_2, 1
	(p7)	add	acc0 = pr0_0, acc1_2
	;;
}
L(cj3):
	.pred.rel "mutex", p10, p11
{.mfi;		getfsig	pr0_2 = fp0b_2
		xma.l	fp1b_2 = u_2, v1, fp1a_2
	(p10)	add	s0 = pr1_3, acc0, 1
}{.mfi;	(p11)	add	s0 = pr1_3, acc0
		xma.hu	fp2a_2 = u_2, v1, fp1a_2
		nop	1
	;;
}	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
{.mmi;		getfsig	pr1_1 = fp1b_1
		st8	[rp] = s0, 8
	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
	(p10)	cmp.leu	p12, p13 = s0, pr1_3
	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
	;;
}	.pred.rel "mutex", p8, p9
{.mmi;		getfsig	acc1_1 = fp2a_1
	(p8)	add	acc0 = pr0_1, acc1_3, 1
	(p9)	add	acc0 = pr0_1, acc1_3
	;;
}	.pred.rel "mutex", p12, p13
{.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
	(p13)	add	s0 = pr1_0, acc0
		nop	1
	;;
}	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
{.mmi;		getfsig	pr1_2 = fp1b_2
		st8	[rp] = s0, 8
	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
	(p12)	cmp.leu	p10, p11 = s0, pr1_0
	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
	;;
C r8 starts as the high half of the last u*v1 product (fp2a_2).
}	.pred.rel "mutex", p6, p7
{.mmi;		getfsig	r8 = fp2a_2
	(p6)	add	acc0 = pr0_2, acc1_0, 1
	(p7)	add	acc0 = pr0_2, acc1_0
	;;
}	.pred.rel "mutex", p10, p11
{.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
	(p11)	add	s0 = pr1_1, acc0
	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
	;;
}	.pred.rel "mutex", p10, p11
{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
	(p10)	cmp.leu	p12, p13 = s0, pr1_1
	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
	;;
}	.pred.rel "mutex", p8, p9
{.mmi;		st8	[rp] = s0, 8
	(p8)	add	acc0 = pr1_2, acc1_1, 1
	(p9)	add	acc0 = pr1_2, acc1_1
	;;
C Last stored limb: fold in the two outstanding carries (p8 and p12); p10
C ends up set when either addition wrapped, and then bumps r8.
C cmpeqor is not defined in this file; presumably an m4 macro supplied by the
C included definitions -- confirm.
}	.pred.rel "mutex", p8, p9
{.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
	(p12)	add	acc0 = 1, acc0
	;;
}{.mmi;		st8	[rp] = acc0, 8
	(p12)	cmpeqor	p10, p0 = 0, acc0
		nop	1
	;;
}{.mib;	(p10)	add	r8 = 1, r8
		mov	ar.lc = r2
		br.ret.sptk.many b0
}
EPILOGUE()
ASM_END()