dnl  IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:      2.67
C Itanium 2:    1.25

C TODO
C  * Consider using special code for small n, using something like
C    "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
C  * The non-nc code was trimmed cycle for cycle to its current state.  It is
C    probably hard to save more than an odd cycle there.  The nc code is much
C    cruder (since tune/speed doesn't have any applicable direct measurements).
C  * Without the nc entry points, this becomes around 1800 bytes of object
C    code; the nc code adds over 1000 bytes.  We should perhaps sacrifice a
C    few cycles for the non-nc code and let it fall into the nc code.

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`vp', `r34')
define(`n', `r35')
define(`cy', `r36')

C Operation selection.  For each limb the code computes w = u ADDSUB v and
C then derives the carry (borrow) out of that limb with cmp.CND w, u:
C   add: carry  when w < u  (ltu)        sub: borrow when w > u  (gtu)
C When a carry comes in from the previous limb, INCR is added to w, and the
C limb additionally carries out when w equals LIM before that adjustment
C (all ones for add, zero for sub).  LIM2 is the same test expressed for a w
C that has already been adjusted, i.e. LIM2 = LIM + INCR.
ifdef(`OPERATION_add_n',`
  define(ADDSUB,	add)
  define(CND,		ltu)
  define(INCR,		1)
  define(LIM,		-1)
  define(LIM2,		0)
  define(func,    mpn_add_n)
  define(func_nc, mpn_add_nc)
')
ifdef(`OPERATION_sub_n',`
  define(ADDSUB,	sub)
  define(CND,		gtu)
  define(INCR,		-1)
  define(LIM,		0)
  define(LIM2,		-1)
  define(func,    mpn_sub_n)
  define(func_nc, mpn_sub_nc)
')

C Byte distance ahead of up/vp at which the main loop issues its lfetch
C prefetches (upadv/vpadv are initialised to up+PFDIST and vp+PFDIST).
define(PFDIST, 500)

C Some useful aliases for registers we use
define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
define(`rpx',`r3')
define(`upadv',`r20') define(`vpadv',`r21')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ASM_START()

C func_nc -- add/subtract n limbs at up and vp into rp, with a carry-in.
C
C Inputs:   rp, up, vp, n, cy as defined above.  Any nonzero cy counts as a
C           set carry (it is only ever tested with cmp.ne against 0).
C Output:   r8 = carry (borrow) out of the top limb, 0 or 1.
C Other:    ar.lc is saved in r2 here and restored at L(end).
C
C The first limb pair is loaded into r10/r11 and (n mod 8) selects one of
C eight feed-in sequences .Lc000 .. .Lc111.  The carry-in is kept in r23 for
C the sequences reached before the second load pair is issued, and in
C predicate p13 for .Lc000 and .Lc111.  p15 is set when n > 8 and p14 when
C n <= 8; several feed-in sequences use them to choose between entering the
C shared unrolled loop inside func and jumping straight to the L(cjN)
C wind-down code.  ar.lc receives (n - 6) >> 3.
PROLOGUE(func_nc)
	.prologue
	.save	ar.lc, r2
	.body
ifdef(`HAVE_ABI_32',`
	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	nop.i		0
	addp4		vp = 0, vp		C M I
	nop.m		0
	zxt4		n = n			C I
	;;
')

{.mmi;	ld8		r11 = [vp], 8		C M01
	ld8		r10 = [up], 8		C M01
	mov		r2 = ar.lc		C I0
}{.mmi;	and		r14 = 7, n		C M I
	cmp.lt		p15, p14 = 8, n		C M I
	add		n = -6, n		C M I
	;;
}{.mmi;	add		upadv = PFDIST, up	C Merging these lines into the feed-in
	add		vpadv = PFDIST, vp	C code could save a cycle per call at
	mov		r23 = cy		C the expense of code size.
	;;
}{.mmi;	cmp.eq		p6, p0 = 1, r14		C M I
	cmp.eq		p7, p0 = 2, r14		C M I
	cmp.eq		p8, p0 = 3, r14		C M I
}{.bbb;	(p6) br.dptk	.Lc001			C B
  (p7)	br.dptk		.Lc010			C B
  (p8)	br.dptk		.Lc011			C B
	;;
}{.mmi;	cmp.eq		p9, p0 = 4, r14		C M I
	cmp.eq		p10, p0 = 5, r14	C M I
	cmp.eq		p11, p0 = 6, r14	C M I
}{.bbb;	(p9) br.dptk	.Lc100			C B
  (p10)	br.dptk		.Lc101			C B
  (p11)	br.dptk		.Lc110			C B
	;;
}{.mmi;	ld8		r19 = [vp], 8		C M01
	ld8		r18 = [up], 8		C M01
	cmp.ne		p13, p0 = 0, cy		C copy cy to p13 M I
}{.mmb;	cmp.eq		p12, p0 = 7, r14	C M I
	nop		0
  (p12)	br.dptk		.Lc111			C B
	;;
}

C n mod 8 = 0.  Second limb pair is already in r18/r19; enters loop at L(m0).
.Lc000:
{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	add		vpadv = PFDIST, vp	C M I
	ld8		v0 = [vp], 8		C M01
	mov		ar.lc = n		C I0
}{.mmi;	ld8		u0 = [up], 8		C M01
	ADDSUB		w1 = r10, r11		C M I
	nop		0
	;;
}{.mmi;	add		upadv = PFDIST, up	C M I
	ld8		v1 = [vp], 8		C M01
	cmp.CND		p7, p0 = w1, r10	C M I
}{.mmi;	ld8		u1 = [up], 8		C M01
	ADDSUB		w2 = r18, r19		C M I
	add		rpx = 8, rp		C M I
	;;
}{.mmi;	ld8		v2 = [vp], 8		C M01
	cmp.CND		p8, p0 = w2, r18	C M I
  (p13)	cmpeqor		p7, p0 = LIM, w1	C M I
}{.mmi;	ld8		u2 = [up], 8		C M01
  (p13)	add		w1 = INCR, w1		C M I
	ADDSUB		w3 = u3, v3		C M I
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	cmp.CND		p9, p0 = w3, u3		C M I
  (p7)	cmpeqor		p8, p0 = LIM, w2	C M I
}{.mmb;	ld8		u3 = [up], 8		C M01
  (p7)	add		w2 = INCR, w2		C M I
	br		L(m0)
}

C n mod 8 = 1.  Without p15 this is the single-limb case and finishes at
C L(cj1); with p15 it continues at L(0) and enters the loop at L(c1).
.Lc001:
{.mmi;	(p15) ld8	v1 = [vp], 8		C M01
  (p15)	ld8		u1 = [up], 8		C M01
	ADDSUB		w0 = r10, r11		C M I
}{.mmb;	nop		0
	nop		0
  (p15)	br		L(0)
	;;
}{.mmi;	cmp.ne		p9, p0 = 0, r23		C M I
	mov		r8 = 0
	cmp.CND		p6, p0 = w0, r10	C M I
	;;
}{.mmb;	(p9) cmpeqor	p6, p0 = LIM, w0	C M I
  (p9)	add		w0 = INCR, w0		C M I
	br		L(cj1)			C B
}
L(0):
{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	mov		ar.lc = n		C I0
}{.mmi;	nop		0
	cmp.ne		p9, p0 = 0, r23		C M I
	nop		0
	;;
}{.mmi;	ld8		v0 = [vp], 8		C M01
	cmp.CND		p6, p0 = w0, r10	C M I
	add		rpx = 16, rp		C M I
}{.mmb;	ld8		u0 = [up], 8		C M01
	ADDSUB		w1 = u1, v1		C M I
	br		L(c1)			C B
}

C n mod 8 = 2.  Without p15 this finishes at L(cj2); with p15 it continues
C at L(1) and enters the loop at L(m23).
.Lc010:
{.mmi;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
	mov		r8 = 0			C M I
}{.mmb;	ADDSUB		w3 = r10, r11		C M I
	cmp.ne		p8, p0 = 0, r23		C M I
  (p15)	br		L(1)			C B
	;;
C In the short path below the carry-in is added to w3 one group before the
C wrap test, so that test compares the adjusted w3 against LIM2, not LIM.
}{.mmi;	cmp.CND		p9, p0 = w3, r10	C M I
	ADDSUB		w0 = u0, v0		C M I
  (p8)	add		w3 = INCR, w3		C M I
	;;
}{.mmb;	cmp.CND		p6, p0 = w0, u0		C M I
  (p8)	cmpeqor		p9, p0 = LIM2, w3	C M I
	br		L(cj2)			C B
}
L(1):
{.mmi;	ld8		v1 = [vp], 8		C M01
	ld8		u1 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	mov		ar.lc = n		C I0
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	cmp.CND		p9, p0 = w3, r10	C M I
	;;
}{.mmi;	(p8) cmpeqor	p9, p0 = LIM, w3	C M I
  (p8)	add		w3 = INCR, w3		C M I
	ADDSUB		w0 = u0, v0		C M I
}{.mmb;	add		rpx = 24, rp		C M I
	nop		0
	br		L(m23)			C B
}

C n mod 8 = 3.  Without p15 this finishes at L(cj3); with p15 it continues
C at L(2), stores the first limb itself and enters the loop at L(m23).
.Lc011:
{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	shr.u		n = n, 3		C I0
}{.mmi;	ADDSUB		w2 = r10, r11		C M I
	cmp.ne		p7, p0 = 0, r23		C M I
	nop		0
	;;
}{.mmb;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
  (p15)	br		L(2)			C B
}{.mmi;	cmp.CND		p8, p0 = w2, r10	C M I
	ADDSUB		w3 = u3, v3		C M I
	nop		0
	;;
}{.mmb;	(p7) cmpeqor	p8, p0 = LIM, w2	C M I
  (p7)	add		w2 = INCR, w2		C M I
	br		L(cj3)			C B
}
L(2):
{.mmi;	ld8		v1 = [vp], 8		C M01
	ld8		u1 = [up], 8		C M01
	ADDSUB		w3 = u3, v3		C M I
	;;
}{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	cmp.CND		p8, p0 = w2, r10	C M I
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	cmp.CND		p9, p0 = w3, u3		C M I
	mov		ar.lc = n		C I0
}{.mmi;	ld8		u3 = [up], 8		C M01
  (p7)	cmpeqor		p8, p0 = LIM, w2	C M I
  (p7)	add		w2 = INCR, w2		C M I
	;;
}{.mmi;	add		rpx = 32, rp		C M I
	st8		[rp] = w2, 8		C M23
  (p8)	cmpeqor		p9, p0 = LIM, w3	C M I
}{.mmb;	(p8) add	w3 = INCR, w3		C M I
	ADDSUB		w0 = u0, v0		C M I
	br		L(m23)
}

C n mod 8 = 4.  With p14 this finishes at L(cj4); otherwise it enters the
C loop at L(m4).
.Lc100:
{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	shr.u		n = n, 3		C I0
}{.mmi;	ADDSUB		w1 = r10, r11		C M I
	nop		0
	nop		0
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	add		rpx = 8, rp		C M I
}{.mmi;	cmp.ne		p6, p0 = 0, r23		C M I
	cmp.CND		p7, p0 = w1, r10	C M I
	nop		0
	;;
}{.mmi;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
	ADDSUB		w2 = u2, v2		C M I
}{.mmb;	(p6) cmpeqor	p7, p0 = LIM, w1	C M I
  (p6)	add		w1 = INCR, w1		C M I
  (p14)	br		L(cj4)
	;;
}{.mmi;	ld8		v1 = [vp], 8		C M01
	ld8		u1 = [up], 8		C M01
	mov		ar.lc = n		C I0
	;;
}{.mmi;	ld8		v2 = [vp], 8		C M01
	cmp.CND		p8, p0 = w2, u2		C M I
	nop		0
}{.mmi;	ld8		u2 = [up], 8		C M01
	nop		0
	ADDSUB		w3 = u3, v3		C M I
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	cmp.CND		p9, p0 = w3, u3		C M I
  (p7)	cmpeqor		p8, p0 = LIM, w2	C M I
}{.mmb;	ld8		u3 = [up], 8		C M01
  (p7)	add		w2 = INCR, w2		C M I
	br		L(m4)
}

C n mod 8 = 5.  ar.lc is written before the p15 test here, so the short
C path goes through L(end), which restores it, rather than through L(cj5).
.Lc101:
{.mmi;	ld8		v1 = [vp], 8		C M01
	ld8		u1 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	mov		ar.lc = n		C I0
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	ADDSUB		w0 = r10, r11		C M I
}{.mmi;	cmp.ne		p9, p0 = 0, r23		C M I
	add		rpx = 16, rp		C M I
	nop		0
	;;
}{.mmi;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
	cmp.CND		p6, p0 = w0, r10	C M I
}{.mbb;	ADDSUB		w1 = u1, v1		C M I
  (p15)	br		L(c5)			C B
	br		L(end)			C B
}

C n mod 8 = 6.  Enters the loop at L(m67).
.Lc110:
{.mmi;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	add		upadv = PFDIST, up	C M I
	add		vpadv = PFDIST, vp	C M I
	mov		ar.lc = n		C I0
}{.mmi;	ld8		v1 = [vp], 8		C M01
	ld8		u1 = [up], 8		C M01
	ADDSUB		w3 = r10, r11		C M I
	;;
}{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	ADDSUB		w0 = u0, v0		C M I
}{.mmi;	cmp.CND		p9, p0 = w3, r10	C M I
	cmp.ne		p8, p0 = 0, r23		C M I
	add		rpx = 24, rp		C M I
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	nop		0
}{.mmb;	(p8) cmpeqor	p9, p0 = LIM, w3	C M I
  (p8)	add		w3 = INCR, w3		C M I
	br		L(m67)			C B
}

C n mod 8 = 7.  Second limb pair is already in r18/r19; stores the first
C limb itself and enters the loop at L(m67).
.Lc111:
{.mmi;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	add		upadv = PFDIST, up	C M I
	ld8		v1 = [vp], 8		C M01
	mov		ar.lc = n		C I0
}{.mmi;	ld8		u1 = [up], 8		C M01
	ADDSUB		w2 = r10, r11		C M I
	nop		0
	;;
}{.mmi;	add		vpadv = PFDIST, vp	C M I
	ld8		v2 = [vp], 8		C M01
	cmp.CND		p8, p0 = w2, r10	C M I
}{.mmi;	ld8		u2 = [up], 8		C M01
	ADDSUB		w3 = r18, r19		C M I
	nop		0
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	cmp.CND		p9, p0 = w3, r18	C M I
  (p13)	cmpeqor		p8, p0 = LIM, w2	C M I
}{.mmi;	ld8		u3 = [up], 8		C M01
  (p13)	add		w2 = INCR, w2		C M I
	nop		0
	;;
}{.mmi;	add		rpx = 32, rp		C M I
	st8		[rp] = w2, 8		C M23
  (p8)	cmpeqor		p9, p0 = LIM, w3	C M I
}{.mmb;	(p8) add	w3 = INCR, w3		C M I
	ADDSUB		w0 = u0, v0		C M I
	br		L(m67)
}
EPILOGUE()

C func -- add/subtract n limbs at up and vp into rp, with no carry-in.
C
C Inputs:   rp, up, vp, n as defined above.
C Output:   r8 = carry (borrow) out of the top limb, 0 or 1.
C Other:    ar.lc is saved in r2 here and restored at L(end).
C
C Same dispatch on (n mod 8) as func_nc, using feed-in sequences
C .Lb000 .. .Lb111 that omit the carry-in adjustment of the first limb.
C This function also contains the unrolled main loop and the L(cjN)
C wind-down code that func_nc branches into.
PROLOGUE(func)
	.prologue
	.save	ar.lc, r2
	.body
ifdef(`HAVE_ABI_32',`
	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	nop.i		0
	addp4		vp = 0, vp		C M I
	nop.m		0
	zxt4		n = n			C I
	;;
')

{.mmi;	ld8		r11 = [vp], 8		C M01
	ld8		r10 = [up], 8		C M01
	mov		r2 = ar.lc		C I0
}{.mmi;	and		r14 = 7, n		C M I
	cmp.lt		p15, p14 = 8, n		C M I
	add		n = -6, n		C M I
	;;
}{.mmi;	cmp.eq		p6, p0 = 1, r14		C M I
	cmp.eq		p7, p0 = 2, r14		C M I
	cmp.eq		p8, p0 = 3, r14		C M I
}{.bbb;	(p6) br.dptk	.Lb001			C B
  (p7)	br.dptk		.Lb010			C B
  (p8)	br.dptk		.Lb011			C B
	;;
}{.mmi;	cmp.eq		p9, p0 = 4, r14		C M I
	cmp.eq		p10, p0 = 5, r14	C M I
	cmp.eq		p11, p0 = 6, r14	C M I
}{.bbb;	(p9) br.dptk	.Lb100			C B
  (p10)	br.dptk		.Lb101			C B
  (p11)	br.dptk		.Lb110			C B
	;;
}{.mmi;	ld8		r19 = [vp], 8		C M01
	ld8		r18 = [up], 8		C M01
	cmp.ne		p13, p0 = r0, r0	C clear "CF" M I
}{.mmb;	cmp.eq		p12, p0 = 7, r14	C M I
	mov		r23 = 0			C M I
  (p12)	br.dptk		.Lb111			C B
	;;
}

C n mod 8 = 0.  Enters the loop at L(m0).
.Lb000:
{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
	ADDSUB		w1 = r10, r11		C M I
	;;
}{.mmi;	ld8		v1 = [vp], 8		C M01
	cmp.CND		p7, p0 = w1, r10	C M I
	mov		ar.lc = n		C I0
}{.mmi;	ld8		u1 = [up], 8		C M01
	ADDSUB		w2 = r18, r19		C M I
	add		rpx = 8, rp		C M I
	;;
}{.mmi;	add		upadv = PFDIST, up
	add		vpadv = PFDIST, vp
	cmp.CND		p8, p0 = w2, r18	C M I
}{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	ADDSUB		w3 = u3, v3		C M I
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	cmp.CND		p9, p0 = w3, u3		C M I
  (p7)	cmpeqor		p8, p0 = LIM, w2	C M I
}{.mmb;	ld8		u3 = [up], 8		C M01
  (p7)	add		w2 = INCR, w2		C M I
	br		L(m0)			C B
}

C n mod 8 = 1.  With p14 this is the single-limb case and finishes at
C L(cj1); otherwise it enters the loop at L(m1).
	ALIGN(32)
.Lb001:
{.mmi;	ADDSUB		w0 = r10, r11		C M I
  (p15)	ld8		v1 = [vp], 8		C M01
	mov		r8 = 0			C M I
	;;
}{.mmb;	cmp.CND		p6, p0 = w0, r10	C M I
  (p15)	ld8		u1 = [up], 8		C M01
  (p14)	br		L(cj1)			C B
	;;
}{.mmi;	add		upadv = PFDIST, up
	add		vpadv = PFDIST, vp
	shr.u		n = n, 3		C I0
}{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	cmp.CND		p6, p0 = w0, r10	C M I
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	mov		ar.lc = n		C I0
	;;
}{.mmi;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
	ADDSUB		w1 = u1, v1		C M I
	;;
}{.mmi;	ld8		v1 = [vp], 8		C M01
	cmp.CND		p7, p0 = w1, u1		C M I
	ADDSUB		w2 = u2, v2		C M I
}{.mmb;	ld8		u1 = [up], 8		C M01
	add		rpx = 16, rp		C M I
	br		L(m1)			C B
}

C n mod 8 = 2.  Without p15 this finishes at L(cj2); with p15 it continues
C at L(gt2) and enters the loop at L(m23).
	ALIGN(32)
.Lb010:
{.mmi;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
	shr.u		n = n, 3		C I0
}{.mmb;	ADDSUB		w3 = r10, r11		C M I
	nop		0
  (p15)	br		L(gt2)			C B
	;;
}{.mmi;	cmp.CND		p9, p0 = w3, r10	C M I
	ADDSUB		w0 = u0, v0		C M I
	mov		r8 = 0			C M I
	;;
}{.mmb;	nop		0
	cmp.CND		p6, p0 = w0, u0		C M I
	br		L(cj2)			C B
}
L(gt2):
{.mmi;	ld8		v1 = [vp], 8		C M01
	ld8		u1 = [up], 8		C M01
	nop		0
	;;
}{.mmi;	add		upadv = PFDIST, up
	add		vpadv = PFDIST, vp
	mov		ar.lc = n		C I0
}{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	nop		0
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	cmp.CND		p9, p0 = w3, r10	C M I
	ADDSUB		w0 = u0, v0		C M I
}{.mmb;	ld8		u3 = [up], 8		C M01
	add		rpx = 24, rp		C M I
	br		L(m23)			C B
}

C n mod 8 = 3.  Without p15 this finishes at L(cj3); with p15 it continues
C at L(3), stores the first limb itself and enters the loop at L(m23).
	ALIGN(32)
.Lb011:
{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	ADDSUB		w2 = r10, r11		C M I
	;;
}{.mmb;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
  (p15)	br		L(3)			C B
}{.mmb;	cmp.CND		p8, p0 = w2, r10	C M I
	ADDSUB		w3 = u3, v3		C M I
	br		L(cj3)			C B
}
L(3):
{.mmi;	ld8		v1 = [vp], 8		C M01
	ld8		u1 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	add		upadv = PFDIST, up
	add		vpadv = PFDIST, vp
	ADDSUB		w3 = u3, v3		C M I
}{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	cmp.CND		p8, p0 = w2, r10	C M I
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	cmp.CND		p9, p0 = w3, u3		C M I
	mov		ar.lc = n		C I0
}{.mmi;	ld8		u3 = [up], 8		C M01
	nop		0
	nop		0
	;;
}{.mmi;	add		rpx = 32, rp		C M I
	st8		[rp] = w2, 8		C M23
  (p8)	cmpeqor		p9, p0 = LIM, w3	C M I
}{.mmb;	(p8) add	w3 = INCR, w3		C M I
	ADDSUB		w0 = u0, v0		C M I
	br		L(m23)			C B
}

C n mod 8 = 4.  With p14 this finishes at L(cj4); otherwise it falls into
C L(gt4) and enters the loop at L(m4).
	ALIGN(32)
.Lb100:
{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	ADDSUB		w1 = r10, r11		C M I
	;;
}{.mmi;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
	cmp.CND		p7, p0 = w1, r10	C M I
}{.mmb;	nop		0
	ADDSUB		w2 = u2, v2		C M I
  (p14)	br		L(cj4)			C B
	;;
}
L(gt4):
{.mmi;	add		upadv = PFDIST, up
	add		vpadv = PFDIST, vp
	mov		ar.lc = n		C I0
}{.mmi;	ld8		v1 = [vp], 8		C M01
	ld8		u1 = [up], 8		C M01
	nop		0
	;;
}{.mmi;	ld8		v2 = [vp], 8		C M01
	cmp.CND		p8, p0 = w2, u2		C M I
	nop		0
}{.mmi;	ld8		u2 = [up], 8		C M01
	ADDSUB		w3 = u3, v3		C M I
	add		rpx = 8, rp		C M I
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	cmp.CND		p9, p0 = w3, u3		C M I
  (p7)	cmpeqor		p8, p0 = LIM, w2	C M I
}{.mmb;	ld8		u3 = [up], 8		C M01
  (p7)	add		w2 = INCR, w2		C M I
	br		L(m4)			C B
}

C n mod 8 = 5.  With p14 this finishes at L(cj5), before ar.lc has been
C written; otherwise it falls into L(gt5) and enters the loop at L(m5).
	ALIGN(32)
.Lb101:
{.mmi;	ld8		v1 = [vp], 8		C M01
	ld8		u1 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	ADDSUB		w0 = r10, r11		C M I
	;;
}{.mmi;	add		upadv = PFDIST, up
	add		vpadv = PFDIST, vp
	add		rpx = 16, rp		C M I
}{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	nop		0
	;;
}{.mmi;	ld8		v0 = [vp], 8		C M01
	cmp.CND		p6, p0 = w0, r10	C M I
	nop		0
}{.mmb;	ld8		u0 = [up], 8		C M01
	ADDSUB		w1 = u1, v1		C M I
  (p14)	br		L(cj5)			C B
	;;
}
L(gt5):
{.mmi;	ld8		v1 = [vp], 8		C M01
	cmp.CND		p7, p0 = w1, u1		C M I
	mov		ar.lc = n		C I0
}{.mmb;	ld8		u1 = [up], 8		C M01
	ADDSUB		w2 = u2, v2		C M I
	br		L(m5)			C B
}

C n mod 8 = 6.  Enters the loop at L(m67).
	ALIGN(32)
.Lb110:
{.mmi;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	ld8		v1 = [vp], 8		C M01
	ld8		u1 = [up], 8		C M01
	ADDSUB		w3 = r10, r11		C M I
	;;
}{.mmi;	add		upadv = PFDIST, up
	add		vpadv = PFDIST, vp
	mov		ar.lc = n		C I0
}{.mmi;	ld8		v2 = [vp], 8		C M01
	ld8		u2 = [up], 8		C M01
	nop		0
	;;
}{.mmi;	ld8		v3 = [vp], 8		C M01
	cmp.CND		p9, p0 = w3, r10	C M I
	ADDSUB		w0 = u0, v0		C M I
}{.mmb;	ld8		u3 = [up], 8		C M01
	add		rpx = 24, rp		C M I
	br		L(m67)			C B
}

C n mod 8 = 7.  Second limb pair is already in r18/r19; stores the first
C limb itself and enters the loop at L(m67).
	ALIGN(32)
.Lb111:
{.mmi;	ld8		v0 = [vp], 8		C M01
	ld8		u0 = [up], 8		C M01
	shr.u		n = n, 3		C I0
	;;
}{.mmi;	ld8		v1 = [vp], 8		C M01
	ld8		u1 = [up], 8		C M01
	ADDSUB		w2 = r10, r11		C M I
	;;
}{.mmi;	ld8		v2 = [vp], 8		C M01
	cmp.CND		p8, p0 = w2, r10	C M I
	mov		ar.lc = n		C I0
}{.mmi;	ld8		u2 = [up], 8		C M01
	ADDSUB		w3 = r18, r19		C M I
	nop		0
	;;
}{.mmi;	add		upadv = PFDIST, up
	add		vpadv = PFDIST, vp
	nop		0
}{.mmi;	ld8		v3 = [vp], 8		C M01
	ld8		u3 = [up], 8		C M01
	cmp.CND		p9, p0 = w3, r18	C M I
	;;
}{.mmi;	add		rpx = 32, rp		C M I
	st8		[rp] = w2, 8		C M23
  (p8)	cmpeqor		p9, p0 = LIM, w3	C M I
}{.mmb;	(p8) add	w3 = INCR, w3		C M I
	ADDSUB		w0 = u0, v0		C M I
	br		L(m67)			C B
}

C Shared unrolled loop.  One pass from L(top) to br.cloop stores eight
C result limbs through rp and rpx and issues one lfetch on upadv and one on
C vpadv, each advancing its pointer by 64 bytes.  The carry is passed from
C limb to limb in predicates p6 -> p7 -> p8 -> p9 -> p6 ..., matching the
C w0 .. w3 rotation.  The labels inside the loop are the entry points used
C by the feed-in sequences above.

C *** MAIN LOOP START ***
	ALIGN(32)
L(top):
L(c5):	ld8		v1 = [vp], 8		C M01
	cmp.CND		p7, p0 = w1, u1		C M I
  (p9)	cmpeqor		p6, p0 = LIM, w0	C M I
	ld8		u1 = [up], 8		C M01
  (p9)	add		w0 = INCR, w0		C M I
	ADDSUB		w2 = u2, v2		C M I
	;;
L(m5):	ld8		v2 = [vp], 8		C M01
	cmp.CND		p8, p0 = w2, u2		C M I
  (p6)	cmpeqor		p7, p0 = LIM, w1	C M I
	ld8		u2 = [up], 8		C M01
  (p6)	add		w1 = INCR, w1		C M I
	ADDSUB		w3 = u3, v3		C M I
	;;
	st8		[rp] = w0, 8		C M23
	ld8		v3 = [vp], 8		C M01
	cmp.CND		p9, p0 = w3, u3		C M I
  (p7)	cmpeqor		p8, p0 = LIM, w2	C M I
	ld8		u3 = [up], 8		C M01
  (p7)	add		w2 = INCR, w2		C M I
	;;
L(m4):	st8		[rp] = w1, 16		C M23
	st8		[rpx] = w2, 32		C M23
  (p8)	cmpeqor		p9, p0 = LIM, w3	C M I
	lfetch		[upadv], 64
  (p8)	add		w3 = INCR, w3		C M I
	ADDSUB		w0 = u0, v0		C M I
	;;
L(m23):	st8		[rp] = w3, 8		C M23
	ld8		v0 = [vp], 8		C M01
	cmp.CND		p6, p0 = w0, u0		C M I
	ld8		u0 = [up], 8		C M01
	ADDSUB		w1 = u1, v1		C M I
	nop.b		0
	;;
L(c1):	ld8		v1 = [vp], 8		C M01
	cmp.CND		p7, p0 = w1, u1		C M I
  (p9)	cmpeqor		p6, p0 = LIM, w0	C M I
	ld8		u1 = [up], 8		C M01
  (p9)	add		w0 = INCR, w0		C M I
	ADDSUB		w2 = u2, v2		C M I
	;;
L(m1):	ld8		v2 = [vp], 8		C M01
	cmp.CND		p8, p0 = w2, u2		C M I
  (p6)	cmpeqor		p7, p0 = LIM, w1	C M I
	ld8		u2 = [up], 8		C M01
  (p6)	add		w1 = INCR, w1		C M I
	ADDSUB		w3 = u3, v3		C M I
	;;
	st8		[rp] = w0, 8		C M23
	ld8		v3 = [vp], 8		C M01
	cmp.CND		p9, p0 = w3, u3		C M I
  (p7)	cmpeqor		p8, p0 = LIM, w2	C M I
	ld8		u3 = [up], 8		C M01
  (p7)	add		w2 = INCR, w2		C M I
	;;
L(m0):	st8		[rp] = w1, 16		C M23
	st8		[rpx] = w2, 32		C M23
  (p8)	cmpeqor		p9, p0 = LIM, w3	C M I
	lfetch		[vpadv], 64
  (p8)	add		w3 = INCR, w3		C M I
	ADDSUB		w0 = u0, v0		C M I
	;;
L(m67):	st8		[rp] = w3, 8		C M23
	ld8		v0 = [vp], 8		C M01
	cmp.CND		p6, p0 = w0, u0		C M I
	ld8		u0 = [up], 8		C M01
	ADDSUB		w1 = u1, v1		C M I
	br.cloop.dptk	L(top)			C B
	;;
C *** MAIN LOOP END ***

C Wind-down.  L(end) applies the pending carry to w0 and restores ar.lc;
C from L(cjN) onwards exactly N result limbs remain to be stored.  The
C feed-in sequences for small n jump directly to L(cj1) .. L(cj5); those
C that enter at L(cj1) or L(cj2) have cleared r8 themselves, while paths
C passing through L(cj3) get r8 cleared there.  r8 is set to 1 at L(cj1)
C when the top limb carried out (p6).
L(end):
{.mmi;	(p9) cmpeqor	p6, p0 = LIM, w0	C M I
  (p9)	add		w0 = INCR, w0		C M I
	mov		ar.lc = r2		C I0
}
L(cj5):
{.mmi;	cmp.CND		p7, p0 = w1, u1		C M I
	ADDSUB		w2 = u2, v2		C M I
	nop		0
	;;
}{.mmi;	st8		[rp] = w0, 8		C M23
  (p6)	cmpeqor		p7, p0 = LIM, w1	C M I
  (p6)	add		w1 = INCR, w1		C M I
}
L(cj4):
{.mmi;	cmp.CND		p8, p0 = w2, u2		C M I
	ADDSUB		w3 = u3, v3		C M I
	nop		0
	;;
}{.mmi;	st8		[rp] = w1, 8		C M23
  (p7)	cmpeqor		p8, p0 = LIM, w2	C M I
  (p7)	add		w2 = INCR, w2		C M I
}
L(cj3):
{.mmi;	cmp.CND		p9, p0 = w3, u3		C M I
	ADDSUB		w0 = u0, v0		C M I
	nop		0
	;;
}{.mmi;	st8		[rp] = w2, 8		C M23
  (p8)	cmpeqor		p9, p0 = LIM, w3	C M I
  (p8)	add		w3 = INCR, w3		C M I
}{.mmi;	cmp.CND		p6, p0 = w0, u0		C M I
	nop		0
	mov		r8 = 0			C M I
	;;
}
L(cj2):
{.mmi;	st8		[rp] = w3, 8		C M23
  (p9)	cmpeqor		p6, p0 = LIM, w0	C M I
  (p9)	add		w0 = INCR, w0		C M I
	;;
}
L(cj1):
{.mmb;	st8		[rp] = w0, 8		C M23
  (p6)	mov		r8 = 1			C M I
	br.ret.sptk.many b0			C B
}
EPILOGUE()
ASM_END()