dnl  IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2003-2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    2.5
C Itanium 2:  1.5

C TODO
C  * Rewrite function entry code using aorslsh1_n.asm style.
C  * Micro-optimize feed-in and wind-down code.

C OVERVIEW
C  func is defined below as mpn_rsh1add_n or mpn_rsh1sub_n, depending on which
C  OPERATION_ symbol is set.  Limbs are read from up[] and vp[] with
C  post-incrementing ld8, combined with ADDSUB, and every limb stored to rp[]
C  is a shrp by 1 of two adjacent result limbs, i.e. the multi-limb sum or
C  difference shifted right by one bit.
C
C  r8 is set to the low bit of the first limb sum or difference, which is the
C  bit shifted out at the bottom (r8 being the integer return register in the
C  IA-64 software conventions).
C  Bit 63 of the last limb stored is set by a predicated dep when the carry
C  (borrow) predicate of the last limb operation is true.
C
C  ar.lc is copied to r2 on entry and copied back just before the return.

C INPUT PARAMETERS
C  rp is the pointer used by the st8 instructions, up and vp are the pointers
C  used by the ld8 instructions, and n is the count (its low two bits select
C  the feed-in path and the loop handles four limbs per iteration).
define(`rp',`r32')
define(`up',`r33')
define(`vp',`r34')
define(`n',`r35')

C Operation-dependent macros
C  ADDSUB  the limb operation, add or sub
C  PRED    unsigned compare of a result limb against its first operand which
C          is true when the operation wrapped (ltu for add, gtu for sub)
C  INCR    adjustment applied to a result limb for an incoming carry/borrow
C  LIM     unadjusted result limb value for which applying INCR wraps, so the
C          incoming carry/borrow propagates to the next limb.  The cmp.eq.or
C          against LIM sits in the same instruction group as the add of INCR
C          and is written before it.
C  func    name of the function being generated
ifdef(`OPERATION_rsh1add_n',`
  define(ADDSUB,       add)
  define(PRED,         ltu)
  define(INCR,         1)
  define(LIM,          -1)
  define(func, mpn_rsh1add_n)
')
ifdef(`OPERATION_rsh1sub_n',`
  define(ADDSUB,       sub)
  define(PRED,         gtu)
  define(INCR,         -1)
  define(LIM,          0)
  define(func, mpn_rsh1sub_n)
')

C Some useful aliases for registers we use
C  u0-u3 and v0-v3 hold limbs loaded from up[] and vp[], w0-w3 hold the
C  ADDSUB results, x0-x3 hold the shifted limbs waiting to be stored.
define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')

MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)

ASM_START()
PROLOGUE(func)
        .prologue
        .save   ar.lc, r2
        .body
ifdef(`HAVE_ABI_32',`
        addp4           rp = 0, rp              C M I
        addp4           up = 0, up              C M I
        addp4           vp = 0, vp              C M I
        nop.m           0
        nop.m           0
        zxt4            n = n                   C I
        ;;
')
C Entry: load the first limb pair into r10/r11, save ar.lc in r2 and dispatch
C on the low two bits of n.  p15 is set when n > 4 and selects the .grt feed-in
C paths; n is reduced by 4 here and shifted right by 2 in each feed-in path.
{.mmi;  ld8             r11 = [vp], 8           C M01
        ld8             r10 = [up], 8           C M01
        mov.i           r2 = ar.lc              C I0
}{.mmi; and             r14 = 3, n              C M I
        cmp.lt          p15, p0 = 4, n          C M I
        add             n = -4, n               C M I
        ;;
}{.mmi; cmp.eq          p6, p0 = 1, r14         C M I
        cmp.eq          p7, p0 = 2, r14         C M I
        cmp.eq          p8, p0 = 3, r14         C M I
}{.bbb
 (p6)   br.dptk         .Lb01                   C B
 (p7)   br.dptk         .Lb10                   C B
 (p8)   br.dptk         .Lb11                   C B
}

C Low two bits of n are 0 (fall-through case)
.Lb00:  ld8             v0 = [vp], 8            C M01
        ld8             u0 = [up], 8            C M01
        shr.u           n = n, 2                C I0
        ;;
        ld8             v1 = [vp], 8            C M01
        ld8             u1 = [up], 8            C M01
        ADDSUB          w3 = r10, r11           C M I
        ;;
        ld8             v2 = [vp], 8            C M01
        ld8             u2 = [up], 8            C M01
 (p15)  br.dpnt         .grt4                   C B
        ;;

        cmp.PRED        p7, p0 = w3, r10        C M I
        and             r8 = 1, w3              C M I
        ADDSUB          w0 = u0, v0             C M I
        ;;
        cmp.PRED        p8, p0 = w0, u0         C M I
        ADDSUB          w1 = u1, v1             C M I
        ;;
        cmp.PRED        p9, p0 = w1, u1         C M I
 (p7)   cmp.eq.or       p8, p0 = LIM, w0        C M I
 (p7)   add             w0 = INCR, w0           C M I
        ;;
        shrp            x3 = w0, w3, 1          C I0
        ADDSUB          w2 = u2, v2             C M I
 (p8)   cmp.eq.or       p9, p0 = LIM, w1        C M I
 (p8)   add             w1 = INCR, w1           C M I
        br              .Lcj4                   C B

.grt4:  ld8             v3 = [vp], 8            C M01
        cmp.PRED        p7, p0 = w3, r10        C M I
        ld8             u3 = [up], 8            C M01
        and             r8 = 1, w3              C M I
        ;;
        ADDSUB          w0 = u0, v0             C M I
        ld8             v0 = [vp], 8            C M01
        add             n = -1, n
        ;;
        cmp.PRED        p8, p0 = w0, u0         C M I
        ld8             u0 = [up], 8            C M01
        ADDSUB          w1 = u1, v1             C M I
        ;;
        ld8             v1 = [vp], 8            C M01
        mov.i           ar.lc = n               C I0
        cmp.PRED        p9, p0 = w1, u1         C M I
        ld8             u1 = [up], 8            C M01
 (p7)   cmp.eq.or       p8, p0 = LIM, w0        C M I
 (p7)   add             w0 = INCR, w0           C M I
        ;;
        ADDSUB          w2 = u2, v2             C M I
        ld8             v2 = [vp], 8            C M01
        shrp            x3 = w0, w3, 1          C I0
 (p8)   cmp.eq.or       p9, p0 = LIM, w1        C M I
 (p8)   add             w1 = INCR, w1           C M I
        br              .LL00                   C B


C Low two bits of n are 1
.Lb01:  ADDSUB          w2 = r10, r11           C M I
        shr.u           n = n, 2                C I0
 (p15)  br.dpnt         .grt1                   C B
        ;;

        cmp.PRED        p6, p7 = w2, r10        C M I
        shr.u           x2 = w2, 1              C I0
        and             r8 = 1, w2              C M I
        ;;
 (p6)   dep             x2 = -1, x2, 63, 1      C I0
        br              .Lcj1                   C B

.grt1:  ld8             v3 = [vp], 8            C M01
        ld8             u3 = [up], 8            C M01
        ;;
        ld8             v0 = [vp], 8            C M01
        ld8             u0 = [up], 8            C M01
        mov.i           ar.lc = n               C FIXME swap with next I0
        ;;
        ld8             v1 = [vp], 8            C M01
        ld8             u1 = [up], 8            C M01
        ;;
        ld8             v2 = [vp], 8            C M01
        ld8             u2 = [up], 8            C M01
        cmp.PRED        p6, p0 = w2, r10        C M I
        and             r8 = 1, w2              C M I
        ADDSUB          w3 = u3, v3             C M I
        br.cloop.dptk   .grt5                   C B
        ;;

        cmp.PRED        p7, p0 = w3, u3         C M I
        ;;
        ADDSUB          w0 = u0, v0             C M I
 (p6)   cmp.eq.or       p7, p0 = LIM, w3        C M I
 (p6)   add             w3 = INCR, w3           C M I
        ;;
        cmp.PRED        p8, p0 = w0, u0         C M I
        shrp            x2 = w3, w2, 1          C I0
        ADDSUB          w1 = u1, v1             C M I
        ;;
        cmp.PRED        p9, p0 = w1, u1         C M I
 (p7)   cmp.eq.or       p8, p0 = LIM, w0        C M I
 (p7)   add             w0 = INCR, w0           C M I
        br              .Lcj5                   C B

.grt5:  ld8             v3 = [vp], 8            C M01
        cmp.PRED        p7, p0 = w3, u3         C M I
        ld8             u3 = [up], 8            C M01
        ;;
        ADDSUB          w0 = u0, v0             C M I
        ld8             v0 = [vp], 8            C M01
 (p6)   cmp.eq.or       p7, p0 = LIM, w3        C M I
 (p6)   add             w3 = INCR, w3           C M I
        ;;
        cmp.PRED        p8, p0 = w0, u0         C M I
        shrp            x2 = w3, w2, 1          C I0
        ld8             u0 = [up], 8            C M01
        ADDSUB          w1 = u1, v1             C M I
        ;;
        ld8             v1 = [vp], 8            C M01
        cmp.PRED        p9, p0 = w1, u1         C M I
        ld8             u1 = [up], 8            C M01
 (p7)   cmp.eq.or       p8, p0 = LIM, w0        C M I
 (p7)   add             w0 = INCR, w0           C M I
        br              .LL01                   C B


C Low two bits of n are 2
.Lb10:  ld8             v2 = [vp], 8            C M01
        ld8             u2 = [up], 8            C M01
        shr.u           n = n, 2                C I0
        ADDSUB          w1 = r10, r11           C M I
 (p15)  br.dpnt         .grt2                   C B
        ;;

        cmp.PRED        p9, p0 = w1, r10        C M I
        and             r8 = 1, w1              C M I
        ADDSUB          w2 = u2, v2             C M I
        ;;
        cmp.PRED        p6, p0 = w2, u2         C M I
        ;;
 (p9)   cmp.eq.or       p6, p0 = LIM, w2        C M I
 (p9)   add             w2 = INCR, w2           C M I
        ;;
        shrp            x1 = w2, w1, 1          C I0
        shr.u           x2 = w2, 1              C I0
        br              .Lcj2                   C B

.grt2:  ld8             v3 = [vp], 8            C M01
        ld8             u3 = [up], 8            C M01
        ;;
        ld8             v0 = [vp], 8            C M01
        ld8             u0 = [up], 8            C M01
        mov.i           ar.lc = n               C I0
        ;;
        ld8             v1 = [vp], 8            C M01
        cmp.PRED        p9, p0 = w1, r10        C M I
        ld8             u1 = [up], 8            C M01
        and             r8 = 1, w1              C M I
        ;;
        ADDSUB          w2 = u2, v2             C M I
        ld8             v2 = [vp], 8            C M01
        ;;
        cmp.PRED        p6, p0 = w2, u2         C M I
        ld8             u2 = [up], 8            C M01
        ADDSUB          w3 = u3, v3             C M I
        br.cloop.dptk   .grt6                   C B
        ;;

        cmp.PRED        p7, p0 = w3, u3         C M I
 (p9)   cmp.eq.or       p6, p0 = LIM, w2        C M I
 (p9)   add             w2 = INCR, w2           C M I
        ;;
        shrp            x1 = w2, w1, 1          C I0
        ADDSUB          w0 = u0, v0             C M I
 (p6)   cmp.eq.or       p7, p0 = LIM, w3        C M I
 (p6)   add             w3 = INCR, w3           C M I
        br              .Lcj6                   C B

.grt6:  ld8             v3 = [vp], 8            C M01
        cmp.PRED        p7, p0 = w3, u3         C M I
        ld8             u3 = [up], 8            C M01
 (p9)   cmp.eq.or       p6, p0 = LIM, w2        C M I
 (p9)   add             w2 = INCR, w2           C M I
        ;;
        shrp            x1 = w2, w1, 1          C I0
        ADDSUB          w0 = u0, v0             C M I
        ld8             v0 = [vp], 8            C M01
 (p6)   cmp.eq.or       p7, p0 = LIM, w3        C M I
 (p6)   add             w3 = INCR, w3           C M I
        br              .LL10                   C B


C Low two bits of n are 3
.Lb11:  ld8             v1 = [vp], 8            C M01
        ld8             u1 = [up], 8            C M01
        shr.u           n = n, 2                C I0
        ;;
        ld8             v2 = [vp], 8            C M01
        ld8             u2 = [up], 8            C M01
        ADDSUB          w0 = r10, r11           C M I
 (p15)  br.dpnt         .grt3                   C B
        ;;

        cmp.PRED        p8, p0 = w0, r10        C M I
        ADDSUB          w1 = u1, v1             C M I
        and             r8 = 1, w0              C M I
        ;;
        cmp.PRED        p9, p0 = w1, u1         C M I
        ;;
        ADDSUB          w2 = u2, v2             C M I
 (p8)   cmp.eq.or       p9, p0 = LIM, w1        C M I
 (p8)   add             w1 = INCR, w1           C M I
        ;;
        cmp.PRED        p6, p0 = w2, u2         C M I
        shrp            x0 = w1, w0, 1          C I0
        ;;
 (p9)   cmp.eq.or       p6, p0 = LIM, w2        C M I
 (p9)   add             w2 = INCR, w2           C M I
        br              .Lcj3                   C B

.grt3:  ld8             v3 = [vp], 8            C M01
        ld8             u3 = [up], 8            C M01
        ;;
        ld8             v0 = [vp], 8            C M01
        mov.i           ar.lc = n               C I0
        cmp.PRED        p8, p0 = w0, r10        C M I
        ld8             u0 = [up], 8            C M01
        ADDSUB          w1 = u1, v1             C M I
        and             r8 = 1, w0              C M I
        ;;
        ld8             v1 = [vp], 8            C M01
        cmp.PRED        p9, p0 = w1, u1         C M I
        ld8             u1 = [up], 8            C M01
        ;;
        ADDSUB          w2 = u2, v2             C M I
        ld8             v2 = [vp], 8            C M01
 (p8)   cmp.eq.or       p9, p0 = LIM, w1        C M I
 (p8)   add             w1 = INCR, w1           C M I
        ;;
        cmp.PRED        p6, p0 = w2, u2         C M I
        shrp            x0 = w1, w0, 1          C I0
        ld8             u2 = [up], 8            C M01
        ADDSUB          w3 = u3, v3             C M I
        br.cloop.dptk   .grt7                   C B
        ;;

        cmp.PRED        p7, p0 = w3, u3         C M I
 (p9)   cmp.eq.or       p6, p0 = LIM, w2        C M I
 (p9)   add             w2 = INCR, w2           C M I
        br              .Lcj7                   C B

.grt7:  ld8             v3 = [vp], 8            C M01
        cmp.PRED        p7, p0 = w3, u3         C M I
        ld8             u3 = [up], 8            C M01
 (p9)   cmp.eq.or       p6, p0 = LIM, w2        C M I
 (p9)   add             w2 = INCR, w2           C M I
        br              .LL11                   C B


C Each pass through the loop issues four st8, four ld8 pairs and four ADDSUB.
C The feed-in paths above enter at .LL11, .LL10, .LL01 or .LL00, and br.cloop
C repeats the loop under control of ar.lc.
C *** MAIN LOOP START ***
        ALIGN(32)
.Loop:  st8             [rp] = x3, 8            C M23
        ld8             v3 = [vp], 8            C M01
        cmp.PRED        p7, p0 = w3, u3         C M I
        ld8             u3 = [up], 8            C M01
 (p9)   cmp.eq.or       p6, p0 = LIM, w2        C M I
 (p9)   add             w2 = INCR, w2           C M I
        ;;
.LL11:  st8             [rp] = x0, 8            C M23
        shrp            x1 = w2, w1, 1          C I0
        ADDSUB          w0 = u0, v0             C M I
        ld8             v0 = [vp], 8            C M01
 (p6)   cmp.eq.or       p7, p0 = LIM, w3        C M I
 (p6)   add             w3 = INCR, w3           C M I
        ;;
.LL10:  cmp.PRED        p8, p0 = w0, u0         C M I
        shrp            x2 = w3, w2, 1          C I0
        nop.b           0
        ld8             u0 = [up], 8            C M01
        ADDSUB          w1 = u1, v1             C M I
        nop.b           0
        ;;
        st8             [rp] = x1, 8            C M23
        ld8             v1 = [vp], 8            C M01
        cmp.PRED        p9, p0 = w1, u1         C M I
        ld8             u1 = [up], 8            C M01
 (p7)   cmp.eq.or       p8, p0 = LIM, w0        C M I
 (p7)   add             w0 = INCR, w0           C M I
        ;;
.LL01:  st8             [rp] = x2, 8            C M23
        shrp            x3 = w0, w3, 1          C I0
        ADDSUB          w2 = u2, v2             C M I
        ld8             v2 = [vp], 8            C M01
 (p8)   cmp.eq.or       p9, p0 = LIM, w1        C M I
 (p8)   add             w1 = INCR, w1           C M I
        ;;
.LL00:  cmp.PRED        p6, p0 = w2, u2         C M I
        shrp            x0 = w1, w0, 1          C I0
        nop.b           0
        ld8             u2 = [up], 8            C M01
        ADDSUB          w3 = u3, v3             C M I
        br.cloop.dptk   .Loop                   C B
        ;;
C *** MAIN LOOP END ***

C Wind-down: same store/shift/carry sequence as the loop but without loads.
C The short feed-in paths enter at .Lcj7 down to .Lcj1.  The final dep sets
C bit 63 of the last limb when p6 is true, then ar.lc is restored from r2.
.Lskip: st8             [rp] = x3, 8            C M23
        cmp.PRED        p7, p0 = w3, u3         C M I
 (p9)   cmp.eq.or       p6, p0 = LIM, w2        C M I
 (p9)   add             w2 = INCR, w2           C M I
        ;;
.Lcj7:  st8             [rp] = x0, 8            C M23
        shrp            x1 = w2, w1, 1          C I0
        ADDSUB          w0 = u0, v0             C M I
 (p6)   cmp.eq.or       p7, p0 = LIM, w3        C M I
 (p6)   add             w3 = INCR, w3           C M I
        ;;
.Lcj6:  cmp.PRED        p8, p0 = w0, u0         C M I
        shrp            x2 = w3, w2, 1          C I0
        ADDSUB          w1 = u1, v1             C M I
        ;;
        st8             [rp] = x1, 8            C M23
        cmp.PRED        p9, p0 = w1, u1         C M I
 (p7)   cmp.eq.or       p8, p0 = LIM, w0        C M I
 (p7)   add             w0 = INCR, w0           C M I
        ;;
.Lcj5:  st8             [rp] = x2, 8            C M23
        shrp            x3 = w0, w3, 1          C I0
        ADDSUB          w2 = u2, v2             C M I
 (p8)   cmp.eq.or       p9, p0 = LIM, w1        C M I
 (p8)   add             w1 = INCR, w1           C M I
        ;;
.Lcj4:  cmp.PRED        p6, p0 = w2, u2         C M I
        shrp            x0 = w1, w0, 1          C I0
        ;;
        st8             [rp] = x3, 8            C M23
 (p9)   cmp.eq.or       p6, p0 = LIM, w2        C M I
 (p9)   add             w2 = INCR, w2           C M I
        ;;
.Lcj3:  st8             [rp] = x0, 8            C M23
        shrp            x1 = w2, w1, 1          C I0
        shr.u           x2 = w2, 1              C I0
        ;;
.Lcj2:  st8             [rp] = x1, 8            C M23
 (p6)   dep             x2 = -1, x2, 63, 1      C I0
        ;;
.Lcj1:  st8             [rp] = x2               C M23
        mov.i           ar.lc = r2              C I0
        br.ret.sptk.many b0                     C B
EPILOGUE()