dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/aorsorrlshC_n.asm

dnl  IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

C           cycles/limb
C Itanium:      ?
C Itanium 2:    1.5

C TODO
C  * Use shladd in feed-in code (for mpn_addlshC_n).
C  * Rewrite loop to schedule loads closer to use, since we do prefetch.
C INPUT PARAMETERS
C   rp (r32)  destination limb pointer; result limbs are stored through it
C   up (r33)  first source limb pointer
C   vp (r34)  second source limb pointer; its limbs are shifted left by LSH
C   n  (r35)  limb count (presumably n >= 1, since the first limb of each
C             source is loaded unconditionally; confirm against callers)
define(`rp', `r32')
define(`up', `r33')
define(`vp', `r34')
define(`n',  `r35')

C Per-variant building blocks.  ADDSUB(w, u, x) produces one result limb and
C CMP(p, w, u, x) sets predicate p when that limb operation carried or
C borrowed, by comparing the result with one of its inputs:
C   DO_add:  w = u + x,  carry  when w < u (unsigned)
C   DO_sub:  w = u - x,  borrow when w > u (unsigned)
C   DO_rsb:  w = x - u,  borrow when w > x (unsigned)
C INCR is the adjustment applied to the following limb on carry or borrow and
C LIM is the limb value for which that adjustment itself wraps around.
C LSH is not defined in this file; it has to be supplied by the includer.
ifdef(`DO_add', `
  define(`ADDSUB',	`add	$1 = $2, $3')
  define(`CMP',		`cmp.ltu $1,p0 = $2, $3')
  define(`INCR',	1)
  define(`LIM',		-1)
  define(`func',	mpn_addlsh`'LSH`'_n)')
ifdef(`DO_sub', `
  define(`ADDSUB',	`sub	$1 = $2, $3')
  define(`CMP',		`cmp.gtu $1,p0 = $2, $3')
  define(`INCR',	-1)
  define(`LIM',		0)
  define(`func',	mpn_sublsh`'LSH`'_n)')
ifdef(`DO_rsb', `
  define(`ADDSUB',	`sub	$1 = $3, $2')
  define(`CMP',		`cmp.gtu $1,p0 = $2, $4')
  define(`INCR',	-1)
  define(`LIM',		0)
  define(`func',	mpn_rsblsh`'LSH`'_n)')

C Byte offset added to up and vp to form the lfetch addresses used in the loop.
define(PFDIST, 500)

C Register roles: u = limbs loaded from up, v = limbs loaded from vp,
C x = shifted v limbs, w = result limbs.  s0-s3 are defined but not referenced
C in this file.
define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29')
define(`x0',`r30') define(`x1',`r31') define(`x2',`r3')  define(`x3',`r9')

C r3 r8 r9 r10 r11

C func(rp, up, vp, n)
C
C For each limb position the v operand is shifted left by LSH bits across limb
C boundaries (shl for the lowest limb, shrp with the previous v limb for the
C rest) and combined with the matching u limb using ADDSUB.  Results are
C stored to rp.  The value left in r8 is the bits shifted out of the top
C v limb (shr.u by 64-LSH), incremented on a final carry or borrow for the add
C and sub variants and decremented on a final borrow for the rsb variant.
C
C ar.lc is saved in r2 on entry and restored at .Lcj4.  The wind-down entry
C points after .Lcj4 are only reached from paths that never wrote ar.lc.
ASM_START()
PROLOGUE(func)
	.prologue
	.save	ar.lc, r2
	.body
ifdef(`HAVE_ABI_32',`
	addp4	rp = 0, rp		C			M I
	addp4	up = 0, up		C			M I
	nop.i	0
	addp4	vp = 0, vp		C			M I
	nop.m	0
	zxt4	n = n			C			I
	;;
')
C Load the first limb of each source, then dispatch on n mod 4.  p15 is set
C when n > 4 and selects the feed-in paths that lead into the main loop.
C n is biased by -5 here and shifted right by 2 later to form the loop count.
{.mmi;	ld8	r11 = [vp], 8		C			M01
	ld8	r10 = [up], 8		C			M01
	mov.i	r2 = ar.lc		C			I0
}{.mmi;	and	r14 = 3, n		C			M I
	cmp.lt	p15, p0 = 4, n		C			M I
	add	n = -5, n		C			M I
	;;
}{.mmi;	cmp.eq	p6, p0 = 1, r14		C			M I
	cmp.eq	p7, p0 = 2, r14		C			M I
	cmp.eq	p8, p0 = 3, r14		C			M I
}{.bbb
  (p6)	br.dptk	.Lb01			C			B
  (p7)	br.dptk	.Lb10			C			B
  (p8)	br.dptk	.Lb11			C			B
}

C Feed-in for n mod 4 = 0
.Lb00:
{.mmi;	ld8	v0 = [vp], 8		C			M01
	ld8	u0 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
	;;
}{.mmi;	ld8	v1 = [vp], 8		C			M01
	ld8	u1 = [up], 8		C			M01
	shl	x3 = r11, LSH		C			I0
	;;
}{.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shrp	x0 = v0, r11, 64-LSH	C			I0
}{.mmb;	ADDSUB(	w3, r10, x3)		C			M I
	nop	0
  (p15)	br.dpnt	.grt4			C			B
	;;
}{.mii;	CMP(	p7, w3, r10, x3)	C			M II0
	shrp	x1 = v1, v0, 64-LSH	C			I0
	ADDSUB(	w0, u0, x0)		C			M I
	;;
}{.mii;	CMP(	p8, w0, u0, x0)		C			M I
	shrp	x2 = v2, v1, 64-LSH	C			I0
	ADDSUB(	w1, u1, x1)		C			M I
}{.mmb;	nop	0
	nop	0
	br	.Lcj4			C			B
}
	ALIGN(32)
.grt4:
{.mii;	ld8	v3 = [vp], 8		C			M01
	shrp	x0 = v0, r11, 64-LSH	C			I0
	CMP(	p8, w3, r10, x3)	C			M I
	;;
}{.mmi;	ld8	u3 = [up], 8		C			M01
	add	r11 = PFDIST, vp
	shrp	x1 = v1, v0, 64-LSH	C			I0
}{.mmi;	ld8	v0 = [vp], 8		C			M01
	ADDSUB(	w0, u0, x0)		C			M I
	nop	0
	;;
}{.mmi;	CMP(	p6, w0, u0, x0)		C			M I
	add	r10 = PFDIST, up
	mov.i	ar.lc = n		C			I0
}{.mmb;	ADDSUB(	w1, u1, x1)		C			M I
	ld8	u0 = [up], 8		C			M01
	br	.LL00			C			B
}

C Feed-in for n mod 4 = 1.  The add variant forms the lowest limb with a
C single shladd, so x2 is only computed for the sub and rsb variants.
	ALIGN(32)
.Lb01:
ifdef(`DO_add',
`	shladd	w2 = r11, LSH, r10	C			M I
	shr.u	r8 = r11, 64-LSH	C retval		I0
  (p15)	br.dpnt	.grt1			C			B
	;;
',`
	shl	x2 = r11, LSH		C			I0
  (p15)	br.dpnt	.grt1			C			B
	;;
	ADDSUB(	w2, r10, x2)		C			M I
	shr.u	r8 = r11, 64-LSH	C retval		I0
	;;
')
	CMP(	p6, w2, r10, x2)	C			M I
	br		.Lcj1

.grt1:	ld8	v3 = [vp], 8		C			M01
	ld8	u3 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
	;;
	ld8	v0 = [vp], 8		C			M01
	ld8	u0 = [up], 8		C			M01
	mov.i	ar.lc = n		C FIXME swap with next	I0
ifdef(`DO_add',
`',`
	ADDSUB(	w2, r10, x2)
')
	;;
{.mmi;	ld8	v1 = [vp], 8		C			M01
	ld8	u1 = [up], 8		C			M01
	shrp	x3 = v3, r11, 64-LSH	C			I0
	;;
}{.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mmb;	CMP(	p6, w2, r10, x2)	C			M I
	ADDSUB(	w3, u3, x3)		C			M I
	br.cloop.dptk	.grt5		C			B
	;;
}{.mmi;	CMP(	p7, w3, u3, x3)		C			M I
	ADDSUB(	w0, u0, x0)		C			M I
	shrp	x1 = v1, v0, 64-LSH	C			I0
}{.mmb;	nop	0
	nop	0
	br	.Lcj5			C			B
}
.grt5:
{.mmi;	add	r10 = PFDIST, up
	add	r11 = PFDIST, vp
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mmb;	ld8	v3 = [vp], 8		C			M01
	CMP(	p8, w3, u3, x3)		C			M I
	br	.LL01			C			B
}

C Feed-in for n mod 4 = 2
	ALIGN(32)
.Lb10:
{.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shl	x1 = r11, LSH		C			I0
}{.mmb;	nop	0
	nop	0
  (p15)	br.dpnt	.grt2			C			B
	;;
}{.mmi;	ADDSUB(	w1, r10, x1)		C			M I
	nop	0
	shrp	x2 = v2, r11, 64-LSH	C			I0
	;;
}{.mmi;	CMP(	p9, w1, r10, x1)	C			M I
	ADDSUB(	w2, u2, x2)		C			M I
	shr.u	r8 = v2, 64-LSH		C retval		I0
	;;
}{.mmb;	CMP(	p6, w2, u2, x2)		C			M I
	nop	0
	br	.Lcj2			C			B
}
.grt2:
{.mmi;	ld8	v3 = [vp], 8		C			M01
	ld8	u3 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
	;;
}{.mmi;	ld8	v0 = [vp], 8		C			M01
	ld8	u0 = [up], 8		C			M01
	mov.i	ar.lc = n		C			I0
}{.mmi;	ADDSUB(	w1, r10, x1)		C			M I
	nop	0
	nop	0
	;;
}{.mii;	ld8	v1 = [vp], 8		C			M01
	shrp	x2 = v2, r11, 64-LSH	C			I0
	CMP(	p8, w1, r10, x1)	C			M I
	;;
}{.mmi;	add	r10 = PFDIST, up
	ld8	u1 = [up], 8		C			M01
	shrp	x3 = v3, v2, 64-LSH	C			I0
}{.mmi;	add	r11 = PFDIST, vp
	ld8	v2 = [vp], 8		C			M01
	ADDSUB(	w2, u2, x2)		C			M I
	;;
}{.mmi;	CMP(	p6, w2, u2, x2)		C			M I
	ld8	u2 = [up], 8		C			M01
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mib;	ADDSUB(	w3, u3, x3)		C			M I
	nop	0
	br.cloop.dpnt	L(top)		C			B
}
	br	L(end)			C			B

C Feed-in for n mod 4 = 3
.Lb11:
{.mmi;	ld8	v1 = [vp], 8		C			M01
	ld8	u1 = [up], 8		C			M01
	shl	x0 = r11, LSH		C			I0
	;;
}{.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
}{.mmb;	nop	0
	nop	0
  (p15)	br.dpnt	.grt3			C			B
	;;
}{.mii;	nop	0
	shrp	x1 = v1, r11, 64-LSH	C			I0
	ADDSUB(	w0, r10, x0)		C			M I
	;;
}{.mii;	CMP(	p8, w0, r10, x0)	C			M I
	shrp	x2 = v2, v1, 64-LSH	C			I0
	ADDSUB(	w1, u1, x1)		C			M I
	;;
}{.mmb;	CMP(	p9, w1, u1, x1)		C			M I
	ADDSUB(	w2, u2, x2)		C			M I
	br	.Lcj3			C			B
}
.grt3:
{.mmi;	ld8	v3 = [vp], 8		C			M01
	ld8	u3 = [up], 8		C			M01
	shrp	x1 = v1, r11, 64-LSH	C			I0
}{.mmi;	ADDSUB(	w0, r10, x0)		C			M I
	nop	0
	nop	0
	;;
}{.mmi;	ld8	v0 = [vp], 8		C			M01
	CMP(	p6, w0, r10, x0)	C			M I
	mov.i	ar.lc = n		C			I0
}{.mmi;	ld8	u0 = [up], 8		C			M01
	ADDSUB(	w1, u1, x1)		C			M I
	nop	0
	;;
}{.mmi;	add	r10 = PFDIST, up
	add	r11 = PFDIST, vp
	shrp	x2 = v2, v1, 64-LSH	C			I0
}{.mmb;	ld8	v1 = [vp], 8		C			M01
	CMP(	p8, w1, u1, x1)		C			M I
	br	.LL11			C			B
}

C Each iteration stores four result limbs.  Carry or borrow propagation
C alternates between p6 and p8: when the previous limb set the predicate, the
C current limb is adjusted by INCR, and cmpeqor (a macro defined outside this
C file, presumably an or-form compare) also flags the current limb when it
C equals LIM before the adjustment.  r10 and r11 are the lfetch pointers.
C *** MAIN LOOP START ***
	ALIGN(32)
L(top):	st8	[rp] = w1, 8		C			M23
	lfetch	[r10], 32
  (p8)	cmpeqor	p6, p0 = LIM, w2	C			M I
  (p8)	add	w2 = INCR, w2		C			M I
	ld8	v3 = [vp], 8		C			M01
	CMP(	p8, w3, u3, x3)		C			M I
	;;
.LL01:	ld8	u3 = [up], 8		C			M01
	shrp	x1 = v1, v0, 64-LSH	C			I0
  (p6)	cmpeqor	p8, p0 = LIM, w3	C			M I
  (p6)	add	w3 = INCR, w3		C			M I
	ld8	v0 = [vp], 8		C			M01
	ADDSUB(	w0, u0, x0)		C			M I
	;;
	st8	[rp] = w2, 8		C			M23
	CMP(	p6, w0, u0, x0)		C			M I
	nop.b	0
	ld8	u0 = [up], 8		C			M01
	lfetch	[r11], 32
	ADDSUB(	w1, u1, x1)		C			M I
	;;
.LL00:	st8	[rp] = w3, 8		C			M23
	shrp	x2 = v2, v1, 64-LSH	C			I0
  (p8)	cmpeqor	p6, p0 = LIM, w0	C			M I
  (p8)	add	w0 = INCR, w0		C			M I
	ld8	v1 = [vp], 8		C			M01
	CMP(	p8, w1, u1, x1)		C			M I
	;;
.LL11:	ld8	u1 = [up], 8		C			M01
	shrp	x3 = v3, v2, 64-LSH	C			I0
  (p6)	cmpeqor	p8, p0 = LIM, w1	C			M I
  (p6)	add	w1 = INCR, w1		C			M I
	ld8	v2 = [vp], 8		C			M01
	ADDSUB(	w2, u2, x2)		C			M I
	;;
{.mmi;	st8	[rp] = w0, 8		C			M23
	CMP(	p6, w2, u2, x2)		C			M I
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mib;
	ld8	u2 = [up], 8		C			M01
	ADDSUB(	w3, u3, x3)		C			M I
	br.cloop.dptk	L(top)		C			B
	;;
}
C *** MAIN LOOP END ***

C Wind-down.  The .LcjN labels are entry points for the short feed-in paths
C that skip the loop; each stage stores one limb and resolves the next carry.
L(end):
{.mmi;	st8	[rp] = w1, 8		C			M23
  (p8)	cmpeqor	p6, p0 = LIM, w2	C			M I
	shrp	x1 = v1, v0, 64-LSH	C			I0
}{.mmi;
  (p8)	add	w2 = INCR, w2		C			M I
	CMP(	p7, w3, u3, x3)		C			M I
	ADDSUB(	w0, u0, x0)		C			M I
	;;
}
.Lcj5:
{.mmi;	st8	[rp] = w2, 8		C			M23
  (p6)	cmpeqor	p7, p0 = LIM, w3	C			M I
	shrp	x2 = v2, v1, 64-LSH	C			I0
}{.mmi;
  (p6)	add	w3 = INCR, w3		C			M I
	CMP(	p8, w0, u0, x0)		C			M I
	ADDSUB(	w1, u1, x1)		C			M I
	;;
}
.Lcj4:
{.mmi;	st8	[rp] = w3, 8		C			M23
  (p7)	cmpeqor	p8, p0 = LIM, w0	C			M I
	mov.i	ar.lc = r2		C			I0
}{.mmi;
  (p7)	add	w0 = INCR, w0		C			M I
	CMP(	p9, w1, u1, x1)		C			M I
	ADDSUB(	w2, u2, x2)		C			M I
	;;
}
.Lcj3:
{.mmi;	st8	[rp] = w0, 8		C			M23
  (p8)	cmpeqor	p9, p0 = LIM, w1	C			M I
	shr.u	r8 = v2, 64-LSH		C			I0
}{.mmi;
  (p8)	add	w1 = INCR, w1		C			M I
	CMP(	p6, w2, u2, x2)		C			M I
	nop	0
	;;
}
.Lcj2:
{.mmi;	st8	[rp] = w1, 8		C			M23
  (p9)	cmpeqor	p6, p0 = LIM, w2	C			M I
  (p9)	add	w2 = INCR, w2		C			M I
	;;
}
C Store the top limb and fold the final carry or borrow (p6) into r8.
.Lcj1:
{.mmb;	st8	[rp] = w2		C			M23
ifdef(`DO_rsb',`
  (p6)	add	r8 = -1, r8		C			M I
',`
  (p6)	add	r8 = 1, r8		C			M I
')	br.ret.sptk.many b0		C			B
}
EPILOGUE()
ASM_END()