dnl  Listing source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/lshiftc.asm

dnl  IA-64 mpn_lshiftc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000-2005, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:      ?
C Itanium 2:    1.25

C This code is scheduled deeply since the plain shift instructions shr and shl
C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
C these instructions causes a 10 cycle replay trap on Itanium.

C The ld8 scheduling should probably be decreased to make the function smaller.
C Good lfetch will make sure we never stall anyway.

C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
C in the prologue.
C INPUT PARAMETERS
C   rp   destination limb pointer (written with st8)
C   up   source limb pointer (read with ld8)
C   n    number of 64-bit limbs
C   cnt  shift count in bits
define(`rp', `r32')
define(`up', `r33')
define(`n',  `r34')
define(`cnt',`r35')

C tnc holds 64 - cnt, the complementary shift count.
define(`tnc',`r9')

C FSH is applied to a limb with cnt, BSH to the next lower limb with tnc.
C UPD is the post-increment applied to both data pointers after every access,
C so the operand is walked from its highest address downwards.
C POFF is the initial offset of the lfetch pointer relative to the top limb,
C and PUPD its post-increment per main loop iteration.
define(`FSH',`shl')
define(`BSH',`shr.u')
define(`UPD',`-8')
define(`POFF',`-512')
define(`PUPD',`-32')
define(`func',`mpn_lshiftc')

C mpn_lshiftc
C
C Shifts the n-limb operand at up left by cnt bits and stores the bitwise
C complement of every result limb at rp:
C
C   rp[i] = ~((up[i] << cnt) | (up[i-1] >> (64-cnt)))	for i = n-1 ... 1
C   rp[0] = ~(up[0] << cnt)
C
C r8 receives up[n-1] >> (64-cnt), uncomplemented.  It is written exactly once
C on every path and never modified afterwards (r8 is the integer return
C register in the Itanium software conventions).
C
C ar.lc is saved in r2 on entry and restored before returning.
C Other registers written: r9-r11, r14-r27, r31, p6-p8.
C
C NOTE(review): n-1 and 64-cnt are computed without any range check, so the
C code presumably relies on n >= 1 and 0 < cnt < 64 -- confirm against the
C documented mpn interface.
C
C Structure: the prologue dispatches on n mod 4 to one of four feed-in paths
C (.Lb00, .Lb01, .Lb10, .Lb11).  Each path either handles a short operand with
C straight-line code that joins the drain sequence (.Lr8 ... .Lr1), or primes
C the pipeline and enters the 4-limbs-per-iteration main loop at the matching
C entry point (.LL00, .LL10, .LL11 or L(end)).

ASM_START()
PROLOGUE(mpn_lshiftc)
	.prologue
	.save	ar.lc, r2
	.body
C For the 32-bit ABI, widen the pointer arguments with addp4, sign-extend n
C and zero-extend cnt before they are used.
ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C			M I
	addp4		up = 0, up		C			M I
	sxt4		n = n			C			M I
	nop.m		0
	nop.m		0
	zxt4		cnt = cnt		C			I
	;;
')

C r14 = n mod 4 selects the feed-in path; r15 = n-1 is used both to point rp
C and up at their highest limb and to derive the loop count (n-1)/4 for ar.lc.
C r11 is the lfetch pointer; r10 is the most significant source limb.
{.mmi;	nop		0			C			M I
	and		r14 = 3, n		C			M I
	mov.i		r2 = ar.lc		C			I0
}{.mmi;	add		r15 = -1, n		C			M I
	sub		tnc = 64, cnt		C			M I
	nop		0
	;;
}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
	cmp.eq		p7, p0 = 2, r14		C			M I
	shr.u		n = r15, 2		C			I0
}{.mmi;	cmp.eq		p8, p0 = 3, r14		C			M I
	shladd		up = r15, 3, up		C			M I
	shladd		rp = r15, 3, rp		C			M I
	;;
}{.mmi;	add		r11 = POFF, up		C			M I
	ld8		r10 = [up], UPD		C			M01
	mov.i		ar.lc = n		C			I0
}{.bbb;
   (p6)	br.dptk		.Lb01
   (p7)	br.dptk		.Lb10
   (p8)	br.dptk		.Lb11
	;; }

C n mod 4 == 0.  Falls through the br.cloop when ar.lc is zero (n = 4) and
C finishes via .Lr4.
.Lb00:
	ld8		r19 = [up], UPD
	;;
	ld8		r16 = [up], UPD
	;;
	ld8		r17 = [up], UPD
	BSH		r8 = r10, tnc
	br.cloop.dptk	L(gt4)
	;;
	FSH		r24 = r10, cnt
	BSH		r25 = r19, tnc
	;;
	FSH		r26 = r19, cnt
	BSH		r27 = r16, tnc
	;;
	FSH		r20 = r16, cnt
	BSH		r21 = r17, tnc
	;;
	or		r14 = r25, r24
	FSH		r22 = r17, cnt
	;;
	or		r15 = r27, r26
	sub		r31 = -1, r14
	br		.Lr4

C More than 4 limbs on the n mod 4 == 0 path: keep loading while shifting.
C If ar.lc is now zero (n = 8) finish via .Lr8, otherwise continue at L(gt8).
L(gt4):
{.mmi;	nop		0
	nop		0
	FSH		r24 = r10, cnt
}{.mmi;	ld8		r18 = [up], UPD
	nop		0
	BSH		r25 = r19, tnc
	;; }
{.mmi;	nop		0
	nop		0
	FSH		r26 = r19, cnt
}{.mmi;	ld8		r19 = [up], UPD
	nop		0
	BSH		r27 = r16, tnc
	;; }
{.mmi;	nop		0
	nop		0
	FSH		r20 = r16, cnt
}{.mmi;	ld8		r16 = [up], UPD
	nop		0
	BSH		r21 = r17, tnc
	;; }
{.mmi;	nop		0
	or		r14 = r25, r24
	FSH		r22 = r17, cnt
}{.mib;	ld8		r17 = [up], UPD
	BSH		r23 = r18, tnc
	br.cloop.dptk	L(gt8)
	;; }
{.mmi;	nop		0
	or		r15 = r27, r26
	FSH		r24 = r18, cnt
}{.mib;	sub		r31 = -1, r14
	BSH		r25 = r19, tnc
	br		.Lr8 }

C Pipeline is primed; join the main loop at its second stage.
L(gt8):
	or		r15 = r27, r26
	FSH		r24 = r18, cnt
	ld8		r18 = [up], UPD
	sub		r31 = -1, r14
	BSH		r25 = r19, tnc
	br		.LL00

C n mod 4 == 1.  With ar.lc zero (n = 1) only the single limb in r10 is
C shifted, complemented and stored via .Lr1.
.Lb01:
	br.cloop.dptk	L(gt1)
	;;
	BSH		r8 = r10, tnc
	FSH		r22 = r10, cnt
	;;
	sub		r31 = -1, r22
	br		.Lr1
	;;
L(gt1):
	ld8		r18 = [up], UPD
	BSH		r8 = r10, tnc
	FSH		r22 = r10, cnt
	;;
	ld8		r19 = [up], UPD
	;;
	ld8		r16 = [up], UPD
	;;
	ld8		r17 = [up], UPD
	BSH		r23 = r18, tnc
	br.cloop.dptk	L(gt5)
	;;
	nop		0
	FSH		r24 = r18, cnt
	BSH		r25 = r19, tnc
	;;
	nop		0
	FSH		r26 = r19, cnt
	BSH		r27 = r16, tnc
	;;
	or		r15 = r23, r22
	FSH		r20 = r16, cnt
	BSH		r21 = r17, tnc
	;;
	or		r14 = r25, r24
	FSH		r22 = r17, cnt
	sub		r31 = -1, r15
	br		.Lr5

C More than 5 limbs: prime the pipeline and join the loop at its closing
C lfetch/br.cloop pair.
L(gt5):
{.mmi;	nop		0
	nop		0
	FSH		r24 = r18, cnt
}{.mmi;	ld8		r18 = [up], UPD
	nop		0
	BSH		r25 = r19, tnc
	;; }
{.mmi;	nop		0
	nop		0
	FSH		r26 = r19, cnt
}{.mmi;	ld8		r19 = [up], UPD
	nop		0
	BSH		r27 = r16, tnc
	;; }
{.mmi;	nop		0
	or		r15 = r23, r22
	FSH		r20 = r16, cnt
}{.mmi;	ld8		r16 = [up], UPD
	nop		0
	BSH		r21 = r17, tnc
	;; }
{.mmi;	or		r14 = r25, r24
	sub		r31 = -1, r15
	FSH		r22 = r17, cnt
}{.mib;	ld8		r17 = [up], UPD
	BSH		r23 = r18, tnc
	br		L(end)
	;; }

C n mod 4 == 2.  With ar.lc zero (n = 2) the two limbs are finished via .Lr2.
.Lb10:
	ld8		r17 = [up], UPD
	br.cloop.dptk	L(gt2)
	;;
	BSH		r8 = r10, tnc
	FSH		r20 = r10, cnt
	;;
	BSH		r21 = r17, tnc
	FSH		r22 = r17, cnt
	;;
	or		r14 = r21, r20
	;;
	sub		r31 = -1, r14
	br		.Lr2
	;;
L(gt2):
	ld8		r18 = [up], UPD
	BSH		r8 = r10, tnc
	FSH		r20 = r10, cnt
	;;
	ld8		r19 = [up], UPD
	;;
	ld8		r16 = [up], UPD
	BSH		r21 = r17, tnc
	FSH		r22 = r17, cnt
	;;
	ld8		r17 = [up], UPD
	BSH		r23 = r18, tnc
	br.cloop.dptk	L(gt6)
	;;
	nop		0
	FSH		r24 = r18, cnt
	BSH		r25 = r19, tnc
	;;
	or		r14 = r21, r20
	FSH		r26 = r19, cnt
	BSH		r27 = r16, tnc
	;;
{.mmi;	nop		0
	or		r15 = r23, r22
	FSH		r20 = r16, cnt
}{.mib;	sub		r31 = -1, r14
	BSH		r21 = r17, tnc
	br		.Lr6
	;; }
C More than 6 limbs: prime the pipeline and join the loop at .LL10.
L(gt6):
{.mmi;	nop		0
	nop		0
	FSH		r24 = r18, cnt
}{.mmi;	ld8		r18 = [up], UPD
	nop		0
	BSH		r25 = r19, tnc
	;; }
{.mmi;	nop		0
	or		r14 = r21, r20
	FSH		r26 = r19, cnt
}{.mmi;	ld8		r19 = [up], UPD
	nop		0
	BSH		r27 = r16, tnc
	;; }
{.mmi;	or		r15 = r23, r22
	sub		r31 = -1, r14
	FSH		r20 = r16, cnt
}{.mib;	ld8		r16 = [up], UPD
	BSH		r21 = r17, tnc
	br		.LL10
}

C n mod 4 == 3.  With ar.lc zero (n = 3) the three limbs are finished via
C .Lr3.
.Lb11:
	ld8		r16 = [up], UPD
	;;
	ld8		r17 = [up], UPD
	BSH		r8 = r10, tnc
	FSH		r26 = r10, cnt
	br.cloop.dptk	L(gt3)
	;;
	BSH		r27 = r16, tnc
	;;
	FSH		r20 = r16, cnt
	BSH		r21 = r17, tnc
	;;
	FSH		r22 = r17, cnt
	;;
	or		r15 = r27, r26
	;;
	or		r14 = r21, r20
	sub		r31 = -1, r15
	br		.Lr3
	;;
L(gt3):
	ld8		r18 = [up], UPD
	;;
	ld8		r19 = [up], UPD
	BSH		r27 = r16, tnc
	;;
{.mmi;	nop		0
	nop		0
	FSH		r20 = r16, cnt
}{.mmi;	ld8		r16 = [up], UPD
	nop		0
	BSH		r21 = r17, tnc
	;;
}{.mmi;	nop		0
	nop		0
	FSH		r22 = r17, cnt
}{.mib;	ld8		r17 = [up], UPD
	BSH		r23 = r18, tnc
	br.cloop.dptk	L(gt7)
	;; }
	or		r15 = r27, r26
	FSH		r24 = r18, cnt
	BSH		r25 = r19, tnc
	;;
{.mmi;	nop		0
	or		r14 = r21, r20
	FSH		r26 = r19, cnt
}{.mib;	sub		r31 = -1, r15
	BSH		r27 = r16, tnc
	br		.Lr7
}
C More than 7 limbs: prime the pipeline and join the loop at .LL11.
L(gt7):
{.mmi;	nop		0
	or		r15 = r27, r26
	FSH		r24 = r18, cnt
}{.mmi;	ld8		r18 = [up], UPD
	nop		0
	BSH		r25 = r19, tnc
	;; }
{.mmi;	or		r14 = r21, r20
	sub		r31 = -1, r15
	FSH		r26 = r19, cnt
}{.mib;	ld8		r19 = [up], UPD
	BSH		r27 = r16, tnc
	br		.LL11
}

C Each of the four stages below stores one finished limb from r31, combines a
C previously computed FSH/BSH pair with or (alternating r14 and r15),
C complements the other or-result into r31 with sub from -1, starts the next
C FSH/BSH pair and loads one more source limb.  Loaded limbs rotate through
C r18, r19, r16, r17.  The lfetch pointer r11 steps by PUPD once per iteration,
C and br.cloop repeats the loop while ar.lc is nonzero.
C *** MAIN LOOP START ***
	ALIGN(32)
L(top):
.LL01:
{.mmi;	st8		[rp] = r31, UPD		C			M2
	or		r15 = r27, r26		C			M3
	FSH		r24 = r18, cnt		C			I0
}{.mmi;	ld8		r18 = [up], UPD		C			M0
	sub		r31 = -1, r14		C			M1
	BSH		r25 = r19, tnc		C			I1
	;; }
.LL00:
{.mmi;	st8		[rp] = r31, UPD
	or		r14 = r21, r20
	FSH		r26 = r19, cnt
}{.mmi;	ld8		r19 = [up], UPD
	sub		r31 = -1, r15
	BSH		r27 = r16, tnc
	;; }
.LL11:
{.mmi;	st8		[rp] = r31, UPD
	or		r15 = r23, r22
	FSH		r20 = r16, cnt
}{.mmi;	ld8		r16 = [up], UPD
	sub		r31 = -1, r14
	BSH		r21 = r17, tnc
	;; }
.LL10:
{.mmi;	st8		[rp] = r31, UPD
	or		r14 = r25, r24
	FSH		r22 = r17, cnt
}{.mmi;	ld8		r17 = [up], UPD
	sub		r31 = -1, r15
	BSH		r23 = r18, tnc
	;; }
L(end):	lfetch		[r11], PUPD
	br.cloop.dptk	L(top)
C *** MAIN LOOP END ***

C Drain sequence: no further loads.  Falling out of the loop stores the last
C nine limbs; entering at .LrK stores the last K limbs.  The least significant
C limb is the complement of r22 alone (see .Lr2), so its low cnt bits, which
C the shift leaves zero, are stored as ones.
{.mmi;	st8		[rp] = r31, UPD
	or		r15 = r27, r26
	FSH		r24 = r18, cnt
}{.mib;	sub		r31 = -1, r14
	BSH		r25 = r19, tnc
	nop		0
	;; }
.Lr8:
{.mmi;	st8		[rp] = r31, UPD
	or		r14 = r21, r20
	FSH		r26 = r19, cnt
}{.mib;	sub		r31 = -1, r15
	BSH		r27 = r16, tnc
	nop		0
	;; }
.Lr7:
{.mmi;	st8		[rp] = r31, UPD
	or		r15 = r23, r22
	FSH		r20 = r16, cnt
}{.mib;	sub		r31 = -1, r14
	BSH		r21 = r17, tnc
	nop		0
	;; }
.Lr6:	st8		[rp] = r31, UPD
	or		r14 = r25, r24
	FSH		r22 = r17, cnt
	sub		r31 = -1, r15
	;;
.Lr5:	st8		[rp] = r31, UPD
	or		r15 = r27, r26
	sub		r31 = -1, r14
	;;
.Lr4:	st8		[rp] = r31, UPD
	or		r14 = r21, r20
	sub		r31 = -1, r15
	;;
.Lr3:	st8		[rp] = r31, UPD
	sub		r31 = -1, r14
	;;
.Lr2:	st8		[rp] = r31, UPD
	sub		r31 = -1, r22
	;;
C Store the least significant limb, restore the loop counter saved on entry
C and return.
.Lr1:	st8		[rp] = r31, UPD		C			M23
	mov		ar.lc = r2		C			I0
	br.ret.sptk.many b0			C			B
EPILOGUE(func)
ASM_END()