dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/lorrshift.asm

dnl  IA-64 mpn_lshift/mpn_rshift.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000-2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:      2
C Itanium 2:    1

C This code is scheduled deeply since the plain shift instructions shr and shl
C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
C these instructions causes a 10 cycle replay trap on Itanium.

C The ld8 scheduling should probably be decreased to make the function smaller.
C Good lfetch will make sure we never stall anyway.

C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
C in the prologue.


C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n',  `r34')
define(`cnt',`r35')

define(`tnc',`r9')

C Register usage, as established by the code below:
C   rp  (r32)  destination pointer, post-updated by UPD bytes at every st8
C   up  (r33)  source pointer, post-updated by UPD bytes at every ld8
C   n   (r34)  limb count on entry; overwritten with the loop count (n-5)>>2
C   cnt (r35)  shift count
C   tnc (r9)   64 - cnt, the complementary shift count
C   r2         copy of ar.lc taken on entry, written back before returning
C   r8         return value: the first loaded limb shifted by tnc using BSH
C   r10        first limb loaded
C   r11        lfetch pointer, starts at up + POFF, moves PUPD per loop pass
C   r14, r15   merged result limbs waiting to be stored
C   r16-r19    source limbs in flight
C   r20-r27    partial results; each even register holds an FSH result and
C              the following odd register the BSH result of the next limb,
C              and each pair is merged with or before being stored
C
C Operation selection.  FSH shifts a limb by cnt, BSH shifts the neighbouring
C limb by tnc in the opposite direction.  UPD is the pointer step in bytes:
C negative for lshift, which also starts at the last limb (see the shladd
C pair below), positive for rshift.  POFF and PUPD are the initial offset and
C the per-pass step of the lfetch pointer.
C
C NOTE(review): no path below handles n = 0; presumably callers guarantee
C n >= 1 and 1 <= cnt <= 63 -- confirm against the mpn interface contract.

ifdef(`OPERATION_lshift',`
	define(`FSH',`shl')
	define(`BSH',`shr.u')
	define(`UPD',`-8')
	define(`POFF',`-512')
	define(`PUPD',`-32')
	define(`func',`mpn_lshift')
')
ifdef(`OPERATION_rshift',`
	define(`FSH',`shr.u')
	define(`BSH',`shl')
	define(`UPD',`8')
	define(`POFF',`512')
	define(`PUPD',`32')
	define(`func',`mpn_rshift')
')

MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)

ASM_START()
PROLOGUE(func)
C NOTE(review): .body follows .save immediately although ar.lc is only copied
C to r2 in the first bundle below; verify that the unwind info is acceptable.
	.prologue
	.save	ar.lc, r2
	.body
C With the 32-bit ABI the pointer arguments are converted with addp4 and the
C two integer arguments are extended to 64 bits before use.
ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	sxt4		n = n			C M I
	nop.m		0
	nop.m		0
	zxt4		cnt = cnt		C I
	;;
')

C Setup and dispatch:
C   p14 = (n > 4), p15 = (n <= 4)
C   r14 = n mod 4 selects the entry point: 1 -> .Lb01, 2 -> .Lb10,
C         3 -> .Lb11, 0 -> fall through to .Lb00
C   ar.lc = (n - 5) >> 2 (unsigned); br.cloop is only executed on the
C         paths taken when n > 4
C For lshift, up and rp are first advanced by 8*(n-1) bytes so that the
C operands are walked from the last limb towards the first.
{.mmi;	cmp.lt		p14, p15 = 4, n		C M I
	and		r14 = 3, n		C M I
	mov.i		r2 = ar.lc		C I0
}{.mmi;	add		r15 = -1, n		C M I
	sub		tnc = 64, cnt		C M I
	add		r16 = -5, n
	;;
}{.mmi;	cmp.eq		p6, p0 = 1, r14		C M I
	cmp.eq		p7, p0 = 2, r14		C M I
	shr.u		n = r16, 2		C I0
}{.mmi;	cmp.eq		p8, p0 = 3, r14		C M I
ifdef(`OPERATION_lshift',
`	shladd		up = r15, 3, up		C M I
	shladd		rp = r15, 3, rp')	C M I
	;;
}{.mmi;	add		r11 = POFF, up		C M I
	ld8		r10 = [up], UPD		C M01
	mov.i		ar.lc = n		C I0
}{.bbb;
  (p6)	br.dptk		.Lb01
  (p7)	br.dptk		.Lb10
  (p8)	br.dptk		.Lb11
	;; }

C n mod 4 = 0.  With n = 4 all four limbs are loaded here and the code joins
C the tail at .Lr4; otherwise .grt4 keeps loading and enters the loop at
C .Ltop, or goes to .Lbot when the loop count is already exhausted.
.Lb00:	ld8		r19 = [up], UPD
	;;
	ld8		r16 = [up], UPD
	;;
	ld8		r17 = [up], UPD
	BSH		r8 = r10, tnc		C function return value
	;;
	FSH		r24 = r10, cnt
	BSH		r25 = r19, tnc
  (p14)	br.cond.dptk	.grt4
	;;
	FSH		r26 = r19, cnt
	BSH		r27 = r16, tnc
	;;
	FSH		r20 = r16, cnt
	BSH		r21 = r17, tnc
	;;
C No further limb exists when n = 4, so no BSH into r23 is issued here;
C .Lr4 and the code after it read only r14, r15, r20, r21, r22, r26, r27.
	or		r14 = r25, r24
	FSH		r22 = r17, cnt
	br		.Lr4

.grt4:	ld8		r18 = [up], UPD
	FSH		r26 = r19, cnt
	BSH		r27 = r16, tnc
	;;
	ld8		r19 = [up], UPD
	FSH		r20 = r16, cnt
	BSH		r21 = r17, tnc
	;;
	ld8		r16 = [up], UPD
	FSH		r22 = r17, cnt
	BSH		r23 = r18, tnc
	;;
	or		r14 = r25, r24
	ld8		r17 = [up], UPD
	br.cloop.dpnt	.Ltop
	br		.Lbot

C n mod 4 = 1.  With n = 1 (p15 set) the single limb is shifted and stored at
C .Lr1.  Otherwise .grt1 loads ahead and joins either the tail at .Lr5 or,
C through .grt5, the loop at .LL01.
.Lb01:
  (p15)	BSH		r8 = r10, tnc		C function return value	I
  (p15)	FSH		r22 = r10, cnt		C		I
  (p15)	br.cond.dptk	.Lr1			C return	B

.grt1:	ld8		r18 = [up], UPD
	;;
	ld8		r19 = [up], UPD
	BSH		r8 = r10, tnc		C function return value
	;;
	ld8		r16 = [up], UPD
	FSH		r22 = r10, cnt
	BSH		r23 = r18, tnc
	;;
	ld8		r17 = [up], UPD
	FSH		r24 = r18, cnt
	BSH		r25 = r19, tnc
	br.cloop.dpnt	.grt5
	;;
	or		r15 = r23, r22
	FSH		r26 = r19, cnt
	BSH		r27 = r16, tnc
	;;
	FSH		r20 = r16, cnt
	BSH		r21 = r17, tnc
	br		.Lr5

.grt5:	ld8		r18 = [up], UPD
	FSH		r26 = r19, cnt
	BSH		r27 = r16, tnc
	;;
	ld8		r19 = [up], UPD
	FSH		r20 = r16, cnt
	BSH		r21 = r17, tnc
	;;
	or		r15 = r23, r22
	ld8		r16 = [up], UPD
	br		.LL01


C n mod 4 = 2.  With n = 2 the code joins the tail at .Lr2.  Otherwise .grt2
C loads ahead and joins either the tail at .Lr6 or, through .grt6, the loop
C at .LL10.
.Lb10:	ld8		r17 = [up], UPD
  (p14)	br.cond.dptk	.grt2

	BSH		r8 = r10, tnc		C function return value
	;;
	FSH		r20 = r10, cnt
	BSH		r21 = r17, tnc
	;;
	or		r14 = r21, r20
	FSH		r22 = r17, cnt
	br		.Lr2			C return

.grt2:	ld8		r18 = [up], UPD
	BSH		r8 = r10, tnc		C function return value
	;;
	ld8		r19 = [up], UPD
	FSH		r20 = r10, cnt
	BSH		r21 = r17, tnc
	;;
	ld8		r16 = [up], UPD
	FSH		r22 = r17, cnt
	BSH		r23 = r18, tnc
	;;
 {.mmi;	ld8		r17 = [up], UPD
	or		r14 = r21, r20
	FSH		r24 = r18, cnt
}{.mib;	nop		0
	BSH		r25 = r19, tnc
	br.cloop.dpnt	.grt6
	;; }

	FSH		r26 = r19, cnt
	BSH		r27 = r16, tnc
	br		.Lr6

.grt6:	ld8		r18 = [up], UPD
	FSH		r26 = r19, cnt
	BSH		r27 = r16, tnc
	;;
	ld8		r19 = [up], UPD
	br		.LL10


C n mod 4 = 3.  With n = 3 the code joins the tail at .Lr3.  Otherwise .grt3
C loads ahead and joins either the tail at .Lr7 or, through .grt7, the loop
C at .LL11.
.Lb11:	ld8		r16 = [up], UPD
	;;
	ld8		r17 = [up], UPD
	BSH		r8 = r10, tnc		C function return value
  (p14)	br.cond.dptk	.grt3
	;;

	FSH		r26 = r10, cnt
	BSH		r27 = r16, tnc
	;;
	FSH		r20 = r16, cnt
	BSH		r21 = r17, tnc
	;;
	or		r15 = r27, r26
	FSH		r22 = r17, cnt
	br		.Lr3			C return

.grt3:	ld8		r18 = [up], UPD
	FSH		r26 = r10, cnt
	BSH		r27 = r16, tnc
	;;
	ld8		r19 = [up], UPD
	FSH		r20 = r16, cnt
	BSH		r21 = r17, tnc
	;;
	ld8		r16 = [up], UPD
	FSH		r22 = r17, cnt
	BSH		r23 = r18, tnc
	;;
	ld8		r17 = [up], UPD
	br.cloop.dpnt	.grt7

	or		r15 = r27, r26
	FSH		r24 = r18, cnt
	BSH		r25 = r19, tnc
	br		.Lr7

.grt7:	or		r15 = r27, r26
	FSH		r24 = r18, cnt
	BSH		r25 = r19, tnc
	ld8		r18 = [up], UPD
	br		.LL11

C Each pass of the main loop stores four limbs.  Every stage stores one
C merged limb, merges the next FSH/BSH pair, issues one FSH and one BSH, and
C loads one new source limb.  The first stage also issues the lfetch.
C .LL11, .LL10 and .LL01 are the entry points used by the feed-in code above.
C *** MAIN LOOP START ***
	ALIGN(32)
.Ltop:
 {.mmi;	st8		[rp] = r14, UPD		C M2
	or		r15 = r27, r26		C M3
	FSH		r24 = r18, cnt		C I0
}{.mmi;	ld8		r18 = [up], UPD		C M1
	lfetch		[r11], PUPD
	BSH		r25 = r19, tnc		C I1
	;; }
.LL11:
 {.mmi;	st8		[rp] = r15, UPD
	or		r14 = r21, r20
	FSH		r26 = r19, cnt
}{.mmi;	ld8		r19 = [up], UPD
	nop.m		0
	BSH		r27 = r16, tnc
	;; }
.LL10:
 {.mmi;	st8		[rp] = r14, UPD
	or		r15 = r23, r22
	FSH		r20 = r16, cnt
}{.mmi;	ld8		r16 = [up], UPD
	nop.m		0
	BSH		r21 = r17, tnc
	;; }
.LL01:
 {.mmi;	st8		[rp] = r15, UPD
	or		r14 = r25, r24
	FSH		r22 = r17, cnt
}{.mib;	ld8		r17 = [up], UPD
	BSH		r23 = r18, tnc
	br.cloop.dptk	.Ltop
	;; }
C *** MAIN LOOP END ***

C Wind-down: same store/merge/shift sequence as the loop but without loads.
C Entering at .LrK stores exactly K more limbs.  The final limb at .Lr1 is the
C bare FSH result in r22, since it has no neighbouring limb to merge with.
.Lbot:
 {.mmi;	st8		[rp] = r14, UPD
	or		r15 = r27, r26
	FSH		r24 = r18, cnt
}{.mib;	nop		0
	BSH		r25 = r19, tnc
	nop		0
	;; }
.Lr7:
 {.mmi;	st8		[rp] = r15, UPD
	or		r14 = r21, r20
	FSH		r26 = r19, cnt
}{.mib;	nop		0
	BSH		r27 = r16, tnc
	nop		0
	;; }
.Lr6:
 {.mmi;	st8		[rp] = r14, UPD
	or		r15 = r23, r22
	FSH		r20 = r16, cnt
}{.mib;	nop		0
	BSH		r21 = r17, tnc
	nop		0
	;; }
.Lr5:	st8		[rp] = r15, UPD
	or		r14 = r25, r24
	FSH		r22 = r17, cnt
	;;
.Lr4:	st8		[rp] = r14, UPD
	or		r15 = r27, r26
	;;
.Lr3:	st8		[rp] = r15, UPD
	or		r14 = r21, r20
	;;
.Lr2:	st8		[rp] = r14, UPD
	;;
C Store the last limb, put the saved loop count back into ar.lc and return
C with the result in r8.
.Lr1:	st8		[rp] = r22, UPD		C		M23
	mov		ar.lc = r2		C		I0
	br.ret.sptk.many b0			C		B
EPILOGUE(func)
ASM_END()