github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev6/aorsmul_1.asm (about) 1 dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1. 2 3 dnl Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C cycles/limb 34 C EV4: 42 35 C EV5: 18 36 C EV6: 3.5 37 38 C INPUT PARAMETERS 39 define(`rp', `r16') 40 define(`up', `r17') 41 define(`n', `r18') 42 define(`v0', `r19') 43 44 dnl This code was written in cooperation with ev6 pipeline expert Steve Root. 45 46 dnl The stores can issue a cycle late so we have paired no-op's to 'catch' 47 dnl them, so that further disturbance to the schedule is damped. 48 49 dnl We couldn't pair the loads, because the entangled schedule of the carry's 50 dnl has to happen on one side {0} of the machine. 51 52 dnl This is a great schedule for the d_cache, a poor schedule for the b_cache. 
53 dnl The lockup on U0 means that any stall can't be recovered from. Consider a 54 dnl ldq in L1, say that load gets stalled because it collides with a fill from 55 dnl the b_cache. On the next cycle, this load gets priority. It first looks 56 dnl at L0, and goes there. The instruction we intended for L0 gets to look at 57 dnl L1, which is NOT where we want it. It either stalls 1, because it can't 58 dnl go in L0, or goes there, and causes a further instruction to stall. 59 60 dnl So for b_cache, we're likely going to want to put one or more cycles back 61 dnl into the code! And, of course, put in lds prefetch for the rp[] operand. 62 dnl At a place where we have an mt followed by a bookkeeping, put the 63 dnl bookkeeping in upper, and the prefetch into lower. 64 65 dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd 66 dnl like not to have an ldq or an stq to precede a conditional branch in a 67 dnl quadpack. The conditional branch moves the retire pointer one cycle 68 dnl later. 
C Operation select.  Which flavour is generated depends on whether
C OPERATION_addmul_1 or OPERATION_submul_1 is defined:
C   ADDSUB      addq for addmul, subq for submul.
C   CMPCY(a,b)  unsigned compare producing 1 when "ADDSUB a, x, b" wrapped:
C               addmul tests b < a (carry), submul tests a < b (borrow).
C   func        name of the entry point being generated.

ifdef(`OPERATION_addmul_1',`
  define(`ADDSUB',	`addq')
  define(`CMPCY',	`cmpult	$2,$1')
  define(`func',	`mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
  define(`ADDSUB',	`subq')
  define(`CMPCY',	`cmpult	$1,$2')
  define(`func',	`mpn_submul_1')
')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C func (rp, up, n, v0) with rp=r16, up=r17, n=r18, v0=r19.
C
C Adds (addmul) or subtracts (submul) the product up[0..n-1] * v0 to/from
C rp[0..n-1] in place.  The limb carried (borrowed) out of the top is in r0
C when either ret executes.
C
C Layout of the code:
C   * Entry dispatch on n mod 8.  n itself (r18) is biased to n-9 up front so
C     that simple sign tests decide whether the main loop is needed.
C   * Odd residues (1, 3, 5, 7) first process a single limb, leave its carry
C     in the carry register the matching even entry expects, advance up/rp by
C     one limb and fall into that even entry (0, 2, 4, 6).
C   * Each even entry primes the software pipeline (loads and multiplies for
C     the next limbs), biases up/rp for the offsets used at its loop entry
C     point, and joins the loop at $ent0, $ent2, $ent6 or by falling into
C     $Loop.
C   * $Loop handles 8 limbs per iteration in four 2-limb sections.
C   * $Lend / L(x) drain the pipeline; $n23 is the short path taken from
C     $2mod8 when the loop is not needed.
C
C The U0/U1/L0/L1 tags in the comments name the issue slot each instruction
C is scheduled for; "mt" and "st slosh" mark the bis r31,r31,r31 no-ops
C described in the scheduling notes at the top of the file.

ASM_START()
PROLOGUE(func)
	ldq	r3,	0(up)		C r3 = up[0]
	and	r18,	7,	r20	C r20 = n mod 8
	lda	r18,	-9(r18)		C r18 = n - 9 (loop-count bias)
	cmpeq	r20,	1,	r21	C n mod 8 == 1 ?
	beq	r21,	$L1		C no: go dispatch on the other residues

C n mod 8 == 1: do one limb, then either return or continue via $ent1.
$1mod8:	ldq	r5,	0(rp)		C r5 = rp[0]
	mulq	v0,	r3,	r7	C r7 = low  half of v0 * up[0]
	umulh	v0,	r3,	r8	C r8 = high half of v0 * up[0]
	ADDSUB	r5,	r7,	r23	C r23 = rp[0] +/- low product
	CMPCY(	r5,	r23),	r20	C r20 = carry/borrow out of that op
	addq	r8,	r20,	r0	C r0 = high product + carry
	stq	r23,	0(rp)		C store result limb
	bge	r18,	$ent1		C n - 9 >= 0: more limbs follow
	ret	r31,	(r26),	1	C otherwise done, carry limb is in r0

C Residues other than 1.  Both carry registers start at zero; residues are
C tested in turn and anything not matched (r20 == 0) goes to $0mod8.
$L1:	lda	r8,	0(r31)		C zero carry reg
	lda	r24,	0(r31)		C zero carry reg
	cmpeq	r20,	2,	r21	C
	bne	r21,	$2mod8		C
	cmpeq	r20,	3,	r21	C
	bne	r21,	$3mod8		C
	cmpeq	r20,	4,	r21	C
	bne	r21,	$4mod8		C
	cmpeq	r20,	5,	r21	C
	bne	r21,	$5mod8		C
	cmpeq	r20,	6,	r21	C
	bne	r21,	$6mod8		C
	cmpeq	r20,	7,	r21	C
	beq	r21,	$0mod8		C not 7 either, so residue is 0

C n mod 8 == 7: one limb, carry left in r24, then fall into $6mod8.
$7mod8:	ldq	r5,	0(rp)		C r5 = rp[0]
	lda	up,	8(up)		C up += 1 limb
	mulq	v0,	r3,	r7	C low product
	umulh	v0,	r3,	r24	C high product
	ADDSUB	r5,	r7,	r23	C rp[0] +/- low product
	CMPCY(	r5,	r23),	r20	C carry/borrow
	addq	r24,	r20,	r24	C r24 = carry into the next limb
	stq	r23,	0(rp)		C store result limb
	lda	rp,	8(rp)		C rp += 1 limb
	ldq	r3,	0(up)		C r3 = next source limb
C n mod 8 == 6: prime the pipeline and join the loop at $ent6.
C up is advanced by 48 and rp by -32 to match the offsets used from $ent6.
$6mod8:	ldq	r1,	8(up)		C
	mulq	v0,	r3,	r25	C
	umulh	v0,	r3,	r3	C
	mulq	v0,	r1,	r28	C
	ldq	r0,	16(up)		C
	ldq	r4,	0(rp)		C
	umulh	v0,	r1,	r8	C
	ldq	r1,	24(up)		C
	lda	up,	48(up)		C L1 bookkeeping
	mulq	v0,	r0,	r2	C
	ldq	r5,	8(rp)		C
	lda	rp,	-32(rp)		C L1 bookkeeping
	umulh	v0,	r0,	r6	C
	ADDSUB	r4,	r25,	r25	C lo + acc
	mulq	v0,	r1,	r7	C
	br	r31,	$ent6		C

C Continuation of the n mod 8 == 1 case when n >= 9: step past the limb
C already done, move its carry from r0 into r8, reload r3, fall into $0mod8.
$ent1:	lda	up,	8(up)		C up += 1 limb
	lda	rp,	8(rp)		C rp += 1 limb
	lda	r8,	0(r0)		C r8 = carry from the first limb
	ldq	r3,	0(up)		C r3 = next source limb
C n mod 8 == 0: prime the pipeline and join the loop at $ent0.
C Only rp is biased here (by -16); up is advanced inside the $ent0 section.
$0mod8:	ldq	r1,	8(up)		C
	mulq	v0,	r3,	r2	C
	umulh	v0,	r3,	r6	C
	mulq	v0,	r1,	r7	C
	ldq	r0,	16(up)		C
	ldq	r4,	0(rp)		C
	umulh	v0,	r1,	r24	C
	ldq	r1,	24(up)		C
	mulq	v0,	r0,	r25	C
	ldq	r5,	8(rp)		C
	umulh	v0,	r0,	r3	C
	ADDSUB	r4,	r2,	r2	C lo + acc
	mulq	v0,	r1,	r28	C
	lda	rp,	-16(rp)		C
	br	r31,	$ent0		C

C n mod 8 == 3: one limb, carry left in r24, then fall into $2mod8.
$3mod8:	ldq	r5,	0(rp)		C
	lda	up,	8(up)		C
	mulq	v0,	r3,	r7	C
	umulh	v0,	r3,	r8	C
	ADDSUB	r5,	r7,	r23	C
	CMPCY(	r5,	r23),	r20	C
	addq	r8,	r20,	r24	C r24 = carry into the next limb
	stq	r23,	0(rp)		C
	lda	rp,	8(rp)		C
	ldq	r3,	0(up)		C
C n mod 8 == 2: if n - 9 <= 0 only two limbs remain, so finish in $n23;
C otherwise prime the pipeline and join the loop at $ent2.
$2mod8:	ldq	r1,	8(up)		C
	mulq	v0,	r3,	r25	C
	umulh	v0,	r3,	r3	C
	mulq	v0,	r1,	r28	C
	ble	r18,	$n23		C short operand: skip the loop entirely
	ldq	r0,	16(up)		C
	ldq	r4,	0(rp)		C
	umulh	v0,	r1,	r8	C
	ldq	r1,	24(up)		C
	lda	up,	16(up)		C L1 bookkeeping
	mulq	v0,	r0,	r2	C
	ldq	r5,	8(rp)		C
	lda	rp,	0(rp)		C L1 bookkeeping
	umulh	v0,	r0,	r6	C
	ADDSUB	r4,	r25,	r25	C lo + acc
	mulq	v0,	r1,	r7	C
	br	r31,	$ent2		C

C n mod 8 == 5: one limb, carry left in r8, then fall into $4mod8.
$5mod8:	ldq	r5,	0(rp)		C
	lda	up,	8(up)		C
	mulq	v0,	r3,	r7	C
	umulh	v0,	r3,	r24	C
	ADDSUB	r5,	r7,	r23	C
	CMPCY(	r5,	r23),	r20	C
	addq	r24,	r20,	r8	C r8 = carry into the next limb
	stq	r23,	0(rp)		C
	lda	rp,	8(rp)		C
	ldq	r3,	0(up)		C
C n mod 8 == 4: prime the pipeline, then fall straight into the top of
C $Loop, or go to $Lend when n - 9 <= 0 and no full iteration is needed.
$4mod8:	ldq	r1,	8(up)		C
	mulq	v0,	r3,	r2	C
	umulh	v0,	r3,	r6	C
	mulq	v0,	r1,	r7	C
	ldq	r0,	16(up)		C
	ldq	r4,	0(rp)		C
	umulh	v0,	r1,	r24	C
	ldq	r1,	24(up)		C
	lda	up,	32(up)		C L1 bookkeeping
	mulq	v0,	r0,	r25	C
	ldq	r5,	8(rp)		C
	lda	rp,	16(rp)		C L1 bookkeeping
	umulh	v0,	r0,	r3	C
	ADDSUB	r4,	r2,	r2	C lo + acc
	mulq	v0,	r1,	r28	C
	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
	ble	r18,	$Lend		C
	ALIGN(16)
C Main loop: 8 limbs per iteration, in four sections of two limbs each.
C Each group of four instructions below is one quadpack.  The sections
C alternate between two register sets, and $ent2, $ent0 and $ent6 are the
C mid-loop entry points used by the even-residue code above.
C This first section stores to rp-16 and rp-8.
$Loop:
	bis	r31,	r31,	r31	C U1 mt
	CMPCY(	r2,	r22),	r21	C L0 hi add => carry
	addq	r6,	r20,	r6	C U0 hi mul + carry
	ldq	r0,	0(up)		C

	bis	r31,	r31,	r31	C U1 mt
	ADDSUB	r5,	r7,	r7	C L0 lo + acc
	addq	r6,	r21,	r6	C U0 hi mul + carry
	ldq	r4,	0(rp)		C L1

	umulh	v0,	r1,	r8	C U1
	CMPCY(	r5,	r7),	r20	C L0 lo add => carry
	ADDSUB	r7,	r6,	r23	C U0 hi add => answer
	ldq	r1,	8(up)		C L1

	mulq	v0,	r0,	r2	C U1
	CMPCY(	r7,	r23),	r21	C L0 hi add => carry
	addq	r24,	r20,	r24	C U0 hi mul + carry
	ldq	r5,	8(rp)		C L1

	umulh	v0,	r0,	r6	C U1
	ADDSUB	r4,	r25,	r25	C U0 lo + acc
	stq	r22,	-16(rp)		C L0
	stq	r23,	-8(rp)		C L1

	bis	r31,	r31,	r31	C L0 st slosh
	mulq	v0,	r1,	r7	C U1
	bis	r31,	r31,	r31	C L1 st slosh
	addq	r24,	r21,	r24	C U0 hi mul + carry
C Second section (entry for n mod 8 == 2 or 3).  The loop counter r18 is
C stepped by -8 here.  Stores go to rp+0 and rp+8.
$ent2:
	CMPCY(	r4,	r25),	r20	C L0 lo add => carry
	bis	r31,	r31,	r31	C U1 mt
	lda	r18,	-8(r18)		C L1 bookkeeping
	ADDSUB	r25,	r24,	r22	C U0 hi add => answer

	bis	r31,	r31,	r31	C U1 mt
	CMPCY(	r25,	r22),	r21	C L0 hi add => carry
	addq	r3,	r20,	r3	C U0 hi mul + carry
	ldq	r0,	16(up)		C L1

	bis	r31,	r31,	r31	C U1 mt
	ADDSUB	r5,	r28,	r28	C L0 lo + acc
	addq	r3,	r21,	r3	C U0 hi mul + carry
	ldq	r4,	16(rp)		C L1

	umulh	v0,	r1,	r24	C U1
	CMPCY(	r5,	r28),	r20	C L0 lo add => carry
	ADDSUB	r28,	r3,	r23	C U0 hi add => answer
	ldq	r1,	24(up)		C L1

	mulq	v0,	r0,	r25	C U1
	CMPCY(	r28,	r23),	r21	C L0 hi add => carry
	addq	r8,	r20,	r8	C U0 hi mul + carry
	ldq	r5,	24(rp)		C L1

	umulh	v0,	r0,	r3	C U1
	ADDSUB	r4,	r2,	r2	C U0 lo + acc
	stq	r22,	0(rp)		C L0
	stq	r23,	8(rp)		C L1

	bis	r31,	r31,	r31	C L0 st slosh
	mulq	v0,	r1,	r28	C U1
	bis	r31,	r31,	r31	C L1 st slosh
	addq	r8,	r21,	r8	C U0 hi mul + carry
C Third section (entry for n mod 8 == 0 or 1).  up is advanced by 64 bytes
C here, so the later up-relative loads use negative offsets.  Stores go to
C rp+16 and rp+24.
$ent0:
	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
	bis	r31,	r31,	r31	C U1 mt
	lda	up,	64(up)		C L1 bookkeeping
	ADDSUB	r2,	r8,	r22	C U0 hi add => answer

	bis	r31,	r31,	r31	C U1 mt
	CMPCY(	r2,	r22),	r21	C L0 hi add => carry
	addq	r6,	r20,	r6	C U0 hi mul + carry
	ldq	r0,	-32(up)		C L1

	bis	r31,	r31,	r31	C U1 mt
	ADDSUB	r5,	r7,	r7	C L0 lo + acc
	addq	r6,	r21,	r6	C U0 hi mul + carry
	ldq	r4,	32(rp)		C L1

	umulh	v0,	r1,	r8	C U1
	CMPCY(	r5,	r7),	r20	C L0 lo add => carry
	ADDSUB	r7,	r6,	r23	C U0 hi add => answer
	ldq	r1,	-24(up)		C L1

	mulq	v0,	r0,	r2	C U1
	CMPCY(	r7,	r23),	r21	C L0 hi add => carry
	addq	r24,	r20,	r24	C U0 hi mul + carry
	ldq	r5,	40(rp)		C L1

	umulh	v0,	r0,	r6	C U1
	ADDSUB	r4,	r25,	r25	C U0 lo + acc
	stq	r22,	16(rp)		C L0
	stq	r23,	24(rp)		C L1

	bis	r31,	r31,	r31	C L0 st slosh
	mulq	v0,	r1,	r7	C U1
	bis	r31,	r31,	r31	C L1 st slosh
	addq	r24,	r21,	r24	C U0 hi mul + carry
C Fourth section (entry for n mod 8 == 6 or 7).  rp is advanced by 64 bytes
C here, so the later rp-relative accesses use negative offsets.  Stores go
C to rp-32 and rp-24.
$ent6:
	CMPCY(	r4,	r25),	r20	C L0 lo add => carry
	bis	r31,	r31,	r31	C U1 mt
	lda	rp,	64(rp)		C L1 bookkeeping
	ADDSUB	r25,	r24,	r22	C U0 hi add => answer

	bis	r31,	r31,	r31	C U1 mt
	CMPCY(	r25,	r22),	r21	C L0 hi add => carry
	addq	r3,	r20,	r3	C U0 hi mul + carry
	ldq	r0,	-16(up)		C L1

	bis	r31,	r31,	r31	C U1 mt
	ADDSUB	r5,	r28,	r28	C L0 lo + acc
	addq	r3,	r21,	r3	C U0 hi mul + carry
	ldq	r4,	-16(rp)		C L1

	umulh	v0,	r1,	r24	C U1
	CMPCY(	r5,	r28),	r20	C L0 lo add => carry
	ADDSUB	r28,	r3,	r23	C U0 hi add => answer
	ldq	r1,	-8(up)		C L1

	mulq	v0,	r0,	r25	C U1
	CMPCY(	r28,	r23),	r21	C L0 hi add => carry
	addq	r8,	r20,	r8	C U0 hi mul + carry
	ldq	r5,	-8(rp)		C L1

	umulh	v0,	r0,	r3	C U1
	ADDSUB	r4,	r2,	r2	C U0 lo + acc
	stq	r22,	-32(rp)		C L0
	stq	r23,	-24(rp)		C L1

	bis	r31,	r31,	r31	C L0 st slosh
	mulq	v0,	r1,	r28	C U1
	bis	r31,	r31,	r31	C L1 st slosh
	addq	r8,	r21,	r8	C U0 hi mul + carry

C Loop close: same first two operations as the tail of $4mod8, a load into
C the zero register r31 used as a prefetch, and the test of the counter
C that was decremented in the $ent2 section.
	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
	ldl	r31,	256(up)		C prefetch up[]
	bgt	r18,	$Loop		C U1 bookkeeping

C Wind-down: finish the two limbs in flight (stored at rp-16 and rp-8),
C load the last two rp limbs, then share the final two-limb tail at L(x).
C No further up[] loads are issued from here on.
$Lend:	CMPCY(	r2,	r22),	r21	C
	addq	r6,	r20,	r6	C
	ADDSUB	r5,	r7,	r7	C
	addq	r6,	r21,	r6	C
	ldq	r4,	0(rp)		C
	umulh	v0,	r1,	r8	C
	CMPCY(	r5,	r7),	r20	C
	ADDSUB	r7,	r6,	r23	C
	CMPCY(r7,	r23),	r21	C
	addq	r24,	r20,	r24	C
	ldq	r5,	8(rp)		C
	ADDSUB	r4,	r25,	r25	C
	stq	r22,	-16(rp)		C
	stq	r23,	-8(rp)		C
	addq	r24,	r21,	r24	C
	br	L(x)

	ALIGN(16)
C Short path from $2mod8 (n - 9 <= 0): load both rp limbs, compute the one
C high product still missing, and fall into the shared tail.
$n23:	ldq	r4,	0(rp)		C
	ldq	r5,	8(rp)		C
	umulh	v0,	r1,	r8	C
	ADDSUB	r4,	r25,	r25	C
C Shared tail: last two limbs are stored at rp+0 and rp+8, and the final
C carry (high product + carries) is placed in r0 before returning.
L(x):	CMPCY(	r4,	r25),	r20	C
	ADDSUB	r25,	r24,	r22	C
	CMPCY(	r25,	r22),	r21	C
	addq	r3,	r20,	r3	C
	ADDSUB	r5,	r28,	r28	C
	addq	r3,	r21,	r3	C
	CMPCY(	r5,	r28),	r20	C
	ADDSUB	r28,	r3,	r23	C
	CMPCY(	r28,	r23),	r21	C
	addq	r8,	r20,	r8	C
	stq	r22,	0(rp)		C
	stq	r23,	8(rp)		C
	addq	r8,	r21,	r0	C r0 = carry limb
	ret	r31,	(r26),	1	C
EPILOGUE()
ASM_END()