dnl  Alpha ev6 nails mpn_submul_1.

dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

dnl  NOTE: dnl and C comments run to the end of the physical line, so every
dnl  statement in this file must stay on its own line.  Joining lines makes
dnl  the preceding comment swallow the code that follows it.

include(`../config.m4')

C      cycles/limb
C EV4:    42
C EV5:    18
C EV6:     4

C TODO
C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
C    umulh.
C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
C    and would work since the loop structure is really regular.

C mpn_submul_1 (rp, up, n, vl0)
C
C For each of the n limbs read from up, this routine forms the two-word
C product with vl0, subtracts it (plus the carry word from the previous limb)
C from the limb read at the same index from rp, masks the difference with
C numb_mask and stores it back at rp.  The value left in r0 on return is the
C high product word of the last limb minus the final carry word.
C
C Technique: vl0 is shifted left by NAIL_BITS once on entry.  After that,
C for every limb
C     (mulq result) >> NAIL_BITS    is the low part of the product
C     umulh result                  is the part above it
C NOTE(review): this splits the product exactly at NUMB_BITS only when
C NUMB_BITS + NAIL_BITS is 64.  Both values are defined outside this file;
C confirm against the GMP configuration.
C
C Carry handling: each accumulator is shifted right arithmetically by
C NUMB_BITS to give a signed carry word (t1), which is added into the next
C accumulator.
C
C n = 0 is not handled.  It dispatches to L(0m4), which unconditionally
C processes four limbs, so callers must pass n >= 1.
C
C Label scheme:
C   L(1m4) L(2m4) L(3m4) L(0m4)   feed-in code selected by n mod 4
C   L(geN)                        feed-in continuation taken when at least
C                                 four more limbs follow the initial group
C   L(el0) .. L(el3)              entry points into the 4-way unrolled loop
C   L(ta2) .. L(ta6)              entry points into the wind-down code

C INPUT PARAMETERS
define(`rp',`r16')
define(`up',`r17')
define(`n', `r18')
define(`vl0',`r19')

C All ones in the low (64 - NAIL_BITS) bits, built in the prologue.
define(`numb_mask',`r6')

C Product words per unroll slot: mXa receives mulq, mXb receives umulh.
C m0a is r0, which is also written with the return value just before ret.
C m2b (r21) doubles as the scratch flag for the n mod 4 dispatch.
define(`m0a',`r0')
define(`m0b',`r1')
define(`m1a',`r2')
define(`m1b',`r3')
define(`m2a',`r20')
define(`m2b',`r21')
define(`m3a',`r22')
define(`m3b',`r23')

C Two accumulators used alternately by consecutive limbs.
C acc0 (r25) also holds n mod 4 during the dispatch.
define(`acc0',`r25')
define(`acc1',`r27')

C Limbs loaded from up.  Only two physical registers are used: ul0/ul2 share
C r4 and ul1/ul3 share r5.  The names only identify the unroll slot.
define(`ul0',`r4')
define(`ul1',`r5')
define(`ul2',`r4')
define(`ul3',`r5')

C Limbs loaded from rp.  All four names map to the same register r24.
define(`rl0',`r24')
define(`rl1',`r24')
define(`rl2',`r24')
define(`rl3',`r24')

C t0: low product part after the NAIL_BITS shift.  t1: signed carry word.
define(`t0',`r7')
define(`t1',`r8')

define(`NAIL_BITS',`GMP_NAIL_BITS')
define(`NUMB_BITS',`GMP_NUMB_BITS')

dnl  This declaration is munged by configure
NAILS_SUPPORT(2-63)

ASM_START()
PROLOGUE(mpn_submul_1)
C Pre-shift the multiplier and build numb_mask = (all ones) >> NAIL_BITS.
C r31 is the Alpha zero register, so -1(r31) yields the constant -1.
        sll     vl0, NAIL_BITS, vl0
        lda     numb_mask, -1(r31)
        srl     numb_mask, NAIL_BITS, numb_mask

C Dispatch on n mod 4: 1 -> L(1m4), 2 -> L(2m4), 0 -> L(0m4), 3 falls through.
        and     n, 3, r25
        cmpeq   r25, 1, r21
        bne     r21, L(1m4)
        cmpeq   r25, 2, r21
        bne     r21, L(2m4)
        beq     r25, L(0m4)

C n mod 4 = 3.  rp is biased by -8 so the offsets used at L(el3) and L(ta3)
C address the first destination limb.
L(3m4): ldq     ul3, 0(up)
        lda     n, -4(n)
        ldq     ul0, 8(up)
        mulq    vl0, ul3, m3a
        umulh   vl0, ul3, m3b
        ldq     ul1, 16(up)
        lda     up, 24(up)
        lda     rp, -8(rp)
        mulq    vl0, ul0, m0a
        umulh   vl0, ul0, m0b
        bge     n, L(ge3)

C Exactly three limbs: finish the feed-in and go straight to the wind-down.
C The first limb has no incoming high word, hence the add of r31 (zero).
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     rl3, 8(rp)
        srl     m3a,NAIL_BITS, t0
        addq    t0, r31, acc1
        subq    rl3, acc1, acc1
        ldq     rl0, 16(rp)
        srl     m0a,NAIL_BITS, t0
        addq    t0, m3b, acc0
        sra     acc1,NUMB_BITS, t1
        br      r31, L(ta3)

C Seven or more limbs: prime the pipeline and enter the loop at L(el3).
L(ge3): ldq     ul2, 0(up)
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     rl3, 8(rp)
        srl     m3a,NAIL_BITS, t0
        ldq     ul3, 8(up)
        lda     n, -4(n)
        mulq    vl0, ul2, m2a
        addq    t0, r31, acc1
        umulh   vl0, ul2, m2b
        subq    rl3, acc1, acc1
        ldq     rl0, 16(rp)
        srl     m0a,NAIL_BITS, t0
        ldq     ul0, 16(up)
        mulq    vl0, ul3, m3a
        addq    t0, m3b, acc0
        sra     acc1,NUMB_BITS, t1
        br      r31, L(el3)

C n mod 4 = 0.  Four limbs are consumed by the feed-in, so n is reduced by 8
C here to leave the same loop-count convention as the other entries.
L(0m4): lda     n, -8(n)
        ldq     ul2, 0(up)
        ldq     ul3, 8(up)
        mulq    vl0, ul2, m2a
        umulh   vl0, ul2, m2b
        ldq     ul0, 16(up)
        mulq    vl0, ul3, m3a
        umulh   vl0, ul3, m3b
        ldq     ul1, 24(up)
        lda     up, 32(up)
        mulq    vl0, ul0, m0a
        umulh   vl0, ul0, m0b
        bge     n, L(ge4)

C Exactly four limbs: go to the wind-down at L(ta4).
        ldq     rl2, 0(rp)
        srl     m2a,NAIL_BITS, t0
        mulq    vl0, ul1, m1a
        addq    t0, r31, acc0
        umulh   vl0, ul1, m1b
        subq    rl2, acc0, acc0
        ldq     rl3, 8(rp)
        srl     m3a,NAIL_BITS, t0
        addq    t0, m2b, acc1
        sra     acc0,NUMB_BITS, t1
        br      r31, L(ta4)

C Eight or more limbs: enter the loop at L(el0).
L(ge4): ldq     rl2, 0(rp)
        srl     m2a,NAIL_BITS, t0
        ldq     ul2, 0(up)
        mulq    vl0, ul1, m1a
        addq    t0, r31, acc0
        umulh   vl0, ul1, m1b
        subq    rl2, acc0, acc0
        ldq     rl3, 8(rp)
        srl     m3a,NAIL_BITS, t0
        ldq     ul3, 8(up)
        lda     n, -4(n)
        mulq    vl0, ul2, m2a
        addq    t0, m2b, acc1
        sra     acc0,NUMB_BITS, t1
        br      r31, L(el0)

C n mod 4 = 2.  rp is biased by -16.
L(2m4): lda     n, -4(n)
        ldq     ul0, 0(up)
        ldq     ul1, 8(up)
        lda     up, 16(up)
        lda     rp, -16(rp)
        mulq    vl0, ul0, m0a
        umulh   vl0, ul0, m0b
        bge     n, L(ge2)

C Exactly two limbs: go to the wind-down at L(ta2).
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     rl0, 16(rp)
        srl     m0a,NAIL_BITS, t0
        addq    t0, r31, acc0
        subq    rl0, acc0, acc0
        ldq     rl1, 24(rp)
        srl     m1a,NAIL_BITS, t0
        addq    t0, m0b, acc1
        sra     acc0,NUMB_BITS, t1
        br      r31, L(ta2)

C Six or more limbs.  After the second decrement of n, a non-negative count
C enters the loop at L(el2); otherwise the wind-down is entered at L(ta6).
L(ge2): ldq     ul2, 0(up)
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     ul3, 8(up)
        lda     n, -4(n)
        mulq    vl0, ul2, m2a
        umulh   vl0, ul2, m2b
        ldq     rl0, 16(rp)
        srl     m0a,NAIL_BITS, t0
        ldq     ul0, 16(up)
        mulq    vl0, ul3, m3a
        addq    t0, r31, acc0
        umulh   vl0, ul3, m3b
        subq    rl0, acc0, acc0
        ldq     rl1, 24(rp)
        srl     m1a,NAIL_BITS, t0
        ldq     ul1, 24(up)
        lda     up, 32(up)
        lda     rp, 32(rp)
        mulq    vl0, ul0, m0a
        addq    t0, m0b, acc1
        sra     acc0,NUMB_BITS, t1
        bge     n, L(el2)

        br      r31, L(ta6)

C n mod 4 = 1.  rp is biased by -24.
L(1m4): lda     n, -4(n)
        ldq     ul1, 0(up)
        lda     up, 8(up)
        lda     rp, -24(rp)
        bge     n, L(ge1)

C Exactly one limb: handled completely here, without the shared wind-down.
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     rl1, 24(rp)
        srl     m1a,NAIL_BITS, t0
        subq    rl1, t0, acc1
        and     acc1,numb_mask, r28
        sra     acc1,NUMB_BITS, t1
        stq     r28, 24(rp)
        subq    m1b, t1, r0
        ret     r31, (r26), 1

C Five or more limbs.  After the second decrement of n, a negative count
C enters the wind-down at L(ta5); otherwise fall into L(ge5).
L(ge1): ldq     ul2, 0(up)
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     ul3, 8(up)
        lda     n, -4(n)
        mulq    vl0, ul2, m2a
        umulh   vl0, ul2, m2b
        ldq     ul0, 16(up)
        mulq    vl0, ul3, m3a
        umulh   vl0, ul3, m3b
        ldq     rl1, 24(rp)
        srl     m1a,NAIL_BITS, t0
        ldq     ul1, 24(up)
        lda     up, 32(up)
        lda     rp, 32(rp)
        mulq    vl0, ul0, m0a
        addq    t0, r31, acc1
        umulh   vl0, ul0, m0b
        subq    rl1, acc1, acc1
        ldq     rl2, 0(rp)
        srl     m2a,NAIL_BITS, t0
        mulq    vl0, ul1, m1a
        addq    t0, m1b, acc0
        sra     acc1,NUMB_BITS, t1
        blt     n, L(ta5)

L(ge5): ldq     ul2, 0(up)
        br      r31, L(el1)

C Main loop, unrolled four ways and software pipelined.  One pass issues four
C mulq/umulh pairs, four loads from up, four loads from rp and four stores,
C lowers n by 4 and advances up and rp by 32 bytes.  It repeats while n >= 0.
C r28 carries the masked result limb from the point it is computed to the
C stq that writes it.
C The U0/U1/L0/L1 tags are the original scheduling annotations, presumably the
C EV6 integer pipe intended for each instruction.
C NOTE(review): confirm against EV6 documentation.
C unop is a no-op that fills out a group of four.
        ALIGN(16)
L(top): mulq    vl0, ul0, m0a                   C U1
        addq    t0, m0b, acc1                   C L0
        sra     acc0,NUMB_BITS, t1              C U0
        stq     r28, -24(rp)                    C L1
C
L(el2): umulh   vl0, ul0, m0b                   C U1
        and     acc0,numb_mask, r28             C L0
        subq    rl1, acc1, acc1                 C U0
        ldq     rl2, 0(rp)                      C L1
C
        unop                                    C U1
        addq    t1, acc1, acc1                  C L0
        srl     m2a,NAIL_BITS, t0               C U0
        ldq     ul2, 0(up)                      C L1
C
        mulq    vl0, ul1, m1a                   C U1
        addq    t0, m1b, acc0                   C L0
        sra     acc1,NUMB_BITS, t1              C U0
        stq     r28, -16(rp)                    C L1
C
L(el1): umulh   vl0, ul1, m1b                   C U1
        and     acc1,numb_mask, r28             C L0
        subq    rl2, acc0, acc0                 C U0
        ldq     rl3, 8(rp)                      C L1
C
        lda     n, -4(n)                        C L1
        addq    t1, acc0, acc0                  C L0
        srl     m3a,NAIL_BITS, t0               C U0
        ldq     ul3, 8(up)                      C L1
C
        mulq    vl0, ul2, m2a                   C U1
        addq    t0, m2b, acc1                   C L0
        sra     acc0,NUMB_BITS, t1              C U0
        stq     r28, -8(rp)                     C L1
C
L(el0): umulh   vl0, ul2, m2b                   C U1
        and     acc0,numb_mask, r28             C L0
        subq    rl3, acc1, acc1                 C U0
        ldq     rl0, 16(rp)                     C L1
C
        unop                                    C U1
        addq    t1, acc1, acc1                  C L0
        srl     m0a,NAIL_BITS, t0               C U0
        ldq     ul0, 16(up)                     C L1
C
        mulq    vl0, ul3, m3a                   C U1
        addq    t0, m3b, acc0                   C L0
        sra     acc1,NUMB_BITS, t1              C U0
        stq     r28, 0(rp)                      C L1
C
L(el3): umulh   vl0, ul3, m3b                   C U1
        and     acc1,numb_mask, r28             C L0
        subq    rl0, acc0, acc0                 C U0
        ldq     rl1, 24(rp)                     C L1
C
        unop                                    C U1
        addq    t1, acc0, acc0                  C L0
        srl     m1a,NAIL_BITS, t0               C U0
        ldq     ul1, 24(up)                     C L1
C
        lda     up, 32(up)                      C L0
        unop                                    C U1
        lda     rp, 32(rp)                      C L1
        bge     n, L(top)                       C U0

C Wind-down: drains the limbs still in flight.  It follows the loop body
C order but issues no further loads from up.  The L(taN) labels are entered
C directly by the short feed-in paths above.
L(end): mulq    vl0, ul0, m0a
        addq    t0, m0b, acc1
        sra     acc0,NUMB_BITS, t1
        stq     r28, -24(rp)
L(ta6): umulh   vl0, ul0, m0b
        and     acc0,numb_mask, r28
        subq    rl1, acc1, acc1
        ldq     rl2, 0(rp)
        addq    t1, acc1, acc1
        srl     m2a,NAIL_BITS, t0
        mulq    vl0, ul1, m1a
        addq    t0, m1b, acc0
        sra     acc1,NUMB_BITS, t1
        stq     r28, -16(rp)
L(ta5): umulh   vl0, ul1, m1b
        and     acc1,numb_mask, r28
        subq    rl2, acc0, acc0
        ldq     rl3, 8(rp)
        addq    t1, acc0, acc0
        srl     m3a,NAIL_BITS, t0
        addq    t0, m2b, acc1
        sra     acc0,NUMB_BITS, t1
        stq     r28, -8(rp)
        unop
        ALIGN(16)
L(ta4): and     acc0,numb_mask, r28
        subq    rl3, acc1, acc1
        ldq     rl0, 16(rp)
        addq    t1, acc1, acc1
        srl     m0a,NAIL_BITS, t0
        addq    t0, m3b, acc0
        sra     acc1,NUMB_BITS, t1
        stq     r28, 0(rp)
        unop
        ALIGN(16)
L(ta3): and     acc1,numb_mask, r28
        subq    rl0, acc0, acc0
        ldq     rl1, 24(rp)
        addq    t1, acc0, acc0
        srl     m1a,NAIL_BITS, t0
        addq    t0, m0b, acc1
        sra     acc0,NUMB_BITS, t1
        stq     r28, 8(rp)
        unop
        ALIGN(16)
C Last two limbs.  r0 = final high product word minus the final carry word.
C r0 is also m0a, which is no longer live at this point.
L(ta2): and     acc0,numb_mask, r28
        subq    rl1, acc1, acc1
        addq    t1, acc1, acc1
        sra     acc1,NUMB_BITS, t1
        stq     r28, 16(rp)
        and     acc1,numb_mask, r28
        subq    m1b, t1, r0
        stq     r28, 24(rp)
        ret     r31, (r26), 1
EPILOGUE()
ASM_END()