github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev6/nails/mul_1.asm (about) 1 dnl Alpha ev6 nails mpn_mul_1. 2 3 dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C cycles/limb 34 C EV4: 42 35 C EV5: 18 36 C EV6: 3.25 37 38 C TODO 39 C * Reroll loop for 3.0 c/l with current 4-way unrolling. 40 C * The loop is overscheduled wrt loads and wrt multiplies, in particular 41 C umulh. 42 C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 43 C and would work since the loop structure is really regular. 
C INPUT PARAMETERS
C   rp   destination pointer; every result limb is written with stq off rp
C   up   source pointer; every operand limb is read with ldq off up
C   n    limb count; it is reduced in steps of 4 and tested for sign
C   vl0  the single multiplier limb applied to every limb read from up
define(`rp',`r16')
define(`up',`r17')
define(`n', `r18')
define(`vl0',`r19')

C Mask that clears the top NAIL_BITS bits of a word (all-ones shifted right
C by NAIL_BITS); applied to each accumulator before it is stored.
define(`numb_mask',`r6')

C Product registers.  mNa receives the mulq result and mNb the umulh result
C for the limb held in ulN.  Note that m0a is r0, which is also the register
C written last before each ret.
define(`m0a',`r0')
define(`m0b',`r1')
define(`m1a',`r2')
define(`m1b',`r3')
define(`m2a',`r20')
define(`m2b',`r21')
define(`m3a',`r22')
define(`m3b',`r23')

C Two accumulators used alternately for consecutive result limbs.  r25 is
C also the scratch register for n mod 4 in the dispatch code below.
define(`acc0',`r25')
define(`acc1',`r27')

C Source limb registers.  Only two physical registers are used: ul0 and ul2
C are both r4, ul1 and ul3 are both r5.  The code therefore always issues
C both multiplies of a limb before loading the limb two positions later.
define(`ul0',`r4')
define(`ul1',`r5')
define(`ul2',`r4')
define(`ul3',`r5')

C rl0-rl3 are defined here but are not referenced anywhere in the code
C of this routine.
define(`rl0',`r24')
define(`rl1',`r24')
define(`rl2',`r24')
define(`rl3',`r24')

C t0 holds a low product word after it has been shifted right by NAIL_BITS;
C t1 holds the carry out of an accumulator (accumulator shifted right by
C NUMB_BITS).
define(`t0',`r7')
define(`t1',`r8')

C Short local names for the GMP_NAIL_BITS and GMP_NUMB_BITS symbols, which
C are defined outside this file.
define(`NAIL_BITS',`GMP_NAIL_BITS')
define(`NUMB_BITS',`GMP_NUMB_BITS')

dnl  This declaration is munged by configure
NAILS_SUPPORT(1-63)

C mpn_mul_1
C
C Multiplies each limb read from up by vl0, adds the carry from the previous
C limb, stores the result masked with numb_mask at rp, and leaves the final
C carry in r0 before returning through r26.
C
C Scheme used throughout: vl0 is shifted left by NAIL_BITS once on entry.
C For each limb the mulq word is shifted right by NAIL_BITS again (giving
C the low part in t0) while the umulh word is used unshifted as the high
C part that is added into the next limb.
C NOTE(review): this split presumably relies on NAIL_BITS + NUMB_BITS being
C the 64-bit register width -- confirm against the GMP configuration.
C
C r28 is used as the staging register for every stored limb, and r31 is
C used wherever a zero operand is needed.
ASM_START()
PROLOGUE(mpn_mul_1)
	sll	vl0, NAIL_BITS, vl0
	lda	numb_mask, -1(r31)
	srl	numb_mask, NAIL_BITS, numb_mask

C Dispatch on n mod 4 to one of four feed-in sequences.  r25 and r21 are
C free to use as scratch here because acc0 and m2b are not yet live.
	and	n, 3, r25
	cmpeq	r25, 1, r21
	bne	r21, L(1m4)
	cmpeq	r25, 2, r21
	bne	r21, L(2m4)
	beq	r25, L(0m4)

C n mod 4 = 3.  Three limbs are preloaded and rp is biased by -8 so that
C the stores at 8, 16 and 24(rp) in the shared tail hit the first three
C result limbs.  The bge is taken when n was at least 7 on entry.
L(3m4):	ldq	ul3, 0(up)
	lda	n, -4(n)
	ldq	ul0, 8(up)
	mulq	vl0, ul3, m3a
	umulh	vl0, ul3, m3b
	ldq	ul1, 16(up)
	lda	up, 24(up)
	lda	rp, -8(rp)
	mulq	vl0, ul0, m0a
	umulh	vl0, ul0, m0b
	bge	n, L(ge3)

C Exactly 3 limbs: finish the multiplies and join the tail at L(ta3).
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	srl	m3a,NAIL_BITS, t0
	addq	t0, r31, acc1
	srl	m0a,NAIL_BITS, t0
	addq	t0, m3b, acc0
	srl	acc1,NUMB_BITS, t1
	br	r31, L(ta3)

C 7 or more limbs: prime the pipeline and enter the loop at L(el3).
L(ge3):	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	srl	m3a,NAIL_BITS, t0
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	addq	t0, r31, acc1
	umulh	vl0, ul2, m2b
	srl	m0a,NAIL_BITS, t0
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	addq	t0, m3b, acc0
	srl	acc1,NUMB_BITS, t1
	br	r31, L(el3)

C n mod 4 = 0.  Four limbs are preloaded and rp needs no bias.  n is reduced
C by 8 here, so the bge is taken when n was at least 8 on entry.
C NOTE(review): n = 0 also lands here and would be processed as four limbs;
C presumably callers never pass n = 0 -- confirm against the mpn contract.
L(0m4):	lda	n, -8(n)
	ldq	ul2, 0(up)
	ldq	ul3, 8(up)
	mulq	vl0, ul2, m2a
	umulh	vl0, ul2, m2b
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	umulh	vl0, ul3, m3b
	ldq	ul1, 24(up)
	lda	up, 32(up)
	mulq	vl0, ul0, m0a
	umulh	vl0, ul0, m0b
	bge	n, L(ge4)

C Exactly 4 limbs: join the tail at L(ta4).
	srl	m2a,NAIL_BITS, t0
	mulq	vl0, ul1, m1a
	addq	t0, r31, acc0
	umulh	vl0, ul1, m1b
	srl	m3a,NAIL_BITS, t0
	addq	t0, m2b, acc1
	srl	acc0,NUMB_BITS, t1
	br	r31, L(ta4)

C 8 or more limbs: prime the pipeline and enter the loop at L(el0).
L(ge4):	srl	m2a,NAIL_BITS, t0
	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	addq	t0, r31, acc0
	umulh	vl0, ul1, m1b
	srl	m3a,NAIL_BITS, t0
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	addq	t0, m2b, acc1
	srl	acc0,NUMB_BITS, t1
	br	r31, L(el0)

C n mod 4 = 2.  Two limbs are preloaded and rp is biased by -16 so that the
C stores at 16 and 24(rp) in L(ta2) hit the first two result limbs.  The bge
C is taken when n was at least 6 on entry.
L(2m4):	lda	n, -4(n)
	ldq	ul0, 0(up)
	ldq	ul1, 8(up)
	lda	up, 16(up)
	lda	rp, -16(rp)
	mulq	vl0, ul0, m0a
	umulh	vl0, ul0, m0b
	bge	n, L(ge2)

C Exactly 2 limbs: join the tail at L(ta2).
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	srl	m0a,NAIL_BITS, t0
	addq	t0, r31, acc0
	srl	m1a,NAIL_BITS, t0
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	br	r31, L(ta2)

C 6 or more limbs: four further limbs are loaded and up and rp advance by
C 32.  With n still non-negative the loop is entered at L(el2); otherwise
C (exactly 6 limbs) the tail is entered at L(ta6).
L(ge2):	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	umulh	vl0, ul2, m2b
	srl	m0a,NAIL_BITS, t0
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	addq	t0, r31, acc0
	umulh	vl0, ul3, m3b
	srl	m1a,NAIL_BITS, t0
	ldq	ul1, 24(up)
	lda	up, 32(up)
	lda	rp, 32(rp)
	mulq	vl0, ul0, m0a
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	bge	n, L(el2)

	br	r31, L(ta6)

C n mod 4 = 1.  One limb is preloaded and rp is biased by -24.  The bge is
C taken when n was at least 5 on entry.
L(1m4):	lda	n, -4(n)
	ldq	ul1, 0(up)
	lda	up, 8(up)
	lda	rp, -24(rp)
	bge	n, L(ge1)

C Exactly 1 limb: handled completely here.  The masked low part is stored
C at 24(rp), which is the first result limb given the -24 bias, and the
C carry plus the high product word is left in r0.
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	srl	m1a,NAIL_BITS, t0
	addq	t0, r31, acc1
	and	acc1,numb_mask, r28
	srl	acc1,NUMB_BITS, t1
	stq	r28, 24(rp)
	addq	t1, m1b, r0
	ret	r31, (r26), 1

C 5 or more limbs: four further limbs are loaded and up and rp advance by
C 32.  With n negative (exactly 5 limbs) the tail is entered at L(ta5);
C otherwise fall into L(ge5).
L(ge1):	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	umulh	vl0, ul2, m2b
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	umulh	vl0, ul3, m3b
	srl	m1a,NAIL_BITS, t0
	ldq	ul1, 24(up)
	lda	up, 32(up)
	lda	rp, 32(rp)
	mulq	vl0, ul0, m0a
	addq	t0, r31, acc1
	umulh	vl0, ul0, m0b
	srl	m2a,NAIL_BITS, t0
	mulq	vl0, ul1, m1a
	addq	t0, m1b, acc0
	srl	acc1,NUMB_BITS, t1
	blt	n, L(ta5)

C 9 or more limbs: load the next limb and enter the loop at L(el1).
L(ge5):	ldq	ul2, 0(up)
	br	r31, L(el1)

C Main loop, unrolled four ways; one pass stores four result limbs, loads
C four source limbs and advances up and rp by 32.  L(el2), L(el1), L(el0)
C and L(el3) are the entry points used by the feed-in code above.
C
C Per limb the pattern is: start mulq for a new limb, form the next
C accumulator as t0 plus the previous high product word, extract the carry
C t1 from the current accumulator, store the previously masked limb from
C r28, then issue umulh, mask the current accumulator into r28, add t1 into
C the next accumulator and shift the following low product word into t0.
C The result of each multiply is consumed much later in program order than
C the point where it is issued.
C
C Instructions are laid out in groups of four separated by empty C lines,
C with unop used as filler.
C NOTE(review): the U0/U1/L0/L1 tags presumably name the EV6 issue slot each
C instruction is meant to occupy -- confirm against EV6 scheduling notes.
	ALIGN(16)
L(top):	mulq	vl0, ul0, m0a		C U1
	addq	t0, m0b, acc1		C L0
	srl	acc0,NUMB_BITS, t1	C U0
	stq	r28, -24(rp)		C L1
C
L(el2):	umulh	vl0, ul0, m0b		C U1
	and	acc0,numb_mask, r28	C L0
	unop				C U0
	unop				C L1
C
	unop				C U1
	addq	t1, acc1, acc1		C L0
	srl	m2a,NAIL_BITS, t0	C U0
	ldq	ul2, 0(up)		C L1
C
	mulq	vl0, ul1, m1a		C U1
	addq	t0, m1b, acc0		C L0
	srl	acc1,NUMB_BITS, t1	C U0
	stq	r28, -16(rp)		C L1
C
L(el1):	umulh	vl0, ul1, m1b		C U1
	and	acc1,numb_mask, r28	C L0
	unop				C U0
	lda	n, -4(n)		C L1
C
	unop				C U1
	addq	t1, acc0, acc0		C L0
	srl	m3a,NAIL_BITS, t0	C U0
	ldq	ul3, 8(up)		C L1
C
	mulq	vl0, ul2, m2a		C U1
	addq	t0, m2b, acc1		C L0
	srl	acc0,NUMB_BITS, t1	C U0
	stq	r28, -8(rp)		C L1
C
L(el0):	umulh	vl0, ul2, m2b		C U1
	and	acc0,numb_mask, r28	C L0
	unop				C U0
	unop				C L1
C
	unop				C U1
	addq	t1, acc1, acc1		C L0
	srl	m0a,NAIL_BITS, t0	C U0
	ldq	ul0, 16(up)		C L1
C
	mulq	vl0, ul3, m3a		C U1
	addq	t0, m3b, acc0		C L0
	srl	acc1,NUMB_BITS, t1	C U0
	stq	r28, 0(rp)		C L1
C
L(el3):	umulh	vl0, ul3, m3b		C U1
	and	acc1,numb_mask, r28	C L0
	unop				C U0
	unop				C L1
C
	unop				C U1
	addq	t1, acc0, acc0		C L0
	srl	m1a,NAIL_BITS, t0	C U0
	ldq	ul1, 24(up)		C L1
C
	lda	up, 32(up)		C L0
	unop				C U1
	lda	rp, 32(rp)		C L1
	bge	n, L(top)		C U0

C Wind-down.  This repeats the loop body without any further loads: the two
C limbs still held in ul0 and ul1 are multiplied and the remaining
C accumulators are masked and stored at offsets -24 through 24 from rp.
C L(ta6) down to L(ta2) are the entry points for the short feed-in paths;
C each falls through into the next.
L(end):	mulq	vl0, ul0, m0a
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	stq	r28, -24(rp)
L(ta6):	umulh	vl0, ul0, m0b
	and	acc0,numb_mask, r28
	addq	t1, acc1, acc1
	srl	m2a,NAIL_BITS, t0
	mulq	vl0, ul1, m1a
	addq	t0, m1b, acc0
	srl	acc1,NUMB_BITS, t1
	stq	r28, -16(rp)
L(ta5):	umulh	vl0, ul1, m1b
	and	acc1,numb_mask, r28
	addq	t1, acc0, acc0
	srl	m3a,NAIL_BITS, t0
	addq	t0, m2b, acc1
	srl	acc0,NUMB_BITS, t1
	stq	r28, -8(rp)
	ALIGN(16)
L(ta4):	and	acc0,numb_mask, r28
	addq	t1, acc1, acc1
	srl	m0a,NAIL_BITS, t0
	addq	t0, m3b, acc0
	srl	acc1,NUMB_BITS, t1
	stq	r28, 0(rp)
	unop
	ALIGN(16)
L(ta3):	and	acc1,numb_mask, r28
	addq	t1, acc0, acc0
	srl	m1a,NAIL_BITS, t0
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	stq	r28, 8(rp)
	unop
	ALIGN(16)
C Last two limbs.  The final carry t1 plus the high product word of the
C last limb (m1b) is left in r0, then control returns through r26.
L(ta2):	and	acc0,numb_mask, r28
	addq	t1, acc1, acc1
	srl	acc1,NUMB_BITS, t1
	stq	r28, 16(rp)
	and	acc1,numb_mask, r28
	addq	t1, m1b, r0
	stq	r28, 24(rp)
	ret	r31, (r26), 1
EPILOGUE()
ASM_END()