github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev5/diveby3.asm (about) 1 dnl Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder. 2 3 dnl Copyright 2004, 2005, 2009 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C cycles/limb 34 C EV4: 22 35 C EV5: 11.5 36 C EV6: 6.3 Note that mpn_bdiv_dbm1c is faster 37 38 C TODO 39 C * Remove the unops, they benefit just ev6, which no longer uses this file. 40 C * Try prefetch for destination, using lds. 41 C * Improve feed-in code, by moving initial mulq earlier; make initial load 42 C to u0/u0 to save some copying. 43 C * Combine u0 and u2, u1 and u3. 
C INPUT PARAMETERS
C   rp   r16   pointer the quotient limbs are stored through
C   up   r17   pointer the source limbs are loaded through
C   n    r18   limb count (used below as n mod 4 and as a loop counter)
C   cy   r19   incoming carry; values 1 and 2 get a non-zero initial l0
define(`rp', `r16')
define(`up', `r17')
define(`n', `r18')
define(`cy', `r19')

ASM_START()

C Constant table read once by the prologue code.
C   0xAAAAAAAAAAAAAAAB   3 times this value is 1 mod 2^64, i.e. it is the
C                        multiplicative inverse of 3 modulo 2^64
C   0x5555555555555555   (2^64-1)/3, also equal to minus the inverse mod 2^64
C   0xAAAAAAAAAAAAAAAA   2*(2^64-1)/3, also minus twice the inverse mod 2^64
DATASTART(L(LC),8)
        .quad   0xAAAAAAAAAAAAAAAB
        .quad   0x5555555555555555
        .quad   0xAAAAAAAAAAAAAAAA
DATAEND()

C Register roles.  Several registers are deliberately shared:
C   r0   first the address of the constant table, afterwards u0
C   r4, r5, r8   feed-in temporaries, afterwards q0, q1 and x
C   r28  first the early-loaded initial limb, afterwards cymask
C
C   xAAAA...AB / x5555...55 / xAAAA...AA   the three constants above
C   u0..u3   source limbs in flight (rotating through the 4-way unrolling)
C   q0, q1   products limb * inverse, alternating between loop steps
C   x        quotient limb about to be stored
C   l0       correction added to the next product; the code keeps it equal
C            to cy * 0x5555555555555555 mod 2^64
C   p6, p7   0/1 results of the two threshold compares on x
C   t0, t1   p6 and p7 turned into all-ones/zero masks
C   cymask   borrow flag turned into an all-ones/zero mask
define(`xAAAAAAAAAAAAAAAB', `r20')
define(`x5555555555555555', `r21')
define(`xAAAAAAAAAAAAAAAA', `r22')
define(`u0', `r0')      define(`u1', `r1')
define(`u2', `r2')      define(`u3', `r3')
define(`l0', `r25')     define(`x', `r8')
define(`q0', `r4')      define(`q1', `r5')
define(`p6', `r6')      define(`p7', `r7')
define(`t0', `r23')     define(`t1', `r24')
define(`cymask',`r28')


C mpn_divexact_by3c
C
C For every source limb u the code forms
C       x = u * inverse + l0                  (mod 2^64)
C Since l0 tracks cy * 0x5555555555555555 and 0x5555555555555555 is minus
C the inverse, this equals (u - cy) * inverse mod 2^64.  x is stored as the
C quotient limb.  The carry for the next limb is the sum of three 0/1 flags:
C       (u < cy)                      borrow of the subtraction u - cy
C       (0x5555555555555555 < x)      unsigned compare
C       (0xAAAAAAAAAAAAAAAA < x)      unsigned compare
C and l0 is rebuilt as the sum of 0x5555555555555555 masked by each flag.
C
C The multiply for limb i+1 is issued while limb i is being finished, so the
C loop is software pipelined; the feed-in code primes the pipeline and picks
C the loop entry point from n mod 4, the code at Lend/Lcj1 drains it.
C
C On exit r0 holds the final carry (sum of the three flags of the last limb).
C NOTE(review): r0 is presumably the integer return register of the Alpha
C calling convention - confirm against the ABI document.
C NOTE(review): the first limb is loaded unconditionally, so n >= 1 appears to
C be assumed - confirm against the callers.
PROLOGUE(mpn_divexact_by3c,gp)

        ldq     r28, 0(up)                      C load first limb early

C Put magic constants in registers
C r0 is only a temporary base pointer here; it is reused as u0 later.
        lda     r0, L(LC)
        ldq     xAAAAAAAAAAAAAAAB, 0(r0)
        ldq     x5555555555555555, 8(r0)
        ldq     xAAAAAAAAAAAAAAAA, 16(r0)

C Compute initial l0 value
C   cy == 1  ->  l0 = 0x5555555555555555
C   cy == 2  ->  l0 = 0xAAAAAAAAAAAAAAAA
C   any other cy  ->  l0 = 0
        cmpeq   cy, 1, p6
        cmpeq   cy, 2, p7
        negq    p6, p6                          C 0/1 flag -> zero/all-ones mask
        and     p6, x5555555555555555, l0
        cmovne  p7, xAAAAAAAAAAAAAAAA, l0

C Feed-in depending on (n mod 4)
C n is pre-decremented by 3 so that the sign of n can be tested directly,
C both here (short operands) and by the loop branch.
C r4 and r5 are free to hold the compare results because q0/q1 are not yet
C live; the same holds for r8 (x).
        and     n, 3, r8
        lda     n, -3(n)
        cmpeq   r8, 1, r4
        cmpeq   r8, 2, r5
        bne     r4, $Lb01
        bne     r5, $Lb10
        beq     r8, $Lb00

C n mod 4 == 3: enter at L11.  up and rp are biased by -24 so that the
C 40(up) load and the 24(rp) store at L11 hit the third source limb and the
C first destination limb.
$Lb11:  ldq     u3, 8(up)
        lda     up, -24(up)
        lda     rp, -24(rp)
        mulq    r28, xAAAAAAAAAAAAAAAB, q0
        mov     r28, u2                         C free r28 before it becomes cymask
        br      r31, $L11

C n mod 4 == 0: enter at L00, pointers biased by -16 for the same reason.
$Lb00:  ldq     u2, 8(up)
        lda     up, -16(up)
        lda     rp, -16(rp)
        mulq    r28, xAAAAAAAAAAAAAAAB, q1
        mov     r28, u1                         C free r28 before it becomes cymask
        br      r31, $L00

C n mod 4 == 1: enter at L01, or go straight to the final-limb code at Lcj1
C when n - 3 is negative (no second limb is loaded on that path).  rp is
C biased by -8 first because both L01 and Lcj1 store to 8(rp).
$Lb01:  lda     rp, -8(rp)
        mulq    r28, xAAAAAAAAAAAAAAAB, q0
        mov     r28, u0                         C free r28 before it becomes cymask
        blt     n, $Lcj1
        ldq     u1, 8(up)
        lda     up, -8(up)
        br      r31, $L01

C n mod 4 == 2: fall into the loop top unbiased, or skip the loop and go to
C the two-limb wind-down at Lend when n - 3 is negative.
$Lb10:  ldq     u0, 8(up)
        mulq    r28, xAAAAAAAAAAAAAAAB, q1
        mov     r28, u3                         C free r28 before it becomes cymask
        blt     n, $Lend

C *** MAIN LOOP START ***
C Four copies of the same step, differing only in the rotation of u0..u3 and
C q0/q1 and in the load/store displacements.  The "C 0" .. "C 5" markers
C number the rows of the schedule within one step and the L0/L1/U0/U1 tags
C are the issue-slot annotations of the original authors.  The unops are
C padding (see the TODO list at the top of the file).
C
C Step starting at Ltop: finishes the limb held in u3 (product in q1),
C starts the multiply of u0, loads u1 and stores to 0(rp).
        ALIGN(16)
$Ltop:
C 0
        cmpult  u3, cy, cy                      C L0  borrow of u3 - cy
        mulq    u0, xAAAAAAAAAAAAAAAB, q0       C U1  start next product
        ldq     u1, 16(up)                      C L1
        addq    q1, l0, x                       C U0  quotient limb
C 1
        negq    cy, cymask                      C L0  borrow as a mask
        unop                                    C U1
        unop                                    C L1
        cmpult  x5555555555555555, x, p6        C U0  first threshold
C 2
        cmpult  xAAAAAAAAAAAAAAAA, x, p7        C U1  second threshold
        unop
        unop
        negq    p6, t0                          C L0
C 3
        negq    p7, t1                          C L0
        and     cymask, x5555555555555555, l0   C U1  rebuild l0, term 1
        addq    p6, cy, cy
        and     t0, x5555555555555555, t0
C 4
        and     t1, x5555555555555555, t1
        addq    p7, cy, cy                      C cy = borrow + p6 + p7
        unop
        addq    t0, l0, l0
C 5
        addq    t1, l0, l0                      C l0 = sum of the masked terms
        unop
        stq     x, 0(rp)                        C L1
        unop
C Step starting at L01: finishes u0 (product in q0), starts the multiply of
C u1, loads u2 and stores to 8(rp).
$L01:
C 0
        cmpult  u0, cy, cy                      C L0
        mulq    u1, xAAAAAAAAAAAAAAAB, q1       C U1
        ldq     u2, 24(up)                      C L1
        addq    q0, l0, x                       C U0
C 1
        negq    cy, cymask                      C L0
        unop                                    C U1
        unop                                    C L1
        cmpult  x5555555555555555, x, p6        C U0
C 2
        cmpult  xAAAAAAAAAAAAAAAA, x, p7        C U1
        unop
        unop
        negq    p6, t0                          C L0
C 3
        negq    p7, t1                          C L0
        and     cymask, x5555555555555555, l0   C U1
        addq    p6, cy, cy
        and     t0, x5555555555555555, t0
C 4
        and     t1, x5555555555555555, t1
        addq    p7, cy, cy
        unop
        addq    t0, l0, l0
C 5
        addq    t1, l0, l0
        unop
        stq     x, 8(rp)                        C L1
        unop
C Step starting at L00: finishes u1 (product in q1), starts the multiply of
C u2, loads u3 and stores to 16(rp).
$L00:
C 0
        cmpult  u1, cy, cy                      C L0
        mulq    u2, xAAAAAAAAAAAAAAAB, q0       C U1
        ldq     u3, 32(up)                      C L1
        addq    q1, l0, x                       C U0
C 1
        negq    cy, cymask                      C L0
        unop                                    C U1
        unop                                    C L1
        cmpult  x5555555555555555, x, p6        C U0
C 2
        cmpult  xAAAAAAAAAAAAAAAA, x, p7        C U1
        unop
        unop
        negq    p6, t0                          C L0
C 3
        negq    p7, t1                          C L0
        and     cymask, x5555555555555555, l0   C U1
        addq    p6, cy, cy
        and     t0, x5555555555555555, t0
C 4
        and     t1, x5555555555555555, t1
        addq    p7, cy, cy
        unop
        addq    t0, l0, l0
C 5
        addq    t1, l0, l0
        unop
        stq     x, 16(rp)                       C L1
        unop
C Step starting at L11: finishes u2 (product in q0), starts the multiply of
C u3, loads u0 and stores to 24(rp).  This step also carries the loop
C bookkeeping: n -= 4, up += 32, rp += 32, and a prefetch-style load.
$L11:
C 0
        cmpult  u2, cy, cy                      C L0
        mulq    u3, xAAAAAAAAAAAAAAAB, q1       C U1
        ldq     u0, 40(up)                      C L1
        addq    q0, l0, x                       C U0
C 1
        negq    cy, cymask                      C L0
        unop                                    C U1
        unop                                    C L1
        cmpult  x5555555555555555, x, p6        C U0
C 2
        cmpult  xAAAAAAAAAAAAAAAA, x, p7        C U1
        lda     n, -4(n)                        C L1 bookkeeping
        unop
        negq    p6, t0                          C L0
C 3
        negq    p7, t1                          C L0
        and     cymask, x5555555555555555, l0   C U1
        addq    p6, cy, cy
        and     t0, x5555555555555555, t0
C 4
        and     t1, x5555555555555555, t1
        addq    p7, cy, cy
        unop
        addq    t0, l0, l0
C 5
        addq    t1, l0, l0
        unop
        stq     x, 24(rp)                       C L1
        lda     up, 32(up)
C
        ldl     r31, 256(up)                    C prefetch
        unop
        lda     rp, 32(rp)
        bge     n, $Ltop                        C U1
C *** MAIN LOOP END ***
C Wind-down, second to last limb: same step as at Ltop but without a load,
C since no further source limb is fetched here.  Stores to 0(rp).
$Lend:

        cmpult  u3, cy, cy                      C L0
        mulq    u0, xAAAAAAAAAAAAAAAB, q0       C U1
        unop
        addq    q1, l0, x                       C U0
C 1
        negq    cy, cymask                      C L0
        unop                                    C U1
        unop                                    C L1
        cmpult  x5555555555555555, x, p6        C U0
C 2
        cmpult  xAAAAAAAAAAAAAAAA, x, p7        C U1
        unop
        unop
        negq    p6, t0                          C L0
C 3
        negq    p7, t1                          C L0
        and     cymask, x5555555555555555, l0   C U1
        addq    p6, cy, cy
        and     t0, x5555555555555555, t0
C 4
        and     t1, x5555555555555555, t1
        addq    p7, cy, cy
        unop
        addq    t0, l0, l0
C 5
        addq    t1, l0, l0
        unop
        stq     x, 0(rp)                        C L1
        unop
C Last limb (held in u0, product in q0).  No new l0 is needed, so only the
C three carry flags are summed; the sum is written to r0 and the final
C quotient limb is stored to 8(rp).
$Lcj1:
        cmpult  u0, cy, cy                      C L0
        addq    q0, l0, x                       C U0
        cmpult  x5555555555555555, x, p6        C U0
        cmpult  xAAAAAAAAAAAAAAAA, x, p7        C U1
        addq    p6, cy, cy
        addq    p7, cy, r0                      C final carry out in r0
        stq     x, 8(rp)                        C L1

        ret     r31,(r26),1
EPILOGUE()
ASM_END()

C This is useful for playing with various schedules.
C Expand as: one(0)one(1)one(2)one(3)
C
C Template of one loop step, parameterised by the step index $1 (0..3).
C Register operands are generated as dollar-prefixed numbers: the finished
C limb is (index+3) mod 4, the multiplied limb is the index itself, the
C loaded limb is (index+1) mod 4, and the products alternate between
C registers 4 and 5.  The load displacement is index*8+16 and the store
C displacement is index*8.
C The macro is defined after ASM_END and is not expanded anywhere in the
C visible text; it does not include the n/up/rp bookkeeping or the prefetch
C that the hand-written step at L11 carries.
define(`one',`
C 0
        cmpult  `$'eval(($1+3)%4), cy, cy               C L0
        mulq    `$'$1, xAAAAAAAAAAAAAAAB, `$'eval(4+$1%2) C U1
        ldq     `$'eval(($1+1)%4), eval($1*8+16)(up)    C L1
        addq    `$'eval(4+($1+1)%2), l0, x              C U0
C 1
        negq    cy, cymask                      C L0
        unop                                    C U1
        unop                                    C L1
        cmpult  x5555555555555555, x, p6        C U0
C 2
        cmpult  xAAAAAAAAAAAAAAAA, x, p7        C U1
        unop
        unop
        negq    p6, t0                          C L0
C 3
        negq    p7, t1                          C L0
        and     cymask, x5555555555555555, l0   C U1
        addq    p6, cy, cy
        and     t0, x5555555555555555, t0
C 4
        and     t1, x5555555555555555, t1
        addq    p7, cy, cy
        unop
        addq    t0, l0, l0
C 5
        addq    t1, l0, l0
        unop
        stq     x, eval($1*8)(rp)               C L1
        unop
')