dnl  Origin: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc64/ultrasparct3/aormul_4.asm

dnl  SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C		    cycles/limb      cycles/limb
C		       mul_4           addmul_4
C UltraSPARC T3:	21.5		22.0
C UltraSPARC T4:	 2.625		 2.75


C The code is well-scheduled and relies on OoO very little.  There is hope that
C this will run at around 2.5 and 2.75 c/l respectively, on T4.

C ---------------------------------------------------------------------------
C mpn_mul_4 / mpn_addmul_4
C
C Multiplies the n-limb operand at up by the four limbs at vp.  The low n+3
C limbs of the result are stored at rp and the most significant limb is
C returned in %i0 (seen by the caller as %o0 after the restore).
C For mpn_addmul_4 the limbs already present at rp are added in as they are
C overwritten; the last three stored limbs have no rp limb added.
C
C Which of the two functions is assembled is selected by OPERATION_mul_4 or
C OPERATION_addmul_4: AM4 lines exist only in the addmul_4 build, and ADDX
C is a plain addcc for mul_4 but a carry-consuming addxccc for addmul_4.
C
C NOTE(review): the prologue, one pass through mid and the wind-down at end
C consume at least three limbs of up, so callers presumably guarantee
C n >= 3 -- confirm against the C callers.
C ---------------------------------------------------------------------------

C Incoming arguments, as seen after the save below.
define(`rp',	`%i0')
define(`up',	`%i1')
define(`n',	`%i2')
define(`vp',	`%i3')

C The four multiplier limbs vp[0..3].  v3 shares %i3 with vp, so vp[3] has to
C be the last thing loaded through vp.
define(`v0',	`%g1')
define(`v1',	`%o7')
define(`v2',	`%g2')
define(`v3',	`%i3')

C Running column sums w0..w3, plus w4 which holds the carry out of the
C low-product add chain.
define(`w0',	`%o0')
define(`w1',	`%o1')
define(`w2',	`%o2')
define(`w3',	`%o3')
define(`w4',	`%o4')

C Limb read back from rp (addmul_4 only).
define(`r0',	`%o5')

C Two alternating limbs of up: the loop handles u0 at top and u1 at mid.
define(`u0',	`%i4')
define(`u1',	`%i5')

C Base pointers staggered by one limb each, all indexed with the same
C negative byte counter n.
define(`rp0',	`rp')
define(`rp1',	`%g3')
define(`rp2',	`%g4')
define(`up0',	`up')
define(`up1',	`%g5')

ifdef(`OPERATION_mul_4',`
      define(`AM4',      `')
      define(`ADDX',	 `addcc`'$1')
      define(`func',     `mpn_mul_4')
')
ifdef(`OPERATION_addmul_4',`
      define(`AM4',      `$1')
      define(`ADDX',	 `addxccc($1,$2,$3)')
      define(`func',     `mpn_addmul_4')
')


MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)
PROLOGUE(func)
	save	%sp, -176, %sp		C new register window, 176-byte frame

C Set-up, interleaved with the first loads and multiplies:
C   n becomes a byte count (n*8) biased by -28, the base pointers are moved
C   forward by that amount (rp additionally back by 16), and n is negated in
C   the delay slot of the be below.  Every access is then [base + n] with n
C   counting upwards; the bias is not a multiple of 8 but cancels in each
C   effective address (the first [up1 + n] is up + 8, the first [rp2 + n]
C   is rp + 0).
	ldx	[up + 0], u1		C load up[0] early
	andcc	n, 1, %g0		C is n odd?
	ldx	[vp + 0], v0
	sllx	n, 3, n			C limbs to bytes
	ldx	[vp + 8], v1
	add	n, -28, n
	ldx	[vp + 16], v2
	add	rp, -16, rp
	ldx	[vp + 24], v3		C overwrites vp, which is dead from here
	add	up, n, up0
	add	rp, n, rp0
	add	up0, 8, up1
	add	rp0, 8, rp1
	add	rp0, 16, rp2
	mulx	u1, v0, %l0		C low halves of up[0] * v0..v3
	mov	0, w0
	mulx	u1, v1, %l1
	mov	0, w1
	mulx	u1, v2, %l2
	mov	0, w2
	mulx	u1, v3, %l3
	mov	0, w3

	be	L(evn)			C tests the andcc above: even n
	 neg	n, n			C delay slot, executed on both paths

C Odd n: up[0] moves to u0, the next limb goes to u1, and the loop is entered
C at its midpoint.
L(odd):	mov	u1, u0
	ldx	[up1 + n], u1
AM4(`	ldx	[rp2 + n], r0')
	umulxhi(u0, v0, %l4)		C high halves of up[0] * v0..v3
	umulxhi(u0, v1, %l5)
	umulxhi(u0, v2, %l6)
	umulxhi(u0, v3, %l7)
	b	L(mid)
	 add	n, 8, n			C delay slot

C Even n: up[0] stays in u1, the next limb goes to u0, and execution falls
C through into the top of the loop.
L(evn):	ldx	[up1 + n], u0
AM4(`	ldx	[rp2 + n], r0')
	umulxhi(u1, v0, %l4)
	umulxhi(u1, v1, %l5)
	umulxhi(u1, v2, %l6)
	umulxhi(u1, v3, %l7)
	add	n, 16, n

C Main loop, two limbs of up per iteration.  Each half:
C   1. adds the pending low products %l0..%l3 into w0..w3 and saves the carry
C      out of that chain in w4,
C   2. (addmul_4 only) adds the rp limb r0 into w0,
C   3. stores w0 as a finished result limb,
C   4. adds the pending high products %l4..%l7, shifting the window down one
C      limb (w1 to w0, w2 to w1, w3 to w2, w4 to w3),
C while the products for the following limb of up are started in between.
	ALIGN(16)
L(top):	addcc	%l0, w0, w0
	mulx	u0, v0, %l0	C w 0
	addxccc(%l1, w1, w1)
	mulx	u0, v1, %l1	C w 1
	addxccc(%l2, w2, w2)
	mulx	u0, v2, %l2	C w 2
	addxccc(%l3, w3, w3)
	mulx	u0, v3, %l3	C w 3
	ldx	[up0 + n], u1
	addxc(	%g0, %g0, w4)		C w4 = carry out of the low chain
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp0 + n]
ADDX(`	%l4, w1, w0')
	umulxhi(u0, v0, %l4)	C w 1
AM4(`	ldx	[rp1 + n], r0')
	addxccc(%l5, w2, w1)
	umulxhi(u0, v1, %l5)	C w 2
	addxccc(%l6, w3, w2)
	umulxhi(u0, v2, %l6)	C w 3
	addxc(	%l7, w4, w3)
	umulxhi(u0, v3, %l7)	C w 4
L(mid):	addcc	%l0, w0, w0
	mulx	u1, v0, %l0	C w 1
	addxccc(%l1, w1, w1)
	mulx	u1, v1, %l1	C w 2
	addxccc(%l2, w2, w2)
	mulx	u1, v2, %l2	C w 3
	addxccc(%l3, w3, w3)
	mulx	u1, v3, %l3	C w 4
	ldx	[up1 + n], u0
	addxc(	%g0, %g0, w4)		C w4 = carry out of the low chain
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp1 + n]
ADDX(`	%l4, w1, w0')
	umulxhi(u1, v0, %l4)	C w 2
AM4(`	ldx	[rp2 + n], r0')
	addxccc(%l5, w2, w1)
	umulxhi(u1, v1, %l5)	C w 3
	addxccc(%l6, w3, w2)
	umulxhi(u1, v2, %l6)	C w 4
	addxc(	%l7, w4, w3)
	umulxhi(u1, v3, %l7)	C w 5
	brlz	n, L(top)		C loop while the byte counter is negative
	 add	n, 16, n		C delay slot: advance two limbs

C Wind-down: the last limb of up is already in u0.  Same steps as a loop half,
C but no further up limb is loaded.
L(end):	addcc	%l0, w0, w0
	mulx	u0, v0, %l0
	addxccc(%l1, w1, w1)
	mulx	u0, v1, %l1
	addxccc(%l2, w2, w2)
	mulx	u0, v2, %l2
	addxccc(%l3, w3, w3)
	mulx	u0, v3, %l3
	addxc(	%g0, %g0, w4)
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp0 + n]
ADDX(`	%l4, w1, w0')
	umulxhi(u0, v0, %l4)
AM4(`	ldx	[rp1 + n], r0')
	addxccc(%l5, w2, w1)
	umulxhi(u0, v1, %l5)
	addxccc(%l6, w3, w2)
	umulxhi(u0, v2, %l6)
	addxc(	%l7, w4, w3)
	umulxhi(u0, v3, %l7)
C Fold in the products of the final limb; nothing new is started.
	addcc	%l0, w0, w0
	addxccc(%l1, w1, w1)
	addxccc(%l2, w2, w2)
	addxccc(%l3, w3, w3)
	addxc(	%g0, %g0, w4)
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp1 + n]
ADDX(`	%l4, w1, w0')
	addxccc(%l5, w2, w1)
	addxccc(%l6, w3, w2)
C The stx and plain add below do not touch the condition codes, so the carry
C from the addxccc above is still live for the final addxc.
	stx	w0, [rp2 + n]
	add	n, 16, n
	stx	w1, [rp1 + n]
	stx	w2, [rp2 + n]
	addxc(	%l7, w4, %i0)		C most significant limb is the return value
	ret
	 restore			C delay slot: pop the register window
EPILOGUE()