github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc64/ultrasparct3/aormul_2.asm

dnl  SPARC v9 mpn_mul_2 and mpn_addmul_2 for T3/T4/T5.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                     cycles/limb     cycles/limb
C                        mul_2          addmul_2
C UltraSPARC T3:         22.5            23.5
C UltraSPARC T4:          3.25            3.75


C The code is reasonably scheduled but also relies on OoO.  There was hope that
C this could run at around 3.0 and 3.5 c/l respectively, on T4.  Two cycles per
C iteration needs to be removed.
C
C We could almost use 2-way unrolling, but currently the wN registers live too
C long.  By changing add x,w1,w1 to add x,w1,w0, i.e. migrate the values down-
C wards, 2-way unrolling should become possible.  With n-indexed addressing it
C should run no slower.
C
C The rp loads to g1/g3 are very much over-scheduled.  Presumably, they could
C be postponed a full way, and then just one register could be used.
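C
C For orientation, the operation performed can be sketched in terms of the
C single-limb primitives mpn_mul_1 and mpn_addmul_1.  This is an illustrative
C sketch inferred from how mpn_mul_basecase calls the 2-limb entry points;
C treat the exact edge-limb handling as an assumption rather than as a
C statement taken from this file:
C
C   mpn_mul_2 (rp, up, n, vp):
C     rp[n] = mpn_mul_1 (rp, up, n, vp[0]);
C     return mpn_addmul_1 (rp + 1, up, n, vp[1]);
C
C   mpn_addmul_2 (rp, up, n, vp):
C     rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);
C     return mpn_addmul_1 (rp + 1, up, n, vp[1]);
C
C i.e. {up,n} is multiplied by the 2-limb number {vp,2}; the low n+1 limbs of
C the result (plus the previous contents of {rp,n} in the addmul case) end up
C at {rp,n+1}, and the most significant limb is returned.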

C INPUT PARAMETERS
define(`rp', `%i0')
define(`up', `%i1')
define(`n',  `%i2')
define(`vp', `%i3')

define(`v0', `%o0')
define(`v1', `%o1')

define(`w0', `%o2')
define(`w1', `%o3')
define(`w2', `%o4')
define(`w3', `%o5')

ifdef(`OPERATION_mul_2',`
      define(`AM2',      `')
      define(`ADDX',     `addcc`'$1')
      define(`func',     `mpn_mul_2')
')
ifdef(`OPERATION_addmul_2',`
      define(`AM2',      `$1')
      define(`ADDX',     `addxccc($1,$2,$3)')
      define(`func',     `mpn_addmul_2')
')


MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2)

ASM_START()
        REGISTER(%g2,#scratch)
        REGISTER(%g3,#scratch)
PROLOGUE(func)
        save    %sp, -176, %sp

        ldx     [vp+0], v0              C load v0
        and     n, 3, %g5
        ldx     [vp+8], v1              C load v1
        add     n, -6, n
        ldx     [up+0], %g4
        brz     %g5, L(b0)
         cmp    %g5, 2
        bcs     L(b1)
         nop
        be      L(b2)
         nop

L(b3):
        AM2(`   ldx     [rp+0], %g1')
        mulx    %g4, v0, w2
        umulxhi(%g4, v0, w3)
        ldx     [up+8], %i5
        mulx    %g4, v1, %l3
        umulxhi(%g4, v1, %l7)
        AM2(`   ldx     [rp+8], %g3')
        add     up, -8, up
        add     rp, -8, rp
        b       L(lo3)
         mov    0, w0

L(b2):
        AM2(`   ldx     [rp+0], %g3')
        mulx    %g4, v0, w3
        umulxhi(%g4, v0, w0)
        ldx     [up+8], %i4
        mulx    %g4, v1, %l1
        umulxhi(%g4, v1, %l5)
        AM2(`   ldx     [rp+8], %g1')
        add     rp, 16, rp
        brlz    n, L(end)
         mov    0, w1
        ba      L(top)
         add    up, 16, up

L(b1):
        AM2(`   ldx     [rp+0], %g1')
        mulx    %g4, v0, w0
        umulxhi(%g4, v0, w1)
        ldx     [up+8], %i5
        mulx    %g4, v1, %l3
        umulxhi(%g4, v1, %l7)
        AM2(`   ldx     [rp+8], %g3')
        add     up, 8, up
        add     rp, 8, rp
        b       L(lo1)
         mov    0, w2

L(b0):
        AM2(`   ldx     [rp+0], %g3')
        mulx    %g4, v0, w1
        umulxhi(%g4, v0, w2)
        ldx     [up+8], %i4
        mulx    %g4, v1, %l1
        umulxhi(%g4, v1, %l5)
        AM2(`   ldx     [rp+8], %g1')
        b       L(lo0)
         mov    0, w3

        ALIGN(16)                       C cycle
L(top): mulx    %i4, v0, %l2            C 0->5
        umulxhi(%i4, v0, %l6)           C 0->5
        ldx     [up+0], %i5             C 1->6
        AM2(`   addcc   w3, %g3, w3')   C 1
        stx     w3, [rp-16]             C 2
        ADDX(`  %l1, w0, w0')           C 2
        addxccc(%l5, w1, w1)            C 3
        mulx    %i4, v1, %l3            C 3->9
        umulxhi(%i4, v1, %l7)           C 4->9
        AM2(`   ldx     [rp+0], %g3')   C 4
        addcc   %l2, w0, w0             C 5
        addxccc(%l6, w1, w1)            C 5
        addxc(  %g0, %g0, w2)           C 6
L(lo1): mulx    %i5, v0, %l0            C 6
        umulxhi(%i5, v0, %l4)           C 7
        ldx     [up+8], %i4             C 7
        AM2(`   addcc   w0, %g1, w0')   C 8
        stx     w0, [rp-8]              C 8
        ADDX(`  %l3, w1, w1')           C 9
        addxccc(%l7, w2, w2)            C 9
        mulx    %i5, v1, %l1            C 10
        umulxhi(%i5, v1, %l5)           C 10
        AM2(`   ldx     [rp+8], %g1')   C 11
        addcc   %l0, w1, w1             C 11
        addxccc(%l4, w2, w2)            C 12
        addxc(  %g0, %g0, w3)           C 12
L(lo0): mulx    %i4, v0, %l2            C 13
        umulxhi(%i4, v0, %l6)           C 13
        ldx     [up+16], %i5            C 14
        AM2(`   addcc   w1, %g3, w1')   C 14
        stx     w1, [rp+0]              C 15
        ADDX(`  %l1, w2, w2')           C 15
        addxccc(%l5, w3, w3)            C 16
        mulx    %i4, v1, %l3            C 16
        umulxhi(%i4, v1, %l7)           C 17
        AM2(`   ldx     [rp+16], %g3')  C 17
        addcc   %l2, w2, w2             C 18
        addxccc(%l6, w3, w3)            C 18
        addxc(  %g0, %g0, w0)           C 19
L(lo3): mulx    %i5, v0, %l0            C 19
        umulxhi(%i5, v0, %l4)           C 20
        ldx     [up+24], %i4            C 20
        AM2(`   addcc   w2, %g1, w2')   C 21
        stx     w2, [rp+8]              C 21
        ADDX(`  %l3, w3, w3')           C 22
        addxccc(%l7, w0, w0)            C 22
        mulx    %i5, v1, %l1            C 23
        umulxhi(%i5, v1, %l5)           C 23
        AM2(`   ldx     [rp+24], %g1')  C 24
        addcc   %l0, w3, w3             C 24
        addxccc(%l4, w0, w0)            C 25
        addxc(  %g0, %g0, w1)           C 25
        add     up, 32, up
        add     rp, 32, rp
        brgz    n, L(top)
         add    n, -4, n

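C Wind down: the final source limb is already in %i4.  Form its two partial
C products, fold in the pending carries, complete the last stores, and leave
C the most significant result limb in %i0 as the return value.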
L(end): mulx    %i4, v0, %l2
        umulxhi(%i4, v0, %l6)
        AM2(`   addcc   w3, %g3, w3')
        stx     w3, [rp-16]
        ADDX(`  %l1, w0, w0')
        addxccc(%l5, w1, w1)
        mulx    %i4, v1, %l3
        umulxhi(%i4, v1, %l7)
        addcc   %l2, w0, w0
        addxccc(%l6, w1, w1)
        addxc(  %g0, %g0, w2)
        AM2(`   addcc   w0, %g1, w0')
        stx     w0, [rp-8]
        ADDX(`  %l3, w1, w1')
        stx     w1, [rp+0]
        addxc(%l7, w2, %i0)

        ret
         restore
EPILOGUE()