github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc32/v9/addmul_1.asm

dnl  SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl  the result to a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C  Algorithm: We use two floating-point multiplies per limb product, with the
C  invariant v operand split into two 16-bit pieces, and the u operand split
C  into 32-bit pieces.  We convert the two 48-bit products and transfer them
C  to the integer unit.

C		    cycles/limb
C  UltraSPARC 1&2:     6.5
C  UltraSPARC 3:	?

C  Possible optimizations:
C    1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C       memory-bandwidth limited, this could save 1.5 cycles/limb.
C    2. Unroll the inner loop.  Since we already use alternate temporary areas,
C       it is very straightforward to unroll, using an exit branch midway.
C       Unrolling would allow deeper scheduling, which could improve speed for
C       the L2 cache case.
C    3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C       aren't sufficiently apart-scheduled with just two temp areas.
C    4. Specialize for particular v values.  If its upper 16 bits are zero, we
C       could save many operations.
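
C  For orientation, a rough C-level sketch of what each limb iteration below
C  computes.  The names p0, p16, p and cy match the register comments in the
C  code; the C lines themselves are illustrative, not part of GMP.  With the
C  invariant v split as v0 = v & 0xffff and v16 = v >> 16, both partial
C  products fit exactly in a double's 53-bit mantissa:
C
C	uint64_t p16 = (uint64_t)((double) up[i] * (double) v16); C fmuld/fdtox
C	uint64_t p0  = (uint64_t)((double) up[i] * (double) v0);  C fmuld/fdtox
C	uint64_t p   = p0 + (p16 << 16) + cy + rp[i];
C	rp[i] = (uint32_t) p;		C store low 32 bits
C	cy    = p >> 32;		C carry into the next limb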

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3

define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_addmul_1)
	add	%sp, -FSIZE, %sp
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1
	stx	%g1, [%sp+104]
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6
	fxtod	%f8, %f8
	ld	[%sp+104], %f10		C zero f10

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2

	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	lduw	[%o0], %g5		C read rp[i]
	b	.L1
	add	%o0, -16, %o0

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	lduw	[%o0], %g5		C read rp[i]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	lduw	[%o0], %g5		C read rp[i]
	b	.L3
	add	%o0, -8, %o0

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	b	.L4
	add	%o0, -4, %o0

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5

C BEGIN MAIN LOOP
	.align	16
C -- 0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C -- 1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C -- 2
	nop
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C -- 3
	nop
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C -- 4
	nop
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C -- 5
	xor	%o5, 16, %o5		C alternate scratch variables
	add	%o1, 4, %o1		C up++
	stw	%g4, [%o0-4]
	fanop
C -- 6
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP

.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]

.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+12], %g5		C read rp[i]

.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+16], %g5		C read rp[i]

.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy

	mov	%g3, %o0
	retl
	sub	%sp, -FSIZE, %sp
EPILOGUE(mpn_addmul_1)
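
C  Caller-visible semantics, per the standard GMP mpn interface:
C
C	mp_limb_t mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n,
C				mp_limb_t v);
C
C  i.e. rp[0..n-1] += up[0..n-1] * v, returning the final carry limb,
C  which the code above leaves in %o0 (copied from %g3 before retl).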