dnl  SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
dnl  subtract the result from a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.
C
C In the integer unit each step forms p = p0 + (p16 << 16) + cy as a 64-bit
C value and then computes rp[i] - p.  The low 32 bits of the difference are
C stored to rp[i]; the high 32 bits are negated and truncated to 32 bits to
C give the borrow cy that is added into the next p.

C		   cycles/limb
C UltraSPARC 1&2:     6.5
C UltraSPARC 3:	      ?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midways.
C      Unrolling would allow deeper scheduling which could improve speed for L2
C      cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.

C INPUT PARAMETERS
C This is a leaf routine: it never executes save and returns with retl, so
C the arguments are accessed in the out registers they arrive in.
C rp	o0
C up	o1
C n	o2
C v	o3
C
C RESULT
C o0	borrow out of the most significant limb (negated high half of the
C	last 64-bit difference)
C
C REQUIREMENTS
C n must be at least 1: the count is decremented before it is first tested,
C so n = 0 would not terminate at the expected place.
C
C CLOBBERS
C %g1-%g5, %o0-%o2, %o5, %icc, and the floating-point registers %f0-%f17.
C FSIZE bytes of stack are reserved on entry and released in the delay slot
C of the return.

define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_submul_1)
	add	%sp, -FSIZE, %sp
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C g2 = v >> 16
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1		C g1 = v & 0xffff
	stx	%g1, [%sp+104]
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = low 16 bits of v as a double
	fxtod	%f8, %f8		C f8 = high 16 bits of v as a double
	ld	[%sp+104], %f10		C zero f10

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

C Each up[i] is loaded into f11, so that the pair f10:f11 holds it as a
C 64-bit integer with a zero upper word; fxtod then converts it into f2.
C The feed-in code below peels off the cases n = 1, 2, 3, 4 and 5, entering
C the wind-down code at .L1 ... .L5 with rp biased so that the fixed store
C offsets used there address the right limbs.

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2

C n = 1
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	lduw	[%o0], %g5		C read rp[i]
	b	.L1
	add	%o0, -16, %o0

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

C n = 2
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	lduw	[%o0], %g5		C read rp[i]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

C n = 3
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	lduw	[%o0], %g5		C read rp[i]
	b	.L3
	add	%o0, -8, %o0

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

C n = 4
C up is not advanced again on this path: all four limbs have been loaded and
C nothing from .L4 onwards reads through up.
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	lduw	[%o0], %g5		C read rp[i]
	b	.L4
	add	%o0, -4, %o0

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5			C n = 5, annulled: no delay slot executed

C BEGIN MAIN LOOP
C The "C -- k" markers separate the instruction groups of one iteration.
	.align 16
C -- 0
.Loop:	sub	%g0, %g3, %g3		C negate high half to get the borrow
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C -- 1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C -- 2
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C -- 3
	nop
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C -- 4
	nop
	sub	%g5, %g4, %g4		C p = rp[i] - p
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C -- 5
	xor	%o5, 16, %o5		C alternate scratch variables
	add	%o1, 4, %o1		C up++
	stw	%g4, [%o0-4]
	fanop
C -- 6
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP

C Wind-down code.  Each stage below retires one limb; the early-exit paths
C above enter at the label matching the number of limbs still to retire.
C The negation of the incoming cy sits just before each label, so that an
C entry from the feed-in code (where cy is still 0) skips it.

.L5:	sub	%g0, %g3, %g3		C negate high half to get the borrow
	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	sub	%g5, %g4, %g4		C p = rp[i] - p
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5		C alternate scratch variables
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]

	sub	%g0, %g3, %g3		C negate high half to get the borrow
.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	sub	%g5, %g4, %g4		C p = rp[i] - p
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5		C alternate scratch variables
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]

	sub	%g0, %g3, %g3		C negate high half to get the borrow
.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C p = rp[i] - p
	xor	%o5, 16, %o5		C alternate scratch variables
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+12], %g5		C read rp[i]

	sub	%g0, %g3, %g3		C negate high half to get the borrow
.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C p = rp[i] - p
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+16], %g5		C read rp[i]

	sub	%g0, %g3, %g3		C negate high half to get the borrow
.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C p = rp[i] - p
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy

	sub	%g0, %g3, %o0		C return the final borrow
	retl
	sub	%sp, -FSIZE, %sp	C delay slot: release the stack area
EPILOGUE(mpn_submul_1)