dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C  Algorithm: We use two floating-point multiplies per limb product, with the
C  invariant v operand split into two 16-bit pieces, and the u operand split
C  into 32-bit pieces.  We convert the two 48-bit products and transfer them
C  to the integer unit.  (A C model of the scheme is sketched below.)

C                   cycles/limb
C  UltraSPARC 1&2:      6.5
C  UltraSPARC 3:         ?

C  Possible optimizations:
C    1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C       memory bandwidth limited, this could save 1.5 cycles/limb.
C    2. Unroll the inner loop.  Since we already use alternate temporary areas,
C       it is very straightforward to unroll, using an exit branch midway.
C       Unrolling would allow deeper scheduling, which could improve speed for
C       the L2 cache case.
C    3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C       aren't sufficiently apart-scheduled with just two temp areas.
C    4. Specialize for particular v values.  If the upper 16 bits of v are
C       zero, we could save many operations.
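
C  The limb-product scheme above, modelled in C.  This is an illustrative
C  sketch only; mul_limb_model and its cy parameter are hypothetical names,
C  not part of GMP.  It mirrors one iteration of the main loop: each
C  u * v_half product is below 2^48 and hence exact in an IEEE double.
C
C	#include <stdint.h>
C
C	static uint32_t
C	mul_limb_model (uint32_t u, uint32_t v, uint32_t *cy)
C	{
C	  double ud  = (double) u;              /* fxtod: whole 32-bit u limb */
C	  double v0  = (double) (v & 0xffff);   /* low 16 bits of v           */
C	  double v16 = (double) (v >> 16);      /* high 16 bits of v          */
C	  uint64_t p0  = (uint64_t) (ud * v0);  /* fmuld + fdtox, < 2^48      */
C	  uint64_t p16 = (uint64_t) (ud * v16); /* fmuld + fdtox, < 2^48      */
C	  uint64_t p = p0 + (p16 << 16) + *cy;  /* p = u * v + cy, < 2^64     */
C	  *cy = (uint32_t) (p >> 32);           /* high half becomes new cy   */
C	  return (uint32_t) p;                  /* low half is stored to rp[] */
C	}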

C INPUT PARAMETERS
C rp	o0
C up	o1
C n	o2
C v	o3

define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_mul_1)
	add	%sp, -FSIZE, %sp
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1
	stx	%g1, [%sp+104]
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6
	fxtod	%f8, %f8
	ld	[%sp+104], %f10		C zero f10

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2

	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L1
	add	%o0, -16, %o0

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	b	.L3
	add	%o0, -8, %o0

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	b	.L4
	add	%o0, -4, %o0

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5

C BEGIN MAIN LOOP
	.align	16
C -- 0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C -- 1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C -- 2
	nop
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C -- 3
	nop
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C -- 4
	srlx	%g4, 32, %g3		C new cy
	add	%o1, 4, %o1		C up++
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C -- 5
	xor	%o5, 16, %o5		C alternate scratch variables
	stw	%g4, [%o0-4]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP

.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy

.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy

.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy

.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy

	mov	%g3, %o0
	retl
	sub	%sp, -FSIZE, %sp
EPILOGUE(mpn_mul_1)
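
C  Usage sketch (standard GMP mpn conventions: mpn_mul_1 computes
C  {rp,n} = {up,n} * v and returns the carry-out limb):
C
C	mp_limb_t cy = mpn_mul_1 (rp, up, n, v);  /* rp, up: n-limb arrays */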