dnl  Origin: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc64/ultrasparc1234/addmul_1.asm

dnl  SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl  the result to a second limb vector.

dnl  Copyright 1998, 2000-2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     14
C UltraSPARC 3:	      17.5

C Algorithm: We use eight floating-point multiplies per limb product, with the
C invariant v operand split into four 16-bit pieces, and the up operand split
C into 32-bit pieces.  We sum pairs of 48-bit partial products using
C floating-point add, then convert the four 49-bit product-sums and transfer
C them to the integer unit.

C Possible optimizations:
C   0. Rewrite to use algorithm of mpn_addmul_2.
C   1. Align the stack area where we transfer the four 49-bit product-sums
C      to a 32-byte boundary.  That would minimize the cache collision.
C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
C      be to align the area to map to the area immediately before up?)
C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the
C      develop mpn_addmul_2.  This would save many integer instructions.
C   3. Unrolling.  Questionable if it is worth the code expansion, given that
C      it could only save 1 cycle/limb.
C   4. Specialize for particular v values.  If its upper 32 bits are zero, we
C      could save many operations, in the FPU (fmuld), but more so in the IEU
C      since we'll be summing 48-bit quantities, which might be simpler.
C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
C      the i00,i16,i32,i48 RAW closer together.  The latter separation should
C      be no greater than needed for L2 cache latency, and also not so great
C      that i16 needs to be copied.
C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
C      ops.)

C Instruction classification (as per UltraSPARC-1/2 functional units):
C    8 FM
C   10 FA
C   12 MEM
C   10 ISHIFT + 14 IADDLOG
C    1 BRANCH
C   55 insns in total (plus one mov insn that should be optimized out)

C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
C sustain the peak execution rate of 4 instructions/cycle.
C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3
C
C Computes {rp,n} += {up,n} * v one 64-bit limb at a time and returns the
C carry-out limb (cy is copied to %o0 in the delay slot of the final return).
C n must be at least 1: up[0] is loaded unconditionally during initialization,
C and every exit test relies on the negated byte index reaching exactly zero.

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

C Register naming.  The numeric suffix is the bit weight of the quantity.
C   v00..v48	the four 16-bit pieces of v, held as doubles
C   u00, u32	low and high 32-bit halves of the current up limb, as doubles
C   p00..p48	products u00 * v00..v48
C   r32..r80	products u32 * v00..v48
C   a00..a48	values converted with fdtox and passed through the stack
C   i00..i48	the same four values reloaded into integer registers
C   cy, rlimb	running carry and the current rp limb
C   xffff, xffffffff	16-bit and 32-bit masks

define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
define(`u00',`%f32') define(`u32', `%f34')
define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
define(`cy',`%g1')
define(`rlimb',`%g3')
define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
define(`xffffffff',`%l7')
define(`xffff',`%o0')

PROLOGUE(mpn_addmul_1)

C Initialization.  (1) Split v operand into four 16-bit chunks and store them
C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.

C The scratch area [%sp+2223+0 .. +31] is the channel between the FPU and the
C integer unit.  2223 = 2047 + 176: the SPARC V9 stack bias plus the 176-byte
C register save and argument area at the bottom of the frame.

	save	%sp, -256, %sp
	mov	-1, %g4
	srlx	%g4, 48, xffff			C store mask in register `xffff'
	and	%i3, xffff, %g2
	stx	%g2, [%sp+2223+0]
	srlx	%i3, 16, %g3
	and	%g3, xffff, %g3
	stx	%g3, [%sp+2223+8]
	srlx	%i3, 32, %g2
	and	%g2, xffff, %g2
	stx	%g2, [%sp+2223+16]
	srlx	%i3, 48, %g3
	stx	%g3, [%sp+2223+24]
	srlx	%g4, 32, xffffffff		C store mask in register `xffffffff'

C Turn n into a negative byte index that counts up to zero, and point the
C bases at the end of the vectors.  The pipeline reads rp two index steps and
C writes it four index steps after the matching up limb was loaded, hence the
C -16 and -32 biases on the load (%i0) and store (%i4) bases.  %i5 = up+4
C addresses the low 32-bit half of each limb.

	sllx	%i2, 3, %i2
	mov	0, cy				C clear cy
	add	%i0, %i2, %i0
	add	%i1, %i2, %i1
	neg	%i2
	add	%i1, 4, %i5
	add	%i0, -32, %i4
	add	%i0, -16, %i0

C The word at offset 0 is the upper half of the 64-bit value (v & 0xffff)
C stored above, so it reads as zero.  %f2:%f3 and %f4:%f5 then hold the two
C 32-bit halves of up[i] as zero-extended 64-bit integers, ready for fxtod.

	ldd	[%sp+2223+0], v00
	ldd	[%sp+2223+8], v16
	ldd	[%sp+2223+16], v32
	ldd	[%sp+2223+24], v48
	ld	[%sp+2223+0],%f2		C zero f2
	ld	[%sp+2223+0],%f4		C zero f4
	ld	[%i5+%i2], %f3			C read low 32 bits of up[i]
	ld	[%i1+%i2], %f5			C read high 32 bits of up[i]
	fxtod	v00, v00
	fxtod	v16, v16
	fxtod	v32, v32
	fxtod	v48, v48

C Start real work.  (We sneakingly read f3 and f5 above...)
C The software pipeline is very deep, requiring 4 feed-in stages.
C Each stage bumps the index and falls through to .L_one, .L_two, .L_three or
C .L_four when the index reaches zero, i.e. when n is 1, 2, 3 or 4.

	fxtod	%f2, u00
	fxtod	%f4, u32
	fmuld	u00, v00, a00
	fmuld	u00, v16, a16
	fmuld	u00, v32, p32
	fmuld	u32, v00, r32
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_two_or_more
	fmuld	u32, v16, r48			C (delay slot)

.L_one:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	ldx	[%i0+%i2], rlimb		C read rp[i]
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	add	%i2, 8, %i2

	srlx	rlimb, 32, %g4			C HI(rlimb)
	and	rlimb, xffffffff, %g5		C LO(rlimb)
	add	i00, %g5, %g5			C i00+ now in g5
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4			C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5			C (i48 >> 16)
	add	i32, %g4, %g4			C i32+ now in g4
	sllx	i48, 32, %l6			C (i48 << 32)
	srlx	%g4, 32, %o3			C (i32 >> 32)
	add	%l5, %l4, %o1			C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2			C (i32 << 16)
	add	%o3, %o1, %o1			C hi64  in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3			C (hi64 << 48)
	add	%g2, %o2, %o2			C mi64- in %o2
	add	%l6, %o2, %o2			C mi64- in %o2
	sub	%o2, %o3, %o2			C mi64  in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4			C x = prev(i00) + cy
	b	.L_out_1
	add	%i2, 8, %i2			C (delay slot)

.L_two_or_more:
	ld	[%i5+%i2], %f3			C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5			C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fxtod	%f2, u00
	fxtod	%f4, u32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_three_or_more
	fmuld	u32, v16, r48			C (delay slot)

.L_two:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	ldx	[%i0+%i2], rlimb		C read rp[i]
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	srlx	rlimb, 32, %g4			C HI(rlimb)
	and	rlimb, xffffffff, %g5		C LO(rlimb)
	ldx	[%i0+%i2], rlimb		C read rp[i]
	add	i00, %g5, %g5			C i00+ now in g5
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4			C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5			C (i48 >> 16)
	add	i32, %g4, %g4			C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6			C (i48 << 32)
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3			C (i32 >> 32)
	add	%l5, %l4, %o1			C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2			C (i32 << 16)
	add	%o3, %o1, %o1			C hi64  in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3			C (hi64 << 48)
	add	%g2, %o2, %o2			C mi64- in %o2
	add	%l6, %o2, %o2			C mi64- in %o2
	sub	%o2, %o3, %o2			C mi64  in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4			C x = prev(i00) + cy
	b	.L_out_2
	add	%i2, 8, %i2			C (delay slot)

.L_three_or_more:
	ld	[%i5+%i2], %f3			C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5			C read high 32 bits of up[i]
	fdtox	a00, a00
	ldx	[%i0+%i2], rlimb		C read rp[i]
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_four_or_more
	fmuld	u32, v16, r48			C (delay slot)

.L_three:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	srlx	rlimb, 32, %g4			C HI(rlimb)
	and	rlimb, xffffffff, %g5		C LO(rlimb)
	ldx	[%i0+%i2], rlimb		C read rp[i]
	faddd	p48, r48, a48
	add	i00, %g5, %g5			C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4			C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5			C (i48 >> 16)
	add	i32, %g4, %g4			C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6			C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3			C (i32 >> 32)
	add	%l5, %l4, %o1			C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2			C (i32 << 16)
	add	%o3, %o1, %o1			C hi64  in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3			C (hi64 << 48)
	add	%g2, %o2, %o2			C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2			C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2			C mi64  in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4			C x = prev(i00) + cy
	b	.L_out_3
	add	%i2, 8, %i2			C (delay slot)

.L_four_or_more:
	ld	[%i5+%i2], %f3			C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5			C read high 32 bits of up[i]
	fdtox	a00, a00
	srlx	rlimb, 32, %g4			C HI(rlimb)
	and	rlimb, xffffffff, %g5		C LO(rlimb)
	ldx	[%i0+%i2], rlimb		C read rp[i]
	faddd	p48, r48, a48
	add	i00, %g5, %g5			C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4			C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	srlx	i48, 16, %l5			C (i48 >> 16)
	add	i32, %g4, %g4			C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	sllx	i48, 32, %l6			C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3			C (i32 >> 32)
	add	%l5, %l4, %o1			C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	sllx	%g4, 16, %o2			C (i32 << 16)
	add	%o3, %o1, %o1			C hi64  in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	sllx	%o1, 48, %o3			C (hi64 << 48)
	add	%g2, %o2, %o2			C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	add	%l6, %o2, %o2			C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	sub	%o2, %o3, %o2			C mi64  in %o2   1st ASSIGNMENT
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	add	cy, %g5, %o4			C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48			C (delay slot)

.L_four:
	b,a	.L_out_4			C annulled: no delay slot insn runs

C BEGIN MAIN LOOP
C The C 00 .. C 13 markers number the 14 cycles of one iteration, four
C instructions each.  The feed-in blocks above and the wind-down blocks below
C repeat subsets of this instruction sequence.
	.align	16
.Loop:
C 00
	srlx	%o4, 16, %o5			C (x >> 16)
	ld	[%i5+%i2], %f3			C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
C 01
	add	%o5, %o2, %o2			C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5			C (x & 0xffff)
	ld	[%i1+%i2], %f5			C read high 32 bits of up[i]
	fdtox	a00, a00
C 02
	srlx	rlimb, 32, %g4			C HI(rlimb)
	and	rlimb, xffffffff, %g5		C LO(rlimb)
	ldx	[%i0+%i2], rlimb		C read rp[i]
	faddd	p48, r48, a48
C 03
	srlx	%o2, 48, %o7			C (mi64 >> 48)
	add	i00, %g5, %g5			C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
C 04
	sllx	%o2, 16, %i3			C (mi64 << 16)
	add	%o7, %o1, cy			C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
C 05
	srlx	i16, 48, %l4			C (i16 >> 48)
	mov	i16, %g2			C keep old i16; next ldx overwrites it
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
C 06
	srlx	i48, 16, %l5			C (i48 >> 16)
	add	i32, %g4, %g4			C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
C 07
	sllx	i48, 32, %l6			C (i48 << 32)
	or	%i3, %o5, %o5			C limb = (mi64 << 16) | (x & 0xffff)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
C 08
	srlx	%g4, 32, %o3			C (i32 >> 32)
	add	%l5, %l4, %o1			C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
C 09
	sllx	%g4, 16, %o2			C (i32 << 16)
	add	%o3, %o1, %o1			C hi64  in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
C 10
	sllx	%o1, 48, %o3			C (hi64 << 48)
	add	%g2, %o2, %o2			C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
C 11
	add	%l6, %o2, %o2			C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
C 12
	sub	%o2, %o3, %o2			C mi64  in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]			C store finished limb via rp-based %i4
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
C 13
	add	cy, %g5, %o4			C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48			C (delay slot)
C END MAIN LOOP

C Wind-down.  On entry to .L_out_k, k result limbs still have to be stored;
C each block stores one and falls through to the next.

.L_out_4:
	srlx	%o4, 16, %o5			C (x >> 16)
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	add	%o5, %o2, %o2			C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5			C (x & 0xffff)
	fdtox	a00, a00
	srlx	rlimb, 32, %g4			C HI(rlimb)
	and	rlimb, xffffffff, %g5		C LO(rlimb)
	ldx	[%i0+%i2], rlimb		C read rp[i]
	faddd	p48, r48, a48
	srlx	%o2, 48, %o7			C (mi64 >> 48)
	add	i00, %g5, %g5			C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	sllx	%o2, 16, %i3			C (mi64 << 16)
	add	%o7, %o1, cy			C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4			C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5			C (i48 >> 16)
	add	i32, %g4, %g4			C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6			C (i48 << 32)
	or	%i3, %o5, %o5			C limb = (mi64 << 16) | (x & 0xffff)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3			C (i32 >> 32)
	add	%l5, %l4, %o1			C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2			C (i32 << 16)
	add	%o3, %o1, %o1			C hi64  in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3			C (hi64 << 48)
	add	%g2, %o2, %o2			C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2			C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2			C mi64  in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]			C store finished limb via rp-based %i4
	add	cy, %g5, %o4			C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_3:
	srlx	%o4, 16, %o5			C (x >> 16)
	add	%o5, %o2, %o2			C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5			C (x & 0xffff)
	fdtox	r64, a00
	srlx	rlimb, 32, %g4			C HI(rlimb)
	and	rlimb, xffffffff, %g5		C LO(rlimb)
	ldx	[%i0+%i2], rlimb		C read rp[i]
	srlx	%o2, 48, %o7			C (mi64 >> 48)
	add	i00, %g5, %g5			C i00+ now in g5
	fdtox	r80, a16
	sllx	%o2, 16, %i3			C (mi64 << 16)
	add	%o7, %o1, cy			C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4			C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5			C (i48 >> 16)
	add	i32, %g4, %g4			C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6			C (i48 << 32)
	or	%i3, %o5, %o5			C limb = (mi64 << 16) | (x & 0xffff)
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3			C (i32 >> 32)
	add	%l5, %l4, %o1			C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2			C (i32 << 16)
	add	%o3, %o1, %o1			C hi64  in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3			C (hi64 << 48)
	add	%g2, %o2, %o2			C mi64- in %o2
	add	%l6, %o2, %o2			C mi64- in %o2
	sub	%o2, %o3, %o2			C mi64  in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]			C store finished limb via rp-based %i4
	add	cy, %g5, %o4			C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_2:
	srlx	%o4, 16, %o5			C (x >> 16)
	add	%o5, %o2, %o2			C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5			C (x & 0xffff)
	srlx	rlimb, 32, %g4			C HI(rlimb)
	and	rlimb, xffffffff, %g5		C LO(rlimb)
	srlx	%o2, 48, %o7			C (mi64 >> 48)
	add	i00, %g5, %g5			C i00+ now in g5
	sllx	%o2, 16, %i3			C (mi64 << 16)
	add	%o7, %o1, cy			C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4			C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5			C (i48 >> 16)
	add	i32, %g4, %g4			C i32+ now in g4
	sllx	i48, 32, %l6			C (i48 << 32)
	or	%i3, %o5, %o5			C limb = (mi64 << 16) | (x & 0xffff)
	srlx	%g4, 32, %o3			C (i32 >> 32)
	add	%l5, %l4, %o1			C hi64- in %o1
	sllx	%g4, 16, %o2			C (i32 << 16)
	add	%o3, %o1, %o1			C hi64  in %o1   1st ASSIGNMENT
	sllx	%o1, 48, %o3			C (hi64 << 48)
	add	%g2, %o2, %o2			C mi64- in %o2
	add	%l6, %o2, %o2			C mi64- in %o2
	sub	%o2, %o3, %o2			C mi64  in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]			C store finished limb via rp-based %i4
	add	cy, %g5, %o4			C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_1:
	srlx	%o4, 16, %o5			C (x >> 16)
	add	%o5, %o2, %o2			C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5			C (x & 0xffff)
	srlx	%o2, 48, %o7			C (mi64 >> 48)
	sllx	%o2, 16, %i3			C (mi64 << 16)
	add	%o7, %o1, cy			C new cy
	or	%i3, %o5, %o5			C limb = (mi64 << 16) | (x & 0xffff)
	stx	%o5, [%i4+%i2]			C store finished limb via rp-based %i4

C Final carry limb: cy + i00 + (i16 << 16).  At this point i00 and i16 hold
C the converted r64 and r80 products of the last limb, reloaded from the stack.
	sllx	i00, 0, %g2			C shift by 0, i.e. a plain copy
	add	%g2, cy, cy
	sllx	i16, 16, %g3
	add	%g3, cy, cy

	return	%i7+8
	mov	cy, %o0				C (delay slot) return value
EPILOGUE(mpn_addmul_1)