dnl  File: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc64/ultrasparc1234/mul_1.asm

dnl  SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     14
C UltraSPARC 3:	      18.5

C Algorithm: We use eight floating-point multiplies per limb product, with the
C invariant v operand split into four 16-bit pieces, and the s1 (up) operand
C split into 32-bit pieces.  We sum pairs of 48-bit partial products using
C floating-point add, then convert the four 49-bit product-sums and transfer
C them to the integer unit.

C Possible optimizations:
C   1. Align the stack area where we transfer the four 49-bit product-sums
C      to a 32-byte boundary.  That would minimize cache collisions.
C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
C      be to align the area to map to the area immediately before s1?)
C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the
C      develop mpn_addmul_2.  This would save many integer instructions.
C   3. Unrolling.  Questionable if it is worth the code expansion, given that
C      it could only save 1 cycle/limb.
C   4. Specialize for particular v values.  If its upper 32 bits are zero, we
C      could save many operations, in the FPU (fmuld), but more so in the IEU
C      since we'll be summing 48-bit quantities, which might be simpler.
C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
C      the i00,i16,i32,i48 RAW less apart.  The latter apart-scheduling should
C      not be greater than needed for L2 cache latency, and also not so great
C      that i16 needs to be copied.
C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
C      ops.)

C Instruction classification (as per UltraSPARC-1/2 functional units):
C    8 FM
C   10 FA
C   11 MEM
C    9 ISHIFT + 11 IADDLOG (counting the loop-control addcc)
C    1 BRANCH
C   50 insns in total (plus three mov insns that should be optimized out)

C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
C sustain 3.79 instructions/cycle.
C INPUT PARAMETERS
C rp	i0	result vector; limbs are stored through %i4, derived from rp
C up	i1	source vector; read 32 bits at a time through %i1 and %i5
C n	i2	limb count; converted below to a negative byte index
C v	i3	multiplier limb; split below into four 16-bit pieces
C
C RETURN VALUE
C The routine leaves cy + i00 + (i16 << 16) of the final limb in the o0 of
C the caller (see the return sequence at the very end), i.e. the limb carried
C out of the top of the product.
C
C NOTE(review): the feed-in code assumes n >= 1.  With n = 0 the index in
C %i2 starts at zero, so the first addcc makes it 8 and the bnz tests never
C see zero at the expected point -- confirm that callers guarantee n >= 1.

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

C Symbolic register names.
C   p00..p48	partial products u00 * v00..v48
C   r32..r80	partial products u32 * v00..v48
C   v00..v48	the four 16-bit pieces of v, held as doubles
C   u00, u32	low and high 32-bit halves of the current up limb, as doubles
C   a00..a48	product-sums, converted to integers before leaving the FPU
C   cy		carry between consecutive limbs (a global register)
C   i00..i48	integer copies of a00..a48 read back from the stack
C   xffff	16-bit mask
C NOTE(review): rlimb and xffffffff are defined (and xffffffff is loaded
C below), but neither is referenced anywhere else in this routine.
define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
define(`u00',`%f32') define(`u32', `%f34')
define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
define(`cy',`%g1')
define(`rlimb',`%g3')
define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
define(`xffffffff',`%l7')
define(`xffff',`%o0')

C Integer register roles, as established by the code below:
C   %i2		negative byte index, stepped by 8 until it reaches zero
C   %i1, %i5	end of up and end of up plus 4; [%i1+%i2] fetches the high
C		word of a limb and [%i5+%i2] the low word
C   %i4		end of rp minus 32; every stx through [%i4+%i2] happens after
C		%i2 has advanced four limbs past the limb being stored
C   %i3		v on entry, reused as a temporary once v has been split
C   %o1, %o2	hi64 and mi64 (see below)
C   %o4		x = previous i00 + cy
C   %g2,%g4,%g5	copies of i16, i32, i00 taken before those are reloaded
C
C Recombination of the four product-sums of one limb (integer unit):
C   hi64 = (i16 >> 48) + (i48 >> 16) + (i32 >> 32)
C   mi64 = i16 + (i32 << 16) + (i48 << 32) - (hi64 << 48) + (x >> 16)
C   result limb = (mi64 << 16) | (x & 0xffff)
C   new cy      = (mi64 >> 48) + hi64
C
C Four 8-byte stack slots at %sp+2223+{0,8,16,24} serve first to move the
C pieces of v into the FPU and afterwards to move a00..a48 back to the
C integer unit.  2223 = 2047 + 176; presumably 2047 is the SPARC V9 stack
C bias and 176 skips the minimum ABI frame area -- confirm against the ABI.

PROLOGUE(mpn_mul_1)

C Initialization.  (1) Split v operand into four 16-bit chunks and store them
C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.

	save	%sp, -256, %sp		C new register window, 256-byte frame
	mov	-1, %g4			C all ones, source for both masks
	srlx	%g4, 48, xffff		C store mask in register `xffff'
	and	%i3, xffff, %g2		C v bits 0-15
	stx	%g2, [%sp+2223+0]
	srlx	%i3, 16, %g3
	and	%g3, xffff, %g3		C v bits 16-31
	stx	%g3, [%sp+2223+8]
	srlx	%i3, 32, %g2
	and	%g2, xffff, %g2		C v bits 32-47
	stx	%g2, [%sp+2223+16]
	srlx	%i3, 48, %g3		C v bits 48-63, already below 2^16
	stx	%g3, [%sp+2223+24]
	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'

	sllx	%i2, 3, %i2		C n limbs -> n*8 bytes
	mov	0, cy			C clear cy
	add	%i0, %i2, %i0		C rp + n*8
	add	%i1, %i2, %i1		C up + n*8
	neg	%i2			C index now runs from -n*8 up to 0
	add	%i1, 4, %i5		C base for the low word of each limb
	add	%i0, -32, %i4		C store base (stores lag by four limbs)
	add	%i0, -16, %i0		C %i0 is not referenced again below

	ldd	[%sp+2223+0], v00
	ldd	[%sp+2223+8], v16
	ldd	[%sp+2223+16], v32
	ldd	[%sp+2223+24], v48
C The word at offset 0 is the upper half of a slot holding a value below
C 2^16, so it reads as zero; f2 and f4 thereby become the zero upper halves
C of the 64-bit integers f2:f3 and f4:f5.
	ld	[%sp+2223+0],%f2		C zero f2
	ld	[%sp+2223+0],%f4		C zero f4
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fxtod	v00, v00		C integer pieces of v -> double
	fxtod	v16, v16
	fxtod	v32, v32
	fxtod	v48, v48

C Start real work.  (We sneakingly read f3 and f5 above...)
C The software pipeline is very deep, requiring 4 feed-in stages.
C
C Each feed-in stage ends with addcc/bnz on the index: if it has reached zero
C there are no more limbs to load and control falls into the matching
C .L_one/.L_two/.L_three/.L_four tail, otherwise the next stage (or the main
C loop) is entered.  The fmuld after each bnz sits in the branch delay slot.
C For the first limb there is no previous limb to add in, so the two lowest
C products go straight into a00 and a16.

	fxtod	%f2, u00		C low half of up[0] -> double
	fxtod	%f4, u32		C high half of up[0] -> double
	fmuld	u00, v00, a00
	fmuld	u00, v16, a16
	fmuld	u00, v32, p32
	fmuld	u32, v00, r32
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_two_or_more
	fmuld	u32, v16, r48

C n = 1: finish the only limb, then join the wind-down at .L_out_1.
.L_one:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00		C top two products of the last limb
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	add	%i2, 8, %i2

	mov	i00, %g5		C i00+ now in g5
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_1
	add	%i2, 8, %i2		C delay slot

C Second feed-in stage: load limb 1 while limb 0 is converted and spilled.
.L_two_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fxtod	%f2, u00
	fxtod	%f4, u32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00		C add top products of the previous limb
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_three_or_more
	fmuld	u32, v16, r48

C n = 2: no more limbs to load; join the wind-down at .L_out_2.
.L_two:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00		C top two products of the last limb
	mov	i00, %g5		C i00+ now in g5
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_2
	add	%i2, 8, %i2		C delay slot

C Third feed-in stage: load limb 2; the integer reloads of i00..i48 start.
.L_three_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_four_or_more
	fmuld	u32, v16, r48

C n = 3: no more limbs to load; join the wind-down at .L_out_3.
.L_three:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_3
	add	%i2, 8, %i2		C delay slot

C Fourth feed-in stage: load limb 3; from here on the instruction mix matches
C the main loop except that no result limb is ready to be stored yet.
.L_four_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48

C n = 4: skip the loop.  The annul bit on this unconditional branch
C suppresses its delay slot, so nothing from .Loop is executed.
.L_four:
	b,a	.L_out_4

C BEGIN MAIN LOOP
C One pass loads a new limb, runs the FPU work for it, and stores the result
C limb that entered the pipeline four limbs earlier.  The C NN markers appear
C to group the instructions by issue cycle (14 cycles per pass according to
C the figures in the file header).
	.align	16
.Loop:
C 00
	srlx	%o4, 16, %o5		C (x >> 16)
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
C 01
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
C 02
	faddd	p48, r48, a48
C 03
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
C 04
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
C 05
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
C 06
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
C 07
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5		C result limb = (mi64 << 16) | (x & 0xffff)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
C 08
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
C 09
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
C 10
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
C 11
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
C 12
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]		C store the finished result limb
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
C 13
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48
C END MAIN LOOP

C Wind-down.  Entering at .L_out_k leaves k result limbs still to be stored;
C each stage stores one limb and falls through to the next.  .L_out_4 still
C has FPU work to finish for the last limb loaded.
.L_out_4:
	srlx	%o4, 16, %o5		C (x >> 16)
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	a00, a00
	faddd	p48, r48, a48
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
C Three limbs left to store.  The top two products of the last limb have no
C following limb to be added to, so they are converted on their own here.
.L_out_3:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	r64, a00
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fdtox	r80, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
C Two limbs left to store.  The i00 and i16 loaded here are kept until the
C final carry computation after .L_out_1.
.L_out_2:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
C Last result limb.
.L_out_1:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	or	%i3, %o5, %o5
	stx	%o5, [%i4+%i2]

C Fold the two top product pieces of the last limb into the carry:
C cy = cy + i00 + (i16 << 16).  The shift by zero is just a register copy.
	sllx	i00, 0, %g2
	add	%g2, cy, cy
	sllx	i16, 16, %g3
	add	%g3, cy, cy

C return switches back to the register window of the caller; the mov in its
C delay slot therefore writes the o0 of the caller, reading cy from a global
C register that is visible in both windows.
	return	%i7+8
	mov	cy, %o0
EPILOGUE(mpn_mul_1)