dnl  AMD64 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C NOTES
C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
C     large trip count.  Instead, we should follow the generic/sqr_basecase.c
C     code which uses addmul_2s from the start, conditionally leaving a 1x1
C     multiply to the end.  (In assembly code, one would stop invoking
C     addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
C   * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to
C     save/restore carry, instead it can propagate into the high product word.
C   * Align more labels, should shave off a few cycles.
C   * We can safely use 32-bit size operations, since operands with (2^32)
C     limbs will lead to non-termination in practice.
C   * The jump table could probably be optimized, at least for non-pic.
C   * The special code for n <= 4 was quickly written.  It is probably too
C     large and unnecessarily slow.
C   * Consider combining small cases code so that the n=k-1 code jumps into the
C     middle of the n=k code.
C   * Avoid saving registers for small cases code.
C   * Needed variables:
C    n   r11  input size
C    i   r8   work left, initially n
C    j   r9   inner loop count
C        r15  unused
C    v0  r13
C    v1  r14
C    rp  rdi
C    up  rsi
C    w0  rbx
C    w1  rcx
C    w2  rbp
C    w3  r10
C    tp  r12
C    lo  rax
C    hi  rdx
C        rsp

C INPUT PARAMETERS
define(`rp',	  `%rdi')
define(`up',	  `%rsi')
define(`n_param', `%rdx')

define(`n',	`%r11')
define(`tp',	`%r12')
define(`i',	`%r8')
define(`j',	`%r9')
define(`v0',	`%r13')
define(`v1',	`%r14')
define(`w0',	`%rbx')
define(`w1',	`%rcx')
define(`w2',	`%rbp')
define(`w3',	`%r10')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_sqr_basecase(rp, up, n): square the n-limb operand {up,n} and store the
C 2n-limb result at {rp,2n}.  (The n=1 path below writes (rp) and 8(rp), i.e.
C two limbs; larger n scale accordingly.)  SysV args: rp=%rdi, up=%rsi,
C n=%rdx; FUNC_ENTRY(3) shuffles the DOS64 registers to match.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)
	mov	R32(n_param), R32(%rcx)
	mov	R32(n_param), R32(n)		C free original n register (rdx)

	add	$-40, %rsp			C room for 5 callee-saved regs

	and	$3, R32(%rcx)			C rcx = n mod 4
	cmp	$4, R32(n_param)
	lea	4(%rcx), %r8			C r8 = 4 + (n mod 4)

	mov	%rbx, 32(%rsp)
	mov	%rbp, 24(%rsp)
	mov	%r12, 16(%rsp)
	mov	%r13, 8(%rsp)
	mov	%r14, (%rsp)

	cmovg	%r8, %rcx			C n > 4: use table entries 4..7

C Dispatch: entries 0..3 serve n = 4,1,2,3 exactly; entries 4..7 serve the
C general code specialized on n mod 4.
	lea	L(tab)(%rip), %rax
ifdef(`PIC',
`	movslq	(%rax,%rcx,4), %r10
	add	%r10, %rax
	jmp	*%rax
',`
	jmp	*(%rax,%rcx,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(4), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(0m4), L(tab))
	JMPENT(	L(1m4), L(tab))
	JMPENT(	L(2m4), L(tab))
	JMPENT(	L(3m4), L(tab))
	TEXT

C n = 1: single 1x1 square.  No callee-saved regs were clobbered, so just
C drop the frame.
L(1):	mov	(up), %rax
	mul	%rax
	add	$40, %rsp
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

C n = 2: three products (u0^2, u1^2, u0*u1), cross product added in twice.
L(2):	mov	(up), %rax
	mov	%rax, %r8
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, %r9
	mul	%rax
	add	$40, %rsp
	mov	%rax, %r10
	mov	%r11, %rax
	mov	%rdx, %r11
	mul	%r8
	xor	%r8, %r8
	add	%rax, %r9
	adc	%rdx, %r10
	adc	%r8, %r11
	add	%rax, %r9
	mov	%r9, 8(rp)
	adc	%rdx, %r10
	mov	%r10, 16(rp)
	adc	%r8, %r11
	mov	%r11, 24(rp)
	FUNC_EXIT()
	ret

C n = 3: diagonal squares written first, then cross products doubled
C (add/adc chain) and accumulated into rp.
L(3):	mov	(up), %rax
	mov	%rax, %r10
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, 8(rp)
	mul	%rax
	mov	16(up), %rcx
	mov	%rax, 16(rp)
	mov	%rcx, %rax
	mov	%rdx, 24(rp)
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	%r11, %rax
	mul	%r10
	mov	%rax, %r8
	mov	%rcx, %rax
	mov	%rdx, %r9
	mul	%r10
	xor	%r10, %r10
	add	%rax, %r9
	mov	%r11, %rax
	mov	%r10, %r11
	adc	%rdx, %r10

	mul	%rcx
	add	$40, %rsp
	add	%rax, %r10
	adc	%r11, %rdx
	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%rdx, %rdx
	adc	%r11, %r11
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%rdx, 32(rp)
	adc	%r11, 40(rp)
	FUNC_EXIT()
	ret

C n = 4: same scheme as n = 3.  Only rbx is clobbered among the saved regs,
C hence the add $32 followed by the single pop below.
L(4):	mov	(up), %rax
	mov	%rax, %r11
	mul	%rax
	mov	8(up), %rbx
	mov	%rax, (rp)
	mov	%rbx, %rax
	mov	%rdx, 8(rp)
	mul	%rax
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	mov	16(up), %rax
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)
	mov	24(up), %rax
	mul	%rax
	mov	%rax, 48(rp)
	mov	%rbx, %rax
	mov	%rdx, 56(rp)

	mul	%r11
	add	$32, %rsp
	mov	%rax, %r8
	mov	%rdx, %r9
	mov	16(up), %rax
	mul	%r11
	xor	%r10, %r10
	add	%rax, %r9
	adc	%rdx, %r10
	mov	24(up), %rax
	mul	%r11
	xor	%r11, %r11
	add	%rax, %r10
	adc	%rdx, %r11
	mov	16(up), %rax
	mul	%rbx
	xor	%rcx, %rcx
	add	%rax, %r10
	adc	%rdx, %r11
	adc	$0, %rcx
	mov	24(up), %rax
	mul	%rbx
	pop	%rbx
	add	%rax, %r11
	adc	%rdx, %rcx
	mov	16(up), %rdx
	mov	24(up), %rax
	mul	%rdx
	add	%rax, %rcx
	adc	$0, %rdx

	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%r11, %r11
	adc	%rcx, %rcx
	mov	$0, R32(%rax)
	adc	%rdx, %rdx

	adc	%rax, %rax
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%r11, 32(rp)
	adc	%rcx, 40(rp)
	adc	%rdx, 48(rp)
	adc	%rax, 56(rp)
	FUNC_EXIT()
	ret


C General case, n mod 4 == 0: start with an mpn_mul_1-style pass, then fall
C into the addmul_2 do/while chain.
L(0m4):
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0
	mov	8(up), %rax
	lea	(up,n,8), up		C point up at end of input operand

	lea	-4(n), i
C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
	xor	R32(j), R32(j)
	sub	n, j

	mul	v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	16(up,j,8), %rax
	mov	%rdx, w3
	jmp	L(L3)

	ALIGN(16)
L(mul_1_m3_top):
	add	%rax, w2
	mov	w3, (tp,j,8)
	mov	(up,j,8), %rax
	adc	%rdx, w1
	xor	R32(w0), R32(w0)
	mul	v0
	xor	R32(w3), R32(w3)
	mov	w2, 8(tp,j,8)
	add	%rax, w1
	adc	%rdx, w0
	mov	8(up,j,8), %rax
	mov	w1, 16(tp,j,8)
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	16(up,j,8), %rax
	adc	%rdx, w3
L(L3):	xor	R32(w1), R32(w1)
	mul	v0
	add	%rax, w3
	mov	24(up,j,8), %rax
	adc	%rdx, w2
	mov	w0, 24(tp,j,8)
	mul	v0
	add	$4, j
	js	L(mul_1_m3_top)

	add	%rax, w2
	mov	w3, (tp)
	adc	%rdx, w1
	mov	w2, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
	lea	-8(up), up
	jmp	L(dowhile)


C General case, n mod 4 == 1: start with an mpn_mul_2-style pass.
L(1m4):
	lea	8(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0		C u0
	mov	8(up), %rax		C u1
	lea	8(up,n,8), up		C point up at end of input operand

	lea	-3(n), i
C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
	lea	-3(n), j
	neg	j

	mov	%rax, v1		C u1
	mul	v0			C u0 * u1
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	%rax, 8(rp)
	jmp	L(m0)

	ALIGN(16)
L(mul_2_m0_top):
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	%rax, w1
	mov	w0, -24(tp,j,8)
	adc	%rdx, w2
L(m0):	mov	-16(up,j,8), %rax	C u2, u6 ...
	mul	v0			C u0 * u2
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(tp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, -8(tp,j,8)
	adc	%rdx, w0
L(m2x):	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	add	$4, j
	mov	-32(up,j,8), %rax
	mov	w3, -32(tp,j,8)
	js	L(mul_2_m0_top)

	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, -8(tp)
	mov	w1, (tp)

	lea	-16(up), up
	lea	eval(3*8-24)(tp), tp	C tp += 3
	jmp	L(dowhile_end)


C General case, n mod 4 == 2: mul_1 pass, then enter the chain at the
C mid-point.
L(2m4):
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0
	mov	8(up), %rax
	lea	(up,n,8), up		C point up at end of input operand

	lea	-4(n), i
C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
	lea	-2(n), j
	neg	j

	mul	v0
	mov	%rax, w2
	mov	(up,j,8), %rax
	mov	%rdx, w1
	jmp	L(L1)

	ALIGN(16)
L(mul_1_m1_top):
	add	%rax, w2
	mov	w3, (tp,j,8)
	mov	(up,j,8), %rax
	adc	%rdx, w1
L(L1):	xor	R32(w0), R32(w0)
	mul	v0
	xor	R32(w3), R32(w3)
	mov	w2, 8(tp,j,8)
	add	%rax, w1
	adc	%rdx, w0
	mov	8(up,j,8), %rax
	mov	w1, 16(tp,j,8)
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	16(up,j,8), %rax
	adc	%rdx, w3
	xor	R32(w1), R32(w1)
	mul	v0
	add	%rax, w3
	mov	24(up,j,8), %rax
	adc	%rdx, w2
	mov	w0, 24(tp,j,8)
	mul	v0
	add	$4, j
	js	L(mul_1_m1_top)

	add	%rax, w2
	mov	w3, (tp)
	adc	%rdx, w1
	mov	w2, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
	lea	-8(up), up
	jmp	L(dowhile_mid)


C General case, n mod 4 == 3: mul_2 pass, then enter the chain at the
C mid-point.
L(3m4):
	lea	8(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0		C u0
	mov	8(up), %rax		C u1
	lea	8(up,n,8), up		C point up at end of input operand

	lea	-5(n), i
C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
	lea	-1(n), j
	neg	j

	mov	%rax, v1		C u1
	mul	v0			C u0 * u1
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, 8(rp)
	jmp	L(m2)

	ALIGN(16)
L(mul_2_m2_top):
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	%rax, w1
	mov	w0, -24(tp,j,8)
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(tp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, -8(tp,j,8)
	adc	%rdx, w0
L(m2):	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	add	$4, j
	mov	-32(up,j,8), %rax
	mov	w3, -32(tp,j,8)
	js	L(mul_2_m2_top)

	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, -8(tp)
	mov	w1, (tp)

	lea	-16(up), up
	jmp	L(dowhile_mid)

C Main do/while chain: alternate two addmul_2 passes per iteration,
C consuming two limbs of remaining work (i -= 2) each time.
L(dowhile):
C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
	lea	4(i), j
	neg	j

	mov	16(up,j,8), v0
	mov	24(up,j,8), v1
	mov	24(up,j,8), %rax
	mul	v0
	xor	R32(w3), R32(w3)
	add	%rax, 24(tp,j,8)
	adc	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	jmp	L(am2)

	ALIGN(16)
L(addmul_2_m2_top):
	add	w3, (tp,j,8)
	adc	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	w0, 8(tp,j,8)
	adc	%rax, w1
	adc	%rdx, w2
	mov	16(up,j,8), %rax
	mov	$0, R32(w3)
	mul	v0			C v0 * u1
	add	%rax, w1
	mov	16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1			C v1 * u1
	add	w1, 16(tp,j,8)
	adc	%rax, w2
	mov	24(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	24(up,j,8), %rax
	adc	$0, R32(w0)
	mul	v1
	add	w2, 24(tp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
L(am2):	mov	32(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	32(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	$4, j
	js	L(addmul_2_m2_top)

	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2

	add	$-2, R32(i)		C i -= 2

L(dowhile_mid):
C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
	lea	2(i), j
	neg	j

	mov	(up,j,8), v0
	mov	8(up,j,8), v1
	mov	8(up,j,8), %rax
	mul	v0
	xor	R32(w1), R32(w1)
	add	%rax, 8(tp,j,8)
	adc	%rdx, w1
	xor	R32(w2), R32(w2)
	jmp	L(20)

	ALIGN(16)
L(addmul_2_m0_top):
	add	w3, (tp,j,8)
	adc	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	w0, 8(tp,j,8)
	adc	%rax, w1
	adc	%rdx, w2
L(20):	mov	16(up,j,8), %rax
	mov	$0, R32(w3)
	mul	v0			C v0 * u1
	add	%rax, w1
	mov	16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1			C v1 * u1
	add	w1, 16(tp,j,8)
	adc	%rax, w2
	mov	24(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	24(up,j,8), %rax
	adc	$0, R32(w0)
	mul	v1
	add	w2, 24(tp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	32(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	32(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	$4, j
	js	L(addmul_2_m0_top)

	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
L(dowhile_end):

	add	$-2, R32(i)		C i -= 2
	jne	L(dowhile)

C Function mpn_addmul_2s_2 -- wind-up: the final 2x2 addmul.
	mov	-16(up), v0
	mov	-8(up), v1
	mov	-8(up), %rax
	mul	v0
	xor	R32(w3), R32(w3)
	add	%rax, -8(tp)
	adc	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	(up), %rax
	mul	v0
	add	%rax, w3
	mov	(up), %rax
	adc	%rdx, w0
	mul	v1
	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

C Function mpn_sqr_diag_addlsh1 -- final pass: double the accumulated cross
C products (the lsh1, via the adc chains) and add in the diagonal squares
C u[k]^2.  The carry flag is saved/restored across each mul with
C sbb reg,reg / add reg,reg, alternating between rbx and rbp.
	lea	-4(n,n), j		C j = 2n - 4, then negated below

	mov	8(rp), %r11
	lea	-8(up), up
	lea	(rp,j,8), rp
	neg	j
	mov	(up,j,4), %rax		C scale 4: j counts result (2n) limbs
	mul	%rax
	test	$2, R8(j)
	jnz	L(odd)

L(evn):	add	%r11, %r11
	sbb	R32(%rbx), R32(%rbx)	C save CF
	add	%rdx, %r11
	mov	%rax, (rp,j,8)
	jmp	L(d0)

L(odd):	add	%r11, %r11
	sbb	R32(%rbp), R32(%rbp)	C save CF
	add	%rdx, %r11
	mov	%rax, (rp,j,8)
	lea	-2(j), j
	jmp	L(d1)

	ALIGN(16)
L(top):	mov	(up,j,4), %rax
	mul	%rax
	add	R32(%rbp), R32(%rbp)	C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, (rp,j,8)
L(d0):	mov	%r11, 8(rp,j,8)
	mov	16(rp,j,8), %r10
	adc	%r10, %r10
	mov	24(rp,j,8), %r11
	adc	%r11, %r11
	nop
	sbb	R32(%rbp), R32(%rbp)	C save CF
	mov	8(up,j,4), %rax
	mul	%rax
	add	R32(%rbx), R32(%rbx)	C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, 16(rp,j,8)
L(d1):	mov	%r11, 24(rp,j,8)
	mov	32(rp,j,8), %r10
	adc	%r10, %r10
	mov	40(rp,j,8), %r11
	adc	%r11, %r11
	sbb	R32(%rbx), R32(%rbx)	C save CF
	add	$4, j
	js	L(top)

	mov	(up), %rax
	mul	%rax
	add	R32(%rbp), R32(%rbp)	C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, (rp)
	mov	%r11, 8(rp)
	mov	16(rp), %r10
	adc	%r10, %r10
	sbb	R32(%rbp), R32(%rbp)	C save CF
	neg	R32(%rbp)
	mov	8(up), %rax
	mul	%rax
	add	R32(%rbx), R32(%rbx)	C restore carry
	adc	%rax, %r10
	adc	%rbp, %rdx		C fold final carry into high word
	mov	%r10, 16(rp)
	mov	%rdx, 24(rp)

C Restore callee-saved regs in the reverse order of the saves at entry.
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()