//go:build !purego

#include "textflag.h"

// Field arithmetic on 256-bit values held as four 64-bit words.
// Word 0 is treated as least significant: every carry/borrow chain below
// starts at offset 0 and ends at offset 24.
//
// External data symbols read by this file (defined elsewhere):
//   ·p2  four 64-bit words, used as the modulus p in every routine
//   ·np  one 64-bit word, used as the per-word reduction multiplier
//        NOTE(review): the reduction steps are only exact if
//        np == -p^-1 mod 2^64 -- confirm against the Go-side definition.
//
// Go ARM64 operand order: "OP Rm, Rn, Rd" computes Rd = Rn OP Rm, and
// "CSEL cond, Ra, Rb, Rd" sets Rd = cond ? Ra : Rb. After SUBS/SBCS the CS
// condition (carry set) means "no borrow".

// Pointer registers. hlp1 aliases res_ptr (R0): the routines that use hlp1
// load the result pointer only at the very end, after hlp1 is dead.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

// acc0..acc3: rotating low accumulator words.
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

// acc4..acc7: high accumulator words.
#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
#define const0 R15
#define const1 R16

#define hlp0 R17
#define hlp1 res_ptr

// Operand words.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

// const2/const3 share registers with t2/t3; t2/t3 are written only by the
// final conditional subtraction of gfpMul, each after its modulus word has
// been consumed by the same instruction.
#define const2 t2
#define const3 t3

// storeBlock writes four words a0..a3 to consecutive 8-byte slots at r.
#define storeBlock(a0,a1,a2,a3, r) \
	MOVD a0, 0+r \
	MOVD a1, 8+r \
	MOVD a2, 16+r \
	MOVD a3, 24+r

// loadBlock reads four consecutive 8-byte words at r into a0..a3.
#define loadBlock(r, a0,a1,a2,a3) \
	MOVD 0+r, a0 \
	MOVD 8+r, a1 \
	MOVD 16+r, a2 \
	MOVD 24+r, a3

// loadModulus reads the four words of ·p2 into p0..p3.
// NOTE(review): the parameter named p2 has the same spelling as the tail of
// the symbol ·p2; presumably the preprocessor treats ·p2 as one identifier
// and leaves it alone -- confirm before renaming anything here.
#define loadModulus(p0,p1,p2,p3) \
	MOVD ·p2+0(SB), p0 \
	MOVD ·p2+8(SB), p1 \
	MOVD ·p2+16(SB), p2 \
	MOVD ·p2+24(SB), p3

// gfpNeg: c = p - a, replaced by (p - a) - p when that subtraction does not
// borrow.
// Arguments (from the FP offsets used below): c at 0(FP), a at 8(FP).
// NOTE(review): for a in [0, p) the only no-borrow case is a == 0, which
// yields 0 instead of p -- assumes callers pass reduced values.
TEXT ·gfpNeg(SB),0,$0-16
	MOVD	a+8(FP), R0
	loadBlock(0(R0), R1,R2,R3,R4)
	loadModulus(R5,R6,R7,R8)

	// R1..R4 = p - a
	SUBS	R1, R5, R1
	SBCS	R2, R6, R2
	SBCS	R3, R7, R3
	SBCS	R4, R8, R4

	// R5..R8 = (p - a) - p; carry stays set only if there was no borrow
	SUBS	R5, R1, R5
	SBCS	R6, R2, R6
	SBCS	R7, R3, R7
	SBCS	R8, R4, R8

	// No borrow: take the subtracted value, otherwise keep p - a
	CSEL	CS, R5, R1, R1
	CSEL	CS, R6, R2, R2
	CSEL	CS, R7, R3, R3
	CSEL	CS, R8, R4, R4

	MOVD	c+0(FP), R0
	storeBlock(R1,R2,R3,R4, 0(R0))
	RET

// gfpAdd: c = a + b, followed by one conditional subtraction of p.
// Arguments: c at 0(FP), a at 8(FP), b at 16(FP).
// R0 is reused as a fifth word to hold the carry out of the 256-bit sum.
TEXT ·gfpAdd(SB),0,$0-24
	MOVD	a+8(FP), R0
	loadBlock(0(R0), R1,R2,R3,R4)
	MOVD	b+16(FP), R0
	loadBlock(0(R0), R5,R6,R7,R8)
	loadModulus(R9,R10,R11,R12)
	MOVD	ZR, R0

	// R0:R4..R1 = a + b
	ADDS	R5, R1
	ADCS	R6, R2
	ADCS	R7, R3
	ADCS	R8, R4
	ADCS	ZR, R0

	// R5..R8 = (a + b) - p, borrow propagated through the fifth word
	SUBS	R9, R1, R5
	SBCS	R10, R2, R6
	SBCS	R11, R3, R7
	SBCS	R12, R4, R8
	SBCS	ZR, R0, R0

	// No borrow means a + b >= p: keep the subtracted value
	CSEL	CS, R5, R1, R1
	CSEL	CS, R6, R2, R2
	CSEL	CS, R7, R3, R3
	CSEL	CS, R8, R4, R4

	MOVD	c+0(FP), R0
	storeBlock(R1,R2,R3,R4, 0(R0))
	RET

// gfpDouble: c = a + a, followed by one conditional subtraction of p.
// Arguments: c at 0(FP), a at 8(FP).
TEXT ·gfpDouble(SB),0,$0-16
	MOVD	a+8(FP), R0
	loadBlock(0(R0), R1,R2,R3,R4)
	loadModulus(R9,R10,R11,R12)
	MOVD	ZR, R0

	// R0:R4..R1 = 2a
	ADDS	R1, R1
	ADCS	R2, R2
	ADCS	R3, R3
	ADCS	R4, R4
	ADCS	ZR, R0

	// R5..R8 = 2a - p
	SUBS	R9, R1, R5
	SBCS	R10, R2, R6
	SBCS	R11, R3, R7
	SBCS	R12, R4, R8
	SBCS	ZR, R0, R0

	CSEL	CS, R5, R1, R1
	CSEL	CS, R6, R2, R2
	CSEL	CS, R7, R3, R3
	CSEL	CS, R8, R4, R4

	MOVD	c+0(FP), R0
	storeBlock(R1,R2,R3,R4, 0(R0))
	RET

// gfpTriple: c = ((a + a) cond-sub p) + a, cond-sub p again.
// Arguments: c at 0(FP), a at 8(FP).
// The original a is parked in R19..R22 across the doubling.
TEXT ·gfpTriple(SB),0,$0-16
	MOVD	a+8(FP), R0
	loadBlock(0(R0), R1,R2,R3,R4)
	MOVD	R1, R19
	MOVD	R2, R20
	MOVD	R3, R21
	MOVD	R4, R22
	loadModulus(R9,R10,R11,R12)
	MOVD	ZR, R0

	// R0:R4..R1 = 2a
	ADDS	R1, R1
	ADCS	R2, R2
	ADCS	R3, R3
	ADCS	R4, R4
	ADCS	ZR, R0

	// First conditional subtraction of p
	SUBS	R9, R1, R5
	SBCS	R10, R2, R6
	SBCS	R11, R3, R7
	SBCS	R12, R4, R8
	SBCS	ZR, R0, R0

	CSEL	CS, R5, R1, R1
	CSEL	CS, R6, R2, R2
	CSEL	CS, R7, R3, R3
	CSEL	CS, R8, R4, R4

	// Reset the fifth word before the second addition
	MOVD	ZR, R0

	// Add the saved a
	ADDS	R19, R1
	ADCS	R20, R2
	ADCS	R21, R3
	ADCS	R22, R4
	ADCS	ZR, R0

	// Second conditional subtraction of p
	SUBS	R9, R1, R5
	SBCS	R10, R2, R6
	SBCS	R11, R3, R7
	SBCS	R12, R4, R8
	SBCS	ZR, R0, R0

	CSEL	CS, R5, R1, R1
	CSEL	CS, R6, R2, R2
	CSEL	CS, R7, R3, R3
	CSEL	CS, R8, R4, R4

	MOVD	c+0(FP), R0
	storeBlock(R1,R2,R3,R4, 0(R0))
	RET

// gfpSub: c = a - b, with p added back when the subtraction borrowed.
// Arguments: c at 0(FP), a at 8(FP), b at 16(FP).
TEXT ·gfpSub(SB),0,$0-24
	MOVD	a+8(FP), R0
	loadBlock(0(R0), R1,R2,R3,R4)
	MOVD	b+16(FP), R0
	loadBlock(0(R0), R5,R6,R7,R8)
	loadModulus(R9,R10,R11,R12)

	// R1..R4 = a - b
	SUBS	R5, R1
	SBCS	R6, R2
	SBCS	R7, R3
	SBCS	R8, R4

	// No borrow: addend becomes 0; borrow: addend stays p
	CSEL	CS, ZR, R9, R9
	CSEL	CS, ZR, R10, R10
	CSEL	CS, ZR, R11, R11
	CSEL	CS, ZR, R12, R12

	ADDS	R9, R1
	ADCS	R10, R2
	ADCS	R11, R3
	ADCS	R12, R4

	MOVD	c+0(FP), R0
	storeBlock(R1,R2,R3,R4, 0(R0))
	RET

// gfpMul: word-by-word multiplication of in1 by in2 interleaved with four
// reduction steps, then one conditional subtraction of p.
// Arguments: res at 0(FP), in1 at 8(FP), in2 at 16(FP).
// NOTE(review): this has the shape of a Montgomery product
// (in1 * in2 * 2^-256 mod p); that reading depends on ·np being
// -p^-1 mod 2^64 -- confirm.
//
// Layout: acc4..acc7 collect the high words of the product; acc0..acc3
// rotate, so after each reduction step the cleared low word is reused for
// the next higher position. The two halves are summed once at the end.
// y0 and y1 are reused as scratch once their products have been formed.
TEXT ·gfpMul(SB),NOSPLIT,$0
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	·np+0x00(SB), hlp1
	LDP	·p2+0x00(SB), (const0, const1)
	LDP	·p2+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	// hlp0 = acc0 * np (low 64 bits); hlp0 * p is then added to the accumulator
	MUL	acc0, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	// Carry out of the low-word chain goes into the high half
	ADC	$0, acc4

	// Add the high words of hlp0 * p one position up; acc0 becomes the new top word
	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, hlp0

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, y0

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, y1
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	hlp0, acc3
	ADCS	y0, acc4
	ADC	y1, acc5
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc5

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, hlp0

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, y0

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, y1
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	hlp0, acc4
	ADCS	y0, acc5
	ADC	y1, acc6
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc6

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, hlp0

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, y0

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, y1
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	hlp0, acc5
	ADCS	y0, acc6
	ADC	y1, acc7
	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Fold the high half into the reduction words; acc4 keeps the carry out
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t0..t3 = value - p (t2/t3 overwrite const2/const3 as they are consumed)
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, t2
	SBCS	const3, acc3, t3
	SBCS	$0, acc4, acc4

	// No borrow: keep the subtracted value
	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	// hlp1 is dead now, so R0 can hold the result pointer
	MOVD	res+0(FP), res_ptr
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET

// func gfpSqr(res, in *gfP, n int)
// Squares the value n times in a row, each squaring followed by the same
// four-step reduction and conditional subtraction as gfpMul, keeping the
// running value in x0..x3. Only the final value is stored to res.
// b_ptr (R2) is reused as the loop counter.
// NOTE(review): the counter is decremented before it is tested, so n == 0
// would wrap around rather than return immediately -- callers presumably
// always pass n >= 1; confirm.
TEXT ·gfpSqr(SB),NOSPLIT,$0
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr

	MOVD	·np+0x00(SB), hlp1
	LDP	·p2+0x00(SB), (const0, const1)
	LDP	·p2+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

ordSqrLoop:
	SUB	$1, b_ptr

	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2
	// Each cross product x[i]*x[j], i != j, occurs twice in the square
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products
	// The diagonal terms x[i]*x[i]
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADC	t1, acc7, acc7
	// First reduction step
	MUL	acc0, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	// Here the carry of the low-word chain is folded into hlp0 itself
	ADC	$0, hlp0

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2

	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp0, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	// Unlike the first three steps, this carry goes to acc7; both targets
	// sit at the same word position, so the final sum is unaffected
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Fold the high half of the square into the reduction words
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	SUBS	const0, acc0, y0
	SBCS	const1, acc1, y1
	SBCS	const2, acc2, y2
	SBCS	const3, acc3, y3
	SBCS	$0, acc4, acc4

	// The selected value becomes the operand of the next iteration
	CSEL	CS, y0, acc0, x0
	CSEL	CS, y1, acc1, x1
	CSEL	CS, y2, acc2, x2
	CSEL	CS, y3, acc3, x3

	CBNZ	b_ptr, ordSqrLoop

	MOVD	res+0(FP), res_ptr
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)

	RET

/* ---------------------------------------*/
// func gfpFromMont(res, in *gfP)
// Runs the four reduction steps on the input without any multiplication by
// a second operand, then one conditional subtraction of p.
//
// NOTE(review): each step forms t0 as const0 * hlp1 (p[0] * np), whereas
// gfpMul and gfpSqr use const0 * hlp0. The sum written to accN by the
// following ADDS is overwritten before it is read, so only the carry
// matters. If np == -p^-1 mod 2^64, then p[0]*np is all ones and the carry
// is set exactly when accN != 0, the same as with hlp0 -- confirm that
// assumption before touching these lines.
TEXT ·gfpFromMont(SB),NOSPLIT,$0
	MOVD	in+8(FP), a_ptr

	MOVD	·np+0x00(SB), hlp1
	LDP	·p2+0x00(SB), (const0, const1)
	LDP	·p2+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	// Only reduce, no multiplications are needed
	// First reduction step
	MUL	acc0, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2

	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Four-word conditional subtraction; no fifth carry word is tracked here
	SUBS	const0, acc0, y0
	SBCS	const1, acc1, y1
	SBCS	const2, acc2, y2
	SBCS	const3, acc3, y3

	CSEL	CS, y0, acc0, x0
	CSEL	CS, y1, acc1, x1
	CSEL	CS, y2, acc2, x2
	CSEL	CS, y3, acc3, x3

	MOVD	res+0(FP), res_ptr
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)

	RET

/* ---------------------------------------*/
// func gfpUnmarshal(res *gfP, in *[32]byte)
// Tail-jumps to gfpMarshal: the argument layout is the same (two pointers),
// and reversing word order plus byte order is its own inverse.
TEXT ·gfpUnmarshal(SB),NOSPLIT,$0
	JMP	·gfpMarshal(SB)

/* ---------------------------------------*/
// func gfpMarshal(res *[32]byte, in *gfP)
// Copies 32 bytes from in to res, byte-reversing each 64-bit word (REV) and
// storing the four words in reverse order.
TEXT ·gfpMarshal(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)

	REV	acc0, acc0
	REV	acc1, acc1
	REV	acc2, acc2
	REV	acc3, acc3

	// Word 3 goes first, word 0 last
	STP	(acc3, acc2), 0*16(res_ptr)
	STP	(acc1, acc0), 1*16(res_ptr)
	RET