//go:build !purego

// Origin: github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp_cmn_amd64.s
//
// Field arithmetic on 256-bit values held as four 64-bit limbs
// (least-significant limb at offset 0).
//
// Names that are NOT defined in this file:
//   - loadBlock, storeBlock, gfpCarry, gfpCarryWithoutCarry and the register
//     aliases x_ptr, y_ptr, res_ptr, acc0..acc5, t0 come from
//     gfp_macros_amd64.s.
//   - ·p2, ·np and ·supportADX are data symbols defined on the Go side.
// NOTE(review): the comments below assume ·p2 is the field modulus p, ·np is
// the Montgomery constant -p^-1 mod 2^64, and the gfpCarry* macros perform the
// final conditional subtraction of p. Confirm against the macro file and the
// Go declarations.

#include "textflag.h"

#include "gfp_macros_amd64.s"

// gfpNeg: arguments c at 0(FP) (destination pointer), a at 8(FP) (source pointer).
// Computes ·p2 - a limb by limb, passes the difference through
// gfpCarryWithoutCarry and stores the four limbs to c.
TEXT ·gfpNeg(SB),NOSPLIT,$0-16
	MOVQ ·p2+0(SB), R8
	MOVQ ·p2+8(SB), R9
	MOVQ ·p2+16(SB), R10
	MOVQ ·p2+24(SB), R11

	MOVQ a+8(FP), DI
	SUBQ 0(DI), R8
	SBBQ 8(DI), R9
	SBBQ 16(DI), R10
	SBBQ 24(DI), R11

	// R12,R13,R14,CX are handed to the macro as scratch registers.
	gfpCarryWithoutCarry(R8,R9,R10,R11, R12,R13,R14,CX)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpAdd: arguments c at 0(FP) (destination), a at 8(FP), b at 16(FP).
// Adds the limbs of a and b, collecting the carry out of the top limb in R12,
// then lets gfpCarry bring the 257-bit sum back into range before storing to c.
TEXT ·gfpAdd(SB),NOSPLIT,$0-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	loadBlock(0(DI), R8,R9,R10,R11)
	MOVQ $0, R12

	ADDQ 0(SI), R8
	ADCQ 8(SI), R9
	ADCQ 16(SI), R10
	ADCQ 24(SI), R11
	ADCQ $0, R12 // R12 = carry out of the 256-bit addition

	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpDouble: arguments a at 0(FP) (destination), b at 8(FP) (source).
// Computes b + b with the carry captured in R12, reduces via gfpCarry and
// stores the result to a.
TEXT ·gfpDouble(SB),NOSPLIT,$0-16
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI

	loadBlock(0(SI), R8,R9,R10,R11)
	XORQ R12, R12

	ADDQ R8, R8
	ADCQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ $0, R12

	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)

	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpTriple: arguments a at 0(FP) (destination), b at 8(FP) (source).
// Computes 2*b, reduces, then adds b once more and reduces again.
// The destination is written only at the very end, after the last read of b.
TEXT ·gfpTriple(SB),NOSPLIT,$0-16
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI

	loadBlock(0(SI), R8,R9,R10,R11)
	XORQ R12, R12

	// 2*b
	ADDQ R8, R8
	ADCQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ $0, R12

	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)

	// (2*b reduced) + b
	XORQ R12, R12
	ADDQ 0(SI), R8
	ADCQ 8(SI), R9
	ADCQ 16(SI), R10
	ADCQ 24(SI), R11
	ADCQ $0, R12

	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)

	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpSub: arguments c at 0(FP) (destination), a at 8(FP), b at 16(FP).
// Computes a - b. When the subtraction borrows (CF set) ·p2 is added back;
// when it does not, the CMOVQCC instructions replace the addend with zero,
// so the correction is applied without a branch.
TEXT ·gfpSub(SB),NOSPLIT,$0-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	loadBlock(0(DI), R8,R9,R10,R11)

	MOVQ ·p2+0(SB), R12
	MOVQ ·p2+8(SB), R13
	MOVQ ·p2+16(SB), R14
	MOVQ ·p2+24(SB), CX
	MOVQ $0, AX

	SUBQ 0(SI), R8
	SBBQ 8(SI), R9
	SBBQ 16(SI), R10
	SBBQ 24(SI), R11

	// No borrow (CF clear): addend becomes 0.
	CMOVQCC AX, R12
	CMOVQCC AX, R13
	CMOVQCC AX, R14
	CMOVQCC AX, CX

	ADDQ R12, R8
	ADCQ R13, R9
	ADCQ R14, R10
	ADCQ CX, R11

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpMul: arguments res at 0(FP) (destination), in1 at 8(FP), in2 at 16(FP).
// Multiplies in1 by in2 one limb of in2 at a time, following every partial
// product with a reduction step that multiplies the lowest live accumulator
// limb by ·np and adds that multiple of ·p2 (presumably word-by-word
// Montgomery multiplication — confirm against the Go-side constants).
// Two implementations are provided: a MULX/ADCX/ADOX version, used when
// ·supportADX is non-zero, and a plain MULQ version (label noAdxMul).
// The accumulator rotates through acc0..acc5; the final value sits in
// acc4,acc5,acc0,acc1 with acc2 as the top carry word given to gfpCarry.
TEXT ·gfpMul(SB),NOSPLIT,$0-24
	MOVQ in1+8(FP), x_ptr
	MOVQ in2+16(FP), y_ptr

	CMPB ·supportADX(SB), $0
	JE noAdxMul

	XORQ acc5, acc5
	// res_ptr is not needed as a pointer until the final store, so it is
	// zeroed here and used as a constant-zero operand for ADCXQ/ADOXQ.
	XORQ res_ptr, res_ptr
	// x * y[0]
	MOVQ (8*0)(y_ptr), DX
	MULXQ (8*0)(x_ptr), acc0, acc1

	MULXQ (8*1)(x_ptr), AX, acc2
	ADCXQ AX, acc1

	MULXQ (8*2)(x_ptr), AX, acc3
	ADCXQ AX, acc2

	MULXQ (8*3)(x_ptr), AX, acc4
	ADCXQ AX, acc3
	ADCXQ acc5, acc4

	// First reduction step
	MOVQ acc0, DX
	MULXQ ·np+0x00(SB), DX, AX // DX = low 64 bits of acc0 * np

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc0

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc1

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc2

	MULXQ ·p2+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc3

	ADCXQ res_ptr, BX
	ADOXQ BX, acc4
	ADOXQ res_ptr, acc5
	XORQ acc0, acc0 // It seems this line is optional.

	// x * y[1]
	MOVQ (8*1)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc1

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc2

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc3

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	ADCXQ acc0, BX
	ADOXQ BX, acc5
	ADOXQ res_ptr, acc0

	// Second reduction step
	MOVQ acc1, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc1

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc2

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc3

	MULXQ ·p2+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	ADCXQ res_ptr, BX
	ADOXQ BX, acc5
	ADOXQ res_ptr, acc0
	XORQ acc1, acc1 // It seems this line is optional.

	// x * y[2]
	MOVQ (8*2)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc2

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc3

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc4

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc5

	ADCXQ res_ptr, BX
	ADOXQ BX, acc0
	ADOXQ res_ptr, acc1

	// Third reduction step
	MOVQ acc2, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc2

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc3

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc4

	MULXQ ·p2+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc5

	ADCXQ res_ptr, BX
	ADOXQ BX, acc0
	ADOXQ res_ptr, acc1
	XORQ acc2, acc2 // It seems this line is optional.

	// x * y[3]
	MOVQ (8*3)(y_ptr), DX
	MULXQ (8*0)(x_ptr), AX, t0
	ADOXQ AX, acc3

	MULXQ (8*1)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	MULXQ (8*2)(x_ptr), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc5

	MULXQ (8*3)(x_ptr), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc0

	ADCXQ res_ptr, BX
	ADOXQ BX, acc1
	ADOXQ res_ptr, acc2

	// Last reduction step
	MOVQ acc3, DX
	MULXQ ·np+0x00(SB), DX, AX

	MULXQ ·p2+0x00(SB), AX, t0
	ADOXQ AX, acc3

	MULXQ ·p2+0x08(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc4

	MULXQ ·p2+0x10(SB), AX, t0
	ADCXQ BX, AX
	ADOXQ AX, acc5

	MULXQ ·p2+0x18(SB), AX, BX
	ADCXQ t0, AX
	ADOXQ AX, acc0

	ADCXQ res_ptr, BX
	ADOXQ BX, acc1
	ADOXQ res_ptr, acc2
	// Copy result [255:0]
	// x_ptr and acc3 are no longer needed and are passed as scratch registers.
	gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,BX,acc2)
	MOVQ res+0(FP), res_ptr // reload: res_ptr served as the zero register above
	storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
	RET

noAdxMul:
	// Same schedule as above using MULQ (result in DX:AX) and ADDQ/ADCQ.
	// BX carries the high word of each partial product to the next limb.
	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// First reduction step
	MOVQ acc0, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0

	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5

	XORQ acc0, acc0 // It seems this line is optional.
	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	// Second reduction step
	MOVQ acc1, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0

	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0

	XORQ acc1, acc1 // It seems this line is optional.
	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	// Third reduction step
	MOVQ acc2, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0

	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1

	XORQ acc2, acc2 // It seems this line is optional.
	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ BX, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Last reduction step
	MOVQ acc3, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0

	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, BX

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2
	// Copy result [255:0]
	gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,BX,acc2)
	MOVQ res+0(FP), res_ptr
	storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))

	RET

/* ---------------------------------------*/
// func gfpFromMont(res, in *gfP)
//
// Runs the four reduction steps of gfpMul on the input alone, with no
// multiplication by a second operand, then passes acc4,acc5,acc0,acc1 through
// gfpCarryWithoutCarry and stores them to res.
// The argument size (two pointers, 16 bytes) is declared explicitly, like the
// other TEXT symbols in this file.
TEXT ·gfpFromMont(SB),NOSPLIT,$0-16
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3
	XORQ acc4, acc4

	// Only reduce, no multiplications are needed
	// First reduction step
	MOVQ acc0, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0 // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0 // acc0 is free now
	ADCQ $0, DX
	MOVQ DX, BX // carry
	XORQ acc0, acc0 // acc0 is reused as a fresh top word in the third step

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	XORQ acc5, acc5

	// Second reduction step
	MOVQ acc1, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0 // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1 // acc1 is free now
	ADCQ $0, DX
	MOVQ DX, BX // carry
	XORQ acc1, acc1 // acc1 is reused as a fresh top word in the last step

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5

	// Third reduction step
	MOVQ acc2, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0 // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2 // acc2 is free now
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0

	// Last reduction step
	MOVQ acc3, AX
	MULQ ·np+0x00(SB)
	MOVQ AX, t0 // Y

	// Calculate next T = T+Y*P
	MOVQ ·p2+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3 // acc3 is free now
	ADCQ $0, DX
	MOVQ DX, BX // carry
	XORQ acc3, acc3

	MOVQ ·p2+0x08(SB), AX
	MULQ t0
	ADDQ BX, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x10(SB), AX
	MULQ t0
	ADDQ BX, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, BX // carry

	MOVQ ·p2+0x18(SB), AX
	MULQ t0
	ADDQ BX, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1

	gfpCarryWithoutCarry(acc4, acc5, acc0, acc1, x_ptr, acc3, t0, BX)
	storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
	RET

/* ---------------------------------------*/
// func gfpUnmarshal(res *gfP, in *[32]byte)
//
// Tail-jumps to gfpMarshal, which has the same argument layout. The byte swap
// plus word reversal done there is its own inverse, so one routine serves
// both directions.
TEXT ·gfpUnmarshal(SB),NOSPLIT,$0-16
	JMP ·gfpMarshal(SB)

/* ---------------------------------------*/
// func gfpMarshal(res *[32]byte, in *gfP)
//
// Loads the four 64-bit words of in, byte-swaps each one and stores them to
// res in reverse word order.
TEXT ·gfpMarshal(SB),NOSPLIT,$0-16
	MOVQ res+0(FP), res_ptr
	MOVQ in+8(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	BSWAPQ acc0
	BSWAPQ acc1
	BSWAPQ acc2
	BSWAPQ acc3

	// Word 0 of the input becomes the last 8 bytes of the output.
	MOVQ acc3, (8*0)(res_ptr)
	MOVQ acc2, (8*1)(res_ptr)
	MOVQ acc1, (8*2)(res_ptr)
	MOVQ acc0, (8*3)(res_ptr)

	RET