// +build !noasm !appengine

// NOTE(review): NOSPLIT is 4 in runtime/textflag.h; 7 also sets other flag
// bits — presumably intentional, but confirm against textflag.h.
#define NOSPLIT 7

// func initasm()(a,a2 bool)
// pulled from runtime/asm_amd64.s
// Probes CPUID/XGETBV and records CPU-feature availability in the
// package-level flags ·Sse3Supt, ·FmaSupt, ·AvxSupt and ·Avx2Supt.
TEXT ·initasm(SB), NOSPLIT, $0
	MOVQ $1, R15 // R15 = constant 1, source for the CMOVQEQ feature stores

	MOVQ $1, AX
	CPUID // leaf 1: feature flags in ECX/EDX

	// leaf-1 ECX bit 0 = SSE3
	ANDL    $0x1, CX
	CMPL    CX, $0x1
	CMOVQEQ R15, R9
	MOVB    R9, ·Sse3Supt(SB)
	XORQ    R9, R9

	MOVQ $1, AX
	CPUID
	// FMA requires leaf-1 ECX bits 12 (FMA), 27 (OSXSAVE), 28 (AVX) all set
	ANDL    $0x18001000, CX
	CMPL    CX, $0x18001000
	CMOVQEQ R15, R9
	MOVB    R9, ·FmaSupt(SB) // set numgo·fmaSupt
	XORQ    R9, R9

	// AVX requires bits 27 (OSXSAVE) and 28 (AVX); CX still holds the
	// masked leaf-1 ECX from above, so re-masking it is valid
	ANDL $0x18000000, CX
	CMPL CX, $0x18000000
	JNE  noavx

	// For XGETBV, OSXSAVE bit is required and sufficient
	MOVQ $0, CX // XCR0

	// XGETBV
	BYTE $0x0F; BYTE $0x01; BYTE $0xD0

	ANDL $6, AX // XCR0 bits 1,2 = XMM/YMM state enabled by the OS
	CMPL AX, $6 // Check for OS support of YMM registers
	JNE  noavx
	MOVB $1, ·AvxSupt(SB) // set numgo·avxSupt

	// Check for AVX2 capability: leaf 7, subleaf 0, EBX bit 5
	MOVL $7, AX
	XORQ CX, CX
	CPUID
	ANDL    $0x20, BX // check for AVX2 bit
	CMPL    BX, $0x20
	CMOVQEQ R15, R9
	MOVB    R9, ·Avx2Supt(SB) // set numgo·avx2Supt
	XORQ    R9, R9
	RET

noavx:
	MOVB $0, ·FmaSupt(SB)  // set numgo·fmaSupt
	MOVB $0, ·AvxSupt(SB)  // set numgo·avxSupt
	MOVB $0, ·Avx2Supt(SB) // set numgo·avx2Supt
	RET

// func AddC(c float64, d []float64)
// In place: d[i] += c for every element of d.
TEXT ·AddC(SB), NOSPLIT, $0
	// Data ptr
	MOVQ d+8(FP), R10

	// n = Data len
	MOVQ d_len+16(FP), SI

	// zero len return
	CMPQ SI, $0
	JE   ACEND

	// check tail
	SUBQ $4, SI
	JL   ACTAIL

	// avx support test
	LEAQ c+0(FP), R9
	CMPB ·AvxSupt(SB), $1
	JE   AVX_AC
	CMPB ·Avx2Supt(SB), $1
	JE   AVX2_AC

	// load multiplier: broadcast c into both lanes of X0
	MOVSD  (R9), X0
	SHUFPD $0, X0, X0

ACLOOP: // Unrolled x2 d[i]|d[i+1] += c
	MOVUPD 0(R10), X1
	MOVUPD 16(R10), X2
	ADDPD  X0, X1
	ADDPD  X0, X2
	MOVUPD X1, 0(R10)
	MOVUPD X2, 16(R10)
	ADDQ   $32, R10
	SUBQ   $4, SI
	JGE    ACLOOP
	JMP    ACTAIL

	// Hand-encoded AVX (assembler of the day lacked these mnemonics)
AVX2_AC: // Until AVX2 is known, AVX2 falls through to the AVX path
AVX_AC:
	// VBROADCASTSD (R9), Y0 — also leaves c in the low lane of X0,
	// which the ACTL scalar tail below relies on after VZEROUPPER
	BYTE $0xC4; BYTE $0xC2; BYTE $0x7D; BYTE $0x19; BYTE $0x01

AVX_ACLOOP:
	// VADDPD (R10), Y0, Y1
	BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x58; BYTE $0x0A

	// VMOVDQU Y1, (R10) (VEX pp=F3 encodes the unaligned form)
	BYTE $0xC4; BYTE $0xC1; BYTE $0x7E; BYTE $0x7F; BYTE $0x0A

	ADDQ $32, R10
	SUBQ $4, SI
	JGE  AVX_ACLOOP
	// VZEROUPPER
	BYTE $0xC5; BYTE $0xF8; BYTE $0x77

ACTAIL: // Catch len % 4 == 0
	ADDQ $4, SI
	JE   ACEND

ACTL: // Calc the last values individually d[i] += c
	MOVSD 0(R10), X1
	ADDSD X0, X1
	MOVSD X1, 0(R10)
	ADDQ  $8, R10
	SUBQ  $1, SI
	JG    ACTL

ACEND:
	RET

// func subtrC(c float64, d []float64)
// In place: d[i] -= c for every element of d.
TEXT ·SubtrC(SB), NOSPLIT, $0
	// Data ptr
	MOVQ d+8(FP), R10

	// n = Data len
	MOVQ d_len+16(FP), SI

	// zero len return
	CMPQ SI, $0
	JE   SCEND

	// check tail
	SUBQ $4, SI
	JL   SCTAIL

	// broadcast c into both lanes of X0
	MOVSD  c+0(FP), X0
	SHUFPD $0, X0, X0

SCLOOP: // load d[i] | d[i+1], unrolled x2
	MOVUPD 0(R10), X1
	MOVUPD 16(R10), X2
	SUBPD  X0, X1
	SUBPD  X0, X2
	MOVUPD X1, 0(R10)
	MOVUPD X2, 16(R10)
	ADDQ   $32, R10
	SUBQ   $4, SI
	JGE    SCLOOP

SCTAIL: // remaining len % 4 elements, one at a time
	ADDQ $4, SI
	JE   SCEND

SCTL:
	MOVSD 0(R10), X1
	SUBSD X0, X1
	MOVSD X1, 0(R10)
	ADDQ  $8, R10
	SUBQ  $1, SI
	JG    SCTL

SCEND:
	RET

// func multC(c float64, d []float64)
// In place: d[i] *= c for every element of d.
TEXT ·MultC(SB), NOSPLIT, $0
	MOVQ d_base+8(FP), R10
	MOVQ d_len+16(FP), SI

	// zero len return
	CMPQ SI, $0
	JE   MCEND
	SUBQ $4, SI
	JL   MCTAIL

	// broadcast c into both lanes of X0
	MOVSD  c+0(FP), X0
	SHUFPD $0, X0, X0

MCLOOP: // load d[i] | d[i+1], unrolled x2
	MOVUPD 0(R10), X1
	MOVUPD 16(R10), X2
	MULPD  X0, X1
	MULPD  X0, X2
	MOVUPD X1, 0(R10)
	MOVUPD X2, 16(R10)
	ADDQ   $32, R10
	SUBQ   $4, SI
	JGE    MCLOOP

MCTAIL: // remaining len % 4 elements, one at a time
	ADDQ $4, SI
	JE   MCEND

MCTL:
	MOVSD 0(R10), X1
	MULSD X0, X1
	MOVSD X1, 0(R10)
	ADDQ  $8, R10
	SUBQ  $1, SI
	JG    MCTL

MCEND:
	RET

// func divC(c float64, d []float64)
// In place: d[i] /= c for every element of d.
TEXT ·DivC(SB), NOSPLIT, $0
	// Data ptr
	MOVQ d+8(FP), R10

	// n = Data len
	MOVQ d_len+16(FP), SI

	// zero len return
	CMPQ SI, $0
	JE   DCEND

	// check tail
	SUBQ $4, SI
	JL   DCTAIL

	// broadcast divisor c into both lanes of X0
	MOVSD  c+0(FP), X0
	SHUFPD $0, X0, X0

DCLOOP: // load d[i] | d[i+1], unrolled x2
	MOVUPD 0(R10), X1
	MOVUPD 16(R10), X2
	DIVPD  X0, X1
	DIVPD  X0, X2
	MOVUPD X1, 0(R10)
	MOVUPD X2, 16(R10)
	ADDQ   $32, R10
	SUBQ   $4, SI
	JGE    DCLOOP

DCTAIL: // remaining len % 4 elements, one at a time
	ADDQ $4, SI
	JE   DCEND

DCTL:
	MOVSD 0(R10), X1
	DIVSD X0, X1
	MOVSD X1, 0(R10)
	ADDQ  $8, R10
	SUBQ  $1, SI
	JG    DCTL

DCEND:
	RET

// func add(a,b []float64)
// In place: a[i] += b[i mod len(b)] — b is cycled from its start (R10/R11
// hold b's base and length for the wrap) when it is shorter than a.
TEXT ·Add(SB), NOSPLIT, $0
	// a Data ptr
	MOVQ a_base+0(FP), R8

	// a len
	MOVQ a_len+8(FP), SI

	// b Data ptr
	MOVQ b_base+24(FP), R9
	MOVQ R9, R10 // R10 = start of b, for wrap-around

	// b len
	MOVQ b_len+32(FP), DI
	MOVQ DI, R11 // R11 = len(b), for wrap-around

	// zero len return
	CMPQ SI, $0
	JE   AEND

	// check tail
	SUBQ $2, SI
	JL   ATAIL

ALD:
	CMPQ DI, $1
	JE   ALT  // one b element left: the pair straddles the wrap point
	SUBQ $2, DI
	JGE  ALO
	MOVQ R10, R9 // b exhausted exactly: restart from its beginning
	MOVQ R11, DI
	SUBQ $2, DI

ALO: // aligned pair load from b
	MOVUPD (R9), X1
	ADDQ   $16, R9
	JMP    ALOOP

ALT: // wrap pair: last element of b (low lane) + first element (high lane)
	MOVLPD (R9), X1
	MOVQ   R10, R9
	MOVQ   R11, DI
	MOVHPD (R9), X1
	SUBQ   $1, DI
	ADDQ   $8, R9

ALOOP: // a[i]|a[i+1] += pair in X1
	MOVUPD (R8), X0
	ADDPD  X1, X0
	MOVUPD X0, (R8)
	ADDQ   $16, R8
	SUBQ   $2, SI
	JGE    ALD

ATAIL: // odd final element of a
	ADDQ $2, SI
	JE   AEND

ATL:
	MOVSD (R8), X0
	MOVSD (R9), X1
	ADDSD X1, X0
	MOVSD X0, (R8)
	ADDQ  $8, R8
	ADDQ  $8, R9
	SUBQ  $1, SI
	JG    ATL

AEND:
	RET

// func vadd(a,b[]float64)
// req: len(a) == len(b)
// In place: a[i] += b[i]; unrolled x8 with SSE, or 256-bit AVX when available.
TEXT ·Vadd(SB), NOSPLIT, $0
	// a Data ptr
	MOVQ a_base+0(FP), R8

	// a len
	MOVQ a_len+8(FP), SI

	// b Data ptr
	MOVQ b_base+24(FP), R9

	// zero len return
	CMPQ SI, $0
	JE   vadd_exit

	// check tail
	SUBQ $8, SI
	JL   vadd_tail

	// AVX vs SSE
	CMPB ·AvxSupt(SB), $1
	JE   vadd_avx_loop

vadd_loop: // SSE: 8 elements per iteration
	MOVUPD (R9), X1
	MOVUPD 16(R9), X3
	MOVUPD 32(R9), X5
	MOVUPD 48(R9), X7

	MOVUPD (R8), X0
	ADDPD  X1, X0
	MOVUPD 16(R8), X2
	ADDPD  X3, X2
	MOVUPD 32(R8), X4
	ADDPD  X5, X4
	MOVUPD 48(R8), X6
	ADDPD  X7, X6

	MOVUPD X0, (R8)
	MOVUPD X2, 16(R8)
	MOVUPD X4, 32(R8)
	MOVUPD X6, 48(R8)
	ADDQ   $64, R8
	ADDQ   $64, R9
	SUBQ   $8, SI
	JGE    vadd_loop

vadd_tail: // remaining len % 8 elements, one at a time
	ADDQ $8, SI
	JE   vadd_exit

vadd_tail_loop:
	MOVSD (R8), X15
	MOVSD (R9), X14
	ADDSD X14, X15
	MOVSD X15, (R8)
	ADDQ  $8, R8
	ADDQ  $8, R9
	SUBQ  $1, SI
	JGE   vadd_tail_loop
	JMP   vadd_exit

vadd_avx_loop: // hand-encoded AVX: 8 elements per iteration
	// VMOVDQU (R9), Y0
	BYTE $0xC4; BYTE $0xC1; BYTE $0x7E; BYTE $0x6F; BYTE $0x01
	// VMOVDQU 32(R9), Y1
	BYTE $0xC4; BYTE $0xC1; BYTE $0x7E; BYTE $0x6F; BYTE $0x49; BYTE $0x20

	// VADDPD (R8), Y0, Y0
	BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x58; BYTE $0x00
	// VADDPD 32(R8), Y1, Y1 (ModRM encodes base R8, disp 32)
	BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0x58; BYTE $0x48; BYTE $0x20

	// VMOVDQU Y0, (R8) (VEX pp=F3 encodes the unaligned form)
	BYTE $0xC4; BYTE $0xC1; BYTE $0x7E; BYTE $0x7F; BYTE $0x00
	// VMOVDQU Y1, 32(R8)
	BYTE $0xC4; BYTE $0xC1; BYTE $0x7E; BYTE $0x7F; BYTE $0x48; BYTE $0x20

	ADDQ $64, R8
	ADDQ $64, R9
	SUBQ $8, SI
	JGE  vadd_avx_loop
	// VZEROUPPER
	BYTE $0xC5; BYTE $0xF8; BYTE $0x77
	ADDQ $8, SI
	JE   vadd_exit
	JMP  vadd_tail_loop

vadd_exit:
	RET

// func hadd(st uint64, a []float64)
// Sums each consecutive group of st elements of a, writing one sum per group
// to the front of a (R9 is the write cursor). Returns immediately for st == 1
// or len(a) == 0. (Original comment said "req: len(a) == len(b)" — there is
// no b here; presumably a copy-paste from vadd.)
TEXT ·Hadd(SB), NOSPLIT, $0
	// a Data ptr
	MOVQ a_base+8(FP), R8
	MOVQ R8, R9 // write cursor for the per-group sums

	// a len
	MOVQ a_len+16(FP), SI
	MOVQ st+0(FP), CX
	MOVQ CX, DI
	ANDQ $1, DI // NOTE(review): dead — DI is reloaded before first use

	CMPQ CX, $1
	JE   hadd_exit
	CMPQ SI, $0
	JE   hadd_exit
	CMPQ CX, $8
	JG   hadd_big_stride
	CMPB ·Sse3Supt(SB), $1
	JE   hadd_sse3_head

hadd_big_stride:
	// AVX vs SSE — AVX path not implemented yet, falls through
	CMPB ·AvxSupt(SB), $1
	// JE hadd_avx_head
	CMPB ·Sse3Supt(SB), $1
	JE   hadd_sse3_head

	// NOTE(review): the ADDPD (R8) loads below require a 16-byte-aligned
	// operand; alignment of R8 here depends on the caller — confirm.
hadd_head: // plain-SSE group sum, two elements per ADDPD
	PXOR X0, X0
	MOVQ CX, DI
	SUBQ $1, DI

hadd_loop:
	ADDPD (R8), X0
	ADDQ  $16, R8
	SUBQ  $2, DI
	JG    hadd_loop
	JZ    hadd_tail // odd group length: one scalar element remains
	// fold the two lanes of X0 into its low lane
	MOVAPD   X0, X1
	UNPCKHPD X1, X0
	ADDPD    X1, X0
	MOVQ     X0, (R9)
	ADDQ     $8, R9
	SUBQ     CX, SI
	JG       hadd_head
	JMP      hadd_exit

hadd_tail: // group ended mid-pair: add the straggler, emit, re-enter offset by 8
	ADDSD    (R8), X0
	MOVAPD   X0, X1
	UNPCKHPD X1, X0
	ADDPD    X1, X0
	MOVQ     X0, (R9)
	ADDQ     $8, R9
	SUBQ     CX, SI
	JZ       hadd_exit
	MOVQ     8(R8), X0 // first element of the next group
	MOVQ     CX, DI
	SUBQ     $2, DI
	ADDQ     $16, R8
	JMP      hadd_loop

hadd_sse3_head: // SSE3 variant: HADDPD replaces the unpack/add fold
	PXOR X0, X0
	MOVQ CX, DI
	SUBQ $1, DI

hadd_sse3_loop:
	ADDPD (R8), X0
	ADDQ  $16, R8
	SUBQ  $2, DI
	JG    hadd_sse3_loop
	JZ    hadd_sse3_tail
	BYTE $0x66; BYTE $0x0F; BYTE $0x7C; BYTE $0xC0
	// HADDPD X0, X0 // mnemonic added in Go 1.6; raw bytes kept for older toolchains
	MOVQ X0, (R9)
	ADDQ $8, R9
	SUBQ CX, SI
	JG   hadd_sse3_head
	JMP  hadd_exit

hadd_sse3_tail:
	ADDSD (R8), X0
	BYTE $0x66; BYTE $0x0F; BYTE $0x7C; BYTE $0xC0
	// HADDPD X0, X0 // mnemonic added in Go 1.6; raw bytes kept for older toolchains
	MOVQ X0, (R9)
	ADDQ $8, R9
	SUBQ CX, SI
	JZ   hadd_exit
	MOVQ 8(R8), X0
	MOVQ CX, DI
	SUBQ $2, DI
	ADDQ $16, R8
	JMP  hadd_sse3_loop

hadd_exit:
	RET

// func subtr(a,b []float64)
// In place: a[i] -= b[i mod len(b)]; b is cycled like in Add.
TEXT ·Subtr(SB), NOSPLIT, $0
	// a Data ptr
	MOVQ a_base+0(FP), R8

	// a len
	MOVQ a_len+8(FP), SI

	// b Data ptr
	MOVQ b_base+24(FP), R9
	MOVQ R9, R10 // R10 = start of b, for wrap-around

	// b len
	MOVQ b_len+32(FP), DI
	MOVQ DI, R11 // R11 = len(b), for wrap-around

	// zero len return
	MOVQ $0, AX
	CMPQ AX, SI
	JE   SEND

	// check tail
	SUBQ $2, SI
	JL   STAIL

SLD: // DI -= 2 with a wrap check between the two decrements
	SUBQ $1, DI
	JE   SLT  // one b element left: the pair straddles the wrap point
	SUBQ $1, DI
	JGE  SLO
	MOVQ R10, R9 // b exhausted exactly: restart from its beginning
	MOVQ R11, DI
	SUBQ $2, DI

SLO:
	MOVUPD 0(R9), X1
	ADDQ   $16, R9
	JMP    SLOOP

SLT: // wrap pair: last element of b (low lane) + first element (high lane)
	MOVLPD 0(R9), X1
	MOVQ   R10, R9
	MOVQ   R11, DI
	MOVHPD 0(R9), X1
	SUBQ   $1, DI
	ADDQ   $8, R9

SLOOP:
	MOVUPD 0(R8), X0
	SUBPD  X1, X0
	MOVUPD X0, 0(R8)
	ADDQ   $16, R8
	SUBQ   $2, SI
	JGE    SLD

STAIL: // odd final element of a
	ADDQ $2, SI
	JE   SEND

STL:
	MOVSD 0(R8), X0
	MOVSD 0(R9), X1
	SUBSD X1, X0
	MOVSD X0, 0(R8)
	ADDQ  $8, R8
	ADDQ  $8, R9
	SUBQ  $1, SI
	JG    STL

SEND:
	RET

// func mult(a,b []float64)
// In place: a[i] *= b[i mod len(b)]; b is cycled like in Add.
TEXT ·Mult(SB), NOSPLIT, $0
	// a Data ptr
	MOVQ a_base+0(FP), R8

	// a len
	MOVQ a_len+8(FP), SI

	// b Data ptr
	MOVQ b_base+24(FP), R9
	MOVQ R9, R10 // R10 = start of b, for wrap-around

	// b len
	MOVQ b_len+32(FP), DI
	MOVQ DI, R11 // R11 = len(b), for wrap-around

	// zero len return
	MOVQ $0, AX
	CMPQ AX, SI
	JE   MEND

	// check tail
	SUBQ $2, SI
	JL   MTAIL

MLD: // DI -= 2 with a wrap check between the two decrements
	SUBQ $1, DI
	JE   MLT
	SUBQ $1, DI
	JGE  MLO
	MOVQ R10, R9
	MOVQ R11, DI
	SUBQ $2, DI

MLO:
	MOVUPD 0(R9), X1
	ADDQ   $16, R9
	JMP    MLOOP

MLT: // wrap pair: last element of b (low lane) + first element (high lane)
	MOVLPD 0(R9), X1
	MOVQ   R10, R9
	MOVQ   R11, DI
	MOVHPD 0(R9), X1
	SUBQ   $1, DI
	ADDQ   $8, R9

MLOOP:
	MOVUPD 0(R8), X0
	MULPD  X1, X0
	MOVUPD X0, 0(R8)
	ADDQ   $16, R8
	SUBQ   $2, SI
	JGE    MLD

MTAIL: // odd final element of a
	ADDQ $2, SI
	JE   MEND

MTL:
	MOVSD 0(R8), X0
	MOVSD 0(R9), X1
	MULSD X1, X0
	MOVSD X0, 0(R8)
	ADDQ  $8, R8
	ADDQ  $8, R9
	SUBQ  $1, SI
	JG    MTL

MEND:
	RET

// func div(a,b []float64)
// In place: a[i] /= b[i mod len(b)]; b is cycled like in Add.
TEXT ·Div(SB), NOSPLIT, $0
	// a Data ptr
	MOVQ a_base+0(FP), R8

	// a len
	MOVQ a_len+8(FP), SI

	// b Data ptr
	MOVQ b_base+24(FP), R9
	MOVQ R9, R10 // R10 = start of b, for wrap-around

	// b len
	MOVQ b_len+32(FP), DI
	MOVQ DI, R11 // R11 = len(b), for wrap-around

	// zero len return
	MOVQ $0, AX
	CMPQ AX, SI
	JE   DEND

	// check tail
	SUBQ $2, SI
	JL   DTAIL

DLD: // DI -= 2 with a wrap check between the two decrements
	SUBQ $1, DI
	JE   DLT
	SUBQ $1, DI
	JGE  DLO
	MOVQ R10, R9
	MOVQ R11, DI
	SUBQ $2, DI

DLO:
	MOVUPD 0(R9), X1
	ADDQ   $16, R9
	JMP    DLOOP

DLT: // wrap pair: last element of b (low lane) + first element (high lane)
	MOVLPD 0(R9), X1
	MOVQ   R10, R9
	MOVQ   R11, DI
	MOVHPD 0(R9), X1
	SUBQ   $1, DI
	ADDQ   $8, R9

DLOOP:
	MOVUPD 0(R8), X0
	DIVPD  X1, X0
	MOVUPD X0, 0(R8)
	ADDQ   $16, R8
	SUBQ   $2, SI
	JGE    DLD

DTAIL: // odd final element of a
	ADDQ $2, SI
	JE   DEND

DTL:
	MOVSD 0(R8), X0
	MOVSD 0(R9), X1
	DIVSD X1, X0
	MOVSD X0, 0(R8)
	ADDQ  $8, R8
	ADDQ  $8, R9
	SUBQ  $1, SI
	JG    DTL

DEND:
	RET

// func fma12(a float64, x,b []float64)
// x[i] = a*x[i]+b[i]; b is cycled when shorter than x.
TEXT ·Fma12(SB), NOSPLIT, $0
	// broadcast scalar a into both lanes of X2
	MOVSD  a+0(FP), X2
	SHUFPD $0, X2, X2

	// x Data ptr
	MOVQ x_base+8(FP), R8

	// x len
	MOVQ x_len+16(FP), SI

	// b Data ptr
	MOVQ b_base+32(FP), R9
	MOVQ R9, R10 // R10 = start of b, for wrap-around

	// b len
	MOVQ b_len+40(FP), DI
	MOVQ DI, R11 // R11 = len(b), for wrap-around

	// zero len return
	CMPQ SI, $0
	JE   F12END

	// check tail
	SUBQ $2, SI
	JL   F12TAIL

F12LD:
	CMPQ DI, $1
	JE   F12LT
	SUBQ $2, DI
	JGE  F12LO
	MOVQ R10, R9
	MOVQ R11, DI
	SUBQ $2, DI

F12LO:
	MOVUPD (R9), X1
	ADDQ   $16, R9
	JMP    F12LOOP

F12LT: // wrap pair: last element of b (low lane) + first element (high lane)
	MOVLPD (R9), X1
	MOVQ   R10, R9
	MOVQ   R11, DI
	MOVHPD (R9), X1
	SUBQ   $1, DI
	ADDQ   $8, R9

F12LOOP: // mul-then-add form: x = a*x + b
	MOVUPD (R8), X0
	MULPD  X2, X0
	ADDPD  X1, X0
	MOVUPD X0, (R8)
	ADDQ   $16, R8
	SUBQ   $2, SI
	JGE    F12LD
	JMP    F12TAIL

	// NOTE(review): the FMA-instruction loop below (F12LDF/F12LOOPF) is
	// never branched to from the dispatch above — it appears to be dead
	// code awaiting a ·FmaSupt check.
F12LDF:
	CMPQ DI, $1
	JE   F12LTF
	SUBQ $2, DI
	JGE  F12LOF
	MOVQ R10, R9
	MOVQ R11, DI
	SUBQ $2, DI

F12LOF:
	MOVUPD (R9), X1
	ADDQ   $16, R9
	JMP    F12LOOPF

F12LTF:
	MOVLPD (R9), X1
	MOVQ   R10, R9
	MOVQ   R11, DI
	MOVHPD (R9), X1
	SUBQ   $1, DI
	ADDQ   $8, R9

F12LOOPF:
	MOVUPD (R8), X0

	// VFMADD132PD X2, X1, X0 (X0 = X0*X2 + X1)
	BYTE $0xC4; BYTE $0xE2; BYTE $0xF1; BYTE $0x98; BYTE $0xC2
	MOVUPD X0, (R8)
	ADDQ   $16, R8
	SUBQ   $2, SI
	JGE    F12LDF

F12TAIL: // odd final element of x
	ADDQ $2, SI
	JE   F12END

F12TL:
	MOVSD (R8), X0
	MOVSD (R9), X1
	MULPD X2, X0
	ADDPD X1, X0
	MOVSD X0, (R8)
	ADDQ  $8, R8
	ADDQ  $8, R9
	SUBQ  $1, SI
	JG    F12TL

F12END:
	RET

// func fma21(a float64, x,b []float64)
// x[i] = x[i]*b[i]+a; b is cycled when shorter than x.
TEXT ·Fma21(SB), NOSPLIT, $0
	// broadcast scalar a into both lanes of X2
	MOVSD  a+0(FP), X2
	SHUFPD $0, X2, X2

	// x Data ptr
	MOVQ x_base+8(FP), R8

	// x len
	MOVQ x_len+16(FP), SI

	// b Data ptr
	MOVQ b_base+32(FP), R9
	MOVQ R9, R10 // R10 = start of b, for wrap-around

	// b len
	MOVQ b_len+40(FP), DI
	MOVQ DI, R11 // R11 = len(b), for wrap-around

	// zero len return
	CMPQ SI, $0
	JE   F21END

	// check tail
	SUBQ $2, SI
	JL   F21TAIL

F21LD:
	CMPQ DI, $1
	JE   F21LT
	SUBQ $2, DI
	JGE  F21LO
	MOVQ R10, R9
	MOVQ R11, DI
	SUBQ $2, DI

F21LO:
	MOVUPD (R9), X1
	ADDQ   $16, R9
	JMP    F21LOOP

F21LT: // wrap pair: last element of b (low lane) + first element (high lane)
	MOVLPD (R9), X1
	MOVQ   R10, R9
	MOVQ   R11, DI
	MOVHPD (R9), X1
	SUBQ   $1, DI
	ADDQ   $8, R9

F21LOOP: // mul-then-add form: x = x*b + a
	MOVUPD (R8), X0
	MULPD  X1, X0
	ADDPD  X2, X0
	MOVUPD X0, (R8)
	ADDQ   $16, R8
	SUBQ   $2, SI
	JGE    F21LD
	JMP    F21TAIL

	// NOTE(review): the FMA-instruction loop below (F21LDF/F21LOOPF) is
	// never branched to from the dispatch above — it appears to be dead
	// code awaiting a ·FmaSupt check.
F21LDF:
	CMPQ DI, $1
	JE   F21LTF
	SUBQ $2, DI
	JGE  F21LOF
	MOVQ R10, R9
	MOVQ R11, DI
	SUBQ $2, DI

F21LOF:
	MOVUPD (R9), X1
	ADDQ   $16, R9
	JMP    F21LOOPF

F21LTF:
	MOVLPD (R9), X1
	MOVQ   R10, R9
	MOVQ   R11, DI
	MOVHPD (R9), X1
	SUBQ   $1, DI
	ADDQ   $8, R9

F21LOOPF:
	MOVUPD (R8), X0

	// VFMADD213PD X2, X1, X0 (X0 = X0*X1 + X2)
	BYTE $0xC4; BYTE $0xE2; BYTE $0xF1; BYTE $0xA8; BYTE $0xC2
	MOVUPD X0, (R8)
	ADDQ   $16, R8
	SUBQ   $2, SI
	JGE    F21LDF

F21TAIL: // odd final element of x
	ADDQ $2, SI
	JE   F21END

F21TL:
	MOVSD (R8), X0
	MOVSD (R9), X1
	MULPD X1, X0
	ADDPD X2, X0
	MOVSD X0, (R8)
	ADDQ  $8, R8
	ADDQ  $8, R9
	SUBQ  $1, SI
	JG    F21TL

F21END:
	RET