gonum.org/v1/gonum@v0.14.0/internal/asm/f64/gemvT_amd64.s

// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

#define SIZE 8

#define M_DIM n+8(FP)
#define M CX
#define N_DIM m+0(FP)
#define N BX

#define TMP1 R14
#define TMP2 R15

#define X_PTR SI
#define X x_base+56(FP)
#define Y_PTR DX
#define Y y_base+96(FP)
#define A_ROW AX
#define A_PTR DI

#define INC_X R8
#define INC3_X R9

#define INC_Y R10
#define INC3_Y R11

#define LDA R12
#define LDA3 R13

#define ALPHA X15
#define BETA X14

#define INIT4 \
	MOVDDUP (X_PTR), X8            \
	MOVDDUP (X_PTR)(INC_X*1), X9   \
	MOVDDUP (X_PTR)(INC_X*2), X10  \
	MOVDDUP (X_PTR)(INC3_X*1), X11 \
	MULPD   ALPHA, X8              \
	MULPD   ALPHA, X9              \
	MULPD   ALPHA, X10             \
	MULPD   ALPHA, X11

#define INIT2 \
	MOVDDUP (X_PTR), X8          \
	MOVDDUP (X_PTR)(INC_X*1), X9 \
	MULPD   ALPHA, X8            \
	MULPD   ALPHA, X9

#define INIT1 \
	MOVDDUP (X_PTR), X8 \
	MULPD   ALPHA, X8

#define KERNEL_LOAD4 \
	MOVUPS (Y_PTR), X0       \
	MOVUPS 2*SIZE(Y_PTR), X1

#define KERNEL_LOAD2 \
	MOVUPS (Y_PTR), X0

#define KERNEL_LOAD4_INC \
	MOVSD  (Y_PTR), X0          \
	MOVHPD (Y_PTR)(INC_Y*1), X0 \
	MOVSD  (Y_PTR)(INC_Y*2), X1 \
	MOVHPD (Y_PTR)(INC3_Y*1), X1

#define KERNEL_LOAD2_INC \
	MOVSD  (Y_PTR), X0 \
	MOVHPD (Y_PTR)(INC_Y*1), X0

#define KERNEL_4x4 \
	MOVUPS (A_PTR), X4               \
	MOVUPS 2*SIZE(A_PTR), X5         \
	MOVUPS (A_PTR)(LDA*1), X6        \
	MOVUPS 2*SIZE(A_PTR)(LDA*1), X7  \
	MULPD  X8, X4                    \
	MULPD  X8, X5                    \
	MULPD  X9, X6                    \
	MULPD  X9, X7                    \
	ADDPD  X4, X0                    \
	ADDPD  X5, X1                    \
	ADDPD  X6, X0                    \
	ADDPD  X7, X1                    \
	MOVUPS (A_PTR)(LDA*2), X4        \
	MOVUPS 2*SIZE(A_PTR)(LDA*2), X5  \
	MOVUPS (A_PTR)(LDA3*1), X6       \
	MOVUPS 2*SIZE(A_PTR)(LDA3*1), X7 \
	MULPD  X10, X4                   \
	MULPD  X10, X5                   \
	MULPD  X11, X6                   \
	MULPD  X11, X7                   \
	ADDPD  X4, X0                    \
	ADDPD  X5, X1                    \
	ADDPD  X6, X0                    \
	ADDPD  X7, X1                    \
	ADDQ   $4*SIZE, A_PTR

#define KERNEL_4x2 \
	MOVUPS (A_PTR), X4              \
	MOVUPS 2*SIZE(A_PTR), X5        \
	MOVUPS (A_PTR)(LDA*1), X6       \
	MOVUPS 2*SIZE(A_PTR)(LDA*1), X7 \
	MULPD  X8, X4                   \
	MULPD  X8, X5                   \
	MULPD  X9, X6                   \
	MULPD  X9, X7                   \
	ADDPD  X4, X0                   \
	ADDPD  X5, X1                   \
	ADDPD  X6, X0                   \
	ADDPD  X7, X1                   \
	ADDQ   $4*SIZE, A_PTR

#define KERNEL_4x1 \
	MOVUPS (A_PTR), X4       \
	MOVUPS 2*SIZE(A_PTR), X5 \
	MULPD  X8, X4            \
	MULPD  X8, X5            \
	ADDPD  X4, X0            \
	ADDPD  X5, X1            \
	ADDQ   $4*SIZE, A_PTR

#define STORE4 \
	MOVUPS X0, (Y_PTR) \
	MOVUPS X1, 2*SIZE(Y_PTR)

#define STORE4_INC \
	MOVLPD X0, (Y_PTR)          \
	MOVHPD X0, (Y_PTR)(INC_Y*1) \
	MOVLPD X1, (Y_PTR)(INC_Y*2) \
	MOVHPD X1, (Y_PTR)(INC3_Y*1)

#define KERNEL_2x4 \
	MOVUPS (A_PTR), X4         \
	MOVUPS (A_PTR)(LDA*1), X5  \
	MOVUPS (A_PTR)(LDA*2), X6  \
	MOVUPS (A_PTR)(LDA3*1), X7 \
	MULPD  X8, X4              \
	MULPD  X9, X5              \
	MULPD  X10, X6             \
	MULPD  X11, X7             \
	ADDPD  X4, X0              \
	ADDPD  X5, X0              \
	ADDPD  X6, X0              \
	ADDPD  X7, X0              \
	ADDQ   $2*SIZE, A_PTR

#define KERNEL_2x2 \
	MOVUPS (A_PTR), X4        \
	MOVUPS (A_PTR)(LDA*1), X5 \
	MULPD  X8, X4             \
	MULPD  X9, X5             \
	ADDPD  X4, X0             \
	ADDPD  X5, X0             \
	ADDQ   $2*SIZE, A_PTR

#define KERNEL_2x1 \
	MOVUPS (A_PTR), X4 \
	MULPD  X8, X4      \
	ADDPD  X4, X0      \
	ADDQ   $2*SIZE, A_PTR
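// Note on the macros: INIT{4,2,1} broadcast alpha*x[j] for the next 4/2/1
// rows of A into X8–X11, and KERNEL_MxN accumulates N rows of A into M
// consecutive (or strided) elements of y. A rough Go sketch of what one
// KERNEL_4x4 step computes, written for this comment only (names are
// illustrative, not part of the source):
//
//	// ax0..ax3 correspond to X8..X11 = alpha*x[j] .. alpha*x[j+3].
//	for k := 0; k < 4; k++ { // four consecutive elements of y
//		y[i+k] += ax0*a[j*lda+i+k] + ax1*a[(j+1)*lda+i+k] +
//			ax2*a[(j+2)*lda+i+k] + ax3*a[(j+3)*lda+i+k]
//	}
//
// The SSE2 code below does the same work two float64s at a time in X0 and X1.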
#define STORE2 \
	MOVUPS X0, (Y_PTR)

#define STORE2_INC \
	MOVLPD X0, (Y_PTR) \
	MOVHPD X0, (Y_PTR)(INC_Y*1)

#define KERNEL_1x4 \
	MOVSD (Y_PTR), X0         \
	MOVSD (A_PTR), X4         \
	MOVSD (A_PTR)(LDA*1), X5  \
	MOVSD (A_PTR)(LDA*2), X6  \
	MOVSD (A_PTR)(LDA3*1), X7 \
	MULSD X8, X4              \
	MULSD X9, X5              \
	MULSD X10, X6             \
	MULSD X11, X7             \
	ADDSD X4, X0              \
	ADDSD X5, X0              \
	ADDSD X6, X0              \
	ADDSD X7, X0              \
	MOVSD X0, (Y_PTR)         \
	ADDQ  $SIZE, A_PTR

#define KERNEL_1x2 \
	MOVSD (Y_PTR), X0        \
	MOVSD (A_PTR), X4        \
	MOVSD (A_PTR)(LDA*1), X5 \
	MULSD X8, X4             \
	MULSD X9, X5             \
	ADDSD X4, X0             \
	ADDSD X5, X0             \
	MOVSD X0, (Y_PTR)        \
	ADDQ  $SIZE, A_PTR

#define KERNEL_1x1 \
	MOVSD (Y_PTR), X0 \
	MOVSD (A_PTR), X4 \
	MULSD X8, X4      \
	ADDSD X4, X0      \
	MOVSD X0, (Y_PTR) \
	ADDQ  $SIZE, A_PTR

#define SCALE_8(PTR, SCAL) \
	MOVUPS (PTR), X0   \
	MOVUPS 16(PTR), X1 \
	MOVUPS 32(PTR), X2 \
	MOVUPS 48(PTR), X3 \
	MULPD  SCAL, X0    \
	MULPD  SCAL, X1    \
	MULPD  SCAL, X2    \
	MULPD  SCAL, X3    \
	MOVUPS X0, (PTR)   \
	MOVUPS X1, 16(PTR) \
	MOVUPS X2, 32(PTR) \
	MOVUPS X3, 48(PTR)

#define SCALE_4(PTR, SCAL) \
	MOVUPS (PTR), X0   \
	MOVUPS 16(PTR), X1 \
	MULPD  SCAL, X0    \
	MULPD  SCAL, X1    \
	MOVUPS X0, (PTR)   \
	MOVUPS X1, 16(PTR)

#define SCALE_2(PTR, SCAL) \
	MOVUPS (PTR), X0 \
	MULPD  SCAL, X0  \
	MOVUPS X0, (PTR)

#define SCALE_1(PTR, SCAL) \
	MOVSD (PTR), X0 \
	MULSD SCAL, X0  \
	MOVSD X0, (PTR)

#define SCALEINC_4(PTR, INC, INC3, SCAL) \
	MOVSD (PTR), X0         \
	MOVSD (PTR)(INC*1), X1  \
	MOVSD (PTR)(INC*2), X2  \
	MOVSD (PTR)(INC3*1), X3 \
	MULSD SCAL, X0          \
	MULSD SCAL, X1          \
	MULSD SCAL, X2          \
	MULSD SCAL, X3          \
	MOVSD X0, (PTR)         \
	MOVSD X1, (PTR)(INC*1)  \
	MOVSD X2, (PTR)(INC*2)  \
	MOVSD X3, (PTR)(INC3*1)

#define SCALEINC_2(PTR, INC, SCAL) \
	MOVSD (PTR), X0        \
	MOVSD (PTR)(INC*1), X1 \
	MULSD SCAL, X0         \
	MULSD SCAL, X1         \
	MOVSD X0, (PTR)        \
	MOVSD X1, (PTR)(INC*1)
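// For reference, a scalar Go sketch of the computation implemented below,
// assuming unit increments (the assembly additionally handles arbitrary and
// negative incX/incY, and special-cases beta == 0 and beta == 1); this
// sketch is written for this comment and is not generated from the assembly:
//
//	// y = alpha * Aᵀ * x + beta * y
//	// A is m×n, row-major with stride lda; len(x) == m, len(y) == n.
//	for i := range y {
//		y[i] *= beta
//	}
//	for j := 0; j < m; j++ {
//		axj := alpha * x[j]
//		for i := 0; i < n; i++ {
//			y[i] += axj * a[j*lda+i]
//		}
//	}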
// func GemvT(m, n int,
//	alpha float64,
//	a []float64, lda int,
//	x []float64, incX int,
//	beta float64,
//	y []float64, incY int)
TEXT ·GemvT(SB), NOSPLIT, $32-128
	MOVQ M_DIM, M
	MOVQ N_DIM, N
	CMPQ M, $0
	JE   end
	CMPQ N, $0
	JE   end

	MOVDDUP alpha+16(FP), ALPHA

	MOVQ x_base+56(FP), X_PTR
	MOVQ y_base+96(FP), Y_PTR
	MOVQ a_base+24(FP), A_ROW
	MOVQ incY+120(FP), INC_Y // incY, in elements; scaled to bytes on the inc path
	MOVQ lda+48(FP), LDA
	SHLQ $3, LDA             // LDA = lda * sizeof(float64)
	LEAQ (LDA)(LDA*2), LDA3  // LDA3 = LDA * 3
	MOVQ A_ROW, A_PTR

	MOVQ incX+80(FP), INC_X

	XORQ    TMP2, TMP2
	MOVQ    N, TMP1
	SUBQ    $1, TMP1
	NEGQ    TMP1
	IMULQ   INC_X, TMP1
	CMPQ    INC_X, $0
	CMOVQLT TMP1, TMP2
	LEAQ    (X_PTR)(TMP2*SIZE), X_PTR // if incX < 0, start x at its last element
	MOVQ    X_PTR, X

	SHLQ $3, INC_X                // INC_X = incX * sizeof(float64)
	LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3

	CMPQ incY+120(FP), $1 // Check for dense vector y (fast path)
	JNE  inc

	MOVSD  $1.0, X0
	COMISD beta+88(FP), X0
	JE     gemv_start

	MOVSD  $0.0, X0
	COMISD beta+88(FP), X0
	JE     gemv_clear

	MOVDDUP beta+88(FP), BETA
	SHRQ    $3, M
	JZ      scal4

scal8:
	SCALE_8(Y_PTR, BETA)
	ADDQ $8*SIZE, Y_PTR
	DECQ M
	JNZ  scal8

scal4:
	TESTQ $4, M_DIM
	JZ    scal2
	SCALE_4(Y_PTR, BETA)
	ADDQ  $4*SIZE, Y_PTR

scal2:
	TESTQ $2, M_DIM
	JZ    scal1
	SCALE_2(Y_PTR, BETA)
	ADDQ  $2*SIZE, Y_PTR

scal1:
	TESTQ $1, M_DIM
	JZ    prep_end
	SCALE_1(Y_PTR, BETA)

	JMP prep_end

gemv_clear: // beta == 0 is special-cased to clear memory (no NaN handling)
	XORPS X0, X0
	XORPS X1, X1
	XORPS X2, X2
	XORPS X3, X3

	SHRQ $3, M
	JZ   clear4

clear8:
	MOVUPS X0, (Y_PTR)
	MOVUPS X1, 16(Y_PTR)
	MOVUPS X2, 32(Y_PTR)
	MOVUPS X3, 48(Y_PTR)
	ADDQ   $8*SIZE, Y_PTR
	DECQ   M
	JNZ    clear8

clear4:
	TESTQ  $4, M_DIM
	JZ     clear2
	MOVUPS X0, (Y_PTR)
	MOVUPS X1, 16(Y_PTR)
	ADDQ   $4*SIZE, Y_PTR

clear2:
	TESTQ  $2, M_DIM
	JZ     clear1
	MOVUPS X0, (Y_PTR)
	ADDQ   $2*SIZE, Y_PTR

clear1:
	TESTQ $1, M_DIM
	JZ    prep_end
	MOVSD X0, (Y_PTR)

prep_end:
	MOVQ Y, Y_PTR
	MOVQ M_DIM, M

gemv_start:
	SHRQ $2, N
	JZ   c2

c4:
	// LOAD 4
	INIT4

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   c4r2

c4r4:
	// 4x4 KERNEL
	KERNEL_LOAD4
	KERNEL_4x4
	STORE4

	ADDQ $4*SIZE, Y_PTR

	DECQ M
	JNZ  c4r4

c4r2:
	TESTQ $2, M_DIM
	JZ    c4r1

	// 4x2 KERNEL
	KERNEL_LOAD2
	KERNEL_2x4
	STORE2

	ADDQ $2*SIZE, Y_PTR

c4r1:
	TESTQ $1, M_DIM
	JZ    c4end

	// 4x1 KERNEL
	KERNEL_1x4

	ADDQ $SIZE, Y_PTR

c4end:
	LEAQ (X_PTR)(INC_X*4), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ N
	JNZ  c4

c2:
	TESTQ $2, N_DIM
	JZ    c1

	// LOAD 2
	INIT2

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   c2r2

c2r4:
	// 2x4 KERNEL
	KERNEL_LOAD4
	KERNEL_4x2
	STORE4

	ADDQ $4*SIZE, Y_PTR

	DECQ M
	JNZ  c2r4

c2r2:
	TESTQ $2, M_DIM
	JZ    c2r1

	// 2x2 KERNEL
	KERNEL_LOAD2
	KERNEL_2x2
	STORE2

	ADDQ $2*SIZE, Y_PTR

c2r1:
	TESTQ $1, M_DIM
	JZ    c2end

	// 2x1 KERNEL
	KERNEL_1x2

	ADDQ $SIZE, Y_PTR

c2end:
	LEAQ (X_PTR)(INC_X*2), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

c1:
	TESTQ $1, N_DIM
	JZ    end

	// LOAD 1
	INIT1

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   c1r2

c1r4:
	// 1x4 KERNEL
	KERNEL_LOAD4
	KERNEL_4x1
	STORE4

	ADDQ $4*SIZE, Y_PTR

	DECQ M
	JNZ  c1r4

c1r2:
	TESTQ $2, M_DIM
	JZ    c1r1

	// 1x2 KERNEL
	KERNEL_LOAD2
	KERNEL_2x1
	STORE2

	ADDQ $2*SIZE, Y_PTR

c1r1:
	TESTQ $1, M_DIM
	JZ    end

	// 1x1 KERNEL
	KERNEL_1x1

end:
	RET
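// The inc path below services non-unit increments with split loads in the
// kernels. Following BLAS convention, a negative increment walks the vector
// from its far end, so the base pointer is first biased by (len-1)*|inc|
// elements; that is what the XORQ/IMULQ/CMOVQLT sequences compute. The
// equivalent index calculation in Go (illustrative only, not package code):
//
//	idx := 0
//	if inc < 0 {
//		idx = -(lenv - 1) * inc // (lenv-1)*|inc|: element 0 sits at the high end
//	}
//	// element i of the vector is then v[idx+i*inc]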
inc: // Algorithm for incY != 1 (split loads in kernel)
	XORQ    TMP2, TMP2
	MOVQ    M, TMP1
	SUBQ    $1, TMP1
	IMULQ   INC_Y, TMP1
	NEGQ    TMP1
	CMPQ    INC_Y, $0
	CMOVQLT TMP1, TMP2
	LEAQ    (Y_PTR)(TMP2*SIZE), Y_PTR // if incY < 0, start y at its last element
	MOVQ    Y_PTR, Y

	SHLQ $3, INC_Y                // INC_Y = incY * sizeof(float64)
	LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3

	MOVSD  $1.0, X0
	COMISD beta+88(FP), X0
	JE     inc_gemv_start

	MOVSD  $0.0, X0
	COMISD beta+88(FP), X0
	JE     inc_gemv_clear

	MOVDDUP beta+88(FP), BETA
	SHRQ    $2, M
	JZ      inc_scal2

inc_scal4:
	SCALEINC_4(Y_PTR, INC_Y, INC3_Y, BETA)
	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ M
	JNZ  inc_scal4

inc_scal2:
	TESTQ $2, M_DIM
	JZ    inc_scal1

	SCALEINC_2(Y_PTR, INC_Y, BETA)
	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_scal1:
	TESTQ $1, M_DIM
	JZ    inc_prep_end
	SCALE_1(Y_PTR, BETA)

	JMP inc_prep_end

inc_gemv_clear: // beta == 0 is special-cased to clear memory (no NaN handling)
	XORPS X0, X0
	XORPS X1, X1
	XORPS X2, X2
	XORPS X3, X3

	SHRQ $2, M
	JZ   inc_clear2

inc_clear4:
	MOVSD X0, (Y_PTR)
	MOVSD X1, (Y_PTR)(INC_Y*1)
	MOVSD X2, (Y_PTR)(INC_Y*2)
	MOVSD X3, (Y_PTR)(INC3_Y*1)
	LEAQ  (Y_PTR)(INC_Y*4), Y_PTR
	DECQ  M
	JNZ   inc_clear4

inc_clear2:
	TESTQ $2, M_DIM
	JZ    inc_clear1
	MOVSD X0, (Y_PTR)
	MOVSD X1, (Y_PTR)(INC_Y*1)
	LEAQ  (Y_PTR)(INC_Y*2), Y_PTR

inc_clear1:
	TESTQ $1, M_DIM
	JZ    inc_prep_end
	MOVSD X0, (Y_PTR)

inc_prep_end:
	MOVQ Y, Y_PTR
	MOVQ M_DIM, M

inc_gemv_start:
	SHRQ $2, N
	JZ   inc_c2

inc_c4:
	// LOAD 4
	INIT4

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   inc_c4r2

inc_c4r4:
	// 4x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_4x4
	STORE4_INC

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR

	DECQ M
	JNZ  inc_c4r4

inc_c4r2:
	TESTQ $2, M_DIM
	JZ    inc_c4r1

	// 4x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_2x4
	STORE2_INC

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_c4r1:
	TESTQ $1, M_DIM
	JZ    inc_c4end

	// 4x1 KERNEL
	KERNEL_1x4

	ADDQ INC_Y, Y_PTR

inc_c4end:
	LEAQ (X_PTR)(INC_X*4), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ N
	JNZ  inc_c4

inc_c2:
	TESTQ $2, N_DIM
	JZ    inc_c1

	// LOAD 2
	INIT2

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   inc_c2r2

inc_c2r4:
	// 2x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_4x2
	STORE4_INC

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ M
	JNZ  inc_c2r4

inc_c2r2:
	TESTQ $2, M_DIM
	JZ    inc_c2r1

	// 2x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_2x2
	STORE2_INC

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_c2r1:
	TESTQ $1, M_DIM
	JZ    inc_c2end

	// 2x1 KERNEL
	KERNEL_1x2

	ADDQ INC_Y, Y_PTR

inc_c2end:
	LEAQ (X_PTR)(INC_X*2), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

inc_c1:
	TESTQ $1, N_DIM
	JZ    inc_end

	// LOAD 1
	INIT1

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   inc_c1r2

inc_c1r4:
	// 1x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_4x1
	STORE4_INC

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ M
	JNZ  inc_c1r4

inc_c1r2:
	TESTQ $2, M_DIM
	JZ    inc_c1r1

	// 1x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_2x1
	STORE2_INC

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_c1r1:
	TESTQ $1, M_DIM
	JZ    inc_end

	// 1x1 KERNEL
	KERNEL_1x1

inc_end:
	RET
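// Usage sketch: this package is internal to gonum.org/v1/gonum, so only the
// module's own BLAS kernels call GemvT. A hypothetical in-module call
// (values invented for illustration; untyped constants keep the call valid
// whether the Go stub declares the size and increment parameters as int or
// uintptr):
//
//	// y = 1*Aᵀx + 0*y for the 3×2 row-major A = [[1,2],[3,4],[5,6]].
//	a := []float64{1, 2, 3, 4, 5, 6} // lda = 2
//	x := []float64{1, 1, 1}          // length m = 3
//	y := []float64{0, 0}             // length n = 2
//	f64.GemvT(3, 2, 1.0, a, 2, x, 1, 0.0, y, 1)
//	// y == []float64{9, 12}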