// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

#define SIZE 8

#define M_DIM m+0(FP)
#define M CX
#define N_DIM n+8(FP)
#define N BX

#define TMP1 R14
#define TMP2 R15

#define X_PTR SI
#define X x_base+56(FP)
#define INC_X R8
#define INC3_X R9

#define Y_PTR DX
#define Y y_base+96(FP)
#define INC_Y R10
#define INC3_Y R11

#define A_ROW AX
#define A_PTR DI
#define LDA R12
#define LDA3 R13

#define ALPHA X15
#define BETA X14

// INIT{4,2,1} zero the accumulator registers for a row block.
#define INIT4 \
	XORPS X0, X0 \
	XORPS X1, X1 \
	XORPS X2, X2 \
	XORPS X3, X3

#define INIT2 \
	XORPS X0, X0 \
	XORPS X1, X1

#define INIT1 \
	XORPS X0, X0

// KERNEL_LOAD{4,2} load contiguous x values into X12 (and X13).
#define KERNEL_LOAD4 \
	MOVUPS (X_PTR), X12 \
	MOVUPS 2*SIZE(X_PTR), X13

#define KERNEL_LOAD2 \
	MOVUPS (X_PTR), X12

// KERNEL_LOAD{4,2}_INC gather strided x values into X12 (and X13).
#define KERNEL_LOAD4_INC \
	MOVSD (X_PTR), X12 \
	MOVHPD (X_PTR)(INC_X*1), X12 \
	MOVSD (X_PTR)(INC_X*2), X13 \
	MOVHPD (X_PTR)(INC3_X*1), X13

#define KERNEL_LOAD2_INC \
	MOVSD (X_PTR), X12 \
	MOVHPD (X_PTR)(INC_X*1), X12

// KERNEL_4x4 multiplies a 4x4 block of A by four x values and
// accumulates the partial row sums into X0..X3 (one register per row).
#define KERNEL_4x4 \
	MOVUPS (A_PTR), X4 \
	MOVUPS 2*SIZE(A_PTR), X5 \
	MOVUPS (A_PTR)(LDA*1), X6 \
	MOVUPS 2*SIZE(A_PTR)(LDA*1), X7 \
	MOVUPS (A_PTR)(LDA*2), X8 \
	MOVUPS 2*SIZE(A_PTR)(LDA*2), X9 \
	MOVUPS (A_PTR)(LDA3*1), X10 \
	MOVUPS 2*SIZE(A_PTR)(LDA3*1), X11 \
	MULPD X12, X4 \
	MULPD X13, X5 \
	MULPD X12, X6 \
	MULPD X13, X7 \
	MULPD X12, X8 \
	MULPD X13, X9 \
	MULPD X12, X10 \
	MULPD X13, X11 \
	ADDPD X4, X0 \
	ADDPD X5, X0 \
	ADDPD X6, X1 \
	ADDPD X7, X1 \
	ADDPD X8, X2 \
	ADDPD X9, X2 \
	ADDPD X10, X3 \
	ADDPD X11, X3 \
	ADDQ $4*SIZE, A_PTR

// KERNEL_4x2 does the same for a 4x2 block.
#define KERNEL_4x2 \
	MOVUPS (A_PTR), X4 \
	MOVUPS (A_PTR)(LDA*1), X5 \
	MOVUPS (A_PTR)(LDA*2), X6 \
	MOVUPS (A_PTR)(LDA3*1), X7 \
	MULPD X12, X4 \
	MULPD X12, X5 \
	MULPD X12, X6 \
	MULPD X12, X7 \
	ADDPD X4, X0 \
	ADDPD X5, X1 \
	ADDPD X6, X2 \
	ADDPD X7, X3 \
	ADDQ $2*SIZE, A_PTR

// KERNEL_4x1 handles the odd column: it broadcasts the x scalar and
// updates the folded row sums in X0 and X2.
#define KERNEL_4x1 \
	MOVDDUP (X_PTR), X12 \
	MOVSD (A_PTR), X4 \
	MOVHPD (A_PTR)(LDA*1), X4 \
	MOVSD (A_PTR)(LDA*2), X5 \
	MOVHPD (A_PTR)(LDA3*1), X5 \
	MULPD X12, X4 \
	MULPD X12, X5 \
	ADDPD X4, X0 \
	ADDPD X5, X2 \
	ADDQ $SIZE, A_PTR

// STORE4 writes y[0:4] = alpha*sum + beta*y[0:4] for unit-stride y.
#define STORE4 \
	MOVUPS (Y_PTR), X4 \
	MOVUPS 2*SIZE(Y_PTR), X5 \
	MULPD ALPHA, X0 \
	MULPD ALPHA, X2 \
	MULPD BETA, X4 \
	MULPD BETA, X5 \
	ADDPD X0, X4 \
	ADDPD X2, X5 \
	MOVUPS X4, (Y_PTR) \
	MOVUPS X5, 2*SIZE(Y_PTR)

// STORE4_INC is the strided-y variant of STORE4.
#define STORE4_INC \
	MOVSD (Y_PTR), X4 \
	MOVHPD (Y_PTR)(INC_Y*1), X4 \
	MOVSD (Y_PTR)(INC_Y*2), X5 \
	MOVHPD (Y_PTR)(INC3_Y*1), X5 \
	MULPD ALPHA, X0 \
	MULPD ALPHA, X2 \
	MULPD BETA, X4 \
	MULPD BETA, X5 \
	ADDPD X0, X4 \
	ADDPD X2, X5 \
	MOVLPD X4, (Y_PTR) \
	MOVHPD X4, (Y_PTR)(INC_Y*1) \
	MOVLPD X5, (Y_PTR)(INC_Y*2) \
	MOVHPD X5, (Y_PTR)(INC3_Y*1)

// KERNEL_2x4 accumulates a 2x4 block into X0 and X1.
#define KERNEL_2x4 \
	MOVUPS (A_PTR), X8 \
	MOVUPS 2*SIZE(A_PTR), X9 \
	MOVUPS (A_PTR)(LDA*1), X10 \
	MOVUPS 2*SIZE(A_PTR)(LDA*1), X11 \
	MULPD X12, X8 \
	MULPD X13, X9 \
	MULPD X12, X10 \
	MULPD X13, X11 \
	ADDPD X8, X0 \
	ADDPD X10, X1 \
	ADDPD X9, X0 \
	ADDPD X11, X1 \
	ADDQ $4*SIZE, A_PTR
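// Accumulation layout used by the kernels: within a row block, each
// accumulator register holds two partial sums for one row of A
// (even-offset columns in the low lane, odd-offset columns in the high
// lane). A single HADDPD after the column loop folds each pair into
// that row's final dot product.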
#define KERNEL_2x2 \
	MOVUPS (A_PTR), X8 \
	MOVUPS (A_PTR)(LDA*1), X9 \
	MULPD X12, X8 \
	MULPD X12, X9 \
	ADDPD X8, X0 \
	ADDPD X9, X1 \
	ADDQ $2*SIZE, A_PTR

#define KERNEL_2x1 \
	MOVDDUP (X_PTR), X12 \
	MOVSD (A_PTR), X8 \
	MOVHPD (A_PTR)(LDA*1), X8 \
	MULPD X12, X8 \
	ADDPD X8, X0 \
	ADDQ $SIZE, A_PTR

#define STORE2 \
	MOVUPS (Y_PTR), X4 \
	MULPD ALPHA, X0 \
	MULPD BETA, X4 \
	ADDPD X0, X4 \
	MOVUPS X4, (Y_PTR)

#define STORE2_INC \
	MOVSD (Y_PTR), X4 \
	MOVHPD (Y_PTR)(INC_Y*1), X4 \
	MULPD ALPHA, X0 \
	MULPD BETA, X4 \
	ADDPD X0, X4 \
	MOVSD X4, (Y_PTR) \
	MOVHPD X4, (Y_PTR)(INC_Y*1)

#define KERNEL_1x4 \
	MOVUPS (A_PTR), X8 \
	MOVUPS 2*SIZE(A_PTR), X9 \
	MULPD X12, X8 \
	MULPD X13, X9 \
	ADDPD X8, X0 \
	ADDPD X9, X0 \
	ADDQ $4*SIZE, A_PTR

#define KERNEL_1x2 \
	MOVUPS (A_PTR), X8 \
	MULPD X12, X8 \
	ADDPD X8, X0 \
	ADDQ $2*SIZE, A_PTR

#define KERNEL_1x1 \
	MOVSD (X_PTR), X12 \
	MOVSD (A_PTR), X8 \
	MULSD X12, X8 \
	ADDSD X8, X0 \
	ADDQ $SIZE, A_PTR

#define STORE1 \
	HADDPD X0, X0 \
	MOVSD (Y_PTR), X4 \
	MULSD ALPHA, X0 \
	MULSD BETA, X4 \
	ADDSD X0, X4 \
	MOVSD X4, (Y_PTR)

// func GemvN(m, n int,
//	alpha float64,
//	a []float64, lda int,
//	x []float64, incX int,
//	beta float64,
//	y []float64, incY int)
//
// GemvN computes y = alpha * A * x + beta * y, where A is an m×n
// row-major matrix with stride lda.
TEXT ·GemvN(SB), NOSPLIT, $32-128
	MOVQ M_DIM, M
	MOVQ N_DIM, N
	CMPQ M, $0
	JE end
	CMPQ N, $0
	JE end

	MOVDDUP alpha+16(FP), ALPHA
	MOVDDUP beta+88(FP), BETA

	MOVQ x_base+56(FP), X_PTR
	MOVQ y_base+96(FP), Y_PTR
	MOVQ a_base+24(FP), A_ROW
	MOVQ incY+120(FP), INC_Y
	MOVQ lda+48(FP), LDA
	SHLQ $3, LDA                  // LDA = lda * sizeof(float64)
	LEAQ (LDA)(LDA*2), LDA3       // LDA3 = LDA * 3
	MOVQ A_ROW, A_PTR

	// If incY is negative, y is traversed from its far end; point Y_PTR
	// at the first element actually touched.
	XORQ TMP2, TMP2               // TMP2 = 0
	MOVQ M, TMP1                  // TMP1 = M
	SUBQ $1, TMP1                 // TMP1 -= 1
	IMULQ INC_Y, TMP1             // TMP1 *= INC_Y
	NEGQ TMP1                     // TMP1 = -TMP1
	CMPQ INC_Y, $0
	CMOVQLT TMP1, TMP2            // if INC_Y < 0 { TMP2 = TMP1 }
	LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR
	MOVQ Y_PTR, Y

	SHLQ $3, INC_Y                // INC_Y = incY * sizeof(float64)
	LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3

	MOVSD $0.0, X0
	COMISD BETA, X0
	JNE gemv_start                // if beta != 0 { goto gemv_start }

gemv_clear: // beta == 0 is special cased to clear memory (no nan handling)
	XORPS X0, X0
	XORPS X1, X1
	XORPS X2, X2
	XORPS X3, X3

	CMPQ incY+120(FP), $1 // Check for dense vector Y (fast-path)
	JNE inc_clear

	SHRQ $3, M
	JZ clear4

clear8:
	MOVUPS X0, (Y_PTR)
	MOVUPS X1, 16(Y_PTR)
	MOVUPS X2, 32(Y_PTR)
	MOVUPS X3, 48(Y_PTR)
	ADDQ $8*SIZE, Y_PTR
	DECQ M
	JNZ clear8

clear4:
	TESTQ $4, M_DIM
	JZ clear2
	MOVUPS X0, (Y_PTR)
	MOVUPS X1, 16(Y_PTR)
	ADDQ $4*SIZE, Y_PTR

clear2:
	TESTQ $2, M_DIM
	JZ clear1
	MOVUPS X0, (Y_PTR)
	ADDQ $2*SIZE, Y_PTR

clear1:
	TESTQ $1, M_DIM
	JZ prep_end
	MOVSD X0, (Y_PTR)

	JMP prep_end

inc_clear:
	SHRQ $2, M
	JZ inc_clear2

inc_clear4:
	MOVSD X0, (Y_PTR)
	MOVSD X1, (Y_PTR)(INC_Y*1)
	MOVSD X2, (Y_PTR)(INC_Y*2)
	MOVSD X3, (Y_PTR)(INC3_Y*1)
	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ M
	JNZ inc_clear4

inc_clear2:
	TESTQ $2, M_DIM
	JZ inc_clear1
	MOVSD X0, (Y_PTR)
	MOVSD X1, (Y_PTR)(INC_Y*1)
	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_clear1:
	TESTQ $1, M_DIM
	JZ prep_end
	MOVSD X0, (Y_PTR)
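// y has been cleared; restore the y pointer and row count, then fall
// through to the main kernels. Rows are processed in blocks of 4, 2,
// then 1, and each row block walks the columns in blocks of 4, 2, then 1.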
prep_end:
	MOVQ Y, Y_PTR
	MOVQ M_DIM, M

gemv_start:
	CMPQ incX+80(FP), $1 // Check for dense vector X (fast-path)
	JNE inc

	SHRQ $2, M
	JZ r2

r4:
	// LOAD 4
	INIT4

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ r4c2

r4c4:
	// 4x4 KERNEL
	KERNEL_LOAD4
	KERNEL_4x4

	ADDQ $4*SIZE, X_PTR

	DECQ N
	JNZ r4c4

r4c2:
	TESTQ $2, N_DIM
	JZ r4c1

	// 4x2 KERNEL
	KERNEL_LOAD2
	KERNEL_4x2

	ADDQ $2*SIZE, X_PTR

r4c1:
	HADDPD X1, X0 // Fold partials: X0 = [sum(row0), sum(row1)], X2 = [sum(row2), sum(row3)]
	HADDPD X3, X2
	TESTQ $1, N_DIM
	JZ r4end

	// 4x1 KERNEL
	KERNEL_4x1

	ADDQ $SIZE, X_PTR

r4end:
	CMPQ INC_Y, $SIZE
	JNZ r4st_inc

	STORE4
	ADDQ $4*SIZE, Y_PTR
	JMP r4inc

r4st_inc:
	STORE4_INC
	LEAQ (Y_PTR)(INC_Y*4), Y_PTR

r4inc:
	MOVQ X, X_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ M
	JNZ r4

r2:
	TESTQ $2, M_DIM
	JZ r1

	// LOAD 2
	INIT2

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ r2c2

r2c4:
	// 2x4 KERNEL
	KERNEL_LOAD4
	KERNEL_2x4

	ADDQ $4*SIZE, X_PTR

	DECQ N
	JNZ r2c4

r2c2:
	TESTQ $2, N_DIM
	JZ r2c1

	// 2x2 KERNEL
	KERNEL_LOAD2
	KERNEL_2x2

	ADDQ $2*SIZE, X_PTR

r2c1:
	HADDPD X1, X0
	TESTQ $1, N_DIM
	JZ r2end

	// 2x1 KERNEL
	KERNEL_2x1

	ADDQ $SIZE, X_PTR

r2end:
	CMPQ INC_Y, $SIZE
	JNE r2st_inc

	STORE2
	ADDQ $2*SIZE, Y_PTR
	JMP r2inc

r2st_inc:
	STORE2_INC
	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

r2inc:
	MOVQ X, X_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

r1:
	TESTQ $1, M_DIM
	JZ end

	// LOAD 1
	INIT1

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ r1c2

r1c4:
	// 1x4 KERNEL
	KERNEL_LOAD4
	KERNEL_1x4

	ADDQ $4*SIZE, X_PTR

	DECQ N
	JNZ r1c4

r1c2:
	TESTQ $2, N_DIM
	JZ r1c1

	// 1x2 KERNEL
	KERNEL_LOAD2
	KERNEL_1x2

	ADDQ $2*SIZE, X_PTR

r1c1:
	TESTQ $1, N_DIM
	JZ r1end

	// 1x1 KERNEL
	KERNEL_1x1

r1end:
	STORE1

end:
	RET

inc: // Algorithm for incX != 1 (split loads in kernel)
	MOVQ incX+80(FP), INC_X // INC_X = incX

	// If incX is negative, x is traversed from its far end; point X_PTR
	// at the first element actually touched.
	XORQ TMP2, TMP2                // TMP2 = 0
	MOVQ N, TMP1                   // TMP1 = N
	SUBQ $1, TMP1                  // TMP1 -= 1
	NEGQ TMP1                      // TMP1 = -TMP1
	IMULQ INC_X, TMP1              // TMP1 *= INC_X
	CMPQ INC_X, $0                 // if INC_X < 0 { TMP2 = TMP1 }
	CMOVQLT TMP1, TMP2
	LEAQ (X_PTR)(TMP2*SIZE), X_PTR // X_PTR = X_PTR[TMP2]
	MOVQ X_PTR, X                  // X = X_PTR

	SHLQ $3, INC_X                 // INC_X = incX * sizeof(float64)
	LEAQ (INC_X)(INC_X*2), INC3_X  // INC3_X = INC_X * 3

	SHRQ $2, M
	JZ inc_r2

inc_r4:
	// LOAD 4
	INIT4

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ inc_r4c2

inc_r4c4:
	// 4x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_4x4

	LEAQ (X_PTR)(INC_X*4), X_PTR

	DECQ N
	JNZ inc_r4c4

inc_r4c2:
	TESTQ $2, N_DIM
	JZ inc_r4c1

	// 4x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_4x2

	LEAQ (X_PTR)(INC_X*2), X_PTR

inc_r4c1:
	HADDPD X1, X0
	HADDPD X3, X2
	TESTQ $1, N_DIM
	JZ inc_r4end

	// 4x1 KERNEL
	KERNEL_4x1

	ADDQ INC_X, X_PTR

inc_r4end:
	CMPQ INC_Y, $SIZE
	JNE inc_r4st_inc

	STORE4
	ADDQ $4*SIZE, Y_PTR
	JMP inc_r4inc

inc_r4st_inc:
	STORE4_INC
	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
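// Rewind x to the start of the vector and advance A by four rows
// before the next row block.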
inc_r4inc:
	MOVQ X, X_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ M
	JNZ inc_r4

inc_r2:
	TESTQ $2, M_DIM
	JZ inc_r1

	// LOAD 2
	INIT2

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ inc_r2c2

inc_r2c4:
	// 2x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_2x4

	LEAQ (X_PTR)(INC_X*4), X_PTR
	DECQ N
	JNZ inc_r2c4

inc_r2c2:
	TESTQ $2, N_DIM
	JZ inc_r2c1

	// 2x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_2x2

	LEAQ (X_PTR)(INC_X*2), X_PTR

inc_r2c1:
	HADDPD X1, X0
	TESTQ $1, N_DIM
	JZ inc_r2end

	// 2x1 KERNEL
	KERNEL_2x1

	ADDQ INC_X, X_PTR

inc_r2end:
	CMPQ INC_Y, $SIZE
	JNE inc_r2st_inc

	STORE2
	ADDQ $2*SIZE, Y_PTR
	JMP inc_r2inc

inc_r2st_inc:
	STORE2_INC
	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_r2inc:
	MOVQ X, X_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

inc_r1:
	TESTQ $1, M_DIM
	JZ inc_end

	// LOAD 1
	INIT1

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ inc_r1c2

inc_r1c4:
	// 1x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_1x4

	LEAQ (X_PTR)(INC_X*4), X_PTR
	DECQ N
	JNZ inc_r1c4

inc_r1c2:
	TESTQ $2, N_DIM
	JZ inc_r1c1

	// 1x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_1x2

	LEAQ (X_PTR)(INC_X*2), X_PTR

inc_r1c1:
	TESTQ $1, N_DIM
	JZ inc_r1end

	// 1x1 KERNEL
	KERNEL_1x1

inc_r1end:
	STORE1

inc_end:
	RET
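// For reference, a rough pure-Go sketch of the operation the kernels
// above implement: y = alpha * A * x + beta * y for a row-major m×n
// matrix A with stride lda. This is an illustrative sketch only, not
// this package's actual no-asm fallback, and gemvNRef is a hypothetical
// name:
//
//	func gemvNRef(m, n int, alpha float64, a []float64, lda int,
//		x []float64, incX int, beta float64, y []float64, incY int) {
//		kx, ky := 0, 0
//		if incX < 0 {
//			kx = -(n - 1) * incX // negative strides start at the far end
//		}
//		if incY < 0 {
//			ky = -(m - 1) * incY
//		}
//		for i := 0; i < m; i++ {
//			var sum float64
//			for j := 0; j < n; j++ {
//				sum += a[i*lda+j] * x[kx+j*incX]
//			}
//			iy := ky + i*incY
//			if beta == 0 {
//				y[iy] = alpha * sum // mirrors gemv_clear: old y (even NaN) is discarded
//			} else {
//				y[iy] = alpha*sum + beta*y[iy]
//			}
//		}
//	}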