// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

#define SIZE 4
#define BITSIZE 2
#define KERNELSIZE 3

#define M_DIM m+0(FP)
#define M CX
#define N_DIM n+8(FP)
#define N BX

#define TMP1 R14
#define TMP2 R15

#define X_PTR SI
#define Y y_base+56(FP)
#define Y_PTR DX
#define A_ROW AX
#define A_PTR DI

#define INC_X R8
#define INC3_X R9

#define INC_Y R10
#define INC3_Y R11

#define LDA R12
#define LDA3 R13

#define ALPHA X0
#define ALPHA_SPILL al-16(SP)

#define LOAD_ALPHA \
	MOVSS alpha+16(FP), ALPHA \
	SHUFPS $0, ALPHA, ALPHA

#define LOAD_SCALED4 \
	PREFETCHNTA 16*SIZE(X_PTR) \
	MOVDDUP (X_PTR), X1 \
	MOVDDUP 2*SIZE(X_PTR), X3 \
	MOVSHDUP X1, X2 \
	MOVSHDUP X3, X4 \
	MOVSLDUP X1, X1 \
	MOVSLDUP X3, X3 \
	MULPS ALPHA, X1 \
	MULPS ALPHA, X2 \
	MULPS ALPHA, X3 \
	MULPS ALPHA, X4

#define LOAD_SCALED2 \
	MOVDDUP (X_PTR), X1 \
	MOVSHDUP X1, X2 \
	MOVSLDUP X1, X1 \
	MULPS ALPHA, X1 \
	MULPS ALPHA, X2

#define LOAD_SCALED1 \
	MOVSS (X_PTR), X1 \
	SHUFPS $0, X1, X1 \
	MULPS ALPHA, X1

#define LOAD_SCALED4_INC \
	PREFETCHNTA (X_PTR)(INC_X*8) \
	MOVSS (X_PTR), X1 \
	MOVSS (X_PTR)(INC_X*1), X2 \
	MOVSS (X_PTR)(INC_X*2), X3 \
	MOVSS (X_PTR)(INC3_X*1), X4 \
	SHUFPS $0, X1, X1 \
	SHUFPS $0, X2, X2 \
	SHUFPS $0, X3, X3 \
	SHUFPS $0, X4, X4 \
	MULPS ALPHA, X1 \
	MULPS ALPHA, X2 \
	MULPS ALPHA, X3 \
	MULPS ALPHA, X4

#define LOAD_SCALED2_INC \
	MOVSS (X_PTR), X1 \
	MOVSS (X_PTR)(INC_X*1), X2 \
	SHUFPS $0, X1, X1 \
	SHUFPS $0, X2, X2 \
	MULPS ALPHA, X1 \
	MULPS ALPHA, X2

#define KERNEL_LOAD8 \
	MOVUPS (Y_PTR), X5 \
	MOVUPS 4*SIZE(Y_PTR), X6

#define KERNEL_LOAD8_INC \
	MOVSS (Y_PTR), X5 \
	MOVSS (Y_PTR)(INC_Y*1), X6 \
	MOVSS (Y_PTR)(INC_Y*2), X7 \
	MOVSS (Y_PTR)(INC3_Y*1), X8 \
	UNPCKLPS X6, X5 \
	UNPCKLPS X8, X7 \
	MOVLHPS X7, X5 \
	LEAQ (Y_PTR)(INC_Y*4), Y_PTR \
	MOVSS (Y_PTR), X6 \
	MOVSS (Y_PTR)(INC_Y*1), X7 \
	MOVSS (Y_PTR)(INC_Y*2), X8 \
	MOVSS (Y_PTR)(INC3_Y*1), X9 \
	UNPCKLPS X7, X6 \
	UNPCKLPS X9, X8 \
	MOVLHPS X8, X6

#define KERNEL_LOAD4 \
	MOVUPS (Y_PTR), X5

#define KERNEL_LOAD4_INC \
	MOVSS (Y_PTR), X5 \
	MOVSS (Y_PTR)(INC_Y*1), X6 \
	MOVSS (Y_PTR)(INC_Y*2), X7 \
	MOVSS (Y_PTR)(INC3_Y*1), X8 \
	UNPCKLPS X6, X5 \
	UNPCKLPS X8, X7 \
	MOVLHPS X7, X5

#define KERNEL_LOAD2 \
	MOVSD (Y_PTR), X5

#define KERNEL_LOAD2_INC \
	MOVSS (Y_PTR), X5 \
	MOVSS (Y_PTR)(INC_Y*1), X6 \
	UNPCKLPS X6, X5

#define KERNEL_4x8 \
	MOVUPS X5, X7 \
	MOVUPS X6, X8 \
	MOVUPS X5, X9 \
	MOVUPS X6, X10 \
	MOVUPS X5, X11 \
	MOVUPS X6, X12 \
	MULPS X1, X5 \
	MULPS X1, X6 \
	MULPS X2, X7 \
	MULPS X2, X8 \
	MULPS X3, X9 \
	MULPS X3, X10 \
	MULPS X4, X11 \
	MULPS X4, X12

#define STORE_4x8 \
	MOVUPS ALPHA, ALPHA_SPILL \
	MOVUPS (A_PTR), X13 \
	ADDPS X13, X5 \
	MOVUPS 4*SIZE(A_PTR), X14 \
	ADDPS X14, X6 \
	MOVUPS (A_PTR)(LDA*1), X15 \
	ADDPS X15, X7 \
	MOVUPS 4*SIZE(A_PTR)(LDA*1), X0 \
	ADDPS X0, X8 \
	MOVUPS (A_PTR)(LDA*2), X13 \
	ADDPS X13, X9 \
	MOVUPS 4*SIZE(A_PTR)(LDA*2), X14 \
	ADDPS X14, X10 \
	MOVUPS (A_PTR)(LDA3*1), X15 \
	ADDPS X15, X11 \
	MOVUPS 4*SIZE(A_PTR)(LDA3*1), X0 \
	ADDPS X0, X12 \
	MOVUPS X5, (A_PTR) \
	MOVUPS X6, 4*SIZE(A_PTR) \
	MOVUPS X7, (A_PTR)(LDA*1) \
	MOVUPS X8, 4*SIZE(A_PTR)(LDA*1) \
	MOVUPS X9, (A_PTR)(LDA*2) \
	MOVUPS X10, 4*SIZE(A_PTR)(LDA*2) \
	MOVUPS X11, (A_PTR)(LDA3*1) \
	MOVUPS X12, 4*SIZE(A_PTR)(LDA3*1) \
	MOVUPS ALPHA_SPILL, ALPHA \
	ADDQ $8*SIZE, A_PTR

#define KERNEL_4x4 \
	MOVUPS X5, X6 \
	MOVUPS X5, X7 \
	MOVUPS X5, X8 \
	MULPS X1, X5 \
	MULPS X2, X6 \
	MULPS X3, X7 \
	MULPS X4, X8

#define STORE_4x4 \
	MOVUPS (A_PTR), X13 \
	ADDPS X13, X5 \
	MOVUPS (A_PTR)(LDA*1), X14 \
	ADDPS X14, X6 \
	MOVUPS (A_PTR)(LDA*2), X15 \
	ADDPS X15, X7 \
	MOVUPS (A_PTR)(LDA3*1), X13 \
	ADDPS X13, X8 \
	MOVUPS X5, (A_PTR) \
	MOVUPS X6, (A_PTR)(LDA*1) \
	MOVUPS X7, (A_PTR)(LDA*2) \
	MOVUPS X8, (A_PTR)(LDA3*1) \
	ADDQ $4*SIZE, A_PTR

#define KERNEL_4x2 \
	MOVUPS X5, X6 \
	MOVUPS X5, X7 \
	MOVUPS X5, X8 \
	MULPS X1, X5 \
	MULPS X2, X6 \
	MULPS X3, X7 \
	MULPS X4, X8

#define STORE_4x2 \
	MOVSD (A_PTR), X9 \
	ADDPS X9, X5 \
	MOVSD (A_PTR)(LDA*1), X10 \
	ADDPS X10, X6 \
	MOVSD (A_PTR)(LDA*2), X11 \
	ADDPS X11, X7 \
	MOVSD (A_PTR)(LDA3*1), X12 \
	ADDPS X12, X8 \
	MOVSD X5, (A_PTR) \
	MOVSD X6, (A_PTR)(LDA*1) \
	MOVSD X7, (A_PTR)(LDA*2) \
	MOVSD X8, (A_PTR)(LDA3*1) \
	ADDQ $2*SIZE, A_PTR

#define KERNEL_4x1 \
	MOVSS (Y_PTR), X5 \
	MOVSS X5, X6 \
	MOVSS X5, X7 \
	MOVSS X5, X8 \
	MULSS X1, X5 \
	MULSS X2, X6 \
	MULSS X3, X7 \
	MULSS X4, X8

#define STORE_4x1 \
	ADDSS (A_PTR), X5 \
	ADDSS (A_PTR)(LDA*1), X6 \
	ADDSS (A_PTR)(LDA*2), X7 \
	ADDSS (A_PTR)(LDA3*1), X8 \
	MOVSS X5, (A_PTR) \
	MOVSS X6, (A_PTR)(LDA*1) \
	MOVSS X7, (A_PTR)(LDA*2) \
	MOVSS X8, (A_PTR)(LDA3*1) \
	ADDQ $SIZE, A_PTR

#define KERNEL_2x8 \
	MOVUPS X5, X7 \
	MOVUPS X6, X8 \
	MULPS X1, X5 \
	MULPS X1, X6 \
	MULPS X2, X7 \
	MULPS X2, X8

#define STORE_2x8 \
	MOVUPS (A_PTR), X9 \
	ADDPS X9, X5 \
	MOVUPS 4*SIZE(A_PTR), X10 \
	ADDPS X10, X6 \
	MOVUPS (A_PTR)(LDA*1), X11 \
	ADDPS X11, X7 \
	MOVUPS 4*SIZE(A_PTR)(LDA*1), X12 \
	ADDPS X12, X8 \
	MOVUPS X5, (A_PTR) \
	MOVUPS X6, 4*SIZE(A_PTR) \
	MOVUPS X7, (A_PTR)(LDA*1) \
	MOVUPS X8, 4*SIZE(A_PTR)(LDA*1) \
	ADDQ $8*SIZE, A_PTR

#define KERNEL_2x4 \
	MOVUPS X5, X6 \
	MULPS X1, X5 \
	MULPS X2, X6

#define STORE_2x4 \
	MOVUPS (A_PTR), X9 \
	ADDPS X9, X5 \
	MOVUPS (A_PTR)(LDA*1), X11 \
	ADDPS X11, X6 \
	MOVUPS X5, (A_PTR) \
	MOVUPS X6, (A_PTR)(LDA*1) \
	ADDQ $4*SIZE, A_PTR

#define KERNEL_2x2 \
	MOVSD X5, X6 \
	MULPS X1, X5 \
	MULPS X2, X6

#define STORE_2x2 \
	MOVSD (A_PTR), X7 \
	ADDPS X7, X5 \
	MOVSD (A_PTR)(LDA*1), X8 \
	ADDPS X8, X6 \
	MOVSD X5, (A_PTR) \
	MOVSD X6, (A_PTR)(LDA*1) \
	ADDQ $2*SIZE, A_PTR

#define KERNEL_2x1 \
	MOVSS (Y_PTR), X5 \
	MOVSS X5, X6 \
	MULSS X1, X5 \
	MULSS X2, X6

#define STORE_2x1 \
	ADDSS (A_PTR), X5 \
	ADDSS (A_PTR)(LDA*1), X6 \
	MOVSS X5, (A_PTR) \
	MOVSS X6, (A_PTR)(LDA*1) \
	ADDQ $SIZE, A_PTR

#define KERNEL_1x8 \
	MULPS X1, X5 \
	MULPS X1, X6

#define STORE_1x8 \
	MOVUPS (A_PTR), X7 \
	ADDPS X7, X5 \
	MOVUPS 4*SIZE(A_PTR), X8 \
	ADDPS X8, X6 \
	MOVUPS X5, (A_PTR) \
	MOVUPS X6, 4*SIZE(A_PTR) \
	ADDQ $8*SIZE, A_PTR

#define KERNEL_1x4 \
	MULPS X1, X5 \
	MULPS X1, X6

#define STORE_1x4 \
	MOVUPS (A_PTR), X7 \
	ADDPS X7, X5 \
	MOVUPS X5, (A_PTR) \
	ADDQ $4*SIZE, A_PTR

#define KERNEL_1x2 \
	MULPS X1, X5

#define STORE_1x2 \
	MOVSD (A_PTR), X6 \
	ADDPS X6, X5 \
	MOVSD X5, (A_PTR) \
	ADDQ $2*SIZE, A_PTR

#define KERNEL_1x1 \
	MOVSS (Y_PTR), X5 \
	MULSS X1, X5

#define STORE_1x1 \
	ADDSS (A_PTR), X5 \
	MOVSS X5, (A_PTR) \
	ADDQ $SIZE, A_PTR

// Ger performs the rank-one update
//	A += alpha * x * yᵀ
// where A is an m×n dense matrix with leading dimension lda, x and y are
// vectors, and alpha is a scalar.
//
// func Ger(m, n uintptr, alpha float32,
//	x []float32, incX uintptr,
//	y []float32, incY uintptr,
//	a []float32, lda uintptr)
TEXT ·Ger(SB), 0, $16-120
	MOVQ M_DIM, M
	MOVQ N_DIM, N
	CMPQ M, $0
	JE end
	CMPQ N, $0
	JE end

	LOAD_ALPHA

	MOVQ x_base+24(FP), X_PTR
	MOVQ y_base+56(FP), Y_PTR
	MOVQ a_base+88(FP), A_ROW
	MOVQ A_ROW, A_PTR
	MOVQ lda+112(FP), LDA   // LDA = LDA * sizeof(float32)
	SHLQ $BITSIZE, LDA
	LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3

	CMPQ incY+80(FP), $1 // Check for dense vector Y (fast-path)
	JNE inc
	CMPQ incX+48(FP), $1 // Check for dense vector X (fast-path)
	JNE inc

	SHRQ $2, M
	JZ r2

r4:

	// LOAD 4
	LOAD_SCALED4

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ r4c4

r4c8:
	// 4x8 KERNEL
	KERNEL_LOAD8
	KERNEL_4x8
	STORE_4x8

	ADDQ $8*SIZE, Y_PTR

	DECQ N
	JNZ r4c8

r4c4:
	TESTQ $4, N_DIM
	JZ r4c2

	// 4x4 KERNEL
	KERNEL_LOAD4
	KERNEL_4x4
	STORE_4x4

	ADDQ $4*SIZE, Y_PTR

r4c2:
	TESTQ $2, N_DIM
	JZ r4c1

	// 4x2 KERNEL
	KERNEL_LOAD2
	KERNEL_4x2
	STORE_4x2

	ADDQ $2*SIZE, Y_PTR

r4c1:
	TESTQ $1, N_DIM
	JZ r4end

	// 4x1 KERNEL
	KERNEL_4x1
	STORE_4x1

	ADDQ $SIZE, Y_PTR

r4end:
	ADDQ $4*SIZE, X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ M
	JNZ r4

r2:
	TESTQ $2, M_DIM
	JZ r1

	// LOAD 2
	LOAD_SCALED2

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ r2c4

r2c8:
	// 2x8 KERNEL
	KERNEL_LOAD8
	KERNEL_2x8
	STORE_2x8

	ADDQ $8*SIZE, Y_PTR

	DECQ N
	JNZ r2c8

r2c4:
	TESTQ $4, N_DIM
	JZ r2c2

	// 2x4 KERNEL
	KERNEL_LOAD4
	KERNEL_2x4
	STORE_2x4

	ADDQ $4*SIZE, Y_PTR

r2c2:
	TESTQ $2, N_DIM
	JZ r2c1

	// 2x2 KERNEL
	KERNEL_LOAD2
	KERNEL_2x2
	STORE_2x2

	ADDQ $2*SIZE, Y_PTR

r2c1:
	TESTQ $1, N_DIM
	JZ r2end

	// 2x1 KERNEL
	KERNEL_2x1
	STORE_2x1

	ADDQ $SIZE, Y_PTR

r2end:
	ADDQ $2*SIZE, X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

r1:
	TESTQ $1, M_DIM
	JZ end

	// LOAD 1
	LOAD_SCALED1

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ r1c4

r1c8:
	// 1x8 KERNEL
	KERNEL_LOAD8
	KERNEL_1x8
	STORE_1x8

	ADDQ $8*SIZE, Y_PTR

	DECQ N
	JNZ r1c8

r1c4:
	TESTQ $4, N_DIM
	JZ r1c2

	// 1x4 KERNEL
	KERNEL_LOAD4
	KERNEL_1x4
	STORE_1x4

	ADDQ $4*SIZE, Y_PTR

r1c2:
	TESTQ $2, N_DIM
	JZ r1c1

	// 1x2 KERNEL
	KERNEL_LOAD2
	KERNEL_1x2
	STORE_1x2

	ADDQ $2*SIZE, Y_PTR

r1c1:
	TESTQ $1, N_DIM
	JZ end

	// 1x1 KERNEL
	KERNEL_1x1
	STORE_1x1

end:
	RET

inc:  // Algorithm for incX != 1 || incY != 1 ( split loads in kernel )

	MOVQ incX+48(FP), INC_X       // INC_X = incX * sizeof(float32)
	SHLQ $BITSIZE, INC_X
	MOVQ incY+80(FP), INC_Y       // INC_Y = incY * sizeof(float32)
	SHLQ $BITSIZE, INC_Y
	LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3
	LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3

	// If incX is negative, move X_PTR to the far end of x so iteration
	// walks back toward the start.
	XORQ TMP2, TMP2
	MOVQ M, TMP1
	SUBQ $1, TMP1
	IMULQ INC_X, TMP1
	NEGQ TMP1
	CMPQ INC_X, $0
	CMOVQLT TMP1, TMP2
	LEAQ (X_PTR)(TMP2*SIZE), X_PTR

	// Likewise for Y_PTR when incY is negative.
	XORQ TMP2, TMP2
	MOVQ N, TMP1
	SUBQ $1, TMP1
	IMULQ INC_Y, TMP1
	NEGQ TMP1
	CMPQ INC_Y, $0
	CMOVQLT TMP1, TMP2
	LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR

	SHRQ $2, M
	JZ inc_r2

inc_r4:
	// LOAD 4
	LOAD_SCALED4_INC

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ inc_r4c4

inc_r4c8:
	// 4x8 KERNEL
	KERNEL_LOAD8_INC
	KERNEL_4x8
	STORE_4x8

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ N
	JNZ inc_r4c8

inc_r4c4:
	TESTQ $4, N_DIM
	JZ inc_r4c2

	// 4x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_4x4
	STORE_4x4

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR

inc_r4c2:
	TESTQ $2, N_DIM
	JZ inc_r4c1

	// 4x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_4x2
	STORE_4x2

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_r4c1:
	TESTQ $1, N_DIM
	JZ inc_r4end

	// 4x1 KERNEL
	KERNEL_4x1
	STORE_4x1

	ADDQ INC_Y, Y_PTR

inc_r4end:
	LEAQ (X_PTR)(INC_X*4), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ M
	JNZ inc_r4

inc_r2:
	TESTQ $2, M_DIM
	JZ inc_r1

	// LOAD 2
	LOAD_SCALED2_INC

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ inc_r2c4

inc_r2c8:
	// 2x8 KERNEL
	KERNEL_LOAD8_INC
	KERNEL_2x8
	STORE_2x8

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ N
	JNZ inc_r2c8

inc_r2c4:
	TESTQ $4, N_DIM
	JZ inc_r2c2

	// 2x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_2x4
	STORE_2x4

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR

inc_r2c2:
	TESTQ $2, N_DIM
	JZ inc_r2c1

	// 2x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_2x2
	STORE_2x2

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_r2c1:
	TESTQ $1, N_DIM
	JZ inc_r2end

	// 2x1 KERNEL
	KERNEL_2x1
	STORE_2x1

	ADDQ INC_Y, Y_PTR

inc_r2end:
	LEAQ (X_PTR)(INC_X*2), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

inc_r1:
	TESTQ $1, M_DIM
	JZ end

	// LOAD 1
	LOAD_SCALED1

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ inc_r1c4

inc_r1c8:
	// 1x8 KERNEL
	KERNEL_LOAD8_INC
	KERNEL_1x8
	STORE_1x8

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ N
	JNZ inc_r1c8

inc_r1c4:
	TESTQ $4, N_DIM
	JZ inc_r1c2

	// 1x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_1x4
	STORE_1x4

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR

inc_r1c2:
	TESTQ $2, N_DIM
	JZ inc_r1c1

	// 1x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_1x2
	STORE_1x2

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_r1c1:
	TESTQ $1, N_DIM
	JZ inc_end

	// 1x1 KERNEL
	KERNEL_1x1
	STORE_1x1

inc_end:
	RET
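
// For reference when reading the kernels above: ignoring the 4x8/4x4/... row
// and column blocking and the negative-increment pointer adjustment, the
// update computed by Ger is equivalent to the rough Go sketch below. This is
// a reading aid only (assuming positive increments), not the package's actual
// pure-Go fallback.
//
//	for i := uintptr(0); i < m; i++ {
//		tmp := alpha * x[i*incX]
//		for j := uintptr(0); j < n; j++ {
//			a[i*lda+j] += tmp * y[j*incY]
//		}
//	}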