gonum.org/v1/gonum@v0.14.0/internal/asm/f64/ger_amd64.s

// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

#define SIZE 8

#define M_DIM m+0(FP)
#define M CX
#define N_DIM n+8(FP)
#define N BX

#define TMP1 R14
#define TMP2 R15

#define X_PTR SI
#define Y y_base+56(FP)
#define Y_PTR DX
#define A_ROW AX
#define A_PTR DI

#define INC_X R8
#define INC3_X R9

#define INC_Y R10
#define INC3_Y R11

#define LDA R12
#define LDA3 R13

#define ALPHA X0

#define LOAD4 \
	PREFETCHNTA (X_PTR)(INC_X*8)      \
	MOVDDUP     (X_PTR), X1           \
	MOVDDUP     (X_PTR)(INC_X*1), X2  \
	MOVDDUP     (X_PTR)(INC_X*2), X3  \
	MOVDDUP     (X_PTR)(INC3_X*1), X4 \
	MULPD       ALPHA, X1             \
	MULPD       ALPHA, X2             \
	MULPD       ALPHA, X3             \
	MULPD       ALPHA, X4

#define LOAD2 \
	MOVDDUP (X_PTR), X1          \
	MOVDDUP (X_PTR)(INC_X*1), X2 \
	MULPD   ALPHA, X1            \
	MULPD   ALPHA, X2

#define LOAD1 \
	MOVDDUP (X_PTR), X1 \
	MULPD   ALPHA, X1

#define KERNEL_LOAD4 \
	MOVUPS (Y_PTR), X5       \
	MOVUPS 2*SIZE(Y_PTR), X6

#define KERNEL_LOAD4_INC \
	MOVLPD (Y_PTR), X5           \
	MOVHPD (Y_PTR)(INC_Y*1), X5  \
	MOVLPD (Y_PTR)(INC_Y*2), X6  \
	MOVHPD (Y_PTR)(INC3_Y*1), X6

#define KERNEL_LOAD2 \
	MOVUPS (Y_PTR), X5

#define KERNEL_LOAD2_INC \
	MOVLPD (Y_PTR), X5          \
	MOVHPD (Y_PTR)(INC_Y*1), X5

#define KERNEL_4x4 \
	MOVUPS X5, X7  \
	MOVUPS X6, X8  \
	MOVUPS X5, X9  \
	MOVUPS X6, X10 \
	MOVUPS X5, X11 \
	MOVUPS X6, X12 \
	MULPD  X1, X5  \
	MULPD  X1, X6  \
	MULPD  X2, X7  \
	MULPD  X2, X8  \
	MULPD  X3, X9  \
	MULPD  X3, X10 \
	MULPD  X4, X11 \
	MULPD  X4, X12

#define STORE_4x4 \
	MOVUPS (A_PTR), X13               \
	ADDPD  X13, X5                    \
	MOVUPS 2*SIZE(A_PTR), X14         \
	ADDPD  X14, X6                    \
	MOVUPS (A_PTR)(LDA*1), X15        \
	ADDPD  X15, X7                    \
	MOVUPS 2*SIZE(A_PTR)(LDA*1), X0   \
	ADDPD  X0, X8                     \
	MOVUPS (A_PTR)(LDA*2), X13        \
	ADDPD  X13, X9                    \
	MOVUPS 2*SIZE(A_PTR)(LDA*2), X14  \
	ADDPD  X14, X10                   \
	MOVUPS (A_PTR)(LDA3*1), X15       \
	ADDPD  X15, X11                   \
	MOVUPS 2*SIZE(A_PTR)(LDA3*1), X0  \
	ADDPD  X0, X12                    \
	MOVUPS X5, (A_PTR)                \
	MOVUPS X6, 2*SIZE(A_PTR)          \
	MOVUPS X7, (A_PTR)(LDA*1)         \
	MOVUPS X8, 2*SIZE(A_PTR)(LDA*1)   \
	MOVUPS X9, (A_PTR)(LDA*2)         \
	MOVUPS X10, 2*SIZE(A_PTR)(LDA*2)  \
	MOVUPS X11, (A_PTR)(LDA3*1)       \
	MOVUPS X12, 2*SIZE(A_PTR)(LDA3*1) \
	ADDQ   $4*SIZE, A_PTR

#define KERNEL_4x2 \
	MOVUPS X5, X6 \
	MOVUPS X5, X7 \
	MOVUPS X5, X8 \
	MULPD  X1, X5 \
	MULPD  X2, X6 \
	MULPD  X3, X7 \
	MULPD  X4, X8

#define STORE_4x2 \
	MOVUPS (A_PTR), X9          \
	ADDPD  X9, X5               \
	MOVUPS (A_PTR)(LDA*1), X10  \
	ADDPD  X10, X6              \
	MOVUPS (A_PTR)(LDA*2), X11  \
	ADDPD  X11, X7              \
	MOVUPS (A_PTR)(LDA3*1), X12 \
	ADDPD  X12, X8              \
	MOVUPS X5, (A_PTR)          \
	MOVUPS X6, (A_PTR)(LDA*1)   \
	MOVUPS X7, (A_PTR)(LDA*2)   \
	MOVUPS X8, (A_PTR)(LDA3*1)  \
	ADDQ   $2*SIZE, A_PTR

#define KERNEL_4x1 \
	MOVSD (Y_PTR), X5 \
	MOVSD X5, X6      \
	MOVSD X5, X7      \
	MOVSD X5, X8      \
	MULSD X1, X5      \
	MULSD X2, X6      \
	MULSD X3, X7      \
	MULSD X4, X8

#define STORE_4x1 \
	ADDSD (A_PTR), X5         \
	ADDSD (A_PTR)(LDA*1), X6  \
	ADDSD (A_PTR)(LDA*2), X7  \
	ADDSD (A_PTR)(LDA3*1), X8 \
	MOVSD X5, (A_PTR)         \
	MOVSD X6, (A_PTR)(LDA*1)  \
	MOVSD X7, (A_PTR)(LDA*2)  \
	MOVSD X8, (A_PTR)(LDA3*1) \
	ADDQ  $SIZE, A_PTR
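// The 2-row and 1-row macros below follow the same pattern as the 4-row
// macros above: LOADn broadcasts n strided elements of x into X1..Xn and
// pre-multiplies each by ALPHA, KERNEL_RxC multiplies those broadcasts by C
// elements of y, and STORE_RxC accumulates the products into an RxC block
// of A before advancing A_PTR by C elements.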
#define KERNEL_2x4 \
	MOVUPS X5, X7 \
	MOVUPS X6, X8 \
	MULPD  X1, X5 \
	MULPD  X1, X6 \
	MULPD  X2, X7 \
	MULPD  X2, X8

#define STORE_2x4 \
	MOVUPS (A_PTR), X9               \
	ADDPD  X9, X5                    \
	MOVUPS 2*SIZE(A_PTR), X10        \
	ADDPD  X10, X6                   \
	MOVUPS (A_PTR)(LDA*1), X11       \
	ADDPD  X11, X7                   \
	MOVUPS 2*SIZE(A_PTR)(LDA*1), X12 \
	ADDPD  X12, X8                   \
	MOVUPS X5, (A_PTR)               \
	MOVUPS X6, 2*SIZE(A_PTR)         \
	MOVUPS X7, (A_PTR)(LDA*1)        \
	MOVUPS X8, 2*SIZE(A_PTR)(LDA*1)  \
	ADDQ   $4*SIZE, A_PTR

#define KERNEL_2x2 \
	MOVUPS X5, X6 \
	MULPD  X1, X5 \
	MULPD  X2, X6

#define STORE_2x2 \
	MOVUPS (A_PTR), X7        \
	ADDPD  X7, X5             \
	MOVUPS (A_PTR)(LDA*1), X8 \
	ADDPD  X8, X6             \
	MOVUPS X5, (A_PTR)        \
	MOVUPS X6, (A_PTR)(LDA*1) \
	ADDQ   $2*SIZE, A_PTR

#define KERNEL_2x1 \
	MOVSD (Y_PTR), X5 \
	MOVSD X5, X6      \
	MULSD X1, X5      \
	MULSD X2, X6

#define STORE_2x1 \
	ADDSD (A_PTR), X5        \
	ADDSD (A_PTR)(LDA*1), X6 \
	MOVSD X5, (A_PTR)        \
	MOVSD X6, (A_PTR)(LDA*1) \
	ADDQ  $SIZE, A_PTR

#define KERNEL_1x4 \
	MULPD X1, X5 \
	MULPD X1, X6

#define STORE_1x4 \
	MOVUPS (A_PTR), X7       \
	ADDPD  X7, X5            \
	MOVUPS 2*SIZE(A_PTR), X8 \
	ADDPD  X8, X6            \
	MOVUPS X5, (A_PTR)       \
	MOVUPS X6, 2*SIZE(A_PTR) \
	ADDQ   $4*SIZE, A_PTR

#define KERNEL_1x2 \
	MULPD X1, X5

#define STORE_1x2 \
	MOVUPS (A_PTR), X6 \
	ADDPD  X6, X5      \
	MOVUPS X5, (A_PTR) \
	ADDQ   $2*SIZE, A_PTR

#define KERNEL_1x1 \
	MOVSD (Y_PTR), X5 \
	MULSD X1, X5

#define STORE_1x1 \
	ADDSD (A_PTR), X5 \
	MOVSD X5, (A_PTR) \
	ADDQ  $SIZE, A_PTR

// func Ger(m, n uintptr, alpha float64,
//	x []float64, incX uintptr,
//	y []float64, incY uintptr,
//	a []float64, lda uintptr)
TEXT ·Ger(SB), NOSPLIT, $0
	MOVQ M_DIM, M
	MOVQ N_DIM, N
	CMPQ M, $0
	JE   end
	CMPQ N, $0
	JE   end

	MOVDDUP alpha+16(FP), ALPHA

	MOVQ x_base+24(FP), X_PTR
	MOVQ y_base+56(FP), Y_PTR
	MOVQ a_base+88(FP), A_ROW
	MOVQ incX+48(FP), INC_X       // INC_X = incX * sizeof(float64)
	SHLQ $3, INC_X
	MOVQ lda+112(FP), LDA         // LDA = LDA * sizeof(float64)
	SHLQ $3, LDA
	LEAQ (LDA)(LDA*2), LDA3       // LDA3 = LDA * 3
	LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3
	MOVQ A_ROW, A_PTR

	XORQ    TMP2, TMP2
	MOVQ    M, TMP1
	SUBQ    $1, TMP1
	IMULQ   INC_X, TMP1
	NEGQ    TMP1
	CMPQ    INC_X, $0
	CMOVQLT TMP1, TMP2
	LEAQ    (X_PTR)(TMP2*SIZE), X_PTR

	CMPQ incY+80(FP), $1 // Check for dense vector Y (fast-path)
	JG   inc
	JL   end

	SHRQ $2, M
	JZ   r2

r4:
	// LOAD 4
	LOAD4

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ   r4c2

r4c4:
	// 4x4 KERNEL
	KERNEL_LOAD4
	KERNEL_4x4
	STORE_4x4

	ADDQ $4*SIZE, Y_PTR

	DECQ N
	JNZ  r4c4

	// Reload ALPHA after it's clobbered by STORE_4x4
	MOVDDUP alpha+16(FP), ALPHA

r4c2:
	TESTQ $2, N_DIM
	JZ    r4c1

	// 4x2 KERNEL
	KERNEL_LOAD2
	KERNEL_4x2
	STORE_4x2

	ADDQ $2*SIZE, Y_PTR

r4c1:
	TESTQ $1, N_DIM
	JZ    r4end

	// 4x1 KERNEL
	KERNEL_4x1
	STORE_4x1

	ADDQ $SIZE, Y_PTR

r4end:
	LEAQ (X_PTR)(INC_X*4), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ M
	JNZ  r4

r2:
	TESTQ $2, M_DIM
	JZ    r1

	// LOAD 2
	LOAD2

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ   r2c2

r2c4:
	// 2x4 KERNEL
	KERNEL_LOAD4
	KERNEL_2x4
	STORE_2x4

	ADDQ $4*SIZE, Y_PTR

	DECQ N
	JNZ  r2c4

r2c2:
	TESTQ $2, N_DIM
	JZ    r2c1

	// 2x2 KERNEL
	KERNEL_LOAD2
	KERNEL_2x2
	STORE_2x2

	ADDQ $2*SIZE, Y_PTR

r2c1:
	TESTQ $1, N_DIM
	JZ    r2end

	// 2x1 KERNEL
	KERNEL_2x1
	STORE_2x1

	ADDQ $SIZE, Y_PTR

r2end:
	LEAQ (X_PTR)(INC_X*2), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

r1:
	TESTQ $1, M_DIM
	JZ    end

	// LOAD 1
	LOAD1

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ   r1c2

r1c4:
	// 1x4 KERNEL
	KERNEL_LOAD4
	KERNEL_1x4
	STORE_1x4

	ADDQ $4*SIZE, Y_PTR

	DECQ N
	JNZ  r1c4

r1c2:
	TESTQ $2, N_DIM
	JZ    r1c1

	// 1x2 KERNEL
	KERNEL_LOAD2
	KERNEL_1x2
	STORE_1x2

	ADDQ $2*SIZE, Y_PTR

r1c1:
	TESTQ $1, N_DIM
	JZ    end

	// 1x1 KERNEL
	KERNEL_1x1
	STORE_1x1

	ADDQ $SIZE, Y_PTR

end:
	RET
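// The non-unit incY path below cannot read y with packed MOVUPS loads, so
// the KERNEL_LOAD*_INC variants gather strided elements instead: MOVLPD and
// MOVHPD fill the low and high halves of X5 (and X6), after which the same
// packed KERNEL_RxC and STORE_RxC macros run unchanged.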
inc:  // Algorithm for incY != 1 (split loads in kernel)

	MOVQ incY+80(FP), INC_Y       // INC_Y = incY * sizeof(float64)
	SHLQ $3, INC_Y
	LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3

	XORQ    TMP2, TMP2
	MOVQ    N, TMP1
	SUBQ    $1, TMP1
	IMULQ   INC_Y, TMP1
	NEGQ    TMP1
	CMPQ    INC_Y, $0
	CMOVQLT TMP1, TMP2
	LEAQ    (Y_PTR)(TMP2*SIZE), Y_PTR

	SHRQ $2, M
	JZ   inc_r2

inc_r4:
	// LOAD 4
	LOAD4

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ   inc_r4c2

inc_r4c4:
	// 4x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_4x4
	STORE_4x4

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ N
	JNZ  inc_r4c4

	// Reload ALPHA after it's clobbered by STORE_4x4
	MOVDDUP alpha+16(FP), ALPHA

inc_r4c2:
	TESTQ $2, N_DIM
	JZ    inc_r4c1

	// 4x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_4x2
	STORE_4x2

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_r4c1:
	TESTQ $1, N_DIM
	JZ    inc_r4end

	// 4x1 KERNEL
	KERNEL_4x1
	STORE_4x1

	ADDQ INC_Y, Y_PTR

inc_r4end:
	LEAQ (X_PTR)(INC_X*4), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ M
	JNZ  inc_r4

inc_r2:
	TESTQ $2, M_DIM
	JZ    inc_r1

	// LOAD 2
	LOAD2

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ   inc_r2c2

inc_r2c4:
	// 2x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_2x4
	STORE_2x4

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ N
	JNZ  inc_r2c4

inc_r2c2:
	TESTQ $2, N_DIM
	JZ    inc_r2c1

	// 2x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_2x2
	STORE_2x2

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_r2c1:
	TESTQ $1, N_DIM
	JZ    inc_r2end

	// 2x1 KERNEL
	KERNEL_2x1
	STORE_2x1

	ADDQ INC_Y, Y_PTR

inc_r2end:
	LEAQ (X_PTR)(INC_X*2), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

inc_r1:
	TESTQ $1, M_DIM
	JZ    end

	// LOAD 1
	LOAD1

	MOVQ N_DIM, N
	SHRQ $2, N
	JZ   inc_r1c2

inc_r1c4:
	// 1x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_1x4
	STORE_1x4

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ N
	JNZ  inc_r1c4

inc_r1c2:
	TESTQ $2, N_DIM
	JZ    inc_r1c1

	// 1x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_1x2
	STORE_1x2

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_r1c1:
	TESTQ $1, N_DIM
	JZ    end

	// 1x1 KERNEL
	KERNEL_1x1
	STORE_1x1

	ADDQ INC_Y, Y_PTR

inc_end:
	RET
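// For reference, the routine above implements the BLAS rank-one update
//	A += alpha * x * yᵀ
// on a row-major m×n matrix with leading dimension lda. A pure-Go sketch of
// the same semantics, assuming positive increments (negative increments are
// handled above by pre-adjusting the base pointers in the CMOVQLT blocks), is:
//
//	for i := uintptr(0); i < m; i++ {
//		axi := alpha * x[i*incX] // the broadcast value, cf. LOADn
//		for j := uintptr(0); j < n; j++ {
//			a[i*lda+j] += axi * y[j*incY]
//		}
//	}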