github.com/egonelbre/exp@v0.0.0-20240430123955-ed1d3aa93911/vector/compare/axpy_amd64.s

// Code generated by command: go run main.go -out axpy_amd64.s -stubs axpy_amd64.go -testhelp axpy_stub_amd64_test.go. DO NOT EDIT.

#include "textflag.h"

// func AmdAxpyPointer_V0A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointer_V0A0(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	SHLQ  $0x02, SI
	IMULQ CX, SI
	ADDQ  AX, SI
	JMP   check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ  (AX)(CX*4), AX
	LEAQ  (DX)(BX*4), DX

check_limit:
	CMPQ SI, AX
	JHI  loop
	RET

// func AmdAxpyPointer_V1A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointer_V1A0(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	SHLQ  $0x02, SI
	IMULQ CX, SI
	ADDQ  AX, SI
	JMP   check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ  (AX)(CX*4), AX
	LEAQ  (DX)(BX*4), DX

check_limit:
	CMPQ SI, AX
	JHI  loop
	RET

// func AmdAxpyPointer_V2A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointer_V2A0(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	SHLQ  $0x02, SI
	IMULQ CX, SI
	ADDQ  AX, SI
	JMP   check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ  (AX)(CX*4), AX
	LEAQ  (DX)(BX*4), DX

check_limit:
	CMPQ SI, AX
	JHI  loop
	RET

// func AmdAxpyPointer_V3A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointer_V3A0(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	SHLQ  $0x02, SI
	IMULQ CX, SI
	ADDQ  AX, SI
	JMP   check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ  (AX)(CX*4), AX
	LEAQ  (DX)(BX*4), DX

check_limit:
	CMPQ SI, AX
	JHI  loop
	RET

// func AmdAxpyPointer_V4A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointer_V4A0(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	SHLQ  $0x02, SI
	IMULQ CX, SI
	ADDQ  AX, SI
	JMP   check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ  (AX)(CX*4), AX
	LEAQ  (DX)(BX*4), DX

check_limit:
	CMPQ SI, AX
	JHI  loop
	RET

// func AmdAxpyPointer_V5A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointer_V5A0(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	SHLQ  $0x02, SI
	IMULQ CX, SI
	ADDQ  AX, SI
	JMP   check_limit

loop:
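	// Loop body: one strided AXPY step, ys[i] += alpha * xs[i], after which both
	// pointers advance by their element strides (4 bytes * incx / incy).
	//
	// A rough Go equivalent of what every kernel in this file computes
	// (illustrative sketch only; axpyRef is not part of the generated API,
	// and the real stubs take *float32 pointers rather than slices):
	//
	//	func axpyRef(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
	//		for i, xi, yi := uintptr(0), uintptr(0), uintptr(0); i < n; i, xi, yi = i+1, xi+incx, yi+incy {
	//			ys[yi] += alpha * xs[xi]
	//		}
	//	}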
MOVSS (AX), X1 156 MULSS X0, X1 157 ADDSS (DX), X1 158 MOVSS X1, (DX) 159 LEAQ (AX)(CX*4), AX 160 LEAQ (DX)(BX*4), DX 161 162 check_limit: 163 CMPQ SI, AX 164 JHI loop 165 RET 166 167 // func AmdAxpyPointer_V0A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 168 // Requires: SSE 169 TEXT ·AmdAxpyPointer_V0A8(SB), NOSPLIT, $0-48 170 MOVSS alpha+0(FP), X0 171 MOVQ xs+8(FP), AX 172 MOVQ incx+16(FP), CX 173 MOVQ ys+24(FP), DX 174 MOVQ incy+32(FP), BX 175 MOVQ n+40(FP), SI 176 SHLQ $0x02, SI 177 IMULQ CX, SI 178 ADDQ AX, SI 179 JMP check_limit 180 PCALIGN $0x08 181 182 loop: 183 MOVSS (AX), X1 184 MULSS X0, X1 185 ADDSS (DX), X1 186 MOVSS X1, (DX) 187 LEAQ (AX)(CX*4), AX 188 LEAQ (DX)(BX*4), DX 189 190 check_limit: 191 CMPQ SI, AX 192 JHI loop 193 RET 194 195 // func AmdAxpyPointer_V1A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 196 // Requires: SSE 197 TEXT ·AmdAxpyPointer_V1A8(SB), NOSPLIT, $0-48 198 MOVSS alpha+0(FP), X0 199 MOVQ xs+8(FP), AX 200 MOVQ incx+16(FP), CX 201 MOVQ ys+24(FP), DX 202 MOVQ incy+32(FP), BX 203 MOVQ n+40(FP), SI 204 SHLQ $0x02, SI 205 IMULQ CX, SI 206 ADDQ AX, SI 207 JMP check_limit 208 PCALIGN $0x08 209 210 loop: 211 MOVSS (AX), X1 212 MULSS X0, X1 213 ADDSS (DX), X1 214 MOVSS X1, (DX) 215 LEAQ (AX)(CX*4), AX 216 LEAQ (DX)(BX*4), DX 217 218 check_limit: 219 CMPQ SI, AX 220 JHI loop 221 RET 222 223 // func AmdAxpyPointer_V2A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 224 // Requires: SSE 225 TEXT ·AmdAxpyPointer_V2A8(SB), NOSPLIT, $0-48 226 MOVSS alpha+0(FP), X0 227 MOVQ xs+8(FP), AX 228 MOVQ incx+16(FP), CX 229 MOVQ ys+24(FP), DX 230 MOVQ incy+32(FP), BX 231 MOVQ n+40(FP), SI 232 SHLQ $0x02, SI 233 IMULQ CX, SI 234 ADDQ AX, SI 235 JMP check_limit 236 PCALIGN $0x08 237 238 loop: 239 MOVSS (AX), X1 240 MULSS X0, X1 241 ADDSS (DX), X1 242 MOVSS X1, (DX) 243 LEAQ (AX)(CX*4), AX 244 LEAQ (DX)(BX*4), DX 245 246 check_limit: 247 CMPQ SI, AX 248 JHI loop 249 RET 250 251 // func AmdAxpyPointer_V3A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 252 // Requires: SSE 253 TEXT ·AmdAxpyPointer_V3A8(SB), NOSPLIT, $0-48 254 MOVSS alpha+0(FP), X0 255 MOVQ xs+8(FP), AX 256 MOVQ incx+16(FP), CX 257 MOVQ ys+24(FP), DX 258 MOVQ incy+32(FP), BX 259 MOVQ n+40(FP), SI 260 SHLQ $0x02, SI 261 IMULQ CX, SI 262 ADDQ AX, SI 263 JMP check_limit 264 PCALIGN $0x08 265 266 loop: 267 MOVSS (AX), X1 268 MULSS X0, X1 269 ADDSS (DX), X1 270 MOVSS X1, (DX) 271 LEAQ (AX)(CX*4), AX 272 LEAQ (DX)(BX*4), DX 273 274 check_limit: 275 CMPQ SI, AX 276 JHI loop 277 RET 278 279 // func AmdAxpyPointer_V4A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 280 // Requires: SSE 281 TEXT ·AmdAxpyPointer_V4A8(SB), NOSPLIT, $0-48 282 MOVSS alpha+0(FP), X0 283 MOVQ xs+8(FP), AX 284 MOVQ incx+16(FP), CX 285 MOVQ ys+24(FP), DX 286 MOVQ incy+32(FP), BX 287 MOVQ n+40(FP), SI 288 SHLQ $0x02, SI 289 IMULQ CX, SI 290 ADDQ AX, SI 291 JMP check_limit 292 PCALIGN $0x08 293 294 loop: 295 MOVSS (AX), X1 296 MULSS X0, X1 297 ADDSS (DX), X1 298 MOVSS X1, (DX) 299 LEAQ (AX)(CX*4), AX 300 LEAQ (DX)(BX*4), DX 301 302 check_limit: 303 CMPQ SI, AX 304 JHI loop 305 RET 306 307 // func AmdAxpyPointer_V5A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 308 // Requires: SSE 309 TEXT ·AmdAxpyPointer_V5A8(SB), NOSPLIT, $0-48 310 MOVSS alpha+0(FP), X0 311 MOVQ xs+8(FP), AX 312 MOVQ incx+16(FP), CX 313 MOVQ ys+24(FP), DX 314 MOVQ incy+32(FP), BX 
315 MOVQ n+40(FP), SI 316 SHLQ $0x02, SI 317 IMULQ CX, SI 318 ADDQ AX, SI 319 JMP check_limit 320 PCALIGN $0x08 321 322 loop: 323 MOVSS (AX), X1 324 MULSS X0, X1 325 ADDSS (DX), X1 326 MOVSS X1, (DX) 327 LEAQ (AX)(CX*4), AX 328 LEAQ (DX)(BX*4), DX 329 330 check_limit: 331 CMPQ SI, AX 332 JHI loop 333 RET 334 335 // func AmdAxpyPointer_V0A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 336 // Requires: SSE 337 TEXT ·AmdAxpyPointer_V0A9(SB), NOSPLIT, $0-48 338 MOVSS alpha+0(FP), X0 339 MOVQ xs+8(FP), AX 340 MOVQ incx+16(FP), CX 341 MOVQ ys+24(FP), DX 342 MOVQ incy+32(FP), BX 343 MOVQ n+40(FP), SI 344 SHLQ $0x02, SI 345 IMULQ CX, SI 346 ADDQ AX, SI 347 JMP check_limit 348 PCALIGN $0x08 349 NOP 350 351 loop: 352 MOVSS (AX), X1 353 MULSS X0, X1 354 ADDSS (DX), X1 355 MOVSS X1, (DX) 356 LEAQ (AX)(CX*4), AX 357 LEAQ (DX)(BX*4), DX 358 359 check_limit: 360 CMPQ SI, AX 361 JHI loop 362 RET 363 364 // func AmdAxpyPointer_V1A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 365 // Requires: SSE 366 TEXT ·AmdAxpyPointer_V1A9(SB), NOSPLIT, $0-48 367 MOVSS alpha+0(FP), X0 368 MOVQ xs+8(FP), AX 369 MOVQ incx+16(FP), CX 370 MOVQ ys+24(FP), DX 371 MOVQ incy+32(FP), BX 372 MOVQ n+40(FP), SI 373 SHLQ $0x02, SI 374 IMULQ CX, SI 375 ADDQ AX, SI 376 JMP check_limit 377 PCALIGN $0x08 378 NOP 379 380 loop: 381 MOVSS (AX), X1 382 MULSS X0, X1 383 ADDSS (DX), X1 384 MOVSS X1, (DX) 385 LEAQ (AX)(CX*4), AX 386 LEAQ (DX)(BX*4), DX 387 388 check_limit: 389 CMPQ SI, AX 390 JHI loop 391 RET 392 393 // func AmdAxpyPointer_V2A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 394 // Requires: SSE 395 TEXT ·AmdAxpyPointer_V2A9(SB), NOSPLIT, $0-48 396 MOVSS alpha+0(FP), X0 397 MOVQ xs+8(FP), AX 398 MOVQ incx+16(FP), CX 399 MOVQ ys+24(FP), DX 400 MOVQ incy+32(FP), BX 401 MOVQ n+40(FP), SI 402 SHLQ $0x02, SI 403 IMULQ CX, SI 404 ADDQ AX, SI 405 JMP check_limit 406 PCALIGN $0x08 407 NOP 408 409 loop: 410 MOVSS (AX), X1 411 MULSS X0, X1 412 ADDSS (DX), X1 413 MOVSS X1, (DX) 414 LEAQ (AX)(CX*4), AX 415 LEAQ (DX)(BX*4), DX 416 417 check_limit: 418 CMPQ SI, AX 419 JHI loop 420 RET 421 422 // func AmdAxpyPointer_V3A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 423 // Requires: SSE 424 TEXT ·AmdAxpyPointer_V3A9(SB), NOSPLIT, $0-48 425 MOVSS alpha+0(FP), X0 426 MOVQ xs+8(FP), AX 427 MOVQ incx+16(FP), CX 428 MOVQ ys+24(FP), DX 429 MOVQ incy+32(FP), BX 430 MOVQ n+40(FP), SI 431 SHLQ $0x02, SI 432 IMULQ CX, SI 433 ADDQ AX, SI 434 JMP check_limit 435 PCALIGN $0x08 436 NOP 437 438 loop: 439 MOVSS (AX), X1 440 MULSS X0, X1 441 ADDSS (DX), X1 442 MOVSS X1, (DX) 443 LEAQ (AX)(CX*4), AX 444 LEAQ (DX)(BX*4), DX 445 446 check_limit: 447 CMPQ SI, AX 448 JHI loop 449 RET 450 451 // func AmdAxpyPointer_V4A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 452 // Requires: SSE 453 TEXT ·AmdAxpyPointer_V4A9(SB), NOSPLIT, $0-48 454 MOVSS alpha+0(FP), X0 455 MOVQ xs+8(FP), AX 456 MOVQ incx+16(FP), CX 457 MOVQ ys+24(FP), DX 458 MOVQ incy+32(FP), BX 459 MOVQ n+40(FP), SI 460 SHLQ $0x02, SI 461 IMULQ CX, SI 462 ADDQ AX, SI 463 JMP check_limit 464 PCALIGN $0x08 465 NOP 466 467 loop: 468 MOVSS (AX), X1 469 MULSS X0, X1 470 ADDSS (DX), X1 471 MOVSS X1, (DX) 472 LEAQ (AX)(CX*4), AX 473 LEAQ (DX)(BX*4), DX 474 475 check_limit: 476 CMPQ SI, AX 477 JHI loop 478 RET 479 480 // func AmdAxpyPointer_V5A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 481 // Requires: SSE 
482 TEXT ·AmdAxpyPointer_V5A9(SB), NOSPLIT, $0-48 483 MOVSS alpha+0(FP), X0 484 MOVQ xs+8(FP), AX 485 MOVQ incx+16(FP), CX 486 MOVQ ys+24(FP), DX 487 MOVQ incy+32(FP), BX 488 MOVQ n+40(FP), SI 489 SHLQ $0x02, SI 490 IMULQ CX, SI 491 ADDQ AX, SI 492 JMP check_limit 493 PCALIGN $0x08 494 NOP 495 496 loop: 497 MOVSS (AX), X1 498 MULSS X0, X1 499 ADDSS (DX), X1 500 MOVSS X1, (DX) 501 LEAQ (AX)(CX*4), AX 502 LEAQ (DX)(BX*4), DX 503 504 check_limit: 505 CMPQ SI, AX 506 JHI loop 507 RET 508 509 // func AmdAxpyPointer_V0A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 510 // Requires: SSE 511 TEXT ·AmdAxpyPointer_V0A10(SB), NOSPLIT, $0-48 512 MOVSS alpha+0(FP), X0 513 MOVQ xs+8(FP), AX 514 MOVQ incx+16(FP), CX 515 MOVQ ys+24(FP), DX 516 MOVQ incy+32(FP), BX 517 MOVQ n+40(FP), SI 518 SHLQ $0x02, SI 519 IMULQ CX, SI 520 ADDQ AX, SI 521 JMP check_limit 522 PCALIGN $0x08 523 NOP 524 NOP 525 526 loop: 527 MOVSS (AX), X1 528 MULSS X0, X1 529 ADDSS (DX), X1 530 MOVSS X1, (DX) 531 LEAQ (AX)(CX*4), AX 532 LEAQ (DX)(BX*4), DX 533 534 check_limit: 535 CMPQ SI, AX 536 JHI loop 537 RET 538 539 // func AmdAxpyPointer_V1A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 540 // Requires: SSE 541 TEXT ·AmdAxpyPointer_V1A10(SB), NOSPLIT, $0-48 542 MOVSS alpha+0(FP), X0 543 MOVQ xs+8(FP), AX 544 MOVQ incx+16(FP), CX 545 MOVQ ys+24(FP), DX 546 MOVQ incy+32(FP), BX 547 MOVQ n+40(FP), SI 548 SHLQ $0x02, SI 549 IMULQ CX, SI 550 ADDQ AX, SI 551 JMP check_limit 552 PCALIGN $0x08 553 NOP 554 NOP 555 556 loop: 557 MOVSS (AX), X1 558 MULSS X0, X1 559 ADDSS (DX), X1 560 MOVSS X1, (DX) 561 LEAQ (AX)(CX*4), AX 562 LEAQ (DX)(BX*4), DX 563 564 check_limit: 565 CMPQ SI, AX 566 JHI loop 567 RET 568 569 // func AmdAxpyPointer_V2A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 570 // Requires: SSE 571 TEXT ·AmdAxpyPointer_V2A10(SB), NOSPLIT, $0-48 572 MOVSS alpha+0(FP), X0 573 MOVQ xs+8(FP), AX 574 MOVQ incx+16(FP), CX 575 MOVQ ys+24(FP), DX 576 MOVQ incy+32(FP), BX 577 MOVQ n+40(FP), SI 578 SHLQ $0x02, SI 579 IMULQ CX, SI 580 ADDQ AX, SI 581 JMP check_limit 582 PCALIGN $0x08 583 NOP 584 NOP 585 586 loop: 587 MOVSS (AX), X1 588 MULSS X0, X1 589 ADDSS (DX), X1 590 MOVSS X1, (DX) 591 LEAQ (AX)(CX*4), AX 592 LEAQ (DX)(BX*4), DX 593 594 check_limit: 595 CMPQ SI, AX 596 JHI loop 597 RET 598 599 // func AmdAxpyPointer_V3A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 600 // Requires: SSE 601 TEXT ·AmdAxpyPointer_V3A10(SB), NOSPLIT, $0-48 602 MOVSS alpha+0(FP), X0 603 MOVQ xs+8(FP), AX 604 MOVQ incx+16(FP), CX 605 MOVQ ys+24(FP), DX 606 MOVQ incy+32(FP), BX 607 MOVQ n+40(FP), SI 608 SHLQ $0x02, SI 609 IMULQ CX, SI 610 ADDQ AX, SI 611 JMP check_limit 612 PCALIGN $0x08 613 NOP 614 NOP 615 616 loop: 617 MOVSS (AX), X1 618 MULSS X0, X1 619 ADDSS (DX), X1 620 MOVSS X1, (DX) 621 LEAQ (AX)(CX*4), AX 622 LEAQ (DX)(BX*4), DX 623 624 check_limit: 625 CMPQ SI, AX 626 JHI loop 627 RET 628 629 // func AmdAxpyPointer_V4A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 630 // Requires: SSE 631 TEXT ·AmdAxpyPointer_V4A10(SB), NOSPLIT, $0-48 632 MOVSS alpha+0(FP), X0 633 MOVQ xs+8(FP), AX 634 MOVQ incx+16(FP), CX 635 MOVQ ys+24(FP), DX 636 MOVQ incy+32(FP), BX 637 MOVQ n+40(FP), SI 638 SHLQ $0x02, SI 639 IMULQ CX, SI 640 ADDQ AX, SI 641 JMP check_limit 642 PCALIGN $0x08 643 NOP 644 NOP 645 646 loop: 647 MOVSS (AX), X1 648 MULSS X0, X1 649 ADDSS (DX), X1 650 MOVSS X1, (DX) 651 LEAQ 
(AX)(CX*4), AX 652 LEAQ (DX)(BX*4), DX 653 654 check_limit: 655 CMPQ SI, AX 656 JHI loop 657 RET 658 659 // func AmdAxpyPointer_V5A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 660 // Requires: SSE 661 TEXT ·AmdAxpyPointer_V5A10(SB), NOSPLIT, $0-48 662 MOVSS alpha+0(FP), X0 663 MOVQ xs+8(FP), AX 664 MOVQ incx+16(FP), CX 665 MOVQ ys+24(FP), DX 666 MOVQ incy+32(FP), BX 667 MOVQ n+40(FP), SI 668 SHLQ $0x02, SI 669 IMULQ CX, SI 670 ADDQ AX, SI 671 JMP check_limit 672 PCALIGN $0x08 673 NOP 674 NOP 675 676 loop: 677 MOVSS (AX), X1 678 MULSS X0, X1 679 ADDSS (DX), X1 680 MOVSS X1, (DX) 681 LEAQ (AX)(CX*4), AX 682 LEAQ (DX)(BX*4), DX 683 684 check_limit: 685 CMPQ SI, AX 686 JHI loop 687 RET 688 689 // func AmdAxpyPointer_V0A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 690 // Requires: SSE 691 TEXT ·AmdAxpyPointer_V0A11(SB), NOSPLIT, $0-48 692 MOVSS alpha+0(FP), X0 693 MOVQ xs+8(FP), AX 694 MOVQ incx+16(FP), CX 695 MOVQ ys+24(FP), DX 696 MOVQ incy+32(FP), BX 697 MOVQ n+40(FP), SI 698 SHLQ $0x02, SI 699 IMULQ CX, SI 700 ADDQ AX, SI 701 JMP check_limit 702 PCALIGN $0x08 703 NOP 704 NOP 705 NOP 706 707 loop: 708 MOVSS (AX), X1 709 MULSS X0, X1 710 ADDSS (DX), X1 711 MOVSS X1, (DX) 712 LEAQ (AX)(CX*4), AX 713 LEAQ (DX)(BX*4), DX 714 715 check_limit: 716 CMPQ SI, AX 717 JHI loop 718 RET 719 720 // func AmdAxpyPointer_V1A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 721 // Requires: SSE 722 TEXT ·AmdAxpyPointer_V1A11(SB), NOSPLIT, $0-48 723 MOVSS alpha+0(FP), X0 724 MOVQ xs+8(FP), AX 725 MOVQ incx+16(FP), CX 726 MOVQ ys+24(FP), DX 727 MOVQ incy+32(FP), BX 728 MOVQ n+40(FP), SI 729 SHLQ $0x02, SI 730 IMULQ CX, SI 731 ADDQ AX, SI 732 JMP check_limit 733 PCALIGN $0x08 734 NOP 735 NOP 736 NOP 737 738 loop: 739 MOVSS (AX), X1 740 MULSS X0, X1 741 ADDSS (DX), X1 742 MOVSS X1, (DX) 743 LEAQ (AX)(CX*4), AX 744 LEAQ (DX)(BX*4), DX 745 746 check_limit: 747 CMPQ SI, AX 748 JHI loop 749 RET 750 751 // func AmdAxpyPointer_V2A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 752 // Requires: SSE 753 TEXT ·AmdAxpyPointer_V2A11(SB), NOSPLIT, $0-48 754 MOVSS alpha+0(FP), X0 755 MOVQ xs+8(FP), AX 756 MOVQ incx+16(FP), CX 757 MOVQ ys+24(FP), DX 758 MOVQ incy+32(FP), BX 759 MOVQ n+40(FP), SI 760 SHLQ $0x02, SI 761 IMULQ CX, SI 762 ADDQ AX, SI 763 JMP check_limit 764 PCALIGN $0x08 765 NOP 766 NOP 767 NOP 768 769 loop: 770 MOVSS (AX), X1 771 MULSS X0, X1 772 ADDSS (DX), X1 773 MOVSS X1, (DX) 774 LEAQ (AX)(CX*4), AX 775 LEAQ (DX)(BX*4), DX 776 777 check_limit: 778 CMPQ SI, AX 779 JHI loop 780 RET 781 782 // func AmdAxpyPointer_V3A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 783 // Requires: SSE 784 TEXT ·AmdAxpyPointer_V3A11(SB), NOSPLIT, $0-48 785 MOVSS alpha+0(FP), X0 786 MOVQ xs+8(FP), AX 787 MOVQ incx+16(FP), CX 788 MOVQ ys+24(FP), DX 789 MOVQ incy+32(FP), BX 790 MOVQ n+40(FP), SI 791 SHLQ $0x02, SI 792 IMULQ CX, SI 793 ADDQ AX, SI 794 JMP check_limit 795 PCALIGN $0x08 796 NOP 797 NOP 798 NOP 799 800 loop: 801 MOVSS (AX), X1 802 MULSS X0, X1 803 ADDSS (DX), X1 804 MOVSS X1, (DX) 805 LEAQ (AX)(CX*4), AX 806 LEAQ (DX)(BX*4), DX 807 808 check_limit: 809 CMPQ SI, AX 810 JHI loop 811 RET 812 813 // func AmdAxpyPointer_V4A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 814 // Requires: SSE 815 TEXT ·AmdAxpyPointer_V4A11(SB), NOSPLIT, $0-48 816 MOVSS alpha+0(FP), X0 817 MOVQ xs+8(FP), AX 818 MOVQ incx+16(FP), CX 819 
MOVQ ys+24(FP), DX 820 MOVQ incy+32(FP), BX 821 MOVQ n+40(FP), SI 822 SHLQ $0x02, SI 823 IMULQ CX, SI 824 ADDQ AX, SI 825 JMP check_limit 826 PCALIGN $0x08 827 NOP 828 NOP 829 NOP 830 831 loop: 832 MOVSS (AX), X1 833 MULSS X0, X1 834 ADDSS (DX), X1 835 MOVSS X1, (DX) 836 LEAQ (AX)(CX*4), AX 837 LEAQ (DX)(BX*4), DX 838 839 check_limit: 840 CMPQ SI, AX 841 JHI loop 842 RET 843 844 // func AmdAxpyPointer_V5A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 845 // Requires: SSE 846 TEXT ·AmdAxpyPointer_V5A11(SB), NOSPLIT, $0-48 847 MOVSS alpha+0(FP), X0 848 MOVQ xs+8(FP), AX 849 MOVQ incx+16(FP), CX 850 MOVQ ys+24(FP), DX 851 MOVQ incy+32(FP), BX 852 MOVQ n+40(FP), SI 853 SHLQ $0x02, SI 854 IMULQ CX, SI 855 ADDQ AX, SI 856 JMP check_limit 857 PCALIGN $0x08 858 NOP 859 NOP 860 NOP 861 862 loop: 863 MOVSS (AX), X1 864 MULSS X0, X1 865 ADDSS (DX), X1 866 MOVSS X1, (DX) 867 LEAQ (AX)(CX*4), AX 868 LEAQ (DX)(BX*4), DX 869 870 check_limit: 871 CMPQ SI, AX 872 JHI loop 873 RET 874 875 // func AmdAxpyPointer_V0A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 876 // Requires: SSE 877 TEXT ·AmdAxpyPointer_V0A12(SB), NOSPLIT, $0-48 878 MOVSS alpha+0(FP), X0 879 MOVQ xs+8(FP), AX 880 MOVQ incx+16(FP), CX 881 MOVQ ys+24(FP), DX 882 MOVQ incy+32(FP), BX 883 MOVQ n+40(FP), SI 884 SHLQ $0x02, SI 885 IMULQ CX, SI 886 ADDQ AX, SI 887 JMP check_limit 888 PCALIGN $0x08 889 NOP 890 NOP 891 NOP 892 NOP 893 894 loop: 895 MOVSS (AX), X1 896 MULSS X0, X1 897 ADDSS (DX), X1 898 MOVSS X1, (DX) 899 LEAQ (AX)(CX*4), AX 900 LEAQ (DX)(BX*4), DX 901 902 check_limit: 903 CMPQ SI, AX 904 JHI loop 905 RET 906 907 // func AmdAxpyPointer_V1A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 908 // Requires: SSE 909 TEXT ·AmdAxpyPointer_V1A12(SB), NOSPLIT, $0-48 910 MOVSS alpha+0(FP), X0 911 MOVQ xs+8(FP), AX 912 MOVQ incx+16(FP), CX 913 MOVQ ys+24(FP), DX 914 MOVQ incy+32(FP), BX 915 MOVQ n+40(FP), SI 916 SHLQ $0x02, SI 917 IMULQ CX, SI 918 ADDQ AX, SI 919 JMP check_limit 920 PCALIGN $0x08 921 NOP 922 NOP 923 NOP 924 NOP 925 926 loop: 927 MOVSS (AX), X1 928 MULSS X0, X1 929 ADDSS (DX), X1 930 MOVSS X1, (DX) 931 LEAQ (AX)(CX*4), AX 932 LEAQ (DX)(BX*4), DX 933 934 check_limit: 935 CMPQ SI, AX 936 JHI loop 937 RET 938 939 // func AmdAxpyPointer_V2A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 940 // Requires: SSE 941 TEXT ·AmdAxpyPointer_V2A12(SB), NOSPLIT, $0-48 942 MOVSS alpha+0(FP), X0 943 MOVQ xs+8(FP), AX 944 MOVQ incx+16(FP), CX 945 MOVQ ys+24(FP), DX 946 MOVQ incy+32(FP), BX 947 MOVQ n+40(FP), SI 948 SHLQ $0x02, SI 949 IMULQ CX, SI 950 ADDQ AX, SI 951 JMP check_limit 952 PCALIGN $0x08 953 NOP 954 NOP 955 NOP 956 NOP 957 958 loop: 959 MOVSS (AX), X1 960 MULSS X0, X1 961 ADDSS (DX), X1 962 MOVSS X1, (DX) 963 LEAQ (AX)(CX*4), AX 964 LEAQ (DX)(BX*4), DX 965 966 check_limit: 967 CMPQ SI, AX 968 JHI loop 969 RET 970 971 // func AmdAxpyPointer_V3A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 972 // Requires: SSE 973 TEXT ·AmdAxpyPointer_V3A12(SB), NOSPLIT, $0-48 974 MOVSS alpha+0(FP), X0 975 MOVQ xs+8(FP), AX 976 MOVQ incx+16(FP), CX 977 MOVQ ys+24(FP), DX 978 MOVQ incy+32(FP), BX 979 MOVQ n+40(FP), SI 980 SHLQ $0x02, SI 981 IMULQ CX, SI 982 ADDQ AX, SI 983 JMP check_limit 984 PCALIGN $0x08 985 NOP 986 NOP 987 NOP 988 NOP 989 990 loop: 991 MOVSS (AX), X1 992 MULSS X0, X1 993 ADDSS (DX), X1 994 MOVSS X1, (DX) 995 LEAQ (AX)(CX*4), AX 996 LEAQ (DX)(BX*4), DX 997 
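	// check_limit below compares AX against SI, which the prologue set to the
	// one-past-end pointer for xs (xs + 4*incx*n); JHI is an unsigned "above",
	// so the loop keeps running while AX is still below that limit. The limit
	// is derived from xs/incx only -- ys simply advances in lockstep.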
998 check_limit: 999 CMPQ SI, AX 1000 JHI loop 1001 RET 1002 1003 // func AmdAxpyPointer_V4A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1004 // Requires: SSE 1005 TEXT ·AmdAxpyPointer_V4A12(SB), NOSPLIT, $0-48 1006 MOVSS alpha+0(FP), X0 1007 MOVQ xs+8(FP), AX 1008 MOVQ incx+16(FP), CX 1009 MOVQ ys+24(FP), DX 1010 MOVQ incy+32(FP), BX 1011 MOVQ n+40(FP), SI 1012 SHLQ $0x02, SI 1013 IMULQ CX, SI 1014 ADDQ AX, SI 1015 JMP check_limit 1016 PCALIGN $0x08 1017 NOP 1018 NOP 1019 NOP 1020 NOP 1021 1022 loop: 1023 MOVSS (AX), X1 1024 MULSS X0, X1 1025 ADDSS (DX), X1 1026 MOVSS X1, (DX) 1027 LEAQ (AX)(CX*4), AX 1028 LEAQ (DX)(BX*4), DX 1029 1030 check_limit: 1031 CMPQ SI, AX 1032 JHI loop 1033 RET 1034 1035 // func AmdAxpyPointer_V5A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1036 // Requires: SSE 1037 TEXT ·AmdAxpyPointer_V5A12(SB), NOSPLIT, $0-48 1038 MOVSS alpha+0(FP), X0 1039 MOVQ xs+8(FP), AX 1040 MOVQ incx+16(FP), CX 1041 MOVQ ys+24(FP), DX 1042 MOVQ incy+32(FP), BX 1043 MOVQ n+40(FP), SI 1044 SHLQ $0x02, SI 1045 IMULQ CX, SI 1046 ADDQ AX, SI 1047 JMP check_limit 1048 PCALIGN $0x08 1049 NOP 1050 NOP 1051 NOP 1052 NOP 1053 1054 loop: 1055 MOVSS (AX), X1 1056 MULSS X0, X1 1057 ADDSS (DX), X1 1058 MOVSS X1, (DX) 1059 LEAQ (AX)(CX*4), AX 1060 LEAQ (DX)(BX*4), DX 1061 1062 check_limit: 1063 CMPQ SI, AX 1064 JHI loop 1065 RET 1066 1067 // func AmdAxpyPointer_V0A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1068 // Requires: SSE 1069 TEXT ·AmdAxpyPointer_V0A13(SB), NOSPLIT, $0-48 1070 MOVSS alpha+0(FP), X0 1071 MOVQ xs+8(FP), AX 1072 MOVQ incx+16(FP), CX 1073 MOVQ ys+24(FP), DX 1074 MOVQ incy+32(FP), BX 1075 MOVQ n+40(FP), SI 1076 SHLQ $0x02, SI 1077 IMULQ CX, SI 1078 ADDQ AX, SI 1079 JMP check_limit 1080 PCALIGN $0x08 1081 NOP 1082 NOP 1083 NOP 1084 NOP 1085 NOP 1086 1087 loop: 1088 MOVSS (AX), X1 1089 MULSS X0, X1 1090 ADDSS (DX), X1 1091 MOVSS X1, (DX) 1092 LEAQ (AX)(CX*4), AX 1093 LEAQ (DX)(BX*4), DX 1094 1095 check_limit: 1096 CMPQ SI, AX 1097 JHI loop 1098 RET 1099 1100 // func AmdAxpyPointer_V1A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1101 // Requires: SSE 1102 TEXT ·AmdAxpyPointer_V1A13(SB), NOSPLIT, $0-48 1103 MOVSS alpha+0(FP), X0 1104 MOVQ xs+8(FP), AX 1105 MOVQ incx+16(FP), CX 1106 MOVQ ys+24(FP), DX 1107 MOVQ incy+32(FP), BX 1108 MOVQ n+40(FP), SI 1109 SHLQ $0x02, SI 1110 IMULQ CX, SI 1111 ADDQ AX, SI 1112 JMP check_limit 1113 PCALIGN $0x08 1114 NOP 1115 NOP 1116 NOP 1117 NOP 1118 NOP 1119 1120 loop: 1121 MOVSS (AX), X1 1122 MULSS X0, X1 1123 ADDSS (DX), X1 1124 MOVSS X1, (DX) 1125 LEAQ (AX)(CX*4), AX 1126 LEAQ (DX)(BX*4), DX 1127 1128 check_limit: 1129 CMPQ SI, AX 1130 JHI loop 1131 RET 1132 1133 // func AmdAxpyPointer_V2A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1134 // Requires: SSE 1135 TEXT ·AmdAxpyPointer_V2A13(SB), NOSPLIT, $0-48 1136 MOVSS alpha+0(FP), X0 1137 MOVQ xs+8(FP), AX 1138 MOVQ incx+16(FP), CX 1139 MOVQ ys+24(FP), DX 1140 MOVQ incy+32(FP), BX 1141 MOVQ n+40(FP), SI 1142 SHLQ $0x02, SI 1143 IMULQ CX, SI 1144 ADDQ AX, SI 1145 JMP check_limit 1146 PCALIGN $0x08 1147 NOP 1148 NOP 1149 NOP 1150 NOP 1151 NOP 1152 1153 loop: 1154 MOVSS (AX), X1 1155 MULSS X0, X1 1156 ADDSS (DX), X1 1157 MOVSS X1, (DX) 1158 LEAQ (AX)(CX*4), AX 1159 LEAQ (DX)(BX*4), DX 1160 1161 check_limit: 1162 CMPQ SI, AX 1163 JHI loop 1164 RET 1165 1166 // func AmdAxpyPointer_V3A13(alpha float32, xs *float32, incx 
uintptr, ys *float32, incy uintptr, n uintptr) 1167 // Requires: SSE 1168 TEXT ·AmdAxpyPointer_V3A13(SB), NOSPLIT, $0-48 1169 MOVSS alpha+0(FP), X0 1170 MOVQ xs+8(FP), AX 1171 MOVQ incx+16(FP), CX 1172 MOVQ ys+24(FP), DX 1173 MOVQ incy+32(FP), BX 1174 MOVQ n+40(FP), SI 1175 SHLQ $0x02, SI 1176 IMULQ CX, SI 1177 ADDQ AX, SI 1178 JMP check_limit 1179 PCALIGN $0x08 1180 NOP 1181 NOP 1182 NOP 1183 NOP 1184 NOP 1185 1186 loop: 1187 MOVSS (AX), X1 1188 MULSS X0, X1 1189 ADDSS (DX), X1 1190 MOVSS X1, (DX) 1191 LEAQ (AX)(CX*4), AX 1192 LEAQ (DX)(BX*4), DX 1193 1194 check_limit: 1195 CMPQ SI, AX 1196 JHI loop 1197 RET 1198 1199 // func AmdAxpyPointer_V4A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1200 // Requires: SSE 1201 TEXT ·AmdAxpyPointer_V4A13(SB), NOSPLIT, $0-48 1202 MOVSS alpha+0(FP), X0 1203 MOVQ xs+8(FP), AX 1204 MOVQ incx+16(FP), CX 1205 MOVQ ys+24(FP), DX 1206 MOVQ incy+32(FP), BX 1207 MOVQ n+40(FP), SI 1208 SHLQ $0x02, SI 1209 IMULQ CX, SI 1210 ADDQ AX, SI 1211 JMP check_limit 1212 PCALIGN $0x08 1213 NOP 1214 NOP 1215 NOP 1216 NOP 1217 NOP 1218 1219 loop: 1220 MOVSS (AX), X1 1221 MULSS X0, X1 1222 ADDSS (DX), X1 1223 MOVSS X1, (DX) 1224 LEAQ (AX)(CX*4), AX 1225 LEAQ (DX)(BX*4), DX 1226 1227 check_limit: 1228 CMPQ SI, AX 1229 JHI loop 1230 RET 1231 1232 // func AmdAxpyPointer_V5A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1233 // Requires: SSE 1234 TEXT ·AmdAxpyPointer_V5A13(SB), NOSPLIT, $0-48 1235 MOVSS alpha+0(FP), X0 1236 MOVQ xs+8(FP), AX 1237 MOVQ incx+16(FP), CX 1238 MOVQ ys+24(FP), DX 1239 MOVQ incy+32(FP), BX 1240 MOVQ n+40(FP), SI 1241 SHLQ $0x02, SI 1242 IMULQ CX, SI 1243 ADDQ AX, SI 1244 JMP check_limit 1245 PCALIGN $0x08 1246 NOP 1247 NOP 1248 NOP 1249 NOP 1250 NOP 1251 1252 loop: 1253 MOVSS (AX), X1 1254 MULSS X0, X1 1255 ADDSS (DX), X1 1256 MOVSS X1, (DX) 1257 LEAQ (AX)(CX*4), AX 1258 LEAQ (DX)(BX*4), DX 1259 1260 check_limit: 1261 CMPQ SI, AX 1262 JHI loop 1263 RET 1264 1265 // func AmdAxpyPointer_V0A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1266 // Requires: SSE 1267 TEXT ·AmdAxpyPointer_V0A14(SB), NOSPLIT, $0-48 1268 MOVSS alpha+0(FP), X0 1269 MOVQ xs+8(FP), AX 1270 MOVQ incx+16(FP), CX 1271 MOVQ ys+24(FP), DX 1272 MOVQ incy+32(FP), BX 1273 MOVQ n+40(FP), SI 1274 SHLQ $0x02, SI 1275 IMULQ CX, SI 1276 ADDQ AX, SI 1277 JMP check_limit 1278 PCALIGN $0x08 1279 NOP 1280 NOP 1281 NOP 1282 NOP 1283 NOP 1284 NOP 1285 1286 loop: 1287 MOVSS (AX), X1 1288 MULSS X0, X1 1289 ADDSS (DX), X1 1290 MOVSS X1, (DX) 1291 LEAQ (AX)(CX*4), AX 1292 LEAQ (DX)(BX*4), DX 1293 1294 check_limit: 1295 CMPQ SI, AX 1296 JHI loop 1297 RET 1298 1299 // func AmdAxpyPointer_V1A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1300 // Requires: SSE 1301 TEXT ·AmdAxpyPointer_V1A14(SB), NOSPLIT, $0-48 1302 MOVSS alpha+0(FP), X0 1303 MOVQ xs+8(FP), AX 1304 MOVQ incx+16(FP), CX 1305 MOVQ ys+24(FP), DX 1306 MOVQ incy+32(FP), BX 1307 MOVQ n+40(FP), SI 1308 SHLQ $0x02, SI 1309 IMULQ CX, SI 1310 ADDQ AX, SI 1311 JMP check_limit 1312 PCALIGN $0x08 1313 NOP 1314 NOP 1315 NOP 1316 NOP 1317 NOP 1318 NOP 1319 1320 loop: 1321 MOVSS (AX), X1 1322 MULSS X0, X1 1323 ADDSS (DX), X1 1324 MOVSS X1, (DX) 1325 LEAQ (AX)(CX*4), AX 1326 LEAQ (DX)(BX*4), DX 1327 1328 check_limit: 1329 CMPQ SI, AX 1330 JHI loop 1331 RET 1332 1333 // func AmdAxpyPointer_V2A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1334 // Requires: SSE 1335 TEXT 
·AmdAxpyPointer_V2A14(SB), NOSPLIT, $0-48 1336 MOVSS alpha+0(FP), X0 1337 MOVQ xs+8(FP), AX 1338 MOVQ incx+16(FP), CX 1339 MOVQ ys+24(FP), DX 1340 MOVQ incy+32(FP), BX 1341 MOVQ n+40(FP), SI 1342 SHLQ $0x02, SI 1343 IMULQ CX, SI 1344 ADDQ AX, SI 1345 JMP check_limit 1346 PCALIGN $0x08 1347 NOP 1348 NOP 1349 NOP 1350 NOP 1351 NOP 1352 NOP 1353 1354 loop: 1355 MOVSS (AX), X1 1356 MULSS X0, X1 1357 ADDSS (DX), X1 1358 MOVSS X1, (DX) 1359 LEAQ (AX)(CX*4), AX 1360 LEAQ (DX)(BX*4), DX 1361 1362 check_limit: 1363 CMPQ SI, AX 1364 JHI loop 1365 RET 1366 1367 // func AmdAxpyPointer_V3A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1368 // Requires: SSE 1369 TEXT ·AmdAxpyPointer_V3A14(SB), NOSPLIT, $0-48 1370 MOVSS alpha+0(FP), X0 1371 MOVQ xs+8(FP), AX 1372 MOVQ incx+16(FP), CX 1373 MOVQ ys+24(FP), DX 1374 MOVQ incy+32(FP), BX 1375 MOVQ n+40(FP), SI 1376 SHLQ $0x02, SI 1377 IMULQ CX, SI 1378 ADDQ AX, SI 1379 JMP check_limit 1380 PCALIGN $0x08 1381 NOP 1382 NOP 1383 NOP 1384 NOP 1385 NOP 1386 NOP 1387 1388 loop: 1389 MOVSS (AX), X1 1390 MULSS X0, X1 1391 ADDSS (DX), X1 1392 MOVSS X1, (DX) 1393 LEAQ (AX)(CX*4), AX 1394 LEAQ (DX)(BX*4), DX 1395 1396 check_limit: 1397 CMPQ SI, AX 1398 JHI loop 1399 RET 1400 1401 // func AmdAxpyPointer_V4A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1402 // Requires: SSE 1403 TEXT ·AmdAxpyPointer_V4A14(SB), NOSPLIT, $0-48 1404 MOVSS alpha+0(FP), X0 1405 MOVQ xs+8(FP), AX 1406 MOVQ incx+16(FP), CX 1407 MOVQ ys+24(FP), DX 1408 MOVQ incy+32(FP), BX 1409 MOVQ n+40(FP), SI 1410 SHLQ $0x02, SI 1411 IMULQ CX, SI 1412 ADDQ AX, SI 1413 JMP check_limit 1414 PCALIGN $0x08 1415 NOP 1416 NOP 1417 NOP 1418 NOP 1419 NOP 1420 NOP 1421 1422 loop: 1423 MOVSS (AX), X1 1424 MULSS X0, X1 1425 ADDSS (DX), X1 1426 MOVSS X1, (DX) 1427 LEAQ (AX)(CX*4), AX 1428 LEAQ (DX)(BX*4), DX 1429 1430 check_limit: 1431 CMPQ SI, AX 1432 JHI loop 1433 RET 1434 1435 // func AmdAxpyPointer_V5A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1436 // Requires: SSE 1437 TEXT ·AmdAxpyPointer_V5A14(SB), NOSPLIT, $0-48 1438 MOVSS alpha+0(FP), X0 1439 MOVQ xs+8(FP), AX 1440 MOVQ incx+16(FP), CX 1441 MOVQ ys+24(FP), DX 1442 MOVQ incy+32(FP), BX 1443 MOVQ n+40(FP), SI 1444 SHLQ $0x02, SI 1445 IMULQ CX, SI 1446 ADDQ AX, SI 1447 JMP check_limit 1448 PCALIGN $0x08 1449 NOP 1450 NOP 1451 NOP 1452 NOP 1453 NOP 1454 NOP 1455 1456 loop: 1457 MOVSS (AX), X1 1458 MULSS X0, X1 1459 ADDSS (DX), X1 1460 MOVSS X1, (DX) 1461 LEAQ (AX)(CX*4), AX 1462 LEAQ (DX)(BX*4), DX 1463 1464 check_limit: 1465 CMPQ SI, AX 1466 JHI loop 1467 RET 1468 1469 // func AmdAxpyPointer_V0A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1470 // Requires: SSE 1471 TEXT ·AmdAxpyPointer_V0A15(SB), NOSPLIT, $0-48 1472 MOVSS alpha+0(FP), X0 1473 MOVQ xs+8(FP), AX 1474 MOVQ incx+16(FP), CX 1475 MOVQ ys+24(FP), DX 1476 MOVQ incy+32(FP), BX 1477 MOVQ n+40(FP), SI 1478 SHLQ $0x02, SI 1479 IMULQ CX, SI 1480 ADDQ AX, SI 1481 JMP check_limit 1482 PCALIGN $0x08 1483 NOP 1484 NOP 1485 NOP 1486 NOP 1487 NOP 1488 NOP 1489 NOP 1490 1491 loop: 1492 MOVSS (AX), X1 1493 MULSS X0, X1 1494 ADDSS (DX), X1 1495 MOVSS X1, (DX) 1496 LEAQ (AX)(CX*4), AX 1497 LEAQ (DX)(BX*4), DX 1498 1499 check_limit: 1500 CMPQ SI, AX 1501 JHI loop 1502 RET 1503 1504 // func AmdAxpyPointer_V1A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1505 // Requires: SSE 1506 TEXT ·AmdAxpyPointer_V1A15(SB), NOSPLIT, $0-48 1507 MOVSS 
alpha+0(FP), X0 1508 MOVQ xs+8(FP), AX 1509 MOVQ incx+16(FP), CX 1510 MOVQ ys+24(FP), DX 1511 MOVQ incy+32(FP), BX 1512 MOVQ n+40(FP), SI 1513 SHLQ $0x02, SI 1514 IMULQ CX, SI 1515 ADDQ AX, SI 1516 JMP check_limit 1517 PCALIGN $0x08 1518 NOP 1519 NOP 1520 NOP 1521 NOP 1522 NOP 1523 NOP 1524 NOP 1525 1526 loop: 1527 MOVSS (AX), X1 1528 MULSS X0, X1 1529 ADDSS (DX), X1 1530 MOVSS X1, (DX) 1531 LEAQ (AX)(CX*4), AX 1532 LEAQ (DX)(BX*4), DX 1533 1534 check_limit: 1535 CMPQ SI, AX 1536 JHI loop 1537 RET 1538 1539 // func AmdAxpyPointer_V2A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1540 // Requires: SSE 1541 TEXT ·AmdAxpyPointer_V2A15(SB), NOSPLIT, $0-48 1542 MOVSS alpha+0(FP), X0 1543 MOVQ xs+8(FP), AX 1544 MOVQ incx+16(FP), CX 1545 MOVQ ys+24(FP), DX 1546 MOVQ incy+32(FP), BX 1547 MOVQ n+40(FP), SI 1548 SHLQ $0x02, SI 1549 IMULQ CX, SI 1550 ADDQ AX, SI 1551 JMP check_limit 1552 PCALIGN $0x08 1553 NOP 1554 NOP 1555 NOP 1556 NOP 1557 NOP 1558 NOP 1559 NOP 1560 1561 loop: 1562 MOVSS (AX), X1 1563 MULSS X0, X1 1564 ADDSS (DX), X1 1565 MOVSS X1, (DX) 1566 LEAQ (AX)(CX*4), AX 1567 LEAQ (DX)(BX*4), DX 1568 1569 check_limit: 1570 CMPQ SI, AX 1571 JHI loop 1572 RET 1573 1574 // func AmdAxpyPointer_V3A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1575 // Requires: SSE 1576 TEXT ·AmdAxpyPointer_V3A15(SB), NOSPLIT, $0-48 1577 MOVSS alpha+0(FP), X0 1578 MOVQ xs+8(FP), AX 1579 MOVQ incx+16(FP), CX 1580 MOVQ ys+24(FP), DX 1581 MOVQ incy+32(FP), BX 1582 MOVQ n+40(FP), SI 1583 SHLQ $0x02, SI 1584 IMULQ CX, SI 1585 ADDQ AX, SI 1586 JMP check_limit 1587 PCALIGN $0x08 1588 NOP 1589 NOP 1590 NOP 1591 NOP 1592 NOP 1593 NOP 1594 NOP 1595 1596 loop: 1597 MOVSS (AX), X1 1598 MULSS X0, X1 1599 ADDSS (DX), X1 1600 MOVSS X1, (DX) 1601 LEAQ (AX)(CX*4), AX 1602 LEAQ (DX)(BX*4), DX 1603 1604 check_limit: 1605 CMPQ SI, AX 1606 JHI loop 1607 RET 1608 1609 // func AmdAxpyPointer_V4A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1610 // Requires: SSE 1611 TEXT ·AmdAxpyPointer_V4A15(SB), NOSPLIT, $0-48 1612 MOVSS alpha+0(FP), X0 1613 MOVQ xs+8(FP), AX 1614 MOVQ incx+16(FP), CX 1615 MOVQ ys+24(FP), DX 1616 MOVQ incy+32(FP), BX 1617 MOVQ n+40(FP), SI 1618 SHLQ $0x02, SI 1619 IMULQ CX, SI 1620 ADDQ AX, SI 1621 JMP check_limit 1622 PCALIGN $0x08 1623 NOP 1624 NOP 1625 NOP 1626 NOP 1627 NOP 1628 NOP 1629 NOP 1630 1631 loop: 1632 MOVSS (AX), X1 1633 MULSS X0, X1 1634 ADDSS (DX), X1 1635 MOVSS X1, (DX) 1636 LEAQ (AX)(CX*4), AX 1637 LEAQ (DX)(BX*4), DX 1638 1639 check_limit: 1640 CMPQ SI, AX 1641 JHI loop 1642 RET 1643 1644 // func AmdAxpyPointer_V5A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1645 // Requires: SSE 1646 TEXT ·AmdAxpyPointer_V5A15(SB), NOSPLIT, $0-48 1647 MOVSS alpha+0(FP), X0 1648 MOVQ xs+8(FP), AX 1649 MOVQ incx+16(FP), CX 1650 MOVQ ys+24(FP), DX 1651 MOVQ incy+32(FP), BX 1652 MOVQ n+40(FP), SI 1653 SHLQ $0x02, SI 1654 IMULQ CX, SI 1655 ADDQ AX, SI 1656 JMP check_limit 1657 PCALIGN $0x08 1658 NOP 1659 NOP 1660 NOP 1661 NOP 1662 NOP 1663 NOP 1664 NOP 1665 1666 loop: 1667 MOVSS (AX), X1 1668 MULSS X0, X1 1669 ADDSS (DX), X1 1670 MOVSS X1, (DX) 1671 LEAQ (AX)(CX*4), AX 1672 LEAQ (DX)(BX*4), DX 1673 1674 check_limit: 1675 CMPQ SI, AX 1676 JHI loop 1677 RET 1678 1679 // func AmdAxpyPointer_V0A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1680 // Requires: SSE 1681 TEXT ·AmdAxpyPointer_V0A16(SB), NOSPLIT, $0-48 1682 MOVSS alpha+0(FP), X0 
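	// The Ann suffix encodes how the loop label is padded: A0 variants have no
	// alignment directive, A8 uses PCALIGN $0x08, A9-A15 add one to seven NOPs
	// after the PCALIGN, and the A16 variants (such as this one) use
	// PCALIGN $0x10. The V0-V5 copies are otherwise identical, presumably so
	// benchmarks can average out code-placement effects.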
1683 MOVQ xs+8(FP), AX 1684 MOVQ incx+16(FP), CX 1685 MOVQ ys+24(FP), DX 1686 MOVQ incy+32(FP), BX 1687 MOVQ n+40(FP), SI 1688 SHLQ $0x02, SI 1689 IMULQ CX, SI 1690 ADDQ AX, SI 1691 JMP check_limit 1692 PCALIGN $0x10 1693 1694 loop: 1695 MOVSS (AX), X1 1696 MULSS X0, X1 1697 ADDSS (DX), X1 1698 MOVSS X1, (DX) 1699 LEAQ (AX)(CX*4), AX 1700 LEAQ (DX)(BX*4), DX 1701 1702 check_limit: 1703 CMPQ SI, AX 1704 JHI loop 1705 RET 1706 1707 // func AmdAxpyPointer_V1A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1708 // Requires: SSE 1709 TEXT ·AmdAxpyPointer_V1A16(SB), NOSPLIT, $0-48 1710 MOVSS alpha+0(FP), X0 1711 MOVQ xs+8(FP), AX 1712 MOVQ incx+16(FP), CX 1713 MOVQ ys+24(FP), DX 1714 MOVQ incy+32(FP), BX 1715 MOVQ n+40(FP), SI 1716 SHLQ $0x02, SI 1717 IMULQ CX, SI 1718 ADDQ AX, SI 1719 JMP check_limit 1720 PCALIGN $0x10 1721 1722 loop: 1723 MOVSS (AX), X1 1724 MULSS X0, X1 1725 ADDSS (DX), X1 1726 MOVSS X1, (DX) 1727 LEAQ (AX)(CX*4), AX 1728 LEAQ (DX)(BX*4), DX 1729 1730 check_limit: 1731 CMPQ SI, AX 1732 JHI loop 1733 RET 1734 1735 // func AmdAxpyPointer_V2A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1736 // Requires: SSE 1737 TEXT ·AmdAxpyPointer_V2A16(SB), NOSPLIT, $0-48 1738 MOVSS alpha+0(FP), X0 1739 MOVQ xs+8(FP), AX 1740 MOVQ incx+16(FP), CX 1741 MOVQ ys+24(FP), DX 1742 MOVQ incy+32(FP), BX 1743 MOVQ n+40(FP), SI 1744 SHLQ $0x02, SI 1745 IMULQ CX, SI 1746 ADDQ AX, SI 1747 JMP check_limit 1748 PCALIGN $0x10 1749 1750 loop: 1751 MOVSS (AX), X1 1752 MULSS X0, X1 1753 ADDSS (DX), X1 1754 MOVSS X1, (DX) 1755 LEAQ (AX)(CX*4), AX 1756 LEAQ (DX)(BX*4), DX 1757 1758 check_limit: 1759 CMPQ SI, AX 1760 JHI loop 1761 RET 1762 1763 // func AmdAxpyPointer_V3A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1764 // Requires: SSE 1765 TEXT ·AmdAxpyPointer_V3A16(SB), NOSPLIT, $0-48 1766 MOVSS alpha+0(FP), X0 1767 MOVQ xs+8(FP), AX 1768 MOVQ incx+16(FP), CX 1769 MOVQ ys+24(FP), DX 1770 MOVQ incy+32(FP), BX 1771 MOVQ n+40(FP), SI 1772 SHLQ $0x02, SI 1773 IMULQ CX, SI 1774 ADDQ AX, SI 1775 JMP check_limit 1776 PCALIGN $0x10 1777 1778 loop: 1779 MOVSS (AX), X1 1780 MULSS X0, X1 1781 ADDSS (DX), X1 1782 MOVSS X1, (DX) 1783 LEAQ (AX)(CX*4), AX 1784 LEAQ (DX)(BX*4), DX 1785 1786 check_limit: 1787 CMPQ SI, AX 1788 JHI loop 1789 RET 1790 1791 // func AmdAxpyPointer_V4A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1792 // Requires: SSE 1793 TEXT ·AmdAxpyPointer_V4A16(SB), NOSPLIT, $0-48 1794 MOVSS alpha+0(FP), X0 1795 MOVQ xs+8(FP), AX 1796 MOVQ incx+16(FP), CX 1797 MOVQ ys+24(FP), DX 1798 MOVQ incy+32(FP), BX 1799 MOVQ n+40(FP), SI 1800 SHLQ $0x02, SI 1801 IMULQ CX, SI 1802 ADDQ AX, SI 1803 JMP check_limit 1804 PCALIGN $0x10 1805 1806 loop: 1807 MOVSS (AX), X1 1808 MULSS X0, X1 1809 ADDSS (DX), X1 1810 MOVSS X1, (DX) 1811 LEAQ (AX)(CX*4), AX 1812 LEAQ (DX)(BX*4), DX 1813 1814 check_limit: 1815 CMPQ SI, AX 1816 JHI loop 1817 RET 1818 1819 // func AmdAxpyPointer_V5A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1820 // Requires: SSE 1821 TEXT ·AmdAxpyPointer_V5A16(SB), NOSPLIT, $0-48 1822 MOVSS alpha+0(FP), X0 1823 MOVQ xs+8(FP), AX 1824 MOVQ incx+16(FP), CX 1825 MOVQ ys+24(FP), DX 1826 MOVQ incy+32(FP), BX 1827 MOVQ n+40(FP), SI 1828 SHLQ $0x02, SI 1829 IMULQ CX, SI 1830 ADDQ AX, SI 1831 JMP check_limit 1832 PCALIGN $0x10 1833 1834 loop: 1835 MOVSS (AX), X1 1836 MULSS X0, X1 1837 ADDSS (DX), X1 1838 MOVSS X1, (DX) 1839 LEAQ (AX)(CX*4), 
AX 1840 LEAQ (DX)(BX*4), DX 1841 1842 check_limit: 1843 CMPQ SI, AX 1844 JHI loop 1845 RET 1846 1847 // func AmdAxpyPointerLoop_V0A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1848 // Requires: SSE 1849 TEXT ·AmdAxpyPointerLoop_V0A0(SB), NOSPLIT, $0-48 1850 MOVSS alpha+0(FP), X0 1851 MOVQ xs+8(FP), AX 1852 MOVQ incx+16(FP), CX 1853 MOVQ ys+24(FP), DX 1854 MOVQ incy+32(FP), BX 1855 MOVQ n+40(FP), SI 1856 XORQ DI, DI 1857 JMP check_limit 1858 1859 loop: 1860 MOVSS (AX), X1 1861 MULSS X0, X1 1862 ADDSS (DX), X1 1863 MOVSS X1, (DX) 1864 INCQ DI 1865 LEAQ (AX)(CX*4), AX 1866 LEAQ (DX)(BX*4), DX 1867 1868 check_limit: 1869 CMPQ SI, DI 1870 JHI loop 1871 RET 1872 1873 // func AmdAxpyPointerLoop_V1A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1874 // Requires: SSE 1875 TEXT ·AmdAxpyPointerLoop_V1A0(SB), NOSPLIT, $0-48 1876 MOVSS alpha+0(FP), X0 1877 MOVQ xs+8(FP), AX 1878 MOVQ incx+16(FP), CX 1879 MOVQ ys+24(FP), DX 1880 MOVQ incy+32(FP), BX 1881 MOVQ n+40(FP), SI 1882 XORQ DI, DI 1883 JMP check_limit 1884 1885 loop: 1886 MOVSS (AX), X1 1887 MULSS X0, X1 1888 ADDSS (DX), X1 1889 MOVSS X1, (DX) 1890 INCQ DI 1891 LEAQ (AX)(CX*4), AX 1892 LEAQ (DX)(BX*4), DX 1893 1894 check_limit: 1895 CMPQ SI, DI 1896 JHI loop 1897 RET 1898 1899 // func AmdAxpyPointerLoop_V2A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1900 // Requires: SSE 1901 TEXT ·AmdAxpyPointerLoop_V2A0(SB), NOSPLIT, $0-48 1902 MOVSS alpha+0(FP), X0 1903 MOVQ xs+8(FP), AX 1904 MOVQ incx+16(FP), CX 1905 MOVQ ys+24(FP), DX 1906 MOVQ incy+32(FP), BX 1907 MOVQ n+40(FP), SI 1908 XORQ DI, DI 1909 JMP check_limit 1910 1911 loop: 1912 MOVSS (AX), X1 1913 MULSS X0, X1 1914 ADDSS (DX), X1 1915 MOVSS X1, (DX) 1916 INCQ DI 1917 LEAQ (AX)(CX*4), AX 1918 LEAQ (DX)(BX*4), DX 1919 1920 check_limit: 1921 CMPQ SI, DI 1922 JHI loop 1923 RET 1924 1925 // func AmdAxpyPointerLoop_V3A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1926 // Requires: SSE 1927 TEXT ·AmdAxpyPointerLoop_V3A0(SB), NOSPLIT, $0-48 1928 MOVSS alpha+0(FP), X0 1929 MOVQ xs+8(FP), AX 1930 MOVQ incx+16(FP), CX 1931 MOVQ ys+24(FP), DX 1932 MOVQ incy+32(FP), BX 1933 MOVQ n+40(FP), SI 1934 XORQ DI, DI 1935 JMP check_limit 1936 1937 loop: 1938 MOVSS (AX), X1 1939 MULSS X0, X1 1940 ADDSS (DX), X1 1941 MOVSS X1, (DX) 1942 INCQ DI 1943 LEAQ (AX)(CX*4), AX 1944 LEAQ (DX)(BX*4), DX 1945 1946 check_limit: 1947 CMPQ SI, DI 1948 JHI loop 1949 RET 1950 1951 // func AmdAxpyPointerLoop_V4A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1952 // Requires: SSE 1953 TEXT ·AmdAxpyPointerLoop_V4A0(SB), NOSPLIT, $0-48 1954 MOVSS alpha+0(FP), X0 1955 MOVQ xs+8(FP), AX 1956 MOVQ incx+16(FP), CX 1957 MOVQ ys+24(FP), DX 1958 MOVQ incy+32(FP), BX 1959 MOVQ n+40(FP), SI 1960 XORQ DI, DI 1961 JMP check_limit 1962 1963 loop: 1964 MOVSS (AX), X1 1965 MULSS X0, X1 1966 ADDSS (DX), X1 1967 MOVSS X1, (DX) 1968 INCQ DI 1969 LEAQ (AX)(CX*4), AX 1970 LEAQ (DX)(BX*4), DX 1971 1972 check_limit: 1973 CMPQ SI, DI 1974 JHI loop 1975 RET 1976 1977 // func AmdAxpyPointerLoop_V5A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 1978 // Requires: SSE 1979 TEXT ·AmdAxpyPointerLoop_V5A0(SB), NOSPLIT, $0-48 1980 MOVSS alpha+0(FP), X0 1981 MOVQ xs+8(FP), AX 1982 MOVQ incx+16(FP), CX 1983 MOVQ ys+24(FP), DX 1984 MOVQ incy+32(FP), BX 1985 MOVQ n+40(FP), SI 1986 XORQ DI, DI 1987 JMP check_limit 1988 1989 loop: 1990 MOVSS (AX), X1 1991 MULSS 
X0, X1 1992 ADDSS (DX), X1 1993 MOVSS X1, (DX) 1994 INCQ DI 1995 LEAQ (AX)(CX*4), AX 1996 LEAQ (DX)(BX*4), DX 1997 1998 check_limit: 1999 CMPQ SI, DI 2000 JHI loop 2001 RET 2002 2003 // func AmdAxpyPointerLoop_V0A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2004 // Requires: SSE 2005 TEXT ·AmdAxpyPointerLoop_V0A8(SB), NOSPLIT, $0-48 2006 MOVSS alpha+0(FP), X0 2007 MOVQ xs+8(FP), AX 2008 MOVQ incx+16(FP), CX 2009 MOVQ ys+24(FP), DX 2010 MOVQ incy+32(FP), BX 2011 MOVQ n+40(FP), SI 2012 XORQ DI, DI 2013 JMP check_limit 2014 PCALIGN $0x08 2015 2016 loop: 2017 MOVSS (AX), X1 2018 MULSS X0, X1 2019 ADDSS (DX), X1 2020 MOVSS X1, (DX) 2021 INCQ DI 2022 LEAQ (AX)(CX*4), AX 2023 LEAQ (DX)(BX*4), DX 2024 2025 check_limit: 2026 CMPQ SI, DI 2027 JHI loop 2028 RET 2029 2030 // func AmdAxpyPointerLoop_V1A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2031 // Requires: SSE 2032 TEXT ·AmdAxpyPointerLoop_V1A8(SB), NOSPLIT, $0-48 2033 MOVSS alpha+0(FP), X0 2034 MOVQ xs+8(FP), AX 2035 MOVQ incx+16(FP), CX 2036 MOVQ ys+24(FP), DX 2037 MOVQ incy+32(FP), BX 2038 MOVQ n+40(FP), SI 2039 XORQ DI, DI 2040 JMP check_limit 2041 PCALIGN $0x08 2042 2043 loop: 2044 MOVSS (AX), X1 2045 MULSS X0, X1 2046 ADDSS (DX), X1 2047 MOVSS X1, (DX) 2048 INCQ DI 2049 LEAQ (AX)(CX*4), AX 2050 LEAQ (DX)(BX*4), DX 2051 2052 check_limit: 2053 CMPQ SI, DI 2054 JHI loop 2055 RET 2056 2057 // func AmdAxpyPointerLoop_V2A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2058 // Requires: SSE 2059 TEXT ·AmdAxpyPointerLoop_V2A8(SB), NOSPLIT, $0-48 2060 MOVSS alpha+0(FP), X0 2061 MOVQ xs+8(FP), AX 2062 MOVQ incx+16(FP), CX 2063 MOVQ ys+24(FP), DX 2064 MOVQ incy+32(FP), BX 2065 MOVQ n+40(FP), SI 2066 XORQ DI, DI 2067 JMP check_limit 2068 PCALIGN $0x08 2069 2070 loop: 2071 MOVSS (AX), X1 2072 MULSS X0, X1 2073 ADDSS (DX), X1 2074 MOVSS X1, (DX) 2075 INCQ DI 2076 LEAQ (AX)(CX*4), AX 2077 LEAQ (DX)(BX*4), DX 2078 2079 check_limit: 2080 CMPQ SI, DI 2081 JHI loop 2082 RET 2083 2084 // func AmdAxpyPointerLoop_V3A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2085 // Requires: SSE 2086 TEXT ·AmdAxpyPointerLoop_V3A8(SB), NOSPLIT, $0-48 2087 MOVSS alpha+0(FP), X0 2088 MOVQ xs+8(FP), AX 2089 MOVQ incx+16(FP), CX 2090 MOVQ ys+24(FP), DX 2091 MOVQ incy+32(FP), BX 2092 MOVQ n+40(FP), SI 2093 XORQ DI, DI 2094 JMP check_limit 2095 PCALIGN $0x08 2096 2097 loop: 2098 MOVSS (AX), X1 2099 MULSS X0, X1 2100 ADDSS (DX), X1 2101 MOVSS X1, (DX) 2102 INCQ DI 2103 LEAQ (AX)(CX*4), AX 2104 LEAQ (DX)(BX*4), DX 2105 2106 check_limit: 2107 CMPQ SI, DI 2108 JHI loop 2109 RET 2110 2111 // func AmdAxpyPointerLoop_V4A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2112 // Requires: SSE 2113 TEXT ·AmdAxpyPointerLoop_V4A8(SB), NOSPLIT, $0-48 2114 MOVSS alpha+0(FP), X0 2115 MOVQ xs+8(FP), AX 2116 MOVQ incx+16(FP), CX 2117 MOVQ ys+24(FP), DX 2118 MOVQ incy+32(FP), BX 2119 MOVQ n+40(FP), SI 2120 XORQ DI, DI 2121 JMP check_limit 2122 PCALIGN $0x08 2123 2124 loop: 2125 MOVSS (AX), X1 2126 MULSS X0, X1 2127 ADDSS (DX), X1 2128 MOVSS X1, (DX) 2129 INCQ DI 2130 LEAQ (AX)(CX*4), AX 2131 LEAQ (DX)(BX*4), DX 2132 2133 check_limit: 2134 CMPQ SI, DI 2135 JHI loop 2136 RET 2137 2138 // func AmdAxpyPointerLoop_V5A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2139 // Requires: SSE 2140 TEXT ·AmdAxpyPointerLoop_V5A8(SB), NOSPLIT, $0-48 2141 MOVSS alpha+0(FP), X0 2142 MOVQ xs+8(FP), AX 2143 
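	// The AmdAxpyPointerLoop_* variants keep an explicit iteration counter:
	// DI is zeroed before the loop, INCQ DI advances it each pass, and
	// check_limit compares it against n (held in SI) rather than against a
	// precomputed end pointer as in the AmdAxpyPointer_* family above.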
MOVQ incx+16(FP), CX 2144 MOVQ ys+24(FP), DX 2145 MOVQ incy+32(FP), BX 2146 MOVQ n+40(FP), SI 2147 XORQ DI, DI 2148 JMP check_limit 2149 PCALIGN $0x08 2150 2151 loop: 2152 MOVSS (AX), X1 2153 MULSS X0, X1 2154 ADDSS (DX), X1 2155 MOVSS X1, (DX) 2156 INCQ DI 2157 LEAQ (AX)(CX*4), AX 2158 LEAQ (DX)(BX*4), DX 2159 2160 check_limit: 2161 CMPQ SI, DI 2162 JHI loop 2163 RET 2164 2165 // func AmdAxpyPointerLoop_V0A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2166 // Requires: SSE 2167 TEXT ·AmdAxpyPointerLoop_V0A9(SB), NOSPLIT, $0-48 2168 MOVSS alpha+0(FP), X0 2169 MOVQ xs+8(FP), AX 2170 MOVQ incx+16(FP), CX 2171 MOVQ ys+24(FP), DX 2172 MOVQ incy+32(FP), BX 2173 MOVQ n+40(FP), SI 2174 XORQ DI, DI 2175 JMP check_limit 2176 PCALIGN $0x08 2177 NOP 2178 2179 loop: 2180 MOVSS (AX), X1 2181 MULSS X0, X1 2182 ADDSS (DX), X1 2183 MOVSS X1, (DX) 2184 INCQ DI 2185 LEAQ (AX)(CX*4), AX 2186 LEAQ (DX)(BX*4), DX 2187 2188 check_limit: 2189 CMPQ SI, DI 2190 JHI loop 2191 RET 2192 2193 // func AmdAxpyPointerLoop_V1A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2194 // Requires: SSE 2195 TEXT ·AmdAxpyPointerLoop_V1A9(SB), NOSPLIT, $0-48 2196 MOVSS alpha+0(FP), X0 2197 MOVQ xs+8(FP), AX 2198 MOVQ incx+16(FP), CX 2199 MOVQ ys+24(FP), DX 2200 MOVQ incy+32(FP), BX 2201 MOVQ n+40(FP), SI 2202 XORQ DI, DI 2203 JMP check_limit 2204 PCALIGN $0x08 2205 NOP 2206 2207 loop: 2208 MOVSS (AX), X1 2209 MULSS X0, X1 2210 ADDSS (DX), X1 2211 MOVSS X1, (DX) 2212 INCQ DI 2213 LEAQ (AX)(CX*4), AX 2214 LEAQ (DX)(BX*4), DX 2215 2216 check_limit: 2217 CMPQ SI, DI 2218 JHI loop 2219 RET 2220 2221 // func AmdAxpyPointerLoop_V2A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2222 // Requires: SSE 2223 TEXT ·AmdAxpyPointerLoop_V2A9(SB), NOSPLIT, $0-48 2224 MOVSS alpha+0(FP), X0 2225 MOVQ xs+8(FP), AX 2226 MOVQ incx+16(FP), CX 2227 MOVQ ys+24(FP), DX 2228 MOVQ incy+32(FP), BX 2229 MOVQ n+40(FP), SI 2230 XORQ DI, DI 2231 JMP check_limit 2232 PCALIGN $0x08 2233 NOP 2234 2235 loop: 2236 MOVSS (AX), X1 2237 MULSS X0, X1 2238 ADDSS (DX), X1 2239 MOVSS X1, (DX) 2240 INCQ DI 2241 LEAQ (AX)(CX*4), AX 2242 LEAQ (DX)(BX*4), DX 2243 2244 check_limit: 2245 CMPQ SI, DI 2246 JHI loop 2247 RET 2248 2249 // func AmdAxpyPointerLoop_V3A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2250 // Requires: SSE 2251 TEXT ·AmdAxpyPointerLoop_V3A9(SB), NOSPLIT, $0-48 2252 MOVSS alpha+0(FP), X0 2253 MOVQ xs+8(FP), AX 2254 MOVQ incx+16(FP), CX 2255 MOVQ ys+24(FP), DX 2256 MOVQ incy+32(FP), BX 2257 MOVQ n+40(FP), SI 2258 XORQ DI, DI 2259 JMP check_limit 2260 PCALIGN $0x08 2261 NOP 2262 2263 loop: 2264 MOVSS (AX), X1 2265 MULSS X0, X1 2266 ADDSS (DX), X1 2267 MOVSS X1, (DX) 2268 INCQ DI 2269 LEAQ (AX)(CX*4), AX 2270 LEAQ (DX)(BX*4), DX 2271 2272 check_limit: 2273 CMPQ SI, DI 2274 JHI loop 2275 RET 2276 2277 // func AmdAxpyPointerLoop_V4A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2278 // Requires: SSE 2279 TEXT ·AmdAxpyPointerLoop_V4A9(SB), NOSPLIT, $0-48 2280 MOVSS alpha+0(FP), X0 2281 MOVQ xs+8(FP), AX 2282 MOVQ incx+16(FP), CX 2283 MOVQ ys+24(FP), DX 2284 MOVQ incy+32(FP), BX 2285 MOVQ n+40(FP), SI 2286 XORQ DI, DI 2287 JMP check_limit 2288 PCALIGN $0x08 2289 NOP 2290 2291 loop: 2292 MOVSS (AX), X1 2293 MULSS X0, X1 2294 ADDSS (DX), X1 2295 MOVSS X1, (DX) 2296 INCQ DI 2297 LEAQ (AX)(CX*4), AX 2298 LEAQ (DX)(BX*4), DX 2299 2300 check_limit: 2301 CMPQ SI, DI 2302 JHI loop 2303 RET 2304 2305 // 
func AmdAxpyPointerLoop_V5A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2306 // Requires: SSE 2307 TEXT ·AmdAxpyPointerLoop_V5A9(SB), NOSPLIT, $0-48 2308 MOVSS alpha+0(FP), X0 2309 MOVQ xs+8(FP), AX 2310 MOVQ incx+16(FP), CX 2311 MOVQ ys+24(FP), DX 2312 MOVQ incy+32(FP), BX 2313 MOVQ n+40(FP), SI 2314 XORQ DI, DI 2315 JMP check_limit 2316 PCALIGN $0x08 2317 NOP 2318 2319 loop: 2320 MOVSS (AX), X1 2321 MULSS X0, X1 2322 ADDSS (DX), X1 2323 MOVSS X1, (DX) 2324 INCQ DI 2325 LEAQ (AX)(CX*4), AX 2326 LEAQ (DX)(BX*4), DX 2327 2328 check_limit: 2329 CMPQ SI, DI 2330 JHI loop 2331 RET 2332 2333 // func AmdAxpyPointerLoop_V0A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2334 // Requires: SSE 2335 TEXT ·AmdAxpyPointerLoop_V0A10(SB), NOSPLIT, $0-48 2336 MOVSS alpha+0(FP), X0 2337 MOVQ xs+8(FP), AX 2338 MOVQ incx+16(FP), CX 2339 MOVQ ys+24(FP), DX 2340 MOVQ incy+32(FP), BX 2341 MOVQ n+40(FP), SI 2342 XORQ DI, DI 2343 JMP check_limit 2344 PCALIGN $0x08 2345 NOP 2346 NOP 2347 2348 loop: 2349 MOVSS (AX), X1 2350 MULSS X0, X1 2351 ADDSS (DX), X1 2352 MOVSS X1, (DX) 2353 INCQ DI 2354 LEAQ (AX)(CX*4), AX 2355 LEAQ (DX)(BX*4), DX 2356 2357 check_limit: 2358 CMPQ SI, DI 2359 JHI loop 2360 RET 2361 2362 // func AmdAxpyPointerLoop_V1A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2363 // Requires: SSE 2364 TEXT ·AmdAxpyPointerLoop_V1A10(SB), NOSPLIT, $0-48 2365 MOVSS alpha+0(FP), X0 2366 MOVQ xs+8(FP), AX 2367 MOVQ incx+16(FP), CX 2368 MOVQ ys+24(FP), DX 2369 MOVQ incy+32(FP), BX 2370 MOVQ n+40(FP), SI 2371 XORQ DI, DI 2372 JMP check_limit 2373 PCALIGN $0x08 2374 NOP 2375 NOP 2376 2377 loop: 2378 MOVSS (AX), X1 2379 MULSS X0, X1 2380 ADDSS (DX), X1 2381 MOVSS X1, (DX) 2382 INCQ DI 2383 LEAQ (AX)(CX*4), AX 2384 LEAQ (DX)(BX*4), DX 2385 2386 check_limit: 2387 CMPQ SI, DI 2388 JHI loop 2389 RET 2390 2391 // func AmdAxpyPointerLoop_V2A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2392 // Requires: SSE 2393 TEXT ·AmdAxpyPointerLoop_V2A10(SB), NOSPLIT, $0-48 2394 MOVSS alpha+0(FP), X0 2395 MOVQ xs+8(FP), AX 2396 MOVQ incx+16(FP), CX 2397 MOVQ ys+24(FP), DX 2398 MOVQ incy+32(FP), BX 2399 MOVQ n+40(FP), SI 2400 XORQ DI, DI 2401 JMP check_limit 2402 PCALIGN $0x08 2403 NOP 2404 NOP 2405 2406 loop: 2407 MOVSS (AX), X1 2408 MULSS X0, X1 2409 ADDSS (DX), X1 2410 MOVSS X1, (DX) 2411 INCQ DI 2412 LEAQ (AX)(CX*4), AX 2413 LEAQ (DX)(BX*4), DX 2414 2415 check_limit: 2416 CMPQ SI, DI 2417 JHI loop 2418 RET 2419 2420 // func AmdAxpyPointerLoop_V3A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2421 // Requires: SSE 2422 TEXT ·AmdAxpyPointerLoop_V3A10(SB), NOSPLIT, $0-48 2423 MOVSS alpha+0(FP), X0 2424 MOVQ xs+8(FP), AX 2425 MOVQ incx+16(FP), CX 2426 MOVQ ys+24(FP), DX 2427 MOVQ incy+32(FP), BX 2428 MOVQ n+40(FP), SI 2429 XORQ DI, DI 2430 JMP check_limit 2431 PCALIGN $0x08 2432 NOP 2433 NOP 2434 2435 loop: 2436 MOVSS (AX), X1 2437 MULSS X0, X1 2438 ADDSS (DX), X1 2439 MOVSS X1, (DX) 2440 INCQ DI 2441 LEAQ (AX)(CX*4), AX 2442 LEAQ (DX)(BX*4), DX 2443 2444 check_limit: 2445 CMPQ SI, DI 2446 JHI loop 2447 RET 2448 2449 // func AmdAxpyPointerLoop_V4A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2450 // Requires: SSE 2451 TEXT ·AmdAxpyPointerLoop_V4A10(SB), NOSPLIT, $0-48 2452 MOVSS alpha+0(FP), X0 2453 MOVQ xs+8(FP), AX 2454 MOVQ incx+16(FP), CX 2455 MOVQ ys+24(FP), DX 2456 MOVQ incy+32(FP), BX 2457 MOVQ n+40(FP), SI 
2458 XORQ DI, DI 2459 JMP check_limit 2460 PCALIGN $0x08 2461 NOP 2462 NOP 2463 2464 loop: 2465 MOVSS (AX), X1 2466 MULSS X0, X1 2467 ADDSS (DX), X1 2468 MOVSS X1, (DX) 2469 INCQ DI 2470 LEAQ (AX)(CX*4), AX 2471 LEAQ (DX)(BX*4), DX 2472 2473 check_limit: 2474 CMPQ SI, DI 2475 JHI loop 2476 RET 2477 2478 // func AmdAxpyPointerLoop_V5A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2479 // Requires: SSE 2480 TEXT ·AmdAxpyPointerLoop_V5A10(SB), NOSPLIT, $0-48 2481 MOVSS alpha+0(FP), X0 2482 MOVQ xs+8(FP), AX 2483 MOVQ incx+16(FP), CX 2484 MOVQ ys+24(FP), DX 2485 MOVQ incy+32(FP), BX 2486 MOVQ n+40(FP), SI 2487 XORQ DI, DI 2488 JMP check_limit 2489 PCALIGN $0x08 2490 NOP 2491 NOP 2492 2493 loop: 2494 MOVSS (AX), X1 2495 MULSS X0, X1 2496 ADDSS (DX), X1 2497 MOVSS X1, (DX) 2498 INCQ DI 2499 LEAQ (AX)(CX*4), AX 2500 LEAQ (DX)(BX*4), DX 2501 2502 check_limit: 2503 CMPQ SI, DI 2504 JHI loop 2505 RET 2506 2507 // func AmdAxpyPointerLoop_V0A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2508 // Requires: SSE 2509 TEXT ·AmdAxpyPointerLoop_V0A11(SB), NOSPLIT, $0-48 2510 MOVSS alpha+0(FP), X0 2511 MOVQ xs+8(FP), AX 2512 MOVQ incx+16(FP), CX 2513 MOVQ ys+24(FP), DX 2514 MOVQ incy+32(FP), BX 2515 MOVQ n+40(FP), SI 2516 XORQ DI, DI 2517 JMP check_limit 2518 PCALIGN $0x08 2519 NOP 2520 NOP 2521 NOP 2522 2523 loop: 2524 MOVSS (AX), X1 2525 MULSS X0, X1 2526 ADDSS (DX), X1 2527 MOVSS X1, (DX) 2528 INCQ DI 2529 LEAQ (AX)(CX*4), AX 2530 LEAQ (DX)(BX*4), DX 2531 2532 check_limit: 2533 CMPQ SI, DI 2534 JHI loop 2535 RET 2536 2537 // func AmdAxpyPointerLoop_V1A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2538 // Requires: SSE 2539 TEXT ·AmdAxpyPointerLoop_V1A11(SB), NOSPLIT, $0-48 2540 MOVSS alpha+0(FP), X0 2541 MOVQ xs+8(FP), AX 2542 MOVQ incx+16(FP), CX 2543 MOVQ ys+24(FP), DX 2544 MOVQ incy+32(FP), BX 2545 MOVQ n+40(FP), SI 2546 XORQ DI, DI 2547 JMP check_limit 2548 PCALIGN $0x08 2549 NOP 2550 NOP 2551 NOP 2552 2553 loop: 2554 MOVSS (AX), X1 2555 MULSS X0, X1 2556 ADDSS (DX), X1 2557 MOVSS X1, (DX) 2558 INCQ DI 2559 LEAQ (AX)(CX*4), AX 2560 LEAQ (DX)(BX*4), DX 2561 2562 check_limit: 2563 CMPQ SI, DI 2564 JHI loop 2565 RET 2566 2567 // func AmdAxpyPointerLoop_V2A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2568 // Requires: SSE 2569 TEXT ·AmdAxpyPointerLoop_V2A11(SB), NOSPLIT, $0-48 2570 MOVSS alpha+0(FP), X0 2571 MOVQ xs+8(FP), AX 2572 MOVQ incx+16(FP), CX 2573 MOVQ ys+24(FP), DX 2574 MOVQ incy+32(FP), BX 2575 MOVQ n+40(FP), SI 2576 XORQ DI, DI 2577 JMP check_limit 2578 PCALIGN $0x08 2579 NOP 2580 NOP 2581 NOP 2582 2583 loop: 2584 MOVSS (AX), X1 2585 MULSS X0, X1 2586 ADDSS (DX), X1 2587 MOVSS X1, (DX) 2588 INCQ DI 2589 LEAQ (AX)(CX*4), AX 2590 LEAQ (DX)(BX*4), DX 2591 2592 check_limit: 2593 CMPQ SI, DI 2594 JHI loop 2595 RET 2596 2597 // func AmdAxpyPointerLoop_V3A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2598 // Requires: SSE 2599 TEXT ·AmdAxpyPointerLoop_V3A11(SB), NOSPLIT, $0-48 2600 MOVSS alpha+0(FP), X0 2601 MOVQ xs+8(FP), AX 2602 MOVQ incx+16(FP), CX 2603 MOVQ ys+24(FP), DX 2604 MOVQ incy+32(FP), BX 2605 MOVQ n+40(FP), SI 2606 XORQ DI, DI 2607 JMP check_limit 2608 PCALIGN $0x08 2609 NOP 2610 NOP 2611 NOP 2612 2613 loop: 2614 MOVSS (AX), X1 2615 MULSS X0, X1 2616 ADDSS (DX), X1 2617 MOVSS X1, (DX) 2618 INCQ DI 2619 LEAQ (AX)(CX*4), AX 2620 LEAQ (DX)(BX*4), DX 2621 2622 check_limit: 2623 CMPQ SI, DI 2624 JHI loop 2625 
RET 2626 2627 // func AmdAxpyPointerLoop_V4A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2628 // Requires: SSE 2629 TEXT ·AmdAxpyPointerLoop_V4A11(SB), NOSPLIT, $0-48 2630 MOVSS alpha+0(FP), X0 2631 MOVQ xs+8(FP), AX 2632 MOVQ incx+16(FP), CX 2633 MOVQ ys+24(FP), DX 2634 MOVQ incy+32(FP), BX 2635 MOVQ n+40(FP), SI 2636 XORQ DI, DI 2637 JMP check_limit 2638 PCALIGN $0x08 2639 NOP 2640 NOP 2641 NOP 2642 2643 loop: 2644 MOVSS (AX), X1 2645 MULSS X0, X1 2646 ADDSS (DX), X1 2647 MOVSS X1, (DX) 2648 INCQ DI 2649 LEAQ (AX)(CX*4), AX 2650 LEAQ (DX)(BX*4), DX 2651 2652 check_limit: 2653 CMPQ SI, DI 2654 JHI loop 2655 RET 2656 2657 // func AmdAxpyPointerLoop_V5A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2658 // Requires: SSE 2659 TEXT ·AmdAxpyPointerLoop_V5A11(SB), NOSPLIT, $0-48 2660 MOVSS alpha+0(FP), X0 2661 MOVQ xs+8(FP), AX 2662 MOVQ incx+16(FP), CX 2663 MOVQ ys+24(FP), DX 2664 MOVQ incy+32(FP), BX 2665 MOVQ n+40(FP), SI 2666 XORQ DI, DI 2667 JMP check_limit 2668 PCALIGN $0x08 2669 NOP 2670 NOP 2671 NOP 2672 2673 loop: 2674 MOVSS (AX), X1 2675 MULSS X0, X1 2676 ADDSS (DX), X1 2677 MOVSS X1, (DX) 2678 INCQ DI 2679 LEAQ (AX)(CX*4), AX 2680 LEAQ (DX)(BX*4), DX 2681 2682 check_limit: 2683 CMPQ SI, DI 2684 JHI loop 2685 RET 2686 2687 // func AmdAxpyPointerLoop_V0A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2688 // Requires: SSE 2689 TEXT ·AmdAxpyPointerLoop_V0A12(SB), NOSPLIT, $0-48 2690 MOVSS alpha+0(FP), X0 2691 MOVQ xs+8(FP), AX 2692 MOVQ incx+16(FP), CX 2693 MOVQ ys+24(FP), DX 2694 MOVQ incy+32(FP), BX 2695 MOVQ n+40(FP), SI 2696 XORQ DI, DI 2697 JMP check_limit 2698 PCALIGN $0x08 2699 NOP 2700 NOP 2701 NOP 2702 NOP 2703 2704 loop: 2705 MOVSS (AX), X1 2706 MULSS X0, X1 2707 ADDSS (DX), X1 2708 MOVSS X1, (DX) 2709 INCQ DI 2710 LEAQ (AX)(CX*4), AX 2711 LEAQ (DX)(BX*4), DX 2712 2713 check_limit: 2714 CMPQ SI, DI 2715 JHI loop 2716 RET 2717 2718 // func AmdAxpyPointerLoop_V1A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2719 // Requires: SSE 2720 TEXT ·AmdAxpyPointerLoop_V1A12(SB), NOSPLIT, $0-48 2721 MOVSS alpha+0(FP), X0 2722 MOVQ xs+8(FP), AX 2723 MOVQ incx+16(FP), CX 2724 MOVQ ys+24(FP), DX 2725 MOVQ incy+32(FP), BX 2726 MOVQ n+40(FP), SI 2727 XORQ DI, DI 2728 JMP check_limit 2729 PCALIGN $0x08 2730 NOP 2731 NOP 2732 NOP 2733 NOP 2734 2735 loop: 2736 MOVSS (AX), X1 2737 MULSS X0, X1 2738 ADDSS (DX), X1 2739 MOVSS X1, (DX) 2740 INCQ DI 2741 LEAQ (AX)(CX*4), AX 2742 LEAQ (DX)(BX*4), DX 2743 2744 check_limit: 2745 CMPQ SI, DI 2746 JHI loop 2747 RET 2748 2749 // func AmdAxpyPointerLoop_V2A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2750 // Requires: SSE 2751 TEXT ·AmdAxpyPointerLoop_V2A12(SB), NOSPLIT, $0-48 2752 MOVSS alpha+0(FP), X0 2753 MOVQ xs+8(FP), AX 2754 MOVQ incx+16(FP), CX 2755 MOVQ ys+24(FP), DX 2756 MOVQ incy+32(FP), BX 2757 MOVQ n+40(FP), SI 2758 XORQ DI, DI 2759 JMP check_limit 2760 PCALIGN $0x08 2761 NOP 2762 NOP 2763 NOP 2764 NOP 2765 2766 loop: 2767 MOVSS (AX), X1 2768 MULSS X0, X1 2769 ADDSS (DX), X1 2770 MOVSS X1, (DX) 2771 INCQ DI 2772 LEAQ (AX)(CX*4), AX 2773 LEAQ (DX)(BX*4), DX 2774 2775 check_limit: 2776 CMPQ SI, DI 2777 JHI loop 2778 RET 2779 2780 // func AmdAxpyPointerLoop_V3A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2781 // Requires: SSE 2782 TEXT ·AmdAxpyPointerLoop_V3A12(SB), NOSPLIT, $0-48 2783 MOVSS alpha+0(FP), X0 2784 MOVQ xs+8(FP), AX 
2785 MOVQ incx+16(FP), CX 2786 MOVQ ys+24(FP), DX 2787 MOVQ incy+32(FP), BX 2788 MOVQ n+40(FP), SI 2789 XORQ DI, DI 2790 JMP check_limit 2791 PCALIGN $0x08 2792 NOP 2793 NOP 2794 NOP 2795 NOP 2796 2797 loop: 2798 MOVSS (AX), X1 2799 MULSS X0, X1 2800 ADDSS (DX), X1 2801 MOVSS X1, (DX) 2802 INCQ DI 2803 LEAQ (AX)(CX*4), AX 2804 LEAQ (DX)(BX*4), DX 2805 2806 check_limit: 2807 CMPQ SI, DI 2808 JHI loop 2809 RET 2810 2811 // func AmdAxpyPointerLoop_V4A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2812 // Requires: SSE 2813 TEXT ·AmdAxpyPointerLoop_V4A12(SB), NOSPLIT, $0-48 2814 MOVSS alpha+0(FP), X0 2815 MOVQ xs+8(FP), AX 2816 MOVQ incx+16(FP), CX 2817 MOVQ ys+24(FP), DX 2818 MOVQ incy+32(FP), BX 2819 MOVQ n+40(FP), SI 2820 XORQ DI, DI 2821 JMP check_limit 2822 PCALIGN $0x08 2823 NOP 2824 NOP 2825 NOP 2826 NOP 2827 2828 loop: 2829 MOVSS (AX), X1 2830 MULSS X0, X1 2831 ADDSS (DX), X1 2832 MOVSS X1, (DX) 2833 INCQ DI 2834 LEAQ (AX)(CX*4), AX 2835 LEAQ (DX)(BX*4), DX 2836 2837 check_limit: 2838 CMPQ SI, DI 2839 JHI loop 2840 RET 2841 2842 // func AmdAxpyPointerLoop_V5A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2843 // Requires: SSE 2844 TEXT ·AmdAxpyPointerLoop_V5A12(SB), NOSPLIT, $0-48 2845 MOVSS alpha+0(FP), X0 2846 MOVQ xs+8(FP), AX 2847 MOVQ incx+16(FP), CX 2848 MOVQ ys+24(FP), DX 2849 MOVQ incy+32(FP), BX 2850 MOVQ n+40(FP), SI 2851 XORQ DI, DI 2852 JMP check_limit 2853 PCALIGN $0x08 2854 NOP 2855 NOP 2856 NOP 2857 NOP 2858 2859 loop: 2860 MOVSS (AX), X1 2861 MULSS X0, X1 2862 ADDSS (DX), X1 2863 MOVSS X1, (DX) 2864 INCQ DI 2865 LEAQ (AX)(CX*4), AX 2866 LEAQ (DX)(BX*4), DX 2867 2868 check_limit: 2869 CMPQ SI, DI 2870 JHI loop 2871 RET 2872 2873 // func AmdAxpyPointerLoop_V0A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2874 // Requires: SSE 2875 TEXT ·AmdAxpyPointerLoop_V0A13(SB), NOSPLIT, $0-48 2876 MOVSS alpha+0(FP), X0 2877 MOVQ xs+8(FP), AX 2878 MOVQ incx+16(FP), CX 2879 MOVQ ys+24(FP), DX 2880 MOVQ incy+32(FP), BX 2881 MOVQ n+40(FP), SI 2882 XORQ DI, DI 2883 JMP check_limit 2884 PCALIGN $0x08 2885 NOP 2886 NOP 2887 NOP 2888 NOP 2889 NOP 2890 2891 loop: 2892 MOVSS (AX), X1 2893 MULSS X0, X1 2894 ADDSS (DX), X1 2895 MOVSS X1, (DX) 2896 INCQ DI 2897 LEAQ (AX)(CX*4), AX 2898 LEAQ (DX)(BX*4), DX 2899 2900 check_limit: 2901 CMPQ SI, DI 2902 JHI loop 2903 RET 2904 2905 // func AmdAxpyPointerLoop_V1A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2906 // Requires: SSE 2907 TEXT ·AmdAxpyPointerLoop_V1A13(SB), NOSPLIT, $0-48 2908 MOVSS alpha+0(FP), X0 2909 MOVQ xs+8(FP), AX 2910 MOVQ incx+16(FP), CX 2911 MOVQ ys+24(FP), DX 2912 MOVQ incy+32(FP), BX 2913 MOVQ n+40(FP), SI 2914 XORQ DI, DI 2915 JMP check_limit 2916 PCALIGN $0x08 2917 NOP 2918 NOP 2919 NOP 2920 NOP 2921 NOP 2922 2923 loop: 2924 MOVSS (AX), X1 2925 MULSS X0, X1 2926 ADDSS (DX), X1 2927 MOVSS X1, (DX) 2928 INCQ DI 2929 LEAQ (AX)(CX*4), AX 2930 LEAQ (DX)(BX*4), DX 2931 2932 check_limit: 2933 CMPQ SI, DI 2934 JHI loop 2935 RET 2936 2937 // func AmdAxpyPointerLoop_V2A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2938 // Requires: SSE 2939 TEXT ·AmdAxpyPointerLoop_V2A13(SB), NOSPLIT, $0-48 2940 MOVSS alpha+0(FP), X0 2941 MOVQ xs+8(FP), AX 2942 MOVQ incx+16(FP), CX 2943 MOVQ ys+24(FP), DX 2944 MOVQ incy+32(FP), BX 2945 MOVQ n+40(FP), SI 2946 XORQ DI, DI 2947 JMP check_limit 2948 PCALIGN $0x08 2949 NOP 2950 NOP 2951 NOP 2952 NOP 2953 NOP 2954 2955 loop: 2956 
MOVSS (AX), X1 2957 MULSS X0, X1 2958 ADDSS (DX), X1 2959 MOVSS X1, (DX) 2960 INCQ DI 2961 LEAQ (AX)(CX*4), AX 2962 LEAQ (DX)(BX*4), DX 2963 2964 check_limit: 2965 CMPQ SI, DI 2966 JHI loop 2967 RET 2968 2969 // func AmdAxpyPointerLoop_V3A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 2970 // Requires: SSE 2971 TEXT ·AmdAxpyPointerLoop_V3A13(SB), NOSPLIT, $0-48 2972 MOVSS alpha+0(FP), X0 2973 MOVQ xs+8(FP), AX 2974 MOVQ incx+16(FP), CX 2975 MOVQ ys+24(FP), DX 2976 MOVQ incy+32(FP), BX 2977 MOVQ n+40(FP), SI 2978 XORQ DI, DI 2979 JMP check_limit 2980 PCALIGN $0x08 2981 NOP 2982 NOP 2983 NOP 2984 NOP 2985 NOP 2986 2987 loop: 2988 MOVSS (AX), X1 2989 MULSS X0, X1 2990 ADDSS (DX), X1 2991 MOVSS X1, (DX) 2992 INCQ DI 2993 LEAQ (AX)(CX*4), AX 2994 LEAQ (DX)(BX*4), DX 2995 2996 check_limit: 2997 CMPQ SI, DI 2998 JHI loop 2999 RET 3000 3001 // func AmdAxpyPointerLoop_V4A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3002 // Requires: SSE 3003 TEXT ·AmdAxpyPointerLoop_V4A13(SB), NOSPLIT, $0-48 3004 MOVSS alpha+0(FP), X0 3005 MOVQ xs+8(FP), AX 3006 MOVQ incx+16(FP), CX 3007 MOVQ ys+24(FP), DX 3008 MOVQ incy+32(FP), BX 3009 MOVQ n+40(FP), SI 3010 XORQ DI, DI 3011 JMP check_limit 3012 PCALIGN $0x08 3013 NOP 3014 NOP 3015 NOP 3016 NOP 3017 NOP 3018 3019 loop: 3020 MOVSS (AX), X1 3021 MULSS X0, X1 3022 ADDSS (DX), X1 3023 MOVSS X1, (DX) 3024 INCQ DI 3025 LEAQ (AX)(CX*4), AX 3026 LEAQ (DX)(BX*4), DX 3027 3028 check_limit: 3029 CMPQ SI, DI 3030 JHI loop 3031 RET 3032 3033 // func AmdAxpyPointerLoop_V5A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3034 // Requires: SSE 3035 TEXT ·AmdAxpyPointerLoop_V5A13(SB), NOSPLIT, $0-48 3036 MOVSS alpha+0(FP), X0 3037 MOVQ xs+8(FP), AX 3038 MOVQ incx+16(FP), CX 3039 MOVQ ys+24(FP), DX 3040 MOVQ incy+32(FP), BX 3041 MOVQ n+40(FP), SI 3042 XORQ DI, DI 3043 JMP check_limit 3044 PCALIGN $0x08 3045 NOP 3046 NOP 3047 NOP 3048 NOP 3049 NOP 3050 3051 loop: 3052 MOVSS (AX), X1 3053 MULSS X0, X1 3054 ADDSS (DX), X1 3055 MOVSS X1, (DX) 3056 INCQ DI 3057 LEAQ (AX)(CX*4), AX 3058 LEAQ (DX)(BX*4), DX 3059 3060 check_limit: 3061 CMPQ SI, DI 3062 JHI loop 3063 RET 3064 3065 // func AmdAxpyPointerLoop_V0A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3066 // Requires: SSE 3067 TEXT ·AmdAxpyPointerLoop_V0A14(SB), NOSPLIT, $0-48 3068 MOVSS alpha+0(FP), X0 3069 MOVQ xs+8(FP), AX 3070 MOVQ incx+16(FP), CX 3071 MOVQ ys+24(FP), DX 3072 MOVQ incy+32(FP), BX 3073 MOVQ n+40(FP), SI 3074 XORQ DI, DI 3075 JMP check_limit 3076 PCALIGN $0x08 3077 NOP 3078 NOP 3079 NOP 3080 NOP 3081 NOP 3082 NOP 3083 3084 loop: 3085 MOVSS (AX), X1 3086 MULSS X0, X1 3087 ADDSS (DX), X1 3088 MOVSS X1, (DX) 3089 INCQ DI 3090 LEAQ (AX)(CX*4), AX 3091 LEAQ (DX)(BX*4), DX 3092 3093 check_limit: 3094 CMPQ SI, DI 3095 JHI loop 3096 RET 3097 3098 // func AmdAxpyPointerLoop_V1A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3099 // Requires: SSE 3100 TEXT ·AmdAxpyPointerLoop_V1A14(SB), NOSPLIT, $0-48 3101 MOVSS alpha+0(FP), X0 3102 MOVQ xs+8(FP), AX 3103 MOVQ incx+16(FP), CX 3104 MOVQ ys+24(FP), DX 3105 MOVQ incy+32(FP), BX 3106 MOVQ n+40(FP), SI 3107 XORQ DI, DI 3108 JMP check_limit 3109 PCALIGN $0x08 3110 NOP 3111 NOP 3112 NOP 3113 NOP 3114 NOP 3115 NOP 3116 3117 loop: 3118 MOVSS (AX), X1 3119 MULSS X0, X1 3120 ADDSS (DX), X1 3121 MOVSS X1, (DX) 3122 INCQ DI 3123 LEAQ (AX)(CX*4), AX 3124 LEAQ (DX)(BX*4), DX 3125 3126 check_limit: 3127 CMPQ SI, DI 
3128 JHI loop 3129 RET 3130 3131 // func AmdAxpyPointerLoop_V2A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3132 // Requires: SSE 3133 TEXT ·AmdAxpyPointerLoop_V2A14(SB), NOSPLIT, $0-48 3134 MOVSS alpha+0(FP), X0 3135 MOVQ xs+8(FP), AX 3136 MOVQ incx+16(FP), CX 3137 MOVQ ys+24(FP), DX 3138 MOVQ incy+32(FP), BX 3139 MOVQ n+40(FP), SI 3140 XORQ DI, DI 3141 JMP check_limit 3142 PCALIGN $0x08 3143 NOP 3144 NOP 3145 NOP 3146 NOP 3147 NOP 3148 NOP 3149 3150 loop: 3151 MOVSS (AX), X1 3152 MULSS X0, X1 3153 ADDSS (DX), X1 3154 MOVSS X1, (DX) 3155 INCQ DI 3156 LEAQ (AX)(CX*4), AX 3157 LEAQ (DX)(BX*4), DX 3158 3159 check_limit: 3160 CMPQ SI, DI 3161 JHI loop 3162 RET 3163 3164 // func AmdAxpyPointerLoop_V3A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3165 // Requires: SSE 3166 TEXT ·AmdAxpyPointerLoop_V3A14(SB), NOSPLIT, $0-48 3167 MOVSS alpha+0(FP), X0 3168 MOVQ xs+8(FP), AX 3169 MOVQ incx+16(FP), CX 3170 MOVQ ys+24(FP), DX 3171 MOVQ incy+32(FP), BX 3172 MOVQ n+40(FP), SI 3173 XORQ DI, DI 3174 JMP check_limit 3175 PCALIGN $0x08 3176 NOP 3177 NOP 3178 NOP 3179 NOP 3180 NOP 3181 NOP 3182 3183 loop: 3184 MOVSS (AX), X1 3185 MULSS X0, X1 3186 ADDSS (DX), X1 3187 MOVSS X1, (DX) 3188 INCQ DI 3189 LEAQ (AX)(CX*4), AX 3190 LEAQ (DX)(BX*4), DX 3191 3192 check_limit: 3193 CMPQ SI, DI 3194 JHI loop 3195 RET 3196 3197 // func AmdAxpyPointerLoop_V4A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3198 // Requires: SSE 3199 TEXT ·AmdAxpyPointerLoop_V4A14(SB), NOSPLIT, $0-48 3200 MOVSS alpha+0(FP), X0 3201 MOVQ xs+8(FP), AX 3202 MOVQ incx+16(FP), CX 3203 MOVQ ys+24(FP), DX 3204 MOVQ incy+32(FP), BX 3205 MOVQ n+40(FP), SI 3206 XORQ DI, DI 3207 JMP check_limit 3208 PCALIGN $0x08 3209 NOP 3210 NOP 3211 NOP 3212 NOP 3213 NOP 3214 NOP 3215 3216 loop: 3217 MOVSS (AX), X1 3218 MULSS X0, X1 3219 ADDSS (DX), X1 3220 MOVSS X1, (DX) 3221 INCQ DI 3222 LEAQ (AX)(CX*4), AX 3223 LEAQ (DX)(BX*4), DX 3224 3225 check_limit: 3226 CMPQ SI, DI 3227 JHI loop 3228 RET 3229 3230 // func AmdAxpyPointerLoop_V5A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3231 // Requires: SSE 3232 TEXT ·AmdAxpyPointerLoop_V5A14(SB), NOSPLIT, $0-48 3233 MOVSS alpha+0(FP), X0 3234 MOVQ xs+8(FP), AX 3235 MOVQ incx+16(FP), CX 3236 MOVQ ys+24(FP), DX 3237 MOVQ incy+32(FP), BX 3238 MOVQ n+40(FP), SI 3239 XORQ DI, DI 3240 JMP check_limit 3241 PCALIGN $0x08 3242 NOP 3243 NOP 3244 NOP 3245 NOP 3246 NOP 3247 NOP 3248 3249 loop: 3250 MOVSS (AX), X1 3251 MULSS X0, X1 3252 ADDSS (DX), X1 3253 MOVSS X1, (DX) 3254 INCQ DI 3255 LEAQ (AX)(CX*4), AX 3256 LEAQ (DX)(BX*4), DX 3257 3258 check_limit: 3259 CMPQ SI, DI 3260 JHI loop 3261 RET 3262 3263 // func AmdAxpyPointerLoop_V0A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3264 // Requires: SSE 3265 TEXT ·AmdAxpyPointerLoop_V0A15(SB), NOSPLIT, $0-48 3266 MOVSS alpha+0(FP), X0 3267 MOVQ xs+8(FP), AX 3268 MOVQ incx+16(FP), CX 3269 MOVQ ys+24(FP), DX 3270 MOVQ incy+32(FP), BX 3271 MOVQ n+40(FP), SI 3272 XORQ DI, DI 3273 JMP check_limit 3274 PCALIGN $0x08 3275 NOP 3276 NOP 3277 NOP 3278 NOP 3279 NOP 3280 NOP 3281 NOP 3282 3283 loop: 3284 MOVSS (AX), X1 3285 MULSS X0, X1 3286 ADDSS (DX), X1 3287 MOVSS X1, (DX) 3288 INCQ DI 3289 LEAQ (AX)(CX*4), AX 3290 LEAQ (DX)(BX*4), DX 3291 3292 check_limit: 3293 CMPQ SI, DI 3294 JHI loop 3295 RET 3296 3297 // func AmdAxpyPointerLoop_V1A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n 
uintptr) 3298 // Requires: SSE 3299 TEXT ·AmdAxpyPointerLoop_V1A15(SB), NOSPLIT, $0-48 3300 MOVSS alpha+0(FP), X0 3301 MOVQ xs+8(FP), AX 3302 MOVQ incx+16(FP), CX 3303 MOVQ ys+24(FP), DX 3304 MOVQ incy+32(FP), BX 3305 MOVQ n+40(FP), SI 3306 XORQ DI, DI 3307 JMP check_limit 3308 PCALIGN $0x08 3309 NOP 3310 NOP 3311 NOP 3312 NOP 3313 NOP 3314 NOP 3315 NOP 3316 3317 loop: 3318 MOVSS (AX), X1 3319 MULSS X0, X1 3320 ADDSS (DX), X1 3321 MOVSS X1, (DX) 3322 INCQ DI 3323 LEAQ (AX)(CX*4), AX 3324 LEAQ (DX)(BX*4), DX 3325 3326 check_limit: 3327 CMPQ SI, DI 3328 JHI loop 3329 RET 3330 3331 // func AmdAxpyPointerLoop_V2A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3332 // Requires: SSE 3333 TEXT ·AmdAxpyPointerLoop_V2A15(SB), NOSPLIT, $0-48 3334 MOVSS alpha+0(FP), X0 3335 MOVQ xs+8(FP), AX 3336 MOVQ incx+16(FP), CX 3337 MOVQ ys+24(FP), DX 3338 MOVQ incy+32(FP), BX 3339 MOVQ n+40(FP), SI 3340 XORQ DI, DI 3341 JMP check_limit 3342 PCALIGN $0x08 3343 NOP 3344 NOP 3345 NOP 3346 NOP 3347 NOP 3348 NOP 3349 NOP 3350 3351 loop: 3352 MOVSS (AX), X1 3353 MULSS X0, X1 3354 ADDSS (DX), X1 3355 MOVSS X1, (DX) 3356 INCQ DI 3357 LEAQ (AX)(CX*4), AX 3358 LEAQ (DX)(BX*4), DX 3359 3360 check_limit: 3361 CMPQ SI, DI 3362 JHI loop 3363 RET 3364 3365 // func AmdAxpyPointerLoop_V3A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3366 // Requires: SSE 3367 TEXT ·AmdAxpyPointerLoop_V3A15(SB), NOSPLIT, $0-48 3368 MOVSS alpha+0(FP), X0 3369 MOVQ xs+8(FP), AX 3370 MOVQ incx+16(FP), CX 3371 MOVQ ys+24(FP), DX 3372 MOVQ incy+32(FP), BX 3373 MOVQ n+40(FP), SI 3374 XORQ DI, DI 3375 JMP check_limit 3376 PCALIGN $0x08 3377 NOP 3378 NOP 3379 NOP 3380 NOP 3381 NOP 3382 NOP 3383 NOP 3384 3385 loop: 3386 MOVSS (AX), X1 3387 MULSS X0, X1 3388 ADDSS (DX), X1 3389 MOVSS X1, (DX) 3390 INCQ DI 3391 LEAQ (AX)(CX*4), AX 3392 LEAQ (DX)(BX*4), DX 3393 3394 check_limit: 3395 CMPQ SI, DI 3396 JHI loop 3397 RET 3398 3399 // func AmdAxpyPointerLoop_V4A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3400 // Requires: SSE 3401 TEXT ·AmdAxpyPointerLoop_V4A15(SB), NOSPLIT, $0-48 3402 MOVSS alpha+0(FP), X0 3403 MOVQ xs+8(FP), AX 3404 MOVQ incx+16(FP), CX 3405 MOVQ ys+24(FP), DX 3406 MOVQ incy+32(FP), BX 3407 MOVQ n+40(FP), SI 3408 XORQ DI, DI 3409 JMP check_limit 3410 PCALIGN $0x08 3411 NOP 3412 NOP 3413 NOP 3414 NOP 3415 NOP 3416 NOP 3417 NOP 3418 3419 loop: 3420 MOVSS (AX), X1 3421 MULSS X0, X1 3422 ADDSS (DX), X1 3423 MOVSS X1, (DX) 3424 INCQ DI 3425 LEAQ (AX)(CX*4), AX 3426 LEAQ (DX)(BX*4), DX 3427 3428 check_limit: 3429 CMPQ SI, DI 3430 JHI loop 3431 RET 3432 3433 // func AmdAxpyPointerLoop_V5A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3434 // Requires: SSE 3435 TEXT ·AmdAxpyPointerLoop_V5A15(SB), NOSPLIT, $0-48 3436 MOVSS alpha+0(FP), X0 3437 MOVQ xs+8(FP), AX 3438 MOVQ incx+16(FP), CX 3439 MOVQ ys+24(FP), DX 3440 MOVQ incy+32(FP), BX 3441 MOVQ n+40(FP), SI 3442 XORQ DI, DI 3443 JMP check_limit 3444 PCALIGN $0x08 3445 NOP 3446 NOP 3447 NOP 3448 NOP 3449 NOP 3450 NOP 3451 NOP 3452 3453 loop: 3454 MOVSS (AX), X1 3455 MULSS X0, X1 3456 ADDSS (DX), X1 3457 MOVSS X1, (DX) 3458 INCQ DI 3459 LEAQ (AX)(CX*4), AX 3460 LEAQ (DX)(BX*4), DX 3461 3462 check_limit: 3463 CMPQ SI, DI 3464 JHI loop 3465 RET 3466 3467 // func AmdAxpyPointerLoop_V0A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3468 // Requires: SSE 3469 TEXT ·AmdAxpyPointerLoop_V0A16(SB), NOSPLIT, $0-48 3470 MOVSS 
alpha+0(FP), X0 3471 MOVQ xs+8(FP), AX 3472 MOVQ incx+16(FP), CX 3473 MOVQ ys+24(FP), DX 3474 MOVQ incy+32(FP), BX 3475 MOVQ n+40(FP), SI 3476 XORQ DI, DI 3477 JMP check_limit 3478 PCALIGN $0x10 3479 3480 loop: 3481 MOVSS (AX), X1 3482 MULSS X0, X1 3483 ADDSS (DX), X1 3484 MOVSS X1, (DX) 3485 INCQ DI 3486 LEAQ (AX)(CX*4), AX 3487 LEAQ (DX)(BX*4), DX 3488 3489 check_limit: 3490 CMPQ SI, DI 3491 JHI loop 3492 RET 3493 3494 // func AmdAxpyPointerLoop_V1A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3495 // Requires: SSE 3496 TEXT ·AmdAxpyPointerLoop_V1A16(SB), NOSPLIT, $0-48 3497 MOVSS alpha+0(FP), X0 3498 MOVQ xs+8(FP), AX 3499 MOVQ incx+16(FP), CX 3500 MOVQ ys+24(FP), DX 3501 MOVQ incy+32(FP), BX 3502 MOVQ n+40(FP), SI 3503 XORQ DI, DI 3504 JMP check_limit 3505 PCALIGN $0x10 3506 3507 loop: 3508 MOVSS (AX), X1 3509 MULSS X0, X1 3510 ADDSS (DX), X1 3511 MOVSS X1, (DX) 3512 INCQ DI 3513 LEAQ (AX)(CX*4), AX 3514 LEAQ (DX)(BX*4), DX 3515 3516 check_limit: 3517 CMPQ SI, DI 3518 JHI loop 3519 RET 3520 3521 // func AmdAxpyPointerLoop_V2A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3522 // Requires: SSE 3523 TEXT ·AmdAxpyPointerLoop_V2A16(SB), NOSPLIT, $0-48 3524 MOVSS alpha+0(FP), X0 3525 MOVQ xs+8(FP), AX 3526 MOVQ incx+16(FP), CX 3527 MOVQ ys+24(FP), DX 3528 MOVQ incy+32(FP), BX 3529 MOVQ n+40(FP), SI 3530 XORQ DI, DI 3531 JMP check_limit 3532 PCALIGN $0x10 3533 3534 loop: 3535 MOVSS (AX), X1 3536 MULSS X0, X1 3537 ADDSS (DX), X1 3538 MOVSS X1, (DX) 3539 INCQ DI 3540 LEAQ (AX)(CX*4), AX 3541 LEAQ (DX)(BX*4), DX 3542 3543 check_limit: 3544 CMPQ SI, DI 3545 JHI loop 3546 RET 3547 3548 // func AmdAxpyPointerLoop_V3A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3549 // Requires: SSE 3550 TEXT ·AmdAxpyPointerLoop_V3A16(SB), NOSPLIT, $0-48 3551 MOVSS alpha+0(FP), X0 3552 MOVQ xs+8(FP), AX 3553 MOVQ incx+16(FP), CX 3554 MOVQ ys+24(FP), DX 3555 MOVQ incy+32(FP), BX 3556 MOVQ n+40(FP), SI 3557 XORQ DI, DI 3558 JMP check_limit 3559 PCALIGN $0x10 3560 3561 loop: 3562 MOVSS (AX), X1 3563 MULSS X0, X1 3564 ADDSS (DX), X1 3565 MOVSS X1, (DX) 3566 INCQ DI 3567 LEAQ (AX)(CX*4), AX 3568 LEAQ (DX)(BX*4), DX 3569 3570 check_limit: 3571 CMPQ SI, DI 3572 JHI loop 3573 RET 3574 3575 // func AmdAxpyPointerLoop_V4A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3576 // Requires: SSE 3577 TEXT ·AmdAxpyPointerLoop_V4A16(SB), NOSPLIT, $0-48 3578 MOVSS alpha+0(FP), X0 3579 MOVQ xs+8(FP), AX 3580 MOVQ incx+16(FP), CX 3581 MOVQ ys+24(FP), DX 3582 MOVQ incy+32(FP), BX 3583 MOVQ n+40(FP), SI 3584 XORQ DI, DI 3585 JMP check_limit 3586 PCALIGN $0x10 3587 3588 loop: 3589 MOVSS (AX), X1 3590 MULSS X0, X1 3591 ADDSS (DX), X1 3592 MOVSS X1, (DX) 3593 INCQ DI 3594 LEAQ (AX)(CX*4), AX 3595 LEAQ (DX)(BX*4), DX 3596 3597 check_limit: 3598 CMPQ SI, DI 3599 JHI loop 3600 RET 3601 3602 // func AmdAxpyPointerLoop_V5A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3603 // Requires: SSE 3604 TEXT ·AmdAxpyPointerLoop_V5A16(SB), NOSPLIT, $0-48 3605 MOVSS alpha+0(FP), X0 3606 MOVQ xs+8(FP), AX 3607 MOVQ incx+16(FP), CX 3608 MOVQ ys+24(FP), DX 3609 MOVQ incy+32(FP), BX 3610 MOVQ n+40(FP), SI 3611 XORQ DI, DI 3612 JMP check_limit 3613 PCALIGN $0x10 3614 3615 loop: 3616 MOVSS (AX), X1 3617 MULSS X0, X1 3618 ADDSS (DX), X1 3619 MOVSS X1, (DX) 3620 INCQ DI 3621 LEAQ (AX)(CX*4), AX 3622 LEAQ (DX)(BX*4), DX 3623 3624 check_limit: 3625 CMPQ SI, DI 3626 JHI loop 3627 RET 3628 
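// Note on the AmdAxpyPointerLoopX variants that follow: they compute the same scalar
// AXPY kernel (y[i] += alpha * x[i] over n strided float32 elements) as the families
// above, but count the remaining iterations down directly in SI (DECQ SI; CMPQ SI, $0x00)
// instead of keeping a separate counter in DI, while still advancing the x and y pointers
// with LEAQ. As before, the V0..V5 copies appear to be byte-identical and the A8..A16
// suffixes only change the PCALIGN/NOP padding placed ahead of the loop, presumably to
// measure how loop alignment affects the benchmark. A minimal Go sketch of the operation
// each variant computes (hypothetical helper, not part of this generated file):
//
//	func axpyRef(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) {
//		x, y := unsafe.Pointer(xs), unsafe.Pointer(ys)
//		for ; n > 0; n-- {
//			*(*float32)(y) += alpha * *(*float32)(x)
//			x = unsafe.Add(x, incx*4) // strides are in elements; 4 bytes per float32
//			y = unsafe.Add(y, incy*4)
//		}
//	}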
3629 // func AmdAxpyPointerLoopX_V0A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3630 // Requires: SSE 3631 TEXT ·AmdAxpyPointerLoopX_V0A0(SB), NOSPLIT, $0-48 3632 MOVSS alpha+0(FP), X0 3633 MOVQ xs+8(FP), AX 3634 MOVQ incx+16(FP), CX 3635 MOVQ ys+24(FP), DX 3636 MOVQ incy+32(FP), BX 3637 MOVQ n+40(FP), SI 3638 JMP check_limit 3639 3640 loop: 3641 MOVSS (AX), X1 3642 MULSS X0, X1 3643 ADDSS (DX), X1 3644 MOVSS X1, (DX) 3645 DECQ SI 3646 LEAQ (AX)(CX*4), AX 3647 LEAQ (DX)(BX*4), DX 3648 3649 check_limit: 3650 CMPQ SI, $0x00 3651 JHI loop 3652 RET 3653 3654 // func AmdAxpyPointerLoopX_V1A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3655 // Requires: SSE 3656 TEXT ·AmdAxpyPointerLoopX_V1A0(SB), NOSPLIT, $0-48 3657 MOVSS alpha+0(FP), X0 3658 MOVQ xs+8(FP), AX 3659 MOVQ incx+16(FP), CX 3660 MOVQ ys+24(FP), DX 3661 MOVQ incy+32(FP), BX 3662 MOVQ n+40(FP), SI 3663 JMP check_limit 3664 3665 loop: 3666 MOVSS (AX), X1 3667 MULSS X0, X1 3668 ADDSS (DX), X1 3669 MOVSS X1, (DX) 3670 DECQ SI 3671 LEAQ (AX)(CX*4), AX 3672 LEAQ (DX)(BX*4), DX 3673 3674 check_limit: 3675 CMPQ SI, $0x00 3676 JHI loop 3677 RET 3678 3679 // func AmdAxpyPointerLoopX_V2A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3680 // Requires: SSE 3681 TEXT ·AmdAxpyPointerLoopX_V2A0(SB), NOSPLIT, $0-48 3682 MOVSS alpha+0(FP), X0 3683 MOVQ xs+8(FP), AX 3684 MOVQ incx+16(FP), CX 3685 MOVQ ys+24(FP), DX 3686 MOVQ incy+32(FP), BX 3687 MOVQ n+40(FP), SI 3688 JMP check_limit 3689 3690 loop: 3691 MOVSS (AX), X1 3692 MULSS X0, X1 3693 ADDSS (DX), X1 3694 MOVSS X1, (DX) 3695 DECQ SI 3696 LEAQ (AX)(CX*4), AX 3697 LEAQ (DX)(BX*4), DX 3698 3699 check_limit: 3700 CMPQ SI, $0x00 3701 JHI loop 3702 RET 3703 3704 // func AmdAxpyPointerLoopX_V3A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3705 // Requires: SSE 3706 TEXT ·AmdAxpyPointerLoopX_V3A0(SB), NOSPLIT, $0-48 3707 MOVSS alpha+0(FP), X0 3708 MOVQ xs+8(FP), AX 3709 MOVQ incx+16(FP), CX 3710 MOVQ ys+24(FP), DX 3711 MOVQ incy+32(FP), BX 3712 MOVQ n+40(FP), SI 3713 JMP check_limit 3714 3715 loop: 3716 MOVSS (AX), X1 3717 MULSS X0, X1 3718 ADDSS (DX), X1 3719 MOVSS X1, (DX) 3720 DECQ SI 3721 LEAQ (AX)(CX*4), AX 3722 LEAQ (DX)(BX*4), DX 3723 3724 check_limit: 3725 CMPQ SI, $0x00 3726 JHI loop 3727 RET 3728 3729 // func AmdAxpyPointerLoopX_V4A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3730 // Requires: SSE 3731 TEXT ·AmdAxpyPointerLoopX_V4A0(SB), NOSPLIT, $0-48 3732 MOVSS alpha+0(FP), X0 3733 MOVQ xs+8(FP), AX 3734 MOVQ incx+16(FP), CX 3735 MOVQ ys+24(FP), DX 3736 MOVQ incy+32(FP), BX 3737 MOVQ n+40(FP), SI 3738 JMP check_limit 3739 3740 loop: 3741 MOVSS (AX), X1 3742 MULSS X0, X1 3743 ADDSS (DX), X1 3744 MOVSS X1, (DX) 3745 DECQ SI 3746 LEAQ (AX)(CX*4), AX 3747 LEAQ (DX)(BX*4), DX 3748 3749 check_limit: 3750 CMPQ SI, $0x00 3751 JHI loop 3752 RET 3753 3754 // func AmdAxpyPointerLoopX_V5A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3755 // Requires: SSE 3756 TEXT ·AmdAxpyPointerLoopX_V5A0(SB), NOSPLIT, $0-48 3757 MOVSS alpha+0(FP), X0 3758 MOVQ xs+8(FP), AX 3759 MOVQ incx+16(FP), CX 3760 MOVQ ys+24(FP), DX 3761 MOVQ incy+32(FP), BX 3762 MOVQ n+40(FP), SI 3763 JMP check_limit 3764 3765 loop: 3766 MOVSS (AX), X1 3767 MULSS X0, X1 3768 ADDSS (DX), X1 3769 MOVSS X1, (DX) 3770 DECQ SI 3771 LEAQ (AX)(CX*4), AX 3772 LEAQ (DX)(BX*4), DX 3773 3774 check_limit: 3775 CMPQ SI, $0x00 3776 JHI loop 3777 
RET 3778 3779 // func AmdAxpyPointerLoopX_V0A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3780 // Requires: SSE 3781 TEXT ·AmdAxpyPointerLoopX_V0A8(SB), NOSPLIT, $0-48 3782 MOVSS alpha+0(FP), X0 3783 MOVQ xs+8(FP), AX 3784 MOVQ incx+16(FP), CX 3785 MOVQ ys+24(FP), DX 3786 MOVQ incy+32(FP), BX 3787 MOVQ n+40(FP), SI 3788 JMP check_limit 3789 PCALIGN $0x08 3790 3791 loop: 3792 MOVSS (AX), X1 3793 MULSS X0, X1 3794 ADDSS (DX), X1 3795 MOVSS X1, (DX) 3796 DECQ SI 3797 LEAQ (AX)(CX*4), AX 3798 LEAQ (DX)(BX*4), DX 3799 3800 check_limit: 3801 CMPQ SI, $0x00 3802 JHI loop 3803 RET 3804 3805 // func AmdAxpyPointerLoopX_V1A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3806 // Requires: SSE 3807 TEXT ·AmdAxpyPointerLoopX_V1A8(SB), NOSPLIT, $0-48 3808 MOVSS alpha+0(FP), X0 3809 MOVQ xs+8(FP), AX 3810 MOVQ incx+16(FP), CX 3811 MOVQ ys+24(FP), DX 3812 MOVQ incy+32(FP), BX 3813 MOVQ n+40(FP), SI 3814 JMP check_limit 3815 PCALIGN $0x08 3816 3817 loop: 3818 MOVSS (AX), X1 3819 MULSS X0, X1 3820 ADDSS (DX), X1 3821 MOVSS X1, (DX) 3822 DECQ SI 3823 LEAQ (AX)(CX*4), AX 3824 LEAQ (DX)(BX*4), DX 3825 3826 check_limit: 3827 CMPQ SI, $0x00 3828 JHI loop 3829 RET 3830 3831 // func AmdAxpyPointerLoopX_V2A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3832 // Requires: SSE 3833 TEXT ·AmdAxpyPointerLoopX_V2A8(SB), NOSPLIT, $0-48 3834 MOVSS alpha+0(FP), X0 3835 MOVQ xs+8(FP), AX 3836 MOVQ incx+16(FP), CX 3837 MOVQ ys+24(FP), DX 3838 MOVQ incy+32(FP), BX 3839 MOVQ n+40(FP), SI 3840 JMP check_limit 3841 PCALIGN $0x08 3842 3843 loop: 3844 MOVSS (AX), X1 3845 MULSS X0, X1 3846 ADDSS (DX), X1 3847 MOVSS X1, (DX) 3848 DECQ SI 3849 LEAQ (AX)(CX*4), AX 3850 LEAQ (DX)(BX*4), DX 3851 3852 check_limit: 3853 CMPQ SI, $0x00 3854 JHI loop 3855 RET 3856 3857 // func AmdAxpyPointerLoopX_V3A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3858 // Requires: SSE 3859 TEXT ·AmdAxpyPointerLoopX_V3A8(SB), NOSPLIT, $0-48 3860 MOVSS alpha+0(FP), X0 3861 MOVQ xs+8(FP), AX 3862 MOVQ incx+16(FP), CX 3863 MOVQ ys+24(FP), DX 3864 MOVQ incy+32(FP), BX 3865 MOVQ n+40(FP), SI 3866 JMP check_limit 3867 PCALIGN $0x08 3868 3869 loop: 3870 MOVSS (AX), X1 3871 MULSS X0, X1 3872 ADDSS (DX), X1 3873 MOVSS X1, (DX) 3874 DECQ SI 3875 LEAQ (AX)(CX*4), AX 3876 LEAQ (DX)(BX*4), DX 3877 3878 check_limit: 3879 CMPQ SI, $0x00 3880 JHI loop 3881 RET 3882 3883 // func AmdAxpyPointerLoopX_V4A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3884 // Requires: SSE 3885 TEXT ·AmdAxpyPointerLoopX_V4A8(SB), NOSPLIT, $0-48 3886 MOVSS alpha+0(FP), X0 3887 MOVQ xs+8(FP), AX 3888 MOVQ incx+16(FP), CX 3889 MOVQ ys+24(FP), DX 3890 MOVQ incy+32(FP), BX 3891 MOVQ n+40(FP), SI 3892 JMP check_limit 3893 PCALIGN $0x08 3894 3895 loop: 3896 MOVSS (AX), X1 3897 MULSS X0, X1 3898 ADDSS (DX), X1 3899 MOVSS X1, (DX) 3900 DECQ SI 3901 LEAQ (AX)(CX*4), AX 3902 LEAQ (DX)(BX*4), DX 3903 3904 check_limit: 3905 CMPQ SI, $0x00 3906 JHI loop 3907 RET 3908 3909 // func AmdAxpyPointerLoopX_V5A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3910 // Requires: SSE 3911 TEXT ·AmdAxpyPointerLoopX_V5A8(SB), NOSPLIT, $0-48 3912 MOVSS alpha+0(FP), X0 3913 MOVQ xs+8(FP), AX 3914 MOVQ incx+16(FP), CX 3915 MOVQ ys+24(FP), DX 3916 MOVQ incy+32(FP), BX 3917 MOVQ n+40(FP), SI 3918 JMP check_limit 3919 PCALIGN $0x08 3920 3921 loop: 3922 MOVSS (AX), X1 3923 MULSS X0, X1 3924 ADDSS (DX), X1 3925 MOVSS X1, (DX) 
3926 DECQ SI 3927 LEAQ (AX)(CX*4), AX 3928 LEAQ (DX)(BX*4), DX 3929 3930 check_limit: 3931 CMPQ SI, $0x00 3932 JHI loop 3933 RET 3934 3935 // func AmdAxpyPointerLoopX_V0A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3936 // Requires: SSE 3937 TEXT ·AmdAxpyPointerLoopX_V0A9(SB), NOSPLIT, $0-48 3938 MOVSS alpha+0(FP), X0 3939 MOVQ xs+8(FP), AX 3940 MOVQ incx+16(FP), CX 3941 MOVQ ys+24(FP), DX 3942 MOVQ incy+32(FP), BX 3943 MOVQ n+40(FP), SI 3944 JMP check_limit 3945 PCALIGN $0x08 3946 NOP 3947 3948 loop: 3949 MOVSS (AX), X1 3950 MULSS X0, X1 3951 ADDSS (DX), X1 3952 MOVSS X1, (DX) 3953 DECQ SI 3954 LEAQ (AX)(CX*4), AX 3955 LEAQ (DX)(BX*4), DX 3956 3957 check_limit: 3958 CMPQ SI, $0x00 3959 JHI loop 3960 RET 3961 3962 // func AmdAxpyPointerLoopX_V1A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3963 // Requires: SSE 3964 TEXT ·AmdAxpyPointerLoopX_V1A9(SB), NOSPLIT, $0-48 3965 MOVSS alpha+0(FP), X0 3966 MOVQ xs+8(FP), AX 3967 MOVQ incx+16(FP), CX 3968 MOVQ ys+24(FP), DX 3969 MOVQ incy+32(FP), BX 3970 MOVQ n+40(FP), SI 3971 JMP check_limit 3972 PCALIGN $0x08 3973 NOP 3974 3975 loop: 3976 MOVSS (AX), X1 3977 MULSS X0, X1 3978 ADDSS (DX), X1 3979 MOVSS X1, (DX) 3980 DECQ SI 3981 LEAQ (AX)(CX*4), AX 3982 LEAQ (DX)(BX*4), DX 3983 3984 check_limit: 3985 CMPQ SI, $0x00 3986 JHI loop 3987 RET 3988 3989 // func AmdAxpyPointerLoopX_V2A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 3990 // Requires: SSE 3991 TEXT ·AmdAxpyPointerLoopX_V2A9(SB), NOSPLIT, $0-48 3992 MOVSS alpha+0(FP), X0 3993 MOVQ xs+8(FP), AX 3994 MOVQ incx+16(FP), CX 3995 MOVQ ys+24(FP), DX 3996 MOVQ incy+32(FP), BX 3997 MOVQ n+40(FP), SI 3998 JMP check_limit 3999 PCALIGN $0x08 4000 NOP 4001 4002 loop: 4003 MOVSS (AX), X1 4004 MULSS X0, X1 4005 ADDSS (DX), X1 4006 MOVSS X1, (DX) 4007 DECQ SI 4008 LEAQ (AX)(CX*4), AX 4009 LEAQ (DX)(BX*4), DX 4010 4011 check_limit: 4012 CMPQ SI, $0x00 4013 JHI loop 4014 RET 4015 4016 // func AmdAxpyPointerLoopX_V3A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4017 // Requires: SSE 4018 TEXT ·AmdAxpyPointerLoopX_V3A9(SB), NOSPLIT, $0-48 4019 MOVSS alpha+0(FP), X0 4020 MOVQ xs+8(FP), AX 4021 MOVQ incx+16(FP), CX 4022 MOVQ ys+24(FP), DX 4023 MOVQ incy+32(FP), BX 4024 MOVQ n+40(FP), SI 4025 JMP check_limit 4026 PCALIGN $0x08 4027 NOP 4028 4029 loop: 4030 MOVSS (AX), X1 4031 MULSS X0, X1 4032 ADDSS (DX), X1 4033 MOVSS X1, (DX) 4034 DECQ SI 4035 LEAQ (AX)(CX*4), AX 4036 LEAQ (DX)(BX*4), DX 4037 4038 check_limit: 4039 CMPQ SI, $0x00 4040 JHI loop 4041 RET 4042 4043 // func AmdAxpyPointerLoopX_V4A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4044 // Requires: SSE 4045 TEXT ·AmdAxpyPointerLoopX_V4A9(SB), NOSPLIT, $0-48 4046 MOVSS alpha+0(FP), X0 4047 MOVQ xs+8(FP), AX 4048 MOVQ incx+16(FP), CX 4049 MOVQ ys+24(FP), DX 4050 MOVQ incy+32(FP), BX 4051 MOVQ n+40(FP), SI 4052 JMP check_limit 4053 PCALIGN $0x08 4054 NOP 4055 4056 loop: 4057 MOVSS (AX), X1 4058 MULSS X0, X1 4059 ADDSS (DX), X1 4060 MOVSS X1, (DX) 4061 DECQ SI 4062 LEAQ (AX)(CX*4), AX 4063 LEAQ (DX)(BX*4), DX 4064 4065 check_limit: 4066 CMPQ SI, $0x00 4067 JHI loop 4068 RET 4069 4070 // func AmdAxpyPointerLoopX_V5A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4071 // Requires: SSE 4072 TEXT ·AmdAxpyPointerLoopX_V5A9(SB), NOSPLIT, $0-48 4073 MOVSS alpha+0(FP), X0 4074 MOVQ xs+8(FP), AX 4075 MOVQ incx+16(FP), CX 4076 MOVQ ys+24(FP), DX 4077 MOVQ 
incy+32(FP), BX 4078 MOVQ n+40(FP), SI 4079 JMP check_limit 4080 PCALIGN $0x08 4081 NOP 4082 4083 loop: 4084 MOVSS (AX), X1 4085 MULSS X0, X1 4086 ADDSS (DX), X1 4087 MOVSS X1, (DX) 4088 DECQ SI 4089 LEAQ (AX)(CX*4), AX 4090 LEAQ (DX)(BX*4), DX 4091 4092 check_limit: 4093 CMPQ SI, $0x00 4094 JHI loop 4095 RET 4096 4097 // func AmdAxpyPointerLoopX_V0A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4098 // Requires: SSE 4099 TEXT ·AmdAxpyPointerLoopX_V0A10(SB), NOSPLIT, $0-48 4100 MOVSS alpha+0(FP), X0 4101 MOVQ xs+8(FP), AX 4102 MOVQ incx+16(FP), CX 4103 MOVQ ys+24(FP), DX 4104 MOVQ incy+32(FP), BX 4105 MOVQ n+40(FP), SI 4106 JMP check_limit 4107 PCALIGN $0x08 4108 NOP 4109 NOP 4110 4111 loop: 4112 MOVSS (AX), X1 4113 MULSS X0, X1 4114 ADDSS (DX), X1 4115 MOVSS X1, (DX) 4116 DECQ SI 4117 LEAQ (AX)(CX*4), AX 4118 LEAQ (DX)(BX*4), DX 4119 4120 check_limit: 4121 CMPQ SI, $0x00 4122 JHI loop 4123 RET 4124 4125 // func AmdAxpyPointerLoopX_V1A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4126 // Requires: SSE 4127 TEXT ·AmdAxpyPointerLoopX_V1A10(SB), NOSPLIT, $0-48 4128 MOVSS alpha+0(FP), X0 4129 MOVQ xs+8(FP), AX 4130 MOVQ incx+16(FP), CX 4131 MOVQ ys+24(FP), DX 4132 MOVQ incy+32(FP), BX 4133 MOVQ n+40(FP), SI 4134 JMP check_limit 4135 PCALIGN $0x08 4136 NOP 4137 NOP 4138 4139 loop: 4140 MOVSS (AX), X1 4141 MULSS X0, X1 4142 ADDSS (DX), X1 4143 MOVSS X1, (DX) 4144 DECQ SI 4145 LEAQ (AX)(CX*4), AX 4146 LEAQ (DX)(BX*4), DX 4147 4148 check_limit: 4149 CMPQ SI, $0x00 4150 JHI loop 4151 RET 4152 4153 // func AmdAxpyPointerLoopX_V2A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4154 // Requires: SSE 4155 TEXT ·AmdAxpyPointerLoopX_V2A10(SB), NOSPLIT, $0-48 4156 MOVSS alpha+0(FP), X0 4157 MOVQ xs+8(FP), AX 4158 MOVQ incx+16(FP), CX 4159 MOVQ ys+24(FP), DX 4160 MOVQ incy+32(FP), BX 4161 MOVQ n+40(FP), SI 4162 JMP check_limit 4163 PCALIGN $0x08 4164 NOP 4165 NOP 4166 4167 loop: 4168 MOVSS (AX), X1 4169 MULSS X0, X1 4170 ADDSS (DX), X1 4171 MOVSS X1, (DX) 4172 DECQ SI 4173 LEAQ (AX)(CX*4), AX 4174 LEAQ (DX)(BX*4), DX 4175 4176 check_limit: 4177 CMPQ SI, $0x00 4178 JHI loop 4179 RET 4180 4181 // func AmdAxpyPointerLoopX_V3A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4182 // Requires: SSE 4183 TEXT ·AmdAxpyPointerLoopX_V3A10(SB), NOSPLIT, $0-48 4184 MOVSS alpha+0(FP), X0 4185 MOVQ xs+8(FP), AX 4186 MOVQ incx+16(FP), CX 4187 MOVQ ys+24(FP), DX 4188 MOVQ incy+32(FP), BX 4189 MOVQ n+40(FP), SI 4190 JMP check_limit 4191 PCALIGN $0x08 4192 NOP 4193 NOP 4194 4195 loop: 4196 MOVSS (AX), X1 4197 MULSS X0, X1 4198 ADDSS (DX), X1 4199 MOVSS X1, (DX) 4200 DECQ SI 4201 LEAQ (AX)(CX*4), AX 4202 LEAQ (DX)(BX*4), DX 4203 4204 check_limit: 4205 CMPQ SI, $0x00 4206 JHI loop 4207 RET 4208 4209 // func AmdAxpyPointerLoopX_V4A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4210 // Requires: SSE 4211 TEXT ·AmdAxpyPointerLoopX_V4A10(SB), NOSPLIT, $0-48 4212 MOVSS alpha+0(FP), X0 4213 MOVQ xs+8(FP), AX 4214 MOVQ incx+16(FP), CX 4215 MOVQ ys+24(FP), DX 4216 MOVQ incy+32(FP), BX 4217 MOVQ n+40(FP), SI 4218 JMP check_limit 4219 PCALIGN $0x08 4220 NOP 4221 NOP 4222 4223 loop: 4224 MOVSS (AX), X1 4225 MULSS X0, X1 4226 ADDSS (DX), X1 4227 MOVSS X1, (DX) 4228 DECQ SI 4229 LEAQ (AX)(CX*4), AX 4230 LEAQ (DX)(BX*4), DX 4231 4232 check_limit: 4233 CMPQ SI, $0x00 4234 JHI loop 4235 RET 4236 4237 // func AmdAxpyPointerLoopX_V5A10(alpha float32, xs *float32, incx 
uintptr, ys *float32, incy uintptr, n uintptr) 4238 // Requires: SSE 4239 TEXT ·AmdAxpyPointerLoopX_V5A10(SB), NOSPLIT, $0-48 4240 MOVSS alpha+0(FP), X0 4241 MOVQ xs+8(FP), AX 4242 MOVQ incx+16(FP), CX 4243 MOVQ ys+24(FP), DX 4244 MOVQ incy+32(FP), BX 4245 MOVQ n+40(FP), SI 4246 JMP check_limit 4247 PCALIGN $0x08 4248 NOP 4249 NOP 4250 4251 loop: 4252 MOVSS (AX), X1 4253 MULSS X0, X1 4254 ADDSS (DX), X1 4255 MOVSS X1, (DX) 4256 DECQ SI 4257 LEAQ (AX)(CX*4), AX 4258 LEAQ (DX)(BX*4), DX 4259 4260 check_limit: 4261 CMPQ SI, $0x00 4262 JHI loop 4263 RET 4264 4265 // func AmdAxpyPointerLoopX_V0A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4266 // Requires: SSE 4267 TEXT ·AmdAxpyPointerLoopX_V0A11(SB), NOSPLIT, $0-48 4268 MOVSS alpha+0(FP), X0 4269 MOVQ xs+8(FP), AX 4270 MOVQ incx+16(FP), CX 4271 MOVQ ys+24(FP), DX 4272 MOVQ incy+32(FP), BX 4273 MOVQ n+40(FP), SI 4274 JMP check_limit 4275 PCALIGN $0x08 4276 NOP 4277 NOP 4278 NOP 4279 4280 loop: 4281 MOVSS (AX), X1 4282 MULSS X0, X1 4283 ADDSS (DX), X1 4284 MOVSS X1, (DX) 4285 DECQ SI 4286 LEAQ (AX)(CX*4), AX 4287 LEAQ (DX)(BX*4), DX 4288 4289 check_limit: 4290 CMPQ SI, $0x00 4291 JHI loop 4292 RET 4293 4294 // func AmdAxpyPointerLoopX_V1A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4295 // Requires: SSE 4296 TEXT ·AmdAxpyPointerLoopX_V1A11(SB), NOSPLIT, $0-48 4297 MOVSS alpha+0(FP), X0 4298 MOVQ xs+8(FP), AX 4299 MOVQ incx+16(FP), CX 4300 MOVQ ys+24(FP), DX 4301 MOVQ incy+32(FP), BX 4302 MOVQ n+40(FP), SI 4303 JMP check_limit 4304 PCALIGN $0x08 4305 NOP 4306 NOP 4307 NOP 4308 4309 loop: 4310 MOVSS (AX), X1 4311 MULSS X0, X1 4312 ADDSS (DX), X1 4313 MOVSS X1, (DX) 4314 DECQ SI 4315 LEAQ (AX)(CX*4), AX 4316 LEAQ (DX)(BX*4), DX 4317 4318 check_limit: 4319 CMPQ SI, $0x00 4320 JHI loop 4321 RET 4322 4323 // func AmdAxpyPointerLoopX_V2A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4324 // Requires: SSE 4325 TEXT ·AmdAxpyPointerLoopX_V2A11(SB), NOSPLIT, $0-48 4326 MOVSS alpha+0(FP), X0 4327 MOVQ xs+8(FP), AX 4328 MOVQ incx+16(FP), CX 4329 MOVQ ys+24(FP), DX 4330 MOVQ incy+32(FP), BX 4331 MOVQ n+40(FP), SI 4332 JMP check_limit 4333 PCALIGN $0x08 4334 NOP 4335 NOP 4336 NOP 4337 4338 loop: 4339 MOVSS (AX), X1 4340 MULSS X0, X1 4341 ADDSS (DX), X1 4342 MOVSS X1, (DX) 4343 DECQ SI 4344 LEAQ (AX)(CX*4), AX 4345 LEAQ (DX)(BX*4), DX 4346 4347 check_limit: 4348 CMPQ SI, $0x00 4349 JHI loop 4350 RET 4351 4352 // func AmdAxpyPointerLoopX_V3A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4353 // Requires: SSE 4354 TEXT ·AmdAxpyPointerLoopX_V3A11(SB), NOSPLIT, $0-48 4355 MOVSS alpha+0(FP), X0 4356 MOVQ xs+8(FP), AX 4357 MOVQ incx+16(FP), CX 4358 MOVQ ys+24(FP), DX 4359 MOVQ incy+32(FP), BX 4360 MOVQ n+40(FP), SI 4361 JMP check_limit 4362 PCALIGN $0x08 4363 NOP 4364 NOP 4365 NOP 4366 4367 loop: 4368 MOVSS (AX), X1 4369 MULSS X0, X1 4370 ADDSS (DX), X1 4371 MOVSS X1, (DX) 4372 DECQ SI 4373 LEAQ (AX)(CX*4), AX 4374 LEAQ (DX)(BX*4), DX 4375 4376 check_limit: 4377 CMPQ SI, $0x00 4378 JHI loop 4379 RET 4380 4381 // func AmdAxpyPointerLoopX_V4A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4382 // Requires: SSE 4383 TEXT ·AmdAxpyPointerLoopX_V4A11(SB), NOSPLIT, $0-48 4384 MOVSS alpha+0(FP), X0 4385 MOVQ xs+8(FP), AX 4386 MOVQ incx+16(FP), CX 4387 MOVQ ys+24(FP), DX 4388 MOVQ incy+32(FP), BX 4389 MOVQ n+40(FP), SI 4390 JMP check_limit 4391 PCALIGN $0x08 4392 NOP 4393 NOP 4394 NOP 4395 4396 
loop: 4397 MOVSS (AX), X1 4398 MULSS X0, X1 4399 ADDSS (DX), X1 4400 MOVSS X1, (DX) 4401 DECQ SI 4402 LEAQ (AX)(CX*4), AX 4403 LEAQ (DX)(BX*4), DX 4404 4405 check_limit: 4406 CMPQ SI, $0x00 4407 JHI loop 4408 RET 4409 4410 // func AmdAxpyPointerLoopX_V5A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4411 // Requires: SSE 4412 TEXT ·AmdAxpyPointerLoopX_V5A11(SB), NOSPLIT, $0-48 4413 MOVSS alpha+0(FP), X0 4414 MOVQ xs+8(FP), AX 4415 MOVQ incx+16(FP), CX 4416 MOVQ ys+24(FP), DX 4417 MOVQ incy+32(FP), BX 4418 MOVQ n+40(FP), SI 4419 JMP check_limit 4420 PCALIGN $0x08 4421 NOP 4422 NOP 4423 NOP 4424 4425 loop: 4426 MOVSS (AX), X1 4427 MULSS X0, X1 4428 ADDSS (DX), X1 4429 MOVSS X1, (DX) 4430 DECQ SI 4431 LEAQ (AX)(CX*4), AX 4432 LEAQ (DX)(BX*4), DX 4433 4434 check_limit: 4435 CMPQ SI, $0x00 4436 JHI loop 4437 RET 4438 4439 // func AmdAxpyPointerLoopX_V0A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4440 // Requires: SSE 4441 TEXT ·AmdAxpyPointerLoopX_V0A12(SB), NOSPLIT, $0-48 4442 MOVSS alpha+0(FP), X0 4443 MOVQ xs+8(FP), AX 4444 MOVQ incx+16(FP), CX 4445 MOVQ ys+24(FP), DX 4446 MOVQ incy+32(FP), BX 4447 MOVQ n+40(FP), SI 4448 JMP check_limit 4449 PCALIGN $0x08 4450 NOP 4451 NOP 4452 NOP 4453 NOP 4454 4455 loop: 4456 MOVSS (AX), X1 4457 MULSS X0, X1 4458 ADDSS (DX), X1 4459 MOVSS X1, (DX) 4460 DECQ SI 4461 LEAQ (AX)(CX*4), AX 4462 LEAQ (DX)(BX*4), DX 4463 4464 check_limit: 4465 CMPQ SI, $0x00 4466 JHI loop 4467 RET 4468 4469 // func AmdAxpyPointerLoopX_V1A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4470 // Requires: SSE 4471 TEXT ·AmdAxpyPointerLoopX_V1A12(SB), NOSPLIT, $0-48 4472 MOVSS alpha+0(FP), X0 4473 MOVQ xs+8(FP), AX 4474 MOVQ incx+16(FP), CX 4475 MOVQ ys+24(FP), DX 4476 MOVQ incy+32(FP), BX 4477 MOVQ n+40(FP), SI 4478 JMP check_limit 4479 PCALIGN $0x08 4480 NOP 4481 NOP 4482 NOP 4483 NOP 4484 4485 loop: 4486 MOVSS (AX), X1 4487 MULSS X0, X1 4488 ADDSS (DX), X1 4489 MOVSS X1, (DX) 4490 DECQ SI 4491 LEAQ (AX)(CX*4), AX 4492 LEAQ (DX)(BX*4), DX 4493 4494 check_limit: 4495 CMPQ SI, $0x00 4496 JHI loop 4497 RET 4498 4499 // func AmdAxpyPointerLoopX_V2A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4500 // Requires: SSE 4501 TEXT ·AmdAxpyPointerLoopX_V2A12(SB), NOSPLIT, $0-48 4502 MOVSS alpha+0(FP), X0 4503 MOVQ xs+8(FP), AX 4504 MOVQ incx+16(FP), CX 4505 MOVQ ys+24(FP), DX 4506 MOVQ incy+32(FP), BX 4507 MOVQ n+40(FP), SI 4508 JMP check_limit 4509 PCALIGN $0x08 4510 NOP 4511 NOP 4512 NOP 4513 NOP 4514 4515 loop: 4516 MOVSS (AX), X1 4517 MULSS X0, X1 4518 ADDSS (DX), X1 4519 MOVSS X1, (DX) 4520 DECQ SI 4521 LEAQ (AX)(CX*4), AX 4522 LEAQ (DX)(BX*4), DX 4523 4524 check_limit: 4525 CMPQ SI, $0x00 4526 JHI loop 4527 RET 4528 4529 // func AmdAxpyPointerLoopX_V3A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4530 // Requires: SSE 4531 TEXT ·AmdAxpyPointerLoopX_V3A12(SB), NOSPLIT, $0-48 4532 MOVSS alpha+0(FP), X0 4533 MOVQ xs+8(FP), AX 4534 MOVQ incx+16(FP), CX 4535 MOVQ ys+24(FP), DX 4536 MOVQ incy+32(FP), BX 4537 MOVQ n+40(FP), SI 4538 JMP check_limit 4539 PCALIGN $0x08 4540 NOP 4541 NOP 4542 NOP 4543 NOP 4544 4545 loop: 4546 MOVSS (AX), X1 4547 MULSS X0, X1 4548 ADDSS (DX), X1 4549 MOVSS X1, (DX) 4550 DECQ SI 4551 LEAQ (AX)(CX*4), AX 4552 LEAQ (DX)(BX*4), DX 4553 4554 check_limit: 4555 CMPQ SI, $0x00 4556 JHI loop 4557 RET 4558 4559 // func AmdAxpyPointerLoopX_V4A12(alpha float32, xs *float32, incx uintptr, ys 
*float32, incy uintptr, n uintptr) 4560 // Requires: SSE 4561 TEXT ·AmdAxpyPointerLoopX_V4A12(SB), NOSPLIT, $0-48 4562 MOVSS alpha+0(FP), X0 4563 MOVQ xs+8(FP), AX 4564 MOVQ incx+16(FP), CX 4565 MOVQ ys+24(FP), DX 4566 MOVQ incy+32(FP), BX 4567 MOVQ n+40(FP), SI 4568 JMP check_limit 4569 PCALIGN $0x08 4570 NOP 4571 NOP 4572 NOP 4573 NOP 4574 4575 loop: 4576 MOVSS (AX), X1 4577 MULSS X0, X1 4578 ADDSS (DX), X1 4579 MOVSS X1, (DX) 4580 DECQ SI 4581 LEAQ (AX)(CX*4), AX 4582 LEAQ (DX)(BX*4), DX 4583 4584 check_limit: 4585 CMPQ SI, $0x00 4586 JHI loop 4587 RET 4588 4589 // func AmdAxpyPointerLoopX_V5A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4590 // Requires: SSE 4591 TEXT ·AmdAxpyPointerLoopX_V5A12(SB), NOSPLIT, $0-48 4592 MOVSS alpha+0(FP), X0 4593 MOVQ xs+8(FP), AX 4594 MOVQ incx+16(FP), CX 4595 MOVQ ys+24(FP), DX 4596 MOVQ incy+32(FP), BX 4597 MOVQ n+40(FP), SI 4598 JMP check_limit 4599 PCALIGN $0x08 4600 NOP 4601 NOP 4602 NOP 4603 NOP 4604 4605 loop: 4606 MOVSS (AX), X1 4607 MULSS X0, X1 4608 ADDSS (DX), X1 4609 MOVSS X1, (DX) 4610 DECQ SI 4611 LEAQ (AX)(CX*4), AX 4612 LEAQ (DX)(BX*4), DX 4613 4614 check_limit: 4615 CMPQ SI, $0x00 4616 JHI loop 4617 RET 4618 4619 // func AmdAxpyPointerLoopX_V0A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4620 // Requires: SSE 4621 TEXT ·AmdAxpyPointerLoopX_V0A13(SB), NOSPLIT, $0-48 4622 MOVSS alpha+0(FP), X0 4623 MOVQ xs+8(FP), AX 4624 MOVQ incx+16(FP), CX 4625 MOVQ ys+24(FP), DX 4626 MOVQ incy+32(FP), BX 4627 MOVQ n+40(FP), SI 4628 JMP check_limit 4629 PCALIGN $0x08 4630 NOP 4631 NOP 4632 NOP 4633 NOP 4634 NOP 4635 4636 loop: 4637 MOVSS (AX), X1 4638 MULSS X0, X1 4639 ADDSS (DX), X1 4640 MOVSS X1, (DX) 4641 DECQ SI 4642 LEAQ (AX)(CX*4), AX 4643 LEAQ (DX)(BX*4), DX 4644 4645 check_limit: 4646 CMPQ SI, $0x00 4647 JHI loop 4648 RET 4649 4650 // func AmdAxpyPointerLoopX_V1A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4651 // Requires: SSE 4652 TEXT ·AmdAxpyPointerLoopX_V1A13(SB), NOSPLIT, $0-48 4653 MOVSS alpha+0(FP), X0 4654 MOVQ xs+8(FP), AX 4655 MOVQ incx+16(FP), CX 4656 MOVQ ys+24(FP), DX 4657 MOVQ incy+32(FP), BX 4658 MOVQ n+40(FP), SI 4659 JMP check_limit 4660 PCALIGN $0x08 4661 NOP 4662 NOP 4663 NOP 4664 NOP 4665 NOP 4666 4667 loop: 4668 MOVSS (AX), X1 4669 MULSS X0, X1 4670 ADDSS (DX), X1 4671 MOVSS X1, (DX) 4672 DECQ SI 4673 LEAQ (AX)(CX*4), AX 4674 LEAQ (DX)(BX*4), DX 4675 4676 check_limit: 4677 CMPQ SI, $0x00 4678 JHI loop 4679 RET 4680 4681 // func AmdAxpyPointerLoopX_V2A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4682 // Requires: SSE 4683 TEXT ·AmdAxpyPointerLoopX_V2A13(SB), NOSPLIT, $0-48 4684 MOVSS alpha+0(FP), X0 4685 MOVQ xs+8(FP), AX 4686 MOVQ incx+16(FP), CX 4687 MOVQ ys+24(FP), DX 4688 MOVQ incy+32(FP), BX 4689 MOVQ n+40(FP), SI 4690 JMP check_limit 4691 PCALIGN $0x08 4692 NOP 4693 NOP 4694 NOP 4695 NOP 4696 NOP 4697 4698 loop: 4699 MOVSS (AX), X1 4700 MULSS X0, X1 4701 ADDSS (DX), X1 4702 MOVSS X1, (DX) 4703 DECQ SI 4704 LEAQ (AX)(CX*4), AX 4705 LEAQ (DX)(BX*4), DX 4706 4707 check_limit: 4708 CMPQ SI, $0x00 4709 JHI loop 4710 RET 4711 4712 // func AmdAxpyPointerLoopX_V3A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4713 // Requires: SSE 4714 TEXT ·AmdAxpyPointerLoopX_V3A13(SB), NOSPLIT, $0-48 4715 MOVSS alpha+0(FP), X0 4716 MOVQ xs+8(FP), AX 4717 MOVQ incx+16(FP), CX 4718 MOVQ ys+24(FP), DX 4719 MOVQ incy+32(FP), BX 4720 MOVQ n+40(FP), SI 4721 JMP 
check_limit 4722 PCALIGN $0x08 4723 NOP 4724 NOP 4725 NOP 4726 NOP 4727 NOP 4728 4729 loop: 4730 MOVSS (AX), X1 4731 MULSS X0, X1 4732 ADDSS (DX), X1 4733 MOVSS X1, (DX) 4734 DECQ SI 4735 LEAQ (AX)(CX*4), AX 4736 LEAQ (DX)(BX*4), DX 4737 4738 check_limit: 4739 CMPQ SI, $0x00 4740 JHI loop 4741 RET 4742 4743 // func AmdAxpyPointerLoopX_V4A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4744 // Requires: SSE 4745 TEXT ·AmdAxpyPointerLoopX_V4A13(SB), NOSPLIT, $0-48 4746 MOVSS alpha+0(FP), X0 4747 MOVQ xs+8(FP), AX 4748 MOVQ incx+16(FP), CX 4749 MOVQ ys+24(FP), DX 4750 MOVQ incy+32(FP), BX 4751 MOVQ n+40(FP), SI 4752 JMP check_limit 4753 PCALIGN $0x08 4754 NOP 4755 NOP 4756 NOP 4757 NOP 4758 NOP 4759 4760 loop: 4761 MOVSS (AX), X1 4762 MULSS X0, X1 4763 ADDSS (DX), X1 4764 MOVSS X1, (DX) 4765 DECQ SI 4766 LEAQ (AX)(CX*4), AX 4767 LEAQ (DX)(BX*4), DX 4768 4769 check_limit: 4770 CMPQ SI, $0x00 4771 JHI loop 4772 RET 4773 4774 // func AmdAxpyPointerLoopX_V5A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4775 // Requires: SSE 4776 TEXT ·AmdAxpyPointerLoopX_V5A13(SB), NOSPLIT, $0-48 4777 MOVSS alpha+0(FP), X0 4778 MOVQ xs+8(FP), AX 4779 MOVQ incx+16(FP), CX 4780 MOVQ ys+24(FP), DX 4781 MOVQ incy+32(FP), BX 4782 MOVQ n+40(FP), SI 4783 JMP check_limit 4784 PCALIGN $0x08 4785 NOP 4786 NOP 4787 NOP 4788 NOP 4789 NOP 4790 4791 loop: 4792 MOVSS (AX), X1 4793 MULSS X0, X1 4794 ADDSS (DX), X1 4795 MOVSS X1, (DX) 4796 DECQ SI 4797 LEAQ (AX)(CX*4), AX 4798 LEAQ (DX)(BX*4), DX 4799 4800 check_limit: 4801 CMPQ SI, $0x00 4802 JHI loop 4803 RET 4804 4805 // func AmdAxpyPointerLoopX_V0A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4806 // Requires: SSE 4807 TEXT ·AmdAxpyPointerLoopX_V0A14(SB), NOSPLIT, $0-48 4808 MOVSS alpha+0(FP), X0 4809 MOVQ xs+8(FP), AX 4810 MOVQ incx+16(FP), CX 4811 MOVQ ys+24(FP), DX 4812 MOVQ incy+32(FP), BX 4813 MOVQ n+40(FP), SI 4814 JMP check_limit 4815 PCALIGN $0x08 4816 NOP 4817 NOP 4818 NOP 4819 NOP 4820 NOP 4821 NOP 4822 4823 loop: 4824 MOVSS (AX), X1 4825 MULSS X0, X1 4826 ADDSS (DX), X1 4827 MOVSS X1, (DX) 4828 DECQ SI 4829 LEAQ (AX)(CX*4), AX 4830 LEAQ (DX)(BX*4), DX 4831 4832 check_limit: 4833 CMPQ SI, $0x00 4834 JHI loop 4835 RET 4836 4837 // func AmdAxpyPointerLoopX_V1A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4838 // Requires: SSE 4839 TEXT ·AmdAxpyPointerLoopX_V1A14(SB), NOSPLIT, $0-48 4840 MOVSS alpha+0(FP), X0 4841 MOVQ xs+8(FP), AX 4842 MOVQ incx+16(FP), CX 4843 MOVQ ys+24(FP), DX 4844 MOVQ incy+32(FP), BX 4845 MOVQ n+40(FP), SI 4846 JMP check_limit 4847 PCALIGN $0x08 4848 NOP 4849 NOP 4850 NOP 4851 NOP 4852 NOP 4853 NOP 4854 4855 loop: 4856 MOVSS (AX), X1 4857 MULSS X0, X1 4858 ADDSS (DX), X1 4859 MOVSS X1, (DX) 4860 DECQ SI 4861 LEAQ (AX)(CX*4), AX 4862 LEAQ (DX)(BX*4), DX 4863 4864 check_limit: 4865 CMPQ SI, $0x00 4866 JHI loop 4867 RET 4868 4869 // func AmdAxpyPointerLoopX_V2A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4870 // Requires: SSE 4871 TEXT ·AmdAxpyPointerLoopX_V2A14(SB), NOSPLIT, $0-48 4872 MOVSS alpha+0(FP), X0 4873 MOVQ xs+8(FP), AX 4874 MOVQ incx+16(FP), CX 4875 MOVQ ys+24(FP), DX 4876 MOVQ incy+32(FP), BX 4877 MOVQ n+40(FP), SI 4878 JMP check_limit 4879 PCALIGN $0x08 4880 NOP 4881 NOP 4882 NOP 4883 NOP 4884 NOP 4885 NOP 4886 4887 loop: 4888 MOVSS (AX), X1 4889 MULSS X0, X1 4890 ADDSS (DX), X1 4891 MOVSS X1, (DX) 4892 DECQ SI 4893 LEAQ (AX)(CX*4), AX 4894 LEAQ 
(DX)(BX*4), DX 4895 4896 check_limit: 4897 CMPQ SI, $0x00 4898 JHI loop 4899 RET 4900 4901 // func AmdAxpyPointerLoopX_V3A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4902 // Requires: SSE 4903 TEXT ·AmdAxpyPointerLoopX_V3A14(SB), NOSPLIT, $0-48 4904 MOVSS alpha+0(FP), X0 4905 MOVQ xs+8(FP), AX 4906 MOVQ incx+16(FP), CX 4907 MOVQ ys+24(FP), DX 4908 MOVQ incy+32(FP), BX 4909 MOVQ n+40(FP), SI 4910 JMP check_limit 4911 PCALIGN $0x08 4912 NOP 4913 NOP 4914 NOP 4915 NOP 4916 NOP 4917 NOP 4918 4919 loop: 4920 MOVSS (AX), X1 4921 MULSS X0, X1 4922 ADDSS (DX), X1 4923 MOVSS X1, (DX) 4924 DECQ SI 4925 LEAQ (AX)(CX*4), AX 4926 LEAQ (DX)(BX*4), DX 4927 4928 check_limit: 4929 CMPQ SI, $0x00 4930 JHI loop 4931 RET 4932 4933 // func AmdAxpyPointerLoopX_V4A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4934 // Requires: SSE 4935 TEXT ·AmdAxpyPointerLoopX_V4A14(SB), NOSPLIT, $0-48 4936 MOVSS alpha+0(FP), X0 4937 MOVQ xs+8(FP), AX 4938 MOVQ incx+16(FP), CX 4939 MOVQ ys+24(FP), DX 4940 MOVQ incy+32(FP), BX 4941 MOVQ n+40(FP), SI 4942 JMP check_limit 4943 PCALIGN $0x08 4944 NOP 4945 NOP 4946 NOP 4947 NOP 4948 NOP 4949 NOP 4950 4951 loop: 4952 MOVSS (AX), X1 4953 MULSS X0, X1 4954 ADDSS (DX), X1 4955 MOVSS X1, (DX) 4956 DECQ SI 4957 LEAQ (AX)(CX*4), AX 4958 LEAQ (DX)(BX*4), DX 4959 4960 check_limit: 4961 CMPQ SI, $0x00 4962 JHI loop 4963 RET 4964 4965 // func AmdAxpyPointerLoopX_V5A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4966 // Requires: SSE 4967 TEXT ·AmdAxpyPointerLoopX_V5A14(SB), NOSPLIT, $0-48 4968 MOVSS alpha+0(FP), X0 4969 MOVQ xs+8(FP), AX 4970 MOVQ incx+16(FP), CX 4971 MOVQ ys+24(FP), DX 4972 MOVQ incy+32(FP), BX 4973 MOVQ n+40(FP), SI 4974 JMP check_limit 4975 PCALIGN $0x08 4976 NOP 4977 NOP 4978 NOP 4979 NOP 4980 NOP 4981 NOP 4982 4983 loop: 4984 MOVSS (AX), X1 4985 MULSS X0, X1 4986 ADDSS (DX), X1 4987 MOVSS X1, (DX) 4988 DECQ SI 4989 LEAQ (AX)(CX*4), AX 4990 LEAQ (DX)(BX*4), DX 4991 4992 check_limit: 4993 CMPQ SI, $0x00 4994 JHI loop 4995 RET 4996 4997 // func AmdAxpyPointerLoopX_V0A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 4998 // Requires: SSE 4999 TEXT ·AmdAxpyPointerLoopX_V0A15(SB), NOSPLIT, $0-48 5000 MOVSS alpha+0(FP), X0 5001 MOVQ xs+8(FP), AX 5002 MOVQ incx+16(FP), CX 5003 MOVQ ys+24(FP), DX 5004 MOVQ incy+32(FP), BX 5005 MOVQ n+40(FP), SI 5006 JMP check_limit 5007 PCALIGN $0x08 5008 NOP 5009 NOP 5010 NOP 5011 NOP 5012 NOP 5013 NOP 5014 NOP 5015 5016 loop: 5017 MOVSS (AX), X1 5018 MULSS X0, X1 5019 ADDSS (DX), X1 5020 MOVSS X1, (DX) 5021 DECQ SI 5022 LEAQ (AX)(CX*4), AX 5023 LEAQ (DX)(BX*4), DX 5024 5025 check_limit: 5026 CMPQ SI, $0x00 5027 JHI loop 5028 RET 5029 5030 // func AmdAxpyPointerLoopX_V1A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5031 // Requires: SSE 5032 TEXT ·AmdAxpyPointerLoopX_V1A15(SB), NOSPLIT, $0-48 5033 MOVSS alpha+0(FP), X0 5034 MOVQ xs+8(FP), AX 5035 MOVQ incx+16(FP), CX 5036 MOVQ ys+24(FP), DX 5037 MOVQ incy+32(FP), BX 5038 MOVQ n+40(FP), SI 5039 JMP check_limit 5040 PCALIGN $0x08 5041 NOP 5042 NOP 5043 NOP 5044 NOP 5045 NOP 5046 NOP 5047 NOP 5048 5049 loop: 5050 MOVSS (AX), X1 5051 MULSS X0, X1 5052 ADDSS (DX), X1 5053 MOVSS X1, (DX) 5054 DECQ SI 5055 LEAQ (AX)(CX*4), AX 5056 LEAQ (DX)(BX*4), DX 5057 5058 check_limit: 5059 CMPQ SI, $0x00 5060 JHI loop 5061 RET 5062 5063 // func AmdAxpyPointerLoopX_V2A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy 
uintptr, n uintptr) 5064 // Requires: SSE 5065 TEXT ·AmdAxpyPointerLoopX_V2A15(SB), NOSPLIT, $0-48 5066 MOVSS alpha+0(FP), X0 5067 MOVQ xs+8(FP), AX 5068 MOVQ incx+16(FP), CX 5069 MOVQ ys+24(FP), DX 5070 MOVQ incy+32(FP), BX 5071 MOVQ n+40(FP), SI 5072 JMP check_limit 5073 PCALIGN $0x08 5074 NOP 5075 NOP 5076 NOP 5077 NOP 5078 NOP 5079 NOP 5080 NOP 5081 5082 loop: 5083 MOVSS (AX), X1 5084 MULSS X0, X1 5085 ADDSS (DX), X1 5086 MOVSS X1, (DX) 5087 DECQ SI 5088 LEAQ (AX)(CX*4), AX 5089 LEAQ (DX)(BX*4), DX 5090 5091 check_limit: 5092 CMPQ SI, $0x00 5093 JHI loop 5094 RET 5095 5096 // func AmdAxpyPointerLoopX_V3A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5097 // Requires: SSE 5098 TEXT ·AmdAxpyPointerLoopX_V3A15(SB), NOSPLIT, $0-48 5099 MOVSS alpha+0(FP), X0 5100 MOVQ xs+8(FP), AX 5101 MOVQ incx+16(FP), CX 5102 MOVQ ys+24(FP), DX 5103 MOVQ incy+32(FP), BX 5104 MOVQ n+40(FP), SI 5105 JMP check_limit 5106 PCALIGN $0x08 5107 NOP 5108 NOP 5109 NOP 5110 NOP 5111 NOP 5112 NOP 5113 NOP 5114 5115 loop: 5116 MOVSS (AX), X1 5117 MULSS X0, X1 5118 ADDSS (DX), X1 5119 MOVSS X1, (DX) 5120 DECQ SI 5121 LEAQ (AX)(CX*4), AX 5122 LEAQ (DX)(BX*4), DX 5123 5124 check_limit: 5125 CMPQ SI, $0x00 5126 JHI loop 5127 RET 5128 5129 // func AmdAxpyPointerLoopX_V4A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5130 // Requires: SSE 5131 TEXT ·AmdAxpyPointerLoopX_V4A15(SB), NOSPLIT, $0-48 5132 MOVSS alpha+0(FP), X0 5133 MOVQ xs+8(FP), AX 5134 MOVQ incx+16(FP), CX 5135 MOVQ ys+24(FP), DX 5136 MOVQ incy+32(FP), BX 5137 MOVQ n+40(FP), SI 5138 JMP check_limit 5139 PCALIGN $0x08 5140 NOP 5141 NOP 5142 NOP 5143 NOP 5144 NOP 5145 NOP 5146 NOP 5147 5148 loop: 5149 MOVSS (AX), X1 5150 MULSS X0, X1 5151 ADDSS (DX), X1 5152 MOVSS X1, (DX) 5153 DECQ SI 5154 LEAQ (AX)(CX*4), AX 5155 LEAQ (DX)(BX*4), DX 5156 5157 check_limit: 5158 CMPQ SI, $0x00 5159 JHI loop 5160 RET 5161 5162 // func AmdAxpyPointerLoopX_V5A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5163 // Requires: SSE 5164 TEXT ·AmdAxpyPointerLoopX_V5A15(SB), NOSPLIT, $0-48 5165 MOVSS alpha+0(FP), X0 5166 MOVQ xs+8(FP), AX 5167 MOVQ incx+16(FP), CX 5168 MOVQ ys+24(FP), DX 5169 MOVQ incy+32(FP), BX 5170 MOVQ n+40(FP), SI 5171 JMP check_limit 5172 PCALIGN $0x08 5173 NOP 5174 NOP 5175 NOP 5176 NOP 5177 NOP 5178 NOP 5179 NOP 5180 5181 loop: 5182 MOVSS (AX), X1 5183 MULSS X0, X1 5184 ADDSS (DX), X1 5185 MOVSS X1, (DX) 5186 DECQ SI 5187 LEAQ (AX)(CX*4), AX 5188 LEAQ (DX)(BX*4), DX 5189 5190 check_limit: 5191 CMPQ SI, $0x00 5192 JHI loop 5193 RET 5194 5195 // func AmdAxpyPointerLoopX_V0A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5196 // Requires: SSE 5197 TEXT ·AmdAxpyPointerLoopX_V0A16(SB), NOSPLIT, $0-48 5198 MOVSS alpha+0(FP), X0 5199 MOVQ xs+8(FP), AX 5200 MOVQ incx+16(FP), CX 5201 MOVQ ys+24(FP), DX 5202 MOVQ incy+32(FP), BX 5203 MOVQ n+40(FP), SI 5204 JMP check_limit 5205 PCALIGN $0x10 5206 5207 loop: 5208 MOVSS (AX), X1 5209 MULSS X0, X1 5210 ADDSS (DX), X1 5211 MOVSS X1, (DX) 5212 DECQ SI 5213 LEAQ (AX)(CX*4), AX 5214 LEAQ (DX)(BX*4), DX 5215 5216 check_limit: 5217 CMPQ SI, $0x00 5218 JHI loop 5219 RET 5220 5221 // func AmdAxpyPointerLoopX_V1A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5222 // Requires: SSE 5223 TEXT ·AmdAxpyPointerLoopX_V1A16(SB), NOSPLIT, $0-48 5224 MOVSS alpha+0(FP), X0 5225 MOVQ xs+8(FP), AX 5226 MOVQ incx+16(FP), CX 5227 MOVQ ys+24(FP), DX 5228 MOVQ incy+32(FP), BX 
5229 MOVQ n+40(FP), SI 5230 JMP check_limit 5231 PCALIGN $0x10 5232 5233 loop: 5234 MOVSS (AX), X1 5235 MULSS X0, X1 5236 ADDSS (DX), X1 5237 MOVSS X1, (DX) 5238 DECQ SI 5239 LEAQ (AX)(CX*4), AX 5240 LEAQ (DX)(BX*4), DX 5241 5242 check_limit: 5243 CMPQ SI, $0x00 5244 JHI loop 5245 RET 5246 5247 // func AmdAxpyPointerLoopX_V2A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5248 // Requires: SSE 5249 TEXT ·AmdAxpyPointerLoopX_V2A16(SB), NOSPLIT, $0-48 5250 MOVSS alpha+0(FP), X0 5251 MOVQ xs+8(FP), AX 5252 MOVQ incx+16(FP), CX 5253 MOVQ ys+24(FP), DX 5254 MOVQ incy+32(FP), BX 5255 MOVQ n+40(FP), SI 5256 JMP check_limit 5257 PCALIGN $0x10 5258 5259 loop: 5260 MOVSS (AX), X1 5261 MULSS X0, X1 5262 ADDSS (DX), X1 5263 MOVSS X1, (DX) 5264 DECQ SI 5265 LEAQ (AX)(CX*4), AX 5266 LEAQ (DX)(BX*4), DX 5267 5268 check_limit: 5269 CMPQ SI, $0x00 5270 JHI loop 5271 RET 5272 5273 // func AmdAxpyPointerLoopX_V3A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5274 // Requires: SSE 5275 TEXT ·AmdAxpyPointerLoopX_V3A16(SB), NOSPLIT, $0-48 5276 MOVSS alpha+0(FP), X0 5277 MOVQ xs+8(FP), AX 5278 MOVQ incx+16(FP), CX 5279 MOVQ ys+24(FP), DX 5280 MOVQ incy+32(FP), BX 5281 MOVQ n+40(FP), SI 5282 JMP check_limit 5283 PCALIGN $0x10 5284 5285 loop: 5286 MOVSS (AX), X1 5287 MULSS X0, X1 5288 ADDSS (DX), X1 5289 MOVSS X1, (DX) 5290 DECQ SI 5291 LEAQ (AX)(CX*4), AX 5292 LEAQ (DX)(BX*4), DX 5293 5294 check_limit: 5295 CMPQ SI, $0x00 5296 JHI loop 5297 RET 5298 5299 // func AmdAxpyPointerLoopX_V4A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5300 // Requires: SSE 5301 TEXT ·AmdAxpyPointerLoopX_V4A16(SB), NOSPLIT, $0-48 5302 MOVSS alpha+0(FP), X0 5303 MOVQ xs+8(FP), AX 5304 MOVQ incx+16(FP), CX 5305 MOVQ ys+24(FP), DX 5306 MOVQ incy+32(FP), BX 5307 MOVQ n+40(FP), SI 5308 JMP check_limit 5309 PCALIGN $0x10 5310 5311 loop: 5312 MOVSS (AX), X1 5313 MULSS X0, X1 5314 ADDSS (DX), X1 5315 MOVSS X1, (DX) 5316 DECQ SI 5317 LEAQ (AX)(CX*4), AX 5318 LEAQ (DX)(BX*4), DX 5319 5320 check_limit: 5321 CMPQ SI, $0x00 5322 JHI loop 5323 RET 5324 5325 // func AmdAxpyPointerLoopX_V5A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5326 // Requires: SSE 5327 TEXT ·AmdAxpyPointerLoopX_V5A16(SB), NOSPLIT, $0-48 5328 MOVSS alpha+0(FP), X0 5329 MOVQ xs+8(FP), AX 5330 MOVQ incx+16(FP), CX 5331 MOVQ ys+24(FP), DX 5332 MOVQ incy+32(FP), BX 5333 MOVQ n+40(FP), SI 5334 JMP check_limit 5335 PCALIGN $0x10 5336 5337 loop: 5338 MOVSS (AX), X1 5339 MULSS X0, X1 5340 ADDSS (DX), X1 5341 MOVSS X1, (DX) 5342 DECQ SI 5343 LEAQ (AX)(CX*4), AX 5344 LEAQ (DX)(BX*4), DX 5345 5346 check_limit: 5347 CMPQ SI, $0x00 5348 JHI loop 5349 RET 5350 5351 // func AmdAxpyUnsafeX_V0A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5352 // Requires: SSE 5353 TEXT ·AmdAxpyUnsafeX_V0A0(SB), NOSPLIT, $0-48 5354 MOVSS alpha+0(FP), X0 5355 MOVQ xs+8(FP), AX 5356 MOVQ incx+16(FP), CX 5357 MOVQ ys+24(FP), DX 5358 MOVQ incy+32(FP), BX 5359 MOVQ n+40(FP), SI 5360 XORQ DI, DI 5361 XORQ R8, R8 5362 JMP check_limit 5363 5364 loop: 5365 MOVSS (AX)(DI*4), X1 5366 MULSS X0, X1 5367 ADDSS (DX)(R8*4), X1 5368 MOVSS X1, (DX)(R8*4) 5369 DECQ SI 5370 ADDQ CX, DI 5371 ADDQ BX, R8 5372 5373 check_limit: 5374 CMPQ SI, $0x00 5375 JHI loop 5376 RET 5377 5378 // func AmdAxpyUnsafeX_V1A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5379 // Requires: SSE 5380 TEXT ·AmdAxpyUnsafeX_V1A0(SB), NOSPLIT, 
$0-48 5381 MOVSS alpha+0(FP), X0 5382 MOVQ xs+8(FP), AX 5383 MOVQ incx+16(FP), CX 5384 MOVQ ys+24(FP), DX 5385 MOVQ incy+32(FP), BX 5386 MOVQ n+40(FP), SI 5387 XORQ DI, DI 5388 XORQ R8, R8 5389 JMP check_limit 5390 5391 loop: 5392 MOVSS (AX)(DI*4), X1 5393 MULSS X0, X1 5394 ADDSS (DX)(R8*4), X1 5395 MOVSS X1, (DX)(R8*4) 5396 DECQ SI 5397 ADDQ CX, DI 5398 ADDQ BX, R8 5399 5400 check_limit: 5401 CMPQ SI, $0x00 5402 JHI loop 5403 RET 5404 5405 // func AmdAxpyUnsafeX_V2A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5406 // Requires: SSE 5407 TEXT ·AmdAxpyUnsafeX_V2A0(SB), NOSPLIT, $0-48 5408 MOVSS alpha+0(FP), X0 5409 MOVQ xs+8(FP), AX 5410 MOVQ incx+16(FP), CX 5411 MOVQ ys+24(FP), DX 5412 MOVQ incy+32(FP), BX 5413 MOVQ n+40(FP), SI 5414 XORQ DI, DI 5415 XORQ R8, R8 5416 JMP check_limit 5417 5418 loop: 5419 MOVSS (AX)(DI*4), X1 5420 MULSS X0, X1 5421 ADDSS (DX)(R8*4), X1 5422 MOVSS X1, (DX)(R8*4) 5423 DECQ SI 5424 ADDQ CX, DI 5425 ADDQ BX, R8 5426 5427 check_limit: 5428 CMPQ SI, $0x00 5429 JHI loop 5430 RET 5431 5432 // func AmdAxpyUnsafeX_V3A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5433 // Requires: SSE 5434 TEXT ·AmdAxpyUnsafeX_V3A0(SB), NOSPLIT, $0-48 5435 MOVSS alpha+0(FP), X0 5436 MOVQ xs+8(FP), AX 5437 MOVQ incx+16(FP), CX 5438 MOVQ ys+24(FP), DX 5439 MOVQ incy+32(FP), BX 5440 MOVQ n+40(FP), SI 5441 XORQ DI, DI 5442 XORQ R8, R8 5443 JMP check_limit 5444 5445 loop: 5446 MOVSS (AX)(DI*4), X1 5447 MULSS X0, X1 5448 ADDSS (DX)(R8*4), X1 5449 MOVSS X1, (DX)(R8*4) 5450 DECQ SI 5451 ADDQ CX, DI 5452 ADDQ BX, R8 5453 5454 check_limit: 5455 CMPQ SI, $0x00 5456 JHI loop 5457 RET 5458 5459 // func AmdAxpyUnsafeX_V4A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5460 // Requires: SSE 5461 TEXT ·AmdAxpyUnsafeX_V4A0(SB), NOSPLIT, $0-48 5462 MOVSS alpha+0(FP), X0 5463 MOVQ xs+8(FP), AX 5464 MOVQ incx+16(FP), CX 5465 MOVQ ys+24(FP), DX 5466 MOVQ incy+32(FP), BX 5467 MOVQ n+40(FP), SI 5468 XORQ DI, DI 5469 XORQ R8, R8 5470 JMP check_limit 5471 5472 loop: 5473 MOVSS (AX)(DI*4), X1 5474 MULSS X0, X1 5475 ADDSS (DX)(R8*4), X1 5476 MOVSS X1, (DX)(R8*4) 5477 DECQ SI 5478 ADDQ CX, DI 5479 ADDQ BX, R8 5480 5481 check_limit: 5482 CMPQ SI, $0x00 5483 JHI loop 5484 RET 5485 5486 // func AmdAxpyUnsafeX_V5A0(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5487 // Requires: SSE 5488 TEXT ·AmdAxpyUnsafeX_V5A0(SB), NOSPLIT, $0-48 5489 MOVSS alpha+0(FP), X0 5490 MOVQ xs+8(FP), AX 5491 MOVQ incx+16(FP), CX 5492 MOVQ ys+24(FP), DX 5493 MOVQ incy+32(FP), BX 5494 MOVQ n+40(FP), SI 5495 XORQ DI, DI 5496 XORQ R8, R8 5497 JMP check_limit 5498 5499 loop: 5500 MOVSS (AX)(DI*4), X1 5501 MULSS X0, X1 5502 ADDSS (DX)(R8*4), X1 5503 MOVSS X1, (DX)(R8*4) 5504 DECQ SI 5505 ADDQ CX, DI 5506 ADDQ BX, R8 5507 5508 check_limit: 5509 CMPQ SI, $0x00 5510 JHI loop 5511 RET 5512 5513 // func AmdAxpyUnsafeX_V0A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5514 // Requires: SSE 5515 TEXT ·AmdAxpyUnsafeX_V0A8(SB), NOSPLIT, $0-48 5516 MOVSS alpha+0(FP), X0 5517 MOVQ xs+8(FP), AX 5518 MOVQ incx+16(FP), CX 5519 MOVQ ys+24(FP), DX 5520 MOVQ incy+32(FP), BX 5521 MOVQ n+40(FP), SI 5522 XORQ DI, DI 5523 XORQ R8, R8 5524 JMP check_limit 5525 PCALIGN $0x08 5526 5527 loop: 5528 MOVSS (AX)(DI*4), X1 5529 MULSS X0, X1 5530 ADDSS (DX)(R8*4), X1 5531 MOVSS X1, (DX)(R8*4) 5532 DECQ SI 5533 ADDQ CX, DI 5534 ADDQ BX, R8 5535 5536 check_limit: 5537 CMPQ SI, $0x00 5538 JHI loop 5539 RET 
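// Note on the AmdAxpyUnsafeX variants (beginning with AmdAxpyUnsafeX_V0A0 above): unlike
// the pointer-based loops, the base pointers in AX and DX stay fixed and two index
// registers (DI for x, R8 for y) are advanced by incx/incy each iteration, so loads and
// stores use scaled addressing such as (AX)(DI*4) and (DX)(R8*4). The A8/A9/... suffixes
// again only vary the PCALIGN/NOP padding before the loop. Conceptually (hypothetical Go
// sketch, indices counted in elements):
//
//	for i, j := uintptr(0), uintptr(0); n > 0; n-- {
//		ys[j] += alpha * xs[i]
//		i += incx
//		j += incy
//	}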
5540 5541 // func AmdAxpyUnsafeX_V1A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5542 // Requires: SSE 5543 TEXT ·AmdAxpyUnsafeX_V1A8(SB), NOSPLIT, $0-48 5544 MOVSS alpha+0(FP), X0 5545 MOVQ xs+8(FP), AX 5546 MOVQ incx+16(FP), CX 5547 MOVQ ys+24(FP), DX 5548 MOVQ incy+32(FP), BX 5549 MOVQ n+40(FP), SI 5550 XORQ DI, DI 5551 XORQ R8, R8 5552 JMP check_limit 5553 PCALIGN $0x08 5554 5555 loop: 5556 MOVSS (AX)(DI*4), X1 5557 MULSS X0, X1 5558 ADDSS (DX)(R8*4), X1 5559 MOVSS X1, (DX)(R8*4) 5560 DECQ SI 5561 ADDQ CX, DI 5562 ADDQ BX, R8 5563 5564 check_limit: 5565 CMPQ SI, $0x00 5566 JHI loop 5567 RET 5568 5569 // func AmdAxpyUnsafeX_V2A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5570 // Requires: SSE 5571 TEXT ·AmdAxpyUnsafeX_V2A8(SB), NOSPLIT, $0-48 5572 MOVSS alpha+0(FP), X0 5573 MOVQ xs+8(FP), AX 5574 MOVQ incx+16(FP), CX 5575 MOVQ ys+24(FP), DX 5576 MOVQ incy+32(FP), BX 5577 MOVQ n+40(FP), SI 5578 XORQ DI, DI 5579 XORQ R8, R8 5580 JMP check_limit 5581 PCALIGN $0x08 5582 5583 loop: 5584 MOVSS (AX)(DI*4), X1 5585 MULSS X0, X1 5586 ADDSS (DX)(R8*4), X1 5587 MOVSS X1, (DX)(R8*4) 5588 DECQ SI 5589 ADDQ CX, DI 5590 ADDQ BX, R8 5591 5592 check_limit: 5593 CMPQ SI, $0x00 5594 JHI loop 5595 RET 5596 5597 // func AmdAxpyUnsafeX_V3A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5598 // Requires: SSE 5599 TEXT ·AmdAxpyUnsafeX_V3A8(SB), NOSPLIT, $0-48 5600 MOVSS alpha+0(FP), X0 5601 MOVQ xs+8(FP), AX 5602 MOVQ incx+16(FP), CX 5603 MOVQ ys+24(FP), DX 5604 MOVQ incy+32(FP), BX 5605 MOVQ n+40(FP), SI 5606 XORQ DI, DI 5607 XORQ R8, R8 5608 JMP check_limit 5609 PCALIGN $0x08 5610 5611 loop: 5612 MOVSS (AX)(DI*4), X1 5613 MULSS X0, X1 5614 ADDSS (DX)(R8*4), X1 5615 MOVSS X1, (DX)(R8*4) 5616 DECQ SI 5617 ADDQ CX, DI 5618 ADDQ BX, R8 5619 5620 check_limit: 5621 CMPQ SI, $0x00 5622 JHI loop 5623 RET 5624 5625 // func AmdAxpyUnsafeX_V4A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5626 // Requires: SSE 5627 TEXT ·AmdAxpyUnsafeX_V4A8(SB), NOSPLIT, $0-48 5628 MOVSS alpha+0(FP), X0 5629 MOVQ xs+8(FP), AX 5630 MOVQ incx+16(FP), CX 5631 MOVQ ys+24(FP), DX 5632 MOVQ incy+32(FP), BX 5633 MOVQ n+40(FP), SI 5634 XORQ DI, DI 5635 XORQ R8, R8 5636 JMP check_limit 5637 PCALIGN $0x08 5638 5639 loop: 5640 MOVSS (AX)(DI*4), X1 5641 MULSS X0, X1 5642 ADDSS (DX)(R8*4), X1 5643 MOVSS X1, (DX)(R8*4) 5644 DECQ SI 5645 ADDQ CX, DI 5646 ADDQ BX, R8 5647 5648 check_limit: 5649 CMPQ SI, $0x00 5650 JHI loop 5651 RET 5652 5653 // func AmdAxpyUnsafeX_V5A8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5654 // Requires: SSE 5655 TEXT ·AmdAxpyUnsafeX_V5A8(SB), NOSPLIT, $0-48 5656 MOVSS alpha+0(FP), X0 5657 MOVQ xs+8(FP), AX 5658 MOVQ incx+16(FP), CX 5659 MOVQ ys+24(FP), DX 5660 MOVQ incy+32(FP), BX 5661 MOVQ n+40(FP), SI 5662 XORQ DI, DI 5663 XORQ R8, R8 5664 JMP check_limit 5665 PCALIGN $0x08 5666 5667 loop: 5668 MOVSS (AX)(DI*4), X1 5669 MULSS X0, X1 5670 ADDSS (DX)(R8*4), X1 5671 MOVSS X1, (DX)(R8*4) 5672 DECQ SI 5673 ADDQ CX, DI 5674 ADDQ BX, R8 5675 5676 check_limit: 5677 CMPQ SI, $0x00 5678 JHI loop 5679 RET 5680 5681 // func AmdAxpyUnsafeX_V0A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5682 // Requires: SSE 5683 TEXT ·AmdAxpyUnsafeX_V0A9(SB), NOSPLIT, $0-48 5684 MOVSS alpha+0(FP), X0 5685 MOVQ xs+8(FP), AX 5686 MOVQ incx+16(FP), CX 5687 MOVQ ys+24(FP), DX 5688 MOVQ incy+32(FP), BX 5689 MOVQ n+40(FP), SI 5690 XORQ DI, DI 
5691 XORQ R8, R8 5692 JMP check_limit 5693 PCALIGN $0x08 5694 NOP 5695 5696 loop: 5697 MOVSS (AX)(DI*4), X1 5698 MULSS X0, X1 5699 ADDSS (DX)(R8*4), X1 5700 MOVSS X1, (DX)(R8*4) 5701 DECQ SI 5702 ADDQ CX, DI 5703 ADDQ BX, R8 5704 5705 check_limit: 5706 CMPQ SI, $0x00 5707 JHI loop 5708 RET 5709 5710 // func AmdAxpyUnsafeX_V1A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5711 // Requires: SSE 5712 TEXT ·AmdAxpyUnsafeX_V1A9(SB), NOSPLIT, $0-48 5713 MOVSS alpha+0(FP), X0 5714 MOVQ xs+8(FP), AX 5715 MOVQ incx+16(FP), CX 5716 MOVQ ys+24(FP), DX 5717 MOVQ incy+32(FP), BX 5718 MOVQ n+40(FP), SI 5719 XORQ DI, DI 5720 XORQ R8, R8 5721 JMP check_limit 5722 PCALIGN $0x08 5723 NOP 5724 5725 loop: 5726 MOVSS (AX)(DI*4), X1 5727 MULSS X0, X1 5728 ADDSS (DX)(R8*4), X1 5729 MOVSS X1, (DX)(R8*4) 5730 DECQ SI 5731 ADDQ CX, DI 5732 ADDQ BX, R8 5733 5734 check_limit: 5735 CMPQ SI, $0x00 5736 JHI loop 5737 RET 5738 5739 // func AmdAxpyUnsafeX_V2A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5740 // Requires: SSE 5741 TEXT ·AmdAxpyUnsafeX_V2A9(SB), NOSPLIT, $0-48 5742 MOVSS alpha+0(FP), X0 5743 MOVQ xs+8(FP), AX 5744 MOVQ incx+16(FP), CX 5745 MOVQ ys+24(FP), DX 5746 MOVQ incy+32(FP), BX 5747 MOVQ n+40(FP), SI 5748 XORQ DI, DI 5749 XORQ R8, R8 5750 JMP check_limit 5751 PCALIGN $0x08 5752 NOP 5753 5754 loop: 5755 MOVSS (AX)(DI*4), X1 5756 MULSS X0, X1 5757 ADDSS (DX)(R8*4), X1 5758 MOVSS X1, (DX)(R8*4) 5759 DECQ SI 5760 ADDQ CX, DI 5761 ADDQ BX, R8 5762 5763 check_limit: 5764 CMPQ SI, $0x00 5765 JHI loop 5766 RET 5767 5768 // func AmdAxpyUnsafeX_V3A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5769 // Requires: SSE 5770 TEXT ·AmdAxpyUnsafeX_V3A9(SB), NOSPLIT, $0-48 5771 MOVSS alpha+0(FP), X0 5772 MOVQ xs+8(FP), AX 5773 MOVQ incx+16(FP), CX 5774 MOVQ ys+24(FP), DX 5775 MOVQ incy+32(FP), BX 5776 MOVQ n+40(FP), SI 5777 XORQ DI, DI 5778 XORQ R8, R8 5779 JMP check_limit 5780 PCALIGN $0x08 5781 NOP 5782 5783 loop: 5784 MOVSS (AX)(DI*4), X1 5785 MULSS X0, X1 5786 ADDSS (DX)(R8*4), X1 5787 MOVSS X1, (DX)(R8*4) 5788 DECQ SI 5789 ADDQ CX, DI 5790 ADDQ BX, R8 5791 5792 check_limit: 5793 CMPQ SI, $0x00 5794 JHI loop 5795 RET 5796 5797 // func AmdAxpyUnsafeX_V4A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5798 // Requires: SSE 5799 TEXT ·AmdAxpyUnsafeX_V4A9(SB), NOSPLIT, $0-48 5800 MOVSS alpha+0(FP), X0 5801 MOVQ xs+8(FP), AX 5802 MOVQ incx+16(FP), CX 5803 MOVQ ys+24(FP), DX 5804 MOVQ incy+32(FP), BX 5805 MOVQ n+40(FP), SI 5806 XORQ DI, DI 5807 XORQ R8, R8 5808 JMP check_limit 5809 PCALIGN $0x08 5810 NOP 5811 5812 loop: 5813 MOVSS (AX)(DI*4), X1 5814 MULSS X0, X1 5815 ADDSS (DX)(R8*4), X1 5816 MOVSS X1, (DX)(R8*4) 5817 DECQ SI 5818 ADDQ CX, DI 5819 ADDQ BX, R8 5820 5821 check_limit: 5822 CMPQ SI, $0x00 5823 JHI loop 5824 RET 5825 5826 // func AmdAxpyUnsafeX_V5A9(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5827 // Requires: SSE 5828 TEXT ·AmdAxpyUnsafeX_V5A9(SB), NOSPLIT, $0-48 5829 MOVSS alpha+0(FP), X0 5830 MOVQ xs+8(FP), AX 5831 MOVQ incx+16(FP), CX 5832 MOVQ ys+24(FP), DX 5833 MOVQ incy+32(FP), BX 5834 MOVQ n+40(FP), SI 5835 XORQ DI, DI 5836 XORQ R8, R8 5837 JMP check_limit 5838 PCALIGN $0x08 5839 NOP 5840 5841 loop: 5842 MOVSS (AX)(DI*4), X1 5843 MULSS X0, X1 5844 ADDSS (DX)(R8*4), X1 5845 MOVSS X1, (DX)(R8*4) 5846 DECQ SI 5847 ADDQ CX, DI 5848 ADDQ BX, R8 5849 5850 check_limit: 5851 CMPQ SI, $0x00 5852 JHI loop 5853 RET 5854 5855 // func 
AmdAxpyUnsafeX_V0A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5856 // Requires: SSE 5857 TEXT ·AmdAxpyUnsafeX_V0A10(SB), NOSPLIT, $0-48 5858 MOVSS alpha+0(FP), X0 5859 MOVQ xs+8(FP), AX 5860 MOVQ incx+16(FP), CX 5861 MOVQ ys+24(FP), DX 5862 MOVQ incy+32(FP), BX 5863 MOVQ n+40(FP), SI 5864 XORQ DI, DI 5865 XORQ R8, R8 5866 JMP check_limit 5867 PCALIGN $0x08 5868 NOP 5869 NOP 5870 5871 loop: 5872 MOVSS (AX)(DI*4), X1 5873 MULSS X0, X1 5874 ADDSS (DX)(R8*4), X1 5875 MOVSS X1, (DX)(R8*4) 5876 DECQ SI 5877 ADDQ CX, DI 5878 ADDQ BX, R8 5879 5880 check_limit: 5881 CMPQ SI, $0x00 5882 JHI loop 5883 RET 5884 5885 // func AmdAxpyUnsafeX_V1A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5886 // Requires: SSE 5887 TEXT ·AmdAxpyUnsafeX_V1A10(SB), NOSPLIT, $0-48 5888 MOVSS alpha+0(FP), X0 5889 MOVQ xs+8(FP), AX 5890 MOVQ incx+16(FP), CX 5891 MOVQ ys+24(FP), DX 5892 MOVQ incy+32(FP), BX 5893 MOVQ n+40(FP), SI 5894 XORQ DI, DI 5895 XORQ R8, R8 5896 JMP check_limit 5897 PCALIGN $0x08 5898 NOP 5899 NOP 5900 5901 loop: 5902 MOVSS (AX)(DI*4), X1 5903 MULSS X0, X1 5904 ADDSS (DX)(R8*4), X1 5905 MOVSS X1, (DX)(R8*4) 5906 DECQ SI 5907 ADDQ CX, DI 5908 ADDQ BX, R8 5909 5910 check_limit: 5911 CMPQ SI, $0x00 5912 JHI loop 5913 RET 5914 5915 // func AmdAxpyUnsafeX_V2A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5916 // Requires: SSE 5917 TEXT ·AmdAxpyUnsafeX_V2A10(SB), NOSPLIT, $0-48 5918 MOVSS alpha+0(FP), X0 5919 MOVQ xs+8(FP), AX 5920 MOVQ incx+16(FP), CX 5921 MOVQ ys+24(FP), DX 5922 MOVQ incy+32(FP), BX 5923 MOVQ n+40(FP), SI 5924 XORQ DI, DI 5925 XORQ R8, R8 5926 JMP check_limit 5927 PCALIGN $0x08 5928 NOP 5929 NOP 5930 5931 loop: 5932 MOVSS (AX)(DI*4), X1 5933 MULSS X0, X1 5934 ADDSS (DX)(R8*4), X1 5935 MOVSS X1, (DX)(R8*4) 5936 DECQ SI 5937 ADDQ CX, DI 5938 ADDQ BX, R8 5939 5940 check_limit: 5941 CMPQ SI, $0x00 5942 JHI loop 5943 RET 5944 5945 // func AmdAxpyUnsafeX_V3A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5946 // Requires: SSE 5947 TEXT ·AmdAxpyUnsafeX_V3A10(SB), NOSPLIT, $0-48 5948 MOVSS alpha+0(FP), X0 5949 MOVQ xs+8(FP), AX 5950 MOVQ incx+16(FP), CX 5951 MOVQ ys+24(FP), DX 5952 MOVQ incy+32(FP), BX 5953 MOVQ n+40(FP), SI 5954 XORQ DI, DI 5955 XORQ R8, R8 5956 JMP check_limit 5957 PCALIGN $0x08 5958 NOP 5959 NOP 5960 5961 loop: 5962 MOVSS (AX)(DI*4), X1 5963 MULSS X0, X1 5964 ADDSS (DX)(R8*4), X1 5965 MOVSS X1, (DX)(R8*4) 5966 DECQ SI 5967 ADDQ CX, DI 5968 ADDQ BX, R8 5969 5970 check_limit: 5971 CMPQ SI, $0x00 5972 JHI loop 5973 RET 5974 5975 // func AmdAxpyUnsafeX_V4A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 5976 // Requires: SSE 5977 TEXT ·AmdAxpyUnsafeX_V4A10(SB), NOSPLIT, $0-48 5978 MOVSS alpha+0(FP), X0 5979 MOVQ xs+8(FP), AX 5980 MOVQ incx+16(FP), CX 5981 MOVQ ys+24(FP), DX 5982 MOVQ incy+32(FP), BX 5983 MOVQ n+40(FP), SI 5984 XORQ DI, DI 5985 XORQ R8, R8 5986 JMP check_limit 5987 PCALIGN $0x08 5988 NOP 5989 NOP 5990 5991 loop: 5992 MOVSS (AX)(DI*4), X1 5993 MULSS X0, X1 5994 ADDSS (DX)(R8*4), X1 5995 MOVSS X1, (DX)(R8*4) 5996 DECQ SI 5997 ADDQ CX, DI 5998 ADDQ BX, R8 5999 6000 check_limit: 6001 CMPQ SI, $0x00 6002 JHI loop 6003 RET 6004 6005 // func AmdAxpyUnsafeX_V5A10(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6006 // Requires: SSE 6007 TEXT ·AmdAxpyUnsafeX_V5A10(SB), NOSPLIT, $0-48 6008 MOVSS alpha+0(FP), X0 6009 MOVQ xs+8(FP), AX 6010 MOVQ incx+16(FP), CX 6011 
MOVQ ys+24(FP), DX 6012 MOVQ incy+32(FP), BX 6013 MOVQ n+40(FP), SI 6014 XORQ DI, DI 6015 XORQ R8, R8 6016 JMP check_limit 6017 PCALIGN $0x08 6018 NOP 6019 NOP 6020 6021 loop: 6022 MOVSS (AX)(DI*4), X1 6023 MULSS X0, X1 6024 ADDSS (DX)(R8*4), X1 6025 MOVSS X1, (DX)(R8*4) 6026 DECQ SI 6027 ADDQ CX, DI 6028 ADDQ BX, R8 6029 6030 check_limit: 6031 CMPQ SI, $0x00 6032 JHI loop 6033 RET 6034 6035 // func AmdAxpyUnsafeX_V0A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6036 // Requires: SSE 6037 TEXT ·AmdAxpyUnsafeX_V0A11(SB), NOSPLIT, $0-48 6038 MOVSS alpha+0(FP), X0 6039 MOVQ xs+8(FP), AX 6040 MOVQ incx+16(FP), CX 6041 MOVQ ys+24(FP), DX 6042 MOVQ incy+32(FP), BX 6043 MOVQ n+40(FP), SI 6044 XORQ DI, DI 6045 XORQ R8, R8 6046 JMP check_limit 6047 PCALIGN $0x08 6048 NOP 6049 NOP 6050 NOP 6051 6052 loop: 6053 MOVSS (AX)(DI*4), X1 6054 MULSS X0, X1 6055 ADDSS (DX)(R8*4), X1 6056 MOVSS X1, (DX)(R8*4) 6057 DECQ SI 6058 ADDQ CX, DI 6059 ADDQ BX, R8 6060 6061 check_limit: 6062 CMPQ SI, $0x00 6063 JHI loop 6064 RET 6065 6066 // func AmdAxpyUnsafeX_V1A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6067 // Requires: SSE 6068 TEXT ·AmdAxpyUnsafeX_V1A11(SB), NOSPLIT, $0-48 6069 MOVSS alpha+0(FP), X0 6070 MOVQ xs+8(FP), AX 6071 MOVQ incx+16(FP), CX 6072 MOVQ ys+24(FP), DX 6073 MOVQ incy+32(FP), BX 6074 MOVQ n+40(FP), SI 6075 XORQ DI, DI 6076 XORQ R8, R8 6077 JMP check_limit 6078 PCALIGN $0x08 6079 NOP 6080 NOP 6081 NOP 6082 6083 loop: 6084 MOVSS (AX)(DI*4), X1 6085 MULSS X0, X1 6086 ADDSS (DX)(R8*4), X1 6087 MOVSS X1, (DX)(R8*4) 6088 DECQ SI 6089 ADDQ CX, DI 6090 ADDQ BX, R8 6091 6092 check_limit: 6093 CMPQ SI, $0x00 6094 JHI loop 6095 RET 6096 6097 // func AmdAxpyUnsafeX_V2A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6098 // Requires: SSE 6099 TEXT ·AmdAxpyUnsafeX_V2A11(SB), NOSPLIT, $0-48 6100 MOVSS alpha+0(FP), X0 6101 MOVQ xs+8(FP), AX 6102 MOVQ incx+16(FP), CX 6103 MOVQ ys+24(FP), DX 6104 MOVQ incy+32(FP), BX 6105 MOVQ n+40(FP), SI 6106 XORQ DI, DI 6107 XORQ R8, R8 6108 JMP check_limit 6109 PCALIGN $0x08 6110 NOP 6111 NOP 6112 NOP 6113 6114 loop: 6115 MOVSS (AX)(DI*4), X1 6116 MULSS X0, X1 6117 ADDSS (DX)(R8*4), X1 6118 MOVSS X1, (DX)(R8*4) 6119 DECQ SI 6120 ADDQ CX, DI 6121 ADDQ BX, R8 6122 6123 check_limit: 6124 CMPQ SI, $0x00 6125 JHI loop 6126 RET 6127 6128 // func AmdAxpyUnsafeX_V3A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6129 // Requires: SSE 6130 TEXT ·AmdAxpyUnsafeX_V3A11(SB), NOSPLIT, $0-48 6131 MOVSS alpha+0(FP), X0 6132 MOVQ xs+8(FP), AX 6133 MOVQ incx+16(FP), CX 6134 MOVQ ys+24(FP), DX 6135 MOVQ incy+32(FP), BX 6136 MOVQ n+40(FP), SI 6137 XORQ DI, DI 6138 XORQ R8, R8 6139 JMP check_limit 6140 PCALIGN $0x08 6141 NOP 6142 NOP 6143 NOP 6144 6145 loop: 6146 MOVSS (AX)(DI*4), X1 6147 MULSS X0, X1 6148 ADDSS (DX)(R8*4), X1 6149 MOVSS X1, (DX)(R8*4) 6150 DECQ SI 6151 ADDQ CX, DI 6152 ADDQ BX, R8 6153 6154 check_limit: 6155 CMPQ SI, $0x00 6156 JHI loop 6157 RET 6158 6159 // func AmdAxpyUnsafeX_V4A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6160 // Requires: SSE 6161 TEXT ·AmdAxpyUnsafeX_V4A11(SB), NOSPLIT, $0-48 6162 MOVSS alpha+0(FP), X0 6163 MOVQ xs+8(FP), AX 6164 MOVQ incx+16(FP), CX 6165 MOVQ ys+24(FP), DX 6166 MOVQ incy+32(FP), BX 6167 MOVQ n+40(FP), SI 6168 XORQ DI, DI 6169 XORQ R8, R8 6170 JMP check_limit 6171 PCALIGN $0x08 6172 NOP 6173 NOP 6174 NOP 6175 6176 loop: 6177 MOVSS (AX)(DI*4), X1 6178 MULSS 
X0, X1 6179 ADDSS (DX)(R8*4), X1 6180 MOVSS X1, (DX)(R8*4) 6181 DECQ SI 6182 ADDQ CX, DI 6183 ADDQ BX, R8 6184 6185 check_limit: 6186 CMPQ SI, $0x00 6187 JHI loop 6188 RET 6189 6190 // func AmdAxpyUnsafeX_V5A11(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6191 // Requires: SSE 6192 TEXT ·AmdAxpyUnsafeX_V5A11(SB), NOSPLIT, $0-48 6193 MOVSS alpha+0(FP), X0 6194 MOVQ xs+8(FP), AX 6195 MOVQ incx+16(FP), CX 6196 MOVQ ys+24(FP), DX 6197 MOVQ incy+32(FP), BX 6198 MOVQ n+40(FP), SI 6199 XORQ DI, DI 6200 XORQ R8, R8 6201 JMP check_limit 6202 PCALIGN $0x08 6203 NOP 6204 NOP 6205 NOP 6206 6207 loop: 6208 MOVSS (AX)(DI*4), X1 6209 MULSS X0, X1 6210 ADDSS (DX)(R8*4), X1 6211 MOVSS X1, (DX)(R8*4) 6212 DECQ SI 6213 ADDQ CX, DI 6214 ADDQ BX, R8 6215 6216 check_limit: 6217 CMPQ SI, $0x00 6218 JHI loop 6219 RET 6220 6221 // func AmdAxpyUnsafeX_V0A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6222 // Requires: SSE 6223 TEXT ·AmdAxpyUnsafeX_V0A12(SB), NOSPLIT, $0-48 6224 MOVSS alpha+0(FP), X0 6225 MOVQ xs+8(FP), AX 6226 MOVQ incx+16(FP), CX 6227 MOVQ ys+24(FP), DX 6228 MOVQ incy+32(FP), BX 6229 MOVQ n+40(FP), SI 6230 XORQ DI, DI 6231 XORQ R8, R8 6232 JMP check_limit 6233 PCALIGN $0x08 6234 NOP 6235 NOP 6236 NOP 6237 NOP 6238 6239 loop: 6240 MOVSS (AX)(DI*4), X1 6241 MULSS X0, X1 6242 ADDSS (DX)(R8*4), X1 6243 MOVSS X1, (DX)(R8*4) 6244 DECQ SI 6245 ADDQ CX, DI 6246 ADDQ BX, R8 6247 6248 check_limit: 6249 CMPQ SI, $0x00 6250 JHI loop 6251 RET 6252 6253 // func AmdAxpyUnsafeX_V1A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6254 // Requires: SSE 6255 TEXT ·AmdAxpyUnsafeX_V1A12(SB), NOSPLIT, $0-48 6256 MOVSS alpha+0(FP), X0 6257 MOVQ xs+8(FP), AX 6258 MOVQ incx+16(FP), CX 6259 MOVQ ys+24(FP), DX 6260 MOVQ incy+32(FP), BX 6261 MOVQ n+40(FP), SI 6262 XORQ DI, DI 6263 XORQ R8, R8 6264 JMP check_limit 6265 PCALIGN $0x08 6266 NOP 6267 NOP 6268 NOP 6269 NOP 6270 6271 loop: 6272 MOVSS (AX)(DI*4), X1 6273 MULSS X0, X1 6274 ADDSS (DX)(R8*4), X1 6275 MOVSS X1, (DX)(R8*4) 6276 DECQ SI 6277 ADDQ CX, DI 6278 ADDQ BX, R8 6279 6280 check_limit: 6281 CMPQ SI, $0x00 6282 JHI loop 6283 RET 6284 6285 // func AmdAxpyUnsafeX_V2A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6286 // Requires: SSE 6287 TEXT ·AmdAxpyUnsafeX_V2A12(SB), NOSPLIT, $0-48 6288 MOVSS alpha+0(FP), X0 6289 MOVQ xs+8(FP), AX 6290 MOVQ incx+16(FP), CX 6291 MOVQ ys+24(FP), DX 6292 MOVQ incy+32(FP), BX 6293 MOVQ n+40(FP), SI 6294 XORQ DI, DI 6295 XORQ R8, R8 6296 JMP check_limit 6297 PCALIGN $0x08 6298 NOP 6299 NOP 6300 NOP 6301 NOP 6302 6303 loop: 6304 MOVSS (AX)(DI*4), X1 6305 MULSS X0, X1 6306 ADDSS (DX)(R8*4), X1 6307 MOVSS X1, (DX)(R8*4) 6308 DECQ SI 6309 ADDQ CX, DI 6310 ADDQ BX, R8 6311 6312 check_limit: 6313 CMPQ SI, $0x00 6314 JHI loop 6315 RET 6316 6317 // func AmdAxpyUnsafeX_V3A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6318 // Requires: SSE 6319 TEXT ·AmdAxpyUnsafeX_V3A12(SB), NOSPLIT, $0-48 6320 MOVSS alpha+0(FP), X0 6321 MOVQ xs+8(FP), AX 6322 MOVQ incx+16(FP), CX 6323 MOVQ ys+24(FP), DX 6324 MOVQ incy+32(FP), BX 6325 MOVQ n+40(FP), SI 6326 XORQ DI, DI 6327 XORQ R8, R8 6328 JMP check_limit 6329 PCALIGN $0x08 6330 NOP 6331 NOP 6332 NOP 6333 NOP 6334 6335 loop: 6336 MOVSS (AX)(DI*4), X1 6337 MULSS X0, X1 6338 ADDSS (DX)(R8*4), X1 6339 MOVSS X1, (DX)(R8*4) 6340 DECQ SI 6341 ADDQ CX, DI 6342 ADDQ BX, R8 6343 6344 check_limit: 6345 CMPQ SI, $0x00 6346 JHI loop 6347 RET 6348 
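// Note (editor) on the variants that follow: the V0–V5 copies of each routine appear
// identical apart from their names, the A<n> suffix only changes how much PCALIGN/NOP
// padding precedes the loop, and the *R4 routines further below unroll the body four times
// before falling back to the scalar remainder loop. A rough Go rendering of that unroll-by-4
// control flow (illustrative only, reusing the hypothetical package and unsafe import from
// the sketch above; axpyUnrolled4 and step are assumed names, not part of the generated code):

// axpyUnrolled4 follows the *R4 shape: blocks of four updates while more than four
// elements remain, then a scalar tail loop for whatever is left.
func axpyUnrolled4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) {
	var i, j uintptr
	step := func() { // one MOVSS/MULSS/ADDSS/MOVSS round trip
		px := (*float32)(unsafe.Add(unsafe.Pointer(xs), i*4))
		py := (*float32)(unsafe.Add(unsafe.Pointer(ys), j*4))
		*py += alpha * *px
		i += incx // ADDQ CX, DI
		j += incy // ADDQ BX, R8
	}
	for ; n > 4; n -= 4 { // check_limit_unroll: CMPQ SI, $0x04; JHI loop_unroll
		step()
		step()
		step()
		step()
	}
	for ; n > 0; n-- { // check_limit: CMPQ SI, $0x00; JHI loop
		step()
	}
}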
6349 // func AmdAxpyUnsafeX_V4A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6350 // Requires: SSE 6351 TEXT ·AmdAxpyUnsafeX_V4A12(SB), NOSPLIT, $0-48 6352 MOVSS alpha+0(FP), X0 6353 MOVQ xs+8(FP), AX 6354 MOVQ incx+16(FP), CX 6355 MOVQ ys+24(FP), DX 6356 MOVQ incy+32(FP), BX 6357 MOVQ n+40(FP), SI 6358 XORQ DI, DI 6359 XORQ R8, R8 6360 JMP check_limit 6361 PCALIGN $0x08 6362 NOP 6363 NOP 6364 NOP 6365 NOP 6366 6367 loop: 6368 MOVSS (AX)(DI*4), X1 6369 MULSS X0, X1 6370 ADDSS (DX)(R8*4), X1 6371 MOVSS X1, (DX)(R8*4) 6372 DECQ SI 6373 ADDQ CX, DI 6374 ADDQ BX, R8 6375 6376 check_limit: 6377 CMPQ SI, $0x00 6378 JHI loop 6379 RET 6380 6381 // func AmdAxpyUnsafeX_V5A12(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6382 // Requires: SSE 6383 TEXT ·AmdAxpyUnsafeX_V5A12(SB), NOSPLIT, $0-48 6384 MOVSS alpha+0(FP), X0 6385 MOVQ xs+8(FP), AX 6386 MOVQ incx+16(FP), CX 6387 MOVQ ys+24(FP), DX 6388 MOVQ incy+32(FP), BX 6389 MOVQ n+40(FP), SI 6390 XORQ DI, DI 6391 XORQ R8, R8 6392 JMP check_limit 6393 PCALIGN $0x08 6394 NOP 6395 NOP 6396 NOP 6397 NOP 6398 6399 loop: 6400 MOVSS (AX)(DI*4), X1 6401 MULSS X0, X1 6402 ADDSS (DX)(R8*4), X1 6403 MOVSS X1, (DX)(R8*4) 6404 DECQ SI 6405 ADDQ CX, DI 6406 ADDQ BX, R8 6407 6408 check_limit: 6409 CMPQ SI, $0x00 6410 JHI loop 6411 RET 6412 6413 // func AmdAxpyUnsafeX_V0A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6414 // Requires: SSE 6415 TEXT ·AmdAxpyUnsafeX_V0A13(SB), NOSPLIT, $0-48 6416 MOVSS alpha+0(FP), X0 6417 MOVQ xs+8(FP), AX 6418 MOVQ incx+16(FP), CX 6419 MOVQ ys+24(FP), DX 6420 MOVQ incy+32(FP), BX 6421 MOVQ n+40(FP), SI 6422 XORQ DI, DI 6423 XORQ R8, R8 6424 JMP check_limit 6425 PCALIGN $0x08 6426 NOP 6427 NOP 6428 NOP 6429 NOP 6430 NOP 6431 6432 loop: 6433 MOVSS (AX)(DI*4), X1 6434 MULSS X0, X1 6435 ADDSS (DX)(R8*4), X1 6436 MOVSS X1, (DX)(R8*4) 6437 DECQ SI 6438 ADDQ CX, DI 6439 ADDQ BX, R8 6440 6441 check_limit: 6442 CMPQ SI, $0x00 6443 JHI loop 6444 RET 6445 6446 // func AmdAxpyUnsafeX_V1A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6447 // Requires: SSE 6448 TEXT ·AmdAxpyUnsafeX_V1A13(SB), NOSPLIT, $0-48 6449 MOVSS alpha+0(FP), X0 6450 MOVQ xs+8(FP), AX 6451 MOVQ incx+16(FP), CX 6452 MOVQ ys+24(FP), DX 6453 MOVQ incy+32(FP), BX 6454 MOVQ n+40(FP), SI 6455 XORQ DI, DI 6456 XORQ R8, R8 6457 JMP check_limit 6458 PCALIGN $0x08 6459 NOP 6460 NOP 6461 NOP 6462 NOP 6463 NOP 6464 6465 loop: 6466 MOVSS (AX)(DI*4), X1 6467 MULSS X0, X1 6468 ADDSS (DX)(R8*4), X1 6469 MOVSS X1, (DX)(R8*4) 6470 DECQ SI 6471 ADDQ CX, DI 6472 ADDQ BX, R8 6473 6474 check_limit: 6475 CMPQ SI, $0x00 6476 JHI loop 6477 RET 6478 6479 // func AmdAxpyUnsafeX_V2A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6480 // Requires: SSE 6481 TEXT ·AmdAxpyUnsafeX_V2A13(SB), NOSPLIT, $0-48 6482 MOVSS alpha+0(FP), X0 6483 MOVQ xs+8(FP), AX 6484 MOVQ incx+16(FP), CX 6485 MOVQ ys+24(FP), DX 6486 MOVQ incy+32(FP), BX 6487 MOVQ n+40(FP), SI 6488 XORQ DI, DI 6489 XORQ R8, R8 6490 JMP check_limit 6491 PCALIGN $0x08 6492 NOP 6493 NOP 6494 NOP 6495 NOP 6496 NOP 6497 6498 loop: 6499 MOVSS (AX)(DI*4), X1 6500 MULSS X0, X1 6501 ADDSS (DX)(R8*4), X1 6502 MOVSS X1, (DX)(R8*4) 6503 DECQ SI 6504 ADDQ CX, DI 6505 ADDQ BX, R8 6506 6507 check_limit: 6508 CMPQ SI, $0x00 6509 JHI loop 6510 RET 6511 6512 // func AmdAxpyUnsafeX_V3A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6513 // Requires: SSE 6514 
TEXT ·AmdAxpyUnsafeX_V3A13(SB), NOSPLIT, $0-48 6515 MOVSS alpha+0(FP), X0 6516 MOVQ xs+8(FP), AX 6517 MOVQ incx+16(FP), CX 6518 MOVQ ys+24(FP), DX 6519 MOVQ incy+32(FP), BX 6520 MOVQ n+40(FP), SI 6521 XORQ DI, DI 6522 XORQ R8, R8 6523 JMP check_limit 6524 PCALIGN $0x08 6525 NOP 6526 NOP 6527 NOP 6528 NOP 6529 NOP 6530 6531 loop: 6532 MOVSS (AX)(DI*4), X1 6533 MULSS X0, X1 6534 ADDSS (DX)(R8*4), X1 6535 MOVSS X1, (DX)(R8*4) 6536 DECQ SI 6537 ADDQ CX, DI 6538 ADDQ BX, R8 6539 6540 check_limit: 6541 CMPQ SI, $0x00 6542 JHI loop 6543 RET 6544 6545 // func AmdAxpyUnsafeX_V4A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6546 // Requires: SSE 6547 TEXT ·AmdAxpyUnsafeX_V4A13(SB), NOSPLIT, $0-48 6548 MOVSS alpha+0(FP), X0 6549 MOVQ xs+8(FP), AX 6550 MOVQ incx+16(FP), CX 6551 MOVQ ys+24(FP), DX 6552 MOVQ incy+32(FP), BX 6553 MOVQ n+40(FP), SI 6554 XORQ DI, DI 6555 XORQ R8, R8 6556 JMP check_limit 6557 PCALIGN $0x08 6558 NOP 6559 NOP 6560 NOP 6561 NOP 6562 NOP 6563 6564 loop: 6565 MOVSS (AX)(DI*4), X1 6566 MULSS X0, X1 6567 ADDSS (DX)(R8*4), X1 6568 MOVSS X1, (DX)(R8*4) 6569 DECQ SI 6570 ADDQ CX, DI 6571 ADDQ BX, R8 6572 6573 check_limit: 6574 CMPQ SI, $0x00 6575 JHI loop 6576 RET 6577 6578 // func AmdAxpyUnsafeX_V5A13(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6579 // Requires: SSE 6580 TEXT ·AmdAxpyUnsafeX_V5A13(SB), NOSPLIT, $0-48 6581 MOVSS alpha+0(FP), X0 6582 MOVQ xs+8(FP), AX 6583 MOVQ incx+16(FP), CX 6584 MOVQ ys+24(FP), DX 6585 MOVQ incy+32(FP), BX 6586 MOVQ n+40(FP), SI 6587 XORQ DI, DI 6588 XORQ R8, R8 6589 JMP check_limit 6590 PCALIGN $0x08 6591 NOP 6592 NOP 6593 NOP 6594 NOP 6595 NOP 6596 6597 loop: 6598 MOVSS (AX)(DI*4), X1 6599 MULSS X0, X1 6600 ADDSS (DX)(R8*4), X1 6601 MOVSS X1, (DX)(R8*4) 6602 DECQ SI 6603 ADDQ CX, DI 6604 ADDQ BX, R8 6605 6606 check_limit: 6607 CMPQ SI, $0x00 6608 JHI loop 6609 RET 6610 6611 // func AmdAxpyUnsafeX_V0A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6612 // Requires: SSE 6613 TEXT ·AmdAxpyUnsafeX_V0A14(SB), NOSPLIT, $0-48 6614 MOVSS alpha+0(FP), X0 6615 MOVQ xs+8(FP), AX 6616 MOVQ incx+16(FP), CX 6617 MOVQ ys+24(FP), DX 6618 MOVQ incy+32(FP), BX 6619 MOVQ n+40(FP), SI 6620 XORQ DI, DI 6621 XORQ R8, R8 6622 JMP check_limit 6623 PCALIGN $0x08 6624 NOP 6625 NOP 6626 NOP 6627 NOP 6628 NOP 6629 NOP 6630 6631 loop: 6632 MOVSS (AX)(DI*4), X1 6633 MULSS X0, X1 6634 ADDSS (DX)(R8*4), X1 6635 MOVSS X1, (DX)(R8*4) 6636 DECQ SI 6637 ADDQ CX, DI 6638 ADDQ BX, R8 6639 6640 check_limit: 6641 CMPQ SI, $0x00 6642 JHI loop 6643 RET 6644 6645 // func AmdAxpyUnsafeX_V1A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6646 // Requires: SSE 6647 TEXT ·AmdAxpyUnsafeX_V1A14(SB), NOSPLIT, $0-48 6648 MOVSS alpha+0(FP), X0 6649 MOVQ xs+8(FP), AX 6650 MOVQ incx+16(FP), CX 6651 MOVQ ys+24(FP), DX 6652 MOVQ incy+32(FP), BX 6653 MOVQ n+40(FP), SI 6654 XORQ DI, DI 6655 XORQ R8, R8 6656 JMP check_limit 6657 PCALIGN $0x08 6658 NOP 6659 NOP 6660 NOP 6661 NOP 6662 NOP 6663 NOP 6664 6665 loop: 6666 MOVSS (AX)(DI*4), X1 6667 MULSS X0, X1 6668 ADDSS (DX)(R8*4), X1 6669 MOVSS X1, (DX)(R8*4) 6670 DECQ SI 6671 ADDQ CX, DI 6672 ADDQ BX, R8 6673 6674 check_limit: 6675 CMPQ SI, $0x00 6676 JHI loop 6677 RET 6678 6679 // func AmdAxpyUnsafeX_V2A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6680 // Requires: SSE 6681 TEXT ·AmdAxpyUnsafeX_V2A14(SB), NOSPLIT, $0-48 6682 MOVSS alpha+0(FP), X0 6683 MOVQ xs+8(FP), AX 6684 MOVQ 
incx+16(FP), CX 6685 MOVQ ys+24(FP), DX 6686 MOVQ incy+32(FP), BX 6687 MOVQ n+40(FP), SI 6688 XORQ DI, DI 6689 XORQ R8, R8 6690 JMP check_limit 6691 PCALIGN $0x08 6692 NOP 6693 NOP 6694 NOP 6695 NOP 6696 NOP 6697 NOP 6698 6699 loop: 6700 MOVSS (AX)(DI*4), X1 6701 MULSS X0, X1 6702 ADDSS (DX)(R8*4), X1 6703 MOVSS X1, (DX)(R8*4) 6704 DECQ SI 6705 ADDQ CX, DI 6706 ADDQ BX, R8 6707 6708 check_limit: 6709 CMPQ SI, $0x00 6710 JHI loop 6711 RET 6712 6713 // func AmdAxpyUnsafeX_V3A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6714 // Requires: SSE 6715 TEXT ·AmdAxpyUnsafeX_V3A14(SB), NOSPLIT, $0-48 6716 MOVSS alpha+0(FP), X0 6717 MOVQ xs+8(FP), AX 6718 MOVQ incx+16(FP), CX 6719 MOVQ ys+24(FP), DX 6720 MOVQ incy+32(FP), BX 6721 MOVQ n+40(FP), SI 6722 XORQ DI, DI 6723 XORQ R8, R8 6724 JMP check_limit 6725 PCALIGN $0x08 6726 NOP 6727 NOP 6728 NOP 6729 NOP 6730 NOP 6731 NOP 6732 6733 loop: 6734 MOVSS (AX)(DI*4), X1 6735 MULSS X0, X1 6736 ADDSS (DX)(R8*4), X1 6737 MOVSS X1, (DX)(R8*4) 6738 DECQ SI 6739 ADDQ CX, DI 6740 ADDQ BX, R8 6741 6742 check_limit: 6743 CMPQ SI, $0x00 6744 JHI loop 6745 RET 6746 6747 // func AmdAxpyUnsafeX_V4A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6748 // Requires: SSE 6749 TEXT ·AmdAxpyUnsafeX_V4A14(SB), NOSPLIT, $0-48 6750 MOVSS alpha+0(FP), X0 6751 MOVQ xs+8(FP), AX 6752 MOVQ incx+16(FP), CX 6753 MOVQ ys+24(FP), DX 6754 MOVQ incy+32(FP), BX 6755 MOVQ n+40(FP), SI 6756 XORQ DI, DI 6757 XORQ R8, R8 6758 JMP check_limit 6759 PCALIGN $0x08 6760 NOP 6761 NOP 6762 NOP 6763 NOP 6764 NOP 6765 NOP 6766 6767 loop: 6768 MOVSS (AX)(DI*4), X1 6769 MULSS X0, X1 6770 ADDSS (DX)(R8*4), X1 6771 MOVSS X1, (DX)(R8*4) 6772 DECQ SI 6773 ADDQ CX, DI 6774 ADDQ BX, R8 6775 6776 check_limit: 6777 CMPQ SI, $0x00 6778 JHI loop 6779 RET 6780 6781 // func AmdAxpyUnsafeX_V5A14(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6782 // Requires: SSE 6783 TEXT ·AmdAxpyUnsafeX_V5A14(SB), NOSPLIT, $0-48 6784 MOVSS alpha+0(FP), X0 6785 MOVQ xs+8(FP), AX 6786 MOVQ incx+16(FP), CX 6787 MOVQ ys+24(FP), DX 6788 MOVQ incy+32(FP), BX 6789 MOVQ n+40(FP), SI 6790 XORQ DI, DI 6791 XORQ R8, R8 6792 JMP check_limit 6793 PCALIGN $0x08 6794 NOP 6795 NOP 6796 NOP 6797 NOP 6798 NOP 6799 NOP 6800 6801 loop: 6802 MOVSS (AX)(DI*4), X1 6803 MULSS X0, X1 6804 ADDSS (DX)(R8*4), X1 6805 MOVSS X1, (DX)(R8*4) 6806 DECQ SI 6807 ADDQ CX, DI 6808 ADDQ BX, R8 6809 6810 check_limit: 6811 CMPQ SI, $0x00 6812 JHI loop 6813 RET 6814 6815 // func AmdAxpyUnsafeX_V0A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6816 // Requires: SSE 6817 TEXT ·AmdAxpyUnsafeX_V0A15(SB), NOSPLIT, $0-48 6818 MOVSS alpha+0(FP), X0 6819 MOVQ xs+8(FP), AX 6820 MOVQ incx+16(FP), CX 6821 MOVQ ys+24(FP), DX 6822 MOVQ incy+32(FP), BX 6823 MOVQ n+40(FP), SI 6824 XORQ DI, DI 6825 XORQ R8, R8 6826 JMP check_limit 6827 PCALIGN $0x08 6828 NOP 6829 NOP 6830 NOP 6831 NOP 6832 NOP 6833 NOP 6834 NOP 6835 6836 loop: 6837 MOVSS (AX)(DI*4), X1 6838 MULSS X0, X1 6839 ADDSS (DX)(R8*4), X1 6840 MOVSS X1, (DX)(R8*4) 6841 DECQ SI 6842 ADDQ CX, DI 6843 ADDQ BX, R8 6844 6845 check_limit: 6846 CMPQ SI, $0x00 6847 JHI loop 6848 RET 6849 6850 // func AmdAxpyUnsafeX_V1A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6851 // Requires: SSE 6852 TEXT ·AmdAxpyUnsafeX_V1A15(SB), NOSPLIT, $0-48 6853 MOVSS alpha+0(FP), X0 6854 MOVQ xs+8(FP), AX 6855 MOVQ incx+16(FP), CX 6856 MOVQ ys+24(FP), DX 6857 MOVQ incy+32(FP), BX 6858 
MOVQ n+40(FP), SI 6859 XORQ DI, DI 6860 XORQ R8, R8 6861 JMP check_limit 6862 PCALIGN $0x08 6863 NOP 6864 NOP 6865 NOP 6866 NOP 6867 NOP 6868 NOP 6869 NOP 6870 6871 loop: 6872 MOVSS (AX)(DI*4), X1 6873 MULSS X0, X1 6874 ADDSS (DX)(R8*4), X1 6875 MOVSS X1, (DX)(R8*4) 6876 DECQ SI 6877 ADDQ CX, DI 6878 ADDQ BX, R8 6879 6880 check_limit: 6881 CMPQ SI, $0x00 6882 JHI loop 6883 RET 6884 6885 // func AmdAxpyUnsafeX_V2A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6886 // Requires: SSE 6887 TEXT ·AmdAxpyUnsafeX_V2A15(SB), NOSPLIT, $0-48 6888 MOVSS alpha+0(FP), X0 6889 MOVQ xs+8(FP), AX 6890 MOVQ incx+16(FP), CX 6891 MOVQ ys+24(FP), DX 6892 MOVQ incy+32(FP), BX 6893 MOVQ n+40(FP), SI 6894 XORQ DI, DI 6895 XORQ R8, R8 6896 JMP check_limit 6897 PCALIGN $0x08 6898 NOP 6899 NOP 6900 NOP 6901 NOP 6902 NOP 6903 NOP 6904 NOP 6905 6906 loop: 6907 MOVSS (AX)(DI*4), X1 6908 MULSS X0, X1 6909 ADDSS (DX)(R8*4), X1 6910 MOVSS X1, (DX)(R8*4) 6911 DECQ SI 6912 ADDQ CX, DI 6913 ADDQ BX, R8 6914 6915 check_limit: 6916 CMPQ SI, $0x00 6917 JHI loop 6918 RET 6919 6920 // func AmdAxpyUnsafeX_V3A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6921 // Requires: SSE 6922 TEXT ·AmdAxpyUnsafeX_V3A15(SB), NOSPLIT, $0-48 6923 MOVSS alpha+0(FP), X0 6924 MOVQ xs+8(FP), AX 6925 MOVQ incx+16(FP), CX 6926 MOVQ ys+24(FP), DX 6927 MOVQ incy+32(FP), BX 6928 MOVQ n+40(FP), SI 6929 XORQ DI, DI 6930 XORQ R8, R8 6931 JMP check_limit 6932 PCALIGN $0x08 6933 NOP 6934 NOP 6935 NOP 6936 NOP 6937 NOP 6938 NOP 6939 NOP 6940 6941 loop: 6942 MOVSS (AX)(DI*4), X1 6943 MULSS X0, X1 6944 ADDSS (DX)(R8*4), X1 6945 MOVSS X1, (DX)(R8*4) 6946 DECQ SI 6947 ADDQ CX, DI 6948 ADDQ BX, R8 6949 6950 check_limit: 6951 CMPQ SI, $0x00 6952 JHI loop 6953 RET 6954 6955 // func AmdAxpyUnsafeX_V4A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6956 // Requires: SSE 6957 TEXT ·AmdAxpyUnsafeX_V4A15(SB), NOSPLIT, $0-48 6958 MOVSS alpha+0(FP), X0 6959 MOVQ xs+8(FP), AX 6960 MOVQ incx+16(FP), CX 6961 MOVQ ys+24(FP), DX 6962 MOVQ incy+32(FP), BX 6963 MOVQ n+40(FP), SI 6964 XORQ DI, DI 6965 XORQ R8, R8 6966 JMP check_limit 6967 PCALIGN $0x08 6968 NOP 6969 NOP 6970 NOP 6971 NOP 6972 NOP 6973 NOP 6974 NOP 6975 6976 loop: 6977 MOVSS (AX)(DI*4), X1 6978 MULSS X0, X1 6979 ADDSS (DX)(R8*4), X1 6980 MOVSS X1, (DX)(R8*4) 6981 DECQ SI 6982 ADDQ CX, DI 6983 ADDQ BX, R8 6984 6985 check_limit: 6986 CMPQ SI, $0x00 6987 JHI loop 6988 RET 6989 6990 // func AmdAxpyUnsafeX_V5A15(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 6991 // Requires: SSE 6992 TEXT ·AmdAxpyUnsafeX_V5A15(SB), NOSPLIT, $0-48 6993 MOVSS alpha+0(FP), X0 6994 MOVQ xs+8(FP), AX 6995 MOVQ incx+16(FP), CX 6996 MOVQ ys+24(FP), DX 6997 MOVQ incy+32(FP), BX 6998 MOVQ n+40(FP), SI 6999 XORQ DI, DI 7000 XORQ R8, R8 7001 JMP check_limit 7002 PCALIGN $0x08 7003 NOP 7004 NOP 7005 NOP 7006 NOP 7007 NOP 7008 NOP 7009 NOP 7010 7011 loop: 7012 MOVSS (AX)(DI*4), X1 7013 MULSS X0, X1 7014 ADDSS (DX)(R8*4), X1 7015 MOVSS X1, (DX)(R8*4) 7016 DECQ SI 7017 ADDQ CX, DI 7018 ADDQ BX, R8 7019 7020 check_limit: 7021 CMPQ SI, $0x00 7022 JHI loop 7023 RET 7024 7025 // func AmdAxpyUnsafeX_V0A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7026 // Requires: SSE 7027 TEXT ·AmdAxpyUnsafeX_V0A16(SB), NOSPLIT, $0-48 7028 MOVSS alpha+0(FP), X0 7029 MOVQ xs+8(FP), AX 7030 MOVQ incx+16(FP), CX 7031 MOVQ ys+24(FP), DX 7032 MOVQ incy+32(FP), BX 7033 MOVQ n+40(FP), SI 7034 XORQ DI, DI 
7035 XORQ R8, R8 7036 JMP check_limit 7037 PCALIGN $0x10 7038 7039 loop: 7040 MOVSS (AX)(DI*4), X1 7041 MULSS X0, X1 7042 ADDSS (DX)(R8*4), X1 7043 MOVSS X1, (DX)(R8*4) 7044 DECQ SI 7045 ADDQ CX, DI 7046 ADDQ BX, R8 7047 7048 check_limit: 7049 CMPQ SI, $0x00 7050 JHI loop 7051 RET 7052 7053 // func AmdAxpyUnsafeX_V1A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7054 // Requires: SSE 7055 TEXT ·AmdAxpyUnsafeX_V1A16(SB), NOSPLIT, $0-48 7056 MOVSS alpha+0(FP), X0 7057 MOVQ xs+8(FP), AX 7058 MOVQ incx+16(FP), CX 7059 MOVQ ys+24(FP), DX 7060 MOVQ incy+32(FP), BX 7061 MOVQ n+40(FP), SI 7062 XORQ DI, DI 7063 XORQ R8, R8 7064 JMP check_limit 7065 PCALIGN $0x10 7066 7067 loop: 7068 MOVSS (AX)(DI*4), X1 7069 MULSS X0, X1 7070 ADDSS (DX)(R8*4), X1 7071 MOVSS X1, (DX)(R8*4) 7072 DECQ SI 7073 ADDQ CX, DI 7074 ADDQ BX, R8 7075 7076 check_limit: 7077 CMPQ SI, $0x00 7078 JHI loop 7079 RET 7080 7081 // func AmdAxpyUnsafeX_V2A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7082 // Requires: SSE 7083 TEXT ·AmdAxpyUnsafeX_V2A16(SB), NOSPLIT, $0-48 7084 MOVSS alpha+0(FP), X0 7085 MOVQ xs+8(FP), AX 7086 MOVQ incx+16(FP), CX 7087 MOVQ ys+24(FP), DX 7088 MOVQ incy+32(FP), BX 7089 MOVQ n+40(FP), SI 7090 XORQ DI, DI 7091 XORQ R8, R8 7092 JMP check_limit 7093 PCALIGN $0x10 7094 7095 loop: 7096 MOVSS (AX)(DI*4), X1 7097 MULSS X0, X1 7098 ADDSS (DX)(R8*4), X1 7099 MOVSS X1, (DX)(R8*4) 7100 DECQ SI 7101 ADDQ CX, DI 7102 ADDQ BX, R8 7103 7104 check_limit: 7105 CMPQ SI, $0x00 7106 JHI loop 7107 RET 7108 7109 // func AmdAxpyUnsafeX_V3A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7110 // Requires: SSE 7111 TEXT ·AmdAxpyUnsafeX_V3A16(SB), NOSPLIT, $0-48 7112 MOVSS alpha+0(FP), X0 7113 MOVQ xs+8(FP), AX 7114 MOVQ incx+16(FP), CX 7115 MOVQ ys+24(FP), DX 7116 MOVQ incy+32(FP), BX 7117 MOVQ n+40(FP), SI 7118 XORQ DI, DI 7119 XORQ R8, R8 7120 JMP check_limit 7121 PCALIGN $0x10 7122 7123 loop: 7124 MOVSS (AX)(DI*4), X1 7125 MULSS X0, X1 7126 ADDSS (DX)(R8*4), X1 7127 MOVSS X1, (DX)(R8*4) 7128 DECQ SI 7129 ADDQ CX, DI 7130 ADDQ BX, R8 7131 7132 check_limit: 7133 CMPQ SI, $0x00 7134 JHI loop 7135 RET 7136 7137 // func AmdAxpyUnsafeX_V4A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7138 // Requires: SSE 7139 TEXT ·AmdAxpyUnsafeX_V4A16(SB), NOSPLIT, $0-48 7140 MOVSS alpha+0(FP), X0 7141 MOVQ xs+8(FP), AX 7142 MOVQ incx+16(FP), CX 7143 MOVQ ys+24(FP), DX 7144 MOVQ incy+32(FP), BX 7145 MOVQ n+40(FP), SI 7146 XORQ DI, DI 7147 XORQ R8, R8 7148 JMP check_limit 7149 PCALIGN $0x10 7150 7151 loop: 7152 MOVSS (AX)(DI*4), X1 7153 MULSS X0, X1 7154 ADDSS (DX)(R8*4), X1 7155 MOVSS X1, (DX)(R8*4) 7156 DECQ SI 7157 ADDQ CX, DI 7158 ADDQ BX, R8 7159 7160 check_limit: 7161 CMPQ SI, $0x00 7162 JHI loop 7163 RET 7164 7165 // func AmdAxpyUnsafeX_V5A16(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7166 // Requires: SSE 7167 TEXT ·AmdAxpyUnsafeX_V5A16(SB), NOSPLIT, $0-48 7168 MOVSS alpha+0(FP), X0 7169 MOVQ xs+8(FP), AX 7170 MOVQ incx+16(FP), CX 7171 MOVQ ys+24(FP), DX 7172 MOVQ incy+32(FP), BX 7173 MOVQ n+40(FP), SI 7174 XORQ DI, DI 7175 XORQ R8, R8 7176 JMP check_limit 7177 PCALIGN $0x10 7178 7179 loop: 7180 MOVSS (AX)(DI*4), X1 7181 MULSS X0, X1 7182 ADDSS (DX)(R8*4), X1 7183 MOVSS X1, (DX)(R8*4) 7184 DECQ SI 7185 ADDQ CX, DI 7186 ADDQ BX, R8 7187 7188 check_limit: 7189 CMPQ SI, $0x00 7190 JHI loop 7191 RET 7192 7193 // func AmdAxpyUnsafeX_V0A0R4(alpha float32, xs *float32, 
incx uintptr, ys *float32, incy uintptr, n uintptr) 7194 // Requires: SSE 7195 TEXT ·AmdAxpyUnsafeX_V0A0R4(SB), NOSPLIT, $0-48 7196 MOVSS alpha+0(FP), X0 7197 MOVQ xs+8(FP), AX 7198 MOVQ incx+16(FP), CX 7199 MOVQ ys+24(FP), DX 7200 MOVQ incy+32(FP), BX 7201 MOVQ n+40(FP), SI 7202 XORQ DI, DI 7203 XORQ R8, R8 7204 JMP check_limit_unroll 7205 7206 loop_unroll: 7207 MOVSS (AX)(DI*4), X1 7208 MULSS X0, X1 7209 ADDSS (DX)(R8*4), X1 7210 MOVSS X1, (DX)(R8*4) 7211 ADDQ CX, DI 7212 ADDQ BX, R8 7213 MOVSS (AX)(DI*4), X1 7214 MULSS X0, X1 7215 ADDSS (DX)(R8*4), X1 7216 MOVSS X1, (DX)(R8*4) 7217 ADDQ CX, DI 7218 ADDQ BX, R8 7219 MOVSS (AX)(DI*4), X1 7220 MULSS X0, X1 7221 ADDSS (DX)(R8*4), X1 7222 MOVSS X1, (DX)(R8*4) 7223 ADDQ CX, DI 7224 ADDQ BX, R8 7225 MOVSS (AX)(DI*4), X1 7226 MULSS X0, X1 7227 ADDSS (DX)(R8*4), X1 7228 MOVSS X1, (DX)(R8*4) 7229 ADDQ CX, DI 7230 ADDQ BX, R8 7231 SUBQ $0x04, SI 7232 7233 check_limit_unroll: 7234 CMPQ SI, $0x04 7235 JHI loop_unroll 7236 JMP check_limit 7237 7238 loop: 7239 MOVSS (AX)(DI*4), X1 7240 MULSS X0, X1 7241 ADDSS (DX)(R8*4), X1 7242 MOVSS X1, (DX)(R8*4) 7243 DECQ SI 7244 ADDQ CX, DI 7245 ADDQ BX, R8 7246 7247 check_limit: 7248 CMPQ SI, $0x00 7249 JHI loop 7250 RET 7251 7252 // func AmdAxpyUnsafeX_V1A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7253 // Requires: SSE 7254 TEXT ·AmdAxpyUnsafeX_V1A0R4(SB), NOSPLIT, $0-48 7255 MOVSS alpha+0(FP), X0 7256 MOVQ xs+8(FP), AX 7257 MOVQ incx+16(FP), CX 7258 MOVQ ys+24(FP), DX 7259 MOVQ incy+32(FP), BX 7260 MOVQ n+40(FP), SI 7261 XORQ DI, DI 7262 XORQ R8, R8 7263 JMP check_limit_unroll 7264 7265 loop_unroll: 7266 MOVSS (AX)(DI*4), X1 7267 MULSS X0, X1 7268 ADDSS (DX)(R8*4), X1 7269 MOVSS X1, (DX)(R8*4) 7270 ADDQ CX, DI 7271 ADDQ BX, R8 7272 MOVSS (AX)(DI*4), X1 7273 MULSS X0, X1 7274 ADDSS (DX)(R8*4), X1 7275 MOVSS X1, (DX)(R8*4) 7276 ADDQ CX, DI 7277 ADDQ BX, R8 7278 MOVSS (AX)(DI*4), X1 7279 MULSS X0, X1 7280 ADDSS (DX)(R8*4), X1 7281 MOVSS X1, (DX)(R8*4) 7282 ADDQ CX, DI 7283 ADDQ BX, R8 7284 MOVSS (AX)(DI*4), X1 7285 MULSS X0, X1 7286 ADDSS (DX)(R8*4), X1 7287 MOVSS X1, (DX)(R8*4) 7288 ADDQ CX, DI 7289 ADDQ BX, R8 7290 SUBQ $0x04, SI 7291 7292 check_limit_unroll: 7293 CMPQ SI, $0x04 7294 JHI loop_unroll 7295 JMP check_limit 7296 7297 loop: 7298 MOVSS (AX)(DI*4), X1 7299 MULSS X0, X1 7300 ADDSS (DX)(R8*4), X1 7301 MOVSS X1, (DX)(R8*4) 7302 DECQ SI 7303 ADDQ CX, DI 7304 ADDQ BX, R8 7305 7306 check_limit: 7307 CMPQ SI, $0x00 7308 JHI loop 7309 RET 7310 7311 // func AmdAxpyUnsafeX_V2A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7312 // Requires: SSE 7313 TEXT ·AmdAxpyUnsafeX_V2A0R4(SB), NOSPLIT, $0-48 7314 MOVSS alpha+0(FP), X0 7315 MOVQ xs+8(FP), AX 7316 MOVQ incx+16(FP), CX 7317 MOVQ ys+24(FP), DX 7318 MOVQ incy+32(FP), BX 7319 MOVQ n+40(FP), SI 7320 XORQ DI, DI 7321 XORQ R8, R8 7322 JMP check_limit_unroll 7323 7324 loop_unroll: 7325 MOVSS (AX)(DI*4), X1 7326 MULSS X0, X1 7327 ADDSS (DX)(R8*4), X1 7328 MOVSS X1, (DX)(R8*4) 7329 ADDQ CX, DI 7330 ADDQ BX, R8 7331 MOVSS (AX)(DI*4), X1 7332 MULSS X0, X1 7333 ADDSS (DX)(R8*4), X1 7334 MOVSS X1, (DX)(R8*4) 7335 ADDQ CX, DI 7336 ADDQ BX, R8 7337 MOVSS (AX)(DI*4), X1 7338 MULSS X0, X1 7339 ADDSS (DX)(R8*4), X1 7340 MOVSS X1, (DX)(R8*4) 7341 ADDQ CX, DI 7342 ADDQ BX, R8 7343 MOVSS (AX)(DI*4), X1 7344 MULSS X0, X1 7345 ADDSS (DX)(R8*4), X1 7346 MOVSS X1, (DX)(R8*4) 7347 ADDQ CX, DI 7348 ADDQ BX, R8 7349 SUBQ $0x04, SI 7350 7351 check_limit_unroll: 7352 CMPQ SI, $0x04 7353 JHI loop_unroll 7354 JMP 
check_limit 7355 7356 loop: 7357 MOVSS (AX)(DI*4), X1 7358 MULSS X0, X1 7359 ADDSS (DX)(R8*4), X1 7360 MOVSS X1, (DX)(R8*4) 7361 DECQ SI 7362 ADDQ CX, DI 7363 ADDQ BX, R8 7364 7365 check_limit: 7366 CMPQ SI, $0x00 7367 JHI loop 7368 RET 7369 7370 // func AmdAxpyUnsafeX_V3A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7371 // Requires: SSE 7372 TEXT ·AmdAxpyUnsafeX_V3A0R4(SB), NOSPLIT, $0-48 7373 MOVSS alpha+0(FP), X0 7374 MOVQ xs+8(FP), AX 7375 MOVQ incx+16(FP), CX 7376 MOVQ ys+24(FP), DX 7377 MOVQ incy+32(FP), BX 7378 MOVQ n+40(FP), SI 7379 XORQ DI, DI 7380 XORQ R8, R8 7381 JMP check_limit_unroll 7382 7383 loop_unroll: 7384 MOVSS (AX)(DI*4), X1 7385 MULSS X0, X1 7386 ADDSS (DX)(R8*4), X1 7387 MOVSS X1, (DX)(R8*4) 7388 ADDQ CX, DI 7389 ADDQ BX, R8 7390 MOVSS (AX)(DI*4), X1 7391 MULSS X0, X1 7392 ADDSS (DX)(R8*4), X1 7393 MOVSS X1, (DX)(R8*4) 7394 ADDQ CX, DI 7395 ADDQ BX, R8 7396 MOVSS (AX)(DI*4), X1 7397 MULSS X0, X1 7398 ADDSS (DX)(R8*4), X1 7399 MOVSS X1, (DX)(R8*4) 7400 ADDQ CX, DI 7401 ADDQ BX, R8 7402 MOVSS (AX)(DI*4), X1 7403 MULSS X0, X1 7404 ADDSS (DX)(R8*4), X1 7405 MOVSS X1, (DX)(R8*4) 7406 ADDQ CX, DI 7407 ADDQ BX, R8 7408 SUBQ $0x04, SI 7409 7410 check_limit_unroll: 7411 CMPQ SI, $0x04 7412 JHI loop_unroll 7413 JMP check_limit 7414 7415 loop: 7416 MOVSS (AX)(DI*4), X1 7417 MULSS X0, X1 7418 ADDSS (DX)(R8*4), X1 7419 MOVSS X1, (DX)(R8*4) 7420 DECQ SI 7421 ADDQ CX, DI 7422 ADDQ BX, R8 7423 7424 check_limit: 7425 CMPQ SI, $0x00 7426 JHI loop 7427 RET 7428 7429 // func AmdAxpyUnsafeX_V4A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7430 // Requires: SSE 7431 TEXT ·AmdAxpyUnsafeX_V4A0R4(SB), NOSPLIT, $0-48 7432 MOVSS alpha+0(FP), X0 7433 MOVQ xs+8(FP), AX 7434 MOVQ incx+16(FP), CX 7435 MOVQ ys+24(FP), DX 7436 MOVQ incy+32(FP), BX 7437 MOVQ n+40(FP), SI 7438 XORQ DI, DI 7439 XORQ R8, R8 7440 JMP check_limit_unroll 7441 7442 loop_unroll: 7443 MOVSS (AX)(DI*4), X1 7444 MULSS X0, X1 7445 ADDSS (DX)(R8*4), X1 7446 MOVSS X1, (DX)(R8*4) 7447 ADDQ CX, DI 7448 ADDQ BX, R8 7449 MOVSS (AX)(DI*4), X1 7450 MULSS X0, X1 7451 ADDSS (DX)(R8*4), X1 7452 MOVSS X1, (DX)(R8*4) 7453 ADDQ CX, DI 7454 ADDQ BX, R8 7455 MOVSS (AX)(DI*4), X1 7456 MULSS X0, X1 7457 ADDSS (DX)(R8*4), X1 7458 MOVSS X1, (DX)(R8*4) 7459 ADDQ CX, DI 7460 ADDQ BX, R8 7461 MOVSS (AX)(DI*4), X1 7462 MULSS X0, X1 7463 ADDSS (DX)(R8*4), X1 7464 MOVSS X1, (DX)(R8*4) 7465 ADDQ CX, DI 7466 ADDQ BX, R8 7467 SUBQ $0x04, SI 7468 7469 check_limit_unroll: 7470 CMPQ SI, $0x04 7471 JHI loop_unroll 7472 JMP check_limit 7473 7474 loop: 7475 MOVSS (AX)(DI*4), X1 7476 MULSS X0, X1 7477 ADDSS (DX)(R8*4), X1 7478 MOVSS X1, (DX)(R8*4) 7479 DECQ SI 7480 ADDQ CX, DI 7481 ADDQ BX, R8 7482 7483 check_limit: 7484 CMPQ SI, $0x00 7485 JHI loop 7486 RET 7487 7488 // func AmdAxpyUnsafeX_V5A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7489 // Requires: SSE 7490 TEXT ·AmdAxpyUnsafeX_V5A0R4(SB), NOSPLIT, $0-48 7491 MOVSS alpha+0(FP), X0 7492 MOVQ xs+8(FP), AX 7493 MOVQ incx+16(FP), CX 7494 MOVQ ys+24(FP), DX 7495 MOVQ incy+32(FP), BX 7496 MOVQ n+40(FP), SI 7497 XORQ DI, DI 7498 XORQ R8, R8 7499 JMP check_limit_unroll 7500 7501 loop_unroll: 7502 MOVSS (AX)(DI*4), X1 7503 MULSS X0, X1 7504 ADDSS (DX)(R8*4), X1 7505 MOVSS X1, (DX)(R8*4) 7506 ADDQ CX, DI 7507 ADDQ BX, R8 7508 MOVSS (AX)(DI*4), X1 7509 MULSS X0, X1 7510 ADDSS (DX)(R8*4), X1 7511 MOVSS X1, (DX)(R8*4) 7512 ADDQ CX, DI 7513 ADDQ BX, R8 7514 MOVSS (AX)(DI*4), X1 7515 MULSS X0, X1 7516 ADDSS 
(DX)(R8*4), X1 7517 MOVSS X1, (DX)(R8*4) 7518 ADDQ CX, DI 7519 ADDQ BX, R8 7520 MOVSS (AX)(DI*4), X1 7521 MULSS X0, X1 7522 ADDSS (DX)(R8*4), X1 7523 MOVSS X1, (DX)(R8*4) 7524 ADDQ CX, DI 7525 ADDQ BX, R8 7526 SUBQ $0x04, SI 7527 7528 check_limit_unroll: 7529 CMPQ SI, $0x04 7530 JHI loop_unroll 7531 JMP check_limit 7532 7533 loop: 7534 MOVSS (AX)(DI*4), X1 7535 MULSS X0, X1 7536 ADDSS (DX)(R8*4), X1 7537 MOVSS X1, (DX)(R8*4) 7538 DECQ SI 7539 ADDQ CX, DI 7540 ADDQ BX, R8 7541 7542 check_limit: 7543 CMPQ SI, $0x00 7544 JHI loop 7545 RET 7546 7547 // func AmdAxpyUnsafeX_V0A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7548 // Requires: SSE 7549 TEXT ·AmdAxpyUnsafeX_V0A8R4(SB), NOSPLIT, $0-48 7550 MOVSS alpha+0(FP), X0 7551 MOVQ xs+8(FP), AX 7552 MOVQ incx+16(FP), CX 7553 MOVQ ys+24(FP), DX 7554 MOVQ incy+32(FP), BX 7555 MOVQ n+40(FP), SI 7556 XORQ DI, DI 7557 XORQ R8, R8 7558 JMP check_limit_unroll 7559 PCALIGN $0x08 7560 7561 loop_unroll: 7562 MOVSS (AX)(DI*4), X1 7563 MULSS X0, X1 7564 ADDSS (DX)(R8*4), X1 7565 MOVSS X1, (DX)(R8*4) 7566 ADDQ CX, DI 7567 ADDQ BX, R8 7568 MOVSS (AX)(DI*4), X1 7569 MULSS X0, X1 7570 ADDSS (DX)(R8*4), X1 7571 MOVSS X1, (DX)(R8*4) 7572 ADDQ CX, DI 7573 ADDQ BX, R8 7574 MOVSS (AX)(DI*4), X1 7575 MULSS X0, X1 7576 ADDSS (DX)(R8*4), X1 7577 MOVSS X1, (DX)(R8*4) 7578 ADDQ CX, DI 7579 ADDQ BX, R8 7580 MOVSS (AX)(DI*4), X1 7581 MULSS X0, X1 7582 ADDSS (DX)(R8*4), X1 7583 MOVSS X1, (DX)(R8*4) 7584 ADDQ CX, DI 7585 ADDQ BX, R8 7586 SUBQ $0x04, SI 7587 7588 check_limit_unroll: 7589 CMPQ SI, $0x04 7590 JHI loop_unroll 7591 JMP check_limit 7592 7593 loop: 7594 MOVSS (AX)(DI*4), X1 7595 MULSS X0, X1 7596 ADDSS (DX)(R8*4), X1 7597 MOVSS X1, (DX)(R8*4) 7598 DECQ SI 7599 ADDQ CX, DI 7600 ADDQ BX, R8 7601 7602 check_limit: 7603 CMPQ SI, $0x00 7604 JHI loop 7605 RET 7606 7607 // func AmdAxpyUnsafeX_V1A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7608 // Requires: SSE 7609 TEXT ·AmdAxpyUnsafeX_V1A8R4(SB), NOSPLIT, $0-48 7610 MOVSS alpha+0(FP), X0 7611 MOVQ xs+8(FP), AX 7612 MOVQ incx+16(FP), CX 7613 MOVQ ys+24(FP), DX 7614 MOVQ incy+32(FP), BX 7615 MOVQ n+40(FP), SI 7616 XORQ DI, DI 7617 XORQ R8, R8 7618 JMP check_limit_unroll 7619 PCALIGN $0x08 7620 7621 loop_unroll: 7622 MOVSS (AX)(DI*4), X1 7623 MULSS X0, X1 7624 ADDSS (DX)(R8*4), X1 7625 MOVSS X1, (DX)(R8*4) 7626 ADDQ CX, DI 7627 ADDQ BX, R8 7628 MOVSS (AX)(DI*4), X1 7629 MULSS X0, X1 7630 ADDSS (DX)(R8*4), X1 7631 MOVSS X1, (DX)(R8*4) 7632 ADDQ CX, DI 7633 ADDQ BX, R8 7634 MOVSS (AX)(DI*4), X1 7635 MULSS X0, X1 7636 ADDSS (DX)(R8*4), X1 7637 MOVSS X1, (DX)(R8*4) 7638 ADDQ CX, DI 7639 ADDQ BX, R8 7640 MOVSS (AX)(DI*4), X1 7641 MULSS X0, X1 7642 ADDSS (DX)(R8*4), X1 7643 MOVSS X1, (DX)(R8*4) 7644 ADDQ CX, DI 7645 ADDQ BX, R8 7646 SUBQ $0x04, SI 7647 7648 check_limit_unroll: 7649 CMPQ SI, $0x04 7650 JHI loop_unroll 7651 JMP check_limit 7652 7653 loop: 7654 MOVSS (AX)(DI*4), X1 7655 MULSS X0, X1 7656 ADDSS (DX)(R8*4), X1 7657 MOVSS X1, (DX)(R8*4) 7658 DECQ SI 7659 ADDQ CX, DI 7660 ADDQ BX, R8 7661 7662 check_limit: 7663 CMPQ SI, $0x00 7664 JHI loop 7665 RET 7666 7667 // func AmdAxpyUnsafeX_V2A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7668 // Requires: SSE 7669 TEXT ·AmdAxpyUnsafeX_V2A8R4(SB), NOSPLIT, $0-48 7670 MOVSS alpha+0(FP), X0 7671 MOVQ xs+8(FP), AX 7672 MOVQ incx+16(FP), CX 7673 MOVQ ys+24(FP), DX 7674 MOVQ incy+32(FP), BX 7675 MOVQ n+40(FP), SI 7676 XORQ DI, DI 7677 XORQ R8, R8 7678 JMP 
check_limit_unroll 7679 PCALIGN $0x08 7680 7681 loop_unroll: 7682 MOVSS (AX)(DI*4), X1 7683 MULSS X0, X1 7684 ADDSS (DX)(R8*4), X1 7685 MOVSS X1, (DX)(R8*4) 7686 ADDQ CX, DI 7687 ADDQ BX, R8 7688 MOVSS (AX)(DI*4), X1 7689 MULSS X0, X1 7690 ADDSS (DX)(R8*4), X1 7691 MOVSS X1, (DX)(R8*4) 7692 ADDQ CX, DI 7693 ADDQ BX, R8 7694 MOVSS (AX)(DI*4), X1 7695 MULSS X0, X1 7696 ADDSS (DX)(R8*4), X1 7697 MOVSS X1, (DX)(R8*4) 7698 ADDQ CX, DI 7699 ADDQ BX, R8 7700 MOVSS (AX)(DI*4), X1 7701 MULSS X0, X1 7702 ADDSS (DX)(R8*4), X1 7703 MOVSS X1, (DX)(R8*4) 7704 ADDQ CX, DI 7705 ADDQ BX, R8 7706 SUBQ $0x04, SI 7707 7708 check_limit_unroll: 7709 CMPQ SI, $0x04 7710 JHI loop_unroll 7711 JMP check_limit 7712 7713 loop: 7714 MOVSS (AX)(DI*4), X1 7715 MULSS X0, X1 7716 ADDSS (DX)(R8*4), X1 7717 MOVSS X1, (DX)(R8*4) 7718 DECQ SI 7719 ADDQ CX, DI 7720 ADDQ BX, R8 7721 7722 check_limit: 7723 CMPQ SI, $0x00 7724 JHI loop 7725 RET 7726 7727 // func AmdAxpyUnsafeX_V3A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7728 // Requires: SSE 7729 TEXT ·AmdAxpyUnsafeX_V3A8R4(SB), NOSPLIT, $0-48 7730 MOVSS alpha+0(FP), X0 7731 MOVQ xs+8(FP), AX 7732 MOVQ incx+16(FP), CX 7733 MOVQ ys+24(FP), DX 7734 MOVQ incy+32(FP), BX 7735 MOVQ n+40(FP), SI 7736 XORQ DI, DI 7737 XORQ R8, R8 7738 JMP check_limit_unroll 7739 PCALIGN $0x08 7740 7741 loop_unroll: 7742 MOVSS (AX)(DI*4), X1 7743 MULSS X0, X1 7744 ADDSS (DX)(R8*4), X1 7745 MOVSS X1, (DX)(R8*4) 7746 ADDQ CX, DI 7747 ADDQ BX, R8 7748 MOVSS (AX)(DI*4), X1 7749 MULSS X0, X1 7750 ADDSS (DX)(R8*4), X1 7751 MOVSS X1, (DX)(R8*4) 7752 ADDQ CX, DI 7753 ADDQ BX, R8 7754 MOVSS (AX)(DI*4), X1 7755 MULSS X0, X1 7756 ADDSS (DX)(R8*4), X1 7757 MOVSS X1, (DX)(R8*4) 7758 ADDQ CX, DI 7759 ADDQ BX, R8 7760 MOVSS (AX)(DI*4), X1 7761 MULSS X0, X1 7762 ADDSS (DX)(R8*4), X1 7763 MOVSS X1, (DX)(R8*4) 7764 ADDQ CX, DI 7765 ADDQ BX, R8 7766 SUBQ $0x04, SI 7767 7768 check_limit_unroll: 7769 CMPQ SI, $0x04 7770 JHI loop_unroll 7771 JMP check_limit 7772 7773 loop: 7774 MOVSS (AX)(DI*4), X1 7775 MULSS X0, X1 7776 ADDSS (DX)(R8*4), X1 7777 MOVSS X1, (DX)(R8*4) 7778 DECQ SI 7779 ADDQ CX, DI 7780 ADDQ BX, R8 7781 7782 check_limit: 7783 CMPQ SI, $0x00 7784 JHI loop 7785 RET 7786 7787 // func AmdAxpyUnsafeX_V4A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7788 // Requires: SSE 7789 TEXT ·AmdAxpyUnsafeX_V4A8R4(SB), NOSPLIT, $0-48 7790 MOVSS alpha+0(FP), X0 7791 MOVQ xs+8(FP), AX 7792 MOVQ incx+16(FP), CX 7793 MOVQ ys+24(FP), DX 7794 MOVQ incy+32(FP), BX 7795 MOVQ n+40(FP), SI 7796 XORQ DI, DI 7797 XORQ R8, R8 7798 JMP check_limit_unroll 7799 PCALIGN $0x08 7800 7801 loop_unroll: 7802 MOVSS (AX)(DI*4), X1 7803 MULSS X0, X1 7804 ADDSS (DX)(R8*4), X1 7805 MOVSS X1, (DX)(R8*4) 7806 ADDQ CX, DI 7807 ADDQ BX, R8 7808 MOVSS (AX)(DI*4), X1 7809 MULSS X0, X1 7810 ADDSS (DX)(R8*4), X1 7811 MOVSS X1, (DX)(R8*4) 7812 ADDQ CX, DI 7813 ADDQ BX, R8 7814 MOVSS (AX)(DI*4), X1 7815 MULSS X0, X1 7816 ADDSS (DX)(R8*4), X1 7817 MOVSS X1, (DX)(R8*4) 7818 ADDQ CX, DI 7819 ADDQ BX, R8 7820 MOVSS (AX)(DI*4), X1 7821 MULSS X0, X1 7822 ADDSS (DX)(R8*4), X1 7823 MOVSS X1, (DX)(R8*4) 7824 ADDQ CX, DI 7825 ADDQ BX, R8 7826 SUBQ $0x04, SI 7827 7828 check_limit_unroll: 7829 CMPQ SI, $0x04 7830 JHI loop_unroll 7831 JMP check_limit 7832 7833 loop: 7834 MOVSS (AX)(DI*4), X1 7835 MULSS X0, X1 7836 ADDSS (DX)(R8*4), X1 7837 MOVSS X1, (DX)(R8*4) 7838 DECQ SI 7839 ADDQ CX, DI 7840 ADDQ BX, R8 7841 7842 check_limit: 7843 CMPQ SI, $0x00 7844 JHI loop 7845 RET 7846 7847 // func 
AmdAxpyUnsafeX_V5A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7848 // Requires: SSE 7849 TEXT ·AmdAxpyUnsafeX_V5A8R4(SB), NOSPLIT, $0-48 7850 MOVSS alpha+0(FP), X0 7851 MOVQ xs+8(FP), AX 7852 MOVQ incx+16(FP), CX 7853 MOVQ ys+24(FP), DX 7854 MOVQ incy+32(FP), BX 7855 MOVQ n+40(FP), SI 7856 XORQ DI, DI 7857 XORQ R8, R8 7858 JMP check_limit_unroll 7859 PCALIGN $0x08 7860 7861 loop_unroll: 7862 MOVSS (AX)(DI*4), X1 7863 MULSS X0, X1 7864 ADDSS (DX)(R8*4), X1 7865 MOVSS X1, (DX)(R8*4) 7866 ADDQ CX, DI 7867 ADDQ BX, R8 7868 MOVSS (AX)(DI*4), X1 7869 MULSS X0, X1 7870 ADDSS (DX)(R8*4), X1 7871 MOVSS X1, (DX)(R8*4) 7872 ADDQ CX, DI 7873 ADDQ BX, R8 7874 MOVSS (AX)(DI*4), X1 7875 MULSS X0, X1 7876 ADDSS (DX)(R8*4), X1 7877 MOVSS X1, (DX)(R8*4) 7878 ADDQ CX, DI 7879 ADDQ BX, R8 7880 MOVSS (AX)(DI*4), X1 7881 MULSS X0, X1 7882 ADDSS (DX)(R8*4), X1 7883 MOVSS X1, (DX)(R8*4) 7884 ADDQ CX, DI 7885 ADDQ BX, R8 7886 SUBQ $0x04, SI 7887 7888 check_limit_unroll: 7889 CMPQ SI, $0x04 7890 JHI loop_unroll 7891 JMP check_limit 7892 7893 loop: 7894 MOVSS (AX)(DI*4), X1 7895 MULSS X0, X1 7896 ADDSS (DX)(R8*4), X1 7897 MOVSS X1, (DX)(R8*4) 7898 DECQ SI 7899 ADDQ CX, DI 7900 ADDQ BX, R8 7901 7902 check_limit: 7903 CMPQ SI, $0x00 7904 JHI loop 7905 RET 7906 7907 // func AmdAxpyUnsafeX_V0A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7908 // Requires: SSE 7909 TEXT ·AmdAxpyUnsafeX_V0A9R4(SB), NOSPLIT, $0-48 7910 MOVSS alpha+0(FP), X0 7911 MOVQ xs+8(FP), AX 7912 MOVQ incx+16(FP), CX 7913 MOVQ ys+24(FP), DX 7914 MOVQ incy+32(FP), BX 7915 MOVQ n+40(FP), SI 7916 XORQ DI, DI 7917 XORQ R8, R8 7918 JMP check_limit_unroll 7919 PCALIGN $0x08 7920 NOP 7921 7922 loop_unroll: 7923 MOVSS (AX)(DI*4), X1 7924 MULSS X0, X1 7925 ADDSS (DX)(R8*4), X1 7926 MOVSS X1, (DX)(R8*4) 7927 ADDQ CX, DI 7928 ADDQ BX, R8 7929 MOVSS (AX)(DI*4), X1 7930 MULSS X0, X1 7931 ADDSS (DX)(R8*4), X1 7932 MOVSS X1, (DX)(R8*4) 7933 ADDQ CX, DI 7934 ADDQ BX, R8 7935 MOVSS (AX)(DI*4), X1 7936 MULSS X0, X1 7937 ADDSS (DX)(R8*4), X1 7938 MOVSS X1, (DX)(R8*4) 7939 ADDQ CX, DI 7940 ADDQ BX, R8 7941 MOVSS (AX)(DI*4), X1 7942 MULSS X0, X1 7943 ADDSS (DX)(R8*4), X1 7944 MOVSS X1, (DX)(R8*4) 7945 ADDQ CX, DI 7946 ADDQ BX, R8 7947 SUBQ $0x04, SI 7948 7949 check_limit_unroll: 7950 CMPQ SI, $0x04 7951 JHI loop_unroll 7952 JMP check_limit 7953 7954 loop: 7955 MOVSS (AX)(DI*4), X1 7956 MULSS X0, X1 7957 ADDSS (DX)(R8*4), X1 7958 MOVSS X1, (DX)(R8*4) 7959 DECQ SI 7960 ADDQ CX, DI 7961 ADDQ BX, R8 7962 7963 check_limit: 7964 CMPQ SI, $0x00 7965 JHI loop 7966 RET 7967 7968 // func AmdAxpyUnsafeX_V1A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 7969 // Requires: SSE 7970 TEXT ·AmdAxpyUnsafeX_V1A9R4(SB), NOSPLIT, $0-48 7971 MOVSS alpha+0(FP), X0 7972 MOVQ xs+8(FP), AX 7973 MOVQ incx+16(FP), CX 7974 MOVQ ys+24(FP), DX 7975 MOVQ incy+32(FP), BX 7976 MOVQ n+40(FP), SI 7977 XORQ DI, DI 7978 XORQ R8, R8 7979 JMP check_limit_unroll 7980 PCALIGN $0x08 7981 NOP 7982 7983 loop_unroll: 7984 MOVSS (AX)(DI*4), X1 7985 MULSS X0, X1 7986 ADDSS (DX)(R8*4), X1 7987 MOVSS X1, (DX)(R8*4) 7988 ADDQ CX, DI 7989 ADDQ BX, R8 7990 MOVSS (AX)(DI*4), X1 7991 MULSS X0, X1 7992 ADDSS (DX)(R8*4), X1 7993 MOVSS X1, (DX)(R8*4) 7994 ADDQ CX, DI 7995 ADDQ BX, R8 7996 MOVSS (AX)(DI*4), X1 7997 MULSS X0, X1 7998 ADDSS (DX)(R8*4), X1 7999 MOVSS X1, (DX)(R8*4) 8000 ADDQ CX, DI 8001 ADDQ BX, R8 8002 MOVSS (AX)(DI*4), X1 8003 MULSS X0, X1 8004 ADDSS (DX)(R8*4), X1 8005 MOVSS X1, (DX)(R8*4) 8006 ADDQ 
CX, DI 8007 ADDQ BX, R8 8008 SUBQ $0x04, SI 8009 8010 check_limit_unroll: 8011 CMPQ SI, $0x04 8012 JHI loop_unroll 8013 JMP check_limit 8014 8015 loop: 8016 MOVSS (AX)(DI*4), X1 8017 MULSS X0, X1 8018 ADDSS (DX)(R8*4), X1 8019 MOVSS X1, (DX)(R8*4) 8020 DECQ SI 8021 ADDQ CX, DI 8022 ADDQ BX, R8 8023 8024 check_limit: 8025 CMPQ SI, $0x00 8026 JHI loop 8027 RET 8028 8029 // func AmdAxpyUnsafeX_V2A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8030 // Requires: SSE 8031 TEXT ·AmdAxpyUnsafeX_V2A9R4(SB), NOSPLIT, $0-48 8032 MOVSS alpha+0(FP), X0 8033 MOVQ xs+8(FP), AX 8034 MOVQ incx+16(FP), CX 8035 MOVQ ys+24(FP), DX 8036 MOVQ incy+32(FP), BX 8037 MOVQ n+40(FP), SI 8038 XORQ DI, DI 8039 XORQ R8, R8 8040 JMP check_limit_unroll 8041 PCALIGN $0x08 8042 NOP 8043 8044 loop_unroll: 8045 MOVSS (AX)(DI*4), X1 8046 MULSS X0, X1 8047 ADDSS (DX)(R8*4), X1 8048 MOVSS X1, (DX)(R8*4) 8049 ADDQ CX, DI 8050 ADDQ BX, R8 8051 MOVSS (AX)(DI*4), X1 8052 MULSS X0, X1 8053 ADDSS (DX)(R8*4), X1 8054 MOVSS X1, (DX)(R8*4) 8055 ADDQ CX, DI 8056 ADDQ BX, R8 8057 MOVSS (AX)(DI*4), X1 8058 MULSS X0, X1 8059 ADDSS (DX)(R8*4), X1 8060 MOVSS X1, (DX)(R8*4) 8061 ADDQ CX, DI 8062 ADDQ BX, R8 8063 MOVSS (AX)(DI*4), X1 8064 MULSS X0, X1 8065 ADDSS (DX)(R8*4), X1 8066 MOVSS X1, (DX)(R8*4) 8067 ADDQ CX, DI 8068 ADDQ BX, R8 8069 SUBQ $0x04, SI 8070 8071 check_limit_unroll: 8072 CMPQ SI, $0x04 8073 JHI loop_unroll 8074 JMP check_limit 8075 8076 loop: 8077 MOVSS (AX)(DI*4), X1 8078 MULSS X0, X1 8079 ADDSS (DX)(R8*4), X1 8080 MOVSS X1, (DX)(R8*4) 8081 DECQ SI 8082 ADDQ CX, DI 8083 ADDQ BX, R8 8084 8085 check_limit: 8086 CMPQ SI, $0x00 8087 JHI loop 8088 RET 8089 8090 // func AmdAxpyUnsafeX_V3A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8091 // Requires: SSE 8092 TEXT ·AmdAxpyUnsafeX_V3A9R4(SB), NOSPLIT, $0-48 8093 MOVSS alpha+0(FP), X0 8094 MOVQ xs+8(FP), AX 8095 MOVQ incx+16(FP), CX 8096 MOVQ ys+24(FP), DX 8097 MOVQ incy+32(FP), BX 8098 MOVQ n+40(FP), SI 8099 XORQ DI, DI 8100 XORQ R8, R8 8101 JMP check_limit_unroll 8102 PCALIGN $0x08 8103 NOP 8104 8105 loop_unroll: 8106 MOVSS (AX)(DI*4), X1 8107 MULSS X0, X1 8108 ADDSS (DX)(R8*4), X1 8109 MOVSS X1, (DX)(R8*4) 8110 ADDQ CX, DI 8111 ADDQ BX, R8 8112 MOVSS (AX)(DI*4), X1 8113 MULSS X0, X1 8114 ADDSS (DX)(R8*4), X1 8115 MOVSS X1, (DX)(R8*4) 8116 ADDQ CX, DI 8117 ADDQ BX, R8 8118 MOVSS (AX)(DI*4), X1 8119 MULSS X0, X1 8120 ADDSS (DX)(R8*4), X1 8121 MOVSS X1, (DX)(R8*4) 8122 ADDQ CX, DI 8123 ADDQ BX, R8 8124 MOVSS (AX)(DI*4), X1 8125 MULSS X0, X1 8126 ADDSS (DX)(R8*4), X1 8127 MOVSS X1, (DX)(R8*4) 8128 ADDQ CX, DI 8129 ADDQ BX, R8 8130 SUBQ $0x04, SI 8131 8132 check_limit_unroll: 8133 CMPQ SI, $0x04 8134 JHI loop_unroll 8135 JMP check_limit 8136 8137 loop: 8138 MOVSS (AX)(DI*4), X1 8139 MULSS X0, X1 8140 ADDSS (DX)(R8*4), X1 8141 MOVSS X1, (DX)(R8*4) 8142 DECQ SI 8143 ADDQ CX, DI 8144 ADDQ BX, R8 8145 8146 check_limit: 8147 CMPQ SI, $0x00 8148 JHI loop 8149 RET 8150 8151 // func AmdAxpyUnsafeX_V4A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8152 // Requires: SSE 8153 TEXT ·AmdAxpyUnsafeX_V4A9R4(SB), NOSPLIT, $0-48 8154 MOVSS alpha+0(FP), X0 8155 MOVQ xs+8(FP), AX 8156 MOVQ incx+16(FP), CX 8157 MOVQ ys+24(FP), DX 8158 MOVQ incy+32(FP), BX 8159 MOVQ n+40(FP), SI 8160 XORQ DI, DI 8161 XORQ R8, R8 8162 JMP check_limit_unroll 8163 PCALIGN $0x08 8164 NOP 8165 8166 loop_unroll: 8167 MOVSS (AX)(DI*4), X1 8168 MULSS X0, X1 8169 ADDSS (DX)(R8*4), X1 8170 MOVSS X1, (DX)(R8*4) 8171 ADDQ 
CX, DI 8172 ADDQ BX, R8 8173 MOVSS (AX)(DI*4), X1 8174 MULSS X0, X1 8175 ADDSS (DX)(R8*4), X1 8176 MOVSS X1, (DX)(R8*4) 8177 ADDQ CX, DI 8178 ADDQ BX, R8 8179 MOVSS (AX)(DI*4), X1 8180 MULSS X0, X1 8181 ADDSS (DX)(R8*4), X1 8182 MOVSS X1, (DX)(R8*4) 8183 ADDQ CX, DI 8184 ADDQ BX, R8 8185 MOVSS (AX)(DI*4), X1 8186 MULSS X0, X1 8187 ADDSS (DX)(R8*4), X1 8188 MOVSS X1, (DX)(R8*4) 8189 ADDQ CX, DI 8190 ADDQ BX, R8 8191 SUBQ $0x04, SI 8192 8193 check_limit_unroll: 8194 CMPQ SI, $0x04 8195 JHI loop_unroll 8196 JMP check_limit 8197 8198 loop: 8199 MOVSS (AX)(DI*4), X1 8200 MULSS X0, X1 8201 ADDSS (DX)(R8*4), X1 8202 MOVSS X1, (DX)(R8*4) 8203 DECQ SI 8204 ADDQ CX, DI 8205 ADDQ BX, R8 8206 8207 check_limit: 8208 CMPQ SI, $0x00 8209 JHI loop 8210 RET 8211 8212 // func AmdAxpyUnsafeX_V5A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8213 // Requires: SSE 8214 TEXT ·AmdAxpyUnsafeX_V5A9R4(SB), NOSPLIT, $0-48 8215 MOVSS alpha+0(FP), X0 8216 MOVQ xs+8(FP), AX 8217 MOVQ incx+16(FP), CX 8218 MOVQ ys+24(FP), DX 8219 MOVQ incy+32(FP), BX 8220 MOVQ n+40(FP), SI 8221 XORQ DI, DI 8222 XORQ R8, R8 8223 JMP check_limit_unroll 8224 PCALIGN $0x08 8225 NOP 8226 8227 loop_unroll: 8228 MOVSS (AX)(DI*4), X1 8229 MULSS X0, X1 8230 ADDSS (DX)(R8*4), X1 8231 MOVSS X1, (DX)(R8*4) 8232 ADDQ CX, DI 8233 ADDQ BX, R8 8234 MOVSS (AX)(DI*4), X1 8235 MULSS X0, X1 8236 ADDSS (DX)(R8*4), X1 8237 MOVSS X1, (DX)(R8*4) 8238 ADDQ CX, DI 8239 ADDQ BX, R8 8240 MOVSS (AX)(DI*4), X1 8241 MULSS X0, X1 8242 ADDSS (DX)(R8*4), X1 8243 MOVSS X1, (DX)(R8*4) 8244 ADDQ CX, DI 8245 ADDQ BX, R8 8246 MOVSS (AX)(DI*4), X1 8247 MULSS X0, X1 8248 ADDSS (DX)(R8*4), X1 8249 MOVSS X1, (DX)(R8*4) 8250 ADDQ CX, DI 8251 ADDQ BX, R8 8252 SUBQ $0x04, SI 8253 8254 check_limit_unroll: 8255 CMPQ SI, $0x04 8256 JHI loop_unroll 8257 JMP check_limit 8258 8259 loop: 8260 MOVSS (AX)(DI*4), X1 8261 MULSS X0, X1 8262 ADDSS (DX)(R8*4), X1 8263 MOVSS X1, (DX)(R8*4) 8264 DECQ SI 8265 ADDQ CX, DI 8266 ADDQ BX, R8 8267 8268 check_limit: 8269 CMPQ SI, $0x00 8270 JHI loop 8271 RET 8272 8273 // func AmdAxpyUnsafeX_V0A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8274 // Requires: SSE 8275 TEXT ·AmdAxpyUnsafeX_V0A10R4(SB), NOSPLIT, $0-48 8276 MOVSS alpha+0(FP), X0 8277 MOVQ xs+8(FP), AX 8278 MOVQ incx+16(FP), CX 8279 MOVQ ys+24(FP), DX 8280 MOVQ incy+32(FP), BX 8281 MOVQ n+40(FP), SI 8282 XORQ DI, DI 8283 XORQ R8, R8 8284 JMP check_limit_unroll 8285 PCALIGN $0x08 8286 NOP 8287 NOP 8288 8289 loop_unroll: 8290 MOVSS (AX)(DI*4), X1 8291 MULSS X0, X1 8292 ADDSS (DX)(R8*4), X1 8293 MOVSS X1, (DX)(R8*4) 8294 ADDQ CX, DI 8295 ADDQ BX, R8 8296 MOVSS (AX)(DI*4), X1 8297 MULSS X0, X1 8298 ADDSS (DX)(R8*4), X1 8299 MOVSS X1, (DX)(R8*4) 8300 ADDQ CX, DI 8301 ADDQ BX, R8 8302 MOVSS (AX)(DI*4), X1 8303 MULSS X0, X1 8304 ADDSS (DX)(R8*4), X1 8305 MOVSS X1, (DX)(R8*4) 8306 ADDQ CX, DI 8307 ADDQ BX, R8 8308 MOVSS (AX)(DI*4), X1 8309 MULSS X0, X1 8310 ADDSS (DX)(R8*4), X1 8311 MOVSS X1, (DX)(R8*4) 8312 ADDQ CX, DI 8313 ADDQ BX, R8 8314 SUBQ $0x04, SI 8315 8316 check_limit_unroll: 8317 CMPQ SI, $0x04 8318 JHI loop_unroll 8319 JMP check_limit 8320 8321 loop: 8322 MOVSS (AX)(DI*4), X1 8323 MULSS X0, X1 8324 ADDSS (DX)(R8*4), X1 8325 MOVSS X1, (DX)(R8*4) 8326 DECQ SI 8327 ADDQ CX, DI 8328 ADDQ BX, R8 8329 8330 check_limit: 8331 CMPQ SI, $0x00 8332 JHI loop 8333 RET 8334 8335 // func AmdAxpyUnsafeX_V1A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8336 // Requires: SSE 8337 TEXT 
·AmdAxpyUnsafeX_V1A10R4(SB), NOSPLIT, $0-48 8338 MOVSS alpha+0(FP), X0 8339 MOVQ xs+8(FP), AX 8340 MOVQ incx+16(FP), CX 8341 MOVQ ys+24(FP), DX 8342 MOVQ incy+32(FP), BX 8343 MOVQ n+40(FP), SI 8344 XORQ DI, DI 8345 XORQ R8, R8 8346 JMP check_limit_unroll 8347 PCALIGN $0x08 8348 NOP 8349 NOP 8350 8351 loop_unroll: 8352 MOVSS (AX)(DI*4), X1 8353 MULSS X0, X1 8354 ADDSS (DX)(R8*4), X1 8355 MOVSS X1, (DX)(R8*4) 8356 ADDQ CX, DI 8357 ADDQ BX, R8 8358 MOVSS (AX)(DI*4), X1 8359 MULSS X0, X1 8360 ADDSS (DX)(R8*4), X1 8361 MOVSS X1, (DX)(R8*4) 8362 ADDQ CX, DI 8363 ADDQ BX, R8 8364 MOVSS (AX)(DI*4), X1 8365 MULSS X0, X1 8366 ADDSS (DX)(R8*4), X1 8367 MOVSS X1, (DX)(R8*4) 8368 ADDQ CX, DI 8369 ADDQ BX, R8 8370 MOVSS (AX)(DI*4), X1 8371 MULSS X0, X1 8372 ADDSS (DX)(R8*4), X1 8373 MOVSS X1, (DX)(R8*4) 8374 ADDQ CX, DI 8375 ADDQ BX, R8 8376 SUBQ $0x04, SI 8377 8378 check_limit_unroll: 8379 CMPQ SI, $0x04 8380 JHI loop_unroll 8381 JMP check_limit 8382 8383 loop: 8384 MOVSS (AX)(DI*4), X1 8385 MULSS X0, X1 8386 ADDSS (DX)(R8*4), X1 8387 MOVSS X1, (DX)(R8*4) 8388 DECQ SI 8389 ADDQ CX, DI 8390 ADDQ BX, R8 8391 8392 check_limit: 8393 CMPQ SI, $0x00 8394 JHI loop 8395 RET 8396 8397 // func AmdAxpyUnsafeX_V2A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8398 // Requires: SSE 8399 TEXT ·AmdAxpyUnsafeX_V2A10R4(SB), NOSPLIT, $0-48 8400 MOVSS alpha+0(FP), X0 8401 MOVQ xs+8(FP), AX 8402 MOVQ incx+16(FP), CX 8403 MOVQ ys+24(FP), DX 8404 MOVQ incy+32(FP), BX 8405 MOVQ n+40(FP), SI 8406 XORQ DI, DI 8407 XORQ R8, R8 8408 JMP check_limit_unroll 8409 PCALIGN $0x08 8410 NOP 8411 NOP 8412 8413 loop_unroll: 8414 MOVSS (AX)(DI*4), X1 8415 MULSS X0, X1 8416 ADDSS (DX)(R8*4), X1 8417 MOVSS X1, (DX)(R8*4) 8418 ADDQ CX, DI 8419 ADDQ BX, R8 8420 MOVSS (AX)(DI*4), X1 8421 MULSS X0, X1 8422 ADDSS (DX)(R8*4), X1 8423 MOVSS X1, (DX)(R8*4) 8424 ADDQ CX, DI 8425 ADDQ BX, R8 8426 MOVSS (AX)(DI*4), X1 8427 MULSS X0, X1 8428 ADDSS (DX)(R8*4), X1 8429 MOVSS X1, (DX)(R8*4) 8430 ADDQ CX, DI 8431 ADDQ BX, R8 8432 MOVSS (AX)(DI*4), X1 8433 MULSS X0, X1 8434 ADDSS (DX)(R8*4), X1 8435 MOVSS X1, (DX)(R8*4) 8436 ADDQ CX, DI 8437 ADDQ BX, R8 8438 SUBQ $0x04, SI 8439 8440 check_limit_unroll: 8441 CMPQ SI, $0x04 8442 JHI loop_unroll 8443 JMP check_limit 8444 8445 loop: 8446 MOVSS (AX)(DI*4), X1 8447 MULSS X0, X1 8448 ADDSS (DX)(R8*4), X1 8449 MOVSS X1, (DX)(R8*4) 8450 DECQ SI 8451 ADDQ CX, DI 8452 ADDQ BX, R8 8453 8454 check_limit: 8455 CMPQ SI, $0x00 8456 JHI loop 8457 RET 8458 8459 // func AmdAxpyUnsafeX_V3A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8460 // Requires: SSE 8461 TEXT ·AmdAxpyUnsafeX_V3A10R4(SB), NOSPLIT, $0-48 8462 MOVSS alpha+0(FP), X0 8463 MOVQ xs+8(FP), AX 8464 MOVQ incx+16(FP), CX 8465 MOVQ ys+24(FP), DX 8466 MOVQ incy+32(FP), BX 8467 MOVQ n+40(FP), SI 8468 XORQ DI, DI 8469 XORQ R8, R8 8470 JMP check_limit_unroll 8471 PCALIGN $0x08 8472 NOP 8473 NOP 8474 8475 loop_unroll: 8476 MOVSS (AX)(DI*4), X1 8477 MULSS X0, X1 8478 ADDSS (DX)(R8*4), X1 8479 MOVSS X1, (DX)(R8*4) 8480 ADDQ CX, DI 8481 ADDQ BX, R8 8482 MOVSS (AX)(DI*4), X1 8483 MULSS X0, X1 8484 ADDSS (DX)(R8*4), X1 8485 MOVSS X1, (DX)(R8*4) 8486 ADDQ CX, DI 8487 ADDQ BX, R8 8488 MOVSS (AX)(DI*4), X1 8489 MULSS X0, X1 8490 ADDSS (DX)(R8*4), X1 8491 MOVSS X1, (DX)(R8*4) 8492 ADDQ CX, DI 8493 ADDQ BX, R8 8494 MOVSS (AX)(DI*4), X1 8495 MULSS X0, X1 8496 ADDSS (DX)(R8*4), X1 8497 MOVSS X1, (DX)(R8*4) 8498 ADDQ CX, DI 8499 ADDQ BX, R8 8500 SUBQ $0x04, SI 8501 8502 check_limit_unroll: 8503 CMPQ SI, $0x04 
8504 JHI loop_unroll 8505 JMP check_limit 8506 8507 loop: 8508 MOVSS (AX)(DI*4), X1 8509 MULSS X0, X1 8510 ADDSS (DX)(R8*4), X1 8511 MOVSS X1, (DX)(R8*4) 8512 DECQ SI 8513 ADDQ CX, DI 8514 ADDQ BX, R8 8515 8516 check_limit: 8517 CMPQ SI, $0x00 8518 JHI loop 8519 RET 8520 8521 // func AmdAxpyUnsafeX_V4A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8522 // Requires: SSE 8523 TEXT ·AmdAxpyUnsafeX_V4A10R4(SB), NOSPLIT, $0-48 8524 MOVSS alpha+0(FP), X0 8525 MOVQ xs+8(FP), AX 8526 MOVQ incx+16(FP), CX 8527 MOVQ ys+24(FP), DX 8528 MOVQ incy+32(FP), BX 8529 MOVQ n+40(FP), SI 8530 XORQ DI, DI 8531 XORQ R8, R8 8532 JMP check_limit_unroll 8533 PCALIGN $0x08 8534 NOP 8535 NOP 8536 8537 loop_unroll: 8538 MOVSS (AX)(DI*4), X1 8539 MULSS X0, X1 8540 ADDSS (DX)(R8*4), X1 8541 MOVSS X1, (DX)(R8*4) 8542 ADDQ CX, DI 8543 ADDQ BX, R8 8544 MOVSS (AX)(DI*4), X1 8545 MULSS X0, X1 8546 ADDSS (DX)(R8*4), X1 8547 MOVSS X1, (DX)(R8*4) 8548 ADDQ CX, DI 8549 ADDQ BX, R8 8550 MOVSS (AX)(DI*4), X1 8551 MULSS X0, X1 8552 ADDSS (DX)(R8*4), X1 8553 MOVSS X1, (DX)(R8*4) 8554 ADDQ CX, DI 8555 ADDQ BX, R8 8556 MOVSS (AX)(DI*4), X1 8557 MULSS X0, X1 8558 ADDSS (DX)(R8*4), X1 8559 MOVSS X1, (DX)(R8*4) 8560 ADDQ CX, DI 8561 ADDQ BX, R8 8562 SUBQ $0x04, SI 8563 8564 check_limit_unroll: 8565 CMPQ SI, $0x04 8566 JHI loop_unroll 8567 JMP check_limit 8568 8569 loop: 8570 MOVSS (AX)(DI*4), X1 8571 MULSS X0, X1 8572 ADDSS (DX)(R8*4), X1 8573 MOVSS X1, (DX)(R8*4) 8574 DECQ SI 8575 ADDQ CX, DI 8576 ADDQ BX, R8 8577 8578 check_limit: 8579 CMPQ SI, $0x00 8580 JHI loop 8581 RET 8582 8583 // func AmdAxpyUnsafeX_V5A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8584 // Requires: SSE 8585 TEXT ·AmdAxpyUnsafeX_V5A10R4(SB), NOSPLIT, $0-48 8586 MOVSS alpha+0(FP), X0 8587 MOVQ xs+8(FP), AX 8588 MOVQ incx+16(FP), CX 8589 MOVQ ys+24(FP), DX 8590 MOVQ incy+32(FP), BX 8591 MOVQ n+40(FP), SI 8592 XORQ DI, DI 8593 XORQ R8, R8 8594 JMP check_limit_unroll 8595 PCALIGN $0x08 8596 NOP 8597 NOP 8598 8599 loop_unroll: 8600 MOVSS (AX)(DI*4), X1 8601 MULSS X0, X1 8602 ADDSS (DX)(R8*4), X1 8603 MOVSS X1, (DX)(R8*4) 8604 ADDQ CX, DI 8605 ADDQ BX, R8 8606 MOVSS (AX)(DI*4), X1 8607 MULSS X0, X1 8608 ADDSS (DX)(R8*4), X1 8609 MOVSS X1, (DX)(R8*4) 8610 ADDQ CX, DI 8611 ADDQ BX, R8 8612 MOVSS (AX)(DI*4), X1 8613 MULSS X0, X1 8614 ADDSS (DX)(R8*4), X1 8615 MOVSS X1, (DX)(R8*4) 8616 ADDQ CX, DI 8617 ADDQ BX, R8 8618 MOVSS (AX)(DI*4), X1 8619 MULSS X0, X1 8620 ADDSS (DX)(R8*4), X1 8621 MOVSS X1, (DX)(R8*4) 8622 ADDQ CX, DI 8623 ADDQ BX, R8 8624 SUBQ $0x04, SI 8625 8626 check_limit_unroll: 8627 CMPQ SI, $0x04 8628 JHI loop_unroll 8629 JMP check_limit 8630 8631 loop: 8632 MOVSS (AX)(DI*4), X1 8633 MULSS X0, X1 8634 ADDSS (DX)(R8*4), X1 8635 MOVSS X1, (DX)(R8*4) 8636 DECQ SI 8637 ADDQ CX, DI 8638 ADDQ BX, R8 8639 8640 check_limit: 8641 CMPQ SI, $0x00 8642 JHI loop 8643 RET 8644 8645 // func AmdAxpyUnsafeX_V0A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8646 // Requires: SSE 8647 TEXT ·AmdAxpyUnsafeX_V0A11R4(SB), NOSPLIT, $0-48 8648 MOVSS alpha+0(FP), X0 8649 MOVQ xs+8(FP), AX 8650 MOVQ incx+16(FP), CX 8651 MOVQ ys+24(FP), DX 8652 MOVQ incy+32(FP), BX 8653 MOVQ n+40(FP), SI 8654 XORQ DI, DI 8655 XORQ R8, R8 8656 JMP check_limit_unroll 8657 PCALIGN $0x08 8658 NOP 8659 NOP 8660 NOP 8661 8662 loop_unroll: 8663 MOVSS (AX)(DI*4), X1 8664 MULSS X0, X1 8665 ADDSS (DX)(R8*4), X1 8666 MOVSS X1, (DX)(R8*4) 8667 ADDQ CX, DI 8668 ADDQ BX, R8 8669 MOVSS (AX)(DI*4), X1 8670 
MULSS X0, X1 8671 ADDSS (DX)(R8*4), X1 8672 MOVSS X1, (DX)(R8*4) 8673 ADDQ CX, DI 8674 ADDQ BX, R8 8675 MOVSS (AX)(DI*4), X1 8676 MULSS X0, X1 8677 ADDSS (DX)(R8*4), X1 8678 MOVSS X1, (DX)(R8*4) 8679 ADDQ CX, DI 8680 ADDQ BX, R8 8681 MOVSS (AX)(DI*4), X1 8682 MULSS X0, X1 8683 ADDSS (DX)(R8*4), X1 8684 MOVSS X1, (DX)(R8*4) 8685 ADDQ CX, DI 8686 ADDQ BX, R8 8687 SUBQ $0x04, SI 8688 8689 check_limit_unroll: 8690 CMPQ SI, $0x04 8691 JHI loop_unroll 8692 JMP check_limit 8693 8694 loop: 8695 MOVSS (AX)(DI*4), X1 8696 MULSS X0, X1 8697 ADDSS (DX)(R8*4), X1 8698 MOVSS X1, (DX)(R8*4) 8699 DECQ SI 8700 ADDQ CX, DI 8701 ADDQ BX, R8 8702 8703 check_limit: 8704 CMPQ SI, $0x00 8705 JHI loop 8706 RET 8707 8708 // func AmdAxpyUnsafeX_V1A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8709 // Requires: SSE 8710 TEXT ·AmdAxpyUnsafeX_V1A11R4(SB), NOSPLIT, $0-48 8711 MOVSS alpha+0(FP), X0 8712 MOVQ xs+8(FP), AX 8713 MOVQ incx+16(FP), CX 8714 MOVQ ys+24(FP), DX 8715 MOVQ incy+32(FP), BX 8716 MOVQ n+40(FP), SI 8717 XORQ DI, DI 8718 XORQ R8, R8 8719 JMP check_limit_unroll 8720 PCALIGN $0x08 8721 NOP 8722 NOP 8723 NOP 8724 8725 loop_unroll: 8726 MOVSS (AX)(DI*4), X1 8727 MULSS X0, X1 8728 ADDSS (DX)(R8*4), X1 8729 MOVSS X1, (DX)(R8*4) 8730 ADDQ CX, DI 8731 ADDQ BX, R8 8732 MOVSS (AX)(DI*4), X1 8733 MULSS X0, X1 8734 ADDSS (DX)(R8*4), X1 8735 MOVSS X1, (DX)(R8*4) 8736 ADDQ CX, DI 8737 ADDQ BX, R8 8738 MOVSS (AX)(DI*4), X1 8739 MULSS X0, X1 8740 ADDSS (DX)(R8*4), X1 8741 MOVSS X1, (DX)(R8*4) 8742 ADDQ CX, DI 8743 ADDQ BX, R8 8744 MOVSS (AX)(DI*4), X1 8745 MULSS X0, X1 8746 ADDSS (DX)(R8*4), X1 8747 MOVSS X1, (DX)(R8*4) 8748 ADDQ CX, DI 8749 ADDQ BX, R8 8750 SUBQ $0x04, SI 8751 8752 check_limit_unroll: 8753 CMPQ SI, $0x04 8754 JHI loop_unroll 8755 JMP check_limit 8756 8757 loop: 8758 MOVSS (AX)(DI*4), X1 8759 MULSS X0, X1 8760 ADDSS (DX)(R8*4), X1 8761 MOVSS X1, (DX)(R8*4) 8762 DECQ SI 8763 ADDQ CX, DI 8764 ADDQ BX, R8 8765 8766 check_limit: 8767 CMPQ SI, $0x00 8768 JHI loop 8769 RET 8770 8771 // func AmdAxpyUnsafeX_V2A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8772 // Requires: SSE 8773 TEXT ·AmdAxpyUnsafeX_V2A11R4(SB), NOSPLIT, $0-48 8774 MOVSS alpha+0(FP), X0 8775 MOVQ xs+8(FP), AX 8776 MOVQ incx+16(FP), CX 8777 MOVQ ys+24(FP), DX 8778 MOVQ incy+32(FP), BX 8779 MOVQ n+40(FP), SI 8780 XORQ DI, DI 8781 XORQ R8, R8 8782 JMP check_limit_unroll 8783 PCALIGN $0x08 8784 NOP 8785 NOP 8786 NOP 8787 8788 loop_unroll: 8789 MOVSS (AX)(DI*4), X1 8790 MULSS X0, X1 8791 ADDSS (DX)(R8*4), X1 8792 MOVSS X1, (DX)(R8*4) 8793 ADDQ CX, DI 8794 ADDQ BX, R8 8795 MOVSS (AX)(DI*4), X1 8796 MULSS X0, X1 8797 ADDSS (DX)(R8*4), X1 8798 MOVSS X1, (DX)(R8*4) 8799 ADDQ CX, DI 8800 ADDQ BX, R8 8801 MOVSS (AX)(DI*4), X1 8802 MULSS X0, X1 8803 ADDSS (DX)(R8*4), X1 8804 MOVSS X1, (DX)(R8*4) 8805 ADDQ CX, DI 8806 ADDQ BX, R8 8807 MOVSS (AX)(DI*4), X1 8808 MULSS X0, X1 8809 ADDSS (DX)(R8*4), X1 8810 MOVSS X1, (DX)(R8*4) 8811 ADDQ CX, DI 8812 ADDQ BX, R8 8813 SUBQ $0x04, SI 8814 8815 check_limit_unroll: 8816 CMPQ SI, $0x04 8817 JHI loop_unroll 8818 JMP check_limit 8819 8820 loop: 8821 MOVSS (AX)(DI*4), X1 8822 MULSS X0, X1 8823 ADDSS (DX)(R8*4), X1 8824 MOVSS X1, (DX)(R8*4) 8825 DECQ SI 8826 ADDQ CX, DI 8827 ADDQ BX, R8 8828 8829 check_limit: 8830 CMPQ SI, $0x00 8831 JHI loop 8832 RET 8833 8834 // func AmdAxpyUnsafeX_V3A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8835 // Requires: SSE 8836 TEXT ·AmdAxpyUnsafeX_V3A11R4(SB), NOSPLIT, 
$0-48 8837 MOVSS alpha+0(FP), X0 8838 MOVQ xs+8(FP), AX 8839 MOVQ incx+16(FP), CX 8840 MOVQ ys+24(FP), DX 8841 MOVQ incy+32(FP), BX 8842 MOVQ n+40(FP), SI 8843 XORQ DI, DI 8844 XORQ R8, R8 8845 JMP check_limit_unroll 8846 PCALIGN $0x08 8847 NOP 8848 NOP 8849 NOP 8850 8851 loop_unroll: 8852 MOVSS (AX)(DI*4), X1 8853 MULSS X0, X1 8854 ADDSS (DX)(R8*4), X1 8855 MOVSS X1, (DX)(R8*4) 8856 ADDQ CX, DI 8857 ADDQ BX, R8 8858 MOVSS (AX)(DI*4), X1 8859 MULSS X0, X1 8860 ADDSS (DX)(R8*4), X1 8861 MOVSS X1, (DX)(R8*4) 8862 ADDQ CX, DI 8863 ADDQ BX, R8 8864 MOVSS (AX)(DI*4), X1 8865 MULSS X0, X1 8866 ADDSS (DX)(R8*4), X1 8867 MOVSS X1, (DX)(R8*4) 8868 ADDQ CX, DI 8869 ADDQ BX, R8 8870 MOVSS (AX)(DI*4), X1 8871 MULSS X0, X1 8872 ADDSS (DX)(R8*4), X1 8873 MOVSS X1, (DX)(R8*4) 8874 ADDQ CX, DI 8875 ADDQ BX, R8 8876 SUBQ $0x04, SI 8877 8878 check_limit_unroll: 8879 CMPQ SI, $0x04 8880 JHI loop_unroll 8881 JMP check_limit 8882 8883 loop: 8884 MOVSS (AX)(DI*4), X1 8885 MULSS X0, X1 8886 ADDSS (DX)(R8*4), X1 8887 MOVSS X1, (DX)(R8*4) 8888 DECQ SI 8889 ADDQ CX, DI 8890 ADDQ BX, R8 8891 8892 check_limit: 8893 CMPQ SI, $0x00 8894 JHI loop 8895 RET 8896 8897 // func AmdAxpyUnsafeX_V4A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8898 // Requires: SSE 8899 TEXT ·AmdAxpyUnsafeX_V4A11R4(SB), NOSPLIT, $0-48 8900 MOVSS alpha+0(FP), X0 8901 MOVQ xs+8(FP), AX 8902 MOVQ incx+16(FP), CX 8903 MOVQ ys+24(FP), DX 8904 MOVQ incy+32(FP), BX 8905 MOVQ n+40(FP), SI 8906 XORQ DI, DI 8907 XORQ R8, R8 8908 JMP check_limit_unroll 8909 PCALIGN $0x08 8910 NOP 8911 NOP 8912 NOP 8913 8914 loop_unroll: 8915 MOVSS (AX)(DI*4), X1 8916 MULSS X0, X1 8917 ADDSS (DX)(R8*4), X1 8918 MOVSS X1, (DX)(R8*4) 8919 ADDQ CX, DI 8920 ADDQ BX, R8 8921 MOVSS (AX)(DI*4), X1 8922 MULSS X0, X1 8923 ADDSS (DX)(R8*4), X1 8924 MOVSS X1, (DX)(R8*4) 8925 ADDQ CX, DI 8926 ADDQ BX, R8 8927 MOVSS (AX)(DI*4), X1 8928 MULSS X0, X1 8929 ADDSS (DX)(R8*4), X1 8930 MOVSS X1, (DX)(R8*4) 8931 ADDQ CX, DI 8932 ADDQ BX, R8 8933 MOVSS (AX)(DI*4), X1 8934 MULSS X0, X1 8935 ADDSS (DX)(R8*4), X1 8936 MOVSS X1, (DX)(R8*4) 8937 ADDQ CX, DI 8938 ADDQ BX, R8 8939 SUBQ $0x04, SI 8940 8941 check_limit_unroll: 8942 CMPQ SI, $0x04 8943 JHI loop_unroll 8944 JMP check_limit 8945 8946 loop: 8947 MOVSS (AX)(DI*4), X1 8948 MULSS X0, X1 8949 ADDSS (DX)(R8*4), X1 8950 MOVSS X1, (DX)(R8*4) 8951 DECQ SI 8952 ADDQ CX, DI 8953 ADDQ BX, R8 8954 8955 check_limit: 8956 CMPQ SI, $0x00 8957 JHI loop 8958 RET 8959 8960 // func AmdAxpyUnsafeX_V5A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 8961 // Requires: SSE 8962 TEXT ·AmdAxpyUnsafeX_V5A11R4(SB), NOSPLIT, $0-48 8963 MOVSS alpha+0(FP), X0 8964 MOVQ xs+8(FP), AX 8965 MOVQ incx+16(FP), CX 8966 MOVQ ys+24(FP), DX 8967 MOVQ incy+32(FP), BX 8968 MOVQ n+40(FP), SI 8969 XORQ DI, DI 8970 XORQ R8, R8 8971 JMP check_limit_unroll 8972 PCALIGN $0x08 8973 NOP 8974 NOP 8975 NOP 8976 8977 loop_unroll: 8978 MOVSS (AX)(DI*4), X1 8979 MULSS X0, X1 8980 ADDSS (DX)(R8*4), X1 8981 MOVSS X1, (DX)(R8*4) 8982 ADDQ CX, DI 8983 ADDQ BX, R8 8984 MOVSS (AX)(DI*4), X1 8985 MULSS X0, X1 8986 ADDSS (DX)(R8*4), X1 8987 MOVSS X1, (DX)(R8*4) 8988 ADDQ CX, DI 8989 ADDQ BX, R8 8990 MOVSS (AX)(DI*4), X1 8991 MULSS X0, X1 8992 ADDSS (DX)(R8*4), X1 8993 MOVSS X1, (DX)(R8*4) 8994 ADDQ CX, DI 8995 ADDQ BX, R8 8996 MOVSS (AX)(DI*4), X1 8997 MULSS X0, X1 8998 ADDSS (DX)(R8*4), X1 8999 MOVSS X1, (DX)(R8*4) 9000 ADDQ CX, DI 9001 ADDQ BX, R8 9002 SUBQ $0x04, SI 9003 9004 check_limit_unroll: 9005 CMPQ SI, $0x04 9006 JHI 
loop_unroll 9007 JMP check_limit 9008 9009 loop: 9010 MOVSS (AX)(DI*4), X1 9011 MULSS X0, X1 9012 ADDSS (DX)(R8*4), X1 9013 MOVSS X1, (DX)(R8*4) 9014 DECQ SI 9015 ADDQ CX, DI 9016 ADDQ BX, R8 9017 9018 check_limit: 9019 CMPQ SI, $0x00 9020 JHI loop 9021 RET 9022 9023 // func AmdAxpyUnsafeX_V0A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9024 // Requires: SSE 9025 TEXT ·AmdAxpyUnsafeX_V0A12R4(SB), NOSPLIT, $0-48 9026 MOVSS alpha+0(FP), X0 9027 MOVQ xs+8(FP), AX 9028 MOVQ incx+16(FP), CX 9029 MOVQ ys+24(FP), DX 9030 MOVQ incy+32(FP), BX 9031 MOVQ n+40(FP), SI 9032 XORQ DI, DI 9033 XORQ R8, R8 9034 JMP check_limit_unroll 9035 PCALIGN $0x08 9036 NOP 9037 NOP 9038 NOP 9039 NOP 9040 9041 loop_unroll: 9042 MOVSS (AX)(DI*4), X1 9043 MULSS X0, X1 9044 ADDSS (DX)(R8*4), X1 9045 MOVSS X1, (DX)(R8*4) 9046 ADDQ CX, DI 9047 ADDQ BX, R8 9048 MOVSS (AX)(DI*4), X1 9049 MULSS X0, X1 9050 ADDSS (DX)(R8*4), X1 9051 MOVSS X1, (DX)(R8*4) 9052 ADDQ CX, DI 9053 ADDQ BX, R8 9054 MOVSS (AX)(DI*4), X1 9055 MULSS X0, X1 9056 ADDSS (DX)(R8*4), X1 9057 MOVSS X1, (DX)(R8*4) 9058 ADDQ CX, DI 9059 ADDQ BX, R8 9060 MOVSS (AX)(DI*4), X1 9061 MULSS X0, X1 9062 ADDSS (DX)(R8*4), X1 9063 MOVSS X1, (DX)(R8*4) 9064 ADDQ CX, DI 9065 ADDQ BX, R8 9066 SUBQ $0x04, SI 9067 9068 check_limit_unroll: 9069 CMPQ SI, $0x04 9070 JHI loop_unroll 9071 JMP check_limit 9072 9073 loop: 9074 MOVSS (AX)(DI*4), X1 9075 MULSS X0, X1 9076 ADDSS (DX)(R8*4), X1 9077 MOVSS X1, (DX)(R8*4) 9078 DECQ SI 9079 ADDQ CX, DI 9080 ADDQ BX, R8 9081 9082 check_limit: 9083 CMPQ SI, $0x00 9084 JHI loop 9085 RET 9086 9087 // func AmdAxpyUnsafeX_V1A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9088 // Requires: SSE 9089 TEXT ·AmdAxpyUnsafeX_V1A12R4(SB), NOSPLIT, $0-48 9090 MOVSS alpha+0(FP), X0 9091 MOVQ xs+8(FP), AX 9092 MOVQ incx+16(FP), CX 9093 MOVQ ys+24(FP), DX 9094 MOVQ incy+32(FP), BX 9095 MOVQ n+40(FP), SI 9096 XORQ DI, DI 9097 XORQ R8, R8 9098 JMP check_limit_unroll 9099 PCALIGN $0x08 9100 NOP 9101 NOP 9102 NOP 9103 NOP 9104 9105 loop_unroll: 9106 MOVSS (AX)(DI*4), X1 9107 MULSS X0, X1 9108 ADDSS (DX)(R8*4), X1 9109 MOVSS X1, (DX)(R8*4) 9110 ADDQ CX, DI 9111 ADDQ BX, R8 9112 MOVSS (AX)(DI*4), X1 9113 MULSS X0, X1 9114 ADDSS (DX)(R8*4), X1 9115 MOVSS X1, (DX)(R8*4) 9116 ADDQ CX, DI 9117 ADDQ BX, R8 9118 MOVSS (AX)(DI*4), X1 9119 MULSS X0, X1 9120 ADDSS (DX)(R8*4), X1 9121 MOVSS X1, (DX)(R8*4) 9122 ADDQ CX, DI 9123 ADDQ BX, R8 9124 MOVSS (AX)(DI*4), X1 9125 MULSS X0, X1 9126 ADDSS (DX)(R8*4), X1 9127 MOVSS X1, (DX)(R8*4) 9128 ADDQ CX, DI 9129 ADDQ BX, R8 9130 SUBQ $0x04, SI 9131 9132 check_limit_unroll: 9133 CMPQ SI, $0x04 9134 JHI loop_unroll 9135 JMP check_limit 9136 9137 loop: 9138 MOVSS (AX)(DI*4), X1 9139 MULSS X0, X1 9140 ADDSS (DX)(R8*4), X1 9141 MOVSS X1, (DX)(R8*4) 9142 DECQ SI 9143 ADDQ CX, DI 9144 ADDQ BX, R8 9145 9146 check_limit: 9147 CMPQ SI, $0x00 9148 JHI loop 9149 RET 9150 9151 // func AmdAxpyUnsafeX_V2A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9152 // Requires: SSE 9153 TEXT ·AmdAxpyUnsafeX_V2A12R4(SB), NOSPLIT, $0-48 9154 MOVSS alpha+0(FP), X0 9155 MOVQ xs+8(FP), AX 9156 MOVQ incx+16(FP), CX 9157 MOVQ ys+24(FP), DX 9158 MOVQ incy+32(FP), BX 9159 MOVQ n+40(FP), SI 9160 XORQ DI, DI 9161 XORQ R8, R8 9162 JMP check_limit_unroll 9163 PCALIGN $0x08 9164 NOP 9165 NOP 9166 NOP 9167 NOP 9168 9169 loop_unroll: 9170 MOVSS (AX)(DI*4), X1 9171 MULSS X0, X1 9172 ADDSS (DX)(R8*4), X1 9173 MOVSS X1, (DX)(R8*4) 9174 ADDQ CX, DI 9175 ADDQ 
BX, R8 9176 MOVSS (AX)(DI*4), X1 9177 MULSS X0, X1 9178 ADDSS (DX)(R8*4), X1 9179 MOVSS X1, (DX)(R8*4) 9180 ADDQ CX, DI 9181 ADDQ BX, R8 9182 MOVSS (AX)(DI*4), X1 9183 MULSS X0, X1 9184 ADDSS (DX)(R8*4), X1 9185 MOVSS X1, (DX)(R8*4) 9186 ADDQ CX, DI 9187 ADDQ BX, R8 9188 MOVSS (AX)(DI*4), X1 9189 MULSS X0, X1 9190 ADDSS (DX)(R8*4), X1 9191 MOVSS X1, (DX)(R8*4) 9192 ADDQ CX, DI 9193 ADDQ BX, R8 9194 SUBQ $0x04, SI 9195 9196 check_limit_unroll: 9197 CMPQ SI, $0x04 9198 JHI loop_unroll 9199 JMP check_limit 9200 9201 loop: 9202 MOVSS (AX)(DI*4), X1 9203 MULSS X0, X1 9204 ADDSS (DX)(R8*4), X1 9205 MOVSS X1, (DX)(R8*4) 9206 DECQ SI 9207 ADDQ CX, DI 9208 ADDQ BX, R8 9209 9210 check_limit: 9211 CMPQ SI, $0x00 9212 JHI loop 9213 RET 9214 9215 // func AmdAxpyUnsafeX_V3A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9216 // Requires: SSE 9217 TEXT ·AmdAxpyUnsafeX_V3A12R4(SB), NOSPLIT, $0-48 9218 MOVSS alpha+0(FP), X0 9219 MOVQ xs+8(FP), AX 9220 MOVQ incx+16(FP), CX 9221 MOVQ ys+24(FP), DX 9222 MOVQ incy+32(FP), BX 9223 MOVQ n+40(FP), SI 9224 XORQ DI, DI 9225 XORQ R8, R8 9226 JMP check_limit_unroll 9227 PCALIGN $0x08 9228 NOP 9229 NOP 9230 NOP 9231 NOP 9232 9233 loop_unroll: 9234 MOVSS (AX)(DI*4), X1 9235 MULSS X0, X1 9236 ADDSS (DX)(R8*4), X1 9237 MOVSS X1, (DX)(R8*4) 9238 ADDQ CX, DI 9239 ADDQ BX, R8 9240 MOVSS (AX)(DI*4), X1 9241 MULSS X0, X1 9242 ADDSS (DX)(R8*4), X1 9243 MOVSS X1, (DX)(R8*4) 9244 ADDQ CX, DI 9245 ADDQ BX, R8 9246 MOVSS (AX)(DI*4), X1 9247 MULSS X0, X1 9248 ADDSS (DX)(R8*4), X1 9249 MOVSS X1, (DX)(R8*4) 9250 ADDQ CX, DI 9251 ADDQ BX, R8 9252 MOVSS (AX)(DI*4), X1 9253 MULSS X0, X1 9254 ADDSS (DX)(R8*4), X1 9255 MOVSS X1, (DX)(R8*4) 9256 ADDQ CX, DI 9257 ADDQ BX, R8 9258 SUBQ $0x04, SI 9259 9260 check_limit_unroll: 9261 CMPQ SI, $0x04 9262 JHI loop_unroll 9263 JMP check_limit 9264 9265 loop: 9266 MOVSS (AX)(DI*4), X1 9267 MULSS X0, X1 9268 ADDSS (DX)(R8*4), X1 9269 MOVSS X1, (DX)(R8*4) 9270 DECQ SI 9271 ADDQ CX, DI 9272 ADDQ BX, R8 9273 9274 check_limit: 9275 CMPQ SI, $0x00 9276 JHI loop 9277 RET 9278 9279 // func AmdAxpyUnsafeX_V4A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9280 // Requires: SSE 9281 TEXT ·AmdAxpyUnsafeX_V4A12R4(SB), NOSPLIT, $0-48 9282 MOVSS alpha+0(FP), X0 9283 MOVQ xs+8(FP), AX 9284 MOVQ incx+16(FP), CX 9285 MOVQ ys+24(FP), DX 9286 MOVQ incy+32(FP), BX 9287 MOVQ n+40(FP), SI 9288 XORQ DI, DI 9289 XORQ R8, R8 9290 JMP check_limit_unroll 9291 PCALIGN $0x08 9292 NOP 9293 NOP 9294 NOP 9295 NOP 9296 9297 loop_unroll: 9298 MOVSS (AX)(DI*4), X1 9299 MULSS X0, X1 9300 ADDSS (DX)(R8*4), X1 9301 MOVSS X1, (DX)(R8*4) 9302 ADDQ CX, DI 9303 ADDQ BX, R8 9304 MOVSS (AX)(DI*4), X1 9305 MULSS X0, X1 9306 ADDSS (DX)(R8*4), X1 9307 MOVSS X1, (DX)(R8*4) 9308 ADDQ CX, DI 9309 ADDQ BX, R8 9310 MOVSS (AX)(DI*4), X1 9311 MULSS X0, X1 9312 ADDSS (DX)(R8*4), X1 9313 MOVSS X1, (DX)(R8*4) 9314 ADDQ CX, DI 9315 ADDQ BX, R8 9316 MOVSS (AX)(DI*4), X1 9317 MULSS X0, X1 9318 ADDSS (DX)(R8*4), X1 9319 MOVSS X1, (DX)(R8*4) 9320 ADDQ CX, DI 9321 ADDQ BX, R8 9322 SUBQ $0x04, SI 9323 9324 check_limit_unroll: 9325 CMPQ SI, $0x04 9326 JHI loop_unroll 9327 JMP check_limit 9328 9329 loop: 9330 MOVSS (AX)(DI*4), X1 9331 MULSS X0, X1 9332 ADDSS (DX)(R8*4), X1 9333 MOVSS X1, (DX)(R8*4) 9334 DECQ SI 9335 ADDQ CX, DI 9336 ADDQ BX, R8 9337 9338 check_limit: 9339 CMPQ SI, $0x00 9340 JHI loop 9341 RET 9342 9343 // func AmdAxpyUnsafeX_V5A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9344 // 
Requires: SSE 9345 TEXT ·AmdAxpyUnsafeX_V5A12R4(SB), NOSPLIT, $0-48 9346 MOVSS alpha+0(FP), X0 9347 MOVQ xs+8(FP), AX 9348 MOVQ incx+16(FP), CX 9349 MOVQ ys+24(FP), DX 9350 MOVQ incy+32(FP), BX 9351 MOVQ n+40(FP), SI 9352 XORQ DI, DI 9353 XORQ R8, R8 9354 JMP check_limit_unroll 9355 PCALIGN $0x08 9356 NOP 9357 NOP 9358 NOP 9359 NOP 9360 9361 loop_unroll: 9362 MOVSS (AX)(DI*4), X1 9363 MULSS X0, X1 9364 ADDSS (DX)(R8*4), X1 9365 MOVSS X1, (DX)(R8*4) 9366 ADDQ CX, DI 9367 ADDQ BX, R8 9368 MOVSS (AX)(DI*4), X1 9369 MULSS X0, X1 9370 ADDSS (DX)(R8*4), X1 9371 MOVSS X1, (DX)(R8*4) 9372 ADDQ CX, DI 9373 ADDQ BX, R8 9374 MOVSS (AX)(DI*4), X1 9375 MULSS X0, X1 9376 ADDSS (DX)(R8*4), X1 9377 MOVSS X1, (DX)(R8*4) 9378 ADDQ CX, DI 9379 ADDQ BX, R8 9380 MOVSS (AX)(DI*4), X1 9381 MULSS X0, X1 9382 ADDSS (DX)(R8*4), X1 9383 MOVSS X1, (DX)(R8*4) 9384 ADDQ CX, DI 9385 ADDQ BX, R8 9386 SUBQ $0x04, SI 9387 9388 check_limit_unroll: 9389 CMPQ SI, $0x04 9390 JHI loop_unroll 9391 JMP check_limit 9392 9393 loop: 9394 MOVSS (AX)(DI*4), X1 9395 MULSS X0, X1 9396 ADDSS (DX)(R8*4), X1 9397 MOVSS X1, (DX)(R8*4) 9398 DECQ SI 9399 ADDQ CX, DI 9400 ADDQ BX, R8 9401 9402 check_limit: 9403 CMPQ SI, $0x00 9404 JHI loop 9405 RET 9406 9407 // func AmdAxpyUnsafeX_V0A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9408 // Requires: SSE 9409 TEXT ·AmdAxpyUnsafeX_V0A13R4(SB), NOSPLIT, $0-48 9410 MOVSS alpha+0(FP), X0 9411 MOVQ xs+8(FP), AX 9412 MOVQ incx+16(FP), CX 9413 MOVQ ys+24(FP), DX 9414 MOVQ incy+32(FP), BX 9415 MOVQ n+40(FP), SI 9416 XORQ DI, DI 9417 XORQ R8, R8 9418 JMP check_limit_unroll 9419 PCALIGN $0x08 9420 NOP 9421 NOP 9422 NOP 9423 NOP 9424 NOP 9425 9426 loop_unroll: 9427 MOVSS (AX)(DI*4), X1 9428 MULSS X0, X1 9429 ADDSS (DX)(R8*4), X1 9430 MOVSS X1, (DX)(R8*4) 9431 ADDQ CX, DI 9432 ADDQ BX, R8 9433 MOVSS (AX)(DI*4), X1 9434 MULSS X0, X1 9435 ADDSS (DX)(R8*4), X1 9436 MOVSS X1, (DX)(R8*4) 9437 ADDQ CX, DI 9438 ADDQ BX, R8 9439 MOVSS (AX)(DI*4), X1 9440 MULSS X0, X1 9441 ADDSS (DX)(R8*4), X1 9442 MOVSS X1, (DX)(R8*4) 9443 ADDQ CX, DI 9444 ADDQ BX, R8 9445 MOVSS (AX)(DI*4), X1 9446 MULSS X0, X1 9447 ADDSS (DX)(R8*4), X1 9448 MOVSS X1, (DX)(R8*4) 9449 ADDQ CX, DI 9450 ADDQ BX, R8 9451 SUBQ $0x04, SI 9452 9453 check_limit_unroll: 9454 CMPQ SI, $0x04 9455 JHI loop_unroll 9456 JMP check_limit 9457 9458 loop: 9459 MOVSS (AX)(DI*4), X1 9460 MULSS X0, X1 9461 ADDSS (DX)(R8*4), X1 9462 MOVSS X1, (DX)(R8*4) 9463 DECQ SI 9464 ADDQ CX, DI 9465 ADDQ BX, R8 9466 9467 check_limit: 9468 CMPQ SI, $0x00 9469 JHI loop 9470 RET 9471 9472 // func AmdAxpyUnsafeX_V1A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9473 // Requires: SSE 9474 TEXT ·AmdAxpyUnsafeX_V1A13R4(SB), NOSPLIT, $0-48 9475 MOVSS alpha+0(FP), X0 9476 MOVQ xs+8(FP), AX 9477 MOVQ incx+16(FP), CX 9478 MOVQ ys+24(FP), DX 9479 MOVQ incy+32(FP), BX 9480 MOVQ n+40(FP), SI 9481 XORQ DI, DI 9482 XORQ R8, R8 9483 JMP check_limit_unroll 9484 PCALIGN $0x08 9485 NOP 9486 NOP 9487 NOP 9488 NOP 9489 NOP 9490 9491 loop_unroll: 9492 MOVSS (AX)(DI*4), X1 9493 MULSS X0, X1 9494 ADDSS (DX)(R8*4), X1 9495 MOVSS X1, (DX)(R8*4) 9496 ADDQ CX, DI 9497 ADDQ BX, R8 9498 MOVSS (AX)(DI*4), X1 9499 MULSS X0, X1 9500 ADDSS (DX)(R8*4), X1 9501 MOVSS X1, (DX)(R8*4) 9502 ADDQ CX, DI 9503 ADDQ BX, R8 9504 MOVSS (AX)(DI*4), X1 9505 MULSS X0, X1 9506 ADDSS (DX)(R8*4), X1 9507 MOVSS X1, (DX)(R8*4) 9508 ADDQ CX, DI 9509 ADDQ BX, R8 9510 MOVSS (AX)(DI*4), X1 9511 MULSS X0, X1 9512 ADDSS (DX)(R8*4), X1 9513 MOVSS X1, (DX)(R8*4) 9514 
ADDQ CX, DI 9515 ADDQ BX, R8 9516 SUBQ $0x04, SI 9517 9518 check_limit_unroll: 9519 CMPQ SI, $0x04 9520 JHI loop_unroll 9521 JMP check_limit 9522 9523 loop: 9524 MOVSS (AX)(DI*4), X1 9525 MULSS X0, X1 9526 ADDSS (DX)(R8*4), X1 9527 MOVSS X1, (DX)(R8*4) 9528 DECQ SI 9529 ADDQ CX, DI 9530 ADDQ BX, R8 9531 9532 check_limit: 9533 CMPQ SI, $0x00 9534 JHI loop 9535 RET 9536 9537 // func AmdAxpyUnsafeX_V2A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9538 // Requires: SSE 9539 TEXT ·AmdAxpyUnsafeX_V2A13R4(SB), NOSPLIT, $0-48 9540 MOVSS alpha+0(FP), X0 9541 MOVQ xs+8(FP), AX 9542 MOVQ incx+16(FP), CX 9543 MOVQ ys+24(FP), DX 9544 MOVQ incy+32(FP), BX 9545 MOVQ n+40(FP), SI 9546 XORQ DI, DI 9547 XORQ R8, R8 9548 JMP check_limit_unroll 9549 PCALIGN $0x08 9550 NOP 9551 NOP 9552 NOP 9553 NOP 9554 NOP 9555 9556 loop_unroll: 9557 MOVSS (AX)(DI*4), X1 9558 MULSS X0, X1 9559 ADDSS (DX)(R8*4), X1 9560 MOVSS X1, (DX)(R8*4) 9561 ADDQ CX, DI 9562 ADDQ BX, R8 9563 MOVSS (AX)(DI*4), X1 9564 MULSS X0, X1 9565 ADDSS (DX)(R8*4), X1 9566 MOVSS X1, (DX)(R8*4) 9567 ADDQ CX, DI 9568 ADDQ BX, R8 9569 MOVSS (AX)(DI*4), X1 9570 MULSS X0, X1 9571 ADDSS (DX)(R8*4), X1 9572 MOVSS X1, (DX)(R8*4) 9573 ADDQ CX, DI 9574 ADDQ BX, R8 9575 MOVSS (AX)(DI*4), X1 9576 MULSS X0, X1 9577 ADDSS (DX)(R8*4), X1 9578 MOVSS X1, (DX)(R8*4) 9579 ADDQ CX, DI 9580 ADDQ BX, R8 9581 SUBQ $0x04, SI 9582 9583 check_limit_unroll: 9584 CMPQ SI, $0x04 9585 JHI loop_unroll 9586 JMP check_limit 9587 9588 loop: 9589 MOVSS (AX)(DI*4), X1 9590 MULSS X0, X1 9591 ADDSS (DX)(R8*4), X1 9592 MOVSS X1, (DX)(R8*4) 9593 DECQ SI 9594 ADDQ CX, DI 9595 ADDQ BX, R8 9596 9597 check_limit: 9598 CMPQ SI, $0x00 9599 JHI loop 9600 RET 9601 9602 // func AmdAxpyUnsafeX_V3A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9603 // Requires: SSE 9604 TEXT ·AmdAxpyUnsafeX_V3A13R4(SB), NOSPLIT, $0-48 9605 MOVSS alpha+0(FP), X0 9606 MOVQ xs+8(FP), AX 9607 MOVQ incx+16(FP), CX 9608 MOVQ ys+24(FP), DX 9609 MOVQ incy+32(FP), BX 9610 MOVQ n+40(FP), SI 9611 XORQ DI, DI 9612 XORQ R8, R8 9613 JMP check_limit_unroll 9614 PCALIGN $0x08 9615 NOP 9616 NOP 9617 NOP 9618 NOP 9619 NOP 9620 9621 loop_unroll: 9622 MOVSS (AX)(DI*4), X1 9623 MULSS X0, X1 9624 ADDSS (DX)(R8*4), X1 9625 MOVSS X1, (DX)(R8*4) 9626 ADDQ CX, DI 9627 ADDQ BX, R8 9628 MOVSS (AX)(DI*4), X1 9629 MULSS X0, X1 9630 ADDSS (DX)(R8*4), X1 9631 MOVSS X1, (DX)(R8*4) 9632 ADDQ CX, DI 9633 ADDQ BX, R8 9634 MOVSS (AX)(DI*4), X1 9635 MULSS X0, X1 9636 ADDSS (DX)(R8*4), X1 9637 MOVSS X1, (DX)(R8*4) 9638 ADDQ CX, DI 9639 ADDQ BX, R8 9640 MOVSS (AX)(DI*4), X1 9641 MULSS X0, X1 9642 ADDSS (DX)(R8*4), X1 9643 MOVSS X1, (DX)(R8*4) 9644 ADDQ CX, DI 9645 ADDQ BX, R8 9646 SUBQ $0x04, SI 9647 9648 check_limit_unroll: 9649 CMPQ SI, $0x04 9650 JHI loop_unroll 9651 JMP check_limit 9652 9653 loop: 9654 MOVSS (AX)(DI*4), X1 9655 MULSS X0, X1 9656 ADDSS (DX)(R8*4), X1 9657 MOVSS X1, (DX)(R8*4) 9658 DECQ SI 9659 ADDQ CX, DI 9660 ADDQ BX, R8 9661 9662 check_limit: 9663 CMPQ SI, $0x00 9664 JHI loop 9665 RET 9666 9667 // func AmdAxpyUnsafeX_V4A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9668 // Requires: SSE 9669 TEXT ·AmdAxpyUnsafeX_V4A13R4(SB), NOSPLIT, $0-48 9670 MOVSS alpha+0(FP), X0 9671 MOVQ xs+8(FP), AX 9672 MOVQ incx+16(FP), CX 9673 MOVQ ys+24(FP), DX 9674 MOVQ incy+32(FP), BX 9675 MOVQ n+40(FP), SI 9676 XORQ DI, DI 9677 XORQ R8, R8 9678 JMP check_limit_unroll 9679 PCALIGN $0x08 9680 NOP 9681 NOP 9682 NOP 9683 NOP 9684 NOP 9685 9686 
loop_unroll: 9687 MOVSS (AX)(DI*4), X1 9688 MULSS X0, X1 9689 ADDSS (DX)(R8*4), X1 9690 MOVSS X1, (DX)(R8*4) 9691 ADDQ CX, DI 9692 ADDQ BX, R8 9693 MOVSS (AX)(DI*4), X1 9694 MULSS X0, X1 9695 ADDSS (DX)(R8*4), X1 9696 MOVSS X1, (DX)(R8*4) 9697 ADDQ CX, DI 9698 ADDQ BX, R8 9699 MOVSS (AX)(DI*4), X1 9700 MULSS X0, X1 9701 ADDSS (DX)(R8*4), X1 9702 MOVSS X1, (DX)(R8*4) 9703 ADDQ CX, DI 9704 ADDQ BX, R8 9705 MOVSS (AX)(DI*4), X1 9706 MULSS X0, X1 9707 ADDSS (DX)(R8*4), X1 9708 MOVSS X1, (DX)(R8*4) 9709 ADDQ CX, DI 9710 ADDQ BX, R8 9711 SUBQ $0x04, SI 9712 9713 check_limit_unroll: 9714 CMPQ SI, $0x04 9715 JHI loop_unroll 9716 JMP check_limit 9717 9718 loop: 9719 MOVSS (AX)(DI*4), X1 9720 MULSS X0, X1 9721 ADDSS (DX)(R8*4), X1 9722 MOVSS X1, (DX)(R8*4) 9723 DECQ SI 9724 ADDQ CX, DI 9725 ADDQ BX, R8 9726 9727 check_limit: 9728 CMPQ SI, $0x00 9729 JHI loop 9730 RET 9731 9732 // func AmdAxpyUnsafeX_V5A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9733 // Requires: SSE 9734 TEXT ·AmdAxpyUnsafeX_V5A13R4(SB), NOSPLIT, $0-48 9735 MOVSS alpha+0(FP), X0 9736 MOVQ xs+8(FP), AX 9737 MOVQ incx+16(FP), CX 9738 MOVQ ys+24(FP), DX 9739 MOVQ incy+32(FP), BX 9740 MOVQ n+40(FP), SI 9741 XORQ DI, DI 9742 XORQ R8, R8 9743 JMP check_limit_unroll 9744 PCALIGN $0x08 9745 NOP 9746 NOP 9747 NOP 9748 NOP 9749 NOP 9750 9751 loop_unroll: 9752 MOVSS (AX)(DI*4), X1 9753 MULSS X0, X1 9754 ADDSS (DX)(R8*4), X1 9755 MOVSS X1, (DX)(R8*4) 9756 ADDQ CX, DI 9757 ADDQ BX, R8 9758 MOVSS (AX)(DI*4), X1 9759 MULSS X0, X1 9760 ADDSS (DX)(R8*4), X1 9761 MOVSS X1, (DX)(R8*4) 9762 ADDQ CX, DI 9763 ADDQ BX, R8 9764 MOVSS (AX)(DI*4), X1 9765 MULSS X0, X1 9766 ADDSS (DX)(R8*4), X1 9767 MOVSS X1, (DX)(R8*4) 9768 ADDQ CX, DI 9769 ADDQ BX, R8 9770 MOVSS (AX)(DI*4), X1 9771 MULSS X0, X1 9772 ADDSS (DX)(R8*4), X1 9773 MOVSS X1, (DX)(R8*4) 9774 ADDQ CX, DI 9775 ADDQ BX, R8 9776 SUBQ $0x04, SI 9777 9778 check_limit_unroll: 9779 CMPQ SI, $0x04 9780 JHI loop_unroll 9781 JMP check_limit 9782 9783 loop: 9784 MOVSS (AX)(DI*4), X1 9785 MULSS X0, X1 9786 ADDSS (DX)(R8*4), X1 9787 MOVSS X1, (DX)(R8*4) 9788 DECQ SI 9789 ADDQ CX, DI 9790 ADDQ BX, R8 9791 9792 check_limit: 9793 CMPQ SI, $0x00 9794 JHI loop 9795 RET 9796 9797 // func AmdAxpyUnsafeX_V0A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9798 // Requires: SSE 9799 TEXT ·AmdAxpyUnsafeX_V0A14R4(SB), NOSPLIT, $0-48 9800 MOVSS alpha+0(FP), X0 9801 MOVQ xs+8(FP), AX 9802 MOVQ incx+16(FP), CX 9803 MOVQ ys+24(FP), DX 9804 MOVQ incy+32(FP), BX 9805 MOVQ n+40(FP), SI 9806 XORQ DI, DI 9807 XORQ R8, R8 9808 JMP check_limit_unroll 9809 PCALIGN $0x08 9810 NOP 9811 NOP 9812 NOP 9813 NOP 9814 NOP 9815 NOP 9816 9817 loop_unroll: 9818 MOVSS (AX)(DI*4), X1 9819 MULSS X0, X1 9820 ADDSS (DX)(R8*4), X1 9821 MOVSS X1, (DX)(R8*4) 9822 ADDQ CX, DI 9823 ADDQ BX, R8 9824 MOVSS (AX)(DI*4), X1 9825 MULSS X0, X1 9826 ADDSS (DX)(R8*4), X1 9827 MOVSS X1, (DX)(R8*4) 9828 ADDQ CX, DI 9829 ADDQ BX, R8 9830 MOVSS (AX)(DI*4), X1 9831 MULSS X0, X1 9832 ADDSS (DX)(R8*4), X1 9833 MOVSS X1, (DX)(R8*4) 9834 ADDQ CX, DI 9835 ADDQ BX, R8 9836 MOVSS (AX)(DI*4), X1 9837 MULSS X0, X1 9838 ADDSS (DX)(R8*4), X1 9839 MOVSS X1, (DX)(R8*4) 9840 ADDQ CX, DI 9841 ADDQ BX, R8 9842 SUBQ $0x04, SI 9843 9844 check_limit_unroll: 9845 CMPQ SI, $0x04 9846 JHI loop_unroll 9847 JMP check_limit 9848 9849 loop: 9850 MOVSS (AX)(DI*4), X1 9851 MULSS X0, X1 9852 ADDSS (DX)(R8*4), X1 9853 MOVSS X1, (DX)(R8*4) 9854 DECQ SI 9855 ADDQ CX, DI 9856 ADDQ BX, R8 9857 9858 check_limit: 9859 CMPQ SI, 
$0x00 9860 JHI loop 9861 RET 9862 9863 // func AmdAxpyUnsafeX_V1A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9864 // Requires: SSE 9865 TEXT ·AmdAxpyUnsafeX_V1A14R4(SB), NOSPLIT, $0-48 9866 MOVSS alpha+0(FP), X0 9867 MOVQ xs+8(FP), AX 9868 MOVQ incx+16(FP), CX 9869 MOVQ ys+24(FP), DX 9870 MOVQ incy+32(FP), BX 9871 MOVQ n+40(FP), SI 9872 XORQ DI, DI 9873 XORQ R8, R8 9874 JMP check_limit_unroll 9875 PCALIGN $0x08 9876 NOP 9877 NOP 9878 NOP 9879 NOP 9880 NOP 9881 NOP 9882 9883 loop_unroll: 9884 MOVSS (AX)(DI*4), X1 9885 MULSS X0, X1 9886 ADDSS (DX)(R8*4), X1 9887 MOVSS X1, (DX)(R8*4) 9888 ADDQ CX, DI 9889 ADDQ BX, R8 9890 MOVSS (AX)(DI*4), X1 9891 MULSS X0, X1 9892 ADDSS (DX)(R8*4), X1 9893 MOVSS X1, (DX)(R8*4) 9894 ADDQ CX, DI 9895 ADDQ BX, R8 9896 MOVSS (AX)(DI*4), X1 9897 MULSS X0, X1 9898 ADDSS (DX)(R8*4), X1 9899 MOVSS X1, (DX)(R8*4) 9900 ADDQ CX, DI 9901 ADDQ BX, R8 9902 MOVSS (AX)(DI*4), X1 9903 MULSS X0, X1 9904 ADDSS (DX)(R8*4), X1 9905 MOVSS X1, (DX)(R8*4) 9906 ADDQ CX, DI 9907 ADDQ BX, R8 9908 SUBQ $0x04, SI 9909 9910 check_limit_unroll: 9911 CMPQ SI, $0x04 9912 JHI loop_unroll 9913 JMP check_limit 9914 9915 loop: 9916 MOVSS (AX)(DI*4), X1 9917 MULSS X0, X1 9918 ADDSS (DX)(R8*4), X1 9919 MOVSS X1, (DX)(R8*4) 9920 DECQ SI 9921 ADDQ CX, DI 9922 ADDQ BX, R8 9923 9924 check_limit: 9925 CMPQ SI, $0x00 9926 JHI loop 9927 RET 9928 9929 // func AmdAxpyUnsafeX_V2A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9930 // Requires: SSE 9931 TEXT ·AmdAxpyUnsafeX_V2A14R4(SB), NOSPLIT, $0-48 9932 MOVSS alpha+0(FP), X0 9933 MOVQ xs+8(FP), AX 9934 MOVQ incx+16(FP), CX 9935 MOVQ ys+24(FP), DX 9936 MOVQ incy+32(FP), BX 9937 MOVQ n+40(FP), SI 9938 XORQ DI, DI 9939 XORQ R8, R8 9940 JMP check_limit_unroll 9941 PCALIGN $0x08 9942 NOP 9943 NOP 9944 NOP 9945 NOP 9946 NOP 9947 NOP 9948 9949 loop_unroll: 9950 MOVSS (AX)(DI*4), X1 9951 MULSS X0, X1 9952 ADDSS (DX)(R8*4), X1 9953 MOVSS X1, (DX)(R8*4) 9954 ADDQ CX, DI 9955 ADDQ BX, R8 9956 MOVSS (AX)(DI*4), X1 9957 MULSS X0, X1 9958 ADDSS (DX)(R8*4), X1 9959 MOVSS X1, (DX)(R8*4) 9960 ADDQ CX, DI 9961 ADDQ BX, R8 9962 MOVSS (AX)(DI*4), X1 9963 MULSS X0, X1 9964 ADDSS (DX)(R8*4), X1 9965 MOVSS X1, (DX)(R8*4) 9966 ADDQ CX, DI 9967 ADDQ BX, R8 9968 MOVSS (AX)(DI*4), X1 9969 MULSS X0, X1 9970 ADDSS (DX)(R8*4), X1 9971 MOVSS X1, (DX)(R8*4) 9972 ADDQ CX, DI 9973 ADDQ BX, R8 9974 SUBQ $0x04, SI 9975 9976 check_limit_unroll: 9977 CMPQ SI, $0x04 9978 JHI loop_unroll 9979 JMP check_limit 9980 9981 loop: 9982 MOVSS (AX)(DI*4), X1 9983 MULSS X0, X1 9984 ADDSS (DX)(R8*4), X1 9985 MOVSS X1, (DX)(R8*4) 9986 DECQ SI 9987 ADDQ CX, DI 9988 ADDQ BX, R8 9989 9990 check_limit: 9991 CMPQ SI, $0x00 9992 JHI loop 9993 RET 9994 9995 // func AmdAxpyUnsafeX_V3A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 9996 // Requires: SSE 9997 TEXT ·AmdAxpyUnsafeX_V3A14R4(SB), NOSPLIT, $0-48 9998 MOVSS alpha+0(FP), X0 9999 MOVQ xs+8(FP), AX 10000 MOVQ incx+16(FP), CX 10001 MOVQ ys+24(FP), DX 10002 MOVQ incy+32(FP), BX 10003 MOVQ n+40(FP), SI 10004 XORQ DI, DI 10005 XORQ R8, R8 10006 JMP check_limit_unroll 10007 PCALIGN $0x08 10008 NOP 10009 NOP 10010 NOP 10011 NOP 10012 NOP 10013 NOP 10014 10015 loop_unroll: 10016 MOVSS (AX)(DI*4), X1 10017 MULSS X0, X1 10018 ADDSS (DX)(R8*4), X1 10019 MOVSS X1, (DX)(R8*4) 10020 ADDQ CX, DI 10021 ADDQ BX, R8 10022 MOVSS (AX)(DI*4), X1 10023 MULSS X0, X1 10024 ADDSS (DX)(R8*4), X1 10025 MOVSS X1, (DX)(R8*4) 10026 ADDQ CX, DI 10027 ADDQ BX, R8 10028 MOVSS 
(AX)(DI*4), X1 10029 MULSS X0, X1 10030 ADDSS (DX)(R8*4), X1 10031 MOVSS X1, (DX)(R8*4) 10032 ADDQ CX, DI 10033 ADDQ BX, R8 10034 MOVSS (AX)(DI*4), X1 10035 MULSS X0, X1 10036 ADDSS (DX)(R8*4), X1 10037 MOVSS X1, (DX)(R8*4) 10038 ADDQ CX, DI 10039 ADDQ BX, R8 10040 SUBQ $0x04, SI 10041 10042 check_limit_unroll: 10043 CMPQ SI, $0x04 10044 JHI loop_unroll 10045 JMP check_limit 10046 10047 loop: 10048 MOVSS (AX)(DI*4), X1 10049 MULSS X0, X1 10050 ADDSS (DX)(R8*4), X1 10051 MOVSS X1, (DX)(R8*4) 10052 DECQ SI 10053 ADDQ CX, DI 10054 ADDQ BX, R8 10055 10056 check_limit: 10057 CMPQ SI, $0x00 10058 JHI loop 10059 RET 10060 10061 // func AmdAxpyUnsafeX_V4A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 10062 // Requires: SSE 10063 TEXT ·AmdAxpyUnsafeX_V4A14R4(SB), NOSPLIT, $0-48 10064 MOVSS alpha+0(FP), X0 10065 MOVQ xs+8(FP), AX 10066 MOVQ incx+16(FP), CX 10067 MOVQ ys+24(FP), DX 10068 MOVQ incy+32(FP), BX 10069 MOVQ n+40(FP), SI 10070 XORQ DI, DI 10071 XORQ R8, R8 10072 JMP check_limit_unroll 10073 PCALIGN $0x08 10074 NOP 10075 NOP 10076 NOP 10077 NOP 10078 NOP 10079 NOP 10080 10081 loop_unroll: 10082 MOVSS (AX)(DI*4), X1 10083 MULSS X0, X1 10084 ADDSS (DX)(R8*4), X1 10085 MOVSS X1, (DX)(R8*4) 10086 ADDQ CX, DI 10087 ADDQ BX, R8 10088 MOVSS (AX)(DI*4), X1 10089 MULSS X0, X1 10090 ADDSS (DX)(R8*4), X1 10091 MOVSS X1, (DX)(R8*4) 10092 ADDQ CX, DI 10093 ADDQ BX, R8 10094 MOVSS (AX)(DI*4), X1 10095 MULSS X0, X1 10096 ADDSS (DX)(R8*4), X1 10097 MOVSS X1, (DX)(R8*4) 10098 ADDQ CX, DI 10099 ADDQ BX, R8 10100 MOVSS (AX)(DI*4), X1 10101 MULSS X0, X1 10102 ADDSS (DX)(R8*4), X1 10103 MOVSS X1, (DX)(R8*4) 10104 ADDQ CX, DI 10105 ADDQ BX, R8 10106 SUBQ $0x04, SI 10107 10108 check_limit_unroll: 10109 CMPQ SI, $0x04 10110 JHI loop_unroll 10111 JMP check_limit 10112 10113 loop: 10114 MOVSS (AX)(DI*4), X1 10115 MULSS X0, X1 10116 ADDSS (DX)(R8*4), X1 10117 MOVSS X1, (DX)(R8*4) 10118 DECQ SI 10119 ADDQ CX, DI 10120 ADDQ BX, R8 10121 10122 check_limit: 10123 CMPQ SI, $0x00 10124 JHI loop 10125 RET 10126 10127 // func AmdAxpyUnsafeX_V5A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 10128 // Requires: SSE 10129 TEXT ·AmdAxpyUnsafeX_V5A14R4(SB), NOSPLIT, $0-48 10130 MOVSS alpha+0(FP), X0 10131 MOVQ xs+8(FP), AX 10132 MOVQ incx+16(FP), CX 10133 MOVQ ys+24(FP), DX 10134 MOVQ incy+32(FP), BX 10135 MOVQ n+40(FP), SI 10136 XORQ DI, DI 10137 XORQ R8, R8 10138 JMP check_limit_unroll 10139 PCALIGN $0x08 10140 NOP 10141 NOP 10142 NOP 10143 NOP 10144 NOP 10145 NOP 10146 10147 loop_unroll: 10148 MOVSS (AX)(DI*4), X1 10149 MULSS X0, X1 10150 ADDSS (DX)(R8*4), X1 10151 MOVSS X1, (DX)(R8*4) 10152 ADDQ CX, DI 10153 ADDQ BX, R8 10154 MOVSS (AX)(DI*4), X1 10155 MULSS X0, X1 10156 ADDSS (DX)(R8*4), X1 10157 MOVSS X1, (DX)(R8*4) 10158 ADDQ CX, DI 10159 ADDQ BX, R8 10160 MOVSS (AX)(DI*4), X1 10161 MULSS X0, X1 10162 ADDSS (DX)(R8*4), X1 10163 MOVSS X1, (DX)(R8*4) 10164 ADDQ CX, DI 10165 ADDQ BX, R8 10166 MOVSS (AX)(DI*4), X1 10167 MULSS X0, X1 10168 ADDSS (DX)(R8*4), X1 10169 MOVSS X1, (DX)(R8*4) 10170 ADDQ CX, DI 10171 ADDQ BX, R8 10172 SUBQ $0x04, SI 10173 10174 check_limit_unroll: 10175 CMPQ SI, $0x04 10176 JHI loop_unroll 10177 JMP check_limit 10178 10179 loop: 10180 MOVSS (AX)(DI*4), X1 10181 MULSS X0, X1 10182 ADDSS (DX)(R8*4), X1 10183 MOVSS X1, (DX)(R8*4) 10184 DECQ SI 10185 ADDQ CX, DI 10186 ADDQ BX, R8 10187 10188 check_limit: 10189 CMPQ SI, $0x00 10190 JHI loop 10191 RET 10192 10193 // func AmdAxpyUnsafeX_V0A15R4(alpha float32, xs *float32, incx 
uintptr, ys *float32, incy uintptr, n uintptr) 10194 // Requires: SSE 10195 TEXT ·AmdAxpyUnsafeX_V0A15R4(SB), NOSPLIT, $0-48 10196 MOVSS alpha+0(FP), X0 10197 MOVQ xs+8(FP), AX 10198 MOVQ incx+16(FP), CX 10199 MOVQ ys+24(FP), DX 10200 MOVQ incy+32(FP), BX 10201 MOVQ n+40(FP), SI 10202 XORQ DI, DI 10203 XORQ R8, R8 10204 JMP check_limit_unroll 10205 PCALIGN $0x08 10206 NOP 10207 NOP 10208 NOP 10209 NOP 10210 NOP 10211 NOP 10212 NOP 10213 10214 loop_unroll: 10215 MOVSS (AX)(DI*4), X1 10216 MULSS X0, X1 10217 ADDSS (DX)(R8*4), X1 10218 MOVSS X1, (DX)(R8*4) 10219 ADDQ CX, DI 10220 ADDQ BX, R8 10221 MOVSS (AX)(DI*4), X1 10222 MULSS X0, X1 10223 ADDSS (DX)(R8*4), X1 10224 MOVSS X1, (DX)(R8*4) 10225 ADDQ CX, DI 10226 ADDQ BX, R8 10227 MOVSS (AX)(DI*4), X1 10228 MULSS X0, X1 10229 ADDSS (DX)(R8*4), X1 10230 MOVSS X1, (DX)(R8*4) 10231 ADDQ CX, DI 10232 ADDQ BX, R8 10233 MOVSS (AX)(DI*4), X1 10234 MULSS X0, X1 10235 ADDSS (DX)(R8*4), X1 10236 MOVSS X1, (DX)(R8*4) 10237 ADDQ CX, DI 10238 ADDQ BX, R8 10239 SUBQ $0x04, SI 10240 10241 check_limit_unroll: 10242 CMPQ SI, $0x04 10243 JHI loop_unroll 10244 JMP check_limit 10245 10246 loop: 10247 MOVSS (AX)(DI*4), X1 10248 MULSS X0, X1 10249 ADDSS (DX)(R8*4), X1 10250 MOVSS X1, (DX)(R8*4) 10251 DECQ SI 10252 ADDQ CX, DI 10253 ADDQ BX, R8 10254 10255 check_limit: 10256 CMPQ SI, $0x00 10257 JHI loop 10258 RET 10259 10260 // func AmdAxpyUnsafeX_V1A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 10261 // Requires: SSE 10262 TEXT ·AmdAxpyUnsafeX_V1A15R4(SB), NOSPLIT, $0-48 10263 MOVSS alpha+0(FP), X0 10264 MOVQ xs+8(FP), AX 10265 MOVQ incx+16(FP), CX 10266 MOVQ ys+24(FP), DX 10267 MOVQ incy+32(FP), BX 10268 MOVQ n+40(FP), SI 10269 XORQ DI, DI 10270 XORQ R8, R8 10271 JMP check_limit_unroll 10272 PCALIGN $0x08 10273 NOP 10274 NOP 10275 NOP 10276 NOP 10277 NOP 10278 NOP 10279 NOP 10280 10281 loop_unroll: 10282 MOVSS (AX)(DI*4), X1 10283 MULSS X0, X1 10284 ADDSS (DX)(R8*4), X1 10285 MOVSS X1, (DX)(R8*4) 10286 ADDQ CX, DI 10287 ADDQ BX, R8 10288 MOVSS (AX)(DI*4), X1 10289 MULSS X0, X1 10290 ADDSS (DX)(R8*4), X1 10291 MOVSS X1, (DX)(R8*4) 10292 ADDQ CX, DI 10293 ADDQ BX, R8 10294 MOVSS (AX)(DI*4), X1 10295 MULSS X0, X1 10296 ADDSS (DX)(R8*4), X1 10297 MOVSS X1, (DX)(R8*4) 10298 ADDQ CX, DI 10299 ADDQ BX, R8 10300 MOVSS (AX)(DI*4), X1 10301 MULSS X0, X1 10302 ADDSS (DX)(R8*4), X1 10303 MOVSS X1, (DX)(R8*4) 10304 ADDQ CX, DI 10305 ADDQ BX, R8 10306 SUBQ $0x04, SI 10307 10308 check_limit_unroll: 10309 CMPQ SI, $0x04 10310 JHI loop_unroll 10311 JMP check_limit 10312 10313 loop: 10314 MOVSS (AX)(DI*4), X1 10315 MULSS X0, X1 10316 ADDSS (DX)(R8*4), X1 10317 MOVSS X1, (DX)(R8*4) 10318 DECQ SI 10319 ADDQ CX, DI 10320 ADDQ BX, R8 10321 10322 check_limit: 10323 CMPQ SI, $0x00 10324 JHI loop 10325 RET 10326 10327 // func AmdAxpyUnsafeX_V2A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 10328 // Requires: SSE 10329 TEXT ·AmdAxpyUnsafeX_V2A15R4(SB), NOSPLIT, $0-48 10330 MOVSS alpha+0(FP), X0 10331 MOVQ xs+8(FP), AX 10332 MOVQ incx+16(FP), CX 10333 MOVQ ys+24(FP), DX 10334 MOVQ incy+32(FP), BX 10335 MOVQ n+40(FP), SI 10336 XORQ DI, DI 10337 XORQ R8, R8 10338 JMP check_limit_unroll 10339 PCALIGN $0x08 10340 NOP 10341 NOP 10342 NOP 10343 NOP 10344 NOP 10345 NOP 10346 NOP 10347 10348 loop_unroll: 10349 MOVSS (AX)(DI*4), X1 10350 MULSS X0, X1 10351 ADDSS (DX)(R8*4), X1 10352 MOVSS X1, (DX)(R8*4) 10353 ADDQ CX, DI 10354 ADDQ BX, R8 10355 MOVSS (AX)(DI*4), X1 10356 MULSS X0, X1 10357 ADDSS (DX)(R8*4), X1 10358 MOVSS 
X1, (DX)(R8*4) 10359 ADDQ CX, DI 10360 ADDQ BX, R8 10361 MOVSS (AX)(DI*4), X1 10362 MULSS X0, X1 10363 ADDSS (DX)(R8*4), X1 10364 MOVSS X1, (DX)(R8*4) 10365 ADDQ CX, DI 10366 ADDQ BX, R8 10367 MOVSS (AX)(DI*4), X1 10368 MULSS X0, X1 10369 ADDSS (DX)(R8*4), X1 10370 MOVSS X1, (DX)(R8*4) 10371 ADDQ CX, DI 10372 ADDQ BX, R8 10373 SUBQ $0x04, SI 10374 10375 check_limit_unroll: 10376 CMPQ SI, $0x04 10377 JHI loop_unroll 10378 JMP check_limit 10379 10380 loop: 10381 MOVSS (AX)(DI*4), X1 10382 MULSS X0, X1 10383 ADDSS (DX)(R8*4), X1 10384 MOVSS X1, (DX)(R8*4) 10385 DECQ SI 10386 ADDQ CX, DI 10387 ADDQ BX, R8 10388 10389 check_limit: 10390 CMPQ SI, $0x00 10391 JHI loop 10392 RET 10393 10394 // func AmdAxpyUnsafeX_V3A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 10395 // Requires: SSE 10396 TEXT ·AmdAxpyUnsafeX_V3A15R4(SB), NOSPLIT, $0-48 10397 MOVSS alpha+0(FP), X0 10398 MOVQ xs+8(FP), AX 10399 MOVQ incx+16(FP), CX 10400 MOVQ ys+24(FP), DX 10401 MOVQ incy+32(FP), BX 10402 MOVQ n+40(FP), SI 10403 XORQ DI, DI 10404 XORQ R8, R8 10405 JMP check_limit_unroll 10406 PCALIGN $0x08 10407 NOP 10408 NOP 10409 NOP 10410 NOP 10411 NOP 10412 NOP 10413 NOP 10414 10415 loop_unroll: 10416 MOVSS (AX)(DI*4), X1 10417 MULSS X0, X1 10418 ADDSS (DX)(R8*4), X1 10419 MOVSS X1, (DX)(R8*4) 10420 ADDQ CX, DI 10421 ADDQ BX, R8 10422 MOVSS (AX)(DI*4), X1 10423 MULSS X0, X1 10424 ADDSS (DX)(R8*4), X1 10425 MOVSS X1, (DX)(R8*4) 10426 ADDQ CX, DI 10427 ADDQ BX, R8 10428 MOVSS (AX)(DI*4), X1 10429 MULSS X0, X1 10430 ADDSS (DX)(R8*4), X1 10431 MOVSS X1, (DX)(R8*4) 10432 ADDQ CX, DI 10433 ADDQ BX, R8 10434 MOVSS (AX)(DI*4), X1 10435 MULSS X0, X1 10436 ADDSS (DX)(R8*4), X1 10437 MOVSS X1, (DX)(R8*4) 10438 ADDQ CX, DI 10439 ADDQ BX, R8 10440 SUBQ $0x04, SI 10441 10442 check_limit_unroll: 10443 CMPQ SI, $0x04 10444 JHI loop_unroll 10445 JMP check_limit 10446 10447 loop: 10448 MOVSS (AX)(DI*4), X1 10449 MULSS X0, X1 10450 ADDSS (DX)(R8*4), X1 10451 MOVSS X1, (DX)(R8*4) 10452 DECQ SI 10453 ADDQ CX, DI 10454 ADDQ BX, R8 10455 10456 check_limit: 10457 CMPQ SI, $0x00 10458 JHI loop 10459 RET 10460 10461 // func AmdAxpyUnsafeX_V4A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 10462 // Requires: SSE 10463 TEXT ·AmdAxpyUnsafeX_V4A15R4(SB), NOSPLIT, $0-48 10464 MOVSS alpha+0(FP), X0 10465 MOVQ xs+8(FP), AX 10466 MOVQ incx+16(FP), CX 10467 MOVQ ys+24(FP), DX 10468 MOVQ incy+32(FP), BX 10469 MOVQ n+40(FP), SI 10470 XORQ DI, DI 10471 XORQ R8, R8 10472 JMP check_limit_unroll 10473 PCALIGN $0x08 10474 NOP 10475 NOP 10476 NOP 10477 NOP 10478 NOP 10479 NOP 10480 NOP 10481 10482 loop_unroll: 10483 MOVSS (AX)(DI*4), X1 10484 MULSS X0, X1 10485 ADDSS (DX)(R8*4), X1 10486 MOVSS X1, (DX)(R8*4) 10487 ADDQ CX, DI 10488 ADDQ BX, R8 10489 MOVSS (AX)(DI*4), X1 10490 MULSS X0, X1 10491 ADDSS (DX)(R8*4), X1 10492 MOVSS X1, (DX)(R8*4) 10493 ADDQ CX, DI 10494 ADDQ BX, R8 10495 MOVSS (AX)(DI*4), X1 10496 MULSS X0, X1 10497 ADDSS (DX)(R8*4), X1 10498 MOVSS X1, (DX)(R8*4) 10499 ADDQ CX, DI 10500 ADDQ BX, R8 10501 MOVSS (AX)(DI*4), X1 10502 MULSS X0, X1 10503 ADDSS (DX)(R8*4), X1 10504 MOVSS X1, (DX)(R8*4) 10505 ADDQ CX, DI 10506 ADDQ BX, R8 10507 SUBQ $0x04, SI 10508 10509 check_limit_unroll: 10510 CMPQ SI, $0x04 10511 JHI loop_unroll 10512 JMP check_limit 10513 10514 loop: 10515 MOVSS (AX)(DI*4), X1 10516 MULSS X0, X1 10517 ADDSS (DX)(R8*4), X1 10518 MOVSS X1, (DX)(R8*4) 10519 DECQ SI 10520 ADDQ CX, DI 10521 ADDQ BX, R8 10522 10523 check_limit: 10524 CMPQ SI, $0x00 10525 JHI loop 10526 RET 
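// Note: the AmdAxpyUnsafeX_* variants above and below appear to differ only in
// their unroll factor and in the alignment padding (PCALIGN/NOP) emitted ahead
// of loop_unroll; the arithmetic in every kernel is the same strided
// single-precision AXPY update. As a rough reference sketch (not part of the
// generated output, and reusing the stub's parameter names), the equivalent Go
// loop would be:
//
//	for i := uintptr(0); i < n; i++ {
//		x := (*float32)(unsafe.Add(unsafe.Pointer(xs), 4*i*incx)) // xs[i*incx]
//		y := (*float32)(unsafe.Add(unsafe.Pointer(ys), 4*i*incy)) // ys[i*incy]
//		*y += alpha * *x
//	}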
10527
10528 // func AmdAxpyUnsafeX_V5A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
10529 // Requires: SSE
10530 TEXT ·AmdAxpyUnsafeX_V5A15R4(SB), NOSPLIT, $0-48
10531 	MOVSS alpha+0(FP), X0
10532 	MOVQ  xs+8(FP), AX
10533 	MOVQ  incx+16(FP), CX
10534 	MOVQ  ys+24(FP), DX
10535 	MOVQ  incy+32(FP), BX
10536 	MOVQ  n+40(FP), SI
10537 	XORQ  DI, DI
10538 	XORQ  R8, R8
10539 	JMP   check_limit_unroll
10540 	PCALIGN $0x08
10541 	NOP
10542 	NOP
10543 	NOP
10544 	NOP
10545 	NOP
10546 	NOP
10547 	NOP
10548
10549 loop_unroll:
10550 	MOVSS (AX)(DI*4), X1
10551 	MULSS X0, X1
10552 	ADDSS (DX)(R8*4), X1
10553 	MOVSS X1, (DX)(R8*4)
10554 	ADDQ  CX, DI
10555 	ADDQ  BX, R8
10556 	MOVSS (AX)(DI*4), X1
10557 	MULSS X0, X1
10558 	ADDSS (DX)(R8*4), X1
10559 	MOVSS X1, (DX)(R8*4)
10560 	ADDQ  CX, DI
10561 	ADDQ  BX, R8
10562 	MOVSS (AX)(DI*4), X1
10563 	MULSS X0, X1
10564 	ADDSS (DX)(R8*4), X1
10565 	MOVSS X1, (DX)(R8*4)
10566 	ADDQ  CX, DI
10567 	ADDQ  BX, R8
10568 	MOVSS (AX)(DI*4), X1
10569 	MULSS X0, X1
10570 	ADDSS (DX)(R8*4), X1
10571 	MOVSS X1, (DX)(R8*4)
10572 	ADDQ  CX, DI
10573 	ADDQ  BX, R8
10574 	SUBQ  $0x04, SI
10575
10576 check_limit_unroll:
10577 	CMPQ SI, $0x04
10578 	JHI  loop_unroll
10579 	JMP  check_limit
10580
10581 loop:
10582 	MOVSS (AX)(DI*4), X1
10583 	MULSS X0, X1
10584 	ADDSS (DX)(R8*4), X1
10585 	MOVSS X1, (DX)(R8*4)
10586 	DECQ  SI
10587 	ADDQ  CX, DI
10588 	ADDQ  BX, R8
10589
10590 check_limit:
10591 	CMPQ SI, $0x00
10592 	JHI  loop
10593 	RET
10594
10595 // func AmdAxpyUnsafeX_V0A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
10596 // Requires: SSE
10597 TEXT ·AmdAxpyUnsafeX_V0A16R4(SB), NOSPLIT, $0-48
10598 	MOVSS alpha+0(FP), X0
10599 	MOVQ  xs+8(FP), AX
10600 	MOVQ  incx+16(FP), CX
10601 	MOVQ  ys+24(FP), DX
10602 	MOVQ  incy+32(FP), BX
10603 	MOVQ  n+40(FP), SI
10604 	XORQ  DI, DI
10605 	XORQ  R8, R8
10606 	JMP   check_limit_unroll
10607 	PCALIGN $0x10
10608
10609 loop_unroll:
10610 	MOVSS (AX)(DI*4), X1
10611 	MULSS X0, X1
10612 	ADDSS (DX)(R8*4), X1
10613 	MOVSS X1, (DX)(R8*4)
10614 	ADDQ  CX, DI
10615 	ADDQ  BX, R8
10616 	MOVSS (AX)(DI*4), X1
10617 	MULSS X0, X1
10618 	ADDSS (DX)(R8*4), X1
10619 	MOVSS X1, (DX)(R8*4)
10620 	ADDQ  CX, DI
10621 	ADDQ  BX, R8
10622 	MOVSS (AX)(DI*4), X1
10623 	MULSS X0, X1
10624 	ADDSS (DX)(R8*4), X1
10625 	MOVSS X1, (DX)(R8*4)
10626 	ADDQ  CX, DI
10627 	ADDQ  BX, R8
10628 	MOVSS (AX)(DI*4), X1
10629 	MULSS X0, X1
10630 	ADDSS (DX)(R8*4), X1
10631 	MOVSS X1, (DX)(R8*4)
10632 	ADDQ  CX, DI
10633 	ADDQ  BX, R8
10634 	SUBQ  $0x04, SI
10635
10636 check_limit_unroll:
10637 	CMPQ SI, $0x04
10638 	JHI  loop_unroll
10639 	JMP  check_limit
10640
10641 loop:
10642 	MOVSS (AX)(DI*4), X1
10643 	MULSS X0, X1
10644 	ADDSS (DX)(R8*4), X1
10645 	MOVSS X1, (DX)(R8*4)
10646 	DECQ  SI
10647 	ADDQ  CX, DI
10648 	ADDQ  BX, R8
10649
10650 check_limit:
10651 	CMPQ SI, $0x00
10652 	JHI  loop
10653 	RET
10654
10655 // func AmdAxpyUnsafeX_V1A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
10656 // Requires: SSE
10657 TEXT ·AmdAxpyUnsafeX_V1A16R4(SB), NOSPLIT, $0-48
10658 	MOVSS alpha+0(FP), X0
10659 	MOVQ  xs+8(FP), AX
10660 	MOVQ  incx+16(FP), CX
10661 	MOVQ  ys+24(FP), DX
10662 	MOVQ  incy+32(FP), BX
10663 	MOVQ  n+40(FP), SI
10664 	XORQ  DI, DI
10665 	XORQ  R8, R8
10666 	JMP   check_limit_unroll
10667 	PCALIGN $0x10
10668
10669 loop_unroll:
10670 	MOVSS (AX)(DI*4), X1
10671 	MULSS X0, X1
10672 	ADDSS (DX)(R8*4), X1
10673 	MOVSS X1, (DX)(R8*4)
10674 	ADDQ  CX, DI
10675 	ADDQ  BX, R8
10676 	MOVSS (AX)(DI*4), X1
10677 	MULSS X0, X1
10678 	ADDSS (DX)(R8*4), X1
10679 	MOVSS X1, (DX)(R8*4)
10680 	ADDQ  CX, DI
10681 	ADDQ  BX, R8
10682 	MOVSS
(AX)(DI*4), X1 10683 MULSS X0, X1 10684 ADDSS (DX)(R8*4), X1 10685 MOVSS X1, (DX)(R8*4) 10686 ADDQ CX, DI 10687 ADDQ BX, R8 10688 MOVSS (AX)(DI*4), X1 10689 MULSS X0, X1 10690 ADDSS (DX)(R8*4), X1 10691 MOVSS X1, (DX)(R8*4) 10692 ADDQ CX, DI 10693 ADDQ BX, R8 10694 SUBQ $0x04, SI 10695 10696 check_limit_unroll: 10697 CMPQ SI, $0x04 10698 JHI loop_unroll 10699 JMP check_limit 10700 10701 loop: 10702 MOVSS (AX)(DI*4), X1 10703 MULSS X0, X1 10704 ADDSS (DX)(R8*4), X1 10705 MOVSS X1, (DX)(R8*4) 10706 DECQ SI 10707 ADDQ CX, DI 10708 ADDQ BX, R8 10709 10710 check_limit: 10711 CMPQ SI, $0x00 10712 JHI loop 10713 RET 10714 10715 // func AmdAxpyUnsafeX_V2A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 10716 // Requires: SSE 10717 TEXT ·AmdAxpyUnsafeX_V2A16R4(SB), NOSPLIT, $0-48 10718 MOVSS alpha+0(FP), X0 10719 MOVQ xs+8(FP), AX 10720 MOVQ incx+16(FP), CX 10721 MOVQ ys+24(FP), DX 10722 MOVQ incy+32(FP), BX 10723 MOVQ n+40(FP), SI 10724 XORQ DI, DI 10725 XORQ R8, R8 10726 JMP check_limit_unroll 10727 PCALIGN $0x10 10728 10729 loop_unroll: 10730 MOVSS (AX)(DI*4), X1 10731 MULSS X0, X1 10732 ADDSS (DX)(R8*4), X1 10733 MOVSS X1, (DX)(R8*4) 10734 ADDQ CX, DI 10735 ADDQ BX, R8 10736 MOVSS (AX)(DI*4), X1 10737 MULSS X0, X1 10738 ADDSS (DX)(R8*4), X1 10739 MOVSS X1, (DX)(R8*4) 10740 ADDQ CX, DI 10741 ADDQ BX, R8 10742 MOVSS (AX)(DI*4), X1 10743 MULSS X0, X1 10744 ADDSS (DX)(R8*4), X1 10745 MOVSS X1, (DX)(R8*4) 10746 ADDQ CX, DI 10747 ADDQ BX, R8 10748 MOVSS (AX)(DI*4), X1 10749 MULSS X0, X1 10750 ADDSS (DX)(R8*4), X1 10751 MOVSS X1, (DX)(R8*4) 10752 ADDQ CX, DI 10753 ADDQ BX, R8 10754 SUBQ $0x04, SI 10755 10756 check_limit_unroll: 10757 CMPQ SI, $0x04 10758 JHI loop_unroll 10759 JMP check_limit 10760 10761 loop: 10762 MOVSS (AX)(DI*4), X1 10763 MULSS X0, X1 10764 ADDSS (DX)(R8*4), X1 10765 MOVSS X1, (DX)(R8*4) 10766 DECQ SI 10767 ADDQ CX, DI 10768 ADDQ BX, R8 10769 10770 check_limit: 10771 CMPQ SI, $0x00 10772 JHI loop 10773 RET 10774 10775 // func AmdAxpyUnsafeX_V3A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 10776 // Requires: SSE 10777 TEXT ·AmdAxpyUnsafeX_V3A16R4(SB), NOSPLIT, $0-48 10778 MOVSS alpha+0(FP), X0 10779 MOVQ xs+8(FP), AX 10780 MOVQ incx+16(FP), CX 10781 MOVQ ys+24(FP), DX 10782 MOVQ incy+32(FP), BX 10783 MOVQ n+40(FP), SI 10784 XORQ DI, DI 10785 XORQ R8, R8 10786 JMP check_limit_unroll 10787 PCALIGN $0x10 10788 10789 loop_unroll: 10790 MOVSS (AX)(DI*4), X1 10791 MULSS X0, X1 10792 ADDSS (DX)(R8*4), X1 10793 MOVSS X1, (DX)(R8*4) 10794 ADDQ CX, DI 10795 ADDQ BX, R8 10796 MOVSS (AX)(DI*4), X1 10797 MULSS X0, X1 10798 ADDSS (DX)(R8*4), X1 10799 MOVSS X1, (DX)(R8*4) 10800 ADDQ CX, DI 10801 ADDQ BX, R8 10802 MOVSS (AX)(DI*4), X1 10803 MULSS X0, X1 10804 ADDSS (DX)(R8*4), X1 10805 MOVSS X1, (DX)(R8*4) 10806 ADDQ CX, DI 10807 ADDQ BX, R8 10808 MOVSS (AX)(DI*4), X1 10809 MULSS X0, X1 10810 ADDSS (DX)(R8*4), X1 10811 MOVSS X1, (DX)(R8*4) 10812 ADDQ CX, DI 10813 ADDQ BX, R8 10814 SUBQ $0x04, SI 10815 10816 check_limit_unroll: 10817 CMPQ SI, $0x04 10818 JHI loop_unroll 10819 JMP check_limit 10820 10821 loop: 10822 MOVSS (AX)(DI*4), X1 10823 MULSS X0, X1 10824 ADDSS (DX)(R8*4), X1 10825 MOVSS X1, (DX)(R8*4) 10826 DECQ SI 10827 ADDQ CX, DI 10828 ADDQ BX, R8 10829 10830 check_limit: 10831 CMPQ SI, $0x00 10832 JHI loop 10833 RET 10834 10835 // func AmdAxpyUnsafeX_V4A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 10836 // Requires: SSE 10837 TEXT ·AmdAxpyUnsafeX_V4A16R4(SB), NOSPLIT, $0-48 
10838 MOVSS alpha+0(FP), X0 10839 MOVQ xs+8(FP), AX 10840 MOVQ incx+16(FP), CX 10841 MOVQ ys+24(FP), DX 10842 MOVQ incy+32(FP), BX 10843 MOVQ n+40(FP), SI 10844 XORQ DI, DI 10845 XORQ R8, R8 10846 JMP check_limit_unroll 10847 PCALIGN $0x10 10848 10849 loop_unroll: 10850 MOVSS (AX)(DI*4), X1 10851 MULSS X0, X1 10852 ADDSS (DX)(R8*4), X1 10853 MOVSS X1, (DX)(R8*4) 10854 ADDQ CX, DI 10855 ADDQ BX, R8 10856 MOVSS (AX)(DI*4), X1 10857 MULSS X0, X1 10858 ADDSS (DX)(R8*4), X1 10859 MOVSS X1, (DX)(R8*4) 10860 ADDQ CX, DI 10861 ADDQ BX, R8 10862 MOVSS (AX)(DI*4), X1 10863 MULSS X0, X1 10864 ADDSS (DX)(R8*4), X1 10865 MOVSS X1, (DX)(R8*4) 10866 ADDQ CX, DI 10867 ADDQ BX, R8 10868 MOVSS (AX)(DI*4), X1 10869 MULSS X0, X1 10870 ADDSS (DX)(R8*4), X1 10871 MOVSS X1, (DX)(R8*4) 10872 ADDQ CX, DI 10873 ADDQ BX, R8 10874 SUBQ $0x04, SI 10875 10876 check_limit_unroll: 10877 CMPQ SI, $0x04 10878 JHI loop_unroll 10879 JMP check_limit 10880 10881 loop: 10882 MOVSS (AX)(DI*4), X1 10883 MULSS X0, X1 10884 ADDSS (DX)(R8*4), X1 10885 MOVSS X1, (DX)(R8*4) 10886 DECQ SI 10887 ADDQ CX, DI 10888 ADDQ BX, R8 10889 10890 check_limit: 10891 CMPQ SI, $0x00 10892 JHI loop 10893 RET 10894 10895 // func AmdAxpyUnsafeX_V5A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 10896 // Requires: SSE 10897 TEXT ·AmdAxpyUnsafeX_V5A16R4(SB), NOSPLIT, $0-48 10898 MOVSS alpha+0(FP), X0 10899 MOVQ xs+8(FP), AX 10900 MOVQ incx+16(FP), CX 10901 MOVQ ys+24(FP), DX 10902 MOVQ incy+32(FP), BX 10903 MOVQ n+40(FP), SI 10904 XORQ DI, DI 10905 XORQ R8, R8 10906 JMP check_limit_unroll 10907 PCALIGN $0x10 10908 10909 loop_unroll: 10910 MOVSS (AX)(DI*4), X1 10911 MULSS X0, X1 10912 ADDSS (DX)(R8*4), X1 10913 MOVSS X1, (DX)(R8*4) 10914 ADDQ CX, DI 10915 ADDQ BX, R8 10916 MOVSS (AX)(DI*4), X1 10917 MULSS X0, X1 10918 ADDSS (DX)(R8*4), X1 10919 MOVSS X1, (DX)(R8*4) 10920 ADDQ CX, DI 10921 ADDQ BX, R8 10922 MOVSS (AX)(DI*4), X1 10923 MULSS X0, X1 10924 ADDSS (DX)(R8*4), X1 10925 MOVSS X1, (DX)(R8*4) 10926 ADDQ CX, DI 10927 ADDQ BX, R8 10928 MOVSS (AX)(DI*4), X1 10929 MULSS X0, X1 10930 ADDSS (DX)(R8*4), X1 10931 MOVSS X1, (DX)(R8*4) 10932 ADDQ CX, DI 10933 ADDQ BX, R8 10934 SUBQ $0x04, SI 10935 10936 check_limit_unroll: 10937 CMPQ SI, $0x04 10938 JHI loop_unroll 10939 JMP check_limit 10940 10941 loop: 10942 MOVSS (AX)(DI*4), X1 10943 MULSS X0, X1 10944 ADDSS (DX)(R8*4), X1 10945 MOVSS X1, (DX)(R8*4) 10946 DECQ SI 10947 ADDQ CX, DI 10948 ADDQ BX, R8 10949 10950 check_limit: 10951 CMPQ SI, $0x00 10952 JHI loop 10953 RET 10954 10955 // func AmdAxpyUnsafeX_V0A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 10956 // Requires: SSE 10957 TEXT ·AmdAxpyUnsafeX_V0A0R8(SB), NOSPLIT, $0-48 10958 MOVSS alpha+0(FP), X0 10959 MOVQ xs+8(FP), AX 10960 MOVQ incx+16(FP), CX 10961 MOVQ ys+24(FP), DX 10962 MOVQ incy+32(FP), BX 10963 MOVQ n+40(FP), SI 10964 XORQ DI, DI 10965 XORQ R8, R8 10966 JMP check_limit_unroll 10967 10968 loop_unroll: 10969 MOVSS (AX)(DI*4), X1 10970 MULSS X0, X1 10971 ADDSS (DX)(R8*4), X1 10972 MOVSS X1, (DX)(R8*4) 10973 ADDQ CX, DI 10974 ADDQ BX, R8 10975 MOVSS (AX)(DI*4), X1 10976 MULSS X0, X1 10977 ADDSS (DX)(R8*4), X1 10978 MOVSS X1, (DX)(R8*4) 10979 ADDQ CX, DI 10980 ADDQ BX, R8 10981 MOVSS (AX)(DI*4), X1 10982 MULSS X0, X1 10983 ADDSS (DX)(R8*4), X1 10984 MOVSS X1, (DX)(R8*4) 10985 ADDQ CX, DI 10986 ADDQ BX, R8 10987 MOVSS (AX)(DI*4), X1 10988 MULSS X0, X1 10989 ADDSS (DX)(R8*4), X1 10990 MOVSS X1, (DX)(R8*4) 10991 ADDQ CX, DI 10992 ADDQ BX, R8 10993 MOVSS (AX)(DI*4), X1 10994 
MULSS X0, X1 10995 ADDSS (DX)(R8*4), X1 10996 MOVSS X1, (DX)(R8*4) 10997 ADDQ CX, DI 10998 ADDQ BX, R8 10999 MOVSS (AX)(DI*4), X1 11000 MULSS X0, X1 11001 ADDSS (DX)(R8*4), X1 11002 MOVSS X1, (DX)(R8*4) 11003 ADDQ CX, DI 11004 ADDQ BX, R8 11005 MOVSS (AX)(DI*4), X1 11006 MULSS X0, X1 11007 ADDSS (DX)(R8*4), X1 11008 MOVSS X1, (DX)(R8*4) 11009 ADDQ CX, DI 11010 ADDQ BX, R8 11011 MOVSS (AX)(DI*4), X1 11012 MULSS X0, X1 11013 ADDSS (DX)(R8*4), X1 11014 MOVSS X1, (DX)(R8*4) 11015 ADDQ CX, DI 11016 ADDQ BX, R8 11017 SUBQ $0x08, SI 11018 11019 check_limit_unroll: 11020 CMPQ SI, $0x08 11021 JHI loop_unroll 11022 JMP check_limit 11023 11024 loop: 11025 MOVSS (AX)(DI*4), X1 11026 MULSS X0, X1 11027 ADDSS (DX)(R8*4), X1 11028 MOVSS X1, (DX)(R8*4) 11029 DECQ SI 11030 ADDQ CX, DI 11031 ADDQ BX, R8 11032 11033 check_limit: 11034 CMPQ SI, $0x00 11035 JHI loop 11036 RET 11037 11038 // func AmdAxpyUnsafeX_V1A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 11039 // Requires: SSE 11040 TEXT ·AmdAxpyUnsafeX_V1A0R8(SB), NOSPLIT, $0-48 11041 MOVSS alpha+0(FP), X0 11042 MOVQ xs+8(FP), AX 11043 MOVQ incx+16(FP), CX 11044 MOVQ ys+24(FP), DX 11045 MOVQ incy+32(FP), BX 11046 MOVQ n+40(FP), SI 11047 XORQ DI, DI 11048 XORQ R8, R8 11049 JMP check_limit_unroll 11050 11051 loop_unroll: 11052 MOVSS (AX)(DI*4), X1 11053 MULSS X0, X1 11054 ADDSS (DX)(R8*4), X1 11055 MOVSS X1, (DX)(R8*4) 11056 ADDQ CX, DI 11057 ADDQ BX, R8 11058 MOVSS (AX)(DI*4), X1 11059 MULSS X0, X1 11060 ADDSS (DX)(R8*4), X1 11061 MOVSS X1, (DX)(R8*4) 11062 ADDQ CX, DI 11063 ADDQ BX, R8 11064 MOVSS (AX)(DI*4), X1 11065 MULSS X0, X1 11066 ADDSS (DX)(R8*4), X1 11067 MOVSS X1, (DX)(R8*4) 11068 ADDQ CX, DI 11069 ADDQ BX, R8 11070 MOVSS (AX)(DI*4), X1 11071 MULSS X0, X1 11072 ADDSS (DX)(R8*4), X1 11073 MOVSS X1, (DX)(R8*4) 11074 ADDQ CX, DI 11075 ADDQ BX, R8 11076 MOVSS (AX)(DI*4), X1 11077 MULSS X0, X1 11078 ADDSS (DX)(R8*4), X1 11079 MOVSS X1, (DX)(R8*4) 11080 ADDQ CX, DI 11081 ADDQ BX, R8 11082 MOVSS (AX)(DI*4), X1 11083 MULSS X0, X1 11084 ADDSS (DX)(R8*4), X1 11085 MOVSS X1, (DX)(R8*4) 11086 ADDQ CX, DI 11087 ADDQ BX, R8 11088 MOVSS (AX)(DI*4), X1 11089 MULSS X0, X1 11090 ADDSS (DX)(R8*4), X1 11091 MOVSS X1, (DX)(R8*4) 11092 ADDQ CX, DI 11093 ADDQ BX, R8 11094 MOVSS (AX)(DI*4), X1 11095 MULSS X0, X1 11096 ADDSS (DX)(R8*4), X1 11097 MOVSS X1, (DX)(R8*4) 11098 ADDQ CX, DI 11099 ADDQ BX, R8 11100 SUBQ $0x08, SI 11101 11102 check_limit_unroll: 11103 CMPQ SI, $0x08 11104 JHI loop_unroll 11105 JMP check_limit 11106 11107 loop: 11108 MOVSS (AX)(DI*4), X1 11109 MULSS X0, X1 11110 ADDSS (DX)(R8*4), X1 11111 MOVSS X1, (DX)(R8*4) 11112 DECQ SI 11113 ADDQ CX, DI 11114 ADDQ BX, R8 11115 11116 check_limit: 11117 CMPQ SI, $0x00 11118 JHI loop 11119 RET 11120 11121 // func AmdAxpyUnsafeX_V2A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 11122 // Requires: SSE 11123 TEXT ·AmdAxpyUnsafeX_V2A0R8(SB), NOSPLIT, $0-48 11124 MOVSS alpha+0(FP), X0 11125 MOVQ xs+8(FP), AX 11126 MOVQ incx+16(FP), CX 11127 MOVQ ys+24(FP), DX 11128 MOVQ incy+32(FP), BX 11129 MOVQ n+40(FP), SI 11130 XORQ DI, DI 11131 XORQ R8, R8 11132 JMP check_limit_unroll 11133 11134 loop_unroll: 11135 MOVSS (AX)(DI*4), X1 11136 MULSS X0, X1 11137 ADDSS (DX)(R8*4), X1 11138 MOVSS X1, (DX)(R8*4) 11139 ADDQ CX, DI 11140 ADDQ BX, R8 11141 MOVSS (AX)(DI*4), X1 11142 MULSS X0, X1 11143 ADDSS (DX)(R8*4), X1 11144 MOVSS X1, (DX)(R8*4) 11145 ADDQ CX, DI 11146 ADDQ BX, R8 11147 MOVSS (AX)(DI*4), X1 11148 MULSS X0, X1 11149 ADDSS (DX)(R8*4), X1 11150 
MOVSS X1, (DX)(R8*4) 11151 ADDQ CX, DI 11152 ADDQ BX, R8 11153 MOVSS (AX)(DI*4), X1 11154 MULSS X0, X1 11155 ADDSS (DX)(R8*4), X1 11156 MOVSS X1, (DX)(R8*4) 11157 ADDQ CX, DI 11158 ADDQ BX, R8 11159 MOVSS (AX)(DI*4), X1 11160 MULSS X0, X1 11161 ADDSS (DX)(R8*4), X1 11162 MOVSS X1, (DX)(R8*4) 11163 ADDQ CX, DI 11164 ADDQ BX, R8 11165 MOVSS (AX)(DI*4), X1 11166 MULSS X0, X1 11167 ADDSS (DX)(R8*4), X1 11168 MOVSS X1, (DX)(R8*4) 11169 ADDQ CX, DI 11170 ADDQ BX, R8 11171 MOVSS (AX)(DI*4), X1 11172 MULSS X0, X1 11173 ADDSS (DX)(R8*4), X1 11174 MOVSS X1, (DX)(R8*4) 11175 ADDQ CX, DI 11176 ADDQ BX, R8 11177 MOVSS (AX)(DI*4), X1 11178 MULSS X0, X1 11179 ADDSS (DX)(R8*4), X1 11180 MOVSS X1, (DX)(R8*4) 11181 ADDQ CX, DI 11182 ADDQ BX, R8 11183 SUBQ $0x08, SI 11184 11185 check_limit_unroll: 11186 CMPQ SI, $0x08 11187 JHI loop_unroll 11188 JMP check_limit 11189 11190 loop: 11191 MOVSS (AX)(DI*4), X1 11192 MULSS X0, X1 11193 ADDSS (DX)(R8*4), X1 11194 MOVSS X1, (DX)(R8*4) 11195 DECQ SI 11196 ADDQ CX, DI 11197 ADDQ BX, R8 11198 11199 check_limit: 11200 CMPQ SI, $0x00 11201 JHI loop 11202 RET 11203 11204 // func AmdAxpyUnsafeX_V3A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 11205 // Requires: SSE 11206 TEXT ·AmdAxpyUnsafeX_V3A0R8(SB), NOSPLIT, $0-48 11207 MOVSS alpha+0(FP), X0 11208 MOVQ xs+8(FP), AX 11209 MOVQ incx+16(FP), CX 11210 MOVQ ys+24(FP), DX 11211 MOVQ incy+32(FP), BX 11212 MOVQ n+40(FP), SI 11213 XORQ DI, DI 11214 XORQ R8, R8 11215 JMP check_limit_unroll 11216 11217 loop_unroll: 11218 MOVSS (AX)(DI*4), X1 11219 MULSS X0, X1 11220 ADDSS (DX)(R8*4), X1 11221 MOVSS X1, (DX)(R8*4) 11222 ADDQ CX, DI 11223 ADDQ BX, R8 11224 MOVSS (AX)(DI*4), X1 11225 MULSS X0, X1 11226 ADDSS (DX)(R8*4), X1 11227 MOVSS X1, (DX)(R8*4) 11228 ADDQ CX, DI 11229 ADDQ BX, R8 11230 MOVSS (AX)(DI*4), X1 11231 MULSS X0, X1 11232 ADDSS (DX)(R8*4), X1 11233 MOVSS X1, (DX)(R8*4) 11234 ADDQ CX, DI 11235 ADDQ BX, R8 11236 MOVSS (AX)(DI*4), X1 11237 MULSS X0, X1 11238 ADDSS (DX)(R8*4), X1 11239 MOVSS X1, (DX)(R8*4) 11240 ADDQ CX, DI 11241 ADDQ BX, R8 11242 MOVSS (AX)(DI*4), X1 11243 MULSS X0, X1 11244 ADDSS (DX)(R8*4), X1 11245 MOVSS X1, (DX)(R8*4) 11246 ADDQ CX, DI 11247 ADDQ BX, R8 11248 MOVSS (AX)(DI*4), X1 11249 MULSS X0, X1 11250 ADDSS (DX)(R8*4), X1 11251 MOVSS X1, (DX)(R8*4) 11252 ADDQ CX, DI 11253 ADDQ BX, R8 11254 MOVSS (AX)(DI*4), X1 11255 MULSS X0, X1 11256 ADDSS (DX)(R8*4), X1 11257 MOVSS X1, (DX)(R8*4) 11258 ADDQ CX, DI 11259 ADDQ BX, R8 11260 MOVSS (AX)(DI*4), X1 11261 MULSS X0, X1 11262 ADDSS (DX)(R8*4), X1 11263 MOVSS X1, (DX)(R8*4) 11264 ADDQ CX, DI 11265 ADDQ BX, R8 11266 SUBQ $0x08, SI 11267 11268 check_limit_unroll: 11269 CMPQ SI, $0x08 11270 JHI loop_unroll 11271 JMP check_limit 11272 11273 loop: 11274 MOVSS (AX)(DI*4), X1 11275 MULSS X0, X1 11276 ADDSS (DX)(R8*4), X1 11277 MOVSS X1, (DX)(R8*4) 11278 DECQ SI 11279 ADDQ CX, DI 11280 ADDQ BX, R8 11281 11282 check_limit: 11283 CMPQ SI, $0x00 11284 JHI loop 11285 RET 11286 11287 // func AmdAxpyUnsafeX_V4A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 11288 // Requires: SSE 11289 TEXT ·AmdAxpyUnsafeX_V4A0R8(SB), NOSPLIT, $0-48 11290 MOVSS alpha+0(FP), X0 11291 MOVQ xs+8(FP), AX 11292 MOVQ incx+16(FP), CX 11293 MOVQ ys+24(FP), DX 11294 MOVQ incy+32(FP), BX 11295 MOVQ n+40(FP), SI 11296 XORQ DI, DI 11297 XORQ R8, R8 11298 JMP check_limit_unroll 11299 11300 loop_unroll: 11301 MOVSS (AX)(DI*4), X1 11302 MULSS X0, X1 11303 ADDSS (DX)(R8*4), X1 11304 MOVSS X1, (DX)(R8*4) 11305 ADDQ CX, DI 11306 
ADDQ BX, R8 11307 MOVSS (AX)(DI*4), X1 11308 MULSS X0, X1 11309 ADDSS (DX)(R8*4), X1 11310 MOVSS X1, (DX)(R8*4) 11311 ADDQ CX, DI 11312 ADDQ BX, R8 11313 MOVSS (AX)(DI*4), X1 11314 MULSS X0, X1 11315 ADDSS (DX)(R8*4), X1 11316 MOVSS X1, (DX)(R8*4) 11317 ADDQ CX, DI 11318 ADDQ BX, R8 11319 MOVSS (AX)(DI*4), X1 11320 MULSS X0, X1 11321 ADDSS (DX)(R8*4), X1 11322 MOVSS X1, (DX)(R8*4) 11323 ADDQ CX, DI 11324 ADDQ BX, R8 11325 MOVSS (AX)(DI*4), X1 11326 MULSS X0, X1 11327 ADDSS (DX)(R8*4), X1 11328 MOVSS X1, (DX)(R8*4) 11329 ADDQ CX, DI 11330 ADDQ BX, R8 11331 MOVSS (AX)(DI*4), X1 11332 MULSS X0, X1 11333 ADDSS (DX)(R8*4), X1 11334 MOVSS X1, (DX)(R8*4) 11335 ADDQ CX, DI 11336 ADDQ BX, R8 11337 MOVSS (AX)(DI*4), X1 11338 MULSS X0, X1 11339 ADDSS (DX)(R8*4), X1 11340 MOVSS X1, (DX)(R8*4) 11341 ADDQ CX, DI 11342 ADDQ BX, R8 11343 MOVSS (AX)(DI*4), X1 11344 MULSS X0, X1 11345 ADDSS (DX)(R8*4), X1 11346 MOVSS X1, (DX)(R8*4) 11347 ADDQ CX, DI 11348 ADDQ BX, R8 11349 SUBQ $0x08, SI 11350 11351 check_limit_unroll: 11352 CMPQ SI, $0x08 11353 JHI loop_unroll 11354 JMP check_limit 11355 11356 loop: 11357 MOVSS (AX)(DI*4), X1 11358 MULSS X0, X1 11359 ADDSS (DX)(R8*4), X1 11360 MOVSS X1, (DX)(R8*4) 11361 DECQ SI 11362 ADDQ CX, DI 11363 ADDQ BX, R8 11364 11365 check_limit: 11366 CMPQ SI, $0x00 11367 JHI loop 11368 RET 11369 11370 // func AmdAxpyUnsafeX_V5A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 11371 // Requires: SSE 11372 TEXT ·AmdAxpyUnsafeX_V5A0R8(SB), NOSPLIT, $0-48 11373 MOVSS alpha+0(FP), X0 11374 MOVQ xs+8(FP), AX 11375 MOVQ incx+16(FP), CX 11376 MOVQ ys+24(FP), DX 11377 MOVQ incy+32(FP), BX 11378 MOVQ n+40(FP), SI 11379 XORQ DI, DI 11380 XORQ R8, R8 11381 JMP check_limit_unroll 11382 11383 loop_unroll: 11384 MOVSS (AX)(DI*4), X1 11385 MULSS X0, X1 11386 ADDSS (DX)(R8*4), X1 11387 MOVSS X1, (DX)(R8*4) 11388 ADDQ CX, DI 11389 ADDQ BX, R8 11390 MOVSS (AX)(DI*4), X1 11391 MULSS X0, X1 11392 ADDSS (DX)(R8*4), X1 11393 MOVSS X1, (DX)(R8*4) 11394 ADDQ CX, DI 11395 ADDQ BX, R8 11396 MOVSS (AX)(DI*4), X1 11397 MULSS X0, X1 11398 ADDSS (DX)(R8*4), X1 11399 MOVSS X1, (DX)(R8*4) 11400 ADDQ CX, DI 11401 ADDQ BX, R8 11402 MOVSS (AX)(DI*4), X1 11403 MULSS X0, X1 11404 ADDSS (DX)(R8*4), X1 11405 MOVSS X1, (DX)(R8*4) 11406 ADDQ CX, DI 11407 ADDQ BX, R8 11408 MOVSS (AX)(DI*4), X1 11409 MULSS X0, X1 11410 ADDSS (DX)(R8*4), X1 11411 MOVSS X1, (DX)(R8*4) 11412 ADDQ CX, DI 11413 ADDQ BX, R8 11414 MOVSS (AX)(DI*4), X1 11415 MULSS X0, X1 11416 ADDSS (DX)(R8*4), X1 11417 MOVSS X1, (DX)(R8*4) 11418 ADDQ CX, DI 11419 ADDQ BX, R8 11420 MOVSS (AX)(DI*4), X1 11421 MULSS X0, X1 11422 ADDSS (DX)(R8*4), X1 11423 MOVSS X1, (DX)(R8*4) 11424 ADDQ CX, DI 11425 ADDQ BX, R8 11426 MOVSS (AX)(DI*4), X1 11427 MULSS X0, X1 11428 ADDSS (DX)(R8*4), X1 11429 MOVSS X1, (DX)(R8*4) 11430 ADDQ CX, DI 11431 ADDQ BX, R8 11432 SUBQ $0x08, SI 11433 11434 check_limit_unroll: 11435 CMPQ SI, $0x08 11436 JHI loop_unroll 11437 JMP check_limit 11438 11439 loop: 11440 MOVSS (AX)(DI*4), X1 11441 MULSS X0, X1 11442 ADDSS (DX)(R8*4), X1 11443 MOVSS X1, (DX)(R8*4) 11444 DECQ SI 11445 ADDQ CX, DI 11446 ADDQ BX, R8 11447 11448 check_limit: 11449 CMPQ SI, $0x00 11450 JHI loop 11451 RET 11452 11453 // func AmdAxpyUnsafeX_V0A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 11454 // Requires: SSE 11455 TEXT ·AmdAxpyUnsafeX_V0A8R8(SB), NOSPLIT, $0-48 11456 MOVSS alpha+0(FP), X0 11457 MOVQ xs+8(FP), AX 11458 MOVQ incx+16(FP), CX 11459 MOVQ ys+24(FP), DX 11460 MOVQ incy+32(FP), BX 11461 MOVQ 
n+40(FP), SI 11462 XORQ DI, DI 11463 XORQ R8, R8 11464 JMP check_limit_unroll 11465 PCALIGN $0x08 11466 11467 loop_unroll: 11468 MOVSS (AX)(DI*4), X1 11469 MULSS X0, X1 11470 ADDSS (DX)(R8*4), X1 11471 MOVSS X1, (DX)(R8*4) 11472 ADDQ CX, DI 11473 ADDQ BX, R8 11474 MOVSS (AX)(DI*4), X1 11475 MULSS X0, X1 11476 ADDSS (DX)(R8*4), X1 11477 MOVSS X1, (DX)(R8*4) 11478 ADDQ CX, DI 11479 ADDQ BX, R8 11480 MOVSS (AX)(DI*4), X1 11481 MULSS X0, X1 11482 ADDSS (DX)(R8*4), X1 11483 MOVSS X1, (DX)(R8*4) 11484 ADDQ CX, DI 11485 ADDQ BX, R8 11486 MOVSS (AX)(DI*4), X1 11487 MULSS X0, X1 11488 ADDSS (DX)(R8*4), X1 11489 MOVSS X1, (DX)(R8*4) 11490 ADDQ CX, DI 11491 ADDQ BX, R8 11492 MOVSS (AX)(DI*4), X1 11493 MULSS X0, X1 11494 ADDSS (DX)(R8*4), X1 11495 MOVSS X1, (DX)(R8*4) 11496 ADDQ CX, DI 11497 ADDQ BX, R8 11498 MOVSS (AX)(DI*4), X1 11499 MULSS X0, X1 11500 ADDSS (DX)(R8*4), X1 11501 MOVSS X1, (DX)(R8*4) 11502 ADDQ CX, DI 11503 ADDQ BX, R8 11504 MOVSS (AX)(DI*4), X1 11505 MULSS X0, X1 11506 ADDSS (DX)(R8*4), X1 11507 MOVSS X1, (DX)(R8*4) 11508 ADDQ CX, DI 11509 ADDQ BX, R8 11510 MOVSS (AX)(DI*4), X1 11511 MULSS X0, X1 11512 ADDSS (DX)(R8*4), X1 11513 MOVSS X1, (DX)(R8*4) 11514 ADDQ CX, DI 11515 ADDQ BX, R8 11516 SUBQ $0x08, SI 11517 11518 check_limit_unroll: 11519 CMPQ SI, $0x08 11520 JHI loop_unroll 11521 JMP check_limit 11522 11523 loop: 11524 MOVSS (AX)(DI*4), X1 11525 MULSS X0, X1 11526 ADDSS (DX)(R8*4), X1 11527 MOVSS X1, (DX)(R8*4) 11528 DECQ SI 11529 ADDQ CX, DI 11530 ADDQ BX, R8 11531 11532 check_limit: 11533 CMPQ SI, $0x00 11534 JHI loop 11535 RET 11536 11537 // func AmdAxpyUnsafeX_V1A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 11538 // Requires: SSE 11539 TEXT ·AmdAxpyUnsafeX_V1A8R8(SB), NOSPLIT, $0-48 11540 MOVSS alpha+0(FP), X0 11541 MOVQ xs+8(FP), AX 11542 MOVQ incx+16(FP), CX 11543 MOVQ ys+24(FP), DX 11544 MOVQ incy+32(FP), BX 11545 MOVQ n+40(FP), SI 11546 XORQ DI, DI 11547 XORQ R8, R8 11548 JMP check_limit_unroll 11549 PCALIGN $0x08 11550 11551 loop_unroll: 11552 MOVSS (AX)(DI*4), X1 11553 MULSS X0, X1 11554 ADDSS (DX)(R8*4), X1 11555 MOVSS X1, (DX)(R8*4) 11556 ADDQ CX, DI 11557 ADDQ BX, R8 11558 MOVSS (AX)(DI*4), X1 11559 MULSS X0, X1 11560 ADDSS (DX)(R8*4), X1 11561 MOVSS X1, (DX)(R8*4) 11562 ADDQ CX, DI 11563 ADDQ BX, R8 11564 MOVSS (AX)(DI*4), X1 11565 MULSS X0, X1 11566 ADDSS (DX)(R8*4), X1 11567 MOVSS X1, (DX)(R8*4) 11568 ADDQ CX, DI 11569 ADDQ BX, R8 11570 MOVSS (AX)(DI*4), X1 11571 MULSS X0, X1 11572 ADDSS (DX)(R8*4), X1 11573 MOVSS X1, (DX)(R8*4) 11574 ADDQ CX, DI 11575 ADDQ BX, R8 11576 MOVSS (AX)(DI*4), X1 11577 MULSS X0, X1 11578 ADDSS (DX)(R8*4), X1 11579 MOVSS X1, (DX)(R8*4) 11580 ADDQ CX, DI 11581 ADDQ BX, R8 11582 MOVSS (AX)(DI*4), X1 11583 MULSS X0, X1 11584 ADDSS (DX)(R8*4), X1 11585 MOVSS X1, (DX)(R8*4) 11586 ADDQ CX, DI 11587 ADDQ BX, R8 11588 MOVSS (AX)(DI*4), X1 11589 MULSS X0, X1 11590 ADDSS (DX)(R8*4), X1 11591 MOVSS X1, (DX)(R8*4) 11592 ADDQ CX, DI 11593 ADDQ BX, R8 11594 MOVSS (AX)(DI*4), X1 11595 MULSS X0, X1 11596 ADDSS (DX)(R8*4), X1 11597 MOVSS X1, (DX)(R8*4) 11598 ADDQ CX, DI 11599 ADDQ BX, R8 11600 SUBQ $0x08, SI 11601 11602 check_limit_unroll: 11603 CMPQ SI, $0x08 11604 JHI loop_unroll 11605 JMP check_limit 11606 11607 loop: 11608 MOVSS (AX)(DI*4), X1 11609 MULSS X0, X1 11610 ADDSS (DX)(R8*4), X1 11611 MOVSS X1, (DX)(R8*4) 11612 DECQ SI 11613 ADDQ CX, DI 11614 ADDQ BX, R8 11615 11616 check_limit: 11617 CMPQ SI, $0x00 11618 JHI loop 11619 RET 11620 11621 // func AmdAxpyUnsafeX_V2A8R8(alpha float32, xs *float32, incx 
uintptr, ys *float32, incy uintptr, n uintptr) 11622 // Requires: SSE 11623 TEXT ·AmdAxpyUnsafeX_V2A8R8(SB), NOSPLIT, $0-48 11624 MOVSS alpha+0(FP), X0 11625 MOVQ xs+8(FP), AX 11626 MOVQ incx+16(FP), CX 11627 MOVQ ys+24(FP), DX 11628 MOVQ incy+32(FP), BX 11629 MOVQ n+40(FP), SI 11630 XORQ DI, DI 11631 XORQ R8, R8 11632 JMP check_limit_unroll 11633 PCALIGN $0x08 11634 11635 loop_unroll: 11636 MOVSS (AX)(DI*4), X1 11637 MULSS X0, X1 11638 ADDSS (DX)(R8*4), X1 11639 MOVSS X1, (DX)(R8*4) 11640 ADDQ CX, DI 11641 ADDQ BX, R8 11642 MOVSS (AX)(DI*4), X1 11643 MULSS X0, X1 11644 ADDSS (DX)(R8*4), X1 11645 MOVSS X1, (DX)(R8*4) 11646 ADDQ CX, DI 11647 ADDQ BX, R8 11648 MOVSS (AX)(DI*4), X1 11649 MULSS X0, X1 11650 ADDSS (DX)(R8*4), X1 11651 MOVSS X1, (DX)(R8*4) 11652 ADDQ CX, DI 11653 ADDQ BX, R8 11654 MOVSS (AX)(DI*4), X1 11655 MULSS X0, X1 11656 ADDSS (DX)(R8*4), X1 11657 MOVSS X1, (DX)(R8*4) 11658 ADDQ CX, DI 11659 ADDQ BX, R8 11660 MOVSS (AX)(DI*4), X1 11661 MULSS X0, X1 11662 ADDSS (DX)(R8*4), X1 11663 MOVSS X1, (DX)(R8*4) 11664 ADDQ CX, DI 11665 ADDQ BX, R8 11666 MOVSS (AX)(DI*4), X1 11667 MULSS X0, X1 11668 ADDSS (DX)(R8*4), X1 11669 MOVSS X1, (DX)(R8*4) 11670 ADDQ CX, DI 11671 ADDQ BX, R8 11672 MOVSS (AX)(DI*4), X1 11673 MULSS X0, X1 11674 ADDSS (DX)(R8*4), X1 11675 MOVSS X1, (DX)(R8*4) 11676 ADDQ CX, DI 11677 ADDQ BX, R8 11678 MOVSS (AX)(DI*4), X1 11679 MULSS X0, X1 11680 ADDSS (DX)(R8*4), X1 11681 MOVSS X1, (DX)(R8*4) 11682 ADDQ CX, DI 11683 ADDQ BX, R8 11684 SUBQ $0x08, SI 11685 11686 check_limit_unroll: 11687 CMPQ SI, $0x08 11688 JHI loop_unroll 11689 JMP check_limit 11690 11691 loop: 11692 MOVSS (AX)(DI*4), X1 11693 MULSS X0, X1 11694 ADDSS (DX)(R8*4), X1 11695 MOVSS X1, (DX)(R8*4) 11696 DECQ SI 11697 ADDQ CX, DI 11698 ADDQ BX, R8 11699 11700 check_limit: 11701 CMPQ SI, $0x00 11702 JHI loop 11703 RET 11704 11705 // func AmdAxpyUnsafeX_V3A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 11706 // Requires: SSE 11707 TEXT ·AmdAxpyUnsafeX_V3A8R8(SB), NOSPLIT, $0-48 11708 MOVSS alpha+0(FP), X0 11709 MOVQ xs+8(FP), AX 11710 MOVQ incx+16(FP), CX 11711 MOVQ ys+24(FP), DX 11712 MOVQ incy+32(FP), BX 11713 MOVQ n+40(FP), SI 11714 XORQ DI, DI 11715 XORQ R8, R8 11716 JMP check_limit_unroll 11717 PCALIGN $0x08 11718 11719 loop_unroll: 11720 MOVSS (AX)(DI*4), X1 11721 MULSS X0, X1 11722 ADDSS (DX)(R8*4), X1 11723 MOVSS X1, (DX)(R8*4) 11724 ADDQ CX, DI 11725 ADDQ BX, R8 11726 MOVSS (AX)(DI*4), X1 11727 MULSS X0, X1 11728 ADDSS (DX)(R8*4), X1 11729 MOVSS X1, (DX)(R8*4) 11730 ADDQ CX, DI 11731 ADDQ BX, R8 11732 MOVSS (AX)(DI*4), X1 11733 MULSS X0, X1 11734 ADDSS (DX)(R8*4), X1 11735 MOVSS X1, (DX)(R8*4) 11736 ADDQ CX, DI 11737 ADDQ BX, R8 11738 MOVSS (AX)(DI*4), X1 11739 MULSS X0, X1 11740 ADDSS (DX)(R8*4), X1 11741 MOVSS X1, (DX)(R8*4) 11742 ADDQ CX, DI 11743 ADDQ BX, R8 11744 MOVSS (AX)(DI*4), X1 11745 MULSS X0, X1 11746 ADDSS (DX)(R8*4), X1 11747 MOVSS X1, (DX)(R8*4) 11748 ADDQ CX, DI 11749 ADDQ BX, R8 11750 MOVSS (AX)(DI*4), X1 11751 MULSS X0, X1 11752 ADDSS (DX)(R8*4), X1 11753 MOVSS X1, (DX)(R8*4) 11754 ADDQ CX, DI 11755 ADDQ BX, R8 11756 MOVSS (AX)(DI*4), X1 11757 MULSS X0, X1 11758 ADDSS (DX)(R8*4), X1 11759 MOVSS X1, (DX)(R8*4) 11760 ADDQ CX, DI 11761 ADDQ BX, R8 11762 MOVSS (AX)(DI*4), X1 11763 MULSS X0, X1 11764 ADDSS (DX)(R8*4), X1 11765 MOVSS X1, (DX)(R8*4) 11766 ADDQ CX, DI 11767 ADDQ BX, R8 11768 SUBQ $0x08, SI 11769 11770 check_limit_unroll: 11771 CMPQ SI, $0x08 11772 JHI loop_unroll 11773 JMP check_limit 11774 11775 loop: 11776 MOVSS (AX)(DI*4), X1 11777 
MULSS X0, X1 11778 ADDSS (DX)(R8*4), X1 11779 MOVSS X1, (DX)(R8*4) 11780 DECQ SI 11781 ADDQ CX, DI 11782 ADDQ BX, R8 11783 11784 check_limit: 11785 CMPQ SI, $0x00 11786 JHI loop 11787 RET 11788 11789 // func AmdAxpyUnsafeX_V4A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 11790 // Requires: SSE 11791 TEXT ·AmdAxpyUnsafeX_V4A8R8(SB), NOSPLIT, $0-48 11792 MOVSS alpha+0(FP), X0 11793 MOVQ xs+8(FP), AX 11794 MOVQ incx+16(FP), CX 11795 MOVQ ys+24(FP), DX 11796 MOVQ incy+32(FP), BX 11797 MOVQ n+40(FP), SI 11798 XORQ DI, DI 11799 XORQ R8, R8 11800 JMP check_limit_unroll 11801 PCALIGN $0x08 11802 11803 loop_unroll: 11804 MOVSS (AX)(DI*4), X1 11805 MULSS X0, X1 11806 ADDSS (DX)(R8*4), X1 11807 MOVSS X1, (DX)(R8*4) 11808 ADDQ CX, DI 11809 ADDQ BX, R8 11810 MOVSS (AX)(DI*4), X1 11811 MULSS X0, X1 11812 ADDSS (DX)(R8*4), X1 11813 MOVSS X1, (DX)(R8*4) 11814 ADDQ CX, DI 11815 ADDQ BX, R8 11816 MOVSS (AX)(DI*4), X1 11817 MULSS X0, X1 11818 ADDSS (DX)(R8*4), X1 11819 MOVSS X1, (DX)(R8*4) 11820 ADDQ CX, DI 11821 ADDQ BX, R8 11822 MOVSS (AX)(DI*4), X1 11823 MULSS X0, X1 11824 ADDSS (DX)(R8*4), X1 11825 MOVSS X1, (DX)(R8*4) 11826 ADDQ CX, DI 11827 ADDQ BX, R8 11828 MOVSS (AX)(DI*4), X1 11829 MULSS X0, X1 11830 ADDSS (DX)(R8*4), X1 11831 MOVSS X1, (DX)(R8*4) 11832 ADDQ CX, DI 11833 ADDQ BX, R8 11834 MOVSS (AX)(DI*4), X1 11835 MULSS X0, X1 11836 ADDSS (DX)(R8*4), X1 11837 MOVSS X1, (DX)(R8*4) 11838 ADDQ CX, DI 11839 ADDQ BX, R8 11840 MOVSS (AX)(DI*4), X1 11841 MULSS X0, X1 11842 ADDSS (DX)(R8*4), X1 11843 MOVSS X1, (DX)(R8*4) 11844 ADDQ CX, DI 11845 ADDQ BX, R8 11846 MOVSS (AX)(DI*4), X1 11847 MULSS X0, X1 11848 ADDSS (DX)(R8*4), X1 11849 MOVSS X1, (DX)(R8*4) 11850 ADDQ CX, DI 11851 ADDQ BX, R8 11852 SUBQ $0x08, SI 11853 11854 check_limit_unroll: 11855 CMPQ SI, $0x08 11856 JHI loop_unroll 11857 JMP check_limit 11858 11859 loop: 11860 MOVSS (AX)(DI*4), X1 11861 MULSS X0, X1 11862 ADDSS (DX)(R8*4), X1 11863 MOVSS X1, (DX)(R8*4) 11864 DECQ SI 11865 ADDQ CX, DI 11866 ADDQ BX, R8 11867 11868 check_limit: 11869 CMPQ SI, $0x00 11870 JHI loop 11871 RET 11872 11873 // func AmdAxpyUnsafeX_V5A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 11874 // Requires: SSE 11875 TEXT ·AmdAxpyUnsafeX_V5A8R8(SB), NOSPLIT, $0-48 11876 MOVSS alpha+0(FP), X0 11877 MOVQ xs+8(FP), AX 11878 MOVQ incx+16(FP), CX 11879 MOVQ ys+24(FP), DX 11880 MOVQ incy+32(FP), BX 11881 MOVQ n+40(FP), SI 11882 XORQ DI, DI 11883 XORQ R8, R8 11884 JMP check_limit_unroll 11885 PCALIGN $0x08 11886 11887 loop_unroll: 11888 MOVSS (AX)(DI*4), X1 11889 MULSS X0, X1 11890 ADDSS (DX)(R8*4), X1 11891 MOVSS X1, (DX)(R8*4) 11892 ADDQ CX, DI 11893 ADDQ BX, R8 11894 MOVSS (AX)(DI*4), X1 11895 MULSS X0, X1 11896 ADDSS (DX)(R8*4), X1 11897 MOVSS X1, (DX)(R8*4) 11898 ADDQ CX, DI 11899 ADDQ BX, R8 11900 MOVSS (AX)(DI*4), X1 11901 MULSS X0, X1 11902 ADDSS (DX)(R8*4), X1 11903 MOVSS X1, (DX)(R8*4) 11904 ADDQ CX, DI 11905 ADDQ BX, R8 11906 MOVSS (AX)(DI*4), X1 11907 MULSS X0, X1 11908 ADDSS (DX)(R8*4), X1 11909 MOVSS X1, (DX)(R8*4) 11910 ADDQ CX, DI 11911 ADDQ BX, R8 11912 MOVSS (AX)(DI*4), X1 11913 MULSS X0, X1 11914 ADDSS (DX)(R8*4), X1 11915 MOVSS X1, (DX)(R8*4) 11916 ADDQ CX, DI 11917 ADDQ BX, R8 11918 MOVSS (AX)(DI*4), X1 11919 MULSS X0, X1 11920 ADDSS (DX)(R8*4), X1 11921 MOVSS X1, (DX)(R8*4) 11922 ADDQ CX, DI 11923 ADDQ BX, R8 11924 MOVSS (AX)(DI*4), X1 11925 MULSS X0, X1 11926 ADDSS (DX)(R8*4), X1 11927 MOVSS X1, (DX)(R8*4) 11928 ADDQ CX, DI 11929 ADDQ BX, R8 11930 MOVSS (AX)(DI*4), X1 11931 MULSS 
X0, X1 11932 ADDSS (DX)(R8*4), X1 11933 MOVSS X1, (DX)(R8*4) 11934 ADDQ CX, DI 11935 ADDQ BX, R8 11936 SUBQ $0x08, SI 11937 11938 check_limit_unroll: 11939 CMPQ SI, $0x08 11940 JHI loop_unroll 11941 JMP check_limit 11942 11943 loop: 11944 MOVSS (AX)(DI*4), X1 11945 MULSS X0, X1 11946 ADDSS (DX)(R8*4), X1 11947 MOVSS X1, (DX)(R8*4) 11948 DECQ SI 11949 ADDQ CX, DI 11950 ADDQ BX, R8 11951 11952 check_limit: 11953 CMPQ SI, $0x00 11954 JHI loop 11955 RET 11956 11957 // func AmdAxpyUnsafeX_V0A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 11958 // Requires: SSE 11959 TEXT ·AmdAxpyUnsafeX_V0A9R8(SB), NOSPLIT, $0-48 11960 MOVSS alpha+0(FP), X0 11961 MOVQ xs+8(FP), AX 11962 MOVQ incx+16(FP), CX 11963 MOVQ ys+24(FP), DX 11964 MOVQ incy+32(FP), BX 11965 MOVQ n+40(FP), SI 11966 XORQ DI, DI 11967 XORQ R8, R8 11968 JMP check_limit_unroll 11969 PCALIGN $0x08 11970 NOP 11971 11972 loop_unroll: 11973 MOVSS (AX)(DI*4), X1 11974 MULSS X0, X1 11975 ADDSS (DX)(R8*4), X1 11976 MOVSS X1, (DX)(R8*4) 11977 ADDQ CX, DI 11978 ADDQ BX, R8 11979 MOVSS (AX)(DI*4), X1 11980 MULSS X0, X1 11981 ADDSS (DX)(R8*4), X1 11982 MOVSS X1, (DX)(R8*4) 11983 ADDQ CX, DI 11984 ADDQ BX, R8 11985 MOVSS (AX)(DI*4), X1 11986 MULSS X0, X1 11987 ADDSS (DX)(R8*4), X1 11988 MOVSS X1, (DX)(R8*4) 11989 ADDQ CX, DI 11990 ADDQ BX, R8 11991 MOVSS (AX)(DI*4), X1 11992 MULSS X0, X1 11993 ADDSS (DX)(R8*4), X1 11994 MOVSS X1, (DX)(R8*4) 11995 ADDQ CX, DI 11996 ADDQ BX, R8 11997 MOVSS (AX)(DI*4), X1 11998 MULSS X0, X1 11999 ADDSS (DX)(R8*4), X1 12000 MOVSS X1, (DX)(R8*4) 12001 ADDQ CX, DI 12002 ADDQ BX, R8 12003 MOVSS (AX)(DI*4), X1 12004 MULSS X0, X1 12005 ADDSS (DX)(R8*4), X1 12006 MOVSS X1, (DX)(R8*4) 12007 ADDQ CX, DI 12008 ADDQ BX, R8 12009 MOVSS (AX)(DI*4), X1 12010 MULSS X0, X1 12011 ADDSS (DX)(R8*4), X1 12012 MOVSS X1, (DX)(R8*4) 12013 ADDQ CX, DI 12014 ADDQ BX, R8 12015 MOVSS (AX)(DI*4), X1 12016 MULSS X0, X1 12017 ADDSS (DX)(R8*4), X1 12018 MOVSS X1, (DX)(R8*4) 12019 ADDQ CX, DI 12020 ADDQ BX, R8 12021 SUBQ $0x08, SI 12022 12023 check_limit_unroll: 12024 CMPQ SI, $0x08 12025 JHI loop_unroll 12026 JMP check_limit 12027 12028 loop: 12029 MOVSS (AX)(DI*4), X1 12030 MULSS X0, X1 12031 ADDSS (DX)(R8*4), X1 12032 MOVSS X1, (DX)(R8*4) 12033 DECQ SI 12034 ADDQ CX, DI 12035 ADDQ BX, R8 12036 12037 check_limit: 12038 CMPQ SI, $0x00 12039 JHI loop 12040 RET 12041 12042 // func AmdAxpyUnsafeX_V1A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12043 // Requires: SSE 12044 TEXT ·AmdAxpyUnsafeX_V1A9R8(SB), NOSPLIT, $0-48 12045 MOVSS alpha+0(FP), X0 12046 MOVQ xs+8(FP), AX 12047 MOVQ incx+16(FP), CX 12048 MOVQ ys+24(FP), DX 12049 MOVQ incy+32(FP), BX 12050 MOVQ n+40(FP), SI 12051 XORQ DI, DI 12052 XORQ R8, R8 12053 JMP check_limit_unroll 12054 PCALIGN $0x08 12055 NOP 12056 12057 loop_unroll: 12058 MOVSS (AX)(DI*4), X1 12059 MULSS X0, X1 12060 ADDSS (DX)(R8*4), X1 12061 MOVSS X1, (DX)(R8*4) 12062 ADDQ CX, DI 12063 ADDQ BX, R8 12064 MOVSS (AX)(DI*4), X1 12065 MULSS X0, X1 12066 ADDSS (DX)(R8*4), X1 12067 MOVSS X1, (DX)(R8*4) 12068 ADDQ CX, DI 12069 ADDQ BX, R8 12070 MOVSS (AX)(DI*4), X1 12071 MULSS X0, X1 12072 ADDSS (DX)(R8*4), X1 12073 MOVSS X1, (DX)(R8*4) 12074 ADDQ CX, DI 12075 ADDQ BX, R8 12076 MOVSS (AX)(DI*4), X1 12077 MULSS X0, X1 12078 ADDSS (DX)(R8*4), X1 12079 MOVSS X1, (DX)(R8*4) 12080 ADDQ CX, DI 12081 ADDQ BX, R8 12082 MOVSS (AX)(DI*4), X1 12083 MULSS X0, X1 12084 ADDSS (DX)(R8*4), X1 12085 MOVSS X1, (DX)(R8*4) 12086 ADDQ CX, DI 12087 ADDQ BX, R8 12088 MOVSS (AX)(DI*4), X1 
12089 MULSS X0, X1 12090 ADDSS (DX)(R8*4), X1 12091 MOVSS X1, (DX)(R8*4) 12092 ADDQ CX, DI 12093 ADDQ BX, R8 12094 MOVSS (AX)(DI*4), X1 12095 MULSS X0, X1 12096 ADDSS (DX)(R8*4), X1 12097 MOVSS X1, (DX)(R8*4) 12098 ADDQ CX, DI 12099 ADDQ BX, R8 12100 MOVSS (AX)(DI*4), X1 12101 MULSS X0, X1 12102 ADDSS (DX)(R8*4), X1 12103 MOVSS X1, (DX)(R8*4) 12104 ADDQ CX, DI 12105 ADDQ BX, R8 12106 SUBQ $0x08, SI 12107 12108 check_limit_unroll: 12109 CMPQ SI, $0x08 12110 JHI loop_unroll 12111 JMP check_limit 12112 12113 loop: 12114 MOVSS (AX)(DI*4), X1 12115 MULSS X0, X1 12116 ADDSS (DX)(R8*4), X1 12117 MOVSS X1, (DX)(R8*4) 12118 DECQ SI 12119 ADDQ CX, DI 12120 ADDQ BX, R8 12121 12122 check_limit: 12123 CMPQ SI, $0x00 12124 JHI loop 12125 RET 12126 12127 // func AmdAxpyUnsafeX_V2A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12128 // Requires: SSE 12129 TEXT ·AmdAxpyUnsafeX_V2A9R8(SB), NOSPLIT, $0-48 12130 MOVSS alpha+0(FP), X0 12131 MOVQ xs+8(FP), AX 12132 MOVQ incx+16(FP), CX 12133 MOVQ ys+24(FP), DX 12134 MOVQ incy+32(FP), BX 12135 MOVQ n+40(FP), SI 12136 XORQ DI, DI 12137 XORQ R8, R8 12138 JMP check_limit_unroll 12139 PCALIGN $0x08 12140 NOP 12141 12142 loop_unroll: 12143 MOVSS (AX)(DI*4), X1 12144 MULSS X0, X1 12145 ADDSS (DX)(R8*4), X1 12146 MOVSS X1, (DX)(R8*4) 12147 ADDQ CX, DI 12148 ADDQ BX, R8 12149 MOVSS (AX)(DI*4), X1 12150 MULSS X0, X1 12151 ADDSS (DX)(R8*4), X1 12152 MOVSS X1, (DX)(R8*4) 12153 ADDQ CX, DI 12154 ADDQ BX, R8 12155 MOVSS (AX)(DI*4), X1 12156 MULSS X0, X1 12157 ADDSS (DX)(R8*4), X1 12158 MOVSS X1, (DX)(R8*4) 12159 ADDQ CX, DI 12160 ADDQ BX, R8 12161 MOVSS (AX)(DI*4), X1 12162 MULSS X0, X1 12163 ADDSS (DX)(R8*4), X1 12164 MOVSS X1, (DX)(R8*4) 12165 ADDQ CX, DI 12166 ADDQ BX, R8 12167 MOVSS (AX)(DI*4), X1 12168 MULSS X0, X1 12169 ADDSS (DX)(R8*4), X1 12170 MOVSS X1, (DX)(R8*4) 12171 ADDQ CX, DI 12172 ADDQ BX, R8 12173 MOVSS (AX)(DI*4), X1 12174 MULSS X0, X1 12175 ADDSS (DX)(R8*4), X1 12176 MOVSS X1, (DX)(R8*4) 12177 ADDQ CX, DI 12178 ADDQ BX, R8 12179 MOVSS (AX)(DI*4), X1 12180 MULSS X0, X1 12181 ADDSS (DX)(R8*4), X1 12182 MOVSS X1, (DX)(R8*4) 12183 ADDQ CX, DI 12184 ADDQ BX, R8 12185 MOVSS (AX)(DI*4), X1 12186 MULSS X0, X1 12187 ADDSS (DX)(R8*4), X1 12188 MOVSS X1, (DX)(R8*4) 12189 ADDQ CX, DI 12190 ADDQ BX, R8 12191 SUBQ $0x08, SI 12192 12193 check_limit_unroll: 12194 CMPQ SI, $0x08 12195 JHI loop_unroll 12196 JMP check_limit 12197 12198 loop: 12199 MOVSS (AX)(DI*4), X1 12200 MULSS X0, X1 12201 ADDSS (DX)(R8*4), X1 12202 MOVSS X1, (DX)(R8*4) 12203 DECQ SI 12204 ADDQ CX, DI 12205 ADDQ BX, R8 12206 12207 check_limit: 12208 CMPQ SI, $0x00 12209 JHI loop 12210 RET 12211 12212 // func AmdAxpyUnsafeX_V3A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12213 // Requires: SSE 12214 TEXT ·AmdAxpyUnsafeX_V3A9R8(SB), NOSPLIT, $0-48 12215 MOVSS alpha+0(FP), X0 12216 MOVQ xs+8(FP), AX 12217 MOVQ incx+16(FP), CX 12218 MOVQ ys+24(FP), DX 12219 MOVQ incy+32(FP), BX 12220 MOVQ n+40(FP), SI 12221 XORQ DI, DI 12222 XORQ R8, R8 12223 JMP check_limit_unroll 12224 PCALIGN $0x08 12225 NOP 12226 12227 loop_unroll: 12228 MOVSS (AX)(DI*4), X1 12229 MULSS X0, X1 12230 ADDSS (DX)(R8*4), X1 12231 MOVSS X1, (DX)(R8*4) 12232 ADDQ CX, DI 12233 ADDQ BX, R8 12234 MOVSS (AX)(DI*4), X1 12235 MULSS X0, X1 12236 ADDSS (DX)(R8*4), X1 12237 MOVSS X1, (DX)(R8*4) 12238 ADDQ CX, DI 12239 ADDQ BX, R8 12240 MOVSS (AX)(DI*4), X1 12241 MULSS X0, X1 12242 ADDSS (DX)(R8*4), X1 12243 MOVSS X1, (DX)(R8*4) 12244 ADDQ CX, DI 12245 ADDQ BX, R8 12246 MOVSS 
(AX)(DI*4), X1 12247 MULSS X0, X1 12248 ADDSS (DX)(R8*4), X1 12249 MOVSS X1, (DX)(R8*4) 12250 ADDQ CX, DI 12251 ADDQ BX, R8 12252 MOVSS (AX)(DI*4), X1 12253 MULSS X0, X1 12254 ADDSS (DX)(R8*4), X1 12255 MOVSS X1, (DX)(R8*4) 12256 ADDQ CX, DI 12257 ADDQ BX, R8 12258 MOVSS (AX)(DI*4), X1 12259 MULSS X0, X1 12260 ADDSS (DX)(R8*4), X1 12261 MOVSS X1, (DX)(R8*4) 12262 ADDQ CX, DI 12263 ADDQ BX, R8 12264 MOVSS (AX)(DI*4), X1 12265 MULSS X0, X1 12266 ADDSS (DX)(R8*4), X1 12267 MOVSS X1, (DX)(R8*4) 12268 ADDQ CX, DI 12269 ADDQ BX, R8 12270 MOVSS (AX)(DI*4), X1 12271 MULSS X0, X1 12272 ADDSS (DX)(R8*4), X1 12273 MOVSS X1, (DX)(R8*4) 12274 ADDQ CX, DI 12275 ADDQ BX, R8 12276 SUBQ $0x08, SI 12277 12278 check_limit_unroll: 12279 CMPQ SI, $0x08 12280 JHI loop_unroll 12281 JMP check_limit 12282 12283 loop: 12284 MOVSS (AX)(DI*4), X1 12285 MULSS X0, X1 12286 ADDSS (DX)(R8*4), X1 12287 MOVSS X1, (DX)(R8*4) 12288 DECQ SI 12289 ADDQ CX, DI 12290 ADDQ BX, R8 12291 12292 check_limit: 12293 CMPQ SI, $0x00 12294 JHI loop 12295 RET 12296 12297 // func AmdAxpyUnsafeX_V4A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12298 // Requires: SSE 12299 TEXT ·AmdAxpyUnsafeX_V4A9R8(SB), NOSPLIT, $0-48 12300 MOVSS alpha+0(FP), X0 12301 MOVQ xs+8(FP), AX 12302 MOVQ incx+16(FP), CX 12303 MOVQ ys+24(FP), DX 12304 MOVQ incy+32(FP), BX 12305 MOVQ n+40(FP), SI 12306 XORQ DI, DI 12307 XORQ R8, R8 12308 JMP check_limit_unroll 12309 PCALIGN $0x08 12310 NOP 12311 12312 loop_unroll: 12313 MOVSS (AX)(DI*4), X1 12314 MULSS X0, X1 12315 ADDSS (DX)(R8*4), X1 12316 MOVSS X1, (DX)(R8*4) 12317 ADDQ CX, DI 12318 ADDQ BX, R8 12319 MOVSS (AX)(DI*4), X1 12320 MULSS X0, X1 12321 ADDSS (DX)(R8*4), X1 12322 MOVSS X1, (DX)(R8*4) 12323 ADDQ CX, DI 12324 ADDQ BX, R8 12325 MOVSS (AX)(DI*4), X1 12326 MULSS X0, X1 12327 ADDSS (DX)(R8*4), X1 12328 MOVSS X1, (DX)(R8*4) 12329 ADDQ CX, DI 12330 ADDQ BX, R8 12331 MOVSS (AX)(DI*4), X1 12332 MULSS X0, X1 12333 ADDSS (DX)(R8*4), X1 12334 MOVSS X1, (DX)(R8*4) 12335 ADDQ CX, DI 12336 ADDQ BX, R8 12337 MOVSS (AX)(DI*4), X1 12338 MULSS X0, X1 12339 ADDSS (DX)(R8*4), X1 12340 MOVSS X1, (DX)(R8*4) 12341 ADDQ CX, DI 12342 ADDQ BX, R8 12343 MOVSS (AX)(DI*4), X1 12344 MULSS X0, X1 12345 ADDSS (DX)(R8*4), X1 12346 MOVSS X1, (DX)(R8*4) 12347 ADDQ CX, DI 12348 ADDQ BX, R8 12349 MOVSS (AX)(DI*4), X1 12350 MULSS X0, X1 12351 ADDSS (DX)(R8*4), X1 12352 MOVSS X1, (DX)(R8*4) 12353 ADDQ CX, DI 12354 ADDQ BX, R8 12355 MOVSS (AX)(DI*4), X1 12356 MULSS X0, X1 12357 ADDSS (DX)(R8*4), X1 12358 MOVSS X1, (DX)(R8*4) 12359 ADDQ CX, DI 12360 ADDQ BX, R8 12361 SUBQ $0x08, SI 12362 12363 check_limit_unroll: 12364 CMPQ SI, $0x08 12365 JHI loop_unroll 12366 JMP check_limit 12367 12368 loop: 12369 MOVSS (AX)(DI*4), X1 12370 MULSS X0, X1 12371 ADDSS (DX)(R8*4), X1 12372 MOVSS X1, (DX)(R8*4) 12373 DECQ SI 12374 ADDQ CX, DI 12375 ADDQ BX, R8 12376 12377 check_limit: 12378 CMPQ SI, $0x00 12379 JHI loop 12380 RET 12381 12382 // func AmdAxpyUnsafeX_V5A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12383 // Requires: SSE 12384 TEXT ·AmdAxpyUnsafeX_V5A9R8(SB), NOSPLIT, $0-48 12385 MOVSS alpha+0(FP), X0 12386 MOVQ xs+8(FP), AX 12387 MOVQ incx+16(FP), CX 12388 MOVQ ys+24(FP), DX 12389 MOVQ incy+32(FP), BX 12390 MOVQ n+40(FP), SI 12391 XORQ DI, DI 12392 XORQ R8, R8 12393 JMP check_limit_unroll 12394 PCALIGN $0x08 12395 NOP 12396 12397 loop_unroll: 12398 MOVSS (AX)(DI*4), X1 12399 MULSS X0, X1 12400 ADDSS (DX)(R8*4), X1 12401 MOVSS X1, (DX)(R8*4) 12402 ADDQ CX, DI 12403 ADDQ BX, R8 
12404 MOVSS (AX)(DI*4), X1 12405 MULSS X0, X1 12406 ADDSS (DX)(R8*4), X1 12407 MOVSS X1, (DX)(R8*4) 12408 ADDQ CX, DI 12409 ADDQ BX, R8 12410 MOVSS (AX)(DI*4), X1 12411 MULSS X0, X1 12412 ADDSS (DX)(R8*4), X1 12413 MOVSS X1, (DX)(R8*4) 12414 ADDQ CX, DI 12415 ADDQ BX, R8 12416 MOVSS (AX)(DI*4), X1 12417 MULSS X0, X1 12418 ADDSS (DX)(R8*4), X1 12419 MOVSS X1, (DX)(R8*4) 12420 ADDQ CX, DI 12421 ADDQ BX, R8 12422 MOVSS (AX)(DI*4), X1 12423 MULSS X0, X1 12424 ADDSS (DX)(R8*4), X1 12425 MOVSS X1, (DX)(R8*4) 12426 ADDQ CX, DI 12427 ADDQ BX, R8 12428 MOVSS (AX)(DI*4), X1 12429 MULSS X0, X1 12430 ADDSS (DX)(R8*4), X1 12431 MOVSS X1, (DX)(R8*4) 12432 ADDQ CX, DI 12433 ADDQ BX, R8 12434 MOVSS (AX)(DI*4), X1 12435 MULSS X0, X1 12436 ADDSS (DX)(R8*4), X1 12437 MOVSS X1, (DX)(R8*4) 12438 ADDQ CX, DI 12439 ADDQ BX, R8 12440 MOVSS (AX)(DI*4), X1 12441 MULSS X0, X1 12442 ADDSS (DX)(R8*4), X1 12443 MOVSS X1, (DX)(R8*4) 12444 ADDQ CX, DI 12445 ADDQ BX, R8 12446 SUBQ $0x08, SI 12447 12448 check_limit_unroll: 12449 CMPQ SI, $0x08 12450 JHI loop_unroll 12451 JMP check_limit 12452 12453 loop: 12454 MOVSS (AX)(DI*4), X1 12455 MULSS X0, X1 12456 ADDSS (DX)(R8*4), X1 12457 MOVSS X1, (DX)(R8*4) 12458 DECQ SI 12459 ADDQ CX, DI 12460 ADDQ BX, R8 12461 12462 check_limit: 12463 CMPQ SI, $0x00 12464 JHI loop 12465 RET 12466 12467 // func AmdAxpyUnsafeX_V0A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12468 // Requires: SSE 12469 TEXT ·AmdAxpyUnsafeX_V0A10R8(SB), NOSPLIT, $0-48 12470 MOVSS alpha+0(FP), X0 12471 MOVQ xs+8(FP), AX 12472 MOVQ incx+16(FP), CX 12473 MOVQ ys+24(FP), DX 12474 MOVQ incy+32(FP), BX 12475 MOVQ n+40(FP), SI 12476 XORQ DI, DI 12477 XORQ R8, R8 12478 JMP check_limit_unroll 12479 PCALIGN $0x08 12480 NOP 12481 NOP 12482 12483 loop_unroll: 12484 MOVSS (AX)(DI*4), X1 12485 MULSS X0, X1 12486 ADDSS (DX)(R8*4), X1 12487 MOVSS X1, (DX)(R8*4) 12488 ADDQ CX, DI 12489 ADDQ BX, R8 12490 MOVSS (AX)(DI*4), X1 12491 MULSS X0, X1 12492 ADDSS (DX)(R8*4), X1 12493 MOVSS X1, (DX)(R8*4) 12494 ADDQ CX, DI 12495 ADDQ BX, R8 12496 MOVSS (AX)(DI*4), X1 12497 MULSS X0, X1 12498 ADDSS (DX)(R8*4), X1 12499 MOVSS X1, (DX)(R8*4) 12500 ADDQ CX, DI 12501 ADDQ BX, R8 12502 MOVSS (AX)(DI*4), X1 12503 MULSS X0, X1 12504 ADDSS (DX)(R8*4), X1 12505 MOVSS X1, (DX)(R8*4) 12506 ADDQ CX, DI 12507 ADDQ BX, R8 12508 MOVSS (AX)(DI*4), X1 12509 MULSS X0, X1 12510 ADDSS (DX)(R8*4), X1 12511 MOVSS X1, (DX)(R8*4) 12512 ADDQ CX, DI 12513 ADDQ BX, R8 12514 MOVSS (AX)(DI*4), X1 12515 MULSS X0, X1 12516 ADDSS (DX)(R8*4), X1 12517 MOVSS X1, (DX)(R8*4) 12518 ADDQ CX, DI 12519 ADDQ BX, R8 12520 MOVSS (AX)(DI*4), X1 12521 MULSS X0, X1 12522 ADDSS (DX)(R8*4), X1 12523 MOVSS X1, (DX)(R8*4) 12524 ADDQ CX, DI 12525 ADDQ BX, R8 12526 MOVSS (AX)(DI*4), X1 12527 MULSS X0, X1 12528 ADDSS (DX)(R8*4), X1 12529 MOVSS X1, (DX)(R8*4) 12530 ADDQ CX, DI 12531 ADDQ BX, R8 12532 SUBQ $0x08, SI 12533 12534 check_limit_unroll: 12535 CMPQ SI, $0x08 12536 JHI loop_unroll 12537 JMP check_limit 12538 12539 loop: 12540 MOVSS (AX)(DI*4), X1 12541 MULSS X0, X1 12542 ADDSS (DX)(R8*4), X1 12543 MOVSS X1, (DX)(R8*4) 12544 DECQ SI 12545 ADDQ CX, DI 12546 ADDQ BX, R8 12547 12548 check_limit: 12549 CMPQ SI, $0x00 12550 JHI loop 12551 RET 12552 12553 // func AmdAxpyUnsafeX_V1A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12554 // Requires: SSE 12555 TEXT ·AmdAxpyUnsafeX_V1A10R8(SB), NOSPLIT, $0-48 12556 MOVSS alpha+0(FP), X0 12557 MOVQ xs+8(FP), AX 12558 MOVQ incx+16(FP), CX 12559 MOVQ ys+24(FP), DX 12560 
MOVQ incy+32(FP), BX 12561 MOVQ n+40(FP), SI 12562 XORQ DI, DI 12563 XORQ R8, R8 12564 JMP check_limit_unroll 12565 PCALIGN $0x08 12566 NOP 12567 NOP 12568 12569 loop_unroll: 12570 MOVSS (AX)(DI*4), X1 12571 MULSS X0, X1 12572 ADDSS (DX)(R8*4), X1 12573 MOVSS X1, (DX)(R8*4) 12574 ADDQ CX, DI 12575 ADDQ BX, R8 12576 MOVSS (AX)(DI*4), X1 12577 MULSS X0, X1 12578 ADDSS (DX)(R8*4), X1 12579 MOVSS X1, (DX)(R8*4) 12580 ADDQ CX, DI 12581 ADDQ BX, R8 12582 MOVSS (AX)(DI*4), X1 12583 MULSS X0, X1 12584 ADDSS (DX)(R8*4), X1 12585 MOVSS X1, (DX)(R8*4) 12586 ADDQ CX, DI 12587 ADDQ BX, R8 12588 MOVSS (AX)(DI*4), X1 12589 MULSS X0, X1 12590 ADDSS (DX)(R8*4), X1 12591 MOVSS X1, (DX)(R8*4) 12592 ADDQ CX, DI 12593 ADDQ BX, R8 12594 MOVSS (AX)(DI*4), X1 12595 MULSS X0, X1 12596 ADDSS (DX)(R8*4), X1 12597 MOVSS X1, (DX)(R8*4) 12598 ADDQ CX, DI 12599 ADDQ BX, R8 12600 MOVSS (AX)(DI*4), X1 12601 MULSS X0, X1 12602 ADDSS (DX)(R8*4), X1 12603 MOVSS X1, (DX)(R8*4) 12604 ADDQ CX, DI 12605 ADDQ BX, R8 12606 MOVSS (AX)(DI*4), X1 12607 MULSS X0, X1 12608 ADDSS (DX)(R8*4), X1 12609 MOVSS X1, (DX)(R8*4) 12610 ADDQ CX, DI 12611 ADDQ BX, R8 12612 MOVSS (AX)(DI*4), X1 12613 MULSS X0, X1 12614 ADDSS (DX)(R8*4), X1 12615 MOVSS X1, (DX)(R8*4) 12616 ADDQ CX, DI 12617 ADDQ BX, R8 12618 SUBQ $0x08, SI 12619 12620 check_limit_unroll: 12621 CMPQ SI, $0x08 12622 JHI loop_unroll 12623 JMP check_limit 12624 12625 loop: 12626 MOVSS (AX)(DI*4), X1 12627 MULSS X0, X1 12628 ADDSS (DX)(R8*4), X1 12629 MOVSS X1, (DX)(R8*4) 12630 DECQ SI 12631 ADDQ CX, DI 12632 ADDQ BX, R8 12633 12634 check_limit: 12635 CMPQ SI, $0x00 12636 JHI loop 12637 RET 12638 12639 // func AmdAxpyUnsafeX_V2A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12640 // Requires: SSE 12641 TEXT ·AmdAxpyUnsafeX_V2A10R8(SB), NOSPLIT, $0-48 12642 MOVSS alpha+0(FP), X0 12643 MOVQ xs+8(FP), AX 12644 MOVQ incx+16(FP), CX 12645 MOVQ ys+24(FP), DX 12646 MOVQ incy+32(FP), BX 12647 MOVQ n+40(FP), SI 12648 XORQ DI, DI 12649 XORQ R8, R8 12650 JMP check_limit_unroll 12651 PCALIGN $0x08 12652 NOP 12653 NOP 12654 12655 loop_unroll: 12656 MOVSS (AX)(DI*4), X1 12657 MULSS X0, X1 12658 ADDSS (DX)(R8*4), X1 12659 MOVSS X1, (DX)(R8*4) 12660 ADDQ CX, DI 12661 ADDQ BX, R8 12662 MOVSS (AX)(DI*4), X1 12663 MULSS X0, X1 12664 ADDSS (DX)(R8*4), X1 12665 MOVSS X1, (DX)(R8*4) 12666 ADDQ CX, DI 12667 ADDQ BX, R8 12668 MOVSS (AX)(DI*4), X1 12669 MULSS X0, X1 12670 ADDSS (DX)(R8*4), X1 12671 MOVSS X1, (DX)(R8*4) 12672 ADDQ CX, DI 12673 ADDQ BX, R8 12674 MOVSS (AX)(DI*4), X1 12675 MULSS X0, X1 12676 ADDSS (DX)(R8*4), X1 12677 MOVSS X1, (DX)(R8*4) 12678 ADDQ CX, DI 12679 ADDQ BX, R8 12680 MOVSS (AX)(DI*4), X1 12681 MULSS X0, X1 12682 ADDSS (DX)(R8*4), X1 12683 MOVSS X1, (DX)(R8*4) 12684 ADDQ CX, DI 12685 ADDQ BX, R8 12686 MOVSS (AX)(DI*4), X1 12687 MULSS X0, X1 12688 ADDSS (DX)(R8*4), X1 12689 MOVSS X1, (DX)(R8*4) 12690 ADDQ CX, DI 12691 ADDQ BX, R8 12692 MOVSS (AX)(DI*4), X1 12693 MULSS X0, X1 12694 ADDSS (DX)(R8*4), X1 12695 MOVSS X1, (DX)(R8*4) 12696 ADDQ CX, DI 12697 ADDQ BX, R8 12698 MOVSS (AX)(DI*4), X1 12699 MULSS X0, X1 12700 ADDSS (DX)(R8*4), X1 12701 MOVSS X1, (DX)(R8*4) 12702 ADDQ CX, DI 12703 ADDQ BX, R8 12704 SUBQ $0x08, SI 12705 12706 check_limit_unroll: 12707 CMPQ SI, $0x08 12708 JHI loop_unroll 12709 JMP check_limit 12710 12711 loop: 12712 MOVSS (AX)(DI*4), X1 12713 MULSS X0, X1 12714 ADDSS (DX)(R8*4), X1 12715 MOVSS X1, (DX)(R8*4) 12716 DECQ SI 12717 ADDQ CX, DI 12718 ADDQ BX, R8 12719 12720 check_limit: 12721 CMPQ SI, $0x00 12722 JHI loop 12723 RET 
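// Note: every AmdAxpyUnsafeX_* kernel in this file computes the same strided,
// single-precision AXPY update, ys[i*incy] += alpha * xs[i*incx] for i = 0..n-1;
// the V/A/R suffixes appear to vary only code placement: the variant index, the
// PCALIGN/NOP padding before the unrolled loop, and the unroll factor (R4 = 4x,
// R8 = 8x). A minimal Go sketch of the scalar reference loop, assuming the same
// pointer-and-stride convention as the generated stubs (axpyRef is a hypothetical
// name and not part of the generated API):
//
//	import "unsafe"
//
//	func axpyRef(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) {
//		for i := uintptr(0); i < n; i++ {
//			// xs[i*incx] and ys[i*incy], with 4 bytes per float32 element
//			px := (*float32)(unsafe.Add(unsafe.Pointer(xs), i*incx*4))
//			py := (*float32)(unsafe.Add(unsafe.Pointer(ys), i*incy*4))
//			*py += alpha * *px
//		}
//	}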
12724 12725 // func AmdAxpyUnsafeX_V3A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12726 // Requires: SSE 12727 TEXT ·AmdAxpyUnsafeX_V3A10R8(SB), NOSPLIT, $0-48 12728 MOVSS alpha+0(FP), X0 12729 MOVQ xs+8(FP), AX 12730 MOVQ incx+16(FP), CX 12731 MOVQ ys+24(FP), DX 12732 MOVQ incy+32(FP), BX 12733 MOVQ n+40(FP), SI 12734 XORQ DI, DI 12735 XORQ R8, R8 12736 JMP check_limit_unroll 12737 PCALIGN $0x08 12738 NOP 12739 NOP 12740 12741 loop_unroll: 12742 MOVSS (AX)(DI*4), X1 12743 MULSS X0, X1 12744 ADDSS (DX)(R8*4), X1 12745 MOVSS X1, (DX)(R8*4) 12746 ADDQ CX, DI 12747 ADDQ BX, R8 12748 MOVSS (AX)(DI*4), X1 12749 MULSS X0, X1 12750 ADDSS (DX)(R8*4), X1 12751 MOVSS X1, (DX)(R8*4) 12752 ADDQ CX, DI 12753 ADDQ BX, R8 12754 MOVSS (AX)(DI*4), X1 12755 MULSS X0, X1 12756 ADDSS (DX)(R8*4), X1 12757 MOVSS X1, (DX)(R8*4) 12758 ADDQ CX, DI 12759 ADDQ BX, R8 12760 MOVSS (AX)(DI*4), X1 12761 MULSS X0, X1 12762 ADDSS (DX)(R8*4), X1 12763 MOVSS X1, (DX)(R8*4) 12764 ADDQ CX, DI 12765 ADDQ BX, R8 12766 MOVSS (AX)(DI*4), X1 12767 MULSS X0, X1 12768 ADDSS (DX)(R8*4), X1 12769 MOVSS X1, (DX)(R8*4) 12770 ADDQ CX, DI 12771 ADDQ BX, R8 12772 MOVSS (AX)(DI*4), X1 12773 MULSS X0, X1 12774 ADDSS (DX)(R8*4), X1 12775 MOVSS X1, (DX)(R8*4) 12776 ADDQ CX, DI 12777 ADDQ BX, R8 12778 MOVSS (AX)(DI*4), X1 12779 MULSS X0, X1 12780 ADDSS (DX)(R8*4), X1 12781 MOVSS X1, (DX)(R8*4) 12782 ADDQ CX, DI 12783 ADDQ BX, R8 12784 MOVSS (AX)(DI*4), X1 12785 MULSS X0, X1 12786 ADDSS (DX)(R8*4), X1 12787 MOVSS X1, (DX)(R8*4) 12788 ADDQ CX, DI 12789 ADDQ BX, R8 12790 SUBQ $0x08, SI 12791 12792 check_limit_unroll: 12793 CMPQ SI, $0x08 12794 JHI loop_unroll 12795 JMP check_limit 12796 12797 loop: 12798 MOVSS (AX)(DI*4), X1 12799 MULSS X0, X1 12800 ADDSS (DX)(R8*4), X1 12801 MOVSS X1, (DX)(R8*4) 12802 DECQ SI 12803 ADDQ CX, DI 12804 ADDQ BX, R8 12805 12806 check_limit: 12807 CMPQ SI, $0x00 12808 JHI loop 12809 RET 12810 12811 // func AmdAxpyUnsafeX_V4A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12812 // Requires: SSE 12813 TEXT ·AmdAxpyUnsafeX_V4A10R8(SB), NOSPLIT, $0-48 12814 MOVSS alpha+0(FP), X0 12815 MOVQ xs+8(FP), AX 12816 MOVQ incx+16(FP), CX 12817 MOVQ ys+24(FP), DX 12818 MOVQ incy+32(FP), BX 12819 MOVQ n+40(FP), SI 12820 XORQ DI, DI 12821 XORQ R8, R8 12822 JMP check_limit_unroll 12823 PCALIGN $0x08 12824 NOP 12825 NOP 12826 12827 loop_unroll: 12828 MOVSS (AX)(DI*4), X1 12829 MULSS X0, X1 12830 ADDSS (DX)(R8*4), X1 12831 MOVSS X1, (DX)(R8*4) 12832 ADDQ CX, DI 12833 ADDQ BX, R8 12834 MOVSS (AX)(DI*4), X1 12835 MULSS X0, X1 12836 ADDSS (DX)(R8*4), X1 12837 MOVSS X1, (DX)(R8*4) 12838 ADDQ CX, DI 12839 ADDQ BX, R8 12840 MOVSS (AX)(DI*4), X1 12841 MULSS X0, X1 12842 ADDSS (DX)(R8*4), X1 12843 MOVSS X1, (DX)(R8*4) 12844 ADDQ CX, DI 12845 ADDQ BX, R8 12846 MOVSS (AX)(DI*4), X1 12847 MULSS X0, X1 12848 ADDSS (DX)(R8*4), X1 12849 MOVSS X1, (DX)(R8*4) 12850 ADDQ CX, DI 12851 ADDQ BX, R8 12852 MOVSS (AX)(DI*4), X1 12853 MULSS X0, X1 12854 ADDSS (DX)(R8*4), X1 12855 MOVSS X1, (DX)(R8*4) 12856 ADDQ CX, DI 12857 ADDQ BX, R8 12858 MOVSS (AX)(DI*4), X1 12859 MULSS X0, X1 12860 ADDSS (DX)(R8*4), X1 12861 MOVSS X1, (DX)(R8*4) 12862 ADDQ CX, DI 12863 ADDQ BX, R8 12864 MOVSS (AX)(DI*4), X1 12865 MULSS X0, X1 12866 ADDSS (DX)(R8*4), X1 12867 MOVSS X1, (DX)(R8*4) 12868 ADDQ CX, DI 12869 ADDQ BX, R8 12870 MOVSS (AX)(DI*4), X1 12871 MULSS X0, X1 12872 ADDSS (DX)(R8*4), X1 12873 MOVSS X1, (DX)(R8*4) 12874 ADDQ CX, DI 12875 ADDQ BX, R8 12876 SUBQ $0x08, SI 12877 12878 
check_limit_unroll: 12879 CMPQ SI, $0x08 12880 JHI loop_unroll 12881 JMP check_limit 12882 12883 loop: 12884 MOVSS (AX)(DI*4), X1 12885 MULSS X0, X1 12886 ADDSS (DX)(R8*4), X1 12887 MOVSS X1, (DX)(R8*4) 12888 DECQ SI 12889 ADDQ CX, DI 12890 ADDQ BX, R8 12891 12892 check_limit: 12893 CMPQ SI, $0x00 12894 JHI loop 12895 RET 12896 12897 // func AmdAxpyUnsafeX_V5A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12898 // Requires: SSE 12899 TEXT ·AmdAxpyUnsafeX_V5A10R8(SB), NOSPLIT, $0-48 12900 MOVSS alpha+0(FP), X0 12901 MOVQ xs+8(FP), AX 12902 MOVQ incx+16(FP), CX 12903 MOVQ ys+24(FP), DX 12904 MOVQ incy+32(FP), BX 12905 MOVQ n+40(FP), SI 12906 XORQ DI, DI 12907 XORQ R8, R8 12908 JMP check_limit_unroll 12909 PCALIGN $0x08 12910 NOP 12911 NOP 12912 12913 loop_unroll: 12914 MOVSS (AX)(DI*4), X1 12915 MULSS X0, X1 12916 ADDSS (DX)(R8*4), X1 12917 MOVSS X1, (DX)(R8*4) 12918 ADDQ CX, DI 12919 ADDQ BX, R8 12920 MOVSS (AX)(DI*4), X1 12921 MULSS X0, X1 12922 ADDSS (DX)(R8*4), X1 12923 MOVSS X1, (DX)(R8*4) 12924 ADDQ CX, DI 12925 ADDQ BX, R8 12926 MOVSS (AX)(DI*4), X1 12927 MULSS X0, X1 12928 ADDSS (DX)(R8*4), X1 12929 MOVSS X1, (DX)(R8*4) 12930 ADDQ CX, DI 12931 ADDQ BX, R8 12932 MOVSS (AX)(DI*4), X1 12933 MULSS X0, X1 12934 ADDSS (DX)(R8*4), X1 12935 MOVSS X1, (DX)(R8*4) 12936 ADDQ CX, DI 12937 ADDQ BX, R8 12938 MOVSS (AX)(DI*4), X1 12939 MULSS X0, X1 12940 ADDSS (DX)(R8*4), X1 12941 MOVSS X1, (DX)(R8*4) 12942 ADDQ CX, DI 12943 ADDQ BX, R8 12944 MOVSS (AX)(DI*4), X1 12945 MULSS X0, X1 12946 ADDSS (DX)(R8*4), X1 12947 MOVSS X1, (DX)(R8*4) 12948 ADDQ CX, DI 12949 ADDQ BX, R8 12950 MOVSS (AX)(DI*4), X1 12951 MULSS X0, X1 12952 ADDSS (DX)(R8*4), X1 12953 MOVSS X1, (DX)(R8*4) 12954 ADDQ CX, DI 12955 ADDQ BX, R8 12956 MOVSS (AX)(DI*4), X1 12957 MULSS X0, X1 12958 ADDSS (DX)(R8*4), X1 12959 MOVSS X1, (DX)(R8*4) 12960 ADDQ CX, DI 12961 ADDQ BX, R8 12962 SUBQ $0x08, SI 12963 12964 check_limit_unroll: 12965 CMPQ SI, $0x08 12966 JHI loop_unroll 12967 JMP check_limit 12968 12969 loop: 12970 MOVSS (AX)(DI*4), X1 12971 MULSS X0, X1 12972 ADDSS (DX)(R8*4), X1 12973 MOVSS X1, (DX)(R8*4) 12974 DECQ SI 12975 ADDQ CX, DI 12976 ADDQ BX, R8 12977 12978 check_limit: 12979 CMPQ SI, $0x00 12980 JHI loop 12981 RET 12982 12983 // func AmdAxpyUnsafeX_V0A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 12984 // Requires: SSE 12985 TEXT ·AmdAxpyUnsafeX_V0A11R8(SB), NOSPLIT, $0-48 12986 MOVSS alpha+0(FP), X0 12987 MOVQ xs+8(FP), AX 12988 MOVQ incx+16(FP), CX 12989 MOVQ ys+24(FP), DX 12990 MOVQ incy+32(FP), BX 12991 MOVQ n+40(FP), SI 12992 XORQ DI, DI 12993 XORQ R8, R8 12994 JMP check_limit_unroll 12995 PCALIGN $0x08 12996 NOP 12997 NOP 12998 NOP 12999 13000 loop_unroll: 13001 MOVSS (AX)(DI*4), X1 13002 MULSS X0, X1 13003 ADDSS (DX)(R8*4), X1 13004 MOVSS X1, (DX)(R8*4) 13005 ADDQ CX, DI 13006 ADDQ BX, R8 13007 MOVSS (AX)(DI*4), X1 13008 MULSS X0, X1 13009 ADDSS (DX)(R8*4), X1 13010 MOVSS X1, (DX)(R8*4) 13011 ADDQ CX, DI 13012 ADDQ BX, R8 13013 MOVSS (AX)(DI*4), X1 13014 MULSS X0, X1 13015 ADDSS (DX)(R8*4), X1 13016 MOVSS X1, (DX)(R8*4) 13017 ADDQ CX, DI 13018 ADDQ BX, R8 13019 MOVSS (AX)(DI*4), X1 13020 MULSS X0, X1 13021 ADDSS (DX)(R8*4), X1 13022 MOVSS X1, (DX)(R8*4) 13023 ADDQ CX, DI 13024 ADDQ BX, R8 13025 MOVSS (AX)(DI*4), X1 13026 MULSS X0, X1 13027 ADDSS (DX)(R8*4), X1 13028 MOVSS X1, (DX)(R8*4) 13029 ADDQ CX, DI 13030 ADDQ BX, R8 13031 MOVSS (AX)(DI*4), X1 13032 MULSS X0, X1 13033 ADDSS (DX)(R8*4), X1 13034 MOVSS X1, (DX)(R8*4) 13035 ADDQ CX, DI 13036 
ADDQ BX, R8 13037 MOVSS (AX)(DI*4), X1 13038 MULSS X0, X1 13039 ADDSS (DX)(R8*4), X1 13040 MOVSS X1, (DX)(R8*4) 13041 ADDQ CX, DI 13042 ADDQ BX, R8 13043 MOVSS (AX)(DI*4), X1 13044 MULSS X0, X1 13045 ADDSS (DX)(R8*4), X1 13046 MOVSS X1, (DX)(R8*4) 13047 ADDQ CX, DI 13048 ADDQ BX, R8 13049 SUBQ $0x08, SI 13050 13051 check_limit_unroll: 13052 CMPQ SI, $0x08 13053 JHI loop_unroll 13054 JMP check_limit 13055 13056 loop: 13057 MOVSS (AX)(DI*4), X1 13058 MULSS X0, X1 13059 ADDSS (DX)(R8*4), X1 13060 MOVSS X1, (DX)(R8*4) 13061 DECQ SI 13062 ADDQ CX, DI 13063 ADDQ BX, R8 13064 13065 check_limit: 13066 CMPQ SI, $0x00 13067 JHI loop 13068 RET 13069 13070 // func AmdAxpyUnsafeX_V1A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 13071 // Requires: SSE 13072 TEXT ·AmdAxpyUnsafeX_V1A11R8(SB), NOSPLIT, $0-48 13073 MOVSS alpha+0(FP), X0 13074 MOVQ xs+8(FP), AX 13075 MOVQ incx+16(FP), CX 13076 MOVQ ys+24(FP), DX 13077 MOVQ incy+32(FP), BX 13078 MOVQ n+40(FP), SI 13079 XORQ DI, DI 13080 XORQ R8, R8 13081 JMP check_limit_unroll 13082 PCALIGN $0x08 13083 NOP 13084 NOP 13085 NOP 13086 13087 loop_unroll: 13088 MOVSS (AX)(DI*4), X1 13089 MULSS X0, X1 13090 ADDSS (DX)(R8*4), X1 13091 MOVSS X1, (DX)(R8*4) 13092 ADDQ CX, DI 13093 ADDQ BX, R8 13094 MOVSS (AX)(DI*4), X1 13095 MULSS X0, X1 13096 ADDSS (DX)(R8*4), X1 13097 MOVSS X1, (DX)(R8*4) 13098 ADDQ CX, DI 13099 ADDQ BX, R8 13100 MOVSS (AX)(DI*4), X1 13101 MULSS X0, X1 13102 ADDSS (DX)(R8*4), X1 13103 MOVSS X1, (DX)(R8*4) 13104 ADDQ CX, DI 13105 ADDQ BX, R8 13106 MOVSS (AX)(DI*4), X1 13107 MULSS X0, X1 13108 ADDSS (DX)(R8*4), X1 13109 MOVSS X1, (DX)(R8*4) 13110 ADDQ CX, DI 13111 ADDQ BX, R8 13112 MOVSS (AX)(DI*4), X1 13113 MULSS X0, X1 13114 ADDSS (DX)(R8*4), X1 13115 MOVSS X1, (DX)(R8*4) 13116 ADDQ CX, DI 13117 ADDQ BX, R8 13118 MOVSS (AX)(DI*4), X1 13119 MULSS X0, X1 13120 ADDSS (DX)(R8*4), X1 13121 MOVSS X1, (DX)(R8*4) 13122 ADDQ CX, DI 13123 ADDQ BX, R8 13124 MOVSS (AX)(DI*4), X1 13125 MULSS X0, X1 13126 ADDSS (DX)(R8*4), X1 13127 MOVSS X1, (DX)(R8*4) 13128 ADDQ CX, DI 13129 ADDQ BX, R8 13130 MOVSS (AX)(DI*4), X1 13131 MULSS X0, X1 13132 ADDSS (DX)(R8*4), X1 13133 MOVSS X1, (DX)(R8*4) 13134 ADDQ CX, DI 13135 ADDQ BX, R8 13136 SUBQ $0x08, SI 13137 13138 check_limit_unroll: 13139 CMPQ SI, $0x08 13140 JHI loop_unroll 13141 JMP check_limit 13142 13143 loop: 13144 MOVSS (AX)(DI*4), X1 13145 MULSS X0, X1 13146 ADDSS (DX)(R8*4), X1 13147 MOVSS X1, (DX)(R8*4) 13148 DECQ SI 13149 ADDQ CX, DI 13150 ADDQ BX, R8 13151 13152 check_limit: 13153 CMPQ SI, $0x00 13154 JHI loop 13155 RET 13156 13157 // func AmdAxpyUnsafeX_V2A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 13158 // Requires: SSE 13159 TEXT ·AmdAxpyUnsafeX_V2A11R8(SB), NOSPLIT, $0-48 13160 MOVSS alpha+0(FP), X0 13161 MOVQ xs+8(FP), AX 13162 MOVQ incx+16(FP), CX 13163 MOVQ ys+24(FP), DX 13164 MOVQ incy+32(FP), BX 13165 MOVQ n+40(FP), SI 13166 XORQ DI, DI 13167 XORQ R8, R8 13168 JMP check_limit_unroll 13169 PCALIGN $0x08 13170 NOP 13171 NOP 13172 NOP 13173 13174 loop_unroll: 13175 MOVSS (AX)(DI*4), X1 13176 MULSS X0, X1 13177 ADDSS (DX)(R8*4), X1 13178 MOVSS X1, (DX)(R8*4) 13179 ADDQ CX, DI 13180 ADDQ BX, R8 13181 MOVSS (AX)(DI*4), X1 13182 MULSS X0, X1 13183 ADDSS (DX)(R8*4), X1 13184 MOVSS X1, (DX)(R8*4) 13185 ADDQ CX, DI 13186 ADDQ BX, R8 13187 MOVSS (AX)(DI*4), X1 13188 MULSS X0, X1 13189 ADDSS (DX)(R8*4), X1 13190 MOVSS X1, (DX)(R8*4) 13191 ADDQ CX, DI 13192 ADDQ BX, R8 13193 MOVSS (AX)(DI*4), X1 13194 MULSS X0, X1 13195 ADDSS 
(DX)(R8*4), X1 13196 MOVSS X1, (DX)(R8*4) 13197 ADDQ CX, DI 13198 ADDQ BX, R8 13199 MOVSS (AX)(DI*4), X1 13200 MULSS X0, X1 13201 ADDSS (DX)(R8*4), X1 13202 MOVSS X1, (DX)(R8*4) 13203 ADDQ CX, DI 13204 ADDQ BX, R8 13205 MOVSS (AX)(DI*4), X1 13206 MULSS X0, X1 13207 ADDSS (DX)(R8*4), X1 13208 MOVSS X1, (DX)(R8*4) 13209 ADDQ CX, DI 13210 ADDQ BX, R8 13211 MOVSS (AX)(DI*4), X1 13212 MULSS X0, X1 13213 ADDSS (DX)(R8*4), X1 13214 MOVSS X1, (DX)(R8*4) 13215 ADDQ CX, DI 13216 ADDQ BX, R8 13217 MOVSS (AX)(DI*4), X1 13218 MULSS X0, X1 13219 ADDSS (DX)(R8*4), X1 13220 MOVSS X1, (DX)(R8*4) 13221 ADDQ CX, DI 13222 ADDQ BX, R8 13223 SUBQ $0x08, SI 13224 13225 check_limit_unroll: 13226 CMPQ SI, $0x08 13227 JHI loop_unroll 13228 JMP check_limit 13229 13230 loop: 13231 MOVSS (AX)(DI*4), X1 13232 MULSS X0, X1 13233 ADDSS (DX)(R8*4), X1 13234 MOVSS X1, (DX)(R8*4) 13235 DECQ SI 13236 ADDQ CX, DI 13237 ADDQ BX, R8 13238 13239 check_limit: 13240 CMPQ SI, $0x00 13241 JHI loop 13242 RET 13243 13244 // func AmdAxpyUnsafeX_V3A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 13245 // Requires: SSE 13246 TEXT ·AmdAxpyUnsafeX_V3A11R8(SB), NOSPLIT, $0-48 13247 MOVSS alpha+0(FP), X0 13248 MOVQ xs+8(FP), AX 13249 MOVQ incx+16(FP), CX 13250 MOVQ ys+24(FP), DX 13251 MOVQ incy+32(FP), BX 13252 MOVQ n+40(FP), SI 13253 XORQ DI, DI 13254 XORQ R8, R8 13255 JMP check_limit_unroll 13256 PCALIGN $0x08 13257 NOP 13258 NOP 13259 NOP 13260 13261 loop_unroll: 13262 MOVSS (AX)(DI*4), X1 13263 MULSS X0, X1 13264 ADDSS (DX)(R8*4), X1 13265 MOVSS X1, (DX)(R8*4) 13266 ADDQ CX, DI 13267 ADDQ BX, R8 13268 MOVSS (AX)(DI*4), X1 13269 MULSS X0, X1 13270 ADDSS (DX)(R8*4), X1 13271 MOVSS X1, (DX)(R8*4) 13272 ADDQ CX, DI 13273 ADDQ BX, R8 13274 MOVSS (AX)(DI*4), X1 13275 MULSS X0, X1 13276 ADDSS (DX)(R8*4), X1 13277 MOVSS X1, (DX)(R8*4) 13278 ADDQ CX, DI 13279 ADDQ BX, R8 13280 MOVSS (AX)(DI*4), X1 13281 MULSS X0, X1 13282 ADDSS (DX)(R8*4), X1 13283 MOVSS X1, (DX)(R8*4) 13284 ADDQ CX, DI 13285 ADDQ BX, R8 13286 MOVSS (AX)(DI*4), X1 13287 MULSS X0, X1 13288 ADDSS (DX)(R8*4), X1 13289 MOVSS X1, (DX)(R8*4) 13290 ADDQ CX, DI 13291 ADDQ BX, R8 13292 MOVSS (AX)(DI*4), X1 13293 MULSS X0, X1 13294 ADDSS (DX)(R8*4), X1 13295 MOVSS X1, (DX)(R8*4) 13296 ADDQ CX, DI 13297 ADDQ BX, R8 13298 MOVSS (AX)(DI*4), X1 13299 MULSS X0, X1 13300 ADDSS (DX)(R8*4), X1 13301 MOVSS X1, (DX)(R8*4) 13302 ADDQ CX, DI 13303 ADDQ BX, R8 13304 MOVSS (AX)(DI*4), X1 13305 MULSS X0, X1 13306 ADDSS (DX)(R8*4), X1 13307 MOVSS X1, (DX)(R8*4) 13308 ADDQ CX, DI 13309 ADDQ BX, R8 13310 SUBQ $0x08, SI 13311 13312 check_limit_unroll: 13313 CMPQ SI, $0x08 13314 JHI loop_unroll 13315 JMP check_limit 13316 13317 loop: 13318 MOVSS (AX)(DI*4), X1 13319 MULSS X0, X1 13320 ADDSS (DX)(R8*4), X1 13321 MOVSS X1, (DX)(R8*4) 13322 DECQ SI 13323 ADDQ CX, DI 13324 ADDQ BX, R8 13325 13326 check_limit: 13327 CMPQ SI, $0x00 13328 JHI loop 13329 RET 13330 13331 // func AmdAxpyUnsafeX_V4A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 13332 // Requires: SSE 13333 TEXT ·AmdAxpyUnsafeX_V4A11R8(SB), NOSPLIT, $0-48 13334 MOVSS alpha+0(FP), X0 13335 MOVQ xs+8(FP), AX 13336 MOVQ incx+16(FP), CX 13337 MOVQ ys+24(FP), DX 13338 MOVQ incy+32(FP), BX 13339 MOVQ n+40(FP), SI 13340 XORQ DI, DI 13341 XORQ R8, R8 13342 JMP check_limit_unroll 13343 PCALIGN $0x08 13344 NOP 13345 NOP 13346 NOP 13347 13348 loop_unroll: 13349 MOVSS (AX)(DI*4), X1 13350 MULSS X0, X1 13351 ADDSS (DX)(R8*4), X1 13352 MOVSS X1, (DX)(R8*4) 13353 ADDQ CX, DI 13354 ADDQ BX, R8 
13355 MOVSS (AX)(DI*4), X1 13356 MULSS X0, X1 13357 ADDSS (DX)(R8*4), X1 13358 MOVSS X1, (DX)(R8*4) 13359 ADDQ CX, DI 13360 ADDQ BX, R8 13361 MOVSS (AX)(DI*4), X1 13362 MULSS X0, X1 13363 ADDSS (DX)(R8*4), X1 13364 MOVSS X1, (DX)(R8*4) 13365 ADDQ CX, DI 13366 ADDQ BX, R8 13367 MOVSS (AX)(DI*4), X1 13368 MULSS X0, X1 13369 ADDSS (DX)(R8*4), X1 13370 MOVSS X1, (DX)(R8*4) 13371 ADDQ CX, DI 13372 ADDQ BX, R8 13373 MOVSS (AX)(DI*4), X1 13374 MULSS X0, X1 13375 ADDSS (DX)(R8*4), X1 13376 MOVSS X1, (DX)(R8*4) 13377 ADDQ CX, DI 13378 ADDQ BX, R8 13379 MOVSS (AX)(DI*4), X1 13380 MULSS X0, X1 13381 ADDSS (DX)(R8*4), X1 13382 MOVSS X1, (DX)(R8*4) 13383 ADDQ CX, DI 13384 ADDQ BX, R8 13385 MOVSS (AX)(DI*4), X1 13386 MULSS X0, X1 13387 ADDSS (DX)(R8*4), X1 13388 MOVSS X1, (DX)(R8*4) 13389 ADDQ CX, DI 13390 ADDQ BX, R8 13391 MOVSS (AX)(DI*4), X1 13392 MULSS X0, X1 13393 ADDSS (DX)(R8*4), X1 13394 MOVSS X1, (DX)(R8*4) 13395 ADDQ CX, DI 13396 ADDQ BX, R8 13397 SUBQ $0x08, SI 13398 13399 check_limit_unroll: 13400 CMPQ SI, $0x08 13401 JHI loop_unroll 13402 JMP check_limit 13403 13404 loop: 13405 MOVSS (AX)(DI*4), X1 13406 MULSS X0, X1 13407 ADDSS (DX)(R8*4), X1 13408 MOVSS X1, (DX)(R8*4) 13409 DECQ SI 13410 ADDQ CX, DI 13411 ADDQ BX, R8 13412 13413 check_limit: 13414 CMPQ SI, $0x00 13415 JHI loop 13416 RET 13417 13418 // func AmdAxpyUnsafeX_V5A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 13419 // Requires: SSE 13420 TEXT ·AmdAxpyUnsafeX_V5A11R8(SB), NOSPLIT, $0-48 13421 MOVSS alpha+0(FP), X0 13422 MOVQ xs+8(FP), AX 13423 MOVQ incx+16(FP), CX 13424 MOVQ ys+24(FP), DX 13425 MOVQ incy+32(FP), BX 13426 MOVQ n+40(FP), SI 13427 XORQ DI, DI 13428 XORQ R8, R8 13429 JMP check_limit_unroll 13430 PCALIGN $0x08 13431 NOP 13432 NOP 13433 NOP 13434 13435 loop_unroll: 13436 MOVSS (AX)(DI*4), X1 13437 MULSS X0, X1 13438 ADDSS (DX)(R8*4), X1 13439 MOVSS X1, (DX)(R8*4) 13440 ADDQ CX, DI 13441 ADDQ BX, R8 13442 MOVSS (AX)(DI*4), X1 13443 MULSS X0, X1 13444 ADDSS (DX)(R8*4), X1 13445 MOVSS X1, (DX)(R8*4) 13446 ADDQ CX, DI 13447 ADDQ BX, R8 13448 MOVSS (AX)(DI*4), X1 13449 MULSS X0, X1 13450 ADDSS (DX)(R8*4), X1 13451 MOVSS X1, (DX)(R8*4) 13452 ADDQ CX, DI 13453 ADDQ BX, R8 13454 MOVSS (AX)(DI*4), X1 13455 MULSS X0, X1 13456 ADDSS (DX)(R8*4), X1 13457 MOVSS X1, (DX)(R8*4) 13458 ADDQ CX, DI 13459 ADDQ BX, R8 13460 MOVSS (AX)(DI*4), X1 13461 MULSS X0, X1 13462 ADDSS (DX)(R8*4), X1 13463 MOVSS X1, (DX)(R8*4) 13464 ADDQ CX, DI 13465 ADDQ BX, R8 13466 MOVSS (AX)(DI*4), X1 13467 MULSS X0, X1 13468 ADDSS (DX)(R8*4), X1 13469 MOVSS X1, (DX)(R8*4) 13470 ADDQ CX, DI 13471 ADDQ BX, R8 13472 MOVSS (AX)(DI*4), X1 13473 MULSS X0, X1 13474 ADDSS (DX)(R8*4), X1 13475 MOVSS X1, (DX)(R8*4) 13476 ADDQ CX, DI 13477 ADDQ BX, R8 13478 MOVSS (AX)(DI*4), X1 13479 MULSS X0, X1 13480 ADDSS (DX)(R8*4), X1 13481 MOVSS X1, (DX)(R8*4) 13482 ADDQ CX, DI 13483 ADDQ BX, R8 13484 SUBQ $0x08, SI 13485 13486 check_limit_unroll: 13487 CMPQ SI, $0x08 13488 JHI loop_unroll 13489 JMP check_limit 13490 13491 loop: 13492 MOVSS (AX)(DI*4), X1 13493 MULSS X0, X1 13494 ADDSS (DX)(R8*4), X1 13495 MOVSS X1, (DX)(R8*4) 13496 DECQ SI 13497 ADDQ CX, DI 13498 ADDQ BX, R8 13499 13500 check_limit: 13501 CMPQ SI, $0x00 13502 JHI loop 13503 RET 13504 13505 // func AmdAxpyUnsafeX_V0A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 13506 // Requires: SSE 13507 TEXT ·AmdAxpyUnsafeX_V0A12R8(SB), NOSPLIT, $0-48 13508 MOVSS alpha+0(FP), X0 13509 MOVQ xs+8(FP), AX 13510 MOVQ incx+16(FP), CX 13511 MOVQ ys+24(FP), DX 
13512 MOVQ incy+32(FP), BX 13513 MOVQ n+40(FP), SI 13514 XORQ DI, DI 13515 XORQ R8, R8 13516 JMP check_limit_unroll 13517 PCALIGN $0x08 13518 NOP 13519 NOP 13520 NOP 13521 NOP 13522 13523 loop_unroll: 13524 MOVSS (AX)(DI*4), X1 13525 MULSS X0, X1 13526 ADDSS (DX)(R8*4), X1 13527 MOVSS X1, (DX)(R8*4) 13528 ADDQ CX, DI 13529 ADDQ BX, R8 13530 MOVSS (AX)(DI*4), X1 13531 MULSS X0, X1 13532 ADDSS (DX)(R8*4), X1 13533 MOVSS X1, (DX)(R8*4) 13534 ADDQ CX, DI 13535 ADDQ BX, R8 13536 MOVSS (AX)(DI*4), X1 13537 MULSS X0, X1 13538 ADDSS (DX)(R8*4), X1 13539 MOVSS X1, (DX)(R8*4) 13540 ADDQ CX, DI 13541 ADDQ BX, R8 13542 MOVSS (AX)(DI*4), X1 13543 MULSS X0, X1 13544 ADDSS (DX)(R8*4), X1 13545 MOVSS X1, (DX)(R8*4) 13546 ADDQ CX, DI 13547 ADDQ BX, R8 13548 MOVSS (AX)(DI*4), X1 13549 MULSS X0, X1 13550 ADDSS (DX)(R8*4), X1 13551 MOVSS X1, (DX)(R8*4) 13552 ADDQ CX, DI 13553 ADDQ BX, R8 13554 MOVSS (AX)(DI*4), X1 13555 MULSS X0, X1 13556 ADDSS (DX)(R8*4), X1 13557 MOVSS X1, (DX)(R8*4) 13558 ADDQ CX, DI 13559 ADDQ BX, R8 13560 MOVSS (AX)(DI*4), X1 13561 MULSS X0, X1 13562 ADDSS (DX)(R8*4), X1 13563 MOVSS X1, (DX)(R8*4) 13564 ADDQ CX, DI 13565 ADDQ BX, R8 13566 MOVSS (AX)(DI*4), X1 13567 MULSS X0, X1 13568 ADDSS (DX)(R8*4), X1 13569 MOVSS X1, (DX)(R8*4) 13570 ADDQ CX, DI 13571 ADDQ BX, R8 13572 SUBQ $0x08, SI 13573 13574 check_limit_unroll: 13575 CMPQ SI, $0x08 13576 JHI loop_unroll 13577 JMP check_limit 13578 13579 loop: 13580 MOVSS (AX)(DI*4), X1 13581 MULSS X0, X1 13582 ADDSS (DX)(R8*4), X1 13583 MOVSS X1, (DX)(R8*4) 13584 DECQ SI 13585 ADDQ CX, DI 13586 ADDQ BX, R8 13587 13588 check_limit: 13589 CMPQ SI, $0x00 13590 JHI loop 13591 RET 13592 13593 // func AmdAxpyUnsafeX_V1A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 13594 // Requires: SSE 13595 TEXT ·AmdAxpyUnsafeX_V1A12R8(SB), NOSPLIT, $0-48 13596 MOVSS alpha+0(FP), X0 13597 MOVQ xs+8(FP), AX 13598 MOVQ incx+16(FP), CX 13599 MOVQ ys+24(FP), DX 13600 MOVQ incy+32(FP), BX 13601 MOVQ n+40(FP), SI 13602 XORQ DI, DI 13603 XORQ R8, R8 13604 JMP check_limit_unroll 13605 PCALIGN $0x08 13606 NOP 13607 NOP 13608 NOP 13609 NOP 13610 13611 loop_unroll: 13612 MOVSS (AX)(DI*4), X1 13613 MULSS X0, X1 13614 ADDSS (DX)(R8*4), X1 13615 MOVSS X1, (DX)(R8*4) 13616 ADDQ CX, DI 13617 ADDQ BX, R8 13618 MOVSS (AX)(DI*4), X1 13619 MULSS X0, X1 13620 ADDSS (DX)(R8*4), X1 13621 MOVSS X1, (DX)(R8*4) 13622 ADDQ CX, DI 13623 ADDQ BX, R8 13624 MOVSS (AX)(DI*4), X1 13625 MULSS X0, X1 13626 ADDSS (DX)(R8*4), X1 13627 MOVSS X1, (DX)(R8*4) 13628 ADDQ CX, DI 13629 ADDQ BX, R8 13630 MOVSS (AX)(DI*4), X1 13631 MULSS X0, X1 13632 ADDSS (DX)(R8*4), X1 13633 MOVSS X1, (DX)(R8*4) 13634 ADDQ CX, DI 13635 ADDQ BX, R8 13636 MOVSS (AX)(DI*4), X1 13637 MULSS X0, X1 13638 ADDSS (DX)(R8*4), X1 13639 MOVSS X1, (DX)(R8*4) 13640 ADDQ CX, DI 13641 ADDQ BX, R8 13642 MOVSS (AX)(DI*4), X1 13643 MULSS X0, X1 13644 ADDSS (DX)(R8*4), X1 13645 MOVSS X1, (DX)(R8*4) 13646 ADDQ CX, DI 13647 ADDQ BX, R8 13648 MOVSS (AX)(DI*4), X1 13649 MULSS X0, X1 13650 ADDSS (DX)(R8*4), X1 13651 MOVSS X1, (DX)(R8*4) 13652 ADDQ CX, DI 13653 ADDQ BX, R8 13654 MOVSS (AX)(DI*4), X1 13655 MULSS X0, X1 13656 ADDSS (DX)(R8*4), X1 13657 MOVSS X1, (DX)(R8*4) 13658 ADDQ CX, DI 13659 ADDQ BX, R8 13660 SUBQ $0x08, SI 13661 13662 check_limit_unroll: 13663 CMPQ SI, $0x08 13664 JHI loop_unroll 13665 JMP check_limit 13666 13667 loop: 13668 MOVSS (AX)(DI*4), X1 13669 MULSS X0, X1 13670 ADDSS (DX)(R8*4), X1 13671 MOVSS X1, (DX)(R8*4) 13672 DECQ SI 13673 ADDQ CX, DI 13674 ADDQ BX, R8 13675 13676 check_limit: 
13677 CMPQ SI, $0x00 13678 JHI loop 13679 RET 13680 13681 // func AmdAxpyUnsafeX_V2A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 13682 // Requires: SSE 13683 TEXT ·AmdAxpyUnsafeX_V2A12R8(SB), NOSPLIT, $0-48 13684 MOVSS alpha+0(FP), X0 13685 MOVQ xs+8(FP), AX 13686 MOVQ incx+16(FP), CX 13687 MOVQ ys+24(FP), DX 13688 MOVQ incy+32(FP), BX 13689 MOVQ n+40(FP), SI 13690 XORQ DI, DI 13691 XORQ R8, R8 13692 JMP check_limit_unroll 13693 PCALIGN $0x08 13694 NOP 13695 NOP 13696 NOP 13697 NOP 13698 13699 loop_unroll: 13700 MOVSS (AX)(DI*4), X1 13701 MULSS X0, X1 13702 ADDSS (DX)(R8*4), X1 13703 MOVSS X1, (DX)(R8*4) 13704 ADDQ CX, DI 13705 ADDQ BX, R8 13706 MOVSS (AX)(DI*4), X1 13707 MULSS X0, X1 13708 ADDSS (DX)(R8*4), X1 13709 MOVSS X1, (DX)(R8*4) 13710 ADDQ CX, DI 13711 ADDQ BX, R8 13712 MOVSS (AX)(DI*4), X1 13713 MULSS X0, X1 13714 ADDSS (DX)(R8*4), X1 13715 MOVSS X1, (DX)(R8*4) 13716 ADDQ CX, DI 13717 ADDQ BX, R8 13718 MOVSS (AX)(DI*4), X1 13719 MULSS X0, X1 13720 ADDSS (DX)(R8*4), X1 13721 MOVSS X1, (DX)(R8*4) 13722 ADDQ CX, DI 13723 ADDQ BX, R8 13724 MOVSS (AX)(DI*4), X1 13725 MULSS X0, X1 13726 ADDSS (DX)(R8*4), X1 13727 MOVSS X1, (DX)(R8*4) 13728 ADDQ CX, DI 13729 ADDQ BX, R8 13730 MOVSS (AX)(DI*4), X1 13731 MULSS X0, X1 13732 ADDSS (DX)(R8*4), X1 13733 MOVSS X1, (DX)(R8*4) 13734 ADDQ CX, DI 13735 ADDQ BX, R8 13736 MOVSS (AX)(DI*4), X1 13737 MULSS X0, X1 13738 ADDSS (DX)(R8*4), X1 13739 MOVSS X1, (DX)(R8*4) 13740 ADDQ CX, DI 13741 ADDQ BX, R8 13742 MOVSS (AX)(DI*4), X1 13743 MULSS X0, X1 13744 ADDSS (DX)(R8*4), X1 13745 MOVSS X1, (DX)(R8*4) 13746 ADDQ CX, DI 13747 ADDQ BX, R8 13748 SUBQ $0x08, SI 13749 13750 check_limit_unroll: 13751 CMPQ SI, $0x08 13752 JHI loop_unroll 13753 JMP check_limit 13754 13755 loop: 13756 MOVSS (AX)(DI*4), X1 13757 MULSS X0, X1 13758 ADDSS (DX)(R8*4), X1 13759 MOVSS X1, (DX)(R8*4) 13760 DECQ SI 13761 ADDQ CX, DI 13762 ADDQ BX, R8 13763 13764 check_limit: 13765 CMPQ SI, $0x00 13766 JHI loop 13767 RET 13768 13769 // func AmdAxpyUnsafeX_V3A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 13770 // Requires: SSE 13771 TEXT ·AmdAxpyUnsafeX_V3A12R8(SB), NOSPLIT, $0-48 13772 MOVSS alpha+0(FP), X0 13773 MOVQ xs+8(FP), AX 13774 MOVQ incx+16(FP), CX 13775 MOVQ ys+24(FP), DX 13776 MOVQ incy+32(FP), BX 13777 MOVQ n+40(FP), SI 13778 XORQ DI, DI 13779 XORQ R8, R8 13780 JMP check_limit_unroll 13781 PCALIGN $0x08 13782 NOP 13783 NOP 13784 NOP 13785 NOP 13786 13787 loop_unroll: 13788 MOVSS (AX)(DI*4), X1 13789 MULSS X0, X1 13790 ADDSS (DX)(R8*4), X1 13791 MOVSS X1, (DX)(R8*4) 13792 ADDQ CX, DI 13793 ADDQ BX, R8 13794 MOVSS (AX)(DI*4), X1 13795 MULSS X0, X1 13796 ADDSS (DX)(R8*4), X1 13797 MOVSS X1, (DX)(R8*4) 13798 ADDQ CX, DI 13799 ADDQ BX, R8 13800 MOVSS (AX)(DI*4), X1 13801 MULSS X0, X1 13802 ADDSS (DX)(R8*4), X1 13803 MOVSS X1, (DX)(R8*4) 13804 ADDQ CX, DI 13805 ADDQ BX, R8 13806 MOVSS (AX)(DI*4), X1 13807 MULSS X0, X1 13808 ADDSS (DX)(R8*4), X1 13809 MOVSS X1, (DX)(R8*4) 13810 ADDQ CX, DI 13811 ADDQ BX, R8 13812 MOVSS (AX)(DI*4), X1 13813 MULSS X0, X1 13814 ADDSS (DX)(R8*4), X1 13815 MOVSS X1, (DX)(R8*4) 13816 ADDQ CX, DI 13817 ADDQ BX, R8 13818 MOVSS (AX)(DI*4), X1 13819 MULSS X0, X1 13820 ADDSS (DX)(R8*4), X1 13821 MOVSS X1, (DX)(R8*4) 13822 ADDQ CX, DI 13823 ADDQ BX, R8 13824 MOVSS (AX)(DI*4), X1 13825 MULSS X0, X1 13826 ADDSS (DX)(R8*4), X1 13827 MOVSS X1, (DX)(R8*4) 13828 ADDQ CX, DI 13829 ADDQ BX, R8 13830 MOVSS (AX)(DI*4), X1 13831 MULSS X0, X1 13832 ADDSS (DX)(R8*4), X1 13833 MOVSS X1, (DX)(R8*4) 
13834 ADDQ CX, DI 13835 ADDQ BX, R8 13836 SUBQ $0x08, SI 13837 13838 check_limit_unroll: 13839 CMPQ SI, $0x08 13840 JHI loop_unroll 13841 JMP check_limit 13842 13843 loop: 13844 MOVSS (AX)(DI*4), X1 13845 MULSS X0, X1 13846 ADDSS (DX)(R8*4), X1 13847 MOVSS X1, (DX)(R8*4) 13848 DECQ SI 13849 ADDQ CX, DI 13850 ADDQ BX, R8 13851 13852 check_limit: 13853 CMPQ SI, $0x00 13854 JHI loop 13855 RET 13856 13857 // func AmdAxpyUnsafeX_V4A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 13858 // Requires: SSE 13859 TEXT ·AmdAxpyUnsafeX_V4A12R8(SB), NOSPLIT, $0-48 13860 MOVSS alpha+0(FP), X0 13861 MOVQ xs+8(FP), AX 13862 MOVQ incx+16(FP), CX 13863 MOVQ ys+24(FP), DX 13864 MOVQ incy+32(FP), BX 13865 MOVQ n+40(FP), SI 13866 XORQ DI, DI 13867 XORQ R8, R8 13868 JMP check_limit_unroll 13869 PCALIGN $0x08 13870 NOP 13871 NOP 13872 NOP 13873 NOP 13874 13875 loop_unroll: 13876 MOVSS (AX)(DI*4), X1 13877 MULSS X0, X1 13878 ADDSS (DX)(R8*4), X1 13879 MOVSS X1, (DX)(R8*4) 13880 ADDQ CX, DI 13881 ADDQ BX, R8 13882 MOVSS (AX)(DI*4), X1 13883 MULSS X0, X1 13884 ADDSS (DX)(R8*4), X1 13885 MOVSS X1, (DX)(R8*4) 13886 ADDQ CX, DI 13887 ADDQ BX, R8 13888 MOVSS (AX)(DI*4), X1 13889 MULSS X0, X1 13890 ADDSS (DX)(R8*4), X1 13891 MOVSS X1, (DX)(R8*4) 13892 ADDQ CX, DI 13893 ADDQ BX, R8 13894 MOVSS (AX)(DI*4), X1 13895 MULSS X0, X1 13896 ADDSS (DX)(R8*4), X1 13897 MOVSS X1, (DX)(R8*4) 13898 ADDQ CX, DI 13899 ADDQ BX, R8 13900 MOVSS (AX)(DI*4), X1 13901 MULSS X0, X1 13902 ADDSS (DX)(R8*4), X1 13903 MOVSS X1, (DX)(R8*4) 13904 ADDQ CX, DI 13905 ADDQ BX, R8 13906 MOVSS (AX)(DI*4), X1 13907 MULSS X0, X1 13908 ADDSS (DX)(R8*4), X1 13909 MOVSS X1, (DX)(R8*4) 13910 ADDQ CX, DI 13911 ADDQ BX, R8 13912 MOVSS (AX)(DI*4), X1 13913 MULSS X0, X1 13914 ADDSS (DX)(R8*4), X1 13915 MOVSS X1, (DX)(R8*4) 13916 ADDQ CX, DI 13917 ADDQ BX, R8 13918 MOVSS (AX)(DI*4), X1 13919 MULSS X0, X1 13920 ADDSS (DX)(R8*4), X1 13921 MOVSS X1, (DX)(R8*4) 13922 ADDQ CX, DI 13923 ADDQ BX, R8 13924 SUBQ $0x08, SI 13925 13926 check_limit_unroll: 13927 CMPQ SI, $0x08 13928 JHI loop_unroll 13929 JMP check_limit 13930 13931 loop: 13932 MOVSS (AX)(DI*4), X1 13933 MULSS X0, X1 13934 ADDSS (DX)(R8*4), X1 13935 MOVSS X1, (DX)(R8*4) 13936 DECQ SI 13937 ADDQ CX, DI 13938 ADDQ BX, R8 13939 13940 check_limit: 13941 CMPQ SI, $0x00 13942 JHI loop 13943 RET 13944 13945 // func AmdAxpyUnsafeX_V5A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 13946 // Requires: SSE 13947 TEXT ·AmdAxpyUnsafeX_V5A12R8(SB), NOSPLIT, $0-48 13948 MOVSS alpha+0(FP), X0 13949 MOVQ xs+8(FP), AX 13950 MOVQ incx+16(FP), CX 13951 MOVQ ys+24(FP), DX 13952 MOVQ incy+32(FP), BX 13953 MOVQ n+40(FP), SI 13954 XORQ DI, DI 13955 XORQ R8, R8 13956 JMP check_limit_unroll 13957 PCALIGN $0x08 13958 NOP 13959 NOP 13960 NOP 13961 NOP 13962 13963 loop_unroll: 13964 MOVSS (AX)(DI*4), X1 13965 MULSS X0, X1 13966 ADDSS (DX)(R8*4), X1 13967 MOVSS X1, (DX)(R8*4) 13968 ADDQ CX, DI 13969 ADDQ BX, R8 13970 MOVSS (AX)(DI*4), X1 13971 MULSS X0, X1 13972 ADDSS (DX)(R8*4), X1 13973 MOVSS X1, (DX)(R8*4) 13974 ADDQ CX, DI 13975 ADDQ BX, R8 13976 MOVSS (AX)(DI*4), X1 13977 MULSS X0, X1 13978 ADDSS (DX)(R8*4), X1 13979 MOVSS X1, (DX)(R8*4) 13980 ADDQ CX, DI 13981 ADDQ BX, R8 13982 MOVSS (AX)(DI*4), X1 13983 MULSS X0, X1 13984 ADDSS (DX)(R8*4), X1 13985 MOVSS X1, (DX)(R8*4) 13986 ADDQ CX, DI 13987 ADDQ BX, R8 13988 MOVSS (AX)(DI*4), X1 13989 MULSS X0, X1 13990 ADDSS (DX)(R8*4), X1 13991 MOVSS X1, (DX)(R8*4) 13992 ADDQ CX, DI 13993 ADDQ BX, R8 13994 MOVSS (AX)(DI*4), 
X1 13995 MULSS X0, X1 13996 ADDSS (DX)(R8*4), X1 13997 MOVSS X1, (DX)(R8*4) 13998 ADDQ CX, DI 13999 ADDQ BX, R8 14000 MOVSS (AX)(DI*4), X1 14001 MULSS X0, X1 14002 ADDSS (DX)(R8*4), X1 14003 MOVSS X1, (DX)(R8*4) 14004 ADDQ CX, DI 14005 ADDQ BX, R8 14006 MOVSS (AX)(DI*4), X1 14007 MULSS X0, X1 14008 ADDSS (DX)(R8*4), X1 14009 MOVSS X1, (DX)(R8*4) 14010 ADDQ CX, DI 14011 ADDQ BX, R8 14012 SUBQ $0x08, SI 14013 14014 check_limit_unroll: 14015 CMPQ SI, $0x08 14016 JHI loop_unroll 14017 JMP check_limit 14018 14019 loop: 14020 MOVSS (AX)(DI*4), X1 14021 MULSS X0, X1 14022 ADDSS (DX)(R8*4), X1 14023 MOVSS X1, (DX)(R8*4) 14024 DECQ SI 14025 ADDQ CX, DI 14026 ADDQ BX, R8 14027 14028 check_limit: 14029 CMPQ SI, $0x00 14030 JHI loop 14031 RET 14032 14033 // func AmdAxpyUnsafeX_V0A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 14034 // Requires: SSE 14035 TEXT ·AmdAxpyUnsafeX_V0A13R8(SB), NOSPLIT, $0-48 14036 MOVSS alpha+0(FP), X0 14037 MOVQ xs+8(FP), AX 14038 MOVQ incx+16(FP), CX 14039 MOVQ ys+24(FP), DX 14040 MOVQ incy+32(FP), BX 14041 MOVQ n+40(FP), SI 14042 XORQ DI, DI 14043 XORQ R8, R8 14044 JMP check_limit_unroll 14045 PCALIGN $0x08 14046 NOP 14047 NOP 14048 NOP 14049 NOP 14050 NOP 14051 14052 loop_unroll: 14053 MOVSS (AX)(DI*4), X1 14054 MULSS X0, X1 14055 ADDSS (DX)(R8*4), X1 14056 MOVSS X1, (DX)(R8*4) 14057 ADDQ CX, DI 14058 ADDQ BX, R8 14059 MOVSS (AX)(DI*4), X1 14060 MULSS X0, X1 14061 ADDSS (DX)(R8*4), X1 14062 MOVSS X1, (DX)(R8*4) 14063 ADDQ CX, DI 14064 ADDQ BX, R8 14065 MOVSS (AX)(DI*4), X1 14066 MULSS X0, X1 14067 ADDSS (DX)(R8*4), X1 14068 MOVSS X1, (DX)(R8*4) 14069 ADDQ CX, DI 14070 ADDQ BX, R8 14071 MOVSS (AX)(DI*4), X1 14072 MULSS X0, X1 14073 ADDSS (DX)(R8*4), X1 14074 MOVSS X1, (DX)(R8*4) 14075 ADDQ CX, DI 14076 ADDQ BX, R8 14077 MOVSS (AX)(DI*4), X1 14078 MULSS X0, X1 14079 ADDSS (DX)(R8*4), X1 14080 MOVSS X1, (DX)(R8*4) 14081 ADDQ CX, DI 14082 ADDQ BX, R8 14083 MOVSS (AX)(DI*4), X1 14084 MULSS X0, X1 14085 ADDSS (DX)(R8*4), X1 14086 MOVSS X1, (DX)(R8*4) 14087 ADDQ CX, DI 14088 ADDQ BX, R8 14089 MOVSS (AX)(DI*4), X1 14090 MULSS X0, X1 14091 ADDSS (DX)(R8*4), X1 14092 MOVSS X1, (DX)(R8*4) 14093 ADDQ CX, DI 14094 ADDQ BX, R8 14095 MOVSS (AX)(DI*4), X1 14096 MULSS X0, X1 14097 ADDSS (DX)(R8*4), X1 14098 MOVSS X1, (DX)(R8*4) 14099 ADDQ CX, DI 14100 ADDQ BX, R8 14101 SUBQ $0x08, SI 14102 14103 check_limit_unroll: 14104 CMPQ SI, $0x08 14105 JHI loop_unroll 14106 JMP check_limit 14107 14108 loop: 14109 MOVSS (AX)(DI*4), X1 14110 MULSS X0, X1 14111 ADDSS (DX)(R8*4), X1 14112 MOVSS X1, (DX)(R8*4) 14113 DECQ SI 14114 ADDQ CX, DI 14115 ADDQ BX, R8 14116 14117 check_limit: 14118 CMPQ SI, $0x00 14119 JHI loop 14120 RET 14121 14122 // func AmdAxpyUnsafeX_V1A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 14123 // Requires: SSE 14124 TEXT ·AmdAxpyUnsafeX_V1A13R8(SB), NOSPLIT, $0-48 14125 MOVSS alpha+0(FP), X0 14126 MOVQ xs+8(FP), AX 14127 MOVQ incx+16(FP), CX 14128 MOVQ ys+24(FP), DX 14129 MOVQ incy+32(FP), BX 14130 MOVQ n+40(FP), SI 14131 XORQ DI, DI 14132 XORQ R8, R8 14133 JMP check_limit_unroll 14134 PCALIGN $0x08 14135 NOP 14136 NOP 14137 NOP 14138 NOP 14139 NOP 14140 14141 loop_unroll: 14142 MOVSS (AX)(DI*4), X1 14143 MULSS X0, X1 14144 ADDSS (DX)(R8*4), X1 14145 MOVSS X1, (DX)(R8*4) 14146 ADDQ CX, DI 14147 ADDQ BX, R8 14148 MOVSS (AX)(DI*4), X1 14149 MULSS X0, X1 14150 ADDSS (DX)(R8*4), X1 14151 MOVSS X1, (DX)(R8*4) 14152 ADDQ CX, DI 14153 ADDQ BX, R8 14154 MOVSS (AX)(DI*4), X1 14155 MULSS X0, X1 14156 ADDSS 
(DX)(R8*4), X1 14157 MOVSS X1, (DX)(R8*4) 14158 ADDQ CX, DI 14159 ADDQ BX, R8 14160 MOVSS (AX)(DI*4), X1 14161 MULSS X0, X1 14162 ADDSS (DX)(R8*4), X1 14163 MOVSS X1, (DX)(R8*4) 14164 ADDQ CX, DI 14165 ADDQ BX, R8 14166 MOVSS (AX)(DI*4), X1 14167 MULSS X0, X1 14168 ADDSS (DX)(R8*4), X1 14169 MOVSS X1, (DX)(R8*4) 14170 ADDQ CX, DI 14171 ADDQ BX, R8 14172 MOVSS (AX)(DI*4), X1 14173 MULSS X0, X1 14174 ADDSS (DX)(R8*4), X1 14175 MOVSS X1, (DX)(R8*4) 14176 ADDQ CX, DI 14177 ADDQ BX, R8 14178 MOVSS (AX)(DI*4), X1 14179 MULSS X0, X1 14180 ADDSS (DX)(R8*4), X1 14181 MOVSS X1, (DX)(R8*4) 14182 ADDQ CX, DI 14183 ADDQ BX, R8 14184 MOVSS (AX)(DI*4), X1 14185 MULSS X0, X1 14186 ADDSS (DX)(R8*4), X1 14187 MOVSS X1, (DX)(R8*4) 14188 ADDQ CX, DI 14189 ADDQ BX, R8 14190 SUBQ $0x08, SI 14191 14192 check_limit_unroll: 14193 CMPQ SI, $0x08 14194 JHI loop_unroll 14195 JMP check_limit 14196 14197 loop: 14198 MOVSS (AX)(DI*4), X1 14199 MULSS X0, X1 14200 ADDSS (DX)(R8*4), X1 14201 MOVSS X1, (DX)(R8*4) 14202 DECQ SI 14203 ADDQ CX, DI 14204 ADDQ BX, R8 14205 14206 check_limit: 14207 CMPQ SI, $0x00 14208 JHI loop 14209 RET 14210 14211 // func AmdAxpyUnsafeX_V2A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 14212 // Requires: SSE 14213 TEXT ·AmdAxpyUnsafeX_V2A13R8(SB), NOSPLIT, $0-48 14214 MOVSS alpha+0(FP), X0 14215 MOVQ xs+8(FP), AX 14216 MOVQ incx+16(FP), CX 14217 MOVQ ys+24(FP), DX 14218 MOVQ incy+32(FP), BX 14219 MOVQ n+40(FP), SI 14220 XORQ DI, DI 14221 XORQ R8, R8 14222 JMP check_limit_unroll 14223 PCALIGN $0x08 14224 NOP 14225 NOP 14226 NOP 14227 NOP 14228 NOP 14229 14230 loop_unroll: 14231 MOVSS (AX)(DI*4), X1 14232 MULSS X0, X1 14233 ADDSS (DX)(R8*4), X1 14234 MOVSS X1, (DX)(R8*4) 14235 ADDQ CX, DI 14236 ADDQ BX, R8 14237 MOVSS (AX)(DI*4), X1 14238 MULSS X0, X1 14239 ADDSS (DX)(R8*4), X1 14240 MOVSS X1, (DX)(R8*4) 14241 ADDQ CX, DI 14242 ADDQ BX, R8 14243 MOVSS (AX)(DI*4), X1 14244 MULSS X0, X1 14245 ADDSS (DX)(R8*4), X1 14246 MOVSS X1, (DX)(R8*4) 14247 ADDQ CX, DI 14248 ADDQ BX, R8 14249 MOVSS (AX)(DI*4), X1 14250 MULSS X0, X1 14251 ADDSS (DX)(R8*4), X1 14252 MOVSS X1, (DX)(R8*4) 14253 ADDQ CX, DI 14254 ADDQ BX, R8 14255 MOVSS (AX)(DI*4), X1 14256 MULSS X0, X1 14257 ADDSS (DX)(R8*4), X1 14258 MOVSS X1, (DX)(R8*4) 14259 ADDQ CX, DI 14260 ADDQ BX, R8 14261 MOVSS (AX)(DI*4), X1 14262 MULSS X0, X1 14263 ADDSS (DX)(R8*4), X1 14264 MOVSS X1, (DX)(R8*4) 14265 ADDQ CX, DI 14266 ADDQ BX, R8 14267 MOVSS (AX)(DI*4), X1 14268 MULSS X0, X1 14269 ADDSS (DX)(R8*4), X1 14270 MOVSS X1, (DX)(R8*4) 14271 ADDQ CX, DI 14272 ADDQ BX, R8 14273 MOVSS (AX)(DI*4), X1 14274 MULSS X0, X1 14275 ADDSS (DX)(R8*4), X1 14276 MOVSS X1, (DX)(R8*4) 14277 ADDQ CX, DI 14278 ADDQ BX, R8 14279 SUBQ $0x08, SI 14280 14281 check_limit_unroll: 14282 CMPQ SI, $0x08 14283 JHI loop_unroll 14284 JMP check_limit 14285 14286 loop: 14287 MOVSS (AX)(DI*4), X1 14288 MULSS X0, X1 14289 ADDSS (DX)(R8*4), X1 14290 MOVSS X1, (DX)(R8*4) 14291 DECQ SI 14292 ADDQ CX, DI 14293 ADDQ BX, R8 14294 14295 check_limit: 14296 CMPQ SI, $0x00 14297 JHI loop 14298 RET 14299 14300 // func AmdAxpyUnsafeX_V3A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 14301 // Requires: SSE 14302 TEXT ·AmdAxpyUnsafeX_V3A13R8(SB), NOSPLIT, $0-48 14303 MOVSS alpha+0(FP), X0 14304 MOVQ xs+8(FP), AX 14305 MOVQ incx+16(FP), CX 14306 MOVQ ys+24(FP), DX 14307 MOVQ incy+32(FP), BX 14308 MOVQ n+40(FP), SI 14309 XORQ DI, DI 14310 XORQ R8, R8 14311 JMP check_limit_unroll 14312 PCALIGN $0x08 14313 NOP 14314 NOP 14315 NOP 14316 
NOP 14317 NOP 14318 14319 loop_unroll: 14320 MOVSS (AX)(DI*4), X1 14321 MULSS X0, X1 14322 ADDSS (DX)(R8*4), X1 14323 MOVSS X1, (DX)(R8*4) 14324 ADDQ CX, DI 14325 ADDQ BX, R8 14326 MOVSS (AX)(DI*4), X1 14327 MULSS X0, X1 14328 ADDSS (DX)(R8*4), X1 14329 MOVSS X1, (DX)(R8*4) 14330 ADDQ CX, DI 14331 ADDQ BX, R8 14332 MOVSS (AX)(DI*4), X1 14333 MULSS X0, X1 14334 ADDSS (DX)(R8*4), X1 14335 MOVSS X1, (DX)(R8*4) 14336 ADDQ CX, DI 14337 ADDQ BX, R8 14338 MOVSS (AX)(DI*4), X1 14339 MULSS X0, X1 14340 ADDSS (DX)(R8*4), X1 14341 MOVSS X1, (DX)(R8*4) 14342 ADDQ CX, DI 14343 ADDQ BX, R8 14344 MOVSS (AX)(DI*4), X1 14345 MULSS X0, X1 14346 ADDSS (DX)(R8*4), X1 14347 MOVSS X1, (DX)(R8*4) 14348 ADDQ CX, DI 14349 ADDQ BX, R8 14350 MOVSS (AX)(DI*4), X1 14351 MULSS X0, X1 14352 ADDSS (DX)(R8*4), X1 14353 MOVSS X1, (DX)(R8*4) 14354 ADDQ CX, DI 14355 ADDQ BX, R8 14356 MOVSS (AX)(DI*4), X1 14357 MULSS X0, X1 14358 ADDSS (DX)(R8*4), X1 14359 MOVSS X1, (DX)(R8*4) 14360 ADDQ CX, DI 14361 ADDQ BX, R8 14362 MOVSS (AX)(DI*4), X1 14363 MULSS X0, X1 14364 ADDSS (DX)(R8*4), X1 14365 MOVSS X1, (DX)(R8*4) 14366 ADDQ CX, DI 14367 ADDQ BX, R8 14368 SUBQ $0x08, SI 14369 14370 check_limit_unroll: 14371 CMPQ SI, $0x08 14372 JHI loop_unroll 14373 JMP check_limit 14374 14375 loop: 14376 MOVSS (AX)(DI*4), X1 14377 MULSS X0, X1 14378 ADDSS (DX)(R8*4), X1 14379 MOVSS X1, (DX)(R8*4) 14380 DECQ SI 14381 ADDQ CX, DI 14382 ADDQ BX, R8 14383 14384 check_limit: 14385 CMPQ SI, $0x00 14386 JHI loop 14387 RET 14388 14389 // func AmdAxpyUnsafeX_V4A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 14390 // Requires: SSE 14391 TEXT ·AmdAxpyUnsafeX_V4A13R8(SB), NOSPLIT, $0-48 14392 MOVSS alpha+0(FP), X0 14393 MOVQ xs+8(FP), AX 14394 MOVQ incx+16(FP), CX 14395 MOVQ ys+24(FP), DX 14396 MOVQ incy+32(FP), BX 14397 MOVQ n+40(FP), SI 14398 XORQ DI, DI 14399 XORQ R8, R8 14400 JMP check_limit_unroll 14401 PCALIGN $0x08 14402 NOP 14403 NOP 14404 NOP 14405 NOP 14406 NOP 14407 14408 loop_unroll: 14409 MOVSS (AX)(DI*4), X1 14410 MULSS X0, X1 14411 ADDSS (DX)(R8*4), X1 14412 MOVSS X1, (DX)(R8*4) 14413 ADDQ CX, DI 14414 ADDQ BX, R8 14415 MOVSS (AX)(DI*4), X1 14416 MULSS X0, X1 14417 ADDSS (DX)(R8*4), X1 14418 MOVSS X1, (DX)(R8*4) 14419 ADDQ CX, DI 14420 ADDQ BX, R8 14421 MOVSS (AX)(DI*4), X1 14422 MULSS X0, X1 14423 ADDSS (DX)(R8*4), X1 14424 MOVSS X1, (DX)(R8*4) 14425 ADDQ CX, DI 14426 ADDQ BX, R8 14427 MOVSS (AX)(DI*4), X1 14428 MULSS X0, X1 14429 ADDSS (DX)(R8*4), X1 14430 MOVSS X1, (DX)(R8*4) 14431 ADDQ CX, DI 14432 ADDQ BX, R8 14433 MOVSS (AX)(DI*4), X1 14434 MULSS X0, X1 14435 ADDSS (DX)(R8*4), X1 14436 MOVSS X1, (DX)(R8*4) 14437 ADDQ CX, DI 14438 ADDQ BX, R8 14439 MOVSS (AX)(DI*4), X1 14440 MULSS X0, X1 14441 ADDSS (DX)(R8*4), X1 14442 MOVSS X1, (DX)(R8*4) 14443 ADDQ CX, DI 14444 ADDQ BX, R8 14445 MOVSS (AX)(DI*4), X1 14446 MULSS X0, X1 14447 ADDSS (DX)(R8*4), X1 14448 MOVSS X1, (DX)(R8*4) 14449 ADDQ CX, DI 14450 ADDQ BX, R8 14451 MOVSS (AX)(DI*4), X1 14452 MULSS X0, X1 14453 ADDSS (DX)(R8*4), X1 14454 MOVSS X1, (DX)(R8*4) 14455 ADDQ CX, DI 14456 ADDQ BX, R8 14457 SUBQ $0x08, SI 14458 14459 check_limit_unroll: 14460 CMPQ SI, $0x08 14461 JHI loop_unroll 14462 JMP check_limit 14463 14464 loop: 14465 MOVSS (AX)(DI*4), X1 14466 MULSS X0, X1 14467 ADDSS (DX)(R8*4), X1 14468 MOVSS X1, (DX)(R8*4) 14469 DECQ SI 14470 ADDQ CX, DI 14471 ADDQ BX, R8 14472 14473 check_limit: 14474 CMPQ SI, $0x00 14475 JHI loop 14476 RET 14477 14478 // func AmdAxpyUnsafeX_V5A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy 
uintptr, n uintptr) 14479 // Requires: SSE 14480 TEXT ·AmdAxpyUnsafeX_V5A13R8(SB), NOSPLIT, $0-48 14481 MOVSS alpha+0(FP), X0 14482 MOVQ xs+8(FP), AX 14483 MOVQ incx+16(FP), CX 14484 MOVQ ys+24(FP), DX 14485 MOVQ incy+32(FP), BX 14486 MOVQ n+40(FP), SI 14487 XORQ DI, DI 14488 XORQ R8, R8 14489 JMP check_limit_unroll 14490 PCALIGN $0x08 14491 NOP 14492 NOP 14493 NOP 14494 NOP 14495 NOP 14496 14497 loop_unroll: 14498 MOVSS (AX)(DI*4), X1 14499 MULSS X0, X1 14500 ADDSS (DX)(R8*4), X1 14501 MOVSS X1, (DX)(R8*4) 14502 ADDQ CX, DI 14503 ADDQ BX, R8 14504 MOVSS (AX)(DI*4), X1 14505 MULSS X0, X1 14506 ADDSS (DX)(R8*4), X1 14507 MOVSS X1, (DX)(R8*4) 14508 ADDQ CX, DI 14509 ADDQ BX, R8 14510 MOVSS (AX)(DI*4), X1 14511 MULSS X0, X1 14512 ADDSS (DX)(R8*4), X1 14513 MOVSS X1, (DX)(R8*4) 14514 ADDQ CX, DI 14515 ADDQ BX, R8 14516 MOVSS (AX)(DI*4), X1 14517 MULSS X0, X1 14518 ADDSS (DX)(R8*4), X1 14519 MOVSS X1, (DX)(R8*4) 14520 ADDQ CX, DI 14521 ADDQ BX, R8 14522 MOVSS (AX)(DI*4), X1 14523 MULSS X0, X1 14524 ADDSS (DX)(R8*4), X1 14525 MOVSS X1, (DX)(R8*4) 14526 ADDQ CX, DI 14527 ADDQ BX, R8 14528 MOVSS (AX)(DI*4), X1 14529 MULSS X0, X1 14530 ADDSS (DX)(R8*4), X1 14531 MOVSS X1, (DX)(R8*4) 14532 ADDQ CX, DI 14533 ADDQ BX, R8 14534 MOVSS (AX)(DI*4), X1 14535 MULSS X0, X1 14536 ADDSS (DX)(R8*4), X1 14537 MOVSS X1, (DX)(R8*4) 14538 ADDQ CX, DI 14539 ADDQ BX, R8 14540 MOVSS (AX)(DI*4), X1 14541 MULSS X0, X1 14542 ADDSS (DX)(R8*4), X1 14543 MOVSS X1, (DX)(R8*4) 14544 ADDQ CX, DI 14545 ADDQ BX, R8 14546 SUBQ $0x08, SI 14547 14548 check_limit_unroll: 14549 CMPQ SI, $0x08 14550 JHI loop_unroll 14551 JMP check_limit 14552 14553 loop: 14554 MOVSS (AX)(DI*4), X1 14555 MULSS X0, X1 14556 ADDSS (DX)(R8*4), X1 14557 MOVSS X1, (DX)(R8*4) 14558 DECQ SI 14559 ADDQ CX, DI 14560 ADDQ BX, R8 14561 14562 check_limit: 14563 CMPQ SI, $0x00 14564 JHI loop 14565 RET 14566 14567 // func AmdAxpyUnsafeX_V0A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 14568 // Requires: SSE 14569 TEXT ·AmdAxpyUnsafeX_V0A14R8(SB), NOSPLIT, $0-48 14570 MOVSS alpha+0(FP), X0 14571 MOVQ xs+8(FP), AX 14572 MOVQ incx+16(FP), CX 14573 MOVQ ys+24(FP), DX 14574 MOVQ incy+32(FP), BX 14575 MOVQ n+40(FP), SI 14576 XORQ DI, DI 14577 XORQ R8, R8 14578 JMP check_limit_unroll 14579 PCALIGN $0x08 14580 NOP 14581 NOP 14582 NOP 14583 NOP 14584 NOP 14585 NOP 14586 14587 loop_unroll: 14588 MOVSS (AX)(DI*4), X1 14589 MULSS X0, X1 14590 ADDSS (DX)(R8*4), X1 14591 MOVSS X1, (DX)(R8*4) 14592 ADDQ CX, DI 14593 ADDQ BX, R8 14594 MOVSS (AX)(DI*4), X1 14595 MULSS X0, X1 14596 ADDSS (DX)(R8*4), X1 14597 MOVSS X1, (DX)(R8*4) 14598 ADDQ CX, DI 14599 ADDQ BX, R8 14600 MOVSS (AX)(DI*4), X1 14601 MULSS X0, X1 14602 ADDSS (DX)(R8*4), X1 14603 MOVSS X1, (DX)(R8*4) 14604 ADDQ CX, DI 14605 ADDQ BX, R8 14606 MOVSS (AX)(DI*4), X1 14607 MULSS X0, X1 14608 ADDSS (DX)(R8*4), X1 14609 MOVSS X1, (DX)(R8*4) 14610 ADDQ CX, DI 14611 ADDQ BX, R8 14612 MOVSS (AX)(DI*4), X1 14613 MULSS X0, X1 14614 ADDSS (DX)(R8*4), X1 14615 MOVSS X1, (DX)(R8*4) 14616 ADDQ CX, DI 14617 ADDQ BX, R8 14618 MOVSS (AX)(DI*4), X1 14619 MULSS X0, X1 14620 ADDSS (DX)(R8*4), X1 14621 MOVSS X1, (DX)(R8*4) 14622 ADDQ CX, DI 14623 ADDQ BX, R8 14624 MOVSS (AX)(DI*4), X1 14625 MULSS X0, X1 14626 ADDSS (DX)(R8*4), X1 14627 MOVSS X1, (DX)(R8*4) 14628 ADDQ CX, DI 14629 ADDQ BX, R8 14630 MOVSS (AX)(DI*4), X1 14631 MULSS X0, X1 14632 ADDSS (DX)(R8*4), X1 14633 MOVSS X1, (DX)(R8*4) 14634 ADDQ CX, DI 14635 ADDQ BX, R8 14636 SUBQ $0x08, SI 14637 14638 check_limit_unroll: 14639 CMPQ SI, $0x08 14640 
JHI loop_unroll 14641 JMP check_limit 14642 14643 loop: 14644 MOVSS (AX)(DI*4), X1 14645 MULSS X0, X1 14646 ADDSS (DX)(R8*4), X1 14647 MOVSS X1, (DX)(R8*4) 14648 DECQ SI 14649 ADDQ CX, DI 14650 ADDQ BX, R8 14651 14652 check_limit: 14653 CMPQ SI, $0x00 14654 JHI loop 14655 RET 14656 14657 // func AmdAxpyUnsafeX_V1A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 14658 // Requires: SSE 14659 TEXT ·AmdAxpyUnsafeX_V1A14R8(SB), NOSPLIT, $0-48 14660 MOVSS alpha+0(FP), X0 14661 MOVQ xs+8(FP), AX 14662 MOVQ incx+16(FP), CX 14663 MOVQ ys+24(FP), DX 14664 MOVQ incy+32(FP), BX 14665 MOVQ n+40(FP), SI 14666 XORQ DI, DI 14667 XORQ R8, R8 14668 JMP check_limit_unroll 14669 PCALIGN $0x08 14670 NOP 14671 NOP 14672 NOP 14673 NOP 14674 NOP 14675 NOP 14676 14677 loop_unroll: 14678 MOVSS (AX)(DI*4), X1 14679 MULSS X0, X1 14680 ADDSS (DX)(R8*4), X1 14681 MOVSS X1, (DX)(R8*4) 14682 ADDQ CX, DI 14683 ADDQ BX, R8 14684 MOVSS (AX)(DI*4), X1 14685 MULSS X0, X1 14686 ADDSS (DX)(R8*4), X1 14687 MOVSS X1, (DX)(R8*4) 14688 ADDQ CX, DI 14689 ADDQ BX, R8 14690 MOVSS (AX)(DI*4), X1 14691 MULSS X0, X1 14692 ADDSS (DX)(R8*4), X1 14693 MOVSS X1, (DX)(R8*4) 14694 ADDQ CX, DI 14695 ADDQ BX, R8 14696 MOVSS (AX)(DI*4), X1 14697 MULSS X0, X1 14698 ADDSS (DX)(R8*4), X1 14699 MOVSS X1, (DX)(R8*4) 14700 ADDQ CX, DI 14701 ADDQ BX, R8 14702 MOVSS (AX)(DI*4), X1 14703 MULSS X0, X1 14704 ADDSS (DX)(R8*4), X1 14705 MOVSS X1, (DX)(R8*4) 14706 ADDQ CX, DI 14707 ADDQ BX, R8 14708 MOVSS (AX)(DI*4), X1 14709 MULSS X0, X1 14710 ADDSS (DX)(R8*4), X1 14711 MOVSS X1, (DX)(R8*4) 14712 ADDQ CX, DI 14713 ADDQ BX, R8 14714 MOVSS (AX)(DI*4), X1 14715 MULSS X0, X1 14716 ADDSS (DX)(R8*4), X1 14717 MOVSS X1, (DX)(R8*4) 14718 ADDQ CX, DI 14719 ADDQ BX, R8 14720 MOVSS (AX)(DI*4), X1 14721 MULSS X0, X1 14722 ADDSS (DX)(R8*4), X1 14723 MOVSS X1, (DX)(R8*4) 14724 ADDQ CX, DI 14725 ADDQ BX, R8 14726 SUBQ $0x08, SI 14727 14728 check_limit_unroll: 14729 CMPQ SI, $0x08 14730 JHI loop_unroll 14731 JMP check_limit 14732 14733 loop: 14734 MOVSS (AX)(DI*4), X1 14735 MULSS X0, X1 14736 ADDSS (DX)(R8*4), X1 14737 MOVSS X1, (DX)(R8*4) 14738 DECQ SI 14739 ADDQ CX, DI 14740 ADDQ BX, R8 14741 14742 check_limit: 14743 CMPQ SI, $0x00 14744 JHI loop 14745 RET 14746 14747 // func AmdAxpyUnsafeX_V2A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 14748 // Requires: SSE 14749 TEXT ·AmdAxpyUnsafeX_V2A14R8(SB), NOSPLIT, $0-48 14750 MOVSS alpha+0(FP), X0 14751 MOVQ xs+8(FP), AX 14752 MOVQ incx+16(FP), CX 14753 MOVQ ys+24(FP), DX 14754 MOVQ incy+32(FP), BX 14755 MOVQ n+40(FP), SI 14756 XORQ DI, DI 14757 XORQ R8, R8 14758 JMP check_limit_unroll 14759 PCALIGN $0x08 14760 NOP 14761 NOP 14762 NOP 14763 NOP 14764 NOP 14765 NOP 14766 14767 loop_unroll: 14768 MOVSS (AX)(DI*4), X1 14769 MULSS X0, X1 14770 ADDSS (DX)(R8*4), X1 14771 MOVSS X1, (DX)(R8*4) 14772 ADDQ CX, DI 14773 ADDQ BX, R8 14774 MOVSS (AX)(DI*4), X1 14775 MULSS X0, X1 14776 ADDSS (DX)(R8*4), X1 14777 MOVSS X1, (DX)(R8*4) 14778 ADDQ CX, DI 14779 ADDQ BX, R8 14780 MOVSS (AX)(DI*4), X1 14781 MULSS X0, X1 14782 ADDSS (DX)(R8*4), X1 14783 MOVSS X1, (DX)(R8*4) 14784 ADDQ CX, DI 14785 ADDQ BX, R8 14786 MOVSS (AX)(DI*4), X1 14787 MULSS X0, X1 14788 ADDSS (DX)(R8*4), X1 14789 MOVSS X1, (DX)(R8*4) 14790 ADDQ CX, DI 14791 ADDQ BX, R8 14792 MOVSS (AX)(DI*4), X1 14793 MULSS X0, X1 14794 ADDSS (DX)(R8*4), X1 14795 MOVSS X1, (DX)(R8*4) 14796 ADDQ CX, DI 14797 ADDQ BX, R8 14798 MOVSS (AX)(DI*4), X1 14799 MULSS X0, X1 14800 ADDSS (DX)(R8*4), X1 14801 MOVSS X1, (DX)(R8*4) 
14802 ADDQ CX, DI 14803 ADDQ BX, R8 14804 MOVSS (AX)(DI*4), X1 14805 MULSS X0, X1 14806 ADDSS (DX)(R8*4), X1 14807 MOVSS X1, (DX)(R8*4) 14808 ADDQ CX, DI 14809 ADDQ BX, R8 14810 MOVSS (AX)(DI*4), X1 14811 MULSS X0, X1 14812 ADDSS (DX)(R8*4), X1 14813 MOVSS X1, (DX)(R8*4) 14814 ADDQ CX, DI 14815 ADDQ BX, R8 14816 SUBQ $0x08, SI 14817 14818 check_limit_unroll: 14819 CMPQ SI, $0x08 14820 JHI loop_unroll 14821 JMP check_limit 14822 14823 loop: 14824 MOVSS (AX)(DI*4), X1 14825 MULSS X0, X1 14826 ADDSS (DX)(R8*4), X1 14827 MOVSS X1, (DX)(R8*4) 14828 DECQ SI 14829 ADDQ CX, DI 14830 ADDQ BX, R8 14831 14832 check_limit: 14833 CMPQ SI, $0x00 14834 JHI loop 14835 RET 14836 14837 // func AmdAxpyUnsafeX_V3A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 14838 // Requires: SSE 14839 TEXT ·AmdAxpyUnsafeX_V3A14R8(SB), NOSPLIT, $0-48 14840 MOVSS alpha+0(FP), X0 14841 MOVQ xs+8(FP), AX 14842 MOVQ incx+16(FP), CX 14843 MOVQ ys+24(FP), DX 14844 MOVQ incy+32(FP), BX 14845 MOVQ n+40(FP), SI 14846 XORQ DI, DI 14847 XORQ R8, R8 14848 JMP check_limit_unroll 14849 PCALIGN $0x08 14850 NOP 14851 NOP 14852 NOP 14853 NOP 14854 NOP 14855 NOP 14856 14857 loop_unroll: 14858 MOVSS (AX)(DI*4), X1 14859 MULSS X0, X1 14860 ADDSS (DX)(R8*4), X1 14861 MOVSS X1, (DX)(R8*4) 14862 ADDQ CX, DI 14863 ADDQ BX, R8 14864 MOVSS (AX)(DI*4), X1 14865 MULSS X0, X1 14866 ADDSS (DX)(R8*4), X1 14867 MOVSS X1, (DX)(R8*4) 14868 ADDQ CX, DI 14869 ADDQ BX, R8 14870 MOVSS (AX)(DI*4), X1 14871 MULSS X0, X1 14872 ADDSS (DX)(R8*4), X1 14873 MOVSS X1, (DX)(R8*4) 14874 ADDQ CX, DI 14875 ADDQ BX, R8 14876 MOVSS (AX)(DI*4), X1 14877 MULSS X0, X1 14878 ADDSS (DX)(R8*4), X1 14879 MOVSS X1, (DX)(R8*4) 14880 ADDQ CX, DI 14881 ADDQ BX, R8 14882 MOVSS (AX)(DI*4), X1 14883 MULSS X0, X1 14884 ADDSS (DX)(R8*4), X1 14885 MOVSS X1, (DX)(R8*4) 14886 ADDQ CX, DI 14887 ADDQ BX, R8 14888 MOVSS (AX)(DI*4), X1 14889 MULSS X0, X1 14890 ADDSS (DX)(R8*4), X1 14891 MOVSS X1, (DX)(R8*4) 14892 ADDQ CX, DI 14893 ADDQ BX, R8 14894 MOVSS (AX)(DI*4), X1 14895 MULSS X0, X1 14896 ADDSS (DX)(R8*4), X1 14897 MOVSS X1, (DX)(R8*4) 14898 ADDQ CX, DI 14899 ADDQ BX, R8 14900 MOVSS (AX)(DI*4), X1 14901 MULSS X0, X1 14902 ADDSS (DX)(R8*4), X1 14903 MOVSS X1, (DX)(R8*4) 14904 ADDQ CX, DI 14905 ADDQ BX, R8 14906 SUBQ $0x08, SI 14907 14908 check_limit_unroll: 14909 CMPQ SI, $0x08 14910 JHI loop_unroll 14911 JMP check_limit 14912 14913 loop: 14914 MOVSS (AX)(DI*4), X1 14915 MULSS X0, X1 14916 ADDSS (DX)(R8*4), X1 14917 MOVSS X1, (DX)(R8*4) 14918 DECQ SI 14919 ADDQ CX, DI 14920 ADDQ BX, R8 14921 14922 check_limit: 14923 CMPQ SI, $0x00 14924 JHI loop 14925 RET 14926 14927 // func AmdAxpyUnsafeX_V4A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 14928 // Requires: SSE 14929 TEXT ·AmdAxpyUnsafeX_V4A14R8(SB), NOSPLIT, $0-48 14930 MOVSS alpha+0(FP), X0 14931 MOVQ xs+8(FP), AX 14932 MOVQ incx+16(FP), CX 14933 MOVQ ys+24(FP), DX 14934 MOVQ incy+32(FP), BX 14935 MOVQ n+40(FP), SI 14936 XORQ DI, DI 14937 XORQ R8, R8 14938 JMP check_limit_unroll 14939 PCALIGN $0x08 14940 NOP 14941 NOP 14942 NOP 14943 NOP 14944 NOP 14945 NOP 14946 14947 loop_unroll: 14948 MOVSS (AX)(DI*4), X1 14949 MULSS X0, X1 14950 ADDSS (DX)(R8*4), X1 14951 MOVSS X1, (DX)(R8*4) 14952 ADDQ CX, DI 14953 ADDQ BX, R8 14954 MOVSS (AX)(DI*4), X1 14955 MULSS X0, X1 14956 ADDSS (DX)(R8*4), X1 14957 MOVSS X1, (DX)(R8*4) 14958 ADDQ CX, DI 14959 ADDQ BX, R8 14960 MOVSS (AX)(DI*4), X1 14961 MULSS X0, X1 14962 ADDSS (DX)(R8*4), X1 14963 MOVSS X1, (DX)(R8*4) 14964 ADDQ CX, DI 
14965 ADDQ BX, R8 14966 MOVSS (AX)(DI*4), X1 14967 MULSS X0, X1 14968 ADDSS (DX)(R8*4), X1 14969 MOVSS X1, (DX)(R8*4) 14970 ADDQ CX, DI 14971 ADDQ BX, R8 14972 MOVSS (AX)(DI*4), X1 14973 MULSS X0, X1 14974 ADDSS (DX)(R8*4), X1 14975 MOVSS X1, (DX)(R8*4) 14976 ADDQ CX, DI 14977 ADDQ BX, R8 14978 MOVSS (AX)(DI*4), X1 14979 MULSS X0, X1 14980 ADDSS (DX)(R8*4), X1 14981 MOVSS X1, (DX)(R8*4) 14982 ADDQ CX, DI 14983 ADDQ BX, R8 14984 MOVSS (AX)(DI*4), X1 14985 MULSS X0, X1 14986 ADDSS (DX)(R8*4), X1 14987 MOVSS X1, (DX)(R8*4) 14988 ADDQ CX, DI 14989 ADDQ BX, R8 14990 MOVSS (AX)(DI*4), X1 14991 MULSS X0, X1 14992 ADDSS (DX)(R8*4), X1 14993 MOVSS X1, (DX)(R8*4) 14994 ADDQ CX, DI 14995 ADDQ BX, R8 14996 SUBQ $0x08, SI 14997 14998 check_limit_unroll: 14999 CMPQ SI, $0x08 15000 JHI loop_unroll 15001 JMP check_limit 15002 15003 loop: 15004 MOVSS (AX)(DI*4), X1 15005 MULSS X0, X1 15006 ADDSS (DX)(R8*4), X1 15007 MOVSS X1, (DX)(R8*4) 15008 DECQ SI 15009 ADDQ CX, DI 15010 ADDQ BX, R8 15011 15012 check_limit: 15013 CMPQ SI, $0x00 15014 JHI loop 15015 RET 15016 15017 // func AmdAxpyUnsafeX_V5A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 15018 // Requires: SSE 15019 TEXT ·AmdAxpyUnsafeX_V5A14R8(SB), NOSPLIT, $0-48 15020 MOVSS alpha+0(FP), X0 15021 MOVQ xs+8(FP), AX 15022 MOVQ incx+16(FP), CX 15023 MOVQ ys+24(FP), DX 15024 MOVQ incy+32(FP), BX 15025 MOVQ n+40(FP), SI 15026 XORQ DI, DI 15027 XORQ R8, R8 15028 JMP check_limit_unroll 15029 PCALIGN $0x08 15030 NOP 15031 NOP 15032 NOP 15033 NOP 15034 NOP 15035 NOP 15036 15037 loop_unroll: 15038 MOVSS (AX)(DI*4), X1 15039 MULSS X0, X1 15040 ADDSS (DX)(R8*4), X1 15041 MOVSS X1, (DX)(R8*4) 15042 ADDQ CX, DI 15043 ADDQ BX, R8 15044 MOVSS (AX)(DI*4), X1 15045 MULSS X0, X1 15046 ADDSS (DX)(R8*4), X1 15047 MOVSS X1, (DX)(R8*4) 15048 ADDQ CX, DI 15049 ADDQ BX, R8 15050 MOVSS (AX)(DI*4), X1 15051 MULSS X0, X1 15052 ADDSS (DX)(R8*4), X1 15053 MOVSS X1, (DX)(R8*4) 15054 ADDQ CX, DI 15055 ADDQ BX, R8 15056 MOVSS (AX)(DI*4), X1 15057 MULSS X0, X1 15058 ADDSS (DX)(R8*4), X1 15059 MOVSS X1, (DX)(R8*4) 15060 ADDQ CX, DI 15061 ADDQ BX, R8 15062 MOVSS (AX)(DI*4), X1 15063 MULSS X0, X1 15064 ADDSS (DX)(R8*4), X1 15065 MOVSS X1, (DX)(R8*4) 15066 ADDQ CX, DI 15067 ADDQ BX, R8 15068 MOVSS (AX)(DI*4), X1 15069 MULSS X0, X1 15070 ADDSS (DX)(R8*4), X1 15071 MOVSS X1, (DX)(R8*4) 15072 ADDQ CX, DI 15073 ADDQ BX, R8 15074 MOVSS (AX)(DI*4), X1 15075 MULSS X0, X1 15076 ADDSS (DX)(R8*4), X1 15077 MOVSS X1, (DX)(R8*4) 15078 ADDQ CX, DI 15079 ADDQ BX, R8 15080 MOVSS (AX)(DI*4), X1 15081 MULSS X0, X1 15082 ADDSS (DX)(R8*4), X1 15083 MOVSS X1, (DX)(R8*4) 15084 ADDQ CX, DI 15085 ADDQ BX, R8 15086 SUBQ $0x08, SI 15087 15088 check_limit_unroll: 15089 CMPQ SI, $0x08 15090 JHI loop_unroll 15091 JMP check_limit 15092 15093 loop: 15094 MOVSS (AX)(DI*4), X1 15095 MULSS X0, X1 15096 ADDSS (DX)(R8*4), X1 15097 MOVSS X1, (DX)(R8*4) 15098 DECQ SI 15099 ADDQ CX, DI 15100 ADDQ BX, R8 15101 15102 check_limit: 15103 CMPQ SI, $0x00 15104 JHI loop 15105 RET 15106 15107 // func AmdAxpyUnsafeX_V0A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 15108 // Requires: SSE 15109 TEXT ·AmdAxpyUnsafeX_V0A15R8(SB), NOSPLIT, $0-48 15110 MOVSS alpha+0(FP), X0 15111 MOVQ xs+8(FP), AX 15112 MOVQ incx+16(FP), CX 15113 MOVQ ys+24(FP), DX 15114 MOVQ incy+32(FP), BX 15115 MOVQ n+40(FP), SI 15116 XORQ DI, DI 15117 XORQ R8, R8 15118 JMP check_limit_unroll 15119 PCALIGN $0x08 15120 NOP 15121 NOP 15122 NOP 15123 NOP 15124 NOP 15125 NOP 15126 NOP 15127 15128 
loop_unroll: 15129 MOVSS (AX)(DI*4), X1 15130 MULSS X0, X1 15131 ADDSS (DX)(R8*4), X1 15132 MOVSS X1, (DX)(R8*4) 15133 ADDQ CX, DI 15134 ADDQ BX, R8 15135 MOVSS (AX)(DI*4), X1 15136 MULSS X0, X1 15137 ADDSS (DX)(R8*4), X1 15138 MOVSS X1, (DX)(R8*4) 15139 ADDQ CX, DI 15140 ADDQ BX, R8 15141 MOVSS (AX)(DI*4), X1 15142 MULSS X0, X1 15143 ADDSS (DX)(R8*4), X1 15144 MOVSS X1, (DX)(R8*4) 15145 ADDQ CX, DI 15146 ADDQ BX, R8 15147 MOVSS (AX)(DI*4), X1 15148 MULSS X0, X1 15149 ADDSS (DX)(R8*4), X1 15150 MOVSS X1, (DX)(R8*4) 15151 ADDQ CX, DI 15152 ADDQ BX, R8 15153 MOVSS (AX)(DI*4), X1 15154 MULSS X0, X1 15155 ADDSS (DX)(R8*4), X1 15156 MOVSS X1, (DX)(R8*4) 15157 ADDQ CX, DI 15158 ADDQ BX, R8 15159 MOVSS (AX)(DI*4), X1 15160 MULSS X0, X1 15161 ADDSS (DX)(R8*4), X1 15162 MOVSS X1, (DX)(R8*4) 15163 ADDQ CX, DI 15164 ADDQ BX, R8 15165 MOVSS (AX)(DI*4), X1 15166 MULSS X0, X1 15167 ADDSS (DX)(R8*4), X1 15168 MOVSS X1, (DX)(R8*4) 15169 ADDQ CX, DI 15170 ADDQ BX, R8 15171 MOVSS (AX)(DI*4), X1 15172 MULSS X0, X1 15173 ADDSS (DX)(R8*4), X1 15174 MOVSS X1, (DX)(R8*4) 15175 ADDQ CX, DI 15176 ADDQ BX, R8 15177 SUBQ $0x08, SI 15178 15179 check_limit_unroll: 15180 CMPQ SI, $0x08 15181 JHI loop_unroll 15182 JMP check_limit 15183 15184 loop: 15185 MOVSS (AX)(DI*4), X1 15186 MULSS X0, X1 15187 ADDSS (DX)(R8*4), X1 15188 MOVSS X1, (DX)(R8*4) 15189 DECQ SI 15190 ADDQ CX, DI 15191 ADDQ BX, R8 15192 15193 check_limit: 15194 CMPQ SI, $0x00 15195 JHI loop 15196 RET 15197 15198 // func AmdAxpyUnsafeX_V1A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 15199 // Requires: SSE 15200 TEXT ·AmdAxpyUnsafeX_V1A15R8(SB), NOSPLIT, $0-48 15201 MOVSS alpha+0(FP), X0 15202 MOVQ xs+8(FP), AX 15203 MOVQ incx+16(FP), CX 15204 MOVQ ys+24(FP), DX 15205 MOVQ incy+32(FP), BX 15206 MOVQ n+40(FP), SI 15207 XORQ DI, DI 15208 XORQ R8, R8 15209 JMP check_limit_unroll 15210 PCALIGN $0x08 15211 NOP 15212 NOP 15213 NOP 15214 NOP 15215 NOP 15216 NOP 15217 NOP 15218 15219 loop_unroll: 15220 MOVSS (AX)(DI*4), X1 15221 MULSS X0, X1 15222 ADDSS (DX)(R8*4), X1 15223 MOVSS X1, (DX)(R8*4) 15224 ADDQ CX, DI 15225 ADDQ BX, R8 15226 MOVSS (AX)(DI*4), X1 15227 MULSS X0, X1 15228 ADDSS (DX)(R8*4), X1 15229 MOVSS X1, (DX)(R8*4) 15230 ADDQ CX, DI 15231 ADDQ BX, R8 15232 MOVSS (AX)(DI*4), X1 15233 MULSS X0, X1 15234 ADDSS (DX)(R8*4), X1 15235 MOVSS X1, (DX)(R8*4) 15236 ADDQ CX, DI 15237 ADDQ BX, R8 15238 MOVSS (AX)(DI*4), X1 15239 MULSS X0, X1 15240 ADDSS (DX)(R8*4), X1 15241 MOVSS X1, (DX)(R8*4) 15242 ADDQ CX, DI 15243 ADDQ BX, R8 15244 MOVSS (AX)(DI*4), X1 15245 MULSS X0, X1 15246 ADDSS (DX)(R8*4), X1 15247 MOVSS X1, (DX)(R8*4) 15248 ADDQ CX, DI 15249 ADDQ BX, R8 15250 MOVSS (AX)(DI*4), X1 15251 MULSS X0, X1 15252 ADDSS (DX)(R8*4), X1 15253 MOVSS X1, (DX)(R8*4) 15254 ADDQ CX, DI 15255 ADDQ BX, R8 15256 MOVSS (AX)(DI*4), X1 15257 MULSS X0, X1 15258 ADDSS (DX)(R8*4), X1 15259 MOVSS X1, (DX)(R8*4) 15260 ADDQ CX, DI 15261 ADDQ BX, R8 15262 MOVSS (AX)(DI*4), X1 15263 MULSS X0, X1 15264 ADDSS (DX)(R8*4), X1 15265 MOVSS X1, (DX)(R8*4) 15266 ADDQ CX, DI 15267 ADDQ BX, R8 15268 SUBQ $0x08, SI 15269 15270 check_limit_unroll: 15271 CMPQ SI, $0x08 15272 JHI loop_unroll 15273 JMP check_limit 15274 15275 loop: 15276 MOVSS (AX)(DI*4), X1 15277 MULSS X0, X1 15278 ADDSS (DX)(R8*4), X1 15279 MOVSS X1, (DX)(R8*4) 15280 DECQ SI 15281 ADDQ CX, DI 15282 ADDQ BX, R8 15283 15284 check_limit: 15285 CMPQ SI, $0x00 15286 JHI loop 15287 RET 15288 15289 // func AmdAxpyUnsafeX_V2A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n 
uintptr) 15290 // Requires: SSE 15291 TEXT ·AmdAxpyUnsafeX_V2A15R8(SB), NOSPLIT, $0-48 15292 MOVSS alpha+0(FP), X0 15293 MOVQ xs+8(FP), AX 15294 MOVQ incx+16(FP), CX 15295 MOVQ ys+24(FP), DX 15296 MOVQ incy+32(FP), BX 15297 MOVQ n+40(FP), SI 15298 XORQ DI, DI 15299 XORQ R8, R8 15300 JMP check_limit_unroll 15301 PCALIGN $0x08 15302 NOP 15303 NOP 15304 NOP 15305 NOP 15306 NOP 15307 NOP 15308 NOP 15309 15310 loop_unroll: 15311 MOVSS (AX)(DI*4), X1 15312 MULSS X0, X1 15313 ADDSS (DX)(R8*4), X1 15314 MOVSS X1, (DX)(R8*4) 15315 ADDQ CX, DI 15316 ADDQ BX, R8 15317 MOVSS (AX)(DI*4), X1 15318 MULSS X0, X1 15319 ADDSS (DX)(R8*4), X1 15320 MOVSS X1, (DX)(R8*4) 15321 ADDQ CX, DI 15322 ADDQ BX, R8 15323 MOVSS (AX)(DI*4), X1 15324 MULSS X0, X1 15325 ADDSS (DX)(R8*4), X1 15326 MOVSS X1, (DX)(R8*4) 15327 ADDQ CX, DI 15328 ADDQ BX, R8 15329 MOVSS (AX)(DI*4), X1 15330 MULSS X0, X1 15331 ADDSS (DX)(R8*4), X1 15332 MOVSS X1, (DX)(R8*4) 15333 ADDQ CX, DI 15334 ADDQ BX, R8 15335 MOVSS (AX)(DI*4), X1 15336 MULSS X0, X1 15337 ADDSS (DX)(R8*4), X1 15338 MOVSS X1, (DX)(R8*4) 15339 ADDQ CX, DI 15340 ADDQ BX, R8 15341 MOVSS (AX)(DI*4), X1 15342 MULSS X0, X1 15343 ADDSS (DX)(R8*4), X1 15344 MOVSS X1, (DX)(R8*4) 15345 ADDQ CX, DI 15346 ADDQ BX, R8 15347 MOVSS (AX)(DI*4), X1 15348 MULSS X0, X1 15349 ADDSS (DX)(R8*4), X1 15350 MOVSS X1, (DX)(R8*4) 15351 ADDQ CX, DI 15352 ADDQ BX, R8 15353 MOVSS (AX)(DI*4), X1 15354 MULSS X0, X1 15355 ADDSS (DX)(R8*4), X1 15356 MOVSS X1, (DX)(R8*4) 15357 ADDQ CX, DI 15358 ADDQ BX, R8 15359 SUBQ $0x08, SI 15360 15361 check_limit_unroll: 15362 CMPQ SI, $0x08 15363 JHI loop_unroll 15364 JMP check_limit 15365 15366 loop: 15367 MOVSS (AX)(DI*4), X1 15368 MULSS X0, X1 15369 ADDSS (DX)(R8*4), X1 15370 MOVSS X1, (DX)(R8*4) 15371 DECQ SI 15372 ADDQ CX, DI 15373 ADDQ BX, R8 15374 15375 check_limit: 15376 CMPQ SI, $0x00 15377 JHI loop 15378 RET 15379 15380 // func AmdAxpyUnsafeX_V3A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 15381 // Requires: SSE 15382 TEXT ·AmdAxpyUnsafeX_V3A15R8(SB), NOSPLIT, $0-48 15383 MOVSS alpha+0(FP), X0 15384 MOVQ xs+8(FP), AX 15385 MOVQ incx+16(FP), CX 15386 MOVQ ys+24(FP), DX 15387 MOVQ incy+32(FP), BX 15388 MOVQ n+40(FP), SI 15389 XORQ DI, DI 15390 XORQ R8, R8 15391 JMP check_limit_unroll 15392 PCALIGN $0x08 15393 NOP 15394 NOP 15395 NOP 15396 NOP 15397 NOP 15398 NOP 15399 NOP 15400 15401 loop_unroll: 15402 MOVSS (AX)(DI*4), X1 15403 MULSS X0, X1 15404 ADDSS (DX)(R8*4), X1 15405 MOVSS X1, (DX)(R8*4) 15406 ADDQ CX, DI 15407 ADDQ BX, R8 15408 MOVSS (AX)(DI*4), X1 15409 MULSS X0, X1 15410 ADDSS (DX)(R8*4), X1 15411 MOVSS X1, (DX)(R8*4) 15412 ADDQ CX, DI 15413 ADDQ BX, R8 15414 MOVSS (AX)(DI*4), X1 15415 MULSS X0, X1 15416 ADDSS (DX)(R8*4), X1 15417 MOVSS X1, (DX)(R8*4) 15418 ADDQ CX, DI 15419 ADDQ BX, R8 15420 MOVSS (AX)(DI*4), X1 15421 MULSS X0, X1 15422 ADDSS (DX)(R8*4), X1 15423 MOVSS X1, (DX)(R8*4) 15424 ADDQ CX, DI 15425 ADDQ BX, R8 15426 MOVSS (AX)(DI*4), X1 15427 MULSS X0, X1 15428 ADDSS (DX)(R8*4), X1 15429 MOVSS X1, (DX)(R8*4) 15430 ADDQ CX, DI 15431 ADDQ BX, R8 15432 MOVSS (AX)(DI*4), X1 15433 MULSS X0, X1 15434 ADDSS (DX)(R8*4), X1 15435 MOVSS X1, (DX)(R8*4) 15436 ADDQ CX, DI 15437 ADDQ BX, R8 15438 MOVSS (AX)(DI*4), X1 15439 MULSS X0, X1 15440 ADDSS (DX)(R8*4), X1 15441 MOVSS X1, (DX)(R8*4) 15442 ADDQ CX, DI 15443 ADDQ BX, R8 15444 MOVSS (AX)(DI*4), X1 15445 MULSS X0, X1 15446 ADDSS (DX)(R8*4), X1 15447 MOVSS X1, (DX)(R8*4) 15448 ADDQ CX, DI 15449 ADDQ BX, R8 15450 SUBQ $0x08, SI 15451 15452 check_limit_unroll: 15453 CMPQ 
SI, $0x08 15454 JHI loop_unroll 15455 JMP check_limit 15456 15457 loop: 15458 MOVSS (AX)(DI*4), X1 15459 MULSS X0, X1 15460 ADDSS (DX)(R8*4), X1 15461 MOVSS X1, (DX)(R8*4) 15462 DECQ SI 15463 ADDQ CX, DI 15464 ADDQ BX, R8 15465 15466 check_limit: 15467 CMPQ SI, $0x00 15468 JHI loop 15469 RET 15470 15471 // func AmdAxpyUnsafeX_V4A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 15472 // Requires: SSE 15473 TEXT ·AmdAxpyUnsafeX_V4A15R8(SB), NOSPLIT, $0-48 15474 MOVSS alpha+0(FP), X0 15475 MOVQ xs+8(FP), AX 15476 MOVQ incx+16(FP), CX 15477 MOVQ ys+24(FP), DX 15478 MOVQ incy+32(FP), BX 15479 MOVQ n+40(FP), SI 15480 XORQ DI, DI 15481 XORQ R8, R8 15482 JMP check_limit_unroll 15483 PCALIGN $0x08 15484 NOP 15485 NOP 15486 NOP 15487 NOP 15488 NOP 15489 NOP 15490 NOP 15491 15492 loop_unroll: 15493 MOVSS (AX)(DI*4), X1 15494 MULSS X0, X1 15495 ADDSS (DX)(R8*4), X1 15496 MOVSS X1, (DX)(R8*4) 15497 ADDQ CX, DI 15498 ADDQ BX, R8 15499 MOVSS (AX)(DI*4), X1 15500 MULSS X0, X1 15501 ADDSS (DX)(R8*4), X1 15502 MOVSS X1, (DX)(R8*4) 15503 ADDQ CX, DI 15504 ADDQ BX, R8 15505 MOVSS (AX)(DI*4), X1 15506 MULSS X0, X1 15507 ADDSS (DX)(R8*4), X1 15508 MOVSS X1, (DX)(R8*4) 15509 ADDQ CX, DI 15510 ADDQ BX, R8 15511 MOVSS (AX)(DI*4), X1 15512 MULSS X0, X1 15513 ADDSS (DX)(R8*4), X1 15514 MOVSS X1, (DX)(R8*4) 15515 ADDQ CX, DI 15516 ADDQ BX, R8 15517 MOVSS (AX)(DI*4), X1 15518 MULSS X0, X1 15519 ADDSS (DX)(R8*4), X1 15520 MOVSS X1, (DX)(R8*4) 15521 ADDQ CX, DI 15522 ADDQ BX, R8 15523 MOVSS (AX)(DI*4), X1 15524 MULSS X0, X1 15525 ADDSS (DX)(R8*4), X1 15526 MOVSS X1, (DX)(R8*4) 15527 ADDQ CX, DI 15528 ADDQ BX, R8 15529 MOVSS (AX)(DI*4), X1 15530 MULSS X0, X1 15531 ADDSS (DX)(R8*4), X1 15532 MOVSS X1, (DX)(R8*4) 15533 ADDQ CX, DI 15534 ADDQ BX, R8 15535 MOVSS (AX)(DI*4), X1 15536 MULSS X0, X1 15537 ADDSS (DX)(R8*4), X1 15538 MOVSS X1, (DX)(R8*4) 15539 ADDQ CX, DI 15540 ADDQ BX, R8 15541 SUBQ $0x08, SI 15542 15543 check_limit_unroll: 15544 CMPQ SI, $0x08 15545 JHI loop_unroll 15546 JMP check_limit 15547 15548 loop: 15549 MOVSS (AX)(DI*4), X1 15550 MULSS X0, X1 15551 ADDSS (DX)(R8*4), X1 15552 MOVSS X1, (DX)(R8*4) 15553 DECQ SI 15554 ADDQ CX, DI 15555 ADDQ BX, R8 15556 15557 check_limit: 15558 CMPQ SI, $0x00 15559 JHI loop 15560 RET 15561 15562 // func AmdAxpyUnsafeX_V5A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 15563 // Requires: SSE 15564 TEXT ·AmdAxpyUnsafeX_V5A15R8(SB), NOSPLIT, $0-48 15565 MOVSS alpha+0(FP), X0 15566 MOVQ xs+8(FP), AX 15567 MOVQ incx+16(FP), CX 15568 MOVQ ys+24(FP), DX 15569 MOVQ incy+32(FP), BX 15570 MOVQ n+40(FP), SI 15571 XORQ DI, DI 15572 XORQ R8, R8 15573 JMP check_limit_unroll 15574 PCALIGN $0x08 15575 NOP 15576 NOP 15577 NOP 15578 NOP 15579 NOP 15580 NOP 15581 NOP 15582 15583 loop_unroll: 15584 MOVSS (AX)(DI*4), X1 15585 MULSS X0, X1 15586 ADDSS (DX)(R8*4), X1 15587 MOVSS X1, (DX)(R8*4) 15588 ADDQ CX, DI 15589 ADDQ BX, R8 15590 MOVSS (AX)(DI*4), X1 15591 MULSS X0, X1 15592 ADDSS (DX)(R8*4), X1 15593 MOVSS X1, (DX)(R8*4) 15594 ADDQ CX, DI 15595 ADDQ BX, R8 15596 MOVSS (AX)(DI*4), X1 15597 MULSS X0, X1 15598 ADDSS (DX)(R8*4), X1 15599 MOVSS X1, (DX)(R8*4) 15600 ADDQ CX, DI 15601 ADDQ BX, R8 15602 MOVSS (AX)(DI*4), X1 15603 MULSS X0, X1 15604 ADDSS (DX)(R8*4), X1 15605 MOVSS X1, (DX)(R8*4) 15606 ADDQ CX, DI 15607 ADDQ BX, R8 15608 MOVSS (AX)(DI*4), X1 15609 MULSS X0, X1 15610 ADDSS (DX)(R8*4), X1 15611 MOVSS X1, (DX)(R8*4) 15612 ADDQ CX, DI 15613 ADDQ BX, R8 15614 MOVSS (AX)(DI*4), X1 15615 MULSS X0, X1 15616 ADDSS 
(DX)(R8*4), X1 15617 MOVSS X1, (DX)(R8*4) 15618 ADDQ CX, DI 15619 ADDQ BX, R8 15620 MOVSS (AX)(DI*4), X1 15621 MULSS X0, X1 15622 ADDSS (DX)(R8*4), X1 15623 MOVSS X1, (DX)(R8*4) 15624 ADDQ CX, DI 15625 ADDQ BX, R8 15626 MOVSS (AX)(DI*4), X1 15627 MULSS X0, X1 15628 ADDSS (DX)(R8*4), X1 15629 MOVSS X1, (DX)(R8*4) 15630 ADDQ CX, DI 15631 ADDQ BX, R8 15632 SUBQ $0x08, SI 15633 15634 check_limit_unroll: 15635 CMPQ SI, $0x08 15636 JHI loop_unroll 15637 JMP check_limit 15638 15639 loop: 15640 MOVSS (AX)(DI*4), X1 15641 MULSS X0, X1 15642 ADDSS (DX)(R8*4), X1 15643 MOVSS X1, (DX)(R8*4) 15644 DECQ SI 15645 ADDQ CX, DI 15646 ADDQ BX, R8 15647 15648 check_limit: 15649 CMPQ SI, $0x00 15650 JHI loop 15651 RET 15652 15653 // func AmdAxpyUnsafeX_V0A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 15654 // Requires: SSE 15655 TEXT ·AmdAxpyUnsafeX_V0A16R8(SB), NOSPLIT, $0-48 15656 MOVSS alpha+0(FP), X0 15657 MOVQ xs+8(FP), AX 15658 MOVQ incx+16(FP), CX 15659 MOVQ ys+24(FP), DX 15660 MOVQ incy+32(FP), BX 15661 MOVQ n+40(FP), SI 15662 XORQ DI, DI 15663 XORQ R8, R8 15664 JMP check_limit_unroll 15665 PCALIGN $0x10 15666 15667 loop_unroll: 15668 MOVSS (AX)(DI*4), X1 15669 MULSS X0, X1 15670 ADDSS (DX)(R8*4), X1 15671 MOVSS X1, (DX)(R8*4) 15672 ADDQ CX, DI 15673 ADDQ BX, R8 15674 MOVSS (AX)(DI*4), X1 15675 MULSS X0, X1 15676 ADDSS (DX)(R8*4), X1 15677 MOVSS X1, (DX)(R8*4) 15678 ADDQ CX, DI 15679 ADDQ BX, R8 15680 MOVSS (AX)(DI*4), X1 15681 MULSS X0, X1 15682 ADDSS (DX)(R8*4), X1 15683 MOVSS X1, (DX)(R8*4) 15684 ADDQ CX, DI 15685 ADDQ BX, R8 15686 MOVSS (AX)(DI*4), X1 15687 MULSS X0, X1 15688 ADDSS (DX)(R8*4), X1 15689 MOVSS X1, (DX)(R8*4) 15690 ADDQ CX, DI 15691 ADDQ BX, R8 15692 MOVSS (AX)(DI*4), X1 15693 MULSS X0, X1 15694 ADDSS (DX)(R8*4), X1 15695 MOVSS X1, (DX)(R8*4) 15696 ADDQ CX, DI 15697 ADDQ BX, R8 15698 MOVSS (AX)(DI*4), X1 15699 MULSS X0, X1 15700 ADDSS (DX)(R8*4), X1 15701 MOVSS X1, (DX)(R8*4) 15702 ADDQ CX, DI 15703 ADDQ BX, R8 15704 MOVSS (AX)(DI*4), X1 15705 MULSS X0, X1 15706 ADDSS (DX)(R8*4), X1 15707 MOVSS X1, (DX)(R8*4) 15708 ADDQ CX, DI 15709 ADDQ BX, R8 15710 MOVSS (AX)(DI*4), X1 15711 MULSS X0, X1 15712 ADDSS (DX)(R8*4), X1 15713 MOVSS X1, (DX)(R8*4) 15714 ADDQ CX, DI 15715 ADDQ BX, R8 15716 SUBQ $0x08, SI 15717 15718 check_limit_unroll: 15719 CMPQ SI, $0x08 15720 JHI loop_unroll 15721 JMP check_limit 15722 15723 loop: 15724 MOVSS (AX)(DI*4), X1 15725 MULSS X0, X1 15726 ADDSS (DX)(R8*4), X1 15727 MOVSS X1, (DX)(R8*4) 15728 DECQ SI 15729 ADDQ CX, DI 15730 ADDQ BX, R8 15731 15732 check_limit: 15733 CMPQ SI, $0x00 15734 JHI loop 15735 RET 15736 15737 // func AmdAxpyUnsafeX_V1A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 15738 // Requires: SSE 15739 TEXT ·AmdAxpyUnsafeX_V1A16R8(SB), NOSPLIT, $0-48 15740 MOVSS alpha+0(FP), X0 15741 MOVQ xs+8(FP), AX 15742 MOVQ incx+16(FP), CX 15743 MOVQ ys+24(FP), DX 15744 MOVQ incy+32(FP), BX 15745 MOVQ n+40(FP), SI 15746 XORQ DI, DI 15747 XORQ R8, R8 15748 JMP check_limit_unroll 15749 PCALIGN $0x10 15750 15751 loop_unroll: 15752 MOVSS (AX)(DI*4), X1 15753 MULSS X0, X1 15754 ADDSS (DX)(R8*4), X1 15755 MOVSS X1, (DX)(R8*4) 15756 ADDQ CX, DI 15757 ADDQ BX, R8 15758 MOVSS (AX)(DI*4), X1 15759 MULSS X0, X1 15760 ADDSS (DX)(R8*4), X1 15761 MOVSS X1, (DX)(R8*4) 15762 ADDQ CX, DI 15763 ADDQ BX, R8 15764 MOVSS (AX)(DI*4), X1 15765 MULSS X0, X1 15766 ADDSS (DX)(R8*4), X1 15767 MOVSS X1, (DX)(R8*4) 15768 ADDQ CX, DI 15769 ADDQ BX, R8 15770 MOVSS (AX)(DI*4), X1 15771 MULSS X0, X1 15772 ADDSS 
(DX)(R8*4), X1 15773 MOVSS X1, (DX)(R8*4) 15774 ADDQ CX, DI 15775 ADDQ BX, R8 15776 MOVSS (AX)(DI*4), X1 15777 MULSS X0, X1 15778 ADDSS (DX)(R8*4), X1 15779 MOVSS X1, (DX)(R8*4) 15780 ADDQ CX, DI 15781 ADDQ BX, R8 15782 MOVSS (AX)(DI*4), X1 15783 MULSS X0, X1 15784 ADDSS (DX)(R8*4), X1 15785 MOVSS X1, (DX)(R8*4) 15786 ADDQ CX, DI 15787 ADDQ BX, R8 15788 MOVSS (AX)(DI*4), X1 15789 MULSS X0, X1 15790 ADDSS (DX)(R8*4), X1 15791 MOVSS X1, (DX)(R8*4) 15792 ADDQ CX, DI 15793 ADDQ BX, R8 15794 MOVSS (AX)(DI*4), X1 15795 MULSS X0, X1 15796 ADDSS (DX)(R8*4), X1 15797 MOVSS X1, (DX)(R8*4) 15798 ADDQ CX, DI 15799 ADDQ BX, R8 15800 SUBQ $0x08, SI 15801 15802 check_limit_unroll: 15803 CMPQ SI, $0x08 15804 JHI loop_unroll 15805 JMP check_limit 15806 15807 loop: 15808 MOVSS (AX)(DI*4), X1 15809 MULSS X0, X1 15810 ADDSS (DX)(R8*4), X1 15811 MOVSS X1, (DX)(R8*4) 15812 DECQ SI 15813 ADDQ CX, DI 15814 ADDQ BX, R8 15815 15816 check_limit: 15817 CMPQ SI, $0x00 15818 JHI loop 15819 RET 15820 15821 // func AmdAxpyUnsafeX_V2A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 15822 // Requires: SSE 15823 TEXT ·AmdAxpyUnsafeX_V2A16R8(SB), NOSPLIT, $0-48 15824 MOVSS alpha+0(FP), X0 15825 MOVQ xs+8(FP), AX 15826 MOVQ incx+16(FP), CX 15827 MOVQ ys+24(FP), DX 15828 MOVQ incy+32(FP), BX 15829 MOVQ n+40(FP), SI 15830 XORQ DI, DI 15831 XORQ R8, R8 15832 JMP check_limit_unroll 15833 PCALIGN $0x10 15834 15835 loop_unroll: 15836 MOVSS (AX)(DI*4), X1 15837 MULSS X0, X1 15838 ADDSS (DX)(R8*4), X1 15839 MOVSS X1, (DX)(R8*4) 15840 ADDQ CX, DI 15841 ADDQ BX, R8 15842 MOVSS (AX)(DI*4), X1 15843 MULSS X0, X1 15844 ADDSS (DX)(R8*4), X1 15845 MOVSS X1, (DX)(R8*4) 15846 ADDQ CX, DI 15847 ADDQ BX, R8 15848 MOVSS (AX)(DI*4), X1 15849 MULSS X0, X1 15850 ADDSS (DX)(R8*4), X1 15851 MOVSS X1, (DX)(R8*4) 15852 ADDQ CX, DI 15853 ADDQ BX, R8 15854 MOVSS (AX)(DI*4), X1 15855 MULSS X0, X1 15856 ADDSS (DX)(R8*4), X1 15857 MOVSS X1, (DX)(R8*4) 15858 ADDQ CX, DI 15859 ADDQ BX, R8 15860 MOVSS (AX)(DI*4), X1 15861 MULSS X0, X1 15862 ADDSS (DX)(R8*4), X1 15863 MOVSS X1, (DX)(R8*4) 15864 ADDQ CX, DI 15865 ADDQ BX, R8 15866 MOVSS (AX)(DI*4), X1 15867 MULSS X0, X1 15868 ADDSS (DX)(R8*4), X1 15869 MOVSS X1, (DX)(R8*4) 15870 ADDQ CX, DI 15871 ADDQ BX, R8 15872 MOVSS (AX)(DI*4), X1 15873 MULSS X0, X1 15874 ADDSS (DX)(R8*4), X1 15875 MOVSS X1, (DX)(R8*4) 15876 ADDQ CX, DI 15877 ADDQ BX, R8 15878 MOVSS (AX)(DI*4), X1 15879 MULSS X0, X1 15880 ADDSS (DX)(R8*4), X1 15881 MOVSS X1, (DX)(R8*4) 15882 ADDQ CX, DI 15883 ADDQ BX, R8 15884 SUBQ $0x08, SI 15885 15886 check_limit_unroll: 15887 CMPQ SI, $0x08 15888 JHI loop_unroll 15889 JMP check_limit 15890 15891 loop: 15892 MOVSS (AX)(DI*4), X1 15893 MULSS X0, X1 15894 ADDSS (DX)(R8*4), X1 15895 MOVSS X1, (DX)(R8*4) 15896 DECQ SI 15897 ADDQ CX, DI 15898 ADDQ BX, R8 15899 15900 check_limit: 15901 CMPQ SI, $0x00 15902 JHI loop 15903 RET 15904 15905 // func AmdAxpyUnsafeX_V3A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 15906 // Requires: SSE 15907 TEXT ·AmdAxpyUnsafeX_V3A16R8(SB), NOSPLIT, $0-48 15908 MOVSS alpha+0(FP), X0 15909 MOVQ xs+8(FP), AX 15910 MOVQ incx+16(FP), CX 15911 MOVQ ys+24(FP), DX 15912 MOVQ incy+32(FP), BX 15913 MOVQ n+40(FP), SI 15914 XORQ DI, DI 15915 XORQ R8, R8 15916 JMP check_limit_unroll 15917 PCALIGN $0x10 15918 15919 loop_unroll: 15920 MOVSS (AX)(DI*4), X1 15921 MULSS X0, X1 15922 ADDSS (DX)(R8*4), X1 15923 MOVSS X1, (DX)(R8*4) 15924 ADDQ CX, DI 15925 ADDQ BX, R8 15926 MOVSS (AX)(DI*4), X1 15927 MULSS X0, X1 15928 ADDSS 
(DX)(R8*4), X1 15929 MOVSS X1, (DX)(R8*4) 15930 ADDQ CX, DI 15931 ADDQ BX, R8 15932 MOVSS (AX)(DI*4), X1 15933 MULSS X0, X1 15934 ADDSS (DX)(R8*4), X1 15935 MOVSS X1, (DX)(R8*4) 15936 ADDQ CX, DI 15937 ADDQ BX, R8 15938 MOVSS (AX)(DI*4), X1 15939 MULSS X0, X1 15940 ADDSS (DX)(R8*4), X1 15941 MOVSS X1, (DX)(R8*4) 15942 ADDQ CX, DI 15943 ADDQ BX, R8 15944 MOVSS (AX)(DI*4), X1 15945 MULSS X0, X1 15946 ADDSS (DX)(R8*4), X1 15947 MOVSS X1, (DX)(R8*4) 15948 ADDQ CX, DI 15949 ADDQ BX, R8 15950 MOVSS (AX)(DI*4), X1 15951 MULSS X0, X1 15952 ADDSS (DX)(R8*4), X1 15953 MOVSS X1, (DX)(R8*4) 15954 ADDQ CX, DI 15955 ADDQ BX, R8 15956 MOVSS (AX)(DI*4), X1 15957 MULSS X0, X1 15958 ADDSS (DX)(R8*4), X1 15959 MOVSS X1, (DX)(R8*4) 15960 ADDQ CX, DI 15961 ADDQ BX, R8 15962 MOVSS (AX)(DI*4), X1 15963 MULSS X0, X1 15964 ADDSS (DX)(R8*4), X1 15965 MOVSS X1, (DX)(R8*4) 15966 ADDQ CX, DI 15967 ADDQ BX, R8 15968 SUBQ $0x08, SI 15969 15970 check_limit_unroll: 15971 CMPQ SI, $0x08 15972 JHI loop_unroll 15973 JMP check_limit 15974 15975 loop: 15976 MOVSS (AX)(DI*4), X1 15977 MULSS X0, X1 15978 ADDSS (DX)(R8*4), X1 15979 MOVSS X1, (DX)(R8*4) 15980 DECQ SI 15981 ADDQ CX, DI 15982 ADDQ BX, R8 15983 15984 check_limit: 15985 CMPQ SI, $0x00 15986 JHI loop 15987 RET 15988 15989 // func AmdAxpyUnsafeX_V4A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 15990 // Requires: SSE 15991 TEXT ·AmdAxpyUnsafeX_V4A16R8(SB), NOSPLIT, $0-48 15992 MOVSS alpha+0(FP), X0 15993 MOVQ xs+8(FP), AX 15994 MOVQ incx+16(FP), CX 15995 MOVQ ys+24(FP), DX 15996 MOVQ incy+32(FP), BX 15997 MOVQ n+40(FP), SI 15998 XORQ DI, DI 15999 XORQ R8, R8 16000 JMP check_limit_unroll 16001 PCALIGN $0x10 16002 16003 loop_unroll: 16004 MOVSS (AX)(DI*4), X1 16005 MULSS X0, X1 16006 ADDSS (DX)(R8*4), X1 16007 MOVSS X1, (DX)(R8*4) 16008 ADDQ CX, DI 16009 ADDQ BX, R8 16010 MOVSS (AX)(DI*4), X1 16011 MULSS X0, X1 16012 ADDSS (DX)(R8*4), X1 16013 MOVSS X1, (DX)(R8*4) 16014 ADDQ CX, DI 16015 ADDQ BX, R8 16016 MOVSS (AX)(DI*4), X1 16017 MULSS X0, X1 16018 ADDSS (DX)(R8*4), X1 16019 MOVSS X1, (DX)(R8*4) 16020 ADDQ CX, DI 16021 ADDQ BX, R8 16022 MOVSS (AX)(DI*4), X1 16023 MULSS X0, X1 16024 ADDSS (DX)(R8*4), X1 16025 MOVSS X1, (DX)(R8*4) 16026 ADDQ CX, DI 16027 ADDQ BX, R8 16028 MOVSS (AX)(DI*4), X1 16029 MULSS X0, X1 16030 ADDSS (DX)(R8*4), X1 16031 MOVSS X1, (DX)(R8*4) 16032 ADDQ CX, DI 16033 ADDQ BX, R8 16034 MOVSS (AX)(DI*4), X1 16035 MULSS X0, X1 16036 ADDSS (DX)(R8*4), X1 16037 MOVSS X1, (DX)(R8*4) 16038 ADDQ CX, DI 16039 ADDQ BX, R8 16040 MOVSS (AX)(DI*4), X1 16041 MULSS X0, X1 16042 ADDSS (DX)(R8*4), X1 16043 MOVSS X1, (DX)(R8*4) 16044 ADDQ CX, DI 16045 ADDQ BX, R8 16046 MOVSS (AX)(DI*4), X1 16047 MULSS X0, X1 16048 ADDSS (DX)(R8*4), X1 16049 MOVSS X1, (DX)(R8*4) 16050 ADDQ CX, DI 16051 ADDQ BX, R8 16052 SUBQ $0x08, SI 16053 16054 check_limit_unroll: 16055 CMPQ SI, $0x08 16056 JHI loop_unroll 16057 JMP check_limit 16058 16059 loop: 16060 MOVSS (AX)(DI*4), X1 16061 MULSS X0, X1 16062 ADDSS (DX)(R8*4), X1 16063 MOVSS X1, (DX)(R8*4) 16064 DECQ SI 16065 ADDQ CX, DI 16066 ADDQ BX, R8 16067 16068 check_limit: 16069 CMPQ SI, $0x00 16070 JHI loop 16071 RET 16072 16073 // func AmdAxpyUnsafeX_V5A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 16074 // Requires: SSE 16075 TEXT ·AmdAxpyUnsafeX_V5A16R8(SB), NOSPLIT, $0-48 16076 MOVSS alpha+0(FP), X0 16077 MOVQ xs+8(FP), AX 16078 MOVQ incx+16(FP), CX 16079 MOVQ ys+24(FP), DX 16080 MOVQ incy+32(FP), BX 16081 MOVQ n+40(FP), SI 16082 XORQ DI, DI 16083 XORQ R8, R8 
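// The unrolled loop runs while more than eight elements remain; the trailing scalar
// loop then finishes the last one to eight elements (or all of them when n <= 8).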
16084 JMP check_limit_unroll 16085 PCALIGN $0x10 16086 16087 loop_unroll: 16088 MOVSS (AX)(DI*4), X1 16089 MULSS X0, X1 16090 ADDSS (DX)(R8*4), X1 16091 MOVSS X1, (DX)(R8*4) 16092 ADDQ CX, DI 16093 ADDQ BX, R8 16094 MOVSS (AX)(DI*4), X1 16095 MULSS X0, X1 16096 ADDSS (DX)(R8*4), X1 16097 MOVSS X1, (DX)(R8*4) 16098 ADDQ CX, DI 16099 ADDQ BX, R8 16100 MOVSS (AX)(DI*4), X1 16101 MULSS X0, X1 16102 ADDSS (DX)(R8*4), X1 16103 MOVSS X1, (DX)(R8*4) 16104 ADDQ CX, DI 16105 ADDQ BX, R8 16106 MOVSS (AX)(DI*4), X1 16107 MULSS X0, X1 16108 ADDSS (DX)(R8*4), X1 16109 MOVSS X1, (DX)(R8*4) 16110 ADDQ CX, DI 16111 ADDQ BX, R8 16112 MOVSS (AX)(DI*4), X1 16113 MULSS X0, X1 16114 ADDSS (DX)(R8*4), X1 16115 MOVSS X1, (DX)(R8*4) 16116 ADDQ CX, DI 16117 ADDQ BX, R8 16118 MOVSS (AX)(DI*4), X1 16119 MULSS X0, X1 16120 ADDSS (DX)(R8*4), X1 16121 MOVSS X1, (DX)(R8*4) 16122 ADDQ CX, DI 16123 ADDQ BX, R8 16124 MOVSS (AX)(DI*4), X1 16125 MULSS X0, X1 16126 ADDSS (DX)(R8*4), X1 16127 MOVSS X1, (DX)(R8*4) 16128 ADDQ CX, DI 16129 ADDQ BX, R8 16130 MOVSS (AX)(DI*4), X1 16131 MULSS X0, X1 16132 ADDSS (DX)(R8*4), X1 16133 MOVSS X1, (DX)(R8*4) 16134 ADDQ CX, DI 16135 ADDQ BX, R8 16136 SUBQ $0x08, SI 16137 16138 check_limit_unroll: 16139 CMPQ SI, $0x08 16140 JHI loop_unroll 16141 JMP check_limit 16142 16143 loop: 16144 MOVSS (AX)(DI*4), X1 16145 MULSS X0, X1 16146 ADDSS (DX)(R8*4), X1 16147 MOVSS X1, (DX)(R8*4) 16148 DECQ SI 16149 ADDQ CX, DI 16150 ADDQ BX, R8 16151 16152 check_limit: 16153 CMPQ SI, $0x00 16154 JHI loop 16155 RET 16156 16157 // func AmdAxpyUnsafeXInterleave_V0A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 16158 // Requires: SSE 16159 TEXT ·AmdAxpyUnsafeXInterleave_V0A0R4(SB), NOSPLIT, $0-48 16160 MOVSS alpha+0(FP), X0 16161 MOVQ xs+8(FP), AX 16162 MOVQ incx+16(FP), CX 16163 MOVQ ys+24(FP), DX 16164 MOVQ incy+32(FP), BX 16165 MOVQ n+40(FP), SI 16166 XORQ DI, DI 16167 XORQ R8, R8 16168 JMP check_limit_unroll 16169 16170 loop_unroll: 16171 MOVSS (AX)(DI*4), X1 16172 ADDQ CX, DI 16173 MOVSS (AX)(DI*4), X2 16174 ADDQ CX, DI 16175 MOVSS (AX)(DI*4), X3 16176 ADDQ CX, DI 16177 MOVSS (AX)(DI*4), X4 16178 ADDQ CX, DI 16179 MULSS X0, X1 16180 MULSS X0, X2 16181 MULSS X0, X3 16182 MULSS X0, X4 16183 ADDSS (DX)(R8*4), X1 16184 MOVSS X1, (DX)(R8*4) 16185 ADDQ BX, R8 16186 ADDSS (DX)(R8*4), X2 16187 MOVSS X2, (DX)(R8*4) 16188 ADDQ BX, R8 16189 ADDSS (DX)(R8*4), X3 16190 MOVSS X3, (DX)(R8*4) 16191 ADDQ BX, R8 16192 ADDSS (DX)(R8*4), X4 16193 MOVSS X4, (DX)(R8*4) 16194 ADDQ BX, R8 16195 SUBQ $0x04, SI 16196 16197 check_limit_unroll: 16198 CMPQ SI, $0x04 16199 JHS loop_unroll 16200 JMP check_limit 16201 16202 loop: 16203 MOVSS (AX)(DI*4), X1 16204 MULSS X0, X1 16205 ADDSS (DX)(R8*4), X1 16206 MOVSS X1, (DX)(R8*4) 16207 DECQ SI 16208 ADDQ CX, DI 16209 ADDQ BX, R8 16210 16211 check_limit: 16212 CMPQ SI, $0x00 16213 JHI loop 16214 RET 16215 16216 // func AmdAxpyUnsafeXInterleave_V1A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 16217 // Requires: SSE 16218 TEXT ·AmdAxpyUnsafeXInterleave_V1A0R4(SB), NOSPLIT, $0-48 16219 MOVSS alpha+0(FP), X0 16220 MOVQ xs+8(FP), AX 16221 MOVQ incx+16(FP), CX 16222 MOVQ ys+24(FP), DX 16223 MOVQ incy+32(FP), BX 16224 MOVQ n+40(FP), SI 16225 XORQ DI, DI 16226 XORQ R8, R8 16227 JMP check_limit_unroll 16228 16229 loop_unroll: 16230 MOVSS (AX)(DI*4), X1 16231 ADDQ CX, DI 16232 MOVSS (AX)(DI*4), X2 16233 ADDQ CX, DI 16234 MOVSS (AX)(DI*4), X3 16235 ADDQ CX, DI 16236 MOVSS (AX)(DI*4), X4 16237 ADDQ CX, DI 16238 MULSS X0, X1 16239 
MULSS X0, X2 16240 MULSS X0, X3 16241 MULSS X0, X4 16242 ADDSS (DX)(R8*4), X1 16243 MOVSS X1, (DX)(R8*4) 16244 ADDQ BX, R8 16245 ADDSS (DX)(R8*4), X2 16246 MOVSS X2, (DX)(R8*4) 16247 ADDQ BX, R8 16248 ADDSS (DX)(R8*4), X3 16249 MOVSS X3, (DX)(R8*4) 16250 ADDQ BX, R8 16251 ADDSS (DX)(R8*4), X4 16252 MOVSS X4, (DX)(R8*4) 16253 ADDQ BX, R8 16254 SUBQ $0x04, SI 16255 16256 check_limit_unroll: 16257 CMPQ SI, $0x04 16258 JHS loop_unroll 16259 JMP check_limit 16260 16261 loop: 16262 MOVSS (AX)(DI*4), X1 16263 MULSS X0, X1 16264 ADDSS (DX)(R8*4), X1 16265 MOVSS X1, (DX)(R8*4) 16266 DECQ SI 16267 ADDQ CX, DI 16268 ADDQ BX, R8 16269 16270 check_limit: 16271 CMPQ SI, $0x00 16272 JHI loop 16273 RET 16274 16275 // func AmdAxpyUnsafeXInterleave_V2A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 16276 // Requires: SSE 16277 TEXT ·AmdAxpyUnsafeXInterleave_V2A0R4(SB), NOSPLIT, $0-48 16278 MOVSS alpha+0(FP), X0 16279 MOVQ xs+8(FP), AX 16280 MOVQ incx+16(FP), CX 16281 MOVQ ys+24(FP), DX 16282 MOVQ incy+32(FP), BX 16283 MOVQ n+40(FP), SI 16284 XORQ DI, DI 16285 XORQ R8, R8 16286 JMP check_limit_unroll 16287 16288 loop_unroll: 16289 MOVSS (AX)(DI*4), X1 16290 ADDQ CX, DI 16291 MOVSS (AX)(DI*4), X2 16292 ADDQ CX, DI 16293 MOVSS (AX)(DI*4), X3 16294 ADDQ CX, DI 16295 MOVSS (AX)(DI*4), X4 16296 ADDQ CX, DI 16297 MULSS X0, X1 16298 MULSS X0, X2 16299 MULSS X0, X3 16300 MULSS X0, X4 16301 ADDSS (DX)(R8*4), X1 16302 MOVSS X1, (DX)(R8*4) 16303 ADDQ BX, R8 16304 ADDSS (DX)(R8*4), X2 16305 MOVSS X2, (DX)(R8*4) 16306 ADDQ BX, R8 16307 ADDSS (DX)(R8*4), X3 16308 MOVSS X3, (DX)(R8*4) 16309 ADDQ BX, R8 16310 ADDSS (DX)(R8*4), X4 16311 MOVSS X4, (DX)(R8*4) 16312 ADDQ BX, R8 16313 SUBQ $0x04, SI 16314 16315 check_limit_unroll: 16316 CMPQ SI, $0x04 16317 JHS loop_unroll 16318 JMP check_limit 16319 16320 loop: 16321 MOVSS (AX)(DI*4), X1 16322 MULSS X0, X1 16323 ADDSS (DX)(R8*4), X1 16324 MOVSS X1, (DX)(R8*4) 16325 DECQ SI 16326 ADDQ CX, DI 16327 ADDQ BX, R8 16328 16329 check_limit: 16330 CMPQ SI, $0x00 16331 JHI loop 16332 RET 16333 16334 // func AmdAxpyUnsafeXInterleave_V3A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 16335 // Requires: SSE 16336 TEXT ·AmdAxpyUnsafeXInterleave_V3A0R4(SB), NOSPLIT, $0-48 16337 MOVSS alpha+0(FP), X0 16338 MOVQ xs+8(FP), AX 16339 MOVQ incx+16(FP), CX 16340 MOVQ ys+24(FP), DX 16341 MOVQ incy+32(FP), BX 16342 MOVQ n+40(FP), SI 16343 XORQ DI, DI 16344 XORQ R8, R8 16345 JMP check_limit_unroll 16346 16347 loop_unroll: 16348 MOVSS (AX)(DI*4), X1 16349 ADDQ CX, DI 16350 MOVSS (AX)(DI*4), X2 16351 ADDQ CX, DI 16352 MOVSS (AX)(DI*4), X3 16353 ADDQ CX, DI 16354 MOVSS (AX)(DI*4), X4 16355 ADDQ CX, DI 16356 MULSS X0, X1 16357 MULSS X0, X2 16358 MULSS X0, X3 16359 MULSS X0, X4 16360 ADDSS (DX)(R8*4), X1 16361 MOVSS X1, (DX)(R8*4) 16362 ADDQ BX, R8 16363 ADDSS (DX)(R8*4), X2 16364 MOVSS X2, (DX)(R8*4) 16365 ADDQ BX, R8 16366 ADDSS (DX)(R8*4), X3 16367 MOVSS X3, (DX)(R8*4) 16368 ADDQ BX, R8 16369 ADDSS (DX)(R8*4), X4 16370 MOVSS X4, (DX)(R8*4) 16371 ADDQ BX, R8 16372 SUBQ $0x04, SI 16373 16374 check_limit_unroll: 16375 CMPQ SI, $0x04 16376 JHS loop_unroll 16377 JMP check_limit 16378 16379 loop: 16380 MOVSS (AX)(DI*4), X1 16381 MULSS X0, X1 16382 ADDSS (DX)(R8*4), X1 16383 MOVSS X1, (DX)(R8*4) 16384 DECQ SI 16385 ADDQ CX, DI 16386 ADDQ BX, R8 16387 16388 check_limit: 16389 CMPQ SI, $0x00 16390 JHI loop 16391 RET 16392 16393 // func AmdAxpyUnsafeXInterleave_V4A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n 
uintptr) 16394 // Requires: SSE 16395 TEXT ·AmdAxpyUnsafeXInterleave_V4A0R4(SB), NOSPLIT, $0-48 16396 MOVSS alpha+0(FP), X0 16397 MOVQ xs+8(FP), AX 16398 MOVQ incx+16(FP), CX 16399 MOVQ ys+24(FP), DX 16400 MOVQ incy+32(FP), BX 16401 MOVQ n+40(FP), SI 16402 XORQ DI, DI 16403 XORQ R8, R8 16404 JMP check_limit_unroll 16405 16406 loop_unroll: 16407 MOVSS (AX)(DI*4), X1 16408 ADDQ CX, DI 16409 MOVSS (AX)(DI*4), X2 16410 ADDQ CX, DI 16411 MOVSS (AX)(DI*4), X3 16412 ADDQ CX, DI 16413 MOVSS (AX)(DI*4), X4 16414 ADDQ CX, DI 16415 MULSS X0, X1 16416 MULSS X0, X2 16417 MULSS X0, X3 16418 MULSS X0, X4 16419 ADDSS (DX)(R8*4), X1 16420 MOVSS X1, (DX)(R8*4) 16421 ADDQ BX, R8 16422 ADDSS (DX)(R8*4), X2 16423 MOVSS X2, (DX)(R8*4) 16424 ADDQ BX, R8 16425 ADDSS (DX)(R8*4), X3 16426 MOVSS X3, (DX)(R8*4) 16427 ADDQ BX, R8 16428 ADDSS (DX)(R8*4), X4 16429 MOVSS X4, (DX)(R8*4) 16430 ADDQ BX, R8 16431 SUBQ $0x04, SI 16432 16433 check_limit_unroll: 16434 CMPQ SI, $0x04 16435 JHS loop_unroll 16436 JMP check_limit 16437 16438 loop: 16439 MOVSS (AX)(DI*4), X1 16440 MULSS X0, X1 16441 ADDSS (DX)(R8*4), X1 16442 MOVSS X1, (DX)(R8*4) 16443 DECQ SI 16444 ADDQ CX, DI 16445 ADDQ BX, R8 16446 16447 check_limit: 16448 CMPQ SI, $0x00 16449 JHI loop 16450 RET 16451 16452 // func AmdAxpyUnsafeXInterleave_V5A0R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 16453 // Requires: SSE 16454 TEXT ·AmdAxpyUnsafeXInterleave_V5A0R4(SB), NOSPLIT, $0-48 16455 MOVSS alpha+0(FP), X0 16456 MOVQ xs+8(FP), AX 16457 MOVQ incx+16(FP), CX 16458 MOVQ ys+24(FP), DX 16459 MOVQ incy+32(FP), BX 16460 MOVQ n+40(FP), SI 16461 XORQ DI, DI 16462 XORQ R8, R8 16463 JMP check_limit_unroll 16464 16465 loop_unroll: 16466 MOVSS (AX)(DI*4), X1 16467 ADDQ CX, DI 16468 MOVSS (AX)(DI*4), X2 16469 ADDQ CX, DI 16470 MOVSS (AX)(DI*4), X3 16471 ADDQ CX, DI 16472 MOVSS (AX)(DI*4), X4 16473 ADDQ CX, DI 16474 MULSS X0, X1 16475 MULSS X0, X2 16476 MULSS X0, X3 16477 MULSS X0, X4 16478 ADDSS (DX)(R8*4), X1 16479 MOVSS X1, (DX)(R8*4) 16480 ADDQ BX, R8 16481 ADDSS (DX)(R8*4), X2 16482 MOVSS X2, (DX)(R8*4) 16483 ADDQ BX, R8 16484 ADDSS (DX)(R8*4), X3 16485 MOVSS X3, (DX)(R8*4) 16486 ADDQ BX, R8 16487 ADDSS (DX)(R8*4), X4 16488 MOVSS X4, (DX)(R8*4) 16489 ADDQ BX, R8 16490 SUBQ $0x04, SI 16491 16492 check_limit_unroll: 16493 CMPQ SI, $0x04 16494 JHS loop_unroll 16495 JMP check_limit 16496 16497 loop: 16498 MOVSS (AX)(DI*4), X1 16499 MULSS X0, X1 16500 ADDSS (DX)(R8*4), X1 16501 MOVSS X1, (DX)(R8*4) 16502 DECQ SI 16503 ADDQ CX, DI 16504 ADDQ BX, R8 16505 16506 check_limit: 16507 CMPQ SI, $0x00 16508 JHI loop 16509 RET 16510 16511 // func AmdAxpyUnsafeXInterleave_V0A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 16512 // Requires: SSE 16513 TEXT ·AmdAxpyUnsafeXInterleave_V0A8R4(SB), NOSPLIT, $0-48 16514 MOVSS alpha+0(FP), X0 16515 MOVQ xs+8(FP), AX 16516 MOVQ incx+16(FP), CX 16517 MOVQ ys+24(FP), DX 16518 MOVQ incy+32(FP), BX 16519 MOVQ n+40(FP), SI 16520 XORQ DI, DI 16521 XORQ R8, R8 16522 JMP check_limit_unroll 16523 PCALIGN $0x08 16524 16525 loop_unroll: 16526 MOVSS (AX)(DI*4), X1 16527 ADDQ CX, DI 16528 MOVSS (AX)(DI*4), X2 16529 ADDQ CX, DI 16530 MOVSS (AX)(DI*4), X3 16531 ADDQ CX, DI 16532 MOVSS (AX)(DI*4), X4 16533 ADDQ CX, DI 16534 MULSS X0, X1 16535 MULSS X0, X2 16536 MULSS X0, X3 16537 MULSS X0, X4 16538 ADDSS (DX)(R8*4), X1 16539 MOVSS X1, (DX)(R8*4) 16540 ADDQ BX, R8 16541 ADDSS (DX)(R8*4), X2 16542 MOVSS X2, (DX)(R8*4) 16543 ADDQ BX, R8 16544 ADDSS (DX)(R8*4), X3 16545 MOVSS X3, (DX)(R8*4) 16546 
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X4
	MOVSS X4, (DX)(R8*4)
	ADDQ  BX, R8
	SUBQ  $0x04, SI

check_limit_unroll:
	CMPQ SI, $0x04
	JHS  loop_unroll
	JMP  check_limit

loop:
	MOVSS (AX)(DI*4), X1
	MULSS X0, X1
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	DECQ  SI
	ADDQ  CX, DI
	ADDQ  BX, R8

check_limit:
	CMPQ SI, $0x00
	JHI  loop
	RET

// func AmdAxpyUnsafeXInterleave_V1A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyUnsafeXInterleave_V1A8R4(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	XORQ  DI, DI
	XORQ  R8, R8
	JMP   check_limit_unroll
	PCALIGN $0x08

loop_unroll:
	MOVSS (AX)(DI*4), X1
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X2
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X3
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X4
	ADDQ  CX, DI
	MULSS X0, X1
	MULSS X0, X2
	MULSS X0, X3
	MULSS X0, X4
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X2
	MOVSS X2, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X3
	MOVSS X3, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X4
	MOVSS X4, (DX)(R8*4)
	ADDQ  BX, R8
	SUBQ  $0x04, SI

check_limit_unroll:
	CMPQ SI, $0x04
	JHS  loop_unroll
	JMP  check_limit

loop:
	MOVSS (AX)(DI*4), X1
	MULSS X0, X1
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	DECQ  SI
	ADDQ  CX, DI
	ADDQ  BX, R8

check_limit:
	CMPQ SI, $0x00
	JHI  loop
	RET

// func AmdAxpyUnsafeXInterleave_V2A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyUnsafeXInterleave_V2A8R4(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	XORQ  DI, DI
	XORQ  R8, R8
	JMP   check_limit_unroll
	PCALIGN $0x08

loop_unroll:
	MOVSS (AX)(DI*4), X1
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X2
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X3
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X4
	ADDQ  CX, DI
	MULSS X0, X1
	MULSS X0, X2
	MULSS X0, X3
	MULSS X0, X4
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X2
	MOVSS X2, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X3
	MOVSS X3, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X4
	MOVSS X4, (DX)(R8*4)
	ADDQ  BX, R8
	SUBQ  $0x04, SI

check_limit_unroll:
	CMPQ SI, $0x04
	JHS  loop_unroll
	JMP  check_limit

loop:
	MOVSS (AX)(DI*4), X1
	MULSS X0, X1
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	DECQ  SI
	ADDQ  CX, DI
	ADDQ  BX, R8

check_limit:
	CMPQ SI, $0x00
	JHI  loop
	RET

// func AmdAxpyUnsafeXInterleave_V3A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyUnsafeXInterleave_V3A8R4(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	XORQ  DI, DI
	XORQ  R8, R8
	JMP   check_limit_unroll
	PCALIGN $0x08

loop_unroll:
	MOVSS (AX)(DI*4), X1
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X2
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X3
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X4
	ADDQ  CX, DI
	MULSS X0, X1
	MULSS X0, X2
	MULSS X0, X3
	MULSS X0, X4
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X2
	MOVSS X2, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X3
	MOVSS X3, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X4
	MOVSS X4, (DX)(R8*4)
	ADDQ  BX, R8
	SUBQ  $0x04, SI

check_limit_unroll:
	CMPQ SI, $0x04
	JHS  loop_unroll
	JMP  check_limit

loop:
	MOVSS (AX)(DI*4), X1
	MULSS X0, X1
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	DECQ  SI
	ADDQ  CX, DI
	ADDQ  BX, R8

check_limit:
	CMPQ SI, $0x00
	JHI  loop
	RET

// func AmdAxpyUnsafeXInterleave_V4A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyUnsafeXInterleave_V4A8R4(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	XORQ  DI, DI
	XORQ  R8, R8
	JMP   check_limit_unroll
	PCALIGN $0x08

loop_unroll:
	MOVSS (AX)(DI*4), X1
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X2
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X3
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X4
	ADDQ  CX, DI
	MULSS X0, X1
	MULSS X0, X2
	MULSS X0, X3
	MULSS X0, X4
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X2
	MOVSS X2, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X3
	MOVSS X3, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X4
	MOVSS X4, (DX)(R8*4)
	ADDQ  BX, R8
	SUBQ  $0x04, SI

check_limit_unroll:
	CMPQ SI, $0x04
	JHS  loop_unroll
	JMP  check_limit

loop:
	MOVSS (AX)(DI*4), X1
	MULSS X0, X1
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	DECQ  SI
	ADDQ  CX, DI
	ADDQ  BX, R8

check_limit:
	CMPQ SI, $0x00
	JHI  loop
	RET

// func AmdAxpyUnsafeXInterleave_V5A8R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyUnsafeXInterleave_V5A8R4(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	XORQ  DI, DI
	XORQ  R8, R8
	JMP   check_limit_unroll
	PCALIGN $0x08

loop_unroll:
	MOVSS (AX)(DI*4), X1
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X2
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X3
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X4
	ADDQ  CX, DI
	MULSS X0, X1
	MULSS X0, X2
	MULSS X0, X3
	MULSS X0, X4
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X2
	MOVSS X2, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X3
	MOVSS X3, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X4
	MOVSS X4, (DX)(R8*4)
	ADDQ  BX, R8
	SUBQ  $0x04, SI

check_limit_unroll:
	CMPQ SI, $0x04
	JHS  loop_unroll
	JMP  check_limit

loop:
	MOVSS (AX)(DI*4), X1
	MULSS X0, X1
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	DECQ  SI
	ADDQ  CX, DI
	ADDQ  BX, R8

check_limit:
	CMPQ SI, $0x00
	JHI  loop
	RET

// func AmdAxpyUnsafeXInterleave_V0A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyUnsafeXInterleave_V0A9R4(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	XORQ  DI, DI
	XORQ  R8, R8
	JMP   check_limit_unroll
	PCALIGN $0x08
	NOP

loop_unroll:
	MOVSS (AX)(DI*4), X1
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X2
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X3
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X4
	ADDQ  CX, DI
	MULSS X0, X1
	MULSS X0, X2
	MULSS X0, X3
	MULSS X0, X4
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X2
	MOVSS X2, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X3
	MOVSS X3, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X4
	MOVSS X4, (DX)(R8*4)
	ADDQ  BX, R8
	SUBQ  $0x04, SI

check_limit_unroll:
	CMPQ SI, $0x04
	JHS  loop_unroll
	JMP  check_limit

loop:
	MOVSS (AX)(DI*4), X1
	MULSS X0, X1
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	DECQ  SI
	ADDQ  CX, DI
	ADDQ  BX, R8

check_limit:
	CMPQ SI, $0x00
	JHI  loop
	RET

// func AmdAxpyUnsafeXInterleave_V1A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyUnsafeXInterleave_V1A9R4(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	XORQ  DI, DI
	XORQ  R8, R8
	JMP   check_limit_unroll
	PCALIGN $0x08
	NOP

loop_unroll:
	MOVSS (AX)(DI*4), X1
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X2
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X3
	ADDQ  CX, DI
	MOVSS (AX)(DI*4), X4
	ADDQ  CX, DI
	MULSS X0, X1
	MULSS X0, X2
	MULSS X0, X3
	MULSS X0, X4
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X2
	MOVSS X2, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X3
	MOVSS X3, (DX)(R8*4)
	ADDQ  BX, R8
	ADDSS (DX)(R8*4), X4
	MOVSS X4, (DX)(R8*4)
	ADDQ  BX, R8
	SUBQ  $0x04, SI

check_limit_unroll:
	CMPQ SI, $0x04
	JHS  loop_unroll
	JMP  check_limit

loop:
	MOVSS (AX)(DI*4), X1
	MULSS X0, X1
	ADDSS (DX)(R8*4), X1
	MOVSS X1, (DX)(R8*4)
	DECQ  SI
	ADDQ  CX, DI
	ADDQ  BX, R8

check_limit:
	CMPQ SI, $0x00
	JHI  loop
	RET

// func AmdAxpyUnsafeXInterleave_V2A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyUnsafeXInterleave_V2A9R4(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ  xs+8(FP), AX
	MOVQ  incx+16(FP), CX
	MOVQ  ys+24(FP), DX
	MOVQ  incy+32(FP), BX
	MOVQ  n+40(FP), SI
	XORQ  DI, DI
	XORQ  R8, R8
	JMP   check_limit_unroll
	PCALIGN $0x08
	NOP

loop_unroll: 17009 MOVSS (AX)(DI*4), X1 17010 ADDQ CX, DI 17011 MOVSS (AX)(DI*4), X2 17012 ADDQ CX, DI 17013 MOVSS (AX)(DI*4), X3 17014 ADDQ CX, DI 17015 MOVSS (AX)(DI*4), X4 17016 ADDQ CX, DI 17017 MULSS X0, X1 17018 MULSS X0, X2 17019 MULSS X0, X3 17020 MULSS X0, X4 17021 ADDSS (DX)(R8*4), X1 17022 MOVSS X1, (DX)(R8*4) 17023 ADDQ BX, R8 17024 ADDSS (DX)(R8*4), X2 17025 MOVSS X2, (DX)(R8*4) 17026 ADDQ BX, R8 17027 ADDSS (DX)(R8*4), X3 17028 MOVSS X3, (DX)(R8*4) 17029 ADDQ BX, R8 17030 ADDSS (DX)(R8*4), X4 17031 MOVSS X4, (DX)(R8*4) 17032 ADDQ BX, R8 17033 SUBQ $0x04, SI 17034 17035 check_limit_unroll: 17036 CMPQ SI, $0x04 17037 JHS loop_unroll 17038 JMP check_limit 17039 17040 loop: 17041 MOVSS (AX)(DI*4), X1 17042 MULSS X0, X1 17043 ADDSS (DX)(R8*4), X1 17044 MOVSS X1, (DX)(R8*4) 17045 DECQ SI 17046 ADDQ CX, DI 17047 ADDQ BX, R8 17048 17049 check_limit: 17050 CMPQ SI, $0x00 17051 JHI loop 17052 RET 17053 17054 // func AmdAxpyUnsafeXInterleave_V3A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17055 // Requires: SSE 17056 TEXT ·AmdAxpyUnsafeXInterleave_V3A9R4(SB), NOSPLIT, $0-48 17057 MOVSS alpha+0(FP), X0 17058 MOVQ xs+8(FP), AX 17059 MOVQ incx+16(FP), CX 17060 MOVQ ys+24(FP), DX 17061 MOVQ incy+32(FP), BX 17062 MOVQ n+40(FP), SI 17063 XORQ DI, DI 17064 XORQ R8, R8 17065 JMP check_limit_unroll 17066 PCALIGN $0x08 17067 NOP 17068 17069 loop_unroll: 17070 MOVSS (AX)(DI*4), X1 17071 ADDQ CX, DI 17072 MOVSS (AX)(DI*4), X2 17073 ADDQ CX, DI 17074 MOVSS (AX)(DI*4), X3 17075 ADDQ CX, DI 17076 MOVSS (AX)(DI*4), X4 17077 ADDQ CX, DI 17078 MULSS X0, X1 17079 MULSS X0, X2 17080 MULSS X0, X3 17081 MULSS X0, X4 17082 ADDSS (DX)(R8*4), X1 17083 MOVSS X1, (DX)(R8*4) 17084 ADDQ BX, R8 17085 ADDSS (DX)(R8*4), X2 17086 MOVSS X2, (DX)(R8*4) 17087 ADDQ BX, R8 17088 ADDSS (DX)(R8*4), X3 17089 MOVSS X3, (DX)(R8*4) 17090 ADDQ BX, R8 17091 ADDSS (DX)(R8*4), X4 17092 MOVSS X4, (DX)(R8*4) 17093 ADDQ BX, R8 17094 SUBQ $0x04, SI 17095 17096 check_limit_unroll: 17097 CMPQ SI, $0x04 17098 JHS loop_unroll 17099 JMP check_limit 17100 17101 loop: 17102 MOVSS (AX)(DI*4), X1 17103 MULSS X0, X1 17104 ADDSS (DX)(R8*4), X1 17105 MOVSS X1, (DX)(R8*4) 17106 DECQ SI 17107 ADDQ CX, DI 17108 ADDQ BX, R8 17109 17110 check_limit: 17111 CMPQ SI, $0x00 17112 JHI loop 17113 RET 17114 17115 // func AmdAxpyUnsafeXInterleave_V4A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17116 // Requires: SSE 17117 TEXT ·AmdAxpyUnsafeXInterleave_V4A9R4(SB), NOSPLIT, $0-48 17118 MOVSS alpha+0(FP), X0 17119 MOVQ xs+8(FP), AX 17120 MOVQ incx+16(FP), CX 17121 MOVQ ys+24(FP), DX 17122 MOVQ incy+32(FP), BX 17123 MOVQ n+40(FP), SI 17124 XORQ DI, DI 17125 XORQ R8, R8 17126 JMP check_limit_unroll 17127 PCALIGN $0x08 17128 NOP 17129 17130 loop_unroll: 17131 MOVSS (AX)(DI*4), X1 17132 ADDQ CX, DI 17133 MOVSS (AX)(DI*4), X2 17134 ADDQ CX, DI 17135 MOVSS (AX)(DI*4), X3 17136 ADDQ CX, DI 17137 MOVSS (AX)(DI*4), X4 17138 ADDQ CX, DI 17139 MULSS X0, X1 17140 MULSS X0, X2 17141 MULSS X0, X3 17142 MULSS X0, X4 17143 ADDSS (DX)(R8*4), X1 17144 MOVSS X1, (DX)(R8*4) 17145 ADDQ BX, R8 17146 ADDSS (DX)(R8*4), X2 17147 MOVSS X2, (DX)(R8*4) 17148 ADDQ BX, R8 17149 ADDSS (DX)(R8*4), X3 17150 MOVSS X3, (DX)(R8*4) 17151 ADDQ BX, R8 17152 ADDSS (DX)(R8*4), X4 17153 MOVSS X4, (DX)(R8*4) 17154 ADDQ BX, R8 17155 SUBQ $0x04, SI 17156 17157 check_limit_unroll: 17158 CMPQ SI, $0x04 17159 JHS loop_unroll 17160 JMP check_limit 17161 17162 loop: 17163 MOVSS (AX)(DI*4), X1 17164 MULSS X0, X1 17165 ADDSS (DX)(R8*4), 
X1 17166 MOVSS X1, (DX)(R8*4) 17167 DECQ SI 17168 ADDQ CX, DI 17169 ADDQ BX, R8 17170 17171 check_limit: 17172 CMPQ SI, $0x00 17173 JHI loop 17174 RET 17175 17176 // func AmdAxpyUnsafeXInterleave_V5A9R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17177 // Requires: SSE 17178 TEXT ·AmdAxpyUnsafeXInterleave_V5A9R4(SB), NOSPLIT, $0-48 17179 MOVSS alpha+0(FP), X0 17180 MOVQ xs+8(FP), AX 17181 MOVQ incx+16(FP), CX 17182 MOVQ ys+24(FP), DX 17183 MOVQ incy+32(FP), BX 17184 MOVQ n+40(FP), SI 17185 XORQ DI, DI 17186 XORQ R8, R8 17187 JMP check_limit_unroll 17188 PCALIGN $0x08 17189 NOP 17190 17191 loop_unroll: 17192 MOVSS (AX)(DI*4), X1 17193 ADDQ CX, DI 17194 MOVSS (AX)(DI*4), X2 17195 ADDQ CX, DI 17196 MOVSS (AX)(DI*4), X3 17197 ADDQ CX, DI 17198 MOVSS (AX)(DI*4), X4 17199 ADDQ CX, DI 17200 MULSS X0, X1 17201 MULSS X0, X2 17202 MULSS X0, X3 17203 MULSS X0, X4 17204 ADDSS (DX)(R8*4), X1 17205 MOVSS X1, (DX)(R8*4) 17206 ADDQ BX, R8 17207 ADDSS (DX)(R8*4), X2 17208 MOVSS X2, (DX)(R8*4) 17209 ADDQ BX, R8 17210 ADDSS (DX)(R8*4), X3 17211 MOVSS X3, (DX)(R8*4) 17212 ADDQ BX, R8 17213 ADDSS (DX)(R8*4), X4 17214 MOVSS X4, (DX)(R8*4) 17215 ADDQ BX, R8 17216 SUBQ $0x04, SI 17217 17218 check_limit_unroll: 17219 CMPQ SI, $0x04 17220 JHS loop_unroll 17221 JMP check_limit 17222 17223 loop: 17224 MOVSS (AX)(DI*4), X1 17225 MULSS X0, X1 17226 ADDSS (DX)(R8*4), X1 17227 MOVSS X1, (DX)(R8*4) 17228 DECQ SI 17229 ADDQ CX, DI 17230 ADDQ BX, R8 17231 17232 check_limit: 17233 CMPQ SI, $0x00 17234 JHI loop 17235 RET 17236 17237 // func AmdAxpyUnsafeXInterleave_V0A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17238 // Requires: SSE 17239 TEXT ·AmdAxpyUnsafeXInterleave_V0A10R4(SB), NOSPLIT, $0-48 17240 MOVSS alpha+0(FP), X0 17241 MOVQ xs+8(FP), AX 17242 MOVQ incx+16(FP), CX 17243 MOVQ ys+24(FP), DX 17244 MOVQ incy+32(FP), BX 17245 MOVQ n+40(FP), SI 17246 XORQ DI, DI 17247 XORQ R8, R8 17248 JMP check_limit_unroll 17249 PCALIGN $0x08 17250 NOP 17251 NOP 17252 17253 loop_unroll: 17254 MOVSS (AX)(DI*4), X1 17255 ADDQ CX, DI 17256 MOVSS (AX)(DI*4), X2 17257 ADDQ CX, DI 17258 MOVSS (AX)(DI*4), X3 17259 ADDQ CX, DI 17260 MOVSS (AX)(DI*4), X4 17261 ADDQ CX, DI 17262 MULSS X0, X1 17263 MULSS X0, X2 17264 MULSS X0, X3 17265 MULSS X0, X4 17266 ADDSS (DX)(R8*4), X1 17267 MOVSS X1, (DX)(R8*4) 17268 ADDQ BX, R8 17269 ADDSS (DX)(R8*4), X2 17270 MOVSS X2, (DX)(R8*4) 17271 ADDQ BX, R8 17272 ADDSS (DX)(R8*4), X3 17273 MOVSS X3, (DX)(R8*4) 17274 ADDQ BX, R8 17275 ADDSS (DX)(R8*4), X4 17276 MOVSS X4, (DX)(R8*4) 17277 ADDQ BX, R8 17278 SUBQ $0x04, SI 17279 17280 check_limit_unroll: 17281 CMPQ SI, $0x04 17282 JHS loop_unroll 17283 JMP check_limit 17284 17285 loop: 17286 MOVSS (AX)(DI*4), X1 17287 MULSS X0, X1 17288 ADDSS (DX)(R8*4), X1 17289 MOVSS X1, (DX)(R8*4) 17290 DECQ SI 17291 ADDQ CX, DI 17292 ADDQ BX, R8 17293 17294 check_limit: 17295 CMPQ SI, $0x00 17296 JHI loop 17297 RET 17298 17299 // func AmdAxpyUnsafeXInterleave_V1A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17300 // Requires: SSE 17301 TEXT ·AmdAxpyUnsafeXInterleave_V1A10R4(SB), NOSPLIT, $0-48 17302 MOVSS alpha+0(FP), X0 17303 MOVQ xs+8(FP), AX 17304 MOVQ incx+16(FP), CX 17305 MOVQ ys+24(FP), DX 17306 MOVQ incy+32(FP), BX 17307 MOVQ n+40(FP), SI 17308 XORQ DI, DI 17309 XORQ R8, R8 17310 JMP check_limit_unroll 17311 PCALIGN $0x08 17312 NOP 17313 NOP 17314 17315 loop_unroll: 17316 MOVSS (AX)(DI*4), X1 17317 ADDQ CX, DI 17318 MOVSS (AX)(DI*4), X2 17319 ADDQ CX, DI 17320 
MOVSS (AX)(DI*4), X3 17321 ADDQ CX, DI 17322 MOVSS (AX)(DI*4), X4 17323 ADDQ CX, DI 17324 MULSS X0, X1 17325 MULSS X0, X2 17326 MULSS X0, X3 17327 MULSS X0, X4 17328 ADDSS (DX)(R8*4), X1 17329 MOVSS X1, (DX)(R8*4) 17330 ADDQ BX, R8 17331 ADDSS (DX)(R8*4), X2 17332 MOVSS X2, (DX)(R8*4) 17333 ADDQ BX, R8 17334 ADDSS (DX)(R8*4), X3 17335 MOVSS X3, (DX)(R8*4) 17336 ADDQ BX, R8 17337 ADDSS (DX)(R8*4), X4 17338 MOVSS X4, (DX)(R8*4) 17339 ADDQ BX, R8 17340 SUBQ $0x04, SI 17341 17342 check_limit_unroll: 17343 CMPQ SI, $0x04 17344 JHS loop_unroll 17345 JMP check_limit 17346 17347 loop: 17348 MOVSS (AX)(DI*4), X1 17349 MULSS X0, X1 17350 ADDSS (DX)(R8*4), X1 17351 MOVSS X1, (DX)(R8*4) 17352 DECQ SI 17353 ADDQ CX, DI 17354 ADDQ BX, R8 17355 17356 check_limit: 17357 CMPQ SI, $0x00 17358 JHI loop 17359 RET 17360 17361 // func AmdAxpyUnsafeXInterleave_V2A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17362 // Requires: SSE 17363 TEXT ·AmdAxpyUnsafeXInterleave_V2A10R4(SB), NOSPLIT, $0-48 17364 MOVSS alpha+0(FP), X0 17365 MOVQ xs+8(FP), AX 17366 MOVQ incx+16(FP), CX 17367 MOVQ ys+24(FP), DX 17368 MOVQ incy+32(FP), BX 17369 MOVQ n+40(FP), SI 17370 XORQ DI, DI 17371 XORQ R8, R8 17372 JMP check_limit_unroll 17373 PCALIGN $0x08 17374 NOP 17375 NOP 17376 17377 loop_unroll: 17378 MOVSS (AX)(DI*4), X1 17379 ADDQ CX, DI 17380 MOVSS (AX)(DI*4), X2 17381 ADDQ CX, DI 17382 MOVSS (AX)(DI*4), X3 17383 ADDQ CX, DI 17384 MOVSS (AX)(DI*4), X4 17385 ADDQ CX, DI 17386 MULSS X0, X1 17387 MULSS X0, X2 17388 MULSS X0, X3 17389 MULSS X0, X4 17390 ADDSS (DX)(R8*4), X1 17391 MOVSS X1, (DX)(R8*4) 17392 ADDQ BX, R8 17393 ADDSS (DX)(R8*4), X2 17394 MOVSS X2, (DX)(R8*4) 17395 ADDQ BX, R8 17396 ADDSS (DX)(R8*4), X3 17397 MOVSS X3, (DX)(R8*4) 17398 ADDQ BX, R8 17399 ADDSS (DX)(R8*4), X4 17400 MOVSS X4, (DX)(R8*4) 17401 ADDQ BX, R8 17402 SUBQ $0x04, SI 17403 17404 check_limit_unroll: 17405 CMPQ SI, $0x04 17406 JHS loop_unroll 17407 JMP check_limit 17408 17409 loop: 17410 MOVSS (AX)(DI*4), X1 17411 MULSS X0, X1 17412 ADDSS (DX)(R8*4), X1 17413 MOVSS X1, (DX)(R8*4) 17414 DECQ SI 17415 ADDQ CX, DI 17416 ADDQ BX, R8 17417 17418 check_limit: 17419 CMPQ SI, $0x00 17420 JHI loop 17421 RET 17422 17423 // func AmdAxpyUnsafeXInterleave_V3A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17424 // Requires: SSE 17425 TEXT ·AmdAxpyUnsafeXInterleave_V3A10R4(SB), NOSPLIT, $0-48 17426 MOVSS alpha+0(FP), X0 17427 MOVQ xs+8(FP), AX 17428 MOVQ incx+16(FP), CX 17429 MOVQ ys+24(FP), DX 17430 MOVQ incy+32(FP), BX 17431 MOVQ n+40(FP), SI 17432 XORQ DI, DI 17433 XORQ R8, R8 17434 JMP check_limit_unroll 17435 PCALIGN $0x08 17436 NOP 17437 NOP 17438 17439 loop_unroll: 17440 MOVSS (AX)(DI*4), X1 17441 ADDQ CX, DI 17442 MOVSS (AX)(DI*4), X2 17443 ADDQ CX, DI 17444 MOVSS (AX)(DI*4), X3 17445 ADDQ CX, DI 17446 MOVSS (AX)(DI*4), X4 17447 ADDQ CX, DI 17448 MULSS X0, X1 17449 MULSS X0, X2 17450 MULSS X0, X3 17451 MULSS X0, X4 17452 ADDSS (DX)(R8*4), X1 17453 MOVSS X1, (DX)(R8*4) 17454 ADDQ BX, R8 17455 ADDSS (DX)(R8*4), X2 17456 MOVSS X2, (DX)(R8*4) 17457 ADDQ BX, R8 17458 ADDSS (DX)(R8*4), X3 17459 MOVSS X3, (DX)(R8*4) 17460 ADDQ BX, R8 17461 ADDSS (DX)(R8*4), X4 17462 MOVSS X4, (DX)(R8*4) 17463 ADDQ BX, R8 17464 SUBQ $0x04, SI 17465 17466 check_limit_unroll: 17467 CMPQ SI, $0x04 17468 JHS loop_unroll 17469 JMP check_limit 17470 17471 loop: 17472 MOVSS (AX)(DI*4), X1 17473 MULSS X0, X1 17474 ADDSS (DX)(R8*4), X1 17475 MOVSS X1, (DX)(R8*4) 17476 DECQ SI 17477 ADDQ CX, DI 17478 ADDQ BX, R8 17479 
17480 check_limit: 17481 CMPQ SI, $0x00 17482 JHI loop 17483 RET 17484 17485 // func AmdAxpyUnsafeXInterleave_V4A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17486 // Requires: SSE 17487 TEXT ·AmdAxpyUnsafeXInterleave_V4A10R4(SB), NOSPLIT, $0-48 17488 MOVSS alpha+0(FP), X0 17489 MOVQ xs+8(FP), AX 17490 MOVQ incx+16(FP), CX 17491 MOVQ ys+24(FP), DX 17492 MOVQ incy+32(FP), BX 17493 MOVQ n+40(FP), SI 17494 XORQ DI, DI 17495 XORQ R8, R8 17496 JMP check_limit_unroll 17497 PCALIGN $0x08 17498 NOP 17499 NOP 17500 17501 loop_unroll: 17502 MOVSS (AX)(DI*4), X1 17503 ADDQ CX, DI 17504 MOVSS (AX)(DI*4), X2 17505 ADDQ CX, DI 17506 MOVSS (AX)(DI*4), X3 17507 ADDQ CX, DI 17508 MOVSS (AX)(DI*4), X4 17509 ADDQ CX, DI 17510 MULSS X0, X1 17511 MULSS X0, X2 17512 MULSS X0, X3 17513 MULSS X0, X4 17514 ADDSS (DX)(R8*4), X1 17515 MOVSS X1, (DX)(R8*4) 17516 ADDQ BX, R8 17517 ADDSS (DX)(R8*4), X2 17518 MOVSS X2, (DX)(R8*4) 17519 ADDQ BX, R8 17520 ADDSS (DX)(R8*4), X3 17521 MOVSS X3, (DX)(R8*4) 17522 ADDQ BX, R8 17523 ADDSS (DX)(R8*4), X4 17524 MOVSS X4, (DX)(R8*4) 17525 ADDQ BX, R8 17526 SUBQ $0x04, SI 17527 17528 check_limit_unroll: 17529 CMPQ SI, $0x04 17530 JHS loop_unroll 17531 JMP check_limit 17532 17533 loop: 17534 MOVSS (AX)(DI*4), X1 17535 MULSS X0, X1 17536 ADDSS (DX)(R8*4), X1 17537 MOVSS X1, (DX)(R8*4) 17538 DECQ SI 17539 ADDQ CX, DI 17540 ADDQ BX, R8 17541 17542 check_limit: 17543 CMPQ SI, $0x00 17544 JHI loop 17545 RET 17546 17547 // func AmdAxpyUnsafeXInterleave_V5A10R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17548 // Requires: SSE 17549 TEXT ·AmdAxpyUnsafeXInterleave_V5A10R4(SB), NOSPLIT, $0-48 17550 MOVSS alpha+0(FP), X0 17551 MOVQ xs+8(FP), AX 17552 MOVQ incx+16(FP), CX 17553 MOVQ ys+24(FP), DX 17554 MOVQ incy+32(FP), BX 17555 MOVQ n+40(FP), SI 17556 XORQ DI, DI 17557 XORQ R8, R8 17558 JMP check_limit_unroll 17559 PCALIGN $0x08 17560 NOP 17561 NOP 17562 17563 loop_unroll: 17564 MOVSS (AX)(DI*4), X1 17565 ADDQ CX, DI 17566 MOVSS (AX)(DI*4), X2 17567 ADDQ CX, DI 17568 MOVSS (AX)(DI*4), X3 17569 ADDQ CX, DI 17570 MOVSS (AX)(DI*4), X4 17571 ADDQ CX, DI 17572 MULSS X0, X1 17573 MULSS X0, X2 17574 MULSS X0, X3 17575 MULSS X0, X4 17576 ADDSS (DX)(R8*4), X1 17577 MOVSS X1, (DX)(R8*4) 17578 ADDQ BX, R8 17579 ADDSS (DX)(R8*4), X2 17580 MOVSS X2, (DX)(R8*4) 17581 ADDQ BX, R8 17582 ADDSS (DX)(R8*4), X3 17583 MOVSS X3, (DX)(R8*4) 17584 ADDQ BX, R8 17585 ADDSS (DX)(R8*4), X4 17586 MOVSS X4, (DX)(R8*4) 17587 ADDQ BX, R8 17588 SUBQ $0x04, SI 17589 17590 check_limit_unroll: 17591 CMPQ SI, $0x04 17592 JHS loop_unroll 17593 JMP check_limit 17594 17595 loop: 17596 MOVSS (AX)(DI*4), X1 17597 MULSS X0, X1 17598 ADDSS (DX)(R8*4), X1 17599 MOVSS X1, (DX)(R8*4) 17600 DECQ SI 17601 ADDQ CX, DI 17602 ADDQ BX, R8 17603 17604 check_limit: 17605 CMPQ SI, $0x00 17606 JHI loop 17607 RET 17608 17609 // func AmdAxpyUnsafeXInterleave_V0A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17610 // Requires: SSE 17611 TEXT ·AmdAxpyUnsafeXInterleave_V0A11R4(SB), NOSPLIT, $0-48 17612 MOVSS alpha+0(FP), X0 17613 MOVQ xs+8(FP), AX 17614 MOVQ incx+16(FP), CX 17615 MOVQ ys+24(FP), DX 17616 MOVQ incy+32(FP), BX 17617 MOVQ n+40(FP), SI 17618 XORQ DI, DI 17619 XORQ R8, R8 17620 JMP check_limit_unroll 17621 PCALIGN $0x08 17622 NOP 17623 NOP 17624 NOP 17625 17626 loop_unroll: 17627 MOVSS (AX)(DI*4), X1 17628 ADDQ CX, DI 17629 MOVSS (AX)(DI*4), X2 17630 ADDQ CX, DI 17631 MOVSS (AX)(DI*4), X3 17632 ADDQ CX, DI 17633 MOVSS (AX)(DI*4), 
X4 17634 ADDQ CX, DI 17635 MULSS X0, X1 17636 MULSS X0, X2 17637 MULSS X0, X3 17638 MULSS X0, X4 17639 ADDSS (DX)(R8*4), X1 17640 MOVSS X1, (DX)(R8*4) 17641 ADDQ BX, R8 17642 ADDSS (DX)(R8*4), X2 17643 MOVSS X2, (DX)(R8*4) 17644 ADDQ BX, R8 17645 ADDSS (DX)(R8*4), X3 17646 MOVSS X3, (DX)(R8*4) 17647 ADDQ BX, R8 17648 ADDSS (DX)(R8*4), X4 17649 MOVSS X4, (DX)(R8*4) 17650 ADDQ BX, R8 17651 SUBQ $0x04, SI 17652 17653 check_limit_unroll: 17654 CMPQ SI, $0x04 17655 JHS loop_unroll 17656 JMP check_limit 17657 17658 loop: 17659 MOVSS (AX)(DI*4), X1 17660 MULSS X0, X1 17661 ADDSS (DX)(R8*4), X1 17662 MOVSS X1, (DX)(R8*4) 17663 DECQ SI 17664 ADDQ CX, DI 17665 ADDQ BX, R8 17666 17667 check_limit: 17668 CMPQ SI, $0x00 17669 JHI loop 17670 RET 17671 17672 // func AmdAxpyUnsafeXInterleave_V1A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17673 // Requires: SSE 17674 TEXT ·AmdAxpyUnsafeXInterleave_V1A11R4(SB), NOSPLIT, $0-48 17675 MOVSS alpha+0(FP), X0 17676 MOVQ xs+8(FP), AX 17677 MOVQ incx+16(FP), CX 17678 MOVQ ys+24(FP), DX 17679 MOVQ incy+32(FP), BX 17680 MOVQ n+40(FP), SI 17681 XORQ DI, DI 17682 XORQ R8, R8 17683 JMP check_limit_unroll 17684 PCALIGN $0x08 17685 NOP 17686 NOP 17687 NOP 17688 17689 loop_unroll: 17690 MOVSS (AX)(DI*4), X1 17691 ADDQ CX, DI 17692 MOVSS (AX)(DI*4), X2 17693 ADDQ CX, DI 17694 MOVSS (AX)(DI*4), X3 17695 ADDQ CX, DI 17696 MOVSS (AX)(DI*4), X4 17697 ADDQ CX, DI 17698 MULSS X0, X1 17699 MULSS X0, X2 17700 MULSS X0, X3 17701 MULSS X0, X4 17702 ADDSS (DX)(R8*4), X1 17703 MOVSS X1, (DX)(R8*4) 17704 ADDQ BX, R8 17705 ADDSS (DX)(R8*4), X2 17706 MOVSS X2, (DX)(R8*4) 17707 ADDQ BX, R8 17708 ADDSS (DX)(R8*4), X3 17709 MOVSS X3, (DX)(R8*4) 17710 ADDQ BX, R8 17711 ADDSS (DX)(R8*4), X4 17712 MOVSS X4, (DX)(R8*4) 17713 ADDQ BX, R8 17714 SUBQ $0x04, SI 17715 17716 check_limit_unroll: 17717 CMPQ SI, $0x04 17718 JHS loop_unroll 17719 JMP check_limit 17720 17721 loop: 17722 MOVSS (AX)(DI*4), X1 17723 MULSS X0, X1 17724 ADDSS (DX)(R8*4), X1 17725 MOVSS X1, (DX)(R8*4) 17726 DECQ SI 17727 ADDQ CX, DI 17728 ADDQ BX, R8 17729 17730 check_limit: 17731 CMPQ SI, $0x00 17732 JHI loop 17733 RET 17734 17735 // func AmdAxpyUnsafeXInterleave_V2A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17736 // Requires: SSE 17737 TEXT ·AmdAxpyUnsafeXInterleave_V2A11R4(SB), NOSPLIT, $0-48 17738 MOVSS alpha+0(FP), X0 17739 MOVQ xs+8(FP), AX 17740 MOVQ incx+16(FP), CX 17741 MOVQ ys+24(FP), DX 17742 MOVQ incy+32(FP), BX 17743 MOVQ n+40(FP), SI 17744 XORQ DI, DI 17745 XORQ R8, R8 17746 JMP check_limit_unroll 17747 PCALIGN $0x08 17748 NOP 17749 NOP 17750 NOP 17751 17752 loop_unroll: 17753 MOVSS (AX)(DI*4), X1 17754 ADDQ CX, DI 17755 MOVSS (AX)(DI*4), X2 17756 ADDQ CX, DI 17757 MOVSS (AX)(DI*4), X3 17758 ADDQ CX, DI 17759 MOVSS (AX)(DI*4), X4 17760 ADDQ CX, DI 17761 MULSS X0, X1 17762 MULSS X0, X2 17763 MULSS X0, X3 17764 MULSS X0, X4 17765 ADDSS (DX)(R8*4), X1 17766 MOVSS X1, (DX)(R8*4) 17767 ADDQ BX, R8 17768 ADDSS (DX)(R8*4), X2 17769 MOVSS X2, (DX)(R8*4) 17770 ADDQ BX, R8 17771 ADDSS (DX)(R8*4), X3 17772 MOVSS X3, (DX)(R8*4) 17773 ADDQ BX, R8 17774 ADDSS (DX)(R8*4), X4 17775 MOVSS X4, (DX)(R8*4) 17776 ADDQ BX, R8 17777 SUBQ $0x04, SI 17778 17779 check_limit_unroll: 17780 CMPQ SI, $0x04 17781 JHS loop_unroll 17782 JMP check_limit 17783 17784 loop: 17785 MOVSS (AX)(DI*4), X1 17786 MULSS X0, X1 17787 ADDSS (DX)(R8*4), X1 17788 MOVSS X1, (DX)(R8*4) 17789 DECQ SI 17790 ADDQ CX, DI 17791 ADDQ BX, R8 17792 17793 check_limit: 17794 CMPQ SI, $0x00 
17795 JHI loop 17796 RET 17797 17798 // func AmdAxpyUnsafeXInterleave_V3A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17799 // Requires: SSE 17800 TEXT ·AmdAxpyUnsafeXInterleave_V3A11R4(SB), NOSPLIT, $0-48 17801 MOVSS alpha+0(FP), X0 17802 MOVQ xs+8(FP), AX 17803 MOVQ incx+16(FP), CX 17804 MOVQ ys+24(FP), DX 17805 MOVQ incy+32(FP), BX 17806 MOVQ n+40(FP), SI 17807 XORQ DI, DI 17808 XORQ R8, R8 17809 JMP check_limit_unroll 17810 PCALIGN $0x08 17811 NOP 17812 NOP 17813 NOP 17814 17815 loop_unroll: 17816 MOVSS (AX)(DI*4), X1 17817 ADDQ CX, DI 17818 MOVSS (AX)(DI*4), X2 17819 ADDQ CX, DI 17820 MOVSS (AX)(DI*4), X3 17821 ADDQ CX, DI 17822 MOVSS (AX)(DI*4), X4 17823 ADDQ CX, DI 17824 MULSS X0, X1 17825 MULSS X0, X2 17826 MULSS X0, X3 17827 MULSS X0, X4 17828 ADDSS (DX)(R8*4), X1 17829 MOVSS X1, (DX)(R8*4) 17830 ADDQ BX, R8 17831 ADDSS (DX)(R8*4), X2 17832 MOVSS X2, (DX)(R8*4) 17833 ADDQ BX, R8 17834 ADDSS (DX)(R8*4), X3 17835 MOVSS X3, (DX)(R8*4) 17836 ADDQ BX, R8 17837 ADDSS (DX)(R8*4), X4 17838 MOVSS X4, (DX)(R8*4) 17839 ADDQ BX, R8 17840 SUBQ $0x04, SI 17841 17842 check_limit_unroll: 17843 CMPQ SI, $0x04 17844 JHS loop_unroll 17845 JMP check_limit 17846 17847 loop: 17848 MOVSS (AX)(DI*4), X1 17849 MULSS X0, X1 17850 ADDSS (DX)(R8*4), X1 17851 MOVSS X1, (DX)(R8*4) 17852 DECQ SI 17853 ADDQ CX, DI 17854 ADDQ BX, R8 17855 17856 check_limit: 17857 CMPQ SI, $0x00 17858 JHI loop 17859 RET 17860 17861 // func AmdAxpyUnsafeXInterleave_V4A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17862 // Requires: SSE 17863 TEXT ·AmdAxpyUnsafeXInterleave_V4A11R4(SB), NOSPLIT, $0-48 17864 MOVSS alpha+0(FP), X0 17865 MOVQ xs+8(FP), AX 17866 MOVQ incx+16(FP), CX 17867 MOVQ ys+24(FP), DX 17868 MOVQ incy+32(FP), BX 17869 MOVQ n+40(FP), SI 17870 XORQ DI, DI 17871 XORQ R8, R8 17872 JMP check_limit_unroll 17873 PCALIGN $0x08 17874 NOP 17875 NOP 17876 NOP 17877 17878 loop_unroll: 17879 MOVSS (AX)(DI*4), X1 17880 ADDQ CX, DI 17881 MOVSS (AX)(DI*4), X2 17882 ADDQ CX, DI 17883 MOVSS (AX)(DI*4), X3 17884 ADDQ CX, DI 17885 MOVSS (AX)(DI*4), X4 17886 ADDQ CX, DI 17887 MULSS X0, X1 17888 MULSS X0, X2 17889 MULSS X0, X3 17890 MULSS X0, X4 17891 ADDSS (DX)(R8*4), X1 17892 MOVSS X1, (DX)(R8*4) 17893 ADDQ BX, R8 17894 ADDSS (DX)(R8*4), X2 17895 MOVSS X2, (DX)(R8*4) 17896 ADDQ BX, R8 17897 ADDSS (DX)(R8*4), X3 17898 MOVSS X3, (DX)(R8*4) 17899 ADDQ BX, R8 17900 ADDSS (DX)(R8*4), X4 17901 MOVSS X4, (DX)(R8*4) 17902 ADDQ BX, R8 17903 SUBQ $0x04, SI 17904 17905 check_limit_unroll: 17906 CMPQ SI, $0x04 17907 JHS loop_unroll 17908 JMP check_limit 17909 17910 loop: 17911 MOVSS (AX)(DI*4), X1 17912 MULSS X0, X1 17913 ADDSS (DX)(R8*4), X1 17914 MOVSS X1, (DX)(R8*4) 17915 DECQ SI 17916 ADDQ CX, DI 17917 ADDQ BX, R8 17918 17919 check_limit: 17920 CMPQ SI, $0x00 17921 JHI loop 17922 RET 17923 17924 // func AmdAxpyUnsafeXInterleave_V5A11R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17925 // Requires: SSE 17926 TEXT ·AmdAxpyUnsafeXInterleave_V5A11R4(SB), NOSPLIT, $0-48 17927 MOVSS alpha+0(FP), X0 17928 MOVQ xs+8(FP), AX 17929 MOVQ incx+16(FP), CX 17930 MOVQ ys+24(FP), DX 17931 MOVQ incy+32(FP), BX 17932 MOVQ n+40(FP), SI 17933 XORQ DI, DI 17934 XORQ R8, R8 17935 JMP check_limit_unroll 17936 PCALIGN $0x08 17937 NOP 17938 NOP 17939 NOP 17940 17941 loop_unroll: 17942 MOVSS (AX)(DI*4), X1 17943 ADDQ CX, DI 17944 MOVSS (AX)(DI*4), X2 17945 ADDQ CX, DI 17946 MOVSS (AX)(DI*4), X3 17947 ADDQ CX, DI 17948 MOVSS (AX)(DI*4), X4 17949 ADDQ CX, DI 
17950 MULSS X0, X1 17951 MULSS X0, X2 17952 MULSS X0, X3 17953 MULSS X0, X4 17954 ADDSS (DX)(R8*4), X1 17955 MOVSS X1, (DX)(R8*4) 17956 ADDQ BX, R8 17957 ADDSS (DX)(R8*4), X2 17958 MOVSS X2, (DX)(R8*4) 17959 ADDQ BX, R8 17960 ADDSS (DX)(R8*4), X3 17961 MOVSS X3, (DX)(R8*4) 17962 ADDQ BX, R8 17963 ADDSS (DX)(R8*4), X4 17964 MOVSS X4, (DX)(R8*4) 17965 ADDQ BX, R8 17966 SUBQ $0x04, SI 17967 17968 check_limit_unroll: 17969 CMPQ SI, $0x04 17970 JHS loop_unroll 17971 JMP check_limit 17972 17973 loop: 17974 MOVSS (AX)(DI*4), X1 17975 MULSS X0, X1 17976 ADDSS (DX)(R8*4), X1 17977 MOVSS X1, (DX)(R8*4) 17978 DECQ SI 17979 ADDQ CX, DI 17980 ADDQ BX, R8 17981 17982 check_limit: 17983 CMPQ SI, $0x00 17984 JHI loop 17985 RET 17986 17987 // func AmdAxpyUnsafeXInterleave_V0A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 17988 // Requires: SSE 17989 TEXT ·AmdAxpyUnsafeXInterleave_V0A12R4(SB), NOSPLIT, $0-48 17990 MOVSS alpha+0(FP), X0 17991 MOVQ xs+8(FP), AX 17992 MOVQ incx+16(FP), CX 17993 MOVQ ys+24(FP), DX 17994 MOVQ incy+32(FP), BX 17995 MOVQ n+40(FP), SI 17996 XORQ DI, DI 17997 XORQ R8, R8 17998 JMP check_limit_unroll 17999 PCALIGN $0x08 18000 NOP 18001 NOP 18002 NOP 18003 NOP 18004 18005 loop_unroll: 18006 MOVSS (AX)(DI*4), X1 18007 ADDQ CX, DI 18008 MOVSS (AX)(DI*4), X2 18009 ADDQ CX, DI 18010 MOVSS (AX)(DI*4), X3 18011 ADDQ CX, DI 18012 MOVSS (AX)(DI*4), X4 18013 ADDQ CX, DI 18014 MULSS X0, X1 18015 MULSS X0, X2 18016 MULSS X0, X3 18017 MULSS X0, X4 18018 ADDSS (DX)(R8*4), X1 18019 MOVSS X1, (DX)(R8*4) 18020 ADDQ BX, R8 18021 ADDSS (DX)(R8*4), X2 18022 MOVSS X2, (DX)(R8*4) 18023 ADDQ BX, R8 18024 ADDSS (DX)(R8*4), X3 18025 MOVSS X3, (DX)(R8*4) 18026 ADDQ BX, R8 18027 ADDSS (DX)(R8*4), X4 18028 MOVSS X4, (DX)(R8*4) 18029 ADDQ BX, R8 18030 SUBQ $0x04, SI 18031 18032 check_limit_unroll: 18033 CMPQ SI, $0x04 18034 JHS loop_unroll 18035 JMP check_limit 18036 18037 loop: 18038 MOVSS (AX)(DI*4), X1 18039 MULSS X0, X1 18040 ADDSS (DX)(R8*4), X1 18041 MOVSS X1, (DX)(R8*4) 18042 DECQ SI 18043 ADDQ CX, DI 18044 ADDQ BX, R8 18045 18046 check_limit: 18047 CMPQ SI, $0x00 18048 JHI loop 18049 RET 18050 18051 // func AmdAxpyUnsafeXInterleave_V1A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18052 // Requires: SSE 18053 TEXT ·AmdAxpyUnsafeXInterleave_V1A12R4(SB), NOSPLIT, $0-48 18054 MOVSS alpha+0(FP), X0 18055 MOVQ xs+8(FP), AX 18056 MOVQ incx+16(FP), CX 18057 MOVQ ys+24(FP), DX 18058 MOVQ incy+32(FP), BX 18059 MOVQ n+40(FP), SI 18060 XORQ DI, DI 18061 XORQ R8, R8 18062 JMP check_limit_unroll 18063 PCALIGN $0x08 18064 NOP 18065 NOP 18066 NOP 18067 NOP 18068 18069 loop_unroll: 18070 MOVSS (AX)(DI*4), X1 18071 ADDQ CX, DI 18072 MOVSS (AX)(DI*4), X2 18073 ADDQ CX, DI 18074 MOVSS (AX)(DI*4), X3 18075 ADDQ CX, DI 18076 MOVSS (AX)(DI*4), X4 18077 ADDQ CX, DI 18078 MULSS X0, X1 18079 MULSS X0, X2 18080 MULSS X0, X3 18081 MULSS X0, X4 18082 ADDSS (DX)(R8*4), X1 18083 MOVSS X1, (DX)(R8*4) 18084 ADDQ BX, R8 18085 ADDSS (DX)(R8*4), X2 18086 MOVSS X2, (DX)(R8*4) 18087 ADDQ BX, R8 18088 ADDSS (DX)(R8*4), X3 18089 MOVSS X3, (DX)(R8*4) 18090 ADDQ BX, R8 18091 ADDSS (DX)(R8*4), X4 18092 MOVSS X4, (DX)(R8*4) 18093 ADDQ BX, R8 18094 SUBQ $0x04, SI 18095 18096 check_limit_unroll: 18097 CMPQ SI, $0x04 18098 JHS loop_unroll 18099 JMP check_limit 18100 18101 loop: 18102 MOVSS (AX)(DI*4), X1 18103 MULSS X0, X1 18104 ADDSS (DX)(R8*4), X1 18105 MOVSS X1, (DX)(R8*4) 18106 DECQ SI 18107 ADDQ CX, DI 18108 ADDQ BX, R8 18109 18110 check_limit: 18111 CMPQ SI, $0x00 
18112 JHI loop 18113 RET 18114 18115 // func AmdAxpyUnsafeXInterleave_V2A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18116 // Requires: SSE 18117 TEXT ·AmdAxpyUnsafeXInterleave_V2A12R4(SB), NOSPLIT, $0-48 18118 MOVSS alpha+0(FP), X0 18119 MOVQ xs+8(FP), AX 18120 MOVQ incx+16(FP), CX 18121 MOVQ ys+24(FP), DX 18122 MOVQ incy+32(FP), BX 18123 MOVQ n+40(FP), SI 18124 XORQ DI, DI 18125 XORQ R8, R8 18126 JMP check_limit_unroll 18127 PCALIGN $0x08 18128 NOP 18129 NOP 18130 NOP 18131 NOP 18132 18133 loop_unroll: 18134 MOVSS (AX)(DI*4), X1 18135 ADDQ CX, DI 18136 MOVSS (AX)(DI*4), X2 18137 ADDQ CX, DI 18138 MOVSS (AX)(DI*4), X3 18139 ADDQ CX, DI 18140 MOVSS (AX)(DI*4), X4 18141 ADDQ CX, DI 18142 MULSS X0, X1 18143 MULSS X0, X2 18144 MULSS X0, X3 18145 MULSS X0, X4 18146 ADDSS (DX)(R8*4), X1 18147 MOVSS X1, (DX)(R8*4) 18148 ADDQ BX, R8 18149 ADDSS (DX)(R8*4), X2 18150 MOVSS X2, (DX)(R8*4) 18151 ADDQ BX, R8 18152 ADDSS (DX)(R8*4), X3 18153 MOVSS X3, (DX)(R8*4) 18154 ADDQ BX, R8 18155 ADDSS (DX)(R8*4), X4 18156 MOVSS X4, (DX)(R8*4) 18157 ADDQ BX, R8 18158 SUBQ $0x04, SI 18159 18160 check_limit_unroll: 18161 CMPQ SI, $0x04 18162 JHS loop_unroll 18163 JMP check_limit 18164 18165 loop: 18166 MOVSS (AX)(DI*4), X1 18167 MULSS X0, X1 18168 ADDSS (DX)(R8*4), X1 18169 MOVSS X1, (DX)(R8*4) 18170 DECQ SI 18171 ADDQ CX, DI 18172 ADDQ BX, R8 18173 18174 check_limit: 18175 CMPQ SI, $0x00 18176 JHI loop 18177 RET 18178 18179 // func AmdAxpyUnsafeXInterleave_V3A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18180 // Requires: SSE 18181 TEXT ·AmdAxpyUnsafeXInterleave_V3A12R4(SB), NOSPLIT, $0-48 18182 MOVSS alpha+0(FP), X0 18183 MOVQ xs+8(FP), AX 18184 MOVQ incx+16(FP), CX 18185 MOVQ ys+24(FP), DX 18186 MOVQ incy+32(FP), BX 18187 MOVQ n+40(FP), SI 18188 XORQ DI, DI 18189 XORQ R8, R8 18190 JMP check_limit_unroll 18191 PCALIGN $0x08 18192 NOP 18193 NOP 18194 NOP 18195 NOP 18196 18197 loop_unroll: 18198 MOVSS (AX)(DI*4), X1 18199 ADDQ CX, DI 18200 MOVSS (AX)(DI*4), X2 18201 ADDQ CX, DI 18202 MOVSS (AX)(DI*4), X3 18203 ADDQ CX, DI 18204 MOVSS (AX)(DI*4), X4 18205 ADDQ CX, DI 18206 MULSS X0, X1 18207 MULSS X0, X2 18208 MULSS X0, X3 18209 MULSS X0, X4 18210 ADDSS (DX)(R8*4), X1 18211 MOVSS X1, (DX)(R8*4) 18212 ADDQ BX, R8 18213 ADDSS (DX)(R8*4), X2 18214 MOVSS X2, (DX)(R8*4) 18215 ADDQ BX, R8 18216 ADDSS (DX)(R8*4), X3 18217 MOVSS X3, (DX)(R8*4) 18218 ADDQ BX, R8 18219 ADDSS (DX)(R8*4), X4 18220 MOVSS X4, (DX)(R8*4) 18221 ADDQ BX, R8 18222 SUBQ $0x04, SI 18223 18224 check_limit_unroll: 18225 CMPQ SI, $0x04 18226 JHS loop_unroll 18227 JMP check_limit 18228 18229 loop: 18230 MOVSS (AX)(DI*4), X1 18231 MULSS X0, X1 18232 ADDSS (DX)(R8*4), X1 18233 MOVSS X1, (DX)(R8*4) 18234 DECQ SI 18235 ADDQ CX, DI 18236 ADDQ BX, R8 18237 18238 check_limit: 18239 CMPQ SI, $0x00 18240 JHI loop 18241 RET 18242 18243 // func AmdAxpyUnsafeXInterleave_V4A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18244 // Requires: SSE 18245 TEXT ·AmdAxpyUnsafeXInterleave_V4A12R4(SB), NOSPLIT, $0-48 18246 MOVSS alpha+0(FP), X0 18247 MOVQ xs+8(FP), AX 18248 MOVQ incx+16(FP), CX 18249 MOVQ ys+24(FP), DX 18250 MOVQ incy+32(FP), BX 18251 MOVQ n+40(FP), SI 18252 XORQ DI, DI 18253 XORQ R8, R8 18254 JMP check_limit_unroll 18255 PCALIGN $0x08 18256 NOP 18257 NOP 18258 NOP 18259 NOP 18260 18261 loop_unroll: 18262 MOVSS (AX)(DI*4), X1 18263 ADDQ CX, DI 18264 MOVSS (AX)(DI*4), X2 18265 ADDQ CX, DI 18266 MOVSS (AX)(DI*4), X3 18267 ADDQ CX, DI 18268 MOVSS 
(AX)(DI*4), X4 18269 ADDQ CX, DI 18270 MULSS X0, X1 18271 MULSS X0, X2 18272 MULSS X0, X3 18273 MULSS X0, X4 18274 ADDSS (DX)(R8*4), X1 18275 MOVSS X1, (DX)(R8*4) 18276 ADDQ BX, R8 18277 ADDSS (DX)(R8*4), X2 18278 MOVSS X2, (DX)(R8*4) 18279 ADDQ BX, R8 18280 ADDSS (DX)(R8*4), X3 18281 MOVSS X3, (DX)(R8*4) 18282 ADDQ BX, R8 18283 ADDSS (DX)(R8*4), X4 18284 MOVSS X4, (DX)(R8*4) 18285 ADDQ BX, R8 18286 SUBQ $0x04, SI 18287 18288 check_limit_unroll: 18289 CMPQ SI, $0x04 18290 JHS loop_unroll 18291 JMP check_limit 18292 18293 loop: 18294 MOVSS (AX)(DI*4), X1 18295 MULSS X0, X1 18296 ADDSS (DX)(R8*4), X1 18297 MOVSS X1, (DX)(R8*4) 18298 DECQ SI 18299 ADDQ CX, DI 18300 ADDQ BX, R8 18301 18302 check_limit: 18303 CMPQ SI, $0x00 18304 JHI loop 18305 RET 18306 18307 // func AmdAxpyUnsafeXInterleave_V5A12R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18308 // Requires: SSE 18309 TEXT ·AmdAxpyUnsafeXInterleave_V5A12R4(SB), NOSPLIT, $0-48 18310 MOVSS alpha+0(FP), X0 18311 MOVQ xs+8(FP), AX 18312 MOVQ incx+16(FP), CX 18313 MOVQ ys+24(FP), DX 18314 MOVQ incy+32(FP), BX 18315 MOVQ n+40(FP), SI 18316 XORQ DI, DI 18317 XORQ R8, R8 18318 JMP check_limit_unroll 18319 PCALIGN $0x08 18320 NOP 18321 NOP 18322 NOP 18323 NOP 18324 18325 loop_unroll: 18326 MOVSS (AX)(DI*4), X1 18327 ADDQ CX, DI 18328 MOVSS (AX)(DI*4), X2 18329 ADDQ CX, DI 18330 MOVSS (AX)(DI*4), X3 18331 ADDQ CX, DI 18332 MOVSS (AX)(DI*4), X4 18333 ADDQ CX, DI 18334 MULSS X0, X1 18335 MULSS X0, X2 18336 MULSS X0, X3 18337 MULSS X0, X4 18338 ADDSS (DX)(R8*4), X1 18339 MOVSS X1, (DX)(R8*4) 18340 ADDQ BX, R8 18341 ADDSS (DX)(R8*4), X2 18342 MOVSS X2, (DX)(R8*4) 18343 ADDQ BX, R8 18344 ADDSS (DX)(R8*4), X3 18345 MOVSS X3, (DX)(R8*4) 18346 ADDQ BX, R8 18347 ADDSS (DX)(R8*4), X4 18348 MOVSS X4, (DX)(R8*4) 18349 ADDQ BX, R8 18350 SUBQ $0x04, SI 18351 18352 check_limit_unroll: 18353 CMPQ SI, $0x04 18354 JHS loop_unroll 18355 JMP check_limit 18356 18357 loop: 18358 MOVSS (AX)(DI*4), X1 18359 MULSS X0, X1 18360 ADDSS (DX)(R8*4), X1 18361 MOVSS X1, (DX)(R8*4) 18362 DECQ SI 18363 ADDQ CX, DI 18364 ADDQ BX, R8 18365 18366 check_limit: 18367 CMPQ SI, $0x00 18368 JHI loop 18369 RET 18370 18371 // func AmdAxpyUnsafeXInterleave_V0A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18372 // Requires: SSE 18373 TEXT ·AmdAxpyUnsafeXInterleave_V0A13R4(SB), NOSPLIT, $0-48 18374 MOVSS alpha+0(FP), X0 18375 MOVQ xs+8(FP), AX 18376 MOVQ incx+16(FP), CX 18377 MOVQ ys+24(FP), DX 18378 MOVQ incy+32(FP), BX 18379 MOVQ n+40(FP), SI 18380 XORQ DI, DI 18381 XORQ R8, R8 18382 JMP check_limit_unroll 18383 PCALIGN $0x08 18384 NOP 18385 NOP 18386 NOP 18387 NOP 18388 NOP 18389 18390 loop_unroll: 18391 MOVSS (AX)(DI*4), X1 18392 ADDQ CX, DI 18393 MOVSS (AX)(DI*4), X2 18394 ADDQ CX, DI 18395 MOVSS (AX)(DI*4), X3 18396 ADDQ CX, DI 18397 MOVSS (AX)(DI*4), X4 18398 ADDQ CX, DI 18399 MULSS X0, X1 18400 MULSS X0, X2 18401 MULSS X0, X3 18402 MULSS X0, X4 18403 ADDSS (DX)(R8*4), X1 18404 MOVSS X1, (DX)(R8*4) 18405 ADDQ BX, R8 18406 ADDSS (DX)(R8*4), X2 18407 MOVSS X2, (DX)(R8*4) 18408 ADDQ BX, R8 18409 ADDSS (DX)(R8*4), X3 18410 MOVSS X3, (DX)(R8*4) 18411 ADDQ BX, R8 18412 ADDSS (DX)(R8*4), X4 18413 MOVSS X4, (DX)(R8*4) 18414 ADDQ BX, R8 18415 SUBQ $0x04, SI 18416 18417 check_limit_unroll: 18418 CMPQ SI, $0x04 18419 JHS loop_unroll 18420 JMP check_limit 18421 18422 loop: 18423 MOVSS (AX)(DI*4), X1 18424 MULSS X0, X1 18425 ADDSS (DX)(R8*4), X1 18426 MOVSS X1, (DX)(R8*4) 18427 DECQ SI 18428 ADDQ CX, DI 18429 ADDQ BX, R8 18430 
18431 check_limit: 18432 CMPQ SI, $0x00 18433 JHI loop 18434 RET 18435 18436 // func AmdAxpyUnsafeXInterleave_V1A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18437 // Requires: SSE 18438 TEXT ·AmdAxpyUnsafeXInterleave_V1A13R4(SB), NOSPLIT, $0-48 18439 MOVSS alpha+0(FP), X0 18440 MOVQ xs+8(FP), AX 18441 MOVQ incx+16(FP), CX 18442 MOVQ ys+24(FP), DX 18443 MOVQ incy+32(FP), BX 18444 MOVQ n+40(FP), SI 18445 XORQ DI, DI 18446 XORQ R8, R8 18447 JMP check_limit_unroll 18448 PCALIGN $0x08 18449 NOP 18450 NOP 18451 NOP 18452 NOP 18453 NOP 18454 18455 loop_unroll: 18456 MOVSS (AX)(DI*4), X1 18457 ADDQ CX, DI 18458 MOVSS (AX)(DI*4), X2 18459 ADDQ CX, DI 18460 MOVSS (AX)(DI*4), X3 18461 ADDQ CX, DI 18462 MOVSS (AX)(DI*4), X4 18463 ADDQ CX, DI 18464 MULSS X0, X1 18465 MULSS X0, X2 18466 MULSS X0, X3 18467 MULSS X0, X4 18468 ADDSS (DX)(R8*4), X1 18469 MOVSS X1, (DX)(R8*4) 18470 ADDQ BX, R8 18471 ADDSS (DX)(R8*4), X2 18472 MOVSS X2, (DX)(R8*4) 18473 ADDQ BX, R8 18474 ADDSS (DX)(R8*4), X3 18475 MOVSS X3, (DX)(R8*4) 18476 ADDQ BX, R8 18477 ADDSS (DX)(R8*4), X4 18478 MOVSS X4, (DX)(R8*4) 18479 ADDQ BX, R8 18480 SUBQ $0x04, SI 18481 18482 check_limit_unroll: 18483 CMPQ SI, $0x04 18484 JHS loop_unroll 18485 JMP check_limit 18486 18487 loop: 18488 MOVSS (AX)(DI*4), X1 18489 MULSS X0, X1 18490 ADDSS (DX)(R8*4), X1 18491 MOVSS X1, (DX)(R8*4) 18492 DECQ SI 18493 ADDQ CX, DI 18494 ADDQ BX, R8 18495 18496 check_limit: 18497 CMPQ SI, $0x00 18498 JHI loop 18499 RET 18500 18501 // func AmdAxpyUnsafeXInterleave_V2A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18502 // Requires: SSE 18503 TEXT ·AmdAxpyUnsafeXInterleave_V2A13R4(SB), NOSPLIT, $0-48 18504 MOVSS alpha+0(FP), X0 18505 MOVQ xs+8(FP), AX 18506 MOVQ incx+16(FP), CX 18507 MOVQ ys+24(FP), DX 18508 MOVQ incy+32(FP), BX 18509 MOVQ n+40(FP), SI 18510 XORQ DI, DI 18511 XORQ R8, R8 18512 JMP check_limit_unroll 18513 PCALIGN $0x08 18514 NOP 18515 NOP 18516 NOP 18517 NOP 18518 NOP 18519 18520 loop_unroll: 18521 MOVSS (AX)(DI*4), X1 18522 ADDQ CX, DI 18523 MOVSS (AX)(DI*4), X2 18524 ADDQ CX, DI 18525 MOVSS (AX)(DI*4), X3 18526 ADDQ CX, DI 18527 MOVSS (AX)(DI*4), X4 18528 ADDQ CX, DI 18529 MULSS X0, X1 18530 MULSS X0, X2 18531 MULSS X0, X3 18532 MULSS X0, X4 18533 ADDSS (DX)(R8*4), X1 18534 MOVSS X1, (DX)(R8*4) 18535 ADDQ BX, R8 18536 ADDSS (DX)(R8*4), X2 18537 MOVSS X2, (DX)(R8*4) 18538 ADDQ BX, R8 18539 ADDSS (DX)(R8*4), X3 18540 MOVSS X3, (DX)(R8*4) 18541 ADDQ BX, R8 18542 ADDSS (DX)(R8*4), X4 18543 MOVSS X4, (DX)(R8*4) 18544 ADDQ BX, R8 18545 SUBQ $0x04, SI 18546 18547 check_limit_unroll: 18548 CMPQ SI, $0x04 18549 JHS loop_unroll 18550 JMP check_limit 18551 18552 loop: 18553 MOVSS (AX)(DI*4), X1 18554 MULSS X0, X1 18555 ADDSS (DX)(R8*4), X1 18556 MOVSS X1, (DX)(R8*4) 18557 DECQ SI 18558 ADDQ CX, DI 18559 ADDQ BX, R8 18560 18561 check_limit: 18562 CMPQ SI, $0x00 18563 JHI loop 18564 RET 18565 18566 // func AmdAxpyUnsafeXInterleave_V3A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18567 // Requires: SSE 18568 TEXT ·AmdAxpyUnsafeXInterleave_V3A13R4(SB), NOSPLIT, $0-48 18569 MOVSS alpha+0(FP), X0 18570 MOVQ xs+8(FP), AX 18571 MOVQ incx+16(FP), CX 18572 MOVQ ys+24(FP), DX 18573 MOVQ incy+32(FP), BX 18574 MOVQ n+40(FP), SI 18575 XORQ DI, DI 18576 XORQ R8, R8 18577 JMP check_limit_unroll 18578 PCALIGN $0x08 18579 NOP 18580 NOP 18581 NOP 18582 NOP 18583 NOP 18584 18585 loop_unroll: 18586 MOVSS (AX)(DI*4), X1 18587 ADDQ CX, DI 18588 MOVSS (AX)(DI*4), X2 18589 
ADDQ CX, DI 18590 MOVSS (AX)(DI*4), X3 18591 ADDQ CX, DI 18592 MOVSS (AX)(DI*4), X4 18593 ADDQ CX, DI 18594 MULSS X0, X1 18595 MULSS X0, X2 18596 MULSS X0, X3 18597 MULSS X0, X4 18598 ADDSS (DX)(R8*4), X1 18599 MOVSS X1, (DX)(R8*4) 18600 ADDQ BX, R8 18601 ADDSS (DX)(R8*4), X2 18602 MOVSS X2, (DX)(R8*4) 18603 ADDQ BX, R8 18604 ADDSS (DX)(R8*4), X3 18605 MOVSS X3, (DX)(R8*4) 18606 ADDQ BX, R8 18607 ADDSS (DX)(R8*4), X4 18608 MOVSS X4, (DX)(R8*4) 18609 ADDQ BX, R8 18610 SUBQ $0x04, SI 18611 18612 check_limit_unroll: 18613 CMPQ SI, $0x04 18614 JHS loop_unroll 18615 JMP check_limit 18616 18617 loop: 18618 MOVSS (AX)(DI*4), X1 18619 MULSS X0, X1 18620 ADDSS (DX)(R8*4), X1 18621 MOVSS X1, (DX)(R8*4) 18622 DECQ SI 18623 ADDQ CX, DI 18624 ADDQ BX, R8 18625 18626 check_limit: 18627 CMPQ SI, $0x00 18628 JHI loop 18629 RET 18630 18631 // func AmdAxpyUnsafeXInterleave_V4A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18632 // Requires: SSE 18633 TEXT ·AmdAxpyUnsafeXInterleave_V4A13R4(SB), NOSPLIT, $0-48 18634 MOVSS alpha+0(FP), X0 18635 MOVQ xs+8(FP), AX 18636 MOVQ incx+16(FP), CX 18637 MOVQ ys+24(FP), DX 18638 MOVQ incy+32(FP), BX 18639 MOVQ n+40(FP), SI 18640 XORQ DI, DI 18641 XORQ R8, R8 18642 JMP check_limit_unroll 18643 PCALIGN $0x08 18644 NOP 18645 NOP 18646 NOP 18647 NOP 18648 NOP 18649 18650 loop_unroll: 18651 MOVSS (AX)(DI*4), X1 18652 ADDQ CX, DI 18653 MOVSS (AX)(DI*4), X2 18654 ADDQ CX, DI 18655 MOVSS (AX)(DI*4), X3 18656 ADDQ CX, DI 18657 MOVSS (AX)(DI*4), X4 18658 ADDQ CX, DI 18659 MULSS X0, X1 18660 MULSS X0, X2 18661 MULSS X0, X3 18662 MULSS X0, X4 18663 ADDSS (DX)(R8*4), X1 18664 MOVSS X1, (DX)(R8*4) 18665 ADDQ BX, R8 18666 ADDSS (DX)(R8*4), X2 18667 MOVSS X2, (DX)(R8*4) 18668 ADDQ BX, R8 18669 ADDSS (DX)(R8*4), X3 18670 MOVSS X3, (DX)(R8*4) 18671 ADDQ BX, R8 18672 ADDSS (DX)(R8*4), X4 18673 MOVSS X4, (DX)(R8*4) 18674 ADDQ BX, R8 18675 SUBQ $0x04, SI 18676 18677 check_limit_unroll: 18678 CMPQ SI, $0x04 18679 JHS loop_unroll 18680 JMP check_limit 18681 18682 loop: 18683 MOVSS (AX)(DI*4), X1 18684 MULSS X0, X1 18685 ADDSS (DX)(R8*4), X1 18686 MOVSS X1, (DX)(R8*4) 18687 DECQ SI 18688 ADDQ CX, DI 18689 ADDQ BX, R8 18690 18691 check_limit: 18692 CMPQ SI, $0x00 18693 JHI loop 18694 RET 18695 18696 // func AmdAxpyUnsafeXInterleave_V5A13R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18697 // Requires: SSE 18698 TEXT ·AmdAxpyUnsafeXInterleave_V5A13R4(SB), NOSPLIT, $0-48 18699 MOVSS alpha+0(FP), X0 18700 MOVQ xs+8(FP), AX 18701 MOVQ incx+16(FP), CX 18702 MOVQ ys+24(FP), DX 18703 MOVQ incy+32(FP), BX 18704 MOVQ n+40(FP), SI 18705 XORQ DI, DI 18706 XORQ R8, R8 18707 JMP check_limit_unroll 18708 PCALIGN $0x08 18709 NOP 18710 NOP 18711 NOP 18712 NOP 18713 NOP 18714 18715 loop_unroll: 18716 MOVSS (AX)(DI*4), X1 18717 ADDQ CX, DI 18718 MOVSS (AX)(DI*4), X2 18719 ADDQ CX, DI 18720 MOVSS (AX)(DI*4), X3 18721 ADDQ CX, DI 18722 MOVSS (AX)(DI*4), X4 18723 ADDQ CX, DI 18724 MULSS X0, X1 18725 MULSS X0, X2 18726 MULSS X0, X3 18727 MULSS X0, X4 18728 ADDSS (DX)(R8*4), X1 18729 MOVSS X1, (DX)(R8*4) 18730 ADDQ BX, R8 18731 ADDSS (DX)(R8*4), X2 18732 MOVSS X2, (DX)(R8*4) 18733 ADDQ BX, R8 18734 ADDSS (DX)(R8*4), X3 18735 MOVSS X3, (DX)(R8*4) 18736 ADDQ BX, R8 18737 ADDSS (DX)(R8*4), X4 18738 MOVSS X4, (DX)(R8*4) 18739 ADDQ BX, R8 18740 SUBQ $0x04, SI 18741 18742 check_limit_unroll: 18743 CMPQ SI, $0x04 18744 JHS loop_unroll 18745 JMP check_limit 18746 18747 loop: 18748 MOVSS (AX)(DI*4), X1 18749 MULSS X0, X1 18750 ADDSS (DX)(R8*4), X1 18751 
MOVSS X1, (DX)(R8*4) 18752 DECQ SI 18753 ADDQ CX, DI 18754 ADDQ BX, R8 18755 18756 check_limit: 18757 CMPQ SI, $0x00 18758 JHI loop 18759 RET 18760 18761 // func AmdAxpyUnsafeXInterleave_V0A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18762 // Requires: SSE 18763 TEXT ·AmdAxpyUnsafeXInterleave_V0A14R4(SB), NOSPLIT, $0-48 18764 MOVSS alpha+0(FP), X0 18765 MOVQ xs+8(FP), AX 18766 MOVQ incx+16(FP), CX 18767 MOVQ ys+24(FP), DX 18768 MOVQ incy+32(FP), BX 18769 MOVQ n+40(FP), SI 18770 XORQ DI, DI 18771 XORQ R8, R8 18772 JMP check_limit_unroll 18773 PCALIGN $0x08 18774 NOP 18775 NOP 18776 NOP 18777 NOP 18778 NOP 18779 NOP 18780 18781 loop_unroll: 18782 MOVSS (AX)(DI*4), X1 18783 ADDQ CX, DI 18784 MOVSS (AX)(DI*4), X2 18785 ADDQ CX, DI 18786 MOVSS (AX)(DI*4), X3 18787 ADDQ CX, DI 18788 MOVSS (AX)(DI*4), X4 18789 ADDQ CX, DI 18790 MULSS X0, X1 18791 MULSS X0, X2 18792 MULSS X0, X3 18793 MULSS X0, X4 18794 ADDSS (DX)(R8*4), X1 18795 MOVSS X1, (DX)(R8*4) 18796 ADDQ BX, R8 18797 ADDSS (DX)(R8*4), X2 18798 MOVSS X2, (DX)(R8*4) 18799 ADDQ BX, R8 18800 ADDSS (DX)(R8*4), X3 18801 MOVSS X3, (DX)(R8*4) 18802 ADDQ BX, R8 18803 ADDSS (DX)(R8*4), X4 18804 MOVSS X4, (DX)(R8*4) 18805 ADDQ BX, R8 18806 SUBQ $0x04, SI 18807 18808 check_limit_unroll: 18809 CMPQ SI, $0x04 18810 JHS loop_unroll 18811 JMP check_limit 18812 18813 loop: 18814 MOVSS (AX)(DI*4), X1 18815 MULSS X0, X1 18816 ADDSS (DX)(R8*4), X1 18817 MOVSS X1, (DX)(R8*4) 18818 DECQ SI 18819 ADDQ CX, DI 18820 ADDQ BX, R8 18821 18822 check_limit: 18823 CMPQ SI, $0x00 18824 JHI loop 18825 RET 18826 18827 // func AmdAxpyUnsafeXInterleave_V1A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18828 // Requires: SSE 18829 TEXT ·AmdAxpyUnsafeXInterleave_V1A14R4(SB), NOSPLIT, $0-48 18830 MOVSS alpha+0(FP), X0 18831 MOVQ xs+8(FP), AX 18832 MOVQ incx+16(FP), CX 18833 MOVQ ys+24(FP), DX 18834 MOVQ incy+32(FP), BX 18835 MOVQ n+40(FP), SI 18836 XORQ DI, DI 18837 XORQ R8, R8 18838 JMP check_limit_unroll 18839 PCALIGN $0x08 18840 NOP 18841 NOP 18842 NOP 18843 NOP 18844 NOP 18845 NOP 18846 18847 loop_unroll: 18848 MOVSS (AX)(DI*4), X1 18849 ADDQ CX, DI 18850 MOVSS (AX)(DI*4), X2 18851 ADDQ CX, DI 18852 MOVSS (AX)(DI*4), X3 18853 ADDQ CX, DI 18854 MOVSS (AX)(DI*4), X4 18855 ADDQ CX, DI 18856 MULSS X0, X1 18857 MULSS X0, X2 18858 MULSS X0, X3 18859 MULSS X0, X4 18860 ADDSS (DX)(R8*4), X1 18861 MOVSS X1, (DX)(R8*4) 18862 ADDQ BX, R8 18863 ADDSS (DX)(R8*4), X2 18864 MOVSS X2, (DX)(R8*4) 18865 ADDQ BX, R8 18866 ADDSS (DX)(R8*4), X3 18867 MOVSS X3, (DX)(R8*4) 18868 ADDQ BX, R8 18869 ADDSS (DX)(R8*4), X4 18870 MOVSS X4, (DX)(R8*4) 18871 ADDQ BX, R8 18872 SUBQ $0x04, SI 18873 18874 check_limit_unroll: 18875 CMPQ SI, $0x04 18876 JHS loop_unroll 18877 JMP check_limit 18878 18879 loop: 18880 MOVSS (AX)(DI*4), X1 18881 MULSS X0, X1 18882 ADDSS (DX)(R8*4), X1 18883 MOVSS X1, (DX)(R8*4) 18884 DECQ SI 18885 ADDQ CX, DI 18886 ADDQ BX, R8 18887 18888 check_limit: 18889 CMPQ SI, $0x00 18890 JHI loop 18891 RET 18892 18893 // func AmdAxpyUnsafeXInterleave_V2A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18894 // Requires: SSE 18895 TEXT ·AmdAxpyUnsafeXInterleave_V2A14R4(SB), NOSPLIT, $0-48 18896 MOVSS alpha+0(FP), X0 18897 MOVQ xs+8(FP), AX 18898 MOVQ incx+16(FP), CX 18899 MOVQ ys+24(FP), DX 18900 MOVQ incy+32(FP), BX 18901 MOVQ n+40(FP), SI 18902 XORQ DI, DI 18903 XORQ R8, R8 18904 JMP check_limit_unroll 18905 PCALIGN $0x08 18906 NOP 18907 NOP 18908 NOP 18909 NOP 18910 NOP 18911 
NOP 18912 18913 loop_unroll: 18914 MOVSS (AX)(DI*4), X1 18915 ADDQ CX, DI 18916 MOVSS (AX)(DI*4), X2 18917 ADDQ CX, DI 18918 MOVSS (AX)(DI*4), X3 18919 ADDQ CX, DI 18920 MOVSS (AX)(DI*4), X4 18921 ADDQ CX, DI 18922 MULSS X0, X1 18923 MULSS X0, X2 18924 MULSS X0, X3 18925 MULSS X0, X4 18926 ADDSS (DX)(R8*4), X1 18927 MOVSS X1, (DX)(R8*4) 18928 ADDQ BX, R8 18929 ADDSS (DX)(R8*4), X2 18930 MOVSS X2, (DX)(R8*4) 18931 ADDQ BX, R8 18932 ADDSS (DX)(R8*4), X3 18933 MOVSS X3, (DX)(R8*4) 18934 ADDQ BX, R8 18935 ADDSS (DX)(R8*4), X4 18936 MOVSS X4, (DX)(R8*4) 18937 ADDQ BX, R8 18938 SUBQ $0x04, SI 18939 18940 check_limit_unroll: 18941 CMPQ SI, $0x04 18942 JHS loop_unroll 18943 JMP check_limit 18944 18945 loop: 18946 MOVSS (AX)(DI*4), X1 18947 MULSS X0, X1 18948 ADDSS (DX)(R8*4), X1 18949 MOVSS X1, (DX)(R8*4) 18950 DECQ SI 18951 ADDQ CX, DI 18952 ADDQ BX, R8 18953 18954 check_limit: 18955 CMPQ SI, $0x00 18956 JHI loop 18957 RET 18958 18959 // func AmdAxpyUnsafeXInterleave_V3A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 18960 // Requires: SSE 18961 TEXT ·AmdAxpyUnsafeXInterleave_V3A14R4(SB), NOSPLIT, $0-48 18962 MOVSS alpha+0(FP), X0 18963 MOVQ xs+8(FP), AX 18964 MOVQ incx+16(FP), CX 18965 MOVQ ys+24(FP), DX 18966 MOVQ incy+32(FP), BX 18967 MOVQ n+40(FP), SI 18968 XORQ DI, DI 18969 XORQ R8, R8 18970 JMP check_limit_unroll 18971 PCALIGN $0x08 18972 NOP 18973 NOP 18974 NOP 18975 NOP 18976 NOP 18977 NOP 18978 18979 loop_unroll: 18980 MOVSS (AX)(DI*4), X1 18981 ADDQ CX, DI 18982 MOVSS (AX)(DI*4), X2 18983 ADDQ CX, DI 18984 MOVSS (AX)(DI*4), X3 18985 ADDQ CX, DI 18986 MOVSS (AX)(DI*4), X4 18987 ADDQ CX, DI 18988 MULSS X0, X1 18989 MULSS X0, X2 18990 MULSS X0, X3 18991 MULSS X0, X4 18992 ADDSS (DX)(R8*4), X1 18993 MOVSS X1, (DX)(R8*4) 18994 ADDQ BX, R8 18995 ADDSS (DX)(R8*4), X2 18996 MOVSS X2, (DX)(R8*4) 18997 ADDQ BX, R8 18998 ADDSS (DX)(R8*4), X3 18999 MOVSS X3, (DX)(R8*4) 19000 ADDQ BX, R8 19001 ADDSS (DX)(R8*4), X4 19002 MOVSS X4, (DX)(R8*4) 19003 ADDQ BX, R8 19004 SUBQ $0x04, SI 19005 19006 check_limit_unroll: 19007 CMPQ SI, $0x04 19008 JHS loop_unroll 19009 JMP check_limit 19010 19011 loop: 19012 MOVSS (AX)(DI*4), X1 19013 MULSS X0, X1 19014 ADDSS (DX)(R8*4), X1 19015 MOVSS X1, (DX)(R8*4) 19016 DECQ SI 19017 ADDQ CX, DI 19018 ADDQ BX, R8 19019 19020 check_limit: 19021 CMPQ SI, $0x00 19022 JHI loop 19023 RET 19024 19025 // func AmdAxpyUnsafeXInterleave_V4A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19026 // Requires: SSE 19027 TEXT ·AmdAxpyUnsafeXInterleave_V4A14R4(SB), NOSPLIT, $0-48 19028 MOVSS alpha+0(FP), X0 19029 MOVQ xs+8(FP), AX 19030 MOVQ incx+16(FP), CX 19031 MOVQ ys+24(FP), DX 19032 MOVQ incy+32(FP), BX 19033 MOVQ n+40(FP), SI 19034 XORQ DI, DI 19035 XORQ R8, R8 19036 JMP check_limit_unroll 19037 PCALIGN $0x08 19038 NOP 19039 NOP 19040 NOP 19041 NOP 19042 NOP 19043 NOP 19044 19045 loop_unroll: 19046 MOVSS (AX)(DI*4), X1 19047 ADDQ CX, DI 19048 MOVSS (AX)(DI*4), X2 19049 ADDQ CX, DI 19050 MOVSS (AX)(DI*4), X3 19051 ADDQ CX, DI 19052 MOVSS (AX)(DI*4), X4 19053 ADDQ CX, DI 19054 MULSS X0, X1 19055 MULSS X0, X2 19056 MULSS X0, X3 19057 MULSS X0, X4 19058 ADDSS (DX)(R8*4), X1 19059 MOVSS X1, (DX)(R8*4) 19060 ADDQ BX, R8 19061 ADDSS (DX)(R8*4), X2 19062 MOVSS X2, (DX)(R8*4) 19063 ADDQ BX, R8 19064 ADDSS (DX)(R8*4), X3 19065 MOVSS X3, (DX)(R8*4) 19066 ADDQ BX, R8 19067 ADDSS (DX)(R8*4), X4 19068 MOVSS X4, (DX)(R8*4) 19069 ADDQ BX, R8 19070 SUBQ $0x04, SI 19071 19072 check_limit_unroll: 19073 CMPQ SI, $0x04 19074 JHS 
loop_unroll 19075 JMP check_limit 19076 19077 loop: 19078 MOVSS (AX)(DI*4), X1 19079 MULSS X0, X1 19080 ADDSS (DX)(R8*4), X1 19081 MOVSS X1, (DX)(R8*4) 19082 DECQ SI 19083 ADDQ CX, DI 19084 ADDQ BX, R8 19085 19086 check_limit: 19087 CMPQ SI, $0x00 19088 JHI loop 19089 RET 19090 19091 // func AmdAxpyUnsafeXInterleave_V5A14R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19092 // Requires: SSE 19093 TEXT ·AmdAxpyUnsafeXInterleave_V5A14R4(SB), NOSPLIT, $0-48 19094 MOVSS alpha+0(FP), X0 19095 MOVQ xs+8(FP), AX 19096 MOVQ incx+16(FP), CX 19097 MOVQ ys+24(FP), DX 19098 MOVQ incy+32(FP), BX 19099 MOVQ n+40(FP), SI 19100 XORQ DI, DI 19101 XORQ R8, R8 19102 JMP check_limit_unroll 19103 PCALIGN $0x08 19104 NOP 19105 NOP 19106 NOP 19107 NOP 19108 NOP 19109 NOP 19110 19111 loop_unroll: 19112 MOVSS (AX)(DI*4), X1 19113 ADDQ CX, DI 19114 MOVSS (AX)(DI*4), X2 19115 ADDQ CX, DI 19116 MOVSS (AX)(DI*4), X3 19117 ADDQ CX, DI 19118 MOVSS (AX)(DI*4), X4 19119 ADDQ CX, DI 19120 MULSS X0, X1 19121 MULSS X0, X2 19122 MULSS X0, X3 19123 MULSS X0, X4 19124 ADDSS (DX)(R8*4), X1 19125 MOVSS X1, (DX)(R8*4) 19126 ADDQ BX, R8 19127 ADDSS (DX)(R8*4), X2 19128 MOVSS X2, (DX)(R8*4) 19129 ADDQ BX, R8 19130 ADDSS (DX)(R8*4), X3 19131 MOVSS X3, (DX)(R8*4) 19132 ADDQ BX, R8 19133 ADDSS (DX)(R8*4), X4 19134 MOVSS X4, (DX)(R8*4) 19135 ADDQ BX, R8 19136 SUBQ $0x04, SI 19137 19138 check_limit_unroll: 19139 CMPQ SI, $0x04 19140 JHS loop_unroll 19141 JMP check_limit 19142 19143 loop: 19144 MOVSS (AX)(DI*4), X1 19145 MULSS X0, X1 19146 ADDSS (DX)(R8*4), X1 19147 MOVSS X1, (DX)(R8*4) 19148 DECQ SI 19149 ADDQ CX, DI 19150 ADDQ BX, R8 19151 19152 check_limit: 19153 CMPQ SI, $0x00 19154 JHI loop 19155 RET 19156 19157 // func AmdAxpyUnsafeXInterleave_V0A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19158 // Requires: SSE 19159 TEXT ·AmdAxpyUnsafeXInterleave_V0A15R4(SB), NOSPLIT, $0-48 19160 MOVSS alpha+0(FP), X0 19161 MOVQ xs+8(FP), AX 19162 MOVQ incx+16(FP), CX 19163 MOVQ ys+24(FP), DX 19164 MOVQ incy+32(FP), BX 19165 MOVQ n+40(FP), SI 19166 XORQ DI, DI 19167 XORQ R8, R8 19168 JMP check_limit_unroll 19169 PCALIGN $0x08 19170 NOP 19171 NOP 19172 NOP 19173 NOP 19174 NOP 19175 NOP 19176 NOP 19177 19178 loop_unroll: 19179 MOVSS (AX)(DI*4), X1 19180 ADDQ CX, DI 19181 MOVSS (AX)(DI*4), X2 19182 ADDQ CX, DI 19183 MOVSS (AX)(DI*4), X3 19184 ADDQ CX, DI 19185 MOVSS (AX)(DI*4), X4 19186 ADDQ CX, DI 19187 MULSS X0, X1 19188 MULSS X0, X2 19189 MULSS X0, X3 19190 MULSS X0, X4 19191 ADDSS (DX)(R8*4), X1 19192 MOVSS X1, (DX)(R8*4) 19193 ADDQ BX, R8 19194 ADDSS (DX)(R8*4), X2 19195 MOVSS X2, (DX)(R8*4) 19196 ADDQ BX, R8 19197 ADDSS (DX)(R8*4), X3 19198 MOVSS X3, (DX)(R8*4) 19199 ADDQ BX, R8 19200 ADDSS (DX)(R8*4), X4 19201 MOVSS X4, (DX)(R8*4) 19202 ADDQ BX, R8 19203 SUBQ $0x04, SI 19204 19205 check_limit_unroll: 19206 CMPQ SI, $0x04 19207 JHS loop_unroll 19208 JMP check_limit 19209 19210 loop: 19211 MOVSS (AX)(DI*4), X1 19212 MULSS X0, X1 19213 ADDSS (DX)(R8*4), X1 19214 MOVSS X1, (DX)(R8*4) 19215 DECQ SI 19216 ADDQ CX, DI 19217 ADDQ BX, R8 19218 19219 check_limit: 19220 CMPQ SI, $0x00 19221 JHI loop 19222 RET 19223 19224 // func AmdAxpyUnsafeXInterleave_V1A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19225 // Requires: SSE 19226 TEXT ·AmdAxpyUnsafeXInterleave_V1A15R4(SB), NOSPLIT, $0-48 19227 MOVSS alpha+0(FP), X0 19228 MOVQ xs+8(FP), AX 19229 MOVQ incx+16(FP), CX 19230 MOVQ ys+24(FP), DX 19231 MOVQ incy+32(FP), BX 19232 MOVQ n+40(FP), SI 
19233 XORQ DI, DI 19234 XORQ R8, R8 19235 JMP check_limit_unroll 19236 PCALIGN $0x08 19237 NOP 19238 NOP 19239 NOP 19240 NOP 19241 NOP 19242 NOP 19243 NOP 19244 19245 loop_unroll: 19246 MOVSS (AX)(DI*4), X1 19247 ADDQ CX, DI 19248 MOVSS (AX)(DI*4), X2 19249 ADDQ CX, DI 19250 MOVSS (AX)(DI*4), X3 19251 ADDQ CX, DI 19252 MOVSS (AX)(DI*4), X4 19253 ADDQ CX, DI 19254 MULSS X0, X1 19255 MULSS X0, X2 19256 MULSS X0, X3 19257 MULSS X0, X4 19258 ADDSS (DX)(R8*4), X1 19259 MOVSS X1, (DX)(R8*4) 19260 ADDQ BX, R8 19261 ADDSS (DX)(R8*4), X2 19262 MOVSS X2, (DX)(R8*4) 19263 ADDQ BX, R8 19264 ADDSS (DX)(R8*4), X3 19265 MOVSS X3, (DX)(R8*4) 19266 ADDQ BX, R8 19267 ADDSS (DX)(R8*4), X4 19268 MOVSS X4, (DX)(R8*4) 19269 ADDQ BX, R8 19270 SUBQ $0x04, SI 19271 19272 check_limit_unroll: 19273 CMPQ SI, $0x04 19274 JHS loop_unroll 19275 JMP check_limit 19276 19277 loop: 19278 MOVSS (AX)(DI*4), X1 19279 MULSS X0, X1 19280 ADDSS (DX)(R8*4), X1 19281 MOVSS X1, (DX)(R8*4) 19282 DECQ SI 19283 ADDQ CX, DI 19284 ADDQ BX, R8 19285 19286 check_limit: 19287 CMPQ SI, $0x00 19288 JHI loop 19289 RET 19290 19291 // func AmdAxpyUnsafeXInterleave_V2A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19292 // Requires: SSE 19293 TEXT ·AmdAxpyUnsafeXInterleave_V2A15R4(SB), NOSPLIT, $0-48 19294 MOVSS alpha+0(FP), X0 19295 MOVQ xs+8(FP), AX 19296 MOVQ incx+16(FP), CX 19297 MOVQ ys+24(FP), DX 19298 MOVQ incy+32(FP), BX 19299 MOVQ n+40(FP), SI 19300 XORQ DI, DI 19301 XORQ R8, R8 19302 JMP check_limit_unroll 19303 PCALIGN $0x08 19304 NOP 19305 NOP 19306 NOP 19307 NOP 19308 NOP 19309 NOP 19310 NOP 19311 19312 loop_unroll: 19313 MOVSS (AX)(DI*4), X1 19314 ADDQ CX, DI 19315 MOVSS (AX)(DI*4), X2 19316 ADDQ CX, DI 19317 MOVSS (AX)(DI*4), X3 19318 ADDQ CX, DI 19319 MOVSS (AX)(DI*4), X4 19320 ADDQ CX, DI 19321 MULSS X0, X1 19322 MULSS X0, X2 19323 MULSS X0, X3 19324 MULSS X0, X4 19325 ADDSS (DX)(R8*4), X1 19326 MOVSS X1, (DX)(R8*4) 19327 ADDQ BX, R8 19328 ADDSS (DX)(R8*4), X2 19329 MOVSS X2, (DX)(R8*4) 19330 ADDQ BX, R8 19331 ADDSS (DX)(R8*4), X3 19332 MOVSS X3, (DX)(R8*4) 19333 ADDQ BX, R8 19334 ADDSS (DX)(R8*4), X4 19335 MOVSS X4, (DX)(R8*4) 19336 ADDQ BX, R8 19337 SUBQ $0x04, SI 19338 19339 check_limit_unroll: 19340 CMPQ SI, $0x04 19341 JHS loop_unroll 19342 JMP check_limit 19343 19344 loop: 19345 MOVSS (AX)(DI*4), X1 19346 MULSS X0, X1 19347 ADDSS (DX)(R8*4), X1 19348 MOVSS X1, (DX)(R8*4) 19349 DECQ SI 19350 ADDQ CX, DI 19351 ADDQ BX, R8 19352 19353 check_limit: 19354 CMPQ SI, $0x00 19355 JHI loop 19356 RET 19357 19358 // func AmdAxpyUnsafeXInterleave_V3A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19359 // Requires: SSE 19360 TEXT ·AmdAxpyUnsafeXInterleave_V3A15R4(SB), NOSPLIT, $0-48 19361 MOVSS alpha+0(FP), X0 19362 MOVQ xs+8(FP), AX 19363 MOVQ incx+16(FP), CX 19364 MOVQ ys+24(FP), DX 19365 MOVQ incy+32(FP), BX 19366 MOVQ n+40(FP), SI 19367 XORQ DI, DI 19368 XORQ R8, R8 19369 JMP check_limit_unroll 19370 PCALIGN $0x08 19371 NOP 19372 NOP 19373 NOP 19374 NOP 19375 NOP 19376 NOP 19377 NOP 19378 19379 loop_unroll: 19380 MOVSS (AX)(DI*4), X1 19381 ADDQ CX, DI 19382 MOVSS (AX)(DI*4), X2 19383 ADDQ CX, DI 19384 MOVSS (AX)(DI*4), X3 19385 ADDQ CX, DI 19386 MOVSS (AX)(DI*4), X4 19387 ADDQ CX, DI 19388 MULSS X0, X1 19389 MULSS X0, X2 19390 MULSS X0, X3 19391 MULSS X0, X4 19392 ADDSS (DX)(R8*4), X1 19393 MOVSS X1, (DX)(R8*4) 19394 ADDQ BX, R8 19395 ADDSS (DX)(R8*4), X2 19396 MOVSS X2, (DX)(R8*4) 19397 ADDQ BX, R8 19398 ADDSS (DX)(R8*4), X3 19399 MOVSS X3, (DX)(R8*4) 19400 
ADDQ BX, R8 19401 ADDSS (DX)(R8*4), X4 19402 MOVSS X4, (DX)(R8*4) 19403 ADDQ BX, R8 19404 SUBQ $0x04, SI 19405 19406 check_limit_unroll: 19407 CMPQ SI, $0x04 19408 JHS loop_unroll 19409 JMP check_limit 19410 19411 loop: 19412 MOVSS (AX)(DI*4), X1 19413 MULSS X0, X1 19414 ADDSS (DX)(R8*4), X1 19415 MOVSS X1, (DX)(R8*4) 19416 DECQ SI 19417 ADDQ CX, DI 19418 ADDQ BX, R8 19419 19420 check_limit: 19421 CMPQ SI, $0x00 19422 JHI loop 19423 RET 19424 19425 // func AmdAxpyUnsafeXInterleave_V4A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19426 // Requires: SSE 19427 TEXT ·AmdAxpyUnsafeXInterleave_V4A15R4(SB), NOSPLIT, $0-48 19428 MOVSS alpha+0(FP), X0 19429 MOVQ xs+8(FP), AX 19430 MOVQ incx+16(FP), CX 19431 MOVQ ys+24(FP), DX 19432 MOVQ incy+32(FP), BX 19433 MOVQ n+40(FP), SI 19434 XORQ DI, DI 19435 XORQ R8, R8 19436 JMP check_limit_unroll 19437 PCALIGN $0x08 19438 NOP 19439 NOP 19440 NOP 19441 NOP 19442 NOP 19443 NOP 19444 NOP 19445 19446 loop_unroll: 19447 MOVSS (AX)(DI*4), X1 19448 ADDQ CX, DI 19449 MOVSS (AX)(DI*4), X2 19450 ADDQ CX, DI 19451 MOVSS (AX)(DI*4), X3 19452 ADDQ CX, DI 19453 MOVSS (AX)(DI*4), X4 19454 ADDQ CX, DI 19455 MULSS X0, X1 19456 MULSS X0, X2 19457 MULSS X0, X3 19458 MULSS X0, X4 19459 ADDSS (DX)(R8*4), X1 19460 MOVSS X1, (DX)(R8*4) 19461 ADDQ BX, R8 19462 ADDSS (DX)(R8*4), X2 19463 MOVSS X2, (DX)(R8*4) 19464 ADDQ BX, R8 19465 ADDSS (DX)(R8*4), X3 19466 MOVSS X3, (DX)(R8*4) 19467 ADDQ BX, R8 19468 ADDSS (DX)(R8*4), X4 19469 MOVSS X4, (DX)(R8*4) 19470 ADDQ BX, R8 19471 SUBQ $0x04, SI 19472 19473 check_limit_unroll: 19474 CMPQ SI, $0x04 19475 JHS loop_unroll 19476 JMP check_limit 19477 19478 loop: 19479 MOVSS (AX)(DI*4), X1 19480 MULSS X0, X1 19481 ADDSS (DX)(R8*4), X1 19482 MOVSS X1, (DX)(R8*4) 19483 DECQ SI 19484 ADDQ CX, DI 19485 ADDQ BX, R8 19486 19487 check_limit: 19488 CMPQ SI, $0x00 19489 JHI loop 19490 RET 19491 19492 // func AmdAxpyUnsafeXInterleave_V5A15R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19493 // Requires: SSE 19494 TEXT ·AmdAxpyUnsafeXInterleave_V5A15R4(SB), NOSPLIT, $0-48 19495 MOVSS alpha+0(FP), X0 19496 MOVQ xs+8(FP), AX 19497 MOVQ incx+16(FP), CX 19498 MOVQ ys+24(FP), DX 19499 MOVQ incy+32(FP), BX 19500 MOVQ n+40(FP), SI 19501 XORQ DI, DI 19502 XORQ R8, R8 19503 JMP check_limit_unroll 19504 PCALIGN $0x08 19505 NOP 19506 NOP 19507 NOP 19508 NOP 19509 NOP 19510 NOP 19511 NOP 19512 19513 loop_unroll: 19514 MOVSS (AX)(DI*4), X1 19515 ADDQ CX, DI 19516 MOVSS (AX)(DI*4), X2 19517 ADDQ CX, DI 19518 MOVSS (AX)(DI*4), X3 19519 ADDQ CX, DI 19520 MOVSS (AX)(DI*4), X4 19521 ADDQ CX, DI 19522 MULSS X0, X1 19523 MULSS X0, X2 19524 MULSS X0, X3 19525 MULSS X0, X4 19526 ADDSS (DX)(R8*4), X1 19527 MOVSS X1, (DX)(R8*4) 19528 ADDQ BX, R8 19529 ADDSS (DX)(R8*4), X2 19530 MOVSS X2, (DX)(R8*4) 19531 ADDQ BX, R8 19532 ADDSS (DX)(R8*4), X3 19533 MOVSS X3, (DX)(R8*4) 19534 ADDQ BX, R8 19535 ADDSS (DX)(R8*4), X4 19536 MOVSS X4, (DX)(R8*4) 19537 ADDQ BX, R8 19538 SUBQ $0x04, SI 19539 19540 check_limit_unroll: 19541 CMPQ SI, $0x04 19542 JHS loop_unroll 19543 JMP check_limit 19544 19545 loop: 19546 MOVSS (AX)(DI*4), X1 19547 MULSS X0, X1 19548 ADDSS (DX)(R8*4), X1 19549 MOVSS X1, (DX)(R8*4) 19550 DECQ SI 19551 ADDQ CX, DI 19552 ADDQ BX, R8 19553 19554 check_limit: 19555 CMPQ SI, $0x00 19556 JHI loop 19557 RET 19558 19559 // func AmdAxpyUnsafeXInterleave_V0A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19560 // Requires: SSE 19561 TEXT 
·AmdAxpyUnsafeXInterleave_V0A16R4(SB), NOSPLIT, $0-48 19562 MOVSS alpha+0(FP), X0 19563 MOVQ xs+8(FP), AX 19564 MOVQ incx+16(FP), CX 19565 MOVQ ys+24(FP), DX 19566 MOVQ incy+32(FP), BX 19567 MOVQ n+40(FP), SI 19568 XORQ DI, DI 19569 XORQ R8, R8 19570 JMP check_limit_unroll 19571 PCALIGN $0x10 19572 19573 loop_unroll: 19574 MOVSS (AX)(DI*4), X1 19575 ADDQ CX, DI 19576 MOVSS (AX)(DI*4), X2 19577 ADDQ CX, DI 19578 MOVSS (AX)(DI*4), X3 19579 ADDQ CX, DI 19580 MOVSS (AX)(DI*4), X4 19581 ADDQ CX, DI 19582 MULSS X0, X1 19583 MULSS X0, X2 19584 MULSS X0, X3 19585 MULSS X0, X4 19586 ADDSS (DX)(R8*4), X1 19587 MOVSS X1, (DX)(R8*4) 19588 ADDQ BX, R8 19589 ADDSS (DX)(R8*4), X2 19590 MOVSS X2, (DX)(R8*4) 19591 ADDQ BX, R8 19592 ADDSS (DX)(R8*4), X3 19593 MOVSS X3, (DX)(R8*4) 19594 ADDQ BX, R8 19595 ADDSS (DX)(R8*4), X4 19596 MOVSS X4, (DX)(R8*4) 19597 ADDQ BX, R8 19598 SUBQ $0x04, SI 19599 19600 check_limit_unroll: 19601 CMPQ SI, $0x04 19602 JHS loop_unroll 19603 JMP check_limit 19604 19605 loop: 19606 MOVSS (AX)(DI*4), X1 19607 MULSS X0, X1 19608 ADDSS (DX)(R8*4), X1 19609 MOVSS X1, (DX)(R8*4) 19610 DECQ SI 19611 ADDQ CX, DI 19612 ADDQ BX, R8 19613 19614 check_limit: 19615 CMPQ SI, $0x00 19616 JHI loop 19617 RET 19618 19619 // func AmdAxpyUnsafeXInterleave_V1A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19620 // Requires: SSE 19621 TEXT ·AmdAxpyUnsafeXInterleave_V1A16R4(SB), NOSPLIT, $0-48 19622 MOVSS alpha+0(FP), X0 19623 MOVQ xs+8(FP), AX 19624 MOVQ incx+16(FP), CX 19625 MOVQ ys+24(FP), DX 19626 MOVQ incy+32(FP), BX 19627 MOVQ n+40(FP), SI 19628 XORQ DI, DI 19629 XORQ R8, R8 19630 JMP check_limit_unroll 19631 PCALIGN $0x10 19632 19633 loop_unroll: 19634 MOVSS (AX)(DI*4), X1 19635 ADDQ CX, DI 19636 MOVSS (AX)(DI*4), X2 19637 ADDQ CX, DI 19638 MOVSS (AX)(DI*4), X3 19639 ADDQ CX, DI 19640 MOVSS (AX)(DI*4), X4 19641 ADDQ CX, DI 19642 MULSS X0, X1 19643 MULSS X0, X2 19644 MULSS X0, X3 19645 MULSS X0, X4 19646 ADDSS (DX)(R8*4), X1 19647 MOVSS X1, (DX)(R8*4) 19648 ADDQ BX, R8 19649 ADDSS (DX)(R8*4), X2 19650 MOVSS X2, (DX)(R8*4) 19651 ADDQ BX, R8 19652 ADDSS (DX)(R8*4), X3 19653 MOVSS X3, (DX)(R8*4) 19654 ADDQ BX, R8 19655 ADDSS (DX)(R8*4), X4 19656 MOVSS X4, (DX)(R8*4) 19657 ADDQ BX, R8 19658 SUBQ $0x04, SI 19659 19660 check_limit_unroll: 19661 CMPQ SI, $0x04 19662 JHS loop_unroll 19663 JMP check_limit 19664 19665 loop: 19666 MOVSS (AX)(DI*4), X1 19667 MULSS X0, X1 19668 ADDSS (DX)(R8*4), X1 19669 MOVSS X1, (DX)(R8*4) 19670 DECQ SI 19671 ADDQ CX, DI 19672 ADDQ BX, R8 19673 19674 check_limit: 19675 CMPQ SI, $0x00 19676 JHI loop 19677 RET 19678 19679 // func AmdAxpyUnsafeXInterleave_V2A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19680 // Requires: SSE 19681 TEXT ·AmdAxpyUnsafeXInterleave_V2A16R4(SB), NOSPLIT, $0-48 19682 MOVSS alpha+0(FP), X0 19683 MOVQ xs+8(FP), AX 19684 MOVQ incx+16(FP), CX 19685 MOVQ ys+24(FP), DX 19686 MOVQ incy+32(FP), BX 19687 MOVQ n+40(FP), SI 19688 XORQ DI, DI 19689 XORQ R8, R8 19690 JMP check_limit_unroll 19691 PCALIGN $0x10 19692 19693 loop_unroll: 19694 MOVSS (AX)(DI*4), X1 19695 ADDQ CX, DI 19696 MOVSS (AX)(DI*4), X2 19697 ADDQ CX, DI 19698 MOVSS (AX)(DI*4), X3 19699 ADDQ CX, DI 19700 MOVSS (AX)(DI*4), X4 19701 ADDQ CX, DI 19702 MULSS X0, X1 19703 MULSS X0, X2 19704 MULSS X0, X3 19705 MULSS X0, X4 19706 ADDSS (DX)(R8*4), X1 19707 MOVSS X1, (DX)(R8*4) 19708 ADDQ BX, R8 19709 ADDSS (DX)(R8*4), X2 19710 MOVSS X2, (DX)(R8*4) 19711 ADDQ BX, R8 19712 ADDSS (DX)(R8*4), X3 19713 MOVSS X3, (DX)(R8*4) 19714 
ADDQ BX, R8 19715 ADDSS (DX)(R8*4), X4 19716 MOVSS X4, (DX)(R8*4) 19717 ADDQ BX, R8 19718 SUBQ $0x04, SI 19719 19720 check_limit_unroll: 19721 CMPQ SI, $0x04 19722 JHS loop_unroll 19723 JMP check_limit 19724 19725 loop: 19726 MOVSS (AX)(DI*4), X1 19727 MULSS X0, X1 19728 ADDSS (DX)(R8*4), X1 19729 MOVSS X1, (DX)(R8*4) 19730 DECQ SI 19731 ADDQ CX, DI 19732 ADDQ BX, R8 19733 19734 check_limit: 19735 CMPQ SI, $0x00 19736 JHI loop 19737 RET 19738 19739 // func AmdAxpyUnsafeXInterleave_V3A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19740 // Requires: SSE 19741 TEXT ·AmdAxpyUnsafeXInterleave_V3A16R4(SB), NOSPLIT, $0-48 19742 MOVSS alpha+0(FP), X0 19743 MOVQ xs+8(FP), AX 19744 MOVQ incx+16(FP), CX 19745 MOVQ ys+24(FP), DX 19746 MOVQ incy+32(FP), BX 19747 MOVQ n+40(FP), SI 19748 XORQ DI, DI 19749 XORQ R8, R8 19750 JMP check_limit_unroll 19751 PCALIGN $0x10 19752 19753 loop_unroll: 19754 MOVSS (AX)(DI*4), X1 19755 ADDQ CX, DI 19756 MOVSS (AX)(DI*4), X2 19757 ADDQ CX, DI 19758 MOVSS (AX)(DI*4), X3 19759 ADDQ CX, DI 19760 MOVSS (AX)(DI*4), X4 19761 ADDQ CX, DI 19762 MULSS X0, X1 19763 MULSS X0, X2 19764 MULSS X0, X3 19765 MULSS X0, X4 19766 ADDSS (DX)(R8*4), X1 19767 MOVSS X1, (DX)(R8*4) 19768 ADDQ BX, R8 19769 ADDSS (DX)(R8*4), X2 19770 MOVSS X2, (DX)(R8*4) 19771 ADDQ BX, R8 19772 ADDSS (DX)(R8*4), X3 19773 MOVSS X3, (DX)(R8*4) 19774 ADDQ BX, R8 19775 ADDSS (DX)(R8*4), X4 19776 MOVSS X4, (DX)(R8*4) 19777 ADDQ BX, R8 19778 SUBQ $0x04, SI 19779 19780 check_limit_unroll: 19781 CMPQ SI, $0x04 19782 JHS loop_unroll 19783 JMP check_limit 19784 19785 loop: 19786 MOVSS (AX)(DI*4), X1 19787 MULSS X0, X1 19788 ADDSS (DX)(R8*4), X1 19789 MOVSS X1, (DX)(R8*4) 19790 DECQ SI 19791 ADDQ CX, DI 19792 ADDQ BX, R8 19793 19794 check_limit: 19795 CMPQ SI, $0x00 19796 JHI loop 19797 RET 19798 19799 // func AmdAxpyUnsafeXInterleave_V4A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19800 // Requires: SSE 19801 TEXT ·AmdAxpyUnsafeXInterleave_V4A16R4(SB), NOSPLIT, $0-48 19802 MOVSS alpha+0(FP), X0 19803 MOVQ xs+8(FP), AX 19804 MOVQ incx+16(FP), CX 19805 MOVQ ys+24(FP), DX 19806 MOVQ incy+32(FP), BX 19807 MOVQ n+40(FP), SI 19808 XORQ DI, DI 19809 XORQ R8, R8 19810 JMP check_limit_unroll 19811 PCALIGN $0x10 19812 19813 loop_unroll: 19814 MOVSS (AX)(DI*4), X1 19815 ADDQ CX, DI 19816 MOVSS (AX)(DI*4), X2 19817 ADDQ CX, DI 19818 MOVSS (AX)(DI*4), X3 19819 ADDQ CX, DI 19820 MOVSS (AX)(DI*4), X4 19821 ADDQ CX, DI 19822 MULSS X0, X1 19823 MULSS X0, X2 19824 MULSS X0, X3 19825 MULSS X0, X4 19826 ADDSS (DX)(R8*4), X1 19827 MOVSS X1, (DX)(R8*4) 19828 ADDQ BX, R8 19829 ADDSS (DX)(R8*4), X2 19830 MOVSS X2, (DX)(R8*4) 19831 ADDQ BX, R8 19832 ADDSS (DX)(R8*4), X3 19833 MOVSS X3, (DX)(R8*4) 19834 ADDQ BX, R8 19835 ADDSS (DX)(R8*4), X4 19836 MOVSS X4, (DX)(R8*4) 19837 ADDQ BX, R8 19838 SUBQ $0x04, SI 19839 19840 check_limit_unroll: 19841 CMPQ SI, $0x04 19842 JHS loop_unroll 19843 JMP check_limit 19844 19845 loop: 19846 MOVSS (AX)(DI*4), X1 19847 MULSS X0, X1 19848 ADDSS (DX)(R8*4), X1 19849 MOVSS X1, (DX)(R8*4) 19850 DECQ SI 19851 ADDQ CX, DI 19852 ADDQ BX, R8 19853 19854 check_limit: 19855 CMPQ SI, $0x00 19856 JHI loop 19857 RET 19858 19859 // func AmdAxpyUnsafeXInterleave_V5A16R4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19860 // Requires: SSE 19861 TEXT ·AmdAxpyUnsafeXInterleave_V5A16R4(SB), NOSPLIT, $0-48 19862 MOVSS alpha+0(FP), X0 19863 MOVQ xs+8(FP), AX 19864 MOVQ incx+16(FP), CX 19865 MOVQ ys+24(FP), DX 19866 MOVQ 
incy+32(FP), BX 19867 MOVQ n+40(FP), SI 19868 XORQ DI, DI 19869 XORQ R8, R8 19870 JMP check_limit_unroll 19871 PCALIGN $0x10 19872 19873 loop_unroll: 19874 MOVSS (AX)(DI*4), X1 19875 ADDQ CX, DI 19876 MOVSS (AX)(DI*4), X2 19877 ADDQ CX, DI 19878 MOVSS (AX)(DI*4), X3 19879 ADDQ CX, DI 19880 MOVSS (AX)(DI*4), X4 19881 ADDQ CX, DI 19882 MULSS X0, X1 19883 MULSS X0, X2 19884 MULSS X0, X3 19885 MULSS X0, X4 19886 ADDSS (DX)(R8*4), X1 19887 MOVSS X1, (DX)(R8*4) 19888 ADDQ BX, R8 19889 ADDSS (DX)(R8*4), X2 19890 MOVSS X2, (DX)(R8*4) 19891 ADDQ BX, R8 19892 ADDSS (DX)(R8*4), X3 19893 MOVSS X3, (DX)(R8*4) 19894 ADDQ BX, R8 19895 ADDSS (DX)(R8*4), X4 19896 MOVSS X4, (DX)(R8*4) 19897 ADDQ BX, R8 19898 SUBQ $0x04, SI 19899 19900 check_limit_unroll: 19901 CMPQ SI, $0x04 19902 JHS loop_unroll 19903 JMP check_limit 19904 19905 loop: 19906 MOVSS (AX)(DI*4), X1 19907 MULSS X0, X1 19908 ADDSS (DX)(R8*4), X1 19909 MOVSS X1, (DX)(R8*4) 19910 DECQ SI 19911 ADDQ CX, DI 19912 ADDQ BX, R8 19913 19914 check_limit: 19915 CMPQ SI, $0x00 19916 JHI loop 19917 RET 19918 19919 // func AmdAxpyUnsafeXInterleave_V0A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 19920 // Requires: SSE 19921 TEXT ·AmdAxpyUnsafeXInterleave_V0A0R8(SB), NOSPLIT, $0-48 19922 MOVSS alpha+0(FP), X0 19923 MOVQ xs+8(FP), AX 19924 MOVQ incx+16(FP), CX 19925 MOVQ ys+24(FP), DX 19926 MOVQ incy+32(FP), BX 19927 MOVQ n+40(FP), SI 19928 XORQ DI, DI 19929 XORQ R8, R8 19930 JMP check_limit_unroll 19931 19932 loop_unroll: 19933 MOVSS (AX)(DI*4), X1 19934 ADDQ CX, DI 19935 MOVSS (AX)(DI*4), X2 19936 ADDQ CX, DI 19937 MOVSS (AX)(DI*4), X3 19938 ADDQ CX, DI 19939 MOVSS (AX)(DI*4), X4 19940 ADDQ CX, DI 19941 MOVSS (AX)(DI*4), X5 19942 ADDQ CX, DI 19943 MOVSS (AX)(DI*4), X6 19944 ADDQ CX, DI 19945 MOVSS (AX)(DI*4), X7 19946 ADDQ CX, DI 19947 MOVSS (AX)(DI*4), X8 19948 ADDQ CX, DI 19949 MULSS X0, X1 19950 MULSS X0, X2 19951 MULSS X0, X3 19952 MULSS X0, X4 19953 MULSS X0, X5 19954 MULSS X0, X6 19955 MULSS X0, X7 19956 MULSS X0, X8 19957 ADDSS (DX)(R8*4), X1 19958 MOVSS X1, (DX)(R8*4) 19959 ADDQ BX, R8 19960 ADDSS (DX)(R8*4), X2 19961 MOVSS X2, (DX)(R8*4) 19962 ADDQ BX, R8 19963 ADDSS (DX)(R8*4), X3 19964 MOVSS X3, (DX)(R8*4) 19965 ADDQ BX, R8 19966 ADDSS (DX)(R8*4), X4 19967 MOVSS X4, (DX)(R8*4) 19968 ADDQ BX, R8 19969 ADDSS (DX)(R8*4), X5 19970 MOVSS X5, (DX)(R8*4) 19971 ADDQ BX, R8 19972 ADDSS (DX)(R8*4), X6 19973 MOVSS X6, (DX)(R8*4) 19974 ADDQ BX, R8 19975 ADDSS (DX)(R8*4), X7 19976 MOVSS X7, (DX)(R8*4) 19977 ADDQ BX, R8 19978 ADDSS (DX)(R8*4), X8 19979 MOVSS X8, (DX)(R8*4) 19980 ADDQ BX, R8 19981 SUBQ $0x08, SI 19982 19983 check_limit_unroll: 19984 CMPQ SI, $0x08 19985 JHS loop_unroll 19986 JMP check_limit 19987 19988 loop: 19989 MOVSS (AX)(DI*4), X1 19990 MULSS X0, X1 19991 ADDSS (DX)(R8*4), X1 19992 MOVSS X1, (DX)(R8*4) 19993 DECQ SI 19994 ADDQ CX, DI 19995 ADDQ BX, R8 19996 19997 check_limit: 19998 CMPQ SI, $0x00 19999 JHI loop 20000 RET 20001 20002 // func AmdAxpyUnsafeXInterleave_V1A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20003 // Requires: SSE 20004 TEXT ·AmdAxpyUnsafeXInterleave_V1A0R8(SB), NOSPLIT, $0-48 20005 MOVSS alpha+0(FP), X0 20006 MOVQ xs+8(FP), AX 20007 MOVQ incx+16(FP), CX 20008 MOVQ ys+24(FP), DX 20009 MOVQ incy+32(FP), BX 20010 MOVQ n+40(FP), SI 20011 XORQ DI, DI 20012 XORQ R8, R8 20013 JMP check_limit_unroll 20014 20015 loop_unroll: 20016 MOVSS (AX)(DI*4), X1 20017 ADDQ CX, DI 20018 MOVSS (AX)(DI*4), X2 20019 ADDQ CX, DI 20020 MOVSS (AX)(DI*4), X3 20021 ADDQ 
CX, DI 20022 MOVSS (AX)(DI*4), X4 20023 ADDQ CX, DI 20024 MOVSS (AX)(DI*4), X5 20025 ADDQ CX, DI 20026 MOVSS (AX)(DI*4), X6 20027 ADDQ CX, DI 20028 MOVSS (AX)(DI*4), X7 20029 ADDQ CX, DI 20030 MOVSS (AX)(DI*4), X8 20031 ADDQ CX, DI 20032 MULSS X0, X1 20033 MULSS X0, X2 20034 MULSS X0, X3 20035 MULSS X0, X4 20036 MULSS X0, X5 20037 MULSS X0, X6 20038 MULSS X0, X7 20039 MULSS X0, X8 20040 ADDSS (DX)(R8*4), X1 20041 MOVSS X1, (DX)(R8*4) 20042 ADDQ BX, R8 20043 ADDSS (DX)(R8*4), X2 20044 MOVSS X2, (DX)(R8*4) 20045 ADDQ BX, R8 20046 ADDSS (DX)(R8*4), X3 20047 MOVSS X3, (DX)(R8*4) 20048 ADDQ BX, R8 20049 ADDSS (DX)(R8*4), X4 20050 MOVSS X4, (DX)(R8*4) 20051 ADDQ BX, R8 20052 ADDSS (DX)(R8*4), X5 20053 MOVSS X5, (DX)(R8*4) 20054 ADDQ BX, R8 20055 ADDSS (DX)(R8*4), X6 20056 MOVSS X6, (DX)(R8*4) 20057 ADDQ BX, R8 20058 ADDSS (DX)(R8*4), X7 20059 MOVSS X7, (DX)(R8*4) 20060 ADDQ BX, R8 20061 ADDSS (DX)(R8*4), X8 20062 MOVSS X8, (DX)(R8*4) 20063 ADDQ BX, R8 20064 SUBQ $0x08, SI 20065 20066 check_limit_unroll: 20067 CMPQ SI, $0x08 20068 JHS loop_unroll 20069 JMP check_limit 20070 20071 loop: 20072 MOVSS (AX)(DI*4), X1 20073 MULSS X0, X1 20074 ADDSS (DX)(R8*4), X1 20075 MOVSS X1, (DX)(R8*4) 20076 DECQ SI 20077 ADDQ CX, DI 20078 ADDQ BX, R8 20079 20080 check_limit: 20081 CMPQ SI, $0x00 20082 JHI loop 20083 RET 20084 20085 // func AmdAxpyUnsafeXInterleave_V2A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20086 // Requires: SSE 20087 TEXT ·AmdAxpyUnsafeXInterleave_V2A0R8(SB), NOSPLIT, $0-48 20088 MOVSS alpha+0(FP), X0 20089 MOVQ xs+8(FP), AX 20090 MOVQ incx+16(FP), CX 20091 MOVQ ys+24(FP), DX 20092 MOVQ incy+32(FP), BX 20093 MOVQ n+40(FP), SI 20094 XORQ DI, DI 20095 XORQ R8, R8 20096 JMP check_limit_unroll 20097 20098 loop_unroll: 20099 MOVSS (AX)(DI*4), X1 20100 ADDQ CX, DI 20101 MOVSS (AX)(DI*4), X2 20102 ADDQ CX, DI 20103 MOVSS (AX)(DI*4), X3 20104 ADDQ CX, DI 20105 MOVSS (AX)(DI*4), X4 20106 ADDQ CX, DI 20107 MOVSS (AX)(DI*4), X5 20108 ADDQ CX, DI 20109 MOVSS (AX)(DI*4), X6 20110 ADDQ CX, DI 20111 MOVSS (AX)(DI*4), X7 20112 ADDQ CX, DI 20113 MOVSS (AX)(DI*4), X8 20114 ADDQ CX, DI 20115 MULSS X0, X1 20116 MULSS X0, X2 20117 MULSS X0, X3 20118 MULSS X0, X4 20119 MULSS X0, X5 20120 MULSS X0, X6 20121 MULSS X0, X7 20122 MULSS X0, X8 20123 ADDSS (DX)(R8*4), X1 20124 MOVSS X1, (DX)(R8*4) 20125 ADDQ BX, R8 20126 ADDSS (DX)(R8*4), X2 20127 MOVSS X2, (DX)(R8*4) 20128 ADDQ BX, R8 20129 ADDSS (DX)(R8*4), X3 20130 MOVSS X3, (DX)(R8*4) 20131 ADDQ BX, R8 20132 ADDSS (DX)(R8*4), X4 20133 MOVSS X4, (DX)(R8*4) 20134 ADDQ BX, R8 20135 ADDSS (DX)(R8*4), X5 20136 MOVSS X5, (DX)(R8*4) 20137 ADDQ BX, R8 20138 ADDSS (DX)(R8*4), X6 20139 MOVSS X6, (DX)(R8*4) 20140 ADDQ BX, R8 20141 ADDSS (DX)(R8*4), X7 20142 MOVSS X7, (DX)(R8*4) 20143 ADDQ BX, R8 20144 ADDSS (DX)(R8*4), X8 20145 MOVSS X8, (DX)(R8*4) 20146 ADDQ BX, R8 20147 SUBQ $0x08, SI 20148 20149 check_limit_unroll: 20150 CMPQ SI, $0x08 20151 JHS loop_unroll 20152 JMP check_limit 20153 20154 loop: 20155 MOVSS (AX)(DI*4), X1 20156 MULSS X0, X1 20157 ADDSS (DX)(R8*4), X1 20158 MOVSS X1, (DX)(R8*4) 20159 DECQ SI 20160 ADDQ CX, DI 20161 ADDQ BX, R8 20162 20163 check_limit: 20164 CMPQ SI, $0x00 20165 JHI loop 20166 RET 20167 20168 // func AmdAxpyUnsafeXInterleave_V3A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20169 // Requires: SSE 20170 TEXT ·AmdAxpyUnsafeXInterleave_V3A0R8(SB), NOSPLIT, $0-48 20171 MOVSS alpha+0(FP), X0 20172 MOVQ xs+8(FP), AX 20173 MOVQ incx+16(FP), CX 20174 MOVQ ys+24(FP), DX 20175 
MOVQ incy+32(FP), BX 20176 MOVQ n+40(FP), SI 20177 XORQ DI, DI 20178 XORQ R8, R8 20179 JMP check_limit_unroll 20180 20181 loop_unroll: 20182 MOVSS (AX)(DI*4), X1 20183 ADDQ CX, DI 20184 MOVSS (AX)(DI*4), X2 20185 ADDQ CX, DI 20186 MOVSS (AX)(DI*4), X3 20187 ADDQ CX, DI 20188 MOVSS (AX)(DI*4), X4 20189 ADDQ CX, DI 20190 MOVSS (AX)(DI*4), X5 20191 ADDQ CX, DI 20192 MOVSS (AX)(DI*4), X6 20193 ADDQ CX, DI 20194 MOVSS (AX)(DI*4), X7 20195 ADDQ CX, DI 20196 MOVSS (AX)(DI*4), X8 20197 ADDQ CX, DI 20198 MULSS X0, X1 20199 MULSS X0, X2 20200 MULSS X0, X3 20201 MULSS X0, X4 20202 MULSS X0, X5 20203 MULSS X0, X6 20204 MULSS X0, X7 20205 MULSS X0, X8 20206 ADDSS (DX)(R8*4), X1 20207 MOVSS X1, (DX)(R8*4) 20208 ADDQ BX, R8 20209 ADDSS (DX)(R8*4), X2 20210 MOVSS X2, (DX)(R8*4) 20211 ADDQ BX, R8 20212 ADDSS (DX)(R8*4), X3 20213 MOVSS X3, (DX)(R8*4) 20214 ADDQ BX, R8 20215 ADDSS (DX)(R8*4), X4 20216 MOVSS X4, (DX)(R8*4) 20217 ADDQ BX, R8 20218 ADDSS (DX)(R8*4), X5 20219 MOVSS X5, (DX)(R8*4) 20220 ADDQ BX, R8 20221 ADDSS (DX)(R8*4), X6 20222 MOVSS X6, (DX)(R8*4) 20223 ADDQ BX, R8 20224 ADDSS (DX)(R8*4), X7 20225 MOVSS X7, (DX)(R8*4) 20226 ADDQ BX, R8 20227 ADDSS (DX)(R8*4), X8 20228 MOVSS X8, (DX)(R8*4) 20229 ADDQ BX, R8 20230 SUBQ $0x08, SI 20231 20232 check_limit_unroll: 20233 CMPQ SI, $0x08 20234 JHS loop_unroll 20235 JMP check_limit 20236 20237 loop: 20238 MOVSS (AX)(DI*4), X1 20239 MULSS X0, X1 20240 ADDSS (DX)(R8*4), X1 20241 MOVSS X1, (DX)(R8*4) 20242 DECQ SI 20243 ADDQ CX, DI 20244 ADDQ BX, R8 20245 20246 check_limit: 20247 CMPQ SI, $0x00 20248 JHI loop 20249 RET 20250 20251 // func AmdAxpyUnsafeXInterleave_V4A0R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20252 // Requires: SSE 20253 TEXT ·AmdAxpyUnsafeXInterleave_V4A0R8(SB), NOSPLIT, $0-48 20254 MOVSS alpha+0(FP), X0 20255 MOVQ xs+8(FP), AX 20256 MOVQ incx+16(FP), CX 20257 MOVQ ys+24(FP), DX 20258 MOVQ incy+32(FP), BX 20259 MOVQ n+40(FP), SI 20260 XORQ DI, DI 20261 XORQ R8, R8 20262 JMP check_limit_unroll 20263 20264 loop_unroll: 20265 MOVSS (AX)(DI*4), X1 20266 ADDQ CX, DI 20267 MOVSS (AX)(DI*4), X2 20268 ADDQ CX, DI 20269 MOVSS (AX)(DI*4), X3 20270 ADDQ CX, DI 20271 MOVSS (AX)(DI*4), X4 20272 ADDQ CX, DI 20273 MOVSS (AX)(DI*4), X5 20274 ADDQ CX, DI 20275 MOVSS (AX)(DI*4), X6 20276 ADDQ CX, DI 20277 MOVSS (AX)(DI*4), X7 20278 ADDQ CX, DI 20279 MOVSS (AX)(DI*4), X8 20280 ADDQ CX, DI 20281 MULSS X0, X1 20282 MULSS X0, X2 20283 MULSS X0, X3 20284 MULSS X0, X4 20285 MULSS X0, X5 20286 MULSS X0, X6 20287 MULSS X0, X7 20288 MULSS X0, X8 20289 ADDSS (DX)(R8*4), X1 20290 MOVSS X1, (DX)(R8*4) 20291 ADDQ BX, R8 20292 ADDSS (DX)(R8*4), X2 20293 MOVSS X2, (DX)(R8*4) 20294 ADDQ BX, R8 20295 ADDSS (DX)(R8*4), X3 20296 MOVSS X3, (DX)(R8*4) 20297 ADDQ BX, R8 20298 ADDSS (DX)(R8*4), X4 20299 MOVSS X4, (DX)(R8*4) 20300 ADDQ BX, R8 20301 ADDSS (DX)(R8*4), X5 20302 MOVSS X5, (DX)(R8*4) 20303 ADDQ BX, R8 20304 ADDSS (DX)(R8*4), X6 20305 MOVSS X6, (DX)(R8*4) 20306 ADDQ BX, R8 20307 ADDSS (DX)(R8*4), X7 20308 MOVSS X7, (DX)(R8*4) 20309 ADDQ BX, R8 20310 ADDSS (DX)(R8*4), X8 20311 MOVSS X8, (DX)(R8*4) 20312 ADDQ BX, R8 20313 SUBQ $0x08, SI 20314 20315 check_limit_unroll: 20316 CMPQ SI, $0x08 20317 JHS loop_unroll 20318 JMP check_limit 20319 20320 loop: 20321 MOVSS (AX)(DI*4), X1 20322 MULSS X0, X1 20323 ADDSS (DX)(R8*4), X1 20324 MOVSS X1, (DX)(R8*4) 20325 DECQ SI 20326 ADDQ CX, DI 20327 ADDQ BX, R8 20328 20329 check_limit: 20330 CMPQ SI, $0x00 20331 JHI loop 20332 RET 20333 20334 // func AmdAxpyUnsafeXInterleave_V5A0R8(alpha 
float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20335 // Requires: SSE 20336 TEXT ·AmdAxpyUnsafeXInterleave_V5A0R8(SB), NOSPLIT, $0-48 20337 MOVSS alpha+0(FP), X0 20338 MOVQ xs+8(FP), AX 20339 MOVQ incx+16(FP), CX 20340 MOVQ ys+24(FP), DX 20341 MOVQ incy+32(FP), BX 20342 MOVQ n+40(FP), SI 20343 XORQ DI, DI 20344 XORQ R8, R8 20345 JMP check_limit_unroll 20346 20347 loop_unroll: 20348 MOVSS (AX)(DI*4), X1 20349 ADDQ CX, DI 20350 MOVSS (AX)(DI*4), X2 20351 ADDQ CX, DI 20352 MOVSS (AX)(DI*4), X3 20353 ADDQ CX, DI 20354 MOVSS (AX)(DI*4), X4 20355 ADDQ CX, DI 20356 MOVSS (AX)(DI*4), X5 20357 ADDQ CX, DI 20358 MOVSS (AX)(DI*4), X6 20359 ADDQ CX, DI 20360 MOVSS (AX)(DI*4), X7 20361 ADDQ CX, DI 20362 MOVSS (AX)(DI*4), X8 20363 ADDQ CX, DI 20364 MULSS X0, X1 20365 MULSS X0, X2 20366 MULSS X0, X3 20367 MULSS X0, X4 20368 MULSS X0, X5 20369 MULSS X0, X6 20370 MULSS X0, X7 20371 MULSS X0, X8 20372 ADDSS (DX)(R8*4), X1 20373 MOVSS X1, (DX)(R8*4) 20374 ADDQ BX, R8 20375 ADDSS (DX)(R8*4), X2 20376 MOVSS X2, (DX)(R8*4) 20377 ADDQ BX, R8 20378 ADDSS (DX)(R8*4), X3 20379 MOVSS X3, (DX)(R8*4) 20380 ADDQ BX, R8 20381 ADDSS (DX)(R8*4), X4 20382 MOVSS X4, (DX)(R8*4) 20383 ADDQ BX, R8 20384 ADDSS (DX)(R8*4), X5 20385 MOVSS X5, (DX)(R8*4) 20386 ADDQ BX, R8 20387 ADDSS (DX)(R8*4), X6 20388 MOVSS X6, (DX)(R8*4) 20389 ADDQ BX, R8 20390 ADDSS (DX)(R8*4), X7 20391 MOVSS X7, (DX)(R8*4) 20392 ADDQ BX, R8 20393 ADDSS (DX)(R8*4), X8 20394 MOVSS X8, (DX)(R8*4) 20395 ADDQ BX, R8 20396 SUBQ $0x08, SI 20397 20398 check_limit_unroll: 20399 CMPQ SI, $0x08 20400 JHS loop_unroll 20401 JMP check_limit 20402 20403 loop: 20404 MOVSS (AX)(DI*4), X1 20405 MULSS X0, X1 20406 ADDSS (DX)(R8*4), X1 20407 MOVSS X1, (DX)(R8*4) 20408 DECQ SI 20409 ADDQ CX, DI 20410 ADDQ BX, R8 20411 20412 check_limit: 20413 CMPQ SI, $0x00 20414 JHI loop 20415 RET 20416 20417 // func AmdAxpyUnsafeXInterleave_V0A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20418 // Requires: SSE 20419 TEXT ·AmdAxpyUnsafeXInterleave_V0A8R8(SB), NOSPLIT, $0-48 20420 MOVSS alpha+0(FP), X0 20421 MOVQ xs+8(FP), AX 20422 MOVQ incx+16(FP), CX 20423 MOVQ ys+24(FP), DX 20424 MOVQ incy+32(FP), BX 20425 MOVQ n+40(FP), SI 20426 XORQ DI, DI 20427 XORQ R8, R8 20428 JMP check_limit_unroll 20429 PCALIGN $0x08 20430 20431 loop_unroll: 20432 MOVSS (AX)(DI*4), X1 20433 ADDQ CX, DI 20434 MOVSS (AX)(DI*4), X2 20435 ADDQ CX, DI 20436 MOVSS (AX)(DI*4), X3 20437 ADDQ CX, DI 20438 MOVSS (AX)(DI*4), X4 20439 ADDQ CX, DI 20440 MOVSS (AX)(DI*4), X5 20441 ADDQ CX, DI 20442 MOVSS (AX)(DI*4), X6 20443 ADDQ CX, DI 20444 MOVSS (AX)(DI*4), X7 20445 ADDQ CX, DI 20446 MOVSS (AX)(DI*4), X8 20447 ADDQ CX, DI 20448 MULSS X0, X1 20449 MULSS X0, X2 20450 MULSS X0, X3 20451 MULSS X0, X4 20452 MULSS X0, X5 20453 MULSS X0, X6 20454 MULSS X0, X7 20455 MULSS X0, X8 20456 ADDSS (DX)(R8*4), X1 20457 MOVSS X1, (DX)(R8*4) 20458 ADDQ BX, R8 20459 ADDSS (DX)(R8*4), X2 20460 MOVSS X2, (DX)(R8*4) 20461 ADDQ BX, R8 20462 ADDSS (DX)(R8*4), X3 20463 MOVSS X3, (DX)(R8*4) 20464 ADDQ BX, R8 20465 ADDSS (DX)(R8*4), X4 20466 MOVSS X4, (DX)(R8*4) 20467 ADDQ BX, R8 20468 ADDSS (DX)(R8*4), X5 20469 MOVSS X5, (DX)(R8*4) 20470 ADDQ BX, R8 20471 ADDSS (DX)(R8*4), X6 20472 MOVSS X6, (DX)(R8*4) 20473 ADDQ BX, R8 20474 ADDSS (DX)(R8*4), X7 20475 MOVSS X7, (DX)(R8*4) 20476 ADDQ BX, R8 20477 ADDSS (DX)(R8*4), X8 20478 MOVSS X8, (DX)(R8*4) 20479 ADDQ BX, R8 20480 SUBQ $0x08, SI 20481 20482 check_limit_unroll: 20483 CMPQ SI, $0x08 20484 JHS loop_unroll 20485 JMP check_limit 20486 20487 
loop: 20488 MOVSS (AX)(DI*4), X1 20489 MULSS X0, X1 20490 ADDSS (DX)(R8*4), X1 20491 MOVSS X1, (DX)(R8*4) 20492 DECQ SI 20493 ADDQ CX, DI 20494 ADDQ BX, R8 20495 20496 check_limit: 20497 CMPQ SI, $0x00 20498 JHI loop 20499 RET 20500 20501 // func AmdAxpyUnsafeXInterleave_V1A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20502 // Requires: SSE 20503 TEXT ·AmdAxpyUnsafeXInterleave_V1A8R8(SB), NOSPLIT, $0-48 20504 MOVSS alpha+0(FP), X0 20505 MOVQ xs+8(FP), AX 20506 MOVQ incx+16(FP), CX 20507 MOVQ ys+24(FP), DX 20508 MOVQ incy+32(FP), BX 20509 MOVQ n+40(FP), SI 20510 XORQ DI, DI 20511 XORQ R8, R8 20512 JMP check_limit_unroll 20513 PCALIGN $0x08 20514 20515 loop_unroll: 20516 MOVSS (AX)(DI*4), X1 20517 ADDQ CX, DI 20518 MOVSS (AX)(DI*4), X2 20519 ADDQ CX, DI 20520 MOVSS (AX)(DI*4), X3 20521 ADDQ CX, DI 20522 MOVSS (AX)(DI*4), X4 20523 ADDQ CX, DI 20524 MOVSS (AX)(DI*4), X5 20525 ADDQ CX, DI 20526 MOVSS (AX)(DI*4), X6 20527 ADDQ CX, DI 20528 MOVSS (AX)(DI*4), X7 20529 ADDQ CX, DI 20530 MOVSS (AX)(DI*4), X8 20531 ADDQ CX, DI 20532 MULSS X0, X1 20533 MULSS X0, X2 20534 MULSS X0, X3 20535 MULSS X0, X4 20536 MULSS X0, X5 20537 MULSS X0, X6 20538 MULSS X0, X7 20539 MULSS X0, X8 20540 ADDSS (DX)(R8*4), X1 20541 MOVSS X1, (DX)(R8*4) 20542 ADDQ BX, R8 20543 ADDSS (DX)(R8*4), X2 20544 MOVSS X2, (DX)(R8*4) 20545 ADDQ BX, R8 20546 ADDSS (DX)(R8*4), X3 20547 MOVSS X3, (DX)(R8*4) 20548 ADDQ BX, R8 20549 ADDSS (DX)(R8*4), X4 20550 MOVSS X4, (DX)(R8*4) 20551 ADDQ BX, R8 20552 ADDSS (DX)(R8*4), X5 20553 MOVSS X5, (DX)(R8*4) 20554 ADDQ BX, R8 20555 ADDSS (DX)(R8*4), X6 20556 MOVSS X6, (DX)(R8*4) 20557 ADDQ BX, R8 20558 ADDSS (DX)(R8*4), X7 20559 MOVSS X7, (DX)(R8*4) 20560 ADDQ BX, R8 20561 ADDSS (DX)(R8*4), X8 20562 MOVSS X8, (DX)(R8*4) 20563 ADDQ BX, R8 20564 SUBQ $0x08, SI 20565 20566 check_limit_unroll: 20567 CMPQ SI, $0x08 20568 JHS loop_unroll 20569 JMP check_limit 20570 20571 loop: 20572 MOVSS (AX)(DI*4), X1 20573 MULSS X0, X1 20574 ADDSS (DX)(R8*4), X1 20575 MOVSS X1, (DX)(R8*4) 20576 DECQ SI 20577 ADDQ CX, DI 20578 ADDQ BX, R8 20579 20580 check_limit: 20581 CMPQ SI, $0x00 20582 JHI loop 20583 RET 20584 20585 // func AmdAxpyUnsafeXInterleave_V2A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20586 // Requires: SSE 20587 TEXT ·AmdAxpyUnsafeXInterleave_V2A8R8(SB), NOSPLIT, $0-48 20588 MOVSS alpha+0(FP), X0 20589 MOVQ xs+8(FP), AX 20590 MOVQ incx+16(FP), CX 20591 MOVQ ys+24(FP), DX 20592 MOVQ incy+32(FP), BX 20593 MOVQ n+40(FP), SI 20594 XORQ DI, DI 20595 XORQ R8, R8 20596 JMP check_limit_unroll 20597 PCALIGN $0x08 20598 20599 loop_unroll: 20600 MOVSS (AX)(DI*4), X1 20601 ADDQ CX, DI 20602 MOVSS (AX)(DI*4), X2 20603 ADDQ CX, DI 20604 MOVSS (AX)(DI*4), X3 20605 ADDQ CX, DI 20606 MOVSS (AX)(DI*4), X4 20607 ADDQ CX, DI 20608 MOVSS (AX)(DI*4), X5 20609 ADDQ CX, DI 20610 MOVSS (AX)(DI*4), X6 20611 ADDQ CX, DI 20612 MOVSS (AX)(DI*4), X7 20613 ADDQ CX, DI 20614 MOVSS (AX)(DI*4), X8 20615 ADDQ CX, DI 20616 MULSS X0, X1 20617 MULSS X0, X2 20618 MULSS X0, X3 20619 MULSS X0, X4 20620 MULSS X0, X5 20621 MULSS X0, X6 20622 MULSS X0, X7 20623 MULSS X0, X8 20624 ADDSS (DX)(R8*4), X1 20625 MOVSS X1, (DX)(R8*4) 20626 ADDQ BX, R8 20627 ADDSS (DX)(R8*4), X2 20628 MOVSS X2, (DX)(R8*4) 20629 ADDQ BX, R8 20630 ADDSS (DX)(R8*4), X3 20631 MOVSS X3, (DX)(R8*4) 20632 ADDQ BX, R8 20633 ADDSS (DX)(R8*4), X4 20634 MOVSS X4, (DX)(R8*4) 20635 ADDQ BX, R8 20636 ADDSS (DX)(R8*4), X5 20637 MOVSS X5, (DX)(R8*4) 20638 ADDQ BX, R8 20639 ADDSS (DX)(R8*4), X6 20640 MOVSS X6, 
(DX)(R8*4) 20641 ADDQ BX, R8 20642 ADDSS (DX)(R8*4), X7 20643 MOVSS X7, (DX)(R8*4) 20644 ADDQ BX, R8 20645 ADDSS (DX)(R8*4), X8 20646 MOVSS X8, (DX)(R8*4) 20647 ADDQ BX, R8 20648 SUBQ $0x08, SI 20649 20650 check_limit_unroll: 20651 CMPQ SI, $0x08 20652 JHS loop_unroll 20653 JMP check_limit 20654 20655 loop: 20656 MOVSS (AX)(DI*4), X1 20657 MULSS X0, X1 20658 ADDSS (DX)(R8*4), X1 20659 MOVSS X1, (DX)(R8*4) 20660 DECQ SI 20661 ADDQ CX, DI 20662 ADDQ BX, R8 20663 20664 check_limit: 20665 CMPQ SI, $0x00 20666 JHI loop 20667 RET 20668 20669 // func AmdAxpyUnsafeXInterleave_V3A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20670 // Requires: SSE 20671 TEXT ·AmdAxpyUnsafeXInterleave_V3A8R8(SB), NOSPLIT, $0-48 20672 MOVSS alpha+0(FP), X0 20673 MOVQ xs+8(FP), AX 20674 MOVQ incx+16(FP), CX 20675 MOVQ ys+24(FP), DX 20676 MOVQ incy+32(FP), BX 20677 MOVQ n+40(FP), SI 20678 XORQ DI, DI 20679 XORQ R8, R8 20680 JMP check_limit_unroll 20681 PCALIGN $0x08 20682 20683 loop_unroll: 20684 MOVSS (AX)(DI*4), X1 20685 ADDQ CX, DI 20686 MOVSS (AX)(DI*4), X2 20687 ADDQ CX, DI 20688 MOVSS (AX)(DI*4), X3 20689 ADDQ CX, DI 20690 MOVSS (AX)(DI*4), X4 20691 ADDQ CX, DI 20692 MOVSS (AX)(DI*4), X5 20693 ADDQ CX, DI 20694 MOVSS (AX)(DI*4), X6 20695 ADDQ CX, DI 20696 MOVSS (AX)(DI*4), X7 20697 ADDQ CX, DI 20698 MOVSS (AX)(DI*4), X8 20699 ADDQ CX, DI 20700 MULSS X0, X1 20701 MULSS X0, X2 20702 MULSS X0, X3 20703 MULSS X0, X4 20704 MULSS X0, X5 20705 MULSS X0, X6 20706 MULSS X0, X7 20707 MULSS X0, X8 20708 ADDSS (DX)(R8*4), X1 20709 MOVSS X1, (DX)(R8*4) 20710 ADDQ BX, R8 20711 ADDSS (DX)(R8*4), X2 20712 MOVSS X2, (DX)(R8*4) 20713 ADDQ BX, R8 20714 ADDSS (DX)(R8*4), X3 20715 MOVSS X3, (DX)(R8*4) 20716 ADDQ BX, R8 20717 ADDSS (DX)(R8*4), X4 20718 MOVSS X4, (DX)(R8*4) 20719 ADDQ BX, R8 20720 ADDSS (DX)(R8*4), X5 20721 MOVSS X5, (DX)(R8*4) 20722 ADDQ BX, R8 20723 ADDSS (DX)(R8*4), X6 20724 MOVSS X6, (DX)(R8*4) 20725 ADDQ BX, R8 20726 ADDSS (DX)(R8*4), X7 20727 MOVSS X7, (DX)(R8*4) 20728 ADDQ BX, R8 20729 ADDSS (DX)(R8*4), X8 20730 MOVSS X8, (DX)(R8*4) 20731 ADDQ BX, R8 20732 SUBQ $0x08, SI 20733 20734 check_limit_unroll: 20735 CMPQ SI, $0x08 20736 JHS loop_unroll 20737 JMP check_limit 20738 20739 loop: 20740 MOVSS (AX)(DI*4), X1 20741 MULSS X0, X1 20742 ADDSS (DX)(R8*4), X1 20743 MOVSS X1, (DX)(R8*4) 20744 DECQ SI 20745 ADDQ CX, DI 20746 ADDQ BX, R8 20747 20748 check_limit: 20749 CMPQ SI, $0x00 20750 JHI loop 20751 RET 20752 20753 // func AmdAxpyUnsafeXInterleave_V4A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20754 // Requires: SSE 20755 TEXT ·AmdAxpyUnsafeXInterleave_V4A8R8(SB), NOSPLIT, $0-48 20756 MOVSS alpha+0(FP), X0 20757 MOVQ xs+8(FP), AX 20758 MOVQ incx+16(FP), CX 20759 MOVQ ys+24(FP), DX 20760 MOVQ incy+32(FP), BX 20761 MOVQ n+40(FP), SI 20762 XORQ DI, DI 20763 XORQ R8, R8 20764 JMP check_limit_unroll 20765 PCALIGN $0x08 20766 20767 loop_unroll: 20768 MOVSS (AX)(DI*4), X1 20769 ADDQ CX, DI 20770 MOVSS (AX)(DI*4), X2 20771 ADDQ CX, DI 20772 MOVSS (AX)(DI*4), X3 20773 ADDQ CX, DI 20774 MOVSS (AX)(DI*4), X4 20775 ADDQ CX, DI 20776 MOVSS (AX)(DI*4), X5 20777 ADDQ CX, DI 20778 MOVSS (AX)(DI*4), X6 20779 ADDQ CX, DI 20780 MOVSS (AX)(DI*4), X7 20781 ADDQ CX, DI 20782 MOVSS (AX)(DI*4), X8 20783 ADDQ CX, DI 20784 MULSS X0, X1 20785 MULSS X0, X2 20786 MULSS X0, X3 20787 MULSS X0, X4 20788 MULSS X0, X5 20789 MULSS X0, X6 20790 MULSS X0, X7 20791 MULSS X0, X8 20792 ADDSS (DX)(R8*4), X1 20793 MOVSS X1, (DX)(R8*4) 20794 ADDQ BX, R8 20795 ADDSS (DX)(R8*4), X2 
20796 MOVSS X2, (DX)(R8*4) 20797 ADDQ BX, R8 20798 ADDSS (DX)(R8*4), X3 20799 MOVSS X3, (DX)(R8*4) 20800 ADDQ BX, R8 20801 ADDSS (DX)(R8*4), X4 20802 MOVSS X4, (DX)(R8*4) 20803 ADDQ BX, R8 20804 ADDSS (DX)(R8*4), X5 20805 MOVSS X5, (DX)(R8*4) 20806 ADDQ BX, R8 20807 ADDSS (DX)(R8*4), X6 20808 MOVSS X6, (DX)(R8*4) 20809 ADDQ BX, R8 20810 ADDSS (DX)(R8*4), X7 20811 MOVSS X7, (DX)(R8*4) 20812 ADDQ BX, R8 20813 ADDSS (DX)(R8*4), X8 20814 MOVSS X8, (DX)(R8*4) 20815 ADDQ BX, R8 20816 SUBQ $0x08, SI 20817 20818 check_limit_unroll: 20819 CMPQ SI, $0x08 20820 JHS loop_unroll 20821 JMP check_limit 20822 20823 loop: 20824 MOVSS (AX)(DI*4), X1 20825 MULSS X0, X1 20826 ADDSS (DX)(R8*4), X1 20827 MOVSS X1, (DX)(R8*4) 20828 DECQ SI 20829 ADDQ CX, DI 20830 ADDQ BX, R8 20831 20832 check_limit: 20833 CMPQ SI, $0x00 20834 JHI loop 20835 RET 20836 20837 // func AmdAxpyUnsafeXInterleave_V5A8R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20838 // Requires: SSE 20839 TEXT ·AmdAxpyUnsafeXInterleave_V5A8R8(SB), NOSPLIT, $0-48 20840 MOVSS alpha+0(FP), X0 20841 MOVQ xs+8(FP), AX 20842 MOVQ incx+16(FP), CX 20843 MOVQ ys+24(FP), DX 20844 MOVQ incy+32(FP), BX 20845 MOVQ n+40(FP), SI 20846 XORQ DI, DI 20847 XORQ R8, R8 20848 JMP check_limit_unroll 20849 PCALIGN $0x08 20850 20851 loop_unroll: 20852 MOVSS (AX)(DI*4), X1 20853 ADDQ CX, DI 20854 MOVSS (AX)(DI*4), X2 20855 ADDQ CX, DI 20856 MOVSS (AX)(DI*4), X3 20857 ADDQ CX, DI 20858 MOVSS (AX)(DI*4), X4 20859 ADDQ CX, DI 20860 MOVSS (AX)(DI*4), X5 20861 ADDQ CX, DI 20862 MOVSS (AX)(DI*4), X6 20863 ADDQ CX, DI 20864 MOVSS (AX)(DI*4), X7 20865 ADDQ CX, DI 20866 MOVSS (AX)(DI*4), X8 20867 ADDQ CX, DI 20868 MULSS X0, X1 20869 MULSS X0, X2 20870 MULSS X0, X3 20871 MULSS X0, X4 20872 MULSS X0, X5 20873 MULSS X0, X6 20874 MULSS X0, X7 20875 MULSS X0, X8 20876 ADDSS (DX)(R8*4), X1 20877 MOVSS X1, (DX)(R8*4) 20878 ADDQ BX, R8 20879 ADDSS (DX)(R8*4), X2 20880 MOVSS X2, (DX)(R8*4) 20881 ADDQ BX, R8 20882 ADDSS (DX)(R8*4), X3 20883 MOVSS X3, (DX)(R8*4) 20884 ADDQ BX, R8 20885 ADDSS (DX)(R8*4), X4 20886 MOVSS X4, (DX)(R8*4) 20887 ADDQ BX, R8 20888 ADDSS (DX)(R8*4), X5 20889 MOVSS X5, (DX)(R8*4) 20890 ADDQ BX, R8 20891 ADDSS (DX)(R8*4), X6 20892 MOVSS X6, (DX)(R8*4) 20893 ADDQ BX, R8 20894 ADDSS (DX)(R8*4), X7 20895 MOVSS X7, (DX)(R8*4) 20896 ADDQ BX, R8 20897 ADDSS (DX)(R8*4), X8 20898 MOVSS X8, (DX)(R8*4) 20899 ADDQ BX, R8 20900 SUBQ $0x08, SI 20901 20902 check_limit_unroll: 20903 CMPQ SI, $0x08 20904 JHS loop_unroll 20905 JMP check_limit 20906 20907 loop: 20908 MOVSS (AX)(DI*4), X1 20909 MULSS X0, X1 20910 ADDSS (DX)(R8*4), X1 20911 MOVSS X1, (DX)(R8*4) 20912 DECQ SI 20913 ADDQ CX, DI 20914 ADDQ BX, R8 20915 20916 check_limit: 20917 CMPQ SI, $0x00 20918 JHI loop 20919 RET 20920 20921 // func AmdAxpyUnsafeXInterleave_V0A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 20922 // Requires: SSE 20923 TEXT ·AmdAxpyUnsafeXInterleave_V0A9R8(SB), NOSPLIT, $0-48 20924 MOVSS alpha+0(FP), X0 20925 MOVQ xs+8(FP), AX 20926 MOVQ incx+16(FP), CX 20927 MOVQ ys+24(FP), DX 20928 MOVQ incy+32(FP), BX 20929 MOVQ n+40(FP), SI 20930 XORQ DI, DI 20931 XORQ R8, R8 20932 JMP check_limit_unroll 20933 PCALIGN $0x08 20934 NOP 20935 20936 loop_unroll: 20937 MOVSS (AX)(DI*4), X1 20938 ADDQ CX, DI 20939 MOVSS (AX)(DI*4), X2 20940 ADDQ CX, DI 20941 MOVSS (AX)(DI*4), X3 20942 ADDQ CX, DI 20943 MOVSS (AX)(DI*4), X4 20944 ADDQ CX, DI 20945 MOVSS (AX)(DI*4), X5 20946 ADDQ CX, DI 20947 MOVSS (AX)(DI*4), X6 20948 ADDQ CX, DI 20949 MOVSS (AX)(DI*4), X7 
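	// In the R8 kernels this block gathers eight strided elements of xs into
	// X1 through X8, stepping the element index DI by incx after each load; the
	// eight MULSS that follow scale them by alpha (X0) before the results are
	// folded into ys.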
20950 ADDQ CX, DI 20951 MOVSS (AX)(DI*4), X8 20952 ADDQ CX, DI 20953 MULSS X0, X1 20954 MULSS X0, X2 20955 MULSS X0, X3 20956 MULSS X0, X4 20957 MULSS X0, X5 20958 MULSS X0, X6 20959 MULSS X0, X7 20960 MULSS X0, X8 20961 ADDSS (DX)(R8*4), X1 20962 MOVSS X1, (DX)(R8*4) 20963 ADDQ BX, R8 20964 ADDSS (DX)(R8*4), X2 20965 MOVSS X2, (DX)(R8*4) 20966 ADDQ BX, R8 20967 ADDSS (DX)(R8*4), X3 20968 MOVSS X3, (DX)(R8*4) 20969 ADDQ BX, R8 20970 ADDSS (DX)(R8*4), X4 20971 MOVSS X4, (DX)(R8*4) 20972 ADDQ BX, R8 20973 ADDSS (DX)(R8*4), X5 20974 MOVSS X5, (DX)(R8*4) 20975 ADDQ BX, R8 20976 ADDSS (DX)(R8*4), X6 20977 MOVSS X6, (DX)(R8*4) 20978 ADDQ BX, R8 20979 ADDSS (DX)(R8*4), X7 20980 MOVSS X7, (DX)(R8*4) 20981 ADDQ BX, R8 20982 ADDSS (DX)(R8*4), X8 20983 MOVSS X8, (DX)(R8*4) 20984 ADDQ BX, R8 20985 SUBQ $0x08, SI 20986 20987 check_limit_unroll: 20988 CMPQ SI, $0x08 20989 JHS loop_unroll 20990 JMP check_limit 20991 20992 loop: 20993 MOVSS (AX)(DI*4), X1 20994 MULSS X0, X1 20995 ADDSS (DX)(R8*4), X1 20996 MOVSS X1, (DX)(R8*4) 20997 DECQ SI 20998 ADDQ CX, DI 20999 ADDQ BX, R8 21000 21001 check_limit: 21002 CMPQ SI, $0x00 21003 JHI loop 21004 RET 21005 21006 // func AmdAxpyUnsafeXInterleave_V1A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21007 // Requires: SSE 21008 TEXT ·AmdAxpyUnsafeXInterleave_V1A9R8(SB), NOSPLIT, $0-48 21009 MOVSS alpha+0(FP), X0 21010 MOVQ xs+8(FP), AX 21011 MOVQ incx+16(FP), CX 21012 MOVQ ys+24(FP), DX 21013 MOVQ incy+32(FP), BX 21014 MOVQ n+40(FP), SI 21015 XORQ DI, DI 21016 XORQ R8, R8 21017 JMP check_limit_unroll 21018 PCALIGN $0x08 21019 NOP 21020 21021 loop_unroll: 21022 MOVSS (AX)(DI*4), X1 21023 ADDQ CX, DI 21024 MOVSS (AX)(DI*4), X2 21025 ADDQ CX, DI 21026 MOVSS (AX)(DI*4), X3 21027 ADDQ CX, DI 21028 MOVSS (AX)(DI*4), X4 21029 ADDQ CX, DI 21030 MOVSS (AX)(DI*4), X5 21031 ADDQ CX, DI 21032 MOVSS (AX)(DI*4), X6 21033 ADDQ CX, DI 21034 MOVSS (AX)(DI*4), X7 21035 ADDQ CX, DI 21036 MOVSS (AX)(DI*4), X8 21037 ADDQ CX, DI 21038 MULSS X0, X1 21039 MULSS X0, X2 21040 MULSS X0, X3 21041 MULSS X0, X4 21042 MULSS X0, X5 21043 MULSS X0, X6 21044 MULSS X0, X7 21045 MULSS X0, X8 21046 ADDSS (DX)(R8*4), X1 21047 MOVSS X1, (DX)(R8*4) 21048 ADDQ BX, R8 21049 ADDSS (DX)(R8*4), X2 21050 MOVSS X2, (DX)(R8*4) 21051 ADDQ BX, R8 21052 ADDSS (DX)(R8*4), X3 21053 MOVSS X3, (DX)(R8*4) 21054 ADDQ BX, R8 21055 ADDSS (DX)(R8*4), X4 21056 MOVSS X4, (DX)(R8*4) 21057 ADDQ BX, R8 21058 ADDSS (DX)(R8*4), X5 21059 MOVSS X5, (DX)(R8*4) 21060 ADDQ BX, R8 21061 ADDSS (DX)(R8*4), X6 21062 MOVSS X6, (DX)(R8*4) 21063 ADDQ BX, R8 21064 ADDSS (DX)(R8*4), X7 21065 MOVSS X7, (DX)(R8*4) 21066 ADDQ BX, R8 21067 ADDSS (DX)(R8*4), X8 21068 MOVSS X8, (DX)(R8*4) 21069 ADDQ BX, R8 21070 SUBQ $0x08, SI 21071 21072 check_limit_unroll: 21073 CMPQ SI, $0x08 21074 JHS loop_unroll 21075 JMP check_limit 21076 21077 loop: 21078 MOVSS (AX)(DI*4), X1 21079 MULSS X0, X1 21080 ADDSS (DX)(R8*4), X1 21081 MOVSS X1, (DX)(R8*4) 21082 DECQ SI 21083 ADDQ CX, DI 21084 ADDQ BX, R8 21085 21086 check_limit: 21087 CMPQ SI, $0x00 21088 JHI loop 21089 RET 21090 21091 // func AmdAxpyUnsafeXInterleave_V2A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21092 // Requires: SSE 21093 TEXT ·AmdAxpyUnsafeXInterleave_V2A9R8(SB), NOSPLIT, $0-48 21094 MOVSS alpha+0(FP), X0 21095 MOVQ xs+8(FP), AX 21096 MOVQ incx+16(FP), CX 21097 MOVQ ys+24(FP), DX 21098 MOVQ incy+32(FP), BX 21099 MOVQ n+40(FP), SI 21100 XORQ DI, DI 21101 XORQ R8, R8 21102 JMP check_limit_unroll 21103 PCALIGN $0x08 21104 
NOP 21105 21106 loop_unroll: 21107 MOVSS (AX)(DI*4), X1 21108 ADDQ CX, DI 21109 MOVSS (AX)(DI*4), X2 21110 ADDQ CX, DI 21111 MOVSS (AX)(DI*4), X3 21112 ADDQ CX, DI 21113 MOVSS (AX)(DI*4), X4 21114 ADDQ CX, DI 21115 MOVSS (AX)(DI*4), X5 21116 ADDQ CX, DI 21117 MOVSS (AX)(DI*4), X6 21118 ADDQ CX, DI 21119 MOVSS (AX)(DI*4), X7 21120 ADDQ CX, DI 21121 MOVSS (AX)(DI*4), X8 21122 ADDQ CX, DI 21123 MULSS X0, X1 21124 MULSS X0, X2 21125 MULSS X0, X3 21126 MULSS X0, X4 21127 MULSS X0, X5 21128 MULSS X0, X6 21129 MULSS X0, X7 21130 MULSS X0, X8 21131 ADDSS (DX)(R8*4), X1 21132 MOVSS X1, (DX)(R8*4) 21133 ADDQ BX, R8 21134 ADDSS (DX)(R8*4), X2 21135 MOVSS X2, (DX)(R8*4) 21136 ADDQ BX, R8 21137 ADDSS (DX)(R8*4), X3 21138 MOVSS X3, (DX)(R8*4) 21139 ADDQ BX, R8 21140 ADDSS (DX)(R8*4), X4 21141 MOVSS X4, (DX)(R8*4) 21142 ADDQ BX, R8 21143 ADDSS (DX)(R8*4), X5 21144 MOVSS X5, (DX)(R8*4) 21145 ADDQ BX, R8 21146 ADDSS (DX)(R8*4), X6 21147 MOVSS X6, (DX)(R8*4) 21148 ADDQ BX, R8 21149 ADDSS (DX)(R8*4), X7 21150 MOVSS X7, (DX)(R8*4) 21151 ADDQ BX, R8 21152 ADDSS (DX)(R8*4), X8 21153 MOVSS X8, (DX)(R8*4) 21154 ADDQ BX, R8 21155 SUBQ $0x08, SI 21156 21157 check_limit_unroll: 21158 CMPQ SI, $0x08 21159 JHS loop_unroll 21160 JMP check_limit 21161 21162 loop: 21163 MOVSS (AX)(DI*4), X1 21164 MULSS X0, X1 21165 ADDSS (DX)(R8*4), X1 21166 MOVSS X1, (DX)(R8*4) 21167 DECQ SI 21168 ADDQ CX, DI 21169 ADDQ BX, R8 21170 21171 check_limit: 21172 CMPQ SI, $0x00 21173 JHI loop 21174 RET 21175 21176 // func AmdAxpyUnsafeXInterleave_V3A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21177 // Requires: SSE 21178 TEXT ·AmdAxpyUnsafeXInterleave_V3A9R8(SB), NOSPLIT, $0-48 21179 MOVSS alpha+0(FP), X0 21180 MOVQ xs+8(FP), AX 21181 MOVQ incx+16(FP), CX 21182 MOVQ ys+24(FP), DX 21183 MOVQ incy+32(FP), BX 21184 MOVQ n+40(FP), SI 21185 XORQ DI, DI 21186 XORQ R8, R8 21187 JMP check_limit_unroll 21188 PCALIGN $0x08 21189 NOP 21190 21191 loop_unroll: 21192 MOVSS (AX)(DI*4), X1 21193 ADDQ CX, DI 21194 MOVSS (AX)(DI*4), X2 21195 ADDQ CX, DI 21196 MOVSS (AX)(DI*4), X3 21197 ADDQ CX, DI 21198 MOVSS (AX)(DI*4), X4 21199 ADDQ CX, DI 21200 MOVSS (AX)(DI*4), X5 21201 ADDQ CX, DI 21202 MOVSS (AX)(DI*4), X6 21203 ADDQ CX, DI 21204 MOVSS (AX)(DI*4), X7 21205 ADDQ CX, DI 21206 MOVSS (AX)(DI*4), X8 21207 ADDQ CX, DI 21208 MULSS X0, X1 21209 MULSS X0, X2 21210 MULSS X0, X3 21211 MULSS X0, X4 21212 MULSS X0, X5 21213 MULSS X0, X6 21214 MULSS X0, X7 21215 MULSS X0, X8 21216 ADDSS (DX)(R8*4), X1 21217 MOVSS X1, (DX)(R8*4) 21218 ADDQ BX, R8 21219 ADDSS (DX)(R8*4), X2 21220 MOVSS X2, (DX)(R8*4) 21221 ADDQ BX, R8 21222 ADDSS (DX)(R8*4), X3 21223 MOVSS X3, (DX)(R8*4) 21224 ADDQ BX, R8 21225 ADDSS (DX)(R8*4), X4 21226 MOVSS X4, (DX)(R8*4) 21227 ADDQ BX, R8 21228 ADDSS (DX)(R8*4), X5 21229 MOVSS X5, (DX)(R8*4) 21230 ADDQ BX, R8 21231 ADDSS (DX)(R8*4), X6 21232 MOVSS X6, (DX)(R8*4) 21233 ADDQ BX, R8 21234 ADDSS (DX)(R8*4), X7 21235 MOVSS X7, (DX)(R8*4) 21236 ADDQ BX, R8 21237 ADDSS (DX)(R8*4), X8 21238 MOVSS X8, (DX)(R8*4) 21239 ADDQ BX, R8 21240 SUBQ $0x08, SI 21241 21242 check_limit_unroll: 21243 CMPQ SI, $0x08 21244 JHS loop_unroll 21245 JMP check_limit 21246 21247 loop: 21248 MOVSS (AX)(DI*4), X1 21249 MULSS X0, X1 21250 ADDSS (DX)(R8*4), X1 21251 MOVSS X1, (DX)(R8*4) 21252 DECQ SI 21253 ADDQ CX, DI 21254 ADDQ BX, R8 21255 21256 check_limit: 21257 CMPQ SI, $0x00 21258 JHI loop 21259 RET 21260 21261 // func AmdAxpyUnsafeXInterleave_V4A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21262 // 
Requires: SSE 21263 TEXT ·AmdAxpyUnsafeXInterleave_V4A9R8(SB), NOSPLIT, $0-48 21264 MOVSS alpha+0(FP), X0 21265 MOVQ xs+8(FP), AX 21266 MOVQ incx+16(FP), CX 21267 MOVQ ys+24(FP), DX 21268 MOVQ incy+32(FP), BX 21269 MOVQ n+40(FP), SI 21270 XORQ DI, DI 21271 XORQ R8, R8 21272 JMP check_limit_unroll 21273 PCALIGN $0x08 21274 NOP 21275 21276 loop_unroll: 21277 MOVSS (AX)(DI*4), X1 21278 ADDQ CX, DI 21279 MOVSS (AX)(DI*4), X2 21280 ADDQ CX, DI 21281 MOVSS (AX)(DI*4), X3 21282 ADDQ CX, DI 21283 MOVSS (AX)(DI*4), X4 21284 ADDQ CX, DI 21285 MOVSS (AX)(DI*4), X5 21286 ADDQ CX, DI 21287 MOVSS (AX)(DI*4), X6 21288 ADDQ CX, DI 21289 MOVSS (AX)(DI*4), X7 21290 ADDQ CX, DI 21291 MOVSS (AX)(DI*4), X8 21292 ADDQ CX, DI 21293 MULSS X0, X1 21294 MULSS X0, X2 21295 MULSS X0, X3 21296 MULSS X0, X4 21297 MULSS X0, X5 21298 MULSS X0, X6 21299 MULSS X0, X7 21300 MULSS X0, X8 21301 ADDSS (DX)(R8*4), X1 21302 MOVSS X1, (DX)(R8*4) 21303 ADDQ BX, R8 21304 ADDSS (DX)(R8*4), X2 21305 MOVSS X2, (DX)(R8*4) 21306 ADDQ BX, R8 21307 ADDSS (DX)(R8*4), X3 21308 MOVSS X3, (DX)(R8*4) 21309 ADDQ BX, R8 21310 ADDSS (DX)(R8*4), X4 21311 MOVSS X4, (DX)(R8*4) 21312 ADDQ BX, R8 21313 ADDSS (DX)(R8*4), X5 21314 MOVSS X5, (DX)(R8*4) 21315 ADDQ BX, R8 21316 ADDSS (DX)(R8*4), X6 21317 MOVSS X6, (DX)(R8*4) 21318 ADDQ BX, R8 21319 ADDSS (DX)(R8*4), X7 21320 MOVSS X7, (DX)(R8*4) 21321 ADDQ BX, R8 21322 ADDSS (DX)(R8*4), X8 21323 MOVSS X8, (DX)(R8*4) 21324 ADDQ BX, R8 21325 SUBQ $0x08, SI 21326 21327 check_limit_unroll: 21328 CMPQ SI, $0x08 21329 JHS loop_unroll 21330 JMP check_limit 21331 21332 loop: 21333 MOVSS (AX)(DI*4), X1 21334 MULSS X0, X1 21335 ADDSS (DX)(R8*4), X1 21336 MOVSS X1, (DX)(R8*4) 21337 DECQ SI 21338 ADDQ CX, DI 21339 ADDQ BX, R8 21340 21341 check_limit: 21342 CMPQ SI, $0x00 21343 JHI loop 21344 RET 21345 21346 // func AmdAxpyUnsafeXInterleave_V5A9R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21347 // Requires: SSE 21348 TEXT ·AmdAxpyUnsafeXInterleave_V5A9R8(SB), NOSPLIT, $0-48 21349 MOVSS alpha+0(FP), X0 21350 MOVQ xs+8(FP), AX 21351 MOVQ incx+16(FP), CX 21352 MOVQ ys+24(FP), DX 21353 MOVQ incy+32(FP), BX 21354 MOVQ n+40(FP), SI 21355 XORQ DI, DI 21356 XORQ R8, R8 21357 JMP check_limit_unroll 21358 PCALIGN $0x08 21359 NOP 21360 21361 loop_unroll: 21362 MOVSS (AX)(DI*4), X1 21363 ADDQ CX, DI 21364 MOVSS (AX)(DI*4), X2 21365 ADDQ CX, DI 21366 MOVSS (AX)(DI*4), X3 21367 ADDQ CX, DI 21368 MOVSS (AX)(DI*4), X4 21369 ADDQ CX, DI 21370 MOVSS (AX)(DI*4), X5 21371 ADDQ CX, DI 21372 MOVSS (AX)(DI*4), X6 21373 ADDQ CX, DI 21374 MOVSS (AX)(DI*4), X7 21375 ADDQ CX, DI 21376 MOVSS (AX)(DI*4), X8 21377 ADDQ CX, DI 21378 MULSS X0, X1 21379 MULSS X0, X2 21380 MULSS X0, X3 21381 MULSS X0, X4 21382 MULSS X0, X5 21383 MULSS X0, X6 21384 MULSS X0, X7 21385 MULSS X0, X8 21386 ADDSS (DX)(R8*4), X1 21387 MOVSS X1, (DX)(R8*4) 21388 ADDQ BX, R8 21389 ADDSS (DX)(R8*4), X2 21390 MOVSS X2, (DX)(R8*4) 21391 ADDQ BX, R8 21392 ADDSS (DX)(R8*4), X3 21393 MOVSS X3, (DX)(R8*4) 21394 ADDQ BX, R8 21395 ADDSS (DX)(R8*4), X4 21396 MOVSS X4, (DX)(R8*4) 21397 ADDQ BX, R8 21398 ADDSS (DX)(R8*4), X5 21399 MOVSS X5, (DX)(R8*4) 21400 ADDQ BX, R8 21401 ADDSS (DX)(R8*4), X6 21402 MOVSS X6, (DX)(R8*4) 21403 ADDQ BX, R8 21404 ADDSS (DX)(R8*4), X7 21405 MOVSS X7, (DX)(R8*4) 21406 ADDQ BX, R8 21407 ADDSS (DX)(R8*4), X8 21408 MOVSS X8, (DX)(R8*4) 21409 ADDQ BX, R8 21410 SUBQ $0x08, SI 21411 21412 check_limit_unroll: 21413 CMPQ SI, $0x08 21414 JHS loop_unroll 21415 JMP check_limit 21416 21417 loop: 21418 MOVSS (AX)(DI*4), X1 21419 MULSS 
X0, X1 21420 ADDSS (DX)(R8*4), X1 21421 MOVSS X1, (DX)(R8*4) 21422 DECQ SI 21423 ADDQ CX, DI 21424 ADDQ BX, R8 21425 21426 check_limit: 21427 CMPQ SI, $0x00 21428 JHI loop 21429 RET 21430 21431 // func AmdAxpyUnsafeXInterleave_V0A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21432 // Requires: SSE 21433 TEXT ·AmdAxpyUnsafeXInterleave_V0A10R8(SB), NOSPLIT, $0-48 21434 MOVSS alpha+0(FP), X0 21435 MOVQ xs+8(FP), AX 21436 MOVQ incx+16(FP), CX 21437 MOVQ ys+24(FP), DX 21438 MOVQ incy+32(FP), BX 21439 MOVQ n+40(FP), SI 21440 XORQ DI, DI 21441 XORQ R8, R8 21442 JMP check_limit_unroll 21443 PCALIGN $0x08 21444 NOP 21445 NOP 21446 21447 loop_unroll: 21448 MOVSS (AX)(DI*4), X1 21449 ADDQ CX, DI 21450 MOVSS (AX)(DI*4), X2 21451 ADDQ CX, DI 21452 MOVSS (AX)(DI*4), X3 21453 ADDQ CX, DI 21454 MOVSS (AX)(DI*4), X4 21455 ADDQ CX, DI 21456 MOVSS (AX)(DI*4), X5 21457 ADDQ CX, DI 21458 MOVSS (AX)(DI*4), X6 21459 ADDQ CX, DI 21460 MOVSS (AX)(DI*4), X7 21461 ADDQ CX, DI 21462 MOVSS (AX)(DI*4), X8 21463 ADDQ CX, DI 21464 MULSS X0, X1 21465 MULSS X0, X2 21466 MULSS X0, X3 21467 MULSS X0, X4 21468 MULSS X0, X5 21469 MULSS X0, X6 21470 MULSS X0, X7 21471 MULSS X0, X8 21472 ADDSS (DX)(R8*4), X1 21473 MOVSS X1, (DX)(R8*4) 21474 ADDQ BX, R8 21475 ADDSS (DX)(R8*4), X2 21476 MOVSS X2, (DX)(R8*4) 21477 ADDQ BX, R8 21478 ADDSS (DX)(R8*4), X3 21479 MOVSS X3, (DX)(R8*4) 21480 ADDQ BX, R8 21481 ADDSS (DX)(R8*4), X4 21482 MOVSS X4, (DX)(R8*4) 21483 ADDQ BX, R8 21484 ADDSS (DX)(R8*4), X5 21485 MOVSS X5, (DX)(R8*4) 21486 ADDQ BX, R8 21487 ADDSS (DX)(R8*4), X6 21488 MOVSS X6, (DX)(R8*4) 21489 ADDQ BX, R8 21490 ADDSS (DX)(R8*4), X7 21491 MOVSS X7, (DX)(R8*4) 21492 ADDQ BX, R8 21493 ADDSS (DX)(R8*4), X8 21494 MOVSS X8, (DX)(R8*4) 21495 ADDQ BX, R8 21496 SUBQ $0x08, SI 21497 21498 check_limit_unroll: 21499 CMPQ SI, $0x08 21500 JHS loop_unroll 21501 JMP check_limit 21502 21503 loop: 21504 MOVSS (AX)(DI*4), X1 21505 MULSS X0, X1 21506 ADDSS (DX)(R8*4), X1 21507 MOVSS X1, (DX)(R8*4) 21508 DECQ SI 21509 ADDQ CX, DI 21510 ADDQ BX, R8 21511 21512 check_limit: 21513 CMPQ SI, $0x00 21514 JHI loop 21515 RET 21516 21517 // func AmdAxpyUnsafeXInterleave_V1A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21518 // Requires: SSE 21519 TEXT ·AmdAxpyUnsafeXInterleave_V1A10R8(SB), NOSPLIT, $0-48 21520 MOVSS alpha+0(FP), X0 21521 MOVQ xs+8(FP), AX 21522 MOVQ incx+16(FP), CX 21523 MOVQ ys+24(FP), DX 21524 MOVQ incy+32(FP), BX 21525 MOVQ n+40(FP), SI 21526 XORQ DI, DI 21527 XORQ R8, R8 21528 JMP check_limit_unroll 21529 PCALIGN $0x08 21530 NOP 21531 NOP 21532 21533 loop_unroll: 21534 MOVSS (AX)(DI*4), X1 21535 ADDQ CX, DI 21536 MOVSS (AX)(DI*4), X2 21537 ADDQ CX, DI 21538 MOVSS (AX)(DI*4), X3 21539 ADDQ CX, DI 21540 MOVSS (AX)(DI*4), X4 21541 ADDQ CX, DI 21542 MOVSS (AX)(DI*4), X5 21543 ADDQ CX, DI 21544 MOVSS (AX)(DI*4), X6 21545 ADDQ CX, DI 21546 MOVSS (AX)(DI*4), X7 21547 ADDQ CX, DI 21548 MOVSS (AX)(DI*4), X8 21549 ADDQ CX, DI 21550 MULSS X0, X1 21551 MULSS X0, X2 21552 MULSS X0, X3 21553 MULSS X0, X4 21554 MULSS X0, X5 21555 MULSS X0, X6 21556 MULSS X0, X7 21557 MULSS X0, X8 21558 ADDSS (DX)(R8*4), X1 21559 MOVSS X1, (DX)(R8*4) 21560 ADDQ BX, R8 21561 ADDSS (DX)(R8*4), X2 21562 MOVSS X2, (DX)(R8*4) 21563 ADDQ BX, R8 21564 ADDSS (DX)(R8*4), X3 21565 MOVSS X3, (DX)(R8*4) 21566 ADDQ BX, R8 21567 ADDSS (DX)(R8*4), X4 21568 MOVSS X4, (DX)(R8*4) 21569 ADDQ BX, R8 21570 ADDSS (DX)(R8*4), X5 21571 MOVSS X5, (DX)(R8*4) 21572 ADDQ BX, R8 21573 ADDSS (DX)(R8*4), X6 21574 MOVSS X6, 
(DX)(R8*4) 21575 ADDQ BX, R8 21576 ADDSS (DX)(R8*4), X7 21577 MOVSS X7, (DX)(R8*4) 21578 ADDQ BX, R8 21579 ADDSS (DX)(R8*4), X8 21580 MOVSS X8, (DX)(R8*4) 21581 ADDQ BX, R8 21582 SUBQ $0x08, SI 21583 21584 check_limit_unroll: 21585 CMPQ SI, $0x08 21586 JHS loop_unroll 21587 JMP check_limit 21588 21589 loop: 21590 MOVSS (AX)(DI*4), X1 21591 MULSS X0, X1 21592 ADDSS (DX)(R8*4), X1 21593 MOVSS X1, (DX)(R8*4) 21594 DECQ SI 21595 ADDQ CX, DI 21596 ADDQ BX, R8 21597 21598 check_limit: 21599 CMPQ SI, $0x00 21600 JHI loop 21601 RET 21602 21603 // func AmdAxpyUnsafeXInterleave_V2A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21604 // Requires: SSE 21605 TEXT ·AmdAxpyUnsafeXInterleave_V2A10R8(SB), NOSPLIT, $0-48 21606 MOVSS alpha+0(FP), X0 21607 MOVQ xs+8(FP), AX 21608 MOVQ incx+16(FP), CX 21609 MOVQ ys+24(FP), DX 21610 MOVQ incy+32(FP), BX 21611 MOVQ n+40(FP), SI 21612 XORQ DI, DI 21613 XORQ R8, R8 21614 JMP check_limit_unroll 21615 PCALIGN $0x08 21616 NOP 21617 NOP 21618 21619 loop_unroll: 21620 MOVSS (AX)(DI*4), X1 21621 ADDQ CX, DI 21622 MOVSS (AX)(DI*4), X2 21623 ADDQ CX, DI 21624 MOVSS (AX)(DI*4), X3 21625 ADDQ CX, DI 21626 MOVSS (AX)(DI*4), X4 21627 ADDQ CX, DI 21628 MOVSS (AX)(DI*4), X5 21629 ADDQ CX, DI 21630 MOVSS (AX)(DI*4), X6 21631 ADDQ CX, DI 21632 MOVSS (AX)(DI*4), X7 21633 ADDQ CX, DI 21634 MOVSS (AX)(DI*4), X8 21635 ADDQ CX, DI 21636 MULSS X0, X1 21637 MULSS X0, X2 21638 MULSS X0, X3 21639 MULSS X0, X4 21640 MULSS X0, X5 21641 MULSS X0, X6 21642 MULSS X0, X7 21643 MULSS X0, X8 21644 ADDSS (DX)(R8*4), X1 21645 MOVSS X1, (DX)(R8*4) 21646 ADDQ BX, R8 21647 ADDSS (DX)(R8*4), X2 21648 MOVSS X2, (DX)(R8*4) 21649 ADDQ BX, R8 21650 ADDSS (DX)(R8*4), X3 21651 MOVSS X3, (DX)(R8*4) 21652 ADDQ BX, R8 21653 ADDSS (DX)(R8*4), X4 21654 MOVSS X4, (DX)(R8*4) 21655 ADDQ BX, R8 21656 ADDSS (DX)(R8*4), X5 21657 MOVSS X5, (DX)(R8*4) 21658 ADDQ BX, R8 21659 ADDSS (DX)(R8*4), X6 21660 MOVSS X6, (DX)(R8*4) 21661 ADDQ BX, R8 21662 ADDSS (DX)(R8*4), X7 21663 MOVSS X7, (DX)(R8*4) 21664 ADDQ BX, R8 21665 ADDSS (DX)(R8*4), X8 21666 MOVSS X8, (DX)(R8*4) 21667 ADDQ BX, R8 21668 SUBQ $0x08, SI 21669 21670 check_limit_unroll: 21671 CMPQ SI, $0x08 21672 JHS loop_unroll 21673 JMP check_limit 21674 21675 loop: 21676 MOVSS (AX)(DI*4), X1 21677 MULSS X0, X1 21678 ADDSS (DX)(R8*4), X1 21679 MOVSS X1, (DX)(R8*4) 21680 DECQ SI 21681 ADDQ CX, DI 21682 ADDQ BX, R8 21683 21684 check_limit: 21685 CMPQ SI, $0x00 21686 JHI loop 21687 RET 21688 21689 // func AmdAxpyUnsafeXInterleave_V3A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21690 // Requires: SSE 21691 TEXT ·AmdAxpyUnsafeXInterleave_V3A10R8(SB), NOSPLIT, $0-48 21692 MOVSS alpha+0(FP), X0 21693 MOVQ xs+8(FP), AX 21694 MOVQ incx+16(FP), CX 21695 MOVQ ys+24(FP), DX 21696 MOVQ incy+32(FP), BX 21697 MOVQ n+40(FP), SI 21698 XORQ DI, DI 21699 XORQ R8, R8 21700 JMP check_limit_unroll 21701 PCALIGN $0x08 21702 NOP 21703 NOP 21704 21705 loop_unroll: 21706 MOVSS (AX)(DI*4), X1 21707 ADDQ CX, DI 21708 MOVSS (AX)(DI*4), X2 21709 ADDQ CX, DI 21710 MOVSS (AX)(DI*4), X3 21711 ADDQ CX, DI 21712 MOVSS (AX)(DI*4), X4 21713 ADDQ CX, DI 21714 MOVSS (AX)(DI*4), X5 21715 ADDQ CX, DI 21716 MOVSS (AX)(DI*4), X6 21717 ADDQ CX, DI 21718 MOVSS (AX)(DI*4), X7 21719 ADDQ CX, DI 21720 MOVSS (AX)(DI*4), X8 21721 ADDQ CX, DI 21722 MULSS X0, X1 21723 MULSS X0, X2 21724 MULSS X0, X3 21725 MULSS X0, X4 21726 MULSS X0, X5 21727 MULSS X0, X6 21728 MULSS X0, X7 21729 MULSS X0, X8 21730 ADDSS (DX)(R8*4), X1 21731 MOVSS X1, (DX)(R8*4) 
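	// Each ADDSS/MOVSS pair in this store sequence updates one element of ys in
	// place (adding the alpha*xs[i] product already held in the register), and the
	// ADDQ BX, R8 that follows advances the ys index by incy to the next strided
	// element.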
21732 ADDQ BX, R8 21733 ADDSS (DX)(R8*4), X2 21734 MOVSS X2, (DX)(R8*4) 21735 ADDQ BX, R8 21736 ADDSS (DX)(R8*4), X3 21737 MOVSS X3, (DX)(R8*4) 21738 ADDQ BX, R8 21739 ADDSS (DX)(R8*4), X4 21740 MOVSS X4, (DX)(R8*4) 21741 ADDQ BX, R8 21742 ADDSS (DX)(R8*4), X5 21743 MOVSS X5, (DX)(R8*4) 21744 ADDQ BX, R8 21745 ADDSS (DX)(R8*4), X6 21746 MOVSS X6, (DX)(R8*4) 21747 ADDQ BX, R8 21748 ADDSS (DX)(R8*4), X7 21749 MOVSS X7, (DX)(R8*4) 21750 ADDQ BX, R8 21751 ADDSS (DX)(R8*4), X8 21752 MOVSS X8, (DX)(R8*4) 21753 ADDQ BX, R8 21754 SUBQ $0x08, SI 21755 21756 check_limit_unroll: 21757 CMPQ SI, $0x08 21758 JHS loop_unroll 21759 JMP check_limit 21760 21761 loop: 21762 MOVSS (AX)(DI*4), X1 21763 MULSS X0, X1 21764 ADDSS (DX)(R8*4), X1 21765 MOVSS X1, (DX)(R8*4) 21766 DECQ SI 21767 ADDQ CX, DI 21768 ADDQ BX, R8 21769 21770 check_limit: 21771 CMPQ SI, $0x00 21772 JHI loop 21773 RET 21774 21775 // func AmdAxpyUnsafeXInterleave_V4A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21776 // Requires: SSE 21777 TEXT ·AmdAxpyUnsafeXInterleave_V4A10R8(SB), NOSPLIT, $0-48 21778 MOVSS alpha+0(FP), X0 21779 MOVQ xs+8(FP), AX 21780 MOVQ incx+16(FP), CX 21781 MOVQ ys+24(FP), DX 21782 MOVQ incy+32(FP), BX 21783 MOVQ n+40(FP), SI 21784 XORQ DI, DI 21785 XORQ R8, R8 21786 JMP check_limit_unroll 21787 PCALIGN $0x08 21788 NOP 21789 NOP 21790 21791 loop_unroll: 21792 MOVSS (AX)(DI*4), X1 21793 ADDQ CX, DI 21794 MOVSS (AX)(DI*4), X2 21795 ADDQ CX, DI 21796 MOVSS (AX)(DI*4), X3 21797 ADDQ CX, DI 21798 MOVSS (AX)(DI*4), X4 21799 ADDQ CX, DI 21800 MOVSS (AX)(DI*4), X5 21801 ADDQ CX, DI 21802 MOVSS (AX)(DI*4), X6 21803 ADDQ CX, DI 21804 MOVSS (AX)(DI*4), X7 21805 ADDQ CX, DI 21806 MOVSS (AX)(DI*4), X8 21807 ADDQ CX, DI 21808 MULSS X0, X1 21809 MULSS X0, X2 21810 MULSS X0, X3 21811 MULSS X0, X4 21812 MULSS X0, X5 21813 MULSS X0, X6 21814 MULSS X0, X7 21815 MULSS X0, X8 21816 ADDSS (DX)(R8*4), X1 21817 MOVSS X1, (DX)(R8*4) 21818 ADDQ BX, R8 21819 ADDSS (DX)(R8*4), X2 21820 MOVSS X2, (DX)(R8*4) 21821 ADDQ BX, R8 21822 ADDSS (DX)(R8*4), X3 21823 MOVSS X3, (DX)(R8*4) 21824 ADDQ BX, R8 21825 ADDSS (DX)(R8*4), X4 21826 MOVSS X4, (DX)(R8*4) 21827 ADDQ BX, R8 21828 ADDSS (DX)(R8*4), X5 21829 MOVSS X5, (DX)(R8*4) 21830 ADDQ BX, R8 21831 ADDSS (DX)(R8*4), X6 21832 MOVSS X6, (DX)(R8*4) 21833 ADDQ BX, R8 21834 ADDSS (DX)(R8*4), X7 21835 MOVSS X7, (DX)(R8*4) 21836 ADDQ BX, R8 21837 ADDSS (DX)(R8*4), X8 21838 MOVSS X8, (DX)(R8*4) 21839 ADDQ BX, R8 21840 SUBQ $0x08, SI 21841 21842 check_limit_unroll: 21843 CMPQ SI, $0x08 21844 JHS loop_unroll 21845 JMP check_limit 21846 21847 loop: 21848 MOVSS (AX)(DI*4), X1 21849 MULSS X0, X1 21850 ADDSS (DX)(R8*4), X1 21851 MOVSS X1, (DX)(R8*4) 21852 DECQ SI 21853 ADDQ CX, DI 21854 ADDQ BX, R8 21855 21856 check_limit: 21857 CMPQ SI, $0x00 21858 JHI loop 21859 RET 21860 21861 // func AmdAxpyUnsafeXInterleave_V5A10R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21862 // Requires: SSE 21863 TEXT ·AmdAxpyUnsafeXInterleave_V5A10R8(SB), NOSPLIT, $0-48 21864 MOVSS alpha+0(FP), X0 21865 MOVQ xs+8(FP), AX 21866 MOVQ incx+16(FP), CX 21867 MOVQ ys+24(FP), DX 21868 MOVQ incy+32(FP), BX 21869 MOVQ n+40(FP), SI 21870 XORQ DI, DI 21871 XORQ R8, R8 21872 JMP check_limit_unroll 21873 PCALIGN $0x08 21874 NOP 21875 NOP 21876 21877 loop_unroll: 21878 MOVSS (AX)(DI*4), X1 21879 ADDQ CX, DI 21880 MOVSS (AX)(DI*4), X2 21881 ADDQ CX, DI 21882 MOVSS (AX)(DI*4), X3 21883 ADDQ CX, DI 21884 MOVSS (AX)(DI*4), X4 21885 ADDQ CX, DI 21886 MOVSS (AX)(DI*4), X5 21887 ADDQ 
CX, DI 21888 MOVSS (AX)(DI*4), X6 21889 ADDQ CX, DI 21890 MOVSS (AX)(DI*4), X7 21891 ADDQ CX, DI 21892 MOVSS (AX)(DI*4), X8 21893 ADDQ CX, DI 21894 MULSS X0, X1 21895 MULSS X0, X2 21896 MULSS X0, X3 21897 MULSS X0, X4 21898 MULSS X0, X5 21899 MULSS X0, X6 21900 MULSS X0, X7 21901 MULSS X0, X8 21902 ADDSS (DX)(R8*4), X1 21903 MOVSS X1, (DX)(R8*4) 21904 ADDQ BX, R8 21905 ADDSS (DX)(R8*4), X2 21906 MOVSS X2, (DX)(R8*4) 21907 ADDQ BX, R8 21908 ADDSS (DX)(R8*4), X3 21909 MOVSS X3, (DX)(R8*4) 21910 ADDQ BX, R8 21911 ADDSS (DX)(R8*4), X4 21912 MOVSS X4, (DX)(R8*4) 21913 ADDQ BX, R8 21914 ADDSS (DX)(R8*4), X5 21915 MOVSS X5, (DX)(R8*4) 21916 ADDQ BX, R8 21917 ADDSS (DX)(R8*4), X6 21918 MOVSS X6, (DX)(R8*4) 21919 ADDQ BX, R8 21920 ADDSS (DX)(R8*4), X7 21921 MOVSS X7, (DX)(R8*4) 21922 ADDQ BX, R8 21923 ADDSS (DX)(R8*4), X8 21924 MOVSS X8, (DX)(R8*4) 21925 ADDQ BX, R8 21926 SUBQ $0x08, SI 21927 21928 check_limit_unroll: 21929 CMPQ SI, $0x08 21930 JHS loop_unroll 21931 JMP check_limit 21932 21933 loop: 21934 MOVSS (AX)(DI*4), X1 21935 MULSS X0, X1 21936 ADDSS (DX)(R8*4), X1 21937 MOVSS X1, (DX)(R8*4) 21938 DECQ SI 21939 ADDQ CX, DI 21940 ADDQ BX, R8 21941 21942 check_limit: 21943 CMPQ SI, $0x00 21944 JHI loop 21945 RET 21946 21947 // func AmdAxpyUnsafeXInterleave_V0A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 21948 // Requires: SSE 21949 TEXT ·AmdAxpyUnsafeXInterleave_V0A11R8(SB), NOSPLIT, $0-48 21950 MOVSS alpha+0(FP), X0 21951 MOVQ xs+8(FP), AX 21952 MOVQ incx+16(FP), CX 21953 MOVQ ys+24(FP), DX 21954 MOVQ incy+32(FP), BX 21955 MOVQ n+40(FP), SI 21956 XORQ DI, DI 21957 XORQ R8, R8 21958 JMP check_limit_unroll 21959 PCALIGN $0x08 21960 NOP 21961 NOP 21962 NOP 21963 21964 loop_unroll: 21965 MOVSS (AX)(DI*4), X1 21966 ADDQ CX, DI 21967 MOVSS (AX)(DI*4), X2 21968 ADDQ CX, DI 21969 MOVSS (AX)(DI*4), X3 21970 ADDQ CX, DI 21971 MOVSS (AX)(DI*4), X4 21972 ADDQ CX, DI 21973 MOVSS (AX)(DI*4), X5 21974 ADDQ CX, DI 21975 MOVSS (AX)(DI*4), X6 21976 ADDQ CX, DI 21977 MOVSS (AX)(DI*4), X7 21978 ADDQ CX, DI 21979 MOVSS (AX)(DI*4), X8 21980 ADDQ CX, DI 21981 MULSS X0, X1 21982 MULSS X0, X2 21983 MULSS X0, X3 21984 MULSS X0, X4 21985 MULSS X0, X5 21986 MULSS X0, X6 21987 MULSS X0, X7 21988 MULSS X0, X8 21989 ADDSS (DX)(R8*4), X1 21990 MOVSS X1, (DX)(R8*4) 21991 ADDQ BX, R8 21992 ADDSS (DX)(R8*4), X2 21993 MOVSS X2, (DX)(R8*4) 21994 ADDQ BX, R8 21995 ADDSS (DX)(R8*4), X3 21996 MOVSS X3, (DX)(R8*4) 21997 ADDQ BX, R8 21998 ADDSS (DX)(R8*4), X4 21999 MOVSS X4, (DX)(R8*4) 22000 ADDQ BX, R8 22001 ADDSS (DX)(R8*4), X5 22002 MOVSS X5, (DX)(R8*4) 22003 ADDQ BX, R8 22004 ADDSS (DX)(R8*4), X6 22005 MOVSS X6, (DX)(R8*4) 22006 ADDQ BX, R8 22007 ADDSS (DX)(R8*4), X7 22008 MOVSS X7, (DX)(R8*4) 22009 ADDQ BX, R8 22010 ADDSS (DX)(R8*4), X8 22011 MOVSS X8, (DX)(R8*4) 22012 ADDQ BX, R8 22013 SUBQ $0x08, SI 22014 22015 check_limit_unroll: 22016 CMPQ SI, $0x08 22017 JHS loop_unroll 22018 JMP check_limit 22019 22020 loop: 22021 MOVSS (AX)(DI*4), X1 22022 MULSS X0, X1 22023 ADDSS (DX)(R8*4), X1 22024 MOVSS X1, (DX)(R8*4) 22025 DECQ SI 22026 ADDQ CX, DI 22027 ADDQ BX, R8 22028 22029 check_limit: 22030 CMPQ SI, $0x00 22031 JHI loop 22032 RET 22033 22034 // func AmdAxpyUnsafeXInterleave_V1A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 22035 // Requires: SSE 22036 TEXT ·AmdAxpyUnsafeXInterleave_V1A11R8(SB), NOSPLIT, $0-48 22037 MOVSS alpha+0(FP), X0 22038 MOVQ xs+8(FP), AX 22039 MOVQ incx+16(FP), CX 22040 MOVQ ys+24(FP), DX 22041 MOVQ incy+32(FP), BX 22042 MOVQ 
n+40(FP), SI 22043 XORQ DI, DI 22044 XORQ R8, R8 22045 JMP check_limit_unroll 22046 PCALIGN $0x08 22047 NOP 22048 NOP 22049 NOP 22050 22051 loop_unroll: 22052 MOVSS (AX)(DI*4), X1 22053 ADDQ CX, DI 22054 MOVSS (AX)(DI*4), X2 22055 ADDQ CX, DI 22056 MOVSS (AX)(DI*4), X3 22057 ADDQ CX, DI 22058 MOVSS (AX)(DI*4), X4 22059 ADDQ CX, DI 22060 MOVSS (AX)(DI*4), X5 22061 ADDQ CX, DI 22062 MOVSS (AX)(DI*4), X6 22063 ADDQ CX, DI 22064 MOVSS (AX)(DI*4), X7 22065 ADDQ CX, DI 22066 MOVSS (AX)(DI*4), X8 22067 ADDQ CX, DI 22068 MULSS X0, X1 22069 MULSS X0, X2 22070 MULSS X0, X3 22071 MULSS X0, X4 22072 MULSS X0, X5 22073 MULSS X0, X6 22074 MULSS X0, X7 22075 MULSS X0, X8 22076 ADDSS (DX)(R8*4), X1 22077 MOVSS X1, (DX)(R8*4) 22078 ADDQ BX, R8 22079 ADDSS (DX)(R8*4), X2 22080 MOVSS X2, (DX)(R8*4) 22081 ADDQ BX, R8 22082 ADDSS (DX)(R8*4), X3 22083 MOVSS X3, (DX)(R8*4) 22084 ADDQ BX, R8 22085 ADDSS (DX)(R8*4), X4 22086 MOVSS X4, (DX)(R8*4) 22087 ADDQ BX, R8 22088 ADDSS (DX)(R8*4), X5 22089 MOVSS X5, (DX)(R8*4) 22090 ADDQ BX, R8 22091 ADDSS (DX)(R8*4), X6 22092 MOVSS X6, (DX)(R8*4) 22093 ADDQ BX, R8 22094 ADDSS (DX)(R8*4), X7 22095 MOVSS X7, (DX)(R8*4) 22096 ADDQ BX, R8 22097 ADDSS (DX)(R8*4), X8 22098 MOVSS X8, (DX)(R8*4) 22099 ADDQ BX, R8 22100 SUBQ $0x08, SI 22101 22102 check_limit_unroll: 22103 CMPQ SI, $0x08 22104 JHS loop_unroll 22105 JMP check_limit 22106 22107 loop: 22108 MOVSS (AX)(DI*4), X1 22109 MULSS X0, X1 22110 ADDSS (DX)(R8*4), X1 22111 MOVSS X1, (DX)(R8*4) 22112 DECQ SI 22113 ADDQ CX, DI 22114 ADDQ BX, R8 22115 22116 check_limit: 22117 CMPQ SI, $0x00 22118 JHI loop 22119 RET 22120 22121 // func AmdAxpyUnsafeXInterleave_V2A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 22122 // Requires: SSE 22123 TEXT ·AmdAxpyUnsafeXInterleave_V2A11R8(SB), NOSPLIT, $0-48 22124 MOVSS alpha+0(FP), X0 22125 MOVQ xs+8(FP), AX 22126 MOVQ incx+16(FP), CX 22127 MOVQ ys+24(FP), DX 22128 MOVQ incy+32(FP), BX 22129 MOVQ n+40(FP), SI 22130 XORQ DI, DI 22131 XORQ R8, R8 22132 JMP check_limit_unroll 22133 PCALIGN $0x08 22134 NOP 22135 NOP 22136 NOP 22137 22138 loop_unroll: 22139 MOVSS (AX)(DI*4), X1 22140 ADDQ CX, DI 22141 MOVSS (AX)(DI*4), X2 22142 ADDQ CX, DI 22143 MOVSS (AX)(DI*4), X3 22144 ADDQ CX, DI 22145 MOVSS (AX)(DI*4), X4 22146 ADDQ CX, DI 22147 MOVSS (AX)(DI*4), X5 22148 ADDQ CX, DI 22149 MOVSS (AX)(DI*4), X6 22150 ADDQ CX, DI 22151 MOVSS (AX)(DI*4), X7 22152 ADDQ CX, DI 22153 MOVSS (AX)(DI*4), X8 22154 ADDQ CX, DI 22155 MULSS X0, X1 22156 MULSS X0, X2 22157 MULSS X0, X3 22158 MULSS X0, X4 22159 MULSS X0, X5 22160 MULSS X0, X6 22161 MULSS X0, X7 22162 MULSS X0, X8 22163 ADDSS (DX)(R8*4), X1 22164 MOVSS X1, (DX)(R8*4) 22165 ADDQ BX, R8 22166 ADDSS (DX)(R8*4), X2 22167 MOVSS X2, (DX)(R8*4) 22168 ADDQ BX, R8 22169 ADDSS (DX)(R8*4), X3 22170 MOVSS X3, (DX)(R8*4) 22171 ADDQ BX, R8 22172 ADDSS (DX)(R8*4), X4 22173 MOVSS X4, (DX)(R8*4) 22174 ADDQ BX, R8 22175 ADDSS (DX)(R8*4), X5 22176 MOVSS X5, (DX)(R8*4) 22177 ADDQ BX, R8 22178 ADDSS (DX)(R8*4), X6 22179 MOVSS X6, (DX)(R8*4) 22180 ADDQ BX, R8 22181 ADDSS (DX)(R8*4), X7 22182 MOVSS X7, (DX)(R8*4) 22183 ADDQ BX, R8 22184 ADDSS (DX)(R8*4), X8 22185 MOVSS X8, (DX)(R8*4) 22186 ADDQ BX, R8 22187 SUBQ $0x08, SI 22188 22189 check_limit_unroll: 22190 CMPQ SI, $0x08 22191 JHS loop_unroll 22192 JMP check_limit 22193 22194 loop: 22195 MOVSS (AX)(DI*4), X1 22196 MULSS X0, X1 22197 ADDSS (DX)(R8*4), X1 22198 MOVSS X1, (DX)(R8*4) 22199 DECQ SI 22200 ADDQ CX, DI 22201 ADDQ BX, R8 22202 22203 check_limit: 22204 CMPQ SI, $0x00 22205 JHI loop 22206 
RET 22207 22208 // func AmdAxpyUnsafeXInterleave_V3A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 22209 // Requires: SSE 22210 TEXT ·AmdAxpyUnsafeXInterleave_V3A11R8(SB), NOSPLIT, $0-48 22211 MOVSS alpha+0(FP), X0 22212 MOVQ xs+8(FP), AX 22213 MOVQ incx+16(FP), CX 22214 MOVQ ys+24(FP), DX 22215 MOVQ incy+32(FP), BX 22216 MOVQ n+40(FP), SI 22217 XORQ DI, DI 22218 XORQ R8, R8 22219 JMP check_limit_unroll 22220 PCALIGN $0x08 22221 NOP 22222 NOP 22223 NOP 22224 22225 loop_unroll: 22226 MOVSS (AX)(DI*4), X1 22227 ADDQ CX, DI 22228 MOVSS (AX)(DI*4), X2 22229 ADDQ CX, DI 22230 MOVSS (AX)(DI*4), X3 22231 ADDQ CX, DI 22232 MOVSS (AX)(DI*4), X4 22233 ADDQ CX, DI 22234 MOVSS (AX)(DI*4), X5 22235 ADDQ CX, DI 22236 MOVSS (AX)(DI*4), X6 22237 ADDQ CX, DI 22238 MOVSS (AX)(DI*4), X7 22239 ADDQ CX, DI 22240 MOVSS (AX)(DI*4), X8 22241 ADDQ CX, DI 22242 MULSS X0, X1 22243 MULSS X0, X2 22244 MULSS X0, X3 22245 MULSS X0, X4 22246 MULSS X0, X5 22247 MULSS X0, X6 22248 MULSS X0, X7 22249 MULSS X0, X8 22250 ADDSS (DX)(R8*4), X1 22251 MOVSS X1, (DX)(R8*4) 22252 ADDQ BX, R8 22253 ADDSS (DX)(R8*4), X2 22254 MOVSS X2, (DX)(R8*4) 22255 ADDQ BX, R8 22256 ADDSS (DX)(R8*4), X3 22257 MOVSS X3, (DX)(R8*4) 22258 ADDQ BX, R8 22259 ADDSS (DX)(R8*4), X4 22260 MOVSS X4, (DX)(R8*4) 22261 ADDQ BX, R8 22262 ADDSS (DX)(R8*4), X5 22263 MOVSS X5, (DX)(R8*4) 22264 ADDQ BX, R8 22265 ADDSS (DX)(R8*4), X6 22266 MOVSS X6, (DX)(R8*4) 22267 ADDQ BX, R8 22268 ADDSS (DX)(R8*4), X7 22269 MOVSS X7, (DX)(R8*4) 22270 ADDQ BX, R8 22271 ADDSS (DX)(R8*4), X8 22272 MOVSS X8, (DX)(R8*4) 22273 ADDQ BX, R8 22274 SUBQ $0x08, SI 22275 22276 check_limit_unroll: 22277 CMPQ SI, $0x08 22278 JHS loop_unroll 22279 JMP check_limit 22280 22281 loop: 22282 MOVSS (AX)(DI*4), X1 22283 MULSS X0, X1 22284 ADDSS (DX)(R8*4), X1 22285 MOVSS X1, (DX)(R8*4) 22286 DECQ SI 22287 ADDQ CX, DI 22288 ADDQ BX, R8 22289 22290 check_limit: 22291 CMPQ SI, $0x00 22292 JHI loop 22293 RET 22294 22295 // func AmdAxpyUnsafeXInterleave_V4A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 22296 // Requires: SSE 22297 TEXT ·AmdAxpyUnsafeXInterleave_V4A11R8(SB), NOSPLIT, $0-48 22298 MOVSS alpha+0(FP), X0 22299 MOVQ xs+8(FP), AX 22300 MOVQ incx+16(FP), CX 22301 MOVQ ys+24(FP), DX 22302 MOVQ incy+32(FP), BX 22303 MOVQ n+40(FP), SI 22304 XORQ DI, DI 22305 XORQ R8, R8 22306 JMP check_limit_unroll 22307 PCALIGN $0x08 22308 NOP 22309 NOP 22310 NOP 22311 22312 loop_unroll: 22313 MOVSS (AX)(DI*4), X1 22314 ADDQ CX, DI 22315 MOVSS (AX)(DI*4), X2 22316 ADDQ CX, DI 22317 MOVSS (AX)(DI*4), X3 22318 ADDQ CX, DI 22319 MOVSS (AX)(DI*4), X4 22320 ADDQ CX, DI 22321 MOVSS (AX)(DI*4), X5 22322 ADDQ CX, DI 22323 MOVSS (AX)(DI*4), X6 22324 ADDQ CX, DI 22325 MOVSS (AX)(DI*4), X7 22326 ADDQ CX, DI 22327 MOVSS (AX)(DI*4), X8 22328 ADDQ CX, DI 22329 MULSS X0, X1 22330 MULSS X0, X2 22331 MULSS X0, X3 22332 MULSS X0, X4 22333 MULSS X0, X5 22334 MULSS X0, X6 22335 MULSS X0, X7 22336 MULSS X0, X8 22337 ADDSS (DX)(R8*4), X1 22338 MOVSS X1, (DX)(R8*4) 22339 ADDQ BX, R8 22340 ADDSS (DX)(R8*4), X2 22341 MOVSS X2, (DX)(R8*4) 22342 ADDQ BX, R8 22343 ADDSS (DX)(R8*4), X3 22344 MOVSS X3, (DX)(R8*4) 22345 ADDQ BX, R8 22346 ADDSS (DX)(R8*4), X4 22347 MOVSS X4, (DX)(R8*4) 22348 ADDQ BX, R8 22349 ADDSS (DX)(R8*4), X5 22350 MOVSS X5, (DX)(R8*4) 22351 ADDQ BX, R8 22352 ADDSS (DX)(R8*4), X6 22353 MOVSS X6, (DX)(R8*4) 22354 ADDQ BX, R8 22355 ADDSS (DX)(R8*4), X7 22356 MOVSS X7, (DX)(R8*4) 22357 ADDQ BX, R8 22358 ADDSS (DX)(R8*4), X8 22359 MOVSS X8, (DX)(R8*4) 
22360 ADDQ BX, R8
22361 SUBQ $0x08, SI
22362
22363 check_limit_unroll:
22364 CMPQ SI, $0x08
22365 JHS loop_unroll
22366 JMP check_limit
22367
22368 loop:
22369 MOVSS (AX)(DI*4), X1
22370 MULSS X0, X1
22371 ADDSS (DX)(R8*4), X1
22372 MOVSS X1, (DX)(R8*4)
22373 DECQ SI
22374 ADDQ CX, DI
22375 ADDQ BX, R8
22376
22377 check_limit:
22378 CMPQ SI, $0x00
22379 JHI loop
22380 RET
22381
22382 // func AmdAxpyUnsafeXInterleave_V5A11R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
22383 // Requires: SSE
22384 TEXT ·AmdAxpyUnsafeXInterleave_V5A11R8(SB), NOSPLIT, $0-48
22385 MOVSS alpha+0(FP), X0
22386 MOVQ xs+8(FP), AX
22387 MOVQ incx+16(FP), CX
22388 MOVQ ys+24(FP), DX
22389 MOVQ incy+32(FP), BX
22390 MOVQ n+40(FP), SI
22391 XORQ DI, DI
22392 XORQ R8, R8
22393 JMP check_limit_unroll
22394 PCALIGN $0x08
22395 NOP
22396 NOP
22397 NOP
22398
22399 loop_unroll:
22400 MOVSS (AX)(DI*4), X1
22401 ADDQ CX, DI
22402 MOVSS (AX)(DI*4), X2
22403 ADDQ CX, DI
22404 MOVSS (AX)(DI*4), X3
22405 ADDQ CX, DI
22406 MOVSS (AX)(DI*4), X4
22407 ADDQ CX, DI
22408 MOVSS (AX)(DI*4), X5
22409 ADDQ CX, DI
22410 MOVSS (AX)(DI*4), X6
22411 ADDQ CX, DI
22412 MOVSS (AX)(DI*4), X7
22413 ADDQ CX, DI
22414 MOVSS (AX)(DI*4), X8
22415 ADDQ CX, DI
22416 MULSS X0, X1
22417 MULSS X0, X2
22418 MULSS X0, X3
22419 MULSS X0, X4
22420 MULSS X0, X5
22421 MULSS X0, X6
22422 MULSS X0, X7
22423 MULSS X0, X8
22424 ADDSS (DX)(R8*4), X1
22425 MOVSS X1, (DX)(R8*4)
22426 ADDQ BX, R8
22427 ADDSS (DX)(R8*4), X2
22428 MOVSS X2, (DX)(R8*4)
22429 ADDQ BX, R8
22430 ADDSS (DX)(R8*4), X3
22431 MOVSS X3, (DX)(R8*4)
22432 ADDQ BX, R8
22433 ADDSS (DX)(R8*4), X4
22434 MOVSS X4, (DX)(R8*4)
22435 ADDQ BX, R8
22436 ADDSS (DX)(R8*4), X5
22437 MOVSS X5, (DX)(R8*4)
22438 ADDQ BX, R8
22439 ADDSS (DX)(R8*4), X6
22440 MOVSS X6, (DX)(R8*4)
22441 ADDQ BX, R8
22442 ADDSS (DX)(R8*4), X7
22443 MOVSS X7, (DX)(R8*4)
22444 ADDQ BX, R8
22445 ADDSS (DX)(R8*4), X8
22446 MOVSS X8, (DX)(R8*4)
22447 ADDQ BX, R8
22448 SUBQ $0x08, SI
22449
22450 check_limit_unroll:
22451 CMPQ SI, $0x08
22452 JHS loop_unroll
22453 JMP check_limit
22454
22455 loop:
22456 MOVSS (AX)(DI*4), X1
22457 MULSS X0, X1
22458 ADDSS (DX)(R8*4), X1
22459 MOVSS X1, (DX)(R8*4)
22460 DECQ SI
22461 ADDQ CX, DI
22462 ADDQ BX, R8
22463
22464 check_limit:
22465 CMPQ SI, $0x00
22466 JHI loop
22467 RET
22468
22469 // func AmdAxpyUnsafeXInterleave_V0A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
22470 // Requires: SSE
22471 TEXT ·AmdAxpyUnsafeXInterleave_V0A12R8(SB), NOSPLIT, $0-48
22472 MOVSS alpha+0(FP), X0
22473 MOVQ xs+8(FP), AX
22474 MOVQ incx+16(FP), CX
22475 MOVQ ys+24(FP), DX
22476 MOVQ incy+32(FP), BX
22477 MOVQ n+40(FP), SI
22478 XORQ DI, DI
22479 XORQ R8, R8
22480 JMP check_limit_unroll
22481 PCALIGN $0x08
22482 NOP
22483 NOP
22484 NOP
22485 NOP
22486
22487 loop_unroll:
22488 MOVSS (AX)(DI*4), X1
22489 ADDQ CX, DI
22490 MOVSS (AX)(DI*4), X2
22491 ADDQ CX, DI
22492 MOVSS (AX)(DI*4), X3
22493 ADDQ CX, DI
22494 MOVSS (AX)(DI*4), X4
22495 ADDQ CX, DI
22496 MOVSS (AX)(DI*4), X5
22497 ADDQ CX, DI
22498 MOVSS (AX)(DI*4), X6
22499 ADDQ CX, DI
22500 MOVSS (AX)(DI*4), X7
22501 ADDQ CX, DI
22502 MOVSS (AX)(DI*4), X8
22503 ADDQ CX, DI
22504 MULSS X0, X1
22505 MULSS X0, X2
22506 MULSS X0, X3
22507 MULSS X0, X4
22508 MULSS X0, X5
22509 MULSS X0, X6
22510 MULSS X0, X7
22511 MULSS X0, X8
22512 ADDSS (DX)(R8*4), X1
22513 MOVSS X1, (DX)(R8*4)
22514 ADDQ BX, R8
22515 ADDSS (DX)(R8*4), X2
22516 MOVSS X2, (DX)(R8*4)
22517 ADDQ BX, R8
22518 ADDSS (DX)(R8*4), X3
22519 MOVSS X3, (DX)(R8*4)
22520 ADDQ BX, R8
22521 ADDSS (DX)(R8*4), X4
22522 MOVSS X4, (DX)(R8*4)
22523 ADDQ BX, R8
22524 ADDSS (DX)(R8*4), X5
22525 MOVSS X5, (DX)(R8*4)
22526 ADDQ BX, R8
22527 ADDSS (DX)(R8*4), X6
22528 MOVSS X6, (DX)(R8*4)
22529 ADDQ BX, R8
22530 ADDSS (DX)(R8*4), X7
22531 MOVSS X7, (DX)(R8*4)
22532 ADDQ BX, R8
22533 ADDSS (DX)(R8*4), X8
22534 MOVSS X8, (DX)(R8*4)
22535 ADDQ BX, R8
22536 SUBQ $0x08, SI
22537
22538 check_limit_unroll:
22539 CMPQ SI, $0x08
22540 JHS loop_unroll
22541 JMP check_limit
22542
22543 loop:
22544 MOVSS (AX)(DI*4), X1
22545 MULSS X0, X1
22546 ADDSS (DX)(R8*4), X1
22547 MOVSS X1, (DX)(R8*4)
22548 DECQ SI
22549 ADDQ CX, DI
22550 ADDQ BX, R8
22551
22552 check_limit:
22553 CMPQ SI, $0x00
22554 JHI loop
22555 RET
22556
22557 // func AmdAxpyUnsafeXInterleave_V1A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
22558 // Requires: SSE
22559 TEXT ·AmdAxpyUnsafeXInterleave_V1A12R8(SB), NOSPLIT, $0-48
22560 MOVSS alpha+0(FP), X0
22561 MOVQ xs+8(FP), AX
22562 MOVQ incx+16(FP), CX
22563 MOVQ ys+24(FP), DX
22564 MOVQ incy+32(FP), BX
22565 MOVQ n+40(FP), SI
22566 XORQ DI, DI
22567 XORQ R8, R8
22568 JMP check_limit_unroll
22569 PCALIGN $0x08
22570 NOP
22571 NOP
22572 NOP
22573 NOP
22574
22575 loop_unroll:
22576 MOVSS (AX)(DI*4), X1
22577 ADDQ CX, DI
22578 MOVSS (AX)(DI*4), X2
22579 ADDQ CX, DI
22580 MOVSS (AX)(DI*4), X3
22581 ADDQ CX, DI
22582 MOVSS (AX)(DI*4), X4
22583 ADDQ CX, DI
22584 MOVSS (AX)(DI*4), X5
22585 ADDQ CX, DI
22586 MOVSS (AX)(DI*4), X6
22587 ADDQ CX, DI
22588 MOVSS (AX)(DI*4), X7
22589 ADDQ CX, DI
22590 MOVSS (AX)(DI*4), X8
22591 ADDQ CX, DI
22592 MULSS X0, X1
22593 MULSS X0, X2
22594 MULSS X0, X3
22595 MULSS X0, X4
22596 MULSS X0, X5
22597 MULSS X0, X6
22598 MULSS X0, X7
22599 MULSS X0, X8
22600 ADDSS (DX)(R8*4), X1
22601 MOVSS X1, (DX)(R8*4)
22602 ADDQ BX, R8
22603 ADDSS (DX)(R8*4), X2
22604 MOVSS X2, (DX)(R8*4)
22605 ADDQ BX, R8
22606 ADDSS (DX)(R8*4), X3
22607 MOVSS X3, (DX)(R8*4)
22608 ADDQ BX, R8
22609 ADDSS (DX)(R8*4), X4
22610 MOVSS X4, (DX)(R8*4)
22611 ADDQ BX, R8
22612 ADDSS (DX)(R8*4), X5
22613 MOVSS X5, (DX)(R8*4)
22614 ADDQ BX, R8
22615 ADDSS (DX)(R8*4), X6
22616 MOVSS X6, (DX)(R8*4)
22617 ADDQ BX, R8
22618 ADDSS (DX)(R8*4), X7
22619 MOVSS X7, (DX)(R8*4)
22620 ADDQ BX, R8
22621 ADDSS (DX)(R8*4), X8
22622 MOVSS X8, (DX)(R8*4)
22623 ADDQ BX, R8
22624 SUBQ $0x08, SI
22625
22626 check_limit_unroll:
22627 CMPQ SI, $0x08
22628 JHS loop_unroll
22629 JMP check_limit
22630
22631 loop:
22632 MOVSS (AX)(DI*4), X1
22633 MULSS X0, X1
22634 ADDSS (DX)(R8*4), X1
22635 MOVSS X1, (DX)(R8*4)
22636 DECQ SI
22637 ADDQ CX, DI
22638 ADDQ BX, R8
22639
22640 check_limit:
22641 CMPQ SI, $0x00
22642 JHI loop
22643 RET
22644
22645 // func AmdAxpyUnsafeXInterleave_V2A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
22646 // Requires: SSE
22647 TEXT ·AmdAxpyUnsafeXInterleave_V2A12R8(SB), NOSPLIT, $0-48
22648 MOVSS alpha+0(FP), X0
22649 MOVQ xs+8(FP), AX
22650 MOVQ incx+16(FP), CX
22651 MOVQ ys+24(FP), DX
22652 MOVQ incy+32(FP), BX
22653 MOVQ n+40(FP), SI
22654 XORQ DI, DI
22655 XORQ R8, R8
22656 JMP check_limit_unroll
22657 PCALIGN $0x08
22658 NOP
22659 NOP
22660 NOP
22661 NOP
22662
22663 loop_unroll:
22664 MOVSS (AX)(DI*4), X1
22665 ADDQ CX, DI
22666 MOVSS (AX)(DI*4), X2
22667 ADDQ CX, DI
22668 MOVSS (AX)(DI*4), X3
22669 ADDQ CX, DI
22670 MOVSS (AX)(DI*4), X4
22671 ADDQ CX, DI
22672 MOVSS (AX)(DI*4), X5
22673 ADDQ CX, DI
22674 MOVSS (AX)(DI*4), X6
22675 ADDQ CX, DI
22676 MOVSS (AX)(DI*4), X7
22677 ADDQ CX, DI 22678 MOVSS (AX)(DI*4), X8 22679 ADDQ CX, DI 22680 MULSS X0, X1 22681 MULSS X0, X2 22682 MULSS X0, X3 22683 MULSS X0, X4 22684 MULSS X0, X5 22685 MULSS X0, X6 22686 MULSS X0, X7 22687 MULSS X0, X8 22688 ADDSS (DX)(R8*4), X1 22689 MOVSS X1, (DX)(R8*4) 22690 ADDQ BX, R8 22691 ADDSS (DX)(R8*4), X2 22692 MOVSS X2, (DX)(R8*4) 22693 ADDQ BX, R8 22694 ADDSS (DX)(R8*4), X3 22695 MOVSS X3, (DX)(R8*4) 22696 ADDQ BX, R8 22697 ADDSS (DX)(R8*4), X4 22698 MOVSS X4, (DX)(R8*4) 22699 ADDQ BX, R8 22700 ADDSS (DX)(R8*4), X5 22701 MOVSS X5, (DX)(R8*4) 22702 ADDQ BX, R8 22703 ADDSS (DX)(R8*4), X6 22704 MOVSS X6, (DX)(R8*4) 22705 ADDQ BX, R8 22706 ADDSS (DX)(R8*4), X7 22707 MOVSS X7, (DX)(R8*4) 22708 ADDQ BX, R8 22709 ADDSS (DX)(R8*4), X8 22710 MOVSS X8, (DX)(R8*4) 22711 ADDQ BX, R8 22712 SUBQ $0x08, SI 22713 22714 check_limit_unroll: 22715 CMPQ SI, $0x08 22716 JHS loop_unroll 22717 JMP check_limit 22718 22719 loop: 22720 MOVSS (AX)(DI*4), X1 22721 MULSS X0, X1 22722 ADDSS (DX)(R8*4), X1 22723 MOVSS X1, (DX)(R8*4) 22724 DECQ SI 22725 ADDQ CX, DI 22726 ADDQ BX, R8 22727 22728 check_limit: 22729 CMPQ SI, $0x00 22730 JHI loop 22731 RET 22732 22733 // func AmdAxpyUnsafeXInterleave_V3A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 22734 // Requires: SSE 22735 TEXT ·AmdAxpyUnsafeXInterleave_V3A12R8(SB), NOSPLIT, $0-48 22736 MOVSS alpha+0(FP), X0 22737 MOVQ xs+8(FP), AX 22738 MOVQ incx+16(FP), CX 22739 MOVQ ys+24(FP), DX 22740 MOVQ incy+32(FP), BX 22741 MOVQ n+40(FP), SI 22742 XORQ DI, DI 22743 XORQ R8, R8 22744 JMP check_limit_unroll 22745 PCALIGN $0x08 22746 NOP 22747 NOP 22748 NOP 22749 NOP 22750 22751 loop_unroll: 22752 MOVSS (AX)(DI*4), X1 22753 ADDQ CX, DI 22754 MOVSS (AX)(DI*4), X2 22755 ADDQ CX, DI 22756 MOVSS (AX)(DI*4), X3 22757 ADDQ CX, DI 22758 MOVSS (AX)(DI*4), X4 22759 ADDQ CX, DI 22760 MOVSS (AX)(DI*4), X5 22761 ADDQ CX, DI 22762 MOVSS (AX)(DI*4), X6 22763 ADDQ CX, DI 22764 MOVSS (AX)(DI*4), X7 22765 ADDQ CX, DI 22766 MOVSS (AX)(DI*4), X8 22767 ADDQ CX, DI 22768 MULSS X0, X1 22769 MULSS X0, X2 22770 MULSS X0, X3 22771 MULSS X0, X4 22772 MULSS X0, X5 22773 MULSS X0, X6 22774 MULSS X0, X7 22775 MULSS X0, X8 22776 ADDSS (DX)(R8*4), X1 22777 MOVSS X1, (DX)(R8*4) 22778 ADDQ BX, R8 22779 ADDSS (DX)(R8*4), X2 22780 MOVSS X2, (DX)(R8*4) 22781 ADDQ BX, R8 22782 ADDSS (DX)(R8*4), X3 22783 MOVSS X3, (DX)(R8*4) 22784 ADDQ BX, R8 22785 ADDSS (DX)(R8*4), X4 22786 MOVSS X4, (DX)(R8*4) 22787 ADDQ BX, R8 22788 ADDSS (DX)(R8*4), X5 22789 MOVSS X5, (DX)(R8*4) 22790 ADDQ BX, R8 22791 ADDSS (DX)(R8*4), X6 22792 MOVSS X6, (DX)(R8*4) 22793 ADDQ BX, R8 22794 ADDSS (DX)(R8*4), X7 22795 MOVSS X7, (DX)(R8*4) 22796 ADDQ BX, R8 22797 ADDSS (DX)(R8*4), X8 22798 MOVSS X8, (DX)(R8*4) 22799 ADDQ BX, R8 22800 SUBQ $0x08, SI 22801 22802 check_limit_unroll: 22803 CMPQ SI, $0x08 22804 JHS loop_unroll 22805 JMP check_limit 22806 22807 loop: 22808 MOVSS (AX)(DI*4), X1 22809 MULSS X0, X1 22810 ADDSS (DX)(R8*4), X1 22811 MOVSS X1, (DX)(R8*4) 22812 DECQ SI 22813 ADDQ CX, DI 22814 ADDQ BX, R8 22815 22816 check_limit: 22817 CMPQ SI, $0x00 22818 JHI loop 22819 RET 22820 22821 // func AmdAxpyUnsafeXInterleave_V4A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 22822 // Requires: SSE 22823 TEXT ·AmdAxpyUnsafeXInterleave_V4A12R8(SB), NOSPLIT, $0-48 22824 MOVSS alpha+0(FP), X0 22825 MOVQ xs+8(FP), AX 22826 MOVQ incx+16(FP), CX 22827 MOVQ ys+24(FP), DX 22828 MOVQ incy+32(FP), BX 22829 MOVQ n+40(FP), SI 22830 XORQ DI, DI 22831 XORQ R8, R8 22832 JMP 
check_limit_unroll 22833 PCALIGN $0x08 22834 NOP 22835 NOP 22836 NOP 22837 NOP 22838 22839 loop_unroll: 22840 MOVSS (AX)(DI*4), X1 22841 ADDQ CX, DI 22842 MOVSS (AX)(DI*4), X2 22843 ADDQ CX, DI 22844 MOVSS (AX)(DI*4), X3 22845 ADDQ CX, DI 22846 MOVSS (AX)(DI*4), X4 22847 ADDQ CX, DI 22848 MOVSS (AX)(DI*4), X5 22849 ADDQ CX, DI 22850 MOVSS (AX)(DI*4), X6 22851 ADDQ CX, DI 22852 MOVSS (AX)(DI*4), X7 22853 ADDQ CX, DI 22854 MOVSS (AX)(DI*4), X8 22855 ADDQ CX, DI 22856 MULSS X0, X1 22857 MULSS X0, X2 22858 MULSS X0, X3 22859 MULSS X0, X4 22860 MULSS X0, X5 22861 MULSS X0, X6 22862 MULSS X0, X7 22863 MULSS X0, X8 22864 ADDSS (DX)(R8*4), X1 22865 MOVSS X1, (DX)(R8*4) 22866 ADDQ BX, R8 22867 ADDSS (DX)(R8*4), X2 22868 MOVSS X2, (DX)(R8*4) 22869 ADDQ BX, R8 22870 ADDSS (DX)(R8*4), X3 22871 MOVSS X3, (DX)(R8*4) 22872 ADDQ BX, R8 22873 ADDSS (DX)(R8*4), X4 22874 MOVSS X4, (DX)(R8*4) 22875 ADDQ BX, R8 22876 ADDSS (DX)(R8*4), X5 22877 MOVSS X5, (DX)(R8*4) 22878 ADDQ BX, R8 22879 ADDSS (DX)(R8*4), X6 22880 MOVSS X6, (DX)(R8*4) 22881 ADDQ BX, R8 22882 ADDSS (DX)(R8*4), X7 22883 MOVSS X7, (DX)(R8*4) 22884 ADDQ BX, R8 22885 ADDSS (DX)(R8*4), X8 22886 MOVSS X8, (DX)(R8*4) 22887 ADDQ BX, R8 22888 SUBQ $0x08, SI 22889 22890 check_limit_unroll: 22891 CMPQ SI, $0x08 22892 JHS loop_unroll 22893 JMP check_limit 22894 22895 loop: 22896 MOVSS (AX)(DI*4), X1 22897 MULSS X0, X1 22898 ADDSS (DX)(R8*4), X1 22899 MOVSS X1, (DX)(R8*4) 22900 DECQ SI 22901 ADDQ CX, DI 22902 ADDQ BX, R8 22903 22904 check_limit: 22905 CMPQ SI, $0x00 22906 JHI loop 22907 RET 22908 22909 // func AmdAxpyUnsafeXInterleave_V5A12R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 22910 // Requires: SSE 22911 TEXT ·AmdAxpyUnsafeXInterleave_V5A12R8(SB), NOSPLIT, $0-48 22912 MOVSS alpha+0(FP), X0 22913 MOVQ xs+8(FP), AX 22914 MOVQ incx+16(FP), CX 22915 MOVQ ys+24(FP), DX 22916 MOVQ incy+32(FP), BX 22917 MOVQ n+40(FP), SI 22918 XORQ DI, DI 22919 XORQ R8, R8 22920 JMP check_limit_unroll 22921 PCALIGN $0x08 22922 NOP 22923 NOP 22924 NOP 22925 NOP 22926 22927 loop_unroll: 22928 MOVSS (AX)(DI*4), X1 22929 ADDQ CX, DI 22930 MOVSS (AX)(DI*4), X2 22931 ADDQ CX, DI 22932 MOVSS (AX)(DI*4), X3 22933 ADDQ CX, DI 22934 MOVSS (AX)(DI*4), X4 22935 ADDQ CX, DI 22936 MOVSS (AX)(DI*4), X5 22937 ADDQ CX, DI 22938 MOVSS (AX)(DI*4), X6 22939 ADDQ CX, DI 22940 MOVSS (AX)(DI*4), X7 22941 ADDQ CX, DI 22942 MOVSS (AX)(DI*4), X8 22943 ADDQ CX, DI 22944 MULSS X0, X1 22945 MULSS X0, X2 22946 MULSS X0, X3 22947 MULSS X0, X4 22948 MULSS X0, X5 22949 MULSS X0, X6 22950 MULSS X0, X7 22951 MULSS X0, X8 22952 ADDSS (DX)(R8*4), X1 22953 MOVSS X1, (DX)(R8*4) 22954 ADDQ BX, R8 22955 ADDSS (DX)(R8*4), X2 22956 MOVSS X2, (DX)(R8*4) 22957 ADDQ BX, R8 22958 ADDSS (DX)(R8*4), X3 22959 MOVSS X3, (DX)(R8*4) 22960 ADDQ BX, R8 22961 ADDSS (DX)(R8*4), X4 22962 MOVSS X4, (DX)(R8*4) 22963 ADDQ BX, R8 22964 ADDSS (DX)(R8*4), X5 22965 MOVSS X5, (DX)(R8*4) 22966 ADDQ BX, R8 22967 ADDSS (DX)(R8*4), X6 22968 MOVSS X6, (DX)(R8*4) 22969 ADDQ BX, R8 22970 ADDSS (DX)(R8*4), X7 22971 MOVSS X7, (DX)(R8*4) 22972 ADDQ BX, R8 22973 ADDSS (DX)(R8*4), X8 22974 MOVSS X8, (DX)(R8*4) 22975 ADDQ BX, R8 22976 SUBQ $0x08, SI 22977 22978 check_limit_unroll: 22979 CMPQ SI, $0x08 22980 JHS loop_unroll 22981 JMP check_limit 22982 22983 loop: 22984 MOVSS (AX)(DI*4), X1 22985 MULSS X0, X1 22986 ADDSS (DX)(R8*4), X1 22987 MOVSS X1, (DX)(R8*4) 22988 DECQ SI 22989 ADDQ CX, DI 22990 ADDQ BX, R8 22991 22992 check_limit: 22993 CMPQ SI, $0x00 22994 JHI loop 22995 RET 22996 22997 // func 
AmdAxpyUnsafeXInterleave_V0A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 22998 // Requires: SSE 22999 TEXT ·AmdAxpyUnsafeXInterleave_V0A13R8(SB), NOSPLIT, $0-48 23000 MOVSS alpha+0(FP), X0 23001 MOVQ xs+8(FP), AX 23002 MOVQ incx+16(FP), CX 23003 MOVQ ys+24(FP), DX 23004 MOVQ incy+32(FP), BX 23005 MOVQ n+40(FP), SI 23006 XORQ DI, DI 23007 XORQ R8, R8 23008 JMP check_limit_unroll 23009 PCALIGN $0x08 23010 NOP 23011 NOP 23012 NOP 23013 NOP 23014 NOP 23015 23016 loop_unroll: 23017 MOVSS (AX)(DI*4), X1 23018 ADDQ CX, DI 23019 MOVSS (AX)(DI*4), X2 23020 ADDQ CX, DI 23021 MOVSS (AX)(DI*4), X3 23022 ADDQ CX, DI 23023 MOVSS (AX)(DI*4), X4 23024 ADDQ CX, DI 23025 MOVSS (AX)(DI*4), X5 23026 ADDQ CX, DI 23027 MOVSS (AX)(DI*4), X6 23028 ADDQ CX, DI 23029 MOVSS (AX)(DI*4), X7 23030 ADDQ CX, DI 23031 MOVSS (AX)(DI*4), X8 23032 ADDQ CX, DI 23033 MULSS X0, X1 23034 MULSS X0, X2 23035 MULSS X0, X3 23036 MULSS X0, X4 23037 MULSS X0, X5 23038 MULSS X0, X6 23039 MULSS X0, X7 23040 MULSS X0, X8 23041 ADDSS (DX)(R8*4), X1 23042 MOVSS X1, (DX)(R8*4) 23043 ADDQ BX, R8 23044 ADDSS (DX)(R8*4), X2 23045 MOVSS X2, (DX)(R8*4) 23046 ADDQ BX, R8 23047 ADDSS (DX)(R8*4), X3 23048 MOVSS X3, (DX)(R8*4) 23049 ADDQ BX, R8 23050 ADDSS (DX)(R8*4), X4 23051 MOVSS X4, (DX)(R8*4) 23052 ADDQ BX, R8 23053 ADDSS (DX)(R8*4), X5 23054 MOVSS X5, (DX)(R8*4) 23055 ADDQ BX, R8 23056 ADDSS (DX)(R8*4), X6 23057 MOVSS X6, (DX)(R8*4) 23058 ADDQ BX, R8 23059 ADDSS (DX)(R8*4), X7 23060 MOVSS X7, (DX)(R8*4) 23061 ADDQ BX, R8 23062 ADDSS (DX)(R8*4), X8 23063 MOVSS X8, (DX)(R8*4) 23064 ADDQ BX, R8 23065 SUBQ $0x08, SI 23066 23067 check_limit_unroll: 23068 CMPQ SI, $0x08 23069 JHS loop_unroll 23070 JMP check_limit 23071 23072 loop: 23073 MOVSS (AX)(DI*4), X1 23074 MULSS X0, X1 23075 ADDSS (DX)(R8*4), X1 23076 MOVSS X1, (DX)(R8*4) 23077 DECQ SI 23078 ADDQ CX, DI 23079 ADDQ BX, R8 23080 23081 check_limit: 23082 CMPQ SI, $0x00 23083 JHI loop 23084 RET 23085 23086 // func AmdAxpyUnsafeXInterleave_V1A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 23087 // Requires: SSE 23088 TEXT ·AmdAxpyUnsafeXInterleave_V1A13R8(SB), NOSPLIT, $0-48 23089 MOVSS alpha+0(FP), X0 23090 MOVQ xs+8(FP), AX 23091 MOVQ incx+16(FP), CX 23092 MOVQ ys+24(FP), DX 23093 MOVQ incy+32(FP), BX 23094 MOVQ n+40(FP), SI 23095 XORQ DI, DI 23096 XORQ R8, R8 23097 JMP check_limit_unroll 23098 PCALIGN $0x08 23099 NOP 23100 NOP 23101 NOP 23102 NOP 23103 NOP 23104 23105 loop_unroll: 23106 MOVSS (AX)(DI*4), X1 23107 ADDQ CX, DI 23108 MOVSS (AX)(DI*4), X2 23109 ADDQ CX, DI 23110 MOVSS (AX)(DI*4), X3 23111 ADDQ CX, DI 23112 MOVSS (AX)(DI*4), X4 23113 ADDQ CX, DI 23114 MOVSS (AX)(DI*4), X5 23115 ADDQ CX, DI 23116 MOVSS (AX)(DI*4), X6 23117 ADDQ CX, DI 23118 MOVSS (AX)(DI*4), X7 23119 ADDQ CX, DI 23120 MOVSS (AX)(DI*4), X8 23121 ADDQ CX, DI 23122 MULSS X0, X1 23123 MULSS X0, X2 23124 MULSS X0, X3 23125 MULSS X0, X4 23126 MULSS X0, X5 23127 MULSS X0, X6 23128 MULSS X0, X7 23129 MULSS X0, X8 23130 ADDSS (DX)(R8*4), X1 23131 MOVSS X1, (DX)(R8*4) 23132 ADDQ BX, R8 23133 ADDSS (DX)(R8*4), X2 23134 MOVSS X2, (DX)(R8*4) 23135 ADDQ BX, R8 23136 ADDSS (DX)(R8*4), X3 23137 MOVSS X3, (DX)(R8*4) 23138 ADDQ BX, R8 23139 ADDSS (DX)(R8*4), X4 23140 MOVSS X4, (DX)(R8*4) 23141 ADDQ BX, R8 23142 ADDSS (DX)(R8*4), X5 23143 MOVSS X5, (DX)(R8*4) 23144 ADDQ BX, R8 23145 ADDSS (DX)(R8*4), X6 23146 MOVSS X6, (DX)(R8*4) 23147 ADDQ BX, R8 23148 ADDSS (DX)(R8*4), X7 23149 MOVSS X7, (DX)(R8*4) 23150 ADDQ BX, R8 23151 ADDSS (DX)(R8*4), X8 23152 MOVSS 
X8, (DX)(R8*4) 23153 ADDQ BX, R8 23154 SUBQ $0x08, SI 23155 23156 check_limit_unroll: 23157 CMPQ SI, $0x08 23158 JHS loop_unroll 23159 JMP check_limit 23160 23161 loop: 23162 MOVSS (AX)(DI*4), X1 23163 MULSS X0, X1 23164 ADDSS (DX)(R8*4), X1 23165 MOVSS X1, (DX)(R8*4) 23166 DECQ SI 23167 ADDQ CX, DI 23168 ADDQ BX, R8 23169 23170 check_limit: 23171 CMPQ SI, $0x00 23172 JHI loop 23173 RET 23174 23175 // func AmdAxpyUnsafeXInterleave_V2A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 23176 // Requires: SSE 23177 TEXT ·AmdAxpyUnsafeXInterleave_V2A13R8(SB), NOSPLIT, $0-48 23178 MOVSS alpha+0(FP), X0 23179 MOVQ xs+8(FP), AX 23180 MOVQ incx+16(FP), CX 23181 MOVQ ys+24(FP), DX 23182 MOVQ incy+32(FP), BX 23183 MOVQ n+40(FP), SI 23184 XORQ DI, DI 23185 XORQ R8, R8 23186 JMP check_limit_unroll 23187 PCALIGN $0x08 23188 NOP 23189 NOP 23190 NOP 23191 NOP 23192 NOP 23193 23194 loop_unroll: 23195 MOVSS (AX)(DI*4), X1 23196 ADDQ CX, DI 23197 MOVSS (AX)(DI*4), X2 23198 ADDQ CX, DI 23199 MOVSS (AX)(DI*4), X3 23200 ADDQ CX, DI 23201 MOVSS (AX)(DI*4), X4 23202 ADDQ CX, DI 23203 MOVSS (AX)(DI*4), X5 23204 ADDQ CX, DI 23205 MOVSS (AX)(DI*4), X6 23206 ADDQ CX, DI 23207 MOVSS (AX)(DI*4), X7 23208 ADDQ CX, DI 23209 MOVSS (AX)(DI*4), X8 23210 ADDQ CX, DI 23211 MULSS X0, X1 23212 MULSS X0, X2 23213 MULSS X0, X3 23214 MULSS X0, X4 23215 MULSS X0, X5 23216 MULSS X0, X6 23217 MULSS X0, X7 23218 MULSS X0, X8 23219 ADDSS (DX)(R8*4), X1 23220 MOVSS X1, (DX)(R8*4) 23221 ADDQ BX, R8 23222 ADDSS (DX)(R8*4), X2 23223 MOVSS X2, (DX)(R8*4) 23224 ADDQ BX, R8 23225 ADDSS (DX)(R8*4), X3 23226 MOVSS X3, (DX)(R8*4) 23227 ADDQ BX, R8 23228 ADDSS (DX)(R8*4), X4 23229 MOVSS X4, (DX)(R8*4) 23230 ADDQ BX, R8 23231 ADDSS (DX)(R8*4), X5 23232 MOVSS X5, (DX)(R8*4) 23233 ADDQ BX, R8 23234 ADDSS (DX)(R8*4), X6 23235 MOVSS X6, (DX)(R8*4) 23236 ADDQ BX, R8 23237 ADDSS (DX)(R8*4), X7 23238 MOVSS X7, (DX)(R8*4) 23239 ADDQ BX, R8 23240 ADDSS (DX)(R8*4), X8 23241 MOVSS X8, (DX)(R8*4) 23242 ADDQ BX, R8 23243 SUBQ $0x08, SI 23244 23245 check_limit_unroll: 23246 CMPQ SI, $0x08 23247 JHS loop_unroll 23248 JMP check_limit 23249 23250 loop: 23251 MOVSS (AX)(DI*4), X1 23252 MULSS X0, X1 23253 ADDSS (DX)(R8*4), X1 23254 MOVSS X1, (DX)(R8*4) 23255 DECQ SI 23256 ADDQ CX, DI 23257 ADDQ BX, R8 23258 23259 check_limit: 23260 CMPQ SI, $0x00 23261 JHI loop 23262 RET 23263 23264 // func AmdAxpyUnsafeXInterleave_V3A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 23265 // Requires: SSE 23266 TEXT ·AmdAxpyUnsafeXInterleave_V3A13R8(SB), NOSPLIT, $0-48 23267 MOVSS alpha+0(FP), X0 23268 MOVQ xs+8(FP), AX 23269 MOVQ incx+16(FP), CX 23270 MOVQ ys+24(FP), DX 23271 MOVQ incy+32(FP), BX 23272 MOVQ n+40(FP), SI 23273 XORQ DI, DI 23274 XORQ R8, R8 23275 JMP check_limit_unroll 23276 PCALIGN $0x08 23277 NOP 23278 NOP 23279 NOP 23280 NOP 23281 NOP 23282 23283 loop_unroll: 23284 MOVSS (AX)(DI*4), X1 23285 ADDQ CX, DI 23286 MOVSS (AX)(DI*4), X2 23287 ADDQ CX, DI 23288 MOVSS (AX)(DI*4), X3 23289 ADDQ CX, DI 23290 MOVSS (AX)(DI*4), X4 23291 ADDQ CX, DI 23292 MOVSS (AX)(DI*4), X5 23293 ADDQ CX, DI 23294 MOVSS (AX)(DI*4), X6 23295 ADDQ CX, DI 23296 MOVSS (AX)(DI*4), X7 23297 ADDQ CX, DI 23298 MOVSS (AX)(DI*4), X8 23299 ADDQ CX, DI 23300 MULSS X0, X1 23301 MULSS X0, X2 23302 MULSS X0, X3 23303 MULSS X0, X4 23304 MULSS X0, X5 23305 MULSS X0, X6 23306 MULSS X0, X7 23307 MULSS X0, X8 23308 ADDSS (DX)(R8*4), X1 23309 MOVSS X1, (DX)(R8*4) 23310 ADDQ BX, R8 23311 ADDSS (DX)(R8*4), X2 23312 MOVSS X2, (DX)(R8*4) 23313 
ADDQ BX, R8 23314 ADDSS (DX)(R8*4), X3 23315 MOVSS X3, (DX)(R8*4) 23316 ADDQ BX, R8 23317 ADDSS (DX)(R8*4), X4 23318 MOVSS X4, (DX)(R8*4) 23319 ADDQ BX, R8 23320 ADDSS (DX)(R8*4), X5 23321 MOVSS X5, (DX)(R8*4) 23322 ADDQ BX, R8 23323 ADDSS (DX)(R8*4), X6 23324 MOVSS X6, (DX)(R8*4) 23325 ADDQ BX, R8 23326 ADDSS (DX)(R8*4), X7 23327 MOVSS X7, (DX)(R8*4) 23328 ADDQ BX, R8 23329 ADDSS (DX)(R8*4), X8 23330 MOVSS X8, (DX)(R8*4) 23331 ADDQ BX, R8 23332 SUBQ $0x08, SI 23333 23334 check_limit_unroll: 23335 CMPQ SI, $0x08 23336 JHS loop_unroll 23337 JMP check_limit 23338 23339 loop: 23340 MOVSS (AX)(DI*4), X1 23341 MULSS X0, X1 23342 ADDSS (DX)(R8*4), X1 23343 MOVSS X1, (DX)(R8*4) 23344 DECQ SI 23345 ADDQ CX, DI 23346 ADDQ BX, R8 23347 23348 check_limit: 23349 CMPQ SI, $0x00 23350 JHI loop 23351 RET 23352 23353 // func AmdAxpyUnsafeXInterleave_V4A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 23354 // Requires: SSE 23355 TEXT ·AmdAxpyUnsafeXInterleave_V4A13R8(SB), NOSPLIT, $0-48 23356 MOVSS alpha+0(FP), X0 23357 MOVQ xs+8(FP), AX 23358 MOVQ incx+16(FP), CX 23359 MOVQ ys+24(FP), DX 23360 MOVQ incy+32(FP), BX 23361 MOVQ n+40(FP), SI 23362 XORQ DI, DI 23363 XORQ R8, R8 23364 JMP check_limit_unroll 23365 PCALIGN $0x08 23366 NOP 23367 NOP 23368 NOP 23369 NOP 23370 NOP 23371 23372 loop_unroll: 23373 MOVSS (AX)(DI*4), X1 23374 ADDQ CX, DI 23375 MOVSS (AX)(DI*4), X2 23376 ADDQ CX, DI 23377 MOVSS (AX)(DI*4), X3 23378 ADDQ CX, DI 23379 MOVSS (AX)(DI*4), X4 23380 ADDQ CX, DI 23381 MOVSS (AX)(DI*4), X5 23382 ADDQ CX, DI 23383 MOVSS (AX)(DI*4), X6 23384 ADDQ CX, DI 23385 MOVSS (AX)(DI*4), X7 23386 ADDQ CX, DI 23387 MOVSS (AX)(DI*4), X8 23388 ADDQ CX, DI 23389 MULSS X0, X1 23390 MULSS X0, X2 23391 MULSS X0, X3 23392 MULSS X0, X4 23393 MULSS X0, X5 23394 MULSS X0, X6 23395 MULSS X0, X7 23396 MULSS X0, X8 23397 ADDSS (DX)(R8*4), X1 23398 MOVSS X1, (DX)(R8*4) 23399 ADDQ BX, R8 23400 ADDSS (DX)(R8*4), X2 23401 MOVSS X2, (DX)(R8*4) 23402 ADDQ BX, R8 23403 ADDSS (DX)(R8*4), X3 23404 MOVSS X3, (DX)(R8*4) 23405 ADDQ BX, R8 23406 ADDSS (DX)(R8*4), X4 23407 MOVSS X4, (DX)(R8*4) 23408 ADDQ BX, R8 23409 ADDSS (DX)(R8*4), X5 23410 MOVSS X5, (DX)(R8*4) 23411 ADDQ BX, R8 23412 ADDSS (DX)(R8*4), X6 23413 MOVSS X6, (DX)(R8*4) 23414 ADDQ BX, R8 23415 ADDSS (DX)(R8*4), X7 23416 MOVSS X7, (DX)(R8*4) 23417 ADDQ BX, R8 23418 ADDSS (DX)(R8*4), X8 23419 MOVSS X8, (DX)(R8*4) 23420 ADDQ BX, R8 23421 SUBQ $0x08, SI 23422 23423 check_limit_unroll: 23424 CMPQ SI, $0x08 23425 JHS loop_unroll 23426 JMP check_limit 23427 23428 loop: 23429 MOVSS (AX)(DI*4), X1 23430 MULSS X0, X1 23431 ADDSS (DX)(R8*4), X1 23432 MOVSS X1, (DX)(R8*4) 23433 DECQ SI 23434 ADDQ CX, DI 23435 ADDQ BX, R8 23436 23437 check_limit: 23438 CMPQ SI, $0x00 23439 JHI loop 23440 RET 23441 23442 // func AmdAxpyUnsafeXInterleave_V5A13R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 23443 // Requires: SSE 23444 TEXT ·AmdAxpyUnsafeXInterleave_V5A13R8(SB), NOSPLIT, $0-48 23445 MOVSS alpha+0(FP), X0 23446 MOVQ xs+8(FP), AX 23447 MOVQ incx+16(FP), CX 23448 MOVQ ys+24(FP), DX 23449 MOVQ incy+32(FP), BX 23450 MOVQ n+40(FP), SI 23451 XORQ DI, DI 23452 XORQ R8, R8 23453 JMP check_limit_unroll 23454 PCALIGN $0x08 23455 NOP 23456 NOP 23457 NOP 23458 NOP 23459 NOP 23460 23461 loop_unroll: 23462 MOVSS (AX)(DI*4), X1 23463 ADDQ CX, DI 23464 MOVSS (AX)(DI*4), X2 23465 ADDQ CX, DI 23466 MOVSS (AX)(DI*4), X3 23467 ADDQ CX, DI 23468 MOVSS (AX)(DI*4), X4 23469 ADDQ CX, DI 23470 MOVSS (AX)(DI*4), X5 23471 ADDQ CX, DI 23472 MOVSS 
(AX)(DI*4), X6 23473 ADDQ CX, DI 23474 MOVSS (AX)(DI*4), X7 23475 ADDQ CX, DI 23476 MOVSS (AX)(DI*4), X8 23477 ADDQ CX, DI 23478 MULSS X0, X1 23479 MULSS X0, X2 23480 MULSS X0, X3 23481 MULSS X0, X4 23482 MULSS X0, X5 23483 MULSS X0, X6 23484 MULSS X0, X7 23485 MULSS X0, X8 23486 ADDSS (DX)(R8*4), X1 23487 MOVSS X1, (DX)(R8*4) 23488 ADDQ BX, R8 23489 ADDSS (DX)(R8*4), X2 23490 MOVSS X2, (DX)(R8*4) 23491 ADDQ BX, R8 23492 ADDSS (DX)(R8*4), X3 23493 MOVSS X3, (DX)(R8*4) 23494 ADDQ BX, R8 23495 ADDSS (DX)(R8*4), X4 23496 MOVSS X4, (DX)(R8*4) 23497 ADDQ BX, R8 23498 ADDSS (DX)(R8*4), X5 23499 MOVSS X5, (DX)(R8*4) 23500 ADDQ BX, R8 23501 ADDSS (DX)(R8*4), X6 23502 MOVSS X6, (DX)(R8*4) 23503 ADDQ BX, R8 23504 ADDSS (DX)(R8*4), X7 23505 MOVSS X7, (DX)(R8*4) 23506 ADDQ BX, R8 23507 ADDSS (DX)(R8*4), X8 23508 MOVSS X8, (DX)(R8*4) 23509 ADDQ BX, R8 23510 SUBQ $0x08, SI 23511 23512 check_limit_unroll: 23513 CMPQ SI, $0x08 23514 JHS loop_unroll 23515 JMP check_limit 23516 23517 loop: 23518 MOVSS (AX)(DI*4), X1 23519 MULSS X0, X1 23520 ADDSS (DX)(R8*4), X1 23521 MOVSS X1, (DX)(R8*4) 23522 DECQ SI 23523 ADDQ CX, DI 23524 ADDQ BX, R8 23525 23526 check_limit: 23527 CMPQ SI, $0x00 23528 JHI loop 23529 RET 23530 23531 // func AmdAxpyUnsafeXInterleave_V0A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 23532 // Requires: SSE 23533 TEXT ·AmdAxpyUnsafeXInterleave_V0A14R8(SB), NOSPLIT, $0-48 23534 MOVSS alpha+0(FP), X0 23535 MOVQ xs+8(FP), AX 23536 MOVQ incx+16(FP), CX 23537 MOVQ ys+24(FP), DX 23538 MOVQ incy+32(FP), BX 23539 MOVQ n+40(FP), SI 23540 XORQ DI, DI 23541 XORQ R8, R8 23542 JMP check_limit_unroll 23543 PCALIGN $0x08 23544 NOP 23545 NOP 23546 NOP 23547 NOP 23548 NOP 23549 NOP 23550 23551 loop_unroll: 23552 MOVSS (AX)(DI*4), X1 23553 ADDQ CX, DI 23554 MOVSS (AX)(DI*4), X2 23555 ADDQ CX, DI 23556 MOVSS (AX)(DI*4), X3 23557 ADDQ CX, DI 23558 MOVSS (AX)(DI*4), X4 23559 ADDQ CX, DI 23560 MOVSS (AX)(DI*4), X5 23561 ADDQ CX, DI 23562 MOVSS (AX)(DI*4), X6 23563 ADDQ CX, DI 23564 MOVSS (AX)(DI*4), X7 23565 ADDQ CX, DI 23566 MOVSS (AX)(DI*4), X8 23567 ADDQ CX, DI 23568 MULSS X0, X1 23569 MULSS X0, X2 23570 MULSS X0, X3 23571 MULSS X0, X4 23572 MULSS X0, X5 23573 MULSS X0, X6 23574 MULSS X0, X7 23575 MULSS X0, X8 23576 ADDSS (DX)(R8*4), X1 23577 MOVSS X1, (DX)(R8*4) 23578 ADDQ BX, R8 23579 ADDSS (DX)(R8*4), X2 23580 MOVSS X2, (DX)(R8*4) 23581 ADDQ BX, R8 23582 ADDSS (DX)(R8*4), X3 23583 MOVSS X3, (DX)(R8*4) 23584 ADDQ BX, R8 23585 ADDSS (DX)(R8*4), X4 23586 MOVSS X4, (DX)(R8*4) 23587 ADDQ BX, R8 23588 ADDSS (DX)(R8*4), X5 23589 MOVSS X5, (DX)(R8*4) 23590 ADDQ BX, R8 23591 ADDSS (DX)(R8*4), X6 23592 MOVSS X6, (DX)(R8*4) 23593 ADDQ BX, R8 23594 ADDSS (DX)(R8*4), X7 23595 MOVSS X7, (DX)(R8*4) 23596 ADDQ BX, R8 23597 ADDSS (DX)(R8*4), X8 23598 MOVSS X8, (DX)(R8*4) 23599 ADDQ BX, R8 23600 SUBQ $0x08, SI 23601 23602 check_limit_unroll: 23603 CMPQ SI, $0x08 23604 JHS loop_unroll 23605 JMP check_limit 23606 23607 loop: 23608 MOVSS (AX)(DI*4), X1 23609 MULSS X0, X1 23610 ADDSS (DX)(R8*4), X1 23611 MOVSS X1, (DX)(R8*4) 23612 DECQ SI 23613 ADDQ CX, DI 23614 ADDQ BX, R8 23615 23616 check_limit: 23617 CMPQ SI, $0x00 23618 JHI loop 23619 RET 23620 23621 // func AmdAxpyUnsafeXInterleave_V1A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 23622 // Requires: SSE 23623 TEXT ·AmdAxpyUnsafeXInterleave_V1A14R8(SB), NOSPLIT, $0-48 23624 MOVSS alpha+0(FP), X0 23625 MOVQ xs+8(FP), AX 23626 MOVQ incx+16(FP), CX 23627 MOVQ ys+24(FP), DX 23628 MOVQ incy+32(FP), BX 
23629 MOVQ n+40(FP), SI 23630 XORQ DI, DI 23631 XORQ R8, R8 23632 JMP check_limit_unroll 23633 PCALIGN $0x08 23634 NOP 23635 NOP 23636 NOP 23637 NOP 23638 NOP 23639 NOP 23640 23641 loop_unroll: 23642 MOVSS (AX)(DI*4), X1 23643 ADDQ CX, DI 23644 MOVSS (AX)(DI*4), X2 23645 ADDQ CX, DI 23646 MOVSS (AX)(DI*4), X3 23647 ADDQ CX, DI 23648 MOVSS (AX)(DI*4), X4 23649 ADDQ CX, DI 23650 MOVSS (AX)(DI*4), X5 23651 ADDQ CX, DI 23652 MOVSS (AX)(DI*4), X6 23653 ADDQ CX, DI 23654 MOVSS (AX)(DI*4), X7 23655 ADDQ CX, DI 23656 MOVSS (AX)(DI*4), X8 23657 ADDQ CX, DI 23658 MULSS X0, X1 23659 MULSS X0, X2 23660 MULSS X0, X3 23661 MULSS X0, X4 23662 MULSS X0, X5 23663 MULSS X0, X6 23664 MULSS X0, X7 23665 MULSS X0, X8 23666 ADDSS (DX)(R8*4), X1 23667 MOVSS X1, (DX)(R8*4) 23668 ADDQ BX, R8 23669 ADDSS (DX)(R8*4), X2 23670 MOVSS X2, (DX)(R8*4) 23671 ADDQ BX, R8 23672 ADDSS (DX)(R8*4), X3 23673 MOVSS X3, (DX)(R8*4) 23674 ADDQ BX, R8 23675 ADDSS (DX)(R8*4), X4 23676 MOVSS X4, (DX)(R8*4) 23677 ADDQ BX, R8 23678 ADDSS (DX)(R8*4), X5 23679 MOVSS X5, (DX)(R8*4) 23680 ADDQ BX, R8 23681 ADDSS (DX)(R8*4), X6 23682 MOVSS X6, (DX)(R8*4) 23683 ADDQ BX, R8 23684 ADDSS (DX)(R8*4), X7 23685 MOVSS X7, (DX)(R8*4) 23686 ADDQ BX, R8 23687 ADDSS (DX)(R8*4), X8 23688 MOVSS X8, (DX)(R8*4) 23689 ADDQ BX, R8 23690 SUBQ $0x08, SI 23691 23692 check_limit_unroll: 23693 CMPQ SI, $0x08 23694 JHS loop_unroll 23695 JMP check_limit 23696 23697 loop: 23698 MOVSS (AX)(DI*4), X1 23699 MULSS X0, X1 23700 ADDSS (DX)(R8*4), X1 23701 MOVSS X1, (DX)(R8*4) 23702 DECQ SI 23703 ADDQ CX, DI 23704 ADDQ BX, R8 23705 23706 check_limit: 23707 CMPQ SI, $0x00 23708 JHI loop 23709 RET 23710 23711 // func AmdAxpyUnsafeXInterleave_V2A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 23712 // Requires: SSE 23713 TEXT ·AmdAxpyUnsafeXInterleave_V2A14R8(SB), NOSPLIT, $0-48 23714 MOVSS alpha+0(FP), X0 23715 MOVQ xs+8(FP), AX 23716 MOVQ incx+16(FP), CX 23717 MOVQ ys+24(FP), DX 23718 MOVQ incy+32(FP), BX 23719 MOVQ n+40(FP), SI 23720 XORQ DI, DI 23721 XORQ R8, R8 23722 JMP check_limit_unroll 23723 PCALIGN $0x08 23724 NOP 23725 NOP 23726 NOP 23727 NOP 23728 NOP 23729 NOP 23730 23731 loop_unroll: 23732 MOVSS (AX)(DI*4), X1 23733 ADDQ CX, DI 23734 MOVSS (AX)(DI*4), X2 23735 ADDQ CX, DI 23736 MOVSS (AX)(DI*4), X3 23737 ADDQ CX, DI 23738 MOVSS (AX)(DI*4), X4 23739 ADDQ CX, DI 23740 MOVSS (AX)(DI*4), X5 23741 ADDQ CX, DI 23742 MOVSS (AX)(DI*4), X6 23743 ADDQ CX, DI 23744 MOVSS (AX)(DI*4), X7 23745 ADDQ CX, DI 23746 MOVSS (AX)(DI*4), X8 23747 ADDQ CX, DI 23748 MULSS X0, X1 23749 MULSS X0, X2 23750 MULSS X0, X3 23751 MULSS X0, X4 23752 MULSS X0, X5 23753 MULSS X0, X6 23754 MULSS X0, X7 23755 MULSS X0, X8 23756 ADDSS (DX)(R8*4), X1 23757 MOVSS X1, (DX)(R8*4) 23758 ADDQ BX, R8 23759 ADDSS (DX)(R8*4), X2 23760 MOVSS X2, (DX)(R8*4) 23761 ADDQ BX, R8 23762 ADDSS (DX)(R8*4), X3 23763 MOVSS X3, (DX)(R8*4) 23764 ADDQ BX, R8 23765 ADDSS (DX)(R8*4), X4 23766 MOVSS X4, (DX)(R8*4) 23767 ADDQ BX, R8 23768 ADDSS (DX)(R8*4), X5 23769 MOVSS X5, (DX)(R8*4) 23770 ADDQ BX, R8 23771 ADDSS (DX)(R8*4), X6 23772 MOVSS X6, (DX)(R8*4) 23773 ADDQ BX, R8 23774 ADDSS (DX)(R8*4), X7 23775 MOVSS X7, (DX)(R8*4) 23776 ADDQ BX, R8 23777 ADDSS (DX)(R8*4), X8 23778 MOVSS X8, (DX)(R8*4) 23779 ADDQ BX, R8 23780 SUBQ $0x08, SI 23781 23782 check_limit_unroll: 23783 CMPQ SI, $0x08 23784 JHS loop_unroll 23785 JMP check_limit 23786 23787 loop: 23788 MOVSS (AX)(DI*4), X1 23789 MULSS X0, X1 23790 ADDSS (DX)(R8*4), X1 23791 MOVSS X1, (DX)(R8*4) 23792 DECQ SI 23793 ADDQ CX, DI 23794 ADDQ 
BX, R8 23795 23796 check_limit: 23797 CMPQ SI, $0x00 23798 JHI loop 23799 RET 23800 23801 // func AmdAxpyUnsafeXInterleave_V3A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 23802 // Requires: SSE 23803 TEXT ·AmdAxpyUnsafeXInterleave_V3A14R8(SB), NOSPLIT, $0-48 23804 MOVSS alpha+0(FP), X0 23805 MOVQ xs+8(FP), AX 23806 MOVQ incx+16(FP), CX 23807 MOVQ ys+24(FP), DX 23808 MOVQ incy+32(FP), BX 23809 MOVQ n+40(FP), SI 23810 XORQ DI, DI 23811 XORQ R8, R8 23812 JMP check_limit_unroll 23813 PCALIGN $0x08 23814 NOP 23815 NOP 23816 NOP 23817 NOP 23818 NOP 23819 NOP 23820 23821 loop_unroll: 23822 MOVSS (AX)(DI*4), X1 23823 ADDQ CX, DI 23824 MOVSS (AX)(DI*4), X2 23825 ADDQ CX, DI 23826 MOVSS (AX)(DI*4), X3 23827 ADDQ CX, DI 23828 MOVSS (AX)(DI*4), X4 23829 ADDQ CX, DI 23830 MOVSS (AX)(DI*4), X5 23831 ADDQ CX, DI 23832 MOVSS (AX)(DI*4), X6 23833 ADDQ CX, DI 23834 MOVSS (AX)(DI*4), X7 23835 ADDQ CX, DI 23836 MOVSS (AX)(DI*4), X8 23837 ADDQ CX, DI 23838 MULSS X0, X1 23839 MULSS X0, X2 23840 MULSS X0, X3 23841 MULSS X0, X4 23842 MULSS X0, X5 23843 MULSS X0, X6 23844 MULSS X0, X7 23845 MULSS X0, X8 23846 ADDSS (DX)(R8*4), X1 23847 MOVSS X1, (DX)(R8*4) 23848 ADDQ BX, R8 23849 ADDSS (DX)(R8*4), X2 23850 MOVSS X2, (DX)(R8*4) 23851 ADDQ BX, R8 23852 ADDSS (DX)(R8*4), X3 23853 MOVSS X3, (DX)(R8*4) 23854 ADDQ BX, R8 23855 ADDSS (DX)(R8*4), X4 23856 MOVSS X4, (DX)(R8*4) 23857 ADDQ BX, R8 23858 ADDSS (DX)(R8*4), X5 23859 MOVSS X5, (DX)(R8*4) 23860 ADDQ BX, R8 23861 ADDSS (DX)(R8*4), X6 23862 MOVSS X6, (DX)(R8*4) 23863 ADDQ BX, R8 23864 ADDSS (DX)(R8*4), X7 23865 MOVSS X7, (DX)(R8*4) 23866 ADDQ BX, R8 23867 ADDSS (DX)(R8*4), X8 23868 MOVSS X8, (DX)(R8*4) 23869 ADDQ BX, R8 23870 SUBQ $0x08, SI 23871 23872 check_limit_unroll: 23873 CMPQ SI, $0x08 23874 JHS loop_unroll 23875 JMP check_limit 23876 23877 loop: 23878 MOVSS (AX)(DI*4), X1 23879 MULSS X0, X1 23880 ADDSS (DX)(R8*4), X1 23881 MOVSS X1, (DX)(R8*4) 23882 DECQ SI 23883 ADDQ CX, DI 23884 ADDQ BX, R8 23885 23886 check_limit: 23887 CMPQ SI, $0x00 23888 JHI loop 23889 RET 23890 23891 // func AmdAxpyUnsafeXInterleave_V4A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 23892 // Requires: SSE 23893 TEXT ·AmdAxpyUnsafeXInterleave_V4A14R8(SB), NOSPLIT, $0-48 23894 MOVSS alpha+0(FP), X0 23895 MOVQ xs+8(FP), AX 23896 MOVQ incx+16(FP), CX 23897 MOVQ ys+24(FP), DX 23898 MOVQ incy+32(FP), BX 23899 MOVQ n+40(FP), SI 23900 XORQ DI, DI 23901 XORQ R8, R8 23902 JMP check_limit_unroll 23903 PCALIGN $0x08 23904 NOP 23905 NOP 23906 NOP 23907 NOP 23908 NOP 23909 NOP 23910 23911 loop_unroll: 23912 MOVSS (AX)(DI*4), X1 23913 ADDQ CX, DI 23914 MOVSS (AX)(DI*4), X2 23915 ADDQ CX, DI 23916 MOVSS (AX)(DI*4), X3 23917 ADDQ CX, DI 23918 MOVSS (AX)(DI*4), X4 23919 ADDQ CX, DI 23920 MOVSS (AX)(DI*4), X5 23921 ADDQ CX, DI 23922 MOVSS (AX)(DI*4), X6 23923 ADDQ CX, DI 23924 MOVSS (AX)(DI*4), X7 23925 ADDQ CX, DI 23926 MOVSS (AX)(DI*4), X8 23927 ADDQ CX, DI 23928 MULSS X0, X1 23929 MULSS X0, X2 23930 MULSS X0, X3 23931 MULSS X0, X4 23932 MULSS X0, X5 23933 MULSS X0, X6 23934 MULSS X0, X7 23935 MULSS X0, X8 23936 ADDSS (DX)(R8*4), X1 23937 MOVSS X1, (DX)(R8*4) 23938 ADDQ BX, R8 23939 ADDSS (DX)(R8*4), X2 23940 MOVSS X2, (DX)(R8*4) 23941 ADDQ BX, R8 23942 ADDSS (DX)(R8*4), X3 23943 MOVSS X3, (DX)(R8*4) 23944 ADDQ BX, R8 23945 ADDSS (DX)(R8*4), X4 23946 MOVSS X4, (DX)(R8*4) 23947 ADDQ BX, R8 23948 ADDSS (DX)(R8*4), X5 23949 MOVSS X5, (DX)(R8*4) 23950 ADDQ BX, R8 23951 ADDSS (DX)(R8*4), X6 23952 MOVSS X6, (DX)(R8*4) 23953 ADDQ 
BX, R8 23954 ADDSS (DX)(R8*4), X7 23955 MOVSS X7, (DX)(R8*4) 23956 ADDQ BX, R8 23957 ADDSS (DX)(R8*4), X8 23958 MOVSS X8, (DX)(R8*4) 23959 ADDQ BX, R8 23960 SUBQ $0x08, SI 23961 23962 check_limit_unroll: 23963 CMPQ SI, $0x08 23964 JHS loop_unroll 23965 JMP check_limit 23966 23967 loop: 23968 MOVSS (AX)(DI*4), X1 23969 MULSS X0, X1 23970 ADDSS (DX)(R8*4), X1 23971 MOVSS X1, (DX)(R8*4) 23972 DECQ SI 23973 ADDQ CX, DI 23974 ADDQ BX, R8 23975 23976 check_limit: 23977 CMPQ SI, $0x00 23978 JHI loop 23979 RET 23980 23981 // func AmdAxpyUnsafeXInterleave_V5A14R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 23982 // Requires: SSE 23983 TEXT ·AmdAxpyUnsafeXInterleave_V5A14R8(SB), NOSPLIT, $0-48 23984 MOVSS alpha+0(FP), X0 23985 MOVQ xs+8(FP), AX 23986 MOVQ incx+16(FP), CX 23987 MOVQ ys+24(FP), DX 23988 MOVQ incy+32(FP), BX 23989 MOVQ n+40(FP), SI 23990 XORQ DI, DI 23991 XORQ R8, R8 23992 JMP check_limit_unroll 23993 PCALIGN $0x08 23994 NOP 23995 NOP 23996 NOP 23997 NOP 23998 NOP 23999 NOP 24000 24001 loop_unroll: 24002 MOVSS (AX)(DI*4), X1 24003 ADDQ CX, DI 24004 MOVSS (AX)(DI*4), X2 24005 ADDQ CX, DI 24006 MOVSS (AX)(DI*4), X3 24007 ADDQ CX, DI 24008 MOVSS (AX)(DI*4), X4 24009 ADDQ CX, DI 24010 MOVSS (AX)(DI*4), X5 24011 ADDQ CX, DI 24012 MOVSS (AX)(DI*4), X6 24013 ADDQ CX, DI 24014 MOVSS (AX)(DI*4), X7 24015 ADDQ CX, DI 24016 MOVSS (AX)(DI*4), X8 24017 ADDQ CX, DI 24018 MULSS X0, X1 24019 MULSS X0, X2 24020 MULSS X0, X3 24021 MULSS X0, X4 24022 MULSS X0, X5 24023 MULSS X0, X6 24024 MULSS X0, X7 24025 MULSS X0, X8 24026 ADDSS (DX)(R8*4), X1 24027 MOVSS X1, (DX)(R8*4) 24028 ADDQ BX, R8 24029 ADDSS (DX)(R8*4), X2 24030 MOVSS X2, (DX)(R8*4) 24031 ADDQ BX, R8 24032 ADDSS (DX)(R8*4), X3 24033 MOVSS X3, (DX)(R8*4) 24034 ADDQ BX, R8 24035 ADDSS (DX)(R8*4), X4 24036 MOVSS X4, (DX)(R8*4) 24037 ADDQ BX, R8 24038 ADDSS (DX)(R8*4), X5 24039 MOVSS X5, (DX)(R8*4) 24040 ADDQ BX, R8 24041 ADDSS (DX)(R8*4), X6 24042 MOVSS X6, (DX)(R8*4) 24043 ADDQ BX, R8 24044 ADDSS (DX)(R8*4), X7 24045 MOVSS X7, (DX)(R8*4) 24046 ADDQ BX, R8 24047 ADDSS (DX)(R8*4), X8 24048 MOVSS X8, (DX)(R8*4) 24049 ADDQ BX, R8 24050 SUBQ $0x08, SI 24051 24052 check_limit_unroll: 24053 CMPQ SI, $0x08 24054 JHS loop_unroll 24055 JMP check_limit 24056 24057 loop: 24058 MOVSS (AX)(DI*4), X1 24059 MULSS X0, X1 24060 ADDSS (DX)(R8*4), X1 24061 MOVSS X1, (DX)(R8*4) 24062 DECQ SI 24063 ADDQ CX, DI 24064 ADDQ BX, R8 24065 24066 check_limit: 24067 CMPQ SI, $0x00 24068 JHI loop 24069 RET 24070 24071 // func AmdAxpyUnsafeXInterleave_V0A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 24072 // Requires: SSE 24073 TEXT ·AmdAxpyUnsafeXInterleave_V0A15R8(SB), NOSPLIT, $0-48 24074 MOVSS alpha+0(FP), X0 24075 MOVQ xs+8(FP), AX 24076 MOVQ incx+16(FP), CX 24077 MOVQ ys+24(FP), DX 24078 MOVQ incy+32(FP), BX 24079 MOVQ n+40(FP), SI 24080 XORQ DI, DI 24081 XORQ R8, R8 24082 JMP check_limit_unroll 24083 PCALIGN $0x08 24084 NOP 24085 NOP 24086 NOP 24087 NOP 24088 NOP 24089 NOP 24090 NOP 24091 24092 loop_unroll: 24093 MOVSS (AX)(DI*4), X1 24094 ADDQ CX, DI 24095 MOVSS (AX)(DI*4), X2 24096 ADDQ CX, DI 24097 MOVSS (AX)(DI*4), X3 24098 ADDQ CX, DI 24099 MOVSS (AX)(DI*4), X4 24100 ADDQ CX, DI 24101 MOVSS (AX)(DI*4), X5 24102 ADDQ CX, DI 24103 MOVSS (AX)(DI*4), X6 24104 ADDQ CX, DI 24105 MOVSS (AX)(DI*4), X7 24106 ADDQ CX, DI 24107 MOVSS (AX)(DI*4), X8 24108 ADDQ CX, DI 24109 MULSS X0, X1 24110 MULSS X0, X2 24111 MULSS X0, X3 24112 MULSS X0, X4 24113 MULSS X0, X5 24114 MULSS X0, X6 24115 MULSS X0, X7 24116 
MULSS X0, X8 24117 ADDSS (DX)(R8*4), X1 24118 MOVSS X1, (DX)(R8*4) 24119 ADDQ BX, R8 24120 ADDSS (DX)(R8*4), X2 24121 MOVSS X2, (DX)(R8*4) 24122 ADDQ BX, R8 24123 ADDSS (DX)(R8*4), X3 24124 MOVSS X3, (DX)(R8*4) 24125 ADDQ BX, R8 24126 ADDSS (DX)(R8*4), X4 24127 MOVSS X4, (DX)(R8*4) 24128 ADDQ BX, R8 24129 ADDSS (DX)(R8*4), X5 24130 MOVSS X5, (DX)(R8*4) 24131 ADDQ BX, R8 24132 ADDSS (DX)(R8*4), X6 24133 MOVSS X6, (DX)(R8*4) 24134 ADDQ BX, R8 24135 ADDSS (DX)(R8*4), X7 24136 MOVSS X7, (DX)(R8*4) 24137 ADDQ BX, R8 24138 ADDSS (DX)(R8*4), X8 24139 MOVSS X8, (DX)(R8*4) 24140 ADDQ BX, R8 24141 SUBQ $0x08, SI 24142 24143 check_limit_unroll: 24144 CMPQ SI, $0x08 24145 JHS loop_unroll 24146 JMP check_limit 24147 24148 loop: 24149 MOVSS (AX)(DI*4), X1 24150 MULSS X0, X1 24151 ADDSS (DX)(R8*4), X1 24152 MOVSS X1, (DX)(R8*4) 24153 DECQ SI 24154 ADDQ CX, DI 24155 ADDQ BX, R8 24156 24157 check_limit: 24158 CMPQ SI, $0x00 24159 JHI loop 24160 RET 24161 24162 // func AmdAxpyUnsafeXInterleave_V1A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 24163 // Requires: SSE 24164 TEXT ·AmdAxpyUnsafeXInterleave_V1A15R8(SB), NOSPLIT, $0-48 24165 MOVSS alpha+0(FP), X0 24166 MOVQ xs+8(FP), AX 24167 MOVQ incx+16(FP), CX 24168 MOVQ ys+24(FP), DX 24169 MOVQ incy+32(FP), BX 24170 MOVQ n+40(FP), SI 24171 XORQ DI, DI 24172 XORQ R8, R8 24173 JMP check_limit_unroll 24174 PCALIGN $0x08 24175 NOP 24176 NOP 24177 NOP 24178 NOP 24179 NOP 24180 NOP 24181 NOP 24182 24183 loop_unroll: 24184 MOVSS (AX)(DI*4), X1 24185 ADDQ CX, DI 24186 MOVSS (AX)(DI*4), X2 24187 ADDQ CX, DI 24188 MOVSS (AX)(DI*4), X3 24189 ADDQ CX, DI 24190 MOVSS (AX)(DI*4), X4 24191 ADDQ CX, DI 24192 MOVSS (AX)(DI*4), X5 24193 ADDQ CX, DI 24194 MOVSS (AX)(DI*4), X6 24195 ADDQ CX, DI 24196 MOVSS (AX)(DI*4), X7 24197 ADDQ CX, DI 24198 MOVSS (AX)(DI*4), X8 24199 ADDQ CX, DI 24200 MULSS X0, X1 24201 MULSS X0, X2 24202 MULSS X0, X3 24203 MULSS X0, X4 24204 MULSS X0, X5 24205 MULSS X0, X6 24206 MULSS X0, X7 24207 MULSS X0, X8 24208 ADDSS (DX)(R8*4), X1 24209 MOVSS X1, (DX)(R8*4) 24210 ADDQ BX, R8 24211 ADDSS (DX)(R8*4), X2 24212 MOVSS X2, (DX)(R8*4) 24213 ADDQ BX, R8 24214 ADDSS (DX)(R8*4), X3 24215 MOVSS X3, (DX)(R8*4) 24216 ADDQ BX, R8 24217 ADDSS (DX)(R8*4), X4 24218 MOVSS X4, (DX)(R8*4) 24219 ADDQ BX, R8 24220 ADDSS (DX)(R8*4), X5 24221 MOVSS X5, (DX)(R8*4) 24222 ADDQ BX, R8 24223 ADDSS (DX)(R8*4), X6 24224 MOVSS X6, (DX)(R8*4) 24225 ADDQ BX, R8 24226 ADDSS (DX)(R8*4), X7 24227 MOVSS X7, (DX)(R8*4) 24228 ADDQ BX, R8 24229 ADDSS (DX)(R8*4), X8 24230 MOVSS X8, (DX)(R8*4) 24231 ADDQ BX, R8 24232 SUBQ $0x08, SI 24233 24234 check_limit_unroll: 24235 CMPQ SI, $0x08 24236 JHS loop_unroll 24237 JMP check_limit 24238 24239 loop: 24240 MOVSS (AX)(DI*4), X1 24241 MULSS X0, X1 24242 ADDSS (DX)(R8*4), X1 24243 MOVSS X1, (DX)(R8*4) 24244 DECQ SI 24245 ADDQ CX, DI 24246 ADDQ BX, R8 24247 24248 check_limit: 24249 CMPQ SI, $0x00 24250 JHI loop 24251 RET 24252 24253 // func AmdAxpyUnsafeXInterleave_V2A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 24254 // Requires: SSE 24255 TEXT ·AmdAxpyUnsafeXInterleave_V2A15R8(SB), NOSPLIT, $0-48 24256 MOVSS alpha+0(FP), X0 24257 MOVQ xs+8(FP), AX 24258 MOVQ incx+16(FP), CX 24259 MOVQ ys+24(FP), DX 24260 MOVQ incy+32(FP), BX 24261 MOVQ n+40(FP), SI 24262 XORQ DI, DI 24263 XORQ R8, R8 24264 JMP check_limit_unroll 24265 PCALIGN $0x08 24266 NOP 24267 NOP 24268 NOP 24269 NOP 24270 NOP 24271 NOP 24272 NOP 24273 24274 loop_unroll: 24275 MOVSS (AX)(DI*4), X1 24276 ADDQ CX, DI 24277 
MOVSS (AX)(DI*4), X2 24278 ADDQ CX, DI 24279 MOVSS (AX)(DI*4), X3 24280 ADDQ CX, DI 24281 MOVSS (AX)(DI*4), X4 24282 ADDQ CX, DI 24283 MOVSS (AX)(DI*4), X5 24284 ADDQ CX, DI 24285 MOVSS (AX)(DI*4), X6 24286 ADDQ CX, DI 24287 MOVSS (AX)(DI*4), X7 24288 ADDQ CX, DI 24289 MOVSS (AX)(DI*4), X8 24290 ADDQ CX, DI 24291 MULSS X0, X1 24292 MULSS X0, X2 24293 MULSS X0, X3 24294 MULSS X0, X4 24295 MULSS X0, X5 24296 MULSS X0, X6 24297 MULSS X0, X7 24298 MULSS X0, X8 24299 ADDSS (DX)(R8*4), X1 24300 MOVSS X1, (DX)(R8*4) 24301 ADDQ BX, R8 24302 ADDSS (DX)(R8*4), X2 24303 MOVSS X2, (DX)(R8*4) 24304 ADDQ BX, R8 24305 ADDSS (DX)(R8*4), X3 24306 MOVSS X3, (DX)(R8*4) 24307 ADDQ BX, R8 24308 ADDSS (DX)(R8*4), X4 24309 MOVSS X4, (DX)(R8*4) 24310 ADDQ BX, R8 24311 ADDSS (DX)(R8*4), X5 24312 MOVSS X5, (DX)(R8*4) 24313 ADDQ BX, R8 24314 ADDSS (DX)(R8*4), X6 24315 MOVSS X6, (DX)(R8*4) 24316 ADDQ BX, R8 24317 ADDSS (DX)(R8*4), X7 24318 MOVSS X7, (DX)(R8*4) 24319 ADDQ BX, R8 24320 ADDSS (DX)(R8*4), X8 24321 MOVSS X8, (DX)(R8*4) 24322 ADDQ BX, R8 24323 SUBQ $0x08, SI 24324 24325 check_limit_unroll: 24326 CMPQ SI, $0x08 24327 JHS loop_unroll 24328 JMP check_limit 24329 24330 loop: 24331 MOVSS (AX)(DI*4), X1 24332 MULSS X0, X1 24333 ADDSS (DX)(R8*4), X1 24334 MOVSS X1, (DX)(R8*4) 24335 DECQ SI 24336 ADDQ CX, DI 24337 ADDQ BX, R8 24338 24339 check_limit: 24340 CMPQ SI, $0x00 24341 JHI loop 24342 RET 24343 24344 // func AmdAxpyUnsafeXInterleave_V3A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 24345 // Requires: SSE 24346 TEXT ·AmdAxpyUnsafeXInterleave_V3A15R8(SB), NOSPLIT, $0-48 24347 MOVSS alpha+0(FP), X0 24348 MOVQ xs+8(FP), AX 24349 MOVQ incx+16(FP), CX 24350 MOVQ ys+24(FP), DX 24351 MOVQ incy+32(FP), BX 24352 MOVQ n+40(FP), SI 24353 XORQ DI, DI 24354 XORQ R8, R8 24355 JMP check_limit_unroll 24356 PCALIGN $0x08 24357 NOP 24358 NOP 24359 NOP 24360 NOP 24361 NOP 24362 NOP 24363 NOP 24364 24365 loop_unroll: 24366 MOVSS (AX)(DI*4), X1 24367 ADDQ CX, DI 24368 MOVSS (AX)(DI*4), X2 24369 ADDQ CX, DI 24370 MOVSS (AX)(DI*4), X3 24371 ADDQ CX, DI 24372 MOVSS (AX)(DI*4), X4 24373 ADDQ CX, DI 24374 MOVSS (AX)(DI*4), X5 24375 ADDQ CX, DI 24376 MOVSS (AX)(DI*4), X6 24377 ADDQ CX, DI 24378 MOVSS (AX)(DI*4), X7 24379 ADDQ CX, DI 24380 MOVSS (AX)(DI*4), X8 24381 ADDQ CX, DI 24382 MULSS X0, X1 24383 MULSS X0, X2 24384 MULSS X0, X3 24385 MULSS X0, X4 24386 MULSS X0, X5 24387 MULSS X0, X6 24388 MULSS X0, X7 24389 MULSS X0, X8 24390 ADDSS (DX)(R8*4), X1 24391 MOVSS X1, (DX)(R8*4) 24392 ADDQ BX, R8 24393 ADDSS (DX)(R8*4), X2 24394 MOVSS X2, (DX)(R8*4) 24395 ADDQ BX, R8 24396 ADDSS (DX)(R8*4), X3 24397 MOVSS X3, (DX)(R8*4) 24398 ADDQ BX, R8 24399 ADDSS (DX)(R8*4), X4 24400 MOVSS X4, (DX)(R8*4) 24401 ADDQ BX, R8 24402 ADDSS (DX)(R8*4), X5 24403 MOVSS X5, (DX)(R8*4) 24404 ADDQ BX, R8 24405 ADDSS (DX)(R8*4), X6 24406 MOVSS X6, (DX)(R8*4) 24407 ADDQ BX, R8 24408 ADDSS (DX)(R8*4), X7 24409 MOVSS X7, (DX)(R8*4) 24410 ADDQ BX, R8 24411 ADDSS (DX)(R8*4), X8 24412 MOVSS X8, (DX)(R8*4) 24413 ADDQ BX, R8 24414 SUBQ $0x08, SI 24415 24416 check_limit_unroll: 24417 CMPQ SI, $0x08 24418 JHS loop_unroll 24419 JMP check_limit 24420 24421 loop: 24422 MOVSS (AX)(DI*4), X1 24423 MULSS X0, X1 24424 ADDSS (DX)(R8*4), X1 24425 MOVSS X1, (DX)(R8*4) 24426 DECQ SI 24427 ADDQ CX, DI 24428 ADDQ BX, R8 24429 24430 check_limit: 24431 CMPQ SI, $0x00 24432 JHI loop 24433 RET 24434 24435 // func AmdAxpyUnsafeXInterleave_V4A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 24436 // Requires: SSE 
24437 TEXT ·AmdAxpyUnsafeXInterleave_V4A15R8(SB), NOSPLIT, $0-48 24438 MOVSS alpha+0(FP), X0 24439 MOVQ xs+8(FP), AX 24440 MOVQ incx+16(FP), CX 24441 MOVQ ys+24(FP), DX 24442 MOVQ incy+32(FP), BX 24443 MOVQ n+40(FP), SI 24444 XORQ DI, DI 24445 XORQ R8, R8 24446 JMP check_limit_unroll 24447 PCALIGN $0x08 24448 NOP 24449 NOP 24450 NOP 24451 NOP 24452 NOP 24453 NOP 24454 NOP 24455 24456 loop_unroll: 24457 MOVSS (AX)(DI*4), X1 24458 ADDQ CX, DI 24459 MOVSS (AX)(DI*4), X2 24460 ADDQ CX, DI 24461 MOVSS (AX)(DI*4), X3 24462 ADDQ CX, DI 24463 MOVSS (AX)(DI*4), X4 24464 ADDQ CX, DI 24465 MOVSS (AX)(DI*4), X5 24466 ADDQ CX, DI 24467 MOVSS (AX)(DI*4), X6 24468 ADDQ CX, DI 24469 MOVSS (AX)(DI*4), X7 24470 ADDQ CX, DI 24471 MOVSS (AX)(DI*4), X8 24472 ADDQ CX, DI 24473 MULSS X0, X1 24474 MULSS X0, X2 24475 MULSS X0, X3 24476 MULSS X0, X4 24477 MULSS X0, X5 24478 MULSS X0, X6 24479 MULSS X0, X7 24480 MULSS X0, X8 24481 ADDSS (DX)(R8*4), X1 24482 MOVSS X1, (DX)(R8*4) 24483 ADDQ BX, R8 24484 ADDSS (DX)(R8*4), X2 24485 MOVSS X2, (DX)(R8*4) 24486 ADDQ BX, R8 24487 ADDSS (DX)(R8*4), X3 24488 MOVSS X3, (DX)(R8*4) 24489 ADDQ BX, R8 24490 ADDSS (DX)(R8*4), X4 24491 MOVSS X4, (DX)(R8*4) 24492 ADDQ BX, R8 24493 ADDSS (DX)(R8*4), X5 24494 MOVSS X5, (DX)(R8*4) 24495 ADDQ BX, R8 24496 ADDSS (DX)(R8*4), X6 24497 MOVSS X6, (DX)(R8*4) 24498 ADDQ BX, R8 24499 ADDSS (DX)(R8*4), X7 24500 MOVSS X7, (DX)(R8*4) 24501 ADDQ BX, R8 24502 ADDSS (DX)(R8*4), X8 24503 MOVSS X8, (DX)(R8*4) 24504 ADDQ BX, R8 24505 SUBQ $0x08, SI 24506 24507 check_limit_unroll: 24508 CMPQ SI, $0x08 24509 JHS loop_unroll 24510 JMP check_limit 24511 24512 loop: 24513 MOVSS (AX)(DI*4), X1 24514 MULSS X0, X1 24515 ADDSS (DX)(R8*4), X1 24516 MOVSS X1, (DX)(R8*4) 24517 DECQ SI 24518 ADDQ CX, DI 24519 ADDQ BX, R8 24520 24521 check_limit: 24522 CMPQ SI, $0x00 24523 JHI loop 24524 RET 24525 24526 // func AmdAxpyUnsafeXInterleave_V5A15R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 24527 // Requires: SSE 24528 TEXT ·AmdAxpyUnsafeXInterleave_V5A15R8(SB), NOSPLIT, $0-48 24529 MOVSS alpha+0(FP), X0 24530 MOVQ xs+8(FP), AX 24531 MOVQ incx+16(FP), CX 24532 MOVQ ys+24(FP), DX 24533 MOVQ incy+32(FP), BX 24534 MOVQ n+40(FP), SI 24535 XORQ DI, DI 24536 XORQ R8, R8 24537 JMP check_limit_unroll 24538 PCALIGN $0x08 24539 NOP 24540 NOP 24541 NOP 24542 NOP 24543 NOP 24544 NOP 24545 NOP 24546 24547 loop_unroll: 24548 MOVSS (AX)(DI*4), X1 24549 ADDQ CX, DI 24550 MOVSS (AX)(DI*4), X2 24551 ADDQ CX, DI 24552 MOVSS (AX)(DI*4), X3 24553 ADDQ CX, DI 24554 MOVSS (AX)(DI*4), X4 24555 ADDQ CX, DI 24556 MOVSS (AX)(DI*4), X5 24557 ADDQ CX, DI 24558 MOVSS (AX)(DI*4), X6 24559 ADDQ CX, DI 24560 MOVSS (AX)(DI*4), X7 24561 ADDQ CX, DI 24562 MOVSS (AX)(DI*4), X8 24563 ADDQ CX, DI 24564 MULSS X0, X1 24565 MULSS X0, X2 24566 MULSS X0, X3 24567 MULSS X0, X4 24568 MULSS X0, X5 24569 MULSS X0, X6 24570 MULSS X0, X7 24571 MULSS X0, X8 24572 ADDSS (DX)(R8*4), X1 24573 MOVSS X1, (DX)(R8*4) 24574 ADDQ BX, R8 24575 ADDSS (DX)(R8*4), X2 24576 MOVSS X2, (DX)(R8*4) 24577 ADDQ BX, R8 24578 ADDSS (DX)(R8*4), X3 24579 MOVSS X3, (DX)(R8*4) 24580 ADDQ BX, R8 24581 ADDSS (DX)(R8*4), X4 24582 MOVSS X4, (DX)(R8*4) 24583 ADDQ BX, R8 24584 ADDSS (DX)(R8*4), X5 24585 MOVSS X5, (DX)(R8*4) 24586 ADDQ BX, R8 24587 ADDSS (DX)(R8*4), X6 24588 MOVSS X6, (DX)(R8*4) 24589 ADDQ BX, R8 24590 ADDSS (DX)(R8*4), X7 24591 MOVSS X7, (DX)(R8*4) 24592 ADDQ BX, R8 24593 ADDSS (DX)(R8*4), X8 24594 MOVSS X8, (DX)(R8*4) 24595 ADDQ BX, R8 24596 SUBQ $0x08, SI 24597 24598 check_limit_unroll: 24599 CMPQ 
SI, $0x08 24600 JHS loop_unroll 24601 JMP check_limit 24602 24603 loop: 24604 MOVSS (AX)(DI*4), X1 24605 MULSS X0, X1 24606 ADDSS (DX)(R8*4), X1 24607 MOVSS X1, (DX)(R8*4) 24608 DECQ SI 24609 ADDQ CX, DI 24610 ADDQ BX, R8 24611 24612 check_limit: 24613 CMPQ SI, $0x00 24614 JHI loop 24615 RET 24616 24617 // func AmdAxpyUnsafeXInterleave_V0A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 24618 // Requires: SSE 24619 TEXT ·AmdAxpyUnsafeXInterleave_V0A16R8(SB), NOSPLIT, $0-48 24620 MOVSS alpha+0(FP), X0 24621 MOVQ xs+8(FP), AX 24622 MOVQ incx+16(FP), CX 24623 MOVQ ys+24(FP), DX 24624 MOVQ incy+32(FP), BX 24625 MOVQ n+40(FP), SI 24626 XORQ DI, DI 24627 XORQ R8, R8 24628 JMP check_limit_unroll 24629 PCALIGN $0x10 24630 24631 loop_unroll: 24632 MOVSS (AX)(DI*4), X1 24633 ADDQ CX, DI 24634 MOVSS (AX)(DI*4), X2 24635 ADDQ CX, DI 24636 MOVSS (AX)(DI*4), X3 24637 ADDQ CX, DI 24638 MOVSS (AX)(DI*4), X4 24639 ADDQ CX, DI 24640 MOVSS (AX)(DI*4), X5 24641 ADDQ CX, DI 24642 MOVSS (AX)(DI*4), X6 24643 ADDQ CX, DI 24644 MOVSS (AX)(DI*4), X7 24645 ADDQ CX, DI 24646 MOVSS (AX)(DI*4), X8 24647 ADDQ CX, DI 24648 MULSS X0, X1 24649 MULSS X0, X2 24650 MULSS X0, X3 24651 MULSS X0, X4 24652 MULSS X0, X5 24653 MULSS X0, X6 24654 MULSS X0, X7 24655 MULSS X0, X8 24656 ADDSS (DX)(R8*4), X1 24657 MOVSS X1, (DX)(R8*4) 24658 ADDQ BX, R8 24659 ADDSS (DX)(R8*4), X2 24660 MOVSS X2, (DX)(R8*4) 24661 ADDQ BX, R8 24662 ADDSS (DX)(R8*4), X3 24663 MOVSS X3, (DX)(R8*4) 24664 ADDQ BX, R8 24665 ADDSS (DX)(R8*4), X4 24666 MOVSS X4, (DX)(R8*4) 24667 ADDQ BX, R8 24668 ADDSS (DX)(R8*4), X5 24669 MOVSS X5, (DX)(R8*4) 24670 ADDQ BX, R8 24671 ADDSS (DX)(R8*4), X6 24672 MOVSS X6, (DX)(R8*4) 24673 ADDQ BX, R8 24674 ADDSS (DX)(R8*4), X7 24675 MOVSS X7, (DX)(R8*4) 24676 ADDQ BX, R8 24677 ADDSS (DX)(R8*4), X8 24678 MOVSS X8, (DX)(R8*4) 24679 ADDQ BX, R8 24680 SUBQ $0x08, SI 24681 24682 check_limit_unroll: 24683 CMPQ SI, $0x08 24684 JHS loop_unroll 24685 JMP check_limit 24686 24687 loop: 24688 MOVSS (AX)(DI*4), X1 24689 MULSS X0, X1 24690 ADDSS (DX)(R8*4), X1 24691 MOVSS X1, (DX)(R8*4) 24692 DECQ SI 24693 ADDQ CX, DI 24694 ADDQ BX, R8 24695 24696 check_limit: 24697 CMPQ SI, $0x00 24698 JHI loop 24699 RET 24700 24701 // func AmdAxpyUnsafeXInterleave_V1A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 24702 // Requires: SSE 24703 TEXT ·AmdAxpyUnsafeXInterleave_V1A16R8(SB), NOSPLIT, $0-48 24704 MOVSS alpha+0(FP), X0 24705 MOVQ xs+8(FP), AX 24706 MOVQ incx+16(FP), CX 24707 MOVQ ys+24(FP), DX 24708 MOVQ incy+32(FP), BX 24709 MOVQ n+40(FP), SI 24710 XORQ DI, DI 24711 XORQ R8, R8 24712 JMP check_limit_unroll 24713 PCALIGN $0x10 24714 24715 loop_unroll: 24716 MOVSS (AX)(DI*4), X1 24717 ADDQ CX, DI 24718 MOVSS (AX)(DI*4), X2 24719 ADDQ CX, DI 24720 MOVSS (AX)(DI*4), X3 24721 ADDQ CX, DI 24722 MOVSS (AX)(DI*4), X4 24723 ADDQ CX, DI 24724 MOVSS (AX)(DI*4), X5 24725 ADDQ CX, DI 24726 MOVSS (AX)(DI*4), X6 24727 ADDQ CX, DI 24728 MOVSS (AX)(DI*4), X7 24729 ADDQ CX, DI 24730 MOVSS (AX)(DI*4), X8 24731 ADDQ CX, DI 24732 MULSS X0, X1 24733 MULSS X0, X2 24734 MULSS X0, X3 24735 MULSS X0, X4 24736 MULSS X0, X5 24737 MULSS X0, X6 24738 MULSS X0, X7 24739 MULSS X0, X8 24740 ADDSS (DX)(R8*4), X1 24741 MOVSS X1, (DX)(R8*4) 24742 ADDQ BX, R8 24743 ADDSS (DX)(R8*4), X2 24744 MOVSS X2, (DX)(R8*4) 24745 ADDQ BX, R8 24746 ADDSS (DX)(R8*4), X3 24747 MOVSS X3, (DX)(R8*4) 24748 ADDQ BX, R8 24749 ADDSS (DX)(R8*4), X4 24750 MOVSS X4, (DX)(R8*4) 24751 ADDQ BX, R8 24752 ADDSS (DX)(R8*4), X5 24753 MOVSS X5, 
(DX)(R8*4) 24754 ADDQ BX, R8 24755 ADDSS (DX)(R8*4), X6 24756 MOVSS X6, (DX)(R8*4) 24757 ADDQ BX, R8 24758 ADDSS (DX)(R8*4), X7 24759 MOVSS X7, (DX)(R8*4) 24760 ADDQ BX, R8 24761 ADDSS (DX)(R8*4), X8 24762 MOVSS X8, (DX)(R8*4) 24763 ADDQ BX, R8 24764 SUBQ $0x08, SI 24765 24766 check_limit_unroll: 24767 CMPQ SI, $0x08 24768 JHS loop_unroll 24769 JMP check_limit 24770 24771 loop: 24772 MOVSS (AX)(DI*4), X1 24773 MULSS X0, X1 24774 ADDSS (DX)(R8*4), X1 24775 MOVSS X1, (DX)(R8*4) 24776 DECQ SI 24777 ADDQ CX, DI 24778 ADDQ BX, R8 24779 24780 check_limit: 24781 CMPQ SI, $0x00 24782 JHI loop 24783 RET 24784 24785 // func AmdAxpyUnsafeXInterleave_V2A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 24786 // Requires: SSE 24787 TEXT ·AmdAxpyUnsafeXInterleave_V2A16R8(SB), NOSPLIT, $0-48 24788 MOVSS alpha+0(FP), X0 24789 MOVQ xs+8(FP), AX 24790 MOVQ incx+16(FP), CX 24791 MOVQ ys+24(FP), DX 24792 MOVQ incy+32(FP), BX 24793 MOVQ n+40(FP), SI 24794 XORQ DI, DI 24795 XORQ R8, R8 24796 JMP check_limit_unroll 24797 PCALIGN $0x10 24798 24799 loop_unroll: 24800 MOVSS (AX)(DI*4), X1 24801 ADDQ CX, DI 24802 MOVSS (AX)(DI*4), X2 24803 ADDQ CX, DI 24804 MOVSS (AX)(DI*4), X3 24805 ADDQ CX, DI 24806 MOVSS (AX)(DI*4), X4 24807 ADDQ CX, DI 24808 MOVSS (AX)(DI*4), X5 24809 ADDQ CX, DI 24810 MOVSS (AX)(DI*4), X6 24811 ADDQ CX, DI 24812 MOVSS (AX)(DI*4), X7 24813 ADDQ CX, DI 24814 MOVSS (AX)(DI*4), X8 24815 ADDQ CX, DI 24816 MULSS X0, X1 24817 MULSS X0, X2 24818 MULSS X0, X3 24819 MULSS X0, X4 24820 MULSS X0, X5 24821 MULSS X0, X6 24822 MULSS X0, X7 24823 MULSS X0, X8 24824 ADDSS (DX)(R8*4), X1 24825 MOVSS X1, (DX)(R8*4) 24826 ADDQ BX, R8 24827 ADDSS (DX)(R8*4), X2 24828 MOVSS X2, (DX)(R8*4) 24829 ADDQ BX, R8 24830 ADDSS (DX)(R8*4), X3 24831 MOVSS X3, (DX)(R8*4) 24832 ADDQ BX, R8 24833 ADDSS (DX)(R8*4), X4 24834 MOVSS X4, (DX)(R8*4) 24835 ADDQ BX, R8 24836 ADDSS (DX)(R8*4), X5 24837 MOVSS X5, (DX)(R8*4) 24838 ADDQ BX, R8 24839 ADDSS (DX)(R8*4), X6 24840 MOVSS X6, (DX)(R8*4) 24841 ADDQ BX, R8 24842 ADDSS (DX)(R8*4), X7 24843 MOVSS X7, (DX)(R8*4) 24844 ADDQ BX, R8 24845 ADDSS (DX)(R8*4), X8 24846 MOVSS X8, (DX)(R8*4) 24847 ADDQ BX, R8 24848 SUBQ $0x08, SI 24849 24850 check_limit_unroll: 24851 CMPQ SI, $0x08 24852 JHS loop_unroll 24853 JMP check_limit 24854 24855 loop: 24856 MOVSS (AX)(DI*4), X1 24857 MULSS X0, X1 24858 ADDSS (DX)(R8*4), X1 24859 MOVSS X1, (DX)(R8*4) 24860 DECQ SI 24861 ADDQ CX, DI 24862 ADDQ BX, R8 24863 24864 check_limit: 24865 CMPQ SI, $0x00 24866 JHI loop 24867 RET 24868 24869 // func AmdAxpyUnsafeXInterleave_V3A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 24870 // Requires: SSE 24871 TEXT ·AmdAxpyUnsafeXInterleave_V3A16R8(SB), NOSPLIT, $0-48 24872 MOVSS alpha+0(FP), X0 24873 MOVQ xs+8(FP), AX 24874 MOVQ incx+16(FP), CX 24875 MOVQ ys+24(FP), DX 24876 MOVQ incy+32(FP), BX 24877 MOVQ n+40(FP), SI 24878 XORQ DI, DI 24879 XORQ R8, R8 24880 JMP check_limit_unroll 24881 PCALIGN $0x10 24882 24883 loop_unroll: 24884 MOVSS (AX)(DI*4), X1 24885 ADDQ CX, DI 24886 MOVSS (AX)(DI*4), X2 24887 ADDQ CX, DI 24888 MOVSS (AX)(DI*4), X3 24889 ADDQ CX, DI 24890 MOVSS (AX)(DI*4), X4 24891 ADDQ CX, DI 24892 MOVSS (AX)(DI*4), X5 24893 ADDQ CX, DI 24894 MOVSS (AX)(DI*4), X6 24895 ADDQ CX, DI 24896 MOVSS (AX)(DI*4), X7 24897 ADDQ CX, DI 24898 MOVSS (AX)(DI*4), X8 24899 ADDQ CX, DI 24900 MULSS X0, X1 24901 MULSS X0, X2 24902 MULSS X0, X3 24903 MULSS X0, X4 24904 MULSS X0, X5 24905 MULSS X0, X6 24906 MULSS X0, X7 24907 MULSS X0, X8 24908 ADDSS (DX)(R8*4), 
X1 24909 MOVSS X1, (DX)(R8*4) 24910 ADDQ BX, R8 24911 ADDSS (DX)(R8*4), X2 24912 MOVSS X2, (DX)(R8*4) 24913 ADDQ BX, R8 24914 ADDSS (DX)(R8*4), X3 24915 MOVSS X3, (DX)(R8*4) 24916 ADDQ BX, R8 24917 ADDSS (DX)(R8*4), X4 24918 MOVSS X4, (DX)(R8*4) 24919 ADDQ BX, R8 24920 ADDSS (DX)(R8*4), X5 24921 MOVSS X5, (DX)(R8*4) 24922 ADDQ BX, R8 24923 ADDSS (DX)(R8*4), X6 24924 MOVSS X6, (DX)(R8*4) 24925 ADDQ BX, R8 24926 ADDSS (DX)(R8*4), X7 24927 MOVSS X7, (DX)(R8*4) 24928 ADDQ BX, R8 24929 ADDSS (DX)(R8*4), X8 24930 MOVSS X8, (DX)(R8*4) 24931 ADDQ BX, R8 24932 SUBQ $0x08, SI 24933 24934 check_limit_unroll: 24935 CMPQ SI, $0x08 24936 JHS loop_unroll 24937 JMP check_limit 24938 24939 loop: 24940 MOVSS (AX)(DI*4), X1 24941 MULSS X0, X1 24942 ADDSS (DX)(R8*4), X1 24943 MOVSS X1, (DX)(R8*4) 24944 DECQ SI 24945 ADDQ CX, DI 24946 ADDQ BX, R8 24947 24948 check_limit: 24949 CMPQ SI, $0x00 24950 JHI loop 24951 RET 24952 24953 // func AmdAxpyUnsafeXInterleave_V4A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 24954 // Requires: SSE 24955 TEXT ·AmdAxpyUnsafeXInterleave_V4A16R8(SB), NOSPLIT, $0-48 24956 MOVSS alpha+0(FP), X0 24957 MOVQ xs+8(FP), AX 24958 MOVQ incx+16(FP), CX 24959 MOVQ ys+24(FP), DX 24960 MOVQ incy+32(FP), BX 24961 MOVQ n+40(FP), SI 24962 XORQ DI, DI 24963 XORQ R8, R8 24964 JMP check_limit_unroll 24965 PCALIGN $0x10 24966 24967 loop_unroll: 24968 MOVSS (AX)(DI*4), X1 24969 ADDQ CX, DI 24970 MOVSS (AX)(DI*4), X2 24971 ADDQ CX, DI 24972 MOVSS (AX)(DI*4), X3 24973 ADDQ CX, DI 24974 MOVSS (AX)(DI*4), X4 24975 ADDQ CX, DI 24976 MOVSS (AX)(DI*4), X5 24977 ADDQ CX, DI 24978 MOVSS (AX)(DI*4), X6 24979 ADDQ CX, DI 24980 MOVSS (AX)(DI*4), X7 24981 ADDQ CX, DI 24982 MOVSS (AX)(DI*4), X8 24983 ADDQ CX, DI 24984 MULSS X0, X1 24985 MULSS X0, X2 24986 MULSS X0, X3 24987 MULSS X0, X4 24988 MULSS X0, X5 24989 MULSS X0, X6 24990 MULSS X0, X7 24991 MULSS X0, X8 24992 ADDSS (DX)(R8*4), X1 24993 MOVSS X1, (DX)(R8*4) 24994 ADDQ BX, R8 24995 ADDSS (DX)(R8*4), X2 24996 MOVSS X2, (DX)(R8*4) 24997 ADDQ BX, R8 24998 ADDSS (DX)(R8*4), X3 24999 MOVSS X3, (DX)(R8*4) 25000 ADDQ BX, R8 25001 ADDSS (DX)(R8*4), X4 25002 MOVSS X4, (DX)(R8*4) 25003 ADDQ BX, R8 25004 ADDSS (DX)(R8*4), X5 25005 MOVSS X5, (DX)(R8*4) 25006 ADDQ BX, R8 25007 ADDSS (DX)(R8*4), X6 25008 MOVSS X6, (DX)(R8*4) 25009 ADDQ BX, R8 25010 ADDSS (DX)(R8*4), X7 25011 MOVSS X7, (DX)(R8*4) 25012 ADDQ BX, R8 25013 ADDSS (DX)(R8*4), X8 25014 MOVSS X8, (DX)(R8*4) 25015 ADDQ BX, R8 25016 SUBQ $0x08, SI 25017 25018 check_limit_unroll: 25019 CMPQ SI, $0x08 25020 JHS loop_unroll 25021 JMP check_limit 25022 25023 loop: 25024 MOVSS (AX)(DI*4), X1 25025 MULSS X0, X1 25026 ADDSS (DX)(R8*4), X1 25027 MOVSS X1, (DX)(R8*4) 25028 DECQ SI 25029 ADDQ CX, DI 25030 ADDQ BX, R8 25031 25032 check_limit: 25033 CMPQ SI, $0x00 25034 JHI loop 25035 RET 25036 25037 // func AmdAxpyUnsafeXInterleave_V5A16R8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25038 // Requires: SSE 25039 TEXT ·AmdAxpyUnsafeXInterleave_V5A16R8(SB), NOSPLIT, $0-48 25040 MOVSS alpha+0(FP), X0 25041 MOVQ xs+8(FP), AX 25042 MOVQ incx+16(FP), CX 25043 MOVQ ys+24(FP), DX 25044 MOVQ incy+32(FP), BX 25045 MOVQ n+40(FP), SI 25046 XORQ DI, DI 25047 XORQ R8, R8 25048 JMP check_limit_unroll 25049 PCALIGN $0x10 25050 25051 loop_unroll: 25052 MOVSS (AX)(DI*4), X1 25053 ADDQ CX, DI 25054 MOVSS (AX)(DI*4), X2 25055 ADDQ CX, DI 25056 MOVSS (AX)(DI*4), X3 25057 ADDQ CX, DI 25058 MOVSS (AX)(DI*4), X4 25059 ADDQ CX, DI 25060 MOVSS (AX)(DI*4), X5 25061 ADDQ CX, DI 
25062 MOVSS (AX)(DI*4), X6 25063 ADDQ CX, DI 25064 MOVSS (AX)(DI*4), X7 25065 ADDQ CX, DI 25066 MOVSS (AX)(DI*4), X8 25067 ADDQ CX, DI 25068 MULSS X0, X1 25069 MULSS X0, X2 25070 MULSS X0, X3 25071 MULSS X0, X4 25072 MULSS X0, X5 25073 MULSS X0, X6 25074 MULSS X0, X7 25075 MULSS X0, X8 25076 ADDSS (DX)(R8*4), X1 25077 MOVSS X1, (DX)(R8*4) 25078 ADDQ BX, R8 25079 ADDSS (DX)(R8*4), X2 25080 MOVSS X2, (DX)(R8*4) 25081 ADDQ BX, R8 25082 ADDSS (DX)(R8*4), X3 25083 MOVSS X3, (DX)(R8*4) 25084 ADDQ BX, R8 25085 ADDSS (DX)(R8*4), X4 25086 MOVSS X4, (DX)(R8*4) 25087 ADDQ BX, R8 25088 ADDSS (DX)(R8*4), X5 25089 MOVSS X5, (DX)(R8*4) 25090 ADDQ BX, R8 25091 ADDSS (DX)(R8*4), X6 25092 MOVSS X6, (DX)(R8*4) 25093 ADDQ BX, R8 25094 ADDSS (DX)(R8*4), X7 25095 MOVSS X7, (DX)(R8*4) 25096 ADDQ BX, R8 25097 ADDSS (DX)(R8*4), X8 25098 MOVSS X8, (DX)(R8*4) 25099 ADDQ BX, R8 25100 SUBQ $0x08, SI 25101 25102 check_limit_unroll: 25103 CMPQ SI, $0x08 25104 JHS loop_unroll 25105 JMP check_limit 25106 25107 loop: 25108 MOVSS (AX)(DI*4), X1 25109 MULSS X0, X1 25110 ADDSS (DX)(R8*4), X1 25111 MOVSS X1, (DX)(R8*4) 25112 DECQ SI 25113 ADDQ CX, DI 25114 ADDQ BX, R8 25115 25116 check_limit: 25117 CMPQ SI, $0x00 25118 JHI loop 25119 RET 25120 25121 // func AmdAxpyPointerLoopX_V0A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25122 // Requires: SSE 25123 TEXT ·AmdAxpyPointerLoopX_V0A0U4(SB), NOSPLIT, $0-48 25124 MOVSS alpha+0(FP), X0 25125 MOVQ xs+8(FP), AX 25126 MOVQ incx+16(FP), CX 25127 MOVQ ys+24(FP), DX 25128 MOVQ incy+32(FP), BX 25129 MOVQ n+40(FP), SI 25130 JMP check_limit_unroll 25131 25132 loop_unroll: 25133 MOVSS (AX), X1 25134 MULSS X0, X1 25135 ADDSS (DX), X1 25136 MOVSS X1, (DX) 25137 LEAQ (AX)(CX*4), AX 25138 LEAQ (DX)(BX*4), DX 25139 MOVSS (AX), X1 25140 MULSS X0, X1 25141 ADDSS (DX), X1 25142 MOVSS X1, (DX) 25143 LEAQ (AX)(CX*4), AX 25144 LEAQ (DX)(BX*4), DX 25145 MOVSS (AX), X1 25146 MULSS X0, X1 25147 ADDSS (DX), X1 25148 MOVSS X1, (DX) 25149 LEAQ (AX)(CX*4), AX 25150 LEAQ (DX)(BX*4), DX 25151 MOVSS (AX), X1 25152 MULSS X0, X1 25153 ADDSS (DX), X1 25154 MOVSS X1, (DX) 25155 LEAQ (AX)(CX*4), AX 25156 LEAQ (DX)(BX*4), DX 25157 SUBQ $0x04, SI 25158 25159 check_limit_unroll: 25160 CMPQ SI, $0x04 25161 JHS loop_unroll 25162 JMP check_limit 25163 25164 loop: 25165 MOVSS (AX), X1 25166 MULSS X0, X1 25167 ADDSS (DX), X1 25168 MOVSS X1, (DX) 25169 DECQ SI 25170 LEAQ (AX)(CX*4), AX 25171 LEAQ (DX)(BX*4), DX 25172 25173 check_limit: 25174 CMPQ SI, $0x00 25175 JHI loop 25176 RET 25177 25178 // func AmdAxpyPointerLoopX_V1A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25179 // Requires: SSE 25180 TEXT ·AmdAxpyPointerLoopX_V1A0U4(SB), NOSPLIT, $0-48 25181 MOVSS alpha+0(FP), X0 25182 MOVQ xs+8(FP), AX 25183 MOVQ incx+16(FP), CX 25184 MOVQ ys+24(FP), DX 25185 MOVQ incy+32(FP), BX 25186 MOVQ n+40(FP), SI 25187 JMP check_limit_unroll 25188 25189 loop_unroll: 25190 MOVSS (AX), X1 25191 MULSS X0, X1 25192 ADDSS (DX), X1 25193 MOVSS X1, (DX) 25194 LEAQ (AX)(CX*4), AX 25195 LEAQ (DX)(BX*4), DX 25196 MOVSS (AX), X1 25197 MULSS X0, X1 25198 ADDSS (DX), X1 25199 MOVSS X1, (DX) 25200 LEAQ (AX)(CX*4), AX 25201 LEAQ (DX)(BX*4), DX 25202 MOVSS (AX), X1 25203 MULSS X0, X1 25204 ADDSS (DX), X1 25205 MOVSS X1, (DX) 25206 LEAQ (AX)(CX*4), AX 25207 LEAQ (DX)(BX*4), DX 25208 MOVSS (AX), X1 25209 MULSS X0, X1 25210 ADDSS (DX), X1 25211 MOVSS X1, (DX) 25212 LEAQ (AX)(CX*4), AX 25213 LEAQ (DX)(BX*4), DX 25214 SUBQ $0x04, SI 25215 25216 check_limit_unroll: 25217 CMPQ SI, 
$0x04 25218 JHS loop_unroll 25219 JMP check_limit 25220 25221 loop: 25222 MOVSS (AX), X1 25223 MULSS X0, X1 25224 ADDSS (DX), X1 25225 MOVSS X1, (DX) 25226 DECQ SI 25227 LEAQ (AX)(CX*4), AX 25228 LEAQ (DX)(BX*4), DX 25229 25230 check_limit: 25231 CMPQ SI, $0x00 25232 JHI loop 25233 RET 25234 25235 // func AmdAxpyPointerLoopX_V2A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25236 // Requires: SSE 25237 TEXT ·AmdAxpyPointerLoopX_V2A0U4(SB), NOSPLIT, $0-48 25238 MOVSS alpha+0(FP), X0 25239 MOVQ xs+8(FP), AX 25240 MOVQ incx+16(FP), CX 25241 MOVQ ys+24(FP), DX 25242 MOVQ incy+32(FP), BX 25243 MOVQ n+40(FP), SI 25244 JMP check_limit_unroll 25245 25246 loop_unroll: 25247 MOVSS (AX), X1 25248 MULSS X0, X1 25249 ADDSS (DX), X1 25250 MOVSS X1, (DX) 25251 LEAQ (AX)(CX*4), AX 25252 LEAQ (DX)(BX*4), DX 25253 MOVSS (AX), X1 25254 MULSS X0, X1 25255 ADDSS (DX), X1 25256 MOVSS X1, (DX) 25257 LEAQ (AX)(CX*4), AX 25258 LEAQ (DX)(BX*4), DX 25259 MOVSS (AX), X1 25260 MULSS X0, X1 25261 ADDSS (DX), X1 25262 MOVSS X1, (DX) 25263 LEAQ (AX)(CX*4), AX 25264 LEAQ (DX)(BX*4), DX 25265 MOVSS (AX), X1 25266 MULSS X0, X1 25267 ADDSS (DX), X1 25268 MOVSS X1, (DX) 25269 LEAQ (AX)(CX*4), AX 25270 LEAQ (DX)(BX*4), DX 25271 SUBQ $0x04, SI 25272 25273 check_limit_unroll: 25274 CMPQ SI, $0x04 25275 JHS loop_unroll 25276 JMP check_limit 25277 25278 loop: 25279 MOVSS (AX), X1 25280 MULSS X0, X1 25281 ADDSS (DX), X1 25282 MOVSS X1, (DX) 25283 DECQ SI 25284 LEAQ (AX)(CX*4), AX 25285 LEAQ (DX)(BX*4), DX 25286 25287 check_limit: 25288 CMPQ SI, $0x00 25289 JHI loop 25290 RET 25291 25292 // func AmdAxpyPointerLoopX_V3A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25293 // Requires: SSE 25294 TEXT ·AmdAxpyPointerLoopX_V3A0U4(SB), NOSPLIT, $0-48 25295 MOVSS alpha+0(FP), X0 25296 MOVQ xs+8(FP), AX 25297 MOVQ incx+16(FP), CX 25298 MOVQ ys+24(FP), DX 25299 MOVQ incy+32(FP), BX 25300 MOVQ n+40(FP), SI 25301 JMP check_limit_unroll 25302 25303 loop_unroll: 25304 MOVSS (AX), X1 25305 MULSS X0, X1 25306 ADDSS (DX), X1 25307 MOVSS X1, (DX) 25308 LEAQ (AX)(CX*4), AX 25309 LEAQ (DX)(BX*4), DX 25310 MOVSS (AX), X1 25311 MULSS X0, X1 25312 ADDSS (DX), X1 25313 MOVSS X1, (DX) 25314 LEAQ (AX)(CX*4), AX 25315 LEAQ (DX)(BX*4), DX 25316 MOVSS (AX), X1 25317 MULSS X0, X1 25318 ADDSS (DX), X1 25319 MOVSS X1, (DX) 25320 LEAQ (AX)(CX*4), AX 25321 LEAQ (DX)(BX*4), DX 25322 MOVSS (AX), X1 25323 MULSS X0, X1 25324 ADDSS (DX), X1 25325 MOVSS X1, (DX) 25326 LEAQ (AX)(CX*4), AX 25327 LEAQ (DX)(BX*4), DX 25328 SUBQ $0x04, SI 25329 25330 check_limit_unroll: 25331 CMPQ SI, $0x04 25332 JHS loop_unroll 25333 JMP check_limit 25334 25335 loop: 25336 MOVSS (AX), X1 25337 MULSS X0, X1 25338 ADDSS (DX), X1 25339 MOVSS X1, (DX) 25340 DECQ SI 25341 LEAQ (AX)(CX*4), AX 25342 LEAQ (DX)(BX*4), DX 25343 25344 check_limit: 25345 CMPQ SI, $0x00 25346 JHI loop 25347 RET 25348 25349 // func AmdAxpyPointerLoopX_V4A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25350 // Requires: SSE 25351 TEXT ·AmdAxpyPointerLoopX_V4A0U4(SB), NOSPLIT, $0-48 25352 MOVSS alpha+0(FP), X0 25353 MOVQ xs+8(FP), AX 25354 MOVQ incx+16(FP), CX 25355 MOVQ ys+24(FP), DX 25356 MOVQ incy+32(FP), BX 25357 MOVQ n+40(FP), SI 25358 JMP check_limit_unroll 25359 25360 loop_unroll: 25361 MOVSS (AX), X1 25362 MULSS X0, X1 25363 ADDSS (DX), X1 25364 MOVSS X1, (DX) 25365 LEAQ (AX)(CX*4), AX 25366 LEAQ (DX)(BX*4), DX 25367 MOVSS (AX), X1 25368 MULSS X0, X1 25369 ADDSS (DX), X1 25370 MOVSS X1, (DX) 25371 LEAQ 
(AX)(CX*4), AX 25372 LEAQ (DX)(BX*4), DX 25373 MOVSS (AX), X1 25374 MULSS X0, X1 25375 ADDSS (DX), X1 25376 MOVSS X1, (DX) 25377 LEAQ (AX)(CX*4), AX 25378 LEAQ (DX)(BX*4), DX 25379 MOVSS (AX), X1 25380 MULSS X0, X1 25381 ADDSS (DX), X1 25382 MOVSS X1, (DX) 25383 LEAQ (AX)(CX*4), AX 25384 LEAQ (DX)(BX*4), DX 25385 SUBQ $0x04, SI 25386 25387 check_limit_unroll: 25388 CMPQ SI, $0x04 25389 JHS loop_unroll 25390 JMP check_limit 25391 25392 loop: 25393 MOVSS (AX), X1 25394 MULSS X0, X1 25395 ADDSS (DX), X1 25396 MOVSS X1, (DX) 25397 DECQ SI 25398 LEAQ (AX)(CX*4), AX 25399 LEAQ (DX)(BX*4), DX 25400 25401 check_limit: 25402 CMPQ SI, $0x00 25403 JHI loop 25404 RET 25405 25406 // func AmdAxpyPointerLoopX_V5A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25407 // Requires: SSE 25408 TEXT ·AmdAxpyPointerLoopX_V5A0U4(SB), NOSPLIT, $0-48 25409 MOVSS alpha+0(FP), X0 25410 MOVQ xs+8(FP), AX 25411 MOVQ incx+16(FP), CX 25412 MOVQ ys+24(FP), DX 25413 MOVQ incy+32(FP), BX 25414 MOVQ n+40(FP), SI 25415 JMP check_limit_unroll 25416 25417 loop_unroll: 25418 MOVSS (AX), X1 25419 MULSS X0, X1 25420 ADDSS (DX), X1 25421 MOVSS X1, (DX) 25422 LEAQ (AX)(CX*4), AX 25423 LEAQ (DX)(BX*4), DX 25424 MOVSS (AX), X1 25425 MULSS X0, X1 25426 ADDSS (DX), X1 25427 MOVSS X1, (DX) 25428 LEAQ (AX)(CX*4), AX 25429 LEAQ (DX)(BX*4), DX 25430 MOVSS (AX), X1 25431 MULSS X0, X1 25432 ADDSS (DX), X1 25433 MOVSS X1, (DX) 25434 LEAQ (AX)(CX*4), AX 25435 LEAQ (DX)(BX*4), DX 25436 MOVSS (AX), X1 25437 MULSS X0, X1 25438 ADDSS (DX), X1 25439 MOVSS X1, (DX) 25440 LEAQ (AX)(CX*4), AX 25441 LEAQ (DX)(BX*4), DX 25442 SUBQ $0x04, SI 25443 25444 check_limit_unroll: 25445 CMPQ SI, $0x04 25446 JHS loop_unroll 25447 JMP check_limit 25448 25449 loop: 25450 MOVSS (AX), X1 25451 MULSS X0, X1 25452 ADDSS (DX), X1 25453 MOVSS X1, (DX) 25454 DECQ SI 25455 LEAQ (AX)(CX*4), AX 25456 LEAQ (DX)(BX*4), DX 25457 25458 check_limit: 25459 CMPQ SI, $0x00 25460 JHI loop 25461 RET 25462 25463 // func AmdAxpyPointerLoopX_V0A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25464 // Requires: SSE 25465 TEXT ·AmdAxpyPointerLoopX_V0A8U4(SB), NOSPLIT, $0-48 25466 MOVSS alpha+0(FP), X0 25467 MOVQ xs+8(FP), AX 25468 MOVQ incx+16(FP), CX 25469 MOVQ ys+24(FP), DX 25470 MOVQ incy+32(FP), BX 25471 MOVQ n+40(FP), SI 25472 JMP check_limit_unroll 25473 PCALIGN $0x08 25474 25475 loop_unroll: 25476 MOVSS (AX), X1 25477 MULSS X0, X1 25478 ADDSS (DX), X1 25479 MOVSS X1, (DX) 25480 LEAQ (AX)(CX*4), AX 25481 LEAQ (DX)(BX*4), DX 25482 MOVSS (AX), X1 25483 MULSS X0, X1 25484 ADDSS (DX), X1 25485 MOVSS X1, (DX) 25486 LEAQ (AX)(CX*4), AX 25487 LEAQ (DX)(BX*4), DX 25488 MOVSS (AX), X1 25489 MULSS X0, X1 25490 ADDSS (DX), X1 25491 MOVSS X1, (DX) 25492 LEAQ (AX)(CX*4), AX 25493 LEAQ (DX)(BX*4), DX 25494 MOVSS (AX), X1 25495 MULSS X0, X1 25496 ADDSS (DX), X1 25497 MOVSS X1, (DX) 25498 LEAQ (AX)(CX*4), AX 25499 LEAQ (DX)(BX*4), DX 25500 SUBQ $0x04, SI 25501 25502 check_limit_unroll: 25503 CMPQ SI, $0x04 25504 JHS loop_unroll 25505 JMP check_limit 25506 25507 loop: 25508 MOVSS (AX), X1 25509 MULSS X0, X1 25510 ADDSS (DX), X1 25511 MOVSS X1, (DX) 25512 DECQ SI 25513 LEAQ (AX)(CX*4), AX 25514 LEAQ (DX)(BX*4), DX 25515 25516 check_limit: 25517 CMPQ SI, $0x00 25518 JHI loop 25519 RET 25520 25521 // func AmdAxpyPointerLoopX_V1A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25522 // Requires: SSE 25523 TEXT ·AmdAxpyPointerLoopX_V1A8U4(SB), NOSPLIT, $0-48 25524 MOVSS alpha+0(FP), X0 25525 MOVQ 
xs+8(FP), AX 25526 MOVQ incx+16(FP), CX 25527 MOVQ ys+24(FP), DX 25528 MOVQ incy+32(FP), BX 25529 MOVQ n+40(FP), SI 25530 JMP check_limit_unroll 25531 PCALIGN $0x08 25532 25533 loop_unroll: 25534 MOVSS (AX), X1 25535 MULSS X0, X1 25536 ADDSS (DX), X1 25537 MOVSS X1, (DX) 25538 LEAQ (AX)(CX*4), AX 25539 LEAQ (DX)(BX*4), DX 25540 MOVSS (AX), X1 25541 MULSS X0, X1 25542 ADDSS (DX), X1 25543 MOVSS X1, (DX) 25544 LEAQ (AX)(CX*4), AX 25545 LEAQ (DX)(BX*4), DX 25546 MOVSS (AX), X1 25547 MULSS X0, X1 25548 ADDSS (DX), X1 25549 MOVSS X1, (DX) 25550 LEAQ (AX)(CX*4), AX 25551 LEAQ (DX)(BX*4), DX 25552 MOVSS (AX), X1 25553 MULSS X0, X1 25554 ADDSS (DX), X1 25555 MOVSS X1, (DX) 25556 LEAQ (AX)(CX*4), AX 25557 LEAQ (DX)(BX*4), DX 25558 SUBQ $0x04, SI 25559 25560 check_limit_unroll: 25561 CMPQ SI, $0x04 25562 JHS loop_unroll 25563 JMP check_limit 25564 25565 loop: 25566 MOVSS (AX), X1 25567 MULSS X0, X1 25568 ADDSS (DX), X1 25569 MOVSS X1, (DX) 25570 DECQ SI 25571 LEAQ (AX)(CX*4), AX 25572 LEAQ (DX)(BX*4), DX 25573 25574 check_limit: 25575 CMPQ SI, $0x00 25576 JHI loop 25577 RET 25578 25579 // func AmdAxpyPointerLoopX_V2A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25580 // Requires: SSE 25581 TEXT ·AmdAxpyPointerLoopX_V2A8U4(SB), NOSPLIT, $0-48 25582 MOVSS alpha+0(FP), X0 25583 MOVQ xs+8(FP), AX 25584 MOVQ incx+16(FP), CX 25585 MOVQ ys+24(FP), DX 25586 MOVQ incy+32(FP), BX 25587 MOVQ n+40(FP), SI 25588 JMP check_limit_unroll 25589 PCALIGN $0x08 25590 25591 loop_unroll: 25592 MOVSS (AX), X1 25593 MULSS X0, X1 25594 ADDSS (DX), X1 25595 MOVSS X1, (DX) 25596 LEAQ (AX)(CX*4), AX 25597 LEAQ (DX)(BX*4), DX 25598 MOVSS (AX), X1 25599 MULSS X0, X1 25600 ADDSS (DX), X1 25601 MOVSS X1, (DX) 25602 LEAQ (AX)(CX*4), AX 25603 LEAQ (DX)(BX*4), DX 25604 MOVSS (AX), X1 25605 MULSS X0, X1 25606 ADDSS (DX), X1 25607 MOVSS X1, (DX) 25608 LEAQ (AX)(CX*4), AX 25609 LEAQ (DX)(BX*4), DX 25610 MOVSS (AX), X1 25611 MULSS X0, X1 25612 ADDSS (DX), X1 25613 MOVSS X1, (DX) 25614 LEAQ (AX)(CX*4), AX 25615 LEAQ (DX)(BX*4), DX 25616 SUBQ $0x04, SI 25617 25618 check_limit_unroll: 25619 CMPQ SI, $0x04 25620 JHS loop_unroll 25621 JMP check_limit 25622 25623 loop: 25624 MOVSS (AX), X1 25625 MULSS X0, X1 25626 ADDSS (DX), X1 25627 MOVSS X1, (DX) 25628 DECQ SI 25629 LEAQ (AX)(CX*4), AX 25630 LEAQ (DX)(BX*4), DX 25631 25632 check_limit: 25633 CMPQ SI, $0x00 25634 JHI loop 25635 RET 25636 25637 // func AmdAxpyPointerLoopX_V3A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25638 // Requires: SSE 25639 TEXT ·AmdAxpyPointerLoopX_V3A8U4(SB), NOSPLIT, $0-48 25640 MOVSS alpha+0(FP), X0 25641 MOVQ xs+8(FP), AX 25642 MOVQ incx+16(FP), CX 25643 MOVQ ys+24(FP), DX 25644 MOVQ incy+32(FP), BX 25645 MOVQ n+40(FP), SI 25646 JMP check_limit_unroll 25647 PCALIGN $0x08 25648 25649 loop_unroll: 25650 MOVSS (AX), X1 25651 MULSS X0, X1 25652 ADDSS (DX), X1 25653 MOVSS X1, (DX) 25654 LEAQ (AX)(CX*4), AX 25655 LEAQ (DX)(BX*4), DX 25656 MOVSS (AX), X1 25657 MULSS X0, X1 25658 ADDSS (DX), X1 25659 MOVSS X1, (DX) 25660 LEAQ (AX)(CX*4), AX 25661 LEAQ (DX)(BX*4), DX 25662 MOVSS (AX), X1 25663 MULSS X0, X1 25664 ADDSS (DX), X1 25665 MOVSS X1, (DX) 25666 LEAQ (AX)(CX*4), AX 25667 LEAQ (DX)(BX*4), DX 25668 MOVSS (AX), X1 25669 MULSS X0, X1 25670 ADDSS (DX), X1 25671 MOVSS X1, (DX) 25672 LEAQ (AX)(CX*4), AX 25673 LEAQ (DX)(BX*4), DX 25674 SUBQ $0x04, SI 25675 25676 check_limit_unroll: 25677 CMPQ SI, $0x04 25678 JHS loop_unroll 25679 JMP check_limit 25680 25681 loop: 25682 MOVSS (AX), X1 25683 MULSS 
X0, X1 25684 ADDSS (DX), X1 25685 MOVSS X1, (DX) 25686 DECQ SI 25687 LEAQ (AX)(CX*4), AX 25688 LEAQ (DX)(BX*4), DX 25689 25690 check_limit: 25691 CMPQ SI, $0x00 25692 JHI loop 25693 RET 25694 25695 // func AmdAxpyPointerLoopX_V4A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25696 // Requires: SSE 25697 TEXT ·AmdAxpyPointerLoopX_V4A8U4(SB), NOSPLIT, $0-48 25698 MOVSS alpha+0(FP), X0 25699 MOVQ xs+8(FP), AX 25700 MOVQ incx+16(FP), CX 25701 MOVQ ys+24(FP), DX 25702 MOVQ incy+32(FP), BX 25703 MOVQ n+40(FP), SI 25704 JMP check_limit_unroll 25705 PCALIGN $0x08 25706 25707 loop_unroll: 25708 MOVSS (AX), X1 25709 MULSS X0, X1 25710 ADDSS (DX), X1 25711 MOVSS X1, (DX) 25712 LEAQ (AX)(CX*4), AX 25713 LEAQ (DX)(BX*4), DX 25714 MOVSS (AX), X1 25715 MULSS X0, X1 25716 ADDSS (DX), X1 25717 MOVSS X1, (DX) 25718 LEAQ (AX)(CX*4), AX 25719 LEAQ (DX)(BX*4), DX 25720 MOVSS (AX), X1 25721 MULSS X0, X1 25722 ADDSS (DX), X1 25723 MOVSS X1, (DX) 25724 LEAQ (AX)(CX*4), AX 25725 LEAQ (DX)(BX*4), DX 25726 MOVSS (AX), X1 25727 MULSS X0, X1 25728 ADDSS (DX), X1 25729 MOVSS X1, (DX) 25730 LEAQ (AX)(CX*4), AX 25731 LEAQ (DX)(BX*4), DX 25732 SUBQ $0x04, SI 25733 25734 check_limit_unroll: 25735 CMPQ SI, $0x04 25736 JHS loop_unroll 25737 JMP check_limit 25738 25739 loop: 25740 MOVSS (AX), X1 25741 MULSS X0, X1 25742 ADDSS (DX), X1 25743 MOVSS X1, (DX) 25744 DECQ SI 25745 LEAQ (AX)(CX*4), AX 25746 LEAQ (DX)(BX*4), DX 25747 25748 check_limit: 25749 CMPQ SI, $0x00 25750 JHI loop 25751 RET 25752 25753 // func AmdAxpyPointerLoopX_V5A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25754 // Requires: SSE 25755 TEXT ·AmdAxpyPointerLoopX_V5A8U4(SB), NOSPLIT, $0-48 25756 MOVSS alpha+0(FP), X0 25757 MOVQ xs+8(FP), AX 25758 MOVQ incx+16(FP), CX 25759 MOVQ ys+24(FP), DX 25760 MOVQ incy+32(FP), BX 25761 MOVQ n+40(FP), SI 25762 JMP check_limit_unroll 25763 PCALIGN $0x08 25764 25765 loop_unroll: 25766 MOVSS (AX), X1 25767 MULSS X0, X1 25768 ADDSS (DX), X1 25769 MOVSS X1, (DX) 25770 LEAQ (AX)(CX*4), AX 25771 LEAQ (DX)(BX*4), DX 25772 MOVSS (AX), X1 25773 MULSS X0, X1 25774 ADDSS (DX), X1 25775 MOVSS X1, (DX) 25776 LEAQ (AX)(CX*4), AX 25777 LEAQ (DX)(BX*4), DX 25778 MOVSS (AX), X1 25779 MULSS X0, X1 25780 ADDSS (DX), X1 25781 MOVSS X1, (DX) 25782 LEAQ (AX)(CX*4), AX 25783 LEAQ (DX)(BX*4), DX 25784 MOVSS (AX), X1 25785 MULSS X0, X1 25786 ADDSS (DX), X1 25787 MOVSS X1, (DX) 25788 LEAQ (AX)(CX*4), AX 25789 LEAQ (DX)(BX*4), DX 25790 SUBQ $0x04, SI 25791 25792 check_limit_unroll: 25793 CMPQ SI, $0x04 25794 JHS loop_unroll 25795 JMP check_limit 25796 25797 loop: 25798 MOVSS (AX), X1 25799 MULSS X0, X1 25800 ADDSS (DX), X1 25801 MOVSS X1, (DX) 25802 DECQ SI 25803 LEAQ (AX)(CX*4), AX 25804 LEAQ (DX)(BX*4), DX 25805 25806 check_limit: 25807 CMPQ SI, $0x00 25808 JHI loop 25809 RET 25810 25811 // func AmdAxpyPointerLoopX_V0A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25812 // Requires: SSE 25813 TEXT ·AmdAxpyPointerLoopX_V0A9U4(SB), NOSPLIT, $0-48 25814 MOVSS alpha+0(FP), X0 25815 MOVQ xs+8(FP), AX 25816 MOVQ incx+16(FP), CX 25817 MOVQ ys+24(FP), DX 25818 MOVQ incy+32(FP), BX 25819 MOVQ n+40(FP), SI 25820 JMP check_limit_unroll 25821 PCALIGN $0x08 25822 NOP 25823 25824 loop_unroll: 25825 MOVSS (AX), X1 25826 MULSS X0, X1 25827 ADDSS (DX), X1 25828 MOVSS X1, (DX) 25829 LEAQ (AX)(CX*4), AX 25830 LEAQ (DX)(BX*4), DX 25831 MOVSS (AX), X1 25832 MULSS X0, X1 25833 ADDSS (DX), X1 25834 MOVSS X1, (DX) 25835 LEAQ (AX)(CX*4), AX 25836 LEAQ 
(DX)(BX*4), DX 25837 MOVSS (AX), X1 25838 MULSS X0, X1 25839 ADDSS (DX), X1 25840 MOVSS X1, (DX) 25841 LEAQ (AX)(CX*4), AX 25842 LEAQ (DX)(BX*4), DX 25843 MOVSS (AX), X1 25844 MULSS X0, X1 25845 ADDSS (DX), X1 25846 MOVSS X1, (DX) 25847 LEAQ (AX)(CX*4), AX 25848 LEAQ (DX)(BX*4), DX 25849 SUBQ $0x04, SI 25850 25851 check_limit_unroll: 25852 CMPQ SI, $0x04 25853 JHS loop_unroll 25854 JMP check_limit 25855 25856 loop: 25857 MOVSS (AX), X1 25858 MULSS X0, X1 25859 ADDSS (DX), X1 25860 MOVSS X1, (DX) 25861 DECQ SI 25862 LEAQ (AX)(CX*4), AX 25863 LEAQ (DX)(BX*4), DX 25864 25865 check_limit: 25866 CMPQ SI, $0x00 25867 JHI loop 25868 RET 25869 25870 // func AmdAxpyPointerLoopX_V1A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25871 // Requires: SSE 25872 TEXT ·AmdAxpyPointerLoopX_V1A9U4(SB), NOSPLIT, $0-48 25873 MOVSS alpha+0(FP), X0 25874 MOVQ xs+8(FP), AX 25875 MOVQ incx+16(FP), CX 25876 MOVQ ys+24(FP), DX 25877 MOVQ incy+32(FP), BX 25878 MOVQ n+40(FP), SI 25879 JMP check_limit_unroll 25880 PCALIGN $0x08 25881 NOP 25882 25883 loop_unroll: 25884 MOVSS (AX), X1 25885 MULSS X0, X1 25886 ADDSS (DX), X1 25887 MOVSS X1, (DX) 25888 LEAQ (AX)(CX*4), AX 25889 LEAQ (DX)(BX*4), DX 25890 MOVSS (AX), X1 25891 MULSS X0, X1 25892 ADDSS (DX), X1 25893 MOVSS X1, (DX) 25894 LEAQ (AX)(CX*4), AX 25895 LEAQ (DX)(BX*4), DX 25896 MOVSS (AX), X1 25897 MULSS X0, X1 25898 ADDSS (DX), X1 25899 MOVSS X1, (DX) 25900 LEAQ (AX)(CX*4), AX 25901 LEAQ (DX)(BX*4), DX 25902 MOVSS (AX), X1 25903 MULSS X0, X1 25904 ADDSS (DX), X1 25905 MOVSS X1, (DX) 25906 LEAQ (AX)(CX*4), AX 25907 LEAQ (DX)(BX*4), DX 25908 SUBQ $0x04, SI 25909 25910 check_limit_unroll: 25911 CMPQ SI, $0x04 25912 JHS loop_unroll 25913 JMP check_limit 25914 25915 loop: 25916 MOVSS (AX), X1 25917 MULSS X0, X1 25918 ADDSS (DX), X1 25919 MOVSS X1, (DX) 25920 DECQ SI 25921 LEAQ (AX)(CX*4), AX 25922 LEAQ (DX)(BX*4), DX 25923 25924 check_limit: 25925 CMPQ SI, $0x00 25926 JHI loop 25927 RET 25928 25929 // func AmdAxpyPointerLoopX_V2A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25930 // Requires: SSE 25931 TEXT ·AmdAxpyPointerLoopX_V2A9U4(SB), NOSPLIT, $0-48 25932 MOVSS alpha+0(FP), X0 25933 MOVQ xs+8(FP), AX 25934 MOVQ incx+16(FP), CX 25935 MOVQ ys+24(FP), DX 25936 MOVQ incy+32(FP), BX 25937 MOVQ n+40(FP), SI 25938 JMP check_limit_unroll 25939 PCALIGN $0x08 25940 NOP 25941 25942 loop_unroll: 25943 MOVSS (AX), X1 25944 MULSS X0, X1 25945 ADDSS (DX), X1 25946 MOVSS X1, (DX) 25947 LEAQ (AX)(CX*4), AX 25948 LEAQ (DX)(BX*4), DX 25949 MOVSS (AX), X1 25950 MULSS X0, X1 25951 ADDSS (DX), X1 25952 MOVSS X1, (DX) 25953 LEAQ (AX)(CX*4), AX 25954 LEAQ (DX)(BX*4), DX 25955 MOVSS (AX), X1 25956 MULSS X0, X1 25957 ADDSS (DX), X1 25958 MOVSS X1, (DX) 25959 LEAQ (AX)(CX*4), AX 25960 LEAQ (DX)(BX*4), DX 25961 MOVSS (AX), X1 25962 MULSS X0, X1 25963 ADDSS (DX), X1 25964 MOVSS X1, (DX) 25965 LEAQ (AX)(CX*4), AX 25966 LEAQ (DX)(BX*4), DX 25967 SUBQ $0x04, SI 25968 25969 check_limit_unroll: 25970 CMPQ SI, $0x04 25971 JHS loop_unroll 25972 JMP check_limit 25973 25974 loop: 25975 MOVSS (AX), X1 25976 MULSS X0, X1 25977 ADDSS (DX), X1 25978 MOVSS X1, (DX) 25979 DECQ SI 25980 LEAQ (AX)(CX*4), AX 25981 LEAQ (DX)(BX*4), DX 25982 25983 check_limit: 25984 CMPQ SI, $0x00 25985 JHI loop 25986 RET 25987 25988 // func AmdAxpyPointerLoopX_V3A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 25989 // Requires: SSE 25990 TEXT ·AmdAxpyPointerLoopX_V3A9U4(SB), NOSPLIT, $0-48 25991 MOVSS alpha+0(FP), X0 
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x04, SI

check_limit_unroll:
	CMPQ SI, $0x04
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V4A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V4A9U4(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x04, SI

check_limit_unroll:
	CMPQ SI, $0x04
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V5A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V5A9U4(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x04, SI

check_limit_unroll:
	CMPQ SI, $0x04
	JHS loop_unroll
	JMP check_limit

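// Reader annotation: the unrolled block above advances four elements per pass (the
// U4 in the name); the loop below is the scalar tail that finishes the remaining
// n mod 4 elements one at a time, stepping both pointers by their element strides.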
loop: 26152 MOVSS (AX), X1 26153 MULSS X0, X1 26154 ADDSS (DX), X1 26155 MOVSS X1, (DX) 26156 DECQ SI 26157 LEAQ (AX)(CX*4), AX 26158 LEAQ (DX)(BX*4), DX 26159 26160 check_limit: 26161 CMPQ SI, $0x00 26162 JHI loop 26163 RET 26164 26165 // func AmdAxpyPointerLoopX_V0A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26166 // Requires: SSE 26167 TEXT ·AmdAxpyPointerLoopX_V0A10U4(SB), NOSPLIT, $0-48 26168 MOVSS alpha+0(FP), X0 26169 MOVQ xs+8(FP), AX 26170 MOVQ incx+16(FP), CX 26171 MOVQ ys+24(FP), DX 26172 MOVQ incy+32(FP), BX 26173 MOVQ n+40(FP), SI 26174 JMP check_limit_unroll 26175 PCALIGN $0x08 26176 NOP 26177 NOP 26178 26179 loop_unroll: 26180 MOVSS (AX), X1 26181 MULSS X0, X1 26182 ADDSS (DX), X1 26183 MOVSS X1, (DX) 26184 LEAQ (AX)(CX*4), AX 26185 LEAQ (DX)(BX*4), DX 26186 MOVSS (AX), X1 26187 MULSS X0, X1 26188 ADDSS (DX), X1 26189 MOVSS X1, (DX) 26190 LEAQ (AX)(CX*4), AX 26191 LEAQ (DX)(BX*4), DX 26192 MOVSS (AX), X1 26193 MULSS X0, X1 26194 ADDSS (DX), X1 26195 MOVSS X1, (DX) 26196 LEAQ (AX)(CX*4), AX 26197 LEAQ (DX)(BX*4), DX 26198 MOVSS (AX), X1 26199 MULSS X0, X1 26200 ADDSS (DX), X1 26201 MOVSS X1, (DX) 26202 LEAQ (AX)(CX*4), AX 26203 LEAQ (DX)(BX*4), DX 26204 SUBQ $0x04, SI 26205 26206 check_limit_unroll: 26207 CMPQ SI, $0x04 26208 JHS loop_unroll 26209 JMP check_limit 26210 26211 loop: 26212 MOVSS (AX), X1 26213 MULSS X0, X1 26214 ADDSS (DX), X1 26215 MOVSS X1, (DX) 26216 DECQ SI 26217 LEAQ (AX)(CX*4), AX 26218 LEAQ (DX)(BX*4), DX 26219 26220 check_limit: 26221 CMPQ SI, $0x00 26222 JHI loop 26223 RET 26224 26225 // func AmdAxpyPointerLoopX_V1A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26226 // Requires: SSE 26227 TEXT ·AmdAxpyPointerLoopX_V1A10U4(SB), NOSPLIT, $0-48 26228 MOVSS alpha+0(FP), X0 26229 MOVQ xs+8(FP), AX 26230 MOVQ incx+16(FP), CX 26231 MOVQ ys+24(FP), DX 26232 MOVQ incy+32(FP), BX 26233 MOVQ n+40(FP), SI 26234 JMP check_limit_unroll 26235 PCALIGN $0x08 26236 NOP 26237 NOP 26238 26239 loop_unroll: 26240 MOVSS (AX), X1 26241 MULSS X0, X1 26242 ADDSS (DX), X1 26243 MOVSS X1, (DX) 26244 LEAQ (AX)(CX*4), AX 26245 LEAQ (DX)(BX*4), DX 26246 MOVSS (AX), X1 26247 MULSS X0, X1 26248 ADDSS (DX), X1 26249 MOVSS X1, (DX) 26250 LEAQ (AX)(CX*4), AX 26251 LEAQ (DX)(BX*4), DX 26252 MOVSS (AX), X1 26253 MULSS X0, X1 26254 ADDSS (DX), X1 26255 MOVSS X1, (DX) 26256 LEAQ (AX)(CX*4), AX 26257 LEAQ (DX)(BX*4), DX 26258 MOVSS (AX), X1 26259 MULSS X0, X1 26260 ADDSS (DX), X1 26261 MOVSS X1, (DX) 26262 LEAQ (AX)(CX*4), AX 26263 LEAQ (DX)(BX*4), DX 26264 SUBQ $0x04, SI 26265 26266 check_limit_unroll: 26267 CMPQ SI, $0x04 26268 JHS loop_unroll 26269 JMP check_limit 26270 26271 loop: 26272 MOVSS (AX), X1 26273 MULSS X0, X1 26274 ADDSS (DX), X1 26275 MOVSS X1, (DX) 26276 DECQ SI 26277 LEAQ (AX)(CX*4), AX 26278 LEAQ (DX)(BX*4), DX 26279 26280 check_limit: 26281 CMPQ SI, $0x00 26282 JHI loop 26283 RET 26284 26285 // func AmdAxpyPointerLoopX_V2A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26286 // Requires: SSE 26287 TEXT ·AmdAxpyPointerLoopX_V2A10U4(SB), NOSPLIT, $0-48 26288 MOVSS alpha+0(FP), X0 26289 MOVQ xs+8(FP), AX 26290 MOVQ incx+16(FP), CX 26291 MOVQ ys+24(FP), DX 26292 MOVQ incy+32(FP), BX 26293 MOVQ n+40(FP), SI 26294 JMP check_limit_unroll 26295 PCALIGN $0x08 26296 NOP 26297 NOP 26298 26299 loop_unroll: 26300 MOVSS (AX), X1 26301 MULSS X0, X1 26302 ADDSS (DX), X1 26303 MOVSS X1, (DX) 26304 LEAQ (AX)(CX*4), AX 26305 LEAQ (DX)(BX*4), DX 26306 MOVSS (AX), X1 26307 MULSS 
X0, X1 26308 ADDSS (DX), X1 26309 MOVSS X1, (DX) 26310 LEAQ (AX)(CX*4), AX 26311 LEAQ (DX)(BX*4), DX 26312 MOVSS (AX), X1 26313 MULSS X0, X1 26314 ADDSS (DX), X1 26315 MOVSS X1, (DX) 26316 LEAQ (AX)(CX*4), AX 26317 LEAQ (DX)(BX*4), DX 26318 MOVSS (AX), X1 26319 MULSS X0, X1 26320 ADDSS (DX), X1 26321 MOVSS X1, (DX) 26322 LEAQ (AX)(CX*4), AX 26323 LEAQ (DX)(BX*4), DX 26324 SUBQ $0x04, SI 26325 26326 check_limit_unroll: 26327 CMPQ SI, $0x04 26328 JHS loop_unroll 26329 JMP check_limit 26330 26331 loop: 26332 MOVSS (AX), X1 26333 MULSS X0, X1 26334 ADDSS (DX), X1 26335 MOVSS X1, (DX) 26336 DECQ SI 26337 LEAQ (AX)(CX*4), AX 26338 LEAQ (DX)(BX*4), DX 26339 26340 check_limit: 26341 CMPQ SI, $0x00 26342 JHI loop 26343 RET 26344 26345 // func AmdAxpyPointerLoopX_V3A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26346 // Requires: SSE 26347 TEXT ·AmdAxpyPointerLoopX_V3A10U4(SB), NOSPLIT, $0-48 26348 MOVSS alpha+0(FP), X0 26349 MOVQ xs+8(FP), AX 26350 MOVQ incx+16(FP), CX 26351 MOVQ ys+24(FP), DX 26352 MOVQ incy+32(FP), BX 26353 MOVQ n+40(FP), SI 26354 JMP check_limit_unroll 26355 PCALIGN $0x08 26356 NOP 26357 NOP 26358 26359 loop_unroll: 26360 MOVSS (AX), X1 26361 MULSS X0, X1 26362 ADDSS (DX), X1 26363 MOVSS X1, (DX) 26364 LEAQ (AX)(CX*4), AX 26365 LEAQ (DX)(BX*4), DX 26366 MOVSS (AX), X1 26367 MULSS X0, X1 26368 ADDSS (DX), X1 26369 MOVSS X1, (DX) 26370 LEAQ (AX)(CX*4), AX 26371 LEAQ (DX)(BX*4), DX 26372 MOVSS (AX), X1 26373 MULSS X0, X1 26374 ADDSS (DX), X1 26375 MOVSS X1, (DX) 26376 LEAQ (AX)(CX*4), AX 26377 LEAQ (DX)(BX*4), DX 26378 MOVSS (AX), X1 26379 MULSS X0, X1 26380 ADDSS (DX), X1 26381 MOVSS X1, (DX) 26382 LEAQ (AX)(CX*4), AX 26383 LEAQ (DX)(BX*4), DX 26384 SUBQ $0x04, SI 26385 26386 check_limit_unroll: 26387 CMPQ SI, $0x04 26388 JHS loop_unroll 26389 JMP check_limit 26390 26391 loop: 26392 MOVSS (AX), X1 26393 MULSS X0, X1 26394 ADDSS (DX), X1 26395 MOVSS X1, (DX) 26396 DECQ SI 26397 LEAQ (AX)(CX*4), AX 26398 LEAQ (DX)(BX*4), DX 26399 26400 check_limit: 26401 CMPQ SI, $0x00 26402 JHI loop 26403 RET 26404 26405 // func AmdAxpyPointerLoopX_V4A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26406 // Requires: SSE 26407 TEXT ·AmdAxpyPointerLoopX_V4A10U4(SB), NOSPLIT, $0-48 26408 MOVSS alpha+0(FP), X0 26409 MOVQ xs+8(FP), AX 26410 MOVQ incx+16(FP), CX 26411 MOVQ ys+24(FP), DX 26412 MOVQ incy+32(FP), BX 26413 MOVQ n+40(FP), SI 26414 JMP check_limit_unroll 26415 PCALIGN $0x08 26416 NOP 26417 NOP 26418 26419 loop_unroll: 26420 MOVSS (AX), X1 26421 MULSS X0, X1 26422 ADDSS (DX), X1 26423 MOVSS X1, (DX) 26424 LEAQ (AX)(CX*4), AX 26425 LEAQ (DX)(BX*4), DX 26426 MOVSS (AX), X1 26427 MULSS X0, X1 26428 ADDSS (DX), X1 26429 MOVSS X1, (DX) 26430 LEAQ (AX)(CX*4), AX 26431 LEAQ (DX)(BX*4), DX 26432 MOVSS (AX), X1 26433 MULSS X0, X1 26434 ADDSS (DX), X1 26435 MOVSS X1, (DX) 26436 LEAQ (AX)(CX*4), AX 26437 LEAQ (DX)(BX*4), DX 26438 MOVSS (AX), X1 26439 MULSS X0, X1 26440 ADDSS (DX), X1 26441 MOVSS X1, (DX) 26442 LEAQ (AX)(CX*4), AX 26443 LEAQ (DX)(BX*4), DX 26444 SUBQ $0x04, SI 26445 26446 check_limit_unroll: 26447 CMPQ SI, $0x04 26448 JHS loop_unroll 26449 JMP check_limit 26450 26451 loop: 26452 MOVSS (AX), X1 26453 MULSS X0, X1 26454 ADDSS (DX), X1 26455 MOVSS X1, (DX) 26456 DECQ SI 26457 LEAQ (AX)(CX*4), AX 26458 LEAQ (DX)(BX*4), DX 26459 26460 check_limit: 26461 CMPQ SI, $0x00 26462 JHI loop 26463 RET 26464 26465 // func AmdAxpyPointerLoopX_V5A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 
26466 // Requires: SSE 26467 TEXT ·AmdAxpyPointerLoopX_V5A10U4(SB), NOSPLIT, $0-48 26468 MOVSS alpha+0(FP), X0 26469 MOVQ xs+8(FP), AX 26470 MOVQ incx+16(FP), CX 26471 MOVQ ys+24(FP), DX 26472 MOVQ incy+32(FP), BX 26473 MOVQ n+40(FP), SI 26474 JMP check_limit_unroll 26475 PCALIGN $0x08 26476 NOP 26477 NOP 26478 26479 loop_unroll: 26480 MOVSS (AX), X1 26481 MULSS X0, X1 26482 ADDSS (DX), X1 26483 MOVSS X1, (DX) 26484 LEAQ (AX)(CX*4), AX 26485 LEAQ (DX)(BX*4), DX 26486 MOVSS (AX), X1 26487 MULSS X0, X1 26488 ADDSS (DX), X1 26489 MOVSS X1, (DX) 26490 LEAQ (AX)(CX*4), AX 26491 LEAQ (DX)(BX*4), DX 26492 MOVSS (AX), X1 26493 MULSS X0, X1 26494 ADDSS (DX), X1 26495 MOVSS X1, (DX) 26496 LEAQ (AX)(CX*4), AX 26497 LEAQ (DX)(BX*4), DX 26498 MOVSS (AX), X1 26499 MULSS X0, X1 26500 ADDSS (DX), X1 26501 MOVSS X1, (DX) 26502 LEAQ (AX)(CX*4), AX 26503 LEAQ (DX)(BX*4), DX 26504 SUBQ $0x04, SI 26505 26506 check_limit_unroll: 26507 CMPQ SI, $0x04 26508 JHS loop_unroll 26509 JMP check_limit 26510 26511 loop: 26512 MOVSS (AX), X1 26513 MULSS X0, X1 26514 ADDSS (DX), X1 26515 MOVSS X1, (DX) 26516 DECQ SI 26517 LEAQ (AX)(CX*4), AX 26518 LEAQ (DX)(BX*4), DX 26519 26520 check_limit: 26521 CMPQ SI, $0x00 26522 JHI loop 26523 RET 26524 26525 // func AmdAxpyPointerLoopX_V0A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26526 // Requires: SSE 26527 TEXT ·AmdAxpyPointerLoopX_V0A11U4(SB), NOSPLIT, $0-48 26528 MOVSS alpha+0(FP), X0 26529 MOVQ xs+8(FP), AX 26530 MOVQ incx+16(FP), CX 26531 MOVQ ys+24(FP), DX 26532 MOVQ incy+32(FP), BX 26533 MOVQ n+40(FP), SI 26534 JMP check_limit_unroll 26535 PCALIGN $0x08 26536 NOP 26537 NOP 26538 NOP 26539 26540 loop_unroll: 26541 MOVSS (AX), X1 26542 MULSS X0, X1 26543 ADDSS (DX), X1 26544 MOVSS X1, (DX) 26545 LEAQ (AX)(CX*4), AX 26546 LEAQ (DX)(BX*4), DX 26547 MOVSS (AX), X1 26548 MULSS X0, X1 26549 ADDSS (DX), X1 26550 MOVSS X1, (DX) 26551 LEAQ (AX)(CX*4), AX 26552 LEAQ (DX)(BX*4), DX 26553 MOVSS (AX), X1 26554 MULSS X0, X1 26555 ADDSS (DX), X1 26556 MOVSS X1, (DX) 26557 LEAQ (AX)(CX*4), AX 26558 LEAQ (DX)(BX*4), DX 26559 MOVSS (AX), X1 26560 MULSS X0, X1 26561 ADDSS (DX), X1 26562 MOVSS X1, (DX) 26563 LEAQ (AX)(CX*4), AX 26564 LEAQ (DX)(BX*4), DX 26565 SUBQ $0x04, SI 26566 26567 check_limit_unroll: 26568 CMPQ SI, $0x04 26569 JHS loop_unroll 26570 JMP check_limit 26571 26572 loop: 26573 MOVSS (AX), X1 26574 MULSS X0, X1 26575 ADDSS (DX), X1 26576 MOVSS X1, (DX) 26577 DECQ SI 26578 LEAQ (AX)(CX*4), AX 26579 LEAQ (DX)(BX*4), DX 26580 26581 check_limit: 26582 CMPQ SI, $0x00 26583 JHI loop 26584 RET 26585 26586 // func AmdAxpyPointerLoopX_V1A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26587 // Requires: SSE 26588 TEXT ·AmdAxpyPointerLoopX_V1A11U4(SB), NOSPLIT, $0-48 26589 MOVSS alpha+0(FP), X0 26590 MOVQ xs+8(FP), AX 26591 MOVQ incx+16(FP), CX 26592 MOVQ ys+24(FP), DX 26593 MOVQ incy+32(FP), BX 26594 MOVQ n+40(FP), SI 26595 JMP check_limit_unroll 26596 PCALIGN $0x08 26597 NOP 26598 NOP 26599 NOP 26600 26601 loop_unroll: 26602 MOVSS (AX), X1 26603 MULSS X0, X1 26604 ADDSS (DX), X1 26605 MOVSS X1, (DX) 26606 LEAQ (AX)(CX*4), AX 26607 LEAQ (DX)(BX*4), DX 26608 MOVSS (AX), X1 26609 MULSS X0, X1 26610 ADDSS (DX), X1 26611 MOVSS X1, (DX) 26612 LEAQ (AX)(CX*4), AX 26613 LEAQ (DX)(BX*4), DX 26614 MOVSS (AX), X1 26615 MULSS X0, X1 26616 ADDSS (DX), X1 26617 MOVSS X1, (DX) 26618 LEAQ (AX)(CX*4), AX 26619 LEAQ (DX)(BX*4), DX 26620 MOVSS (AX), X1 26621 MULSS X0, X1 26622 ADDSS (DX), X1 26623 MOVSS X1, (DX) 26624 LEAQ 
(AX)(CX*4), AX 26625 LEAQ (DX)(BX*4), DX 26626 SUBQ $0x04, SI 26627 26628 check_limit_unroll: 26629 CMPQ SI, $0x04 26630 JHS loop_unroll 26631 JMP check_limit 26632 26633 loop: 26634 MOVSS (AX), X1 26635 MULSS X0, X1 26636 ADDSS (DX), X1 26637 MOVSS X1, (DX) 26638 DECQ SI 26639 LEAQ (AX)(CX*4), AX 26640 LEAQ (DX)(BX*4), DX 26641 26642 check_limit: 26643 CMPQ SI, $0x00 26644 JHI loop 26645 RET 26646 26647 // func AmdAxpyPointerLoopX_V2A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26648 // Requires: SSE 26649 TEXT ·AmdAxpyPointerLoopX_V2A11U4(SB), NOSPLIT, $0-48 26650 MOVSS alpha+0(FP), X0 26651 MOVQ xs+8(FP), AX 26652 MOVQ incx+16(FP), CX 26653 MOVQ ys+24(FP), DX 26654 MOVQ incy+32(FP), BX 26655 MOVQ n+40(FP), SI 26656 JMP check_limit_unroll 26657 PCALIGN $0x08 26658 NOP 26659 NOP 26660 NOP 26661 26662 loop_unroll: 26663 MOVSS (AX), X1 26664 MULSS X0, X1 26665 ADDSS (DX), X1 26666 MOVSS X1, (DX) 26667 LEAQ (AX)(CX*4), AX 26668 LEAQ (DX)(BX*4), DX 26669 MOVSS (AX), X1 26670 MULSS X0, X1 26671 ADDSS (DX), X1 26672 MOVSS X1, (DX) 26673 LEAQ (AX)(CX*4), AX 26674 LEAQ (DX)(BX*4), DX 26675 MOVSS (AX), X1 26676 MULSS X0, X1 26677 ADDSS (DX), X1 26678 MOVSS X1, (DX) 26679 LEAQ (AX)(CX*4), AX 26680 LEAQ (DX)(BX*4), DX 26681 MOVSS (AX), X1 26682 MULSS X0, X1 26683 ADDSS (DX), X1 26684 MOVSS X1, (DX) 26685 LEAQ (AX)(CX*4), AX 26686 LEAQ (DX)(BX*4), DX 26687 SUBQ $0x04, SI 26688 26689 check_limit_unroll: 26690 CMPQ SI, $0x04 26691 JHS loop_unroll 26692 JMP check_limit 26693 26694 loop: 26695 MOVSS (AX), X1 26696 MULSS X0, X1 26697 ADDSS (DX), X1 26698 MOVSS X1, (DX) 26699 DECQ SI 26700 LEAQ (AX)(CX*4), AX 26701 LEAQ (DX)(BX*4), DX 26702 26703 check_limit: 26704 CMPQ SI, $0x00 26705 JHI loop 26706 RET 26707 26708 // func AmdAxpyPointerLoopX_V3A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26709 // Requires: SSE 26710 TEXT ·AmdAxpyPointerLoopX_V3A11U4(SB), NOSPLIT, $0-48 26711 MOVSS alpha+0(FP), X0 26712 MOVQ xs+8(FP), AX 26713 MOVQ incx+16(FP), CX 26714 MOVQ ys+24(FP), DX 26715 MOVQ incy+32(FP), BX 26716 MOVQ n+40(FP), SI 26717 JMP check_limit_unroll 26718 PCALIGN $0x08 26719 NOP 26720 NOP 26721 NOP 26722 26723 loop_unroll: 26724 MOVSS (AX), X1 26725 MULSS X0, X1 26726 ADDSS (DX), X1 26727 MOVSS X1, (DX) 26728 LEAQ (AX)(CX*4), AX 26729 LEAQ (DX)(BX*4), DX 26730 MOVSS (AX), X1 26731 MULSS X0, X1 26732 ADDSS (DX), X1 26733 MOVSS X1, (DX) 26734 LEAQ (AX)(CX*4), AX 26735 LEAQ (DX)(BX*4), DX 26736 MOVSS (AX), X1 26737 MULSS X0, X1 26738 ADDSS (DX), X1 26739 MOVSS X1, (DX) 26740 LEAQ (AX)(CX*4), AX 26741 LEAQ (DX)(BX*4), DX 26742 MOVSS (AX), X1 26743 MULSS X0, X1 26744 ADDSS (DX), X1 26745 MOVSS X1, (DX) 26746 LEAQ (AX)(CX*4), AX 26747 LEAQ (DX)(BX*4), DX 26748 SUBQ $0x04, SI 26749 26750 check_limit_unroll: 26751 CMPQ SI, $0x04 26752 JHS loop_unroll 26753 JMP check_limit 26754 26755 loop: 26756 MOVSS (AX), X1 26757 MULSS X0, X1 26758 ADDSS (DX), X1 26759 MOVSS X1, (DX) 26760 DECQ SI 26761 LEAQ (AX)(CX*4), AX 26762 LEAQ (DX)(BX*4), DX 26763 26764 check_limit: 26765 CMPQ SI, $0x00 26766 JHI loop 26767 RET 26768 26769 // func AmdAxpyPointerLoopX_V4A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26770 // Requires: SSE 26771 TEXT ·AmdAxpyPointerLoopX_V4A11U4(SB), NOSPLIT, $0-48 26772 MOVSS alpha+0(FP), X0 26773 MOVQ xs+8(FP), AX 26774 MOVQ incx+16(FP), CX 26775 MOVQ ys+24(FP), DX 26776 MOVQ incy+32(FP), BX 26777 MOVQ n+40(FP), SI 26778 JMP check_limit_unroll 26779 PCALIGN $0x08 26780 NOP 26781 NOP 
26782 NOP 26783 26784 loop_unroll: 26785 MOVSS (AX), X1 26786 MULSS X0, X1 26787 ADDSS (DX), X1 26788 MOVSS X1, (DX) 26789 LEAQ (AX)(CX*4), AX 26790 LEAQ (DX)(BX*4), DX 26791 MOVSS (AX), X1 26792 MULSS X0, X1 26793 ADDSS (DX), X1 26794 MOVSS X1, (DX) 26795 LEAQ (AX)(CX*4), AX 26796 LEAQ (DX)(BX*4), DX 26797 MOVSS (AX), X1 26798 MULSS X0, X1 26799 ADDSS (DX), X1 26800 MOVSS X1, (DX) 26801 LEAQ (AX)(CX*4), AX 26802 LEAQ (DX)(BX*4), DX 26803 MOVSS (AX), X1 26804 MULSS X0, X1 26805 ADDSS (DX), X1 26806 MOVSS X1, (DX) 26807 LEAQ (AX)(CX*4), AX 26808 LEAQ (DX)(BX*4), DX 26809 SUBQ $0x04, SI 26810 26811 check_limit_unroll: 26812 CMPQ SI, $0x04 26813 JHS loop_unroll 26814 JMP check_limit 26815 26816 loop: 26817 MOVSS (AX), X1 26818 MULSS X0, X1 26819 ADDSS (DX), X1 26820 MOVSS X1, (DX) 26821 DECQ SI 26822 LEAQ (AX)(CX*4), AX 26823 LEAQ (DX)(BX*4), DX 26824 26825 check_limit: 26826 CMPQ SI, $0x00 26827 JHI loop 26828 RET 26829 26830 // func AmdAxpyPointerLoopX_V5A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26831 // Requires: SSE 26832 TEXT ·AmdAxpyPointerLoopX_V5A11U4(SB), NOSPLIT, $0-48 26833 MOVSS alpha+0(FP), X0 26834 MOVQ xs+8(FP), AX 26835 MOVQ incx+16(FP), CX 26836 MOVQ ys+24(FP), DX 26837 MOVQ incy+32(FP), BX 26838 MOVQ n+40(FP), SI 26839 JMP check_limit_unroll 26840 PCALIGN $0x08 26841 NOP 26842 NOP 26843 NOP 26844 26845 loop_unroll: 26846 MOVSS (AX), X1 26847 MULSS X0, X1 26848 ADDSS (DX), X1 26849 MOVSS X1, (DX) 26850 LEAQ (AX)(CX*4), AX 26851 LEAQ (DX)(BX*4), DX 26852 MOVSS (AX), X1 26853 MULSS X0, X1 26854 ADDSS (DX), X1 26855 MOVSS X1, (DX) 26856 LEAQ (AX)(CX*4), AX 26857 LEAQ (DX)(BX*4), DX 26858 MOVSS (AX), X1 26859 MULSS X0, X1 26860 ADDSS (DX), X1 26861 MOVSS X1, (DX) 26862 LEAQ (AX)(CX*4), AX 26863 LEAQ (DX)(BX*4), DX 26864 MOVSS (AX), X1 26865 MULSS X0, X1 26866 ADDSS (DX), X1 26867 MOVSS X1, (DX) 26868 LEAQ (AX)(CX*4), AX 26869 LEAQ (DX)(BX*4), DX 26870 SUBQ $0x04, SI 26871 26872 check_limit_unroll: 26873 CMPQ SI, $0x04 26874 JHS loop_unroll 26875 JMP check_limit 26876 26877 loop: 26878 MOVSS (AX), X1 26879 MULSS X0, X1 26880 ADDSS (DX), X1 26881 MOVSS X1, (DX) 26882 DECQ SI 26883 LEAQ (AX)(CX*4), AX 26884 LEAQ (DX)(BX*4), DX 26885 26886 check_limit: 26887 CMPQ SI, $0x00 26888 JHI loop 26889 RET 26890 26891 // func AmdAxpyPointerLoopX_V0A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26892 // Requires: SSE 26893 TEXT ·AmdAxpyPointerLoopX_V0A12U4(SB), NOSPLIT, $0-48 26894 MOVSS alpha+0(FP), X0 26895 MOVQ xs+8(FP), AX 26896 MOVQ incx+16(FP), CX 26897 MOVQ ys+24(FP), DX 26898 MOVQ incy+32(FP), BX 26899 MOVQ n+40(FP), SI 26900 JMP check_limit_unroll 26901 PCALIGN $0x08 26902 NOP 26903 NOP 26904 NOP 26905 NOP 26906 26907 loop_unroll: 26908 MOVSS (AX), X1 26909 MULSS X0, X1 26910 ADDSS (DX), X1 26911 MOVSS X1, (DX) 26912 LEAQ (AX)(CX*4), AX 26913 LEAQ (DX)(BX*4), DX 26914 MOVSS (AX), X1 26915 MULSS X0, X1 26916 ADDSS (DX), X1 26917 MOVSS X1, (DX) 26918 LEAQ (AX)(CX*4), AX 26919 LEAQ (DX)(BX*4), DX 26920 MOVSS (AX), X1 26921 MULSS X0, X1 26922 ADDSS (DX), X1 26923 MOVSS X1, (DX) 26924 LEAQ (AX)(CX*4), AX 26925 LEAQ (DX)(BX*4), DX 26926 MOVSS (AX), X1 26927 MULSS X0, X1 26928 ADDSS (DX), X1 26929 MOVSS X1, (DX) 26930 LEAQ (AX)(CX*4), AX 26931 LEAQ (DX)(BX*4), DX 26932 SUBQ $0x04, SI 26933 26934 check_limit_unroll: 26935 CMPQ SI, $0x04 26936 JHS loop_unroll 26937 JMP check_limit 26938 26939 loop: 26940 MOVSS (AX), X1 26941 MULSS X0, X1 26942 ADDSS (DX), X1 26943 MOVSS X1, (DX) 26944 DECQ SI 26945 LEAQ 
(AX)(CX*4), AX 26946 LEAQ (DX)(BX*4), DX 26947 26948 check_limit: 26949 CMPQ SI, $0x00 26950 JHI loop 26951 RET 26952 26953 // func AmdAxpyPointerLoopX_V1A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 26954 // Requires: SSE 26955 TEXT ·AmdAxpyPointerLoopX_V1A12U4(SB), NOSPLIT, $0-48 26956 MOVSS alpha+0(FP), X0 26957 MOVQ xs+8(FP), AX 26958 MOVQ incx+16(FP), CX 26959 MOVQ ys+24(FP), DX 26960 MOVQ incy+32(FP), BX 26961 MOVQ n+40(FP), SI 26962 JMP check_limit_unroll 26963 PCALIGN $0x08 26964 NOP 26965 NOP 26966 NOP 26967 NOP 26968 26969 loop_unroll: 26970 MOVSS (AX), X1 26971 MULSS X0, X1 26972 ADDSS (DX), X1 26973 MOVSS X1, (DX) 26974 LEAQ (AX)(CX*4), AX 26975 LEAQ (DX)(BX*4), DX 26976 MOVSS (AX), X1 26977 MULSS X0, X1 26978 ADDSS (DX), X1 26979 MOVSS X1, (DX) 26980 LEAQ (AX)(CX*4), AX 26981 LEAQ (DX)(BX*4), DX 26982 MOVSS (AX), X1 26983 MULSS X0, X1 26984 ADDSS (DX), X1 26985 MOVSS X1, (DX) 26986 LEAQ (AX)(CX*4), AX 26987 LEAQ (DX)(BX*4), DX 26988 MOVSS (AX), X1 26989 MULSS X0, X1 26990 ADDSS (DX), X1 26991 MOVSS X1, (DX) 26992 LEAQ (AX)(CX*4), AX 26993 LEAQ (DX)(BX*4), DX 26994 SUBQ $0x04, SI 26995 26996 check_limit_unroll: 26997 CMPQ SI, $0x04 26998 JHS loop_unroll 26999 JMP check_limit 27000 27001 loop: 27002 MOVSS (AX), X1 27003 MULSS X0, X1 27004 ADDSS (DX), X1 27005 MOVSS X1, (DX) 27006 DECQ SI 27007 LEAQ (AX)(CX*4), AX 27008 LEAQ (DX)(BX*4), DX 27009 27010 check_limit: 27011 CMPQ SI, $0x00 27012 JHI loop 27013 RET 27014 27015 // func AmdAxpyPointerLoopX_V2A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27016 // Requires: SSE 27017 TEXT ·AmdAxpyPointerLoopX_V2A12U4(SB), NOSPLIT, $0-48 27018 MOVSS alpha+0(FP), X0 27019 MOVQ xs+8(FP), AX 27020 MOVQ incx+16(FP), CX 27021 MOVQ ys+24(FP), DX 27022 MOVQ incy+32(FP), BX 27023 MOVQ n+40(FP), SI 27024 JMP check_limit_unroll 27025 PCALIGN $0x08 27026 NOP 27027 NOP 27028 NOP 27029 NOP 27030 27031 loop_unroll: 27032 MOVSS (AX), X1 27033 MULSS X0, X1 27034 ADDSS (DX), X1 27035 MOVSS X1, (DX) 27036 LEAQ (AX)(CX*4), AX 27037 LEAQ (DX)(BX*4), DX 27038 MOVSS (AX), X1 27039 MULSS X0, X1 27040 ADDSS (DX), X1 27041 MOVSS X1, (DX) 27042 LEAQ (AX)(CX*4), AX 27043 LEAQ (DX)(BX*4), DX 27044 MOVSS (AX), X1 27045 MULSS X0, X1 27046 ADDSS (DX), X1 27047 MOVSS X1, (DX) 27048 LEAQ (AX)(CX*4), AX 27049 LEAQ (DX)(BX*4), DX 27050 MOVSS (AX), X1 27051 MULSS X0, X1 27052 ADDSS (DX), X1 27053 MOVSS X1, (DX) 27054 LEAQ (AX)(CX*4), AX 27055 LEAQ (DX)(BX*4), DX 27056 SUBQ $0x04, SI 27057 27058 check_limit_unroll: 27059 CMPQ SI, $0x04 27060 JHS loop_unroll 27061 JMP check_limit 27062 27063 loop: 27064 MOVSS (AX), X1 27065 MULSS X0, X1 27066 ADDSS (DX), X1 27067 MOVSS X1, (DX) 27068 DECQ SI 27069 LEAQ (AX)(CX*4), AX 27070 LEAQ (DX)(BX*4), DX 27071 27072 check_limit: 27073 CMPQ SI, $0x00 27074 JHI loop 27075 RET 27076 27077 // func AmdAxpyPointerLoopX_V3A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27078 // Requires: SSE 27079 TEXT ·AmdAxpyPointerLoopX_V3A12U4(SB), NOSPLIT, $0-48 27080 MOVSS alpha+0(FP), X0 27081 MOVQ xs+8(FP), AX 27082 MOVQ incx+16(FP), CX 27083 MOVQ ys+24(FP), DX 27084 MOVQ incy+32(FP), BX 27085 MOVQ n+40(FP), SI 27086 JMP check_limit_unroll 27087 PCALIGN $0x08 27088 NOP 27089 NOP 27090 NOP 27091 NOP 27092 27093 loop_unroll: 27094 MOVSS (AX), X1 27095 MULSS X0, X1 27096 ADDSS (DX), X1 27097 MOVSS X1, (DX) 27098 LEAQ (AX)(CX*4), AX 27099 LEAQ (DX)(BX*4), DX 27100 MOVSS (AX), X1 27101 MULSS X0, X1 27102 ADDSS (DX), X1 27103 MOVSS X1, (DX) 
27104 LEAQ (AX)(CX*4), AX 27105 LEAQ (DX)(BX*4), DX 27106 MOVSS (AX), X1 27107 MULSS X0, X1 27108 ADDSS (DX), X1 27109 MOVSS X1, (DX) 27110 LEAQ (AX)(CX*4), AX 27111 LEAQ (DX)(BX*4), DX 27112 MOVSS (AX), X1 27113 MULSS X0, X1 27114 ADDSS (DX), X1 27115 MOVSS X1, (DX) 27116 LEAQ (AX)(CX*4), AX 27117 LEAQ (DX)(BX*4), DX 27118 SUBQ $0x04, SI 27119 27120 check_limit_unroll: 27121 CMPQ SI, $0x04 27122 JHS loop_unroll 27123 JMP check_limit 27124 27125 loop: 27126 MOVSS (AX), X1 27127 MULSS X0, X1 27128 ADDSS (DX), X1 27129 MOVSS X1, (DX) 27130 DECQ SI 27131 LEAQ (AX)(CX*4), AX 27132 LEAQ (DX)(BX*4), DX 27133 27134 check_limit: 27135 CMPQ SI, $0x00 27136 JHI loop 27137 RET 27138 27139 // func AmdAxpyPointerLoopX_V4A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27140 // Requires: SSE 27141 TEXT ·AmdAxpyPointerLoopX_V4A12U4(SB), NOSPLIT, $0-48 27142 MOVSS alpha+0(FP), X0 27143 MOVQ xs+8(FP), AX 27144 MOVQ incx+16(FP), CX 27145 MOVQ ys+24(FP), DX 27146 MOVQ incy+32(FP), BX 27147 MOVQ n+40(FP), SI 27148 JMP check_limit_unroll 27149 PCALIGN $0x08 27150 NOP 27151 NOP 27152 NOP 27153 NOP 27154 27155 loop_unroll: 27156 MOVSS (AX), X1 27157 MULSS X0, X1 27158 ADDSS (DX), X1 27159 MOVSS X1, (DX) 27160 LEAQ (AX)(CX*4), AX 27161 LEAQ (DX)(BX*4), DX 27162 MOVSS (AX), X1 27163 MULSS X0, X1 27164 ADDSS (DX), X1 27165 MOVSS X1, (DX) 27166 LEAQ (AX)(CX*4), AX 27167 LEAQ (DX)(BX*4), DX 27168 MOVSS (AX), X1 27169 MULSS X0, X1 27170 ADDSS (DX), X1 27171 MOVSS X1, (DX) 27172 LEAQ (AX)(CX*4), AX 27173 LEAQ (DX)(BX*4), DX 27174 MOVSS (AX), X1 27175 MULSS X0, X1 27176 ADDSS (DX), X1 27177 MOVSS X1, (DX) 27178 LEAQ (AX)(CX*4), AX 27179 LEAQ (DX)(BX*4), DX 27180 SUBQ $0x04, SI 27181 27182 check_limit_unroll: 27183 CMPQ SI, $0x04 27184 JHS loop_unroll 27185 JMP check_limit 27186 27187 loop: 27188 MOVSS (AX), X1 27189 MULSS X0, X1 27190 ADDSS (DX), X1 27191 MOVSS X1, (DX) 27192 DECQ SI 27193 LEAQ (AX)(CX*4), AX 27194 LEAQ (DX)(BX*4), DX 27195 27196 check_limit: 27197 CMPQ SI, $0x00 27198 JHI loop 27199 RET 27200 27201 // func AmdAxpyPointerLoopX_V5A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27202 // Requires: SSE 27203 TEXT ·AmdAxpyPointerLoopX_V5A12U4(SB), NOSPLIT, $0-48 27204 MOVSS alpha+0(FP), X0 27205 MOVQ xs+8(FP), AX 27206 MOVQ incx+16(FP), CX 27207 MOVQ ys+24(FP), DX 27208 MOVQ incy+32(FP), BX 27209 MOVQ n+40(FP), SI 27210 JMP check_limit_unroll 27211 PCALIGN $0x08 27212 NOP 27213 NOP 27214 NOP 27215 NOP 27216 27217 loop_unroll: 27218 MOVSS (AX), X1 27219 MULSS X0, X1 27220 ADDSS (DX), X1 27221 MOVSS X1, (DX) 27222 LEAQ (AX)(CX*4), AX 27223 LEAQ (DX)(BX*4), DX 27224 MOVSS (AX), X1 27225 MULSS X0, X1 27226 ADDSS (DX), X1 27227 MOVSS X1, (DX) 27228 LEAQ (AX)(CX*4), AX 27229 LEAQ (DX)(BX*4), DX 27230 MOVSS (AX), X1 27231 MULSS X0, X1 27232 ADDSS (DX), X1 27233 MOVSS X1, (DX) 27234 LEAQ (AX)(CX*4), AX 27235 LEAQ (DX)(BX*4), DX 27236 MOVSS (AX), X1 27237 MULSS X0, X1 27238 ADDSS (DX), X1 27239 MOVSS X1, (DX) 27240 LEAQ (AX)(CX*4), AX 27241 LEAQ (DX)(BX*4), DX 27242 SUBQ $0x04, SI 27243 27244 check_limit_unroll: 27245 CMPQ SI, $0x04 27246 JHS loop_unroll 27247 JMP check_limit 27248 27249 loop: 27250 MOVSS (AX), X1 27251 MULSS X0, X1 27252 ADDSS (DX), X1 27253 MOVSS X1, (DX) 27254 DECQ SI 27255 LEAQ (AX)(CX*4), AX 27256 LEAQ (DX)(BX*4), DX 27257 27258 check_limit: 27259 CMPQ SI, $0x00 27260 JHI loop 27261 RET 27262 27263 // func AmdAxpyPointerLoopX_V0A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27264 // 
Requires: SSE 27265 TEXT ·AmdAxpyPointerLoopX_V0A13U4(SB), NOSPLIT, $0-48 27266 MOVSS alpha+0(FP), X0 27267 MOVQ xs+8(FP), AX 27268 MOVQ incx+16(FP), CX 27269 MOVQ ys+24(FP), DX 27270 MOVQ incy+32(FP), BX 27271 MOVQ n+40(FP), SI 27272 JMP check_limit_unroll 27273 PCALIGN $0x08 27274 NOP 27275 NOP 27276 NOP 27277 NOP 27278 NOP 27279 27280 loop_unroll: 27281 MOVSS (AX), X1 27282 MULSS X0, X1 27283 ADDSS (DX), X1 27284 MOVSS X1, (DX) 27285 LEAQ (AX)(CX*4), AX 27286 LEAQ (DX)(BX*4), DX 27287 MOVSS (AX), X1 27288 MULSS X0, X1 27289 ADDSS (DX), X1 27290 MOVSS X1, (DX) 27291 LEAQ (AX)(CX*4), AX 27292 LEAQ (DX)(BX*4), DX 27293 MOVSS (AX), X1 27294 MULSS X0, X1 27295 ADDSS (DX), X1 27296 MOVSS X1, (DX) 27297 LEAQ (AX)(CX*4), AX 27298 LEAQ (DX)(BX*4), DX 27299 MOVSS (AX), X1 27300 MULSS X0, X1 27301 ADDSS (DX), X1 27302 MOVSS X1, (DX) 27303 LEAQ (AX)(CX*4), AX 27304 LEAQ (DX)(BX*4), DX 27305 SUBQ $0x04, SI 27306 27307 check_limit_unroll: 27308 CMPQ SI, $0x04 27309 JHS loop_unroll 27310 JMP check_limit 27311 27312 loop: 27313 MOVSS (AX), X1 27314 MULSS X0, X1 27315 ADDSS (DX), X1 27316 MOVSS X1, (DX) 27317 DECQ SI 27318 LEAQ (AX)(CX*4), AX 27319 LEAQ (DX)(BX*4), DX 27320 27321 check_limit: 27322 CMPQ SI, $0x00 27323 JHI loop 27324 RET 27325 27326 // func AmdAxpyPointerLoopX_V1A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27327 // Requires: SSE 27328 TEXT ·AmdAxpyPointerLoopX_V1A13U4(SB), NOSPLIT, $0-48 27329 MOVSS alpha+0(FP), X0 27330 MOVQ xs+8(FP), AX 27331 MOVQ incx+16(FP), CX 27332 MOVQ ys+24(FP), DX 27333 MOVQ incy+32(FP), BX 27334 MOVQ n+40(FP), SI 27335 JMP check_limit_unroll 27336 PCALIGN $0x08 27337 NOP 27338 NOP 27339 NOP 27340 NOP 27341 NOP 27342 27343 loop_unroll: 27344 MOVSS (AX), X1 27345 MULSS X0, X1 27346 ADDSS (DX), X1 27347 MOVSS X1, (DX) 27348 LEAQ (AX)(CX*4), AX 27349 LEAQ (DX)(BX*4), DX 27350 MOVSS (AX), X1 27351 MULSS X0, X1 27352 ADDSS (DX), X1 27353 MOVSS X1, (DX) 27354 LEAQ (AX)(CX*4), AX 27355 LEAQ (DX)(BX*4), DX 27356 MOVSS (AX), X1 27357 MULSS X0, X1 27358 ADDSS (DX), X1 27359 MOVSS X1, (DX) 27360 LEAQ (AX)(CX*4), AX 27361 LEAQ (DX)(BX*4), DX 27362 MOVSS (AX), X1 27363 MULSS X0, X1 27364 ADDSS (DX), X1 27365 MOVSS X1, (DX) 27366 LEAQ (AX)(CX*4), AX 27367 LEAQ (DX)(BX*4), DX 27368 SUBQ $0x04, SI 27369 27370 check_limit_unroll: 27371 CMPQ SI, $0x04 27372 JHS loop_unroll 27373 JMP check_limit 27374 27375 loop: 27376 MOVSS (AX), X1 27377 MULSS X0, X1 27378 ADDSS (DX), X1 27379 MOVSS X1, (DX) 27380 DECQ SI 27381 LEAQ (AX)(CX*4), AX 27382 LEAQ (DX)(BX*4), DX 27383 27384 check_limit: 27385 CMPQ SI, $0x00 27386 JHI loop 27387 RET 27388 27389 // func AmdAxpyPointerLoopX_V2A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27390 // Requires: SSE 27391 TEXT ·AmdAxpyPointerLoopX_V2A13U4(SB), NOSPLIT, $0-48 27392 MOVSS alpha+0(FP), X0 27393 MOVQ xs+8(FP), AX 27394 MOVQ incx+16(FP), CX 27395 MOVQ ys+24(FP), DX 27396 MOVQ incy+32(FP), BX 27397 MOVQ n+40(FP), SI 27398 JMP check_limit_unroll 27399 PCALIGN $0x08 27400 NOP 27401 NOP 27402 NOP 27403 NOP 27404 NOP 27405 27406 loop_unroll: 27407 MOVSS (AX), X1 27408 MULSS X0, X1 27409 ADDSS (DX), X1 27410 MOVSS X1, (DX) 27411 LEAQ (AX)(CX*4), AX 27412 LEAQ (DX)(BX*4), DX 27413 MOVSS (AX), X1 27414 MULSS X0, X1 27415 ADDSS (DX), X1 27416 MOVSS X1, (DX) 27417 LEAQ (AX)(CX*4), AX 27418 LEAQ (DX)(BX*4), DX 27419 MOVSS (AX), X1 27420 MULSS X0, X1 27421 ADDSS (DX), X1 27422 MOVSS X1, (DX) 27423 LEAQ (AX)(CX*4), AX 27424 LEAQ (DX)(BX*4), DX 27425 MOVSS (AX), X1 27426 MULSS X0, 
X1 27427 ADDSS (DX), X1 27428 MOVSS X1, (DX) 27429 LEAQ (AX)(CX*4), AX 27430 LEAQ (DX)(BX*4), DX 27431 SUBQ $0x04, SI 27432 27433 check_limit_unroll: 27434 CMPQ SI, $0x04 27435 JHS loop_unroll 27436 JMP check_limit 27437 27438 loop: 27439 MOVSS (AX), X1 27440 MULSS X0, X1 27441 ADDSS (DX), X1 27442 MOVSS X1, (DX) 27443 DECQ SI 27444 LEAQ (AX)(CX*4), AX 27445 LEAQ (DX)(BX*4), DX 27446 27447 check_limit: 27448 CMPQ SI, $0x00 27449 JHI loop 27450 RET 27451 27452 // func AmdAxpyPointerLoopX_V3A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27453 // Requires: SSE 27454 TEXT ·AmdAxpyPointerLoopX_V3A13U4(SB), NOSPLIT, $0-48 27455 MOVSS alpha+0(FP), X0 27456 MOVQ xs+8(FP), AX 27457 MOVQ incx+16(FP), CX 27458 MOVQ ys+24(FP), DX 27459 MOVQ incy+32(FP), BX 27460 MOVQ n+40(FP), SI 27461 JMP check_limit_unroll 27462 PCALIGN $0x08 27463 NOP 27464 NOP 27465 NOP 27466 NOP 27467 NOP 27468 27469 loop_unroll: 27470 MOVSS (AX), X1 27471 MULSS X0, X1 27472 ADDSS (DX), X1 27473 MOVSS X1, (DX) 27474 LEAQ (AX)(CX*4), AX 27475 LEAQ (DX)(BX*4), DX 27476 MOVSS (AX), X1 27477 MULSS X0, X1 27478 ADDSS (DX), X1 27479 MOVSS X1, (DX) 27480 LEAQ (AX)(CX*4), AX 27481 LEAQ (DX)(BX*4), DX 27482 MOVSS (AX), X1 27483 MULSS X0, X1 27484 ADDSS (DX), X1 27485 MOVSS X1, (DX) 27486 LEAQ (AX)(CX*4), AX 27487 LEAQ (DX)(BX*4), DX 27488 MOVSS (AX), X1 27489 MULSS X0, X1 27490 ADDSS (DX), X1 27491 MOVSS X1, (DX) 27492 LEAQ (AX)(CX*4), AX 27493 LEAQ (DX)(BX*4), DX 27494 SUBQ $0x04, SI 27495 27496 check_limit_unroll: 27497 CMPQ SI, $0x04 27498 JHS loop_unroll 27499 JMP check_limit 27500 27501 loop: 27502 MOVSS (AX), X1 27503 MULSS X0, X1 27504 ADDSS (DX), X1 27505 MOVSS X1, (DX) 27506 DECQ SI 27507 LEAQ (AX)(CX*4), AX 27508 LEAQ (DX)(BX*4), DX 27509 27510 check_limit: 27511 CMPQ SI, $0x00 27512 JHI loop 27513 RET 27514 27515 // func AmdAxpyPointerLoopX_V4A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27516 // Requires: SSE 27517 TEXT ·AmdAxpyPointerLoopX_V4A13U4(SB), NOSPLIT, $0-48 27518 MOVSS alpha+0(FP), X0 27519 MOVQ xs+8(FP), AX 27520 MOVQ incx+16(FP), CX 27521 MOVQ ys+24(FP), DX 27522 MOVQ incy+32(FP), BX 27523 MOVQ n+40(FP), SI 27524 JMP check_limit_unroll 27525 PCALIGN $0x08 27526 NOP 27527 NOP 27528 NOP 27529 NOP 27530 NOP 27531 27532 loop_unroll: 27533 MOVSS (AX), X1 27534 MULSS X0, X1 27535 ADDSS (DX), X1 27536 MOVSS X1, (DX) 27537 LEAQ (AX)(CX*4), AX 27538 LEAQ (DX)(BX*4), DX 27539 MOVSS (AX), X1 27540 MULSS X0, X1 27541 ADDSS (DX), X1 27542 MOVSS X1, (DX) 27543 LEAQ (AX)(CX*4), AX 27544 LEAQ (DX)(BX*4), DX 27545 MOVSS (AX), X1 27546 MULSS X0, X1 27547 ADDSS (DX), X1 27548 MOVSS X1, (DX) 27549 LEAQ (AX)(CX*4), AX 27550 LEAQ (DX)(BX*4), DX 27551 MOVSS (AX), X1 27552 MULSS X0, X1 27553 ADDSS (DX), X1 27554 MOVSS X1, (DX) 27555 LEAQ (AX)(CX*4), AX 27556 LEAQ (DX)(BX*4), DX 27557 SUBQ $0x04, SI 27558 27559 check_limit_unroll: 27560 CMPQ SI, $0x04 27561 JHS loop_unroll 27562 JMP check_limit 27563 27564 loop: 27565 MOVSS (AX), X1 27566 MULSS X0, X1 27567 ADDSS (DX), X1 27568 MOVSS X1, (DX) 27569 DECQ SI 27570 LEAQ (AX)(CX*4), AX 27571 LEAQ (DX)(BX*4), DX 27572 27573 check_limit: 27574 CMPQ SI, $0x00 27575 JHI loop 27576 RET 27577 27578 // func AmdAxpyPointerLoopX_V5A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27579 // Requires: SSE 27580 TEXT ·AmdAxpyPointerLoopX_V5A13U4(SB), NOSPLIT, $0-48 27581 MOVSS alpha+0(FP), X0 27582 MOVQ xs+8(FP), AX 27583 MOVQ incx+16(FP), CX 27584 MOVQ ys+24(FP), DX 27585 MOVQ incy+32(FP), 
BX 27586 MOVQ n+40(FP), SI 27587 JMP check_limit_unroll 27588 PCALIGN $0x08 27589 NOP 27590 NOP 27591 NOP 27592 NOP 27593 NOP 27594 27595 loop_unroll: 27596 MOVSS (AX), X1 27597 MULSS X0, X1 27598 ADDSS (DX), X1 27599 MOVSS X1, (DX) 27600 LEAQ (AX)(CX*4), AX 27601 LEAQ (DX)(BX*4), DX 27602 MOVSS (AX), X1 27603 MULSS X0, X1 27604 ADDSS (DX), X1 27605 MOVSS X1, (DX) 27606 LEAQ (AX)(CX*4), AX 27607 LEAQ (DX)(BX*4), DX 27608 MOVSS (AX), X1 27609 MULSS X0, X1 27610 ADDSS (DX), X1 27611 MOVSS X1, (DX) 27612 LEAQ (AX)(CX*4), AX 27613 LEAQ (DX)(BX*4), DX 27614 MOVSS (AX), X1 27615 MULSS X0, X1 27616 ADDSS (DX), X1 27617 MOVSS X1, (DX) 27618 LEAQ (AX)(CX*4), AX 27619 LEAQ (DX)(BX*4), DX 27620 SUBQ $0x04, SI 27621 27622 check_limit_unroll: 27623 CMPQ SI, $0x04 27624 JHS loop_unroll 27625 JMP check_limit 27626 27627 loop: 27628 MOVSS (AX), X1 27629 MULSS X0, X1 27630 ADDSS (DX), X1 27631 MOVSS X1, (DX) 27632 DECQ SI 27633 LEAQ (AX)(CX*4), AX 27634 LEAQ (DX)(BX*4), DX 27635 27636 check_limit: 27637 CMPQ SI, $0x00 27638 JHI loop 27639 RET 27640 27641 // func AmdAxpyPointerLoopX_V0A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27642 // Requires: SSE 27643 TEXT ·AmdAxpyPointerLoopX_V0A14U4(SB), NOSPLIT, $0-48 27644 MOVSS alpha+0(FP), X0 27645 MOVQ xs+8(FP), AX 27646 MOVQ incx+16(FP), CX 27647 MOVQ ys+24(FP), DX 27648 MOVQ incy+32(FP), BX 27649 MOVQ n+40(FP), SI 27650 JMP check_limit_unroll 27651 PCALIGN $0x08 27652 NOP 27653 NOP 27654 NOP 27655 NOP 27656 NOP 27657 NOP 27658 27659 loop_unroll: 27660 MOVSS (AX), X1 27661 MULSS X0, X1 27662 ADDSS (DX), X1 27663 MOVSS X1, (DX) 27664 LEAQ (AX)(CX*4), AX 27665 LEAQ (DX)(BX*4), DX 27666 MOVSS (AX), X1 27667 MULSS X0, X1 27668 ADDSS (DX), X1 27669 MOVSS X1, (DX) 27670 LEAQ (AX)(CX*4), AX 27671 LEAQ (DX)(BX*4), DX 27672 MOVSS (AX), X1 27673 MULSS X0, X1 27674 ADDSS (DX), X1 27675 MOVSS X1, (DX) 27676 LEAQ (AX)(CX*4), AX 27677 LEAQ (DX)(BX*4), DX 27678 MOVSS (AX), X1 27679 MULSS X0, X1 27680 ADDSS (DX), X1 27681 MOVSS X1, (DX) 27682 LEAQ (AX)(CX*4), AX 27683 LEAQ (DX)(BX*4), DX 27684 SUBQ $0x04, SI 27685 27686 check_limit_unroll: 27687 CMPQ SI, $0x04 27688 JHS loop_unroll 27689 JMP check_limit 27690 27691 loop: 27692 MOVSS (AX), X1 27693 MULSS X0, X1 27694 ADDSS (DX), X1 27695 MOVSS X1, (DX) 27696 DECQ SI 27697 LEAQ (AX)(CX*4), AX 27698 LEAQ (DX)(BX*4), DX 27699 27700 check_limit: 27701 CMPQ SI, $0x00 27702 JHI loop 27703 RET 27704 27705 // func AmdAxpyPointerLoopX_V1A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27706 // Requires: SSE 27707 TEXT ·AmdAxpyPointerLoopX_V1A14U4(SB), NOSPLIT, $0-48 27708 MOVSS alpha+0(FP), X0 27709 MOVQ xs+8(FP), AX 27710 MOVQ incx+16(FP), CX 27711 MOVQ ys+24(FP), DX 27712 MOVQ incy+32(FP), BX 27713 MOVQ n+40(FP), SI 27714 JMP check_limit_unroll 27715 PCALIGN $0x08 27716 NOP 27717 NOP 27718 NOP 27719 NOP 27720 NOP 27721 NOP 27722 27723 loop_unroll: 27724 MOVSS (AX), X1 27725 MULSS X0, X1 27726 ADDSS (DX), X1 27727 MOVSS X1, (DX) 27728 LEAQ (AX)(CX*4), AX 27729 LEAQ (DX)(BX*4), DX 27730 MOVSS (AX), X1 27731 MULSS X0, X1 27732 ADDSS (DX), X1 27733 MOVSS X1, (DX) 27734 LEAQ (AX)(CX*4), AX 27735 LEAQ (DX)(BX*4), DX 27736 MOVSS (AX), X1 27737 MULSS X0, X1 27738 ADDSS (DX), X1 27739 MOVSS X1, (DX) 27740 LEAQ (AX)(CX*4), AX 27741 LEAQ (DX)(BX*4), DX 27742 MOVSS (AX), X1 27743 MULSS X0, X1 27744 ADDSS (DX), X1 27745 MOVSS X1, (DX) 27746 LEAQ (AX)(CX*4), AX 27747 LEAQ (DX)(BX*4), DX 27748 SUBQ $0x04, SI 27749 27750 check_limit_unroll: 27751 CMPQ SI, $0x04 27752 JHS 
loop_unroll 27753 JMP check_limit 27754 27755 loop: 27756 MOVSS (AX), X1 27757 MULSS X0, X1 27758 ADDSS (DX), X1 27759 MOVSS X1, (DX) 27760 DECQ SI 27761 LEAQ (AX)(CX*4), AX 27762 LEAQ (DX)(BX*4), DX 27763 27764 check_limit: 27765 CMPQ SI, $0x00 27766 JHI loop 27767 RET 27768 27769 // func AmdAxpyPointerLoopX_V2A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27770 // Requires: SSE 27771 TEXT ·AmdAxpyPointerLoopX_V2A14U4(SB), NOSPLIT, $0-48 27772 MOVSS alpha+0(FP), X0 27773 MOVQ xs+8(FP), AX 27774 MOVQ incx+16(FP), CX 27775 MOVQ ys+24(FP), DX 27776 MOVQ incy+32(FP), BX 27777 MOVQ n+40(FP), SI 27778 JMP check_limit_unroll 27779 PCALIGN $0x08 27780 NOP 27781 NOP 27782 NOP 27783 NOP 27784 NOP 27785 NOP 27786 27787 loop_unroll: 27788 MOVSS (AX), X1 27789 MULSS X0, X1 27790 ADDSS (DX), X1 27791 MOVSS X1, (DX) 27792 LEAQ (AX)(CX*4), AX 27793 LEAQ (DX)(BX*4), DX 27794 MOVSS (AX), X1 27795 MULSS X0, X1 27796 ADDSS (DX), X1 27797 MOVSS X1, (DX) 27798 LEAQ (AX)(CX*4), AX 27799 LEAQ (DX)(BX*4), DX 27800 MOVSS (AX), X1 27801 MULSS X0, X1 27802 ADDSS (DX), X1 27803 MOVSS X1, (DX) 27804 LEAQ (AX)(CX*4), AX 27805 LEAQ (DX)(BX*4), DX 27806 MOVSS (AX), X1 27807 MULSS X0, X1 27808 ADDSS (DX), X1 27809 MOVSS X1, (DX) 27810 LEAQ (AX)(CX*4), AX 27811 LEAQ (DX)(BX*4), DX 27812 SUBQ $0x04, SI 27813 27814 check_limit_unroll: 27815 CMPQ SI, $0x04 27816 JHS loop_unroll 27817 JMP check_limit 27818 27819 loop: 27820 MOVSS (AX), X1 27821 MULSS X0, X1 27822 ADDSS (DX), X1 27823 MOVSS X1, (DX) 27824 DECQ SI 27825 LEAQ (AX)(CX*4), AX 27826 LEAQ (DX)(BX*4), DX 27827 27828 check_limit: 27829 CMPQ SI, $0x00 27830 JHI loop 27831 RET 27832 27833 // func AmdAxpyPointerLoopX_V3A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27834 // Requires: SSE 27835 TEXT ·AmdAxpyPointerLoopX_V3A14U4(SB), NOSPLIT, $0-48 27836 MOVSS alpha+0(FP), X0 27837 MOVQ xs+8(FP), AX 27838 MOVQ incx+16(FP), CX 27839 MOVQ ys+24(FP), DX 27840 MOVQ incy+32(FP), BX 27841 MOVQ n+40(FP), SI 27842 JMP check_limit_unroll 27843 PCALIGN $0x08 27844 NOP 27845 NOP 27846 NOP 27847 NOP 27848 NOP 27849 NOP 27850 27851 loop_unroll: 27852 MOVSS (AX), X1 27853 MULSS X0, X1 27854 ADDSS (DX), X1 27855 MOVSS X1, (DX) 27856 LEAQ (AX)(CX*4), AX 27857 LEAQ (DX)(BX*4), DX 27858 MOVSS (AX), X1 27859 MULSS X0, X1 27860 ADDSS (DX), X1 27861 MOVSS X1, (DX) 27862 LEAQ (AX)(CX*4), AX 27863 LEAQ (DX)(BX*4), DX 27864 MOVSS (AX), X1 27865 MULSS X0, X1 27866 ADDSS (DX), X1 27867 MOVSS X1, (DX) 27868 LEAQ (AX)(CX*4), AX 27869 LEAQ (DX)(BX*4), DX 27870 MOVSS (AX), X1 27871 MULSS X0, X1 27872 ADDSS (DX), X1 27873 MOVSS X1, (DX) 27874 LEAQ (AX)(CX*4), AX 27875 LEAQ (DX)(BX*4), DX 27876 SUBQ $0x04, SI 27877 27878 check_limit_unroll: 27879 CMPQ SI, $0x04 27880 JHS loop_unroll 27881 JMP check_limit 27882 27883 loop: 27884 MOVSS (AX), X1 27885 MULSS X0, X1 27886 ADDSS (DX), X1 27887 MOVSS X1, (DX) 27888 DECQ SI 27889 LEAQ (AX)(CX*4), AX 27890 LEAQ (DX)(BX*4), DX 27891 27892 check_limit: 27893 CMPQ SI, $0x00 27894 JHI loop 27895 RET 27896 27897 // func AmdAxpyPointerLoopX_V4A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27898 // Requires: SSE 27899 TEXT ·AmdAxpyPointerLoopX_V4A14U4(SB), NOSPLIT, $0-48 27900 MOVSS alpha+0(FP), X0 27901 MOVQ xs+8(FP), AX 27902 MOVQ incx+16(FP), CX 27903 MOVQ ys+24(FP), DX 27904 MOVQ incy+32(FP), BX 27905 MOVQ n+40(FP), SI 27906 JMP check_limit_unroll 27907 PCALIGN $0x08 27908 NOP 27909 NOP 27910 NOP 27911 NOP 27912 NOP 27913 NOP 27914 27915 loop_unroll: 
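	// The unrolled body below repeats the same scalar AXPY step four times: load x[j],
	// multiply by alpha, add y[j], store the result back into y[j], then advance both
	// pointers by their element strides (incx and incy, scaled by 4 bytes per float32).
	// Ignoring the unrolling, a minimal Go sketch (using package unsafe) of what the
	// kernel computes is:
	//
	//	for ; n > 0; n-- {
	//		*ys = alpha * *xs + *ys
	//		xs = (*float32)(unsafe.Add(unsafe.Pointer(xs), incx*4))
	//		ys = (*float32)(unsafe.Add(unsafe.Pointer(ys), incy*4))
	//	}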
27916 MOVSS (AX), X1 27917 MULSS X0, X1 27918 ADDSS (DX), X1 27919 MOVSS X1, (DX) 27920 LEAQ (AX)(CX*4), AX 27921 LEAQ (DX)(BX*4), DX 27922 MOVSS (AX), X1 27923 MULSS X0, X1 27924 ADDSS (DX), X1 27925 MOVSS X1, (DX) 27926 LEAQ (AX)(CX*4), AX 27927 LEAQ (DX)(BX*4), DX 27928 MOVSS (AX), X1 27929 MULSS X0, X1 27930 ADDSS (DX), X1 27931 MOVSS X1, (DX) 27932 LEAQ (AX)(CX*4), AX 27933 LEAQ (DX)(BX*4), DX 27934 MOVSS (AX), X1 27935 MULSS X0, X1 27936 ADDSS (DX), X1 27937 MOVSS X1, (DX) 27938 LEAQ (AX)(CX*4), AX 27939 LEAQ (DX)(BX*4), DX 27940 SUBQ $0x04, SI 27941 27942 check_limit_unroll: 27943 CMPQ SI, $0x04 27944 JHS loop_unroll 27945 JMP check_limit 27946 27947 loop: 27948 MOVSS (AX), X1 27949 MULSS X0, X1 27950 ADDSS (DX), X1 27951 MOVSS X1, (DX) 27952 DECQ SI 27953 LEAQ (AX)(CX*4), AX 27954 LEAQ (DX)(BX*4), DX 27955 27956 check_limit: 27957 CMPQ SI, $0x00 27958 JHI loop 27959 RET 27960 27961 // func AmdAxpyPointerLoopX_V5A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 27962 // Requires: SSE 27963 TEXT ·AmdAxpyPointerLoopX_V5A14U4(SB), NOSPLIT, $0-48 27964 MOVSS alpha+0(FP), X0 27965 MOVQ xs+8(FP), AX 27966 MOVQ incx+16(FP), CX 27967 MOVQ ys+24(FP), DX 27968 MOVQ incy+32(FP), BX 27969 MOVQ n+40(FP), SI 27970 JMP check_limit_unroll 27971 PCALIGN $0x08 27972 NOP 27973 NOP 27974 NOP 27975 NOP 27976 NOP 27977 NOP 27978 27979 loop_unroll: 27980 MOVSS (AX), X1 27981 MULSS X0, X1 27982 ADDSS (DX), X1 27983 MOVSS X1, (DX) 27984 LEAQ (AX)(CX*4), AX 27985 LEAQ (DX)(BX*4), DX 27986 MOVSS (AX), X1 27987 MULSS X0, X1 27988 ADDSS (DX), X1 27989 MOVSS X1, (DX) 27990 LEAQ (AX)(CX*4), AX 27991 LEAQ (DX)(BX*4), DX 27992 MOVSS (AX), X1 27993 MULSS X0, X1 27994 ADDSS (DX), X1 27995 MOVSS X1, (DX) 27996 LEAQ (AX)(CX*4), AX 27997 LEAQ (DX)(BX*4), DX 27998 MOVSS (AX), X1 27999 MULSS X0, X1 28000 ADDSS (DX), X1 28001 MOVSS X1, (DX) 28002 LEAQ (AX)(CX*4), AX 28003 LEAQ (DX)(BX*4), DX 28004 SUBQ $0x04, SI 28005 28006 check_limit_unroll: 28007 CMPQ SI, $0x04 28008 JHS loop_unroll 28009 JMP check_limit 28010 28011 loop: 28012 MOVSS (AX), X1 28013 MULSS X0, X1 28014 ADDSS (DX), X1 28015 MOVSS X1, (DX) 28016 DECQ SI 28017 LEAQ (AX)(CX*4), AX 28018 LEAQ (DX)(BX*4), DX 28019 28020 check_limit: 28021 CMPQ SI, $0x00 28022 JHI loop 28023 RET 28024 28025 // func AmdAxpyPointerLoopX_V0A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28026 // Requires: SSE 28027 TEXT ·AmdAxpyPointerLoopX_V0A15U4(SB), NOSPLIT, $0-48 28028 MOVSS alpha+0(FP), X0 28029 MOVQ xs+8(FP), AX 28030 MOVQ incx+16(FP), CX 28031 MOVQ ys+24(FP), DX 28032 MOVQ incy+32(FP), BX 28033 MOVQ n+40(FP), SI 28034 JMP check_limit_unroll 28035 PCALIGN $0x08 28036 NOP 28037 NOP 28038 NOP 28039 NOP 28040 NOP 28041 NOP 28042 NOP 28043 28044 loop_unroll: 28045 MOVSS (AX), X1 28046 MULSS X0, X1 28047 ADDSS (DX), X1 28048 MOVSS X1, (DX) 28049 LEAQ (AX)(CX*4), AX 28050 LEAQ (DX)(BX*4), DX 28051 MOVSS (AX), X1 28052 MULSS X0, X1 28053 ADDSS (DX), X1 28054 MOVSS X1, (DX) 28055 LEAQ (AX)(CX*4), AX 28056 LEAQ (DX)(BX*4), DX 28057 MOVSS (AX), X1 28058 MULSS X0, X1 28059 ADDSS (DX), X1 28060 MOVSS X1, (DX) 28061 LEAQ (AX)(CX*4), AX 28062 LEAQ (DX)(BX*4), DX 28063 MOVSS (AX), X1 28064 MULSS X0, X1 28065 ADDSS (DX), X1 28066 MOVSS X1, (DX) 28067 LEAQ (AX)(CX*4), AX 28068 LEAQ (DX)(BX*4), DX 28069 SUBQ $0x04, SI 28070 28071 check_limit_unroll: 28072 CMPQ SI, $0x04 28073 JHS loop_unroll 28074 JMP check_limit 28075 28076 loop: 28077 MOVSS (AX), X1 28078 MULSS X0, X1 28079 ADDSS (DX), X1 28080 MOVSS X1, (DX) 28081 
DECQ SI 28082 LEAQ (AX)(CX*4), AX 28083 LEAQ (DX)(BX*4), DX 28084 28085 check_limit: 28086 CMPQ SI, $0x00 28087 JHI loop 28088 RET 28089 28090 // func AmdAxpyPointerLoopX_V1A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28091 // Requires: SSE 28092 TEXT ·AmdAxpyPointerLoopX_V1A15U4(SB), NOSPLIT, $0-48 28093 MOVSS alpha+0(FP), X0 28094 MOVQ xs+8(FP), AX 28095 MOVQ incx+16(FP), CX 28096 MOVQ ys+24(FP), DX 28097 MOVQ incy+32(FP), BX 28098 MOVQ n+40(FP), SI 28099 JMP check_limit_unroll 28100 PCALIGN $0x08 28101 NOP 28102 NOP 28103 NOP 28104 NOP 28105 NOP 28106 NOP 28107 NOP 28108 28109 loop_unroll: 28110 MOVSS (AX), X1 28111 MULSS X0, X1 28112 ADDSS (DX), X1 28113 MOVSS X1, (DX) 28114 LEAQ (AX)(CX*4), AX 28115 LEAQ (DX)(BX*4), DX 28116 MOVSS (AX), X1 28117 MULSS X0, X1 28118 ADDSS (DX), X1 28119 MOVSS X1, (DX) 28120 LEAQ (AX)(CX*4), AX 28121 LEAQ (DX)(BX*4), DX 28122 MOVSS (AX), X1 28123 MULSS X0, X1 28124 ADDSS (DX), X1 28125 MOVSS X1, (DX) 28126 LEAQ (AX)(CX*4), AX 28127 LEAQ (DX)(BX*4), DX 28128 MOVSS (AX), X1 28129 MULSS X0, X1 28130 ADDSS (DX), X1 28131 MOVSS X1, (DX) 28132 LEAQ (AX)(CX*4), AX 28133 LEAQ (DX)(BX*4), DX 28134 SUBQ $0x04, SI 28135 28136 check_limit_unroll: 28137 CMPQ SI, $0x04 28138 JHS loop_unroll 28139 JMP check_limit 28140 28141 loop: 28142 MOVSS (AX), X1 28143 MULSS X0, X1 28144 ADDSS (DX), X1 28145 MOVSS X1, (DX) 28146 DECQ SI 28147 LEAQ (AX)(CX*4), AX 28148 LEAQ (DX)(BX*4), DX 28149 28150 check_limit: 28151 CMPQ SI, $0x00 28152 JHI loop 28153 RET 28154 28155 // func AmdAxpyPointerLoopX_V2A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28156 // Requires: SSE 28157 TEXT ·AmdAxpyPointerLoopX_V2A15U4(SB), NOSPLIT, $0-48 28158 MOVSS alpha+0(FP), X0 28159 MOVQ xs+8(FP), AX 28160 MOVQ incx+16(FP), CX 28161 MOVQ ys+24(FP), DX 28162 MOVQ incy+32(FP), BX 28163 MOVQ n+40(FP), SI 28164 JMP check_limit_unroll 28165 PCALIGN $0x08 28166 NOP 28167 NOP 28168 NOP 28169 NOP 28170 NOP 28171 NOP 28172 NOP 28173 28174 loop_unroll: 28175 MOVSS (AX), X1 28176 MULSS X0, X1 28177 ADDSS (DX), X1 28178 MOVSS X1, (DX) 28179 LEAQ (AX)(CX*4), AX 28180 LEAQ (DX)(BX*4), DX 28181 MOVSS (AX), X1 28182 MULSS X0, X1 28183 ADDSS (DX), X1 28184 MOVSS X1, (DX) 28185 LEAQ (AX)(CX*4), AX 28186 LEAQ (DX)(BX*4), DX 28187 MOVSS (AX), X1 28188 MULSS X0, X1 28189 ADDSS (DX), X1 28190 MOVSS X1, (DX) 28191 LEAQ (AX)(CX*4), AX 28192 LEAQ (DX)(BX*4), DX 28193 MOVSS (AX), X1 28194 MULSS X0, X1 28195 ADDSS (DX), X1 28196 MOVSS X1, (DX) 28197 LEAQ (AX)(CX*4), AX 28198 LEAQ (DX)(BX*4), DX 28199 SUBQ $0x04, SI 28200 28201 check_limit_unroll: 28202 CMPQ SI, $0x04 28203 JHS loop_unroll 28204 JMP check_limit 28205 28206 loop: 28207 MOVSS (AX), X1 28208 MULSS X0, X1 28209 ADDSS (DX), X1 28210 MOVSS X1, (DX) 28211 DECQ SI 28212 LEAQ (AX)(CX*4), AX 28213 LEAQ (DX)(BX*4), DX 28214 28215 check_limit: 28216 CMPQ SI, $0x00 28217 JHI loop 28218 RET 28219 28220 // func AmdAxpyPointerLoopX_V3A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28221 // Requires: SSE 28222 TEXT ·AmdAxpyPointerLoopX_V3A15U4(SB), NOSPLIT, $0-48 28223 MOVSS alpha+0(FP), X0 28224 MOVQ xs+8(FP), AX 28225 MOVQ incx+16(FP), CX 28226 MOVQ ys+24(FP), DX 28227 MOVQ incy+32(FP), BX 28228 MOVQ n+40(FP), SI 28229 JMP check_limit_unroll 28230 PCALIGN $0x08 28231 NOP 28232 NOP 28233 NOP 28234 NOP 28235 NOP 28236 NOP 28237 NOP 28238 28239 loop_unroll: 28240 MOVSS (AX), X1 28241 MULSS X0, X1 28242 ADDSS (DX), X1 28243 MOVSS X1, (DX) 28244 LEAQ (AX)(CX*4), AX 
28245 LEAQ (DX)(BX*4), DX 28246 MOVSS (AX), X1 28247 MULSS X0, X1 28248 ADDSS (DX), X1 28249 MOVSS X1, (DX) 28250 LEAQ (AX)(CX*4), AX 28251 LEAQ (DX)(BX*4), DX 28252 MOVSS (AX), X1 28253 MULSS X0, X1 28254 ADDSS (DX), X1 28255 MOVSS X1, (DX) 28256 LEAQ (AX)(CX*4), AX 28257 LEAQ (DX)(BX*4), DX 28258 MOVSS (AX), X1 28259 MULSS X0, X1 28260 ADDSS (DX), X1 28261 MOVSS X1, (DX) 28262 LEAQ (AX)(CX*4), AX 28263 LEAQ (DX)(BX*4), DX 28264 SUBQ $0x04, SI 28265 28266 check_limit_unroll: 28267 CMPQ SI, $0x04 28268 JHS loop_unroll 28269 JMP check_limit 28270 28271 loop: 28272 MOVSS (AX), X1 28273 MULSS X0, X1 28274 ADDSS (DX), X1 28275 MOVSS X1, (DX) 28276 DECQ SI 28277 LEAQ (AX)(CX*4), AX 28278 LEAQ (DX)(BX*4), DX 28279 28280 check_limit: 28281 CMPQ SI, $0x00 28282 JHI loop 28283 RET 28284 28285 // func AmdAxpyPointerLoopX_V4A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28286 // Requires: SSE 28287 TEXT ·AmdAxpyPointerLoopX_V4A15U4(SB), NOSPLIT, $0-48 28288 MOVSS alpha+0(FP), X0 28289 MOVQ xs+8(FP), AX 28290 MOVQ incx+16(FP), CX 28291 MOVQ ys+24(FP), DX 28292 MOVQ incy+32(FP), BX 28293 MOVQ n+40(FP), SI 28294 JMP check_limit_unroll 28295 PCALIGN $0x08 28296 NOP 28297 NOP 28298 NOP 28299 NOP 28300 NOP 28301 NOP 28302 NOP 28303 28304 loop_unroll: 28305 MOVSS (AX), X1 28306 MULSS X0, X1 28307 ADDSS (DX), X1 28308 MOVSS X1, (DX) 28309 LEAQ (AX)(CX*4), AX 28310 LEAQ (DX)(BX*4), DX 28311 MOVSS (AX), X1 28312 MULSS X0, X1 28313 ADDSS (DX), X1 28314 MOVSS X1, (DX) 28315 LEAQ (AX)(CX*4), AX 28316 LEAQ (DX)(BX*4), DX 28317 MOVSS (AX), X1 28318 MULSS X0, X1 28319 ADDSS (DX), X1 28320 MOVSS X1, (DX) 28321 LEAQ (AX)(CX*4), AX 28322 LEAQ (DX)(BX*4), DX 28323 MOVSS (AX), X1 28324 MULSS X0, X1 28325 ADDSS (DX), X1 28326 MOVSS X1, (DX) 28327 LEAQ (AX)(CX*4), AX 28328 LEAQ (DX)(BX*4), DX 28329 SUBQ $0x04, SI 28330 28331 check_limit_unroll: 28332 CMPQ SI, $0x04 28333 JHS loop_unroll 28334 JMP check_limit 28335 28336 loop: 28337 MOVSS (AX), X1 28338 MULSS X0, X1 28339 ADDSS (DX), X1 28340 MOVSS X1, (DX) 28341 DECQ SI 28342 LEAQ (AX)(CX*4), AX 28343 LEAQ (DX)(BX*4), DX 28344 28345 check_limit: 28346 CMPQ SI, $0x00 28347 JHI loop 28348 RET 28349 28350 // func AmdAxpyPointerLoopX_V5A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28351 // Requires: SSE 28352 TEXT ·AmdAxpyPointerLoopX_V5A15U4(SB), NOSPLIT, $0-48 28353 MOVSS alpha+0(FP), X0 28354 MOVQ xs+8(FP), AX 28355 MOVQ incx+16(FP), CX 28356 MOVQ ys+24(FP), DX 28357 MOVQ incy+32(FP), BX 28358 MOVQ n+40(FP), SI 28359 JMP check_limit_unroll 28360 PCALIGN $0x08 28361 NOP 28362 NOP 28363 NOP 28364 NOP 28365 NOP 28366 NOP 28367 NOP 28368 28369 loop_unroll: 28370 MOVSS (AX), X1 28371 MULSS X0, X1 28372 ADDSS (DX), X1 28373 MOVSS X1, (DX) 28374 LEAQ (AX)(CX*4), AX 28375 LEAQ (DX)(BX*4), DX 28376 MOVSS (AX), X1 28377 MULSS X0, X1 28378 ADDSS (DX), X1 28379 MOVSS X1, (DX) 28380 LEAQ (AX)(CX*4), AX 28381 LEAQ (DX)(BX*4), DX 28382 MOVSS (AX), X1 28383 MULSS X0, X1 28384 ADDSS (DX), X1 28385 MOVSS X1, (DX) 28386 LEAQ (AX)(CX*4), AX 28387 LEAQ (DX)(BX*4), DX 28388 MOVSS (AX), X1 28389 MULSS X0, X1 28390 ADDSS (DX), X1 28391 MOVSS X1, (DX) 28392 LEAQ (AX)(CX*4), AX 28393 LEAQ (DX)(BX*4), DX 28394 SUBQ $0x04, SI 28395 28396 check_limit_unroll: 28397 CMPQ SI, $0x04 28398 JHS loop_unroll 28399 JMP check_limit 28400 28401 loop: 28402 MOVSS (AX), X1 28403 MULSS X0, X1 28404 ADDSS (DX), X1 28405 MOVSS X1, (DX) 28406 DECQ SI 28407 LEAQ (AX)(CX*4), AX 28408 LEAQ (DX)(BX*4), DX 28409 28410 check_limit: 28411 CMPQ SI, 
$0x00 28412 JHI loop 28413 RET 28414 28415 // func AmdAxpyPointerLoopX_V0A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28416 // Requires: SSE 28417 TEXT ·AmdAxpyPointerLoopX_V0A16U4(SB), NOSPLIT, $0-48 28418 MOVSS alpha+0(FP), X0 28419 MOVQ xs+8(FP), AX 28420 MOVQ incx+16(FP), CX 28421 MOVQ ys+24(FP), DX 28422 MOVQ incy+32(FP), BX 28423 MOVQ n+40(FP), SI 28424 JMP check_limit_unroll 28425 PCALIGN $0x10 28426 28427 loop_unroll: 28428 MOVSS (AX), X1 28429 MULSS X0, X1 28430 ADDSS (DX), X1 28431 MOVSS X1, (DX) 28432 LEAQ (AX)(CX*4), AX 28433 LEAQ (DX)(BX*4), DX 28434 MOVSS (AX), X1 28435 MULSS X0, X1 28436 ADDSS (DX), X1 28437 MOVSS X1, (DX) 28438 LEAQ (AX)(CX*4), AX 28439 LEAQ (DX)(BX*4), DX 28440 MOVSS (AX), X1 28441 MULSS X0, X1 28442 ADDSS (DX), X1 28443 MOVSS X1, (DX) 28444 LEAQ (AX)(CX*4), AX 28445 LEAQ (DX)(BX*4), DX 28446 MOVSS (AX), X1 28447 MULSS X0, X1 28448 ADDSS (DX), X1 28449 MOVSS X1, (DX) 28450 LEAQ (AX)(CX*4), AX 28451 LEAQ (DX)(BX*4), DX 28452 SUBQ $0x04, SI 28453 28454 check_limit_unroll: 28455 CMPQ SI, $0x04 28456 JHS loop_unroll 28457 JMP check_limit 28458 28459 loop: 28460 MOVSS (AX), X1 28461 MULSS X0, X1 28462 ADDSS (DX), X1 28463 MOVSS X1, (DX) 28464 DECQ SI 28465 LEAQ (AX)(CX*4), AX 28466 LEAQ (DX)(BX*4), DX 28467 28468 check_limit: 28469 CMPQ SI, $0x00 28470 JHI loop 28471 RET 28472 28473 // func AmdAxpyPointerLoopX_V1A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28474 // Requires: SSE 28475 TEXT ·AmdAxpyPointerLoopX_V1A16U4(SB), NOSPLIT, $0-48 28476 MOVSS alpha+0(FP), X0 28477 MOVQ xs+8(FP), AX 28478 MOVQ incx+16(FP), CX 28479 MOVQ ys+24(FP), DX 28480 MOVQ incy+32(FP), BX 28481 MOVQ n+40(FP), SI 28482 JMP check_limit_unroll 28483 PCALIGN $0x10 28484 28485 loop_unroll: 28486 MOVSS (AX), X1 28487 MULSS X0, X1 28488 ADDSS (DX), X1 28489 MOVSS X1, (DX) 28490 LEAQ (AX)(CX*4), AX 28491 LEAQ (DX)(BX*4), DX 28492 MOVSS (AX), X1 28493 MULSS X0, X1 28494 ADDSS (DX), X1 28495 MOVSS X1, (DX) 28496 LEAQ (AX)(CX*4), AX 28497 LEAQ (DX)(BX*4), DX 28498 MOVSS (AX), X1 28499 MULSS X0, X1 28500 ADDSS (DX), X1 28501 MOVSS X1, (DX) 28502 LEAQ (AX)(CX*4), AX 28503 LEAQ (DX)(BX*4), DX 28504 MOVSS (AX), X1 28505 MULSS X0, X1 28506 ADDSS (DX), X1 28507 MOVSS X1, (DX) 28508 LEAQ (AX)(CX*4), AX 28509 LEAQ (DX)(BX*4), DX 28510 SUBQ $0x04, SI 28511 28512 check_limit_unroll: 28513 CMPQ SI, $0x04 28514 JHS loop_unroll 28515 JMP check_limit 28516 28517 loop: 28518 MOVSS (AX), X1 28519 MULSS X0, X1 28520 ADDSS (DX), X1 28521 MOVSS X1, (DX) 28522 DECQ SI 28523 LEAQ (AX)(CX*4), AX 28524 LEAQ (DX)(BX*4), DX 28525 28526 check_limit: 28527 CMPQ SI, $0x00 28528 JHI loop 28529 RET 28530 28531 // func AmdAxpyPointerLoopX_V2A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28532 // Requires: SSE 28533 TEXT ·AmdAxpyPointerLoopX_V2A16U4(SB), NOSPLIT, $0-48 28534 MOVSS alpha+0(FP), X0 28535 MOVQ xs+8(FP), AX 28536 MOVQ incx+16(FP), CX 28537 MOVQ ys+24(FP), DX 28538 MOVQ incy+32(FP), BX 28539 MOVQ n+40(FP), SI 28540 JMP check_limit_unroll 28541 PCALIGN $0x10 28542 28543 loop_unroll: 28544 MOVSS (AX), X1 28545 MULSS X0, X1 28546 ADDSS (DX), X1 28547 MOVSS X1, (DX) 28548 LEAQ (AX)(CX*4), AX 28549 LEAQ (DX)(BX*4), DX 28550 MOVSS (AX), X1 28551 MULSS X0, X1 28552 ADDSS (DX), X1 28553 MOVSS X1, (DX) 28554 LEAQ (AX)(CX*4), AX 28555 LEAQ (DX)(BX*4), DX 28556 MOVSS (AX), X1 28557 MULSS X0, X1 28558 ADDSS (DX), X1 28559 MOVSS X1, (DX) 28560 LEAQ (AX)(CX*4), AX 28561 LEAQ (DX)(BX*4), DX 28562 MOVSS (AX), 
X1 28563 MULSS X0, X1 28564 ADDSS (DX), X1 28565 MOVSS X1, (DX) 28566 LEAQ (AX)(CX*4), AX 28567 LEAQ (DX)(BX*4), DX 28568 SUBQ $0x04, SI 28569 28570 check_limit_unroll: 28571 CMPQ SI, $0x04 28572 JHS loop_unroll 28573 JMP check_limit 28574 28575 loop: 28576 MOVSS (AX), X1 28577 MULSS X0, X1 28578 ADDSS (DX), X1 28579 MOVSS X1, (DX) 28580 DECQ SI 28581 LEAQ (AX)(CX*4), AX 28582 LEAQ (DX)(BX*4), DX 28583 28584 check_limit: 28585 CMPQ SI, $0x00 28586 JHI loop 28587 RET 28588 28589 // func AmdAxpyPointerLoopX_V3A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28590 // Requires: SSE 28591 TEXT ·AmdAxpyPointerLoopX_V3A16U4(SB), NOSPLIT, $0-48 28592 MOVSS alpha+0(FP), X0 28593 MOVQ xs+8(FP), AX 28594 MOVQ incx+16(FP), CX 28595 MOVQ ys+24(FP), DX 28596 MOVQ incy+32(FP), BX 28597 MOVQ n+40(FP), SI 28598 JMP check_limit_unroll 28599 PCALIGN $0x10 28600 28601 loop_unroll: 28602 MOVSS (AX), X1 28603 MULSS X0, X1 28604 ADDSS (DX), X1 28605 MOVSS X1, (DX) 28606 LEAQ (AX)(CX*4), AX 28607 LEAQ (DX)(BX*4), DX 28608 MOVSS (AX), X1 28609 MULSS X0, X1 28610 ADDSS (DX), X1 28611 MOVSS X1, (DX) 28612 LEAQ (AX)(CX*4), AX 28613 LEAQ (DX)(BX*4), DX 28614 MOVSS (AX), X1 28615 MULSS X0, X1 28616 ADDSS (DX), X1 28617 MOVSS X1, (DX) 28618 LEAQ (AX)(CX*4), AX 28619 LEAQ (DX)(BX*4), DX 28620 MOVSS (AX), X1 28621 MULSS X0, X1 28622 ADDSS (DX), X1 28623 MOVSS X1, (DX) 28624 LEAQ (AX)(CX*4), AX 28625 LEAQ (DX)(BX*4), DX 28626 SUBQ $0x04, SI 28627 28628 check_limit_unroll: 28629 CMPQ SI, $0x04 28630 JHS loop_unroll 28631 JMP check_limit 28632 28633 loop: 28634 MOVSS (AX), X1 28635 MULSS X0, X1 28636 ADDSS (DX), X1 28637 MOVSS X1, (DX) 28638 DECQ SI 28639 LEAQ (AX)(CX*4), AX 28640 LEAQ (DX)(BX*4), DX 28641 28642 check_limit: 28643 CMPQ SI, $0x00 28644 JHI loop 28645 RET 28646 28647 // func AmdAxpyPointerLoopX_V4A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28648 // Requires: SSE 28649 TEXT ·AmdAxpyPointerLoopX_V4A16U4(SB), NOSPLIT, $0-48 28650 MOVSS alpha+0(FP), X0 28651 MOVQ xs+8(FP), AX 28652 MOVQ incx+16(FP), CX 28653 MOVQ ys+24(FP), DX 28654 MOVQ incy+32(FP), BX 28655 MOVQ n+40(FP), SI 28656 JMP check_limit_unroll 28657 PCALIGN $0x10 28658 28659 loop_unroll: 28660 MOVSS (AX), X1 28661 MULSS X0, X1 28662 ADDSS (DX), X1 28663 MOVSS X1, (DX) 28664 LEAQ (AX)(CX*4), AX 28665 LEAQ (DX)(BX*4), DX 28666 MOVSS (AX), X1 28667 MULSS X0, X1 28668 ADDSS (DX), X1 28669 MOVSS X1, (DX) 28670 LEAQ (AX)(CX*4), AX 28671 LEAQ (DX)(BX*4), DX 28672 MOVSS (AX), X1 28673 MULSS X0, X1 28674 ADDSS (DX), X1 28675 MOVSS X1, (DX) 28676 LEAQ (AX)(CX*4), AX 28677 LEAQ (DX)(BX*4), DX 28678 MOVSS (AX), X1 28679 MULSS X0, X1 28680 ADDSS (DX), X1 28681 MOVSS X1, (DX) 28682 LEAQ (AX)(CX*4), AX 28683 LEAQ (DX)(BX*4), DX 28684 SUBQ $0x04, SI 28685 28686 check_limit_unroll: 28687 CMPQ SI, $0x04 28688 JHS loop_unroll 28689 JMP check_limit 28690 28691 loop: 28692 MOVSS (AX), X1 28693 MULSS X0, X1 28694 ADDSS (DX), X1 28695 MOVSS X1, (DX) 28696 DECQ SI 28697 LEAQ (AX)(CX*4), AX 28698 LEAQ (DX)(BX*4), DX 28699 28700 check_limit: 28701 CMPQ SI, $0x00 28702 JHI loop 28703 RET 28704 28705 // func AmdAxpyPointerLoopX_V5A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28706 // Requires: SSE 28707 TEXT ·AmdAxpyPointerLoopX_V5A16U4(SB), NOSPLIT, $0-48 28708 MOVSS alpha+0(FP), X0 28709 MOVQ xs+8(FP), AX 28710 MOVQ incx+16(FP), CX 28711 MOVQ ys+24(FP), DX 28712 MOVQ incy+32(FP), BX 28713 MOVQ n+40(FP), SI 28714 JMP check_limit_unroll 28715 PCALIGN $0x10 28716 
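	// The PCALIGN above (and, in other variants, the run of NOPs following a PCALIGN $0x08)
	// only pads the instruction stream so that loop_unroll starts at a particular code
	// alignment. The A<n> part of the function name appears to encode that alignment/padding,
	// U<n> the unroll factor, and V0..V5 are byte-for-byte identical copies, presumably kept
	// to gauge run-to-run benchmark variance.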
28717 loop_unroll: 28718 MOVSS (AX), X1 28719 MULSS X0, X1 28720 ADDSS (DX), X1 28721 MOVSS X1, (DX) 28722 LEAQ (AX)(CX*4), AX 28723 LEAQ (DX)(BX*4), DX 28724 MOVSS (AX), X1 28725 MULSS X0, X1 28726 ADDSS (DX), X1 28727 MOVSS X1, (DX) 28728 LEAQ (AX)(CX*4), AX 28729 LEAQ (DX)(BX*4), DX 28730 MOVSS (AX), X1 28731 MULSS X0, X1 28732 ADDSS (DX), X1 28733 MOVSS X1, (DX) 28734 LEAQ (AX)(CX*4), AX 28735 LEAQ (DX)(BX*4), DX 28736 MOVSS (AX), X1 28737 MULSS X0, X1 28738 ADDSS (DX), X1 28739 MOVSS X1, (DX) 28740 LEAQ (AX)(CX*4), AX 28741 LEAQ (DX)(BX*4), DX 28742 SUBQ $0x04, SI 28743 28744 check_limit_unroll: 28745 CMPQ SI, $0x04 28746 JHS loop_unroll 28747 JMP check_limit 28748 28749 loop: 28750 MOVSS (AX), X1 28751 MULSS X0, X1 28752 ADDSS (DX), X1 28753 MOVSS X1, (DX) 28754 DECQ SI 28755 LEAQ (AX)(CX*4), AX 28756 LEAQ (DX)(BX*4), DX 28757 28758 check_limit: 28759 CMPQ SI, $0x00 28760 JHI loop 28761 RET 28762 28763 // func AmdAxpyPointerLoopX_V0A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28764 // Requires: SSE 28765 TEXT ·AmdAxpyPointerLoopX_V0A0U8(SB), NOSPLIT, $0-48 28766 MOVSS alpha+0(FP), X0 28767 MOVQ xs+8(FP), AX 28768 MOVQ incx+16(FP), CX 28769 MOVQ ys+24(FP), DX 28770 MOVQ incy+32(FP), BX 28771 MOVQ n+40(FP), SI 28772 JMP check_limit_unroll 28773 28774 loop_unroll: 28775 MOVSS (AX), X1 28776 MULSS X0, X1 28777 ADDSS (DX), X1 28778 MOVSS X1, (DX) 28779 LEAQ (AX)(CX*4), AX 28780 LEAQ (DX)(BX*4), DX 28781 MOVSS (AX), X1 28782 MULSS X0, X1 28783 ADDSS (DX), X1 28784 MOVSS X1, (DX) 28785 LEAQ (AX)(CX*4), AX 28786 LEAQ (DX)(BX*4), DX 28787 MOVSS (AX), X1 28788 MULSS X0, X1 28789 ADDSS (DX), X1 28790 MOVSS X1, (DX) 28791 LEAQ (AX)(CX*4), AX 28792 LEAQ (DX)(BX*4), DX 28793 MOVSS (AX), X1 28794 MULSS X0, X1 28795 ADDSS (DX), X1 28796 MOVSS X1, (DX) 28797 LEAQ (AX)(CX*4), AX 28798 LEAQ (DX)(BX*4), DX 28799 MOVSS (AX), X1 28800 MULSS X0, X1 28801 ADDSS (DX), X1 28802 MOVSS X1, (DX) 28803 LEAQ (AX)(CX*4), AX 28804 LEAQ (DX)(BX*4), DX 28805 MOVSS (AX), X1 28806 MULSS X0, X1 28807 ADDSS (DX), X1 28808 MOVSS X1, (DX) 28809 LEAQ (AX)(CX*4), AX 28810 LEAQ (DX)(BX*4), DX 28811 MOVSS (AX), X1 28812 MULSS X0, X1 28813 ADDSS (DX), X1 28814 MOVSS X1, (DX) 28815 LEAQ (AX)(CX*4), AX 28816 LEAQ (DX)(BX*4), DX 28817 MOVSS (AX), X1 28818 MULSS X0, X1 28819 ADDSS (DX), X1 28820 MOVSS X1, (DX) 28821 LEAQ (AX)(CX*4), AX 28822 LEAQ (DX)(BX*4), DX 28823 SUBQ $0x08, SI 28824 28825 check_limit_unroll: 28826 CMPQ SI, $0x08 28827 JHS loop_unroll 28828 JMP check_limit 28829 28830 loop: 28831 MOVSS (AX), X1 28832 MULSS X0, X1 28833 ADDSS (DX), X1 28834 MOVSS X1, (DX) 28835 DECQ SI 28836 LEAQ (AX)(CX*4), AX 28837 LEAQ (DX)(BX*4), DX 28838 28839 check_limit: 28840 CMPQ SI, $0x00 28841 JHI loop 28842 RET 28843 28844 // func AmdAxpyPointerLoopX_V1A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28845 // Requires: SSE 28846 TEXT ·AmdAxpyPointerLoopX_V1A0U8(SB), NOSPLIT, $0-48 28847 MOVSS alpha+0(FP), X0 28848 MOVQ xs+8(FP), AX 28849 MOVQ incx+16(FP), CX 28850 MOVQ ys+24(FP), DX 28851 MOVQ incy+32(FP), BX 28852 MOVQ n+40(FP), SI 28853 JMP check_limit_unroll 28854 28855 loop_unroll: 28856 MOVSS (AX), X1 28857 MULSS X0, X1 28858 ADDSS (DX), X1 28859 MOVSS X1, (DX) 28860 LEAQ (AX)(CX*4), AX 28861 LEAQ (DX)(BX*4), DX 28862 MOVSS (AX), X1 28863 MULSS X0, X1 28864 ADDSS (DX), X1 28865 MOVSS X1, (DX) 28866 LEAQ (AX)(CX*4), AX 28867 LEAQ (DX)(BX*4), DX 28868 MOVSS (AX), X1 28869 MULSS X0, X1 28870 ADDSS (DX), X1 28871 MOVSS X1, (DX) 28872 LEAQ (AX)(CX*4), AX 28873 
LEAQ (DX)(BX*4), DX 28874 MOVSS (AX), X1 28875 MULSS X0, X1 28876 ADDSS (DX), X1 28877 MOVSS X1, (DX) 28878 LEAQ (AX)(CX*4), AX 28879 LEAQ (DX)(BX*4), DX 28880 MOVSS (AX), X1 28881 MULSS X0, X1 28882 ADDSS (DX), X1 28883 MOVSS X1, (DX) 28884 LEAQ (AX)(CX*4), AX 28885 LEAQ (DX)(BX*4), DX 28886 MOVSS (AX), X1 28887 MULSS X0, X1 28888 ADDSS (DX), X1 28889 MOVSS X1, (DX) 28890 LEAQ (AX)(CX*4), AX 28891 LEAQ (DX)(BX*4), DX 28892 MOVSS (AX), X1 28893 MULSS X0, X1 28894 ADDSS (DX), X1 28895 MOVSS X1, (DX) 28896 LEAQ (AX)(CX*4), AX 28897 LEAQ (DX)(BX*4), DX 28898 MOVSS (AX), X1 28899 MULSS X0, X1 28900 ADDSS (DX), X1 28901 MOVSS X1, (DX) 28902 LEAQ (AX)(CX*4), AX 28903 LEAQ (DX)(BX*4), DX 28904 SUBQ $0x08, SI 28905 28906 check_limit_unroll: 28907 CMPQ SI, $0x08 28908 JHS loop_unroll 28909 JMP check_limit 28910 28911 loop: 28912 MOVSS (AX), X1 28913 MULSS X0, X1 28914 ADDSS (DX), X1 28915 MOVSS X1, (DX) 28916 DECQ SI 28917 LEAQ (AX)(CX*4), AX 28918 LEAQ (DX)(BX*4), DX 28919 28920 check_limit: 28921 CMPQ SI, $0x00 28922 JHI loop 28923 RET 28924 28925 // func AmdAxpyPointerLoopX_V2A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 28926 // Requires: SSE 28927 TEXT ·AmdAxpyPointerLoopX_V2A0U8(SB), NOSPLIT, $0-48 28928 MOVSS alpha+0(FP), X0 28929 MOVQ xs+8(FP), AX 28930 MOVQ incx+16(FP), CX 28931 MOVQ ys+24(FP), DX 28932 MOVQ incy+32(FP), BX 28933 MOVQ n+40(FP), SI 28934 JMP check_limit_unroll 28935 28936 loop_unroll: 28937 MOVSS (AX), X1 28938 MULSS X0, X1 28939 ADDSS (DX), X1 28940 MOVSS X1, (DX) 28941 LEAQ (AX)(CX*4), AX 28942 LEAQ (DX)(BX*4), DX 28943 MOVSS (AX), X1 28944 MULSS X0, X1 28945 ADDSS (DX), X1 28946 MOVSS X1, (DX) 28947 LEAQ (AX)(CX*4), AX 28948 LEAQ (DX)(BX*4), DX 28949 MOVSS (AX), X1 28950 MULSS X0, X1 28951 ADDSS (DX), X1 28952 MOVSS X1, (DX) 28953 LEAQ (AX)(CX*4), AX 28954 LEAQ (DX)(BX*4), DX 28955 MOVSS (AX), X1 28956 MULSS X0, X1 28957 ADDSS (DX), X1 28958 MOVSS X1, (DX) 28959 LEAQ (AX)(CX*4), AX 28960 LEAQ (DX)(BX*4), DX 28961 MOVSS (AX), X1 28962 MULSS X0, X1 28963 ADDSS (DX), X1 28964 MOVSS X1, (DX) 28965 LEAQ (AX)(CX*4), AX 28966 LEAQ (DX)(BX*4), DX 28967 MOVSS (AX), X1 28968 MULSS X0, X1 28969 ADDSS (DX), X1 28970 MOVSS X1, (DX) 28971 LEAQ (AX)(CX*4), AX 28972 LEAQ (DX)(BX*4), DX 28973 MOVSS (AX), X1 28974 MULSS X0, X1 28975 ADDSS (DX), X1 28976 MOVSS X1, (DX) 28977 LEAQ (AX)(CX*4), AX 28978 LEAQ (DX)(BX*4), DX 28979 MOVSS (AX), X1 28980 MULSS X0, X1 28981 ADDSS (DX), X1 28982 MOVSS X1, (DX) 28983 LEAQ (AX)(CX*4), AX 28984 LEAQ (DX)(BX*4), DX 28985 SUBQ $0x08, SI 28986 28987 check_limit_unroll: 28988 CMPQ SI, $0x08 28989 JHS loop_unroll 28990 JMP check_limit 28991 28992 loop: 28993 MOVSS (AX), X1 28994 MULSS X0, X1 28995 ADDSS (DX), X1 28996 MOVSS X1, (DX) 28997 DECQ SI 28998 LEAQ (AX)(CX*4), AX 28999 LEAQ (DX)(BX*4), DX 29000 29001 check_limit: 29002 CMPQ SI, $0x00 29003 JHI loop 29004 RET 29005 29006 // func AmdAxpyPointerLoopX_V3A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29007 // Requires: SSE 29008 TEXT ·AmdAxpyPointerLoopX_V3A0U8(SB), NOSPLIT, $0-48 29009 MOVSS alpha+0(FP), X0 29010 MOVQ xs+8(FP), AX 29011 MOVQ incx+16(FP), CX 29012 MOVQ ys+24(FP), DX 29013 MOVQ incy+32(FP), BX 29014 MOVQ n+40(FP), SI 29015 JMP check_limit_unroll 29016 29017 loop_unroll: 29018 MOVSS (AX), X1 29019 MULSS X0, X1 29020 ADDSS (DX), X1 29021 MOVSS X1, (DX) 29022 LEAQ (AX)(CX*4), AX 29023 LEAQ (DX)(BX*4), DX 29024 MOVSS (AX), X1 29025 MULSS X0, X1 29026 ADDSS (DX), X1 29027 MOVSS X1, (DX) 29028 LEAQ (AX)(CX*4), AX 
29029 LEAQ (DX)(BX*4), DX 29030 MOVSS (AX), X1 29031 MULSS X0, X1 29032 ADDSS (DX), X1 29033 MOVSS X1, (DX) 29034 LEAQ (AX)(CX*4), AX 29035 LEAQ (DX)(BX*4), DX 29036 MOVSS (AX), X1 29037 MULSS X0, X1 29038 ADDSS (DX), X1 29039 MOVSS X1, (DX) 29040 LEAQ (AX)(CX*4), AX 29041 LEAQ (DX)(BX*4), DX 29042 MOVSS (AX), X1 29043 MULSS X0, X1 29044 ADDSS (DX), X1 29045 MOVSS X1, (DX) 29046 LEAQ (AX)(CX*4), AX 29047 LEAQ (DX)(BX*4), DX 29048 MOVSS (AX), X1 29049 MULSS X0, X1 29050 ADDSS (DX), X1 29051 MOVSS X1, (DX) 29052 LEAQ (AX)(CX*4), AX 29053 LEAQ (DX)(BX*4), DX 29054 MOVSS (AX), X1 29055 MULSS X0, X1 29056 ADDSS (DX), X1 29057 MOVSS X1, (DX) 29058 LEAQ (AX)(CX*4), AX 29059 LEAQ (DX)(BX*4), DX 29060 MOVSS (AX), X1 29061 MULSS X0, X1 29062 ADDSS (DX), X1 29063 MOVSS X1, (DX) 29064 LEAQ (AX)(CX*4), AX 29065 LEAQ (DX)(BX*4), DX 29066 SUBQ $0x08, SI 29067 29068 check_limit_unroll: 29069 CMPQ SI, $0x08 29070 JHS loop_unroll 29071 JMP check_limit 29072 29073 loop: 29074 MOVSS (AX), X1 29075 MULSS X0, X1 29076 ADDSS (DX), X1 29077 MOVSS X1, (DX) 29078 DECQ SI 29079 LEAQ (AX)(CX*4), AX 29080 LEAQ (DX)(BX*4), DX 29081 29082 check_limit: 29083 CMPQ SI, $0x00 29084 JHI loop 29085 RET 29086 29087 // func AmdAxpyPointerLoopX_V4A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29088 // Requires: SSE 29089 TEXT ·AmdAxpyPointerLoopX_V4A0U8(SB), NOSPLIT, $0-48 29090 MOVSS alpha+0(FP), X0 29091 MOVQ xs+8(FP), AX 29092 MOVQ incx+16(FP), CX 29093 MOVQ ys+24(FP), DX 29094 MOVQ incy+32(FP), BX 29095 MOVQ n+40(FP), SI 29096 JMP check_limit_unroll 29097 29098 loop_unroll: 29099 MOVSS (AX), X1 29100 MULSS X0, X1 29101 ADDSS (DX), X1 29102 MOVSS X1, (DX) 29103 LEAQ (AX)(CX*4), AX 29104 LEAQ (DX)(BX*4), DX 29105 MOVSS (AX), X1 29106 MULSS X0, X1 29107 ADDSS (DX), X1 29108 MOVSS X1, (DX) 29109 LEAQ (AX)(CX*4), AX 29110 LEAQ (DX)(BX*4), DX 29111 MOVSS (AX), X1 29112 MULSS X0, X1 29113 ADDSS (DX), X1 29114 MOVSS X1, (DX) 29115 LEAQ (AX)(CX*4), AX 29116 LEAQ (DX)(BX*4), DX 29117 MOVSS (AX), X1 29118 MULSS X0, X1 29119 ADDSS (DX), X1 29120 MOVSS X1, (DX) 29121 LEAQ (AX)(CX*4), AX 29122 LEAQ (DX)(BX*4), DX 29123 MOVSS (AX), X1 29124 MULSS X0, X1 29125 ADDSS (DX), X1 29126 MOVSS X1, (DX) 29127 LEAQ (AX)(CX*4), AX 29128 LEAQ (DX)(BX*4), DX 29129 MOVSS (AX), X1 29130 MULSS X0, X1 29131 ADDSS (DX), X1 29132 MOVSS X1, (DX) 29133 LEAQ (AX)(CX*4), AX 29134 LEAQ (DX)(BX*4), DX 29135 MOVSS (AX), X1 29136 MULSS X0, X1 29137 ADDSS (DX), X1 29138 MOVSS X1, (DX) 29139 LEAQ (AX)(CX*4), AX 29140 LEAQ (DX)(BX*4), DX 29141 MOVSS (AX), X1 29142 MULSS X0, X1 29143 ADDSS (DX), X1 29144 MOVSS X1, (DX) 29145 LEAQ (AX)(CX*4), AX 29146 LEAQ (DX)(BX*4), DX 29147 SUBQ $0x08, SI 29148 29149 check_limit_unroll: 29150 CMPQ SI, $0x08 29151 JHS loop_unroll 29152 JMP check_limit 29153 29154 loop: 29155 MOVSS (AX), X1 29156 MULSS X0, X1 29157 ADDSS (DX), X1 29158 MOVSS X1, (DX) 29159 DECQ SI 29160 LEAQ (AX)(CX*4), AX 29161 LEAQ (DX)(BX*4), DX 29162 29163 check_limit: 29164 CMPQ SI, $0x00 29165 JHI loop 29166 RET 29167 29168 // func AmdAxpyPointerLoopX_V5A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29169 // Requires: SSE 29170 TEXT ·AmdAxpyPointerLoopX_V5A0U8(SB), NOSPLIT, $0-48 29171 MOVSS alpha+0(FP), X0 29172 MOVQ xs+8(FP), AX 29173 MOVQ incx+16(FP), CX 29174 MOVQ ys+24(FP), DX 29175 MOVQ incy+32(FP), BX 29176 MOVQ n+40(FP), SI 29177 JMP check_limit_unroll 29178 29179 loop_unroll: 29180 MOVSS (AX), X1 29181 MULSS X0, X1 29182 ADDSS (DX), X1 29183 MOVSS X1, (DX) 29184 LEAQ (AX)(CX*4), 
AX 29185 LEAQ (DX)(BX*4), DX 29186 MOVSS (AX), X1 29187 MULSS X0, X1 29188 ADDSS (DX), X1 29189 MOVSS X1, (DX) 29190 LEAQ (AX)(CX*4), AX 29191 LEAQ (DX)(BX*4), DX 29192 MOVSS (AX), X1 29193 MULSS X0, X1 29194 ADDSS (DX), X1 29195 MOVSS X1, (DX) 29196 LEAQ (AX)(CX*4), AX 29197 LEAQ (DX)(BX*4), DX 29198 MOVSS (AX), X1 29199 MULSS X0, X1 29200 ADDSS (DX), X1 29201 MOVSS X1, (DX) 29202 LEAQ (AX)(CX*4), AX 29203 LEAQ (DX)(BX*4), DX 29204 MOVSS (AX), X1 29205 MULSS X0, X1 29206 ADDSS (DX), X1 29207 MOVSS X1, (DX) 29208 LEAQ (AX)(CX*4), AX 29209 LEAQ (DX)(BX*4), DX 29210 MOVSS (AX), X1 29211 MULSS X0, X1 29212 ADDSS (DX), X1 29213 MOVSS X1, (DX) 29214 LEAQ (AX)(CX*4), AX 29215 LEAQ (DX)(BX*4), DX 29216 MOVSS (AX), X1 29217 MULSS X0, X1 29218 ADDSS (DX), X1 29219 MOVSS X1, (DX) 29220 LEAQ (AX)(CX*4), AX 29221 LEAQ (DX)(BX*4), DX 29222 MOVSS (AX), X1 29223 MULSS X0, X1 29224 ADDSS (DX), X1 29225 MOVSS X1, (DX) 29226 LEAQ (AX)(CX*4), AX 29227 LEAQ (DX)(BX*4), DX 29228 SUBQ $0x08, SI 29229 29230 check_limit_unroll: 29231 CMPQ SI, $0x08 29232 JHS loop_unroll 29233 JMP check_limit 29234 29235 loop: 29236 MOVSS (AX), X1 29237 MULSS X0, X1 29238 ADDSS (DX), X1 29239 MOVSS X1, (DX) 29240 DECQ SI 29241 LEAQ (AX)(CX*4), AX 29242 LEAQ (DX)(BX*4), DX 29243 29244 check_limit: 29245 CMPQ SI, $0x00 29246 JHI loop 29247 RET 29248 29249 // func AmdAxpyPointerLoopX_V0A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29250 // Requires: SSE 29251 TEXT ·AmdAxpyPointerLoopX_V0A8U8(SB), NOSPLIT, $0-48 29252 MOVSS alpha+0(FP), X0 29253 MOVQ xs+8(FP), AX 29254 MOVQ incx+16(FP), CX 29255 MOVQ ys+24(FP), DX 29256 MOVQ incy+32(FP), BX 29257 MOVQ n+40(FP), SI 29258 JMP check_limit_unroll 29259 PCALIGN $0x08 29260 29261 loop_unroll: 29262 MOVSS (AX), X1 29263 MULSS X0, X1 29264 ADDSS (DX), X1 29265 MOVSS X1, (DX) 29266 LEAQ (AX)(CX*4), AX 29267 LEAQ (DX)(BX*4), DX 29268 MOVSS (AX), X1 29269 MULSS X0, X1 29270 ADDSS (DX), X1 29271 MOVSS X1, (DX) 29272 LEAQ (AX)(CX*4), AX 29273 LEAQ (DX)(BX*4), DX 29274 MOVSS (AX), X1 29275 MULSS X0, X1 29276 ADDSS (DX), X1 29277 MOVSS X1, (DX) 29278 LEAQ (AX)(CX*4), AX 29279 LEAQ (DX)(BX*4), DX 29280 MOVSS (AX), X1 29281 MULSS X0, X1 29282 ADDSS (DX), X1 29283 MOVSS X1, (DX) 29284 LEAQ (AX)(CX*4), AX 29285 LEAQ (DX)(BX*4), DX 29286 MOVSS (AX), X1 29287 MULSS X0, X1 29288 ADDSS (DX), X1 29289 MOVSS X1, (DX) 29290 LEAQ (AX)(CX*4), AX 29291 LEAQ (DX)(BX*4), DX 29292 MOVSS (AX), X1 29293 MULSS X0, X1 29294 ADDSS (DX), X1 29295 MOVSS X1, (DX) 29296 LEAQ (AX)(CX*4), AX 29297 LEAQ (DX)(BX*4), DX 29298 MOVSS (AX), X1 29299 MULSS X0, X1 29300 ADDSS (DX), X1 29301 MOVSS X1, (DX) 29302 LEAQ (AX)(CX*4), AX 29303 LEAQ (DX)(BX*4), DX 29304 MOVSS (AX), X1 29305 MULSS X0, X1 29306 ADDSS (DX), X1 29307 MOVSS X1, (DX) 29308 LEAQ (AX)(CX*4), AX 29309 LEAQ (DX)(BX*4), DX 29310 SUBQ $0x08, SI 29311 29312 check_limit_unroll: 29313 CMPQ SI, $0x08 29314 JHS loop_unroll 29315 JMP check_limit 29316 29317 loop: 29318 MOVSS (AX), X1 29319 MULSS X0, X1 29320 ADDSS (DX), X1 29321 MOVSS X1, (DX) 29322 DECQ SI 29323 LEAQ (AX)(CX*4), AX 29324 LEAQ (DX)(BX*4), DX 29325 29326 check_limit: 29327 CMPQ SI, $0x00 29328 JHI loop 29329 RET 29330 29331 // func AmdAxpyPointerLoopX_V1A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29332 // Requires: SSE 29333 TEXT ·AmdAxpyPointerLoopX_V1A8U8(SB), NOSPLIT, $0-48 29334 MOVSS alpha+0(FP), X0 29335 MOVQ xs+8(FP), AX 29336 MOVQ incx+16(FP), CX 29337 MOVQ ys+24(FP), DX 29338 MOVQ incy+32(FP), BX 29339 MOVQ n+40(FP), SI 
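	// Prologue: alpha is loaded into X0, the x pointer/stride into AX/CX, the y pointer/stride
	// into DX/BX, and the element count n into SI. Control jumps straight to check_limit_unroll;
	// the 8-way unrolled loop runs while at least 8 elements remain, and the scalar loop below
	// it handles the remaining 0..7 elements.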
29340 JMP check_limit_unroll 29341 PCALIGN $0x08 29342 29343 loop_unroll: 29344 MOVSS (AX), X1 29345 MULSS X0, X1 29346 ADDSS (DX), X1 29347 MOVSS X1, (DX) 29348 LEAQ (AX)(CX*4), AX 29349 LEAQ (DX)(BX*4), DX 29350 MOVSS (AX), X1 29351 MULSS X0, X1 29352 ADDSS (DX), X1 29353 MOVSS X1, (DX) 29354 LEAQ (AX)(CX*4), AX 29355 LEAQ (DX)(BX*4), DX 29356 MOVSS (AX), X1 29357 MULSS X0, X1 29358 ADDSS (DX), X1 29359 MOVSS X1, (DX) 29360 LEAQ (AX)(CX*4), AX 29361 LEAQ (DX)(BX*4), DX 29362 MOVSS (AX), X1 29363 MULSS X0, X1 29364 ADDSS (DX), X1 29365 MOVSS X1, (DX) 29366 LEAQ (AX)(CX*4), AX 29367 LEAQ (DX)(BX*4), DX 29368 MOVSS (AX), X1 29369 MULSS X0, X1 29370 ADDSS (DX), X1 29371 MOVSS X1, (DX) 29372 LEAQ (AX)(CX*4), AX 29373 LEAQ (DX)(BX*4), DX 29374 MOVSS (AX), X1 29375 MULSS X0, X1 29376 ADDSS (DX), X1 29377 MOVSS X1, (DX) 29378 LEAQ (AX)(CX*4), AX 29379 LEAQ (DX)(BX*4), DX 29380 MOVSS (AX), X1 29381 MULSS X0, X1 29382 ADDSS (DX), X1 29383 MOVSS X1, (DX) 29384 LEAQ (AX)(CX*4), AX 29385 LEAQ (DX)(BX*4), DX 29386 MOVSS (AX), X1 29387 MULSS X0, X1 29388 ADDSS (DX), X1 29389 MOVSS X1, (DX) 29390 LEAQ (AX)(CX*4), AX 29391 LEAQ (DX)(BX*4), DX 29392 SUBQ $0x08, SI 29393 29394 check_limit_unroll: 29395 CMPQ SI, $0x08 29396 JHS loop_unroll 29397 JMP check_limit 29398 29399 loop: 29400 MOVSS (AX), X1 29401 MULSS X0, X1 29402 ADDSS (DX), X1 29403 MOVSS X1, (DX) 29404 DECQ SI 29405 LEAQ (AX)(CX*4), AX 29406 LEAQ (DX)(BX*4), DX 29407 29408 check_limit: 29409 CMPQ SI, $0x00 29410 JHI loop 29411 RET 29412 29413 // func AmdAxpyPointerLoopX_V2A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29414 // Requires: SSE 29415 TEXT ·AmdAxpyPointerLoopX_V2A8U8(SB), NOSPLIT, $0-48 29416 MOVSS alpha+0(FP), X0 29417 MOVQ xs+8(FP), AX 29418 MOVQ incx+16(FP), CX 29419 MOVQ ys+24(FP), DX 29420 MOVQ incy+32(FP), BX 29421 MOVQ n+40(FP), SI 29422 JMP check_limit_unroll 29423 PCALIGN $0x08 29424 29425 loop_unroll: 29426 MOVSS (AX), X1 29427 MULSS X0, X1 29428 ADDSS (DX), X1 29429 MOVSS X1, (DX) 29430 LEAQ (AX)(CX*4), AX 29431 LEAQ (DX)(BX*4), DX 29432 MOVSS (AX), X1 29433 MULSS X0, X1 29434 ADDSS (DX), X1 29435 MOVSS X1, (DX) 29436 LEAQ (AX)(CX*4), AX 29437 LEAQ (DX)(BX*4), DX 29438 MOVSS (AX), X1 29439 MULSS X0, X1 29440 ADDSS (DX), X1 29441 MOVSS X1, (DX) 29442 LEAQ (AX)(CX*4), AX 29443 LEAQ (DX)(BX*4), DX 29444 MOVSS (AX), X1 29445 MULSS X0, X1 29446 ADDSS (DX), X1 29447 MOVSS X1, (DX) 29448 LEAQ (AX)(CX*4), AX 29449 LEAQ (DX)(BX*4), DX 29450 MOVSS (AX), X1 29451 MULSS X0, X1 29452 ADDSS (DX), X1 29453 MOVSS X1, (DX) 29454 LEAQ (AX)(CX*4), AX 29455 LEAQ (DX)(BX*4), DX 29456 MOVSS (AX), X1 29457 MULSS X0, X1 29458 ADDSS (DX), X1 29459 MOVSS X1, (DX) 29460 LEAQ (AX)(CX*4), AX 29461 LEAQ (DX)(BX*4), DX 29462 MOVSS (AX), X1 29463 MULSS X0, X1 29464 ADDSS (DX), X1 29465 MOVSS X1, (DX) 29466 LEAQ (AX)(CX*4), AX 29467 LEAQ (DX)(BX*4), DX 29468 MOVSS (AX), X1 29469 MULSS X0, X1 29470 ADDSS (DX), X1 29471 MOVSS X1, (DX) 29472 LEAQ (AX)(CX*4), AX 29473 LEAQ (DX)(BX*4), DX 29474 SUBQ $0x08, SI 29475 29476 check_limit_unroll: 29477 CMPQ SI, $0x08 29478 JHS loop_unroll 29479 JMP check_limit 29480 29481 loop: 29482 MOVSS (AX), X1 29483 MULSS X0, X1 29484 ADDSS (DX), X1 29485 MOVSS X1, (DX) 29486 DECQ SI 29487 LEAQ (AX)(CX*4), AX 29488 LEAQ (DX)(BX*4), DX 29489 29490 check_limit: 29491 CMPQ SI, $0x00 29492 JHI loop 29493 RET 29494 29495 // func AmdAxpyPointerLoopX_V3A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29496 // Requires: SSE 29497 TEXT 
·AmdAxpyPointerLoopX_V3A8U8(SB), NOSPLIT, $0-48 29498 MOVSS alpha+0(FP), X0 29499 MOVQ xs+8(FP), AX 29500 MOVQ incx+16(FP), CX 29501 MOVQ ys+24(FP), DX 29502 MOVQ incy+32(FP), BX 29503 MOVQ n+40(FP), SI 29504 JMP check_limit_unroll 29505 PCALIGN $0x08 29506 29507 loop_unroll: 29508 MOVSS (AX), X1 29509 MULSS X0, X1 29510 ADDSS (DX), X1 29511 MOVSS X1, (DX) 29512 LEAQ (AX)(CX*4), AX 29513 LEAQ (DX)(BX*4), DX 29514 MOVSS (AX), X1 29515 MULSS X0, X1 29516 ADDSS (DX), X1 29517 MOVSS X1, (DX) 29518 LEAQ (AX)(CX*4), AX 29519 LEAQ (DX)(BX*4), DX 29520 MOVSS (AX), X1 29521 MULSS X0, X1 29522 ADDSS (DX), X1 29523 MOVSS X1, (DX) 29524 LEAQ (AX)(CX*4), AX 29525 LEAQ (DX)(BX*4), DX 29526 MOVSS (AX), X1 29527 MULSS X0, X1 29528 ADDSS (DX), X1 29529 MOVSS X1, (DX) 29530 LEAQ (AX)(CX*4), AX 29531 LEAQ (DX)(BX*4), DX 29532 MOVSS (AX), X1 29533 MULSS X0, X1 29534 ADDSS (DX), X1 29535 MOVSS X1, (DX) 29536 LEAQ (AX)(CX*4), AX 29537 LEAQ (DX)(BX*4), DX 29538 MOVSS (AX), X1 29539 MULSS X0, X1 29540 ADDSS (DX), X1 29541 MOVSS X1, (DX) 29542 LEAQ (AX)(CX*4), AX 29543 LEAQ (DX)(BX*4), DX 29544 MOVSS (AX), X1 29545 MULSS X0, X1 29546 ADDSS (DX), X1 29547 MOVSS X1, (DX) 29548 LEAQ (AX)(CX*4), AX 29549 LEAQ (DX)(BX*4), DX 29550 MOVSS (AX), X1 29551 MULSS X0, X1 29552 ADDSS (DX), X1 29553 MOVSS X1, (DX) 29554 LEAQ (AX)(CX*4), AX 29555 LEAQ (DX)(BX*4), DX 29556 SUBQ $0x08, SI 29557 29558 check_limit_unroll: 29559 CMPQ SI, $0x08 29560 JHS loop_unroll 29561 JMP check_limit 29562 29563 loop: 29564 MOVSS (AX), X1 29565 MULSS X0, X1 29566 ADDSS (DX), X1 29567 MOVSS X1, (DX) 29568 DECQ SI 29569 LEAQ (AX)(CX*4), AX 29570 LEAQ (DX)(BX*4), DX 29571 29572 check_limit: 29573 CMPQ SI, $0x00 29574 JHI loop 29575 RET 29576 29577 // func AmdAxpyPointerLoopX_V4A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29578 // Requires: SSE 29579 TEXT ·AmdAxpyPointerLoopX_V4A8U8(SB), NOSPLIT, $0-48 29580 MOVSS alpha+0(FP), X0 29581 MOVQ xs+8(FP), AX 29582 MOVQ incx+16(FP), CX 29583 MOVQ ys+24(FP), DX 29584 MOVQ incy+32(FP), BX 29585 MOVQ n+40(FP), SI 29586 JMP check_limit_unroll 29587 PCALIGN $0x08 29588 29589 loop_unroll: 29590 MOVSS (AX), X1 29591 MULSS X0, X1 29592 ADDSS (DX), X1 29593 MOVSS X1, (DX) 29594 LEAQ (AX)(CX*4), AX 29595 LEAQ (DX)(BX*4), DX 29596 MOVSS (AX), X1 29597 MULSS X0, X1 29598 ADDSS (DX), X1 29599 MOVSS X1, (DX) 29600 LEAQ (AX)(CX*4), AX 29601 LEAQ (DX)(BX*4), DX 29602 MOVSS (AX), X1 29603 MULSS X0, X1 29604 ADDSS (DX), X1 29605 MOVSS X1, (DX) 29606 LEAQ (AX)(CX*4), AX 29607 LEAQ (DX)(BX*4), DX 29608 MOVSS (AX), X1 29609 MULSS X0, X1 29610 ADDSS (DX), X1 29611 MOVSS X1, (DX) 29612 LEAQ (AX)(CX*4), AX 29613 LEAQ (DX)(BX*4), DX 29614 MOVSS (AX), X1 29615 MULSS X0, X1 29616 ADDSS (DX), X1 29617 MOVSS X1, (DX) 29618 LEAQ (AX)(CX*4), AX 29619 LEAQ (DX)(BX*4), DX 29620 MOVSS (AX), X1 29621 MULSS X0, X1 29622 ADDSS (DX), X1 29623 MOVSS X1, (DX) 29624 LEAQ (AX)(CX*4), AX 29625 LEAQ (DX)(BX*4), DX 29626 MOVSS (AX), X1 29627 MULSS X0, X1 29628 ADDSS (DX), X1 29629 MOVSS X1, (DX) 29630 LEAQ (AX)(CX*4), AX 29631 LEAQ (DX)(BX*4), DX 29632 MOVSS (AX), X1 29633 MULSS X0, X1 29634 ADDSS (DX), X1 29635 MOVSS X1, (DX) 29636 LEAQ (AX)(CX*4), AX 29637 LEAQ (DX)(BX*4), DX 29638 SUBQ $0x08, SI 29639 29640 check_limit_unroll: 29641 CMPQ SI, $0x08 29642 JHS loop_unroll 29643 JMP check_limit 29644 29645 loop: 29646 MOVSS (AX), X1 29647 MULSS X0, X1 29648 ADDSS (DX), X1 29649 MOVSS X1, (DX) 29650 DECQ SI 29651 LEAQ (AX)(CX*4), AX 29652 LEAQ (DX)(BX*4), DX 29653 29654 check_limit: 29655 CMPQ SI, $0x00 29656 JHI 
loop 29657 RET 29658 29659 // func AmdAxpyPointerLoopX_V5A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29660 // Requires: SSE 29661 TEXT ·AmdAxpyPointerLoopX_V5A8U8(SB), NOSPLIT, $0-48 29662 MOVSS alpha+0(FP), X0 29663 MOVQ xs+8(FP), AX 29664 MOVQ incx+16(FP), CX 29665 MOVQ ys+24(FP), DX 29666 MOVQ incy+32(FP), BX 29667 MOVQ n+40(FP), SI 29668 JMP check_limit_unroll 29669 PCALIGN $0x08 29670 29671 loop_unroll: 29672 MOVSS (AX), X1 29673 MULSS X0, X1 29674 ADDSS (DX), X1 29675 MOVSS X1, (DX) 29676 LEAQ (AX)(CX*4), AX 29677 LEAQ (DX)(BX*4), DX 29678 MOVSS (AX), X1 29679 MULSS X0, X1 29680 ADDSS (DX), X1 29681 MOVSS X1, (DX) 29682 LEAQ (AX)(CX*4), AX 29683 LEAQ (DX)(BX*4), DX 29684 MOVSS (AX), X1 29685 MULSS X0, X1 29686 ADDSS (DX), X1 29687 MOVSS X1, (DX) 29688 LEAQ (AX)(CX*4), AX 29689 LEAQ (DX)(BX*4), DX 29690 MOVSS (AX), X1 29691 MULSS X0, X1 29692 ADDSS (DX), X1 29693 MOVSS X1, (DX) 29694 LEAQ (AX)(CX*4), AX 29695 LEAQ (DX)(BX*4), DX 29696 MOVSS (AX), X1 29697 MULSS X0, X1 29698 ADDSS (DX), X1 29699 MOVSS X1, (DX) 29700 LEAQ (AX)(CX*4), AX 29701 LEAQ (DX)(BX*4), DX 29702 MOVSS (AX), X1 29703 MULSS X0, X1 29704 ADDSS (DX), X1 29705 MOVSS X1, (DX) 29706 LEAQ (AX)(CX*4), AX 29707 LEAQ (DX)(BX*4), DX 29708 MOVSS (AX), X1 29709 MULSS X0, X1 29710 ADDSS (DX), X1 29711 MOVSS X1, (DX) 29712 LEAQ (AX)(CX*4), AX 29713 LEAQ (DX)(BX*4), DX 29714 MOVSS (AX), X1 29715 MULSS X0, X1 29716 ADDSS (DX), X1 29717 MOVSS X1, (DX) 29718 LEAQ (AX)(CX*4), AX 29719 LEAQ (DX)(BX*4), DX 29720 SUBQ $0x08, SI 29721 29722 check_limit_unroll: 29723 CMPQ SI, $0x08 29724 JHS loop_unroll 29725 JMP check_limit 29726 29727 loop: 29728 MOVSS (AX), X1 29729 MULSS X0, X1 29730 ADDSS (DX), X1 29731 MOVSS X1, (DX) 29732 DECQ SI 29733 LEAQ (AX)(CX*4), AX 29734 LEAQ (DX)(BX*4), DX 29735 29736 check_limit: 29737 CMPQ SI, $0x00 29738 JHI loop 29739 RET 29740 29741 // func AmdAxpyPointerLoopX_V0A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29742 // Requires: SSE 29743 TEXT ·AmdAxpyPointerLoopX_V0A9U8(SB), NOSPLIT, $0-48 29744 MOVSS alpha+0(FP), X0 29745 MOVQ xs+8(FP), AX 29746 MOVQ incx+16(FP), CX 29747 MOVQ ys+24(FP), DX 29748 MOVQ incy+32(FP), BX 29749 MOVQ n+40(FP), SI 29750 JMP check_limit_unroll 29751 PCALIGN $0x08 29752 NOP 29753 29754 loop_unroll: 29755 MOVSS (AX), X1 29756 MULSS X0, X1 29757 ADDSS (DX), X1 29758 MOVSS X1, (DX) 29759 LEAQ (AX)(CX*4), AX 29760 LEAQ (DX)(BX*4), DX 29761 MOVSS (AX), X1 29762 MULSS X0, X1 29763 ADDSS (DX), X1 29764 MOVSS X1, (DX) 29765 LEAQ (AX)(CX*4), AX 29766 LEAQ (DX)(BX*4), DX 29767 MOVSS (AX), X1 29768 MULSS X0, X1 29769 ADDSS (DX), X1 29770 MOVSS X1, (DX) 29771 LEAQ (AX)(CX*4), AX 29772 LEAQ (DX)(BX*4), DX 29773 MOVSS (AX), X1 29774 MULSS X0, X1 29775 ADDSS (DX), X1 29776 MOVSS X1, (DX) 29777 LEAQ (AX)(CX*4), AX 29778 LEAQ (DX)(BX*4), DX 29779 MOVSS (AX), X1 29780 MULSS X0, X1 29781 ADDSS (DX), X1 29782 MOVSS X1, (DX) 29783 LEAQ (AX)(CX*4), AX 29784 LEAQ (DX)(BX*4), DX 29785 MOVSS (AX), X1 29786 MULSS X0, X1 29787 ADDSS (DX), X1 29788 MOVSS X1, (DX) 29789 LEAQ (AX)(CX*4), AX 29790 LEAQ (DX)(BX*4), DX 29791 MOVSS (AX), X1 29792 MULSS X0, X1 29793 ADDSS (DX), X1 29794 MOVSS X1, (DX) 29795 LEAQ (AX)(CX*4), AX 29796 LEAQ (DX)(BX*4), DX 29797 MOVSS (AX), X1 29798 MULSS X0, X1 29799 ADDSS (DX), X1 29800 MOVSS X1, (DX) 29801 LEAQ (AX)(CX*4), AX 29802 LEAQ (DX)(BX*4), DX 29803 SUBQ $0x08, SI 29804 29805 check_limit_unroll: 29806 CMPQ SI, $0x08 29807 JHS loop_unroll 29808 JMP check_limit 29809 29810 loop: 29811 MOVSS (AX), 
X1 29812 MULSS X0, X1 29813 ADDSS (DX), X1 29814 MOVSS X1, (DX) 29815 DECQ SI 29816 LEAQ (AX)(CX*4), AX 29817 LEAQ (DX)(BX*4), DX 29818 29819 check_limit: 29820 CMPQ SI, $0x00 29821 JHI loop 29822 RET 29823 29824 // func AmdAxpyPointerLoopX_V1A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29825 // Requires: SSE 29826 TEXT ·AmdAxpyPointerLoopX_V1A9U8(SB), NOSPLIT, $0-48 29827 MOVSS alpha+0(FP), X0 29828 MOVQ xs+8(FP), AX 29829 MOVQ incx+16(FP), CX 29830 MOVQ ys+24(FP), DX 29831 MOVQ incy+32(FP), BX 29832 MOVQ n+40(FP), SI 29833 JMP check_limit_unroll 29834 PCALIGN $0x08 29835 NOP 29836 29837 loop_unroll: 29838 MOVSS (AX), X1 29839 MULSS X0, X1 29840 ADDSS (DX), X1 29841 MOVSS X1, (DX) 29842 LEAQ (AX)(CX*4), AX 29843 LEAQ (DX)(BX*4), DX 29844 MOVSS (AX), X1 29845 MULSS X0, X1 29846 ADDSS (DX), X1 29847 MOVSS X1, (DX) 29848 LEAQ (AX)(CX*4), AX 29849 LEAQ (DX)(BX*4), DX 29850 MOVSS (AX), X1 29851 MULSS X0, X1 29852 ADDSS (DX), X1 29853 MOVSS X1, (DX) 29854 LEAQ (AX)(CX*4), AX 29855 LEAQ (DX)(BX*4), DX 29856 MOVSS (AX), X1 29857 MULSS X0, X1 29858 ADDSS (DX), X1 29859 MOVSS X1, (DX) 29860 LEAQ (AX)(CX*4), AX 29861 LEAQ (DX)(BX*4), DX 29862 MOVSS (AX), X1 29863 MULSS X0, X1 29864 ADDSS (DX), X1 29865 MOVSS X1, (DX) 29866 LEAQ (AX)(CX*4), AX 29867 LEAQ (DX)(BX*4), DX 29868 MOVSS (AX), X1 29869 MULSS X0, X1 29870 ADDSS (DX), X1 29871 MOVSS X1, (DX) 29872 LEAQ (AX)(CX*4), AX 29873 LEAQ (DX)(BX*4), DX 29874 MOVSS (AX), X1 29875 MULSS X0, X1 29876 ADDSS (DX), X1 29877 MOVSS X1, (DX) 29878 LEAQ (AX)(CX*4), AX 29879 LEAQ (DX)(BX*4), DX 29880 MOVSS (AX), X1 29881 MULSS X0, X1 29882 ADDSS (DX), X1 29883 MOVSS X1, (DX) 29884 LEAQ (AX)(CX*4), AX 29885 LEAQ (DX)(BX*4), DX 29886 SUBQ $0x08, SI 29887 29888 check_limit_unroll: 29889 CMPQ SI, $0x08 29890 JHS loop_unroll 29891 JMP check_limit 29892 29893 loop: 29894 MOVSS (AX), X1 29895 MULSS X0, X1 29896 ADDSS (DX), X1 29897 MOVSS X1, (DX) 29898 DECQ SI 29899 LEAQ (AX)(CX*4), AX 29900 LEAQ (DX)(BX*4), DX 29901 29902 check_limit: 29903 CMPQ SI, $0x00 29904 JHI loop 29905 RET 29906 29907 // func AmdAxpyPointerLoopX_V2A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29908 // Requires: SSE 29909 TEXT ·AmdAxpyPointerLoopX_V2A9U8(SB), NOSPLIT, $0-48 29910 MOVSS alpha+0(FP), X0 29911 MOVQ xs+8(FP), AX 29912 MOVQ incx+16(FP), CX 29913 MOVQ ys+24(FP), DX 29914 MOVQ incy+32(FP), BX 29915 MOVQ n+40(FP), SI 29916 JMP check_limit_unroll 29917 PCALIGN $0x08 29918 NOP 29919 29920 loop_unroll: 29921 MOVSS (AX), X1 29922 MULSS X0, X1 29923 ADDSS (DX), X1 29924 MOVSS X1, (DX) 29925 LEAQ (AX)(CX*4), AX 29926 LEAQ (DX)(BX*4), DX 29927 MOVSS (AX), X1 29928 MULSS X0, X1 29929 ADDSS (DX), X1 29930 MOVSS X1, (DX) 29931 LEAQ (AX)(CX*4), AX 29932 LEAQ (DX)(BX*4), DX 29933 MOVSS (AX), X1 29934 MULSS X0, X1 29935 ADDSS (DX), X1 29936 MOVSS X1, (DX) 29937 LEAQ (AX)(CX*4), AX 29938 LEAQ (DX)(BX*4), DX 29939 MOVSS (AX), X1 29940 MULSS X0, X1 29941 ADDSS (DX), X1 29942 MOVSS X1, (DX) 29943 LEAQ (AX)(CX*4), AX 29944 LEAQ (DX)(BX*4), DX 29945 MOVSS (AX), X1 29946 MULSS X0, X1 29947 ADDSS (DX), X1 29948 MOVSS X1, (DX) 29949 LEAQ (AX)(CX*4), AX 29950 LEAQ (DX)(BX*4), DX 29951 MOVSS (AX), X1 29952 MULSS X0, X1 29953 ADDSS (DX), X1 29954 MOVSS X1, (DX) 29955 LEAQ (AX)(CX*4), AX 29956 LEAQ (DX)(BX*4), DX 29957 MOVSS (AX), X1 29958 MULSS X0, X1 29959 ADDSS (DX), X1 29960 MOVSS X1, (DX) 29961 LEAQ (AX)(CX*4), AX 29962 LEAQ (DX)(BX*4), DX 29963 MOVSS (AX), X1 29964 MULSS X0, X1 29965 ADDSS (DX), X1 29966 MOVSS X1, (DX) 29967 LEAQ 
(AX)(CX*4), AX 29968 LEAQ (DX)(BX*4), DX 29969 SUBQ $0x08, SI 29970 29971 check_limit_unroll: 29972 CMPQ SI, $0x08 29973 JHS loop_unroll 29974 JMP check_limit 29975 29976 loop: 29977 MOVSS (AX), X1 29978 MULSS X0, X1 29979 ADDSS (DX), X1 29980 MOVSS X1, (DX) 29981 DECQ SI 29982 LEAQ (AX)(CX*4), AX 29983 LEAQ (DX)(BX*4), DX 29984 29985 check_limit: 29986 CMPQ SI, $0x00 29987 JHI loop 29988 RET 29989 29990 // func AmdAxpyPointerLoopX_V3A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 29991 // Requires: SSE 29992 TEXT ·AmdAxpyPointerLoopX_V3A9U8(SB), NOSPLIT, $0-48 29993 MOVSS alpha+0(FP), X0 29994 MOVQ xs+8(FP), AX 29995 MOVQ incx+16(FP), CX 29996 MOVQ ys+24(FP), DX 29997 MOVQ incy+32(FP), BX 29998 MOVQ n+40(FP), SI 29999 JMP check_limit_unroll 30000 PCALIGN $0x08 30001 NOP 30002 30003 loop_unroll: 30004 MOVSS (AX), X1 30005 MULSS X0, X1 30006 ADDSS (DX), X1 30007 MOVSS X1, (DX) 30008 LEAQ (AX)(CX*4), AX 30009 LEAQ (DX)(BX*4), DX 30010 MOVSS (AX), X1 30011 MULSS X0, X1 30012 ADDSS (DX), X1 30013 MOVSS X1, (DX) 30014 LEAQ (AX)(CX*4), AX 30015 LEAQ (DX)(BX*4), DX 30016 MOVSS (AX), X1 30017 MULSS X0, X1 30018 ADDSS (DX), X1 30019 MOVSS X1, (DX) 30020 LEAQ (AX)(CX*4), AX 30021 LEAQ (DX)(BX*4), DX 30022 MOVSS (AX), X1 30023 MULSS X0, X1 30024 ADDSS (DX), X1 30025 MOVSS X1, (DX) 30026 LEAQ (AX)(CX*4), AX 30027 LEAQ (DX)(BX*4), DX 30028 MOVSS (AX), X1 30029 MULSS X0, X1 30030 ADDSS (DX), X1 30031 MOVSS X1, (DX) 30032 LEAQ (AX)(CX*4), AX 30033 LEAQ (DX)(BX*4), DX 30034 MOVSS (AX), X1 30035 MULSS X0, X1 30036 ADDSS (DX), X1 30037 MOVSS X1, (DX) 30038 LEAQ (AX)(CX*4), AX 30039 LEAQ (DX)(BX*4), DX 30040 MOVSS (AX), X1 30041 MULSS X0, X1 30042 ADDSS (DX), X1 30043 MOVSS X1, (DX) 30044 LEAQ (AX)(CX*4), AX 30045 LEAQ (DX)(BX*4), DX 30046 MOVSS (AX), X1 30047 MULSS X0, X1 30048 ADDSS (DX), X1 30049 MOVSS X1, (DX) 30050 LEAQ (AX)(CX*4), AX 30051 LEAQ (DX)(BX*4), DX 30052 SUBQ $0x08, SI 30053 30054 check_limit_unroll: 30055 CMPQ SI, $0x08 30056 JHS loop_unroll 30057 JMP check_limit 30058 30059 loop: 30060 MOVSS (AX), X1 30061 MULSS X0, X1 30062 ADDSS (DX), X1 30063 MOVSS X1, (DX) 30064 DECQ SI 30065 LEAQ (AX)(CX*4), AX 30066 LEAQ (DX)(BX*4), DX 30067 30068 check_limit: 30069 CMPQ SI, $0x00 30070 JHI loop 30071 RET 30072 30073 // func AmdAxpyPointerLoopX_V4A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 30074 // Requires: SSE 30075 TEXT ·AmdAxpyPointerLoopX_V4A9U8(SB), NOSPLIT, $0-48 30076 MOVSS alpha+0(FP), X0 30077 MOVQ xs+8(FP), AX 30078 MOVQ incx+16(FP), CX 30079 MOVQ ys+24(FP), DX 30080 MOVQ incy+32(FP), BX 30081 MOVQ n+40(FP), SI 30082 JMP check_limit_unroll 30083 PCALIGN $0x08 30084 NOP 30085 30086 loop_unroll: 30087 MOVSS (AX), X1 30088 MULSS X0, X1 30089 ADDSS (DX), X1 30090 MOVSS X1, (DX) 30091 LEAQ (AX)(CX*4), AX 30092 LEAQ (DX)(BX*4), DX 30093 MOVSS (AX), X1 30094 MULSS X0, X1 30095 ADDSS (DX), X1 30096 MOVSS X1, (DX) 30097 LEAQ (AX)(CX*4), AX 30098 LEAQ (DX)(BX*4), DX 30099 MOVSS (AX), X1 30100 MULSS X0, X1 30101 ADDSS (DX), X1 30102 MOVSS X1, (DX) 30103 LEAQ (AX)(CX*4), AX 30104 LEAQ (DX)(BX*4), DX 30105 MOVSS (AX), X1 30106 MULSS X0, X1 30107 ADDSS (DX), X1 30108 MOVSS X1, (DX) 30109 LEAQ (AX)(CX*4), AX 30110 LEAQ (DX)(BX*4), DX 30111 MOVSS (AX), X1 30112 MULSS X0, X1 30113 ADDSS (DX), X1 30114 MOVSS X1, (DX) 30115 LEAQ (AX)(CX*4), AX 30116 LEAQ (DX)(BX*4), DX 30117 MOVSS (AX), X1 30118 MULSS X0, X1 30119 ADDSS (DX), X1 30120 MOVSS X1, (DX) 30121 LEAQ (AX)(CX*4), AX 30122 LEAQ (DX)(BX*4), DX 30123 MOVSS (AX), X1 30124 
MULSS X0, X1 30125 ADDSS (DX), X1 30126 MOVSS X1, (DX) 30127 LEAQ (AX)(CX*4), AX 30128 LEAQ (DX)(BX*4), DX 30129 MOVSS (AX), X1 30130 MULSS X0, X1 30131 ADDSS (DX), X1 30132 MOVSS X1, (DX) 30133 LEAQ (AX)(CX*4), AX 30134 LEAQ (DX)(BX*4), DX 30135 SUBQ $0x08, SI 30136 30137 check_limit_unroll: 30138 CMPQ SI, $0x08 30139 JHS loop_unroll 30140 JMP check_limit 30141 30142 loop: 30143 MOVSS (AX), X1 30144 MULSS X0, X1 30145 ADDSS (DX), X1 30146 MOVSS X1, (DX) 30147 DECQ SI 30148 LEAQ (AX)(CX*4), AX 30149 LEAQ (DX)(BX*4), DX 30150 30151 check_limit: 30152 CMPQ SI, $0x00 30153 JHI loop 30154 RET 30155 30156 // func AmdAxpyPointerLoopX_V5A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 30157 // Requires: SSE 30158 TEXT ·AmdAxpyPointerLoopX_V5A9U8(SB), NOSPLIT, $0-48 30159 MOVSS alpha+0(FP), X0 30160 MOVQ xs+8(FP), AX 30161 MOVQ incx+16(FP), CX 30162 MOVQ ys+24(FP), DX 30163 MOVQ incy+32(FP), BX 30164 MOVQ n+40(FP), SI 30165 JMP check_limit_unroll 30166 PCALIGN $0x08 30167 NOP 30168 30169 loop_unroll: 30170 MOVSS (AX), X1 30171 MULSS X0, X1 30172 ADDSS (DX), X1 30173 MOVSS X1, (DX) 30174 LEAQ (AX)(CX*4), AX 30175 LEAQ (DX)(BX*4), DX 30176 MOVSS (AX), X1 30177 MULSS X0, X1 30178 ADDSS (DX), X1 30179 MOVSS X1, (DX) 30180 LEAQ (AX)(CX*4), AX 30181 LEAQ (DX)(BX*4), DX 30182 MOVSS (AX), X1 30183 MULSS X0, X1 30184 ADDSS (DX), X1 30185 MOVSS X1, (DX) 30186 LEAQ (AX)(CX*4), AX 30187 LEAQ (DX)(BX*4), DX 30188 MOVSS (AX), X1 30189 MULSS X0, X1 30190 ADDSS (DX), X1 30191 MOVSS X1, (DX) 30192 LEAQ (AX)(CX*4), AX 30193 LEAQ (DX)(BX*4), DX 30194 MOVSS (AX), X1 30195 MULSS X0, X1 30196 ADDSS (DX), X1 30197 MOVSS X1, (DX) 30198 LEAQ (AX)(CX*4), AX 30199 LEAQ (DX)(BX*4), DX 30200 MOVSS (AX), X1 30201 MULSS X0, X1 30202 ADDSS (DX), X1 30203 MOVSS X1, (DX) 30204 LEAQ (AX)(CX*4), AX 30205 LEAQ (DX)(BX*4), DX 30206 MOVSS (AX), X1 30207 MULSS X0, X1 30208 ADDSS (DX), X1 30209 MOVSS X1, (DX) 30210 LEAQ (AX)(CX*4), AX 30211 LEAQ (DX)(BX*4), DX 30212 MOVSS (AX), X1 30213 MULSS X0, X1 30214 ADDSS (DX), X1 30215 MOVSS X1, (DX) 30216 LEAQ (AX)(CX*4), AX 30217 LEAQ (DX)(BX*4), DX 30218 SUBQ $0x08, SI 30219 30220 check_limit_unroll: 30221 CMPQ SI, $0x08 30222 JHS loop_unroll 30223 JMP check_limit 30224 30225 loop: 30226 MOVSS (AX), X1 30227 MULSS X0, X1 30228 ADDSS (DX), X1 30229 MOVSS X1, (DX) 30230 DECQ SI 30231 LEAQ (AX)(CX*4), AX 30232 LEAQ (DX)(BX*4), DX 30233 30234 check_limit: 30235 CMPQ SI, $0x00 30236 JHI loop 30237 RET 30238 30239 // func AmdAxpyPointerLoopX_V0A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 30240 // Requires: SSE 30241 TEXT ·AmdAxpyPointerLoopX_V0A10U8(SB), NOSPLIT, $0-48 30242 MOVSS alpha+0(FP), X0 30243 MOVQ xs+8(FP), AX 30244 MOVQ incx+16(FP), CX 30245 MOVQ ys+24(FP), DX 30246 MOVQ incy+32(FP), BX 30247 MOVQ n+40(FP), SI 30248 JMP check_limit_unroll 30249 PCALIGN $0x08 30250 NOP 30251 NOP 30252 30253 loop_unroll: 30254 MOVSS (AX), X1 30255 MULSS X0, X1 30256 ADDSS (DX), X1 30257 MOVSS X1, (DX) 30258 LEAQ (AX)(CX*4), AX 30259 LEAQ (DX)(BX*4), DX 30260 MOVSS (AX), X1 30261 MULSS X0, X1 30262 ADDSS (DX), X1 30263 MOVSS X1, (DX) 30264 LEAQ (AX)(CX*4), AX 30265 LEAQ (DX)(BX*4), DX 30266 MOVSS (AX), X1 30267 MULSS X0, X1 30268 ADDSS (DX), X1 30269 MOVSS X1, (DX) 30270 LEAQ (AX)(CX*4), AX 30271 LEAQ (DX)(BX*4), DX 30272 MOVSS (AX), X1 30273 MULSS X0, X1 30274 ADDSS (DX), X1 30275 MOVSS X1, (DX) 30276 LEAQ (AX)(CX*4), AX 30277 LEAQ (DX)(BX*4), DX 30278 MOVSS (AX), X1 30279 MULSS X0, X1 30280 ADDSS (DX), X1 30281 MOVSS X1, (DX) 30282 
LEAQ (AX)(CX*4), AX 30283 LEAQ (DX)(BX*4), DX 30284 MOVSS (AX), X1 30285 MULSS X0, X1 30286 ADDSS (DX), X1 30287 MOVSS X1, (DX) 30288 LEAQ (AX)(CX*4), AX 30289 LEAQ (DX)(BX*4), DX 30290 MOVSS (AX), X1 30291 MULSS X0, X1 30292 ADDSS (DX), X1 30293 MOVSS X1, (DX) 30294 LEAQ (AX)(CX*4), AX 30295 LEAQ (DX)(BX*4), DX 30296 MOVSS (AX), X1 30297 MULSS X0, X1 30298 ADDSS (DX), X1 30299 MOVSS X1, (DX) 30300 LEAQ (AX)(CX*4), AX 30301 LEAQ (DX)(BX*4), DX 30302 SUBQ $0x08, SI 30303 30304 check_limit_unroll: 30305 CMPQ SI, $0x08 30306 JHS loop_unroll 30307 JMP check_limit 30308 30309 loop: 30310 MOVSS (AX), X1 30311 MULSS X0, X1 30312 ADDSS (DX), X1 30313 MOVSS X1, (DX) 30314 DECQ SI 30315 LEAQ (AX)(CX*4), AX 30316 LEAQ (DX)(BX*4), DX 30317 30318 check_limit: 30319 CMPQ SI, $0x00 30320 JHI loop 30321 RET 30322 30323 // func AmdAxpyPointerLoopX_V1A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 30324 // Requires: SSE 30325 TEXT ·AmdAxpyPointerLoopX_V1A10U8(SB), NOSPLIT, $0-48 30326 MOVSS alpha+0(FP), X0 30327 MOVQ xs+8(FP), AX 30328 MOVQ incx+16(FP), CX 30329 MOVQ ys+24(FP), DX 30330 MOVQ incy+32(FP), BX 30331 MOVQ n+40(FP), SI 30332 JMP check_limit_unroll 30333 PCALIGN $0x08 30334 NOP 30335 NOP 30336 30337 loop_unroll: 30338 MOVSS (AX), X1 30339 MULSS X0, X1 30340 ADDSS (DX), X1 30341 MOVSS X1, (DX) 30342 LEAQ (AX)(CX*4), AX 30343 LEAQ (DX)(BX*4), DX 30344 MOVSS (AX), X1 30345 MULSS X0, X1 30346 ADDSS (DX), X1 30347 MOVSS X1, (DX) 30348 LEAQ (AX)(CX*4), AX 30349 LEAQ (DX)(BX*4), DX 30350 MOVSS (AX), X1 30351 MULSS X0, X1 30352 ADDSS (DX), X1 30353 MOVSS X1, (DX) 30354 LEAQ (AX)(CX*4), AX 30355 LEAQ (DX)(BX*4), DX 30356 MOVSS (AX), X1 30357 MULSS X0, X1 30358 ADDSS (DX), X1 30359 MOVSS X1, (DX) 30360 LEAQ (AX)(CX*4), AX 30361 LEAQ (DX)(BX*4), DX 30362 MOVSS (AX), X1 30363 MULSS X0, X1 30364 ADDSS (DX), X1 30365 MOVSS X1, (DX) 30366 LEAQ (AX)(CX*4), AX 30367 LEAQ (DX)(BX*4), DX 30368 MOVSS (AX), X1 30369 MULSS X0, X1 30370 ADDSS (DX), X1 30371 MOVSS X1, (DX) 30372 LEAQ (AX)(CX*4), AX 30373 LEAQ (DX)(BX*4), DX 30374 MOVSS (AX), X1 30375 MULSS X0, X1 30376 ADDSS (DX), X1 30377 MOVSS X1, (DX) 30378 LEAQ (AX)(CX*4), AX 30379 LEAQ (DX)(BX*4), DX 30380 MOVSS (AX), X1 30381 MULSS X0, X1 30382 ADDSS (DX), X1 30383 MOVSS X1, (DX) 30384 LEAQ (AX)(CX*4), AX 30385 LEAQ (DX)(BX*4), DX 30386 SUBQ $0x08, SI 30387 30388 check_limit_unroll: 30389 CMPQ SI, $0x08 30390 JHS loop_unroll 30391 JMP check_limit 30392 30393 loop: 30394 MOVSS (AX), X1 30395 MULSS X0, X1 30396 ADDSS (DX), X1 30397 MOVSS X1, (DX) 30398 DECQ SI 30399 LEAQ (AX)(CX*4), AX 30400 LEAQ (DX)(BX*4), DX 30401 30402 check_limit: 30403 CMPQ SI, $0x00 30404 JHI loop 30405 RET 30406 30407 // func AmdAxpyPointerLoopX_V2A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 30408 // Requires: SSE 30409 TEXT ·AmdAxpyPointerLoopX_V2A10U8(SB), NOSPLIT, $0-48 30410 MOVSS alpha+0(FP), X0 30411 MOVQ xs+8(FP), AX 30412 MOVQ incx+16(FP), CX 30413 MOVQ ys+24(FP), DX 30414 MOVQ incy+32(FP), BX 30415 MOVQ n+40(FP), SI 30416 JMP check_limit_unroll 30417 PCALIGN $0x08 30418 NOP 30419 NOP 30420 30421 loop_unroll: 30422 MOVSS (AX), X1 30423 MULSS X0, X1 30424 ADDSS (DX), X1 30425 MOVSS X1, (DX) 30426 LEAQ (AX)(CX*4), AX 30427 LEAQ (DX)(BX*4), DX 30428 MOVSS (AX), X1 30429 MULSS X0, X1 30430 ADDSS (DX), X1 30431 MOVSS X1, (DX) 30432 LEAQ (AX)(CX*4), AX 30433 LEAQ (DX)(BX*4), DX 30434 MOVSS (AX), X1 30435 MULSS X0, X1 30436 ADDSS (DX), X1 30437 MOVSS X1, (DX) 30438 LEAQ (AX)(CX*4), AX 30439 LEAQ (DX)(BX*4), DX 
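// Illustrative reference (editorial sketch, not part of the generated output):
// every kernel in this file computes the same strided single-precision AXPY
// update, ys[i*incy] += alpha * xs[i*incx] for i in [0, n). A minimal Go
// equivalent, assuming xs and ys expose at least n strided elements, would be:
//
//	func axpyRef(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
//		for i := uintptr(0); i < n; i++ {
//			// same MOVSS/MULSS/ADDSS/MOVSS sequence as the kernels above
//			ys[i*incy] += alpha * xs[i*incx]
//		}
//	}
//
// The assembly walks raw pointers instead of slices: LEAQ (AX)(CX*4), AX and
// LEAQ (DX)(BX*4), DX advance x and y by incx and incy elements of 4 bytes each.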
30440 MOVSS (AX), X1 30441 MULSS X0, X1 30442 ADDSS (DX), X1 30443 MOVSS X1, (DX) 30444 LEAQ (AX)(CX*4), AX 30445 LEAQ (DX)(BX*4), DX 30446 MOVSS (AX), X1 30447 MULSS X0, X1 30448 ADDSS (DX), X1 30449 MOVSS X1, (DX) 30450 LEAQ (AX)(CX*4), AX 30451 LEAQ (DX)(BX*4), DX 30452 MOVSS (AX), X1 30453 MULSS X0, X1 30454 ADDSS (DX), X1 30455 MOVSS X1, (DX) 30456 LEAQ (AX)(CX*4), AX 30457 LEAQ (DX)(BX*4), DX 30458 MOVSS (AX), X1 30459 MULSS X0, X1 30460 ADDSS (DX), X1 30461 MOVSS X1, (DX) 30462 LEAQ (AX)(CX*4), AX 30463 LEAQ (DX)(BX*4), DX 30464 MOVSS (AX), X1 30465 MULSS X0, X1 30466 ADDSS (DX), X1 30467 MOVSS X1, (DX) 30468 LEAQ (AX)(CX*4), AX 30469 LEAQ (DX)(BX*4), DX 30470 SUBQ $0x08, SI 30471 30472 check_limit_unroll: 30473 CMPQ SI, $0x08 30474 JHS loop_unroll 30475 JMP check_limit 30476 30477 loop: 30478 MOVSS (AX), X1 30479 MULSS X0, X1 30480 ADDSS (DX), X1 30481 MOVSS X1, (DX) 30482 DECQ SI 30483 LEAQ (AX)(CX*4), AX 30484 LEAQ (DX)(BX*4), DX 30485 30486 check_limit: 30487 CMPQ SI, $0x00 30488 JHI loop 30489 RET 30490 30491 // func AmdAxpyPointerLoopX_V3A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 30492 // Requires: SSE 30493 TEXT ·AmdAxpyPointerLoopX_V3A10U8(SB), NOSPLIT, $0-48 30494 MOVSS alpha+0(FP), X0 30495 MOVQ xs+8(FP), AX 30496 MOVQ incx+16(FP), CX 30497 MOVQ ys+24(FP), DX 30498 MOVQ incy+32(FP), BX 30499 MOVQ n+40(FP), SI 30500 JMP check_limit_unroll 30501 PCALIGN $0x08 30502 NOP 30503 NOP 30504 30505 loop_unroll: 30506 MOVSS (AX), X1 30507 MULSS X0, X1 30508 ADDSS (DX), X1 30509 MOVSS X1, (DX) 30510 LEAQ (AX)(CX*4), AX 30511 LEAQ (DX)(BX*4), DX 30512 MOVSS (AX), X1 30513 MULSS X0, X1 30514 ADDSS (DX), X1 30515 MOVSS X1, (DX) 30516 LEAQ (AX)(CX*4), AX 30517 LEAQ (DX)(BX*4), DX 30518 MOVSS (AX), X1 30519 MULSS X0, X1 30520 ADDSS (DX), X1 30521 MOVSS X1, (DX) 30522 LEAQ (AX)(CX*4), AX 30523 LEAQ (DX)(BX*4), DX 30524 MOVSS (AX), X1 30525 MULSS X0, X1 30526 ADDSS (DX), X1 30527 MOVSS X1, (DX) 30528 LEAQ (AX)(CX*4), AX 30529 LEAQ (DX)(BX*4), DX 30530 MOVSS (AX), X1 30531 MULSS X0, X1 30532 ADDSS (DX), X1 30533 MOVSS X1, (DX) 30534 LEAQ (AX)(CX*4), AX 30535 LEAQ (DX)(BX*4), DX 30536 MOVSS (AX), X1 30537 MULSS X0, X1 30538 ADDSS (DX), X1 30539 MOVSS X1, (DX) 30540 LEAQ (AX)(CX*4), AX 30541 LEAQ (DX)(BX*4), DX 30542 MOVSS (AX), X1 30543 MULSS X0, X1 30544 ADDSS (DX), X1 30545 MOVSS X1, (DX) 30546 LEAQ (AX)(CX*4), AX 30547 LEAQ (DX)(BX*4), DX 30548 MOVSS (AX), X1 30549 MULSS X0, X1 30550 ADDSS (DX), X1 30551 MOVSS X1, (DX) 30552 LEAQ (AX)(CX*4), AX 30553 LEAQ (DX)(BX*4), DX 30554 SUBQ $0x08, SI 30555 30556 check_limit_unroll: 30557 CMPQ SI, $0x08 30558 JHS loop_unroll 30559 JMP check_limit 30560 30561 loop: 30562 MOVSS (AX), X1 30563 MULSS X0, X1 30564 ADDSS (DX), X1 30565 MOVSS X1, (DX) 30566 DECQ SI 30567 LEAQ (AX)(CX*4), AX 30568 LEAQ (DX)(BX*4), DX 30569 30570 check_limit: 30571 CMPQ SI, $0x00 30572 JHI loop 30573 RET 30574 30575 // func AmdAxpyPointerLoopX_V4A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 30576 // Requires: SSE 30577 TEXT ·AmdAxpyPointerLoopX_V4A10U8(SB), NOSPLIT, $0-48 30578 MOVSS alpha+0(FP), X0 30579 MOVQ xs+8(FP), AX 30580 MOVQ incx+16(FP), CX 30581 MOVQ ys+24(FP), DX 30582 MOVQ incy+32(FP), BX 30583 MOVQ n+40(FP), SI 30584 JMP check_limit_unroll 30585 PCALIGN $0x08 30586 NOP 30587 NOP 30588 30589 loop_unroll: 30590 MOVSS (AX), X1 30591 MULSS X0, X1 30592 ADDSS (DX), X1 30593 MOVSS X1, (DX) 30594 LEAQ (AX)(CX*4), AX 30595 LEAQ (DX)(BX*4), DX 30596 MOVSS (AX), X1 30597 MULSS X0, X1 30598 
ADDSS (DX), X1 30599 MOVSS X1, (DX) 30600 LEAQ (AX)(CX*4), AX 30601 LEAQ (DX)(BX*4), DX 30602 MOVSS (AX), X1 30603 MULSS X0, X1 30604 ADDSS (DX), X1 30605 MOVSS X1, (DX) 30606 LEAQ (AX)(CX*4), AX 30607 LEAQ (DX)(BX*4), DX 30608 MOVSS (AX), X1 30609 MULSS X0, X1 30610 ADDSS (DX), X1 30611 MOVSS X1, (DX) 30612 LEAQ (AX)(CX*4), AX 30613 LEAQ (DX)(BX*4), DX 30614 MOVSS (AX), X1 30615 MULSS X0, X1 30616 ADDSS (DX), X1 30617 MOVSS X1, (DX) 30618 LEAQ (AX)(CX*4), AX 30619 LEAQ (DX)(BX*4), DX 30620 MOVSS (AX), X1 30621 MULSS X0, X1 30622 ADDSS (DX), X1 30623 MOVSS X1, (DX) 30624 LEAQ (AX)(CX*4), AX 30625 LEAQ (DX)(BX*4), DX 30626 MOVSS (AX), X1 30627 MULSS X0, X1 30628 ADDSS (DX), X1 30629 MOVSS X1, (DX) 30630 LEAQ (AX)(CX*4), AX 30631 LEAQ (DX)(BX*4), DX 30632 MOVSS (AX), X1 30633 MULSS X0, X1 30634 ADDSS (DX), X1 30635 MOVSS X1, (DX) 30636 LEAQ (AX)(CX*4), AX 30637 LEAQ (DX)(BX*4), DX 30638 SUBQ $0x08, SI 30639 30640 check_limit_unroll: 30641 CMPQ SI, $0x08 30642 JHS loop_unroll 30643 JMP check_limit 30644 30645 loop: 30646 MOVSS (AX), X1 30647 MULSS X0, X1 30648 ADDSS (DX), X1 30649 MOVSS X1, (DX) 30650 DECQ SI 30651 LEAQ (AX)(CX*4), AX 30652 LEAQ (DX)(BX*4), DX 30653 30654 check_limit: 30655 CMPQ SI, $0x00 30656 JHI loop 30657 RET 30658 30659 // func AmdAxpyPointerLoopX_V5A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 30660 // Requires: SSE 30661 TEXT ·AmdAxpyPointerLoopX_V5A10U8(SB), NOSPLIT, $0-48 30662 MOVSS alpha+0(FP), X0 30663 MOVQ xs+8(FP), AX 30664 MOVQ incx+16(FP), CX 30665 MOVQ ys+24(FP), DX 30666 MOVQ incy+32(FP), BX 30667 MOVQ n+40(FP), SI 30668 JMP check_limit_unroll 30669 PCALIGN $0x08 30670 NOP 30671 NOP 30672 30673 loop_unroll: 30674 MOVSS (AX), X1 30675 MULSS X0, X1 30676 ADDSS (DX), X1 30677 MOVSS X1, (DX) 30678 LEAQ (AX)(CX*4), AX 30679 LEAQ (DX)(BX*4), DX 30680 MOVSS (AX), X1 30681 MULSS X0, X1 30682 ADDSS (DX), X1 30683 MOVSS X1, (DX) 30684 LEAQ (AX)(CX*4), AX 30685 LEAQ (DX)(BX*4), DX 30686 MOVSS (AX), X1 30687 MULSS X0, X1 30688 ADDSS (DX), X1 30689 MOVSS X1, (DX) 30690 LEAQ (AX)(CX*4), AX 30691 LEAQ (DX)(BX*4), DX 30692 MOVSS (AX), X1 30693 MULSS X0, X1 30694 ADDSS (DX), X1 30695 MOVSS X1, (DX) 30696 LEAQ (AX)(CX*4), AX 30697 LEAQ (DX)(BX*4), DX 30698 MOVSS (AX), X1 30699 MULSS X0, X1 30700 ADDSS (DX), X1 30701 MOVSS X1, (DX) 30702 LEAQ (AX)(CX*4), AX 30703 LEAQ (DX)(BX*4), DX 30704 MOVSS (AX), X1 30705 MULSS X0, X1 30706 ADDSS (DX), X1 30707 MOVSS X1, (DX) 30708 LEAQ (AX)(CX*4), AX 30709 LEAQ (DX)(BX*4), DX 30710 MOVSS (AX), X1 30711 MULSS X0, X1 30712 ADDSS (DX), X1 30713 MOVSS X1, (DX) 30714 LEAQ (AX)(CX*4), AX 30715 LEAQ (DX)(BX*4), DX 30716 MOVSS (AX), X1 30717 MULSS X0, X1 30718 ADDSS (DX), X1 30719 MOVSS X1, (DX) 30720 LEAQ (AX)(CX*4), AX 30721 LEAQ (DX)(BX*4), DX 30722 SUBQ $0x08, SI 30723 30724 check_limit_unroll: 30725 CMPQ SI, $0x08 30726 JHS loop_unroll 30727 JMP check_limit 30728 30729 loop: 30730 MOVSS (AX), X1 30731 MULSS X0, X1 30732 ADDSS (DX), X1 30733 MOVSS X1, (DX) 30734 DECQ SI 30735 LEAQ (AX)(CX*4), AX 30736 LEAQ (DX)(BX*4), DX 30737 30738 check_limit: 30739 CMPQ SI, $0x00 30740 JHI loop 30741 RET 30742 30743 // func AmdAxpyPointerLoopX_V0A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 30744 // Requires: SSE 30745 TEXT ·AmdAxpyPointerLoopX_V0A11U8(SB), NOSPLIT, $0-48 30746 MOVSS alpha+0(FP), X0 30747 MOVQ xs+8(FP), AX 30748 MOVQ incx+16(FP), CX 30749 MOVQ ys+24(FP), DX 30750 MOVQ incy+32(FP), BX 30751 MOVQ n+40(FP), SI 30752 JMP check_limit_unroll 30753 PCALIGN $0x08 30754 
    NOP
    NOP
    NOP

loop_unroll:
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    SUBQ $0x08, SI

check_limit_unroll:
    CMPQ SI, $0x08
    JHS loop_unroll
    JMP check_limit

loop:
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    DECQ SI
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX

check_limit:
    CMPQ SI, $0x00
    JHI loop
    RET

// func AmdAxpyPointerLoopX_V1A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V1A11U8(SB), NOSPLIT, $0-48
    MOVSS alpha+0(FP), X0
    MOVQ xs+8(FP), AX
    MOVQ incx+16(FP), CX
    MOVQ ys+24(FP), DX
    MOVQ incy+32(FP), BX
    MOVQ n+40(FP), SI
    JMP check_limit_unroll
    PCALIGN $0x08
    NOP
    NOP
    NOP

loop_unroll:
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX
    SUBQ $0x08, SI

check_limit_unroll:
    CMPQ SI, $0x08
    JHS loop_unroll
    JMP check_limit

loop:
    MOVSS (AX), X1
    MULSS X0, X1
    ADDSS (DX), X1
    MOVSS X1, (DX)
    DECQ SI
    LEAQ (AX)(CX*4), AX
    LEAQ (DX)(BX*4), DX

check_limit:
    CMPQ SI, $0x00
    JHI loop
    RET

// func AmdAxpyPointerLoopX_V2A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT
·AmdAxpyPointerLoopX_V2A11U8(SB), NOSPLIT, $0-48 30916 MOVSS alpha+0(FP), X0 30917 MOVQ xs+8(FP), AX 30918 MOVQ incx+16(FP), CX 30919 MOVQ ys+24(FP), DX 30920 MOVQ incy+32(FP), BX 30921 MOVQ n+40(FP), SI 30922 JMP check_limit_unroll 30923 PCALIGN $0x08 30924 NOP 30925 NOP 30926 NOP 30927 30928 loop_unroll: 30929 MOVSS (AX), X1 30930 MULSS X0, X1 30931 ADDSS (DX), X1 30932 MOVSS X1, (DX) 30933 LEAQ (AX)(CX*4), AX 30934 LEAQ (DX)(BX*4), DX 30935 MOVSS (AX), X1 30936 MULSS X0, X1 30937 ADDSS (DX), X1 30938 MOVSS X1, (DX) 30939 LEAQ (AX)(CX*4), AX 30940 LEAQ (DX)(BX*4), DX 30941 MOVSS (AX), X1 30942 MULSS X0, X1 30943 ADDSS (DX), X1 30944 MOVSS X1, (DX) 30945 LEAQ (AX)(CX*4), AX 30946 LEAQ (DX)(BX*4), DX 30947 MOVSS (AX), X1 30948 MULSS X0, X1 30949 ADDSS (DX), X1 30950 MOVSS X1, (DX) 30951 LEAQ (AX)(CX*4), AX 30952 LEAQ (DX)(BX*4), DX 30953 MOVSS (AX), X1 30954 MULSS X0, X1 30955 ADDSS (DX), X1 30956 MOVSS X1, (DX) 30957 LEAQ (AX)(CX*4), AX 30958 LEAQ (DX)(BX*4), DX 30959 MOVSS (AX), X1 30960 MULSS X0, X1 30961 ADDSS (DX), X1 30962 MOVSS X1, (DX) 30963 LEAQ (AX)(CX*4), AX 30964 LEAQ (DX)(BX*4), DX 30965 MOVSS (AX), X1 30966 MULSS X0, X1 30967 ADDSS (DX), X1 30968 MOVSS X1, (DX) 30969 LEAQ (AX)(CX*4), AX 30970 LEAQ (DX)(BX*4), DX 30971 MOVSS (AX), X1 30972 MULSS X0, X1 30973 ADDSS (DX), X1 30974 MOVSS X1, (DX) 30975 LEAQ (AX)(CX*4), AX 30976 LEAQ (DX)(BX*4), DX 30977 SUBQ $0x08, SI 30978 30979 check_limit_unroll: 30980 CMPQ SI, $0x08 30981 JHS loop_unroll 30982 JMP check_limit 30983 30984 loop: 30985 MOVSS (AX), X1 30986 MULSS X0, X1 30987 ADDSS (DX), X1 30988 MOVSS X1, (DX) 30989 DECQ SI 30990 LEAQ (AX)(CX*4), AX 30991 LEAQ (DX)(BX*4), DX 30992 30993 check_limit: 30994 CMPQ SI, $0x00 30995 JHI loop 30996 RET 30997 30998 // func AmdAxpyPointerLoopX_V3A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 30999 // Requires: SSE 31000 TEXT ·AmdAxpyPointerLoopX_V3A11U8(SB), NOSPLIT, $0-48 31001 MOVSS alpha+0(FP), X0 31002 MOVQ xs+8(FP), AX 31003 MOVQ incx+16(FP), CX 31004 MOVQ ys+24(FP), DX 31005 MOVQ incy+32(FP), BX 31006 MOVQ n+40(FP), SI 31007 JMP check_limit_unroll 31008 PCALIGN $0x08 31009 NOP 31010 NOP 31011 NOP 31012 31013 loop_unroll: 31014 MOVSS (AX), X1 31015 MULSS X0, X1 31016 ADDSS (DX), X1 31017 MOVSS X1, (DX) 31018 LEAQ (AX)(CX*4), AX 31019 LEAQ (DX)(BX*4), DX 31020 MOVSS (AX), X1 31021 MULSS X0, X1 31022 ADDSS (DX), X1 31023 MOVSS X1, (DX) 31024 LEAQ (AX)(CX*4), AX 31025 LEAQ (DX)(BX*4), DX 31026 MOVSS (AX), X1 31027 MULSS X0, X1 31028 ADDSS (DX), X1 31029 MOVSS X1, (DX) 31030 LEAQ (AX)(CX*4), AX 31031 LEAQ (DX)(BX*4), DX 31032 MOVSS (AX), X1 31033 MULSS X0, X1 31034 ADDSS (DX), X1 31035 MOVSS X1, (DX) 31036 LEAQ (AX)(CX*4), AX 31037 LEAQ (DX)(BX*4), DX 31038 MOVSS (AX), X1 31039 MULSS X0, X1 31040 ADDSS (DX), X1 31041 MOVSS X1, (DX) 31042 LEAQ (AX)(CX*4), AX 31043 LEAQ (DX)(BX*4), DX 31044 MOVSS (AX), X1 31045 MULSS X0, X1 31046 ADDSS (DX), X1 31047 MOVSS X1, (DX) 31048 LEAQ (AX)(CX*4), AX 31049 LEAQ (DX)(BX*4), DX 31050 MOVSS (AX), X1 31051 MULSS X0, X1 31052 ADDSS (DX), X1 31053 MOVSS X1, (DX) 31054 LEAQ (AX)(CX*4), AX 31055 LEAQ (DX)(BX*4), DX 31056 MOVSS (AX), X1 31057 MULSS X0, X1 31058 ADDSS (DX), X1 31059 MOVSS X1, (DX) 31060 LEAQ (AX)(CX*4), AX 31061 LEAQ (DX)(BX*4), DX 31062 SUBQ $0x08, SI 31063 31064 check_limit_unroll: 31065 CMPQ SI, $0x08 31066 JHS loop_unroll 31067 JMP check_limit 31068 31069 loop: 31070 MOVSS (AX), X1 31071 MULSS X0, X1 31072 ADDSS (DX), X1 31073 MOVSS X1, (DX) 31074 DECQ SI 31075 LEAQ (AX)(CX*4), AX 31076 LEAQ 
(DX)(BX*4), DX 31077 31078 check_limit: 31079 CMPQ SI, $0x00 31080 JHI loop 31081 RET 31082 31083 // func AmdAxpyPointerLoopX_V4A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 31084 // Requires: SSE 31085 TEXT ·AmdAxpyPointerLoopX_V4A11U8(SB), NOSPLIT, $0-48 31086 MOVSS alpha+0(FP), X0 31087 MOVQ xs+8(FP), AX 31088 MOVQ incx+16(FP), CX 31089 MOVQ ys+24(FP), DX 31090 MOVQ incy+32(FP), BX 31091 MOVQ n+40(FP), SI 31092 JMP check_limit_unroll 31093 PCALIGN $0x08 31094 NOP 31095 NOP 31096 NOP 31097 31098 loop_unroll: 31099 MOVSS (AX), X1 31100 MULSS X0, X1 31101 ADDSS (DX), X1 31102 MOVSS X1, (DX) 31103 LEAQ (AX)(CX*4), AX 31104 LEAQ (DX)(BX*4), DX 31105 MOVSS (AX), X1 31106 MULSS X0, X1 31107 ADDSS (DX), X1 31108 MOVSS X1, (DX) 31109 LEAQ (AX)(CX*4), AX 31110 LEAQ (DX)(BX*4), DX 31111 MOVSS (AX), X1 31112 MULSS X0, X1 31113 ADDSS (DX), X1 31114 MOVSS X1, (DX) 31115 LEAQ (AX)(CX*4), AX 31116 LEAQ (DX)(BX*4), DX 31117 MOVSS (AX), X1 31118 MULSS X0, X1 31119 ADDSS (DX), X1 31120 MOVSS X1, (DX) 31121 LEAQ (AX)(CX*4), AX 31122 LEAQ (DX)(BX*4), DX 31123 MOVSS (AX), X1 31124 MULSS X0, X1 31125 ADDSS (DX), X1 31126 MOVSS X1, (DX) 31127 LEAQ (AX)(CX*4), AX 31128 LEAQ (DX)(BX*4), DX 31129 MOVSS (AX), X1 31130 MULSS X0, X1 31131 ADDSS (DX), X1 31132 MOVSS X1, (DX) 31133 LEAQ (AX)(CX*4), AX 31134 LEAQ (DX)(BX*4), DX 31135 MOVSS (AX), X1 31136 MULSS X0, X1 31137 ADDSS (DX), X1 31138 MOVSS X1, (DX) 31139 LEAQ (AX)(CX*4), AX 31140 LEAQ (DX)(BX*4), DX 31141 MOVSS (AX), X1 31142 MULSS X0, X1 31143 ADDSS (DX), X1 31144 MOVSS X1, (DX) 31145 LEAQ (AX)(CX*4), AX 31146 LEAQ (DX)(BX*4), DX 31147 SUBQ $0x08, SI 31148 31149 check_limit_unroll: 31150 CMPQ SI, $0x08 31151 JHS loop_unroll 31152 JMP check_limit 31153 31154 loop: 31155 MOVSS (AX), X1 31156 MULSS X0, X1 31157 ADDSS (DX), X1 31158 MOVSS X1, (DX) 31159 DECQ SI 31160 LEAQ (AX)(CX*4), AX 31161 LEAQ (DX)(BX*4), DX 31162 31163 check_limit: 31164 CMPQ SI, $0x00 31165 JHI loop 31166 RET 31167 31168 // func AmdAxpyPointerLoopX_V5A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 31169 // Requires: SSE 31170 TEXT ·AmdAxpyPointerLoopX_V5A11U8(SB), NOSPLIT, $0-48 31171 MOVSS alpha+0(FP), X0 31172 MOVQ xs+8(FP), AX 31173 MOVQ incx+16(FP), CX 31174 MOVQ ys+24(FP), DX 31175 MOVQ incy+32(FP), BX 31176 MOVQ n+40(FP), SI 31177 JMP check_limit_unroll 31178 PCALIGN $0x08 31179 NOP 31180 NOP 31181 NOP 31182 31183 loop_unroll: 31184 MOVSS (AX), X1 31185 MULSS X0, X1 31186 ADDSS (DX), X1 31187 MOVSS X1, (DX) 31188 LEAQ (AX)(CX*4), AX 31189 LEAQ (DX)(BX*4), DX 31190 MOVSS (AX), X1 31191 MULSS X0, X1 31192 ADDSS (DX), X1 31193 MOVSS X1, (DX) 31194 LEAQ (AX)(CX*4), AX 31195 LEAQ (DX)(BX*4), DX 31196 MOVSS (AX), X1 31197 MULSS X0, X1 31198 ADDSS (DX), X1 31199 MOVSS X1, (DX) 31200 LEAQ (AX)(CX*4), AX 31201 LEAQ (DX)(BX*4), DX 31202 MOVSS (AX), X1 31203 MULSS X0, X1 31204 ADDSS (DX), X1 31205 MOVSS X1, (DX) 31206 LEAQ (AX)(CX*4), AX 31207 LEAQ (DX)(BX*4), DX 31208 MOVSS (AX), X1 31209 MULSS X0, X1 31210 ADDSS (DX), X1 31211 MOVSS X1, (DX) 31212 LEAQ (AX)(CX*4), AX 31213 LEAQ (DX)(BX*4), DX 31214 MOVSS (AX), X1 31215 MULSS X0, X1 31216 ADDSS (DX), X1 31217 MOVSS X1, (DX) 31218 LEAQ (AX)(CX*4), AX 31219 LEAQ (DX)(BX*4), DX 31220 MOVSS (AX), X1 31221 MULSS X0, X1 31222 ADDSS (DX), X1 31223 MOVSS X1, (DX) 31224 LEAQ (AX)(CX*4), AX 31225 LEAQ (DX)(BX*4), DX 31226 MOVSS (AX), X1 31227 MULSS X0, X1 31228 ADDSS (DX), X1 31229 MOVSS X1, (DX) 31230 LEAQ (AX)(CX*4), AX 31231 LEAQ (DX)(BX*4), DX 31232 SUBQ $0x08, SI 31233 
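// Editorial note on the generated pattern (not avo output): the U8 kernels run
// an 8x-unrolled main loop followed by a scalar tail, roughly (pseudocode):
//
//	for n >= 8 { do 8 updates; n -= 8 }   // loop_unroll / check_limit_unroll
//	for n > 0  { do 1 update;  n--     }  // loop / check_limit
//
// The A<k> suffix appears to select how many NOP bytes are emitted after
// PCALIGN $0x08 to nudge the loop entry offset (the A11 variants above pad
// with three NOPs, the A12 variants below with four), while V0..V5 are
// otherwise identical copies, presumably so code-placement effects can be
// averaged out when the variants are compared.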
31234 check_limit_unroll: 31235 CMPQ SI, $0x08 31236 JHS loop_unroll 31237 JMP check_limit 31238 31239 loop: 31240 MOVSS (AX), X1 31241 MULSS X0, X1 31242 ADDSS (DX), X1 31243 MOVSS X1, (DX) 31244 DECQ SI 31245 LEAQ (AX)(CX*4), AX 31246 LEAQ (DX)(BX*4), DX 31247 31248 check_limit: 31249 CMPQ SI, $0x00 31250 JHI loop 31251 RET 31252 31253 // func AmdAxpyPointerLoopX_V0A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 31254 // Requires: SSE 31255 TEXT ·AmdAxpyPointerLoopX_V0A12U8(SB), NOSPLIT, $0-48 31256 MOVSS alpha+0(FP), X0 31257 MOVQ xs+8(FP), AX 31258 MOVQ incx+16(FP), CX 31259 MOVQ ys+24(FP), DX 31260 MOVQ incy+32(FP), BX 31261 MOVQ n+40(FP), SI 31262 JMP check_limit_unroll 31263 PCALIGN $0x08 31264 NOP 31265 NOP 31266 NOP 31267 NOP 31268 31269 loop_unroll: 31270 MOVSS (AX), X1 31271 MULSS X0, X1 31272 ADDSS (DX), X1 31273 MOVSS X1, (DX) 31274 LEAQ (AX)(CX*4), AX 31275 LEAQ (DX)(BX*4), DX 31276 MOVSS (AX), X1 31277 MULSS X0, X1 31278 ADDSS (DX), X1 31279 MOVSS X1, (DX) 31280 LEAQ (AX)(CX*4), AX 31281 LEAQ (DX)(BX*4), DX 31282 MOVSS (AX), X1 31283 MULSS X0, X1 31284 ADDSS (DX), X1 31285 MOVSS X1, (DX) 31286 LEAQ (AX)(CX*4), AX 31287 LEAQ (DX)(BX*4), DX 31288 MOVSS (AX), X1 31289 MULSS X0, X1 31290 ADDSS (DX), X1 31291 MOVSS X1, (DX) 31292 LEAQ (AX)(CX*4), AX 31293 LEAQ (DX)(BX*4), DX 31294 MOVSS (AX), X1 31295 MULSS X0, X1 31296 ADDSS (DX), X1 31297 MOVSS X1, (DX) 31298 LEAQ (AX)(CX*4), AX 31299 LEAQ (DX)(BX*4), DX 31300 MOVSS (AX), X1 31301 MULSS X0, X1 31302 ADDSS (DX), X1 31303 MOVSS X1, (DX) 31304 LEAQ (AX)(CX*4), AX 31305 LEAQ (DX)(BX*4), DX 31306 MOVSS (AX), X1 31307 MULSS X0, X1 31308 ADDSS (DX), X1 31309 MOVSS X1, (DX) 31310 LEAQ (AX)(CX*4), AX 31311 LEAQ (DX)(BX*4), DX 31312 MOVSS (AX), X1 31313 MULSS X0, X1 31314 ADDSS (DX), X1 31315 MOVSS X1, (DX) 31316 LEAQ (AX)(CX*4), AX 31317 LEAQ (DX)(BX*4), DX 31318 SUBQ $0x08, SI 31319 31320 check_limit_unroll: 31321 CMPQ SI, $0x08 31322 JHS loop_unroll 31323 JMP check_limit 31324 31325 loop: 31326 MOVSS (AX), X1 31327 MULSS X0, X1 31328 ADDSS (DX), X1 31329 MOVSS X1, (DX) 31330 DECQ SI 31331 LEAQ (AX)(CX*4), AX 31332 LEAQ (DX)(BX*4), DX 31333 31334 check_limit: 31335 CMPQ SI, $0x00 31336 JHI loop 31337 RET 31338 31339 // func AmdAxpyPointerLoopX_V1A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 31340 // Requires: SSE 31341 TEXT ·AmdAxpyPointerLoopX_V1A12U8(SB), NOSPLIT, $0-48 31342 MOVSS alpha+0(FP), X0 31343 MOVQ xs+8(FP), AX 31344 MOVQ incx+16(FP), CX 31345 MOVQ ys+24(FP), DX 31346 MOVQ incy+32(FP), BX 31347 MOVQ n+40(FP), SI 31348 JMP check_limit_unroll 31349 PCALIGN $0x08 31350 NOP 31351 NOP 31352 NOP 31353 NOP 31354 31355 loop_unroll: 31356 MOVSS (AX), X1 31357 MULSS X0, X1 31358 ADDSS (DX), X1 31359 MOVSS X1, (DX) 31360 LEAQ (AX)(CX*4), AX 31361 LEAQ (DX)(BX*4), DX 31362 MOVSS (AX), X1 31363 MULSS X0, X1 31364 ADDSS (DX), X1 31365 MOVSS X1, (DX) 31366 LEAQ (AX)(CX*4), AX 31367 LEAQ (DX)(BX*4), DX 31368 MOVSS (AX), X1 31369 MULSS X0, X1 31370 ADDSS (DX), X1 31371 MOVSS X1, (DX) 31372 LEAQ (AX)(CX*4), AX 31373 LEAQ (DX)(BX*4), DX 31374 MOVSS (AX), X1 31375 MULSS X0, X1 31376 ADDSS (DX), X1 31377 MOVSS X1, (DX) 31378 LEAQ (AX)(CX*4), AX 31379 LEAQ (DX)(BX*4), DX 31380 MOVSS (AX), X1 31381 MULSS X0, X1 31382 ADDSS (DX), X1 31383 MOVSS X1, (DX) 31384 LEAQ (AX)(CX*4), AX 31385 LEAQ (DX)(BX*4), DX 31386 MOVSS (AX), X1 31387 MULSS X0, X1 31388 ADDSS (DX), X1 31389 MOVSS X1, (DX) 31390 LEAQ (AX)(CX*4), AX 31391 LEAQ (DX)(BX*4), DX 31392 MOVSS (AX), X1 31393 MULSS 
X0, X1 31394 ADDSS (DX), X1 31395 MOVSS X1, (DX) 31396 LEAQ (AX)(CX*4), AX 31397 LEAQ (DX)(BX*4), DX 31398 MOVSS (AX), X1 31399 MULSS X0, X1 31400 ADDSS (DX), X1 31401 MOVSS X1, (DX) 31402 LEAQ (AX)(CX*4), AX 31403 LEAQ (DX)(BX*4), DX 31404 SUBQ $0x08, SI 31405 31406 check_limit_unroll: 31407 CMPQ SI, $0x08 31408 JHS loop_unroll 31409 JMP check_limit 31410 31411 loop: 31412 MOVSS (AX), X1 31413 MULSS X0, X1 31414 ADDSS (DX), X1 31415 MOVSS X1, (DX) 31416 DECQ SI 31417 LEAQ (AX)(CX*4), AX 31418 LEAQ (DX)(BX*4), DX 31419 31420 check_limit: 31421 CMPQ SI, $0x00 31422 JHI loop 31423 RET 31424 31425 // func AmdAxpyPointerLoopX_V2A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 31426 // Requires: SSE 31427 TEXT ·AmdAxpyPointerLoopX_V2A12U8(SB), NOSPLIT, $0-48 31428 MOVSS alpha+0(FP), X0 31429 MOVQ xs+8(FP), AX 31430 MOVQ incx+16(FP), CX 31431 MOVQ ys+24(FP), DX 31432 MOVQ incy+32(FP), BX 31433 MOVQ n+40(FP), SI 31434 JMP check_limit_unroll 31435 PCALIGN $0x08 31436 NOP 31437 NOP 31438 NOP 31439 NOP 31440 31441 loop_unroll: 31442 MOVSS (AX), X1 31443 MULSS X0, X1 31444 ADDSS (DX), X1 31445 MOVSS X1, (DX) 31446 LEAQ (AX)(CX*4), AX 31447 LEAQ (DX)(BX*4), DX 31448 MOVSS (AX), X1 31449 MULSS X0, X1 31450 ADDSS (DX), X1 31451 MOVSS X1, (DX) 31452 LEAQ (AX)(CX*4), AX 31453 LEAQ (DX)(BX*4), DX 31454 MOVSS (AX), X1 31455 MULSS X0, X1 31456 ADDSS (DX), X1 31457 MOVSS X1, (DX) 31458 LEAQ (AX)(CX*4), AX 31459 LEAQ (DX)(BX*4), DX 31460 MOVSS (AX), X1 31461 MULSS X0, X1 31462 ADDSS (DX), X1 31463 MOVSS X1, (DX) 31464 LEAQ (AX)(CX*4), AX 31465 LEAQ (DX)(BX*4), DX 31466 MOVSS (AX), X1 31467 MULSS X0, X1 31468 ADDSS (DX), X1 31469 MOVSS X1, (DX) 31470 LEAQ (AX)(CX*4), AX 31471 LEAQ (DX)(BX*4), DX 31472 MOVSS (AX), X1 31473 MULSS X0, X1 31474 ADDSS (DX), X1 31475 MOVSS X1, (DX) 31476 LEAQ (AX)(CX*4), AX 31477 LEAQ (DX)(BX*4), DX 31478 MOVSS (AX), X1 31479 MULSS X0, X1 31480 ADDSS (DX), X1 31481 MOVSS X1, (DX) 31482 LEAQ (AX)(CX*4), AX 31483 LEAQ (DX)(BX*4), DX 31484 MOVSS (AX), X1 31485 MULSS X0, X1 31486 ADDSS (DX), X1 31487 MOVSS X1, (DX) 31488 LEAQ (AX)(CX*4), AX 31489 LEAQ (DX)(BX*4), DX 31490 SUBQ $0x08, SI 31491 31492 check_limit_unroll: 31493 CMPQ SI, $0x08 31494 JHS loop_unroll 31495 JMP check_limit 31496 31497 loop: 31498 MOVSS (AX), X1 31499 MULSS X0, X1 31500 ADDSS (DX), X1 31501 MOVSS X1, (DX) 31502 DECQ SI 31503 LEAQ (AX)(CX*4), AX 31504 LEAQ (DX)(BX*4), DX 31505 31506 check_limit: 31507 CMPQ SI, $0x00 31508 JHI loop 31509 RET 31510 31511 // func AmdAxpyPointerLoopX_V3A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 31512 // Requires: SSE 31513 TEXT ·AmdAxpyPointerLoopX_V3A12U8(SB), NOSPLIT, $0-48 31514 MOVSS alpha+0(FP), X0 31515 MOVQ xs+8(FP), AX 31516 MOVQ incx+16(FP), CX 31517 MOVQ ys+24(FP), DX 31518 MOVQ incy+32(FP), BX 31519 MOVQ n+40(FP), SI 31520 JMP check_limit_unroll 31521 PCALIGN $0x08 31522 NOP 31523 NOP 31524 NOP 31525 NOP 31526 31527 loop_unroll: 31528 MOVSS (AX), X1 31529 MULSS X0, X1 31530 ADDSS (DX), X1 31531 MOVSS X1, (DX) 31532 LEAQ (AX)(CX*4), AX 31533 LEAQ (DX)(BX*4), DX 31534 MOVSS (AX), X1 31535 MULSS X0, X1 31536 ADDSS (DX), X1 31537 MOVSS X1, (DX) 31538 LEAQ (AX)(CX*4), AX 31539 LEAQ (DX)(BX*4), DX 31540 MOVSS (AX), X1 31541 MULSS X0, X1 31542 ADDSS (DX), X1 31543 MOVSS X1, (DX) 31544 LEAQ (AX)(CX*4), AX 31545 LEAQ (DX)(BX*4), DX 31546 MOVSS (AX), X1 31547 MULSS X0, X1 31548 ADDSS (DX), X1 31549 MOVSS X1, (DX) 31550 LEAQ (AX)(CX*4), AX 31551 LEAQ (DX)(BX*4), DX 31552 MOVSS (AX), X1 31553 MULSS X0, X1 
31554 ADDSS (DX), X1 31555 MOVSS X1, (DX) 31556 LEAQ (AX)(CX*4), AX 31557 LEAQ (DX)(BX*4), DX 31558 MOVSS (AX), X1 31559 MULSS X0, X1 31560 ADDSS (DX), X1 31561 MOVSS X1, (DX) 31562 LEAQ (AX)(CX*4), AX 31563 LEAQ (DX)(BX*4), DX 31564 MOVSS (AX), X1 31565 MULSS X0, X1 31566 ADDSS (DX), X1 31567 MOVSS X1, (DX) 31568 LEAQ (AX)(CX*4), AX 31569 LEAQ (DX)(BX*4), DX 31570 MOVSS (AX), X1 31571 MULSS X0, X1 31572 ADDSS (DX), X1 31573 MOVSS X1, (DX) 31574 LEAQ (AX)(CX*4), AX 31575 LEAQ (DX)(BX*4), DX 31576 SUBQ $0x08, SI 31577 31578 check_limit_unroll: 31579 CMPQ SI, $0x08 31580 JHS loop_unroll 31581 JMP check_limit 31582 31583 loop: 31584 MOVSS (AX), X1 31585 MULSS X0, X1 31586 ADDSS (DX), X1 31587 MOVSS X1, (DX) 31588 DECQ SI 31589 LEAQ (AX)(CX*4), AX 31590 LEAQ (DX)(BX*4), DX 31591 31592 check_limit: 31593 CMPQ SI, $0x00 31594 JHI loop 31595 RET 31596 31597 // func AmdAxpyPointerLoopX_V4A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 31598 // Requires: SSE 31599 TEXT ·AmdAxpyPointerLoopX_V4A12U8(SB), NOSPLIT, $0-48 31600 MOVSS alpha+0(FP), X0 31601 MOVQ xs+8(FP), AX 31602 MOVQ incx+16(FP), CX 31603 MOVQ ys+24(FP), DX 31604 MOVQ incy+32(FP), BX 31605 MOVQ n+40(FP), SI 31606 JMP check_limit_unroll 31607 PCALIGN $0x08 31608 NOP 31609 NOP 31610 NOP 31611 NOP 31612 31613 loop_unroll: 31614 MOVSS (AX), X1 31615 MULSS X0, X1 31616 ADDSS (DX), X1 31617 MOVSS X1, (DX) 31618 LEAQ (AX)(CX*4), AX 31619 LEAQ (DX)(BX*4), DX 31620 MOVSS (AX), X1 31621 MULSS X0, X1 31622 ADDSS (DX), X1 31623 MOVSS X1, (DX) 31624 LEAQ (AX)(CX*4), AX 31625 LEAQ (DX)(BX*4), DX 31626 MOVSS (AX), X1 31627 MULSS X0, X1 31628 ADDSS (DX), X1 31629 MOVSS X1, (DX) 31630 LEAQ (AX)(CX*4), AX 31631 LEAQ (DX)(BX*4), DX 31632 MOVSS (AX), X1 31633 MULSS X0, X1 31634 ADDSS (DX), X1 31635 MOVSS X1, (DX) 31636 LEAQ (AX)(CX*4), AX 31637 LEAQ (DX)(BX*4), DX 31638 MOVSS (AX), X1 31639 MULSS X0, X1 31640 ADDSS (DX), X1 31641 MOVSS X1, (DX) 31642 LEAQ (AX)(CX*4), AX 31643 LEAQ (DX)(BX*4), DX 31644 MOVSS (AX), X1 31645 MULSS X0, X1 31646 ADDSS (DX), X1 31647 MOVSS X1, (DX) 31648 LEAQ (AX)(CX*4), AX 31649 LEAQ (DX)(BX*4), DX 31650 MOVSS (AX), X1 31651 MULSS X0, X1 31652 ADDSS (DX), X1 31653 MOVSS X1, (DX) 31654 LEAQ (AX)(CX*4), AX 31655 LEAQ (DX)(BX*4), DX 31656 MOVSS (AX), X1 31657 MULSS X0, X1 31658 ADDSS (DX), X1 31659 MOVSS X1, (DX) 31660 LEAQ (AX)(CX*4), AX 31661 LEAQ (DX)(BX*4), DX 31662 SUBQ $0x08, SI 31663 31664 check_limit_unroll: 31665 CMPQ SI, $0x08 31666 JHS loop_unroll 31667 JMP check_limit 31668 31669 loop: 31670 MOVSS (AX), X1 31671 MULSS X0, X1 31672 ADDSS (DX), X1 31673 MOVSS X1, (DX) 31674 DECQ SI 31675 LEAQ (AX)(CX*4), AX 31676 LEAQ (DX)(BX*4), DX 31677 31678 check_limit: 31679 CMPQ SI, $0x00 31680 JHI loop 31681 RET 31682 31683 // func AmdAxpyPointerLoopX_V5A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 31684 // Requires: SSE 31685 TEXT ·AmdAxpyPointerLoopX_V5A12U8(SB), NOSPLIT, $0-48 31686 MOVSS alpha+0(FP), X0 31687 MOVQ xs+8(FP), AX 31688 MOVQ incx+16(FP), CX 31689 MOVQ ys+24(FP), DX 31690 MOVQ incy+32(FP), BX 31691 MOVQ n+40(FP), SI 31692 JMP check_limit_unroll 31693 PCALIGN $0x08 31694 NOP 31695 NOP 31696 NOP 31697 NOP 31698 31699 loop_unroll: 31700 MOVSS (AX), X1 31701 MULSS X0, X1 31702 ADDSS (DX), X1 31703 MOVSS X1, (DX) 31704 LEAQ (AX)(CX*4), AX 31705 LEAQ (DX)(BX*4), DX 31706 MOVSS (AX), X1 31707 MULSS X0, X1 31708 ADDSS (DX), X1 31709 MOVSS X1, (DX) 31710 LEAQ (AX)(CX*4), AX 31711 LEAQ (DX)(BX*4), DX 31712 MOVSS (AX), X1 31713 MULSS X0, X1 31714 
ADDSS (DX), X1 31715 MOVSS X1, (DX) 31716 LEAQ (AX)(CX*4), AX 31717 LEAQ (DX)(BX*4), DX 31718 MOVSS (AX), X1 31719 MULSS X0, X1 31720 ADDSS (DX), X1 31721 MOVSS X1, (DX) 31722 LEAQ (AX)(CX*4), AX 31723 LEAQ (DX)(BX*4), DX 31724 MOVSS (AX), X1 31725 MULSS X0, X1 31726 ADDSS (DX), X1 31727 MOVSS X1, (DX) 31728 LEAQ (AX)(CX*4), AX 31729 LEAQ (DX)(BX*4), DX 31730 MOVSS (AX), X1 31731 MULSS X0, X1 31732 ADDSS (DX), X1 31733 MOVSS X1, (DX) 31734 LEAQ (AX)(CX*4), AX 31735 LEAQ (DX)(BX*4), DX 31736 MOVSS (AX), X1 31737 MULSS X0, X1 31738 ADDSS (DX), X1 31739 MOVSS X1, (DX) 31740 LEAQ (AX)(CX*4), AX 31741 LEAQ (DX)(BX*4), DX 31742 MOVSS (AX), X1 31743 MULSS X0, X1 31744 ADDSS (DX), X1 31745 MOVSS X1, (DX) 31746 LEAQ (AX)(CX*4), AX 31747 LEAQ (DX)(BX*4), DX 31748 SUBQ $0x08, SI 31749 31750 check_limit_unroll: 31751 CMPQ SI, $0x08 31752 JHS loop_unroll 31753 JMP check_limit 31754 31755 loop: 31756 MOVSS (AX), X1 31757 MULSS X0, X1 31758 ADDSS (DX), X1 31759 MOVSS X1, (DX) 31760 DECQ SI 31761 LEAQ (AX)(CX*4), AX 31762 LEAQ (DX)(BX*4), DX 31763 31764 check_limit: 31765 CMPQ SI, $0x00 31766 JHI loop 31767 RET 31768 31769 // func AmdAxpyPointerLoopX_V0A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 31770 // Requires: SSE 31771 TEXT ·AmdAxpyPointerLoopX_V0A13U8(SB), NOSPLIT, $0-48 31772 MOVSS alpha+0(FP), X0 31773 MOVQ xs+8(FP), AX 31774 MOVQ incx+16(FP), CX 31775 MOVQ ys+24(FP), DX 31776 MOVQ incy+32(FP), BX 31777 MOVQ n+40(FP), SI 31778 JMP check_limit_unroll 31779 PCALIGN $0x08 31780 NOP 31781 NOP 31782 NOP 31783 NOP 31784 NOP 31785 31786 loop_unroll: 31787 MOVSS (AX), X1 31788 MULSS X0, X1 31789 ADDSS (DX), X1 31790 MOVSS X1, (DX) 31791 LEAQ (AX)(CX*4), AX 31792 LEAQ (DX)(BX*4), DX 31793 MOVSS (AX), X1 31794 MULSS X0, X1 31795 ADDSS (DX), X1 31796 MOVSS X1, (DX) 31797 LEAQ (AX)(CX*4), AX 31798 LEAQ (DX)(BX*4), DX 31799 MOVSS (AX), X1 31800 MULSS X0, X1 31801 ADDSS (DX), X1 31802 MOVSS X1, (DX) 31803 LEAQ (AX)(CX*4), AX 31804 LEAQ (DX)(BX*4), DX 31805 MOVSS (AX), X1 31806 MULSS X0, X1 31807 ADDSS (DX), X1 31808 MOVSS X1, (DX) 31809 LEAQ (AX)(CX*4), AX 31810 LEAQ (DX)(BX*4), DX 31811 MOVSS (AX), X1 31812 MULSS X0, X1 31813 ADDSS (DX), X1 31814 MOVSS X1, (DX) 31815 LEAQ (AX)(CX*4), AX 31816 LEAQ (DX)(BX*4), DX 31817 MOVSS (AX), X1 31818 MULSS X0, X1 31819 ADDSS (DX), X1 31820 MOVSS X1, (DX) 31821 LEAQ (AX)(CX*4), AX 31822 LEAQ (DX)(BX*4), DX 31823 MOVSS (AX), X1 31824 MULSS X0, X1 31825 ADDSS (DX), X1 31826 MOVSS X1, (DX) 31827 LEAQ (AX)(CX*4), AX 31828 LEAQ (DX)(BX*4), DX 31829 MOVSS (AX), X1 31830 MULSS X0, X1 31831 ADDSS (DX), X1 31832 MOVSS X1, (DX) 31833 LEAQ (AX)(CX*4), AX 31834 LEAQ (DX)(BX*4), DX 31835 SUBQ $0x08, SI 31836 31837 check_limit_unroll: 31838 CMPQ SI, $0x08 31839 JHS loop_unroll 31840 JMP check_limit 31841 31842 loop: 31843 MOVSS (AX), X1 31844 MULSS X0, X1 31845 ADDSS (DX), X1 31846 MOVSS X1, (DX) 31847 DECQ SI 31848 LEAQ (AX)(CX*4), AX 31849 LEAQ (DX)(BX*4), DX 31850 31851 check_limit: 31852 CMPQ SI, $0x00 31853 JHI loop 31854 RET 31855 31856 // func AmdAxpyPointerLoopX_V1A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 31857 // Requires: SSE 31858 TEXT ·AmdAxpyPointerLoopX_V1A13U8(SB), NOSPLIT, $0-48 31859 MOVSS alpha+0(FP), X0 31860 MOVQ xs+8(FP), AX 31861 MOVQ incx+16(FP), CX 31862 MOVQ ys+24(FP), DX 31863 MOVQ incy+32(FP), BX 31864 MOVQ n+40(FP), SI 31865 JMP check_limit_unroll 31866 PCALIGN $0x08 31867 NOP 31868 NOP 31869 NOP 31870 NOP 31871 NOP 31872 31873 loop_unroll: 31874 MOVSS (AX), X1 31875 MULSS 
X0, X1 31876 ADDSS (DX), X1 31877 MOVSS X1, (DX) 31878 LEAQ (AX)(CX*4), AX 31879 LEAQ (DX)(BX*4), DX 31880 MOVSS (AX), X1 31881 MULSS X0, X1 31882 ADDSS (DX), X1 31883 MOVSS X1, (DX) 31884 LEAQ (AX)(CX*4), AX 31885 LEAQ (DX)(BX*4), DX 31886 MOVSS (AX), X1 31887 MULSS X0, X1 31888 ADDSS (DX), X1 31889 MOVSS X1, (DX) 31890 LEAQ (AX)(CX*4), AX 31891 LEAQ (DX)(BX*4), DX 31892 MOVSS (AX), X1 31893 MULSS X0, X1 31894 ADDSS (DX), X1 31895 MOVSS X1, (DX) 31896 LEAQ (AX)(CX*4), AX 31897 LEAQ (DX)(BX*4), DX 31898 MOVSS (AX), X1 31899 MULSS X0, X1 31900 ADDSS (DX), X1 31901 MOVSS X1, (DX) 31902 LEAQ (AX)(CX*4), AX 31903 LEAQ (DX)(BX*4), DX 31904 MOVSS (AX), X1 31905 MULSS X0, X1 31906 ADDSS (DX), X1 31907 MOVSS X1, (DX) 31908 LEAQ (AX)(CX*4), AX 31909 LEAQ (DX)(BX*4), DX 31910 MOVSS (AX), X1 31911 MULSS X0, X1 31912 ADDSS (DX), X1 31913 MOVSS X1, (DX) 31914 LEAQ (AX)(CX*4), AX 31915 LEAQ (DX)(BX*4), DX 31916 MOVSS (AX), X1 31917 MULSS X0, X1 31918 ADDSS (DX), X1 31919 MOVSS X1, (DX) 31920 LEAQ (AX)(CX*4), AX 31921 LEAQ (DX)(BX*4), DX 31922 SUBQ $0x08, SI 31923 31924 check_limit_unroll: 31925 CMPQ SI, $0x08 31926 JHS loop_unroll 31927 JMP check_limit 31928 31929 loop: 31930 MOVSS (AX), X1 31931 MULSS X0, X1 31932 ADDSS (DX), X1 31933 MOVSS X1, (DX) 31934 DECQ SI 31935 LEAQ (AX)(CX*4), AX 31936 LEAQ (DX)(BX*4), DX 31937 31938 check_limit: 31939 CMPQ SI, $0x00 31940 JHI loop 31941 RET 31942 31943 // func AmdAxpyPointerLoopX_V2A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 31944 // Requires: SSE 31945 TEXT ·AmdAxpyPointerLoopX_V2A13U8(SB), NOSPLIT, $0-48 31946 MOVSS alpha+0(FP), X0 31947 MOVQ xs+8(FP), AX 31948 MOVQ incx+16(FP), CX 31949 MOVQ ys+24(FP), DX 31950 MOVQ incy+32(FP), BX 31951 MOVQ n+40(FP), SI 31952 JMP check_limit_unroll 31953 PCALIGN $0x08 31954 NOP 31955 NOP 31956 NOP 31957 NOP 31958 NOP 31959 31960 loop_unroll: 31961 MOVSS (AX), X1 31962 MULSS X0, X1 31963 ADDSS (DX), X1 31964 MOVSS X1, (DX) 31965 LEAQ (AX)(CX*4), AX 31966 LEAQ (DX)(BX*4), DX 31967 MOVSS (AX), X1 31968 MULSS X0, X1 31969 ADDSS (DX), X1 31970 MOVSS X1, (DX) 31971 LEAQ (AX)(CX*4), AX 31972 LEAQ (DX)(BX*4), DX 31973 MOVSS (AX), X1 31974 MULSS X0, X1 31975 ADDSS (DX), X1 31976 MOVSS X1, (DX) 31977 LEAQ (AX)(CX*4), AX 31978 LEAQ (DX)(BX*4), DX 31979 MOVSS (AX), X1 31980 MULSS X0, X1 31981 ADDSS (DX), X1 31982 MOVSS X1, (DX) 31983 LEAQ (AX)(CX*4), AX 31984 LEAQ (DX)(BX*4), DX 31985 MOVSS (AX), X1 31986 MULSS X0, X1 31987 ADDSS (DX), X1 31988 MOVSS X1, (DX) 31989 LEAQ (AX)(CX*4), AX 31990 LEAQ (DX)(BX*4), DX 31991 MOVSS (AX), X1 31992 MULSS X0, X1 31993 ADDSS (DX), X1 31994 MOVSS X1, (DX) 31995 LEAQ (AX)(CX*4), AX 31996 LEAQ (DX)(BX*4), DX 31997 MOVSS (AX), X1 31998 MULSS X0, X1 31999 ADDSS (DX), X1 32000 MOVSS X1, (DX) 32001 LEAQ (AX)(CX*4), AX 32002 LEAQ (DX)(BX*4), DX 32003 MOVSS (AX), X1 32004 MULSS X0, X1 32005 ADDSS (DX), X1 32006 MOVSS X1, (DX) 32007 LEAQ (AX)(CX*4), AX 32008 LEAQ (DX)(BX*4), DX 32009 SUBQ $0x08, SI 32010 32011 check_limit_unroll: 32012 CMPQ SI, $0x08 32013 JHS loop_unroll 32014 JMP check_limit 32015 32016 loop: 32017 MOVSS (AX), X1 32018 MULSS X0, X1 32019 ADDSS (DX), X1 32020 MOVSS X1, (DX) 32021 DECQ SI 32022 LEAQ (AX)(CX*4), AX 32023 LEAQ (DX)(BX*4), DX 32024 32025 check_limit: 32026 CMPQ SI, $0x00 32027 JHI loop 32028 RET 32029 32030 // func AmdAxpyPointerLoopX_V3A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 32031 // Requires: SSE 32032 TEXT ·AmdAxpyPointerLoopX_V3A13U8(SB), NOSPLIT, $0-48 32033 MOVSS alpha+0(FP), X0 
32034 MOVQ xs+8(FP), AX 32035 MOVQ incx+16(FP), CX 32036 MOVQ ys+24(FP), DX 32037 MOVQ incy+32(FP), BX 32038 MOVQ n+40(FP), SI 32039 JMP check_limit_unroll 32040 PCALIGN $0x08 32041 NOP 32042 NOP 32043 NOP 32044 NOP 32045 NOP 32046 32047 loop_unroll: 32048 MOVSS (AX), X1 32049 MULSS X0, X1 32050 ADDSS (DX), X1 32051 MOVSS X1, (DX) 32052 LEAQ (AX)(CX*4), AX 32053 LEAQ (DX)(BX*4), DX 32054 MOVSS (AX), X1 32055 MULSS X0, X1 32056 ADDSS (DX), X1 32057 MOVSS X1, (DX) 32058 LEAQ (AX)(CX*4), AX 32059 LEAQ (DX)(BX*4), DX 32060 MOVSS (AX), X1 32061 MULSS X0, X1 32062 ADDSS (DX), X1 32063 MOVSS X1, (DX) 32064 LEAQ (AX)(CX*4), AX 32065 LEAQ (DX)(BX*4), DX 32066 MOVSS (AX), X1 32067 MULSS X0, X1 32068 ADDSS (DX), X1 32069 MOVSS X1, (DX) 32070 LEAQ (AX)(CX*4), AX 32071 LEAQ (DX)(BX*4), DX 32072 MOVSS (AX), X1 32073 MULSS X0, X1 32074 ADDSS (DX), X1 32075 MOVSS X1, (DX) 32076 LEAQ (AX)(CX*4), AX 32077 LEAQ (DX)(BX*4), DX 32078 MOVSS (AX), X1 32079 MULSS X0, X1 32080 ADDSS (DX), X1 32081 MOVSS X1, (DX) 32082 LEAQ (AX)(CX*4), AX 32083 LEAQ (DX)(BX*4), DX 32084 MOVSS (AX), X1 32085 MULSS X0, X1 32086 ADDSS (DX), X1 32087 MOVSS X1, (DX) 32088 LEAQ (AX)(CX*4), AX 32089 LEAQ (DX)(BX*4), DX 32090 MOVSS (AX), X1 32091 MULSS X0, X1 32092 ADDSS (DX), X1 32093 MOVSS X1, (DX) 32094 LEAQ (AX)(CX*4), AX 32095 LEAQ (DX)(BX*4), DX 32096 SUBQ $0x08, SI 32097 32098 check_limit_unroll: 32099 CMPQ SI, $0x08 32100 JHS loop_unroll 32101 JMP check_limit 32102 32103 loop: 32104 MOVSS (AX), X1 32105 MULSS X0, X1 32106 ADDSS (DX), X1 32107 MOVSS X1, (DX) 32108 DECQ SI 32109 LEAQ (AX)(CX*4), AX 32110 LEAQ (DX)(BX*4), DX 32111 32112 check_limit: 32113 CMPQ SI, $0x00 32114 JHI loop 32115 RET 32116 32117 // func AmdAxpyPointerLoopX_V4A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 32118 // Requires: SSE 32119 TEXT ·AmdAxpyPointerLoopX_V4A13U8(SB), NOSPLIT, $0-48 32120 MOVSS alpha+0(FP), X0 32121 MOVQ xs+8(FP), AX 32122 MOVQ incx+16(FP), CX 32123 MOVQ ys+24(FP), DX 32124 MOVQ incy+32(FP), BX 32125 MOVQ n+40(FP), SI 32126 JMP check_limit_unroll 32127 PCALIGN $0x08 32128 NOP 32129 NOP 32130 NOP 32131 NOP 32132 NOP 32133 32134 loop_unroll: 32135 MOVSS (AX), X1 32136 MULSS X0, X1 32137 ADDSS (DX), X1 32138 MOVSS X1, (DX) 32139 LEAQ (AX)(CX*4), AX 32140 LEAQ (DX)(BX*4), DX 32141 MOVSS (AX), X1 32142 MULSS X0, X1 32143 ADDSS (DX), X1 32144 MOVSS X1, (DX) 32145 LEAQ (AX)(CX*4), AX 32146 LEAQ (DX)(BX*4), DX 32147 MOVSS (AX), X1 32148 MULSS X0, X1 32149 ADDSS (DX), X1 32150 MOVSS X1, (DX) 32151 LEAQ (AX)(CX*4), AX 32152 LEAQ (DX)(BX*4), DX 32153 MOVSS (AX), X1 32154 MULSS X0, X1 32155 ADDSS (DX), X1 32156 MOVSS X1, (DX) 32157 LEAQ (AX)(CX*4), AX 32158 LEAQ (DX)(BX*4), DX 32159 MOVSS (AX), X1 32160 MULSS X0, X1 32161 ADDSS (DX), X1 32162 MOVSS X1, (DX) 32163 LEAQ (AX)(CX*4), AX 32164 LEAQ (DX)(BX*4), DX 32165 MOVSS (AX), X1 32166 MULSS X0, X1 32167 ADDSS (DX), X1 32168 MOVSS X1, (DX) 32169 LEAQ (AX)(CX*4), AX 32170 LEAQ (DX)(BX*4), DX 32171 MOVSS (AX), X1 32172 MULSS X0, X1 32173 ADDSS (DX), X1 32174 MOVSS X1, (DX) 32175 LEAQ (AX)(CX*4), AX 32176 LEAQ (DX)(BX*4), DX 32177 MOVSS (AX), X1 32178 MULSS X0, X1 32179 ADDSS (DX), X1 32180 MOVSS X1, (DX) 32181 LEAQ (AX)(CX*4), AX 32182 LEAQ (DX)(BX*4), DX 32183 SUBQ $0x08, SI 32184 32185 check_limit_unroll: 32186 CMPQ SI, $0x08 32187 JHS loop_unroll 32188 JMP check_limit 32189 32190 loop: 32191 MOVSS (AX), X1 32192 MULSS X0, X1 32193 ADDSS (DX), X1 32194 MOVSS X1, (DX) 32195 DECQ SI 32196 LEAQ (AX)(CX*4), AX 32197 LEAQ (DX)(BX*4), DX 32198 32199 check_limit: 32200 
CMPQ SI, $0x00 32201 JHI loop 32202 RET 32203 32204 // func AmdAxpyPointerLoopX_V5A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 32205 // Requires: SSE 32206 TEXT ·AmdAxpyPointerLoopX_V5A13U8(SB), NOSPLIT, $0-48 32207 MOVSS alpha+0(FP), X0 32208 MOVQ xs+8(FP), AX 32209 MOVQ incx+16(FP), CX 32210 MOVQ ys+24(FP), DX 32211 MOVQ incy+32(FP), BX 32212 MOVQ n+40(FP), SI 32213 JMP check_limit_unroll 32214 PCALIGN $0x08 32215 NOP 32216 NOP 32217 NOP 32218 NOP 32219 NOP 32220 32221 loop_unroll: 32222 MOVSS (AX), X1 32223 MULSS X0, X1 32224 ADDSS (DX), X1 32225 MOVSS X1, (DX) 32226 LEAQ (AX)(CX*4), AX 32227 LEAQ (DX)(BX*4), DX 32228 MOVSS (AX), X1 32229 MULSS X0, X1 32230 ADDSS (DX), X1 32231 MOVSS X1, (DX) 32232 LEAQ (AX)(CX*4), AX 32233 LEAQ (DX)(BX*4), DX 32234 MOVSS (AX), X1 32235 MULSS X0, X1 32236 ADDSS (DX), X1 32237 MOVSS X1, (DX) 32238 LEAQ (AX)(CX*4), AX 32239 LEAQ (DX)(BX*4), DX 32240 MOVSS (AX), X1 32241 MULSS X0, X1 32242 ADDSS (DX), X1 32243 MOVSS X1, (DX) 32244 LEAQ (AX)(CX*4), AX 32245 LEAQ (DX)(BX*4), DX 32246 MOVSS (AX), X1 32247 MULSS X0, X1 32248 ADDSS (DX), X1 32249 MOVSS X1, (DX) 32250 LEAQ (AX)(CX*4), AX 32251 LEAQ (DX)(BX*4), DX 32252 MOVSS (AX), X1 32253 MULSS X0, X1 32254 ADDSS (DX), X1 32255 MOVSS X1, (DX) 32256 LEAQ (AX)(CX*4), AX 32257 LEAQ (DX)(BX*4), DX 32258 MOVSS (AX), X1 32259 MULSS X0, X1 32260 ADDSS (DX), X1 32261 MOVSS X1, (DX) 32262 LEAQ (AX)(CX*4), AX 32263 LEAQ (DX)(BX*4), DX 32264 MOVSS (AX), X1 32265 MULSS X0, X1 32266 ADDSS (DX), X1 32267 MOVSS X1, (DX) 32268 LEAQ (AX)(CX*4), AX 32269 LEAQ (DX)(BX*4), DX 32270 SUBQ $0x08, SI 32271 32272 check_limit_unroll: 32273 CMPQ SI, $0x08 32274 JHS loop_unroll 32275 JMP check_limit 32276 32277 loop: 32278 MOVSS (AX), X1 32279 MULSS X0, X1 32280 ADDSS (DX), X1 32281 MOVSS X1, (DX) 32282 DECQ SI 32283 LEAQ (AX)(CX*4), AX 32284 LEAQ (DX)(BX*4), DX 32285 32286 check_limit: 32287 CMPQ SI, $0x00 32288 JHI loop 32289 RET 32290 32291 // func AmdAxpyPointerLoopX_V0A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 32292 // Requires: SSE 32293 TEXT ·AmdAxpyPointerLoopX_V0A14U8(SB), NOSPLIT, $0-48 32294 MOVSS alpha+0(FP), X0 32295 MOVQ xs+8(FP), AX 32296 MOVQ incx+16(FP), CX 32297 MOVQ ys+24(FP), DX 32298 MOVQ incy+32(FP), BX 32299 MOVQ n+40(FP), SI 32300 JMP check_limit_unroll 32301 PCALIGN $0x08 32302 NOP 32303 NOP 32304 NOP 32305 NOP 32306 NOP 32307 NOP 32308 32309 loop_unroll: 32310 MOVSS (AX), X1 32311 MULSS X0, X1 32312 ADDSS (DX), X1 32313 MOVSS X1, (DX) 32314 LEAQ (AX)(CX*4), AX 32315 LEAQ (DX)(BX*4), DX 32316 MOVSS (AX), X1 32317 MULSS X0, X1 32318 ADDSS (DX), X1 32319 MOVSS X1, (DX) 32320 LEAQ (AX)(CX*4), AX 32321 LEAQ (DX)(BX*4), DX 32322 MOVSS (AX), X1 32323 MULSS X0, X1 32324 ADDSS (DX), X1 32325 MOVSS X1, (DX) 32326 LEAQ (AX)(CX*4), AX 32327 LEAQ (DX)(BX*4), DX 32328 MOVSS (AX), X1 32329 MULSS X0, X1 32330 ADDSS (DX), X1 32331 MOVSS X1, (DX) 32332 LEAQ (AX)(CX*4), AX 32333 LEAQ (DX)(BX*4), DX 32334 MOVSS (AX), X1 32335 MULSS X0, X1 32336 ADDSS (DX), X1 32337 MOVSS X1, (DX) 32338 LEAQ (AX)(CX*4), AX 32339 LEAQ (DX)(BX*4), DX 32340 MOVSS (AX), X1 32341 MULSS X0, X1 32342 ADDSS (DX), X1 32343 MOVSS X1, (DX) 32344 LEAQ (AX)(CX*4), AX 32345 LEAQ (DX)(BX*4), DX 32346 MOVSS (AX), X1 32347 MULSS X0, X1 32348 ADDSS (DX), X1 32349 MOVSS X1, (DX) 32350 LEAQ (AX)(CX*4), AX 32351 LEAQ (DX)(BX*4), DX 32352 MOVSS (AX), X1 32353 MULSS X0, X1 32354 ADDSS (DX), X1 32355 MOVSS X1, (DX) 32356 LEAQ (AX)(CX*4), AX 32357 LEAQ (DX)(BX*4), DX 32358 SUBQ $0x08, SI 
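// Illustrative usage sketch (editorial, not part of the generated output):
// assuming the usual Go stub declarations for these TEXT symbols, with the
// signature shown in the comments above, a kernel such as
// AmdAxpyPointerLoopX_V5A13U8 can be driven from slices by passing the first
// element's address and the strides in elements:
//
//	xs := []float32{1, 2, 3, 4}
//	ys := make([]float32, len(xs))
//	AmdAxpyPointerLoopX_V5A13U8(2, &xs[0], 1, &ys[0], 1, uintptr(len(xs)))
//	// ys is now {2, 4, 6, 8}
//
// Strides are counted in elements, not bytes, because the kernels scale by 4
// internally via LEAQ (AX)(CX*4), AX and LEAQ (DX)(BX*4), DX.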
32359 32360 check_limit_unroll: 32361 CMPQ SI, $0x08 32362 JHS loop_unroll 32363 JMP check_limit 32364 32365 loop: 32366 MOVSS (AX), X1 32367 MULSS X0, X1 32368 ADDSS (DX), X1 32369 MOVSS X1, (DX) 32370 DECQ SI 32371 LEAQ (AX)(CX*4), AX 32372 LEAQ (DX)(BX*4), DX 32373 32374 check_limit: 32375 CMPQ SI, $0x00 32376 JHI loop 32377 RET 32378 32379 // func AmdAxpyPointerLoopX_V1A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 32380 // Requires: SSE 32381 TEXT ·AmdAxpyPointerLoopX_V1A14U8(SB), NOSPLIT, $0-48 32382 MOVSS alpha+0(FP), X0 32383 MOVQ xs+8(FP), AX 32384 MOVQ incx+16(FP), CX 32385 MOVQ ys+24(FP), DX 32386 MOVQ incy+32(FP), BX 32387 MOVQ n+40(FP), SI 32388 JMP check_limit_unroll 32389 PCALIGN $0x08 32390 NOP 32391 NOP 32392 NOP 32393 NOP 32394 NOP 32395 NOP 32396 32397 loop_unroll: 32398 MOVSS (AX), X1 32399 MULSS X0, X1 32400 ADDSS (DX), X1 32401 MOVSS X1, (DX) 32402 LEAQ (AX)(CX*4), AX 32403 LEAQ (DX)(BX*4), DX 32404 MOVSS (AX), X1 32405 MULSS X0, X1 32406 ADDSS (DX), X1 32407 MOVSS X1, (DX) 32408 LEAQ (AX)(CX*4), AX 32409 LEAQ (DX)(BX*4), DX 32410 MOVSS (AX), X1 32411 MULSS X0, X1 32412 ADDSS (DX), X1 32413 MOVSS X1, (DX) 32414 LEAQ (AX)(CX*4), AX 32415 LEAQ (DX)(BX*4), DX 32416 MOVSS (AX), X1 32417 MULSS X0, X1 32418 ADDSS (DX), X1 32419 MOVSS X1, (DX) 32420 LEAQ (AX)(CX*4), AX 32421 LEAQ (DX)(BX*4), DX 32422 MOVSS (AX), X1 32423 MULSS X0, X1 32424 ADDSS (DX), X1 32425 MOVSS X1, (DX) 32426 LEAQ (AX)(CX*4), AX 32427 LEAQ (DX)(BX*4), DX 32428 MOVSS (AX), X1 32429 MULSS X0, X1 32430 ADDSS (DX), X1 32431 MOVSS X1, (DX) 32432 LEAQ (AX)(CX*4), AX 32433 LEAQ (DX)(BX*4), DX 32434 MOVSS (AX), X1 32435 MULSS X0, X1 32436 ADDSS (DX), X1 32437 MOVSS X1, (DX) 32438 LEAQ (AX)(CX*4), AX 32439 LEAQ (DX)(BX*4), DX 32440 MOVSS (AX), X1 32441 MULSS X0, X1 32442 ADDSS (DX), X1 32443 MOVSS X1, (DX) 32444 LEAQ (AX)(CX*4), AX 32445 LEAQ (DX)(BX*4), DX 32446 SUBQ $0x08, SI 32447 32448 check_limit_unroll: 32449 CMPQ SI, $0x08 32450 JHS loop_unroll 32451 JMP check_limit 32452 32453 loop: 32454 MOVSS (AX), X1 32455 MULSS X0, X1 32456 ADDSS (DX), X1 32457 MOVSS X1, (DX) 32458 DECQ SI 32459 LEAQ (AX)(CX*4), AX 32460 LEAQ (DX)(BX*4), DX 32461 32462 check_limit: 32463 CMPQ SI, $0x00 32464 JHI loop 32465 RET 32466 32467 // func AmdAxpyPointerLoopX_V2A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 32468 // Requires: SSE 32469 TEXT ·AmdAxpyPointerLoopX_V2A14U8(SB), NOSPLIT, $0-48 32470 MOVSS alpha+0(FP), X0 32471 MOVQ xs+8(FP), AX 32472 MOVQ incx+16(FP), CX 32473 MOVQ ys+24(FP), DX 32474 MOVQ incy+32(FP), BX 32475 MOVQ n+40(FP), SI 32476 JMP check_limit_unroll 32477 PCALIGN $0x08 32478 NOP 32479 NOP 32480 NOP 32481 NOP 32482 NOP 32483 NOP 32484 32485 loop_unroll: 32486 MOVSS (AX), X1 32487 MULSS X0, X1 32488 ADDSS (DX), X1 32489 MOVSS X1, (DX) 32490 LEAQ (AX)(CX*4), AX 32491 LEAQ (DX)(BX*4), DX 32492 MOVSS (AX), X1 32493 MULSS X0, X1 32494 ADDSS (DX), X1 32495 MOVSS X1, (DX) 32496 LEAQ (AX)(CX*4), AX 32497 LEAQ (DX)(BX*4), DX 32498 MOVSS (AX), X1 32499 MULSS X0, X1 32500 ADDSS (DX), X1 32501 MOVSS X1, (DX) 32502 LEAQ (AX)(CX*4), AX 32503 LEAQ (DX)(BX*4), DX 32504 MOVSS (AX), X1 32505 MULSS X0, X1 32506 ADDSS (DX), X1 32507 MOVSS X1, (DX) 32508 LEAQ (AX)(CX*4), AX 32509 LEAQ (DX)(BX*4), DX 32510 MOVSS (AX), X1 32511 MULSS X0, X1 32512 ADDSS (DX), X1 32513 MOVSS X1, (DX) 32514 LEAQ (AX)(CX*4), AX 32515 LEAQ (DX)(BX*4), DX 32516 MOVSS (AX), X1 32517 MULSS X0, X1 32518 ADDSS (DX), X1 32519 MOVSS X1, (DX) 32520 LEAQ (AX)(CX*4), AX 32521 LEAQ 
(DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V3A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V3A14U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V4A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V4A14U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V5A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V5A14U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V0A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V0A15U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V1A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V1A15U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V2A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V2A15U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// The AmdAxpyPointerLoopX_* kernels compute ys[i*incy] += alpha * xs[i*incx] for i = 0..n-1:
// eight scalar AXPY steps per unrolled pass, then a one-element tail loop for the remainder.
// The variants in this block differ only in loop-alignment padding (PCALIGN and NOPs) before loop_unroll.
// func AmdAxpyPointerLoopX_V3A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V3A15U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V4A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V4A15U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V5A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V5A15U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V0A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V0A16U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x10

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopX_V1A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopX_V1A16U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x10

loop_unroll:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX
MOVSS (AX), X1 33491 MULSS X0, X1 33492 ADDSS (DX), X1 33493 MOVSS X1, (DX) 33494 LEAQ (AX)(CX*4), AX 33495 LEAQ (DX)(BX*4), DX 33496 SUBQ $0x08, SI 33497 33498 check_limit_unroll: 33499 CMPQ SI, $0x08 33500 JHS loop_unroll 33501 JMP check_limit 33502 33503 loop: 33504 MOVSS (AX), X1 33505 MULSS X0, X1 33506 ADDSS (DX), X1 33507 MOVSS X1, (DX) 33508 DECQ SI 33509 LEAQ (AX)(CX*4), AX 33510 LEAQ (DX)(BX*4), DX 33511 33512 check_limit: 33513 CMPQ SI, $0x00 33514 JHI loop 33515 RET 33516 33517 // func AmdAxpyPointerLoopX_V2A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 33518 // Requires: SSE 33519 TEXT ·AmdAxpyPointerLoopX_V2A16U8(SB), NOSPLIT, $0-48 33520 MOVSS alpha+0(FP), X0 33521 MOVQ xs+8(FP), AX 33522 MOVQ incx+16(FP), CX 33523 MOVQ ys+24(FP), DX 33524 MOVQ incy+32(FP), BX 33525 MOVQ n+40(FP), SI 33526 JMP check_limit_unroll 33527 PCALIGN $0x10 33528 33529 loop_unroll: 33530 MOVSS (AX), X1 33531 MULSS X0, X1 33532 ADDSS (DX), X1 33533 MOVSS X1, (DX) 33534 LEAQ (AX)(CX*4), AX 33535 LEAQ (DX)(BX*4), DX 33536 MOVSS (AX), X1 33537 MULSS X0, X1 33538 ADDSS (DX), X1 33539 MOVSS X1, (DX) 33540 LEAQ (AX)(CX*4), AX 33541 LEAQ (DX)(BX*4), DX 33542 MOVSS (AX), X1 33543 MULSS X0, X1 33544 ADDSS (DX), X1 33545 MOVSS X1, (DX) 33546 LEAQ (AX)(CX*4), AX 33547 LEAQ (DX)(BX*4), DX 33548 MOVSS (AX), X1 33549 MULSS X0, X1 33550 ADDSS (DX), X1 33551 MOVSS X1, (DX) 33552 LEAQ (AX)(CX*4), AX 33553 LEAQ (DX)(BX*4), DX 33554 MOVSS (AX), X1 33555 MULSS X0, X1 33556 ADDSS (DX), X1 33557 MOVSS X1, (DX) 33558 LEAQ (AX)(CX*4), AX 33559 LEAQ (DX)(BX*4), DX 33560 MOVSS (AX), X1 33561 MULSS X0, X1 33562 ADDSS (DX), X1 33563 MOVSS X1, (DX) 33564 LEAQ (AX)(CX*4), AX 33565 LEAQ (DX)(BX*4), DX 33566 MOVSS (AX), X1 33567 MULSS X0, X1 33568 ADDSS (DX), X1 33569 MOVSS X1, (DX) 33570 LEAQ (AX)(CX*4), AX 33571 LEAQ (DX)(BX*4), DX 33572 MOVSS (AX), X1 33573 MULSS X0, X1 33574 ADDSS (DX), X1 33575 MOVSS X1, (DX) 33576 LEAQ (AX)(CX*4), AX 33577 LEAQ (DX)(BX*4), DX 33578 SUBQ $0x08, SI 33579 33580 check_limit_unroll: 33581 CMPQ SI, $0x08 33582 JHS loop_unroll 33583 JMP check_limit 33584 33585 loop: 33586 MOVSS (AX), X1 33587 MULSS X0, X1 33588 ADDSS (DX), X1 33589 MOVSS X1, (DX) 33590 DECQ SI 33591 LEAQ (AX)(CX*4), AX 33592 LEAQ (DX)(BX*4), DX 33593 33594 check_limit: 33595 CMPQ SI, $0x00 33596 JHI loop 33597 RET 33598 33599 // func AmdAxpyPointerLoopX_V3A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 33600 // Requires: SSE 33601 TEXT ·AmdAxpyPointerLoopX_V3A16U8(SB), NOSPLIT, $0-48 33602 MOVSS alpha+0(FP), X0 33603 MOVQ xs+8(FP), AX 33604 MOVQ incx+16(FP), CX 33605 MOVQ ys+24(FP), DX 33606 MOVQ incy+32(FP), BX 33607 MOVQ n+40(FP), SI 33608 JMP check_limit_unroll 33609 PCALIGN $0x10 33610 33611 loop_unroll: 33612 MOVSS (AX), X1 33613 MULSS X0, X1 33614 ADDSS (DX), X1 33615 MOVSS X1, (DX) 33616 LEAQ (AX)(CX*4), AX 33617 LEAQ (DX)(BX*4), DX 33618 MOVSS (AX), X1 33619 MULSS X0, X1 33620 ADDSS (DX), X1 33621 MOVSS X1, (DX) 33622 LEAQ (AX)(CX*4), AX 33623 LEAQ (DX)(BX*4), DX 33624 MOVSS (AX), X1 33625 MULSS X0, X1 33626 ADDSS (DX), X1 33627 MOVSS X1, (DX) 33628 LEAQ (AX)(CX*4), AX 33629 LEAQ (DX)(BX*4), DX 33630 MOVSS (AX), X1 33631 MULSS X0, X1 33632 ADDSS (DX), X1 33633 MOVSS X1, (DX) 33634 LEAQ (AX)(CX*4), AX 33635 LEAQ (DX)(BX*4), DX 33636 MOVSS (AX), X1 33637 MULSS X0, X1 33638 ADDSS (DX), X1 33639 MOVSS X1, (DX) 33640 LEAQ (AX)(CX*4), AX 33641 LEAQ (DX)(BX*4), DX 33642 MOVSS (AX), X1 33643 MULSS X0, X1 33644 ADDSS (DX), X1 33645 MOVSS X1, (DX) 33646 LEAQ 
(AX)(CX*4), AX 33647 LEAQ (DX)(BX*4), DX 33648 MOVSS (AX), X1 33649 MULSS X0, X1 33650 ADDSS (DX), X1 33651 MOVSS X1, (DX) 33652 LEAQ (AX)(CX*4), AX 33653 LEAQ (DX)(BX*4), DX 33654 MOVSS (AX), X1 33655 MULSS X0, X1 33656 ADDSS (DX), X1 33657 MOVSS X1, (DX) 33658 LEAQ (AX)(CX*4), AX 33659 LEAQ (DX)(BX*4), DX 33660 SUBQ $0x08, SI 33661 33662 check_limit_unroll: 33663 CMPQ SI, $0x08 33664 JHS loop_unroll 33665 JMP check_limit 33666 33667 loop: 33668 MOVSS (AX), X1 33669 MULSS X0, X1 33670 ADDSS (DX), X1 33671 MOVSS X1, (DX) 33672 DECQ SI 33673 LEAQ (AX)(CX*4), AX 33674 LEAQ (DX)(BX*4), DX 33675 33676 check_limit: 33677 CMPQ SI, $0x00 33678 JHI loop 33679 RET 33680 33681 // func AmdAxpyPointerLoopX_V4A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 33682 // Requires: SSE 33683 TEXT ·AmdAxpyPointerLoopX_V4A16U8(SB), NOSPLIT, $0-48 33684 MOVSS alpha+0(FP), X0 33685 MOVQ xs+8(FP), AX 33686 MOVQ incx+16(FP), CX 33687 MOVQ ys+24(FP), DX 33688 MOVQ incy+32(FP), BX 33689 MOVQ n+40(FP), SI 33690 JMP check_limit_unroll 33691 PCALIGN $0x10 33692 33693 loop_unroll: 33694 MOVSS (AX), X1 33695 MULSS X0, X1 33696 ADDSS (DX), X1 33697 MOVSS X1, (DX) 33698 LEAQ (AX)(CX*4), AX 33699 LEAQ (DX)(BX*4), DX 33700 MOVSS (AX), X1 33701 MULSS X0, X1 33702 ADDSS (DX), X1 33703 MOVSS X1, (DX) 33704 LEAQ (AX)(CX*4), AX 33705 LEAQ (DX)(BX*4), DX 33706 MOVSS (AX), X1 33707 MULSS X0, X1 33708 ADDSS (DX), X1 33709 MOVSS X1, (DX) 33710 LEAQ (AX)(CX*4), AX 33711 LEAQ (DX)(BX*4), DX 33712 MOVSS (AX), X1 33713 MULSS X0, X1 33714 ADDSS (DX), X1 33715 MOVSS X1, (DX) 33716 LEAQ (AX)(CX*4), AX 33717 LEAQ (DX)(BX*4), DX 33718 MOVSS (AX), X1 33719 MULSS X0, X1 33720 ADDSS (DX), X1 33721 MOVSS X1, (DX) 33722 LEAQ (AX)(CX*4), AX 33723 LEAQ (DX)(BX*4), DX 33724 MOVSS (AX), X1 33725 MULSS X0, X1 33726 ADDSS (DX), X1 33727 MOVSS X1, (DX) 33728 LEAQ (AX)(CX*4), AX 33729 LEAQ (DX)(BX*4), DX 33730 MOVSS (AX), X1 33731 MULSS X0, X1 33732 ADDSS (DX), X1 33733 MOVSS X1, (DX) 33734 LEAQ (AX)(CX*4), AX 33735 LEAQ (DX)(BX*4), DX 33736 MOVSS (AX), X1 33737 MULSS X0, X1 33738 ADDSS (DX), X1 33739 MOVSS X1, (DX) 33740 LEAQ (AX)(CX*4), AX 33741 LEAQ (DX)(BX*4), DX 33742 SUBQ $0x08, SI 33743 33744 check_limit_unroll: 33745 CMPQ SI, $0x08 33746 JHS loop_unroll 33747 JMP check_limit 33748 33749 loop: 33750 MOVSS (AX), X1 33751 MULSS X0, X1 33752 ADDSS (DX), X1 33753 MOVSS X1, (DX) 33754 DECQ SI 33755 LEAQ (AX)(CX*4), AX 33756 LEAQ (DX)(BX*4), DX 33757 33758 check_limit: 33759 CMPQ SI, $0x00 33760 JHI loop 33761 RET 33762 33763 // func AmdAxpyPointerLoopX_V5A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 33764 // Requires: SSE 33765 TEXT ·AmdAxpyPointerLoopX_V5A16U8(SB), NOSPLIT, $0-48 33766 MOVSS alpha+0(FP), X0 33767 MOVQ xs+8(FP), AX 33768 MOVQ incx+16(FP), CX 33769 MOVQ ys+24(FP), DX 33770 MOVQ incy+32(FP), BX 33771 MOVQ n+40(FP), SI 33772 JMP check_limit_unroll 33773 PCALIGN $0x10 33774 33775 loop_unroll: 33776 MOVSS (AX), X1 33777 MULSS X0, X1 33778 ADDSS (DX), X1 33779 MOVSS X1, (DX) 33780 LEAQ (AX)(CX*4), AX 33781 LEAQ (DX)(BX*4), DX 33782 MOVSS (AX), X1 33783 MULSS X0, X1 33784 ADDSS (DX), X1 33785 MOVSS X1, (DX) 33786 LEAQ (AX)(CX*4), AX 33787 LEAQ (DX)(BX*4), DX 33788 MOVSS (AX), X1 33789 MULSS X0, X1 33790 ADDSS (DX), X1 33791 MOVSS X1, (DX) 33792 LEAQ (AX)(CX*4), AX 33793 LEAQ (DX)(BX*4), DX 33794 MOVSS (AX), X1 33795 MULSS X0, X1 33796 ADDSS (DX), X1 33797 MOVSS X1, (DX) 33798 LEAQ (AX)(CX*4), AX 33799 LEAQ (DX)(BX*4), DX 33800 MOVSS (AX), X1 33801 MULSS X0, X1 33802 
ADDSS (DX), X1 33803 MOVSS X1, (DX) 33804 LEAQ (AX)(CX*4), AX 33805 LEAQ (DX)(BX*4), DX 33806 MOVSS (AX), X1 33807 MULSS X0, X1 33808 ADDSS (DX), X1 33809 MOVSS X1, (DX) 33810 LEAQ (AX)(CX*4), AX 33811 LEAQ (DX)(BX*4), DX 33812 MOVSS (AX), X1 33813 MULSS X0, X1 33814 ADDSS (DX), X1 33815 MOVSS X1, (DX) 33816 LEAQ (AX)(CX*4), AX 33817 LEAQ (DX)(BX*4), DX 33818 MOVSS (AX), X1 33819 MULSS X0, X1 33820 ADDSS (DX), X1 33821 MOVSS X1, (DX) 33822 LEAQ (AX)(CX*4), AX 33823 LEAQ (DX)(BX*4), DX 33824 SUBQ $0x08, SI 33825 33826 check_limit_unroll: 33827 CMPQ SI, $0x08 33828 JHS loop_unroll 33829 JMP check_limit 33830 33831 loop: 33832 MOVSS (AX), X1 33833 MULSS X0, X1 33834 ADDSS (DX), X1 33835 MOVSS X1, (DX) 33836 DECQ SI 33837 LEAQ (AX)(CX*4), AX 33838 LEAQ (DX)(BX*4), DX 33839 33840 check_limit: 33841 CMPQ SI, $0x00 33842 JHI loop 33843 RET 33844 33845 // func AmdAxpyPointerLoopXInterleave_V0A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 33846 // Requires: SSE 33847 TEXT ·AmdAxpyPointerLoopXInterleave_V0A0U4(SB), NOSPLIT, $0-48 33848 MOVSS alpha+0(FP), X0 33849 MOVQ xs+8(FP), AX 33850 MOVQ incx+16(FP), CX 33851 MOVQ CX, DX 33852 SHLQ $0x04, DX 33853 MOVQ ys+24(FP), DX 33854 MOVQ incy+32(FP), BX 33855 MOVQ BX, SI 33856 SHLQ $0x04, SI 33857 MOVQ n+40(FP), SI 33858 JMP check_limit_unroll 33859 33860 loop_unroll: 33861 MOVSS (AX), X1 33862 LEAQ (AX)(CX*4), AX 33863 MOVSS (AX), X2 33864 LEAQ (AX)(CX*4), AX 33865 MOVSS (AX), X3 33866 LEAQ (AX)(CX*4), AX 33867 MOVSS (AX), X4 33868 LEAQ (AX)(CX*4), AX 33869 MULSS X0, X1 33870 MULSS X0, X2 33871 MULSS X0, X3 33872 MULSS X0, X4 33873 ADDSS (DX), X1 33874 MOVSS X1, (DX) 33875 LEAQ (DX)(BX*4), DX 33876 ADDSS (DX), X2 33877 MOVSS X2, (DX) 33878 LEAQ (DX)(BX*4), DX 33879 ADDSS (DX), X3 33880 MOVSS X3, (DX) 33881 LEAQ (DX)(BX*4), DX 33882 ADDSS (DX), X4 33883 MOVSS X4, (DX) 33884 LEAQ (DX)(BX*4), DX 33885 SUBQ $0x04, SI 33886 33887 check_limit_unroll: 33888 CMPQ SI, $0x04 33889 JHS loop_unroll 33890 JMP check_limit 33891 33892 loop: 33893 MOVSS (AX), X1 33894 MULSS X0, X1 33895 ADDSS (DX), X1 33896 MOVSS X1, (DX) 33897 DECQ SI 33898 LEAQ (AX)(CX*4), AX 33899 LEAQ (DX)(BX*4), DX 33900 33901 check_limit: 33902 CMPQ SI, $0x00 33903 JHI loop 33904 RET 33905 33906 // func AmdAxpyPointerLoopXInterleave_V1A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 33907 // Requires: SSE 33908 TEXT ·AmdAxpyPointerLoopXInterleave_V1A0U4(SB), NOSPLIT, $0-48 33909 MOVSS alpha+0(FP), X0 33910 MOVQ xs+8(FP), AX 33911 MOVQ incx+16(FP), CX 33912 MOVQ CX, DX 33913 SHLQ $0x04, DX 33914 MOVQ ys+24(FP), DX 33915 MOVQ incy+32(FP), BX 33916 MOVQ BX, SI 33917 SHLQ $0x04, SI 33918 MOVQ n+40(FP), SI 33919 JMP check_limit_unroll 33920 33921 loop_unroll: 33922 MOVSS (AX), X1 33923 LEAQ (AX)(CX*4), AX 33924 MOVSS (AX), X2 33925 LEAQ (AX)(CX*4), AX 33926 MOVSS (AX), X3 33927 LEAQ (AX)(CX*4), AX 33928 MOVSS (AX), X4 33929 LEAQ (AX)(CX*4), AX 33930 MULSS X0, X1 33931 MULSS X0, X2 33932 MULSS X0, X3 33933 MULSS X0, X4 33934 ADDSS (DX), X1 33935 MOVSS X1, (DX) 33936 LEAQ (DX)(BX*4), DX 33937 ADDSS (DX), X2 33938 MOVSS X2, (DX) 33939 LEAQ (DX)(BX*4), DX 33940 ADDSS (DX), X3 33941 MOVSS X3, (DX) 33942 LEAQ (DX)(BX*4), DX 33943 ADDSS (DX), X4 33944 MOVSS X4, (DX) 33945 LEAQ (DX)(BX*4), DX 33946 SUBQ $0x04, SI 33947 33948 check_limit_unroll: 33949 CMPQ SI, $0x04 33950 JHS loop_unroll 33951 JMP check_limit 33952 33953 loop: 33954 MOVSS (AX), X1 33955 MULSS X0, X1 33956 ADDSS (DX), X1 33957 MOVSS X1, (DX) 33958 DECQ SI 33959 LEAQ 
(AX)(CX*4), AX 33960 LEAQ (DX)(BX*4), DX 33961 33962 check_limit: 33963 CMPQ SI, $0x00 33964 JHI loop 33965 RET 33966 33967 // func AmdAxpyPointerLoopXInterleave_V2A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 33968 // Requires: SSE 33969 TEXT ·AmdAxpyPointerLoopXInterleave_V2A0U4(SB), NOSPLIT, $0-48 33970 MOVSS alpha+0(FP), X0 33971 MOVQ xs+8(FP), AX 33972 MOVQ incx+16(FP), CX 33973 MOVQ CX, DX 33974 SHLQ $0x04, DX 33975 MOVQ ys+24(FP), DX 33976 MOVQ incy+32(FP), BX 33977 MOVQ BX, SI 33978 SHLQ $0x04, SI 33979 MOVQ n+40(FP), SI 33980 JMP check_limit_unroll 33981 33982 loop_unroll: 33983 MOVSS (AX), X1 33984 LEAQ (AX)(CX*4), AX 33985 MOVSS (AX), X2 33986 LEAQ (AX)(CX*4), AX 33987 MOVSS (AX), X3 33988 LEAQ (AX)(CX*4), AX 33989 MOVSS (AX), X4 33990 LEAQ (AX)(CX*4), AX 33991 MULSS X0, X1 33992 MULSS X0, X2 33993 MULSS X0, X3 33994 MULSS X0, X4 33995 ADDSS (DX), X1 33996 MOVSS X1, (DX) 33997 LEAQ (DX)(BX*4), DX 33998 ADDSS (DX), X2 33999 MOVSS X2, (DX) 34000 LEAQ (DX)(BX*4), DX 34001 ADDSS (DX), X3 34002 MOVSS X3, (DX) 34003 LEAQ (DX)(BX*4), DX 34004 ADDSS (DX), X4 34005 MOVSS X4, (DX) 34006 LEAQ (DX)(BX*4), DX 34007 SUBQ $0x04, SI 34008 34009 check_limit_unroll: 34010 CMPQ SI, $0x04 34011 JHS loop_unroll 34012 JMP check_limit 34013 34014 loop: 34015 MOVSS (AX), X1 34016 MULSS X0, X1 34017 ADDSS (DX), X1 34018 MOVSS X1, (DX) 34019 DECQ SI 34020 LEAQ (AX)(CX*4), AX 34021 LEAQ (DX)(BX*4), DX 34022 34023 check_limit: 34024 CMPQ SI, $0x00 34025 JHI loop 34026 RET 34027 34028 // func AmdAxpyPointerLoopXInterleave_V3A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34029 // Requires: SSE 34030 TEXT ·AmdAxpyPointerLoopXInterleave_V3A0U4(SB), NOSPLIT, $0-48 34031 MOVSS alpha+0(FP), X0 34032 MOVQ xs+8(FP), AX 34033 MOVQ incx+16(FP), CX 34034 MOVQ CX, DX 34035 SHLQ $0x04, DX 34036 MOVQ ys+24(FP), DX 34037 MOVQ incy+32(FP), BX 34038 MOVQ BX, SI 34039 SHLQ $0x04, SI 34040 MOVQ n+40(FP), SI 34041 JMP check_limit_unroll 34042 34043 loop_unroll: 34044 MOVSS (AX), X1 34045 LEAQ (AX)(CX*4), AX 34046 MOVSS (AX), X2 34047 LEAQ (AX)(CX*4), AX 34048 MOVSS (AX), X3 34049 LEAQ (AX)(CX*4), AX 34050 MOVSS (AX), X4 34051 LEAQ (AX)(CX*4), AX 34052 MULSS X0, X1 34053 MULSS X0, X2 34054 MULSS X0, X3 34055 MULSS X0, X4 34056 ADDSS (DX), X1 34057 MOVSS X1, (DX) 34058 LEAQ (DX)(BX*4), DX 34059 ADDSS (DX), X2 34060 MOVSS X2, (DX) 34061 LEAQ (DX)(BX*4), DX 34062 ADDSS (DX), X3 34063 MOVSS X3, (DX) 34064 LEAQ (DX)(BX*4), DX 34065 ADDSS (DX), X4 34066 MOVSS X4, (DX) 34067 LEAQ (DX)(BX*4), DX 34068 SUBQ $0x04, SI 34069 34070 check_limit_unroll: 34071 CMPQ SI, $0x04 34072 JHS loop_unroll 34073 JMP check_limit 34074 34075 loop: 34076 MOVSS (AX), X1 34077 MULSS X0, X1 34078 ADDSS (DX), X1 34079 MOVSS X1, (DX) 34080 DECQ SI 34081 LEAQ (AX)(CX*4), AX 34082 LEAQ (DX)(BX*4), DX 34083 34084 check_limit: 34085 CMPQ SI, $0x00 34086 JHI loop 34087 RET 34088 34089 // func AmdAxpyPointerLoopXInterleave_V4A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34090 // Requires: SSE 34091 TEXT ·AmdAxpyPointerLoopXInterleave_V4A0U4(SB), NOSPLIT, $0-48 34092 MOVSS alpha+0(FP), X0 34093 MOVQ xs+8(FP), AX 34094 MOVQ incx+16(FP), CX 34095 MOVQ CX, DX 34096 SHLQ $0x04, DX 34097 MOVQ ys+24(FP), DX 34098 MOVQ incy+32(FP), BX 34099 MOVQ BX, SI 34100 SHLQ $0x04, SI 34101 MOVQ n+40(FP), SI 34102 JMP check_limit_unroll 34103 34104 loop_unroll: 34105 MOVSS (AX), X1 34106 LEAQ (AX)(CX*4), AX 34107 MOVSS (AX), X2 34108 LEAQ (AX)(CX*4), AX 34109 MOVSS (AX), 
X3 34110 LEAQ (AX)(CX*4), AX 34111 MOVSS (AX), X4 34112 LEAQ (AX)(CX*4), AX 34113 MULSS X0, X1 34114 MULSS X0, X2 34115 MULSS X0, X3 34116 MULSS X0, X4 34117 ADDSS (DX), X1 34118 MOVSS X1, (DX) 34119 LEAQ (DX)(BX*4), DX 34120 ADDSS (DX), X2 34121 MOVSS X2, (DX) 34122 LEAQ (DX)(BX*4), DX 34123 ADDSS (DX), X3 34124 MOVSS X3, (DX) 34125 LEAQ (DX)(BX*4), DX 34126 ADDSS (DX), X4 34127 MOVSS X4, (DX) 34128 LEAQ (DX)(BX*4), DX 34129 SUBQ $0x04, SI 34130 34131 check_limit_unroll: 34132 CMPQ SI, $0x04 34133 JHS loop_unroll 34134 JMP check_limit 34135 34136 loop: 34137 MOVSS (AX), X1 34138 MULSS X0, X1 34139 ADDSS (DX), X1 34140 MOVSS X1, (DX) 34141 DECQ SI 34142 LEAQ (AX)(CX*4), AX 34143 LEAQ (DX)(BX*4), DX 34144 34145 check_limit: 34146 CMPQ SI, $0x00 34147 JHI loop 34148 RET 34149 34150 // func AmdAxpyPointerLoopXInterleave_V5A0U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34151 // Requires: SSE 34152 TEXT ·AmdAxpyPointerLoopXInterleave_V5A0U4(SB), NOSPLIT, $0-48 34153 MOVSS alpha+0(FP), X0 34154 MOVQ xs+8(FP), AX 34155 MOVQ incx+16(FP), CX 34156 MOVQ CX, DX 34157 SHLQ $0x04, DX 34158 MOVQ ys+24(FP), DX 34159 MOVQ incy+32(FP), BX 34160 MOVQ BX, SI 34161 SHLQ $0x04, SI 34162 MOVQ n+40(FP), SI 34163 JMP check_limit_unroll 34164 34165 loop_unroll: 34166 MOVSS (AX), X1 34167 LEAQ (AX)(CX*4), AX 34168 MOVSS (AX), X2 34169 LEAQ (AX)(CX*4), AX 34170 MOVSS (AX), X3 34171 LEAQ (AX)(CX*4), AX 34172 MOVSS (AX), X4 34173 LEAQ (AX)(CX*4), AX 34174 MULSS X0, X1 34175 MULSS X0, X2 34176 MULSS X0, X3 34177 MULSS X0, X4 34178 ADDSS (DX), X1 34179 MOVSS X1, (DX) 34180 LEAQ (DX)(BX*4), DX 34181 ADDSS (DX), X2 34182 MOVSS X2, (DX) 34183 LEAQ (DX)(BX*4), DX 34184 ADDSS (DX), X3 34185 MOVSS X3, (DX) 34186 LEAQ (DX)(BX*4), DX 34187 ADDSS (DX), X4 34188 MOVSS X4, (DX) 34189 LEAQ (DX)(BX*4), DX 34190 SUBQ $0x04, SI 34191 34192 check_limit_unroll: 34193 CMPQ SI, $0x04 34194 JHS loop_unroll 34195 JMP check_limit 34196 34197 loop: 34198 MOVSS (AX), X1 34199 MULSS X0, X1 34200 ADDSS (DX), X1 34201 MOVSS X1, (DX) 34202 DECQ SI 34203 LEAQ (AX)(CX*4), AX 34204 LEAQ (DX)(BX*4), DX 34205 34206 check_limit: 34207 CMPQ SI, $0x00 34208 JHI loop 34209 RET 34210 34211 // func AmdAxpyPointerLoopXInterleave_V0A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34212 // Requires: SSE 34213 TEXT ·AmdAxpyPointerLoopXInterleave_V0A8U4(SB), NOSPLIT, $0-48 34214 MOVSS alpha+0(FP), X0 34215 MOVQ xs+8(FP), AX 34216 MOVQ incx+16(FP), CX 34217 MOVQ CX, DX 34218 SHLQ $0x04, DX 34219 MOVQ ys+24(FP), DX 34220 MOVQ incy+32(FP), BX 34221 MOVQ BX, SI 34222 SHLQ $0x04, SI 34223 MOVQ n+40(FP), SI 34224 JMP check_limit_unroll 34225 PCALIGN $0x08 34226 34227 loop_unroll: 34228 MOVSS (AX), X1 34229 LEAQ (AX)(CX*4), AX 34230 MOVSS (AX), X2 34231 LEAQ (AX)(CX*4), AX 34232 MOVSS (AX), X3 34233 LEAQ (AX)(CX*4), AX 34234 MOVSS (AX), X4 34235 LEAQ (AX)(CX*4), AX 34236 MULSS X0, X1 34237 MULSS X0, X2 34238 MULSS X0, X3 34239 MULSS X0, X4 34240 ADDSS (DX), X1 34241 MOVSS X1, (DX) 34242 LEAQ (DX)(BX*4), DX 34243 ADDSS (DX), X2 34244 MOVSS X2, (DX) 34245 LEAQ (DX)(BX*4), DX 34246 ADDSS (DX), X3 34247 MOVSS X3, (DX) 34248 LEAQ (DX)(BX*4), DX 34249 ADDSS (DX), X4 34250 MOVSS X4, (DX) 34251 LEAQ (DX)(BX*4), DX 34252 SUBQ $0x04, SI 34253 34254 check_limit_unroll: 34255 CMPQ SI, $0x04 34256 JHS loop_unroll 34257 JMP check_limit 34258 34259 loop: 34260 MOVSS (AX), X1 34261 MULSS X0, X1 34262 ADDSS (DX), X1 34263 MOVSS X1, (DX) 34264 DECQ SI 34265 LEAQ (AX)(CX*4), AX 34266 LEAQ (DX)(BX*4), DX 34267 34268 
check_limit: 34269 CMPQ SI, $0x00 34270 JHI loop 34271 RET 34272 34273 // func AmdAxpyPointerLoopXInterleave_V1A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34274 // Requires: SSE 34275 TEXT ·AmdAxpyPointerLoopXInterleave_V1A8U4(SB), NOSPLIT, $0-48 34276 MOVSS alpha+0(FP), X0 34277 MOVQ xs+8(FP), AX 34278 MOVQ incx+16(FP), CX 34279 MOVQ CX, DX 34280 SHLQ $0x04, DX 34281 MOVQ ys+24(FP), DX 34282 MOVQ incy+32(FP), BX 34283 MOVQ BX, SI 34284 SHLQ $0x04, SI 34285 MOVQ n+40(FP), SI 34286 JMP check_limit_unroll 34287 PCALIGN $0x08 34288 34289 loop_unroll: 34290 MOVSS (AX), X1 34291 LEAQ (AX)(CX*4), AX 34292 MOVSS (AX), X2 34293 LEAQ (AX)(CX*4), AX 34294 MOVSS (AX), X3 34295 LEAQ (AX)(CX*4), AX 34296 MOVSS (AX), X4 34297 LEAQ (AX)(CX*4), AX 34298 MULSS X0, X1 34299 MULSS X0, X2 34300 MULSS X0, X3 34301 MULSS X0, X4 34302 ADDSS (DX), X1 34303 MOVSS X1, (DX) 34304 LEAQ (DX)(BX*4), DX 34305 ADDSS (DX), X2 34306 MOVSS X2, (DX) 34307 LEAQ (DX)(BX*4), DX 34308 ADDSS (DX), X3 34309 MOVSS X3, (DX) 34310 LEAQ (DX)(BX*4), DX 34311 ADDSS (DX), X4 34312 MOVSS X4, (DX) 34313 LEAQ (DX)(BX*4), DX 34314 SUBQ $0x04, SI 34315 34316 check_limit_unroll: 34317 CMPQ SI, $0x04 34318 JHS loop_unroll 34319 JMP check_limit 34320 34321 loop: 34322 MOVSS (AX), X1 34323 MULSS X0, X1 34324 ADDSS (DX), X1 34325 MOVSS X1, (DX) 34326 DECQ SI 34327 LEAQ (AX)(CX*4), AX 34328 LEAQ (DX)(BX*4), DX 34329 34330 check_limit: 34331 CMPQ SI, $0x00 34332 JHI loop 34333 RET 34334 34335 // func AmdAxpyPointerLoopXInterleave_V2A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34336 // Requires: SSE 34337 TEXT ·AmdAxpyPointerLoopXInterleave_V2A8U4(SB), NOSPLIT, $0-48 34338 MOVSS alpha+0(FP), X0 34339 MOVQ xs+8(FP), AX 34340 MOVQ incx+16(FP), CX 34341 MOVQ CX, DX 34342 SHLQ $0x04, DX 34343 MOVQ ys+24(FP), DX 34344 MOVQ incy+32(FP), BX 34345 MOVQ BX, SI 34346 SHLQ $0x04, SI 34347 MOVQ n+40(FP), SI 34348 JMP check_limit_unroll 34349 PCALIGN $0x08 34350 34351 loop_unroll: 34352 MOVSS (AX), X1 34353 LEAQ (AX)(CX*4), AX 34354 MOVSS (AX), X2 34355 LEAQ (AX)(CX*4), AX 34356 MOVSS (AX), X3 34357 LEAQ (AX)(CX*4), AX 34358 MOVSS (AX), X4 34359 LEAQ (AX)(CX*4), AX 34360 MULSS X0, X1 34361 MULSS X0, X2 34362 MULSS X0, X3 34363 MULSS X0, X4 34364 ADDSS (DX), X1 34365 MOVSS X1, (DX) 34366 LEAQ (DX)(BX*4), DX 34367 ADDSS (DX), X2 34368 MOVSS X2, (DX) 34369 LEAQ (DX)(BX*4), DX 34370 ADDSS (DX), X3 34371 MOVSS X3, (DX) 34372 LEAQ (DX)(BX*4), DX 34373 ADDSS (DX), X4 34374 MOVSS X4, (DX) 34375 LEAQ (DX)(BX*4), DX 34376 SUBQ $0x04, SI 34377 34378 check_limit_unroll: 34379 CMPQ SI, $0x04 34380 JHS loop_unroll 34381 JMP check_limit 34382 34383 loop: 34384 MOVSS (AX), X1 34385 MULSS X0, X1 34386 ADDSS (DX), X1 34387 MOVSS X1, (DX) 34388 DECQ SI 34389 LEAQ (AX)(CX*4), AX 34390 LEAQ (DX)(BX*4), DX 34391 34392 check_limit: 34393 CMPQ SI, $0x00 34394 JHI loop 34395 RET 34396 34397 // func AmdAxpyPointerLoopXInterleave_V3A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34398 // Requires: SSE 34399 TEXT ·AmdAxpyPointerLoopXInterleave_V3A8U4(SB), NOSPLIT, $0-48 34400 MOVSS alpha+0(FP), X0 34401 MOVQ xs+8(FP), AX 34402 MOVQ incx+16(FP), CX 34403 MOVQ CX, DX 34404 SHLQ $0x04, DX 34405 MOVQ ys+24(FP), DX 34406 MOVQ incy+32(FP), BX 34407 MOVQ BX, SI 34408 SHLQ $0x04, SI 34409 MOVQ n+40(FP), SI 34410 JMP check_limit_unroll 34411 PCALIGN $0x08 34412 34413 loop_unroll: 34414 MOVSS (AX), X1 34415 LEAQ (AX)(CX*4), AX 34416 MOVSS (AX), X2 34417 LEAQ (AX)(CX*4), AX 34418 MOVSS 
(AX), X3 34419 LEAQ (AX)(CX*4), AX 34420 MOVSS (AX), X4 34421 LEAQ (AX)(CX*4), AX 34422 MULSS X0, X1 34423 MULSS X0, X2 34424 MULSS X0, X3 34425 MULSS X0, X4 34426 ADDSS (DX), X1 34427 MOVSS X1, (DX) 34428 LEAQ (DX)(BX*4), DX 34429 ADDSS (DX), X2 34430 MOVSS X2, (DX) 34431 LEAQ (DX)(BX*4), DX 34432 ADDSS (DX), X3 34433 MOVSS X3, (DX) 34434 LEAQ (DX)(BX*4), DX 34435 ADDSS (DX), X4 34436 MOVSS X4, (DX) 34437 LEAQ (DX)(BX*4), DX 34438 SUBQ $0x04, SI 34439 34440 check_limit_unroll: 34441 CMPQ SI, $0x04 34442 JHS loop_unroll 34443 JMP check_limit 34444 34445 loop: 34446 MOVSS (AX), X1 34447 MULSS X0, X1 34448 ADDSS (DX), X1 34449 MOVSS X1, (DX) 34450 DECQ SI 34451 LEAQ (AX)(CX*4), AX 34452 LEAQ (DX)(BX*4), DX 34453 34454 check_limit: 34455 CMPQ SI, $0x00 34456 JHI loop 34457 RET 34458 34459 // func AmdAxpyPointerLoopXInterleave_V4A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34460 // Requires: SSE 34461 TEXT ·AmdAxpyPointerLoopXInterleave_V4A8U4(SB), NOSPLIT, $0-48 34462 MOVSS alpha+0(FP), X0 34463 MOVQ xs+8(FP), AX 34464 MOVQ incx+16(FP), CX 34465 MOVQ CX, DX 34466 SHLQ $0x04, DX 34467 MOVQ ys+24(FP), DX 34468 MOVQ incy+32(FP), BX 34469 MOVQ BX, SI 34470 SHLQ $0x04, SI 34471 MOVQ n+40(FP), SI 34472 JMP check_limit_unroll 34473 PCALIGN $0x08 34474 34475 loop_unroll: 34476 MOVSS (AX), X1 34477 LEAQ (AX)(CX*4), AX 34478 MOVSS (AX), X2 34479 LEAQ (AX)(CX*4), AX 34480 MOVSS (AX), X3 34481 LEAQ (AX)(CX*4), AX 34482 MOVSS (AX), X4 34483 LEAQ (AX)(CX*4), AX 34484 MULSS X0, X1 34485 MULSS X0, X2 34486 MULSS X0, X3 34487 MULSS X0, X4 34488 ADDSS (DX), X1 34489 MOVSS X1, (DX) 34490 LEAQ (DX)(BX*4), DX 34491 ADDSS (DX), X2 34492 MOVSS X2, (DX) 34493 LEAQ (DX)(BX*4), DX 34494 ADDSS (DX), X3 34495 MOVSS X3, (DX) 34496 LEAQ (DX)(BX*4), DX 34497 ADDSS (DX), X4 34498 MOVSS X4, (DX) 34499 LEAQ (DX)(BX*4), DX 34500 SUBQ $0x04, SI 34501 34502 check_limit_unroll: 34503 CMPQ SI, $0x04 34504 JHS loop_unroll 34505 JMP check_limit 34506 34507 loop: 34508 MOVSS (AX), X1 34509 MULSS X0, X1 34510 ADDSS (DX), X1 34511 MOVSS X1, (DX) 34512 DECQ SI 34513 LEAQ (AX)(CX*4), AX 34514 LEAQ (DX)(BX*4), DX 34515 34516 check_limit: 34517 CMPQ SI, $0x00 34518 JHI loop 34519 RET 34520 34521 // func AmdAxpyPointerLoopXInterleave_V5A8U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34522 // Requires: SSE 34523 TEXT ·AmdAxpyPointerLoopXInterleave_V5A8U4(SB), NOSPLIT, $0-48 34524 MOVSS alpha+0(FP), X0 34525 MOVQ xs+8(FP), AX 34526 MOVQ incx+16(FP), CX 34527 MOVQ CX, DX 34528 SHLQ $0x04, DX 34529 MOVQ ys+24(FP), DX 34530 MOVQ incy+32(FP), BX 34531 MOVQ BX, SI 34532 SHLQ $0x04, SI 34533 MOVQ n+40(FP), SI 34534 JMP check_limit_unroll 34535 PCALIGN $0x08 34536 34537 loop_unroll: 34538 MOVSS (AX), X1 34539 LEAQ (AX)(CX*4), AX 34540 MOVSS (AX), X2 34541 LEAQ (AX)(CX*4), AX 34542 MOVSS (AX), X3 34543 LEAQ (AX)(CX*4), AX 34544 MOVSS (AX), X4 34545 LEAQ (AX)(CX*4), AX 34546 MULSS X0, X1 34547 MULSS X0, X2 34548 MULSS X0, X3 34549 MULSS X0, X4 34550 ADDSS (DX), X1 34551 MOVSS X1, (DX) 34552 LEAQ (DX)(BX*4), DX 34553 ADDSS (DX), X2 34554 MOVSS X2, (DX) 34555 LEAQ (DX)(BX*4), DX 34556 ADDSS (DX), X3 34557 MOVSS X3, (DX) 34558 LEAQ (DX)(BX*4), DX 34559 ADDSS (DX), X4 34560 MOVSS X4, (DX) 34561 LEAQ (DX)(BX*4), DX 34562 SUBQ $0x04, SI 34563 34564 check_limit_unroll: 34565 CMPQ SI, $0x04 34566 JHS loop_unroll 34567 JMP check_limit 34568 34569 loop: 34570 MOVSS (AX), X1 34571 MULSS X0, X1 34572 ADDSS (DX), X1 34573 MOVSS X1, (DX) 34574 DECQ SI 34575 LEAQ (AX)(CX*4), AX 34576 LEAQ 
(DX)(BX*4), DX 34577 34578 check_limit: 34579 CMPQ SI, $0x00 34580 JHI loop 34581 RET 34582 34583 // func AmdAxpyPointerLoopXInterleave_V0A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34584 // Requires: SSE 34585 TEXT ·AmdAxpyPointerLoopXInterleave_V0A9U4(SB), NOSPLIT, $0-48 34586 MOVSS alpha+0(FP), X0 34587 MOVQ xs+8(FP), AX 34588 MOVQ incx+16(FP), CX 34589 MOVQ CX, DX 34590 SHLQ $0x04, DX 34591 MOVQ ys+24(FP), DX 34592 MOVQ incy+32(FP), BX 34593 MOVQ BX, SI 34594 SHLQ $0x04, SI 34595 MOVQ n+40(FP), SI 34596 JMP check_limit_unroll 34597 PCALIGN $0x08 34598 NOP 34599 34600 loop_unroll: 34601 MOVSS (AX), X1 34602 LEAQ (AX)(CX*4), AX 34603 MOVSS (AX), X2 34604 LEAQ (AX)(CX*4), AX 34605 MOVSS (AX), X3 34606 LEAQ (AX)(CX*4), AX 34607 MOVSS (AX), X4 34608 LEAQ (AX)(CX*4), AX 34609 MULSS X0, X1 34610 MULSS X0, X2 34611 MULSS X0, X3 34612 MULSS X0, X4 34613 ADDSS (DX), X1 34614 MOVSS X1, (DX) 34615 LEAQ (DX)(BX*4), DX 34616 ADDSS (DX), X2 34617 MOVSS X2, (DX) 34618 LEAQ (DX)(BX*4), DX 34619 ADDSS (DX), X3 34620 MOVSS X3, (DX) 34621 LEAQ (DX)(BX*4), DX 34622 ADDSS (DX), X4 34623 MOVSS X4, (DX) 34624 LEAQ (DX)(BX*4), DX 34625 SUBQ $0x04, SI 34626 34627 check_limit_unroll: 34628 CMPQ SI, $0x04 34629 JHS loop_unroll 34630 JMP check_limit 34631 34632 loop: 34633 MOVSS (AX), X1 34634 MULSS X0, X1 34635 ADDSS (DX), X1 34636 MOVSS X1, (DX) 34637 DECQ SI 34638 LEAQ (AX)(CX*4), AX 34639 LEAQ (DX)(BX*4), DX 34640 34641 check_limit: 34642 CMPQ SI, $0x00 34643 JHI loop 34644 RET 34645 34646 // func AmdAxpyPointerLoopXInterleave_V1A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34647 // Requires: SSE 34648 TEXT ·AmdAxpyPointerLoopXInterleave_V1A9U4(SB), NOSPLIT, $0-48 34649 MOVSS alpha+0(FP), X0 34650 MOVQ xs+8(FP), AX 34651 MOVQ incx+16(FP), CX 34652 MOVQ CX, DX 34653 SHLQ $0x04, DX 34654 MOVQ ys+24(FP), DX 34655 MOVQ incy+32(FP), BX 34656 MOVQ BX, SI 34657 SHLQ $0x04, SI 34658 MOVQ n+40(FP), SI 34659 JMP check_limit_unroll 34660 PCALIGN $0x08 34661 NOP 34662 34663 loop_unroll: 34664 MOVSS (AX), X1 34665 LEAQ (AX)(CX*4), AX 34666 MOVSS (AX), X2 34667 LEAQ (AX)(CX*4), AX 34668 MOVSS (AX), X3 34669 LEAQ (AX)(CX*4), AX 34670 MOVSS (AX), X4 34671 LEAQ (AX)(CX*4), AX 34672 MULSS X0, X1 34673 MULSS X0, X2 34674 MULSS X0, X3 34675 MULSS X0, X4 34676 ADDSS (DX), X1 34677 MOVSS X1, (DX) 34678 LEAQ (DX)(BX*4), DX 34679 ADDSS (DX), X2 34680 MOVSS X2, (DX) 34681 LEAQ (DX)(BX*4), DX 34682 ADDSS (DX), X3 34683 MOVSS X3, (DX) 34684 LEAQ (DX)(BX*4), DX 34685 ADDSS (DX), X4 34686 MOVSS X4, (DX) 34687 LEAQ (DX)(BX*4), DX 34688 SUBQ $0x04, SI 34689 34690 check_limit_unroll: 34691 CMPQ SI, $0x04 34692 JHS loop_unroll 34693 JMP check_limit 34694 34695 loop: 34696 MOVSS (AX), X1 34697 MULSS X0, X1 34698 ADDSS (DX), X1 34699 MOVSS X1, (DX) 34700 DECQ SI 34701 LEAQ (AX)(CX*4), AX 34702 LEAQ (DX)(BX*4), DX 34703 34704 check_limit: 34705 CMPQ SI, $0x00 34706 JHI loop 34707 RET 34708 34709 // func AmdAxpyPointerLoopXInterleave_V2A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34710 // Requires: SSE 34711 TEXT ·AmdAxpyPointerLoopXInterleave_V2A9U4(SB), NOSPLIT, $0-48 34712 MOVSS alpha+0(FP), X0 34713 MOVQ xs+8(FP), AX 34714 MOVQ incx+16(FP), CX 34715 MOVQ CX, DX 34716 SHLQ $0x04, DX 34717 MOVQ ys+24(FP), DX 34718 MOVQ incy+32(FP), BX 34719 MOVQ BX, SI 34720 SHLQ $0x04, SI 34721 MOVQ n+40(FP), SI 34722 JMP check_limit_unroll 34723 PCALIGN $0x08 34724 NOP 34725 34726 loop_unroll: 34727 MOVSS (AX), X1 34728 LEAQ (AX)(CX*4), AX 
34729 MOVSS (AX), X2 34730 LEAQ (AX)(CX*4), AX 34731 MOVSS (AX), X3 34732 LEAQ (AX)(CX*4), AX 34733 MOVSS (AX), X4 34734 LEAQ (AX)(CX*4), AX 34735 MULSS X0, X1 34736 MULSS X0, X2 34737 MULSS X0, X3 34738 MULSS X0, X4 34739 ADDSS (DX), X1 34740 MOVSS X1, (DX) 34741 LEAQ (DX)(BX*4), DX 34742 ADDSS (DX), X2 34743 MOVSS X2, (DX) 34744 LEAQ (DX)(BX*4), DX 34745 ADDSS (DX), X3 34746 MOVSS X3, (DX) 34747 LEAQ (DX)(BX*4), DX 34748 ADDSS (DX), X4 34749 MOVSS X4, (DX) 34750 LEAQ (DX)(BX*4), DX 34751 SUBQ $0x04, SI 34752 34753 check_limit_unroll: 34754 CMPQ SI, $0x04 34755 JHS loop_unroll 34756 JMP check_limit 34757 34758 loop: 34759 MOVSS (AX), X1 34760 MULSS X0, X1 34761 ADDSS (DX), X1 34762 MOVSS X1, (DX) 34763 DECQ SI 34764 LEAQ (AX)(CX*4), AX 34765 LEAQ (DX)(BX*4), DX 34766 34767 check_limit: 34768 CMPQ SI, $0x00 34769 JHI loop 34770 RET 34771 34772 // func AmdAxpyPointerLoopXInterleave_V3A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34773 // Requires: SSE 34774 TEXT ·AmdAxpyPointerLoopXInterleave_V3A9U4(SB), NOSPLIT, $0-48 34775 MOVSS alpha+0(FP), X0 34776 MOVQ xs+8(FP), AX 34777 MOVQ incx+16(FP), CX 34778 MOVQ CX, DX 34779 SHLQ $0x04, DX 34780 MOVQ ys+24(FP), DX 34781 MOVQ incy+32(FP), BX 34782 MOVQ BX, SI 34783 SHLQ $0x04, SI 34784 MOVQ n+40(FP), SI 34785 JMP check_limit_unroll 34786 PCALIGN $0x08 34787 NOP 34788 34789 loop_unroll: 34790 MOVSS (AX), X1 34791 LEAQ (AX)(CX*4), AX 34792 MOVSS (AX), X2 34793 LEAQ (AX)(CX*4), AX 34794 MOVSS (AX), X3 34795 LEAQ (AX)(CX*4), AX 34796 MOVSS (AX), X4 34797 LEAQ (AX)(CX*4), AX 34798 MULSS X0, X1 34799 MULSS X0, X2 34800 MULSS X0, X3 34801 MULSS X0, X4 34802 ADDSS (DX), X1 34803 MOVSS X1, (DX) 34804 LEAQ (DX)(BX*4), DX 34805 ADDSS (DX), X2 34806 MOVSS X2, (DX) 34807 LEAQ (DX)(BX*4), DX 34808 ADDSS (DX), X3 34809 MOVSS X3, (DX) 34810 LEAQ (DX)(BX*4), DX 34811 ADDSS (DX), X4 34812 MOVSS X4, (DX) 34813 LEAQ (DX)(BX*4), DX 34814 SUBQ $0x04, SI 34815 34816 check_limit_unroll: 34817 CMPQ SI, $0x04 34818 JHS loop_unroll 34819 JMP check_limit 34820 34821 loop: 34822 MOVSS (AX), X1 34823 MULSS X0, X1 34824 ADDSS (DX), X1 34825 MOVSS X1, (DX) 34826 DECQ SI 34827 LEAQ (AX)(CX*4), AX 34828 LEAQ (DX)(BX*4), DX 34829 34830 check_limit: 34831 CMPQ SI, $0x00 34832 JHI loop 34833 RET 34834 34835 // func AmdAxpyPointerLoopXInterleave_V4A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34836 // Requires: SSE 34837 TEXT ·AmdAxpyPointerLoopXInterleave_V4A9U4(SB), NOSPLIT, $0-48 34838 MOVSS alpha+0(FP), X0 34839 MOVQ xs+8(FP), AX 34840 MOVQ incx+16(FP), CX 34841 MOVQ CX, DX 34842 SHLQ $0x04, DX 34843 MOVQ ys+24(FP), DX 34844 MOVQ incy+32(FP), BX 34845 MOVQ BX, SI 34846 SHLQ $0x04, SI 34847 MOVQ n+40(FP), SI 34848 JMP check_limit_unroll 34849 PCALIGN $0x08 34850 NOP 34851 34852 loop_unroll: 34853 MOVSS (AX), X1 34854 LEAQ (AX)(CX*4), AX 34855 MOVSS (AX), X2 34856 LEAQ (AX)(CX*4), AX 34857 MOVSS (AX), X3 34858 LEAQ (AX)(CX*4), AX 34859 MOVSS (AX), X4 34860 LEAQ (AX)(CX*4), AX 34861 MULSS X0, X1 34862 MULSS X0, X2 34863 MULSS X0, X3 34864 MULSS X0, X4 34865 ADDSS (DX), X1 34866 MOVSS X1, (DX) 34867 LEAQ (DX)(BX*4), DX 34868 ADDSS (DX), X2 34869 MOVSS X2, (DX) 34870 LEAQ (DX)(BX*4), DX 34871 ADDSS (DX), X3 34872 MOVSS X3, (DX) 34873 LEAQ (DX)(BX*4), DX 34874 ADDSS (DX), X4 34875 MOVSS X4, (DX) 34876 LEAQ (DX)(BX*4), DX 34877 SUBQ $0x04, SI 34878 34879 check_limit_unroll: 34880 CMPQ SI, $0x04 34881 JHS loop_unroll 34882 JMP check_limit 34883 34884 loop: 34885 MOVSS (AX), X1 34886 MULSS X0, X1 34887 ADDSS 
(DX), X1 34888 MOVSS X1, (DX) 34889 DECQ SI 34890 LEAQ (AX)(CX*4), AX 34891 LEAQ (DX)(BX*4), DX 34892 34893 check_limit: 34894 CMPQ SI, $0x00 34895 JHI loop 34896 RET 34897 34898 // func AmdAxpyPointerLoopXInterleave_V5A9U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34899 // Requires: SSE 34900 TEXT ·AmdAxpyPointerLoopXInterleave_V5A9U4(SB), NOSPLIT, $0-48 34901 MOVSS alpha+0(FP), X0 34902 MOVQ xs+8(FP), AX 34903 MOVQ incx+16(FP), CX 34904 MOVQ CX, DX 34905 SHLQ $0x04, DX 34906 MOVQ ys+24(FP), DX 34907 MOVQ incy+32(FP), BX 34908 MOVQ BX, SI 34909 SHLQ $0x04, SI 34910 MOVQ n+40(FP), SI 34911 JMP check_limit_unroll 34912 PCALIGN $0x08 34913 NOP 34914 34915 loop_unroll: 34916 MOVSS (AX), X1 34917 LEAQ (AX)(CX*4), AX 34918 MOVSS (AX), X2 34919 LEAQ (AX)(CX*4), AX 34920 MOVSS (AX), X3 34921 LEAQ (AX)(CX*4), AX 34922 MOVSS (AX), X4 34923 LEAQ (AX)(CX*4), AX 34924 MULSS X0, X1 34925 MULSS X0, X2 34926 MULSS X0, X3 34927 MULSS X0, X4 34928 ADDSS (DX), X1 34929 MOVSS X1, (DX) 34930 LEAQ (DX)(BX*4), DX 34931 ADDSS (DX), X2 34932 MOVSS X2, (DX) 34933 LEAQ (DX)(BX*4), DX 34934 ADDSS (DX), X3 34935 MOVSS X3, (DX) 34936 LEAQ (DX)(BX*4), DX 34937 ADDSS (DX), X4 34938 MOVSS X4, (DX) 34939 LEAQ (DX)(BX*4), DX 34940 SUBQ $0x04, SI 34941 34942 check_limit_unroll: 34943 CMPQ SI, $0x04 34944 JHS loop_unroll 34945 JMP check_limit 34946 34947 loop: 34948 MOVSS (AX), X1 34949 MULSS X0, X1 34950 ADDSS (DX), X1 34951 MOVSS X1, (DX) 34952 DECQ SI 34953 LEAQ (AX)(CX*4), AX 34954 LEAQ (DX)(BX*4), DX 34955 34956 check_limit: 34957 CMPQ SI, $0x00 34958 JHI loop 34959 RET 34960 34961 // func AmdAxpyPointerLoopXInterleave_V0A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 34962 // Requires: SSE 34963 TEXT ·AmdAxpyPointerLoopXInterleave_V0A10U4(SB), NOSPLIT, $0-48 34964 MOVSS alpha+0(FP), X0 34965 MOVQ xs+8(FP), AX 34966 MOVQ incx+16(FP), CX 34967 MOVQ CX, DX 34968 SHLQ $0x04, DX 34969 MOVQ ys+24(FP), DX 34970 MOVQ incy+32(FP), BX 34971 MOVQ BX, SI 34972 SHLQ $0x04, SI 34973 MOVQ n+40(FP), SI 34974 JMP check_limit_unroll 34975 PCALIGN $0x08 34976 NOP 34977 NOP 34978 34979 loop_unroll: 34980 MOVSS (AX), X1 34981 LEAQ (AX)(CX*4), AX 34982 MOVSS (AX), X2 34983 LEAQ (AX)(CX*4), AX 34984 MOVSS (AX), X3 34985 LEAQ (AX)(CX*4), AX 34986 MOVSS (AX), X4 34987 LEAQ (AX)(CX*4), AX 34988 MULSS X0, X1 34989 MULSS X0, X2 34990 MULSS X0, X3 34991 MULSS X0, X4 34992 ADDSS (DX), X1 34993 MOVSS X1, (DX) 34994 LEAQ (DX)(BX*4), DX 34995 ADDSS (DX), X2 34996 MOVSS X2, (DX) 34997 LEAQ (DX)(BX*4), DX 34998 ADDSS (DX), X3 34999 MOVSS X3, (DX) 35000 LEAQ (DX)(BX*4), DX 35001 ADDSS (DX), X4 35002 MOVSS X4, (DX) 35003 LEAQ (DX)(BX*4), DX 35004 SUBQ $0x04, SI 35005 35006 check_limit_unroll: 35007 CMPQ SI, $0x04 35008 JHS loop_unroll 35009 JMP check_limit 35010 35011 loop: 35012 MOVSS (AX), X1 35013 MULSS X0, X1 35014 ADDSS (DX), X1 35015 MOVSS X1, (DX) 35016 DECQ SI 35017 LEAQ (AX)(CX*4), AX 35018 LEAQ (DX)(BX*4), DX 35019 35020 check_limit: 35021 CMPQ SI, $0x00 35022 JHI loop 35023 RET 35024 35025 // func AmdAxpyPointerLoopXInterleave_V1A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35026 // Requires: SSE 35027 TEXT ·AmdAxpyPointerLoopXInterleave_V1A10U4(SB), NOSPLIT, $0-48 35028 MOVSS alpha+0(FP), X0 35029 MOVQ xs+8(FP), AX 35030 MOVQ incx+16(FP), CX 35031 MOVQ CX, DX 35032 SHLQ $0x04, DX 35033 MOVQ ys+24(FP), DX 35034 MOVQ incy+32(FP), BX 35035 MOVQ BX, SI 35036 SHLQ $0x04, SI 35037 MOVQ n+40(FP), SI 35038 JMP check_limit_unroll 35039 
PCALIGN $0x08 35040 NOP 35041 NOP 35042 35043 loop_unroll: 35044 MOVSS (AX), X1 35045 LEAQ (AX)(CX*4), AX 35046 MOVSS (AX), X2 35047 LEAQ (AX)(CX*4), AX 35048 MOVSS (AX), X3 35049 LEAQ (AX)(CX*4), AX 35050 MOVSS (AX), X4 35051 LEAQ (AX)(CX*4), AX 35052 MULSS X0, X1 35053 MULSS X0, X2 35054 MULSS X0, X3 35055 MULSS X0, X4 35056 ADDSS (DX), X1 35057 MOVSS X1, (DX) 35058 LEAQ (DX)(BX*4), DX 35059 ADDSS (DX), X2 35060 MOVSS X2, (DX) 35061 LEAQ (DX)(BX*4), DX 35062 ADDSS (DX), X3 35063 MOVSS X3, (DX) 35064 LEAQ (DX)(BX*4), DX 35065 ADDSS (DX), X4 35066 MOVSS X4, (DX) 35067 LEAQ (DX)(BX*4), DX 35068 SUBQ $0x04, SI 35069 35070 check_limit_unroll: 35071 CMPQ SI, $0x04 35072 JHS loop_unroll 35073 JMP check_limit 35074 35075 loop: 35076 MOVSS (AX), X1 35077 MULSS X0, X1 35078 ADDSS (DX), X1 35079 MOVSS X1, (DX) 35080 DECQ SI 35081 LEAQ (AX)(CX*4), AX 35082 LEAQ (DX)(BX*4), DX 35083 35084 check_limit: 35085 CMPQ SI, $0x00 35086 JHI loop 35087 RET 35088 35089 // func AmdAxpyPointerLoopXInterleave_V2A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35090 // Requires: SSE 35091 TEXT ·AmdAxpyPointerLoopXInterleave_V2A10U4(SB), NOSPLIT, $0-48 35092 MOVSS alpha+0(FP), X0 35093 MOVQ xs+8(FP), AX 35094 MOVQ incx+16(FP), CX 35095 MOVQ CX, DX 35096 SHLQ $0x04, DX 35097 MOVQ ys+24(FP), DX 35098 MOVQ incy+32(FP), BX 35099 MOVQ BX, SI 35100 SHLQ $0x04, SI 35101 MOVQ n+40(FP), SI 35102 JMP check_limit_unroll 35103 PCALIGN $0x08 35104 NOP 35105 NOP 35106 35107 loop_unroll: 35108 MOVSS (AX), X1 35109 LEAQ (AX)(CX*4), AX 35110 MOVSS (AX), X2 35111 LEAQ (AX)(CX*4), AX 35112 MOVSS (AX), X3 35113 LEAQ (AX)(CX*4), AX 35114 MOVSS (AX), X4 35115 LEAQ (AX)(CX*4), AX 35116 MULSS X0, X1 35117 MULSS X0, X2 35118 MULSS X0, X3 35119 MULSS X0, X4 35120 ADDSS (DX), X1 35121 MOVSS X1, (DX) 35122 LEAQ (DX)(BX*4), DX 35123 ADDSS (DX), X2 35124 MOVSS X2, (DX) 35125 LEAQ (DX)(BX*4), DX 35126 ADDSS (DX), X3 35127 MOVSS X3, (DX) 35128 LEAQ (DX)(BX*4), DX 35129 ADDSS (DX), X4 35130 MOVSS X4, (DX) 35131 LEAQ (DX)(BX*4), DX 35132 SUBQ $0x04, SI 35133 35134 check_limit_unroll: 35135 CMPQ SI, $0x04 35136 JHS loop_unroll 35137 JMP check_limit 35138 35139 loop: 35140 MOVSS (AX), X1 35141 MULSS X0, X1 35142 ADDSS (DX), X1 35143 MOVSS X1, (DX) 35144 DECQ SI 35145 LEAQ (AX)(CX*4), AX 35146 LEAQ (DX)(BX*4), DX 35147 35148 check_limit: 35149 CMPQ SI, $0x00 35150 JHI loop 35151 RET 35152 35153 // func AmdAxpyPointerLoopXInterleave_V3A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35154 // Requires: SSE 35155 TEXT ·AmdAxpyPointerLoopXInterleave_V3A10U4(SB), NOSPLIT, $0-48 35156 MOVSS alpha+0(FP), X0 35157 MOVQ xs+8(FP), AX 35158 MOVQ incx+16(FP), CX 35159 MOVQ CX, DX 35160 SHLQ $0x04, DX 35161 MOVQ ys+24(FP), DX 35162 MOVQ incy+32(FP), BX 35163 MOVQ BX, SI 35164 SHLQ $0x04, SI 35165 MOVQ n+40(FP), SI 35166 JMP check_limit_unroll 35167 PCALIGN $0x08 35168 NOP 35169 NOP 35170 35171 loop_unroll: 35172 MOVSS (AX), X1 35173 LEAQ (AX)(CX*4), AX 35174 MOVSS (AX), X2 35175 LEAQ (AX)(CX*4), AX 35176 MOVSS (AX), X3 35177 LEAQ (AX)(CX*4), AX 35178 MOVSS (AX), X4 35179 LEAQ (AX)(CX*4), AX 35180 MULSS X0, X1 35181 MULSS X0, X2 35182 MULSS X0, X3 35183 MULSS X0, X4 35184 ADDSS (DX), X1 35185 MOVSS X1, (DX) 35186 LEAQ (DX)(BX*4), DX 35187 ADDSS (DX), X2 35188 MOVSS X2, (DX) 35189 LEAQ (DX)(BX*4), DX 35190 ADDSS (DX), X3 35191 MOVSS X3, (DX) 35192 LEAQ (DX)(BX*4), DX 35193 ADDSS (DX), X4 35194 MOVSS X4, (DX) 35195 LEAQ (DX)(BX*4), DX 35196 SUBQ $0x04, SI 35197 35198 check_limit_unroll: 35199 
CMPQ SI, $0x04 35200 JHS loop_unroll 35201 JMP check_limit 35202 35203 loop: 35204 MOVSS (AX), X1 35205 MULSS X0, X1 35206 ADDSS (DX), X1 35207 MOVSS X1, (DX) 35208 DECQ SI 35209 LEAQ (AX)(CX*4), AX 35210 LEAQ (DX)(BX*4), DX 35211 35212 check_limit: 35213 CMPQ SI, $0x00 35214 JHI loop 35215 RET 35216 35217 // func AmdAxpyPointerLoopXInterleave_V4A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35218 // Requires: SSE 35219 TEXT ·AmdAxpyPointerLoopXInterleave_V4A10U4(SB), NOSPLIT, $0-48 35220 MOVSS alpha+0(FP), X0 35221 MOVQ xs+8(FP), AX 35222 MOVQ incx+16(FP), CX 35223 MOVQ CX, DX 35224 SHLQ $0x04, DX 35225 MOVQ ys+24(FP), DX 35226 MOVQ incy+32(FP), BX 35227 MOVQ BX, SI 35228 SHLQ $0x04, SI 35229 MOVQ n+40(FP), SI 35230 JMP check_limit_unroll 35231 PCALIGN $0x08 35232 NOP 35233 NOP 35234 35235 loop_unroll: 35236 MOVSS (AX), X1 35237 LEAQ (AX)(CX*4), AX 35238 MOVSS (AX), X2 35239 LEAQ (AX)(CX*4), AX 35240 MOVSS (AX), X3 35241 LEAQ (AX)(CX*4), AX 35242 MOVSS (AX), X4 35243 LEAQ (AX)(CX*4), AX 35244 MULSS X0, X1 35245 MULSS X0, X2 35246 MULSS X0, X3 35247 MULSS X0, X4 35248 ADDSS (DX), X1 35249 MOVSS X1, (DX) 35250 LEAQ (DX)(BX*4), DX 35251 ADDSS (DX), X2 35252 MOVSS X2, (DX) 35253 LEAQ (DX)(BX*4), DX 35254 ADDSS (DX), X3 35255 MOVSS X3, (DX) 35256 LEAQ (DX)(BX*4), DX 35257 ADDSS (DX), X4 35258 MOVSS X4, (DX) 35259 LEAQ (DX)(BX*4), DX 35260 SUBQ $0x04, SI 35261 35262 check_limit_unroll: 35263 CMPQ SI, $0x04 35264 JHS loop_unroll 35265 JMP check_limit 35266 35267 loop: 35268 MOVSS (AX), X1 35269 MULSS X0, X1 35270 ADDSS (DX), X1 35271 MOVSS X1, (DX) 35272 DECQ SI 35273 LEAQ (AX)(CX*4), AX 35274 LEAQ (DX)(BX*4), DX 35275 35276 check_limit: 35277 CMPQ SI, $0x00 35278 JHI loop 35279 RET 35280 35281 // func AmdAxpyPointerLoopXInterleave_V5A10U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35282 // Requires: SSE 35283 TEXT ·AmdAxpyPointerLoopXInterleave_V5A10U4(SB), NOSPLIT, $0-48 35284 MOVSS alpha+0(FP), X0 35285 MOVQ xs+8(FP), AX 35286 MOVQ incx+16(FP), CX 35287 MOVQ CX, DX 35288 SHLQ $0x04, DX 35289 MOVQ ys+24(FP), DX 35290 MOVQ incy+32(FP), BX 35291 MOVQ BX, SI 35292 SHLQ $0x04, SI 35293 MOVQ n+40(FP), SI 35294 JMP check_limit_unroll 35295 PCALIGN $0x08 35296 NOP 35297 NOP 35298 35299 loop_unroll: 35300 MOVSS (AX), X1 35301 LEAQ (AX)(CX*4), AX 35302 MOVSS (AX), X2 35303 LEAQ (AX)(CX*4), AX 35304 MOVSS (AX), X3 35305 LEAQ (AX)(CX*4), AX 35306 MOVSS (AX), X4 35307 LEAQ (AX)(CX*4), AX 35308 MULSS X0, X1 35309 MULSS X0, X2 35310 MULSS X0, X3 35311 MULSS X0, X4 35312 ADDSS (DX), X1 35313 MOVSS X1, (DX) 35314 LEAQ (DX)(BX*4), DX 35315 ADDSS (DX), X2 35316 MOVSS X2, (DX) 35317 LEAQ (DX)(BX*4), DX 35318 ADDSS (DX), X3 35319 MOVSS X3, (DX) 35320 LEAQ (DX)(BX*4), DX 35321 ADDSS (DX), X4 35322 MOVSS X4, (DX) 35323 LEAQ (DX)(BX*4), DX 35324 SUBQ $0x04, SI 35325 35326 check_limit_unroll: 35327 CMPQ SI, $0x04 35328 JHS loop_unroll 35329 JMP check_limit 35330 35331 loop: 35332 MOVSS (AX), X1 35333 MULSS X0, X1 35334 ADDSS (DX), X1 35335 MOVSS X1, (DX) 35336 DECQ SI 35337 LEAQ (AX)(CX*4), AX 35338 LEAQ (DX)(BX*4), DX 35339 35340 check_limit: 35341 CMPQ SI, $0x00 35342 JHI loop 35343 RET 35344 35345 // func AmdAxpyPointerLoopXInterleave_V0A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35346 // Requires: SSE 35347 TEXT ·AmdAxpyPointerLoopXInterleave_V0A11U4(SB), NOSPLIT, $0-48 35348 MOVSS alpha+0(FP), X0 35349 MOVQ xs+8(FP), AX 35350 MOVQ incx+16(FP), CX 35351 MOVQ CX, DX 35352 SHLQ $0x04, DX 35353 MOVQ 
ys+24(FP), DX 35354 MOVQ incy+32(FP), BX 35355 MOVQ BX, SI 35356 SHLQ $0x04, SI 35357 MOVQ n+40(FP), SI 35358 JMP check_limit_unroll 35359 PCALIGN $0x08 35360 NOP 35361 NOP 35362 NOP 35363 35364 loop_unroll: 35365 MOVSS (AX), X1 35366 LEAQ (AX)(CX*4), AX 35367 MOVSS (AX), X2 35368 LEAQ (AX)(CX*4), AX 35369 MOVSS (AX), X3 35370 LEAQ (AX)(CX*4), AX 35371 MOVSS (AX), X4 35372 LEAQ (AX)(CX*4), AX 35373 MULSS X0, X1 35374 MULSS X0, X2 35375 MULSS X0, X3 35376 MULSS X0, X4 35377 ADDSS (DX), X1 35378 MOVSS X1, (DX) 35379 LEAQ (DX)(BX*4), DX 35380 ADDSS (DX), X2 35381 MOVSS X2, (DX) 35382 LEAQ (DX)(BX*4), DX 35383 ADDSS (DX), X3 35384 MOVSS X3, (DX) 35385 LEAQ (DX)(BX*4), DX 35386 ADDSS (DX), X4 35387 MOVSS X4, (DX) 35388 LEAQ (DX)(BX*4), DX 35389 SUBQ $0x04, SI 35390 35391 check_limit_unroll: 35392 CMPQ SI, $0x04 35393 JHS loop_unroll 35394 JMP check_limit 35395 35396 loop: 35397 MOVSS (AX), X1 35398 MULSS X0, X1 35399 ADDSS (DX), X1 35400 MOVSS X1, (DX) 35401 DECQ SI 35402 LEAQ (AX)(CX*4), AX 35403 LEAQ (DX)(BX*4), DX 35404 35405 check_limit: 35406 CMPQ SI, $0x00 35407 JHI loop 35408 RET 35409 35410 // func AmdAxpyPointerLoopXInterleave_V1A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35411 // Requires: SSE 35412 TEXT ·AmdAxpyPointerLoopXInterleave_V1A11U4(SB), NOSPLIT, $0-48 35413 MOVSS alpha+0(FP), X0 35414 MOVQ xs+8(FP), AX 35415 MOVQ incx+16(FP), CX 35416 MOVQ CX, DX 35417 SHLQ $0x04, DX 35418 MOVQ ys+24(FP), DX 35419 MOVQ incy+32(FP), BX 35420 MOVQ BX, SI 35421 SHLQ $0x04, SI 35422 MOVQ n+40(FP), SI 35423 JMP check_limit_unroll 35424 PCALIGN $0x08 35425 NOP 35426 NOP 35427 NOP 35428 35429 loop_unroll: 35430 MOVSS (AX), X1 35431 LEAQ (AX)(CX*4), AX 35432 MOVSS (AX), X2 35433 LEAQ (AX)(CX*4), AX 35434 MOVSS (AX), X3 35435 LEAQ (AX)(CX*4), AX 35436 MOVSS (AX), X4 35437 LEAQ (AX)(CX*4), AX 35438 MULSS X0, X1 35439 MULSS X0, X2 35440 MULSS X0, X3 35441 MULSS X0, X4 35442 ADDSS (DX), X1 35443 MOVSS X1, (DX) 35444 LEAQ (DX)(BX*4), DX 35445 ADDSS (DX), X2 35446 MOVSS X2, (DX) 35447 LEAQ (DX)(BX*4), DX 35448 ADDSS (DX), X3 35449 MOVSS X3, (DX) 35450 LEAQ (DX)(BX*4), DX 35451 ADDSS (DX), X4 35452 MOVSS X4, (DX) 35453 LEAQ (DX)(BX*4), DX 35454 SUBQ $0x04, SI 35455 35456 check_limit_unroll: 35457 CMPQ SI, $0x04 35458 JHS loop_unroll 35459 JMP check_limit 35460 35461 loop: 35462 MOVSS (AX), X1 35463 MULSS X0, X1 35464 ADDSS (DX), X1 35465 MOVSS X1, (DX) 35466 DECQ SI 35467 LEAQ (AX)(CX*4), AX 35468 LEAQ (DX)(BX*4), DX 35469 35470 check_limit: 35471 CMPQ SI, $0x00 35472 JHI loop 35473 RET 35474 35475 // func AmdAxpyPointerLoopXInterleave_V2A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35476 // Requires: SSE 35477 TEXT ·AmdAxpyPointerLoopXInterleave_V2A11U4(SB), NOSPLIT, $0-48 35478 MOVSS alpha+0(FP), X0 35479 MOVQ xs+8(FP), AX 35480 MOVQ incx+16(FP), CX 35481 MOVQ CX, DX 35482 SHLQ $0x04, DX 35483 MOVQ ys+24(FP), DX 35484 MOVQ incy+32(FP), BX 35485 MOVQ BX, SI 35486 SHLQ $0x04, SI 35487 MOVQ n+40(FP), SI 35488 JMP check_limit_unroll 35489 PCALIGN $0x08 35490 NOP 35491 NOP 35492 NOP 35493 35494 loop_unroll: 35495 MOVSS (AX), X1 35496 LEAQ (AX)(CX*4), AX 35497 MOVSS (AX), X2 35498 LEAQ (AX)(CX*4), AX 35499 MOVSS (AX), X3 35500 LEAQ (AX)(CX*4), AX 35501 MOVSS (AX), X4 35502 LEAQ (AX)(CX*4), AX 35503 MULSS X0, X1 35504 MULSS X0, X2 35505 MULSS X0, X3 35506 MULSS X0, X4 35507 ADDSS (DX), X1 35508 MOVSS X1, (DX) 35509 LEAQ (DX)(BX*4), DX 35510 ADDSS (DX), X2 35511 MOVSS X2, (DX) 35512 LEAQ (DX)(BX*4), DX 35513 ADDSS (DX), X3 35514 
MOVSS X3, (DX) 35515 LEAQ (DX)(BX*4), DX 35516 ADDSS (DX), X4 35517 MOVSS X4, (DX) 35518 LEAQ (DX)(BX*4), DX 35519 SUBQ $0x04, SI 35520 35521 check_limit_unroll: 35522 CMPQ SI, $0x04 35523 JHS loop_unroll 35524 JMP check_limit 35525 35526 loop: 35527 MOVSS (AX), X1 35528 MULSS X0, X1 35529 ADDSS (DX), X1 35530 MOVSS X1, (DX) 35531 DECQ SI 35532 LEAQ (AX)(CX*4), AX 35533 LEAQ (DX)(BX*4), DX 35534 35535 check_limit: 35536 CMPQ SI, $0x00 35537 JHI loop 35538 RET 35539 35540 // func AmdAxpyPointerLoopXInterleave_V3A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35541 // Requires: SSE 35542 TEXT ·AmdAxpyPointerLoopXInterleave_V3A11U4(SB), NOSPLIT, $0-48 35543 MOVSS alpha+0(FP), X0 35544 MOVQ xs+8(FP), AX 35545 MOVQ incx+16(FP), CX 35546 MOVQ CX, DX 35547 SHLQ $0x04, DX 35548 MOVQ ys+24(FP), DX 35549 MOVQ incy+32(FP), BX 35550 MOVQ BX, SI 35551 SHLQ $0x04, SI 35552 MOVQ n+40(FP), SI 35553 JMP check_limit_unroll 35554 PCALIGN $0x08 35555 NOP 35556 NOP 35557 NOP 35558 35559 loop_unroll: 35560 MOVSS (AX), X1 35561 LEAQ (AX)(CX*4), AX 35562 MOVSS (AX), X2 35563 LEAQ (AX)(CX*4), AX 35564 MOVSS (AX), X3 35565 LEAQ (AX)(CX*4), AX 35566 MOVSS (AX), X4 35567 LEAQ (AX)(CX*4), AX 35568 MULSS X0, X1 35569 MULSS X0, X2 35570 MULSS X0, X3 35571 MULSS X0, X4 35572 ADDSS (DX), X1 35573 MOVSS X1, (DX) 35574 LEAQ (DX)(BX*4), DX 35575 ADDSS (DX), X2 35576 MOVSS X2, (DX) 35577 LEAQ (DX)(BX*4), DX 35578 ADDSS (DX), X3 35579 MOVSS X3, (DX) 35580 LEAQ (DX)(BX*4), DX 35581 ADDSS (DX), X4 35582 MOVSS X4, (DX) 35583 LEAQ (DX)(BX*4), DX 35584 SUBQ $0x04, SI 35585 35586 check_limit_unroll: 35587 CMPQ SI, $0x04 35588 JHS loop_unroll 35589 JMP check_limit 35590 35591 loop: 35592 MOVSS (AX), X1 35593 MULSS X0, X1 35594 ADDSS (DX), X1 35595 MOVSS X1, (DX) 35596 DECQ SI 35597 LEAQ (AX)(CX*4), AX 35598 LEAQ (DX)(BX*4), DX 35599 35600 check_limit: 35601 CMPQ SI, $0x00 35602 JHI loop 35603 RET 35604 35605 // func AmdAxpyPointerLoopXInterleave_V4A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35606 // Requires: SSE 35607 TEXT ·AmdAxpyPointerLoopXInterleave_V4A11U4(SB), NOSPLIT, $0-48 35608 MOVSS alpha+0(FP), X0 35609 MOVQ xs+8(FP), AX 35610 MOVQ incx+16(FP), CX 35611 MOVQ CX, DX 35612 SHLQ $0x04, DX 35613 MOVQ ys+24(FP), DX 35614 MOVQ incy+32(FP), BX 35615 MOVQ BX, SI 35616 SHLQ $0x04, SI 35617 MOVQ n+40(FP), SI 35618 JMP check_limit_unroll 35619 PCALIGN $0x08 35620 NOP 35621 NOP 35622 NOP 35623 35624 loop_unroll: 35625 MOVSS (AX), X1 35626 LEAQ (AX)(CX*4), AX 35627 MOVSS (AX), X2 35628 LEAQ (AX)(CX*4), AX 35629 MOVSS (AX), X3 35630 LEAQ (AX)(CX*4), AX 35631 MOVSS (AX), X4 35632 LEAQ (AX)(CX*4), AX 35633 MULSS X0, X1 35634 MULSS X0, X2 35635 MULSS X0, X3 35636 MULSS X0, X4 35637 ADDSS (DX), X1 35638 MOVSS X1, (DX) 35639 LEAQ (DX)(BX*4), DX 35640 ADDSS (DX), X2 35641 MOVSS X2, (DX) 35642 LEAQ (DX)(BX*4), DX 35643 ADDSS (DX), X3 35644 MOVSS X3, (DX) 35645 LEAQ (DX)(BX*4), DX 35646 ADDSS (DX), X4 35647 MOVSS X4, (DX) 35648 LEAQ (DX)(BX*4), DX 35649 SUBQ $0x04, SI 35650 35651 check_limit_unroll: 35652 CMPQ SI, $0x04 35653 JHS loop_unroll 35654 JMP check_limit 35655 35656 loop: 35657 MOVSS (AX), X1 35658 MULSS X0, X1 35659 ADDSS (DX), X1 35660 MOVSS X1, (DX) 35661 DECQ SI 35662 LEAQ (AX)(CX*4), AX 35663 LEAQ (DX)(BX*4), DX 35664 35665 check_limit: 35666 CMPQ SI, $0x00 35667 JHI loop 35668 RET 35669 35670 // func AmdAxpyPointerLoopXInterleave_V5A11U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35671 // Requires: SSE 35672 TEXT 
·AmdAxpyPointerLoopXInterleave_V5A11U4(SB), NOSPLIT, $0-48 35673 MOVSS alpha+0(FP), X0 35674 MOVQ xs+8(FP), AX 35675 MOVQ incx+16(FP), CX 35676 MOVQ CX, DX 35677 SHLQ $0x04, DX 35678 MOVQ ys+24(FP), DX 35679 MOVQ incy+32(FP), BX 35680 MOVQ BX, SI 35681 SHLQ $0x04, SI 35682 MOVQ n+40(FP), SI 35683 JMP check_limit_unroll 35684 PCALIGN $0x08 35685 NOP 35686 NOP 35687 NOP 35688 35689 loop_unroll: 35690 MOVSS (AX), X1 35691 LEAQ (AX)(CX*4), AX 35692 MOVSS (AX), X2 35693 LEAQ (AX)(CX*4), AX 35694 MOVSS (AX), X3 35695 LEAQ (AX)(CX*4), AX 35696 MOVSS (AX), X4 35697 LEAQ (AX)(CX*4), AX 35698 MULSS X0, X1 35699 MULSS X0, X2 35700 MULSS X0, X3 35701 MULSS X0, X4 35702 ADDSS (DX), X1 35703 MOVSS X1, (DX) 35704 LEAQ (DX)(BX*4), DX 35705 ADDSS (DX), X2 35706 MOVSS X2, (DX) 35707 LEAQ (DX)(BX*4), DX 35708 ADDSS (DX), X3 35709 MOVSS X3, (DX) 35710 LEAQ (DX)(BX*4), DX 35711 ADDSS (DX), X4 35712 MOVSS X4, (DX) 35713 LEAQ (DX)(BX*4), DX 35714 SUBQ $0x04, SI 35715 35716 check_limit_unroll: 35717 CMPQ SI, $0x04 35718 JHS loop_unroll 35719 JMP check_limit 35720 35721 loop: 35722 MOVSS (AX), X1 35723 MULSS X0, X1 35724 ADDSS (DX), X1 35725 MOVSS X1, (DX) 35726 DECQ SI 35727 LEAQ (AX)(CX*4), AX 35728 LEAQ (DX)(BX*4), DX 35729 35730 check_limit: 35731 CMPQ SI, $0x00 35732 JHI loop 35733 RET 35734 35735 // func AmdAxpyPointerLoopXInterleave_V0A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35736 // Requires: SSE 35737 TEXT ·AmdAxpyPointerLoopXInterleave_V0A12U4(SB), NOSPLIT, $0-48 35738 MOVSS alpha+0(FP), X0 35739 MOVQ xs+8(FP), AX 35740 MOVQ incx+16(FP), CX 35741 MOVQ CX, DX 35742 SHLQ $0x04, DX 35743 MOVQ ys+24(FP), DX 35744 MOVQ incy+32(FP), BX 35745 MOVQ BX, SI 35746 SHLQ $0x04, SI 35747 MOVQ n+40(FP), SI 35748 JMP check_limit_unroll 35749 PCALIGN $0x08 35750 NOP 35751 NOP 35752 NOP 35753 NOP 35754 35755 loop_unroll: 35756 MOVSS (AX), X1 35757 LEAQ (AX)(CX*4), AX 35758 MOVSS (AX), X2 35759 LEAQ (AX)(CX*4), AX 35760 MOVSS (AX), X3 35761 LEAQ (AX)(CX*4), AX 35762 MOVSS (AX), X4 35763 LEAQ (AX)(CX*4), AX 35764 MULSS X0, X1 35765 MULSS X0, X2 35766 MULSS X0, X3 35767 MULSS X0, X4 35768 ADDSS (DX), X1 35769 MOVSS X1, (DX) 35770 LEAQ (DX)(BX*4), DX 35771 ADDSS (DX), X2 35772 MOVSS X2, (DX) 35773 LEAQ (DX)(BX*4), DX 35774 ADDSS (DX), X3 35775 MOVSS X3, (DX) 35776 LEAQ (DX)(BX*4), DX 35777 ADDSS (DX), X4 35778 MOVSS X4, (DX) 35779 LEAQ (DX)(BX*4), DX 35780 SUBQ $0x04, SI 35781 35782 check_limit_unroll: 35783 CMPQ SI, $0x04 35784 JHS loop_unroll 35785 JMP check_limit 35786 35787 loop: 35788 MOVSS (AX), X1 35789 MULSS X0, X1 35790 ADDSS (DX), X1 35791 MOVSS X1, (DX) 35792 DECQ SI 35793 LEAQ (AX)(CX*4), AX 35794 LEAQ (DX)(BX*4), DX 35795 35796 check_limit: 35797 CMPQ SI, $0x00 35798 JHI loop 35799 RET 35800 35801 // func AmdAxpyPointerLoopXInterleave_V1A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35802 // Requires: SSE 35803 TEXT ·AmdAxpyPointerLoopXInterleave_V1A12U4(SB), NOSPLIT, $0-48 35804 MOVSS alpha+0(FP), X0 35805 MOVQ xs+8(FP), AX 35806 MOVQ incx+16(FP), CX 35807 MOVQ CX, DX 35808 SHLQ $0x04, DX 35809 MOVQ ys+24(FP), DX 35810 MOVQ incy+32(FP), BX 35811 MOVQ BX, SI 35812 SHLQ $0x04, SI 35813 MOVQ n+40(FP), SI 35814 JMP check_limit_unroll 35815 PCALIGN $0x08 35816 NOP 35817 NOP 35818 NOP 35819 NOP 35820 35821 loop_unroll: 35822 MOVSS (AX), X1 35823 LEAQ (AX)(CX*4), AX 35824 MOVSS (AX), X2 35825 LEAQ (AX)(CX*4), AX 35826 MOVSS (AX), X3 35827 LEAQ (AX)(CX*4), AX 35828 MOVSS (AX), X4 35829 LEAQ (AX)(CX*4), AX 35830 MULSS X0, X1 35831 MULSS 
X0, X2 35832 MULSS X0, X3 35833 MULSS X0, X4 35834 ADDSS (DX), X1 35835 MOVSS X1, (DX) 35836 LEAQ (DX)(BX*4), DX 35837 ADDSS (DX), X2 35838 MOVSS X2, (DX) 35839 LEAQ (DX)(BX*4), DX 35840 ADDSS (DX), X3 35841 MOVSS X3, (DX) 35842 LEAQ (DX)(BX*4), DX 35843 ADDSS (DX), X4 35844 MOVSS X4, (DX) 35845 LEAQ (DX)(BX*4), DX 35846 SUBQ $0x04, SI 35847 35848 check_limit_unroll: 35849 CMPQ SI, $0x04 35850 JHS loop_unroll 35851 JMP check_limit 35852 35853 loop: 35854 MOVSS (AX), X1 35855 MULSS X0, X1 35856 ADDSS (DX), X1 35857 MOVSS X1, (DX) 35858 DECQ SI 35859 LEAQ (AX)(CX*4), AX 35860 LEAQ (DX)(BX*4), DX 35861 35862 check_limit: 35863 CMPQ SI, $0x00 35864 JHI loop 35865 RET 35866 35867 // func AmdAxpyPointerLoopXInterleave_V2A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35868 // Requires: SSE 35869 TEXT ·AmdAxpyPointerLoopXInterleave_V2A12U4(SB), NOSPLIT, $0-48 35870 MOVSS alpha+0(FP), X0 35871 MOVQ xs+8(FP), AX 35872 MOVQ incx+16(FP), CX 35873 MOVQ CX, DX 35874 SHLQ $0x04, DX 35875 MOVQ ys+24(FP), DX 35876 MOVQ incy+32(FP), BX 35877 MOVQ BX, SI 35878 SHLQ $0x04, SI 35879 MOVQ n+40(FP), SI 35880 JMP check_limit_unroll 35881 PCALIGN $0x08 35882 NOP 35883 NOP 35884 NOP 35885 NOP 35886 35887 loop_unroll: 35888 MOVSS (AX), X1 35889 LEAQ (AX)(CX*4), AX 35890 MOVSS (AX), X2 35891 LEAQ (AX)(CX*4), AX 35892 MOVSS (AX), X3 35893 LEAQ (AX)(CX*4), AX 35894 MOVSS (AX), X4 35895 LEAQ (AX)(CX*4), AX 35896 MULSS X0, X1 35897 MULSS X0, X2 35898 MULSS X0, X3 35899 MULSS X0, X4 35900 ADDSS (DX), X1 35901 MOVSS X1, (DX) 35902 LEAQ (DX)(BX*4), DX 35903 ADDSS (DX), X2 35904 MOVSS X2, (DX) 35905 LEAQ (DX)(BX*4), DX 35906 ADDSS (DX), X3 35907 MOVSS X3, (DX) 35908 LEAQ (DX)(BX*4), DX 35909 ADDSS (DX), X4 35910 MOVSS X4, (DX) 35911 LEAQ (DX)(BX*4), DX 35912 SUBQ $0x04, SI 35913 35914 check_limit_unroll: 35915 CMPQ SI, $0x04 35916 JHS loop_unroll 35917 JMP check_limit 35918 35919 loop: 35920 MOVSS (AX), X1 35921 MULSS X0, X1 35922 ADDSS (DX), X1 35923 MOVSS X1, (DX) 35924 DECQ SI 35925 LEAQ (AX)(CX*4), AX 35926 LEAQ (DX)(BX*4), DX 35927 35928 check_limit: 35929 CMPQ SI, $0x00 35930 JHI loop 35931 RET 35932 35933 // func AmdAxpyPointerLoopXInterleave_V3A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 35934 // Requires: SSE 35935 TEXT ·AmdAxpyPointerLoopXInterleave_V3A12U4(SB), NOSPLIT, $0-48 35936 MOVSS alpha+0(FP), X0 35937 MOVQ xs+8(FP), AX 35938 MOVQ incx+16(FP), CX 35939 MOVQ CX, DX 35940 SHLQ $0x04, DX 35941 MOVQ ys+24(FP), DX 35942 MOVQ incy+32(FP), BX 35943 MOVQ BX, SI 35944 SHLQ $0x04, SI 35945 MOVQ n+40(FP), SI 35946 JMP check_limit_unroll 35947 PCALIGN $0x08 35948 NOP 35949 NOP 35950 NOP 35951 NOP 35952 35953 loop_unroll: 35954 MOVSS (AX), X1 35955 LEAQ (AX)(CX*4), AX 35956 MOVSS (AX), X2 35957 LEAQ (AX)(CX*4), AX 35958 MOVSS (AX), X3 35959 LEAQ (AX)(CX*4), AX 35960 MOVSS (AX), X4 35961 LEAQ (AX)(CX*4), AX 35962 MULSS X0, X1 35963 MULSS X0, X2 35964 MULSS X0, X3 35965 MULSS X0, X4 35966 ADDSS (DX), X1 35967 MOVSS X1, (DX) 35968 LEAQ (DX)(BX*4), DX 35969 ADDSS (DX), X2 35970 MOVSS X2, (DX) 35971 LEAQ (DX)(BX*4), DX 35972 ADDSS (DX), X3 35973 MOVSS X3, (DX) 35974 LEAQ (DX)(BX*4), DX 35975 ADDSS (DX), X4 35976 MOVSS X4, (DX) 35977 LEAQ (DX)(BX*4), DX 35978 SUBQ $0x04, SI 35979 35980 check_limit_unroll: 35981 CMPQ SI, $0x04 35982 JHS loop_unroll 35983 JMP check_limit 35984 35985 loop: 35986 MOVSS (AX), X1 35987 MULSS X0, X1 35988 ADDSS (DX), X1 35989 MOVSS X1, (DX) 35990 DECQ SI 35991 LEAQ (AX)(CX*4), AX 35992 LEAQ (DX)(BX*4), DX 35993 35994 
check_limit: 35995 CMPQ SI, $0x00 35996 JHI loop 35997 RET 35998 35999 // func AmdAxpyPointerLoopXInterleave_V4A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36000 // Requires: SSE 36001 TEXT ·AmdAxpyPointerLoopXInterleave_V4A12U4(SB), NOSPLIT, $0-48 36002 MOVSS alpha+0(FP), X0 36003 MOVQ xs+8(FP), AX 36004 MOVQ incx+16(FP), CX 36005 MOVQ CX, DX 36006 SHLQ $0x04, DX 36007 MOVQ ys+24(FP), DX 36008 MOVQ incy+32(FP), BX 36009 MOVQ BX, SI 36010 SHLQ $0x04, SI 36011 MOVQ n+40(FP), SI 36012 JMP check_limit_unroll 36013 PCALIGN $0x08 36014 NOP 36015 NOP 36016 NOP 36017 NOP 36018 36019 loop_unroll: 36020 MOVSS (AX), X1 36021 LEAQ (AX)(CX*4), AX 36022 MOVSS (AX), X2 36023 LEAQ (AX)(CX*4), AX 36024 MOVSS (AX), X3 36025 LEAQ (AX)(CX*4), AX 36026 MOVSS (AX), X4 36027 LEAQ (AX)(CX*4), AX 36028 MULSS X0, X1 36029 MULSS X0, X2 36030 MULSS X0, X3 36031 MULSS X0, X4 36032 ADDSS (DX), X1 36033 MOVSS X1, (DX) 36034 LEAQ (DX)(BX*4), DX 36035 ADDSS (DX), X2 36036 MOVSS X2, (DX) 36037 LEAQ (DX)(BX*4), DX 36038 ADDSS (DX), X3 36039 MOVSS X3, (DX) 36040 LEAQ (DX)(BX*4), DX 36041 ADDSS (DX), X4 36042 MOVSS X4, (DX) 36043 LEAQ (DX)(BX*4), DX 36044 SUBQ $0x04, SI 36045 36046 check_limit_unroll: 36047 CMPQ SI, $0x04 36048 JHS loop_unroll 36049 JMP check_limit 36050 36051 loop: 36052 MOVSS (AX), X1 36053 MULSS X0, X1 36054 ADDSS (DX), X1 36055 MOVSS X1, (DX) 36056 DECQ SI 36057 LEAQ (AX)(CX*4), AX 36058 LEAQ (DX)(BX*4), DX 36059 36060 check_limit: 36061 CMPQ SI, $0x00 36062 JHI loop 36063 RET 36064 36065 // func AmdAxpyPointerLoopXInterleave_V5A12U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36066 // Requires: SSE 36067 TEXT ·AmdAxpyPointerLoopXInterleave_V5A12U4(SB), NOSPLIT, $0-48 36068 MOVSS alpha+0(FP), X0 36069 MOVQ xs+8(FP), AX 36070 MOVQ incx+16(FP), CX 36071 MOVQ CX, DX 36072 SHLQ $0x04, DX 36073 MOVQ ys+24(FP), DX 36074 MOVQ incy+32(FP), BX 36075 MOVQ BX, SI 36076 SHLQ $0x04, SI 36077 MOVQ n+40(FP), SI 36078 JMP check_limit_unroll 36079 PCALIGN $0x08 36080 NOP 36081 NOP 36082 NOP 36083 NOP 36084 36085 loop_unroll: 36086 MOVSS (AX), X1 36087 LEAQ (AX)(CX*4), AX 36088 MOVSS (AX), X2 36089 LEAQ (AX)(CX*4), AX 36090 MOVSS (AX), X3 36091 LEAQ (AX)(CX*4), AX 36092 MOVSS (AX), X4 36093 LEAQ (AX)(CX*4), AX 36094 MULSS X0, X1 36095 MULSS X0, X2 36096 MULSS X0, X3 36097 MULSS X0, X4 36098 ADDSS (DX), X1 36099 MOVSS X1, (DX) 36100 LEAQ (DX)(BX*4), DX 36101 ADDSS (DX), X2 36102 MOVSS X2, (DX) 36103 LEAQ (DX)(BX*4), DX 36104 ADDSS (DX), X3 36105 MOVSS X3, (DX) 36106 LEAQ (DX)(BX*4), DX 36107 ADDSS (DX), X4 36108 MOVSS X4, (DX) 36109 LEAQ (DX)(BX*4), DX 36110 SUBQ $0x04, SI 36111 36112 check_limit_unroll: 36113 CMPQ SI, $0x04 36114 JHS loop_unroll 36115 JMP check_limit 36116 36117 loop: 36118 MOVSS (AX), X1 36119 MULSS X0, X1 36120 ADDSS (DX), X1 36121 MOVSS X1, (DX) 36122 DECQ SI 36123 LEAQ (AX)(CX*4), AX 36124 LEAQ (DX)(BX*4), DX 36125 36126 check_limit: 36127 CMPQ SI, $0x00 36128 JHI loop 36129 RET 36130 36131 // func AmdAxpyPointerLoopXInterleave_V0A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36132 // Requires: SSE 36133 TEXT ·AmdAxpyPointerLoopXInterleave_V0A13U4(SB), NOSPLIT, $0-48 36134 MOVSS alpha+0(FP), X0 36135 MOVQ xs+8(FP), AX 36136 MOVQ incx+16(FP), CX 36137 MOVQ CX, DX 36138 SHLQ $0x04, DX 36139 MOVQ ys+24(FP), DX 36140 MOVQ incy+32(FP), BX 36141 MOVQ BX, SI 36142 SHLQ $0x04, SI 36143 MOVQ n+40(FP), SI 36144 JMP check_limit_unroll 36145 PCALIGN $0x08 36146 NOP 36147 NOP 36148 NOP 36149 NOP 
36150 NOP 36151 36152 loop_unroll: 36153 MOVSS (AX), X1 36154 LEAQ (AX)(CX*4), AX 36155 MOVSS (AX), X2 36156 LEAQ (AX)(CX*4), AX 36157 MOVSS (AX), X3 36158 LEAQ (AX)(CX*4), AX 36159 MOVSS (AX), X4 36160 LEAQ (AX)(CX*4), AX 36161 MULSS X0, X1 36162 MULSS X0, X2 36163 MULSS X0, X3 36164 MULSS X0, X4 36165 ADDSS (DX), X1 36166 MOVSS X1, (DX) 36167 LEAQ (DX)(BX*4), DX 36168 ADDSS (DX), X2 36169 MOVSS X2, (DX) 36170 LEAQ (DX)(BX*4), DX 36171 ADDSS (DX), X3 36172 MOVSS X3, (DX) 36173 LEAQ (DX)(BX*4), DX 36174 ADDSS (DX), X4 36175 MOVSS X4, (DX) 36176 LEAQ (DX)(BX*4), DX 36177 SUBQ $0x04, SI 36178 36179 check_limit_unroll: 36180 CMPQ SI, $0x04 36181 JHS loop_unroll 36182 JMP check_limit 36183 36184 loop: 36185 MOVSS (AX), X1 36186 MULSS X0, X1 36187 ADDSS (DX), X1 36188 MOVSS X1, (DX) 36189 DECQ SI 36190 LEAQ (AX)(CX*4), AX 36191 LEAQ (DX)(BX*4), DX 36192 36193 check_limit: 36194 CMPQ SI, $0x00 36195 JHI loop 36196 RET 36197 36198 // func AmdAxpyPointerLoopXInterleave_V1A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36199 // Requires: SSE 36200 TEXT ·AmdAxpyPointerLoopXInterleave_V1A13U4(SB), NOSPLIT, $0-48 36201 MOVSS alpha+0(FP), X0 36202 MOVQ xs+8(FP), AX 36203 MOVQ incx+16(FP), CX 36204 MOVQ CX, DX 36205 SHLQ $0x04, DX 36206 MOVQ ys+24(FP), DX 36207 MOVQ incy+32(FP), BX 36208 MOVQ BX, SI 36209 SHLQ $0x04, SI 36210 MOVQ n+40(FP), SI 36211 JMP check_limit_unroll 36212 PCALIGN $0x08 36213 NOP 36214 NOP 36215 NOP 36216 NOP 36217 NOP 36218 36219 loop_unroll: 36220 MOVSS (AX), X1 36221 LEAQ (AX)(CX*4), AX 36222 MOVSS (AX), X2 36223 LEAQ (AX)(CX*4), AX 36224 MOVSS (AX), X3 36225 LEAQ (AX)(CX*4), AX 36226 MOVSS (AX), X4 36227 LEAQ (AX)(CX*4), AX 36228 MULSS X0, X1 36229 MULSS X0, X2 36230 MULSS X0, X3 36231 MULSS X0, X4 36232 ADDSS (DX), X1 36233 MOVSS X1, (DX) 36234 LEAQ (DX)(BX*4), DX 36235 ADDSS (DX), X2 36236 MOVSS X2, (DX) 36237 LEAQ (DX)(BX*4), DX 36238 ADDSS (DX), X3 36239 MOVSS X3, (DX) 36240 LEAQ (DX)(BX*4), DX 36241 ADDSS (DX), X4 36242 MOVSS X4, (DX) 36243 LEAQ (DX)(BX*4), DX 36244 SUBQ $0x04, SI 36245 36246 check_limit_unroll: 36247 CMPQ SI, $0x04 36248 JHS loop_unroll 36249 JMP check_limit 36250 36251 loop: 36252 MOVSS (AX), X1 36253 MULSS X0, X1 36254 ADDSS (DX), X1 36255 MOVSS X1, (DX) 36256 DECQ SI 36257 LEAQ (AX)(CX*4), AX 36258 LEAQ (DX)(BX*4), DX 36259 36260 check_limit: 36261 CMPQ SI, $0x00 36262 JHI loop 36263 RET 36264 36265 // func AmdAxpyPointerLoopXInterleave_V2A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36266 // Requires: SSE 36267 TEXT ·AmdAxpyPointerLoopXInterleave_V2A13U4(SB), NOSPLIT, $0-48 36268 MOVSS alpha+0(FP), X0 36269 MOVQ xs+8(FP), AX 36270 MOVQ incx+16(FP), CX 36271 MOVQ CX, DX 36272 SHLQ $0x04, DX 36273 MOVQ ys+24(FP), DX 36274 MOVQ incy+32(FP), BX 36275 MOVQ BX, SI 36276 SHLQ $0x04, SI 36277 MOVQ n+40(FP), SI 36278 JMP check_limit_unroll 36279 PCALIGN $0x08 36280 NOP 36281 NOP 36282 NOP 36283 NOP 36284 NOP 36285 36286 loop_unroll: 36287 MOVSS (AX), X1 36288 LEAQ (AX)(CX*4), AX 36289 MOVSS (AX), X2 36290 LEAQ (AX)(CX*4), AX 36291 MOVSS (AX), X3 36292 LEAQ (AX)(CX*4), AX 36293 MOVSS (AX), X4 36294 LEAQ (AX)(CX*4), AX 36295 MULSS X0, X1 36296 MULSS X0, X2 36297 MULSS X0, X3 36298 MULSS X0, X4 36299 ADDSS (DX), X1 36300 MOVSS X1, (DX) 36301 LEAQ (DX)(BX*4), DX 36302 ADDSS (DX), X2 36303 MOVSS X2, (DX) 36304 LEAQ (DX)(BX*4), DX 36305 ADDSS (DX), X3 36306 MOVSS X3, (DX) 36307 LEAQ (DX)(BX*4), DX 36308 ADDSS (DX), X4 36309 MOVSS X4, (DX) 36310 LEAQ (DX)(BX*4), DX 36311 SUBQ $0x04, SI 
36312 36313 check_limit_unroll: 36314 CMPQ SI, $0x04 36315 JHS loop_unroll 36316 JMP check_limit 36317 36318 loop: 36319 MOVSS (AX), X1 36320 MULSS X0, X1 36321 ADDSS (DX), X1 36322 MOVSS X1, (DX) 36323 DECQ SI 36324 LEAQ (AX)(CX*4), AX 36325 LEAQ (DX)(BX*4), DX 36326 36327 check_limit: 36328 CMPQ SI, $0x00 36329 JHI loop 36330 RET 36331 36332 // func AmdAxpyPointerLoopXInterleave_V3A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36333 // Requires: SSE 36334 TEXT ·AmdAxpyPointerLoopXInterleave_V3A13U4(SB), NOSPLIT, $0-48 36335 MOVSS alpha+0(FP), X0 36336 MOVQ xs+8(FP), AX 36337 MOVQ incx+16(FP), CX 36338 MOVQ CX, DX 36339 SHLQ $0x04, DX 36340 MOVQ ys+24(FP), DX 36341 MOVQ incy+32(FP), BX 36342 MOVQ BX, SI 36343 SHLQ $0x04, SI 36344 MOVQ n+40(FP), SI 36345 JMP check_limit_unroll 36346 PCALIGN $0x08 36347 NOP 36348 NOP 36349 NOP 36350 NOP 36351 NOP 36352 36353 loop_unroll: 36354 MOVSS (AX), X1 36355 LEAQ (AX)(CX*4), AX 36356 MOVSS (AX), X2 36357 LEAQ (AX)(CX*4), AX 36358 MOVSS (AX), X3 36359 LEAQ (AX)(CX*4), AX 36360 MOVSS (AX), X4 36361 LEAQ (AX)(CX*4), AX 36362 MULSS X0, X1 36363 MULSS X0, X2 36364 MULSS X0, X3 36365 MULSS X0, X4 36366 ADDSS (DX), X1 36367 MOVSS X1, (DX) 36368 LEAQ (DX)(BX*4), DX 36369 ADDSS (DX), X2 36370 MOVSS X2, (DX) 36371 LEAQ (DX)(BX*4), DX 36372 ADDSS (DX), X3 36373 MOVSS X3, (DX) 36374 LEAQ (DX)(BX*4), DX 36375 ADDSS (DX), X4 36376 MOVSS X4, (DX) 36377 LEAQ (DX)(BX*4), DX 36378 SUBQ $0x04, SI 36379 36380 check_limit_unroll: 36381 CMPQ SI, $0x04 36382 JHS loop_unroll 36383 JMP check_limit 36384 36385 loop: 36386 MOVSS (AX), X1 36387 MULSS X0, X1 36388 ADDSS (DX), X1 36389 MOVSS X1, (DX) 36390 DECQ SI 36391 LEAQ (AX)(CX*4), AX 36392 LEAQ (DX)(BX*4), DX 36393 36394 check_limit: 36395 CMPQ SI, $0x00 36396 JHI loop 36397 RET 36398 36399 // func AmdAxpyPointerLoopXInterleave_V4A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36400 // Requires: SSE 36401 TEXT ·AmdAxpyPointerLoopXInterleave_V4A13U4(SB), NOSPLIT, $0-48 36402 MOVSS alpha+0(FP), X0 36403 MOVQ xs+8(FP), AX 36404 MOVQ incx+16(FP), CX 36405 MOVQ CX, DX 36406 SHLQ $0x04, DX 36407 MOVQ ys+24(FP), DX 36408 MOVQ incy+32(FP), BX 36409 MOVQ BX, SI 36410 SHLQ $0x04, SI 36411 MOVQ n+40(FP), SI 36412 JMP check_limit_unroll 36413 PCALIGN $0x08 36414 NOP 36415 NOP 36416 NOP 36417 NOP 36418 NOP 36419 36420 loop_unroll: 36421 MOVSS (AX), X1 36422 LEAQ (AX)(CX*4), AX 36423 MOVSS (AX), X2 36424 LEAQ (AX)(CX*4), AX 36425 MOVSS (AX), X3 36426 LEAQ (AX)(CX*4), AX 36427 MOVSS (AX), X4 36428 LEAQ (AX)(CX*4), AX 36429 MULSS X0, X1 36430 MULSS X0, X2 36431 MULSS X0, X3 36432 MULSS X0, X4 36433 ADDSS (DX), X1 36434 MOVSS X1, (DX) 36435 LEAQ (DX)(BX*4), DX 36436 ADDSS (DX), X2 36437 MOVSS X2, (DX) 36438 LEAQ (DX)(BX*4), DX 36439 ADDSS (DX), X3 36440 MOVSS X3, (DX) 36441 LEAQ (DX)(BX*4), DX 36442 ADDSS (DX), X4 36443 MOVSS X4, (DX) 36444 LEAQ (DX)(BX*4), DX 36445 SUBQ $0x04, SI 36446 36447 check_limit_unroll: 36448 CMPQ SI, $0x04 36449 JHS loop_unroll 36450 JMP check_limit 36451 36452 loop: 36453 MOVSS (AX), X1 36454 MULSS X0, X1 36455 ADDSS (DX), X1 36456 MOVSS X1, (DX) 36457 DECQ SI 36458 LEAQ (AX)(CX*4), AX 36459 LEAQ (DX)(BX*4), DX 36460 36461 check_limit: 36462 CMPQ SI, $0x00 36463 JHI loop 36464 RET 36465 36466 // func AmdAxpyPointerLoopXInterleave_V5A13U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36467 // Requires: SSE 36468 TEXT ·AmdAxpyPointerLoopXInterleave_V5A13U4(SB), NOSPLIT, $0-48 36469 MOVSS alpha+0(FP), X0 
36470 MOVQ xs+8(FP), AX 36471 MOVQ incx+16(FP), CX 36472 MOVQ CX, DX 36473 SHLQ $0x04, DX 36474 MOVQ ys+24(FP), DX 36475 MOVQ incy+32(FP), BX 36476 MOVQ BX, SI 36477 SHLQ $0x04, SI 36478 MOVQ n+40(FP), SI 36479 JMP check_limit_unroll 36480 PCALIGN $0x08 36481 NOP 36482 NOP 36483 NOP 36484 NOP 36485 NOP 36486 36487 loop_unroll: 36488 MOVSS (AX), X1 36489 LEAQ (AX)(CX*4), AX 36490 MOVSS (AX), X2 36491 LEAQ (AX)(CX*4), AX 36492 MOVSS (AX), X3 36493 LEAQ (AX)(CX*4), AX 36494 MOVSS (AX), X4 36495 LEAQ (AX)(CX*4), AX 36496 MULSS X0, X1 36497 MULSS X0, X2 36498 MULSS X0, X3 36499 MULSS X0, X4 36500 ADDSS (DX), X1 36501 MOVSS X1, (DX) 36502 LEAQ (DX)(BX*4), DX 36503 ADDSS (DX), X2 36504 MOVSS X2, (DX) 36505 LEAQ (DX)(BX*4), DX 36506 ADDSS (DX), X3 36507 MOVSS X3, (DX) 36508 LEAQ (DX)(BX*4), DX 36509 ADDSS (DX), X4 36510 MOVSS X4, (DX) 36511 LEAQ (DX)(BX*4), DX 36512 SUBQ $0x04, SI 36513 36514 check_limit_unroll: 36515 CMPQ SI, $0x04 36516 JHS loop_unroll 36517 JMP check_limit 36518 36519 loop: 36520 MOVSS (AX), X1 36521 MULSS X0, X1 36522 ADDSS (DX), X1 36523 MOVSS X1, (DX) 36524 DECQ SI 36525 LEAQ (AX)(CX*4), AX 36526 LEAQ (DX)(BX*4), DX 36527 36528 check_limit: 36529 CMPQ SI, $0x00 36530 JHI loop 36531 RET 36532 36533 // func AmdAxpyPointerLoopXInterleave_V0A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36534 // Requires: SSE 36535 TEXT ·AmdAxpyPointerLoopXInterleave_V0A14U4(SB), NOSPLIT, $0-48 36536 MOVSS alpha+0(FP), X0 36537 MOVQ xs+8(FP), AX 36538 MOVQ incx+16(FP), CX 36539 MOVQ CX, DX 36540 SHLQ $0x04, DX 36541 MOVQ ys+24(FP), DX 36542 MOVQ incy+32(FP), BX 36543 MOVQ BX, SI 36544 SHLQ $0x04, SI 36545 MOVQ n+40(FP), SI 36546 JMP check_limit_unroll 36547 PCALIGN $0x08 36548 NOP 36549 NOP 36550 NOP 36551 NOP 36552 NOP 36553 NOP 36554 36555 loop_unroll: 36556 MOVSS (AX), X1 36557 LEAQ (AX)(CX*4), AX 36558 MOVSS (AX), X2 36559 LEAQ (AX)(CX*4), AX 36560 MOVSS (AX), X3 36561 LEAQ (AX)(CX*4), AX 36562 MOVSS (AX), X4 36563 LEAQ (AX)(CX*4), AX 36564 MULSS X0, X1 36565 MULSS X0, X2 36566 MULSS X0, X3 36567 MULSS X0, X4 36568 ADDSS (DX), X1 36569 MOVSS X1, (DX) 36570 LEAQ (DX)(BX*4), DX 36571 ADDSS (DX), X2 36572 MOVSS X2, (DX) 36573 LEAQ (DX)(BX*4), DX 36574 ADDSS (DX), X3 36575 MOVSS X3, (DX) 36576 LEAQ (DX)(BX*4), DX 36577 ADDSS (DX), X4 36578 MOVSS X4, (DX) 36579 LEAQ (DX)(BX*4), DX 36580 SUBQ $0x04, SI 36581 36582 check_limit_unroll: 36583 CMPQ SI, $0x04 36584 JHS loop_unroll 36585 JMP check_limit 36586 36587 loop: 36588 MOVSS (AX), X1 36589 MULSS X0, X1 36590 ADDSS (DX), X1 36591 MOVSS X1, (DX) 36592 DECQ SI 36593 LEAQ (AX)(CX*4), AX 36594 LEAQ (DX)(BX*4), DX 36595 36596 check_limit: 36597 CMPQ SI, $0x00 36598 JHI loop 36599 RET 36600 36601 // func AmdAxpyPointerLoopXInterleave_V1A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36602 // Requires: SSE 36603 TEXT ·AmdAxpyPointerLoopXInterleave_V1A14U4(SB), NOSPLIT, $0-48 36604 MOVSS alpha+0(FP), X0 36605 MOVQ xs+8(FP), AX 36606 MOVQ incx+16(FP), CX 36607 MOVQ CX, DX 36608 SHLQ $0x04, DX 36609 MOVQ ys+24(FP), DX 36610 MOVQ incy+32(FP), BX 36611 MOVQ BX, SI 36612 SHLQ $0x04, SI 36613 MOVQ n+40(FP), SI 36614 JMP check_limit_unroll 36615 PCALIGN $0x08 36616 NOP 36617 NOP 36618 NOP 36619 NOP 36620 NOP 36621 NOP 36622 36623 loop_unroll: 36624 MOVSS (AX), X1 36625 LEAQ (AX)(CX*4), AX 36626 MOVSS (AX), X2 36627 LEAQ (AX)(CX*4), AX 36628 MOVSS (AX), X3 36629 LEAQ (AX)(CX*4), AX 36630 MOVSS (AX), X4 36631 LEAQ (AX)(CX*4), AX 36632 MULSS X0, X1 36633 MULSS X0, X2 36634 MULSS X0, X3 
36635 MULSS X0, X4 36636 ADDSS (DX), X1 36637 MOVSS X1, (DX) 36638 LEAQ (DX)(BX*4), DX 36639 ADDSS (DX), X2 36640 MOVSS X2, (DX) 36641 LEAQ (DX)(BX*4), DX 36642 ADDSS (DX), X3 36643 MOVSS X3, (DX) 36644 LEAQ (DX)(BX*4), DX 36645 ADDSS (DX), X4 36646 MOVSS X4, (DX) 36647 LEAQ (DX)(BX*4), DX 36648 SUBQ $0x04, SI 36649 36650 check_limit_unroll: 36651 CMPQ SI, $0x04 36652 JHS loop_unroll 36653 JMP check_limit 36654 36655 loop: 36656 MOVSS (AX), X1 36657 MULSS X0, X1 36658 ADDSS (DX), X1 36659 MOVSS X1, (DX) 36660 DECQ SI 36661 LEAQ (AX)(CX*4), AX 36662 LEAQ (DX)(BX*4), DX 36663 36664 check_limit: 36665 CMPQ SI, $0x00 36666 JHI loop 36667 RET 36668 36669 // func AmdAxpyPointerLoopXInterleave_V2A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36670 // Requires: SSE 36671 TEXT ·AmdAxpyPointerLoopXInterleave_V2A14U4(SB), NOSPLIT, $0-48 36672 MOVSS alpha+0(FP), X0 36673 MOVQ xs+8(FP), AX 36674 MOVQ incx+16(FP), CX 36675 MOVQ CX, DX 36676 SHLQ $0x04, DX 36677 MOVQ ys+24(FP), DX 36678 MOVQ incy+32(FP), BX 36679 MOVQ BX, SI 36680 SHLQ $0x04, SI 36681 MOVQ n+40(FP), SI 36682 JMP check_limit_unroll 36683 PCALIGN $0x08 36684 NOP 36685 NOP 36686 NOP 36687 NOP 36688 NOP 36689 NOP 36690 36691 loop_unroll: 36692 MOVSS (AX), X1 36693 LEAQ (AX)(CX*4), AX 36694 MOVSS (AX), X2 36695 LEAQ (AX)(CX*4), AX 36696 MOVSS (AX), X3 36697 LEAQ (AX)(CX*4), AX 36698 MOVSS (AX), X4 36699 LEAQ (AX)(CX*4), AX 36700 MULSS X0, X1 36701 MULSS X0, X2 36702 MULSS X0, X3 36703 MULSS X0, X4 36704 ADDSS (DX), X1 36705 MOVSS X1, (DX) 36706 LEAQ (DX)(BX*4), DX 36707 ADDSS (DX), X2 36708 MOVSS X2, (DX) 36709 LEAQ (DX)(BX*4), DX 36710 ADDSS (DX), X3 36711 MOVSS X3, (DX) 36712 LEAQ (DX)(BX*4), DX 36713 ADDSS (DX), X4 36714 MOVSS X4, (DX) 36715 LEAQ (DX)(BX*4), DX 36716 SUBQ $0x04, SI 36717 36718 check_limit_unroll: 36719 CMPQ SI, $0x04 36720 JHS loop_unroll 36721 JMP check_limit 36722 36723 loop: 36724 MOVSS (AX), X1 36725 MULSS X0, X1 36726 ADDSS (DX), X1 36727 MOVSS X1, (DX) 36728 DECQ SI 36729 LEAQ (AX)(CX*4), AX 36730 LEAQ (DX)(BX*4), DX 36731 36732 check_limit: 36733 CMPQ SI, $0x00 36734 JHI loop 36735 RET 36736 36737 // func AmdAxpyPointerLoopXInterleave_V3A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36738 // Requires: SSE 36739 TEXT ·AmdAxpyPointerLoopXInterleave_V3A14U4(SB), NOSPLIT, $0-48 36740 MOVSS alpha+0(FP), X0 36741 MOVQ xs+8(FP), AX 36742 MOVQ incx+16(FP), CX 36743 MOVQ CX, DX 36744 SHLQ $0x04, DX 36745 MOVQ ys+24(FP), DX 36746 MOVQ incy+32(FP), BX 36747 MOVQ BX, SI 36748 SHLQ $0x04, SI 36749 MOVQ n+40(FP), SI 36750 JMP check_limit_unroll 36751 PCALIGN $0x08 36752 NOP 36753 NOP 36754 NOP 36755 NOP 36756 NOP 36757 NOP 36758 36759 loop_unroll: 36760 MOVSS (AX), X1 36761 LEAQ (AX)(CX*4), AX 36762 MOVSS (AX), X2 36763 LEAQ (AX)(CX*4), AX 36764 MOVSS (AX), X3 36765 LEAQ (AX)(CX*4), AX 36766 MOVSS (AX), X4 36767 LEAQ (AX)(CX*4), AX 36768 MULSS X0, X1 36769 MULSS X0, X2 36770 MULSS X0, X3 36771 MULSS X0, X4 36772 ADDSS (DX), X1 36773 MOVSS X1, (DX) 36774 LEAQ (DX)(BX*4), DX 36775 ADDSS (DX), X2 36776 MOVSS X2, (DX) 36777 LEAQ (DX)(BX*4), DX 36778 ADDSS (DX), X3 36779 MOVSS X3, (DX) 36780 LEAQ (DX)(BX*4), DX 36781 ADDSS (DX), X4 36782 MOVSS X4, (DX) 36783 LEAQ (DX)(BX*4), DX 36784 SUBQ $0x04, SI 36785 36786 check_limit_unroll: 36787 CMPQ SI, $0x04 36788 JHS loop_unroll 36789 JMP check_limit 36790 36791 loop: 36792 MOVSS (AX), X1 36793 MULSS X0, X1 36794 ADDSS (DX), X1 36795 MOVSS X1, (DX) 36796 DECQ SI 36797 LEAQ (AX)(CX*4), AX 36798 LEAQ (DX)(BX*4), DX 
36799 36800 check_limit: 36801 CMPQ SI, $0x00 36802 JHI loop 36803 RET 36804 36805 // func AmdAxpyPointerLoopXInterleave_V4A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36806 // Requires: SSE 36807 TEXT ·AmdAxpyPointerLoopXInterleave_V4A14U4(SB), NOSPLIT, $0-48 36808 MOVSS alpha+0(FP), X0 36809 MOVQ xs+8(FP), AX 36810 MOVQ incx+16(FP), CX 36811 MOVQ CX, DX 36812 SHLQ $0x04, DX 36813 MOVQ ys+24(FP), DX 36814 MOVQ incy+32(FP), BX 36815 MOVQ BX, SI 36816 SHLQ $0x04, SI 36817 MOVQ n+40(FP), SI 36818 JMP check_limit_unroll 36819 PCALIGN $0x08 36820 NOP 36821 NOP 36822 NOP 36823 NOP 36824 NOP 36825 NOP 36826 36827 loop_unroll: 36828 MOVSS (AX), X1 36829 LEAQ (AX)(CX*4), AX 36830 MOVSS (AX), X2 36831 LEAQ (AX)(CX*4), AX 36832 MOVSS (AX), X3 36833 LEAQ (AX)(CX*4), AX 36834 MOVSS (AX), X4 36835 LEAQ (AX)(CX*4), AX 36836 MULSS X0, X1 36837 MULSS X0, X2 36838 MULSS X0, X3 36839 MULSS X0, X4 36840 ADDSS (DX), X1 36841 MOVSS X1, (DX) 36842 LEAQ (DX)(BX*4), DX 36843 ADDSS (DX), X2 36844 MOVSS X2, (DX) 36845 LEAQ (DX)(BX*4), DX 36846 ADDSS (DX), X3 36847 MOVSS X3, (DX) 36848 LEAQ (DX)(BX*4), DX 36849 ADDSS (DX), X4 36850 MOVSS X4, (DX) 36851 LEAQ (DX)(BX*4), DX 36852 SUBQ $0x04, SI 36853 36854 check_limit_unroll: 36855 CMPQ SI, $0x04 36856 JHS loop_unroll 36857 JMP check_limit 36858 36859 loop: 36860 MOVSS (AX), X1 36861 MULSS X0, X1 36862 ADDSS (DX), X1 36863 MOVSS X1, (DX) 36864 DECQ SI 36865 LEAQ (AX)(CX*4), AX 36866 LEAQ (DX)(BX*4), DX 36867 36868 check_limit: 36869 CMPQ SI, $0x00 36870 JHI loop 36871 RET 36872 36873 // func AmdAxpyPointerLoopXInterleave_V5A14U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36874 // Requires: SSE 36875 TEXT ·AmdAxpyPointerLoopXInterleave_V5A14U4(SB), NOSPLIT, $0-48 36876 MOVSS alpha+0(FP), X0 36877 MOVQ xs+8(FP), AX 36878 MOVQ incx+16(FP), CX 36879 MOVQ CX, DX 36880 SHLQ $0x04, DX 36881 MOVQ ys+24(FP), DX 36882 MOVQ incy+32(FP), BX 36883 MOVQ BX, SI 36884 SHLQ $0x04, SI 36885 MOVQ n+40(FP), SI 36886 JMP check_limit_unroll 36887 PCALIGN $0x08 36888 NOP 36889 NOP 36890 NOP 36891 NOP 36892 NOP 36893 NOP 36894 36895 loop_unroll: 36896 MOVSS (AX), X1 36897 LEAQ (AX)(CX*4), AX 36898 MOVSS (AX), X2 36899 LEAQ (AX)(CX*4), AX 36900 MOVSS (AX), X3 36901 LEAQ (AX)(CX*4), AX 36902 MOVSS (AX), X4 36903 LEAQ (AX)(CX*4), AX 36904 MULSS X0, X1 36905 MULSS X0, X2 36906 MULSS X0, X3 36907 MULSS X0, X4 36908 ADDSS (DX), X1 36909 MOVSS X1, (DX) 36910 LEAQ (DX)(BX*4), DX 36911 ADDSS (DX), X2 36912 MOVSS X2, (DX) 36913 LEAQ (DX)(BX*4), DX 36914 ADDSS (DX), X3 36915 MOVSS X3, (DX) 36916 LEAQ (DX)(BX*4), DX 36917 ADDSS (DX), X4 36918 MOVSS X4, (DX) 36919 LEAQ (DX)(BX*4), DX 36920 SUBQ $0x04, SI 36921 36922 check_limit_unroll: 36923 CMPQ SI, $0x04 36924 JHS loop_unroll 36925 JMP check_limit 36926 36927 loop: 36928 MOVSS (AX), X1 36929 MULSS X0, X1 36930 ADDSS (DX), X1 36931 MOVSS X1, (DX) 36932 DECQ SI 36933 LEAQ (AX)(CX*4), AX 36934 LEAQ (DX)(BX*4), DX 36935 36936 check_limit: 36937 CMPQ SI, $0x00 36938 JHI loop 36939 RET 36940 36941 // func AmdAxpyPointerLoopXInterleave_V0A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 36942 // Requires: SSE 36943 TEXT ·AmdAxpyPointerLoopXInterleave_V0A15U4(SB), NOSPLIT, $0-48 36944 MOVSS alpha+0(FP), X0 36945 MOVQ xs+8(FP), AX 36946 MOVQ incx+16(FP), CX 36947 MOVQ CX, DX 36948 SHLQ $0x04, DX 36949 MOVQ ys+24(FP), DX 36950 MOVQ incy+32(FP), BX 36951 MOVQ BX, SI 36952 SHLQ $0x04, SI 36953 MOVQ n+40(FP), SI 36954 JMP check_limit_unroll 36955 
PCALIGN $0x08 36956 NOP 36957 NOP 36958 NOP 36959 NOP 36960 NOP 36961 NOP 36962 NOP 36963 36964 loop_unroll: 36965 MOVSS (AX), X1 36966 LEAQ (AX)(CX*4), AX 36967 MOVSS (AX), X2 36968 LEAQ (AX)(CX*4), AX 36969 MOVSS (AX), X3 36970 LEAQ (AX)(CX*4), AX 36971 MOVSS (AX), X4 36972 LEAQ (AX)(CX*4), AX 36973 MULSS X0, X1 36974 MULSS X0, X2 36975 MULSS X0, X3 36976 MULSS X0, X4 36977 ADDSS (DX), X1 36978 MOVSS X1, (DX) 36979 LEAQ (DX)(BX*4), DX 36980 ADDSS (DX), X2 36981 MOVSS X2, (DX) 36982 LEAQ (DX)(BX*4), DX 36983 ADDSS (DX), X3 36984 MOVSS X3, (DX) 36985 LEAQ (DX)(BX*4), DX 36986 ADDSS (DX), X4 36987 MOVSS X4, (DX) 36988 LEAQ (DX)(BX*4), DX 36989 SUBQ $0x04, SI 36990 36991 check_limit_unroll: 36992 CMPQ SI, $0x04 36993 JHS loop_unroll 36994 JMP check_limit 36995 36996 loop: 36997 MOVSS (AX), X1 36998 MULSS X0, X1 36999 ADDSS (DX), X1 37000 MOVSS X1, (DX) 37001 DECQ SI 37002 LEAQ (AX)(CX*4), AX 37003 LEAQ (DX)(BX*4), DX 37004 37005 check_limit: 37006 CMPQ SI, $0x00 37007 JHI loop 37008 RET 37009 37010 // func AmdAxpyPointerLoopXInterleave_V1A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37011 // Requires: SSE 37012 TEXT ·AmdAxpyPointerLoopXInterleave_V1A15U4(SB), NOSPLIT, $0-48 37013 MOVSS alpha+0(FP), X0 37014 MOVQ xs+8(FP), AX 37015 MOVQ incx+16(FP), CX 37016 MOVQ CX, DX 37017 SHLQ $0x04, DX 37018 MOVQ ys+24(FP), DX 37019 MOVQ incy+32(FP), BX 37020 MOVQ BX, SI 37021 SHLQ $0x04, SI 37022 MOVQ n+40(FP), SI 37023 JMP check_limit_unroll 37024 PCALIGN $0x08 37025 NOP 37026 NOP 37027 NOP 37028 NOP 37029 NOP 37030 NOP 37031 NOP 37032 37033 loop_unroll: 37034 MOVSS (AX), X1 37035 LEAQ (AX)(CX*4), AX 37036 MOVSS (AX), X2 37037 LEAQ (AX)(CX*4), AX 37038 MOVSS (AX), X3 37039 LEAQ (AX)(CX*4), AX 37040 MOVSS (AX), X4 37041 LEAQ (AX)(CX*4), AX 37042 MULSS X0, X1 37043 MULSS X0, X2 37044 MULSS X0, X3 37045 MULSS X0, X4 37046 ADDSS (DX), X1 37047 MOVSS X1, (DX) 37048 LEAQ (DX)(BX*4), DX 37049 ADDSS (DX), X2 37050 MOVSS X2, (DX) 37051 LEAQ (DX)(BX*4), DX 37052 ADDSS (DX), X3 37053 MOVSS X3, (DX) 37054 LEAQ (DX)(BX*4), DX 37055 ADDSS (DX), X4 37056 MOVSS X4, (DX) 37057 LEAQ (DX)(BX*4), DX 37058 SUBQ $0x04, SI 37059 37060 check_limit_unroll: 37061 CMPQ SI, $0x04 37062 JHS loop_unroll 37063 JMP check_limit 37064 37065 loop: 37066 MOVSS (AX), X1 37067 MULSS X0, X1 37068 ADDSS (DX), X1 37069 MOVSS X1, (DX) 37070 DECQ SI 37071 LEAQ (AX)(CX*4), AX 37072 LEAQ (DX)(BX*4), DX 37073 37074 check_limit: 37075 CMPQ SI, $0x00 37076 JHI loop 37077 RET 37078 37079 // func AmdAxpyPointerLoopXInterleave_V2A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37080 // Requires: SSE 37081 TEXT ·AmdAxpyPointerLoopXInterleave_V2A15U4(SB), NOSPLIT, $0-48 37082 MOVSS alpha+0(FP), X0 37083 MOVQ xs+8(FP), AX 37084 MOVQ incx+16(FP), CX 37085 MOVQ CX, DX 37086 SHLQ $0x04, DX 37087 MOVQ ys+24(FP), DX 37088 MOVQ incy+32(FP), BX 37089 MOVQ BX, SI 37090 SHLQ $0x04, SI 37091 MOVQ n+40(FP), SI 37092 JMP check_limit_unroll 37093 PCALIGN $0x08 37094 NOP 37095 NOP 37096 NOP 37097 NOP 37098 NOP 37099 NOP 37100 NOP 37101 37102 loop_unroll: 37103 MOVSS (AX), X1 37104 LEAQ (AX)(CX*4), AX 37105 MOVSS (AX), X2 37106 LEAQ (AX)(CX*4), AX 37107 MOVSS (AX), X3 37108 LEAQ (AX)(CX*4), AX 37109 MOVSS (AX), X4 37110 LEAQ (AX)(CX*4), AX 37111 MULSS X0, X1 37112 MULSS X0, X2 37113 MULSS X0, X3 37114 MULSS X0, X4 37115 ADDSS (DX), X1 37116 MOVSS X1, (DX) 37117 LEAQ (DX)(BX*4), DX 37118 ADDSS (DX), X2 37119 MOVSS X2, (DX) 37120 LEAQ (DX)(BX*4), DX 37121 ADDSS (DX), X3 37122 MOVSS X3, (DX) 37123 
LEAQ (DX)(BX*4), DX 37124 ADDSS (DX), X4 37125 MOVSS X4, (DX) 37126 LEAQ (DX)(BX*4), DX 37127 SUBQ $0x04, SI 37128 37129 check_limit_unroll: 37130 CMPQ SI, $0x04 37131 JHS loop_unroll 37132 JMP check_limit 37133 37134 loop: 37135 MOVSS (AX), X1 37136 MULSS X0, X1 37137 ADDSS (DX), X1 37138 MOVSS X1, (DX) 37139 DECQ SI 37140 LEAQ (AX)(CX*4), AX 37141 LEAQ (DX)(BX*4), DX 37142 37143 check_limit: 37144 CMPQ SI, $0x00 37145 JHI loop 37146 RET 37147 37148 // func AmdAxpyPointerLoopXInterleave_V3A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37149 // Requires: SSE 37150 TEXT ·AmdAxpyPointerLoopXInterleave_V3A15U4(SB), NOSPLIT, $0-48 37151 MOVSS alpha+0(FP), X0 37152 MOVQ xs+8(FP), AX 37153 MOVQ incx+16(FP), CX 37154 MOVQ CX, DX 37155 SHLQ $0x04, DX 37156 MOVQ ys+24(FP), DX 37157 MOVQ incy+32(FP), BX 37158 MOVQ BX, SI 37159 SHLQ $0x04, SI 37160 MOVQ n+40(FP), SI 37161 JMP check_limit_unroll 37162 PCALIGN $0x08 37163 NOP 37164 NOP 37165 NOP 37166 NOP 37167 NOP 37168 NOP 37169 NOP 37170 37171 loop_unroll: 37172 MOVSS (AX), X1 37173 LEAQ (AX)(CX*4), AX 37174 MOVSS (AX), X2 37175 LEAQ (AX)(CX*4), AX 37176 MOVSS (AX), X3 37177 LEAQ (AX)(CX*4), AX 37178 MOVSS (AX), X4 37179 LEAQ (AX)(CX*4), AX 37180 MULSS X0, X1 37181 MULSS X0, X2 37182 MULSS X0, X3 37183 MULSS X0, X4 37184 ADDSS (DX), X1 37185 MOVSS X1, (DX) 37186 LEAQ (DX)(BX*4), DX 37187 ADDSS (DX), X2 37188 MOVSS X2, (DX) 37189 LEAQ (DX)(BX*4), DX 37190 ADDSS (DX), X3 37191 MOVSS X3, (DX) 37192 LEAQ (DX)(BX*4), DX 37193 ADDSS (DX), X4 37194 MOVSS X4, (DX) 37195 LEAQ (DX)(BX*4), DX 37196 SUBQ $0x04, SI 37197 37198 check_limit_unroll: 37199 CMPQ SI, $0x04 37200 JHS loop_unroll 37201 JMP check_limit 37202 37203 loop: 37204 MOVSS (AX), X1 37205 MULSS X0, X1 37206 ADDSS (DX), X1 37207 MOVSS X1, (DX) 37208 DECQ SI 37209 LEAQ (AX)(CX*4), AX 37210 LEAQ (DX)(BX*4), DX 37211 37212 check_limit: 37213 CMPQ SI, $0x00 37214 JHI loop 37215 RET 37216 37217 // func AmdAxpyPointerLoopXInterleave_V4A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37218 // Requires: SSE 37219 TEXT ·AmdAxpyPointerLoopXInterleave_V4A15U4(SB), NOSPLIT, $0-48 37220 MOVSS alpha+0(FP), X0 37221 MOVQ xs+8(FP), AX 37222 MOVQ incx+16(FP), CX 37223 MOVQ CX, DX 37224 SHLQ $0x04, DX 37225 MOVQ ys+24(FP), DX 37226 MOVQ incy+32(FP), BX 37227 MOVQ BX, SI 37228 SHLQ $0x04, SI 37229 MOVQ n+40(FP), SI 37230 JMP check_limit_unroll 37231 PCALIGN $0x08 37232 NOP 37233 NOP 37234 NOP 37235 NOP 37236 NOP 37237 NOP 37238 NOP 37239 37240 loop_unroll: 37241 MOVSS (AX), X1 37242 LEAQ (AX)(CX*4), AX 37243 MOVSS (AX), X2 37244 LEAQ (AX)(CX*4), AX 37245 MOVSS (AX), X3 37246 LEAQ (AX)(CX*4), AX 37247 MOVSS (AX), X4 37248 LEAQ (AX)(CX*4), AX 37249 MULSS X0, X1 37250 MULSS X0, X2 37251 MULSS X0, X3 37252 MULSS X0, X4 37253 ADDSS (DX), X1 37254 MOVSS X1, (DX) 37255 LEAQ (DX)(BX*4), DX 37256 ADDSS (DX), X2 37257 MOVSS X2, (DX) 37258 LEAQ (DX)(BX*4), DX 37259 ADDSS (DX), X3 37260 MOVSS X3, (DX) 37261 LEAQ (DX)(BX*4), DX 37262 ADDSS (DX), X4 37263 MOVSS X4, (DX) 37264 LEAQ (DX)(BX*4), DX 37265 SUBQ $0x04, SI 37266 37267 check_limit_unroll: 37268 CMPQ SI, $0x04 37269 JHS loop_unroll 37270 JMP check_limit 37271 37272 loop: 37273 MOVSS (AX), X1 37274 MULSS X0, X1 37275 ADDSS (DX), X1 37276 MOVSS X1, (DX) 37277 DECQ SI 37278 LEAQ (AX)(CX*4), AX 37279 LEAQ (DX)(BX*4), DX 37280 37281 check_limit: 37282 CMPQ SI, $0x00 37283 JHI loop 37284 RET 37285 37286 // func AmdAxpyPointerLoopXInterleave_V5A15U4(alpha float32, xs *float32, incx uintptr, ys *float32, 
incy uintptr, n uintptr) 37287 // Requires: SSE 37288 TEXT ·AmdAxpyPointerLoopXInterleave_V5A15U4(SB), NOSPLIT, $0-48 37289 MOVSS alpha+0(FP), X0 37290 MOVQ xs+8(FP), AX 37291 MOVQ incx+16(FP), CX 37292 MOVQ CX, DX 37293 SHLQ $0x04, DX 37294 MOVQ ys+24(FP), DX 37295 MOVQ incy+32(FP), BX 37296 MOVQ BX, SI 37297 SHLQ $0x04, SI 37298 MOVQ n+40(FP), SI 37299 JMP check_limit_unroll 37300 PCALIGN $0x08 37301 NOP 37302 NOP 37303 NOP 37304 NOP 37305 NOP 37306 NOP 37307 NOP 37308 37309 loop_unroll: 37310 MOVSS (AX), X1 37311 LEAQ (AX)(CX*4), AX 37312 MOVSS (AX), X2 37313 LEAQ (AX)(CX*4), AX 37314 MOVSS (AX), X3 37315 LEAQ (AX)(CX*4), AX 37316 MOVSS (AX), X4 37317 LEAQ (AX)(CX*4), AX 37318 MULSS X0, X1 37319 MULSS X0, X2 37320 MULSS X0, X3 37321 MULSS X0, X4 37322 ADDSS (DX), X1 37323 MOVSS X1, (DX) 37324 LEAQ (DX)(BX*4), DX 37325 ADDSS (DX), X2 37326 MOVSS X2, (DX) 37327 LEAQ (DX)(BX*4), DX 37328 ADDSS (DX), X3 37329 MOVSS X3, (DX) 37330 LEAQ (DX)(BX*4), DX 37331 ADDSS (DX), X4 37332 MOVSS X4, (DX) 37333 LEAQ (DX)(BX*4), DX 37334 SUBQ $0x04, SI 37335 37336 check_limit_unroll: 37337 CMPQ SI, $0x04 37338 JHS loop_unroll 37339 JMP check_limit 37340 37341 loop: 37342 MOVSS (AX), X1 37343 MULSS X0, X1 37344 ADDSS (DX), X1 37345 MOVSS X1, (DX) 37346 DECQ SI 37347 LEAQ (AX)(CX*4), AX 37348 LEAQ (DX)(BX*4), DX 37349 37350 check_limit: 37351 CMPQ SI, $0x00 37352 JHI loop 37353 RET 37354 37355 // func AmdAxpyPointerLoopXInterleave_V0A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37356 // Requires: SSE 37357 TEXT ·AmdAxpyPointerLoopXInterleave_V0A16U4(SB), NOSPLIT, $0-48 37358 MOVSS alpha+0(FP), X0 37359 MOVQ xs+8(FP), AX 37360 MOVQ incx+16(FP), CX 37361 MOVQ CX, DX 37362 SHLQ $0x04, DX 37363 MOVQ ys+24(FP), DX 37364 MOVQ incy+32(FP), BX 37365 MOVQ BX, SI 37366 SHLQ $0x04, SI 37367 MOVQ n+40(FP), SI 37368 JMP check_limit_unroll 37369 PCALIGN $0x10 37370 37371 loop_unroll: 37372 MOVSS (AX), X1 37373 LEAQ (AX)(CX*4), AX 37374 MOVSS (AX), X2 37375 LEAQ (AX)(CX*4), AX 37376 MOVSS (AX), X3 37377 LEAQ (AX)(CX*4), AX 37378 MOVSS (AX), X4 37379 LEAQ (AX)(CX*4), AX 37380 MULSS X0, X1 37381 MULSS X0, X2 37382 MULSS X0, X3 37383 MULSS X0, X4 37384 ADDSS (DX), X1 37385 MOVSS X1, (DX) 37386 LEAQ (DX)(BX*4), DX 37387 ADDSS (DX), X2 37388 MOVSS X2, (DX) 37389 LEAQ (DX)(BX*4), DX 37390 ADDSS (DX), X3 37391 MOVSS X3, (DX) 37392 LEAQ (DX)(BX*4), DX 37393 ADDSS (DX), X4 37394 MOVSS X4, (DX) 37395 LEAQ (DX)(BX*4), DX 37396 SUBQ $0x04, SI 37397 37398 check_limit_unroll: 37399 CMPQ SI, $0x04 37400 JHS loop_unroll 37401 JMP check_limit 37402 37403 loop: 37404 MOVSS (AX), X1 37405 MULSS X0, X1 37406 ADDSS (DX), X1 37407 MOVSS X1, (DX) 37408 DECQ SI 37409 LEAQ (AX)(CX*4), AX 37410 LEAQ (DX)(BX*4), DX 37411 37412 check_limit: 37413 CMPQ SI, $0x00 37414 JHI loop 37415 RET 37416 37417 // func AmdAxpyPointerLoopXInterleave_V1A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37418 // Requires: SSE 37419 TEXT ·AmdAxpyPointerLoopXInterleave_V1A16U4(SB), NOSPLIT, $0-48 37420 MOVSS alpha+0(FP), X0 37421 MOVQ xs+8(FP), AX 37422 MOVQ incx+16(FP), CX 37423 MOVQ CX, DX 37424 SHLQ $0x04, DX 37425 MOVQ ys+24(FP), DX 37426 MOVQ incy+32(FP), BX 37427 MOVQ BX, SI 37428 SHLQ $0x04, SI 37429 MOVQ n+40(FP), SI 37430 JMP check_limit_unroll 37431 PCALIGN $0x10 37432 37433 loop_unroll: 37434 MOVSS (AX), X1 37435 LEAQ (AX)(CX*4), AX 37436 MOVSS (AX), X2 37437 LEAQ (AX)(CX*4), AX 37438 MOVSS (AX), X3 37439 LEAQ (AX)(CX*4), AX 37440 MOVSS (AX), X4 37441 LEAQ (AX)(CX*4), AX 37442 MULSS 
X0, X1 37443 MULSS X0, X2 37444 MULSS X0, X3 37445 MULSS X0, X4 37446 ADDSS (DX), X1 37447 MOVSS X1, (DX) 37448 LEAQ (DX)(BX*4), DX 37449 ADDSS (DX), X2 37450 MOVSS X2, (DX) 37451 LEAQ (DX)(BX*4), DX 37452 ADDSS (DX), X3 37453 MOVSS X3, (DX) 37454 LEAQ (DX)(BX*4), DX 37455 ADDSS (DX), X4 37456 MOVSS X4, (DX) 37457 LEAQ (DX)(BX*4), DX 37458 SUBQ $0x04, SI 37459 37460 check_limit_unroll: 37461 CMPQ SI, $0x04 37462 JHS loop_unroll 37463 JMP check_limit 37464 37465 loop: 37466 MOVSS (AX), X1 37467 MULSS X0, X1 37468 ADDSS (DX), X1 37469 MOVSS X1, (DX) 37470 DECQ SI 37471 LEAQ (AX)(CX*4), AX 37472 LEAQ (DX)(BX*4), DX 37473 37474 check_limit: 37475 CMPQ SI, $0x00 37476 JHI loop 37477 RET 37478 37479 // func AmdAxpyPointerLoopXInterleave_V2A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37480 // Requires: SSE 37481 TEXT ·AmdAxpyPointerLoopXInterleave_V2A16U4(SB), NOSPLIT, $0-48 37482 MOVSS alpha+0(FP), X0 37483 MOVQ xs+8(FP), AX 37484 MOVQ incx+16(FP), CX 37485 MOVQ CX, DX 37486 SHLQ $0x04, DX 37487 MOVQ ys+24(FP), DX 37488 MOVQ incy+32(FP), BX 37489 MOVQ BX, SI 37490 SHLQ $0x04, SI 37491 MOVQ n+40(FP), SI 37492 JMP check_limit_unroll 37493 PCALIGN $0x10 37494 37495 loop_unroll: 37496 MOVSS (AX), X1 37497 LEAQ (AX)(CX*4), AX 37498 MOVSS (AX), X2 37499 LEAQ (AX)(CX*4), AX 37500 MOVSS (AX), X3 37501 LEAQ (AX)(CX*4), AX 37502 MOVSS (AX), X4 37503 LEAQ (AX)(CX*4), AX 37504 MULSS X0, X1 37505 MULSS X0, X2 37506 MULSS X0, X3 37507 MULSS X0, X4 37508 ADDSS (DX), X1 37509 MOVSS X1, (DX) 37510 LEAQ (DX)(BX*4), DX 37511 ADDSS (DX), X2 37512 MOVSS X2, (DX) 37513 LEAQ (DX)(BX*4), DX 37514 ADDSS (DX), X3 37515 MOVSS X3, (DX) 37516 LEAQ (DX)(BX*4), DX 37517 ADDSS (DX), X4 37518 MOVSS X4, (DX) 37519 LEAQ (DX)(BX*4), DX 37520 SUBQ $0x04, SI 37521 37522 check_limit_unroll: 37523 CMPQ SI, $0x04 37524 JHS loop_unroll 37525 JMP check_limit 37526 37527 loop: 37528 MOVSS (AX), X1 37529 MULSS X0, X1 37530 ADDSS (DX), X1 37531 MOVSS X1, (DX) 37532 DECQ SI 37533 LEAQ (AX)(CX*4), AX 37534 LEAQ (DX)(BX*4), DX 37535 37536 check_limit: 37537 CMPQ SI, $0x00 37538 JHI loop 37539 RET 37540 37541 // func AmdAxpyPointerLoopXInterleave_V3A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37542 // Requires: SSE 37543 TEXT ·AmdAxpyPointerLoopXInterleave_V3A16U4(SB), NOSPLIT, $0-48 37544 MOVSS alpha+0(FP), X0 37545 MOVQ xs+8(FP), AX 37546 MOVQ incx+16(FP), CX 37547 MOVQ CX, DX 37548 SHLQ $0x04, DX 37549 MOVQ ys+24(FP), DX 37550 MOVQ incy+32(FP), BX 37551 MOVQ BX, SI 37552 SHLQ $0x04, SI 37553 MOVQ n+40(FP), SI 37554 JMP check_limit_unroll 37555 PCALIGN $0x10 37556 37557 loop_unroll: 37558 MOVSS (AX), X1 37559 LEAQ (AX)(CX*4), AX 37560 MOVSS (AX), X2 37561 LEAQ (AX)(CX*4), AX 37562 MOVSS (AX), X3 37563 LEAQ (AX)(CX*4), AX 37564 MOVSS (AX), X4 37565 LEAQ (AX)(CX*4), AX 37566 MULSS X0, X1 37567 MULSS X0, X2 37568 MULSS X0, X3 37569 MULSS X0, X4 37570 ADDSS (DX), X1 37571 MOVSS X1, (DX) 37572 LEAQ (DX)(BX*4), DX 37573 ADDSS (DX), X2 37574 MOVSS X2, (DX) 37575 LEAQ (DX)(BX*4), DX 37576 ADDSS (DX), X3 37577 MOVSS X3, (DX) 37578 LEAQ (DX)(BX*4), DX 37579 ADDSS (DX), X4 37580 MOVSS X4, (DX) 37581 LEAQ (DX)(BX*4), DX 37582 SUBQ $0x04, SI 37583 37584 check_limit_unroll: 37585 CMPQ SI, $0x04 37586 JHS loop_unroll 37587 JMP check_limit 37588 37589 loop: 37590 MOVSS (AX), X1 37591 MULSS X0, X1 37592 ADDSS (DX), X1 37593 MOVSS X1, (DX) 37594 DECQ SI 37595 LEAQ (AX)(CX*4), AX 37596 LEAQ (DX)(BX*4), DX 37597 37598 check_limit: 37599 CMPQ SI, $0x00 37600 JHI loop 37601 RET 37602 
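// NOTE (editorial comment, not emitted by the generator): every AmdAxpyPointerLoopXInterleave_*
// kernel in this family computes the same scalar AXPY update,
//
//	ys[i*incy] += alpha * xs[i*incx]   for i = 0 .. n-1,
//
// using scalar SSE instructions (MOVSS/MULSS/ADDSS). The U4 variants process four elements per
// unrolled iteration and the U8 variants eight, each followed by a one-element tail loop. The
// A<n> suffix appears to control only the PCALIGN/NOP padding emitted before loop_unroll, and
// V<n> appears to be a generator variant index; within the listing shown here the bodies are
// otherwise identical. The MOVQ CX, DX / SHLQ and MOVQ BX, SI / SHLQ pairs in the prologues are
// immediately overwritten by the loads of ys and n, so the loops are bounded solely by counting
// SI down from n.
//
// A minimal Go sketch of the intended semantics (illustrative only; axpyRef is not a name from
// this package):
//
//	func axpyRef(alpha float32, xs []float32, incx uintptr, ys []float32, incy uintptr, n uintptr) {
//		var xi, yi uintptr
//		for i := uintptr(0); i < n; i++ {
//			ys[yi] += alpha * xs[xi] // same update as the MULSS/ADDSS/MOVSS sequence above
//			xi += incx
//			yi += incy
//		}
//	}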
37603 // func AmdAxpyPointerLoopXInterleave_V4A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37604 // Requires: SSE 37605 TEXT ·AmdAxpyPointerLoopXInterleave_V4A16U4(SB), NOSPLIT, $0-48 37606 MOVSS alpha+0(FP), X0 37607 MOVQ xs+8(FP), AX 37608 MOVQ incx+16(FP), CX 37609 MOVQ CX, DX 37610 SHLQ $0x04, DX 37611 MOVQ ys+24(FP), DX 37612 MOVQ incy+32(FP), BX 37613 MOVQ BX, SI 37614 SHLQ $0x04, SI 37615 MOVQ n+40(FP), SI 37616 JMP check_limit_unroll 37617 PCALIGN $0x10 37618 37619 loop_unroll: 37620 MOVSS (AX), X1 37621 LEAQ (AX)(CX*4), AX 37622 MOVSS (AX), X2 37623 LEAQ (AX)(CX*4), AX 37624 MOVSS (AX), X3 37625 LEAQ (AX)(CX*4), AX 37626 MOVSS (AX), X4 37627 LEAQ (AX)(CX*4), AX 37628 MULSS X0, X1 37629 MULSS X0, X2 37630 MULSS X0, X3 37631 MULSS X0, X4 37632 ADDSS (DX), X1 37633 MOVSS X1, (DX) 37634 LEAQ (DX)(BX*4), DX 37635 ADDSS (DX), X2 37636 MOVSS X2, (DX) 37637 LEAQ (DX)(BX*4), DX 37638 ADDSS (DX), X3 37639 MOVSS X3, (DX) 37640 LEAQ (DX)(BX*4), DX 37641 ADDSS (DX), X4 37642 MOVSS X4, (DX) 37643 LEAQ (DX)(BX*4), DX 37644 SUBQ $0x04, SI 37645 37646 check_limit_unroll: 37647 CMPQ SI, $0x04 37648 JHS loop_unroll 37649 JMP check_limit 37650 37651 loop: 37652 MOVSS (AX), X1 37653 MULSS X0, X1 37654 ADDSS (DX), X1 37655 MOVSS X1, (DX) 37656 DECQ SI 37657 LEAQ (AX)(CX*4), AX 37658 LEAQ (DX)(BX*4), DX 37659 37660 check_limit: 37661 CMPQ SI, $0x00 37662 JHI loop 37663 RET 37664 37665 // func AmdAxpyPointerLoopXInterleave_V5A16U4(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37666 // Requires: SSE 37667 TEXT ·AmdAxpyPointerLoopXInterleave_V5A16U4(SB), NOSPLIT, $0-48 37668 MOVSS alpha+0(FP), X0 37669 MOVQ xs+8(FP), AX 37670 MOVQ incx+16(FP), CX 37671 MOVQ CX, DX 37672 SHLQ $0x04, DX 37673 MOVQ ys+24(FP), DX 37674 MOVQ incy+32(FP), BX 37675 MOVQ BX, SI 37676 SHLQ $0x04, SI 37677 MOVQ n+40(FP), SI 37678 JMP check_limit_unroll 37679 PCALIGN $0x10 37680 37681 loop_unroll: 37682 MOVSS (AX), X1 37683 LEAQ (AX)(CX*4), AX 37684 MOVSS (AX), X2 37685 LEAQ (AX)(CX*4), AX 37686 MOVSS (AX), X3 37687 LEAQ (AX)(CX*4), AX 37688 MOVSS (AX), X4 37689 LEAQ (AX)(CX*4), AX 37690 MULSS X0, X1 37691 MULSS X0, X2 37692 MULSS X0, X3 37693 MULSS X0, X4 37694 ADDSS (DX), X1 37695 MOVSS X1, (DX) 37696 LEAQ (DX)(BX*4), DX 37697 ADDSS (DX), X2 37698 MOVSS X2, (DX) 37699 LEAQ (DX)(BX*4), DX 37700 ADDSS (DX), X3 37701 MOVSS X3, (DX) 37702 LEAQ (DX)(BX*4), DX 37703 ADDSS (DX), X4 37704 MOVSS X4, (DX) 37705 LEAQ (DX)(BX*4), DX 37706 SUBQ $0x04, SI 37707 37708 check_limit_unroll: 37709 CMPQ SI, $0x04 37710 JHS loop_unroll 37711 JMP check_limit 37712 37713 loop: 37714 MOVSS (AX), X1 37715 MULSS X0, X1 37716 ADDSS (DX), X1 37717 MOVSS X1, (DX) 37718 DECQ SI 37719 LEAQ (AX)(CX*4), AX 37720 LEAQ (DX)(BX*4), DX 37721 37722 check_limit: 37723 CMPQ SI, $0x00 37724 JHI loop 37725 RET 37726 37727 // func AmdAxpyPointerLoopXInterleave_V0A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37728 // Requires: SSE 37729 TEXT ·AmdAxpyPointerLoopXInterleave_V0A0U8(SB), NOSPLIT, $0-48 37730 MOVSS alpha+0(FP), X0 37731 MOVQ xs+8(FP), AX 37732 MOVQ incx+16(FP), CX 37733 MOVQ CX, DX 37734 SHLQ $0x05, DX 37735 MOVQ ys+24(FP), DX 37736 MOVQ incy+32(FP), BX 37737 MOVQ BX, SI 37738 SHLQ $0x05, SI 37739 MOVQ n+40(FP), SI 37740 JMP check_limit_unroll 37741 37742 loop_unroll: 37743 MOVSS (AX), X1 37744 LEAQ (AX)(CX*4), AX 37745 MOVSS (AX), X2 37746 LEAQ (AX)(CX*4), AX 37747 MOVSS (AX), X3 37748 LEAQ (AX)(CX*4), AX 37749 MOVSS (AX), X4 37750 LEAQ (AX)(CX*4), 
AX 37751 MOVSS (AX), X5 37752 LEAQ (AX)(CX*4), AX 37753 MOVSS (AX), X6 37754 LEAQ (AX)(CX*4), AX 37755 MOVSS (AX), X7 37756 LEAQ (AX)(CX*4), AX 37757 MOVSS (AX), X8 37758 LEAQ (AX)(CX*4), AX 37759 MULSS X0, X1 37760 MULSS X0, X2 37761 MULSS X0, X3 37762 MULSS X0, X4 37763 MULSS X0, X5 37764 MULSS X0, X6 37765 MULSS X0, X7 37766 MULSS X0, X8 37767 ADDSS (DX), X1 37768 MOVSS X1, (DX) 37769 LEAQ (DX)(BX*4), DX 37770 ADDSS (DX), X2 37771 MOVSS X2, (DX) 37772 LEAQ (DX)(BX*4), DX 37773 ADDSS (DX), X3 37774 MOVSS X3, (DX) 37775 LEAQ (DX)(BX*4), DX 37776 ADDSS (DX), X4 37777 MOVSS X4, (DX) 37778 LEAQ (DX)(BX*4), DX 37779 ADDSS (DX), X5 37780 MOVSS X5, (DX) 37781 LEAQ (DX)(BX*4), DX 37782 ADDSS (DX), X6 37783 MOVSS X6, (DX) 37784 LEAQ (DX)(BX*4), DX 37785 ADDSS (DX), X7 37786 MOVSS X7, (DX) 37787 LEAQ (DX)(BX*4), DX 37788 ADDSS (DX), X8 37789 MOVSS X8, (DX) 37790 LEAQ (DX)(BX*4), DX 37791 SUBQ $0x08, SI 37792 37793 check_limit_unroll: 37794 CMPQ SI, $0x08 37795 JHS loop_unroll 37796 JMP check_limit 37797 37798 loop: 37799 MOVSS (AX), X1 37800 MULSS X0, X1 37801 ADDSS (DX), X1 37802 MOVSS X1, (DX) 37803 DECQ SI 37804 LEAQ (AX)(CX*4), AX 37805 LEAQ (DX)(BX*4), DX 37806 37807 check_limit: 37808 CMPQ SI, $0x00 37809 JHI loop 37810 RET 37811 37812 // func AmdAxpyPointerLoopXInterleave_V1A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37813 // Requires: SSE 37814 TEXT ·AmdAxpyPointerLoopXInterleave_V1A0U8(SB), NOSPLIT, $0-48 37815 MOVSS alpha+0(FP), X0 37816 MOVQ xs+8(FP), AX 37817 MOVQ incx+16(FP), CX 37818 MOVQ CX, DX 37819 SHLQ $0x05, DX 37820 MOVQ ys+24(FP), DX 37821 MOVQ incy+32(FP), BX 37822 MOVQ BX, SI 37823 SHLQ $0x05, SI 37824 MOVQ n+40(FP), SI 37825 JMP check_limit_unroll 37826 37827 loop_unroll: 37828 MOVSS (AX), X1 37829 LEAQ (AX)(CX*4), AX 37830 MOVSS (AX), X2 37831 LEAQ (AX)(CX*4), AX 37832 MOVSS (AX), X3 37833 LEAQ (AX)(CX*4), AX 37834 MOVSS (AX), X4 37835 LEAQ (AX)(CX*4), AX 37836 MOVSS (AX), X5 37837 LEAQ (AX)(CX*4), AX 37838 MOVSS (AX), X6 37839 LEAQ (AX)(CX*4), AX 37840 MOVSS (AX), X7 37841 LEAQ (AX)(CX*4), AX 37842 MOVSS (AX), X8 37843 LEAQ (AX)(CX*4), AX 37844 MULSS X0, X1 37845 MULSS X0, X2 37846 MULSS X0, X3 37847 MULSS X0, X4 37848 MULSS X0, X5 37849 MULSS X0, X6 37850 MULSS X0, X7 37851 MULSS X0, X8 37852 ADDSS (DX), X1 37853 MOVSS X1, (DX) 37854 LEAQ (DX)(BX*4), DX 37855 ADDSS (DX), X2 37856 MOVSS X2, (DX) 37857 LEAQ (DX)(BX*4), DX 37858 ADDSS (DX), X3 37859 MOVSS X3, (DX) 37860 LEAQ (DX)(BX*4), DX 37861 ADDSS (DX), X4 37862 MOVSS X4, (DX) 37863 LEAQ (DX)(BX*4), DX 37864 ADDSS (DX), X5 37865 MOVSS X5, (DX) 37866 LEAQ (DX)(BX*4), DX 37867 ADDSS (DX), X6 37868 MOVSS X6, (DX) 37869 LEAQ (DX)(BX*4), DX 37870 ADDSS (DX), X7 37871 MOVSS X7, (DX) 37872 LEAQ (DX)(BX*4), DX 37873 ADDSS (DX), X8 37874 MOVSS X8, (DX) 37875 LEAQ (DX)(BX*4), DX 37876 SUBQ $0x08, SI 37877 37878 check_limit_unroll: 37879 CMPQ SI, $0x08 37880 JHS loop_unroll 37881 JMP check_limit 37882 37883 loop: 37884 MOVSS (AX), X1 37885 MULSS X0, X1 37886 ADDSS (DX), X1 37887 MOVSS X1, (DX) 37888 DECQ SI 37889 LEAQ (AX)(CX*4), AX 37890 LEAQ (DX)(BX*4), DX 37891 37892 check_limit: 37893 CMPQ SI, $0x00 37894 JHI loop 37895 RET 37896 37897 // func AmdAxpyPointerLoopXInterleave_V2A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37898 // Requires: SSE 37899 TEXT ·AmdAxpyPointerLoopXInterleave_V2A0U8(SB), NOSPLIT, $0-48 37900 MOVSS alpha+0(FP), X0 37901 MOVQ xs+8(FP), AX 37902 MOVQ incx+16(FP), CX 37903 MOVQ CX, DX 37904 SHLQ $0x05, DX 37905 MOVQ ys+24(FP), 
DX 37906 MOVQ incy+32(FP), BX 37907 MOVQ BX, SI 37908 SHLQ $0x05, SI 37909 MOVQ n+40(FP), SI 37910 JMP check_limit_unroll 37911 37912 loop_unroll: 37913 MOVSS (AX), X1 37914 LEAQ (AX)(CX*4), AX 37915 MOVSS (AX), X2 37916 LEAQ (AX)(CX*4), AX 37917 MOVSS (AX), X3 37918 LEAQ (AX)(CX*4), AX 37919 MOVSS (AX), X4 37920 LEAQ (AX)(CX*4), AX 37921 MOVSS (AX), X5 37922 LEAQ (AX)(CX*4), AX 37923 MOVSS (AX), X6 37924 LEAQ (AX)(CX*4), AX 37925 MOVSS (AX), X7 37926 LEAQ (AX)(CX*4), AX 37927 MOVSS (AX), X8 37928 LEAQ (AX)(CX*4), AX 37929 MULSS X0, X1 37930 MULSS X0, X2 37931 MULSS X0, X3 37932 MULSS X0, X4 37933 MULSS X0, X5 37934 MULSS X0, X6 37935 MULSS X0, X7 37936 MULSS X0, X8 37937 ADDSS (DX), X1 37938 MOVSS X1, (DX) 37939 LEAQ (DX)(BX*4), DX 37940 ADDSS (DX), X2 37941 MOVSS X2, (DX) 37942 LEAQ (DX)(BX*4), DX 37943 ADDSS (DX), X3 37944 MOVSS X3, (DX) 37945 LEAQ (DX)(BX*4), DX 37946 ADDSS (DX), X4 37947 MOVSS X4, (DX) 37948 LEAQ (DX)(BX*4), DX 37949 ADDSS (DX), X5 37950 MOVSS X5, (DX) 37951 LEAQ (DX)(BX*4), DX 37952 ADDSS (DX), X6 37953 MOVSS X6, (DX) 37954 LEAQ (DX)(BX*4), DX 37955 ADDSS (DX), X7 37956 MOVSS X7, (DX) 37957 LEAQ (DX)(BX*4), DX 37958 ADDSS (DX), X8 37959 MOVSS X8, (DX) 37960 LEAQ (DX)(BX*4), DX 37961 SUBQ $0x08, SI 37962 37963 check_limit_unroll: 37964 CMPQ SI, $0x08 37965 JHS loop_unroll 37966 JMP check_limit 37967 37968 loop: 37969 MOVSS (AX), X1 37970 MULSS X0, X1 37971 ADDSS (DX), X1 37972 MOVSS X1, (DX) 37973 DECQ SI 37974 LEAQ (AX)(CX*4), AX 37975 LEAQ (DX)(BX*4), DX 37976 37977 check_limit: 37978 CMPQ SI, $0x00 37979 JHI loop 37980 RET 37981 37982 // func AmdAxpyPointerLoopXInterleave_V3A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 37983 // Requires: SSE 37984 TEXT ·AmdAxpyPointerLoopXInterleave_V3A0U8(SB), NOSPLIT, $0-48 37985 MOVSS alpha+0(FP), X0 37986 MOVQ xs+8(FP), AX 37987 MOVQ incx+16(FP), CX 37988 MOVQ CX, DX 37989 SHLQ $0x05, DX 37990 MOVQ ys+24(FP), DX 37991 MOVQ incy+32(FP), BX 37992 MOVQ BX, SI 37993 SHLQ $0x05, SI 37994 MOVQ n+40(FP), SI 37995 JMP check_limit_unroll 37996 37997 loop_unroll: 37998 MOVSS (AX), X1 37999 LEAQ (AX)(CX*4), AX 38000 MOVSS (AX), X2 38001 LEAQ (AX)(CX*4), AX 38002 MOVSS (AX), X3 38003 LEAQ (AX)(CX*4), AX 38004 MOVSS (AX), X4 38005 LEAQ (AX)(CX*4), AX 38006 MOVSS (AX), X5 38007 LEAQ (AX)(CX*4), AX 38008 MOVSS (AX), X6 38009 LEAQ (AX)(CX*4), AX 38010 MOVSS (AX), X7 38011 LEAQ (AX)(CX*4), AX 38012 MOVSS (AX), X8 38013 LEAQ (AX)(CX*4), AX 38014 MULSS X0, X1 38015 MULSS X0, X2 38016 MULSS X0, X3 38017 MULSS X0, X4 38018 MULSS X0, X5 38019 MULSS X0, X6 38020 MULSS X0, X7 38021 MULSS X0, X8 38022 ADDSS (DX), X1 38023 MOVSS X1, (DX) 38024 LEAQ (DX)(BX*4), DX 38025 ADDSS (DX), X2 38026 MOVSS X2, (DX) 38027 LEAQ (DX)(BX*4), DX 38028 ADDSS (DX), X3 38029 MOVSS X3, (DX) 38030 LEAQ (DX)(BX*4), DX 38031 ADDSS (DX), X4 38032 MOVSS X4, (DX) 38033 LEAQ (DX)(BX*4), DX 38034 ADDSS (DX), X5 38035 MOVSS X5, (DX) 38036 LEAQ (DX)(BX*4), DX 38037 ADDSS (DX), X6 38038 MOVSS X6, (DX) 38039 LEAQ (DX)(BX*4), DX 38040 ADDSS (DX), X7 38041 MOVSS X7, (DX) 38042 LEAQ (DX)(BX*4), DX 38043 ADDSS (DX), X8 38044 MOVSS X8, (DX) 38045 LEAQ (DX)(BX*4), DX 38046 SUBQ $0x08, SI 38047 38048 check_limit_unroll: 38049 CMPQ SI, $0x08 38050 JHS loop_unroll 38051 JMP check_limit 38052 38053 loop: 38054 MOVSS (AX), X1 38055 MULSS X0, X1 38056 ADDSS (DX), X1 38057 MOVSS X1, (DX) 38058 DECQ SI 38059 LEAQ (AX)(CX*4), AX 38060 LEAQ (DX)(BX*4), DX 38061 38062 check_limit: 38063 CMPQ SI, $0x00 38064 JHI loop 38065 RET 38066 38067 // func 
AmdAxpyPointerLoopXInterleave_V4A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 38068 // Requires: SSE 38069 TEXT ·AmdAxpyPointerLoopXInterleave_V4A0U8(SB), NOSPLIT, $0-48 38070 MOVSS alpha+0(FP), X0 38071 MOVQ xs+8(FP), AX 38072 MOVQ incx+16(FP), CX 38073 MOVQ CX, DX 38074 SHLQ $0x05, DX 38075 MOVQ ys+24(FP), DX 38076 MOVQ incy+32(FP), BX 38077 MOVQ BX, SI 38078 SHLQ $0x05, SI 38079 MOVQ n+40(FP), SI 38080 JMP check_limit_unroll 38081 38082 loop_unroll: 38083 MOVSS (AX), X1 38084 LEAQ (AX)(CX*4), AX 38085 MOVSS (AX), X2 38086 LEAQ (AX)(CX*4), AX 38087 MOVSS (AX), X3 38088 LEAQ (AX)(CX*4), AX 38089 MOVSS (AX), X4 38090 LEAQ (AX)(CX*4), AX 38091 MOVSS (AX), X5 38092 LEAQ (AX)(CX*4), AX 38093 MOVSS (AX), X6 38094 LEAQ (AX)(CX*4), AX 38095 MOVSS (AX), X7 38096 LEAQ (AX)(CX*4), AX 38097 MOVSS (AX), X8 38098 LEAQ (AX)(CX*4), AX 38099 MULSS X0, X1 38100 MULSS X0, X2 38101 MULSS X0, X3 38102 MULSS X0, X4 38103 MULSS X0, X5 38104 MULSS X0, X6 38105 MULSS X0, X7 38106 MULSS X0, X8 38107 ADDSS (DX), X1 38108 MOVSS X1, (DX) 38109 LEAQ (DX)(BX*4), DX 38110 ADDSS (DX), X2 38111 MOVSS X2, (DX) 38112 LEAQ (DX)(BX*4), DX 38113 ADDSS (DX), X3 38114 MOVSS X3, (DX) 38115 LEAQ (DX)(BX*4), DX 38116 ADDSS (DX), X4 38117 MOVSS X4, (DX) 38118 LEAQ (DX)(BX*4), DX 38119 ADDSS (DX), X5 38120 MOVSS X5, (DX) 38121 LEAQ (DX)(BX*4), DX 38122 ADDSS (DX), X6 38123 MOVSS X6, (DX) 38124 LEAQ (DX)(BX*4), DX 38125 ADDSS (DX), X7 38126 MOVSS X7, (DX) 38127 LEAQ (DX)(BX*4), DX 38128 ADDSS (DX), X8 38129 MOVSS X8, (DX) 38130 LEAQ (DX)(BX*4), DX 38131 SUBQ $0x08, SI 38132 38133 check_limit_unroll: 38134 CMPQ SI, $0x08 38135 JHS loop_unroll 38136 JMP check_limit 38137 38138 loop: 38139 MOVSS (AX), X1 38140 MULSS X0, X1 38141 ADDSS (DX), X1 38142 MOVSS X1, (DX) 38143 DECQ SI 38144 LEAQ (AX)(CX*4), AX 38145 LEAQ (DX)(BX*4), DX 38146 38147 check_limit: 38148 CMPQ SI, $0x00 38149 JHI loop 38150 RET 38151 38152 // func AmdAxpyPointerLoopXInterleave_V5A0U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 38153 // Requires: SSE 38154 TEXT ·AmdAxpyPointerLoopXInterleave_V5A0U8(SB), NOSPLIT, $0-48 38155 MOVSS alpha+0(FP), X0 38156 MOVQ xs+8(FP), AX 38157 MOVQ incx+16(FP), CX 38158 MOVQ CX, DX 38159 SHLQ $0x05, DX 38160 MOVQ ys+24(FP), DX 38161 MOVQ incy+32(FP), BX 38162 MOVQ BX, SI 38163 SHLQ $0x05, SI 38164 MOVQ n+40(FP), SI 38165 JMP check_limit_unroll 38166 38167 loop_unroll: 38168 MOVSS (AX), X1 38169 LEAQ (AX)(CX*4), AX 38170 MOVSS (AX), X2 38171 LEAQ (AX)(CX*4), AX 38172 MOVSS (AX), X3 38173 LEAQ (AX)(CX*4), AX 38174 MOVSS (AX), X4 38175 LEAQ (AX)(CX*4), AX 38176 MOVSS (AX), X5 38177 LEAQ (AX)(CX*4), AX 38178 MOVSS (AX), X6 38179 LEAQ (AX)(CX*4), AX 38180 MOVSS (AX), X7 38181 LEAQ (AX)(CX*4), AX 38182 MOVSS (AX), X8 38183 LEAQ (AX)(CX*4), AX 38184 MULSS X0, X1 38185 MULSS X0, X2 38186 MULSS X0, X3 38187 MULSS X0, X4 38188 MULSS X0, X5 38189 MULSS X0, X6 38190 MULSS X0, X7 38191 MULSS X0, X8 38192 ADDSS (DX), X1 38193 MOVSS X1, (DX) 38194 LEAQ (DX)(BX*4), DX 38195 ADDSS (DX), X2 38196 MOVSS X2, (DX) 38197 LEAQ (DX)(BX*4), DX 38198 ADDSS (DX), X3 38199 MOVSS X3, (DX) 38200 LEAQ (DX)(BX*4), DX 38201 ADDSS (DX), X4 38202 MOVSS X4, (DX) 38203 LEAQ (DX)(BX*4), DX 38204 ADDSS (DX), X5 38205 MOVSS X5, (DX) 38206 LEAQ (DX)(BX*4), DX 38207 ADDSS (DX), X6 38208 MOVSS X6, (DX) 38209 LEAQ (DX)(BX*4), DX 38210 ADDSS (DX), X7 38211 MOVSS X7, (DX) 38212 LEAQ (DX)(BX*4), DX 38213 ADDSS (DX), X8 38214 MOVSS X8, (DX) 38215 LEAQ (DX)(BX*4), DX 38216 SUBQ $0x08, SI 38217 38218 
check_limit_unroll: 38219 CMPQ SI, $0x08 38220 JHS loop_unroll 38221 JMP check_limit 38222 38223 loop: 38224 MOVSS (AX), X1 38225 MULSS X0, X1 38226 ADDSS (DX), X1 38227 MOVSS X1, (DX) 38228 DECQ SI 38229 LEAQ (AX)(CX*4), AX 38230 LEAQ (DX)(BX*4), DX 38231 38232 check_limit: 38233 CMPQ SI, $0x00 38234 JHI loop 38235 RET 38236 38237 // func AmdAxpyPointerLoopXInterleave_V0A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 38238 // Requires: SSE 38239 TEXT ·AmdAxpyPointerLoopXInterleave_V0A8U8(SB), NOSPLIT, $0-48 38240 MOVSS alpha+0(FP), X0 38241 MOVQ xs+8(FP), AX 38242 MOVQ incx+16(FP), CX 38243 MOVQ CX, DX 38244 SHLQ $0x05, DX 38245 MOVQ ys+24(FP), DX 38246 MOVQ incy+32(FP), BX 38247 MOVQ BX, SI 38248 SHLQ $0x05, SI 38249 MOVQ n+40(FP), SI 38250 JMP check_limit_unroll 38251 PCALIGN $0x08 38252 38253 loop_unroll: 38254 MOVSS (AX), X1 38255 LEAQ (AX)(CX*4), AX 38256 MOVSS (AX), X2 38257 LEAQ (AX)(CX*4), AX 38258 MOVSS (AX), X3 38259 LEAQ (AX)(CX*4), AX 38260 MOVSS (AX), X4 38261 LEAQ (AX)(CX*4), AX 38262 MOVSS (AX), X5 38263 LEAQ (AX)(CX*4), AX 38264 MOVSS (AX), X6 38265 LEAQ (AX)(CX*4), AX 38266 MOVSS (AX), X7 38267 LEAQ (AX)(CX*4), AX 38268 MOVSS (AX), X8 38269 LEAQ (AX)(CX*4), AX 38270 MULSS X0, X1 38271 MULSS X0, X2 38272 MULSS X0, X3 38273 MULSS X0, X4 38274 MULSS X0, X5 38275 MULSS X0, X6 38276 MULSS X0, X7 38277 MULSS X0, X8 38278 ADDSS (DX), X1 38279 MOVSS X1, (DX) 38280 LEAQ (DX)(BX*4), DX 38281 ADDSS (DX), X2 38282 MOVSS X2, (DX) 38283 LEAQ (DX)(BX*4), DX 38284 ADDSS (DX), X3 38285 MOVSS X3, (DX) 38286 LEAQ (DX)(BX*4), DX 38287 ADDSS (DX), X4 38288 MOVSS X4, (DX) 38289 LEAQ (DX)(BX*4), DX 38290 ADDSS (DX), X5 38291 MOVSS X5, (DX) 38292 LEAQ (DX)(BX*4), DX 38293 ADDSS (DX), X6 38294 MOVSS X6, (DX) 38295 LEAQ (DX)(BX*4), DX 38296 ADDSS (DX), X7 38297 MOVSS X7, (DX) 38298 LEAQ (DX)(BX*4), DX 38299 ADDSS (DX), X8 38300 MOVSS X8, (DX) 38301 LEAQ (DX)(BX*4), DX 38302 SUBQ $0x08, SI 38303 38304 check_limit_unroll: 38305 CMPQ SI, $0x08 38306 JHS loop_unroll 38307 JMP check_limit 38308 38309 loop: 38310 MOVSS (AX), X1 38311 MULSS X0, X1 38312 ADDSS (DX), X1 38313 MOVSS X1, (DX) 38314 DECQ SI 38315 LEAQ (AX)(CX*4), AX 38316 LEAQ (DX)(BX*4), DX 38317 38318 check_limit: 38319 CMPQ SI, $0x00 38320 JHI loop 38321 RET 38322 38323 // func AmdAxpyPointerLoopXInterleave_V1A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 38324 // Requires: SSE 38325 TEXT ·AmdAxpyPointerLoopXInterleave_V1A8U8(SB), NOSPLIT, $0-48 38326 MOVSS alpha+0(FP), X0 38327 MOVQ xs+8(FP), AX 38328 MOVQ incx+16(FP), CX 38329 MOVQ CX, DX 38330 SHLQ $0x05, DX 38331 MOVQ ys+24(FP), DX 38332 MOVQ incy+32(FP), BX 38333 MOVQ BX, SI 38334 SHLQ $0x05, SI 38335 MOVQ n+40(FP), SI 38336 JMP check_limit_unroll 38337 PCALIGN $0x08 38338 38339 loop_unroll: 38340 MOVSS (AX), X1 38341 LEAQ (AX)(CX*4), AX 38342 MOVSS (AX), X2 38343 LEAQ (AX)(CX*4), AX 38344 MOVSS (AX), X3 38345 LEAQ (AX)(CX*4), AX 38346 MOVSS (AX), X4 38347 LEAQ (AX)(CX*4), AX 38348 MOVSS (AX), X5 38349 LEAQ (AX)(CX*4), AX 38350 MOVSS (AX), X6 38351 LEAQ (AX)(CX*4), AX 38352 MOVSS (AX), X7 38353 LEAQ (AX)(CX*4), AX 38354 MOVSS (AX), X8 38355 LEAQ (AX)(CX*4), AX 38356 MULSS X0, X1 38357 MULSS X0, X2 38358 MULSS X0, X3 38359 MULSS X0, X4 38360 MULSS X0, X5 38361 MULSS X0, X6 38362 MULSS X0, X7 38363 MULSS X0, X8 38364 ADDSS (DX), X1 38365 MOVSS X1, (DX) 38366 LEAQ (DX)(BX*4), DX 38367 ADDSS (DX), X2 38368 MOVSS X2, (DX) 38369 LEAQ (DX)(BX*4), DX 38370 ADDSS (DX), X3 38371 MOVSS X3, (DX) 38372 LEAQ (DX)(BX*4), DX 
38373 ADDSS (DX), X4 38374 MOVSS X4, (DX) 38375 LEAQ (DX)(BX*4), DX 38376 ADDSS (DX), X5 38377 MOVSS X5, (DX) 38378 LEAQ (DX)(BX*4), DX 38379 ADDSS (DX), X6 38380 MOVSS X6, (DX) 38381 LEAQ (DX)(BX*4), DX 38382 ADDSS (DX), X7 38383 MOVSS X7, (DX) 38384 LEAQ (DX)(BX*4), DX 38385 ADDSS (DX), X8 38386 MOVSS X8, (DX) 38387 LEAQ (DX)(BX*4), DX 38388 SUBQ $0x08, SI 38389 38390 check_limit_unroll: 38391 CMPQ SI, $0x08 38392 JHS loop_unroll 38393 JMP check_limit 38394 38395 loop: 38396 MOVSS (AX), X1 38397 MULSS X0, X1 38398 ADDSS (DX), X1 38399 MOVSS X1, (DX) 38400 DECQ SI 38401 LEAQ (AX)(CX*4), AX 38402 LEAQ (DX)(BX*4), DX 38403 38404 check_limit: 38405 CMPQ SI, $0x00 38406 JHI loop 38407 RET 38408 38409 // func AmdAxpyPointerLoopXInterleave_V2A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 38410 // Requires: SSE 38411 TEXT ·AmdAxpyPointerLoopXInterleave_V2A8U8(SB), NOSPLIT, $0-48 38412 MOVSS alpha+0(FP), X0 38413 MOVQ xs+8(FP), AX 38414 MOVQ incx+16(FP), CX 38415 MOVQ CX, DX 38416 SHLQ $0x05, DX 38417 MOVQ ys+24(FP), DX 38418 MOVQ incy+32(FP), BX 38419 MOVQ BX, SI 38420 SHLQ $0x05, SI 38421 MOVQ n+40(FP), SI 38422 JMP check_limit_unroll 38423 PCALIGN $0x08 38424 38425 loop_unroll: 38426 MOVSS (AX), X1 38427 LEAQ (AX)(CX*4), AX 38428 MOVSS (AX), X2 38429 LEAQ (AX)(CX*4), AX 38430 MOVSS (AX), X3 38431 LEAQ (AX)(CX*4), AX 38432 MOVSS (AX), X4 38433 LEAQ (AX)(CX*4), AX 38434 MOVSS (AX), X5 38435 LEAQ (AX)(CX*4), AX 38436 MOVSS (AX), X6 38437 LEAQ (AX)(CX*4), AX 38438 MOVSS (AX), X7 38439 LEAQ (AX)(CX*4), AX 38440 MOVSS (AX), X8 38441 LEAQ (AX)(CX*4), AX 38442 MULSS X0, X1 38443 MULSS X0, X2 38444 MULSS X0, X3 38445 MULSS X0, X4 38446 MULSS X0, X5 38447 MULSS X0, X6 38448 MULSS X0, X7 38449 MULSS X0, X8 38450 ADDSS (DX), X1 38451 MOVSS X1, (DX) 38452 LEAQ (DX)(BX*4), DX 38453 ADDSS (DX), X2 38454 MOVSS X2, (DX) 38455 LEAQ (DX)(BX*4), DX 38456 ADDSS (DX), X3 38457 MOVSS X3, (DX) 38458 LEAQ (DX)(BX*4), DX 38459 ADDSS (DX), X4 38460 MOVSS X4, (DX) 38461 LEAQ (DX)(BX*4), DX 38462 ADDSS (DX), X5 38463 MOVSS X5, (DX) 38464 LEAQ (DX)(BX*4), DX 38465 ADDSS (DX), X6 38466 MOVSS X6, (DX) 38467 LEAQ (DX)(BX*4), DX 38468 ADDSS (DX), X7 38469 MOVSS X7, (DX) 38470 LEAQ (DX)(BX*4), DX 38471 ADDSS (DX), X8 38472 MOVSS X8, (DX) 38473 LEAQ (DX)(BX*4), DX 38474 SUBQ $0x08, SI 38475 38476 check_limit_unroll: 38477 CMPQ SI, $0x08 38478 JHS loop_unroll 38479 JMP check_limit 38480 38481 loop: 38482 MOVSS (AX), X1 38483 MULSS X0, X1 38484 ADDSS (DX), X1 38485 MOVSS X1, (DX) 38486 DECQ SI 38487 LEAQ (AX)(CX*4), AX 38488 LEAQ (DX)(BX*4), DX 38489 38490 check_limit: 38491 CMPQ SI, $0x00 38492 JHI loop 38493 RET 38494 38495 // func AmdAxpyPointerLoopXInterleave_V3A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 38496 // Requires: SSE 38497 TEXT ·AmdAxpyPointerLoopXInterleave_V3A8U8(SB), NOSPLIT, $0-48 38498 MOVSS alpha+0(FP), X0 38499 MOVQ xs+8(FP), AX 38500 MOVQ incx+16(FP), CX 38501 MOVQ CX, DX 38502 SHLQ $0x05, DX 38503 MOVQ ys+24(FP), DX 38504 MOVQ incy+32(FP), BX 38505 MOVQ BX, SI 38506 SHLQ $0x05, SI 38507 MOVQ n+40(FP), SI 38508 JMP check_limit_unroll 38509 PCALIGN $0x08 38510 38511 loop_unroll: 38512 MOVSS (AX), X1 38513 LEAQ (AX)(CX*4), AX 38514 MOVSS (AX), X2 38515 LEAQ (AX)(CX*4), AX 38516 MOVSS (AX), X3 38517 LEAQ (AX)(CX*4), AX 38518 MOVSS (AX), X4 38519 LEAQ (AX)(CX*4), AX 38520 MOVSS (AX), X5 38521 LEAQ (AX)(CX*4), AX 38522 MOVSS (AX), X6 38523 LEAQ (AX)(CX*4), AX 38524 MOVSS (AX), X7 38525 LEAQ (AX)(CX*4), AX 38526 MOVSS (AX), X8 38527 LEAQ 
(AX)(CX*4), AX 38528 MULSS X0, X1 38529 MULSS X0, X2 38530 MULSS X0, X3 38531 MULSS X0, X4 38532 MULSS X0, X5 38533 MULSS X0, X6 38534 MULSS X0, X7 38535 MULSS X0, X8 38536 ADDSS (DX), X1 38537 MOVSS X1, (DX) 38538 LEAQ (DX)(BX*4), DX 38539 ADDSS (DX), X2 38540 MOVSS X2, (DX) 38541 LEAQ (DX)(BX*4), DX 38542 ADDSS (DX), X3 38543 MOVSS X3, (DX) 38544 LEAQ (DX)(BX*4), DX 38545 ADDSS (DX), X4 38546 MOVSS X4, (DX) 38547 LEAQ (DX)(BX*4), DX 38548 ADDSS (DX), X5 38549 MOVSS X5, (DX) 38550 LEAQ (DX)(BX*4), DX 38551 ADDSS (DX), X6 38552 MOVSS X6, (DX) 38553 LEAQ (DX)(BX*4), DX 38554 ADDSS (DX), X7 38555 MOVSS X7, (DX) 38556 LEAQ (DX)(BX*4), DX 38557 ADDSS (DX), X8 38558 MOVSS X8, (DX) 38559 LEAQ (DX)(BX*4), DX 38560 SUBQ $0x08, SI 38561 38562 check_limit_unroll: 38563 CMPQ SI, $0x08 38564 JHS loop_unroll 38565 JMP check_limit 38566 38567 loop: 38568 MOVSS (AX), X1 38569 MULSS X0, X1 38570 ADDSS (DX), X1 38571 MOVSS X1, (DX) 38572 DECQ SI 38573 LEAQ (AX)(CX*4), AX 38574 LEAQ (DX)(BX*4), DX 38575 38576 check_limit: 38577 CMPQ SI, $0x00 38578 JHI loop 38579 RET 38580 38581 // func AmdAxpyPointerLoopXInterleave_V4A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 38582 // Requires: SSE 38583 TEXT ·AmdAxpyPointerLoopXInterleave_V4A8U8(SB), NOSPLIT, $0-48 38584 MOVSS alpha+0(FP), X0 38585 MOVQ xs+8(FP), AX 38586 MOVQ incx+16(FP), CX 38587 MOVQ CX, DX 38588 SHLQ $0x05, DX 38589 MOVQ ys+24(FP), DX 38590 MOVQ incy+32(FP), BX 38591 MOVQ BX, SI 38592 SHLQ $0x05, SI 38593 MOVQ n+40(FP), SI 38594 JMP check_limit_unroll 38595 PCALIGN $0x08 38596 38597 loop_unroll: 38598 MOVSS (AX), X1 38599 LEAQ (AX)(CX*4), AX 38600 MOVSS (AX), X2 38601 LEAQ (AX)(CX*4), AX 38602 MOVSS (AX), X3 38603 LEAQ (AX)(CX*4), AX 38604 MOVSS (AX), X4 38605 LEAQ (AX)(CX*4), AX 38606 MOVSS (AX), X5 38607 LEAQ (AX)(CX*4), AX 38608 MOVSS (AX), X6 38609 LEAQ (AX)(CX*4), AX 38610 MOVSS (AX), X7 38611 LEAQ (AX)(CX*4), AX 38612 MOVSS (AX), X8 38613 LEAQ (AX)(CX*4), AX 38614 MULSS X0, X1 38615 MULSS X0, X2 38616 MULSS X0, X3 38617 MULSS X0, X4 38618 MULSS X0, X5 38619 MULSS X0, X6 38620 MULSS X0, X7 38621 MULSS X0, X8 38622 ADDSS (DX), X1 38623 MOVSS X1, (DX) 38624 LEAQ (DX)(BX*4), DX 38625 ADDSS (DX), X2 38626 MOVSS X2, (DX) 38627 LEAQ (DX)(BX*4), DX 38628 ADDSS (DX), X3 38629 MOVSS X3, (DX) 38630 LEAQ (DX)(BX*4), DX 38631 ADDSS (DX), X4 38632 MOVSS X4, (DX) 38633 LEAQ (DX)(BX*4), DX 38634 ADDSS (DX), X5 38635 MOVSS X5, (DX) 38636 LEAQ (DX)(BX*4), DX 38637 ADDSS (DX), X6 38638 MOVSS X6, (DX) 38639 LEAQ (DX)(BX*4), DX 38640 ADDSS (DX), X7 38641 MOVSS X7, (DX) 38642 LEAQ (DX)(BX*4), DX 38643 ADDSS (DX), X8 38644 MOVSS X8, (DX) 38645 LEAQ (DX)(BX*4), DX 38646 SUBQ $0x08, SI 38647 38648 check_limit_unroll: 38649 CMPQ SI, $0x08 38650 JHS loop_unroll 38651 JMP check_limit 38652 38653 loop: 38654 MOVSS (AX), X1 38655 MULSS X0, X1 38656 ADDSS (DX), X1 38657 MOVSS X1, (DX) 38658 DECQ SI 38659 LEAQ (AX)(CX*4), AX 38660 LEAQ (DX)(BX*4), DX 38661 38662 check_limit: 38663 CMPQ SI, $0x00 38664 JHI loop 38665 RET 38666 38667 // func AmdAxpyPointerLoopXInterleave_V5A8U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 38668 // Requires: SSE 38669 TEXT ·AmdAxpyPointerLoopXInterleave_V5A8U8(SB), NOSPLIT, $0-48 38670 MOVSS alpha+0(FP), X0 38671 MOVQ xs+8(FP), AX 38672 MOVQ incx+16(FP), CX 38673 MOVQ CX, DX 38674 SHLQ $0x05, DX 38675 MOVQ ys+24(FP), DX 38676 MOVQ incy+32(FP), BX 38677 MOVQ BX, SI 38678 SHLQ $0x05, SI 38679 MOVQ n+40(FP), SI 38680 JMP check_limit_unroll 38681 PCALIGN $0x08 38682 38683 
loop_unroll: 38684 MOVSS (AX), X1 38685 LEAQ (AX)(CX*4), AX 38686 MOVSS (AX), X2 38687 LEAQ (AX)(CX*4), AX 38688 MOVSS (AX), X3 38689 LEAQ (AX)(CX*4), AX 38690 MOVSS (AX), X4 38691 LEAQ (AX)(CX*4), AX 38692 MOVSS (AX), X5 38693 LEAQ (AX)(CX*4), AX 38694 MOVSS (AX), X6 38695 LEAQ (AX)(CX*4), AX 38696 MOVSS (AX), X7 38697 LEAQ (AX)(CX*4), AX 38698 MOVSS (AX), X8 38699 LEAQ (AX)(CX*4), AX 38700 MULSS X0, X1 38701 MULSS X0, X2 38702 MULSS X0, X3 38703 MULSS X0, X4 38704 MULSS X0, X5 38705 MULSS X0, X6 38706 MULSS X0, X7 38707 MULSS X0, X8 38708 ADDSS (DX), X1 38709 MOVSS X1, (DX) 38710 LEAQ (DX)(BX*4), DX 38711 ADDSS (DX), X2 38712 MOVSS X2, (DX) 38713 LEAQ (DX)(BX*4), DX 38714 ADDSS (DX), X3 38715 MOVSS X3, (DX) 38716 LEAQ (DX)(BX*4), DX 38717 ADDSS (DX), X4 38718 MOVSS X4, (DX) 38719 LEAQ (DX)(BX*4), DX 38720 ADDSS (DX), X5 38721 MOVSS X5, (DX) 38722 LEAQ (DX)(BX*4), DX 38723 ADDSS (DX), X6 38724 MOVSS X6, (DX) 38725 LEAQ (DX)(BX*4), DX 38726 ADDSS (DX), X7 38727 MOVSS X7, (DX) 38728 LEAQ (DX)(BX*4), DX 38729 ADDSS (DX), X8 38730 MOVSS X8, (DX) 38731 LEAQ (DX)(BX*4), DX 38732 SUBQ $0x08, SI 38733 38734 check_limit_unroll: 38735 CMPQ SI, $0x08 38736 JHS loop_unroll 38737 JMP check_limit 38738 38739 loop: 38740 MOVSS (AX), X1 38741 MULSS X0, X1 38742 ADDSS (DX), X1 38743 MOVSS X1, (DX) 38744 DECQ SI 38745 LEAQ (AX)(CX*4), AX 38746 LEAQ (DX)(BX*4), DX 38747 38748 check_limit: 38749 CMPQ SI, $0x00 38750 JHI loop 38751 RET 38752 38753 // func AmdAxpyPointerLoopXInterleave_V0A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 38754 // Requires: SSE 38755 TEXT ·AmdAxpyPointerLoopXInterleave_V0A9U8(SB), NOSPLIT, $0-48 38756 MOVSS alpha+0(FP), X0 38757 MOVQ xs+8(FP), AX 38758 MOVQ incx+16(FP), CX 38759 MOVQ CX, DX 38760 SHLQ $0x05, DX 38761 MOVQ ys+24(FP), DX 38762 MOVQ incy+32(FP), BX 38763 MOVQ BX, SI 38764 SHLQ $0x05, SI 38765 MOVQ n+40(FP), SI 38766 JMP check_limit_unroll 38767 PCALIGN $0x08 38768 NOP 38769 38770 loop_unroll: 38771 MOVSS (AX), X1 38772 LEAQ (AX)(CX*4), AX 38773 MOVSS (AX), X2 38774 LEAQ (AX)(CX*4), AX 38775 MOVSS (AX), X3 38776 LEAQ (AX)(CX*4), AX 38777 MOVSS (AX), X4 38778 LEAQ (AX)(CX*4), AX 38779 MOVSS (AX), X5 38780 LEAQ (AX)(CX*4), AX 38781 MOVSS (AX), X6 38782 LEAQ (AX)(CX*4), AX 38783 MOVSS (AX), X7 38784 LEAQ (AX)(CX*4), AX 38785 MOVSS (AX), X8 38786 LEAQ (AX)(CX*4), AX 38787 MULSS X0, X1 38788 MULSS X0, X2 38789 MULSS X0, X3 38790 MULSS X0, X4 38791 MULSS X0, X5 38792 MULSS X0, X6 38793 MULSS X0, X7 38794 MULSS X0, X8 38795 ADDSS (DX), X1 38796 MOVSS X1, (DX) 38797 LEAQ (DX)(BX*4), DX 38798 ADDSS (DX), X2 38799 MOVSS X2, (DX) 38800 LEAQ (DX)(BX*4), DX 38801 ADDSS (DX), X3 38802 MOVSS X3, (DX) 38803 LEAQ (DX)(BX*4), DX 38804 ADDSS (DX), X4 38805 MOVSS X4, (DX) 38806 LEAQ (DX)(BX*4), DX 38807 ADDSS (DX), X5 38808 MOVSS X5, (DX) 38809 LEAQ (DX)(BX*4), DX 38810 ADDSS (DX), X6 38811 MOVSS X6, (DX) 38812 LEAQ (DX)(BX*4), DX 38813 ADDSS (DX), X7 38814 MOVSS X7, (DX) 38815 LEAQ (DX)(BX*4), DX 38816 ADDSS (DX), X8 38817 MOVSS X8, (DX) 38818 LEAQ (DX)(BX*4), DX 38819 SUBQ $0x08, SI 38820 38821 check_limit_unroll: 38822 CMPQ SI, $0x08 38823 JHS loop_unroll 38824 JMP check_limit 38825 38826 loop: 38827 MOVSS (AX), X1 38828 MULSS X0, X1 38829 ADDSS (DX), X1 38830 MOVSS X1, (DX) 38831 DECQ SI 38832 LEAQ (AX)(CX*4), AX 38833 LEAQ (DX)(BX*4), DX 38834 38835 check_limit: 38836 CMPQ SI, $0x00 38837 JHI loop 38838 RET 38839 38840 // func AmdAxpyPointerLoopXInterleave_V1A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 
38841 // Requires: SSE 38842 TEXT ·AmdAxpyPointerLoopXInterleave_V1A9U8(SB), NOSPLIT, $0-48 38843 MOVSS alpha+0(FP), X0 38844 MOVQ xs+8(FP), AX 38845 MOVQ incx+16(FP), CX 38846 MOVQ CX, DX 38847 SHLQ $0x05, DX 38848 MOVQ ys+24(FP), DX 38849 MOVQ incy+32(FP), BX 38850 MOVQ BX, SI 38851 SHLQ $0x05, SI 38852 MOVQ n+40(FP), SI 38853 JMP check_limit_unroll 38854 PCALIGN $0x08 38855 NOP 38856 38857 loop_unroll: 38858 MOVSS (AX), X1 38859 LEAQ (AX)(CX*4), AX 38860 MOVSS (AX), X2 38861 LEAQ (AX)(CX*4), AX 38862 MOVSS (AX), X3 38863 LEAQ (AX)(CX*4), AX 38864 MOVSS (AX), X4 38865 LEAQ (AX)(CX*4), AX 38866 MOVSS (AX), X5 38867 LEAQ (AX)(CX*4), AX 38868 MOVSS (AX), X6 38869 LEAQ (AX)(CX*4), AX 38870 MOVSS (AX), X7 38871 LEAQ (AX)(CX*4), AX 38872 MOVSS (AX), X8 38873 LEAQ (AX)(CX*4), AX 38874 MULSS X0, X1 38875 MULSS X0, X2 38876 MULSS X0, X3 38877 MULSS X0, X4 38878 MULSS X0, X5 38879 MULSS X0, X6 38880 MULSS X0, X7 38881 MULSS X0, X8 38882 ADDSS (DX), X1 38883 MOVSS X1, (DX) 38884 LEAQ (DX)(BX*4), DX 38885 ADDSS (DX), X2 38886 MOVSS X2, (DX) 38887 LEAQ (DX)(BX*4), DX 38888 ADDSS (DX), X3 38889 MOVSS X3, (DX) 38890 LEAQ (DX)(BX*4), DX 38891 ADDSS (DX), X4 38892 MOVSS X4, (DX) 38893 LEAQ (DX)(BX*4), DX 38894 ADDSS (DX), X5 38895 MOVSS X5, (DX) 38896 LEAQ (DX)(BX*4), DX 38897 ADDSS (DX), X6 38898 MOVSS X6, (DX) 38899 LEAQ (DX)(BX*4), DX 38900 ADDSS (DX), X7 38901 MOVSS X7, (DX) 38902 LEAQ (DX)(BX*4), DX 38903 ADDSS (DX), X8 38904 MOVSS X8, (DX) 38905 LEAQ (DX)(BX*4), DX 38906 SUBQ $0x08, SI 38907 38908 check_limit_unroll: 38909 CMPQ SI, $0x08 38910 JHS loop_unroll 38911 JMP check_limit 38912 38913 loop: 38914 MOVSS (AX), X1 38915 MULSS X0, X1 38916 ADDSS (DX), X1 38917 MOVSS X1, (DX) 38918 DECQ SI 38919 LEAQ (AX)(CX*4), AX 38920 LEAQ (DX)(BX*4), DX 38921 38922 check_limit: 38923 CMPQ SI, $0x00 38924 JHI loop 38925 RET 38926 38927 // func AmdAxpyPointerLoopXInterleave_V2A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 38928 // Requires: SSE 38929 TEXT ·AmdAxpyPointerLoopXInterleave_V2A9U8(SB), NOSPLIT, $0-48 38930 MOVSS alpha+0(FP), X0 38931 MOVQ xs+8(FP), AX 38932 MOVQ incx+16(FP), CX 38933 MOVQ CX, DX 38934 SHLQ $0x05, DX 38935 MOVQ ys+24(FP), DX 38936 MOVQ incy+32(FP), BX 38937 MOVQ BX, SI 38938 SHLQ $0x05, SI 38939 MOVQ n+40(FP), SI 38940 JMP check_limit_unroll 38941 PCALIGN $0x08 38942 NOP 38943 38944 loop_unroll: 38945 MOVSS (AX), X1 38946 LEAQ (AX)(CX*4), AX 38947 MOVSS (AX), X2 38948 LEAQ (AX)(CX*4), AX 38949 MOVSS (AX), X3 38950 LEAQ (AX)(CX*4), AX 38951 MOVSS (AX), X4 38952 LEAQ (AX)(CX*4), AX 38953 MOVSS (AX), X5 38954 LEAQ (AX)(CX*4), AX 38955 MOVSS (AX), X6 38956 LEAQ (AX)(CX*4), AX 38957 MOVSS (AX), X7 38958 LEAQ (AX)(CX*4), AX 38959 MOVSS (AX), X8 38960 LEAQ (AX)(CX*4), AX 38961 MULSS X0, X1 38962 MULSS X0, X2 38963 MULSS X0, X3 38964 MULSS X0, X4 38965 MULSS X0, X5 38966 MULSS X0, X6 38967 MULSS X0, X7 38968 MULSS X0, X8 38969 ADDSS (DX), X1 38970 MOVSS X1, (DX) 38971 LEAQ (DX)(BX*4), DX 38972 ADDSS (DX), X2 38973 MOVSS X2, (DX) 38974 LEAQ (DX)(BX*4), DX 38975 ADDSS (DX), X3 38976 MOVSS X3, (DX) 38977 LEAQ (DX)(BX*4), DX 38978 ADDSS (DX), X4 38979 MOVSS X4, (DX) 38980 LEAQ (DX)(BX*4), DX 38981 ADDSS (DX), X5 38982 MOVSS X5, (DX) 38983 LEAQ (DX)(BX*4), DX 38984 ADDSS (DX), X6 38985 MOVSS X6, (DX) 38986 LEAQ (DX)(BX*4), DX 38987 ADDSS (DX), X7 38988 MOVSS X7, (DX) 38989 LEAQ (DX)(BX*4), DX 38990 ADDSS (DX), X8 38991 MOVSS X8, (DX) 38992 LEAQ (DX)(BX*4), DX 38993 SUBQ $0x08, SI 38994 38995 check_limit_unroll: 38996 CMPQ SI, $0x08 38997 JHS loop_unroll 
38998 JMP check_limit 38999 39000 loop: 39001 MOVSS (AX), X1 39002 MULSS X0, X1 39003 ADDSS (DX), X1 39004 MOVSS X1, (DX) 39005 DECQ SI 39006 LEAQ (AX)(CX*4), AX 39007 LEAQ (DX)(BX*4), DX 39008 39009 check_limit: 39010 CMPQ SI, $0x00 39011 JHI loop 39012 RET 39013 39014 // func AmdAxpyPointerLoopXInterleave_V3A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39015 // Requires: SSE 39016 TEXT ·AmdAxpyPointerLoopXInterleave_V3A9U8(SB), NOSPLIT, $0-48 39017 MOVSS alpha+0(FP), X0 39018 MOVQ xs+8(FP), AX 39019 MOVQ incx+16(FP), CX 39020 MOVQ CX, DX 39021 SHLQ $0x05, DX 39022 MOVQ ys+24(FP), DX 39023 MOVQ incy+32(FP), BX 39024 MOVQ BX, SI 39025 SHLQ $0x05, SI 39026 MOVQ n+40(FP), SI 39027 JMP check_limit_unroll 39028 PCALIGN $0x08 39029 NOP 39030 39031 loop_unroll: 39032 MOVSS (AX), X1 39033 LEAQ (AX)(CX*4), AX 39034 MOVSS (AX), X2 39035 LEAQ (AX)(CX*4), AX 39036 MOVSS (AX), X3 39037 LEAQ (AX)(CX*4), AX 39038 MOVSS (AX), X4 39039 LEAQ (AX)(CX*4), AX 39040 MOVSS (AX), X5 39041 LEAQ (AX)(CX*4), AX 39042 MOVSS (AX), X6 39043 LEAQ (AX)(CX*4), AX 39044 MOVSS (AX), X7 39045 LEAQ (AX)(CX*4), AX 39046 MOVSS (AX), X8 39047 LEAQ (AX)(CX*4), AX 39048 MULSS X0, X1 39049 MULSS X0, X2 39050 MULSS X0, X3 39051 MULSS X0, X4 39052 MULSS X0, X5 39053 MULSS X0, X6 39054 MULSS X0, X7 39055 MULSS X0, X8 39056 ADDSS (DX), X1 39057 MOVSS X1, (DX) 39058 LEAQ (DX)(BX*4), DX 39059 ADDSS (DX), X2 39060 MOVSS X2, (DX) 39061 LEAQ (DX)(BX*4), DX 39062 ADDSS (DX), X3 39063 MOVSS X3, (DX) 39064 LEAQ (DX)(BX*4), DX 39065 ADDSS (DX), X4 39066 MOVSS X4, (DX) 39067 LEAQ (DX)(BX*4), DX 39068 ADDSS (DX), X5 39069 MOVSS X5, (DX) 39070 LEAQ (DX)(BX*4), DX 39071 ADDSS (DX), X6 39072 MOVSS X6, (DX) 39073 LEAQ (DX)(BX*4), DX 39074 ADDSS (DX), X7 39075 MOVSS X7, (DX) 39076 LEAQ (DX)(BX*4), DX 39077 ADDSS (DX), X8 39078 MOVSS X8, (DX) 39079 LEAQ (DX)(BX*4), DX 39080 SUBQ $0x08, SI 39081 39082 check_limit_unroll: 39083 CMPQ SI, $0x08 39084 JHS loop_unroll 39085 JMP check_limit 39086 39087 loop: 39088 MOVSS (AX), X1 39089 MULSS X0, X1 39090 ADDSS (DX), X1 39091 MOVSS X1, (DX) 39092 DECQ SI 39093 LEAQ (AX)(CX*4), AX 39094 LEAQ (DX)(BX*4), DX 39095 39096 check_limit: 39097 CMPQ SI, $0x00 39098 JHI loop 39099 RET 39100 39101 // func AmdAxpyPointerLoopXInterleave_V4A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39102 // Requires: SSE 39103 TEXT ·AmdAxpyPointerLoopXInterleave_V4A9U8(SB), NOSPLIT, $0-48 39104 MOVSS alpha+0(FP), X0 39105 MOVQ xs+8(FP), AX 39106 MOVQ incx+16(FP), CX 39107 MOVQ CX, DX 39108 SHLQ $0x05, DX 39109 MOVQ ys+24(FP), DX 39110 MOVQ incy+32(FP), BX 39111 MOVQ BX, SI 39112 SHLQ $0x05, SI 39113 MOVQ n+40(FP), SI 39114 JMP check_limit_unroll 39115 PCALIGN $0x08 39116 NOP 39117 39118 loop_unroll: 39119 MOVSS (AX), X1 39120 LEAQ (AX)(CX*4), AX 39121 MOVSS (AX), X2 39122 LEAQ (AX)(CX*4), AX 39123 MOVSS (AX), X3 39124 LEAQ (AX)(CX*4), AX 39125 MOVSS (AX), X4 39126 LEAQ (AX)(CX*4), AX 39127 MOVSS (AX), X5 39128 LEAQ (AX)(CX*4), AX 39129 MOVSS (AX), X6 39130 LEAQ (AX)(CX*4), AX 39131 MOVSS (AX), X7 39132 LEAQ (AX)(CX*4), AX 39133 MOVSS (AX), X8 39134 LEAQ (AX)(CX*4), AX 39135 MULSS X0, X1 39136 MULSS X0, X2 39137 MULSS X0, X3 39138 MULSS X0, X4 39139 MULSS X0, X5 39140 MULSS X0, X6 39141 MULSS X0, X7 39142 MULSS X0, X8 39143 ADDSS (DX), X1 39144 MOVSS X1, (DX) 39145 LEAQ (DX)(BX*4), DX 39146 ADDSS (DX), X2 39147 MOVSS X2, (DX) 39148 LEAQ (DX)(BX*4), DX 39149 ADDSS (DX), X3 39150 MOVSS X3, (DX) 39151 LEAQ (DX)(BX*4), DX 39152 ADDSS (DX), X4 39153 MOVSS X4, (DX) 
39154 LEAQ (DX)(BX*4), DX 39155 ADDSS (DX), X5 39156 MOVSS X5, (DX) 39157 LEAQ (DX)(BX*4), DX 39158 ADDSS (DX), X6 39159 MOVSS X6, (DX) 39160 LEAQ (DX)(BX*4), DX 39161 ADDSS (DX), X7 39162 MOVSS X7, (DX) 39163 LEAQ (DX)(BX*4), DX 39164 ADDSS (DX), X8 39165 MOVSS X8, (DX) 39166 LEAQ (DX)(BX*4), DX 39167 SUBQ $0x08, SI 39168 39169 check_limit_unroll: 39170 CMPQ SI, $0x08 39171 JHS loop_unroll 39172 JMP check_limit 39173 39174 loop: 39175 MOVSS (AX), X1 39176 MULSS X0, X1 39177 ADDSS (DX), X1 39178 MOVSS X1, (DX) 39179 DECQ SI 39180 LEAQ (AX)(CX*4), AX 39181 LEAQ (DX)(BX*4), DX 39182 39183 check_limit: 39184 CMPQ SI, $0x00 39185 JHI loop 39186 RET 39187 39188 // func AmdAxpyPointerLoopXInterleave_V5A9U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39189 // Requires: SSE 39190 TEXT ·AmdAxpyPointerLoopXInterleave_V5A9U8(SB), NOSPLIT, $0-48 39191 MOVSS alpha+0(FP), X0 39192 MOVQ xs+8(FP), AX 39193 MOVQ incx+16(FP), CX 39194 MOVQ CX, DX 39195 SHLQ $0x05, DX 39196 MOVQ ys+24(FP), DX 39197 MOVQ incy+32(FP), BX 39198 MOVQ BX, SI 39199 SHLQ $0x05, SI 39200 MOVQ n+40(FP), SI 39201 JMP check_limit_unroll 39202 PCALIGN $0x08 39203 NOP 39204 39205 loop_unroll: 39206 MOVSS (AX), X1 39207 LEAQ (AX)(CX*4), AX 39208 MOVSS (AX), X2 39209 LEAQ (AX)(CX*4), AX 39210 MOVSS (AX), X3 39211 LEAQ (AX)(CX*4), AX 39212 MOVSS (AX), X4 39213 LEAQ (AX)(CX*4), AX 39214 MOVSS (AX), X5 39215 LEAQ (AX)(CX*4), AX 39216 MOVSS (AX), X6 39217 LEAQ (AX)(CX*4), AX 39218 MOVSS (AX), X7 39219 LEAQ (AX)(CX*4), AX 39220 MOVSS (AX), X8 39221 LEAQ (AX)(CX*4), AX 39222 MULSS X0, X1 39223 MULSS X0, X2 39224 MULSS X0, X3 39225 MULSS X0, X4 39226 MULSS X0, X5 39227 MULSS X0, X6 39228 MULSS X0, X7 39229 MULSS X0, X8 39230 ADDSS (DX), X1 39231 MOVSS X1, (DX) 39232 LEAQ (DX)(BX*4), DX 39233 ADDSS (DX), X2 39234 MOVSS X2, (DX) 39235 LEAQ (DX)(BX*4), DX 39236 ADDSS (DX), X3 39237 MOVSS X3, (DX) 39238 LEAQ (DX)(BX*4), DX 39239 ADDSS (DX), X4 39240 MOVSS X4, (DX) 39241 LEAQ (DX)(BX*4), DX 39242 ADDSS (DX), X5 39243 MOVSS X5, (DX) 39244 LEAQ (DX)(BX*4), DX 39245 ADDSS (DX), X6 39246 MOVSS X6, (DX) 39247 LEAQ (DX)(BX*4), DX 39248 ADDSS (DX), X7 39249 MOVSS X7, (DX) 39250 LEAQ (DX)(BX*4), DX 39251 ADDSS (DX), X8 39252 MOVSS X8, (DX) 39253 LEAQ (DX)(BX*4), DX 39254 SUBQ $0x08, SI 39255 39256 check_limit_unroll: 39257 CMPQ SI, $0x08 39258 JHS loop_unroll 39259 JMP check_limit 39260 39261 loop: 39262 MOVSS (AX), X1 39263 MULSS X0, X1 39264 ADDSS (DX), X1 39265 MOVSS X1, (DX) 39266 DECQ SI 39267 LEAQ (AX)(CX*4), AX 39268 LEAQ (DX)(BX*4), DX 39269 39270 check_limit: 39271 CMPQ SI, $0x00 39272 JHI loop 39273 RET 39274 39275 // func AmdAxpyPointerLoopXInterleave_V0A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39276 // Requires: SSE 39277 TEXT ·AmdAxpyPointerLoopXInterleave_V0A10U8(SB), NOSPLIT, $0-48 39278 MOVSS alpha+0(FP), X0 39279 MOVQ xs+8(FP), AX 39280 MOVQ incx+16(FP), CX 39281 MOVQ CX, DX 39282 SHLQ $0x05, DX 39283 MOVQ ys+24(FP), DX 39284 MOVQ incy+32(FP), BX 39285 MOVQ BX, SI 39286 SHLQ $0x05, SI 39287 MOVQ n+40(FP), SI 39288 JMP check_limit_unroll 39289 PCALIGN $0x08 39290 NOP 39291 NOP 39292 39293 loop_unroll: 39294 MOVSS (AX), X1 39295 LEAQ (AX)(CX*4), AX 39296 MOVSS (AX), X2 39297 LEAQ (AX)(CX*4), AX 39298 MOVSS (AX), X3 39299 LEAQ (AX)(CX*4), AX 39300 MOVSS (AX), X4 39301 LEAQ (AX)(CX*4), AX 39302 MOVSS (AX), X5 39303 LEAQ (AX)(CX*4), AX 39304 MOVSS (AX), X6 39305 LEAQ (AX)(CX*4), AX 39306 MOVSS (AX), X7 39307 LEAQ (AX)(CX*4), AX 39308 MOVSS (AX), X8 39309 LEAQ 
(AX)(CX*4), AX 39310 MULSS X0, X1 39311 MULSS X0, X2 39312 MULSS X0, X3 39313 MULSS X0, X4 39314 MULSS X0, X5 39315 MULSS X0, X6 39316 MULSS X0, X7 39317 MULSS X0, X8 39318 ADDSS (DX), X1 39319 MOVSS X1, (DX) 39320 LEAQ (DX)(BX*4), DX 39321 ADDSS (DX), X2 39322 MOVSS X2, (DX) 39323 LEAQ (DX)(BX*4), DX 39324 ADDSS (DX), X3 39325 MOVSS X3, (DX) 39326 LEAQ (DX)(BX*4), DX 39327 ADDSS (DX), X4 39328 MOVSS X4, (DX) 39329 LEAQ (DX)(BX*4), DX 39330 ADDSS (DX), X5 39331 MOVSS X5, (DX) 39332 LEAQ (DX)(BX*4), DX 39333 ADDSS (DX), X6 39334 MOVSS X6, (DX) 39335 LEAQ (DX)(BX*4), DX 39336 ADDSS (DX), X7 39337 MOVSS X7, (DX) 39338 LEAQ (DX)(BX*4), DX 39339 ADDSS (DX), X8 39340 MOVSS X8, (DX) 39341 LEAQ (DX)(BX*4), DX 39342 SUBQ $0x08, SI 39343 39344 check_limit_unroll: 39345 CMPQ SI, $0x08 39346 JHS loop_unroll 39347 JMP check_limit 39348 39349 loop: 39350 MOVSS (AX), X1 39351 MULSS X0, X1 39352 ADDSS (DX), X1 39353 MOVSS X1, (DX) 39354 DECQ SI 39355 LEAQ (AX)(CX*4), AX 39356 LEAQ (DX)(BX*4), DX 39357 39358 check_limit: 39359 CMPQ SI, $0x00 39360 JHI loop 39361 RET 39362 39363 // func AmdAxpyPointerLoopXInterleave_V1A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39364 // Requires: SSE 39365 TEXT ·AmdAxpyPointerLoopXInterleave_V1A10U8(SB), NOSPLIT, $0-48 39366 MOVSS alpha+0(FP), X0 39367 MOVQ xs+8(FP), AX 39368 MOVQ incx+16(FP), CX 39369 MOVQ CX, DX 39370 SHLQ $0x05, DX 39371 MOVQ ys+24(FP), DX 39372 MOVQ incy+32(FP), BX 39373 MOVQ BX, SI 39374 SHLQ $0x05, SI 39375 MOVQ n+40(FP), SI 39376 JMP check_limit_unroll 39377 PCALIGN $0x08 39378 NOP 39379 NOP 39380 39381 loop_unroll: 39382 MOVSS (AX), X1 39383 LEAQ (AX)(CX*4), AX 39384 MOVSS (AX), X2 39385 LEAQ (AX)(CX*4), AX 39386 MOVSS (AX), X3 39387 LEAQ (AX)(CX*4), AX 39388 MOVSS (AX), X4 39389 LEAQ (AX)(CX*4), AX 39390 MOVSS (AX), X5 39391 LEAQ (AX)(CX*4), AX 39392 MOVSS (AX), X6 39393 LEAQ (AX)(CX*4), AX 39394 MOVSS (AX), X7 39395 LEAQ (AX)(CX*4), AX 39396 MOVSS (AX), X8 39397 LEAQ (AX)(CX*4), AX 39398 MULSS X0, X1 39399 MULSS X0, X2 39400 MULSS X0, X3 39401 MULSS X0, X4 39402 MULSS X0, X5 39403 MULSS X0, X6 39404 MULSS X0, X7 39405 MULSS X0, X8 39406 ADDSS (DX), X1 39407 MOVSS X1, (DX) 39408 LEAQ (DX)(BX*4), DX 39409 ADDSS (DX), X2 39410 MOVSS X2, (DX) 39411 LEAQ (DX)(BX*4), DX 39412 ADDSS (DX), X3 39413 MOVSS X3, (DX) 39414 LEAQ (DX)(BX*4), DX 39415 ADDSS (DX), X4 39416 MOVSS X4, (DX) 39417 LEAQ (DX)(BX*4), DX 39418 ADDSS (DX), X5 39419 MOVSS X5, (DX) 39420 LEAQ (DX)(BX*4), DX 39421 ADDSS (DX), X6 39422 MOVSS X6, (DX) 39423 LEAQ (DX)(BX*4), DX 39424 ADDSS (DX), X7 39425 MOVSS X7, (DX) 39426 LEAQ (DX)(BX*4), DX 39427 ADDSS (DX), X8 39428 MOVSS X8, (DX) 39429 LEAQ (DX)(BX*4), DX 39430 SUBQ $0x08, SI 39431 39432 check_limit_unroll: 39433 CMPQ SI, $0x08 39434 JHS loop_unroll 39435 JMP check_limit 39436 39437 loop: 39438 MOVSS (AX), X1 39439 MULSS X0, X1 39440 ADDSS (DX), X1 39441 MOVSS X1, (DX) 39442 DECQ SI 39443 LEAQ (AX)(CX*4), AX 39444 LEAQ (DX)(BX*4), DX 39445 39446 check_limit: 39447 CMPQ SI, $0x00 39448 JHI loop 39449 RET 39450 39451 // func AmdAxpyPointerLoopXInterleave_V2A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39452 // Requires: SSE 39453 TEXT ·AmdAxpyPointerLoopXInterleave_V2A10U8(SB), NOSPLIT, $0-48 39454 MOVSS alpha+0(FP), X0 39455 MOVQ xs+8(FP), AX 39456 MOVQ incx+16(FP), CX 39457 MOVQ CX, DX 39458 SHLQ $0x05, DX 39459 MOVQ ys+24(FP), DX 39460 MOVQ incy+32(FP), BX 39461 MOVQ BX, SI 39462 SHLQ $0x05, SI 39463 MOVQ n+40(FP), SI 39464 JMP check_limit_unroll 39465 
PCALIGN $0x08 39466 NOP 39467 NOP 39468 39469 loop_unroll: 39470 MOVSS (AX), X1 39471 LEAQ (AX)(CX*4), AX 39472 MOVSS (AX), X2 39473 LEAQ (AX)(CX*4), AX 39474 MOVSS (AX), X3 39475 LEAQ (AX)(CX*4), AX 39476 MOVSS (AX), X4 39477 LEAQ (AX)(CX*4), AX 39478 MOVSS (AX), X5 39479 LEAQ (AX)(CX*4), AX 39480 MOVSS (AX), X6 39481 LEAQ (AX)(CX*4), AX 39482 MOVSS (AX), X7 39483 LEAQ (AX)(CX*4), AX 39484 MOVSS (AX), X8 39485 LEAQ (AX)(CX*4), AX 39486 MULSS X0, X1 39487 MULSS X0, X2 39488 MULSS X0, X3 39489 MULSS X0, X4 39490 MULSS X0, X5 39491 MULSS X0, X6 39492 MULSS X0, X7 39493 MULSS X0, X8 39494 ADDSS (DX), X1 39495 MOVSS X1, (DX) 39496 LEAQ (DX)(BX*4), DX 39497 ADDSS (DX), X2 39498 MOVSS X2, (DX) 39499 LEAQ (DX)(BX*4), DX 39500 ADDSS (DX), X3 39501 MOVSS X3, (DX) 39502 LEAQ (DX)(BX*4), DX 39503 ADDSS (DX), X4 39504 MOVSS X4, (DX) 39505 LEAQ (DX)(BX*4), DX 39506 ADDSS (DX), X5 39507 MOVSS X5, (DX) 39508 LEAQ (DX)(BX*4), DX 39509 ADDSS (DX), X6 39510 MOVSS X6, (DX) 39511 LEAQ (DX)(BX*4), DX 39512 ADDSS (DX), X7 39513 MOVSS X7, (DX) 39514 LEAQ (DX)(BX*4), DX 39515 ADDSS (DX), X8 39516 MOVSS X8, (DX) 39517 LEAQ (DX)(BX*4), DX 39518 SUBQ $0x08, SI 39519 39520 check_limit_unroll: 39521 CMPQ SI, $0x08 39522 JHS loop_unroll 39523 JMP check_limit 39524 39525 loop: 39526 MOVSS (AX), X1 39527 MULSS X0, X1 39528 ADDSS (DX), X1 39529 MOVSS X1, (DX) 39530 DECQ SI 39531 LEAQ (AX)(CX*4), AX 39532 LEAQ (DX)(BX*4), DX 39533 39534 check_limit: 39535 CMPQ SI, $0x00 39536 JHI loop 39537 RET 39538 39539 // func AmdAxpyPointerLoopXInterleave_V3A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39540 // Requires: SSE 39541 TEXT ·AmdAxpyPointerLoopXInterleave_V3A10U8(SB), NOSPLIT, $0-48 39542 MOVSS alpha+0(FP), X0 39543 MOVQ xs+8(FP), AX 39544 MOVQ incx+16(FP), CX 39545 MOVQ CX, DX 39546 SHLQ $0x05, DX 39547 MOVQ ys+24(FP), DX 39548 MOVQ incy+32(FP), BX 39549 MOVQ BX, SI 39550 SHLQ $0x05, SI 39551 MOVQ n+40(FP), SI 39552 JMP check_limit_unroll 39553 PCALIGN $0x08 39554 NOP 39555 NOP 39556 39557 loop_unroll: 39558 MOVSS (AX), X1 39559 LEAQ (AX)(CX*4), AX 39560 MOVSS (AX), X2 39561 LEAQ (AX)(CX*4), AX 39562 MOVSS (AX), X3 39563 LEAQ (AX)(CX*4), AX 39564 MOVSS (AX), X4 39565 LEAQ (AX)(CX*4), AX 39566 MOVSS (AX), X5 39567 LEAQ (AX)(CX*4), AX 39568 MOVSS (AX), X6 39569 LEAQ (AX)(CX*4), AX 39570 MOVSS (AX), X7 39571 LEAQ (AX)(CX*4), AX 39572 MOVSS (AX), X8 39573 LEAQ (AX)(CX*4), AX 39574 MULSS X0, X1 39575 MULSS X0, X2 39576 MULSS X0, X3 39577 MULSS X0, X4 39578 MULSS X0, X5 39579 MULSS X0, X6 39580 MULSS X0, X7 39581 MULSS X0, X8 39582 ADDSS (DX), X1 39583 MOVSS X1, (DX) 39584 LEAQ (DX)(BX*4), DX 39585 ADDSS (DX), X2 39586 MOVSS X2, (DX) 39587 LEAQ (DX)(BX*4), DX 39588 ADDSS (DX), X3 39589 MOVSS X3, (DX) 39590 LEAQ (DX)(BX*4), DX 39591 ADDSS (DX), X4 39592 MOVSS X4, (DX) 39593 LEAQ (DX)(BX*4), DX 39594 ADDSS (DX), X5 39595 MOVSS X5, (DX) 39596 LEAQ (DX)(BX*4), DX 39597 ADDSS (DX), X6 39598 MOVSS X6, (DX) 39599 LEAQ (DX)(BX*4), DX 39600 ADDSS (DX), X7 39601 MOVSS X7, (DX) 39602 LEAQ (DX)(BX*4), DX 39603 ADDSS (DX), X8 39604 MOVSS X8, (DX) 39605 LEAQ (DX)(BX*4), DX 39606 SUBQ $0x08, SI 39607 39608 check_limit_unroll: 39609 CMPQ SI, $0x08 39610 JHS loop_unroll 39611 JMP check_limit 39612 39613 loop: 39614 MOVSS (AX), X1 39615 MULSS X0, X1 39616 ADDSS (DX), X1 39617 MOVSS X1, (DX) 39618 DECQ SI 39619 LEAQ (AX)(CX*4), AX 39620 LEAQ (DX)(BX*4), DX 39621 39622 check_limit: 39623 CMPQ SI, $0x00 39624 JHI loop 39625 RET 39626 39627 // func AmdAxpyPointerLoopXInterleave_V4A10U8(alpha float32, xs 
*float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39628 // Requires: SSE 39629 TEXT ·AmdAxpyPointerLoopXInterleave_V4A10U8(SB), NOSPLIT, $0-48 39630 MOVSS alpha+0(FP), X0 39631 MOVQ xs+8(FP), AX 39632 MOVQ incx+16(FP), CX 39633 MOVQ CX, DX 39634 SHLQ $0x05, DX 39635 MOVQ ys+24(FP), DX 39636 MOVQ incy+32(FP), BX 39637 MOVQ BX, SI 39638 SHLQ $0x05, SI 39639 MOVQ n+40(FP), SI 39640 JMP check_limit_unroll 39641 PCALIGN $0x08 39642 NOP 39643 NOP 39644 39645 loop_unroll: 39646 MOVSS (AX), X1 39647 LEAQ (AX)(CX*4), AX 39648 MOVSS (AX), X2 39649 LEAQ (AX)(CX*4), AX 39650 MOVSS (AX), X3 39651 LEAQ (AX)(CX*4), AX 39652 MOVSS (AX), X4 39653 LEAQ (AX)(CX*4), AX 39654 MOVSS (AX), X5 39655 LEAQ (AX)(CX*4), AX 39656 MOVSS (AX), X6 39657 LEAQ (AX)(CX*4), AX 39658 MOVSS (AX), X7 39659 LEAQ (AX)(CX*4), AX 39660 MOVSS (AX), X8 39661 LEAQ (AX)(CX*4), AX 39662 MULSS X0, X1 39663 MULSS X0, X2 39664 MULSS X0, X3 39665 MULSS X0, X4 39666 MULSS X0, X5 39667 MULSS X0, X6 39668 MULSS X0, X7 39669 MULSS X0, X8 39670 ADDSS (DX), X1 39671 MOVSS X1, (DX) 39672 LEAQ (DX)(BX*4), DX 39673 ADDSS (DX), X2 39674 MOVSS X2, (DX) 39675 LEAQ (DX)(BX*4), DX 39676 ADDSS (DX), X3 39677 MOVSS X3, (DX) 39678 LEAQ (DX)(BX*4), DX 39679 ADDSS (DX), X4 39680 MOVSS X4, (DX) 39681 LEAQ (DX)(BX*4), DX 39682 ADDSS (DX), X5 39683 MOVSS X5, (DX) 39684 LEAQ (DX)(BX*4), DX 39685 ADDSS (DX), X6 39686 MOVSS X6, (DX) 39687 LEAQ (DX)(BX*4), DX 39688 ADDSS (DX), X7 39689 MOVSS X7, (DX) 39690 LEAQ (DX)(BX*4), DX 39691 ADDSS (DX), X8 39692 MOVSS X8, (DX) 39693 LEAQ (DX)(BX*4), DX 39694 SUBQ $0x08, SI 39695 39696 check_limit_unroll: 39697 CMPQ SI, $0x08 39698 JHS loop_unroll 39699 JMP check_limit 39700 39701 loop: 39702 MOVSS (AX), X1 39703 MULSS X0, X1 39704 ADDSS (DX), X1 39705 MOVSS X1, (DX) 39706 DECQ SI 39707 LEAQ (AX)(CX*4), AX 39708 LEAQ (DX)(BX*4), DX 39709 39710 check_limit: 39711 CMPQ SI, $0x00 39712 JHI loop 39713 RET 39714 39715 // func AmdAxpyPointerLoopXInterleave_V5A10U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39716 // Requires: SSE 39717 TEXT ·AmdAxpyPointerLoopXInterleave_V5A10U8(SB), NOSPLIT, $0-48 39718 MOVSS alpha+0(FP), X0 39719 MOVQ xs+8(FP), AX 39720 MOVQ incx+16(FP), CX 39721 MOVQ CX, DX 39722 SHLQ $0x05, DX 39723 MOVQ ys+24(FP), DX 39724 MOVQ incy+32(FP), BX 39725 MOVQ BX, SI 39726 SHLQ $0x05, SI 39727 MOVQ n+40(FP), SI 39728 JMP check_limit_unroll 39729 PCALIGN $0x08 39730 NOP 39731 NOP 39732 39733 loop_unroll: 39734 MOVSS (AX), X1 39735 LEAQ (AX)(CX*4), AX 39736 MOVSS (AX), X2 39737 LEAQ (AX)(CX*4), AX 39738 MOVSS (AX), X3 39739 LEAQ (AX)(CX*4), AX 39740 MOVSS (AX), X4 39741 LEAQ (AX)(CX*4), AX 39742 MOVSS (AX), X5 39743 LEAQ (AX)(CX*4), AX 39744 MOVSS (AX), X6 39745 LEAQ (AX)(CX*4), AX 39746 MOVSS (AX), X7 39747 LEAQ (AX)(CX*4), AX 39748 MOVSS (AX), X8 39749 LEAQ (AX)(CX*4), AX 39750 MULSS X0, X1 39751 MULSS X0, X2 39752 MULSS X0, X3 39753 MULSS X0, X4 39754 MULSS X0, X5 39755 MULSS X0, X6 39756 MULSS X0, X7 39757 MULSS X0, X8 39758 ADDSS (DX), X1 39759 MOVSS X1, (DX) 39760 LEAQ (DX)(BX*4), DX 39761 ADDSS (DX), X2 39762 MOVSS X2, (DX) 39763 LEAQ (DX)(BX*4), DX 39764 ADDSS (DX), X3 39765 MOVSS X3, (DX) 39766 LEAQ (DX)(BX*4), DX 39767 ADDSS (DX), X4 39768 MOVSS X4, (DX) 39769 LEAQ (DX)(BX*4), DX 39770 ADDSS (DX), X5 39771 MOVSS X5, (DX) 39772 LEAQ (DX)(BX*4), DX 39773 ADDSS (DX), X6 39774 MOVSS X6, (DX) 39775 LEAQ (DX)(BX*4), DX 39776 ADDSS (DX), X7 39777 MOVSS X7, (DX) 39778 LEAQ (DX)(BX*4), DX 39779 ADDSS (DX), X8 39780 MOVSS X8, (DX) 39781 LEAQ (DX)(BX*4), DX 39782 SUBQ 
$0x08, SI 39783 39784 check_limit_unroll: 39785 CMPQ SI, $0x08 39786 JHS loop_unroll 39787 JMP check_limit 39788 39789 loop: 39790 MOVSS (AX), X1 39791 MULSS X0, X1 39792 ADDSS (DX), X1 39793 MOVSS X1, (DX) 39794 DECQ SI 39795 LEAQ (AX)(CX*4), AX 39796 LEAQ (DX)(BX*4), DX 39797 39798 check_limit: 39799 CMPQ SI, $0x00 39800 JHI loop 39801 RET 39802 39803 // func AmdAxpyPointerLoopXInterleave_V0A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39804 // Requires: SSE 39805 TEXT ·AmdAxpyPointerLoopXInterleave_V0A11U8(SB), NOSPLIT, $0-48 39806 MOVSS alpha+0(FP), X0 39807 MOVQ xs+8(FP), AX 39808 MOVQ incx+16(FP), CX 39809 MOVQ CX, DX 39810 SHLQ $0x05, DX 39811 MOVQ ys+24(FP), DX 39812 MOVQ incy+32(FP), BX 39813 MOVQ BX, SI 39814 SHLQ $0x05, SI 39815 MOVQ n+40(FP), SI 39816 JMP check_limit_unroll 39817 PCALIGN $0x08 39818 NOP 39819 NOP 39820 NOP 39821 39822 loop_unroll: 39823 MOVSS (AX), X1 39824 LEAQ (AX)(CX*4), AX 39825 MOVSS (AX), X2 39826 LEAQ (AX)(CX*4), AX 39827 MOVSS (AX), X3 39828 LEAQ (AX)(CX*4), AX 39829 MOVSS (AX), X4 39830 LEAQ (AX)(CX*4), AX 39831 MOVSS (AX), X5 39832 LEAQ (AX)(CX*4), AX 39833 MOVSS (AX), X6 39834 LEAQ (AX)(CX*4), AX 39835 MOVSS (AX), X7 39836 LEAQ (AX)(CX*4), AX 39837 MOVSS (AX), X8 39838 LEAQ (AX)(CX*4), AX 39839 MULSS X0, X1 39840 MULSS X0, X2 39841 MULSS X0, X3 39842 MULSS X0, X4 39843 MULSS X0, X5 39844 MULSS X0, X6 39845 MULSS X0, X7 39846 MULSS X0, X8 39847 ADDSS (DX), X1 39848 MOVSS X1, (DX) 39849 LEAQ (DX)(BX*4), DX 39850 ADDSS (DX), X2 39851 MOVSS X2, (DX) 39852 LEAQ (DX)(BX*4), DX 39853 ADDSS (DX), X3 39854 MOVSS X3, (DX) 39855 LEAQ (DX)(BX*4), DX 39856 ADDSS (DX), X4 39857 MOVSS X4, (DX) 39858 LEAQ (DX)(BX*4), DX 39859 ADDSS (DX), X5 39860 MOVSS X5, (DX) 39861 LEAQ (DX)(BX*4), DX 39862 ADDSS (DX), X6 39863 MOVSS X6, (DX) 39864 LEAQ (DX)(BX*4), DX 39865 ADDSS (DX), X7 39866 MOVSS X7, (DX) 39867 LEAQ (DX)(BX*4), DX 39868 ADDSS (DX), X8 39869 MOVSS X8, (DX) 39870 LEAQ (DX)(BX*4), DX 39871 SUBQ $0x08, SI 39872 39873 check_limit_unroll: 39874 CMPQ SI, $0x08 39875 JHS loop_unroll 39876 JMP check_limit 39877 39878 loop: 39879 MOVSS (AX), X1 39880 MULSS X0, X1 39881 ADDSS (DX), X1 39882 MOVSS X1, (DX) 39883 DECQ SI 39884 LEAQ (AX)(CX*4), AX 39885 LEAQ (DX)(BX*4), DX 39886 39887 check_limit: 39888 CMPQ SI, $0x00 39889 JHI loop 39890 RET 39891 39892 // func AmdAxpyPointerLoopXInterleave_V1A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39893 // Requires: SSE 39894 TEXT ·AmdAxpyPointerLoopXInterleave_V1A11U8(SB), NOSPLIT, $0-48 39895 MOVSS alpha+0(FP), X0 39896 MOVQ xs+8(FP), AX 39897 MOVQ incx+16(FP), CX 39898 MOVQ CX, DX 39899 SHLQ $0x05, DX 39900 MOVQ ys+24(FP), DX 39901 MOVQ incy+32(FP), BX 39902 MOVQ BX, SI 39903 SHLQ $0x05, SI 39904 MOVQ n+40(FP), SI 39905 JMP check_limit_unroll 39906 PCALIGN $0x08 39907 NOP 39908 NOP 39909 NOP 39910 39911 loop_unroll: 39912 MOVSS (AX), X1 39913 LEAQ (AX)(CX*4), AX 39914 MOVSS (AX), X2 39915 LEAQ (AX)(CX*4), AX 39916 MOVSS (AX), X3 39917 LEAQ (AX)(CX*4), AX 39918 MOVSS (AX), X4 39919 LEAQ (AX)(CX*4), AX 39920 MOVSS (AX), X5 39921 LEAQ (AX)(CX*4), AX 39922 MOVSS (AX), X6 39923 LEAQ (AX)(CX*4), AX 39924 MOVSS (AX), X7 39925 LEAQ (AX)(CX*4), AX 39926 MOVSS (AX), X8 39927 LEAQ (AX)(CX*4), AX 39928 MULSS X0, X1 39929 MULSS X0, X2 39930 MULSS X0, X3 39931 MULSS X0, X4 39932 MULSS X0, X5 39933 MULSS X0, X6 39934 MULSS X0, X7 39935 MULSS X0, X8 39936 ADDSS (DX), X1 39937 MOVSS X1, (DX) 39938 LEAQ (DX)(BX*4), DX 39939 ADDSS (DX), X2 39940 MOVSS X2, (DX) 39941 
LEAQ (DX)(BX*4), DX 39942 ADDSS (DX), X3 39943 MOVSS X3, (DX) 39944 LEAQ (DX)(BX*4), DX 39945 ADDSS (DX), X4 39946 MOVSS X4, (DX) 39947 LEAQ (DX)(BX*4), DX 39948 ADDSS (DX), X5 39949 MOVSS X5, (DX) 39950 LEAQ (DX)(BX*4), DX 39951 ADDSS (DX), X6 39952 MOVSS X6, (DX) 39953 LEAQ (DX)(BX*4), DX 39954 ADDSS (DX), X7 39955 MOVSS X7, (DX) 39956 LEAQ (DX)(BX*4), DX 39957 ADDSS (DX), X8 39958 MOVSS X8, (DX) 39959 LEAQ (DX)(BX*4), DX 39960 SUBQ $0x08, SI 39961 39962 check_limit_unroll: 39963 CMPQ SI, $0x08 39964 JHS loop_unroll 39965 JMP check_limit 39966 39967 loop: 39968 MOVSS (AX), X1 39969 MULSS X0, X1 39970 ADDSS (DX), X1 39971 MOVSS X1, (DX) 39972 DECQ SI 39973 LEAQ (AX)(CX*4), AX 39974 LEAQ (DX)(BX*4), DX 39975 39976 check_limit: 39977 CMPQ SI, $0x00 39978 JHI loop 39979 RET 39980 39981 // func AmdAxpyPointerLoopXInterleave_V2A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 39982 // Requires: SSE 39983 TEXT ·AmdAxpyPointerLoopXInterleave_V2A11U8(SB), NOSPLIT, $0-48 39984 MOVSS alpha+0(FP), X0 39985 MOVQ xs+8(FP), AX 39986 MOVQ incx+16(FP), CX 39987 MOVQ CX, DX 39988 SHLQ $0x05, DX 39989 MOVQ ys+24(FP), DX 39990 MOVQ incy+32(FP), BX 39991 MOVQ BX, SI 39992 SHLQ $0x05, SI 39993 MOVQ n+40(FP), SI 39994 JMP check_limit_unroll 39995 PCALIGN $0x08 39996 NOP 39997 NOP 39998 NOP 39999 40000 loop_unroll: 40001 MOVSS (AX), X1 40002 LEAQ (AX)(CX*4), AX 40003 MOVSS (AX), X2 40004 LEAQ (AX)(CX*4), AX 40005 MOVSS (AX), X3 40006 LEAQ (AX)(CX*4), AX 40007 MOVSS (AX), X4 40008 LEAQ (AX)(CX*4), AX 40009 MOVSS (AX), X5 40010 LEAQ (AX)(CX*4), AX 40011 MOVSS (AX), X6 40012 LEAQ (AX)(CX*4), AX 40013 MOVSS (AX), X7 40014 LEAQ (AX)(CX*4), AX 40015 MOVSS (AX), X8 40016 LEAQ (AX)(CX*4), AX 40017 MULSS X0, X1 40018 MULSS X0, X2 40019 MULSS X0, X3 40020 MULSS X0, X4 40021 MULSS X0, X5 40022 MULSS X0, X6 40023 MULSS X0, X7 40024 MULSS X0, X8 40025 ADDSS (DX), X1 40026 MOVSS X1, (DX) 40027 LEAQ (DX)(BX*4), DX 40028 ADDSS (DX), X2 40029 MOVSS X2, (DX) 40030 LEAQ (DX)(BX*4), DX 40031 ADDSS (DX), X3 40032 MOVSS X3, (DX) 40033 LEAQ (DX)(BX*4), DX 40034 ADDSS (DX), X4 40035 MOVSS X4, (DX) 40036 LEAQ (DX)(BX*4), DX 40037 ADDSS (DX), X5 40038 MOVSS X5, (DX) 40039 LEAQ (DX)(BX*4), DX 40040 ADDSS (DX), X6 40041 MOVSS X6, (DX) 40042 LEAQ (DX)(BX*4), DX 40043 ADDSS (DX), X7 40044 MOVSS X7, (DX) 40045 LEAQ (DX)(BX*4), DX 40046 ADDSS (DX), X8 40047 MOVSS X8, (DX) 40048 LEAQ (DX)(BX*4), DX 40049 SUBQ $0x08, SI 40050 40051 check_limit_unroll: 40052 CMPQ SI, $0x08 40053 JHS loop_unroll 40054 JMP check_limit 40055 40056 loop: 40057 MOVSS (AX), X1 40058 MULSS X0, X1 40059 ADDSS (DX), X1 40060 MOVSS X1, (DX) 40061 DECQ SI 40062 LEAQ (AX)(CX*4), AX 40063 LEAQ (DX)(BX*4), DX 40064 40065 check_limit: 40066 CMPQ SI, $0x00 40067 JHI loop 40068 RET 40069 40070 // func AmdAxpyPointerLoopXInterleave_V3A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 40071 // Requires: SSE 40072 TEXT ·AmdAxpyPointerLoopXInterleave_V3A11U8(SB), NOSPLIT, $0-48 40073 MOVSS alpha+0(FP), X0 40074 MOVQ xs+8(FP), AX 40075 MOVQ incx+16(FP), CX 40076 MOVQ CX, DX 40077 SHLQ $0x05, DX 40078 MOVQ ys+24(FP), DX 40079 MOVQ incy+32(FP), BX 40080 MOVQ BX, SI 40081 SHLQ $0x05, SI 40082 MOVQ n+40(FP), SI 40083 JMP check_limit_unroll 40084 PCALIGN $0x08 40085 NOP 40086 NOP 40087 NOP 40088 40089 loop_unroll: 40090 MOVSS (AX), X1 40091 LEAQ (AX)(CX*4), AX 40092 MOVSS (AX), X2 40093 LEAQ (AX)(CX*4), AX 40094 MOVSS (AX), X3 40095 LEAQ (AX)(CX*4), AX 40096 MOVSS (AX), X4 40097 LEAQ (AX)(CX*4), AX 40098 MOVSS (AX), X5 
40099 LEAQ (AX)(CX*4), AX 40100 MOVSS (AX), X6 40101 LEAQ (AX)(CX*4), AX 40102 MOVSS (AX), X7 40103 LEAQ (AX)(CX*4), AX 40104 MOVSS (AX), X8 40105 LEAQ (AX)(CX*4), AX 40106 MULSS X0, X1 40107 MULSS X0, X2 40108 MULSS X0, X3 40109 MULSS X0, X4 40110 MULSS X0, X5 40111 MULSS X0, X6 40112 MULSS X0, X7 40113 MULSS X0, X8 40114 ADDSS (DX), X1 40115 MOVSS X1, (DX) 40116 LEAQ (DX)(BX*4), DX 40117 ADDSS (DX), X2 40118 MOVSS X2, (DX) 40119 LEAQ (DX)(BX*4), DX 40120 ADDSS (DX), X3 40121 MOVSS X3, (DX) 40122 LEAQ (DX)(BX*4), DX 40123 ADDSS (DX), X4 40124 MOVSS X4, (DX) 40125 LEAQ (DX)(BX*4), DX 40126 ADDSS (DX), X5 40127 MOVSS X5, (DX) 40128 LEAQ (DX)(BX*4), DX 40129 ADDSS (DX), X6 40130 MOVSS X6, (DX) 40131 LEAQ (DX)(BX*4), DX 40132 ADDSS (DX), X7 40133 MOVSS X7, (DX) 40134 LEAQ (DX)(BX*4), DX 40135 ADDSS (DX), X8 40136 MOVSS X8, (DX) 40137 LEAQ (DX)(BX*4), DX 40138 SUBQ $0x08, SI 40139 40140 check_limit_unroll: 40141 CMPQ SI, $0x08 40142 JHS loop_unroll 40143 JMP check_limit 40144 40145 loop: 40146 MOVSS (AX), X1 40147 MULSS X0, X1 40148 ADDSS (DX), X1 40149 MOVSS X1, (DX) 40150 DECQ SI 40151 LEAQ (AX)(CX*4), AX 40152 LEAQ (DX)(BX*4), DX 40153 40154 check_limit: 40155 CMPQ SI, $0x00 40156 JHI loop 40157 RET 40158 40159 // func AmdAxpyPointerLoopXInterleave_V4A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 40160 // Requires: SSE 40161 TEXT ·AmdAxpyPointerLoopXInterleave_V4A11U8(SB), NOSPLIT, $0-48 40162 MOVSS alpha+0(FP), X0 40163 MOVQ xs+8(FP), AX 40164 MOVQ incx+16(FP), CX 40165 MOVQ CX, DX 40166 SHLQ $0x05, DX 40167 MOVQ ys+24(FP), DX 40168 MOVQ incy+32(FP), BX 40169 MOVQ BX, SI 40170 SHLQ $0x05, SI 40171 MOVQ n+40(FP), SI 40172 JMP check_limit_unroll 40173 PCALIGN $0x08 40174 NOP 40175 NOP 40176 NOP 40177 40178 loop_unroll: 40179 MOVSS (AX), X1 40180 LEAQ (AX)(CX*4), AX 40181 MOVSS (AX), X2 40182 LEAQ (AX)(CX*4), AX 40183 MOVSS (AX), X3 40184 LEAQ (AX)(CX*4), AX 40185 MOVSS (AX), X4 40186 LEAQ (AX)(CX*4), AX 40187 MOVSS (AX), X5 40188 LEAQ (AX)(CX*4), AX 40189 MOVSS (AX), X6 40190 LEAQ (AX)(CX*4), AX 40191 MOVSS (AX), X7 40192 LEAQ (AX)(CX*4), AX 40193 MOVSS (AX), X8 40194 LEAQ (AX)(CX*4), AX 40195 MULSS X0, X1 40196 MULSS X0, X2 40197 MULSS X0, X3 40198 MULSS X0, X4 40199 MULSS X0, X5 40200 MULSS X0, X6 40201 MULSS X0, X7 40202 MULSS X0, X8 40203 ADDSS (DX), X1 40204 MOVSS X1, (DX) 40205 LEAQ (DX)(BX*4), DX 40206 ADDSS (DX), X2 40207 MOVSS X2, (DX) 40208 LEAQ (DX)(BX*4), DX 40209 ADDSS (DX), X3 40210 MOVSS X3, (DX) 40211 LEAQ (DX)(BX*4), DX 40212 ADDSS (DX), X4 40213 MOVSS X4, (DX) 40214 LEAQ (DX)(BX*4), DX 40215 ADDSS (DX), X5 40216 MOVSS X5, (DX) 40217 LEAQ (DX)(BX*4), DX 40218 ADDSS (DX), X6 40219 MOVSS X6, (DX) 40220 LEAQ (DX)(BX*4), DX 40221 ADDSS (DX), X7 40222 MOVSS X7, (DX) 40223 LEAQ (DX)(BX*4), DX 40224 ADDSS (DX), X8 40225 MOVSS X8, (DX) 40226 LEAQ (DX)(BX*4), DX 40227 SUBQ $0x08, SI 40228 40229 check_limit_unroll: 40230 CMPQ SI, $0x08 40231 JHS loop_unroll 40232 JMP check_limit 40233 40234 loop: 40235 MOVSS (AX), X1 40236 MULSS X0, X1 40237 ADDSS (DX), X1 40238 MOVSS X1, (DX) 40239 DECQ SI 40240 LEAQ (AX)(CX*4), AX 40241 LEAQ (DX)(BX*4), DX 40242 40243 check_limit: 40244 CMPQ SI, $0x00 40245 JHI loop 40246 RET 40247 40248 // func AmdAxpyPointerLoopXInterleave_V5A11U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 40249 // Requires: SSE 40250 TEXT ·AmdAxpyPointerLoopXInterleave_V5A11U8(SB), NOSPLIT, $0-48 40251 MOVSS alpha+0(FP), X0 40252 MOVQ xs+8(FP), AX 40253 MOVQ incx+16(FP), CX 40254 MOVQ CX, DX 40255 SHLQ 
$0x05, DX 40256 MOVQ ys+24(FP), DX 40257 MOVQ incy+32(FP), BX 40258 MOVQ BX, SI 40259 SHLQ $0x05, SI 40260 MOVQ n+40(FP), SI 40261 JMP check_limit_unroll 40262 PCALIGN $0x08 40263 NOP 40264 NOP 40265 NOP 40266 40267 loop_unroll: 40268 MOVSS (AX), X1 40269 LEAQ (AX)(CX*4), AX 40270 MOVSS (AX), X2 40271 LEAQ (AX)(CX*4), AX 40272 MOVSS (AX), X3 40273 LEAQ (AX)(CX*4), AX 40274 MOVSS (AX), X4 40275 LEAQ (AX)(CX*4), AX 40276 MOVSS (AX), X5 40277 LEAQ (AX)(CX*4), AX 40278 MOVSS (AX), X6 40279 LEAQ (AX)(CX*4), AX 40280 MOVSS (AX), X7 40281 LEAQ (AX)(CX*4), AX 40282 MOVSS (AX), X8 40283 LEAQ (AX)(CX*4), AX 40284 MULSS X0, X1 40285 MULSS X0, X2 40286 MULSS X0, X3 40287 MULSS X0, X4 40288 MULSS X0, X5 40289 MULSS X0, X6 40290 MULSS X0, X7 40291 MULSS X0, X8 40292 ADDSS (DX), X1 40293 MOVSS X1, (DX) 40294 LEAQ (DX)(BX*4), DX 40295 ADDSS (DX), X2 40296 MOVSS X2, (DX) 40297 LEAQ (DX)(BX*4), DX 40298 ADDSS (DX), X3 40299 MOVSS X3, (DX) 40300 LEAQ (DX)(BX*4), DX 40301 ADDSS (DX), X4 40302 MOVSS X4, (DX) 40303 LEAQ (DX)(BX*4), DX 40304 ADDSS (DX), X5 40305 MOVSS X5, (DX) 40306 LEAQ (DX)(BX*4), DX 40307 ADDSS (DX), X6 40308 MOVSS X6, (DX) 40309 LEAQ (DX)(BX*4), DX 40310 ADDSS (DX), X7 40311 MOVSS X7, (DX) 40312 LEAQ (DX)(BX*4), DX 40313 ADDSS (DX), X8 40314 MOVSS X8, (DX) 40315 LEAQ (DX)(BX*4), DX 40316 SUBQ $0x08, SI 40317 40318 check_limit_unroll: 40319 CMPQ SI, $0x08 40320 JHS loop_unroll 40321 JMP check_limit 40322 40323 loop: 40324 MOVSS (AX), X1 40325 MULSS X0, X1 40326 ADDSS (DX), X1 40327 MOVSS X1, (DX) 40328 DECQ SI 40329 LEAQ (AX)(CX*4), AX 40330 LEAQ (DX)(BX*4), DX 40331 40332 check_limit: 40333 CMPQ SI, $0x00 40334 JHI loop 40335 RET 40336 40337 // func AmdAxpyPointerLoopXInterleave_V0A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 40338 // Requires: SSE 40339 TEXT ·AmdAxpyPointerLoopXInterleave_V0A12U8(SB), NOSPLIT, $0-48 40340 MOVSS alpha+0(FP), X0 40341 MOVQ xs+8(FP), AX 40342 MOVQ incx+16(FP), CX 40343 MOVQ CX, DX 40344 SHLQ $0x05, DX 40345 MOVQ ys+24(FP), DX 40346 MOVQ incy+32(FP), BX 40347 MOVQ BX, SI 40348 SHLQ $0x05, SI 40349 MOVQ n+40(FP), SI 40350 JMP check_limit_unroll 40351 PCALIGN $0x08 40352 NOP 40353 NOP 40354 NOP 40355 NOP 40356 40357 loop_unroll: 40358 MOVSS (AX), X1 40359 LEAQ (AX)(CX*4), AX 40360 MOVSS (AX), X2 40361 LEAQ (AX)(CX*4), AX 40362 MOVSS (AX), X3 40363 LEAQ (AX)(CX*4), AX 40364 MOVSS (AX), X4 40365 LEAQ (AX)(CX*4), AX 40366 MOVSS (AX), X5 40367 LEAQ (AX)(CX*4), AX 40368 MOVSS (AX), X6 40369 LEAQ (AX)(CX*4), AX 40370 MOVSS (AX), X7 40371 LEAQ (AX)(CX*4), AX 40372 MOVSS (AX), X8 40373 LEAQ (AX)(CX*4), AX 40374 MULSS X0, X1 40375 MULSS X0, X2 40376 MULSS X0, X3 40377 MULSS X0, X4 40378 MULSS X0, X5 40379 MULSS X0, X6 40380 MULSS X0, X7 40381 MULSS X0, X8 40382 ADDSS (DX), X1 40383 MOVSS X1, (DX) 40384 LEAQ (DX)(BX*4), DX 40385 ADDSS (DX), X2 40386 MOVSS X2, (DX) 40387 LEAQ (DX)(BX*4), DX 40388 ADDSS (DX), X3 40389 MOVSS X3, (DX) 40390 LEAQ (DX)(BX*4), DX 40391 ADDSS (DX), X4 40392 MOVSS X4, (DX) 40393 LEAQ (DX)(BX*4), DX 40394 ADDSS (DX), X5 40395 MOVSS X5, (DX) 40396 LEAQ (DX)(BX*4), DX 40397 ADDSS (DX), X6 40398 MOVSS X6, (DX) 40399 LEAQ (DX)(BX*4), DX 40400 ADDSS (DX), X7 40401 MOVSS X7, (DX) 40402 LEAQ (DX)(BX*4), DX 40403 ADDSS (DX), X8 40404 MOVSS X8, (DX) 40405 LEAQ (DX)(BX*4), DX 40406 SUBQ $0x08, SI 40407 40408 check_limit_unroll: 40409 CMPQ SI, $0x08 40410 JHS loop_unroll 40411 JMP check_limit 40412 40413 loop: 40414 MOVSS (AX), X1 40415 MULSS X0, X1 40416 ADDSS (DX), X1 40417 MOVSS X1, (DX) 40418 DECQ SI 40419 LEAQ 
(AX)(CX*4), AX 40420 LEAQ (DX)(BX*4), DX 40421 40422 check_limit: 40423 CMPQ SI, $0x00 40424 JHI loop 40425 RET 40426 40427 // func AmdAxpyPointerLoopXInterleave_V1A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 40428 // Requires: SSE 40429 TEXT ·AmdAxpyPointerLoopXInterleave_V1A12U8(SB), NOSPLIT, $0-48 40430 MOVSS alpha+0(FP), X0 40431 MOVQ xs+8(FP), AX 40432 MOVQ incx+16(FP), CX 40433 MOVQ CX, DX 40434 SHLQ $0x05, DX 40435 MOVQ ys+24(FP), DX 40436 MOVQ incy+32(FP), BX 40437 MOVQ BX, SI 40438 SHLQ $0x05, SI 40439 MOVQ n+40(FP), SI 40440 JMP check_limit_unroll 40441 PCALIGN $0x08 40442 NOP 40443 NOP 40444 NOP 40445 NOP 40446 40447 loop_unroll: 40448 MOVSS (AX), X1 40449 LEAQ (AX)(CX*4), AX 40450 MOVSS (AX), X2 40451 LEAQ (AX)(CX*4), AX 40452 MOVSS (AX), X3 40453 LEAQ (AX)(CX*4), AX 40454 MOVSS (AX), X4 40455 LEAQ (AX)(CX*4), AX 40456 MOVSS (AX), X5 40457 LEAQ (AX)(CX*4), AX 40458 MOVSS (AX), X6 40459 LEAQ (AX)(CX*4), AX 40460 MOVSS (AX), X7 40461 LEAQ (AX)(CX*4), AX 40462 MOVSS (AX), X8 40463 LEAQ (AX)(CX*4), AX 40464 MULSS X0, X1 40465 MULSS X0, X2 40466 MULSS X0, X3 40467 MULSS X0, X4 40468 MULSS X0, X5 40469 MULSS X0, X6 40470 MULSS X0, X7 40471 MULSS X0, X8 40472 ADDSS (DX), X1 40473 MOVSS X1, (DX) 40474 LEAQ (DX)(BX*4), DX 40475 ADDSS (DX), X2 40476 MOVSS X2, (DX) 40477 LEAQ (DX)(BX*4), DX 40478 ADDSS (DX), X3 40479 MOVSS X3, (DX) 40480 LEAQ (DX)(BX*4), DX 40481 ADDSS (DX), X4 40482 MOVSS X4, (DX) 40483 LEAQ (DX)(BX*4), DX 40484 ADDSS (DX), X5 40485 MOVSS X5, (DX) 40486 LEAQ (DX)(BX*4), DX 40487 ADDSS (DX), X6 40488 MOVSS X6, (DX) 40489 LEAQ (DX)(BX*4), DX 40490 ADDSS (DX), X7 40491 MOVSS X7, (DX) 40492 LEAQ (DX)(BX*4), DX 40493 ADDSS (DX), X8 40494 MOVSS X8, (DX) 40495 LEAQ (DX)(BX*4), DX 40496 SUBQ $0x08, SI 40497 40498 check_limit_unroll: 40499 CMPQ SI, $0x08 40500 JHS loop_unroll 40501 JMP check_limit 40502 40503 loop: 40504 MOVSS (AX), X1 40505 MULSS X0, X1 40506 ADDSS (DX), X1 40507 MOVSS X1, (DX) 40508 DECQ SI 40509 LEAQ (AX)(CX*4), AX 40510 LEAQ (DX)(BX*4), DX 40511 40512 check_limit: 40513 CMPQ SI, $0x00 40514 JHI loop 40515 RET 40516 40517 // func AmdAxpyPointerLoopXInterleave_V2A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 40518 // Requires: SSE 40519 TEXT ·AmdAxpyPointerLoopXInterleave_V2A12U8(SB), NOSPLIT, $0-48 40520 MOVSS alpha+0(FP), X0 40521 MOVQ xs+8(FP), AX 40522 MOVQ incx+16(FP), CX 40523 MOVQ CX, DX 40524 SHLQ $0x05, DX 40525 MOVQ ys+24(FP), DX 40526 MOVQ incy+32(FP), BX 40527 MOVQ BX, SI 40528 SHLQ $0x05, SI 40529 MOVQ n+40(FP), SI 40530 JMP check_limit_unroll 40531 PCALIGN $0x08 40532 NOP 40533 NOP 40534 NOP 40535 NOP 40536 40537 loop_unroll: 40538 MOVSS (AX), X1 40539 LEAQ (AX)(CX*4), AX 40540 MOVSS (AX), X2 40541 LEAQ (AX)(CX*4), AX 40542 MOVSS (AX), X3 40543 LEAQ (AX)(CX*4), AX 40544 MOVSS (AX), X4 40545 LEAQ (AX)(CX*4), AX 40546 MOVSS (AX), X5 40547 LEAQ (AX)(CX*4), AX 40548 MOVSS (AX), X6 40549 LEAQ (AX)(CX*4), AX 40550 MOVSS (AX), X7 40551 LEAQ (AX)(CX*4), AX 40552 MOVSS (AX), X8 40553 LEAQ (AX)(CX*4), AX 40554 MULSS X0, X1 40555 MULSS X0, X2 40556 MULSS X0, X3 40557 MULSS X0, X4 40558 MULSS X0, X5 40559 MULSS X0, X6 40560 MULSS X0, X7 40561 MULSS X0, X8 40562 ADDSS (DX), X1 40563 MOVSS X1, (DX) 40564 LEAQ (DX)(BX*4), DX 40565 ADDSS (DX), X2 40566 MOVSS X2, (DX) 40567 LEAQ (DX)(BX*4), DX 40568 ADDSS (DX), X3 40569 MOVSS X3, (DX) 40570 LEAQ (DX)(BX*4), DX 40571 ADDSS (DX), X4 40572 MOVSS X4, (DX) 40573 LEAQ (DX)(BX*4), DX 40574 ADDSS (DX), X5 40575 MOVSS X5, (DX) 40576 LEAQ 
(DX)(BX*4), DX 40577 ADDSS (DX), X6 40578 MOVSS X6, (DX) 40579 LEAQ (DX)(BX*4), DX 40580 ADDSS (DX), X7 40581 MOVSS X7, (DX) 40582 LEAQ (DX)(BX*4), DX 40583 ADDSS (DX), X8 40584 MOVSS X8, (DX) 40585 LEAQ (DX)(BX*4), DX 40586 SUBQ $0x08, SI 40587 40588 check_limit_unroll: 40589 CMPQ SI, $0x08 40590 JHS loop_unroll 40591 JMP check_limit 40592 40593 loop: 40594 MOVSS (AX), X1 40595 MULSS X0, X1 40596 ADDSS (DX), X1 40597 MOVSS X1, (DX) 40598 DECQ SI 40599 LEAQ (AX)(CX*4), AX 40600 LEAQ (DX)(BX*4), DX 40601 40602 check_limit: 40603 CMPQ SI, $0x00 40604 JHI loop 40605 RET 40606 40607 // func AmdAxpyPointerLoopXInterleave_V3A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 40608 // Requires: SSE 40609 TEXT ·AmdAxpyPointerLoopXInterleave_V3A12U8(SB), NOSPLIT, $0-48 40610 MOVSS alpha+0(FP), X0 40611 MOVQ xs+8(FP), AX 40612 MOVQ incx+16(FP), CX 40613 MOVQ CX, DX 40614 SHLQ $0x05, DX 40615 MOVQ ys+24(FP), DX 40616 MOVQ incy+32(FP), BX 40617 MOVQ BX, SI 40618 SHLQ $0x05, SI 40619 MOVQ n+40(FP), SI 40620 JMP check_limit_unroll 40621 PCALIGN $0x08 40622 NOP 40623 NOP 40624 NOP 40625 NOP 40626 40627 loop_unroll: 40628 MOVSS (AX), X1 40629 LEAQ (AX)(CX*4), AX 40630 MOVSS (AX), X2 40631 LEAQ (AX)(CX*4), AX 40632 MOVSS (AX), X3 40633 LEAQ (AX)(CX*4), AX 40634 MOVSS (AX), X4 40635 LEAQ (AX)(CX*4), AX 40636 MOVSS (AX), X5 40637 LEAQ (AX)(CX*4), AX 40638 MOVSS (AX), X6 40639 LEAQ (AX)(CX*4), AX 40640 MOVSS (AX), X7 40641 LEAQ (AX)(CX*4), AX 40642 MOVSS (AX), X8 40643 LEAQ (AX)(CX*4), AX 40644 MULSS X0, X1 40645 MULSS X0, X2 40646 MULSS X0, X3 40647 MULSS X0, X4 40648 MULSS X0, X5 40649 MULSS X0, X6 40650 MULSS X0, X7 40651 MULSS X0, X8 40652 ADDSS (DX), X1 40653 MOVSS X1, (DX) 40654 LEAQ (DX)(BX*4), DX 40655 ADDSS (DX), X2 40656 MOVSS X2, (DX) 40657 LEAQ (DX)(BX*4), DX 40658 ADDSS (DX), X3 40659 MOVSS X3, (DX) 40660 LEAQ (DX)(BX*4), DX 40661 ADDSS (DX), X4 40662 MOVSS X4, (DX) 40663 LEAQ (DX)(BX*4), DX 40664 ADDSS (DX), X5 40665 MOVSS X5, (DX) 40666 LEAQ (DX)(BX*4), DX 40667 ADDSS (DX), X6 40668 MOVSS X6, (DX) 40669 LEAQ (DX)(BX*4), DX 40670 ADDSS (DX), X7 40671 MOVSS X7, (DX) 40672 LEAQ (DX)(BX*4), DX 40673 ADDSS (DX), X8 40674 MOVSS X8, (DX) 40675 LEAQ (DX)(BX*4), DX 40676 SUBQ $0x08, SI 40677 40678 check_limit_unroll: 40679 CMPQ SI, $0x08 40680 JHS loop_unroll 40681 JMP check_limit 40682 40683 loop: 40684 MOVSS (AX), X1 40685 MULSS X0, X1 40686 ADDSS (DX), X1 40687 MOVSS X1, (DX) 40688 DECQ SI 40689 LEAQ (AX)(CX*4), AX 40690 LEAQ (DX)(BX*4), DX 40691 40692 check_limit: 40693 CMPQ SI, $0x00 40694 JHI loop 40695 RET 40696 40697 // func AmdAxpyPointerLoopXInterleave_V4A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 40698 // Requires: SSE 40699 TEXT ·AmdAxpyPointerLoopXInterleave_V4A12U8(SB), NOSPLIT, $0-48 40700 MOVSS alpha+0(FP), X0 40701 MOVQ xs+8(FP), AX 40702 MOVQ incx+16(FP), CX 40703 MOVQ CX, DX 40704 SHLQ $0x05, DX 40705 MOVQ ys+24(FP), DX 40706 MOVQ incy+32(FP), BX 40707 MOVQ BX, SI 40708 SHLQ $0x05, SI 40709 MOVQ n+40(FP), SI 40710 JMP check_limit_unroll 40711 PCALIGN $0x08 40712 NOP 40713 NOP 40714 NOP 40715 NOP 40716 40717 loop_unroll: 40718 MOVSS (AX), X1 40719 LEAQ (AX)(CX*4), AX 40720 MOVSS (AX), X2 40721 LEAQ (AX)(CX*4), AX 40722 MOVSS (AX), X3 40723 LEAQ (AX)(CX*4), AX 40724 MOVSS (AX), X4 40725 LEAQ (AX)(CX*4), AX 40726 MOVSS (AX), X5 40727 LEAQ (AX)(CX*4), AX 40728 MOVSS (AX), X6 40729 LEAQ (AX)(CX*4), AX 40730 MOVSS (AX), X7 40731 LEAQ (AX)(CX*4), AX 40732 MOVSS (AX), X8 40733 LEAQ (AX)(CX*4), AX 40734 MULSS X0, X1 
// func AmdAxpyPointerLoopXInterleave_V4A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopXInterleave_V4A12U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ CX, DX
	SHLQ $0x05, DX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ BX, SI
	SHLQ $0x05, SI
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X2
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X3
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X4
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X5
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X6
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X7
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X8
	LEAQ (AX)(CX*4), AX
	MULSS X0, X1
	MULSS X0, X2
	MULSS X0, X3
	MULSS X0, X4
	MULSS X0, X5
	MULSS X0, X6
	MULSS X0, X7
	MULSS X0, X8
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X2
	MOVSS X2, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X3
	MOVSS X3, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X4
	MOVSS X4, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X5
	MOVSS X5, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X6
	MOVSS X6, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X7
	MOVSS X7, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X8
	MOVSS X8, (DX)
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopXInterleave_V5A12U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopXInterleave_V5A12U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ CX, DX
	SHLQ $0x05, DX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ BX, SI
	SHLQ $0x05, SI
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X2
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X3
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X4
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X5
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X6
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X7
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X8
	LEAQ (AX)(CX*4), AX
	MULSS X0, X1
	MULSS X0, X2
	MULSS X0, X3
	MULSS X0, X4
	MULSS X0, X5
	MULSS X0, X6
	MULSS X0, X7
	MULSS X0, X8
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X2
	MOVSS X2, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X3
	MOVSS X3, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X4
	MOVSS X4, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X5
	MOVSS X5, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X6
	MOVSS X6, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X7
	MOVSS X7, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X8
	MOVSS X8, (DX)
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopXInterleave_V0A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopXInterleave_V0A13U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ CX, DX
	SHLQ $0x05, DX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ BX, SI
	SHLQ $0x05, SI
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X2
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X3
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X4
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X5
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X6
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X7
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X8
	LEAQ (AX)(CX*4), AX
	MULSS X0, X1
	MULSS X0, X2
	MULSS X0, X3
	MULSS X0, X4
	MULSS X0, X5
	MULSS X0, X6
	MULSS X0, X7
	MULSS X0, X8
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X2
	MOVSS X2, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X3
	MOVSS X3, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X4
	MOVSS X4, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X5
	MOVSS X5, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X6
	MOVSS X6, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X7
	MOVSS X7, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X8
	MOVSS X8, (DX)
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func AmdAxpyPointerLoopXInterleave_V1A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr)
// Requires: SSE
TEXT ·AmdAxpyPointerLoopXInterleave_V1A13U8(SB), NOSPLIT, $0-48
	MOVSS alpha+0(FP), X0
	MOVQ xs+8(FP), AX
	MOVQ incx+16(FP), CX
	MOVQ CX, DX
	SHLQ $0x05, DX
	MOVQ ys+24(FP), DX
	MOVQ incy+32(FP), BX
	MOVQ BX, SI
	SHLQ $0x05, SI
	MOVQ n+40(FP), SI
	JMP check_limit_unroll
	PCALIGN $0x08
	NOP
	NOP
	NOP
	NOP
	NOP

loop_unroll:
	MOVSS (AX), X1
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X2
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X3
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X4
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X5
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X6
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X7
	LEAQ (AX)(CX*4), AX
	MOVSS (AX), X8
	LEAQ (AX)(CX*4), AX
	MULSS X0, X1
	MULSS X0, X2
	MULSS X0, X3
	MULSS X0, X4
	MULSS X0, X5
	MULSS X0, X6
	MULSS X0, X7
	MULSS X0, X8
	ADDSS (DX), X1
	MOVSS X1, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X2
	MOVSS X2, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X3
	MOVSS X3, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X4
	MOVSS X4, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X5
	MOVSS X5, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X6
	MOVSS X6, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X7
	MOVSS X7, (DX)
	LEAQ (DX)(BX*4), DX
	ADDSS (DX), X8
	MOVSS X8, (DX)
	LEAQ (DX)(BX*4), DX
	SUBQ $0x08, SI

check_limit_unroll:
	CMPQ SI, $0x08
	JHS loop_unroll
	JMP check_limit

loop:
	MOVSS (AX), X1
	MULSS X0, X1
	ADDSS (DX), X1
	MOVSS X1, (DX)
	DECQ SI
	LEAQ (AX)(CX*4), AX
	LEAQ (DX)(BX*4), DX

check_limit:
	CMPQ SI, $0x00
	JHI loop
	RET

// func
AmdAxpyPointerLoopXInterleave_V2A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 41060 // Requires: SSE 41061 TEXT ·AmdAxpyPointerLoopXInterleave_V2A13U8(SB), NOSPLIT, $0-48 41062 MOVSS alpha+0(FP), X0 41063 MOVQ xs+8(FP), AX 41064 MOVQ incx+16(FP), CX 41065 MOVQ CX, DX 41066 SHLQ $0x05, DX 41067 MOVQ ys+24(FP), DX 41068 MOVQ incy+32(FP), BX 41069 MOVQ BX, SI 41070 SHLQ $0x05, SI 41071 MOVQ n+40(FP), SI 41072 JMP check_limit_unroll 41073 PCALIGN $0x08 41074 NOP 41075 NOP 41076 NOP 41077 NOP 41078 NOP 41079 41080 loop_unroll: 41081 MOVSS (AX), X1 41082 LEAQ (AX)(CX*4), AX 41083 MOVSS (AX), X2 41084 LEAQ (AX)(CX*4), AX 41085 MOVSS (AX), X3 41086 LEAQ (AX)(CX*4), AX 41087 MOVSS (AX), X4 41088 LEAQ (AX)(CX*4), AX 41089 MOVSS (AX), X5 41090 LEAQ (AX)(CX*4), AX 41091 MOVSS (AX), X6 41092 LEAQ (AX)(CX*4), AX 41093 MOVSS (AX), X7 41094 LEAQ (AX)(CX*4), AX 41095 MOVSS (AX), X8 41096 LEAQ (AX)(CX*4), AX 41097 MULSS X0, X1 41098 MULSS X0, X2 41099 MULSS X0, X3 41100 MULSS X0, X4 41101 MULSS X0, X5 41102 MULSS X0, X6 41103 MULSS X0, X7 41104 MULSS X0, X8 41105 ADDSS (DX), X1 41106 MOVSS X1, (DX) 41107 LEAQ (DX)(BX*4), DX 41108 ADDSS (DX), X2 41109 MOVSS X2, (DX) 41110 LEAQ (DX)(BX*4), DX 41111 ADDSS (DX), X3 41112 MOVSS X3, (DX) 41113 LEAQ (DX)(BX*4), DX 41114 ADDSS (DX), X4 41115 MOVSS X4, (DX) 41116 LEAQ (DX)(BX*4), DX 41117 ADDSS (DX), X5 41118 MOVSS X5, (DX) 41119 LEAQ (DX)(BX*4), DX 41120 ADDSS (DX), X6 41121 MOVSS X6, (DX) 41122 LEAQ (DX)(BX*4), DX 41123 ADDSS (DX), X7 41124 MOVSS X7, (DX) 41125 LEAQ (DX)(BX*4), DX 41126 ADDSS (DX), X8 41127 MOVSS X8, (DX) 41128 LEAQ (DX)(BX*4), DX 41129 SUBQ $0x08, SI 41130 41131 check_limit_unroll: 41132 CMPQ SI, $0x08 41133 JHS loop_unroll 41134 JMP check_limit 41135 41136 loop: 41137 MOVSS (AX), X1 41138 MULSS X0, X1 41139 ADDSS (DX), X1 41140 MOVSS X1, (DX) 41141 DECQ SI 41142 LEAQ (AX)(CX*4), AX 41143 LEAQ (DX)(BX*4), DX 41144 41145 check_limit: 41146 CMPQ SI, $0x00 41147 JHI loop 41148 RET 41149 41150 // func AmdAxpyPointerLoopXInterleave_V3A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 41151 // Requires: SSE 41152 TEXT ·AmdAxpyPointerLoopXInterleave_V3A13U8(SB), NOSPLIT, $0-48 41153 MOVSS alpha+0(FP), X0 41154 MOVQ xs+8(FP), AX 41155 MOVQ incx+16(FP), CX 41156 MOVQ CX, DX 41157 SHLQ $0x05, DX 41158 MOVQ ys+24(FP), DX 41159 MOVQ incy+32(FP), BX 41160 MOVQ BX, SI 41161 SHLQ $0x05, SI 41162 MOVQ n+40(FP), SI 41163 JMP check_limit_unroll 41164 PCALIGN $0x08 41165 NOP 41166 NOP 41167 NOP 41168 NOP 41169 NOP 41170 41171 loop_unroll: 41172 MOVSS (AX), X1 41173 LEAQ (AX)(CX*4), AX 41174 MOVSS (AX), X2 41175 LEAQ (AX)(CX*4), AX 41176 MOVSS (AX), X3 41177 LEAQ (AX)(CX*4), AX 41178 MOVSS (AX), X4 41179 LEAQ (AX)(CX*4), AX 41180 MOVSS (AX), X5 41181 LEAQ (AX)(CX*4), AX 41182 MOVSS (AX), X6 41183 LEAQ (AX)(CX*4), AX 41184 MOVSS (AX), X7 41185 LEAQ (AX)(CX*4), AX 41186 MOVSS (AX), X8 41187 LEAQ (AX)(CX*4), AX 41188 MULSS X0, X1 41189 MULSS X0, X2 41190 MULSS X0, X3 41191 MULSS X0, X4 41192 MULSS X0, X5 41193 MULSS X0, X6 41194 MULSS X0, X7 41195 MULSS X0, X8 41196 ADDSS (DX), X1 41197 MOVSS X1, (DX) 41198 LEAQ (DX)(BX*4), DX 41199 ADDSS (DX), X2 41200 MOVSS X2, (DX) 41201 LEAQ (DX)(BX*4), DX 41202 ADDSS (DX), X3 41203 MOVSS X3, (DX) 41204 LEAQ (DX)(BX*4), DX 41205 ADDSS (DX), X4 41206 MOVSS X4, (DX) 41207 LEAQ (DX)(BX*4), DX 41208 ADDSS (DX), X5 41209 MOVSS X5, (DX) 41210 LEAQ (DX)(BX*4), DX 41211 ADDSS (DX), X6 41212 MOVSS X6, (DX) 41213 LEAQ (DX)(BX*4), DX 41214 ADDSS (DX), X7 41215 MOVSS 
X7, (DX) 41216 LEAQ (DX)(BX*4), DX 41217 ADDSS (DX), X8 41218 MOVSS X8, (DX) 41219 LEAQ (DX)(BX*4), DX 41220 SUBQ $0x08, SI 41221 41222 check_limit_unroll: 41223 CMPQ SI, $0x08 41224 JHS loop_unroll 41225 JMP check_limit 41226 41227 loop: 41228 MOVSS (AX), X1 41229 MULSS X0, X1 41230 ADDSS (DX), X1 41231 MOVSS X1, (DX) 41232 DECQ SI 41233 LEAQ (AX)(CX*4), AX 41234 LEAQ (DX)(BX*4), DX 41235 41236 check_limit: 41237 CMPQ SI, $0x00 41238 JHI loop 41239 RET 41240 41241 // func AmdAxpyPointerLoopXInterleave_V4A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 41242 // Requires: SSE 41243 TEXT ·AmdAxpyPointerLoopXInterleave_V4A13U8(SB), NOSPLIT, $0-48 41244 MOVSS alpha+0(FP), X0 41245 MOVQ xs+8(FP), AX 41246 MOVQ incx+16(FP), CX 41247 MOVQ CX, DX 41248 SHLQ $0x05, DX 41249 MOVQ ys+24(FP), DX 41250 MOVQ incy+32(FP), BX 41251 MOVQ BX, SI 41252 SHLQ $0x05, SI 41253 MOVQ n+40(FP), SI 41254 JMP check_limit_unroll 41255 PCALIGN $0x08 41256 NOP 41257 NOP 41258 NOP 41259 NOP 41260 NOP 41261 41262 loop_unroll: 41263 MOVSS (AX), X1 41264 LEAQ (AX)(CX*4), AX 41265 MOVSS (AX), X2 41266 LEAQ (AX)(CX*4), AX 41267 MOVSS (AX), X3 41268 LEAQ (AX)(CX*4), AX 41269 MOVSS (AX), X4 41270 LEAQ (AX)(CX*4), AX 41271 MOVSS (AX), X5 41272 LEAQ (AX)(CX*4), AX 41273 MOVSS (AX), X6 41274 LEAQ (AX)(CX*4), AX 41275 MOVSS (AX), X7 41276 LEAQ (AX)(CX*4), AX 41277 MOVSS (AX), X8 41278 LEAQ (AX)(CX*4), AX 41279 MULSS X0, X1 41280 MULSS X0, X2 41281 MULSS X0, X3 41282 MULSS X0, X4 41283 MULSS X0, X5 41284 MULSS X0, X6 41285 MULSS X0, X7 41286 MULSS X0, X8 41287 ADDSS (DX), X1 41288 MOVSS X1, (DX) 41289 LEAQ (DX)(BX*4), DX 41290 ADDSS (DX), X2 41291 MOVSS X2, (DX) 41292 LEAQ (DX)(BX*4), DX 41293 ADDSS (DX), X3 41294 MOVSS X3, (DX) 41295 LEAQ (DX)(BX*4), DX 41296 ADDSS (DX), X4 41297 MOVSS X4, (DX) 41298 LEAQ (DX)(BX*4), DX 41299 ADDSS (DX), X5 41300 MOVSS X5, (DX) 41301 LEAQ (DX)(BX*4), DX 41302 ADDSS (DX), X6 41303 MOVSS X6, (DX) 41304 LEAQ (DX)(BX*4), DX 41305 ADDSS (DX), X7 41306 MOVSS X7, (DX) 41307 LEAQ (DX)(BX*4), DX 41308 ADDSS (DX), X8 41309 MOVSS X8, (DX) 41310 LEAQ (DX)(BX*4), DX 41311 SUBQ $0x08, SI 41312 41313 check_limit_unroll: 41314 CMPQ SI, $0x08 41315 JHS loop_unroll 41316 JMP check_limit 41317 41318 loop: 41319 MOVSS (AX), X1 41320 MULSS X0, X1 41321 ADDSS (DX), X1 41322 MOVSS X1, (DX) 41323 DECQ SI 41324 LEAQ (AX)(CX*4), AX 41325 LEAQ (DX)(BX*4), DX 41326 41327 check_limit: 41328 CMPQ SI, $0x00 41329 JHI loop 41330 RET 41331 41332 // func AmdAxpyPointerLoopXInterleave_V5A13U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 41333 // Requires: SSE 41334 TEXT ·AmdAxpyPointerLoopXInterleave_V5A13U8(SB), NOSPLIT, $0-48 41335 MOVSS alpha+0(FP), X0 41336 MOVQ xs+8(FP), AX 41337 MOVQ incx+16(FP), CX 41338 MOVQ CX, DX 41339 SHLQ $0x05, DX 41340 MOVQ ys+24(FP), DX 41341 MOVQ incy+32(FP), BX 41342 MOVQ BX, SI 41343 SHLQ $0x05, SI 41344 MOVQ n+40(FP), SI 41345 JMP check_limit_unroll 41346 PCALIGN $0x08 41347 NOP 41348 NOP 41349 NOP 41350 NOP 41351 NOP 41352 41353 loop_unroll: 41354 MOVSS (AX), X1 41355 LEAQ (AX)(CX*4), AX 41356 MOVSS (AX), X2 41357 LEAQ (AX)(CX*4), AX 41358 MOVSS (AX), X3 41359 LEAQ (AX)(CX*4), AX 41360 MOVSS (AX), X4 41361 LEAQ (AX)(CX*4), AX 41362 MOVSS (AX), X5 41363 LEAQ (AX)(CX*4), AX 41364 MOVSS (AX), X6 41365 LEAQ (AX)(CX*4), AX 41366 MOVSS (AX), X7 41367 LEAQ (AX)(CX*4), AX 41368 MOVSS (AX), X8 41369 LEAQ (AX)(CX*4), AX 41370 MULSS X0, X1 41371 MULSS X0, X2 41372 MULSS X0, X3 41373 MULSS X0, X4 41374 MULSS X0, X5 41375 MULSS X0, X6 
41376 MULSS X0, X7 41377 MULSS X0, X8 41378 ADDSS (DX), X1 41379 MOVSS X1, (DX) 41380 LEAQ (DX)(BX*4), DX 41381 ADDSS (DX), X2 41382 MOVSS X2, (DX) 41383 LEAQ (DX)(BX*4), DX 41384 ADDSS (DX), X3 41385 MOVSS X3, (DX) 41386 LEAQ (DX)(BX*4), DX 41387 ADDSS (DX), X4 41388 MOVSS X4, (DX) 41389 LEAQ (DX)(BX*4), DX 41390 ADDSS (DX), X5 41391 MOVSS X5, (DX) 41392 LEAQ (DX)(BX*4), DX 41393 ADDSS (DX), X6 41394 MOVSS X6, (DX) 41395 LEAQ (DX)(BX*4), DX 41396 ADDSS (DX), X7 41397 MOVSS X7, (DX) 41398 LEAQ (DX)(BX*4), DX 41399 ADDSS (DX), X8 41400 MOVSS X8, (DX) 41401 LEAQ (DX)(BX*4), DX 41402 SUBQ $0x08, SI 41403 41404 check_limit_unroll: 41405 CMPQ SI, $0x08 41406 JHS loop_unroll 41407 JMP check_limit 41408 41409 loop: 41410 MOVSS (AX), X1 41411 MULSS X0, X1 41412 ADDSS (DX), X1 41413 MOVSS X1, (DX) 41414 DECQ SI 41415 LEAQ (AX)(CX*4), AX 41416 LEAQ (DX)(BX*4), DX 41417 41418 check_limit: 41419 CMPQ SI, $0x00 41420 JHI loop 41421 RET 41422 41423 // func AmdAxpyPointerLoopXInterleave_V0A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 41424 // Requires: SSE 41425 TEXT ·AmdAxpyPointerLoopXInterleave_V0A14U8(SB), NOSPLIT, $0-48 41426 MOVSS alpha+0(FP), X0 41427 MOVQ xs+8(FP), AX 41428 MOVQ incx+16(FP), CX 41429 MOVQ CX, DX 41430 SHLQ $0x05, DX 41431 MOVQ ys+24(FP), DX 41432 MOVQ incy+32(FP), BX 41433 MOVQ BX, SI 41434 SHLQ $0x05, SI 41435 MOVQ n+40(FP), SI 41436 JMP check_limit_unroll 41437 PCALIGN $0x08 41438 NOP 41439 NOP 41440 NOP 41441 NOP 41442 NOP 41443 NOP 41444 41445 loop_unroll: 41446 MOVSS (AX), X1 41447 LEAQ (AX)(CX*4), AX 41448 MOVSS (AX), X2 41449 LEAQ (AX)(CX*4), AX 41450 MOVSS (AX), X3 41451 LEAQ (AX)(CX*4), AX 41452 MOVSS (AX), X4 41453 LEAQ (AX)(CX*4), AX 41454 MOVSS (AX), X5 41455 LEAQ (AX)(CX*4), AX 41456 MOVSS (AX), X6 41457 LEAQ (AX)(CX*4), AX 41458 MOVSS (AX), X7 41459 LEAQ (AX)(CX*4), AX 41460 MOVSS (AX), X8 41461 LEAQ (AX)(CX*4), AX 41462 MULSS X0, X1 41463 MULSS X0, X2 41464 MULSS X0, X3 41465 MULSS X0, X4 41466 MULSS X0, X5 41467 MULSS X0, X6 41468 MULSS X0, X7 41469 MULSS X0, X8 41470 ADDSS (DX), X1 41471 MOVSS X1, (DX) 41472 LEAQ (DX)(BX*4), DX 41473 ADDSS (DX), X2 41474 MOVSS X2, (DX) 41475 LEAQ (DX)(BX*4), DX 41476 ADDSS (DX), X3 41477 MOVSS X3, (DX) 41478 LEAQ (DX)(BX*4), DX 41479 ADDSS (DX), X4 41480 MOVSS X4, (DX) 41481 LEAQ (DX)(BX*4), DX 41482 ADDSS (DX), X5 41483 MOVSS X5, (DX) 41484 LEAQ (DX)(BX*4), DX 41485 ADDSS (DX), X6 41486 MOVSS X6, (DX) 41487 LEAQ (DX)(BX*4), DX 41488 ADDSS (DX), X7 41489 MOVSS X7, (DX) 41490 LEAQ (DX)(BX*4), DX 41491 ADDSS (DX), X8 41492 MOVSS X8, (DX) 41493 LEAQ (DX)(BX*4), DX 41494 SUBQ $0x08, SI 41495 41496 check_limit_unroll: 41497 CMPQ SI, $0x08 41498 JHS loop_unroll 41499 JMP check_limit 41500 41501 loop: 41502 MOVSS (AX), X1 41503 MULSS X0, X1 41504 ADDSS (DX), X1 41505 MOVSS X1, (DX) 41506 DECQ SI 41507 LEAQ (AX)(CX*4), AX 41508 LEAQ (DX)(BX*4), DX 41509 41510 check_limit: 41511 CMPQ SI, $0x00 41512 JHI loop 41513 RET 41514 41515 // func AmdAxpyPointerLoopXInterleave_V1A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 41516 // Requires: SSE 41517 TEXT ·AmdAxpyPointerLoopXInterleave_V1A14U8(SB), NOSPLIT, $0-48 41518 MOVSS alpha+0(FP), X0 41519 MOVQ xs+8(FP), AX 41520 MOVQ incx+16(FP), CX 41521 MOVQ CX, DX 41522 SHLQ $0x05, DX 41523 MOVQ ys+24(FP), DX 41524 MOVQ incy+32(FP), BX 41525 MOVQ BX, SI 41526 SHLQ $0x05, SI 41527 MOVQ n+40(FP), SI 41528 JMP check_limit_unroll 41529 PCALIGN $0x08 41530 NOP 41531 NOP 41532 NOP 41533 NOP 41534 NOP 41535 NOP 41536 41537 
loop_unroll: 41538 MOVSS (AX), X1 41539 LEAQ (AX)(CX*4), AX 41540 MOVSS (AX), X2 41541 LEAQ (AX)(CX*4), AX 41542 MOVSS (AX), X3 41543 LEAQ (AX)(CX*4), AX 41544 MOVSS (AX), X4 41545 LEAQ (AX)(CX*4), AX 41546 MOVSS (AX), X5 41547 LEAQ (AX)(CX*4), AX 41548 MOVSS (AX), X6 41549 LEAQ (AX)(CX*4), AX 41550 MOVSS (AX), X7 41551 LEAQ (AX)(CX*4), AX 41552 MOVSS (AX), X8 41553 LEAQ (AX)(CX*4), AX 41554 MULSS X0, X1 41555 MULSS X0, X2 41556 MULSS X0, X3 41557 MULSS X0, X4 41558 MULSS X0, X5 41559 MULSS X0, X6 41560 MULSS X0, X7 41561 MULSS X0, X8 41562 ADDSS (DX), X1 41563 MOVSS X1, (DX) 41564 LEAQ (DX)(BX*4), DX 41565 ADDSS (DX), X2 41566 MOVSS X2, (DX) 41567 LEAQ (DX)(BX*4), DX 41568 ADDSS (DX), X3 41569 MOVSS X3, (DX) 41570 LEAQ (DX)(BX*4), DX 41571 ADDSS (DX), X4 41572 MOVSS X4, (DX) 41573 LEAQ (DX)(BX*4), DX 41574 ADDSS (DX), X5 41575 MOVSS X5, (DX) 41576 LEAQ (DX)(BX*4), DX 41577 ADDSS (DX), X6 41578 MOVSS X6, (DX) 41579 LEAQ (DX)(BX*4), DX 41580 ADDSS (DX), X7 41581 MOVSS X7, (DX) 41582 LEAQ (DX)(BX*4), DX 41583 ADDSS (DX), X8 41584 MOVSS X8, (DX) 41585 LEAQ (DX)(BX*4), DX 41586 SUBQ $0x08, SI 41587 41588 check_limit_unroll: 41589 CMPQ SI, $0x08 41590 JHS loop_unroll 41591 JMP check_limit 41592 41593 loop: 41594 MOVSS (AX), X1 41595 MULSS X0, X1 41596 ADDSS (DX), X1 41597 MOVSS X1, (DX) 41598 DECQ SI 41599 LEAQ (AX)(CX*4), AX 41600 LEAQ (DX)(BX*4), DX 41601 41602 check_limit: 41603 CMPQ SI, $0x00 41604 JHI loop 41605 RET 41606 41607 // func AmdAxpyPointerLoopXInterleave_V2A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 41608 // Requires: SSE 41609 TEXT ·AmdAxpyPointerLoopXInterleave_V2A14U8(SB), NOSPLIT, $0-48 41610 MOVSS alpha+0(FP), X0 41611 MOVQ xs+8(FP), AX 41612 MOVQ incx+16(FP), CX 41613 MOVQ CX, DX 41614 SHLQ $0x05, DX 41615 MOVQ ys+24(FP), DX 41616 MOVQ incy+32(FP), BX 41617 MOVQ BX, SI 41618 SHLQ $0x05, SI 41619 MOVQ n+40(FP), SI 41620 JMP check_limit_unroll 41621 PCALIGN $0x08 41622 NOP 41623 NOP 41624 NOP 41625 NOP 41626 NOP 41627 NOP 41628 41629 loop_unroll: 41630 MOVSS (AX), X1 41631 LEAQ (AX)(CX*4), AX 41632 MOVSS (AX), X2 41633 LEAQ (AX)(CX*4), AX 41634 MOVSS (AX), X3 41635 LEAQ (AX)(CX*4), AX 41636 MOVSS (AX), X4 41637 LEAQ (AX)(CX*4), AX 41638 MOVSS (AX), X5 41639 LEAQ (AX)(CX*4), AX 41640 MOVSS (AX), X6 41641 LEAQ (AX)(CX*4), AX 41642 MOVSS (AX), X7 41643 LEAQ (AX)(CX*4), AX 41644 MOVSS (AX), X8 41645 LEAQ (AX)(CX*4), AX 41646 MULSS X0, X1 41647 MULSS X0, X2 41648 MULSS X0, X3 41649 MULSS X0, X4 41650 MULSS X0, X5 41651 MULSS X0, X6 41652 MULSS X0, X7 41653 MULSS X0, X8 41654 ADDSS (DX), X1 41655 MOVSS X1, (DX) 41656 LEAQ (DX)(BX*4), DX 41657 ADDSS (DX), X2 41658 MOVSS X2, (DX) 41659 LEAQ (DX)(BX*4), DX 41660 ADDSS (DX), X3 41661 MOVSS X3, (DX) 41662 LEAQ (DX)(BX*4), DX 41663 ADDSS (DX), X4 41664 MOVSS X4, (DX) 41665 LEAQ (DX)(BX*4), DX 41666 ADDSS (DX), X5 41667 MOVSS X5, (DX) 41668 LEAQ (DX)(BX*4), DX 41669 ADDSS (DX), X6 41670 MOVSS X6, (DX) 41671 LEAQ (DX)(BX*4), DX 41672 ADDSS (DX), X7 41673 MOVSS X7, (DX) 41674 LEAQ (DX)(BX*4), DX 41675 ADDSS (DX), X8 41676 MOVSS X8, (DX) 41677 LEAQ (DX)(BX*4), DX 41678 SUBQ $0x08, SI 41679 41680 check_limit_unroll: 41681 CMPQ SI, $0x08 41682 JHS loop_unroll 41683 JMP check_limit 41684 41685 loop: 41686 MOVSS (AX), X1 41687 MULSS X0, X1 41688 ADDSS (DX), X1 41689 MOVSS X1, (DX) 41690 DECQ SI 41691 LEAQ (AX)(CX*4), AX 41692 LEAQ (DX)(BX*4), DX 41693 41694 check_limit: 41695 CMPQ SI, $0x00 41696 JHI loop 41697 RET 41698 41699 // func AmdAxpyPointerLoopXInterleave_V3A14U8(alpha float32, xs *float32, 
incx uintptr, ys *float32, incy uintptr, n uintptr) 41700 // Requires: SSE 41701 TEXT ·AmdAxpyPointerLoopXInterleave_V3A14U8(SB), NOSPLIT, $0-48 41702 MOVSS alpha+0(FP), X0 41703 MOVQ xs+8(FP), AX 41704 MOVQ incx+16(FP), CX 41705 MOVQ CX, DX 41706 SHLQ $0x05, DX 41707 MOVQ ys+24(FP), DX 41708 MOVQ incy+32(FP), BX 41709 MOVQ BX, SI 41710 SHLQ $0x05, SI 41711 MOVQ n+40(FP), SI 41712 JMP check_limit_unroll 41713 PCALIGN $0x08 41714 NOP 41715 NOP 41716 NOP 41717 NOP 41718 NOP 41719 NOP 41720 41721 loop_unroll: 41722 MOVSS (AX), X1 41723 LEAQ (AX)(CX*4), AX 41724 MOVSS (AX), X2 41725 LEAQ (AX)(CX*4), AX 41726 MOVSS (AX), X3 41727 LEAQ (AX)(CX*4), AX 41728 MOVSS (AX), X4 41729 LEAQ (AX)(CX*4), AX 41730 MOVSS (AX), X5 41731 LEAQ (AX)(CX*4), AX 41732 MOVSS (AX), X6 41733 LEAQ (AX)(CX*4), AX 41734 MOVSS (AX), X7 41735 LEAQ (AX)(CX*4), AX 41736 MOVSS (AX), X8 41737 LEAQ (AX)(CX*4), AX 41738 MULSS X0, X1 41739 MULSS X0, X2 41740 MULSS X0, X3 41741 MULSS X0, X4 41742 MULSS X0, X5 41743 MULSS X0, X6 41744 MULSS X0, X7 41745 MULSS X0, X8 41746 ADDSS (DX), X1 41747 MOVSS X1, (DX) 41748 LEAQ (DX)(BX*4), DX 41749 ADDSS (DX), X2 41750 MOVSS X2, (DX) 41751 LEAQ (DX)(BX*4), DX 41752 ADDSS (DX), X3 41753 MOVSS X3, (DX) 41754 LEAQ (DX)(BX*4), DX 41755 ADDSS (DX), X4 41756 MOVSS X4, (DX) 41757 LEAQ (DX)(BX*4), DX 41758 ADDSS (DX), X5 41759 MOVSS X5, (DX) 41760 LEAQ (DX)(BX*4), DX 41761 ADDSS (DX), X6 41762 MOVSS X6, (DX) 41763 LEAQ (DX)(BX*4), DX 41764 ADDSS (DX), X7 41765 MOVSS X7, (DX) 41766 LEAQ (DX)(BX*4), DX 41767 ADDSS (DX), X8 41768 MOVSS X8, (DX) 41769 LEAQ (DX)(BX*4), DX 41770 SUBQ $0x08, SI 41771 41772 check_limit_unroll: 41773 CMPQ SI, $0x08 41774 JHS loop_unroll 41775 JMP check_limit 41776 41777 loop: 41778 MOVSS (AX), X1 41779 MULSS X0, X1 41780 ADDSS (DX), X1 41781 MOVSS X1, (DX) 41782 DECQ SI 41783 LEAQ (AX)(CX*4), AX 41784 LEAQ (DX)(BX*4), DX 41785 41786 check_limit: 41787 CMPQ SI, $0x00 41788 JHI loop 41789 RET 41790 41791 // func AmdAxpyPointerLoopXInterleave_V4A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 41792 // Requires: SSE 41793 TEXT ·AmdAxpyPointerLoopXInterleave_V4A14U8(SB), NOSPLIT, $0-48 41794 MOVSS alpha+0(FP), X0 41795 MOVQ xs+8(FP), AX 41796 MOVQ incx+16(FP), CX 41797 MOVQ CX, DX 41798 SHLQ $0x05, DX 41799 MOVQ ys+24(FP), DX 41800 MOVQ incy+32(FP), BX 41801 MOVQ BX, SI 41802 SHLQ $0x05, SI 41803 MOVQ n+40(FP), SI 41804 JMP check_limit_unroll 41805 PCALIGN $0x08 41806 NOP 41807 NOP 41808 NOP 41809 NOP 41810 NOP 41811 NOP 41812 41813 loop_unroll: 41814 MOVSS (AX), X1 41815 LEAQ (AX)(CX*4), AX 41816 MOVSS (AX), X2 41817 LEAQ (AX)(CX*4), AX 41818 MOVSS (AX), X3 41819 LEAQ (AX)(CX*4), AX 41820 MOVSS (AX), X4 41821 LEAQ (AX)(CX*4), AX 41822 MOVSS (AX), X5 41823 LEAQ (AX)(CX*4), AX 41824 MOVSS (AX), X6 41825 LEAQ (AX)(CX*4), AX 41826 MOVSS (AX), X7 41827 LEAQ (AX)(CX*4), AX 41828 MOVSS (AX), X8 41829 LEAQ (AX)(CX*4), AX 41830 MULSS X0, X1 41831 MULSS X0, X2 41832 MULSS X0, X3 41833 MULSS X0, X4 41834 MULSS X0, X5 41835 MULSS X0, X6 41836 MULSS X0, X7 41837 MULSS X0, X8 41838 ADDSS (DX), X1 41839 MOVSS X1, (DX) 41840 LEAQ (DX)(BX*4), DX 41841 ADDSS (DX), X2 41842 MOVSS X2, (DX) 41843 LEAQ (DX)(BX*4), DX 41844 ADDSS (DX), X3 41845 MOVSS X3, (DX) 41846 LEAQ (DX)(BX*4), DX 41847 ADDSS (DX), X4 41848 MOVSS X4, (DX) 41849 LEAQ (DX)(BX*4), DX 41850 ADDSS (DX), X5 41851 MOVSS X5, (DX) 41852 LEAQ (DX)(BX*4), DX 41853 ADDSS (DX), X6 41854 MOVSS X6, (DX) 41855 LEAQ (DX)(BX*4), DX 41856 ADDSS (DX), X7 41857 MOVSS X7, (DX) 41858 LEAQ (DX)(BX*4), DX 41859 ADDSS 
(DX), X8 41860 MOVSS X8, (DX) 41861 LEAQ (DX)(BX*4), DX 41862 SUBQ $0x08, SI 41863 41864 check_limit_unroll: 41865 CMPQ SI, $0x08 41866 JHS loop_unroll 41867 JMP check_limit 41868 41869 loop: 41870 MOVSS (AX), X1 41871 MULSS X0, X1 41872 ADDSS (DX), X1 41873 MOVSS X1, (DX) 41874 DECQ SI 41875 LEAQ (AX)(CX*4), AX 41876 LEAQ (DX)(BX*4), DX 41877 41878 check_limit: 41879 CMPQ SI, $0x00 41880 JHI loop 41881 RET 41882 41883 // func AmdAxpyPointerLoopXInterleave_V5A14U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 41884 // Requires: SSE 41885 TEXT ·AmdAxpyPointerLoopXInterleave_V5A14U8(SB), NOSPLIT, $0-48 41886 MOVSS alpha+0(FP), X0 41887 MOVQ xs+8(FP), AX 41888 MOVQ incx+16(FP), CX 41889 MOVQ CX, DX 41890 SHLQ $0x05, DX 41891 MOVQ ys+24(FP), DX 41892 MOVQ incy+32(FP), BX 41893 MOVQ BX, SI 41894 SHLQ $0x05, SI 41895 MOVQ n+40(FP), SI 41896 JMP check_limit_unroll 41897 PCALIGN $0x08 41898 NOP 41899 NOP 41900 NOP 41901 NOP 41902 NOP 41903 NOP 41904 41905 loop_unroll: 41906 MOVSS (AX), X1 41907 LEAQ (AX)(CX*4), AX 41908 MOVSS (AX), X2 41909 LEAQ (AX)(CX*4), AX 41910 MOVSS (AX), X3 41911 LEAQ (AX)(CX*4), AX 41912 MOVSS (AX), X4 41913 LEAQ (AX)(CX*4), AX 41914 MOVSS (AX), X5 41915 LEAQ (AX)(CX*4), AX 41916 MOVSS (AX), X6 41917 LEAQ (AX)(CX*4), AX 41918 MOVSS (AX), X7 41919 LEAQ (AX)(CX*4), AX 41920 MOVSS (AX), X8 41921 LEAQ (AX)(CX*4), AX 41922 MULSS X0, X1 41923 MULSS X0, X2 41924 MULSS X0, X3 41925 MULSS X0, X4 41926 MULSS X0, X5 41927 MULSS X0, X6 41928 MULSS X0, X7 41929 MULSS X0, X8 41930 ADDSS (DX), X1 41931 MOVSS X1, (DX) 41932 LEAQ (DX)(BX*4), DX 41933 ADDSS (DX), X2 41934 MOVSS X2, (DX) 41935 LEAQ (DX)(BX*4), DX 41936 ADDSS (DX), X3 41937 MOVSS X3, (DX) 41938 LEAQ (DX)(BX*4), DX 41939 ADDSS (DX), X4 41940 MOVSS X4, (DX) 41941 LEAQ (DX)(BX*4), DX 41942 ADDSS (DX), X5 41943 MOVSS X5, (DX) 41944 LEAQ (DX)(BX*4), DX 41945 ADDSS (DX), X6 41946 MOVSS X6, (DX) 41947 LEAQ (DX)(BX*4), DX 41948 ADDSS (DX), X7 41949 MOVSS X7, (DX) 41950 LEAQ (DX)(BX*4), DX 41951 ADDSS (DX), X8 41952 MOVSS X8, (DX) 41953 LEAQ (DX)(BX*4), DX 41954 SUBQ $0x08, SI 41955 41956 check_limit_unroll: 41957 CMPQ SI, $0x08 41958 JHS loop_unroll 41959 JMP check_limit 41960 41961 loop: 41962 MOVSS (AX), X1 41963 MULSS X0, X1 41964 ADDSS (DX), X1 41965 MOVSS X1, (DX) 41966 DECQ SI 41967 LEAQ (AX)(CX*4), AX 41968 LEAQ (DX)(BX*4), DX 41969 41970 check_limit: 41971 CMPQ SI, $0x00 41972 JHI loop 41973 RET 41974 41975 // func AmdAxpyPointerLoopXInterleave_V0A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 41976 // Requires: SSE 41977 TEXT ·AmdAxpyPointerLoopXInterleave_V0A15U8(SB), NOSPLIT, $0-48 41978 MOVSS alpha+0(FP), X0 41979 MOVQ xs+8(FP), AX 41980 MOVQ incx+16(FP), CX 41981 MOVQ CX, DX 41982 SHLQ $0x05, DX 41983 MOVQ ys+24(FP), DX 41984 MOVQ incy+32(FP), BX 41985 MOVQ BX, SI 41986 SHLQ $0x05, SI 41987 MOVQ n+40(FP), SI 41988 JMP check_limit_unroll 41989 PCALIGN $0x08 41990 NOP 41991 NOP 41992 NOP 41993 NOP 41994 NOP 41995 NOP 41996 NOP 41997 41998 loop_unroll: 41999 MOVSS (AX), X1 42000 LEAQ (AX)(CX*4), AX 42001 MOVSS (AX), X2 42002 LEAQ (AX)(CX*4), AX 42003 MOVSS (AX), X3 42004 LEAQ (AX)(CX*4), AX 42005 MOVSS (AX), X4 42006 LEAQ (AX)(CX*4), AX 42007 MOVSS (AX), X5 42008 LEAQ (AX)(CX*4), AX 42009 MOVSS (AX), X6 42010 LEAQ (AX)(CX*4), AX 42011 MOVSS (AX), X7 42012 LEAQ (AX)(CX*4), AX 42013 MOVSS (AX), X8 42014 LEAQ (AX)(CX*4), AX 42015 MULSS X0, X1 42016 MULSS X0, X2 42017 MULSS X0, X3 42018 MULSS X0, X4 42019 MULSS X0, X5 42020 MULSS X0, X6 42021 MULSS X0, X7 
42022 MULSS X0, X8 42023 ADDSS (DX), X1 42024 MOVSS X1, (DX) 42025 LEAQ (DX)(BX*4), DX 42026 ADDSS (DX), X2 42027 MOVSS X2, (DX) 42028 LEAQ (DX)(BX*4), DX 42029 ADDSS (DX), X3 42030 MOVSS X3, (DX) 42031 LEAQ (DX)(BX*4), DX 42032 ADDSS (DX), X4 42033 MOVSS X4, (DX) 42034 LEAQ (DX)(BX*4), DX 42035 ADDSS (DX), X5 42036 MOVSS X5, (DX) 42037 LEAQ (DX)(BX*4), DX 42038 ADDSS (DX), X6 42039 MOVSS X6, (DX) 42040 LEAQ (DX)(BX*4), DX 42041 ADDSS (DX), X7 42042 MOVSS X7, (DX) 42043 LEAQ (DX)(BX*4), DX 42044 ADDSS (DX), X8 42045 MOVSS X8, (DX) 42046 LEAQ (DX)(BX*4), DX 42047 SUBQ $0x08, SI 42048 42049 check_limit_unroll: 42050 CMPQ SI, $0x08 42051 JHS loop_unroll 42052 JMP check_limit 42053 42054 loop: 42055 MOVSS (AX), X1 42056 MULSS X0, X1 42057 ADDSS (DX), X1 42058 MOVSS X1, (DX) 42059 DECQ SI 42060 LEAQ (AX)(CX*4), AX 42061 LEAQ (DX)(BX*4), DX 42062 42063 check_limit: 42064 CMPQ SI, $0x00 42065 JHI loop 42066 RET 42067 42068 // func AmdAxpyPointerLoopXInterleave_V1A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 42069 // Requires: SSE 42070 TEXT ·AmdAxpyPointerLoopXInterleave_V1A15U8(SB), NOSPLIT, $0-48 42071 MOVSS alpha+0(FP), X0 42072 MOVQ xs+8(FP), AX 42073 MOVQ incx+16(FP), CX 42074 MOVQ CX, DX 42075 SHLQ $0x05, DX 42076 MOVQ ys+24(FP), DX 42077 MOVQ incy+32(FP), BX 42078 MOVQ BX, SI 42079 SHLQ $0x05, SI 42080 MOVQ n+40(FP), SI 42081 JMP check_limit_unroll 42082 PCALIGN $0x08 42083 NOP 42084 NOP 42085 NOP 42086 NOP 42087 NOP 42088 NOP 42089 NOP 42090 42091 loop_unroll: 42092 MOVSS (AX), X1 42093 LEAQ (AX)(CX*4), AX 42094 MOVSS (AX), X2 42095 LEAQ (AX)(CX*4), AX 42096 MOVSS (AX), X3 42097 LEAQ (AX)(CX*4), AX 42098 MOVSS (AX), X4 42099 LEAQ (AX)(CX*4), AX 42100 MOVSS (AX), X5 42101 LEAQ (AX)(CX*4), AX 42102 MOVSS (AX), X6 42103 LEAQ (AX)(CX*4), AX 42104 MOVSS (AX), X7 42105 LEAQ (AX)(CX*4), AX 42106 MOVSS (AX), X8 42107 LEAQ (AX)(CX*4), AX 42108 MULSS X0, X1 42109 MULSS X0, X2 42110 MULSS X0, X3 42111 MULSS X0, X4 42112 MULSS X0, X5 42113 MULSS X0, X6 42114 MULSS X0, X7 42115 MULSS X0, X8 42116 ADDSS (DX), X1 42117 MOVSS X1, (DX) 42118 LEAQ (DX)(BX*4), DX 42119 ADDSS (DX), X2 42120 MOVSS X2, (DX) 42121 LEAQ (DX)(BX*4), DX 42122 ADDSS (DX), X3 42123 MOVSS X3, (DX) 42124 LEAQ (DX)(BX*4), DX 42125 ADDSS (DX), X4 42126 MOVSS X4, (DX) 42127 LEAQ (DX)(BX*4), DX 42128 ADDSS (DX), X5 42129 MOVSS X5, (DX) 42130 LEAQ (DX)(BX*4), DX 42131 ADDSS (DX), X6 42132 MOVSS X6, (DX) 42133 LEAQ (DX)(BX*4), DX 42134 ADDSS (DX), X7 42135 MOVSS X7, (DX) 42136 LEAQ (DX)(BX*4), DX 42137 ADDSS (DX), X8 42138 MOVSS X8, (DX) 42139 LEAQ (DX)(BX*4), DX 42140 SUBQ $0x08, SI 42141 42142 check_limit_unroll: 42143 CMPQ SI, $0x08 42144 JHS loop_unroll 42145 JMP check_limit 42146 42147 loop: 42148 MOVSS (AX), X1 42149 MULSS X0, X1 42150 ADDSS (DX), X1 42151 MOVSS X1, (DX) 42152 DECQ SI 42153 LEAQ (AX)(CX*4), AX 42154 LEAQ (DX)(BX*4), DX 42155 42156 check_limit: 42157 CMPQ SI, $0x00 42158 JHI loop 42159 RET 42160 42161 // func AmdAxpyPointerLoopXInterleave_V2A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 42162 // Requires: SSE 42163 TEXT ·AmdAxpyPointerLoopXInterleave_V2A15U8(SB), NOSPLIT, $0-48 42164 MOVSS alpha+0(FP), X0 42165 MOVQ xs+8(FP), AX 42166 MOVQ incx+16(FP), CX 42167 MOVQ CX, DX 42168 SHLQ $0x05, DX 42169 MOVQ ys+24(FP), DX 42170 MOVQ incy+32(FP), BX 42171 MOVQ BX, SI 42172 SHLQ $0x05, SI 42173 MOVQ n+40(FP), SI 42174 JMP check_limit_unroll 42175 PCALIGN $0x08 42176 NOP 42177 NOP 42178 NOP 42179 NOP 42180 NOP 42181 NOP 42182 NOP 42183 42184 
loop_unroll: 42185 MOVSS (AX), X1 42186 LEAQ (AX)(CX*4), AX 42187 MOVSS (AX), X2 42188 LEAQ (AX)(CX*4), AX 42189 MOVSS (AX), X3 42190 LEAQ (AX)(CX*4), AX 42191 MOVSS (AX), X4 42192 LEAQ (AX)(CX*4), AX 42193 MOVSS (AX), X5 42194 LEAQ (AX)(CX*4), AX 42195 MOVSS (AX), X6 42196 LEAQ (AX)(CX*4), AX 42197 MOVSS (AX), X7 42198 LEAQ (AX)(CX*4), AX 42199 MOVSS (AX), X8 42200 LEAQ (AX)(CX*4), AX 42201 MULSS X0, X1 42202 MULSS X0, X2 42203 MULSS X0, X3 42204 MULSS X0, X4 42205 MULSS X0, X5 42206 MULSS X0, X6 42207 MULSS X0, X7 42208 MULSS X0, X8 42209 ADDSS (DX), X1 42210 MOVSS X1, (DX) 42211 LEAQ (DX)(BX*4), DX 42212 ADDSS (DX), X2 42213 MOVSS X2, (DX) 42214 LEAQ (DX)(BX*4), DX 42215 ADDSS (DX), X3 42216 MOVSS X3, (DX) 42217 LEAQ (DX)(BX*4), DX 42218 ADDSS (DX), X4 42219 MOVSS X4, (DX) 42220 LEAQ (DX)(BX*4), DX 42221 ADDSS (DX), X5 42222 MOVSS X5, (DX) 42223 LEAQ (DX)(BX*4), DX 42224 ADDSS (DX), X6 42225 MOVSS X6, (DX) 42226 LEAQ (DX)(BX*4), DX 42227 ADDSS (DX), X7 42228 MOVSS X7, (DX) 42229 LEAQ (DX)(BX*4), DX 42230 ADDSS (DX), X8 42231 MOVSS X8, (DX) 42232 LEAQ (DX)(BX*4), DX 42233 SUBQ $0x08, SI 42234 42235 check_limit_unroll: 42236 CMPQ SI, $0x08 42237 JHS loop_unroll 42238 JMP check_limit 42239 42240 loop: 42241 MOVSS (AX), X1 42242 MULSS X0, X1 42243 ADDSS (DX), X1 42244 MOVSS X1, (DX) 42245 DECQ SI 42246 LEAQ (AX)(CX*4), AX 42247 LEAQ (DX)(BX*4), DX 42248 42249 check_limit: 42250 CMPQ SI, $0x00 42251 JHI loop 42252 RET 42253 42254 // func AmdAxpyPointerLoopXInterleave_V3A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 42255 // Requires: SSE 42256 TEXT ·AmdAxpyPointerLoopXInterleave_V3A15U8(SB), NOSPLIT, $0-48 42257 MOVSS alpha+0(FP), X0 42258 MOVQ xs+8(FP), AX 42259 MOVQ incx+16(FP), CX 42260 MOVQ CX, DX 42261 SHLQ $0x05, DX 42262 MOVQ ys+24(FP), DX 42263 MOVQ incy+32(FP), BX 42264 MOVQ BX, SI 42265 SHLQ $0x05, SI 42266 MOVQ n+40(FP), SI 42267 JMP check_limit_unroll 42268 PCALIGN $0x08 42269 NOP 42270 NOP 42271 NOP 42272 NOP 42273 NOP 42274 NOP 42275 NOP 42276 42277 loop_unroll: 42278 MOVSS (AX), X1 42279 LEAQ (AX)(CX*4), AX 42280 MOVSS (AX), X2 42281 LEAQ (AX)(CX*4), AX 42282 MOVSS (AX), X3 42283 LEAQ (AX)(CX*4), AX 42284 MOVSS (AX), X4 42285 LEAQ (AX)(CX*4), AX 42286 MOVSS (AX), X5 42287 LEAQ (AX)(CX*4), AX 42288 MOVSS (AX), X6 42289 LEAQ (AX)(CX*4), AX 42290 MOVSS (AX), X7 42291 LEAQ (AX)(CX*4), AX 42292 MOVSS (AX), X8 42293 LEAQ (AX)(CX*4), AX 42294 MULSS X0, X1 42295 MULSS X0, X2 42296 MULSS X0, X3 42297 MULSS X0, X4 42298 MULSS X0, X5 42299 MULSS X0, X6 42300 MULSS X0, X7 42301 MULSS X0, X8 42302 ADDSS (DX), X1 42303 MOVSS X1, (DX) 42304 LEAQ (DX)(BX*4), DX 42305 ADDSS (DX), X2 42306 MOVSS X2, (DX) 42307 LEAQ (DX)(BX*4), DX 42308 ADDSS (DX), X3 42309 MOVSS X3, (DX) 42310 LEAQ (DX)(BX*4), DX 42311 ADDSS (DX), X4 42312 MOVSS X4, (DX) 42313 LEAQ (DX)(BX*4), DX 42314 ADDSS (DX), X5 42315 MOVSS X5, (DX) 42316 LEAQ (DX)(BX*4), DX 42317 ADDSS (DX), X6 42318 MOVSS X6, (DX) 42319 LEAQ (DX)(BX*4), DX 42320 ADDSS (DX), X7 42321 MOVSS X7, (DX) 42322 LEAQ (DX)(BX*4), DX 42323 ADDSS (DX), X8 42324 MOVSS X8, (DX) 42325 LEAQ (DX)(BX*4), DX 42326 SUBQ $0x08, SI 42327 42328 check_limit_unroll: 42329 CMPQ SI, $0x08 42330 JHS loop_unroll 42331 JMP check_limit 42332 42333 loop: 42334 MOVSS (AX), X1 42335 MULSS X0, X1 42336 ADDSS (DX), X1 42337 MOVSS X1, (DX) 42338 DECQ SI 42339 LEAQ (AX)(CX*4), AX 42340 LEAQ (DX)(BX*4), DX 42341 42342 check_limit: 42343 CMPQ SI, $0x00 42344 JHI loop 42345 RET 42346 42347 // func AmdAxpyPointerLoopXInterleave_V4A15U8(alpha float32, xs 
*float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 42348 // Requires: SSE 42349 TEXT ·AmdAxpyPointerLoopXInterleave_V4A15U8(SB), NOSPLIT, $0-48 42350 MOVSS alpha+0(FP), X0 42351 MOVQ xs+8(FP), AX 42352 MOVQ incx+16(FP), CX 42353 MOVQ CX, DX 42354 SHLQ $0x05, DX 42355 MOVQ ys+24(FP), DX 42356 MOVQ incy+32(FP), BX 42357 MOVQ BX, SI 42358 SHLQ $0x05, SI 42359 MOVQ n+40(FP), SI 42360 JMP check_limit_unroll 42361 PCALIGN $0x08 42362 NOP 42363 NOP 42364 NOP 42365 NOP 42366 NOP 42367 NOP 42368 NOP 42369 42370 loop_unroll: 42371 MOVSS (AX), X1 42372 LEAQ (AX)(CX*4), AX 42373 MOVSS (AX), X2 42374 LEAQ (AX)(CX*4), AX 42375 MOVSS (AX), X3 42376 LEAQ (AX)(CX*4), AX 42377 MOVSS (AX), X4 42378 LEAQ (AX)(CX*4), AX 42379 MOVSS (AX), X5 42380 LEAQ (AX)(CX*4), AX 42381 MOVSS (AX), X6 42382 LEAQ (AX)(CX*4), AX 42383 MOVSS (AX), X7 42384 LEAQ (AX)(CX*4), AX 42385 MOVSS (AX), X8 42386 LEAQ (AX)(CX*4), AX 42387 MULSS X0, X1 42388 MULSS X0, X2 42389 MULSS X0, X3 42390 MULSS X0, X4 42391 MULSS X0, X5 42392 MULSS X0, X6 42393 MULSS X0, X7 42394 MULSS X0, X8 42395 ADDSS (DX), X1 42396 MOVSS X1, (DX) 42397 LEAQ (DX)(BX*4), DX 42398 ADDSS (DX), X2 42399 MOVSS X2, (DX) 42400 LEAQ (DX)(BX*4), DX 42401 ADDSS (DX), X3 42402 MOVSS X3, (DX) 42403 LEAQ (DX)(BX*4), DX 42404 ADDSS (DX), X4 42405 MOVSS X4, (DX) 42406 LEAQ (DX)(BX*4), DX 42407 ADDSS (DX), X5 42408 MOVSS X5, (DX) 42409 LEAQ (DX)(BX*4), DX 42410 ADDSS (DX), X6 42411 MOVSS X6, (DX) 42412 LEAQ (DX)(BX*4), DX 42413 ADDSS (DX), X7 42414 MOVSS X7, (DX) 42415 LEAQ (DX)(BX*4), DX 42416 ADDSS (DX), X8 42417 MOVSS X8, (DX) 42418 LEAQ (DX)(BX*4), DX 42419 SUBQ $0x08, SI 42420 42421 check_limit_unroll: 42422 CMPQ SI, $0x08 42423 JHS loop_unroll 42424 JMP check_limit 42425 42426 loop: 42427 MOVSS (AX), X1 42428 MULSS X0, X1 42429 ADDSS (DX), X1 42430 MOVSS X1, (DX) 42431 DECQ SI 42432 LEAQ (AX)(CX*4), AX 42433 LEAQ (DX)(BX*4), DX 42434 42435 check_limit: 42436 CMPQ SI, $0x00 42437 JHI loop 42438 RET 42439 42440 // func AmdAxpyPointerLoopXInterleave_V5A15U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 42441 // Requires: SSE 42442 TEXT ·AmdAxpyPointerLoopXInterleave_V5A15U8(SB), NOSPLIT, $0-48 42443 MOVSS alpha+0(FP), X0 42444 MOVQ xs+8(FP), AX 42445 MOVQ incx+16(FP), CX 42446 MOVQ CX, DX 42447 SHLQ $0x05, DX 42448 MOVQ ys+24(FP), DX 42449 MOVQ incy+32(FP), BX 42450 MOVQ BX, SI 42451 SHLQ $0x05, SI 42452 MOVQ n+40(FP), SI 42453 JMP check_limit_unroll 42454 PCALIGN $0x08 42455 NOP 42456 NOP 42457 NOP 42458 NOP 42459 NOP 42460 NOP 42461 NOP 42462 42463 loop_unroll: 42464 MOVSS (AX), X1 42465 LEAQ (AX)(CX*4), AX 42466 MOVSS (AX), X2 42467 LEAQ (AX)(CX*4), AX 42468 MOVSS (AX), X3 42469 LEAQ (AX)(CX*4), AX 42470 MOVSS (AX), X4 42471 LEAQ (AX)(CX*4), AX 42472 MOVSS (AX), X5 42473 LEAQ (AX)(CX*4), AX 42474 MOVSS (AX), X6 42475 LEAQ (AX)(CX*4), AX 42476 MOVSS (AX), X7 42477 LEAQ (AX)(CX*4), AX 42478 MOVSS (AX), X8 42479 LEAQ (AX)(CX*4), AX 42480 MULSS X0, X1 42481 MULSS X0, X2 42482 MULSS X0, X3 42483 MULSS X0, X4 42484 MULSS X0, X5 42485 MULSS X0, X6 42486 MULSS X0, X7 42487 MULSS X0, X8 42488 ADDSS (DX), X1 42489 MOVSS X1, (DX) 42490 LEAQ (DX)(BX*4), DX 42491 ADDSS (DX), X2 42492 MOVSS X2, (DX) 42493 LEAQ (DX)(BX*4), DX 42494 ADDSS (DX), X3 42495 MOVSS X3, (DX) 42496 LEAQ (DX)(BX*4), DX 42497 ADDSS (DX), X4 42498 MOVSS X4, (DX) 42499 LEAQ (DX)(BX*4), DX 42500 ADDSS (DX), X5 42501 MOVSS X5, (DX) 42502 LEAQ (DX)(BX*4), DX 42503 ADDSS (DX), X6 42504 MOVSS X6, (DX) 42505 LEAQ (DX)(BX*4), DX 42506 ADDSS (DX), X7 42507 MOVSS X7, (DX) 42508 
LEAQ (DX)(BX*4), DX 42509 ADDSS (DX), X8 42510 MOVSS X8, (DX) 42511 LEAQ (DX)(BX*4), DX 42512 SUBQ $0x08, SI 42513 42514 check_limit_unroll: 42515 CMPQ SI, $0x08 42516 JHS loop_unroll 42517 JMP check_limit 42518 42519 loop: 42520 MOVSS (AX), X1 42521 MULSS X0, X1 42522 ADDSS (DX), X1 42523 MOVSS X1, (DX) 42524 DECQ SI 42525 LEAQ (AX)(CX*4), AX 42526 LEAQ (DX)(BX*4), DX 42527 42528 check_limit: 42529 CMPQ SI, $0x00 42530 JHI loop 42531 RET 42532 42533 // func AmdAxpyPointerLoopXInterleave_V0A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 42534 // Requires: SSE 42535 TEXT ·AmdAxpyPointerLoopXInterleave_V0A16U8(SB), NOSPLIT, $0-48 42536 MOVSS alpha+0(FP), X0 42537 MOVQ xs+8(FP), AX 42538 MOVQ incx+16(FP), CX 42539 MOVQ CX, DX 42540 SHLQ $0x05, DX 42541 MOVQ ys+24(FP), DX 42542 MOVQ incy+32(FP), BX 42543 MOVQ BX, SI 42544 SHLQ $0x05, SI 42545 MOVQ n+40(FP), SI 42546 JMP check_limit_unroll 42547 PCALIGN $0x10 42548 42549 loop_unroll: 42550 MOVSS (AX), X1 42551 LEAQ (AX)(CX*4), AX 42552 MOVSS (AX), X2 42553 LEAQ (AX)(CX*4), AX 42554 MOVSS (AX), X3 42555 LEAQ (AX)(CX*4), AX 42556 MOVSS (AX), X4 42557 LEAQ (AX)(CX*4), AX 42558 MOVSS (AX), X5 42559 LEAQ (AX)(CX*4), AX 42560 MOVSS (AX), X6 42561 LEAQ (AX)(CX*4), AX 42562 MOVSS (AX), X7 42563 LEAQ (AX)(CX*4), AX 42564 MOVSS (AX), X8 42565 LEAQ (AX)(CX*4), AX 42566 MULSS X0, X1 42567 MULSS X0, X2 42568 MULSS X0, X3 42569 MULSS X0, X4 42570 MULSS X0, X5 42571 MULSS X0, X6 42572 MULSS X0, X7 42573 MULSS X0, X8 42574 ADDSS (DX), X1 42575 MOVSS X1, (DX) 42576 LEAQ (DX)(BX*4), DX 42577 ADDSS (DX), X2 42578 MOVSS X2, (DX) 42579 LEAQ (DX)(BX*4), DX 42580 ADDSS (DX), X3 42581 MOVSS X3, (DX) 42582 LEAQ (DX)(BX*4), DX 42583 ADDSS (DX), X4 42584 MOVSS X4, (DX) 42585 LEAQ (DX)(BX*4), DX 42586 ADDSS (DX), X5 42587 MOVSS X5, (DX) 42588 LEAQ (DX)(BX*4), DX 42589 ADDSS (DX), X6 42590 MOVSS X6, (DX) 42591 LEAQ (DX)(BX*4), DX 42592 ADDSS (DX), X7 42593 MOVSS X7, (DX) 42594 LEAQ (DX)(BX*4), DX 42595 ADDSS (DX), X8 42596 MOVSS X8, (DX) 42597 LEAQ (DX)(BX*4), DX 42598 SUBQ $0x08, SI 42599 42600 check_limit_unroll: 42601 CMPQ SI, $0x08 42602 JHS loop_unroll 42603 JMP check_limit 42604 42605 loop: 42606 MOVSS (AX), X1 42607 MULSS X0, X1 42608 ADDSS (DX), X1 42609 MOVSS X1, (DX) 42610 DECQ SI 42611 LEAQ (AX)(CX*4), AX 42612 LEAQ (DX)(BX*4), DX 42613 42614 check_limit: 42615 CMPQ SI, $0x00 42616 JHI loop 42617 RET 42618 42619 // func AmdAxpyPointerLoopXInterleave_V1A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 42620 // Requires: SSE 42621 TEXT ·AmdAxpyPointerLoopXInterleave_V1A16U8(SB), NOSPLIT, $0-48 42622 MOVSS alpha+0(FP), X0 42623 MOVQ xs+8(FP), AX 42624 MOVQ incx+16(FP), CX 42625 MOVQ CX, DX 42626 SHLQ $0x05, DX 42627 MOVQ ys+24(FP), DX 42628 MOVQ incy+32(FP), BX 42629 MOVQ BX, SI 42630 SHLQ $0x05, SI 42631 MOVQ n+40(FP), SI 42632 JMP check_limit_unroll 42633 PCALIGN $0x10 42634 42635 loop_unroll: 42636 MOVSS (AX), X1 42637 LEAQ (AX)(CX*4), AX 42638 MOVSS (AX), X2 42639 LEAQ (AX)(CX*4), AX 42640 MOVSS (AX), X3 42641 LEAQ (AX)(CX*4), AX 42642 MOVSS (AX), X4 42643 LEAQ (AX)(CX*4), AX 42644 MOVSS (AX), X5 42645 LEAQ (AX)(CX*4), AX 42646 MOVSS (AX), X6 42647 LEAQ (AX)(CX*4), AX 42648 MOVSS (AX), X7 42649 LEAQ (AX)(CX*4), AX 42650 MOVSS (AX), X8 42651 LEAQ (AX)(CX*4), AX 42652 MULSS X0, X1 42653 MULSS X0, X2 42654 MULSS X0, X3 42655 MULSS X0, X4 42656 MULSS X0, X5 42657 MULSS X0, X6 42658 MULSS X0, X7 42659 MULSS X0, X8 42660 ADDSS (DX), X1 42661 MOVSS X1, (DX) 42662 LEAQ (DX)(BX*4), DX 42663 ADDSS 
(DX), X2 42664 MOVSS X2, (DX) 42665 LEAQ (DX)(BX*4), DX 42666 ADDSS (DX), X3 42667 MOVSS X3, (DX) 42668 LEAQ (DX)(BX*4), DX 42669 ADDSS (DX), X4 42670 MOVSS X4, (DX) 42671 LEAQ (DX)(BX*4), DX 42672 ADDSS (DX), X5 42673 MOVSS X5, (DX) 42674 LEAQ (DX)(BX*4), DX 42675 ADDSS (DX), X6 42676 MOVSS X6, (DX) 42677 LEAQ (DX)(BX*4), DX 42678 ADDSS (DX), X7 42679 MOVSS X7, (DX) 42680 LEAQ (DX)(BX*4), DX 42681 ADDSS (DX), X8 42682 MOVSS X8, (DX) 42683 LEAQ (DX)(BX*4), DX 42684 SUBQ $0x08, SI 42685 42686 check_limit_unroll: 42687 CMPQ SI, $0x08 42688 JHS loop_unroll 42689 JMP check_limit 42690 42691 loop: 42692 MOVSS (AX), X1 42693 MULSS X0, X1 42694 ADDSS (DX), X1 42695 MOVSS X1, (DX) 42696 DECQ SI 42697 LEAQ (AX)(CX*4), AX 42698 LEAQ (DX)(BX*4), DX 42699 42700 check_limit: 42701 CMPQ SI, $0x00 42702 JHI loop 42703 RET 42704 42705 // func AmdAxpyPointerLoopXInterleave_V2A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 42706 // Requires: SSE 42707 TEXT ·AmdAxpyPointerLoopXInterleave_V2A16U8(SB), NOSPLIT, $0-48 42708 MOVSS alpha+0(FP), X0 42709 MOVQ xs+8(FP), AX 42710 MOVQ incx+16(FP), CX 42711 MOVQ CX, DX 42712 SHLQ $0x05, DX 42713 MOVQ ys+24(FP), DX 42714 MOVQ incy+32(FP), BX 42715 MOVQ BX, SI 42716 SHLQ $0x05, SI 42717 MOVQ n+40(FP), SI 42718 JMP check_limit_unroll 42719 PCALIGN $0x10 42720 42721 loop_unroll: 42722 MOVSS (AX), X1 42723 LEAQ (AX)(CX*4), AX 42724 MOVSS (AX), X2 42725 LEAQ (AX)(CX*4), AX 42726 MOVSS (AX), X3 42727 LEAQ (AX)(CX*4), AX 42728 MOVSS (AX), X4 42729 LEAQ (AX)(CX*4), AX 42730 MOVSS (AX), X5 42731 LEAQ (AX)(CX*4), AX 42732 MOVSS (AX), X6 42733 LEAQ (AX)(CX*4), AX 42734 MOVSS (AX), X7 42735 LEAQ (AX)(CX*4), AX 42736 MOVSS (AX), X8 42737 LEAQ (AX)(CX*4), AX 42738 MULSS X0, X1 42739 MULSS X0, X2 42740 MULSS X0, X3 42741 MULSS X0, X4 42742 MULSS X0, X5 42743 MULSS X0, X6 42744 MULSS X0, X7 42745 MULSS X0, X8 42746 ADDSS (DX), X1 42747 MOVSS X1, (DX) 42748 LEAQ (DX)(BX*4), DX 42749 ADDSS (DX), X2 42750 MOVSS X2, (DX) 42751 LEAQ (DX)(BX*4), DX 42752 ADDSS (DX), X3 42753 MOVSS X3, (DX) 42754 LEAQ (DX)(BX*4), DX 42755 ADDSS (DX), X4 42756 MOVSS X4, (DX) 42757 LEAQ (DX)(BX*4), DX 42758 ADDSS (DX), X5 42759 MOVSS X5, (DX) 42760 LEAQ (DX)(BX*4), DX 42761 ADDSS (DX), X6 42762 MOVSS X6, (DX) 42763 LEAQ (DX)(BX*4), DX 42764 ADDSS (DX), X7 42765 MOVSS X7, (DX) 42766 LEAQ (DX)(BX*4), DX 42767 ADDSS (DX), X8 42768 MOVSS X8, (DX) 42769 LEAQ (DX)(BX*4), DX 42770 SUBQ $0x08, SI 42771 42772 check_limit_unroll: 42773 CMPQ SI, $0x08 42774 JHS loop_unroll 42775 JMP check_limit 42776 42777 loop: 42778 MOVSS (AX), X1 42779 MULSS X0, X1 42780 ADDSS (DX), X1 42781 MOVSS X1, (DX) 42782 DECQ SI 42783 LEAQ (AX)(CX*4), AX 42784 LEAQ (DX)(BX*4), DX 42785 42786 check_limit: 42787 CMPQ SI, $0x00 42788 JHI loop 42789 RET 42790 42791 // func AmdAxpyPointerLoopXInterleave_V3A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 42792 // Requires: SSE 42793 TEXT ·AmdAxpyPointerLoopXInterleave_V3A16U8(SB), NOSPLIT, $0-48 42794 MOVSS alpha+0(FP), X0 42795 MOVQ xs+8(FP), AX 42796 MOVQ incx+16(FP), CX 42797 MOVQ CX, DX 42798 SHLQ $0x05, DX 42799 MOVQ ys+24(FP), DX 42800 MOVQ incy+32(FP), BX 42801 MOVQ BX, SI 42802 SHLQ $0x05, SI 42803 MOVQ n+40(FP), SI 42804 JMP check_limit_unroll 42805 PCALIGN $0x10 42806 42807 loop_unroll: 42808 MOVSS (AX), X1 42809 LEAQ (AX)(CX*4), AX 42810 MOVSS (AX), X2 42811 LEAQ (AX)(CX*4), AX 42812 MOVSS (AX), X3 42813 LEAQ (AX)(CX*4), AX 42814 MOVSS (AX), X4 42815 LEAQ (AX)(CX*4), AX 42816 MOVSS (AX), X5 42817 LEAQ (AX)(CX*4), 
AX 42818 MOVSS (AX), X6 42819 LEAQ (AX)(CX*4), AX 42820 MOVSS (AX), X7 42821 LEAQ (AX)(CX*4), AX 42822 MOVSS (AX), X8 42823 LEAQ (AX)(CX*4), AX 42824 MULSS X0, X1 42825 MULSS X0, X2 42826 MULSS X0, X3 42827 MULSS X0, X4 42828 MULSS X0, X5 42829 MULSS X0, X6 42830 MULSS X0, X7 42831 MULSS X0, X8 42832 ADDSS (DX), X1 42833 MOVSS X1, (DX) 42834 LEAQ (DX)(BX*4), DX 42835 ADDSS (DX), X2 42836 MOVSS X2, (DX) 42837 LEAQ (DX)(BX*4), DX 42838 ADDSS (DX), X3 42839 MOVSS X3, (DX) 42840 LEAQ (DX)(BX*4), DX 42841 ADDSS (DX), X4 42842 MOVSS X4, (DX) 42843 LEAQ (DX)(BX*4), DX 42844 ADDSS (DX), X5 42845 MOVSS X5, (DX) 42846 LEAQ (DX)(BX*4), DX 42847 ADDSS (DX), X6 42848 MOVSS X6, (DX) 42849 LEAQ (DX)(BX*4), DX 42850 ADDSS (DX), X7 42851 MOVSS X7, (DX) 42852 LEAQ (DX)(BX*4), DX 42853 ADDSS (DX), X8 42854 MOVSS X8, (DX) 42855 LEAQ (DX)(BX*4), DX 42856 SUBQ $0x08, SI 42857 42858 check_limit_unroll: 42859 CMPQ SI, $0x08 42860 JHS loop_unroll 42861 JMP check_limit 42862 42863 loop: 42864 MOVSS (AX), X1 42865 MULSS X0, X1 42866 ADDSS (DX), X1 42867 MOVSS X1, (DX) 42868 DECQ SI 42869 LEAQ (AX)(CX*4), AX 42870 LEAQ (DX)(BX*4), DX 42871 42872 check_limit: 42873 CMPQ SI, $0x00 42874 JHI loop 42875 RET 42876 42877 // func AmdAxpyPointerLoopXInterleave_V4A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 42878 // Requires: SSE 42879 TEXT ·AmdAxpyPointerLoopXInterleave_V4A16U8(SB), NOSPLIT, $0-48 42880 MOVSS alpha+0(FP), X0 42881 MOVQ xs+8(FP), AX 42882 MOVQ incx+16(FP), CX 42883 MOVQ CX, DX 42884 SHLQ $0x05, DX 42885 MOVQ ys+24(FP), DX 42886 MOVQ incy+32(FP), BX 42887 MOVQ BX, SI 42888 SHLQ $0x05, SI 42889 MOVQ n+40(FP), SI 42890 JMP check_limit_unroll 42891 PCALIGN $0x10 42892 42893 loop_unroll: 42894 MOVSS (AX), X1 42895 LEAQ (AX)(CX*4), AX 42896 MOVSS (AX), X2 42897 LEAQ (AX)(CX*4), AX 42898 MOVSS (AX), X3 42899 LEAQ (AX)(CX*4), AX 42900 MOVSS (AX), X4 42901 LEAQ (AX)(CX*4), AX 42902 MOVSS (AX), X5 42903 LEAQ (AX)(CX*4), AX 42904 MOVSS (AX), X6 42905 LEAQ (AX)(CX*4), AX 42906 MOVSS (AX), X7 42907 LEAQ (AX)(CX*4), AX 42908 MOVSS (AX), X8 42909 LEAQ (AX)(CX*4), AX 42910 MULSS X0, X1 42911 MULSS X0, X2 42912 MULSS X0, X3 42913 MULSS X0, X4 42914 MULSS X0, X5 42915 MULSS X0, X6 42916 MULSS X0, X7 42917 MULSS X0, X8 42918 ADDSS (DX), X1 42919 MOVSS X1, (DX) 42920 LEAQ (DX)(BX*4), DX 42921 ADDSS (DX), X2 42922 MOVSS X2, (DX) 42923 LEAQ (DX)(BX*4), DX 42924 ADDSS (DX), X3 42925 MOVSS X3, (DX) 42926 LEAQ (DX)(BX*4), DX 42927 ADDSS (DX), X4 42928 MOVSS X4, (DX) 42929 LEAQ (DX)(BX*4), DX 42930 ADDSS (DX), X5 42931 MOVSS X5, (DX) 42932 LEAQ (DX)(BX*4), DX 42933 ADDSS (DX), X6 42934 MOVSS X6, (DX) 42935 LEAQ (DX)(BX*4), DX 42936 ADDSS (DX), X7 42937 MOVSS X7, (DX) 42938 LEAQ (DX)(BX*4), DX 42939 ADDSS (DX), X8 42940 MOVSS X8, (DX) 42941 LEAQ (DX)(BX*4), DX 42942 SUBQ $0x08, SI 42943 42944 check_limit_unroll: 42945 CMPQ SI, $0x08 42946 JHS loop_unroll 42947 JMP check_limit 42948 42949 loop: 42950 MOVSS (AX), X1 42951 MULSS X0, X1 42952 ADDSS (DX), X1 42953 MOVSS X1, (DX) 42954 DECQ SI 42955 LEAQ (AX)(CX*4), AX 42956 LEAQ (DX)(BX*4), DX 42957 42958 check_limit: 42959 CMPQ SI, $0x00 42960 JHI loop 42961 RET 42962 42963 // func AmdAxpyPointerLoopXInterleave_V5A16U8(alpha float32, xs *float32, incx uintptr, ys *float32, incy uintptr, n uintptr) 42964 // Requires: SSE 42965 TEXT ·AmdAxpyPointerLoopXInterleave_V5A16U8(SB), NOSPLIT, $0-48 42966 MOVSS alpha+0(FP), X0 42967 MOVQ xs+8(FP), AX 42968 MOVQ incx+16(FP), CX 42969 MOVQ CX, DX 42970 SHLQ $0x05, DX 42971 MOVQ ys+24(FP), DX 42972 MOVQ 
incy+32(FP), BX 42973 MOVQ BX, SI 42974 SHLQ $0x05, SI 42975 MOVQ n+40(FP), SI 42976 JMP check_limit_unroll 42977 PCALIGN $0x10 42978 42979 loop_unroll: 42980 MOVSS (AX), X1 42981 LEAQ (AX)(CX*4), AX 42982 MOVSS (AX), X2 42983 LEAQ (AX)(CX*4), AX 42984 MOVSS (AX), X3 42985 LEAQ (AX)(CX*4), AX 42986 MOVSS (AX), X4 42987 LEAQ (AX)(CX*4), AX 42988 MOVSS (AX), X5 42989 LEAQ (AX)(CX*4), AX 42990 MOVSS (AX), X6 42991 LEAQ (AX)(CX*4), AX 42992 MOVSS (AX), X7 42993 LEAQ (AX)(CX*4), AX 42994 MOVSS (AX), X8 42995 LEAQ (AX)(CX*4), AX 42996 MULSS X0, X1 42997 MULSS X0, X2 42998 MULSS X0, X3 42999 MULSS X0, X4 43000 MULSS X0, X5 43001 MULSS X0, X6 43002 MULSS X0, X7 43003 MULSS X0, X8 43004 ADDSS (DX), X1 43005 MOVSS X1, (DX) 43006 LEAQ (DX)(BX*4), DX 43007 ADDSS (DX), X2 43008 MOVSS X2, (DX) 43009 LEAQ (DX)(BX*4), DX 43010 ADDSS (DX), X3 43011 MOVSS X3, (DX) 43012 LEAQ (DX)(BX*4), DX 43013 ADDSS (DX), X4 43014 MOVSS X4, (DX) 43015 LEAQ (DX)(BX*4), DX 43016 ADDSS (DX), X5 43017 MOVSS X5, (DX) 43018 LEAQ (DX)(BX*4), DX 43019 ADDSS (DX), X6 43020 MOVSS X6, (DX) 43021 LEAQ (DX)(BX*4), DX 43022 ADDSS (DX), X7 43023 MOVSS X7, (DX) 43024 LEAQ (DX)(BX*4), DX 43025 ADDSS (DX), X8 43026 MOVSS X8, (DX) 43027 LEAQ (DX)(BX*4), DX 43028 SUBQ $0x08, SI 43029 43030 check_limit_unroll: 43031 CMPQ SI, $0x08 43032 JHS loop_unroll 43033 JMP check_limit 43034 43035 loop: 43036 MOVSS (AX), X1 43037 MULSS X0, X1 43038 ADDSS (DX), X1 43039 MOVSS X1, (DX) 43040 DECQ SI 43041 LEAQ (AX)(CX*4), AX 43042 LEAQ (DX)(BX*4), DX 43043 43044 check_limit: 43045 CMPQ SI, $0x00 43046 JHI loop 43047 RET