gitee.com/quant1x/num@v0.3.2/internal/functions/accel_avx2_amd64.s (about) 1 // Code generated by command: go run gen.go -out ../internal/functions/accel_avx2_amd64.s -stubs ../internal/functions/accel_avx2_amd64.go -pkg functions. DO NOT EDIT. 2 3 #include "textflag.h" 4 5 // func Add_AVX2_F64(x []float64, y []float64) 6 // Requires: AVX 7 TEXT ·Add_AVX2_F64(SB), NOSPLIT, $0-48 8 MOVQ x_base+0(FP), DI 9 MOVQ y_base+24(FP), SI 10 MOVQ x_len+8(FP), DX 11 TESTQ DX, DX 12 JE LBB0_7 13 CMPQ DX, $0x10 14 JAE LBB0_3 15 XORL AX, AX 16 JMP LBB0_6 17 18 LBB0_3: 19 MOVQ DX, AX 20 ANDQ $-16, AX 21 XORL CX, CX 22 23 LBB0_4: 24 VMOVUPD (DI)(CX*8), Y0 25 VMOVUPD 32(DI)(CX*8), Y1 26 VMOVUPD 64(DI)(CX*8), Y2 27 VMOVUPD 96(DI)(CX*8), Y3 28 VADDPD (SI)(CX*8), Y0, Y0 29 VADDPD 32(SI)(CX*8), Y1, Y1 30 VADDPD 64(SI)(CX*8), Y2, Y2 31 VADDPD 96(SI)(CX*8), Y3, Y3 32 VMOVUPD Y0, (DI)(CX*8) 33 VMOVUPD Y1, 32(DI)(CX*8) 34 VMOVUPD Y2, 64(DI)(CX*8) 35 VMOVUPD Y3, 96(DI)(CX*8) 36 ADDQ $0x10, CX 37 CMPQ AX, CX 38 JNE LBB0_4 39 CMPQ AX, DX 40 JE LBB0_7 41 42 LBB0_6: 43 VMOVSD (DI)(AX*8), X0 44 VADDSD (SI)(AX*8), X0, X0 45 VMOVSD X0, (DI)(AX*8) 46 ADDQ $0x01, AX 47 CMPQ DX, AX 48 JNE LBB0_6 49 50 LBB0_7: 51 VZEROUPPER 52 RET 53 54 // func Add_AVX2_F32(x []float32, y []float32) 55 // Requires: AVX 56 TEXT ·Add_AVX2_F32(SB), NOSPLIT, $0-48 57 MOVQ x_base+0(FP), DI 58 MOVQ y_base+24(FP), SI 59 MOVQ x_len+8(FP), DX 60 TESTQ DX, DX 61 JE LBB1_7 62 CMPQ DX, $0x20 63 JAE LBB1_3 64 XORL AX, AX 65 JMP LBB1_6 66 67 LBB1_3: 68 MOVQ DX, AX 69 ANDQ $-32, AX 70 XORL CX, CX 71 72 LBB1_4: 73 VMOVUPS (DI)(CX*4), Y0 74 VMOVUPS 32(DI)(CX*4), Y1 75 VMOVUPS 64(DI)(CX*4), Y2 76 VMOVUPS 96(DI)(CX*4), Y3 77 VADDPS (SI)(CX*4), Y0, Y0 78 VADDPS 32(SI)(CX*4), Y1, Y1 79 VADDPS 64(SI)(CX*4), Y2, Y2 80 VADDPS 96(SI)(CX*4), Y3, Y3 81 VMOVUPS Y0, (DI)(CX*4) 82 VMOVUPS Y1, 32(DI)(CX*4) 83 VMOVUPS Y2, 64(DI)(CX*4) 84 VMOVUPS Y3, 96(DI)(CX*4) 85 ADDQ $0x20, CX 86 CMPQ AX, CX 87 JNE LBB1_4 88 CMPQ AX, DX 89 JE LBB1_7 90 91 LBB1_6: 92 VMOVSS (DI)(AX*4), X0 93 VADDSS (SI)(AX*4), X0, X0 94 VMOVSS X0, (DI)(AX*4) 95 ADDQ $0x01, AX 96 CMPQ DX, AX 97 JNE LBB1_6 98 99 LBB1_7: 100 VZEROUPPER 101 RET 102 103 // func AddNumber_AVX2_F64(x []float64, a float64) 104 // Requires: AVX, AVX2, SSE2 105 TEXT ·AddNumber_AVX2_F64(SB), NOSPLIT, $0-32 106 MOVQ x_base+0(FP), DI 107 MOVSD a+24(FP), X0 108 MOVQ x_len+8(FP), SI 109 TESTQ SI, SI 110 JE LBB2_11 111 CMPQ SI, $0x10 112 JAE LBB2_3 113 XORL AX, AX 114 JMP LBB2_10 115 116 LBB2_3: 117 MOVQ SI, AX 118 ANDQ $-16, AX 119 VBROADCASTSD X0, Y1 120 LEAQ -16(AX), CX 121 MOVQ CX, R8 122 SHRQ $0x04, R8 123 ADDQ $0x01, R8 124 TESTQ CX, CX 125 JE LBB2_4 126 MOVQ R8, DX 127 ANDQ $-2, DX 128 XORL CX, CX 129 130 LBB2_6: 131 VADDPD (DI)(CX*8), Y1, Y2 132 VADDPD 32(DI)(CX*8), Y1, Y3 133 VADDPD 64(DI)(CX*8), Y1, Y4 134 VADDPD 96(DI)(CX*8), Y1, Y5 135 VMOVUPD Y2, (DI)(CX*8) 136 VMOVUPD Y3, 32(DI)(CX*8) 137 VMOVUPD Y4, 64(DI)(CX*8) 138 VMOVUPD Y5, 96(DI)(CX*8) 139 VADDPD 128(DI)(CX*8), Y1, Y2 140 VADDPD 160(DI)(CX*8), Y1, Y3 141 VADDPD 192(DI)(CX*8), Y1, Y4 142 VADDPD 224(DI)(CX*8), Y1, Y5 143 VMOVUPD Y2, 128(DI)(CX*8) 144 VMOVUPD Y3, 160(DI)(CX*8) 145 VMOVUPD Y4, 192(DI)(CX*8) 146 VMOVUPD Y5, 224(DI)(CX*8) 147 ADDQ $0x20, CX 148 ADDQ $-2, DX 149 JNE LBB2_6 150 TESTB $0x01, R8 151 JE LBB2_9 152 153 LBB2_8: 154 VADDPD (DI)(CX*8), Y1, Y2 155 VADDPD 32(DI)(CX*8), Y1, Y3 156 VADDPD 64(DI)(CX*8), Y1, Y4 157 VADDPD 96(DI)(CX*8), Y1, Y1 158 VMOVUPD Y2, (DI)(CX*8) 159 VMOVUPD Y3, 32(DI)(CX*8) 160 VMOVUPD Y4, 64(DI)(CX*8) 161 VMOVUPD Y1, 96(DI)(CX*8) 
162 163 LBB2_9: 164 CMPQ AX, SI 165 JE LBB2_11 166 167 LBB2_10: 168 VADDSD (DI)(AX*8), X0, X1 169 VMOVSD X1, (DI)(AX*8) 170 ADDQ $0x01, AX 171 CMPQ SI, AX 172 JNE LBB2_10 173 174 LBB2_11: 175 VZEROUPPER 176 RET 177 178 LBB2_4: 179 XORL CX, CX 180 TESTB $0x01, R8 181 JNE LBB2_8 182 JMP LBB2_9 183 184 // func AddNumber_AVX2_F32(x []float32, a float32) 185 // Requires: AVX, AVX2, SSE 186 TEXT ·AddNumber_AVX2_F32(SB), NOSPLIT, $0-28 187 MOVQ x_base+0(FP), DI 188 MOVSS a+24(FP), X0 189 MOVQ x_len+8(FP), SI 190 TESTQ SI, SI 191 JE LBB3_11 192 CMPQ SI, $0x20 193 JAE LBB3_3 194 XORL AX, AX 195 JMP LBB3_10 196 197 LBB3_3: 198 MOVQ SI, AX 199 ANDQ $-32, AX 200 VBROADCASTSS X0, Y1 201 LEAQ -32(AX), CX 202 MOVQ CX, R8 203 SHRQ $0x05, R8 204 ADDQ $0x01, R8 205 TESTQ CX, CX 206 JE LBB3_4 207 MOVQ R8, DX 208 ANDQ $-2, DX 209 XORL CX, CX 210 211 LBB3_6: 212 VADDPS (DI)(CX*4), Y1, Y2 213 VADDPS 32(DI)(CX*4), Y1, Y3 214 VADDPS 64(DI)(CX*4), Y1, Y4 215 VADDPS 96(DI)(CX*4), Y1, Y5 216 VMOVUPS Y2, (DI)(CX*4) 217 VMOVUPS Y3, 32(DI)(CX*4) 218 VMOVUPS Y4, 64(DI)(CX*4) 219 VMOVUPS Y5, 96(DI)(CX*4) 220 VADDPS 128(DI)(CX*4), Y1, Y2 221 VADDPS 160(DI)(CX*4), Y1, Y3 222 VADDPS 192(DI)(CX*4), Y1, Y4 223 VADDPS 224(DI)(CX*4), Y1, Y5 224 VMOVUPS Y2, 128(DI)(CX*4) 225 VMOVUPS Y3, 160(DI)(CX*4) 226 VMOVUPS Y4, 192(DI)(CX*4) 227 VMOVUPS Y5, 224(DI)(CX*4) 228 ADDQ $0x40, CX 229 ADDQ $-2, DX 230 JNE LBB3_6 231 TESTB $0x01, R8 232 JE LBB3_9 233 234 LBB3_8: 235 VADDPS (DI)(CX*4), Y1, Y2 236 VADDPS 32(DI)(CX*4), Y1, Y3 237 VADDPS 64(DI)(CX*4), Y1, Y4 238 VADDPS 96(DI)(CX*4), Y1, Y1 239 VMOVUPS Y2, (DI)(CX*4) 240 VMOVUPS Y3, 32(DI)(CX*4) 241 VMOVUPS Y4, 64(DI)(CX*4) 242 VMOVUPS Y1, 96(DI)(CX*4) 243 244 LBB3_9: 245 CMPQ AX, SI 246 JE LBB3_11 247 248 LBB3_10: 249 VADDSS (DI)(AX*4), X0, X1 250 VMOVSS X1, (DI)(AX*4) 251 ADDQ $0x01, AX 252 CMPQ SI, AX 253 JNE LBB3_10 254 255 LBB3_11: 256 VZEROUPPER 257 RET 258 259 LBB3_4: 260 XORL CX, CX 261 TESTB $0x01, R8 262 JNE LBB3_8 263 JMP LBB3_9 264 265 // func Sub_AVX2_F64(x []float64, y []float64) 266 // Requires: AVX 267 TEXT ·Sub_AVX2_F64(SB), NOSPLIT, $0-48 268 MOVQ x_base+0(FP), DI 269 MOVQ y_base+24(FP), SI 270 MOVQ x_len+8(FP), DX 271 TESTQ DX, DX 272 JE LBB4_7 273 CMPQ DX, $0x10 274 JAE LBB4_3 275 XORL AX, AX 276 JMP LBB4_6 277 278 LBB4_3: 279 MOVQ DX, AX 280 ANDQ $-16, AX 281 XORL CX, CX 282 283 LBB4_4: 284 VMOVUPD (DI)(CX*8), Y0 285 VMOVUPD 32(DI)(CX*8), Y1 286 VMOVUPD 64(DI)(CX*8), Y2 287 VMOVUPD 96(DI)(CX*8), Y3 288 VSUBPD (SI)(CX*8), Y0, Y0 289 VSUBPD 32(SI)(CX*8), Y1, Y1 290 VSUBPD 64(SI)(CX*8), Y2, Y2 291 VSUBPD 96(SI)(CX*8), Y3, Y3 292 VMOVUPD Y0, (DI)(CX*8) 293 VMOVUPD Y1, 32(DI)(CX*8) 294 VMOVUPD Y2, 64(DI)(CX*8) 295 VMOVUPD Y3, 96(DI)(CX*8) 296 ADDQ $0x10, CX 297 CMPQ AX, CX 298 JNE LBB4_4 299 CMPQ AX, DX 300 JE LBB4_7 301 302 LBB4_6: 303 VMOVSD (DI)(AX*8), X0 304 VSUBSD (SI)(AX*8), X0, X0 305 VMOVSD X0, (DI)(AX*8) 306 ADDQ $0x01, AX 307 CMPQ DX, AX 308 JNE LBB4_6 309 310 LBB4_7: 311 VZEROUPPER 312 RET 313 314 // func Sub_AVX2_F32(x []float32, y []float32) 315 // Requires: AVX 316 TEXT ·Sub_AVX2_F32(SB), NOSPLIT, $0-48 317 MOVQ x_base+0(FP), DI 318 MOVQ y_base+24(FP), SI 319 MOVQ x_len+8(FP), DX 320 TESTQ DX, DX 321 JE LBB5_7 322 CMPQ DX, $0x20 323 JAE LBB5_3 324 XORL AX, AX 325 JMP LBB5_6 326 327 LBB5_3: 328 MOVQ DX, AX 329 ANDQ $-32, AX 330 XORL CX, CX 331 332 LBB5_4: 333 VMOVUPS (DI)(CX*4), Y0 334 VMOVUPS 32(DI)(CX*4), Y1 335 VMOVUPS 64(DI)(CX*4), Y2 336 VMOVUPS 96(DI)(CX*4), Y3 337 VSUBPS (SI)(CX*4), Y0, Y0 338 VSUBPS 32(SI)(CX*4), Y1, Y1 339 VSUBPS 64(SI)(CX*4), Y2, Y2 
340 VSUBPS 96(SI)(CX*4), Y3, Y3 341 VMOVUPS Y0, (DI)(CX*4) 342 VMOVUPS Y1, 32(DI)(CX*4) 343 VMOVUPS Y2, 64(DI)(CX*4) 344 VMOVUPS Y3, 96(DI)(CX*4) 345 ADDQ $0x20, CX 346 CMPQ AX, CX 347 JNE LBB5_4 348 CMPQ AX, DX 349 JE LBB5_7 350 351 LBB5_6: 352 VMOVSS (DI)(AX*4), X0 353 VSUBSS (SI)(AX*4), X0, X0 354 VMOVSS X0, (DI)(AX*4) 355 ADDQ $0x01, AX 356 CMPQ DX, AX 357 JNE LBB5_6 358 359 LBB5_7: 360 VZEROUPPER 361 RET 362 363 // func SubNumber_AVX2_F64(x []float64, a float64) 364 // Requires: AVX, AVX2, SSE2 365 TEXT ·SubNumber_AVX2_F64(SB), NOSPLIT, $0-32 366 MOVQ x_base+0(FP), DI 367 MOVSD a+24(FP), X0 368 MOVQ x_len+8(FP), SI 369 TESTQ SI, SI 370 JE LBB6_11 371 CMPQ SI, $0x10 372 JAE LBB6_3 373 XORL AX, AX 374 JMP LBB6_10 375 376 LBB6_3: 377 MOVQ SI, AX 378 ANDQ $-16, AX 379 VBROADCASTSD X0, Y1 380 LEAQ -16(AX), CX 381 MOVQ CX, R8 382 SHRQ $0x04, R8 383 ADDQ $0x01, R8 384 TESTQ CX, CX 385 JE LBB6_4 386 MOVQ R8, DX 387 ANDQ $-2, DX 388 XORL CX, CX 389 390 LBB6_6: 391 VMOVUPD (DI)(CX*8), Y2 392 VMOVUPD 32(DI)(CX*8), Y3 393 VMOVUPD 64(DI)(CX*8), Y4 394 VMOVUPD 96(DI)(CX*8), Y5 395 VSUBPD Y1, Y2, Y2 396 VSUBPD Y1, Y3, Y3 397 VSUBPD Y1, Y4, Y4 398 VSUBPD Y1, Y5, Y5 399 VMOVUPD Y2, (DI)(CX*8) 400 VMOVUPD Y3, 32(DI)(CX*8) 401 VMOVUPD Y4, 64(DI)(CX*8) 402 VMOVUPD Y5, 96(DI)(CX*8) 403 VMOVUPD 128(DI)(CX*8), Y2 404 VMOVUPD 160(DI)(CX*8), Y3 405 VMOVUPD 192(DI)(CX*8), Y4 406 VMOVUPD 224(DI)(CX*8), Y5 407 VSUBPD Y1, Y2, Y2 408 VSUBPD Y1, Y3, Y3 409 VSUBPD Y1, Y4, Y4 410 VSUBPD Y1, Y5, Y5 411 VMOVUPD Y2, 128(DI)(CX*8) 412 VMOVUPD Y3, 160(DI)(CX*8) 413 VMOVUPD Y4, 192(DI)(CX*8) 414 VMOVUPD Y5, 224(DI)(CX*8) 415 ADDQ $0x20, CX 416 ADDQ $-2, DX 417 JNE LBB6_6 418 TESTB $0x01, R8 419 JE LBB6_9 420 421 LBB6_8: 422 VMOVUPD (DI)(CX*8), Y2 423 VMOVUPD 32(DI)(CX*8), Y3 424 VMOVUPD 64(DI)(CX*8), Y4 425 VMOVUPD 96(DI)(CX*8), Y5 426 VSUBPD Y1, Y2, Y2 427 VSUBPD Y1, Y3, Y3 428 VSUBPD Y1, Y4, Y4 429 VSUBPD Y1, Y5, Y1 430 VMOVUPD Y2, (DI)(CX*8) 431 VMOVUPD Y3, 32(DI)(CX*8) 432 VMOVUPD Y4, 64(DI)(CX*8) 433 VMOVUPD Y1, 96(DI)(CX*8) 434 435 LBB6_9: 436 CMPQ AX, SI 437 JE LBB6_11 438 439 LBB6_10: 440 VMOVSD (DI)(AX*8), X1 441 VSUBSD X0, X1, X1 442 VMOVSD X1, (DI)(AX*8) 443 ADDQ $0x01, AX 444 CMPQ SI, AX 445 JNE LBB6_10 446 447 LBB6_11: 448 VZEROUPPER 449 RET 450 451 LBB6_4: 452 XORL CX, CX 453 TESTB $0x01, R8 454 JNE LBB6_8 455 JMP LBB6_9 456 457 // func SubNumber_AVX2_F32(x []float32, a float32) 458 // Requires: AVX, AVX2, SSE 459 TEXT ·SubNumber_AVX2_F32(SB), NOSPLIT, $0-28 460 MOVQ x_base+0(FP), DI 461 MOVSS a+24(FP), X0 462 MOVQ x_len+8(FP), SI 463 TESTQ SI, SI 464 JE LBB7_11 465 CMPQ SI, $0x20 466 JAE LBB7_3 467 XORL AX, AX 468 JMP LBB7_10 469 470 LBB7_3: 471 MOVQ SI, AX 472 ANDQ $-32, AX 473 VBROADCASTSS X0, Y1 474 LEAQ -32(AX), CX 475 MOVQ CX, R8 476 SHRQ $0x05, R8 477 ADDQ $0x01, R8 478 TESTQ CX, CX 479 JE LBB7_4 480 MOVQ R8, DX 481 ANDQ $-2, DX 482 XORL CX, CX 483 484 LBB7_6: 485 VMOVUPS (DI)(CX*4), Y2 486 VMOVUPS 32(DI)(CX*4), Y3 487 VMOVUPS 64(DI)(CX*4), Y4 488 VMOVUPS 96(DI)(CX*4), Y5 489 VSUBPS Y1, Y2, Y2 490 VSUBPS Y1, Y3, Y3 491 VSUBPS Y1, Y4, Y4 492 VSUBPS Y1, Y5, Y5 493 VMOVUPS Y2, (DI)(CX*4) 494 VMOVUPS Y3, 32(DI)(CX*4) 495 VMOVUPS Y4, 64(DI)(CX*4) 496 VMOVUPS Y5, 96(DI)(CX*4) 497 VMOVUPS 128(DI)(CX*4), Y2 498 VMOVUPS 160(DI)(CX*4), Y3 499 VMOVUPS 192(DI)(CX*4), Y4 500 VMOVUPS 224(DI)(CX*4), Y5 501 VSUBPS Y1, Y2, Y2 502 VSUBPS Y1, Y3, Y3 503 VSUBPS Y1, Y4, Y4 504 VSUBPS Y1, Y5, Y5 505 VMOVUPS Y2, 128(DI)(CX*4) 506 VMOVUPS Y3, 160(DI)(CX*4) 507 VMOVUPS Y4, 192(DI)(CX*4) 508 VMOVUPS Y5, 224(DI)(CX*4) 509 ADDQ 
$0x40, CX 510 ADDQ $-2, DX 511 JNE LBB7_6 512 TESTB $0x01, R8 513 JE LBB7_9 514 515 LBB7_8: 516 VMOVUPS (DI)(CX*4), Y2 517 VMOVUPS 32(DI)(CX*4), Y3 518 VMOVUPS 64(DI)(CX*4), Y4 519 VMOVUPS 96(DI)(CX*4), Y5 520 VSUBPS Y1, Y2, Y2 521 VSUBPS Y1, Y3, Y3 522 VSUBPS Y1, Y4, Y4 523 VSUBPS Y1, Y5, Y1 524 VMOVUPS Y2, (DI)(CX*4) 525 VMOVUPS Y3, 32(DI)(CX*4) 526 VMOVUPS Y4, 64(DI)(CX*4) 527 VMOVUPS Y1, 96(DI)(CX*4) 528 529 LBB7_9: 530 CMPQ AX, SI 531 JE LBB7_11 532 533 LBB7_10: 534 VMOVSS (DI)(AX*4), X1 535 VSUBSS X0, X1, X1 536 VMOVSS X1, (DI)(AX*4) 537 ADDQ $0x01, AX 538 CMPQ SI, AX 539 JNE LBB7_10 540 541 LBB7_11: 542 VZEROUPPER 543 RET 544 545 LBB7_4: 546 XORL CX, CX 547 TESTB $0x01, R8 548 JNE LBB7_8 549 JMP LBB7_9 550 551 // func Mul_AVX2_F64(x []float64, y []float64) 552 // Requires: AVX 553 TEXT ·Mul_AVX2_F64(SB), NOSPLIT, $0-48 554 MOVQ x_base+0(FP), DI 555 MOVQ y_base+24(FP), SI 556 MOVQ x_len+8(FP), DX 557 TESTQ DX, DX 558 JE LBB8_7 559 CMPQ DX, $0x10 560 JAE LBB8_3 561 XORL AX, AX 562 JMP LBB8_6 563 564 LBB8_3: 565 MOVQ DX, AX 566 ANDQ $-16, AX 567 XORL CX, CX 568 569 LBB8_4: 570 VMOVUPD (DI)(CX*8), Y0 571 VMOVUPD 32(DI)(CX*8), Y1 572 VMOVUPD 64(DI)(CX*8), Y2 573 VMOVUPD 96(DI)(CX*8), Y3 574 VMULPD (SI)(CX*8), Y0, Y0 575 VMULPD 32(SI)(CX*8), Y1, Y1 576 VMULPD 64(SI)(CX*8), Y2, Y2 577 VMULPD 96(SI)(CX*8), Y3, Y3 578 VMOVUPD Y0, (DI)(CX*8) 579 VMOVUPD Y1, 32(DI)(CX*8) 580 VMOVUPD Y2, 64(DI)(CX*8) 581 VMOVUPD Y3, 96(DI)(CX*8) 582 ADDQ $0x10, CX 583 CMPQ AX, CX 584 JNE LBB8_4 585 CMPQ AX, DX 586 JE LBB8_7 587 588 LBB8_6: 589 VMOVSD (DI)(AX*8), X0 590 VMULSD (SI)(AX*8), X0, X0 591 VMOVSD X0, (DI)(AX*8) 592 ADDQ $0x01, AX 593 CMPQ DX, AX 594 JNE LBB8_6 595 596 LBB8_7: 597 VZEROUPPER 598 RET 599 600 // func Mul_AVX2_F32(x []float32, y []float32) 601 // Requires: AVX 602 TEXT ·Mul_AVX2_F32(SB), NOSPLIT, $0-48 603 MOVQ x_base+0(FP), DI 604 MOVQ y_base+24(FP), SI 605 MOVQ x_len+8(FP), DX 606 TESTQ DX, DX 607 JE LBB9_7 608 CMPQ DX, $0x20 609 JAE LBB9_3 610 XORL AX, AX 611 JMP LBB9_6 612 613 LBB9_3: 614 MOVQ DX, AX 615 ANDQ $-32, AX 616 XORL CX, CX 617 618 LBB9_4: 619 VMOVUPS (DI)(CX*4), Y0 620 VMOVUPS 32(DI)(CX*4), Y1 621 VMOVUPS 64(DI)(CX*4), Y2 622 VMOVUPS 96(DI)(CX*4), Y3 623 VMULPS (SI)(CX*4), Y0, Y0 624 VMULPS 32(SI)(CX*4), Y1, Y1 625 VMULPS 64(SI)(CX*4), Y2, Y2 626 VMULPS 96(SI)(CX*4), Y3, Y3 627 VMOVUPS Y0, (DI)(CX*4) 628 VMOVUPS Y1, 32(DI)(CX*4) 629 VMOVUPS Y2, 64(DI)(CX*4) 630 VMOVUPS Y3, 96(DI)(CX*4) 631 ADDQ $0x20, CX 632 CMPQ AX, CX 633 JNE LBB9_4 634 CMPQ AX, DX 635 JE LBB9_7 636 637 LBB9_6: 638 VMOVSS (DI)(AX*4), X0 639 VMULSS (SI)(AX*4), X0, X0 640 VMOVSS X0, (DI)(AX*4) 641 ADDQ $0x01, AX 642 CMPQ DX, AX 643 JNE LBB9_6 644 645 LBB9_7: 646 VZEROUPPER 647 RET 648 649 // func MulNumber_AVX2_F64(x []float64, a float64) 650 // Requires: AVX, AVX2, SSE2 651 TEXT ·MulNumber_AVX2_F64(SB), NOSPLIT, $0-32 652 MOVQ x_base+0(FP), DI 653 MOVSD a+24(FP), X0 654 MOVQ x_len+8(FP), SI 655 TESTQ SI, SI 656 JE LBB10_11 657 CMPQ SI, $0x10 658 JAE LBB10_3 659 XORL AX, AX 660 JMP LBB10_10 661 662 LBB10_3: 663 MOVQ SI, AX 664 ANDQ $-16, AX 665 VBROADCASTSD X0, Y1 666 LEAQ -16(AX), CX 667 MOVQ CX, R8 668 SHRQ $0x04, R8 669 ADDQ $0x01, R8 670 TESTQ CX, CX 671 JE LBB10_4 672 MOVQ R8, DX 673 ANDQ $-2, DX 674 XORL CX, CX 675 676 LBB10_6: 677 VMULPD (DI)(CX*8), Y1, Y2 678 VMULPD 32(DI)(CX*8), Y1, Y3 679 VMULPD 64(DI)(CX*8), Y1, Y4 680 VMULPD 96(DI)(CX*8), Y1, Y5 681 VMOVUPD Y2, (DI)(CX*8) 682 VMOVUPD Y3, 32(DI)(CX*8) 683 VMOVUPD Y4, 64(DI)(CX*8) 684 VMOVUPD Y5, 96(DI)(CX*8) 685 VMULPD 128(DI)(CX*8), Y1, Y2 686 
VMULPD 160(DI)(CX*8), Y1, Y3 687 VMULPD 192(DI)(CX*8), Y1, Y4 688 VMULPD 224(DI)(CX*8), Y1, Y5 689 VMOVUPD Y2, 128(DI)(CX*8) 690 VMOVUPD Y3, 160(DI)(CX*8) 691 VMOVUPD Y4, 192(DI)(CX*8) 692 VMOVUPD Y5, 224(DI)(CX*8) 693 ADDQ $0x20, CX 694 ADDQ $-2, DX 695 JNE LBB10_6 696 TESTB $0x01, R8 697 JE LBB10_9 698 699 LBB10_8: 700 VMULPD (DI)(CX*8), Y1, Y2 701 VMULPD 32(DI)(CX*8), Y1, Y3 702 VMULPD 64(DI)(CX*8), Y1, Y4 703 VMULPD 96(DI)(CX*8), Y1, Y1 704 VMOVUPD Y2, (DI)(CX*8) 705 VMOVUPD Y3, 32(DI)(CX*8) 706 VMOVUPD Y4, 64(DI)(CX*8) 707 VMOVUPD Y1, 96(DI)(CX*8) 708 709 LBB10_9: 710 CMPQ AX, SI 711 JE LBB10_11 712 713 LBB10_10: 714 VMULSD (DI)(AX*8), X0, X1 715 VMOVSD X1, (DI)(AX*8) 716 ADDQ $0x01, AX 717 CMPQ SI, AX 718 JNE LBB10_10 719 720 LBB10_11: 721 VZEROUPPER 722 RET 723 724 LBB10_4: 725 XORL CX, CX 726 TESTB $0x01, R8 727 JNE LBB10_8 728 JMP LBB10_9 729 730 // func MulNumber_AVX2_F32(x []float32, a float32) 731 // Requires: AVX, AVX2, SSE 732 TEXT ·MulNumber_AVX2_F32(SB), NOSPLIT, $0-28 733 MOVQ x_base+0(FP), DI 734 MOVSS a+24(FP), X0 735 MOVQ x_len+8(FP), SI 736 TESTQ SI, SI 737 JE LBB11_11 738 CMPQ SI, $0x20 739 JAE LBB11_3 740 XORL AX, AX 741 JMP LBB11_10 742 743 LBB11_3: 744 MOVQ SI, AX 745 ANDQ $-32, AX 746 VBROADCASTSS X0, Y1 747 LEAQ -32(AX), CX 748 MOVQ CX, R8 749 SHRQ $0x05, R8 750 ADDQ $0x01, R8 751 TESTQ CX, CX 752 JE LBB11_4 753 MOVQ R8, DX 754 ANDQ $-2, DX 755 XORL CX, CX 756 757 LBB11_6: 758 VMULPS (DI)(CX*4), Y1, Y2 759 VMULPS 32(DI)(CX*4), Y1, Y3 760 VMULPS 64(DI)(CX*4), Y1, Y4 761 VMULPS 96(DI)(CX*4), Y1, Y5 762 VMOVUPS Y2, (DI)(CX*4) 763 VMOVUPS Y3, 32(DI)(CX*4) 764 VMOVUPS Y4, 64(DI)(CX*4) 765 VMOVUPS Y5, 96(DI)(CX*4) 766 VMULPS 128(DI)(CX*4), Y1, Y2 767 VMULPS 160(DI)(CX*4), Y1, Y3 768 VMULPS 192(DI)(CX*4), Y1, Y4 769 VMULPS 224(DI)(CX*4), Y1, Y5 770 VMOVUPS Y2, 128(DI)(CX*4) 771 VMOVUPS Y3, 160(DI)(CX*4) 772 VMOVUPS Y4, 192(DI)(CX*4) 773 VMOVUPS Y5, 224(DI)(CX*4) 774 ADDQ $0x40, CX 775 ADDQ $-2, DX 776 JNE LBB11_6 777 TESTB $0x01, R8 778 JE LBB11_9 779 780 LBB11_8: 781 VMULPS (DI)(CX*4), Y1, Y2 782 VMULPS 32(DI)(CX*4), Y1, Y3 783 VMULPS 64(DI)(CX*4), Y1, Y4 784 VMULPS 96(DI)(CX*4), Y1, Y1 785 VMOVUPS Y2, (DI)(CX*4) 786 VMOVUPS Y3, 32(DI)(CX*4) 787 VMOVUPS Y4, 64(DI)(CX*4) 788 VMOVUPS Y1, 96(DI)(CX*4) 789 790 LBB11_9: 791 CMPQ AX, SI 792 JE LBB11_11 793 794 LBB11_10: 795 VMULSS (DI)(AX*4), X0, X1 796 VMOVSS X1, (DI)(AX*4) 797 ADDQ $0x01, AX 798 CMPQ SI, AX 799 JNE LBB11_10 800 801 LBB11_11: 802 VZEROUPPER 803 RET 804 805 LBB11_4: 806 XORL CX, CX 807 TESTB $0x01, R8 808 JNE LBB11_8 809 JMP LBB11_9 810 811 // func Div_AVX2_F64(x []float64, y []float64) 812 // Requires: AVX 813 TEXT ·Div_AVX2_F64(SB), NOSPLIT, $0-48 814 MOVQ x_base+0(FP), DI 815 MOVQ y_base+24(FP), SI 816 MOVQ x_len+8(FP), DX 817 TESTQ DX, DX 818 JE LBB12_11 819 CMPQ DX, $0x04 820 JAE LBB12_3 821 XORL AX, AX 822 JMP LBB12_10 823 824 LBB12_3: 825 MOVQ DX, AX 826 ANDQ $-4, AX 827 LEAQ -4(AX), CX 828 MOVQ CX, R8 829 SHRQ $0x02, R8 830 ADDQ $0x01, R8 831 TESTQ CX, CX 832 JE LBB12_4 833 MOVQ R8, R9 834 ANDQ $-2, R9 835 XORL CX, CX 836 837 LBB12_6: 838 VMOVUPD (DI)(CX*8), Y0 839 VDIVPD (SI)(CX*8), Y0, Y0 840 VMOVUPD 32(DI)(CX*8), Y1 841 VMOVUPD Y0, (DI)(CX*8) 842 VDIVPD 32(SI)(CX*8), Y1, Y0 843 VMOVUPD Y0, 32(DI)(CX*8) 844 ADDQ $0x08, CX 845 ADDQ $-2, R9 846 JNE LBB12_6 847 TESTB $0x01, R8 848 JE LBB12_9 849 850 LBB12_8: 851 VMOVUPD (DI)(CX*8), Y0 852 VDIVPD (SI)(CX*8), Y0, Y0 853 VMOVUPD Y0, (DI)(CX*8) 854 855 LBB12_9: 856 CMPQ AX, DX 857 JE LBB12_11 858 859 LBB12_10: 860 VMOVSD (DI)(AX*8), X0 861 VDIVSD 
(SI)(AX*8), X0, X0 862 VMOVSD X0, (DI)(AX*8) 863 ADDQ $0x01, AX 864 CMPQ DX, AX 865 JNE LBB12_10 866 867 LBB12_11: 868 VZEROUPPER 869 RET 870 871 LBB12_4: 872 XORL CX, CX 873 TESTB $0x01, R8 874 JNE LBB12_8 875 JMP LBB12_9 876 877 // func Div_AVX2_F32(x []float32, y []float32) 878 // Requires: AVX, FMA3 879 TEXT ·Div_AVX2_F32(SB), NOSPLIT, $0-48 880 MOVQ x_base+0(FP), DI 881 MOVQ y_base+24(FP), SI 882 MOVQ x_len+8(FP), DX 883 TESTQ DX, DX 884 JE LBB13_7 885 CMPQ DX, $0x20 886 JAE LBB13_3 887 XORL AX, AX 888 JMP LBB13_6 889 890 LBB13_3: 891 MOVQ DX, AX 892 ANDQ $-32, AX 893 XORL CX, CX 894 895 LBB13_4: 896 VMOVUPS (SI)(CX*4), Y0 897 VMOVUPS 32(SI)(CX*4), Y1 898 VMOVUPS 64(SI)(CX*4), Y2 899 VRCPPS Y0, Y3 900 VMOVUPS 96(SI)(CX*4), Y4 901 VMOVUPS (DI)(CX*4), Y5 902 VMOVUPS 32(DI)(CX*4), Y6 903 VMOVUPS 64(DI)(CX*4), Y7 904 VMOVUPS 96(DI)(CX*4), Y8 905 VMULPS Y3, Y5, Y9 906 VFMSUB213PS Y5, Y9, Y0 907 VFNMADD213PS Y9, Y3, Y0 908 VRCPPS Y1, Y3 909 VMULPS Y3, Y6, Y5 910 VFMSUB213PS Y6, Y5, Y1 911 VRCPPS Y2, Y6 912 VFNMADD213PS Y5, Y3, Y1 913 VMULPS Y6, Y7, Y3 914 VFMSUB213PS Y7, Y3, Y2 915 VFNMADD213PS Y3, Y6, Y2 916 VRCPPS Y4, Y3 917 VMULPS Y3, Y8, Y5 918 VFMSUB213PS Y8, Y5, Y4 919 VFNMADD213PS Y5, Y3, Y4 920 VMOVUPS Y0, (DI)(CX*4) 921 VMOVUPS Y1, 32(DI)(CX*4) 922 VMOVUPS Y2, 64(DI)(CX*4) 923 VMOVUPS Y4, 96(DI)(CX*4) 924 ADDQ $0x20, CX 925 CMPQ AX, CX 926 JNE LBB13_4 927 CMPQ AX, DX 928 JE LBB13_7 929 930 LBB13_6: 931 VMOVSS (DI)(AX*4), X0 932 VDIVSS (SI)(AX*4), X0, X0 933 VMOVSS X0, (DI)(AX*4) 934 ADDQ $0x01, AX 935 CMPQ DX, AX 936 JNE LBB13_6 937 938 LBB13_7: 939 VZEROUPPER 940 RET 941 942 DATA dataDivNumberF64<>+0(SB)/8, $0x3ff0000000000000 943 GLOBL dataDivNumberF64<>(SB), RODATA|NOPTR, $8 944 945 // func DivNumber_AVX2_F64(x []float64, a float64) 946 // Requires: AVX, AVX2, SSE2 947 TEXT ·DivNumber_AVX2_F64(SB), NOSPLIT, $0-32 948 MOVQ x_base+0(FP), DI 949 MOVSD a+24(FP), X0 950 MOVQ x_len+8(FP), SI 951 TESTQ SI, SI 952 JE LBB14_12 953 CMPQ SI, $0x04 954 JAE LBB14_3 955 XORL AX, AX 956 JMP LBB14_10 957 958 LBB14_3: 959 MOVQ SI, AX 960 ANDQ $-4, AX 961 VBROADCASTSD X0, Y1 962 LEAQ -4(AX), CX 963 MOVQ CX, R8 964 SHRQ $0x02, R8 965 ADDQ $0x01, R8 966 TESTQ CX, CX 967 JE LBB14_4 968 MOVQ R8, CX 969 ANDQ $-2, CX 970 VBROADCASTSD dataDivNumberF64<>+0(SB), Y2 971 VDIVPD Y1, Y2, Y2 972 XORL DX, DX 973 974 LBB14_6: 975 VMULPD (DI)(DX*8), Y2, Y3 976 VMOVUPD Y3, (DI)(DX*8) 977 VMULPD 32(DI)(DX*8), Y2, Y3 978 VMOVUPD Y3, 32(DI)(DX*8) 979 ADDQ $0x08, DX 980 ADDQ $-2, CX 981 JNE LBB14_6 982 TESTB $0x01, R8 983 JE LBB14_9 984 985 LBB14_8: 986 VMOVUPD (DI)(DX*8), Y2 987 VDIVPD Y1, Y2, Y1 988 VMOVUPD Y1, (DI)(DX*8) 989 990 LBB14_9: 991 CMPQ AX, SI 992 JE LBB14_12 993 994 LBB14_10: 995 VMOVSD dataDivNumberF64<>+0(SB), X1 996 VDIVSD X0, X1, X0 997 998 LBB14_11: 999 VMULSD (DI)(AX*8), X0, X1 1000 VMOVSD X1, (DI)(AX*8) 1001 ADDQ $0x01, AX 1002 CMPQ SI, AX 1003 JNE LBB14_11 1004 1005 LBB14_12: 1006 VZEROUPPER 1007 RET 1008 1009 LBB14_4: 1010 XORL DX, DX 1011 TESTB $0x01, R8 1012 JNE LBB14_8 1013 JMP LBB14_9 1014 1015 DATA dataDivNumberF32<>+0(SB)/4, $0x3f800000 1016 GLOBL dataDivNumberF32<>(SB), RODATA|NOPTR, $4 1017 1018 // func DivNumber_AVX2_F32(x []float32, a float32) 1019 // Requires: AVX, AVX2, SSE 1020 TEXT ·DivNumber_AVX2_F32(SB), NOSPLIT, $0-28 1021 MOVQ x_base+0(FP), DI 1022 MOVSS a+24(FP), X0 1023 MOVQ x_len+8(FP), SI 1024 TESTQ SI, SI 1025 JE LBB15_8 1026 CMPQ SI, $0x20 1027 JAE LBB15_3 1028 XORL AX, AX 1029 JMP LBB15_6 1030 1031 LBB15_3: 1032 MOVQ SI, AX 1033 ANDQ $-32, AX 1034 VMOVSS 
dataDivNumberF32<>+0(SB), X1 1035 VDIVSS X0, X1, X1 1036 VBROADCASTSS X1, Y1 1037 XORL CX, CX 1038 1039 LBB15_4: 1040 VMULPS (DI)(CX*4), Y1, Y2 1041 VMULPS 32(DI)(CX*4), Y1, Y3 1042 VMULPS 64(DI)(CX*4), Y1, Y4 1043 VMULPS 96(DI)(CX*4), Y1, Y5 1044 VMOVUPS Y2, (DI)(CX*4) 1045 VMOVUPS Y3, 32(DI)(CX*4) 1046 VMOVUPS Y4, 64(DI)(CX*4) 1047 VMOVUPS Y5, 96(DI)(CX*4) 1048 ADDQ $0x20, CX 1049 CMPQ AX, CX 1050 JNE LBB15_4 1051 CMPQ AX, SI 1052 JE LBB15_8 1053 1054 LBB15_6: 1055 VMOVSS dataDivNumberF32<>+0(SB), X1 1056 VDIVSS X0, X1, X0 1057 1058 LBB15_7: 1059 VMULSS (DI)(AX*4), X0, X1 1060 VMOVSS X1, (DI)(AX*4) 1061 ADDQ $0x01, AX 1062 CMPQ SI, AX 1063 JNE LBB15_7 1064 1065 LBB15_8: 1066 VZEROUPPER 1067 RET 1068 1069 DATA dataAbsF64<>+0(SB)/8, $0x7fffffffffffffff 1070 DATA dataAbsF64<>+8(SB)/8, $0x7fffffffffffffff 1071 DATA dataAbsF64<>+16(SB)/8, $0x7fffffffffffffff 1072 GLOBL dataAbsF64<>(SB), RODATA|NOPTR, $24 1073 1074 // func Abs_AVX2_F64(x []float64) 1075 // Requires: AVX 1076 TEXT ·Abs_AVX2_F64(SB), NOSPLIT, $0-24 1077 MOVQ x_base+0(FP), DI 1078 MOVQ x_len+8(FP), SI 1079 TESTQ SI, SI 1080 JE LBB16_8 1081 CMPQ SI, $0x10 1082 JAE LBB16_3 1083 XORL AX, AX 1084 JMP LBB16_6 1085 1086 LBB16_3: 1087 MOVQ SI, AX 1088 ANDQ $-16, AX 1089 XORL CX, CX 1090 VBROADCASTSD dataAbsF64<>+0(SB), Y0 1091 1092 LBB16_4: 1093 VANDPS (DI)(CX*8), Y0, Y1 1094 VANDPS 32(DI)(CX*8), Y0, Y2 1095 VANDPS 64(DI)(CX*8), Y0, Y3 1096 VANDPS 96(DI)(CX*8), Y0, Y4 1097 VMOVUPS Y1, (DI)(CX*8) 1098 VMOVUPS Y2, 32(DI)(CX*8) 1099 VMOVUPS Y3, 64(DI)(CX*8) 1100 VMOVUPS Y4, 96(DI)(CX*8) 1101 ADDQ $0x10, CX 1102 CMPQ AX, CX 1103 JNE LBB16_4 1104 CMPQ AX, SI 1105 JE LBB16_8 1106 1107 LBB16_6: 1108 VMOVUPS dataAbsF64<>+8(SB), X0 1109 1110 LBB16_7: 1111 VMOVSD (DI)(AX*8), X1 1112 VANDPS X0, X1, X1 1113 VMOVLPS X1, (DI)(AX*8) 1114 ADDQ $0x01, AX 1115 CMPQ SI, AX 1116 JNE LBB16_7 1117 1118 LBB16_8: 1119 VZEROUPPER 1120 RET 1121 1122 DATA dataAbsF32<>+0(SB)/4, $0x7fffffff 1123 GLOBL dataAbsF32<>(SB), RODATA|NOPTR, $4 1124 1125 // func Abs_AVX2_F32(x []float32) 1126 // Requires: AVX 1127 TEXT ·Abs_AVX2_F32(SB), NOSPLIT, $0-24 1128 MOVQ x_base+0(FP), DI 1129 MOVQ x_len+8(FP), SI 1130 TESTQ SI, SI 1131 JE LBB17_8 1132 CMPQ SI, $0x20 1133 JAE LBB17_3 1134 XORL AX, AX 1135 JMP LBB17_6 1136 1137 LBB17_3: 1138 MOVQ SI, AX 1139 ANDQ $-32, AX 1140 XORL CX, CX 1141 VBROADCASTSS dataAbsF32<>+0(SB), Y0 1142 1143 LBB17_4: 1144 VANDPS (DI)(CX*4), Y0, Y1 1145 VANDPS 32(DI)(CX*4), Y0, Y2 1146 VANDPS 64(DI)(CX*4), Y0, Y3 1147 VANDPS 96(DI)(CX*4), Y0, Y4 1148 VMOVUPS Y1, (DI)(CX*4) 1149 VMOVUPS Y2, 32(DI)(CX*4) 1150 VMOVUPS Y3, 64(DI)(CX*4) 1151 VMOVUPS Y4, 96(DI)(CX*4) 1152 ADDQ $0x20, CX 1153 CMPQ AX, CX 1154 JNE LBB17_4 1155 CMPQ AX, SI 1156 JE LBB17_8 1157 1158 LBB17_6: 1159 VBROADCASTSS dataAbsF32<>+0(SB), X0 1160 1161 LBB17_7: 1162 VMOVSS (DI)(AX*4), X1 1163 VANDPS X0, X1, X1 1164 VMOVSS X1, (DI)(AX*4) 1165 ADDQ $0x01, AX 1166 CMPQ SI, AX 1167 JNE LBB17_7 1168 1169 LBB17_8: 1170 VZEROUPPER 1171 RET 1172 1173 DATA dataNegF64<>+0(SB)/8, $0x8000000000000000 1174 DATA dataNegF64<>+8(SB)/8, $0x8000000000000000 1175 DATA dataNegF64<>+16(SB)/8, $0x8000000000000000 1176 GLOBL dataNegF64<>(SB), RODATA|NOPTR, $24 1177 1178 // func Neg_AVX2_F64(x []float64) 1179 // Requires: AVX 1180 TEXT ·Neg_AVX2_F64(SB), NOSPLIT, $0-24 1181 MOVQ x_base+0(FP), DI 1182 MOVQ x_len+8(FP), SI 1183 TESTQ SI, SI 1184 JE LBB18_12 1185 CMPQ SI, $0x10 1186 JAE LBB18_3 1187 XORL AX, AX 1188 JMP LBB18_10 1189 1190 LBB18_3: 1191 MOVQ SI, AX 1192 ANDQ $-16, AX 1193 LEAQ -16(AX), CX 1194 MOVQ CX, 
R8 1195 SHRQ $0x04, R8 1196 ADDQ $0x01, R8 1197 TESTQ CX, CX 1198 JE LBB18_4 1199 MOVQ R8, DX 1200 ANDQ $-2, DX 1201 XORL CX, CX 1202 VBROADCASTSD dataNegF64<>+0(SB), Y0 1203 1204 LBB18_6: 1205 VXORPS (DI)(CX*8), Y0, Y1 1206 VXORPS 32(DI)(CX*8), Y0, Y2 1207 VXORPS 64(DI)(CX*8), Y0, Y3 1208 VXORPS 96(DI)(CX*8), Y0, Y4 1209 VMOVUPS Y1, (DI)(CX*8) 1210 VMOVUPS Y2, 32(DI)(CX*8) 1211 VMOVUPS Y3, 64(DI)(CX*8) 1212 VMOVUPS Y4, 96(DI)(CX*8) 1213 VXORPS 128(DI)(CX*8), Y0, Y1 1214 VXORPS 160(DI)(CX*8), Y0, Y2 1215 VXORPS 192(DI)(CX*8), Y0, Y3 1216 VXORPS 224(DI)(CX*8), Y0, Y4 1217 VMOVUPS Y1, 128(DI)(CX*8) 1218 VMOVUPS Y2, 160(DI)(CX*8) 1219 VMOVUPS Y3, 192(DI)(CX*8) 1220 VMOVUPS Y4, 224(DI)(CX*8) 1221 ADDQ $0x20, CX 1222 ADDQ $-2, DX 1223 JNE LBB18_6 1224 TESTB $0x01, R8 1225 JE LBB18_9 1226 1227 LBB18_8: 1228 VBROADCASTSD dataNegF64<>+0(SB), Y0 1229 VXORPS (DI)(CX*8), Y0, Y1 1230 VXORPS 32(DI)(CX*8), Y0, Y2 1231 VXORPS 64(DI)(CX*8), Y0, Y3 1232 VXORPS 96(DI)(CX*8), Y0, Y0 1233 VMOVUPS Y1, (DI)(CX*8) 1234 VMOVUPS Y2, 32(DI)(CX*8) 1235 VMOVUPS Y3, 64(DI)(CX*8) 1236 VMOVUPS Y0, 96(DI)(CX*8) 1237 1238 LBB18_9: 1239 CMPQ AX, SI 1240 JE LBB18_12 1241 1242 LBB18_10: 1243 VMOVUPS dataNegF64<>+8(SB), X0 1244 1245 LBB18_11: 1246 VMOVSD (DI)(AX*8), X1 1247 VXORPS X0, X1, X1 1248 VMOVLPS X1, (DI)(AX*8) 1249 ADDQ $0x01, AX 1250 CMPQ SI, AX 1251 JNE LBB18_11 1252 1253 LBB18_12: 1254 VZEROUPPER 1255 RET 1256 1257 LBB18_4: 1258 XORL CX, CX 1259 TESTB $0x01, R8 1260 JNE LBB18_8 1261 JMP LBB18_9 1262 1263 DATA dataNegF32<>+0(SB)/4, $0x80000000 1264 GLOBL dataNegF32<>(SB), RODATA|NOPTR, $4 1265 1266 // func Neg_AVX2_F32(x []float32) 1267 // Requires: AVX 1268 TEXT ·Neg_AVX2_F32(SB), NOSPLIT, $0-24 1269 MOVQ x_base+0(FP), DI 1270 MOVQ x_len+8(FP), SI 1271 TESTQ SI, SI 1272 JE LBB19_12 1273 CMPQ SI, $0x20 1274 JAE LBB19_3 1275 XORL AX, AX 1276 JMP LBB19_10 1277 1278 LBB19_3: 1279 MOVQ SI, AX 1280 ANDQ $-32, AX 1281 LEAQ -32(AX), CX 1282 MOVQ CX, R8 1283 SHRQ $0x05, R8 1284 ADDQ $0x01, R8 1285 TESTQ CX, CX 1286 JE LBB19_4 1287 MOVQ R8, DX 1288 ANDQ $-2, DX 1289 XORL CX, CX 1290 VBROADCASTSS dataNegF32<>+0(SB), Y0 1291 1292 LBB19_6: 1293 VXORPS (DI)(CX*4), Y0, Y1 1294 VXORPS 32(DI)(CX*4), Y0, Y2 1295 VXORPS 64(DI)(CX*4), Y0, Y3 1296 VXORPS 96(DI)(CX*4), Y0, Y4 1297 VMOVUPS Y1, (DI)(CX*4) 1298 VMOVUPS Y2, 32(DI)(CX*4) 1299 VMOVUPS Y3, 64(DI)(CX*4) 1300 VMOVUPS Y4, 96(DI)(CX*4) 1301 VXORPS 128(DI)(CX*4), Y0, Y1 1302 VXORPS 160(DI)(CX*4), Y0, Y2 1303 VXORPS 192(DI)(CX*4), Y0, Y3 1304 VXORPS 224(DI)(CX*4), Y0, Y4 1305 VMOVUPS Y1, 128(DI)(CX*4) 1306 VMOVUPS Y2, 160(DI)(CX*4) 1307 VMOVUPS Y3, 192(DI)(CX*4) 1308 VMOVUPS Y4, 224(DI)(CX*4) 1309 ADDQ $0x40, CX 1310 ADDQ $-2, DX 1311 JNE LBB19_6 1312 TESTB $0x01, R8 1313 JE LBB19_9 1314 1315 LBB19_8: 1316 VBROADCASTSS dataNegF32<>+0(SB), Y0 1317 VXORPS (DI)(CX*4), Y0, Y1 1318 VXORPS 32(DI)(CX*4), Y0, Y2 1319 VXORPS 64(DI)(CX*4), Y0, Y3 1320 VXORPS 96(DI)(CX*4), Y0, Y0 1321 VMOVUPS Y1, (DI)(CX*4) 1322 VMOVUPS Y2, 32(DI)(CX*4) 1323 VMOVUPS Y3, 64(DI)(CX*4) 1324 VMOVUPS Y0, 96(DI)(CX*4) 1325 1326 LBB19_9: 1327 CMPQ AX, SI 1328 JE LBB19_12 1329 1330 LBB19_10: 1331 VBROADCASTSS dataNegF32<>+0(SB), X0 1332 1333 LBB19_11: 1334 VMOVSS (DI)(AX*4), X1 1335 VXORPS X0, X1, X1 1336 VMOVSS X1, (DI)(AX*4) 1337 ADDQ $0x01, AX 1338 CMPQ SI, AX 1339 JNE LBB19_11 1340 1341 LBB19_12: 1342 VZEROUPPER 1343 RET 1344 1345 LBB19_4: 1346 XORL CX, CX 1347 TESTB $0x01, R8 1348 JNE LBB19_8 1349 JMP LBB19_9 1350 1351 DATA dataInvF64<>+0(SB)/8, $0x3ff0000000000000 1352 GLOBL dataInvF64<>(SB), RODATA|NOPTR, $8 
1353 1354 // func Inv_AVX2_F64(x []float64) 1355 // Requires: AVX 1356 TEXT ·Inv_AVX2_F64(SB), NOSPLIT, $0-24 1357 MOVQ x_base+0(FP), DI 1358 MOVQ x_len+8(FP), SI 1359 TESTQ SI, SI 1360 JE LBB20_12 1361 CMPQ SI, $0x04 1362 JAE LBB20_3 1363 XORL AX, AX 1364 JMP LBB20_10 1365 1366 LBB20_3: 1367 MOVQ SI, AX 1368 ANDQ $-4, AX 1369 LEAQ -4(AX), CX 1370 MOVQ CX, R8 1371 SHRQ $0x02, R8 1372 ADDQ $0x01, R8 1373 TESTQ CX, CX 1374 JE LBB20_4 1375 MOVQ R8, CX 1376 ANDQ $-2, CX 1377 XORL DX, DX 1378 VBROADCASTSD dataInvF64<>+0(SB), Y0 1379 1380 LBB20_6: 1381 VDIVPD (DI)(DX*8), Y0, Y1 1382 VMOVUPD Y1, (DI)(DX*8) 1383 VDIVPD 32(DI)(DX*8), Y0, Y1 1384 VMOVUPD Y1, 32(DI)(DX*8) 1385 ADDQ $0x08, DX 1386 ADDQ $-2, CX 1387 JNE LBB20_6 1388 TESTB $0x01, R8 1389 JE LBB20_9 1390 1391 LBB20_8: 1392 VBROADCASTSD dataInvF64<>+0(SB), Y0 1393 VDIVPD (DI)(DX*8), Y0, Y0 1394 VMOVUPD Y0, (DI)(DX*8) 1395 1396 LBB20_9: 1397 CMPQ AX, SI 1398 JE LBB20_12 1399 1400 LBB20_10: 1401 VMOVSD dataInvF64<>+0(SB), X0 1402 1403 LBB20_11: 1404 VDIVSD (DI)(AX*8), X0, X1 1405 VMOVSD X1, (DI)(AX*8) 1406 ADDQ $0x01, AX 1407 CMPQ SI, AX 1408 JNE LBB20_11 1409 1410 LBB20_12: 1411 VZEROUPPER 1412 RET 1413 1414 LBB20_4: 1415 XORL DX, DX 1416 TESTB $0x01, R8 1417 JNE LBB20_8 1418 JMP LBB20_9 1419 1420 DATA dataInvF32<>+0(SB)/4, $0x3f800000 1421 GLOBL dataInvF32<>(SB), RODATA|NOPTR, $4 1422 1423 // func Inv_AVX2_F32(x []float32) 1424 // Requires: AVX, FMA3 1425 TEXT ·Inv_AVX2_F32(SB), NOSPLIT, $0-24 1426 MOVQ x_base+0(FP), DI 1427 MOVQ x_len+8(FP), SI 1428 TESTQ SI, SI 1429 JE LBB21_8 1430 CMPQ SI, $0x20 1431 JAE LBB21_3 1432 XORL AX, AX 1433 JMP LBB21_6 1434 1435 LBB21_3: 1436 MOVQ SI, AX 1437 ANDQ $-32, AX 1438 XORL CX, CX 1439 VBROADCASTSS dataInvF32<>+0(SB), Y0 1440 1441 LBB21_4: 1442 VMOVUPS (DI)(CX*4), Y1 1443 VMOVUPS 32(DI)(CX*4), Y2 1444 VMOVUPS 64(DI)(CX*4), Y3 1445 VRCPPS Y1, Y4 1446 VFMSUB213PS Y0, Y4, Y1 1447 VRCPPS Y2, Y5 1448 VFNMADD132PS Y4, Y4, Y1 1449 VMOVUPS 96(DI)(CX*4), Y4 1450 VFMSUB213PS Y0, Y5, Y2 1451 VFNMADD132PS Y5, Y5, Y2 1452 VRCPPS Y3, Y5 1453 VFMSUB213PS Y0, Y5, Y3 1454 VFNMADD132PS Y5, Y5, Y3 1455 VRCPPS Y4, Y5 1456 VFMSUB213PS Y0, Y5, Y4 1457 VFNMADD132PS Y5, Y5, Y4 1458 VMOVUPS Y1, (DI)(CX*4) 1459 VMOVUPS Y2, 32(DI)(CX*4) 1460 VMOVUPS Y3, 64(DI)(CX*4) 1461 VMOVUPS Y4, 96(DI)(CX*4) 1462 ADDQ $0x20, CX 1463 CMPQ AX, CX 1464 JNE LBB21_4 1465 CMPQ AX, SI 1466 JE LBB21_8 1467 1468 LBB21_6: 1469 VMOVSS dataInvF32<>+0(SB), X0 1470 1471 LBB21_7: 1472 VDIVSS (DI)(AX*4), X0, X1 1473 VMOVSS X1, (DI)(AX*4) 1474 ADDQ $0x01, AX 1475 CMPQ SI, AX 1476 JNE LBB21_7 1477 1478 LBB21_8: 1479 VZEROUPPER 1480 RET 1481 1482 // func Sum_AVX2_F64(x []float64) float64 1483 // Requires: AVX, SSE2 1484 TEXT ·Sum_AVX2_F64(SB), NOSPLIT, $0-32 1485 MOVQ x_base+0(FP), DI 1486 MOVQ x_len+8(FP), SI 1487 TESTQ SI, SI 1488 JE LBB0_1 1489 CMPQ SI, $0x10 1490 JAE LBB0_4 1491 VXORPD X0, X0, X0 1492 XORL AX, AX 1493 JMP LBB0_11 1494 1495 LBB0_1: 1496 VXORPS X0, X0, X0 1497 MOVSD X0, ret+24(FP) 1498 RET 1499 1500 LBB0_4: 1501 MOVQ SI, AX 1502 ANDQ $-16, AX 1503 LEAQ -16(AX), CX 1504 MOVQ CX, R8 1505 SHRQ $0x04, R8 1506 ADDQ $0x01, R8 1507 TESTQ CX, CX 1508 JE LBB0_5 1509 MOVQ R8, CX 1510 ANDQ $-2, CX 1511 VXORPD X0, X0, X0 1512 XORL DX, DX 1513 VXORPD X1, X1, X1 1514 VXORPD X2, X2, X2 1515 VXORPD X3, X3, X3 1516 1517 LBB0_7: 1518 VADDPD (DI)(DX*8), Y0, Y0 1519 VADDPD 32(DI)(DX*8), Y1, Y1 1520 VADDPD 64(DI)(DX*8), Y2, Y2 1521 VADDPD 96(DI)(DX*8), Y3, Y3 1522 VADDPD 128(DI)(DX*8), Y0, Y0 1523 VADDPD 160(DI)(DX*8), Y1, Y1 1524 VADDPD 192(DI)(DX*8), Y2, Y2 
1525 VADDPD 224(DI)(DX*8), Y3, Y3 1526 ADDQ $0x20, DX 1527 ADDQ $-2, CX 1528 JNE LBB0_7 1529 TESTB $0x01, R8 1530 JE LBB0_10 1531 1532 LBB0_9: 1533 VADDPD (DI)(DX*8), Y0, Y0 1534 VADDPD 32(DI)(DX*8), Y1, Y1 1535 VADDPD 64(DI)(DX*8), Y2, Y2 1536 VADDPD 96(DI)(DX*8), Y3, Y3 1537 1538 LBB0_10: 1539 VADDPD Y3, Y1, Y1 1540 VADDPD Y2, Y0, Y0 1541 VADDPD Y1, Y0, Y0 1542 VEXTRACTF128 $0x01, Y0, X1 1543 VADDPD X1, X0, X0 1544 VPERMILPD $0x01, X0, X1 1545 VADDSD X1, X0, X0 1546 CMPQ AX, SI 1547 JE LBB0_12 1548 1549 LBB0_11: 1550 VADDSD (DI)(AX*8), X0, X0 1551 ADDQ $0x01, AX 1552 CMPQ SI, AX 1553 JNE LBB0_11 1554 1555 LBB0_12: 1556 VZEROUPPER 1557 MOVSD X0, ret+24(FP) 1558 RET 1559 1560 LBB0_5: 1561 VXORPD X0, X0, X0 1562 XORL DX, DX 1563 VXORPD X1, X1, X1 1564 VXORPD X2, X2, X2 1565 VXORPD X3, X3, X3 1566 TESTB $0x01, R8 1567 JNE LBB0_9 1568 JMP LBB0_10 1569 1570 // func Sum_AVX2_F32(x []float32) float32 1571 // Requires: AVX, SSE 1572 TEXT ·Sum_AVX2_F32(SB), NOSPLIT, $0-28 1573 MOVQ x_base+0(FP), DI 1574 MOVQ x_len+8(FP), SI 1575 TESTQ SI, SI 1576 JE LBB1_1 1577 CMPQ SI, $0x20 1578 JAE LBB1_4 1579 VXORPS X0, X0, X0 1580 XORL AX, AX 1581 JMP LBB1_11 1582 1583 LBB1_1: 1584 VXORPS X0, X0, X0 1585 MOVSS X0, ret+24(FP) 1586 RET 1587 1588 LBB1_4: 1589 MOVQ SI, AX 1590 ANDQ $-32, AX 1591 LEAQ -32(AX), CX 1592 MOVQ CX, R8 1593 SHRQ $0x05, R8 1594 ADDQ $0x01, R8 1595 TESTQ CX, CX 1596 JE LBB1_5 1597 MOVQ R8, CX 1598 ANDQ $-2, CX 1599 VXORPS X0, X0, X0 1600 XORL DX, DX 1601 VXORPS X1, X1, X1 1602 VXORPS X2, X2, X2 1603 VXORPS X3, X3, X3 1604 1605 LBB1_7: 1606 VADDPS (DI)(DX*4), Y0, Y0 1607 VADDPS 32(DI)(DX*4), Y1, Y1 1608 VADDPS 64(DI)(DX*4), Y2, Y2 1609 VADDPS 96(DI)(DX*4), Y3, Y3 1610 VADDPS 128(DI)(DX*4), Y0, Y0 1611 VADDPS 160(DI)(DX*4), Y1, Y1 1612 VADDPS 192(DI)(DX*4), Y2, Y2 1613 VADDPS 224(DI)(DX*4), Y3, Y3 1614 ADDQ $0x40, DX 1615 ADDQ $-2, CX 1616 JNE LBB1_7 1617 TESTB $0x01, R8 1618 JE LBB1_10 1619 1620 LBB1_9: 1621 VADDPS (DI)(DX*4), Y0, Y0 1622 VADDPS 32(DI)(DX*4), Y1, Y1 1623 VADDPS 64(DI)(DX*4), Y2, Y2 1624 VADDPS 96(DI)(DX*4), Y3, Y3 1625 1626 LBB1_10: 1627 VADDPS Y3, Y1, Y1 1628 VADDPS Y2, Y0, Y0 1629 VADDPS Y1, Y0, Y0 1630 VEXTRACTF128 $0x01, Y0, X1 1631 VADDPS X1, X0, X0 1632 VPERMILPD $0x01, X0, X1 1633 VADDPS X1, X0, X0 1634 VMOVSHDUP X0, X1 1635 VADDSS X1, X0, X0 1636 CMPQ AX, SI 1637 JE LBB1_12 1638 1639 LBB1_11: 1640 VADDSS (DI)(AX*4), X0, X0 1641 ADDQ $0x01, AX 1642 CMPQ SI, AX 1643 JNE LBB1_11 1644 1645 LBB1_12: 1646 VZEROUPPER 1647 MOVSS X0, ret+24(FP) 1648 RET 1649 1650 LBB1_5: 1651 VXORPS X0, X0, X0 1652 XORL DX, DX 1653 VXORPS X1, X1, X1 1654 VXORPS X2, X2, X2 1655 VXORPS X3, X3, X3 1656 TESTB $0x01, R8 1657 JNE LBB1_9 1658 JMP LBB1_10 1659 1660 // func CumSum_AVX2_F64(x []float64) 1661 // Requires: AVX 1662 TEXT ·CumSum_AVX2_F64(SB), NOSPLIT, $0-24 1663 MOVQ x_base+0(FP), DI 1664 MOVQ x_len+8(FP), SI 1665 TESTQ SI, SI 1666 JE LBB2_8 1667 LEAQ -1(SI), CX 1668 MOVL SI, AX 1669 ANDL $0x03, AX 1670 CMPQ CX, $0x03 1671 JAE LBB2_3 1672 VXORPD X0, X0, X0 1673 XORL CX, CX 1674 JMP LBB2_5 1675 1676 LBB2_3: 1677 ANDQ $-4, SI 1678 VXORPD X0, X0, X0 1679 XORL CX, CX 1680 1681 LBB2_4: 1682 VADDSD (DI)(CX*8), X0, X0 1683 VMOVSD X0, (DI)(CX*8) 1684 VADDSD 8(DI)(CX*8), X0, X0 1685 VMOVSD X0, 8(DI)(CX*8) 1686 VADDSD 16(DI)(CX*8), X0, X0 1687 VMOVSD X0, 16(DI)(CX*8) 1688 VADDSD 24(DI)(CX*8), X0, X0 1689 VMOVSD X0, 24(DI)(CX*8) 1690 ADDQ $0x04, CX 1691 CMPQ SI, CX 1692 JNE LBB2_4 1693 1694 LBB2_5: 1695 TESTQ AX, AX 1696 JE LBB2_8 1697 LEAQ (DI)(CX*8), CX 1698 XORL DX, DX 1699 1700 LBB2_7: 1701 
VADDSD (CX)(DX*8), X0, X0 1702 VMOVSD X0, (CX)(DX*8) 1703 ADDQ $0x01, DX 1704 CMPQ AX, DX 1705 JNE LBB2_7 1706 1707 LBB2_8: 1708 RET 1709 1710 // func CumSum_AVX2_F32(x []float32) 1711 // Requires: AVX 1712 TEXT ·CumSum_AVX2_F32(SB), NOSPLIT, $0-24 1713 MOVQ x_base+0(FP), DI 1714 MOVQ x_len+8(FP), SI 1715 TESTQ SI, SI 1716 JE LBB3_8 1717 LEAQ -1(SI), CX 1718 MOVL SI, AX 1719 ANDL $0x03, AX 1720 CMPQ CX, $0x03 1721 JAE LBB3_3 1722 VXORPS X0, X0, X0 1723 XORL CX, CX 1724 JMP LBB3_5 1725 1726 LBB3_3: 1727 ANDQ $-4, SI 1728 VXORPS X0, X0, X0 1729 XORL CX, CX 1730 1731 LBB3_4: 1732 VADDSS (DI)(CX*4), X0, X0 1733 VMOVSS X0, (DI)(CX*4) 1734 VADDSS 4(DI)(CX*4), X0, X0 1735 VMOVSS X0, 4(DI)(CX*4) 1736 VADDSS 8(DI)(CX*4), X0, X0 1737 VMOVSS X0, 8(DI)(CX*4) 1738 VADDSS 12(DI)(CX*4), X0, X0 1739 VMOVSS X0, 12(DI)(CX*4) 1740 ADDQ $0x04, CX 1741 CMPQ SI, CX 1742 JNE LBB3_4 1743 1744 LBB3_5: 1745 TESTQ AX, AX 1746 JE LBB3_8 1747 LEAQ (DI)(CX*4), CX 1748 XORL DX, DX 1749 1750 LBB3_7: 1751 VADDSS (CX)(DX*4), X0, X0 1752 VMOVSS X0, (CX)(DX*4) 1753 ADDQ $0x01, DX 1754 CMPQ AX, DX 1755 JNE LBB3_7 1756 1757 LBB3_8: 1758 RET 1759 1760 DATA dataProdF64<>+0(SB)/8, $0x3ff0000000000000 1761 GLOBL dataProdF64<>(SB), RODATA|NOPTR, $8 1762 1763 // func Prod_AVX2_F64(x []float64) float64 1764 // Requires: AVX, SSE2 1765 TEXT ·Prod_AVX2_F64(SB), NOSPLIT, $0-32 1766 MOVQ x_base+0(FP), DI 1767 MOVQ x_len+8(FP), SI 1768 TESTQ SI, SI 1769 JE LBB4_1 1770 CMPQ SI, $0x10 1771 JAE LBB4_4 1772 VMOVSD dataProdF64<>+0(SB), X0 1773 XORL AX, AX 1774 JMP LBB4_11 1775 1776 LBB4_1: 1777 VMOVSD dataProdF64<>+0(SB), X0 1778 MOVSD X0, ret+24(FP) 1779 RET 1780 1781 LBB4_4: 1782 MOVQ SI, AX 1783 ANDQ $-16, AX 1784 LEAQ -16(AX), CX 1785 MOVQ CX, R8 1786 SHRQ $0x04, R8 1787 ADDQ $0x01, R8 1788 TESTQ CX, CX 1789 JE LBB4_5 1790 MOVQ R8, CX 1791 ANDQ $-2, CX 1792 VBROADCASTSD dataProdF64<>+0(SB), Y0 1793 XORL DX, DX 1794 VMOVAPD Y0, Y1 1795 VMOVAPD Y0, Y2 1796 VMOVAPD Y0, Y3 1797 1798 LBB4_7: 1799 VMULPD (DI)(DX*8), Y0, Y0 1800 VMULPD 32(DI)(DX*8), Y1, Y1 1801 VMULPD 64(DI)(DX*8), Y2, Y2 1802 VMULPD 96(DI)(DX*8), Y3, Y3 1803 VMULPD 128(DI)(DX*8), Y0, Y0 1804 VMULPD 160(DI)(DX*8), Y1, Y1 1805 VMULPD 192(DI)(DX*8), Y2, Y2 1806 VMULPD 224(DI)(DX*8), Y3, Y3 1807 ADDQ $0x20, DX 1808 ADDQ $-2, CX 1809 JNE LBB4_7 1810 TESTB $0x01, R8 1811 JE LBB4_10 1812 1813 LBB4_9: 1814 VMULPD (DI)(DX*8), Y0, Y0 1815 VMULPD 32(DI)(DX*8), Y1, Y1 1816 VMULPD 64(DI)(DX*8), Y2, Y2 1817 VMULPD 96(DI)(DX*8), Y3, Y3 1818 1819 LBB4_10: 1820 VMULPD Y3, Y1, Y1 1821 VMULPD Y2, Y0, Y0 1822 VMULPD Y1, Y0, Y0 1823 VEXTRACTF128 $0x01, Y0, X1 1824 VMULPD X1, X0, X0 1825 VPERMILPD $0x01, X0, X1 1826 VMULSD X1, X0, X0 1827 CMPQ AX, SI 1828 JE LBB4_12 1829 1830 LBB4_11: 1831 VMULSD (DI)(AX*8), X0, X0 1832 ADDQ $0x01, AX 1833 CMPQ SI, AX 1834 JNE LBB4_11 1835 1836 LBB4_12: 1837 VZEROUPPER 1838 MOVSD X0, ret+24(FP) 1839 RET 1840 1841 LBB4_5: 1842 VBROADCASTSD dataProdF64<>+0(SB), Y0 1843 XORL DX, DX 1844 VMOVAPD Y0, Y1 1845 VMOVAPD Y0, Y2 1846 VMOVAPD Y0, Y3 1847 TESTB $0x01, R8 1848 JNE LBB4_9 1849 JMP LBB4_10 1850 1851 DATA dataProdF32<>+0(SB)/4, $0x3f800000 1852 GLOBL dataProdF32<>(SB), RODATA|NOPTR, $4 1853 1854 // func Prod_AVX2_F32(x []float32) float32 1855 // Requires: AVX, SSE 1856 TEXT ·Prod_AVX2_F32(SB), NOSPLIT, $0-28 1857 MOVQ x_base+0(FP), DI 1858 MOVQ x_len+8(FP), SI 1859 TESTQ SI, SI 1860 JE LBB5_1 1861 CMPQ SI, $0x20 1862 JAE LBB5_4 1863 VMOVSS dataProdF32<>+0(SB), X0 1864 XORL AX, AX 1865 JMP LBB5_11 1866 1867 LBB5_1: 1868 VMOVSS dataProdF32<>+0(SB), X0 1869 MOVSS X0, 
ret+24(FP) 1870 RET 1871 1872 LBB5_4: 1873 MOVQ SI, AX 1874 ANDQ $-32, AX 1875 LEAQ -32(AX), CX 1876 MOVQ CX, R8 1877 SHRQ $0x05, R8 1878 ADDQ $0x01, R8 1879 TESTQ CX, CX 1880 JE LBB5_5 1881 MOVQ R8, CX 1882 ANDQ $-2, CX 1883 VBROADCASTSS dataProdF32<>+0(SB), Y0 1884 XORL DX, DX 1885 VMOVAPS Y0, Y1 1886 VMOVAPS Y0, Y2 1887 VMOVAPS Y0, Y3 1888 1889 LBB5_7: 1890 VMULPS (DI)(DX*4), Y0, Y0 1891 VMULPS 32(DI)(DX*4), Y1, Y1 1892 VMULPS 64(DI)(DX*4), Y2, Y2 1893 VMULPS 96(DI)(DX*4), Y3, Y3 1894 VMULPS 128(DI)(DX*4), Y0, Y0 1895 VMULPS 160(DI)(DX*4), Y1, Y1 1896 VMULPS 192(DI)(DX*4), Y2, Y2 1897 VMULPS 224(DI)(DX*4), Y3, Y3 1898 ADDQ $0x40, DX 1899 ADDQ $-2, CX 1900 JNE LBB5_7 1901 TESTB $0x01, R8 1902 JE LBB5_10 1903 1904 LBB5_9: 1905 VMULPS (DI)(DX*4), Y0, Y0 1906 VMULPS 32(DI)(DX*4), Y1, Y1 1907 VMULPS 64(DI)(DX*4), Y2, Y2 1908 VMULPS 96(DI)(DX*4), Y3, Y3 1909 1910 LBB5_10: 1911 VMULPS Y3, Y1, Y1 1912 VMULPS Y2, Y0, Y0 1913 VMULPS Y1, Y0, Y0 1914 VEXTRACTF128 $0x01, Y0, X1 1915 VMULPS X1, X0, X0 1916 VPERMILPD $0x01, X0, X1 1917 VMULPS X1, X0, X0 1918 VMOVSHDUP X0, X1 1919 VMULSS X1, X0, X0 1920 CMPQ AX, SI 1921 JE LBB5_12 1922 1923 LBB5_11: 1924 VMULSS (DI)(AX*4), X0, X0 1925 ADDQ $0x01, AX 1926 CMPQ SI, AX 1927 JNE LBB5_11 1928 1929 LBB5_12: 1930 VZEROUPPER 1931 MOVSS X0, ret+24(FP) 1932 RET 1933 1934 LBB5_5: 1935 VBROADCASTSS dataProdF32<>+0(SB), Y0 1936 XORL DX, DX 1937 VMOVAPS Y0, Y1 1938 VMOVAPS Y0, Y2 1939 VMOVAPS Y0, Y3 1940 TESTB $0x01, R8 1941 JNE LBB5_9 1942 JMP LBB5_10 1943 1944 DATA dataCumProdF64<>+0(SB)/8, $0x3ff0000000000000 1945 GLOBL dataCumProdF64<>(SB), RODATA|NOPTR, $8 1946 1947 // func CumProd_AVX2_F64(x []float64) 1948 // Requires: AVX 1949 TEXT ·CumProd_AVX2_F64(SB), NOSPLIT, $0-24 1950 MOVQ x_base+0(FP), DI 1951 MOVQ x_len+8(FP), SI 1952 TESTQ SI, SI 1953 JE LBB6_8 1954 LEAQ -1(SI), CX 1955 MOVL SI, AX 1956 ANDL $0x03, AX 1957 CMPQ CX, $0x03 1958 JAE LBB6_3 1959 VMOVSD dataCumProdF64<>+0(SB), X0 1960 XORL CX, CX 1961 JMP LBB6_5 1962 1963 LBB6_3: 1964 ANDQ $-4, SI 1965 VMOVSD dataCumProdF64<>+0(SB), X0 1966 XORL CX, CX 1967 1968 LBB6_4: 1969 VMULSD (DI)(CX*8), X0, X0 1970 VMOVSD X0, (DI)(CX*8) 1971 VMULSD 8(DI)(CX*8), X0, X0 1972 VMOVSD X0, 8(DI)(CX*8) 1973 VMULSD 16(DI)(CX*8), X0, X0 1974 VMOVSD X0, 16(DI)(CX*8) 1975 VMULSD 24(DI)(CX*8), X0, X0 1976 VMOVSD X0, 24(DI)(CX*8) 1977 ADDQ $0x04, CX 1978 CMPQ SI, CX 1979 JNE LBB6_4 1980 1981 LBB6_5: 1982 TESTQ AX, AX 1983 JE LBB6_8 1984 LEAQ (DI)(CX*8), CX 1985 XORL DX, DX 1986 1987 LBB6_7: 1988 VMULSD (CX)(DX*8), X0, X0 1989 VMOVSD X0, (CX)(DX*8) 1990 ADDQ $0x01, DX 1991 CMPQ AX, DX 1992 JNE LBB6_7 1993 1994 LBB6_8: 1995 RET 1996 1997 DATA dataCumProdF32<>+0(SB)/4, $0x3f800000 1998 GLOBL dataCumProdF32<>(SB), RODATA|NOPTR, $4 1999 2000 // func CumProd_AVX2_F32(x []float32) 2001 // Requires: AVX 2002 TEXT ·CumProd_AVX2_F32(SB), NOSPLIT, $0-24 2003 MOVQ x_base+0(FP), DI 2004 MOVQ x_len+8(FP), SI 2005 TESTQ SI, SI 2006 JE LBB7_8 2007 LEAQ -1(SI), CX 2008 MOVL SI, AX 2009 ANDL $0x03, AX 2010 CMPQ CX, $0x03 2011 JAE LBB7_3 2012 VMOVSS dataCumProdF32<>+0(SB), X0 2013 XORL CX, CX 2014 JMP LBB7_5 2015 2016 LBB7_3: 2017 ANDQ $-4, SI 2018 VMOVSS dataCumProdF32<>+0(SB), X0 2019 XORL CX, CX 2020 2021 LBB7_4: 2022 VMULSS (DI)(CX*4), X0, X0 2023 VMOVSS X0, (DI)(CX*4) 2024 VMULSS 4(DI)(CX*4), X0, X0 2025 VMOVSS X0, 4(DI)(CX*4) 2026 VMULSS 8(DI)(CX*4), X0, X0 2027 VMOVSS X0, 8(DI)(CX*4) 2028 VMULSS 12(DI)(CX*4), X0, X0 2029 VMOVSS X0, 12(DI)(CX*4) 2030 ADDQ $0x04, CX 2031 CMPQ SI, CX 2032 JNE LBB7_4 2033 2034 LBB7_5: 2035 TESTQ AX, AX 2036 
JE LBB7_8 2037 LEAQ (DI)(CX*4), CX 2038 XORL DX, DX 2039 2040 LBB7_7: 2041 VMULSS (CX)(DX*4), X0, X0 2042 VMOVSS X0, (CX)(DX*4) 2043 ADDQ $0x01, DX 2044 CMPQ AX, DX 2045 JNE LBB7_7 2046 2047 LBB7_8: 2048 RET 2049 2050 // func Dot_AVX2_F64(x []float64, y []float64) float64 2051 // Requires: AVX, FMA3, SSE2 2052 TEXT ·Dot_AVX2_F64(SB), NOSPLIT, $0-56 2053 MOVQ x_base+0(FP), DI 2054 MOVQ y_base+24(FP), SI 2055 MOVQ x_len+8(FP), DX 2056 TESTQ DX, DX 2057 JE LBB0_1 2058 CMPQ DX, $0x10 2059 JAE LBB0_4 2060 VXORPD X0, X0, X0 2061 XORL AX, AX 2062 JMP LBB0_7 2063 2064 LBB0_1: 2065 VXORPS X0, X0, X0 2066 MOVSD X0, ret+48(FP) 2067 RET 2068 2069 LBB0_4: 2070 MOVQ DX, AX 2071 ANDQ $-16, AX 2072 VXORPD X0, X0, X0 2073 XORL CX, CX 2074 VXORPD X1, X1, X1 2075 VXORPD X2, X2, X2 2076 VXORPD X3, X3, X3 2077 2078 LBB0_5: 2079 VMOVUPD (SI)(CX*8), Y4 2080 VMOVUPD 32(SI)(CX*8), Y5 2081 VMOVUPD 64(SI)(CX*8), Y6 2082 VMOVUPD 96(SI)(CX*8), Y7 2083 VFMADD231PD (DI)(CX*8), Y4, Y0 2084 VFMADD231PD 32(DI)(CX*8), Y5, Y1 2085 VFMADD231PD 64(DI)(CX*8), Y6, Y2 2086 VFMADD231PD 96(DI)(CX*8), Y7, Y3 2087 ADDQ $0x10, CX 2088 CMPQ AX, CX 2089 JNE LBB0_5 2090 VADDPD Y0, Y1, Y0 2091 VADDPD Y0, Y2, Y0 2092 VADDPD Y0, Y3, Y0 2093 VEXTRACTF128 $0x01, Y0, X1 2094 VADDPD X1, X0, X0 2095 VPERMILPD $0x01, X0, X1 2096 VADDSD X1, X0, X0 2097 CMPQ AX, DX 2098 JE LBB0_8 2099 2100 LBB0_7: 2101 VMOVSD (SI)(AX*8), X1 2102 VFMADD231SD (DI)(AX*8), X1, X0 2103 ADDQ $0x01, AX 2104 CMPQ DX, AX 2105 JNE LBB0_7 2106 2107 LBB0_8: 2108 VZEROUPPER 2109 MOVSD X0, ret+48(FP) 2110 RET 2111 2112 // func Dot_AVX2_F32(x []float32, y []float32) float32 2113 // Requires: AVX, FMA3, SSE 2114 TEXT ·Dot_AVX2_F32(SB), NOSPLIT, $0-52 2115 MOVQ x_base+0(FP), DI 2116 MOVQ y_base+24(FP), SI 2117 MOVQ x_len+8(FP), DX 2118 TESTQ DX, DX 2119 JE LBB1_1 2120 CMPQ DX, $0x20 2121 JAE LBB1_4 2122 VXORPS X0, X0, X0 2123 XORL AX, AX 2124 JMP LBB1_7 2125 2126 LBB1_1: 2127 VXORPS X0, X0, X0 2128 MOVSS X0, ret+48(FP) 2129 RET 2130 2131 LBB1_4: 2132 MOVQ DX, AX 2133 ANDQ $-32, AX 2134 VXORPS X0, X0, X0 2135 XORL CX, CX 2136 VXORPS X1, X1, X1 2137 VXORPS X2, X2, X2 2138 VXORPS X3, X3, X3 2139 2140 LBB1_5: 2141 VMOVUPS (SI)(CX*4), Y4 2142 VMOVUPS 32(SI)(CX*4), Y5 2143 VMOVUPS 64(SI)(CX*4), Y6 2144 VMOVUPS 96(SI)(CX*4), Y7 2145 VFMADD231PS (DI)(CX*4), Y4, Y0 2146 VFMADD231PS 32(DI)(CX*4), Y5, Y1 2147 VFMADD231PS 64(DI)(CX*4), Y6, Y2 2148 VFMADD231PS 96(DI)(CX*4), Y7, Y3 2149 ADDQ $0x20, CX 2150 CMPQ AX, CX 2151 JNE LBB1_5 2152 VADDPS Y0, Y1, Y0 2153 VADDPS Y0, Y2, Y0 2154 VADDPS Y0, Y3, Y0 2155 VEXTRACTF128 $0x01, Y0, X1 2156 VADDPS X1, X0, X0 2157 VPERMILPD $0x01, X0, X1 2158 VADDPS X1, X0, X0 2159 VMOVSHDUP X0, X1 2160 VADDSS X1, X0, X0 2161 CMPQ AX, DX 2162 JE LBB1_8 2163 2164 LBB1_7: 2165 VMOVSS (SI)(AX*4), X1 2166 VFMADD231SS (DI)(AX*4), X1, X0 2167 ADDQ $0x01, AX 2168 CMPQ DX, AX 2169 JNE LBB1_7 2170 2171 LBB1_8: 2172 VZEROUPPER 2173 MOVSS X0, ret+48(FP) 2174 RET 2175 2176 // func Norm_AVX2_F64(x []float64) float64 2177 // Requires: AVX, FMA3, SSE2 2178 TEXT ·Norm_AVX2_F64(SB), NOSPLIT, $0-32 2179 MOVQ x_base+0(FP), DI 2180 MOVQ x_len+8(FP), SI 2181 TESTQ SI, SI 2182 JE LBB2_1 2183 CMPQ SI, $0x10 2184 JAE LBB2_4 2185 VXORPD X0, X0, X0 2186 XORL AX, AX 2187 JMP LBB2_11 2188 2189 LBB2_1: 2190 VXORPD X0, X0, X0 2191 VSQRTSD X0, X0, X0 2192 MOVSD X0, ret+24(FP) 2193 RET 2194 2195 LBB2_4: 2196 MOVQ SI, AX 2197 ANDQ $-16, AX 2198 LEAQ -16(AX), CX 2199 MOVQ CX, R8 2200 SHRQ $0x04, R8 2201 ADDQ $0x01, R8 2202 TESTQ CX, CX 2203 JE LBB2_5 2204 MOVQ R8, CX 2205 ANDQ $-2, CX 2206 VXORPD 
X0, X0, X0 2207 XORL DX, DX 2208 VXORPD X1, X1, X1 2209 VXORPD X2, X2, X2 2210 VXORPD X3, X3, X3 2211 2212 LBB2_7: 2213 VMOVUPD (DI)(DX*8), Y4 2214 VMOVUPD 32(DI)(DX*8), Y5 2215 VMOVUPD 64(DI)(DX*8), Y6 2216 VMOVUPD 96(DI)(DX*8), Y7 2217 VFMADD213PD Y0, Y4, Y4 2218 VFMADD213PD Y1, Y5, Y5 2219 VFMADD213PD Y2, Y6, Y6 2220 VFMADD213PD Y3, Y7, Y7 2221 VMOVUPD 128(DI)(DX*8), Y0 2222 VMOVUPD 160(DI)(DX*8), Y1 2223 VMOVUPD 192(DI)(DX*8), Y2 2224 VMOVUPD 224(DI)(DX*8), Y3 2225 VFMADD213PD Y4, Y0, Y0 2226 VFMADD213PD Y5, Y1, Y1 2227 VFMADD213PD Y6, Y2, Y2 2228 VFMADD213PD Y7, Y3, Y3 2229 ADDQ $0x20, DX 2230 ADDQ $-2, CX 2231 JNE LBB2_7 2232 TESTB $0x01, R8 2233 JE LBB2_10 2234 2235 LBB2_9: 2236 VMOVUPD (DI)(DX*8), Y4 2237 VMOVUPD 32(DI)(DX*8), Y5 2238 VMOVUPD 64(DI)(DX*8), Y6 2239 VMOVUPD 96(DI)(DX*8), Y7 2240 VFMADD231PD Y4, Y4, Y0 2241 VFMADD231PD Y5, Y5, Y1 2242 VFMADD231PD Y6, Y6, Y2 2243 VFMADD231PD Y7, Y7, Y3 2244 2245 LBB2_10: 2246 VADDPD Y3, Y1, Y1 2247 VADDPD Y2, Y0, Y0 2248 VADDPD Y1, Y0, Y0 2249 VEXTRACTF128 $0x01, Y0, X1 2250 VADDPD X1, X0, X0 2251 VPERMILPD $0x01, X0, X1 2252 VADDSD X1, X0, X0 2253 CMPQ AX, SI 2254 JE LBB2_12 2255 2256 LBB2_11: 2257 VMOVSD (DI)(AX*8), X1 2258 VFMADD231SD X1, X1, X0 2259 ADDQ $0x01, AX 2260 CMPQ SI, AX 2261 JNE LBB2_11 2262 2263 LBB2_12: 2264 VSQRTSD X0, X0, X0 2265 VZEROUPPER 2266 MOVSD X0, ret+24(FP) 2267 RET 2268 2269 LBB2_5: 2270 VXORPD X0, X0, X0 2271 XORL DX, DX 2272 VXORPD X1, X1, X1 2273 VXORPD X2, X2, X2 2274 VXORPD X3, X3, X3 2275 TESTB $0x01, R8 2276 JNE LBB2_9 2277 JMP LBB2_10 2278 2279 DATA dataNormF32<>+0(SB)/4, $0xc0400000 2280 DATA dataNormF32<>+4(SB)/4, $0xbf000000 2281 DATA dataNormF32<>+8(SB)/4, $0x7fffffff 2282 DATA dataNormF32<>+12(SB)/4, $0x00800000 2283 GLOBL dataNormF32<>(SB), RODATA|NOPTR, $16 2284 2285 // func Norm_AVX2_F32(x []float32) float32 2286 // Requires: AVX, FMA3, SSE 2287 TEXT ·Norm_AVX2_F32(SB), NOSPLIT, $0-28 2288 MOVQ x_base+0(FP), DI 2289 MOVQ x_len+8(FP), SI 2290 TESTQ SI, SI 2291 JE LBB3_1 2292 CMPQ SI, $0x20 2293 JAE LBB3_4 2294 VXORPS X0, X0, X0 2295 XORL AX, AX 2296 JMP LBB3_11 2297 2298 LBB3_1: 2299 VXORPS X0, X0, X0 2300 JMP LBB3_12 2301 2302 LBB3_4: 2303 MOVQ SI, AX 2304 ANDQ $-32, AX 2305 LEAQ -32(AX), CX 2306 MOVQ CX, R8 2307 SHRQ $0x05, R8 2308 ADDQ $0x01, R8 2309 TESTQ CX, CX 2310 JE LBB3_5 2311 MOVQ R8, CX 2312 ANDQ $-2, CX 2313 VXORPS X0, X0, X0 2314 XORL DX, DX 2315 VXORPS X1, X1, X1 2316 VXORPS X2, X2, X2 2317 VXORPS X3, X3, X3 2318 2319 LBB3_7: 2320 VMOVUPS (DI)(DX*4), Y4 2321 VMOVUPS 32(DI)(DX*4), Y5 2322 VMOVUPS 64(DI)(DX*4), Y6 2323 VMOVUPS 96(DI)(DX*4), Y7 2324 VFMADD213PS Y0, Y4, Y4 2325 VFMADD213PS Y1, Y5, Y5 2326 VFMADD213PS Y2, Y6, Y6 2327 VFMADD213PS Y3, Y7, Y7 2328 VMOVUPS 128(DI)(DX*4), Y0 2329 VMOVUPS 160(DI)(DX*4), Y1 2330 VMOVUPS 192(DI)(DX*4), Y2 2331 VMOVUPS 224(DI)(DX*4), Y3 2332 VFMADD213PS Y4, Y0, Y0 2333 VFMADD213PS Y5, Y1, Y1 2334 VFMADD213PS Y6, Y2, Y2 2335 VFMADD213PS Y7, Y3, Y3 2336 ADDQ $0x40, DX 2337 ADDQ $-2, CX 2338 JNE LBB3_7 2339 TESTB $0x01, R8 2340 JE LBB3_10 2341 2342 LBB3_9: 2343 VMOVUPS (DI)(DX*4), Y4 2344 VMOVUPS 32(DI)(DX*4), Y5 2345 VMOVUPS 64(DI)(DX*4), Y6 2346 VMOVUPS 96(DI)(DX*4), Y7 2347 VFMADD231PS Y4, Y4, Y0 2348 VFMADD231PS Y5, Y5, Y1 2349 VFMADD231PS Y6, Y6, Y2 2350 VFMADD231PS Y7, Y7, Y3 2351 2352 LBB3_10: 2353 VADDPS Y3, Y1, Y1 2354 VADDPS Y2, Y0, Y0 2355 VADDPS Y1, Y0, Y0 2356 VEXTRACTF128 $0x01, Y0, X1 2357 VADDPS X1, X0, X0 2358 VPERMILPD $0x01, X0, X1 2359 VADDPS X1, X0, X0 2360 VMOVSHDUP X0, X1 2361 VADDSS X1, X0, X0 2362 CMPQ AX, SI 2363 JE 
LBB3_12 2364 2365 LBB3_11: 2366 VMOVSS (DI)(AX*4), X1 2367 VFMADD231SS X1, X1, X0 2368 ADDQ $0x01, AX 2369 CMPQ SI, AX 2370 JNE LBB3_11 2371 2372 LBB3_12: 2373 VRSQRTSS X0, X0, X1 2374 VMULSS X1, X0, X2 2375 VFMADD213SS dataNormF32<>+0(SB), X2, X1 2376 VMULSS dataNormF32<>+4(SB), X2, X2 2377 VMULSS X1, X2, X1 2378 VBROADCASTSS dataNormF32<>+8(SB), X2 2379 VANDPS X2, X0, X0 2380 VCMPSS $0x01, dataNormF32<>+12(SB), X0, X0 2381 VANDNPS X1, X0, X0 2382 VZEROUPPER 2383 MOVSS X0, ret+24(FP) 2384 RET 2385 2386 LBB3_5: 2387 VXORPS X0, X0, X0 2388 XORL DX, DX 2389 VXORPS X1, X1, X1 2390 VXORPS X2, X2, X2 2391 VXORPS X3, X3, X3 2392 TESTB $0x01, R8 2393 JNE LBB3_9 2394 JMP LBB3_10 2395 2396 // func Distance_AVX2_F64(x []float64, y []float64) float64 2397 // Requires: AVX, FMA3, SSE2 2398 TEXT ·Distance_AVX2_F64(SB), NOSPLIT, $0-56 2399 MOVQ x_base+0(FP), DI 2400 MOVQ y_base+24(FP), SI 2401 MOVQ x_len+8(FP), DX 2402 TESTQ DX, DX 2403 JE LBB4_1 2404 CMPQ DX, $0x10 2405 JAE LBB4_4 2406 VXORPD X0, X0, X0 2407 XORL AX, AX 2408 JMP LBB4_7 2409 2410 LBB4_1: 2411 VXORPD X0, X0, X0 2412 VSQRTSD X0, X0, X0 2413 MOVSD X0, ret+48(FP) 2414 RET 2415 2416 LBB4_4: 2417 MOVQ DX, AX 2418 ANDQ $-16, AX 2419 VXORPD X0, X0, X0 2420 XORL CX, CX 2421 VXORPD X1, X1, X1 2422 VXORPD X2, X2, X2 2423 VXORPD X3, X3, X3 2424 2425 LBB4_5: 2426 VMOVUPD (DI)(CX*8), Y4 2427 VMOVUPD 32(DI)(CX*8), Y5 2428 VMOVUPD 64(DI)(CX*8), Y6 2429 VMOVUPD 96(DI)(CX*8), Y7 2430 VSUBPD (SI)(CX*8), Y4, Y4 2431 VSUBPD 32(SI)(CX*8), Y5, Y5 2432 VSUBPD 64(SI)(CX*8), Y6, Y6 2433 VSUBPD 96(SI)(CX*8), Y7, Y7 2434 VFMADD231PD Y4, Y4, Y0 2435 VFMADD231PD Y5, Y5, Y1 2436 VFMADD231PD Y6, Y6, Y2 2437 VFMADD231PD Y7, Y7, Y3 2438 ADDQ $0x10, CX 2439 CMPQ AX, CX 2440 JNE LBB4_5 2441 VADDPD Y0, Y1, Y0 2442 VADDPD Y0, Y2, Y0 2443 VADDPD Y0, Y3, Y0 2444 VEXTRACTF128 $0x01, Y0, X1 2445 VADDPD X1, X0, X0 2446 VPERMILPD $0x01, X0, X1 2447 VADDSD X1, X0, X0 2448 CMPQ AX, DX 2449 JE LBB4_8 2450 2451 LBB4_7: 2452 VMOVSD (DI)(AX*8), X1 2453 VSUBSD (SI)(AX*8), X1, X1 2454 VFMADD231SD X1, X1, X0 2455 ADDQ $0x01, AX 2456 CMPQ DX, AX 2457 JNE LBB4_7 2458 2459 LBB4_8: 2460 VSQRTSD X0, X0, X0 2461 VZEROUPPER 2462 MOVSD X0, ret+48(FP) 2463 RET 2464 2465 DATA dataDistanceF32<>+0(SB)/4, $0xc0400000 2466 DATA dataDistanceF32<>+4(SB)/4, $0xbf000000 2467 DATA dataDistanceF32<>+8(SB)/4, $0x7fffffff 2468 DATA dataDistanceF32<>+12(SB)/4, $0x00800000 2469 GLOBL dataDistanceF32<>(SB), RODATA|NOPTR, $16 2470 2471 // func Distance_AVX2_F32(x []float32, y []float32) float32 2472 // Requires: AVX, FMA3, SSE 2473 TEXT ·Distance_AVX2_F32(SB), NOSPLIT, $0-52 2474 MOVQ x_base+0(FP), DI 2475 MOVQ y_base+24(FP), SI 2476 MOVQ x_len+8(FP), DX 2477 TESTQ DX, DX 2478 JE LBB5_1 2479 CMPQ DX, $0x20 2480 JAE LBB5_4 2481 VXORPS X0, X0, X0 2482 XORL AX, AX 2483 JMP LBB5_7 2484 2485 LBB5_1: 2486 VXORPS X0, X0, X0 2487 JMP LBB5_8 2488 2489 LBB5_4: 2490 MOVQ DX, AX 2491 ANDQ $-32, AX 2492 VXORPS X0, X0, X0 2493 XORL CX, CX 2494 VXORPS X1, X1, X1 2495 VXORPS X2, X2, X2 2496 VXORPS X3, X3, X3 2497 2498 LBB5_5: 2499 VMOVUPS (DI)(CX*4), Y4 2500 VMOVUPS 32(DI)(CX*4), Y5 2501 VMOVUPS 64(DI)(CX*4), Y6 2502 VMOVUPS 96(DI)(CX*4), Y7 2503 VSUBPS (SI)(CX*4), Y4, Y4 2504 VSUBPS 32(SI)(CX*4), Y5, Y5 2505 VSUBPS 64(SI)(CX*4), Y6, Y6 2506 VSUBPS 96(SI)(CX*4), Y7, Y7 2507 VFMADD231PS Y4, Y4, Y0 2508 VFMADD231PS Y5, Y5, Y1 2509 VFMADD231PS Y6, Y6, Y2 2510 VFMADD231PS Y7, Y7, Y3 2511 ADDQ $0x20, CX 2512 CMPQ AX, CX 2513 JNE LBB5_5 2514 VADDPS Y0, Y1, Y0 2515 VADDPS Y0, Y2, Y0 2516 VADDPS Y0, Y3, Y0 2517 VEXTRACTF128 $0x01, Y0, X1 
2518 VADDPS X1, X0, X0 2519 VPERMILPD $0x01, X0, X1 2520 VADDPS X1, X0, X0 2521 VMOVSHDUP X0, X1 2522 VADDSS X1, X0, X0 2523 CMPQ AX, DX 2524 JE LBB5_8 2525 2526 LBB5_7: 2527 VMOVSS (DI)(AX*4), X1 2528 VSUBSS (SI)(AX*4), X1, X1 2529 VFMADD231SS X1, X1, X0 2530 ADDQ $0x01, AX 2531 CMPQ DX, AX 2532 JNE LBB5_7 2533 2534 LBB5_8: 2535 VRSQRTSS X0, X0, X1 2536 VMULSS X1, X0, X2 2537 VFMADD213SS dataDistanceF32<>+0(SB), X2, X1 2538 VMULSS dataDistanceF32<>+4(SB), X2, X2 2539 VMULSS X1, X2, X1 2540 VBROADCASTSS dataDistanceF32<>+8(SB), X2 2541 VANDPS X2, X0, X0 2542 VCMPSS $0x01, dataDistanceF32<>+12(SB), X0, X0 2543 VANDNPS X1, X0, X0 2544 VZEROUPPER 2545 MOVSS X0, ret+48(FP) 2546 RET 2547 2548 DATA dataManhattanNormF64<>+0(SB)/8, $0x7fffffffffffffff 2549 DATA dataManhattanNormF64<>+8(SB)/8, $0x7fffffffffffffff 2550 DATA dataManhattanNormF64<>+16(SB)/8, $0x7fffffffffffffff 2551 GLOBL dataManhattanNormF64<>(SB), RODATA|NOPTR, $24 2552 2553 // func ManhattanNorm_AVX2_F64(x []float64) float64 2554 // Requires: AVX, SSE2 2555 TEXT ·ManhattanNorm_AVX2_F64(SB), NOSPLIT, $0-32 2556 MOVQ x_base+0(FP), DI 2557 MOVQ x_len+8(FP), SI 2558 TESTQ SI, SI 2559 JE LBB6_1 2560 CMPQ SI, $0x10 2561 JAE LBB6_4 2562 VXORPD X0, X0, X0 2563 XORL AX, AX 2564 JMP LBB6_7 2565 2566 LBB6_1: 2567 VXORPS X0, X0, X0 2568 MOVSD X0, ret+24(FP) 2569 RET 2570 2571 LBB6_4: 2572 MOVQ SI, AX 2573 ANDQ $-16, AX 2574 VXORPD X0, X0, X0 2575 VBROADCASTSD dataManhattanNormF64<>+0(SB), Y1 2576 XORL CX, CX 2577 VXORPD X2, X2, X2 2578 VXORPD X3, X3, X3 2579 VXORPD X4, X4, X4 2580 2581 LBB6_5: 2582 VANDPD (DI)(CX*8), Y1, Y5 2583 VADDPD Y0, Y5, Y0 2584 VANDPD 32(DI)(CX*8), Y1, Y5 2585 VADDPD Y2, Y5, Y2 2586 VANDPD 64(DI)(CX*8), Y1, Y5 2587 VANDPD 96(DI)(CX*8), Y1, Y6 2588 VADDPD Y3, Y5, Y3 2589 VADDPD Y4, Y6, Y4 2590 ADDQ $0x10, CX 2591 CMPQ AX, CX 2592 JNE LBB6_5 2593 VADDPD Y0, Y2, Y0 2594 VADDPD Y0, Y3, Y0 2595 VADDPD Y0, Y4, Y0 2596 VEXTRACTF128 $0x01, Y0, X1 2597 VADDPD X1, X0, X0 2598 VPERMILPD $0x01, X0, X1 2599 VADDSD X1, X0, X0 2600 CMPQ AX, SI 2601 JE LBB6_9 2602 2603 LBB6_7: 2604 VMOVUPD dataManhattanNormF64<>+8(SB), X1 2605 2606 LBB6_8: 2607 VMOVSD (DI)(AX*8), X2 2608 VANDPD X1, X2, X2 2609 VADDSD X0, X2, X0 2610 ADDQ $0x01, AX 2611 CMPQ SI, AX 2612 JNE LBB6_8 2613 2614 LBB6_9: 2615 VZEROUPPER 2616 MOVSD X0, ret+24(FP) 2617 RET 2618 2619 DATA dataManhattanNormF32<>+0(SB)/4, $0x7fffffff 2620 GLOBL dataManhattanNormF32<>(SB), RODATA|NOPTR, $4 2621 2622 // func ManhattanNorm_AVX2_F32(x []float32) float32 2623 // Requires: AVX, SSE 2624 TEXT ·ManhattanNorm_AVX2_F32(SB), NOSPLIT, $0-28 2625 MOVQ x_base+0(FP), DI 2626 MOVQ x_len+8(FP), SI 2627 TESTQ SI, SI 2628 JE LBB7_1 2629 CMPQ SI, $0x20 2630 JAE LBB7_4 2631 VXORPS X0, X0, X0 2632 XORL AX, AX 2633 JMP LBB7_7 2634 2635 LBB7_1: 2636 VXORPS X0, X0, X0 2637 MOVSS X0, ret+24(FP) 2638 RET 2639 2640 LBB7_4: 2641 MOVQ SI, AX 2642 ANDQ $-32, AX 2643 VXORPS X0, X0, X0 2644 VBROADCASTSS dataManhattanNormF32<>+0(SB), Y1 2645 XORL CX, CX 2646 VXORPS X2, X2, X2 2647 VXORPS X3, X3, X3 2648 VXORPS X4, X4, X4 2649 2650 LBB7_5: 2651 VANDPS (DI)(CX*4), Y1, Y5 2652 VADDPS Y0, Y5, Y0 2653 VANDPS 32(DI)(CX*4), Y1, Y5 2654 VADDPS Y2, Y5, Y2 2655 VANDPS 64(DI)(CX*4), Y1, Y5 2656 VANDPS 96(DI)(CX*4), Y1, Y6 2657 VADDPS Y3, Y5, Y3 2658 VADDPS Y4, Y6, Y4 2659 ADDQ $0x20, CX 2660 CMPQ AX, CX 2661 JNE LBB7_5 2662 VADDPS Y0, Y2, Y0 2663 VADDPS Y0, Y3, Y0 2664 VADDPS Y0, Y4, Y0 2665 VEXTRACTF128 $0x01, Y0, X1 2666 VADDPS X1, X0, X0 2667 VPERMILPD $0x01, X0, X1 2668 VADDPS X1, X0, X0 2669 VMOVSHDUP X0, X1 2670 VADDSS 
X1, X0, X0 2671 CMPQ AX, SI 2672 JE LBB7_9 2673 2674 LBB7_7: 2675 VBROADCASTSS dataManhattanNormF32<>+0(SB), X1 2676 2677 LBB7_8: 2678 VMOVSS (DI)(AX*4), X2 2679 VANDPS X1, X2, X2 2680 VADDSS X0, X2, X0 2681 ADDQ $0x01, AX 2682 CMPQ SI, AX 2683 JNE LBB7_8 2684 2685 LBB7_9: 2686 VZEROUPPER 2687 MOVSS X0, ret+24(FP) 2688 RET 2689 2690 DATA dataManhattanDistanceF64<>+0(SB)/8, $0x7fffffffffffffff 2691 DATA dataManhattanDistanceF64<>+8(SB)/8, $0x7fffffffffffffff 2692 DATA dataManhattanDistanceF64<>+16(SB)/8, $0x7fffffffffffffff 2693 GLOBL dataManhattanDistanceF64<>(SB), RODATA|NOPTR, $24 2694 2695 // func ManhattanDistance_AVX2_F64(x []float64, y []float64) float64 2696 // Requires: AVX, SSE2 2697 TEXT ·ManhattanDistance_AVX2_F64(SB), NOSPLIT, $0-56 2698 MOVQ x_base+0(FP), DI 2699 MOVQ y_base+24(FP), SI 2700 MOVQ x_len+8(FP), DX 2701 TESTQ DX, DX 2702 JE LBB8_1 2703 CMPQ DX, $0x10 2704 JAE LBB8_4 2705 VXORPD X0, X0, X0 2706 XORL AX, AX 2707 JMP LBB8_7 2708 2709 LBB8_1: 2710 VXORPS X0, X0, X0 2711 MOVSD X0, ret+48(FP) 2712 RET 2713 2714 LBB8_4: 2715 MOVQ DX, AX 2716 ANDQ $-16, AX 2717 VXORPD X0, X0, X0 2718 VBROADCASTSD dataManhattanDistanceF64<>+0(SB), Y1 2719 XORL CX, CX 2720 VXORPD X2, X2, X2 2721 VXORPD X3, X3, X3 2722 VXORPD X4, X4, X4 2723 2724 LBB8_5: 2725 VMOVUPD (DI)(CX*8), Y5 2726 VMOVUPD 32(DI)(CX*8), Y6 2727 VMOVUPD 64(DI)(CX*8), Y7 2728 VMOVUPD 96(DI)(CX*8), Y8 2729 VSUBPD (SI)(CX*8), Y5, Y5 2730 VSUBPD 32(SI)(CX*8), Y6, Y6 2731 VSUBPD 64(SI)(CX*8), Y7, Y7 2732 VSUBPD 96(SI)(CX*8), Y8, Y8 2733 VANDPD Y1, Y5, Y5 2734 VADDPD Y0, Y5, Y0 2735 VANDPD Y1, Y6, Y5 2736 VADDPD Y2, Y5, Y2 2737 VANDPD Y1, Y7, Y5 2738 VADDPD Y3, Y5, Y3 2739 VANDPD Y1, Y8, Y5 2740 VADDPD Y4, Y5, Y4 2741 ADDQ $0x10, CX 2742 CMPQ AX, CX 2743 JNE LBB8_5 2744 VADDPD Y0, Y2, Y0 2745 VADDPD Y0, Y3, Y0 2746 VADDPD Y0, Y4, Y0 2747 VEXTRACTF128 $0x01, Y0, X1 2748 VADDPD X1, X0, X0 2749 VPERMILPD $0x01, X0, X1 2750 VADDSD X1, X0, X0 2751 CMPQ AX, DX 2752 JE LBB8_9 2753 2754 LBB8_7: 2755 VMOVUPD dataManhattanDistanceF64<>+8(SB), X1 2756 2757 LBB8_8: 2758 VMOVSD (DI)(AX*8), X2 2759 VSUBSD (SI)(AX*8), X2, X2 2760 VANDPD X1, X2, X2 2761 VADDSD X0, X2, X0 2762 ADDQ $0x01, AX 2763 CMPQ DX, AX 2764 JNE LBB8_8 2765 2766 LBB8_9: 2767 VZEROUPPER 2768 MOVSD X0, ret+48(FP) 2769 RET 2770 2771 DATA dataManhattanDistanceF32<>+0(SB)/4, $0x7fffffff 2772 GLOBL dataManhattanDistanceF32<>(SB), RODATA|NOPTR, $4 2773 2774 // func ManhattanDistance_AVX2_F32(x []float32, y []float32) float32 2775 // Requires: AVX, SSE 2776 TEXT ·ManhattanDistance_AVX2_F32(SB), NOSPLIT, $0-52 2777 MOVQ x_base+0(FP), DI 2778 MOVQ y_base+24(FP), SI 2779 MOVQ x_len+8(FP), DX 2780 TESTQ DX, DX 2781 JE LBB9_1 2782 CMPQ DX, $0x20 2783 JAE LBB9_4 2784 VXORPS X0, X0, X0 2785 XORL AX, AX 2786 JMP LBB9_7 2787 2788 LBB9_1: 2789 VXORPS X0, X0, X0 2790 MOVSS X0, ret+48(FP) 2791 RET 2792 2793 LBB9_4: 2794 MOVQ DX, AX 2795 ANDQ $-32, AX 2796 VXORPS X0, X0, X0 2797 VBROADCASTSS dataManhattanDistanceF32<>+0(SB), Y1 2798 XORL CX, CX 2799 VXORPS X2, X2, X2 2800 VXORPS X3, X3, X3 2801 VXORPS X4, X4, X4 2802 2803 LBB9_5: 2804 VMOVUPS (DI)(CX*4), Y5 2805 VMOVUPS 32(DI)(CX*4), Y6 2806 VMOVUPS 64(DI)(CX*4), Y7 2807 VMOVUPS 96(DI)(CX*4), Y8 2808 VSUBPS (SI)(CX*4), Y5, Y5 2809 VSUBPS 32(SI)(CX*4), Y6, Y6 2810 VSUBPS 64(SI)(CX*4), Y7, Y7 2811 VSUBPS 96(SI)(CX*4), Y8, Y8 2812 VANDPS Y1, Y5, Y5 2813 VADDPS Y0, Y5, Y0 2814 VANDPS Y1, Y6, Y5 2815 VADDPS Y2, Y5, Y2 2816 VANDPS Y1, Y7, Y5 2817 VADDPS Y3, Y5, Y3 2818 VANDPS Y1, Y8, Y5 2819 VADDPS Y4, Y5, Y4 2820 ADDQ $0x20, CX 2821 CMPQ AX, CX 
2822 JNE LBB9_5 2823 VADDPS Y0, Y2, Y0 2824 VADDPS Y0, Y3, Y0 2825 VADDPS Y0, Y4, Y0 2826 VEXTRACTF128 $0x01, Y0, X1 2827 VADDPS X1, X0, X0 2828 VPERMILPD $0x01, X0, X1 2829 VADDPS X1, X0, X0 2830 VMOVSHDUP X0, X1 2831 VADDSS X1, X0, X0 2832 CMPQ AX, DX 2833 JE LBB9_9 2834 2835 LBB9_7: 2836 VBROADCASTSS dataManhattanDistanceF32<>+0(SB), X1 2837 2838 LBB9_8: 2839 VMOVSS (DI)(AX*4), X2 2840 VSUBSS (SI)(AX*4), X2, X2 2841 VANDPS X1, X2, X2 2842 VADDSS X0, X2, X0 2843 ADDQ $0x01, AX 2844 CMPQ DX, AX 2845 JNE LBB9_8 2846 2847 LBB9_9: 2848 VZEROUPPER 2849 MOVSS X0, ret+48(FP) 2850 RET 2851 2852 // func CosineSimilarity_AVX2_F64(x []float64, y []float64) float64 2853 // Requires: AVX, FMA3, SSE2 2854 TEXT ·CosineSimilarity_AVX2_F64(SB), NOSPLIT, $0-56 2855 MOVQ x_base+0(FP), DI 2856 MOVQ y_base+24(FP), SI 2857 MOVQ x_len+8(FP), DX 2858 TESTQ DX, DX 2859 JE LBB2_1 2860 CMPQ DX, $0x08 2861 JAE LBB2_5 2862 VXORPD X1, X1, X1 2863 XORL AX, AX 2864 VXORPD X2, X2, X2 2865 VXORPD X0, X0, X0 2866 JMP LBB2_4 2867 2868 LBB2_1: 2869 VXORPD X0, X0, X0 2870 VXORPD X1, X1, X1 2871 VSQRTSD X1, X1, X1 2872 VDIVSD X1, X0, X0 2873 MOVSD X0, ret+48(FP) 2874 RET 2875 2876 LBB2_5: 2877 MOVQ DX, AX 2878 ANDQ $-8, AX 2879 VXORPD X1, X1, X1 2880 XORL CX, CX 2881 VXORPD X3, X3, X3 2882 VXORPD X2, X2, X2 2883 VXORPD X4, X4, X4 2884 VXORPD X0, X0, X0 2885 VXORPD X5, X5, X5 2886 2887 LBB2_6: 2888 VMOVUPD (DI)(CX*8), Y6 2889 VMOVUPD 32(DI)(CX*8), Y7 2890 VMOVUPD (SI)(CX*8), Y8 2891 VMOVUPD 32(SI)(CX*8), Y9 2892 VFMADD231PD Y6, Y8, Y0 2893 VFMADD231PD Y7, Y9, Y5 2894 VFMADD231PD Y6, Y6, Y2 2895 VFMADD231PD Y7, Y7, Y4 2896 VFMADD231PD Y8, Y8, Y1 2897 VFMADD231PD Y9, Y9, Y3 2898 ADDQ $0x08, CX 2899 CMPQ AX, CX 2900 JNE LBB2_6 2901 VADDPD Y0, Y5, Y0 2902 VEXTRACTF128 $0x01, Y0, X5 2903 VADDPD X5, X0, X0 2904 VPERMILPD $0x01, X0, X5 2905 VADDSD X5, X0, X0 2906 VADDPD Y2, Y4, Y2 2907 VEXTRACTF128 $0x01, Y2, X4 2908 VADDPD X4, X2, X2 2909 VPERMILPD $0x01, X2, X4 2910 VADDSD X4, X2, X2 2911 VADDPD Y1, Y3, Y1 2912 VEXTRACTF128 $0x01, Y1, X3 2913 VADDPD X3, X1, X1 2914 VPERMILPD $0x01, X1, X3 2915 VADDSD X3, X1, X1 2916 CMPQ AX, DX 2917 JE LBB2_8 2918 2919 LBB2_4: 2920 VMOVSD (DI)(AX*8), X3 2921 VMOVSD (SI)(AX*8), X4 2922 VFMADD231SD X3, X4, X0 2923 VFMADD231SD X3, X3, X2 2924 VFMADD231SD X4, X4, X1 2925 ADDQ $0x01, AX 2926 CMPQ DX, AX 2927 JNE LBB2_4 2928 2929 LBB2_8: 2930 VMULSD X2, X1, X1 2931 VSQRTSD X1, X1, X1 2932 VDIVSD X1, X0, X0 2933 VZEROUPPER 2934 MOVSD X0, ret+48(FP) 2935 RET 2936 2937 DATA dataCosineSimilarityF32<>+0(SB)/4, $0xc0400000 2938 DATA dataCosineSimilarityF32<>+4(SB)/4, $0xbf000000 2939 GLOBL dataCosineSimilarityF32<>(SB), RODATA|NOPTR, $8 2940 2941 // func CosineSimilarity_AVX2_F32(x []float32, y []float32) float32 2942 // Requires: AVX, FMA3, SSE 2943 TEXT ·CosineSimilarity_AVX2_F32(SB), NOSPLIT, $0-52 2944 MOVQ x_base+0(FP), DI 2945 MOVQ y_base+24(FP), SI 2946 MOVQ x_len+8(FP), DX 2947 TESTQ DX, DX 2948 JE LBB3_1 2949 CMPQ DX, $0x10 2950 JAE LBB3_5 2951 VXORPS X1, X1, X1 2952 XORL AX, AX 2953 VXORPS X2, X2, X2 2954 VXORPS X0, X0, X0 2955 JMP LBB3_4 2956 2957 LBB3_1: 2958 VXORPS X0, X0, X0 2959 VXORPS X1, X1, X1 2960 JMP LBB3_9 2961 2962 LBB3_5: 2963 MOVQ DX, AX 2964 ANDQ $-16, AX 2965 VXORPS X1, X1, X1 2966 XORL CX, CX 2967 VXORPS X3, X3, X3 2968 VXORPS X2, X2, X2 2969 VXORPS X4, X4, X4 2970 VXORPS X0, X0, X0 2971 VXORPS X5, X5, X5 2972 2973 LBB3_6: 2974 VMOVUPS (DI)(CX*4), Y6 2975 VMOVUPS 32(DI)(CX*4), Y7 2976 VMOVUPS (SI)(CX*4), Y8 2977 VMOVUPS 32(SI)(CX*4), Y9 2978 VFMADD231PS Y6, Y8, Y0 2979 VFMADD231PS 
Y7, Y9, Y5 2980 VFMADD231PS Y6, Y6, Y2 2981 VFMADD231PS Y7, Y7, Y4 2982 VFMADD231PS Y8, Y8, Y1 2983 VFMADD231PS Y9, Y9, Y3 2984 ADDQ $0x10, CX 2985 CMPQ AX, CX 2986 JNE LBB3_6 2987 VADDPS Y0, Y5, Y0 2988 VEXTRACTF128 $0x01, Y0, X5 2989 VADDPS X5, X0, X0 2990 VPERMILPD $0x01, X0, X5 2991 VADDPS X5, X0, X0 2992 VMOVSHDUP X0, X5 2993 VADDSS X5, X0, X0 2994 VADDPS Y2, Y4, Y2 2995 VEXTRACTF128 $0x01, Y2, X4 2996 VADDPS X4, X2, X2 2997 VPERMILPD $0x01, X2, X4 2998 VADDPS X4, X2, X2 2999 VMOVSHDUP X2, X4 3000 VADDSS X4, X2, X2 3001 VADDPS Y1, Y3, Y1 3002 VEXTRACTF128 $0x01, Y1, X3 3003 VADDPS X3, X1, X1 3004 VPERMILPD $0x01, X1, X3 3005 VADDPS X3, X1, X1 3006 VMOVSHDUP X1, X3 3007 VADDSS X3, X1, X1 3008 CMPQ AX, DX 3009 JE LBB3_8 3010 3011 LBB3_4: 3012 VMOVSS (DI)(AX*4), X3 3013 VMOVSS (SI)(AX*4), X4 3014 VFMADD231SS X3, X4, X0 3015 VFMADD231SS X3, X3, X2 3016 VFMADD231SS X4, X4, X1 3017 ADDQ $0x01, AX 3018 CMPQ DX, AX 3019 JNE LBB3_4 3020 3021 LBB3_8: 3022 VMULSS X2, X1, X1 3023 3024 LBB3_9: 3025 VRSQRTSS X1, X1, X2 3026 VMULSS X2, X1, X1 3027 VFMADD213SS dataCosineSimilarityF32<>+0(SB), X2, X1 3028 VMULSS dataCosineSimilarityF32<>+4(SB), X2, X2 3029 VMULSS X0, X2, X0 3030 VMULSS X0, X1, X0 3031 VZEROUPPER 3032 MOVSS X0, ret+48(FP) 3033 RET 3034 3035 // func Mat4Mul_AVX2_F64(x []float64, y []float64, z []float64) 3036 // Requires: AVX, FMA3 3037 TEXT ·Mat4Mul_AVX2_F64(SB), NOSPLIT, $0-72 3038 MOVQ x_base+0(FP), DI 3039 MOVQ y_base+24(FP), SI 3040 MOVQ z_base+48(FP), DX 3041 VBROADCASTSD (SI), Y0 3042 VMOVUPD (DX), Y1 3043 VMOVUPD 32(DX), Y2 3044 VMOVUPD 64(DX), Y3 3045 VMOVUPD 96(DX), Y4 3046 VMULPD Y0, Y1, Y0 3047 VBROADCASTSD 8(SI), Y5 3048 VFMADD213PD Y0, Y2, Y5 3049 VBROADCASTSD 16(SI), Y0 3050 VFMADD213PD Y5, Y3, Y0 3051 VBROADCASTSD 24(SI), Y5 3052 VFMADD213PD Y0, Y4, Y5 3053 VMOVUPD Y5, (DI) 3054 VBROADCASTSD 32(SI), Y0 3055 VMULPD Y0, Y1, Y0 3056 VBROADCASTSD 40(SI), Y1 3057 VFMADD213PD Y0, Y2, Y1 3058 VBROADCASTSD 48(SI), Y0 3059 VFMADD213PD Y1, Y3, Y0 3060 VBROADCASTSD 56(SI), Y1 3061 VFMADD213PD Y0, Y4, Y1 3062 VMOVUPD Y1, 32(DI) 3063 VBROADCASTSD 64(SI), Y0 3064 VMOVUPD (DX), Y1 3065 VMOVUPD 32(DX), Y2 3066 VMOVUPD 64(DX), Y3 3067 VMOVUPD 96(DX), Y4 3068 VMULPD Y0, Y1, Y0 3069 VBROADCASTSD 72(SI), Y5 3070 VFMADD213PD Y0, Y2, Y5 3071 VBROADCASTSD 80(SI), Y0 3072 VFMADD213PD Y5, Y3, Y0 3073 VBROADCASTSD 88(SI), Y5 3074 VFMADD213PD Y0, Y4, Y5 3075 VMOVUPD Y5, 64(DI) 3076 VBROADCASTSD 96(SI), Y0 3077 VMULPD Y0, Y1, Y0 3078 VBROADCASTSD 104(SI), Y1 3079 VFMADD213PD Y0, Y2, Y1 3080 VBROADCASTSD 112(SI), Y0 3081 VFMADD213PD Y1, Y3, Y0 3082 VBROADCASTSD 120(SI), Y1 3083 VFMADD213PD Y0, Y4, Y1 3084 VMOVUPD Y1, 96(DI) 3085 VZEROUPPER 3086 RET 3087 3088 // func Mat4Mul_AVX2_F32(x []float32, y []float32, z []float32) 3089 // Requires: AVX, AVX2, FMA3 3090 TEXT ·Mat4Mul_AVX2_F32(SB), NOSPLIT, $0-72 3091 MOVQ x_base+0(FP), DI 3092 MOVQ y_base+24(FP), SI 3093 MOVQ z_base+48(FP), DX 3094 VBROADCASTF128 (DX), Y0 3095 VBROADCASTF128 16(DX), Y1 3096 VBROADCASTF128 32(DX), Y2 3097 VBROADCASTF128 48(DX), Y3 3098 VMOVSS 16(SI), X4 3099 VMOVSS (SI), X5 3100 VSHUFPS $0x00, X4, X5, X4 3101 VMOVSS 4(SI), X5 3102 VMOVSS 8(SI), X6 3103 VMOVSS 12(SI), X7 3104 VPERMPD $0x50, Y4, Y4 3105 VMULPS Y4, Y0, Y0 3106 VMOVSS 20(SI), X4 3107 VSHUFPS $0x00, X4, X5, X4 3108 VPERMPD $0x50, Y4, Y4 3109 VFMADD213PS Y0, Y1, Y4 3110 VMOVSS 24(SI), X0 3111 VSHUFPS $0x00, X0, X6, X0 3112 VPERMPD $0x50, Y0, Y0 3113 VFMADD213PS Y4, Y2, Y0 3114 VMOVSS 28(SI), X1 3115 VSHUFPS $0x00, X1, X7, X1 3116 VPERMPD $0x50, Y1, Y1 3117 
VFMADD213PS Y0, Y3, Y1 3118 VBROADCASTF128 (DX), Y0 3119 VBROADCASTF128 16(DX), Y2 3120 VBROADCASTF128 32(DX), Y3 3121 VMOVUPS Y1, (DI) 3122 VBROADCASTF128 48(DX), Y1 3123 VMOVSS 48(SI), X4 3124 VMOVSS 32(SI), X5 3125 VSHUFPS $0x00, X4, X5, X4 3126 VMOVSS 36(SI), X5 3127 VMOVSS 40(SI), X6 3128 VMOVSS 44(SI), X7 3129 VPERMPD $0x50, Y4, Y4 3130 VMULPS Y4, Y0, Y0 3131 VMOVSS 52(SI), X4 3132 VSHUFPS $0x00, X4, X5, X4 3133 VPERMPD $0x50, Y4, Y4 3134 VFMADD213PS Y0, Y2, Y4 3135 VMOVSS 56(SI), X0 3136 VSHUFPS $0x00, X0, X6, X0 3137 VPERMPD $0x50, Y0, Y0 3138 VFMADD213PS Y4, Y3, Y0 3139 VMOVSS 60(SI), X2 3140 VSHUFPS $0x00, X2, X7, X2 3141 VPERMPD $0x50, Y2, Y2 3142 VFMADD213PS Y0, Y1, Y2 3143 VMOVUPS Y2, 32(DI) 3144 VZEROUPPER 3145 RET 3146 3147 // func MatMul_AVX2_F64(x []float64, y []float64, z []float64, a int, b int, c int) 3148 // Requires: AVX, AVX2, FMA3 3149 TEXT ·MatMul_AVX2_F64(SB), $8-96 3150 MOVQ x_base+0(FP), DI 3151 MOVQ y_base+24(FP), SI 3152 MOVQ z_base+48(FP), DX 3153 MOVQ a+72(FP), CX 3154 MOVQ b+80(FP), R8 3155 MOVQ c+88(FP), R9 3156 PUSHQ BP 3157 PUSHQ R15 3158 PUSHQ R14 3159 PUSHQ R13 3160 PUSHQ R12 3161 PUSHQ BX 3162 MOVQ DX, -16(SP) 3163 MOVQ CX, -8(SP) 3164 TESTQ CX, CX 3165 JE LBB4_13 3166 TESTQ R8, R8 3167 JE LBB4_13 3168 TESTQ R9, R9 3169 JE LBB4_13 3170 MOVQ R9, R12 3171 ANDQ $-16, R12 3172 MOVQ -16(SP), AX 3173 LEAQ 96(AX), CX 3174 XORQ R15, R15 3175 LEAQ (R15)(R9*8), R11 3176 LEAQ 96(DI), BX 3177 XORL R14, R14 3178 JMP LBB4_4 3179 3180 LBB4_12: 3181 ADDQ $0x01, R14 3182 ADDQ R11, BX 3183 ADDQ R11, DI 3184 CMPQ R14, -8(SP) 3185 JE LBB4_13 3186 3187 LBB4_4: 3188 MOVQ R14, R15 3189 IMULQ R8, R15 3190 MOVQ -16(SP), R13 3191 MOVQ CX, AX 3192 XORL BP, BP 3193 JMP LBB4_5 3194 3195 LBB4_11: 3196 ADDQ $0x01, BP 3197 ADDQ R11, AX 3198 ADDQ R11, R13 3199 CMPQ BP, R8 3200 JE LBB4_12 3201 3202 LBB4_5: 3203 LEAQ (R15)(BP*1), DX 3204 VMOVSD (SI)(DX*8), X0 3205 CMPQ R9, $0x10 3206 JAE LBB4_7 3207 XORL DX, DX 3208 JMP LBB4_10 3209 3210 LBB4_7: 3211 VBROADCASTSD X0, Y1 3212 XORL R10, R10 3213 3214 LBB4_8: 3215 VMOVUPD -96(AX)(R10*8), Y2 3216 VMOVUPD -64(AX)(R10*8), Y3 3217 VMOVUPD -32(AX)(R10*8), Y4 3218 VMOVUPD (AX)(R10*8), Y5 3219 VFMADD213PD -96(BX)(R10*8), Y1, Y2 3220 VFMADD213PD -64(BX)(R10*8), Y1, Y3 3221 VFMADD213PD -32(BX)(R10*8), Y1, Y4 3222 VFMADD213PD (BX)(R10*8), Y1, Y5 3223 VMOVUPD Y2, -96(BX)(R10*8) 3224 VMOVUPD Y3, -64(BX)(R10*8) 3225 VMOVUPD Y4, -32(BX)(R10*8) 3226 VMOVUPD Y5, (BX)(R10*8) 3227 ADDQ $0x10, R10 3228 CMPQ R12, R10 3229 JNE LBB4_8 3230 MOVQ R12, DX 3231 CMPQ R12, R9 3232 JE LBB4_11 3233 3234 LBB4_10: 3235 VMOVSD (R13)(DX*8), X1 3236 VFMADD213SD (DI)(DX*8), X0, X1 3237 VMOVSD X1, (DI)(DX*8) 3238 ADDQ $0x01, DX 3239 CMPQ R9, DX 3240 JNE LBB4_10 3241 JMP LBB4_11 3242 3243 LBB4_13: 3244 POPQ BX 3245 POPQ R12 3246 POPQ R13 3247 POPQ R14 3248 POPQ R15 3249 POPQ BP 3250 VZEROUPPER 3251 RET 3252 3253 // func MatMul_AVX2_F32(x []float32, y []float32, z []float32, a int, b int, c int) 3254 // Requires: AVX, AVX2, FMA3 3255 TEXT ·MatMul_AVX2_F32(SB), $8-96 3256 MOVQ x_base+0(FP), DI 3257 MOVQ y_base+24(FP), SI 3258 MOVQ z_base+48(FP), DX 3259 MOVQ a+72(FP), CX 3260 MOVQ b+80(FP), R8 3261 MOVQ c+88(FP), R9 3262 PUSHQ BP 3263 PUSHQ R15 3264 PUSHQ R14 3265 PUSHQ R13 3266 PUSHQ R12 3267 PUSHQ BX 3268 MOVQ DX, -16(SP) 3269 MOVQ CX, -8(SP) 3270 TESTQ CX, CX 3271 JE LBB5_13 3272 TESTQ R8, R8 3273 JE LBB5_13 3274 TESTQ R9, R9 3275 JE LBB5_13 3276 MOVQ R9, R12 3277 ANDQ $-32, R12 3278 MOVQ -16(SP), AX 3279 LEAQ 96(AX), CX 3280 XORQ R15, R15 3281 LEAQ (R15)(R9*4), R11 3282 
LEAQ 96(DI), BX 3283 XORL R14, R14 3284 JMP LBB5_4 3285 3286 LBB5_12: 3287 ADDQ $0x01, R14 3288 ADDQ R11, BX 3289 ADDQ R11, DI 3290 CMPQ R14, -8(SP) 3291 JE LBB5_13 3292 3293 LBB5_4: 3294 MOVQ R14, R15 3295 IMULQ R8, R15 3296 MOVQ -16(SP), R13 3297 MOVQ CX, AX 3298 XORL BP, BP 3299 JMP LBB5_5 3300 3301 LBB5_11: 3302 ADDQ $0x01, BP 3303 ADDQ R11, AX 3304 ADDQ R11, R13 3305 CMPQ BP, R8 3306 JE LBB5_12 3307 3308 LBB5_5: 3309 LEAQ (R15)(BP*1), DX 3310 VMOVSS (SI)(DX*4), X0 3311 CMPQ R9, $0x20 3312 JAE LBB5_7 3313 XORL DX, DX 3314 JMP LBB5_10 3315 3316 LBB5_7: 3317 VBROADCASTSS X0, Y1 3318 XORL R10, R10 3319 3320 LBB5_8: 3321 VMOVUPS -96(AX)(R10*4), Y2 3322 VMOVUPS -64(AX)(R10*4), Y3 3323 VMOVUPS -32(AX)(R10*4), Y4 3324 VMOVUPS (AX)(R10*4), Y5 3325 VFMADD213PS -96(BX)(R10*4), Y1, Y2 3326 VFMADD213PS -64(BX)(R10*4), Y1, Y3 3327 VFMADD213PS -32(BX)(R10*4), Y1, Y4 3328 VFMADD213PS (BX)(R10*4), Y1, Y5 3329 VMOVUPS Y2, -96(BX)(R10*4) 3330 VMOVUPS Y3, -64(BX)(R10*4) 3331 VMOVUPS Y4, -32(BX)(R10*4) 3332 VMOVUPS Y5, (BX)(R10*4) 3333 ADDQ $0x20, R10 3334 CMPQ R12, R10 3335 JNE LBB5_8 3336 MOVQ R12, DX 3337 CMPQ R12, R9 3338 JE LBB5_11 3339 3340 LBB5_10: 3341 VMOVSS (R13)(DX*4), X1 3342 VFMADD213SS (DI)(DX*4), X0, X1 3343 VMOVSS X1, (DI)(DX*4) 3344 ADDQ $0x01, DX 3345 CMPQ R9, DX 3346 JNE LBB5_10 3347 JMP LBB5_11 3348 3349 LBB5_13: 3350 POPQ BX 3351 POPQ R12 3352 POPQ R13 3353 POPQ R14 3354 POPQ R15 3355 POPQ BP 3356 VZEROUPPER 3357 RET 3358 3359 // func MatMulVec_AVX2_F64(x []float64, y []float64, z []float64, a int, b int) 3360 // Requires: AVX, FMA3 3361 TEXT ·MatMulVec_AVX2_F64(SB), $0-88 3362 MOVQ x_base+0(FP), DI 3363 MOVQ y_base+24(FP), SI 3364 MOVQ z_base+48(FP), DX 3365 MOVQ a+72(FP), CX 3366 MOVQ b+80(FP), R8 3367 PUSHQ BX 3368 TESTQ CX, CX 3369 JE LBB6_10 3370 TESTQ R8, R8 3371 JE LBB6_10 3372 MOVQ R8, R9 3373 ANDQ $-16, R9 3374 LEAQ 96(SI), AX 3375 XORQ R10, R10 3376 LEAQ (R10)(R8*8), R10 3377 XORL R11, R11 3378 JMP LBB6_3 3379 3380 LBB6_9: 3381 VMOVSD X0, (DI)(R11*8) 3382 ADDQ $0x01, R11 3383 ADDQ R10, AX 3384 ADDQ R10, SI 3385 CMPQ R11, CX 3386 JE LBB6_10 3387 3388 LBB6_3: 3389 VMOVQ (DI)(R11*8), X0 3390 CMPQ R8, $0x10 3391 JAE LBB6_5 3392 XORL BX, BX 3393 JMP LBB6_8 3394 3395 LBB6_5: 3396 VMOVQ X0, X0 3397 VXORPD X1, X1, X1 3398 XORL BX, BX 3399 VXORPD X2, X2, X2 3400 VXORPD X3, X3, X3 3401 3402 LBB6_6: 3403 VMOVUPD (DX)(BX*8), Y4 3404 VMOVUPD 32(DX)(BX*8), Y5 3405 VMOVUPD 64(DX)(BX*8), Y6 3406 VMOVUPD 96(DX)(BX*8), Y7 3407 VFMADD231PD -96(AX)(BX*8), Y4, Y0 3408 VFMADD231PD -64(AX)(BX*8), Y5, Y1 3409 VFMADD231PD -32(AX)(BX*8), Y6, Y2 3410 VFMADD231PD (AX)(BX*8), Y7, Y3 3411 ADDQ $0x10, BX 3412 CMPQ R9, BX 3413 JNE LBB6_6 3414 VADDPD Y0, Y1, Y0 3415 VADDPD Y0, Y2, Y0 3416 VADDPD Y0, Y3, Y0 3417 VEXTRACTF128 $0x01, Y0, X1 3418 VADDPD X1, X0, X0 3419 VPERMILPD $0x01, X0, X1 3420 VADDSD X1, X0, X0 3421 MOVQ R9, BX 3422 CMPQ R9, R8 3423 JE LBB6_9 3424 3425 LBB6_8: 3426 VMOVSD (DX)(BX*8), X1 3427 VFMADD231SD (SI)(BX*8), X1, X0 3428 ADDQ $0x01, BX 3429 CMPQ R8, BX 3430 JNE LBB6_8 3431 JMP LBB6_9 3432 3433 LBB6_10: 3434 POPQ BX 3435 VZEROUPPER 3436 RET 3437 3438 // func MatMulVec_AVX2_F32(x []float32, y []float32, z []float32, a int, b int) 3439 // Requires: AVX, FMA3 3440 TEXT ·MatMulVec_AVX2_F32(SB), $0-88 3441 MOVQ x_base+0(FP), DI 3442 MOVQ y_base+24(FP), SI 3443 MOVQ z_base+48(FP), DX 3444 MOVQ a+72(FP), CX 3445 MOVQ b+80(FP), R8 3446 PUSHQ BX 3447 TESTQ CX, CX 3448 JE LBB7_10 3449 TESTQ R8, R8 3450 JE LBB7_10 3451 MOVQ R8, R9 3452 ANDQ $-32, R9 3453 LEAQ 96(SI), AX 3454 XORQ R10, R10 3455 
LEAQ (R10)(R8*4), R10 3456 XORL R11, R11 3457 VXORPS X0, X0, X0 3458 JMP LBB7_3 3459 3460 LBB7_9: 3461 VMOVSS X1, (DI)(R11*4) 3462 ADDQ $0x01, R11 3463 ADDQ R10, AX 3464 ADDQ R10, SI 3465 CMPQ R11, CX 3466 JE LBB7_10 3467 3468 LBB7_3: 3469 VMOVSS (DI)(R11*4), X1 3470 CMPQ R8, $0x20 3471 JAE LBB7_5 3472 XORL BX, BX 3473 JMP LBB7_8 3474 3475 LBB7_5: 3476 VBLENDPS $0x01, X1, X0, X1 3477 VXORPS X2, X2, X2 3478 XORL BX, BX 3479 VXORPS X3, X3, X3 3480 VXORPS X4, X4, X4 3481 3482 LBB7_6: 3483 VMOVUPS (DX)(BX*4), Y5 3484 VMOVUPS 32(DX)(BX*4), Y6 3485 VMOVUPS 64(DX)(BX*4), Y7 3486 VMOVUPS 96(DX)(BX*4), Y8 3487 VFMADD231PS -96(AX)(BX*4), Y5, Y1 3488 VFMADD231PS -64(AX)(BX*4), Y6, Y2 3489 VFMADD231PS -32(AX)(BX*4), Y7, Y3 3490 VFMADD231PS (AX)(BX*4), Y8, Y4 3491 ADDQ $0x20, BX 3492 CMPQ R9, BX 3493 JNE LBB7_6 3494 VADDPS Y1, Y2, Y1 3495 VADDPS Y1, Y3, Y1 3496 VADDPS Y1, Y4, Y1 3497 VEXTRACTF128 $0x01, Y1, X2 3498 VADDPS X2, X1, X1 3499 VPERMILPD $0x01, X1, X2 3500 VADDPS X2, X1, X1 3501 VMOVSHDUP X1, X2 3502 VADDSS X2, X1, X1 3503 MOVQ R9, BX 3504 CMPQ R9, R8 3505 JE LBB7_9 3506 3507 LBB7_8: 3508 VMOVSS (DX)(BX*4), X2 3509 VFMADD231SS (SI)(BX*4), X2, X1 3510 ADDQ $0x01, BX 3511 CMPQ R8, BX 3512 JNE LBB7_8 3513 JMP LBB7_9 3514 3515 LBB7_10: 3516 POPQ BX 3517 VZEROUPPER 3518 RET 3519 3520 // func MatMulTiled_AVX2_F64(x []float64, y []float64, z []float64, a int, b int, c int) 3521 // Requires: AVX, AVX2, CMOV, FMA3 3522 TEXT ·MatMulTiled_AVX2_F64(SB), $8-96 3523 MOVQ x_base+0(FP), DI 3524 MOVQ y_base+24(FP), SI 3525 MOVQ z_base+48(FP), DX 3526 MOVQ a+72(FP), CX 3527 MOVQ b+80(FP), R8 3528 MOVQ c+88(FP), R9 3529 PUSHQ BP 3530 PUSHQ R15 3531 PUSHQ R14 3532 PUSHQ R13 3533 PUSHQ R12 3534 PUSHQ BX 3535 SUBQ $0x48, SP 3536 MOVQ R9, -128(SP) 3537 MOVQ R8, -104(SP) 3538 MOVQ DX, -88(SP) 3539 MOVQ DI, -112(SP) 3540 MOVQ CX, -64(SP) 3541 ADDQ $0x07, CX 3542 MOVQ CX, -72(SP) 3543 JE LBB8_21 3544 MOVQ -104(SP), AX 3545 ADDQ $0xff, AX 3546 MOVQ AX, 8(SP) 3547 JE LBB8_21 3548 MOVQ -128(SP), AX 3549 ADDQ $0xff, AX 3550 MOVQ AX, -40(SP) 3551 JE LBB8_21 3552 MOVQ -88(SP), AX 3553 ADDQ $0x60, AX 3554 MOVQ AX, -48(SP) 3555 MOVQ -128(SP), AX 3556 XORQ R15, R15 3557 LEAQ (R15)(AX*8), BX 3558 MOVQ -112(SP), CX 3559 ADDQ $0x60, CX 3560 MOVQ CX, -96(SP) 3561 SHLQ $0x06, AX 3562 MOVQ AX, -80(SP) 3563 XORL DX, DX 3564 JMP LBB8_4 3565 3566 LBB8_20: 3567 MOVQ -80(SP), AX 3568 ADDQ AX, -96(SP) 3569 ADDQ AX, -112(SP) 3570 MOVQ -56(SP), AX 3571 MOVQ AX, DX 3572 CMPQ AX, -72(SP) 3573 JAE LBB8_21 3574 3575 LBB8_4: 3576 LEAQ 8(DX), AX 3577 MOVQ -64(SP), CX 3578 CMPQ AX, CX 3579 MOVQ AX, -56(SP) 3580 CMOVQGT CX, AX 3581 CDQE 3582 MOVQ DX, -16(SP) 3583 MOVQ AX, 24(SP) 3584 CMPQ DX, AX 3585 JAE LBB8_20 3586 XORL AX, AX 3587 MOVQ AX, -120(SP) 3588 MOVL $+256, DX 3589 XORL AX, AX 3590 JMP LBB8_6 3591 3592 LBB8_19: 3593 MOVQ -120(SP), AX 3594 ADDL $0x01, AX 3595 MOVQ AX, -120(SP) 3596 MOVQ -24(SP), DX 3597 ADDQ $+256, DX 3598 MOVQ -32(SP), AX 3599 CMPQ AX, -40(SP) 3600 JAE LBB8_20 3601 3602 LBB8_6: 3603 MOVL AX, DI 3604 MOVQ -128(SP), BP 3605 CMPQ BP, DX 3606 MOVQ DX, -24(SP) 3607 CMOVQLT BP, DX 3608 ADDQ $+256, AX 3609 CMPQ BP, AX 3610 MOVQ AX, CX 3611 CMOVQLT BP, CX 3612 MOVQ AX, -32(SP) 3613 CMOVQLT BP, AX 3614 CMPL DI, AX 3615 JGE LBB8_19 3616 MOVLQSX DI, R14 3617 MOVQ -96(SP), DI 3618 LEAQ (DI)(R14*8), DI 3619 MOVQ DI, (SP) 3620 MOVLQSX DX, R11 3621 SUBQ R14, R11 3622 ANDQ $-16, R11 3623 MOVLQSX CX, R12 3624 MOVQ -120(SP), CX 3625 SHLL $0x08, CX 3626 MOVLQSX CX, CX 3627 SUBQ CX, R12 3628 MOVLQSX AX, DX 3629 MOVQ R12, CX 3630 ANDQ $-16, 
CX 3631 MOVQ -48(SP), AX 3632 LEAQ (AX)(R14*8), AX 3633 MOVQ AX, -8(SP) 3634 MOVQ R14, R13 3635 MOVQ CX, 64(SP) 3636 ADDQ CX, R13 3637 XORL AX, AX 3638 JMP LBB8_8 3639 3640 LBB8_18: 3641 MOVQ 16(SP), AX 3642 CMPQ AX, 8(SP) 3643 JAE LBB8_19 3644 3645 LBB8_8: 3646 MOVL AX, CX 3647 ADDQ $+256, AX 3648 MOVQ -104(SP), DI 3649 CMPQ AX, DI 3650 MOVQ AX, 16(SP) 3651 CMOVQGT DI, AX 3652 CMPL CX, AX 3653 JGE LBB8_18 3654 MOVLQSX CX, DI 3655 MOVQ -128(SP), CX 3656 MOVQ DI, 48(SP) 3657 IMULQ DI, CX 3658 MOVQ -88(SP), DI 3659 LEAQ (DI)(CX*8), DI 3660 MOVQ DI, 40(SP) 3661 MOVQ -8(SP), DI 3662 LEAQ (DI)(CX*8), CX 3663 MOVQ CX, 32(SP) 3664 CDQE 3665 MOVQ -112(SP), CX 3666 MOVQ (SP), R10 3667 MOVQ -16(SP), R8 3668 JMP LBB8_10 3669 3670 LBB8_17: 3671 MOVQ 56(SP), R8 3672 ADDQ $0x01, R8 3673 ADDQ BX, R10 3674 ADDQ BX, CX 3675 CMPQ R8, 24(SP) 3676 JAE LBB8_18 3677 3678 LBB8_10: 3679 MOVQ R8, 56(SP) 3680 IMULQ -104(SP), R8 3681 MOVQ 40(SP), R15 3682 MOVQ 32(SP), DI 3683 MOVQ 48(SP), R9 3684 JMP LBB8_11 3685 3686 LBB8_16: 3687 ADDQ $0x01, R9 3688 ADDQ BX, DI 3689 ADDQ BX, R15 3690 CMPQ R9, AX 3691 JGE LBB8_17 3692 3693 LBB8_11: 3694 LEAQ (R9)(R8*1), BP 3695 VMOVSD (SI)(BP*8), X0 3696 MOVQ R14, BP 3697 CMPQ R12, $0x10 3698 JB LBB8_15 3699 VBROADCASTSD X0, Y1 3700 XORL BP, BP 3701 3702 LBB8_13: 3703 VMOVUPD -96(DI)(BP*8), Y2 3704 VMOVUPD -64(DI)(BP*8), Y3 3705 VMOVUPD -32(DI)(BP*8), Y4 3706 VMOVUPD (DI)(BP*8), Y5 3707 VFMADD213PD -96(R10)(BP*8), Y1, Y2 3708 VFMADD213PD -64(R10)(BP*8), Y1, Y3 3709 VFMADD213PD -32(R10)(BP*8), Y1, Y4 3710 VFMADD213PD (R10)(BP*8), Y1, Y5 3711 VMOVUPD Y2, -96(R10)(BP*8) 3712 VMOVUPD Y3, -64(R10)(BP*8) 3713 VMOVUPD Y4, -32(R10)(BP*8) 3714 VMOVUPD Y5, (R10)(BP*8) 3715 ADDQ $0x10, BP 3716 CMPQ R11, BP 3717 JNE LBB8_13 3718 MOVQ R13, BP 3719 CMPQ R12, 64(SP) 3720 JE LBB8_16 3721 3722 LBB8_15: 3723 VMOVSD (R15)(BP*8), X1 3724 VFMADD213SD (CX)(BP*8), X0, X1 3725 VMOVSD X1, (CX)(BP*8) 3726 ADDQ $0x01, BP 3727 CMPQ BP, DX 3728 JL LBB8_15 3729 JMP LBB8_16 3730 3731 LBB8_21: 3732 ADDQ $0x48, SP 3733 POPQ BX 3734 POPQ R12 3735 POPQ R13 3736 POPQ R14 3737 POPQ R15 3738 POPQ BP 3739 VZEROUPPER 3740 RET 3741 3742 // func MatMulTiled_AVX2_F32(x []float32, y []float32, z []float32, a int, b int, c int) 3743 // Requires: AVX, AVX2, CMOV, FMA3 3744 TEXT ·MatMulTiled_AVX2_F32(SB), $8-96 3745 MOVQ x_base+0(FP), DI 3746 MOVQ y_base+24(FP), SI 3747 MOVQ z_base+48(FP), DX 3748 MOVQ a+72(FP), CX 3749 MOVQ b+80(FP), R8 3750 MOVQ c+88(FP), R9 3751 PUSHQ BP 3752 PUSHQ R15 3753 PUSHQ R14 3754 PUSHQ R13 3755 PUSHQ R12 3756 PUSHQ BX 3757 SUBQ $0x48, SP 3758 MOVQ R9, -128(SP) 3759 MOVQ R8, -104(SP) 3760 MOVQ DX, -88(SP) 3761 MOVQ DI, -112(SP) 3762 MOVQ CX, -64(SP) 3763 ADDQ $0x07, CX 3764 MOVQ CX, -72(SP) 3765 JE LBB9_21 3766 MOVQ -104(SP), AX 3767 ADDQ $0xff, AX 3768 MOVQ AX, 8(SP) 3769 JE LBB9_21 3770 MOVQ -128(SP), AX 3771 ADDQ $0xff, AX 3772 MOVQ AX, -40(SP) 3773 JE LBB9_21 3774 MOVQ -88(SP), AX 3775 ADDQ $0x60, AX 3776 MOVQ AX, -48(SP) 3777 MOVQ -128(SP), AX 3778 XORQ R15, R15 3779 LEAQ (R15)(AX*4), BX 3780 MOVQ -112(SP), CX 3781 ADDQ $0x60, CX 3782 MOVQ CX, -96(SP) 3783 SHLQ $0x05, AX 3784 MOVQ AX, -80(SP) 3785 XORL DX, DX 3786 JMP LBB9_4 3787 3788 LBB9_20: 3789 MOVQ -80(SP), AX 3790 ADDQ AX, -96(SP) 3791 ADDQ AX, -112(SP) 3792 MOVQ -56(SP), AX 3793 MOVQ AX, DX 3794 CMPQ AX, -72(SP) 3795 JAE LBB9_21 3796 3797 LBB9_4: 3798 LEAQ 8(DX), AX 3799 MOVQ -64(SP), CX 3800 CMPQ AX, CX 3801 MOVQ AX, -56(SP) 3802 CMOVQGT CX, AX 3803 CDQE 3804 MOVQ DX, -16(SP) 3805 MOVQ AX, 24(SP) 3806 CMPQ DX, AX 3807 JAE LBB9_20 3808 
XORL AX, AX 3809 MOVQ AX, -120(SP) 3810 MOVL $+256, DX 3811 XORL AX, AX 3812 JMP LBB9_6 3813 3814 LBB9_19: 3815 MOVQ -120(SP), AX 3816 ADDL $0x01, AX 3817 MOVQ AX, -120(SP) 3818 MOVQ -24(SP), DX 3819 ADDQ $+256, DX 3820 MOVQ -32(SP), AX 3821 CMPQ AX, -40(SP) 3822 JAE LBB9_20 3823 3824 LBB9_6: 3825 MOVL AX, DI 3826 MOVQ -128(SP), BP 3827 CMPQ BP, DX 3828 MOVQ DX, -24(SP) 3829 CMOVQLT BP, DX 3830 ADDQ $+256, AX 3831 CMPQ BP, AX 3832 MOVQ AX, CX 3833 CMOVQLT BP, CX 3834 MOVQ AX, -32(SP) 3835 CMOVQLT BP, AX 3836 CMPL DI, AX 3837 JGE LBB9_19 3838 MOVLQSX DI, R14 3839 MOVQ -96(SP), DI 3840 LEAQ (DI)(R14*4), DI 3841 MOVQ DI, (SP) 3842 MOVLQSX DX, R11 3843 SUBQ R14, R11 3844 ANDQ $-32, R11 3845 MOVLQSX CX, R12 3846 MOVQ -120(SP), CX 3847 SHLL $0x08, CX 3848 MOVLQSX CX, CX 3849 SUBQ CX, R12 3850 MOVLQSX AX, DX 3851 MOVQ R12, CX 3852 ANDQ $-32, CX 3853 MOVQ -48(SP), AX 3854 LEAQ (AX)(R14*4), AX 3855 MOVQ AX, -8(SP) 3856 MOVQ R14, R13 3857 MOVQ CX, 64(SP) 3858 ADDQ CX, R13 3859 XORL AX, AX 3860 JMP LBB9_8 3861 3862 LBB9_18: 3863 MOVQ 16(SP), AX 3864 CMPQ AX, 8(SP) 3865 JAE LBB9_19 3866 3867 LBB9_8: 3868 MOVL AX, CX 3869 ADDQ $+256, AX 3870 MOVQ -104(SP), DI 3871 CMPQ AX, DI 3872 MOVQ AX, 16(SP) 3873 CMOVQGT DI, AX 3874 CMPL CX, AX 3875 JGE LBB9_18 3876 MOVLQSX CX, DI 3877 MOVQ -128(SP), CX 3878 MOVQ DI, 48(SP) 3879 IMULQ DI, CX 3880 MOVQ -88(SP), DI 3881 LEAQ (DI)(CX*4), DI 3882 MOVQ DI, 40(SP) 3883 MOVQ -8(SP), DI 3884 LEAQ (DI)(CX*4), CX 3885 MOVQ CX, 32(SP) 3886 CDQE 3887 MOVQ -112(SP), CX 3888 MOVQ (SP), R10 3889 MOVQ -16(SP), R8 3890 JMP LBB9_10 3891 3892 LBB9_17: 3893 MOVQ 56(SP), R8 3894 ADDQ $0x01, R8 3895 ADDQ BX, R10 3896 ADDQ BX, CX 3897 CMPQ R8, 24(SP) 3898 JAE LBB9_18 3899 3900 LBB9_10: 3901 MOVQ R8, 56(SP) 3902 IMULQ -104(SP), R8 3903 MOVQ 40(SP), R15 3904 MOVQ 32(SP), DI 3905 MOVQ 48(SP), R9 3906 JMP LBB9_11 3907 3908 LBB9_16: 3909 ADDQ $0x01, R9 3910 ADDQ BX, DI 3911 ADDQ BX, R15 3912 CMPQ R9, AX 3913 JGE LBB9_17 3914 3915 LBB9_11: 3916 LEAQ (R9)(R8*1), BP 3917 VMOVSS (SI)(BP*4), X0 3918 MOVQ R14, BP 3919 CMPQ R12, $0x20 3920 JB LBB9_15 3921 VBROADCASTSS X0, Y1 3922 XORL BP, BP 3923 3924 LBB9_13: 3925 VMOVUPS -96(DI)(BP*4), Y2 3926 VMOVUPS -64(DI)(BP*4), Y3 3927 VMOVUPS -32(DI)(BP*4), Y4 3928 VMOVUPS (DI)(BP*4), Y5 3929 VFMADD213PS -96(R10)(BP*4), Y1, Y2 3930 VFMADD213PS -64(R10)(BP*4), Y1, Y3 3931 VFMADD213PS -32(R10)(BP*4), Y1, Y4 3932 VFMADD213PS (R10)(BP*4), Y1, Y5 3933 VMOVUPS Y2, -96(R10)(BP*4) 3934 VMOVUPS Y3, -64(R10)(BP*4) 3935 VMOVUPS Y4, -32(R10)(BP*4) 3936 VMOVUPS Y5, (R10)(BP*4) 3937 ADDQ $0x20, BP 3938 CMPQ R11, BP 3939 JNE LBB9_13 3940 MOVQ R13, BP 3941 CMPQ R12, 64(SP) 3942 JE LBB9_16 3943 3944 LBB9_15: 3945 VMOVSS (R15)(BP*4), X1 3946 VFMADD213SS (CX)(BP*4), X0, X1 3947 VMOVSS X1, (CX)(BP*4) 3948 ADDQ $0x01, BP 3949 CMPQ BP, DX 3950 JL LBB9_15 3951 JMP LBB9_16 3952 3953 LBB9_21: 3954 ADDQ $0x48, SP 3955 POPQ BX 3956 POPQ R12 3957 POPQ R13 3958 POPQ R14 3959 POPQ R15 3960 POPQ BP 3961 VZEROUPPER 3962 RET 3963 3964 // func Sqrt_AVX2_F64(x []float64) float64 3965 // Requires: AVX, SSE2 3966 TEXT ·Sqrt_AVX2_F64(SB), NOSPLIT, $0-32 3967 MOVQ x_base+0(FP), DI 3968 MOVQ x_len+8(FP), SI 3969 TESTQ SI, SI 3970 JE LBB0_7 3971 CMPQ SI, $0x04 3972 JAE LBB0_3 3973 XORL AX, AX 3974 JMP LBB0_6 3975 3976 LBB0_3: 3977 MOVQ SI, AX 3978 ANDQ $-4, AX 3979 XORL CX, CX 3980 3981 LBB0_4: 3982 VSQRTPD (DI)(CX*8), Y0 3983 VMOVUPD Y0, (DI)(CX*8) 3984 ADDQ $0x04, CX 3985 CMPQ AX, CX 3986 JNE LBB0_4 3987 CMPQ AX, SI 3988 JE LBB0_7 3989 3990 LBB0_6: 3991 VMOVSD (DI)(AX*8), X0 3992 VSQRTSD X0, X0, 
X0 3993 VMOVSD X0, (DI)(AX*8) 3994 ADDQ $0x01, AX 3995 CMPQ SI, AX 3996 JNE LBB0_6 3997 3998 LBB0_7: 3999 VZEROUPPER 4000 MOVSD X0, ret+24(FP) 4001 RET 4002 4003 DATA dataSqrtF32<>+0(SB)/4, $0xc0400000 4004 DATA dataSqrtF32<>+4(SB)/4, $0xbf000000 4005 DATA dataSqrtF32<>+8(SB)/4, $0x7fffffff 4006 DATA dataSqrtF32<>+12(SB)/4, $0x00800000 4007 GLOBL dataSqrtF32<>(SB), RODATA|NOPTR, $16 4008 4009 // func Sqrt_AVX2_F32(x []float32) float32 4010 // Requires: AVX, FMA3, SSE 4011 TEXT ·Sqrt_AVX2_F32(SB), NOSPLIT, $0-28 4012 MOVQ x_base+0(FP), DI 4013 MOVQ x_len+8(FP), SI 4014 TESTQ SI, SI 4015 JE LBB1_8 4016 CMPQ SI, $0x20 4017 JAE LBB1_3 4018 XORL AX, AX 4019 JMP LBB1_6 4020 4021 LBB1_3: 4022 MOVQ SI, AX 4023 ANDQ $-32, AX 4024 XORL CX, CX 4025 VBROADCASTSS dataSqrtF32<>+0(SB), Y0 4026 VBROADCASTSS dataSqrtF32<>+4(SB), Y1 4027 VBROADCASTSS dataSqrtF32<>+8(SB), Y2 4028 VBROADCASTSS dataSqrtF32<>+12(SB), Y3 4029 4030 LBB1_4: 4031 VMOVUPS (DI)(CX*4), Y4 4032 VMOVUPS 32(DI)(CX*4), Y5 4033 VMOVUPS 64(DI)(CX*4), Y6 4034 VRSQRTPS Y4, Y7 4035 VMOVUPS 96(DI)(CX*4), Y8 4036 VMULPS Y7, Y4, Y9 4037 VFMADD213PS Y0, Y9, Y7 4038 VMULPS Y1, Y9, Y9 4039 VMULPS Y7, Y9, Y7 4040 VANDPS Y2, Y4, Y4 4041 VCMPPS $0x02, Y4, Y3, Y4 4042 VANDPS Y7, Y4, Y4 4043 VRSQRTPS Y5, Y7 4044 VMULPS Y7, Y5, Y9 4045 VFMADD213PS Y0, Y9, Y7 4046 VMULPS Y1, Y9, Y9 4047 VMULPS Y7, Y9, Y7 4048 VANDPS Y2, Y5, Y5 4049 VCMPPS $0x02, Y5, Y3, Y5 4050 VRSQRTPS Y6, Y9 4051 VANDPS Y7, Y5, Y5 4052 VMULPS Y6, Y9, Y7 4053 VFMADD213PS Y0, Y7, Y9 4054 VMULPS Y1, Y7, Y7 4055 VMULPS Y7, Y9, Y7 4056 VANDPS Y2, Y6, Y6 4057 VCMPPS $0x02, Y6, Y3, Y6 4058 VANDPS Y7, Y6, Y6 4059 VRSQRTPS Y8, Y7 4060 VMULPS Y7, Y8, Y9 4061 VFMADD213PS Y0, Y9, Y7 4062 VMULPS Y1, Y9, Y9 4063 VMULPS Y7, Y9, Y7 4064 VANDPS Y2, Y8, Y8 4065 VCMPPS $0x02, Y8, Y3, Y8 4066 VANDPS Y7, Y8, Y7 4067 VMOVUPS Y4, (DI)(CX*4) 4068 VMOVUPS Y5, 32(DI)(CX*4) 4069 VMOVUPS Y6, 64(DI)(CX*4) 4070 VMOVUPS Y7, 96(DI)(CX*4) 4071 ADDQ $0x20, CX 4072 CMPQ AX, CX 4073 JNE LBB1_4 4074 CMPQ AX, SI 4075 JE LBB1_8 4076 4077 LBB1_6: 4078 VMOVSS dataSqrtF32<>+0(SB), X0 4079 VMOVSS dataSqrtF32<>+4(SB), X1 4080 VBROADCASTSS dataSqrtF32<>+8(SB), X2 4081 VMOVSS dataSqrtF32<>+12(SB), X3 4082 4083 LBB1_7: 4084 VMOVSS (DI)(AX*4), X4 4085 VRSQRTSS X4, X4, X5 4086 VMULSS X5, X4, X6 4087 VFMADD213SS X0, X6, X5 4088 VMULSS X1, X6, X6 4089 VMULSS X5, X6, X5 4090 VANDPS X2, X4, X4 4091 VCMPSS $0x01, X3, X4, X4 4092 VANDNPS X5, X4, X4 4093 VMOVSS X4, (DI)(AX*4) 4094 ADDQ $0x01, AX 4095 CMPQ SI, AX 4096 JNE LBB1_7 4097 4098 LBB1_8: 4099 VZEROUPPER 4100 MOVSS X0, ret+24(FP) 4101 RET 4102 4103 DATA dataRoundF64<>+0(SB)/8, $0x8000000000000000 4104 DATA dataRoundF64<>+8(SB)/8, $0x3fdfffffffffffff 4105 DATA dataRoundF64<>+16(SB)/8, $0x8000000000000000 4106 DATA dataRoundF64<>+24(SB)/8, $0x8000000000000000 4107 GLOBL dataRoundF64<>(SB), RODATA|NOPTR, $32 4108 4109 // func Round_AVX2_F64(x []float64) float64 4110 // Requires: AVX, SSE2 4111 TEXT ·Round_AVX2_F64(SB), NOSPLIT, $0-32 4112 MOVQ x_base+0(FP), DI 4113 MOVQ x_len+8(FP), SI 4114 TESTQ SI, SI 4115 JE LBB2_8 4116 CMPQ SI, $0x10 4117 JAE LBB2_3 4118 XORL AX, AX 4119 JMP LBB2_6 4120 4121 LBB2_3: 4122 MOVQ SI, AX 4123 ANDQ $-16, AX 4124 XORL CX, CX 4125 VBROADCASTSD dataRoundF64<>+0(SB), Y0 4126 VBROADCASTSD dataRoundF64<>+8(SB), Y1 4127 4128 LBB2_4: 4129 VMOVUPD (DI)(CX*8), Y2 4130 VMOVUPD 32(DI)(CX*8), Y3 4131 VMOVUPD 64(DI)(CX*8), Y4 4132 VMOVUPD 96(DI)(CX*8), Y5 4133 VANDPD Y0, Y2, Y6 4134 VORPD Y1, Y6, Y6 4135 VADDPD Y6, Y2, Y2 4136 VROUNDPD $0x0b, Y2, Y2 4137 VANDPD Y0, Y3, 
Y6 4138 VORPD Y1, Y6, Y6 4139 VADDPD Y6, Y3, Y3 4140 VROUNDPD $0x0b, Y3, Y3 4141 VANDPD Y0, Y4, Y6 4142 VORPD Y1, Y6, Y6 4143 VADDPD Y6, Y4, Y4 4144 VROUNDPD $0x0b, Y4, Y4 4145 VANDPD Y0, Y5, Y6 4146 VORPD Y1, Y6, Y6 4147 VADDPD Y6, Y5, Y5 4148 VROUNDPD $0x0b, Y5, Y5 4149 VMOVUPD Y2, (DI)(CX*8) 4150 VMOVUPD Y3, 32(DI)(CX*8) 4151 VMOVUPD Y4, 64(DI)(CX*8) 4152 VMOVUPD Y5, 96(DI)(CX*8) 4153 ADDQ $0x10, CX 4154 CMPQ AX, CX 4155 JNE LBB2_4 4156 CMPQ AX, SI 4157 JE LBB2_8 4158 4159 LBB2_6: 4160 VMOVUPD dataRoundF64<>+16(SB), X0 4161 VMOVDDUP dataRoundF64<>+8(SB), X1 4162 4163 LBB2_7: 4164 VMOVSD (DI)(AX*8), X2 4165 VANDPD X0, X2, X3 4166 VORPD X1, X3, X3 4167 VADDSD X3, X2, X2 4168 VROUNDSD $0x0b, X2, X2, X2 4169 VMOVSD X2, (DI)(AX*8) 4170 ADDQ $0x01, AX 4171 CMPQ SI, AX 4172 JNE LBB2_7 4173 4174 LBB2_8: 4175 VZEROUPPER 4176 MOVSD X0, ret+24(FP) 4177 RET 4178 4179 DATA dataRoundF32<>+0(SB)/4, $0x80000000 4180 DATA dataRoundF32<>+4(SB)/4, $0x3effffff 4181 GLOBL dataRoundF32<>(SB), RODATA|NOPTR, $8 4182 4183 // func Round_AVX2_F32(x []float32) float32 4184 // Requires: AVX, SSE 4185 TEXT ·Round_AVX2_F32(SB), NOSPLIT, $0-28 4186 MOVQ x_base+0(FP), DI 4187 MOVQ x_len+8(FP), SI 4188 TESTQ SI, SI 4189 JE LBB3_8 4190 CMPQ SI, $0x20 4191 JAE LBB3_3 4192 XORL AX, AX 4193 JMP LBB3_6 4194 4195 LBB3_3: 4196 MOVQ SI, AX 4197 ANDQ $-32, AX 4198 XORL CX, CX 4199 VBROADCASTSS dataRoundF32<>+0(SB), Y0 4200 VBROADCASTSS dataRoundF32<>+4(SB), Y1 4201 4202 LBB3_4: 4203 VMOVUPS (DI)(CX*4), Y2 4204 VMOVUPS 32(DI)(CX*4), Y3 4205 VMOVUPS 64(DI)(CX*4), Y4 4206 VMOVUPS 96(DI)(CX*4), Y5 4207 VANDPS Y0, Y2, Y6 4208 VORPS Y1, Y6, Y6 4209 VADDPS Y6, Y2, Y2 4210 VROUNDPS $0x0b, Y2, Y2 4211 VANDPS Y0, Y3, Y6 4212 VORPS Y1, Y6, Y6 4213 VADDPS Y6, Y3, Y3 4214 VROUNDPS $0x0b, Y3, Y3 4215 VANDPS Y0, Y4, Y6 4216 VORPS Y1, Y6, Y6 4217 VADDPS Y6, Y4, Y4 4218 VROUNDPS $0x0b, Y4, Y4 4219 VANDPS Y0, Y5, Y6 4220 VORPS Y1, Y6, Y6 4221 VADDPS Y6, Y5, Y5 4222 VROUNDPS $0x0b, Y5, Y5 4223 VMOVUPS Y2, (DI)(CX*4) 4224 VMOVUPS Y3, 32(DI)(CX*4) 4225 VMOVUPS Y4, 64(DI)(CX*4) 4226 VMOVUPS Y5, 96(DI)(CX*4) 4227 ADDQ $0x20, CX 4228 CMPQ AX, CX 4229 JNE LBB3_4 4230 CMPQ AX, SI 4231 JE LBB3_8 4232 4233 LBB3_6: 4234 VBROADCASTSS dataRoundF32<>+0(SB), X0 4235 VBROADCASTSS dataRoundF32<>+4(SB), X1 4236 4237 LBB3_7: 4238 VMOVSS (DI)(AX*4), X2 4239 VANDPS X0, X2, X3 4240 VORPS X1, X3, X3 4241 VADDSS X3, X2, X2 4242 VROUNDSS $0x0b, X2, X2, X2 4243 VMOVSS X2, (DI)(AX*4) 4244 ADDQ $0x01, AX 4245 CMPQ SI, AX 4246 JNE LBB3_7 4247 4248 LBB3_8: 4249 VZEROUPPER 4250 MOVSS X0, ret+24(FP) 4251 RET 4252 4253 // func Floor_AVX2_F64(x []float64) float64 4254 // Requires: AVX, SSE2 4255 TEXT ·Floor_AVX2_F64(SB), NOSPLIT, $0-32 4256 MOVQ x_base+0(FP), DI 4257 MOVQ x_len+8(FP), SI 4258 TESTQ SI, SI 4259 JE LBB4_11 4260 CMPQ SI, $0x10 4261 JAE LBB4_3 4262 XORL AX, AX 4263 JMP LBB4_10 4264 4265 LBB4_3: 4266 MOVQ SI, AX 4267 ANDQ $-16, AX 4268 LEAQ -16(AX), CX 4269 MOVQ CX, R8 4270 SHRQ $0x04, R8 4271 ADDQ $0x01, R8 4272 TESTQ CX, CX 4273 JE LBB4_4 4274 MOVQ R8, DX 4275 ANDQ $-2, DX 4276 XORL CX, CX 4277 4278 LBB4_6: 4279 VROUNDPD $0x09, (DI)(CX*8), Y0 4280 VROUNDPD $0x09, 32(DI)(CX*8), Y1 4281 VROUNDPD $0x09, 64(DI)(CX*8), Y2 4282 VROUNDPD $0x09, 96(DI)(CX*8), Y3 4283 VMOVUPD Y0, (DI)(CX*8) 4284 VMOVUPD Y1, 32(DI)(CX*8) 4285 VMOVUPD Y2, 64(DI)(CX*8) 4286 VMOVUPD Y3, 96(DI)(CX*8) 4287 VROUNDPD $0x09, 128(DI)(CX*8), Y0 4288 VROUNDPD $0x09, 160(DI)(CX*8), Y1 4289 VROUNDPD $0x09, 192(DI)(CX*8), Y2 4290 VROUNDPD $0x09, 224(DI)(CX*8), Y3 4291 VMOVUPD Y0, 128(DI)(CX*8) 4292 VMOVUPD 
Y1, 160(DI)(CX*8) 4293 VMOVUPD Y2, 192(DI)(CX*8) 4294 VMOVUPD Y3, 224(DI)(CX*8) 4295 ADDQ $0x20, CX 4296 ADDQ $-2, DX 4297 JNE LBB4_6 4298 TESTB $0x01, R8 4299 JE LBB4_9 4300 4301 LBB4_8: 4302 VROUNDPD $0x09, (DI)(CX*8), Y0 4303 VROUNDPD $0x09, 32(DI)(CX*8), Y1 4304 VROUNDPD $0x09, 64(DI)(CX*8), Y2 4305 VROUNDPD $0x09, 96(DI)(CX*8), Y3 4306 VMOVUPD Y0, (DI)(CX*8) 4307 VMOVUPD Y1, 32(DI)(CX*8) 4308 VMOVUPD Y2, 64(DI)(CX*8) 4309 VMOVUPD Y3, 96(DI)(CX*8) 4310 4311 LBB4_9: 4312 CMPQ AX, SI 4313 JE LBB4_11 4314 4315 LBB4_10: 4316 VMOVSD (DI)(AX*8), X0 4317 VROUNDSD $0x09, X0, X0, X0 4318 VMOVSD X0, (DI)(AX*8) 4319 ADDQ $0x01, AX 4320 CMPQ SI, AX 4321 JNE LBB4_10 4322 4323 LBB4_11: 4324 VZEROUPPER 4325 MOVSD X0, ret+24(FP) 4326 RET 4327 4328 LBB4_4: 4329 XORL CX, CX 4330 TESTB $0x01, R8 4331 JNE LBB4_8 4332 JMP LBB4_9 4333 4334 // func Floor_AVX2_F32(x []float32) float32 4335 // Requires: AVX, SSE 4336 TEXT ·Floor_AVX2_F32(SB), NOSPLIT, $0-28 4337 MOVQ x_base+0(FP), DI 4338 MOVQ x_len+8(FP), SI 4339 TESTQ SI, SI 4340 JE LBB5_11 4341 CMPQ SI, $0x20 4342 JAE LBB5_3 4343 XORL AX, AX 4344 JMP LBB5_10 4345 4346 LBB5_3: 4347 MOVQ SI, AX 4348 ANDQ $-32, AX 4349 LEAQ -32(AX), CX 4350 MOVQ CX, R8 4351 SHRQ $0x05, R8 4352 ADDQ $0x01, R8 4353 TESTQ CX, CX 4354 JE LBB5_4 4355 MOVQ R8, DX 4356 ANDQ $-2, DX 4357 XORL CX, CX 4358 4359 LBB5_6: 4360 VROUNDPS $0x09, (DI)(CX*4), Y0 4361 VROUNDPS $0x09, 32(DI)(CX*4), Y1 4362 VROUNDPS $0x09, 64(DI)(CX*4), Y2 4363 VROUNDPS $0x09, 96(DI)(CX*4), Y3 4364 VMOVUPS Y0, (DI)(CX*4) 4365 VMOVUPS Y1, 32(DI)(CX*4) 4366 VMOVUPS Y2, 64(DI)(CX*4) 4367 VMOVUPS Y3, 96(DI)(CX*4) 4368 VROUNDPS $0x09, 128(DI)(CX*4), Y0 4369 VROUNDPS $0x09, 160(DI)(CX*4), Y1 4370 VROUNDPS $0x09, 192(DI)(CX*4), Y2 4371 VROUNDPS $0x09, 224(DI)(CX*4), Y3 4372 VMOVUPS Y0, 128(DI)(CX*4) 4373 VMOVUPS Y1, 160(DI)(CX*4) 4374 VMOVUPS Y2, 192(DI)(CX*4) 4375 VMOVUPS Y3, 224(DI)(CX*4) 4376 ADDQ $0x40, CX 4377 ADDQ $-2, DX 4378 JNE LBB5_6 4379 TESTB $0x01, R8 4380 JE LBB5_9 4381 4382 LBB5_8: 4383 VROUNDPS $0x09, (DI)(CX*4), Y0 4384 VROUNDPS $0x09, 32(DI)(CX*4), Y1 4385 VROUNDPS $0x09, 64(DI)(CX*4), Y2 4386 VROUNDPS $0x09, 96(DI)(CX*4), Y3 4387 VMOVUPS Y0, (DI)(CX*4) 4388 VMOVUPS Y1, 32(DI)(CX*4) 4389 VMOVUPS Y2, 64(DI)(CX*4) 4390 VMOVUPS Y3, 96(DI)(CX*4) 4391 4392 LBB5_9: 4393 CMPQ AX, SI 4394 JE LBB5_11 4395 4396 LBB5_10: 4397 VMOVSS (DI)(AX*4), X0 4398 VROUNDSS $0x09, X0, X0, X0 4399 VMOVSS X0, (DI)(AX*4) 4400 ADDQ $0x01, AX 4401 CMPQ SI, AX 4402 JNE LBB5_10 4403 4404 LBB5_11: 4405 VZEROUPPER 4406 MOVSS X0, ret+24(FP) 4407 RET 4408 4409 LBB5_4: 4410 XORL CX, CX 4411 TESTB $0x01, R8 4412 JNE LBB5_8 4413 JMP LBB5_9 4414 4415 // func Ceil_AVX2_F64(x []float64) float64 4416 // Requires: AVX, SSE2 4417 TEXT ·Ceil_AVX2_F64(SB), NOSPLIT, $0-32 4418 MOVQ x_base+0(FP), DI 4419 MOVQ x_len+8(FP), SI 4420 TESTQ SI, SI 4421 JE LBB6_11 4422 CMPQ SI, $0x10 4423 JAE LBB6_3 4424 XORL AX, AX 4425 JMP LBB6_10 4426 4427 LBB6_3: 4428 MOVQ SI, AX 4429 ANDQ $-16, AX 4430 LEAQ -16(AX), CX 4431 MOVQ CX, R8 4432 SHRQ $0x04, R8 4433 ADDQ $0x01, R8 4434 TESTQ CX, CX 4435 JE LBB6_4 4436 MOVQ R8, DX 4437 ANDQ $-2, DX 4438 XORL CX, CX 4439 4440 LBB6_6: 4441 VROUNDPD $0x0a, (DI)(CX*8), Y0 4442 VROUNDPD $0x0a, 32(DI)(CX*8), Y1 4443 VROUNDPD $0x0a, 64(DI)(CX*8), Y2 4444 VROUNDPD $0x0a, 96(DI)(CX*8), Y3 4445 VMOVUPD Y0, (DI)(CX*8) 4446 VMOVUPD Y1, 32(DI)(CX*8) 4447 VMOVUPD Y2, 64(DI)(CX*8) 4448 VMOVUPD Y3, 96(DI)(CX*8) 4449 VROUNDPD $0x0a, 128(DI)(CX*8), Y0 4450 VROUNDPD $0x0a, 160(DI)(CX*8), Y1 4451 VROUNDPD $0x0a, 192(DI)(CX*8), Y2 4452 VROUNDPD 
$0x0a, 224(DI)(CX*8), Y3 4453 VMOVUPD Y0, 128(DI)(CX*8) 4454 VMOVUPD Y1, 160(DI)(CX*8) 4455 VMOVUPD Y2, 192(DI)(CX*8) 4456 VMOVUPD Y3, 224(DI)(CX*8) 4457 ADDQ $0x20, CX 4458 ADDQ $-2, DX 4459 JNE LBB6_6 4460 TESTB $0x01, R8 4461 JE LBB6_9 4462 4463 LBB6_8: 4464 VROUNDPD $0x0a, (DI)(CX*8), Y0 4465 VROUNDPD $0x0a, 32(DI)(CX*8), Y1 4466 VROUNDPD $0x0a, 64(DI)(CX*8), Y2 4467 VROUNDPD $0x0a, 96(DI)(CX*8), Y3 4468 VMOVUPD Y0, (DI)(CX*8) 4469 VMOVUPD Y1, 32(DI)(CX*8) 4470 VMOVUPD Y2, 64(DI)(CX*8) 4471 VMOVUPD Y3, 96(DI)(CX*8) 4472 4473 LBB6_9: 4474 CMPQ AX, SI 4475 JE LBB6_11 4476 4477 LBB6_10: 4478 VMOVSD (DI)(AX*8), X0 4479 VROUNDSD $0x0a, X0, X0, X0 4480 VMOVSD X0, (DI)(AX*8) 4481 ADDQ $0x01, AX 4482 CMPQ SI, AX 4483 JNE LBB6_10 4484 4485 LBB6_11: 4486 VZEROUPPER 4487 MOVSD X0, ret+24(FP) 4488 RET 4489 4490 LBB6_4: 4491 XORL CX, CX 4492 TESTB $0x01, R8 4493 JNE LBB6_8 4494 JMP LBB6_9 4495 4496 // func Ceil_AVX2_F32(x []float32) float32 4497 // Requires: AVX, SSE 4498 TEXT ·Ceil_AVX2_F32(SB), NOSPLIT, $0-28 4499 MOVQ x_base+0(FP), DI 4500 MOVQ x_len+8(FP), SI 4501 TESTQ SI, SI 4502 JE LBB7_11 4503 CMPQ SI, $0x20 4504 JAE LBB7_3 4505 XORL AX, AX 4506 JMP LBB7_10 4507 4508 LBB7_3: 4509 MOVQ SI, AX 4510 ANDQ $-32, AX 4511 LEAQ -32(AX), CX 4512 MOVQ CX, R8 4513 SHRQ $0x05, R8 4514 ADDQ $0x01, R8 4515 TESTQ CX, CX 4516 JE LBB7_4 4517 MOVQ R8, DX 4518 ANDQ $-2, DX 4519 XORL CX, CX 4520 4521 LBB7_6: 4522 VROUNDPS $0x0a, (DI)(CX*4), Y0 4523 VROUNDPS $0x0a, 32(DI)(CX*4), Y1 4524 VROUNDPS $0x0a, 64(DI)(CX*4), Y2 4525 VROUNDPS $0x0a, 96(DI)(CX*4), Y3 4526 VMOVUPS Y0, (DI)(CX*4) 4527 VMOVUPS Y1, 32(DI)(CX*4) 4528 VMOVUPS Y2, 64(DI)(CX*4) 4529 VMOVUPS Y3, 96(DI)(CX*4) 4530 VROUNDPS $0x0a, 128(DI)(CX*4), Y0 4531 VROUNDPS $0x0a, 160(DI)(CX*4), Y1 4532 VROUNDPS $0x0a, 192(DI)(CX*4), Y2 4533 VROUNDPS $0x0a, 224(DI)(CX*4), Y3 4534 VMOVUPS Y0, 128(DI)(CX*4) 4535 VMOVUPS Y1, 160(DI)(CX*4) 4536 VMOVUPS Y2, 192(DI)(CX*4) 4537 VMOVUPS Y3, 224(DI)(CX*4) 4538 ADDQ $0x40, CX 4539 ADDQ $-2, DX 4540 JNE LBB7_6 4541 TESTB $0x01, R8 4542 JE LBB7_9 4543 4544 LBB7_8: 4545 VROUNDPS $0x0a, (DI)(CX*4), Y0 4546 VROUNDPS $0x0a, 32(DI)(CX*4), Y1 4547 VROUNDPS $0x0a, 64(DI)(CX*4), Y2 4548 VROUNDPS $0x0a, 96(DI)(CX*4), Y3 4549 VMOVUPS Y0, (DI)(CX*4) 4550 VMOVUPS Y1, 32(DI)(CX*4) 4551 VMOVUPS Y2, 64(DI)(CX*4) 4552 VMOVUPS Y3, 96(DI)(CX*4) 4553 4554 LBB7_9: 4555 CMPQ AX, SI 4556 JE LBB7_11 4557 4558 LBB7_10: 4559 VMOVSS (DI)(AX*4), X0 4560 VROUNDSS $0x0a, X0, X0, X0 4561 VMOVSS X0, (DI)(AX*4) 4562 ADDQ $0x01, AX 4563 CMPQ SI, AX 4564 JNE LBB7_10 4565 4566 LBB7_11: 4567 VZEROUPPER 4568 MOVSS X0, ret+24(FP) 4569 RET 4570 4571 LBB7_4: 4572 XORL CX, CX 4573 TESTB $0x01, R8 4574 JNE LBB7_8 4575 JMP LBB7_9 4576 4577 DATA dataPowF64<>+0(SB)/8, $0x7fffffffffffffff 4578 DATA dataPowF64<>+8(SB)/8, $0x3fe6a09e667f3bcd 4579 DATA dataPowF64<>+16(SB)/8, $0xbff0000000000000 4580 DATA dataPowF64<>+24(SB)/8, $0x401a509f46f4fa53 4581 DATA dataPowF64<>+32(SB)/8, $0x3fdfe818a0fe1a83 4582 DATA dataPowF64<>+40(SB)/8, $0x3f07bc0962b395ca 4583 DATA dataPowF64<>+48(SB)/8, $0x404e798eb86c3351 4584 DATA dataPowF64<>+56(SB)/8, $0x403de9738b8cb9c9 4585 DATA dataPowF64<>+64(SB)/8, $0x40340a202d99830a 4586 DATA dataPowF64<>+72(SB)/8, $0x404c8e7597479a10 4587 DATA dataPowF64<>+80(SB)/8, $0x4054c30b52213498 4588 DATA dataPowF64<>+88(SB)/8, $0x402e20359e903e37 4589 DATA dataPowF64<>+96(SB)/8, $0x407351945dc908a5 4590 DATA dataPowF64<>+104(SB)/8, $0x406bb86590fcfb56 4591 DATA dataPowF64<>+112(SB)/8, $0x404e0f304466448e 4592 DATA dataPowF64<>+120(SB)/8, 
$0x406b0db13e48e066 4593 DATA dataPowF64<>+128(SB)/8, $0x4330000000000000 4594 DATA dataPowF64<>+136(SB)/8, $0xc3300000000003ff 4595 DATA dataPowF64<>+144(SB)/8, $0x3ff0000000000000 4596 DATA dataPowF64<>+152(SB)/8, $0xbfe0000000000000 4597 DATA dataPowF64<>+160(SB)/8, $0x3fe0000000000000 4598 DATA dataPowF64<>+168(SB)/8, $0x3ff71547652b82fe 4599 DATA dataPowF64<>+176(SB)/8, $0xbfe62e4000000000 4600 DATA dataPowF64<>+184(SB)/8, $0x3eb7f7d1cf79abca 4601 DATA dataPowF64<>+192(SB)/8, $0x3fe62e42fefa39ef 4602 DATA dataPowF64<>+200(SB)/8, $0x3e21eed8eff8d898 4603 DATA dataPowF64<>+208(SB)/8, $0x3de6124613a86d09 4604 DATA dataPowF64<>+216(SB)/8, $0x3e927e4fb7789f5c 4605 DATA dataPowF64<>+224(SB)/8, $0x3e5ae64567f544e4 4606 DATA dataPowF64<>+232(SB)/8, $0x3efa01a01a01a01a 4607 DATA dataPowF64<>+240(SB)/8, $0x3ec71de3a556c734 4608 DATA dataPowF64<>+248(SB)/8, $0x3f56c16c16c16c17 4609 DATA dataPowF64<>+256(SB)/8, $0x3f2a01a01a01a01a 4610 DATA dataPowF64<>+264(SB)/8, $0x3fa5555555555555 4611 DATA dataPowF64<>+272(SB)/8, $0x3f81111111111111 4612 DATA dataPowF64<>+280(SB)/8, $0x3fc5555555555555 4613 DATA dataPowF64<>+288(SB)/8, $0x00000000000007fe 4614 DATA dataPowF64<>+296(SB)/8, $0x40a7700000000000 4615 DATA dataPowF64<>+304(SB)/8, $0x0000000000000001 4616 DATA dataPowF64<>+312(SB)/8, $0xc0a7700000000000 4617 DATA dataPowF64<>+320(SB)/8, $0x7ff0000000000000 4618 DATA dataPowF64<>+328(SB)/8, $0x7ff8002040000000 4619 DATA dataPowF64<>+336(SB)/8, $0x000fffffffffffff 4620 DATA dataPowF64<>+344(SB)/8, $0x000fffffffffffff 4621 DATA dataPowF64<>+352(SB)/8, $0x3fe0000000000000 4622 DATA dataPowF64<>+360(SB)/8, $0x3fe0000000000000 4623 GLOBL dataPowF64<>(SB), RODATA|NOPTR, $368 4624 4625 // func Pow_4x_AVX2_F64(x []float64, y []float64) 4626 // Requires: AVX, AVX2, FMA3 4627 TEXT ·Pow_4x_AVX2_F64(SB), NOSPLIT, $0-48 4628 MOVQ x_base+0(FP), DI 4629 MOVQ y_base+24(FP), SI 4630 MOVQ x_len+8(FP), DX 4631 SUBQ $+1192, SP 4632 ANDQ $-4, DX 4633 JE LBB9_11 4634 XORL R8, R8 4635 VBROADCASTSD dataPowF64<>+0(SB), Y0 4636 VMOVUPS Y0, 512(SP) 4637 VBROADCASTSD dataPowF64<>+8(SB), Y0 4638 VMOVUPS Y0, 1120(SP) 4639 VPXOR X6, X6, X6 4640 VBROADCASTSD dataPowF64<>+16(SB), Y0 4641 VMOVUPS Y0, 1088(SP) 4642 VBROADCASTSD dataPowF64<>+24(SB), Y0 4643 VMOVUPS Y0, 1056(SP) 4644 VBROADCASTSD dataPowF64<>+32(SB), Y0 4645 VMOVUPS Y0, 1024(SP) 4646 VBROADCASTSD dataPowF64<>+40(SB), Y0 4647 VMOVUPS Y0, 992(SP) 4648 VBROADCASTSD dataPowF64<>+48(SB), Y0 4649 VMOVUPS Y0, 960(SP) 4650 VBROADCASTSD dataPowF64<>+56(SB), Y0 4651 VMOVUPS Y0, 928(SP) 4652 VBROADCASTSD dataPowF64<>+64(SB), Y0 4653 VMOVUPS Y0, 896(SP) 4654 VBROADCASTSD dataPowF64<>+72(SB), Y0 4655 VMOVUPS Y0, 864(SP) 4656 VBROADCASTSD dataPowF64<>+80(SB), Y0 4657 VMOVUPS Y0, 832(SP) 4658 VBROADCASTSD dataPowF64<>+88(SB), Y0 4659 VMOVUPS Y0, 800(SP) 4660 VBROADCASTSD dataPowF64<>+96(SB), Y0 4661 VMOVUPS Y0, 768(SP) 4662 VBROADCASTSD dataPowF64<>+104(SB), Y0 4663 VMOVUPS Y0, 736(SP) 4664 VBROADCASTSD dataPowF64<>+112(SB), Y0 4665 VMOVUPS Y0, 704(SP) 4666 VBROADCASTSD dataPowF64<>+120(SB), Y0 4667 VMOVUPS Y0, 672(SP) 4668 VBROADCASTSD dataPowF64<>+128(SB), Y0 4669 VMOVUPS Y0, 640(SP) 4670 VBROADCASTSD dataPowF64<>+136(SB), Y0 4671 VMOVUPS Y0, 608(SP) 4672 VBROADCASTSD dataPowF64<>+144(SB), Y0 4673 VMOVUPS Y0, -128(SP) 4674 VBROADCASTSD dataPowF64<>+152(SB), Y0 4675 VMOVUPS Y0, 576(SP) 4676 VBROADCASTSD dataPowF64<>+160(SB), Y0 4677 VMOVUPS Y0, 544(SP) 4678 VBROADCASTSD dataPowF64<>+168(SB), Y0 4679 VMOVUPS Y0, 480(SP) 4680 VBROADCASTSD dataPowF64<>+176(SB), Y0 4681 VMOVUPS Y0, 
448(SP) 4682 VBROADCASTSD dataPowF64<>+184(SB), Y0 4683 VMOVUPS Y0, 416(SP) 4684 VBROADCASTSD dataPowF64<>+192(SB), Y0 4685 VMOVUPS Y0, 384(SP) 4686 VBROADCASTSD dataPowF64<>+200(SB), Y0 4687 VMOVUPS Y0, 352(SP) 4688 VBROADCASTSD dataPowF64<>+208(SB), Y0 4689 VMOVUPS Y0, 320(SP) 4690 VBROADCASTSD dataPowF64<>+216(SB), Y0 4691 VMOVUPS Y0, 288(SP) 4692 VBROADCASTSD dataPowF64<>+224(SB), Y0 4693 VMOVUPS Y0, 256(SP) 4694 VBROADCASTSD dataPowF64<>+232(SB), Y0 4695 VMOVUPS Y0, 224(SP) 4696 VBROADCASTSD dataPowF64<>+240(SB), Y0 4697 VMOVUPS Y0, 192(SP) 4698 VBROADCASTSD dataPowF64<>+248(SB), Y0 4699 VMOVUPS Y0, 160(SP) 4700 VBROADCASTSD dataPowF64<>+256(SB), Y0 4701 VMOVUPS Y0, 128(SP) 4702 VBROADCASTSD dataPowF64<>+264(SB), Y0 4703 VMOVUPS Y0, 96(SP) 4704 VBROADCASTSD dataPowF64<>+272(SB), Y0 4705 VMOVUPS Y0, 64(SP) 4706 VBROADCASTSD dataPowF64<>+280(SB), Y0 4707 VMOVUPS Y0, 32(SP) 4708 VBROADCASTSD dataPowF64<>+288(SB), Y0 4709 VMOVUPS Y0, (SP) 4710 VBROADCASTSD dataPowF64<>+296(SB), Y0 4711 VMOVUPS Y0, -32(SP) 4712 VBROADCASTSD dataPowF64<>+304(SB), Y0 4713 VMOVUPS Y0, -64(SP) 4714 VBROADCASTSD dataPowF64<>+312(SB), Y0 4715 VMOVUPD Y0, -96(SP) 4716 VPBROADCASTQ dataPowF64<>+320(SB), Y5 4717 VBROADCASTSD dataPowF64<>+320(SB), Y10 4718 JMP LBB9_2 4719 4720 LBB9_10: 4721 VMOVUPD Y2, (DI)(R8*8) 4722 ADDQ $0x04, R8 4723 CMPQ R8, DX 4724 JAE LBB9_11 4725 4726 LBB9_2: 4727 VMOVAPD Y10, Y9 4728 VMOVDQU (DI)(R8*8), Y13 4729 VMOVUPD (SI)(R8*8), Y12 4730 VPAND 512(SP), Y13, Y10 4731 VMOVUPD dataPowF64<>+336(SB), X1 4732 VANDPD (DI)(R8*8), X1, X2 4733 VMOVUPD dataPowF64<>+352(SB), X0 4734 VORPD X0, X2, X2 4735 VANDPD 16(DI)(R8*8), X1, X3 4736 VORPD X0, X3, X3 4737 VINSERTF128 $0x01, X3, Y2, Y3 4738 VMOVUPD 1120(SP), Y0 4739 VCMPPD $0x01, Y3, Y0, Y2 4740 VANDNPD Y3, Y2, Y4 4741 VADDPD 1088(SP), Y3, Y3 4742 VADDPD Y4, Y3, Y4 4743 VMULPD Y4, Y4, Y3 4744 VMULPD Y3, Y3, Y7 4745 VMOVUPD 1024(SP), Y8 4746 VFMADD213PD 1056(SP), Y4, Y8 4747 VFMADD231PD 992(SP), Y3, Y8 4748 VMOVUPD 928(SP), Y11 4749 VFMADD213PD 960(SP), Y4, Y11 4750 VMOVUPD 864(SP), Y14 4751 VFMADD213PD 896(SP), Y4, Y14 4752 VFMADD231PD Y11, Y3, Y14 4753 VFMADD231PD Y8, Y7, Y14 4754 VMULPD Y4, Y3, Y8 4755 VMULPD Y14, Y8, Y8 4756 VADDPD 832(SP), Y3, Y11 4757 VFMADD231PD 800(SP), Y4, Y11 4758 VMOVUPD 736(SP), Y14 4759 VFMADD213PD 768(SP), Y4, Y14 4760 VMOVUPD 672(SP), Y15 4761 VFMADD213PD 704(SP), Y4, Y15 4762 VFMADD231PD Y14, Y3, Y15 4763 VFMADD231PD Y11, Y7, Y15 4764 VDIVPD Y15, Y8, Y7 4765 VMOVDQU Y10, 1152(SP) 4766 VPSRLQ $0x34, Y10, Y8 4767 VPOR 640(SP), Y8, Y8 4768 VADDPD 608(SP), Y8, Y8 4769 VMOVUPD -128(SP), Y0 4770 VANDPD Y0, Y2, Y2 4771 VADDPD Y2, Y8, Y8 4772 VMULPD Y12, Y8, Y2 4773 VROUNDPD $0x08, Y2, Y2 4774 VFNMADD213PD Y2, Y12, Y8 4775 VMOVUPD 576(SP), Y1 4776 VMOVAPD Y1, Y11 4777 VFMADD213PD Y4, Y3, Y11 4778 VADDPD Y7, Y11, Y11 4779 VMOVUPD 544(SP), Y10 4780 VMULPD Y4, Y10, Y14 4781 VMULPD Y1, Y3, Y15 4782 VFMADD231PD Y14, Y4, Y15 4783 VSUBPD Y4, Y11, Y4 4784 VFMADD231PD Y3, Y10, Y4 4785 VMOVUPD 480(SP), Y1 4786 VMULPD Y1, Y12, Y3 4787 VMULPD Y3, Y11, Y3 4788 VROUNDPD $0x08, Y3, Y3 4789 VMULPD 448(SP), Y3, Y14 4790 VFMADD231PD Y11, Y12, Y14 4791 VFMSUB231PD 416(SP), Y3, Y14 4792 VMOVUPD 384(SP), Y11 4793 VFMADD231PD Y8, Y11, Y14 4794 VSUBPD Y7, Y15, Y7 4795 VADDPD Y4, Y7, Y4 4796 VFNMSUB213PD Y14, Y12, Y4 4797 VMULPD Y1, Y4, Y7 4798 VROUNDPD $0x08, Y7, Y7 4799 VFNMADD231PD Y11, Y7, Y4 4800 VMULPD Y4, Y4, Y8 4801 VMOVUPD 320(SP), Y11 4802 VFMADD213PD 352(SP), Y4, Y11 4803 VMOVUPD 256(SP), Y14 4804 VFMADD213PD 288(SP), Y4, Y14 4805 
VMOVUPD 192(SP), Y15 4806 VFMADD213PD 224(SP), Y4, Y15 4807 VFMADD231PD Y14, Y8, Y15 4808 VMOVUPD 128(SP), Y14 4809 VFMADD213PD 160(SP), Y4, Y14 4810 VMOVUPD 64(SP), Y1 4811 VFMADD213PD 96(SP), Y4, Y1 4812 VFMADD231PD Y14, Y8, Y1 4813 VMOVUPD 32(SP), Y14 4814 VFMADD213PD Y10, Y4, Y14 4815 VFMADD213PD Y4, Y8, Y14 4816 VMULPD Y8, Y8, Y4 4817 VFMADD231PD Y11, Y4, Y15 4818 VFMADD231PD Y1, Y4, Y14 4819 VMULPD Y4, Y4, Y1 4820 VFMADD231PD Y15, Y1, Y14 4821 VADDPD Y0, Y14, Y1 4822 VADDPD Y2, Y3, Y2 4823 VADDPD Y7, Y2, Y15 4824 VROUNDPD $0x08, Y15, Y2 4825 VCVTTSD2SIQ X2, R9 4826 VPERMILPD $0x01, X2, X3 4827 VCVTTSD2SIQ X3, AX 4828 VEXTRACTF128 $0x01, Y2, X2 4829 VCVTTSD2SIQ X2, CX 4830 VMOVQ CX, X3 4831 VPERMILPD $0x01, X2, X2 4832 VCVTTSD2SIQ X2, CX 4833 VMOVQ CX, X2 4834 VPUNPCKLQDQ X2, X3, X2 4835 VMOVQ R9, X3 4836 VMOVQ AX, X4 4837 VPUNPCKLQDQ X4, X3, X3 4838 VINSERTI128 $0x01, X2, Y3, Y2 4839 VPSRAD $0x1f, Y1, Y3 4840 VPSRAD $0x14, Y1, Y4 4841 VPSRLQ $0x20, Y4, Y4 4842 VPBLENDD $0xaa, Y3, Y4, Y3 4843 VPADDQ Y3, Y2, Y4 4844 VPCMPGTQ (SP), Y4, Y3 4845 VMOVUPD -32(SP), Y0 4846 VCMPPD $0x01, Y15, Y0, Y7 4847 VPOR Y7, Y3, Y3 4848 VMOVDQU -64(SP), Y0 4849 VPCMPGTQ Y4, Y0, Y4 4850 VCMPPD $0x01, -96(SP), Y15, Y7 4851 VPOR Y7, Y4, Y4 4852 VPSLLQ $0x34, Y2, Y2 4853 VPADDQ Y1, Y2, Y2 4854 VPOR Y3, Y4, Y1 4855 VPTEST Y1, Y1 4856 JNE LBB9_3 4857 VMOVAPD Y9, Y10 4858 JMP LBB9_5 4859 4860 LBB9_3: 4861 VPANDN Y2, Y4, Y1 4862 VMOVAPD Y9, Y10 4863 VBLENDVPD Y3, Y9, Y1, Y2 4864 4865 LBB9_5: 4866 VPAND Y5, Y13, Y11 4867 VPCMPEQQ Y6, Y11, Y4 4868 VPSRAD $0x1f, Y13, Y1 4869 VPSHUFD $0xf5, Y1, Y7 4870 VCMPPD $0x01, Y6, Y12, Y14 4871 VCMPPD $0x00, Y6, Y12, Y3 4872 VANDPD -128(SP), Y3, Y1 4873 VBLENDVPD Y14, Y10, Y1, Y1 4874 VBLENDVPD Y4, Y1, Y2, Y2 4875 VPTEST Y7, Y7 4876 JNE LBB9_7 4877 VPXOR X7, X7, X7 4878 JMP LBB9_8 4879 4880 LBB9_7: 4881 VROUNDPD $0x08, Y12, Y1 4882 VCMPPD $0x00, Y1, Y12, Y8 4883 VCVTTSD2SIQ X1, R9 4884 VPERMILPD $0x01, X1, X10 4885 VCVTTSD2SIQ X10, CX 4886 VEXTRACTF128 $0x01, Y1, X1 4887 VCVTTSD2SIQ X1, AX 4888 VXORPD X10, X10, X10 4889 VMOVQ AX, X6 4890 VPERMILPD $0x01, X1, X1 4891 VCVTTSD2SIQ X1, AX 4892 VMOVQ AX, X1 4893 VPUNPCKLQDQ X1, X6, X1 4894 VMOVQ R9, X6 4895 VMOVQ CX, X0 4896 VPUNPCKLQDQ X0, X6, X0 4897 VINSERTI128 $0x01, X1, Y0, Y0 4898 VPSLLQ $0x3f, Y0, Y0 4899 VPOR Y2, Y0, Y1 4900 VCMPPD $0x00, Y10, Y13, Y6 4901 VBROADCASTSD dataPowF64<>+328(SB), Y10 4902 VBLENDVPD Y6, Y2, Y10, Y6 4903 VMOVAPD Y9, Y10 4904 VBLENDVPD Y8, Y1, Y6, Y1 4905 VXORPD X6, X6, X6 4906 VBLENDVPD Y7, Y1, Y2, Y2 4907 VANDPD Y0, Y8, Y7 4908 4909 LBB9_8: 4910 VPCMPEQD Y9, Y9, Y9 4911 VANDPD Y5, Y12, Y0 4912 VANDPD Y5, Y15, Y1 4913 VPCMPEQQ Y5, Y1, Y15 4914 VPXOR Y9, Y15, Y1 4915 VPCMPEQQ Y5, Y0, Y8 4916 VPCMPEQQ Y5, Y11, Y11 4917 VPXOR Y9, Y11, Y0 4918 VPANDN Y0, Y8, Y0 4919 VPOR Y4, Y1, Y1 4920 VPAND Y0, Y1, Y0 4921 VPTEST Y9, Y0 4922 JB LBB9_10 4923 VPXOR Y9, Y8, Y0 4924 VPANDN Y0, Y15, Y0 4925 VMOVUPD -128(SP), Y8 4926 VMOVUPD 1152(SP), Y9 4927 VCMPPD $0x00, Y8, Y9, Y1 4928 VCMPPD $0x01, Y9, Y8, Y4 4929 VPSRAD $0x1f, Y12, Y6 4930 VPXOR Y4, Y6, Y4 4931 VPXOR X6, X6, X6 4932 VBLENDVPD Y4, Y10, Y6, Y4 4933 VBLENDVPD Y1, Y8, Y4, Y1 4934 VBLENDVPD Y0, Y2, Y1, Y0 4935 VANDPD Y2, Y7, Y1 4936 VANDPD Y7, Y13, Y2 4937 VORPD Y2, Y9, Y2 4938 VBLENDVPD Y14, Y1, Y2, Y1 4939 VBLENDVPD Y3, Y8, Y1, Y1 4940 VBLENDVPD Y11, Y1, Y0, Y0 4941 VCMPPD $0x03, Y13, Y13, Y1 4942 VCMPPD $0x03, Y12, Y12, Y2 4943 VORPD Y1, Y2, Y1 4944 VADDPD Y13, Y12, Y2 4945 VBLENDVPD Y1, Y2, Y0, Y2 4946 JMP LBB9_10 4947 4948 LBB9_11: 4949 ADDQ $+1192, 
SP 4950 VZEROUPPER 4951 RET 4952 4953 DATA genPowF32<>+0(SB)/4, $0x7fffffff 4954 DATA genPowF32<>+4(SB)/4, $0x3f3504f3 4955 DATA genPowF32<>+8(SB)/4, $0xbf800000 4956 DATA genPowF32<>+12(SB)/4, $0x3def251a 4957 DATA genPowF32<>+16(SB)/4, $0xbdebd1b8 4958 DATA genPowF32<>+20(SB)/4, $0x3e11e9bf 4959 DATA genPowF32<>+24(SB)/4, $0xbdfe5d4f 4960 DATA genPowF32<>+28(SB)/4, $0x3e4cceac 4961 DATA genPowF32<>+32(SB)/4, $0xbe2aae50 4962 DATA genPowF32<>+36(SB)/4, $0x3eaaaaaa 4963 DATA genPowF32<>+40(SB)/4, $0xbe7ffffc 4964 DATA genPowF32<>+44(SB)/4, $0x3d9021bb 4965 DATA genPowF32<>+48(SB)/4, $0xcb00007f 4966 DATA genPowF32<>+52(SB)/4, $0x3f800000 4967 DATA genPowF32<>+56(SB)/4, $0xbf000000 4968 DATA genPowF32<>+60(SB)/4, $0x3f000000 4969 DATA genPowF32<>+64(SB)/4, $0x3fb8aa3b 4970 DATA genPowF32<>+68(SB)/4, $0xbf318000 4971 DATA genPowF32<>+72(SB)/4, $0xb95e8083 4972 DATA genPowF32<>+76(SB)/4, $0xbf317218 4973 DATA genPowF32<>+80(SB)/4, $0x3d2aaaab 4974 DATA genPowF32<>+84(SB)/4, $0x3c088889 4975 DATA genPowF32<>+88(SB)/4, $0x3ab60b61 4976 DATA genPowF32<>+92(SB)/4, $0x39500d01 4977 DATA genPowF32<>+96(SB)/4, $0x3e2aaaab 4978 DATA genPowF32<>+100(SB)/4, $0x000000fe 4979 DATA genPowF32<>+104(SB)/4, $0x43960000 4980 DATA genPowF32<>+108(SB)/4, $0x00000001 4981 DATA genPowF32<>+112(SB)/4, $0xc3960000 4982 DATA genPowF32<>+116(SB)/4, $0x7f800000 4983 DATA genPowF32<>+120(SB)/4, $0x7fc00102 4984 DATA genPowF32<>+124(SB)/8, $0x007fffff007fffff 4985 DATA genPowF32<>+132(SB)/8, $0x007fffff007fffff 4986 DATA genPowF32<>+140(SB)/8, $0x3f0000003f000000 4987 DATA genPowF32<>+148(SB)/8, $0x3f0000003f000000 4988 DATA genPowF32<>+156(SB)/8, $0x4b0000004b000000 4989 DATA genPowF32<>+164(SB)/1, $0xff 4990 DATA genPowF32<>+165(SB)/1, $0x00 4991 DATA genPowF32<>+166(SB)/1, $0x00 4992 DATA genPowF32<>+167(SB)/1, $0x00 4993 DATA genPowF32<>+168(SB)/1, $0xff 4994 DATA genPowF32<>+169(SB)/1, $0x00 4995 DATA genPowF32<>+170(SB)/1, $0x00 4996 DATA genPowF32<>+171(SB)/1, $0x00 4997 DATA genPowF32<>+172(SB)/1, $0xff 4998 DATA genPowF32<>+173(SB)/1, $0x00 4999 DATA genPowF32<>+174(SB)/1, $0x00 5000 DATA genPowF32<>+175(SB)/1, $0x00 5001 DATA genPowF32<>+176(SB)/1, $0xff 5002 DATA genPowF32<>+177(SB)/1, $0x00 5003 DATA genPowF32<>+178(SB)/1, $0x00 5004 DATA genPowF32<>+179(SB)/1, $0x00 5005 DATA genPowF32<>+180(SB)/1, $0xff 5006 DATA genPowF32<>+181(SB)/1, $0x00 5007 DATA genPowF32<>+182(SB)/1, $0x00 5008 DATA genPowF32<>+183(SB)/1, $0x00 5009 DATA genPowF32<>+184(SB)/1, $0xff 5010 DATA genPowF32<>+185(SB)/1, $0x00 5011 DATA genPowF32<>+186(SB)/1, $0x00 5012 DATA genPowF32<>+187(SB)/1, $0x00 5013 DATA genPowF32<>+188(SB)/1, $0xff 5014 DATA genPowF32<>+189(SB)/1, $0x00 5015 DATA genPowF32<>+190(SB)/1, $0x00 5016 DATA genPowF32<>+191(SB)/1, $0x00 5017 DATA genPowF32<>+192(SB)/1, $0xff 5018 DATA genPowF32<>+193(SB)/1, $0x00 5019 DATA genPowF32<>+194(SB)/1, $0x00 5020 DATA genPowF32<>+195(SB)/1, $0x00 5021 GLOBL genPowF32<>(SB), RODATA|NOPTR, $196 5022 5023 // func Pow_8x_AVX2_F32(x []float32, y []float32) 5024 // Requires: AVX, AVX2, FMA3 5025 TEXT ·Pow_8x_AVX2_F32(SB), NOSPLIT, $0-48 5026 MOVQ x_base+0(FP), DI 5027 MOVQ y_base+24(FP), SI 5028 MOVQ x_len+8(FP), DX 5029 SUBQ $+872, SP 5030 ANDQ $-8, DX 5031 JE LBB8_12 5032 XORL AX, AX 5033 VBROADCASTSS genPowF32<>+0(SB), Y0 5034 VMOVUPS Y0, 320(SP) 5035 VBROADCASTSS genPowF32<>+4(SB), Y0 5036 VMOVUPS Y0, 800(SP) 5037 VPXOR X7, X7, X7 5038 VBROADCASTSS genPowF32<>+8(SB), Y0 5039 VMOVUPS Y0, 768(SP) 5040 VBROADCASTSS genPowF32<>+12(SB), Y0 5041 VMOVUPS Y0, 736(SP) 5042 VBROADCASTSS 
genPowF32<>+16(SB), Y0 5043 VMOVUPS Y0, 704(SP) 5044 VBROADCASTSS genPowF32<>+20(SB), Y0 5045 VMOVUPS Y0, 672(SP) 5046 VBROADCASTSS genPowF32<>+24(SB), Y0 5047 VMOVUPS Y0, 640(SP) 5048 VBROADCASTSS genPowF32<>+28(SB), Y0 5049 VMOVUPS Y0, 608(SP) 5050 VBROADCASTSS genPowF32<>+32(SB), Y0 5051 VMOVUPS Y0, 576(SP) 5052 VBROADCASTSS genPowF32<>+36(SB), Y0 5053 VMOVUPS Y0, 544(SP) 5054 VBROADCASTSS genPowF32<>+40(SB), Y0 5055 VMOVUPS Y0, 512(SP) 5056 VBROADCASTSS genPowF32<>+44(SB), Y0 5057 VMOVUPS Y0, 480(SP) 5058 VBROADCASTSD genPowF32<>+156(SB), Y0 5059 VMOVUPS Y0, 448(SP) 5060 VBROADCASTSS genPowF32<>+48(SB), Y0 5061 VMOVUPS Y0, 416(SP) 5062 VBROADCASTSS genPowF32<>+52(SB), Y0 5063 VMOVUPS Y0, -128(SP) 5064 VBROADCASTSS genPowF32<>+56(SB), Y0 5065 VMOVUPS Y0, 384(SP) 5066 VBROADCASTSS genPowF32<>+60(SB), Y0 5067 VMOVUPS Y0, 352(SP) 5068 VBROADCASTSS genPowF32<>+64(SB), Y0 5069 VMOVUPS Y0, 288(SP) 5070 VBROADCASTSS genPowF32<>+68(SB), Y0 5071 VMOVUPS Y0, 256(SP) 5072 VBROADCASTSS genPowF32<>+72(SB), Y0 5073 VMOVUPS Y0, 224(SP) 5074 VBROADCASTSS genPowF32<>+76(SB), Y0 5075 VMOVUPS Y0, 192(SP) 5076 VBROADCASTSS genPowF32<>+80(SB), Y0 5077 VMOVUPS Y0, 160(SP) 5078 VBROADCASTSS genPowF32<>+84(SB), Y0 5079 VMOVUPS Y0, 128(SP) 5080 VBROADCASTSS genPowF32<>+88(SB), Y0 5081 VMOVUPS Y0, 96(SP) 5082 VBROADCASTSS genPowF32<>+92(SB), Y0 5083 VMOVUPS Y0, 64(SP) 5084 VBROADCASTSS genPowF32<>+96(SB), Y0 5085 VMOVUPS Y0, 32(SP) 5086 VBROADCASTSS genPowF32<>+100(SB), Y0 5087 VMOVUPS Y0, (SP) 5088 VBROADCASTSS genPowF32<>+104(SB), Y0 5089 VMOVUPS Y0, -32(SP) 5090 VBROADCASTSS genPowF32<>+108(SB), Y0 5091 VMOVUPS Y0, -64(SP) 5092 VPBROADCASTD genPowF32<>+112(SB), Y0 5093 VMOVDQU Y0, -96(SP) 5094 VPBROADCASTD genPowF32<>+116(SB), Y8 5095 VBROADCASTSS genPowF32<>+116(SB), Y12 5096 JMP LBB8_2 5097 5098 LBB8_10: 5099 VPXOR Y0, Y15, Y0 5100 VPANDN Y0, Y14, Y0 5101 VMOVUPS -128(SP), Y14 5102 VMOVUPS 832(SP), Y2 5103 VCMPPS $0x00, Y2, Y14, Y3 5104 VCMPPS $0x01, Y2, Y14, Y4 5105 VXORPS Y4, Y11, Y4 5106 VPXOR X7, X7, X7 5107 VBLENDVPS Y4, Y12, Y7, Y4 5108 VBLENDVPS Y3, Y14, Y4, Y3 5109 VBLENDVPS Y0, Y6, Y3, Y0 5110 VANDPS Y6, Y10, Y3 5111 VANDPS Y9, Y10, Y4 5112 VORPS Y2, Y4, Y4 5113 VBLENDVPS Y13, Y3, Y4, Y3 5114 VBLENDVPS Y1, Y14, Y3, Y1 5115 VBLENDVPS Y5, Y0, Y1, Y0 5116 VCMPPS $0x03, Y9, Y9, Y1 5117 VCMPPS $0x03, Y11, Y11, Y3 5118 VORPS Y1, Y3, Y1 5119 VADDPS Y9, Y11, Y3 5120 VBLENDVPS Y1, Y3, Y0, Y6 5121 VMOVUPS Y6, (DI)(AX*4) 5122 ADDQ $0x08, AX 5123 CMPQ AX, DX 5124 JAE LBB8_12 5125 5126 LBB8_2: 5127 VMOVAPS Y12, Y2 5128 VMOVDQU (DI)(AX*4), Y9 5129 VMOVUPS (SI)(AX*4), Y11 5130 VPAND 320(SP), Y9, Y12 5131 VMOVUPS genPowF32<>+124(SB), X1 5132 VANDPS (DI)(AX*4), X1, X0 5133 VMOVUPS genPowF32<>+140(SB), X3 5134 VORPS X3, X0, X0 5135 VANDPS 16(DI)(AX*4), X1, X1 5136 VORPS X3, X1, X1 5137 VINSERTF128 $0x01, X1, Y0, Y0 5138 VMOVUPS 800(SP), Y1 5139 VCMPPS $0x01, Y0, Y1, Y1 5140 VANDNPS Y0, Y1, Y4 5141 VADDPS 768(SP), Y0, Y0 5142 VADDPS Y4, Y0, Y4 5143 VMULPS Y4, Y4, Y6 5144 VMULPS Y6, Y6, Y0 5145 VMOVUPS 704(SP), Y5 5146 VFMADD213PS 736(SP), Y4, Y5 5147 VMOVUPS 640(SP), Y10 5148 VFMADD213PS 672(SP), Y4, Y10 5149 VFMADD231PS Y5, Y6, Y10 5150 VMOVUPS 576(SP), Y5 5151 VFMADD213PS 608(SP), Y4, Y5 5152 VMOVUPS 512(SP), Y13 5153 VFMADD213PS 544(SP), Y4, Y13 5154 VMULPS Y0, Y0, Y14 5155 VFMADD132PS 480(SP), Y13, Y14 5156 VFMADD231PS Y5, Y6, Y14 5157 VFMADD231PS Y10, Y0, Y14 5158 VMULPS Y4, Y6, Y0 5159 VMULPS Y0, Y14, Y0 5160 VMOVDQU Y12, 832(SP) 5161 VPSRLD $0x17, Y12, Y5 5162 VPOR 448(SP), Y5, Y5 5163 VADDPS 416(SP), Y5, Y5 
5164 VMOVUPS -128(SP), Y3 5165 VANDPS Y3, Y1, Y1 5166 VADDPS Y1, Y5, Y5 5167 VMULPS Y5, Y11, Y1 5168 VROUNDPS $0x08, Y1, Y1 5169 VFNMADD213PS Y1, Y11, Y5 5170 VMOVUPS 384(SP), Y14 5171 VMOVAPS Y14, Y10 5172 VFMADD213PS Y4, Y6, Y10 5173 VADDPS Y0, Y10, Y10 5174 VMOVUPS 352(SP), Y12 5175 VMULPS Y4, Y12, Y13 5176 VMULPS Y6, Y14, Y14 5177 VFMADD231PS Y13, Y4, Y14 5178 VSUBPS Y4, Y10, Y4 5179 VFMADD231PS Y6, Y12, Y4 5180 VMOVUPS 288(SP), Y15 5181 VMULPS Y15, Y11, Y6 5182 VMULPS Y6, Y10, Y6 5183 VROUNDPS $0x08, Y6, Y6 5184 VMULPS 256(SP), Y6, Y13 5185 VFMADD231PS Y10, Y11, Y13 5186 VFNMADD231PS 224(SP), Y6, Y13 5187 VSUBPS Y0, Y14, Y0 5188 VADDPS Y4, Y0, Y0 5189 VMOVUPS 192(SP), Y10 5190 VMULPS Y5, Y10, Y4 5191 VFNMADD231PS Y0, Y11, Y4 5192 VADDPS Y4, Y13, Y0 5193 VMULPS Y0, Y15, Y4 5194 VROUNDPS $0x08, Y4, Y4 5195 VFMADD231PS Y10, Y4, Y0 5196 VMULPS Y0, Y0, Y5 5197 VMULPS Y5, Y5, Y10 5198 VMOVUPS 64(SP), Y13 5199 VFMADD213PS 96(SP), Y0, Y13 5200 VMOVUPS 32(SP), Y14 5201 VFMADD213PS Y12, Y0, Y14 5202 VFMADD231PS Y13, Y10, Y14 5203 VMOVUPS 128(SP), Y10 5204 VFMADD213PS 160(SP), Y0, Y10 5205 VFMADD231PS Y10, Y5, Y14 5206 VADDPS Y3, Y0, Y10 5207 VFMADD231PS Y14, Y5, Y10 5208 VADDPS Y1, Y6, Y0 5209 VADDPS Y4, Y0, Y14 5210 VCVTPS2DQ Y14, Y4 5211 VPSRLD $0x17, Y10, Y0 5212 VPAND genPowF32<>+164(SB), Y0, Y0 5213 VPADDD Y4, Y0, Y0 5214 VPCMPGTD (SP), Y0, Y1 5215 VMOVUPS -32(SP), Y3 5216 VCMPPS $0x01, Y14, Y3, Y5 5217 VPOR Y5, Y1, Y1 5218 VMOVDQU -64(SP), Y3 5219 VPCMPGTD Y0, Y3, Y0 5220 VCMPPS $0x01, -96(SP), Y14, Y5 5221 VPOR Y5, Y0, Y0 5222 VPSLLD $0x17, Y4, Y4 5223 VPADDD Y4, Y10, Y6 5224 VPOR Y1, Y0, Y4 5225 VTESTPS Y4, Y4 5226 JNE LBB8_3 5227 VPCMPEQD Y15, Y15, Y15 5228 VMOVAPS Y2, Y12 5229 JMP LBB8_5 5230 5231 LBB8_3: 5232 VPANDN Y6, Y0, Y0 5233 VMOVAPS Y2, Y12 5234 VBLENDVPS Y1, Y2, Y0, Y6 5235 VPCMPEQD Y15, Y15, Y15 5236 5237 LBB8_5: 5238 VPAND Y8, Y9, Y5 5239 VPCMPEQD Y7, Y5, Y4 5240 VCMPPS $0x01, Y7, Y11, Y13 5241 VCMPPS $0x00, Y7, Y11, Y1 5242 VANDPS -128(SP), Y1, Y0 5243 VBLENDVPS Y13, Y12, Y0, Y0 5244 VBLENDVPS Y4, Y0, Y6, Y6 5245 VMOVMSKPS Y9, CX 5246 TESTL CX, CX 5247 JNE LBB8_7 5248 VXORPS X10, X10, X10 5249 JMP LBB8_8 5250 5251 LBB8_7: 5252 VROUNDPS $0x08, Y11, Y0 5253 VCMPPS $0x00, Y0, Y11, Y0 5254 VCVTPS2DQ Y11, Y10 5255 VPSLLD $0x1f, Y10, Y10 5256 VPOR Y6, Y10, Y12 5257 VPXOR X3, X3, X3 5258 VCMPPS $0x00, Y3, Y9, Y7 5259 VBROADCASTSS genPowF32<>+120(SB), Y3 5260 VBLENDVPS Y7, Y6, Y3, Y3 5261 VBLENDVPS Y0, Y12, Y3, Y3 5262 VMOVAPS Y2, Y12 5263 VPSRAD $0x1f, Y9, Y7 5264 VBLENDVPS Y7, Y3, Y6, Y6 5265 VANDPS Y0, Y10, Y10 5266 5267 LBB8_8: 5268 VPCMPEQD Y5, Y8, Y0 5269 VPXOR Y0, Y15, Y5 5270 VANDPS Y8, Y11, Y0 5271 VANDPS Y8, Y14, Y3 5272 VPCMPEQD Y3, Y8, Y14 5273 VPXOR Y15, Y14, Y3 5274 VPCMPEQD Y0, Y8, Y0 5275 VPANDN Y5, Y0, Y7 5276 VPOR Y4, Y3, Y3 5277 VPAND Y7, Y3, Y3 5278 VTESTPS Y15, Y3 5279 JAE LBB8_10 5280 VPXOR X7, X7, X7 5281 VMOVUPS Y6, (DI)(AX*4) 5282 ADDQ $0x08, AX 5283 CMPQ AX, DX 5284 JB LBB8_2 5285 5286 LBB8_12: 5287 ADDQ $+872, SP 5288 VZEROUPPER 5289 RET 5290 5291 DATA dataSinF32<>+0(SB)/4, $0x7fffffff 5292 DATA dataSinF32<>+4(SB)/4, $0x3fa2f983 5293 DATA dataSinF32<>+8(SB)/4, $0xfffffffe 5294 DATA dataSinF32<>+12(SB)/4, $0x00000002 5295 DATA dataSinF32<>+16(SB)/4, $0xbf490fdb 5296 DATA dataSinF32<>+20(SB)/4, $0x80000000 5297 DATA dataSinF32<>+24(SB)/4, $0x37ccf5ce 5298 DATA dataSinF32<>+28(SB)/4, $0xbab6061a 5299 DATA dataSinF32<>+32(SB)/4, $0x3d2aaaa5 5300 DATA dataSinF32<>+36(SB)/4, $0xbf000000 5301 DATA dataSinF32<>+40(SB)/4, $0x3f800000 5302 DATA 
dataSinF32<>+44(SB)/4, $0xb94ca1f9 5303 DATA dataSinF32<>+48(SB)/4, $0x3c08839e 5304 DATA dataSinF32<>+52(SB)/4, $0xbe2aaaa3 5305 DATA dataSinF32<>+56(SB)/4, $0x4b7fffff 5306 DATA dataSinF32<>+60(SB)/8, $0xffffffffffffffff 5307 DATA dataSinF32<>+68(SB)/8, $0xffffffffffffffff 5308 DATA dataSinF32<>+76(SB)/8, $0xffffffffffffffff 5309 DATA dataSinF32<>+84(SB)/8, $0xffffffffffffffff 5310 DATA dataSinF32<>+92(SB)/8, $0x0000000000000000 5311 DATA dataSinF32<>+100(SB)/8, $0x0000000000000000 5312 DATA dataSinF32<>+108(SB)/8, $0x0000000000000000 5313 DATA dataSinF32<>+116(SB)/8, $0x0000000000000000 5314 GLOBL dataSinF32<>(SB), RODATA|NOPTR, $124 5315 5316 // func Sin_AVX2_F32(x []float32) 5317 // Requires: AVX, AVX2, CMOV, FMA3 5318 TEXT ·Sin_AVX2_F32(SB), $0-24 5319 MOVQ x_base+0(FP), DI 5320 MOVQ x_len+8(FP), SI 5321 PUSHQ AX 5322 MOVQ SI, AX 5323 ANDQ $-8, AX 5324 JE LBB12_3 5325 XORL CX, CX 5326 VBROADCASTSS dataSinF32<>+0(SB), Y0 5327 VMOVUPS Y0, -32(SP) 5328 VBROADCASTSS dataSinF32<>+4(SB), Y0 5329 VMOVUPS Y0, -64(SP) 5330 VBROADCASTSS dataSinF32<>+8(SB), Y0 5331 VMOVUPS Y0, -96(SP) 5332 VPBROADCASTD dataSinF32<>+12(SB), Y4 5333 VPBROADCASTD dataSinF32<>+16(SB), Y0 5334 VMOVDQU Y0, -128(SP) 5335 VPBROADCASTD dataSinF32<>+20(SB), Y7 5336 VBROADCASTSS dataSinF32<>+24(SB), Y8 5337 VBROADCASTSS dataSinF32<>+28(SB), Y9 5338 VBROADCASTSS dataSinF32<>+32(SB), Y10 5339 VBROADCASTSS dataSinF32<>+36(SB), Y11 5340 VBROADCASTSS dataSinF32<>+40(SB), Y12 5341 VBROADCASTSS dataSinF32<>+44(SB), Y3 5342 VBROADCASTSS dataSinF32<>+48(SB), Y14 5343 VBROADCASTSS dataSinF32<>+52(SB), Y15 5344 5345 LBB12_2: 5346 VMOVUPS (DI)(CX*4), Y2 5347 VANDPS -32(SP), Y2, Y5 5348 VMULPS -64(SP), Y5, Y0 5349 VCVTTPS2DQ Y0, Y0 5350 VPSUBD dataSinF32<>+60(SB), Y0, Y0 5351 VPAND -96(SP), Y0, Y1 5352 VCVTDQ2PS Y1, Y1 5353 VFMADD132PS -128(SP), Y5, Y1 5354 VMULPS Y1, Y1, Y5 5355 VMOVAPS Y3, Y13 5356 VFMADD213PS Y14, Y5, Y13 5357 VFMADD213PS Y15, Y5, Y13 5358 VMULPS Y1, Y5, Y6 5359 VFMADD213PS Y1, Y13, Y6 5360 VPSLLD $0x1d, Y0, Y1 5361 VPAND Y4, Y0, Y0 5362 VPXOR Y2, Y1, Y1 5363 VMOVAPS Y8, Y2 5364 VFMADD213PS Y9, Y5, Y2 5365 VFMADD213PS Y10, Y5, Y2 5366 VFMADD213PS Y11, Y5, Y2 5367 VFMADD213PS Y12, Y5, Y2 5368 VPCMPEQD Y4, Y0, Y5 5369 VANDPS Y5, Y2, Y2 5370 VPCMPEQD dataSinF32<>+92(SB), Y0, Y0 5371 VANDPS Y0, Y6, Y0 5372 VADDPS Y2, Y0, Y0 5373 VPAND Y7, Y1, Y1 5374 VPXOR Y0, Y1, Y0 5375 VMOVDQU Y0, (DI)(CX*4) 5376 ADDQ $0x08, CX 5377 CMPQ CX, AX 5378 JB LBB12_2 5379 5380 LBB12_3: 5381 CMPQ AX, SI 5382 JAE LBB12_14 5383 VBROADCASTSS dataSinF32<>+20(SB), X0 5384 VPXOR X1, X1, X1 5385 VMOVSS dataSinF32<>+56(SB), X2 5386 VMOVSS dataSinF32<>+40(SB), X9 5387 VMOVSS dataSinF32<>+16(SB), X10 5388 VMOVSS dataSinF32<>+24(SB), X12 5389 VMOVSS dataSinF32<>+28(SB), X11 5390 VMOVSS dataSinF32<>+32(SB), X13 5391 VMOVSS dataSinF32<>+36(SB), X14 5392 VMOVSS dataSinF32<>+44(SB), X8 5393 VMOVSS dataSinF32<>+48(SB), X15 5394 VMOVSS dataSinF32<>+52(SB), X6 5395 JMP LBB12_5 5396 5397 LBB12_13: 5398 ADDQ $0x01, AX 5399 CMPQ AX, SI 5400 JAE LBB12_14 5401 5402 LBB12_5: 5403 VMOVSS (DI)(AX*4), X4 5404 VXORPS X0, X4, X3 5405 VCMPSS $0x01, X1, X4, X5 5406 VBLENDVPS X5, X3, X4, X3 5407 VUCOMISS X2, X3 5408 JA LBB12_13 5409 VUCOMISS X1, X4 5410 SETCS R8 5411 VMULSS dataSinF32<>+4(SB), X3, X4 5412 VCVTTSS2SI X4, DX 5413 VROUNDSS $0x0b, X4, X4, X4 5414 MOVL DX, CX 5415 ANDL $0x01, CX 5416 JE LBB12_8 5417 VADDSS X4, X9, X4 5418 5419 LBB12_8: 5420 ADDL DX, CX 5421 ANDL $0x07, CX 5422 LEAL -4(CX), DX 5423 CMPL CX, $0x04 5424 SETCC R9 5425 CMOVLLT CX, DX 5426 
VFMADD231SS X10, X4, X3 5427 VMULSS X3, X3, X4 5428 VMOVAPS X12, X7 5429 VFMADD213SS X11, X4, X7 5430 VFMADD213SS X13, X4, X7 5431 VFMADD213SS X14, X4, X7 5432 VMOVAPS X8, X5 5433 VFMADD213SS X15, X4, X5 5434 VFMADD213SS X6, X4, X5 5435 ADDL $-1, DX 5436 CMPL DX, $0x02 5437 JB LBB12_9 5438 VMULSS X3, X4, X4 5439 VFMADD213SS X3, X4, X5 5440 VMOVAPS X5, X4 5441 VMOVSS X4, (DI)(AX*4) 5442 CMPB R8, R9 5443 JE LBB12_13 5444 JMP LBB12_12 5445 5446 LBB12_9: 5447 VFMADD213SS X9, X7, X4 5448 VMOVSS X4, (DI)(AX*4) 5449 CMPB R8, R9 5450 JE LBB12_13 5451 5452 LBB12_12: 5453 VXORPS X0, X4, X3 5454 VMOVSS X3, (DI)(AX*4) 5455 JMP LBB12_13 5456 5457 LBB12_14: 5458 POPQ AX 5459 VZEROUPPER 5460 RET 5461 5462 DATA dataCosF32<>+0(SB)/4, $0x7fffffff 5463 DATA dataCosF32<>+4(SB)/4, $0x3fa2f983 5464 DATA dataCosF32<>+8(SB)/4, $0xfffffffe 5465 DATA dataCosF32<>+12(SB)/4, $0x00000002 5466 DATA dataCosF32<>+16(SB)/4, $0xbf490fdb 5467 DATA dataCosF32<>+20(SB)/4, $0xc0000000 5468 DATA dataCosF32<>+24(SB)/4, $0x37ccf5ce 5469 DATA dataCosF32<>+28(SB)/4, $0xbab6061a 5470 DATA dataCosF32<>+32(SB)/4, $0x3d2aaaa5 5471 DATA dataCosF32<>+36(SB)/4, $0xbf000000 5472 DATA dataCosF32<>+40(SB)/4, $0x3f800000 5473 DATA dataCosF32<>+44(SB)/4, $0xb94ca1f9 5474 DATA dataCosF32<>+48(SB)/4, $0x3c08839e 5475 DATA dataCosF32<>+52(SB)/4, $0xbe2aaaa3 5476 DATA dataCosF32<>+56(SB)/4, $0x80000000 5477 DATA dataCosF32<>+60(SB)/4, $0x4b7fffff 5478 DATA dataCosF32<>+64(SB)/8, $0xffffffffffffffff 5479 DATA dataCosF32<>+72(SB)/8, $0xffffffffffffffff 5480 DATA dataCosF32<>+80(SB)/8, $0xffffffffffffffff 5481 DATA dataCosF32<>+88(SB)/8, $0xffffffffffffffff 5482 DATA dataCosF32<>+96(SB)/8, $0x0000000000000000 5483 DATA dataCosF32<>+104(SB)/8, $0x0000000000000000 5484 DATA dataCosF32<>+112(SB)/8, $0x0000000000000000 5485 DATA dataCosF32<>+120(SB)/8, $0x0000000000000000 5486 GLOBL dataCosF32<>(SB), RODATA|NOPTR, $128 5487 5488 // func Cos_AVX2_F32(x []float32) 5489 // Requires: AVX, AVX2, CMOV, FMA3 5490 TEXT ·Cos_AVX2_F32(SB), NOSPLIT, $0-24 5491 MOVQ x_base+0(FP), DI 5492 MOVQ x_len+8(FP), SI 5493 SUBQ $0x48, SP 5494 MOVQ SI, AX 5495 ANDQ $-8, AX 5496 JE LBB13_3 5497 XORL CX, CX 5498 VBROADCASTSS dataCosF32<>+0(SB), Y0 5499 VMOVUPS Y0, 32(SP) 5500 VBROADCASTSS dataCosF32<>+4(SB), Y0 5501 VMOVUPS Y0, (SP) 5502 VBROADCASTSS dataCosF32<>+8(SB), Y0 5503 VMOVUPS Y0, -32(SP) 5504 VPBROADCASTD dataCosF32<>+12(SB), Y4 5505 VBROADCASTSS dataCosF32<>+16(SB), Y0 5506 VMOVUPS Y0, -64(SP) 5507 VBROADCASTSS dataCosF32<>+20(SB), Y0 5508 VMOVUPS Y0, -96(SP) 5509 VBROADCASTSS dataCosF32<>+24(SB), Y0 5510 VMOVUPS Y0, -128(SP) 5511 VBROADCASTSS dataCosF32<>+28(SB), Y9 5512 VBROADCASTSS dataCosF32<>+32(SB), Y10 5513 VBROADCASTSS dataCosF32<>+36(SB), Y6 5514 VBROADCASTSS dataCosF32<>+40(SB), Y12 5515 VBROADCASTSS dataCosF32<>+44(SB), Y13 5516 VBROADCASTSS dataCosF32<>+48(SB), Y14 5517 VBROADCASTSS dataCosF32<>+52(SB), Y15 5518 VPBROADCASTD dataCosF32<>+56(SB), Y2 5519 5520 LBB13_2: 5521 VMOVUPS 32(SP), Y0 5522 VANDPS (DI)(CX*4), Y0, Y5 5523 VMULPS (SP), Y5, Y0 5524 VCVTTPS2DQ Y0, Y0 5525 VPSUBD dataCosF32<>+64(SB), Y0, Y0 5526 VPAND -32(SP), Y0, Y1 5527 VCVTDQ2PS Y1, Y3 5528 VFMADD132PS -64(SP), Y5, Y3 5529 VMULPS Y3, Y3, Y5 5530 VMOVUPS -128(SP), Y8 5531 VFMADD213PS Y9, Y5, Y8 5532 VFMADD213PS Y10, Y5, Y8 5533 VMULPS Y5, Y5, Y7 5534 VMOVAPS Y6, Y11 5535 VFMADD213PS Y12, Y5, Y11 5536 VFMADD231PS Y7, Y8, Y11 5537 VMOVAPS Y13, Y7 5538 VFMADD213PS Y14, Y5, Y7 5539 VFMADD213PS Y15, Y5, Y7 5540 VMULPS Y3, Y5, Y5 5541 VFMADD213PS Y3, Y7, Y5 5542 VPAND Y4, Y0, Y0 5543 VPCMPEQD 
Y4, Y0, Y3 5544 VPCMPEQD dataCosF32<>+96(SB), Y0, Y0 5545 VANDPS Y0, Y5, Y0 5546 VANDPS Y3, Y11, Y3 5547 VADDPS Y3, Y0, Y0 5548 VADDPS Y5, Y11, Y3 5549 VSUBPS Y0, Y3, Y0 5550 VPSLLD $0x1d, Y1, Y1 5551 VPADDD -96(SP), Y1, Y1 5552 VPAND Y2, Y1, Y1 5553 VPXOR Y2, Y1, Y1 5554 VXORPS Y1, Y0, Y0 5555 VMOVUPS Y0, (DI)(CX*4) 5556 ADDQ $0x08, CX 5557 CMPQ CX, AX 5558 JB LBB13_2 5559 5560 LBB13_3: 5561 CMPQ AX, SI 5562 JAE LBB13_14 5563 VBROADCASTSS dataCosF32<>+56(SB), X0 5564 VXORPS X1, X1, X1 5565 VMOVSS dataCosF32<>+60(SB), X2 5566 VMOVSS dataCosF32<>+40(SB), X9 5567 VMOVSS dataCosF32<>+16(SB), X10 5568 VMOVSS dataCosF32<>+24(SB), X8 5569 VMOVSS dataCosF32<>+28(SB), X11 5570 VMOVSS dataCosF32<>+32(SB), X13 5571 VMOVSS dataCosF32<>+36(SB), X14 5572 VMOVSS dataCosF32<>+44(SB), X7 5573 VMOVSS dataCosF32<>+48(SB), X15 5574 VMOVSS dataCosF32<>+52(SB), X6 5575 JMP LBB13_5 5576 5577 LBB13_13: 5578 ADDQ $0x01, AX 5579 CMPQ AX, SI 5580 JAE LBB13_14 5581 5582 LBB13_5: 5583 VMOVSS (DI)(AX*4), X3 5584 VXORPS X0, X3, X4 5585 VCMPSS $0x01, X1, X3, X5 5586 VBLENDVPS X5, X4, X3, X3 5587 VUCOMISS X2, X3 5588 JA LBB13_13 5589 VMULSS dataCosF32<>+4(SB), X3, X4 5590 VCVTTSS2SI X4, DX 5591 VROUNDSS $0x0b, X4, X4, X4 5592 MOVL DX, CX 5593 ANDL $0x01, CX 5594 JE LBB13_8 5595 VADDSS X4, X9, X4 5596 5597 LBB13_8: 5598 ADDL DX, CX 5599 ANDL $0x07, CX 5600 LEAL -4(CX), DX 5601 CMPL CX, $0x04 5602 CMOVLLT CX, DX 5603 SETCC R8 5604 CMPL DX, $0x02 5605 SETCC CL 5606 VFMADD231SS X10, X4, X3 5607 VMULSS X3, X3, X4 5608 VMOVAPS X8, X12 5609 VFMADD213SS X11, X4, X12 5610 VFMADD213SS X13, X4, X12 5611 VFMADD213SS X14, X4, X12 5612 VMOVAPS X7, X5 5613 VFMADD213SS X15, X4, X5 5614 VFMADD213SS X6, X4, X5 5615 ADDL $-1, DX 5616 CMPL DX, $0x02 5617 JB LBB13_9 5618 VFMADD213SS X9, X12, X4 5619 VMOVAPS X4, X5 5620 VMOVSS X5, (DI)(AX*4) 5621 CMPB R8, CL 5622 JE LBB13_13 5623 JMP LBB13_12 5624 5625 LBB13_9: 5626 VMULSS X3, X4, X4 5627 VFMADD213SS X3, X4, X5 5628 VMOVSS X5, (DI)(AX*4) 5629 CMPB R8, CL 5630 JE LBB13_13 5631 5632 LBB13_12: 5633 VXORPS X0, X5, X3 5634 VMOVSS X3, (DI)(AX*4) 5635 JMP LBB13_13 5636 5637 LBB13_14: 5638 ADDQ $0x48, SP 5639 VZEROUPPER 5640 RET 5641 5642 DATA dataSinCosF32<>+0(SB)/4, $0x7fffffff 5643 DATA dataSinCosF32<>+4(SB)/4, $0x3fa2f983 5644 DATA dataSinCosF32<>+8(SB)/4, $0xfffffffe 5645 DATA dataSinCosF32<>+12(SB)/4, $0x00000002 5646 DATA dataSinCosF32<>+16(SB)/4, $0xbf490fdb 5647 DATA dataSinCosF32<>+20(SB)/4, $0xc0000000 5648 DATA dataSinCosF32<>+24(SB)/4, $0x80000000 5649 DATA dataSinCosF32<>+28(SB)/4, $0x37ccf5ce 5650 DATA dataSinCosF32<>+32(SB)/4, $0xbab6061a 5651 DATA dataSinCosF32<>+36(SB)/4, $0x3d2aaaa5 5652 DATA dataSinCosF32<>+40(SB)/4, $0xbf000000 5653 DATA dataSinCosF32<>+44(SB)/4, $0x3f800000 5654 DATA dataSinCosF32<>+48(SB)/4, $0xb94ca1f9 5655 DATA dataSinCosF32<>+52(SB)/4, $0x3c08839e 5656 DATA dataSinCosF32<>+56(SB)/4, $0xbe2aaaa3 5657 DATA dataSinCosF32<>+60(SB)/4, $0x4b7fffff 5658 DATA dataSinCosF32<>+64(SB)/8, $0xffffffffffffffff 5659 DATA dataSinCosF32<>+72(SB)/8, $0xffffffffffffffff 5660 DATA dataSinCosF32<>+80(SB)/8, $0xffffffffffffffff 5661 DATA dataSinCosF32<>+88(SB)/8, $0xffffffffffffffff 5662 DATA dataSinCosF32<>+96(SB)/8, $0x0000000000000000 5663 DATA dataSinCosF32<>+104(SB)/8, $0x0000000000000000 5664 DATA dataSinCosF32<>+112(SB)/8, $0x0000000000000000 5665 DATA dataSinCosF32<>+120(SB)/8, $0x0000000000000000 5666 GLOBL dataSinCosF32<>(SB), RODATA|NOPTR, $128 5667 5668 // func SinCos_AVX2_F32(x []float32, y []float32, z []float32) 5669 // Requires: AVX, AVX2, CMOV, FMA3 5670 TEXT 
·SinCos_AVX2_F32(SB), $0-72 5671 MOVQ x_base+0(FP), DI 5672 MOVQ y_base+24(FP), SI 5673 MOVQ z_base+48(FP), DX 5674 MOVQ x_len+8(FP), CX 5675 PUSHQ BX 5676 SUBQ $0x60, SP 5677 MOVQ CX, R8 5678 ANDQ $-8, R8 5679 JE LBB14_3 5680 XORL AX, AX 5681 VBROADCASTSS dataSinCosF32<>+0(SB), Y0 5682 VMOVUPS Y0, 64(SP) 5683 VBROADCASTSS dataSinCosF32<>+4(SB), Y0 5684 VMOVUPS Y0, 32(SP) 5685 VBROADCASTSS dataSinCosF32<>+8(SB), Y0 5686 VMOVUPS Y0, (SP) 5687 VPBROADCASTD dataSinCosF32<>+12(SB), Y4 5688 VBROADCASTSS dataSinCosF32<>+16(SB), Y0 5689 VMOVUPS Y0, -32(SP) 5690 VBROADCASTSS dataSinCosF32<>+20(SB), Y0 5691 VMOVUPS Y0, -64(SP) 5692 VPBROADCASTD dataSinCosF32<>+24(SB), Y8 5693 VBROADCASTSS dataSinCosF32<>+28(SB), Y0 5694 VMOVUPS Y0, -96(SP) 5695 VBROADCASTSS dataSinCosF32<>+32(SB), Y0 5696 VMOVUPS Y0, -128(SP) 5697 VBROADCASTSS dataSinCosF32<>+36(SB), Y11 5698 VBROADCASTSS dataSinCosF32<>+40(SB), Y10 5699 VBROADCASTSS dataSinCosF32<>+44(SB), Y13 5700 VBROADCASTSS dataSinCosF32<>+48(SB), Y14 5701 VBROADCASTSS dataSinCosF32<>+52(SB), Y15 5702 VBROADCASTSS dataSinCosF32<>+56(SB), Y2 5703 5704 LBB14_2: 5705 VMOVUPS (DX)(AX*4), Y5 5706 VANDPS 64(SP), Y5, Y1 5707 VMULPS 32(SP), Y1, Y0 5708 VCVTTPS2DQ Y0, Y0 5709 VPSUBD dataSinCosF32<>+64(SB), Y0, Y3 5710 VPAND (SP), Y3, Y0 5711 VCVTDQ2PS Y0, Y6 5712 VFMADD132PS -32(SP), Y1, Y6 5713 VMULPS Y6, Y6, Y1 5714 VMOVUPS -96(SP), Y9 5715 VFMADD213PS -128(SP), Y1, Y9 5716 VFMADD213PS Y11, Y1, Y9 5717 VMULPS Y1, Y1, Y7 5718 VMOVAPS Y10, Y12 5719 VFMADD213PS Y13, Y1, Y12 5720 VFMADD231PS Y7, Y9, Y12 5721 VMOVAPS Y14, Y7 5722 VFMADD213PS Y15, Y1, Y7 5723 VFMADD213PS Y2, Y1, Y7 5724 VMULPS Y6, Y1, Y1 5725 VFMADD213PS Y6, Y7, Y1 5726 VPSLLD $0x1d, Y3, Y6 5727 VPAND Y4, Y3, Y3 5728 VPXOR Y5, Y6, Y5 5729 VPCMPEQD Y4, Y3, Y6 5730 VPCMPEQD dataSinCosF32<>+96(SB), Y3, Y3 5731 VANDPS Y3, Y1, Y3 5732 VANDPS Y6, Y12, Y6 5733 VADDPS Y3, Y6, Y3 5734 VADDPS Y1, Y12, Y1 5735 VPAND Y5, Y8, Y5 5736 VSUBPS Y3, Y1, Y1 5737 VPXOR Y3, Y5, Y3 5738 VPSLLD $0x1d, Y0, Y0 5739 VPADDD -64(SP), Y0, Y0 5740 VPAND Y0, Y8, Y0 5741 VPXOR Y0, Y8, Y0 5742 VXORPS Y0, Y1, Y0 5743 VMOVDQU Y3, (DI)(AX*4) 5744 VMOVUPS Y0, (SI)(AX*4) 5745 ADDQ $0x08, AX 5746 CMPQ AX, R8 5747 JB LBB14_2 5748 5749 LBB14_3: 5750 CMPQ R8, CX 5751 JAE LBB14_16 5752 VBROADCASTSS dataSinCosF32<>+24(SB), X0 5753 VXORPS X1, X1, X1 5754 VMOVSS dataSinCosF32<>+60(SB), X2 5755 VMOVSS dataSinCosF32<>+44(SB), X6 5756 VMOVSS dataSinCosF32<>+28(SB), X8 5757 VMOVSS dataSinCosF32<>+36(SB), X12 5758 VMOVSS dataSinCosF32<>+40(SB), X13 5759 VMOVSS dataSinCosF32<>+48(SB), X15 5760 VMOVSS dataSinCosF32<>+52(SB), X14 5761 VMOVSS dataSinCosF32<>+56(SB), X10 5762 JMP LBB14_5 5763 5764 LBB14_15: 5765 ADDQ $0x01, R8 5766 CMPQ R8, CX 5767 JAE LBB14_16 5768 5769 LBB14_5: 5770 VMOVSS (DX)(R8*4), X4 5771 VXORPS X0, X4, X5 5772 VCMPSS $0x01, X1, X4, X7 5773 VBLENDVPS X7, X5, X4, X5 5774 VUCOMISS X2, X5 5775 JA LBB14_15 5776 VUCOMISS X1, X4 5777 SETCS R9 5778 VMULSS dataSinCosF32<>+4(SB), X5, X4 5779 VCVTTSS2SI X4, R10 5780 VROUNDSS $0x0b, X4, X4, X4 5781 MOVL R10, AX 5782 ANDL $0x01, AX 5783 JE LBB14_8 5784 VADDSS X6, X4, X4 5785 5786 LBB14_8: 5787 ADDL R10, AX 5788 ANDL $0x07, AX 5789 LEAL -4(AX), R10 5790 CMPL AX, $0x04 5791 SETCC R11 5792 CMOVLLT AX, R10 5793 VFMADD231SS dataSinCosF32<>+16(SB), X4, X5 5794 VMULSS X5, X5, X7 5795 VMOVAPS X8, X11 5796 VFMADD213SS dataSinCosF32<>+32(SB), X7, X11 5797 VFMADD213SS X12, X7, X11 5798 VMULSS X7, X7, X9 5799 VMOVAPS X6, X4 5800 VFMADD231SS X13, X7, X4 5801 VFMADD231SS X9, X11, X4 5802 VMOVAPS X15, X3 5803 
VFMADD213SS X14, X7, X3 5804 VFMADD213SS X10, X7, X3 5805 VMULSS X5, X7, X7 5806 VFMADD213SS X5, X3, X7 5807 LEAL -1(R10), BX 5808 CMPL BX, $0x02 5809 JB LBB14_9 5810 VMOVAPS X7, X5 5811 VMOVSS X5, (DI)(R8*4) 5812 VMOVSS X4, (SI)(R8*4) 5813 CMPB R9, R11 5814 JNE LBB14_12 5815 JMP LBB14_13 5816 5817 LBB14_9: 5818 VMOVAPS X4, X5 5819 VMOVAPS X7, X4 5820 VMOVSS X5, (DI)(R8*4) 5821 VMOVSS X4, (SI)(R8*4) 5822 CMPB R9, R11 5823 JE LBB14_13 5824 5825 LBB14_12: 5826 VMOVSS (DI)(R8*4), X3 5827 VXORPS X0, X3, X3 5828 VMOVSS X3, (DI)(R8*4) 5829 5830 LBB14_13: 5831 CMPL R10, $0x02 5832 SETCC BL 5833 CMPL AX, $0x04 5834 SETCC AL 5835 CMPB AL, BL 5836 JE LBB14_15 5837 VMOVSS (SI)(R8*4), X3 5838 VXORPS X0, X3, X3 5839 VMOVSS X3, (SI)(R8*4) 5840 JMP LBB14_15 5841 5842 LBB14_16: 5843 ADDQ $0x60, SP 5844 POPQ BX 5845 VZEROUPPER 5846 RET 5847 5848 DATA dataExpLen8xF32<>+0(SB)/4, $0x42b17218 5849 DATA dataExpLen8xF32<>+4(SB)/4, $0xc2ce8ed0 5850 DATA dataExpLen8xF32<>+8(SB)/4, $0x3f000000 5851 DATA dataExpLen8xF32<>+12(SB)/4, $0x3fb8aa3b 5852 DATA dataExpLen8xF32<>+16(SB)/4, $0xbf318000 5853 DATA dataExpLen8xF32<>+20(SB)/4, $0x395e8083 5854 DATA dataExpLen8xF32<>+24(SB)/4, $0x3f800000 5855 DATA dataExpLen8xF32<>+28(SB)/4, $0x3ab743ce 5856 DATA dataExpLen8xF32<>+32(SB)/4, $0x39506967 5857 DATA dataExpLen8xF32<>+36(SB)/4, $0x3c088908 5858 DATA dataExpLen8xF32<>+40(SB)/4, $0x3d2aa9c1 5859 DATA dataExpLen8xF32<>+44(SB)/4, $0x3e2aaaaa 5860 DATA dataExpLen8xF32<>+48(SB)/4, $0x7f7fffff 5861 GLOBL dataExpLen8xF32<>(SB), RODATA|NOPTR, $52 5862 5863 // func Exp_Len8x_AVX2_F32(x []float32) 5864 // Requires: AVX, AVX2, FMA3 5865 TEXT ·Exp_Len8x_AVX2_F32(SB), NOSPLIT, $0-24 5866 MOVQ x_base+0(FP), DI 5867 MOVQ x_len+8(FP), SI 5868 TESTQ SI, SI 5869 JE LBB11_3 5870 XORL AX, AX 5871 VBROADCASTSS dataExpLen8xF32<>+0(SB), Y0 5872 VMOVUPS Y0, -40(SP) 5873 VBROADCASTSS dataExpLen8xF32<>+4(SB), Y0 5874 VMOVUPS Y0, -72(SP) 5875 VBROADCASTSS dataExpLen8xF32<>+8(SB), Y2 5876 VBROADCASTSS dataExpLen8xF32<>+12(SB), Y3 5877 VBROADCASTSS dataExpLen8xF32<>+16(SB), Y4 5878 VBROADCASTSS dataExpLen8xF32<>+20(SB), Y5 5879 VPBROADCASTD dataExpLen8xF32<>+24(SB), Y6 5880 VBROADCASTSS dataExpLen8xF32<>+28(SB), Y7 5881 VBROADCASTSS dataExpLen8xF32<>+32(SB), Y1 5882 VBROADCASTSS dataExpLen8xF32<>+36(SB), Y9 5883 VBROADCASTSS dataExpLen8xF32<>+40(SB), Y10 5884 VBROADCASTSS dataExpLen8xF32<>+44(SB), Y11 5885 VBROADCASTSS dataExpLen8xF32<>+48(SB), Y12 5886 5887 LBB11_2: 5888 VMOVUPS (DI)(AX*4), Y13 5889 VMOVAPS Y3, Y14 5890 VFMADD213PS Y2, Y13, Y14 5891 VROUNDPS $0x01, Y14, Y14 5892 VMOVAPS Y4, Y15 5893 VFMADD213PS Y13, Y14, Y15 5894 VFMADD231PS Y5, Y14, Y15 5895 VMULPS Y15, Y15, Y0 5896 VMOVAPS Y1, Y8 5897 VFMADD213PS Y7, Y15, Y8 5898 VFMADD213PS Y9, Y15, Y8 5899 VFMADD213PS Y10, Y15, Y8 5900 VFMADD213PS Y11, Y15, Y8 5901 VFMADD213PS Y2, Y15, Y8 5902 VFMADD213PS Y15, Y0, Y8 5903 VCVTTPS2DQ Y14, Y0 5904 VPSLLD $0x17, Y0, Y0 5905 VPADDD Y6, Y0, Y0 5906 VFMADD213PS Y0, Y0, Y8 5907 VMOVUPS -40(SP), Y0 5908 VCMPPS $0x01, Y13, Y0, Y0 5909 VBLENDVPS Y0, Y12, Y8, Y0 5910 VMOVUPS -72(SP), Y8 5911 VCMPPS $0x02, Y13, Y8, Y8 5912 VANDPS Y0, Y8, Y0 5913 VMOVUPS Y0, (DI)(AX*4) 5914 ADDQ $0x08, AX 5915 CMPQ AX, SI 5916 JB LBB11_2 5917 5918 LBB11_3: 5919 VZEROUPPER 5920 RET 5921 5922 DATA dataLogLen8xF32<>+0(SB)/4, $0x00800000 5923 DATA dataLogLen8xF32<>+4(SB)/4, $0x807fffff 5924 DATA dataLogLen8xF32<>+8(SB)/4, $0x3f000000 5925 DATA dataLogLen8xF32<>+12(SB)/4, $0xffffff81 5926 DATA dataLogLen8xF32<>+16(SB)/4, $0x3f800000 5927 DATA dataLogLen8xF32<>+20(SB)/4, 
$0x3f3504f3 5928 DATA dataLogLen8xF32<>+24(SB)/4, $0xbf800000 5929 DATA dataLogLen8xF32<>+28(SB)/4, $0x3d9021bb 5930 DATA dataLogLen8xF32<>+32(SB)/4, $0xbdebd1b8 5931 DATA dataLogLen8xF32<>+36(SB)/4, $0x3def251a 5932 DATA dataLogLen8xF32<>+40(SB)/4, $0xbdfe5d4f 5933 DATA dataLogLen8xF32<>+44(SB)/4, $0x3e11e9bf 5934 DATA dataLogLen8xF32<>+48(SB)/4, $0xbe2aae50 5935 DATA dataLogLen8xF32<>+52(SB)/4, $0x3e4cceac 5936 DATA dataLogLen8xF32<>+56(SB)/4, $0xbe7ffffc 5937 DATA dataLogLen8xF32<>+60(SB)/4, $0x3eaaaaaa 5938 DATA dataLogLen8xF32<>+64(SB)/4, $0x3f317218 5939 DATA dataLogLen8xF32<>+68(SB)/4, $0xbf000000 5940 DATA dataLogLen8xF32<>+72(SB)/8, $0x0000000000000000 5941 DATA dataLogLen8xF32<>+80(SB)/8, $0x0000000000000000 5942 DATA dataLogLen8xF32<>+88(SB)/8, $0x0000000000000000 5943 DATA dataLogLen8xF32<>+96(SB)/8, $0x0000000000000000 5944 GLOBL dataLogLen8xF32<>(SB), RODATA|NOPTR, $104 5945 5946 // func Log_Len8x_AVX2_F32(x []float32) 5947 // Requires: AVX, AVX2, FMA3 5948 TEXT ·Log_Len8x_AVX2_F32(SB), NOSPLIT, $0-24 5949 MOVQ x_base+0(FP), DI 5950 MOVQ x_len+8(FP), SI 5951 SUBQ $0x68, SP 5952 TESTQ SI, SI 5953 JE LBB10_3 5954 XORL AX, AX 5955 VBROADCASTSS dataLogLen8xF32<>+0(SB), Y0 5956 VMOVUPS Y0, 64(SP) 5957 VBROADCASTSS dataLogLen8xF32<>+4(SB), Y0 5958 VMOVUPS Y0, 32(SP) 5959 VBROADCASTSS dataLogLen8xF32<>+8(SB), Y0 5960 VMOVUPS Y0, (SP) 5961 VBROADCASTSS dataLogLen8xF32<>+12(SB), Y0 5962 VMOVUPS Y0, -32(SP) 5963 VBROADCASTSS dataLogLen8xF32<>+16(SB), Y0 5964 VMOVUPS Y0, -64(SP) 5965 VBROADCASTSS dataLogLen8xF32<>+20(SB), Y0 5966 VMOVUPS Y0, -96(SP) 5967 VBROADCASTSS dataLogLen8xF32<>+24(SB), Y0 5968 VMOVUPS Y0, -128(SP) 5969 VBROADCASTSS dataLogLen8xF32<>+28(SB), Y8 5970 VBROADCASTSS dataLogLen8xF32<>+32(SB), Y9 5971 VBROADCASTSS dataLogLen8xF32<>+36(SB), Y10 5972 VBROADCASTSS dataLogLen8xF32<>+40(SB), Y11 5973 VBROADCASTSS dataLogLen8xF32<>+44(SB), Y12 5974 VBROADCASTSS dataLogLen8xF32<>+48(SB), Y13 5975 VBROADCASTSS dataLogLen8xF32<>+52(SB), Y14 5976 VBROADCASTSS dataLogLen8xF32<>+56(SB), Y15 5977 VBROADCASTSS dataLogLen8xF32<>+60(SB), Y0 5978 VBROADCASTSS dataLogLen8xF32<>+64(SB), Y1 5979 VBROADCASTSS dataLogLen8xF32<>+68(SB), Y2 5980 5981 LBB10_2: 5982 VMOVUPS (DI)(AX*4), Y3 5983 VMAXPS 64(SP), Y3, Y4 5984 VPSRLD $0x17, Y4, Y5 5985 VPADDD -32(SP), Y5, Y5 5986 VANDPS 32(SP), Y4, Y4 5987 VORPS (SP), Y4, Y4 5988 VCVTDQ2PS Y5, Y5 5989 VADDPS -64(SP), Y5, Y6 5990 VCMPPS $0x01, -96(SP), Y4, Y7 5991 VBLENDVPS Y7, Y5, Y6, Y5 5992 VANDPS Y4, Y7, Y6 5993 VADDPS -128(SP), Y4, Y4 5994 VADDPS Y6, Y4, Y4 5995 VMOVAPS Y8, Y6 5996 VFMADD213PS Y9, Y4, Y6 5997 VFMADD213PS Y10, Y4, Y6 5998 VFMADD213PS Y11, Y4, Y6 5999 VFMADD213PS Y12, Y4, Y6 6000 VFMADD213PS Y13, Y4, Y6 6001 VFMADD213PS Y14, Y4, Y6 6002 VFMADD213PS Y15, Y4, Y6 6003 VFMADD213PS Y0, Y4, Y6 6004 VFMADD213PS Y2, Y4, Y6 6005 VFMADD213PS Y4, Y1, Y5 6006 VMULPS Y4, Y4, Y4 6007 VFMADD231PS Y6, Y4, Y5 6008 VCMPPS $0x02, dataLogLen8xF32<>+72(SB), Y3, Y3 6009 VORPS Y5, Y3, Y3 6010 VMOVUPS Y3, (DI)(AX*4) 6011 ADDQ $0x08, AX 6012 CMPQ AX, SI 6013 JB LBB10_2 6014 6015 LBB10_3: 6016 ADDQ $0x68, SP 6017 VZEROUPPER 6018 RET 6019 6020 DATA dataLog2Len8xF32<>+0(SB)/4, $0x00800000 6021 DATA dataLog2Len8xF32<>+4(SB)/4, $0x807fffff 6022 DATA dataLog2Len8xF32<>+8(SB)/4, $0x3f000000 6023 DATA dataLog2Len8xF32<>+12(SB)/4, $0xffffff81 6024 DATA dataLog2Len8xF32<>+16(SB)/4, $0x3f800000 6025 DATA dataLog2Len8xF32<>+20(SB)/4, $0x3f3504f3 6026 DATA dataLog2Len8xF32<>+24(SB)/4, $0xbf800000 6027 DATA dataLog2Len8xF32<>+28(SB)/4, $0x3d9021bb 6028 DATA 
dataLog2Len8xF32<>+32(SB)/4, $0xbdebd1b8 6029 DATA dataLog2Len8xF32<>+36(SB)/4, $0x3def251a 6030 DATA dataLog2Len8xF32<>+40(SB)/4, $0xbdfe5d4f 6031 DATA dataLog2Len8xF32<>+44(SB)/4, $0x3e11e9bf 6032 DATA dataLog2Len8xF32<>+48(SB)/4, $0xbe2aae50 6033 DATA dataLog2Len8xF32<>+52(SB)/4, $0x3e4cceac 6034 DATA dataLog2Len8xF32<>+56(SB)/4, $0xbe7ffffc 6035 DATA dataLog2Len8xF32<>+60(SB)/4, $0x3eaaaaaa 6036 DATA dataLog2Len8xF32<>+64(SB)/4, $0x3f317218 6037 DATA dataLog2Len8xF32<>+68(SB)/4, $0xbf000000 6038 DATA dataLog2Len8xF32<>+72(SB)/4, $0x3fb8aa3b 6039 DATA dataLog2Len8xF32<>+76(SB)/8, $0x0000000000000000 6040 DATA dataLog2Len8xF32<>+84(SB)/8, $0x0000000000000000 6041 DATA dataLog2Len8xF32<>+92(SB)/8, $0x0000000000000000 6042 DATA dataLog2Len8xF32<>+100(SB)/8, $0x0000000000000000 6043 GLOBL dataLog2Len8xF32<>(SB), RODATA|NOPTR, $108 6044 6045 // func Log2_Len8x_AVX2_F32(x []float32) 6046 // Requires: AVX, AVX2, FMA3 6047 TEXT ·Log2_Len8x_AVX2_F32(SB), NOSPLIT, $0-24 6048 MOVQ x_base+0(FP), DI 6049 MOVQ x_len+8(FP), SI 6050 SUBQ $0x88, SP 6051 TESTQ SI, SI 6052 JE LBB9_3 6053 XORL AX, AX 6054 VBROADCASTSS dataLog2Len8xF32<>+4(SB), Y0 6055 VMOVUPS Y0, 96(SP) 6056 VBROADCASTSS dataLog2Len8xF32<>+8(SB), Y0 6057 VMOVUPS Y0, 64(SP) 6058 VBROADCASTSS dataLog2Len8xF32<>+12(SB), Y0 6059 VMOVUPS Y0, 32(SP) 6060 VBROADCASTSS dataLog2Len8xF32<>+0(SB), Y0 6061 VMOVUPS Y0, (SP) 6062 VBROADCASTSS dataLog2Len8xF32<>+16(SB), Y0 6063 VMOVUPS Y0, -32(SP) 6064 VBROADCASTSS dataLog2Len8xF32<>+20(SB), Y0 6065 VMOVUPS Y0, -64(SP) 6066 VBROADCASTSS dataLog2Len8xF32<>+24(SB), Y0 6067 VMOVUPS Y0, -96(SP) 6068 VBROADCASTSS dataLog2Len8xF32<>+28(SB), Y0 6069 VMOVUPS Y0, -128(SP) 6070 VBROADCASTSS dataLog2Len8xF32<>+32(SB), Y9 6071 VBROADCASTSS dataLog2Len8xF32<>+36(SB), Y10 6072 VBROADCASTSS dataLog2Len8xF32<>+40(SB), Y11 6073 VBROADCASTSS dataLog2Len8xF32<>+44(SB), Y12 6074 VBROADCASTSS dataLog2Len8xF32<>+48(SB), Y13 6075 VBROADCASTSS dataLog2Len8xF32<>+52(SB), Y14 6076 VBROADCASTSS dataLog2Len8xF32<>+56(SB), Y15 6077 VBROADCASTSS dataLog2Len8xF32<>+60(SB), Y0 6078 VBROADCASTSS dataLog2Len8xF32<>+64(SB), Y1 6079 VBROADCASTSS dataLog2Len8xF32<>+68(SB), Y2 6080 VBROADCASTSS dataLog2Len8xF32<>+72(SB), Y3 6081 6082 LBB9_2: 6083 VMOVUPS (DI)(AX*4), Y4 6084 VMAXPS (SP), Y4, Y5 6085 VPSRLD $0x17, Y5, Y6 6086 VPADDD 32(SP), Y6, Y6 6087 VANDPS 96(SP), Y5, Y5 6088 VORPS 64(SP), Y5, Y5 6089 VCVTDQ2PS Y6, Y6 6090 VADDPS -32(SP), Y6, Y7 6091 VCMPPS $0x01, -64(SP), Y5, Y8 6092 VBLENDVPS Y8, Y6, Y7, Y6 6093 VANDPS Y5, Y8, Y7 6094 VADDPS -96(SP), Y5, Y5 6095 VADDPS Y7, Y5, Y5 6096 VMOVUPS -128(SP), Y7 6097 VFMADD213PS Y9, Y5, Y7 6098 VFMADD213PS Y10, Y5, Y7 6099 VFMADD213PS Y11, Y5, Y7 6100 VFMADD213PS Y12, Y5, Y7 6101 VFMADD213PS Y13, Y5, Y7 6102 VFMADD213PS Y14, Y5, Y7 6103 VFMADD213PS Y15, Y5, Y7 6104 VFMADD213PS Y0, Y5, Y7 6105 VFMADD213PS Y2, Y5, Y7 6106 VFMADD213PS Y5, Y1, Y6 6107 VMULPS Y5, Y5, Y5 6108 VFMADD231PS Y7, Y5, Y6 6109 VCMPPS $0x02, dataLog2Len8xF32<>+76(SB), Y4, Y4 6110 VMULPS Y3, Y6, Y5 6111 VORPS Y5, Y4, Y4 6112 VMOVUPS Y4, (DI)(AX*4) 6113 ADDQ $0x08, AX 6114 CMPQ AX, SI 6115 JB LBB9_2 6116 6117 LBB9_3: 6118 ADDQ $0x88, SP 6119 VZEROUPPER 6120 RET 6121 6122 DATA dataLog10Len8xF32<>+0(SB)/4, $0x00800000 6123 DATA dataLog10Len8xF32<>+4(SB)/4, $0x807fffff 6124 DATA dataLog10Len8xF32<>+8(SB)/4, $0x3f000000 6125 DATA dataLog10Len8xF32<>+12(SB)/4, $0xffffff81 6126 DATA dataLog10Len8xF32<>+16(SB)/4, $0x3f800000 6127 DATA dataLog10Len8xF32<>+20(SB)/4, $0x3f3504f3 6128 DATA dataLog10Len8xF32<>+24(SB)/4, $0xbf800000 6129 
DATA dataLog10Len8xF32<>+28(SB)/4, $0x3d9021bb 6130 DATA dataLog10Len8xF32<>+32(SB)/4, $0xbdebd1b8 6131 DATA dataLog10Len8xF32<>+36(SB)/4, $0x3def251a 6132 DATA dataLog10Len8xF32<>+40(SB)/4, $0xbdfe5d4f 6133 DATA dataLog10Len8xF32<>+44(SB)/4, $0x3e11e9bf 6134 DATA dataLog10Len8xF32<>+48(SB)/4, $0xbe2aae50 6135 DATA dataLog10Len8xF32<>+52(SB)/4, $0x3e4cceac 6136 DATA dataLog10Len8xF32<>+56(SB)/4, $0xbe7ffffc 6137 DATA dataLog10Len8xF32<>+60(SB)/4, $0x3eaaaaaa 6138 DATA dataLog10Len8xF32<>+64(SB)/4, $0x3f317218 6139 DATA dataLog10Len8xF32<>+68(SB)/4, $0xbf000000 6140 DATA dataLog10Len8xF32<>+72(SB)/4, $0x3ede5bd9 6141 DATA dataLog10Len8xF32<>+76(SB)/8, $0x0000000000000000 6142 DATA dataLog10Len8xF32<>+84(SB)/8, $0x0000000000000000 6143 DATA dataLog10Len8xF32<>+92(SB)/8, $0x0000000000000000 6144 DATA dataLog10Len8xF32<>+100(SB)/8, $0x0000000000000000 6145 GLOBL dataLog10Len8xF32<>(SB), RODATA|NOPTR, $108 6146 6147 // func Log10_Len8x_AVX2_F32(x []float32) 6148 // Requires: AVX, AVX2, FMA3 6149 TEXT ·Log10_Len8x_AVX2_F32(SB), NOSPLIT, $0-24 6150 MOVQ x_base+0(FP), DI 6151 MOVQ x_len+8(FP), SI 6152 SUBQ $0x88, SP 6153 TESTQ SI, SI 6154 JE LBB8_3 6155 XORL AX, AX 6156 VBROADCASTSS dataLog10Len8xF32<>+4(SB), Y0 6157 VMOVUPS Y0, 96(SP) 6158 VBROADCASTSS dataLog10Len8xF32<>+8(SB), Y0 6159 VMOVUPS Y0, 64(SP) 6160 VBROADCASTSS dataLog10Len8xF32<>+12(SB), Y0 6161 VMOVUPS Y0, 32(SP) 6162 VBROADCASTSS dataLog10Len8xF32<>+0(SB), Y0 6163 VMOVUPS Y0, (SP) 6164 VBROADCASTSS dataLog10Len8xF32<>+16(SB), Y0 6165 VMOVUPS Y0, -32(SP) 6166 VBROADCASTSS dataLog10Len8xF32<>+20(SB), Y0 6167 VMOVUPS Y0, -64(SP) 6168 VBROADCASTSS dataLog10Len8xF32<>+24(SB), Y0 6169 VMOVUPS Y0, -96(SP) 6170 VBROADCASTSS dataLog10Len8xF32<>+28(SB), Y0 6171 VMOVUPS Y0, -128(SP) 6172 VBROADCASTSS dataLog10Len8xF32<>+32(SB), Y9 6173 VBROADCASTSS dataLog10Len8xF32<>+36(SB), Y10 6174 VBROADCASTSS dataLog10Len8xF32<>+40(SB), Y11 6175 VBROADCASTSS dataLog10Len8xF32<>+44(SB), Y12 6176 VBROADCASTSS dataLog10Len8xF32<>+48(SB), Y13 6177 VBROADCASTSS dataLog10Len8xF32<>+52(SB), Y14 6178 VBROADCASTSS dataLog10Len8xF32<>+56(SB), Y15 6179 VBROADCASTSS dataLog10Len8xF32<>+60(SB), Y0 6180 VBROADCASTSS dataLog10Len8xF32<>+64(SB), Y1 6181 VBROADCASTSS dataLog10Len8xF32<>+68(SB), Y2 6182 VBROADCASTSS dataLog10Len8xF32<>+72(SB), Y3 6183 6184 LBB8_2: 6185 VMOVUPS (DI)(AX*4), Y4 6186 VMAXPS (SP), Y4, Y5 6187 VPSRLD $0x17, Y5, Y6 6188 VPADDD 32(SP), Y6, Y6 6189 VANDPS 96(SP), Y5, Y5 6190 VORPS 64(SP), Y5, Y5 6191 VCVTDQ2PS Y6, Y6 6192 VADDPS -32(SP), Y6, Y7 6193 VCMPPS $0x01, -64(SP), Y5, Y8 6194 VBLENDVPS Y8, Y6, Y7, Y6 6195 VANDPS Y5, Y8, Y7 6196 VADDPS -96(SP), Y5, Y5 6197 VADDPS Y7, Y5, Y5 6198 VMOVUPS -128(SP), Y7 6199 VFMADD213PS Y9, Y5, Y7 6200 VFMADD213PS Y10, Y5, Y7 6201 VFMADD213PS Y11, Y5, Y7 6202 VFMADD213PS Y12, Y5, Y7 6203 VFMADD213PS Y13, Y5, Y7 6204 VFMADD213PS Y14, Y5, Y7 6205 VFMADD213PS Y15, Y5, Y7 6206 VFMADD213PS Y0, Y5, Y7 6207 VFMADD213PS Y2, Y5, Y7 6208 VFMADD213PS Y5, Y1, Y6 6209 VMULPS Y5, Y5, Y5 6210 VFMADD231PS Y7, Y5, Y6 6211 VCMPPS $0x02, dataLog10Len8xF32<>+76(SB), Y4, Y4 6212 VMULPS Y3, Y6, Y5 6213 VORPS Y5, Y4, Y4 6214 VMOVUPS Y4, (DI)(AX*4) 6215 ADDQ $0x08, AX 6216 CMPQ AX, SI 6217 JB LBB8_2 6218 6219 LBB8_3: 6220 ADDQ $0x88, SP 6221 VZEROUPPER 6222 RET 6223 6224 DATA dataMinF64<>+0(SB)/8, $0x7fefffffffffffff 6225 GLOBL dataMinF64<>(SB), RODATA|NOPTR, $8 6226 6227 // func Min_AVX2_F64(x []float64) float64 6228 // Requires: AVX, SSE2 6229 TEXT ·Min_AVX2_F64(SB), NOSPLIT, $0-32 6230 MOVQ x_base+0(FP), DI 6231 MOVQ x_len+8(FP), 
SI 6232 TESTQ SI, SI 6233 JE LBB0_1 6234 CMPQ SI, $0x10 6235 JAE LBB0_4 6236 VMOVSD dataMinF64<>+0(SB), X0 6237 XORL AX, AX 6238 JMP LBB0_11 6239 6240 LBB0_1: 6241 VMOVSD dataMinF64<>+0(SB), X0 6242 MOVSD X0, ret+24(FP) 6243 RET 6244 6245 LBB0_4: 6246 MOVQ SI, AX 6247 ANDQ $-16, AX 6248 LEAQ -16(AX), CX 6249 MOVQ CX, R8 6250 SHRQ $0x04, R8 6251 ADDQ $0x01, R8 6252 TESTQ CX, CX 6253 JE LBB0_5 6254 MOVQ R8, CX 6255 ANDQ $-2, CX 6256 VBROADCASTSD dataMinF64<>+0(SB), Y0 6257 XORL DX, DX 6258 VMOVAPD Y0, Y1 6259 VMOVAPD Y0, Y2 6260 VMOVAPD Y0, Y3 6261 6262 LBB0_7: 6263 VMINPD (DI)(DX*8), Y0, Y0 6264 VMINPD 32(DI)(DX*8), Y1, Y1 6265 VMINPD 64(DI)(DX*8), Y2, Y2 6266 VMINPD 96(DI)(DX*8), Y3, Y3 6267 VMINPD 128(DI)(DX*8), Y0, Y0 6268 VMINPD 160(DI)(DX*8), Y1, Y1 6269 VMINPD 192(DI)(DX*8), Y2, Y2 6270 VMINPD 224(DI)(DX*8), Y3, Y3 6271 ADDQ $0x20, DX 6272 ADDQ $-2, CX 6273 JNE LBB0_7 6274 TESTB $0x01, R8 6275 JE LBB0_10 6276 6277 LBB0_9: 6278 VMINPD (DI)(DX*8), Y0, Y0 6279 VMINPD 32(DI)(DX*8), Y1, Y1 6280 VMINPD 64(DI)(DX*8), Y2, Y2 6281 VMINPD 96(DI)(DX*8), Y3, Y3 6282 6283 LBB0_10: 6284 VMINPD Y3, Y0, Y0 6285 VMINPD Y2, Y1, Y1 6286 VMINPD Y0, Y1, Y0 6287 VEXTRACTF128 $0x01, Y0, X1 6288 VMINPD X1, X0, X0 6289 VPERMILPD $0x01, X0, X1 6290 VMINSD X1, X0, X0 6291 CMPQ AX, SI 6292 JE LBB0_12 6293 6294 LBB0_11: 6295 VMINSD (DI)(AX*8), X0, X0 6296 ADDQ $0x01, AX 6297 CMPQ SI, AX 6298 JNE LBB0_11 6299 6300 LBB0_12: 6301 VZEROUPPER 6302 MOVSD X0, ret+24(FP) 6303 RET 6304 6305 LBB0_5: 6306 VBROADCASTSD dataMinF64<>+0(SB), Y0 6307 XORL DX, DX 6308 VMOVAPD Y0, Y1 6309 VMOVAPD Y0, Y2 6310 VMOVAPD Y0, Y3 6311 TESTB $0x01, R8 6312 JNE LBB0_9 6313 JMP LBB0_10 6314 6315 DATA dataMinF32<>+0(SB)/4, $0x7f7fffff 6316 GLOBL dataMinF32<>(SB), RODATA|NOPTR, $4 6317 6318 // func Min_AVX2_F32(x []float32) float32 6319 // Requires: AVX, SSE 6320 TEXT ·Min_AVX2_F32(SB), NOSPLIT, $0-28 6321 MOVQ x_base+0(FP), DI 6322 MOVQ x_len+8(FP), SI 6323 TESTQ SI, SI 6324 JE LBB1_1 6325 CMPQ SI, $0x20 6326 JAE LBB1_4 6327 VMOVSS dataMinF32<>+0(SB), X0 6328 XORL AX, AX 6329 JMP LBB1_11 6330 6331 LBB1_1: 6332 VMOVSS dataMinF32<>+0(SB), X0 6333 MOVSS X0, ret+24(FP) 6334 RET 6335 6336 LBB1_4: 6337 MOVQ SI, AX 6338 ANDQ $-32, AX 6339 LEAQ -32(AX), CX 6340 MOVQ CX, R8 6341 SHRQ $0x05, R8 6342 ADDQ $0x01, R8 6343 TESTQ CX, CX 6344 JE LBB1_5 6345 MOVQ R8, CX 6346 ANDQ $-2, CX 6347 VBROADCASTSS dataMinF32<>+0(SB), Y0 6348 XORL DX, DX 6349 VMOVAPS Y0, Y1 6350 VMOVAPS Y0, Y2 6351 VMOVAPS Y0, Y3 6352 6353 LBB1_7: 6354 VMINPS (DI)(DX*4), Y0, Y0 6355 VMINPS 32(DI)(DX*4), Y1, Y1 6356 VMINPS 64(DI)(DX*4), Y2, Y2 6357 VMINPS 96(DI)(DX*4), Y3, Y3 6358 VMINPS 128(DI)(DX*4), Y0, Y0 6359 VMINPS 160(DI)(DX*4), Y1, Y1 6360 VMINPS 192(DI)(DX*4), Y2, Y2 6361 VMINPS 224(DI)(DX*4), Y3, Y3 6362 ADDQ $0x40, DX 6363 ADDQ $-2, CX 6364 JNE LBB1_7 6365 TESTB $0x01, R8 6366 JE LBB1_10 6367 6368 LBB1_9: 6369 VMINPS (DI)(DX*4), Y0, Y0 6370 VMINPS 32(DI)(DX*4), Y1, Y1 6371 VMINPS 64(DI)(DX*4), Y2, Y2 6372 VMINPS 96(DI)(DX*4), Y3, Y3 6373 6374 LBB1_10: 6375 VMINPS Y3, Y0, Y0 6376 VMINPS Y2, Y1, Y1 6377 VMINPS Y0, Y1, Y0 6378 VEXTRACTF128 $0x01, Y0, X1 6379 VMINPS X1, X0, X0 6380 VPERMILPD $0x01, X0, X1 6381 VMINPS X1, X0, X0 6382 VMOVSHDUP X0, X1 6383 VMINSS X1, X0, X0 6384 CMPQ AX, SI 6385 JE LBB1_12 6386 6387 LBB1_11: 6388 VMINSS (DI)(AX*4), X0, X0 6389 ADDQ $0x01, AX 6390 CMPQ SI, AX 6391 JNE LBB1_11 6392 6393 LBB1_12: 6394 VZEROUPPER 6395 MOVSS X0, ret+24(FP) 6396 RET 6397 6398 LBB1_5: 6399 VBROADCASTSS dataMinF32<>+0(SB), Y0 6400 XORL DX, DX 6401 VMOVAPS Y0, Y1 6402 
VMOVAPS Y0, Y2 6403 VMOVAPS Y0, Y3 6404 TESTB $0x01, R8 6405 JNE LBB1_9 6406 JMP LBB1_10 6407 6408 // func Minimum_AVX2_F64(x []float64, y []float64) 6409 // Requires: AVX 6410 TEXT ·Minimum_AVX2_F64(SB), NOSPLIT, $0-48 6411 MOVQ x_base+0(FP), DI 6412 MOVQ y_base+24(FP), SI 6413 MOVQ x_len+8(FP), DX 6414 TESTQ DX, DX 6415 JE LBB2_9 6416 CMPQ DX, $0x10 6417 JAE LBB2_3 6418 XORL AX, AX 6419 JMP LBB2_6 6420 6421 LBB2_3: 6422 MOVQ DX, AX 6423 ANDQ $-16, AX 6424 LEAQ 96(DI), R8 6425 XORL CX, CX 6426 6427 LBB2_4: 6428 VMOVUPD (SI)(CX*8), Y0 6429 VMOVUPD 32(SI)(CX*8), Y1 6430 VMOVUPD 64(SI)(CX*8), Y2 6431 VMOVUPD 96(SI)(CX*8), Y3 6432 VCMPPD $0x01, -96(R8)(CX*8), Y0, Y4 6433 VCMPPD $0x01, -64(R8)(CX*8), Y1, Y5 6434 VCMPPD $0x01, -32(R8)(CX*8), Y2, Y6 6435 VCMPPD $0x01, (R8)(CX*8), Y3, Y7 6436 VMASKMOVPD Y0, Y4, -96(R8)(CX*8) 6437 VMASKMOVPD Y1, Y5, -64(R8)(CX*8) 6438 VMASKMOVPD Y2, Y6, -32(R8)(CX*8) 6439 VMASKMOVPD Y3, Y7, (R8)(CX*8) 6440 ADDQ $0x10, CX 6441 CMPQ AX, CX 6442 JNE LBB2_4 6443 CMPQ AX, DX 6444 JNE LBB2_6 6445 6446 LBB2_9: 6447 VZEROUPPER 6448 RET 6449 6450 LBB2_8: 6451 ADDQ $0x01, AX 6452 CMPQ DX, AX 6453 JE LBB2_9 6454 6455 LBB2_6: 6456 VMOVSD (SI)(AX*8), X0 6457 VUCOMISD (DI)(AX*8), X0 6458 JAE LBB2_8 6459 VMOVSD X0, (DI)(AX*8) 6460 JMP LBB2_8 6461 6462 // func Minimum_AVX2_F32(x []float32, y []float32) 6463 // Requires: AVX 6464 TEXT ·Minimum_AVX2_F32(SB), NOSPLIT, $0-48 6465 MOVQ x_base+0(FP), DI 6466 MOVQ y_base+24(FP), SI 6467 MOVQ x_len+8(FP), DX 6468 TESTQ DX, DX 6469 JE LBB3_9 6470 CMPQ DX, $0x20 6471 JAE LBB3_3 6472 XORL AX, AX 6473 JMP LBB3_6 6474 6475 LBB3_3: 6476 MOVQ DX, AX 6477 ANDQ $-32, AX 6478 LEAQ 96(DI), R8 6479 XORL CX, CX 6480 6481 LBB3_4: 6482 VMOVUPS (SI)(CX*4), Y0 6483 VMOVUPS 32(SI)(CX*4), Y1 6484 VMOVUPS 64(SI)(CX*4), Y2 6485 VMOVUPS 96(SI)(CX*4), Y3 6486 VCMPPS $0x01, -96(R8)(CX*4), Y0, Y4 6487 VCMPPS $0x01, -64(R8)(CX*4), Y1, Y5 6488 VCMPPS $0x01, -32(R8)(CX*4), Y2, Y6 6489 VCMPPS $0x01, (R8)(CX*4), Y3, Y7 6490 VMASKMOVPS Y0, Y4, -96(R8)(CX*4) 6491 VMASKMOVPS Y1, Y5, -64(R8)(CX*4) 6492 VMASKMOVPS Y2, Y6, -32(R8)(CX*4) 6493 VMASKMOVPS Y3, Y7, (R8)(CX*4) 6494 ADDQ $0x20, CX 6495 CMPQ AX, CX 6496 JNE LBB3_4 6497 CMPQ AX, DX 6498 JNE LBB3_6 6499 6500 LBB3_9: 6501 VZEROUPPER 6502 RET 6503 6504 LBB3_8: 6505 ADDQ $0x01, AX 6506 CMPQ DX, AX 6507 JE LBB3_9 6508 6509 LBB3_6: 6510 VMOVSS (SI)(AX*4), X0 6511 VUCOMISS (DI)(AX*4), X0 6512 JAE LBB3_8 6513 VMOVSS X0, (DI)(AX*4) 6514 JMP LBB3_8 6515 6516 // func MinimumNumber_AVX2_F64(x []float64, a float64) 6517 // Requires: AVX, AVX2, SSE2 6518 TEXT ·MinimumNumber_AVX2_F64(SB), NOSPLIT, $0-32 6519 MOVQ x_base+0(FP), DI 6520 MOVSD a+24(FP), X0 6521 MOVQ x_len+8(FP), SI 6522 TESTQ SI, SI 6523 JE LBB4_9 6524 CMPQ SI, $0x10 6525 JAE LBB4_3 6526 XORL AX, AX 6527 JMP LBB4_6 6528 6529 LBB4_3: 6530 MOVQ SI, AX 6531 ANDQ $-16, AX 6532 VBROADCASTSD X0, Y1 6533 LEAQ 96(DI), CX 6534 XORL DX, DX 6535 6536 LBB4_4: 6537 VCMPPD $0x01, -96(CX)(DX*8), Y1, Y2 6538 VCMPPD $0x01, -64(CX)(DX*8), Y1, Y3 6539 VCMPPD $0x01, -32(CX)(DX*8), Y1, Y4 6540 VCMPPD $0x01, (CX)(DX*8), Y1, Y5 6541 VMASKMOVPD Y1, Y2, -96(CX)(DX*8) 6542 VMASKMOVPD Y1, Y3, -64(CX)(DX*8) 6543 VMASKMOVPD Y1, Y4, -32(CX)(DX*8) 6544 VMASKMOVPD Y1, Y5, (CX)(DX*8) 6545 ADDQ $0x10, DX 6546 CMPQ AX, DX 6547 JNE LBB4_4 6548 CMPQ AX, SI 6549 JNE LBB4_6 6550 6551 LBB4_9: 6552 VZEROUPPER 6553 RET 6554 6555 LBB4_8: 6556 ADDQ $0x01, AX 6557 CMPQ SI, AX 6558 JE LBB4_9 6559 6560 LBB4_6: 6561 VUCOMISD (DI)(AX*8), X0 6562 JAE LBB4_8 6563 VMOVSD X0, (DI)(AX*8) 6564 JMP LBB4_8 6565 6566 // 
func MinimumNumber_AVX2_F32(x []float32, a float32) 6567 // Requires: AVX, AVX2, SSE 6568 TEXT ·MinimumNumber_AVX2_F32(SB), NOSPLIT, $0-28 6569 MOVQ x_base+0(FP), DI 6570 MOVSS a+24(FP), X0 6571 MOVQ x_len+8(FP), SI 6572 TESTQ SI, SI 6573 JE LBB5_9 6574 CMPQ SI, $0x20 6575 JAE LBB5_3 6576 XORL AX, AX 6577 JMP LBB5_6 6578 6579 LBB5_3: 6580 MOVQ SI, AX 6581 ANDQ $-32, AX 6582 VBROADCASTSS X0, Y1 6583 LEAQ 96(DI), CX 6584 XORL DX, DX 6585 6586 LBB5_4: 6587 VCMPPS $0x01, -96(CX)(DX*4), Y1, Y2 6588 VCMPPS $0x01, -64(CX)(DX*4), Y1, Y3 6589 VCMPPS $0x01, -32(CX)(DX*4), Y1, Y4 6590 VCMPPS $0x01, (CX)(DX*4), Y1, Y5 6591 VMASKMOVPS Y1, Y2, -96(CX)(DX*4) 6592 VMASKMOVPS Y1, Y3, -64(CX)(DX*4) 6593 VMASKMOVPS Y1, Y4, -32(CX)(DX*4) 6594 VMASKMOVPS Y1, Y5, (CX)(DX*4) 6595 ADDQ $0x20, DX 6596 CMPQ AX, DX 6597 JNE LBB5_4 6598 CMPQ AX, SI 6599 JNE LBB5_6 6600 6601 LBB5_9: 6602 VZEROUPPER 6603 RET 6604 6605 LBB5_8: 6606 ADDQ $0x01, AX 6607 CMPQ SI, AX 6608 JE LBB5_9 6609 6610 LBB5_6: 6611 VUCOMISS (DI)(AX*4), X0 6612 JAE LBB5_8 6613 VMOVSS X0, (DI)(AX*4) 6614 JMP LBB5_8 6615 6616 DATA dataMaxF64<>+0(SB)/8, $0xffefffffffffffff 6617 GLOBL dataMaxF64<>(SB), RODATA|NOPTR, $8 6618 6619 // func Max_AVX2_F64(x []float64) float64 6620 // Requires: AVX, SSE2 6621 TEXT ·Max_AVX2_F64(SB), NOSPLIT, $0-32 6622 MOVQ x_base+0(FP), DI 6623 MOVQ x_len+8(FP), SI 6624 TESTQ SI, SI 6625 JE empty 6626 CMPQ SI, $0x10 6627 JAE loop 6628 VMOVSD dataMaxF64<>+0(SB), X0 6629 XORL AX, AX 6630 JMP collect 6631 6632 empty: 6633 VMOVSD dataMaxF64<>+0(SB), X0 6634 MOVSD X0, ret+24(FP) 6635 RET 6636 6637 loop: 6638 MOVQ SI, AX 6639 ANDQ $-16, AX 6640 LEAQ -16(AX), CX 6641 MOVQ CX, R8 6642 SHRQ $0x04, R8 6643 ADDQ $0x01, R8 6644 TESTQ CX, CX 6645 JE setmin 6646 MOVQ R8, CX 6647 ANDQ $-2, CX 6648 VBROADCASTSD dataMaxF64<>+0(SB), Y0 6649 XORL DX, DX 6650 VMOVAPD Y0, Y1 6651 VMOVAPD Y0, Y2 6652 VMOVAPD Y0, Y3 6653 6654 body: 6655 VMAXPD (DI)(DX*8), Y0, Y0 6656 VMAXPD 32(DI)(DX*8), Y1, Y1 6657 VMAXPD 64(DI)(DX*8), Y2, Y2 6658 VMAXPD 96(DI)(DX*8), Y3, Y3 6659 VMAXPD 128(DI)(DX*8), Y0, Y0 6660 VMAXPD 160(DI)(DX*8), Y1, Y1 6661 VMAXPD 192(DI)(DX*8), Y2, Y2 6662 VMAXPD 224(DI)(DX*8), Y3, Y3 6663 ADDQ $+32, DX 6664 ADDQ $-2, CX 6665 JNE body 6666 TESTB $0x01, R8 6667 JE combinevectors 6668 6669 tail: 6670 VMAXPD (DI)(DX*8), Y0, Y0 6671 VMAXPD 32(DI)(DX*8), Y1, Y1 6672 VMAXPD 64(DI)(DX*8), Y1, Y1 6673 VMAXPD 96(DI)(DX*8), Y1, Y1 6674 6675 combinevectors: 6676 VMAXPD Y3, Y0, Y0 6677 VMAXPD Y2, Y1, Y1 6678 VMAXPD Y0, Y1, Y0 6679 VEXTRACTF128 $0x01, Y0, X1 6680 VMAXPD X1, X0, X0 6681 VPERMILPD $0x01, X0, X1 6682 VMAXSD X1, X0, X0 6683 CMPQ AX, SI 6684 JE return 6685 6686 collect: 6687 VMAXSD (DI)(AX*8), X0, X0 6688 ADDQ $0x01, AX 6689 CMPQ SI, AX 6690 JNE collect 6691 6692 return: 6693 VZEROUPPER 6694 MOVSD X0, ret+24(FP) 6695 RET 6696 6697 setmin: 6698 VBROADCASTSD dataMaxF64<>+0(SB), Y0 6699 XORL DX, DX 6700 VMOVAPD Y0, Y1 6701 VMOVAPD Y0, Y2 6702 VMOVAPD Y0, Y3 6703 TESTB $0x01, R8 6704 JNE tail 6705 JMP combinevectors 6706 6707 DATA dataMaxF32<>+0(SB)/4, $0xff7fffff 6708 GLOBL dataMaxF32<>(SB), RODATA|NOPTR, $4 6709 6710 // func Max_AVX2_F32(x []float32) float32 6711 // Requires: AVX, SSE 6712 TEXT ·Max_AVX2_F32(SB), NOSPLIT, $0-28 6713 MOVQ x_base+0(FP), DI 6714 MOVQ x_len+8(FP), SI 6715 TESTQ SI, SI 6716 JE empty 6717 CMPQ SI, $0x20 6718 JAE loop 6719 VMOVSS dataMaxF32<>+0(SB), X0 6720 XORL AX, AX 6721 JMP collect 6722 6723 empty: 6724 VMOVSS dataMaxF32<>+0(SB), X0 6725 MOVSS X0, ret+24(FP) 6726 RET 6727 6728 loop: 6729 MOVQ SI, AX 6730 ANDQ $-32, 
AX 6731 LEAQ -32(AX), CX 6732 MOVQ CX, R8 6733 SHRQ $0x05, R8 6734 ADDQ $0x01, R8 6735 TESTQ CX, CX 6736 JE setmin 6737 MOVQ R8, CX 6738 ANDQ $-2, CX 6739 VBROADCASTSS dataMaxF32<>+0(SB), Y0 6740 XORL DX, DX 6741 VMOVAPD Y0, Y1 6742 VMOVAPD Y0, Y2 6743 VMOVAPD Y0, Y3 6744 6745 body: 6746 VMAXPS (DI)(DX*4), Y0, Y0 6747 VMAXPS 32(DI)(DX*4), Y1, Y1 6748 VMAXPS 64(DI)(DX*4), Y2, Y2 6749 VMAXPS 96(DI)(DX*4), Y3, Y3 6750 VMAXPS 128(DI)(DX*4), Y0, Y0 6751 VMAXPS 160(DI)(DX*4), Y1, Y1 6752 VMAXPS 192(DI)(DX*4), Y2, Y2 6753 VMAXPS 224(DI)(DX*4), Y3, Y3 6754 ADDQ $+64, DX 6755 ADDQ $-2, CX 6756 JNE body 6757 TESTB $0x01, R8 6758 JE combinevectors 6759 6760 tail: 6761 VMAXPS (DI)(DX*4), Y0, Y0 6762 VMAXPS 32(DI)(DX*4), Y1, Y1 6763 VMAXPS 64(DI)(DX*4), Y1, Y1 6764 VMAXPS 96(DI)(DX*4), Y1, Y1 6765 6766 combinevectors: 6767 VMAXPS Y3, Y0, Y0 6768 VMAXPS Y2, Y1, Y1 6769 VMAXPS Y0, Y1, Y0 6770 VEXTRACTF128 $0x01, Y0, X1 6771 VMAXPS X1, X0, X0 6772 VPERMILPD $0x01, X0, X1 6773 VMAXPS X1, X0, X0 6774 VMOVSHDUP X0, X1 6775 VMAXSS X1, X0, X0 6776 CMPQ AX, SI 6777 JE return 6778 6779 collect: 6780 VMAXSS (DI)(AX*4), X0, X0 6781 ADDQ $0x01, AX 6782 CMPQ SI, AX 6783 JNE collect 6784 6785 return: 6786 VZEROUPPER 6787 MOVSS X0, ret+24(FP) 6788 RET 6789 6790 setmin: 6791 VBROADCASTSS dataMaxF32<>+0(SB), Y0 6792 XORL DX, DX 6793 VMOVAPS Y0, Y1 6794 VMOVAPS Y0, Y2 6795 VMOVAPS Y0, Y3 6796 TESTB $0x01, R8 6797 JNE tail 6798 JMP combinevectors 6799 6800 // func Maximum_AVX2_F64(x []float64, y []float64) 6801 // Requires: AVX 6802 TEXT ·Maximum_AVX2_F64(SB), NOSPLIT, $0-48 6803 MOVQ x_base+0(FP), DI 6804 MOVQ y_base+24(FP), SI 6805 MOVQ x_len+8(FP), DX 6806 TESTQ DX, DX 6807 JE return 6808 CMPQ DX, $0x10 6809 JAE loop 6810 XORL AX, AX 6811 JMP tailbody 6812 6813 loop: 6814 MOVQ DX, AX 6815 ANDQ $-16, AX 6816 LEAQ 96(DI), R8 6817 XORL CX, CX 6818 6819 body: 6820 VMOVUPD (SI)(CX*8), Y0 6821 VMOVUPD 32(SI)(CX*8), Y1 6822 VMOVUPD 64(SI)(CX*8), Y2 6823 VMOVUPD 96(SI)(CX*8), Y3 6824 VMOVUPD -96(R8)(CX*8), Y4 6825 VMOVUPD -64(R8)(CX*8), Y5 6826 VMOVUPD -32(R8)(CX*8), Y6 6827 VMOVUPD (R8)(CX*8), Y7 6828 VCMPPD $0x01, Y0, Y4, Y4 6829 VMASKMOVPD Y0, Y4, -96(R8)(CX*8) 6830 VCMPPD $0x01, Y1, Y5, Y0 6831 VMASKMOVPD Y1, Y0, -64(R8)(CX*8) 6832 VCMPPD $0x01, Y2, Y6, Y0 6833 VMASKMOVPD Y2, Y0, -32(R8)(CX*8) 6834 VCMPPD $0x01, Y3, Y7, Y0 6835 VMASKMOVPD Y3, Y0, (R8)(CX*8) 6836 ADDQ $0x10, CX 6837 CMPQ CX, AX 6838 JNE body 6839 CMPQ DX, AX 6840 JNE tailbody 6841 6842 return: 6843 VZEROUPPER 6844 RET 6845 6846 tail: 6847 ADDQ $0x01, AX 6848 CMPQ AX, DX 6849 JE return 6850 6851 tailbody: 6852 VMOVSD (SI)(AX*8), X0 6853 VUCOMISD (DI)(AX*8), X0 6854 JBE tail 6855 VMOVSD X0, (DI)(AX*8) 6856 JMP tail 6857 6858 // func Maximum_AVX2_F32(x []float32, y []float32) 6859 // Requires: AVX 6860 TEXT ·Maximum_AVX2_F32(SB), NOSPLIT, $0-48 6861 MOVQ x_base+0(FP), DI 6862 MOVQ y_base+24(FP), SI 6863 MOVQ x_len+8(FP), DX 6864 TESTQ DX, DX 6865 JE return 6866 CMPQ DX, $0x20 6867 JAE loop 6868 XORL AX, AX 6869 JMP tailbody 6870 6871 loop: 6872 MOVQ DX, AX 6873 ANDQ $-32, AX 6874 LEAQ 96(DI), R8 6875 XORL CX, CX 6876 6877 body: 6878 VMOVUPS (SI)(CX*4), Y0 6879 VMOVUPS 32(SI)(CX*4), Y1 6880 VMOVUPS 64(SI)(CX*4), Y2 6881 VMOVUPS 96(SI)(CX*4), Y3 6882 VMOVUPS -96(R8)(CX*4), Y4 6883 VMOVUPS -64(R8)(CX*4), Y5 6884 VMOVUPS -32(R8)(CX*4), Y6 6885 VMOVUPS (R8)(CX*4), Y7 6886 VCMPPS $0x01, Y0, Y4, Y4 6887 VMASKMOVPS Y0, Y4, -96(R8)(CX*4) 6888 VCMPPS $0x01, Y1, Y5, Y0 6889 VMASKMOVPS Y1, Y0, -64(R8)(CX*4) 6890 VCMPPS $0x01, Y2, Y6, Y0 6891 VMASKMOVPS Y2, Y0, 
-32(R8)(CX*4) 6892 VCMPPS $0x01, Y3, Y7, Y0 6893 VMASKMOVPS Y3, Y0, (R8)(CX*4) 6894 ADDQ $0x20, CX 6895 CMPQ CX, AX 6896 JNE body 6897 CMPQ DX, AX 6898 JNE tailbody 6899 6900 return: 6901 VZEROUPPER 6902 RET 6903 6904 tail: 6905 ADDQ $0x01, AX 6906 CMPQ AX, DX 6907 JE return 6908 6909 tailbody: 6910 VMOVSS (SI)(AX*4), X0 6911 VUCOMISS (DI)(AX*4), X0 6912 JBE tail 6913 VMOVSS X0, (DI)(AX*4) 6914 JMP tail 6915 6916 // func MaximumNumber_AVX2_F64(x []float64, a float64) 6917 // Requires: AVX, AVX2, SSE2 6918 TEXT ·MaximumNumber_AVX2_F64(SB), NOSPLIT, $0-32 6919 MOVQ x_base+0(FP), DI 6920 MOVSD a+24(FP), X0 6921 MOVQ x_len+8(FP), SI 6922 TESTQ SI, SI 6923 JE return 6924 CMPQ SI, $0x10 6925 JAE loop 6926 XORL AX, AX 6927 JMP tailbody 6928 6929 loop: 6930 MOVQ SI, AX 6931 ANDQ $-16, AX 6932 VBROADCASTSD X0, Y1 6933 LEAQ 96(DI), CX 6934 XORL DX, DX 6935 6936 body: 6937 VMOVUPD -96(CX)(DX*8), Y2 6938 VMOVUPD -64(CX)(DX*8), Y3 6939 VMOVUPD -32(CX)(DX*8), Y4 6940 VMOVUPD (CX)(DX*8), Y5 6941 VCMPPD $0x01, Y1, Y2, Y2 6942 VMASKMOVPD Y1, Y2, -96(CX)(DX*8) 6943 VCMPPD $0x01, Y1, Y3, Y2 6944 VMASKMOVPD Y1, Y2, -64(CX)(DX*8) 6945 VCMPPD $0x01, Y1, Y4, Y2 6946 VMASKMOVPD Y1, Y2, -32(CX)(DX*8) 6947 VCMPPD $0x01, Y1, Y5, Y2 6948 VMASKMOVPD Y1, Y2, (CX)(DX*8) 6949 ADDQ $0x10, DX 6950 CMPQ AX, DX 6951 JNE body 6952 CMPQ AX, SI 6953 JNE tailbody 6954 6955 return: 6956 VZEROUPPER 6957 RET 6958 6959 tail: 6960 ADDQ $0x01, AX 6961 CMPQ SI, AX 6962 JE return 6963 6964 tailbody: 6965 VUCOMISD (DI)(AX*8), X0 6966 JBE tail 6967 VMOVSD X0, (DI)(AX*8) 6968 JMP tail 6969 6970 // func MaximumNumber_AVX2_F32(x []float32, a float32) 6971 // Requires: AVX, AVX2, SSE 6972 TEXT ·MaximumNumber_AVX2_F32(SB), NOSPLIT, $0-28 6973 MOVQ x_base+0(FP), DI 6974 MOVSS a+24(FP), X0 6975 MOVQ x_len+8(FP), SI 6976 TESTQ SI, SI 6977 JE return 6978 CMPQ SI, $0x20 6979 JAE loop 6980 XORL AX, AX 6981 JMP tailbody 6982 6983 loop: 6984 MOVQ SI, AX 6985 ANDQ $-32, AX 6986 VBROADCASTSS X0, Y1 6987 LEAQ 96(DI), CX 6988 XORL DX, DX 6989 6990 body: 6991 VMOVUPS -96(CX)(DX*4), Y2 6992 VMOVUPS -64(CX)(DX*4), Y3 6993 VMOVUPS -32(CX)(DX*4), Y4 6994 VMOVUPS (CX)(DX*4), Y5 6995 VCMPPS $0x01, Y1, Y2, Y2 6996 VMASKMOVPS Y1, Y2, -96(CX)(DX*4) 6997 VCMPPS $0x01, Y1, Y3, Y2 6998 VMASKMOVPS Y1, Y2, -64(CX)(DX*4) 6999 VCMPPS $0x01, Y1, Y4, Y2 7000 VMASKMOVPS Y1, Y2, -32(CX)(DX*4) 7001 VCMPPS $0x01, Y1, Y5, Y2 7002 VMASKMOVPS Y1, Y2, (CX)(DX*4) 7003 ADDQ $0x20, DX 7004 CMPQ AX, DX 7005 JNE body 7006 CMPQ AX, SI 7007 JNE tailbody 7008 7009 return: 7010 VZEROUPPER 7011 RET 7012 7013 tail: 7014 ADDQ $0x01, AX 7015 CMPQ SI, AX 7016 JE return 7017 7018 tailbody: 7019 VUCOMISS (DI)(AX*4), X0 7020 JBE tail 7021 VMOVSS X0, (DI)(AX*4) 7022 JMP tail 7023 7024 // func Find_AVX2_F64(x []float64, a float64) int 7025 // Requires: AVX, AVX2, SSE2 7026 TEXT ·Find_AVX2_F64(SB), NOSPLIT, $0-40 7027 MOVQ x_base+0(FP), DI 7028 MOVSD a+24(FP), X0 7029 MOVQ x_len+8(FP), SI 7030 MOVQ SI, CX 7031 ANDQ $-8, CX 7032 JE tail 7033 VPBROADCASTQ X0, Y1 7034 XORL AX, AX 7035 7036 loop: 7037 VPCMPEQQ (DI)(AX*8), Y1, Y2 7038 VPCMPEQQ 32(DI)(AX*8), Y1, Y3 7039 VPOR Y2, Y3, Y4 7040 VPTEST Y4, Y4 7041 JNE mask 7042 ADDQ $0x08, AX 7043 CMPQ AX, CX 7044 JB loop 7045 CMPQ AX, SI 7046 JB tailbody 7047 7048 return: 7049 VZEROUPPER 7050 MOVQ AX, ret+32(FP) 7051 RET 7052 7053 tail: 7054 XORL AX, AX 7055 CMPQ AX, SI 7056 JAE return 7057 7058 tailbody: 7059 VUCOMISD (DI)(AX*8), X0 7060 JE return 7061 ADDQ $0x01, AX 7062 CMPQ SI, AX 7063 JNE tailbody 7064 MOVQ SI, AX 7065 VZEROUPPER 7066 MOVQ AX, ret+32(FP) 
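	// Annotation: the scalar tail of Find fell through without a match, so AX was set to
	// len(x) above and that value is stored as the "not found" return index.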
7067 RET 7068 7069 mask: 7070 VMOVMSKPD Y3, CX 7071 SHLL $0x04, CX 7072 VMOVMSKPD Y2, DX 7073 ORL CX, DX 7074 BSFL DX, CX 7075 ADDQ CX, AX 7076 VZEROUPPER 7077 MOVQ AX, ret+32(FP) 7078 RET 7079 7080 // func Find_AVX2_F32(x []float32, a float32) int 7081 // Requires: AVX, AVX2, SSE 7082 TEXT ·Find_AVX2_F32(SB), NOSPLIT, $0-40 7083 MOVQ x_base+0(FP), DI 7084 MOVSS a+24(FP), X0 7085 MOVQ x_len+8(FP), SI 7086 MOVQ SI, CX 7087 ANDQ $-16, CX 7088 JE tail 7089 VPBROADCASTD X0, Y1 7090 XORL AX, AX 7091 7092 loop: 7093 VPCMPEQD (DI)(AX*4), Y1, Y2 7094 VPCMPEQD 32(DI)(AX*4), Y1, Y3 7095 VPOR Y2, Y3, Y4 7096 VPTEST Y4, Y4 7097 JNE mask 7098 ADDQ $0x10, AX 7099 CMPQ AX, CX 7100 JB loop 7101 CMPQ AX, SI 7102 JB tailbody 7103 7104 return: 7105 VZEROUPPER 7106 MOVQ AX, ret+32(FP) 7107 RET 7108 7109 tail: 7110 XORL AX, AX 7111 CMPQ AX, SI 7112 JAE return 7113 7114 tailbody: 7115 VUCOMISS (DI)(AX*4), X0 7116 JE return 7117 ADDQ $+1, AX 7118 CMPQ SI, AX 7119 JNE tailbody 7120 MOVQ SI, AX 7121 VZEROUPPER 7122 MOVQ AX, ret+32(FP) 7123 RET 7124 7125 mask: 7126 VMOVMSKPS Y3, CX 7127 SHLL $0x08, CX 7128 VMOVMSKPS Y2, DX 7129 ORL CX, DX 7130 BSFL DX, CX 7131 ADDQ CX, AX 7132 VZEROUPPER 7133 MOVQ AX, ret+32(FP) 7134 RET 7135 7136 DATA dataLtF64<>+0(SB)/1, $0x01 7137 DATA dataLtF64<>+1(SB)/1, $0x01 7138 DATA dataLtF64<>+2(SB)/1, $0x01 7139 DATA dataLtF64<>+3(SB)/1, $0x01 7140 DATA dataLtF64<>+4(SB)/1, $0x00 7141 DATA dataLtF64<>+5(SB)/1, $0x00 7142 DATA dataLtF64<>+6(SB)/1, $0x00 7143 DATA dataLtF64<>+7(SB)/1, $0x00 7144 DATA dataLtF64<>+8(SB)/1, $0x00 7145 DATA dataLtF64<>+9(SB)/1, $0x00 7146 DATA dataLtF64<>+10(SB)/1, $0x00 7147 DATA dataLtF64<>+11(SB)/1, $0x00 7148 DATA dataLtF64<>+12(SB)/1, $0x00 7149 DATA dataLtF64<>+13(SB)/1, $0x00 7150 DATA dataLtF64<>+14(SB)/1, $0x00 7151 DATA dataLtF64<>+15(SB)/1, $0x00 7152 GLOBL dataLtF64<>(SB), RODATA|NOPTR, $16 7153 7154 // func Lt_AVX2_F64(x []bool, y []float64, z []float64) 7155 // Requires: AVX, AVX2 7156 TEXT ·Lt_AVX2_F64(SB), NOSPLIT, $0-72 7157 MOVQ x_base+0(FP), DI 7158 MOVQ y_base+24(FP), SI 7159 MOVQ z_base+48(FP), DX 7160 MOVQ x_len+8(FP), CX 7161 TESTQ CX, CX 7162 JE LBB0_7 7163 CMPQ CX, $0x10 7164 JAE LBB0_3 7165 XORL R8, R8 7166 JMP LBB0_6 7167 7168 LBB0_3: 7169 MOVQ CX, R8 7170 ANDQ $-16, R8 7171 XORL AX, AX 7172 VMOVDQU dataLtF64<>+0(SB), X0 7173 7174 LBB0_4: 7175 VMOVUPD (SI)(AX*8), Y1 7176 VMOVUPD 32(SI)(AX*8), Y2 7177 VMOVUPD 64(SI)(AX*8), Y3 7178 VMOVUPD 96(SI)(AX*8), Y4 7179 VCMPPD $0x01, (DX)(AX*8), Y1, Y1 7180 VEXTRACTF128 $0x01, Y1, X5 7181 VPACKSSDW X5, X1, X1 7182 VPACKSSDW X1, X1, X1 7183 VPACKSSWB X1, X1, X1 7184 VCMPPD $0x01, 32(DX)(AX*8), Y2, Y2 7185 VPAND X0, X1, X1 7186 VEXTRACTF128 $0x01, Y2, X5 7187 VPACKSSDW X5, X2, X2 7188 VPACKSSDW X2, X2, X2 7189 VPACKSSWB X2, X2, X2 7190 VPAND X0, X2, X2 7191 VCMPPD $0x01, 64(DX)(AX*8), Y3, Y3 7192 VPUNPCKLDQ X2, X1, X1 7193 VEXTRACTF128 $0x01, Y3, X2 7194 VPACKSSDW X2, X3, X2 7195 VPACKSSDW X2, X2, X2 7196 VPACKSSWB X2, X2, X2 7197 VPAND X0, X2, X2 7198 VCMPPD $0x01, 96(DX)(AX*8), Y4, Y3 7199 VEXTRACTF128 $0x01, Y3, X4 7200 VPACKSSDW X4, X3, X3 7201 VPACKSSDW X3, X3, X3 7202 VPACKSSWB X3, X3, X3 7203 VPAND X0, X3, X3 7204 VPBROADCASTD X3, X3 7205 VPBROADCASTD X2, X2 7206 VPUNPCKLDQ X3, X2, X2 7207 VPBLENDD $0x0c, X2, X1, X1 7208 VMOVDQU X1, (DI)(AX*1) 7209 ADDQ $0x10, AX 7210 CMPQ R8, AX 7211 JNE LBB0_4 7212 CMPQ R8, CX 7213 JE LBB0_7 7214 7215 LBB0_6: 7216 VMOVSD (SI)(R8*8), X0 7217 VUCOMISD (DX)(R8*8), X0 7218 SETCS (DI)(R8*1) 7219 ADDQ $0x01, R8 7220 CMPQ CX, R8 7221 JNE LBB0_6 7222 7223 
LBB0_7: 7224 VZEROUPPER 7225 RET 7226 7227 DATA dataLtF32<>+0(SB)/1, $0x01 7228 DATA dataLtF32<>+1(SB)/1, $0x01 7229 DATA dataLtF32<>+2(SB)/1, $0x01 7230 DATA dataLtF32<>+3(SB)/1, $0x01 7231 DATA dataLtF32<>+4(SB)/1, $0x01 7232 DATA dataLtF32<>+5(SB)/1, $0x01 7233 DATA dataLtF32<>+6(SB)/1, $0x01 7234 DATA dataLtF32<>+7(SB)/1, $0x01 7235 DATA dataLtF32<>+8(SB)/1, $0x00 7236 DATA dataLtF32<>+9(SB)/1, $0x00 7237 DATA dataLtF32<>+10(SB)/1, $0x00 7238 DATA dataLtF32<>+11(SB)/1, $0x00 7239 DATA dataLtF32<>+12(SB)/1, $0x00 7240 DATA dataLtF32<>+13(SB)/1, $0x00 7241 DATA dataLtF32<>+14(SB)/1, $0x00 7242 DATA dataLtF32<>+15(SB)/1, $0x00 7243 GLOBL dataLtF32<>(SB), RODATA|NOPTR, $16 7244 7245 // func Lt_AVX2_F32(x []bool, y []float32, z []float32) 7246 // Requires: AVX, AVX2 7247 TEXT ·Lt_AVX2_F32(SB), NOSPLIT, $0-72 7248 MOVQ x_base+0(FP), DI 7249 MOVQ y_base+24(FP), SI 7250 MOVQ z_base+48(FP), DX 7251 MOVQ x_len+8(FP), CX 7252 TESTQ CX, CX 7253 JE LBB1_7 7254 CMPQ CX, $0x20 7255 JAE LBB1_3 7256 XORL R8, R8 7257 JMP LBB1_6 7258 7259 LBB1_3: 7260 MOVQ CX, R8 7261 ANDQ $-32, R8 7262 XORL AX, AX 7263 VMOVDQU dataLtF32<>+0(SB), X0 7264 7265 LBB1_4: 7266 VMOVUPS (SI)(AX*4), Y1 7267 VMOVUPS 32(SI)(AX*4), Y2 7268 VMOVUPS 64(SI)(AX*4), Y3 7269 VMOVUPS 96(SI)(AX*4), Y4 7270 VCMPPS $0x01, (DX)(AX*4), Y1, Y1 7271 VEXTRACTF128 $0x01, Y1, X5 7272 VPACKSSDW X5, X1, X1 7273 VPACKSSWB X1, X1, X1 7274 VCMPPS $0x01, 32(DX)(AX*4), Y2, Y2 7275 VPAND X0, X1, X1 7276 VEXTRACTF128 $0x01, Y2, X5 7277 VPACKSSDW X5, X2, X2 7278 VPACKSSWB X2, X2, X2 7279 VPAND X0, X2, X2 7280 VCMPPS $0x01, 64(DX)(AX*4), Y3, Y3 7281 VEXTRACTF128 $0x01, Y3, X5 7282 VPACKSSDW X5, X3, X3 7283 VPACKSSWB X3, X3, X3 7284 VCMPPS $0x01, 96(DX)(AX*4), Y4, Y4 7285 VPAND X0, X3, X3 7286 VEXTRACTF128 $0x01, Y4, X5 7287 VPACKSSDW X5, X4, X4 7288 VPACKSSWB X4, X4, X4 7289 VPAND X0, X4, X4 7290 VINSERTI128 $0x01, X4, Y3, Y3 7291 VINSERTI128 $0x01, X2, Y1, Y1 7292 VPUNPCKLQDQ Y3, Y1, Y1 7293 VPERMQ $0xd8, Y1, Y1 7294 VMOVDQU Y1, (DI)(AX*1) 7295 ADDQ $0x20, AX 7296 CMPQ R8, AX 7297 JNE LBB1_4 7298 CMPQ R8, CX 7299 JE LBB1_7 7300 7301 LBB1_6: 7302 VMOVSS (SI)(R8*4), X0 7303 VUCOMISS (DX)(R8*4), X0 7304 SETCS (DI)(R8*1) 7305 ADDQ $0x01, R8 7306 CMPQ CX, R8 7307 JNE LBB1_6 7308 7309 LBB1_7: 7310 VZEROUPPER 7311 RET 7312 7313 DATA dataLteF64<>+0(SB)/1, $0x01 7314 DATA dataLteF64<>+1(SB)/1, $0x01 7315 DATA dataLteF64<>+2(SB)/1, $0x01 7316 DATA dataLteF64<>+3(SB)/1, $0x01 7317 DATA dataLteF64<>+4(SB)/1, $0x00 7318 DATA dataLteF64<>+5(SB)/1, $0x00 7319 DATA dataLteF64<>+6(SB)/1, $0x00 7320 DATA dataLteF64<>+7(SB)/1, $0x00 7321 DATA dataLteF64<>+8(SB)/1, $0x00 7322 DATA dataLteF64<>+9(SB)/1, $0x00 7323 DATA dataLteF64<>+10(SB)/1, $0x00 7324 DATA dataLteF64<>+11(SB)/1, $0x00 7325 DATA dataLteF64<>+12(SB)/1, $0x00 7326 DATA dataLteF64<>+13(SB)/1, $0x00 7327 DATA dataLteF64<>+14(SB)/1, $0x00 7328 DATA dataLteF64<>+15(SB)/1, $0x00 7329 GLOBL dataLteF64<>(SB), RODATA|NOPTR, $16 7330 7331 // func Lte_AVX2_F64(x []bool, y []float64, z []float64) 7332 // Requires: AVX, AVX2 7333 TEXT ·Lte_AVX2_F64(SB), NOSPLIT, $0-72 7334 MOVQ x_base+0(FP), DI 7335 MOVQ y_base+24(FP), SI 7336 MOVQ z_base+48(FP), DX 7337 MOVQ x_len+8(FP), CX 7338 TESTQ CX, CX 7339 JE LBB2_7 7340 CMPQ CX, $0x10 7341 JAE LBB2_3 7342 XORL R8, R8 7343 JMP LBB2_6 7344 7345 LBB2_3: 7346 MOVQ CX, R8 7347 ANDQ $-16, R8 7348 XORL AX, AX 7349 VMOVDQU dataLteF64<>+0(SB), X0 7350 7351 LBB2_4: 7352 VMOVUPD (SI)(AX*8), Y1 7353 VMOVUPD 32(SI)(AX*8), Y2 7354 VMOVUPD 64(SI)(AX*8), Y3 7355 VMOVUPD 96(SI)(AX*8), Y4 7356 VCMPPD 
$0x02, (DX)(AX*8), Y1, Y1 7357 VEXTRACTF128 $0x01, Y1, X5 7358 VPACKSSDW X5, X1, X1 7359 VPACKSSDW X1, X1, X1 7360 VPACKSSWB X1, X1, X1 7361 VCMPPD $0x02, 32(DX)(AX*8), Y2, Y2 7362 VPAND X0, X1, X1 7363 VEXTRACTF128 $0x01, Y2, X5 7364 VPACKSSDW X5, X2, X2 7365 VPACKSSDW X2, X2, X2 7366 VPACKSSWB X2, X2, X2 7367 VPAND X0, X2, X2 7368 VCMPPD $0x02, 64(DX)(AX*8), Y3, Y3 7369 VPUNPCKLDQ X2, X1, X1 7370 VEXTRACTF128 $0x01, Y3, X2 7371 VPACKSSDW X2, X3, X2 7372 VPACKSSDW X2, X2, X2 7373 VPACKSSWB X2, X2, X2 7374 VPAND X0, X2, X2 7375 VCMPPD $0x02, 96(DX)(AX*8), Y4, Y3 7376 VEXTRACTF128 $0x01, Y3, X4 7377 VPACKSSDW X4, X3, X3 7378 VPACKSSDW X3, X3, X3 7379 VPACKSSWB X3, X3, X3 7380 VPAND X0, X3, X3 7381 VPBROADCASTD X3, X3 7382 VPBROADCASTD X2, X2 7383 VPUNPCKLDQ X3, X2, X2 7384 VPBLENDD $0x0c, X2, X1, X1 7385 VMOVDQU X1, (DI)(AX*1) 7386 ADDQ $0x10, AX 7387 CMPQ R8, AX 7388 JNE LBB2_4 7389 CMPQ R8, CX 7390 JE LBB2_7 7391 7392 LBB2_6: 7393 VMOVSD (SI)(R8*8), X0 7394 VUCOMISD (DX)(R8*8), X0 7395 SETLS (DI)(R8*1) 7396 ADDQ $0x01, R8 7397 CMPQ CX, R8 7398 JNE LBB2_6 7399 7400 LBB2_7: 7401 VZEROUPPER 7402 RET 7403 7404 DATA dataLteF32<>+0(SB)/1, $0x01 7405 DATA dataLteF32<>+1(SB)/1, $0x01 7406 DATA dataLteF32<>+2(SB)/1, $0x01 7407 DATA dataLteF32<>+3(SB)/1, $0x01 7408 DATA dataLteF32<>+4(SB)/1, $0x01 7409 DATA dataLteF32<>+5(SB)/1, $0x01 7410 DATA dataLteF32<>+6(SB)/1, $0x01 7411 DATA dataLteF32<>+7(SB)/1, $0x01 7412 DATA dataLteF32<>+8(SB)/1, $0x00 7413 DATA dataLteF32<>+9(SB)/1, $0x00 7414 DATA dataLteF32<>+10(SB)/1, $0x00 7415 DATA dataLteF32<>+11(SB)/1, $0x00 7416 DATA dataLteF32<>+12(SB)/1, $0x00 7417 DATA dataLteF32<>+13(SB)/1, $0x00 7418 DATA dataLteF32<>+14(SB)/1, $0x00 7419 DATA dataLteF32<>+15(SB)/1, $0x00 7420 GLOBL dataLteF32<>(SB), RODATA|NOPTR, $16 7421 7422 // func Lte_AVX2_F32(x []bool, y []float32, z []float32) 7423 // Requires: AVX, AVX2 7424 TEXT ·Lte_AVX2_F32(SB), NOSPLIT, $0-72 7425 MOVQ x_base+0(FP), DI 7426 MOVQ y_base+24(FP), SI 7427 MOVQ z_base+48(FP), DX 7428 MOVQ x_len+8(FP), CX 7429 TESTQ CX, CX 7430 JE LBB3_7 7431 CMPQ CX, $0x20 7432 JAE LBB3_3 7433 XORL R8, R8 7434 JMP LBB3_6 7435 7436 LBB3_3: 7437 MOVQ CX, R8 7438 ANDQ $-32, R8 7439 XORL AX, AX 7440 VMOVDQU dataLteF32<>+0(SB), X0 7441 7442 LBB3_4: 7443 VMOVUPS (SI)(AX*4), Y1 7444 VMOVUPS 32(SI)(AX*4), Y2 7445 VMOVUPS 64(SI)(AX*4), Y3 7446 VMOVUPS 96(SI)(AX*4), Y4 7447 VCMPPS $0x02, (DX)(AX*4), Y1, Y1 7448 VEXTRACTF128 $0x01, Y1, X5 7449 VPACKSSDW X5, X1, X1 7450 VPACKSSWB X1, X1, X1 7451 VCMPPS $0x02, 32(DX)(AX*4), Y2, Y2 7452 VPAND X0, X1, X1 7453 VEXTRACTF128 $0x01, Y2, X5 7454 VPACKSSDW X5, X2, X2 7455 VPACKSSWB X2, X2, X2 7456 VPAND X0, X2, X2 7457 VCMPPS $0x02, 64(DX)(AX*4), Y3, Y3 7458 VEXTRACTF128 $0x01, Y3, X5 7459 VPACKSSDW X5, X3, X3 7460 VPACKSSWB X3, X3, X3 7461 VCMPPS $0x02, 96(DX)(AX*4), Y4, Y4 7462 VPAND X0, X3, X3 7463 VEXTRACTF128 $0x01, Y4, X5 7464 VPACKSSDW X5, X4, X4 7465 VPACKSSWB X4, X4, X4 7466 VPAND X0, X4, X4 7467 VINSERTI128 $0x01, X4, Y3, Y3 7468 VINSERTI128 $0x01, X2, Y1, Y1 7469 VPUNPCKLQDQ Y3, Y1, Y1 7470 VPERMQ $0xd8, Y1, Y1 7471 VMOVDQU Y1, (DI)(AX*1) 7472 ADDQ $0x20, AX 7473 CMPQ R8, AX 7474 JNE LBB3_4 7475 CMPQ R8, CX 7476 JE LBB3_7 7477 7478 LBB3_6: 7479 VMOVSS (SI)(R8*4), X0 7480 VUCOMISS (DX)(R8*4), X0 7481 SETLS (DI)(R8*1) 7482 ADDQ $0x01, R8 7483 CMPQ CX, R8 7484 JNE LBB3_6 7485 7486 LBB3_7: 7487 VZEROUPPER 7488 RET 7489 7490 DATA dataGtF64<>+0(SB)/1, $0x01 7491 DATA dataGtF64<>+1(SB)/1, $0x01 7492 DATA dataGtF64<>+2(SB)/1, $0x01 7493 DATA dataGtF64<>+3(SB)/1, $0x01 7494 DATA 
dataGtF64<>+4(SB)/1, $0x00 7495 DATA dataGtF64<>+5(SB)/1, $0x00 7496 DATA dataGtF64<>+6(SB)/1, $0x00 7497 DATA dataGtF64<>+7(SB)/1, $0x00 7498 DATA dataGtF64<>+8(SB)/1, $0x00 7499 DATA dataGtF64<>+9(SB)/1, $0x00 7500 DATA dataGtF64<>+10(SB)/1, $0x00 7501 DATA dataGtF64<>+11(SB)/1, $0x00 7502 DATA dataGtF64<>+12(SB)/1, $0x00 7503 DATA dataGtF64<>+13(SB)/1, $0x00 7504 DATA dataGtF64<>+14(SB)/1, $0x00 7505 DATA dataGtF64<>+15(SB)/1, $0x00 7506 GLOBL dataGtF64<>(SB), RODATA|NOPTR, $16 7507 7508 // func Gt_AVX2_F64(x []bool, y []float64, z []float64) 7509 // Requires: AVX, AVX2 7510 TEXT ·Gt_AVX2_F64(SB), NOSPLIT, $0-72 7511 MOVQ x_base+0(FP), DI 7512 MOVQ y_base+24(FP), SI 7513 MOVQ z_base+48(FP), DX 7514 MOVQ x_len+8(FP), CX 7515 TESTQ CX, CX 7516 JE LBB4_7 7517 CMPQ CX, $0x10 7518 JAE LBB4_3 7519 XORL R8, R8 7520 JMP LBB4_6 7521 7522 LBB4_3: 7523 MOVQ CX, R8 7524 ANDQ $-16, R8 7525 XORL AX, AX 7526 VMOVDQU dataGtF64<>+0(SB), X0 7527 7528 LBB4_4: 7529 VMOVUPD (DX)(AX*8), Y1 7530 VMOVUPD 32(DX)(AX*8), Y2 7531 VMOVUPD 64(DX)(AX*8), Y3 7532 VMOVUPD 96(DX)(AX*8), Y4 7533 VCMPPD $0x01, (SI)(AX*8), Y1, Y1 7534 VEXTRACTF128 $0x01, Y1, X5 7535 VPACKSSDW X5, X1, X1 7536 VPACKSSDW X1, X1, X1 7537 VPACKSSWB X1, X1, X1 7538 VCMPPD $0x01, 32(SI)(AX*8), Y2, Y2 7539 VPAND X0, X1, X1 7540 VEXTRACTF128 $0x01, Y2, X5 7541 VPACKSSDW X5, X2, X2 7542 VPACKSSDW X2, X2, X2 7543 VPACKSSWB X2, X2, X2 7544 VPAND X0, X2, X2 7545 VCMPPD $0x01, 64(SI)(AX*8), Y3, Y3 7546 VPUNPCKLDQ X2, X1, X1 7547 VEXTRACTF128 $0x01, Y3, X2 7548 VPACKSSDW X2, X3, X2 7549 VPACKSSDW X2, X2, X2 7550 VPACKSSWB X2, X2, X2 7551 VPAND X0, X2, X2 7552 VCMPPD $0x01, 96(SI)(AX*8), Y4, Y3 7553 VEXTRACTF128 $0x01, Y3, X4 7554 VPACKSSDW X4, X3, X3 7555 VPACKSSDW X3, X3, X3 7556 VPACKSSWB X3, X3, X3 7557 VPAND X0, X3, X3 7558 VPBROADCASTD X3, X3 7559 VPBROADCASTD X2, X2 7560 VPUNPCKLDQ X3, X2, X2 7561 VPBLENDD $0x0c, X2, X1, X1 7562 VMOVDQU X1, (DI)(AX*1) 7563 ADDQ $0x10, AX 7564 CMPQ R8, AX 7565 JNE LBB4_4 7566 CMPQ R8, CX 7567 JE LBB4_7 7568 7569 LBB4_6: 7570 VMOVSD (SI)(R8*8), X0 7571 VUCOMISD (DX)(R8*8), X0 7572 SETHI (DI)(R8*1) 7573 ADDQ $0x01, R8 7574 CMPQ CX, R8 7575 JNE LBB4_6 7576 7577 LBB4_7: 7578 VZEROUPPER 7579 RET 7580 7581 DATA dataGtF32<>+0(SB)/1, $0x01 7582 DATA dataGtF32<>+1(SB)/1, $0x01 7583 DATA dataGtF32<>+2(SB)/1, $0x01 7584 DATA dataGtF32<>+3(SB)/1, $0x01 7585 DATA dataGtF32<>+4(SB)/1, $0x01 7586 DATA dataGtF32<>+5(SB)/1, $0x01 7587 DATA dataGtF32<>+6(SB)/1, $0x01 7588 DATA dataGtF32<>+7(SB)/1, $0x01 7589 DATA dataGtF32<>+8(SB)/1, $0x00 7590 DATA dataGtF32<>+9(SB)/1, $0x00 7591 DATA dataGtF32<>+10(SB)/1, $0x00 7592 DATA dataGtF32<>+11(SB)/1, $0x00 7593 DATA dataGtF32<>+12(SB)/1, $0x00 7594 DATA dataGtF32<>+13(SB)/1, $0x00 7595 DATA dataGtF32<>+14(SB)/1, $0x00 7596 DATA dataGtF32<>+15(SB)/1, $0x00 7597 GLOBL dataGtF32<>(SB), RODATA|NOPTR, $16 7598 7599 // func Gt_AVX2_F32(x []bool, y []float32, z []float32) 7600 // Requires: AVX, AVX2 7601 TEXT ·Gt_AVX2_F32(SB), NOSPLIT, $0-72 7602 MOVQ x_base+0(FP), DI 7603 MOVQ y_base+24(FP), SI 7604 MOVQ z_base+48(FP), DX 7605 MOVQ x_len+8(FP), CX 7606 TESTQ CX, CX 7607 JE LBB5_7 7608 CMPQ CX, $0x20 7609 JAE LBB5_3 7610 XORL R8, R8 7611 JMP LBB5_6 7612 7613 LBB5_3: 7614 MOVQ CX, R8 7615 ANDQ $-32, R8 7616 XORL AX, AX 7617 VMOVDQU dataGtF32<>+0(SB), X0 7618 7619 LBB5_4: 7620 VMOVUPS (DX)(AX*4), Y1 7621 VMOVUPS 32(DX)(AX*4), Y2 7622 VMOVUPS 64(DX)(AX*4), Y3 7623 VMOVUPS 96(DX)(AX*4), Y4 7624 VCMPPS $0x01, (SI)(AX*4), Y1, Y1 7625 VEXTRACTF128 $0x01, Y1, X5 7626 VPACKSSDW X5, X1, X1 7627 VPACKSSWB 
X1, X1, X1 7628 VCMPPS $0x01, 32(SI)(AX*4), Y2, Y2 7629 VPAND X0, X1, X1 7630 VEXTRACTF128 $0x01, Y2, X5 7631 VPACKSSDW X5, X2, X2 7632 VPACKSSWB X2, X2, X2 7633 VPAND X0, X2, X2 7634 VCMPPS $0x01, 64(SI)(AX*4), Y3, Y3 7635 VEXTRACTF128 $0x01, Y3, X5 7636 VPACKSSDW X5, X3, X3 7637 VPACKSSWB X3, X3, X3 7638 VCMPPS $0x01, 96(SI)(AX*4), Y4, Y4 7639 VPAND X0, X3, X3 7640 VEXTRACTF128 $0x01, Y4, X5 7641 VPACKSSDW X5, X4, X4 7642 VPACKSSWB X4, X4, X4 7643 VPAND X0, X4, X4 7644 VINSERTI128 $0x01, X4, Y3, Y3 7645 VINSERTI128 $0x01, X2, Y1, Y1 7646 VPUNPCKLQDQ Y3, Y1, Y1 7647 VPERMQ $0xd8, Y1, Y1 7648 VMOVDQU Y1, (DI)(AX*1) 7649 ADDQ $0x20, AX 7650 CMPQ R8, AX 7651 JNE LBB5_4 7652 CMPQ R8, CX 7653 JE LBB5_7 7654 7655 LBB5_6: 7656 VMOVSS (SI)(R8*4), X0 7657 VUCOMISS (DX)(R8*4), X0 7658 SETHI (DI)(R8*1) 7659 ADDQ $0x01, R8 7660 CMPQ CX, R8 7661 JNE LBB5_6 7662 7663 LBB5_7: 7664 VZEROUPPER 7665 RET 7666 7667 DATA dataGteF64<>+0(SB)/1, $0x01 7668 DATA dataGteF64<>+1(SB)/1, $0x01 7669 DATA dataGteF64<>+2(SB)/1, $0x01 7670 DATA dataGteF64<>+3(SB)/1, $0x01 7671 DATA dataGteF64<>+4(SB)/1, $0x00 7672 DATA dataGteF64<>+5(SB)/1, $0x00 7673 DATA dataGteF64<>+6(SB)/1, $0x00 7674 DATA dataGteF64<>+7(SB)/1, $0x00 7675 DATA dataGteF64<>+8(SB)/1, $0x00 7676 DATA dataGteF64<>+9(SB)/1, $0x00 7677 DATA dataGteF64<>+10(SB)/1, $0x00 7678 DATA dataGteF64<>+11(SB)/1, $0x00 7679 DATA dataGteF64<>+12(SB)/1, $0x00 7680 DATA dataGteF64<>+13(SB)/1, $0x00 7681 DATA dataGteF64<>+14(SB)/1, $0x00 7682 DATA dataGteF64<>+15(SB)/1, $0x00 7683 GLOBL dataGteF64<>(SB), RODATA|NOPTR, $16 7684 7685 // func Gte_AVX2_F64(x []bool, y []float64, z []float64) 7686 // Requires: AVX, AVX2 7687 TEXT ·Gte_AVX2_F64(SB), NOSPLIT, $0-72 7688 MOVQ x_base+0(FP), DI 7689 MOVQ y_base+24(FP), SI 7690 MOVQ z_base+48(FP), DX 7691 MOVQ x_len+8(FP), CX 7692 TESTQ CX, CX 7693 JE LBB6_7 7694 CMPQ CX, $0x10 7695 JAE LBB6_3 7696 XORL R8, R8 7697 JMP LBB6_6 7698 7699 LBB6_3: 7700 MOVQ CX, R8 7701 ANDQ $-16, R8 7702 XORL AX, AX 7703 VMOVDQU dataGteF64<>+0(SB), X0 7704 7705 LBB6_4: 7706 VMOVUPD (DX)(AX*8), Y1 7707 VMOVUPD 32(DX)(AX*8), Y2 7708 VMOVUPD 64(DX)(AX*8), Y3 7709 VMOVUPD 96(DX)(AX*8), Y4 7710 VCMPPD $0x02, (SI)(AX*8), Y1, Y1 7711 VEXTRACTF128 $0x01, Y1, X5 7712 VPACKSSDW X5, X1, X1 7713 VPACKSSDW X1, X1, X1 7714 VPACKSSWB X1, X1, X1 7715 VCMPPD $0x02, 32(SI)(AX*8), Y2, Y2 7716 VPAND X0, X1, X1 7717 VEXTRACTF128 $0x01, Y2, X5 7718 VPACKSSDW X5, X2, X2 7719 VPACKSSDW X2, X2, X2 7720 VPACKSSWB X2, X2, X2 7721 VPAND X0, X2, X2 7722 VCMPPD $0x02, 64(SI)(AX*8), Y3, Y3 7723 VPUNPCKLDQ X2, X1, X1 7724 VEXTRACTF128 $0x01, Y3, X2 7725 VPACKSSDW X2, X3, X2 7726 VPACKSSDW X2, X2, X2 7727 VPACKSSWB X2, X2, X2 7728 VPAND X0, X2, X2 7729 VCMPPD $0x02, 96(SI)(AX*8), Y4, Y3 7730 VEXTRACTF128 $0x01, Y3, X4 7731 VPACKSSDW X4, X3, X3 7732 VPACKSSDW X3, X3, X3 7733 VPACKSSWB X3, X3, X3 7734 VPAND X0, X3, X3 7735 VPBROADCASTD X3, X3 7736 VPBROADCASTD X2, X2 7737 VPUNPCKLDQ X3, X2, X2 7738 VPBLENDD $0x0c, X2, X1, X1 7739 VMOVDQU X1, (DI)(AX*1) 7740 ADDQ $0x10, AX 7741 CMPQ R8, AX 7742 JNE LBB6_4 7743 CMPQ R8, CX 7744 JE LBB6_7 7745 7746 LBB6_6: 7747 VMOVSD (SI)(R8*8), X0 7748 VUCOMISD (DX)(R8*8), X0 7749 SETCC (DI)(R8*1) 7750 ADDQ $0x01, R8 7751 CMPQ CX, R8 7752 JNE LBB6_6 7753 7754 LBB6_7: 7755 VZEROUPPER 7756 RET 7757 7758 DATA dataGteF32<>+0(SB)/1, $0x01 7759 DATA dataGteF32<>+1(SB)/1, $0x01 7760 DATA dataGteF32<>+2(SB)/1, $0x01 7761 DATA dataGteF32<>+3(SB)/1, $0x01 7762 DATA dataGteF32<>+4(SB)/1, $0x01 7763 DATA dataGteF32<>+5(SB)/1, $0x01 7764 DATA dataGteF32<>+6(SB)/1, 
$0x01 7765 DATA dataGteF32<>+7(SB)/1, $0x01 7766 DATA dataGteF32<>+8(SB)/1, $0x00 7767 DATA dataGteF32<>+9(SB)/1, $0x00 7768 DATA dataGteF32<>+10(SB)/1, $0x00 7769 DATA dataGteF32<>+11(SB)/1, $0x00 7770 DATA dataGteF32<>+12(SB)/1, $0x00 7771 DATA dataGteF32<>+13(SB)/1, $0x00 7772 DATA dataGteF32<>+14(SB)/1, $0x00 7773 DATA dataGteF32<>+15(SB)/1, $0x00 7774 GLOBL dataGteF32<>(SB), RODATA|NOPTR, $16 7775 7776 // func Gte_AVX2_F32(x []bool, y []float32, z []float32) 7777 // Requires: AVX, AVX2 7778 TEXT ·Gte_AVX2_F32(SB), NOSPLIT, $0-72 7779 MOVQ x_base+0(FP), DI 7780 MOVQ y_base+24(FP), SI 7781 MOVQ z_base+48(FP), DX 7782 MOVQ x_len+8(FP), CX 7783 TESTQ CX, CX 7784 JE LBB7_7 7785 CMPQ CX, $0x20 7786 JAE LBB7_3 7787 XORL R8, R8 7788 JMP LBB7_6 7789 7790 LBB7_3: 7791 MOVQ CX, R8 7792 ANDQ $-32, R8 7793 XORL AX, AX 7794 VMOVDQU dataGteF32<>+0(SB), X0 7795 7796 LBB7_4: 7797 VMOVUPS (DX)(AX*4), Y1 7798 VMOVUPS 32(DX)(AX*4), Y2 7799 VMOVUPS 64(DX)(AX*4), Y3 7800 VMOVUPS 96(DX)(AX*4), Y4 7801 VCMPPS $0x02, (SI)(AX*4), Y1, Y1 7802 VEXTRACTF128 $0x01, Y1, X5 7803 VPACKSSDW X5, X1, X1 7804 VPACKSSWB X1, X1, X1 7805 VCMPPS $0x02, 32(SI)(AX*4), Y2, Y2 7806 VPAND X0, X1, X1 7807 VEXTRACTF128 $0x01, Y2, X5 7808 VPACKSSDW X5, X2, X2 7809 VPACKSSWB X2, X2, X2 7810 VPAND X0, X2, X2 7811 VCMPPS $0x02, 64(SI)(AX*4), Y3, Y3 7812 VEXTRACTF128 $0x01, Y3, X5 7813 VPACKSSDW X5, X3, X3 7814 VPACKSSWB X3, X3, X3 7815 VCMPPS $0x02, 96(SI)(AX*4), Y4, Y4 7816 VPAND X0, X3, X3 7817 VEXTRACTF128 $0x01, Y4, X5 7818 VPACKSSDW X5, X4, X4 7819 VPACKSSWB X4, X4, X4 7820 VPAND X0, X4, X4 7821 VINSERTI128 $0x01, X4, Y3, Y3 7822 VINSERTI128 $0x01, X2, Y1, Y1 7823 VPUNPCKLQDQ Y3, Y1, Y1 7824 VPERMQ $0xd8, Y1, Y1 7825 VMOVDQU Y1, (DI)(AX*1) 7826 ADDQ $0x20, AX 7827 CMPQ R8, AX 7828 JNE LBB7_4 7829 CMPQ R8, CX 7830 JE LBB7_7 7831 7832 LBB7_6: 7833 VMOVSS (SI)(R8*4), X0 7834 VUCOMISS (DX)(R8*4), X0 7835 SETCC (DI)(R8*1) 7836 ADDQ $0x01, R8 7837 CMPQ CX, R8 7838 JNE LBB7_6 7839 7840 LBB7_7: 7841 VZEROUPPER 7842 RET 7843 7844 DATA dataEqF64<>+0(SB)/1, $0x01 7845 DATA dataEqF64<>+1(SB)/1, $0x01 7846 DATA dataEqF64<>+2(SB)/1, $0x01 7847 DATA dataEqF64<>+3(SB)/1, $0x01 7848 DATA dataEqF64<>+4(SB)/1, $0x00 7849 DATA dataEqF64<>+5(SB)/1, $0x00 7850 DATA dataEqF64<>+6(SB)/1, $0x00 7851 DATA dataEqF64<>+7(SB)/1, $0x00 7852 DATA dataEqF64<>+8(SB)/1, $0x00 7853 DATA dataEqF64<>+9(SB)/1, $0x00 7854 DATA dataEqF64<>+10(SB)/1, $0x00 7855 DATA dataEqF64<>+11(SB)/1, $0x00 7856 DATA dataEqF64<>+12(SB)/1, $0x00 7857 DATA dataEqF64<>+13(SB)/1, $0x00 7858 DATA dataEqF64<>+14(SB)/1, $0x00 7859 DATA dataEqF64<>+15(SB)/1, $0x00 7860 GLOBL dataEqF64<>(SB), RODATA|NOPTR, $16 7861 7862 // func Eq_AVX2_F64(x []bool, y []float64, z []float64) 7863 // Requires: AVX, AVX2 7864 TEXT ·Eq_AVX2_F64(SB), NOSPLIT, $0-72 7865 MOVQ x_base+0(FP), DI 7866 MOVQ y_base+24(FP), SI 7867 MOVQ z_base+48(FP), DX 7868 MOVQ x_len+8(FP), CX 7869 TESTQ CX, CX 7870 JE LBB8_7 7871 CMPQ CX, $0x10 7872 JAE LBB8_3 7873 XORL R8, R8 7874 JMP LBB8_6 7875 7876 LBB8_3: 7877 MOVQ CX, R8 7878 ANDQ $-16, R8 7879 XORL AX, AX 7880 VMOVDQU dataEqF64<>+0(SB), X0 7881 7882 LBB8_4: 7883 VMOVUPD (DX)(AX*8), Y1 7884 VMOVUPD 32(DX)(AX*8), Y2 7885 VMOVUPD 64(DX)(AX*8), Y3 7886 VMOVUPD 96(DX)(AX*8), Y4 7887 VCMPPD $0x00, (SI)(AX*8), Y1, Y1 7888 VEXTRACTF128 $0x01, Y1, X5 7889 VPACKSSDW X5, X1, X1 7890 VPACKSSDW X1, X1, X1 7891 VPACKSSWB X1, X1, X1 7892 VCMPPD $0x00, 32(SI)(AX*8), Y2, Y2 7893 VPAND X0, X1, X1 7894 VEXTRACTF128 $0x01, Y2, X5 7895 VPACKSSDW X5, X2, X2 7896 VPACKSSDW X2, X2, X2 7897 VPACKSSWB 
X2, X2, X2 7898 VPAND X0, X2, X2 7899 VCMPPD $0x00, 64(SI)(AX*8), Y3, Y3 7900 VPUNPCKLDQ X2, X1, X1 7901 VEXTRACTF128 $0x01, Y3, X2 7902 VPACKSSDW X2, X3, X2 7903 VPACKSSDW X2, X2, X2 7904 VPACKSSWB X2, X2, X2 7905 VPAND X0, X2, X2 7906 VCMPPD $0x00, 96(SI)(AX*8), Y4, Y3 7907 VEXTRACTF128 $0x01, Y3, X4 7908 VPACKSSDW X4, X3, X3 7909 VPACKSSDW X3, X3, X3 7910 VPACKSSWB X3, X3, X3 7911 VPAND X0, X3, X3 7912 VPBROADCASTD X3, X3 7913 VPBROADCASTD X2, X2 7914 VPUNPCKLDQ X3, X2, X2 7915 VPBLENDD $0x0c, X2, X1, X1 7916 VMOVDQU X1, (DI)(AX*1) 7917 ADDQ $0x10, AX 7918 CMPQ R8, AX 7919 JNE LBB8_4 7920 CMPQ R8, CX 7921 JE LBB8_7 7922 7923 LBB8_6: 7924 VMOVSD (SI)(R8*8), X0 7925 VUCOMISD (DX)(R8*8), X0 7926 SETEQ (DI)(R8*1) 7927 ADDQ $0x01, R8 7928 CMPQ CX, R8 7929 JNE LBB8_6 7930 7931 LBB8_7: 7932 VZEROUPPER 7933 RET 7934 7935 DATA dataEqF32<>+0(SB)/1, $0x01 7936 DATA dataEqF32<>+1(SB)/1, $0x01 7937 DATA dataEqF32<>+2(SB)/1, $0x01 7938 DATA dataEqF32<>+3(SB)/1, $0x01 7939 DATA dataEqF32<>+4(SB)/1, $0x01 7940 DATA dataEqF32<>+5(SB)/1, $0x01 7941 DATA dataEqF32<>+6(SB)/1, $0x01 7942 DATA dataEqF32<>+7(SB)/1, $0x01 7943 DATA dataEqF32<>+8(SB)/1, $0x00 7944 DATA dataEqF32<>+9(SB)/1, $0x00 7945 DATA dataEqF32<>+10(SB)/1, $0x00 7946 DATA dataEqF32<>+11(SB)/1, $0x00 7947 DATA dataEqF32<>+12(SB)/1, $0x00 7948 DATA dataEqF32<>+13(SB)/1, $0x00 7949 DATA dataEqF32<>+14(SB)/1, $0x00 7950 DATA dataEqF32<>+15(SB)/1, $0x00 7951 GLOBL dataEqF32<>(SB), RODATA|NOPTR, $16 7952 7953 // func Eq_AVX2_F32(x []bool, y []float32, z []float32) 7954 // Requires: AVX, AVX2 7955 TEXT ·Eq_AVX2_F32(SB), NOSPLIT, $0-72 7956 MOVQ x_base+0(FP), DI 7957 MOVQ y_base+24(FP), SI 7958 MOVQ z_base+48(FP), DX 7959 MOVQ x_len+8(FP), CX 7960 TESTQ CX, CX 7961 JE LBB9_7 7962 CMPQ CX, $0x20 7963 JAE LBB9_3 7964 XORL R8, R8 7965 JMP LBB9_6 7966 7967 LBB9_3: 7968 MOVQ CX, R8 7969 ANDQ $-32, R8 7970 XORL AX, AX 7971 VMOVDQU dataEqF32<>+0(SB), X0 7972 7973 LBB9_4: 7974 VMOVUPS (DX)(AX*4), Y1 7975 VMOVUPS 32(DX)(AX*4), Y2 7976 VMOVUPS 64(DX)(AX*4), Y3 7977 VMOVUPS 96(DX)(AX*4), Y4 7978 VCMPPS $0x00, (SI)(AX*4), Y1, Y1 7979 VEXTRACTF128 $0x01, Y1, X5 7980 VPACKSSDW X5, X1, X1 7981 VPACKSSWB X1, X1, X1 7982 VCMPPS $0x00, 32(SI)(AX*4), Y2, Y2 7983 VPAND X0, X1, X1 7984 VEXTRACTF128 $0x01, Y2, X5 7985 VPACKSSDW X5, X2, X2 7986 VPACKSSWB X2, X2, X2 7987 VPAND X0, X2, X2 7988 VCMPPS $0x00, 64(SI)(AX*4), Y3, Y3 7989 VEXTRACTF128 $0x01, Y3, X5 7990 VPACKSSDW X5, X3, X3 7991 VPACKSSWB X3, X3, X3 7992 VCMPPS $0x00, 96(SI)(AX*4), Y4, Y4 7993 VPAND X0, X3, X3 7994 VEXTRACTF128 $0x01, Y4, X5 7995 VPACKSSDW X5, X4, X4 7996 VPACKSSWB X4, X4, X4 7997 VPAND X0, X4, X4 7998 VINSERTI128 $0x01, X4, Y3, Y3 7999 VINSERTI128 $0x01, X2, Y1, Y1 8000 VPUNPCKLQDQ Y3, Y1, Y1 8001 VPERMQ $0xd8, Y1, Y1 8002 VMOVDQU Y1, (DI)(AX*1) 8003 ADDQ $0x20, AX 8004 CMPQ R8, AX 8005 JNE LBB9_4 8006 CMPQ R8, CX 8007 JE LBB9_7 8008 8009 LBB9_6: 8010 VMOVSS (SI)(R8*4), X0 8011 VUCOMISS (DX)(R8*4), X0 8012 SETEQ (DI)(R8*1) 8013 ADDQ $0x01, R8 8014 CMPQ CX, R8 8015 JNE LBB9_6 8016 8017 LBB9_7: 8018 VZEROUPPER 8019 RET 8020 8021 DATA dataNeqF64<>+0(SB)/1, $0x01 8022 DATA dataNeqF64<>+1(SB)/1, $0x01 8023 DATA dataNeqF64<>+2(SB)/1, $0x01 8024 DATA dataNeqF64<>+3(SB)/1, $0x01 8025 DATA dataNeqF64<>+4(SB)/1, $0x00 8026 DATA dataNeqF64<>+5(SB)/1, $0x00 8027 DATA dataNeqF64<>+6(SB)/1, $0x00 8028 DATA dataNeqF64<>+7(SB)/1, $0x00 8029 DATA dataNeqF64<>+8(SB)/1, $0x00 8030 DATA dataNeqF64<>+9(SB)/1, $0x00 8031 DATA dataNeqF64<>+10(SB)/1, $0x00 8032 DATA dataNeqF64<>+11(SB)/1, $0x00 8033 DATA 
dataNeqF64<>+12(SB)/1, $0x00 8034 DATA dataNeqF64<>+13(SB)/1, $0x00 8035 DATA dataNeqF64<>+14(SB)/1, $0x00 8036 DATA dataNeqF64<>+15(SB)/1, $0x00 8037 GLOBL dataNeqF64<>(SB), RODATA|NOPTR, $16 8038 8039 // func Neq_AVX2_F64(x []bool, y []float64, z []float64) 8040 // Requires: AVX, AVX2 8041 TEXT ·Neq_AVX2_F64(SB), NOSPLIT, $0-72 8042 MOVQ x_base+0(FP), DI 8043 MOVQ y_base+24(FP), SI 8044 MOVQ z_base+48(FP), DX 8045 MOVQ x_len+8(FP), CX 8046 TESTQ CX, CX 8047 JE LBB10_7 8048 CMPQ CX, $0x10 8049 JAE LBB10_3 8050 XORL R8, R8 8051 JMP LBB10_6 8052 8053 LBB10_3: 8054 MOVQ CX, R8 8055 ANDQ $-16, R8 8056 XORL AX, AX 8057 VMOVDQU dataNeqF64<>+0(SB), X0 8058 8059 LBB10_4: 8060 VMOVUPD (DX)(AX*8), Y1 8061 VMOVUPD 32(DX)(AX*8), Y2 8062 VMOVUPD 64(DX)(AX*8), Y3 8063 VMOVUPD 96(DX)(AX*8), Y4 8064 VCMPPD $0x04, (SI)(AX*8), Y1, Y1 8065 VEXTRACTF128 $0x01, Y1, X5 8066 VPACKSSDW X5, X1, X1 8067 VPACKSSDW X1, X1, X1 8068 VPACKSSWB X1, X1, X1 8069 VCMPPD $0x04, 32(SI)(AX*8), Y2, Y2 8070 VPAND X0, X1, X1 8071 VEXTRACTF128 $0x01, Y2, X5 8072 VPACKSSDW X5, X2, X2 8073 VPACKSSDW X2, X2, X2 8074 VPACKSSWB X2, X2, X2 8075 VPAND X0, X2, X2 8076 VCMPPD $0x04, 64(SI)(AX*8), Y3, Y3 8077 VPUNPCKLDQ X2, X1, X1 8078 VEXTRACTF128 $0x01, Y3, X2 8079 VPACKSSDW X2, X3, X2 8080 VPACKSSDW X2, X2, X2 8081 VPACKSSWB X2, X2, X2 8082 VPAND X0, X2, X2 8083 VCMPPD $0x04, 96(SI)(AX*8), Y4, Y3 8084 VEXTRACTF128 $0x01, Y3, X4 8085 VPACKSSDW X4, X3, X3 8086 VPACKSSDW X3, X3, X3 8087 VPACKSSWB X3, X3, X3 8088 VPAND X0, X3, X3 8089 VPBROADCASTD X3, X3 8090 VPBROADCASTD X2, X2 8091 VPUNPCKLDQ X3, X2, X2 8092 VPBLENDD $0x0c, X2, X1, X1 8093 VMOVDQU X1, (DI)(AX*1) 8094 ADDQ $0x10, AX 8095 CMPQ R8, AX 8096 JNE LBB10_4 8097 CMPQ R8, CX 8098 JE LBB10_7 8099 8100 LBB10_6: 8101 VMOVSD (SI)(R8*8), X0 8102 VUCOMISD (DX)(R8*8), X0 8103 SETNE (DI)(R8*1) 8104 ADDQ $0x01, R8 8105 CMPQ CX, R8 8106 JNE LBB10_6 8107 8108 LBB10_7: 8109 VZEROUPPER 8110 RET 8111 8112 DATA dataNeqF32<>+0(SB)/1, $0x01 8113 DATA dataNeqF32<>+1(SB)/1, $0x01 8114 DATA dataNeqF32<>+2(SB)/1, $0x01 8115 DATA dataNeqF32<>+3(SB)/1, $0x01 8116 DATA dataNeqF32<>+4(SB)/1, $0x01 8117 DATA dataNeqF32<>+5(SB)/1, $0x01 8118 DATA dataNeqF32<>+6(SB)/1, $0x01 8119 DATA dataNeqF32<>+7(SB)/1, $0x01 8120 DATA dataNeqF32<>+8(SB)/1, $0x00 8121 DATA dataNeqF32<>+9(SB)/1, $0x00 8122 DATA dataNeqF32<>+10(SB)/1, $0x00 8123 DATA dataNeqF32<>+11(SB)/1, $0x00 8124 DATA dataNeqF32<>+12(SB)/1, $0x00 8125 DATA dataNeqF32<>+13(SB)/1, $0x00 8126 DATA dataNeqF32<>+14(SB)/1, $0x00 8127 DATA dataNeqF32<>+15(SB)/1, $0x00 8128 GLOBL dataNeqF32<>(SB), RODATA|NOPTR, $16 8129 8130 // func Neq_AVX2_F32(x []bool, y []float32, z []float32) 8131 // Requires: AVX, AVX2 8132 TEXT ·Neq_AVX2_F32(SB), NOSPLIT, $0-72 8133 MOVQ x_base+0(FP), DI 8134 MOVQ y_base+24(FP), SI 8135 MOVQ z_base+48(FP), DX 8136 MOVQ x_len+8(FP), CX 8137 TESTQ CX, CX 8138 JE LBB11_7 8139 CMPQ CX, $0x20 8140 JAE LBB11_3 8141 XORL R8, R8 8142 JMP LBB11_6 8143 8144 LBB11_3: 8145 MOVQ CX, R8 8146 ANDQ $-32, R8 8147 XORL AX, AX 8148 VMOVDQU dataNeqF32<>+0(SB), X0 8149 8150 LBB11_4: 8151 VMOVUPS (DX)(AX*4), Y1 8152 VMOVUPS 32(DX)(AX*4), Y2 8153 VMOVUPS 64(DX)(AX*4), Y3 8154 VMOVUPS 96(DX)(AX*4), Y4 8155 VCMPPS $0x04, (SI)(AX*4), Y1, Y1 8156 VEXTRACTF128 $0x01, Y1, X5 8157 VPACKSSDW X5, X1, X1 8158 VPACKSSWB X1, X1, X1 8159 VCMPPS $0x04, 32(SI)(AX*4), Y2, Y2 8160 VPAND X0, X1, X1 8161 VEXTRACTF128 $0x01, Y2, X5 8162 VPACKSSDW X5, X2, X2 8163 VPACKSSWB X2, X2, X2 8164 VPAND X0, X2, X2 8165 VCMPPS $0x04, 64(SI)(AX*4), Y3, Y3 8166 VEXTRACTF128 $0x01, Y3, X5 8167 
VPACKSSDW X5, X3, X3 8168 VPACKSSWB X3, X3, X3 8169 VCMPPS $0x04, 96(SI)(AX*4), Y4, Y4 8170 VPAND X0, X3, X3 8171 VEXTRACTF128 $0x01, Y4, X5 8172 VPACKSSDW X5, X4, X4 8173 VPACKSSWB X4, X4, X4 8174 VPAND X0, X4, X4 8175 VINSERTI128 $0x01, X4, Y3, Y3 8176 VINSERTI128 $0x01, X2, Y1, Y1 8177 VPUNPCKLQDQ Y3, Y1, Y1 8178 VPERMQ $0xd8, Y1, Y1 8179 VMOVDQU Y1, (DI)(AX*1) 8180 ADDQ $0x20, AX 8181 CMPQ R8, AX 8182 JNE LBB11_4 8183 CMPQ R8, CX 8184 JE LBB11_7 8185 8186 LBB11_6: 8187 VMOVSS (SI)(R8*4), X0 8188 VUCOMISS (DX)(R8*4), X0 8189 SETNE (DI)(R8*1) 8190 ADDQ $0x01, R8 8191 CMPQ CX, R8 8192 JNE LBB11_6 8193 8194 LBB11_7: 8195 VZEROUPPER 8196 RET 8197 8198 DATA dataLtNumberF64<>+0(SB)/1, $0x01 8199 DATA dataLtNumberF64<>+1(SB)/1, $0x01 8200 DATA dataLtNumberF64<>+2(SB)/1, $0x01 8201 DATA dataLtNumberF64<>+3(SB)/1, $0x01 8202 DATA dataLtNumberF64<>+4(SB)/1, $0x00 8203 DATA dataLtNumberF64<>+5(SB)/1, $0x00 8204 DATA dataLtNumberF64<>+6(SB)/1, $0x00 8205 DATA dataLtNumberF64<>+7(SB)/1, $0x00 8206 DATA dataLtNumberF64<>+8(SB)/1, $0x00 8207 DATA dataLtNumberF64<>+9(SB)/1, $0x00 8208 DATA dataLtNumberF64<>+10(SB)/1, $0x00 8209 DATA dataLtNumberF64<>+11(SB)/1, $0x00 8210 DATA dataLtNumberF64<>+12(SB)/1, $0x00 8211 DATA dataLtNumberF64<>+13(SB)/1, $0x00 8212 DATA dataLtNumberF64<>+14(SB)/1, $0x00 8213 DATA dataLtNumberF64<>+15(SB)/1, $0x00 8214 GLOBL dataLtNumberF64<>(SB), RODATA|NOPTR, $16 8215 8216 // func LtNumber_AVX2_F64(x []bool, y []float64, a float64) 8217 // Requires: AVX, AVX2, SSE2 8218 TEXT ·LtNumber_AVX2_F64(SB), NOSPLIT, $0-56 8219 MOVQ x_base+0(FP), DI 8220 MOVQ y_base+24(FP), SI 8221 MOVSD a+48(FP), X0 8222 MOVQ x_len+8(FP), DX 8223 TESTQ DX, DX 8224 JE LBB12_7 8225 CMPQ DX, $0x10 8226 JAE LBB12_3 8227 XORL AX, AX 8228 JMP LBB12_6 8229 8230 LBB12_3: 8231 MOVQ DX, AX 8232 ANDQ $-16, AX 8233 VBROADCASTSD X0, Y1 8234 XORL CX, CX 8235 VMOVDQU dataLtNumberF64<>+0(SB), X2 8236 8237 LBB12_4: 8238 VMOVUPD (SI)(CX*8), Y3 8239 VMOVUPD 32(SI)(CX*8), Y4 8240 VMOVUPD 64(SI)(CX*8), Y5 8241 VMOVUPD 96(SI)(CX*8), Y6 8242 VCMPPD $0x01, Y1, Y3, Y3 8243 VEXTRACTF128 $0x01, Y3, X7 8244 VPACKSSDW X7, X3, X3 8245 VPACKSSDW X3, X3, X3 8246 VPACKSSWB X3, X3, X3 8247 VPAND X2, X3, X3 8248 VCMPPD $0x01, Y1, Y4, Y4 8249 VEXTRACTF128 $0x01, Y4, X7 8250 VPACKSSDW X7, X4, X4 8251 VPACKSSDW X4, X4, X4 8252 VPACKSSWB X4, X4, X4 8253 VPAND X2, X4, X4 8254 VPUNPCKLDQ X4, X3, X3 8255 VCMPPD $0x01, Y1, Y5, Y4 8256 VEXTRACTF128 $0x01, Y4, X5 8257 VPACKSSDW X5, X4, X4 8258 VPACKSSDW X4, X4, X4 8259 VPACKSSWB X4, X4, X4 8260 VPAND X2, X4, X4 8261 VCMPPD $0x01, Y1, Y6, Y5 8262 VEXTRACTF128 $0x01, Y5, X6 8263 VPACKSSDW X6, X5, X5 8264 VPACKSSDW X5, X5, X5 8265 VPACKSSWB X5, X5, X5 8266 VPAND X2, X5, X5 8267 VPBROADCASTD X5, X5 8268 VPBROADCASTD X4, X4 8269 VPUNPCKLDQ X5, X4, X4 8270 VPBLENDD $0x0c, X4, X3, X3 8271 VMOVDQU X3, (DI)(CX*1) 8272 ADDQ $0x10, CX 8273 CMPQ AX, CX 8274 JNE LBB12_4 8275 CMPQ AX, DX 8276 JE LBB12_7 8277 8278 LBB12_6: 8279 VUCOMISD (SI)(AX*8), X0 8280 SETHI (DI)(AX*1) 8281 ADDQ $0x01, AX 8282 CMPQ DX, AX 8283 JNE LBB12_6 8284 8285 LBB12_7: 8286 VZEROUPPER 8287 RET 8288 8289 DATA dataLtNumberF32<>+0(SB)/1, $0x01 8290 DATA dataLtNumberF32<>+1(SB)/1, $0x01 8291 DATA dataLtNumberF32<>+2(SB)/1, $0x01 8292 DATA dataLtNumberF32<>+3(SB)/1, $0x01 8293 DATA dataLtNumberF32<>+4(SB)/1, $0x01 8294 DATA dataLtNumberF32<>+5(SB)/1, $0x01 8295 DATA dataLtNumberF32<>+6(SB)/1, $0x01 8296 DATA dataLtNumberF32<>+7(SB)/1, $0x01 8297 DATA dataLtNumberF32<>+8(SB)/1, $0x00 8298 DATA dataLtNumberF32<>+9(SB)/1, $0x00 8299 DATA 
dataLtNumberF32<>+10(SB)/1, $0x00 8300 DATA dataLtNumberF32<>+11(SB)/1, $0x00 8301 DATA dataLtNumberF32<>+12(SB)/1, $0x00 8302 DATA dataLtNumberF32<>+13(SB)/1, $0x00 8303 DATA dataLtNumberF32<>+14(SB)/1, $0x00 8304 DATA dataLtNumberF32<>+15(SB)/1, $0x00 8305 GLOBL dataLtNumberF32<>(SB), RODATA|NOPTR, $16 8306 8307 // func LtNumber_AVX2_F32(x []bool, y []float32, a float32) 8308 // Requires: AVX, AVX2, SSE 8309 TEXT ·LtNumber_AVX2_F32(SB), NOSPLIT, $0-52 8310 MOVQ x_base+0(FP), DI 8311 MOVQ y_base+24(FP), SI 8312 MOVSS a+48(FP), X0 8313 MOVQ x_len+8(FP), DX 8314 TESTQ DX, DX 8315 JE LBB13_7 8316 CMPQ DX, $0x20 8317 JAE LBB13_3 8318 XORL AX, AX 8319 JMP LBB13_6 8320 8321 LBB13_3: 8322 MOVQ DX, AX 8323 ANDQ $-32, AX 8324 VBROADCASTSS X0, Y1 8325 XORL CX, CX 8326 VMOVDQU dataLtNumberF32<>+0(SB), X2 8327 8328 LBB13_4: 8329 VMOVUPS (SI)(CX*4), Y3 8330 VMOVUPS 32(SI)(CX*4), Y4 8331 VMOVUPS 64(SI)(CX*4), Y5 8332 VMOVUPS 96(SI)(CX*4), Y6 8333 VCMPPS $0x01, Y1, Y3, Y3 8334 VEXTRACTF128 $0x01, Y3, X7 8335 VPACKSSDW X7, X3, X3 8336 VPACKSSWB X3, X3, X3 8337 VPAND X2, X3, X3 8338 VCMPPS $0x01, Y1, Y4, Y4 8339 VEXTRACTF128 $0x01, Y4, X7 8340 VPACKSSDW X7, X4, X4 8341 VPACKSSWB X4, X4, X4 8342 VPAND X2, X4, X4 8343 VCMPPS $0x01, Y1, Y5, Y5 8344 VEXTRACTF128 $0x01, Y5, X7 8345 VPACKSSDW X7, X5, X5 8346 VPACKSSWB X5, X5, X5 8347 VPAND X2, X5, X5 8348 VCMPPS $0x01, Y1, Y6, Y6 8349 VEXTRACTF128 $0x01, Y6, X7 8350 VPACKSSDW X7, X6, X6 8351 VPACKSSWB X6, X6, X6 8352 VPAND X2, X6, X6 8353 VINSERTI128 $0x01, X6, Y5, Y5 8354 VINSERTI128 $0x01, X4, Y3, Y3 8355 VPUNPCKLQDQ Y5, Y3, Y3 8356 VPERMQ $0xd8, Y3, Y3 8357 VMOVDQU Y3, (DI)(CX*1) 8358 ADDQ $0x20, CX 8359 CMPQ AX, CX 8360 JNE LBB13_4 8361 CMPQ AX, DX 8362 JE LBB13_7 8363 8364 LBB13_6: 8365 VUCOMISS (SI)(AX*4), X0 8366 SETHI (DI)(AX*1) 8367 ADDQ $0x01, AX 8368 CMPQ DX, AX 8369 JNE LBB13_6 8370 8371 LBB13_7: 8372 VZEROUPPER 8373 RET 8374 8375 DATA dataLteNumberF64<>+0(SB)/1, $0x01 8376 DATA dataLteNumberF64<>+1(SB)/1, $0x01 8377 DATA dataLteNumberF64<>+2(SB)/1, $0x01 8378 DATA dataLteNumberF64<>+3(SB)/1, $0x01 8379 DATA dataLteNumberF64<>+4(SB)/1, $0x00 8380 DATA dataLteNumberF64<>+5(SB)/1, $0x00 8381 DATA dataLteNumberF64<>+6(SB)/1, $0x00 8382 DATA dataLteNumberF64<>+7(SB)/1, $0x00 8383 DATA dataLteNumberF64<>+8(SB)/1, $0x00 8384 DATA dataLteNumberF64<>+9(SB)/1, $0x00 8385 DATA dataLteNumberF64<>+10(SB)/1, $0x00 8386 DATA dataLteNumberF64<>+11(SB)/1, $0x00 8387 DATA dataLteNumberF64<>+12(SB)/1, $0x00 8388 DATA dataLteNumberF64<>+13(SB)/1, $0x00 8389 DATA dataLteNumberF64<>+14(SB)/1, $0x00 8390 DATA dataLteNumberF64<>+15(SB)/1, $0x00 8391 GLOBL dataLteNumberF64<>(SB), RODATA|NOPTR, $16 8392 8393 // func LteNumber_AVX2_F64(x []bool, y []float64, a float64) 8394 // Requires: AVX, AVX2, SSE2 8395 TEXT ·LteNumber_AVX2_F64(SB), NOSPLIT, $0-56 8396 MOVQ x_base+0(FP), DI 8397 MOVQ y_base+24(FP), SI 8398 MOVSD a+48(FP), X0 8399 MOVQ x_len+8(FP), DX 8400 TESTQ DX, DX 8401 JE LBB14_7 8402 CMPQ DX, $0x10 8403 JAE LBB14_3 8404 XORL AX, AX 8405 JMP LBB14_6 8406 8407 LBB14_3: 8408 MOVQ DX, AX 8409 ANDQ $-16, AX 8410 VBROADCASTSD X0, Y1 8411 XORL CX, CX 8412 VMOVDQU dataLteNumberF64<>+0(SB), X2 8413 8414 LBB14_4: 8415 VMOVUPD (SI)(CX*8), Y3 8416 VMOVUPD 32(SI)(CX*8), Y4 8417 VMOVUPD 64(SI)(CX*8), Y5 8418 VMOVUPD 96(SI)(CX*8), Y6 8419 VCMPPD $0x02, Y1, Y3, Y3 8420 VEXTRACTF128 $0x01, Y3, X7 8421 VPACKSSDW X7, X3, X3 8422 VPACKSSDW X3, X3, X3 8423 VPACKSSWB X3, X3, X3 8424 VPAND X2, X3, X3 8425 VCMPPD $0x02, Y1, Y4, Y4 8426 VEXTRACTF128 $0x01, Y4, X7 8427 VPACKSSDW X7, X4, X4 
8428 VPACKSSDW X4, X4, X4 8429 VPACKSSWB X4, X4, X4 8430 VPAND X2, X4, X4 8431 VPUNPCKLDQ X4, X3, X3 8432 VCMPPD $0x02, Y1, Y5, Y4 8433 VEXTRACTF128 $0x01, Y4, X5 8434 VPACKSSDW X5, X4, X4 8435 VPACKSSDW X4, X4, X4 8436 VPACKSSWB X4, X4, X4 8437 VPAND X2, X4, X4 8438 VCMPPD $0x02, Y1, Y6, Y5 8439 VEXTRACTF128 $0x01, Y5, X6 8440 VPACKSSDW X6, X5, X5 8441 VPACKSSDW X5, X5, X5 8442 VPACKSSWB X5, X5, X5 8443 VPAND X2, X5, X5 8444 VPBROADCASTD X5, X5 8445 VPBROADCASTD X4, X4 8446 VPUNPCKLDQ X5, X4, X4 8447 VPBLENDD $0x0c, X4, X3, X3 8448 VMOVDQU X3, (DI)(CX*1) 8449 ADDQ $0x10, CX 8450 CMPQ AX, CX 8451 JNE LBB14_4 8452 CMPQ AX, DX 8453 JE LBB14_7 8454 8455 LBB14_6: 8456 VUCOMISD (SI)(AX*8), X0 8457 SETCC (DI)(AX*1) 8458 ADDQ $0x01, AX 8459 CMPQ DX, AX 8460 JNE LBB14_6 8461 8462 LBB14_7: 8463 VZEROUPPER 8464 RET 8465 8466 DATA dataLteNumberF32<>+0(SB)/1, $0x01 8467 DATA dataLteNumberF32<>+1(SB)/1, $0x01 8468 DATA dataLteNumberF32<>+2(SB)/1, $0x01 8469 DATA dataLteNumberF32<>+3(SB)/1, $0x01 8470 DATA dataLteNumberF32<>+4(SB)/1, $0x01 8471 DATA dataLteNumberF32<>+5(SB)/1, $0x01 8472 DATA dataLteNumberF32<>+6(SB)/1, $0x01 8473 DATA dataLteNumberF32<>+7(SB)/1, $0x01 8474 DATA dataLteNumberF32<>+8(SB)/1, $0x00 8475 DATA dataLteNumberF32<>+9(SB)/1, $0x00 8476 DATA dataLteNumberF32<>+10(SB)/1, $0x00 8477 DATA dataLteNumberF32<>+11(SB)/1, $0x00 8478 DATA dataLteNumberF32<>+12(SB)/1, $0x00 8479 DATA dataLteNumberF32<>+13(SB)/1, $0x00 8480 DATA dataLteNumberF32<>+14(SB)/1, $0x00 8481 DATA dataLteNumberF32<>+15(SB)/1, $0x00 8482 GLOBL dataLteNumberF32<>(SB), RODATA|NOPTR, $16 8483 8484 // func LteNumber_AVX2_F32(x []bool, y []float32, a float32) 8485 // Requires: AVX, AVX2, SSE 8486 TEXT ·LteNumber_AVX2_F32(SB), NOSPLIT, $0-52 8487 MOVQ x_base+0(FP), DI 8488 MOVQ y_base+24(FP), SI 8489 MOVSS a+48(FP), X0 8490 MOVQ x_len+8(FP), DX 8491 TESTQ DX, DX 8492 JE LBB15_7 8493 CMPQ DX, $0x20 8494 JAE LBB15_3 8495 XORL AX, AX 8496 JMP LBB15_6 8497 8498 LBB15_3: 8499 MOVQ DX, AX 8500 ANDQ $-32, AX 8501 VBROADCASTSS X0, Y1 8502 XORL CX, CX 8503 VMOVDQU dataLteNumberF32<>+0(SB), X2 8504 8505 LBB15_4: 8506 VMOVUPS (SI)(CX*4), Y3 8507 VMOVUPS 32(SI)(CX*4), Y4 8508 VMOVUPS 64(SI)(CX*4), Y5 8509 VMOVUPS 96(SI)(CX*4), Y6 8510 VCMPPS $0x02, Y1, Y3, Y3 8511 VEXTRACTF128 $0x01, Y3, X7 8512 VPACKSSDW X7, X3, X3 8513 VPACKSSWB X3, X3, X3 8514 VPAND X2, X3, X3 8515 VCMPPS $0x02, Y1, Y4, Y4 8516 VEXTRACTF128 $0x01, Y4, X7 8517 VPACKSSDW X7, X4, X4 8518 VPACKSSWB X4, X4, X4 8519 VPAND X2, X4, X4 8520 VCMPPS $0x02, Y1, Y5, Y5 8521 VEXTRACTF128 $0x01, Y5, X7 8522 VPACKSSDW X7, X5, X5 8523 VPACKSSWB X5, X5, X5 8524 VPAND X2, X5, X5 8525 VCMPPS $0x02, Y1, Y6, Y6 8526 VEXTRACTF128 $0x01, Y6, X7 8527 VPACKSSDW X7, X6, X6 8528 VPACKSSWB X6, X6, X6 8529 VPAND X2, X6, X6 8530 VINSERTI128 $0x01, X6, Y5, Y5 8531 VINSERTI128 $0x01, X4, Y3, Y3 8532 VPUNPCKLQDQ Y5, Y3, Y3 8533 VPERMQ $0xd8, Y3, Y3 8534 VMOVDQU Y3, (DI)(CX*1) 8535 ADDQ $0x20, CX 8536 CMPQ AX, CX 8537 JNE LBB15_4 8538 CMPQ AX, DX 8539 JE LBB15_7 8540 8541 LBB15_6: 8542 VUCOMISS (SI)(AX*4), X0 8543 SETCC (DI)(AX*1) 8544 ADDQ $0x01, AX 8545 CMPQ DX, AX 8546 JNE LBB15_6 8547 8548 LBB15_7: 8549 VZEROUPPER 8550 RET 8551 8552 DATA dataGtNumberF64<>+0(SB)/1, $0x01 8553 DATA dataGtNumberF64<>+1(SB)/1, $0x01 8554 DATA dataGtNumberF64<>+2(SB)/1, $0x01 8555 DATA dataGtNumberF64<>+3(SB)/1, $0x01 8556 DATA dataGtNumberF64<>+4(SB)/1, $0x00 8557 DATA dataGtNumberF64<>+5(SB)/1, $0x00 8558 DATA dataGtNumberF64<>+6(SB)/1, $0x00 8559 DATA dataGtNumberF64<>+7(SB)/1, $0x00 8560 DATA 
dataGtNumberF64<>+8(SB)/1, $0x00 8561 DATA dataGtNumberF64<>+9(SB)/1, $0x00 8562 DATA dataGtNumberF64<>+10(SB)/1, $0x00 8563 DATA dataGtNumberF64<>+11(SB)/1, $0x00 8564 DATA dataGtNumberF64<>+12(SB)/1, $0x00 8565 DATA dataGtNumberF64<>+13(SB)/1, $0x00 8566 DATA dataGtNumberF64<>+14(SB)/1, $0x00 8567 DATA dataGtNumberF64<>+15(SB)/1, $0x00 8568 GLOBL dataGtNumberF64<>(SB), RODATA|NOPTR, $16 8569 8570 // func GtNumber_AVX2_F64(x []bool, y []float64, a float64) 8571 // Requires: AVX, AVX2, SSE2 8572 TEXT ·GtNumber_AVX2_F64(SB), NOSPLIT, $0-56 8573 MOVQ x_base+0(FP), DI 8574 MOVQ y_base+24(FP), SI 8575 MOVSD a+48(FP), X0 8576 MOVQ x_len+8(FP), DX 8577 TESTQ DX, DX 8578 JE LBB16_7 8579 CMPQ DX, $0x10 8580 JAE LBB16_3 8581 XORL AX, AX 8582 JMP LBB16_6 8583 8584 LBB16_3: 8585 MOVQ DX, AX 8586 ANDQ $-16, AX 8587 VBROADCASTSD X0, Y1 8588 XORL CX, CX 8589 VMOVDQU dataGtNumberF64<>+0(SB), X2 8590 8591 LBB16_4: 8592 VCMPPD $0x01, (SI)(CX*8), Y1, Y3 8593 VEXTRACTF128 $0x01, Y3, X4 8594 VPACKSSDW X4, X3, X3 8595 VPACKSSDW X3, X3, X3 8596 VPACKSSWB X3, X3, X3 8597 VCMPPD $0x01, 32(SI)(CX*8), Y1, Y4 8598 VPAND X2, X3, X3 8599 VEXTRACTF128 $0x01, Y4, X5 8600 VPACKSSDW X5, X4, X4 8601 VPACKSSDW X4, X4, X4 8602 VPACKSSWB X4, X4, X4 8603 VPAND X2, X4, X4 8604 VCMPPD $0x01, 64(SI)(CX*8), Y1, Y5 8605 VPUNPCKLDQ X4, X3, X3 8606 VEXTRACTF128 $0x01, Y5, X4 8607 VPACKSSDW X4, X5, X4 8608 VPACKSSDW X4, X4, X4 8609 VPACKSSWB X4, X4, X4 8610 VPAND X2, X4, X4 8611 VCMPPD $0x01, 96(SI)(CX*8), Y1, Y5 8612 VEXTRACTF128 $0x01, Y5, X6 8613 VPACKSSDW X6, X5, X5 8614 VPACKSSDW X5, X5, X5 8615 VPACKSSWB X5, X5, X5 8616 VPAND X2, X5, X5 8617 VPBROADCASTD X5, X5 8618 VPBROADCASTD X4, X4 8619 VPUNPCKLDQ X5, X4, X4 8620 VPBLENDD $0x0c, X4, X3, X3 8621 VMOVDQU X3, (DI)(CX*1) 8622 ADDQ $0x10, CX 8623 CMPQ AX, CX 8624 JNE LBB16_4 8625 CMPQ AX, DX 8626 JE LBB16_7 8627 8628 LBB16_6: 8629 VUCOMISD (SI)(AX*8), X0 8630 SETCS (DI)(AX*1) 8631 ADDQ $0x01, AX 8632 CMPQ DX, AX 8633 JNE LBB16_6 8634 8635 LBB16_7: 8636 VZEROUPPER 8637 RET 8638 8639 DATA dataGtNumberF32<>+0(SB)/1, $0x01 8640 DATA dataGtNumberF32<>+1(SB)/1, $0x01 8641 DATA dataGtNumberF32<>+2(SB)/1, $0x01 8642 DATA dataGtNumberF32<>+3(SB)/1, $0x01 8643 DATA dataGtNumberF32<>+4(SB)/1, $0x01 8644 DATA dataGtNumberF32<>+5(SB)/1, $0x01 8645 DATA dataGtNumberF32<>+6(SB)/1, $0x01 8646 DATA dataGtNumberF32<>+7(SB)/1, $0x01 8647 DATA dataGtNumberF32<>+8(SB)/1, $0x00 8648 DATA dataGtNumberF32<>+9(SB)/1, $0x00 8649 DATA dataGtNumberF32<>+10(SB)/1, $0x00 8650 DATA dataGtNumberF32<>+11(SB)/1, $0x00 8651 DATA dataGtNumberF32<>+12(SB)/1, $0x00 8652 DATA dataGtNumberF32<>+13(SB)/1, $0x00 8653 DATA dataGtNumberF32<>+14(SB)/1, $0x00 8654 DATA dataGtNumberF32<>+15(SB)/1, $0x00 8655 GLOBL dataGtNumberF32<>(SB), RODATA|NOPTR, $16 8656 8657 // func GtNumber_AVX2_F32(x []bool, y []float32, a float32) 8658 // Requires: AVX, AVX2, SSE 8659 TEXT ·GtNumber_AVX2_F32(SB), NOSPLIT, $0-52 8660 MOVQ x_base+0(FP), DI 8661 MOVQ y_base+24(FP), SI 8662 MOVSS a+48(FP), X0 8663 MOVQ x_len+8(FP), DX 8664 TESTQ DX, DX 8665 JE LBB17_7 8666 CMPQ DX, $0x20 8667 JAE LBB17_3 8668 XORL AX, AX 8669 JMP LBB17_6 8670 8671 LBB17_3: 8672 MOVQ DX, AX 8673 ANDQ $-32, AX 8674 VBROADCASTSS X0, Y1 8675 XORL CX, CX 8676 VMOVDQU dataGtNumberF32<>+0(SB), X2 8677 8678 LBB17_4: 8679 VCMPPS $0x01, (SI)(CX*4), Y1, Y3 8680 VEXTRACTF128 $0x01, Y3, X4 8681 VPACKSSDW X4, X3, X3 8682 VPACKSSWB X3, X3, X3 8683 VCMPPS $0x01, 32(SI)(CX*4), Y1, Y4 8684 VPAND X2, X3, X3 8685 VEXTRACTF128 $0x01, Y4, X5 8686 VPACKSSDW X5, X4, X4 8687 VPACKSSWB X4, X4, X4 
8688 VPAND X2, X4, X4 8689 VCMPPS $0x01, 64(SI)(CX*4), Y1, Y5 8690 VEXTRACTF128 $0x01, Y5, X6 8691 VPACKSSDW X6, X5, X5 8692 VPACKSSWB X5, X5, X5 8693 VCMPPS $0x01, 96(SI)(CX*4), Y1, Y6 8694 VPAND X2, X5, X5 8695 VEXTRACTF128 $0x01, Y6, X7 8696 VPACKSSDW X7, X6, X6 8697 VPACKSSWB X6, X6, X6 8698 VPAND X2, X6, X6 8699 VINSERTI128 $0x01, X6, Y5, Y5 8700 VINSERTI128 $0x01, X4, Y3, Y3 8701 VPUNPCKLQDQ Y5, Y3, Y3 8702 VPERMQ $0xd8, Y3, Y3 8703 VMOVDQU Y3, (DI)(CX*1) 8704 ADDQ $0x20, CX 8705 CMPQ AX, CX 8706 JNE LBB17_4 8707 CMPQ AX, DX 8708 JE LBB17_7 8709 8710 LBB17_6: 8711 VUCOMISS (SI)(AX*4), X0 8712 SETCS (DI)(AX*1) 8713 ADDQ $0x01, AX 8714 CMPQ DX, AX 8715 JNE LBB17_6 8716 8717 LBB17_7: 8718 VZEROUPPER 8719 RET 8720 8721 DATA dataGteNumberF64<>+0(SB)/1, $0x01 8722 DATA dataGteNumberF64<>+1(SB)/1, $0x01 8723 DATA dataGteNumberF64<>+2(SB)/1, $0x01 8724 DATA dataGteNumberF64<>+3(SB)/1, $0x01 8725 DATA dataGteNumberF64<>+4(SB)/1, $0x00 8726 DATA dataGteNumberF64<>+5(SB)/1, $0x00 8727 DATA dataGteNumberF64<>+6(SB)/1, $0x00 8728 DATA dataGteNumberF64<>+7(SB)/1, $0x00 8729 DATA dataGteNumberF64<>+8(SB)/1, $0x00 8730 DATA dataGteNumberF64<>+9(SB)/1, $0x00 8731 DATA dataGteNumberF64<>+10(SB)/1, $0x00 8732 DATA dataGteNumberF64<>+11(SB)/1, $0x00 8733 DATA dataGteNumberF64<>+12(SB)/1, $0x00 8734 DATA dataGteNumberF64<>+13(SB)/1, $0x00 8735 DATA dataGteNumberF64<>+14(SB)/1, $0x00 8736 DATA dataGteNumberF64<>+15(SB)/1, $0x00 8737 GLOBL dataGteNumberF64<>(SB), RODATA|NOPTR, $16 8738 8739 // func GteNumber_AVX2_F64(x []bool, y []float64, a float64) 8740 // Requires: AVX, AVX2, SSE2 8741 TEXT ·GteNumber_AVX2_F64(SB), NOSPLIT, $0-56 8742 MOVQ x_base+0(FP), DI 8743 MOVQ y_base+24(FP), SI 8744 MOVSD a+48(FP), X0 8745 MOVQ x_len+8(FP), DX 8746 TESTQ DX, DX 8747 JE LBB18_7 8748 CMPQ DX, $0x10 8749 JAE LBB18_3 8750 XORL AX, AX 8751 JMP LBB18_6 8752 8753 LBB18_3: 8754 MOVQ DX, AX 8755 ANDQ $-16, AX 8756 VBROADCASTSD X0, Y1 8757 XORL CX, CX 8758 VMOVDQU dataGteNumberF64<>+0(SB), X2 8759 8760 LBB18_4: 8761 VCMPPD $0x02, (SI)(CX*8), Y1, Y3 8762 VEXTRACTF128 $0x01, Y3, X4 8763 VPACKSSDW X4, X3, X3 8764 VPACKSSDW X3, X3, X3 8765 VPACKSSWB X3, X3, X3 8766 VCMPPD $0x02, 32(SI)(CX*8), Y1, Y4 8767 VPAND X2, X3, X3 8768 VEXTRACTF128 $0x01, Y4, X5 8769 VPACKSSDW X5, X4, X4 8770 VPACKSSDW X4, X4, X4 8771 VPACKSSWB X4, X4, X4 8772 VPAND X2, X4, X4 8773 VCMPPD $0x02, 64(SI)(CX*8), Y1, Y5 8774 VPUNPCKLDQ X4, X3, X3 8775 VEXTRACTF128 $0x01, Y5, X4 8776 VPACKSSDW X4, X5, X4 8777 VPACKSSDW X4, X4, X4 8778 VPACKSSWB X4, X4, X4 8779 VPAND X2, X4, X4 8780 VCMPPD $0x02, 96(SI)(CX*8), Y1, Y5 8781 VEXTRACTF128 $0x01, Y5, X6 8782 VPACKSSDW X6, X5, X5 8783 VPACKSSDW X5, X5, X5 8784 VPACKSSWB X5, X5, X5 8785 VPAND X2, X5, X5 8786 VPBROADCASTD X5, X5 8787 VPBROADCASTD X4, X4 8788 VPUNPCKLDQ X5, X4, X4 8789 VPBLENDD $0x0c, X4, X3, X3 8790 VMOVDQU X3, (DI)(CX*1) 8791 ADDQ $0x10, CX 8792 CMPQ AX, CX 8793 JNE LBB18_4 8794 CMPQ AX, DX 8795 JE LBB18_7 8796 8797 LBB18_6: 8798 VUCOMISD (SI)(AX*8), X0 8799 SETLS (DI)(AX*1) 8800 ADDQ $0x01, AX 8801 CMPQ DX, AX 8802 JNE LBB18_6 8803 8804 LBB18_7: 8805 VZEROUPPER 8806 RET 8807 8808 DATA dataGteNumberF32<>+0(SB)/1, $0x01 8809 DATA dataGteNumberF32<>+1(SB)/1, $0x01 8810 DATA dataGteNumberF32<>+2(SB)/1, $0x01 8811 DATA dataGteNumberF32<>+3(SB)/1, $0x01 8812 DATA dataGteNumberF32<>+4(SB)/1, $0x01 8813 DATA dataGteNumberF32<>+5(SB)/1, $0x01 8814 DATA dataGteNumberF32<>+6(SB)/1, $0x01 8815 DATA dataGteNumberF32<>+7(SB)/1, $0x01 8816 DATA dataGteNumberF32<>+8(SB)/1, $0x00 8817 DATA 
dataGteNumberF32<>+9(SB)/1, $0x00 8818 DATA dataGteNumberF32<>+10(SB)/1, $0x00 8819 DATA dataGteNumberF32<>+11(SB)/1, $0x00 8820 DATA dataGteNumberF32<>+12(SB)/1, $0x00 8821 DATA dataGteNumberF32<>+13(SB)/1, $0x00 8822 DATA dataGteNumberF32<>+14(SB)/1, $0x00 8823 DATA dataGteNumberF32<>+15(SB)/1, $0x00 8824 GLOBL dataGteNumberF32<>(SB), RODATA|NOPTR, $16 8825 8826 // func GteNumber_AVX2_F32(x []bool, y []float32, a float32) 8827 // Requires: AVX, AVX2, SSE 8828 TEXT ·GteNumber_AVX2_F32(SB), NOSPLIT, $0-52 8829 MOVQ x_base+0(FP), DI 8830 MOVQ y_base+24(FP), SI 8831 MOVSS a+48(FP), X0 8832 MOVQ x_len+8(FP), DX 8833 TESTQ DX, DX 8834 JE LBB19_7 8835 CMPQ DX, $0x20 8836 JAE LBB19_3 8837 XORL AX, AX 8838 JMP LBB19_6 8839 8840 LBB19_3: 8841 MOVQ DX, AX 8842 ANDQ $-32, AX 8843 VBROADCASTSS X0, Y1 8844 XORL CX, CX 8845 VMOVDQU dataGteNumberF32<>+0(SB), X2 8846 8847 LBB19_4: 8848 VCMPPS $0x02, (SI)(CX*4), Y1, Y3 8849 VEXTRACTF128 $0x01, Y3, X4 8850 VPACKSSDW X4, X3, X3 8851 VPACKSSWB X3, X3, X3 8852 VCMPPS $0x02, 32(SI)(CX*4), Y1, Y4 8853 VPAND X2, X3, X3 8854 VEXTRACTF128 $0x01, Y4, X5 8855 VPACKSSDW X5, X4, X4 8856 VPACKSSWB X4, X4, X4 8857 VPAND X2, X4, X4 8858 VCMPPS $0x02, 64(SI)(CX*4), Y1, Y5 8859 VEXTRACTF128 $0x01, Y5, X6 8860 VPACKSSDW X6, X5, X5 8861 VPACKSSWB X5, X5, X5 8862 VCMPPS $0x02, 96(SI)(CX*4), Y1, Y6 8863 VPAND X2, X5, X5 8864 VEXTRACTF128 $0x01, Y6, X7 8865 VPACKSSDW X7, X6, X6 8866 VPACKSSWB X6, X6, X6 8867 VPAND X2, X6, X6 8868 VINSERTI128 $0x01, X6, Y5, Y5 8869 VINSERTI128 $0x01, X4, Y3, Y3 8870 VPUNPCKLQDQ Y5, Y3, Y3 8871 VPERMQ $0xd8, Y3, Y3 8872 VMOVDQU Y3, (DI)(CX*1) 8873 ADDQ $0x20, CX 8874 CMPQ AX, CX 8875 JNE LBB19_4 8876 CMPQ AX, DX 8877 JE LBB19_7 8878 8879 LBB19_6: 8880 VUCOMISS (SI)(AX*4), X0 8881 SETLS (DI)(AX*1) 8882 ADDQ $0x01, AX 8883 CMPQ DX, AX 8884 JNE LBB19_6 8885 8886 LBB19_7: 8887 VZEROUPPER 8888 RET 8889 8890 DATA dataEqNumberF64<>+0(SB)/1, $0x01 8891 DATA dataEqNumberF64<>+1(SB)/1, $0x01 8892 DATA dataEqNumberF64<>+2(SB)/1, $0x01 8893 DATA dataEqNumberF64<>+3(SB)/1, $0x01 8894 DATA dataEqNumberF64<>+4(SB)/1, $0x00 8895 DATA dataEqNumberF64<>+5(SB)/1, $0x00 8896 DATA dataEqNumberF64<>+6(SB)/1, $0x00 8897 DATA dataEqNumberF64<>+7(SB)/1, $0x00 8898 DATA dataEqNumberF64<>+8(SB)/1, $0x00 8899 DATA dataEqNumberF64<>+9(SB)/1, $0x00 8900 DATA dataEqNumberF64<>+10(SB)/1, $0x00 8901 DATA dataEqNumberF64<>+11(SB)/1, $0x00 8902 DATA dataEqNumberF64<>+12(SB)/1, $0x00 8903 DATA dataEqNumberF64<>+13(SB)/1, $0x00 8904 DATA dataEqNumberF64<>+14(SB)/1, $0x00 8905 DATA dataEqNumberF64<>+15(SB)/1, $0x00 8906 GLOBL dataEqNumberF64<>(SB), RODATA|NOPTR, $16 8907 8908 // func EqNumber_AVX2_F64(x []bool, y []float64, a float64) 8909 // Requires: AVX, AVX2, SSE2 8910 TEXT ·EqNumber_AVX2_F64(SB), NOSPLIT, $0-56 8911 MOVQ x_base+0(FP), DI 8912 MOVQ y_base+24(FP), SI 8913 MOVSD a+48(FP), X0 8914 MOVQ x_len+8(FP), DX 8915 TESTQ DX, DX 8916 JE LBB20_7 8917 CMPQ DX, $0x10 8918 JAE LBB20_3 8919 XORL AX, AX 8920 JMP LBB20_6 8921 8922 LBB20_3: 8923 MOVQ DX, AX 8924 ANDQ $-16, AX 8925 VBROADCASTSD X0, Y1 8926 XORL CX, CX 8927 VMOVDQU dataEqNumberF64<>+0(SB), X2 8928 8929 LBB20_4: 8930 VCMPPD $0x00, (SI)(CX*8), Y1, Y3 8931 VEXTRACTF128 $0x01, Y3, X4 8932 VPACKSSDW X4, X3, X3 8933 VPACKSSDW X3, X3, X3 8934 VPACKSSWB X3, X3, X3 8935 VCMPPD $0x00, 32(SI)(CX*8), Y1, Y4 8936 VPAND X2, X3, X3 8937 VEXTRACTF128 $0x01, Y4, X5 8938 VPACKSSDW X5, X4, X4 8939 VPACKSSDW X4, X4, X4 8940 VPACKSSWB X4, X4, X4 8941 VPAND X2, X4, X4 8942 VCMPPD $0x00, 64(SI)(CX*8), Y1, Y5 8943 VPUNPCKLDQ X4, X3, X3 8944 
VEXTRACTF128 $0x01, Y5, X4 8945 VPACKSSDW X4, X5, X4 8946 VPACKSSDW X4, X4, X4 8947 VPACKSSWB X4, X4, X4 8948 VPAND X2, X4, X4 8949 VCMPPD $0x00, 96(SI)(CX*8), Y1, Y5 8950 VEXTRACTF128 $0x01, Y5, X6 8951 VPACKSSDW X6, X5, X5 8952 VPACKSSDW X5, X5, X5 8953 VPACKSSWB X5, X5, X5 8954 VPAND X2, X5, X5 8955 VPBROADCASTD X5, X5 8956 VPBROADCASTD X4, X4 8957 VPUNPCKLDQ X5, X4, X4 8958 VPBLENDD $0x0c, X4, X3, X3 8959 VMOVDQU X3, (DI)(CX*1) 8960 ADDQ $0x10, CX 8961 CMPQ AX, CX 8962 JNE LBB20_4 8963 CMPQ AX, DX 8964 JE LBB20_7 8965 8966 LBB20_6: 8967 VUCOMISD (SI)(AX*8), X0 8968 SETEQ (DI)(AX*1) 8969 ADDQ $0x01, AX 8970 CMPQ DX, AX 8971 JNE LBB20_6 8972 8973 LBB20_7: 8974 VZEROUPPER 8975 RET 8976 8977 DATA dataEqNumberF32<>+0(SB)/1, $0x01 8978 DATA dataEqNumberF32<>+1(SB)/1, $0x01 8979 DATA dataEqNumberF32<>+2(SB)/1, $0x01 8980 DATA dataEqNumberF32<>+3(SB)/1, $0x01 8981 DATA dataEqNumberF32<>+4(SB)/1, $0x01 8982 DATA dataEqNumberF32<>+5(SB)/1, $0x01 8983 DATA dataEqNumberF32<>+6(SB)/1, $0x01 8984 DATA dataEqNumberF32<>+7(SB)/1, $0x01 8985 DATA dataEqNumberF32<>+8(SB)/1, $0x00 8986 DATA dataEqNumberF32<>+9(SB)/1, $0x00 8987 DATA dataEqNumberF32<>+10(SB)/1, $0x00 8988 DATA dataEqNumberF32<>+11(SB)/1, $0x00 8989 DATA dataEqNumberF32<>+12(SB)/1, $0x00 8990 DATA dataEqNumberF32<>+13(SB)/1, $0x00 8991 DATA dataEqNumberF32<>+14(SB)/1, $0x00 8992 DATA dataEqNumberF32<>+15(SB)/1, $0x00 8993 GLOBL dataEqNumberF32<>(SB), RODATA|NOPTR, $16 8994 8995 // func EqNumber_AVX2_F32(x []bool, y []float32, a float32) 8996 // Requires: AVX, AVX2, SSE 8997 TEXT ·EqNumber_AVX2_F32(SB), NOSPLIT, $0-52 8998 MOVQ x_base+0(FP), DI 8999 MOVQ y_base+24(FP), SI 9000 MOVSS a+48(FP), X0 9001 MOVQ x_len+8(FP), DX 9002 TESTQ DX, DX 9003 JE LBB21_7 9004 CMPQ DX, $0x20 9005 JAE LBB21_3 9006 XORL AX, AX 9007 JMP LBB21_6 9008 9009 LBB21_3: 9010 MOVQ DX, AX 9011 ANDQ $-32, AX 9012 VBROADCASTSS X0, Y1 9013 XORL CX, CX 9014 VMOVDQU dataEqNumberF32<>+0(SB), X2 9015 9016 LBB21_4: 9017 VCMPPS $0x00, (SI)(CX*4), Y1, Y3 9018 VEXTRACTF128 $0x01, Y3, X4 9019 VPACKSSDW X4, X3, X3 9020 VPACKSSWB X3, X3, X3 9021 VCMPPS $0x00, 32(SI)(CX*4), Y1, Y4 9022 VPAND X2, X3, X3 9023 VEXTRACTF128 $0x01, Y4, X5 9024 VPACKSSDW X5, X4, X4 9025 VPACKSSWB X4, X4, X4 9026 VPAND X2, X4, X4 9027 VCMPPS $0x00, 64(SI)(CX*4), Y1, Y5 9028 VEXTRACTF128 $0x01, Y5, X6 9029 VPACKSSDW X6, X5, X5 9030 VPACKSSWB X5, X5, X5 9031 VCMPPS $0x00, 96(SI)(CX*4), Y1, Y6 9032 VPAND X2, X5, X5 9033 VEXTRACTF128 $0x01, Y6, X7 9034 VPACKSSDW X7, X6, X6 9035 VPACKSSWB X6, X6, X6 9036 VPAND X2, X6, X6 9037 VINSERTI128 $0x01, X6, Y5, Y5 9038 VINSERTI128 $0x01, X4, Y3, Y3 9039 VPUNPCKLQDQ Y5, Y3, Y3 9040 VPERMQ $0xd8, Y3, Y3 9041 VMOVDQU Y3, (DI)(CX*1) 9042 ADDQ $0x20, CX 9043 CMPQ AX, CX 9044 JNE LBB21_4 9045 CMPQ AX, DX 9046 JE LBB21_7 9047 9048 LBB21_6: 9049 VUCOMISS (SI)(AX*4), X0 9050 SETEQ (DI)(AX*1) 9051 ADDQ $0x01, AX 9052 CMPQ DX, AX 9053 JNE LBB21_6 9054 9055 LBB21_7: 9056 VZEROUPPER 9057 RET 9058 9059 DATA dataNeqNumberF64<>+0(SB)/1, $0x01 9060 DATA dataNeqNumberF64<>+1(SB)/1, $0x01 9061 DATA dataNeqNumberF64<>+2(SB)/1, $0x01 9062 DATA dataNeqNumberF64<>+3(SB)/1, $0x01 9063 DATA dataNeqNumberF64<>+4(SB)/1, $0x00 9064 DATA dataNeqNumberF64<>+5(SB)/1, $0x00 9065 DATA dataNeqNumberF64<>+6(SB)/1, $0x00 9066 DATA dataNeqNumberF64<>+7(SB)/1, $0x00 9067 DATA dataNeqNumberF64<>+8(SB)/1, $0x00 9068 DATA dataNeqNumberF64<>+9(SB)/1, $0x00 9069 DATA dataNeqNumberF64<>+10(SB)/1, $0x00 9070 DATA dataNeqNumberF64<>+11(SB)/1, $0x00 9071 DATA dataNeqNumberF64<>+12(SB)/1, $0x00 9072 DATA 
dataNeqNumberF64<>+13(SB)/1, $0x00 9073 DATA dataNeqNumberF64<>+14(SB)/1, $0x00 9074 DATA dataNeqNumberF64<>+15(SB)/1, $0x00 9075 GLOBL dataNeqNumberF64<>(SB), RODATA|NOPTR, $16 9076 9077 // func NeqNumber_AVX2_F64(x []bool, y []float64, a float64) 9078 // Requires: AVX, AVX2, SSE2 9079 TEXT ·NeqNumber_AVX2_F64(SB), NOSPLIT, $0-56 9080 MOVQ x_base+0(FP), DI 9081 MOVQ y_base+24(FP), SI 9082 MOVSD a+48(FP), X0 9083 MOVQ x_len+8(FP), DX 9084 TESTQ DX, DX 9085 JE LBB22_7 9086 CMPQ DX, $0x10 9087 JAE LBB22_3 9088 XORL AX, AX 9089 JMP LBB22_6 9090 9091 LBB22_3: 9092 MOVQ DX, AX 9093 ANDQ $-16, AX 9094 VBROADCASTSD X0, Y1 9095 XORL CX, CX 9096 VMOVDQU dataNeqNumberF64<>+0(SB), X2 9097 9098 LBB22_4: 9099 VCMPPD $0x04, (SI)(CX*8), Y1, Y3 9100 VEXTRACTF128 $0x01, Y3, X4 9101 VPACKSSDW X4, X3, X3 9102 VPACKSSDW X3, X3, X3 9103 VPACKSSWB X3, X3, X3 9104 VCMPPD $0x04, 32(SI)(CX*8), Y1, Y4 9105 VPAND X2, X3, X3 9106 VEXTRACTF128 $0x01, Y4, X5 9107 VPACKSSDW X5, X4, X4 9108 VPACKSSDW X4, X4, X4 9109 VPACKSSWB X4, X4, X4 9110 VPAND X2, X4, X4 9111 VCMPPD $0x04, 64(SI)(CX*8), Y1, Y5 9112 VPUNPCKLDQ X4, X3, X3 9113 VEXTRACTF128 $0x01, Y5, X4 9114 VPACKSSDW X4, X5, X4 9115 VPACKSSDW X4, X4, X4 9116 VPACKSSWB X4, X4, X4 9117 VPAND X2, X4, X4 9118 VCMPPD $0x04, 96(SI)(CX*8), Y1, Y5 9119 VEXTRACTF128 $0x01, Y5, X6 9120 VPACKSSDW X6, X5, X5 9121 VPACKSSDW X5, X5, X5 9122 VPACKSSWB X5, X5, X5 9123 VPAND X2, X5, X5 9124 VPBROADCASTD X5, X5 9125 VPBROADCASTD X4, X4 9126 VPUNPCKLDQ X5, X4, X4 9127 VPBLENDD $0x0c, X4, X3, X3 9128 VMOVDQU X3, (DI)(CX*1) 9129 ADDQ $0x10, CX 9130 CMPQ AX, CX 9131 JNE LBB22_4 9132 CMPQ AX, DX 9133 JE LBB22_7 9134 9135 LBB22_6: 9136 VUCOMISD (SI)(AX*8), X0 9137 SETNE (DI)(AX*1) 9138 ADDQ $0x01, AX 9139 CMPQ DX, AX 9140 JNE LBB22_6 9141 9142 LBB22_7: 9143 VZEROUPPER 9144 RET 9145 9146 DATA dataNeqNumberF32<>+0(SB)/1, $0x01 9147 DATA dataNeqNumberF32<>+1(SB)/1, $0x01 9148 DATA dataNeqNumberF32<>+2(SB)/1, $0x01 9149 DATA dataNeqNumberF32<>+3(SB)/1, $0x01 9150 DATA dataNeqNumberF32<>+4(SB)/1, $0x01 9151 DATA dataNeqNumberF32<>+5(SB)/1, $0x01 9152 DATA dataNeqNumberF32<>+6(SB)/1, $0x01 9153 DATA dataNeqNumberF32<>+7(SB)/1, $0x01 9154 DATA dataNeqNumberF32<>+8(SB)/1, $0x00 9155 DATA dataNeqNumberF32<>+9(SB)/1, $0x00 9156 DATA dataNeqNumberF32<>+10(SB)/1, $0x00 9157 DATA dataNeqNumberF32<>+11(SB)/1, $0x00 9158 DATA dataNeqNumberF32<>+12(SB)/1, $0x00 9159 DATA dataNeqNumberF32<>+13(SB)/1, $0x00 9160 DATA dataNeqNumberF32<>+14(SB)/1, $0x00 9161 DATA dataNeqNumberF32<>+15(SB)/1, $0x00 9162 GLOBL dataNeqNumberF32<>(SB), RODATA|NOPTR, $16 9163 9164 // func NeqNumber_AVX2_F32(x []bool, y []float32, a float32) 9165 // Requires: AVX, AVX2, SSE 9166 TEXT ·NeqNumber_AVX2_F32(SB), NOSPLIT, $0-52 9167 MOVQ x_base+0(FP), DI 9168 MOVQ y_base+24(FP), SI 9169 MOVSS a+48(FP), X0 9170 MOVQ x_len+8(FP), DX 9171 TESTQ DX, DX 9172 JE LBB23_7 9173 CMPQ DX, $0x20 9174 JAE LBB23_3 9175 XORL AX, AX 9176 JMP LBB23_6 9177 9178 LBB23_3: 9179 MOVQ DX, AX 9180 ANDQ $-32, AX 9181 VBROADCASTSS X0, Y1 9182 XORL CX, CX 9183 VMOVDQU dataNeqNumberF32<>+0(SB), X2 9184 9185 LBB23_4: 9186 VCMPPS $0x04, (SI)(CX*4), Y1, Y3 9187 VEXTRACTF128 $0x01, Y3, X4 9188 VPACKSSDW X4, X3, X3 9189 VPACKSSWB X3, X3, X3 9190 VCMPPS $0x04, 32(SI)(CX*4), Y1, Y4 9191 VPAND X2, X3, X3 9192 VEXTRACTF128 $0x01, Y4, X5 9193 VPACKSSDW X5, X4, X4 9194 VPACKSSWB X4, X4, X4 9195 VPAND X2, X4, X4 9196 VCMPPS $0x04, 64(SI)(CX*4), Y1, Y5 9197 VEXTRACTF128 $0x01, Y5, X6 9198 VPACKSSDW X6, X5, X5 9199 VPACKSSWB X5, X5, X5 9200 VCMPPS $0x04, 96(SI)(CX*4), Y1, Y6 9201 
VPAND X2, X5, X5 9202 VEXTRACTF128 $0x01, Y6, X7 9203 VPACKSSDW X7, X6, X6 9204 VPACKSSWB X6, X6, X6 9205 VPAND X2, X6, X6 9206 VINSERTI128 $0x01, X6, Y5, Y5 9207 VINSERTI128 $0x01, X4, Y3, Y3 9208 VPUNPCKLQDQ Y5, Y3, Y3 9209 VPERMQ $0xd8, Y3, Y3 9210 VMOVDQU Y3, (DI)(CX*1) 9211 ADDQ $0x20, CX 9212 CMPQ AX, CX 9213 JNE LBB23_4 9214 CMPQ AX, DX 9215 JE LBB23_7 9216 9217 LBB23_6: 9218 VUCOMISS (SI)(AX*4), X0 9219 SETNE (DI)(AX*1) 9220 ADDQ $0x01, AX 9221 CMPQ DX, AX 9222 JNE LBB23_6 9223 9224 LBB23_7: 9225 VZEROUPPER 9226 RET 9227 9228 DATA dataNot<>+0(SB)/1, $0x01 9229 DATA dataNot<>+1(SB)/1, $0x01 9230 DATA dataNot<>+2(SB)/1, $0x01 9231 DATA dataNot<>+3(SB)/1, $0x01 9232 DATA dataNot<>+4(SB)/1, $0x01 9233 DATA dataNot<>+5(SB)/1, $0x01 9234 DATA dataNot<>+6(SB)/1, $0x01 9235 DATA dataNot<>+7(SB)/1, $0x01 9236 DATA dataNot<>+8(SB)/1, $0x01 9237 DATA dataNot<>+9(SB)/1, $0x01 9238 DATA dataNot<>+10(SB)/1, $0x01 9239 DATA dataNot<>+11(SB)/1, $0x01 9240 DATA dataNot<>+12(SB)/1, $0x01 9241 DATA dataNot<>+13(SB)/1, $0x01 9242 DATA dataNot<>+14(SB)/1, $0x01 9243 DATA dataNot<>+15(SB)/1, $0x01 9244 DATA dataNot<>+16(SB)/1, $0x01 9245 DATA dataNot<>+17(SB)/1, $0x01 9246 DATA dataNot<>+18(SB)/1, $0x01 9247 DATA dataNot<>+19(SB)/1, $0x01 9248 DATA dataNot<>+20(SB)/1, $0x01 9249 DATA dataNot<>+21(SB)/1, $0x01 9250 DATA dataNot<>+22(SB)/1, $0x01 9251 DATA dataNot<>+23(SB)/1, $0x01 9252 DATA dataNot<>+24(SB)/1, $0x01 9253 DATA dataNot<>+25(SB)/1, $0x01 9254 DATA dataNot<>+26(SB)/1, $0x01 9255 DATA dataNot<>+27(SB)/1, $0x01 9256 DATA dataNot<>+28(SB)/1, $0x01 9257 DATA dataNot<>+29(SB)/1, $0x01 9258 DATA dataNot<>+30(SB)/1, $0x01 9259 DATA dataNot<>+31(SB)/1, $0x01 9260 GLOBL dataNot<>(SB), RODATA|NOPTR, $32 9261 9262 // func Not_AVX2(x []bool) 9263 // Requires: AVX 9264 TEXT ·Not_AVX2(SB), NOSPLIT, $0-24 9265 MOVQ x_base+0(FP), DI 9266 MOVQ x_len+8(FP), SI 9267 TESTQ SI, SI 9268 JE LBB0_17 9269 CMPQ SI, $0x10 9270 JAE LBB0_3 9271 XORL AX, AX 9272 JMP LBB0_16 9273 9274 LBB0_3: 9275 CMPQ SI, $0x80 9276 JAE LBB0_5 9277 XORL AX, AX 9278 JMP LBB0_13 9279 9280 LBB0_5: 9281 MOVQ SI, AX 9282 ANDQ $-128, AX 9283 LEAQ -128(AX), CX 9284 MOVQ CX, R8 9285 SHRQ $0x07, R8 9286 ADDQ $0x01, R8 9287 TESTQ CX, CX 9288 JE LBB0_6 9289 MOVQ R8, DX 9290 ANDQ $-2, DX 9291 XORL CX, CX 9292 VMOVUPS dataNot<>+0(SB), Y0 9293 9294 LBB0_8: 9295 VXORPS (DI)(CX*1), Y0, Y1 9296 VXORPS 32(DI)(CX*1), Y0, Y2 9297 VXORPS 64(DI)(CX*1), Y0, Y3 9298 VXORPS 96(DI)(CX*1), Y0, Y4 9299 VMOVUPS Y1, (DI)(CX*1) 9300 VMOVUPS Y2, 32(DI)(CX*1) 9301 VMOVUPS Y3, 64(DI)(CX*1) 9302 VMOVUPS Y4, 96(DI)(CX*1) 9303 VXORPS 128(DI)(CX*1), Y0, Y1 9304 VXORPS 160(DI)(CX*1), Y0, Y2 9305 VXORPS 192(DI)(CX*1), Y0, Y3 9306 VXORPS 224(DI)(CX*1), Y0, Y4 9307 VMOVUPS Y1, 128(DI)(CX*1) 9308 VMOVUPS Y2, 160(DI)(CX*1) 9309 VMOVUPS Y3, 192(DI)(CX*1) 9310 VMOVUPS Y4, 224(DI)(CX*1) 9311 ADDQ $+256, CX 9312 ADDQ $-2, DX 9313 JNE LBB0_8 9314 TESTB $0x01, R8 9315 JE LBB0_11 9316 9317 LBB0_10: 9318 VMOVUPS dataNot<>+0(SB), Y0 9319 VXORPS (DI)(CX*1), Y0, Y1 9320 VXORPS 32(DI)(CX*1), Y0, Y2 9321 VXORPS 64(DI)(CX*1), Y0, Y3 9322 VXORPS 96(DI)(CX*1), Y0, Y0 9323 VMOVUPS Y1, (DI)(CX*1) 9324 VMOVUPS Y2, 32(DI)(CX*1) 9325 VMOVUPS Y3, 64(DI)(CX*1) 9326 VMOVUPS Y0, 96(DI)(CX*1) 9327 9328 LBB0_11: 9329 CMPQ AX, SI 9330 JE LBB0_17 9331 TESTB $0x70, SI 9332 JE LBB0_16 9333 9334 LBB0_13: 9335 MOVQ AX, CX 9336 MOVQ SI, AX 9337 ANDQ $-16, AX 9338 VMOVUPS dataNot<>+0(SB), X0 9339 9340 LBB0_14: 9341 VXORPS (DI)(CX*1), X0, X1 9342 VMOVUPS X1, (DI)(CX*1) 9343 ADDQ $0x10, CX 9344 CMPQ AX, CX 9345 JNE 
LBB0_14 9346 CMPQ AX, SI 9347 JE LBB0_17 9348 9349 LBB0_16: 9350 XORB $0x01, (DI)(AX*1) 9351 ADDQ $0x01, AX 9352 CMPQ SI, AX 9353 JNE LBB0_16 9354 9355 LBB0_17: 9356 VZEROUPPER 9357 RET 9358 9359 LBB0_6: 9360 XORL CX, CX 9361 TESTB $0x01, R8 9362 JNE LBB0_10 9363 JMP LBB0_11 9364 9365 // func And_AVX2(x []bool, y []bool) 9366 // Requires: AVX 9367 TEXT ·And_AVX2(SB), NOSPLIT, $0-48 9368 MOVQ x_base+0(FP), DI 9369 MOVQ y_base+24(FP), SI 9370 MOVQ x_len+8(FP), DX 9371 TESTQ DX, DX 9372 JE LBB1_13 9373 CMPQ DX, $0x10 9374 JAE LBB1_3 9375 XORL AX, AX 9376 JMP LBB1_12 9377 9378 LBB1_3: 9379 CMPQ DX, $0x80 9380 JAE LBB1_5 9381 XORL AX, AX 9382 JMP LBB1_9 9383 9384 LBB1_5: 9385 MOVQ DX, AX 9386 ANDQ $-128, AX 9387 XORL CX, CX 9388 9389 LBB1_6: 9390 VMOVUPS (SI)(CX*1), Y0 9391 VMOVUPS 32(SI)(CX*1), Y1 9392 VMOVUPS 64(SI)(CX*1), Y2 9393 VMOVUPS 96(SI)(CX*1), Y3 9394 VANDPS (DI)(CX*1), Y0, Y0 9395 VANDPS 32(DI)(CX*1), Y1, Y1 9396 VANDPS 64(DI)(CX*1), Y2, Y2 9397 VANDPS 96(DI)(CX*1), Y3, Y3 9398 VMOVUPS Y0, (DI)(CX*1) 9399 VMOVUPS Y1, 32(DI)(CX*1) 9400 VMOVUPS Y2, 64(DI)(CX*1) 9401 VMOVUPS Y3, 96(DI)(CX*1) 9402 SUBQ $-128, CX 9403 CMPQ AX, CX 9404 JNE LBB1_6 9405 CMPQ AX, DX 9406 JE LBB1_13 9407 TESTB $0x70, DL 9408 JE LBB1_12 9409 9410 LBB1_9: 9411 MOVQ AX, CX 9412 MOVQ DX, AX 9413 ANDQ $-16, AX 9414 9415 LBB1_10: 9416 VMOVUPS (SI)(CX*1), X0 9417 VANDPS (DI)(CX*1), X0, X0 9418 VMOVUPS X0, (DI)(CX*1) 9419 ADDQ $0x10, CX 9420 CMPQ AX, CX 9421 JNE LBB1_10 9422 CMPQ AX, DX 9423 JE LBB1_13 9424 9425 LBB1_12: 9426 MOVBLZX (SI)(AX*1), CX 9427 ANDB CL, (DI)(AX*1) 9428 ADDQ $0x01, AX 9429 CMPQ DX, AX 9430 JNE LBB1_12 9431 9432 LBB1_13: 9433 VZEROUPPER 9434 RET 9435 9436 // func Or_AVX2(x []bool, y []bool) 9437 // Requires: AVX 9438 TEXT ·Or_AVX2(SB), NOSPLIT, $0-48 9439 MOVQ x_base+0(FP), DI 9440 MOVQ y_base+24(FP), SI 9441 MOVQ x_len+8(FP), DX 9442 TESTQ DX, DX 9443 JE LBB2_13 9444 CMPQ DX, $0x10 9445 JAE LBB2_3 9446 XORL AX, AX 9447 JMP LBB2_12 9448 9449 LBB2_3: 9450 CMPQ DX, $0x80 9451 JAE LBB2_5 9452 XORL AX, AX 9453 JMP LBB2_9 9454 9455 LBB2_5: 9456 MOVQ DX, AX 9457 ANDQ $-128, AX 9458 XORL CX, CX 9459 9460 LBB2_6: 9461 VMOVUPS (SI)(CX*1), Y0 9462 VMOVUPS 32(SI)(CX*1), Y1 9463 VMOVUPS 64(SI)(CX*1), Y2 9464 VMOVUPS 96(SI)(CX*1), Y3 9465 VORPS (DI)(CX*1), Y0, Y0 9466 VORPS 32(DI)(CX*1), Y1, Y1 9467 VORPS 64(DI)(CX*1), Y2, Y2 9468 VORPS 96(DI)(CX*1), Y3, Y3 9469 VMOVUPS Y0, (DI)(CX*1) 9470 VMOVUPS Y1, 32(DI)(CX*1) 9471 VMOVUPS Y2, 64(DI)(CX*1) 9472 VMOVUPS Y3, 96(DI)(CX*1) 9473 SUBQ $-128, CX 9474 CMPQ AX, CX 9475 JNE LBB2_6 9476 CMPQ AX, DX 9477 JE LBB2_13 9478 TESTB $0x70, DL 9479 JE LBB2_12 9480 9481 LBB2_9: 9482 MOVQ AX, CX 9483 MOVQ DX, AX 9484 ANDQ $-16, AX 9485 9486 LBB2_10: 9487 VMOVUPS (SI)(CX*1), X0 9488 VORPS (DI)(CX*1), X0, X0 9489 VMOVUPS X0, (DI)(CX*1) 9490 ADDQ $0x10, CX 9491 CMPQ AX, CX 9492 JNE LBB2_10 9493 CMPQ AX, DX 9494 JE LBB2_13 9495 9496 LBB2_12: 9497 MOVBLZX (SI)(AX*1), CX 9498 ORB CL, (DI)(AX*1) 9499 ADDQ $0x01, AX 9500 CMPQ DX, AX 9501 JNE LBB2_12 9502 9503 LBB2_13: 9504 VZEROUPPER 9505 RET 9506 9507 // func Xor_AVX2(x []bool, y []bool) 9508 // Requires: AVX 9509 TEXT ·Xor_AVX2(SB), NOSPLIT, $0-48 9510 MOVQ x_base+0(FP), DI 9511 MOVQ y_base+24(FP), SI 9512 MOVQ x_len+8(FP), DX 9513 TESTQ DX, DX 9514 JE LBB3_13 9515 CMPQ DX, $0x10 9516 JAE LBB3_3 9517 XORL AX, AX 9518 JMP LBB3_12 9519 9520 LBB3_3: 9521 CMPQ DX, $0x80 9522 JAE LBB3_5 9523 XORL AX, AX 9524 JMP LBB3_9 9525 9526 LBB3_5: 9527 MOVQ DX, AX 9528 ANDQ $-128, AX 9529 XORL CX, CX 9530 9531 LBB3_6: 9532 VMOVUPS (SI)(CX*1), Y0 
9533 VMOVUPS 32(SI)(CX*1), Y1 9534 VMOVUPS 64(SI)(CX*1), Y2 9535 VMOVUPS 96(SI)(CX*1), Y3 9536 VXORPS (DI)(CX*1), Y0, Y0 9537 VXORPS 32(DI)(CX*1), Y1, Y1 9538 VXORPS 64(DI)(CX*1), Y2, Y2 9539 VXORPS 96(DI)(CX*1), Y3, Y3 9540 VMOVUPS Y0, (DI)(CX*1) 9541 VMOVUPS Y1, 32(DI)(CX*1) 9542 VMOVUPS Y2, 64(DI)(CX*1) 9543 VMOVUPS Y3, 96(DI)(CX*1) 9544 SUBQ $-128, CX 9545 CMPQ AX, CX 9546 JNE LBB3_6 9547 CMPQ AX, DX 9548 JE LBB3_13 9549 TESTB $0x70, DL 9550 JE LBB3_12 9551 9552 LBB3_9: 9553 MOVQ AX, CX 9554 MOVQ DX, AX 9555 ANDQ $-16, AX 9556 9557 LBB3_10: 9558 VMOVUPS (SI)(CX*1), X0 9559 VXORPS (DI)(CX*1), X0, X0 9560 VMOVUPS X0, (DI)(CX*1) 9561 ADDQ $0x10, CX 9562 CMPQ AX, CX 9563 JNE LBB3_10 9564 CMPQ AX, DX 9565 JE LBB3_13 9566 9567 LBB3_12: 9568 MOVBLZX (SI)(AX*1), CX 9569 XORB CL, (DI)(AX*1) 9570 ADDQ $0x01, AX 9571 CMPQ DX, AX 9572 JNE LBB3_12 9573 9574 LBB3_13: 9575 VZEROUPPER 9576 RET 9577 9578 // func All_AVX2(x []bool) int 9579 // Requires: AVX, AVX2 9580 TEXT ·All_AVX2(SB), NOSPLIT, $0-32 9581 MOVQ x_base+0(FP), DI 9582 MOVQ x_len+8(FP), SI 9583 MOVQ SI, AX 9584 XORL CX, CX 9585 ANDQ $-32, AX 9586 JE LBB0_1 9587 VPXOR X0, X0, X0 9588 9589 LBB0_8: 9590 VPCMPEQB (DI)(CX*1), Y0, Y1 9591 VPTEST Y1, Y1 9592 JNE LBB0_9 9593 ADDQ $0x20, CX 9594 CMPQ CX, AX 9595 JB LBB0_8 9596 9597 LBB0_1: 9598 MOVB $0x01, AL 9599 CMPQ CX, SI 9600 JAE LBB0_6 9601 ADDQ $-1, SI 9602 9603 LBB0_3: 9604 MOVBLZX (DI)(CX*1), AX 9605 TESTB AL, AL 9606 JE LBB0_5 9607 LEAQ 1(CX), DX 9608 CMPQ SI, CX 9609 MOVQ DX, CX 9610 JNE LBB0_3 9611 9612 LBB0_5: 9613 TESTB AL, AL 9614 SETNE AL 9615 9616 LBB0_6: 9617 VZEROUPPER 9618 MOVQ AX, ret+24(FP) 9619 RET 9620 9621 LBB0_9: 9622 XORL AX, AX 9623 VZEROUPPER 9624 MOVQ AX, ret+24(FP) 9625 RET 9626 9627 // func Any_AVX2(x []bool) int 9628 // Requires: AVX 9629 TEXT ·Any_AVX2(SB), NOSPLIT, $0-32 9630 MOVQ x_base+0(FP), DI 9631 MOVQ x_len+8(FP), SI 9632 MOVQ SI, CX 9633 XORL AX, AX 9634 ANDQ $-32, CX 9635 JE LBB1_1 9636 9637 LBB1_4: 9638 VMOVDQU (DI)(AX*1), Y0 9639 VPTEST Y0, Y0 9640 JNE LBB1_5 9641 ADDQ $0x20, AX 9642 CMPQ AX, CX 9643 JB LBB1_4 9644 9645 LBB1_1: 9646 CMPQ AX, SI 9647 JAE LBB1_2 9648 ADDQ $-1, SI 9649 9650 LBB1_7: 9651 MOVBLZX (DI)(AX*1), CX 9652 TESTB CL, CL 9653 JNE LBB1_9 9654 LEAQ 1(AX), DX 9655 CMPQ SI, AX 9656 MOVQ DX, AX 9657 JNE LBB1_7 9658 9659 LBB1_9: 9660 TESTB CL, CL 9661 SETNE AL 9662 VZEROUPPER 9663 MOVQ AX, ret+24(FP) 9664 RET 9665 9666 LBB1_5: 9667 MOVB $0x01, AL 9668 VZEROUPPER 9669 MOVQ AX, ret+24(FP) 9670 RET 9671 9672 LBB1_2: 9673 XORL AX, AX 9674 VZEROUPPER 9675 MOVQ AX, ret+24(FP) 9676 RET 9677 9678 // func None_AVX2(x []bool) int 9679 // Requires: AVX 9680 TEXT ·None_AVX2(SB), NOSPLIT, $0-32 9681 MOVQ x_base+0(FP), DI 9682 MOVQ x_len+8(FP), SI 9683 MOVQ SI, AX 9684 XORL CX, CX 9685 ANDQ $-32, AX 9686 JE LBB2_1 9687 9688 LBB2_7: 9689 VMOVDQU (DI)(CX*1), Y0 9690 VPTEST Y0, Y0 9691 JNE LBB2_8 9692 ADDQ $0x20, CX 9693 CMPQ CX, AX 9694 JB LBB2_7 9695 9696 LBB2_1: 9697 MOVB $0x01, AL 9698 CMPQ CX, SI 9699 JAE LBB2_5 9700 ADDQ $-1, SI 9701 9702 LBB2_3: 9703 CMPB (DI)(CX*1), $0x00 9704 SETEQ AL 9705 JNE LBB2_5 9706 LEAQ 1(CX), DX 9707 CMPQ SI, CX 9708 MOVQ DX, CX 9709 JNE LBB2_3 9710 9711 LBB2_5: 9712 VZEROUPPER 9713 MOVQ AX, ret+24(FP) 9714 RET 9715 9716 LBB2_8: 9717 XORL AX, AX 9718 VZEROUPPER 9719 MOVQ AX, ret+24(FP) 9720 RET 9721 9722 // func Count_AVX2(x []bool) int 9723 // Requires: AVX, AVX2 9724 TEXT ·Count_AVX2(SB), NOSPLIT, $0-32 9725 MOVQ x_base+0(FP), DI 9726 MOVQ x_len+8(FP), SI 9727 TESTQ SI, SI 9728 JE LBB9_1 9729 CMPQ SI, $0x10 9730 JAE 
LBB9_4 9731 XORL CX, CX 9732 XORL AX, AX 9733 JMP LBB9_11 9734 9735 LBB9_1: 9736 XORL AX, AX 9737 MOVQ AX, ret+24(FP) 9738 RET 9739 9740 LBB9_4: 9741 MOVQ SI, CX 9742 ANDQ $-16, CX 9743 LEAQ -16(CX), AX 9744 MOVQ AX, R8 9745 SHRQ $0x04, R8 9746 ADDQ $0x01, R8 9747 TESTQ AX, AX 9748 JE LBB9_5 9749 MOVQ R8, DX 9750 ANDQ $-2, DX 9751 VPXOR X0, X0, X0 9752 XORL AX, AX 9753 VPXOR X1, X1, X1 9754 VPXOR X2, X2, X2 9755 VPXOR X3, X3, X3 9756 9757 LBB9_7: 9758 VPMOVZXBQ (DI)(AX*1), Y4 9759 VPADDQ Y4, Y0, Y0 9760 VPMOVZXBQ 4(DI)(AX*1), Y4 9761 VPADDQ Y4, Y1, Y1 9762 VPMOVZXBQ 8(DI)(AX*1), Y4 9763 VPMOVZXBQ 12(DI)(AX*1), Y5 9764 VPADDQ Y4, Y2, Y2 9765 VPADDQ Y5, Y3, Y3 9766 VPMOVZXBQ 16(DI)(AX*1), Y4 9767 VPADDQ Y4, Y0, Y0 9768 VPMOVZXBQ 20(DI)(AX*1), Y4 9769 VPADDQ Y4, Y1, Y1 9770 VPMOVZXBQ 24(DI)(AX*1), Y4 9771 VPMOVZXBQ 28(DI)(AX*1), Y5 9772 VPADDQ Y4, Y2, Y2 9773 VPADDQ Y5, Y3, Y3 9774 ADDQ $0x20, AX 9775 ADDQ $-2, DX 9776 JNE LBB9_7 9777 TESTB $0x01, R8 9778 JE LBB9_10 9779 9780 LBB9_9: 9781 VPMOVZXBQ (DI)(AX*1), Y4 9782 VPMOVZXBQ 4(DI)(AX*1), Y5 9783 VPADDQ Y4, Y0, Y0 9784 VPADDQ Y5, Y1, Y1 9785 VPMOVZXBQ 8(DI)(AX*1), Y4 9786 VPADDQ Y4, Y2, Y2 9787 VPMOVZXBQ 12(DI)(AX*1), Y4 9788 VPADDQ Y4, Y3, Y3 9789 9790 LBB9_10: 9791 VPADDQ Y3, Y1, Y1 9792 VPADDQ Y2, Y0, Y0 9793 VPADDQ Y1, Y0, Y0 9794 VEXTRACTI128 $0x01, Y0, X1 9795 VPADDQ X1, X0, X0 9796 VPSHUFD $0xee, X0, X1 9797 VPADDQ X1, X0, X0 9798 VMOVQ X0, AX 9799 CMPQ CX, SI 9800 JE LBB9_12 9801 9802 LBB9_11: 9803 MOVBLZX (DI)(CX*1), DX 9804 ADDQ DX, AX 9805 ADDQ $0x01, CX 9806 CMPQ SI, CX 9807 JNE LBB9_11 9808 9809 LBB9_12: 9810 VZEROUPPER 9811 MOVQ AX, ret+24(FP) 9812 RET 9813 9814 LBB9_5: 9815 VPXOR X0, X0, X0 9816 XORL AX, AX 9817 VPXOR X1, X1, X1 9818 VPXOR X2, X2, X2 9819 VPXOR X3, X3, X3 9820 TESTB $0x01, R8 9821 JNE LBB9_9 9822 JMP LBB9_10 9823 9824 // func Repeat_AVX2_F64(x []float64, a float64, n int) 9825 // Requires: AVX, AVX2, SSE2 9826 TEXT ·Repeat_AVX2_F64(SB), NOSPLIT, $0-40 9827 MOVQ x_base+0(FP), DI 9828 MOVSD a+24(FP), X0 9829 MOVQ n+32(FP), SI 9830 TESTQ SI, SI 9831 JE LBB0_12 9832 CMPQ SI, $0x10 9833 JAE LBB0_3 9834 XORL AX, AX 9835 JMP LBB0_11 9836 9837 LBB0_3: 9838 MOVQ SI, AX 9839 ANDQ $-16, AX 9840 VBROADCASTSD X0, Y1 9841 LEAQ -16(AX), CX 9842 MOVQ CX, DX 9843 SHRQ $0x04, DX 9844 ADDQ $0x01, DX 9845 MOVL DX, R8 9846 ANDL $0x03, R8 9847 CMPQ CX, $0x30 9848 JAE LBB0_5 9849 XORL CX, CX 9850 JMP LBB0_7 9851 9852 LBB0_5: 9853 ANDQ $-4, DX 9854 XORL CX, CX 9855 9856 LBB0_6: 9857 VMOVUPS Y1, (DI)(CX*8) 9858 VMOVUPS Y1, 32(DI)(CX*8) 9859 VMOVUPS Y1, 64(DI)(CX*8) 9860 VMOVUPS Y1, 96(DI)(CX*8) 9861 VMOVUPS Y1, 128(DI)(CX*8) 9862 VMOVUPS Y1, 160(DI)(CX*8) 9863 VMOVUPS Y1, 192(DI)(CX*8) 9864 VMOVUPS Y1, 224(DI)(CX*8) 9865 VMOVUPS Y1, 256(DI)(CX*8) 9866 VMOVUPS Y1, 288(DI)(CX*8) 9867 VMOVUPS Y1, 320(DI)(CX*8) 9868 VMOVUPS Y1, 352(DI)(CX*8) 9869 VMOVUPS Y1, 384(DI)(CX*8) 9870 VMOVUPS Y1, 416(DI)(CX*8) 9871 VMOVUPS Y1, 448(DI)(CX*8) 9872 VMOVUPS Y1, 480(DI)(CX*8) 9873 ADDQ $0x40, CX 9874 ADDQ $-4, DX 9875 JNE LBB0_6 9876 9877 LBB0_7: 9878 TESTQ R8, R8 9879 JE LBB0_10 9880 LEAQ (DI)(CX*8), CX 9881 ADDQ $0x60, CX 9882 SHLQ $0x07, R8 9883 XORL DX, DX 9884 9885 LBB0_9: 9886 VMOVUPS Y1, -96(CX)(DX*1) 9887 VMOVUPS Y1, -64(CX)(DX*1) 9888 VMOVUPS Y1, -32(CX)(DX*1) 9889 VMOVUPS Y1, (CX)(DX*1) 9890 SUBQ $-128, DX 9891 CMPQ R8, DX 9892 JNE LBB0_9 9893 9894 LBB0_10: 9895 CMPQ AX, SI 9896 JE LBB0_12 9897 9898 LBB0_11: 9899 VMOVSD X0, (DI)(AX*8) 9900 ADDQ $0x01, AX 9901 CMPQ SI, AX 9902 JNE LBB0_11 9903 9904 LBB0_12: 9905 VZEROUPPER 9906 RET 9907 9908 
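// Note on the Repeat kernels (Repeat_AVX2_F64 above, Repeat_AVX2_F32 below), derived from
// the instruction stream itself: the scalar a is broadcast into a YMM register and the
// destination is filled in three stages — a main loop issuing sixteen 32-byte YMM stores
// (512 bytes) per iteration, an optional remainder loop issuing four YMM stores (128 bytes)
// per group, and a final scalar loop (VMOVSD/VMOVSS) that writes one element at a time
// until n is reached.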
// func Repeat_AVX2_F32(x []float32, a float32, n int) 9909 // Requires: AVX, AVX2, SSE 9910 TEXT ·Repeat_AVX2_F32(SB), NOSPLIT, $0-40 9911 MOVQ x_base+0(FP), DI 9912 MOVSS a+24(FP), X0 9913 MOVQ n+32(FP), SI 9914 TESTQ SI, SI 9915 JE LBB1_12 9916 CMPQ SI, $0x20 9917 JAE LBB1_3 9918 XORL AX, AX 9919 JMP LBB1_11 9920 9921 LBB1_3: 9922 MOVQ SI, AX 9923 ANDQ $-32, AX 9924 VBROADCASTSS X0, Y1 9925 LEAQ -32(AX), CX 9926 MOVQ CX, DX 9927 SHRQ $0x05, DX 9928 ADDQ $0x01, DX 9929 MOVL DX, R8 9930 ANDL $0x03, R8 9931 CMPQ CX, $0x60 9932 JAE LBB1_5 9933 XORL CX, CX 9934 JMP LBB1_7 9935 9936 LBB1_5: 9937 ANDQ $-4, DX 9938 XORL CX, CX 9939 9940 LBB1_6: 9941 VMOVUPS Y1, (DI)(CX*4) 9942 VMOVUPS Y1, 32(DI)(CX*4) 9943 VMOVUPS Y1, 64(DI)(CX*4) 9944 VMOVUPS Y1, 96(DI)(CX*4) 9945 VMOVUPS Y1, 128(DI)(CX*4) 9946 VMOVUPS Y1, 160(DI)(CX*4) 9947 VMOVUPS Y1, 192(DI)(CX*4) 9948 VMOVUPS Y1, 224(DI)(CX*4) 9949 VMOVUPS Y1, 256(DI)(CX*4) 9950 VMOVUPS Y1, 288(DI)(CX*4) 9951 VMOVUPS Y1, 320(DI)(CX*4) 9952 VMOVUPS Y1, 352(DI)(CX*4) 9953 VMOVUPS Y1, 384(DI)(CX*4) 9954 VMOVUPS Y1, 416(DI)(CX*4) 9955 VMOVUPS Y1, 448(DI)(CX*4) 9956 VMOVUPS Y1, 480(DI)(CX*4) 9957 SUBQ $-128, CX 9958 ADDQ $-4, DX 9959 JNE LBB1_6 9960 9961 LBB1_7: 9962 TESTQ R8, R8 9963 JE LBB1_10 9964 LEAQ (DI)(CX*4), CX 9965 ADDQ $0x60, CX 9966 SHLQ $0x07, R8 9967 XORL DX, DX 9968 9969 LBB1_9: 9970 VMOVUPS Y1, -96(CX)(DX*1) 9971 VMOVUPS Y1, -64(CX)(DX*1) 9972 VMOVUPS Y1, -32(CX)(DX*1) 9973 VMOVUPS Y1, (CX)(DX*1) 9974 SUBQ $-128, DX 9975 CMPQ R8, DX 9976 JNE LBB1_9 9977 9978 LBB1_10: 9979 CMPQ AX, SI 9980 JE LBB1_12 9981 9982 LBB1_11: 9983 VMOVSS X0, (DI)(AX*4) 9984 ADDQ $0x01, AX 9985 CMPQ SI, AX 9986 JNE LBB1_11 9987 9988 LBB1_12: 9989 VZEROUPPER 9990 RET 9991 9992 DATA dataRangeF64<>+0(SB)/8, $0x0000000000000000 9993 DATA dataRangeF64<>+8(SB)/8, $0x3ff0000000000000 9994 DATA dataRangeF64<>+16(SB)/8, $0x4000000000000000 9995 DATA dataRangeF64<>+24(SB)/8, $0x4008000000000000 9996 DATA dataRangeF64<>+32(SB)/8, $0x4010000000000000 9997 DATA dataRangeF64<>+40(SB)/8, $0x4020000000000000 9998 DATA dataRangeF64<>+48(SB)/8, $0x4028000000000000 9999 DATA dataRangeF64<>+56(SB)/8, $0x4030000000000000 10000 DATA dataRangeF64<>+64(SB)/8, $0x4034000000000000 10001 DATA dataRangeF64<>+72(SB)/8, $0x4038000000000000 10002 DATA dataRangeF64<>+80(SB)/8, $0x403c000000000000 10003 DATA dataRangeF64<>+88(SB)/8, $0x4040000000000000 10004 DATA dataRangeF64<>+96(SB)/8, $0x3ff0000000000000 10005 GLOBL dataRangeF64<>(SB), RODATA|NOPTR, $104 10006 10007 // func Range_AVX2_F64(x []float64, a float64, n int) 10008 // Requires: AVX, AVX2, SSE2 10009 TEXT ·Range_AVX2_F64(SB), NOSPLIT, $0-40 10010 MOVQ x_base+0(FP), DI 10011 MOVSD a+24(FP), X0 10012 MOVQ n+32(FP), SI 10013 TESTQ SI, SI 10014 JE LBB2_13 10015 CMPQ SI, $0x10 10016 JAE LBB2_3 10017 XORL AX, AX 10018 JMP LBB2_11 10019 10020 LBB2_3: 10021 MOVQ SI, AX 10022 ANDQ $-16, AX 10023 VBROADCASTSD X0, Y1 10024 VADDPD dataRangeF64<>+0(SB), Y1, Y1 10025 LEAQ -16(AX), CX 10026 MOVQ CX, R8 10027 SHRQ $0x04, R8 10028 ADDQ $0x01, R8 10029 TESTQ CX, CX 10030 JE LBB2_4 10031 MOVQ R8, DX 10032 ANDQ $-2, DX 10033 XORL CX, CX 10034 VBROADCASTSD dataRangeF64<>+32(SB), Y2 10035 VBROADCASTSD dataRangeF64<>+40(SB), Y3 10036 VBROADCASTSD dataRangeF64<>+48(SB), Y4 10037 VBROADCASTSD dataRangeF64<>+56(SB), Y5 10038 VBROADCASTSD dataRangeF64<>+64(SB), Y6 10039 VBROADCASTSD dataRangeF64<>+72(SB), Y7 10040 VBROADCASTSD dataRangeF64<>+80(SB), Y8 10041 VBROADCASTSD dataRangeF64<>+88(SB), Y9 10042 10043 LBB2_6: 10044 VADDPD Y2, Y1, Y10 10045 VADDPD Y3, Y1, Y11 
10046 VADDPD Y4, Y1, Y12 10047 VMOVUPD Y1, (DI)(CX*8) 10048 VMOVUPD Y10, 32(DI)(CX*8) 10049 VMOVUPD Y11, 64(DI)(CX*8) 10050 VMOVUPD Y12, 96(DI)(CX*8) 10051 VADDPD Y5, Y1, Y10 10052 VADDPD Y6, Y1, Y11 10053 VADDPD Y7, Y1, Y12 10054 VADDPD Y1, Y8, Y13 10055 VMOVUPD Y10, 128(DI)(CX*8) 10056 VMOVUPD Y11, 160(DI)(CX*8) 10057 VMOVUPD Y12, 192(DI)(CX*8) 10058 VMOVUPD Y13, 224(DI)(CX*8) 10059 ADDQ $0x20, CX 10060 VADDPD Y1, Y9, Y1 10061 ADDQ $-2, DX 10062 JNE LBB2_6 10063 TESTB $0x01, R8 10064 JE LBB2_9 10065 10066 LBB2_8: 10067 VBROADCASTSD dataRangeF64<>+32(SB), Y2 10068 VADDPD Y2, Y1, Y2 10069 VBROADCASTSD dataRangeF64<>+40(SB), Y3 10070 VADDPD Y3, Y1, Y3 10071 VBROADCASTSD dataRangeF64<>+48(SB), Y4 10072 VADDPD Y4, Y1, Y4 10073 VMOVUPD Y1, (DI)(CX*8) 10074 VMOVUPD Y2, 32(DI)(CX*8) 10075 VMOVUPD Y3, 64(DI)(CX*8) 10076 VMOVUPD Y4, 96(DI)(CX*8) 10077 10078 LBB2_9: 10079 CMPQ AX, SI 10080 JE LBB2_13 10081 VCVTSI2SDQ AX, X14, X1 10082 VADDSD X0, X1, X0 10083 10084 LBB2_11: 10085 VMOVSD dataRangeF64<>+96(SB), X1 10086 10087 LBB2_12: 10088 VMOVSD X0, (DI)(AX*8) 10089 VADDSD X1, X0, X0 10090 ADDQ $0x01, AX 10091 CMPQ SI, AX 10092 JNE LBB2_12 10093 10094 LBB2_13: 10095 VZEROUPPER 10096 RET 10097 10098 LBB2_4: 10099 XORL CX, CX 10100 TESTB $0x01, R8 10101 JNE LBB2_8 10102 JMP LBB2_9 10103 10104 DATA dataRangeF32<>+0(SB)/4, $0x00000000 10105 DATA dataRangeF32<>+4(SB)/4, $0x3f800000 10106 DATA dataRangeF32<>+8(SB)/4, $0x40000000 10107 DATA dataRangeF32<>+12(SB)/4, $0x40400000 10108 DATA dataRangeF32<>+16(SB)/4, $0x40800000 10109 DATA dataRangeF32<>+20(SB)/4, $0x40a00000 10110 DATA dataRangeF32<>+24(SB)/4, $0x40c00000 10111 DATA dataRangeF32<>+28(SB)/4, $0x40e00000 10112 DATA dataRangeF32<>+32(SB)/4, $0x41000000 10113 DATA dataRangeF32<>+36(SB)/4, $0x41800000 10114 DATA dataRangeF32<>+40(SB)/4, $0x41c00000 10115 DATA dataRangeF32<>+44(SB)/4, $0x42000000 10116 DATA dataRangeF32<>+48(SB)/4, $0x42200000 10117 DATA dataRangeF32<>+52(SB)/4, $0x42400000 10118 DATA dataRangeF32<>+56(SB)/4, $0x42600000 10119 DATA dataRangeF32<>+60(SB)/4, $0x42800000 10120 DATA dataRangeF32<>+64(SB)/4, $0x3f800000 10121 GLOBL dataRangeF32<>(SB), RODATA|NOPTR, $68 10122 10123 // func Range_AVX2_F32(x []float32, a float32, n int) 10124 // Requires: AVX, AVX2, SSE 10125 TEXT ·Range_AVX2_F32(SB), NOSPLIT, $0-40 10126 MOVQ x_base+0(FP), DI 10127 MOVSS a+24(FP), X0 10128 MOVQ n+32(FP), SI 10129 TESTQ SI, SI 10130 JE LBB3_13 10131 CMPQ SI, $0x20 10132 JAE LBB3_3 10133 XORL AX, AX 10134 JMP LBB3_11 10135 10136 LBB3_3: 10137 MOVQ SI, AX 10138 ANDQ $-32, AX 10139 VBROADCASTSS X0, Y1 10140 VADDPS dataRangeF32<>+0(SB), Y1, Y1 10141 LEAQ -32(AX), CX 10142 MOVQ CX, R8 10143 SHRQ $0x05, R8 10144 ADDQ $0x01, R8 10145 TESTQ CX, CX 10146 JE LBB3_4 10147 MOVQ R8, DX 10148 ANDQ $-2, DX 10149 XORL CX, CX 10150 VBROADCASTSS dataRangeF32<>+32(SB), Y2 10151 VBROADCASTSS dataRangeF32<>+36(SB), Y3 10152 VBROADCASTSS dataRangeF32<>+40(SB), Y4 10153 VBROADCASTSS dataRangeF32<>+44(SB), Y5 10154 VBROADCASTSS dataRangeF32<>+48(SB), Y6 10155 VBROADCASTSS dataRangeF32<>+52(SB), Y7 10156 VBROADCASTSS dataRangeF32<>+56(SB), Y8 10157 VBROADCASTSS dataRangeF32<>+60(SB), Y9 10158 10159 LBB3_6: 10160 VADDPS Y2, Y1, Y10 10161 VADDPS Y3, Y1, Y11 10162 VADDPS Y4, Y1, Y12 10163 VMOVUPS Y1, (DI)(CX*4) 10164 VMOVUPS Y10, 32(DI)(CX*4) 10165 VMOVUPS Y11, 64(DI)(CX*4) 10166 VMOVUPS Y12, 96(DI)(CX*4) 10167 VADDPS Y5, Y1, Y10 10168 VADDPS Y6, Y1, Y11 10169 VADDPS Y7, Y1, Y12 10170 VADDPS Y1, Y8, Y13 10171 VMOVUPS Y10, 128(DI)(CX*4) 10172 VMOVUPS Y11, 160(DI)(CX*4) 10173 VMOVUPS 
Y12, 192(DI)(CX*4) 10174 VMOVUPS Y13, 224(DI)(CX*4) 10175 ADDQ $0x40, CX 10176 VADDPS Y1, Y9, Y1 10177 ADDQ $-2, DX 10178 JNE LBB3_6 10179 TESTB $0x01, R8 10180 JE LBB3_9 10181 10182 LBB3_8: 10183 VBROADCASTSS dataRangeF32<>+32(SB), Y2 10184 VADDPS Y2, Y1, Y2 10185 VBROADCASTSS dataRangeF32<>+36(SB), Y3 10186 VADDPS Y3, Y1, Y3 10187 VBROADCASTSS dataRangeF32<>+40(SB), Y4 10188 VADDPS Y4, Y1, Y4 10189 VMOVUPS Y1, (DI)(CX*4) 10190 VMOVUPS Y2, 32(DI)(CX*4) 10191 VMOVUPS Y3, 64(DI)(CX*4) 10192 VMOVUPS Y4, 96(DI)(CX*4) 10193 10194 LBB3_9: 10195 CMPQ AX, SI 10196 JE LBB3_13 10197 VCVTSI2SSQ AX, X14, X1 10198 VADDSS X0, X1, X0 10199 10200 LBB3_11: 10201 VMOVSS dataRangeF32<>+64(SB), X1 10202 10203 LBB3_12: 10204 VMOVSS X0, (DI)(AX*4) 10205 VADDSS X1, X0, X0 10206 ADDQ $0x01, AX 10207 CMPQ SI, AX 10208 JNE LBB3_12 10209 10210 LBB3_13: 10211 VZEROUPPER 10212 RET 10213 10214 LBB3_4: 10215 XORL CX, CX 10216 TESTB $0x01, R8 10217 JNE LBB3_8 10218 JMP LBB3_9 10219 10220 DATA dataFromBoolF64<>+0(SB)/4, $+1 10221 DATA dataFromBoolF64<>+4(SB)/8, $0x3ff0000000000000 10222 GLOBL dataFromBoolF64<>(SB), RODATA|NOPTR, $12 10223 10224 // func FromBool_AVX2_F64(x []float64, y []bool) 10225 // Requires: AVX, AVX2 10226 TEXT ·FromBool_AVX2_F64(SB), NOSPLIT, $0-48 10227 MOVQ x_base+0(FP), DI 10228 MOVQ y_base+24(FP), SI 10229 MOVQ x_len+8(FP), DX 10230 TESTQ DX, DX 10231 JE LBB4_10 10232 CMPQ DX, $0x10 10233 JAE LBB4_3 10234 XORL AX, AX 10235 JMP LBB4_6 10236 10237 LBB4_3: 10238 MOVQ DX, AX 10239 ANDQ $-16, AX 10240 XORL CX, CX 10241 VPXOR X0, X0, X0 10242 VPCMPEQD X1, X1, X1 10243 VPBROADCASTD dataFromBoolF64<>+0(SB), X2 10244 10245 LBB4_4: 10246 VMOVD (SI)(CX*1), X3 10247 VMOVD 4(SI)(CX*1), X4 10248 VMOVD 8(SI)(CX*1), X5 10249 VMOVD 12(SI)(CX*1), X6 10250 VPCMPEQB X0, X3, X3 10251 VPXOR X1, X3, X3 10252 VPMOVZXBD X3, X3 10253 VPAND X2, X3, X3 10254 VCVTDQ2PD X3, Y3 10255 VPCMPEQB X0, X4, X4 10256 VPXOR X1, X4, X4 10257 VPMOVZXBD X4, X4 10258 VPAND X2, X4, X4 10259 VCVTDQ2PD X4, Y4 10260 VPCMPEQB X0, X5, X5 10261 VPXOR X1, X5, X5 10262 VPMOVZXBD X5, X5 10263 VPAND X2, X5, X5 10264 VCVTDQ2PD X5, Y5 10265 VPCMPEQB X0, X6, X6 10266 VPXOR X1, X6, X6 10267 VPMOVZXBD X6, X6 10268 VPAND X2, X6, X6 10269 VCVTDQ2PD X6, Y6 10270 VMOVUPS Y3, (DI)(CX*8) 10271 VMOVUPS Y4, 32(DI)(CX*8) 10272 VMOVUPS Y5, 64(DI)(CX*8) 10273 VMOVUPS Y6, 96(DI)(CX*8) 10274 ADDQ $0x10, CX 10275 CMPQ AX, CX 10276 JNE LBB4_4 10277 CMPQ AX, DX 10278 JNE LBB4_6 10279 10280 LBB4_10: 10281 VZEROUPPER 10282 RET 10283 10284 LBB4_6: 10285 VMOVQ dataFromBoolF64<>+4(SB), X0 10286 JMP LBB4_7 10287 10288 LBB4_9: 10289 VMOVQ X1, (DI)(AX*8) 10290 ADDQ $0x01, AX 10291 CMPQ DX, AX 10292 JE LBB4_10 10293 10294 LBB4_7: 10295 CMPB (SI)(AX*1), $0x00 10296 VMOVDQA X0, X1 10297 JNE LBB4_9 10298 VPXOR X1, X1, X1 10299 JMP LBB4_9 10300 10301 DATA dataFromBoolF32<>+0(SB)/4, $+1 10302 DATA dataFromBoolF32<>+4(SB)/4, $+1065353216 10303 GLOBL dataFromBoolF32<>(SB), RODATA|NOPTR, $8 10304 10305 // func FromBool_AVX2_F32(x []float32, y []bool) 10306 // Requires: AVX, AVX2 10307 TEXT ·FromBool_AVX2_F32(SB), NOSPLIT, $0-48 10308 MOVQ x_base+0(FP), DI 10309 MOVQ y_base+24(FP), SI 10310 MOVQ x_len+8(FP), DX 10311 TESTQ DX, DX 10312 JE LBB5_10 10313 CMPQ DX, $0x20 10314 JAE LBB5_3 10315 XORL AX, AX 10316 JMP LBB5_6 10317 10318 LBB5_3: 10319 MOVQ DX, AX 10320 ANDQ $-32, AX 10321 XORL CX, CX 10322 VPXOR X0, X0, X0 10323 VPCMPEQD X1, X1, X1 10324 VPBROADCASTD dataFromBoolF32<>+0(SB), Y2 10325 10326 LBB5_4: 10327 VMOVQ (SI)(CX*1), X3 10328 VMOVQ 8(SI)(CX*1), X4 10329 VMOVQ 16(SI)(CX*1), 
X5 10330 VMOVQ 24(SI)(CX*1), X6 10331 VPCMPEQB X0, X3, X3 10332 VPXOR X1, X3, X3 10333 VPMOVZXBD X3, Y3 10334 VPAND Y2, Y3, Y3 10335 VCVTDQ2PS Y3, Y3 10336 VPCMPEQB X0, X4, X4 10337 VPXOR X1, X4, X4 10338 VPMOVZXBD X4, Y4 10339 VPAND Y2, Y4, Y4 10340 VCVTDQ2PS Y4, Y4 10341 VPCMPEQB X0, X5, X5 10342 VPXOR X1, X5, X5 10343 VPMOVZXBD X5, Y5 10344 VPAND Y2, Y5, Y5 10345 VCVTDQ2PS Y5, Y5 10346 VPCMPEQB X0, X6, X6 10347 VPXOR X1, X6, X6 10348 VPMOVZXBD X6, Y6 10349 VPAND Y2, Y6, Y6 10350 VCVTDQ2PS Y6, Y6 10351 VMOVUPS Y3, (DI)(CX*4) 10352 VMOVUPS Y4, 32(DI)(CX*4) 10353 VMOVUPS Y5, 64(DI)(CX*4) 10354 VMOVUPS Y6, 96(DI)(CX*4) 10355 ADDQ $0x20, CX 10356 CMPQ AX, CX 10357 JNE LBB5_4 10358 CMPQ AX, DX 10359 JNE LBB5_6 10360 10361 LBB5_10: 10362 VZEROUPPER 10363 RET 10364 10365 LBB5_6: 10366 VMOVD dataFromBoolF32<>+4(SB), X0 10367 JMP LBB5_7 10368 10369 LBB5_9: 10370 VMOVD X1, (DI)(AX*4) 10371 ADDQ $0x01, AX 10372 CMPQ DX, AX 10373 JE LBB5_10 10374 10375 LBB5_7: 10376 CMPB (SI)(AX*1), $0x00 10377 VMOVDQA X0, X1 10378 JNE LBB5_9 10379 VPXOR X1, X1, X1 10380 JMP LBB5_9 10381 10382 // func FromInt32_AVX2_F64(x []float64, y []int32) 10383 // Requires: AVX 10384 TEXT ·FromInt32_AVX2_F64(SB), NOSPLIT, $0-48 10385 MOVQ x_base+0(FP), DI 10386 MOVQ y_base+24(FP), SI 10387 MOVQ x_len+8(FP), DX 10388 TESTQ DX, DX 10389 JE LBB10_11 10390 CMPQ DX, $0x10 10391 JAE LBB10_3 10392 XORL AX, AX 10393 JMP LBB10_10 10394 10395 LBB10_3: 10396 MOVQ DX, AX 10397 ANDQ $-16, AX 10398 LEAQ -16(AX), CX 10399 MOVQ CX, R8 10400 SHRQ $0x04, R8 10401 ADDQ $0x01, R8 10402 TESTQ CX, CX 10403 JE LBB10_4 10404 MOVQ R8, R9 10405 ANDQ $-2, R9 10406 XORL CX, CX 10407 10408 LBB10_6: 10409 VCVTDQ2PD (SI)(CX*4), Y0 10410 VCVTDQ2PD 16(SI)(CX*4), Y1 10411 VCVTDQ2PD 32(SI)(CX*4), Y2 10412 VCVTDQ2PD 48(SI)(CX*4), Y3 10413 VMOVUPS Y0, (DI)(CX*8) 10414 VMOVUPS Y1, 32(DI)(CX*8) 10415 VMOVUPS Y2, 64(DI)(CX*8) 10416 VMOVUPS Y3, 96(DI)(CX*8) 10417 VCVTDQ2PD 64(SI)(CX*4), Y0 10418 VCVTDQ2PD 80(SI)(CX*4), Y1 10419 VCVTDQ2PD 96(SI)(CX*4), Y2 10420 VCVTDQ2PD 112(SI)(CX*4), Y3 10421 VMOVUPD Y0, 128(DI)(CX*8) 10422 VMOVUPS Y1, 160(DI)(CX*8) 10423 VMOVUPS Y2, 192(DI)(CX*8) 10424 VMOVUPS Y3, 224(DI)(CX*8) 10425 ADDQ $0x20, CX 10426 ADDQ $-2, R9 10427 JNE LBB10_6 10428 TESTB $0x01, R8 10429 JE LBB10_9 10430 10431 LBB10_8: 10432 VCVTDQ2PD (SI)(CX*4), Y0 10433 VCVTDQ2PD 16(SI)(CX*4), Y1 10434 VCVTDQ2PD 32(SI)(CX*4), Y2 10435 VCVTDQ2PD 48(SI)(CX*4), Y3 10436 VMOVUPD Y0, (DI)(CX*8) 10437 VMOVUPS Y1, 32(DI)(CX*8) 10438 VMOVUPS Y2, 64(DI)(CX*8) 10439 VMOVUPS Y3, 96(DI)(CX*8) 10440 10441 LBB10_9: 10442 CMPQ AX, DX 10443 JE LBB10_11 10444 10445 LBB10_10: 10446 VCVTSI2SDL (SI)(AX*4), X4, X0 10447 VMOVSD X0, (DI)(AX*8) 10448 ADDQ $0x01, AX 10449 CMPQ DX, AX 10450 JNE LBB10_10 10451 10452 LBB10_11: 10453 VZEROUPPER 10454 RET 10455 10456 LBB10_4: 10457 XORL CX, CX 10458 TESTB $0x01, R8 10459 JNE LBB10_8 10460 JMP LBB10_9 10461 10462 // func FromInt32_AVX2_F32(x []float32, y []int32) 10463 // Requires: AVX 10464 TEXT ·FromInt32_AVX2_F32(SB), NOSPLIT, $0-48 10465 MOVQ x_base+0(FP), DI 10466 MOVQ y_base+24(FP), SI 10467 MOVQ x_len+8(FP), DX 10468 TESTQ DX, DX 10469 JE LBB11_11 10470 CMPQ DX, $0x20 10471 JAE LBB11_3 10472 XORL AX, AX 10473 JMP LBB11_10 10474 10475 LBB11_3: 10476 MOVQ DX, AX 10477 ANDQ $-32, AX 10478 LEAQ -32(AX), CX 10479 MOVQ CX, R8 10480 SHRQ $0x05, R8 10481 ADDQ $0x01, R8 10482 TESTQ CX, CX 10483 JE LBB11_4 10484 MOVQ R8, R9 10485 ANDQ $-2, R9 10486 XORL CX, CX 10487 10488 LBB11_6: 10489 VCVTDQ2PS (SI)(CX*4), Y0 10490 VCVTDQ2PS 32(SI)(CX*4), Y1 10491 
VCVTDQ2PS 64(SI)(CX*4), Y2 10492 VCVTDQ2PS 96(SI)(CX*4), Y3 10493 VMOVUPS Y0, (DI)(CX*4) 10494 VMOVUPS Y1, 32(DI)(CX*4) 10495 VMOVUPS Y2, 64(DI)(CX*4) 10496 VMOVUPS Y3, 96(DI)(CX*4) 10497 VCVTDQ2PS 128(SI)(CX*4), Y0 10498 VCVTDQ2PS 160(SI)(CX*4), Y1 10499 VCVTDQ2PS 192(SI)(CX*4), Y2 10500 VCVTDQ2PS 224(SI)(CX*4), Y3 10501 VMOVUPS Y0, 128(DI)(CX*4) 10502 VMOVUPS Y1, 160(DI)(CX*4) 10503 VMOVUPS Y2, 192(DI)(CX*4) 10504 VMOVUPS Y3, 224(DI)(CX*4) 10505 ADDQ $0x40, CX 10506 ADDQ $-2, R9 10507 JNE LBB11_6 10508 TESTB $0x01, R8 10509 JE LBB11_9 10510 10511 LBB11_8: 10512 VCVTDQ2PS (SI)(CX*4), Y0 10513 VCVTDQ2PS 32(SI)(CX*4), Y1 10514 VCVTDQ2PS 64(SI)(CX*4), Y2 10515 VCVTDQ2PS 96(SI)(CX*4), Y3 10516 VMOVUPS Y0, (DI)(CX*4) 10517 VMOVUPS Y1, 32(DI)(CX*4) 10518 VMOVUPS Y2, 64(DI)(CX*4) 10519 VMOVUPS Y3, 96(DI)(CX*4) 10520 10521 LBB11_9: 10522 CMPQ AX, DX 10523 JE LBB11_11 10524 10525 LBB11_10: 10526 VCVTSI2SSL (SI)(AX*4), X4, X0 10527 VMOVSS X0, (DI)(AX*4) 10528 ADDQ $0x01, AX 10529 CMPQ DX, AX 10530 JNE LBB11_10 10531 10532 LBB11_11: 10533 VZEROUPPER 10534 RET 10535 10536 LBB11_4: 10537 XORL CX, CX 10538 TESTB $0x01, R8 10539 JNE LBB11_8 10540 JMP LBB11_9 10541 10542 // func FromInt64_AVX2_F64(x []float64, y []int64) 10543 // Requires: AVX 10544 TEXT ·FromInt64_AVX2_F64(SB), NOSPLIT, $0-48 10545 MOVQ x_base+0(FP), DI 10546 MOVQ y_base+24(FP), SI 10547 MOVQ x_len+8(FP), DX 10548 TESTQ DX, DX 10549 JE LBB8_11 10550 CMPQ DX, $0x10 10551 JAE LBB8_3 10552 XORL R10, R10 10553 JMP LBB8_10 10554 10555 LBB8_3: 10556 MOVQ DX, R10 10557 ANDQ $-16, R10 10558 LEAQ -16(R10), CX 10559 MOVQ CX, R8 10560 SHRQ $0x04, R8 10561 ADDQ $0x01, R8 10562 TESTQ CX, CX 10563 JE LBB8_4 10564 MOVQ R8, R9 10565 ANDQ $-2, R9 10566 XORL CX, CX 10567 10568 LBB8_6: 10569 VMOVDQU (SI)(CX*8), X0 10570 VMOVDQU 16(SI)(CX*8), X1 10571 VPEXTRQ $0x01, X0, AX 10572 VCVTSI2SDQ AX, X11, X2 10573 VMOVDQU 32(SI)(CX*8), X3 10574 VMOVQ X0, AX 10575 VCVTSI2SDQ AX, X11, X0 10576 VPEXTRQ $0x01, X1, AX 10577 VCVTSI2SDQ AX, X11, X4 10578 VMOVDQU 48(SI)(CX*8), X5 10579 VMOVQ X1, AX 10580 VCVTSI2SDQ AX, X11, X1 10581 VPEXTRQ $0x01, X5, AX 10582 VCVTSI2SDQ AX, X11, X6 10583 VUNPCKLPD X2, X0, X8 10584 VMOVQ X5, AX 10585 VCVTSI2SDQ AX, X11, X2 10586 VPEXTRQ $0x01, X3, AX 10587 VCVTSI2SDQ AX, X11, X5 10588 VUNPCKLPD X4, X1, X10 10589 VMOVQ X3, AX 10590 VCVTSI2SDQ AX, X11, X3 10591 VUNPCKLPD X6, X2, X9 10592 VMOVDQU 80(SI)(CX*8), X4 10593 VPEXTRQ $0x01, X4, AX 10594 VUNPCKLPD X5, X3, X3 10595 VCVTSI2SDQ AX, X11, X5 10596 VMOVQ X4, AX 10597 VCVTSI2SDQ AX, X11, X4 10598 VUNPCKLPD X5, X4, X4 10599 VMOVDQU 64(SI)(CX*8), X5 10600 VPEXTRQ $0x01, X5, AX 10601 VCVTSI2SDQ AX, X11, X6 10602 VMOVQ X5, AX 10603 VCVTSI2SDQ AX, X11, X5 10604 VMOVDQU 112(SI)(CX*8), X7 10605 VPEXTRQ $0x01, X7, AX 10606 VCVTSI2SDQ AX, X11, X0 10607 VMOVQ X7, AX 10608 VCVTSI2SDQ AX, X11, X7 10609 VMOVDQU 96(SI)(CX*8), X2 10610 VPEXTRQ $0x01, X2, AX 10611 VCVTSI2SDQ AX, X11, X1 10612 VUNPCKLPD X6, X5, X5 10613 VMOVQ X2, AX 10614 VCVTSI2SDQ AX, X11, X2 10615 VUNPCKLPD X0, X7, X0 10616 VUNPCKLPD X1, X2, X1 10617 VMOVUPD X10, 16(DI)(CX*8) 10618 VMOVUPD X8, (DI)(CX*8) 10619 VMOVUPD X3, 32(DI)(CX*8) 10620 VMOVUPD X9, 48(DI)(CX*8) 10621 VMOVUPD X5, 64(DI)(CX*8) 10622 VMOVUPD X4, 80(DI)(CX*8) 10623 VMOVUPD X1, 96(DI)(CX*8) 10624 VMOVUPD X0, 112(DI)(CX*8) 10625 VMOVDQU 128(SI)(CX*8), X0 10626 VMOVDQU 144(SI)(CX*8), X1 10627 VPEXTRQ $0x01, X0, AX 10628 VCVTSI2SDQ AX, X11, X2 10629 VMOVDQU 160(SI)(CX*8), X3 10630 VMOVQ X0, AX 10631 VCVTSI2SDQ AX, X11, X0 10632 VPEXTRQ $0x01, X1, AX 10633 VCVTSI2SDQ AX, 
X11, X4 10634 VMOVDQU 176(SI)(CX*8), X5 10635 VMOVQ X1, AX 10636 VCVTSI2SDQ AX, X11, X1 10637 VPEXTRQ $0x01, X5, AX 10638 VCVTSI2SDQ AX, X11, X6 10639 VUNPCKLPD X2, X0, X8 10640 VMOVQ X5, AX 10641 VCVTSI2SDQ AX, X11, X2 10642 VPEXTRQ $0x01, X3, AX 10643 VCVTSI2SDQ AX, X11, X5 10644 VUNPCKLPD X4, X1, X10 10645 VMOVQ X3, AX 10646 VCVTSI2SDQ AX, X11, X3 10647 VUNPCKLPD X6, X2, X9 10648 VMOVDQU 208(SI)(CX*8), X4 10649 VPEXTRQ $0x01, X4, AX 10650 VUNPCKLPD X5, X3, X3 10651 VCVTSI2SDQ AX, X11, X5 10652 VMOVQ X4, AX 10653 VCVTSI2SDQ AX, X11, X4 10654 VUNPCKLPD X5, X4, X4 10655 VMOVDQU 192(SI)(CX*8), X5 10656 VPEXTRQ $0x01, X5, AX 10657 VCVTSI2SDQ AX, X11, X6 10658 VMOVQ X5, AX 10659 VCVTSI2SDQ AX, X11, X5 10660 VMOVDQU 240(SI)(CX*8), X7 10661 VPEXTRQ $0x01, X7, AX 10662 VCVTSI2SDQ AX, X11, X0 10663 VMOVQ X7, AX 10664 VCVTSI2SDQ AX, X11, X7 10665 VMOVDQU 224(SI)(CX*8), X2 10666 VPEXTRQ $0x01, X2, AX 10667 VCVTSI2SDQ AX, X11, X1 10668 VUNPCKLPD X6, X5, X5 10669 VMOVQ X2, AX 10670 VCVTSI2SDQ AX, X11, X2 10671 VUNPCKLPD X0, X7, X0 10672 VUNPCKLPD X1, X2, X1 10673 VMOVUPD X10, 144(DI)(CX*8) 10674 VMOVUPD X8, 128(DI)(CX*8) 10675 VMOVUPD X3, 160(DI)(CX*8) 10676 VMOVUPD X9, 176(DI)(CX*8) 10677 VMOVUPD X5, 192(DI)(CX*8) 10678 VMOVUPD X4, 208(DI)(CX*8) 10679 VMOVUPD X1, 224(DI)(CX*8) 10680 VMOVUPD X0, 240(DI)(CX*8) 10681 ADDQ $0x20, CX 10682 ADDQ $-2, R9 10683 JNE LBB8_6 10684 TESTB $0x01, R8 10685 JE LBB8_9 10686 10687 LBB8_8: 10688 VMOVDQU (SI)(CX*8), X0 10689 VMOVDQU 16(SI)(CX*8), X1 10690 VMOVDQU 32(SI)(CX*8), X3 10691 VMOVDQU 48(SI)(CX*8), X2 10692 VPEXTRQ $0x01, X0, AX 10693 VCVTSI2SDQ AX, X11, X4 10694 VMOVQ X0, AX 10695 VCVTSI2SDQ AX, X11, X0 10696 VUNPCKLPD X4, X0, X8 10697 VPEXTRQ $0x01, X1, AX 10698 VCVTSI2SDQ AX, X11, X4 10699 VMOVQ X1, AX 10700 VCVTSI2SDQ AX, X11, X1 10701 VUNPCKLPD X4, X1, X1 10702 VPEXTRQ $0x01, X2, AX 10703 VCVTSI2SDQ AX, X11, X4 10704 VMOVQ X2, AX 10705 VCVTSI2SDQ AX, X11, X2 10706 VUNPCKLPD X4, X2, X2 10707 VPEXTRQ $0x01, X3, AX 10708 VCVTSI2SDQ AX, X11, X4 10709 VMOVQ X3, AX 10710 VCVTSI2SDQ AX, X11, X3 10711 VMOVDQU 80(SI)(CX*8), X5 10712 VPEXTRQ $0x01, X5, AX 10713 VCVTSI2SDQ AX, X11, X6 10714 VMOVQ X5, AX 10715 VCVTSI2SDQ AX, X11, X5 10716 VMOVDQU 64(SI)(CX*8), X7 10717 VPEXTRQ $0x01, X7, AX 10718 VCVTSI2SDQ AX, X11, X0 10719 VUNPCKLPD X4, X3, X3 10720 VMOVQ X7, AX 10721 VCVTSI2SDQ AX, X11, X4 10722 VUNPCKLPD X6, X5, X5 10723 VMOVDQU 112(SI)(CX*8), X6 10724 VPEXTRQ $0x01, X6, AX 10725 VUNPCKLPD X0, X4, X0 10726 VCVTSI2SDQ AX, X11, X4 10727 VMOVQ X6, AX 10728 VCVTSI2SDQ AX, X11, X6 10729 VUNPCKLPD X4, X6, X4 10730 VMOVDQU 96(SI)(CX*8), X6 10731 VPEXTRQ $0x01, X6, AX 10732 VCVTSI2SDQ AX, X11, X7 10733 VMOVQ X6, AX 10734 VCVTSI2SDQ AX, X11, X6 10735 VUNPCKLPD X7, X6, X6 10736 VMOVUPD X1, 16(DI)(CX*8) 10737 VMOVUPD X8, (DI)(CX*8) 10738 VMOVUPD X3, 32(DI)(CX*8) 10739 VMOVUPD X2, 48(DI)(CX*8) 10740 VMOVUPD X0, 64(DI)(CX*8) 10741 VMOVUPD X5, 80(DI)(CX*8) 10742 VMOVUPD X6, 96(DI)(CX*8) 10743 VMOVUPD X4, 112(DI)(CX*8) 10744 10745 LBB8_9: 10746 CMPQ R10, DX 10747 JE LBB8_11 10748 10749 LBB8_10: 10750 VCVTSI2SDQ (SI)(R10*8), X11, X0 10751 VMOVSD X0, (DI)(R10*8) 10752 ADDQ $0x01, R10 10753 CMPQ DX, R10 10754 JNE LBB8_10 10755 10756 LBB8_11: 10757 RET 10758 10759 LBB8_4: 10760 XORL CX, CX 10761 TESTB $0x01, R8 10762 JNE LBB8_8 10763 JMP LBB8_9 10764 10765 // func FromInt64_AVX2_F32(x []float32, y []int64) 10766 // Requires: AVX 10767 TEXT ·FromInt64_AVX2_F32(SB), NOSPLIT, $0-48 10768 MOVQ x_base+0(FP), DI 10769 MOVQ y_base+24(FP), SI 10770 MOVQ x_len+8(FP), DX 10771 TESTQ DX, DX 
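// The TESTQ above feeds the JE below: an empty destination returns straight
// away, and fewer than 16 elements skip the SIMD path and run the scalar
// VCVTSI2SSQ loop at LBB9_10. Because AVX2 has no packed int64 -> float32
// conversion, the wide path extracts each lane with VMOVQ/VPEXTRQ, converts it
// with scalar VCVTSI2SSQ, and reassembles 128-bit vectors with VINSERTPS.
// A hedged Go equivalent (name illustrative; assumes len(y) >= len(x)):
//
//    func fromInt64F32(x []float32, y []int64) {
//        for i := range x {
//            x[i] = float32(y[i])
//        }
//    }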
10772 JE LBB9_11 10773 CMPQ DX, $0x10 10774 JAE LBB9_3 10775 XORL R11, R11 10776 JMP LBB9_10 10777 10778 LBB9_3: 10779 MOVQ DX, R11 10780 ANDQ $-16, R11 10781 LEAQ -16(R11), CX 10782 MOVQ CX, R8 10783 SHRQ $0x04, R8 10784 ADDQ $0x01, R8 10785 TESTQ CX, CX 10786 JE LBB9_4 10787 MOVQ R8, R9 10788 ANDQ $-2, R9 10789 XORL CX, CX 10790 10791 LBB9_6: 10792 VMOVDQU (SI)(CX*8), X0 10793 VPEXTRQ $0x01, X0, R10 10794 VMOVDQU 16(SI)(CX*8), X1 10795 VCVTSI2SSQ R10, X8, X2 10796 VMOVQ X0, AX 10797 VCVTSI2SSQ AX, X8, X0 10798 VMOVQ X1, AX 10799 VCVTSI2SSQ AX, X8, X3 10800 VPEXTRQ $0x01, X1, AX 10801 VCVTSI2SSQ AX, X8, X1 10802 VMOVDQU 32(SI)(CX*8), X4 10803 VPEXTRQ $0x01, X4, AX 10804 VMOVDQU 48(SI)(CX*8), X5 10805 VCVTSI2SSQ AX, X8, X6 10806 VMOVQ X4, AX 10807 VCVTSI2SSQ AX, X8, X4 10808 VMOVQ X5, AX 10809 VCVTSI2SSQ AX, X8, X7 10810 VINSERTPS $0x10, X2, X0, X0 10811 VINSERTPS $0x20, X3, X0, X0 10812 VPEXTRQ $0x01, X5, AX 10813 VINSERTPS $0x30, X1, X0, X0 10814 VCVTSI2SSQ AX, X8, X1 10815 VINSERTPS $0x10, X6, X4, X2 10816 VMOVDQU 64(SI)(CX*8), X3 10817 VPEXTRQ $0x01, X3, AX 10818 VCVTSI2SSQ AX, X8, X4 10819 VMOVQ X3, AX 10820 VCVTSI2SSQ AX, X8, X3 10821 VMOVDQU 80(SI)(CX*8), X5 10822 VMOVQ X5, AX 10823 VCVTSI2SSQ AX, X8, X6 10824 VINSERTPS $0x20, X7, X2, X2 10825 VINSERTPS $0x30, X1, X2, X1 10826 VPEXTRQ $0x01, X5, AX 10827 VINSERTPS $0x10, X4, X3, X2 10828 VCVTSI2SSQ AX, X8, X3 10829 VINSERTPS $0x20, X6, X2, X2 10830 VMOVDQU 96(SI)(CX*8), X4 10831 VPEXTRQ $0x01, X4, AX 10832 VCVTSI2SSQ AX, X8, X5 10833 VMOVQ X4, AX 10834 VCVTSI2SSQ AX, X8, X4 10835 VMOVDQU 112(SI)(CX*8), X6 10836 VMOVQ X6, AX 10837 VCVTSI2SSQ AX, X8, X7 10838 VINSERTPS $0x30, X3, X2, X2 10839 VINSERTPS $0x10, X5, X4, X3 10840 VPEXTRQ $0x01, X6, AX 10841 VINSERTPS $0x20, X7, X3, X3 10842 VCVTSI2SSQ AX, X8, X4 10843 VINSERTPS $0x30, X4, X3, X3 10844 VMOVUPS X0, (DI)(CX*4) 10845 VMOVUPS X1, 16(DI)(CX*4) 10846 VMOVUPS X2, 32(DI)(CX*4) 10847 VMOVUPS X3, 48(DI)(CX*4) 10848 VMOVDQU 128(SI)(CX*8), X0 10849 VPEXTRQ $0x01, X0, AX 10850 VMOVDQU 144(SI)(CX*8), X1 10851 VCVTSI2SSQ AX, X8, X2 10852 VMOVQ X0, AX 10853 VCVTSI2SSQ AX, X8, X0 10854 VMOVQ X1, AX 10855 VCVTSI2SSQ AX, X8, X3 10856 VPEXTRQ $0x01, X1, AX 10857 VCVTSI2SSQ AX, X8, X1 10858 VMOVDQU 160(SI)(CX*8), X4 10859 VPEXTRQ $0x01, X4, AX 10860 VCVTSI2SSQ AX, X8, X5 10861 VMOVQ X4, AX 10862 VCVTSI2SSQ AX, X8, X4 10863 VINSERTPS $0x10, X2, X0, X0 10864 VMOVDQU 176(SI)(CX*8), X2 10865 VPEXTRQ $0x01, X2, R10 10866 VMOVQ X2, AX 10867 VCVTSI2SSQ AX, X8, X2 10868 VINSERTPS $0x20, X3, X0, X0 10869 VCVTSI2SSQ R10, X8, X3 10870 VINSERTPS $0x30, X1, X0, X0 10871 VMOVDQU 192(SI)(CX*8), X1 10872 VPEXTRQ $0x01, X1, AX 10873 VINSERTPS $0x10, X5, X4, X4 10874 VCVTSI2SSQ AX, X8, X5 10875 VMOVQ X1, AX 10876 VCVTSI2SSQ AX, X8, X1 10877 VINSERTPS $0x20, X2, X4, X2 10878 VMOVDQU 208(SI)(CX*8), X4 10879 VPEXTRQ $0x01, X4, R10 10880 VMOVQ X4, AX 10881 VCVTSI2SSQ AX, X8, X4 10882 VINSERTPS $0x30, X3, X2, X2 10883 VCVTSI2SSQ R10, X8, X3 10884 VINSERTPS $0x10, X5, X1, X1 10885 VMOVDQU 224(SI)(CX*8), X5 10886 VPEXTRQ $0x01, X5, AX 10887 VINSERTPS $0x20, X4, X1, X1 10888 VCVTSI2SSQ AX, X8, X4 10889 VMOVQ X5, AX 10890 VCVTSI2SSQ AX, X8, X5 10891 VINSERTPS $0x30, X3, X1, X1 10892 VMOVDQU 240(SI)(CX*8), X3 10893 VPEXTRQ $0x01, X3, R10 10894 VMOVQ X3, AX 10895 VCVTSI2SSQ AX, X8, X3 10896 VINSERTPS $0x10, X4, X5, X4 10897 VCVTSI2SSQ R10, X8, X5 10898 VINSERTPS $0x20, X3, X4, X3 10899 VINSERTPS $0x30, X5, X3, X3 10900 VMOVUPS X0, 64(DI)(CX*4) 10901 VMOVUPS X2, 80(DI)(CX*4) 10902 VMOVUPS X1, 96(DI)(CX*4) 10903 VMOVUPS X3, 
112(DI)(CX*4) 10904 ADDQ $0x20, CX 10905 ADDQ $-2, R9 10906 JNE LBB9_6 10907 TESTB $0x01, R8 10908 JE LBB9_9 10909 10910 LBB9_8: 10911 VMOVDQU (SI)(CX*8), X0 10912 VPEXTRQ $0x01, X0, AX 10913 VMOVDQU 16(SI)(CX*8), X1 10914 VCVTSI2SSQ AX, X8, X2 10915 VMOVQ X0, AX 10916 VCVTSI2SSQ AX, X8, X0 10917 VMOVQ X1, AX 10918 VCVTSI2SSQ AX, X8, X3 10919 VPEXTRQ $0x01, X1, AX 10920 VCVTSI2SSQ AX, X8, X1 10921 VMOVDQU 32(SI)(CX*8), X4 10922 VMOVDQU 48(SI)(CX*8), X5 10923 VPEXTRQ $0x01, X4, AX 10924 VINSERTPS $0x10, X2, X0, X0 10925 VCVTSI2SSQ AX, X8, X2 10926 VMOVQ X4, AX 10927 VCVTSI2SSQ AX, X8, X4 10928 VMOVQ X5, AX 10929 VCVTSI2SSQ AX, X8, X6 10930 VINSERTPS $0x20, X3, X0, X0 10931 VINSERTPS $0x30, X1, X0, X0 10932 VPEXTRQ $0x01, X5, AX 10933 VINSERTPS $0x10, X2, X4, X1 10934 VCVTSI2SSQ AX, X8, X2 10935 VINSERTPS $0x20, X6, X1, X1 10936 VMOVDQU 64(SI)(CX*8), X3 10937 VPEXTRQ $0x01, X3, AX 10938 VCVTSI2SSQ AX, X8, X4 10939 VMOVQ X3, AX 10940 VCVTSI2SSQ AX, X8, X3 10941 VMOVDQU 80(SI)(CX*8), X5 10942 VMOVQ X5, AX 10943 VCVTSI2SSQ AX, X8, X6 10944 VINSERTPS $0x30, X2, X1, X1 10945 VINSERTPS $0x10, X4, X3, X2 10946 VPEXTRQ $0x01, X5, AX 10947 VINSERTPS $0x20, X6, X2, X2 10948 VCVTSI2SSQ AX, X8, X3 10949 VINSERTPS $0x30, X3, X2, X2 10950 VMOVDQU 96(SI)(CX*8), X3 10951 VPEXTRQ $0x01, X3, AX 10952 VCVTSI2SSQ AX, X8, X4 10953 VMOVQ X3, AX 10954 VCVTSI2SSQ AX, X8, X3 10955 VMOVDQU 112(SI)(CX*8), X5 10956 VMOVQ X5, AX 10957 VCVTSI2SSQ AX, X8, X6 10958 VINSERTPS $0x10, X4, X3, X3 10959 VINSERTPS $0x20, X6, X3, X3 10960 VPEXTRQ $0x01, X5, AX 10961 VCVTSI2SSQ AX, X8, X4 10962 VINSERTPS $0x30, X4, X3, X3 10963 VMOVUPS X0, (DI)(CX*4) 10964 VMOVUPS X1, 16(DI)(CX*4) 10965 VMOVUPS X2, 32(DI)(CX*4) 10966 VMOVUPS X3, 48(DI)(CX*4) 10967 10968 LBB9_9: 10969 CMPQ R11, DX 10970 JE LBB9_11 10971 10972 LBB9_10: 10973 VCVTSI2SSQ (SI)(R11*8), X8, X0 10974 VMOVSS X0, (DI)(R11*4) 10975 ADDQ $0x01, R11 10976 CMPQ DX, R11 10977 JNE LBB9_10 10978 10979 LBB9_11: 10980 RET 10981 10982 LBB9_4: 10983 XORL CX, CX 10984 TESTB $0x01, R8 10985 JNE LBB9_8 10986 JMP LBB9_9 10987 10988 // func FromFloat32_AVX2_F64(x []float64, y []float32) 10989 // Requires: AVX 10990 TEXT ·FromFloat32_AVX2_F64(SB), NOSPLIT, $0-48 10991 MOVQ x_base+0(FP), DI 10992 MOVQ y_base+24(FP), SI 10993 MOVQ x_len+8(FP), DX 10994 TESTQ DX, DX 10995 JE LBB6_11 10996 CMPQ DX, $0x10 10997 JAE LBB6_3 10998 XORL AX, AX 10999 JMP LBB6_10 11000 11001 LBB6_3: 11002 MOVQ DX, AX 11003 ANDQ $-16, AX 11004 LEAQ -16(AX), CX 11005 MOVQ CX, R8 11006 SHRQ $0x04, R8 11007 ADDQ $0x01, R8 11008 TESTQ CX, CX 11009 JE LBB6_4 11010 MOVQ R8, R9 11011 ANDQ $-2, R9 11012 XORL CX, CX 11013 11014 LBB6_6: 11015 VCVTPS2PD (SI)(CX*4), Y0 11016 VCVTPS2PD 16(SI)(CX*4), Y1 11017 VCVTPS2PD 32(SI)(CX*4), Y2 11018 VCVTPS2PD 48(SI)(CX*4), Y3 11019 VMOVUPS Y0, (DI)(CX*8) 11020 VMOVUPS Y1, 32(DI)(CX*8) 11021 VMOVUPS Y2, 64(DI)(CX*8) 11022 VMOVUPS Y3, 96(DI)(CX*8) 11023 VCVTPS2PD 64(SI)(CX*4), Y0 11024 VCVTPS2PD 80(SI)(CX*4), Y1 11025 VCVTPS2PD 96(SI)(CX*4), Y2 11026 VCVTPS2PD 112(SI)(CX*4), Y3 11027 VMOVUPS Y0, 128(DI)(CX*8) 11028 VMOVUPS Y1, 160(DI)(CX*8) 11029 VMOVUPS Y2, 192(DI)(CX*8) 11030 VMOVUPS Y3, 224(DI)(CX*8) 11031 ADDQ $0x20, CX 11032 ADDQ $-2, R9 11033 JNE LBB6_6 11034 TESTB $0x01, R8 11035 JE LBB6_9 11036 11037 LBB6_8: 11038 VCVTPS2PD (SI)(CX*4), Y0 11039 VCVTPS2PD 16(SI)(CX*4), Y1 11040 VCVTPS2PD 32(SI)(CX*4), Y2 11041 VCVTPS2PD 48(SI)(CX*4), Y3 11042 VMOVUPS Y0, (DI)(CX*8) 11043 VMOVUPS Y1, 32(DI)(CX*8) 11044 VMOVUPS Y2, 64(DI)(CX*8) 11045 VMOVUPS Y3, 96(DI)(CX*8) 11046 11047 LBB6_9: 11048 
CMPQ AX, DX 11049 JE LBB6_11 11050 11051 LBB6_10: 11052 VMOVSS (SI)(AX*4), X0 11053 VCVTSS2SD X0, X0, X0 11054 VMOVSD X0, (DI)(AX*8) 11055 ADDQ $0x01, AX 11056 CMPQ DX, AX 11057 JNE LBB6_10 11058 11059 LBB6_11: 11060 VZEROUPPER 11061 RET 11062 11063 LBB6_4: 11064 XORL CX, CX 11065 TESTB $0x01, R8 11066 JNE LBB6_8 11067 JMP LBB6_9 11068 11069 // func FromFloat64_AVX2_F32(x []float32, y []float64) 11070 // Requires: AVX 11071 TEXT ·FromFloat64_AVX2_F32(SB), NOSPLIT, $0-48 11072 MOVQ x_base+0(FP), DI 11073 MOVQ y_base+24(FP), SI 11074 MOVQ x_len+8(FP), DX 11075 TESTQ DX, DX 11076 JE LBB7_11 11077 CMPQ DX, $0x10 11078 JAE LBB7_3 11079 XORL AX, AX 11080 JMP LBB7_10 11081 11082 LBB7_3: 11083 MOVQ DX, AX 11084 ANDQ $-16, AX 11085 LEAQ -16(AX), CX 11086 MOVQ CX, R8 11087 SHRQ $0x04, R8 11088 ADDQ $0x01, R8 11089 TESTQ CX, CX 11090 JE LBB7_4 11091 MOVQ R8, R9 11092 ANDQ $-2, R9 11093 XORL CX, CX 11094 11095 LBB7_6: 11096 VCVTPD2PSY (SI)(CX*8), X0 11097 VCVTPD2PSY 32(SI)(CX*8), X1 11098 VCVTPD2PSY 64(SI)(CX*8), X2 11099 VCVTPD2PSY 96(SI)(CX*8), X3 11100 VMOVUPD X0, (DI)(CX*4) 11101 VMOVUPD X1, 16(DI)(CX*4) 11102 VMOVUPD X2, 32(DI)(CX*4) 11103 VMOVUPD X3, 48(DI)(CX*4) 11104 VCVTPD2PSY 128(SI)(CX*8), X0 11105 VCVTPD2PSY 160(SI)(CX*8), X1 11106 VCVTPD2PSY 192(SI)(CX*8), X2 11107 VCVTPD2PSY 224(SI)(CX*8), X3 11108 VMOVUPD X0, 64(DI)(CX*4) 11109 VMOVUPD X1, 80(DI)(CX*4) 11110 VMOVUPD X2, 96(DI)(CX*4) 11111 VMOVUPD X3, 112(DI)(CX*4) 11112 ADDQ $0x20, CX 11113 ADDQ $-2, R9 11114 JNE LBB7_6 11115 TESTB $0x01, R8 11116 JE LBB7_9 11117 11118 LBB7_8: 11119 VCVTPD2PSY (SI)(CX*8), X0 11120 VCVTPD2PSY 32(SI)(CX*8), X1 11121 VCVTPD2PSY 64(SI)(CX*8), X2 11122 VCVTPD2PSY 96(SI)(CX*8), X3 11123 VMOVUPD X0, (DI)(CX*4) 11124 VMOVUPD X1, 16(DI)(CX*4) 11125 VMOVUPD X2, 32(DI)(CX*4) 11126 VMOVUPD X3, 48(DI)(CX*4) 11127 11128 LBB7_9: 11129 CMPQ AX, DX 11130 JE LBB7_11 11131 11132 LBB7_10: 11133 VMOVSD (SI)(AX*8), X0 11134 VCVTSD2SS X0, X0, X0 11135 VMOVSS X0, (DI)(AX*4) 11136 ADDQ $0x01, AX 11137 CMPQ DX, AX 11138 JNE LBB7_10 11139 11140 LBB7_11: 11141 RET 11142 11143 LBB7_4: 11144 XORL CX, CX 11145 TESTB $0x01, R8 11146 JNE LBB7_8 11147 JMP LBB7_9 11148 11149 DATA dataToBoolF64<>+0(SB)/1, $+1 11150 DATA dataToBoolF64<>+1(SB)/1, $+1 11151 DATA dataToBoolF64<>+2(SB)/1, $+1 11152 DATA dataToBoolF64<>+3(SB)/1, $+1 11153 DATA dataToBoolF64<>+4(SB)/1, $+0 11154 DATA dataToBoolF64<>+5(SB)/1, $+0 11155 DATA dataToBoolF64<>+6(SB)/1, $+0 11156 DATA dataToBoolF64<>+7(SB)/1, $+0 11157 DATA dataToBoolF64<>+8(SB)/1, $+0 11158 DATA dataToBoolF64<>+9(SB)/1, $+0 11159 DATA dataToBoolF64<>+10(SB)/1, $+0 11160 DATA dataToBoolF64<>+11(SB)/1, $+0 11161 DATA dataToBoolF64<>+12(SB)/1, $+0 11162 DATA dataToBoolF64<>+13(SB)/1, $+0 11163 DATA dataToBoolF64<>+14(SB)/1, $+0 11164 DATA dataToBoolF64<>+15(SB)/1, $+0 11165 GLOBL dataToBoolF64<>(SB), RODATA|NOPTR, $16 11166 11167 // func ToBool_AVX2_F64(x []bool, y []float64) 11168 // Requires: AVX, AVX2 11169 TEXT ·ToBool_AVX2_F64(SB), NOSPLIT, $0-48 11170 MOVQ x_base+0(FP), DI 11171 MOVQ y_base+24(FP), SI 11172 MOVQ x_len+8(FP), DX 11173 TESTQ DX, DX 11174 JE LBB12_8 11175 CMPQ DX, $0x10 11176 JAE LBB12_3 11177 XORL AX, AX 11178 JMP LBB12_6 11179 11180 LBB12_3: 11181 MOVQ DX, AX 11182 ANDQ $-16, AX 11183 XORL CX, CX 11184 VXORPD X0, X0, X0 11185 VMOVDQU dataToBoolF64<>+0(SB), X1 11186 11187 LBB12_4: 11188 VCMPPD $0x04, (SI)(CX*8), Y0, Y2 11189 VEXTRACTF128 $0x01, Y2, X3 11190 VPACKSSDW X3, X2, X2 11191 VPACKSSDW X2, X2, X2 11192 VPACKSSWB X2, X2, X2 11193 VCMPPD $0x04, 32(SI)(CX*8), Y0, Y3 11194 VPAND 
X1, X2, X2 11195 VEXTRACTF128 $0x01, Y3, X4 11196 VPACKSSDW X4, X3, X3 11197 VPACKSSDW X3, X3, X3 11198 VPACKSSWB X3, X3, X3 11199 VPAND X1, X3, X3 11200 VCMPPD $0x04, 64(SI)(CX*8), Y0, Y4 11201 VPUNPCKLDQ X3, X2, X2 11202 VEXTRACTF128 $0x01, Y4, X3 11203 VPACKSSDW X3, X4, X3 11204 VPACKSSDW X3, X3, X3 11205 VPACKSSWB X3, X3, X3 11206 VPAND X1, X3, X3 11207 VCMPPD $0x04, 96(SI)(CX*8), Y0, Y4 11208 VEXTRACTF128 $0x01, Y4, X5 11209 VPACKSSDW X5, X4, X4 11210 VPACKSSDW X4, X4, X4 11211 VPACKSSWB X4, X4, X4 11212 VPAND X1, X4, X4 11213 VPBROADCASTD X4, X4 11214 VPBROADCASTD X3, X3 11215 VPUNPCKLDQ X4, X3, X3 11216 VPBLENDD $0x0c, X3, X2, X2 11217 VMOVDQU X2, (DI)(CX*1) 11218 ADDQ $0x10, CX 11219 CMPQ AX, CX 11220 JNE LBB12_4 11221 CMPQ AX, DX 11222 JE LBB12_8 11223 11224 LBB12_6: 11225 VXORPD X0, X0, X0 11226 11227 LBB12_7: 11228 VUCOMISD (SI)(AX*8), X0 11229 SETNE (DI)(AX*1) 11230 ADDQ $0x01, AX 11231 CMPQ DX, AX 11232 JNE LBB12_7 11233 11234 LBB12_8: 11235 VZEROUPPER 11236 RET 11237 11238 DATA dataToBoolF32<>+0(SB)/1, $+1 11239 DATA dataToBoolF32<>+1(SB)/1, $+1 11240 DATA dataToBoolF32<>+2(SB)/1, $+1 11241 DATA dataToBoolF32<>+3(SB)/1, $+1 11242 DATA dataToBoolF32<>+4(SB)/1, $+1 11243 DATA dataToBoolF32<>+5(SB)/1, $+1 11244 DATA dataToBoolF32<>+6(SB)/1, $+1 11245 DATA dataToBoolF32<>+7(SB)/1, $+1 11246 DATA dataToBoolF32<>+8(SB)/1, $+0 11247 DATA dataToBoolF32<>+9(SB)/1, $+0 11248 DATA dataToBoolF32<>+10(SB)/1, $+0 11249 DATA dataToBoolF32<>+11(SB)/1, $+0 11250 DATA dataToBoolF32<>+12(SB)/1, $+0 11251 DATA dataToBoolF32<>+13(SB)/1, $+0 11252 DATA dataToBoolF32<>+14(SB)/1, $+0 11253 DATA dataToBoolF32<>+15(SB)/1, $+0 11254 GLOBL dataToBoolF32<>(SB), RODATA|NOPTR, $16 11255 11256 // func ToBool_AVX2_F32(x []bool, y []float32) 11257 // Requires: AVX, AVX2 11258 TEXT ·ToBool_AVX2_F32(SB), NOSPLIT, $0-48 11259 MOVQ x_base+0(FP), DI 11260 MOVQ y_base+24(FP), SI 11261 MOVQ x_len+8(FP), DX 11262 TESTQ DX, DX 11263 JE LBB13_8 11264 CMPQ DX, $0x20 11265 JAE LBB13_3 11266 XORL AX, AX 11267 JMP LBB13_6 11268 11269 LBB13_3: 11270 MOVQ DX, AX 11271 ANDQ $-32, AX 11272 XORL CX, CX 11273 VXORPS X0, X0, X0 11274 VMOVDQU dataToBoolF32<>+0(SB), X1 11275 11276 LBB13_4: 11277 VCMPPS $0x04, (SI)(CX*4), Y0, Y2 11278 VEXTRACTF128 $0x01, Y2, X3 11279 VPACKSSDW X3, X2, X2 11280 VPACKSSWB X2, X2, X2 11281 VCMPPS $0x04, 32(SI)(CX*4), Y0, Y3 11282 VPAND X1, X2, X2 11283 VEXTRACTF128 $0x01, Y3, X4 11284 VPACKSSDW X4, X3, X3 11285 VPACKSSWB X3, X3, X3 11286 VPAND X1, X3, X3 11287 VCMPPS $0x04, 64(SI)(CX*4), Y0, Y4 11288 VEXTRACTF128 $0x01, Y4, X5 11289 VPACKSSDW X5, X4, X4 11290 VPACKSSWB X4, X4, X4 11291 VCMPPS $0x04, 96(SI)(CX*4), Y0, Y5 11292 VPAND X1, X4, X4 11293 VEXTRACTF128 $0x01, Y5, X6 11294 VPACKSSDW X6, X5, X5 11295 VPACKSSWB X5, X5, X5 11296 VPAND X1, X5, X5 11297 VINSERTI128 $0x01, X5, Y4, Y4 11298 VINSERTI128 $0x01, X3, Y2, Y2 11299 VPUNPCKLQDQ Y4, Y2, Y2 11300 VPERMQ $0xd8, Y2, Y2 11301 VMOVDQU Y2, (DI)(CX*1) 11302 ADDQ $0x20, CX 11303 CMPQ AX, CX 11304 JNE LBB13_4 11305 CMPQ AX, DX 11306 JE LBB13_8 11307 11308 LBB13_6: 11309 VXORPS X0, X0, X0 11310 11311 LBB13_7: 11312 VUCOMISS (SI)(AX*4), X0 11313 SETNE (DI)(AX*1) 11314 ADDQ $0x01, AX 11315 CMPQ DX, AX 11316 JNE LBB13_7 11317 11318 LBB13_8: 11319 VZEROUPPER 11320 RET 11321 11322 // func ToInt32_AVX2_F64(x []int32, y []float64) 11323 // Requires: AVX 11324 TEXT ·ToInt32_AVX2_F64(SB), NOSPLIT, $0-48 11325 MOVQ x_base+0(FP), DI 11326 MOVQ y_base+24(FP), SI 11327 MOVQ x_len+8(FP), DX 11328 TESTQ DX, DX 11329 JE LBB16_11 11330 CMPQ DX, $0x10 11331 JAE LBB16_3 
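// Falling through the JAE above means len(x) < 16: AX is zeroed and the scalar
// tail at LBB16_10 truncates one element at a time with VCVTTSD2SI. Otherwise
// LBB16_3 runs the packed path, which truncates four float64 to four int32 per
// VCVTTPD2DQY and retires 32 elements per unrolled pass. Roughly the Go below
// (name illustrative; out-of-range values behave differently under the Go spec
// and the x86 truncating converts):
//
//    func toInt32F64(x []int32, y []float64) {
//        for i := range x {
//            x[i] = int32(y[i]) // truncation toward zero for in-range values
//        }
//    }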
11332 XORL AX, AX 11333 JMP LBB16_10 11334 11335 LBB16_3: 11336 MOVQ DX, AX 11337 ANDQ $-16, AX 11338 LEAQ -16(AX), CX 11339 MOVQ CX, R8 11340 SHRQ $0x04, R8 11341 ADDQ $0x01, R8 11342 TESTQ CX, CX 11343 JE LBB16_4 11344 MOVQ R8, R9 11345 ANDQ $-2, R9 11346 XORL CX, CX 11347 11348 LBB16_6: 11349 VCVTTPD2DQY (SI)(CX*8), X0 11350 VCVTTPD2DQY 32(SI)(CX*8), X1 11351 VCVTTPD2DQY 64(SI)(CX*8), X2 11352 VCVTTPD2DQY 96(SI)(CX*8), X3 11353 VMOVUPD X0, (DI)(CX*4) 11354 VMOVUPD X1, 16(DI)(CX*4) 11355 VMOVUPD X2, 32(DI)(CX*4) 11356 VMOVUPD X3, 48(DI)(CX*4) 11357 VCVTTPD2DQY 128(SI)(CX*8), X0 11358 VCVTTPD2DQY 160(SI)(CX*8), X1 11359 VCVTTPD2DQY 192(SI)(CX*8), X2 11360 VCVTTPD2DQY 224(SI)(CX*8), X3 11361 VMOVUPD X0, 64(DI)(CX*4) 11362 VMOVUPD X1, 80(DI)(CX*4) 11363 VMOVUPD X2, 96(DI)(CX*4) 11364 VMOVUPD X3, 112(DI)(CX*4) 11365 ADDQ $0x20, CX 11366 ADDQ $-2, R9 11367 JNE LBB16_6 11368 TESTB $0x01, R8 11369 JE LBB16_9 11370 11371 LBB16_8: 11372 VCVTTPD2DQY (SI)(CX*8), X0 11373 VCVTTPD2DQY 32(SI)(CX*8), X1 11374 VCVTTPD2DQY 64(SI)(CX*8), X2 11375 VCVTTPD2DQY 96(SI)(CX*8), X3 11376 VMOVUPD X0, (DI)(CX*4) 11377 VMOVUPD X1, 16(DI)(CX*4) 11378 VMOVUPD X2, 32(DI)(CX*4) 11379 VMOVUPD X3, 48(DI)(CX*4) 11380 11381 LBB16_9: 11382 CMPQ AX, DX 11383 JE LBB16_11 11384 11385 LBB16_10: 11386 VCVTTSD2SI (SI)(AX*8), CX 11387 MOVL CX, (DI)(AX*4) 11388 ADDQ $0x01, AX 11389 CMPQ DX, AX 11390 JNE LBB16_10 11391 11392 LBB16_11: 11393 RET 11394 11395 LBB16_4: 11396 XORL CX, CX 11397 TESTB $0x01, R8 11398 JNE LBB16_8 11399 JMP LBB16_9 11400 11401 // func ToInt32_AVX2_F32(x []int32, y []float32) 11402 // Requires: AVX 11403 TEXT ·ToInt32_AVX2_F32(SB), NOSPLIT, $0-48 11404 MOVQ x_base+0(FP), DI 11405 MOVQ y_base+24(FP), SI 11406 MOVQ x_len+8(FP), DX 11407 TESTQ DX, DX 11408 JE LBB17_11 11409 CMPQ DX, $0x20 11410 JAE LBB17_3 11411 XORL AX, AX 11412 JMP LBB17_10 11413 11414 LBB17_3: 11415 MOVQ DX, AX 11416 ANDQ $-32, AX 11417 LEAQ -32(AX), CX 11418 MOVQ CX, R8 11419 SHRQ $0x05, R8 11420 ADDQ $0x01, R8 11421 TESTQ CX, CX 11422 JE LBB17_4 11423 MOVQ R8, R9 11424 ANDQ $-2, R9 11425 XORL CX, CX 11426 11427 LBB17_6: 11428 VCVTTPS2DQ (SI)(CX*4), Y0 11429 VCVTTPS2DQ 32(SI)(CX*4), Y1 11430 VCVTTPS2DQ 64(SI)(CX*4), Y2 11431 VCVTTPS2DQ 96(SI)(CX*4), Y3 11432 VMOVUPS Y0, (DI)(CX*4) 11433 VMOVUPS Y1, 32(DI)(CX*4) 11434 VMOVUPS Y2, 64(DI)(CX*4) 11435 VMOVUPS Y3, 96(DI)(CX*4) 11436 VCVTTPS2DQ 128(SI)(CX*4), Y0 11437 VCVTTPS2DQ 160(SI)(CX*4), Y1 11438 VCVTTPS2DQ 192(SI)(CX*4), Y2 11439 VCVTTPS2DQ 224(SI)(CX*4), Y3 11440 VMOVUPS Y0, 128(DI)(CX*4) 11441 VMOVUPS Y1, 160(DI)(CX*4) 11442 VMOVUPS Y2, 192(DI)(CX*4) 11443 VMOVUPS Y3, 224(DI)(CX*4) 11444 ADDQ $0x40, CX 11445 ADDQ $-2, R9 11446 JNE LBB17_6 11447 TESTB $0x01, R8 11448 JE LBB17_9 11449 11450 LBB17_8: 11451 VCVTTPS2DQ (SI)(CX*4), Y0 11452 VCVTTPS2DQ 32(SI)(CX*4), Y1 11453 VCVTTPS2DQ 64(SI)(CX*4), Y2 11454 VCVTTPS2DQ 96(SI)(CX*4), Y3 11455 VMOVUPS Y0, (DI)(CX*4) 11456 VMOVUPS Y1, 32(DI)(CX*4) 11457 VMOVUPS Y2, 64(DI)(CX*4) 11458 VMOVUPS Y3, 96(DI)(CX*4) 11459 11460 LBB17_9: 11461 CMPQ AX, DX 11462 JE LBB17_11 11463 11464 LBB17_10: 11465 VCVTTSS2SI (SI)(AX*4), CX 11466 MOVL CX, (DI)(AX*4) 11467 ADDQ $0x01, AX 11468 CMPQ DX, AX 11469 JNE LBB17_10 11470 11471 LBB17_11: 11472 VZEROUPPER 11473 RET 11474 11475 LBB17_4: 11476 XORL CX, CX 11477 TESTB $0x01, R8 11478 JNE LBB17_8 11479 JMP LBB17_9 11480 11481 // func ToInt64_AVX2_F64(x []int64, y []float64) 11482 // Requires: AVX 11483 TEXT ·ToInt64_AVX2_F64(SB), NOSPLIT, $0-48 11484 MOVQ x_base+0(FP), DI 11485 MOVQ y_base+24(FP), SI 11486 MOVQ 
x_len+8(FP), DX 11487 TESTQ DX, DX 11488 JE LBB14_8 11489 LEAQ -1(DX), CX 11490 MOVL DX, R8 11491 ANDL $0x03, R8 11492 CMPQ CX, $0x03 11493 JAE LBB14_3 11494 XORL CX, CX 11495 JMP LBB14_5 11496 11497 LBB14_3: 11498 ANDQ $-4, DX 11499 XORL CX, CX 11500 11501 LBB14_4: 11502 VCVTTSD2SIQ (SI)(CX*8), AX 11503 MOVQ AX, (DI)(CX*8) 11504 VCVTTSD2SIQ 8(SI)(CX*8), AX 11505 MOVQ AX, 8(DI)(CX*8) 11506 VCVTTSD2SIQ 16(SI)(CX*8), AX 11507 MOVQ AX, 16(DI)(CX*8) 11508 VCVTTSD2SIQ 24(SI)(CX*8), AX 11509 MOVQ AX, 24(DI)(CX*8) 11510 ADDQ $0x04, CX 11511 CMPQ DX, CX 11512 JNE LBB14_4 11513 11514 LBB14_5: 11515 TESTQ R8, R8 11516 JE LBB14_8 11517 LEAQ (DI)(CX*8), DX 11518 LEAQ (SI)(CX*8), CX 11519 XORL SI, SI 11520 11521 LBB14_7: 11522 VCVTTSD2SIQ (CX)(SI*8), AX 11523 MOVQ AX, (DX)(SI*8) 11524 ADDQ $0x01, SI 11525 CMPQ R8, SI 11526 JNE LBB14_7 11527 11528 LBB14_8: 11529 RET 11530 11531 // func ToInt64_AVX2_F32(x []int64, y []float32) 11532 // Requires: AVX 11533 TEXT ·ToInt64_AVX2_F32(SB), NOSPLIT, $0-48 11534 MOVQ x_base+0(FP), DI 11535 MOVQ y_base+24(FP), SI 11536 MOVQ x_len+8(FP), DX 11537 TESTQ DX, DX 11538 JE LBB15_8 11539 LEAQ -1(DX), CX 11540 MOVL DX, R8 11541 ANDL $0x03, R8 11542 CMPQ CX, $0x03 11543 JAE LBB15_3 11544 XORL CX, CX 11545 JMP LBB15_5 11546 11547 LBB15_3: 11548 ANDQ $-4, DX 11549 XORL CX, CX 11550 11551 LBB15_4: 11552 VCVTTSS2SIQ (SI)(CX*4), AX 11553 MOVQ AX, (DI)(CX*8) 11554 VCVTTSS2SIQ 4(SI)(CX*4), AX 11555 MOVQ AX, 8(DI)(CX*8) 11556 VCVTTSS2SIQ 8(SI)(CX*4), AX 11557 MOVQ AX, 16(DI)(CX*8) 11558 VCVTTSS2SIQ 12(SI)(CX*4), AX 11559 MOVQ AX, 24(DI)(CX*8) 11560 ADDQ $0x04, CX 11561 CMPQ DX, CX 11562 JNE LBB15_4 11563 11564 LBB15_5: 11565 TESTQ R8, R8 11566 JE LBB15_8 11567 LEAQ (DI)(CX*8), DX 11568 LEAQ (SI)(CX*4), CX 11569 XORL SI, SI 11570 11571 LBB15_7: 11572 VCVTTSS2SIQ (CX)(SI*4), AX 11573 MOVQ AX, (DX)(SI*8) 11574 ADDQ $0x01, SI 11575 CMPQ R8, SI 11576 JNE LBB15_7 11577 11578 LBB15_8: 11579 RET
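// NOTE: reference sketch, not generated output. ToInt64_AVX2_F64 and
// ToInt64_AVX2_F32 above have no packed float -> int64 conversion available in
// AVX2, so both simply unroll the scalar truncating converts
// (VCVTTSD2SIQ / VCVTTSS2SIQ) four elements per pass; since no YMM register is
// touched, neither needs VZEROUPPER before RET. Assuming the generated stub
// signatures and len(y) >= len(x), both behave like this Go loop
// (name illustrative, out-of-range inputs aside):
//
//    func toInt64F64(x []int64, y []float64) {
//        for i := range x {
//            x[i] = int64(y[i]) // truncation toward zero
//        }
//    }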