gitee.com/quant1x/gox@v1.7.6/num/asm/comparison.go (about) 1 package main 2 3 import ( 4 . "github.com/mmcloughlin/avo/build" 5 . "github.com/mmcloughlin/avo/operand" 6 . "github.com/mmcloughlin/avo/reg" 7 ) 8 9 func genLt_F64() { 10 11 data := GLOBL("dataLtF64", RODATA|NOPTR) 12 DATA(0, U8(1)) 13 DATA(1, U8(1)) 14 DATA(2, U8(1)) 15 DATA(3, U8(1)) 16 DATA(4, U8(0)) 17 DATA(5, U8(0)) 18 DATA(6, U8(0)) 19 DATA(7, U8(0)) 20 DATA(8, U8(0)) 21 DATA(9, U8(0)) 22 DATA(10, U8(0)) 23 DATA(11, U8(0)) 24 DATA(12, U8(0)) 25 DATA(13, U8(0)) 26 DATA(14, U8(0)) 27 DATA(15, U8(0)) 28 29 TEXT("Lt_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)") 30 Pragma("noescape") 31 Load(Param("x").Base(), RDI) 32 Load(Param("y").Base(), RSI) 33 Load(Param("z").Base(), RDX) 34 Load(Param("x").Len(), RCX) 35 36 TESTQ(RCX, RCX) 37 JE(LabelRef("LBB0_7")) 38 CMPQ(RCX, Imm(16)) 39 JAE(LabelRef("LBB0_3")) 40 XORL(R8L, R8L) 41 JMP(LabelRef("LBB0_6")) 42 43 Label("LBB0_3") 44 { 45 MOVQ(RCX, R8) 46 ANDQ(I32(-16), R8) 47 XORL(EAX, EAX) 48 VMOVDQU(data.Offset(0), X0) 49 } 50 51 Label("LBB0_4") 52 { 53 VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8), Y1) 54 VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2) 55 VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3) 56 VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4) 57 VCMPPD(Imm(1), Mem{Base: RDX}.Idx(RAX, 8), Y1, Y1) 58 VEXTRACTF128(Imm(1), Y1, X5) 59 VPACKSSDW(X5, X1, X1) 60 VPACKSSDW(X1, X1, X1) 61 VPACKSSWB(X1, X1, X1) 62 VCMPPD(Imm(1), Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2, Y2) 63 VPAND(X0, X1, X1) 64 VEXTRACTF128(Imm(1), Y2, X5) 65 VPACKSSDW(X5, X2, X2) 66 VPACKSSDW(X2, X2, X2) 67 VPACKSSWB(X2, X2, X2) 68 VPAND(X0, X2, X2) 69 VCMPPD(Imm(1), Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3, Y3) 70 VPUNPCKLDQ(X2, X1, X1) 71 VEXTRACTF128(Imm(1), Y3, X2) 72 VPACKSSDW(X2, X3, X2) 73 VPACKSSDW(X2, X2, X2) 74 VPACKSSWB(X2, X2, X2) 75 VPAND(X0, X2, X2) 76 VCMPPD(Imm(1), Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4, Y3) 77 VEXTRACTF128(Imm(1), Y3, X4) 78 VPACKSSDW(X4, X3, X3) 79 VPACKSSDW(X3, X3, X3) 80 VPACKSSWB(X3, X3, X3) 81 VPAND(X0, X3, X3) 82 VPBROADCASTD(X3, X3) 83 VPBROADCASTD(X2, X2) 84 VPUNPCKLDQ(X3, X2, X2) 85 VPBLENDD(Imm(12), X2, X1, X1) 86 VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1)) 87 ADDQ(Imm(16), RAX) 88 CMPQ(R8, RAX) 89 JNE(LabelRef("LBB0_4")) 90 CMPQ(R8, RCX) 91 JE(LabelRef("LBB0_7")) 92 } 93 94 Label("LBB0_6") 95 { 96 VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0) 97 VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0) 98 SETCS(Mem{Base: RDI}.Idx(R8, 1)) 99 ADDQ(Imm(1), R8) 100 CMPQ(RCX, R8) 101 JNE(LabelRef("LBB0_6")) 102 } 103 104 Label("LBB0_7") 105 { 106 VZEROUPPER() 107 RET() 108 } 109 } 110 111 func genLt_F32() { 112 113 data := GLOBL("dataLtF32", RODATA|NOPTR) 114 DATA(0, U8(1)) 115 DATA(1, U8(1)) 116 DATA(2, U8(1)) 117 DATA(3, U8(1)) 118 DATA(4, U8(1)) 119 DATA(5, U8(1)) 120 DATA(6, U8(1)) 121 DATA(7, U8(1)) 122 DATA(8, U8(0)) 123 DATA(9, U8(0)) 124 DATA(10, U8(0)) 125 DATA(11, U8(0)) 126 DATA(12, U8(0)) 127 DATA(13, U8(0)) 128 DATA(14, U8(0)) 129 DATA(15, U8(0)) 130 131 TEXT("Lt_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)") 132 Pragma("noescape") 133 Load(Param("x").Base(), RDI) 134 Load(Param("y").Base(), RSI) 135 Load(Param("z").Base(), RDX) 136 Load(Param("x").Len(), RCX) 137 138 TESTQ(RCX, RCX) 139 JE(LabelRef("LBB1_7")) 140 CMPQ(RCX, Imm(32)) 141 JAE(LabelRef("LBB1_3")) 142 XORL(R8L, R8L) 143 JMP(LabelRef("LBB1_6")) 144 145 Label("LBB1_3") 146 { 147 MOVQ(RCX, R8) 148 ANDQ(I32(-32), R8) 149 XORL(EAX, EAX) 150 VMOVDQU(data.Offset(0), X0) 151 } 152 153 Label("LBB1_4") 154 { 155 VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4), Y1) 156 VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2) 157 VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3) 158 VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4) 159 VCMPPS(Imm(1), Mem{Base: RDX}.Idx(RAX, 4), Y1, Y1) 160 VEXTRACTF128(Imm(1), Y1, X5) 161 VPACKSSDW(X5, X1, X1) 162 VPACKSSWB(X1, X1, X1) 163 VCMPPS(Imm(1), Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2, Y2) 164 VPAND(X0, X1, X1) 165 VEXTRACTF128(Imm(1), Y2, X5) 166 VPACKSSDW(X5, X2, X2) 167 VPACKSSWB(X2, X2, X2) 168 VPAND(X0, X2, X2) 169 VCMPPS(Imm(1), Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3, Y3) 170 VEXTRACTF128(Imm(1), Y3, X5) 171 VPACKSSDW(X5, X3, X3) 172 VPACKSSWB(X3, X3, X3) 173 VCMPPS(Imm(1), Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4, Y4) 174 VPAND(X0, X3, X3) 175 VEXTRACTF128(Imm(1), Y4, X5) 176 VPACKSSDW(X5, X4, X4) 177 VPACKSSWB(X4, X4, X4) 178 VPAND(X0, X4, X4) 179 VINSERTI128(Imm(1), X4, Y3, Y3) 180 VINSERTI128(Imm(1), X2, Y1, Y1) 181 VPUNPCKLQDQ(Y3, Y1, Y1) 182 VPERMQ(Imm(216), Y1, Y1) 183 VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1)) 184 ADDQ(Imm(32), RAX) 185 CMPQ(R8, RAX) 186 JNE(LabelRef("LBB1_4")) 187 CMPQ(R8, RCX) 188 JE(LabelRef("LBB1_7")) 189 } 190 191 Label("LBB1_6") 192 { 193 VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0) 194 VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0) 195 SETCS(Mem{Base: RDI}.Idx(R8, 1)) 196 ADDQ(Imm(1), R8) 197 CMPQ(RCX, R8) 198 JNE(LabelRef("LBB1_6")) 199 } 200 201 Label("LBB1_7") 202 { 203 VZEROUPPER() 204 RET() 205 } 206 } 207 208 func genLte_F64() { 209 210 data := GLOBL("dataLteF64", RODATA|NOPTR) 211 DATA(0, U8(1)) 212 DATA(1, U8(1)) 213 DATA(2, U8(1)) 214 DATA(3, U8(1)) 215 DATA(4, U8(0)) 216 DATA(5, U8(0)) 217 DATA(6, U8(0)) 218 DATA(7, U8(0)) 219 DATA(8, U8(0)) 220 DATA(9, U8(0)) 221 DATA(10, U8(0)) 222 DATA(11, U8(0)) 223 DATA(12, U8(0)) 224 DATA(13, U8(0)) 225 DATA(14, U8(0)) 226 DATA(15, U8(0)) 227 228 TEXT("Lte_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)") 229 Pragma("noescape") 230 Load(Param("x").Base(), RDI) 231 Load(Param("y").Base(), RSI) 232 Load(Param("z").Base(), RDX) 233 Load(Param("x").Len(), RCX) 234 235 TESTQ(RCX, RCX) 236 JE(LabelRef("LBB2_7")) 237 CMPQ(RCX, Imm(16)) 238 JAE(LabelRef("LBB2_3")) 239 XORL(R8L, R8L) 240 JMP(LabelRef("LBB2_6")) 241 242 Label("LBB2_3") 243 { 244 MOVQ(RCX, R8) 245 ANDQ(I32(-16), R8) 246 XORL(EAX, EAX) 247 VMOVDQU(data.Offset(0), X0) 248 } 249 250 Label("LBB2_4") 251 { 252 VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8), Y1) 253 VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2) 254 VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3) 255 VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4) 256 VCMPPD(Imm(2), Mem{Base: RDX}.Idx(RAX, 8), Y1, Y1) 257 VEXTRACTF128(Imm(1), Y1, X5) 258 VPACKSSDW(X5, X1, X1) 259 VPACKSSDW(X1, X1, X1) 260 VPACKSSWB(X1, X1, X1) 261 VCMPPD(Imm(2), Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2, Y2) 262 VPAND(X0, X1, X1) 263 VEXTRACTF128(Imm(1), Y2, X5) 264 VPACKSSDW(X5, X2, X2) 265 VPACKSSDW(X2, X2, X2) 266 VPACKSSWB(X2, X2, X2) 267 VPAND(X0, X2, X2) 268 VCMPPD(Imm(2), Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3, Y3) 269 VPUNPCKLDQ(X2, X1, X1) 270 VEXTRACTF128(Imm(1), Y3, X2) 271 VPACKSSDW(X2, X3, X2) 272 VPACKSSDW(X2, X2, X2) 273 VPACKSSWB(X2, X2, X2) 274 VPAND(X0, X2, X2) 275 VCMPPD(Imm(2), Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4, Y3) 276 VEXTRACTF128(Imm(1), Y3, X4) 277 VPACKSSDW(X4, X3, X3) 278 VPACKSSDW(X3, X3, X3) 279 VPACKSSWB(X3, X3, X3) 280 VPAND(X0, X3, X3) 281 VPBROADCASTD(X3, X3) 282 VPBROADCASTD(X2, X2) 283 VPUNPCKLDQ(X3, X2, X2) 284 VPBLENDD(Imm(12), X2, X1, X1) 285 VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1)) 286 ADDQ(Imm(16), RAX) 287 CMPQ(R8, RAX) 288 JNE(LabelRef("LBB2_4")) 289 CMPQ(R8, RCX) 290 JE(LabelRef("LBB2_7")) 291 } 292 293 Label("LBB2_6") 294 { 295 VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0) 296 VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0) 297 SETLS(Mem{Base: RDI}.Idx(R8, 1)) 298 ADDQ(Imm(1), R8) 299 CMPQ(RCX, R8) 300 JNE(LabelRef("LBB2_6")) 301 } 302 303 Label("LBB2_7") 304 { 305 VZEROUPPER() 306 RET() 307 } 308 } 309 310 func genLte_F32() { 311 312 data := GLOBL("dataLteF32", RODATA|NOPTR) 313 DATA(0, U8(1)) 314 DATA(1, U8(1)) 315 DATA(2, U8(1)) 316 DATA(3, U8(1)) 317 DATA(4, U8(1)) 318 DATA(5, U8(1)) 319 DATA(6, U8(1)) 320 DATA(7, U8(1)) 321 DATA(8, U8(0)) 322 DATA(9, U8(0)) 323 DATA(10, U8(0)) 324 DATA(11, U8(0)) 325 DATA(12, U8(0)) 326 DATA(13, U8(0)) 327 DATA(14, U8(0)) 328 DATA(15, U8(0)) 329 330 TEXT("Lte_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)") 331 Pragma("noescape") 332 Load(Param("x").Base(), RDI) 333 Load(Param("y").Base(), RSI) 334 Load(Param("z").Base(), RDX) 335 Load(Param("x").Len(), RCX) 336 337 TESTQ(RCX, RCX) 338 JE(LabelRef("LBB3_7")) 339 CMPQ(RCX, Imm(32)) 340 JAE(LabelRef("LBB3_3")) 341 XORL(R8L, R8L) 342 JMP(LabelRef("LBB3_6")) 343 344 Label("LBB3_3") 345 { 346 MOVQ(RCX, R8) 347 ANDQ(I32(-32), R8) 348 XORL(EAX, EAX) 349 VMOVDQU(data.Offset(0), X0) 350 } 351 352 Label("LBB3_4") 353 { 354 VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4), Y1) 355 VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2) 356 VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3) 357 VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4) 358 VCMPPS(Imm(2), Mem{Base: RDX}.Idx(RAX, 4), Y1, Y1) 359 VEXTRACTF128(Imm(1), Y1, X5) 360 VPACKSSDW(X5, X1, X1) 361 VPACKSSWB(X1, X1, X1) 362 VCMPPS(Imm(2), Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2, Y2) 363 VPAND(X0, X1, X1) 364 VEXTRACTF128(Imm(1), Y2, X5) 365 VPACKSSDW(X5, X2, X2) 366 VPACKSSWB(X2, X2, X2) 367 VPAND(X0, X2, X2) 368 VCMPPS(Imm(2), Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3, Y3) 369 VEXTRACTF128(Imm(1), Y3, X5) 370 VPACKSSDW(X5, X3, X3) 371 VPACKSSWB(X3, X3, X3) 372 VCMPPS(Imm(2), Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4, Y4) 373 VPAND(X0, X3, X3) 374 VEXTRACTF128(Imm(1), Y4, X5) 375 VPACKSSDW(X5, X4, X4) 376 VPACKSSWB(X4, X4, X4) 377 VPAND(X0, X4, X4) 378 VINSERTI128(Imm(1), X4, Y3, Y3) 379 VINSERTI128(Imm(1), X2, Y1, Y1) 380 VPUNPCKLQDQ(Y3, Y1, Y1) 381 VPERMQ(Imm(216), Y1, Y1) 382 VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1)) 383 ADDQ(Imm(32), RAX) 384 CMPQ(R8, RAX) 385 JNE(LabelRef("LBB3_4")) 386 CMPQ(R8, RCX) 387 JE(LabelRef("LBB3_7")) 388 } 389 390 Label("LBB3_6") 391 { 392 VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0) 393 VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0) 394 SETLS(Mem{Base: RDI}.Idx(R8, 1)) 395 ADDQ(Imm(1), R8) 396 CMPQ(RCX, R8) 397 JNE(LabelRef("LBB3_6")) 398 } 399 400 Label("LBB3_7") 401 { 402 VZEROUPPER() 403 RET() 404 } 405 } 406 407 func genGt_F64() { 408 409 data := GLOBL("dataGtF64", RODATA|NOPTR) 410 DATA(0, U8(1)) 411 DATA(1, U8(1)) 412 DATA(2, U8(1)) 413 DATA(3, U8(1)) 414 DATA(4, U8(0)) 415 DATA(5, U8(0)) 416 DATA(6, U8(0)) 417 DATA(7, U8(0)) 418 DATA(8, U8(0)) 419 DATA(9, U8(0)) 420 DATA(10, U8(0)) 421 DATA(11, U8(0)) 422 DATA(12, U8(0)) 423 DATA(13, U8(0)) 424 DATA(14, U8(0)) 425 DATA(15, U8(0)) 426 427 TEXT("Gt_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)") 428 Pragma("noescape") 429 Load(Param("x").Base(), RDI) 430 Load(Param("y").Base(), RSI) 431 Load(Param("z").Base(), RDX) 432 Load(Param("x").Len(), RCX) 433 434 TESTQ(RCX, RCX) 435 JE(LabelRef("LBB4_7")) 436 CMPQ(RCX, Imm(16)) 437 JAE(LabelRef("LBB4_3")) 438 XORL(R8L, R8L) 439 JMP(LabelRef("LBB4_6")) 440 441 Label("LBB4_3") 442 { 443 MOVQ(RCX, R8) 444 ANDQ(I32(-16), R8) 445 XORL(EAX, EAX) 446 VMOVDQU(data.Offset(0), X0) 447 } 448 449 Label("LBB4_4") 450 { 451 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8), Y1) 452 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2) 453 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3) 454 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4) 455 VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RAX, 8), Y1, Y1) 456 VEXTRACTF128(Imm(1), Y1, X5) 457 VPACKSSDW(X5, X1, X1) 458 VPACKSSDW(X1, X1, X1) 459 VPACKSSWB(X1, X1, X1) 460 VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2, Y2) 461 VPAND(X0, X1, X1) 462 VEXTRACTF128(Imm(1), Y2, X5) 463 VPACKSSDW(X5, X2, X2) 464 VPACKSSDW(X2, X2, X2) 465 VPACKSSWB(X2, X2, X2) 466 VPAND(X0, X2, X2) 467 VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3, Y3) 468 VPUNPCKLDQ(X2, X1, X1) 469 VEXTRACTF128(Imm(1), Y3, X2) 470 VPACKSSDW(X2, X3, X2) 471 VPACKSSDW(X2, X2, X2) 472 VPACKSSWB(X2, X2, X2) 473 VPAND(X0, X2, X2) 474 VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4, Y3) 475 VEXTRACTF128(Imm(1), Y3, X4) 476 VPACKSSDW(X4, X3, X3) 477 VPACKSSDW(X3, X3, X3) 478 VPACKSSWB(X3, X3, X3) 479 VPAND(X0, X3, X3) 480 VPBROADCASTD(X3, X3) 481 VPBROADCASTD(X2, X2) 482 VPUNPCKLDQ(X3, X2, X2) 483 VPBLENDD(Imm(12), X2, X1, X1) 484 VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1)) 485 ADDQ(Imm(16), RAX) 486 CMPQ(R8, RAX) 487 JNE(LabelRef("LBB4_4")) 488 CMPQ(R8, RCX) 489 JE(LabelRef("LBB4_7")) 490 } 491 492 Label("LBB4_6") 493 { 494 VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0) 495 VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0) 496 SETHI(Mem{Base: RDI}.Idx(R8, 1)) 497 ADDQ(Imm(1), R8) 498 CMPQ(RCX, R8) 499 JNE(LabelRef("LBB4_6")) 500 } 501 502 Label("LBB4_7") 503 { 504 VZEROUPPER() 505 RET() 506 } 507 } 508 509 func genGt_F32() { 510 511 data := GLOBL("dataGtF32", RODATA|NOPTR) 512 DATA(0, U8(1)) 513 DATA(1, U8(1)) 514 DATA(2, U8(1)) 515 DATA(3, U8(1)) 516 DATA(4, U8(1)) 517 DATA(5, U8(1)) 518 DATA(6, U8(1)) 519 DATA(7, U8(1)) 520 DATA(8, U8(0)) 521 DATA(9, U8(0)) 522 DATA(10, U8(0)) 523 DATA(11, U8(0)) 524 DATA(12, U8(0)) 525 DATA(13, U8(0)) 526 DATA(14, U8(0)) 527 DATA(15, U8(0)) 528 529 TEXT("Gt_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)") 530 Pragma("noescape") 531 Load(Param("x").Base(), RDI) 532 Load(Param("y").Base(), RSI) 533 Load(Param("z").Base(), RDX) 534 Load(Param("x").Len(), RCX) 535 536 TESTQ(RCX, RCX) 537 JE(LabelRef("LBB5_7")) 538 CMPQ(RCX, Imm(32)) 539 JAE(LabelRef("LBB5_3")) 540 XORL(R8L, R8L) 541 JMP(LabelRef("LBB5_6")) 542 543 Label("LBB5_3") 544 { 545 MOVQ(RCX, R8) 546 ANDQ(I32(-32), R8) 547 XORL(EAX, EAX) 548 VMOVDQU(data.Offset(0), X0) 549 } 550 551 Label("LBB5_4") 552 { 553 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4), Y1) 554 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2) 555 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3) 556 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4) 557 VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RAX, 4), Y1, Y1) 558 VEXTRACTF128(Imm(1), Y1, X5) 559 VPACKSSDW(X5, X1, X1) 560 VPACKSSWB(X1, X1, X1) 561 VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2, Y2) 562 VPAND(X0, X1, X1) 563 VEXTRACTF128(Imm(1), Y2, X5) 564 VPACKSSDW(X5, X2, X2) 565 VPACKSSWB(X2, X2, X2) 566 VPAND(X0, X2, X2) 567 VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3, Y3) 568 VEXTRACTF128(Imm(1), Y3, X5) 569 VPACKSSDW(X5, X3, X3) 570 VPACKSSWB(X3, X3, X3) 571 VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4, Y4) 572 VPAND(X0, X3, X3) 573 VEXTRACTF128(Imm(1), Y4, X5) 574 VPACKSSDW(X5, X4, X4) 575 VPACKSSWB(X4, X4, X4) 576 VPAND(X0, X4, X4) 577 VINSERTI128(Imm(1), X4, Y3, Y3) 578 VINSERTI128(Imm(1), X2, Y1, Y1) 579 VPUNPCKLQDQ(Y3, Y1, Y1) 580 VPERMQ(Imm(216), Y1, Y1) 581 VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1)) 582 ADDQ(Imm(32), RAX) 583 CMPQ(R8, RAX) 584 JNE(LabelRef("LBB5_4")) 585 CMPQ(R8, RCX) 586 JE(LabelRef("LBB5_7")) 587 } 588 589 Label("LBB5_6") 590 { 591 VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0) 592 VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0) 593 SETHI(Mem{Base: RDI}.Idx(R8, 1)) 594 ADDQ(Imm(1), R8) 595 CMPQ(RCX, R8) 596 JNE(LabelRef("LBB5_6")) 597 } 598 599 Label("LBB5_7") 600 { 601 VZEROUPPER() 602 RET() 603 } 604 } 605 606 func genGte_F64() { 607 608 data := GLOBL("dataGteF64", RODATA|NOPTR) 609 DATA(0, U8(1)) 610 DATA(1, U8(1)) 611 DATA(2, U8(1)) 612 DATA(3, U8(1)) 613 DATA(4, U8(0)) 614 DATA(5, U8(0)) 615 DATA(6, U8(0)) 616 DATA(7, U8(0)) 617 DATA(8, U8(0)) 618 DATA(9, U8(0)) 619 DATA(10, U8(0)) 620 DATA(11, U8(0)) 621 DATA(12, U8(0)) 622 DATA(13, U8(0)) 623 DATA(14, U8(0)) 624 DATA(15, U8(0)) 625 626 TEXT("Gte_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)") 627 Pragma("noescape") 628 Load(Param("x").Base(), RDI) 629 Load(Param("y").Base(), RSI) 630 Load(Param("z").Base(), RDX) 631 Load(Param("x").Len(), RCX) 632 633 TESTQ(RCX, RCX) 634 JE(LabelRef("LBB6_7")) 635 CMPQ(RCX, Imm(16)) 636 JAE(LabelRef("LBB6_3")) 637 XORL(R8L, R8L) 638 JMP(LabelRef("LBB6_6")) 639 640 Label("LBB6_3") 641 { 642 MOVQ(RCX, R8) 643 ANDQ(I32(-16), R8) 644 XORL(EAX, EAX) 645 VMOVDQU(data.Offset(0), X0) 646 } 647 648 Label("LBB6_4") 649 { 650 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8), Y1) 651 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2) 652 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3) 653 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4) 654 VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RAX, 8), Y1, Y1) 655 VEXTRACTF128(Imm(1), Y1, X5) 656 VPACKSSDW(X5, X1, X1) 657 VPACKSSDW(X1, X1, X1) 658 VPACKSSWB(X1, X1, X1) 659 VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2, Y2) 660 VPAND(X0, X1, X1) 661 VEXTRACTF128(Imm(1), Y2, X5) 662 VPACKSSDW(X5, X2, X2) 663 VPACKSSDW(X2, X2, X2) 664 VPACKSSWB(X2, X2, X2) 665 VPAND(X0, X2, X2) 666 VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3, Y3) 667 VPUNPCKLDQ(X2, X1, X1) 668 VEXTRACTF128(Imm(1), Y3, X2) 669 VPACKSSDW(X2, X3, X2) 670 VPACKSSDW(X2, X2, X2) 671 VPACKSSWB(X2, X2, X2) 672 VPAND(X0, X2, X2) 673 VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4, Y3) 674 VEXTRACTF128(Imm(1), Y3, X4) 675 VPACKSSDW(X4, X3, X3) 676 VPACKSSDW(X3, X3, X3) 677 VPACKSSWB(X3, X3, X3) 678 VPAND(X0, X3, X3) 679 VPBROADCASTD(X3, X3) 680 VPBROADCASTD(X2, X2) 681 VPUNPCKLDQ(X3, X2, X2) 682 VPBLENDD(Imm(12), X2, X1, X1) 683 VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1)) 684 ADDQ(Imm(16), RAX) 685 CMPQ(R8, RAX) 686 JNE(LabelRef("LBB6_4")) 687 CMPQ(R8, RCX) 688 JE(LabelRef("LBB6_7")) 689 } 690 691 Label("LBB6_6") 692 { 693 VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0) 694 VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0) 695 SETCC(Mem{Base: RDI}.Idx(R8, 1)) 696 ADDQ(Imm(1), R8) 697 CMPQ(RCX, R8) 698 JNE(LabelRef("LBB6_6")) 699 } 700 701 Label("LBB6_7") 702 { 703 VZEROUPPER() 704 RET() 705 } 706 } 707 708 func genGte_F32() { 709 710 data := GLOBL("dataGteF32", RODATA|NOPTR) 711 DATA(0, U8(1)) 712 DATA(1, U8(1)) 713 DATA(2, U8(1)) 714 DATA(3, U8(1)) 715 DATA(4, U8(1)) 716 DATA(5, U8(1)) 717 DATA(6, U8(1)) 718 DATA(7, U8(1)) 719 DATA(8, U8(0)) 720 DATA(9, U8(0)) 721 DATA(10, U8(0)) 722 DATA(11, U8(0)) 723 DATA(12, U8(0)) 724 DATA(13, U8(0)) 725 DATA(14, U8(0)) 726 DATA(15, U8(0)) 727 728 TEXT("Gte_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)") 729 Pragma("noescape") 730 Load(Param("x").Base(), RDI) 731 Load(Param("y").Base(), RSI) 732 Load(Param("z").Base(), RDX) 733 Load(Param("x").Len(), RCX) 734 735 TESTQ(RCX, RCX) 736 JE(LabelRef("LBB7_7")) 737 CMPQ(RCX, Imm(32)) 738 JAE(LabelRef("LBB7_3")) 739 XORL(R8L, R8L) 740 JMP(LabelRef("LBB7_6")) 741 742 Label("LBB7_3") 743 { 744 MOVQ(RCX, R8) 745 ANDQ(I32(-32), R8) 746 XORL(EAX, EAX) 747 VMOVDQU(data.Offset(0), X0) 748 } 749 750 Label("LBB7_4") 751 { 752 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4), Y1) 753 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2) 754 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3) 755 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4) 756 VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RAX, 4), Y1, Y1) 757 VEXTRACTF128(Imm(1), Y1, X5) 758 VPACKSSDW(X5, X1, X1) 759 VPACKSSWB(X1, X1, X1) 760 VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2, Y2) 761 VPAND(X0, X1, X1) 762 VEXTRACTF128(Imm(1), Y2, X5) 763 VPACKSSDW(X5, X2, X2) 764 VPACKSSWB(X2, X2, X2) 765 VPAND(X0, X2, X2) 766 VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3, Y3) 767 VEXTRACTF128(Imm(1), Y3, X5) 768 VPACKSSDW(X5, X3, X3) 769 VPACKSSWB(X3, X3, X3) 770 VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4, Y4) 771 VPAND(X0, X3, X3) 772 VEXTRACTF128(Imm(1), Y4, X5) 773 VPACKSSDW(X5, X4, X4) 774 VPACKSSWB(X4, X4, X4) 775 VPAND(X0, X4, X4) 776 VINSERTI128(Imm(1), X4, Y3, Y3) 777 VINSERTI128(Imm(1), X2, Y1, Y1) 778 VPUNPCKLQDQ(Y3, Y1, Y1) 779 VPERMQ(Imm(216), Y1, Y1) 780 VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1)) 781 ADDQ(Imm(32), RAX) 782 CMPQ(R8, RAX) 783 JNE(LabelRef("LBB7_4")) 784 CMPQ(R8, RCX) 785 JE(LabelRef("LBB7_7")) 786 } 787 788 Label("LBB7_6") 789 { 790 VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0) 791 VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0) 792 SETCC(Mem{Base: RDI}.Idx(R8, 1)) 793 ADDQ(Imm(1), R8) 794 CMPQ(RCX, R8) 795 JNE(LabelRef("LBB7_6")) 796 } 797 798 Label("LBB7_7") 799 { 800 VZEROUPPER() 801 RET() 802 } 803 } 804 805 func genEq_F64() { 806 807 data := GLOBL("dataEqF64", RODATA|NOPTR) 808 DATA(0, U8(1)) 809 DATA(1, U8(1)) 810 DATA(2, U8(1)) 811 DATA(3, U8(1)) 812 DATA(4, U8(0)) 813 DATA(5, U8(0)) 814 DATA(6, U8(0)) 815 DATA(7, U8(0)) 816 DATA(8, U8(0)) 817 DATA(9, U8(0)) 818 DATA(10, U8(0)) 819 DATA(11, U8(0)) 820 DATA(12, U8(0)) 821 DATA(13, U8(0)) 822 DATA(14, U8(0)) 823 DATA(15, U8(0)) 824 825 TEXT("Eq_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)") 826 Pragma("noescape") 827 Load(Param("x").Base(), RDI) 828 Load(Param("y").Base(), RSI) 829 Load(Param("z").Base(), RDX) 830 Load(Param("x").Len(), RCX) 831 832 TESTQ(RCX, RCX) 833 JE(LabelRef("LBB8_7")) 834 CMPQ(RCX, Imm(16)) 835 JAE(LabelRef("LBB8_3")) 836 XORL(R8L, R8L) 837 JMP(LabelRef("LBB8_6")) 838 839 Label("LBB8_3") 840 { 841 MOVQ(RCX, R8) 842 ANDQ(I32(-16), R8) 843 XORL(EAX, EAX) 844 VMOVDQU(data.Offset(0), X0) 845 } 846 847 Label("LBB8_4") 848 { 849 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8), Y1) 850 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2) 851 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3) 852 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4) 853 VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RAX, 8), Y1, Y1) 854 VEXTRACTF128(Imm(1), Y1, X5) 855 VPACKSSDW(X5, X1, X1) 856 VPACKSSDW(X1, X1, X1) 857 VPACKSSWB(X1, X1, X1) 858 VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2, Y2) 859 VPAND(X0, X1, X1) 860 VEXTRACTF128(Imm(1), Y2, X5) 861 VPACKSSDW(X5, X2, X2) 862 VPACKSSDW(X2, X2, X2) 863 VPACKSSWB(X2, X2, X2) 864 VPAND(X0, X2, X2) 865 VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3, Y3) 866 VPUNPCKLDQ(X2, X1, X1) 867 VEXTRACTF128(Imm(1), Y3, X2) 868 VPACKSSDW(X2, X3, X2) 869 VPACKSSDW(X2, X2, X2) 870 VPACKSSWB(X2, X2, X2) 871 VPAND(X0, X2, X2) 872 VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4, Y3) 873 VEXTRACTF128(Imm(1), Y3, X4) 874 VPACKSSDW(X4, X3, X3) 875 VPACKSSDW(X3, X3, X3) 876 VPACKSSWB(X3, X3, X3) 877 VPAND(X0, X3, X3) 878 VPBROADCASTD(X3, X3) 879 VPBROADCASTD(X2, X2) 880 VPUNPCKLDQ(X3, X2, X2) 881 VPBLENDD(Imm(12), X2, X1, X1) 882 VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1)) 883 ADDQ(Imm(16), RAX) 884 CMPQ(R8, RAX) 885 JNE(LabelRef("LBB8_4")) 886 CMPQ(R8, RCX) 887 JE(LabelRef("LBB8_7")) 888 } 889 890 Label("LBB8_6") 891 { 892 VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0) 893 VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0) 894 SETEQ(Mem{Base: RDI}.Idx(R8, 1)) 895 ADDQ(Imm(1), R8) 896 CMPQ(RCX, R8) 897 JNE(LabelRef("LBB8_6")) 898 } 899 900 Label("LBB8_7") 901 { 902 VZEROUPPER() 903 RET() 904 } 905 } 906 907 func genEq_F32() { 908 909 data := GLOBL("dataEqF32", RODATA|NOPTR) 910 DATA(0, U8(1)) 911 DATA(1, U8(1)) 912 DATA(2, U8(1)) 913 DATA(3, U8(1)) 914 DATA(4, U8(1)) 915 DATA(5, U8(1)) 916 DATA(6, U8(1)) 917 DATA(7, U8(1)) 918 DATA(8, U8(0)) 919 DATA(9, U8(0)) 920 DATA(10, U8(0)) 921 DATA(11, U8(0)) 922 DATA(12, U8(0)) 923 DATA(13, U8(0)) 924 DATA(14, U8(0)) 925 DATA(15, U8(0)) 926 927 TEXT("Eq_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)") 928 Pragma("noescape") 929 Load(Param("x").Base(), RDI) 930 Load(Param("y").Base(), RSI) 931 Load(Param("z").Base(), RDX) 932 Load(Param("x").Len(), RCX) 933 934 TESTQ(RCX, RCX) 935 JE(LabelRef("LBB9_7")) 936 CMPQ(RCX, Imm(32)) 937 JAE(LabelRef("LBB9_3")) 938 XORL(R8L, R8L) 939 JMP(LabelRef("LBB9_6")) 940 941 Label("LBB9_3") 942 { 943 MOVQ(RCX, R8) 944 ANDQ(I32(-32), R8) 945 XORL(EAX, EAX) 946 VMOVDQU(data.Offset(0), X0) 947 } 948 949 Label("LBB9_4") 950 { 951 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4), Y1) 952 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2) 953 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3) 954 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4) 955 VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RAX, 4), Y1, Y1) 956 VEXTRACTF128(Imm(1), Y1, X5) 957 VPACKSSDW(X5, X1, X1) 958 VPACKSSWB(X1, X1, X1) 959 VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2, Y2) 960 VPAND(X0, X1, X1) 961 VEXTRACTF128(Imm(1), Y2, X5) 962 VPACKSSDW(X5, X2, X2) 963 VPACKSSWB(X2, X2, X2) 964 VPAND(X0, X2, X2) 965 VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3, Y3) 966 VEXTRACTF128(Imm(1), Y3, X5) 967 VPACKSSDW(X5, X3, X3) 968 VPACKSSWB(X3, X3, X3) 969 VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4, Y4) 970 VPAND(X0, X3, X3) 971 VEXTRACTF128(Imm(1), Y4, X5) 972 VPACKSSDW(X5, X4, X4) 973 VPACKSSWB(X4, X4, X4) 974 VPAND(X0, X4, X4) 975 VINSERTI128(Imm(1), X4, Y3, Y3) 976 VINSERTI128(Imm(1), X2, Y1, Y1) 977 VPUNPCKLQDQ(Y3, Y1, Y1) 978 VPERMQ(Imm(216), Y1, Y1) 979 VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1)) 980 ADDQ(Imm(32), RAX) 981 CMPQ(R8, RAX) 982 JNE(LabelRef("LBB9_4")) 983 CMPQ(R8, RCX) 984 JE(LabelRef("LBB9_7")) 985 } 986 987 Label("LBB9_6") 988 { 989 VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0) 990 VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0) 991 SETEQ(Mem{Base: RDI}.Idx(R8, 1)) 992 ADDQ(Imm(1), R8) 993 CMPQ(RCX, R8) 994 JNE(LabelRef("LBB9_6")) 995 } 996 997 Label("LBB9_7") 998 { 999 VZEROUPPER() 1000 RET() 1001 } 1002 } 1003 1004 func genNeq_F64() { 1005 1006 data := GLOBL("dataNeqF64", RODATA|NOPTR) 1007 DATA(0, U8(1)) 1008 DATA(1, U8(1)) 1009 DATA(2, U8(1)) 1010 DATA(3, U8(1)) 1011 DATA(4, U8(0)) 1012 DATA(5, U8(0)) 1013 DATA(6, U8(0)) 1014 DATA(7, U8(0)) 1015 DATA(8, U8(0)) 1016 DATA(9, U8(0)) 1017 DATA(10, U8(0)) 1018 DATA(11, U8(0)) 1019 DATA(12, U8(0)) 1020 DATA(13, U8(0)) 1021 DATA(14, U8(0)) 1022 DATA(15, U8(0)) 1023 1024 TEXT("Neq_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)") 1025 Pragma("noescape") 1026 Load(Param("x").Base(), RDI) 1027 Load(Param("y").Base(), RSI) 1028 Load(Param("z").Base(), RDX) 1029 Load(Param("x").Len(), RCX) 1030 1031 TESTQ(RCX, RCX) 1032 JE(LabelRef("LBB10_7")) 1033 CMPQ(RCX, Imm(16)) 1034 JAE(LabelRef("LBB10_3")) 1035 XORL(R8L, R8L) 1036 JMP(LabelRef("LBB10_6")) 1037 1038 Label("LBB10_3") 1039 { 1040 MOVQ(RCX, R8) 1041 ANDQ(I32(-16), R8) 1042 XORL(EAX, EAX) 1043 VMOVDQU(data.Offset(0), X0) 1044 } 1045 1046 Label("LBB10_4") 1047 { 1048 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8), Y1) 1049 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2) 1050 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3) 1051 VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4) 1052 VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RAX, 8), Y1, Y1) 1053 VEXTRACTF128(Imm(1), Y1, X5) 1054 VPACKSSDW(X5, X1, X1) 1055 VPACKSSDW(X1, X1, X1) 1056 VPACKSSWB(X1, X1, X1) 1057 VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2, Y2) 1058 VPAND(X0, X1, X1) 1059 VEXTRACTF128(Imm(1), Y2, X5) 1060 VPACKSSDW(X5, X2, X2) 1061 VPACKSSDW(X2, X2, X2) 1062 VPACKSSWB(X2, X2, X2) 1063 VPAND(X0, X2, X2) 1064 VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3, Y3) 1065 VPUNPCKLDQ(X2, X1, X1) 1066 VEXTRACTF128(Imm(1), Y3, X2) 1067 VPACKSSDW(X2, X3, X2) 1068 VPACKSSDW(X2, X2, X2) 1069 VPACKSSWB(X2, X2, X2) 1070 VPAND(X0, X2, X2) 1071 VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4, Y3) 1072 VEXTRACTF128(Imm(1), Y3, X4) 1073 VPACKSSDW(X4, X3, X3) 1074 VPACKSSDW(X3, X3, X3) 1075 VPACKSSWB(X3, X3, X3) 1076 VPAND(X0, X3, X3) 1077 VPBROADCASTD(X3, X3) 1078 VPBROADCASTD(X2, X2) 1079 VPUNPCKLDQ(X3, X2, X2) 1080 VPBLENDD(Imm(12), X2, X1, X1) 1081 VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1)) 1082 ADDQ(Imm(16), RAX) 1083 CMPQ(R8, RAX) 1084 JNE(LabelRef("LBB10_4")) 1085 CMPQ(R8, RCX) 1086 JE(LabelRef("LBB10_7")) 1087 } 1088 1089 Label("LBB10_6") 1090 { 1091 VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0) 1092 VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0) 1093 SETNE(Mem{Base: RDI}.Idx(R8, 1)) 1094 ADDQ(Imm(1), R8) 1095 CMPQ(RCX, R8) 1096 JNE(LabelRef("LBB10_6")) 1097 } 1098 1099 Label("LBB10_7") 1100 { 1101 VZEROUPPER() 1102 RET() 1103 } 1104 } 1105 1106 func genNeq_F32() { 1107 1108 data := GLOBL("dataNeqF32", RODATA|NOPTR) 1109 DATA(0, U8(1)) 1110 DATA(1, U8(1)) 1111 DATA(2, U8(1)) 1112 DATA(3, U8(1)) 1113 DATA(4, U8(1)) 1114 DATA(5, U8(1)) 1115 DATA(6, U8(1)) 1116 DATA(7, U8(1)) 1117 DATA(8, U8(0)) 1118 DATA(9, U8(0)) 1119 DATA(10, U8(0)) 1120 DATA(11, U8(0)) 1121 DATA(12, U8(0)) 1122 DATA(13, U8(0)) 1123 DATA(14, U8(0)) 1124 DATA(15, U8(0)) 1125 1126 TEXT("Neq_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)") 1127 Pragma("noescape") 1128 Load(Param("x").Base(), RDI) 1129 Load(Param("y").Base(), RSI) 1130 Load(Param("z").Base(), RDX) 1131 Load(Param("x").Len(), RCX) 1132 1133 TESTQ(RCX, RCX) 1134 JE(LabelRef("LBB11_7")) 1135 CMPQ(RCX, Imm(32)) 1136 JAE(LabelRef("LBB11_3")) 1137 XORL(R8L, R8L) 1138 JMP(LabelRef("LBB11_6")) 1139 1140 Label("LBB11_3") 1141 { 1142 MOVQ(RCX, R8) 1143 ANDQ(I32(-32), R8) 1144 XORL(EAX, EAX) 1145 VMOVDQU(data.Offset(0), X0) 1146 } 1147 1148 Label("LBB11_4") 1149 { 1150 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4), Y1) 1151 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2) 1152 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3) 1153 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4) 1154 VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RAX, 4), Y1, Y1) 1155 VEXTRACTF128(Imm(1), Y1, X5) 1156 VPACKSSDW(X5, X1, X1) 1157 VPACKSSWB(X1, X1, X1) 1158 VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2, Y2) 1159 VPAND(X0, X1, X1) 1160 VEXTRACTF128(Imm(1), Y2, X5) 1161 VPACKSSDW(X5, X2, X2) 1162 VPACKSSWB(X2, X2, X2) 1163 VPAND(X0, X2, X2) 1164 VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3, Y3) 1165 VEXTRACTF128(Imm(1), Y3, X5) 1166 VPACKSSDW(X5, X3, X3) 1167 VPACKSSWB(X3, X3, X3) 1168 VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4, Y4) 1169 VPAND(X0, X3, X3) 1170 VEXTRACTF128(Imm(1), Y4, X5) 1171 VPACKSSDW(X5, X4, X4) 1172 VPACKSSWB(X4, X4, X4) 1173 VPAND(X0, X4, X4) 1174 VINSERTI128(Imm(1), X4, Y3, Y3) 1175 VINSERTI128(Imm(1), X2, Y1, Y1) 1176 VPUNPCKLQDQ(Y3, Y1, Y1) 1177 VPERMQ(Imm(216), Y1, Y1) 1178 VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1)) 1179 ADDQ(Imm(32), RAX) 1180 CMPQ(R8, RAX) 1181 JNE(LabelRef("LBB11_4")) 1182 CMPQ(R8, RCX) 1183 JE(LabelRef("LBB11_7")) 1184 } 1185 1186 Label("LBB11_6") 1187 { 1188 VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0) 1189 VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0) 1190 SETNE(Mem{Base: RDI}.Idx(R8, 1)) 1191 ADDQ(Imm(1), R8) 1192 CMPQ(RCX, R8) 1193 JNE(LabelRef("LBB11_6")) 1194 } 1195 1196 Label("LBB11_7") 1197 { 1198 VZEROUPPER() 1199 RET() 1200 } 1201 } 1202 1203 func genLtNumber_F64() { 1204 1205 data := GLOBL("dataLtNumberF64", RODATA|NOPTR) 1206 DATA(0, U8(1)) 1207 DATA(1, U8(1)) 1208 DATA(2, U8(1)) 1209 DATA(3, U8(1)) 1210 DATA(4, U8(0)) 1211 DATA(5, U8(0)) 1212 DATA(6, U8(0)) 1213 DATA(7, U8(0)) 1214 DATA(8, U8(0)) 1215 DATA(9, U8(0)) 1216 DATA(10, U8(0)) 1217 DATA(11, U8(0)) 1218 DATA(12, U8(0)) 1219 DATA(13, U8(0)) 1220 DATA(14, U8(0)) 1221 DATA(15, U8(0)) 1222 1223 TEXT("LtNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)") 1224 Pragma("noescape") 1225 Load(Param("x").Base(), RDI) 1226 Load(Param("y").Base(), RSI) 1227 Load(Param("a"), X0) 1228 Load(Param("x").Len(), RDX) 1229 1230 TESTQ(RDX, RDX) 1231 JE(LabelRef("LBB12_7")) 1232 CMPQ(RDX, Imm(16)) 1233 JAE(LabelRef("LBB12_3")) 1234 XORL(EAX, EAX) 1235 JMP(LabelRef("LBB12_6")) 1236 1237 Label("LBB12_3") 1238 { 1239 MOVQ(RDX, RAX) 1240 ANDQ(I32(-16), RAX) 1241 VBROADCASTSD(X0, Y1) 1242 XORL(ECX, ECX) 1243 VMOVDQU(data.Offset(0), X2) 1244 } 1245 1246 Label("LBB12_4") 1247 { 1248 VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8), Y3) 1249 VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y4) 1250 VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y5) 1251 VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y6) 1252 VCMPPD(Imm(1), Y1, Y3, Y3) 1253 VEXTRACTF128(Imm(1), Y3, X7) 1254 VPACKSSDW(X7, X3, X3) 1255 VPACKSSDW(X3, X3, X3) 1256 VPACKSSWB(X3, X3, X3) 1257 VPAND(X2, X3, X3) 1258 VCMPPD(Imm(1), Y1, Y4, Y4) 1259 VEXTRACTF128(Imm(1), Y4, X7) 1260 VPACKSSDW(X7, X4, X4) 1261 VPACKSSDW(X4, X4, X4) 1262 VPACKSSWB(X4, X4, X4) 1263 VPAND(X2, X4, X4) 1264 VPUNPCKLDQ(X4, X3, X3) 1265 VCMPPD(Imm(1), Y1, Y5, Y4) 1266 VEXTRACTF128(Imm(1), Y4, X5) 1267 VPACKSSDW(X5, X4, X4) 1268 VPACKSSDW(X4, X4, X4) 1269 VPACKSSWB(X4, X4, X4) 1270 VPAND(X2, X4, X4) 1271 VCMPPD(Imm(1), Y1, Y6, Y5) 1272 VEXTRACTF128(Imm(1), Y5, X6) 1273 VPACKSSDW(X6, X5, X5) 1274 VPACKSSDW(X5, X5, X5) 1275 VPACKSSWB(X5, X5, X5) 1276 VPAND(X2, X5, X5) 1277 VPBROADCASTD(X5, X5) 1278 VPBROADCASTD(X4, X4) 1279 VPUNPCKLDQ(X5, X4, X4) 1280 VPBLENDD(Imm(12), X4, X3, X3) 1281 VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1)) 1282 ADDQ(Imm(16), RCX) 1283 CMPQ(RAX, RCX) 1284 JNE(LabelRef("LBB12_4")) 1285 CMPQ(RAX, RDX) 1286 JE(LabelRef("LBB12_7")) 1287 } 1288 1289 Label("LBB12_6") 1290 { 1291 VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0) 1292 SETHI(Mem{Base: RDI}.Idx(RAX, 1)) 1293 ADDQ(Imm(1), RAX) 1294 CMPQ(RDX, RAX) 1295 JNE(LabelRef("LBB12_6")) 1296 } 1297 1298 Label("LBB12_7") 1299 { 1300 VZEROUPPER() 1301 RET() 1302 } 1303 } 1304 1305 func genLtNumber_F32() { 1306 1307 data := GLOBL("dataLtNumberF32", RODATA|NOPTR) 1308 DATA(0, U8(1)) 1309 DATA(1, U8(1)) 1310 DATA(2, U8(1)) 1311 DATA(3, U8(1)) 1312 DATA(4, U8(1)) 1313 DATA(5, U8(1)) 1314 DATA(6, U8(1)) 1315 DATA(7, U8(1)) 1316 DATA(8, U8(0)) 1317 DATA(9, U8(0)) 1318 DATA(10, U8(0)) 1319 DATA(11, U8(0)) 1320 DATA(12, U8(0)) 1321 DATA(13, U8(0)) 1322 DATA(14, U8(0)) 1323 DATA(15, U8(0)) 1324 1325 TEXT("LtNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)") 1326 Pragma("noescape") 1327 Load(Param("x").Base(), RDI) 1328 Load(Param("y").Base(), RSI) 1329 Load(Param("a"), X0) 1330 Load(Param("x").Len(), RDX) 1331 1332 TESTQ(RDX, RDX) 1333 JE(LabelRef("LBB13_7")) 1334 CMPQ(RDX, Imm(32)) 1335 JAE(LabelRef("LBB13_3")) 1336 XORL(EAX, EAX) 1337 JMP(LabelRef("LBB13_6")) 1338 1339 Label("LBB13_3") 1340 { 1341 MOVQ(RDX, RAX) 1342 ANDQ(I32(-32), RAX) 1343 VBROADCASTSS(X0, Y1) 1344 XORL(ECX, ECX) 1345 VMOVDQU(data.Offset(0), X2) 1346 } 1347 1348 Label("LBB13_4") 1349 { 1350 VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4), Y3) 1351 VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y4) 1352 VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y5) 1353 VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y6) 1354 VCMPPS(Imm(1), Y1, Y3, Y3) 1355 VEXTRACTF128(Imm(1), Y3, X7) 1356 VPACKSSDW(X7, X3, X3) 1357 VPACKSSWB(X3, X3, X3) 1358 VPAND(X2, X3, X3) 1359 VCMPPS(Imm(1), Y1, Y4, Y4) 1360 VEXTRACTF128(Imm(1), Y4, X7) 1361 VPACKSSDW(X7, X4, X4) 1362 VPACKSSWB(X4, X4, X4) 1363 VPAND(X2, X4, X4) 1364 VCMPPS(Imm(1), Y1, Y5, Y5) 1365 VEXTRACTF128(Imm(1), Y5, X7) 1366 VPACKSSDW(X7, X5, X5) 1367 VPACKSSWB(X5, X5, X5) 1368 VPAND(X2, X5, X5) 1369 VCMPPS(Imm(1), Y1, Y6, Y6) 1370 VEXTRACTF128(Imm(1), Y6, X7) 1371 VPACKSSDW(X7, X6, X6) 1372 VPACKSSWB(X6, X6, X6) 1373 VPAND(X2, X6, X6) 1374 VINSERTI128(Imm(1), X6, Y5, Y5) 1375 VINSERTI128(Imm(1), X4, Y3, Y3) 1376 VPUNPCKLQDQ(Y5, Y3, Y3) 1377 VPERMQ(Imm(216), Y3, Y3) 1378 VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1)) 1379 ADDQ(Imm(32), RCX) 1380 CMPQ(RAX, RCX) 1381 JNE(LabelRef("LBB13_4")) 1382 CMPQ(RAX, RDX) 1383 JE(LabelRef("LBB13_7")) 1384 } 1385 1386 Label("LBB13_6") 1387 { 1388 VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0) 1389 SETHI(Mem{Base: RDI}.Idx(RAX, 1)) 1390 ADDQ(Imm(1), RAX) 1391 CMPQ(RDX, RAX) 1392 JNE(LabelRef("LBB13_6")) 1393 } 1394 1395 Label("LBB13_7") 1396 { 1397 VZEROUPPER() 1398 RET() 1399 } 1400 } 1401 1402 func genLteNumber_F64() { 1403 1404 data := GLOBL("dataLteNumberF64", RODATA|NOPTR) 1405 DATA(0, U8(1)) 1406 DATA(1, U8(1)) 1407 DATA(2, U8(1)) 1408 DATA(3, U8(1)) 1409 DATA(4, U8(0)) 1410 DATA(5, U8(0)) 1411 DATA(6, U8(0)) 1412 DATA(7, U8(0)) 1413 DATA(8, U8(0)) 1414 DATA(9, U8(0)) 1415 DATA(10, U8(0)) 1416 DATA(11, U8(0)) 1417 DATA(12, U8(0)) 1418 DATA(13, U8(0)) 1419 DATA(14, U8(0)) 1420 DATA(15, U8(0)) 1421 1422 TEXT("LteNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)") 1423 Pragma("noescape") 1424 Load(Param("x").Base(), RDI) 1425 Load(Param("y").Base(), RSI) 1426 Load(Param("a"), X0) 1427 Load(Param("x").Len(), RDX) 1428 1429 TESTQ(RDX, RDX) 1430 JE(LabelRef("LBB14_7")) 1431 CMPQ(RDX, Imm(16)) 1432 JAE(LabelRef("LBB14_3")) 1433 XORL(EAX, EAX) 1434 JMP(LabelRef("LBB14_6")) 1435 1436 Label("LBB14_3") 1437 { 1438 MOVQ(RDX, RAX) 1439 ANDQ(I32(-16), RAX) 1440 VBROADCASTSD(X0, Y1) 1441 XORL(ECX, ECX) 1442 VMOVDQU(data.Offset(0), X2) 1443 } 1444 1445 Label("LBB14_4") 1446 { 1447 VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8), Y3) 1448 VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y4) 1449 VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y5) 1450 VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y6) 1451 VCMPPD(Imm(2), Y1, Y3, Y3) 1452 VEXTRACTF128(Imm(1), Y3, X7) 1453 VPACKSSDW(X7, X3, X3) 1454 VPACKSSDW(X3, X3, X3) 1455 VPACKSSWB(X3, X3, X3) 1456 VPAND(X2, X3, X3) 1457 VCMPPD(Imm(2), Y1, Y4, Y4) 1458 VEXTRACTF128(Imm(1), Y4, X7) 1459 VPACKSSDW(X7, X4, X4) 1460 VPACKSSDW(X4, X4, X4) 1461 VPACKSSWB(X4, X4, X4) 1462 VPAND(X2, X4, X4) 1463 VPUNPCKLDQ(X4, X3, X3) 1464 VCMPPD(Imm(2), Y1, Y5, Y4) 1465 VEXTRACTF128(Imm(1), Y4, X5) 1466 VPACKSSDW(X5, X4, X4) 1467 VPACKSSDW(X4, X4, X4) 1468 VPACKSSWB(X4, X4, X4) 1469 VPAND(X2, X4, X4) 1470 VCMPPD(Imm(2), Y1, Y6, Y5) 1471 VEXTRACTF128(Imm(1), Y5, X6) 1472 VPACKSSDW(X6, X5, X5) 1473 VPACKSSDW(X5, X5, X5) 1474 VPACKSSWB(X5, X5, X5) 1475 VPAND(X2, X5, X5) 1476 VPBROADCASTD(X5, X5) 1477 VPBROADCASTD(X4, X4) 1478 VPUNPCKLDQ(X5, X4, X4) 1479 VPBLENDD(Imm(12), X4, X3, X3) 1480 VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1)) 1481 ADDQ(Imm(16), RCX) 1482 CMPQ(RAX, RCX) 1483 JNE(LabelRef("LBB14_4")) 1484 CMPQ(RAX, RDX) 1485 JE(LabelRef("LBB14_7")) 1486 } 1487 1488 Label("LBB14_6") 1489 { 1490 VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0) 1491 SETCC(Mem{Base: RDI}.Idx(RAX, 1)) 1492 ADDQ(Imm(1), RAX) 1493 CMPQ(RDX, RAX) 1494 JNE(LabelRef("LBB14_6")) 1495 } 1496 1497 Label("LBB14_7") 1498 { 1499 VZEROUPPER() 1500 RET() 1501 } 1502 } 1503 1504 func genLteNumber_F32() { 1505 1506 data := GLOBL("dataLteNumberF32", RODATA|NOPTR) 1507 DATA(0, U8(1)) 1508 DATA(1, U8(1)) 1509 DATA(2, U8(1)) 1510 DATA(3, U8(1)) 1511 DATA(4, U8(1)) 1512 DATA(5, U8(1)) 1513 DATA(6, U8(1)) 1514 DATA(7, U8(1)) 1515 DATA(8, U8(0)) 1516 DATA(9, U8(0)) 1517 DATA(10, U8(0)) 1518 DATA(11, U8(0)) 1519 DATA(12, U8(0)) 1520 DATA(13, U8(0)) 1521 DATA(14, U8(0)) 1522 DATA(15, U8(0)) 1523 1524 TEXT("LteNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)") 1525 Pragma("noescape") 1526 Load(Param("x").Base(), RDI) 1527 Load(Param("y").Base(), RSI) 1528 Load(Param("a"), X0) 1529 Load(Param("x").Len(), RDX) 1530 1531 TESTQ(RDX, RDX) 1532 JE(LabelRef("LBB15_7")) 1533 CMPQ(RDX, Imm(32)) 1534 JAE(LabelRef("LBB15_3")) 1535 XORL(EAX, EAX) 1536 JMP(LabelRef("LBB15_6")) 1537 1538 Label("LBB15_3") 1539 { 1540 MOVQ(RDX, RAX) 1541 ANDQ(I32(-32), RAX) 1542 VBROADCASTSS(X0, Y1) 1543 XORL(ECX, ECX) 1544 VMOVDQU(data.Offset(0), X2) 1545 } 1546 1547 Label("LBB15_4") 1548 { 1549 VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4), Y3) 1550 VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y4) 1551 VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y5) 1552 VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y6) 1553 VCMPPS(Imm(2), Y1, Y3, Y3) 1554 VEXTRACTF128(Imm(1), Y3, X7) 1555 VPACKSSDW(X7, X3, X3) 1556 VPACKSSWB(X3, X3, X3) 1557 VPAND(X2, X3, X3) 1558 VCMPPS(Imm(2), Y1, Y4, Y4) 1559 VEXTRACTF128(Imm(1), Y4, X7) 1560 VPACKSSDW(X7, X4, X4) 1561 VPACKSSWB(X4, X4, X4) 1562 VPAND(X2, X4, X4) 1563 VCMPPS(Imm(2), Y1, Y5, Y5) 1564 VEXTRACTF128(Imm(1), Y5, X7) 1565 VPACKSSDW(X7, X5, X5) 1566 VPACKSSWB(X5, X5, X5) 1567 VPAND(X2, X5, X5) 1568 VCMPPS(Imm(2), Y1, Y6, Y6) 1569 VEXTRACTF128(Imm(1), Y6, X7) 1570 VPACKSSDW(X7, X6, X6) 1571 VPACKSSWB(X6, X6, X6) 1572 VPAND(X2, X6, X6) 1573 VINSERTI128(Imm(1), X6, Y5, Y5) 1574 VINSERTI128(Imm(1), X4, Y3, Y3) 1575 VPUNPCKLQDQ(Y5, Y3, Y3) 1576 VPERMQ(Imm(216), Y3, Y3) 1577 VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1)) 1578 ADDQ(Imm(32), RCX) 1579 CMPQ(RAX, RCX) 1580 JNE(LabelRef("LBB15_4")) 1581 CMPQ(RAX, RDX) 1582 JE(LabelRef("LBB15_7")) 1583 } 1584 1585 Label("LBB15_6") 1586 { 1587 VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0) 1588 SETCC(Mem{Base: RDI}.Idx(RAX, 1)) 1589 ADDQ(Imm(1), RAX) 1590 CMPQ(RDX, RAX) 1591 JNE(LabelRef("LBB15_6")) 1592 } 1593 1594 Label("LBB15_7") 1595 { 1596 VZEROUPPER() 1597 RET() 1598 } 1599 } 1600 1601 func genGtNumber_F64() { 1602 1603 data := GLOBL("dataGtNumberF64", RODATA|NOPTR) 1604 DATA(0, U8(1)) 1605 DATA(1, U8(1)) 1606 DATA(2, U8(1)) 1607 DATA(3, U8(1)) 1608 DATA(4, U8(0)) 1609 DATA(5, U8(0)) 1610 DATA(6, U8(0)) 1611 DATA(7, U8(0)) 1612 DATA(8, U8(0)) 1613 DATA(9, U8(0)) 1614 DATA(10, U8(0)) 1615 DATA(11, U8(0)) 1616 DATA(12, U8(0)) 1617 DATA(13, U8(0)) 1618 DATA(14, U8(0)) 1619 DATA(15, U8(0)) 1620 1621 TEXT("GtNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)") 1622 Pragma("noescape") 1623 Load(Param("x").Base(), RDI) 1624 Load(Param("y").Base(), RSI) 1625 Load(Param("a"), X0) 1626 Load(Param("x").Len(), RDX) 1627 1628 TESTQ(RDX, RDX) 1629 JE(LabelRef("LBB16_7")) 1630 CMPQ(RDX, Imm(16)) 1631 JAE(LabelRef("LBB16_3")) 1632 XORL(EAX, EAX) 1633 JMP(LabelRef("LBB16_6")) 1634 1635 Label("LBB16_3") 1636 { 1637 MOVQ(RDX, RAX) 1638 ANDQ(I32(-16), RAX) 1639 VBROADCASTSD(X0, Y1) 1640 XORL(ECX, ECX) 1641 VMOVDQU(data.Offset(0), X2) 1642 } 1643 1644 Label("LBB16_4") 1645 { 1646 VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RCX, 8), Y1, Y3) 1647 VEXTRACTF128(Imm(1), Y3, X4) 1648 VPACKSSDW(X4, X3, X3) 1649 VPACKSSDW(X3, X3, X3) 1650 VPACKSSWB(X3, X3, X3) 1651 VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y1, Y4) 1652 VPAND(X2, X3, X3) 1653 VEXTRACTF128(Imm(1), Y4, X5) 1654 VPACKSSDW(X5, X4, X4) 1655 VPACKSSDW(X4, X4, X4) 1656 VPACKSSWB(X4, X4, X4) 1657 VPAND(X2, X4, X4) 1658 VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y1, Y5) 1659 VPUNPCKLDQ(X4, X3, X3) 1660 VEXTRACTF128(Imm(1), Y5, X4) 1661 VPACKSSDW(X4, X5, X4) 1662 VPACKSSDW(X4, X4, X4) 1663 VPACKSSWB(X4, X4, X4) 1664 VPAND(X2, X4, X4) 1665 VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y1, Y5) 1666 VEXTRACTF128(Imm(1), Y5, X6) 1667 VPACKSSDW(X6, X5, X5) 1668 VPACKSSDW(X5, X5, X5) 1669 VPACKSSWB(X5, X5, X5) 1670 VPAND(X2, X5, X5) 1671 VPBROADCASTD(X5, X5) 1672 VPBROADCASTD(X4, X4) 1673 VPUNPCKLDQ(X5, X4, X4) 1674 VPBLENDD(Imm(12), X4, X3, X3) 1675 VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1)) 1676 ADDQ(Imm(16), RCX) 1677 CMPQ(RAX, RCX) 1678 JNE(LabelRef("LBB16_4")) 1679 CMPQ(RAX, RDX) 1680 JE(LabelRef("LBB16_7")) 1681 } 1682 1683 Label("LBB16_6") 1684 { 1685 VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0) 1686 SETCS(Mem{Base: RDI}.Idx(RAX, 1)) 1687 ADDQ(Imm(1), RAX) 1688 CMPQ(RDX, RAX) 1689 JNE(LabelRef("LBB16_6")) 1690 } 1691 1692 Label("LBB16_7") 1693 { 1694 VZEROUPPER() 1695 RET() 1696 } 1697 } 1698 1699 func genGtNumber_F32() { 1700 1701 data := GLOBL("dataGtNumberF32", RODATA|NOPTR) 1702 DATA(0, U8(1)) 1703 DATA(1, U8(1)) 1704 DATA(2, U8(1)) 1705 DATA(3, U8(1)) 1706 DATA(4, U8(1)) 1707 DATA(5, U8(1)) 1708 DATA(6, U8(1)) 1709 DATA(7, U8(1)) 1710 DATA(8, U8(0)) 1711 DATA(9, U8(0)) 1712 DATA(10, U8(0)) 1713 DATA(11, U8(0)) 1714 DATA(12, U8(0)) 1715 DATA(13, U8(0)) 1716 DATA(14, U8(0)) 1717 DATA(15, U8(0)) 1718 1719 TEXT("GtNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)") 1720 Pragma("noescape") 1721 Load(Param("x").Base(), RDI) 1722 Load(Param("y").Base(), RSI) 1723 Load(Param("a"), X0) 1724 Load(Param("x").Len(), RDX) 1725 1726 TESTQ(RDX, RDX) 1727 JE(LabelRef("LBB17_7")) 1728 CMPQ(RDX, Imm(32)) 1729 JAE(LabelRef("LBB17_3")) 1730 XORL(EAX, EAX) 1731 JMP(LabelRef("LBB17_6")) 1732 1733 Label("LBB17_3") 1734 { 1735 MOVQ(RDX, RAX) 1736 ANDQ(I32(-32), RAX) 1737 VBROADCASTSS(X0, Y1) 1738 XORL(ECX, ECX) 1739 VMOVDQU(data.Offset(0), X2) 1740 } 1741 1742 Label("LBB17_4") 1743 { 1744 VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RCX, 4), Y1, Y3) 1745 VEXTRACTF128(Imm(1), Y3, X4) 1746 VPACKSSDW(X4, X3, X3) 1747 VPACKSSWB(X3, X3, X3) 1748 VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y1, Y4) 1749 VPAND(X2, X3, X3) 1750 VEXTRACTF128(Imm(1), Y4, X5) 1751 VPACKSSDW(X5, X4, X4) 1752 VPACKSSWB(X4, X4, X4) 1753 VPAND(X2, X4, X4) 1754 VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y1, Y5) 1755 VEXTRACTF128(Imm(1), Y5, X6) 1756 VPACKSSDW(X6, X5, X5) 1757 VPACKSSWB(X5, X5, X5) 1758 VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y1, Y6) 1759 VPAND(X2, X5, X5) 1760 VEXTRACTF128(Imm(1), Y6, X7) 1761 VPACKSSDW(X7, X6, X6) 1762 VPACKSSWB(X6, X6, X6) 1763 VPAND(X2, X6, X6) 1764 VINSERTI128(Imm(1), X6, Y5, Y5) 1765 VINSERTI128(Imm(1), X4, Y3, Y3) 1766 VPUNPCKLQDQ(Y5, Y3, Y3) 1767 VPERMQ(Imm(216), Y3, Y3) 1768 VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1)) 1769 ADDQ(Imm(32), RCX) 1770 CMPQ(RAX, RCX) 1771 JNE(LabelRef("LBB17_4")) 1772 CMPQ(RAX, RDX) 1773 JE(LabelRef("LBB17_7")) 1774 } 1775 1776 Label("LBB17_6") 1777 { 1778 VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0) 1779 SETCS(Mem{Base: RDI}.Idx(RAX, 1)) 1780 ADDQ(Imm(1), RAX) 1781 CMPQ(RDX, RAX) 1782 JNE(LabelRef("LBB17_6")) 1783 } 1784 1785 Label("LBB17_7") 1786 { 1787 VZEROUPPER() 1788 RET() 1789 } 1790 } 1791 1792 func genGteNumber_F64() { 1793 1794 data := GLOBL("dataGteNumberF64", RODATA|NOPTR) 1795 DATA(0, U8(1)) 1796 DATA(1, U8(1)) 1797 DATA(2, U8(1)) 1798 DATA(3, U8(1)) 1799 DATA(4, U8(0)) 1800 DATA(5, U8(0)) 1801 DATA(6, U8(0)) 1802 DATA(7, U8(0)) 1803 DATA(8, U8(0)) 1804 DATA(9, U8(0)) 1805 DATA(10, U8(0)) 1806 DATA(11, U8(0)) 1807 DATA(12, U8(0)) 1808 DATA(13, U8(0)) 1809 DATA(14, U8(0)) 1810 DATA(15, U8(0)) 1811 1812 TEXT("GteNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)") 1813 Pragma("noescape") 1814 Load(Param("x").Base(), RDI) 1815 Load(Param("y").Base(), RSI) 1816 Load(Param("a"), X0) 1817 Load(Param("x").Len(), RDX) 1818 1819 TESTQ(RDX, RDX) 1820 JE(LabelRef("LBB18_7")) 1821 CMPQ(RDX, Imm(16)) 1822 JAE(LabelRef("LBB18_3")) 1823 XORL(EAX, EAX) 1824 JMP(LabelRef("LBB18_6")) 1825 1826 Label("LBB18_3") 1827 { 1828 MOVQ(RDX, RAX) 1829 ANDQ(I32(-16), RAX) 1830 VBROADCASTSD(X0, Y1) 1831 XORL(ECX, ECX) 1832 VMOVDQU(data.Offset(0), X2) 1833 } 1834 1835 Label("LBB18_4") 1836 { 1837 VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RCX, 8), Y1, Y3) 1838 VEXTRACTF128(Imm(1), Y3, X4) 1839 VPACKSSDW(X4, X3, X3) 1840 VPACKSSDW(X3, X3, X3) 1841 VPACKSSWB(X3, X3, X3) 1842 VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y1, Y4) 1843 VPAND(X2, X3, X3) 1844 VEXTRACTF128(Imm(1), Y4, X5) 1845 VPACKSSDW(X5, X4, X4) 1846 VPACKSSDW(X4, X4, X4) 1847 VPACKSSWB(X4, X4, X4) 1848 VPAND(X2, X4, X4) 1849 VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y1, Y5) 1850 VPUNPCKLDQ(X4, X3, X3) 1851 VEXTRACTF128(Imm(1), Y5, X4) 1852 VPACKSSDW(X4, X5, X4) 1853 VPACKSSDW(X4, X4, X4) 1854 VPACKSSWB(X4, X4, X4) 1855 VPAND(X2, X4, X4) 1856 VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y1, Y5) 1857 VEXTRACTF128(Imm(1), Y5, X6) 1858 VPACKSSDW(X6, X5, X5) 1859 VPACKSSDW(X5, X5, X5) 1860 VPACKSSWB(X5, X5, X5) 1861 VPAND(X2, X5, X5) 1862 VPBROADCASTD(X5, X5) 1863 VPBROADCASTD(X4, X4) 1864 VPUNPCKLDQ(X5, X4, X4) 1865 VPBLENDD(Imm(12), X4, X3, X3) 1866 VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1)) 1867 ADDQ(Imm(16), RCX) 1868 CMPQ(RAX, RCX) 1869 JNE(LabelRef("LBB18_4")) 1870 CMPQ(RAX, RDX) 1871 JE(LabelRef("LBB18_7")) 1872 } 1873 1874 Label("LBB18_6") 1875 { 1876 VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0) 1877 SETLS(Mem{Base: RDI}.Idx(RAX, 1)) 1878 ADDQ(Imm(1), RAX) 1879 CMPQ(RDX, RAX) 1880 JNE(LabelRef("LBB18_6")) 1881 } 1882 1883 Label("LBB18_7") 1884 { 1885 VZEROUPPER() 1886 RET() 1887 } 1888 } 1889 1890 func genGteNumber_F32() { 1891 1892 data := GLOBL("dataGteNumberF32", RODATA|NOPTR) 1893 DATA(0, U8(1)) 1894 DATA(1, U8(1)) 1895 DATA(2, U8(1)) 1896 DATA(3, U8(1)) 1897 DATA(4, U8(1)) 1898 DATA(5, U8(1)) 1899 DATA(6, U8(1)) 1900 DATA(7, U8(1)) 1901 DATA(8, U8(0)) 1902 DATA(9, U8(0)) 1903 DATA(10, U8(0)) 1904 DATA(11, U8(0)) 1905 DATA(12, U8(0)) 1906 DATA(13, U8(0)) 1907 DATA(14, U8(0)) 1908 DATA(15, U8(0)) 1909 1910 TEXT("GteNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)") 1911 Pragma("noescape") 1912 Load(Param("x").Base(), RDI) 1913 Load(Param("y").Base(), RSI) 1914 Load(Param("a"), X0) 1915 Load(Param("x").Len(), RDX) 1916 1917 TESTQ(RDX, RDX) 1918 JE(LabelRef("LBB19_7")) 1919 CMPQ(RDX, Imm(32)) 1920 JAE(LabelRef("LBB19_3")) 1921 XORL(EAX, EAX) 1922 JMP(LabelRef("LBB19_6")) 1923 1924 Label("LBB19_3") 1925 { 1926 MOVQ(RDX, RAX) 1927 ANDQ(I32(-32), RAX) 1928 VBROADCASTSS(X0, Y1) 1929 XORL(ECX, ECX) 1930 VMOVDQU(data.Offset(0), X2) 1931 } 1932 1933 Label("LBB19_4") 1934 { 1935 VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RCX, 4), Y1, Y3) 1936 VEXTRACTF128(Imm(1), Y3, X4) 1937 VPACKSSDW(X4, X3, X3) 1938 VPACKSSWB(X3, X3, X3) 1939 VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y1, Y4) 1940 VPAND(X2, X3, X3) 1941 VEXTRACTF128(Imm(1), Y4, X5) 1942 VPACKSSDW(X5, X4, X4) 1943 VPACKSSWB(X4, X4, X4) 1944 VPAND(X2, X4, X4) 1945 VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y1, Y5) 1946 VEXTRACTF128(Imm(1), Y5, X6) 1947 VPACKSSDW(X6, X5, X5) 1948 VPACKSSWB(X5, X5, X5) 1949 VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y1, Y6) 1950 VPAND(X2, X5, X5) 1951 VEXTRACTF128(Imm(1), Y6, X7) 1952 VPACKSSDW(X7, X6, X6) 1953 VPACKSSWB(X6, X6, X6) 1954 VPAND(X2, X6, X6) 1955 VINSERTI128(Imm(1), X6, Y5, Y5) 1956 VINSERTI128(Imm(1), X4, Y3, Y3) 1957 VPUNPCKLQDQ(Y5, Y3, Y3) 1958 VPERMQ(Imm(216), Y3, Y3) 1959 VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1)) 1960 ADDQ(Imm(32), RCX) 1961 CMPQ(RAX, RCX) 1962 JNE(LabelRef("LBB19_4")) 1963 CMPQ(RAX, RDX) 1964 JE(LabelRef("LBB19_7")) 1965 } 1966 1967 Label("LBB19_6") 1968 { 1969 VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0) 1970 SETLS(Mem{Base: RDI}.Idx(RAX, 1)) 1971 ADDQ(Imm(1), RAX) 1972 CMPQ(RDX, RAX) 1973 JNE(LabelRef("LBB19_6")) 1974 } 1975 1976 Label("LBB19_7") 1977 { 1978 VZEROUPPER() 1979 RET() 1980 } 1981 } 1982 1983 func genEqNumber_F64() { 1984 1985 data := GLOBL("dataEqNumberF64", RODATA|NOPTR) 1986 DATA(0, U8(1)) 1987 DATA(1, U8(1)) 1988 DATA(2, U8(1)) 1989 DATA(3, U8(1)) 1990 DATA(4, U8(0)) 1991 DATA(5, U8(0)) 1992 DATA(6, U8(0)) 1993 DATA(7, U8(0)) 1994 DATA(8, U8(0)) 1995 DATA(9, U8(0)) 1996 DATA(10, U8(0)) 1997 DATA(11, U8(0)) 1998 DATA(12, U8(0)) 1999 DATA(13, U8(0)) 2000 DATA(14, U8(0)) 2001 DATA(15, U8(0)) 2002 2003 TEXT("EqNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)") 2004 Pragma("noescape") 2005 Load(Param("x").Base(), RDI) 2006 Load(Param("y").Base(), RSI) 2007 Load(Param("a"), X0) 2008 Load(Param("x").Len(), RDX) 2009 2010 TESTQ(RDX, RDX) 2011 JE(LabelRef("LBB20_7")) 2012 CMPQ(RDX, Imm(16)) 2013 JAE(LabelRef("LBB20_3")) 2014 XORL(EAX, EAX) 2015 JMP(LabelRef("LBB20_6")) 2016 2017 Label("LBB20_3") 2018 { 2019 MOVQ(RDX, RAX) 2020 ANDQ(I32(-16), RAX) 2021 VBROADCASTSD(X0, Y1) 2022 XORL(ECX, ECX) 2023 VMOVDQU(data.Offset(0), X2) 2024 } 2025 2026 Label("LBB20_4") 2027 { 2028 VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RCX, 8), Y1, Y3) 2029 VEXTRACTF128(Imm(1), Y3, X4) 2030 VPACKSSDW(X4, X3, X3) 2031 VPACKSSDW(X3, X3, X3) 2032 VPACKSSWB(X3, X3, X3) 2033 VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y1, Y4) 2034 VPAND(X2, X3, X3) 2035 VEXTRACTF128(Imm(1), Y4, X5) 2036 VPACKSSDW(X5, X4, X4) 2037 VPACKSSDW(X4, X4, X4) 2038 VPACKSSWB(X4, X4, X4) 2039 VPAND(X2, X4, X4) 2040 VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y1, Y5) 2041 VPUNPCKLDQ(X4, X3, X3) 2042 VEXTRACTF128(Imm(1), Y5, X4) 2043 VPACKSSDW(X4, X5, X4) 2044 VPACKSSDW(X4, X4, X4) 2045 VPACKSSWB(X4, X4, X4) 2046 VPAND(X2, X4, X4) 2047 VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y1, Y5) 2048 VEXTRACTF128(Imm(1), Y5, X6) 2049 VPACKSSDW(X6, X5, X5) 2050 VPACKSSDW(X5, X5, X5) 2051 VPACKSSWB(X5, X5, X5) 2052 VPAND(X2, X5, X5) 2053 VPBROADCASTD(X5, X5) 2054 VPBROADCASTD(X4, X4) 2055 VPUNPCKLDQ(X5, X4, X4) 2056 VPBLENDD(Imm(12), X4, X3, X3) 2057 VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1)) 2058 ADDQ(Imm(16), RCX) 2059 CMPQ(RAX, RCX) 2060 JNE(LabelRef("LBB20_4")) 2061 CMPQ(RAX, RDX) 2062 JE(LabelRef("LBB20_7")) 2063 } 2064 2065 Label("LBB20_6") 2066 { 2067 VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0) 2068 SETEQ(Mem{Base: RDI}.Idx(RAX, 1)) 2069 ADDQ(Imm(1), RAX) 2070 CMPQ(RDX, RAX) 2071 JNE(LabelRef("LBB20_6")) 2072 } 2073 2074 Label("LBB20_7") 2075 { 2076 VZEROUPPER() 2077 RET() 2078 } 2079 } 2080 2081 func genEqNumber_F32() { 2082 2083 data := GLOBL("dataEqNumberF32", RODATA|NOPTR) 2084 DATA(0, U8(1)) 2085 DATA(1, U8(1)) 2086 DATA(2, U8(1)) 2087 DATA(3, U8(1)) 2088 DATA(4, U8(1)) 2089 DATA(5, U8(1)) 2090 DATA(6, U8(1)) 2091 DATA(7, U8(1)) 2092 DATA(8, U8(0)) 2093 DATA(9, U8(0)) 2094 DATA(10, U8(0)) 2095 DATA(11, U8(0)) 2096 DATA(12, U8(0)) 2097 DATA(13, U8(0)) 2098 DATA(14, U8(0)) 2099 DATA(15, U8(0)) 2100 2101 TEXT("EqNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)") 2102 Pragma("noescape") 2103 Load(Param("x").Base(), RDI) 2104 Load(Param("y").Base(), RSI) 2105 Load(Param("a"), X0) 2106 Load(Param("x").Len(), RDX) 2107 2108 TESTQ(RDX, RDX) 2109 JE(LabelRef("LBB21_7")) 2110 CMPQ(RDX, Imm(32)) 2111 JAE(LabelRef("LBB21_3")) 2112 XORL(EAX, EAX) 2113 JMP(LabelRef("LBB21_6")) 2114 2115 Label("LBB21_3") 2116 { 2117 MOVQ(RDX, RAX) 2118 ANDQ(I32(-32), RAX) 2119 VBROADCASTSS(X0, Y1) 2120 XORL(ECX, ECX) 2121 VMOVDQU(data.Offset(0), X2) 2122 } 2123 2124 Label("LBB21_4") 2125 { 2126 VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RCX, 4), Y1, Y3) 2127 VEXTRACTF128(Imm(1), Y3, X4) 2128 VPACKSSDW(X4, X3, X3) 2129 VPACKSSWB(X3, X3, X3) 2130 VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y1, Y4) 2131 VPAND(X2, X3, X3) 2132 VEXTRACTF128(Imm(1), Y4, X5) 2133 VPACKSSDW(X5, X4, X4) 2134 VPACKSSWB(X4, X4, X4) 2135 VPAND(X2, X4, X4) 2136 VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y1, Y5) 2137 VEXTRACTF128(Imm(1), Y5, X6) 2138 VPACKSSDW(X6, X5, X5) 2139 VPACKSSWB(X5, X5, X5) 2140 VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y1, Y6) 2141 VPAND(X2, X5, X5) 2142 VEXTRACTF128(Imm(1), Y6, X7) 2143 VPACKSSDW(X7, X6, X6) 2144 VPACKSSWB(X6, X6, X6) 2145 VPAND(X2, X6, X6) 2146 VINSERTI128(Imm(1), X6, Y5, Y5) 2147 VINSERTI128(Imm(1), X4, Y3, Y3) 2148 VPUNPCKLQDQ(Y5, Y3, Y3) 2149 VPERMQ(Imm(216), Y3, Y3) 2150 VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1)) 2151 ADDQ(Imm(32), RCX) 2152 CMPQ(RAX, RCX) 2153 JNE(LabelRef("LBB21_4")) 2154 CMPQ(RAX, RDX) 2155 JE(LabelRef("LBB21_7")) 2156 } 2157 2158 Label("LBB21_6") 2159 { 2160 VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0) 2161 SETEQ(Mem{Base: RDI}.Idx(RAX, 1)) 2162 ADDQ(Imm(1), RAX) 2163 CMPQ(RDX, RAX) 2164 JNE(LabelRef("LBB21_6")) 2165 } 2166 2167 Label("LBB21_7") 2168 { 2169 VZEROUPPER() 2170 RET() 2171 } 2172 } 2173 2174 func genNeqNumber_F64() { 2175 2176 data := GLOBL("dataNeqNumberF64", RODATA|NOPTR) 2177 DATA(0, U8(1)) 2178 DATA(1, U8(1)) 2179 DATA(2, U8(1)) 2180 DATA(3, U8(1)) 2181 DATA(4, U8(0)) 2182 DATA(5, U8(0)) 2183 DATA(6, U8(0)) 2184 DATA(7, U8(0)) 2185 DATA(8, U8(0)) 2186 DATA(9, U8(0)) 2187 DATA(10, U8(0)) 2188 DATA(11, U8(0)) 2189 DATA(12, U8(0)) 2190 DATA(13, U8(0)) 2191 DATA(14, U8(0)) 2192 DATA(15, U8(0)) 2193 2194 TEXT("NeqNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)") 2195 Pragma("noescape") 2196 Load(Param("x").Base(), RDI) 2197 Load(Param("y").Base(), RSI) 2198 Load(Param("a"), X0) 2199 Load(Param("x").Len(), RDX) 2200 2201 TESTQ(RDX, RDX) 2202 JE(LabelRef("LBB22_7")) 2203 CMPQ(RDX, Imm(16)) 2204 JAE(LabelRef("LBB22_3")) 2205 XORL(EAX, EAX) 2206 JMP(LabelRef("LBB22_6")) 2207 2208 Label("LBB22_3") 2209 { 2210 MOVQ(RDX, RAX) 2211 ANDQ(I32(-16), RAX) 2212 VBROADCASTSD(X0, Y1) 2213 XORL(ECX, ECX) 2214 VMOVDQU(data.Offset(0), X2) 2215 } 2216 2217 Label("LBB22_4") 2218 { 2219 VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RCX, 8), Y1, Y3) 2220 VEXTRACTF128(Imm(1), Y3, X4) 2221 VPACKSSDW(X4, X3, X3) 2222 VPACKSSDW(X3, X3, X3) 2223 VPACKSSWB(X3, X3, X3) 2224 VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y1, Y4) 2225 VPAND(X2, X3, X3) 2226 VEXTRACTF128(Imm(1), Y4, X5) 2227 VPACKSSDW(X5, X4, X4) 2228 VPACKSSDW(X4, X4, X4) 2229 VPACKSSWB(X4, X4, X4) 2230 VPAND(X2, X4, X4) 2231 VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y1, Y5) 2232 VPUNPCKLDQ(X4, X3, X3) 2233 VEXTRACTF128(Imm(1), Y5, X4) 2234 VPACKSSDW(X4, X5, X4) 2235 VPACKSSDW(X4, X4, X4) 2236 VPACKSSWB(X4, X4, X4) 2237 VPAND(X2, X4, X4) 2238 VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y1, Y5) 2239 VEXTRACTF128(Imm(1), Y5, X6) 2240 VPACKSSDW(X6, X5, X5) 2241 VPACKSSDW(X5, X5, X5) 2242 VPACKSSWB(X5, X5, X5) 2243 VPAND(X2, X5, X5) 2244 VPBROADCASTD(X5, X5) 2245 VPBROADCASTD(X4, X4) 2246 VPUNPCKLDQ(X5, X4, X4) 2247 VPBLENDD(Imm(12), X4, X3, X3) 2248 VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1)) 2249 ADDQ(Imm(16), RCX) 2250 CMPQ(RAX, RCX) 2251 JNE(LabelRef("LBB22_4")) 2252 CMPQ(RAX, RDX) 2253 JE(LabelRef("LBB22_7")) 2254 } 2255 2256 Label("LBB22_6") 2257 { 2258 VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0) 2259 SETNE(Mem{Base: RDI}.Idx(RAX, 1)) 2260 ADDQ(Imm(1), RAX) 2261 CMPQ(RDX, RAX) 2262 JNE(LabelRef("LBB22_6")) 2263 } 2264 2265 Label("LBB22_7") 2266 { 2267 VZEROUPPER() 2268 RET() 2269 } 2270 } 2271 2272 func genNeqNumber_F32() { 2273 2274 data := GLOBL("dataNeqNumberF32", RODATA|NOPTR) 2275 DATA(0, U8(1)) 2276 DATA(1, U8(1)) 2277 DATA(2, U8(1)) 2278 DATA(3, U8(1)) 2279 DATA(4, U8(1)) 2280 DATA(5, U8(1)) 2281 DATA(6, U8(1)) 2282 DATA(7, U8(1)) 2283 DATA(8, U8(0)) 2284 DATA(9, U8(0)) 2285 DATA(10, U8(0)) 2286 DATA(11, U8(0)) 2287 DATA(12, U8(0)) 2288 DATA(13, U8(0)) 2289 DATA(14, U8(0)) 2290 DATA(15, U8(0)) 2291 2292 TEXT("NeqNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)") 2293 Pragma("noescape") 2294 Load(Param("x").Base(), RDI) 2295 Load(Param("y").Base(), RSI) 2296 Load(Param("a"), X0) 2297 Load(Param("x").Len(), RDX) 2298 2299 TESTQ(RDX, RDX) 2300 JE(LabelRef("LBB23_7")) 2301 CMPQ(RDX, Imm(32)) 2302 JAE(LabelRef("LBB23_3")) 2303 XORL(EAX, EAX) 2304 JMP(LabelRef("LBB23_6")) 2305 2306 Label("LBB23_3") 2307 { 2308 MOVQ(RDX, RAX) 2309 ANDQ(I32(-32), RAX) 2310 VBROADCASTSS(X0, Y1) 2311 XORL(ECX, ECX) 2312 VMOVDQU(data.Offset(0), X2) 2313 } 2314 2315 Label("LBB23_4") 2316 { 2317 VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RCX, 4), Y1, Y3) 2318 VEXTRACTF128(Imm(1), Y3, X4) 2319 VPACKSSDW(X4, X3, X3) 2320 VPACKSSWB(X3, X3, X3) 2321 VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y1, Y4) 2322 VPAND(X2, X3, X3) 2323 VEXTRACTF128(Imm(1), Y4, X5) 2324 VPACKSSDW(X5, X4, X4) 2325 VPACKSSWB(X4, X4, X4) 2326 VPAND(X2, X4, X4) 2327 VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y1, Y5) 2328 VEXTRACTF128(Imm(1), Y5, X6) 2329 VPACKSSDW(X6, X5, X5) 2330 VPACKSSWB(X5, X5, X5) 2331 VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y1, Y6) 2332 VPAND(X2, X5, X5) 2333 VEXTRACTF128(Imm(1), Y6, X7) 2334 VPACKSSDW(X7, X6, X6) 2335 VPACKSSWB(X6, X6, X6) 2336 VPAND(X2, X6, X6) 2337 VINSERTI128(Imm(1), X6, Y5, Y5) 2338 VINSERTI128(Imm(1), X4, Y3, Y3) 2339 VPUNPCKLQDQ(Y5, Y3, Y3) 2340 VPERMQ(Imm(216), Y3, Y3) 2341 VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1)) 2342 ADDQ(Imm(32), RCX) 2343 CMPQ(RAX, RCX) 2344 JNE(LabelRef("LBB23_4")) 2345 CMPQ(RAX, RDX) 2346 JE(LabelRef("LBB23_7")) 2347 } 2348 2349 Label("LBB23_6") 2350 { 2351 VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0) 2352 SETNE(Mem{Base: RDI}.Idx(RAX, 1)) 2353 ADDQ(Imm(1), RAX) 2354 CMPQ(RDX, RAX) 2355 JNE(LabelRef("LBB23_6")) 2356 } 2357 2358 Label("LBB23_7") 2359 { 2360 VZEROUPPER() 2361 RET() 2362 } 2363 }