gitee.com/quant1x/gox@v1.7.6/num/asm/comparison.go (about)

     1  package main
     2  
     3  import (
     4  	. "github.com/mmcloughlin/avo/build"
     5  	. "github.com/mmcloughlin/avo/operand"
     6  	. "github.com/mmcloughlin/avo/reg"
     7  )
     8  
     9  func genLt_F64() {
    10  
    11  	data := GLOBL("dataLtF64", RODATA|NOPTR)
    12  	DATA(0, U8(1))
    13  	DATA(1, U8(1))
    14  	DATA(2, U8(1))
    15  	DATA(3, U8(1))
    16  	DATA(4, U8(0))
    17  	DATA(5, U8(0))
    18  	DATA(6, U8(0))
    19  	DATA(7, U8(0))
    20  	DATA(8, U8(0))
    21  	DATA(9, U8(0))
    22  	DATA(10, U8(0))
    23  	DATA(11, U8(0))
    24  	DATA(12, U8(0))
    25  	DATA(13, U8(0))
    26  	DATA(14, U8(0))
    27  	DATA(15, U8(0))
    28  
    29  	TEXT("Lt_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)")
    30  	Pragma("noescape")
    31  	Load(Param("x").Base(), RDI)
    32  	Load(Param("y").Base(), RSI)
    33  	Load(Param("z").Base(), RDX)
    34  	Load(Param("x").Len(), RCX)
    35  
    36  	TESTQ(RCX, RCX)
    37  	JE(LabelRef("LBB0_7"))
    38  	CMPQ(RCX, Imm(16))
    39  	JAE(LabelRef("LBB0_3"))
    40  	XORL(R8L, R8L)
    41  	JMP(LabelRef("LBB0_6"))
    42  
    43  	Label("LBB0_3")
    44  	{
    45  		MOVQ(RCX, R8)
    46  		ANDQ(I32(-16), R8)
    47  		XORL(EAX, EAX)
    48  		VMOVDQU(data.Offset(0), X0)
    49  	}
    50  
    51  	Label("LBB0_4")
    52  	{
    53  		VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8), Y1)
    54  		VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2)
    55  		VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3)
    56  		VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4)
    57  		VCMPPD(Imm(1), Mem{Base: RDX}.Idx(RAX, 8), Y1, Y1)
    58  		VEXTRACTF128(Imm(1), Y1, X5)
    59  		VPACKSSDW(X5, X1, X1)
    60  		VPACKSSDW(X1, X1, X1)
    61  		VPACKSSWB(X1, X1, X1)
    62  		VCMPPD(Imm(1), Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2, Y2)
    63  		VPAND(X0, X1, X1)
    64  		VEXTRACTF128(Imm(1), Y2, X5)
    65  		VPACKSSDW(X5, X2, X2)
    66  		VPACKSSDW(X2, X2, X2)
    67  		VPACKSSWB(X2, X2, X2)
    68  		VPAND(X0, X2, X2)
    69  		VCMPPD(Imm(1), Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3, Y3)
    70  		VPUNPCKLDQ(X2, X1, X1)
    71  		VEXTRACTF128(Imm(1), Y3, X2)
    72  		VPACKSSDW(X2, X3, X2)
    73  		VPACKSSDW(X2, X2, X2)
    74  		VPACKSSWB(X2, X2, X2)
    75  		VPAND(X0, X2, X2)
    76  		VCMPPD(Imm(1), Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4, Y3)
    77  		VEXTRACTF128(Imm(1), Y3, X4)
    78  		VPACKSSDW(X4, X3, X3)
    79  		VPACKSSDW(X3, X3, X3)
    80  		VPACKSSWB(X3, X3, X3)
    81  		VPAND(X0, X3, X3)
    82  		VPBROADCASTD(X3, X3)
    83  		VPBROADCASTD(X2, X2)
    84  		VPUNPCKLDQ(X3, X2, X2)
    85  		VPBLENDD(Imm(12), X2, X1, X1)
    86  		VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1))
    87  		ADDQ(Imm(16), RAX)
    88  		CMPQ(R8, RAX)
    89  		JNE(LabelRef("LBB0_4"))
    90  		CMPQ(R8, RCX)
    91  		JE(LabelRef("LBB0_7"))
    92  	}
    93  
    94  	Label("LBB0_6")
    95  	{
    96  		VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0)
    97  		VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0)
    98  		SETCS(Mem{Base: RDI}.Idx(R8, 1))
    99  		ADDQ(Imm(1), R8)
   100  		CMPQ(RCX, R8)
   101  		JNE(LabelRef("LBB0_6"))
   102  	}
   103  
   104  	Label("LBB0_7")
   105  	{
   106  		VZEROUPPER()
   107  		RET()
   108  	}
   109  }
   110  
   111  func genLt_F32() {
   112  
   113  	data := GLOBL("dataLtF32", RODATA|NOPTR)
   114  	DATA(0, U8(1))
   115  	DATA(1, U8(1))
   116  	DATA(2, U8(1))
   117  	DATA(3, U8(1))
   118  	DATA(4, U8(1))
   119  	DATA(5, U8(1))
   120  	DATA(6, U8(1))
   121  	DATA(7, U8(1))
   122  	DATA(8, U8(0))
   123  	DATA(9, U8(0))
   124  	DATA(10, U8(0))
   125  	DATA(11, U8(0))
   126  	DATA(12, U8(0))
   127  	DATA(13, U8(0))
   128  	DATA(14, U8(0))
   129  	DATA(15, U8(0))
   130  
   131  	TEXT("Lt_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)")
   132  	Pragma("noescape")
   133  	Load(Param("x").Base(), RDI)
   134  	Load(Param("y").Base(), RSI)
   135  	Load(Param("z").Base(), RDX)
   136  	Load(Param("x").Len(), RCX)
   137  
   138  	TESTQ(RCX, RCX)
   139  	JE(LabelRef("LBB1_7"))
   140  	CMPQ(RCX, Imm(32))
   141  	JAE(LabelRef("LBB1_3"))
   142  	XORL(R8L, R8L)
   143  	JMP(LabelRef("LBB1_6"))
   144  
   145  	Label("LBB1_3")
   146  	{
   147  		MOVQ(RCX, R8)
   148  		ANDQ(I32(-32), R8)
   149  		XORL(EAX, EAX)
   150  		VMOVDQU(data.Offset(0), X0)
   151  	}
   152  
   153  	Label("LBB1_4")
   154  	{
   155  		VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4), Y1)
   156  		VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2)
   157  		VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3)
   158  		VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4)
   159  		VCMPPS(Imm(1), Mem{Base: RDX}.Idx(RAX, 4), Y1, Y1)
   160  		VEXTRACTF128(Imm(1), Y1, X5)
   161  		VPACKSSDW(X5, X1, X1)
   162  		VPACKSSWB(X1, X1, X1)
   163  		VCMPPS(Imm(1), Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2, Y2)
   164  		VPAND(X0, X1, X1)
   165  		VEXTRACTF128(Imm(1), Y2, X5)
   166  		VPACKSSDW(X5, X2, X2)
   167  		VPACKSSWB(X2, X2, X2)
   168  		VPAND(X0, X2, X2)
   169  		VCMPPS(Imm(1), Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3, Y3)
   170  		VEXTRACTF128(Imm(1), Y3, X5)
   171  		VPACKSSDW(X5, X3, X3)
   172  		VPACKSSWB(X3, X3, X3)
   173  		VCMPPS(Imm(1), Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4, Y4)
   174  		VPAND(X0, X3, X3)
   175  		VEXTRACTF128(Imm(1), Y4, X5)
   176  		VPACKSSDW(X5, X4, X4)
   177  		VPACKSSWB(X4, X4, X4)
   178  		VPAND(X0, X4, X4)
   179  		VINSERTI128(Imm(1), X4, Y3, Y3)
   180  		VINSERTI128(Imm(1), X2, Y1, Y1)
   181  		VPUNPCKLQDQ(Y3, Y1, Y1)
   182  		VPERMQ(Imm(216), Y1, Y1)
   183  		VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1))
   184  		ADDQ(Imm(32), RAX)
   185  		CMPQ(R8, RAX)
   186  		JNE(LabelRef("LBB1_4"))
   187  		CMPQ(R8, RCX)
   188  		JE(LabelRef("LBB1_7"))
   189  	}
   190  
   191  	Label("LBB1_6")
   192  	{
   193  		VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0)
   194  		VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0)
   195  		SETCS(Mem{Base: RDI}.Idx(R8, 1))
   196  		ADDQ(Imm(1), R8)
   197  		CMPQ(RCX, R8)
   198  		JNE(LabelRef("LBB1_6"))
   199  	}
   200  
   201  	Label("LBB1_7")
   202  	{
   203  		VZEROUPPER()
   204  		RET()
   205  	}
   206  }
   207  
   208  func genLte_F64() {
   209  
   210  	data := GLOBL("dataLteF64", RODATA|NOPTR)
   211  	DATA(0, U8(1))
   212  	DATA(1, U8(1))
   213  	DATA(2, U8(1))
   214  	DATA(3, U8(1))
   215  	DATA(4, U8(0))
   216  	DATA(5, U8(0))
   217  	DATA(6, U8(0))
   218  	DATA(7, U8(0))
   219  	DATA(8, U8(0))
   220  	DATA(9, U8(0))
   221  	DATA(10, U8(0))
   222  	DATA(11, U8(0))
   223  	DATA(12, U8(0))
   224  	DATA(13, U8(0))
   225  	DATA(14, U8(0))
   226  	DATA(15, U8(0))
   227  
   228  	TEXT("Lte_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)")
   229  	Pragma("noescape")
   230  	Load(Param("x").Base(), RDI)
   231  	Load(Param("y").Base(), RSI)
   232  	Load(Param("z").Base(), RDX)
   233  	Load(Param("x").Len(), RCX)
   234  
   235  	TESTQ(RCX, RCX)
   236  	JE(LabelRef("LBB2_7"))
   237  	CMPQ(RCX, Imm(16))
   238  	JAE(LabelRef("LBB2_3"))
   239  	XORL(R8L, R8L)
   240  	JMP(LabelRef("LBB2_6"))
   241  
   242  	Label("LBB2_3")
   243  	{
   244  		MOVQ(RCX, R8)
   245  		ANDQ(I32(-16), R8)
   246  		XORL(EAX, EAX)
   247  		VMOVDQU(data.Offset(0), X0)
   248  	}
   249  
   250  	Label("LBB2_4")
   251  	{
   252  		VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8), Y1)
   253  		VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2)
   254  		VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3)
   255  		VMOVUPD(Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4)
   256  		VCMPPD(Imm(2), Mem{Base: RDX}.Idx(RAX, 8), Y1, Y1)
   257  		VEXTRACTF128(Imm(1), Y1, X5)
   258  		VPACKSSDW(X5, X1, X1)
   259  		VPACKSSDW(X1, X1, X1)
   260  		VPACKSSWB(X1, X1, X1)
   261  		VCMPPD(Imm(2), Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2, Y2)
   262  		VPAND(X0, X1, X1)
   263  		VEXTRACTF128(Imm(1), Y2, X5)
   264  		VPACKSSDW(X5, X2, X2)
   265  		VPACKSSDW(X2, X2, X2)
   266  		VPACKSSWB(X2, X2, X2)
   267  		VPAND(X0, X2, X2)
   268  		VCMPPD(Imm(2), Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3, Y3)
   269  		VPUNPCKLDQ(X2, X1, X1)
   270  		VEXTRACTF128(Imm(1), Y3, X2)
   271  		VPACKSSDW(X2, X3, X2)
   272  		VPACKSSDW(X2, X2, X2)
   273  		VPACKSSWB(X2, X2, X2)
   274  		VPAND(X0, X2, X2)
   275  		VCMPPD(Imm(2), Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4, Y3)
   276  		VEXTRACTF128(Imm(1), Y3, X4)
   277  		VPACKSSDW(X4, X3, X3)
   278  		VPACKSSDW(X3, X3, X3)
   279  		VPACKSSWB(X3, X3, X3)
   280  		VPAND(X0, X3, X3)
   281  		VPBROADCASTD(X3, X3)
   282  		VPBROADCASTD(X2, X2)
   283  		VPUNPCKLDQ(X3, X2, X2)
   284  		VPBLENDD(Imm(12), X2, X1, X1)
   285  		VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1))
   286  		ADDQ(Imm(16), RAX)
   287  		CMPQ(R8, RAX)
   288  		JNE(LabelRef("LBB2_4"))
   289  		CMPQ(R8, RCX)
   290  		JE(LabelRef("LBB2_7"))
   291  	}
   292  
   293  	Label("LBB2_6")
   294  	{
   295  		VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0)
   296  		VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0)
   297  		SETLS(Mem{Base: RDI}.Idx(R8, 1))
   298  		ADDQ(Imm(1), R8)
   299  		CMPQ(RCX, R8)
   300  		JNE(LabelRef("LBB2_6"))
   301  	}
   302  
   303  	Label("LBB2_7")
   304  	{
   305  		VZEROUPPER()
   306  		RET()
   307  	}
   308  }
   309  
   310  func genLte_F32() {
   311  
   312  	data := GLOBL("dataLteF32", RODATA|NOPTR)
   313  	DATA(0, U8(1))
   314  	DATA(1, U8(1))
   315  	DATA(2, U8(1))
   316  	DATA(3, U8(1))
   317  	DATA(4, U8(1))
   318  	DATA(5, U8(1))
   319  	DATA(6, U8(1))
   320  	DATA(7, U8(1))
   321  	DATA(8, U8(0))
   322  	DATA(9, U8(0))
   323  	DATA(10, U8(0))
   324  	DATA(11, U8(0))
   325  	DATA(12, U8(0))
   326  	DATA(13, U8(0))
   327  	DATA(14, U8(0))
   328  	DATA(15, U8(0))
   329  
   330  	TEXT("Lte_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)")
   331  	Pragma("noescape")
   332  	Load(Param("x").Base(), RDI)
   333  	Load(Param("y").Base(), RSI)
   334  	Load(Param("z").Base(), RDX)
   335  	Load(Param("x").Len(), RCX)
   336  
   337  	TESTQ(RCX, RCX)
   338  	JE(LabelRef("LBB3_7"))
   339  	CMPQ(RCX, Imm(32))
   340  	JAE(LabelRef("LBB3_3"))
   341  	XORL(R8L, R8L)
   342  	JMP(LabelRef("LBB3_6"))
   343  
   344  	Label("LBB3_3")
   345  	{
   346  		MOVQ(RCX, R8)
   347  		ANDQ(I32(-32), R8)
   348  		XORL(EAX, EAX)
   349  		VMOVDQU(data.Offset(0), X0)
   350  	}
   351  
   352  	Label("LBB3_4")
   353  	{
   354  		VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4), Y1)
   355  		VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2)
   356  		VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3)
   357  		VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4)
   358  		VCMPPS(Imm(2), Mem{Base: RDX}.Idx(RAX, 4), Y1, Y1)
   359  		VEXTRACTF128(Imm(1), Y1, X5)
   360  		VPACKSSDW(X5, X1, X1)
   361  		VPACKSSWB(X1, X1, X1)
   362  		VCMPPS(Imm(2), Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2, Y2)
   363  		VPAND(X0, X1, X1)
   364  		VEXTRACTF128(Imm(1), Y2, X5)
   365  		VPACKSSDW(X5, X2, X2)
   366  		VPACKSSWB(X2, X2, X2)
   367  		VPAND(X0, X2, X2)
   368  		VCMPPS(Imm(2), Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3, Y3)
   369  		VEXTRACTF128(Imm(1), Y3, X5)
   370  		VPACKSSDW(X5, X3, X3)
   371  		VPACKSSWB(X3, X3, X3)
   372  		VCMPPS(Imm(2), Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4, Y4)
   373  		VPAND(X0, X3, X3)
   374  		VEXTRACTF128(Imm(1), Y4, X5)
   375  		VPACKSSDW(X5, X4, X4)
   376  		VPACKSSWB(X4, X4, X4)
   377  		VPAND(X0, X4, X4)
   378  		VINSERTI128(Imm(1), X4, Y3, Y3)
   379  		VINSERTI128(Imm(1), X2, Y1, Y1)
   380  		VPUNPCKLQDQ(Y3, Y1, Y1)
   381  		VPERMQ(Imm(216), Y1, Y1)
   382  		VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1))
   383  		ADDQ(Imm(32), RAX)
   384  		CMPQ(R8, RAX)
   385  		JNE(LabelRef("LBB3_4"))
   386  		CMPQ(R8, RCX)
   387  		JE(LabelRef("LBB3_7"))
   388  	}
   389  
   390  	Label("LBB3_6")
   391  	{
   392  		VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0)
   393  		VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0)
   394  		SETLS(Mem{Base: RDI}.Idx(R8, 1))
   395  		ADDQ(Imm(1), R8)
   396  		CMPQ(RCX, R8)
   397  		JNE(LabelRef("LBB3_6"))
   398  	}
   399  
   400  	Label("LBB3_7")
   401  	{
   402  		VZEROUPPER()
   403  		RET()
   404  	}
   405  }
   406  
   407  func genGt_F64() {
   408  
   409  	data := GLOBL("dataGtF64", RODATA|NOPTR)
   410  	DATA(0, U8(1))
   411  	DATA(1, U8(1))
   412  	DATA(2, U8(1))
   413  	DATA(3, U8(1))
   414  	DATA(4, U8(0))
   415  	DATA(5, U8(0))
   416  	DATA(6, U8(0))
   417  	DATA(7, U8(0))
   418  	DATA(8, U8(0))
   419  	DATA(9, U8(0))
   420  	DATA(10, U8(0))
   421  	DATA(11, U8(0))
   422  	DATA(12, U8(0))
   423  	DATA(13, U8(0))
   424  	DATA(14, U8(0))
   425  	DATA(15, U8(0))
   426  
   427  	TEXT("Gt_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)")
   428  	Pragma("noescape")
   429  	Load(Param("x").Base(), RDI)
   430  	Load(Param("y").Base(), RSI)
   431  	Load(Param("z").Base(), RDX)
   432  	Load(Param("x").Len(), RCX)
   433  
   434  	TESTQ(RCX, RCX)
   435  	JE(LabelRef("LBB4_7"))
   436  	CMPQ(RCX, Imm(16))
   437  	JAE(LabelRef("LBB4_3"))
   438  	XORL(R8L, R8L)
   439  	JMP(LabelRef("LBB4_6"))
   440  
   441  	Label("LBB4_3")
   442  	{
   443  		MOVQ(RCX, R8)
   444  		ANDQ(I32(-16), R8)
   445  		XORL(EAX, EAX)
   446  		VMOVDQU(data.Offset(0), X0)
   447  	}
   448  
   449  	Label("LBB4_4")
   450  	{
   451  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8), Y1)
   452  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2)
   453  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3)
   454  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4)
   455  		VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RAX, 8), Y1, Y1)
   456  		VEXTRACTF128(Imm(1), Y1, X5)
   457  		VPACKSSDW(X5, X1, X1)
   458  		VPACKSSDW(X1, X1, X1)
   459  		VPACKSSWB(X1, X1, X1)
   460  		VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2, Y2)
   461  		VPAND(X0, X1, X1)
   462  		VEXTRACTF128(Imm(1), Y2, X5)
   463  		VPACKSSDW(X5, X2, X2)
   464  		VPACKSSDW(X2, X2, X2)
   465  		VPACKSSWB(X2, X2, X2)
   466  		VPAND(X0, X2, X2)
   467  		VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3, Y3)
   468  		VPUNPCKLDQ(X2, X1, X1)
   469  		VEXTRACTF128(Imm(1), Y3, X2)
   470  		VPACKSSDW(X2, X3, X2)
   471  		VPACKSSDW(X2, X2, X2)
   472  		VPACKSSWB(X2, X2, X2)
   473  		VPAND(X0, X2, X2)
   474  		VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4, Y3)
   475  		VEXTRACTF128(Imm(1), Y3, X4)
   476  		VPACKSSDW(X4, X3, X3)
   477  		VPACKSSDW(X3, X3, X3)
   478  		VPACKSSWB(X3, X3, X3)
   479  		VPAND(X0, X3, X3)
   480  		VPBROADCASTD(X3, X3)
   481  		VPBROADCASTD(X2, X2)
   482  		VPUNPCKLDQ(X3, X2, X2)
   483  		VPBLENDD(Imm(12), X2, X1, X1)
   484  		VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1))
   485  		ADDQ(Imm(16), RAX)
   486  		CMPQ(R8, RAX)
   487  		JNE(LabelRef("LBB4_4"))
   488  		CMPQ(R8, RCX)
   489  		JE(LabelRef("LBB4_7"))
   490  	}
   491  
   492  	Label("LBB4_6")
   493  	{
   494  		VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0)
   495  		VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0)
   496  		SETHI(Mem{Base: RDI}.Idx(R8, 1))
   497  		ADDQ(Imm(1), R8)
   498  		CMPQ(RCX, R8)
   499  		JNE(LabelRef("LBB4_6"))
   500  	}
   501  
   502  	Label("LBB4_7")
   503  	{
   504  		VZEROUPPER()
   505  		RET()
   506  	}
   507  }
   508  
   509  func genGt_F32() {
   510  
   511  	data := GLOBL("dataGtF32", RODATA|NOPTR)
   512  	DATA(0, U8(1))
   513  	DATA(1, U8(1))
   514  	DATA(2, U8(1))
   515  	DATA(3, U8(1))
   516  	DATA(4, U8(1))
   517  	DATA(5, U8(1))
   518  	DATA(6, U8(1))
   519  	DATA(7, U8(1))
   520  	DATA(8, U8(0))
   521  	DATA(9, U8(0))
   522  	DATA(10, U8(0))
   523  	DATA(11, U8(0))
   524  	DATA(12, U8(0))
   525  	DATA(13, U8(0))
   526  	DATA(14, U8(0))
   527  	DATA(15, U8(0))
   528  
   529  	TEXT("Gt_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)")
   530  	Pragma("noescape")
   531  	Load(Param("x").Base(), RDI)
   532  	Load(Param("y").Base(), RSI)
   533  	Load(Param("z").Base(), RDX)
   534  	Load(Param("x").Len(), RCX)
   535  
   536  	TESTQ(RCX, RCX)
   537  	JE(LabelRef("LBB5_7"))
   538  	CMPQ(RCX, Imm(32))
   539  	JAE(LabelRef("LBB5_3"))
   540  	XORL(R8L, R8L)
   541  	JMP(LabelRef("LBB5_6"))
   542  
   543  	Label("LBB5_3")
   544  	{
   545  		MOVQ(RCX, R8)
   546  		ANDQ(I32(-32), R8)
   547  		XORL(EAX, EAX)
   548  		VMOVDQU(data.Offset(0), X0)
   549  	}
   550  
   551  	Label("LBB5_4")
   552  	{
   553  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4), Y1)
   554  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2)
   555  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3)
   556  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4)
   557  		VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RAX, 4), Y1, Y1)
   558  		VEXTRACTF128(Imm(1), Y1, X5)
   559  		VPACKSSDW(X5, X1, X1)
   560  		VPACKSSWB(X1, X1, X1)
   561  		VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2, Y2)
   562  		VPAND(X0, X1, X1)
   563  		VEXTRACTF128(Imm(1), Y2, X5)
   564  		VPACKSSDW(X5, X2, X2)
   565  		VPACKSSWB(X2, X2, X2)
   566  		VPAND(X0, X2, X2)
   567  		VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3, Y3)
   568  		VEXTRACTF128(Imm(1), Y3, X5)
   569  		VPACKSSDW(X5, X3, X3)
   570  		VPACKSSWB(X3, X3, X3)
   571  		VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4, Y4)
   572  		VPAND(X0, X3, X3)
   573  		VEXTRACTF128(Imm(1), Y4, X5)
   574  		VPACKSSDW(X5, X4, X4)
   575  		VPACKSSWB(X4, X4, X4)
   576  		VPAND(X0, X4, X4)
   577  		VINSERTI128(Imm(1), X4, Y3, Y3)
   578  		VINSERTI128(Imm(1), X2, Y1, Y1)
   579  		VPUNPCKLQDQ(Y3, Y1, Y1)
   580  		VPERMQ(Imm(216), Y1, Y1)
   581  		VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1))
   582  		ADDQ(Imm(32), RAX)
   583  		CMPQ(R8, RAX)
   584  		JNE(LabelRef("LBB5_4"))
   585  		CMPQ(R8, RCX)
   586  		JE(LabelRef("LBB5_7"))
   587  	}
   588  
   589  	Label("LBB5_6")
   590  	{
   591  		VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0)
   592  		VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0)
   593  		SETHI(Mem{Base: RDI}.Idx(R8, 1))
   594  		ADDQ(Imm(1), R8)
   595  		CMPQ(RCX, R8)
   596  		JNE(LabelRef("LBB5_6"))
   597  	}
   598  
   599  	Label("LBB5_7")
   600  	{
   601  		VZEROUPPER()
   602  		RET()
   603  	}
   604  }
   605  
   606  func genGte_F64() {
   607  
   608  	data := GLOBL("dataGteF64", RODATA|NOPTR)
   609  	DATA(0, U8(1))
   610  	DATA(1, U8(1))
   611  	DATA(2, U8(1))
   612  	DATA(3, U8(1))
   613  	DATA(4, U8(0))
   614  	DATA(5, U8(0))
   615  	DATA(6, U8(0))
   616  	DATA(7, U8(0))
   617  	DATA(8, U8(0))
   618  	DATA(9, U8(0))
   619  	DATA(10, U8(0))
   620  	DATA(11, U8(0))
   621  	DATA(12, U8(0))
   622  	DATA(13, U8(0))
   623  	DATA(14, U8(0))
   624  	DATA(15, U8(0))
   625  
   626  	TEXT("Gte_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)")
   627  	Pragma("noescape")
   628  	Load(Param("x").Base(), RDI)
   629  	Load(Param("y").Base(), RSI)
   630  	Load(Param("z").Base(), RDX)
   631  	Load(Param("x").Len(), RCX)
   632  
   633  	TESTQ(RCX, RCX)
   634  	JE(LabelRef("LBB6_7"))
   635  	CMPQ(RCX, Imm(16))
   636  	JAE(LabelRef("LBB6_3"))
   637  	XORL(R8L, R8L)
   638  	JMP(LabelRef("LBB6_6"))
   639  
   640  	Label("LBB6_3")
   641  	{
   642  		MOVQ(RCX, R8)
   643  		ANDQ(I32(-16), R8)
   644  		XORL(EAX, EAX)
   645  		VMOVDQU(data.Offset(0), X0)
   646  	}
   647  
   648  	Label("LBB6_4")
   649  	{
   650  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8), Y1)
   651  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2)
   652  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3)
   653  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4)
   654  		VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RAX, 8), Y1, Y1)
   655  		VEXTRACTF128(Imm(1), Y1, X5)
   656  		VPACKSSDW(X5, X1, X1)
   657  		VPACKSSDW(X1, X1, X1)
   658  		VPACKSSWB(X1, X1, X1)
   659  		VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2, Y2)
   660  		VPAND(X0, X1, X1)
   661  		VEXTRACTF128(Imm(1), Y2, X5)
   662  		VPACKSSDW(X5, X2, X2)
   663  		VPACKSSDW(X2, X2, X2)
   664  		VPACKSSWB(X2, X2, X2)
   665  		VPAND(X0, X2, X2)
   666  		VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3, Y3)
   667  		VPUNPCKLDQ(X2, X1, X1)
   668  		VEXTRACTF128(Imm(1), Y3, X2)
   669  		VPACKSSDW(X2, X3, X2)
   670  		VPACKSSDW(X2, X2, X2)
   671  		VPACKSSWB(X2, X2, X2)
   672  		VPAND(X0, X2, X2)
   673  		VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4, Y3)
   674  		VEXTRACTF128(Imm(1), Y3, X4)
   675  		VPACKSSDW(X4, X3, X3)
   676  		VPACKSSDW(X3, X3, X3)
   677  		VPACKSSWB(X3, X3, X3)
   678  		VPAND(X0, X3, X3)
   679  		VPBROADCASTD(X3, X3)
   680  		VPBROADCASTD(X2, X2)
   681  		VPUNPCKLDQ(X3, X2, X2)
   682  		VPBLENDD(Imm(12), X2, X1, X1)
   683  		VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1))
   684  		ADDQ(Imm(16), RAX)
   685  		CMPQ(R8, RAX)
   686  		JNE(LabelRef("LBB6_4"))
   687  		CMPQ(R8, RCX)
   688  		JE(LabelRef("LBB6_7"))
   689  	}
   690  
   691  	Label("LBB6_6")
   692  	{
   693  		VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0)
   694  		VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0)
   695  		SETCC(Mem{Base: RDI}.Idx(R8, 1))
   696  		ADDQ(Imm(1), R8)
   697  		CMPQ(RCX, R8)
   698  		JNE(LabelRef("LBB6_6"))
   699  	}
   700  
   701  	Label("LBB6_7")
   702  	{
   703  		VZEROUPPER()
   704  		RET()
   705  	}
   706  }
   707  
   708  func genGte_F32() {
   709  
   710  	data := GLOBL("dataGteF32", RODATA|NOPTR)
   711  	DATA(0, U8(1))
   712  	DATA(1, U8(1))
   713  	DATA(2, U8(1))
   714  	DATA(3, U8(1))
   715  	DATA(4, U8(1))
   716  	DATA(5, U8(1))
   717  	DATA(6, U8(1))
   718  	DATA(7, U8(1))
   719  	DATA(8, U8(0))
   720  	DATA(9, U8(0))
   721  	DATA(10, U8(0))
   722  	DATA(11, U8(0))
   723  	DATA(12, U8(0))
   724  	DATA(13, U8(0))
   725  	DATA(14, U8(0))
   726  	DATA(15, U8(0))
   727  
   728  	TEXT("Gte_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)")
   729  	Pragma("noescape")
   730  	Load(Param("x").Base(), RDI)
   731  	Load(Param("y").Base(), RSI)
   732  	Load(Param("z").Base(), RDX)
   733  	Load(Param("x").Len(), RCX)
   734  
   735  	TESTQ(RCX, RCX)
   736  	JE(LabelRef("LBB7_7"))
   737  	CMPQ(RCX, Imm(32))
   738  	JAE(LabelRef("LBB7_3"))
   739  	XORL(R8L, R8L)
   740  	JMP(LabelRef("LBB7_6"))
   741  
   742  	Label("LBB7_3")
   743  	{
   744  		MOVQ(RCX, R8)
   745  		ANDQ(I32(-32), R8)
   746  		XORL(EAX, EAX)
   747  		VMOVDQU(data.Offset(0), X0)
   748  	}
   749  
   750  	Label("LBB7_4")
   751  	{
   752  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4), Y1)
   753  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2)
   754  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3)
   755  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4)
   756  		VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RAX, 4), Y1, Y1)
   757  		VEXTRACTF128(Imm(1), Y1, X5)
   758  		VPACKSSDW(X5, X1, X1)
   759  		VPACKSSWB(X1, X1, X1)
   760  		VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2, Y2)
   761  		VPAND(X0, X1, X1)
   762  		VEXTRACTF128(Imm(1), Y2, X5)
   763  		VPACKSSDW(X5, X2, X2)
   764  		VPACKSSWB(X2, X2, X2)
   765  		VPAND(X0, X2, X2)
   766  		VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3, Y3)
   767  		VEXTRACTF128(Imm(1), Y3, X5)
   768  		VPACKSSDW(X5, X3, X3)
   769  		VPACKSSWB(X3, X3, X3)
   770  		VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4, Y4)
   771  		VPAND(X0, X3, X3)
   772  		VEXTRACTF128(Imm(1), Y4, X5)
   773  		VPACKSSDW(X5, X4, X4)
   774  		VPACKSSWB(X4, X4, X4)
   775  		VPAND(X0, X4, X4)
   776  		VINSERTI128(Imm(1), X4, Y3, Y3)
   777  		VINSERTI128(Imm(1), X2, Y1, Y1)
   778  		VPUNPCKLQDQ(Y3, Y1, Y1)
   779  		VPERMQ(Imm(216), Y1, Y1)
   780  		VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1))
   781  		ADDQ(Imm(32), RAX)
   782  		CMPQ(R8, RAX)
   783  		JNE(LabelRef("LBB7_4"))
   784  		CMPQ(R8, RCX)
   785  		JE(LabelRef("LBB7_7"))
   786  	}
   787  
   788  	Label("LBB7_6")
   789  	{
   790  		VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0)
   791  		VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0)
   792  		SETCC(Mem{Base: RDI}.Idx(R8, 1))
   793  		ADDQ(Imm(1), R8)
   794  		CMPQ(RCX, R8)
   795  		JNE(LabelRef("LBB7_6"))
   796  	}
   797  
   798  	Label("LBB7_7")
   799  	{
   800  		VZEROUPPER()
   801  		RET()
   802  	}
   803  }
   804  
   805  func genEq_F64() {
   806  
   807  	data := GLOBL("dataEqF64", RODATA|NOPTR)
   808  	DATA(0, U8(1))
   809  	DATA(1, U8(1))
   810  	DATA(2, U8(1))
   811  	DATA(3, U8(1))
   812  	DATA(4, U8(0))
   813  	DATA(5, U8(0))
   814  	DATA(6, U8(0))
   815  	DATA(7, U8(0))
   816  	DATA(8, U8(0))
   817  	DATA(9, U8(0))
   818  	DATA(10, U8(0))
   819  	DATA(11, U8(0))
   820  	DATA(12, U8(0))
   821  	DATA(13, U8(0))
   822  	DATA(14, U8(0))
   823  	DATA(15, U8(0))
   824  
   825  	TEXT("Eq_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)")
   826  	Pragma("noescape")
   827  	Load(Param("x").Base(), RDI)
   828  	Load(Param("y").Base(), RSI)
   829  	Load(Param("z").Base(), RDX)
   830  	Load(Param("x").Len(), RCX)
   831  
   832  	TESTQ(RCX, RCX)
   833  	JE(LabelRef("LBB8_7"))
   834  	CMPQ(RCX, Imm(16))
   835  	JAE(LabelRef("LBB8_3"))
   836  	XORL(R8L, R8L)
   837  	JMP(LabelRef("LBB8_6"))
   838  
   839  	Label("LBB8_3")
   840  	{
   841  		MOVQ(RCX, R8)
   842  		ANDQ(I32(-16), R8)
   843  		XORL(EAX, EAX)
   844  		VMOVDQU(data.Offset(0), X0)
   845  	}
   846  
   847  	Label("LBB8_4")
   848  	{
   849  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8), Y1)
   850  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2)
   851  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3)
   852  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4)
   853  		VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RAX, 8), Y1, Y1)
   854  		VEXTRACTF128(Imm(1), Y1, X5)
   855  		VPACKSSDW(X5, X1, X1)
   856  		VPACKSSDW(X1, X1, X1)
   857  		VPACKSSWB(X1, X1, X1)
   858  		VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2, Y2)
   859  		VPAND(X0, X1, X1)
   860  		VEXTRACTF128(Imm(1), Y2, X5)
   861  		VPACKSSDW(X5, X2, X2)
   862  		VPACKSSDW(X2, X2, X2)
   863  		VPACKSSWB(X2, X2, X2)
   864  		VPAND(X0, X2, X2)
   865  		VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3, Y3)
   866  		VPUNPCKLDQ(X2, X1, X1)
   867  		VEXTRACTF128(Imm(1), Y3, X2)
   868  		VPACKSSDW(X2, X3, X2)
   869  		VPACKSSDW(X2, X2, X2)
   870  		VPACKSSWB(X2, X2, X2)
   871  		VPAND(X0, X2, X2)
   872  		VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4, Y3)
   873  		VEXTRACTF128(Imm(1), Y3, X4)
   874  		VPACKSSDW(X4, X3, X3)
   875  		VPACKSSDW(X3, X3, X3)
   876  		VPACKSSWB(X3, X3, X3)
   877  		VPAND(X0, X3, X3)
   878  		VPBROADCASTD(X3, X3)
   879  		VPBROADCASTD(X2, X2)
   880  		VPUNPCKLDQ(X3, X2, X2)
   881  		VPBLENDD(Imm(12), X2, X1, X1)
   882  		VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1))
   883  		ADDQ(Imm(16), RAX)
   884  		CMPQ(R8, RAX)
   885  		JNE(LabelRef("LBB8_4"))
   886  		CMPQ(R8, RCX)
   887  		JE(LabelRef("LBB8_7"))
   888  	}
   889  
   890  	Label("LBB8_6")
   891  	{
   892  		VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0)
   893  		VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0)
   894  		SETEQ(Mem{Base: RDI}.Idx(R8, 1))
   895  		ADDQ(Imm(1), R8)
   896  		CMPQ(RCX, R8)
   897  		JNE(LabelRef("LBB8_6"))
   898  	}
   899  
   900  	Label("LBB8_7")
   901  	{
   902  		VZEROUPPER()
   903  		RET()
   904  	}
   905  }
   906  
   907  func genEq_F32() {
   908  
   909  	data := GLOBL("dataEqF32", RODATA|NOPTR)
   910  	DATA(0, U8(1))
   911  	DATA(1, U8(1))
   912  	DATA(2, U8(1))
   913  	DATA(3, U8(1))
   914  	DATA(4, U8(1))
   915  	DATA(5, U8(1))
   916  	DATA(6, U8(1))
   917  	DATA(7, U8(1))
   918  	DATA(8, U8(0))
   919  	DATA(9, U8(0))
   920  	DATA(10, U8(0))
   921  	DATA(11, U8(0))
   922  	DATA(12, U8(0))
   923  	DATA(13, U8(0))
   924  	DATA(14, U8(0))
   925  	DATA(15, U8(0))
   926  
   927  	TEXT("Eq_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)")
   928  	Pragma("noescape")
   929  	Load(Param("x").Base(), RDI)
   930  	Load(Param("y").Base(), RSI)
   931  	Load(Param("z").Base(), RDX)
   932  	Load(Param("x").Len(), RCX)
   933  
   934  	TESTQ(RCX, RCX)
   935  	JE(LabelRef("LBB9_7"))
   936  	CMPQ(RCX, Imm(32))
   937  	JAE(LabelRef("LBB9_3"))
   938  	XORL(R8L, R8L)
   939  	JMP(LabelRef("LBB9_6"))
   940  
   941  	Label("LBB9_3")
   942  	{
   943  		MOVQ(RCX, R8)
   944  		ANDQ(I32(-32), R8)
   945  		XORL(EAX, EAX)
   946  		VMOVDQU(data.Offset(0), X0)
   947  	}
   948  
   949  	Label("LBB9_4")
   950  	{
   951  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4), Y1)
   952  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2)
   953  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3)
   954  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4)
   955  		VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RAX, 4), Y1, Y1)
   956  		VEXTRACTF128(Imm(1), Y1, X5)
   957  		VPACKSSDW(X5, X1, X1)
   958  		VPACKSSWB(X1, X1, X1)
   959  		VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2, Y2)
   960  		VPAND(X0, X1, X1)
   961  		VEXTRACTF128(Imm(1), Y2, X5)
   962  		VPACKSSDW(X5, X2, X2)
   963  		VPACKSSWB(X2, X2, X2)
   964  		VPAND(X0, X2, X2)
   965  		VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3, Y3)
   966  		VEXTRACTF128(Imm(1), Y3, X5)
   967  		VPACKSSDW(X5, X3, X3)
   968  		VPACKSSWB(X3, X3, X3)
   969  		VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4, Y4)
   970  		VPAND(X0, X3, X3)
   971  		VEXTRACTF128(Imm(1), Y4, X5)
   972  		VPACKSSDW(X5, X4, X4)
   973  		VPACKSSWB(X4, X4, X4)
   974  		VPAND(X0, X4, X4)
   975  		VINSERTI128(Imm(1), X4, Y3, Y3)
   976  		VINSERTI128(Imm(1), X2, Y1, Y1)
   977  		VPUNPCKLQDQ(Y3, Y1, Y1)
   978  		VPERMQ(Imm(216), Y1, Y1)
   979  		VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1))
   980  		ADDQ(Imm(32), RAX)
   981  		CMPQ(R8, RAX)
   982  		JNE(LabelRef("LBB9_4"))
   983  		CMPQ(R8, RCX)
   984  		JE(LabelRef("LBB9_7"))
   985  	}
   986  
   987  	Label("LBB9_6")
   988  	{
   989  		VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0)
   990  		VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0)
   991  		SETEQ(Mem{Base: RDI}.Idx(R8, 1))
   992  		ADDQ(Imm(1), R8)
   993  		CMPQ(RCX, R8)
   994  		JNE(LabelRef("LBB9_6"))
   995  	}
   996  
   997  	Label("LBB9_7")
   998  	{
   999  		VZEROUPPER()
  1000  		RET()
  1001  	}
  1002  }
  1003  
  1004  func genNeq_F64() {
  1005  
  1006  	data := GLOBL("dataNeqF64", RODATA|NOPTR)
  1007  	DATA(0, U8(1))
  1008  	DATA(1, U8(1))
  1009  	DATA(2, U8(1))
  1010  	DATA(3, U8(1))
  1011  	DATA(4, U8(0))
  1012  	DATA(5, U8(0))
  1013  	DATA(6, U8(0))
  1014  	DATA(7, U8(0))
  1015  	DATA(8, U8(0))
  1016  	DATA(9, U8(0))
  1017  	DATA(10, U8(0))
  1018  	DATA(11, U8(0))
  1019  	DATA(12, U8(0))
  1020  	DATA(13, U8(0))
  1021  	DATA(14, U8(0))
  1022  	DATA(15, U8(0))
  1023  
  1024  	TEXT("Neq_AVX2_F64", NOSPLIT, "func(x []bool, y, z []float64)")
  1025  	Pragma("noescape")
  1026  	Load(Param("x").Base(), RDI)
  1027  	Load(Param("y").Base(), RSI)
  1028  	Load(Param("z").Base(), RDX)
  1029  	Load(Param("x").Len(), RCX)
  1030  
  1031  	TESTQ(RCX, RCX)
  1032  	JE(LabelRef("LBB10_7"))
  1033  	CMPQ(RCX, Imm(16))
  1034  	JAE(LabelRef("LBB10_3"))
  1035  	XORL(R8L, R8L)
  1036  	JMP(LabelRef("LBB10_6"))
  1037  
  1038  	Label("LBB10_3")
  1039  	{
  1040  		MOVQ(RCX, R8)
  1041  		ANDQ(I32(-16), R8)
  1042  		XORL(EAX, EAX)
  1043  		VMOVDQU(data.Offset(0), X0)
  1044  	}
  1045  
  1046  	Label("LBB10_4")
  1047  	{
  1048  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8), Y1)
  1049  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(32), Y2)
  1050  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(64), Y3)
  1051  		VMOVUPD(Mem{Base: RDX}.Idx(RAX, 8).Offset(96), Y4)
  1052  		VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RAX, 8), Y1, Y1)
  1053  		VEXTRACTF128(Imm(1), Y1, X5)
  1054  		VPACKSSDW(X5, X1, X1)
  1055  		VPACKSSDW(X1, X1, X1)
  1056  		VPACKSSWB(X1, X1, X1)
  1057  		VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RAX, 8).Offset(32), Y2, Y2)
  1058  		VPAND(X0, X1, X1)
  1059  		VEXTRACTF128(Imm(1), Y2, X5)
  1060  		VPACKSSDW(X5, X2, X2)
  1061  		VPACKSSDW(X2, X2, X2)
  1062  		VPACKSSWB(X2, X2, X2)
  1063  		VPAND(X0, X2, X2)
  1064  		VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RAX, 8).Offset(64), Y3, Y3)
  1065  		VPUNPCKLDQ(X2, X1, X1)
  1066  		VEXTRACTF128(Imm(1), Y3, X2)
  1067  		VPACKSSDW(X2, X3, X2)
  1068  		VPACKSSDW(X2, X2, X2)
  1069  		VPACKSSWB(X2, X2, X2)
  1070  		VPAND(X0, X2, X2)
  1071  		VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RAX, 8).Offset(96), Y4, Y3)
  1072  		VEXTRACTF128(Imm(1), Y3, X4)
  1073  		VPACKSSDW(X4, X3, X3)
  1074  		VPACKSSDW(X3, X3, X3)
  1075  		VPACKSSWB(X3, X3, X3)
  1076  		VPAND(X0, X3, X3)
  1077  		VPBROADCASTD(X3, X3)
  1078  		VPBROADCASTD(X2, X2)
  1079  		VPUNPCKLDQ(X3, X2, X2)
  1080  		VPBLENDD(Imm(12), X2, X1, X1)
  1081  		VMOVDQU(X1, Mem{Base: RDI}.Idx(RAX, 1))
  1082  		ADDQ(Imm(16), RAX)
  1083  		CMPQ(R8, RAX)
  1084  		JNE(LabelRef("LBB10_4"))
  1085  		CMPQ(R8, RCX)
  1086  		JE(LabelRef("LBB10_7"))
  1087  	}
  1088  
  1089  	Label("LBB10_6")
  1090  	{
  1091  		VMOVSD(Mem{Base: RSI}.Idx(R8, 8), X0)
  1092  		VUCOMISD(Mem{Base: RDX}.Idx(R8, 8), X0)
  1093  		SETNE(Mem{Base: RDI}.Idx(R8, 1))
  1094  		ADDQ(Imm(1), R8)
  1095  		CMPQ(RCX, R8)
  1096  		JNE(LabelRef("LBB10_6"))
  1097  	}
  1098  
  1099  	Label("LBB10_7")
  1100  	{
  1101  		VZEROUPPER()
  1102  		RET()
  1103  	}
  1104  }
  1105  
  1106  func genNeq_F32() {
  1107  
  1108  	data := GLOBL("dataNeqF32", RODATA|NOPTR)
  1109  	DATA(0, U8(1))
  1110  	DATA(1, U8(1))
  1111  	DATA(2, U8(1))
  1112  	DATA(3, U8(1))
  1113  	DATA(4, U8(1))
  1114  	DATA(5, U8(1))
  1115  	DATA(6, U8(1))
  1116  	DATA(7, U8(1))
  1117  	DATA(8, U8(0))
  1118  	DATA(9, U8(0))
  1119  	DATA(10, U8(0))
  1120  	DATA(11, U8(0))
  1121  	DATA(12, U8(0))
  1122  	DATA(13, U8(0))
  1123  	DATA(14, U8(0))
  1124  	DATA(15, U8(0))
  1125  
  1126  	TEXT("Neq_AVX2_F32", NOSPLIT, "func(x []bool, y, z []float32)")
  1127  	Pragma("noescape")
  1128  	Load(Param("x").Base(), RDI)
  1129  	Load(Param("y").Base(), RSI)
  1130  	Load(Param("z").Base(), RDX)
  1131  	Load(Param("x").Len(), RCX)
  1132  
  1133  	TESTQ(RCX, RCX)
  1134  	JE(LabelRef("LBB11_7"))
  1135  	CMPQ(RCX, Imm(32))
  1136  	JAE(LabelRef("LBB11_3"))
  1137  	XORL(R8L, R8L)
  1138  	JMP(LabelRef("LBB11_6"))
  1139  
  1140  	Label("LBB11_3")
  1141  	{
  1142  		MOVQ(RCX, R8)
  1143  		ANDQ(I32(-32), R8)
  1144  		XORL(EAX, EAX)
  1145  		VMOVDQU(data.Offset(0), X0)
  1146  	}
  1147  
  1148  	Label("LBB11_4")
  1149  	{
  1150  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4), Y1)
  1151  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(32), Y2)
  1152  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(64), Y3)
  1153  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4).Offset(96), Y4)
  1154  		VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RAX, 4), Y1, Y1)
  1155  		VEXTRACTF128(Imm(1), Y1, X5)
  1156  		VPACKSSDW(X5, X1, X1)
  1157  		VPACKSSWB(X1, X1, X1)
  1158  		VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RAX, 4).Offset(32), Y2, Y2)
  1159  		VPAND(X0, X1, X1)
  1160  		VEXTRACTF128(Imm(1), Y2, X5)
  1161  		VPACKSSDW(X5, X2, X2)
  1162  		VPACKSSWB(X2, X2, X2)
  1163  		VPAND(X0, X2, X2)
  1164  		VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RAX, 4).Offset(64), Y3, Y3)
  1165  		VEXTRACTF128(Imm(1), Y3, X5)
  1166  		VPACKSSDW(X5, X3, X3)
  1167  		VPACKSSWB(X3, X3, X3)
  1168  		VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RAX, 4).Offset(96), Y4, Y4)
  1169  		VPAND(X0, X3, X3)
  1170  		VEXTRACTF128(Imm(1), Y4, X5)
  1171  		VPACKSSDW(X5, X4, X4)
  1172  		VPACKSSWB(X4, X4, X4)
  1173  		VPAND(X0, X4, X4)
  1174  		VINSERTI128(Imm(1), X4, Y3, Y3)
  1175  		VINSERTI128(Imm(1), X2, Y1, Y1)
  1176  		VPUNPCKLQDQ(Y3, Y1, Y1)
  1177  		VPERMQ(Imm(216), Y1, Y1)
  1178  		VMOVDQU(Y1, Mem{Base: RDI}.Idx(RAX, 1))
  1179  		ADDQ(Imm(32), RAX)
  1180  		CMPQ(R8, RAX)
  1181  		JNE(LabelRef("LBB11_4"))
  1182  		CMPQ(R8, RCX)
  1183  		JE(LabelRef("LBB11_7"))
  1184  	}
  1185  
  1186  	Label("LBB11_6")
  1187  	{
  1188  		VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X0)
  1189  		VUCOMISS(Mem{Base: RDX}.Idx(R8, 4), X0)
  1190  		SETNE(Mem{Base: RDI}.Idx(R8, 1))
  1191  		ADDQ(Imm(1), R8)
  1192  		CMPQ(RCX, R8)
  1193  		JNE(LabelRef("LBB11_6"))
  1194  	}
  1195  
  1196  	Label("LBB11_7")
  1197  	{
  1198  		VZEROUPPER()
  1199  		RET()
  1200  	}
  1201  }
  1202  
  1203  func genLtNumber_F64() {
  1204  
  1205  	data := GLOBL("dataLtNumberF64", RODATA|NOPTR)
  1206  	DATA(0, U8(1))
  1207  	DATA(1, U8(1))
  1208  	DATA(2, U8(1))
  1209  	DATA(3, U8(1))
  1210  	DATA(4, U8(0))
  1211  	DATA(5, U8(0))
  1212  	DATA(6, U8(0))
  1213  	DATA(7, U8(0))
  1214  	DATA(8, U8(0))
  1215  	DATA(9, U8(0))
  1216  	DATA(10, U8(0))
  1217  	DATA(11, U8(0))
  1218  	DATA(12, U8(0))
  1219  	DATA(13, U8(0))
  1220  	DATA(14, U8(0))
  1221  	DATA(15, U8(0))
  1222  
  1223  	TEXT("LtNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)")
  1224  	Pragma("noescape")
  1225  	Load(Param("x").Base(), RDI)
  1226  	Load(Param("y").Base(), RSI)
  1227  	Load(Param("a"), X0)
  1228  	Load(Param("x").Len(), RDX)
  1229  
  1230  	TESTQ(RDX, RDX)
  1231  	JE(LabelRef("LBB12_7"))
  1232  	CMPQ(RDX, Imm(16))
  1233  	JAE(LabelRef("LBB12_3"))
  1234  	XORL(EAX, EAX)
  1235  	JMP(LabelRef("LBB12_6"))
  1236  
  1237  	Label("LBB12_3")
  1238  	{
  1239  		MOVQ(RDX, RAX)
  1240  		ANDQ(I32(-16), RAX)
  1241  		VBROADCASTSD(X0, Y1)
  1242  		XORL(ECX, ECX)
  1243  		VMOVDQU(data.Offset(0), X2)
  1244  	}
  1245  
  1246  	Label("LBB12_4")
  1247  	{
  1248  		VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8), Y3)
  1249  		VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y4)
  1250  		VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y5)
  1251  		VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y6)
  1252  		VCMPPD(Imm(1), Y1, Y3, Y3)
  1253  		VEXTRACTF128(Imm(1), Y3, X7)
  1254  		VPACKSSDW(X7, X3, X3)
  1255  		VPACKSSDW(X3, X3, X3)
  1256  		VPACKSSWB(X3, X3, X3)
  1257  		VPAND(X2, X3, X3)
  1258  		VCMPPD(Imm(1), Y1, Y4, Y4)
  1259  		VEXTRACTF128(Imm(1), Y4, X7)
  1260  		VPACKSSDW(X7, X4, X4)
  1261  		VPACKSSDW(X4, X4, X4)
  1262  		VPACKSSWB(X4, X4, X4)
  1263  		VPAND(X2, X4, X4)
  1264  		VPUNPCKLDQ(X4, X3, X3)
  1265  		VCMPPD(Imm(1), Y1, Y5, Y4)
  1266  		VEXTRACTF128(Imm(1), Y4, X5)
  1267  		VPACKSSDW(X5, X4, X4)
  1268  		VPACKSSDW(X4, X4, X4)
  1269  		VPACKSSWB(X4, X4, X4)
  1270  		VPAND(X2, X4, X4)
  1271  		VCMPPD(Imm(1), Y1, Y6, Y5)
  1272  		VEXTRACTF128(Imm(1), Y5, X6)
  1273  		VPACKSSDW(X6, X5, X5)
  1274  		VPACKSSDW(X5, X5, X5)
  1275  		VPACKSSWB(X5, X5, X5)
  1276  		VPAND(X2, X5, X5)
  1277  		VPBROADCASTD(X5, X5)
  1278  		VPBROADCASTD(X4, X4)
  1279  		VPUNPCKLDQ(X5, X4, X4)
  1280  		VPBLENDD(Imm(12), X4, X3, X3)
  1281  		VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1))
  1282  		ADDQ(Imm(16), RCX)
  1283  		CMPQ(RAX, RCX)
  1284  		JNE(LabelRef("LBB12_4"))
  1285  		CMPQ(RAX, RDX)
  1286  		JE(LabelRef("LBB12_7"))
  1287  	}
  1288  
  1289  	Label("LBB12_6")
  1290  	{
  1291  		VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0)
  1292  		SETHI(Mem{Base: RDI}.Idx(RAX, 1))
  1293  		ADDQ(Imm(1), RAX)
  1294  		CMPQ(RDX, RAX)
  1295  		JNE(LabelRef("LBB12_6"))
  1296  	}
  1297  
  1298  	Label("LBB12_7")
  1299  	{
  1300  		VZEROUPPER()
  1301  		RET()
  1302  	}
  1303  }
  1304  
  1305  func genLtNumber_F32() {
  1306  
  1307  	data := GLOBL("dataLtNumberF32", RODATA|NOPTR)
  1308  	DATA(0, U8(1))
  1309  	DATA(1, U8(1))
  1310  	DATA(2, U8(1))
  1311  	DATA(3, U8(1))
  1312  	DATA(4, U8(1))
  1313  	DATA(5, U8(1))
  1314  	DATA(6, U8(1))
  1315  	DATA(7, U8(1))
  1316  	DATA(8, U8(0))
  1317  	DATA(9, U8(0))
  1318  	DATA(10, U8(0))
  1319  	DATA(11, U8(0))
  1320  	DATA(12, U8(0))
  1321  	DATA(13, U8(0))
  1322  	DATA(14, U8(0))
  1323  	DATA(15, U8(0))
  1324  
  1325  	TEXT("LtNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)")
  1326  	Pragma("noescape")
  1327  	Load(Param("x").Base(), RDI)
  1328  	Load(Param("y").Base(), RSI)
  1329  	Load(Param("a"), X0)
  1330  	Load(Param("x").Len(), RDX)
  1331  
  1332  	TESTQ(RDX, RDX)
  1333  	JE(LabelRef("LBB13_7"))
  1334  	CMPQ(RDX, Imm(32))
  1335  	JAE(LabelRef("LBB13_3"))
  1336  	XORL(EAX, EAX)
  1337  	JMP(LabelRef("LBB13_6"))
  1338  
  1339  	Label("LBB13_3")
  1340  	{
  1341  		MOVQ(RDX, RAX)
  1342  		ANDQ(I32(-32), RAX)
  1343  		VBROADCASTSS(X0, Y1)
  1344  		XORL(ECX, ECX)
  1345  		VMOVDQU(data.Offset(0), X2)
  1346  	}
  1347  
  1348  	Label("LBB13_4")
  1349  	{
  1350  		VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4), Y3)
  1351  		VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y4)
  1352  		VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y5)
  1353  		VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y6)
  1354  		VCMPPS(Imm(1), Y1, Y3, Y3)
  1355  		VEXTRACTF128(Imm(1), Y3, X7)
  1356  		VPACKSSDW(X7, X3, X3)
  1357  		VPACKSSWB(X3, X3, X3)
  1358  		VPAND(X2, X3, X3)
  1359  		VCMPPS(Imm(1), Y1, Y4, Y4)
  1360  		VEXTRACTF128(Imm(1), Y4, X7)
  1361  		VPACKSSDW(X7, X4, X4)
  1362  		VPACKSSWB(X4, X4, X4)
  1363  		VPAND(X2, X4, X4)
  1364  		VCMPPS(Imm(1), Y1, Y5, Y5)
  1365  		VEXTRACTF128(Imm(1), Y5, X7)
  1366  		VPACKSSDW(X7, X5, X5)
  1367  		VPACKSSWB(X5, X5, X5)
  1368  		VPAND(X2, X5, X5)
  1369  		VCMPPS(Imm(1), Y1, Y6, Y6)
  1370  		VEXTRACTF128(Imm(1), Y6, X7)
  1371  		VPACKSSDW(X7, X6, X6)
  1372  		VPACKSSWB(X6, X6, X6)
  1373  		VPAND(X2, X6, X6)
  1374  		VINSERTI128(Imm(1), X6, Y5, Y5)
  1375  		VINSERTI128(Imm(1), X4, Y3, Y3)
  1376  		VPUNPCKLQDQ(Y5, Y3, Y3)
  1377  		VPERMQ(Imm(216), Y3, Y3)
  1378  		VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1))
  1379  		ADDQ(Imm(32), RCX)
  1380  		CMPQ(RAX, RCX)
  1381  		JNE(LabelRef("LBB13_4"))
  1382  		CMPQ(RAX, RDX)
  1383  		JE(LabelRef("LBB13_7"))
  1384  	}
  1385  
  1386  	Label("LBB13_6")
  1387  	{
  1388  		VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0)
  1389  		SETHI(Mem{Base: RDI}.Idx(RAX, 1))
  1390  		ADDQ(Imm(1), RAX)
  1391  		CMPQ(RDX, RAX)
  1392  		JNE(LabelRef("LBB13_6"))
  1393  	}
  1394  
  1395  	Label("LBB13_7")
  1396  	{
  1397  		VZEROUPPER()
  1398  		RET()
  1399  	}
  1400  }
  1401  
  1402  func genLteNumber_F64() {
  1403  
  1404  	data := GLOBL("dataLteNumberF64", RODATA|NOPTR)
  1405  	DATA(0, U8(1))
  1406  	DATA(1, U8(1))
  1407  	DATA(2, U8(1))
  1408  	DATA(3, U8(1))
  1409  	DATA(4, U8(0))
  1410  	DATA(5, U8(0))
  1411  	DATA(6, U8(0))
  1412  	DATA(7, U8(0))
  1413  	DATA(8, U8(0))
  1414  	DATA(9, U8(0))
  1415  	DATA(10, U8(0))
  1416  	DATA(11, U8(0))
  1417  	DATA(12, U8(0))
  1418  	DATA(13, U8(0))
  1419  	DATA(14, U8(0))
  1420  	DATA(15, U8(0))
  1421  
  1422  	TEXT("LteNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)")
  1423  	Pragma("noescape")
  1424  	Load(Param("x").Base(), RDI)
  1425  	Load(Param("y").Base(), RSI)
  1426  	Load(Param("a"), X0)
  1427  	Load(Param("x").Len(), RDX)
  1428  
  1429  	TESTQ(RDX, RDX)
  1430  	JE(LabelRef("LBB14_7"))
  1431  	CMPQ(RDX, Imm(16))
  1432  	JAE(LabelRef("LBB14_3"))
  1433  	XORL(EAX, EAX)
  1434  	JMP(LabelRef("LBB14_6"))
  1435  
  1436  	Label("LBB14_3")
  1437  	{
  1438  		MOVQ(RDX, RAX)
  1439  		ANDQ(I32(-16), RAX)
  1440  		VBROADCASTSD(X0, Y1)
  1441  		XORL(ECX, ECX)
  1442  		VMOVDQU(data.Offset(0), X2)
  1443  	}
  1444  
  1445  	Label("LBB14_4")
  1446  	{
  1447  		VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8), Y3)
  1448  		VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y4)
  1449  		VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y5)
  1450  		VMOVUPD(Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y6)
  1451  		VCMPPD(Imm(2), Y1, Y3, Y3)
  1452  		VEXTRACTF128(Imm(1), Y3, X7)
  1453  		VPACKSSDW(X7, X3, X3)
  1454  		VPACKSSDW(X3, X3, X3)
  1455  		VPACKSSWB(X3, X3, X3)
  1456  		VPAND(X2, X3, X3)
  1457  		VCMPPD(Imm(2), Y1, Y4, Y4)
  1458  		VEXTRACTF128(Imm(1), Y4, X7)
  1459  		VPACKSSDW(X7, X4, X4)
  1460  		VPACKSSDW(X4, X4, X4)
  1461  		VPACKSSWB(X4, X4, X4)
  1462  		VPAND(X2, X4, X4)
  1463  		VPUNPCKLDQ(X4, X3, X3)
  1464  		VCMPPD(Imm(2), Y1, Y5, Y4)
  1465  		VEXTRACTF128(Imm(1), Y4, X5)
  1466  		VPACKSSDW(X5, X4, X4)
  1467  		VPACKSSDW(X4, X4, X4)
  1468  		VPACKSSWB(X4, X4, X4)
  1469  		VPAND(X2, X4, X4)
  1470  		VCMPPD(Imm(2), Y1, Y6, Y5)
  1471  		VEXTRACTF128(Imm(1), Y5, X6)
  1472  		VPACKSSDW(X6, X5, X5)
  1473  		VPACKSSDW(X5, X5, X5)
  1474  		VPACKSSWB(X5, X5, X5)
  1475  		VPAND(X2, X5, X5)
  1476  		VPBROADCASTD(X5, X5)
  1477  		VPBROADCASTD(X4, X4)
  1478  		VPUNPCKLDQ(X5, X4, X4)
  1479  		VPBLENDD(Imm(12), X4, X3, X3)
  1480  		VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1))
  1481  		ADDQ(Imm(16), RCX)
  1482  		CMPQ(RAX, RCX)
  1483  		JNE(LabelRef("LBB14_4"))
  1484  		CMPQ(RAX, RDX)
  1485  		JE(LabelRef("LBB14_7"))
  1486  	}
  1487  
  1488  	Label("LBB14_6")
  1489  	{
  1490  		VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0)
  1491  		SETCC(Mem{Base: RDI}.Idx(RAX, 1))
  1492  		ADDQ(Imm(1), RAX)
  1493  		CMPQ(RDX, RAX)
  1494  		JNE(LabelRef("LBB14_6"))
  1495  	}
  1496  
  1497  	Label("LBB14_7")
  1498  	{
  1499  		VZEROUPPER()
  1500  		RET()
  1501  	}
  1502  }
  1503  
  1504  func genLteNumber_F32() {
  1505  
  1506  	data := GLOBL("dataLteNumberF32", RODATA|NOPTR)
  1507  	DATA(0, U8(1))
  1508  	DATA(1, U8(1))
  1509  	DATA(2, U8(1))
  1510  	DATA(3, U8(1))
  1511  	DATA(4, U8(1))
  1512  	DATA(5, U8(1))
  1513  	DATA(6, U8(1))
  1514  	DATA(7, U8(1))
  1515  	DATA(8, U8(0))
  1516  	DATA(9, U8(0))
  1517  	DATA(10, U8(0))
  1518  	DATA(11, U8(0))
  1519  	DATA(12, U8(0))
  1520  	DATA(13, U8(0))
  1521  	DATA(14, U8(0))
  1522  	DATA(15, U8(0))
  1523  
  1524  	TEXT("LteNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)")
  1525  	Pragma("noescape")
  1526  	Load(Param("x").Base(), RDI)
  1527  	Load(Param("y").Base(), RSI)
  1528  	Load(Param("a"), X0)
  1529  	Load(Param("x").Len(), RDX)
  1530  
  1531  	TESTQ(RDX, RDX)
  1532  	JE(LabelRef("LBB15_7"))
  1533  	CMPQ(RDX, Imm(32))
  1534  	JAE(LabelRef("LBB15_3"))
  1535  	XORL(EAX, EAX)
  1536  	JMP(LabelRef("LBB15_6"))
  1537  
  1538  	Label("LBB15_3")
  1539  	{
  1540  		MOVQ(RDX, RAX)
  1541  		ANDQ(I32(-32), RAX)
  1542  		VBROADCASTSS(X0, Y1)
  1543  		XORL(ECX, ECX)
  1544  		VMOVDQU(data.Offset(0), X2)
  1545  	}
  1546  
  1547  	Label("LBB15_4")
  1548  	{
  1549  		VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4), Y3)
  1550  		VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y4)
  1551  		VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y5)
  1552  		VMOVUPS(Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y6)
  1553  		VCMPPS(Imm(2), Y1, Y3, Y3)
  1554  		VEXTRACTF128(Imm(1), Y3, X7)
  1555  		VPACKSSDW(X7, X3, X3)
  1556  		VPACKSSWB(X3, X3, X3)
  1557  		VPAND(X2, X3, X3)
  1558  		VCMPPS(Imm(2), Y1, Y4, Y4)
  1559  		VEXTRACTF128(Imm(1), Y4, X7)
  1560  		VPACKSSDW(X7, X4, X4)
  1561  		VPACKSSWB(X4, X4, X4)
  1562  		VPAND(X2, X4, X4)
  1563  		VCMPPS(Imm(2), Y1, Y5, Y5)
  1564  		VEXTRACTF128(Imm(1), Y5, X7)
  1565  		VPACKSSDW(X7, X5, X5)
  1566  		VPACKSSWB(X5, X5, X5)
  1567  		VPAND(X2, X5, X5)
  1568  		VCMPPS(Imm(2), Y1, Y6, Y6)
  1569  		VEXTRACTF128(Imm(1), Y6, X7)
  1570  		VPACKSSDW(X7, X6, X6)
  1571  		VPACKSSWB(X6, X6, X6)
  1572  		VPAND(X2, X6, X6)
  1573  		VINSERTI128(Imm(1), X6, Y5, Y5)
  1574  		VINSERTI128(Imm(1), X4, Y3, Y3)
  1575  		VPUNPCKLQDQ(Y5, Y3, Y3)
  1576  		VPERMQ(Imm(216), Y3, Y3)
  1577  		VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1))
  1578  		ADDQ(Imm(32), RCX)
  1579  		CMPQ(RAX, RCX)
  1580  		JNE(LabelRef("LBB15_4"))
  1581  		CMPQ(RAX, RDX)
  1582  		JE(LabelRef("LBB15_7"))
  1583  	}
  1584  
  1585  	Label("LBB15_6")
  1586  	{
  1587  		VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0)
  1588  		SETCC(Mem{Base: RDI}.Idx(RAX, 1))
  1589  		ADDQ(Imm(1), RAX)
  1590  		CMPQ(RDX, RAX)
  1591  		JNE(LabelRef("LBB15_6"))
  1592  	}
  1593  
  1594  	Label("LBB15_7")
  1595  	{
  1596  		VZEROUPPER()
  1597  		RET()
  1598  	}
  1599  }
  1600  
  1601  func genGtNumber_F64() {
  1602  
  1603  	data := GLOBL("dataGtNumberF64", RODATA|NOPTR)
  1604  	DATA(0, U8(1))
  1605  	DATA(1, U8(1))
  1606  	DATA(2, U8(1))
  1607  	DATA(3, U8(1))
  1608  	DATA(4, U8(0))
  1609  	DATA(5, U8(0))
  1610  	DATA(6, U8(0))
  1611  	DATA(7, U8(0))
  1612  	DATA(8, U8(0))
  1613  	DATA(9, U8(0))
  1614  	DATA(10, U8(0))
  1615  	DATA(11, U8(0))
  1616  	DATA(12, U8(0))
  1617  	DATA(13, U8(0))
  1618  	DATA(14, U8(0))
  1619  	DATA(15, U8(0))
  1620  
  1621  	TEXT("GtNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)")
  1622  	Pragma("noescape")
  1623  	Load(Param("x").Base(), RDI)
  1624  	Load(Param("y").Base(), RSI)
  1625  	Load(Param("a"), X0)
  1626  	Load(Param("x").Len(), RDX)
  1627  
  1628  	TESTQ(RDX, RDX)
  1629  	JE(LabelRef("LBB16_7"))
  1630  	CMPQ(RDX, Imm(16))
  1631  	JAE(LabelRef("LBB16_3"))
  1632  	XORL(EAX, EAX)
  1633  	JMP(LabelRef("LBB16_6"))
  1634  
  1635  	Label("LBB16_3")
  1636  	{
  1637  		MOVQ(RDX, RAX)
  1638  		ANDQ(I32(-16), RAX)
  1639  		VBROADCASTSD(X0, Y1)
  1640  		XORL(ECX, ECX)
  1641  		VMOVDQU(data.Offset(0), X2)
  1642  	}
  1643  
  1644  	Label("LBB16_4")
  1645  	{
  1646  		VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RCX, 8), Y1, Y3)
  1647  		VEXTRACTF128(Imm(1), Y3, X4)
  1648  		VPACKSSDW(X4, X3, X3)
  1649  		VPACKSSDW(X3, X3, X3)
  1650  		VPACKSSWB(X3, X3, X3)
  1651  		VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y1, Y4)
  1652  		VPAND(X2, X3, X3)
  1653  		VEXTRACTF128(Imm(1), Y4, X5)
  1654  		VPACKSSDW(X5, X4, X4)
  1655  		VPACKSSDW(X4, X4, X4)
  1656  		VPACKSSWB(X4, X4, X4)
  1657  		VPAND(X2, X4, X4)
  1658  		VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y1, Y5)
  1659  		VPUNPCKLDQ(X4, X3, X3)
  1660  		VEXTRACTF128(Imm(1), Y5, X4)
  1661  		VPACKSSDW(X4, X5, X4)
  1662  		VPACKSSDW(X4, X4, X4)
  1663  		VPACKSSWB(X4, X4, X4)
  1664  		VPAND(X2, X4, X4)
  1665  		VCMPPD(Imm(1), Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y1, Y5)
  1666  		VEXTRACTF128(Imm(1), Y5, X6)
  1667  		VPACKSSDW(X6, X5, X5)
  1668  		VPACKSSDW(X5, X5, X5)
  1669  		VPACKSSWB(X5, X5, X5)
  1670  		VPAND(X2, X5, X5)
  1671  		VPBROADCASTD(X5, X5)
  1672  		VPBROADCASTD(X4, X4)
  1673  		VPUNPCKLDQ(X5, X4, X4)
  1674  		VPBLENDD(Imm(12), X4, X3, X3)
  1675  		VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1))
  1676  		ADDQ(Imm(16), RCX)
  1677  		CMPQ(RAX, RCX)
  1678  		JNE(LabelRef("LBB16_4"))
  1679  		CMPQ(RAX, RDX)
  1680  		JE(LabelRef("LBB16_7"))
  1681  	}
  1682  
  1683  	Label("LBB16_6")
  1684  	{
  1685  		VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0)
  1686  		SETCS(Mem{Base: RDI}.Idx(RAX, 1))
  1687  		ADDQ(Imm(1), RAX)
  1688  		CMPQ(RDX, RAX)
  1689  		JNE(LabelRef("LBB16_6"))
  1690  	}
  1691  
  1692  	Label("LBB16_7")
  1693  	{
  1694  		VZEROUPPER()
  1695  		RET()
  1696  	}
  1697  }
  1698  
  1699  func genGtNumber_F32() {
  1700  
  1701  	data := GLOBL("dataGtNumberF32", RODATA|NOPTR)
  1702  	DATA(0, U8(1))
  1703  	DATA(1, U8(1))
  1704  	DATA(2, U8(1))
  1705  	DATA(3, U8(1))
  1706  	DATA(4, U8(1))
  1707  	DATA(5, U8(1))
  1708  	DATA(6, U8(1))
  1709  	DATA(7, U8(1))
  1710  	DATA(8, U8(0))
  1711  	DATA(9, U8(0))
  1712  	DATA(10, U8(0))
  1713  	DATA(11, U8(0))
  1714  	DATA(12, U8(0))
  1715  	DATA(13, U8(0))
  1716  	DATA(14, U8(0))
  1717  	DATA(15, U8(0))
  1718  
  1719  	TEXT("GtNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)")
  1720  	Pragma("noescape")
  1721  	Load(Param("x").Base(), RDI)
  1722  	Load(Param("y").Base(), RSI)
  1723  	Load(Param("a"), X0)
  1724  	Load(Param("x").Len(), RDX)
  1725  
  1726  	TESTQ(RDX, RDX)
  1727  	JE(LabelRef("LBB17_7"))
  1728  	CMPQ(RDX, Imm(32))
  1729  	JAE(LabelRef("LBB17_3"))
  1730  	XORL(EAX, EAX)
  1731  	JMP(LabelRef("LBB17_6"))
  1732  
  1733  	Label("LBB17_3")
  1734  	{
  1735  		MOVQ(RDX, RAX)
  1736  		ANDQ(I32(-32), RAX)
  1737  		VBROADCASTSS(X0, Y1)
  1738  		XORL(ECX, ECX)
  1739  		VMOVDQU(data.Offset(0), X2)
  1740  	}
  1741  
  1742  	Label("LBB17_4")
  1743  	{
  1744  		VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RCX, 4), Y1, Y3)
  1745  		VEXTRACTF128(Imm(1), Y3, X4)
  1746  		VPACKSSDW(X4, X3, X3)
  1747  		VPACKSSWB(X3, X3, X3)
  1748  		VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y1, Y4)
  1749  		VPAND(X2, X3, X3)
  1750  		VEXTRACTF128(Imm(1), Y4, X5)
  1751  		VPACKSSDW(X5, X4, X4)
  1752  		VPACKSSWB(X4, X4, X4)
  1753  		VPAND(X2, X4, X4)
  1754  		VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y1, Y5)
  1755  		VEXTRACTF128(Imm(1), Y5, X6)
  1756  		VPACKSSDW(X6, X5, X5)
  1757  		VPACKSSWB(X5, X5, X5)
  1758  		VCMPPS(Imm(1), Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y1, Y6)
  1759  		VPAND(X2, X5, X5)
  1760  		VEXTRACTF128(Imm(1), Y6, X7)
  1761  		VPACKSSDW(X7, X6, X6)
  1762  		VPACKSSWB(X6, X6, X6)
  1763  		VPAND(X2, X6, X6)
  1764  		VINSERTI128(Imm(1), X6, Y5, Y5)
  1765  		VINSERTI128(Imm(1), X4, Y3, Y3)
  1766  		VPUNPCKLQDQ(Y5, Y3, Y3)
  1767  		VPERMQ(Imm(216), Y3, Y3)
  1768  		VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1))
  1769  		ADDQ(Imm(32), RCX)
  1770  		CMPQ(RAX, RCX)
  1771  		JNE(LabelRef("LBB17_4"))
  1772  		CMPQ(RAX, RDX)
  1773  		JE(LabelRef("LBB17_7"))
  1774  	}
  1775  
  1776  	Label("LBB17_6")
  1777  	{
  1778  		VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0)
  1779  		SETCS(Mem{Base: RDI}.Idx(RAX, 1))
  1780  		ADDQ(Imm(1), RAX)
  1781  		CMPQ(RDX, RAX)
  1782  		JNE(LabelRef("LBB17_6"))
  1783  	}
  1784  
  1785  	Label("LBB17_7")
  1786  	{
  1787  		VZEROUPPER()
  1788  		RET()
  1789  	}
  1790  }
  1791  
  1792  func genGteNumber_F64() {
  1793  
  1794  	data := GLOBL("dataGteNumberF64", RODATA|NOPTR)
  1795  	DATA(0, U8(1))
  1796  	DATA(1, U8(1))
  1797  	DATA(2, U8(1))
  1798  	DATA(3, U8(1))
  1799  	DATA(4, U8(0))
  1800  	DATA(5, U8(0))
  1801  	DATA(6, U8(0))
  1802  	DATA(7, U8(0))
  1803  	DATA(8, U8(0))
  1804  	DATA(9, U8(0))
  1805  	DATA(10, U8(0))
  1806  	DATA(11, U8(0))
  1807  	DATA(12, U8(0))
  1808  	DATA(13, U8(0))
  1809  	DATA(14, U8(0))
  1810  	DATA(15, U8(0))
  1811  
  1812  	TEXT("GteNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)")
  1813  	Pragma("noescape")
  1814  	Load(Param("x").Base(), RDI)
  1815  	Load(Param("y").Base(), RSI)
  1816  	Load(Param("a"), X0)
  1817  	Load(Param("x").Len(), RDX)
  1818  
  1819  	TESTQ(RDX, RDX)
  1820  	JE(LabelRef("LBB18_7"))
  1821  	CMPQ(RDX, Imm(16))
  1822  	JAE(LabelRef("LBB18_3"))
  1823  	XORL(EAX, EAX)
  1824  	JMP(LabelRef("LBB18_6"))
  1825  
  1826  	Label("LBB18_3")
  1827  	{
  1828  		MOVQ(RDX, RAX)
  1829  		ANDQ(I32(-16), RAX)
  1830  		VBROADCASTSD(X0, Y1)
  1831  		XORL(ECX, ECX)
  1832  		VMOVDQU(data.Offset(0), X2)
  1833  	}
  1834  
  1835  	Label("LBB18_4")
  1836  	{
  1837  		VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RCX, 8), Y1, Y3)
  1838  		VEXTRACTF128(Imm(1), Y3, X4)
  1839  		VPACKSSDW(X4, X3, X3)
  1840  		VPACKSSDW(X3, X3, X3)
  1841  		VPACKSSWB(X3, X3, X3)
  1842  		VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y1, Y4)
  1843  		VPAND(X2, X3, X3)
  1844  		VEXTRACTF128(Imm(1), Y4, X5)
  1845  		VPACKSSDW(X5, X4, X4)
  1846  		VPACKSSDW(X4, X4, X4)
  1847  		VPACKSSWB(X4, X4, X4)
  1848  		VPAND(X2, X4, X4)
  1849  		VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y1, Y5)
  1850  		VPUNPCKLDQ(X4, X3, X3)
  1851  		VEXTRACTF128(Imm(1), Y5, X4)
  1852  		VPACKSSDW(X4, X5, X4)
  1853  		VPACKSSDW(X4, X4, X4)
  1854  		VPACKSSWB(X4, X4, X4)
  1855  		VPAND(X2, X4, X4)
  1856  		VCMPPD(Imm(2), Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y1, Y5)
  1857  		VEXTRACTF128(Imm(1), Y5, X6)
  1858  		VPACKSSDW(X6, X5, X5)
  1859  		VPACKSSDW(X5, X5, X5)
  1860  		VPACKSSWB(X5, X5, X5)
  1861  		VPAND(X2, X5, X5)
  1862  		VPBROADCASTD(X5, X5)
  1863  		VPBROADCASTD(X4, X4)
  1864  		VPUNPCKLDQ(X5, X4, X4)
  1865  		VPBLENDD(Imm(12), X4, X3, X3)
  1866  		VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1))
  1867  		ADDQ(Imm(16), RCX)
  1868  		CMPQ(RAX, RCX)
  1869  		JNE(LabelRef("LBB18_4"))
  1870  		CMPQ(RAX, RDX)
  1871  		JE(LabelRef("LBB18_7"))
  1872  	}
  1873  
  1874  	Label("LBB18_6")
  1875  	{
  1876  		VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0)
  1877  		SETLS(Mem{Base: RDI}.Idx(RAX, 1))
  1878  		ADDQ(Imm(1), RAX)
  1879  		CMPQ(RDX, RAX)
  1880  		JNE(LabelRef("LBB18_6"))
  1881  	}
  1882  
  1883  	Label("LBB18_7")
  1884  	{
  1885  		VZEROUPPER()
  1886  		RET()
  1887  	}
  1888  }
  1889  
  1890  func genGteNumber_F32() {
  1891  
  1892  	data := GLOBL("dataGteNumberF32", RODATA|NOPTR)
  1893  	DATA(0, U8(1))
  1894  	DATA(1, U8(1))
  1895  	DATA(2, U8(1))
  1896  	DATA(3, U8(1))
  1897  	DATA(4, U8(1))
  1898  	DATA(5, U8(1))
  1899  	DATA(6, U8(1))
  1900  	DATA(7, U8(1))
  1901  	DATA(8, U8(0))
  1902  	DATA(9, U8(0))
  1903  	DATA(10, U8(0))
  1904  	DATA(11, U8(0))
  1905  	DATA(12, U8(0))
  1906  	DATA(13, U8(0))
  1907  	DATA(14, U8(0))
  1908  	DATA(15, U8(0))
  1909  
  1910  	TEXT("GteNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)")
  1911  	Pragma("noescape")
  1912  	Load(Param("x").Base(), RDI)
  1913  	Load(Param("y").Base(), RSI)
  1914  	Load(Param("a"), X0)
  1915  	Load(Param("x").Len(), RDX)
  1916  
  1917  	TESTQ(RDX, RDX)
  1918  	JE(LabelRef("LBB19_7"))
  1919  	CMPQ(RDX, Imm(32))
  1920  	JAE(LabelRef("LBB19_3"))
  1921  	XORL(EAX, EAX)
  1922  	JMP(LabelRef("LBB19_6"))
  1923  
  1924  	Label("LBB19_3")
  1925  	{
  1926  		MOVQ(RDX, RAX)
  1927  		ANDQ(I32(-32), RAX)
  1928  		VBROADCASTSS(X0, Y1)
  1929  		XORL(ECX, ECX)
  1930  		VMOVDQU(data.Offset(0), X2)
  1931  	}
  1932  
  1933  	Label("LBB19_4")
  1934  	{
  1935  		VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RCX, 4), Y1, Y3)
  1936  		VEXTRACTF128(Imm(1), Y3, X4)
  1937  		VPACKSSDW(X4, X3, X3)
  1938  		VPACKSSWB(X3, X3, X3)
  1939  		VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y1, Y4)
  1940  		VPAND(X2, X3, X3)
  1941  		VEXTRACTF128(Imm(1), Y4, X5)
  1942  		VPACKSSDW(X5, X4, X4)
  1943  		VPACKSSWB(X4, X4, X4)
  1944  		VPAND(X2, X4, X4)
  1945  		VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y1, Y5)
  1946  		VEXTRACTF128(Imm(1), Y5, X6)
  1947  		VPACKSSDW(X6, X5, X5)
  1948  		VPACKSSWB(X5, X5, X5)
  1949  		VCMPPS(Imm(2), Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y1, Y6)
  1950  		VPAND(X2, X5, X5)
  1951  		VEXTRACTF128(Imm(1), Y6, X7)
  1952  		VPACKSSDW(X7, X6, X6)
  1953  		VPACKSSWB(X6, X6, X6)
  1954  		VPAND(X2, X6, X6)
  1955  		VINSERTI128(Imm(1), X6, Y5, Y5)
  1956  		VINSERTI128(Imm(1), X4, Y3, Y3)
  1957  		VPUNPCKLQDQ(Y5, Y3, Y3)
  1958  		VPERMQ(Imm(216), Y3, Y3)
  1959  		VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1))
  1960  		ADDQ(Imm(32), RCX)
  1961  		CMPQ(RAX, RCX)
  1962  		JNE(LabelRef("LBB19_4"))
  1963  		CMPQ(RAX, RDX)
  1964  		JE(LabelRef("LBB19_7"))
  1965  	}
  1966  
  1967  	Label("LBB19_6")
  1968  	{
  1969  		VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0)
  1970  		SETLS(Mem{Base: RDI}.Idx(RAX, 1))
  1971  		ADDQ(Imm(1), RAX)
  1972  		CMPQ(RDX, RAX)
  1973  		JNE(LabelRef("LBB19_6"))
  1974  	}
  1975  
  1976  	Label("LBB19_7")
  1977  	{
  1978  		VZEROUPPER()
  1979  		RET()
  1980  	}
  1981  }
  1982  
  1983  func genEqNumber_F64() {
  1984  
  1985  	data := GLOBL("dataEqNumberF64", RODATA|NOPTR)
  1986  	DATA(0, U8(1))
  1987  	DATA(1, U8(1))
  1988  	DATA(2, U8(1))
  1989  	DATA(3, U8(1))
  1990  	DATA(4, U8(0))
  1991  	DATA(5, U8(0))
  1992  	DATA(6, U8(0))
  1993  	DATA(7, U8(0))
  1994  	DATA(8, U8(0))
  1995  	DATA(9, U8(0))
  1996  	DATA(10, U8(0))
  1997  	DATA(11, U8(0))
  1998  	DATA(12, U8(0))
  1999  	DATA(13, U8(0))
  2000  	DATA(14, U8(0))
  2001  	DATA(15, U8(0))
  2002  
  2003  	TEXT("EqNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)")
  2004  	Pragma("noescape")
  2005  	Load(Param("x").Base(), RDI)
  2006  	Load(Param("y").Base(), RSI)
  2007  	Load(Param("a"), X0)
  2008  	Load(Param("x").Len(), RDX)
  2009  
  2010  	TESTQ(RDX, RDX)
  2011  	JE(LabelRef("LBB20_7"))
  2012  	CMPQ(RDX, Imm(16))
  2013  	JAE(LabelRef("LBB20_3"))
  2014  	XORL(EAX, EAX)
  2015  	JMP(LabelRef("LBB20_6"))
  2016  
  2017  	Label("LBB20_3")
  2018  	{
  2019  		MOVQ(RDX, RAX)
  2020  		ANDQ(I32(-16), RAX)
  2021  		VBROADCASTSD(X0, Y1)
  2022  		XORL(ECX, ECX)
  2023  		VMOVDQU(data.Offset(0), X2)
  2024  	}
  2025  
  2026  	Label("LBB20_4")
  2027  	{
  2028  		VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RCX, 8), Y1, Y3)
  2029  		VEXTRACTF128(Imm(1), Y3, X4)
  2030  		VPACKSSDW(X4, X3, X3)
  2031  		VPACKSSDW(X3, X3, X3)
  2032  		VPACKSSWB(X3, X3, X3)
  2033  		VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y1, Y4)
  2034  		VPAND(X2, X3, X3)
  2035  		VEXTRACTF128(Imm(1), Y4, X5)
  2036  		VPACKSSDW(X5, X4, X4)
  2037  		VPACKSSDW(X4, X4, X4)
  2038  		VPACKSSWB(X4, X4, X4)
  2039  		VPAND(X2, X4, X4)
  2040  		VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y1, Y5)
  2041  		VPUNPCKLDQ(X4, X3, X3)
  2042  		VEXTRACTF128(Imm(1), Y5, X4)
  2043  		VPACKSSDW(X4, X5, X4)
  2044  		VPACKSSDW(X4, X4, X4)
  2045  		VPACKSSWB(X4, X4, X4)
  2046  		VPAND(X2, X4, X4)
  2047  		VCMPPD(Imm(0), Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y1, Y5)
  2048  		VEXTRACTF128(Imm(1), Y5, X6)
  2049  		VPACKSSDW(X6, X5, X5)
  2050  		VPACKSSDW(X5, X5, X5)
  2051  		VPACKSSWB(X5, X5, X5)
  2052  		VPAND(X2, X5, X5)
  2053  		VPBROADCASTD(X5, X5)
  2054  		VPBROADCASTD(X4, X4)
  2055  		VPUNPCKLDQ(X5, X4, X4)
  2056  		VPBLENDD(Imm(12), X4, X3, X3)
  2057  		VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1))
  2058  		ADDQ(Imm(16), RCX)
  2059  		CMPQ(RAX, RCX)
  2060  		JNE(LabelRef("LBB20_4"))
  2061  		CMPQ(RAX, RDX)
  2062  		JE(LabelRef("LBB20_7"))
  2063  	}
  2064  
  2065  	Label("LBB20_6")
  2066  	{
  2067  		VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0)
  2068  		SETEQ(Mem{Base: RDI}.Idx(RAX, 1))
  2069  		ADDQ(Imm(1), RAX)
  2070  		CMPQ(RDX, RAX)
  2071  		JNE(LabelRef("LBB20_6"))
  2072  	}
  2073  
  2074  	Label("LBB20_7")
  2075  	{
  2076  		VZEROUPPER()
  2077  		RET()
  2078  	}
  2079  }
  2080  
  2081  func genEqNumber_F32() {
  2082  
  2083  	data := GLOBL("dataEqNumberF32", RODATA|NOPTR)
  2084  	DATA(0, U8(1))
  2085  	DATA(1, U8(1))
  2086  	DATA(2, U8(1))
  2087  	DATA(3, U8(1))
  2088  	DATA(4, U8(1))
  2089  	DATA(5, U8(1))
  2090  	DATA(6, U8(1))
  2091  	DATA(7, U8(1))
  2092  	DATA(8, U8(0))
  2093  	DATA(9, U8(0))
  2094  	DATA(10, U8(0))
  2095  	DATA(11, U8(0))
  2096  	DATA(12, U8(0))
  2097  	DATA(13, U8(0))
  2098  	DATA(14, U8(0))
  2099  	DATA(15, U8(0))
  2100  
  2101  	TEXT("EqNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)")
  2102  	Pragma("noescape")
  2103  	Load(Param("x").Base(), RDI)
  2104  	Load(Param("y").Base(), RSI)
  2105  	Load(Param("a"), X0)
  2106  	Load(Param("x").Len(), RDX)
  2107  
  2108  	TESTQ(RDX, RDX)
  2109  	JE(LabelRef("LBB21_7"))
  2110  	CMPQ(RDX, Imm(32))
  2111  	JAE(LabelRef("LBB21_3"))
  2112  	XORL(EAX, EAX)
  2113  	JMP(LabelRef("LBB21_6"))
  2114  
  2115  	Label("LBB21_3")
  2116  	{
  2117  		MOVQ(RDX, RAX)
  2118  		ANDQ(I32(-32), RAX)
  2119  		VBROADCASTSS(X0, Y1)
  2120  		XORL(ECX, ECX)
  2121  		VMOVDQU(data.Offset(0), X2)
  2122  	}
  2123  
  2124  	Label("LBB21_4")
  2125  	{
  2126  		VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RCX, 4), Y1, Y3)
  2127  		VEXTRACTF128(Imm(1), Y3, X4)
  2128  		VPACKSSDW(X4, X3, X3)
  2129  		VPACKSSWB(X3, X3, X3)
  2130  		VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y1, Y4)
  2131  		VPAND(X2, X3, X3)
  2132  		VEXTRACTF128(Imm(1), Y4, X5)
  2133  		VPACKSSDW(X5, X4, X4)
  2134  		VPACKSSWB(X4, X4, X4)
  2135  		VPAND(X2, X4, X4)
  2136  		VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y1, Y5)
  2137  		VEXTRACTF128(Imm(1), Y5, X6)
  2138  		VPACKSSDW(X6, X5, X5)
  2139  		VPACKSSWB(X5, X5, X5)
  2140  		VCMPPS(Imm(0), Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y1, Y6)
  2141  		VPAND(X2, X5, X5)
  2142  		VEXTRACTF128(Imm(1), Y6, X7)
  2143  		VPACKSSDW(X7, X6, X6)
  2144  		VPACKSSWB(X6, X6, X6)
  2145  		VPAND(X2, X6, X6)
  2146  		VINSERTI128(Imm(1), X6, Y5, Y5)
  2147  		VINSERTI128(Imm(1), X4, Y3, Y3)
  2148  		VPUNPCKLQDQ(Y5, Y3, Y3)
  2149  		VPERMQ(Imm(216), Y3, Y3)
  2150  		VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1))
  2151  		ADDQ(Imm(32), RCX)
  2152  		CMPQ(RAX, RCX)
  2153  		JNE(LabelRef("LBB21_4"))
  2154  		CMPQ(RAX, RDX)
  2155  		JE(LabelRef("LBB21_7"))
  2156  	}
  2157  
  2158  	Label("LBB21_6")
  2159  	{
  2160  		VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0)
  2161  		SETEQ(Mem{Base: RDI}.Idx(RAX, 1))
  2162  		ADDQ(Imm(1), RAX)
  2163  		CMPQ(RDX, RAX)
  2164  		JNE(LabelRef("LBB21_6"))
  2165  	}
  2166  
  2167  	Label("LBB21_7")
  2168  	{
  2169  		VZEROUPPER()
  2170  		RET()
  2171  	}
  2172  }
  2173  
  2174  func genNeqNumber_F64() {
  2175  
  2176  	data := GLOBL("dataNeqNumberF64", RODATA|NOPTR)
  2177  	DATA(0, U8(1))
  2178  	DATA(1, U8(1))
  2179  	DATA(2, U8(1))
  2180  	DATA(3, U8(1))
  2181  	DATA(4, U8(0))
  2182  	DATA(5, U8(0))
  2183  	DATA(6, U8(0))
  2184  	DATA(7, U8(0))
  2185  	DATA(8, U8(0))
  2186  	DATA(9, U8(0))
  2187  	DATA(10, U8(0))
  2188  	DATA(11, U8(0))
  2189  	DATA(12, U8(0))
  2190  	DATA(13, U8(0))
  2191  	DATA(14, U8(0))
  2192  	DATA(15, U8(0))
  2193  
  2194  	TEXT("NeqNumber_AVX2_F64", NOSPLIT, "func(x []bool, y []float64, a float64)")
  2195  	Pragma("noescape")
  2196  	Load(Param("x").Base(), RDI)
  2197  	Load(Param("y").Base(), RSI)
  2198  	Load(Param("a"), X0)
  2199  	Load(Param("x").Len(), RDX)
  2200  
  2201  	TESTQ(RDX, RDX)
  2202  	JE(LabelRef("LBB22_7"))
  2203  	CMPQ(RDX, Imm(16))
  2204  	JAE(LabelRef("LBB22_3"))
  2205  	XORL(EAX, EAX)
  2206  	JMP(LabelRef("LBB22_6"))
  2207  
  2208  	Label("LBB22_3")
  2209  	{
  2210  		MOVQ(RDX, RAX)
  2211  		ANDQ(I32(-16), RAX)
  2212  		VBROADCASTSD(X0, Y1)
  2213  		XORL(ECX, ECX)
  2214  		VMOVDQU(data.Offset(0), X2)
  2215  	}
  2216  
  2217  	Label("LBB22_4")
  2218  	{
  2219  		VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RCX, 8), Y1, Y3)
  2220  		VEXTRACTF128(Imm(1), Y3, X4)
  2221  		VPACKSSDW(X4, X3, X3)
  2222  		VPACKSSDW(X3, X3, X3)
  2223  		VPACKSSWB(X3, X3, X3)
  2224  		VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RCX, 8).Offset(32), Y1, Y4)
  2225  		VPAND(X2, X3, X3)
  2226  		VEXTRACTF128(Imm(1), Y4, X5)
  2227  		VPACKSSDW(X5, X4, X4)
  2228  		VPACKSSDW(X4, X4, X4)
  2229  		VPACKSSWB(X4, X4, X4)
  2230  		VPAND(X2, X4, X4)
  2231  		VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RCX, 8).Offset(64), Y1, Y5)
  2232  		VPUNPCKLDQ(X4, X3, X3)
  2233  		VEXTRACTF128(Imm(1), Y5, X4)
  2234  		VPACKSSDW(X4, X5, X4)
  2235  		VPACKSSDW(X4, X4, X4)
  2236  		VPACKSSWB(X4, X4, X4)
  2237  		VPAND(X2, X4, X4)
  2238  		VCMPPD(Imm(4), Mem{Base: RSI}.Idx(RCX, 8).Offset(96), Y1, Y5)
  2239  		VEXTRACTF128(Imm(1), Y5, X6)
  2240  		VPACKSSDW(X6, X5, X5)
  2241  		VPACKSSDW(X5, X5, X5)
  2242  		VPACKSSWB(X5, X5, X5)
  2243  		VPAND(X2, X5, X5)
  2244  		VPBROADCASTD(X5, X5)
  2245  		VPBROADCASTD(X4, X4)
  2246  		VPUNPCKLDQ(X5, X4, X4)
  2247  		VPBLENDD(Imm(12), X4, X3, X3)
  2248  		VMOVDQU(X3, Mem{Base: RDI}.Idx(RCX, 1))
  2249  		ADDQ(Imm(16), RCX)
  2250  		CMPQ(RAX, RCX)
  2251  		JNE(LabelRef("LBB22_4"))
  2252  		CMPQ(RAX, RDX)
  2253  		JE(LabelRef("LBB22_7"))
  2254  	}
  2255  
  2256  	Label("LBB22_6")
  2257  	{
  2258  		VUCOMISD(Mem{Base: RSI}.Idx(RAX, 8), X0)
  2259  		SETNE(Mem{Base: RDI}.Idx(RAX, 1))
  2260  		ADDQ(Imm(1), RAX)
  2261  		CMPQ(RDX, RAX)
  2262  		JNE(LabelRef("LBB22_6"))
  2263  	}
  2264  
  2265  	Label("LBB22_7")
  2266  	{
  2267  		VZEROUPPER()
  2268  		RET()
  2269  	}
  2270  }
  2271  
  2272  func genNeqNumber_F32() {
  2273  
  2274  	data := GLOBL("dataNeqNumberF32", RODATA|NOPTR)
  2275  	DATA(0, U8(1))
  2276  	DATA(1, U8(1))
  2277  	DATA(2, U8(1))
  2278  	DATA(3, U8(1))
  2279  	DATA(4, U8(1))
  2280  	DATA(5, U8(1))
  2281  	DATA(6, U8(1))
  2282  	DATA(7, U8(1))
  2283  	DATA(8, U8(0))
  2284  	DATA(9, U8(0))
  2285  	DATA(10, U8(0))
  2286  	DATA(11, U8(0))
  2287  	DATA(12, U8(0))
  2288  	DATA(13, U8(0))
  2289  	DATA(14, U8(0))
  2290  	DATA(15, U8(0))
  2291  
  2292  	TEXT("NeqNumber_AVX2_F32", NOSPLIT, "func(x []bool, y []float32, a float32)")
  2293  	Pragma("noescape")
  2294  	Load(Param("x").Base(), RDI)
  2295  	Load(Param("y").Base(), RSI)
  2296  	Load(Param("a"), X0)
  2297  	Load(Param("x").Len(), RDX)
  2298  
  2299  	TESTQ(RDX, RDX)
  2300  	JE(LabelRef("LBB23_7"))
  2301  	CMPQ(RDX, Imm(32))
  2302  	JAE(LabelRef("LBB23_3"))
  2303  	XORL(EAX, EAX)
  2304  	JMP(LabelRef("LBB23_6"))
  2305  
  2306  	Label("LBB23_3")
  2307  	{
  2308  		MOVQ(RDX, RAX)
  2309  		ANDQ(I32(-32), RAX)
  2310  		VBROADCASTSS(X0, Y1)
  2311  		XORL(ECX, ECX)
  2312  		VMOVDQU(data.Offset(0), X2)
  2313  	}
  2314  
  2315  	Label("LBB23_4")
  2316  	{
  2317  		VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RCX, 4), Y1, Y3)
  2318  		VEXTRACTF128(Imm(1), Y3, X4)
  2319  		VPACKSSDW(X4, X3, X3)
  2320  		VPACKSSWB(X3, X3, X3)
  2321  		VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RCX, 4).Offset(32), Y1, Y4)
  2322  		VPAND(X2, X3, X3)
  2323  		VEXTRACTF128(Imm(1), Y4, X5)
  2324  		VPACKSSDW(X5, X4, X4)
  2325  		VPACKSSWB(X4, X4, X4)
  2326  		VPAND(X2, X4, X4)
  2327  		VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RCX, 4).Offset(64), Y1, Y5)
  2328  		VEXTRACTF128(Imm(1), Y5, X6)
  2329  		VPACKSSDW(X6, X5, X5)
  2330  		VPACKSSWB(X5, X5, X5)
  2331  		VCMPPS(Imm(4), Mem{Base: RSI}.Idx(RCX, 4).Offset(96), Y1, Y6)
  2332  		VPAND(X2, X5, X5)
  2333  		VEXTRACTF128(Imm(1), Y6, X7)
  2334  		VPACKSSDW(X7, X6, X6)
  2335  		VPACKSSWB(X6, X6, X6)
  2336  		VPAND(X2, X6, X6)
  2337  		VINSERTI128(Imm(1), X6, Y5, Y5)
  2338  		VINSERTI128(Imm(1), X4, Y3, Y3)
  2339  		VPUNPCKLQDQ(Y5, Y3, Y3)
  2340  		VPERMQ(Imm(216), Y3, Y3)
  2341  		VMOVDQU(Y3, Mem{Base: RDI}.Idx(RCX, 1))
  2342  		ADDQ(Imm(32), RCX)
  2343  		CMPQ(RAX, RCX)
  2344  		JNE(LabelRef("LBB23_4"))
  2345  		CMPQ(RAX, RDX)
  2346  		JE(LabelRef("LBB23_7"))
  2347  	}
  2348  
  2349  	Label("LBB23_6")
  2350  	{
  2351  		VUCOMISS(Mem{Base: RSI}.Idx(RAX, 4), X0)
  2352  		SETNE(Mem{Base: RDI}.Idx(RAX, 1))
  2353  		ADDQ(Imm(1), RAX)
  2354  		CMPQ(RDX, RAX)
  2355  		JNE(LabelRef("LBB23_6"))
  2356  	}
  2357  
  2358  	Label("LBB23_7")
  2359  	{
  2360  		VZEROUPPER()
  2361  		RET()
  2362  	}
  2363  }