gitee.com/quant1x/gox@v1.7.6/num/asm/special.go (about)

     1  package main
     2  
     3  import (
     4  	. "github.com/mmcloughlin/avo/build"
     5  	. "github.com/mmcloughlin/avo/operand"
     6  	. "github.com/mmcloughlin/avo/reg"
     7  )
     8  
     9  func genSqrt_F64() {
    10  
    11  	TEXT("Sqrt_AVX2_F64", NOSPLIT, "func(x []float64) float64")
    12  	Pragma("noescape")
    13  	Load(Param("x").Base(), RDI)
    14  	Load(Param("x").Len(), RSI)
    15  
    16  	TESTQ(RSI, RSI)
    17  	JE(LabelRef("LBB0_7"))
    18  	CMPQ(RSI, Imm(4))
    19  	JAE(LabelRef("LBB0_3"))
    20  	XORL(EAX, EAX)
    21  	JMP(LabelRef("LBB0_6"))
    22  
    23  	Label("LBB0_3")
    24  	{
    25  		MOVQ(RSI, RAX)
    26  		ANDQ(I32(-4), RAX)
    27  		XORL(ECX, ECX)
    28  	}
    29  
    30  	Label("LBB0_4")
    31  	{
    32  		VSQRTPD(Mem{Base: RDI}.Idx(RCX, 8), Y0)
    33  		VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8))
    34  		ADDQ(Imm(4), RCX)
    35  		CMPQ(RAX, RCX)
    36  		JNE(LabelRef("LBB0_4"))
    37  		CMPQ(RAX, RSI)
    38  		JE(LabelRef("LBB0_7"))
    39  	}
    40  
    41  	Label("LBB0_6")
    42  	{
    43  		VMOVSD(Mem{Base: RDI}.Idx(RAX, 8), X0)
    44  		VSQRTSD(X0, X0, X0)
    45  		VMOVSD(X0, Mem{Base: RDI}.Idx(RAX, 8))
    46  		ADDQ(Imm(1), RAX)
    47  		CMPQ(RSI, RAX)
    48  		JNE(LabelRef("LBB0_6"))
    49  	}
    50  
    51  	Label("LBB0_7")
    52  	{
    53  		VZEROUPPER()
    54  		Store(X0, ReturnIndex(0))
    55  		RET()
    56  	}
    57  }
    58  
    59  func genSqrt_F32() {
    60  	data := GLOBL("dataSqrtF32", RODATA|NOPTR)
    61  	DATA(0, U32(0xc0400000))
    62  	DATA(4, U32(0xbf000000))
    63  	DATA(8, U32(0x7fffffff))
    64  	DATA(12, U32(0x00800000))
    65  
    66  	TEXT("Sqrt_AVX2_F32", NOSPLIT, "func(x []float32) float32")
    67  	Pragma("noescape")
    68  	Load(Param("x").Base(), RDI)
    69  	Load(Param("x").Len(), RSI)
    70  
    71  	TESTQ(RSI, RSI)
    72  	JE(LabelRef("LBB1_8"))
    73  	CMPQ(RSI, Imm(32))
    74  	JAE(LabelRef("LBB1_3"))
    75  	XORL(EAX, EAX)
    76  	JMP(LabelRef("LBB1_6"))
    77  
    78  	Label("LBB1_3")
    79  	{
    80  		MOVQ(RSI, RAX)
    81  		ANDQ(I32(-32), RAX)
    82  		XORL(ECX, ECX)
    83  		VBROADCASTSS(data.Offset(0), Y0)
    84  		VBROADCASTSS(data.Offset(4), Y1)
    85  		VBROADCASTSS(data.Offset(8), Y2)
    86  		VBROADCASTSS(data.Offset(12), Y3)
    87  	}
    88  
    89  	Label("LBB1_4")
    90  	{
    91  		VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4), Y4)
    92  		VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y5)
    93  		VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y6)
    94  		VRSQRTPS(Y4, Y7)
    95  		VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y8)
    96  		VMULPS(Y7, Y4, Y9)
    97  		VFMADD213PS(Y0, Y9, Y7)
    98  		VMULPS(Y1, Y9, Y9)
    99  		VMULPS(Y7, Y9, Y7)
   100  		VANDPS(Y2, Y4, Y4)
   101  		VCMPPS(Imm(2), Y4, Y3, Y4)
   102  		VANDPS(Y7, Y4, Y4)
   103  		VRSQRTPS(Y5, Y7)
   104  		VMULPS(Y7, Y5, Y9)
   105  		VFMADD213PS(Y0, Y9, Y7)
   106  		VMULPS(Y1, Y9, Y9)
   107  		VMULPS(Y7, Y9, Y7)
   108  		VANDPS(Y2, Y5, Y5)
   109  		VCMPPS(Imm(2), Y5, Y3, Y5)
   110  		VRSQRTPS(Y6, Y9)
   111  		VANDPS(Y7, Y5, Y5)
   112  		VMULPS(Y6, Y9, Y7)
   113  		VFMADD213PS(Y0, Y7, Y9)
   114  		VMULPS(Y1, Y7, Y7)
   115  		VMULPS(Y7, Y9, Y7)
   116  		VANDPS(Y2, Y6, Y6)
   117  		VCMPPS(Imm(2), Y6, Y3, Y6)
   118  		VANDPS(Y7, Y6, Y6)
   119  		VRSQRTPS(Y8, Y7)
   120  		VMULPS(Y7, Y8, Y9)
   121  		VFMADD213PS(Y0, Y9, Y7)
   122  		VMULPS(Y1, Y9, Y9)
   123  		VMULPS(Y7, Y9, Y7)
   124  		VANDPS(Y2, Y8, Y8)
   125  		VCMPPS(Imm(2), Y8, Y3, Y8)
   126  		VANDPS(Y7, Y8, Y7)
   127  		VMOVUPS(Y4, Mem{Base: RDI}.Idx(RCX, 4))
   128  		VMOVUPS(Y5, Mem{Base: RDI}.Idx(RCX, 4).Offset(32))
   129  		VMOVUPS(Y6, Mem{Base: RDI}.Idx(RCX, 4).Offset(64))
   130  		VMOVUPS(Y7, Mem{Base: RDI}.Idx(RCX, 4).Offset(96))
   131  		ADDQ(Imm(32), RCX)
   132  		CMPQ(RAX, RCX)
   133  		JNE(LabelRef("LBB1_4"))
   134  		CMPQ(RAX, RSI)
   135  		JE(LabelRef("LBB1_8"))
   136  	}
   137  
   138  	Label("LBB1_6")
   139  	{
   140  		VMOVSS(data.Offset(0), X0)
   141  		VMOVSS(data.Offset(4), X1)
   142  		VBROADCASTSS(data.Offset(8), X2)
   143  		VMOVSS(data.Offset(12), X3)
   144  	}
   145  
   146  	Label("LBB1_7")
   147  	{
   148  		VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X4)
   149  		VRSQRTSS(X4, X4, X5)
   150  		VMULSS(X5, X4, X6)
   151  		VFMADD213SS(X0, X6, X5)
   152  		VMULSS(X1, X6, X6)
   153  		VMULSS(X5, X6, X5)
   154  		VANDPS(X2, X4, X4)
   155  		VCMPSS(Imm(1), X3, X4, X4)
   156  		VANDNPS(X5, X4, X4)
   157  		VMOVSS(X4, Mem{Base: RDI}.Idx(RAX, 4))
   158  		ADDQ(Imm(1), RAX)
   159  		CMPQ(RSI, RAX)
   160  		JNE(LabelRef("LBB1_7"))
   161  	}
   162  
   163  	Label("LBB1_8")
   164  	{
   165  		VZEROUPPER()
   166  		Store(X0, ReturnIndex(0))
   167  		RET()
   168  	}
   169  }
   170  
   171  func genRound_F64() {
   172  
   173  	data := GLOBL("dataRoundF64", RODATA|NOPTR)
   174  	DATA(0, U64(0x8000000000000000))
   175  	DATA(8, U64(0x3fdfffffffffffff))
   176  	DATA(16, U64(0x8000000000000000))
   177  	DATA(24, U64(0x8000000000000000))
   178  
   179  	TEXT("Round_AVX2_F64", NOSPLIT, "func(x []float64) float64")
   180  	Pragma("noescape")
   181  	Load(Param("x").Base(), RDI)
   182  	Load(Param("x").Len(), RSI)
   183  
   184  	TESTQ(RSI, RSI)
   185  	JE(LabelRef("LBB2_8"))
   186  	CMPQ(RSI, Imm(16))
   187  	JAE(LabelRef("LBB2_3"))
   188  	XORL(EAX, EAX)
   189  	JMP(LabelRef("LBB2_6"))
   190  
   191  	Label("LBB2_3")
   192  	{
   193  		MOVQ(RSI, RAX)
   194  		ANDQ(I32(-16), RAX)
   195  		XORL(ECX, ECX)
   196  		VBROADCASTSD(data.Offset(0), Y0)
   197  		VBROADCASTSD(data.Offset(8), Y1)
   198  	}
   199  
   200  	Label("LBB2_4")
   201  	{
   202  		VMOVUPD(Mem{Base: RDI}.Idx(RCX, 8), Y2)
   203  		VMOVUPD(Mem{Base: RDI}.Idx(RCX, 8).Offset(32), Y3)
   204  		VMOVUPD(Mem{Base: RDI}.Idx(RCX, 8).Offset(64), Y4)
   205  		VMOVUPD(Mem{Base: RDI}.Idx(RCX, 8).Offset(96), Y5)
   206  		VANDPD(Y0, Y2, Y6)
   207  		VORPD(Y1, Y6, Y6)
   208  		VADDPD(Y6, Y2, Y2)
   209  		VROUNDPD(Imm(11), Y2, Y2)
   210  		VANDPD(Y0, Y3, Y6)
   211  		VORPD(Y1, Y6, Y6)
   212  		VADDPD(Y6, Y3, Y3)
   213  		VROUNDPD(Imm(11), Y3, Y3)
   214  		VANDPD(Y0, Y4, Y6)
   215  		VORPD(Y1, Y6, Y6)
   216  		VADDPD(Y6, Y4, Y4)
   217  		VROUNDPD(Imm(11), Y4, Y4)
   218  		VANDPD(Y0, Y5, Y6)
   219  		VORPD(Y1, Y6, Y6)
   220  		VADDPD(Y6, Y5, Y5)
   221  		VROUNDPD(Imm(11), Y5, Y5)
   222  		VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8))
   223  		VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(32))
   224  		VMOVUPD(Y4, Mem{Base: RDI}.Idx(RCX, 8).Offset(64))
   225  		VMOVUPD(Y5, Mem{Base: RDI}.Idx(RCX, 8).Offset(96))
   226  		ADDQ(Imm(16), RCX)
   227  		CMPQ(RAX, RCX)
   228  		JNE(LabelRef("LBB2_4"))
   229  		CMPQ(RAX, RSI)
   230  		JE(LabelRef("LBB2_8"))
   231  	}
   232  
   233  	Label("LBB2_6")
   234  	{
   235  		VMOVUPD(data.Offset(16), X0)
   236  		VMOVDDUP(data.Offset(8), X1)
   237  	}
   238  
   239  	Label("LBB2_7")
   240  	{
   241  		VMOVSD(Mem{Base: RDI}.Idx(RAX, 8), X2)
   242  		VANDPD(X0, X2, X3)
   243  		VORPD(X1, X3, X3)
   244  		VADDSD(X3, X2, X2)
   245  		VROUNDSD(Imm(11), X2, X2, X2)
   246  		VMOVSD(X2, Mem{Base: RDI}.Idx(RAX, 8))
   247  		ADDQ(Imm(1), RAX)
   248  		CMPQ(RSI, RAX)
   249  		JNE(LabelRef("LBB2_7"))
   250  	}
   251  
   252  	Label("LBB2_8")
   253  	{
   254  		VZEROUPPER()
   255  		Store(X0, ReturnIndex(0))
   256  		RET()
   257  	}
   258  }
   259  
   260  func genRound_F32() {
   261  
   262  	data := GLOBL("dataRoundF32", RODATA|NOPTR)
   263  	DATA(0, U32(0x80000000))
   264  	DATA(4, U32(0x3effffff))
   265  
   266  	TEXT("Round_AVX2_F32", NOSPLIT, "func(x []float32) float32")
   267  	Pragma("noescape")
   268  	Load(Param("x").Base(), RDI)
   269  	Load(Param("x").Len(), RSI)
   270  
   271  	TESTQ(RSI, RSI)
   272  	JE(LabelRef("LBB3_8"))
   273  	CMPQ(RSI, Imm(32))
   274  	JAE(LabelRef("LBB3_3"))
   275  	XORL(EAX, EAX)
   276  	JMP(LabelRef("LBB3_6"))
   277  
   278  	Label("LBB3_3")
   279  	{
   280  		MOVQ(RSI, RAX)
   281  		ANDQ(I32(-32), RAX)
   282  		XORL(ECX, ECX)
   283  		VBROADCASTSS(data.Offset(0), Y0)
   284  		VBROADCASTSS(data.Offset(4), Y1)
   285  	}
   286  
   287  	Label("LBB3_4")
   288  	{
   289  		VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4), Y2)
   290  		VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y3)
   291  		VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y4)
   292  		VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y5)
   293  		VANDPS(Y0, Y2, Y6)
   294  		VORPS(Y1, Y6, Y6)
   295  		VADDPS(Y6, Y2, Y2)
   296  		VROUNDPS(Imm(11), Y2, Y2)
   297  		VANDPS(Y0, Y3, Y6)
   298  		VORPS(Y1, Y6, Y6)
   299  		VADDPS(Y6, Y3, Y3)
   300  		VROUNDPS(Imm(11), Y3, Y3)
   301  		VANDPS(Y0, Y4, Y6)
   302  		VORPS(Y1, Y6, Y6)
   303  		VADDPS(Y6, Y4, Y4)
   304  		VROUNDPS(Imm(11), Y4, Y4)
   305  		VANDPS(Y0, Y5, Y6)
   306  		VORPS(Y1, Y6, Y6)
   307  		VADDPS(Y6, Y5, Y5)
   308  		VROUNDPS(Imm(11), Y5, Y5)
   309  		VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4))
   310  		VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(32))
   311  		VMOVUPS(Y4, Mem{Base: RDI}.Idx(RCX, 4).Offset(64))
   312  		VMOVUPS(Y5, Mem{Base: RDI}.Idx(RCX, 4).Offset(96))
   313  		ADDQ(Imm(32), RCX)
   314  		CMPQ(RAX, RCX)
   315  		JNE(LabelRef("LBB3_4"))
   316  		CMPQ(RAX, RSI)
   317  		JE(LabelRef("LBB3_8"))
   318  	}
   319  
   320  	Label("LBB3_6")
   321  	{
   322  		VBROADCASTSS(data.Offset(0), X0)
   323  		VBROADCASTSS(data.Offset(4), X1)
   324  	}
   325  
   326  	Label("LBB3_7")
   327  	{
   328  		VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X2)
   329  		VANDPS(X0, X2, X3)
   330  		VORPS(X1, X3, X3)
   331  		VADDSS(X3, X2, X2)
   332  		VROUNDSS(Imm(11), X2, X2, X2)
   333  		VMOVSS(X2, Mem{Base: RDI}.Idx(RAX, 4))
   334  		ADDQ(Imm(1), RAX)
   335  		CMPQ(RSI, RAX)
   336  		JNE(LabelRef("LBB3_7"))
   337  	}
   338  
   339  	Label("LBB3_8")
   340  	{
   341  		VZEROUPPER()
   342  		Store(X0, ReturnIndex(0))
   343  		RET()
   344  	}
   345  }
   346  
   347  func genFloor_F64() {
   348  
   349  	TEXT("Floor_AVX2_F64", NOSPLIT, "func(x []float64) float64")
   350  	Pragma("noescape")
   351  	Load(Param("x").Base(), RDI)
   352  	Load(Param("x").Len(), RSI)
   353  
   354  	TESTQ(RSI, RSI)
   355  	JE(LabelRef("LBB4_11"))
   356  	CMPQ(RSI, Imm(16))
   357  	JAE(LabelRef("LBB4_3"))
   358  	XORL(EAX, EAX)
   359  	JMP(LabelRef("LBB4_10"))
   360  
   361  	Label("LBB4_3")
   362  	{
   363  		MOVQ(RSI, RAX)
   364  		ANDQ(I32(-16), RAX)
   365  		LEAQ(Mem{Base: RAX}.Offset(-16), RCX)
   366  		MOVQ(RCX, R8)
   367  		SHRQ(Imm(4), R8)
   368  		ADDQ(Imm(1), R8)
   369  		TESTQ(RCX, RCX)
   370  		JE(LabelRef("LBB4_4"))
   371  		MOVQ(R8, RDX)
   372  		ANDQ(I32(-2), RDX)
   373  		XORL(ECX, ECX)
   374  	}
   375  
   376  	Label("LBB4_6")
   377  	{
   378  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8), Y0)
   379  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(32), Y1)
   380  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(64), Y2)
   381  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(96), Y3)
   382  		VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8))
   383  		VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(32))
   384  		VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(64))
   385  		VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(96))
   386  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(128), Y0)
   387  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(160), Y1)
   388  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(192), Y2)
   389  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(224), Y3)
   390  		VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8).Offset(128))
   391  		VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(160))
   392  		VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(192))
   393  		VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(224))
   394  		ADDQ(Imm(32), RCX)
   395  		ADDQ(I32(-2), RDX)
   396  		JNE(LabelRef("LBB4_6"))
   397  		TESTB(Imm(1), R8B)
   398  		JE(LabelRef("LBB4_9"))
   399  	}
   400  
   401  	Label("LBB4_8")
   402  	{
   403  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8), Y0)
   404  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(32), Y1)
   405  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(64), Y2)
   406  		VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(96), Y3)
   407  		VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8))
   408  		VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(32))
   409  		VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(64))
   410  		VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(96))
   411  	}
   412  
   413  	Label("LBB4_9")
   414  	{
   415  		CMPQ(RAX, RSI)
   416  		JE(LabelRef("LBB4_11"))
   417  	}
   418  
   419  	Label("LBB4_10")
   420  	{
   421  		VMOVSD(Mem{Base: RDI}.Idx(RAX, 8), X0)
   422  		VROUNDSD(Imm(9), X0, X0, X0)
   423  		VMOVSD(X0, Mem{Base: RDI}.Idx(RAX, 8))
   424  		ADDQ(Imm(1), RAX)
   425  		CMPQ(RSI, RAX)
   426  		JNE(LabelRef("LBB4_10"))
   427  	}
   428  
   429  	Label("LBB4_11")
   430  	{
   431  		VZEROUPPER()
   432  		Store(X0, ReturnIndex(0))
   433  		RET()
   434  	}
   435  
   436  	Label("LBB4_4")
   437  	{
   438  		XORL(ECX, ECX)
   439  		TESTB(Imm(1), R8B)
   440  		JNE(LabelRef("LBB4_8"))
   441  		JMP(LabelRef("LBB4_9"))
   442  	}
   443  }
   444  
   445  func genFloor_F32() {
   446  
   447  	TEXT("Floor_AVX2_F32", NOSPLIT, "func(x []float32) float32")
   448  	Pragma("noescape")
   449  	Load(Param("x").Base(), RDI)
   450  	Load(Param("x").Len(), RSI)
   451  
   452  	TESTQ(RSI, RSI)
   453  	JE(LabelRef("LBB5_11"))
   454  	CMPQ(RSI, Imm(32))
   455  	JAE(LabelRef("LBB5_3"))
   456  	XORL(EAX, EAX)
   457  	JMP(LabelRef("LBB5_10"))
   458  
   459  	Label("LBB5_3")
   460  	{
   461  		MOVQ(RSI, RAX)
   462  		ANDQ(I32(-32), RAX)
   463  		LEAQ(Mem{Base: RAX}.Offset(-32), RCX)
   464  		MOVQ(RCX, R8)
   465  		SHRQ(Imm(5), R8)
   466  		ADDQ(Imm(1), R8)
   467  		TESTQ(RCX, RCX)
   468  		JE(LabelRef("LBB5_4"))
   469  		MOVQ(R8, RDX)
   470  		ANDQ(I32(-2), RDX)
   471  		XORL(ECX, ECX)
   472  	}
   473  
   474  	Label("LBB5_6")
   475  	{
   476  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4), Y0)
   477  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y1)
   478  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y2)
   479  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y3)
   480  		VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4))
   481  		VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(32))
   482  		VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(64))
   483  		VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(96))
   484  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(128), Y0)
   485  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(160), Y1)
   486  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(192), Y2)
   487  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(224), Y3)
   488  		VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4).Offset(128))
   489  		VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(160))
   490  		VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(192))
   491  		VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(224))
   492  		ADDQ(Imm(64), RCX)
   493  		ADDQ(I32(-2), RDX)
   494  		JNE(LabelRef("LBB5_6"))
   495  		TESTB(Imm(1), R8B)
   496  		JE(LabelRef("LBB5_9"))
   497  	}
   498  
   499  	Label("LBB5_8")
   500  	{
   501  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4), Y0)
   502  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y1)
   503  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y2)
   504  		VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y3)
   505  		VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4))
   506  		VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(32))
   507  		VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(64))
   508  		VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(96))
   509  	}
   510  
   511  	Label("LBB5_9")
   512  	{
   513  		CMPQ(RAX, RSI)
   514  		JE(LabelRef("LBB5_11"))
   515  	}
   516  
   517  	Label("LBB5_10")
   518  	{
   519  		VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X0)
   520  		VROUNDSS(Imm(9), X0, X0, X0)
   521  		VMOVSS(X0, Mem{Base: RDI}.Idx(RAX, 4))
   522  		ADDQ(Imm(1), RAX)
   523  		CMPQ(RSI, RAX)
   524  		JNE(LabelRef("LBB5_10"))
   525  	}
   526  
   527  	Label("LBB5_11")
   528  	{
   529  		VZEROUPPER()
   530  		Store(X0, ReturnIndex(0))
   531  		RET()
   532  	}
   533  
   534  	Label("LBB5_4")
   535  	{
   536  		XORL(ECX, ECX)
   537  		TESTB(Imm(1), R8B)
   538  		JNE(LabelRef("LBB5_8"))
   539  		JMP(LabelRef("LBB5_9"))
   540  	}
   541  }
   542  
   543  func genCeil_F64() {
   544  
   545  	TEXT("Ceil_AVX2_F64", NOSPLIT, "func(x []float64) float64")
   546  	Pragma("noescape")
   547  	Load(Param("x").Base(), RDI)
   548  	Load(Param("x").Len(), RSI)
   549  
   550  	TESTQ(RSI, RSI)
   551  	JE(LabelRef("LBB6_11"))
   552  	CMPQ(RSI, Imm(16))
   553  	JAE(LabelRef("LBB6_3"))
   554  	XORL(EAX, EAX)
   555  	JMP(LabelRef("LBB6_10"))
   556  
   557  	Label("LBB6_3")
   558  	{
   559  		MOVQ(RSI, RAX)
   560  		ANDQ(I32(-16), RAX)
   561  		LEAQ(Mem{Base: RAX}.Offset(-16), RCX)
   562  		MOVQ(RCX, R8)
   563  		SHRQ(Imm(4), R8)
   564  		ADDQ(Imm(1), R8)
   565  		TESTQ(RCX, RCX)
   566  		JE(LabelRef("LBB6_4"))
   567  		MOVQ(R8, RDX)
   568  		ANDQ(I32(-2), RDX)
   569  		XORL(ECX, ECX)
   570  	}
   571  
   572  	Label("LBB6_6")
   573  	{
   574  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8), Y0)
   575  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(32), Y1)
   576  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(64), Y2)
   577  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(96), Y3)
   578  		VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8))
   579  		VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(32))
   580  		VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(64))
   581  		VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(96))
   582  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(128), Y0)
   583  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(160), Y1)
   584  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(192), Y2)
   585  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(224), Y3)
   586  		VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8).Offset(128))
   587  		VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(160))
   588  		VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(192))
   589  		VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(224))
   590  		ADDQ(Imm(32), RCX)
   591  		ADDQ(I32(-2), RDX)
   592  		JNE(LabelRef("LBB6_6"))
   593  		TESTB(Imm(1), R8B)
   594  		JE(LabelRef("LBB6_9"))
   595  	}
   596  
   597  	Label("LBB6_8")
   598  	{
   599  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8), Y0)
   600  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(32), Y1)
   601  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(64), Y2)
   602  		VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(96), Y3)
   603  		VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8))
   604  		VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(32))
   605  		VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(64))
   606  		VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(96))
   607  	}
   608  
   609  	Label("LBB6_9")
   610  	{
   611  		CMPQ(RAX, RSI)
   612  		JE(LabelRef("LBB6_11"))
   613  	}
   614  
   615  	Label("LBB6_10")
   616  	{
   617  		VMOVSD(Mem{Base: RDI}.Idx(RAX, 8), X0)
   618  		VROUNDSD(Imm(10), X0, X0, X0)
   619  		VMOVSD(X0, Mem{Base: RDI}.Idx(RAX, 8))
   620  		ADDQ(Imm(1), RAX)
   621  		CMPQ(RSI, RAX)
   622  		JNE(LabelRef("LBB6_10"))
   623  	}
   624  
   625  	Label("LBB6_11")
   626  	{
   627  		VZEROUPPER()
   628  		Store(X0, ReturnIndex(0))
   629  		RET()
   630  	}
   631  
   632  	Label("LBB6_4")
   633  	{
   634  		XORL(ECX, ECX)
   635  		TESTB(Imm(1), R8B)
   636  		JNE(LabelRef("LBB6_8"))
   637  		JMP(LabelRef("LBB6_9"))
   638  	}
   639  }
   640  
   641  func genCeil_F32() {
   642  
   643  	TEXT("Ceil_AVX2_F32", NOSPLIT, "func(x []float32) float32")
   644  	Pragma("noescape")
   645  	Load(Param("x").Base(), RDI)
   646  	Load(Param("x").Len(), RSI)
   647  
   648  	TESTQ(RSI, RSI)
   649  	JE(LabelRef("LBB7_11"))
   650  	CMPQ(RSI, Imm(32))
   651  	JAE(LabelRef("LBB7_3"))
   652  	XORL(EAX, EAX)
   653  	JMP(LabelRef("LBB7_10"))
   654  
   655  	Label("LBB7_3")
   656  	{
   657  		MOVQ(RSI, RAX)
   658  		ANDQ(I32(-32), RAX)
   659  		LEAQ(Mem{Base: RAX}.Offset(-32), RCX)
   660  		MOVQ(RCX, R8)
   661  		SHRQ(Imm(5), R8)
   662  		ADDQ(Imm(1), R8)
   663  		TESTQ(RCX, RCX)
   664  		JE(LabelRef("LBB7_4"))
   665  		MOVQ(R8, RDX)
   666  		ANDQ(I32(-2), RDX)
   667  		XORL(ECX, ECX)
   668  	}
   669  
   670  	Label("LBB7_6")
   671  	{
   672  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4), Y0)
   673  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y1)
   674  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y2)
   675  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y3)
   676  		VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4))
   677  		VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(32))
   678  		VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(64))
   679  		VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(96))
   680  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(128), Y0)
   681  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(160), Y1)
   682  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(192), Y2)
   683  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(224), Y3)
   684  		VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4).Offset(128))
   685  		VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(160))
   686  		VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(192))
   687  		VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(224))
   688  		ADDQ(Imm(64), RCX)
   689  		ADDQ(I32(-2), RDX)
   690  		JNE(LabelRef("LBB7_6"))
   691  		TESTB(Imm(1), R8B)
   692  		JE(LabelRef("LBB7_9"))
   693  	}
   694  
   695  	Label("LBB7_8")
   696  	{
   697  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4), Y0)
   698  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y1)
   699  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y2)
   700  		VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y3)
   701  		VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4))
   702  		VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(32))
   703  		VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(64))
   704  		VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(96))
   705  	}
   706  
   707  	Label("LBB7_9")
   708  	{
   709  		CMPQ(RAX, RSI)
   710  		JE(LabelRef("LBB7_11"))
   711  	}
   712  
   713  	Label("LBB7_10")
   714  	{
   715  		VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X0)
   716  		VROUNDSS(Imm(10), X0, X0, X0)
   717  		VMOVSS(X0, Mem{Base: RDI}.Idx(RAX, 4))
   718  		ADDQ(Imm(1), RAX)
   719  		CMPQ(RSI, RAX)
   720  		JNE(LabelRef("LBB7_10"))
   721  	}
   722  
   723  	Label("LBB7_11")
   724  	{
   725  		VZEROUPPER()
   726  		Store(X0, ReturnIndex(0))
   727  		RET()
   728  	}
   729  
   730  	Label("LBB7_4")
   731  	{
   732  		XORL(ECX, ECX)
   733  		TESTB(Imm(1), R8B)
   734  		JNE(LabelRef("LBB7_8"))
   735  		JMP(LabelRef("LBB7_9"))
   736  	}
   737  }
   738  
   739  func genPow_4x_F64() {
   740  
   741  	data := GLOBL("dataPowF64", RODATA|NOPTR)
   742  	DATA(0, U64(9223372036854775807))   // Label("LCPI9_0")
   743  	DATA(8, U64(0x3fe6a09e667f3bcd))    // Label("LCPI9_3")
   744  	DATA(16, U64(0xbff0000000000000))   // Label("LCPI9_4")
   745  	DATA(24, U64(0x401a509f46f4fa53))   // Label("LCPI9_5")
   746  	DATA(32, U64(0x3fdfe818a0fe1a83))   // Label("LCPI9_6")
   747  	DATA(40, U64(0x3f07bc0962b395ca))   // Label("LCPI9_7")
   748  	DATA(48, U64(0x404e798eb86c3351))   // Label("LCPI9_8")
   749  	DATA(56, U64(0x403de9738b8cb9c9))   // Label("LCPI9_9")
   750  	DATA(64, U64(0x40340a202d99830a))   // Label("LCPI9_10")
   751  	DATA(72, U64(0x404c8e7597479a10))   // Label("LCPI9_11")
   752  	DATA(80, U64(0x4054c30b52213498))   // Label("LCPI9_12")
   753  	DATA(88, U64(0x402e20359e903e37))   // Label("LCPI9_13")
   754  	DATA(96, U64(0x407351945dc908a5))   // Label("LCPI9_14")
   755  	DATA(104, U64(0x406bb86590fcfb56))  // Label("LCPI9_15")
   756  	DATA(112, U64(0x404e0f304466448e))  // Label("LCPI9_16")
   757  	DATA(120, U64(0x406b0db13e48e066))  // Label("LCPI9_17")
   758  	DATA(128, U64(4841369599423283200)) // Label("LCPI9_18")
   759  	DATA(136, U64(0xc3300000000003ff))  // Label("LCPI9_19")
   760  	DATA(144, U64(0x3ff0000000000000))  // Label("LCPI9_20")
   761  	DATA(152, U64(0xbfe0000000000000))  // Label("LCPI9_21")
   762  	DATA(160, U64(0x3fe0000000000000))  // Label("LCPI9_22")
   763  	DATA(168, U64(0x3ff71547652b82fe))  // Label("LCPI9_23")
   764  	DATA(176, U64(0xbfe62e4000000000))  // Label("LCPI9_24")
   765  	DATA(184, U64(0x3eb7f7d1cf79abca))  // Label("LCPI9_25")
   766  	DATA(192, U64(0x3fe62e42fefa39ef))  // Label("LCPI9_26")
   767  	DATA(200, U64(0x3e21eed8eff8d898))  // Label("LCPI9_27")
   768  	DATA(208, U64(0x3de6124613a86d09))  // Label("LCPI9_28")
   769  	DATA(216, U64(0x3e927e4fb7789f5c))  // Label("LCPI9_29")
   770  	DATA(224, U64(0x3e5ae64567f544e4))  // Label("LCPI9_30")
   771  	DATA(232, U64(0x3efa01a01a01a01a))  // Label("LCPI9_31")
   772  	DATA(240, U64(0x3ec71de3a556c734))  // Label("LCPI9_32")
   773  	DATA(248, U64(0x3f56c16c16c16c17))  // Label("LCPI9_33")
   774  	DATA(256, U64(0x3f2a01a01a01a01a))  // Label("LCPI9_34")
   775  	DATA(264, U64(0x3fa5555555555555))  // Label("LCPI9_35")
   776  	DATA(272, U64(0x3f81111111111111))  // Label("LCPI9_36")
   777  	DATA(280, U64(0x3fc5555555555555))  // Label("LCPI9_37")
   778  	DATA(288, U64(2046))                // Label("LCPI9_38")
   779  	DATA(296, U64(0x40a7700000000000))  // Label("LCPI9_39")
   780  	DATA(304, U64(1))                   // Label("LCPI9_40")
   781  	DATA(312, U64(0xc0a7700000000000))  // Label("LCPI9_41")
   782  	DATA(320, U64(9218868437227405312)) // Label("LCPI9_42")
   783  	DATA(328, U64(0x7ff8002040000000))  // Label("LCPI9_43")
   784  	DATA(336, U64(4503599627370495))    // Label("LCPI9_1")
   785  	DATA(344, U64(4503599627370495))
   786  	DATA(352, U64(4602678819172646912)) // Label("LCPI9_2")
   787  	DATA(360, U64(4602678819172646912))
   788  
   789  	TEXT("Pow_4x_AVX2_F64", NOSPLIT, "func(x, y []float64)")
   790  	Pragma("noescape")
   791  	Load(Param("x").Base(), RDI)
   792  	Load(Param("y").Base(), RSI)
   793  	Load(Param("x").Len(), RDX)
   794  
   795  	SUBQ(I32(1192), RSP)
   796  	ANDQ(I32(-4), RDX)
   797  	JE(LabelRef("LBB9_11"))
   798  	XORL(R8L, R8L)
   799  	VBROADCASTSD(data.Offset(0), Y0)
   800  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(512))
   801  	VBROADCASTSD(data.Offset(8), Y0)
   802  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(1120))
   803  	VPXOR(X6, X6, X6)
   804  	VBROADCASTSD(data.Offset(16), Y0)
   805  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(1088))
   806  	VBROADCASTSD(data.Offset(24), Y0)
   807  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(1056))
   808  	VBROADCASTSD(data.Offset(32), Y0)
   809  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(1024))
   810  	VBROADCASTSD(data.Offset(40), Y0)
   811  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(992))
   812  	VBROADCASTSD(data.Offset(48), Y0)
   813  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(960))
   814  	VBROADCASTSD(data.Offset(56), Y0)
   815  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(928))
   816  	VBROADCASTSD(data.Offset(64), Y0)
   817  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(896))
   818  	VBROADCASTSD(data.Offset(72), Y0)
   819  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(864))
   820  	VBROADCASTSD(data.Offset(80), Y0)
   821  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(832))
   822  	VBROADCASTSD(data.Offset(88), Y0)
   823  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(800))
   824  	VBROADCASTSD(data.Offset(96), Y0)
   825  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(768))
   826  	VBROADCASTSD(data.Offset(104), Y0)
   827  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(736))
   828  	VBROADCASTSD(data.Offset(112), Y0)
   829  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(704))
   830  	VBROADCASTSD(data.Offset(120), Y0)
   831  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(672))
   832  	VBROADCASTSD(data.Offset(128), Y0)
   833  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(640))
   834  	VBROADCASTSD(data.Offset(136), Y0)
   835  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(608))
   836  	VBROADCASTSD(data.Offset(144), Y0)
   837  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128))
   838  	VBROADCASTSD(data.Offset(152), Y0)
   839  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(576))
   840  	VBROADCASTSD(data.Offset(160), Y0)
   841  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(544))
   842  	VBROADCASTSD(data.Offset(168), Y0)
   843  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(480))
   844  	VBROADCASTSD(data.Offset(176), Y0)
   845  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(448))
   846  	VBROADCASTSD(data.Offset(184), Y0)
   847  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(416))
   848  	VBROADCASTSD(data.Offset(192), Y0)
   849  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(384))
   850  	VBROADCASTSD(data.Offset(200), Y0)
   851  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(352))
   852  	VBROADCASTSD(data.Offset(208), Y0)
   853  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(320))
   854  	VBROADCASTSD(data.Offset(216), Y0)
   855  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(288))
   856  	VBROADCASTSD(data.Offset(224), Y0)
   857  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(256))
   858  	VBROADCASTSD(data.Offset(232), Y0)
   859  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(224))
   860  	VBROADCASTSD(data.Offset(240), Y0)
   861  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(192))
   862  	VBROADCASTSD(data.Offset(248), Y0)
   863  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(160))
   864  	VBROADCASTSD(data.Offset(256), Y0)
   865  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(128))
   866  	VBROADCASTSD(data.Offset(264), Y0)
   867  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(96))
   868  	VBROADCASTSD(data.Offset(272), Y0)
   869  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(64))
   870  	VBROADCASTSD(data.Offset(280), Y0)
   871  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(32))
   872  	VBROADCASTSD(data.Offset(288), Y0)
   873  	VMOVUPS(Y0, Mem{Base: RSP})
   874  	VBROADCASTSD(data.Offset(296), Y0)
   875  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
   876  	VBROADCASTSD(data.Offset(304), Y0)
   877  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
   878  	VBROADCASTSD(data.Offset(312), Y0)
   879  	VMOVUPD(Y0, Mem{Base: RSP}.Offset(-96))
   880  	VPBROADCASTQ(data.Offset(320), Y5)
   881  	VBROADCASTSD(data.Offset(320), Y10)
   882  	JMP(LabelRef("LBB9_2"))
   883  
   884  	Label("LBB9_10")
   885  	{
   886  		VMOVUPD(Y2, Mem{Base: RDI}.Idx(R8, 8))
   887  		ADDQ(Imm(4), R8)
   888  		CMPQ(R8, RDX)
   889  		JAE(LabelRef("LBB9_11"))
   890  	}
   891  
   892  	Label("LBB9_2")
   893  	{
   894  		VMOVAPD(Y10, Y9)
   895  		VMOVDQU(Mem{Base: RDI}.Idx(R8, 8), Y13)
   896  		VMOVUPD(Mem{Base: RSI}.Idx(R8, 8), Y12)
   897  		VPAND(Mem{Base: RSP}.Offset(512), Y13, Y10)
   898  		VMOVUPD(data.Offset(336), X1)
   899  		VANDPD(Mem{Base: RDI}.Idx(R8, 8), X1, X2)
   900  		VMOVUPD(data.Offset(352), X0)
   901  		VORPD(X0, X2, X2)
   902  		VANDPD(Mem{Base: RDI}.Idx(R8, 8).Offset(16), X1, X3)
   903  		VORPD(X0, X3, X3)
   904  		VINSERTF128(Imm(1), X3, Y2, Y3)
   905  		VMOVUPD(Mem{Base: RSP}.Offset(1120), Y0)
   906  		VCMPPD(Imm(1), Y3, Y0, Y2)
   907  		VANDNPD(Y3, Y2, Y4)
   908  		VADDPD(Mem{Base: RSP}.Offset(1088), Y3, Y3)
   909  		VADDPD(Y4, Y3, Y4)
   910  		VMULPD(Y4, Y4, Y3)
   911  		VMULPD(Y3, Y3, Y7)
   912  		VMOVUPD(Mem{Base: RSP}.Offset(1024), Y8)
   913  		VFMADD213PD(Mem{Base: RSP}.Offset(1056), Y4, Y8)
   914  		VFMADD231PD(Mem{Base: RSP}.Offset(992), Y3, Y8)
   915  		VMOVUPD(Mem{Base: RSP}.Offset(928), Y11)
   916  		VFMADD213PD(Mem{Base: RSP}.Offset(960), Y4, Y11)
   917  		VMOVUPD(Mem{Base: RSP}.Offset(864), Y14)
   918  		VFMADD213PD(Mem{Base: RSP}.Offset(896), Y4, Y14)
   919  		VFMADD231PD(Y11, Y3, Y14)
   920  		VFMADD231PD(Y8, Y7, Y14)
   921  		VMULPD(Y4, Y3, Y8)
   922  		VMULPD(Y14, Y8, Y8)
   923  		VADDPD(Mem{Base: RSP}.Offset(832), Y3, Y11)
   924  		VFMADD231PD(Mem{Base: RSP}.Offset(800), Y4, Y11)
   925  		VMOVUPD(Mem{Base: RSP}.Offset(736), Y14)
   926  		VFMADD213PD(Mem{Base: RSP}.Offset(768), Y4, Y14)
   927  		VMOVUPD(Mem{Base: RSP}.Offset(672), Y15)
   928  		VFMADD213PD(Mem{Base: RSP}.Offset(704), Y4, Y15)
   929  		VFMADD231PD(Y14, Y3, Y15)
   930  		VFMADD231PD(Y11, Y7, Y15)
   931  		VDIVPD(Y15, Y8, Y7)
   932  		VMOVDQU(Y10, Mem{Base: RSP}.Offset(1152))
   933  		VPSRLQ(Imm(52), Y10, Y8)
   934  		VPOR(Mem{Base: RSP}.Offset(640), Y8, Y8)
   935  		VADDPD(Mem{Base: RSP}.Offset(608), Y8, Y8)
   936  		VMOVUPD(Mem{Base: RSP}.Offset(-128), Y0)
   937  		VANDPD(Y0, Y2, Y2)
   938  		VADDPD(Y2, Y8, Y8)
   939  		VMULPD(Y12, Y8, Y2)
   940  		VROUNDPD(Imm(8), Y2, Y2)
   941  		VFNMADD213PD(Y2, Y12, Y8)
   942  		VMOVUPD(Mem{Base: RSP}.Offset(576), Y1)
   943  		VMOVAPD(Y1, Y11)
   944  		VFMADD213PD(Y4, Y3, Y11)
   945  		VADDPD(Y7, Y11, Y11)
   946  		VMOVUPD(Mem{Base: RSP}.Offset(544), Y10)
   947  		VMULPD(Y4, Y10, Y14)
   948  		VMULPD(Y1, Y3, Y15)
   949  		VFMADD231PD(Y14, Y4, Y15)
   950  		VSUBPD(Y4, Y11, Y4)
   951  		VFMADD231PD(Y3, Y10, Y4)
   952  		VMOVUPD(Mem{Base: RSP}.Offset(480), Y1)
   953  		VMULPD(Y1, Y12, Y3)
   954  		VMULPD(Y3, Y11, Y3)
   955  		VROUNDPD(Imm(8), Y3, Y3)
   956  		VMULPD(Mem{Base: RSP}.Offset(448), Y3, Y14)
   957  		VFMADD231PD(Y11, Y12, Y14)
   958  		VFMSUB231PD(Mem{Base: RSP}.Offset(416), Y3, Y14)
   959  		VMOVUPD(Mem{Base: RSP}.Offset(384), Y11)
   960  		VFMADD231PD(Y8, Y11, Y14)
   961  		VSUBPD(Y7, Y15, Y7)
   962  		VADDPD(Y4, Y7, Y4)
   963  		VFNMSUB213PD(Y14, Y12, Y4)
   964  		VMULPD(Y1, Y4, Y7)
   965  		VROUNDPD(Imm(8), Y7, Y7)
   966  		VFNMADD231PD(Y11, Y7, Y4)
   967  		VMULPD(Y4, Y4, Y8)
   968  		VMOVUPD(Mem{Base: RSP}.Offset(320), Y11)
   969  		VFMADD213PD(Mem{Base: RSP}.Offset(352), Y4, Y11)
   970  		VMOVUPD(Mem{Base: RSP}.Offset(256), Y14)
   971  		VFMADD213PD(Mem{Base: RSP}.Offset(288), Y4, Y14)
   972  		VMOVUPD(Mem{Base: RSP}.Offset(192), Y15)
   973  		VFMADD213PD(Mem{Base: RSP}.Offset(224), Y4, Y15)
   974  		VFMADD231PD(Y14, Y8, Y15)
   975  		VMOVUPD(Mem{Base: RSP}.Offset(128), Y14)
   976  		VFMADD213PD(Mem{Base: RSP}.Offset(160), Y4, Y14)
   977  		VMOVUPD(Mem{Base: RSP}.Offset(64), Y1)
   978  		VFMADD213PD(Mem{Base: RSP}.Offset(96), Y4, Y1)
   979  		VFMADD231PD(Y14, Y8, Y1)
   980  		VMOVUPD(Mem{Base: RSP}.Offset(32), Y14)
   981  		VFMADD213PD(Y10, Y4, Y14)
   982  		VFMADD213PD(Y4, Y8, Y14)
   983  		VMULPD(Y8, Y8, Y4)
   984  		VFMADD231PD(Y11, Y4, Y15)
   985  		VFMADD231PD(Y1, Y4, Y14)
   986  		VMULPD(Y4, Y4, Y1)
   987  		VFMADD231PD(Y15, Y1, Y14)
   988  		VADDPD(Y0, Y14, Y1)
   989  		VADDPD(Y2, Y3, Y2)
   990  		VADDPD(Y7, Y2, Y15)
   991  		VROUNDPD(Imm(8), Y15, Y2)
   992  		VCVTTSD2SIQ(X2, R9)
   993  		VPERMILPD(Imm(1), X2, X3)
   994  		VCVTTSD2SIQ(X3, RAX)
   995  		VEXTRACTF128(Imm(1), Y2, X2)
   996  		VCVTTSD2SIQ(X2, RCX)
   997  		VMOVQ(RCX, X3)
   998  		VPERMILPD(Imm(1), X2, X2)
   999  		VCVTTSD2SIQ(X2, RCX)
  1000  		VMOVQ(RCX, X2)
  1001  		VPUNPCKLQDQ(X2, X3, X2)
  1002  		VMOVQ(R9, X3)
  1003  		VMOVQ(RAX, X4)
  1004  		VPUNPCKLQDQ(X4, X3, X3)
  1005  		VINSERTI128(Imm(1), X2, Y3, Y2)
  1006  		VPSRAD(Imm(31), Y1, Y3)
  1007  		VPSRAD(Imm(20), Y1, Y4)
  1008  		VPSRLQ(Imm(32), Y4, Y4)
  1009  		VPBLENDD(Imm(170), Y3, Y4, Y3)
  1010  		VPADDQ(Y3, Y2, Y4)
  1011  		VPCMPGTQ(Mem{Base: RSP}, Y4, Y3)
  1012  		VMOVUPD(Mem{Base: RSP}.Offset(-32), Y0)
  1013  		VCMPPD(Imm(1), Y15, Y0, Y7)
  1014  		VPOR(Y7, Y3, Y3)
  1015  		VMOVDQU(Mem{Base: RSP}.Offset(-64), Y0)
  1016  		VPCMPGTQ(Y4, Y0, Y4)
  1017  		VCMPPD(Imm(1), Mem{Base: RSP}.Offset(-96), Y15, Y7)
  1018  		VPOR(Y7, Y4, Y4)
  1019  		VPSLLQ(Imm(52), Y2, Y2)
  1020  		VPADDQ(Y1, Y2, Y2)
  1021  		VPOR(Y3, Y4, Y1)
  1022  		VPTEST(Y1, Y1)
  1023  		JNE(LabelRef("LBB9_3"))
  1024  		VMOVAPD(Y9, Y10)
  1025  		JMP(LabelRef("LBB9_5"))
  1026  	}
  1027  
  1028  	Label("LBB9_3")
  1029  	{
  1030  		VPANDN(Y2, Y4, Y1)
  1031  		VMOVAPD(Y9, Y10)
  1032  		VBLENDVPD(Y3, Y9, Y1, Y2)
  1033  	}
  1034  
  1035  	Label("LBB9_5")
  1036  	{
  1037  		VPAND(Y5, Y13, Y11)
  1038  		VPCMPEQQ(Y6, Y11, Y4)
  1039  		VPSRAD(Imm(31), Y13, Y1)
  1040  		VPSHUFD(Imm(245), Y1, Y7)
  1041  		VCMPPD(Imm(1), Y6, Y12, Y14)
  1042  		VCMPPD(Imm(0), Y6, Y12, Y3)
  1043  		VANDPD(Mem{Base: RSP}.Offset(-128), Y3, Y1)
  1044  		VBLENDVPD(Y14, Y10, Y1, Y1)
  1045  		VBLENDVPD(Y4, Y1, Y2, Y2)
  1046  		VPTEST(Y7, Y7)
  1047  		JNE(LabelRef("LBB9_7"))
  1048  		VPXOR(X7, X7, X7)
  1049  		JMP(LabelRef("LBB9_8"))
  1050  	}
  1051  
  1052  	Label("LBB9_7")
  1053  	{
  1054  		VROUNDPD(Imm(8), Y12, Y1)
  1055  		VCMPPD(Imm(0), Y1, Y12, Y8)
  1056  		VCVTTSD2SIQ(X1, R9)
  1057  		VPERMILPD(Imm(1), X1, X10)
  1058  		VCVTTSD2SIQ(X10, RCX)
  1059  		VEXTRACTF128(Imm(1), Y1, X1)
  1060  		VCVTTSD2SIQ(X1, RAX)
  1061  		VXORPD(X10, X10, X10)
  1062  		VMOVQ(RAX, X6)
  1063  		VPERMILPD(Imm(1), X1, X1)
  1064  		VCVTTSD2SIQ(X1, RAX)
  1065  		VMOVQ(RAX, X1)
  1066  		VPUNPCKLQDQ(X1, X6, X1)
  1067  		VMOVQ(R9, X6)
  1068  		VMOVQ(RCX, X0)
  1069  		VPUNPCKLQDQ(X0, X6, X0)
  1070  		VINSERTI128(Imm(1), X1, Y0, Y0)
  1071  		VPSLLQ(Imm(63), Y0, Y0)
  1072  		VPOR(Y2, Y0, Y1)
  1073  		VCMPPD(Imm(0), Y10, Y13, Y6)
  1074  		VBROADCASTSD(data.Offset(328), Y10)
  1075  		VBLENDVPD(Y6, Y2, Y10, Y6)
  1076  		VMOVAPD(Y9, Y10)
  1077  		VBLENDVPD(Y8, Y1, Y6, Y1)
  1078  		VXORPD(X6, X6, X6)
  1079  		VBLENDVPD(Y7, Y1, Y2, Y2)
  1080  		VANDPD(Y0, Y8, Y7)
  1081  	}
  1082  
  1083  	Label("LBB9_8")
  1084  	{
  1085  		VPCMPEQD(Y9, Y9, Y9)
  1086  		VANDPD(Y5, Y12, Y0)
  1087  		VANDPD(Y5, Y15, Y1)
  1088  		VPCMPEQQ(Y5, Y1, Y15)
  1089  		VPXOR(Y9, Y15, Y1)
  1090  		VPCMPEQQ(Y5, Y0, Y8)
  1091  		VPCMPEQQ(Y5, Y11, Y11)
  1092  		VPXOR(Y9, Y11, Y0)
  1093  		VPANDN(Y0, Y8, Y0)
  1094  		VPOR(Y4, Y1, Y1)
  1095  		VPAND(Y0, Y1, Y0)
  1096  		VPTEST(Y9, Y0)
  1097  		JB(LabelRef("LBB9_10"))
  1098  		VPXOR(Y9, Y8, Y0)
  1099  		VPANDN(Y0, Y15, Y0)
  1100  		VMOVUPD(Mem{Base: RSP}.Offset(-128), Y8)
  1101  		VMOVUPD(Mem{Base: RSP}.Offset(1152), Y9)
  1102  		VCMPPD(Imm(0), Y8, Y9, Y1)
  1103  		VCMPPD(Imm(1), Y9, Y8, Y4)
  1104  		VPSRAD(Imm(31), Y12, Y6)
  1105  		VPXOR(Y4, Y6, Y4)
  1106  		VPXOR(X6, X6, X6)
  1107  		VBLENDVPD(Y4, Y10, Y6, Y4)
  1108  		VBLENDVPD(Y1, Y8, Y4, Y1)
  1109  		VBLENDVPD(Y0, Y2, Y1, Y0)
  1110  		VANDPD(Y2, Y7, Y1)
  1111  		VANDPD(Y7, Y13, Y2)
  1112  		VORPD(Y2, Y9, Y2)
  1113  		VBLENDVPD(Y14, Y1, Y2, Y1)
  1114  		VBLENDVPD(Y3, Y8, Y1, Y1)
  1115  		VBLENDVPD(Y11, Y1, Y0, Y0)
  1116  		VCMPPD(Imm(3), Y13, Y13, Y1)
  1117  		VCMPPD(Imm(3), Y12, Y12, Y2)
  1118  		VORPD(Y1, Y2, Y1)
  1119  		VADDPD(Y13, Y12, Y2)
  1120  		VBLENDVPD(Y1, Y2, Y0, Y2)
  1121  		JMP(LabelRef("LBB9_10"))
  1122  	}
  1123  
  1124  	Label("LBB9_11")
  1125  	{
  1126  		ADDQ(I32(1192), RSP)
  1127  		VZEROUPPER()
  1128  		RET()
  1129  	}
  1130  }
  1131  
  1132  func genPow_8x_F32() {
  1133  
  1134  	data := GLOBL("genPowF32", RODATA|NOPTR)
  1135  
  1136  	DATA(0, U32(2147483647))   // Label("LCPI8_0")
  1137  	DATA(4, U32(0x3f3504f3))   // Label("LCPI8_3")
  1138  	DATA(8, U32(0xbf800000))   // Label("LCPI8_4")
  1139  	DATA(12, U32(0x3def251a))  // Label("LCPI8_5")
  1140  	DATA(16, U32(0xbdebd1b8))  // Label("LCPI8_6")
  1141  	DATA(20, U32(0x3e11e9bf))  // Label("LCPI8_7")
  1142  	DATA(24, U32(0xbdfe5d4f))  // Label("LCPI8_8")
  1143  	DATA(28, U32(0x3e4cceac))  // Label("LCPI8_9")
  1144  	DATA(32, U32(0xbe2aae50))  // Label("LCPI8_10")
  1145  	DATA(36, U32(0x3eaaaaaa))  // Label("LCPI8_11")
  1146  	DATA(40, U32(0xbe7ffffc))  // Label("LCPI8_12")
  1147  	DATA(44, U32(0x3d9021bb))  // Label("LCPI8_13")
  1148  	DATA(48, U32(0xcb00007f))  // Label("LCPI8_15")
  1149  	DATA(52, U32(0x3f800000))  // Label("LCPI8_16")
  1150  	DATA(56, U32(0xbf000000))  // Label("LCPI8_17")
  1151  	DATA(60, U32(0x3f000000))  // Label("LCPI8_18")
  1152  	DATA(64, U32(0x3fb8aa3b))  // Label("LCPI8_19")
  1153  	DATA(68, U32(0xbf318000))  // Label("LCPI8_20")
  1154  	DATA(72, U32(0xb95e8083))  // Label("LCPI8_21")
  1155  	DATA(76, U32(0xbf317218))  // Label("LCPI8_22")
  1156  	DATA(80, U32(0x3d2aaaab))  // Label("LCPI8_23")
  1157  	DATA(84, U32(0x3c088889))  // Label("LCPI8_24")
  1158  	DATA(88, U32(0x3ab60b61))  // Label("LCPI8_25")
  1159  	DATA(92, U32(0x39500d01))  // Label("LCPI8_26")
  1160  	DATA(96, U32(0x3e2aaaab))  // Label("LCPI8_27")
  1161  	DATA(100, U32(254))        // Label("LCPI8_29")
  1162  	DATA(104, U32(0x43960000)) // Label("LCPI8_30")
  1163  	DATA(108, U32(1))          // Label("LCPI8_31")
  1164  	DATA(112, U32(0xc3960000)) // Label("LCPI8_32")
  1165  	DATA(116, U32(2139095040)) // Label("LCPI8_33")
  1166  	DATA(120, U32(0x7fc00102)) // Label("LCPI8_34")
  1167  
  1168  	DATA(124, U64(36028792732385279)) // Label("LCPI8_1")
  1169  	DATA(132, U64(36028792732385279))
  1170  
  1171  	DATA(140, U64(4539628425446424576)) // Label("LCPI8_2")
  1172  	DATA(148, U64(4539628425446424576))
  1173  
  1174  	DATA(156, U64(5404319554102886400)) // Label("LCPI8_14")
  1175  
  1176  	DATA(164, U8(255)) // Label("LCPI8_28")
  1177  	DATA(165, U8(0))
  1178  	DATA(166, U8(0))
  1179  	DATA(167, U8(0))
  1180  	DATA(168, U8(255))
  1181  	DATA(169, U8(0))
  1182  	DATA(170, U8(0))
  1183  	DATA(171, U8(0))
  1184  	DATA(172, U8(255))
  1185  	DATA(173, U8(0))
  1186  	DATA(174, U8(0))
  1187  	DATA(175, U8(0))
  1188  	DATA(176, U8(255))
  1189  	DATA(177, U8(0))
  1190  	DATA(178, U8(0))
  1191  	DATA(179, U8(0))
  1192  	DATA(180, U8(255))
  1193  	DATA(181, U8(0))
  1194  	DATA(182, U8(0))
  1195  	DATA(183, U8(0))
  1196  	DATA(184, U8(255))
  1197  	DATA(185, U8(0))
  1198  	DATA(186, U8(0))
  1199  	DATA(187, U8(0))
  1200  	DATA(188, U8(255))
  1201  	DATA(189, U8(0))
  1202  	DATA(190, U8(0))
  1203  	DATA(191, U8(0))
  1204  	DATA(192, U8(255))
  1205  	DATA(193, U8(0))
  1206  	DATA(194, U8(0))
  1207  	DATA(195, U8(0))
  1208  
  1209  	TEXT("Pow_8x_AVX2_F32", NOSPLIT, "func(x, y []float32)")
  1210  	Pragma("noescape")
  1211  	Load(Param("x").Base(), RDI)
  1212  	Load(Param("y").Base(), RSI)
  1213  	Load(Param("x").Len(), RDX)
  1214  
  1215  	SUBQ(I32(872), RSP)
  1216  	ANDQ(I32(-8), RDX)
  1217  	JE(LabelRef("LBB8_12"))
  1218  	XORL(EAX, EAX)
  1219  	VBROADCASTSS(data.Offset(0), Y0)
  1220  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(320))
  1221  	VBROADCASTSS(data.Offset(4), Y0)
  1222  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(800))
  1223  	VPXOR(X7, X7, X7)
  1224  	VBROADCASTSS(data.Offset(8), Y0)
  1225  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(768))
  1226  	VBROADCASTSS(data.Offset(12), Y0)
  1227  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(736))
  1228  	VBROADCASTSS(data.Offset(16), Y0)
  1229  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(704))
  1230  	VBROADCASTSS(data.Offset(20), Y0)
  1231  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(672))
  1232  	VBROADCASTSS(data.Offset(24), Y0)
  1233  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(640))
  1234  	VBROADCASTSS(data.Offset(28), Y0)
  1235  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(608))
  1236  	VBROADCASTSS(data.Offset(32), Y0)
  1237  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(576))
  1238  	VBROADCASTSS(data.Offset(36), Y0)
  1239  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(544))
  1240  	VBROADCASTSS(data.Offset(40), Y0)
  1241  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(512))
  1242  	VBROADCASTSS(data.Offset(44), Y0)
  1243  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(480))
  1244  	VBROADCASTSD(data.Offset(156), Y0)
  1245  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(448))
  1246  	VBROADCASTSS(data.Offset(48), Y0)
  1247  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(416))
  1248  	VBROADCASTSS(data.Offset(52), Y0)
  1249  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128))
  1250  	VBROADCASTSS(data.Offset(56), Y0)
  1251  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(384))
  1252  	VBROADCASTSS(data.Offset(60), Y0)
  1253  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(352))
  1254  	VBROADCASTSS(data.Offset(64), Y0)
  1255  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(288))
  1256  	VBROADCASTSS(data.Offset(68), Y0)
  1257  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(256))
  1258  	VBROADCASTSS(data.Offset(72), Y0)
  1259  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(224))
  1260  	VBROADCASTSS(data.Offset(76), Y0)
  1261  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(192))
  1262  	VBROADCASTSS(data.Offset(80), Y0)
  1263  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(160))
  1264  	VBROADCASTSS(data.Offset(84), Y0)
  1265  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(128))
  1266  	VBROADCASTSS(data.Offset(88), Y0)
  1267  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(96))
  1268  	VBROADCASTSS(data.Offset(92), Y0)
  1269  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(64))
  1270  	VBROADCASTSS(data.Offset(96), Y0)
  1271  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(32))
  1272  	VBROADCASTSS(data.Offset(100), Y0)
  1273  	VMOVUPS(Y0, Mem{Base: RSP})
  1274  	VBROADCASTSS(data.Offset(104), Y0)
  1275  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
  1276  	VBROADCASTSS(data.Offset(108), Y0)
  1277  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
  1278  	VPBROADCASTD(data.Offset(112), Y0)
  1279  	VMOVDQU(Y0, Mem{Base: RSP}.Offset(-96))
  1280  	VPBROADCASTD(data.Offset(116), Y8)
  1281  	VBROADCASTSS(data.Offset(116), Y12)
  1282  	JMP(LabelRef("LBB8_2"))
  1283  
  1284  	Label("LBB8_10")
  1285  	{
  1286  		VPXOR(Y0, Y15, Y0)
  1287  		VPANDN(Y0, Y14, Y0)
  1288  		VMOVUPS(Mem{Base: RSP}.Offset(-128), Y14)
  1289  		VMOVUPS(Mem{Base: RSP}.Offset(832), Y2)
  1290  		VCMPPS(Imm(0), Y2, Y14, Y3)
  1291  		VCMPPS(Imm(1), Y2, Y14, Y4)
  1292  		VXORPS(Y4, Y11, Y4)
  1293  		VPXOR(X7, X7, X7)
  1294  		VBLENDVPS(Y4, Y12, Y7, Y4)
  1295  		VBLENDVPS(Y3, Y14, Y4, Y3)
  1296  		VBLENDVPS(Y0, Y6, Y3, Y0)
  1297  		VANDPS(Y6, Y10, Y3)
  1298  		VANDPS(Y9, Y10, Y4)
  1299  		VORPS(Y2, Y4, Y4)
  1300  		VBLENDVPS(Y13, Y3, Y4, Y3)
  1301  		VBLENDVPS(Y1, Y14, Y3, Y1)
  1302  		VBLENDVPS(Y5, Y0, Y1, Y0)
  1303  		VCMPPS(Imm(3), Y9, Y9, Y1)
  1304  		VCMPPS(Imm(3), Y11, Y11, Y3)
  1305  		VORPS(Y1, Y3, Y1)
  1306  		VADDPS(Y9, Y11, Y3)
  1307  		VBLENDVPS(Y1, Y3, Y0, Y6)
  1308  		VMOVUPS(Y6, Mem{Base: RDI}.Idx(RAX, 4))
  1309  		ADDQ(Imm(8), RAX)
  1310  		CMPQ(RAX, RDX)
  1311  		JAE(LabelRef("LBB8_12"))
  1312  	}
  1313  
  1314  	Label("LBB8_2")
  1315  	{
  1316  		VMOVAPS(Y12, Y2)
  1317  		VMOVDQU(Mem{Base: RDI}.Idx(RAX, 4), Y9)
  1318  		VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4), Y11)
  1319  		VPAND(Mem{Base: RSP}.Offset(320), Y9, Y12)
  1320  		VMOVUPS(data.Offset(124), X1)
  1321  		VANDPS(Mem{Base: RDI}.Idx(RAX, 4), X1, X0)
  1322  		VMOVUPS(data.Offset(140), X3)
  1323  		VORPS(X3, X0, X0)
  1324  		VANDPS(Mem{Base: RDI}.Idx(RAX, 4).Offset(16), X1, X1)
  1325  		VORPS(X3, X1, X1)
  1326  		VINSERTF128(Imm(1), X1, Y0, Y0)
  1327  		VMOVUPS(Mem{Base: RSP}.Offset(800), Y1)
  1328  		VCMPPS(Imm(1), Y0, Y1, Y1)
  1329  		VANDNPS(Y0, Y1, Y4)
  1330  		VADDPS(Mem{Base: RSP}.Offset(768), Y0, Y0)
  1331  		VADDPS(Y4, Y0, Y4)
  1332  		VMULPS(Y4, Y4, Y6)
  1333  		VMULPS(Y6, Y6, Y0)
  1334  		VMOVUPS(Mem{Base: RSP}.Offset(704), Y5)
  1335  		VFMADD213PS(Mem{Base: RSP}.Offset(736), Y4, Y5)
  1336  		VMOVUPS(Mem{Base: RSP}.Offset(640), Y10)
  1337  		VFMADD213PS(Mem{Base: RSP}.Offset(672), Y4, Y10)
  1338  		VFMADD231PS(Y5, Y6, Y10)
  1339  		VMOVUPS(Mem{Base: RSP}.Offset(576), Y5)
  1340  		VFMADD213PS(Mem{Base: RSP}.Offset(608), Y4, Y5)
  1341  		VMOVUPS(Mem{Base: RSP}.Offset(512), Y13)
  1342  		VFMADD213PS(Mem{Base: RSP}.Offset(544), Y4, Y13)
  1343  		VMULPS(Y0, Y0, Y14)
  1344  		VFMADD132PS(Mem{Base: RSP}.Offset(480), Y13, Y14)
  1345  		VFMADD231PS(Y5, Y6, Y14)
  1346  		VFMADD231PS(Y10, Y0, Y14)
  1347  		VMULPS(Y4, Y6, Y0)
  1348  		VMULPS(Y0, Y14, Y0)
  1349  		VMOVDQU(Y12, Mem{Base: RSP}.Offset(832))
  1350  		VPSRLD(Imm(23), Y12, Y5)
  1351  		VPOR(Mem{Base: RSP}.Offset(448), Y5, Y5)
  1352  		VADDPS(Mem{Base: RSP}.Offset(416), Y5, Y5)
  1353  		VMOVUPS(Mem{Base: RSP}.Offset(-128), Y3)
  1354  		VANDPS(Y3, Y1, Y1)
  1355  		VADDPS(Y1, Y5, Y5)
  1356  		VMULPS(Y5, Y11, Y1)
  1357  		VROUNDPS(Imm(8), Y1, Y1)
  1358  		VFNMADD213PS(Y1, Y11, Y5)
  1359  		VMOVUPS(Mem{Base: RSP}.Offset(384), Y14)
  1360  		VMOVAPS(Y14, Y10)
  1361  		VFMADD213PS(Y4, Y6, Y10)
  1362  		VADDPS(Y0, Y10, Y10)
  1363  		VMOVUPS(Mem{Base: RSP}.Offset(352), Y12)
  1364  		VMULPS(Y4, Y12, Y13)
  1365  		VMULPS(Y6, Y14, Y14)
  1366  		VFMADD231PS(Y13, Y4, Y14)
  1367  		VSUBPS(Y4, Y10, Y4)
  1368  		VFMADD231PS(Y6, Y12, Y4)
  1369  		VMOVUPS(Mem{Base: RSP}.Offset(288), Y15)
  1370  		VMULPS(Y15, Y11, Y6)
  1371  		VMULPS(Y6, Y10, Y6)
  1372  		VROUNDPS(Imm(8), Y6, Y6)
  1373  		VMULPS(Mem{Base: RSP}.Offset(256), Y6, Y13)
  1374  		VFMADD231PS(Y10, Y11, Y13)
  1375  		VFNMADD231PS(Mem{Base: RSP}.Offset(224), Y6, Y13)
  1376  		VSUBPS(Y0, Y14, Y0)
  1377  		VADDPS(Y4, Y0, Y0)
  1378  		VMOVUPS(Mem{Base: RSP}.Offset(192), Y10)
  1379  		VMULPS(Y5, Y10, Y4)
  1380  		VFNMADD231PS(Y0, Y11, Y4)
  1381  		VADDPS(Y4, Y13, Y0)
  1382  		VMULPS(Y0, Y15, Y4)
  1383  		VROUNDPS(Imm(8), Y4, Y4)
  1384  		VFMADD231PS(Y10, Y4, Y0)
  1385  		VMULPS(Y0, Y0, Y5)
  1386  		VMULPS(Y5, Y5, Y10)
  1387  		VMOVUPS(Mem{Base: RSP}.Offset(64), Y13)
  1388  		VFMADD213PS(Mem{Base: RSP}.Offset(96), Y0, Y13)
  1389  		VMOVUPS(Mem{Base: RSP}.Offset(32), Y14)
  1390  		VFMADD213PS(Y12, Y0, Y14)
  1391  		VFMADD231PS(Y13, Y10, Y14)
  1392  		VMOVUPS(Mem{Base: RSP}.Offset(128), Y10)
  1393  		VFMADD213PS(Mem{Base: RSP}.Offset(160), Y0, Y10)
  1394  		VFMADD231PS(Y10, Y5, Y14)
  1395  		VADDPS(Y3, Y0, Y10)
  1396  		VFMADD231PS(Y14, Y5, Y10)
  1397  		VADDPS(Y1, Y6, Y0)
  1398  		VADDPS(Y4, Y0, Y14)
  1399  		VCVTPS2DQ(Y14, Y4)
  1400  		VPSRLD(Imm(23), Y10, Y0)
  1401  		VPAND(data.Offset(164), Y0, Y0)
  1402  		VPADDD(Y4, Y0, Y0)
  1403  		VPCMPGTD(Mem{Base: RSP}, Y0, Y1)
  1404  		VMOVUPS(Mem{Base: RSP}.Offset(-32), Y3)
  1405  		VCMPPS(Imm(1), Y14, Y3, Y5)
  1406  		VPOR(Y5, Y1, Y1)
  1407  		VMOVDQU(Mem{Base: RSP}.Offset(-64), Y3)
  1408  		VPCMPGTD(Y0, Y3, Y0)
  1409  		VCMPPS(Imm(1), Mem{Base: RSP}.Offset(-96), Y14, Y5)
  1410  		VPOR(Y5, Y0, Y0)
  1411  		VPSLLD(Imm(23), Y4, Y4)
  1412  		VPADDD(Y4, Y10, Y6)
  1413  		VPOR(Y1, Y0, Y4)
  1414  		VTESTPS(Y4, Y4)
  1415  		JNE(LabelRef("LBB8_3"))
  1416  		VPCMPEQD(Y15, Y15, Y15)
  1417  		VMOVAPS(Y2, Y12)
  1418  		JMP(LabelRef("LBB8_5"))
  1419  	}
  1420  
  1421  	Label("LBB8_3")
  1422  	{
  1423  		VPANDN(Y6, Y0, Y0)
  1424  		VMOVAPS(Y2, Y12)
  1425  		VBLENDVPS(Y1, Y2, Y0, Y6)
  1426  		VPCMPEQD(Y15, Y15, Y15)
  1427  	}
  1428  
  1429  	Label("LBB8_5")
  1430  	{
  1431  		VPAND(Y8, Y9, Y5)
  1432  		VPCMPEQD(Y7, Y5, Y4)
  1433  		VCMPPS(Imm(1), Y7, Y11, Y13)
  1434  		VCMPPS(Imm(0), Y7, Y11, Y1)
  1435  		VANDPS(Mem{Base: RSP}.Offset(-128), Y1, Y0)
  1436  		VBLENDVPS(Y13, Y12, Y0, Y0)
  1437  		VBLENDVPS(Y4, Y0, Y6, Y6)
  1438  		VMOVMSKPS(Y9, ECX)
  1439  		TESTL(ECX, ECX)
  1440  		JNE(LabelRef("LBB8_7"))
  1441  		VXORPS(X10, X10, X10)
  1442  		JMP(LabelRef("LBB8_8"))
  1443  	}
  1444  
  1445  	Label("LBB8_7")
  1446  	{
  1447  		VROUNDPS(Imm(8), Y11, Y0)
  1448  		VCMPPS(Imm(0), Y0, Y11, Y0)
  1449  		VCVTPS2DQ(Y11, Y10)
  1450  		VPSLLD(Imm(31), Y10, Y10)
  1451  		VPOR(Y6, Y10, Y12)
  1452  		VPXOR(X3, X3, X3)
  1453  		VCMPPS(Imm(0), Y3, Y9, Y7)
  1454  		VBROADCASTSS(data.Offset(120), Y3)
  1455  		VBLENDVPS(Y7, Y6, Y3, Y3)
  1456  		VBLENDVPS(Y0, Y12, Y3, Y3)
  1457  		VMOVAPS(Y2, Y12)
  1458  		VPSRAD(Imm(31), Y9, Y7)
  1459  		VBLENDVPS(Y7, Y3, Y6, Y6)
  1460  		VANDPS(Y0, Y10, Y10)
  1461  	}
  1462  
  1463  	Label("LBB8_8")
  1464  	{
  1465  		VPCMPEQD(Y5, Y8, Y0)
  1466  		VPXOR(Y0, Y15, Y5)
  1467  		VANDPS(Y8, Y11, Y0)
  1468  		VANDPS(Y8, Y14, Y3)
  1469  		VPCMPEQD(Y3, Y8, Y14)
  1470  		VPXOR(Y15, Y14, Y3)
  1471  		VPCMPEQD(Y0, Y8, Y0)
  1472  		VPANDN(Y5, Y0, Y7)
  1473  		VPOR(Y4, Y3, Y3)
  1474  		VPAND(Y7, Y3, Y3)
  1475  		VTESTPS(Y15, Y3)
  1476  		JAE(LabelRef("LBB8_10"))
  1477  		VPXOR(X7, X7, X7)
  1478  		VMOVUPS(Y6, Mem{Base: RDI}.Idx(RAX, 4))
  1479  		ADDQ(Imm(8), RAX)
  1480  		CMPQ(RAX, RDX)
  1481  		JB(LabelRef("LBB8_2"))
  1482  	}
  1483  
  1484  	Label("LBB8_12")
  1485  	{
  1486  		ADDQ(I32(872), RSP)
  1487  		VZEROUPPER()
  1488  		RET()
  1489  	}
  1490  }
  1491  
  1492  func genLog10_Len8x_F32() {
  1493  
  1494  	data := GLOBL("dataLog10Len8xF32", RODATA|NOPTR)
  1495  	DATA(0, U32(0x00800000))
  1496  	DATA(4, U32(2155872255))
  1497  	DATA(8, U32(1056964608))
  1498  	DATA(12, U32(4294967169))
  1499  	DATA(16, U32(0x3f800000))
  1500  	DATA(20, U32(0x3f3504f3))
  1501  	DATA(24, U32(0xbf800000))
  1502  	DATA(28, U32(0x3d9021bb))
  1503  	DATA(32, U32(0xbdebd1b8))
  1504  	DATA(36, U32(0x3def251a))
  1505  	DATA(40, U32(0xbdfe5d4f))
  1506  	DATA(44, U32(0x3e11e9bf))
  1507  	DATA(48, U32(0xbe2aae50))
  1508  	DATA(52, U32(0x3e4cceac))
  1509  	DATA(56, U32(0xbe7ffffc))
  1510  	DATA(60, U32(0x3eaaaaaa))
  1511  	DATA(64, U32(0x3f317218))
  1512  	DATA(68, U32(0xbf000000))
  1513  	DATA(72, U32(0x3ede5bd9))
  1514  	DATA(76, U64(0x0))
  1515  	DATA(84, U64(0x0))
  1516  	DATA(92, U64(0x0))
  1517  	DATA(100, U64(0x0))
  1518  
  1519  	TEXT("Log10_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)")
  1520  	Pragma("noescape")
  1521  	Load(Param("x").Base(), RDI)
  1522  	Load(Param("x").Len(), RSI)
  1523  
  1524  	SUBQ(Imm(136), RSP)
  1525  	TESTQ(RSI, RSI)
  1526  	JE(LabelRef("LBB8_3"))
  1527  	XORL(EAX, EAX)
  1528  	VBROADCASTSS(data.Offset(4), Y0)
  1529  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(96))
  1530  	VBROADCASTSS(data.Offset(8), Y0)
  1531  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(64))
  1532  	VBROADCASTSS(data.Offset(12), Y0)
  1533  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(32))
  1534  	VBROADCASTSS(data.Offset(0), Y0)
  1535  	VMOVUPS(Y0, Mem{Base: RSP})
  1536  	VBROADCASTSS(data.Offset(16), Y0)
  1537  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
  1538  	VBROADCASTSS(data.Offset(20), Y0)
  1539  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
  1540  	VBROADCASTSS(data.Offset(24), Y0)
  1541  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96))
  1542  	VBROADCASTSS(data.Offset(28), Y0)
  1543  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128))
  1544  	VBROADCASTSS(data.Offset(32), Y9)
  1545  	VBROADCASTSS(data.Offset(36), Y10)
  1546  	VBROADCASTSS(data.Offset(40), Y11)
  1547  	VBROADCASTSS(data.Offset(44), Y12)
  1548  	VBROADCASTSS(data.Offset(48), Y13)
  1549  	VBROADCASTSS(data.Offset(52), Y14)
  1550  	VBROADCASTSS(data.Offset(56), Y15)
  1551  	VBROADCASTSS(data.Offset(60), Y0)
  1552  	VBROADCASTSS(data.Offset(64), Y1)
  1553  	VBROADCASTSS(data.Offset(68), Y2)
  1554  	VBROADCASTSS(data.Offset(72), Y3)
  1555  
  1556  	Label("LBB8_2")
  1557  	{
  1558  		VMOVUPS(Mem{Base: RDI}.Idx(RAX, 4), Y4)
  1559  		VMAXPS(Mem{Base: RSP}, Y4, Y5)
  1560  		VPSRLD(Imm(23), Y5, Y6)
  1561  		VPADDD(Mem{Base: RSP}.Offset(32), Y6, Y6)
  1562  		VANDPS(Mem{Base: RSP}.Offset(96), Y5, Y5)
  1563  		VORPS(Mem{Base: RSP}.Offset(64), Y5, Y5)
  1564  		VCVTDQ2PS(Y6, Y6)
  1565  		VADDPS(Mem{Base: RSP}.Offset(-32), Y6, Y7)
  1566  		VCMPPS(Imm(1), Mem{Base: RSP}.Offset(-64), Y5, Y8)
  1567  		VBLENDVPS(Y8, Y6, Y7, Y6)
  1568  		VANDPS(Y5, Y8, Y7)
  1569  		VADDPS(Mem{Base: RSP}.Offset(-96), Y5, Y5)
  1570  		VADDPS(Y7, Y5, Y5)
  1571  		VMOVUPS(Mem{Base: RSP}.Offset(-128), Y7)
  1572  		VFMADD213PS(Y9, Y5, Y7)
  1573  		VFMADD213PS(Y10, Y5, Y7)
  1574  		VFMADD213PS(Y11, Y5, Y7)
  1575  		VFMADD213PS(Y12, Y5, Y7)
  1576  		VFMADD213PS(Y13, Y5, Y7)
  1577  		VFMADD213PS(Y14, Y5, Y7)
  1578  		VFMADD213PS(Y15, Y5, Y7)
  1579  		VFMADD213PS(Y0, Y5, Y7)
  1580  		VFMADD213PS(Y2, Y5, Y7)
  1581  		VFMADD213PS(Y5, Y1, Y6)
  1582  		VMULPS(Y5, Y5, Y5)
  1583  		VFMADD231PS(Y7, Y5, Y6)
  1584  		VCMPPS(Imm(2), data.Offset(76), Y4, Y4)
  1585  		VMULPS(Y3, Y6, Y5)
  1586  		VORPS(Y5, Y4, Y4)
  1587  		VMOVUPS(Y4, Mem{Base: RDI}.Idx(RAX, 4))
  1588  		ADDQ(Imm(8), RAX)
  1589  		CMPQ(RAX, RSI)
  1590  		JB(LabelRef("LBB8_2"))
  1591  	}
  1592  
  1593  	Label("LBB8_3")
  1594  	{
  1595  		ADDQ(Imm(136), RSP)
  1596  		VZEROUPPER()
  1597  		RET()
  1598  	}
  1599  }
  1600  
  1601  func genLog2_Len8x_F32() {
  1602  
  1603  	data := GLOBL("dataLog2Len8xF32", RODATA|NOPTR)
  1604  	DATA(0, U32(0x00800000))
  1605  	DATA(4, U32(2155872255))
  1606  	DATA(8, U32(1056964608))
  1607  	DATA(12, U32(4294967169))
  1608  	DATA(16, U32(0x3f800000))
  1609  	DATA(20, U32(0x3f3504f3))
  1610  	DATA(24, U32(0xbf800000))
  1611  	DATA(28, U32(0x3d9021bb))
  1612  	DATA(32, U32(0xbdebd1b8))
  1613  	DATA(36, U32(0x3def251a))
  1614  	DATA(40, U32(0xbdfe5d4f))
  1615  	DATA(44, U32(0x3e11e9bf))
  1616  	DATA(48, U32(0xbe2aae50))
  1617  	DATA(52, U32(0x3e4cceac))
  1618  	DATA(56, U32(0xbe7ffffc))
  1619  	DATA(60, U32(0x3eaaaaaa))
  1620  	DATA(64, U32(0x3f317218))
  1621  	DATA(68, U32(0xbf000000))
  1622  	DATA(72, U32(0x3fb8aa3b))
  1623  	DATA(76, U64(0x0))
  1624  	DATA(84, U64(0x0))
  1625  	DATA(92, U64(0x0))
  1626  	DATA(100, U64(0x0))
  1627  
  1628  	TEXT("Log2_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)")
  1629  	Pragma("noescape")
  1630  	Load(Param("x").Base(), RDI)
  1631  	Load(Param("x").Len(), RSI)
  1632  
  1633  	SUBQ(Imm(136), RSP)
  1634  	TESTQ(RSI, RSI)
  1635  	JE(LabelRef("LBB9_3"))
  1636  	XORL(EAX, EAX)
  1637  	VBROADCASTSS(data.Offset(4), Y0)
  1638  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(96))
  1639  	VBROADCASTSS(data.Offset(8), Y0)
  1640  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(64))
  1641  	VBROADCASTSS(data.Offset(12), Y0)
  1642  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(32))
  1643  	VBROADCASTSS(data.Offset(0), Y0)
  1644  	VMOVUPS(Y0, Mem{Base: RSP})
  1645  	VBROADCASTSS(data.Offset(16), Y0)
  1646  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
  1647  	VBROADCASTSS(data.Offset(20), Y0)
  1648  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
  1649  	VBROADCASTSS(data.Offset(24), Y0)
  1650  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96))
  1651  	VBROADCASTSS(data.Offset(28), Y0)
  1652  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128))
  1653  	VBROADCASTSS(data.Offset(32), Y9)
  1654  	VBROADCASTSS(data.Offset(36), Y10)
  1655  	VBROADCASTSS(data.Offset(40), Y11)
  1656  	VBROADCASTSS(data.Offset(44), Y12)
  1657  	VBROADCASTSS(data.Offset(48), Y13)
  1658  	VBROADCASTSS(data.Offset(52), Y14)
  1659  	VBROADCASTSS(data.Offset(56), Y15)
  1660  	VBROADCASTSS(data.Offset(60), Y0)
  1661  	VBROADCASTSS(data.Offset(64), Y1)
  1662  	VBROADCASTSS(data.Offset(68), Y2)
  1663  	VBROADCASTSS(data.Offset(72), Y3)
  1664  
  1665  	Label("LBB9_2")
  1666  	{
  1667  		VMOVUPS(Mem{Base: RDI}.Idx(RAX, 4), Y4)
  1668  		VMAXPS(Mem{Base: RSP}, Y4, Y5)
  1669  		VPSRLD(Imm(23), Y5, Y6)
  1670  		VPADDD(Mem{Base: RSP}.Offset(32), Y6, Y6)
  1671  		VANDPS(Mem{Base: RSP}.Offset(96), Y5, Y5)
  1672  		VORPS(Mem{Base: RSP}.Offset(64), Y5, Y5)
  1673  		VCVTDQ2PS(Y6, Y6)
  1674  		VADDPS(Mem{Base: RSP}.Offset(-32), Y6, Y7)
  1675  		VCMPPS(Imm(1), Mem{Base: RSP}.Offset(-64), Y5, Y8)
  1676  		VBLENDVPS(Y8, Y6, Y7, Y6)
  1677  		VANDPS(Y5, Y8, Y7)
  1678  		VADDPS(Mem{Base: RSP}.Offset(-96), Y5, Y5)
  1679  		VADDPS(Y7, Y5, Y5)
  1680  		VMOVUPS(Mem{Base: RSP}.Offset(-128), Y7)
  1681  		VFMADD213PS(Y9, Y5, Y7)
  1682  		VFMADD213PS(Y10, Y5, Y7)
  1683  		VFMADD213PS(Y11, Y5, Y7)
  1684  		VFMADD213PS(Y12, Y5, Y7)
  1685  		VFMADD213PS(Y13, Y5, Y7)
  1686  		VFMADD213PS(Y14, Y5, Y7)
  1687  		VFMADD213PS(Y15, Y5, Y7)
  1688  		VFMADD213PS(Y0, Y5, Y7)
  1689  		VFMADD213PS(Y2, Y5, Y7)
  1690  		VFMADD213PS(Y5, Y1, Y6)
  1691  		VMULPS(Y5, Y5, Y5)
  1692  		VFMADD231PS(Y7, Y5, Y6)
  1693  		VCMPPS(Imm(2), data.Offset(76), Y4, Y4)
  1694  		VMULPS(Y3, Y6, Y5)
  1695  		VORPS(Y5, Y4, Y4)
  1696  		VMOVUPS(Y4, Mem{Base: RDI}.Idx(RAX, 4))
  1697  		ADDQ(Imm(8), RAX)
  1698  		CMPQ(RAX, RSI)
  1699  		JB(LabelRef("LBB9_2"))
  1700  	}
  1701  
  1702  	Label("LBB9_3")
  1703  	{
  1704  		ADDQ(Imm(136), RSP)
  1705  		VZEROUPPER()
  1706  		RET()
  1707  	}
  1708  }
  1709  
  1710  func genLog_Len8x_F32() {
  1711  
  1712  	data := GLOBL("dataLogLen8xF32", RODATA|NOPTR)
  1713  	DATA(0, U32(0x00800000))
  1714  	DATA(4, U32(2155872255))
  1715  	DATA(8, U32(1056964608))
  1716  	DATA(12, U32(4294967169))
  1717  	DATA(16, U32(0x3f800000))
  1718  	DATA(20, U32(0x3f3504f3))
  1719  	DATA(24, U32(0xbf800000))
  1720  	DATA(28, U32(0x3d9021bb))
  1721  	DATA(32, U32(0xbdebd1b8))
  1722  	DATA(36, U32(0x3def251a))
  1723  	DATA(40, U32(0xbdfe5d4f))
  1724  	DATA(44, U32(0x3e11e9bf))
  1725  	DATA(48, U32(0xbe2aae50))
  1726  	DATA(52, U32(0x3e4cceac))
  1727  	DATA(56, U32(0xbe7ffffc))
  1728  	DATA(60, U32(0x3eaaaaaa))
  1729  	DATA(64, U32(0x3f317218))
  1730  	DATA(68, U32(0xbf000000))
  1731  	DATA(72, U64(0x0))
  1732  	DATA(80, U64(0x0))
  1733  	DATA(88, U64(0x0))
  1734  	DATA(96, U64(0x0))
  1735  
  1736  	TEXT("Log_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)")
  1737  	Pragma("noescape")
  1738  	Load(Param("x").Base(), RDI)
  1739  	Load(Param("x").Len(), RSI)
  1740  
  1741  	SUBQ(Imm(104), RSP)
  1742  	TESTQ(RSI, RSI)
  1743  	JE(LabelRef("LBB10_3"))
  1744  	XORL(EAX, EAX)
  1745  	VBROADCASTSS(data.Offset(0), Y0)
  1746  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(64))
  1747  	VBROADCASTSS(data.Offset(4), Y0)
  1748  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(32))
  1749  	VBROADCASTSS(data.Offset(8), Y0)
  1750  	VMOVUPS(Y0, Mem{Base: RSP})
  1751  	VBROADCASTSS(data.Offset(12), Y0)
  1752  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
  1753  	VBROADCASTSS(data.Offset(16), Y0)
  1754  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
  1755  	VBROADCASTSS(data.Offset(20), Y0)
  1756  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96))
  1757  	VBROADCASTSS(data.Offset(24), Y0)
  1758  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128))
  1759  	VBROADCASTSS(data.Offset(28), Y8)
  1760  	VBROADCASTSS(data.Offset(32), Y9)
  1761  	VBROADCASTSS(data.Offset(36), Y10)
  1762  	VBROADCASTSS(data.Offset(40), Y11)
  1763  	VBROADCASTSS(data.Offset(44), Y12)
  1764  	VBROADCASTSS(data.Offset(48), Y13)
  1765  	VBROADCASTSS(data.Offset(52), Y14)
  1766  	VBROADCASTSS(data.Offset(56), Y15)
  1767  	VBROADCASTSS(data.Offset(60), Y0)
  1768  	VBROADCASTSS(data.Offset(64), Y1)
  1769  	VBROADCASTSS(data.Offset(68), Y2)
  1770  
  1771  	Label("LBB10_2")
  1772  	{
  1773  		VMOVUPS(Mem{Base: RDI}.Idx(RAX, 4), Y3)
  1774  		VMAXPS(Mem{Base: RSP}.Offset(64), Y3, Y4)
  1775  		VPSRLD(Imm(23), Y4, Y5)
  1776  		VPADDD(Mem{Base: RSP}.Offset(-32), Y5, Y5)
  1777  		VANDPS(Mem{Base: RSP}.Offset(32), Y4, Y4)
  1778  		VORPS(Mem{Base: RSP}, Y4, Y4)
  1779  		VCVTDQ2PS(Y5, Y5)
  1780  		VADDPS(Mem{Base: RSP}.Offset(-64), Y5, Y6)
  1781  		VCMPPS(Imm(1), Mem{Base: RSP}.Offset(-96), Y4, Y7)
  1782  		VBLENDVPS(Y7, Y5, Y6, Y5)
  1783  		VANDPS(Y4, Y7, Y6)
  1784  		VADDPS(Mem{Base: RSP}.Offset(-128), Y4, Y4)
  1785  		VADDPS(Y6, Y4, Y4)
  1786  		VMOVAPS(Y8, Y6)
  1787  		VFMADD213PS(Y9, Y4, Y6)
  1788  		VFMADD213PS(Y10, Y4, Y6)
  1789  		VFMADD213PS(Y11, Y4, Y6)
  1790  		VFMADD213PS(Y12, Y4, Y6)
  1791  		VFMADD213PS(Y13, Y4, Y6)
  1792  		VFMADD213PS(Y14, Y4, Y6)
  1793  		VFMADD213PS(Y15, Y4, Y6)
  1794  		VFMADD213PS(Y0, Y4, Y6)
  1795  		VFMADD213PS(Y2, Y4, Y6)
  1796  		VFMADD213PS(Y4, Y1, Y5)
  1797  		VMULPS(Y4, Y4, Y4)
  1798  		VFMADD231PS(Y6, Y4, Y5)
  1799  		VCMPPS(Imm(2), data.Offset(72), Y3, Y3)
  1800  		VORPS(Y5, Y3, Y3)
  1801  		VMOVUPS(Y3, Mem{Base: RDI}.Idx(RAX, 4))
  1802  		ADDQ(Imm(8), RAX)
  1803  		CMPQ(RAX, RSI)
  1804  		JB(LabelRef("LBB10_2"))
  1805  	}
  1806  
  1807  	Label("LBB10_3")
  1808  	{
  1809  		ADDQ(Imm(104), RSP)
  1810  		VZEROUPPER()
  1811  		RET()
  1812  	}
  1813  }
  1814  
  1815  func genExp_Len8x_F32() {
  1816  
        	// genExp_Len8x_F32 emits, through avo, the read-only table
        	// dataExpLen8xF32 and the routine Exp_Len8x_AVX2_F32 with the Go
        	// signature func(x []float32). The routine walks x eight float32 lanes
        	// (one YMM register) per iteration and stores each result back over its
        	// input. Judging by the name and the constants it evaluates exp(x) per
        	// element — NOTE(review): confirm against the Go-side callers.
        	//
        	// There is no scalar tail: the loop keeps running while the index is
        	// below len(x) and always loads/stores 8 elements, so presumably callers
        	// pass a length that is a multiple of 8 (the "Len8x" in the name) —
        	// verify.
        	//
        	// avo operands are written in Go assembler order (destination last).

        	// Constant table: one 32-bit pattern per 4-byte slot, broadcast to all
        	// lanes below. The decimal values in the trailing comments are decoded
        	// from the bit patterns.
  1817  	data := GLOBL("dataExpLen8xF32", RODATA|NOPTR)
  1818  	DATA(0, U32(0x42b17218)) // ~88.72, upper input bound
  1819  	DATA(4, U32(0xc2ce8ed0)) // ~-103.28, lower input bound
  1820  	DATA(8, U32(0x3f000000)) // 0.5
  1821  	DATA(12, U32(0x3fb8aa3b)) // ~1.442695 (looks like log2(e))
  1822  	DATA(16, U32(0xbf318000)) // -0.693359375
  1823  	DATA(20, U32(0x395e8083)) // ~2.1219e-4
  1824  	DATA(24, U32(1065353216)) // 0x3f800000, the bit pattern of 1.0
  1825  	DATA(28, U32(0x3ab743ce))
  1826  	DATA(32, U32(0x39506967))
  1827  	DATA(36, U32(0x3c088908))
  1828  	DATA(40, U32(0x3d2aa9c1))
  1829  	DATA(44, U32(0x3e2aaaaa))
  1830  	DATA(48, U32(0x7f7fffff)) // largest finite float32
  1831  
  1832  	TEXT("Exp_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)")
  1833  	Pragma("noescape")
        	// RDI = &x[0], RSI = len(x).
  1834  	Load(Param("x").Base(), RDI)
  1835  	Load(Param("x").Len(), RSI)
  1836  
        	// Empty slice: go straight to the epilogue.
  1837  	TESTQ(RSI, RSI)
  1838  	JE(LabelRef("LBB11_3"))
        	// RAX is the element index.
  1839  	XORL(EAX, EAX)
        	// The two bounds are spilled to memory below RSP (no stack adjustment
        	// is made in this routine); every other constant stays in a YMM
        	// register for the whole loop.
        	// NOTE(review): confirm that writing below RSP is safe for this target.
  1840  	VBROADCASTSS(data.Offset(0), Y0)
  1841  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-40))
  1842  	VBROADCASTSS(data.Offset(4), Y0)
  1843  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-72))
  1844  	VBROADCASTSS(data.Offset(8), Y2)
  1845  	VBROADCASTSS(data.Offset(12), Y3)
  1846  	VBROADCASTSS(data.Offset(16), Y4)
  1847  	VBROADCASTSS(data.Offset(20), Y5)
  1848  	VPBROADCASTD(data.Offset(24), Y6)
  1849  	VBROADCASTSS(data.Offset(28), Y7)
  1850  	VBROADCASTSS(data.Offset(32), Y1)
  1851  	VBROADCASTSS(data.Offset(36), Y9)
  1852  	VBROADCASTSS(data.Offset(40), Y10)
  1853  	VBROADCASTSS(data.Offset(44), Y11)
  1854  	VBROADCASTSS(data.Offset(48), Y12)
  1855  
        	// Main loop: one iteration handles x[RAX : RAX+8].
  1856  	Label("LBB11_2")
  1857  	{
        		// Y13 = the eight inputs.
  1858  		VMOVUPS(Mem{Base: RDI}.Idx(RAX, 4), Y13)
        		// Y14 = x*Y3 + Y2, then rounded with VROUNDPS mode 1 (toward
        		// negative infinity); this is the integer exponent n as a float.
  1859  		VMOVAPS(Y3, Y14)
  1860  		VFMADD213PS(Y2, Y13, Y14)
  1861  		VROUNDPS(Imm(1), Y14, Y14)
        		// Y15 = x + n*Y4 + n*Y5: the reduced argument, subtracting n times
        		// a constant that is split across the two table entries.
  1862  		VMOVAPS(Y4, Y15)
  1863  		VFMADD213PS(Y13, Y14, Y15)
  1864  		VFMADD231PS(Y5, Y14, Y15)
        		// Y0 = r*r; Y8 = Horner evaluation in r over the coefficients held
        		// in Y1, Y7, Y9, Y10, Y11, Y2, then Y8 = Y8*r*r + r.
  1865  		VMULPS(Y15, Y15, Y0)
  1866  		VMOVAPS(Y1, Y8)
  1867  		VFMADD213PS(Y7, Y15, Y8)
  1868  		VFMADD213PS(Y9, Y15, Y8)
  1869  		VFMADD213PS(Y10, Y15, Y8)
  1870  		VFMADD213PS(Y11, Y15, Y8)
  1871  		VFMADD213PS(Y2, Y15, Y8)
  1872  		VFMADD213PS(Y15, Y0, Y8)
        		// Build the scale factor in Y0: n as int32, shifted into the
        		// exponent field (<<23) and added to the bits of 1.0.
  1873  		VCVTTPS2DQ(Y14, Y0)
  1874  		VPSLLD(Imm(23), Y0, Y0)
  1875  		VPADDD(Y6, Y0, Y0)
        		// Y8 = Y8*scale + scale.
  1876  		VFMADD213PS(Y0, Y0, Y8)
        		// Range handling — as read from the operand order: lanes whose
        		// input exceeds the upper bound take Y12 (largest finite float32)
        		// instead of the computed value...
  1877  		VMOVUPS(Mem{Base: RSP}.Offset(-40), Y0)
  1878  		VCMPPS(Imm(1), Y13, Y0, Y0)
  1879  		VBLENDVPS(Y0, Y12, Y8, Y0)
        		// ...and lanes whose input is below the lower bound are zeroed by
        		// ANDing with the comparison mask.
  1880  		VMOVUPS(Mem{Base: RSP}.Offset(-72), Y8)
  1881  		VCMPPS(Imm(2), Y13, Y8, Y8)
  1882  		VANDPS(Y0, Y8, Y0)
        		// Store in place and advance by 8 elements while RAX < len(x).
  1883  		VMOVUPS(Y0, Mem{Base: RDI}.Idx(RAX, 4))
  1884  		ADDQ(Imm(8), RAX)
  1885  		CMPQ(RAX, RSI)
  1886  		JB(LabelRef("LBB11_2"))
  1887  	}
  1888  
        	// Epilogue: clear the upper YMM halves and return.
  1889  	Label("LBB11_3")
  1890  	{
  1891  		VZEROUPPER()
  1892  		RET()
  1893  	}
  1894  }
  1895  
  1896  func genSin_F32() {
  1897  
        	// genSin_F32 emits, through avo, the read-only table dataSinF32 and the
        	// routine Sin_AVX2_F32 with the Go signature func(x []float32). The
        	// routine overwrites x in place: a YMM loop handles the first
        	// len(x)&^7 elements eight at a time, and a scalar loop handles the
        	// remainder one element at a time. Judging by the name it computes
        	// sin(x) per element — NOTE(review): confirm against callers.
        	//
        	// avo operands are written in Go assembler order (destination last).

        	// Constant table. Offsets 0..59 are 32-bit slots; offsets 60..91 hold
        	// 32 bytes of all-ones (int32 -1 in every lane) and offsets 92..123
        	// hold 32 zero bytes, both used directly as 256-bit memory operands.
        	// The decimal values in the trailing comments are decoded from the bit
        	// patterns.
  1898  	data := GLOBL("dataSinF32", RODATA|NOPTR)
  1899  	DATA(0, U32(2147483647)) // 0x7fffffff: clears the sign bit
  1900  	DATA(4, U32(0x3fa2f983)) // ~1.27324 (looks like 4/pi)
  1901  	DATA(8, U32(4294967294)) // 0xfffffffe: clears the lowest bit
  1902  	DATA(12, U32(2))
  1903  	DATA(16, U32(0xbf490fdb)) // ~-0.785398 (looks like -pi/4)
  1904  	DATA(20, U32(2147483648)) // 0x80000000: the sign bit
  1905  	DATA(24, U32(0x37ccf5ce))
  1906  	DATA(28, U32(0xbab6061a))
  1907  	DATA(32, U32(0x3d2aaaa5))
  1908  	DATA(36, U32(0xbf000000)) // -0.5
  1909  	DATA(40, U32(0x3f800000)) // 1.0
  1910  	DATA(44, U32(0xb94ca1f9))
  1911  	DATA(48, U32(0x3c08839e))
  1912  	DATA(52, U32(0xbe2aaaa3))
  1913  	DATA(56, U32(0x4b7fffff)) // 16777215.0, scalar-path magnitude limit
  1914  	DATA(60, U64(0xffffffffffffffff))
  1915  	DATA(68, U64(0xffffffffffffffff))
  1916  	DATA(76, U64(0xffffffffffffffff))
  1917  	DATA(84, U64(0xffffffffffffffff))
  1918  	DATA(92, U64(0x0))
  1919  	DATA(100, U64(0x0))
  1920  	DATA(108, U64(0x0))
  1921  	DATA(116, U64(0x0))
  1922  
        	// NOTE(review): this TEXT is declared with flags 0, whereas the other
        	// generators visible in this file pass NOSPLIT — confirm that the
        	// difference is intended.
  1923  	TEXT("Sin_AVX2_F32", 0, "func(x []float32)")
  1924  	Pragma("noescape")
        	// RDI = &x[0], RSI = len(x).
  1925  	Load(Param("x").Base(), RDI)
  1926  	Load(Param("x").Len(), RSI)
  1927  
        	// RAX is pushed here and popped again at LBB12_14; presumably this is
        	// a stack adjustment carried over from compiler output — verify.
  1928  	PUSHQ(RAX)
        	// RAX = len(x) rounded down to a multiple of 8 = number of elements
        	// the vector loop covers. ANDQ sets ZF, so JE skips the vector loop
        	// when fewer than 8 elements are present.
  1929  	MOVQ(RSI, RAX)
  1930  	ANDQ(I32(-8), RAX)
  1931  	JE(LabelRef("LBB12_3"))
        	// RCX is the vector-loop index. Four of the broadcast constants are
        	// spilled below RSP; the rest stay in YMM registers.
        	// NOTE(review): confirm that writing below RSP is safe for this target.
  1932  	XORL(ECX, ECX)
  1933  	VBROADCASTSS(data.Offset(0), Y0)
  1934  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
  1935  	VBROADCASTSS(data.Offset(4), Y0)
  1936  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
  1937  	VBROADCASTSS(data.Offset(8), Y0)
  1938  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96))
  1939  	VPBROADCASTD(data.Offset(12), Y4)
  1940  	VPBROADCASTD(data.Offset(16), Y0)
  1941  	VMOVDQU(Y0, Mem{Base: RSP}.Offset(-128))
  1942  	VPBROADCASTD(data.Offset(20), Y7)
  1943  	VBROADCASTSS(data.Offset(24), Y8)
  1944  	VBROADCASTSS(data.Offset(28), Y9)
  1945  	VBROADCASTSS(data.Offset(32), Y10)
  1946  	VBROADCASTSS(data.Offset(36), Y11)
  1947  	VBROADCASTSS(data.Offset(40), Y12)
  1948  	VBROADCASTSS(data.Offset(44), Y3)
  1949  	VBROADCASTSS(data.Offset(48), Y14)
  1950  	VBROADCASTSS(data.Offset(52), Y15)
  1951  
        	// Vector loop: one iteration handles x[RCX : RCX+8].
  1952  	Label("LBB12_2")
  1953  	{
        		// Y2 = inputs, Y5 = inputs with the sign bit cleared.
  1954  		VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4), Y2)
  1955  		VANDPS(Mem{Base: RSP}.Offset(-32), Y2, Y5)
        		// Y0 = int32(trunc(|x| * data[4])); subtracting the all-ones table
        		// adds 1 to every lane. Y1 = that value with its lowest bit
        		// cleared, converted back to float.
  1956  		VMULPS(Mem{Base: RSP}.Offset(-64), Y5, Y0)
  1957  		VCVTTPS2DQ(Y0, Y0)
  1958  		VPSUBD(data.Offset(60), Y0, Y0)
  1959  		VPAND(Mem{Base: RSP}.Offset(-96), Y0, Y1)
  1960  		VCVTDQ2PS(Y1, Y1)
        		// Y1 = Y1*data[16] + |x| (the reduced argument); Y5 = its square.
  1961  		VFMADD132PS(Mem{Base: RSP}.Offset(-128), Y5, Y1)
  1962  		VMULPS(Y1, Y1, Y5)
        		// First polynomial (coefficients Y3, Y14, Y15), finished as
        		// Y6 = poly * r^3 + r.
  1963  		VMOVAPS(Y3, Y13)
  1964  		VFMADD213PS(Y14, Y5, Y13)
  1965  		VFMADD213PS(Y15, Y5, Y13)
  1966  		VMULPS(Y1, Y5, Y6)
  1967  		VFMADD213PS(Y1, Y13, Y6)
        		// Y1 = (integer << 29) XOR original input; only its sign bit is
        		// kept further down. Y0 = integer & 2.
  1968  		VPSLLD(Imm(29), Y0, Y1)
  1969  		VPAND(Y4, Y0, Y0)
  1970  		VPXOR(Y2, Y1, Y1)
        		// Second polynomial in r^2 (coefficients Y8..Y12), left in Y2.
  1971  		VMOVAPS(Y8, Y2)
  1972  		VFMADD213PS(Y9, Y5, Y2)
  1973  		VFMADD213PS(Y10, Y5, Y2)
  1974  		VFMADD213PS(Y11, Y5, Y2)
  1975  		VFMADD213PS(Y12, Y5, Y2)
        		// Per lane keep the second polynomial where (integer & 2) == 2 and
        		// the first where it equals 0; adding the two masked values merges
        		// them.
  1976  		VPCMPEQD(Y4, Y0, Y5)
  1977  		VANDPS(Y5, Y2, Y2)
  1978  		VPCMPEQD(data.Offset(92), Y0, Y0)
  1979  		VANDPS(Y0, Y6, Y0)
  1980  		VADDPS(Y2, Y0, Y0)
        		// Apply the sign bit computed above and store in place.
  1981  		VPAND(Y7, Y1, Y1)
  1982  		VPXOR(Y0, Y1, Y0)
  1983  		VMOVDQU(Y0, Mem{Base: RDI}.Idx(RCX, 4))
        		// Advance by 8 while RCX < RAX.
  1984  		ADDQ(Imm(8), RCX)
  1985  		CMPQ(RCX, RAX)
  1986  		JB(LabelRef("LBB12_2"))
  1987  	}
  1988  
        	// Scalar tail setup: nothing to do when RAX already equals len(x);
        	// otherwise load the scalar constants into XMM registers and enter the
        	// per-element loop with RAX as the index.
  1989  	Label("LBB12_3")
  1990  	{
  1991  		CMPQ(RAX, RSI)
  1992  		JAE(LabelRef("LBB12_14"))
        		// X0 = sign-bit mask, X1 = 0.0, X2 = magnitude limit (data[56]).
  1993  		VBROADCASTSS(data.Offset(20), X0)
  1994  		VPXOR(X1, X1, X1)
  1995  		VMOVSS(data.Offset(56), X2)
  1996  		VMOVSS(data.Offset(40), X9)
  1997  		VMOVSS(data.Offset(16), X10)
  1998  		VMOVSS(data.Offset(24), X12)
  1999  		VMOVSS(data.Offset(28), X11)
  2000  		VMOVSS(data.Offset(32), X13)
  2001  		VMOVSS(data.Offset(36), X14)
  2002  		VMOVSS(data.Offset(44), X8)
  2003  		VMOVSS(data.Offset(48), X15)
  2004  		VMOVSS(data.Offset(52), X6)
  2005  		JMP(LabelRef("LBB12_5"))
  2006  	}
  2007  
        	// Scalar loop increment: next element, or exit when RAX >= len(x).
  2008  	Label("LBB12_13")
  2009  	{
  2010  		ADDQ(Imm(1), RAX)
  2011  		CMPQ(RAX, RSI)
  2012  		JAE(LabelRef("LBB12_14"))
  2013  	}
  2014  
        	// Scalar loop body, part 1: load x[RAX] and form its magnitude.
  2015  	Label("LBB12_5")
  2016  	{
  2017  		VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X4)
        		// X3 = |x|: the sign-flipped copy is selected by the (x < 0) mask.
  2018  		VXORPS(X0, X4, X3)
  2019  		VCMPSS(Imm(1), X1, X4, X5)
  2020  		VBLENDVPS(X5, X3, X4, X3)
        		// Elements whose magnitude compares above the limit are skipped;
        		// no store happens on that path, so they keep their value.
  2021  		VUCOMISS(X2, X3)
  2022  		JA(LabelRef("LBB12_13"))
        		// R8B = carry flag of comparing x with 0.0 (appears to record
        		// "x is negative" for the final sign decision).
  2023  		VUCOMISS(X1, X4)
  2024  		SETCS(R8B)
        		// X4 = |x| * data[4]; EDX = its truncated integer value; X4 is then
        		// rounded with VROUNDSS mode 11.
  2025  		VMULSS(data.Offset(4), X3, X4)
  2026  		VCVTTSS2SI(X4, EDX)
  2027  		VROUNDSS(Imm(11), X4, X4, X4)
        		// If the integer is odd, add 1.0 (X9) to the float copy; ECX keeps
        		// the parity bit for the integer adjustment in LBB12_8.
  2028  		MOVL(EDX, ECX)
  2029  		ANDL(Imm(1), ECX)
  2030  		JE(LabelRef("LBB12_8"))
  2031  		VADDSS(X4, X9, X4)
  2032  	}
  2033  
        	// Scalar loop body, part 2: octant bookkeeping and both polynomials.
  2034  	Label("LBB12_8")
  2035  	{
        		// ECX = (integer + parity) & 7. R9B = (ECX >= 4). EDX = ECX when
        		// ECX < 4, otherwise ECX - 4.
  2036  		ADDL(EDX, ECX)
  2037  		ANDL(Imm(7), ECX)
  2038  		LEAL(Mem{Base: RCX}.Offset(-4), EDX)
  2039  		CMPL(ECX, Imm(4))
  2040  		SETCC(R9B)
  2041  		CMOVLLT(ECX, EDX)
        		// X3 = |x| + X4*X10 (reduced argument); X4 = its square.
  2042  		VFMADD231SS(X10, X4, X3)
  2043  		VMULSS(X3, X3, X4)
        		// X7: polynomial with coefficients X12, X11, X13, X14.
  2044  		VMOVAPS(X12, X7)
  2045  		VFMADD213SS(X11, X4, X7)
  2046  		VFMADD213SS(X13, X4, X7)
  2047  		VFMADD213SS(X14, X4, X7)
        		// X5: polynomial with coefficients X8, X15, X6.
  2048  		VMOVAPS(X8, X5)
  2049  		VFMADD213SS(X15, X4, X5)
  2050  		VFMADD213SS(X6, X4, X5)
        		// EDX-1 < 2 (unsigned), i.e. EDX is 1 or 2, selects the X7 form in
        		// LBB12_9; otherwise the X5 form is finished here as
        		// X5*r^3 + r and stored.
  2051  		ADDL(I32(-1), EDX)
  2052  		CMPL(EDX, Imm(2))
  2053  		JB(LabelRef("LBB12_9"))
  2054  		VMULSS(X3, X4, X4)
  2055  		VFMADD213SS(X3, X4, X5)
  2056  		VMOVAPS(X5, X4)
  2057  		VMOVSS(X4, Mem{Base: RDI}.Idx(RAX, 4))
        		// Flags equal: the stored value stands. Otherwise negate it.
  2058  		CMPB(R8B, R9B)
  2059  		JE(LabelRef("LBB12_13"))
  2060  		JMP(LabelRef("LBB12_12"))
  2061  	}
  2062  
        	// Alternate form: result = X7*r^2 + 1.0, stored, then the same sign
        	// test as above.
  2063  	Label("LBB12_9")
  2064  	{
  2065  		VFMADD213SS(X9, X7, X4)
  2066  		VMOVSS(X4, Mem{Base: RDI}.Idx(RAX, 4))
  2067  		CMPB(R8B, R9B)
  2068  		JE(LabelRef("LBB12_13"))
  2069  	}
  2070  
        	// Sign fix-up: overwrite the stored element with its negation.
  2071  	Label("LBB12_12")
  2072  	{
  2073  		VXORPS(X0, X4, X3)
  2074  		VMOVSS(X3, Mem{Base: RDI}.Idx(RAX, 4))
  2075  		JMP(LabelRef("LBB12_13"))
  2076  	}
  2077  
        	// Epilogue: undo the PUSHQ, clear the upper YMM halves and return.
  2078  	Label("LBB12_14")
  2079  	{
  2080  		POPQ(RAX)
  2081  		VZEROUPPER()
  2082  		RET()
  2083  	}
  2084  }
  2085  
  2086  func genCos_F32() {
  2087  
        	// genCos_F32 emits, through avo, the read-only table dataCosF32 and the
        	// routine Cos_AVX2_F32 with the Go signature func(x []float32). The
        	// routine overwrites x in place: a YMM loop handles the first
        	// len(x)&^7 elements eight at a time, and a scalar loop handles the
        	// remainder one element at a time. Judging by the name it computes
        	// cos(x) per element — NOTE(review): confirm against callers.
        	//
        	// avo operands are written in Go assembler order (destination last).

        	// Constant table. Offsets 0..63 are 32-bit slots; offsets 64..95 hold
        	// 32 bytes of all-ones (int32 -1 in every lane) and offsets 96..127
        	// hold 32 zero bytes, both used directly as 256-bit memory operands.
        	// The decimal values in the trailing comments are decoded from the bit
        	// patterns.
  2088  	data := GLOBL("dataCosF32", RODATA|NOPTR)
  2089  	DATA(0, U32(2147483647)) // 0x7fffffff: clears the sign bit
  2090  	DATA(4, U32(0x3fa2f983)) // ~1.27324 (looks like 4/pi)
  2091  	DATA(8, U32(4294967294)) // 0xfffffffe: clears the lowest bit
  2092  	DATA(12, U32(2))
  2093  	DATA(16, U32(0xbf490fdb)) // ~-0.785398 (looks like -pi/4)
  2094  	DATA(20, U32(3221225472)) // 0xc0000000
  2095  	DATA(24, U32(0x37ccf5ce))
  2096  	DATA(28, U32(0xbab6061a))
  2097  	DATA(32, U32(0x3d2aaaa5))
  2098  	DATA(36, U32(0xbf000000)) // -0.5
  2099  	DATA(40, U32(0x3f800000)) // 1.0
  2100  	DATA(44, U32(0xb94ca1f9))
  2101  	DATA(48, U32(0x3c08839e))
  2102  	DATA(52, U32(0xbe2aaaa3))
  2103  	DATA(56, U32(2147483648)) // 0x80000000: the sign bit
  2104  	DATA(60, U32(0x4b7fffff)) // 16777215.0, scalar-path magnitude limit
  2105  	DATA(64, U64(0xffffffffffffffff))
  2106  	DATA(72, U64(0xffffffffffffffff))
  2107  	DATA(80, U64(0xffffffffffffffff))
  2108  	DATA(88, U64(0xffffffffffffffff))
  2109  	DATA(96, U64(0x0))
  2110  	DATA(104, U64(0x0))
  2111  	DATA(112, U64(0x0))
  2112  	DATA(120, U64(0x0))
  2113  
  2114  	TEXT("Cos_AVX2_F32", NOSPLIT, "func(x []float32)")
  2115  	Pragma("noescape")
        	// RDI = &x[0], RSI = len(x).
  2116  	Load(Param("x").Base(), RDI)
  2117  	Load(Param("x").Len(), RSI)
  2118  
        	// Lower RSP by 72 bytes (restored at LBB13_14). Spills go both into
        	// that area (RSP+0, RSP+32) and below RSP (-32 .. -128).
        	// NOTE(review): confirm that writing below RSP is safe for this target.
  2119  	SUBQ(Imm(72), RSP)
        	// RAX = len(x) rounded down to a multiple of 8 = number of elements
        	// the vector loop covers. ANDQ sets ZF, so JE skips the vector loop
        	// when fewer than 8 elements are present.
  2120  	MOVQ(RSI, RAX)
  2121  	ANDQ(I32(-8), RAX)
  2122  	JE(LabelRef("LBB13_3"))
        	// RCX is the vector-loop index; broadcast the constants.
  2123  	XORL(ECX, ECX)
  2124  	VBROADCASTSS(data.Offset(0), Y0)
  2125  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(32))
  2126  	VBROADCASTSS(data.Offset(4), Y0)
  2127  	VMOVUPS(Y0, Mem{Base: RSP})
  2128  	VBROADCASTSS(data.Offset(8), Y0)
  2129  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
  2130  	VPBROADCASTD(data.Offset(12), Y4)
  2131  	VBROADCASTSS(data.Offset(16), Y0)
  2132  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
  2133  	VBROADCASTSS(data.Offset(20), Y0)
  2134  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96))
  2135  	VBROADCASTSS(data.Offset(24), Y0)
  2136  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128))
  2137  	VBROADCASTSS(data.Offset(28), Y9)
  2138  	VBROADCASTSS(data.Offset(32), Y10)
  2139  	VBROADCASTSS(data.Offset(36), Y6)
  2140  	VBROADCASTSS(data.Offset(40), Y12)
  2141  	VBROADCASTSS(data.Offset(44), Y13)
  2142  	VBROADCASTSS(data.Offset(48), Y14)
  2143  	VBROADCASTSS(data.Offset(52), Y15)
  2144  	VPBROADCASTD(data.Offset(56), Y2)
  2145  
        	// Vector loop: one iteration handles x[RCX : RCX+8].
  2146  	Label("LBB13_2")
  2147  	{
        		// Y5 = inputs with the sign bit cleared.
  2148  		VMOVUPS(Mem{Base: RSP}.Offset(32), Y0)
  2149  		VANDPS(Mem{Base: RDI}.Idx(RCX, 4), Y0, Y5)
        		// Y0 = int32(trunc(|x| * data[4])); subtracting the all-ones table
        		// adds 1 to every lane. Y1 = that value with its lowest bit
        		// cleared; Y3 = Y1 as float.
  2150  		VMULPS(Mem{Base: RSP}, Y5, Y0)
  2151  		VCVTTPS2DQ(Y0, Y0)
  2152  		VPSUBD(data.Offset(64), Y0, Y0)
  2153  		VPAND(Mem{Base: RSP}.Offset(-32), Y0, Y1)
  2154  		VCVTDQ2PS(Y1, Y3)
        		// Y3 = Y3*data[16] + |x| (the reduced argument); Y5 = its square.
  2155  		VFMADD132PS(Mem{Base: RSP}.Offset(-64), Y5, Y3)
  2156  		VMULPS(Y3, Y3, Y5)
        		// First polynomial: Y8 over coefficients data[24], Y9, Y10, then
        		// Y11 = Y6*r^2 + Y12 + Y8*r^4.
  2157  		VMOVUPS(Mem{Base: RSP}.Offset(-128), Y8)
  2158  		VFMADD213PS(Y9, Y5, Y8)
  2159  		VFMADD213PS(Y10, Y5, Y8)
  2160  		VMULPS(Y5, Y5, Y7)
  2161  		VMOVAPS(Y6, Y11)
  2162  		VFMADD213PS(Y12, Y5, Y11)
  2163  		VFMADD231PS(Y7, Y8, Y11)
        		// Second polynomial: Y7 over coefficients Y13, Y14, Y15, finished
        		// as Y5 = Y7*r^3 + r.
  2164  		VMOVAPS(Y13, Y7)
  2165  		VFMADD213PS(Y14, Y5, Y7)
  2166  		VFMADD213PS(Y15, Y5, Y7)
  2167  		VMULPS(Y3, Y5, Y5)
  2168  		VFMADD213PS(Y3, Y7, Y5)
        		// Y0 = integer & 2. The two equality masks pick one polynomial per
        		// lane; subtracting that pick from the sum of both leaves the
        		// other polynomial in Y0.
  2169  		VPAND(Y4, Y0, Y0)
  2170  		VPCMPEQD(Y4, Y0, Y3)
  2171  		VPCMPEQD(data.Offset(96), Y0, Y0)
  2172  		VANDPS(Y0, Y5, Y0)
  2173  		VANDPS(Y3, Y11, Y3)
  2174  		VADDPS(Y3, Y0, Y0)
  2175  		VADDPS(Y5, Y11, Y3)
  2176  		VSUBPS(Y0, Y3, Y0)
        		// Sign: ((Y1 << 29) + data[20]), reduced to its sign bit and then
        		// inverted, is XORed into the result.
  2177  		VPSLLD(Imm(29), Y1, Y1)
  2178  		VPADDD(Mem{Base: RSP}.Offset(-96), Y1, Y1)
  2179  		VPAND(Y2, Y1, Y1)
  2180  		VPXOR(Y2, Y1, Y1)
  2181  		VXORPS(Y1, Y0, Y0)
        		// Store in place and advance by 8 while RCX < RAX.
  2182  		VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4))
  2183  		ADDQ(Imm(8), RCX)
  2184  		CMPQ(RCX, RAX)
  2185  		JB(LabelRef("LBB13_2"))
  2186  	}
  2187  
        	// Scalar tail setup: nothing to do when RAX already equals len(x);
        	// otherwise load the scalar constants into XMM registers and enter the
        	// per-element loop with RAX as the index.
  2188  	Label("LBB13_3")
  2189  	{
  2190  		CMPQ(RAX, RSI)
  2191  		JAE(LabelRef("LBB13_14"))
        		// X0 = sign-bit mask, X1 = 0.0, X2 = magnitude limit (data[60]).
  2192  		VBROADCASTSS(data.Offset(56), X0)
  2193  		VXORPS(X1, X1, X1)
  2194  		VMOVSS(data.Offset(60), X2)
  2195  		VMOVSS(data.Offset(40), X9)
  2196  		VMOVSS(data.Offset(16), X10)
  2197  		VMOVSS(data.Offset(24), X8)
  2198  		VMOVSS(data.Offset(28), X11)
  2199  		VMOVSS(data.Offset(32), X13)
  2200  		VMOVSS(data.Offset(36), X14)
  2201  		VMOVSS(data.Offset(44), X7)
  2202  		VMOVSS(data.Offset(48), X15)
  2203  		VMOVSS(data.Offset(52), X6)
  2204  		JMP(LabelRef("LBB13_5"))
  2205  	}
  2206  
        	// Scalar loop increment: next element, or exit when RAX >= len(x).
  2207  	Label("LBB13_13")
  2208  	{
  2209  		ADDQ(Imm(1), RAX)
  2210  		CMPQ(RAX, RSI)
  2211  		JAE(LabelRef("LBB13_14"))
  2212  	}
  2213  
        	// Scalar loop body, part 1: load x[RAX] and form its magnitude.
  2214  	Label("LBB13_5")
  2215  	{
  2216  		VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X3)
        		// X3 = |x|: the sign-flipped copy is selected by the (x < 0) mask.
  2217  		VXORPS(X0, X3, X4)
  2218  		VCMPSS(Imm(1), X1, X3, X5)
  2219  		VBLENDVPS(X5, X4, X3, X3)
        		// Elements whose magnitude compares above the limit are skipped;
        		// no store happens on that path, so they keep their value.
  2220  		VUCOMISS(X2, X3)
  2221  		JA(LabelRef("LBB13_13"))
        		// X4 = |x| * data[4]; EDX = its truncated integer value; X4 is then
        		// rounded with VROUNDSS mode 11.
  2222  		VMULSS(data.Offset(4), X3, X4)
  2223  		VCVTTSS2SI(X4, EDX)
  2224  		VROUNDSS(Imm(11), X4, X4, X4)
        		// If the integer is odd, add 1.0 (X9) to the float copy; ECX keeps
        		// the parity bit for the integer adjustment in LBB13_8.
  2225  		MOVL(EDX, ECX)
  2226  		ANDL(Imm(1), ECX)
  2227  		JE(LabelRef("LBB13_8"))
  2228  		VADDSS(X4, X9, X4)
  2229  	}
  2230  
        	// Scalar loop body, part 2: octant bookkeeping and both polynomials.
  2231  	Label("LBB13_8")
  2232  	{
        		// ECX = (integer + parity) & 7. EDX = ECX when ECX < 4, otherwise
        		// ECX - 4. R8B = (ECX >= 4) from the first compare; CL = (EDX >= 2)
        		// from the second. The result is negated when the two flags differ.
  2233  		ADDL(EDX, ECX)
  2234  		ANDL(Imm(7), ECX)
  2235  		LEAL(Mem{Base: RCX}.Offset(-4), EDX)
  2236  		CMPL(ECX, Imm(4))
  2237  		CMOVLLT(ECX, EDX)
  2238  		SETCC(R8B)
  2239  		CMPL(EDX, Imm(2))
  2240  		SETCC(CL)
        		// X3 = |x| + X4*X10 (reduced argument); X4 = its square.
  2241  		VFMADD231SS(X10, X4, X3)
  2242  		VMULSS(X3, X3, X4)
        		// X12: polynomial with coefficients X8, X11, X13, X14.
  2243  		VMOVAPS(X8, X12)
  2244  		VFMADD213SS(X11, X4, X12)
  2245  		VFMADD213SS(X13, X4, X12)
  2246  		VFMADD213SS(X14, X4, X12)
        		// X5: polynomial with coefficients X7, X15, X6.
  2247  		VMOVAPS(X7, X5)
  2248  		VFMADD213SS(X15, X4, X5)
  2249  		VFMADD213SS(X6, X4, X5)
        		// EDX-1 < 2 (unsigned), i.e. EDX is 1 or 2, selects the X5 form in
        		// LBB13_9; otherwise the result is X12*r^2 + 1.0, stored here.
  2250  		ADDL(I32(-1), EDX)
  2251  		CMPL(EDX, Imm(2))
  2252  		JB(LabelRef("LBB13_9"))
  2253  		VFMADD213SS(X9, X12, X4)
  2254  		VMOVAPS(X4, X5)
  2255  		VMOVSS(X5, Mem{Base: RDI}.Idx(RAX, 4))
  2256  		CMPB(R8B, CL)
  2257  		JE(LabelRef("LBB13_13"))
  2258  		JMP(LabelRef("LBB13_12"))
  2259  	}
  2260  
        	// Alternate form: result = X5*r^3 + r, stored, then the same sign test.
  2261  	Label("LBB13_9")
  2262  	{
  2263  		VMULSS(X3, X4, X4)
  2264  		VFMADD213SS(X3, X4, X5)
  2265  		VMOVSS(X5, Mem{Base: RDI}.Idx(RAX, 4))
  2266  		CMPB(R8B, CL)
  2267  		JE(LabelRef("LBB13_13"))
  2268  	}
  2269  
        	// Sign fix-up: overwrite the stored element with its negation.
  2270  	Label("LBB13_12")
  2271  	{
  2272  		VXORPS(X0, X5, X3)
  2273  		VMOVSS(X3, Mem{Base: RDI}.Idx(RAX, 4))
  2274  		JMP(LabelRef("LBB13_13"))
  2275  	}
  2276  
        	// Epilogue: restore RSP, clear the upper YMM halves and return.
  2277  	Label("LBB13_14")
  2278  	{
  2279  		ADDQ(Imm(72), RSP)
  2280  		VZEROUPPER()
  2281  		RET()
  2282  	}
  2283  }
  2284  
  2285  func genSinCos_F32() {
  2286  
        	// genSinCos_F32 emits, through avo, the read-only table dataSinCosF32
        	// and the routine SinCos_AVX2_F32 with the Go signature
        	// func(x, y, z []float32). The routine reads its inputs from z and
        	// writes two results per element, one into x and one into y;
        	// presumably x receives the sine and y the cosine — NOTE(review):
        	// confirm against callers. A YMM loop handles the first len(x)&^7
        	// elements eight at a time and a scalar loop handles the remainder.
        	//
        	// Only len(x) is loaded; the lengths of y and z are never read, so
        	// presumably callers guarantee all three slices are equally long —
        	// verify.
        	//
        	// avo operands are written in Go assembler order (destination last).

        	// Constant table. Offsets 0..63 are 32-bit slots; offsets 64..95 hold
        	// 32 bytes of all-ones (int32 -1 in every lane) and offsets 96..127
        	// hold 32 zero bytes, both used directly as 256-bit memory operands.
        	// The decimal values in the trailing comments are decoded from the bit
        	// patterns.
  2287  	data := GLOBL("dataSinCosF32", RODATA|NOPTR)
  2288  	DATA(0, U32(2147483647)) // 0x7fffffff: clears the sign bit
  2289  	DATA(4, U32(0x3fa2f983)) // ~1.27324 (looks like 4/pi)
  2290  	DATA(8, U32(4294967294)) // 0xfffffffe: clears the lowest bit
  2291  	DATA(12, U32(2))
  2292  	DATA(16, U32(0xbf490fdb)) // ~-0.785398 (looks like -pi/4)
  2293  	DATA(20, U32(3221225472)) // 0xc0000000
  2294  	DATA(24, U32(2147483648)) // 0x80000000: the sign bit
  2295  	DATA(28, U32(0x37ccf5ce))
  2296  	DATA(32, U32(0xbab6061a))
  2297  	DATA(36, U32(0x3d2aaaa5))
  2298  	DATA(40, U32(0xbf000000)) // -0.5
  2299  	DATA(44, U32(0x3f800000)) // 1.0
  2300  	DATA(48, U32(0xb94ca1f9))
  2301  	DATA(52, U32(0x3c08839e))
  2302  	DATA(56, U32(0xbe2aaaa3))
  2303  	DATA(60, U32(0x4b7fffff)) // 16777215.0, scalar-path magnitude limit
  2304  	DATA(64, U64(0xffffffffffffffff))
  2305  	DATA(72, U64(0xffffffffffffffff))
  2306  	DATA(80, U64(0xffffffffffffffff))
  2307  	DATA(88, U64(0xffffffffffffffff))
  2308  	DATA(96, U64(0x0))
  2309  	DATA(104, U64(0x0))
  2310  	DATA(112, U64(0x0))
  2311  	DATA(120, U64(0x0))
  2312  
        	// NOTE(review): this TEXT is declared with flags 0, whereas the other
        	// generators visible in this file pass NOSPLIT — confirm that the
        	// difference is intended.
  2313  	TEXT("SinCos_AVX2_F32", 0, "func(x, y, z []float32)")
  2314  	Pragma("noescape")
        	// RDI = &x[0], RSI = &y[0], RDX = &z[0], RCX = len(x).
  2315  	Load(Param("x").Base(), RDI)
  2316  	Load(Param("y").Base(), RSI)
  2317  	Load(Param("z").Base(), RDX)
  2318  	Load(Param("x").Len(), RCX)
  2319  
        	// RBX is used as scratch in the scalar path, so it is saved here and
        	// restored at LBB14_16. RSP is lowered by 96 bytes; spills go both into
        	// that area (RSP+0 .. RSP+64) and below RSP (-32 .. -128).
        	// NOTE(review): confirm that writing below RSP is safe for this target.
  2320  	PUSHQ(RBX)
  2321  	SUBQ(Imm(96), RSP)
        	// R8 = len(x) rounded down to a multiple of 8 = number of elements the
        	// vector loop covers. ANDQ sets ZF, so JE skips the vector loop when
        	// fewer than 8 elements are present.
  2322  	MOVQ(RCX, R8)
  2323  	ANDQ(I32(-8), R8)
  2324  	JE(LabelRef("LBB14_3"))
        	// RAX is the vector-loop index; broadcast the constants.
  2325  	XORL(EAX, EAX)
  2326  	VBROADCASTSS(data.Offset(0), Y0)
  2327  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(64))
  2328  	VBROADCASTSS(data.Offset(4), Y0)
  2329  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(32))
  2330  	VBROADCASTSS(data.Offset(8), Y0)
  2331  	VMOVUPS(Y0, Mem{Base: RSP})
  2332  	VPBROADCASTD(data.Offset(12), Y4)
  2333  	VBROADCASTSS(data.Offset(16), Y0)
  2334  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
  2335  	VBROADCASTSS(data.Offset(20), Y0)
  2336  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
  2337  	VPBROADCASTD(data.Offset(24), Y8)
  2338  	VBROADCASTSS(data.Offset(28), Y0)
  2339  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96))
  2340  	VBROADCASTSS(data.Offset(32), Y0)
  2341  	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128))
  2342  	VBROADCASTSS(data.Offset(36), Y11)
  2343  	VBROADCASTSS(data.Offset(40), Y10)
  2344  	VBROADCASTSS(data.Offset(44), Y13)
  2345  	VBROADCASTSS(data.Offset(48), Y14)
  2346  	VBROADCASTSS(data.Offset(52), Y15)
  2347  	VBROADCASTSS(data.Offset(56), Y2)
  2348  
        	// Vector loop: one iteration handles elements [RAX, RAX+8).
  2349  	Label("LBB14_2")
  2350  	{
        		// Y5 = inputs from z, Y1 = inputs with the sign bit cleared.
  2351  		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4), Y5)
  2352  		VANDPS(Mem{Base: RSP}.Offset(64), Y5, Y1)
        		// Y3 = int32(trunc(|z| * data[4])) + 1 (subtracting the all-ones
        		// table adds 1). Y0 = Y3 with its lowest bit cleared; Y6 = Y0 as
        		// float.
  2353  		VMULPS(Mem{Base: RSP}.Offset(32), Y1, Y0)
  2354  		VCVTTPS2DQ(Y0, Y0)
  2355  		VPSUBD(data.Offset(64), Y0, Y3)
  2356  		VPAND(Mem{Base: RSP}, Y3, Y0)
  2357  		VCVTDQ2PS(Y0, Y6)
        		// Y6 = Y6*data[16] + |z| (the reduced argument); Y1 = its square.
  2358  		VFMADD132PS(Mem{Base: RSP}.Offset(-32), Y1, Y6)
  2359  		VMULPS(Y6, Y6, Y1)
        		// First polynomial: Y9 over coefficients data[28], data[32], Y11,
        		// then Y12 = Y10*r^2 + Y13 + Y9*r^4.
  2360  		VMOVUPS(Mem{Base: RSP}.Offset(-96), Y9)
  2361  		VFMADD213PS(Mem{Base: RSP}.Offset(-128), Y1, Y9)
  2362  		VFMADD213PS(Y11, Y1, Y9)
  2363  		VMULPS(Y1, Y1, Y7)
  2364  		VMOVAPS(Y10, Y12)
  2365  		VFMADD213PS(Y13, Y1, Y12)
  2366  		VFMADD231PS(Y7, Y9, Y12)
        		// Second polynomial: Y7 over coefficients Y14, Y15, Y2, finished as
        		// Y1 = Y7*r^3 + r.
  2367  		VMOVAPS(Y14, Y7)
  2368  		VFMADD213PS(Y15, Y1, Y7)
  2369  		VFMADD213PS(Y2, Y1, Y7)
  2370  		VMULPS(Y6, Y1, Y1)
  2371  		VFMADD213PS(Y6, Y7, Y1)
        		// Y5 = (integer << 29) XOR input: sign source for the x output.
        		// Y3 = integer & 2 drives the per-lane polynomial selection.
  2372  		VPSLLD(Imm(29), Y3, Y6)
  2373  		VPAND(Y4, Y3, Y3)
  2374  		VPXOR(Y5, Y6, Y5)
  2375  		VPCMPEQD(Y4, Y3, Y6)
  2376  		VPCMPEQD(data.Offset(96), Y3, Y3)
        		// Y3 = the polynomial picked by the masks (goes to x);
        		// Y1 = (sum of both) - Y3 = the other polynomial (goes to y).
  2377  		VANDPS(Y3, Y1, Y3)
  2378  		VANDPS(Y6, Y12, Y6)
  2379  		VADDPS(Y3, Y6, Y3)
  2380  		VADDPS(Y1, Y12, Y1)
  2381  		VPAND(Y5, Y8, Y5)
  2382  		VSUBPS(Y3, Y1, Y1)
  2383  		VPXOR(Y3, Y5, Y3)
        		// Sign for the y output: ((Y0 << 29) + data[20]), reduced to its
        		// sign bit and then inverted, XORed into Y1.
  2384  		VPSLLD(Imm(29), Y0, Y0)
  2385  		VPADDD(Mem{Base: RSP}.Offset(-64), Y0, Y0)
  2386  		VPAND(Y0, Y8, Y0)
  2387  		VPXOR(Y0, Y8, Y0)
  2388  		VXORPS(Y0, Y1, Y0)
        		// Store both outputs and advance by 8 while RAX < R8.
  2389  		VMOVDQU(Y3, Mem{Base: RDI}.Idx(RAX, 4))
  2390  		VMOVUPS(Y0, Mem{Base: RSI}.Idx(RAX, 4))
  2391  		ADDQ(Imm(8), RAX)
  2392  		CMPQ(RAX, R8)
  2393  		JB(LabelRef("LBB14_2"))
  2394  	}
  2395  
        	// Scalar tail setup: nothing to do when R8 already equals len(x);
        	// otherwise load the scalar constants into XMM registers and enter the
        	// per-element loop with R8 as the index.
  2396  	Label("LBB14_3")
  2397  	{
  2398  		CMPQ(R8, RCX)
  2399  		JAE(LabelRef("LBB14_16"))
        		// X0 = sign-bit mask, X1 = 0.0, X2 = magnitude limit (data[60]).
  2400  		VBROADCASTSS(data.Offset(24), X0)
  2401  		VXORPS(X1, X1, X1)
  2402  		VMOVSS(data.Offset(60), X2)
  2403  		VMOVSS(data.Offset(44), X6)
  2404  		VMOVSS(data.Offset(28), X8)
  2405  		VMOVSS(data.Offset(36), X12)
  2406  		VMOVSS(data.Offset(40), X13)
  2407  		VMOVSS(data.Offset(48), X15)
  2408  		VMOVSS(data.Offset(52), X14)
  2409  		VMOVSS(data.Offset(56), X10)
  2410  		JMP(LabelRef("LBB14_5"))
  2411  	}
  2412  
        	// Scalar loop increment: next element, or exit when R8 >= len(x).
  2413  	Label("LBB14_15")
  2414  	{
  2415  		ADDQ(Imm(1), R8)
  2416  		CMPQ(R8, RCX)
  2417  		JAE(LabelRef("LBB14_16"))
  2418  	}
  2419  
        	// Scalar loop body, part 1: load z[R8] and form its magnitude.
  2420  	Label("LBB14_5")
  2421  	{
  2422  		VMOVSS(Mem{Base: RDX}.Idx(R8, 4), X4)
        		// X5 = |z|: the sign-flipped copy is selected by the (z < 0) mask.
  2423  		VXORPS(X0, X4, X5)
  2424  		VCMPSS(Imm(1), X1, X4, X7)
  2425  		VBLENDVPS(X7, X5, X4, X5)
        		// Elements whose magnitude compares above the limit are skipped;
        		// neither x[R8] nor y[R8] is written on that path.
  2426  		VUCOMISS(X2, X5)
  2427  		JA(LabelRef("LBB14_15"))
        		// R9B = carry flag of comparing z with 0.0 (appears to record
        		// "z is negative" for the sign of the x output).
  2428  		VUCOMISS(X1, X4)
  2429  		SETCS(R9B)
        		// X4 = |z| * data[4]; R10L = its truncated integer value; X4 is
        		// then rounded with VROUNDSS mode 11.
  2430  		VMULSS(data.Offset(4), X5, X4)
  2431  		VCVTTSS2SI(X4, R10L)
  2432  		VROUNDSS(Imm(11), X4, X4, X4)
        		// If the integer is odd, add 1.0 (X6) to the float copy; EAX keeps
        		// the parity bit for the integer adjustment in LBB14_8.
  2433  		MOVL(R10L, EAX)
  2434  		ANDL(Imm(1), EAX)
  2435  		JE(LabelRef("LBB14_8"))
  2436  		VADDSS(X6, X4, X4)
  2437  	}
  2438  
        	// Scalar loop body, part 2: octant bookkeeping and both polynomials.
  2439  	Label("LBB14_8")
  2440  	{
        		// EAX = (integer + parity) & 7. R11B = (EAX >= 4). R10L = EAX when
        		// EAX < 4, otherwise EAX - 4.
  2441  		ADDL(R10L, EAX)
  2442  		ANDL(Imm(7), EAX)
  2443  		LEAL(Mem{Base: RAX}.Offset(-4), R10L)
  2444  		CMPL(EAX, Imm(4))
  2445  		SETCC(R11B)
  2446  		CMOVLLT(EAX, R10L)
        		// X5 = |z| + X4*data[16] (reduced argument); X7 = its square.
  2447  		VFMADD231SS(data.Offset(16), X4, X5)
  2448  		VMULSS(X5, X5, X7)
        		// X4 = X6 + X13*r^2 + X11*r^4, where X11 is the polynomial over
        		// coefficients X8, data[32], X12.
  2449  		VMOVAPS(X8, X11)
  2450  		VFMADD213SS(data.Offset(32), X7, X11)
  2451  		VFMADD213SS(X12, X7, X11)
  2452  		VMULSS(X7, X7, X9)
  2453  		VMOVAPS(X6, X4)
  2454  		VFMADD231SS(X13, X7, X4)
  2455  		VFMADD231SS(X9, X11, X4)
        		// X7 = X3*r^3 + r, where X3 is the polynomial over coefficients
        		// X15, X14, X10.
  2456  		VMOVAPS(X15, X3)
  2457  		VFMADD213SS(X14, X7, X3)
  2458  		VFMADD213SS(X10, X7, X3)
  2459  		VMULSS(X5, X7, X7)
  2460  		VFMADD213SS(X5, X3, X7)
        		// R10L-1 < 2 (unsigned), i.e. R10L is 1 or 2, swaps which value
        		// goes to which output (LBB14_9). Otherwise X7 goes to x and X4
        		// goes to y.
  2461  		LEAL(Mem{Base: R10}.Offset(-1), EBX)
  2462  		CMPL(EBX, Imm(2))
  2463  		JB(LabelRef("LBB14_9"))
  2464  		VMOVAPS(X7, X5)
  2465  		VMOVSS(X5, Mem{Base: RDI}.Idx(R8, 4))
  2466  		VMOVSS(X4, Mem{Base: RSI}.Idx(R8, 4))
        		// Flags differ: negate the x output in LBB14_12.
  2467  		CMPB(R9B, R11B)
  2468  		JNE(LabelRef("LBB14_12"))
  2469  		JMP(LabelRef("LBB14_13"))
  2470  	}
  2471  
        	// Swapped assignment: X4 goes to x and X7 goes to y, followed by the
        	// same sign test for the x output.
  2472  	Label("LBB14_9")
  2473  	{
  2474  		VMOVAPS(X4, X5)
  2475  		VMOVAPS(X7, X4)
  2476  		VMOVSS(X5, Mem{Base: RDI}.Idx(R8, 4))
  2477  		VMOVSS(X4, Mem{Base: RSI}.Idx(R8, 4))
  2478  		CMPB(R9B, R11B)
  2479  		JE(LabelRef("LBB14_13"))
  2480  	}
  2481  
        	// Negate x[R8] in place (reload, flip the sign bit, store).
  2482  	Label("LBB14_12")
  2483  	{
  2484  		VMOVSS(Mem{Base: RDI}.Idx(R8, 4), X3)
  2485  		VXORPS(X0, X3, X3)
  2486  		VMOVSS(X3, Mem{Base: RDI}.Idx(R8, 4))
  2487  	}
  2488  
        	// Sign of the y output: BL = (R10L >= 2), AL = (EAX >= 4). When they
        	// are equal the stored value stands; otherwise y[R8] is negated in
        	// place.
  2489  	Label("LBB14_13")
  2490  	{
  2491  		CMPL(R10L, Imm(2))
  2492  		SETCC(BL)
  2493  		CMPL(EAX, Imm(4))
  2494  		SETCC(AL)
  2495  		CMPB(AL, BL)
  2496  		JE(LabelRef("LBB14_15"))
  2497  		VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X3)
  2498  		VXORPS(X0, X3, X3)
  2499  		VMOVSS(X3, Mem{Base: RSI}.Idx(R8, 4))
  2500  		JMP(LabelRef("LBB14_15"))
  2501  	}
  2502  
        	// Epilogue: restore RSP and RBX, clear the upper YMM halves, return.
  2503  	Label("LBB14_16")
  2504  	{
  2505  		ADDQ(Imm(96), RSP)
  2506  		POPQ(RBX)
  2507  		VZEROUPPER()
  2508  		RET()
  2509  	}
  2510  }