gitee.com/quant1x/num@v0.3.2/internal/functions/accel_avx2_amd64.s

// Code generated by command: go run gen.go -out ../internal/functions/accel_avx2_amd64.s -stubs ../internal/functions/accel_avx2_amd64.go -pkg functions. DO NOT EDIT.

#include "textflag.h"

// func Add_AVX2_F64(x []float64, y []float64)
// Requires: AVX
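//
// The vector loop below handles 16 float64s per iteration (four unrolled
// 32-byte YMM loads/stores); any remainder falls through to the scalar
// loop at LBB0_6. A rough Go equivalent of the whole routine:
//
//	for i := range x {
//		x[i] += y[i]
//	}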
TEXT ·Add_AVX2_F64(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB0_7
	CMPQ  DX, $0x10
	JAE   LBB0_3
	XORL  AX, AX
	JMP   LBB0_6

LBB0_3:
	MOVQ DX, AX
	ANDQ $-16, AX
	XORL CX, CX

LBB0_4:
	VMOVUPD (DI)(CX*8), Y0
	VMOVUPD 32(DI)(CX*8), Y1
	VMOVUPD 64(DI)(CX*8), Y2
	VMOVUPD 96(DI)(CX*8), Y3
	VADDPD  (SI)(CX*8), Y0, Y0
	VADDPD  32(SI)(CX*8), Y1, Y1
	VADDPD  64(SI)(CX*8), Y2, Y2
	VADDPD  96(SI)(CX*8), Y3, Y3
	VMOVUPD Y0, (DI)(CX*8)
	VMOVUPD Y1, 32(DI)(CX*8)
	VMOVUPD Y2, 64(DI)(CX*8)
	VMOVUPD Y3, 96(DI)(CX*8)
	ADDQ    $0x10, CX
	CMPQ    AX, CX
	JNE     LBB0_4
	CMPQ    AX, DX
	JE      LBB0_7

LBB0_6:
	VMOVSD (DI)(AX*8), X0
	VADDSD (SI)(AX*8), X0, X0
	VMOVSD X0, (DI)(AX*8)
	ADDQ   $0x01, AX
	CMPQ   DX, AX
	JNE    LBB0_6

LBB0_7:
	VZEROUPPER
	RET

// func Add_AVX2_F32(x []float32, y []float32)
// Requires: AVX
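//
// Same scheme as Add_AVX2_F64; float32 packs 8 lanes per YMM register,
// so the vector block size doubles to 32 elements.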
TEXT ·Add_AVX2_F32(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB1_7
	CMPQ  DX, $0x20
	JAE   LBB1_3
	XORL  AX, AX
	JMP   LBB1_6

LBB1_3:
	MOVQ DX, AX
	ANDQ $-32, AX
	XORL CX, CX

LBB1_4:
	VMOVUPS (DI)(CX*4), Y0
	VMOVUPS 32(DI)(CX*4), Y1
	VMOVUPS 64(DI)(CX*4), Y2
	VMOVUPS 96(DI)(CX*4), Y3
	VADDPS  (SI)(CX*4), Y0, Y0
	VADDPS  32(SI)(CX*4), Y1, Y1
	VADDPS  64(SI)(CX*4), Y2, Y2
	VADDPS  96(SI)(CX*4), Y3, Y3
	VMOVUPS Y0, (DI)(CX*4)
	VMOVUPS Y1, 32(DI)(CX*4)
	VMOVUPS Y2, 64(DI)(CX*4)
	VMOVUPS Y3, 96(DI)(CX*4)
	ADDQ    $0x20, CX
	CMPQ    AX, CX
	JNE     LBB1_4
	CMPQ    AX, DX
	JE      LBB1_7

LBB1_6:
	VMOVSS (DI)(AX*4), X0
	VADDSS (SI)(AX*4), X0, X0
	VMOVSS X0, (DI)(AX*4)
	ADDQ   $0x01, AX
	CMPQ   DX, AX
	JNE    LBB1_6

LBB1_7:
	VZEROUPPER
	RET

// func AddNumber_AVX2_F64(x []float64, a float64)
// Requires: AVX, AVX2, SSE2
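//
// The scalar a is broadcast into every lane of Y1, and 16-element blocks
// are processed two per trip through LBB2_6 (32 elements). R8 counts the
// blocks: the TESTB parity check runs LBB2_8 for an odd trailing block,
// and LBB2_10 is the scalar remainder loop. In effect, x[i] += a for all i.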
TEXT ·AddNumber_AVX2_F64(SB), NOSPLIT, $0-32
	MOVQ  x_base+0(FP), DI
	MOVSD a+24(FP), X0
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB2_11
	CMPQ  SI, $0x10
	JAE   LBB2_3
	XORL  AX, AX
	JMP   LBB2_10

LBB2_3:
	MOVQ         SI, AX
	ANDQ         $-16, AX
	VBROADCASTSD X0, Y1
	LEAQ         -16(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x04, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB2_4
	MOVQ         R8, DX
	ANDQ         $-2, DX
	XORL         CX, CX

LBB2_6:
	VADDPD  (DI)(CX*8), Y1, Y2
	VADDPD  32(DI)(CX*8), Y1, Y3
	VADDPD  64(DI)(CX*8), Y1, Y4
	VADDPD  96(DI)(CX*8), Y1, Y5
	VMOVUPD Y2, (DI)(CX*8)
	VMOVUPD Y3, 32(DI)(CX*8)
	VMOVUPD Y4, 64(DI)(CX*8)
	VMOVUPD Y5, 96(DI)(CX*8)
	VADDPD  128(DI)(CX*8), Y1, Y2
	VADDPD  160(DI)(CX*8), Y1, Y3
	VADDPD  192(DI)(CX*8), Y1, Y4
	VADDPD  224(DI)(CX*8), Y1, Y5
	VMOVUPD Y2, 128(DI)(CX*8)
	VMOVUPD Y3, 160(DI)(CX*8)
	VMOVUPD Y4, 192(DI)(CX*8)
	VMOVUPD Y5, 224(DI)(CX*8)
	ADDQ    $0x20, CX
	ADDQ    $-2, DX
	JNE     LBB2_6
	TESTB   $0x01, R8
	JE      LBB2_9

LBB2_8:
	VADDPD  (DI)(CX*8), Y1, Y2
	VADDPD  32(DI)(CX*8), Y1, Y3
	VADDPD  64(DI)(CX*8), Y1, Y4
	VADDPD  96(DI)(CX*8), Y1, Y1
	VMOVUPD Y2, (DI)(CX*8)
	VMOVUPD Y3, 32(DI)(CX*8)
	VMOVUPD Y4, 64(DI)(CX*8)
	VMOVUPD Y1, 96(DI)(CX*8)

LBB2_9:
	CMPQ AX, SI
	JE   LBB2_11

LBB2_10:
	VADDSD (DI)(AX*8), X0, X1
	VMOVSD X1, (DI)(AX*8)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB2_10

LBB2_11:
	VZEROUPPER
	RET

LBB2_4:
	XORL  CX, CX
	TESTB $0x01, R8
	JNE   LBB2_8
	JMP   LBB2_9

// func AddNumber_AVX2_F32(x []float32, a float32)
// Requires: AVX, AVX2, SSE
TEXT ·AddNumber_AVX2_F32(SB), NOSPLIT, $0-28
	MOVQ  x_base+0(FP), DI
	MOVSS a+24(FP), X0
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB3_11
	CMPQ  SI, $0x20
	JAE   LBB3_3
	XORL  AX, AX
	JMP   LBB3_10

LBB3_3:
	MOVQ         SI, AX
	ANDQ         $-32, AX
	VBROADCASTSS X0, Y1
	LEAQ         -32(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x05, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB3_4
	MOVQ         R8, DX
	ANDQ         $-2, DX
	XORL         CX, CX

LBB3_6:
	VADDPS  (DI)(CX*4), Y1, Y2
	VADDPS  32(DI)(CX*4), Y1, Y3
	VADDPS  64(DI)(CX*4), Y1, Y4
	VADDPS  96(DI)(CX*4), Y1, Y5
	VMOVUPS Y2, (DI)(CX*4)
	VMOVUPS Y3, 32(DI)(CX*4)
	VMOVUPS Y4, 64(DI)(CX*4)
	VMOVUPS Y5, 96(DI)(CX*4)
	VADDPS  128(DI)(CX*4), Y1, Y2
	VADDPS  160(DI)(CX*4), Y1, Y3
	VADDPS  192(DI)(CX*4), Y1, Y4
	VADDPS  224(DI)(CX*4), Y1, Y5
	VMOVUPS Y2, 128(DI)(CX*4)
	VMOVUPS Y3, 160(DI)(CX*4)
	VMOVUPS Y4, 192(DI)(CX*4)
	VMOVUPS Y5, 224(DI)(CX*4)
	ADDQ    $0x40, CX
	ADDQ    $-2, DX
	JNE     LBB3_6
	TESTB   $0x01, R8
	JE      LBB3_9

LBB3_8:
	VADDPS  (DI)(CX*4), Y1, Y2
	VADDPS  32(DI)(CX*4), Y1, Y3
	VADDPS  64(DI)(CX*4), Y1, Y4
	VADDPS  96(DI)(CX*4), Y1, Y1
	VMOVUPS Y2, (DI)(CX*4)
	VMOVUPS Y3, 32(DI)(CX*4)
	VMOVUPS Y4, 64(DI)(CX*4)
	VMOVUPS Y1, 96(DI)(CX*4)

LBB3_9:
	CMPQ AX, SI
	JE   LBB3_11

LBB3_10:
	VADDSS (DI)(AX*4), X0, X1
	VMOVSS X1, (DI)(AX*4)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB3_10

LBB3_11:
	VZEROUPPER
	RET

LBB3_4:
	XORL  CX, CX
	TESTB $0x01, R8
	JNE   LBB3_8
	JMP   LBB3_9

// func Sub_AVX2_F64(x []float64, y []float64)
// Requires: AVX
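//
// Identical structure to Add_AVX2_F64 with VSUBPD/VSUBSD substituted:
// x[i] -= y[i].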
TEXT ·Sub_AVX2_F64(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB4_7
	CMPQ  DX, $0x10
	JAE   LBB4_3
	XORL  AX, AX
	JMP   LBB4_6

LBB4_3:
	MOVQ DX, AX
	ANDQ $-16, AX
	XORL CX, CX

LBB4_4:
	VMOVUPD (DI)(CX*8), Y0
	VMOVUPD 32(DI)(CX*8), Y1
	VMOVUPD 64(DI)(CX*8), Y2
	VMOVUPD 96(DI)(CX*8), Y3
	VSUBPD  (SI)(CX*8), Y0, Y0
	VSUBPD  32(SI)(CX*8), Y1, Y1
	VSUBPD  64(SI)(CX*8), Y2, Y2
	VSUBPD  96(SI)(CX*8), Y3, Y3
	VMOVUPD Y0, (DI)(CX*8)
	VMOVUPD Y1, 32(DI)(CX*8)
	VMOVUPD Y2, 64(DI)(CX*8)
	VMOVUPD Y3, 96(DI)(CX*8)
	ADDQ    $0x10, CX
	CMPQ    AX, CX
	JNE     LBB4_4
	CMPQ    AX, DX
	JE      LBB4_7

LBB4_6:
	VMOVSD (DI)(AX*8), X0
	VSUBSD (SI)(AX*8), X0, X0
	VMOVSD X0, (DI)(AX*8)
	ADDQ   $0x01, AX
	CMPQ   DX, AX
	JNE    LBB4_6

LBB4_7:
	VZEROUPPER
	RET

// func Sub_AVX2_F32(x []float32, y []float32)
// Requires: AVX
TEXT ·Sub_AVX2_F32(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB5_7
	CMPQ  DX, $0x20
	JAE   LBB5_3
	XORL  AX, AX
	JMP   LBB5_6

LBB5_3:
	MOVQ DX, AX
	ANDQ $-32, AX
	XORL CX, CX

LBB5_4:
	VMOVUPS (DI)(CX*4), Y0
	VMOVUPS 32(DI)(CX*4), Y1
	VMOVUPS 64(DI)(CX*4), Y2
	VMOVUPS 96(DI)(CX*4), Y3
	VSUBPS  (SI)(CX*4), Y0, Y0
	VSUBPS  32(SI)(CX*4), Y1, Y1
	VSUBPS  64(SI)(CX*4), Y2, Y2
	VSUBPS  96(SI)(CX*4), Y3, Y3
	VMOVUPS Y0, (DI)(CX*4)
	VMOVUPS Y1, 32(DI)(CX*4)
	VMOVUPS Y2, 64(DI)(CX*4)
	VMOVUPS Y3, 96(DI)(CX*4)
	ADDQ    $0x20, CX
	CMPQ    AX, CX
	JNE     LBB5_4
	CMPQ    AX, DX
	JE      LBB5_7

LBB5_6:
	VMOVSS (DI)(AX*4), X0
	VSUBSS (SI)(AX*4), X0, X0
	VMOVSS X0, (DI)(AX*4)
	ADDQ   $0x01, AX
	CMPQ   DX, AX
	JNE    LBB5_6

LBB5_7:
	VZEROUPPER
	RET

// func SubNumber_AVX2_F64(x []float64, a float64)
// Requires: AVX, AVX2, SSE2
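//
// Here the broadcast value is the subtrahend, so each block is loaded
// first and VSUBPD Y1, Yn, Yn computes block - a (x[i] -= a). The
// two-blocks-per-iteration unrolling and odd-block parity handling match
// AddNumber_AVX2_F64.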
TEXT ·SubNumber_AVX2_F64(SB), NOSPLIT, $0-32
	MOVQ  x_base+0(FP), DI
	MOVSD a+24(FP), X0
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB6_11
	CMPQ  SI, $0x10
	JAE   LBB6_3
	XORL  AX, AX
	JMP   LBB6_10

LBB6_3:
	MOVQ         SI, AX
	ANDQ         $-16, AX
	VBROADCASTSD X0, Y1
	LEAQ         -16(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x04, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB6_4
	MOVQ         R8, DX
	ANDQ         $-2, DX
	XORL         CX, CX

LBB6_6:
	VMOVUPD (DI)(CX*8), Y2
	VMOVUPD 32(DI)(CX*8), Y3
	VMOVUPD 64(DI)(CX*8), Y4
	VMOVUPD 96(DI)(CX*8), Y5
	VSUBPD  Y1, Y2, Y2
	VSUBPD  Y1, Y3, Y3
	VSUBPD  Y1, Y4, Y4
	VSUBPD  Y1, Y5, Y5
	VMOVUPD Y2, (DI)(CX*8)
	VMOVUPD Y3, 32(DI)(CX*8)
	VMOVUPD Y4, 64(DI)(CX*8)
	VMOVUPD Y5, 96(DI)(CX*8)
	VMOVUPD 128(DI)(CX*8), Y2
	VMOVUPD 160(DI)(CX*8), Y3
	VMOVUPD 192(DI)(CX*8), Y4
	VMOVUPD 224(DI)(CX*8), Y5
	VSUBPD  Y1, Y2, Y2
	VSUBPD  Y1, Y3, Y3
	VSUBPD  Y1, Y4, Y4
	VSUBPD  Y1, Y5, Y5
	VMOVUPD Y2, 128(DI)(CX*8)
	VMOVUPD Y3, 160(DI)(CX*8)
	VMOVUPD Y4, 192(DI)(CX*8)
	VMOVUPD Y5, 224(DI)(CX*8)
	ADDQ    $0x20, CX
	ADDQ    $-2, DX
	JNE     LBB6_6
	TESTB   $0x01, R8
	JE      LBB6_9

LBB6_8:
	VMOVUPD (DI)(CX*8), Y2
	VMOVUPD 32(DI)(CX*8), Y3
	VMOVUPD 64(DI)(CX*8), Y4
	VMOVUPD 96(DI)(CX*8), Y5
	VSUBPD  Y1, Y2, Y2
	VSUBPD  Y1, Y3, Y3
	VSUBPD  Y1, Y4, Y4
	VSUBPD  Y1, Y5, Y1
	VMOVUPD Y2, (DI)(CX*8)
	VMOVUPD Y3, 32(DI)(CX*8)
	VMOVUPD Y4, 64(DI)(CX*8)
	VMOVUPD Y1, 96(DI)(CX*8)

LBB6_9:
	CMPQ AX, SI
	JE   LBB6_11

LBB6_10:
	VMOVSD (DI)(AX*8), X1
	VSUBSD X0, X1, X1
	VMOVSD X1, (DI)(AX*8)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB6_10

LBB6_11:
	VZEROUPPER
	RET

LBB6_4:
	XORL  CX, CX
	TESTB $0x01, R8
	JNE   LBB6_8
	JMP   LBB6_9

// func SubNumber_AVX2_F32(x []float32, a float32)
// Requires: AVX, AVX2, SSE
TEXT ·SubNumber_AVX2_F32(SB), NOSPLIT, $0-28
	MOVQ  x_base+0(FP), DI
	MOVSS a+24(FP), X0
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB7_11
	CMPQ  SI, $0x20
	JAE   LBB7_3
	XORL  AX, AX
	JMP   LBB7_10

LBB7_3:
	MOVQ         SI, AX
	ANDQ         $-32, AX
	VBROADCASTSS X0, Y1
	LEAQ         -32(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x05, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB7_4
	MOVQ         R8, DX
	ANDQ         $-2, DX
	XORL         CX, CX

LBB7_6:
	VMOVUPS (DI)(CX*4), Y2
	VMOVUPS 32(DI)(CX*4), Y3
	VMOVUPS 64(DI)(CX*4), Y4
	VMOVUPS 96(DI)(CX*4), Y5
	VSUBPS  Y1, Y2, Y2
	VSUBPS  Y1, Y3, Y3
	VSUBPS  Y1, Y4, Y4
	VSUBPS  Y1, Y5, Y5
	VMOVUPS Y2, (DI)(CX*4)
	VMOVUPS Y3, 32(DI)(CX*4)
	VMOVUPS Y4, 64(DI)(CX*4)
	VMOVUPS Y5, 96(DI)(CX*4)
	VMOVUPS 128(DI)(CX*4), Y2
	VMOVUPS 160(DI)(CX*4), Y3
	VMOVUPS 192(DI)(CX*4), Y4
	VMOVUPS 224(DI)(CX*4), Y5
	VSUBPS  Y1, Y2, Y2
	VSUBPS  Y1, Y3, Y3
	VSUBPS  Y1, Y4, Y4
	VSUBPS  Y1, Y5, Y5
	VMOVUPS Y2, 128(DI)(CX*4)
	VMOVUPS Y3, 160(DI)(CX*4)
	VMOVUPS Y4, 192(DI)(CX*4)
	VMOVUPS Y5, 224(DI)(CX*4)
	ADDQ    $0x40, CX
	ADDQ    $-2, DX
	JNE     LBB7_6
	TESTB   $0x01, R8
	JE      LBB7_9

LBB7_8:
	VMOVUPS (DI)(CX*4), Y2
	VMOVUPS 32(DI)(CX*4), Y3
	VMOVUPS 64(DI)(CX*4), Y4
	VMOVUPS 96(DI)(CX*4), Y5
	VSUBPS  Y1, Y2, Y2
	VSUBPS  Y1, Y3, Y3
	VSUBPS  Y1, Y4, Y4
	VSUBPS  Y1, Y5, Y1
	VMOVUPS Y2, (DI)(CX*4)
	VMOVUPS Y3, 32(DI)(CX*4)
	VMOVUPS Y4, 64(DI)(CX*4)
	VMOVUPS Y1, 96(DI)(CX*4)

LBB7_9:
	CMPQ AX, SI
	JE   LBB7_11

LBB7_10:
	VMOVSS (DI)(AX*4), X1
	VSUBSS X0, X1, X1
	VMOVSS X1, (DI)(AX*4)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB7_10

LBB7_11:
	VZEROUPPER
	RET

LBB7_4:
	XORL  CX, CX
	TESTB $0x01, R8
	JNE   LBB7_8
	JMP   LBB7_9

// func Mul_AVX2_F64(x []float64, y []float64)
// Requires: AVX
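//
// Same shape as Add_AVX2_F64 with VMULPD/VMULSD: x[i] *= y[i].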
TEXT ·Mul_AVX2_F64(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB8_7
	CMPQ  DX, $0x10
	JAE   LBB8_3
	XORL  AX, AX
	JMP   LBB8_6

LBB8_3:
	MOVQ DX, AX
	ANDQ $-16, AX
	XORL CX, CX

LBB8_4:
	VMOVUPD (DI)(CX*8), Y0
	VMOVUPD 32(DI)(CX*8), Y1
	VMOVUPD 64(DI)(CX*8), Y2
	VMOVUPD 96(DI)(CX*8), Y3
	VMULPD  (SI)(CX*8), Y0, Y0
	VMULPD  32(SI)(CX*8), Y1, Y1
	VMULPD  64(SI)(CX*8), Y2, Y2
	VMULPD  96(SI)(CX*8), Y3, Y3
	VMOVUPD Y0, (DI)(CX*8)
	VMOVUPD Y1, 32(DI)(CX*8)
	VMOVUPD Y2, 64(DI)(CX*8)
	VMOVUPD Y3, 96(DI)(CX*8)
	ADDQ    $0x10, CX
	CMPQ    AX, CX
	JNE     LBB8_4
	CMPQ    AX, DX
	JE      LBB8_7

LBB8_6:
	VMOVSD (DI)(AX*8), X0
	VMULSD (SI)(AX*8), X0, X0
	VMOVSD X0, (DI)(AX*8)
	ADDQ   $0x01, AX
	CMPQ   DX, AX
	JNE    LBB8_6

LBB8_7:
	VZEROUPPER
	RET

// func Mul_AVX2_F32(x []float32, y []float32)
// Requires: AVX
TEXT ·Mul_AVX2_F32(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB9_7
	CMPQ  DX, $0x20
	JAE   LBB9_3
	XORL  AX, AX
	JMP   LBB9_6

LBB9_3:
	MOVQ DX, AX
	ANDQ $-32, AX
	XORL CX, CX

LBB9_4:
	VMOVUPS (DI)(CX*4), Y0
	VMOVUPS 32(DI)(CX*4), Y1
	VMOVUPS 64(DI)(CX*4), Y2
	VMOVUPS 96(DI)(CX*4), Y3
	VMULPS  (SI)(CX*4), Y0, Y0
	VMULPS  32(SI)(CX*4), Y1, Y1
	VMULPS  64(SI)(CX*4), Y2, Y2
	VMULPS  96(SI)(CX*4), Y3, Y3
	VMOVUPS Y0, (DI)(CX*4)
	VMOVUPS Y1, 32(DI)(CX*4)
	VMOVUPS Y2, 64(DI)(CX*4)
	VMOVUPS Y3, 96(DI)(CX*4)
	ADDQ    $0x20, CX
	CMPQ    AX, CX
	JNE     LBB9_4
	CMPQ    AX, DX
	JE      LBB9_7

LBB9_6:
	VMOVSS (DI)(AX*4), X0
	VMULSS (SI)(AX*4), X0, X0
	VMOVSS X0, (DI)(AX*4)
	ADDQ   $0x01, AX
	CMPQ   DX, AX
	JNE    LBB9_6

LBB9_7:
	VZEROUPPER
	RET

// func MulNumber_AVX2_F64(x []float64, a float64)
// Requires: AVX, AVX2, SSE2
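//
// Same shape as AddNumber_AVX2_F64 with VMULPD against the broadcast
// multiplier: x[i] *= a.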
TEXT ·MulNumber_AVX2_F64(SB), NOSPLIT, $0-32
	MOVQ  x_base+0(FP), DI
	MOVSD a+24(FP), X0
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB10_11
	CMPQ  SI, $0x10
	JAE   LBB10_3
	XORL  AX, AX
	JMP   LBB10_10

LBB10_3:
	MOVQ         SI, AX
	ANDQ         $-16, AX
	VBROADCASTSD X0, Y1
	LEAQ         -16(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x04, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB10_4
	MOVQ         R8, DX
	ANDQ         $-2, DX
	XORL         CX, CX

LBB10_6:
	VMULPD  (DI)(CX*8), Y1, Y2
	VMULPD  32(DI)(CX*8), Y1, Y3
	VMULPD  64(DI)(CX*8), Y1, Y4
	VMULPD  96(DI)(CX*8), Y1, Y5
	VMOVUPD Y2, (DI)(CX*8)
	VMOVUPD Y3, 32(DI)(CX*8)
	VMOVUPD Y4, 64(DI)(CX*8)
	VMOVUPD Y5, 96(DI)(CX*8)
	VMULPD  128(DI)(CX*8), Y1, Y2
	VMULPD  160(DI)(CX*8), Y1, Y3
	VMULPD  192(DI)(CX*8), Y1, Y4
	VMULPD  224(DI)(CX*8), Y1, Y5
	VMOVUPD Y2, 128(DI)(CX*8)
	VMOVUPD Y3, 160(DI)(CX*8)
	VMOVUPD Y4, 192(DI)(CX*8)
	VMOVUPD Y5, 224(DI)(CX*8)
	ADDQ    $0x20, CX
	ADDQ    $-2, DX
	JNE     LBB10_6
	TESTB   $0x01, R8
	JE      LBB10_9

LBB10_8:
	VMULPD  (DI)(CX*8), Y1, Y2
	VMULPD  32(DI)(CX*8), Y1, Y3
	VMULPD  64(DI)(CX*8), Y1, Y4
	VMULPD  96(DI)(CX*8), Y1, Y1
	VMOVUPD Y2, (DI)(CX*8)
	VMOVUPD Y3, 32(DI)(CX*8)
	VMOVUPD Y4, 64(DI)(CX*8)
	VMOVUPD Y1, 96(DI)(CX*8)

LBB10_9:
	CMPQ AX, SI
	JE   LBB10_11

LBB10_10:
	VMULSD (DI)(AX*8), X0, X1
	VMOVSD X1, (DI)(AX*8)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB10_10

LBB10_11:
	VZEROUPPER
	RET

LBB10_4:
	XORL  CX, CX
	TESTB $0x01, R8
	JNE   LBB10_8
	JMP   LBB10_9

// func MulNumber_AVX2_F32(x []float32, a float32)
// Requires: AVX, AVX2, SSE
TEXT ·MulNumber_AVX2_F32(SB), NOSPLIT, $0-28
	MOVQ  x_base+0(FP), DI
	MOVSS a+24(FP), X0
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB11_11
	CMPQ  SI, $0x20
	JAE   LBB11_3
	XORL  AX, AX
	JMP   LBB11_10

LBB11_3:
	MOVQ         SI, AX
	ANDQ         $-32, AX
	VBROADCASTSS X0, Y1
	LEAQ         -32(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x05, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB11_4
	MOVQ         R8, DX
	ANDQ         $-2, DX
	XORL         CX, CX

LBB11_6:
	VMULPS  (DI)(CX*4), Y1, Y2
	VMULPS  32(DI)(CX*4), Y1, Y3
	VMULPS  64(DI)(CX*4), Y1, Y4
	VMULPS  96(DI)(CX*4), Y1, Y5
	VMOVUPS Y2, (DI)(CX*4)
	VMOVUPS Y3, 32(DI)(CX*4)
	VMOVUPS Y4, 64(DI)(CX*4)
	VMOVUPS Y5, 96(DI)(CX*4)
	VMULPS  128(DI)(CX*4), Y1, Y2
	VMULPS  160(DI)(CX*4), Y1, Y3
	VMULPS  192(DI)(CX*4), Y1, Y4
	VMULPS  224(DI)(CX*4), Y1, Y5
	VMOVUPS Y2, 128(DI)(CX*4)
	VMOVUPS Y3, 160(DI)(CX*4)
	VMOVUPS Y4, 192(DI)(CX*4)
	VMOVUPS Y5, 224(DI)(CX*4)
	ADDQ    $0x40, CX
	ADDQ    $-2, DX
	JNE     LBB11_6
	TESTB   $0x01, R8
	JE      LBB11_9

LBB11_8:
	VMULPS  (DI)(CX*4), Y1, Y2
	VMULPS  32(DI)(CX*4), Y1, Y3
	VMULPS  64(DI)(CX*4), Y1, Y4
	VMULPS  96(DI)(CX*4), Y1, Y1
	VMOVUPS Y2, (DI)(CX*4)
	VMOVUPS Y3, 32(DI)(CX*4)
	VMOVUPS Y4, 64(DI)(CX*4)
	VMOVUPS Y1, 96(DI)(CX*4)

LBB11_9:
	CMPQ AX, SI
	JE   LBB11_11

LBB11_10:
	VMULSS (DI)(AX*4), X0, X1
	VMOVSS X1, (DI)(AX*4)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB11_10

LBB11_11:
	VZEROUPPER
	RET

LBB11_4:
	XORL  CX, CX
	TESTB $0x01, R8
	JNE   LBB11_8
	JMP   LBB11_9

// func Div_AVX2_F64(x []float64, y []float64)
// Requires: AVX
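//
// True packed division: VRCPPS has no float64 counterpart short of
// AVX-512, so each block goes through VDIVPD. Division is expensive, and
// the unroll is correspondingly lighter: two 4-element blocks per
// iteration, with the usual odd-block parity check and scalar tail.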
TEXT ·Div_AVX2_F64(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB12_11
	CMPQ  DX, $0x04
	JAE   LBB12_3
	XORL  AX, AX
	JMP   LBB12_10

LBB12_3:
	MOVQ  DX, AX
	ANDQ  $-4, AX
	LEAQ  -4(AX), CX
	MOVQ  CX, R8
	SHRQ  $0x02, R8
	ADDQ  $0x01, R8
	TESTQ CX, CX
	JE    LBB12_4
	MOVQ  R8, R9
	ANDQ  $-2, R9
	XORL  CX, CX

LBB12_6:
	VMOVUPD (DI)(CX*8), Y0
	VDIVPD  (SI)(CX*8), Y0, Y0
	VMOVUPD 32(DI)(CX*8), Y1
	VMOVUPD Y0, (DI)(CX*8)
	VDIVPD  32(SI)(CX*8), Y1, Y0
	VMOVUPD Y0, 32(DI)(CX*8)
	ADDQ    $0x08, CX
	ADDQ    $-2, R9
	JNE     LBB12_6
	TESTB   $0x01, R8
	JE      LBB12_9

LBB12_8:
	VMOVUPD (DI)(CX*8), Y0
	VDIVPD  (SI)(CX*8), Y0, Y0
	VMOVUPD Y0, (DI)(CX*8)

LBB12_9:
	CMPQ AX, DX
	JE   LBB12_11

LBB12_10:
	VMOVSD (DI)(AX*8), X0
	VDIVSD (SI)(AX*8), X0, X0
	VMOVSD X0, (DI)(AX*8)
	ADDQ   $0x01, AX
	CMPQ   DX, AX
	JNE    LBB12_10

LBB12_11:
	VZEROUPPER
	RET

LBB12_4:
	XORL  CX, CX
	TESTB $0x01, R8
	JNE   LBB12_8
	JMP   LBB12_9

// func Div_AVX2_F32(x []float32, y []float32)
// Requires: AVX, FMA3
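//
// Rather than VDIVPS, each quotient is built from VRCPPS, an ~12-bit
// reciprocal estimate r of the divisor y, refined by one FMA-based
// Newton-Raphson step: q = x*r, then q' = q - r*(q*y - x), via
// VFMSUB213PS and VFNMADD213PS. The scalar tail still uses exact VDIVSS.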
TEXT ·Div_AVX2_F32(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB13_7
	CMPQ  DX, $0x20
	JAE   LBB13_3
	XORL  AX, AX
	JMP   LBB13_6

LBB13_3:
	MOVQ DX, AX
	ANDQ $-32, AX
	XORL CX, CX

LBB13_4:
	VMOVUPS      (SI)(CX*4), Y0
	VMOVUPS      32(SI)(CX*4), Y1
	VMOVUPS      64(SI)(CX*4), Y2
	VRCPPS       Y0, Y3
	VMOVUPS      96(SI)(CX*4), Y4
	VMOVUPS      (DI)(CX*4), Y5
	VMOVUPS      32(DI)(CX*4), Y6
	VMOVUPS      64(DI)(CX*4), Y7
	VMOVUPS      96(DI)(CX*4), Y8
	VMULPS       Y3, Y5, Y9
	VFMSUB213PS  Y5, Y9, Y0
	VFNMADD213PS Y9, Y3, Y0
	VRCPPS       Y1, Y3
	VMULPS       Y3, Y6, Y5
	VFMSUB213PS  Y6, Y5, Y1
	VRCPPS       Y2, Y6
	VFNMADD213PS Y5, Y3, Y1
	VMULPS       Y6, Y7, Y3
	VFMSUB213PS  Y7, Y3, Y2
	VFNMADD213PS Y3, Y6, Y2
	VRCPPS       Y4, Y3
	VMULPS       Y3, Y8, Y5
	VFMSUB213PS  Y8, Y5, Y4
	VFNMADD213PS Y5, Y3, Y4
	VMOVUPS      Y0, (DI)(CX*4)
	VMOVUPS      Y1, 32(DI)(CX*4)
	VMOVUPS      Y2, 64(DI)(CX*4)
	VMOVUPS      Y4, 96(DI)(CX*4)
	ADDQ         $0x20, CX
	CMPQ         AX, CX
	JNE          LBB13_4
	CMPQ         AX, DX
	JE           LBB13_7

LBB13_6:
	VMOVSS (DI)(AX*4), X0
	VDIVSS (SI)(AX*4), X0, X0
	VMOVSS X0, (DI)(AX*4)
	ADDQ   $0x01, AX
	CMPQ   DX, AX
	JNE    LBB13_6

LBB13_7:
	VZEROUPPER
	RET

DATA dataDivNumberF64<>+0(SB)/8, $0x3ff0000000000000
GLOBL dataDivNumberF64<>(SB), RODATA|NOPTR, $8

// func DivNumber_AVX2_F64(x []float64, a float64)
// Requires: AVX, AVX2, SSE2
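//
// dataDivNumberF64 above is the bit pattern of float64(1.0). The main
// loop multiplies by a precomputed 1.0/a, which costs two roundings and
// so can differ from true division in the last ulp; the odd trailing
// block at LBB14_8 divides directly, and the scalar tail recomputes
// 1.0/a and multiplies.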
TEXT ·DivNumber_AVX2_F64(SB), NOSPLIT, $0-32
	MOVQ  x_base+0(FP), DI
	MOVSD a+24(FP), X0
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB14_12
	CMPQ  SI, $0x04
	JAE   LBB14_3
	XORL  AX, AX
	JMP   LBB14_10

LBB14_3:
	MOVQ         SI, AX
	ANDQ         $-4, AX
	VBROADCASTSD X0, Y1
	LEAQ         -4(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x02, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB14_4
	MOVQ         R8, CX
	ANDQ         $-2, CX
	VBROADCASTSD dataDivNumberF64<>+0(SB), Y2
	VDIVPD       Y1, Y2, Y2
	XORL         DX, DX

LBB14_6:
	VMULPD  (DI)(DX*8), Y2, Y3
	VMOVUPD Y3, (DI)(DX*8)
	VMULPD  32(DI)(DX*8), Y2, Y3
	VMOVUPD Y3, 32(DI)(DX*8)
	ADDQ    $0x08, DX
	ADDQ    $-2, CX
	JNE     LBB14_6
	TESTB   $0x01, R8
	JE      LBB14_9

LBB14_8:
	VMOVUPD (DI)(DX*8), Y2
	VDIVPD  Y1, Y2, Y1
	VMOVUPD Y1, (DI)(DX*8)

LBB14_9:
	CMPQ AX, SI
	JE   LBB14_12

LBB14_10:
	VMOVSD dataDivNumberF64<>+0(SB), X1
	VDIVSD X0, X1, X0

LBB14_11:
	VMULSD (DI)(AX*8), X0, X1
	VMOVSD X1, (DI)(AX*8)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB14_11

LBB14_12:
	VZEROUPPER
	RET

LBB14_4:
	XORL  DX, DX
	TESTB $0x01, R8
	JNE   LBB14_8
	JMP   LBB14_9

DATA dataDivNumberF32<>+0(SB)/4, $0x3f800000
GLOBL dataDivNumberF32<>(SB), RODATA|NOPTR, $4

// func DivNumber_AVX2_F32(x []float32, a float32)
// Requires: AVX, AVX2, SSE
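//
// Simpler than the float64 variant: 1.0/a is computed once with VDIVSS,
// broadcast to Y1, and a single 32-element loop multiplies through; the
// scalar tail recomputes 1.0/a and multiplies element by element.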
TEXT ·DivNumber_AVX2_F32(SB), NOSPLIT, $0-28
	MOVQ  x_base+0(FP), DI
	MOVSS a+24(FP), X0
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB15_8
	CMPQ  SI, $0x20
	JAE   LBB15_3
	XORL  AX, AX
	JMP   LBB15_6

LBB15_3:
	MOVQ         SI, AX
	ANDQ         $-32, AX
	VMOVSS       dataDivNumberF32<>+0(SB), X1
	VDIVSS       X0, X1, X1
	VBROADCASTSS X1, Y1
	XORL         CX, CX

LBB15_4:
	VMULPS  (DI)(CX*4), Y1, Y2
	VMULPS  32(DI)(CX*4), Y1, Y3
	VMULPS  64(DI)(CX*4), Y1, Y4
	VMULPS  96(DI)(CX*4), Y1, Y5
	VMOVUPS Y2, (DI)(CX*4)
	VMOVUPS Y3, 32(DI)(CX*4)
	VMOVUPS Y4, 64(DI)(CX*4)
	VMOVUPS Y5, 96(DI)(CX*4)
	ADDQ    $0x20, CX
	CMPQ    AX, CX
	JNE     LBB15_4
	CMPQ    AX, SI
	JE      LBB15_8

LBB15_6:
	VMOVSS dataDivNumberF32<>+0(SB), X1
	VDIVSS X0, X1, X0

LBB15_7:
	VMULSS (DI)(AX*4), X0, X1
	VMOVSS X1, (DI)(AX*4)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB15_7

LBB15_8:
	VZEROUPPER
	RET

DATA dataAbsF64<>+0(SB)/8, $0x7fffffffffffffff
DATA dataAbsF64<>+8(SB)/8, $0x7fffffffffffffff
DATA dataAbsF64<>+16(SB)/8, $0x7fffffffffffffff
GLOBL dataAbsF64<>(SB), RODATA|NOPTR, $24

// func Abs_AVX2_F64(x []float64)
// Requires: AVX
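//
// dataAbsF64 above is 0x7fffffffffffffff: every bit set except the sign
// bit, so ANDing with it clears the sign and x[i] = |x[i]|. Bitwise AND
// is width-agnostic, hence the single-precision VANDPS mnemonic on
// float64 data; the mask is stored three times so the scalar tail can
// take a 16-byte load at offset 8.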
TEXT ·Abs_AVX2_F64(SB), NOSPLIT, $0-24
	MOVQ  x_base+0(FP), DI
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB16_8
	CMPQ  SI, $0x10
	JAE   LBB16_3
	XORL  AX, AX
	JMP   LBB16_6

LBB16_3:
	MOVQ         SI, AX
	ANDQ         $-16, AX
	XORL         CX, CX
	VBROADCASTSD dataAbsF64<>+0(SB), Y0

LBB16_4:
	VANDPS  (DI)(CX*8), Y0, Y1
	VANDPS  32(DI)(CX*8), Y0, Y2
	VANDPS  64(DI)(CX*8), Y0, Y3
	VANDPS  96(DI)(CX*8), Y0, Y4
	VMOVUPS Y1, (DI)(CX*8)
	VMOVUPS Y2, 32(DI)(CX*8)
	VMOVUPS Y3, 64(DI)(CX*8)
	VMOVUPS Y4, 96(DI)(CX*8)
	ADDQ    $0x10, CX
	CMPQ    AX, CX
	JNE     LBB16_4
	CMPQ    AX, SI
	JE      LBB16_8

LBB16_6:
	VMOVUPS dataAbsF64<>+8(SB), X0

LBB16_7:
	VMOVSD  (DI)(AX*8), X1
	VANDPS  X0, X1, X1
	VMOVLPS X1, (DI)(AX*8)
	ADDQ    $0x01, AX
	CMPQ    SI, AX
	JNE     LBB16_7

LBB16_8:
	VZEROUPPER
	RET

DATA dataAbsF32<>+0(SB)/4, $0x7fffffff
GLOBL dataAbsF32<>(SB), RODATA|NOPTR, $4

// func Abs_AVX2_F32(x []float32)
// Requires: AVX
TEXT ·Abs_AVX2_F32(SB), NOSPLIT, $0-24
	MOVQ  x_base+0(FP), DI
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB17_8
	CMPQ  SI, $0x20
	JAE   LBB17_3
	XORL  AX, AX
	JMP   LBB17_6

LBB17_3:
	MOVQ         SI, AX
	ANDQ         $-32, AX
	XORL         CX, CX
	VBROADCASTSS dataAbsF32<>+0(SB), Y0

LBB17_4:
	VANDPS  (DI)(CX*4), Y0, Y1
	VANDPS  32(DI)(CX*4), Y0, Y2
	VANDPS  64(DI)(CX*4), Y0, Y3
	VANDPS  96(DI)(CX*4), Y0, Y4
	VMOVUPS Y1, (DI)(CX*4)
	VMOVUPS Y2, 32(DI)(CX*4)
	VMOVUPS Y3, 64(DI)(CX*4)
	VMOVUPS Y4, 96(DI)(CX*4)
	ADDQ    $0x20, CX
	CMPQ    AX, CX
	JNE     LBB17_4
	CMPQ    AX, SI
	JE      LBB17_8

LBB17_6:
	VBROADCASTSS dataAbsF32<>+0(SB), X0

LBB17_7:
	VMOVSS (DI)(AX*4), X1
	VANDPS X0, X1, X1
	VMOVSS X1, (DI)(AX*4)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB17_7

LBB17_8:
	VZEROUPPER
	RET

DATA dataNegF64<>+0(SB)/8, $0x8000000000000000
DATA dataNegF64<>+8(SB)/8, $0x8000000000000000
DATA dataNegF64<>+16(SB)/8, $0x8000000000000000
GLOBL dataNegF64<>(SB), RODATA|NOPTR, $24

// func Neg_AVX2_F64(x []float64)
// Requires: AVX
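//
// Negation by XOR with the sign bit (dataNegF64 = 0x8000000000000000):
// x[i] = -x[i]. Same unrolled-by-two block structure as the kernels above.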
TEXT ·Neg_AVX2_F64(SB), NOSPLIT, $0-24
	MOVQ  x_base+0(FP), DI
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB18_12
	CMPQ  SI, $0x10
	JAE   LBB18_3
	XORL  AX, AX
	JMP   LBB18_10

LBB18_3:
	MOVQ         SI, AX
	ANDQ         $-16, AX
	LEAQ         -16(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x04, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB18_4
	MOVQ         R8, DX
	ANDQ         $-2, DX
	XORL         CX, CX
	VBROADCASTSD dataNegF64<>+0(SB), Y0

LBB18_6:
	VXORPS  (DI)(CX*8), Y0, Y1
	VXORPS  32(DI)(CX*8), Y0, Y2
	VXORPS  64(DI)(CX*8), Y0, Y3
	VXORPS  96(DI)(CX*8), Y0, Y4
	VMOVUPS Y1, (DI)(CX*8)
	VMOVUPS Y2, 32(DI)(CX*8)
	VMOVUPS Y3, 64(DI)(CX*8)
	VMOVUPS Y4, 96(DI)(CX*8)
	VXORPS  128(DI)(CX*8), Y0, Y1
	VXORPS  160(DI)(CX*8), Y0, Y2
	VXORPS  192(DI)(CX*8), Y0, Y3
	VXORPS  224(DI)(CX*8), Y0, Y4
	VMOVUPS Y1, 128(DI)(CX*8)
	VMOVUPS Y2, 160(DI)(CX*8)
	VMOVUPS Y3, 192(DI)(CX*8)
	VMOVUPS Y4, 224(DI)(CX*8)
	ADDQ    $0x20, CX
	ADDQ    $-2, DX
	JNE     LBB18_6
	TESTB   $0x01, R8
	JE      LBB18_9

LBB18_8:
	VBROADCASTSD dataNegF64<>+0(SB), Y0
	VXORPS       (DI)(CX*8), Y0, Y1
	VXORPS       32(DI)(CX*8), Y0, Y2
	VXORPS       64(DI)(CX*8), Y0, Y3
	VXORPS       96(DI)(CX*8), Y0, Y0
	VMOVUPS      Y1, (DI)(CX*8)
	VMOVUPS      Y2, 32(DI)(CX*8)
	VMOVUPS      Y3, 64(DI)(CX*8)
	VMOVUPS      Y0, 96(DI)(CX*8)

LBB18_9:
	CMPQ AX, SI
	JE   LBB18_12

LBB18_10:
	VMOVUPS dataNegF64<>+8(SB), X0

LBB18_11:
	VMOVSD  (DI)(AX*8), X1
	VXORPS  X0, X1, X1
	VMOVLPS X1, (DI)(AX*8)
	ADDQ    $0x01, AX
	CMPQ    SI, AX
	JNE     LBB18_11

LBB18_12:
	VZEROUPPER
	RET

LBB18_4:
	XORL  CX, CX
	TESTB $0x01, R8
	JNE   LBB18_8
	JMP   LBB18_9

DATA dataNegF32<>+0(SB)/4, $0x80000000
GLOBL dataNegF32<>(SB), RODATA|NOPTR, $4

// func Neg_AVX2_F32(x []float32)
// Requires: AVX
TEXT ·Neg_AVX2_F32(SB), NOSPLIT, $0-24
	MOVQ  x_base+0(FP), DI
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB19_12
	CMPQ  SI, $0x20
	JAE   LBB19_3
	XORL  AX, AX
	JMP   LBB19_10

LBB19_3:
	MOVQ         SI, AX
	ANDQ         $-32, AX
	LEAQ         -32(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x05, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB19_4
	MOVQ         R8, DX
	ANDQ         $-2, DX
	XORL         CX, CX
	VBROADCASTSS dataNegF32<>+0(SB), Y0

LBB19_6:
	VXORPS  (DI)(CX*4), Y0, Y1
	VXORPS  32(DI)(CX*4), Y0, Y2
	VXORPS  64(DI)(CX*4), Y0, Y3
	VXORPS  96(DI)(CX*4), Y0, Y4
	VMOVUPS Y1, (DI)(CX*4)
	VMOVUPS Y2, 32(DI)(CX*4)
	VMOVUPS Y3, 64(DI)(CX*4)
	VMOVUPS Y4, 96(DI)(CX*4)
	VXORPS  128(DI)(CX*4), Y0, Y1
	VXORPS  160(DI)(CX*4), Y0, Y2
	VXORPS  192(DI)(CX*4), Y0, Y3
	VXORPS  224(DI)(CX*4), Y0, Y4
	VMOVUPS Y1, 128(DI)(CX*4)
	VMOVUPS Y2, 160(DI)(CX*4)
	VMOVUPS Y3, 192(DI)(CX*4)
	VMOVUPS Y4, 224(DI)(CX*4)
	ADDQ    $0x40, CX
	ADDQ    $-2, DX
	JNE     LBB19_6
	TESTB   $0x01, R8
	JE      LBB19_9

LBB19_8:
	VBROADCASTSS dataNegF32<>+0(SB), Y0
	VXORPS       (DI)(CX*4), Y0, Y1
	VXORPS       32(DI)(CX*4), Y0, Y2
	VXORPS       64(DI)(CX*4), Y0, Y3
	VXORPS       96(DI)(CX*4), Y0, Y0
	VMOVUPS      Y1, (DI)(CX*4)
	VMOVUPS      Y2, 32(DI)(CX*4)
	VMOVUPS      Y3, 64(DI)(CX*4)
	VMOVUPS      Y0, 96(DI)(CX*4)

LBB19_9:
	CMPQ AX, SI
	JE   LBB19_12

LBB19_10:
	VBROADCASTSS dataNegF32<>+0(SB), X0

LBB19_11:
	VMOVSS (DI)(AX*4), X1
	VXORPS X0, X1, X1
	VMOVSS X1, (DI)(AX*4)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB19_11

LBB19_12:
	VZEROUPPER
	RET

LBB19_4:
	XORL  CX, CX
	TESTB $0x01, R8
	JNE   LBB19_8
	JMP   LBB19_9

DATA dataInvF64<>+0(SB)/8, $0x3ff0000000000000
GLOBL dataInvF64<>(SB), RODATA|NOPTR, $8

// func Inv_AVX2_F64(x []float64)
// Requires: AVX
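//
// In-place reciprocal, x[i] = 1.0/x[i], using exact VDIVPD against a
// broadcast 1.0 (dataInvF64); two 4-element blocks per iteration plus
// the usual parity and scalar-tail handling.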
TEXT ·Inv_AVX2_F64(SB), NOSPLIT, $0-24
	MOVQ  x_base+0(FP), DI
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB20_12
	CMPQ  SI, $0x04
	JAE   LBB20_3
	XORL  AX, AX
	JMP   LBB20_10

LBB20_3:
	MOVQ         SI, AX
	ANDQ         $-4, AX
	LEAQ         -4(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x02, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB20_4
	MOVQ         R8, CX
	ANDQ         $-2, CX
	XORL         DX, DX
	VBROADCASTSD dataInvF64<>+0(SB), Y0

LBB20_6:
	VDIVPD  (DI)(DX*8), Y0, Y1
	VMOVUPD Y1, (DI)(DX*8)
	VDIVPD  32(DI)(DX*8), Y0, Y1
	VMOVUPD Y1, 32(DI)(DX*8)
	ADDQ    $0x08, DX
	ADDQ    $-2, CX
	JNE     LBB20_6
	TESTB   $0x01, R8
	JE      LBB20_9

LBB20_8:
	VBROADCASTSD dataInvF64<>+0(SB), Y0
	VDIVPD       (DI)(DX*8), Y0, Y0
	VMOVUPD      Y0, (DI)(DX*8)

LBB20_9:
	CMPQ AX, SI
	JE   LBB20_12

LBB20_10:
	VMOVSD dataInvF64<>+0(SB), X0

LBB20_11:
	VDIVSD (DI)(AX*8), X0, X1
	VMOVSD X1, (DI)(AX*8)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB20_11

LBB20_12:
	VZEROUPPER
	RET

LBB20_4:
	XORL  DX, DX
	TESTB $0x01, R8
	JNE   LBB20_8
	JMP   LBB20_9

DATA dataInvF32<>+0(SB)/4, $0x3f800000
GLOBL dataInvF32<>(SB), RODATA|NOPTR, $4

// func Inv_AVX2_F32(x []float32)
// Requires: AVX, FMA3
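//
// As in Div_AVX2_F32, VRCPPS supplies an estimate r of 1/x that one
// Newton-Raphson step refines: r' = r - r*(r*x - 1) = r*(2 - r*x),
// built from VFMSUB213PS and VFNMADD132PS against the broadcast 1.0.
// The scalar tail uses exact VDIVSS.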
TEXT ·Inv_AVX2_F32(SB), NOSPLIT, $0-24
	MOVQ  x_base+0(FP), DI
	MOVQ  x_len+8(FP), SI
	TESTQ SI, SI
	JE    LBB21_8
	CMPQ  SI, $0x20
	JAE   LBB21_3
	XORL  AX, AX
	JMP   LBB21_6

LBB21_3:
	MOVQ         SI, AX
	ANDQ         $-32, AX
	XORL         CX, CX
	VBROADCASTSS dataInvF32<>+0(SB), Y0

LBB21_4:
	VMOVUPS      (DI)(CX*4), Y1
	VMOVUPS      32(DI)(CX*4), Y2
	VMOVUPS      64(DI)(CX*4), Y3
	VRCPPS       Y1, Y4
	VFMSUB213PS  Y0, Y4, Y1
	VRCPPS       Y2, Y5
	VFNMADD132PS Y4, Y4, Y1
	VMOVUPS      96(DI)(CX*4), Y4
	VFMSUB213PS  Y0, Y5, Y2
	VFNMADD132PS Y5, Y5, Y2
	VRCPPS       Y3, Y5
	VFMSUB213PS  Y0, Y5, Y3
	VFNMADD132PS Y5, Y5, Y3
	VRCPPS       Y4, Y5
	VFMSUB213PS  Y0, Y5, Y4
	VFNMADD132PS Y5, Y5, Y4
	VMOVUPS      Y1, (DI)(CX*4)
	VMOVUPS      Y2, 32(DI)(CX*4)
	VMOVUPS      Y3, 64(DI)(CX*4)
	VMOVUPS      Y4, 96(DI)(CX*4)
	ADDQ         $0x20, CX
	CMPQ         AX, CX
	JNE          LBB21_4
	CMPQ         AX, SI
	JE           LBB21_8

LBB21_6:
	VMOVSS dataInvF32<>+0(SB), X0

LBB21_7:
	VDIVSS (DI)(AX*4), X0, X1
	VMOVSS X1, (DI)(AX*4)
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB21_7

LBB21_8:
	VZEROUPPER
	RET

// func Sum_AVX2_F64(x []float64) float64
// Requires: AVX, SSE2
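//
// Four independent accumulators (Y0..Y3) hide VADDPD latency; the main
// loop consumes two 16-element blocks per iteration, with the usual
// parity check for an odd trailing block. LBB0_10 then folds the four
// registers and reduces horizontally (VEXTRACTF128 folds the upper 128
// bits, VPERMILPD the remaining pair). The accumulation order differs
// from a left-to-right scalar sum, so the result can differ in the last
// ulp from:
//
//	var s float64
//	for _, v := range x {
//		s += v
//	}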
TEXT ·Sum_AVX2_F64(SB), NOSPLIT, $0-32
	MOVQ   x_base+0(FP), DI
	MOVQ   x_len+8(FP), SI
	TESTQ  SI, SI
	JE     LBB0_1
	CMPQ   SI, $0x10
	JAE    LBB0_4
	VXORPD X0, X0, X0
	XORL   AX, AX
	JMP    LBB0_11

LBB0_1:
	VXORPS X0, X0, X0
	MOVSD  X0, ret+24(FP)
	RET

LBB0_4:
	MOVQ   SI, AX
	ANDQ   $-16, AX
	LEAQ   -16(AX), CX
	MOVQ   CX, R8
	SHRQ   $0x04, R8
	ADDQ   $0x01, R8
	TESTQ  CX, CX
	JE     LBB0_5
	MOVQ   R8, CX
	ANDQ   $-2, CX
	VXORPD X0, X0, X0
	XORL   DX, DX
	VXORPD X1, X1, X1
	VXORPD X2, X2, X2
	VXORPD X3, X3, X3

LBB0_7:
	VADDPD (DI)(DX*8), Y0, Y0
	VADDPD 32(DI)(DX*8), Y1, Y1
	VADDPD 64(DI)(DX*8), Y2, Y2
	VADDPD 96(DI)(DX*8), Y3, Y3
	VADDPD 128(DI)(DX*8), Y0, Y0
	VADDPD 160(DI)(DX*8), Y1, Y1
	VADDPD 192(DI)(DX*8), Y2, Y2
	VADDPD 224(DI)(DX*8), Y3, Y3
	ADDQ   $0x20, DX
	ADDQ   $-2, CX
	JNE    LBB0_7
	TESTB  $0x01, R8
	JE     LBB0_10

LBB0_9:
	VADDPD (DI)(DX*8), Y0, Y0
	VADDPD 32(DI)(DX*8), Y1, Y1
	VADDPD 64(DI)(DX*8), Y2, Y2
	VADDPD 96(DI)(DX*8), Y3, Y3

LBB0_10:
	VADDPD       Y3, Y1, Y1
	VADDPD       Y2, Y0, Y0
	VADDPD       Y1, Y0, Y0
	VEXTRACTF128 $0x01, Y0, X1
	VADDPD       X1, X0, X0
	VPERMILPD    $0x01, X0, X1
	VADDSD       X1, X0, X0
	CMPQ         AX, SI
	JE           LBB0_12

LBB0_11:
	VADDSD (DI)(AX*8), X0, X0
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB0_11

LBB0_12:
	VZEROUPPER
	MOVSD X0, ret+24(FP)
	RET

LBB0_5:
	VXORPD X0, X0, X0
	XORL   DX, DX
	VXORPD X1, X1, X1
	VXORPD X2, X2, X2
	VXORPD X3, X3, X3
	TESTB  $0x01, R8
	JNE    LBB0_9
	JMP    LBB0_10

// func Sum_AVX2_F32(x []float32) float32
// Requires: AVX, SSE
TEXT ·Sum_AVX2_F32(SB), NOSPLIT, $0-28
	MOVQ   x_base+0(FP), DI
	MOVQ   x_len+8(FP), SI
	TESTQ  SI, SI
	JE     LBB1_1
	CMPQ   SI, $0x20
	JAE    LBB1_4
	VXORPS X0, X0, X0
	XORL   AX, AX
	JMP    LBB1_11

LBB1_1:
	VXORPS X0, X0, X0
	MOVSS  X0, ret+24(FP)
	RET

LBB1_4:
	MOVQ   SI, AX
	ANDQ   $-32, AX
	LEAQ   -32(AX), CX
	MOVQ   CX, R8
	SHRQ   $0x05, R8
	ADDQ   $0x01, R8
	TESTQ  CX, CX
	JE     LBB1_5
	MOVQ   R8, CX
	ANDQ   $-2, CX
	VXORPS X0, X0, X0
	XORL   DX, DX
	VXORPS X1, X1, X1
	VXORPS X2, X2, X2
	VXORPS X3, X3, X3

LBB1_7:
	VADDPS (DI)(DX*4), Y0, Y0
	VADDPS 32(DI)(DX*4), Y1, Y1
	VADDPS 64(DI)(DX*4), Y2, Y2
	VADDPS 96(DI)(DX*4), Y3, Y3
	VADDPS 128(DI)(DX*4), Y0, Y0
	VADDPS 160(DI)(DX*4), Y1, Y1
	VADDPS 192(DI)(DX*4), Y2, Y2
	VADDPS 224(DI)(DX*4), Y3, Y3
	ADDQ   $0x40, DX
	ADDQ   $-2, CX
	JNE    LBB1_7
	TESTB  $0x01, R8
	JE     LBB1_10

LBB1_9:
	VADDPS (DI)(DX*4), Y0, Y0
	VADDPS 32(DI)(DX*4), Y1, Y1
	VADDPS 64(DI)(DX*4), Y2, Y2
	VADDPS 96(DI)(DX*4), Y3, Y3

LBB1_10:
	VADDPS       Y3, Y1, Y1
	VADDPS       Y2, Y0, Y0
	VADDPS       Y1, Y0, Y0
	VEXTRACTF128 $0x01, Y0, X1
	VADDPS       X1, X0, X0
	VPERMILPD    $0x01, X0, X1
	VADDPS       X1, X0, X0
	VMOVSHDUP    X0, X1
	VADDSS       X1, X0, X0
	CMPQ         AX, SI
	JE           LBB1_12

LBB1_11:
	VADDSS (DI)(AX*4), X0, X0
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB1_11

LBB1_12:
	VZEROUPPER
	MOVSS X0, ret+24(FP)
	RET

LBB1_5:
	VXORPS X0, X0, X0
	XORL   DX, DX
	VXORPS X1, X1, X1
	VXORPS X2, X2, X2
	VXORPS X3, X3, X3
	TESTB  $0x01, R8
	JNE    LBB1_9
	JMP    LBB1_10

// func CumSum_AVX2_F64(x []float64)
// Requires: AVX
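//
// A prefix sum carries a serial dependency (each output needs the
// previous one), so no packed arithmetic is used: the loop is scalar
// VADDSD unrolled four deep, with a remainder loop for len%4. In effect:
//
//	s := 0.0
//	for i := range x {
//		s += x[i]
//		x[i] = s
//	}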
TEXT ·CumSum_AVX2_F64(SB), NOSPLIT, $0-24
	MOVQ   x_base+0(FP), DI
	MOVQ   x_len+8(FP), SI
	TESTQ  SI, SI
	JE     LBB2_8
	LEAQ   -1(SI), CX
	MOVL   SI, AX
	ANDL   $0x03, AX
	CMPQ   CX, $0x03
	JAE    LBB2_3
	VXORPD X0, X0, X0
	XORL   CX, CX
	JMP    LBB2_5

LBB2_3:
	ANDQ   $-4, SI
	VXORPD X0, X0, X0
	XORL   CX, CX

LBB2_4:
	VADDSD (DI)(CX*8), X0, X0
	VMOVSD X0, (DI)(CX*8)
	VADDSD 8(DI)(CX*8), X0, X0
	VMOVSD X0, 8(DI)(CX*8)
	VADDSD 16(DI)(CX*8), X0, X0
	VMOVSD X0, 16(DI)(CX*8)
	VADDSD 24(DI)(CX*8), X0, X0
	VMOVSD X0, 24(DI)(CX*8)
	ADDQ   $0x04, CX
	CMPQ   SI, CX
	JNE    LBB2_4

LBB2_5:
	TESTQ AX, AX
	JE    LBB2_8
	LEAQ  (DI)(CX*8), CX
	XORL  DX, DX

LBB2_7:
	VADDSD (CX)(DX*8), X0, X0
	VMOVSD X0, (CX)(DX*8)
	ADDQ   $0x01, DX
	CMPQ   AX, DX
	JNE    LBB2_7

LBB2_8:
	RET

// func CumSum_AVX2_F32(x []float32)
// Requires: AVX
TEXT ·CumSum_AVX2_F32(SB), NOSPLIT, $0-24
	MOVQ   x_base+0(FP), DI
	MOVQ   x_len+8(FP), SI
	TESTQ  SI, SI
	JE     LBB3_8
	LEAQ   -1(SI), CX
	MOVL   SI, AX
	ANDL   $0x03, AX
	CMPQ   CX, $0x03
	JAE    LBB3_3
	VXORPS X0, X0, X0
	XORL   CX, CX
	JMP    LBB3_5

LBB3_3:
	ANDQ   $-4, SI
	VXORPS X0, X0, X0
	XORL   CX, CX

LBB3_4:
	VADDSS (DI)(CX*4), X0, X0
	VMOVSS X0, (DI)(CX*4)
	VADDSS 4(DI)(CX*4), X0, X0
	VMOVSS X0, 4(DI)(CX*4)
	VADDSS 8(DI)(CX*4), X0, X0
	VMOVSS X0, 8(DI)(CX*4)
	VADDSS 12(DI)(CX*4), X0, X0
	VMOVSS X0, 12(DI)(CX*4)
	ADDQ   $0x04, CX
	CMPQ   SI, CX
	JNE    LBB3_4

LBB3_5:
	TESTQ AX, AX
	JE    LBB3_8
	LEAQ  (DI)(CX*4), CX
	XORL  DX, DX

LBB3_7:
	VADDSS (CX)(DX*4), X0, X0
	VMOVSS X0, (CX)(DX*4)
	ADDQ   $0x01, DX
	CMPQ   AX, DX
	JNE    LBB3_7

LBB3_8:
	RET

DATA dataProdF64<>+0(SB)/8, $0x3ff0000000000000
GLOBL dataProdF64<>(SB), RODATA|NOPTR, $8

// func Prod_AVX2_F64(x []float64) float64
// Requires: AVX, SSE2
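//
// Same skeleton as Sum_AVX2_F64, but the four accumulators start at 1.0
// (dataProdF64 = 0x3ff0000000000000) and combine with VMULPD; an empty
// slice returns 1.0, the multiplicative identity.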
TEXT ·Prod_AVX2_F64(SB), NOSPLIT, $0-32
	MOVQ   x_base+0(FP), DI
	MOVQ   x_len+8(FP), SI
	TESTQ  SI, SI
	JE     LBB4_1
	CMPQ   SI, $0x10
	JAE    LBB4_4
	VMOVSD dataProdF64<>+0(SB), X0
	XORL   AX, AX
	JMP    LBB4_11

LBB4_1:
	VMOVSD dataProdF64<>+0(SB), X0
	MOVSD  X0, ret+24(FP)
	RET

LBB4_4:
	MOVQ         SI, AX
	ANDQ         $-16, AX
	LEAQ         -16(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x04, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB4_5
	MOVQ         R8, CX
	ANDQ         $-2, CX
	VBROADCASTSD dataProdF64<>+0(SB), Y0
	XORL         DX, DX
	VMOVAPD      Y0, Y1
	VMOVAPD      Y0, Y2
	VMOVAPD      Y0, Y3

LBB4_7:
	VMULPD (DI)(DX*8), Y0, Y0
	VMULPD 32(DI)(DX*8), Y1, Y1
	VMULPD 64(DI)(DX*8), Y2, Y2
	VMULPD 96(DI)(DX*8), Y3, Y3
	VMULPD 128(DI)(DX*8), Y0, Y0
	VMULPD 160(DI)(DX*8), Y1, Y1
	VMULPD 192(DI)(DX*8), Y2, Y2
	VMULPD 224(DI)(DX*8), Y3, Y3
	ADDQ   $0x20, DX
	ADDQ   $-2, CX
	JNE    LBB4_7
	TESTB  $0x01, R8
	JE     LBB4_10

LBB4_9:
	VMULPD (DI)(DX*8), Y0, Y0
	VMULPD 32(DI)(DX*8), Y1, Y1
	VMULPD 64(DI)(DX*8), Y2, Y2
	VMULPD 96(DI)(DX*8), Y3, Y3

LBB4_10:
	VMULPD       Y3, Y1, Y1
	VMULPD       Y2, Y0, Y0
	VMULPD       Y1, Y0, Y0
	VEXTRACTF128 $0x01, Y0, X1
	VMULPD       X1, X0, X0
	VPERMILPD    $0x01, X0, X1
	VMULSD       X1, X0, X0
	CMPQ         AX, SI
	JE           LBB4_12

LBB4_11:
	VMULSD (DI)(AX*8), X0, X0
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB4_11

LBB4_12:
	VZEROUPPER
	MOVSD X0, ret+24(FP)
	RET

LBB4_5:
	VBROADCASTSD dataProdF64<>+0(SB), Y0
	XORL         DX, DX
	VMOVAPD      Y0, Y1
	VMOVAPD      Y0, Y2
	VMOVAPD      Y0, Y3
	TESTB        $0x01, R8
	JNE          LBB4_9
	JMP          LBB4_10

DATA dataProdF32<>+0(SB)/4, $0x3f800000
GLOBL dataProdF32<>(SB), RODATA|NOPTR, $4

// func Prod_AVX2_F32(x []float32) float32
// Requires: AVX, SSE
TEXT ·Prod_AVX2_F32(SB), NOSPLIT, $0-28
	MOVQ   x_base+0(FP), DI
	MOVQ   x_len+8(FP), SI
	TESTQ  SI, SI
	JE     LBB5_1
	CMPQ   SI, $0x20
	JAE    LBB5_4
	VMOVSS dataProdF32<>+0(SB), X0
	XORL   AX, AX
	JMP    LBB5_11

LBB5_1:
	VMOVSS dataProdF32<>+0(SB), X0
	MOVSS  X0, ret+24(FP)
	RET

LBB5_4:
	MOVQ         SI, AX
	ANDQ         $-32, AX
	LEAQ         -32(AX), CX
	MOVQ         CX, R8
	SHRQ         $0x05, R8
	ADDQ         $0x01, R8
	TESTQ        CX, CX
	JE           LBB5_5
	MOVQ         R8, CX
	ANDQ         $-2, CX
	VBROADCASTSS dataProdF32<>+0(SB), Y0
	XORL         DX, DX
	VMOVAPS      Y0, Y1
	VMOVAPS      Y0, Y2
	VMOVAPS      Y0, Y3

LBB5_7:
	VMULPS (DI)(DX*4), Y0, Y0
	VMULPS 32(DI)(DX*4), Y1, Y1
	VMULPS 64(DI)(DX*4), Y2, Y2
	VMULPS 96(DI)(DX*4), Y3, Y3
	VMULPS 128(DI)(DX*4), Y0, Y0
	VMULPS 160(DI)(DX*4), Y1, Y1
	VMULPS 192(DI)(DX*4), Y2, Y2
	VMULPS 224(DI)(DX*4), Y3, Y3
	ADDQ   $0x40, DX
	ADDQ   $-2, CX
	JNE    LBB5_7
	TESTB  $0x01, R8
	JE     LBB5_10

LBB5_9:
	VMULPS (DI)(DX*4), Y0, Y0
	VMULPS 32(DI)(DX*4), Y1, Y1
	VMULPS 64(DI)(DX*4), Y2, Y2
	VMULPS 96(DI)(DX*4), Y3, Y3

LBB5_10:
	VMULPS       Y3, Y1, Y1
	VMULPS       Y2, Y0, Y0
	VMULPS       Y1, Y0, Y0
	VEXTRACTF128 $0x01, Y0, X1
	VMULPS       X1, X0, X0
	VPERMILPD    $0x01, X0, X1
	VMULPS       X1, X0, X0
	VMOVSHDUP    X0, X1
	VMULSS       X1, X0, X0
	CMPQ         AX, SI
	JE           LBB5_12

LBB5_11:
	VMULSS (DI)(AX*4), X0, X0
	ADDQ   $0x01, AX
	CMPQ   SI, AX
	JNE    LBB5_11

LBB5_12:
	VZEROUPPER
	MOVSS X0, ret+24(FP)
	RET

LBB5_5:
	VBROADCASTSS dataProdF32<>+0(SB), Y0
	XORL         DX, DX
	VMOVAPS      Y0, Y1
	VMOVAPS      Y0, Y2
	VMOVAPS      Y0, Y3
	TESTB        $0x01, R8
	JNE          LBB5_9
	JMP          LBB5_10

DATA dataCumProdF64<>+0(SB)/8, $0x3ff0000000000000
GLOBL dataCumProdF64<>(SB), RODATA|NOPTR, $8

// func CumProd_AVX2_F64(x []float64)
// Requires: AVX
TEXT ·CumProd_AVX2_F64(SB), NOSPLIT, $0-24
	MOVQ   x_base+0(FP), DI
	MOVQ   x_len+8(FP), SI
	TESTQ  SI, SI
	JE     LBB6_8
	LEAQ   -1(SI), CX
	MOVL   SI, AX
	ANDL   $0x03, AX
	CMPQ   CX, $0x03
	JAE    LBB6_3
	VMOVSD dataCumProdF64<>+0(SB), X0
	XORL   CX, CX
	JMP    LBB6_5

LBB6_3:
	ANDQ   $-4, SI
	VMOVSD dataCumProdF64<>+0(SB), X0
	XORL   CX, CX

LBB6_4:
	VMULSD (DI)(CX*8), X0, X0
	VMOVSD X0, (DI)(CX*8)
	VMULSD 8(DI)(CX*8), X0, X0
	VMOVSD X0, 8(DI)(CX*8)
	VMULSD 16(DI)(CX*8), X0, X0
	VMOVSD X0, 16(DI)(CX*8)
	VMULSD 24(DI)(CX*8), X0, X0
	VMOVSD X0, 24(DI)(CX*8)
	ADDQ   $0x04, CX
	CMPQ   SI, CX
	JNE    LBB6_4

LBB6_5:
	TESTQ AX, AX
	JE    LBB6_8
	LEAQ  (DI)(CX*8), CX
	XORL  DX, DX

LBB6_7:
	VMULSD (CX)(DX*8), X0, X0
	VMOVSD X0, (CX)(DX*8)
	ADDQ   $0x01, DX
	CMPQ   AX, DX
	JNE    LBB6_7

LBB6_8:
	RET

DATA dataCumProdF32<>+0(SB)/4, $0x3f800000
GLOBL dataCumProdF32<>(SB), RODATA|NOPTR, $4

// func CumProd_AVX2_F32(x []float32)
// Requires: AVX
TEXT ·CumProd_AVX2_F32(SB), NOSPLIT, $0-24
	MOVQ   x_base+0(FP), DI
	MOVQ   x_len+8(FP), SI
	TESTQ  SI, SI
	JE     LBB7_8
	LEAQ   -1(SI), CX
	MOVL   SI, AX
	ANDL   $0x03, AX
	CMPQ   CX, $0x03
	JAE    LBB7_3
	VMOVSS dataCumProdF32<>+0(SB), X0
	XORL   CX, CX
	JMP    LBB7_5

LBB7_3:
	ANDQ   $-4, SI
	VMOVSS dataCumProdF32<>+0(SB), X0
	XORL   CX, CX

LBB7_4:
	VMULSS (DI)(CX*4), X0, X0
	VMOVSS X0, (DI)(CX*4)
	VMULSS 4(DI)(CX*4), X0, X0
	VMOVSS X0, 4(DI)(CX*4)
	VMULSS 8(DI)(CX*4), X0, X0
	VMOVSS X0, 8(DI)(CX*4)
	VMULSS 12(DI)(CX*4), X0, X0
	VMOVSS X0, 12(DI)(CX*4)
	ADDQ   $0x04, CX
	CMPQ   SI, CX
	JNE    LBB7_4

LBB7_5:
	TESTQ AX, AX
	JE    LBB7_8
	LEAQ  (DI)(CX*4), CX
	XORL  DX, DX

LBB7_7:
	VMULSS (CX)(DX*4), X0, X0
	VMOVSS X0, (CX)(DX*4)
	ADDQ   $0x01, DX
	CMPQ   AX, DX
	JNE    LBB7_7

LBB7_8:
	RET

// func Dot_AVX2_F64(x []float64, y []float64) float64
// Requires: AVX, FMA3, SSE2
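//
// Fused multiply-add dot product: each block of y is loaded and
// VFMADD231PD accumulates x*y into one of four independent accumulators,
// reduced horizontally as in Sum_AVX2_F64; the remainder uses
// VFMADD231SD. Roughly:
//
//	var s float64
//	for i := range x {
//		s += x[i] * y[i]
//	}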
  2052  TEXT ·Dot_AVX2_F64(SB), NOSPLIT, $0-56
  2053  	MOVQ   x_base+0(FP), DI
  2054  	MOVQ   y_base+24(FP), SI
  2055  	MOVQ   x_len+8(FP), DX
  2056  	TESTQ  DX, DX
  2057  	JE     LBB0_1
  2058  	CMPQ   DX, $0x10
  2059  	JAE    LBB0_4
  2060  	VXORPD X0, X0, X0
  2061  	XORL   AX, AX
  2062  	JMP    LBB0_7
  2063  
  2064  LBB0_1:
  2065  	VXORPS X0, X0, X0
  2066  	MOVSD  X0, ret+48(FP)
  2067  	RET
  2068  
  2069  LBB0_4:
  2070  	MOVQ   DX, AX
  2071  	ANDQ   $-16, AX
  2072  	VXORPD X0, X0, X0
  2073  	XORL   CX, CX
  2074  	VXORPD X1, X1, X1
  2075  	VXORPD X2, X2, X2
  2076  	VXORPD X3, X3, X3
  2077  
  2078  LBB0_5:
  2079  	VMOVUPD      (SI)(CX*8), Y4
  2080  	VMOVUPD      32(SI)(CX*8), Y5
  2081  	VMOVUPD      64(SI)(CX*8), Y6
  2082  	VMOVUPD      96(SI)(CX*8), Y7
  2083  	VFMADD231PD  (DI)(CX*8), Y4, Y0
  2084  	VFMADD231PD  32(DI)(CX*8), Y5, Y1
  2085  	VFMADD231PD  64(DI)(CX*8), Y6, Y2
  2086  	VFMADD231PD  96(DI)(CX*8), Y7, Y3
  2087  	ADDQ         $0x10, CX
  2088  	CMPQ         AX, CX
  2089  	JNE          LBB0_5
  2090  	VADDPD       Y0, Y1, Y0
  2091  	VADDPD       Y0, Y2, Y0
  2092  	VADDPD       Y0, Y3, Y0
  2093  	VEXTRACTF128 $0x01, Y0, X1
  2094  	VADDPD       X1, X0, X0
  2095  	VPERMILPD    $0x01, X0, X1
  2096  	VADDSD       X1, X0, X0
  2097  	CMPQ         AX, DX
  2098  	JE           LBB0_8
  2099  
  2100  LBB0_7:
  2101  	VMOVSD      (SI)(AX*8), X1
  2102  	VFMADD231SD (DI)(AX*8), X1, X0
  2103  	ADDQ        $0x01, AX
  2104  	CMPQ        DX, AX
  2105  	JNE         LBB0_7
  2106  
  2107  LBB0_8:
  2108  	VZEROUPPER
  2109  	MOVSD X0, ret+48(FP)
  2110  	RET
  2111  
  2112  // func Dot_AVX2_F32(x []float32, y []float32) float32
  2113  // Requires: AVX, FMA3, SSE
  2114  TEXT ·Dot_AVX2_F32(SB), NOSPLIT, $0-52
  2115  	MOVQ   x_base+0(FP), DI
  2116  	MOVQ   y_base+24(FP), SI
  2117  	MOVQ   x_len+8(FP), DX
  2118  	TESTQ  DX, DX
  2119  	JE     LBB1_1
  2120  	CMPQ   DX, $0x20
  2121  	JAE    LBB1_4
  2122  	VXORPS X0, X0, X0
  2123  	XORL   AX, AX
  2124  	JMP    LBB1_7
  2125  
  2126  LBB1_1:
  2127  	VXORPS X0, X0, X0
  2128  	MOVSS  X0, ret+48(FP)
  2129  	RET
  2130  
  2131  LBB1_4:
  2132  	MOVQ   DX, AX
  2133  	ANDQ   $-32, AX
  2134  	VXORPS X0, X0, X0
  2135  	XORL   CX, CX
  2136  	VXORPS X1, X1, X1
  2137  	VXORPS X2, X2, X2
  2138  	VXORPS X3, X3, X3
  2139  
  2140  LBB1_5:
  2141  	VMOVUPS      (SI)(CX*4), Y4
  2142  	VMOVUPS      32(SI)(CX*4), Y5
  2143  	VMOVUPS      64(SI)(CX*4), Y6
  2144  	VMOVUPS      96(SI)(CX*4), Y7
  2145  	VFMADD231PS  (DI)(CX*4), Y4, Y0
  2146  	VFMADD231PS  32(DI)(CX*4), Y5, Y1
  2147  	VFMADD231PS  64(DI)(CX*4), Y6, Y2
  2148  	VFMADD231PS  96(DI)(CX*4), Y7, Y3
  2149  	ADDQ         $0x20, CX
  2150  	CMPQ         AX, CX
  2151  	JNE          LBB1_5
  2152  	VADDPS       Y0, Y1, Y0
  2153  	VADDPS       Y0, Y2, Y0
  2154  	VADDPS       Y0, Y3, Y0
  2155  	VEXTRACTF128 $0x01, Y0, X1
  2156  	VADDPS       X1, X0, X0
  2157  	VPERMILPD    $0x01, X0, X1
  2158  	VADDPS       X1, X0, X0
  2159  	VMOVSHDUP    X0, X1
  2160  	VADDSS       X1, X0, X0
  2161  	CMPQ         AX, DX
  2162  	JE           LBB1_8
  2163  
  2164  LBB1_7:
  2165  	VMOVSS      (SI)(AX*4), X1
  2166  	VFMADD231SS (DI)(AX*4), X1, X0
  2167  	ADDQ        $0x01, AX
  2168  	CMPQ        DX, AX
  2169  	JNE         LBB1_7
  2170  
  2171  LBB1_8:
  2172  	VZEROUPPER
  2173  	MOVSS X0, ret+48(FP)
  2174  	RET
  2175  
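        // Norm_AVX2_F64 returns the Euclidean (L2) norm sqrt(sum x[i]^2).
        // The 16-double blocks are consumed two at a time (32 per trip) with
        // VFMADD213PD chains that carry the accumulator through the freshly
        // loaded registers; LBB2_9 handles an odd trailing block, and the
        // scalar tail plus VSQRTSD finish the job.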
  2176  // func Norm_AVX2_F64(x []float64) float64
  2177  // Requires: AVX, FMA3, SSE2
  2178  TEXT ·Norm_AVX2_F64(SB), NOSPLIT, $0-32
  2179  	MOVQ   x_base+0(FP), DI
  2180  	MOVQ   x_len+8(FP), SI
  2181  	TESTQ  SI, SI
  2182  	JE     LBB2_1
  2183  	CMPQ   SI, $0x10
  2184  	JAE    LBB2_4
  2185  	VXORPD X0, X0, X0
  2186  	XORL   AX, AX
  2187  	JMP    LBB2_11
  2188  
  2189  LBB2_1:
  2190  	VXORPD  X0, X0, X0
  2191  	VSQRTSD X0, X0, X0
  2192  	MOVSD   X0, ret+24(FP)
  2193  	RET
  2194  
  2195  LBB2_4:
  2196  	MOVQ   SI, AX
  2197  	ANDQ   $-16, AX
  2198  	LEAQ   -16(AX), CX
  2199  	MOVQ   CX, R8
  2200  	SHRQ   $0x04, R8
  2201  	ADDQ   $0x01, R8
  2202  	TESTQ  CX, CX
  2203  	JE     LBB2_5
  2204  	MOVQ   R8, CX
  2205  	ANDQ   $-2, CX
  2206  	VXORPD X0, X0, X0
  2207  	XORL   DX, DX
  2208  	VXORPD X1, X1, X1
  2209  	VXORPD X2, X2, X2
  2210  	VXORPD X3, X3, X3
  2211  
  2212  LBB2_7:
  2213  	VMOVUPD     (DI)(DX*8), Y4
  2214  	VMOVUPD     32(DI)(DX*8), Y5
  2215  	VMOVUPD     64(DI)(DX*8), Y6
  2216  	VMOVUPD     96(DI)(DX*8), Y7
  2217  	VFMADD213PD Y0, Y4, Y4
  2218  	VFMADD213PD Y1, Y5, Y5
  2219  	VFMADD213PD Y2, Y6, Y6
  2220  	VFMADD213PD Y3, Y7, Y7
  2221  	VMOVUPD     128(DI)(DX*8), Y0
  2222  	VMOVUPD     160(DI)(DX*8), Y1
  2223  	VMOVUPD     192(DI)(DX*8), Y2
  2224  	VMOVUPD     224(DI)(DX*8), Y3
  2225  	VFMADD213PD Y4, Y0, Y0
  2226  	VFMADD213PD Y5, Y1, Y1
  2227  	VFMADD213PD Y6, Y2, Y2
  2228  	VFMADD213PD Y7, Y3, Y3
  2229  	ADDQ        $0x20, DX
  2230  	ADDQ        $-2, CX
  2231  	JNE         LBB2_7
  2232  	TESTB       $0x01, R8
  2233  	JE          LBB2_10
  2234  
  2235  LBB2_9:
  2236  	VMOVUPD     (DI)(DX*8), Y4
  2237  	VMOVUPD     32(DI)(DX*8), Y5
  2238  	VMOVUPD     64(DI)(DX*8), Y6
  2239  	VMOVUPD     96(DI)(DX*8), Y7
  2240  	VFMADD231PD Y4, Y4, Y0
  2241  	VFMADD231PD Y5, Y5, Y1
  2242  	VFMADD231PD Y6, Y6, Y2
  2243  	VFMADD231PD Y7, Y7, Y3
  2244  
  2245  LBB2_10:
  2246  	VADDPD       Y3, Y1, Y1
  2247  	VADDPD       Y2, Y0, Y0
  2248  	VADDPD       Y1, Y0, Y0
  2249  	VEXTRACTF128 $0x01, Y0, X1
  2250  	VADDPD       X1, X0, X0
  2251  	VPERMILPD    $0x01, X0, X1
  2252  	VADDSD       X1, X0, X0
  2253  	CMPQ         AX, SI
  2254  	JE           LBB2_12
  2255  
  2256  LBB2_11:
  2257  	VMOVSD      (DI)(AX*8), X1
  2258  	VFMADD231SD X1, X1, X0
  2259  	ADDQ        $0x01, AX
  2260  	CMPQ        SI, AX
  2261  	JNE         LBB2_11
  2262  
  2263  LBB2_12:
  2264  	VSQRTSD X0, X0, X0
  2265  	VZEROUPPER
  2266  	MOVSD   X0, ret+24(FP)
  2267  	RET
  2268  
  2269  LBB2_5:
  2270  	VXORPD X0, X0, X0
  2271  	XORL   DX, DX
  2272  	VXORPD X1, X1, X1
  2273  	VXORPD X2, X2, X2
  2274  	VXORPD X3, X3, X3
  2275  	TESTB  $0x01, R8
  2276  	JNE    LBB2_9
  2277  	JMP    LBB2_10
  2278  
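        // dataNormF32 packs the constants for refining VRSQRTSS output:
        // $0xc0400000 = -3.0, $0xbf000000 = -0.5, $0x7fffffff = sign-clear
        // (absolute-value) mask, $0x00800000 = smallest normal float32. One
        // Newton-Raphson step gives sqrt(s) ~= -0.5*s*y*(s*y*y - 3) with
        // y = rsqrt(s); the abs/compare/VANDNPS epilogue flushes the result
        // to zero when s is below the smallest normal, so the norm of a zero
        // vector is 0 rather than NaN from rsqrt(0) = +Inf.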
  2279  DATA dataNormF32<>+0(SB)/4, $0xc0400000
  2280  DATA dataNormF32<>+4(SB)/4, $0xbf000000
  2281  DATA dataNormF32<>+8(SB)/4, $0x7fffffff
  2282  DATA dataNormF32<>+12(SB)/4, $0x00800000
  2283  GLOBL dataNormF32<>(SB), RODATA|NOPTR, $16
  2284  
  2285  // func Norm_AVX2_F32(x []float32) float32
  2286  // Requires: AVX, FMA3, SSE
  2287  TEXT ·Norm_AVX2_F32(SB), NOSPLIT, $0-28
  2288  	MOVQ   x_base+0(FP), DI
  2289  	MOVQ   x_len+8(FP), SI
  2290  	TESTQ  SI, SI
  2291  	JE     LBB3_1
  2292  	CMPQ   SI, $0x20
  2293  	JAE    LBB3_4
  2294  	VXORPS X0, X0, X0
  2295  	XORL   AX, AX
  2296  	JMP    LBB3_11
  2297  
  2298  LBB3_1:
  2299  	VXORPS X0, X0, X0
  2300  	JMP    LBB3_12
  2301  
  2302  LBB3_4:
  2303  	MOVQ   SI, AX
  2304  	ANDQ   $-32, AX
  2305  	LEAQ   -32(AX), CX
  2306  	MOVQ   CX, R8
  2307  	SHRQ   $0x05, R8
  2308  	ADDQ   $0x01, R8
  2309  	TESTQ  CX, CX
  2310  	JE     LBB3_5
  2311  	MOVQ   R8, CX
  2312  	ANDQ   $-2, CX
  2313  	VXORPS X0, X0, X0
  2314  	XORL   DX, DX
  2315  	VXORPS X1, X1, X1
  2316  	VXORPS X2, X2, X2
  2317  	VXORPS X3, X3, X3
  2318  
  2319  LBB3_7:
  2320  	VMOVUPS     (DI)(DX*4), Y4
  2321  	VMOVUPS     32(DI)(DX*4), Y5
  2322  	VMOVUPS     64(DI)(DX*4), Y6
  2323  	VMOVUPS     96(DI)(DX*4), Y7
  2324  	VFMADD213PS Y0, Y4, Y4
  2325  	VFMADD213PS Y1, Y5, Y5
  2326  	VFMADD213PS Y2, Y6, Y6
  2327  	VFMADD213PS Y3, Y7, Y7
  2328  	VMOVUPS     128(DI)(DX*4), Y0
  2329  	VMOVUPS     160(DI)(DX*4), Y1
  2330  	VMOVUPS     192(DI)(DX*4), Y2
  2331  	VMOVUPS     224(DI)(DX*4), Y3
  2332  	VFMADD213PS Y4, Y0, Y0
  2333  	VFMADD213PS Y5, Y1, Y1
  2334  	VFMADD213PS Y6, Y2, Y2
  2335  	VFMADD213PS Y7, Y3, Y3
  2336  	ADDQ        $0x40, DX
  2337  	ADDQ        $-2, CX
  2338  	JNE         LBB3_7
  2339  	TESTB       $0x01, R8
  2340  	JE          LBB3_10
  2341  
  2342  LBB3_9:
  2343  	VMOVUPS     (DI)(DX*4), Y4
  2344  	VMOVUPS     32(DI)(DX*4), Y5
  2345  	VMOVUPS     64(DI)(DX*4), Y6
  2346  	VMOVUPS     96(DI)(DX*4), Y7
  2347  	VFMADD231PS Y4, Y4, Y0
  2348  	VFMADD231PS Y5, Y5, Y1
  2349  	VFMADD231PS Y6, Y6, Y2
  2350  	VFMADD231PS Y7, Y7, Y3
  2351  
  2352  LBB3_10:
  2353  	VADDPS       Y3, Y1, Y1
  2354  	VADDPS       Y2, Y0, Y0
  2355  	VADDPS       Y1, Y0, Y0
  2356  	VEXTRACTF128 $0x01, Y0, X1
  2357  	VADDPS       X1, X0, X0
  2358  	VPERMILPD    $0x01, X0, X1
  2359  	VADDPS       X1, X0, X0
  2360  	VMOVSHDUP    X0, X1
  2361  	VADDSS       X1, X0, X0
  2362  	CMPQ         AX, SI
  2363  	JE           LBB3_12
  2364  
  2365  LBB3_11:
  2366  	VMOVSS      (DI)(AX*4), X1
  2367  	VFMADD231SS X1, X1, X0
  2368  	ADDQ        $0x01, AX
  2369  	CMPQ        SI, AX
  2370  	JNE         LBB3_11
  2371  
  2372  LBB3_12:
  2373  	VRSQRTSS     X0, X0, X1
  2374  	VMULSS       X1, X0, X2
  2375  	VFMADD213SS  dataNormF32<>+0(SB), X2, X1
  2376  	VMULSS       dataNormF32<>+4(SB), X2, X2
  2377  	VMULSS       X1, X2, X1
  2378  	VBROADCASTSS dataNormF32<>+8(SB), X2
  2379  	VANDPS       X2, X0, X0
  2380  	VCMPSS       $0x01, dataNormF32<>+12(SB), X0, X0
  2381  	VANDNPS      X1, X0, X0
  2382  	VZEROUPPER
  2383  	MOVSS        X0, ret+24(FP)
  2384  	RET
  2385  
  2386  LBB3_5:
  2387  	VXORPS X0, X0, X0
  2388  	XORL   DX, DX
  2389  	VXORPS X1, X1, X1
  2390  	VXORPS X2, X2, X2
  2391  	VXORPS X3, X3, X3
  2392  	TESTB  $0x01, R8
  2393  	JNE    LBB3_9
  2394  	JMP    LBB3_10
  2395  
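        // Distance_AVX2_F64 is the Euclidean distance sqrt(sum (x-y)^2):
        // VSUBPD produces the differences and VFMADD231PD squares and
        // accumulates them into four YMM registers, mirroring Dot_AVX2_F64.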
  2396  // func Distance_AVX2_F64(x []float64, y []float64) float64
  2397  // Requires: AVX, FMA3, SSE2
  2398  TEXT ·Distance_AVX2_F64(SB), NOSPLIT, $0-56
  2399  	MOVQ   x_base+0(FP), DI
  2400  	MOVQ   y_base+24(FP), SI
  2401  	MOVQ   x_len+8(FP), DX
  2402  	TESTQ  DX, DX
  2403  	JE     LBB4_1
  2404  	CMPQ   DX, $0x10
  2405  	JAE    LBB4_4
  2406  	VXORPD X0, X0, X0
  2407  	XORL   AX, AX
  2408  	JMP    LBB4_7
  2409  
  2410  LBB4_1:
  2411  	VXORPD  X0, X0, X0
  2412  	VSQRTSD X0, X0, X0
  2413  	MOVSD   X0, ret+48(FP)
  2414  	RET
  2415  
  2416  LBB4_4:
  2417  	MOVQ   DX, AX
  2418  	ANDQ   $-16, AX
  2419  	VXORPD X0, X0, X0
  2420  	XORL   CX, CX
  2421  	VXORPD X1, X1, X1
  2422  	VXORPD X2, X2, X2
  2423  	VXORPD X3, X3, X3
  2424  
  2425  LBB4_5:
  2426  	VMOVUPD      (DI)(CX*8), Y4
  2427  	VMOVUPD      32(DI)(CX*8), Y5
  2428  	VMOVUPD      64(DI)(CX*8), Y6
  2429  	VMOVUPD      96(DI)(CX*8), Y7
  2430  	VSUBPD       (SI)(CX*8), Y4, Y4
  2431  	VSUBPD       32(SI)(CX*8), Y5, Y5
  2432  	VSUBPD       64(SI)(CX*8), Y6, Y6
  2433  	VSUBPD       96(SI)(CX*8), Y7, Y7
  2434  	VFMADD231PD  Y4, Y4, Y0
  2435  	VFMADD231PD  Y5, Y5, Y1
  2436  	VFMADD231PD  Y6, Y6, Y2
  2437  	VFMADD231PD  Y7, Y7, Y3
  2438  	ADDQ         $0x10, CX
  2439  	CMPQ         AX, CX
  2440  	JNE          LBB4_5
  2441  	VADDPD       Y0, Y1, Y0
  2442  	VADDPD       Y0, Y2, Y0
  2443  	VADDPD       Y0, Y3, Y0
  2444  	VEXTRACTF128 $0x01, Y0, X1
  2445  	VADDPD       X1, X0, X0
  2446  	VPERMILPD    $0x01, X0, X1
  2447  	VADDSD       X1, X0, X0
  2448  	CMPQ         AX, DX
  2449  	JE           LBB4_8
  2450  
  2451  LBB4_7:
  2452  	VMOVSD      (DI)(AX*8), X1
  2453  	VSUBSD      (SI)(AX*8), X1, X1
  2454  	VFMADD231SD X1, X1, X0
  2455  	ADDQ        $0x01, AX
  2456  	CMPQ        DX, AX
  2457  	JNE         LBB4_7
  2458  
  2459  LBB4_8:
  2460  	VSQRTSD X0, X0, X0
  2461  	VZEROUPPER
  2462  	MOVSD   X0, ret+48(FP)
  2463  	RET
  2464  
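        // dataDistanceF32 repeats the four rsqrt-refinement constants from
        // dataNormF32 (-3.0, -0.5, abs mask, smallest normal float32); the
        // float32 distance uses the same Newton-Raphson sqrt epilogue.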
  2465  DATA dataDistanceF32<>+0(SB)/4, $0xc0400000
  2466  DATA dataDistanceF32<>+4(SB)/4, $0xbf000000
  2467  DATA dataDistanceF32<>+8(SB)/4, $0x7fffffff
  2468  DATA dataDistanceF32<>+12(SB)/4, $0x00800000
  2469  GLOBL dataDistanceF32<>(SB), RODATA|NOPTR, $16
  2470  
  2471  // func Distance_AVX2_F32(x []float32, y []float32) float32
  2472  // Requires: AVX, FMA3, SSE
  2473  TEXT ·Distance_AVX2_F32(SB), NOSPLIT, $0-52
  2474  	MOVQ   x_base+0(FP), DI
  2475  	MOVQ   y_base+24(FP), SI
  2476  	MOVQ   x_len+8(FP), DX
  2477  	TESTQ  DX, DX
  2478  	JE     LBB5_1
  2479  	CMPQ   DX, $0x20
  2480  	JAE    LBB5_4
  2481  	VXORPS X0, X0, X0
  2482  	XORL   AX, AX
  2483  	JMP    LBB5_7
  2484  
  2485  LBB5_1:
  2486  	VXORPS X0, X0, X0
  2487  	JMP    LBB5_8
  2488  
  2489  LBB5_4:
  2490  	MOVQ   DX, AX
  2491  	ANDQ   $-32, AX
  2492  	VXORPS X0, X0, X0
  2493  	XORL   CX, CX
  2494  	VXORPS X1, X1, X1
  2495  	VXORPS X2, X2, X2
  2496  	VXORPS X3, X3, X3
  2497  
  2498  LBB5_5:
  2499  	VMOVUPS      (DI)(CX*4), Y4
  2500  	VMOVUPS      32(DI)(CX*4), Y5
  2501  	VMOVUPS      64(DI)(CX*4), Y6
  2502  	VMOVUPS      96(DI)(CX*4), Y7
  2503  	VSUBPS       (SI)(CX*4), Y4, Y4
  2504  	VSUBPS       32(SI)(CX*4), Y5, Y5
  2505  	VSUBPS       64(SI)(CX*4), Y6, Y6
  2506  	VSUBPS       96(SI)(CX*4), Y7, Y7
  2507  	VFMADD231PS  Y4, Y4, Y0
  2508  	VFMADD231PS  Y5, Y5, Y1
  2509  	VFMADD231PS  Y6, Y6, Y2
  2510  	VFMADD231PS  Y7, Y7, Y3
  2511  	ADDQ         $0x20, CX
  2512  	CMPQ         AX, CX
  2513  	JNE          LBB5_5
  2514  	VADDPS       Y0, Y1, Y0
  2515  	VADDPS       Y0, Y2, Y0
  2516  	VADDPS       Y0, Y3, Y0
  2517  	VEXTRACTF128 $0x01, Y0, X1
  2518  	VADDPS       X1, X0, X0
  2519  	VPERMILPD    $0x01, X0, X1
  2520  	VADDPS       X1, X0, X0
  2521  	VMOVSHDUP    X0, X1
  2522  	VADDSS       X1, X0, X0
  2523  	CMPQ         AX, DX
  2524  	JE           LBB5_8
  2525  
  2526  LBB5_7:
  2527  	VMOVSS      (DI)(AX*4), X1
  2528  	VSUBSS      (SI)(AX*4), X1, X1
  2529  	VFMADD231SS X1, X1, X0
  2530  	ADDQ        $0x01, AX
  2531  	CMPQ        DX, AX
  2532  	JNE         LBB5_7
  2533  
  2534  LBB5_8:
  2535  	VRSQRTSS     X0, X0, X1
  2536  	VMULSS       X1, X0, X2
  2537  	VFMADD213SS  dataDistanceF32<>+0(SB), X2, X1
  2538  	VMULSS       dataDistanceF32<>+4(SB), X2, X2
  2539  	VMULSS       X1, X2, X1
  2540  	VBROADCASTSS dataDistanceF32<>+8(SB), X2
  2541  	VANDPS       X2, X0, X0
  2542  	VCMPSS       $0x01, dataDistanceF32<>+12(SB), X0, X0
  2543  	VANDNPS      X1, X0, X0
  2544  	VZEROUPPER
  2545  	MOVSS        X0, ret+48(FP)
  2546  	RET
  2547  
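        // $0x7fffffffffffffff clears the float64 sign bit, so VANDPD with it
        // computes |v|; the kernel sums absolute values (the L1 norm). Three
        // copies are stored because the scalar tail loads a full 16 bytes
        // from offset 8 (VMOVUPD dataManhattanNormF64<>+8(SB), X1).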
  2548  DATA dataManhattanNormF64<>+0(SB)/8, $0x7fffffffffffffff
  2549  DATA dataManhattanNormF64<>+8(SB)/8, $0x7fffffffffffffff
  2550  DATA dataManhattanNormF64<>+16(SB)/8, $0x7fffffffffffffff
  2551  GLOBL dataManhattanNormF64<>(SB), RODATA|NOPTR, $24
  2552  
  2553  // func ManhattanNorm_AVX2_F64(x []float64) float64
  2554  // Requires: AVX, SSE2
  2555  TEXT ·ManhattanNorm_AVX2_F64(SB), NOSPLIT, $0-32
  2556  	MOVQ   x_base+0(FP), DI
  2557  	MOVQ   x_len+8(FP), SI
  2558  	TESTQ  SI, SI
  2559  	JE     LBB6_1
  2560  	CMPQ   SI, $0x10
  2561  	JAE    LBB6_4
  2562  	VXORPD X0, X0, X0
  2563  	XORL   AX, AX
  2564  	JMP    LBB6_7
  2565  
  2566  LBB6_1:
  2567  	VXORPS X0, X0, X0
  2568  	MOVSD  X0, ret+24(FP)
  2569  	RET
  2570  
  2571  LBB6_4:
  2572  	MOVQ         SI, AX
  2573  	ANDQ         $-16, AX
  2574  	VXORPD       X0, X0, X0
  2575  	VBROADCASTSD dataManhattanNormF64<>+0(SB), Y1
  2576  	XORL         CX, CX
  2577  	VXORPD       X2, X2, X2
  2578  	VXORPD       X3, X3, X3
  2579  	VXORPD       X4, X4, X4
  2580  
  2581  LBB6_5:
  2582  	VANDPD       (DI)(CX*8), Y1, Y5
  2583  	VADDPD       Y0, Y5, Y0
  2584  	VANDPD       32(DI)(CX*8), Y1, Y5
  2585  	VADDPD       Y2, Y5, Y2
  2586  	VANDPD       64(DI)(CX*8), Y1, Y5
  2587  	VANDPD       96(DI)(CX*8), Y1, Y6
  2588  	VADDPD       Y3, Y5, Y3
  2589  	VADDPD       Y4, Y6, Y4
  2590  	ADDQ         $0x10, CX
  2591  	CMPQ         AX, CX
  2592  	JNE          LBB6_5
  2593  	VADDPD       Y0, Y2, Y0
  2594  	VADDPD       Y0, Y3, Y0
  2595  	VADDPD       Y0, Y4, Y0
  2596  	VEXTRACTF128 $0x01, Y0, X1
  2597  	VADDPD       X1, X0, X0
  2598  	VPERMILPD    $0x01, X0, X1
  2599  	VADDSD       X1, X0, X0
  2600  	CMPQ         AX, SI
  2601  	JE           LBB6_9
  2602  
  2603  LBB6_7:
  2604  	VMOVUPD dataManhattanNormF64<>+8(SB), X1
  2605  
  2606  LBB6_8:
  2607  	VMOVSD (DI)(AX*8), X2
  2608  	VANDPD X1, X2, X2
  2609  	VADDSD X0, X2, X0
  2610  	ADDQ   $0x01, AX
  2611  	CMPQ   SI, AX
  2612  	JNE    LBB6_8
  2613  
  2614  LBB6_9:
  2615  	VZEROUPPER
  2616  	MOVSD X0, ret+24(FP)
  2617  	RET
  2618  
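        // A single float32 abs mask ($0x7fffffff); VBROADCASTSS splats it
        // for both the 8-lane body and the scalar tail of the float32 L1
        // norm.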
  2619  DATA dataManhattanNormF32<>+0(SB)/4, $0x7fffffff
  2620  GLOBL dataManhattanNormF32<>(SB), RODATA|NOPTR, $4
  2621  
  2622  // func ManhattanNorm_AVX2_F32(x []float32) float32
  2623  // Requires: AVX, SSE
  2624  TEXT ·ManhattanNorm_AVX2_F32(SB), NOSPLIT, $0-28
  2625  	MOVQ   x_base+0(FP), DI
  2626  	MOVQ   x_len+8(FP), SI
  2627  	TESTQ  SI, SI
  2628  	JE     LBB7_1
  2629  	CMPQ   SI, $0x20
  2630  	JAE    LBB7_4
  2631  	VXORPS X0, X0, X0
  2632  	XORL   AX, AX
  2633  	JMP    LBB7_7
  2634  
  2635  LBB7_1:
  2636  	VXORPS X0, X0, X0
  2637  	MOVSS  X0, ret+24(FP)
  2638  	RET
  2639  
  2640  LBB7_4:
  2641  	MOVQ         SI, AX
  2642  	ANDQ         $-32, AX
  2643  	VXORPS       X0, X0, X0
  2644  	VBROADCASTSS dataManhattanNormF32<>+0(SB), Y1
  2645  	XORL         CX, CX
  2646  	VXORPS       X2, X2, X2
  2647  	VXORPS       X3, X3, X3
  2648  	VXORPS       X4, X4, X4
  2649  
  2650  LBB7_5:
  2651  	VANDPS       (DI)(CX*4), Y1, Y5
  2652  	VADDPS       Y0, Y5, Y0
  2653  	VANDPS       32(DI)(CX*4), Y1, Y5
  2654  	VADDPS       Y2, Y5, Y2
  2655  	VANDPS       64(DI)(CX*4), Y1, Y5
  2656  	VANDPS       96(DI)(CX*4), Y1, Y6
  2657  	VADDPS       Y3, Y5, Y3
  2658  	VADDPS       Y4, Y6, Y4
  2659  	ADDQ         $0x20, CX
  2660  	CMPQ         AX, CX
  2661  	JNE          LBB7_5
  2662  	VADDPS       Y0, Y2, Y0
  2663  	VADDPS       Y0, Y3, Y0
  2664  	VADDPS       Y0, Y4, Y0
  2665  	VEXTRACTF128 $0x01, Y0, X1
  2666  	VADDPS       X1, X0, X0
  2667  	VPERMILPD    $0x01, X0, X1
  2668  	VADDPS       X1, X0, X0
  2669  	VMOVSHDUP    X0, X1
  2670  	VADDSS       X1, X0, X0
  2671  	CMPQ         AX, SI
  2672  	JE           LBB7_9
  2673  
  2674  LBB7_7:
  2675  	VBROADCASTSS dataManhattanNormF32<>+0(SB), X1
  2676  
  2677  LBB7_8:
  2678  	VMOVSS (DI)(AX*4), X2
  2679  	VANDPS X1, X2, X2
  2680  	VADDSS X0, X2, X0
  2681  	ADDQ   $0x01, AX
  2682  	CMPQ   SI, AX
  2683  	JNE    LBB7_8
  2684  
  2685  LBB7_9:
  2686  	VZEROUPPER
  2687  	MOVSS X0, ret+24(FP)
  2688  	RET
  2689  
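        // ManhattanDistance_AVX2_F64 sums |x[i]-y[i]| (the L1 distance):
        // VSUBPD then VANDPD with the same three-copy sign-clear mask layout
        // used by dataManhattanNormF64.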
  2690  DATA dataManhattanDistanceF64<>+0(SB)/8, $0x7fffffffffffffff
  2691  DATA dataManhattanDistanceF64<>+8(SB)/8, $0x7fffffffffffffff
  2692  DATA dataManhattanDistanceF64<>+16(SB)/8, $0x7fffffffffffffff
  2693  GLOBL dataManhattanDistanceF64<>(SB), RODATA|NOPTR, $24
  2694  
  2695  // func ManhattanDistance_AVX2_F64(x []float64, y []float64) float64
  2696  // Requires: AVX, SSE2
  2697  TEXT ·ManhattanDistance_AVX2_F64(SB), NOSPLIT, $0-56
  2698  	MOVQ   x_base+0(FP), DI
  2699  	MOVQ   y_base+24(FP), SI
  2700  	MOVQ   x_len+8(FP), DX
  2701  	TESTQ  DX, DX
  2702  	JE     LBB8_1
  2703  	CMPQ   DX, $0x10
  2704  	JAE    LBB8_4
  2705  	VXORPD X0, X0, X0
  2706  	XORL   AX, AX
  2707  	JMP    LBB8_7
  2708  
  2709  LBB8_1:
  2710  	VXORPS X0, X0, X0
  2711  	MOVSD  X0, ret+48(FP)
  2712  	RET
  2713  
  2714  LBB8_4:
  2715  	MOVQ         DX, AX
  2716  	ANDQ         $-16, AX
  2717  	VXORPD       X0, X0, X0
  2718  	VBROADCASTSD dataManhattanDistanceF64<>+0(SB), Y1
  2719  	XORL         CX, CX
  2720  	VXORPD       X2, X2, X2
  2721  	VXORPD       X3, X3, X3
  2722  	VXORPD       X4, X4, X4
  2723  
  2724  LBB8_5:
  2725  	VMOVUPD      (DI)(CX*8), Y5
  2726  	VMOVUPD      32(DI)(CX*8), Y6
  2727  	VMOVUPD      64(DI)(CX*8), Y7
  2728  	VMOVUPD      96(DI)(CX*8), Y8
  2729  	VSUBPD       (SI)(CX*8), Y5, Y5
  2730  	VSUBPD       32(SI)(CX*8), Y6, Y6
  2731  	VSUBPD       64(SI)(CX*8), Y7, Y7
  2732  	VSUBPD       96(SI)(CX*8), Y8, Y8
  2733  	VANDPD       Y1, Y5, Y5
  2734  	VADDPD       Y0, Y5, Y0
  2735  	VANDPD       Y1, Y6, Y5
  2736  	VADDPD       Y2, Y5, Y2
  2737  	VANDPD       Y1, Y7, Y5
  2738  	VADDPD       Y3, Y5, Y3
  2739  	VANDPD       Y1, Y8, Y5
  2740  	VADDPD       Y4, Y5, Y4
  2741  	ADDQ         $0x10, CX
  2742  	CMPQ         AX, CX
  2743  	JNE          LBB8_5
  2744  	VADDPD       Y0, Y2, Y0
  2745  	VADDPD       Y0, Y3, Y0
  2746  	VADDPD       Y0, Y4, Y0
  2747  	VEXTRACTF128 $0x01, Y0, X1
  2748  	VADDPD       X1, X0, X0
  2749  	VPERMILPD    $0x01, X0, X1
  2750  	VADDSD       X1, X0, X0
  2751  	CMPQ         AX, DX
  2752  	JE           LBB8_9
  2753  
  2754  LBB8_7:
  2755  	VMOVUPD dataManhattanDistanceF64<>+8(SB), X1
  2756  
  2757  LBB8_8:
  2758  	VMOVSD (DI)(AX*8), X2
  2759  	VSUBSD (SI)(AX*8), X2, X2
  2760  	VANDPD X1, X2, X2
  2761  	VADDSD X0, X2, X0
  2762  	ADDQ   $0x01, AX
  2763  	CMPQ   DX, AX
  2764  	JNE    LBB8_8
  2765  
  2766  LBB8_9:
  2767  	VZEROUPPER
  2768  	MOVSD X0, ret+48(FP)
  2769  	RET
  2770  
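        // Float32 L1 distance: the same |x-y| accumulation at 32 floats per
        // trip, with a single broadcast abs mask.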
  2771  DATA dataManhattanDistanceF32<>+0(SB)/4, $0x7fffffff
  2772  GLOBL dataManhattanDistanceF32<>(SB), RODATA|NOPTR, $4
  2773  
  2774  // func ManhattanDistance_AVX2_F32(x []float32, y []float32) float32
  2775  // Requires: AVX, SSE
  2776  TEXT ·ManhattanDistance_AVX2_F32(SB), NOSPLIT, $0-52
  2777  	MOVQ   x_base+0(FP), DI
  2778  	MOVQ   y_base+24(FP), SI
  2779  	MOVQ   x_len+8(FP), DX
  2780  	TESTQ  DX, DX
  2781  	JE     LBB9_1
  2782  	CMPQ   DX, $0x20
  2783  	JAE    LBB9_4
  2784  	VXORPS X0, X0, X0
  2785  	XORL   AX, AX
  2786  	JMP    LBB9_7
  2787  
  2788  LBB9_1:
  2789  	VXORPS X0, X0, X0
  2790  	MOVSS  X0, ret+48(FP)
  2791  	RET
  2792  
  2793  LBB9_4:
  2794  	MOVQ         DX, AX
  2795  	ANDQ         $-32, AX
  2796  	VXORPS       X0, X0, X0
  2797  	VBROADCASTSS dataManhattanDistanceF32<>+0(SB), Y1
  2798  	XORL         CX, CX
  2799  	VXORPS       X2, X2, X2
  2800  	VXORPS       X3, X3, X3
  2801  	VXORPS       X4, X4, X4
  2802  
  2803  LBB9_5:
  2804  	VMOVUPS      (DI)(CX*4), Y5
  2805  	VMOVUPS      32(DI)(CX*4), Y6
  2806  	VMOVUPS      64(DI)(CX*4), Y7
  2807  	VMOVUPS      96(DI)(CX*4), Y8
  2808  	VSUBPS       (SI)(CX*4), Y5, Y5
  2809  	VSUBPS       32(SI)(CX*4), Y6, Y6
  2810  	VSUBPS       64(SI)(CX*4), Y7, Y7
  2811  	VSUBPS       96(SI)(CX*4), Y8, Y8
  2812  	VANDPS       Y1, Y5, Y5
  2813  	VADDPS       Y0, Y5, Y0
  2814  	VANDPS       Y1, Y6, Y5
  2815  	VADDPS       Y2, Y5, Y2
  2816  	VANDPS       Y1, Y7, Y5
  2817  	VADDPS       Y3, Y5, Y3
  2818  	VANDPS       Y1, Y8, Y5
  2819  	VADDPS       Y4, Y5, Y4
  2820  	ADDQ         $0x20, CX
  2821  	CMPQ         AX, CX
  2822  	JNE          LBB9_5
  2823  	VADDPS       Y0, Y2, Y0
  2824  	VADDPS       Y0, Y3, Y0
  2825  	VADDPS       Y0, Y4, Y0
  2826  	VEXTRACTF128 $0x01, Y0, X1
  2827  	VADDPS       X1, X0, X0
  2828  	VPERMILPD    $0x01, X0, X1
  2829  	VADDPS       X1, X0, X0
  2830  	VMOVSHDUP    X0, X1
  2831  	VADDSS       X1, X0, X0
  2832  	CMPQ         AX, DX
  2833  	JE           LBB9_9
  2834  
  2835  LBB9_7:
  2836  	VBROADCASTSS dataManhattanDistanceF32<>+0(SB), X1
  2837  
  2838  LBB9_8:
  2839  	VMOVSS (DI)(AX*4), X2
  2840  	VSUBSS (SI)(AX*4), X2, X2
  2841  	VANDPS X1, X2, X2
  2842  	VADDSS X0, X2, X0
  2843  	ADDQ   $0x01, AX
  2844  	CMPQ   DX, AX
  2845  	JNE    LBB9_8
  2846  
  2847  LBB9_9:
  2848  	VZEROUPPER
  2849  	MOVSS X0, ret+48(FP)
  2850  	RET
  2851  
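        // CosineSimilarity_AVX2_F64 tracks three sums at once, two YMM
        // accumulators each: dot(x,y) in Y0/Y5, dot(x,x) in Y2/Y4, and
        // dot(y,y) in Y1/Y3, then returns dot(x,y)/sqrt(dot(x,x)*dot(y,y)).
        // Hedged Go sketch of the semantics (annotation only):
        //
        //	var xy, xx, yy float64
        //	for i := range x {
        //		xy += x[i] * y[i]
        //		xx += x[i] * x[i]
        //		yy += y[i] * y[i]
        //	}
        //	return xy / math.Sqrt(xx*yy)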
  2852  // func CosineSimilarity_AVX2_F64(x []float64, y []float64) float64
  2853  // Requires: AVX, FMA3, SSE2
  2854  TEXT ·CosineSimilarity_AVX2_F64(SB), NOSPLIT, $0-56
  2855  	MOVQ   x_base+0(FP), DI
  2856  	MOVQ   y_base+24(FP), SI
  2857  	MOVQ   x_len+8(FP), DX
  2858  	TESTQ  DX, DX
  2859  	JE     LBB2_1
  2860  	CMPQ   DX, $0x08
  2861  	JAE    LBB2_5
  2862  	VXORPD X1, X1, X1
  2863  	XORL   AX, AX
  2864  	VXORPD X2, X2, X2
  2865  	VXORPD X0, X0, X0
  2866  	JMP    LBB2_4
  2867  
  2868  LBB2_1:
  2869  	VXORPD  X0, X0, X0
  2870  	VXORPD  X1, X1, X1
  2871  	VSQRTSD X1, X1, X1
  2872  	VDIVSD  X1, X0, X0
  2873  	MOVSD   X0, ret+48(FP)
  2874  	RET
  2875  
  2876  LBB2_5:
  2877  	MOVQ   DX, AX
  2878  	ANDQ   $-8, AX
  2879  	VXORPD X1, X1, X1
  2880  	XORL   CX, CX
  2881  	VXORPD X3, X3, X3
  2882  	VXORPD X2, X2, X2
  2883  	VXORPD X4, X4, X4
  2884  	VXORPD X0, X0, X0
  2885  	VXORPD X5, X5, X5
  2886  
  2887  LBB2_6:
  2888  	VMOVUPD      (DI)(CX*8), Y6
  2889  	VMOVUPD      32(DI)(CX*8), Y7
  2890  	VMOVUPD      (SI)(CX*8), Y8
  2891  	VMOVUPD      32(SI)(CX*8), Y9
  2892  	VFMADD231PD  Y6, Y8, Y0
  2893  	VFMADD231PD  Y7, Y9, Y5
  2894  	VFMADD231PD  Y6, Y6, Y2
  2895  	VFMADD231PD  Y7, Y7, Y4
  2896  	VFMADD231PD  Y8, Y8, Y1
  2897  	VFMADD231PD  Y9, Y9, Y3
  2898  	ADDQ         $0x08, CX
  2899  	CMPQ         AX, CX
  2900  	JNE          LBB2_6
  2901  	VADDPD       Y0, Y5, Y0
  2902  	VEXTRACTF128 $0x01, Y0, X5
  2903  	VADDPD       X5, X0, X0
  2904  	VPERMILPD    $0x01, X0, X5
  2905  	VADDSD       X5, X0, X0
  2906  	VADDPD       Y2, Y4, Y2
  2907  	VEXTRACTF128 $0x01, Y2, X4
  2908  	VADDPD       X4, X2, X2
  2909  	VPERMILPD    $0x01, X2, X4
  2910  	VADDSD       X4, X2, X2
  2911  	VADDPD       Y1, Y3, Y1
  2912  	VEXTRACTF128 $0x01, Y1, X3
  2913  	VADDPD       X3, X1, X1
  2914  	VPERMILPD    $0x01, X1, X3
  2915  	VADDSD       X3, X1, X1
  2916  	CMPQ         AX, DX
  2917  	JE           LBB2_8
  2918  
  2919  LBB2_4:
  2920  	VMOVSD      (DI)(AX*8), X3
  2921  	VMOVSD      (SI)(AX*8), X4
  2922  	VFMADD231SD X3, X4, X0
  2923  	VFMADD231SD X3, X3, X2
  2924  	VFMADD231SD X4, X4, X1
  2925  	ADDQ        $0x01, AX
  2926  	CMPQ        DX, AX
  2927  	JNE         LBB2_4
  2928  
  2929  LBB2_8:
  2930  	VMULSD  X2, X1, X1
  2931  	VSQRTSD X1, X1, X1
  2932  	VDIVSD  X1, X0, X0
  2933  	VZEROUPPER
  2934  	MOVSD   X0, ret+48(FP)
  2935  	RET
  2936  
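        // The float32 variant replaces the VSQRTSS+VDIVSS epilogue with the
        // refined reciprocal square root, so only -3.0 ($0xc0400000) and
        // -0.5 ($0xbf000000) are needed: the dot product is multiplied by
        // rsqrt(dot(x,x)*dot(y,y)) instead of divided by the square root.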
  2937  DATA dataCosineSimilarityF32<>+0(SB)/4, $0xc0400000
  2938  DATA dataCosineSimilarityF32<>+4(SB)/4, $0xbf000000
  2939  GLOBL dataCosineSimilarityF32<>(SB), RODATA|NOPTR, $8
  2940  
  2941  // func CosineSimilarity_AVX2_F32(x []float32, y []float32) float32
  2942  // Requires: AVX, FMA3, SSE
  2943  TEXT ·CosineSimilarity_AVX2_F32(SB), NOSPLIT, $0-52
  2944  	MOVQ   x_base+0(FP), DI
  2945  	MOVQ   y_base+24(FP), SI
  2946  	MOVQ   x_len+8(FP), DX
  2947  	TESTQ  DX, DX
  2948  	JE     LBB3_1
  2949  	CMPQ   DX, $0x10
  2950  	JAE    LBB3_5
  2951  	VXORPS X1, X1, X1
  2952  	XORL   AX, AX
  2953  	VXORPS X2, X2, X2
  2954  	VXORPS X0, X0, X0
  2955  	JMP    LBB3_4
  2956  
  2957  LBB3_1:
  2958  	VXORPS X0, X0, X0
  2959  	VXORPS X1, X1, X1
  2960  	JMP    LBB3_9
  2961  
  2962  LBB3_5:
  2963  	MOVQ   DX, AX
  2964  	ANDQ   $-16, AX
  2965  	VXORPS X1, X1, X1
  2966  	XORL   CX, CX
  2967  	VXORPS X3, X3, X3
  2968  	VXORPS X2, X2, X2
  2969  	VXORPS X4, X4, X4
  2970  	VXORPS X0, X0, X0
  2971  	VXORPS X5, X5, X5
  2972  
  2973  LBB3_6:
  2974  	VMOVUPS      (DI)(CX*4), Y6
  2975  	VMOVUPS      32(DI)(CX*4), Y7
  2976  	VMOVUPS      (SI)(CX*4), Y8
  2977  	VMOVUPS      32(SI)(CX*4), Y9
  2978  	VFMADD231PS  Y6, Y8, Y0
  2979  	VFMADD231PS  Y7, Y9, Y5
  2980  	VFMADD231PS  Y6, Y6, Y2
  2981  	VFMADD231PS  Y7, Y7, Y4
  2982  	VFMADD231PS  Y8, Y8, Y1
  2983  	VFMADD231PS  Y9, Y9, Y3
  2984  	ADDQ         $0x10, CX
  2985  	CMPQ         AX, CX
  2986  	JNE          LBB3_6
  2987  	VADDPS       Y0, Y5, Y0
  2988  	VEXTRACTF128 $0x01, Y0, X5
  2989  	VADDPS       X5, X0, X0
  2990  	VPERMILPD    $0x01, X0, X5
  2991  	VADDPS       X5, X0, X0
  2992  	VMOVSHDUP    X0, X5
  2993  	VADDSS       X5, X0, X0
  2994  	VADDPS       Y2, Y4, Y2
  2995  	VEXTRACTF128 $0x01, Y2, X4
  2996  	VADDPS       X4, X2, X2
  2997  	VPERMILPD    $0x01, X2, X4
  2998  	VADDPS       X4, X2, X2
  2999  	VMOVSHDUP    X2, X4
  3000  	VADDSS       X4, X2, X2
  3001  	VADDPS       Y1, Y3, Y1
  3002  	VEXTRACTF128 $0x01, Y1, X3
  3003  	VADDPS       X3, X1, X1
  3004  	VPERMILPD    $0x01, X1, X3
  3005  	VADDPS       X3, X1, X1
  3006  	VMOVSHDUP    X1, X3
  3007  	VADDSS       X3, X1, X1
  3008  	CMPQ         AX, DX
  3009  	JE           LBB3_8
  3010  
  3011  LBB3_4:
  3012  	VMOVSS      (DI)(AX*4), X3
  3013  	VMOVSS      (SI)(AX*4), X4
  3014  	VFMADD231SS X3, X4, X0
  3015  	VFMADD231SS X3, X3, X2
  3016  	VFMADD231SS X4, X4, X1
  3017  	ADDQ        $0x01, AX
  3018  	CMPQ        DX, AX
  3019  	JNE         LBB3_4
  3020  
  3021  LBB3_8:
  3022  	VMULSS X2, X1, X1
  3023  
  3024  LBB3_9:
  3025  	VRSQRTSS    X1, X1, X2
  3026  	VMULSS      X2, X1, X1
  3027  	VFMADD213SS dataCosineSimilarityF32<>+0(SB), X2, X1
  3028  	VMULSS      dataCosineSimilarityF32<>+4(SB), X2, X2
  3029  	VMULSS      X0, X2, X0
  3030  	VMULSS      X0, X1, X0
  3031  	VZEROUPPER
  3032  	MOVSS       X0, ret+48(FP)
  3033  	RET
  3034  
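        // Mat4Mul_AVX2_F64 multiplies two row-major 4x4 float64 matrices,
        // x = y*z, fully unrolled: each scalar of a y row is broadcast and
        // FMA'd against the matching z row, one VMOVUPD store per output row.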
  3035  // func Mat4Mul_AVX2_F64(x []float64, y []float64, z []float64)
  3036  // Requires: AVX, FMA3
  3037  TEXT ·Mat4Mul_AVX2_F64(SB), NOSPLIT, $0-72
  3038  	MOVQ         x_base+0(FP), DI
  3039  	MOVQ         y_base+24(FP), SI
  3040  	MOVQ         z_base+48(FP), DX
  3041  	VBROADCASTSD (SI), Y0
  3042  	VMOVUPD      (DX), Y1
  3043  	VMOVUPD      32(DX), Y2
  3044  	VMOVUPD      64(DX), Y3
  3045  	VMOVUPD      96(DX), Y4
  3046  	VMULPD       Y0, Y1, Y0
  3047  	VBROADCASTSD 8(SI), Y5
  3048  	VFMADD213PD  Y0, Y2, Y5
  3049  	VBROADCASTSD 16(SI), Y0
  3050  	VFMADD213PD  Y5, Y3, Y0
  3051  	VBROADCASTSD 24(SI), Y5
  3052  	VFMADD213PD  Y0, Y4, Y5
  3053  	VMOVUPD      Y5, (DI)
  3054  	VBROADCASTSD 32(SI), Y0
  3055  	VMULPD       Y0, Y1, Y0
  3056  	VBROADCASTSD 40(SI), Y1
  3057  	VFMADD213PD  Y0, Y2, Y1
  3058  	VBROADCASTSD 48(SI), Y0
  3059  	VFMADD213PD  Y1, Y3, Y0
  3060  	VBROADCASTSD 56(SI), Y1
  3061  	VFMADD213PD  Y0, Y4, Y1
  3062  	VMOVUPD      Y1, 32(DI)
  3063  	VBROADCASTSD 64(SI), Y0
  3064  	VMOVUPD      (DX), Y1
  3065  	VMOVUPD      32(DX), Y2
  3066  	VMOVUPD      64(DX), Y3
  3067  	VMOVUPD      96(DX), Y4
  3068  	VMULPD       Y0, Y1, Y0
  3069  	VBROADCASTSD 72(SI), Y5
  3070  	VFMADD213PD  Y0, Y2, Y5
  3071  	VBROADCASTSD 80(SI), Y0
  3072  	VFMADD213PD  Y5, Y3, Y0
  3073  	VBROADCASTSD 88(SI), Y5
  3074  	VFMADD213PD  Y0, Y4, Y5
  3075  	VMOVUPD      Y5, 64(DI)
  3076  	VBROADCASTSD 96(SI), Y0
  3077  	VMULPD       Y0, Y1, Y0
  3078  	VBROADCASTSD 104(SI), Y1
  3079  	VFMADD213PD  Y0, Y2, Y1
  3080  	VBROADCASTSD 112(SI), Y0
  3081  	VFMADD213PD  Y1, Y3, Y0
  3082  	VBROADCASTSD 120(SI), Y1
  3083  	VFMADD213PD  Y0, Y4, Y1
  3084  	VMOVUPD      Y1, 96(DI)
  3085  	VZEROUPPER
  3086  	RET
  3087  
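        // Mat4Mul_AVX2_F32 packs two output rows per YMM register:
        // VBROADCASTF128 duplicates each 16-byte z row into both 128-bit
        // lanes, and VSHUFPS+VPERMPD $0x50 arranges a pair of y coefficients
        // as (a,a,a,a,b,b,b,b), so a single VFMADD213PS updates two rows of
        // x = y*z at once.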
  3088  // func Mat4Mul_AVX2_F32(x []float32, y []float32, z []float32)
  3089  // Requires: AVX, AVX2, FMA3
  3090  TEXT ·Mat4Mul_AVX2_F32(SB), NOSPLIT, $0-72
  3091  	MOVQ           x_base+0(FP), DI
  3092  	MOVQ           y_base+24(FP), SI
  3093  	MOVQ           z_base+48(FP), DX
  3094  	VBROADCASTF128 (DX), Y0
  3095  	VBROADCASTF128 16(DX), Y1
  3096  	VBROADCASTF128 32(DX), Y2
  3097  	VBROADCASTF128 48(DX), Y3
  3098  	VMOVSS         16(SI), X4
  3099  	VMOVSS         (SI), X5
  3100  	VSHUFPS        $0x00, X4, X5, X4
  3101  	VMOVSS         4(SI), X5
  3102  	VMOVSS         8(SI), X6
  3103  	VMOVSS         12(SI), X7
  3104  	VPERMPD        $0x50, Y4, Y4
  3105  	VMULPS         Y4, Y0, Y0
  3106  	VMOVSS         20(SI), X4
  3107  	VSHUFPS        $0x00, X4, X5, X4
  3108  	VPERMPD        $0x50, Y4, Y4
  3109  	VFMADD213PS    Y0, Y1, Y4
  3110  	VMOVSS         24(SI), X0
  3111  	VSHUFPS        $0x00, X0, X6, X0
  3112  	VPERMPD        $0x50, Y0, Y0
  3113  	VFMADD213PS    Y4, Y2, Y0
  3114  	VMOVSS         28(SI), X1
  3115  	VSHUFPS        $0x00, X1, X7, X1
  3116  	VPERMPD        $0x50, Y1, Y1
  3117  	VFMADD213PS    Y0, Y3, Y1
  3118  	VBROADCASTF128 (DX), Y0
  3119  	VBROADCASTF128 16(DX), Y2
  3120  	VBROADCASTF128 32(DX), Y3
  3121  	VMOVUPS        Y1, (DI)
  3122  	VBROADCASTF128 48(DX), Y1
  3123  	VMOVSS         48(SI), X4
  3124  	VMOVSS         32(SI), X5
  3125  	VSHUFPS        $0x00, X4, X5, X4
  3126  	VMOVSS         36(SI), X5
  3127  	VMOVSS         40(SI), X6
  3128  	VMOVSS         44(SI), X7
  3129  	VPERMPD        $0x50, Y4, Y4
  3130  	VMULPS         Y4, Y0, Y0
  3131  	VMOVSS         52(SI), X4
  3132  	VSHUFPS        $0x00, X4, X5, X4
  3133  	VPERMPD        $0x50, Y4, Y4
  3134  	VFMADD213PS    Y0, Y2, Y4
  3135  	VMOVSS         56(SI), X0
  3136  	VSHUFPS        $0x00, X0, X6, X0
  3137  	VPERMPD        $0x50, Y0, Y0
  3138  	VFMADD213PS    Y4, Y3, Y0
  3139  	VMOVSS         60(SI), X2
  3140  	VSHUFPS        $0x00, X2, X7, X2
  3141  	VPERMPD        $0x50, Y2, Y2
  3142  	VFMADD213PS    Y0, Y1, Y2
  3143  	VMOVUPS        Y2, 32(DI)
  3144  	VZEROUPPER
  3145  	RET
  3146  
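        // MatMul_AVX2_F64 multiplies an a*b matrix y by a b*c matrix z and
        // accumulates into x (all row-major, flat slices) in i-p-j order:
        // each y[i*b+p] is broadcast and FMA'd across row p of z into row i
        // of x, 16 doubles per inner trip plus a scalar tail. Hedged Go
        // sketch of the semantics (annotation only):
        //
        //	for i := 0; i < a; i++ {
        //		for p := 0; p < b; p++ {
        //			s := y[i*b+p]
        //			for j := 0; j < c; j++ {
        //				x[i*c+j] += s * z[p*c+j]
        //			}
        //		}
        //	}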
  3147  // func MatMul_AVX2_F64(x []float64, y []float64, z []float64, a int, b int, c int)
  3148  // Requires: AVX, AVX2, FMA3
  3149  TEXT ·MatMul_AVX2_F64(SB), $8-96
  3150  	MOVQ  x_base+0(FP), DI
  3151  	MOVQ  y_base+24(FP), SI
  3152  	MOVQ  z_base+48(FP), DX
  3153  	MOVQ  a+72(FP), CX
  3154  	MOVQ  b+80(FP), R8
  3155  	MOVQ  c+88(FP), R9
  3156  	PUSHQ BP
  3157  	PUSHQ R15
  3158  	PUSHQ R14
  3159  	PUSHQ R13
  3160  	PUSHQ R12
  3161  	PUSHQ BX
  3162  	MOVQ  DX, -16(SP)
  3163  	MOVQ  CX, -8(SP)
  3164  	TESTQ CX, CX
  3165  	JE    LBB4_13
  3166  	TESTQ R8, R8
  3167  	JE    LBB4_13
  3168  	TESTQ R9, R9
  3169  	JE    LBB4_13
  3170  	MOVQ  R9, R12
  3171  	ANDQ  $-16, R12
  3172  	MOVQ  -16(SP), AX
  3173  	LEAQ  96(AX), CX
  3174  	XORQ  R15, R15
  3175  	LEAQ  (R15)(R9*8), R11
  3176  	LEAQ  96(DI), BX
  3177  	XORL  R14, R14
  3178  	JMP   LBB4_4
  3179  
  3180  LBB4_12:
  3181  	ADDQ $0x01, R14
  3182  	ADDQ R11, BX
  3183  	ADDQ R11, DI
  3184  	CMPQ R14, -8(SP)
  3185  	JE   LBB4_13
  3186  
  3187  LBB4_4:
  3188  	MOVQ  R14, R15
  3189  	IMULQ R8, R15
  3190  	MOVQ  -16(SP), R13
  3191  	MOVQ  CX, AX
  3192  	XORL  BP, BP
  3193  	JMP   LBB4_5
  3194  
  3195  LBB4_11:
  3196  	ADDQ $0x01, BP
  3197  	ADDQ R11, AX
  3198  	ADDQ R11, R13
  3199  	CMPQ BP, R8
  3200  	JE   LBB4_12
  3201  
  3202  LBB4_5:
  3203  	LEAQ   (R15)(BP*1), DX
  3204  	VMOVSD (SI)(DX*8), X0
  3205  	CMPQ   R9, $0x10
  3206  	JAE    LBB4_7
  3207  	XORL   DX, DX
  3208  	JMP    LBB4_10
  3209  
  3210  LBB4_7:
  3211  	VBROADCASTSD X0, Y1
  3212  	XORL         R10, R10
  3213  
  3214  LBB4_8:
  3215  	VMOVUPD     -96(AX)(R10*8), Y2
  3216  	VMOVUPD     -64(AX)(R10*8), Y3
  3217  	VMOVUPD     -32(AX)(R10*8), Y4
  3218  	VMOVUPD     (AX)(R10*8), Y5
  3219  	VFMADD213PD -96(BX)(R10*8), Y1, Y2
  3220  	VFMADD213PD -64(BX)(R10*8), Y1, Y3
  3221  	VFMADD213PD -32(BX)(R10*8), Y1, Y4
  3222  	VFMADD213PD (BX)(R10*8), Y1, Y5
  3223  	VMOVUPD     Y2, -96(BX)(R10*8)
  3224  	VMOVUPD     Y3, -64(BX)(R10*8)
  3225  	VMOVUPD     Y4, -32(BX)(R10*8)
  3226  	VMOVUPD     Y5, (BX)(R10*8)
  3227  	ADDQ        $0x10, R10
  3228  	CMPQ        R12, R10
  3229  	JNE         LBB4_8
  3230  	MOVQ        R12, DX
  3231  	CMPQ        R12, R9
  3232  	JE          LBB4_11
  3233  
  3234  LBB4_10:
  3235  	VMOVSD      (R13)(DX*8), X1
  3236  	VFMADD213SD (DI)(DX*8), X0, X1
  3237  	VMOVSD      X1, (DI)(DX*8)
  3238  	ADDQ        $0x01, DX
  3239  	CMPQ        R9, DX
  3240  	JNE         LBB4_10
  3241  	JMP         LBB4_11
  3242  
  3243  LBB4_13:
  3244  	POPQ BX
  3245  	POPQ R12
  3246  	POPQ R13
  3247  	POPQ R14
  3248  	POPQ R15
  3249  	POPQ BP
  3250  	VZEROUPPER
  3251  	RET
  3252  
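        // MatMul_AVX2_F32 is the same broadcast-FMA kernel at 32 floats per
        // inner trip (ANDQ $-32 on the column count).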
  3253  // func MatMul_AVX2_F32(x []float32, y []float32, z []float32, a int, b int, c int)
  3254  // Requires: AVX, AVX2, FMA3
  3255  TEXT ·MatMul_AVX2_F32(SB), $8-96
  3256  	MOVQ  x_base+0(FP), DI
  3257  	MOVQ  y_base+24(FP), SI
  3258  	MOVQ  z_base+48(FP), DX
  3259  	MOVQ  a+72(FP), CX
  3260  	MOVQ  b+80(FP), R8
  3261  	MOVQ  c+88(FP), R9
  3262  	PUSHQ BP
  3263  	PUSHQ R15
  3264  	PUSHQ R14
  3265  	PUSHQ R13
  3266  	PUSHQ R12
  3267  	PUSHQ BX
  3268  	MOVQ  DX, -16(SP)
  3269  	MOVQ  CX, -8(SP)
  3270  	TESTQ CX, CX
  3271  	JE    LBB5_13
  3272  	TESTQ R8, R8
  3273  	JE    LBB5_13
  3274  	TESTQ R9, R9
  3275  	JE    LBB5_13
  3276  	MOVQ  R9, R12
  3277  	ANDQ  $-32, R12
  3278  	MOVQ  -16(SP), AX
  3279  	LEAQ  96(AX), CX
  3280  	XORQ  R15, R15
  3281  	LEAQ  (R15)(R9*4), R11
  3282  	LEAQ  96(DI), BX
  3283  	XORL  R14, R14
  3284  	JMP   LBB5_4
  3285  
  3286  LBB5_12:
  3287  	ADDQ $0x01, R14
  3288  	ADDQ R11, BX
  3289  	ADDQ R11, DI
  3290  	CMPQ R14, -8(SP)
  3291  	JE   LBB5_13
  3292  
  3293  LBB5_4:
  3294  	MOVQ  R14, R15
  3295  	IMULQ R8, R15
  3296  	MOVQ  -16(SP), R13
  3297  	MOVQ  CX, AX
  3298  	XORL  BP, BP
  3299  	JMP   LBB5_5
  3300  
  3301  LBB5_11:
  3302  	ADDQ $0x01, BP
  3303  	ADDQ R11, AX
  3304  	ADDQ R11, R13
  3305  	CMPQ BP, R8
  3306  	JE   LBB5_12
  3307  
  3308  LBB5_5:
  3309  	LEAQ   (R15)(BP*1), DX
  3310  	VMOVSS (SI)(DX*4), X0
  3311  	CMPQ   R9, $0x20
  3312  	JAE    LBB5_7
  3313  	XORL   DX, DX
  3314  	JMP    LBB5_10
  3315  
  3316  LBB5_7:
  3317  	VBROADCASTSS X0, Y1
  3318  	XORL         R10, R10
  3319  
  3320  LBB5_8:
  3321  	VMOVUPS     -96(AX)(R10*4), Y2
  3322  	VMOVUPS     -64(AX)(R10*4), Y3
  3323  	VMOVUPS     -32(AX)(R10*4), Y4
  3324  	VMOVUPS     (AX)(R10*4), Y5
  3325  	VFMADD213PS -96(BX)(R10*4), Y1, Y2
  3326  	VFMADD213PS -64(BX)(R10*4), Y1, Y3
  3327  	VFMADD213PS -32(BX)(R10*4), Y1, Y4
  3328  	VFMADD213PS (BX)(R10*4), Y1, Y5
  3329  	VMOVUPS     Y2, -96(BX)(R10*4)
  3330  	VMOVUPS     Y3, -64(BX)(R10*4)
  3331  	VMOVUPS     Y4, -32(BX)(R10*4)
  3332  	VMOVUPS     Y5, (BX)(R10*4)
  3333  	ADDQ        $0x20, R10
  3334  	CMPQ        R12, R10
  3335  	JNE         LBB5_8
  3336  	MOVQ        R12, DX
  3337  	CMPQ        R12, R9
  3338  	JE          LBB5_11
  3339  
  3340  LBB5_10:
  3341  	VMOVSS      (R13)(DX*4), X1
  3342  	VFMADD213SS (DI)(DX*4), X0, X1
  3343  	VMOVSS      X1, (DI)(DX*4)
  3344  	ADDQ        $0x01, DX
  3345  	CMPQ        R9, DX
  3346  	JNE         LBB5_10
  3347  	JMP         LBB5_11
  3348  
  3349  LBB5_13:
  3350  	POPQ BX
  3351  	POPQ R12
  3352  	POPQ R13
  3353  	POPQ R14
  3354  	POPQ R15
  3355  	POPQ BP
  3356  	VZEROUPPER
  3357  	RET
  3358  
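        // MatMulVec_AVX2_F64 computes x[i] += dot(row i of y, z) for an a*b
        // matrix y and b-vector z: VMOVQ seeds lane 0 of the accumulator
        // with the existing x[i], the four-accumulator FMA loop mirrors
        // Dot_AVX2_F64, and the reduced sum is stored back at LBB6_9.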
  3359  // func MatMulVec_AVX2_F64(x []float64, y []float64, z []float64, a int, b int)
  3360  // Requires: AVX, FMA3
  3361  TEXT ·MatMulVec_AVX2_F64(SB), $0-88
  3362  	MOVQ  x_base+0(FP), DI
  3363  	MOVQ  y_base+24(FP), SI
  3364  	MOVQ  z_base+48(FP), DX
  3365  	MOVQ  a+72(FP), CX
  3366  	MOVQ  b+80(FP), R8
  3367  	PUSHQ BX
  3368  	TESTQ CX, CX
  3369  	JE    LBB6_10
  3370  	TESTQ R8, R8
  3371  	JE    LBB6_10
  3372  	MOVQ  R8, R9
  3373  	ANDQ  $-16, R9
  3374  	LEAQ  96(SI), AX
  3375  	XORQ  R10, R10
  3376  	LEAQ  (R10)(R8*8), R10
  3377  	XORL  R11, R11
  3378  	JMP   LBB6_3
  3379  
  3380  LBB6_9:
  3381  	VMOVSD X0, (DI)(R11*8)
  3382  	ADDQ   $0x01, R11
  3383  	ADDQ   R10, AX
  3384  	ADDQ   R10, SI
  3385  	CMPQ   R11, CX
  3386  	JE     LBB6_10
  3387  
  3388  LBB6_3:
  3389  	VMOVQ (DI)(R11*8), X0
  3390  	CMPQ  R8, $0x10
  3391  	JAE   LBB6_5
  3392  	XORL  BX, BX
  3393  	JMP   LBB6_8
  3394  
  3395  LBB6_5:
  3396  	VMOVQ  X0, X0
  3397  	VXORPD X1, X1, X1
  3398  	XORL   BX, BX
  3399  	VXORPD X2, X2, X2
  3400  	VXORPD X3, X3, X3
  3401  
  3402  LBB6_6:
  3403  	VMOVUPD      (DX)(BX*8), Y4
  3404  	VMOVUPD      32(DX)(BX*8), Y5
  3405  	VMOVUPD      64(DX)(BX*8), Y6
  3406  	VMOVUPD      96(DX)(BX*8), Y7
  3407  	VFMADD231PD  -96(AX)(BX*8), Y4, Y0
  3408  	VFMADD231PD  -64(AX)(BX*8), Y5, Y1
  3409  	VFMADD231PD  -32(AX)(BX*8), Y6, Y2
  3410  	VFMADD231PD  (AX)(BX*8), Y7, Y3
  3411  	ADDQ         $0x10, BX
  3412  	CMPQ         R9, BX
  3413  	JNE          LBB6_6
  3414  	VADDPD       Y0, Y1, Y0
  3415  	VADDPD       Y0, Y2, Y0
  3416  	VADDPD       Y0, Y3, Y0
  3417  	VEXTRACTF128 $0x01, Y0, X1
  3418  	VADDPD       X1, X0, X0
  3419  	VPERMILPD    $0x01, X0, X1
  3420  	VADDSD       X1, X0, X0
  3421  	MOVQ         R9, BX
  3422  	CMPQ         R9, R8
  3423  	JE           LBB6_9
  3424  
  3425  LBB6_8:
  3426  	VMOVSD      (DX)(BX*8), X1
  3427  	VFMADD231SD (SI)(BX*8), X1, X0
  3428  	ADDQ        $0x01, BX
  3429  	CMPQ        R8, BX
  3430  	JNE         LBB6_8
  3431  	JMP         LBB6_9
  3432  
  3433  LBB6_10:
  3434  	POPQ BX
  3435  	VZEROUPPER
  3436  	RET
  3437  
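        // The float32 variant seeds the accumulator with VBLENDPS $0x01,
        // splicing the existing x[i] into lane 0 of a zeroed register before
        // the 32-floats-per-trip FMA loop.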
  3438  // func MatMulVec_AVX2_F32(x []float32, y []float32, z []float32, a int, b int)
  3439  // Requires: AVX, FMA3
  3440  TEXT ·MatMulVec_AVX2_F32(SB), $0-88
  3441  	MOVQ   x_base+0(FP), DI
  3442  	MOVQ   y_base+24(FP), SI
  3443  	MOVQ   z_base+48(FP), DX
  3444  	MOVQ   a+72(FP), CX
  3445  	MOVQ   b+80(FP), R8
  3446  	PUSHQ  BX
  3447  	TESTQ  CX, CX
  3448  	JE     LBB7_10
  3449  	TESTQ  R8, R8
  3450  	JE     LBB7_10
  3451  	MOVQ   R8, R9
  3452  	ANDQ   $-32, R9
  3453  	LEAQ   96(SI), AX
  3454  	XORQ   R10, R10
  3455  	LEAQ   (R10)(R8*4), R10
  3456  	XORL   R11, R11
  3457  	VXORPS X0, X0, X0
  3458  	JMP    LBB7_3
  3459  
  3460  LBB7_9:
  3461  	VMOVSS X1, (DI)(R11*4)
  3462  	ADDQ   $0x01, R11
  3463  	ADDQ   R10, AX
  3464  	ADDQ   R10, SI
  3465  	CMPQ   R11, CX
  3466  	JE     LBB7_10
  3467  
  3468  LBB7_3:
  3469  	VMOVSS (DI)(R11*4), X1
  3470  	CMPQ   R8, $0x20
  3471  	JAE    LBB7_5
  3472  	XORL   BX, BX
  3473  	JMP    LBB7_8
  3474  
  3475  LBB7_5:
  3476  	VBLENDPS $0x01, X1, X0, X1
  3477  	VXORPS   X2, X2, X2
  3478  	XORL     BX, BX
  3479  	VXORPS   X3, X3, X3
  3480  	VXORPS   X4, X4, X4
  3481  
  3482  LBB7_6:
  3483  	VMOVUPS      (DX)(BX*4), Y5
  3484  	VMOVUPS      32(DX)(BX*4), Y6
  3485  	VMOVUPS      64(DX)(BX*4), Y7
  3486  	VMOVUPS      96(DX)(BX*4), Y8
  3487  	VFMADD231PS  -96(AX)(BX*4), Y5, Y1
  3488  	VFMADD231PS  -64(AX)(BX*4), Y6, Y2
  3489  	VFMADD231PS  -32(AX)(BX*4), Y7, Y3
  3490  	VFMADD231PS  (AX)(BX*4), Y8, Y4
  3491  	ADDQ         $0x20, BX
  3492  	CMPQ         R9, BX
  3493  	JNE          LBB7_6
  3494  	VADDPS       Y1, Y2, Y1
  3495  	VADDPS       Y1, Y3, Y1
  3496  	VADDPS       Y1, Y4, Y1
  3497  	VEXTRACTF128 $0x01, Y1, X2
  3498  	VADDPS       X2, X1, X1
  3499  	VPERMILPD    $0x01, X1, X2
  3500  	VADDPS       X2, X1, X1
  3501  	VMOVSHDUP    X1, X2
  3502  	VADDSS       X2, X1, X1
  3503  	MOVQ         R9, BX
  3504  	CMPQ         R9, R8
  3505  	JE           LBB7_9
  3506  
  3507  LBB7_8:
  3508  	VMOVSS      (DX)(BX*4), X2
  3509  	VFMADD231SS (SI)(BX*4), X2, X1
  3510  	ADDQ        $0x01, BX
  3511  	CMPQ        R8, BX
  3512  	JNE         LBB7_8
  3513  	JMP         LBB7_9
  3514  
  3515  LBB7_10:
  3516  	POPQ BX
  3517  	VZEROUPPER
  3518  	RET
  3519  
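        // MatMulTiled_AVX2_F64 is the cache-blocked form of the same x += y*z
        // update: the row/depth/column loops are tiled 8 x 256 x 256 (the
        // ADDQ $0x07 / $0xff bound adjustments and the CMOVQGT/CMOVQLT min
        // computations clamp each tile to the matrix edges), and the inner
        // kernel at LBB8_13 is the same broadcast-FMA loop as MatMul_AVX2_F64.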
  3520  // func MatMulTiled_AVX2_F64(x []float64, y []float64, z []float64, a int, b int, c int)
  3521  // Requires: AVX, AVX2, CMOV, FMA3
  3522  TEXT ·MatMulTiled_AVX2_F64(SB), $8-96
  3523  	MOVQ  x_base+0(FP), DI
  3524  	MOVQ  y_base+24(FP), SI
  3525  	MOVQ  z_base+48(FP), DX
  3526  	MOVQ  a+72(FP), CX
  3527  	MOVQ  b+80(FP), R8
  3528  	MOVQ  c+88(FP), R9
  3529  	PUSHQ BP
  3530  	PUSHQ R15
  3531  	PUSHQ R14
  3532  	PUSHQ R13
  3533  	PUSHQ R12
  3534  	PUSHQ BX
  3535  	SUBQ  $0x48, SP
  3536  	MOVQ  R9, -128(SP)
  3537  	MOVQ  R8, -104(SP)
  3538  	MOVQ  DX, -88(SP)
  3539  	MOVQ  DI, -112(SP)
  3540  	MOVQ  CX, -64(SP)
  3541  	ADDQ  $0x07, CX
  3542  	MOVQ  CX, -72(SP)
  3543  	JE    LBB8_21
  3544  	MOVQ  -104(SP), AX
  3545  	ADDQ  $0xff, AX
  3546  	MOVQ  AX, 8(SP)
  3547  	JE    LBB8_21
  3548  	MOVQ  -128(SP), AX
  3549  	ADDQ  $0xff, AX
  3550  	MOVQ  AX, -40(SP)
  3551  	JE    LBB8_21
  3552  	MOVQ  -88(SP), AX
  3553  	ADDQ  $0x60, AX
  3554  	MOVQ  AX, -48(SP)
  3555  	MOVQ  -128(SP), AX
  3556  	XORQ  R15, R15
  3557  	LEAQ  (R15)(AX*8), BX
  3558  	MOVQ  -112(SP), CX
  3559  	ADDQ  $0x60, CX
  3560  	MOVQ  CX, -96(SP)
  3561  	SHLQ  $0x06, AX
  3562  	MOVQ  AX, -80(SP)
  3563  	XORL  DX, DX
  3564  	JMP   LBB8_4
  3565  
  3566  LBB8_20:
  3567  	MOVQ -80(SP), AX
  3568  	ADDQ AX, -96(SP)
  3569  	ADDQ AX, -112(SP)
  3570  	MOVQ -56(SP), AX
  3571  	MOVQ AX, DX
  3572  	CMPQ AX, -72(SP)
  3573  	JAE  LBB8_21
  3574  
  3575  LBB8_4:
  3576  	LEAQ    8(DX), AX
  3577  	MOVQ    -64(SP), CX
  3578  	CMPQ    AX, CX
  3579  	MOVQ    AX, -56(SP)
  3580  	CMOVQGT CX, AX
  3581  	CDQE
  3582  	MOVQ    DX, -16(SP)
  3583  	MOVQ    AX, 24(SP)
  3584  	CMPQ    DX, AX
  3585  	JAE     LBB8_20
  3586  	XORL    AX, AX
  3587  	MOVQ    AX, -120(SP)
  3588  	MOVL    $+256, DX
  3589  	XORL    AX, AX
  3590  	JMP     LBB8_6
  3591  
  3592  LBB8_19:
  3593  	MOVQ -120(SP), AX
  3594  	ADDL $0x01, AX
  3595  	MOVQ AX, -120(SP)
  3596  	MOVQ -24(SP), DX
  3597  	ADDQ $+256, DX
  3598  	MOVQ -32(SP), AX
  3599  	CMPQ AX, -40(SP)
  3600  	JAE  LBB8_20
  3601  
  3602  LBB8_6:
  3603  	MOVL    AX, DI
  3604  	MOVQ    -128(SP), BP
  3605  	CMPQ    BP, DX
  3606  	MOVQ    DX, -24(SP)
  3607  	CMOVQLT BP, DX
  3608  	ADDQ    $+256, AX
  3609  	CMPQ    BP, AX
  3610  	MOVQ    AX, CX
  3611  	CMOVQLT BP, CX
  3612  	MOVQ    AX, -32(SP)
  3613  	CMOVQLT BP, AX
  3614  	CMPL    DI, AX
  3615  	JGE     LBB8_19
  3616  	MOVLQSX DI, R14
  3617  	MOVQ    -96(SP), DI
  3618  	LEAQ    (DI)(R14*8), DI
  3619  	MOVQ    DI, (SP)
  3620  	MOVLQSX DX, R11
  3621  	SUBQ    R14, R11
  3622  	ANDQ    $-16, R11
  3623  	MOVLQSX CX, R12
  3624  	MOVQ    -120(SP), CX
  3625  	SHLL    $0x08, CX
  3626  	MOVLQSX CX, CX
  3627  	SUBQ    CX, R12
  3628  	MOVLQSX AX, DX
  3629  	MOVQ    R12, CX
  3630  	ANDQ    $-16, CX
  3631  	MOVQ    -48(SP), AX
  3632  	LEAQ    (AX)(R14*8), AX
  3633  	MOVQ    AX, -8(SP)
  3634  	MOVQ    R14, R13
  3635  	MOVQ    CX, 64(SP)
  3636  	ADDQ    CX, R13
  3637  	XORL    AX, AX
  3638  	JMP     LBB8_8
  3639  
  3640  LBB8_18:
  3641  	MOVQ 16(SP), AX
  3642  	CMPQ AX, 8(SP)
  3643  	JAE  LBB8_19
  3644  
  3645  LBB8_8:
  3646  	MOVL    AX, CX
  3647  	ADDQ    $+256, AX
  3648  	MOVQ    -104(SP), DI
  3649  	CMPQ    AX, DI
  3650  	MOVQ    AX, 16(SP)
  3651  	CMOVQGT DI, AX
  3652  	CMPL    CX, AX
  3653  	JGE     LBB8_18
  3654  	MOVLQSX CX, DI
  3655  	MOVQ    -128(SP), CX
  3656  	MOVQ    DI, 48(SP)
  3657  	IMULQ   DI, CX
  3658  	MOVQ    -88(SP), DI
  3659  	LEAQ    (DI)(CX*8), DI
  3660  	MOVQ    DI, 40(SP)
  3661  	MOVQ    -8(SP), DI
  3662  	LEAQ    (DI)(CX*8), CX
  3663  	MOVQ    CX, 32(SP)
  3664  	CDQE
  3665  	MOVQ    -112(SP), CX
  3666  	MOVQ    (SP), R10
  3667  	MOVQ    -16(SP), R8
  3668  	JMP     LBB8_10
  3669  
  3670  LBB8_17:
  3671  	MOVQ 56(SP), R8
  3672  	ADDQ $0x01, R8
  3673  	ADDQ BX, R10
  3674  	ADDQ BX, CX
  3675  	CMPQ R8, 24(SP)
  3676  	JAE  LBB8_18
  3677  
  3678  LBB8_10:
  3679  	MOVQ  R8, 56(SP)
  3680  	IMULQ -104(SP), R8
  3681  	MOVQ  40(SP), R15
  3682  	MOVQ  32(SP), DI
  3683  	MOVQ  48(SP), R9
  3684  	JMP   LBB8_11
  3685  
  3686  LBB8_16:
  3687  	ADDQ $0x01, R9
  3688  	ADDQ BX, DI
  3689  	ADDQ BX, R15
  3690  	CMPQ R9, AX
  3691  	JGE  LBB8_17
  3692  
  3693  LBB8_11:
  3694  	LEAQ         (R9)(R8*1), BP
  3695  	VMOVSD       (SI)(BP*8), X0
  3696  	MOVQ         R14, BP
  3697  	CMPQ         R12, $0x10
  3698  	JB           LBB8_15
  3699  	VBROADCASTSD X0, Y1
  3700  	XORL         BP, BP
  3701  
  3702  LBB8_13:
  3703  	VMOVUPD     -96(DI)(BP*8), Y2
  3704  	VMOVUPD     -64(DI)(BP*8), Y3
  3705  	VMOVUPD     -32(DI)(BP*8), Y4
  3706  	VMOVUPD     (DI)(BP*8), Y5
  3707  	VFMADD213PD -96(R10)(BP*8), Y1, Y2
  3708  	VFMADD213PD -64(R10)(BP*8), Y1, Y3
  3709  	VFMADD213PD -32(R10)(BP*8), Y1, Y4
  3710  	VFMADD213PD (R10)(BP*8), Y1, Y5
  3711  	VMOVUPD     Y2, -96(R10)(BP*8)
  3712  	VMOVUPD     Y3, -64(R10)(BP*8)
  3713  	VMOVUPD     Y4, -32(R10)(BP*8)
  3714  	VMOVUPD     Y5, (R10)(BP*8)
  3715  	ADDQ        $0x10, BP
  3716  	CMPQ        R11, BP
  3717  	JNE         LBB8_13
  3718  	MOVQ        R13, BP
  3719  	CMPQ        R12, 64(SP)
  3720  	JE          LBB8_16
  3721  
  3722  LBB8_15:
  3723  	VMOVSD      (R15)(BP*8), X1
  3724  	VFMADD213SD (CX)(BP*8), X0, X1
  3725  	VMOVSD      X1, (CX)(BP*8)
  3726  	ADDQ        $0x01, BP
  3727  	CMPQ        BP, DX
  3728  	JL          LBB8_15
  3729  	JMP         LBB8_16
  3730  
  3731  LBB8_21:
  3732  	ADDQ $0x48, SP
  3733  	POPQ BX
  3734  	POPQ R12
  3735  	POPQ R13
  3736  	POPQ R14
  3737  	POPQ R15
  3738  	POPQ BP
  3739  	VZEROUPPER
  3740  	RET
  3741  
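        // Float32 tiled variant: identical 8 x 256 x 256 blocking with the
        // 32-float inner kernel.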
  3742  // func MatMulTiled_AVX2_F32(x []float32, y []float32, z []float32, a int, b int, c int)
  3743  // Requires: AVX, AVX2, CMOV, FMA3
  3744  TEXT ·MatMulTiled_AVX2_F32(SB), $8-96
  3745  	MOVQ  x_base+0(FP), DI
  3746  	MOVQ  y_base+24(FP), SI
  3747  	MOVQ  z_base+48(FP), DX
  3748  	MOVQ  a+72(FP), CX
  3749  	MOVQ  b+80(FP), R8
  3750  	MOVQ  c+88(FP), R9
  3751  	PUSHQ BP
  3752  	PUSHQ R15
  3753  	PUSHQ R14
  3754  	PUSHQ R13
  3755  	PUSHQ R12
  3756  	PUSHQ BX
  3757  	SUBQ  $0x48, SP
  3758  	MOVQ  R9, -128(SP)
  3759  	MOVQ  R8, -104(SP)
  3760  	MOVQ  DX, -88(SP)
  3761  	MOVQ  DI, -112(SP)
  3762  	MOVQ  CX, -64(SP)
  3763  	ADDQ  $0x07, CX
  3764  	MOVQ  CX, -72(SP)
  3765  	JE    LBB9_21
  3766  	MOVQ  -104(SP), AX
  3767  	ADDQ  $0xff, AX
  3768  	MOVQ  AX, 8(SP)
  3769  	JE    LBB9_21
  3770  	MOVQ  -128(SP), AX
  3771  	ADDQ  $0xff, AX
  3772  	MOVQ  AX, -40(SP)
  3773  	JE    LBB9_21
  3774  	MOVQ  -88(SP), AX
  3775  	ADDQ  $0x60, AX
  3776  	MOVQ  AX, -48(SP)
  3777  	MOVQ  -128(SP), AX
  3778  	XORQ  R15, R15
  3779  	LEAQ  (R15)(AX*4), BX
  3780  	MOVQ  -112(SP), CX
  3781  	ADDQ  $0x60, CX
  3782  	MOVQ  CX, -96(SP)
  3783  	SHLQ  $0x05, AX
  3784  	MOVQ  AX, -80(SP)
  3785  	XORL  DX, DX
  3786  	JMP   LBB9_4
  3787  
  3788  LBB9_20:
  3789  	MOVQ -80(SP), AX
  3790  	ADDQ AX, -96(SP)
  3791  	ADDQ AX, -112(SP)
  3792  	MOVQ -56(SP), AX
  3793  	MOVQ AX, DX
  3794  	CMPQ AX, -72(SP)
  3795  	JAE  LBB9_21
  3796  
  3797  LBB9_4:
  3798  	LEAQ    8(DX), AX
  3799  	MOVQ    -64(SP), CX
  3800  	CMPQ    AX, CX
  3801  	MOVQ    AX, -56(SP)
  3802  	CMOVQGT CX, AX
  3803  	CDQE
  3804  	MOVQ    DX, -16(SP)
  3805  	MOVQ    AX, 24(SP)
  3806  	CMPQ    DX, AX
  3807  	JAE     LBB9_20
  3808  	XORL    AX, AX
  3809  	MOVQ    AX, -120(SP)
  3810  	MOVL    $+256, DX
  3811  	XORL    AX, AX
  3812  	JMP     LBB9_6
  3813  
  3814  LBB9_19:
  3815  	MOVQ -120(SP), AX
  3816  	ADDL $0x01, AX
  3817  	MOVQ AX, -120(SP)
  3818  	MOVQ -24(SP), DX
  3819  	ADDQ $+256, DX
  3820  	MOVQ -32(SP), AX
  3821  	CMPQ AX, -40(SP)
  3822  	JAE  LBB9_20
  3823  
  3824  LBB9_6:
  3825  	MOVL    AX, DI
  3826  	MOVQ    -128(SP), BP
  3827  	CMPQ    BP, DX
  3828  	MOVQ    DX, -24(SP)
  3829  	CMOVQLT BP, DX
  3830  	ADDQ    $+256, AX
  3831  	CMPQ    BP, AX
  3832  	MOVQ    AX, CX
  3833  	CMOVQLT BP, CX
  3834  	MOVQ    AX, -32(SP)
  3835  	CMOVQLT BP, AX
  3836  	CMPL    DI, AX
  3837  	JGE     LBB9_19
  3838  	MOVLQSX DI, R14
  3839  	MOVQ    -96(SP), DI
  3840  	LEAQ    (DI)(R14*4), DI
  3841  	MOVQ    DI, (SP)
  3842  	MOVLQSX DX, R11
  3843  	SUBQ    R14, R11
  3844  	ANDQ    $-32, R11
  3845  	MOVLQSX CX, R12
  3846  	MOVQ    -120(SP), CX
  3847  	SHLL    $0x08, CX
  3848  	MOVLQSX CX, CX
  3849  	SUBQ    CX, R12
  3850  	MOVLQSX AX, DX
  3851  	MOVQ    R12, CX
  3852  	ANDQ    $-32, CX
  3853  	MOVQ    -48(SP), AX
  3854  	LEAQ    (AX)(R14*4), AX
  3855  	MOVQ    AX, -8(SP)
  3856  	MOVQ    R14, R13
  3857  	MOVQ    CX, 64(SP)
  3858  	ADDQ    CX, R13
  3859  	XORL    AX, AX
  3860  	JMP     LBB9_8
  3861  
  3862  LBB9_18:
  3863  	MOVQ 16(SP), AX
  3864  	CMPQ AX, 8(SP)
  3865  	JAE  LBB9_19
  3866  
  3867  LBB9_8:
  3868  	MOVL    AX, CX
  3869  	ADDQ    $+256, AX
  3870  	MOVQ    -104(SP), DI
  3871  	CMPQ    AX, DI
  3872  	MOVQ    AX, 16(SP)
  3873  	CMOVQGT DI, AX
  3874  	CMPL    CX, AX
  3875  	JGE     LBB9_18
  3876  	MOVLQSX CX, DI
  3877  	MOVQ    -128(SP), CX
  3878  	MOVQ    DI, 48(SP)
  3879  	IMULQ   DI, CX
  3880  	MOVQ    -88(SP), DI
  3881  	LEAQ    (DI)(CX*4), DI
  3882  	MOVQ    DI, 40(SP)
  3883  	MOVQ    -8(SP), DI
  3884  	LEAQ    (DI)(CX*4), CX
  3885  	MOVQ    CX, 32(SP)
  3886  	CDQE
  3887  	MOVQ    -112(SP), CX
  3888  	MOVQ    (SP), R10
  3889  	MOVQ    -16(SP), R8
  3890  	JMP     LBB9_10
  3891  
  3892  LBB9_17:
  3893  	MOVQ 56(SP), R8
  3894  	ADDQ $0x01, R8
  3895  	ADDQ BX, R10
  3896  	ADDQ BX, CX
  3897  	CMPQ R8, 24(SP)
  3898  	JAE  LBB9_18
  3899  
  3900  LBB9_10:
  3901  	MOVQ  R8, 56(SP)
  3902  	IMULQ -104(SP), R8
  3903  	MOVQ  40(SP), R15
  3904  	MOVQ  32(SP), DI
  3905  	MOVQ  48(SP), R9
  3906  	JMP   LBB9_11
  3907  
  3908  LBB9_16:
  3909  	ADDQ $0x01, R9
  3910  	ADDQ BX, DI
  3911  	ADDQ BX, R15
  3912  	CMPQ R9, AX
  3913  	JGE  LBB9_17
  3914  
  3915  LBB9_11:
  3916  	LEAQ         (R9)(R8*1), BP
  3917  	VMOVSS       (SI)(BP*4), X0
  3918  	MOVQ         R14, BP
  3919  	CMPQ         R12, $0x20
  3920  	JB           LBB9_15
  3921  	VBROADCASTSS X0, Y1
  3922  	XORL         BP, BP
  3923  
  3924  LBB9_13:
  3925  	VMOVUPS     -96(DI)(BP*4), Y2
  3926  	VMOVUPS     -64(DI)(BP*4), Y3
  3927  	VMOVUPS     -32(DI)(BP*4), Y4
  3928  	VMOVUPS     (DI)(BP*4), Y5
  3929  	VFMADD213PS -96(R10)(BP*4), Y1, Y2
  3930  	VFMADD213PS -64(R10)(BP*4), Y1, Y3
  3931  	VFMADD213PS -32(R10)(BP*4), Y1, Y4
  3932  	VFMADD213PS (R10)(BP*4), Y1, Y5
  3933  	VMOVUPS     Y2, -96(R10)(BP*4)
  3934  	VMOVUPS     Y3, -64(R10)(BP*4)
  3935  	VMOVUPS     Y4, -32(R10)(BP*4)
  3936  	VMOVUPS     Y5, (R10)(BP*4)
  3937  	ADDQ        $0x20, BP
  3938  	CMPQ        R11, BP
  3939  	JNE         LBB9_13
  3940  	MOVQ        R13, BP
  3941  	CMPQ        R12, 64(SP)
  3942  	JE          LBB9_16
  3943  
  3944  LBB9_15:
  3945  	VMOVSS      (R15)(BP*4), X1
  3946  	VFMADD213SS (CX)(BP*4), X0, X1
  3947  	VMOVSS      X1, (CX)(BP*4)
  3948  	ADDQ        $0x01, BP
  3949  	CMPQ        BP, DX
  3950  	JL          LBB9_15
  3951  	JMP         LBB9_16
  3952  
  3953  LBB9_21:
  3954  	ADDQ $0x48, SP
  3955  	POPQ BX
  3956  	POPQ R12
  3957  	POPQ R13
  3958  	POPQ R14
  3959  	POPQ R15
  3960  	POPQ BP
  3961  	VZEROUPPER
  3962  	RET
  3963  
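        // Sqrt_AVX2_F64 rewrites x in place using the hardware instruction
        // directly: VSQRTPD on 4 doubles per trip, VSQRTSD for the tail (no
        // constants or refinement needed on the float64 path).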
  3964  // func Sqrt_AVX2_F64(x []float64) float64
  3965  // Requires: AVX, SSE2
  3966  TEXT ·Sqrt_AVX2_F64(SB), NOSPLIT, $0-32
  3967  	MOVQ  x_base+0(FP), DI
  3968  	MOVQ  x_len+8(FP), SI
  3969  	TESTQ SI, SI
  3970  	JE    LBB0_7
  3971  	CMPQ  SI, $0x04
  3972  	JAE   LBB0_3
  3973  	XORL  AX, AX
  3974  	JMP   LBB0_6
  3975  
  3976  LBB0_3:
  3977  	MOVQ SI, AX
  3978  	ANDQ $-4, AX
  3979  	XORL CX, CX
  3980  
  3981  LBB0_4:
  3982  	VSQRTPD (DI)(CX*8), Y0
  3983  	VMOVUPD Y0, (DI)(CX*8)
  3984  	ADDQ    $0x04, CX
  3985  	CMPQ    AX, CX
  3986  	JNE     LBB0_4
  3987  	CMPQ    AX, SI
  3988  	JE      LBB0_7
  3989  
  3990  LBB0_6:
  3991  	VMOVSD  (DI)(AX*8), X0
  3992  	VSQRTSD X0, X0, X0
  3993  	VMOVSD  X0, (DI)(AX*8)
  3994  	ADDQ    $0x01, AX
  3995  	CMPQ    SI, AX
  3996  	JNE     LBB0_6
  3997  
  3998  LBB0_7:
  3999  	VZEROUPPER
  4000  	MOVSD X0, ret+24(FP)
  4001  	RET
  4002  
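        // dataSqrtF32 holds the same four constants as dataNormF32 (-3.0,
        // -0.5, abs mask, smallest normal float32): the float32 path derives
        // sqrt(v) ~= -0.5*v*y*(v*y*y - 3) from VRSQRTPS instead of issuing
        // VSQRTPS, and flushes inputs below the smallest normal to zero.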
  4003  DATA dataSqrtF32<>+0(SB)/4, $0xc0400000
  4004  DATA dataSqrtF32<>+4(SB)/4, $0xbf000000
  4005  DATA dataSqrtF32<>+8(SB)/4, $0x7fffffff
  4006  DATA dataSqrtF32<>+12(SB)/4, $0x00800000
  4007  GLOBL dataSqrtF32<>(SB), RODATA|NOPTR, $16
  4008  
  4009  // func Sqrt_AVX2_F32(x []float32) float32
  4010  // Requires: AVX, FMA3, SSE
  4011  TEXT ·Sqrt_AVX2_F32(SB), NOSPLIT, $0-28
  4012  	MOVQ  x_base+0(FP), DI
  4013  	MOVQ  x_len+8(FP), SI
  4014  	TESTQ SI, SI
  4015  	JE    LBB1_8
  4016  	CMPQ  SI, $0x20
  4017  	JAE   LBB1_3
  4018  	XORL  AX, AX
  4019  	JMP   LBB1_6
  4020  
  4021  LBB1_3:
  4022  	MOVQ         SI, AX
  4023  	ANDQ         $-32, AX
  4024  	XORL         CX, CX
  4025  	VBROADCASTSS dataSqrtF32<>+0(SB), Y0
  4026  	VBROADCASTSS dataSqrtF32<>+4(SB), Y1
  4027  	VBROADCASTSS dataSqrtF32<>+8(SB), Y2
  4028  	VBROADCASTSS dataSqrtF32<>+12(SB), Y3
  4029  
  4030  LBB1_4:
  4031  	VMOVUPS     (DI)(CX*4), Y4
  4032  	VMOVUPS     32(DI)(CX*4), Y5
  4033  	VMOVUPS     64(DI)(CX*4), Y6
  4034  	VRSQRTPS    Y4, Y7
  4035  	VMOVUPS     96(DI)(CX*4), Y8
  4036  	VMULPS      Y7, Y4, Y9
  4037  	VFMADD213PS Y0, Y9, Y7
  4038  	VMULPS      Y1, Y9, Y9
  4039  	VMULPS      Y7, Y9, Y7
  4040  	VANDPS      Y2, Y4, Y4
  4041  	VCMPPS      $0x02, Y4, Y3, Y4
  4042  	VANDPS      Y7, Y4, Y4
  4043  	VRSQRTPS    Y5, Y7
  4044  	VMULPS      Y7, Y5, Y9
  4045  	VFMADD213PS Y0, Y9, Y7
  4046  	VMULPS      Y1, Y9, Y9
  4047  	VMULPS      Y7, Y9, Y7
  4048  	VANDPS      Y2, Y5, Y5
  4049  	VCMPPS      $0x02, Y5, Y3, Y5
  4050  	VRSQRTPS    Y6, Y9
  4051  	VANDPS      Y7, Y5, Y5
  4052  	VMULPS      Y6, Y9, Y7
  4053  	VFMADD213PS Y0, Y7, Y9
  4054  	VMULPS      Y1, Y7, Y7
  4055  	VMULPS      Y7, Y9, Y7
  4056  	VANDPS      Y2, Y6, Y6
  4057  	VCMPPS      $0x02, Y6, Y3, Y6
  4058  	VANDPS      Y7, Y6, Y6
  4059  	VRSQRTPS    Y8, Y7
  4060  	VMULPS      Y7, Y8, Y9
  4061  	VFMADD213PS Y0, Y9, Y7
  4062  	VMULPS      Y1, Y9, Y9
  4063  	VMULPS      Y7, Y9, Y7
  4064  	VANDPS      Y2, Y8, Y8
  4065  	VCMPPS      $0x02, Y8, Y3, Y8
  4066  	VANDPS      Y7, Y8, Y7
  4067  	VMOVUPS     Y4, (DI)(CX*4)
  4068  	VMOVUPS     Y5, 32(DI)(CX*4)
  4069  	VMOVUPS     Y6, 64(DI)(CX*4)
  4070  	VMOVUPS     Y7, 96(DI)(CX*4)
  4071  	ADDQ        $0x20, CX
  4072  	CMPQ        AX, CX
  4073  	JNE         LBB1_4
  4074  	CMPQ        AX, SI
  4075  	JE          LBB1_8
  4076  
  4077  LBB1_6:
  4078  	VMOVSS       dataSqrtF32<>+0(SB), X0
  4079  	VMOVSS       dataSqrtF32<>+4(SB), X1
  4080  	VBROADCASTSS dataSqrtF32<>+8(SB), X2
  4081  	VMOVSS       dataSqrtF32<>+12(SB), X3
  4082  
  4083  LBB1_7:
  4084  	VMOVSS      (DI)(AX*4), X4
  4085  	VRSQRTSS    X4, X4, X5
  4086  	VMULSS      X5, X4, X6
  4087  	VFMADD213SS X0, X6, X5
  4088  	VMULSS      X1, X6, X6
  4089  	VMULSS      X5, X6, X5
  4090  	VANDPS      X2, X4, X4
  4091  	VCMPSS      $0x01, X3, X4, X4
  4092  	VANDNPS     X5, X4, X4
  4093  	VMOVSS      X4, (DI)(AX*4)
  4094  	ADDQ        $0x01, AX
  4095  	CMPQ        SI, AX
  4096  	JNE         LBB1_7
  4097  
  4098  LBB1_8:
  4099  	VZEROUPPER
  4100  	MOVSS X0, ret+24(FP)
  4101  	RET
  4102  
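        // dataRoundF64: $0x8000000000000000 is the float64 sign mask and
        // $0x3fdfffffffffffff is the largest double below 0.5 (about
        // 0.49999999999999994). Round_AVX2_F64 rounds half away from zero by
        // adding copysign(0.49999..., v) and truncating with VROUNDPD $0x0b
        // (round toward zero, precision exceptions suppressed); using the
        // value just under 0.5 keeps inputs like 0.49999999999999994 itself
        // from incorrectly rounding up to 1. The extra table entries cover
        // the 16-byte tail loads at offsets 16 and 8.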
  4103  DATA dataRoundF64<>+0(SB)/8, $0x8000000000000000
  4104  DATA dataRoundF64<>+8(SB)/8, $0x3fdfffffffffffff
  4105  DATA dataRoundF64<>+16(SB)/8, $0x8000000000000000
  4106  DATA dataRoundF64<>+24(SB)/8, $0x8000000000000000
  4107  GLOBL dataRoundF64<>(SB), RODATA|NOPTR, $32
  4108  
  4109  // func Round_AVX2_F64(x []float64) float64
  4110  // Requires: AVX, SSE2
  4111  TEXT ·Round_AVX2_F64(SB), NOSPLIT, $0-32
  4112  	MOVQ  x_base+0(FP), DI
  4113  	MOVQ  x_len+8(FP), SI
  4114  	TESTQ SI, SI
  4115  	JE    LBB2_8
  4116  	CMPQ  SI, $0x10
  4117  	JAE   LBB2_3
  4118  	XORL  AX, AX
  4119  	JMP   LBB2_6
  4120  
  4121  LBB2_3:
  4122  	MOVQ         SI, AX
  4123  	ANDQ         $-16, AX
  4124  	XORL         CX, CX
  4125  	VBROADCASTSD dataRoundF64<>+0(SB), Y0
  4126  	VBROADCASTSD dataRoundF64<>+8(SB), Y1
  4127  
  4128  LBB2_4:
  4129  	VMOVUPD  (DI)(CX*8), Y2
  4130  	VMOVUPD  32(DI)(CX*8), Y3
  4131  	VMOVUPD  64(DI)(CX*8), Y4
  4132  	VMOVUPD  96(DI)(CX*8), Y5
  4133  	VANDPD   Y0, Y2, Y6
  4134  	VORPD    Y1, Y6, Y6
  4135  	VADDPD   Y6, Y2, Y2
  4136  	VROUNDPD $0x0b, Y2, Y2
  4137  	VANDPD   Y0, Y3, Y6
  4138  	VORPD    Y1, Y6, Y6
  4139  	VADDPD   Y6, Y3, Y3
  4140  	VROUNDPD $0x0b, Y3, Y3
  4141  	VANDPD   Y0, Y4, Y6
  4142  	VORPD    Y1, Y6, Y6
  4143  	VADDPD   Y6, Y4, Y4
  4144  	VROUNDPD $0x0b, Y4, Y4
  4145  	VANDPD   Y0, Y5, Y6
  4146  	VORPD    Y1, Y6, Y6
  4147  	VADDPD   Y6, Y5, Y5
  4148  	VROUNDPD $0x0b, Y5, Y5
  4149  	VMOVUPD  Y2, (DI)(CX*8)
  4150  	VMOVUPD  Y3, 32(DI)(CX*8)
  4151  	VMOVUPD  Y4, 64(DI)(CX*8)
  4152  	VMOVUPD  Y5, 96(DI)(CX*8)
  4153  	ADDQ     $0x10, CX
  4154  	CMPQ     AX, CX
  4155  	JNE      LBB2_4
  4156  	CMPQ     AX, SI
  4157  	JE       LBB2_8
  4158  
  4159  LBB2_6:
  4160  	VMOVUPD  dataRoundF64<>+16(SB), X0
  4161  	VMOVDDUP dataRoundF64<>+8(SB), X1
  4162  
  4163  LBB2_7:
  4164  	VMOVSD   (DI)(AX*8), X2
  4165  	VANDPD   X0, X2, X3
  4166  	VORPD    X1, X3, X3
  4167  	VADDSD   X3, X2, X2
  4168  	VROUNDSD $0x0b, X2, X2, X2
  4169  	VMOVSD   X2, (DI)(AX*8)
  4170  	ADDQ     $0x01, AX
  4171  	CMPQ     SI, AX
  4172  	JNE      LBB2_7
  4173  
  4174  LBB2_8:
  4175  	VZEROUPPER
  4176  	MOVSD X0, ret+24(FP)
  4177  	RET
  4178  
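// Round_AVX2_F64 rounds half away from zero without branching:
// dataRoundF64<>+0 is the sign-bit mask and +8 (0x3fdfffffffffffff)
// is the largest double below 0.5, so each lane computes
// trunc(x + copysign(0.49999999999999994, x)); the VROUNDPD immediate
// 0x0b selects truncation with the precision exception suppressed.
// Using the value one ulp under 0.5 keeps inputs like
// 0.49999999999999994 (whose sum with an exact 0.5 would round up to
// 1.0) at 0. A scalar Go model (a sketch, not this package's API):
//
//	func round64(v float64) float64 {
//		const halfDown = 0.49999999999999994 // 0x3fdfffffffffffff
//		return math.Trunc(v + math.Copysign(halfDown, v))
//	}
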
  4179  DATA dataRoundF32<>+0(SB)/4, $0x80000000
  4180  DATA dataRoundF32<>+4(SB)/4, $0x3effffff
  4181  GLOBL dataRoundF32<>(SB), RODATA|NOPTR, $8
  4182  
  4183  // func Round_AVX2_F32(x []float32) float32
  4184  // Requires: AVX, SSE
  4185  TEXT ·Round_AVX2_F32(SB), NOSPLIT, $0-28
  4186  	MOVQ  x_base+0(FP), DI
  4187  	MOVQ  x_len+8(FP), SI
  4188  	TESTQ SI, SI
  4189  	JE    LBB3_8
  4190  	CMPQ  SI, $0x20
  4191  	JAE   LBB3_3
  4192  	XORL  AX, AX
  4193  	JMP   LBB3_6
  4194  
  4195  LBB3_3:
  4196  	MOVQ         SI, AX
  4197  	ANDQ         $-32, AX
  4198  	XORL         CX, CX
  4199  	VBROADCASTSS dataRoundF32<>+0(SB), Y0
  4200  	VBROADCASTSS dataRoundF32<>+4(SB), Y1
  4201  
  4202  LBB3_4:
  4203  	VMOVUPS  (DI)(CX*4), Y2
  4204  	VMOVUPS  32(DI)(CX*4), Y3
  4205  	VMOVUPS  64(DI)(CX*4), Y4
  4206  	VMOVUPS  96(DI)(CX*4), Y5
  4207  	VANDPS   Y0, Y2, Y6
  4208  	VORPS    Y1, Y6, Y6
  4209  	VADDPS   Y6, Y2, Y2
  4210  	VROUNDPS $0x0b, Y2, Y2
  4211  	VANDPS   Y0, Y3, Y6
  4212  	VORPS    Y1, Y6, Y6
  4213  	VADDPS   Y6, Y3, Y3
  4214  	VROUNDPS $0x0b, Y3, Y3
  4215  	VANDPS   Y0, Y4, Y6
  4216  	VORPS    Y1, Y6, Y6
  4217  	VADDPS   Y6, Y4, Y4
  4218  	VROUNDPS $0x0b, Y4, Y4
  4219  	VANDPS   Y0, Y5, Y6
  4220  	VORPS    Y1, Y6, Y6
  4221  	VADDPS   Y6, Y5, Y5
  4222  	VROUNDPS $0x0b, Y5, Y5
  4223  	VMOVUPS  Y2, (DI)(CX*4)
  4224  	VMOVUPS  Y3, 32(DI)(CX*4)
  4225  	VMOVUPS  Y4, 64(DI)(CX*4)
  4226  	VMOVUPS  Y5, 96(DI)(CX*4)
  4227  	ADDQ     $0x20, CX
  4228  	CMPQ     AX, CX
  4229  	JNE      LBB3_4
  4230  	CMPQ     AX, SI
  4231  	JE       LBB3_8
  4232  
  4233  LBB3_6:
  4234  	VBROADCASTSS dataRoundF32<>+0(SB), X0
  4235  	VBROADCASTSS dataRoundF32<>+4(SB), X1
  4236  
  4237  LBB3_7:
  4238  	VMOVSS   (DI)(AX*4), X2
  4239  	VANDPS   X0, X2, X3
  4240  	VORPS    X1, X3, X3
  4241  	VADDSS   X3, X2, X2
  4242  	VROUNDSS $0x0b, X2, X2, X2
  4243  	VMOVSS   X2, (DI)(AX*4)
  4244  	ADDQ     $0x01, AX
  4245  	CMPQ     SI, AX
  4246  	JNE      LBB3_7
  4247  
  4248  LBB3_8:
  4249  	VZEROUPPER
  4250  	MOVSS X0, ret+24(FP)
  4251  	RET
  4252  
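// Round_AVX2_F32 is the same trick in single precision: +0 is the
// float32 sign mask and +4 (0x3effffff, roughly 0.49999997) is the
// largest float32 below 0.5, applied 32 lanes per unrolled pass with
// a VROUNDSS scalar tail.
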
  4253  // func Floor_AVX2_F64(x []float64) float64
  4254  // Requires: AVX, SSE2
  4255  TEXT ·Floor_AVX2_F64(SB), NOSPLIT, $0-32
  4256  	MOVQ  x_base+0(FP), DI
  4257  	MOVQ  x_len+8(FP), SI
  4258  	TESTQ SI, SI
  4259  	JE    LBB4_11
  4260  	CMPQ  SI, $0x10
  4261  	JAE   LBB4_3
  4262  	XORL  AX, AX
  4263  	JMP   LBB4_10
  4264  
  4265  LBB4_3:
  4266  	MOVQ  SI, AX
  4267  	ANDQ  $-16, AX
  4268  	LEAQ  -16(AX), CX
  4269  	MOVQ  CX, R8
  4270  	SHRQ  $0x04, R8
  4271  	ADDQ  $0x01, R8
  4272  	TESTQ CX, CX
  4273  	JE    LBB4_4
  4274  	MOVQ  R8, DX
  4275  	ANDQ  $-2, DX
  4276  	XORL  CX, CX
  4277  
  4278  LBB4_6:
  4279  	VROUNDPD $0x09, (DI)(CX*8), Y0
  4280  	VROUNDPD $0x09, 32(DI)(CX*8), Y1
  4281  	VROUNDPD $0x09, 64(DI)(CX*8), Y2
  4282  	VROUNDPD $0x09, 96(DI)(CX*8), Y3
  4283  	VMOVUPD  Y0, (DI)(CX*8)
  4284  	VMOVUPD  Y1, 32(DI)(CX*8)
  4285  	VMOVUPD  Y2, 64(DI)(CX*8)
  4286  	VMOVUPD  Y3, 96(DI)(CX*8)
  4287  	VROUNDPD $0x09, 128(DI)(CX*8), Y0
  4288  	VROUNDPD $0x09, 160(DI)(CX*8), Y1
  4289  	VROUNDPD $0x09, 192(DI)(CX*8), Y2
  4290  	VROUNDPD $0x09, 224(DI)(CX*8), Y3
  4291  	VMOVUPD  Y0, 128(DI)(CX*8)
  4292  	VMOVUPD  Y1, 160(DI)(CX*8)
  4293  	VMOVUPD  Y2, 192(DI)(CX*8)
  4294  	VMOVUPD  Y3, 224(DI)(CX*8)
  4295  	ADDQ     $0x20, CX
  4296  	ADDQ     $-2, DX
  4297  	JNE      LBB4_6
  4298  	TESTB    $0x01, R8
  4299  	JE       LBB4_9
  4300  
  4301  LBB4_8:
  4302  	VROUNDPD $0x09, (DI)(CX*8), Y0
  4303  	VROUNDPD $0x09, 32(DI)(CX*8), Y1
  4304  	VROUNDPD $0x09, 64(DI)(CX*8), Y2
  4305  	VROUNDPD $0x09, 96(DI)(CX*8), Y3
  4306  	VMOVUPD  Y0, (DI)(CX*8)
  4307  	VMOVUPD  Y1, 32(DI)(CX*8)
  4308  	VMOVUPD  Y2, 64(DI)(CX*8)
  4309  	VMOVUPD  Y3, 96(DI)(CX*8)
  4310  
  4311  LBB4_9:
  4312  	CMPQ AX, SI
  4313  	JE   LBB4_11
  4314  
  4315  LBB4_10:
  4316  	VMOVSD   (DI)(AX*8), X0
  4317  	VROUNDSD $0x09, X0, X0, X0
  4318  	VMOVSD   X0, (DI)(AX*8)
  4319  	ADDQ     $0x01, AX
  4320  	CMPQ     SI, AX
  4321  	JNE      LBB4_10
  4322  
  4323  LBB4_11:
  4324  	VZEROUPPER
  4325  	MOVSD X0, ret+24(FP)
  4326  	RET
  4327  
  4328  LBB4_4:
  4329  	XORL  CX, CX
  4330  	TESTB $0x01, R8
  4331  	JNE   LBB4_8
  4332  	JMP   LBB4_9
  4333  
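// Floor_AVX2_F64 is pure VROUNDPD with immediate 0x09 (round toward
// -Inf, precision exception suppressed); the Floor/Ceil variants
// below repeat the same structure with a different immediate: 0x0a
// rounds toward +Inf (ceil), while 0x0b would truncate and 0x08
// rounds to nearest even. The R8/DX bookkeeping is an unroll by two:
// each LBB4_6 pass covers two 16-double blocks, the TESTB $0x01 peel
// (LBB4_8) handles an odd trailing block, and a scalar VROUNDSD loop
// finishes the remainder. Per lane this is simply:
//
//	x[i] = math.Floor(x[i]) // scalar equivalent of one lane
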
  4334  // func Floor_AVX2_F32(x []float32) float32
  4335  // Requires: AVX, SSE
  4336  TEXT ·Floor_AVX2_F32(SB), NOSPLIT, $0-28
  4337  	MOVQ  x_base+0(FP), DI
  4338  	MOVQ  x_len+8(FP), SI
  4339  	TESTQ SI, SI
  4340  	JE    LBB5_11
  4341  	CMPQ  SI, $0x20
  4342  	JAE   LBB5_3
  4343  	XORL  AX, AX
  4344  	JMP   LBB5_10
  4345  
  4346  LBB5_3:
  4347  	MOVQ  SI, AX
  4348  	ANDQ  $-32, AX
  4349  	LEAQ  -32(AX), CX
  4350  	MOVQ  CX, R8
  4351  	SHRQ  $0x05, R8
  4352  	ADDQ  $0x01, R8
  4353  	TESTQ CX, CX
  4354  	JE    LBB5_4
  4355  	MOVQ  R8, DX
  4356  	ANDQ  $-2, DX
  4357  	XORL  CX, CX
  4358  
  4359  LBB5_6:
  4360  	VROUNDPS $0x09, (DI)(CX*4), Y0
  4361  	VROUNDPS $0x09, 32(DI)(CX*4), Y1
  4362  	VROUNDPS $0x09, 64(DI)(CX*4), Y2
  4363  	VROUNDPS $0x09, 96(DI)(CX*4), Y3
  4364  	VMOVUPS  Y0, (DI)(CX*4)
  4365  	VMOVUPS  Y1, 32(DI)(CX*4)
  4366  	VMOVUPS  Y2, 64(DI)(CX*4)
  4367  	VMOVUPS  Y3, 96(DI)(CX*4)
  4368  	VROUNDPS $0x09, 128(DI)(CX*4), Y0
  4369  	VROUNDPS $0x09, 160(DI)(CX*4), Y1
  4370  	VROUNDPS $0x09, 192(DI)(CX*4), Y2
  4371  	VROUNDPS $0x09, 224(DI)(CX*4), Y3
  4372  	VMOVUPS  Y0, 128(DI)(CX*4)
  4373  	VMOVUPS  Y1, 160(DI)(CX*4)
  4374  	VMOVUPS  Y2, 192(DI)(CX*4)
  4375  	VMOVUPS  Y3, 224(DI)(CX*4)
  4376  	ADDQ     $0x40, CX
  4377  	ADDQ     $-2, DX
  4378  	JNE      LBB5_6
  4379  	TESTB    $0x01, R8
  4380  	JE       LBB5_9
  4381  
  4382  LBB5_8:
  4383  	VROUNDPS $0x09, (DI)(CX*4), Y0
  4384  	VROUNDPS $0x09, 32(DI)(CX*4), Y1
  4385  	VROUNDPS $0x09, 64(DI)(CX*4), Y2
  4386  	VROUNDPS $0x09, 96(DI)(CX*4), Y3
  4387  	VMOVUPS  Y0, (DI)(CX*4)
  4388  	VMOVUPS  Y1, 32(DI)(CX*4)
  4389  	VMOVUPS  Y2, 64(DI)(CX*4)
  4390  	VMOVUPS  Y3, 96(DI)(CX*4)
  4391  
  4392  LBB5_9:
  4393  	CMPQ AX, SI
  4394  	JE   LBB5_11
  4395  
  4396  LBB5_10:
  4397  	VMOVSS   (DI)(AX*4), X0
  4398  	VROUNDSS $0x09, X0, X0, X0
  4399  	VMOVSS   X0, (DI)(AX*4)
  4400  	ADDQ     $0x01, AX
  4401  	CMPQ     SI, AX
  4402  	JNE      LBB5_10
  4403  
  4404  LBB5_11:
  4405  	VZEROUPPER
  4406  	MOVSS X0, ret+24(FP)
  4407  	RET
  4408  
  4409  LBB5_4:
  4410  	XORL  CX, CX
  4411  	TESTB $0x01, R8
  4412  	JNE   LBB5_8
  4413  	JMP   LBB5_9
  4414  
  4415  // func Ceil_AVX2_F64(x []float64) float64
  4416  // Requires: AVX, SSE2
  4417  TEXT ·Ceil_AVX2_F64(SB), NOSPLIT, $0-32
  4418  	MOVQ  x_base+0(FP), DI
  4419  	MOVQ  x_len+8(FP), SI
  4420  	TESTQ SI, SI
  4421  	JE    LBB6_11
  4422  	CMPQ  SI, $0x10
  4423  	JAE   LBB6_3
  4424  	XORL  AX, AX
  4425  	JMP   LBB6_10
  4426  
  4427  LBB6_3:
  4428  	MOVQ  SI, AX
  4429  	ANDQ  $-16, AX
  4430  	LEAQ  -16(AX), CX
  4431  	MOVQ  CX, R8
  4432  	SHRQ  $0x04, R8
  4433  	ADDQ  $0x01, R8
  4434  	TESTQ CX, CX
  4435  	JE    LBB6_4
  4436  	MOVQ  R8, DX
  4437  	ANDQ  $-2, DX
  4438  	XORL  CX, CX
  4439  
  4440  LBB6_6:
  4441  	VROUNDPD $0x0a, (DI)(CX*8), Y0
  4442  	VROUNDPD $0x0a, 32(DI)(CX*8), Y1
  4443  	VROUNDPD $0x0a, 64(DI)(CX*8), Y2
  4444  	VROUNDPD $0x0a, 96(DI)(CX*8), Y3
  4445  	VMOVUPD  Y0, (DI)(CX*8)
  4446  	VMOVUPD  Y1, 32(DI)(CX*8)
  4447  	VMOVUPD  Y2, 64(DI)(CX*8)
  4448  	VMOVUPD  Y3, 96(DI)(CX*8)
  4449  	VROUNDPD $0x0a, 128(DI)(CX*8), Y0
  4450  	VROUNDPD $0x0a, 160(DI)(CX*8), Y1
  4451  	VROUNDPD $0x0a, 192(DI)(CX*8), Y2
  4452  	VROUNDPD $0x0a, 224(DI)(CX*8), Y3
  4453  	VMOVUPD  Y0, 128(DI)(CX*8)
  4454  	VMOVUPD  Y1, 160(DI)(CX*8)
  4455  	VMOVUPD  Y2, 192(DI)(CX*8)
  4456  	VMOVUPD  Y3, 224(DI)(CX*8)
  4457  	ADDQ     $0x20, CX
  4458  	ADDQ     $-2, DX
  4459  	JNE      LBB6_6
  4460  	TESTB    $0x01, R8
  4461  	JE       LBB6_9
  4462  
  4463  LBB6_8:
  4464  	VROUNDPD $0x0a, (DI)(CX*8), Y0
  4465  	VROUNDPD $0x0a, 32(DI)(CX*8), Y1
  4466  	VROUNDPD $0x0a, 64(DI)(CX*8), Y2
  4467  	VROUNDPD $0x0a, 96(DI)(CX*8), Y3
  4468  	VMOVUPD  Y0, (DI)(CX*8)
  4469  	VMOVUPD  Y1, 32(DI)(CX*8)
  4470  	VMOVUPD  Y2, 64(DI)(CX*8)
  4471  	VMOVUPD  Y3, 96(DI)(CX*8)
  4472  
  4473  LBB6_9:
  4474  	CMPQ AX, SI
  4475  	JE   LBB6_11
  4476  
  4477  LBB6_10:
  4478  	VMOVSD   (DI)(AX*8), X0
  4479  	VROUNDSD $0x0a, X0, X0, X0
  4480  	VMOVSD   X0, (DI)(AX*8)
  4481  	ADDQ     $0x01, AX
  4482  	CMPQ     SI, AX
  4483  	JNE      LBB6_10
  4484  
  4485  LBB6_11:
  4486  	VZEROUPPER
  4487  	MOVSD X0, ret+24(FP)
  4488  	RET
  4489  
  4490  LBB6_4:
  4491  	XORL  CX, CX
  4492  	TESTB $0x01, R8
  4493  	JNE   LBB6_8
  4494  	JMP   LBB6_9
  4495  
  4496  // func Ceil_AVX2_F32(x []float32) float32
  4497  // Requires: AVX, SSE
  4498  TEXT ·Ceil_AVX2_F32(SB), NOSPLIT, $0-28
  4499  	MOVQ  x_base+0(FP), DI
  4500  	MOVQ  x_len+8(FP), SI
  4501  	TESTQ SI, SI
  4502  	JE    LBB7_11
  4503  	CMPQ  SI, $0x20
  4504  	JAE   LBB7_3
  4505  	XORL  AX, AX
  4506  	JMP   LBB7_10
  4507  
  4508  LBB7_3:
  4509  	MOVQ  SI, AX
  4510  	ANDQ  $-32, AX
  4511  	LEAQ  -32(AX), CX
  4512  	MOVQ  CX, R8
  4513  	SHRQ  $0x05, R8
  4514  	ADDQ  $0x01, R8
  4515  	TESTQ CX, CX
  4516  	JE    LBB7_4
  4517  	MOVQ  R8, DX
  4518  	ANDQ  $-2, DX
  4519  	XORL  CX, CX
  4520  
  4521  LBB7_6:
  4522  	VROUNDPS $0x0a, (DI)(CX*4), Y0
  4523  	VROUNDPS $0x0a, 32(DI)(CX*4), Y1
  4524  	VROUNDPS $0x0a, 64(DI)(CX*4), Y2
  4525  	VROUNDPS $0x0a, 96(DI)(CX*4), Y3
  4526  	VMOVUPS  Y0, (DI)(CX*4)
  4527  	VMOVUPS  Y1, 32(DI)(CX*4)
  4528  	VMOVUPS  Y2, 64(DI)(CX*4)
  4529  	VMOVUPS  Y3, 96(DI)(CX*4)
  4530  	VROUNDPS $0x0a, 128(DI)(CX*4), Y0
  4531  	VROUNDPS $0x0a, 160(DI)(CX*4), Y1
  4532  	VROUNDPS $0x0a, 192(DI)(CX*4), Y2
  4533  	VROUNDPS $0x0a, 224(DI)(CX*4), Y3
  4534  	VMOVUPS  Y0, 128(DI)(CX*4)
  4535  	VMOVUPS  Y1, 160(DI)(CX*4)
  4536  	VMOVUPS  Y2, 192(DI)(CX*4)
  4537  	VMOVUPS  Y3, 224(DI)(CX*4)
  4538  	ADDQ     $0x40, CX
  4539  	ADDQ     $-2, DX
  4540  	JNE      LBB7_6
  4541  	TESTB    $0x01, R8
  4542  	JE       LBB7_9
  4543  
  4544  LBB7_8:
  4545  	VROUNDPS $0x0a, (DI)(CX*4), Y0
  4546  	VROUNDPS $0x0a, 32(DI)(CX*4), Y1
  4547  	VROUNDPS $0x0a, 64(DI)(CX*4), Y2
  4548  	VROUNDPS $0x0a, 96(DI)(CX*4), Y3
  4549  	VMOVUPS  Y0, (DI)(CX*4)
  4550  	VMOVUPS  Y1, 32(DI)(CX*4)
  4551  	VMOVUPS  Y2, 64(DI)(CX*4)
  4552  	VMOVUPS  Y3, 96(DI)(CX*4)
  4553  
  4554  LBB7_9:
  4555  	CMPQ AX, SI
  4556  	JE   LBB7_11
  4557  
  4558  LBB7_10:
  4559  	VMOVSS   (DI)(AX*4), X0
  4560  	VROUNDSS $0x0a, X0, X0, X0
  4561  	VMOVSS   X0, (DI)(AX*4)
  4562  	ADDQ     $0x01, AX
  4563  	CMPQ     SI, AX
  4564  	JNE      LBB7_10
  4565  
  4566  LBB7_11:
  4567  	VZEROUPPER
  4568  	MOVSS X0, ret+24(FP)
  4569  	RET
  4570  
  4571  LBB7_4:
  4572  	XORL  CX, CX
  4573  	TESTB $0x01, R8
  4574  	JNE   LBB7_8
  4575  	JMP   LBB7_9
  4576  
  4577  DATA dataPowF64<>+0(SB)/8, $0x7fffffffffffffff
  4578  DATA dataPowF64<>+8(SB)/8, $0x3fe6a09e667f3bcd
  4579  DATA dataPowF64<>+16(SB)/8, $0xbff0000000000000
  4580  DATA dataPowF64<>+24(SB)/8, $0x401a509f46f4fa53
  4581  DATA dataPowF64<>+32(SB)/8, $0x3fdfe818a0fe1a83
  4582  DATA dataPowF64<>+40(SB)/8, $0x3f07bc0962b395ca
  4583  DATA dataPowF64<>+48(SB)/8, $0x404e798eb86c3351
  4584  DATA dataPowF64<>+56(SB)/8, $0x403de9738b8cb9c9
  4585  DATA dataPowF64<>+64(SB)/8, $0x40340a202d99830a
  4586  DATA dataPowF64<>+72(SB)/8, $0x404c8e7597479a10
  4587  DATA dataPowF64<>+80(SB)/8, $0x4054c30b52213498
  4588  DATA dataPowF64<>+88(SB)/8, $0x402e20359e903e37
  4589  DATA dataPowF64<>+96(SB)/8, $0x407351945dc908a5
  4590  DATA dataPowF64<>+104(SB)/8, $0x406bb86590fcfb56
  4591  DATA dataPowF64<>+112(SB)/8, $0x404e0f304466448e
  4592  DATA dataPowF64<>+120(SB)/8, $0x406b0db13e48e066
  4593  DATA dataPowF64<>+128(SB)/8, $0x4330000000000000
  4594  DATA dataPowF64<>+136(SB)/8, $0xc3300000000003ff
  4595  DATA dataPowF64<>+144(SB)/8, $0x3ff0000000000000
  4596  DATA dataPowF64<>+152(SB)/8, $0xbfe0000000000000
  4597  DATA dataPowF64<>+160(SB)/8, $0x3fe0000000000000
  4598  DATA dataPowF64<>+168(SB)/8, $0x3ff71547652b82fe
  4599  DATA dataPowF64<>+176(SB)/8, $0xbfe62e4000000000
  4600  DATA dataPowF64<>+184(SB)/8, $0x3eb7f7d1cf79abca
  4601  DATA dataPowF64<>+192(SB)/8, $0x3fe62e42fefa39ef
  4602  DATA dataPowF64<>+200(SB)/8, $0x3e21eed8eff8d898
  4603  DATA dataPowF64<>+208(SB)/8, $0x3de6124613a86d09
  4604  DATA dataPowF64<>+216(SB)/8, $0x3e927e4fb7789f5c
  4605  DATA dataPowF64<>+224(SB)/8, $0x3e5ae64567f544e4
  4606  DATA dataPowF64<>+232(SB)/8, $0x3efa01a01a01a01a
  4607  DATA dataPowF64<>+240(SB)/8, $0x3ec71de3a556c734
  4608  DATA dataPowF64<>+248(SB)/8, $0x3f56c16c16c16c17
  4609  DATA dataPowF64<>+256(SB)/8, $0x3f2a01a01a01a01a
  4610  DATA dataPowF64<>+264(SB)/8, $0x3fa5555555555555
  4611  DATA dataPowF64<>+272(SB)/8, $0x3f81111111111111
  4612  DATA dataPowF64<>+280(SB)/8, $0x3fc5555555555555
  4613  DATA dataPowF64<>+288(SB)/8, $0x00000000000007fe
  4614  DATA dataPowF64<>+296(SB)/8, $0x40a7700000000000
  4615  DATA dataPowF64<>+304(SB)/8, $0x0000000000000001
  4616  DATA dataPowF64<>+312(SB)/8, $0xc0a7700000000000
  4617  DATA dataPowF64<>+320(SB)/8, $0x7ff0000000000000
  4618  DATA dataPowF64<>+328(SB)/8, $0x7ff8002040000000
  4619  DATA dataPowF64<>+336(SB)/8, $0x000fffffffffffff
  4620  DATA dataPowF64<>+344(SB)/8, $0x000fffffffffffff
  4621  DATA dataPowF64<>+352(SB)/8, $0x3fe0000000000000
  4622  DATA dataPowF64<>+360(SB)/8, $0x3fe0000000000000
  4623  GLOBL dataPowF64<>(SB), RODATA|NOPTR, $368
  4624  
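// The dataPowF64<> table drives the Pow_4x kernel below. Reading the
// hex as IEEE-754 doubles (inferred from the bit patterns, not from
// the generator's source): an absolute-value mask (+0), sqrt(2)/2
// (+8) and -1.0 (+16) for the mantissa split, log-polynomial
// coefficients (+24..+120), 2^52 and -(2^52+1023) (+128/+136) for
// turning the raw exponent field into a double, 1.0, -0.5 and 0.5
// (+144..+160), log2(e) (+168), hi/lo pieces of ln(2) (+176/+184)
// plus ln(2) itself (+192), reciprocal factorials 1/3!..1/13! for
// the exp series (+200..+280), exponent-range guards (+288..+312),
// +Inf (+320), a quiet-NaN payload (+328), the mantissa mask
// (+336/+344) and 0.5 again (+352/+360).
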
  4625  // func Pow_4x_AVX2_F64(x []float64, y []float64)
  4626  // Requires: AVX, AVX2, FMA3
  4627  TEXT ·Pow_4x_AVX2_F64(SB), NOSPLIT, $0-48
  4628  	MOVQ         x_base+0(FP), DI
  4629  	MOVQ         y_base+24(FP), SI
  4630  	MOVQ         x_len+8(FP), DX
  4631  	SUBQ         $+1192, SP
  4632  	ANDQ         $-4, DX
  4633  	JE           LBB9_11
  4634  	XORL         R8, R8
  4635  	VBROADCASTSD dataPowF64<>+0(SB), Y0
  4636  	VMOVUPS      Y0, 512(SP)
  4637  	VBROADCASTSD dataPowF64<>+8(SB), Y0
  4638  	VMOVUPS      Y0, 1120(SP)
  4639  	VPXOR        X6, X6, X6
  4640  	VBROADCASTSD dataPowF64<>+16(SB), Y0
  4641  	VMOVUPS      Y0, 1088(SP)
  4642  	VBROADCASTSD dataPowF64<>+24(SB), Y0
  4643  	VMOVUPS      Y0, 1056(SP)
  4644  	VBROADCASTSD dataPowF64<>+32(SB), Y0
  4645  	VMOVUPS      Y0, 1024(SP)
  4646  	VBROADCASTSD dataPowF64<>+40(SB), Y0
  4647  	VMOVUPS      Y0, 992(SP)
  4648  	VBROADCASTSD dataPowF64<>+48(SB), Y0
  4649  	VMOVUPS      Y0, 960(SP)
  4650  	VBROADCASTSD dataPowF64<>+56(SB), Y0
  4651  	VMOVUPS      Y0, 928(SP)
  4652  	VBROADCASTSD dataPowF64<>+64(SB), Y0
  4653  	VMOVUPS      Y0, 896(SP)
  4654  	VBROADCASTSD dataPowF64<>+72(SB), Y0
  4655  	VMOVUPS      Y0, 864(SP)
  4656  	VBROADCASTSD dataPowF64<>+80(SB), Y0
  4657  	VMOVUPS      Y0, 832(SP)
  4658  	VBROADCASTSD dataPowF64<>+88(SB), Y0
  4659  	VMOVUPS      Y0, 800(SP)
  4660  	VBROADCASTSD dataPowF64<>+96(SB), Y0
  4661  	VMOVUPS      Y0, 768(SP)
  4662  	VBROADCASTSD dataPowF64<>+104(SB), Y0
  4663  	VMOVUPS      Y0, 736(SP)
  4664  	VBROADCASTSD dataPowF64<>+112(SB), Y0
  4665  	VMOVUPS      Y0, 704(SP)
  4666  	VBROADCASTSD dataPowF64<>+120(SB), Y0
  4667  	VMOVUPS      Y0, 672(SP)
  4668  	VBROADCASTSD dataPowF64<>+128(SB), Y0
  4669  	VMOVUPS      Y0, 640(SP)
  4670  	VBROADCASTSD dataPowF64<>+136(SB), Y0
  4671  	VMOVUPS      Y0, 608(SP)
  4672  	VBROADCASTSD dataPowF64<>+144(SB), Y0
  4673  	VMOVUPS      Y0, -128(SP)
  4674  	VBROADCASTSD dataPowF64<>+152(SB), Y0
  4675  	VMOVUPS      Y0, 576(SP)
  4676  	VBROADCASTSD dataPowF64<>+160(SB), Y0
  4677  	VMOVUPS      Y0, 544(SP)
  4678  	VBROADCASTSD dataPowF64<>+168(SB), Y0
  4679  	VMOVUPS      Y0, 480(SP)
  4680  	VBROADCASTSD dataPowF64<>+176(SB), Y0
  4681  	VMOVUPS      Y0, 448(SP)
  4682  	VBROADCASTSD dataPowF64<>+184(SB), Y0
  4683  	VMOVUPS      Y0, 416(SP)
  4684  	VBROADCASTSD dataPowF64<>+192(SB), Y0
  4685  	VMOVUPS      Y0, 384(SP)
  4686  	VBROADCASTSD dataPowF64<>+200(SB), Y0
  4687  	VMOVUPS      Y0, 352(SP)
  4688  	VBROADCASTSD dataPowF64<>+208(SB), Y0
  4689  	VMOVUPS      Y0, 320(SP)
  4690  	VBROADCASTSD dataPowF64<>+216(SB), Y0
  4691  	VMOVUPS      Y0, 288(SP)
  4692  	VBROADCASTSD dataPowF64<>+224(SB), Y0
  4693  	VMOVUPS      Y0, 256(SP)
  4694  	VBROADCASTSD dataPowF64<>+232(SB), Y0
  4695  	VMOVUPS      Y0, 224(SP)
  4696  	VBROADCASTSD dataPowF64<>+240(SB), Y0
  4697  	VMOVUPS      Y0, 192(SP)
  4698  	VBROADCASTSD dataPowF64<>+248(SB), Y0
  4699  	VMOVUPS      Y0, 160(SP)
  4700  	VBROADCASTSD dataPowF64<>+256(SB), Y0
  4701  	VMOVUPS      Y0, 128(SP)
  4702  	VBROADCASTSD dataPowF64<>+264(SB), Y0
  4703  	VMOVUPS      Y0, 96(SP)
  4704  	VBROADCASTSD dataPowF64<>+272(SB), Y0
  4705  	VMOVUPS      Y0, 64(SP)
  4706  	VBROADCASTSD dataPowF64<>+280(SB), Y0
  4707  	VMOVUPS      Y0, 32(SP)
  4708  	VBROADCASTSD dataPowF64<>+288(SB), Y0
  4709  	VMOVUPS      Y0, (SP)
  4710  	VBROADCASTSD dataPowF64<>+296(SB), Y0
  4711  	VMOVUPS      Y0, -32(SP)
  4712  	VBROADCASTSD dataPowF64<>+304(SB), Y0
  4713  	VMOVUPS      Y0, -64(SP)
  4714  	VBROADCASTSD dataPowF64<>+312(SB), Y0
  4715  	VMOVUPD      Y0, -96(SP)
  4716  	VPBROADCASTQ dataPowF64<>+320(SB), Y5
  4717  	VBROADCASTSD dataPowF64<>+320(SB), Y10
  4718  	JMP          LBB9_2
  4719  
  4720  LBB9_10:
  4721  	VMOVUPD Y2, (DI)(R8*8)
  4722  	ADDQ    $0x04, R8
  4723  	CMPQ    R8, DX
  4724  	JAE     LBB9_11
  4725  
  4726  LBB9_2:
  4727  	VMOVAPD      Y10, Y9
  4728  	VMOVDQU      (DI)(R8*8), Y13
  4729  	VMOVUPD      (SI)(R8*8), Y12
  4730  	VPAND        512(SP), Y13, Y10
  4731  	VMOVUPD      dataPowF64<>+336(SB), X1
  4732  	VANDPD       (DI)(R8*8), X1, X2
  4733  	VMOVUPD      dataPowF64<>+352(SB), X0
  4734  	VORPD        X0, X2, X2
  4735  	VANDPD       16(DI)(R8*8), X1, X3
  4736  	VORPD        X0, X3, X3
  4737  	VINSERTF128  $0x01, X3, Y2, Y3
  4738  	VMOVUPD      1120(SP), Y0
  4739  	VCMPPD       $0x01, Y3, Y0, Y2
  4740  	VANDNPD      Y3, Y2, Y4
  4741  	VADDPD       1088(SP), Y3, Y3
  4742  	VADDPD       Y4, Y3, Y4
  4743  	VMULPD       Y4, Y4, Y3
  4744  	VMULPD       Y3, Y3, Y7
  4745  	VMOVUPD      1024(SP), Y8
  4746  	VFMADD213PD  1056(SP), Y4, Y8
  4747  	VFMADD231PD  992(SP), Y3, Y8
  4748  	VMOVUPD      928(SP), Y11
  4749  	VFMADD213PD  960(SP), Y4, Y11
  4750  	VMOVUPD      864(SP), Y14
  4751  	VFMADD213PD  896(SP), Y4, Y14
  4752  	VFMADD231PD  Y11, Y3, Y14
  4753  	VFMADD231PD  Y8, Y7, Y14
  4754  	VMULPD       Y4, Y3, Y8
  4755  	VMULPD       Y14, Y8, Y8
  4756  	VADDPD       832(SP), Y3, Y11
  4757  	VFMADD231PD  800(SP), Y4, Y11
  4758  	VMOVUPD      736(SP), Y14
  4759  	VFMADD213PD  768(SP), Y4, Y14
  4760  	VMOVUPD      672(SP), Y15
  4761  	VFMADD213PD  704(SP), Y4, Y15
  4762  	VFMADD231PD  Y14, Y3, Y15
  4763  	VFMADD231PD  Y11, Y7, Y15
  4764  	VDIVPD       Y15, Y8, Y7
  4765  	VMOVDQU      Y10, 1152(SP)
  4766  	VPSRLQ       $0x34, Y10, Y8
  4767  	VPOR         640(SP), Y8, Y8
  4768  	VADDPD       608(SP), Y8, Y8
  4769  	VMOVUPD      -128(SP), Y0
  4770  	VANDPD       Y0, Y2, Y2
  4771  	VADDPD       Y2, Y8, Y8
  4772  	VMULPD       Y12, Y8, Y2
  4773  	VROUNDPD     $0x08, Y2, Y2
  4774  	VFNMADD213PD Y2, Y12, Y8
  4775  	VMOVUPD      576(SP), Y1
  4776  	VMOVAPD      Y1, Y11
  4777  	VFMADD213PD  Y4, Y3, Y11
  4778  	VADDPD       Y7, Y11, Y11
  4779  	VMOVUPD      544(SP), Y10
  4780  	VMULPD       Y4, Y10, Y14
  4781  	VMULPD       Y1, Y3, Y15
  4782  	VFMADD231PD  Y14, Y4, Y15
  4783  	VSUBPD       Y4, Y11, Y4
  4784  	VFMADD231PD  Y3, Y10, Y4
  4785  	VMOVUPD      480(SP), Y1
  4786  	VMULPD       Y1, Y12, Y3
  4787  	VMULPD       Y3, Y11, Y3
  4788  	VROUNDPD     $0x08, Y3, Y3
  4789  	VMULPD       448(SP), Y3, Y14
  4790  	VFMADD231PD  Y11, Y12, Y14
  4791  	VFMSUB231PD  416(SP), Y3, Y14
  4792  	VMOVUPD      384(SP), Y11
  4793  	VFMADD231PD  Y8, Y11, Y14
  4794  	VSUBPD       Y7, Y15, Y7
  4795  	VADDPD       Y4, Y7, Y4
  4796  	VFNMSUB213PD Y14, Y12, Y4
  4797  	VMULPD       Y1, Y4, Y7
  4798  	VROUNDPD     $0x08, Y7, Y7
  4799  	VFNMADD231PD Y11, Y7, Y4
  4800  	VMULPD       Y4, Y4, Y8
  4801  	VMOVUPD      320(SP), Y11
  4802  	VFMADD213PD  352(SP), Y4, Y11
  4803  	VMOVUPD      256(SP), Y14
  4804  	VFMADD213PD  288(SP), Y4, Y14
  4805  	VMOVUPD      192(SP), Y15
  4806  	VFMADD213PD  224(SP), Y4, Y15
  4807  	VFMADD231PD  Y14, Y8, Y15
  4808  	VMOVUPD      128(SP), Y14
  4809  	VFMADD213PD  160(SP), Y4, Y14
  4810  	VMOVUPD      64(SP), Y1
  4811  	VFMADD213PD  96(SP), Y4, Y1
  4812  	VFMADD231PD  Y14, Y8, Y1
  4813  	VMOVUPD      32(SP), Y14
  4814  	VFMADD213PD  Y10, Y4, Y14
  4815  	VFMADD213PD  Y4, Y8, Y14
  4816  	VMULPD       Y8, Y8, Y4
  4817  	VFMADD231PD  Y11, Y4, Y15
  4818  	VFMADD231PD  Y1, Y4, Y14
  4819  	VMULPD       Y4, Y4, Y1
  4820  	VFMADD231PD  Y15, Y1, Y14
  4821  	VADDPD       Y0, Y14, Y1
  4822  	VADDPD       Y2, Y3, Y2
  4823  	VADDPD       Y7, Y2, Y15
  4824  	VROUNDPD     $0x08, Y15, Y2
  4825  	VCVTTSD2SIQ  X2, R9
  4826  	VPERMILPD    $0x01, X2, X3
  4827  	VCVTTSD2SIQ  X3, AX
  4828  	VEXTRACTF128 $0x01, Y2, X2
  4829  	VCVTTSD2SIQ  X2, CX
  4830  	VMOVQ        CX, X3
  4831  	VPERMILPD    $0x01, X2, X2
  4832  	VCVTTSD2SIQ  X2, CX
  4833  	VMOVQ        CX, X2
  4834  	VPUNPCKLQDQ  X2, X3, X2
  4835  	VMOVQ        R9, X3
  4836  	VMOVQ        AX, X4
  4837  	VPUNPCKLQDQ  X4, X3, X3
  4838  	VINSERTI128  $0x01, X2, Y3, Y2
  4839  	VPSRAD       $0x1f, Y1, Y3
  4840  	VPSRAD       $0x14, Y1, Y4
  4841  	VPSRLQ       $0x20, Y4, Y4
  4842  	VPBLENDD     $0xaa, Y3, Y4, Y3
  4843  	VPADDQ       Y3, Y2, Y4
  4844  	VPCMPGTQ     (SP), Y4, Y3
  4845  	VMOVUPD      -32(SP), Y0
  4846  	VCMPPD       $0x01, Y15, Y0, Y7
  4847  	VPOR         Y7, Y3, Y3
  4848  	VMOVDQU      -64(SP), Y0
  4849  	VPCMPGTQ     Y4, Y0, Y4
  4850  	VCMPPD       $0x01, -96(SP), Y15, Y7
  4851  	VPOR         Y7, Y4, Y4
  4852  	VPSLLQ       $0x34, Y2, Y2
  4853  	VPADDQ       Y1, Y2, Y2
  4854  	VPOR         Y3, Y4, Y1
  4855  	VPTEST       Y1, Y1
  4856  	JNE          LBB9_3
  4857  	VMOVAPD      Y9, Y10
  4858  	JMP          LBB9_5
  4859  
  4860  LBB9_3:
  4861  	VPANDN    Y2, Y4, Y1
  4862  	VMOVAPD   Y9, Y10
  4863  	VBLENDVPD Y3, Y9, Y1, Y2
  4864  
  4865  LBB9_5:
  4866  	VPAND     Y5, Y13, Y11
  4867  	VPCMPEQQ  Y6, Y11, Y4
  4868  	VPSRAD    $0x1f, Y13, Y1
  4869  	VPSHUFD   $0xf5, Y1, Y7
  4870  	VCMPPD    $0x01, Y6, Y12, Y14
  4871  	VCMPPD    $0x00, Y6, Y12, Y3
  4872  	VANDPD    -128(SP), Y3, Y1
  4873  	VBLENDVPD Y14, Y10, Y1, Y1
  4874  	VBLENDVPD Y4, Y1, Y2, Y2
  4875  	VPTEST    Y7, Y7
  4876  	JNE       LBB9_7
  4877  	VPXOR     X7, X7, X7
  4878  	JMP       LBB9_8
  4879  
  4880  LBB9_7:
  4881  	VROUNDPD     $0x08, Y12, Y1
  4882  	VCMPPD       $0x00, Y1, Y12, Y8
  4883  	VCVTTSD2SIQ  X1, R9
  4884  	VPERMILPD    $0x01, X1, X10
  4885  	VCVTTSD2SIQ  X10, CX
  4886  	VEXTRACTF128 $0x01, Y1, X1
  4887  	VCVTTSD2SIQ  X1, AX
  4888  	VXORPD       X10, X10, X10
  4889  	VMOVQ        AX, X6
  4890  	VPERMILPD    $0x01, X1, X1
  4891  	VCVTTSD2SIQ  X1, AX
  4892  	VMOVQ        AX, X1
  4893  	VPUNPCKLQDQ  X1, X6, X1
  4894  	VMOVQ        R9, X6
  4895  	VMOVQ        CX, X0
  4896  	VPUNPCKLQDQ  X0, X6, X0
  4897  	VINSERTI128  $0x01, X1, Y0, Y0
  4898  	VPSLLQ       $0x3f, Y0, Y0
  4899  	VPOR         Y2, Y0, Y1
  4900  	VCMPPD       $0x00, Y10, Y13, Y6
  4901  	VBROADCASTSD dataPowF64<>+328(SB), Y10
  4902  	VBLENDVPD    Y6, Y2, Y10, Y6
  4903  	VMOVAPD      Y9, Y10
  4904  	VBLENDVPD    Y8, Y1, Y6, Y1
  4905  	VXORPD       X6, X6, X6
  4906  	VBLENDVPD    Y7, Y1, Y2, Y2
  4907  	VANDPD       Y0, Y8, Y7
  4908  
  4909  LBB9_8:
  4910  	VPCMPEQD  Y9, Y9, Y9
  4911  	VANDPD    Y5, Y12, Y0
  4912  	VANDPD    Y5, Y15, Y1
  4913  	VPCMPEQQ  Y5, Y1, Y15
  4914  	VPXOR     Y9, Y15, Y1
  4915  	VPCMPEQQ  Y5, Y0, Y8
  4916  	VPCMPEQQ  Y5, Y11, Y11
  4917  	VPXOR     Y9, Y11, Y0
  4918  	VPANDN    Y0, Y8, Y0
  4919  	VPOR      Y4, Y1, Y1
  4920  	VPAND     Y0, Y1, Y0
  4921  	VPTEST    Y9, Y0
  4922  	JB        LBB9_10
  4923  	VPXOR     Y9, Y8, Y0
  4924  	VPANDN    Y0, Y15, Y0
  4925  	VMOVUPD   -128(SP), Y8
  4926  	VMOVUPD   1152(SP), Y9
  4927  	VCMPPD    $0x00, Y8, Y9, Y1
  4928  	VCMPPD    $0x01, Y9, Y8, Y4
  4929  	VPSRAD    $0x1f, Y12, Y6
  4930  	VPXOR     Y4, Y6, Y4
  4931  	VPXOR     X6, X6, X6
  4932  	VBLENDVPD Y4, Y10, Y6, Y4
  4933  	VBLENDVPD Y1, Y8, Y4, Y1
  4934  	VBLENDVPD Y0, Y2, Y1, Y0
  4935  	VANDPD    Y2, Y7, Y1
  4936  	VANDPD    Y7, Y13, Y2
  4937  	VORPD     Y2, Y9, Y2
  4938  	VBLENDVPD Y14, Y1, Y2, Y1
  4939  	VBLENDVPD Y3, Y8, Y1, Y1
  4940  	VBLENDVPD Y11, Y1, Y0, Y0
  4941  	VCMPPD    $0x03, Y13, Y13, Y1
  4942  	VCMPPD    $0x03, Y12, Y12, Y2
  4943  	VORPD     Y1, Y2, Y1
  4944  	VADDPD    Y13, Y12, Y2
  4945  	VBLENDVPD Y1, Y2, Y0, Y2
  4946  	JMP       LBB9_10
  4947  
  4948  LBB9_11:
  4949  	ADDQ $+1192, SP
  4950  	VZEROUPPER
  4951  	RET
  4952  
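// Pow_4x_AVX2_F64 computes x^y four lanes at a time as
// 2^(y*log2|x|): LBB9_2 splits |x| into exponent and mantissa,
// evaluates a rational log2 approximation with hi/lo compensation,
// multiplies by y, and rebuilds 2^t from the factorial series plus a
// VPSLLQ $0x34 exponent injection; LBB9_3..LBB9_8 then patch
// overflow/underflow, zero and negative bases, and NaN propagation.
// A deliberately coarse scalar model that drops the hi/lo
// compensation (a sketch, not the package API):
//
//	func pow64(x, y float64) float64 {
//		if math.IsNaN(x) || math.IsNaN(y) {
//			return x + y // NaN propagation, as the final VADDPD blend does
//		}
//		if y == 0 {
//			return 1
//		}
//		r := math.Exp2(y * math.Log2(math.Abs(x)))
//		if x < 0 {
//			if y != math.Trunc(y) {
//				return math.NaN() // negative base, fractional exponent
//			}
//			if math.Mod(y, 2) != 0 {
//				r = -r // odd integer exponent keeps the base's sign
//			}
//		}
//		return r
//	}
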
  4953  DATA genPowF32<>+0(SB)/4, $0x7fffffff
  4954  DATA genPowF32<>+4(SB)/4, $0x3f3504f3
  4955  DATA genPowF32<>+8(SB)/4, $0xbf800000
  4956  DATA genPowF32<>+12(SB)/4, $0x3def251a
  4957  DATA genPowF32<>+16(SB)/4, $0xbdebd1b8
  4958  DATA genPowF32<>+20(SB)/4, $0x3e11e9bf
  4959  DATA genPowF32<>+24(SB)/4, $0xbdfe5d4f
  4960  DATA genPowF32<>+28(SB)/4, $0x3e4cceac
  4961  DATA genPowF32<>+32(SB)/4, $0xbe2aae50
  4962  DATA genPowF32<>+36(SB)/4, $0x3eaaaaaa
  4963  DATA genPowF32<>+40(SB)/4, $0xbe7ffffc
  4964  DATA genPowF32<>+44(SB)/4, $0x3d9021bb
  4965  DATA genPowF32<>+48(SB)/4, $0xcb00007f
  4966  DATA genPowF32<>+52(SB)/4, $0x3f800000
  4967  DATA genPowF32<>+56(SB)/4, $0xbf000000
  4968  DATA genPowF32<>+60(SB)/4, $0x3f000000
  4969  DATA genPowF32<>+64(SB)/4, $0x3fb8aa3b
  4970  DATA genPowF32<>+68(SB)/4, $0xbf318000
  4971  DATA genPowF32<>+72(SB)/4, $0xb95e8083
  4972  DATA genPowF32<>+76(SB)/4, $0xbf317218
  4973  DATA genPowF32<>+80(SB)/4, $0x3d2aaaab
  4974  DATA genPowF32<>+84(SB)/4, $0x3c088889
  4975  DATA genPowF32<>+88(SB)/4, $0x3ab60b61
  4976  DATA genPowF32<>+92(SB)/4, $0x39500d01
  4977  DATA genPowF32<>+96(SB)/4, $0x3e2aaaab
  4978  DATA genPowF32<>+100(SB)/4, $0x000000fe
  4979  DATA genPowF32<>+104(SB)/4, $0x43960000
  4980  DATA genPowF32<>+108(SB)/4, $0x00000001
  4981  DATA genPowF32<>+112(SB)/4, $0xc3960000
  4982  DATA genPowF32<>+116(SB)/4, $0x7f800000
  4983  DATA genPowF32<>+120(SB)/4, $0x7fc00102
  4984  DATA genPowF32<>+124(SB)/8, $0x007fffff007fffff
  4985  DATA genPowF32<>+132(SB)/8, $0x007fffff007fffff
  4986  DATA genPowF32<>+140(SB)/8, $0x3f0000003f000000
  4987  DATA genPowF32<>+148(SB)/8, $0x3f0000003f000000
  4988  DATA genPowF32<>+156(SB)/8, $0x4b0000004b000000
  4989  DATA genPowF32<>+164(SB)/1, $0xff
  4990  DATA genPowF32<>+165(SB)/1, $0x00
  4991  DATA genPowF32<>+166(SB)/1, $0x00
  4992  DATA genPowF32<>+167(SB)/1, $0x00
  4993  DATA genPowF32<>+168(SB)/1, $0xff
  4994  DATA genPowF32<>+169(SB)/1, $0x00
  4995  DATA genPowF32<>+170(SB)/1, $0x00
  4996  DATA genPowF32<>+171(SB)/1, $0x00
  4997  DATA genPowF32<>+172(SB)/1, $0xff
  4998  DATA genPowF32<>+173(SB)/1, $0x00
  4999  DATA genPowF32<>+174(SB)/1, $0x00
  5000  DATA genPowF32<>+175(SB)/1, $0x00
  5001  DATA genPowF32<>+176(SB)/1, $0xff
  5002  DATA genPowF32<>+177(SB)/1, $0x00
  5003  DATA genPowF32<>+178(SB)/1, $0x00
  5004  DATA genPowF32<>+179(SB)/1, $0x00
  5005  DATA genPowF32<>+180(SB)/1, $0xff
  5006  DATA genPowF32<>+181(SB)/1, $0x00
  5007  DATA genPowF32<>+182(SB)/1, $0x00
  5008  DATA genPowF32<>+183(SB)/1, $0x00
  5009  DATA genPowF32<>+184(SB)/1, $0xff
  5010  DATA genPowF32<>+185(SB)/1, $0x00
  5011  DATA genPowF32<>+186(SB)/1, $0x00
  5012  DATA genPowF32<>+187(SB)/1, $0x00
  5013  DATA genPowF32<>+188(SB)/1, $0xff
  5014  DATA genPowF32<>+189(SB)/1, $0x00
  5015  DATA genPowF32<>+190(SB)/1, $0x00
  5016  DATA genPowF32<>+191(SB)/1, $0x00
  5017  DATA genPowF32<>+192(SB)/1, $0xff
  5018  DATA genPowF32<>+193(SB)/1, $0x00
  5019  DATA genPowF32<>+194(SB)/1, $0x00
  5020  DATA genPowF32<>+195(SB)/1, $0x00
  5021  GLOBL genPowF32<>(SB), RODATA|NOPTR, $196
  5022  
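// genPowF32<> is the single-precision analog of dataPowF64<>: abs
// mask (+0), sqrt(2)/2 (+4), log and exp coefficients, -(2^23+127)
// (+48) and the 2^23 bit pattern (+156) that together convert the
// shifted exponent field to a float, log2(e) (+64), hi/lo pieces of
// ln(2) (+68..+76), range guards (+100..+112, including the +/-300.0
// pair), +Inf (+116), a quiet NaN (+120), mantissa and 0.5 vectors
// (+124..+152), and a per-dword 0xff,0,0,0 byte mask (+164..+195)
// that isolates the exponent byte after VPSRLD $0x17. Again inferred
// from the bit patterns rather than the generator.
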
  5023  // func Pow_8x_AVX2_F32(x []float32, y []float32)
  5024  // Requires: AVX, AVX2, FMA3
  5025  TEXT ·Pow_8x_AVX2_F32(SB), NOSPLIT, $0-48
  5026  	MOVQ         x_base+0(FP), DI
  5027  	MOVQ         y_base+24(FP), SI
  5028  	MOVQ         x_len+8(FP), DX
  5029  	SUBQ         $+872, SP
  5030  	ANDQ         $-8, DX
  5031  	JE           LBB8_12
  5032  	XORL         AX, AX
  5033  	VBROADCASTSS genPowF32<>+0(SB), Y0
  5034  	VMOVUPS      Y0, 320(SP)
  5035  	VBROADCASTSS genPowF32<>+4(SB), Y0
  5036  	VMOVUPS      Y0, 800(SP)
  5037  	VPXOR        X7, X7, X7
  5038  	VBROADCASTSS genPowF32<>+8(SB), Y0
  5039  	VMOVUPS      Y0, 768(SP)
  5040  	VBROADCASTSS genPowF32<>+12(SB), Y0
  5041  	VMOVUPS      Y0, 736(SP)
  5042  	VBROADCASTSS genPowF32<>+16(SB), Y0
  5043  	VMOVUPS      Y0, 704(SP)
  5044  	VBROADCASTSS genPowF32<>+20(SB), Y0
  5045  	VMOVUPS      Y0, 672(SP)
  5046  	VBROADCASTSS genPowF32<>+24(SB), Y0
  5047  	VMOVUPS      Y0, 640(SP)
  5048  	VBROADCASTSS genPowF32<>+28(SB), Y0
  5049  	VMOVUPS      Y0, 608(SP)
  5050  	VBROADCASTSS genPowF32<>+32(SB), Y0
  5051  	VMOVUPS      Y0, 576(SP)
  5052  	VBROADCASTSS genPowF32<>+36(SB), Y0
  5053  	VMOVUPS      Y0, 544(SP)
  5054  	VBROADCASTSS genPowF32<>+40(SB), Y0
  5055  	VMOVUPS      Y0, 512(SP)
  5056  	VBROADCASTSS genPowF32<>+44(SB), Y0
  5057  	VMOVUPS      Y0, 480(SP)
  5058  	VBROADCASTSD genPowF32<>+156(SB), Y0
  5059  	VMOVUPS      Y0, 448(SP)
  5060  	VBROADCASTSS genPowF32<>+48(SB), Y0
  5061  	VMOVUPS      Y0, 416(SP)
  5062  	VBROADCASTSS genPowF32<>+52(SB), Y0
  5063  	VMOVUPS      Y0, -128(SP)
  5064  	VBROADCASTSS genPowF32<>+56(SB), Y0
  5065  	VMOVUPS      Y0, 384(SP)
  5066  	VBROADCASTSS genPowF32<>+60(SB), Y0
  5067  	VMOVUPS      Y0, 352(SP)
  5068  	VBROADCASTSS genPowF32<>+64(SB), Y0
  5069  	VMOVUPS      Y0, 288(SP)
  5070  	VBROADCASTSS genPowF32<>+68(SB), Y0
  5071  	VMOVUPS      Y0, 256(SP)
  5072  	VBROADCASTSS genPowF32<>+72(SB), Y0
  5073  	VMOVUPS      Y0, 224(SP)
  5074  	VBROADCASTSS genPowF32<>+76(SB), Y0
  5075  	VMOVUPS      Y0, 192(SP)
  5076  	VBROADCASTSS genPowF32<>+80(SB), Y0
  5077  	VMOVUPS      Y0, 160(SP)
  5078  	VBROADCASTSS genPowF32<>+84(SB), Y0
  5079  	VMOVUPS      Y0, 128(SP)
  5080  	VBROADCASTSS genPowF32<>+88(SB), Y0
  5081  	VMOVUPS      Y0, 96(SP)
  5082  	VBROADCASTSS genPowF32<>+92(SB), Y0
  5083  	VMOVUPS      Y0, 64(SP)
  5084  	VBROADCASTSS genPowF32<>+96(SB), Y0
  5085  	VMOVUPS      Y0, 32(SP)
  5086  	VBROADCASTSS genPowF32<>+100(SB), Y0
  5087  	VMOVUPS      Y0, (SP)
  5088  	VBROADCASTSS genPowF32<>+104(SB), Y0
  5089  	VMOVUPS      Y0, -32(SP)
  5090  	VBROADCASTSS genPowF32<>+108(SB), Y0
  5091  	VMOVUPS      Y0, -64(SP)
  5092  	VPBROADCASTD genPowF32<>+112(SB), Y0
  5093  	VMOVDQU      Y0, -96(SP)
  5094  	VPBROADCASTD genPowF32<>+116(SB), Y8
  5095  	VBROADCASTSS genPowF32<>+116(SB), Y12
  5096  	JMP          LBB8_2
  5097  
  5098  LBB8_10:
  5099  	VPXOR     Y0, Y15, Y0
  5100  	VPANDN    Y0, Y14, Y0
  5101  	VMOVUPS   -128(SP), Y14
  5102  	VMOVUPS   832(SP), Y2
  5103  	VCMPPS    $0x00, Y2, Y14, Y3
  5104  	VCMPPS    $0x01, Y2, Y14, Y4
  5105  	VXORPS    Y4, Y11, Y4
  5106  	VPXOR     X7, X7, X7
  5107  	VBLENDVPS Y4, Y12, Y7, Y4
  5108  	VBLENDVPS Y3, Y14, Y4, Y3
  5109  	VBLENDVPS Y0, Y6, Y3, Y0
  5110  	VANDPS    Y6, Y10, Y3
  5111  	VANDPS    Y9, Y10, Y4
  5112  	VORPS     Y2, Y4, Y4
  5113  	VBLENDVPS Y13, Y3, Y4, Y3
  5114  	VBLENDVPS Y1, Y14, Y3, Y1
  5115  	VBLENDVPS Y5, Y0, Y1, Y0
  5116  	VCMPPS    $0x03, Y9, Y9, Y1
  5117  	VCMPPS    $0x03, Y11, Y11, Y3
  5118  	VORPS     Y1, Y3, Y1
  5119  	VADDPS    Y9, Y11, Y3
  5120  	VBLENDVPS Y1, Y3, Y0, Y6
  5121  	VMOVUPS   Y6, (DI)(AX*4)
  5122  	ADDQ      $0x08, AX
  5123  	CMPQ      AX, DX
  5124  	JAE       LBB8_12
  5125  
  5126  LBB8_2:
  5127  	VMOVAPS      Y12, Y2
  5128  	VMOVDQU      (DI)(AX*4), Y9
  5129  	VMOVUPS      (SI)(AX*4), Y11
  5130  	VPAND        320(SP), Y9, Y12
  5131  	VMOVUPS      genPowF32<>+124(SB), X1
  5132  	VANDPS       (DI)(AX*4), X1, X0
  5133  	VMOVUPS      genPowF32<>+140(SB), X3
  5134  	VORPS        X3, X0, X0
  5135  	VANDPS       16(DI)(AX*4), X1, X1
  5136  	VORPS        X3, X1, X1
  5137  	VINSERTF128  $0x01, X1, Y0, Y0
  5138  	VMOVUPS      800(SP), Y1
  5139  	VCMPPS       $0x01, Y0, Y1, Y1
  5140  	VANDNPS      Y0, Y1, Y4
  5141  	VADDPS       768(SP), Y0, Y0
  5142  	VADDPS       Y4, Y0, Y4
  5143  	VMULPS       Y4, Y4, Y6
  5144  	VMULPS       Y6, Y6, Y0
  5145  	VMOVUPS      704(SP), Y5
  5146  	VFMADD213PS  736(SP), Y4, Y5
  5147  	VMOVUPS      640(SP), Y10
  5148  	VFMADD213PS  672(SP), Y4, Y10
  5149  	VFMADD231PS  Y5, Y6, Y10
  5150  	VMOVUPS      576(SP), Y5
  5151  	VFMADD213PS  608(SP), Y4, Y5
  5152  	VMOVUPS      512(SP), Y13
  5153  	VFMADD213PS  544(SP), Y4, Y13
  5154  	VMULPS       Y0, Y0, Y14
  5155  	VFMADD132PS  480(SP), Y13, Y14
  5156  	VFMADD231PS  Y5, Y6, Y14
  5157  	VFMADD231PS  Y10, Y0, Y14
  5158  	VMULPS       Y4, Y6, Y0
  5159  	VMULPS       Y0, Y14, Y0
  5160  	VMOVDQU      Y12, 832(SP)
  5161  	VPSRLD       $0x17, Y12, Y5
  5162  	VPOR         448(SP), Y5, Y5
  5163  	VADDPS       416(SP), Y5, Y5
  5164  	VMOVUPS      -128(SP), Y3
  5165  	VANDPS       Y3, Y1, Y1
  5166  	VADDPS       Y1, Y5, Y5
  5167  	VMULPS       Y5, Y11, Y1
  5168  	VROUNDPS     $0x08, Y1, Y1
  5169  	VFNMADD213PS Y1, Y11, Y5
  5170  	VMOVUPS      384(SP), Y14
  5171  	VMOVAPS      Y14, Y10
  5172  	VFMADD213PS  Y4, Y6, Y10
  5173  	VADDPS       Y0, Y10, Y10
  5174  	VMOVUPS      352(SP), Y12
  5175  	VMULPS       Y4, Y12, Y13
  5176  	VMULPS       Y6, Y14, Y14
  5177  	VFMADD231PS  Y13, Y4, Y14
  5178  	VSUBPS       Y4, Y10, Y4
  5179  	VFMADD231PS  Y6, Y12, Y4
  5180  	VMOVUPS      288(SP), Y15
  5181  	VMULPS       Y15, Y11, Y6
  5182  	VMULPS       Y6, Y10, Y6
  5183  	VROUNDPS     $0x08, Y6, Y6
  5184  	VMULPS       256(SP), Y6, Y13
  5185  	VFMADD231PS  Y10, Y11, Y13
  5186  	VFNMADD231PS 224(SP), Y6, Y13
  5187  	VSUBPS       Y0, Y14, Y0
  5188  	VADDPS       Y4, Y0, Y0
  5189  	VMOVUPS      192(SP), Y10
  5190  	VMULPS       Y5, Y10, Y4
  5191  	VFNMADD231PS Y0, Y11, Y4
  5192  	VADDPS       Y4, Y13, Y0
  5193  	VMULPS       Y0, Y15, Y4
  5194  	VROUNDPS     $0x08, Y4, Y4
  5195  	VFMADD231PS  Y10, Y4, Y0
  5196  	VMULPS       Y0, Y0, Y5
  5197  	VMULPS       Y5, Y5, Y10
  5198  	VMOVUPS      64(SP), Y13
  5199  	VFMADD213PS  96(SP), Y0, Y13
  5200  	VMOVUPS      32(SP), Y14
  5201  	VFMADD213PS  Y12, Y0, Y14
  5202  	VFMADD231PS  Y13, Y10, Y14
  5203  	VMOVUPS      128(SP), Y10
  5204  	VFMADD213PS  160(SP), Y0, Y10
  5205  	VFMADD231PS  Y10, Y5, Y14
  5206  	VADDPS       Y3, Y0, Y10
  5207  	VFMADD231PS  Y14, Y5, Y10
  5208  	VADDPS       Y1, Y6, Y0
  5209  	VADDPS       Y4, Y0, Y14
  5210  	VCVTPS2DQ    Y14, Y4
  5211  	VPSRLD       $0x17, Y10, Y0
  5212  	VPAND        genPowF32<>+164(SB), Y0, Y0
  5213  	VPADDD       Y4, Y0, Y0
  5214  	VPCMPGTD     (SP), Y0, Y1
  5215  	VMOVUPS      -32(SP), Y3
  5216  	VCMPPS       $0x01, Y14, Y3, Y5
  5217  	VPOR         Y5, Y1, Y1
  5218  	VMOVDQU      -64(SP), Y3
  5219  	VPCMPGTD     Y0, Y3, Y0
  5220  	VCMPPS       $0x01, -96(SP), Y14, Y5
  5221  	VPOR         Y5, Y0, Y0
  5222  	VPSLLD       $0x17, Y4, Y4
  5223  	VPADDD       Y4, Y10, Y6
  5224  	VPOR         Y1, Y0, Y4
  5225  	VTESTPS      Y4, Y4
  5226  	JNE          LBB8_3
  5227  	VPCMPEQD     Y15, Y15, Y15
  5228  	VMOVAPS      Y2, Y12
  5229  	JMP          LBB8_5
  5230  
  5231  LBB8_3:
  5232  	VPANDN    Y6, Y0, Y0
  5233  	VMOVAPS   Y2, Y12
  5234  	VBLENDVPS Y1, Y2, Y0, Y6
  5235  	VPCMPEQD  Y15, Y15, Y15
  5236  
  5237  LBB8_5:
  5238  	VPAND     Y8, Y9, Y5
  5239  	VPCMPEQD  Y7, Y5, Y4
  5240  	VCMPPS    $0x01, Y7, Y11, Y13
  5241  	VCMPPS    $0x00, Y7, Y11, Y1
  5242  	VANDPS    -128(SP), Y1, Y0
  5243  	VBLENDVPS Y13, Y12, Y0, Y0
  5244  	VBLENDVPS Y4, Y0, Y6, Y6
  5245  	VMOVMSKPS Y9, CX
  5246  	TESTL     CX, CX
  5247  	JNE       LBB8_7
  5248  	VXORPS    X10, X10, X10
  5249  	JMP       LBB8_8
  5250  
  5251  LBB8_7:
  5252  	VROUNDPS     $0x08, Y11, Y0
  5253  	VCMPPS       $0x00, Y0, Y11, Y0
  5254  	VCVTPS2DQ    Y11, Y10
  5255  	VPSLLD       $0x1f, Y10, Y10
  5256  	VPOR         Y6, Y10, Y12
  5257  	VPXOR        X3, X3, X3
  5258  	VCMPPS       $0x00, Y3, Y9, Y7
  5259  	VBROADCASTSS genPowF32<>+120(SB), Y3
  5260  	VBLENDVPS    Y7, Y6, Y3, Y3
  5261  	VBLENDVPS    Y0, Y12, Y3, Y3
  5262  	VMOVAPS      Y2, Y12
  5263  	VPSRAD       $0x1f, Y9, Y7
  5264  	VBLENDVPS    Y7, Y3, Y6, Y6
  5265  	VANDPS       Y0, Y10, Y10
  5266  
  5267  LBB8_8:
  5268  	VPCMPEQD Y5, Y8, Y0
  5269  	VPXOR    Y0, Y15, Y5
  5270  	VANDPS   Y8, Y11, Y0
  5271  	VANDPS   Y8, Y14, Y3
  5272  	VPCMPEQD Y3, Y8, Y14
  5273  	VPXOR    Y15, Y14, Y3
  5274  	VPCMPEQD Y0, Y8, Y0
  5275  	VPANDN   Y5, Y0, Y7
  5276  	VPOR     Y4, Y3, Y3
  5277  	VPAND    Y7, Y3, Y3
  5278  	VTESTPS  Y15, Y3
  5279  	JAE      LBB8_10
  5280  	VPXOR    X7, X7, X7
  5281  	VMOVUPS  Y6, (DI)(AX*4)
  5282  	ADDQ     $0x08, AX
  5283  	CMPQ     AX, DX
  5284  	JB       LBB8_2
  5285  
  5286  LBB8_12:
  5287  	ADDQ $+872, SP
  5288  	VZEROUPPER
  5289  	RET
  5290  
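// Pow_8x_AVX2_F32 above mirrors the Pow_4x_AVX2_F64 kernel at eight
// lanes: the same log2/exp2 pipeline, with VPSLLD $0x17 injecting the
// computed exponent into the 8-bit float32 exponent field and the
// same blend cascade for zero, negative and non-finite inputs.
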
  5291  DATA dataSinF32<>+0(SB)/4, $0x7fffffff
  5292  DATA dataSinF32<>+4(SB)/4, $0x3fa2f983
  5293  DATA dataSinF32<>+8(SB)/4, $0xfffffffe
  5294  DATA dataSinF32<>+12(SB)/4, $0x00000002
  5295  DATA dataSinF32<>+16(SB)/4, $0xbf490fdb
  5296  DATA dataSinF32<>+20(SB)/4, $0x80000000
  5297  DATA dataSinF32<>+24(SB)/4, $0x37ccf5ce
  5298  DATA dataSinF32<>+28(SB)/4, $0xbab6061a
  5299  DATA dataSinF32<>+32(SB)/4, $0x3d2aaaa5
  5300  DATA dataSinF32<>+36(SB)/4, $0xbf000000
  5301  DATA dataSinF32<>+40(SB)/4, $0x3f800000
  5302  DATA dataSinF32<>+44(SB)/4, $0xb94ca1f9
  5303  DATA dataSinF32<>+48(SB)/4, $0x3c08839e
  5304  DATA dataSinF32<>+52(SB)/4, $0xbe2aaaa3
  5305  DATA dataSinF32<>+56(SB)/4, $0x4b7fffff
  5306  DATA dataSinF32<>+60(SB)/8, $0xffffffffffffffff
  5307  DATA dataSinF32<>+68(SB)/8, $0xffffffffffffffff
  5308  DATA dataSinF32<>+76(SB)/8, $0xffffffffffffffff
  5309  DATA dataSinF32<>+84(SB)/8, $0xffffffffffffffff
  5310  DATA dataSinF32<>+92(SB)/8, $0x0000000000000000
  5311  DATA dataSinF32<>+100(SB)/8, $0x0000000000000000
  5312  DATA dataSinF32<>+108(SB)/8, $0x0000000000000000
  5313  DATA dataSinF32<>+116(SB)/8, $0x0000000000000000
  5314  GLOBL dataSinF32<>(SB), RODATA|NOPTR, $124
  5315  
  5316  // func Sin_AVX2_F32(x []float32)
  5317  // Requires: AVX, AVX2, CMOV, FMA3
  5318  TEXT ·Sin_AVX2_F32(SB), $0-24
  5319  	MOVQ         x_base+0(FP), DI
  5320  	MOVQ         x_len+8(FP), SI
  5321  	PUSHQ        AX
  5322  	MOVQ         SI, AX
  5323  	ANDQ         $-8, AX
  5324  	JE           LBB12_3
  5325  	XORL         CX, CX
  5326  	VBROADCASTSS dataSinF32<>+0(SB), Y0
  5327  	VMOVUPS      Y0, -32(SP)
  5328  	VBROADCASTSS dataSinF32<>+4(SB), Y0
  5329  	VMOVUPS      Y0, -64(SP)
  5330  	VBROADCASTSS dataSinF32<>+8(SB), Y0
  5331  	VMOVUPS      Y0, -96(SP)
  5332  	VPBROADCASTD dataSinF32<>+12(SB), Y4
  5333  	VPBROADCASTD dataSinF32<>+16(SB), Y0
  5334  	VMOVDQU      Y0, -128(SP)
  5335  	VPBROADCASTD dataSinF32<>+20(SB), Y7
  5336  	VBROADCASTSS dataSinF32<>+24(SB), Y8
  5337  	VBROADCASTSS dataSinF32<>+28(SB), Y9
  5338  	VBROADCASTSS dataSinF32<>+32(SB), Y10
  5339  	VBROADCASTSS dataSinF32<>+36(SB), Y11
  5340  	VBROADCASTSS dataSinF32<>+40(SB), Y12
  5341  	VBROADCASTSS dataSinF32<>+44(SB), Y3
  5342  	VBROADCASTSS dataSinF32<>+48(SB), Y14
  5343  	VBROADCASTSS dataSinF32<>+52(SB), Y15
  5344  
  5345  LBB12_2:
  5346  	VMOVUPS     (DI)(CX*4), Y2
  5347  	VANDPS      -32(SP), Y2, Y5
  5348  	VMULPS      -64(SP), Y5, Y0
  5349  	VCVTTPS2DQ  Y0, Y0
  5350  	VPSUBD      dataSinF32<>+60(SB), Y0, Y0
  5351  	VPAND       -96(SP), Y0, Y1
  5352  	VCVTDQ2PS   Y1, Y1
  5353  	VFMADD132PS -128(SP), Y5, Y1
  5354  	VMULPS      Y1, Y1, Y5
  5355  	VMOVAPS     Y3, Y13
  5356  	VFMADD213PS Y14, Y5, Y13
  5357  	VFMADD213PS Y15, Y5, Y13
  5358  	VMULPS      Y1, Y5, Y6
  5359  	VFMADD213PS Y1, Y13, Y6
  5360  	VPSLLD      $0x1d, Y0, Y1
  5361  	VPAND       Y4, Y0, Y0
  5362  	VPXOR       Y2, Y1, Y1
  5363  	VMOVAPS     Y8, Y2
  5364  	VFMADD213PS Y9, Y5, Y2
  5365  	VFMADD213PS Y10, Y5, Y2
  5366  	VFMADD213PS Y11, Y5, Y2
  5367  	VFMADD213PS Y12, Y5, Y2
  5368  	VPCMPEQD    Y4, Y0, Y5
  5369  	VANDPS      Y5, Y2, Y2
  5370  	VPCMPEQD    dataSinF32<>+92(SB), Y0, Y0
  5371  	VANDPS      Y0, Y6, Y0
  5372  	VADDPS      Y2, Y0, Y0
  5373  	VPAND       Y7, Y1, Y1
  5374  	VPXOR       Y0, Y1, Y0
  5375  	VMOVDQU     Y0, (DI)(CX*4)
  5376  	ADDQ        $0x08, CX
  5377  	CMPQ        CX, AX
  5378  	JB          LBB12_2
  5379  
  5380  LBB12_3:
  5381  	CMPQ         AX, SI
  5382  	JAE          LBB12_14
  5383  	VBROADCASTSS dataSinF32<>+20(SB), X0
  5384  	VPXOR        X1, X1, X1
  5385  	VMOVSS       dataSinF32<>+56(SB), X2
  5386  	VMOVSS       dataSinF32<>+40(SB), X9
  5387  	VMOVSS       dataSinF32<>+16(SB), X10
  5388  	VMOVSS       dataSinF32<>+24(SB), X12
  5389  	VMOVSS       dataSinF32<>+28(SB), X11
  5390  	VMOVSS       dataSinF32<>+32(SB), X13
  5391  	VMOVSS       dataSinF32<>+36(SB), X14
  5392  	VMOVSS       dataSinF32<>+44(SB), X8
  5393  	VMOVSS       dataSinF32<>+48(SB), X15
  5394  	VMOVSS       dataSinF32<>+52(SB), X6
  5395  	JMP          LBB12_5
  5396  
  5397  LBB12_13:
  5398  	ADDQ $0x01, AX
  5399  	CMPQ AX, SI
  5400  	JAE  LBB12_14
  5401  
  5402  LBB12_5:
  5403  	VMOVSS     (DI)(AX*4), X4
  5404  	VXORPS     X0, X4, X3
  5405  	VCMPSS     $0x01, X1, X4, X5
  5406  	VBLENDVPS  X5, X3, X4, X3
  5407  	VUCOMISS   X2, X3
  5408  	JA         LBB12_13
  5409  	VUCOMISS   X1, X4
  5410  	SETCS      R8
  5411  	VMULSS     dataSinF32<>+4(SB), X3, X4
  5412  	VCVTTSS2SI X4, DX
  5413  	VROUNDSS   $0x0b, X4, X4, X4
  5414  	MOVL       DX, CX
  5415  	ANDL       $0x01, CX
  5416  	JE         LBB12_8
  5417  	VADDSS     X4, X9, X4
  5418  
  5419  LBB12_8:
  5420  	ADDL        DX, CX
  5421  	ANDL        $0x07, CX
  5422  	LEAL        -4(CX), DX
  5423  	CMPL        CX, $0x04
  5424  	SETCC       R9
  5425  	CMOVLLT     CX, DX
  5426  	VFMADD231SS X10, X4, X3
  5427  	VMULSS      X3, X3, X4
  5428  	VMOVAPS     X12, X7
  5429  	VFMADD213SS X11, X4, X7
  5430  	VFMADD213SS X13, X4, X7
  5431  	VFMADD213SS X14, X4, X7
  5432  	VMOVAPS     X8, X5
  5433  	VFMADD213SS X15, X4, X5
  5434  	VFMADD213SS X6, X4, X5
  5435  	ADDL        $-1, DX
  5436  	CMPL        DX, $0x02
  5437  	JB          LBB12_9
  5438  	VMULSS      X3, X4, X4
  5439  	VFMADD213SS X3, X4, X5
  5440  	VMOVAPS     X5, X4
  5441  	VMOVSS      X4, (DI)(AX*4)
  5442  	CMPB        R8, R9
  5443  	JE          LBB12_13
  5444  	JMP         LBB12_12
  5445  
  5446  LBB12_9:
  5447  	VFMADD213SS X9, X7, X4
  5448  	VMOVSS      X4, (DI)(AX*4)
  5449  	CMPB        R8, R9
  5450  	JE          LBB12_13
  5451  
  5452  LBB12_12:
  5453  	VXORPS X0, X4, X3
  5454  	VMOVSS X3, (DI)(AX*4)
  5455  	JMP    LBB12_13
  5456  
  5457  LBB12_14:
  5458  	POPQ AX
  5459  	VZEROUPPER
  5460  	RET
  5461  
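// Sin_AVX2_F32 follows the classic Cephes sinf shape. Per lane:
// j = int(|x| * 4/pi) rounded up to even (the VPSUBD against the
// all-ones rows adds 1, the VPAND against 0xfffffffe clears bit 0),
// one FMA reduction z = |x| - j*(pi/4), sin and cos minimax
// polynomials in z*z selected by bit 1 of j, and a final sign equal
// to bit 2 of j XORed with the input's sign. The scalar tail also
// bails out for |x| > 16777215 (0x4b7fffff, the last float32 with
// unit ulp), where this short reduction has no accuracy left. A
// scalar Go model of one lane (a sketch; the coefficients are the
// table rows above):
//
//	func sin32(x float32) float32 {
//		z := x
//		if z < 0 {
//			z = -z
//		}
//		j := int32(z * (4 / math.Pi))   // octant of the angle
//		j = (j + 1) &^ 1                // round octant up to even
//		z -= float32(j) * (math.Pi / 4) // one-constant reduction (an FMA above)
//		w := z * z
//		sinP := z + z*w*(-1.6666654e-1+w*(8.3321608e-3+w*-1.9515296e-4))
//		cosP := 1 + w*(-0.5+w*(4.1666645e-2+w*(-1.3887316e-3+w*2.4433157e-5)))
//		r := sinP
//		if j&2 != 0 {
//			r = cosP
//		}
//		if (j&4 != 0) != (x < 0) { // octant sign XOR input sign
//			r = -r
//		}
//		return r
//	}
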
  5462  DATA dataCosF32<>+0(SB)/4, $0x7fffffff
  5463  DATA dataCosF32<>+4(SB)/4, $0x3fa2f983
  5464  DATA dataCosF32<>+8(SB)/4, $0xfffffffe
  5465  DATA dataCosF32<>+12(SB)/4, $0x00000002
  5466  DATA dataCosF32<>+16(SB)/4, $0xbf490fdb
  5467  DATA dataCosF32<>+20(SB)/4, $0xc0000000
  5468  DATA dataCosF32<>+24(SB)/4, $0x37ccf5ce
  5469  DATA dataCosF32<>+28(SB)/4, $0xbab6061a
  5470  DATA dataCosF32<>+32(SB)/4, $0x3d2aaaa5
  5471  DATA dataCosF32<>+36(SB)/4, $0xbf000000
  5472  DATA dataCosF32<>+40(SB)/4, $0x3f800000
  5473  DATA dataCosF32<>+44(SB)/4, $0xb94ca1f9
  5474  DATA dataCosF32<>+48(SB)/4, $0x3c08839e
  5475  DATA dataCosF32<>+52(SB)/4, $0xbe2aaaa3
  5476  DATA dataCosF32<>+56(SB)/4, $0x80000000
  5477  DATA dataCosF32<>+60(SB)/4, $0x4b7fffff
  5478  DATA dataCosF32<>+64(SB)/8, $0xffffffffffffffff
  5479  DATA dataCosF32<>+72(SB)/8, $0xffffffffffffffff
  5480  DATA dataCosF32<>+80(SB)/8, $0xffffffffffffffff
  5481  DATA dataCosF32<>+88(SB)/8, $0xffffffffffffffff
  5482  DATA dataCosF32<>+96(SB)/8, $0x0000000000000000
  5483  DATA dataCosF32<>+104(SB)/8, $0x0000000000000000
  5484  DATA dataCosF32<>+112(SB)/8, $0x0000000000000000
  5485  DATA dataCosF32<>+120(SB)/8, $0x0000000000000000
  5486  GLOBL dataCosF32<>(SB), RODATA|NOPTR, $128
  5487  
  5488  // func Cos_AVX2_F32(x []float32)
  5489  // Requires: AVX, AVX2, CMOV, FMA3
  5490  TEXT ·Cos_AVX2_F32(SB), NOSPLIT, $0-24
  5491  	MOVQ         x_base+0(FP), DI
  5492  	MOVQ         x_len+8(FP), SI
  5493  	SUBQ         $0x48, SP
  5494  	MOVQ         SI, AX
  5495  	ANDQ         $-8, AX
  5496  	JE           LBB13_3
  5497  	XORL         CX, CX
  5498  	VBROADCASTSS dataCosF32<>+0(SB), Y0
  5499  	VMOVUPS      Y0, 32(SP)
  5500  	VBROADCASTSS dataCosF32<>+4(SB), Y0
  5501  	VMOVUPS      Y0, (SP)
  5502  	VBROADCASTSS dataCosF32<>+8(SB), Y0
  5503  	VMOVUPS      Y0, -32(SP)
  5504  	VPBROADCASTD dataCosF32<>+12(SB), Y4
  5505  	VBROADCASTSS dataCosF32<>+16(SB), Y0
  5506  	VMOVUPS      Y0, -64(SP)
  5507  	VBROADCASTSS dataCosF32<>+20(SB), Y0
  5508  	VMOVUPS      Y0, -96(SP)
  5509  	VBROADCASTSS dataCosF32<>+24(SB), Y0
  5510  	VMOVUPS      Y0, -128(SP)
  5511  	VBROADCASTSS dataCosF32<>+28(SB), Y9
  5512  	VBROADCASTSS dataCosF32<>+32(SB), Y10
  5513  	VBROADCASTSS dataCosF32<>+36(SB), Y6
  5514  	VBROADCASTSS dataCosF32<>+40(SB), Y12
  5515  	VBROADCASTSS dataCosF32<>+44(SB), Y13
  5516  	VBROADCASTSS dataCosF32<>+48(SB), Y14
  5517  	VBROADCASTSS dataCosF32<>+52(SB), Y15
  5518  	VPBROADCASTD dataCosF32<>+56(SB), Y2
  5519  
  5520  LBB13_2:
  5521  	VMOVUPS     32(SP), Y0
  5522  	VANDPS      (DI)(CX*4), Y0, Y5
  5523  	VMULPS      (SP), Y5, Y0
  5524  	VCVTTPS2DQ  Y0, Y0
  5525  	VPSUBD      dataCosF32<>+64(SB), Y0, Y0
  5526  	VPAND       -32(SP), Y0, Y1
  5527  	VCVTDQ2PS   Y1, Y3
  5528  	VFMADD132PS -64(SP), Y5, Y3
  5529  	VMULPS      Y3, Y3, Y5
  5530  	VMOVUPS     -128(SP), Y8
  5531  	VFMADD213PS Y9, Y5, Y8
  5532  	VFMADD213PS Y10, Y5, Y8
  5533  	VMULPS      Y5, Y5, Y7
  5534  	VMOVAPS     Y6, Y11
  5535  	VFMADD213PS Y12, Y5, Y11
  5536  	VFMADD231PS Y7, Y8, Y11
  5537  	VMOVAPS     Y13, Y7
  5538  	VFMADD213PS Y14, Y5, Y7
  5539  	VFMADD213PS Y15, Y5, Y7
  5540  	VMULPS      Y3, Y5, Y5
  5541  	VFMADD213PS Y3, Y7, Y5
  5542  	VPAND       Y4, Y0, Y0
  5543  	VPCMPEQD    Y4, Y0, Y3
  5544  	VPCMPEQD    dataCosF32<>+96(SB), Y0, Y0
  5545  	VANDPS      Y0, Y5, Y0
  5546  	VANDPS      Y3, Y11, Y3
  5547  	VADDPS      Y3, Y0, Y0
  5548  	VADDPS      Y5, Y11, Y3
  5549  	VSUBPS      Y0, Y3, Y0
  5550  	VPSLLD      $0x1d, Y1, Y1
  5551  	VPADDD      -96(SP), Y1, Y1
  5552  	VPAND       Y2, Y1, Y1
  5553  	VPXOR       Y2, Y1, Y1
  5554  	VXORPS      Y1, Y0, Y0
  5555  	VMOVUPS     Y0, (DI)(CX*4)
  5556  	ADDQ        $0x08, CX
  5557  	CMPQ        CX, AX
  5558  	JB          LBB13_2
  5559  
  5560  LBB13_3:
  5561  	CMPQ         AX, SI
  5562  	JAE          LBB13_14
  5563  	VBROADCASTSS dataCosF32<>+56(SB), X0
  5564  	VXORPS       X1, X1, X1
  5565  	VMOVSS       dataCosF32<>+60(SB), X2
  5566  	VMOVSS       dataCosF32<>+40(SB), X9
  5567  	VMOVSS       dataCosF32<>+16(SB), X10
  5568  	VMOVSS       dataCosF32<>+24(SB), X8
  5569  	VMOVSS       dataCosF32<>+28(SB), X11
  5570  	VMOVSS       dataCosF32<>+32(SB), X13
  5571  	VMOVSS       dataCosF32<>+36(SB), X14
  5572  	VMOVSS       dataCosF32<>+44(SB), X7
  5573  	VMOVSS       dataCosF32<>+48(SB), X15
  5574  	VMOVSS       dataCosF32<>+52(SB), X6
  5575  	JMP          LBB13_5
  5576  
  5577  LBB13_13:
  5578  	ADDQ $0x01, AX
  5579  	CMPQ AX, SI
  5580  	JAE  LBB13_14
  5581  
  5582  LBB13_5:
  5583  	VMOVSS     (DI)(AX*4), X3
  5584  	VXORPS     X0, X3, X4
  5585  	VCMPSS     $0x01, X1, X3, X5
  5586  	VBLENDVPS  X5, X4, X3, X3
  5587  	VUCOMISS   X2, X3
  5588  	JA         LBB13_13
  5589  	VMULSS     dataCosF32<>+4(SB), X3, X4
  5590  	VCVTTSS2SI X4, DX
  5591  	VROUNDSS   $0x0b, X4, X4, X4
  5592  	MOVL       DX, CX
  5593  	ANDL       $0x01, CX
  5594  	JE         LBB13_8
  5595  	VADDSS     X4, X9, X4
  5596  
  5597  LBB13_8:
  5598  	ADDL        DX, CX
  5599  	ANDL        $0x07, CX
  5600  	LEAL        -4(CX), DX
  5601  	CMPL        CX, $0x04
  5602  	CMOVLLT     CX, DX
  5603  	SETCC       R8
  5604  	CMPL        DX, $0x02
  5605  	SETCC       CL
  5606  	VFMADD231SS X10, X4, X3
  5607  	VMULSS      X3, X3, X4
  5608  	VMOVAPS     X8, X12
  5609  	VFMADD213SS X11, X4, X12
  5610  	VFMADD213SS X13, X4, X12
  5611  	VFMADD213SS X14, X4, X12
  5612  	VMOVAPS     X7, X5
  5613  	VFMADD213SS X15, X4, X5
  5614  	VFMADD213SS X6, X4, X5
  5615  	ADDL        $-1, DX
  5616  	CMPL        DX, $0x02
  5617  	JB          LBB13_9
  5618  	VFMADD213SS X9, X12, X4
  5619  	VMOVAPS     X4, X5
  5620  	VMOVSS      X5, (DI)(AX*4)
  5621  	CMPB        R8, CL
  5622  	JE          LBB13_13
  5623  	JMP         LBB13_12
  5624  
  5625  LBB13_9:
  5626  	VMULSS      X3, X4, X4
  5627  	VFMADD213SS X3, X4, X5
  5628  	VMOVSS      X5, (DI)(AX*4)
  5629  	CMPB        R8, CL
  5630  	JE          LBB13_13
  5631  
  5632  LBB13_12:
  5633  	VXORPS X0, X5, X3
  5634  	VMOVSS X3, (DI)(AX*4)
  5635  	JMP    LBB13_13
  5636  
  5637  LBB13_14:
  5638  	ADDQ $0x48, SP
  5639  	VZEROUPPER
  5640  	RET
  5641  
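// Cos_AVX2_F32 reuses the Sin kernel's reduction and both
// polynomials, phase-shifted a quarter turn: where the sin table held
// the bare sign mask, dataCosF32<>+20 holds 0xc0000000, and folding
// that constant into the packed quadrant bits after the VPSLLD
// appears to be how cos(x) = sin(x + pi/2) is realized without a
// second coefficient table.
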
  5642  DATA dataSinCosF32<>+0(SB)/4, $0x7fffffff
  5643  DATA dataSinCosF32<>+4(SB)/4, $0x3fa2f983
  5644  DATA dataSinCosF32<>+8(SB)/4, $0xfffffffe
  5645  DATA dataSinCosF32<>+12(SB)/4, $0x00000002
  5646  DATA dataSinCosF32<>+16(SB)/4, $0xbf490fdb
  5647  DATA dataSinCosF32<>+20(SB)/4, $0xc0000000
  5648  DATA dataSinCosF32<>+24(SB)/4, $0x80000000
  5649  DATA dataSinCosF32<>+28(SB)/4, $0x37ccf5ce
  5650  DATA dataSinCosF32<>+32(SB)/4, $0xbab6061a
  5651  DATA dataSinCosF32<>+36(SB)/4, $0x3d2aaaa5
  5652  DATA dataSinCosF32<>+40(SB)/4, $0xbf000000
  5653  DATA dataSinCosF32<>+44(SB)/4, $0x3f800000
  5654  DATA dataSinCosF32<>+48(SB)/4, $0xb94ca1f9
  5655  DATA dataSinCosF32<>+52(SB)/4, $0x3c08839e
  5656  DATA dataSinCosF32<>+56(SB)/4, $0xbe2aaaa3
  5657  DATA dataSinCosF32<>+60(SB)/4, $0x4b7fffff
  5658  DATA dataSinCosF32<>+64(SB)/8, $0xffffffffffffffff
  5659  DATA dataSinCosF32<>+72(SB)/8, $0xffffffffffffffff
  5660  DATA dataSinCosF32<>+80(SB)/8, $0xffffffffffffffff
  5661  DATA dataSinCosF32<>+88(SB)/8, $0xffffffffffffffff
  5662  DATA dataSinCosF32<>+96(SB)/8, $0x0000000000000000
  5663  DATA dataSinCosF32<>+104(SB)/8, $0x0000000000000000
  5664  DATA dataSinCosF32<>+112(SB)/8, $0x0000000000000000
  5665  DATA dataSinCosF32<>+120(SB)/8, $0x0000000000000000
  5666  GLOBL dataSinCosF32<>(SB), RODATA|NOPTR, $128
  5667  
  5668  // func SinCos_AVX2_F32(x []float32, y []float32, z []float32)
  5669  // Requires: AVX, AVX2, CMOV, FMA3
  5670  TEXT ·SinCos_AVX2_F32(SB), $0-72
  5671  	MOVQ         x_base+0(FP), DI
  5672  	MOVQ         y_base+24(FP), SI
  5673  	MOVQ         z_base+48(FP), DX
  5674  	MOVQ         x_len+8(FP), CX
  5675  	PUSHQ        BX
  5676  	SUBQ         $0x60, SP
  5677  	MOVQ         CX, R8
  5678  	ANDQ         $-8, R8
  5679  	JE           LBB14_3
  5680  	XORL         AX, AX
  5681  	VBROADCASTSS dataSinCosF32<>+0(SB), Y0
  5682  	VMOVUPS      Y0, 64(SP)
  5683  	VBROADCASTSS dataSinCosF32<>+4(SB), Y0
  5684  	VMOVUPS      Y0, 32(SP)
  5685  	VBROADCASTSS dataSinCosF32<>+8(SB), Y0
  5686  	VMOVUPS      Y0, (SP)
  5687  	VPBROADCASTD dataSinCosF32<>+12(SB), Y4
  5688  	VBROADCASTSS dataSinCosF32<>+16(SB), Y0
  5689  	VMOVUPS      Y0, -32(SP)
  5690  	VBROADCASTSS dataSinCosF32<>+20(SB), Y0
  5691  	VMOVUPS      Y0, -64(SP)
  5692  	VPBROADCASTD dataSinCosF32<>+24(SB), Y8
  5693  	VBROADCASTSS dataSinCosF32<>+28(SB), Y0
  5694  	VMOVUPS      Y0, -96(SP)
  5695  	VBROADCASTSS dataSinCosF32<>+32(SB), Y0
  5696  	VMOVUPS      Y0, -128(SP)
  5697  	VBROADCASTSS dataSinCosF32<>+36(SB), Y11
  5698  	VBROADCASTSS dataSinCosF32<>+40(SB), Y10
  5699  	VBROADCASTSS dataSinCosF32<>+44(SB), Y13
  5700  	VBROADCASTSS dataSinCosF32<>+48(SB), Y14
  5701  	VBROADCASTSS dataSinCosF32<>+52(SB), Y15
  5702  	VBROADCASTSS dataSinCosF32<>+56(SB), Y2
  5703  
  5704  LBB14_2:
  5705  	VMOVUPS     (DX)(AX*4), Y5
  5706  	VANDPS      64(SP), Y5, Y1
  5707  	VMULPS      32(SP), Y1, Y0
  5708  	VCVTTPS2DQ  Y0, Y0
  5709  	VPSUBD      dataSinCosF32<>+64(SB), Y0, Y3
  5710  	VPAND       (SP), Y3, Y0
  5711  	VCVTDQ2PS   Y0, Y6
  5712  	VFMADD132PS -32(SP), Y1, Y6
  5713  	VMULPS      Y6, Y6, Y1
  5714  	VMOVUPS     -96(SP), Y9
  5715  	VFMADD213PS -128(SP), Y1, Y9
  5716  	VFMADD213PS Y11, Y1, Y9
  5717  	VMULPS      Y1, Y1, Y7
  5718  	VMOVAPS     Y10, Y12
  5719  	VFMADD213PS Y13, Y1, Y12
  5720  	VFMADD231PS Y7, Y9, Y12
  5721  	VMOVAPS     Y14, Y7
  5722  	VFMADD213PS Y15, Y1, Y7
  5723  	VFMADD213PS Y2, Y1, Y7
  5724  	VMULPS      Y6, Y1, Y1
  5725  	VFMADD213PS Y6, Y7, Y1
  5726  	VPSLLD      $0x1d, Y3, Y6
  5727  	VPAND       Y4, Y3, Y3
  5728  	VPXOR       Y5, Y6, Y5
  5729  	VPCMPEQD    Y4, Y3, Y6
  5730  	VPCMPEQD    dataSinCosF32<>+96(SB), Y3, Y3
  5731  	VANDPS      Y3, Y1, Y3
  5732  	VANDPS      Y6, Y12, Y6
  5733  	VADDPS      Y3, Y6, Y3
  5734  	VADDPS      Y1, Y12, Y1
  5735  	VPAND       Y5, Y8, Y5
  5736  	VSUBPS      Y3, Y1, Y1
  5737  	VPXOR       Y3, Y5, Y3
  5738  	VPSLLD      $0x1d, Y0, Y0
  5739  	VPADDD      -64(SP), Y0, Y0
  5740  	VPAND       Y0, Y8, Y0
  5741  	VPXOR       Y0, Y8, Y0
  5742  	VXORPS      Y0, Y1, Y0
  5743  	VMOVDQU     Y3, (DI)(AX*4)
  5744  	VMOVUPS     Y0, (SI)(AX*4)
  5745  	ADDQ        $0x08, AX
  5746  	CMPQ        AX, R8
  5747  	JB          LBB14_2
  5748  
  5749  LBB14_3:
  5750  	CMPQ         R8, CX
  5751  	JAE          LBB14_16
  5752  	VBROADCASTSS dataSinCosF32<>+24(SB), X0
  5753  	VXORPS       X1, X1, X1
  5754  	VMOVSS       dataSinCosF32<>+60(SB), X2
  5755  	VMOVSS       dataSinCosF32<>+44(SB), X6
  5756  	VMOVSS       dataSinCosF32<>+28(SB), X8
  5757  	VMOVSS       dataSinCosF32<>+36(SB), X12
  5758  	VMOVSS       dataSinCosF32<>+40(SB), X13
  5759  	VMOVSS       dataSinCosF32<>+48(SB), X15
  5760  	VMOVSS       dataSinCosF32<>+52(SB), X14
  5761  	VMOVSS       dataSinCosF32<>+56(SB), X10
  5762  	JMP          LBB14_5
  5763  
  5764  LBB14_15:
  5765  	ADDQ $0x01, R8
  5766  	CMPQ R8, CX
  5767  	JAE  LBB14_16
  5768  
  5769  LBB14_5:
  5770  	VMOVSS     (DX)(R8*4), X4
  5771  	VXORPS     X0, X4, X5
  5772  	VCMPSS     $0x01, X1, X4, X7
  5773  	VBLENDVPS  X7, X5, X4, X5
  5774  	VUCOMISS   X2, X5
  5775  	JA         LBB14_15
  5776  	VUCOMISS   X1, X4
  5777  	SETCS      R9
  5778  	VMULSS     dataSinCosF32<>+4(SB), X5, X4
  5779  	VCVTTSS2SI X4, R10
  5780  	VROUNDSS   $0x0b, X4, X4, X4
  5781  	MOVL       R10, AX
  5782  	ANDL       $0x01, AX
  5783  	JE         LBB14_8
  5784  	VADDSS     X6, X4, X4
  5785  
  5786  LBB14_8:
  5787  	ADDL        R10, AX
  5788  	ANDL        $0x07, AX
  5789  	LEAL        -4(AX), R10
  5790  	CMPL        AX, $0x04
  5791  	SETCC       R11
  5792  	CMOVLLT     AX, R10
  5793  	VFMADD231SS dataSinCosF32<>+16(SB), X4, X5
  5794  	VMULSS      X5, X5, X7
  5795  	VMOVAPS     X8, X11
  5796  	VFMADD213SS dataSinCosF32<>+32(SB), X7, X11
  5797  	VFMADD213SS X12, X7, X11
  5798  	VMULSS      X7, X7, X9
  5799  	VMOVAPS     X6, X4
  5800  	VFMADD231SS X13, X7, X4
  5801  	VFMADD231SS X9, X11, X4
  5802  	VMOVAPS     X15, X3
  5803  	VFMADD213SS X14, X7, X3
  5804  	VFMADD213SS X10, X7, X3
  5805  	VMULSS      X5, X7, X7
  5806  	VFMADD213SS X5, X3, X7
  5807  	LEAL        -1(R10), BX
  5808  	CMPL        BX, $0x02
  5809  	JB          LBB14_9
  5810  	VMOVAPS     X7, X5
  5811  	VMOVSS      X5, (DI)(R8*4)
  5812  	VMOVSS      X4, (SI)(R8*4)
  5813  	CMPB        R9, R11
  5814  	JNE         LBB14_12
  5815  	JMP         LBB14_13
  5816  
  5817  LBB14_9:
  5818  	VMOVAPS X4, X5
  5819  	VMOVAPS X7, X4
  5820  	VMOVSS  X5, (DI)(R8*4)
  5821  	VMOVSS  X4, (SI)(R8*4)
  5822  	CMPB    R9, R11
  5823  	JE      LBB14_13
  5824  
  5825  LBB14_12:
  5826  	VMOVSS (DI)(R8*4), X3
  5827  	VXORPS X0, X3, X3
  5828  	VMOVSS X3, (DI)(R8*4)
  5829  
  5830  LBB14_13:
  5831  	CMPL   R10, $0x02
  5832  	SETCC  BL
  5833  	CMPL   AX, $0x04
  5834  	SETCC  AL
  5835  	CMPB   AL, BL
  5836  	JE     LBB14_15
  5837  	VMOVSS (SI)(R8*4), X3
  5838  	VXORPS X0, X3, X3
  5839  	VMOVSS X3, (SI)(R8*4)
  5840  	JMP    LBB14_15
  5841  
  5842  LBB14_16:
  5843  	ADDQ $0x60, SP
  5844  	POPQ BX
  5845  	VZEROUPPER
  5846  	RET
  5847  
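// The LBB14_* blocks above are the remainder of the SinCos kernel: the
// vector loop (LBB14_2) reduces each lane into pi/4 octants, evaluates the
// sine and cosine polynomials together, and patches signs per octant with
// the VPSLLD/VPXOR pairs, while the scalar loop (LBB14_5 onward) handles
// the leftover elements one at a time before writing to the two output
// slices through DI and SI.
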
  5848  DATA dataExpLen8xF32<>+0(SB)/4, $0x42b17218
  5849  DATA dataExpLen8xF32<>+4(SB)/4, $0xc2ce8ed0
  5850  DATA dataExpLen8xF32<>+8(SB)/4, $0x3f000000
  5851  DATA dataExpLen8xF32<>+12(SB)/4, $0x3fb8aa3b
  5852  DATA dataExpLen8xF32<>+16(SB)/4, $0xbf318000
  5853  DATA dataExpLen8xF32<>+20(SB)/4, $0x395e8083
  5854  DATA dataExpLen8xF32<>+24(SB)/4, $0x3f800000
  5855  DATA dataExpLen8xF32<>+28(SB)/4, $0x3ab743ce
  5856  DATA dataExpLen8xF32<>+32(SB)/4, $0x39506967
  5857  DATA dataExpLen8xF32<>+36(SB)/4, $0x3c088908
  5858  DATA dataExpLen8xF32<>+40(SB)/4, $0x3d2aa9c1
  5859  DATA dataExpLen8xF32<>+44(SB)/4, $0x3e2aaaaa
  5860  DATA dataExpLen8xF32<>+48(SB)/4, $0x7f7fffff
  5861  GLOBL dataExpLen8xF32<>(SB), RODATA|NOPTR, $52
  5862  
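// Exp_Len8x_AVX2_F32 below evaluates expf eight lanes at a time via the
// classic 2^k * exp(r) reduction. A rough scalar model of one lane (the
// names are illustrative, not part of this file):
//
//	k := floorf(x*log2E + 0.5)        // 0x3fb8aa3b is log2(e)
//	r := x - k*ln2Hi + k*ln2Lo        // Cody-Waite split of ln(2)
//	y := (1 + r + r*r*p(r)) * exp2(k) // p is a degree-5 polynomial
//
// exp2(k) costs no multiply: k<<23 is added to the bit pattern of 1.0
// (VPSLLD/VPADDD). Inputs above ~88.72 (0x42b17218) clamp to MaxFloat32 and
// inputs below ~-103.28 (0xc2ce8ed0) flush to zero in the final
// compare/blend pair.
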
  5863  // func Exp_Len8x_AVX2_F32(x []float32)
  5864  // Requires: AVX, AVX2, FMA3
  5865  TEXT ·Exp_Len8x_AVX2_F32(SB), NOSPLIT, $0-24
  5866  	MOVQ         x_base+0(FP), DI
  5867  	MOVQ         x_len+8(FP), SI
  5868  	TESTQ        SI, SI
  5869  	JE           LBB11_3
  5870  	XORL         AX, AX
  5871  	VBROADCASTSS dataExpLen8xF32<>+0(SB), Y0
  5872  	VMOVUPS      Y0, -40(SP)
  5873  	VBROADCASTSS dataExpLen8xF32<>+4(SB), Y0
  5874  	VMOVUPS      Y0, -72(SP)
  5875  	VBROADCASTSS dataExpLen8xF32<>+8(SB), Y2
  5876  	VBROADCASTSS dataExpLen8xF32<>+12(SB), Y3
  5877  	VBROADCASTSS dataExpLen8xF32<>+16(SB), Y4
  5878  	VBROADCASTSS dataExpLen8xF32<>+20(SB), Y5
  5879  	VPBROADCASTD dataExpLen8xF32<>+24(SB), Y6
  5880  	VBROADCASTSS dataExpLen8xF32<>+28(SB), Y7
  5881  	VBROADCASTSS dataExpLen8xF32<>+32(SB), Y1
  5882  	VBROADCASTSS dataExpLen8xF32<>+36(SB), Y9
  5883  	VBROADCASTSS dataExpLen8xF32<>+40(SB), Y10
  5884  	VBROADCASTSS dataExpLen8xF32<>+44(SB), Y11
  5885  	VBROADCASTSS dataExpLen8xF32<>+48(SB), Y12
  5886  
  5887  LBB11_2:
  5888  	VMOVUPS     (DI)(AX*4), Y13
  5889  	VMOVAPS     Y3, Y14
  5890  	VFMADD213PS Y2, Y13, Y14
  5891  	VROUNDPS    $0x01, Y14, Y14
  5892  	VMOVAPS     Y4, Y15
  5893  	VFMADD213PS Y13, Y14, Y15
  5894  	VFMADD231PS Y5, Y14, Y15
  5895  	VMULPS      Y15, Y15, Y0
  5896  	VMOVAPS     Y1, Y8
  5897  	VFMADD213PS Y7, Y15, Y8
  5898  	VFMADD213PS Y9, Y15, Y8
  5899  	VFMADD213PS Y10, Y15, Y8
  5900  	VFMADD213PS Y11, Y15, Y8
  5901  	VFMADD213PS Y2, Y15, Y8
  5902  	VFMADD213PS Y15, Y0, Y8
  5903  	VCVTTPS2DQ  Y14, Y0
  5904  	VPSLLD      $0x17, Y0, Y0
  5905  	VPADDD      Y6, Y0, Y0
  5906  	VFMADD213PS Y0, Y0, Y8
  5907  	VMOVUPS     -40(SP), Y0
  5908  	VCMPPS      $0x01, Y13, Y0, Y0
  5909  	VBLENDVPS   Y0, Y12, Y8, Y0
  5910  	VMOVUPS     -72(SP), Y8
  5911  	VCMPPS      $0x02, Y13, Y8, Y8
  5912  	VANDPS      Y0, Y8, Y0
  5913  	VMOVUPS     Y0, (DI)(AX*4)
  5914  	ADDQ        $0x08, AX
  5915  	CMPQ        AX, SI
  5916  	JB          LBB11_2
  5917  
  5918  LBB11_3:
  5919  	VZEROUPPER
  5920  	RET
  5921  
  5922  DATA dataLogLen8xF32<>+0(SB)/4, $0x00800000
  5923  DATA dataLogLen8xF32<>+4(SB)/4, $0x807fffff
  5924  DATA dataLogLen8xF32<>+8(SB)/4, $0x3f000000
  5925  DATA dataLogLen8xF32<>+12(SB)/4, $0xffffff81
  5926  DATA dataLogLen8xF32<>+16(SB)/4, $0x3f800000
  5927  DATA dataLogLen8xF32<>+20(SB)/4, $0x3f3504f3
  5928  DATA dataLogLen8xF32<>+24(SB)/4, $0xbf800000
  5929  DATA dataLogLen8xF32<>+28(SB)/4, $0x3d9021bb
  5930  DATA dataLogLen8xF32<>+32(SB)/4, $0xbdebd1b8
  5931  DATA dataLogLen8xF32<>+36(SB)/4, $0x3def251a
  5932  DATA dataLogLen8xF32<>+40(SB)/4, $0xbdfe5d4f
  5933  DATA dataLogLen8xF32<>+44(SB)/4, $0x3e11e9bf
  5934  DATA dataLogLen8xF32<>+48(SB)/4, $0xbe2aae50
  5935  DATA dataLogLen8xF32<>+52(SB)/4, $0x3e4cceac
  5936  DATA dataLogLen8xF32<>+56(SB)/4, $0xbe7ffffc
  5937  DATA dataLogLen8xF32<>+60(SB)/4, $0x3eaaaaaa
  5938  DATA dataLogLen8xF32<>+64(SB)/4, $0x3f317218
  5939  DATA dataLogLen8xF32<>+68(SB)/4, $0xbf000000
  5940  DATA dataLogLen8xF32<>+72(SB)/8, $0x0000000000000000
  5941  DATA dataLogLen8xF32<>+80(SB)/8, $0x0000000000000000
  5942  DATA dataLogLen8xF32<>+88(SB)/8, $0x0000000000000000
  5943  DATA dataLogLen8xF32<>+96(SB)/8, $0x0000000000000000
  5944  GLOBL dataLogLen8xF32<>(SB), RODATA|NOPTR, $104
  5945  
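// Log_Len8x_AVX2_F32 below uses the standard frexp-style reduction: inputs
// are clamped up to the smallest normal (0x00800000), split into exponent e
// and mantissa m by bit masking (0x807fffff | 0x3f000000 places m in
// [0.5, 1)), m is re-centered around 1 using the sqrt(1/2) threshold
// (0x3f3504f3), a polynomial in f = m-1 approximates log(1+f), and e*ln2
// (0x3f317218) is folded back in. Inputs <= 0 come out as NaN through the
// trailing all-ones compare/OR against the zero words at +72.
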
  5946  // func Log_Len8x_AVX2_F32(x []float32)
  5947  // Requires: AVX, AVX2, FMA3
  5948  TEXT ·Log_Len8x_AVX2_F32(SB), NOSPLIT, $0-24
  5949  	MOVQ         x_base+0(FP), DI
  5950  	MOVQ         x_len+8(FP), SI
  5951  	SUBQ         $0x68, SP
  5952  	TESTQ        SI, SI
  5953  	JE           LBB10_3
  5954  	XORL         AX, AX
  5955  	VBROADCASTSS dataLogLen8xF32<>+0(SB), Y0
  5956  	VMOVUPS      Y0, 64(SP)
  5957  	VBROADCASTSS dataLogLen8xF32<>+4(SB), Y0
  5958  	VMOVUPS      Y0, 32(SP)
  5959  	VBROADCASTSS dataLogLen8xF32<>+8(SB), Y0
  5960  	VMOVUPS      Y0, (SP)
  5961  	VBROADCASTSS dataLogLen8xF32<>+12(SB), Y0
  5962  	VMOVUPS      Y0, -32(SP)
  5963  	VBROADCASTSS dataLogLen8xF32<>+16(SB), Y0
  5964  	VMOVUPS      Y0, -64(SP)
  5965  	VBROADCASTSS dataLogLen8xF32<>+20(SB), Y0
  5966  	VMOVUPS      Y0, -96(SP)
  5967  	VBROADCASTSS dataLogLen8xF32<>+24(SB), Y0
  5968  	VMOVUPS      Y0, -128(SP)
  5969  	VBROADCASTSS dataLogLen8xF32<>+28(SB), Y8
  5970  	VBROADCASTSS dataLogLen8xF32<>+32(SB), Y9
  5971  	VBROADCASTSS dataLogLen8xF32<>+36(SB), Y10
  5972  	VBROADCASTSS dataLogLen8xF32<>+40(SB), Y11
  5973  	VBROADCASTSS dataLogLen8xF32<>+44(SB), Y12
  5974  	VBROADCASTSS dataLogLen8xF32<>+48(SB), Y13
  5975  	VBROADCASTSS dataLogLen8xF32<>+52(SB), Y14
  5976  	VBROADCASTSS dataLogLen8xF32<>+56(SB), Y15
  5977  	VBROADCASTSS dataLogLen8xF32<>+60(SB), Y0
  5978  	VBROADCASTSS dataLogLen8xF32<>+64(SB), Y1
  5979  	VBROADCASTSS dataLogLen8xF32<>+68(SB), Y2
  5980  
  5981  LBB10_2:
  5982  	VMOVUPS     (DI)(AX*4), Y3
  5983  	VMAXPS      64(SP), Y3, Y4
  5984  	VPSRLD      $0x17, Y4, Y5
  5985  	VPADDD      -32(SP), Y5, Y5
  5986  	VANDPS      32(SP), Y4, Y4
  5987  	VORPS       (SP), Y4, Y4
  5988  	VCVTDQ2PS   Y5, Y5
  5989  	VADDPS      -64(SP), Y5, Y6
  5990  	VCMPPS      $0x01, -96(SP), Y4, Y7
  5991  	VBLENDVPS   Y7, Y5, Y6, Y5
  5992  	VANDPS      Y4, Y7, Y6
  5993  	VADDPS      -128(SP), Y4, Y4
  5994  	VADDPS      Y6, Y4, Y4
  5995  	VMOVAPS     Y8, Y6
  5996  	VFMADD213PS Y9, Y4, Y6
  5997  	VFMADD213PS Y10, Y4, Y6
  5998  	VFMADD213PS Y11, Y4, Y6
  5999  	VFMADD213PS Y12, Y4, Y6
  6000  	VFMADD213PS Y13, Y4, Y6
  6001  	VFMADD213PS Y14, Y4, Y6
  6002  	VFMADD213PS Y15, Y4, Y6
  6003  	VFMADD213PS Y0, Y4, Y6
  6004  	VFMADD213PS Y2, Y4, Y6
  6005  	VFMADD213PS Y4, Y1, Y5
  6006  	VMULPS      Y4, Y4, Y4
  6007  	VFMADD231PS Y6, Y4, Y5
  6008  	VCMPPS      $0x02, dataLogLen8xF32<>+72(SB), Y3, Y3
  6009  	VORPS       Y5, Y3, Y3
  6010  	VMOVUPS     Y3, (DI)(AX*4)
  6011  	ADDQ        $0x08, AX
  6012  	CMPQ        AX, SI
  6013  	JB          LBB10_2
  6014  
  6015  LBB10_3:
  6016  	ADDQ $0x68, SP
  6017  	VZEROUPPER
  6018  	RET
  6019  
  6020  DATA dataLog2Len8xF32<>+0(SB)/4, $0x00800000
  6021  DATA dataLog2Len8xF32<>+4(SB)/4, $0x807fffff
  6022  DATA dataLog2Len8xF32<>+8(SB)/4, $0x3f000000
  6023  DATA dataLog2Len8xF32<>+12(SB)/4, $0xffffff81
  6024  DATA dataLog2Len8xF32<>+16(SB)/4, $0x3f800000
  6025  DATA dataLog2Len8xF32<>+20(SB)/4, $0x3f3504f3
  6026  DATA dataLog2Len8xF32<>+24(SB)/4, $0xbf800000
  6027  DATA dataLog2Len8xF32<>+28(SB)/4, $0x3d9021bb
  6028  DATA dataLog2Len8xF32<>+32(SB)/4, $0xbdebd1b8
  6029  DATA dataLog2Len8xF32<>+36(SB)/4, $0x3def251a
  6030  DATA dataLog2Len8xF32<>+40(SB)/4, $0xbdfe5d4f
  6031  DATA dataLog2Len8xF32<>+44(SB)/4, $0x3e11e9bf
  6032  DATA dataLog2Len8xF32<>+48(SB)/4, $0xbe2aae50
  6033  DATA dataLog2Len8xF32<>+52(SB)/4, $0x3e4cceac
  6034  DATA dataLog2Len8xF32<>+56(SB)/4, $0xbe7ffffc
  6035  DATA dataLog2Len8xF32<>+60(SB)/4, $0x3eaaaaaa
  6036  DATA dataLog2Len8xF32<>+64(SB)/4, $0x3f317218
  6037  DATA dataLog2Len8xF32<>+68(SB)/4, $0xbf000000
  6038  DATA dataLog2Len8xF32<>+72(SB)/4, $0x3fb8aa3b
  6039  DATA dataLog2Len8xF32<>+76(SB)/8, $0x0000000000000000
  6040  DATA dataLog2Len8xF32<>+84(SB)/8, $0x0000000000000000
  6041  DATA dataLog2Len8xF32<>+92(SB)/8, $0x0000000000000000
  6042  DATA dataLog2Len8xF32<>+100(SB)/8, $0x0000000000000000
  6043  GLOBL dataLog2Len8xF32<>(SB), RODATA|NOPTR, $108
  6044  
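// Log2_Len8x_AVX2_F32 below shares the reduction and polynomial of
// Log_Len8x above; the one extra constant at +72 (0x3fb8aa3b = log2(e),
// broadcast into Y3) rescales the natural log to base 2 in the final
// VMULPS.
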
  6045  // func Log2_Len8x_AVX2_F32(x []float32)
  6046  // Requires: AVX, AVX2, FMA3
  6047  TEXT ·Log2_Len8x_AVX2_F32(SB), NOSPLIT, $0-24
  6048  	MOVQ         x_base+0(FP), DI
  6049  	MOVQ         x_len+8(FP), SI
  6050  	SUBQ         $0x88, SP
  6051  	TESTQ        SI, SI
  6052  	JE           LBB9_3
  6053  	XORL         AX, AX
  6054  	VBROADCASTSS dataLog2Len8xF32<>+4(SB), Y0
  6055  	VMOVUPS      Y0, 96(SP)
  6056  	VBROADCASTSS dataLog2Len8xF32<>+8(SB), Y0
  6057  	VMOVUPS      Y0, 64(SP)
  6058  	VBROADCASTSS dataLog2Len8xF32<>+12(SB), Y0
  6059  	VMOVUPS      Y0, 32(SP)
  6060  	VBROADCASTSS dataLog2Len8xF32<>+0(SB), Y0
  6061  	VMOVUPS      Y0, (SP)
  6062  	VBROADCASTSS dataLog2Len8xF32<>+16(SB), Y0
  6063  	VMOVUPS      Y0, -32(SP)
  6064  	VBROADCASTSS dataLog2Len8xF32<>+20(SB), Y0
  6065  	VMOVUPS      Y0, -64(SP)
  6066  	VBROADCASTSS dataLog2Len8xF32<>+24(SB), Y0
  6067  	VMOVUPS      Y0, -96(SP)
  6068  	VBROADCASTSS dataLog2Len8xF32<>+28(SB), Y0
  6069  	VMOVUPS      Y0, -128(SP)
  6070  	VBROADCASTSS dataLog2Len8xF32<>+32(SB), Y9
  6071  	VBROADCASTSS dataLog2Len8xF32<>+36(SB), Y10
  6072  	VBROADCASTSS dataLog2Len8xF32<>+40(SB), Y11
  6073  	VBROADCASTSS dataLog2Len8xF32<>+44(SB), Y12
  6074  	VBROADCASTSS dataLog2Len8xF32<>+48(SB), Y13
  6075  	VBROADCASTSS dataLog2Len8xF32<>+52(SB), Y14
  6076  	VBROADCASTSS dataLog2Len8xF32<>+56(SB), Y15
  6077  	VBROADCASTSS dataLog2Len8xF32<>+60(SB), Y0
  6078  	VBROADCASTSS dataLog2Len8xF32<>+64(SB), Y1
  6079  	VBROADCASTSS dataLog2Len8xF32<>+68(SB), Y2
  6080  	VBROADCASTSS dataLog2Len8xF32<>+72(SB), Y3
  6081  
  6082  LBB9_2:
  6083  	VMOVUPS     (DI)(AX*4), Y4
  6084  	VMAXPS      (SP), Y4, Y5
  6085  	VPSRLD      $0x17, Y5, Y6
  6086  	VPADDD      32(SP), Y6, Y6
  6087  	VANDPS      96(SP), Y5, Y5
  6088  	VORPS       64(SP), Y5, Y5
  6089  	VCVTDQ2PS   Y6, Y6
  6090  	VADDPS      -32(SP), Y6, Y7
  6091  	VCMPPS      $0x01, -64(SP), Y5, Y8
  6092  	VBLENDVPS   Y8, Y6, Y7, Y6
  6093  	VANDPS      Y5, Y8, Y7
  6094  	VADDPS      -96(SP), Y5, Y5
  6095  	VADDPS      Y7, Y5, Y5
  6096  	VMOVUPS     -128(SP), Y7
  6097  	VFMADD213PS Y9, Y5, Y7
  6098  	VFMADD213PS Y10, Y5, Y7
  6099  	VFMADD213PS Y11, Y5, Y7
  6100  	VFMADD213PS Y12, Y5, Y7
  6101  	VFMADD213PS Y13, Y5, Y7
  6102  	VFMADD213PS Y14, Y5, Y7
  6103  	VFMADD213PS Y15, Y5, Y7
  6104  	VFMADD213PS Y0, Y5, Y7
  6105  	VFMADD213PS Y2, Y5, Y7
  6106  	VFMADD213PS Y5, Y1, Y6
  6107  	VMULPS      Y5, Y5, Y5
  6108  	VFMADD231PS Y7, Y5, Y6
  6109  	VCMPPS      $0x02, dataLog2Len8xF32<>+76(SB), Y4, Y4
  6110  	VMULPS      Y3, Y6, Y5
  6111  	VORPS       Y5, Y4, Y4
  6112  	VMOVUPS     Y4, (DI)(AX*4)
  6113  	ADDQ        $0x08, AX
  6114  	CMPQ        AX, SI
  6115  	JB          LBB9_2
  6116  
  6117  LBB9_3:
  6118  	ADDQ $0x88, SP
  6119  	VZEROUPPER
  6120  	RET
  6121  
  6122  DATA dataLog10Len8xF32<>+0(SB)/4, $0x00800000
  6123  DATA dataLog10Len8xF32<>+4(SB)/4, $0x807fffff
  6124  DATA dataLog10Len8xF32<>+8(SB)/4, $0x3f000000
  6125  DATA dataLog10Len8xF32<>+12(SB)/4, $0xffffff81
  6126  DATA dataLog10Len8xF32<>+16(SB)/4, $0x3f800000
  6127  DATA dataLog10Len8xF32<>+20(SB)/4, $0x3f3504f3
  6128  DATA dataLog10Len8xF32<>+24(SB)/4, $0xbf800000
  6129  DATA dataLog10Len8xF32<>+28(SB)/4, $0x3d9021bb
  6130  DATA dataLog10Len8xF32<>+32(SB)/4, $0xbdebd1b8
  6131  DATA dataLog10Len8xF32<>+36(SB)/4, $0x3def251a
  6132  DATA dataLog10Len8xF32<>+40(SB)/4, $0xbdfe5d4f
  6133  DATA dataLog10Len8xF32<>+44(SB)/4, $0x3e11e9bf
  6134  DATA dataLog10Len8xF32<>+48(SB)/4, $0xbe2aae50
  6135  DATA dataLog10Len8xF32<>+52(SB)/4, $0x3e4cceac
  6136  DATA dataLog10Len8xF32<>+56(SB)/4, $0xbe7ffffc
  6137  DATA dataLog10Len8xF32<>+60(SB)/4, $0x3eaaaaaa
  6138  DATA dataLog10Len8xF32<>+64(SB)/4, $0x3f317218
  6139  DATA dataLog10Len8xF32<>+68(SB)/4, $0xbf000000
  6140  DATA dataLog10Len8xF32<>+72(SB)/4, $0x3ede5bd9
  6141  DATA dataLog10Len8xF32<>+76(SB)/8, $0x0000000000000000
  6142  DATA dataLog10Len8xF32<>+84(SB)/8, $0x0000000000000000
  6143  DATA dataLog10Len8xF32<>+92(SB)/8, $0x0000000000000000
  6144  DATA dataLog10Len8xF32<>+100(SB)/8, $0x0000000000000000
  6145  GLOBL dataLog10Len8xF32<>(SB), RODATA|NOPTR, $108
  6146  
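// Log10_Len8x_AVX2_F32 below is the same kernel once more, with the +72
// constant set to log10(e) (0x3ede5bd9 ~= 0.4342945) so the final VMULPS
// rescales the natural log to base 10.
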
  6147  // func Log10_Len8x_AVX2_F32(x []float32)
  6148  // Requires: AVX, AVX2, FMA3
  6149  TEXT ·Log10_Len8x_AVX2_F32(SB), NOSPLIT, $0-24
  6150  	MOVQ         x_base+0(FP), DI
  6151  	MOVQ         x_len+8(FP), SI
  6152  	SUBQ         $0x88, SP
  6153  	TESTQ        SI, SI
  6154  	JE           LBB8_3
  6155  	XORL         AX, AX
  6156  	VBROADCASTSS dataLog10Len8xF32<>+4(SB), Y0
  6157  	VMOVUPS      Y0, 96(SP)
  6158  	VBROADCASTSS dataLog10Len8xF32<>+8(SB), Y0
  6159  	VMOVUPS      Y0, 64(SP)
  6160  	VBROADCASTSS dataLog10Len8xF32<>+12(SB), Y0
  6161  	VMOVUPS      Y0, 32(SP)
  6162  	VBROADCASTSS dataLog10Len8xF32<>+0(SB), Y0
  6163  	VMOVUPS      Y0, (SP)
  6164  	VBROADCASTSS dataLog10Len8xF32<>+16(SB), Y0
  6165  	VMOVUPS      Y0, -32(SP)
  6166  	VBROADCASTSS dataLog10Len8xF32<>+20(SB), Y0
  6167  	VMOVUPS      Y0, -64(SP)
  6168  	VBROADCASTSS dataLog10Len8xF32<>+24(SB), Y0
  6169  	VMOVUPS      Y0, -96(SP)
  6170  	VBROADCASTSS dataLog10Len8xF32<>+28(SB), Y0
  6171  	VMOVUPS      Y0, -128(SP)
  6172  	VBROADCASTSS dataLog10Len8xF32<>+32(SB), Y9
  6173  	VBROADCASTSS dataLog10Len8xF32<>+36(SB), Y10
  6174  	VBROADCASTSS dataLog10Len8xF32<>+40(SB), Y11
  6175  	VBROADCASTSS dataLog10Len8xF32<>+44(SB), Y12
  6176  	VBROADCASTSS dataLog10Len8xF32<>+48(SB), Y13
  6177  	VBROADCASTSS dataLog10Len8xF32<>+52(SB), Y14
  6178  	VBROADCASTSS dataLog10Len8xF32<>+56(SB), Y15
  6179  	VBROADCASTSS dataLog10Len8xF32<>+60(SB), Y0
  6180  	VBROADCASTSS dataLog10Len8xF32<>+64(SB), Y1
  6181  	VBROADCASTSS dataLog10Len8xF32<>+68(SB), Y2
  6182  	VBROADCASTSS dataLog10Len8xF32<>+72(SB), Y3
  6183  
  6184  LBB8_2:
  6185  	VMOVUPS     (DI)(AX*4), Y4
  6186  	VMAXPS      (SP), Y4, Y5
  6187  	VPSRLD      $0x17, Y5, Y6
  6188  	VPADDD      32(SP), Y6, Y6
  6189  	VANDPS      96(SP), Y5, Y5
  6190  	VORPS       64(SP), Y5, Y5
  6191  	VCVTDQ2PS   Y6, Y6
  6192  	VADDPS      -32(SP), Y6, Y7
  6193  	VCMPPS      $0x01, -64(SP), Y5, Y8
  6194  	VBLENDVPS   Y8, Y6, Y7, Y6
  6195  	VANDPS      Y5, Y8, Y7
  6196  	VADDPS      -96(SP), Y5, Y5
  6197  	VADDPS      Y7, Y5, Y5
  6198  	VMOVUPS     -128(SP), Y7
  6199  	VFMADD213PS Y9, Y5, Y7
  6200  	VFMADD213PS Y10, Y5, Y7
  6201  	VFMADD213PS Y11, Y5, Y7
  6202  	VFMADD213PS Y12, Y5, Y7
  6203  	VFMADD213PS Y13, Y5, Y7
  6204  	VFMADD213PS Y14, Y5, Y7
  6205  	VFMADD213PS Y15, Y5, Y7
  6206  	VFMADD213PS Y0, Y5, Y7
  6207  	VFMADD213PS Y2, Y5, Y7
  6208  	VFMADD213PS Y5, Y1, Y6
  6209  	VMULPS      Y5, Y5, Y5
  6210  	VFMADD231PS Y7, Y5, Y6
  6211  	VCMPPS      $0x02, dataLog10Len8xF32<>+76(SB), Y4, Y4
  6212  	VMULPS      Y3, Y6, Y5
  6213  	VORPS       Y5, Y4, Y4
  6214  	VMOVUPS     Y4, (DI)(AX*4)
  6215  	ADDQ        $0x08, AX
  6216  	CMPQ        AX, SI
  6217  	JB          LBB8_2
  6218  
  6219  LBB8_3:
  6220  	ADDQ $0x88, SP
  6221  	VZEROUPPER
  6222  	RET
  6223  
  6224  DATA dataMinF64<>+0(SB)/8, $0x7fefffffffffffff
  6225  GLOBL dataMinF64<>(SB), RODATA|NOPTR, $8
  6226  
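// Min_AVX2_F64 below seeds four YMM accumulators with +MaxFloat64
// (0x7fefffffffffffff), streams 32 doubles per unrolled iteration through
// VMINPD, reduces 4 vectors -> 1 vector -> 1 lane, and finishes with a
// scalar VMINSD tail. A rough scalar model (illustrative only; the real
// loop keeps four independent accumulators):
//
//	acc := math.MaxFloat64
//	i := 0
//	for ; i+16 <= len(x); i += 16 { // vector phase, 16 doubles per step
//		for j := 0; j < 16; j++ {
//			acc = math.Min(acc, x[i+j])
//		}
//	}
//	for ; i < len(x); i++ { // scalar VMINSD tail
//		acc = math.Min(acc, x[i])
//	}
//
// An empty slice returns the seed value. Note that MINPD keeps the second
// source operand on unordered inputs, so NaN handling can differ from
// math.Min.
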
  6227  // func Min_AVX2_F64(x []float64) float64
  6228  // Requires: AVX, SSE2
  6229  TEXT ·Min_AVX2_F64(SB), NOSPLIT, $0-32
  6230  	MOVQ   x_base+0(FP), DI
  6231  	MOVQ   x_len+8(FP), SI
  6232  	TESTQ  SI, SI
  6233  	JE     LBB0_1
  6234  	CMPQ   SI, $0x10
  6235  	JAE    LBB0_4
  6236  	VMOVSD dataMinF64<>+0(SB), X0
  6237  	XORL   AX, AX
  6238  	JMP    LBB0_11
  6239  
  6240  LBB0_1:
  6241  	VMOVSD dataMinF64<>+0(SB), X0
  6242  	MOVSD  X0, ret+24(FP)
  6243  	RET
  6244  
  6245  LBB0_4:
  6246  	MOVQ         SI, AX
  6247  	ANDQ         $-16, AX
  6248  	LEAQ         -16(AX), CX
  6249  	MOVQ         CX, R8
  6250  	SHRQ         $0x04, R8
  6251  	ADDQ         $0x01, R8
  6252  	TESTQ        CX, CX
  6253  	JE           LBB0_5
  6254  	MOVQ         R8, CX
  6255  	ANDQ         $-2, CX
  6256  	VBROADCASTSD dataMinF64<>+0(SB), Y0
  6257  	XORL         DX, DX
  6258  	VMOVAPD      Y0, Y1
  6259  	VMOVAPD      Y0, Y2
  6260  	VMOVAPD      Y0, Y3
  6261  
  6262  LBB0_7:
  6263  	VMINPD (DI)(DX*8), Y0, Y0
  6264  	VMINPD 32(DI)(DX*8), Y1, Y1
  6265  	VMINPD 64(DI)(DX*8), Y2, Y2
  6266  	VMINPD 96(DI)(DX*8), Y3, Y3
  6267  	VMINPD 128(DI)(DX*8), Y0, Y0
  6268  	VMINPD 160(DI)(DX*8), Y1, Y1
  6269  	VMINPD 192(DI)(DX*8), Y2, Y2
  6270  	VMINPD 224(DI)(DX*8), Y3, Y3
  6271  	ADDQ   $0x20, DX
  6272  	ADDQ   $-2, CX
  6273  	JNE    LBB0_7
  6274  	TESTB  $0x01, R8
  6275  	JE     LBB0_10
  6276  
  6277  LBB0_9:
  6278  	VMINPD (DI)(DX*8), Y0, Y0
  6279  	VMINPD 32(DI)(DX*8), Y1, Y1
  6280  	VMINPD 64(DI)(DX*8), Y2, Y2
  6281  	VMINPD 96(DI)(DX*8), Y3, Y3
  6282  
  6283  LBB0_10:
  6284  	VMINPD       Y3, Y0, Y0
  6285  	VMINPD       Y2, Y1, Y1
  6286  	VMINPD       Y0, Y1, Y0
  6287  	VEXTRACTF128 $0x01, Y0, X1
  6288  	VMINPD       X1, X0, X0
  6289  	VPERMILPD    $0x01, X0, X1
  6290  	VMINSD       X1, X0, X0
  6291  	CMPQ         AX, SI
  6292  	JE           LBB0_12
  6293  
  6294  LBB0_11:
  6295  	VMINSD (DI)(AX*8), X0, X0
  6296  	ADDQ   $0x01, AX
  6297  	CMPQ   SI, AX
  6298  	JNE    LBB0_11
  6299  
  6300  LBB0_12:
  6301  	VZEROUPPER
  6302  	MOVSD X0, ret+24(FP)
  6303  	RET
  6304  
  6305  LBB0_5:
  6306  	VBROADCASTSD dataMinF64<>+0(SB), Y0
  6307  	XORL         DX, DX
  6308  	VMOVAPD      Y0, Y1
  6309  	VMOVAPD      Y0, Y2
  6310  	VMOVAPD      Y0, Y3
  6311  	TESTB        $0x01, R8
  6312  	JNE          LBB0_9
  6313  	JMP          LBB0_10
  6314  
  6315  DATA dataMinF32<>+0(SB)/4, $0x7f7fffff
  6316  GLOBL dataMinF32<>(SB), RODATA|NOPTR, $4
  6317  
  6318  // func Min_AVX2_F32(x []float32) float32
  6319  // Requires: AVX, SSE
  6320  TEXT ·Min_AVX2_F32(SB), NOSPLIT, $0-28
  6321  	MOVQ   x_base+0(FP), DI
  6322  	MOVQ   x_len+8(FP), SI
  6323  	TESTQ  SI, SI
  6324  	JE     LBB1_1
  6325  	CMPQ   SI, $0x20
  6326  	JAE    LBB1_4
  6327  	VMOVSS dataMinF32<>+0(SB), X0
  6328  	XORL   AX, AX
  6329  	JMP    LBB1_11
  6330  
  6331  LBB1_1:
  6332  	VMOVSS dataMinF32<>+0(SB), X0
  6333  	MOVSS  X0, ret+24(FP)
  6334  	RET
  6335  
  6336  LBB1_4:
  6337  	MOVQ         SI, AX
  6338  	ANDQ         $-32, AX
  6339  	LEAQ         -32(AX), CX
  6340  	MOVQ         CX, R8
  6341  	SHRQ         $0x05, R8
  6342  	ADDQ         $0x01, R8
  6343  	TESTQ        CX, CX
  6344  	JE           LBB1_5
  6345  	MOVQ         R8, CX
  6346  	ANDQ         $-2, CX
  6347  	VBROADCASTSS dataMinF32<>+0(SB), Y0
  6348  	XORL         DX, DX
  6349  	VMOVAPS      Y0, Y1
  6350  	VMOVAPS      Y0, Y2
  6351  	VMOVAPS      Y0, Y3
  6352  
  6353  LBB1_7:
  6354  	VMINPS (DI)(DX*4), Y0, Y0
  6355  	VMINPS 32(DI)(DX*4), Y1, Y1
  6356  	VMINPS 64(DI)(DX*4), Y2, Y2
  6357  	VMINPS 96(DI)(DX*4), Y3, Y3
  6358  	VMINPS 128(DI)(DX*4), Y0, Y0
  6359  	VMINPS 160(DI)(DX*4), Y1, Y1
  6360  	VMINPS 192(DI)(DX*4), Y2, Y2
  6361  	VMINPS 224(DI)(DX*4), Y3, Y3
  6362  	ADDQ   $0x40, DX
  6363  	ADDQ   $-2, CX
  6364  	JNE    LBB1_7
  6365  	TESTB  $0x01, R8
  6366  	JE     LBB1_10
  6367  
  6368  LBB1_9:
  6369  	VMINPS (DI)(DX*4), Y0, Y0
  6370  	VMINPS 32(DI)(DX*4), Y1, Y1
  6371  	VMINPS 64(DI)(DX*4), Y2, Y2
  6372  	VMINPS 96(DI)(DX*4), Y3, Y3
  6373  
  6374  LBB1_10:
  6375  	VMINPS       Y3, Y0, Y0
  6376  	VMINPS       Y2, Y1, Y1
  6377  	VMINPS       Y0, Y1, Y0
  6378  	VEXTRACTF128 $0x01, Y0, X1
  6379  	VMINPS       X1, X0, X0
  6380  	VPERMILPD    $0x01, X0, X1
  6381  	VMINPS       X1, X0, X0
  6382  	VMOVSHDUP    X0, X1
  6383  	VMINSS       X1, X0, X0
  6384  	CMPQ         AX, SI
  6385  	JE           LBB1_12
  6386  
  6387  LBB1_11:
  6388  	VMINSS (DI)(AX*4), X0, X0
  6389  	ADDQ   $0x01, AX
  6390  	CMPQ   SI, AX
  6391  	JNE    LBB1_11
  6392  
  6393  LBB1_12:
  6394  	VZEROUPPER
  6395  	MOVSS X0, ret+24(FP)
  6396  	RET
  6397  
  6398  LBB1_5:
  6399  	VBROADCASTSS dataMinF32<>+0(SB), Y0
  6400  	XORL         DX, DX
  6401  	VMOVAPS      Y0, Y1
  6402  	VMOVAPS      Y0, Y2
  6403  	VMOVAPS      Y0, Y3
  6404  	TESTB        $0x01, R8
  6405  	JNE          LBB1_9
  6406  	JMP          LBB1_10
  6407  
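// Minimum_AVX2_F64 below computes x[i] = min(x[i], y[i]) in place without a
// min instruction: VCMPPD with predicate $0x01 (LT) builds a lane mask of
// y < x, and VMASKMOVPD stores y back into x only where that mask is set.
// The scalar tail mirrors this with VUCOMISD plus a conditional store; the
// F32 variant further down is the same pattern on 8-lane singles.
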
  6408  // func Minimum_AVX2_F64(x []float64, y []float64)
  6409  // Requires: AVX
  6410  TEXT ·Minimum_AVX2_F64(SB), NOSPLIT, $0-48
  6411  	MOVQ  x_base+0(FP), DI
  6412  	MOVQ  y_base+24(FP), SI
  6413  	MOVQ  x_len+8(FP), DX
  6414  	TESTQ DX, DX
  6415  	JE    LBB2_9
  6416  	CMPQ  DX, $0x10
  6417  	JAE   LBB2_3
  6418  	XORL  AX, AX
  6419  	JMP   LBB2_6
  6420  
  6421  LBB2_3:
  6422  	MOVQ DX, AX
  6423  	ANDQ $-16, AX
  6424  	LEAQ 96(DI), R8
  6425  	XORL CX, CX
  6426  
  6427  LBB2_4:
  6428  	VMOVUPD    (SI)(CX*8), Y0
  6429  	VMOVUPD    32(SI)(CX*8), Y1
  6430  	VMOVUPD    64(SI)(CX*8), Y2
  6431  	VMOVUPD    96(SI)(CX*8), Y3
  6432  	VCMPPD     $0x01, -96(R8)(CX*8), Y0, Y4
  6433  	VCMPPD     $0x01, -64(R8)(CX*8), Y1, Y5
  6434  	VCMPPD     $0x01, -32(R8)(CX*8), Y2, Y6
  6435  	VCMPPD     $0x01, (R8)(CX*8), Y3, Y7
  6436  	VMASKMOVPD Y0, Y4, -96(R8)(CX*8)
  6437  	VMASKMOVPD Y1, Y5, -64(R8)(CX*8)
  6438  	VMASKMOVPD Y2, Y6, -32(R8)(CX*8)
  6439  	VMASKMOVPD Y3, Y7, (R8)(CX*8)
  6440  	ADDQ       $0x10, CX
  6441  	CMPQ       AX, CX
  6442  	JNE        LBB2_4
  6443  	CMPQ       AX, DX
  6444  	JNE        LBB2_6
  6445  
  6446  LBB2_9:
  6447  	VZEROUPPER
  6448  	RET
  6449  
  6450  LBB2_8:
  6451  	ADDQ $0x01, AX
  6452  	CMPQ DX, AX
  6453  	JE   LBB2_9
  6454  
  6455  LBB2_6:
  6456  	VMOVSD   (SI)(AX*8), X0
  6457  	VUCOMISD (DI)(AX*8), X0
  6458  	JAE      LBB2_8
  6459  	VMOVSD   X0, (DI)(AX*8)
  6460  	JMP      LBB2_8
  6461  
  6462  // func Minimum_AVX2_F32(x []float32, y []float32)
  6463  // Requires: AVX
  6464  TEXT ·Minimum_AVX2_F32(SB), NOSPLIT, $0-48
  6465  	MOVQ  x_base+0(FP), DI
  6466  	MOVQ  y_base+24(FP), SI
  6467  	MOVQ  x_len+8(FP), DX
  6468  	TESTQ DX, DX
  6469  	JE    LBB3_9
  6470  	CMPQ  DX, $0x20
  6471  	JAE   LBB3_3
  6472  	XORL  AX, AX
  6473  	JMP   LBB3_6
  6474  
  6475  LBB3_3:
  6476  	MOVQ DX, AX
  6477  	ANDQ $-32, AX
  6478  	LEAQ 96(DI), R8
  6479  	XORL CX, CX
  6480  
  6481  LBB3_4:
  6482  	VMOVUPS    (SI)(CX*4), Y0
  6483  	VMOVUPS    32(SI)(CX*4), Y1
  6484  	VMOVUPS    64(SI)(CX*4), Y2
  6485  	VMOVUPS    96(SI)(CX*4), Y3
  6486  	VCMPPS     $0x01, -96(R8)(CX*4), Y0, Y4
  6487  	VCMPPS     $0x01, -64(R8)(CX*4), Y1, Y5
  6488  	VCMPPS     $0x01, -32(R8)(CX*4), Y2, Y6
  6489  	VCMPPS     $0x01, (R8)(CX*4), Y3, Y7
  6490  	VMASKMOVPS Y0, Y4, -96(R8)(CX*4)
  6491  	VMASKMOVPS Y1, Y5, -64(R8)(CX*4)
  6492  	VMASKMOVPS Y2, Y6, -32(R8)(CX*4)
  6493  	VMASKMOVPS Y3, Y7, (R8)(CX*4)
  6494  	ADDQ       $0x20, CX
  6495  	CMPQ       AX, CX
  6496  	JNE        LBB3_4
  6497  	CMPQ       AX, DX
  6498  	JNE        LBB3_6
  6499  
  6500  LBB3_9:
  6501  	VZEROUPPER
  6502  	RET
  6503  
  6504  LBB3_8:
  6505  	ADDQ $0x01, AX
  6506  	CMPQ DX, AX
  6507  	JE   LBB3_9
  6508  
  6509  LBB3_6:
  6510  	VMOVSS   (SI)(AX*4), X0
  6511  	VUCOMISS (DI)(AX*4), X0
  6512  	JAE      LBB3_8
  6513  	VMOVSS   X0, (DI)(AX*4)
  6514  	JMP      LBB3_8
  6515  
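// MinimumNumber_AVX2_F64 below is the scalar-broadcast variant: `a` is
// splatted into Y1 with VBROADCASTSD, and the masked stores overwrite only
// the elements of x that are larger than a, so unchanged lanes never
// generate a write.
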
  6516  // func MinimumNumber_AVX2_F64(x []float64, a float64)
  6517  // Requires: AVX, AVX2, SSE2
  6518  TEXT ·MinimumNumber_AVX2_F64(SB), NOSPLIT, $0-32
  6519  	MOVQ  x_base+0(FP), DI
  6520  	MOVSD a+24(FP), X0
  6521  	MOVQ  x_len+8(FP), SI
  6522  	TESTQ SI, SI
  6523  	JE    LBB4_9
  6524  	CMPQ  SI, $0x10
  6525  	JAE   LBB4_3
  6526  	XORL  AX, AX
  6527  	JMP   LBB4_6
  6528  
  6529  LBB4_3:
  6530  	MOVQ         SI, AX
  6531  	ANDQ         $-16, AX
  6532  	VBROADCASTSD X0, Y1
  6533  	LEAQ         96(DI), CX
  6534  	XORL         DX, DX
  6535  
  6536  LBB4_4:
  6537  	VCMPPD     $0x01, -96(CX)(DX*8), Y1, Y2
  6538  	VCMPPD     $0x01, -64(CX)(DX*8), Y1, Y3
  6539  	VCMPPD     $0x01, -32(CX)(DX*8), Y1, Y4
  6540  	VCMPPD     $0x01, (CX)(DX*8), Y1, Y5
  6541  	VMASKMOVPD Y1, Y2, -96(CX)(DX*8)
  6542  	VMASKMOVPD Y1, Y3, -64(CX)(DX*8)
  6543  	VMASKMOVPD Y1, Y4, -32(CX)(DX*8)
  6544  	VMASKMOVPD Y1, Y5, (CX)(DX*8)
  6545  	ADDQ       $0x10, DX
  6546  	CMPQ       AX, DX
  6547  	JNE        LBB4_4
  6548  	CMPQ       AX, SI
  6549  	JNE        LBB4_6
  6550  
  6551  LBB4_9:
  6552  	VZEROUPPER
  6553  	RET
  6554  
  6555  LBB4_8:
  6556  	ADDQ $0x01, AX
  6557  	CMPQ SI, AX
  6558  	JE   LBB4_9
  6559  
  6560  LBB4_6:
  6561  	VUCOMISD (DI)(AX*8), X0
  6562  	JAE      LBB4_8
  6563  	VMOVSD   X0, (DI)(AX*8)
  6564  	JMP      LBB4_8
  6565  
  6566  // func MinimumNumber_AVX2_F32(x []float32, a float32)
  6567  // Requires: AVX, AVX2, SSE
  6568  TEXT ·MinimumNumber_AVX2_F32(SB), NOSPLIT, $0-28
  6569  	MOVQ  x_base+0(FP), DI
  6570  	MOVSS a+24(FP), X0
  6571  	MOVQ  x_len+8(FP), SI
  6572  	TESTQ SI, SI
  6573  	JE    LBB5_9
  6574  	CMPQ  SI, $0x20
  6575  	JAE   LBB5_3
  6576  	XORL  AX, AX
  6577  	JMP   LBB5_6
  6578  
  6579  LBB5_3:
  6580  	MOVQ         SI, AX
  6581  	ANDQ         $-32, AX
  6582  	VBROADCASTSS X0, Y1
  6583  	LEAQ         96(DI), CX
  6584  	XORL         DX, DX
  6585  
  6586  LBB5_4:
  6587  	VCMPPS     $0x01, -96(CX)(DX*4), Y1, Y2
  6588  	VCMPPS     $0x01, -64(CX)(DX*4), Y1, Y3
  6589  	VCMPPS     $0x01, -32(CX)(DX*4), Y1, Y4
  6590  	VCMPPS     $0x01, (CX)(DX*4), Y1, Y5
  6591  	VMASKMOVPS Y1, Y2, -96(CX)(DX*4)
  6592  	VMASKMOVPS Y1, Y3, -64(CX)(DX*4)
  6593  	VMASKMOVPS Y1, Y4, -32(CX)(DX*4)
  6594  	VMASKMOVPS Y1, Y5, (CX)(DX*4)
  6595  	ADDQ       $0x20, DX
  6596  	CMPQ       AX, DX
  6597  	JNE        LBB5_4
  6598  	CMPQ       AX, SI
  6599  	JNE        LBB5_6
  6600  
  6601  LBB5_9:
  6602  	VZEROUPPER
  6603  	RET
  6604  
  6605  LBB5_8:
  6606  	ADDQ $0x01, AX
  6607  	CMPQ SI, AX
  6608  	JE   LBB5_9
  6609  
  6610  LBB5_6:
  6611  	VUCOMISS (DI)(AX*4), X0
  6612  	JAE      LBB5_8
  6613  	VMOVSS   X0, (DI)(AX*4)
  6614  	JMP      LBB5_8
  6615  
  6616  DATA dataMaxF64<>+0(SB)/8, $0xffefffffffffffff
  6617  GLOBL dataMaxF64<>(SB), RODATA|NOPTR, $8
  6618  
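// Max_AVX2_F64 below mirrors Min_AVX2_F64 with the seed flipped to
// -MaxFloat64 (0xffefffffffffffff): four VMAXPD accumulators over 32
// doubles per unrolled iteration, the same vector-to-lane reduction, and a
// scalar VMAXSD tail. An empty slice returns the seed value.
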
  6619  // func Max_AVX2_F64(x []float64) float64
  6620  // Requires: AVX, SSE2
  6621  TEXT ·Max_AVX2_F64(SB), NOSPLIT, $0-32
  6622  	MOVQ   x_base+0(FP), DI
  6623  	MOVQ   x_len+8(FP), SI
  6624  	TESTQ  SI, SI
  6625  	JE     empty
  6626  	CMPQ   SI, $0x10
  6627  	JAE    loop
  6628  	VMOVSD dataMaxF64<>+0(SB), X0
  6629  	XORL   AX, AX
  6630  	JMP    collect
  6631  
  6632  empty:
  6633  	VMOVSD dataMaxF64<>+0(SB), X0
  6634  	MOVSD  X0, ret+24(FP)
  6635  	RET
  6636  
  6637  loop:
  6638  	MOVQ         SI, AX
  6639  	ANDQ         $-16, AX
  6640  	LEAQ         -16(AX), CX
  6641  	MOVQ         CX, R8
  6642  	SHRQ         $0x04, R8
  6643  	ADDQ         $0x01, R8
  6644  	TESTQ        CX, CX
  6645  	JE           setmin
  6646  	MOVQ         R8, CX
  6647  	ANDQ         $-2, CX
  6648  	VBROADCASTSD dataMaxF64<>+0(SB), Y0
  6649  	XORL         DX, DX
  6650  	VMOVAPD      Y0, Y1
  6651  	VMOVAPD      Y0, Y2
  6652  	VMOVAPD      Y0, Y3
  6653  
  6654  body:
  6655  	VMAXPD (DI)(DX*8), Y0, Y0
  6656  	VMAXPD 32(DI)(DX*8), Y1, Y1
  6657  	VMAXPD 64(DI)(DX*8), Y2, Y2
  6658  	VMAXPD 96(DI)(DX*8), Y3, Y3
  6659  	VMAXPD 128(DI)(DX*8), Y0, Y0
  6660  	VMAXPD 160(DI)(DX*8), Y1, Y1
  6661  	VMAXPD 192(DI)(DX*8), Y2, Y2
  6662  	VMAXPD 224(DI)(DX*8), Y3, Y3
  6663  	ADDQ   $0x20, DX
  6664  	ADDQ   $-2, CX
  6665  	JNE    body
  6666  	TESTB  $0x01, R8
  6667  	JE     combinevectors
  6668  
  6669  tail:
  6670  	VMAXPD (DI)(DX*8), Y0, Y0
  6671  	VMAXPD 32(DI)(DX*8), Y1, Y1
  6672  	VMAXPD 64(DI)(DX*8), Y2, Y2
  6673  	VMAXPD 96(DI)(DX*8), Y3, Y3
  6674  
  6675  combinevectors:
  6676  	VMAXPD       Y3, Y0, Y0
  6677  	VMAXPD       Y2, Y1, Y1
  6678  	VMAXPD       Y0, Y1, Y0
  6679  	VEXTRACTF128 $0x01, Y0, X1
  6680  	VMAXPD       X1, X0, X0
  6681  	VPERMILPD    $0x01, X0, X1
  6682  	VMAXSD       X1, X0, X0
  6683  	CMPQ         AX, SI
  6684  	JE           return
  6685  
  6686  collect:
  6687  	VMAXSD (DI)(AX*8), X0, X0
  6688  	ADDQ   $0x01, AX
  6689  	CMPQ   SI, AX
  6690  	JNE    collect
  6691  
  6692  return:
  6693  	VZEROUPPER
  6694  	MOVSD X0, ret+24(FP)
  6695  	RET
  6696  
  6697  setmin:
  6698  	VBROADCASTSD dataMaxF64<>+0(SB), Y0
  6699  	XORL         DX, DX
  6700  	VMOVAPD      Y0, Y1
  6701  	VMOVAPD      Y0, Y2
  6702  	VMOVAPD      Y0, Y3
  6703  	TESTB        $0x01, R8
  6704  	JNE          tail
  6705  	JMP          combinevectors
  6706  
  6707  DATA dataMaxF32<>+0(SB)/4, $0xff7fffff
  6708  GLOBL dataMaxF32<>(SB), RODATA|NOPTR, $4
  6709  
  6710  // func Max_AVX2_F32(x []float32) float32
  6711  // Requires: AVX, SSE
  6712  TEXT ·Max_AVX2_F32(SB), NOSPLIT, $0-28
  6713  	MOVQ   x_base+0(FP), DI
  6714  	MOVQ   x_len+8(FP), SI
  6715  	TESTQ  SI, SI
  6716  	JE     empty
  6717  	CMPQ   SI, $0x20
  6718  	JAE    loop
  6719  	VMOVSS dataMaxF32<>+0(SB), X0
  6720  	XORL   AX, AX
  6721  	JMP    collect
  6722  
  6723  empty:
  6724  	VMOVSS dataMaxF32<>+0(SB), X0
  6725  	MOVSS  X0, ret+24(FP)
  6726  	RET
  6727  
  6728  loop:
  6729  	MOVQ         SI, AX
  6730  	ANDQ         $-32, AX
  6731  	LEAQ         -32(AX), CX
  6732  	MOVQ         CX, R8
  6733  	SHRQ         $0x05, R8
  6734  	ADDQ         $0x01, R8
  6735  	TESTQ        CX, CX
  6736  	JE           setmin
  6737  	MOVQ         R8, CX
  6738  	ANDQ         $-2, CX
  6739  	VBROADCASTSS dataMaxF32<>+0(SB), Y0
  6740  	XORL         DX, DX
  6741  	VMOVAPS      Y0, Y1
  6742  	VMOVAPS      Y0, Y2
  6743  	VMOVAPS      Y0, Y3
  6744  
  6745  body:
  6746  	VMAXPS (DI)(DX*4), Y0, Y0
  6747  	VMAXPS 32(DI)(DX*4), Y1, Y1
  6748  	VMAXPS 64(DI)(DX*4), Y2, Y2
  6749  	VMAXPS 96(DI)(DX*4), Y3, Y3
  6750  	VMAXPS 128(DI)(DX*4), Y0, Y0
  6751  	VMAXPS 160(DI)(DX*4), Y1, Y1
  6752  	VMAXPS 192(DI)(DX*4), Y2, Y2
  6753  	VMAXPS 224(DI)(DX*4), Y3, Y3
  6754  	ADDQ   $0x40, DX
  6755  	ADDQ   $-2, CX
  6756  	JNE    body
  6757  	TESTB  $0x01, R8
  6758  	JE     combinevectors
  6759  
  6760  tail:
  6761  	VMAXPS (DI)(DX*4), Y0, Y0
  6762  	VMAXPS 32(DI)(DX*4), Y1, Y1
  6763  	VMAXPS 64(DI)(DX*4), Y2, Y2
  6764  	VMAXPS 96(DI)(DX*4), Y3, Y3
  6765  
  6766  combinevectors:
  6767  	VMAXPS       Y3, Y0, Y0
  6768  	VMAXPS       Y2, Y1, Y1
  6769  	VMAXPS       Y0, Y1, Y0
  6770  	VEXTRACTF128 $0x01, Y0, X1
  6771  	VMAXPS       X1, X0, X0
  6772  	VPERMILPD    $0x01, X0, X1
  6773  	VMAXPS       X1, X0, X0
  6774  	VMOVSHDUP    X0, X1
  6775  	VMAXSS       X1, X0, X0
  6776  	CMPQ         AX, SI
  6777  	JE           return
  6778  
  6779  collect:
  6780  	VMAXSS (DI)(AX*4), X0, X0
  6781  	ADDQ   $0x01, AX
  6782  	CMPQ   SI, AX
  6783  	JNE    collect
  6784  
  6785  return:
  6786  	VZEROUPPER
  6787  	MOVSS X0, ret+24(FP)
  6788  	RET
  6789  
  6790  setmin:
  6791  	VBROADCASTSS dataMaxF32<>+0(SB), Y0
  6792  	XORL         DX, DX
  6793  	VMOVAPS      Y0, Y1
  6794  	VMOVAPS      Y0, Y2
  6795  	VMOVAPS      Y0, Y3
  6796  	TESTB        $0x01, R8
  6797  	JNE          tail
  6798  	JMP          combinevectors
  6799  
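// Maximum_AVX2_F64 below is the elementwise counterpart of Minimum_AVX2_F64
// with the comparison flipped: x lanes are loaded, VCMPPD $0x01 tests
// x < y, and VMASKMOVPD writes y back only where it exceeds x. The tail
// stores y when VUCOMISD reports y > x (JBE skips the store otherwise).
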
  6800  // func Maximum_AVX2_F64(x []float64, y []float64)
  6801  // Requires: AVX
  6802  TEXT ·Maximum_AVX2_F64(SB), NOSPLIT, $0-48
  6803  	MOVQ  x_base+0(FP), DI
  6804  	MOVQ  y_base+24(FP), SI
  6805  	MOVQ  x_len+8(FP), DX
  6806  	TESTQ DX, DX
  6807  	JE    return
  6808  	CMPQ  DX, $0x10
  6809  	JAE   loop
  6810  	XORL  AX, AX
  6811  	JMP   tailbody
  6812  
  6813  loop:
  6814  	MOVQ DX, AX
  6815  	ANDQ $-16, AX
  6816  	LEAQ 96(DI), R8
  6817  	XORL CX, CX
  6818  
  6819  body:
  6820  	VMOVUPD    (SI)(CX*8), Y0
  6821  	VMOVUPD    32(SI)(CX*8), Y1
  6822  	VMOVUPD    64(SI)(CX*8), Y2
  6823  	VMOVUPD    96(SI)(CX*8), Y3
  6824  	VMOVUPD    -96(R8)(CX*8), Y4
  6825  	VMOVUPD    -64(R8)(CX*8), Y5
  6826  	VMOVUPD    -32(R8)(CX*8), Y6
  6827  	VMOVUPD    (R8)(CX*8), Y7
  6828  	VCMPPD     $0x01, Y0, Y4, Y4
  6829  	VMASKMOVPD Y0, Y4, -96(R8)(CX*8)
  6830  	VCMPPD     $0x01, Y1, Y5, Y0
  6831  	VMASKMOVPD Y1, Y0, -64(R8)(CX*8)
  6832  	VCMPPD     $0x01, Y2, Y6, Y0
  6833  	VMASKMOVPD Y2, Y0, -32(R8)(CX*8)
  6834  	VCMPPD     $0x01, Y3, Y7, Y0
  6835  	VMASKMOVPD Y3, Y0, (R8)(CX*8)
  6836  	ADDQ       $0x10, CX
  6837  	CMPQ       CX, AX
  6838  	JNE        body
  6839  	CMPQ       DX, AX
  6840  	JNE        tailbody
  6841  
  6842  return:
  6843  	VZEROUPPER
  6844  	RET
  6845  
  6846  tail:
  6847  	ADDQ $0x01, AX
  6848  	CMPQ AX, DX
  6849  	JE   return
  6850  
  6851  tailbody:
  6852  	VMOVSD   (SI)(AX*8), X0
  6853  	VUCOMISD (DI)(AX*8), X0
  6854  	JBE      tail
  6855  	VMOVSD   X0, (DI)(AX*8)
  6856  	JMP      tail
  6857  
  6858  // func Maximum_AVX2_F32(x []float32, y []float32)
  6859  // Requires: AVX
  6860  TEXT ·Maximum_AVX2_F32(SB), NOSPLIT, $0-48
  6861  	MOVQ  x_base+0(FP), DI
  6862  	MOVQ  y_base+24(FP), SI
  6863  	MOVQ  x_len+8(FP), DX
  6864  	TESTQ DX, DX
  6865  	JE    return
  6866  	CMPQ  DX, $0x20
  6867  	JAE   loop
  6868  	XORL  AX, AX
  6869  	JMP   tailbody
  6870  
  6871  loop:
  6872  	MOVQ DX, AX
  6873  	ANDQ $-32, AX
  6874  	LEAQ 96(DI), R8
  6875  	XORL CX, CX
  6876  
  6877  body:
  6878  	VMOVUPS    (SI)(CX*4), Y0
  6879  	VMOVUPS    32(SI)(CX*4), Y1
  6880  	VMOVUPS    64(SI)(CX*4), Y2
  6881  	VMOVUPS    96(SI)(CX*4), Y3
  6882  	VMOVUPS    -96(R8)(CX*4), Y4
  6883  	VMOVUPS    -64(R8)(CX*4), Y5
  6884  	VMOVUPS    -32(R8)(CX*4), Y6
  6885  	VMOVUPS    (R8)(CX*4), Y7
  6886  	VCMPPS     $0x01, Y0, Y4, Y4
  6887  	VMASKMOVPS Y0, Y4, -96(R8)(CX*4)
  6888  	VCMPPS     $0x01, Y1, Y5, Y0
  6889  	VMASKMOVPS Y1, Y0, -64(R8)(CX*4)
  6890  	VCMPPS     $0x01, Y2, Y6, Y0
  6891  	VMASKMOVPS Y2, Y0, -32(R8)(CX*4)
  6892  	VCMPPS     $0x01, Y3, Y7, Y0
  6893  	VMASKMOVPS Y3, Y0, (R8)(CX*4)
  6894  	ADDQ       $0x20, CX
  6895  	CMPQ       CX, AX
  6896  	JNE        body
  6897  	CMPQ       DX, AX
  6898  	JNE        tailbody
  6899  
  6900  return:
  6901  	VZEROUPPER
  6902  	RET
  6903  
  6904  tail:
  6905  	ADDQ $0x01, AX
  6906  	CMPQ AX, DX
  6907  	JE   return
  6908  
  6909  tailbody:
  6910  	VMOVSS   (SI)(AX*4), X0
  6911  	VUCOMISS (DI)(AX*4), X0
  6912  	JBE      tail
  6913  	VMOVSS   X0, (DI)(AX*4)
  6914  	JMP      tail
  6915  
  6916  // func MaximumNumber_AVX2_F64(x []float64, a float64)
  6917  // Requires: AVX, AVX2, SSE2
  6918  TEXT ·MaximumNumber_AVX2_F64(SB), NOSPLIT, $0-32
  6919  	MOVQ  x_base+0(FP), DI
  6920  	MOVSD a+24(FP), X0
  6921  	MOVQ  x_len+8(FP), SI
  6922  	TESTQ SI, SI
  6923  	JE    return
  6924  	CMPQ  SI, $0x10
  6925  	JAE   loop
  6926  	XORL  AX, AX
  6927  	JMP   tailbody
  6928  
  6929  loop:
  6930  	MOVQ         SI, AX
  6931  	ANDQ         $-16, AX
  6932  	VBROADCASTSD X0, Y1
  6933  	LEAQ         96(DI), CX
  6934  	XORL         DX, DX
  6935  
  6936  body:
  6937  	VMOVUPD    -96(CX)(DX*8), Y2
  6938  	VMOVUPD    -64(CX)(DX*8), Y3
  6939  	VMOVUPD    -32(CX)(DX*8), Y4
  6940  	VMOVUPD    (CX)(DX*8), Y5
  6941  	VCMPPD     $0x01, Y1, Y2, Y2
  6942  	VMASKMOVPD Y1, Y2, -96(CX)(DX*8)
  6943  	VCMPPD     $0x01, Y1, Y3, Y2
  6944  	VMASKMOVPD Y1, Y2, -64(CX)(DX*8)
  6945  	VCMPPD     $0x01, Y1, Y4, Y2
  6946  	VMASKMOVPD Y1, Y2, -32(CX)(DX*8)
  6947  	VCMPPD     $0x01, Y1, Y5, Y2
  6948  	VMASKMOVPD Y1, Y2, (CX)(DX*8)
  6949  	ADDQ       $0x10, DX
  6950  	CMPQ       AX, DX
  6951  	JNE        body
  6952  	CMPQ       AX, SI
  6953  	JNE        tailbody
  6954  
  6955  return:
  6956  	VZEROUPPER
  6957  	RET
  6958  
  6959  tail:
  6960  	ADDQ $0x01, AX
  6961  	CMPQ SI, AX
  6962  	JE   return
  6963  
  6964  tailbody:
  6965  	VUCOMISD (DI)(AX*8), X0
  6966  	JBE      tail
  6967  	VMOVSD   X0, (DI)(AX*8)
  6968  	JMP      tail
  6969  
  6970  // func MaximumNumber_AVX2_F32(x []float32, a float32)
  6971  // Requires: AVX, AVX2, SSE
  6972  TEXT ·MaximumNumber_AVX2_F32(SB), NOSPLIT, $0-28
  6973  	MOVQ  x_base+0(FP), DI
  6974  	MOVSS a+24(FP), X0
  6975  	MOVQ  x_len+8(FP), SI
  6976  	TESTQ SI, SI
  6977  	JE    return
  6978  	CMPQ  SI, $0x20
  6979  	JAE   loop
  6980  	XORL  AX, AX
  6981  	JMP   tailbody
  6982  
  6983  loop:
  6984  	MOVQ         SI, AX
  6985  	ANDQ         $-32, AX
  6986  	VBROADCASTSS X0, Y1
  6987  	LEAQ         96(DI), CX
  6988  	XORL         DX, DX
  6989  
  6990  body:
  6991  	VMOVUPS    -96(CX)(DX*4), Y2
  6992  	VMOVUPS    -64(CX)(DX*4), Y3
  6993  	VMOVUPS    -32(CX)(DX*4), Y4
  6994  	VMOVUPS    (CX)(DX*4), Y5
  6995  	VCMPPS     $0x01, Y1, Y2, Y2
  6996  	VMASKMOVPS Y1, Y2, -96(CX)(DX*4)
  6997  	VCMPPS     $0x01, Y1, Y3, Y2
  6998  	VMASKMOVPS Y1, Y2, -64(CX)(DX*4)
  6999  	VCMPPS     $0x01, Y1, Y4, Y2
  7000  	VMASKMOVPS Y1, Y2, -32(CX)(DX*4)
  7001  	VCMPPS     $0x01, Y1, Y5, Y2
  7002  	VMASKMOVPS Y1, Y2, (CX)(DX*4)
  7003  	ADDQ       $0x20, DX
  7004  	CMPQ       AX, DX
  7005  	JNE        body
  7006  	CMPQ       AX, SI
  7007  	JNE        tailbody
  7008  
  7009  return:
  7010  	VZEROUPPER
  7011  	RET
  7012  
  7013  tail:
  7014  	ADDQ $0x01, AX
  7015  	CMPQ SI, AX
  7016  	JE   return
  7017  
  7018  tailbody:
  7019  	VUCOMISS (DI)(AX*4), X0
  7020  	JBE      tail
  7021  	VMOVSS   X0, (DI)(AX*4)
  7022  	JMP      tail
  7023  
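// Find_AVX2_F64 below returns the index of the first element equal to `a`,
// or len(x) when there is no match. The vector loop compares with VPCMPEQQ,
// i.e. bitwise equality on the float representation, and the mask block
// turns the two compare results into a lane index. A rough model of that
// step (moveMask is illustrative, not part of this file):
//
//	lo := moveMask(eq0)      // 4 bits from the first VPCMPEQQ
//	hi := moveMask(eq1) << 4 // 4 bits from the second
//	idx := base + bits.TrailingZeros32(uint32(lo|hi))
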
  7024  // func Find_AVX2_F64(x []float64, a float64) int
  7025  // Requires: AVX, AVX2, SSE2
  7026  TEXT ·Find_AVX2_F64(SB), NOSPLIT, $0-40
  7027  	MOVQ         x_base+0(FP), DI
  7028  	MOVSD        a+24(FP), X0
  7029  	MOVQ         x_len+8(FP), SI
  7030  	MOVQ         SI, CX
  7031  	ANDQ         $-8, CX
  7032  	JE           tail
  7033  	VPBROADCASTQ X0, Y1
  7034  	XORL         AX, AX
  7035  
  7036  loop:
  7037  	VPCMPEQQ (DI)(AX*8), Y1, Y2
  7038  	VPCMPEQQ 32(DI)(AX*8), Y1, Y3
  7039  	VPOR     Y2, Y3, Y4
  7040  	VPTEST   Y4, Y4
  7041  	JNE      mask
  7042  	ADDQ     $0x08, AX
  7043  	CMPQ     AX, CX
  7044  	JB       loop
  7045  	CMPQ     AX, SI
  7046  	JB       tailbody
  7047  
  7048  return:
  7049  	VZEROUPPER
  7050  	MOVQ AX, ret+32(FP)
  7051  	RET
  7052  
  7053  tail:
  7054  	XORL AX, AX
  7055  	CMPQ AX, SI
  7056  	JAE  return
  7057  
  7058  tailbody:
  7059  	VUCOMISD (DI)(AX*8), X0
  7060  	JE       return
  7061  	ADDQ     $0x01, AX
  7062  	CMPQ     SI, AX
  7063  	JNE      tailbody
  7064  	MOVQ     SI, AX
  7065  	VZEROUPPER
  7066  	MOVQ     AX, ret+32(FP)
  7067  	RET
  7068  
  7069  mask:
  7070  	VMOVMSKPD Y3, CX
  7071  	SHLL      $0x04, CX
  7072  	VMOVMSKPD Y2, DX
  7073  	ORL       CX, DX
  7074  	BSFL      DX, CX
  7075  	ADDQ      CX, AX
  7076  	VZEROUPPER
  7077  	MOVQ      AX, ret+32(FP)
  7078  	RET
  7079  
  7080  // func Find_AVX2_F32(x []float32, a float32) int
  7081  // Requires: AVX, AVX2, SSE
  7082  TEXT ·Find_AVX2_F32(SB), NOSPLIT, $0-40
  7083  	MOVQ         x_base+0(FP), DI
  7084  	MOVSS        a+24(FP), X0
  7085  	MOVQ         x_len+8(FP), SI
  7086  	MOVQ         SI, CX
  7087  	ANDQ         $-16, CX
  7088  	JE           tail
  7089  	VPBROADCASTD X0, Y1
  7090  	XORL         AX, AX
  7091  
  7092  loop:
  7093  	VPCMPEQD (DI)(AX*4), Y1, Y2
  7094  	VPCMPEQD 32(DI)(AX*4), Y1, Y3
  7095  	VPOR     Y2, Y3, Y4
  7096  	VPTEST   Y4, Y4
  7097  	JNE      mask
  7098  	ADDQ     $0x10, AX
  7099  	CMPQ     AX, CX
  7100  	JB       loop
  7101  	CMPQ     AX, SI
  7102  	JB       tailbody
  7103  
  7104  return:
  7105  	VZEROUPPER
  7106  	MOVQ AX, ret+32(FP)
  7107  	RET
  7108  
  7109  tail:
  7110  	XORL AX, AX
  7111  	CMPQ AX, SI
  7112  	JAE  return
  7113  
  7114  tailbody:
  7115  	VUCOMISS (DI)(AX*4), X0
  7116  	JE       return
  7117  	ADDQ     $0x01, AX
  7118  	CMPQ     SI, AX
  7119  	JNE      tailbody
  7120  	MOVQ     SI, AX
  7121  	VZEROUPPER
  7122  	MOVQ     AX, ret+32(FP)
  7123  	RET
  7124  
  7125  mask:
  7126  	VMOVMSKPS Y3, CX
  7127  	SHLL      $0x08, CX
  7128  	VMOVMSKPS Y2, DX
  7129  	ORL       CX, DX
  7130  	BSFL      DX, CX
  7131  	ADDQ      CX, AX
  7132  	VZEROUPPER
  7133  	MOVQ      AX, ret+32(FP)
  7134  	RET
  7135  
  7136  DATA dataLtF64<>+0(SB)/1, $0x01
  7137  DATA dataLtF64<>+1(SB)/1, $0x01
  7138  DATA dataLtF64<>+2(SB)/1, $0x01
  7139  DATA dataLtF64<>+3(SB)/1, $0x01
  7140  DATA dataLtF64<>+4(SB)/1, $0x00
  7141  DATA dataLtF64<>+5(SB)/1, $0x00
  7142  DATA dataLtF64<>+6(SB)/1, $0x00
  7143  DATA dataLtF64<>+7(SB)/1, $0x00
  7144  DATA dataLtF64<>+8(SB)/1, $0x00
  7145  DATA dataLtF64<>+9(SB)/1, $0x00
  7146  DATA dataLtF64<>+10(SB)/1, $0x00
  7147  DATA dataLtF64<>+11(SB)/1, $0x00
  7148  DATA dataLtF64<>+12(SB)/1, $0x00
  7149  DATA dataLtF64<>+13(SB)/1, $0x00
  7150  DATA dataLtF64<>+14(SB)/1, $0x00
  7151  DATA dataLtF64<>+15(SB)/1, $0x00
  7152  GLOBL dataLtF64<>(SB), RODATA|NOPTR, $16
  7153  
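// Lt_AVX2_F64 below materializes y[i] < z[i] as bytes of a []bool. Each
// VCMPPD yields four 64-bit all-ones/all-zeros lanes; the VPACKSSDW /
// VPACKSSWB chains narrow them to one byte per element, VPAND against the
// 0x01 table above turns 0xff into the canonical bool value 1, and the
// shuffles assemble 16 result bytes per store. One subtlety: the vector
// predicate $0x01 is an ordered compare (false for NaN), while the scalar
// tail's SETCS after VUCOMISD is also set on unordered inputs, so the two
// paths can disagree on NaN.
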
  7154  // func Lt_AVX2_F64(x []bool, y []float64, z []float64)
  7155  // Requires: AVX, AVX2
  7156  TEXT ·Lt_AVX2_F64(SB), NOSPLIT, $0-72
  7157  	MOVQ  x_base+0(FP), DI
  7158  	MOVQ  y_base+24(FP), SI
  7159  	MOVQ  z_base+48(FP), DX
  7160  	MOVQ  x_len+8(FP), CX
  7161  	TESTQ CX, CX
  7162  	JE    LBB0_7
  7163  	CMPQ  CX, $0x10
  7164  	JAE   LBB0_3
  7165  	XORL  R8, R8
  7166  	JMP   LBB0_6
  7167  
  7168  LBB0_3:
  7169  	MOVQ    CX, R8
  7170  	ANDQ    $-16, R8
  7171  	XORL    AX, AX
  7172  	VMOVDQU dataLtF64<>+0(SB), X0
  7173  
  7174  LBB0_4:
  7175  	VMOVUPD      (SI)(AX*8), Y1
  7176  	VMOVUPD      32(SI)(AX*8), Y2
  7177  	VMOVUPD      64(SI)(AX*8), Y3
  7178  	VMOVUPD      96(SI)(AX*8), Y4
  7179  	VCMPPD       $0x01, (DX)(AX*8), Y1, Y1
  7180  	VEXTRACTF128 $0x01, Y1, X5
  7181  	VPACKSSDW    X5, X1, X1
  7182  	VPACKSSDW    X1, X1, X1
  7183  	VPACKSSWB    X1, X1, X1
  7184  	VCMPPD       $0x01, 32(DX)(AX*8), Y2, Y2
  7185  	VPAND        X0, X1, X1
  7186  	VEXTRACTF128 $0x01, Y2, X5
  7187  	VPACKSSDW    X5, X2, X2
  7188  	VPACKSSDW    X2, X2, X2
  7189  	VPACKSSWB    X2, X2, X2
  7190  	VPAND        X0, X2, X2
  7191  	VCMPPD       $0x01, 64(DX)(AX*8), Y3, Y3
  7192  	VPUNPCKLDQ   X2, X1, X1
  7193  	VEXTRACTF128 $0x01, Y3, X2
  7194  	VPACKSSDW    X2, X3, X2
  7195  	VPACKSSDW    X2, X2, X2
  7196  	VPACKSSWB    X2, X2, X2
  7197  	VPAND        X0, X2, X2
  7198  	VCMPPD       $0x01, 96(DX)(AX*8), Y4, Y3
  7199  	VEXTRACTF128 $0x01, Y3, X4
  7200  	VPACKSSDW    X4, X3, X3
  7201  	VPACKSSDW    X3, X3, X3
  7202  	VPACKSSWB    X3, X3, X3
  7203  	VPAND        X0, X3, X3
  7204  	VPBROADCASTD X3, X3
  7205  	VPBROADCASTD X2, X2
  7206  	VPUNPCKLDQ   X3, X2, X2
  7207  	VPBLENDD     $0x0c, X2, X1, X1
  7208  	VMOVDQU      X1, (DI)(AX*1)
  7209  	ADDQ         $0x10, AX
  7210  	CMPQ         R8, AX
  7211  	JNE          LBB0_4
  7212  	CMPQ         R8, CX
  7213  	JE           LBB0_7
  7214  
  7215  LBB0_6:
  7216  	VMOVSD   (SI)(R8*8), X0
  7217  	VUCOMISD (DX)(R8*8), X0
  7218  	SETCS    (DI)(R8*1)
  7219  	ADDQ     $0x01, R8
  7220  	CMPQ     CX, R8
  7221  	JNE      LBB0_6
  7222  
  7223  LBB0_7:
  7224  	VZEROUPPER
  7225  	RET
  7226  
  7227  DATA dataLtF32<>+0(SB)/1, $0x01
  7228  DATA dataLtF32<>+1(SB)/1, $0x01
  7229  DATA dataLtF32<>+2(SB)/1, $0x01
  7230  DATA dataLtF32<>+3(SB)/1, $0x01
  7231  DATA dataLtF32<>+4(SB)/1, $0x01
  7232  DATA dataLtF32<>+5(SB)/1, $0x01
  7233  DATA dataLtF32<>+6(SB)/1, $0x01
  7234  DATA dataLtF32<>+7(SB)/1, $0x01
  7235  DATA dataLtF32<>+8(SB)/1, $0x00
  7236  DATA dataLtF32<>+9(SB)/1, $0x00
  7237  DATA dataLtF32<>+10(SB)/1, $0x00
  7238  DATA dataLtF32<>+11(SB)/1, $0x00
  7239  DATA dataLtF32<>+12(SB)/1, $0x00
  7240  DATA dataLtF32<>+13(SB)/1, $0x00
  7241  DATA dataLtF32<>+14(SB)/1, $0x00
  7242  DATA dataLtF32<>+15(SB)/1, $0x00
  7243  GLOBL dataLtF32<>(SB), RODATA|NOPTR, $16
  7244  
  7245  // func Lt_AVX2_F32(x []bool, y []float32, z []float32)
  7246  // Requires: AVX, AVX2
  7247  TEXT ·Lt_AVX2_F32(SB), NOSPLIT, $0-72
  7248  	MOVQ  x_base+0(FP), DI
  7249  	MOVQ  y_base+24(FP), SI
  7250  	MOVQ  z_base+48(FP), DX
  7251  	MOVQ  x_len+8(FP), CX
  7252  	TESTQ CX, CX
  7253  	JE    LBB1_7
  7254  	CMPQ  CX, $0x20
  7255  	JAE   LBB1_3
  7256  	XORL  R8, R8
  7257  	JMP   LBB1_6
  7258  
  7259  LBB1_3:
  7260  	MOVQ    CX, R8
  7261  	ANDQ    $-32, R8
  7262  	XORL    AX, AX
  7263  	VMOVDQU dataLtF32<>+0(SB), X0
  7264  
  7265  LBB1_4:
  7266  	VMOVUPS      (SI)(AX*4), Y1
  7267  	VMOVUPS      32(SI)(AX*4), Y2
  7268  	VMOVUPS      64(SI)(AX*4), Y3
  7269  	VMOVUPS      96(SI)(AX*4), Y4
  7270  	VCMPPS       $0x01, (DX)(AX*4), Y1, Y1
  7271  	VEXTRACTF128 $0x01, Y1, X5
  7272  	VPACKSSDW    X5, X1, X1
  7273  	VPACKSSWB    X1, X1, X1
  7274  	VCMPPS       $0x01, 32(DX)(AX*4), Y2, Y2
  7275  	VPAND        X0, X1, X1
  7276  	VEXTRACTF128 $0x01, Y2, X5
  7277  	VPACKSSDW    X5, X2, X2
  7278  	VPACKSSWB    X2, X2, X2
  7279  	VPAND        X0, X2, X2
  7280  	VCMPPS       $0x01, 64(DX)(AX*4), Y3, Y3
  7281  	VEXTRACTF128 $0x01, Y3, X5
  7282  	VPACKSSDW    X5, X3, X3
  7283  	VPACKSSWB    X3, X3, X3
  7284  	VCMPPS       $0x01, 96(DX)(AX*4), Y4, Y4
  7285  	VPAND        X0, X3, X3
  7286  	VEXTRACTF128 $0x01, Y4, X5
  7287  	VPACKSSDW    X5, X4, X4
  7288  	VPACKSSWB    X4, X4, X4
  7289  	VPAND        X0, X4, X4
  7290  	VINSERTI128  $0x01, X4, Y3, Y3
  7291  	VINSERTI128  $0x01, X2, Y1, Y1
  7292  	VPUNPCKLQDQ  Y3, Y1, Y1
  7293  	VPERMQ       $0xd8, Y1, Y1
  7294  	VMOVDQU      Y1, (DI)(AX*1)
  7295  	ADDQ         $0x20, AX
  7296  	CMPQ         R8, AX
  7297  	JNE          LBB1_4
  7298  	CMPQ         R8, CX
  7299  	JE           LBB1_7
  7300  
  7301  LBB1_6:
  7302  	VMOVSS   (SI)(R8*4), X0
  7303  	VUCOMISS (DX)(R8*4), X0
  7304  	SETCS    (DI)(R8*1)
  7305  	ADDQ     $0x01, R8
  7306  	CMPQ     CX, R8
  7307  	JNE      LBB1_6
  7308  
  7309  LBB1_7:
  7310  	VZEROUPPER
  7311  	RET
  7312  
  7313  DATA dataLteF64<>+0(SB)/1, $0x01
  7314  DATA dataLteF64<>+1(SB)/1, $0x01
  7315  DATA dataLteF64<>+2(SB)/1, $0x01
  7316  DATA dataLteF64<>+3(SB)/1, $0x01
  7317  DATA dataLteF64<>+4(SB)/1, $0x00
  7318  DATA dataLteF64<>+5(SB)/1, $0x00
  7319  DATA dataLteF64<>+6(SB)/1, $0x00
  7320  DATA dataLteF64<>+7(SB)/1, $0x00
  7321  DATA dataLteF64<>+8(SB)/1, $0x00
  7322  DATA dataLteF64<>+9(SB)/1, $0x00
  7323  DATA dataLteF64<>+10(SB)/1, $0x00
  7324  DATA dataLteF64<>+11(SB)/1, $0x00
  7325  DATA dataLteF64<>+12(SB)/1, $0x00
  7326  DATA dataLteF64<>+13(SB)/1, $0x00
  7327  DATA dataLteF64<>+14(SB)/1, $0x00
  7328  DATA dataLteF64<>+15(SB)/1, $0x00
  7329  GLOBL dataLteF64<>(SB), RODATA|NOPTR, $16
  7330  
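// Lte_AVX2_F64 below is the same pack-to-bool pipeline as Lt_AVX2_F64 with
// the predicate changed to $0x02 (LE) and the scalar tail using SETLS
// (CF or ZF set).
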
  7331  // func Lte_AVX2_F64(x []bool, y []float64, z []float64)
  7332  // Requires: AVX, AVX2
  7333  TEXT ·Lte_AVX2_F64(SB), NOSPLIT, $0-72
  7334  	MOVQ  x_base+0(FP), DI
  7335  	MOVQ  y_base+24(FP), SI
  7336  	MOVQ  z_base+48(FP), DX
  7337  	MOVQ  x_len+8(FP), CX
  7338  	TESTQ CX, CX
  7339  	JE    LBB2_7
  7340  	CMPQ  CX, $0x10
  7341  	JAE   LBB2_3
  7342  	XORL  R8, R8
  7343  	JMP   LBB2_6
  7344  
  7345  LBB2_3:
  7346  	MOVQ    CX, R8
  7347  	ANDQ    $-16, R8
  7348  	XORL    AX, AX
  7349  	VMOVDQU dataLteF64<>+0(SB), X0
  7350  
  7351  LBB2_4:
  7352  	VMOVUPD      (SI)(AX*8), Y1
  7353  	VMOVUPD      32(SI)(AX*8), Y2
  7354  	VMOVUPD      64(SI)(AX*8), Y3
  7355  	VMOVUPD      96(SI)(AX*8), Y4
  7356  	VCMPPD       $0x02, (DX)(AX*8), Y1, Y1
  7357  	VEXTRACTF128 $0x01, Y1, X5
  7358  	VPACKSSDW    X5, X1, X1
  7359  	VPACKSSDW    X1, X1, X1
  7360  	VPACKSSWB    X1, X1, X1
  7361  	VCMPPD       $0x02, 32(DX)(AX*8), Y2, Y2
  7362  	VPAND        X0, X1, X1
  7363  	VEXTRACTF128 $0x01, Y2, X5
  7364  	VPACKSSDW    X5, X2, X2
  7365  	VPACKSSDW    X2, X2, X2
  7366  	VPACKSSWB    X2, X2, X2
  7367  	VPAND        X0, X2, X2
  7368  	VCMPPD       $0x02, 64(DX)(AX*8), Y3, Y3
  7369  	VPUNPCKLDQ   X2, X1, X1
  7370  	VEXTRACTF128 $0x01, Y3, X2
  7371  	VPACKSSDW    X2, X3, X2
  7372  	VPACKSSDW    X2, X2, X2
  7373  	VPACKSSWB    X2, X2, X2
  7374  	VPAND        X0, X2, X2
  7375  	VCMPPD       $0x02, 96(DX)(AX*8), Y4, Y3
  7376  	VEXTRACTF128 $0x01, Y3, X4
  7377  	VPACKSSDW    X4, X3, X3
  7378  	VPACKSSDW    X3, X3, X3
  7379  	VPACKSSWB    X3, X3, X3
  7380  	VPAND        X0, X3, X3
  7381  	VPBROADCASTD X3, X3
  7382  	VPBROADCASTD X2, X2
  7383  	VPUNPCKLDQ   X3, X2, X2
  7384  	VPBLENDD     $0x0c, X2, X1, X1
  7385  	VMOVDQU      X1, (DI)(AX*1)
  7386  	ADDQ         $0x10, AX
  7387  	CMPQ         R8, AX
  7388  	JNE          LBB2_4
  7389  	CMPQ         R8, CX
  7390  	JE           LBB2_7
  7391  
  7392  LBB2_6:
  7393  	VMOVSD   (SI)(R8*8), X0
  7394  	VUCOMISD (DX)(R8*8), X0
  7395  	SETLS    (DI)(R8*1)
  7396  	ADDQ     $0x01, R8
  7397  	CMPQ     CX, R8
  7398  	JNE      LBB2_6
  7399  
  7400  LBB2_7:
  7401  	VZEROUPPER
  7402  	RET
  7403  
  7404  DATA dataLteF32<>+0(SB)/1, $0x01
  7405  DATA dataLteF32<>+1(SB)/1, $0x01
  7406  DATA dataLteF32<>+2(SB)/1, $0x01
  7407  DATA dataLteF32<>+3(SB)/1, $0x01
  7408  DATA dataLteF32<>+4(SB)/1, $0x01
  7409  DATA dataLteF32<>+5(SB)/1, $0x01
  7410  DATA dataLteF32<>+6(SB)/1, $0x01
  7411  DATA dataLteF32<>+7(SB)/1, $0x01
  7412  DATA dataLteF32<>+8(SB)/1, $0x00
  7413  DATA dataLteF32<>+9(SB)/1, $0x00
  7414  DATA dataLteF32<>+10(SB)/1, $0x00
  7415  DATA dataLteF32<>+11(SB)/1, $0x00
  7416  DATA dataLteF32<>+12(SB)/1, $0x00
  7417  DATA dataLteF32<>+13(SB)/1, $0x00
  7418  DATA dataLteF32<>+14(SB)/1, $0x00
  7419  DATA dataLteF32<>+15(SB)/1, $0x00
  7420  GLOBL dataLteF32<>(SB), RODATA|NOPTR, $16
  7421  
  7422  // func Lte_AVX2_F32(x []bool, y []float32, z []float32)
  7423  // Requires: AVX, AVX2
  7424  TEXT ·Lte_AVX2_F32(SB), NOSPLIT, $0-72
  7425  	MOVQ  x_base+0(FP), DI
  7426  	MOVQ  y_base+24(FP), SI
  7427  	MOVQ  z_base+48(FP), DX
  7428  	MOVQ  x_len+8(FP), CX
  7429  	TESTQ CX, CX
  7430  	JE    LBB3_7
  7431  	CMPQ  CX, $0x20
  7432  	JAE   LBB3_3
  7433  	XORL  R8, R8
  7434  	JMP   LBB3_6
  7435  
  7436  LBB3_3:
  7437  	MOVQ    CX, R8
  7438  	ANDQ    $-32, R8
  7439  	XORL    AX, AX
  7440  	VMOVDQU dataLteF32<>+0(SB), X0
  7441  
  7442  LBB3_4:
  7443  	VMOVUPS      (SI)(AX*4), Y1
  7444  	VMOVUPS      32(SI)(AX*4), Y2
  7445  	VMOVUPS      64(SI)(AX*4), Y3
  7446  	VMOVUPS      96(SI)(AX*4), Y4
  7447  	VCMPPS       $0x02, (DX)(AX*4), Y1, Y1
  7448  	VEXTRACTF128 $0x01, Y1, X5
  7449  	VPACKSSDW    X5, X1, X1
  7450  	VPACKSSWB    X1, X1, X1
  7451  	VCMPPS       $0x02, 32(DX)(AX*4), Y2, Y2
  7452  	VPAND        X0, X1, X1
  7453  	VEXTRACTF128 $0x01, Y2, X5
  7454  	VPACKSSDW    X5, X2, X2
  7455  	VPACKSSWB    X2, X2, X2
  7456  	VPAND        X0, X2, X2
  7457  	VCMPPS       $0x02, 64(DX)(AX*4), Y3, Y3
  7458  	VEXTRACTF128 $0x01, Y3, X5
  7459  	VPACKSSDW    X5, X3, X3
  7460  	VPACKSSWB    X3, X3, X3
  7461  	VCMPPS       $0x02, 96(DX)(AX*4), Y4, Y4
  7462  	VPAND        X0, X3, X3
  7463  	VEXTRACTF128 $0x01, Y4, X5
  7464  	VPACKSSDW    X5, X4, X4
  7465  	VPACKSSWB    X4, X4, X4
  7466  	VPAND        X0, X4, X4
  7467  	VINSERTI128  $0x01, X4, Y3, Y3
  7468  	VINSERTI128  $0x01, X2, Y1, Y1
  7469  	VPUNPCKLQDQ  Y3, Y1, Y1
  7470  	VPERMQ       $0xd8, Y1, Y1
  7471  	VMOVDQU      Y1, (DI)(AX*1)
  7472  	ADDQ         $0x20, AX
  7473  	CMPQ         R8, AX
  7474  	JNE          LBB3_4
  7475  	CMPQ         R8, CX
  7476  	JE           LBB3_7
  7477  
  7478  LBB3_6:
  7479  	VMOVSS   (SI)(R8*4), X0
  7480  	VUCOMISS (DX)(R8*4), X0
  7481  	SETLS    (DI)(R8*1)
  7482  	ADDQ     $0x01, R8
  7483  	CMPQ     CX, R8
  7484  	JNE      LBB3_6
  7485  
  7486  LBB3_7:
  7487  	VZEROUPPER
  7488  	RET
  7489  
  7490  DATA dataGtF64<>+0(SB)/1, $0x01
  7491  DATA dataGtF64<>+1(SB)/1, $0x01
  7492  DATA dataGtF64<>+2(SB)/1, $0x01
  7493  DATA dataGtF64<>+3(SB)/1, $0x01
  7494  DATA dataGtF64<>+4(SB)/1, $0x00
  7495  DATA dataGtF64<>+5(SB)/1, $0x00
  7496  DATA dataGtF64<>+6(SB)/1, $0x00
  7497  DATA dataGtF64<>+7(SB)/1, $0x00
  7498  DATA dataGtF64<>+8(SB)/1, $0x00
  7499  DATA dataGtF64<>+9(SB)/1, $0x00
  7500  DATA dataGtF64<>+10(SB)/1, $0x00
  7501  DATA dataGtF64<>+11(SB)/1, $0x00
  7502  DATA dataGtF64<>+12(SB)/1, $0x00
  7503  DATA dataGtF64<>+13(SB)/1, $0x00
  7504  DATA dataGtF64<>+14(SB)/1, $0x00
  7505  DATA dataGtF64<>+15(SB)/1, $0x00
  7506  GLOBL dataGtF64<>(SB), RODATA|NOPTR, $16
  7507  
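// Gt_AVX2_F64 below reuses the LT predicate with the operands swapped: the
// z vectors are loaded first and compared as z < y, which is y > z. The
// scalar tail uses SETHI (strictly above: CF and ZF both clear).
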
  7508  // func Gt_AVX2_F64(x []bool, y []float64, z []float64)
  7509  // Requires: AVX, AVX2
  7510  TEXT ·Gt_AVX2_F64(SB), NOSPLIT, $0-72
  7511  	MOVQ  x_base+0(FP), DI
  7512  	MOVQ  y_base+24(FP), SI
  7513  	MOVQ  z_base+48(FP), DX
  7514  	MOVQ  x_len+8(FP), CX
  7515  	TESTQ CX, CX
  7516  	JE    LBB4_7
  7517  	CMPQ  CX, $0x10
  7518  	JAE   LBB4_3
  7519  	XORL  R8, R8
  7520  	JMP   LBB4_6
  7521  
  7522  LBB4_3:
  7523  	MOVQ    CX, R8
  7524  	ANDQ    $-16, R8
  7525  	XORL    AX, AX
  7526  	VMOVDQU dataGtF64<>+0(SB), X0
  7527  
  7528  LBB4_4:
  7529  	VMOVUPD      (DX)(AX*8), Y1
  7530  	VMOVUPD      32(DX)(AX*8), Y2
  7531  	VMOVUPD      64(DX)(AX*8), Y3
  7532  	VMOVUPD      96(DX)(AX*8), Y4
  7533  	VCMPPD       $0x01, (SI)(AX*8), Y1, Y1
  7534  	VEXTRACTF128 $0x01, Y1, X5
  7535  	VPACKSSDW    X5, X1, X1
  7536  	VPACKSSDW    X1, X1, X1
  7537  	VPACKSSWB    X1, X1, X1
  7538  	VCMPPD       $0x01, 32(SI)(AX*8), Y2, Y2
  7539  	VPAND        X0, X1, X1
  7540  	VEXTRACTF128 $0x01, Y2, X5
  7541  	VPACKSSDW    X5, X2, X2
  7542  	VPACKSSDW    X2, X2, X2
  7543  	VPACKSSWB    X2, X2, X2
  7544  	VPAND        X0, X2, X2
  7545  	VCMPPD       $0x01, 64(SI)(AX*8), Y3, Y3
  7546  	VPUNPCKLDQ   X2, X1, X1
  7547  	VEXTRACTF128 $0x01, Y3, X2
  7548  	VPACKSSDW    X2, X3, X2
  7549  	VPACKSSDW    X2, X2, X2
  7550  	VPACKSSWB    X2, X2, X2
  7551  	VPAND        X0, X2, X2
  7552  	VCMPPD       $0x01, 96(SI)(AX*8), Y4, Y3
  7553  	VEXTRACTF128 $0x01, Y3, X4
  7554  	VPACKSSDW    X4, X3, X3
  7555  	VPACKSSDW    X3, X3, X3
  7556  	VPACKSSWB    X3, X3, X3
  7557  	VPAND        X0, X3, X3
  7558  	VPBROADCASTD X3, X3
  7559  	VPBROADCASTD X2, X2
  7560  	VPUNPCKLDQ   X3, X2, X2
  7561  	VPBLENDD     $0x0c, X2, X1, X1
  7562  	VMOVDQU      X1, (DI)(AX*1)
  7563  	ADDQ         $0x10, AX
  7564  	CMPQ         R8, AX
  7565  	JNE          LBB4_4
  7566  	CMPQ         R8, CX
  7567  	JE           LBB4_7
  7568  
  7569  LBB4_6:
  7570  	VMOVSD   (SI)(R8*8), X0
  7571  	VUCOMISD (DX)(R8*8), X0
  7572  	SETHI    (DI)(R8*1)
  7573  	ADDQ     $0x01, R8
  7574  	CMPQ     CX, R8
  7575  	JNE      LBB4_6
  7576  
  7577  LBB4_7:
  7578  	VZEROUPPER
  7579  	RET
  7580  
  7581  DATA dataGtF32<>+0(SB)/1, $0x01
  7582  DATA dataGtF32<>+1(SB)/1, $0x01
  7583  DATA dataGtF32<>+2(SB)/1, $0x01
  7584  DATA dataGtF32<>+3(SB)/1, $0x01
  7585  DATA dataGtF32<>+4(SB)/1, $0x01
  7586  DATA dataGtF32<>+5(SB)/1, $0x01
  7587  DATA dataGtF32<>+6(SB)/1, $0x01
  7588  DATA dataGtF32<>+7(SB)/1, $0x01
  7589  DATA dataGtF32<>+8(SB)/1, $0x00
  7590  DATA dataGtF32<>+9(SB)/1, $0x00
  7591  DATA dataGtF32<>+10(SB)/1, $0x00
  7592  DATA dataGtF32<>+11(SB)/1, $0x00
  7593  DATA dataGtF32<>+12(SB)/1, $0x00
  7594  DATA dataGtF32<>+13(SB)/1, $0x00
  7595  DATA dataGtF32<>+14(SB)/1, $0x00
  7596  DATA dataGtF32<>+15(SB)/1, $0x00
  7597  GLOBL dataGtF32<>(SB), RODATA|NOPTR, $16
  7598  
  7599  // func Gt_AVX2_F32(x []bool, y []float32, z []float32)
  7600  // Requires: AVX, AVX2
  7601  TEXT ·Gt_AVX2_F32(SB), NOSPLIT, $0-72
  7602  	MOVQ  x_base+0(FP), DI
  7603  	MOVQ  y_base+24(FP), SI
  7604  	MOVQ  z_base+48(FP), DX
  7605  	MOVQ  x_len+8(FP), CX
  7606  	TESTQ CX, CX
  7607  	JE    LBB5_7
  7608  	CMPQ  CX, $0x20
  7609  	JAE   LBB5_3
  7610  	XORL  R8, R8
  7611  	JMP   LBB5_6
  7612  
  7613  LBB5_3:
  7614  	MOVQ    CX, R8
  7615  	ANDQ    $-32, R8
  7616  	XORL    AX, AX
  7617  	VMOVDQU dataGtF32<>+0(SB), X0
  7618  
  7619  LBB5_4:
  7620  	VMOVUPS      (DX)(AX*4), Y1
  7621  	VMOVUPS      32(DX)(AX*4), Y2
  7622  	VMOVUPS      64(DX)(AX*4), Y3
  7623  	VMOVUPS      96(DX)(AX*4), Y4
  7624  	VCMPPS       $0x01, (SI)(AX*4), Y1, Y1
  7625  	VEXTRACTF128 $0x01, Y1, X5
  7626  	VPACKSSDW    X5, X1, X1
  7627  	VPACKSSWB    X1, X1, X1
  7628  	VCMPPS       $0x01, 32(SI)(AX*4), Y2, Y2
  7629  	VPAND        X0, X1, X1
  7630  	VEXTRACTF128 $0x01, Y2, X5
  7631  	VPACKSSDW    X5, X2, X2
  7632  	VPACKSSWB    X2, X2, X2
  7633  	VPAND        X0, X2, X2
  7634  	VCMPPS       $0x01, 64(SI)(AX*4), Y3, Y3
  7635  	VEXTRACTF128 $0x01, Y3, X5
  7636  	VPACKSSDW    X5, X3, X3
  7637  	VPACKSSWB    X3, X3, X3
  7638  	VCMPPS       $0x01, 96(SI)(AX*4), Y4, Y4
  7639  	VPAND        X0, X3, X3
  7640  	VEXTRACTF128 $0x01, Y4, X5
  7641  	VPACKSSDW    X5, X4, X4
  7642  	VPACKSSWB    X4, X4, X4
  7643  	VPAND        X0, X4, X4
  7644  	VINSERTI128  $0x01, X4, Y3, Y3
  7645  	VINSERTI128  $0x01, X2, Y1, Y1
  7646  	VPUNPCKLQDQ  Y3, Y1, Y1
  7647  	VPERMQ       $0xd8, Y1, Y1
  7648  	VMOVDQU      Y1, (DI)(AX*1)
  7649  	ADDQ         $0x20, AX
  7650  	CMPQ         R8, AX
  7651  	JNE          LBB5_4
  7652  	CMPQ         R8, CX
  7653  	JE           LBB5_7
  7654  
  7655  LBB5_6:
  7656  	VMOVSS   (SI)(R8*4), X0
  7657  	VUCOMISS (DX)(R8*4), X0
  7658  	SETHI    (DI)(R8*1)
  7659  	ADDQ     $0x01, R8
  7660  	CMPQ     CX, R8
  7661  	JNE      LBB5_6
  7662  
  7663  LBB5_7:
  7664  	VZEROUPPER
  7665  	RET
  7666  
  7667  DATA dataGteF64<>+0(SB)/1, $0x01
  7668  DATA dataGteF64<>+1(SB)/1, $0x01
  7669  DATA dataGteF64<>+2(SB)/1, $0x01
  7670  DATA dataGteF64<>+3(SB)/1, $0x01
  7671  DATA dataGteF64<>+4(SB)/1, $0x00
  7672  DATA dataGteF64<>+5(SB)/1, $0x00
  7673  DATA dataGteF64<>+6(SB)/1, $0x00
  7674  DATA dataGteF64<>+7(SB)/1, $0x00
  7675  DATA dataGteF64<>+8(SB)/1, $0x00
  7676  DATA dataGteF64<>+9(SB)/1, $0x00
  7677  DATA dataGteF64<>+10(SB)/1, $0x00
  7678  DATA dataGteF64<>+11(SB)/1, $0x00
  7679  DATA dataGteF64<>+12(SB)/1, $0x00
  7680  DATA dataGteF64<>+13(SB)/1, $0x00
  7681  DATA dataGteF64<>+14(SB)/1, $0x00
  7682  DATA dataGteF64<>+15(SB)/1, $0x00
  7683  GLOBL dataGteF64<>(SB), RODATA|NOPTR, $16
  7684  
  7685  // func Gte_AVX2_F64(x []bool, y []float64, z []float64)
  7686  // Requires: AVX, AVX2
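        //
        // Same narrowing pipeline as Gt_AVX2_F64, with predicate $0x02 (LE_OS)
        // and swapped operands: the mask is z[i] <= y[i], i.e. y[i] >= z[i].
        // The tail's SETCC (above or equal) encodes the same relation.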
  7687  TEXT ·Gte_AVX2_F64(SB), NOSPLIT, $0-72
  7688  	MOVQ  x_base+0(FP), DI
  7689  	MOVQ  y_base+24(FP), SI
  7690  	MOVQ  z_base+48(FP), DX
  7691  	MOVQ  x_len+8(FP), CX
  7692  	TESTQ CX, CX
  7693  	JE    LBB6_7
  7694  	CMPQ  CX, $0x10
  7695  	JAE   LBB6_3
  7696  	XORL  R8, R8
  7697  	JMP   LBB6_6
  7698  
  7699  LBB6_3:
  7700  	MOVQ    CX, R8
  7701  	ANDQ    $-16, R8
  7702  	XORL    AX, AX
  7703  	VMOVDQU dataGteF64<>+0(SB), X0
  7704  
  7705  LBB6_4:
  7706  	VMOVUPD      (DX)(AX*8), Y1
  7707  	VMOVUPD      32(DX)(AX*8), Y2
  7708  	VMOVUPD      64(DX)(AX*8), Y3
  7709  	VMOVUPD      96(DX)(AX*8), Y4
  7710  	VCMPPD       $0x02, (SI)(AX*8), Y1, Y1
  7711  	VEXTRACTF128 $0x01, Y1, X5
  7712  	VPACKSSDW    X5, X1, X1
  7713  	VPACKSSDW    X1, X1, X1
  7714  	VPACKSSWB    X1, X1, X1
  7715  	VCMPPD       $0x02, 32(SI)(AX*8), Y2, Y2
  7716  	VPAND        X0, X1, X1
  7717  	VEXTRACTF128 $0x01, Y2, X5
  7718  	VPACKSSDW    X5, X2, X2
  7719  	VPACKSSDW    X2, X2, X2
  7720  	VPACKSSWB    X2, X2, X2
  7721  	VPAND        X0, X2, X2
  7722  	VCMPPD       $0x02, 64(SI)(AX*8), Y3, Y3
  7723  	VPUNPCKLDQ   X2, X1, X1
  7724  	VEXTRACTF128 $0x01, Y3, X2
  7725  	VPACKSSDW    X2, X3, X2
  7726  	VPACKSSDW    X2, X2, X2
  7727  	VPACKSSWB    X2, X2, X2
  7728  	VPAND        X0, X2, X2
  7729  	VCMPPD       $0x02, 96(SI)(AX*8), Y4, Y3
  7730  	VEXTRACTF128 $0x01, Y3, X4
  7731  	VPACKSSDW    X4, X3, X3
  7732  	VPACKSSDW    X3, X3, X3
  7733  	VPACKSSWB    X3, X3, X3
  7734  	VPAND        X0, X3, X3
  7735  	VPBROADCASTD X3, X3
  7736  	VPBROADCASTD X2, X2
  7737  	VPUNPCKLDQ   X3, X2, X2
  7738  	VPBLENDD     $0x0c, X2, X1, X1
  7739  	VMOVDQU      X1, (DI)(AX*1)
  7740  	ADDQ         $0x10, AX
  7741  	CMPQ         R8, AX
  7742  	JNE          LBB6_4
  7743  	CMPQ         R8, CX
  7744  	JE           LBB6_7
  7745  
  7746  LBB6_6:
  7747  	VMOVSD   (SI)(R8*8), X0
  7748  	VUCOMISD (DX)(R8*8), X0
  7749  	SETCC    (DI)(R8*1)
  7750  	ADDQ     $0x01, R8
  7751  	CMPQ     CX, R8
  7752  	JNE      LBB6_6
  7753  
  7754  LBB6_7:
  7755  	VZEROUPPER
  7756  	RET
  7757  
  7758  DATA dataGteF32<>+0(SB)/1, $0x01
  7759  DATA dataGteF32<>+1(SB)/1, $0x01
  7760  DATA dataGteF32<>+2(SB)/1, $0x01
  7761  DATA dataGteF32<>+3(SB)/1, $0x01
  7762  DATA dataGteF32<>+4(SB)/1, $0x01
  7763  DATA dataGteF32<>+5(SB)/1, $0x01
  7764  DATA dataGteF32<>+6(SB)/1, $0x01
  7765  DATA dataGteF32<>+7(SB)/1, $0x01
  7766  DATA dataGteF32<>+8(SB)/1, $0x00
  7767  DATA dataGteF32<>+9(SB)/1, $0x00
  7768  DATA dataGteF32<>+10(SB)/1, $0x00
  7769  DATA dataGteF32<>+11(SB)/1, $0x00
  7770  DATA dataGteF32<>+12(SB)/1, $0x00
  7771  DATA dataGteF32<>+13(SB)/1, $0x00
  7772  DATA dataGteF32<>+14(SB)/1, $0x00
  7773  DATA dataGteF32<>+15(SB)/1, $0x00
  7774  GLOBL dataGteF32<>(SB), RODATA|NOPTR, $16
  7775  
  7776  // func Gte_AVX2_F32(x []bool, y []float32, z []float32)
  7777  // Requires: AVX, AVX2
  7778  TEXT ·Gte_AVX2_F32(SB), NOSPLIT, $0-72
  7779  	MOVQ  x_base+0(FP), DI
  7780  	MOVQ  y_base+24(FP), SI
  7781  	MOVQ  z_base+48(FP), DX
  7782  	MOVQ  x_len+8(FP), CX
  7783  	TESTQ CX, CX
  7784  	JE    LBB7_7
  7785  	CMPQ  CX, $0x20
  7786  	JAE   LBB7_3
  7787  	XORL  R8, R8
  7788  	JMP   LBB7_6
  7789  
  7790  LBB7_3:
  7791  	MOVQ    CX, R8
  7792  	ANDQ    $-32, R8
  7793  	XORL    AX, AX
  7794  	VMOVDQU dataGteF32<>+0(SB), X0
  7795  
  7796  LBB7_4:
  7797  	VMOVUPS      (DX)(AX*4), Y1
  7798  	VMOVUPS      32(DX)(AX*4), Y2
  7799  	VMOVUPS      64(DX)(AX*4), Y3
  7800  	VMOVUPS      96(DX)(AX*4), Y4
  7801  	VCMPPS       $0x02, (SI)(AX*4), Y1, Y1
  7802  	VEXTRACTF128 $0x01, Y1, X5
  7803  	VPACKSSDW    X5, X1, X1
  7804  	VPACKSSWB    X1, X1, X1
  7805  	VCMPPS       $0x02, 32(SI)(AX*4), Y2, Y2
  7806  	VPAND        X0, X1, X1
  7807  	VEXTRACTF128 $0x01, Y2, X5
  7808  	VPACKSSDW    X5, X2, X2
  7809  	VPACKSSWB    X2, X2, X2
  7810  	VPAND        X0, X2, X2
  7811  	VCMPPS       $0x02, 64(SI)(AX*4), Y3, Y3
  7812  	VEXTRACTF128 $0x01, Y3, X5
  7813  	VPACKSSDW    X5, X3, X3
  7814  	VPACKSSWB    X3, X3, X3
  7815  	VCMPPS       $0x02, 96(SI)(AX*4), Y4, Y4
  7816  	VPAND        X0, X3, X3
  7817  	VEXTRACTF128 $0x01, Y4, X5
  7818  	VPACKSSDW    X5, X4, X4
  7819  	VPACKSSWB    X4, X4, X4
  7820  	VPAND        X0, X4, X4
  7821  	VINSERTI128  $0x01, X4, Y3, Y3
  7822  	VINSERTI128  $0x01, X2, Y1, Y1
  7823  	VPUNPCKLQDQ  Y3, Y1, Y1
  7824  	VPERMQ       $0xd8, Y1, Y1
  7825  	VMOVDQU      Y1, (DI)(AX*1)
  7826  	ADDQ         $0x20, AX
  7827  	CMPQ         R8, AX
  7828  	JNE          LBB7_4
  7829  	CMPQ         R8, CX
  7830  	JE           LBB7_7
  7831  
  7832  LBB7_6:
  7833  	VMOVSS   (SI)(R8*4), X0
  7834  	VUCOMISS (DX)(R8*4), X0
  7835  	SETCC    (DI)(R8*1)
  7836  	ADDQ     $0x01, R8
  7837  	CMPQ     CX, R8
  7838  	JNE      LBB7_6
  7839  
  7840  LBB7_7:
  7841  	VZEROUPPER
  7842  	RET
  7843  
  7844  DATA dataEqF64<>+0(SB)/1, $0x01
  7845  DATA dataEqF64<>+1(SB)/1, $0x01
  7846  DATA dataEqF64<>+2(SB)/1, $0x01
  7847  DATA dataEqF64<>+3(SB)/1, $0x01
  7848  DATA dataEqF64<>+4(SB)/1, $0x00
  7849  DATA dataEqF64<>+5(SB)/1, $0x00
  7850  DATA dataEqF64<>+6(SB)/1, $0x00
  7851  DATA dataEqF64<>+7(SB)/1, $0x00
  7852  DATA dataEqF64<>+8(SB)/1, $0x00
  7853  DATA dataEqF64<>+9(SB)/1, $0x00
  7854  DATA dataEqF64<>+10(SB)/1, $0x00
  7855  DATA dataEqF64<>+11(SB)/1, $0x00
  7856  DATA dataEqF64<>+12(SB)/1, $0x00
  7857  DATA dataEqF64<>+13(SB)/1, $0x00
  7858  DATA dataEqF64<>+14(SB)/1, $0x00
  7859  DATA dataEqF64<>+15(SB)/1, $0x00
  7860  GLOBL dataEqF64<>(SB), RODATA|NOPTR, $16
  7861  
  7862  // func Eq_AVX2_F64(x []bool, y []float64, z []float64)
  7863  // Requires: AVX, AVX2
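        //
        // Equality variant: VCMPPD immediate $0x00 is EQ_OQ, which is false
        // when either input is NaN. Note the scalar tail's VUCOMISD + SETEQ
        // reports true for unordered operands (ZF is set on NaN), so the tail
        // and vector paths can disagree on NaN inputs; the kernel presumably
        // assumes NaN-free data.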
  7864  TEXT ·Eq_AVX2_F64(SB), NOSPLIT, $0-72
  7865  	MOVQ  x_base+0(FP), DI
  7866  	MOVQ  y_base+24(FP), SI
  7867  	MOVQ  z_base+48(FP), DX
  7868  	MOVQ  x_len+8(FP), CX
  7869  	TESTQ CX, CX
  7870  	JE    LBB8_7
  7871  	CMPQ  CX, $0x10
  7872  	JAE   LBB8_3
  7873  	XORL  R8, R8
  7874  	JMP   LBB8_6
  7875  
  7876  LBB8_3:
  7877  	MOVQ    CX, R8
  7878  	ANDQ    $-16, R8
  7879  	XORL    AX, AX
  7880  	VMOVDQU dataEqF64<>+0(SB), X0
  7881  
  7882  LBB8_4:
  7883  	VMOVUPD      (DX)(AX*8), Y1
  7884  	VMOVUPD      32(DX)(AX*8), Y2
  7885  	VMOVUPD      64(DX)(AX*8), Y3
  7886  	VMOVUPD      96(DX)(AX*8), Y4
  7887  	VCMPPD       $0x00, (SI)(AX*8), Y1, Y1
  7888  	VEXTRACTF128 $0x01, Y1, X5
  7889  	VPACKSSDW    X5, X1, X1
  7890  	VPACKSSDW    X1, X1, X1
  7891  	VPACKSSWB    X1, X1, X1
  7892  	VCMPPD       $0x00, 32(SI)(AX*8), Y2, Y2
  7893  	VPAND        X0, X1, X1
  7894  	VEXTRACTF128 $0x01, Y2, X5
  7895  	VPACKSSDW    X5, X2, X2
  7896  	VPACKSSDW    X2, X2, X2
  7897  	VPACKSSWB    X2, X2, X2
  7898  	VPAND        X0, X2, X2
  7899  	VCMPPD       $0x00, 64(SI)(AX*8), Y3, Y3
  7900  	VPUNPCKLDQ   X2, X1, X1
  7901  	VEXTRACTF128 $0x01, Y3, X2
  7902  	VPACKSSDW    X2, X3, X2
  7903  	VPACKSSDW    X2, X2, X2
  7904  	VPACKSSWB    X2, X2, X2
  7905  	VPAND        X0, X2, X2
  7906  	VCMPPD       $0x00, 96(SI)(AX*8), Y4, Y3
  7907  	VEXTRACTF128 $0x01, Y3, X4
  7908  	VPACKSSDW    X4, X3, X3
  7909  	VPACKSSDW    X3, X3, X3
  7910  	VPACKSSWB    X3, X3, X3
  7911  	VPAND        X0, X3, X3
  7912  	VPBROADCASTD X3, X3
  7913  	VPBROADCASTD X2, X2
  7914  	VPUNPCKLDQ   X3, X2, X2
  7915  	VPBLENDD     $0x0c, X2, X1, X1
  7916  	VMOVDQU      X1, (DI)(AX*1)
  7917  	ADDQ         $0x10, AX
  7918  	CMPQ         R8, AX
  7919  	JNE          LBB8_4
  7920  	CMPQ         R8, CX
  7921  	JE           LBB8_7
  7922  
  7923  LBB8_6:
  7924  	VMOVSD   (SI)(R8*8), X0
  7925  	VUCOMISD (DX)(R8*8), X0
  7926  	SETEQ    (DI)(R8*1)
  7927  	ADDQ     $0x01, R8
  7928  	CMPQ     CX, R8
  7929  	JNE      LBB8_6
  7930  
  7931  LBB8_7:
  7932  	VZEROUPPER
  7933  	RET
  7934  
  7935  DATA dataEqF32<>+0(SB)/1, $0x01
  7936  DATA dataEqF32<>+1(SB)/1, $0x01
  7937  DATA dataEqF32<>+2(SB)/1, $0x01
  7938  DATA dataEqF32<>+3(SB)/1, $0x01
  7939  DATA dataEqF32<>+4(SB)/1, $0x01
  7940  DATA dataEqF32<>+5(SB)/1, $0x01
  7941  DATA dataEqF32<>+6(SB)/1, $0x01
  7942  DATA dataEqF32<>+7(SB)/1, $0x01
  7943  DATA dataEqF32<>+8(SB)/1, $0x00
  7944  DATA dataEqF32<>+9(SB)/1, $0x00
  7945  DATA dataEqF32<>+10(SB)/1, $0x00
  7946  DATA dataEqF32<>+11(SB)/1, $0x00
  7947  DATA dataEqF32<>+12(SB)/1, $0x00
  7948  DATA dataEqF32<>+13(SB)/1, $0x00
  7949  DATA dataEqF32<>+14(SB)/1, $0x00
  7950  DATA dataEqF32<>+15(SB)/1, $0x00
  7951  GLOBL dataEqF32<>(SB), RODATA|NOPTR, $16
  7952  
  7953  // func Eq_AVX2_F32(x []bool, y []float32, z []float32)
  7954  // Requires: AVX, AVX2
  7955  TEXT ·Eq_AVX2_F32(SB), NOSPLIT, $0-72
  7956  	MOVQ  x_base+0(FP), DI
  7957  	MOVQ  y_base+24(FP), SI
  7958  	MOVQ  z_base+48(FP), DX
  7959  	MOVQ  x_len+8(FP), CX
  7960  	TESTQ CX, CX
  7961  	JE    LBB9_7
  7962  	CMPQ  CX, $0x20
  7963  	JAE   LBB9_3
  7964  	XORL  R8, R8
  7965  	JMP   LBB9_6
  7966  
  7967  LBB9_3:
  7968  	MOVQ    CX, R8
  7969  	ANDQ    $-32, R8
  7970  	XORL    AX, AX
  7971  	VMOVDQU dataEqF32<>+0(SB), X0
  7972  
  7973  LBB9_4:
  7974  	VMOVUPS      (DX)(AX*4), Y1
  7975  	VMOVUPS      32(DX)(AX*4), Y2
  7976  	VMOVUPS      64(DX)(AX*4), Y3
  7977  	VMOVUPS      96(DX)(AX*4), Y4
  7978  	VCMPPS       $0x00, (SI)(AX*4), Y1, Y1
  7979  	VEXTRACTF128 $0x01, Y1, X5
  7980  	VPACKSSDW    X5, X1, X1
  7981  	VPACKSSWB    X1, X1, X1
  7982  	VCMPPS       $0x00, 32(SI)(AX*4), Y2, Y2
  7983  	VPAND        X0, X1, X1
  7984  	VEXTRACTF128 $0x01, Y2, X5
  7985  	VPACKSSDW    X5, X2, X2
  7986  	VPACKSSWB    X2, X2, X2
  7987  	VPAND        X0, X2, X2
  7988  	VCMPPS       $0x00, 64(SI)(AX*4), Y3, Y3
  7989  	VEXTRACTF128 $0x01, Y3, X5
  7990  	VPACKSSDW    X5, X3, X3
  7991  	VPACKSSWB    X3, X3, X3
  7992  	VCMPPS       $0x00, 96(SI)(AX*4), Y4, Y4
  7993  	VPAND        X0, X3, X3
  7994  	VEXTRACTF128 $0x01, Y4, X5
  7995  	VPACKSSDW    X5, X4, X4
  7996  	VPACKSSWB    X4, X4, X4
  7997  	VPAND        X0, X4, X4
  7998  	VINSERTI128  $0x01, X4, Y3, Y3
  7999  	VINSERTI128  $0x01, X2, Y1, Y1
  8000  	VPUNPCKLQDQ  Y3, Y1, Y1
  8001  	VPERMQ       $0xd8, Y1, Y1
  8002  	VMOVDQU      Y1, (DI)(AX*1)
  8003  	ADDQ         $0x20, AX
  8004  	CMPQ         R8, AX
  8005  	JNE          LBB9_4
  8006  	CMPQ         R8, CX
  8007  	JE           LBB9_7
  8008  
  8009  LBB9_6:
  8010  	VMOVSS   (SI)(R8*4), X0
  8011  	VUCOMISS (DX)(R8*4), X0
  8012  	SETEQ    (DI)(R8*1)
  8013  	ADDQ     $0x01, R8
  8014  	CMPQ     CX, R8
  8015  	JNE      LBB9_6
  8016  
  8017  LBB9_7:
  8018  	VZEROUPPER
  8019  	RET
  8020  
  8021  DATA dataNeqF64<>+0(SB)/1, $0x01
  8022  DATA dataNeqF64<>+1(SB)/1, $0x01
  8023  DATA dataNeqF64<>+2(SB)/1, $0x01
  8024  DATA dataNeqF64<>+3(SB)/1, $0x01
  8025  DATA dataNeqF64<>+4(SB)/1, $0x00
  8026  DATA dataNeqF64<>+5(SB)/1, $0x00
  8027  DATA dataNeqF64<>+6(SB)/1, $0x00
  8028  DATA dataNeqF64<>+7(SB)/1, $0x00
  8029  DATA dataNeqF64<>+8(SB)/1, $0x00
  8030  DATA dataNeqF64<>+9(SB)/1, $0x00
  8031  DATA dataNeqF64<>+10(SB)/1, $0x00
  8032  DATA dataNeqF64<>+11(SB)/1, $0x00
  8033  DATA dataNeqF64<>+12(SB)/1, $0x00
  8034  DATA dataNeqF64<>+13(SB)/1, $0x00
  8035  DATA dataNeqF64<>+14(SB)/1, $0x00
  8036  DATA dataNeqF64<>+15(SB)/1, $0x00
  8037  GLOBL dataNeqF64<>(SB), RODATA|NOPTR, $16
  8038  
  8039  // func Neq_AVX2_F64(x []bool, y []float64, z []float64)
  8040  // Requires: AVX, AVX2
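        //
        // Inequality variant: VCMPPD immediate $0x04 is NEQ_UQ, which is true
        // for NaN operands, while the tail's VUCOMISD + SETNE is false on
        // unordered inputs (the mirror of the NaN caveat on Eq_AVX2_F64).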
  8041  TEXT ·Neq_AVX2_F64(SB), NOSPLIT, $0-72
  8042  	MOVQ  x_base+0(FP), DI
  8043  	MOVQ  y_base+24(FP), SI
  8044  	MOVQ  z_base+48(FP), DX
  8045  	MOVQ  x_len+8(FP), CX
  8046  	TESTQ CX, CX
  8047  	JE    LBB10_7
  8048  	CMPQ  CX, $0x10
  8049  	JAE   LBB10_3
  8050  	XORL  R8, R8
  8051  	JMP   LBB10_6
  8052  
  8053  LBB10_3:
  8054  	MOVQ    CX, R8
  8055  	ANDQ    $-16, R8
  8056  	XORL    AX, AX
  8057  	VMOVDQU dataNeqF64<>+0(SB), X0
  8058  
  8059  LBB10_4:
  8060  	VMOVUPD      (DX)(AX*8), Y1
  8061  	VMOVUPD      32(DX)(AX*8), Y2
  8062  	VMOVUPD      64(DX)(AX*8), Y3
  8063  	VMOVUPD      96(DX)(AX*8), Y4
  8064  	VCMPPD       $0x04, (SI)(AX*8), Y1, Y1
  8065  	VEXTRACTF128 $0x01, Y1, X5
  8066  	VPACKSSDW    X5, X1, X1
  8067  	VPACKSSDW    X1, X1, X1
  8068  	VPACKSSWB    X1, X1, X1
  8069  	VCMPPD       $0x04, 32(SI)(AX*8), Y2, Y2
  8070  	VPAND        X0, X1, X1
  8071  	VEXTRACTF128 $0x01, Y2, X5
  8072  	VPACKSSDW    X5, X2, X2
  8073  	VPACKSSDW    X2, X2, X2
  8074  	VPACKSSWB    X2, X2, X2
  8075  	VPAND        X0, X2, X2
  8076  	VCMPPD       $0x04, 64(SI)(AX*8), Y3, Y3
  8077  	VPUNPCKLDQ   X2, X1, X1
  8078  	VEXTRACTF128 $0x01, Y3, X2
  8079  	VPACKSSDW    X2, X3, X2
  8080  	VPACKSSDW    X2, X2, X2
  8081  	VPACKSSWB    X2, X2, X2
  8082  	VPAND        X0, X2, X2
  8083  	VCMPPD       $0x04, 96(SI)(AX*8), Y4, Y3
  8084  	VEXTRACTF128 $0x01, Y3, X4
  8085  	VPACKSSDW    X4, X3, X3
  8086  	VPACKSSDW    X3, X3, X3
  8087  	VPACKSSWB    X3, X3, X3
  8088  	VPAND        X0, X3, X3
  8089  	VPBROADCASTD X3, X3
  8090  	VPBROADCASTD X2, X2
  8091  	VPUNPCKLDQ   X3, X2, X2
  8092  	VPBLENDD     $0x0c, X2, X1, X1
  8093  	VMOVDQU      X1, (DI)(AX*1)
  8094  	ADDQ         $0x10, AX
  8095  	CMPQ         R8, AX
  8096  	JNE          LBB10_4
  8097  	CMPQ         R8, CX
  8098  	JE           LBB10_7
  8099  
  8100  LBB10_6:
  8101  	VMOVSD   (SI)(R8*8), X0
  8102  	VUCOMISD (DX)(R8*8), X0
  8103  	SETNE    (DI)(R8*1)
  8104  	ADDQ     $0x01, R8
  8105  	CMPQ     CX, R8
  8106  	JNE      LBB10_6
  8107  
  8108  LBB10_7:
  8109  	VZEROUPPER
  8110  	RET
  8111  
  8112  DATA dataNeqF32<>+0(SB)/1, $0x01
  8113  DATA dataNeqF32<>+1(SB)/1, $0x01
  8114  DATA dataNeqF32<>+2(SB)/1, $0x01
  8115  DATA dataNeqF32<>+3(SB)/1, $0x01
  8116  DATA dataNeqF32<>+4(SB)/1, $0x01
  8117  DATA dataNeqF32<>+5(SB)/1, $0x01
  8118  DATA dataNeqF32<>+6(SB)/1, $0x01
  8119  DATA dataNeqF32<>+7(SB)/1, $0x01
  8120  DATA dataNeqF32<>+8(SB)/1, $0x00
  8121  DATA dataNeqF32<>+9(SB)/1, $0x00
  8122  DATA dataNeqF32<>+10(SB)/1, $0x00
  8123  DATA dataNeqF32<>+11(SB)/1, $0x00
  8124  DATA dataNeqF32<>+12(SB)/1, $0x00
  8125  DATA dataNeqF32<>+13(SB)/1, $0x00
  8126  DATA dataNeqF32<>+14(SB)/1, $0x00
  8127  DATA dataNeqF32<>+15(SB)/1, $0x00
  8128  GLOBL dataNeqF32<>(SB), RODATA|NOPTR, $16
  8129  
  8130  // func Neq_AVX2_F32(x []bool, y []float32, z []float32)
  8131  // Requires: AVX, AVX2
  8132  TEXT ·Neq_AVX2_F32(SB), NOSPLIT, $0-72
  8133  	MOVQ  x_base+0(FP), DI
  8134  	MOVQ  y_base+24(FP), SI
  8135  	MOVQ  z_base+48(FP), DX
  8136  	MOVQ  x_len+8(FP), CX
  8137  	TESTQ CX, CX
  8138  	JE    LBB11_7
  8139  	CMPQ  CX, $0x20
  8140  	JAE   LBB11_3
  8141  	XORL  R8, R8
  8142  	JMP   LBB11_6
  8143  
  8144  LBB11_3:
  8145  	MOVQ    CX, R8
  8146  	ANDQ    $-32, R8
  8147  	XORL    AX, AX
  8148  	VMOVDQU dataNeqF32<>+0(SB), X0
  8149  
  8150  LBB11_4:
  8151  	VMOVUPS      (DX)(AX*4), Y1
  8152  	VMOVUPS      32(DX)(AX*4), Y2
  8153  	VMOVUPS      64(DX)(AX*4), Y3
  8154  	VMOVUPS      96(DX)(AX*4), Y4
  8155  	VCMPPS       $0x04, (SI)(AX*4), Y1, Y1
  8156  	VEXTRACTF128 $0x01, Y1, X5
  8157  	VPACKSSDW    X5, X1, X1
  8158  	VPACKSSWB    X1, X1, X1
  8159  	VCMPPS       $0x04, 32(SI)(AX*4), Y2, Y2
  8160  	VPAND        X0, X1, X1
  8161  	VEXTRACTF128 $0x01, Y2, X5
  8162  	VPACKSSDW    X5, X2, X2
  8163  	VPACKSSWB    X2, X2, X2
  8164  	VPAND        X0, X2, X2
  8165  	VCMPPS       $0x04, 64(SI)(AX*4), Y3, Y3
  8166  	VEXTRACTF128 $0x01, Y3, X5
  8167  	VPACKSSDW    X5, X3, X3
  8168  	VPACKSSWB    X3, X3, X3
  8169  	VCMPPS       $0x04, 96(SI)(AX*4), Y4, Y4
  8170  	VPAND        X0, X3, X3
  8171  	VEXTRACTF128 $0x01, Y4, X5
  8172  	VPACKSSDW    X5, X4, X4
  8173  	VPACKSSWB    X4, X4, X4
  8174  	VPAND        X0, X4, X4
  8175  	VINSERTI128  $0x01, X4, Y3, Y3
  8176  	VINSERTI128  $0x01, X2, Y1, Y1
  8177  	VPUNPCKLQDQ  Y3, Y1, Y1
  8178  	VPERMQ       $0xd8, Y1, Y1
  8179  	VMOVDQU      Y1, (DI)(AX*1)
  8180  	ADDQ         $0x20, AX
  8181  	CMPQ         R8, AX
  8182  	JNE          LBB11_4
  8183  	CMPQ         R8, CX
  8184  	JE           LBB11_7
  8185  
  8186  LBB11_6:
  8187  	VMOVSS   (SI)(R8*4), X0
  8188  	VUCOMISS (DX)(R8*4), X0
  8189  	SETNE    (DI)(R8*1)
  8190  	ADDQ     $0x01, R8
  8191  	CMPQ     CX, R8
  8192  	JNE      LBB11_6
  8193  
  8194  LBB11_7:
  8195  	VZEROUPPER
  8196  	RET
  8197  
  8198  DATA dataLtNumberF64<>+0(SB)/1, $0x01
  8199  DATA dataLtNumberF64<>+1(SB)/1, $0x01
  8200  DATA dataLtNumberF64<>+2(SB)/1, $0x01
  8201  DATA dataLtNumberF64<>+3(SB)/1, $0x01
  8202  DATA dataLtNumberF64<>+4(SB)/1, $0x00
  8203  DATA dataLtNumberF64<>+5(SB)/1, $0x00
  8204  DATA dataLtNumberF64<>+6(SB)/1, $0x00
  8205  DATA dataLtNumberF64<>+7(SB)/1, $0x00
  8206  DATA dataLtNumberF64<>+8(SB)/1, $0x00
  8207  DATA dataLtNumberF64<>+9(SB)/1, $0x00
  8208  DATA dataLtNumberF64<>+10(SB)/1, $0x00
  8209  DATA dataLtNumberF64<>+11(SB)/1, $0x00
  8210  DATA dataLtNumberF64<>+12(SB)/1, $0x00
  8211  DATA dataLtNumberF64<>+13(SB)/1, $0x00
  8212  DATA dataLtNumberF64<>+14(SB)/1, $0x00
  8213  DATA dataLtNumberF64<>+15(SB)/1, $0x00
  8214  GLOBL dataLtNumberF64<>(SB), RODATA|NOPTR, $16
  8215  
  8216  // func LtNumber_AVX2_F64(x []bool, y []float64, a float64)
  8217  // Requires: AVX, AVX2, SSE2
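        //
        // Scalar-broadcast variant: a is splatted across Y1 with VBROADCASTSD
        // and held there for the whole loop. The compare puts y[i] first, so
        // predicate $0x01 (LT_OS) yields y[i] < a. The tail's VUCOMISD
        // reverses the operands (flags reflect a versus y[i]), hence SETHI
        // (a strictly above y[i]) encodes the same y[i] < a.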
  8218  TEXT ·LtNumber_AVX2_F64(SB), NOSPLIT, $0-56
  8219  	MOVQ  x_base+0(FP), DI
  8220  	MOVQ  y_base+24(FP), SI
  8221  	MOVSD a+48(FP), X0
  8222  	MOVQ  x_len+8(FP), DX
  8223  	TESTQ DX, DX
  8224  	JE    LBB12_7
  8225  	CMPQ  DX, $0x10
  8226  	JAE   LBB12_3
  8227  	XORL  AX, AX
  8228  	JMP   LBB12_6
  8229  
  8230  LBB12_3:
  8231  	MOVQ         DX, AX
  8232  	ANDQ         $-16, AX
  8233  	VBROADCASTSD X0, Y1
  8234  	XORL         CX, CX
  8235  	VMOVDQU      dataLtNumberF64<>+0(SB), X2
  8236  
  8237  LBB12_4:
  8238  	VMOVUPD      (SI)(CX*8), Y3
  8239  	VMOVUPD      32(SI)(CX*8), Y4
  8240  	VMOVUPD      64(SI)(CX*8), Y5
  8241  	VMOVUPD      96(SI)(CX*8), Y6
  8242  	VCMPPD       $0x01, Y1, Y3, Y3
  8243  	VEXTRACTF128 $0x01, Y3, X7
  8244  	VPACKSSDW    X7, X3, X3
  8245  	VPACKSSDW    X3, X3, X3
  8246  	VPACKSSWB    X3, X3, X3
  8247  	VPAND        X2, X3, X3
  8248  	VCMPPD       $0x01, Y1, Y4, Y4
  8249  	VEXTRACTF128 $0x01, Y4, X7
  8250  	VPACKSSDW    X7, X4, X4
  8251  	VPACKSSDW    X4, X4, X4
  8252  	VPACKSSWB    X4, X4, X4
  8253  	VPAND        X2, X4, X4
  8254  	VPUNPCKLDQ   X4, X3, X3
  8255  	VCMPPD       $0x01, Y1, Y5, Y4
  8256  	VEXTRACTF128 $0x01, Y4, X5
  8257  	VPACKSSDW    X5, X4, X4
  8258  	VPACKSSDW    X4, X4, X4
  8259  	VPACKSSWB    X4, X4, X4
  8260  	VPAND        X2, X4, X4
  8261  	VCMPPD       $0x01, Y1, Y6, Y5
  8262  	VEXTRACTF128 $0x01, Y5, X6
  8263  	VPACKSSDW    X6, X5, X5
  8264  	VPACKSSDW    X5, X5, X5
  8265  	VPACKSSWB    X5, X5, X5
  8266  	VPAND        X2, X5, X5
  8267  	VPBROADCASTD X5, X5
  8268  	VPBROADCASTD X4, X4
  8269  	VPUNPCKLDQ   X5, X4, X4
  8270  	VPBLENDD     $0x0c, X4, X3, X3
  8271  	VMOVDQU      X3, (DI)(CX*1)
  8272  	ADDQ         $0x10, CX
  8273  	CMPQ         AX, CX
  8274  	JNE          LBB12_4
  8275  	CMPQ         AX, DX
  8276  	JE           LBB12_7
  8277  
  8278  LBB12_6:
  8279  	VUCOMISD (SI)(AX*8), X0
  8280  	SETHI    (DI)(AX*1)
  8281  	ADDQ     $0x01, AX
  8282  	CMPQ     DX, AX
  8283  	JNE      LBB12_6
  8284  
  8285  LBB12_7:
  8286  	VZEROUPPER
  8287  	RET
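
        // Minimal Go sketch of the semantics above (ltNumberGeneric is a
        // hypothetical reference, not part of the generated code):
        //
        //	func ltNumberGeneric(x []bool, y []float64, a float64) {
        //		for i := range x {
        //			x[i] = y[i] < a
        //		}
        //	}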
  8288  
  8289  DATA dataLtNumberF32<>+0(SB)/1, $0x01
  8290  DATA dataLtNumberF32<>+1(SB)/1, $0x01
  8291  DATA dataLtNumberF32<>+2(SB)/1, $0x01
  8292  DATA dataLtNumberF32<>+3(SB)/1, $0x01
  8293  DATA dataLtNumberF32<>+4(SB)/1, $0x01
  8294  DATA dataLtNumberF32<>+5(SB)/1, $0x01
  8295  DATA dataLtNumberF32<>+6(SB)/1, $0x01
  8296  DATA dataLtNumberF32<>+7(SB)/1, $0x01
  8297  DATA dataLtNumberF32<>+8(SB)/1, $0x00
  8298  DATA dataLtNumberF32<>+9(SB)/1, $0x00
  8299  DATA dataLtNumberF32<>+10(SB)/1, $0x00
  8300  DATA dataLtNumberF32<>+11(SB)/1, $0x00
  8301  DATA dataLtNumberF32<>+12(SB)/1, $0x00
  8302  DATA dataLtNumberF32<>+13(SB)/1, $0x00
  8303  DATA dataLtNumberF32<>+14(SB)/1, $0x00
  8304  DATA dataLtNumberF32<>+15(SB)/1, $0x00
  8305  GLOBL dataLtNumberF32<>(SB), RODATA|NOPTR, $16
  8306  
  8307  // func LtNumber_AVX2_F32(x []bool, y []float32, a float32)
  8308  // Requires: AVX, AVX2, SSE
  8309  TEXT ·LtNumber_AVX2_F32(SB), NOSPLIT, $0-52
  8310  	MOVQ  x_base+0(FP), DI
  8311  	MOVQ  y_base+24(FP), SI
  8312  	MOVSS a+48(FP), X0
  8313  	MOVQ  x_len+8(FP), DX
  8314  	TESTQ DX, DX
  8315  	JE    LBB13_7
  8316  	CMPQ  DX, $0x20
  8317  	JAE   LBB13_3
  8318  	XORL  AX, AX
  8319  	JMP   LBB13_6
  8320  
  8321  LBB13_3:
  8322  	MOVQ         DX, AX
  8323  	ANDQ         $-32, AX
  8324  	VBROADCASTSS X0, Y1
  8325  	XORL         CX, CX
  8326  	VMOVDQU      dataLtNumberF32<>+0(SB), X2
  8327  
  8328  LBB13_4:
  8329  	VMOVUPS      (SI)(CX*4), Y3
  8330  	VMOVUPS      32(SI)(CX*4), Y4
  8331  	VMOVUPS      64(SI)(CX*4), Y5
  8332  	VMOVUPS      96(SI)(CX*4), Y6
  8333  	VCMPPS       $0x01, Y1, Y3, Y3
  8334  	VEXTRACTF128 $0x01, Y3, X7
  8335  	VPACKSSDW    X7, X3, X3
  8336  	VPACKSSWB    X3, X3, X3
  8337  	VPAND        X2, X3, X3
  8338  	VCMPPS       $0x01, Y1, Y4, Y4
  8339  	VEXTRACTF128 $0x01, Y4, X7
  8340  	VPACKSSDW    X7, X4, X4
  8341  	VPACKSSWB    X4, X4, X4
  8342  	VPAND        X2, X4, X4
  8343  	VCMPPS       $0x01, Y1, Y5, Y5
  8344  	VEXTRACTF128 $0x01, Y5, X7
  8345  	VPACKSSDW    X7, X5, X5
  8346  	VPACKSSWB    X5, X5, X5
  8347  	VPAND        X2, X5, X5
  8348  	VCMPPS       $0x01, Y1, Y6, Y6
  8349  	VEXTRACTF128 $0x01, Y6, X7
  8350  	VPACKSSDW    X7, X6, X6
  8351  	VPACKSSWB    X6, X6, X6
  8352  	VPAND        X2, X6, X6
  8353  	VINSERTI128  $0x01, X6, Y5, Y5
  8354  	VINSERTI128  $0x01, X4, Y3, Y3
  8355  	VPUNPCKLQDQ  Y5, Y3, Y3
  8356  	VPERMQ       $0xd8, Y3, Y3
  8357  	VMOVDQU      Y3, (DI)(CX*1)
  8358  	ADDQ         $0x20, CX
  8359  	CMPQ         AX, CX
  8360  	JNE          LBB13_4
  8361  	CMPQ         AX, DX
  8362  	JE           LBB13_7
  8363  
  8364  LBB13_6:
  8365  	VUCOMISS (SI)(AX*4), X0
  8366  	SETHI    (DI)(AX*1)
  8367  	ADDQ     $0x01, AX
  8368  	CMPQ     DX, AX
  8369  	JNE      LBB13_6
  8370  
  8371  LBB13_7:
  8372  	VZEROUPPER
  8373  	RET
  8374  
  8375  DATA dataLteNumberF64<>+0(SB)/1, $0x01
  8376  DATA dataLteNumberF64<>+1(SB)/1, $0x01
  8377  DATA dataLteNumberF64<>+2(SB)/1, $0x01
  8378  DATA dataLteNumberF64<>+3(SB)/1, $0x01
  8379  DATA dataLteNumberF64<>+4(SB)/1, $0x00
  8380  DATA dataLteNumberF64<>+5(SB)/1, $0x00
  8381  DATA dataLteNumberF64<>+6(SB)/1, $0x00
  8382  DATA dataLteNumberF64<>+7(SB)/1, $0x00
  8383  DATA dataLteNumberF64<>+8(SB)/1, $0x00
  8384  DATA dataLteNumberF64<>+9(SB)/1, $0x00
  8385  DATA dataLteNumberF64<>+10(SB)/1, $0x00
  8386  DATA dataLteNumberF64<>+11(SB)/1, $0x00
  8387  DATA dataLteNumberF64<>+12(SB)/1, $0x00
  8388  DATA dataLteNumberF64<>+13(SB)/1, $0x00
  8389  DATA dataLteNumberF64<>+14(SB)/1, $0x00
  8390  DATA dataLteNumberF64<>+15(SB)/1, $0x00
  8391  GLOBL dataLteNumberF64<>(SB), RODATA|NOPTR, $16
  8392  
  8393  // func LteNumber_AVX2_F64(x []bool, y []float64, a float64)
  8394  // Requires: AVX, AVX2, SSE2
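        //
        // As LtNumber_AVX2_F64 with predicate $0x02 (LE_OS): the mask is
        // y[i] <= a, and the tail's SETCC (a above or equal to y[i]) encodes
        // the same relation.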
  8395  TEXT ·LteNumber_AVX2_F64(SB), NOSPLIT, $0-56
  8396  	MOVQ  x_base+0(FP), DI
  8397  	MOVQ  y_base+24(FP), SI
  8398  	MOVSD a+48(FP), X0
  8399  	MOVQ  x_len+8(FP), DX
  8400  	TESTQ DX, DX
  8401  	JE    LBB14_7
  8402  	CMPQ  DX, $0x10
  8403  	JAE   LBB14_3
  8404  	XORL  AX, AX
  8405  	JMP   LBB14_6
  8406  
  8407  LBB14_3:
  8408  	MOVQ         DX, AX
  8409  	ANDQ         $-16, AX
  8410  	VBROADCASTSD X0, Y1
  8411  	XORL         CX, CX
  8412  	VMOVDQU      dataLteNumberF64<>+0(SB), X2
  8413  
  8414  LBB14_4:
  8415  	VMOVUPD      (SI)(CX*8), Y3
  8416  	VMOVUPD      32(SI)(CX*8), Y4
  8417  	VMOVUPD      64(SI)(CX*8), Y5
  8418  	VMOVUPD      96(SI)(CX*8), Y6
  8419  	VCMPPD       $0x02, Y1, Y3, Y3
  8420  	VEXTRACTF128 $0x01, Y3, X7
  8421  	VPACKSSDW    X7, X3, X3
  8422  	VPACKSSDW    X3, X3, X3
  8423  	VPACKSSWB    X3, X3, X3
  8424  	VPAND        X2, X3, X3
  8425  	VCMPPD       $0x02, Y1, Y4, Y4
  8426  	VEXTRACTF128 $0x01, Y4, X7
  8427  	VPACKSSDW    X7, X4, X4
  8428  	VPACKSSDW    X4, X4, X4
  8429  	VPACKSSWB    X4, X4, X4
  8430  	VPAND        X2, X4, X4
  8431  	VPUNPCKLDQ   X4, X3, X3
  8432  	VCMPPD       $0x02, Y1, Y5, Y4
  8433  	VEXTRACTF128 $0x01, Y4, X5
  8434  	VPACKSSDW    X5, X4, X4
  8435  	VPACKSSDW    X4, X4, X4
  8436  	VPACKSSWB    X4, X4, X4
  8437  	VPAND        X2, X4, X4
  8438  	VCMPPD       $0x02, Y1, Y6, Y5
  8439  	VEXTRACTF128 $0x01, Y5, X6
  8440  	VPACKSSDW    X6, X5, X5
  8441  	VPACKSSDW    X5, X5, X5
  8442  	VPACKSSWB    X5, X5, X5
  8443  	VPAND        X2, X5, X5
  8444  	VPBROADCASTD X5, X5
  8445  	VPBROADCASTD X4, X4
  8446  	VPUNPCKLDQ   X5, X4, X4
  8447  	VPBLENDD     $0x0c, X4, X3, X3
  8448  	VMOVDQU      X3, (DI)(CX*1)
  8449  	ADDQ         $0x10, CX
  8450  	CMPQ         AX, CX
  8451  	JNE          LBB14_4
  8452  	CMPQ         AX, DX
  8453  	JE           LBB14_7
  8454  
  8455  LBB14_6:
  8456  	VUCOMISD (SI)(AX*8), X0
  8457  	SETCC    (DI)(AX*1)
  8458  	ADDQ     $0x01, AX
  8459  	CMPQ     DX, AX
  8460  	JNE      LBB14_6
  8461  
  8462  LBB14_7:
  8463  	VZEROUPPER
  8464  	RET
  8465  
  8466  DATA dataLteNumberF32<>+0(SB)/1, $0x01
  8467  DATA dataLteNumberF32<>+1(SB)/1, $0x01
  8468  DATA dataLteNumberF32<>+2(SB)/1, $0x01
  8469  DATA dataLteNumberF32<>+3(SB)/1, $0x01
  8470  DATA dataLteNumberF32<>+4(SB)/1, $0x01
  8471  DATA dataLteNumberF32<>+5(SB)/1, $0x01
  8472  DATA dataLteNumberF32<>+6(SB)/1, $0x01
  8473  DATA dataLteNumberF32<>+7(SB)/1, $0x01
  8474  DATA dataLteNumberF32<>+8(SB)/1, $0x00
  8475  DATA dataLteNumberF32<>+9(SB)/1, $0x00
  8476  DATA dataLteNumberF32<>+10(SB)/1, $0x00
  8477  DATA dataLteNumberF32<>+11(SB)/1, $0x00
  8478  DATA dataLteNumberF32<>+12(SB)/1, $0x00
  8479  DATA dataLteNumberF32<>+13(SB)/1, $0x00
  8480  DATA dataLteNumberF32<>+14(SB)/1, $0x00
  8481  DATA dataLteNumberF32<>+15(SB)/1, $0x00
  8482  GLOBL dataLteNumberF32<>(SB), RODATA|NOPTR, $16
  8483  
  8484  // func LteNumber_AVX2_F32(x []bool, y []float32, a float32)
  8485  // Requires: AVX, AVX2, SSE
  8486  TEXT ·LteNumber_AVX2_F32(SB), NOSPLIT, $0-52
  8487  	MOVQ  x_base+0(FP), DI
  8488  	MOVQ  y_base+24(FP), SI
  8489  	MOVSS a+48(FP), X0
  8490  	MOVQ  x_len+8(FP), DX
  8491  	TESTQ DX, DX
  8492  	JE    LBB15_7
  8493  	CMPQ  DX, $0x20
  8494  	JAE   LBB15_3
  8495  	XORL  AX, AX
  8496  	JMP   LBB15_6
  8497  
  8498  LBB15_3:
  8499  	MOVQ         DX, AX
  8500  	ANDQ         $-32, AX
  8501  	VBROADCASTSS X0, Y1
  8502  	XORL         CX, CX
  8503  	VMOVDQU      dataLteNumberF32<>+0(SB), X2
  8504  
  8505  LBB15_4:
  8506  	VMOVUPS      (SI)(CX*4), Y3
  8507  	VMOVUPS      32(SI)(CX*4), Y4
  8508  	VMOVUPS      64(SI)(CX*4), Y5
  8509  	VMOVUPS      96(SI)(CX*4), Y6
  8510  	VCMPPS       $0x02, Y1, Y3, Y3
  8511  	VEXTRACTF128 $0x01, Y3, X7
  8512  	VPACKSSDW    X7, X3, X3
  8513  	VPACKSSWB    X3, X3, X3
  8514  	VPAND        X2, X3, X3
  8515  	VCMPPS       $0x02, Y1, Y4, Y4
  8516  	VEXTRACTF128 $0x01, Y4, X7
  8517  	VPACKSSDW    X7, X4, X4
  8518  	VPACKSSWB    X4, X4, X4
  8519  	VPAND        X2, X4, X4
  8520  	VCMPPS       $0x02, Y1, Y5, Y5
  8521  	VEXTRACTF128 $0x01, Y5, X7
  8522  	VPACKSSDW    X7, X5, X5
  8523  	VPACKSSWB    X5, X5, X5
  8524  	VPAND        X2, X5, X5
  8525  	VCMPPS       $0x02, Y1, Y6, Y6
  8526  	VEXTRACTF128 $0x01, Y6, X7
  8527  	VPACKSSDW    X7, X6, X6
  8528  	VPACKSSWB    X6, X6, X6
  8529  	VPAND        X2, X6, X6
  8530  	VINSERTI128  $0x01, X6, Y5, Y5
  8531  	VINSERTI128  $0x01, X4, Y3, Y3
  8532  	VPUNPCKLQDQ  Y5, Y3, Y3
  8533  	VPERMQ       $0xd8, Y3, Y3
  8534  	VMOVDQU      Y3, (DI)(CX*1)
  8535  	ADDQ         $0x20, CX
  8536  	CMPQ         AX, CX
  8537  	JNE          LBB15_4
  8538  	CMPQ         AX, DX
  8539  	JE           LBB15_7
  8540  
  8541  LBB15_6:
  8542  	VUCOMISS (SI)(AX*4), X0
  8543  	SETCC    (DI)(AX*1)
  8544  	ADDQ     $0x01, AX
  8545  	CMPQ     DX, AX
  8546  	JNE      LBB15_6
  8547  
  8548  LBB15_7:
  8549  	VZEROUPPER
  8550  	RET
  8551  
  8552  DATA dataGtNumberF64<>+0(SB)/1, $0x01
  8553  DATA dataGtNumberF64<>+1(SB)/1, $0x01
  8554  DATA dataGtNumberF64<>+2(SB)/1, $0x01
  8555  DATA dataGtNumberF64<>+3(SB)/1, $0x01
  8556  DATA dataGtNumberF64<>+4(SB)/1, $0x00
  8557  DATA dataGtNumberF64<>+5(SB)/1, $0x00
  8558  DATA dataGtNumberF64<>+6(SB)/1, $0x00
  8559  DATA dataGtNumberF64<>+7(SB)/1, $0x00
  8560  DATA dataGtNumberF64<>+8(SB)/1, $0x00
  8561  DATA dataGtNumberF64<>+9(SB)/1, $0x00
  8562  DATA dataGtNumberF64<>+10(SB)/1, $0x00
  8563  DATA dataGtNumberF64<>+11(SB)/1, $0x00
  8564  DATA dataGtNumberF64<>+12(SB)/1, $0x00
  8565  DATA dataGtNumberF64<>+13(SB)/1, $0x00
  8566  DATA dataGtNumberF64<>+14(SB)/1, $0x00
  8567  DATA dataGtNumberF64<>+15(SB)/1, $0x00
  8568  GLOBL dataGtNumberF64<>(SB), RODATA|NOPTR, $16
  8569  
  8570  // func GtNumber_AVX2_F64(x []bool, y []float64, a float64)
  8571  // Requires: AVX, AVX2, SSE2
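        //
        // Here the broadcast a is the first compare source, so predicate $0x01
        // (LT_OS) yields a < y[i], i.e. y[i] > a; the loads are folded into
        // the VCMPPD memory operands instead of separate VMOVUPDs. The tail's
        // SETCS (a below y[i]) matches.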
  8572  TEXT ·GtNumber_AVX2_F64(SB), NOSPLIT, $0-56
  8573  	MOVQ  x_base+0(FP), DI
  8574  	MOVQ  y_base+24(FP), SI
  8575  	MOVSD a+48(FP), X0
  8576  	MOVQ  x_len+8(FP), DX
  8577  	TESTQ DX, DX
  8578  	JE    LBB16_7
  8579  	CMPQ  DX, $0x10
  8580  	JAE   LBB16_3
  8581  	XORL  AX, AX
  8582  	JMP   LBB16_6
  8583  
  8584  LBB16_3:
  8585  	MOVQ         DX, AX
  8586  	ANDQ         $-16, AX
  8587  	VBROADCASTSD X0, Y1
  8588  	XORL         CX, CX
  8589  	VMOVDQU      dataGtNumberF64<>+0(SB), X2
  8590  
  8591  LBB16_4:
  8592  	VCMPPD       $0x01, (SI)(CX*8), Y1, Y3
  8593  	VEXTRACTF128 $0x01, Y3, X4
  8594  	VPACKSSDW    X4, X3, X3
  8595  	VPACKSSDW    X3, X3, X3
  8596  	VPACKSSWB    X3, X3, X3
  8597  	VCMPPD       $0x01, 32(SI)(CX*8), Y1, Y4
  8598  	VPAND        X2, X3, X3
  8599  	VEXTRACTF128 $0x01, Y4, X5
  8600  	VPACKSSDW    X5, X4, X4
  8601  	VPACKSSDW    X4, X4, X4
  8602  	VPACKSSWB    X4, X4, X4
  8603  	VPAND        X2, X4, X4
  8604  	VCMPPD       $0x01, 64(SI)(CX*8), Y1, Y5
  8605  	VPUNPCKLDQ   X4, X3, X3
  8606  	VEXTRACTF128 $0x01, Y5, X4
  8607  	VPACKSSDW    X4, X5, X4
  8608  	VPACKSSDW    X4, X4, X4
  8609  	VPACKSSWB    X4, X4, X4
  8610  	VPAND        X2, X4, X4
  8611  	VCMPPD       $0x01, 96(SI)(CX*8), Y1, Y5
  8612  	VEXTRACTF128 $0x01, Y5, X6
  8613  	VPACKSSDW    X6, X5, X5
  8614  	VPACKSSDW    X5, X5, X5
  8615  	VPACKSSWB    X5, X5, X5
  8616  	VPAND        X2, X5, X5
  8617  	VPBROADCASTD X5, X5
  8618  	VPBROADCASTD X4, X4
  8619  	VPUNPCKLDQ   X5, X4, X4
  8620  	VPBLENDD     $0x0c, X4, X3, X3
  8621  	VMOVDQU      X3, (DI)(CX*1)
  8622  	ADDQ         $0x10, CX
  8623  	CMPQ         AX, CX
  8624  	JNE          LBB16_4
  8625  	CMPQ         AX, DX
  8626  	JE           LBB16_7
  8627  
  8628  LBB16_6:
  8629  	VUCOMISD (SI)(AX*8), X0
  8630  	SETCS    (DI)(AX*1)
  8631  	ADDQ     $0x01, AX
  8632  	CMPQ     DX, AX
  8633  	JNE      LBB16_6
  8634  
  8635  LBB16_7:
  8636  	VZEROUPPER
  8637  	RET
  8638  
  8639  DATA dataGtNumberF32<>+0(SB)/1, $0x01
  8640  DATA dataGtNumberF32<>+1(SB)/1, $0x01
  8641  DATA dataGtNumberF32<>+2(SB)/1, $0x01
  8642  DATA dataGtNumberF32<>+3(SB)/1, $0x01
  8643  DATA dataGtNumberF32<>+4(SB)/1, $0x01
  8644  DATA dataGtNumberF32<>+5(SB)/1, $0x01
  8645  DATA dataGtNumberF32<>+6(SB)/1, $0x01
  8646  DATA dataGtNumberF32<>+7(SB)/1, $0x01
  8647  DATA dataGtNumberF32<>+8(SB)/1, $0x00
  8648  DATA dataGtNumberF32<>+9(SB)/1, $0x00
  8649  DATA dataGtNumberF32<>+10(SB)/1, $0x00
  8650  DATA dataGtNumberF32<>+11(SB)/1, $0x00
  8651  DATA dataGtNumberF32<>+12(SB)/1, $0x00
  8652  DATA dataGtNumberF32<>+13(SB)/1, $0x00
  8653  DATA dataGtNumberF32<>+14(SB)/1, $0x00
  8654  DATA dataGtNumberF32<>+15(SB)/1, $0x00
  8655  GLOBL dataGtNumberF32<>(SB), RODATA|NOPTR, $16
  8656  
  8657  // func GtNumber_AVX2_F32(x []bool, y []float32, a float32)
  8658  // Requires: AVX, AVX2, SSE
  8659  TEXT ·GtNumber_AVX2_F32(SB), NOSPLIT, $0-52
  8660  	MOVQ  x_base+0(FP), DI
  8661  	MOVQ  y_base+24(FP), SI
  8662  	MOVSS a+48(FP), X0
  8663  	MOVQ  x_len+8(FP), DX
  8664  	TESTQ DX, DX
  8665  	JE    LBB17_7
  8666  	CMPQ  DX, $0x20
  8667  	JAE   LBB17_3
  8668  	XORL  AX, AX
  8669  	JMP   LBB17_6
  8670  
  8671  LBB17_3:
  8672  	MOVQ         DX, AX
  8673  	ANDQ         $-32, AX
  8674  	VBROADCASTSS X0, Y1
  8675  	XORL         CX, CX
  8676  	VMOVDQU      dataGtNumberF32<>+0(SB), X2
  8677  
  8678  LBB17_4:
  8679  	VCMPPS       $0x01, (SI)(CX*4), Y1, Y3
  8680  	VEXTRACTF128 $0x01, Y3, X4
  8681  	VPACKSSDW    X4, X3, X3
  8682  	VPACKSSWB    X3, X3, X3
  8683  	VCMPPS       $0x01, 32(SI)(CX*4), Y1, Y4
  8684  	VPAND        X2, X3, X3
  8685  	VEXTRACTF128 $0x01, Y4, X5
  8686  	VPACKSSDW    X5, X4, X4
  8687  	VPACKSSWB    X4, X4, X4
  8688  	VPAND        X2, X4, X4
  8689  	VCMPPS       $0x01, 64(SI)(CX*4), Y1, Y5
  8690  	VEXTRACTF128 $0x01, Y5, X6
  8691  	VPACKSSDW    X6, X5, X5
  8692  	VPACKSSWB    X5, X5, X5
  8693  	VCMPPS       $0x01, 96(SI)(CX*4), Y1, Y6
  8694  	VPAND        X2, X5, X5
  8695  	VEXTRACTF128 $0x01, Y6, X7
  8696  	VPACKSSDW    X7, X6, X6
  8697  	VPACKSSWB    X6, X6, X6
  8698  	VPAND        X2, X6, X6
  8699  	VINSERTI128  $0x01, X6, Y5, Y5
  8700  	VINSERTI128  $0x01, X4, Y3, Y3
  8701  	VPUNPCKLQDQ  Y5, Y3, Y3
  8702  	VPERMQ       $0xd8, Y3, Y3
  8703  	VMOVDQU      Y3, (DI)(CX*1)
  8704  	ADDQ         $0x20, CX
  8705  	CMPQ         AX, CX
  8706  	JNE          LBB17_4
  8707  	CMPQ         AX, DX
  8708  	JE           LBB17_7
  8709  
  8710  LBB17_6:
  8711  	VUCOMISS (SI)(AX*4), X0
  8712  	SETCS    (DI)(AX*1)
  8713  	ADDQ     $0x01, AX
  8714  	CMPQ     DX, AX
  8715  	JNE      LBB17_6
  8716  
  8717  LBB17_7:
  8718  	VZEROUPPER
  8719  	RET
  8720  
  8721  DATA dataGteNumberF64<>+0(SB)/1, $0x01
  8722  DATA dataGteNumberF64<>+1(SB)/1, $0x01
  8723  DATA dataGteNumberF64<>+2(SB)/1, $0x01
  8724  DATA dataGteNumberF64<>+3(SB)/1, $0x01
  8725  DATA dataGteNumberF64<>+4(SB)/1, $0x00
  8726  DATA dataGteNumberF64<>+5(SB)/1, $0x00
  8727  DATA dataGteNumberF64<>+6(SB)/1, $0x00
  8728  DATA dataGteNumberF64<>+7(SB)/1, $0x00
  8729  DATA dataGteNumberF64<>+8(SB)/1, $0x00
  8730  DATA dataGteNumberF64<>+9(SB)/1, $0x00
  8731  DATA dataGteNumberF64<>+10(SB)/1, $0x00
  8732  DATA dataGteNumberF64<>+11(SB)/1, $0x00
  8733  DATA dataGteNumberF64<>+12(SB)/1, $0x00
  8734  DATA dataGteNumberF64<>+13(SB)/1, $0x00
  8735  DATA dataGteNumberF64<>+14(SB)/1, $0x00
  8736  DATA dataGteNumberF64<>+15(SB)/1, $0x00
  8737  GLOBL dataGteNumberF64<>(SB), RODATA|NOPTR, $16
  8738  
  8739  // func GteNumber_AVX2_F64(x []bool, y []float64, a float64)
  8740  // Requires: AVX, AVX2, SSE2
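        //
        // Predicate $0x02 (LE_OS) with the broadcast a as first source:
        // a <= y[i], i.e. y[i] >= a. The tail's SETLS (a below or same)
        // matches.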
  8741  TEXT ·GteNumber_AVX2_F64(SB), NOSPLIT, $0-56
  8742  	MOVQ  x_base+0(FP), DI
  8743  	MOVQ  y_base+24(FP), SI
  8744  	MOVSD a+48(FP), X0
  8745  	MOVQ  x_len+8(FP), DX
  8746  	TESTQ DX, DX
  8747  	JE    LBB18_7
  8748  	CMPQ  DX, $0x10
  8749  	JAE   LBB18_3
  8750  	XORL  AX, AX
  8751  	JMP   LBB18_6
  8752  
  8753  LBB18_3:
  8754  	MOVQ         DX, AX
  8755  	ANDQ         $-16, AX
  8756  	VBROADCASTSD X0, Y1
  8757  	XORL         CX, CX
  8758  	VMOVDQU      dataGteNumberF64<>+0(SB), X2
  8759  
  8760  LBB18_4:
  8761  	VCMPPD       $0x02, (SI)(CX*8), Y1, Y3
  8762  	VEXTRACTF128 $0x01, Y3, X4
  8763  	VPACKSSDW    X4, X3, X3
  8764  	VPACKSSDW    X3, X3, X3
  8765  	VPACKSSWB    X3, X3, X3
  8766  	VCMPPD       $0x02, 32(SI)(CX*8), Y1, Y4
  8767  	VPAND        X2, X3, X3
  8768  	VEXTRACTF128 $0x01, Y4, X5
  8769  	VPACKSSDW    X5, X4, X4
  8770  	VPACKSSDW    X4, X4, X4
  8771  	VPACKSSWB    X4, X4, X4
  8772  	VPAND        X2, X4, X4
  8773  	VCMPPD       $0x02, 64(SI)(CX*8), Y1, Y5
  8774  	VPUNPCKLDQ   X4, X3, X3
  8775  	VEXTRACTF128 $0x01, Y5, X4
  8776  	VPACKSSDW    X4, X5, X4
  8777  	VPACKSSDW    X4, X4, X4
  8778  	VPACKSSWB    X4, X4, X4
  8779  	VPAND        X2, X4, X4
  8780  	VCMPPD       $0x02, 96(SI)(CX*8), Y1, Y5
  8781  	VEXTRACTF128 $0x01, Y5, X6
  8782  	VPACKSSDW    X6, X5, X5
  8783  	VPACKSSDW    X5, X5, X5
  8784  	VPACKSSWB    X5, X5, X5
  8785  	VPAND        X2, X5, X5
  8786  	VPBROADCASTD X5, X5
  8787  	VPBROADCASTD X4, X4
  8788  	VPUNPCKLDQ   X5, X4, X4
  8789  	VPBLENDD     $0x0c, X4, X3, X3
  8790  	VMOVDQU      X3, (DI)(CX*1)
  8791  	ADDQ         $0x10, CX
  8792  	CMPQ         AX, CX
  8793  	JNE          LBB18_4
  8794  	CMPQ         AX, DX
  8795  	JE           LBB18_7
  8796  
  8797  LBB18_6:
  8798  	VUCOMISD (SI)(AX*8), X0
  8799  	SETLS    (DI)(AX*1)
  8800  	ADDQ     $0x01, AX
  8801  	CMPQ     DX, AX
  8802  	JNE      LBB18_6
  8803  
  8804  LBB18_7:
  8805  	VZEROUPPER
  8806  	RET
  8807  
  8808  DATA dataGteNumberF32<>+0(SB)/1, $0x01
  8809  DATA dataGteNumberF32<>+1(SB)/1, $0x01
  8810  DATA dataGteNumberF32<>+2(SB)/1, $0x01
  8811  DATA dataGteNumberF32<>+3(SB)/1, $0x01
  8812  DATA dataGteNumberF32<>+4(SB)/1, $0x01
  8813  DATA dataGteNumberF32<>+5(SB)/1, $0x01
  8814  DATA dataGteNumberF32<>+6(SB)/1, $0x01
  8815  DATA dataGteNumberF32<>+7(SB)/1, $0x01
  8816  DATA dataGteNumberF32<>+8(SB)/1, $0x00
  8817  DATA dataGteNumberF32<>+9(SB)/1, $0x00
  8818  DATA dataGteNumberF32<>+10(SB)/1, $0x00
  8819  DATA dataGteNumberF32<>+11(SB)/1, $0x00
  8820  DATA dataGteNumberF32<>+12(SB)/1, $0x00
  8821  DATA dataGteNumberF32<>+13(SB)/1, $0x00
  8822  DATA dataGteNumberF32<>+14(SB)/1, $0x00
  8823  DATA dataGteNumberF32<>+15(SB)/1, $0x00
  8824  GLOBL dataGteNumberF32<>(SB), RODATA|NOPTR, $16
  8825  
  8826  // func GteNumber_AVX2_F32(x []bool, y []float32, a float32)
  8827  // Requires: AVX, AVX2, SSE
  8828  TEXT ·GteNumber_AVX2_F32(SB), NOSPLIT, $0-52
  8829  	MOVQ  x_base+0(FP), DI
  8830  	MOVQ  y_base+24(FP), SI
  8831  	MOVSS a+48(FP), X0
  8832  	MOVQ  x_len+8(FP), DX
  8833  	TESTQ DX, DX
  8834  	JE    LBB19_7
  8835  	CMPQ  DX, $0x20
  8836  	JAE   LBB19_3
  8837  	XORL  AX, AX
  8838  	JMP   LBB19_6
  8839  
  8840  LBB19_3:
  8841  	MOVQ         DX, AX
  8842  	ANDQ         $-32, AX
  8843  	VBROADCASTSS X0, Y1
  8844  	XORL         CX, CX
  8845  	VMOVDQU      dataGteNumberF32<>+0(SB), X2
  8846  
  8847  LBB19_4:
  8848  	VCMPPS       $0x02, (SI)(CX*4), Y1, Y3
  8849  	VEXTRACTF128 $0x01, Y3, X4
  8850  	VPACKSSDW    X4, X3, X3
  8851  	VPACKSSWB    X3, X3, X3
  8852  	VCMPPS       $0x02, 32(SI)(CX*4), Y1, Y4
  8853  	VPAND        X2, X3, X3
  8854  	VEXTRACTF128 $0x01, Y4, X5
  8855  	VPACKSSDW    X5, X4, X4
  8856  	VPACKSSWB    X4, X4, X4
  8857  	VPAND        X2, X4, X4
  8858  	VCMPPS       $0x02, 64(SI)(CX*4), Y1, Y5
  8859  	VEXTRACTF128 $0x01, Y5, X6
  8860  	VPACKSSDW    X6, X5, X5
  8861  	VPACKSSWB    X5, X5, X5
  8862  	VCMPPS       $0x02, 96(SI)(CX*4), Y1, Y6
  8863  	VPAND        X2, X5, X5
  8864  	VEXTRACTF128 $0x01, Y6, X7
  8865  	VPACKSSDW    X7, X6, X6
  8866  	VPACKSSWB    X6, X6, X6
  8867  	VPAND        X2, X6, X6
  8868  	VINSERTI128  $0x01, X6, Y5, Y5
  8869  	VINSERTI128  $0x01, X4, Y3, Y3
  8870  	VPUNPCKLQDQ  Y5, Y3, Y3
  8871  	VPERMQ       $0xd8, Y3, Y3
  8872  	VMOVDQU      Y3, (DI)(CX*1)
  8873  	ADDQ         $0x20, CX
  8874  	CMPQ         AX, CX
  8875  	JNE          LBB19_4
  8876  	CMPQ         AX, DX
  8877  	JE           LBB19_7
  8878  
  8879  LBB19_6:
  8880  	VUCOMISS (SI)(AX*4), X0
  8881  	SETLS    (DI)(AX*1)
  8882  	ADDQ     $0x01, AX
  8883  	CMPQ     DX, AX
  8884  	JNE      LBB19_6
  8885  
  8886  LBB19_7:
  8887  	VZEROUPPER
  8888  	RET
  8889  
  8890  DATA dataEqNumberF64<>+0(SB)/1, $0x01
  8891  DATA dataEqNumberF64<>+1(SB)/1, $0x01
  8892  DATA dataEqNumberF64<>+2(SB)/1, $0x01
  8893  DATA dataEqNumberF64<>+3(SB)/1, $0x01
  8894  DATA dataEqNumberF64<>+4(SB)/1, $0x00
  8895  DATA dataEqNumberF64<>+5(SB)/1, $0x00
  8896  DATA dataEqNumberF64<>+6(SB)/1, $0x00
  8897  DATA dataEqNumberF64<>+7(SB)/1, $0x00
  8898  DATA dataEqNumberF64<>+8(SB)/1, $0x00
  8899  DATA dataEqNumberF64<>+9(SB)/1, $0x00
  8900  DATA dataEqNumberF64<>+10(SB)/1, $0x00
  8901  DATA dataEqNumberF64<>+11(SB)/1, $0x00
  8902  DATA dataEqNumberF64<>+12(SB)/1, $0x00
  8903  DATA dataEqNumberF64<>+13(SB)/1, $0x00
  8904  DATA dataEqNumberF64<>+14(SB)/1, $0x00
  8905  DATA dataEqNumberF64<>+15(SB)/1, $0x00
  8906  GLOBL dataEqNumberF64<>(SB), RODATA|NOPTR, $16
  8907  
  8908  // func EqNumber_AVX2_F64(x []bool, y []float64, a float64)
  8909  // Requires: AVX, AVX2, SSE2
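        //
        // EQ_OQ ($0x00) against the broadcast a; the NaN caveat noted on
        // Eq_AVX2_F64 applies equally to this VUCOMISD + SETEQ tail.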
  8910  TEXT ·EqNumber_AVX2_F64(SB), NOSPLIT, $0-56
  8911  	MOVQ  x_base+0(FP), DI
  8912  	MOVQ  y_base+24(FP), SI
  8913  	MOVSD a+48(FP), X0
  8914  	MOVQ  x_len+8(FP), DX
  8915  	TESTQ DX, DX
  8916  	JE    LBB20_7
  8917  	CMPQ  DX, $0x10
  8918  	JAE   LBB20_3
  8919  	XORL  AX, AX
  8920  	JMP   LBB20_6
  8921  
  8922  LBB20_3:
  8923  	MOVQ         DX, AX
  8924  	ANDQ         $-16, AX
  8925  	VBROADCASTSD X0, Y1
  8926  	XORL         CX, CX
  8927  	VMOVDQU      dataEqNumberF64<>+0(SB), X2
  8928  
  8929  LBB20_4:
  8930  	VCMPPD       $0x00, (SI)(CX*8), Y1, Y3
  8931  	VEXTRACTF128 $0x01, Y3, X4
  8932  	VPACKSSDW    X4, X3, X3
  8933  	VPACKSSDW    X3, X3, X3
  8934  	VPACKSSWB    X3, X3, X3
  8935  	VCMPPD       $0x00, 32(SI)(CX*8), Y1, Y4
  8936  	VPAND        X2, X3, X3
  8937  	VEXTRACTF128 $0x01, Y4, X5
  8938  	VPACKSSDW    X5, X4, X4
  8939  	VPACKSSDW    X4, X4, X4
  8940  	VPACKSSWB    X4, X4, X4
  8941  	VPAND        X2, X4, X4
  8942  	VCMPPD       $0x00, 64(SI)(CX*8), Y1, Y5
  8943  	VPUNPCKLDQ   X4, X3, X3
  8944  	VEXTRACTF128 $0x01, Y5, X4
  8945  	VPACKSSDW    X4, X5, X4
  8946  	VPACKSSDW    X4, X4, X4
  8947  	VPACKSSWB    X4, X4, X4
  8948  	VPAND        X2, X4, X4
  8949  	VCMPPD       $0x00, 96(SI)(CX*8), Y1, Y5
  8950  	VEXTRACTF128 $0x01, Y5, X6
  8951  	VPACKSSDW    X6, X5, X5
  8952  	VPACKSSDW    X5, X5, X5
  8953  	VPACKSSWB    X5, X5, X5
  8954  	VPAND        X2, X5, X5
  8955  	VPBROADCASTD X5, X5
  8956  	VPBROADCASTD X4, X4
  8957  	VPUNPCKLDQ   X5, X4, X4
  8958  	VPBLENDD     $0x0c, X4, X3, X3
  8959  	VMOVDQU      X3, (DI)(CX*1)
  8960  	ADDQ         $0x10, CX
  8961  	CMPQ         AX, CX
  8962  	JNE          LBB20_4
  8963  	CMPQ         AX, DX
  8964  	JE           LBB20_7
  8965  
  8966  LBB20_6:
  8967  	VUCOMISD (SI)(AX*8), X0
  8968  	SETEQ    (DI)(AX*1)
  8969  	ADDQ     $0x01, AX
  8970  	CMPQ     DX, AX
  8971  	JNE      LBB20_6
  8972  
  8973  LBB20_7:
  8974  	VZEROUPPER
  8975  	RET
  8976  
  8977  DATA dataEqNumberF32<>+0(SB)/1, $0x01
  8978  DATA dataEqNumberF32<>+1(SB)/1, $0x01
  8979  DATA dataEqNumberF32<>+2(SB)/1, $0x01
  8980  DATA dataEqNumberF32<>+3(SB)/1, $0x01
  8981  DATA dataEqNumberF32<>+4(SB)/1, $0x01
  8982  DATA dataEqNumberF32<>+5(SB)/1, $0x01
  8983  DATA dataEqNumberF32<>+6(SB)/1, $0x01
  8984  DATA dataEqNumberF32<>+7(SB)/1, $0x01
  8985  DATA dataEqNumberF32<>+8(SB)/1, $0x00
  8986  DATA dataEqNumberF32<>+9(SB)/1, $0x00
  8987  DATA dataEqNumberF32<>+10(SB)/1, $0x00
  8988  DATA dataEqNumberF32<>+11(SB)/1, $0x00
  8989  DATA dataEqNumberF32<>+12(SB)/1, $0x00
  8990  DATA dataEqNumberF32<>+13(SB)/1, $0x00
  8991  DATA dataEqNumberF32<>+14(SB)/1, $0x00
  8992  DATA dataEqNumberF32<>+15(SB)/1, $0x00
  8993  GLOBL dataEqNumberF32<>(SB), RODATA|NOPTR, $16
  8994  
  8995  // func EqNumber_AVX2_F32(x []bool, y []float32, a float32)
  8996  // Requires: AVX, AVX2, SSE
  8997  TEXT ·EqNumber_AVX2_F32(SB), NOSPLIT, $0-52
  8998  	MOVQ  x_base+0(FP), DI
  8999  	MOVQ  y_base+24(FP), SI
  9000  	MOVSS a+48(FP), X0
  9001  	MOVQ  x_len+8(FP), DX
  9002  	TESTQ DX, DX
  9003  	JE    LBB21_7
  9004  	CMPQ  DX, $0x20
  9005  	JAE   LBB21_3
  9006  	XORL  AX, AX
  9007  	JMP   LBB21_6
  9008  
  9009  LBB21_3:
  9010  	MOVQ         DX, AX
  9011  	ANDQ         $-32, AX
  9012  	VBROADCASTSS X0, Y1
  9013  	XORL         CX, CX
  9014  	VMOVDQU      dataEqNumberF32<>+0(SB), X2
  9015  
  9016  LBB21_4:
  9017  	VCMPPS       $0x00, (SI)(CX*4), Y1, Y3
  9018  	VEXTRACTF128 $0x01, Y3, X4
  9019  	VPACKSSDW    X4, X3, X3
  9020  	VPACKSSWB    X3, X3, X3
  9021  	VCMPPS       $0x00, 32(SI)(CX*4), Y1, Y4
  9022  	VPAND        X2, X3, X3
  9023  	VEXTRACTF128 $0x01, Y4, X5
  9024  	VPACKSSDW    X5, X4, X4
  9025  	VPACKSSWB    X4, X4, X4
  9026  	VPAND        X2, X4, X4
  9027  	VCMPPS       $0x00, 64(SI)(CX*4), Y1, Y5
  9028  	VEXTRACTF128 $0x01, Y5, X6
  9029  	VPACKSSDW    X6, X5, X5
  9030  	VPACKSSWB    X5, X5, X5
  9031  	VCMPPS       $0x00, 96(SI)(CX*4), Y1, Y6
  9032  	VPAND        X2, X5, X5
  9033  	VEXTRACTF128 $0x01, Y6, X7
  9034  	VPACKSSDW    X7, X6, X6
  9035  	VPACKSSWB    X6, X6, X6
  9036  	VPAND        X2, X6, X6
  9037  	VINSERTI128  $0x01, X6, Y5, Y5
  9038  	VINSERTI128  $0x01, X4, Y3, Y3
  9039  	VPUNPCKLQDQ  Y5, Y3, Y3
  9040  	VPERMQ       $0xd8, Y3, Y3
  9041  	VMOVDQU      Y3, (DI)(CX*1)
  9042  	ADDQ         $0x20, CX
  9043  	CMPQ         AX, CX
  9044  	JNE          LBB21_4
  9045  	CMPQ         AX, DX
  9046  	JE           LBB21_7
  9047  
  9048  LBB21_6:
  9049  	VUCOMISS (SI)(AX*4), X0
  9050  	SETEQ    (DI)(AX*1)
  9051  	ADDQ     $0x01, AX
  9052  	CMPQ     DX, AX
  9053  	JNE      LBB21_6
  9054  
  9055  LBB21_7:
  9056  	VZEROUPPER
  9057  	RET
  9058  
  9059  DATA dataNeqNumberF64<>+0(SB)/1, $0x01
  9060  DATA dataNeqNumberF64<>+1(SB)/1, $0x01
  9061  DATA dataNeqNumberF64<>+2(SB)/1, $0x01
  9062  DATA dataNeqNumberF64<>+3(SB)/1, $0x01
  9063  DATA dataNeqNumberF64<>+4(SB)/1, $0x00
  9064  DATA dataNeqNumberF64<>+5(SB)/1, $0x00
  9065  DATA dataNeqNumberF64<>+6(SB)/1, $0x00
  9066  DATA dataNeqNumberF64<>+7(SB)/1, $0x00
  9067  DATA dataNeqNumberF64<>+8(SB)/1, $0x00
  9068  DATA dataNeqNumberF64<>+9(SB)/1, $0x00
  9069  DATA dataNeqNumberF64<>+10(SB)/1, $0x00
  9070  DATA dataNeqNumberF64<>+11(SB)/1, $0x00
  9071  DATA dataNeqNumberF64<>+12(SB)/1, $0x00
  9072  DATA dataNeqNumberF64<>+13(SB)/1, $0x00
  9073  DATA dataNeqNumberF64<>+14(SB)/1, $0x00
  9074  DATA dataNeqNumberF64<>+15(SB)/1, $0x00
  9075  GLOBL dataNeqNumberF64<>(SB), RODATA|NOPTR, $16
  9076  
  9077  // func NeqNumber_AVX2_F64(x []bool, y []float64, a float64)
  9078  // Requires: AVX, AVX2, SSE2
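        //
        // NEQ_UQ ($0x04) against the broadcast a; the NaN caveat noted on
        // Neq_AVX2_F64 applies equally to this VUCOMISD + SETNE tail.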
  9079  TEXT ·NeqNumber_AVX2_F64(SB), NOSPLIT, $0-56
  9080  	MOVQ  x_base+0(FP), DI
  9081  	MOVQ  y_base+24(FP), SI
  9082  	MOVSD a+48(FP), X0
  9083  	MOVQ  x_len+8(FP), DX
  9084  	TESTQ DX, DX
  9085  	JE    LBB22_7
  9086  	CMPQ  DX, $0x10
  9087  	JAE   LBB22_3
  9088  	XORL  AX, AX
  9089  	JMP   LBB22_6
  9090  
  9091  LBB22_3:
  9092  	MOVQ         DX, AX
  9093  	ANDQ         $-16, AX
  9094  	VBROADCASTSD X0, Y1
  9095  	XORL         CX, CX
  9096  	VMOVDQU      dataNeqNumberF64<>+0(SB), X2
  9097  
  9098  LBB22_4:
  9099  	VCMPPD       $0x04, (SI)(CX*8), Y1, Y3
  9100  	VEXTRACTF128 $0x01, Y3, X4
  9101  	VPACKSSDW    X4, X3, X3
  9102  	VPACKSSDW    X3, X3, X3
  9103  	VPACKSSWB    X3, X3, X3
  9104  	VCMPPD       $0x04, 32(SI)(CX*8), Y1, Y4
  9105  	VPAND        X2, X3, X3
  9106  	VEXTRACTF128 $0x01, Y4, X5
  9107  	VPACKSSDW    X5, X4, X4
  9108  	VPACKSSDW    X4, X4, X4
  9109  	VPACKSSWB    X4, X4, X4
  9110  	VPAND        X2, X4, X4
  9111  	VCMPPD       $0x04, 64(SI)(CX*8), Y1, Y5
  9112  	VPUNPCKLDQ   X4, X3, X3
  9113  	VEXTRACTF128 $0x01, Y5, X4
  9114  	VPACKSSDW    X4, X5, X4
  9115  	VPACKSSDW    X4, X4, X4
  9116  	VPACKSSWB    X4, X4, X4
  9117  	VPAND        X2, X4, X4
  9118  	VCMPPD       $0x04, 96(SI)(CX*8), Y1, Y5
  9119  	VEXTRACTF128 $0x01, Y5, X6
  9120  	VPACKSSDW    X6, X5, X5
  9121  	VPACKSSDW    X5, X5, X5
  9122  	VPACKSSWB    X5, X5, X5
  9123  	VPAND        X2, X5, X5
  9124  	VPBROADCASTD X5, X5
  9125  	VPBROADCASTD X4, X4
  9126  	VPUNPCKLDQ   X5, X4, X4
  9127  	VPBLENDD     $0x0c, X4, X3, X3
  9128  	VMOVDQU      X3, (DI)(CX*1)
  9129  	ADDQ         $0x10, CX
  9130  	CMPQ         AX, CX
  9131  	JNE          LBB22_4
  9132  	CMPQ         AX, DX
  9133  	JE           LBB22_7
  9134  
  9135  LBB22_6:
  9136  	VUCOMISD (SI)(AX*8), X0
  9137  	SETNE    (DI)(AX*1)
  9138  	ADDQ     $0x01, AX
  9139  	CMPQ     DX, AX
  9140  	JNE      LBB22_6
  9141  
  9142  LBB22_7:
  9143  	VZEROUPPER
  9144  	RET
  9145  
  9146  DATA dataNeqNumberF32<>+0(SB)/1, $0x01
  9147  DATA dataNeqNumberF32<>+1(SB)/1, $0x01
  9148  DATA dataNeqNumberF32<>+2(SB)/1, $0x01
  9149  DATA dataNeqNumberF32<>+3(SB)/1, $0x01
  9150  DATA dataNeqNumberF32<>+4(SB)/1, $0x01
  9151  DATA dataNeqNumberF32<>+5(SB)/1, $0x01
  9152  DATA dataNeqNumberF32<>+6(SB)/1, $0x01
  9153  DATA dataNeqNumberF32<>+7(SB)/1, $0x01
  9154  DATA dataNeqNumberF32<>+8(SB)/1, $0x00
  9155  DATA dataNeqNumberF32<>+9(SB)/1, $0x00
  9156  DATA dataNeqNumberF32<>+10(SB)/1, $0x00
  9157  DATA dataNeqNumberF32<>+11(SB)/1, $0x00
  9158  DATA dataNeqNumberF32<>+12(SB)/1, $0x00
  9159  DATA dataNeqNumberF32<>+13(SB)/1, $0x00
  9160  DATA dataNeqNumberF32<>+14(SB)/1, $0x00
  9161  DATA dataNeqNumberF32<>+15(SB)/1, $0x00
  9162  GLOBL dataNeqNumberF32<>(SB), RODATA|NOPTR, $16
  9163  
  9164  // func NeqNumber_AVX2_F32(x []bool, y []float32, a float32)
  9165  // Requires: AVX, AVX2, SSE
  9166  TEXT ·NeqNumber_AVX2_F32(SB), NOSPLIT, $0-52
  9167  	MOVQ  x_base+0(FP), DI
  9168  	MOVQ  y_base+24(FP), SI
  9169  	MOVSS a+48(FP), X0
  9170  	MOVQ  x_len+8(FP), DX
  9171  	TESTQ DX, DX
  9172  	JE    LBB23_7
  9173  	CMPQ  DX, $0x20
  9174  	JAE   LBB23_3
  9175  	XORL  AX, AX
  9176  	JMP   LBB23_6
  9177  
  9178  LBB23_3:
  9179  	MOVQ         DX, AX
  9180  	ANDQ         $-32, AX
  9181  	VBROADCASTSS X0, Y1
  9182  	XORL         CX, CX
  9183  	VMOVDQU      dataNeqNumberF32<>+0(SB), X2
  9184  
  9185  LBB23_4:
  9186  	VCMPPS       $0x04, (SI)(CX*4), Y1, Y3
  9187  	VEXTRACTF128 $0x01, Y3, X4
  9188  	VPACKSSDW    X4, X3, X3
  9189  	VPACKSSWB    X3, X3, X3
  9190  	VCMPPS       $0x04, 32(SI)(CX*4), Y1, Y4
  9191  	VPAND        X2, X3, X3
  9192  	VEXTRACTF128 $0x01, Y4, X5
  9193  	VPACKSSDW    X5, X4, X4
  9194  	VPACKSSWB    X4, X4, X4
  9195  	VPAND        X2, X4, X4
  9196  	VCMPPS       $0x04, 64(SI)(CX*4), Y1, Y5
  9197  	VEXTRACTF128 $0x01, Y5, X6
  9198  	VPACKSSDW    X6, X5, X5
  9199  	VPACKSSWB    X5, X5, X5
  9200  	VCMPPS       $0x04, 96(SI)(CX*4), Y1, Y6
  9201  	VPAND        X2, X5, X5
  9202  	VEXTRACTF128 $0x01, Y6, X7
  9203  	VPACKSSDW    X7, X6, X6
  9204  	VPACKSSWB    X6, X6, X6
  9205  	VPAND        X2, X6, X6
  9206  	VINSERTI128  $0x01, X6, Y5, Y5
  9207  	VINSERTI128  $0x01, X4, Y3, Y3
  9208  	VPUNPCKLQDQ  Y5, Y3, Y3
  9209  	VPERMQ       $0xd8, Y3, Y3
  9210  	VMOVDQU      Y3, (DI)(CX*1)
  9211  	ADDQ         $0x20, CX
  9212  	CMPQ         AX, CX
  9213  	JNE          LBB23_4
  9214  	CMPQ         AX, DX
  9215  	JE           LBB23_7
  9216  
  9217  LBB23_6:
  9218  	VUCOMISS (SI)(AX*4), X0
  9219  	SETNE    (DI)(AX*1)
  9220  	ADDQ     $0x01, AX
  9221  	CMPQ     DX, AX
  9222  	JNE      LBB23_6
  9223  
  9224  LBB23_7:
  9225  	VZEROUPPER
  9226  	RET
  9227  
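        // dataNot holds 32 bytes of 0x01: XORing a canonical Go bool (0 or 1)
        // with 0x01 negates it.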
  9228  DATA dataNot<>+0(SB)/1, $0x01
  9229  DATA dataNot<>+1(SB)/1, $0x01
  9230  DATA dataNot<>+2(SB)/1, $0x01
  9231  DATA dataNot<>+3(SB)/1, $0x01
  9232  DATA dataNot<>+4(SB)/1, $0x01
  9233  DATA dataNot<>+5(SB)/1, $0x01
  9234  DATA dataNot<>+6(SB)/1, $0x01
  9235  DATA dataNot<>+7(SB)/1, $0x01
  9236  DATA dataNot<>+8(SB)/1, $0x01
  9237  DATA dataNot<>+9(SB)/1, $0x01
  9238  DATA dataNot<>+10(SB)/1, $0x01
  9239  DATA dataNot<>+11(SB)/1, $0x01
  9240  DATA dataNot<>+12(SB)/1, $0x01
  9241  DATA dataNot<>+13(SB)/1, $0x01
  9242  DATA dataNot<>+14(SB)/1, $0x01
  9243  DATA dataNot<>+15(SB)/1, $0x01
  9244  DATA dataNot<>+16(SB)/1, $0x01
  9245  DATA dataNot<>+17(SB)/1, $0x01
  9246  DATA dataNot<>+18(SB)/1, $0x01
  9247  DATA dataNot<>+19(SB)/1, $0x01
  9248  DATA dataNot<>+20(SB)/1, $0x01
  9249  DATA dataNot<>+21(SB)/1, $0x01
  9250  DATA dataNot<>+22(SB)/1, $0x01
  9251  DATA dataNot<>+23(SB)/1, $0x01
  9252  DATA dataNot<>+24(SB)/1, $0x01
  9253  DATA dataNot<>+25(SB)/1, $0x01
  9254  DATA dataNot<>+26(SB)/1, $0x01
  9255  DATA dataNot<>+27(SB)/1, $0x01
  9256  DATA dataNot<>+28(SB)/1, $0x01
  9257  DATA dataNot<>+29(SB)/1, $0x01
  9258  DATA dataNot<>+30(SB)/1, $0x01
  9259  DATA dataNot<>+31(SB)/1, $0x01
  9260  GLOBL dataNot<>(SB), RODATA|NOPTR, $32
  9261  
  9262  // func Not_AVX2(x []bool)
  9263  // Requires: AVX
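        //
        // Not flips every bool in x in place: a 256-byte main loop (two
        // 128-byte halves per iteration, pair count in DX), an odd-block
        // fixup at LBB0_10, a 16-byte XMM loop at LBB0_14, and a per-byte
        // XORB tail at LBB0_16. The out-of-line LBB0_6 block is the entry
        // taken when exactly one 128-byte block remains. The XOR-with-1 trick
        // assumes bools are stored canonically as 0 or 1.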
  9264  TEXT ·Not_AVX2(SB), NOSPLIT, $0-24
  9265  	MOVQ  x_base+0(FP), DI
  9266  	MOVQ  x_len+8(FP), SI
  9267  	TESTQ SI, SI
  9268  	JE    LBB0_17
  9269  	CMPQ  SI, $0x10
  9270  	JAE   LBB0_3
  9271  	XORL  AX, AX
  9272  	JMP   LBB0_16
  9273  
  9274  LBB0_3:
  9275  	CMPQ SI, $0x80
  9276  	JAE  LBB0_5
  9277  	XORL AX, AX
  9278  	JMP  LBB0_13
  9279  
  9280  LBB0_5:
  9281  	MOVQ    SI, AX
  9282  	ANDQ    $-128, AX
  9283  	LEAQ    -128(AX), CX
  9284  	MOVQ    CX, R8
  9285  	SHRQ    $0x07, R8
  9286  	ADDQ    $0x01, R8
  9287  	TESTQ   CX, CX
  9288  	JE      LBB0_6
  9289  	MOVQ    R8, DX
  9290  	ANDQ    $-2, DX
  9291  	XORL    CX, CX
  9292  	VMOVUPS dataNot<>+0(SB), Y0
  9293  
  9294  LBB0_8:
  9295  	VXORPS  (DI)(CX*1), Y0, Y1
  9296  	VXORPS  32(DI)(CX*1), Y0, Y2
  9297  	VXORPS  64(DI)(CX*1), Y0, Y3
  9298  	VXORPS  96(DI)(CX*1), Y0, Y4
  9299  	VMOVUPS Y1, (DI)(CX*1)
  9300  	VMOVUPS Y2, 32(DI)(CX*1)
  9301  	VMOVUPS Y3, 64(DI)(CX*1)
  9302  	VMOVUPS Y4, 96(DI)(CX*1)
  9303  	VXORPS  128(DI)(CX*1), Y0, Y1
  9304  	VXORPS  160(DI)(CX*1), Y0, Y2
  9305  	VXORPS  192(DI)(CX*1), Y0, Y3
  9306  	VXORPS  224(DI)(CX*1), Y0, Y4
  9307  	VMOVUPS Y1, 128(DI)(CX*1)
  9308  	VMOVUPS Y2, 160(DI)(CX*1)
  9309  	VMOVUPS Y3, 192(DI)(CX*1)
  9310  	VMOVUPS Y4, 224(DI)(CX*1)
  9311  	ADDQ    $+256, CX
  9312  	ADDQ    $-2, DX
  9313  	JNE     LBB0_8
  9314  	TESTB   $0x01, R8
  9315  	JE      LBB0_11
  9316  
  9317  LBB0_10:
  9318  	VMOVUPS dataNot<>+0(SB), Y0
  9319  	VXORPS  (DI)(CX*1), Y0, Y1
  9320  	VXORPS  32(DI)(CX*1), Y0, Y2
  9321  	VXORPS  64(DI)(CX*1), Y0, Y3
  9322  	VXORPS  96(DI)(CX*1), Y0, Y0
  9323  	VMOVUPS Y1, (DI)(CX*1)
  9324  	VMOVUPS Y2, 32(DI)(CX*1)
  9325  	VMOVUPS Y3, 64(DI)(CX*1)
  9326  	VMOVUPS Y0, 96(DI)(CX*1)
  9327  
  9328  LBB0_11:
  9329  	CMPQ  AX, SI
  9330  	JE    LBB0_17
  9331  	TESTB $0x70, SI
  9332  	JE    LBB0_16
  9333  
  9334  LBB0_13:
  9335  	MOVQ    AX, CX
  9336  	MOVQ    SI, AX
  9337  	ANDQ    $-16, AX
  9338  	VMOVUPS dataNot<>+0(SB), X0
  9339  
  9340  LBB0_14:
  9341  	VXORPS  (DI)(CX*1), X0, X1
  9342  	VMOVUPS X1, (DI)(CX*1)
  9343  	ADDQ    $0x10, CX
  9344  	CMPQ    AX, CX
  9345  	JNE     LBB0_14
  9346  	CMPQ    AX, SI
  9347  	JE      LBB0_17
  9348  
  9349  LBB0_16:
  9350  	XORB $0x01, (DI)(AX*1)
  9351  	ADDQ $0x01, AX
  9352  	CMPQ SI, AX
  9353  	JNE  LBB0_16
  9354  
  9355  LBB0_17:
  9356  	VZEROUPPER
  9357  	RET
  9358  
  9359  LBB0_6:
  9360  	XORL  CX, CX
  9361  	TESTB $0x01, R8
  9362  	JNE   LBB0_10
  9363  	JMP   LBB0_11
  9364  
  9365  // func And_AVX2(x []bool, y []bool)
  9366  // Requires: AVX
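         // In-place bitwise AND over the bool slices: x[i] &= y[i]. 128 bytes per
         // vector iteration, a 16-byte SSE remainder loop, then a scalar ANDB tail.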
  9367  TEXT ·And_AVX2(SB), NOSPLIT, $0-48
  9368  	MOVQ  x_base+0(FP), DI
  9369  	MOVQ  y_base+24(FP), SI
  9370  	MOVQ  x_len+8(FP), DX
  9371  	TESTQ DX, DX
  9372  	JE    LBB1_13
  9373  	CMPQ  DX, $0x10
  9374  	JAE   LBB1_3
  9375  	XORL  AX, AX
  9376  	JMP   LBB1_12
  9377  
  9378  LBB1_3:
  9379  	CMPQ DX, $0x80
  9380  	JAE  LBB1_5
  9381  	XORL AX, AX
  9382  	JMP  LBB1_9
  9383  
  9384  LBB1_5:
  9385  	MOVQ DX, AX
  9386  	ANDQ $-128, AX
  9387  	XORL CX, CX
  9388  
  9389  LBB1_6:
  9390  	VMOVUPS (SI)(CX*1), Y0
  9391  	VMOVUPS 32(SI)(CX*1), Y1
  9392  	VMOVUPS 64(SI)(CX*1), Y2
  9393  	VMOVUPS 96(SI)(CX*1), Y3
  9394  	VANDPS  (DI)(CX*1), Y0, Y0
  9395  	VANDPS  32(DI)(CX*1), Y1, Y1
  9396  	VANDPS  64(DI)(CX*1), Y2, Y2
  9397  	VANDPS  96(DI)(CX*1), Y3, Y3
  9398  	VMOVUPS Y0, (DI)(CX*1)
  9399  	VMOVUPS Y1, 32(DI)(CX*1)
  9400  	VMOVUPS Y2, 64(DI)(CX*1)
  9401  	VMOVUPS Y3, 96(DI)(CX*1)
  9402  	SUBQ    $-128, CX
  9403  	CMPQ    AX, CX
  9404  	JNE     LBB1_6
  9405  	CMPQ    AX, DX
  9406  	JE      LBB1_13
  9407  	TESTB   $0x70, DL
  9408  	JE      LBB1_12
  9409  
  9410  LBB1_9:
  9411  	MOVQ AX, CX
  9412  	MOVQ DX, AX
  9413  	ANDQ $-16, AX
  9414  
  9415  LBB1_10:
  9416  	VMOVUPS (SI)(CX*1), X0
  9417  	VANDPS  (DI)(CX*1), X0, X0
  9418  	VMOVUPS X0, (DI)(CX*1)
  9419  	ADDQ    $0x10, CX
  9420  	CMPQ    AX, CX
  9421  	JNE     LBB1_10
  9422  	CMPQ    AX, DX
  9423  	JE      LBB1_13
  9424  
  9425  LBB1_12:
  9426  	MOVBLZX (SI)(AX*1), CX
  9427  	ANDB    CL, (DI)(AX*1)
  9428  	ADDQ    $0x01, AX
  9429  	CMPQ    DX, AX
  9430  	JNE     LBB1_12
  9431  
  9432  LBB1_13:
  9433  	VZEROUPPER
  9434  	RET
  9435  
  9436  // func Or_AVX2(x []bool, y []bool)
  9437  // Requires: AVX
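         // In-place bitwise OR: x[i] |= y[i]; same block structure as And_AVX2,
         // ending in a scalar ORB tail.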
  9438  TEXT ·Or_AVX2(SB), NOSPLIT, $0-48
  9439  	MOVQ  x_base+0(FP), DI
  9440  	MOVQ  y_base+24(FP), SI
  9441  	MOVQ  x_len+8(FP), DX
  9442  	TESTQ DX, DX
  9443  	JE    LBB2_13
  9444  	CMPQ  DX, $0x10
  9445  	JAE   LBB2_3
  9446  	XORL  AX, AX
  9447  	JMP   LBB2_12
  9448  
  9449  LBB2_3:
  9450  	CMPQ DX, $0x80
  9451  	JAE  LBB2_5
  9452  	XORL AX, AX
  9453  	JMP  LBB2_9
  9454  
  9455  LBB2_5:
  9456  	MOVQ DX, AX
  9457  	ANDQ $-128, AX
  9458  	XORL CX, CX
  9459  
  9460  LBB2_6:
  9461  	VMOVUPS (SI)(CX*1), Y0
  9462  	VMOVUPS 32(SI)(CX*1), Y1
  9463  	VMOVUPS 64(SI)(CX*1), Y2
  9464  	VMOVUPS 96(SI)(CX*1), Y3
  9465  	VORPS   (DI)(CX*1), Y0, Y0
  9466  	VORPS   32(DI)(CX*1), Y1, Y1
  9467  	VORPS   64(DI)(CX*1), Y2, Y2
  9468  	VORPS   96(DI)(CX*1), Y3, Y3
  9469  	VMOVUPS Y0, (DI)(CX*1)
  9470  	VMOVUPS Y1, 32(DI)(CX*1)
  9471  	VMOVUPS Y2, 64(DI)(CX*1)
  9472  	VMOVUPS Y3, 96(DI)(CX*1)
  9473  	SUBQ    $-128, CX
  9474  	CMPQ    AX, CX
  9475  	JNE     LBB2_6
  9476  	CMPQ    AX, DX
  9477  	JE      LBB2_13
  9478  	TESTB   $0x70, DL
  9479  	JE      LBB2_12
  9480  
  9481  LBB2_9:
  9482  	MOVQ AX, CX
  9483  	MOVQ DX, AX
  9484  	ANDQ $-16, AX
  9485  
  9486  LBB2_10:
  9487  	VMOVUPS (SI)(CX*1), X0
  9488  	VORPS   (DI)(CX*1), X0, X0
  9489  	VMOVUPS X0, (DI)(CX*1)
  9490  	ADDQ    $0x10, CX
  9491  	CMPQ    AX, CX
  9492  	JNE     LBB2_10
  9493  	CMPQ    AX, DX
  9494  	JE      LBB2_13
  9495  
  9496  LBB2_12:
  9497  	MOVBLZX (SI)(AX*1), CX
  9498  	ORB     CL, (DI)(AX*1)
  9499  	ADDQ    $0x01, AX
  9500  	CMPQ    DX, AX
  9501  	JNE     LBB2_12
  9502  
  9503  LBB2_13:
  9504  	VZEROUPPER
  9505  	RET
  9506  
  9507  // func Xor_AVX2(x []bool, y []bool)
  9508  // Requires: AVX
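         // In-place bitwise XOR: x[i] ^= y[i]; same block structure as And_AVX2,
         // ending in a scalar XORB tail.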
  9509  TEXT ·Xor_AVX2(SB), NOSPLIT, $0-48
  9510  	MOVQ  x_base+0(FP), DI
  9511  	MOVQ  y_base+24(FP), SI
  9512  	MOVQ  x_len+8(FP), DX
  9513  	TESTQ DX, DX
  9514  	JE    LBB3_13
  9515  	CMPQ  DX, $0x10
  9516  	JAE   LBB3_3
  9517  	XORL  AX, AX
  9518  	JMP   LBB3_12
  9519  
  9520  LBB3_3:
  9521  	CMPQ DX, $0x80
  9522  	JAE  LBB3_5
  9523  	XORL AX, AX
  9524  	JMP  LBB3_9
  9525  
  9526  LBB3_5:
  9527  	MOVQ DX, AX
  9528  	ANDQ $-128, AX
  9529  	XORL CX, CX
  9530  
  9531  LBB3_6:
  9532  	VMOVUPS (SI)(CX*1), Y0
  9533  	VMOVUPS 32(SI)(CX*1), Y1
  9534  	VMOVUPS 64(SI)(CX*1), Y2
  9535  	VMOVUPS 96(SI)(CX*1), Y3
  9536  	VXORPS  (DI)(CX*1), Y0, Y0
  9537  	VXORPS  32(DI)(CX*1), Y1, Y1
  9538  	VXORPS  64(DI)(CX*1), Y2, Y2
  9539  	VXORPS  96(DI)(CX*1), Y3, Y3
  9540  	VMOVUPS Y0, (DI)(CX*1)
  9541  	VMOVUPS Y1, 32(DI)(CX*1)
  9542  	VMOVUPS Y2, 64(DI)(CX*1)
  9543  	VMOVUPS Y3, 96(DI)(CX*1)
  9544  	SUBQ    $-128, CX
  9545  	CMPQ    AX, CX
  9546  	JNE     LBB3_6
  9547  	CMPQ    AX, DX
  9548  	JE      LBB3_13
  9549  	TESTB   $0x70, DL
  9550  	JE      LBB3_12
  9551  
  9552  LBB3_9:
  9553  	MOVQ AX, CX
  9554  	MOVQ DX, AX
  9555  	ANDQ $-16, AX
  9556  
  9557  LBB3_10:
  9558  	VMOVUPS (SI)(CX*1), X0
  9559  	VXORPS  (DI)(CX*1), X0, X0
  9560  	VMOVUPS X0, (DI)(CX*1)
  9561  	ADDQ    $0x10, CX
  9562  	CMPQ    AX, CX
  9563  	JNE     LBB3_10
  9564  	CMPQ    AX, DX
  9565  	JE      LBB3_13
  9566  
  9567  LBB3_12:
  9568  	MOVBLZX (SI)(AX*1), CX
  9569  	XORB    CL, (DI)(AX*1)
  9570  	ADDQ    $0x01, AX
  9571  	CMPQ    DX, AX
  9572  	JNE     LBB3_12
  9573  
  9574  LBB3_13:
  9575  	VZEROUPPER
  9576  	RET
  9577  
  9578  // func All_AVX2(x []bool) int
  9579  // Requires: AVX, AVX2
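         // Returns nonzero if every element of x is true, 0 otherwise. VPCMPEQB
         // against zero scans 32 bytes at a time and bails out on the first false
         // byte; a scalar loop checks the tail.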
  9580  TEXT ·All_AVX2(SB), NOSPLIT, $0-32
  9581  	MOVQ  x_base+0(FP), DI
  9582  	MOVQ  x_len+8(FP), SI
  9583  	MOVQ  SI, AX
  9584  	XORL  CX, CX
  9585  	ANDQ  $-32, AX
  9586  	JE    LBB0_1
  9587  	VPXOR X0, X0, X0
  9588  
  9589  LBB0_8:
  9590  	VPCMPEQB (DI)(CX*1), Y0, Y1
  9591  	VPTEST   Y1, Y1
  9592  	JNE      LBB0_9
  9593  	ADDQ     $0x20, CX
  9594  	CMPQ     CX, AX
  9595  	JB       LBB0_8
  9596  
  9597  LBB0_1:
  9598  	MOVB $0x01, AL
  9599  	CMPQ CX, SI
  9600  	JAE  LBB0_6
  9601  	ADDQ $-1, SI
  9602  
  9603  LBB0_3:
  9604  	MOVBLZX (DI)(CX*1), AX
  9605  	TESTB   AL, AL
  9606  	JE      LBB0_5
  9607  	LEAQ    1(CX), DX
  9608  	CMPQ    SI, CX
  9609  	MOVQ    DX, CX
  9610  	JNE     LBB0_3
  9611  
  9612  LBB0_5:
  9613  	TESTB AL, AL
  9614  	SETNE AL
  9615  
  9616  LBB0_6:
  9617  	VZEROUPPER
  9618  	MOVQ AX, ret+24(FP)
  9619  	RET
  9620  
  9621  LBB0_9:
  9622  	XORL AX, AX
  9623  	VZEROUPPER
  9624  	MOVQ AX, ret+24(FP)
  9625  	RET
  9626  
  9627  // func Any_AVX2(x []bool) int
  9628  // Requires: AVX
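         // Returns nonzero if any element of x is true, 0 otherwise. VPTEST over
         // 32-byte blocks exits early on the first nonzero byte; scalar tail.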
  9629  TEXT ·Any_AVX2(SB), NOSPLIT, $0-32
  9630  	MOVQ x_base+0(FP), DI
  9631  	MOVQ x_len+8(FP), SI
  9632  	MOVQ SI, CX
  9633  	XORL AX, AX
  9634  	ANDQ $-32, CX
  9635  	JE   LBB1_1
  9636  
  9637  LBB1_4:
  9638  	VMOVDQU (DI)(AX*1), Y0
  9639  	VPTEST  Y0, Y0
  9640  	JNE     LBB1_5
  9641  	ADDQ    $0x20, AX
  9642  	CMPQ    AX, CX
  9643  	JB      LBB1_4
  9644  
  9645  LBB1_1:
  9646  	CMPQ AX, SI
  9647  	JAE  LBB1_2
  9648  	ADDQ $-1, SI
  9649  
  9650  LBB1_7:
  9651  	MOVBLZX (DI)(AX*1), CX
  9652  	TESTB   CL, CL
  9653  	JNE     LBB1_9
  9654  	LEAQ    1(AX), DX
  9655  	CMPQ    SI, AX
  9656  	MOVQ    DX, AX
  9657  	JNE     LBB1_7
  9658  
  9659  LBB1_9:
  9660  	TESTB CL, CL
  9661  	SETNE AL
  9662  	VZEROUPPER
  9663  	MOVQ  AX, ret+24(FP)
  9664  	RET
  9665  
  9666  LBB1_5:
  9667  	MOVB $0x01, AL
  9668  	VZEROUPPER
  9669  	MOVQ AX, ret+24(FP)
  9670  	RET
  9671  
  9672  LBB1_2:
  9673  	XORL AX, AX
  9674  	VZEROUPPER
  9675  	MOVQ AX, ret+24(FP)
  9676  	RET
  9677  
  9678  // func None_AVX2(x []bool) int
  9679  // Requires: AVX
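         // Returns nonzero if no element of x is true, 0 otherwise; the inverted
         // form of the Any_AVX2 scan.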
  9680  TEXT ·None_AVX2(SB), NOSPLIT, $0-32
  9681  	MOVQ x_base+0(FP), DI
  9682  	MOVQ x_len+8(FP), SI
  9683  	MOVQ SI, AX
  9684  	XORL CX, CX
  9685  	ANDQ $-32, AX
  9686  	JE   LBB2_1
  9687  
  9688  LBB2_7:
  9689  	VMOVDQU (DI)(CX*1), Y0
  9690  	VPTEST  Y0, Y0
  9691  	JNE     LBB2_8
  9692  	ADDQ    $0x20, CX
  9693  	CMPQ    CX, AX
  9694  	JB      LBB2_7
  9695  
  9696  LBB2_1:
  9697  	MOVB $0x01, AL
  9698  	CMPQ CX, SI
  9699  	JAE  LBB2_5
  9700  	ADDQ $-1, SI
  9701  
  9702  LBB2_3:
  9703  	CMPB  (DI)(CX*1), $0x00
  9704  	SETEQ AL
  9705  	JNE   LBB2_5
  9706  	LEAQ  1(CX), DX
  9707  	CMPQ  SI, CX
  9708  	MOVQ  DX, CX
  9709  	JNE   LBB2_3
  9710  
  9711  LBB2_5:
  9712  	VZEROUPPER
  9713  	MOVQ AX, ret+24(FP)
  9714  	RET
  9715  
  9716  LBB2_8:
  9717  	XORL AX, AX
  9718  	VZEROUPPER
  9719  	MOVQ AX, ret+24(FP)
  9720  	RET
  9721  
  9722  // func Count_AVX2(x []bool) int
  9723  // Requires: AVX, AVX2
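         // Returns the number of true elements. Bool bytes are widened to qwords
         // (VPMOVZXBQ) into four YMM accumulators, 32 bytes per iteration, then
         // reduced with a horizontal add; a scalar MOVBLZX loop sums the tail.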
  9724  TEXT ·Count_AVX2(SB), NOSPLIT, $0-32
  9725  	MOVQ  x_base+0(FP), DI
  9726  	MOVQ  x_len+8(FP), SI
  9727  	TESTQ SI, SI
  9728  	JE    LBB9_1
  9729  	CMPQ  SI, $0x10
  9730  	JAE   LBB9_4
  9731  	XORL  CX, CX
  9732  	XORL  AX, AX
  9733  	JMP   LBB9_11
  9734  
  9735  LBB9_1:
  9736  	XORL AX, AX
  9737  	MOVQ AX, ret+24(FP)
  9738  	RET
  9739  
  9740  LBB9_4:
  9741  	MOVQ  SI, CX
  9742  	ANDQ  $-16, CX
  9743  	LEAQ  -16(CX), AX
  9744  	MOVQ  AX, R8
  9745  	SHRQ  $0x04, R8
  9746  	ADDQ  $0x01, R8
  9747  	TESTQ AX, AX
  9748  	JE    LBB9_5
  9749  	MOVQ  R8, DX
  9750  	ANDQ  $-2, DX
  9751  	VPXOR X0, X0, X0
  9752  	XORL  AX, AX
  9753  	VPXOR X1, X1, X1
  9754  	VPXOR X2, X2, X2
  9755  	VPXOR X3, X3, X3
  9756  
  9757  LBB9_7:
  9758  	VPMOVZXBQ (DI)(AX*1), Y4
  9759  	VPADDQ    Y4, Y0, Y0
  9760  	VPMOVZXBQ 4(DI)(AX*1), Y4
  9761  	VPADDQ    Y4, Y1, Y1
  9762  	VPMOVZXBQ 8(DI)(AX*1), Y4
  9763  	VPMOVZXBQ 12(DI)(AX*1), Y5
  9764  	VPADDQ    Y4, Y2, Y2
  9765  	VPADDQ    Y5, Y3, Y3
  9766  	VPMOVZXBQ 16(DI)(AX*1), Y4
  9767  	VPADDQ    Y4, Y0, Y0
  9768  	VPMOVZXBQ 20(DI)(AX*1), Y4
  9769  	VPADDQ    Y4, Y1, Y1
  9770  	VPMOVZXBQ 24(DI)(AX*1), Y4
  9771  	VPMOVZXBQ 28(DI)(AX*1), Y5
  9772  	VPADDQ    Y4, Y2, Y2
  9773  	VPADDQ    Y5, Y3, Y3
  9774  	ADDQ      $0x20, AX
  9775  	ADDQ      $-2, DX
  9776  	JNE       LBB9_7
  9777  	TESTB     $0x01, R8
  9778  	JE        LBB9_10
  9779  
  9780  LBB9_9:
  9781  	VPMOVZXBQ (DI)(AX*1), Y4
  9782  	VPMOVZXBQ 4(DI)(AX*1), Y5
  9783  	VPADDQ    Y4, Y0, Y0
  9784  	VPADDQ    Y5, Y1, Y1
  9785  	VPMOVZXBQ 8(DI)(AX*1), Y4
  9786  	VPADDQ    Y4, Y2, Y2
  9787  	VPMOVZXBQ 12(DI)(AX*1), Y4
  9788  	VPADDQ    Y4, Y3, Y3
  9789  
  9790  LBB9_10:
  9791  	VPADDQ       Y3, Y1, Y1
  9792  	VPADDQ       Y2, Y0, Y0
  9793  	VPADDQ       Y1, Y0, Y0
  9794  	VEXTRACTI128 $0x01, Y0, X1
  9795  	VPADDQ       X1, X0, X0
  9796  	VPSHUFD      $0xee, X0, X1
  9797  	VPADDQ       X1, X0, X0
  9798  	VMOVQ        X0, AX
  9799  	CMPQ         CX, SI
  9800  	JE           LBB9_12
  9801  
  9802  LBB9_11:
  9803  	MOVBLZX (DI)(CX*1), DX
  9804  	ADDQ    DX, AX
  9805  	ADDQ    $0x01, CX
  9806  	CMPQ    SI, CX
  9807  	JNE     LBB9_11
  9808  
  9809  LBB9_12:
  9810  	VZEROUPPER
  9811  	MOVQ AX, ret+24(FP)
  9812  	RET
  9813  
  9814  LBB9_5:
  9815  	VPXOR X0, X0, X0
  9816  	XORL  AX, AX
  9817  	VPXOR X1, X1, X1
  9818  	VPXOR X2, X2, X2
  9819  	VPXOR X3, X3, X3
  9820  	TESTB $0x01, R8
  9821  	JNE   LBB9_9
  9822  	JMP   LBB9_10
  9823  
  9824  // func Repeat_AVX2_F64(x []float64, a float64, n int)
  9825  // Requires: AVX, AVX2, SSE2
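         // Fills x[0:n] with a: the broadcast value is stored 64 doubles per
         // unrolled iteration, then in 16-double groups, then one at a time.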
  9826  TEXT ·Repeat_AVX2_F64(SB), NOSPLIT, $0-40
  9827  	MOVQ  x_base+0(FP), DI
  9828  	MOVSD a+24(FP), X0
  9829  	MOVQ  n+32(FP), SI
  9830  	TESTQ SI, SI
  9831  	JE    LBB0_12
  9832  	CMPQ  SI, $0x10
  9833  	JAE   LBB0_3
  9834  	XORL  AX, AX
  9835  	JMP   LBB0_11
  9836  
  9837  LBB0_3:
  9838  	MOVQ         SI, AX
  9839  	ANDQ         $-16, AX
  9840  	VBROADCASTSD X0, Y1
  9841  	LEAQ         -16(AX), CX
  9842  	MOVQ         CX, DX
  9843  	SHRQ         $0x04, DX
  9844  	ADDQ         $0x01, DX
  9845  	MOVL         DX, R8
  9846  	ANDL         $0x03, R8
  9847  	CMPQ         CX, $0x30
  9848  	JAE          LBB0_5
  9849  	XORL         CX, CX
  9850  	JMP          LBB0_7
  9851  
  9852  LBB0_5:
  9853  	ANDQ $-4, DX
  9854  	XORL CX, CX
  9855  
  9856  LBB0_6:
  9857  	VMOVUPS Y1, (DI)(CX*8)
  9858  	VMOVUPS Y1, 32(DI)(CX*8)
  9859  	VMOVUPS Y1, 64(DI)(CX*8)
  9860  	VMOVUPS Y1, 96(DI)(CX*8)
  9861  	VMOVUPS Y1, 128(DI)(CX*8)
  9862  	VMOVUPS Y1, 160(DI)(CX*8)
  9863  	VMOVUPS Y1, 192(DI)(CX*8)
  9864  	VMOVUPS Y1, 224(DI)(CX*8)
  9865  	VMOVUPS Y1, 256(DI)(CX*8)
  9866  	VMOVUPS Y1, 288(DI)(CX*8)
  9867  	VMOVUPS Y1, 320(DI)(CX*8)
  9868  	VMOVUPS Y1, 352(DI)(CX*8)
  9869  	VMOVUPS Y1, 384(DI)(CX*8)
  9870  	VMOVUPS Y1, 416(DI)(CX*8)
  9871  	VMOVUPS Y1, 448(DI)(CX*8)
  9872  	VMOVUPS Y1, 480(DI)(CX*8)
  9873  	ADDQ    $0x40, CX
  9874  	ADDQ    $-4, DX
  9875  	JNE     LBB0_6
  9876  
  9877  LBB0_7:
  9878  	TESTQ R8, R8
  9879  	JE    LBB0_10
  9880  	LEAQ  (DI)(CX*8), CX
  9881  	ADDQ  $0x60, CX
  9882  	SHLQ  $0x07, R8
  9883  	XORL  DX, DX
  9884  
  9885  LBB0_9:
  9886  	VMOVUPS Y1, -96(CX)(DX*1)
  9887  	VMOVUPS Y1, -64(CX)(DX*1)
  9888  	VMOVUPS Y1, -32(CX)(DX*1)
  9889  	VMOVUPS Y1, (CX)(DX*1)
  9890  	SUBQ    $-128, DX
  9891  	CMPQ    R8, DX
  9892  	JNE     LBB0_9
  9893  
  9894  LBB0_10:
  9895  	CMPQ AX, SI
  9896  	JE   LBB0_12
  9897  
  9898  LBB0_11:
  9899  	VMOVSD X0, (DI)(AX*8)
  9900  	ADDQ   $0x01, AX
  9901  	CMPQ   SI, AX
  9902  	JNE    LBB0_11
  9903  
  9904  LBB0_12:
  9905  	VZEROUPPER
  9906  	RET
  9907  
  9908  // func Repeat_AVX2_F32(x []float32, a float32, n int)
  9909  // Requires: AVX, AVX2, SSE
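         // Float32 variant of Repeat_AVX2_F64: 128 floats per unrolled iteration.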
  9910  TEXT ·Repeat_AVX2_F32(SB), NOSPLIT, $0-40
  9911  	MOVQ  x_base+0(FP), DI
  9912  	MOVSS a+24(FP), X0
  9913  	MOVQ  n+32(FP), SI
  9914  	TESTQ SI, SI
  9915  	JE    LBB1_12
  9916  	CMPQ  SI, $0x20
  9917  	JAE   LBB1_3
  9918  	XORL  AX, AX
  9919  	JMP   LBB1_11
  9920  
  9921  LBB1_3:
  9922  	MOVQ         SI, AX
  9923  	ANDQ         $-32, AX
  9924  	VBROADCASTSS X0, Y1
  9925  	LEAQ         -32(AX), CX
  9926  	MOVQ         CX, DX
  9927  	SHRQ         $0x05, DX
  9928  	ADDQ         $0x01, DX
  9929  	MOVL         DX, R8
  9930  	ANDL         $0x03, R8
  9931  	CMPQ         CX, $0x60
  9932  	JAE          LBB1_5
  9933  	XORL         CX, CX
  9934  	JMP          LBB1_7
  9935  
  9936  LBB1_5:
  9937  	ANDQ $-4, DX
  9938  	XORL CX, CX
  9939  
  9940  LBB1_6:
  9941  	VMOVUPS Y1, (DI)(CX*4)
  9942  	VMOVUPS Y1, 32(DI)(CX*4)
  9943  	VMOVUPS Y1, 64(DI)(CX*4)
  9944  	VMOVUPS Y1, 96(DI)(CX*4)
  9945  	VMOVUPS Y1, 128(DI)(CX*4)
  9946  	VMOVUPS Y1, 160(DI)(CX*4)
  9947  	VMOVUPS Y1, 192(DI)(CX*4)
  9948  	VMOVUPS Y1, 224(DI)(CX*4)
  9949  	VMOVUPS Y1, 256(DI)(CX*4)
  9950  	VMOVUPS Y1, 288(DI)(CX*4)
  9951  	VMOVUPS Y1, 320(DI)(CX*4)
  9952  	VMOVUPS Y1, 352(DI)(CX*4)
  9953  	VMOVUPS Y1, 384(DI)(CX*4)
  9954  	VMOVUPS Y1, 416(DI)(CX*4)
  9955  	VMOVUPS Y1, 448(DI)(CX*4)
  9956  	VMOVUPS Y1, 480(DI)(CX*4)
  9957  	SUBQ    $-128, CX
  9958  	ADDQ    $-4, DX
  9959  	JNE     LBB1_6
  9960  
  9961  LBB1_7:
  9962  	TESTQ R8, R8
  9963  	JE    LBB1_10
  9964  	LEAQ  (DI)(CX*4), CX
  9965  	ADDQ  $0x60, CX
  9966  	SHLQ  $0x07, R8
  9967  	XORL  DX, DX
  9968  
  9969  LBB1_9:
  9970  	VMOVUPS Y1, -96(CX)(DX*1)
  9971  	VMOVUPS Y1, -64(CX)(DX*1)
  9972  	VMOVUPS Y1, -32(CX)(DX*1)
  9973  	VMOVUPS Y1, (CX)(DX*1)
  9974  	SUBQ    $-128, DX
  9975  	CMPQ    R8, DX
  9976  	JNE     LBB1_9
  9977  
  9978  LBB1_10:
  9979  	CMPQ AX, SI
  9980  	JE   LBB1_12
  9981  
  9982  LBB1_11:
  9983  	VMOVSS X0, (DI)(AX*4)
  9984  	ADDQ   $0x01, AX
  9985  	CMPQ   SI, AX
  9986  	JNE    LBB1_11
  9987  
  9988  LBB1_12:
  9989  	VZEROUPPER
  9990  	RET
  9991  
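         // dataRangeF64: doubles {0,1,2,3} at +0..+24 seed the first vector of
         // Range_AVX2_F64, {4,8,12,16,20,24,28,32} at +32..+88 are its broadcast
         // block increments, and the 1.0 at +96 is the scalar-tail step.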
  9992  DATA dataRangeF64<>+0(SB)/8, $0x0000000000000000
  9993  DATA dataRangeF64<>+8(SB)/8, $0x3ff0000000000000
  9994  DATA dataRangeF64<>+16(SB)/8, $0x4000000000000000
  9995  DATA dataRangeF64<>+24(SB)/8, $0x4008000000000000
  9996  DATA dataRangeF64<>+32(SB)/8, $0x4010000000000000
  9997  DATA dataRangeF64<>+40(SB)/8, $0x4020000000000000
  9998  DATA dataRangeF64<>+48(SB)/8, $0x4028000000000000
  9999  DATA dataRangeF64<>+56(SB)/8, $0x4030000000000000
 10000  DATA dataRangeF64<>+64(SB)/8, $0x4034000000000000
 10001  DATA dataRangeF64<>+72(SB)/8, $0x4038000000000000
 10002  DATA dataRangeF64<>+80(SB)/8, $0x403c000000000000
 10003  DATA dataRangeF64<>+88(SB)/8, $0x4040000000000000
 10004  DATA dataRangeF64<>+96(SB)/8, $0x3ff0000000000000
 10005  GLOBL dataRangeF64<>(SB), RODATA|NOPTR, $104
 10006  
 10007  // func Range_AVX2_F64(x []float64, a float64, n int)
 10008  // Requires: AVX, AVX2, SSE2
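         // Writes x[i] = a + float64(i) for i in [0,n): Y1 starts at a + {0,1,2,3}
         // and the broadcast increments above advance it, 32 doubles per unrolled
         // iteration; the tail recomputes a + i and then steps by 1.0 scalarly.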
 10009  TEXT ·Range_AVX2_F64(SB), NOSPLIT, $0-40
 10010  	MOVQ  x_base+0(FP), DI
 10011  	MOVSD a+24(FP), X0
 10012  	MOVQ  n+32(FP), SI
 10013  	TESTQ SI, SI
 10014  	JE    LBB2_13
 10015  	CMPQ  SI, $0x10
 10016  	JAE   LBB2_3
 10017  	XORL  AX, AX
 10018  	JMP   LBB2_11
 10019  
 10020  LBB2_3:
 10021  	MOVQ         SI, AX
 10022  	ANDQ         $-16, AX
 10023  	VBROADCASTSD X0, Y1
 10024  	VADDPD       dataRangeF64<>+0(SB), Y1, Y1
 10025  	LEAQ         -16(AX), CX
 10026  	MOVQ         CX, R8
 10027  	SHRQ         $0x04, R8
 10028  	ADDQ         $0x01, R8
 10029  	TESTQ        CX, CX
 10030  	JE           LBB2_4
 10031  	MOVQ         R8, DX
 10032  	ANDQ         $-2, DX
 10033  	XORL         CX, CX
 10034  	VBROADCASTSD dataRangeF64<>+32(SB), Y2
 10035  	VBROADCASTSD dataRangeF64<>+40(SB), Y3
 10036  	VBROADCASTSD dataRangeF64<>+48(SB), Y4
 10037  	VBROADCASTSD dataRangeF64<>+56(SB), Y5
 10038  	VBROADCASTSD dataRangeF64<>+64(SB), Y6
 10039  	VBROADCASTSD dataRangeF64<>+72(SB), Y7
 10040  	VBROADCASTSD dataRangeF64<>+80(SB), Y8
 10041  	VBROADCASTSD dataRangeF64<>+88(SB), Y9
 10042  
 10043  LBB2_6:
 10044  	VADDPD  Y2, Y1, Y10
 10045  	VADDPD  Y3, Y1, Y11
 10046  	VADDPD  Y4, Y1, Y12
 10047  	VMOVUPD Y1, (DI)(CX*8)
 10048  	VMOVUPD Y10, 32(DI)(CX*8)
 10049  	VMOVUPD Y11, 64(DI)(CX*8)
 10050  	VMOVUPD Y12, 96(DI)(CX*8)
 10051  	VADDPD  Y5, Y1, Y10
 10052  	VADDPD  Y6, Y1, Y11
 10053  	VADDPD  Y7, Y1, Y12
 10054  	VADDPD  Y1, Y8, Y13
 10055  	VMOVUPD Y10, 128(DI)(CX*8)
 10056  	VMOVUPD Y11, 160(DI)(CX*8)
 10057  	VMOVUPD Y12, 192(DI)(CX*8)
 10058  	VMOVUPD Y13, 224(DI)(CX*8)
 10059  	ADDQ    $0x20, CX
 10060  	VADDPD  Y1, Y9, Y1
 10061  	ADDQ    $-2, DX
 10062  	JNE     LBB2_6
 10063  	TESTB   $0x01, R8
 10064  	JE      LBB2_9
 10065  
 10066  LBB2_8:
 10067  	VBROADCASTSD dataRangeF64<>+32(SB), Y2
 10068  	VADDPD       Y2, Y1, Y2
 10069  	VBROADCASTSD dataRangeF64<>+40(SB), Y3
 10070  	VADDPD       Y3, Y1, Y3
 10071  	VBROADCASTSD dataRangeF64<>+48(SB), Y4
 10072  	VADDPD       Y4, Y1, Y4
 10073  	VMOVUPD      Y1, (DI)(CX*8)
 10074  	VMOVUPD      Y2, 32(DI)(CX*8)
 10075  	VMOVUPD      Y3, 64(DI)(CX*8)
 10076  	VMOVUPD      Y4, 96(DI)(CX*8)
 10077  
 10078  LBB2_9:
 10079  	CMPQ       AX, SI
 10080  	JE         LBB2_13
 10081  	VCVTSI2SDQ AX, X14, X1
 10082  	VADDSD     X0, X1, X0
 10083  
 10084  LBB2_11:
 10085  	VMOVSD dataRangeF64<>+96(SB), X1
 10086  
 10087  LBB2_12:
 10088  	VMOVSD X0, (DI)(AX*8)
 10089  	VADDSD X1, X0, X0
 10090  	ADDQ   $0x01, AX
 10091  	CMPQ   SI, AX
 10092  	JNE    LBB2_12
 10093  
 10094  LBB2_13:
 10095  	VZEROUPPER
 10096  	RET
 10097  
 10098  LBB2_4:
 10099  	XORL  CX, CX
 10100  	TESTB $0x01, R8
 10101  	JNE   LBB2_8
 10102  	JMP   LBB2_9
 10103  
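         // dataRangeF32: floats {0..7} at +0..+28, block increments {8,16,...,64}
         // at +32..+60, and the scalar-tail step 1.0 at +64.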
 10104  DATA dataRangeF32<>+0(SB)/4, $0x00000000
 10105  DATA dataRangeF32<>+4(SB)/4, $0x3f800000
 10106  DATA dataRangeF32<>+8(SB)/4, $0x40000000
 10107  DATA dataRangeF32<>+12(SB)/4, $0x40400000
 10108  DATA dataRangeF32<>+16(SB)/4, $0x40800000
 10109  DATA dataRangeF32<>+20(SB)/4, $0x40a00000
 10110  DATA dataRangeF32<>+24(SB)/4, $0x40c00000
 10111  DATA dataRangeF32<>+28(SB)/4, $0x40e00000
 10112  DATA dataRangeF32<>+32(SB)/4, $0x41000000
 10113  DATA dataRangeF32<>+36(SB)/4, $0x41800000
 10114  DATA dataRangeF32<>+40(SB)/4, $0x41c00000
 10115  DATA dataRangeF32<>+44(SB)/4, $0x42000000
 10116  DATA dataRangeF32<>+48(SB)/4, $0x42200000
 10117  DATA dataRangeF32<>+52(SB)/4, $0x42400000
 10118  DATA dataRangeF32<>+56(SB)/4, $0x42600000
 10119  DATA dataRangeF32<>+60(SB)/4, $0x42800000
 10120  DATA dataRangeF32<>+64(SB)/4, $0x3f800000
 10121  GLOBL dataRangeF32<>(SB), RODATA|NOPTR, $68
 10122  
 10123  // func Range_AVX2_F32(x []float32, a float32, n int)
 10124  // Requires: AVX, AVX2, SSE
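         // Float32 variant of Range_AVX2_F64: 64 floats per unrolled iteration.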
 10125  TEXT ·Range_AVX2_F32(SB), NOSPLIT, $0-40
 10126  	MOVQ  x_base+0(FP), DI
 10127  	MOVSS a+24(FP), X0
 10128  	MOVQ  n+32(FP), SI
 10129  	TESTQ SI, SI
 10130  	JE    LBB3_13
 10131  	CMPQ  SI, $0x20
 10132  	JAE   LBB3_3
 10133  	XORL  AX, AX
 10134  	JMP   LBB3_11
 10135  
 10136  LBB3_3:
 10137  	MOVQ         SI, AX
 10138  	ANDQ         $-32, AX
 10139  	VBROADCASTSS X0, Y1
 10140  	VADDPS       dataRangeF32<>+0(SB), Y1, Y1
 10141  	LEAQ         -32(AX), CX
 10142  	MOVQ         CX, R8
 10143  	SHRQ         $0x05, R8
 10144  	ADDQ         $0x01, R8
 10145  	TESTQ        CX, CX
 10146  	JE           LBB3_4
 10147  	MOVQ         R8, DX
 10148  	ANDQ         $-2, DX
 10149  	XORL         CX, CX
 10150  	VBROADCASTSS dataRangeF32<>+32(SB), Y2
 10151  	VBROADCASTSS dataRangeF32<>+36(SB), Y3
 10152  	VBROADCASTSS dataRangeF32<>+40(SB), Y4
 10153  	VBROADCASTSS dataRangeF32<>+44(SB), Y5
 10154  	VBROADCASTSS dataRangeF32<>+48(SB), Y6
 10155  	VBROADCASTSS dataRangeF32<>+52(SB), Y7
 10156  	VBROADCASTSS dataRangeF32<>+56(SB), Y8
 10157  	VBROADCASTSS dataRangeF32<>+60(SB), Y9
 10158  
 10159  LBB3_6:
 10160  	VADDPS  Y2, Y1, Y10
 10161  	VADDPS  Y3, Y1, Y11
 10162  	VADDPS  Y4, Y1, Y12
 10163  	VMOVUPS Y1, (DI)(CX*4)
 10164  	VMOVUPS Y10, 32(DI)(CX*4)
 10165  	VMOVUPS Y11, 64(DI)(CX*4)
 10166  	VMOVUPS Y12, 96(DI)(CX*4)
 10167  	VADDPS  Y5, Y1, Y10
 10168  	VADDPS  Y6, Y1, Y11
 10169  	VADDPS  Y7, Y1, Y12
 10170  	VADDPS  Y1, Y8, Y13
 10171  	VMOVUPS Y10, 128(DI)(CX*4)
 10172  	VMOVUPS Y11, 160(DI)(CX*4)
 10173  	VMOVUPS Y12, 192(DI)(CX*4)
 10174  	VMOVUPS Y13, 224(DI)(CX*4)
 10175  	ADDQ    $0x40, CX
 10176  	VADDPS  Y1, Y9, Y1
 10177  	ADDQ    $-2, DX
 10178  	JNE     LBB3_6
 10179  	TESTB   $0x01, R8
 10180  	JE      LBB3_9
 10181  
 10182  LBB3_8:
 10183  	VBROADCASTSS dataRangeF32<>+32(SB), Y2
 10184  	VADDPS       Y2, Y1, Y2
 10185  	VBROADCASTSS dataRangeF32<>+36(SB), Y3
 10186  	VADDPS       Y3, Y1, Y3
 10187  	VBROADCASTSS dataRangeF32<>+40(SB), Y4
 10188  	VADDPS       Y4, Y1, Y4
 10189  	VMOVUPS      Y1, (DI)(CX*4)
 10190  	VMOVUPS      Y2, 32(DI)(CX*4)
 10191  	VMOVUPS      Y3, 64(DI)(CX*4)
 10192  	VMOVUPS      Y4, 96(DI)(CX*4)
 10193  
 10194  LBB3_9:
 10195  	CMPQ       AX, SI
 10196  	JE         LBB3_13
 10197  	VCVTSI2SSQ AX, X14, X1
 10198  	VADDSS     X0, X1, X0
 10199  
 10200  LBB3_11:
 10201  	VMOVSS dataRangeF32<>+64(SB), X1
 10202  
 10203  LBB3_12:
 10204  	VMOVSS X0, (DI)(AX*4)
 10205  	VADDSS X1, X0, X0
 10206  	ADDQ   $0x01, AX
 10207  	CMPQ   SI, AX
 10208  	JNE    LBB3_12
 10209  
 10210  LBB3_13:
 10211  	VZEROUPPER
 10212  	RET
 10213  
 10214  LBB3_4:
 10215  	XORL  CX, CX
 10216  	TESTB $0x01, R8
 10217  	JNE   LBB3_8
 10218  	JMP   LBB3_9
 10219  
 10220  DATA dataFromBoolF64<>+0(SB)/4, $+1
 10221  DATA dataFromBoolF64<>+4(SB)/8, $0x3ff0000000000000
 10222  GLOBL dataFromBoolF64<>(SB), RODATA|NOPTR, $12
 10223  
 10224  // func FromBool_AVX2_F64(x []float64, y []bool)
 10225  // Requires: AVX, AVX2
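         // x[i] = 1.0 if y[i] else 0.0. The vector path compares bool bytes to
         // zero, inverts, masks with the int32 1 at dataFromBoolF64+0, and widens
         // via VCVTDQ2PD; the scalar tail selects the float64 1.0 stored at +4.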
 10226  TEXT ·FromBool_AVX2_F64(SB), NOSPLIT, $0-48
 10227  	MOVQ  x_base+0(FP), DI
 10228  	MOVQ  y_base+24(FP), SI
 10229  	MOVQ  x_len+8(FP), DX
 10230  	TESTQ DX, DX
 10231  	JE    LBB4_10
 10232  	CMPQ  DX, $0x10
 10233  	JAE   LBB4_3
 10234  	XORL  AX, AX
 10235  	JMP   LBB4_6
 10236  
 10237  LBB4_3:
 10238  	MOVQ         DX, AX
 10239  	ANDQ         $-16, AX
 10240  	XORL         CX, CX
 10241  	VPXOR        X0, X0, X0
 10242  	VPCMPEQD     X1, X1, X1
 10243  	VPBROADCASTD dataFromBoolF64<>+0(SB), X2
 10244  
 10245  LBB4_4:
 10246  	VMOVD     (SI)(CX*1), X3
 10247  	VMOVD     4(SI)(CX*1), X4
 10248  	VMOVD     8(SI)(CX*1), X5
 10249  	VMOVD     12(SI)(CX*1), X6
 10250  	VPCMPEQB  X0, X3, X3
 10251  	VPXOR     X1, X3, X3
 10252  	VPMOVZXBD X3, X3
 10253  	VPAND     X2, X3, X3
 10254  	VCVTDQ2PD X3, Y3
 10255  	VPCMPEQB  X0, X4, X4
 10256  	VPXOR     X1, X4, X4
 10257  	VPMOVZXBD X4, X4
 10258  	VPAND     X2, X4, X4
 10259  	VCVTDQ2PD X4, Y4
 10260  	VPCMPEQB  X0, X5, X5
 10261  	VPXOR     X1, X5, X5
 10262  	VPMOVZXBD X5, X5
 10263  	VPAND     X2, X5, X5
 10264  	VCVTDQ2PD X5, Y5
 10265  	VPCMPEQB  X0, X6, X6
 10266  	VPXOR     X1, X6, X6
 10267  	VPMOVZXBD X6, X6
 10268  	VPAND     X2, X6, X6
 10269  	VCVTDQ2PD X6, Y6
 10270  	VMOVUPS   Y3, (DI)(CX*8)
 10271  	VMOVUPS   Y4, 32(DI)(CX*8)
 10272  	VMOVUPS   Y5, 64(DI)(CX*8)
 10273  	VMOVUPS   Y6, 96(DI)(CX*8)
 10274  	ADDQ      $0x10, CX
 10275  	CMPQ      AX, CX
 10276  	JNE       LBB4_4
 10277  	CMPQ      AX, DX
 10278  	JNE       LBB4_6
 10279  
 10280  LBB4_10:
 10281  	VZEROUPPER
 10282  	RET
 10283  
 10284  LBB4_6:
 10285  	VMOVQ dataFromBoolF64<>+4(SB), X0
 10286  	JMP   LBB4_7
 10287  
 10288  LBB4_9:
 10289  	VMOVQ X1, (DI)(AX*8)
 10290  	ADDQ  $0x01, AX
 10291  	CMPQ  DX, AX
 10292  	JE    LBB4_10
 10293  
 10294  LBB4_7:
 10295  	CMPB    (SI)(AX*1), $0x00
 10296  	VMOVDQA X0, X1
 10297  	JNE     LBB4_9
 10298  	VPXOR   X1, X1, X1
 10299  	JMP     LBB4_9
 10300  
 10301  DATA dataFromBoolF32<>+0(SB)/4, $+1
 10302  DATA dataFromBoolF32<>+4(SB)/4, $+1065353216
 10303  GLOBL dataFromBoolF32<>(SB), RODATA|NOPTR, $8
 10304  
 10305  // func FromBool_AVX2_F32(x []float32, y []bool)
 10306  // Requires: AVX, AVX2
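         // Float32 variant of FromBool_AVX2_F64: 32 elements per iteration using
         // full-width VPMOVZXBD/VCVTDQ2PS; dataFromBoolF32 holds an int32 1 mask
         // and the float32 1.0 for the scalar tail.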
 10307  TEXT ·FromBool_AVX2_F32(SB), NOSPLIT, $0-48
 10308  	MOVQ  x_base+0(FP), DI
 10309  	MOVQ  y_base+24(FP), SI
 10310  	MOVQ  x_len+8(FP), DX
 10311  	TESTQ DX, DX
 10312  	JE    LBB5_10
 10313  	CMPQ  DX, $0x20
 10314  	JAE   LBB5_3
 10315  	XORL  AX, AX
 10316  	JMP   LBB5_6
 10317  
 10318  LBB5_3:
 10319  	MOVQ         DX, AX
 10320  	ANDQ         $-32, AX
 10321  	XORL         CX, CX
 10322  	VPXOR        X0, X0, X0
 10323  	VPCMPEQD     X1, X1, X1
 10324  	VPBROADCASTD dataFromBoolF32<>+0(SB), Y2
 10325  
 10326  LBB5_4:
 10327  	VMOVQ     (SI)(CX*1), X3
 10328  	VMOVQ     8(SI)(CX*1), X4
 10329  	VMOVQ     16(SI)(CX*1), X5
 10330  	VMOVQ     24(SI)(CX*1), X6
 10331  	VPCMPEQB  X0, X3, X3
 10332  	VPXOR     X1, X3, X3
 10333  	VPMOVZXBD X3, Y3
 10334  	VPAND     Y2, Y3, Y3
 10335  	VCVTDQ2PS Y3, Y3
 10336  	VPCMPEQB  X0, X4, X4
 10337  	VPXOR     X1, X4, X4
 10338  	VPMOVZXBD X4, Y4
 10339  	VPAND     Y2, Y4, Y4
 10340  	VCVTDQ2PS Y4, Y4
 10341  	VPCMPEQB  X0, X5, X5
 10342  	VPXOR     X1, X5, X5
 10343  	VPMOVZXBD X5, Y5
 10344  	VPAND     Y2, Y5, Y5
 10345  	VCVTDQ2PS Y5, Y5
 10346  	VPCMPEQB  X0, X6, X6
 10347  	VPXOR     X1, X6, X6
 10348  	VPMOVZXBD X6, Y6
 10349  	VPAND     Y2, Y6, Y6
 10350  	VCVTDQ2PS Y6, Y6
 10351  	VMOVUPS   Y3, (DI)(CX*4)
 10352  	VMOVUPS   Y4, 32(DI)(CX*4)
 10353  	VMOVUPS   Y5, 64(DI)(CX*4)
 10354  	VMOVUPS   Y6, 96(DI)(CX*4)
 10355  	ADDQ      $0x20, CX
 10356  	CMPQ      AX, CX
 10357  	JNE       LBB5_4
 10358  	CMPQ      AX, DX
 10359  	JNE       LBB5_6
 10360  
 10361  LBB5_10:
 10362  	VZEROUPPER
 10363  	RET
 10364  
 10365  LBB5_6:
 10366  	VMOVD dataFromBoolF32<>+4(SB), X0
 10367  	JMP   LBB5_7
 10368  
 10369  LBB5_9:
 10370  	VMOVD X1, (DI)(AX*4)
 10371  	ADDQ  $0x01, AX
 10372  	CMPQ  DX, AX
 10373  	JE    LBB5_10
 10374  
 10375  LBB5_7:
 10376  	CMPB    (SI)(AX*1), $0x00
 10377  	VMOVDQA X0, X1
 10378  	JNE     LBB5_9
 10379  	VPXOR   X1, X1, X1
 10380  	JMP     LBB5_9
 10381  
 10382  // func FromInt32_AVX2_F64(x []float64, y []int32)
 10383  // Requires: AVX
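         // x[i] = float64(y[i]) via VCVTDQ2PD, 32 elements per unrolled iteration,
         // with a VCVTSI2SDL scalar tail.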
 10384  TEXT ·FromInt32_AVX2_F64(SB), NOSPLIT, $0-48
 10385  	MOVQ  x_base+0(FP), DI
 10386  	MOVQ  y_base+24(FP), SI
 10387  	MOVQ  x_len+8(FP), DX
 10388  	TESTQ DX, DX
 10389  	JE    LBB10_11
 10390  	CMPQ  DX, $0x10
 10391  	JAE   LBB10_3
 10392  	XORL  AX, AX
 10393  	JMP   LBB10_10
 10394  
 10395  LBB10_3:
 10396  	MOVQ  DX, AX
 10397  	ANDQ  $-16, AX
 10398  	LEAQ  -16(AX), CX
 10399  	MOVQ  CX, R8
 10400  	SHRQ  $0x04, R8
 10401  	ADDQ  $0x01, R8
 10402  	TESTQ CX, CX
 10403  	JE    LBB10_4
 10404  	MOVQ  R8, R9
 10405  	ANDQ  $-2, R9
 10406  	XORL  CX, CX
 10407  
 10408  LBB10_6:
 10409  	VCVTDQ2PD (SI)(CX*4), Y0
 10410  	VCVTDQ2PD 16(SI)(CX*4), Y1
 10411  	VCVTDQ2PD 32(SI)(CX*4), Y2
 10412  	VCVTDQ2PD 48(SI)(CX*4), Y3
 10413  	VMOVUPS   Y0, (DI)(CX*8)
 10414  	VMOVUPS   Y1, 32(DI)(CX*8)
 10415  	VMOVUPS   Y2, 64(DI)(CX*8)
 10416  	VMOVUPS   Y3, 96(DI)(CX*8)
 10417  	VCVTDQ2PD 64(SI)(CX*4), Y0
 10418  	VCVTDQ2PD 80(SI)(CX*4), Y1
 10419  	VCVTDQ2PD 96(SI)(CX*4), Y2
 10420  	VCVTDQ2PD 112(SI)(CX*4), Y3
 10421  	VMOVUPD   Y0, 128(DI)(CX*8)
 10422  	VMOVUPS   Y1, 160(DI)(CX*8)
 10423  	VMOVUPS   Y2, 192(DI)(CX*8)
 10424  	VMOVUPS   Y3, 224(DI)(CX*8)
 10425  	ADDQ      $0x20, CX
 10426  	ADDQ      $-2, R9
 10427  	JNE       LBB10_6
 10428  	TESTB     $0x01, R8
 10429  	JE        LBB10_9
 10430  
 10431  LBB10_8:
 10432  	VCVTDQ2PD (SI)(CX*4), Y0
 10433  	VCVTDQ2PD 16(SI)(CX*4), Y1
 10434  	VCVTDQ2PD 32(SI)(CX*4), Y2
 10435  	VCVTDQ2PD 48(SI)(CX*4), Y3
 10436  	VMOVUPD   Y0, (DI)(CX*8)
 10437  	VMOVUPS   Y1, 32(DI)(CX*8)
 10438  	VMOVUPS   Y2, 64(DI)(CX*8)
 10439  	VMOVUPS   Y3, 96(DI)(CX*8)
 10440  
 10441  LBB10_9:
 10442  	CMPQ AX, DX
 10443  	JE   LBB10_11
 10444  
 10445  LBB10_10:
 10446  	VCVTSI2SDL (SI)(AX*4), X4, X0
 10447  	VMOVSD     X0, (DI)(AX*8)
 10448  	ADDQ       $0x01, AX
 10449  	CMPQ       DX, AX
 10450  	JNE        LBB10_10
 10451  
 10452  LBB10_11:
 10453  	VZEROUPPER
 10454  	RET
 10455  
 10456  LBB10_4:
 10457  	XORL  CX, CX
 10458  	TESTB $0x01, R8
 10459  	JNE   LBB10_8
 10460  	JMP   LBB10_9
 10461  
 10462  // func FromInt32_AVX2_F32(x []float32, y []int32)
 10463  // Requires: AVX
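         // x[i] = float32(y[i]) via VCVTDQ2PS, 64 elements per unrolled iteration,
         // with a VCVTSI2SSL scalar tail.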
 10464  TEXT ·FromInt32_AVX2_F32(SB), NOSPLIT, $0-48
 10465  	MOVQ  x_base+0(FP), DI
 10466  	MOVQ  y_base+24(FP), SI
 10467  	MOVQ  x_len+8(FP), DX
 10468  	TESTQ DX, DX
 10469  	JE    LBB11_11
 10470  	CMPQ  DX, $0x20
 10471  	JAE   LBB11_3
 10472  	XORL  AX, AX
 10473  	JMP   LBB11_10
 10474  
 10475  LBB11_3:
 10476  	MOVQ  DX, AX
 10477  	ANDQ  $-32, AX
 10478  	LEAQ  -32(AX), CX
 10479  	MOVQ  CX, R8
 10480  	SHRQ  $0x05, R8
 10481  	ADDQ  $0x01, R8
 10482  	TESTQ CX, CX
 10483  	JE    LBB11_4
 10484  	MOVQ  R8, R9
 10485  	ANDQ  $-2, R9
 10486  	XORL  CX, CX
 10487  
 10488  LBB11_6:
 10489  	VCVTDQ2PS (SI)(CX*4), Y0
 10490  	VCVTDQ2PS 32(SI)(CX*4), Y1
 10491  	VCVTDQ2PS 64(SI)(CX*4), Y2
 10492  	VCVTDQ2PS 96(SI)(CX*4), Y3
 10493  	VMOVUPS   Y0, (DI)(CX*4)
 10494  	VMOVUPS   Y1, 32(DI)(CX*4)
 10495  	VMOVUPS   Y2, 64(DI)(CX*4)
 10496  	VMOVUPS   Y3, 96(DI)(CX*4)
 10497  	VCVTDQ2PS 128(SI)(CX*4), Y0
 10498  	VCVTDQ2PS 160(SI)(CX*4), Y1
 10499  	VCVTDQ2PS 192(SI)(CX*4), Y2
 10500  	VCVTDQ2PS 224(SI)(CX*4), Y3
 10501  	VMOVUPS   Y0, 128(DI)(CX*4)
 10502  	VMOVUPS   Y1, 160(DI)(CX*4)
 10503  	VMOVUPS   Y2, 192(DI)(CX*4)
 10504  	VMOVUPS   Y3, 224(DI)(CX*4)
 10505  	ADDQ      $0x40, CX
 10506  	ADDQ      $-2, R9
 10507  	JNE       LBB11_6
 10508  	TESTB     $0x01, R8
 10509  	JE        LBB11_9
 10510  
 10511  LBB11_8:
 10512  	VCVTDQ2PS (SI)(CX*4), Y0
 10513  	VCVTDQ2PS 32(SI)(CX*4), Y1
 10514  	VCVTDQ2PS 64(SI)(CX*4), Y2
 10515  	VCVTDQ2PS 96(SI)(CX*4), Y3
 10516  	VMOVUPS   Y0, (DI)(CX*4)
 10517  	VMOVUPS   Y1, 32(DI)(CX*4)
 10518  	VMOVUPS   Y2, 64(DI)(CX*4)
 10519  	VMOVUPS   Y3, 96(DI)(CX*4)
 10520  
 10521  LBB11_9:
 10522  	CMPQ AX, DX
 10523  	JE   LBB11_11
 10524  
 10525  LBB11_10:
 10526  	VCVTSI2SSL (SI)(AX*4), X4, X0
 10527  	VMOVSS     X0, (DI)(AX*4)
 10528  	ADDQ       $0x01, AX
 10529  	CMPQ       DX, AX
 10530  	JNE        LBB11_10
 10531  
 10532  LBB11_11:
 10533  	VZEROUPPER
 10534  	RET
 10535  
 10536  LBB11_4:
 10537  	XORL  CX, CX
 10538  	TESTB $0x01, R8
 10539  	JNE   LBB11_8
 10540  	JMP   LBB11_9
 10541  
 10542  // func FromInt64_AVX2_F64(x []float64, y []int64)
 10543  // Requires: AVX
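         // x[i] = float64(y[i]). AVX2 has no packed int64-to-double conversion, so
         // each qword is extracted (VMOVQ/VPEXTRQ), converted with VCVTSI2SDQ, and
         // re-paired with VUNPCKLPD; 32 elements per unrolled iteration.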
 10544  TEXT ·FromInt64_AVX2_F64(SB), NOSPLIT, $0-48
 10545  	MOVQ  x_base+0(FP), DI
 10546  	MOVQ  y_base+24(FP), SI
 10547  	MOVQ  x_len+8(FP), DX
 10548  	TESTQ DX, DX
 10549  	JE    LBB8_11
 10550  	CMPQ  DX, $0x10
 10551  	JAE   LBB8_3
 10552  	XORL  R10, R10
 10553  	JMP   LBB8_10
 10554  
 10555  LBB8_3:
 10556  	MOVQ  DX, R10
 10557  	ANDQ  $-16, R10
 10558  	LEAQ  -16(R10), CX
 10559  	MOVQ  CX, R8
 10560  	SHRQ  $0x04, R8
 10561  	ADDQ  $0x01, R8
 10562  	TESTQ CX, CX
 10563  	JE    LBB8_4
 10564  	MOVQ  R8, R9
 10565  	ANDQ  $-2, R9
 10566  	XORL  CX, CX
 10567  
 10568  LBB8_6:
 10569  	VMOVDQU    (SI)(CX*8), X0
 10570  	VMOVDQU    16(SI)(CX*8), X1
 10571  	VPEXTRQ    $0x01, X0, AX
 10572  	VCVTSI2SDQ AX, X11, X2
 10573  	VMOVDQU    32(SI)(CX*8), X3
 10574  	VMOVQ      X0, AX
 10575  	VCVTSI2SDQ AX, X11, X0
 10576  	VPEXTRQ    $0x01, X1, AX
 10577  	VCVTSI2SDQ AX, X11, X4
 10578  	VMOVDQU    48(SI)(CX*8), X5
 10579  	VMOVQ      X1, AX
 10580  	VCVTSI2SDQ AX, X11, X1
 10581  	VPEXTRQ    $0x01, X5, AX
 10582  	VCVTSI2SDQ AX, X11, X6
 10583  	VUNPCKLPD  X2, X0, X8
 10584  	VMOVQ      X5, AX
 10585  	VCVTSI2SDQ AX, X11, X2
 10586  	VPEXTRQ    $0x01, X3, AX
 10587  	VCVTSI2SDQ AX, X11, X5
 10588  	VUNPCKLPD  X4, X1, X10
 10589  	VMOVQ      X3, AX
 10590  	VCVTSI2SDQ AX, X11, X3
 10591  	VUNPCKLPD  X6, X2, X9
 10592  	VMOVDQU    80(SI)(CX*8), X4
 10593  	VPEXTRQ    $0x01, X4, AX
 10594  	VUNPCKLPD  X5, X3, X3
 10595  	VCVTSI2SDQ AX, X11, X5
 10596  	VMOVQ      X4, AX
 10597  	VCVTSI2SDQ AX, X11, X4
 10598  	VUNPCKLPD  X5, X4, X4
 10599  	VMOVDQU    64(SI)(CX*8), X5
 10600  	VPEXTRQ    $0x01, X5, AX
 10601  	VCVTSI2SDQ AX, X11, X6
 10602  	VMOVQ      X5, AX
 10603  	VCVTSI2SDQ AX, X11, X5
 10604  	VMOVDQU    112(SI)(CX*8), X7
 10605  	VPEXTRQ    $0x01, X7, AX
 10606  	VCVTSI2SDQ AX, X11, X0
 10607  	VMOVQ      X7, AX
 10608  	VCVTSI2SDQ AX, X11, X7
 10609  	VMOVDQU    96(SI)(CX*8), X2
 10610  	VPEXTRQ    $0x01, X2, AX
 10611  	VCVTSI2SDQ AX, X11, X1
 10612  	VUNPCKLPD  X6, X5, X5
 10613  	VMOVQ      X2, AX
 10614  	VCVTSI2SDQ AX, X11, X2
 10615  	VUNPCKLPD  X0, X7, X0
 10616  	VUNPCKLPD  X1, X2, X1
 10617  	VMOVUPD    X10, 16(DI)(CX*8)
 10618  	VMOVUPD    X8, (DI)(CX*8)
 10619  	VMOVUPD    X3, 32(DI)(CX*8)
 10620  	VMOVUPD    X9, 48(DI)(CX*8)
 10621  	VMOVUPD    X5, 64(DI)(CX*8)
 10622  	VMOVUPD    X4, 80(DI)(CX*8)
 10623  	VMOVUPD    X1, 96(DI)(CX*8)
 10624  	VMOVUPD    X0, 112(DI)(CX*8)
 10625  	VMOVDQU    128(SI)(CX*8), X0
 10626  	VMOVDQU    144(SI)(CX*8), X1
 10627  	VPEXTRQ    $0x01, X0, AX
 10628  	VCVTSI2SDQ AX, X11, X2
 10629  	VMOVDQU    160(SI)(CX*8), X3
 10630  	VMOVQ      X0, AX
 10631  	VCVTSI2SDQ AX, X11, X0
 10632  	VPEXTRQ    $0x01, X1, AX
 10633  	VCVTSI2SDQ AX, X11, X4
 10634  	VMOVDQU    176(SI)(CX*8), X5
 10635  	VMOVQ      X1, AX
 10636  	VCVTSI2SDQ AX, X11, X1
 10637  	VPEXTRQ    $0x01, X5, AX
 10638  	VCVTSI2SDQ AX, X11, X6
 10639  	VUNPCKLPD  X2, X0, X8
 10640  	VMOVQ      X5, AX
 10641  	VCVTSI2SDQ AX, X11, X2
 10642  	VPEXTRQ    $0x01, X3, AX
 10643  	VCVTSI2SDQ AX, X11, X5
 10644  	VUNPCKLPD  X4, X1, X10
 10645  	VMOVQ      X3, AX
 10646  	VCVTSI2SDQ AX, X11, X3
 10647  	VUNPCKLPD  X6, X2, X9
 10648  	VMOVDQU    208(SI)(CX*8), X4
 10649  	VPEXTRQ    $0x01, X4, AX
 10650  	VUNPCKLPD  X5, X3, X3
 10651  	VCVTSI2SDQ AX, X11, X5
 10652  	VMOVQ      X4, AX
 10653  	VCVTSI2SDQ AX, X11, X4
 10654  	VUNPCKLPD  X5, X4, X4
 10655  	VMOVDQU    192(SI)(CX*8), X5
 10656  	VPEXTRQ    $0x01, X5, AX
 10657  	VCVTSI2SDQ AX, X11, X6
 10658  	VMOVQ      X5, AX
 10659  	VCVTSI2SDQ AX, X11, X5
 10660  	VMOVDQU    240(SI)(CX*8), X7
 10661  	VPEXTRQ    $0x01, X7, AX
 10662  	VCVTSI2SDQ AX, X11, X0
 10663  	VMOVQ      X7, AX
 10664  	VCVTSI2SDQ AX, X11, X7
 10665  	VMOVDQU    224(SI)(CX*8), X2
 10666  	VPEXTRQ    $0x01, X2, AX
 10667  	VCVTSI2SDQ AX, X11, X1
 10668  	VUNPCKLPD  X6, X5, X5
 10669  	VMOVQ      X2, AX
 10670  	VCVTSI2SDQ AX, X11, X2
 10671  	VUNPCKLPD  X0, X7, X0
 10672  	VUNPCKLPD  X1, X2, X1
 10673  	VMOVUPD    X10, 144(DI)(CX*8)
 10674  	VMOVUPD    X8, 128(DI)(CX*8)
 10675  	VMOVUPD    X3, 160(DI)(CX*8)
 10676  	VMOVUPD    X9, 176(DI)(CX*8)
 10677  	VMOVUPD    X5, 192(DI)(CX*8)
 10678  	VMOVUPD    X4, 208(DI)(CX*8)
 10679  	VMOVUPD    X1, 224(DI)(CX*8)
 10680  	VMOVUPD    X0, 240(DI)(CX*8)
 10681  	ADDQ       $0x20, CX
 10682  	ADDQ       $-2, R9
 10683  	JNE        LBB8_6
 10684  	TESTB      $0x01, R8
 10685  	JE         LBB8_9
 10686  
 10687  LBB8_8:
 10688  	VMOVDQU    (SI)(CX*8), X0
 10689  	VMOVDQU    16(SI)(CX*8), X1
 10690  	VMOVDQU    32(SI)(CX*8), X3
 10691  	VMOVDQU    48(SI)(CX*8), X2
 10692  	VPEXTRQ    $0x01, X0, AX
 10693  	VCVTSI2SDQ AX, X11, X4
 10694  	VMOVQ      X0, AX
 10695  	VCVTSI2SDQ AX, X11, X0
 10696  	VUNPCKLPD  X4, X0, X8
 10697  	VPEXTRQ    $0x01, X1, AX
 10698  	VCVTSI2SDQ AX, X11, X4
 10699  	VMOVQ      X1, AX
 10700  	VCVTSI2SDQ AX, X11, X1
 10701  	VUNPCKLPD  X4, X1, X1
 10702  	VPEXTRQ    $0x01, X2, AX
 10703  	VCVTSI2SDQ AX, X11, X4
 10704  	VMOVQ      X2, AX
 10705  	VCVTSI2SDQ AX, X11, X2
 10706  	VUNPCKLPD  X4, X2, X2
 10707  	VPEXTRQ    $0x01, X3, AX
 10708  	VCVTSI2SDQ AX, X11, X4
 10709  	VMOVQ      X3, AX
 10710  	VCVTSI2SDQ AX, X11, X3
 10711  	VMOVDQU    80(SI)(CX*8), X5
 10712  	VPEXTRQ    $0x01, X5, AX
 10713  	VCVTSI2SDQ AX, X11, X6
 10714  	VMOVQ      X5, AX
 10715  	VCVTSI2SDQ AX, X11, X5
 10716  	VMOVDQU    64(SI)(CX*8), X7
 10717  	VPEXTRQ    $0x01, X7, AX
 10718  	VCVTSI2SDQ AX, X11, X0
 10719  	VUNPCKLPD  X4, X3, X3
 10720  	VMOVQ      X7, AX
 10721  	VCVTSI2SDQ AX, X11, X4
 10722  	VUNPCKLPD  X6, X5, X5
 10723  	VMOVDQU    112(SI)(CX*8), X6
 10724  	VPEXTRQ    $0x01, X6, AX
 10725  	VUNPCKLPD  X0, X4, X0
 10726  	VCVTSI2SDQ AX, X11, X4
 10727  	VMOVQ      X6, AX
 10728  	VCVTSI2SDQ AX, X11, X6
 10729  	VUNPCKLPD  X4, X6, X4
 10730  	VMOVDQU    96(SI)(CX*8), X6
 10731  	VPEXTRQ    $0x01, X6, AX
 10732  	VCVTSI2SDQ AX, X11, X7
 10733  	VMOVQ      X6, AX
 10734  	VCVTSI2SDQ AX, X11, X6
 10735  	VUNPCKLPD  X7, X6, X6
 10736  	VMOVUPD    X1, 16(DI)(CX*8)
 10737  	VMOVUPD    X8, (DI)(CX*8)
 10738  	VMOVUPD    X3, 32(DI)(CX*8)
 10739  	VMOVUPD    X2, 48(DI)(CX*8)
 10740  	VMOVUPD    X0, 64(DI)(CX*8)
 10741  	VMOVUPD    X5, 80(DI)(CX*8)
 10742  	VMOVUPD    X6, 96(DI)(CX*8)
 10743  	VMOVUPD    X4, 112(DI)(CX*8)
 10744  
 10745  LBB8_9:
 10746  	CMPQ R10, DX
 10747  	JE   LBB8_11
 10748  
 10749  LBB8_10:
 10750  	VCVTSI2SDQ (SI)(R10*8), X11, X0
 10751  	VMOVSD     X0, (DI)(R10*8)
 10752  	ADDQ       $0x01, R10
 10753  	CMPQ       DX, R10
 10754  	JNE        LBB8_10
 10755  
 10756  LBB8_11:
 10757  	RET
 10758  
 10759  LBB8_4:
 10760  	XORL  CX, CX
 10761  	TESTB $0x01, R8
 10762  	JNE   LBB8_8
 10763  	JMP   LBB8_9
 10764  
 10765  // func FromInt64_AVX2_F32(x []float32, y []int64)
 10766  // Requires: AVX
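         // x[i] = float32(y[i]); the same extract-and-convert scheme as
         // FromInt64_AVX2_F64, reassembling lanes with VINSERTPS.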
 10767  TEXT ·FromInt64_AVX2_F32(SB), NOSPLIT, $0-48
 10768  	MOVQ  x_base+0(FP), DI
 10769  	MOVQ  y_base+24(FP), SI
 10770  	MOVQ  x_len+8(FP), DX
 10771  	TESTQ DX, DX
 10772  	JE    LBB9_11
 10773  	CMPQ  DX, $0x10
 10774  	JAE   LBB9_3
 10775  	XORL  R11, R11
 10776  	JMP   LBB9_10
 10777  
 10778  LBB9_3:
 10779  	MOVQ  DX, R11
 10780  	ANDQ  $-16, R11
 10781  	LEAQ  -16(R11), CX
 10782  	MOVQ  CX, R8
 10783  	SHRQ  $0x04, R8
 10784  	ADDQ  $0x01, R8
 10785  	TESTQ CX, CX
 10786  	JE    LBB9_4
 10787  	MOVQ  R8, R9
 10788  	ANDQ  $-2, R9
 10789  	XORL  CX, CX
 10790  
 10791  LBB9_6:
 10792  	VMOVDQU    (SI)(CX*8), X0
 10793  	VPEXTRQ    $0x01, X0, R10
 10794  	VMOVDQU    16(SI)(CX*8), X1
 10795  	VCVTSI2SSQ R10, X8, X2
 10796  	VMOVQ      X0, AX
 10797  	VCVTSI2SSQ AX, X8, X0
 10798  	VMOVQ      X1, AX
 10799  	VCVTSI2SSQ AX, X8, X3
 10800  	VPEXTRQ    $0x01, X1, AX
 10801  	VCVTSI2SSQ AX, X8, X1
 10802  	VMOVDQU    32(SI)(CX*8), X4
 10803  	VPEXTRQ    $0x01, X4, AX
 10804  	VMOVDQU    48(SI)(CX*8), X5
 10805  	VCVTSI2SSQ AX, X8, X6
 10806  	VMOVQ      X4, AX
 10807  	VCVTSI2SSQ AX, X8, X4
 10808  	VMOVQ      X5, AX
 10809  	VCVTSI2SSQ AX, X8, X7
 10810  	VINSERTPS  $0x10, X2, X0, X0
 10811  	VINSERTPS  $0x20, X3, X0, X0
 10812  	VPEXTRQ    $0x01, X5, AX
 10813  	VINSERTPS  $0x30, X1, X0, X0
 10814  	VCVTSI2SSQ AX, X8, X1
 10815  	VINSERTPS  $0x10, X6, X4, X2
 10816  	VMOVDQU    64(SI)(CX*8), X3
 10817  	VPEXTRQ    $0x01, X3, AX
 10818  	VCVTSI2SSQ AX, X8, X4
 10819  	VMOVQ      X3, AX
 10820  	VCVTSI2SSQ AX, X8, X3
 10821  	VMOVDQU    80(SI)(CX*8), X5
 10822  	VMOVQ      X5, AX
 10823  	VCVTSI2SSQ AX, X8, X6
 10824  	VINSERTPS  $0x20, X7, X2, X2
 10825  	VINSERTPS  $0x30, X1, X2, X1
 10826  	VPEXTRQ    $0x01, X5, AX
 10827  	VINSERTPS  $0x10, X4, X3, X2
 10828  	VCVTSI2SSQ AX, X8, X3
 10829  	VINSERTPS  $0x20, X6, X2, X2
 10830  	VMOVDQU    96(SI)(CX*8), X4
 10831  	VPEXTRQ    $0x01, X4, AX
 10832  	VCVTSI2SSQ AX, X8, X5
 10833  	VMOVQ      X4, AX
 10834  	VCVTSI2SSQ AX, X8, X4
 10835  	VMOVDQU    112(SI)(CX*8), X6
 10836  	VMOVQ      X6, AX
 10837  	VCVTSI2SSQ AX, X8, X7
 10838  	VINSERTPS  $0x30, X3, X2, X2
 10839  	VINSERTPS  $0x10, X5, X4, X3
 10840  	VPEXTRQ    $0x01, X6, AX
 10841  	VINSERTPS  $0x20, X7, X3, X3
 10842  	VCVTSI2SSQ AX, X8, X4
 10843  	VINSERTPS  $0x30, X4, X3, X3
 10844  	VMOVUPS    X0, (DI)(CX*4)
 10845  	VMOVUPS    X1, 16(DI)(CX*4)
 10846  	VMOVUPS    X2, 32(DI)(CX*4)
 10847  	VMOVUPS    X3, 48(DI)(CX*4)
 10848  	VMOVDQU    128(SI)(CX*8), X0
 10849  	VPEXTRQ    $0x01, X0, AX
 10850  	VMOVDQU    144(SI)(CX*8), X1
 10851  	VCVTSI2SSQ AX, X8, X2
 10852  	VMOVQ      X0, AX
 10853  	VCVTSI2SSQ AX, X8, X0
 10854  	VMOVQ      X1, AX
 10855  	VCVTSI2SSQ AX, X8, X3
 10856  	VPEXTRQ    $0x01, X1, AX
 10857  	VCVTSI2SSQ AX, X8, X1
 10858  	VMOVDQU    160(SI)(CX*8), X4
 10859  	VPEXTRQ    $0x01, X4, AX
 10860  	VCVTSI2SSQ AX, X8, X5
 10861  	VMOVQ      X4, AX
 10862  	VCVTSI2SSQ AX, X8, X4
 10863  	VINSERTPS  $0x10, X2, X0, X0
 10864  	VMOVDQU    176(SI)(CX*8), X2
 10865  	VPEXTRQ    $0x01, X2, R10
 10866  	VMOVQ      X2, AX
 10867  	VCVTSI2SSQ AX, X8, X2
 10868  	VINSERTPS  $0x20, X3, X0, X0
 10869  	VCVTSI2SSQ R10, X8, X3
 10870  	VINSERTPS  $0x30, X1, X0, X0
 10871  	VMOVDQU    192(SI)(CX*8), X1
 10872  	VPEXTRQ    $0x01, X1, AX
 10873  	VINSERTPS  $0x10, X5, X4, X4
 10874  	VCVTSI2SSQ AX, X8, X5
 10875  	VMOVQ      X1, AX
 10876  	VCVTSI2SSQ AX, X8, X1
 10877  	VINSERTPS  $0x20, X2, X4, X2
 10878  	VMOVDQU    208(SI)(CX*8), X4
 10879  	VPEXTRQ    $0x01, X4, R10
 10880  	VMOVQ      X4, AX
 10881  	VCVTSI2SSQ AX, X8, X4
 10882  	VINSERTPS  $0x30, X3, X2, X2
 10883  	VCVTSI2SSQ R10, X8, X3
 10884  	VINSERTPS  $0x10, X5, X1, X1
 10885  	VMOVDQU    224(SI)(CX*8), X5
 10886  	VPEXTRQ    $0x01, X5, AX
 10887  	VINSERTPS  $0x20, X4, X1, X1
 10888  	VCVTSI2SSQ AX, X8, X4
 10889  	VMOVQ      X5, AX
 10890  	VCVTSI2SSQ AX, X8, X5
 10891  	VINSERTPS  $0x30, X3, X1, X1
 10892  	VMOVDQU    240(SI)(CX*8), X3
 10893  	VPEXTRQ    $0x01, X3, R10
 10894  	VMOVQ      X3, AX
 10895  	VCVTSI2SSQ AX, X8, X3
 10896  	VINSERTPS  $0x10, X4, X5, X4
 10897  	VCVTSI2SSQ R10, X8, X5
 10898  	VINSERTPS  $0x20, X3, X4, X3
 10899  	VINSERTPS  $0x30, X5, X3, X3
 10900  	VMOVUPS    X0, 64(DI)(CX*4)
 10901  	VMOVUPS    X2, 80(DI)(CX*4)
 10902  	VMOVUPS    X1, 96(DI)(CX*4)
 10903  	VMOVUPS    X3, 112(DI)(CX*4)
 10904  	ADDQ       $0x20, CX
 10905  	ADDQ       $-2, R9
 10906  	JNE        LBB9_6
 10907  	TESTB      $0x01, R8
 10908  	JE         LBB9_9
 10909  
 10910  LBB9_8:
 10911  	VMOVDQU    (SI)(CX*8), X0
 10912  	VPEXTRQ    $0x01, X0, AX
 10913  	VMOVDQU    16(SI)(CX*8), X1
 10914  	VCVTSI2SSQ AX, X8, X2
 10915  	VMOVQ      X0, AX
 10916  	VCVTSI2SSQ AX, X8, X0
 10917  	VMOVQ      X1, AX
 10918  	VCVTSI2SSQ AX, X8, X3
 10919  	VPEXTRQ    $0x01, X1, AX
 10920  	VCVTSI2SSQ AX, X8, X1
 10921  	VMOVDQU    32(SI)(CX*8), X4
 10922  	VMOVDQU    48(SI)(CX*8), X5
 10923  	VPEXTRQ    $0x01, X4, AX
 10924  	VINSERTPS  $0x10, X2, X0, X0
 10925  	VCVTSI2SSQ AX, X8, X2
 10926  	VMOVQ      X4, AX
 10927  	VCVTSI2SSQ AX, X8, X4
 10928  	VMOVQ      X5, AX
 10929  	VCVTSI2SSQ AX, X8, X6
 10930  	VINSERTPS  $0x20, X3, X0, X0
 10931  	VINSERTPS  $0x30, X1, X0, X0
 10932  	VPEXTRQ    $0x01, X5, AX
 10933  	VINSERTPS  $0x10, X2, X4, X1
 10934  	VCVTSI2SSQ AX, X8, X2
 10935  	VINSERTPS  $0x20, X6, X1, X1
 10936  	VMOVDQU    64(SI)(CX*8), X3
 10937  	VPEXTRQ    $0x01, X3, AX
 10938  	VCVTSI2SSQ AX, X8, X4
 10939  	VMOVQ      X3, AX
 10940  	VCVTSI2SSQ AX, X8, X3
 10941  	VMOVDQU    80(SI)(CX*8), X5
 10942  	VMOVQ      X5, AX
 10943  	VCVTSI2SSQ AX, X8, X6
 10944  	VINSERTPS  $0x30, X2, X1, X1
 10945  	VINSERTPS  $0x10, X4, X3, X2
 10946  	VPEXTRQ    $0x01, X5, AX
 10947  	VINSERTPS  $0x20, X6, X2, X2
 10948  	VCVTSI2SSQ AX, X8, X3
 10949  	VINSERTPS  $0x30, X3, X2, X2
 10950  	VMOVDQU    96(SI)(CX*8), X3
 10951  	VPEXTRQ    $0x01, X3, AX
 10952  	VCVTSI2SSQ AX, X8, X4
 10953  	VMOVQ      X3, AX
 10954  	VCVTSI2SSQ AX, X8, X3
 10955  	VMOVDQU    112(SI)(CX*8), X5
 10956  	VMOVQ      X5, AX
 10957  	VCVTSI2SSQ AX, X8, X6
 10958  	VINSERTPS  $0x10, X4, X3, X3
 10959  	VINSERTPS  $0x20, X6, X3, X3
 10960  	VPEXTRQ    $0x01, X5, AX
 10961  	VCVTSI2SSQ AX, X8, X4
 10962  	VINSERTPS  $0x30, X4, X3, X3
 10963  	VMOVUPS    X0, (DI)(CX*4)
 10964  	VMOVUPS    X1, 16(DI)(CX*4)
 10965  	VMOVUPS    X2, 32(DI)(CX*4)
 10966  	VMOVUPS    X3, 48(DI)(CX*4)
 10967  
 10968  LBB9_9:
 10969  	CMPQ R11, DX
 10970  	JE   LBB9_11
 10971  
 10972  LBB9_10:
 10973  	VCVTSI2SSQ (SI)(R11*8), X8, X0
 10974  	VMOVSS     X0, (DI)(R11*4)
 10975  	ADDQ       $0x01, R11
 10976  	CMPQ       DX, R11
 10977  	JNE        LBB9_10
 10978  
 10979  LBB9_11:
 10980  	RET
 10981  
 10982  LBB9_4:
 10983  	XORL  CX, CX
 10984  	TESTB $0x01, R8
 10985  	JNE   LBB9_8
 10986  	JMP   LBB9_9
 10987  
 10988  // func FromFloat32_AVX2_F64(x []float64, y []float32)
 10989  // Requires: AVX
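         // x[i] = float64(y[i]) via VCVTPS2PD, 32 elements per unrolled iteration.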
 10990  TEXT ·FromFloat32_AVX2_F64(SB), NOSPLIT, $0-48
 10991  	MOVQ  x_base+0(FP), DI
 10992  	MOVQ  y_base+24(FP), SI
 10993  	MOVQ  x_len+8(FP), DX
 10994  	TESTQ DX, DX
 10995  	JE    LBB6_11
 10996  	CMPQ  DX, $0x10
 10997  	JAE   LBB6_3
 10998  	XORL  AX, AX
 10999  	JMP   LBB6_10
 11000  
 11001  LBB6_3:
 11002  	MOVQ  DX, AX
 11003  	ANDQ  $-16, AX
 11004  	LEAQ  -16(AX), CX
 11005  	MOVQ  CX, R8
 11006  	SHRQ  $0x04, R8
 11007  	ADDQ  $0x01, R8
 11008  	TESTQ CX, CX
 11009  	JE    LBB6_4
 11010  	MOVQ  R8, R9
 11011  	ANDQ  $-2, R9
 11012  	XORL  CX, CX
 11013  
 11014  LBB6_6:
 11015  	VCVTPS2PD (SI)(CX*4), Y0
 11016  	VCVTPS2PD 16(SI)(CX*4), Y1
 11017  	VCVTPS2PD 32(SI)(CX*4), Y2
 11018  	VCVTPS2PD 48(SI)(CX*4), Y3
 11019  	VMOVUPS   Y0, (DI)(CX*8)
 11020  	VMOVUPS   Y1, 32(DI)(CX*8)
 11021  	VMOVUPS   Y2, 64(DI)(CX*8)
 11022  	VMOVUPS   Y3, 96(DI)(CX*8)
 11023  	VCVTPS2PD 64(SI)(CX*4), Y0
 11024  	VCVTPS2PD 80(SI)(CX*4), Y1
 11025  	VCVTPS2PD 96(SI)(CX*4), Y2
 11026  	VCVTPS2PD 112(SI)(CX*4), Y3
 11027  	VMOVUPS   Y0, 128(DI)(CX*8)
 11028  	VMOVUPS   Y1, 160(DI)(CX*8)
 11029  	VMOVUPS   Y2, 192(DI)(CX*8)
 11030  	VMOVUPS   Y3, 224(DI)(CX*8)
 11031  	ADDQ      $0x20, CX
 11032  	ADDQ      $-2, R9
 11033  	JNE       LBB6_6
 11034  	TESTB     $0x01, R8
 11035  	JE        LBB6_9
 11036  
 11037  LBB6_8:
 11038  	VCVTPS2PD (SI)(CX*4), Y0
 11039  	VCVTPS2PD 16(SI)(CX*4), Y1
 11040  	VCVTPS2PD 32(SI)(CX*4), Y2
 11041  	VCVTPS2PD 48(SI)(CX*4), Y3
 11042  	VMOVUPS   Y0, (DI)(CX*8)
 11043  	VMOVUPS   Y1, 32(DI)(CX*8)
 11044  	VMOVUPS   Y2, 64(DI)(CX*8)
 11045  	VMOVUPS   Y3, 96(DI)(CX*8)
 11046  
 11047  LBB6_9:
 11048  	CMPQ AX, DX
 11049  	JE   LBB6_11
 11050  
 11051  LBB6_10:
 11052  	VMOVSS    (SI)(AX*4), X0
 11053  	VCVTSS2SD X0, X0, X0
 11054  	VMOVSD    X0, (DI)(AX*8)
 11055  	ADDQ      $0x01, AX
 11056  	CMPQ      DX, AX
 11057  	JNE       LBB6_10
 11058  
 11059  LBB6_11:
 11060  	VZEROUPPER
 11061  	RET
 11062  
 11063  LBB6_4:
 11064  	XORL  CX, CX
 11065  	TESTB $0x01, R8
 11066  	JNE   LBB6_8
 11067  	JMP   LBB6_9
 11068  
 11069  // func FromFloat64_AVX2_F32(x []float32, y []float64)
 11070  // Requires: AVX
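         // x[i] = float32(y[i]) via VCVTPD2PSY, 32 elements per unrolled iteration.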
 11071  TEXT ·FromFloat64_AVX2_F32(SB), NOSPLIT, $0-48
 11072  	MOVQ  x_base+0(FP), DI
 11073  	MOVQ  y_base+24(FP), SI
 11074  	MOVQ  x_len+8(FP), DX
 11075  	TESTQ DX, DX
 11076  	JE    LBB7_11
 11077  	CMPQ  DX, $0x10
 11078  	JAE   LBB7_3
 11079  	XORL  AX, AX
 11080  	JMP   LBB7_10
 11081  
 11082  LBB7_3:
 11083  	MOVQ  DX, AX
 11084  	ANDQ  $-16, AX
 11085  	LEAQ  -16(AX), CX
 11086  	MOVQ  CX, R8
 11087  	SHRQ  $0x04, R8
 11088  	ADDQ  $0x01, R8
 11089  	TESTQ CX, CX
 11090  	JE    LBB7_4
 11091  	MOVQ  R8, R9
 11092  	ANDQ  $-2, R9
 11093  	XORL  CX, CX
 11094  
 11095  LBB7_6:
 11096  	VCVTPD2PSY (SI)(CX*8), X0
 11097  	VCVTPD2PSY 32(SI)(CX*8), X1
 11098  	VCVTPD2PSY 64(SI)(CX*8), X2
 11099  	VCVTPD2PSY 96(SI)(CX*8), X3
 11100  	VMOVUPD    X0, (DI)(CX*4)
 11101  	VMOVUPD    X1, 16(DI)(CX*4)
 11102  	VMOVUPD    X2, 32(DI)(CX*4)
 11103  	VMOVUPD    X3, 48(DI)(CX*4)
 11104  	VCVTPD2PSY 128(SI)(CX*8), X0
 11105  	VCVTPD2PSY 160(SI)(CX*8), X1
 11106  	VCVTPD2PSY 192(SI)(CX*8), X2
 11107  	VCVTPD2PSY 224(SI)(CX*8), X3
 11108  	VMOVUPD    X0, 64(DI)(CX*4)
 11109  	VMOVUPD    X1, 80(DI)(CX*4)
 11110  	VMOVUPD    X2, 96(DI)(CX*4)
 11111  	VMOVUPD    X3, 112(DI)(CX*4)
 11112  	ADDQ       $0x20, CX
 11113  	ADDQ       $-2, R9
 11114  	JNE        LBB7_6
 11115  	TESTB      $0x01, R8
 11116  	JE         LBB7_9
 11117  
 11118  LBB7_8:
 11119  	VCVTPD2PSY (SI)(CX*8), X0
 11120  	VCVTPD2PSY 32(SI)(CX*8), X1
 11121  	VCVTPD2PSY 64(SI)(CX*8), X2
 11122  	VCVTPD2PSY 96(SI)(CX*8), X3
 11123  	VMOVUPD    X0, (DI)(CX*4)
 11124  	VMOVUPD    X1, 16(DI)(CX*4)
 11125  	VMOVUPD    X2, 32(DI)(CX*4)
 11126  	VMOVUPD    X3, 48(DI)(CX*4)
 11127  
 11128  LBB7_9:
 11129  	CMPQ AX, DX
 11130  	JE   LBB7_11
 11131  
 11132  LBB7_10:
 11133  	VMOVSD    (SI)(AX*8), X0
 11134  	VCVTSD2SS X0, X0, X0
 11135  	VMOVSS    X0, (DI)(AX*4)
 11136  	ADDQ      $0x01, AX
 11137  	CMPQ      DX, AX
 11138  	JNE       LBB7_10
 11139  
 11140  LBB7_11:
 11141  	RET
 11142  
 11143  LBB7_4:
 11144  	XORL  CX, CX
 11145  	TESTB $0x01, R8
 11146  	JNE   LBB7_8
 11147  	JMP   LBB7_9
 11148  
 11149  DATA dataToBoolF64<>+0(SB)/1, $+1
 11150  DATA dataToBoolF64<>+1(SB)/1, $+1
 11151  DATA dataToBoolF64<>+2(SB)/1, $+1
 11152  DATA dataToBoolF64<>+3(SB)/1, $+1
 11153  DATA dataToBoolF64<>+4(SB)/1, $+0
 11154  DATA dataToBoolF64<>+5(SB)/1, $+0
 11155  DATA dataToBoolF64<>+6(SB)/1, $+0
 11156  DATA dataToBoolF64<>+7(SB)/1, $+0
 11157  DATA dataToBoolF64<>+8(SB)/1, $+0
 11158  DATA dataToBoolF64<>+9(SB)/1, $+0
 11159  DATA dataToBoolF64<>+10(SB)/1, $+0
 11160  DATA dataToBoolF64<>+11(SB)/1, $+0
 11161  DATA dataToBoolF64<>+12(SB)/1, $+0
 11162  DATA dataToBoolF64<>+13(SB)/1, $+0
 11163  DATA dataToBoolF64<>+14(SB)/1, $+0
 11164  DATA dataToBoolF64<>+15(SB)/1, $+0
 11165  GLOBL dataToBoolF64<>(SB), RODATA|NOPTR, $16
 11166  
 11167  // func ToBool_AVX2_F64(x []bool, y []float64)
 11168  // Requires: AVX, AVX2
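         // x[i] = (y[i] != 0). VCMPPD with predicate 4 (NEQ, unordered) builds
         // qword masks that are packed down to bytes and ANDed with the 0x01 bytes
         // of dataToBoolF64; the scalar tail uses VUCOMISD/SETNE.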
 11169  TEXT ·ToBool_AVX2_F64(SB), NOSPLIT, $0-48
 11170  	MOVQ  x_base+0(FP), DI
 11171  	MOVQ  y_base+24(FP), SI
 11172  	MOVQ  x_len+8(FP), DX
 11173  	TESTQ DX, DX
 11174  	JE    LBB12_8
 11175  	CMPQ  DX, $0x10
 11176  	JAE   LBB12_3
 11177  	XORL  AX, AX
 11178  	JMP   LBB12_6
 11179  
 11180  LBB12_3:
 11181  	MOVQ    DX, AX
 11182  	ANDQ    $-16, AX
 11183  	XORL    CX, CX
 11184  	VXORPD  X0, X0, X0
 11185  	VMOVDQU dataToBoolF64<>+0(SB), X1
 11186  
 11187  LBB12_4:
 11188  	VCMPPD       $0x04, (SI)(CX*8), Y0, Y2
 11189  	VEXTRACTF128 $0x01, Y2, X3
 11190  	VPACKSSDW    X3, X2, X2
 11191  	VPACKSSDW    X2, X2, X2
 11192  	VPACKSSWB    X2, X2, X2
 11193  	VCMPPD       $0x04, 32(SI)(CX*8), Y0, Y3
 11194  	VPAND        X1, X2, X2
 11195  	VEXTRACTF128 $0x01, Y3, X4
 11196  	VPACKSSDW    X4, X3, X3
 11197  	VPACKSSDW    X3, X3, X3
 11198  	VPACKSSWB    X3, X3, X3
 11199  	VPAND        X1, X3, X3
 11200  	VCMPPD       $0x04, 64(SI)(CX*8), Y0, Y4
 11201  	VPUNPCKLDQ   X3, X2, X2
 11202  	VEXTRACTF128 $0x01, Y4, X3
 11203  	VPACKSSDW    X3, X4, X3
 11204  	VPACKSSDW    X3, X3, X3
 11205  	VPACKSSWB    X3, X3, X3
 11206  	VPAND        X1, X3, X3
 11207  	VCMPPD       $0x04, 96(SI)(CX*8), Y0, Y4
 11208  	VEXTRACTF128 $0x01, Y4, X5
 11209  	VPACKSSDW    X5, X4, X4
 11210  	VPACKSSDW    X4, X4, X4
 11211  	VPACKSSWB    X4, X4, X4
 11212  	VPAND        X1, X4, X4
 11213  	VPBROADCASTD X4, X4
 11214  	VPBROADCASTD X3, X3
 11215  	VPUNPCKLDQ   X4, X3, X3
 11216  	VPBLENDD     $0x0c, X3, X2, X2
 11217  	VMOVDQU      X2, (DI)(CX*1)
 11218  	ADDQ         $0x10, CX
 11219  	CMPQ         AX, CX
 11220  	JNE          LBB12_4
 11221  	CMPQ         AX, DX
 11222  	JE           LBB12_8
 11223  
 11224  LBB12_6:
 11225  	VXORPD X0, X0, X0
 11226  
 11227  LBB12_7:
 11228  	VUCOMISD (SI)(AX*8), X0
 11229  	SETNE    (DI)(AX*1)
 11230  	ADDQ     $0x01, AX
 11231  	CMPQ     DX, AX
 11232  	JNE      LBB12_7
 11233  
 11234  LBB12_8:
 11235  	VZEROUPPER
 11236  	RET
 11237  
 11238  DATA dataToBoolF32<>+0(SB)/1, $+1
 11239  DATA dataToBoolF32<>+1(SB)/1, $+1
 11240  DATA dataToBoolF32<>+2(SB)/1, $+1
 11241  DATA dataToBoolF32<>+3(SB)/1, $+1
 11242  DATA dataToBoolF32<>+4(SB)/1, $+1
 11243  DATA dataToBoolF32<>+5(SB)/1, $+1
 11244  DATA dataToBoolF32<>+6(SB)/1, $+1
 11245  DATA dataToBoolF32<>+7(SB)/1, $+1
 11246  DATA dataToBoolF32<>+8(SB)/1, $+0
 11247  DATA dataToBoolF32<>+9(SB)/1, $+0
 11248  DATA dataToBoolF32<>+10(SB)/1, $+0
 11249  DATA dataToBoolF32<>+11(SB)/1, $+0
 11250  DATA dataToBoolF32<>+12(SB)/1, $+0
 11251  DATA dataToBoolF32<>+13(SB)/1, $+0
 11252  DATA dataToBoolF32<>+14(SB)/1, $+0
 11253  DATA dataToBoolF32<>+15(SB)/1, $+0
 11254  GLOBL dataToBoolF32<>(SB), RODATA|NOPTR, $16
 11255  
 11256  // func ToBool_AVX2_F32(x []bool, y []float32)
 11257  // Requires: AVX, AVX2
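         // Float32 variant of ToBool_AVX2_F64: VCMPPS masks packed to bytes via
         // the dataToBoolF32 mask, 32 results stored per iteration.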
 11258  TEXT ·ToBool_AVX2_F32(SB), NOSPLIT, $0-48
 11259  	MOVQ  x_base+0(FP), DI
 11260  	MOVQ  y_base+24(FP), SI
 11261  	MOVQ  x_len+8(FP), DX
 11262  	TESTQ DX, DX
 11263  	JE    LBB13_8
 11264  	CMPQ  DX, $0x20
 11265  	JAE   LBB13_3
 11266  	XORL  AX, AX
 11267  	JMP   LBB13_6
 11268  
 11269  LBB13_3:
 11270  	MOVQ    DX, AX
 11271  	ANDQ    $-32, AX
 11272  	XORL    CX, CX
 11273  	VXORPS  X0, X0, X0
 11274  	VMOVDQU dataToBoolF32<>+0(SB), X1
 11275  
 11276  LBB13_4:
 11277  	VCMPPS       $0x04, (SI)(CX*4), Y0, Y2
 11278  	VEXTRACTF128 $0x01, Y2, X3
 11279  	VPACKSSDW    X3, X2, X2
 11280  	VPACKSSWB    X2, X2, X2
 11281  	VCMPPS       $0x04, 32(SI)(CX*4), Y0, Y3
 11282  	VPAND        X1, X2, X2
 11283  	VEXTRACTF128 $0x01, Y3, X4
 11284  	VPACKSSDW    X4, X3, X3
 11285  	VPACKSSWB    X3, X3, X3
 11286  	VPAND        X1, X3, X3
 11287  	VCMPPS       $0x04, 64(SI)(CX*4), Y0, Y4
 11288  	VEXTRACTF128 $0x01, Y4, X5
 11289  	VPACKSSDW    X5, X4, X4
 11290  	VPACKSSWB    X4, X4, X4
 11291  	VCMPPS       $0x04, 96(SI)(CX*4), Y0, Y5
 11292  	VPAND        X1, X4, X4
 11293  	VEXTRACTF128 $0x01, Y5, X6
 11294  	VPACKSSDW    X6, X5, X5
 11295  	VPACKSSWB    X5, X5, X5
 11296  	VPAND        X1, X5, X5
 11297  	VINSERTI128  $0x01, X5, Y4, Y4
 11298  	VINSERTI128  $0x01, X3, Y2, Y2
 11299  	VPUNPCKLQDQ  Y4, Y2, Y2
 11300  	VPERMQ       $0xd8, Y2, Y2
 11301  	VMOVDQU      Y2, (DI)(CX*1)
 11302  	ADDQ         $0x20, CX
 11303  	CMPQ         AX, CX
 11304  	JNE          LBB13_4
 11305  	CMPQ         AX, DX
 11306  	JE           LBB13_8
 11307  
 11308  LBB13_6:
 11309  	VXORPS X0, X0, X0
 11310  
 11311  LBB13_7:
 11312  	VUCOMISS (SI)(AX*4), X0
 11313  	SETNE    (DI)(AX*1)
 11314  	ADDQ     $0x01, AX
 11315  	CMPQ     DX, AX
 11316  	JNE      LBB13_7
 11317  
 11318  LBB13_8:
 11319  	VZEROUPPER
 11320  	RET
 11321  
 11322  // func ToInt32_AVX2_F64(x []int32, y []float64)
 11323  // Requires: AVX
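         // x[i] = int32(y[i]), truncating toward zero (VCVTTPD2DQY); 32 elements
         // per unrolled iteration with a VCVTTSD2SI scalar tail.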
TEXT ·ToInt32_AVX2_F64(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB16_11
	CMPQ  DX, $0x10
	JAE   LBB16_3
	XORL  AX, AX
	JMP   LBB16_10

LBB16_3:
	MOVQ  DX, AX
	ANDQ  $-16, AX
	LEAQ  -16(AX), CX
	MOVQ  CX, R8
	SHRQ  $0x04, R8
	ADDQ  $0x01, R8
	TESTQ CX, CX
	JE    LBB16_4
	MOVQ  R8, R9
	ANDQ  $-2, R9
	XORL  CX, CX

LBB16_6:
	VCVTTPD2DQY (SI)(CX*8), X0
	VCVTTPD2DQY 32(SI)(CX*8), X1
	VCVTTPD2DQY 64(SI)(CX*8), X2
	VCVTTPD2DQY 96(SI)(CX*8), X3
	VMOVUPD     X0, (DI)(CX*4)
	VMOVUPD     X1, 16(DI)(CX*4)
	VMOVUPD     X2, 32(DI)(CX*4)
	VMOVUPD     X3, 48(DI)(CX*4)
	VCVTTPD2DQY 128(SI)(CX*8), X0
	VCVTTPD2DQY 160(SI)(CX*8), X1
	VCVTTPD2DQY 192(SI)(CX*8), X2
	VCVTTPD2DQY 224(SI)(CX*8), X3
	VMOVUPD     X0, 64(DI)(CX*4)
	VMOVUPD     X1, 80(DI)(CX*4)
	VMOVUPD     X2, 96(DI)(CX*4)
	VMOVUPD     X3, 112(DI)(CX*4)
	ADDQ        $0x20, CX
	ADDQ        $-2, R9
	JNE         LBB16_6
	TESTB       $0x01, R8
	JE          LBB16_9

LBB16_8:
	VCVTTPD2DQY (SI)(CX*8), X0
	VCVTTPD2DQY 32(SI)(CX*8), X1
	VCVTTPD2DQY 64(SI)(CX*8), X2
	VCVTTPD2DQY 96(SI)(CX*8), X3
	VMOVUPD     X0, (DI)(CX*4)
	VMOVUPD     X1, 16(DI)(CX*4)
	VMOVUPD     X2, 32(DI)(CX*4)
	VMOVUPD     X3, 48(DI)(CX*4)

LBB16_9:
	CMPQ AX, DX
	JE   LBB16_11

LBB16_10:
	VCVTTSD2SI (SI)(AX*8), CX
	MOVL       CX, (DI)(AX*4)
	ADDQ       $0x01, AX
	CMPQ       DX, AX
	JNE        LBB16_10

LBB16_11:
	RET

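// Entered when exactly one 16-element group remains (R8 is 1), so the parity
// test branches to LBB16_8 to run the unrolled body once.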
LBB16_4:
	XORL  CX, CX
	TESTB $0x01, R8
	JNE   LBB16_8
	JMP   LBB16_9
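
// Reference semantics for ToInt32_AVX2_F64 (a non-authoritative pure-Go
// sketch; Go's conversion also truncates toward zero for in-range values,
// but out-of-range or NaN inputs differ from VCVTTPD2DQ's 0x80000000 result):
//
//	func toInt32F64Ref(x []int32, y []float64) {
//		for i := range x {
//			x[i] = int32(y[i])
//		}
//	}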

// func ToInt32_AVX2_F32(x []int32, y []float32)
// Requires: AVX
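// Same truncating conversion for float32 sources (VCVTTPS2DQ); the main loop
// converts two 32-element groups per iteration, with a scalar VCVTTSS2SI tail.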
TEXT ·ToInt32_AVX2_F32(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB17_11
	CMPQ  DX, $0x20
	JAE   LBB17_3
	XORL  AX, AX
	JMP   LBB17_10

LBB17_3:
	MOVQ  DX, AX
	ANDQ  $-32, AX
	LEAQ  -32(AX), CX
	MOVQ  CX, R8
	SHRQ  $0x05, R8
	ADDQ  $0x01, R8
	TESTQ CX, CX
	JE    LBB17_4
	MOVQ  R8, R9
	ANDQ  $-2, R9
	XORL  CX, CX

LBB17_6:
	VCVTTPS2DQ (SI)(CX*4), Y0
	VCVTTPS2DQ 32(SI)(CX*4), Y1
	VCVTTPS2DQ 64(SI)(CX*4), Y2
	VCVTTPS2DQ 96(SI)(CX*4), Y3
	VMOVUPS    Y0, (DI)(CX*4)
	VMOVUPS    Y1, 32(DI)(CX*4)
	VMOVUPS    Y2, 64(DI)(CX*4)
	VMOVUPS    Y3, 96(DI)(CX*4)
	VCVTTPS2DQ 128(SI)(CX*4), Y0
	VCVTTPS2DQ 160(SI)(CX*4), Y1
	VCVTTPS2DQ 192(SI)(CX*4), Y2
	VCVTTPS2DQ 224(SI)(CX*4), Y3
	VMOVUPS    Y0, 128(DI)(CX*4)
	VMOVUPS    Y1, 160(DI)(CX*4)
	VMOVUPS    Y2, 192(DI)(CX*4)
	VMOVUPS    Y3, 224(DI)(CX*4)
	ADDQ       $0x40, CX
	ADDQ       $-2, R9
	JNE        LBB17_6
	TESTB      $0x01, R8
	JE         LBB17_9

LBB17_8:
	VCVTTPS2DQ (SI)(CX*4), Y0
	VCVTTPS2DQ 32(SI)(CX*4), Y1
	VCVTTPS2DQ 64(SI)(CX*4), Y2
	VCVTTPS2DQ 96(SI)(CX*4), Y3
	VMOVUPS    Y0, (DI)(CX*4)
	VMOVUPS    Y1, 32(DI)(CX*4)
	VMOVUPS    Y2, 64(DI)(CX*4)
	VMOVUPS    Y3, 96(DI)(CX*4)

LBB17_9:
	CMPQ AX, DX
	JE   LBB17_11

LBB17_10:
	VCVTTSS2SI (SI)(AX*4), CX
	MOVL       CX, (DI)(AX*4)
	ADDQ       $0x01, AX
	CMPQ       DX, AX
	JNE        LBB17_10

LBB17_11:
	VZEROUPPER
	RET

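// Entered when exactly one 32-element group remains (R8 is 1), so the parity
// test branches to LBB17_8 to run the unrolled body once.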
LBB17_4:
	XORL  CX, CX
	TESTB $0x01, R8
	JNE   LBB17_8
	JMP   LBB17_9

// func ToInt64_AVX2_F64(x []int64, y []float64)
// Requires: AVX
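// No packed float64-to-int64 conversion exists below AVX-512DQ, so this
// routine relies on scalar VCVTTSD2SIQ, unrolled four at a time, plus a
// remainder loop.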
TEXT ·ToInt64_AVX2_F64(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB14_8
	LEAQ  -1(DX), CX
	MOVL  DX, R8
	ANDL  $0x03, R8
	CMPQ  CX, $0x03
	JAE   LBB14_3
	XORL  CX, CX
	JMP   LBB14_5

LBB14_3:
	ANDQ $-4, DX
	XORL CX, CX

LBB14_4:
	VCVTTSD2SIQ (SI)(CX*8), AX
	MOVQ        AX, (DI)(CX*8)
	VCVTTSD2SIQ 8(SI)(CX*8), AX
	MOVQ        AX, 8(DI)(CX*8)
	VCVTTSD2SIQ 16(SI)(CX*8), AX
	MOVQ        AX, 16(DI)(CX*8)
	VCVTTSD2SIQ 24(SI)(CX*8), AX
	MOVQ        AX, 24(DI)(CX*8)
	ADDQ        $0x04, CX
	CMPQ        DX, CX
	JNE         LBB14_4

LBB14_5:
	TESTQ R8, R8
	JE    LBB14_8
	LEAQ  (DI)(CX*8), DX
	LEAQ  (SI)(CX*8), CX
	XORL  SI, SI

LBB14_7:
	VCVTTSD2SIQ (CX)(SI*8), AX
	MOVQ        AX, (DX)(SI*8)
	ADDQ        $0x01, SI
	CMPQ        R8, SI
	JNE         LBB14_7

LBB14_8:
	RET
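
// Reference semantics for ToInt64_AVX2_F64 (a non-authoritative pure-Go
// sketch; truncation toward zero matches VCVTTSD2SIQ for in-range values):
//
//	func toInt64F64Ref(x []int64, y []float64) {
//		for i := range x {
//			x[i] = int64(y[i])
//		}
//	}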

// func ToInt64_AVX2_F32(x []int64, y []float32)
// Requires: AVX
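// Same scalar pattern for float32 sources: VCVTTSS2SIQ unrolled by four, then
// a remainder loop for the final zero to three elements.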
TEXT ·ToInt64_AVX2_F32(SB), NOSPLIT, $0-48
	MOVQ  x_base+0(FP), DI
	MOVQ  y_base+24(FP), SI
	MOVQ  x_len+8(FP), DX
	TESTQ DX, DX
	JE    LBB15_8
	LEAQ  -1(DX), CX
	MOVL  DX, R8
	ANDL  $0x03, R8
	CMPQ  CX, $0x03
	JAE   LBB15_3
	XORL  CX, CX
	JMP   LBB15_5

LBB15_3:
	ANDQ $-4, DX
	XORL CX, CX

LBB15_4:
	VCVTTSS2SIQ (SI)(CX*4), AX
	MOVQ        AX, (DI)(CX*8)
	VCVTTSS2SIQ 4(SI)(CX*4), AX
	MOVQ        AX, 8(DI)(CX*8)
	VCVTTSS2SIQ 8(SI)(CX*4), AX
	MOVQ        AX, 16(DI)(CX*8)
	VCVTTSS2SIQ 12(SI)(CX*4), AX
	MOVQ        AX, 24(DI)(CX*8)
	ADDQ        $0x04, CX
	CMPQ        DX, CX
	JNE         LBB15_4

LBB15_5:
	TESTQ R8, R8
	JE    LBB15_8
	LEAQ  (DI)(CX*8), DX
	LEAQ  (SI)(CX*4), CX
	XORL  SI, SI

LBB15_7:
	VCVTTSS2SIQ (CX)(SI*4), AX
	MOVQ        AX, (DX)(SI*8)
	ADDQ        $0x01, SI
	CMPQ        R8, SI
	JNE         LBB15_7

LBB15_8:
	RET
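
// Typical call-site guard for these kernels (a sketch only; the fallback
// function and feature check shown here are illustrative, not part of this
// package):
//
//	import "golang.org/x/sys/cpu"
//
//	func toInt64F32(x []int64, y []float32) {
//		if cpu.X86.HasAVX2 {
//			ToInt64_AVX2_F32(x, y)
//			return
//		}
//		for i := range x { // portable fallback
//			x[i] = int64(y[i])
//		}
//	}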