gitee.com/quant1x/gox@v1.7.6/num/asm/_avx2/special.s (about)

     1  Sqrt_F64_V(double*, unsigned long):                      # @Sqrt_F64_V(double*, unsigned long)
        # In-place sqrt over an array of doubles.
        # SysV AMD64 ABI: rdi = data pointer, rsi = element count.
        # Vector path: 4 doubles per iteration with VSQRTPD (exact, IEEE sqrt);
        # scalar VSQRTSD tail handles the remaining count % 4 elements.
     2          testq   %rsi, %rsi              # count == 0 -> nothing to do
     3          je      .LBB0_7
     4          cmpq    $4, %rsi                # fewer than 4 elements -> scalar-only path
     5          jae     .LBB0_3
     6          xorl    %eax, %eax              # rax = scalar start index = 0
     7          jmp     .LBB0_6
     8  .LBB0_3:
     9          movq    %rsi, %rax
    10          andq    $-4, %rax               # rax = count rounded down to a multiple of 4
    11          xorl    %ecx, %ecx              # rcx = vector loop index
    12  .LBB0_4:                                # =>This Inner Loop Header: Depth=1
    13          vsqrtpd (%rdi,%rcx,8), %ymm0    # sqrt of 4 doubles straight from memory
    14          vmovupd %ymm0, (%rdi,%rcx,8)    # store back in place (unaligned-safe)
    15          addq    $4, %rcx
    16          cmpq    %rcx, %rax
    17          jne     .LBB0_4
    18          cmpq    %rsi, %rax              # did the vector loop cover everything?
    19          je      .LBB0_7
    20  .LBB0_6:                                # =>This Inner Loop Header: Depth=1
    21          vmovsd  (%rdi,%rax,8), %xmm0            # xmm0 = mem[0],zero
    22          vsqrtsd %xmm0, %xmm0, %xmm0     # exact scalar sqrt for the tail element
    23          vmovsd  %xmm0, (%rdi,%rax,8)
    24          incq    %rax
    25          cmpq    %rax, %rsi
    26          jne     .LBB0_6
    27  .LBB0_7:
    28          vzeroupper                      # clear upper ymm state before returning to SSE/C code
    29          retq
    30  .LCPI1_0:
    31          .long   0xc0400000                      # float -3 (Newton-Raphson constant)
    32  .LCPI1_1:
    33          .long   0xbf000000                      # float -0.5 (Newton-Raphson constant)
    34  Sqrt_F32_V(float*, unsigned long):                      # @Sqrt_F32_V(float*, unsigned long)
        # In-place sqrt over an array of floats.
        # SysV AMD64 ABI: rdi = data pointer, rsi = element count.
        # Vector path: 32 floats per iteration (4 x ymm, unrolled) using VRSQRTPS
        # plus one Newton-Raphson refinement:
        #     y = rsqrt(x); t = x*y; sqrt(x) ~= (t * -0.5) * (t*y - 3)
        # A compare-against-zero mask forces the result to 0 where x == 0
        # (rsqrt(0) = +inf, which would otherwise yield NaN after the multiply).
        # NOTE(review): the vector path is a refined approximation while the
        # scalar tail uses the exact VSQRTSS, so vector-processed and
        # tail-processed elements may differ in the last ulp.
    35          testq   %rsi, %rsi              # count == 0 -> return
    36          je      .LBB1_7
    37          cmpq    $32, %rsi               # fewer than 32 -> scalar-only path
    38          jae     .LBB1_3
    39          xorl    %eax, %eax
    40          jmp     .LBB1_6
    41  .LBB1_3:
    42          movq    %rsi, %rax
    43          andq    $-32, %rax              # rax = count rounded down to a multiple of 32
    44          xorl    %ecx, %ecx
    45          vbroadcastss    .LCPI1_0(%rip), %ymm0   # ymm0 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
    46          vbroadcastss    .LCPI1_1(%rip), %ymm1   # ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
    47          vxorps  %xmm2, %xmm2, %xmm2     # ymm2 = 0.0, used for the x == 0 mask
    48  .LBB1_4:                                # =>This Inner Loop Header: Depth=1
    49          vmovups (%rdi,%rcx,4), %ymm3    # lanes 0..7
    50          vmovups 32(%rdi,%rcx,4), %ymm4  # lanes 8..15
    51          vmovups 64(%rdi,%rcx,4), %ymm5  # lanes 16..23
    52          vrsqrtps        %ymm3, %ymm6    # y = approx rsqrt(x), lanes 0..7
    53          vmovups 96(%rdi,%rcx,4), %ymm7  # lanes 24..31
    54          vmulps  %ymm6, %ymm3, %ymm8     # t = x * y
    55          vfmadd213ps     %ymm0, %ymm8, %ymm6     # ymm6 = (ymm8 * ymm6) + ymm0
    56          vmulps  %ymm1, %ymm8, %ymm8     # t *= -0.5
    57          vmulps  %ymm6, %ymm8, %ymm6     # ymm6 = -0.5*x*y*(x*y*y - 3) ~= sqrt(x)
    58          vrsqrtps        %ymm4, %ymm8    # lanes 8..15: same NR pattern as above
    59          vcmpneqps       %ymm2, %ymm3, %ymm3     # mask: x != 0
    60          vandps  %ymm6, %ymm3, %ymm3     # force result to 0 where x == 0
    61          vmulps  %ymm4, %ymm8, %ymm6
    62          vfmadd213ps     %ymm0, %ymm6, %ymm8     # ymm8 = (ymm6 * ymm8) + ymm0
    63          vmulps  %ymm1, %ymm6, %ymm6
    64          vmulps  %ymm6, %ymm8, %ymm6
    65          vcmpneqps       %ymm2, %ymm4, %ymm4
    66          vandps  %ymm6, %ymm4, %ymm4
    67          vrsqrtps        %ymm5, %ymm6    # lanes 16..23
    68          vmulps  %ymm6, %ymm5, %ymm8
    69          vfmadd213ps     %ymm0, %ymm8, %ymm6     # ymm6 = (ymm8 * ymm6) + ymm0
    70          vmulps  %ymm1, %ymm8, %ymm8
    71          vmulps  %ymm6, %ymm8, %ymm6
    72          vcmpneqps       %ymm2, %ymm5, %ymm5
    73          vandps  %ymm6, %ymm5, %ymm5
    74          vrsqrtps        %ymm7, %ymm6    # lanes 24..31
    75          vmulps  %ymm6, %ymm7, %ymm8
    76          vfmadd213ps     %ymm0, %ymm8, %ymm6     # ymm6 = (ymm8 * ymm6) + ymm0
    77          vmulps  %ymm1, %ymm8, %ymm8
    78          vmulps  %ymm6, %ymm8, %ymm6
    79          vcmpneqps       %ymm2, %ymm7, %ymm7
    80          vandps  %ymm6, %ymm7, %ymm6
    81          vmovups %ymm3, (%rdi,%rcx,4)    # store all 32 results back in place
    82          vmovups %ymm4, 32(%rdi,%rcx,4)
    83          vmovups %ymm5, 64(%rdi,%rcx,4)
    84          vmovups %ymm6, 96(%rdi,%rcx,4)
    85          addq    $32, %rcx
    86          cmpq    %rcx, %rax
    87          jne     .LBB1_4
    88          cmpq    %rsi, %rax              # vector loop covered everything?
    89          je      .LBB1_7
    90  .LBB1_6:                                # =>This Inner Loop Header: Depth=1
    91          vmovss  (%rdi,%rax,4), %xmm0            # xmm0 = mem[0],zero,zero,zero
    92          vsqrtss %xmm0, %xmm0, %xmm0     # exact scalar sqrt for the tail element
    93          vmovss  %xmm0, (%rdi,%rax,4)
    94          incq    %rax
    95          cmpq    %rax, %rsi
    96          jne     .LBB1_6
    97  .LBB1_7:
    98          vzeroupper                      # clear upper ymm state before returning to SSE/C code
    99          retq
    100  .LCPI2_0:
    101          .quad   0x8000000000000000              # double -0 (sign-bit mask)
    102  .LCPI2_1:
    103          .quad   0x3fdfffffffffffff              # double 0.49999999999999994 (largest double < 0.5)
    104  .LCPI2_2:
    105          .quad   0x8000000000000000              # double -0
    106          .quad   0x8000000000000000              # double -0
    107  Round_F64_V(double*, unsigned long):                     # @Round_F64_V(double*, unsigned long)
        # In-place round-half-away-from-zero over an array of doubles.
        # SysV AMD64 ABI: rdi = data pointer, rsi = element count.
        # Per element: r = trunc(x + copysign(0.49999999999999994, x)).
        # Using the largest double below 0.5 keeps values just under a half
        # from being rounded up. vroundpd imm $11 = 0b1011: round toward
        # zero (truncate) with the precision-exception bit suppressed.
        # Vector path: 16 doubles per iteration (4 x ymm, unrolled).
    108          testq   %rsi, %rsi              # count == 0 -> return
    109          je      .LBB2_8
    110          cmpq    $16, %rsi               # fewer than 16 -> scalar-only path
    111          jae     .LBB2_3
    112          xorl    %eax, %eax
    113          jmp     .LBB2_6
    114  .LBB2_3:
    115          movq    %rsi, %rax
    116          andq    $-16, %rax              # rax = count rounded down to a multiple of 16
    117          xorl    %ecx, %ecx
    118          vbroadcastsd    .LCPI2_0(%rip), %ymm0   # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
    119          vbroadcastsd    .LCPI2_1(%rip), %ymm1   # ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
    120  .LBB2_4:                                # =>This Inner Loop Header: Depth=1
    121          vmovupd (%rdi,%rcx,8), %ymm2
    122          vmovupd 32(%rdi,%rcx,8), %ymm3
    123          vmovupd 64(%rdi,%rcx,8), %ymm4
    124          vmovupd 96(%rdi,%rcx,8), %ymm5
    125          vandpd  %ymm0, %ymm2, %ymm6     # extract sign bit of x
    126          vorpd   %ymm1, %ymm6, %ymm6     # copysign(0.4999..., x)
    127          vaddpd  %ymm6, %ymm2, %ymm2     # x + copysign(0.4999..., x)
    128          vroundpd        $11, %ymm2, %ymm2       # truncate toward zero
    129          vandpd  %ymm0, %ymm3, %ymm6     # same for the next 3 ymm groups
    130          vorpd   %ymm1, %ymm6, %ymm6
    131          vaddpd  %ymm6, %ymm3, %ymm3
    132          vroundpd        $11, %ymm3, %ymm3
    133          vandpd  %ymm0, %ymm4, %ymm6
    134          vorpd   %ymm1, %ymm6, %ymm6
    135          vaddpd  %ymm6, %ymm4, %ymm4
    136          vroundpd        $11, %ymm4, %ymm4
    137          vandpd  %ymm0, %ymm5, %ymm6
    138          vorpd   %ymm1, %ymm6, %ymm6
    139          vaddpd  %ymm6, %ymm5, %ymm5
    140          vroundpd        $11, %ymm5, %ymm5
    141          vmovupd %ymm2, (%rdi,%rcx,8)    # store 16 results back in place
    142          vmovupd %ymm3, 32(%rdi,%rcx,8)
    143          vmovupd %ymm4, 64(%rdi,%rcx,8)
    144          vmovupd %ymm5, 96(%rdi,%rcx,8)
    145          addq    $16, %rcx
    146          cmpq    %rcx, %rax
    147          jne     .LBB2_4
    148          cmpq    %rsi, %rax              # vector loop covered everything?
    149          je      .LBB2_8
    150  .LBB2_6:
        # Scalar tail: same formula with xmm registers.
    151          vmovapd .LCPI2_2(%rip), %xmm0           # xmm0 = [-0.0E+0,-0.0E+0]
    152          vmovddup        .LCPI2_1(%rip), %xmm1           # xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
    153  .LBB2_7:                                # =>This Inner Loop Header: Depth=1
    154          vmovsd  (%rdi,%rax,8), %xmm2            # xmm2 = mem[0],zero
    155          vandpd  %xmm0, %xmm2, %xmm3     # sign bit of x
    156          vorpd   %xmm1, %xmm3, %xmm3     # copysign(0.4999..., x)
    157          vaddsd  %xmm3, %xmm2, %xmm2
    158          vroundsd        $11, %xmm2, %xmm2, %xmm2        # truncate toward zero
    159          vmovsd  %xmm2, (%rdi,%rax,8)
    160          incq    %rax
    161          cmpq    %rax, %rsi
    162          jne     .LBB2_7
    163  .LBB2_8:
    164          vzeroupper                      # clear upper ymm state before returning to SSE/C code
    165          retq
    166  .LCPI3_0:
    167          .long   0x80000000                      # float -0 (sign-bit mask)
    168  .LCPI3_1:
    169          .long   0x3effffff                      # float 0.49999997 (largest float < 0.5)
    170  Round_F32_V(float*, unsigned long):                     # @Round_F32_V(float*, unsigned long)
        # In-place round-half-away-from-zero over an array of floats.
        # SysV AMD64 ABI: rdi = data pointer, rsi = element count.
        # Per element: r = trunc(x + copysign(0.49999997, x)); 0.49999997 is the
        # largest float below 0.5, so values just under a half are not rounded up.
        # vroundps imm $11 = truncate toward zero, precision exception suppressed.
        # Vector path: 32 floats per iteration (4 x ymm, unrolled).
    171          testq   %rsi, %rsi              # count == 0 -> return
    172          je      .LBB3_8
    173          cmpq    $32, %rsi               # fewer than 32 -> scalar-only path
    174          jae     .LBB3_3
    175          xorl    %eax, %eax
    176          jmp     .LBB3_6
    177  .LBB3_3:
    178          movq    %rsi, %rax
    179          andq    $-32, %rax              # rax = count rounded down to a multiple of 32
    180          xorl    %ecx, %ecx
    181          vbroadcastss    .LCPI3_0(%rip), %ymm0   # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
    182          vbroadcastss    .LCPI3_1(%rip), %ymm1   # ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
    183  .LBB3_4:                                # =>This Inner Loop Header: Depth=1
    184          vmovups (%rdi,%rcx,4), %ymm2
    185          vmovups 32(%rdi,%rcx,4), %ymm3
    186          vmovups 64(%rdi,%rcx,4), %ymm4
    187          vmovups 96(%rdi,%rcx,4), %ymm5
    188          vandps  %ymm0, %ymm2, %ymm6     # extract sign bit of x
    189          vorps   %ymm1, %ymm6, %ymm6     # copysign(0.49999997, x)
    190          vaddps  %ymm6, %ymm2, %ymm2     # x + copysign(0.49999997, x)
    191          vroundps        $11, %ymm2, %ymm2       # truncate toward zero
    192          vandps  %ymm0, %ymm3, %ymm6     # same for the next 3 ymm groups
    193          vorps   %ymm1, %ymm6, %ymm6
    194          vaddps  %ymm6, %ymm3, %ymm3
    195          vroundps        $11, %ymm3, %ymm3
    196          vandps  %ymm0, %ymm4, %ymm6
    197          vorps   %ymm1, %ymm6, %ymm6
    198          vaddps  %ymm6, %ymm4, %ymm4
    199          vroundps        $11, %ymm4, %ymm4
    200          vandps  %ymm0, %ymm5, %ymm6
    201          vorps   %ymm1, %ymm6, %ymm6
    202          vaddps  %ymm6, %ymm5, %ymm5
    203          vroundps        $11, %ymm5, %ymm5
    204          vmovups %ymm2, (%rdi,%rcx,4)    # store 32 results back in place
    205          vmovups %ymm3, 32(%rdi,%rcx,4)
    206          vmovups %ymm4, 64(%rdi,%rcx,4)
    207          vmovups %ymm5, 96(%rdi,%rcx,4)
    208          addq    $32, %rcx
    209          cmpq    %rcx, %rax
    210          jne     .LBB3_4
    211          cmpq    %rsi, %rax              # vector loop covered everything?
    212          je      .LBB3_8
    213  .LBB3_6:
        # Scalar tail: same formula with xmm registers.
    214          vbroadcastss    .LCPI3_0(%rip), %xmm0   # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
    215          vbroadcastss    .LCPI3_1(%rip), %xmm1   # xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
    216  .LBB3_7:                                # =>This Inner Loop Header: Depth=1
    217          vmovss  (%rdi,%rax,4), %xmm2            # xmm2 = mem[0],zero,zero,zero
    218          vandps  %xmm0, %xmm2, %xmm3     # sign bit of x
    219          vorps   %xmm1, %xmm3, %xmm3     # copysign(0.49999997, x)
    220          vaddss  %xmm3, %xmm2, %xmm2
    221          vroundss        $11, %xmm2, %xmm2, %xmm2        # truncate toward zero
    222          vmovss  %xmm2, (%rdi,%rax,4)
    223          incq    %rax
    224          cmpq    %rax, %rsi
    225          jne     .LBB3_7
    226  .LBB3_8:
    227          vzeroupper                      # clear upper ymm state before returning to SSE/C code
    228          retq
    229  Floor_F64_V(double*, unsigned long):                     # @Floor_F64_V(double*, unsigned long)
        # In-place floor over an array of doubles.
        # SysV AMD64 ABI: rdi = data pointer, rsi = element count.
        # vroundpd imm $9 = 0b1001: round toward -inf (floor), precision
        # exception suppressed. Vector path: 16 doubles/iter (4 x ymm).
    230          testq   %rsi, %rsi              # count == 0 -> return
    231          je      .LBB4_7
    232          cmpq    $16, %rsi               # fewer than 16 -> scalar-only path
    233          jae     .LBB4_3
    234          xorl    %eax, %eax
    235          jmp     .LBB4_6
    236  .LBB4_3:
    237          movq    %rsi, %rax
    238          andq    $-16, %rax              # rax = count rounded down to a multiple of 16
    239          xorl    %ecx, %ecx
    240  .LBB4_4:                                # =>This Inner Loop Header: Depth=1
    241          vroundpd        $9, (%rdi,%rcx,8), %ymm0        # floor of 4 doubles from memory
    242          vroundpd        $9, 32(%rdi,%rcx,8), %ymm1
    243          vroundpd        $9, 64(%rdi,%rcx,8), %ymm2
    244          vroundpd        $9, 96(%rdi,%rcx,8), %ymm3
    245          vmovupd %ymm0, (%rdi,%rcx,8)    # store back in place
    246          vmovupd %ymm1, 32(%rdi,%rcx,8)
    247          vmovupd %ymm2, 64(%rdi,%rcx,8)
    248          vmovupd %ymm3, 96(%rdi,%rcx,8)
    249          addq    $16, %rcx
    250          cmpq    %rcx, %rax
    251          jne     .LBB4_4
    252          cmpq    %rsi, %rax              # vector loop covered everything?
    253          je      .LBB4_7
    254  .LBB4_6:                                # =>This Inner Loop Header: Depth=1
    255          vmovsd  (%rdi,%rax,8), %xmm0            # xmm0 = mem[0],zero
    256          vroundsd        $9, %xmm0, %xmm0, %xmm0         # scalar floor for the tail
    257          vmovsd  %xmm0, (%rdi,%rax,8)
    258          incq    %rax
    259          cmpq    %rax, %rsi
    260          jne     .LBB4_6
    261  .LBB4_7:
    262          vzeroupper                      # clear upper ymm state before returning to SSE/C code
    263          retq
    264  Floor_F32_V(float*, unsigned long):                     # @Floor_F32_V(float*, unsigned long)
        # In-place floor over an array of floats.
        # SysV AMD64 ABI: rdi = data pointer, rsi = element count.
        # vroundps imm $9 = round toward -inf (floor), precision exception
        # suppressed. Vector path: 32 floats/iter (4 x ymm).
    265          testq   %rsi, %rsi              # count == 0 -> return
    266          je      .LBB5_7
    267          cmpq    $32, %rsi               # fewer than 32 -> scalar-only path
    268          jae     .LBB5_3
    269          xorl    %eax, %eax
    270          jmp     .LBB5_6
    271  .LBB5_3:
    272          movq    %rsi, %rax
    273          andq    $-32, %rax              # rax = count rounded down to a multiple of 32
    274          xorl    %ecx, %ecx
    275  .LBB5_4:                                # =>This Inner Loop Header: Depth=1
    276          vroundps        $9, (%rdi,%rcx,4), %ymm0        # floor of 8 floats from memory
    277          vroundps        $9, 32(%rdi,%rcx,4), %ymm1
    278          vroundps        $9, 64(%rdi,%rcx,4), %ymm2
    279          vroundps        $9, 96(%rdi,%rcx,4), %ymm3
    280          vmovups %ymm0, (%rdi,%rcx,4)    # store back in place
    281          vmovups %ymm1, 32(%rdi,%rcx,4)
    282          vmovups %ymm2, 64(%rdi,%rcx,4)
    283          vmovups %ymm3, 96(%rdi,%rcx,4)
    284          addq    $32, %rcx
    285          cmpq    %rcx, %rax
    286          jne     .LBB5_4
    287          cmpq    %rsi, %rax              # vector loop covered everything?
    288          je      .LBB5_7
    289  .LBB5_6:                                # =>This Inner Loop Header: Depth=1
    290          vmovss  (%rdi,%rax,4), %xmm0            # xmm0 = mem[0],zero,zero,zero
    291          vroundss        $9, %xmm0, %xmm0, %xmm0         # scalar floor for the tail
    292          vmovss  %xmm0, (%rdi,%rax,4)
    293          incq    %rax
    294          cmpq    %rax, %rsi
    295          jne     .LBB5_6
    296  .LBB5_7:
    297          vzeroupper                      # clear upper ymm state before returning to SSE/C code
    298          retq
    299  Ceil_F64_V(double*, unsigned long):                      # @Ceil_F64_V(double*, unsigned long)
        # In-place ceiling over an array of doubles.
        # SysV AMD64 ABI: rdi = data pointer, rsi = element count.
        # vroundpd imm $10 = 0b1010: round toward +inf (ceil), precision
        # exception suppressed. Vector path: 16 doubles/iter (4 x ymm).
    300          testq   %rsi, %rsi              # count == 0 -> return
    301          je      .LBB6_7
    302          cmpq    $16, %rsi               # fewer than 16 -> scalar-only path
    303          jae     .LBB6_3
    304          xorl    %eax, %eax
    305          jmp     .LBB6_6
    306  .LBB6_3:
    307          movq    %rsi, %rax
    308          andq    $-16, %rax              # rax = count rounded down to a multiple of 16
    309          xorl    %ecx, %ecx
    310  .LBB6_4:                                # =>This Inner Loop Header: Depth=1
    311          vroundpd        $10, (%rdi,%rcx,8), %ymm0       # ceil of 4 doubles from memory
    312          vroundpd        $10, 32(%rdi,%rcx,8), %ymm1
    313          vroundpd        $10, 64(%rdi,%rcx,8), %ymm2
    314          vroundpd        $10, 96(%rdi,%rcx,8), %ymm3
    315          vmovupd %ymm0, (%rdi,%rcx,8)    # store back in place
    316          vmovupd %ymm1, 32(%rdi,%rcx,8)
    317          vmovupd %ymm2, 64(%rdi,%rcx,8)
    318          vmovupd %ymm3, 96(%rdi,%rcx,8)
    319          addq    $16, %rcx
    320          cmpq    %rcx, %rax
    321          jne     .LBB6_4
    322          cmpq    %rsi, %rax              # vector loop covered everything?
    323          je      .LBB6_7
    324  .LBB6_6:                                # =>This Inner Loop Header: Depth=1
    325          vmovsd  (%rdi,%rax,8), %xmm0            # xmm0 = mem[0],zero
    326          vroundsd        $10, %xmm0, %xmm0, %xmm0        # scalar ceil for the tail
    327          vmovsd  %xmm0, (%rdi,%rax,8)
    328          incq    %rax
    329          cmpq    %rax, %rsi
    330          jne     .LBB6_6
    331  .LBB6_7:
    332          vzeroupper                      # clear upper ymm state before returning to SSE/C code
    333          retq
    334  Ceil_F32_V(float*, unsigned long):                      # @Ceil_F32_V(float*, unsigned long)
        # In-place ceiling over an array of floats.
        # SysV AMD64 ABI: rdi = data pointer, rsi = element count.
        # vroundps imm $10 = round toward +inf (ceil), precision exception
        # suppressed. Vector path: 32 floats/iter (4 x ymm).
    335          testq   %rsi, %rsi              # count == 0 -> return
    336          je      .LBB7_7
    337          cmpq    $32, %rsi               # fewer than 32 -> scalar-only path
    338          jae     .LBB7_3
    339          xorl    %eax, %eax
    340          jmp     .LBB7_6
    341  .LBB7_3:
    342          movq    %rsi, %rax
    343          andq    $-32, %rax              # rax = count rounded down to a multiple of 32
    344          xorl    %ecx, %ecx
    345  .LBB7_4:                                # =>This Inner Loop Header: Depth=1
    346          vroundps        $10, (%rdi,%rcx,4), %ymm0       # ceil of 8 floats from memory
    347          vroundps        $10, 32(%rdi,%rcx,4), %ymm1
    348          vroundps        $10, 64(%rdi,%rcx,4), %ymm2
    349          vroundps        $10, 96(%rdi,%rcx,4), %ymm3
    350          vmovups %ymm0, (%rdi,%rcx,4)    # store back in place
    351          vmovups %ymm1, 32(%rdi,%rcx,4)
    352          vmovups %ymm2, 64(%rdi,%rcx,4)
    353          vmovups %ymm3, 96(%rdi,%rcx,4)
    354          addq    $32, %rcx
    355          cmpq    %rcx, %rax
    356          jne     .LBB7_4
    357          cmpq    %rsi, %rax              # vector loop covered everything?
    358          je      .LBB7_7
    359  .LBB7_6:                                # =>This Inner Loop Header: Depth=1
    360          vmovss  (%rdi,%rax,4), %xmm0            # xmm0 = mem[0],zero,zero,zero
    361          vroundss        $10, %xmm0, %xmm0, %xmm0        # scalar ceil for the tail
    362          vmovss  %xmm0, (%rdi,%rax,4)
    363          incq    %rax
    364          cmpq    %rax, %rsi
    365          jne     .LBB7_6
    366  .LBB7_7:
    367          vzeroupper                      # clear upper ymm state before returning to SSE/C code
    368          retq
   369  .LCPI8_0:
   370          .quad   9223372036854775807             # 0x7fffffffffffffff
   371  .LCPI8_3:
   372          .quad   0x3fe6a09e667f3bcd              # double 0.70710678118654757
   373  .LCPI8_4:
   374          .quad   0xbff0000000000000              # double -1
   375  .LCPI8_5:
   376          .quad   0x401a509f46f4fa53              # double 6.5787325942061043
   377  .LCPI8_6:
   378          .quad   0x3fdfe818a0fe1a83              # double 0.49854102823193375
   379  .LCPI8_7:
   380          .quad   0x3f07bc0962b395ca              # double 4.5270000862445198E-5
   381  .LCPI8_8:
   382          .quad   0x404e798eb86c3351              # double 60.94966798098779
   383  .LCPI8_9:
   384          .quad   0x403de9738b8cb9c9              # double 29.911919328553072
   385  .LCPI8_10:
   386          .quad   0x40340a202d99830a              # double 20.039553499201283
   387  .LCPI8_11:
   388          .quad   0x404c8e7597479a10              # double 57.112963590585537
   389  .LCPI8_12:
   390          .quad   0x4054c30b52213498              # double 83.047565967967216
   391  .LCPI8_13:
   392          .quad   0x402e20359e903e37              # double 15.062909083469192
   393  .LCPI8_14:
   394          .quad   0x407351945dc908a5              # double 309.09872225312057
   395  .LCPI8_15:
   396          .quad   0x406bb86590fcfb56              # double 221.76239823732857
   397  .LCPI8_16:
   398          .quad   0x404e0f304466448e              # double 60.118660497603841
   399  .LCPI8_17:
   400          .quad   0x406b0db13e48e066              # double 216.42788614495947
   401  .LCPI8_18:
   402          .quad   4841369599423283200             # 0x4330000000000000
   403  .LCPI8_19:
   404          .quad   0xc3300000000003ff              # double -4503599627371519
   405  .LCPI8_20:
   406          .quad   0x3ff0000000000000              # double 1
   407  .LCPI8_21:
   408          .quad   0xbfe0000000000000              # double -0.5
   409  .LCPI8_22:
   410          .quad   0x3fe0000000000000              # double 0.5
   411  .LCPI8_23:
   412          .quad   0x3ff71547652b82fe              # double 1.4426950408889634
   413  .LCPI8_24:
   414          .quad   0xbfe62e4000000000              # double -0.693145751953125
   415  .LCPI8_25:
   416          .quad   0x3eb7f7d1cf79abca              # double 1.4286068203094173E-6
   417  .LCPI8_26:
   418          .quad   0x3fe62e42fefa39ef              # double 0.69314718055994529
   419  .LCPI8_27:
   420          .quad   0x3e21eed8eff8d898              # double 2.08767569878681E-9
   421  .LCPI8_28:
   422          .quad   0x3de6124613a86d09              # double 1.6059043836821613E-10
   423  .LCPI8_29:
   424          .quad   0x3e927e4fb7789f5c              # double 2.7557319223985888E-7
   425  .LCPI8_30:
   426          .quad   0x3e5ae64567f544e4              # double 2.505210838544172E-8
   427  .LCPI8_31:
   428          .quad   0x3efa01a01a01a01a              # double 2.4801587301587302E-5
   429  .LCPI8_32:
   430          .quad   0x3ec71de3a556c734              # double 2.7557319223985893E-6
   431  .LCPI8_33:
   432          .quad   0x3f56c16c16c16c17              # double 0.0013888888888888889
   433  .LCPI8_34:
   434          .quad   0x3f2a01a01a01a01a              # double 1.9841269841269841E-4
   435  .LCPI8_35:
   436          .quad   0x3fa5555555555555              # double 0.041666666666666664
   437  .LCPI8_36:
   438          .quad   0x3f81111111111111              # double 0.0083333333333333332
   439  .LCPI8_37:
   440          .quad   0x3fc5555555555555              # double 0.16666666666666666
   441  .LCPI8_38:
   442          .quad   2046                            # 0x7fe
   443  .LCPI8_39:
   444          .quad   0x40a7700000000000              # double 3000
   445  .LCPI8_40:
   446          .quad   1                               # 0x1
   447  .LCPI8_41:
   448          .quad   0xc0a7700000000000              # double -3000
   449  .LCPI8_42:
   450          .quad   9218868437227405312             # 0x7ff0000000000000
   451  .LCPI8_43:
   452          .quad   0x7ff8002040000000              # double NaN
   453  .LCPI8_1:
   454          .quad   4503599627370495                # 0xfffffffffffff
   455          .quad   4503599627370495                # 0xfffffffffffff
   456  .LCPI8_2:
   457          .quad   4602678819172646912             # 0x3fe0000000000000
   458          .quad   4602678819172646912             # 0x3fe0000000000000
   459  Pow_4x_F64_V(double*, double*, unsigned long):                  # @Pow_4x_F64_V(double*, double*, unsigned long)
   460          subq    $1192, %rsp                     # imm = 0x4A8
   461          andq    $-4, %rdx
   462          je      .LBB8_11
   463          xorl    %r8d, %r8d
   464          vbroadcastsd    .LCPI8_0(%rip), %ymm0   # ymm0 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
   465          vmovups %ymm0, 512(%rsp)                # 32-byte Spill
   466          vbroadcastsd    .LCPI8_3(%rip), %ymm0   # ymm0 = [7.0710678118654757E-1,7.0710678118654757E-1,7.0710678118654757E-1,7.0710678118654757E-1]
   467          vmovups %ymm0, 1120(%rsp)               # 32-byte Spill
   468          vpxor   %xmm6, %xmm6, %xmm6
   469          vbroadcastsd    .LCPI8_4(%rip), %ymm0   # ymm0 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
   470          vmovups %ymm0, 1088(%rsp)               # 32-byte Spill
   471          vbroadcastsd    .LCPI8_5(%rip), %ymm0   # ymm0 = [6.5787325942061043E+0,6.5787325942061043E+0,6.5787325942061043E+0,6.5787325942061043E+0]
   472          vmovups %ymm0, 1056(%rsp)               # 32-byte Spill
   473          vbroadcastsd    .LCPI8_6(%rip), %ymm0   # ymm0 = [4.9854102823193375E-1,4.9854102823193375E-1,4.9854102823193375E-1,4.9854102823193375E-1]
   474          vmovups %ymm0, 1024(%rsp)               # 32-byte Spill
   475          vbroadcastsd    .LCPI8_7(%rip), %ymm0   # ymm0 = [4.5270000862445198E-5,4.5270000862445198E-5,4.5270000862445198E-5,4.5270000862445198E-5]
   476          vmovups %ymm0, 992(%rsp)                # 32-byte Spill
   477          vbroadcastsd    .LCPI8_8(%rip), %ymm0   # ymm0 = [6.094966798098779E+1,6.094966798098779E+1,6.094966798098779E+1,6.094966798098779E+1]
   478          vmovups %ymm0, 960(%rsp)                # 32-byte Spill
   479          vbroadcastsd    .LCPI8_9(%rip), %ymm0   # ymm0 = [2.9911919328553072E+1,2.9911919328553072E+1,2.9911919328553072E+1,2.9911919328553072E+1]
   480          vmovups %ymm0, 928(%rsp)                # 32-byte Spill
   481          vbroadcastsd    .LCPI8_10(%rip), %ymm0  # ymm0 = [2.0039553499201283E+1,2.0039553499201283E+1,2.0039553499201283E+1,2.0039553499201283E+1]
   482          vmovups %ymm0, 896(%rsp)                # 32-byte Spill
   483          vbroadcastsd    .LCPI8_11(%rip), %ymm0  # ymm0 = [5.7112963590585537E+1,5.7112963590585537E+1,5.7112963590585537E+1,5.7112963590585537E+1]
   484          vmovups %ymm0, 864(%rsp)                # 32-byte Spill
   485          vbroadcastsd    .LCPI8_12(%rip), %ymm0  # ymm0 = [8.3047565967967216E+1,8.3047565967967216E+1,8.3047565967967216E+1,8.3047565967967216E+1]
   486          vmovups %ymm0, 832(%rsp)                # 32-byte Spill
   487          vbroadcastsd    .LCPI8_13(%rip), %ymm0  # ymm0 = [1.5062909083469192E+1,1.5062909083469192E+1,1.5062909083469192E+1,1.5062909083469192E+1]
   488          vmovups %ymm0, 800(%rsp)                # 32-byte Spill
   489          vbroadcastsd    .LCPI8_14(%rip), %ymm0  # ymm0 = [3.0909872225312057E+2,3.0909872225312057E+2,3.0909872225312057E+2,3.0909872225312057E+2]
   490          vmovups %ymm0, 768(%rsp)                # 32-byte Spill
   491          vbroadcastsd    .LCPI8_15(%rip), %ymm0  # ymm0 = [2.2176239823732857E+2,2.2176239823732857E+2,2.2176239823732857E+2,2.2176239823732857E+2]
   492          vmovups %ymm0, 736(%rsp)                # 32-byte Spill
   493          vbroadcastsd    .LCPI8_16(%rip), %ymm0  # ymm0 = [6.0118660497603841E+1,6.0118660497603841E+1,6.0118660497603841E+1,6.0118660497603841E+1]
   494          vmovups %ymm0, 704(%rsp)                # 32-byte Spill
   495          vbroadcastsd    .LCPI8_17(%rip), %ymm0  # ymm0 = [2.1642788614495947E+2,2.1642788614495947E+2,2.1642788614495947E+2,2.1642788614495947E+2]
   496          vmovups %ymm0, 672(%rsp)                # 32-byte Spill
   497          vbroadcastsd    .LCPI8_18(%rip), %ymm0  # ymm0 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
   498          vmovups %ymm0, 640(%rsp)                # 32-byte Spill
   499          vbroadcastsd    .LCPI8_19(%rip), %ymm0  # ymm0 = [-4.503599627371519E+15,-4.503599627371519E+15,-4.503599627371519E+15,-4.503599627371519E+15]
   500          vmovups %ymm0, 608(%rsp)                # 32-byte Spill
   501          vbroadcastsd    .LCPI8_20(%rip), %ymm0  # ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
   502          vmovups %ymm0, -128(%rsp)               # 32-byte Spill
   503          vbroadcastsd    .LCPI8_21(%rip), %ymm0  # ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
   504          vmovups %ymm0, 576(%rsp)                # 32-byte Spill
   505          vbroadcastsd    .LCPI8_22(%rip), %ymm0  # ymm0 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
   506          vmovups %ymm0, 544(%rsp)                # 32-byte Spill
   507          vbroadcastsd    .LCPI8_23(%rip), %ymm0  # ymm0 = [1.4426950408889634E+0,1.4426950408889634E+0,1.4426950408889634E+0,1.4426950408889634E+0]
   508          vmovups %ymm0, 480(%rsp)                # 32-byte Spill
   509          vbroadcastsd    .LCPI8_24(%rip), %ymm0  # ymm0 = [-6.93145751953125E-1,-6.93145751953125E-1,-6.93145751953125E-1,-6.93145751953125E-1]
   510          vmovups %ymm0, 448(%rsp)                # 32-byte Spill
   511          vbroadcastsd    .LCPI8_25(%rip), %ymm0  # ymm0 = [1.4286068203094173E-6,1.4286068203094173E-6,1.4286068203094173E-6,1.4286068203094173E-6]
   512          vmovups %ymm0, 416(%rsp)                # 32-byte Spill
   513          vbroadcastsd    .LCPI8_26(%rip), %ymm0  # ymm0 = [6.9314718055994529E-1,6.9314718055994529E-1,6.9314718055994529E-1,6.9314718055994529E-1]
   514          vmovups %ymm0, 384(%rsp)                # 32-byte Spill
   515          vbroadcastsd    .LCPI8_27(%rip), %ymm0  # ymm0 = [2.08767569878681E-9,2.08767569878681E-9,2.08767569878681E-9,2.08767569878681E-9]
   516          vmovups %ymm0, 352(%rsp)                # 32-byte Spill
   517          vbroadcastsd    .LCPI8_28(%rip), %ymm0  # ymm0 = [1.6059043836821613E-10,1.6059043836821613E-10,1.6059043836821613E-10,1.6059043836821613E-10]
   518          vmovups %ymm0, 320(%rsp)                # 32-byte Spill
   519          vbroadcastsd    .LCPI8_29(%rip), %ymm0  # ymm0 = [2.7557319223985888E-7,2.7557319223985888E-7,2.7557319223985888E-7,2.7557319223985888E-7]
   520          vmovups %ymm0, 288(%rsp)                # 32-byte Spill
   521          vbroadcastsd    .LCPI8_30(%rip), %ymm0  # ymm0 = [2.505210838544172E-8,2.505210838544172E-8,2.505210838544172E-8,2.505210838544172E-8]
   522          vmovups %ymm0, 256(%rsp)                # 32-byte Spill
   523          vbroadcastsd    .LCPI8_31(%rip), %ymm0  # ymm0 = [2.4801587301587302E-5,2.4801587301587302E-5,2.4801587301587302E-5,2.4801587301587302E-5]
   524          vmovups %ymm0, 224(%rsp)                # 32-byte Spill
   525          vbroadcastsd    .LCPI8_32(%rip), %ymm0  # ymm0 = [2.7557319223985893E-6,2.7557319223985893E-6,2.7557319223985893E-6,2.7557319223985893E-6]
   526          vmovups %ymm0, 192(%rsp)                # 32-byte Spill
   527          vbroadcastsd    .LCPI8_33(%rip), %ymm0  # ymm0 = [1.3888888888888889E-3,1.3888888888888889E-3,1.3888888888888889E-3,1.3888888888888889E-3]
   528          vmovups %ymm0, 160(%rsp)                # 32-byte Spill
   529          vbroadcastsd    .LCPI8_34(%rip), %ymm0  # ymm0 = [1.9841269841269841E-4,1.9841269841269841E-4,1.9841269841269841E-4,1.9841269841269841E-4]
   530          vmovups %ymm0, 128(%rsp)                # 32-byte Spill
   531          vbroadcastsd    .LCPI8_35(%rip), %ymm0  # ymm0 = [4.1666666666666664E-2,4.1666666666666664E-2,4.1666666666666664E-2,4.1666666666666664E-2]
   532          vmovups %ymm0, 96(%rsp)                 # 32-byte Spill
   533          vbroadcastsd    .LCPI8_36(%rip), %ymm0  # ymm0 = [8.3333333333333332E-3,8.3333333333333332E-3,8.3333333333333332E-3,8.3333333333333332E-3]
   534          vmovups %ymm0, 64(%rsp)                 # 32-byte Spill
   535          vbroadcastsd    .LCPI8_37(%rip), %ymm0  # ymm0 = [1.6666666666666666E-1,1.6666666666666666E-1,1.6666666666666666E-1,1.6666666666666666E-1]
   536          vmovups %ymm0, 32(%rsp)                 # 32-byte Spill
   537          vbroadcastsd    .LCPI8_38(%rip), %ymm0  # ymm0 = [2046,2046,2046,2046]
   538          vmovups %ymm0, (%rsp)                   # 32-byte Spill
   539          vbroadcastsd    .LCPI8_39(%rip), %ymm0  # ymm0 = [3.0E+3,3.0E+3,3.0E+3,3.0E+3]
   540          vmovups %ymm0, -32(%rsp)                # 32-byte Spill
   541          vbroadcastsd    .LCPI8_40(%rip), %ymm0  # ymm0 = [1,1,1,1]
   542          vmovups %ymm0, -64(%rsp)                # 32-byte Spill
   543          vbroadcastsd    .LCPI8_41(%rip), %ymm0  # ymm0 = [-3.0E+3,-3.0E+3,-3.0E+3,-3.0E+3]
   544          vmovupd %ymm0, -96(%rsp)                # 32-byte Spill
   545          vpbroadcastq    .LCPI8_42(%rip), %ymm5  # ymm5 = [9218868437227405312,9218868437227405312,9218868437227405312,9218868437227405312]
   546          vbroadcastsd    .LCPI8_42(%rip), %ymm10 # ymm10 = [9218868437227405312,9218868437227405312,9218868437227405312,9218868437227405312]
   547          jmp     .LBB8_2
   548  .LBB8_10:                               #   in Loop: Header=BB8_2 Depth=1
   549          vmovupd %ymm2, (%rdi,%r8,8)
   550          addq    $4, %r8
   551          cmpq    %rdx, %r8
   552          jae     .LBB8_11
   553  .LBB8_2:                                # =>This Inner Loop Header: Depth=1
   554          vmovapd %ymm10, %ymm9
   555          vmovdqu (%rdi,%r8,8), %ymm13
   556          vmovupd (%rsi,%r8,8), %ymm12
   557          vpand   512(%rsp), %ymm13, %ymm10       # 32-byte Folded Reload
   558          vmovapd .LCPI8_1(%rip), %xmm1           # xmm1 = [4503599627370495,4503599627370495]
   559          vandpd  (%rdi,%r8,8), %xmm1, %xmm2
   560          vmovapd .LCPI8_2(%rip), %xmm0           # xmm0 = [4602678819172646912,4602678819172646912]
   561          vorpd   %xmm0, %xmm2, %xmm2
   562          vandpd  16(%rdi,%r8,8), %xmm1, %xmm3
   563          vorpd   %xmm0, %xmm3, %xmm3
   564          vinsertf128     $1, %xmm3, %ymm2, %ymm3
   565          vmovupd 1120(%rsp), %ymm0               # 32-byte Reload
   566          vcmpltpd        %ymm3, %ymm0, %ymm2
   567          vandnpd %ymm3, %ymm2, %ymm4
   568          vaddpd  1088(%rsp), %ymm3, %ymm3        # 32-byte Folded Reload
   569          vaddpd  %ymm4, %ymm3, %ymm4
   570          vmulpd  %ymm4, %ymm4, %ymm3
   571          vmulpd  %ymm3, %ymm3, %ymm7
   572          vmovupd 1024(%rsp), %ymm8               # 32-byte Reload
   573          vfmadd213pd     1056(%rsp), %ymm4, %ymm8 # 32-byte Folded Reload
   574          vfmadd231pd     992(%rsp), %ymm3, %ymm8 # 32-byte Folded Reload
   575          vmovupd 928(%rsp), %ymm11               # 32-byte Reload
   576          vfmadd213pd     960(%rsp), %ymm4, %ymm11 # 32-byte Folded Reload
   577          vmovupd 864(%rsp), %ymm14               # 32-byte Reload
   578          vfmadd213pd     896(%rsp), %ymm4, %ymm14 # 32-byte Folded Reload
   579          vfmadd231pd     %ymm11, %ymm3, %ymm14   # ymm14 = (ymm3 * ymm11) + ymm14
   580          vfmadd231pd     %ymm8, %ymm7, %ymm14    # ymm14 = (ymm7 * ymm8) + ymm14
   581          vmulpd  %ymm4, %ymm3, %ymm8
   582          vmulpd  %ymm14, %ymm8, %ymm8
   583          vaddpd  832(%rsp), %ymm3, %ymm11        # 32-byte Folded Reload
   584          vfmadd231pd     800(%rsp), %ymm4, %ymm11 # 32-byte Folded Reload
   585          vmovupd 736(%rsp), %ymm14               # 32-byte Reload
   586          vfmadd213pd     768(%rsp), %ymm4, %ymm14 # 32-byte Folded Reload
   587          vmovupd 672(%rsp), %ymm15               # 32-byte Reload
   588          vfmadd213pd     704(%rsp), %ymm4, %ymm15 # 32-byte Folded Reload
   589          vfmadd231pd     %ymm14, %ymm3, %ymm15   # ymm15 = (ymm3 * ymm14) + ymm15
   590          vfmadd231pd     %ymm11, %ymm7, %ymm15   # ymm15 = (ymm7 * ymm11) + ymm15
   591          vdivpd  %ymm15, %ymm8, %ymm7
   592          vmovdqu %ymm10, 1152(%rsp)              # 32-byte Spill
   593          vpsrlq  $52, %ymm10, %ymm8
   594          vpor    640(%rsp), %ymm8, %ymm8         # 32-byte Folded Reload
   595          vaddpd  608(%rsp), %ymm8, %ymm8         # 32-byte Folded Reload
   596          vmovupd -128(%rsp), %ymm0               # 32-byte Reload
   597          vandpd  %ymm0, %ymm2, %ymm2
   598          vaddpd  %ymm2, %ymm8, %ymm8
   599          vmulpd  %ymm12, %ymm8, %ymm2
   600          vroundpd        $8, %ymm2, %ymm2
   601          vfnmadd213pd    %ymm2, %ymm12, %ymm8    # ymm8 = -(ymm12 * ymm8) + ymm2
   602          vmovupd 576(%rsp), %ymm1                # 32-byte Reload
   603          vmovapd %ymm1, %ymm11
   604          vfmadd213pd     %ymm4, %ymm3, %ymm11    # ymm11 = (ymm3 * ymm11) + ymm4
   605          vaddpd  %ymm7, %ymm11, %ymm11
   606          vmovupd 544(%rsp), %ymm10               # 32-byte Reload
   607          vmulpd  %ymm4, %ymm10, %ymm14
   608          vmulpd  %ymm1, %ymm3, %ymm15
   609          vfmadd231pd     %ymm14, %ymm4, %ymm15   # ymm15 = (ymm4 * ymm14) + ymm15
   610          vsubpd  %ymm4, %ymm11, %ymm4
   611          vfmadd231pd     %ymm3, %ymm10, %ymm4    # ymm4 = (ymm10 * ymm3) + ymm4
   612          vmovupd 480(%rsp), %ymm1                # 32-byte Reload
   613          vmulpd  %ymm1, %ymm12, %ymm3
   614          vmulpd  %ymm3, %ymm11, %ymm3
   615          vroundpd        $8, %ymm3, %ymm3
   616          vmulpd  448(%rsp), %ymm3, %ymm14        # 32-byte Folded Reload
   617          vfmadd231pd     %ymm11, %ymm12, %ymm14  # ymm14 = (ymm12 * ymm11) + ymm14
   618          vfmsub231pd     416(%rsp), %ymm3, %ymm14 # 32-byte Folded Reload
   619          vmovupd 384(%rsp), %ymm11               # 32-byte Reload
   620          vfmadd231pd     %ymm8, %ymm11, %ymm14   # ymm14 = (ymm11 * ymm8) + ymm14
   621          vsubpd  %ymm7, %ymm15, %ymm7
   622          vaddpd  %ymm4, %ymm7, %ymm4
   623          vfnmsub213pd    %ymm14, %ymm12, %ymm4   # ymm4 = -(ymm12 * ymm4) - ymm14
   624          vmulpd  %ymm1, %ymm4, %ymm7
   625          vroundpd        $8, %ymm7, %ymm7
   626          vfnmadd231pd    %ymm11, %ymm7, %ymm4    # ymm4 = -(ymm7 * ymm11) + ymm4
   627          vmulpd  %ymm4, %ymm4, %ymm8
   628          vmovupd 320(%rsp), %ymm11               # 32-byte Reload
   629          vfmadd213pd     352(%rsp), %ymm4, %ymm11 # 32-byte Folded Reload
   630          vmovupd 256(%rsp), %ymm14               # 32-byte Reload
   631          vfmadd213pd     288(%rsp), %ymm4, %ymm14 # 32-byte Folded Reload
   632          vmovupd 192(%rsp), %ymm15               # 32-byte Reload
   633          vfmadd213pd     224(%rsp), %ymm4, %ymm15 # 32-byte Folded Reload
   634          vfmadd231pd     %ymm14, %ymm8, %ymm15   # ymm15 = (ymm8 * ymm14) + ymm15
   635          vmovupd 128(%rsp), %ymm14               # 32-byte Reload
   636          vfmadd213pd     160(%rsp), %ymm4, %ymm14 # 32-byte Folded Reload
   637          vmovupd 64(%rsp), %ymm1                 # 32-byte Reload
   638          vfmadd213pd     96(%rsp), %ymm4, %ymm1  # 32-byte Folded Reload
   639          vfmadd231pd     %ymm14, %ymm8, %ymm1    # ymm1 = (ymm8 * ymm14) + ymm1
   640          vmovupd 32(%rsp), %ymm14                # 32-byte Reload
   641          vfmadd213pd     %ymm10, %ymm4, %ymm14   # ymm14 = (ymm4 * ymm14) + ymm10
   642          vfmadd213pd     %ymm4, %ymm8, %ymm14    # ymm14 = (ymm8 * ymm14) + ymm4
   643          vmulpd  %ymm8, %ymm8, %ymm4
   644          vfmadd231pd     %ymm11, %ymm4, %ymm15   # ymm15 = (ymm4 * ymm11) + ymm15
   645          vfmadd231pd     %ymm1, %ymm4, %ymm14    # ymm14 = (ymm4 * ymm1) + ymm14
   646          vmulpd  %ymm4, %ymm4, %ymm1
   647          vfmadd231pd     %ymm15, %ymm1, %ymm14   # ymm14 = (ymm1 * ymm15) + ymm14
   648          vaddpd  %ymm0, %ymm14, %ymm1
   649          vaddpd  %ymm2, %ymm3, %ymm2
   650          vaddpd  %ymm7, %ymm2, %ymm15
   651          vroundpd        $8, %ymm15, %ymm2
   652          vcvttsd2si      %xmm2, %r9
   653          vpermilpd       $1, %xmm2, %xmm3        # xmm3 = xmm2[1,0]
   654          vcvttsd2si      %xmm3, %rax
   655          vextractf128    $1, %ymm2, %xmm2
   656          vcvttsd2si      %xmm2, %rcx
   657          vmovq   %rcx, %xmm3
   658          vpermilpd       $1, %xmm2, %xmm2        # xmm2 = xmm2[1,0]
   659          vcvttsd2si      %xmm2, %rcx
   660          vmovq   %rcx, %xmm2
   661          vpunpcklqdq     %xmm2, %xmm3, %xmm2     # xmm2 = xmm3[0],xmm2[0]
   662          vmovq   %r9, %xmm3
   663          vmovq   %rax, %xmm4
   664          vpunpcklqdq     %xmm4, %xmm3, %xmm3     # xmm3 = xmm3[0],xmm4[0]
   665          vinserti128     $1, %xmm2, %ymm3, %ymm2
   666          vpsrad  $31, %ymm1, %ymm3
   667          vpsrad  $20, %ymm1, %ymm4
   668          vpsrlq  $32, %ymm4, %ymm4
   669          vpblendd        $170, %ymm3, %ymm4, %ymm3       # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
   670          vpaddq  %ymm3, %ymm2, %ymm4
   671          vpcmpgtq        (%rsp), %ymm4, %ymm3            # 32-byte Folded Reload
   672          vmovupd -32(%rsp), %ymm0                # 32-byte Reload
   673          vcmpltpd        %ymm15, %ymm0, %ymm7
   674          vpor    %ymm7, %ymm3, %ymm3
   675          vmovdqu -64(%rsp), %ymm0                # 32-byte Reload
   676          vpcmpgtq        %ymm4, %ymm0, %ymm4
   677          vcmpltpd        -96(%rsp), %ymm15, %ymm7        # 32-byte Folded Reload
   678          vpor    %ymm7, %ymm4, %ymm4
   679          vpsllq  $52, %ymm2, %ymm2
   680          vpaddq  %ymm1, %ymm2, %ymm2
   681          vpor    %ymm3, %ymm4, %ymm1
   682          vptest  %ymm1, %ymm1
   683          jne     .LBB8_3
   684          vmovapd %ymm9, %ymm10
   685          jmp     .LBB8_5
   686  .LBB8_3:                                #   in Loop: Header=BB8_2 Depth=1
   687          vpandn  %ymm2, %ymm4, %ymm1
   688          vmovapd %ymm9, %ymm10
   689          vblendvpd       %ymm3, %ymm9, %ymm1, %ymm2
   690  .LBB8_5:                                #   in Loop: Header=BB8_2 Depth=1
   691          vpand   %ymm5, %ymm13, %ymm11
   692          vpcmpeqq        %ymm6, %ymm11, %ymm4
   693          vpsrad  $31, %ymm13, %ymm1
   694          vpshufd $245, %ymm1, %ymm7              # ymm7 = ymm1[1,1,3,3,5,5,7,7]
   695          vcmpltpd        %ymm6, %ymm12, %ymm14
   696          vcmpeqpd        %ymm6, %ymm12, %ymm3
   697          vandpd  -128(%rsp), %ymm3, %ymm1        # 32-byte Folded Reload
   698          vblendvpd       %ymm14, %ymm10, %ymm1, %ymm1
   699          vblendvpd       %ymm4, %ymm1, %ymm2, %ymm2
   700          vptest  %ymm7, %ymm7
   701          jne     .LBB8_7
   702          vpxor   %xmm7, %xmm7, %xmm7
   703          jmp     .LBB8_8
   704  .LBB8_7:                                #   in Loop: Header=BB8_2 Depth=1
   705          vroundpd        $8, %ymm12, %ymm1
   706          vcmpeqpd        %ymm1, %ymm12, %ymm8
   707          vcvttsd2si      %xmm1, %r9
   708          vpermilpd       $1, %xmm1, %xmm10       # xmm10 = xmm1[1,0]
   709          vcvttsd2si      %xmm10, %rcx
   710          vextractf128    $1, %ymm1, %xmm1
   711          vcvttsd2si      %xmm1, %rax
   712          vxorpd  %xmm10, %xmm10, %xmm10
   713          vmovq   %rax, %xmm6
   714          vpermilpd       $1, %xmm1, %xmm1        # xmm1 = xmm1[1,0]
   715          vcvttsd2si      %xmm1, %rax
   716          vmovq   %rax, %xmm1
   717          vpunpcklqdq     %xmm1, %xmm6, %xmm1     # xmm1 = xmm6[0],xmm1[0]
   718          vmovq   %r9, %xmm6
   719          vmovq   %rcx, %xmm0
   720          vpunpcklqdq     %xmm0, %xmm6, %xmm0     # xmm0 = xmm6[0],xmm0[0]
   721          vinserti128     $1, %xmm1, %ymm0, %ymm0
   722          vpsllq  $63, %ymm0, %ymm0
   723          vpor    %ymm2, %ymm0, %ymm1
   724          vcmpeqpd        %ymm10, %ymm13, %ymm6
   725          vbroadcastsd    .LCPI8_43(%rip), %ymm10 # ymm10 = [NaN,NaN,NaN,NaN]
   726          vblendvpd       %ymm6, %ymm2, %ymm10, %ymm6
   727          vmovapd %ymm9, %ymm10
   728          vblendvpd       %ymm8, %ymm1, %ymm6, %ymm1
   729          vxorpd  %xmm6, %xmm6, %xmm6
   730          vblendvpd       %ymm7, %ymm1, %ymm2, %ymm2
   731          vandpd  %ymm0, %ymm8, %ymm7
   732  .LBB8_8:                                #   in Loop: Header=BB8_2 Depth=1
   733          vpcmpeqd        %ymm9, %ymm9, %ymm9
   734          vandpd  %ymm5, %ymm12, %ymm0
   735          vandpd  %ymm5, %ymm15, %ymm1
   736          vpcmpeqq        %ymm5, %ymm1, %ymm15
   737          vpxor   %ymm9, %ymm15, %ymm1
   738          vpcmpeqq        %ymm5, %ymm0, %ymm8
   739          vpcmpeqq        %ymm5, %ymm11, %ymm11
   740          vpxor   %ymm9, %ymm11, %ymm0
   741          vpandn  %ymm0, %ymm8, %ymm0
   742          vpor    %ymm4, %ymm1, %ymm1
   743          vpand   %ymm0, %ymm1, %ymm0
   744          vptest  %ymm9, %ymm0
   745          jb      .LBB8_10
   746          vpxor   %ymm9, %ymm8, %ymm0
   747          vpandn  %ymm0, %ymm15, %ymm0
   748          vmovupd -128(%rsp), %ymm8               # 32-byte Reload
   749          vmovupd 1152(%rsp), %ymm9               # 32-byte Reload
   750          vcmpeqpd        %ymm8, %ymm9, %ymm1
   751          vcmpltpd        %ymm9, %ymm8, %ymm4
   752          vpsrad  $31, %ymm12, %ymm6
   753          vpxor   %ymm4, %ymm6, %ymm4
   754          vpxor   %xmm6, %xmm6, %xmm6
   755          vblendvpd       %ymm4, %ymm10, %ymm6, %ymm4
   756          vblendvpd       %ymm1, %ymm8, %ymm4, %ymm1
   757          vblendvpd       %ymm0, %ymm2, %ymm1, %ymm0
   758          vandpd  %ymm2, %ymm7, %ymm1
   759          vandpd  %ymm7, %ymm13, %ymm2
   760          vorpd   %ymm2, %ymm9, %ymm2
   761          vblendvpd       %ymm14, %ymm1, %ymm2, %ymm1
   762          vblendvpd       %ymm3, %ymm8, %ymm1, %ymm1
   763          vblendvpd       %ymm11, %ymm1, %ymm0, %ymm0
   764          vcmpunordpd     %ymm13, %ymm13, %ymm1
   765          vcmpunordpd     %ymm12, %ymm12, %ymm2
   766          vorpd   %ymm1, %ymm2, %ymm1
   767          vaddpd  %ymm13, %ymm12, %ymm2
   768          vblendvpd       %ymm1, %ymm2, %ymm0, %ymm2
   769          jmp     .LBB8_10
   770  .LBB8_11:
   771          addq    $1192, %rsp                     # imm = 0x4A8
   772          vzeroupper
   773          retq
   # ------------------------------------------------------------------
   # Constant pool (.LCPI9_*) for Pow_8x_F32_V below: IEEE-754 bit
   # masks, log-polynomial coefficients, the Cody-Waite ln(2) split,
   # and exp(x) Taylor coefficients used by the vectorized powf.
   # ------------------------------------------------------------------
   774  .LCPI9_0:                               # 0x7fffffff: |x| mask, clears the f32 sign bit
   775          .long   2147483647                      # 0x7fffffff
   776  .LCPI9_3:                               # sqrt(0.5): range-reduction threshold on the mantissa
   777          .long   0x3f3504f3                      # float 0.707106769
   778  .LCPI9_4:                               # -1.0: forms (m - 1) before the log polynomial
   779          .long   0xbf800000                      # float -1
   780  .LCPI9_5:                               # .LCPI9_5 .. .LCPI9_13 look like Cephes-style minimax
   781          .long   0x3def251a                      # float 0.116769984    coefficients of log(1+f) -- TODO confirm
   782  .LCPI9_6:
   783          .long   0xbdebd1b8                      # float -0.115146101
   784  .LCPI9_7:
   785          .long   0x3e11e9bf                      # float 0.142493233    ~ 1/7
   786  .LCPI9_8:
   787          .long   0xbdfe5d4f                      # float -0.12420141    ~ -1/8
   788  .LCPI9_9:
   789          .long   0x3e4cceac                      # float 0.200007141    ~ 1/5
   790  .LCPI9_10:
   791          .long   0xbe2aae50                      # float -0.166680574   ~ -1/6
   792  .LCPI9_11:
   793          .long   0x3eaaaaaa                      # float 0.333333313    ~ 1/3
   794  .LCPI9_12:
   795          .long   0xbe7ffffc                      # float -0.24999994    ~ -1/4
   796  .LCPI9_13:
   797          .long   0x3d9021bb                      # float 0.0703768358
   798  .LCPI9_15:                              # -(2^23 + 127): one add removes both the 2^23
   799          .long   0xcb00007f                      # float -8388735       int->float offset and the exponent bias
   800  .LCPI9_16:
   801          .long   0x3f800000                      # float 1
   802  .LCPI9_17:
   803          .long   0xbf000000                      # float -0.5
   804  .LCPI9_18:
   805          .long   0x3f000000                      # float 0.5
   806  .LCPI9_19:                              # log2(e): converts ln-scale to base-2 exponent
   807          .long   0x3fb8aa3b                      # float 1.44269502
   808  .LCPI9_20:                              # .LCPI9_20 + .LCPI9_21: Cody-Waite split of -ln(2)
   809          .long   0xbf318000                      # float -0.693359375   (high part, exactly representable)
   810  .LCPI9_21:
   811          .long   0xb95e8083                      # float -2.12194442E-4 (low part; sum ~ -0.6931472)
   812  .LCPI9_22:                              # -ln(2) as a single constant
   813          .long   0xbf317218                      # float -0.693147182
   814  .LCPI9_23:                              # .LCPI9_23 .. .LCPI9_27: Taylor coefficients of exp(x)
   815          .long   0x3d2aaaab                      # float 0.0416666679   = 1/24
   816  .LCPI9_24:
   817          .long   0x3c088889                      # float 0.00833333377  = 1/120
   818  .LCPI9_25:
   819          .long   0x3ab60b61                      # float 0.00138888892  = 1/720
   820  .LCPI9_26:
   821          .long   0x39500d01                      # float 1.98412701E-4  = 1/5040
   822  .LCPI9_27:
   823          .long   0x3e2aaaab                      # float 0.166666672    = 1/6
   824  .LCPI9_29:                              # 0xfe: max biased exponent of a finite f32 (overflow test)
   825          .long   254                             # 0xfe
   826  .LCPI9_30:                              # +300 / -300 (see .LCPI9_32): appear to clamp the scaled
   827          .long   0x43960000                      # float 300            exponent sum before overflow/underflow handling
   828  .LCPI9_31:                              # integer 1: lower bound on the result's biased exponent
   829          .long   1                               # 0x1
   830  .LCPI9_32:
   831          .long   0xc3960000                      # float -300
   832  .LCPI9_33:                              # 0x7f800000: f32 exponent mask / bit pattern of +Inf
   833          .long   2139095040                      # 0x7f800000
   834  .LCPI9_34:                              # quiet-NaN payload produced for invalid pow inputs
   835          .long   0x7fc00102                      # float NaN
   836  .LCPI9_1:                               # 0x007fffff per dword lane: f32 mantissa mask
   837          .quad   36028792732385279               # 0x7fffff007fffff
   838          .quad   36028792732385279               # 0x7fffff007fffff
   839  .LCPI9_2:                               # 0x3f000000 per dword lane: exponent bits of 0.5f;
   840          .quad   4539628425446424576             # 0x3f0000003f000000   OR maps the mantissa into [0.5, 1)
   841          .quad   4539628425446424576             # 0x3f0000003f000000
   842  .LCPI9_14:                              # 0x4b000000 per lane: bit pattern of 2^23f, the
   843          .quad   5404319554102886400             # 0x4b0000004b000000   int->float conversion trick base
   844  .LCPI9_28:                              # dword mask 0x000000ff per lane: isolates the 8-bit
   845          .byte   255                             # 0xff                 biased exponent after vpsrld $23
   846          .byte   0                               # 0x0
   847          .byte   0                               # 0x0
   848          .byte   0                               # 0x0
   849          .byte   255                             # 0xff
   850          .byte   0                               # 0x0
   851          .byte   0                               # 0x0
   852          .byte   0                               # 0x0
   853          .byte   255                             # 0xff
   854          .byte   0                               # 0x0
   855          .byte   0                               # 0x0
   856          .byte   0                               # 0x0
   857          .byte   255                             # 0xff
   858          .byte   0                               # 0x0
   859          .byte   0                               # 0x0
   860          .byte   0                               # 0x0
   861          .byte   255                             # 0xff
   862          .byte   0                               # 0x0
   863          .byte   0                               # 0x0
   864          .byte   0                               # 0x0
   865          .byte   255                             # 0xff
   866          .byte   0                               # 0x0
   867          .byte   0                               # 0x0
   868          .byte   0                               # 0x0
   869          .byte   255                             # 0xff
   870          .byte   0                               # 0x0
   871          .byte   0                               # 0x0
   872          .byte   0                               # 0x0
   873          .byte   255                             # 0xff
   874          .byte   0                               # 0x0
   875          .byte   0                               # 0x0
   876          .byte   0                               # 0x0
   877  Pow_8x_F32_V(float*, float*, unsigned long):                  # @Pow_8x_F32_V(float*, float*, unsigned long)
   878          subq    $872, %rsp                      # imm = 0x368
   879          andq    $-8, %rdx
   880          je      .LBB9_12
   881          xorl    %eax, %eax
   882          vbroadcastss    .LCPI9_0(%rip), %ymm0   # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
   883          vmovups %ymm0, 320(%rsp)                # 32-byte Spill
   884          vbroadcastss    .LCPI9_3(%rip), %ymm0   # ymm0 = [7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1]
   885          vmovups %ymm0, 800(%rsp)                # 32-byte Spill
   886          vpxor   %xmm7, %xmm7, %xmm7
   887          vbroadcastss    .LCPI9_4(%rip), %ymm0   # ymm0 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
   888          vmovups %ymm0, 768(%rsp)                # 32-byte Spill
   889          vbroadcastss    .LCPI9_5(%rip), %ymm0   # ymm0 = [1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1]
   890          vmovups %ymm0, 736(%rsp)                # 32-byte Spill
   891          vbroadcastss    .LCPI9_6(%rip), %ymm0   # ymm0 = [-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1]
   892          vmovups %ymm0, 704(%rsp)                # 32-byte Spill
   893          vbroadcastss    .LCPI9_7(%rip), %ymm0   # ymm0 = [1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1]
   894          vmovups %ymm0, 672(%rsp)                # 32-byte Spill
   895          vbroadcastss    .LCPI9_8(%rip), %ymm0   # ymm0 = [-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1]
   896          vmovups %ymm0, 640(%rsp)                # 32-byte Spill
   897          vbroadcastss    .LCPI9_9(%rip), %ymm0   # ymm0 = [2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1]
   898          vmovups %ymm0, 608(%rsp)                # 32-byte Spill
   899          vbroadcastss    .LCPI9_10(%rip), %ymm0  # ymm0 = [-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1]
   900          vmovups %ymm0, 576(%rsp)                # 32-byte Spill
   901          vbroadcastss    .LCPI9_11(%rip), %ymm0  # ymm0 = [3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1]
   902          vmovups %ymm0, 544(%rsp)                # 32-byte Spill
   903          vbroadcastss    .LCPI9_12(%rip), %ymm0  # ymm0 = [-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1]
   904          vmovups %ymm0, 512(%rsp)                # 32-byte Spill
   905          vbroadcastss    .LCPI9_13(%rip), %ymm0  # ymm0 = [7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2]
   906          vmovups %ymm0, 480(%rsp)                # 32-byte Spill
   907          vbroadcastsd    .LCPI9_14(%rip), %ymm0  # ymm0 = [5404319554102886400,5404319554102886400,5404319554102886400,5404319554102886400]
   908          vmovups %ymm0, 448(%rsp)                # 32-byte Spill
   909          vbroadcastss    .LCPI9_15(%rip), %ymm0  # ymm0 = [-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6]
   910          vmovups %ymm0, 416(%rsp)                # 32-byte Spill
   911          vbroadcastss    .LCPI9_16(%rip), %ymm0  # ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
   912          vmovups %ymm0, -128(%rsp)               # 32-byte Spill
   913          vbroadcastss    .LCPI9_17(%rip), %ymm0  # ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
   914          vmovups %ymm0, 384(%rsp)                # 32-byte Spill
   915          vbroadcastss    .LCPI9_18(%rip), %ymm0  # ymm0 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1]
   916          vmovups %ymm0, 352(%rsp)                # 32-byte Spill
   917          vbroadcastss    .LCPI9_19(%rip), %ymm0  # ymm0 = [1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0]
   918          vmovups %ymm0, 288(%rsp)                # 32-byte Spill
   919          vbroadcastss    .LCPI9_20(%rip), %ymm0  # ymm0 = [-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1]
   920          vmovups %ymm0, 256(%rsp)                # 32-byte Spill
   921          vbroadcastss    .LCPI9_21(%rip), %ymm0  # ymm0 = [-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4]
   922          vmovups %ymm0, 224(%rsp)                # 32-byte Spill
   923          vbroadcastss    .LCPI9_22(%rip), %ymm0  # ymm0 = [-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1]
   924          vmovups %ymm0, 192(%rsp)                # 32-byte Spill
   925          vbroadcastss    .LCPI9_23(%rip), %ymm0  # ymm0 = [4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2]
   926          vmovups %ymm0, 160(%rsp)                # 32-byte Spill
   927          vbroadcastss    .LCPI9_24(%rip), %ymm0  # ymm0 = [8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3]
   928          vmovups %ymm0, 128(%rsp)                # 32-byte Spill
   929          vbroadcastss    .LCPI9_25(%rip), %ymm0  # ymm0 = [1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3]
   930          vmovups %ymm0, 96(%rsp)                 # 32-byte Spill
   931          vbroadcastss    .LCPI9_26(%rip), %ymm0  # ymm0 = [1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4]
   932          vmovups %ymm0, 64(%rsp)                 # 32-byte Spill
   933          vbroadcastss    .LCPI9_27(%rip), %ymm0  # ymm0 = [1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1]
   934          vmovups %ymm0, 32(%rsp)                 # 32-byte Spill
   935          vbroadcastss    .LCPI9_29(%rip), %ymm0  # ymm0 = [254,254,254,254,254,254,254,254]
   936          vmovups %ymm0, (%rsp)                   # 32-byte Spill
   937          vbroadcastss    .LCPI9_30(%rip), %ymm0  # ymm0 = [3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2]
   938          vmovups %ymm0, -32(%rsp)                # 32-byte Spill
   939          vbroadcastss    .LCPI9_31(%rip), %ymm0  # ymm0 = [1,1,1,1,1,1,1,1]
   940          vmovups %ymm0, -64(%rsp)                # 32-byte Spill
   941          vpbroadcastd    .LCPI9_32(%rip), %ymm0  # ymm0 = [-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2]
   942          vmovdqu %ymm0, -96(%rsp)                # 32-byte Spill
   943          vpbroadcastd    .LCPI9_33(%rip), %ymm8  # ymm8 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040]
   944          vbroadcastss    .LCPI9_33(%rip), %ymm12 # ymm12 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040]
   945          jmp     .LBB9_2
   946  .LBB9_10:                               #   in Loop: Header=BB9_2 Depth=1
   947          vpxor   %ymm0, %ymm15, %ymm0
   948          vpandn  %ymm0, %ymm14, %ymm0
   949          vmovups -128(%rsp), %ymm14              # 32-byte Reload
   950          vmovups 832(%rsp), %ymm2                # 32-byte Reload
   951          vcmpeqps        %ymm2, %ymm14, %ymm3
   952          vcmpltps        %ymm2, %ymm14, %ymm4
   953          vxorps  %ymm4, %ymm11, %ymm4
   954          vpxor   %xmm7, %xmm7, %xmm7
   955          vblendvps       %ymm4, %ymm12, %ymm7, %ymm4
   956          vblendvps       %ymm3, %ymm14, %ymm4, %ymm3
   957          vblendvps       %ymm0, %ymm6, %ymm3, %ymm0
   958          vandps  %ymm6, %ymm10, %ymm3
   959          vandps  %ymm9, %ymm10, %ymm4
   960          vorps   %ymm2, %ymm4, %ymm4
   961          vblendvps       %ymm13, %ymm3, %ymm4, %ymm3
   962          vblendvps       %ymm1, %ymm14, %ymm3, %ymm1
   963          vblendvps       %ymm5, %ymm0, %ymm1, %ymm0
   964          vcmpunordps     %ymm9, %ymm9, %ymm1
   965          vcmpunordps     %ymm11, %ymm11, %ymm3
   966          vorps   %ymm1, %ymm3, %ymm1
   967          vaddps  %ymm9, %ymm11, %ymm3
   968          vblendvps       %ymm1, %ymm3, %ymm0, %ymm6
   969          vmovups %ymm6, (%rdi,%rax,4)
   970          addq    $8, %rax
   971          cmpq    %rdx, %rax
   972          jae     .LBB9_12
   973  .LBB9_2:                                # =>This Inner Loop Header: Depth=1
   974          vmovaps %ymm12, %ymm2
   975          vmovdqu (%rdi,%rax,4), %ymm9
   976          vmovups (%rsi,%rax,4), %ymm11
   977          vpand   320(%rsp), %ymm9, %ymm12        # 32-byte Folded Reload
   978          vmovaps .LCPI9_1(%rip), %xmm1           # xmm1 = [36028792732385279,36028792732385279]
   979          vandps  (%rdi,%rax,4), %xmm1, %xmm0
   980          vmovaps .LCPI9_2(%rip), %xmm3           # xmm3 = [4539628425446424576,4539628425446424576]
   981          vorps   %xmm3, %xmm0, %xmm0
   982          vandps  16(%rdi,%rax,4), %xmm1, %xmm1
   983          vorps   %xmm3, %xmm1, %xmm1
   984          vinsertf128     $1, %xmm1, %ymm0, %ymm0
   985          vmovups 800(%rsp), %ymm1                # 32-byte Reload
   986          vcmpltps        %ymm0, %ymm1, %ymm1
   987          vandnps %ymm0, %ymm1, %ymm4
   988          vaddps  768(%rsp), %ymm0, %ymm0         # 32-byte Folded Reload
   989          vaddps  %ymm4, %ymm0, %ymm4
   990          vmulps  %ymm4, %ymm4, %ymm6
   991          vmulps  %ymm6, %ymm6, %ymm0
   992          vmovups 704(%rsp), %ymm5                # 32-byte Reload
   993          vfmadd213ps     736(%rsp), %ymm4, %ymm5 # 32-byte Folded Reload
   994          vmovups 640(%rsp), %ymm10               # 32-byte Reload
   995          vfmadd213ps     672(%rsp), %ymm4, %ymm10 # 32-byte Folded Reload
   996          vfmadd231ps     %ymm5, %ymm6, %ymm10    # ymm10 = (ymm6 * ymm5) + ymm10
   997          vmovups 576(%rsp), %ymm5                # 32-byte Reload
   998          vfmadd213ps     608(%rsp), %ymm4, %ymm5 # 32-byte Folded Reload
   999          vmovups 512(%rsp), %ymm13               # 32-byte Reload
  1000          vfmadd213ps     544(%rsp), %ymm4, %ymm13 # 32-byte Folded Reload
  1001          vmulps  %ymm0, %ymm0, %ymm14
  1002          vfmadd132ps     480(%rsp), %ymm13, %ymm14 # 32-byte Folded Reload
  1003          vfmadd231ps     %ymm5, %ymm6, %ymm14    # ymm14 = (ymm6 * ymm5) + ymm14
  1004          vfmadd231ps     %ymm10, %ymm0, %ymm14   # ymm14 = (ymm0 * ymm10) + ymm14
  1005          vmulps  %ymm4, %ymm6, %ymm0
  1006          vmulps  %ymm0, %ymm14, %ymm0
  1007          vmovdqu %ymm12, 832(%rsp)               # 32-byte Spill
  1008          vpsrld  $23, %ymm12, %ymm5
  1009          vpor    448(%rsp), %ymm5, %ymm5         # 32-byte Folded Reload
  1010          vaddps  416(%rsp), %ymm5, %ymm5         # 32-byte Folded Reload
  1011          vmovups -128(%rsp), %ymm3               # 32-byte Reload
  1012          vandps  %ymm3, %ymm1, %ymm1
  1013          vaddps  %ymm1, %ymm5, %ymm5
  1014          vmulps  %ymm5, %ymm11, %ymm1
  1015          vroundps        $8, %ymm1, %ymm1
  1016          vfnmadd213ps    %ymm1, %ymm11, %ymm5    # ymm5 = -(ymm11 * ymm5) + ymm1
  1017          vmovups 384(%rsp), %ymm14               # 32-byte Reload
  1018          vmovaps %ymm14, %ymm10
  1019          vfmadd213ps     %ymm4, %ymm6, %ymm10    # ymm10 = (ymm6 * ymm10) + ymm4
  1020          vaddps  %ymm0, %ymm10, %ymm10
  1021          vmovups 352(%rsp), %ymm12               # 32-byte Reload
  1022          vmulps  %ymm4, %ymm12, %ymm13
  1023          vmulps  %ymm6, %ymm14, %ymm14
  1024          vfmadd231ps     %ymm13, %ymm4, %ymm14   # ymm14 = (ymm4 * ymm13) + ymm14
  1025          vsubps  %ymm4, %ymm10, %ymm4
  1026          vfmadd231ps     %ymm6, %ymm12, %ymm4    # ymm4 = (ymm12 * ymm6) + ymm4
  1027          vmovups 288(%rsp), %ymm15               # 32-byte Reload
  1028          vmulps  %ymm15, %ymm11, %ymm6
  1029          vmulps  %ymm6, %ymm10, %ymm6
  1030          vroundps        $8, %ymm6, %ymm6
  1031          vmulps  256(%rsp), %ymm6, %ymm13        # 32-byte Folded Reload
  1032          vfmadd231ps     %ymm10, %ymm11, %ymm13  # ymm13 = (ymm11 * ymm10) + ymm13
  1033          vfnmadd231ps    224(%rsp), %ymm6, %ymm13 # 32-byte Folded Reload
  1034          vsubps  %ymm0, %ymm14, %ymm0
  1035          vaddps  %ymm4, %ymm0, %ymm0
  1036          vmovups 192(%rsp), %ymm10               # 32-byte Reload
  1037          vmulps  %ymm5, %ymm10, %ymm4
  1038          vfnmadd231ps    %ymm0, %ymm11, %ymm4    # ymm4 = -(ymm11 * ymm0) + ymm4
  1039          vaddps  %ymm4, %ymm13, %ymm0
  1040          vmulps  %ymm0, %ymm15, %ymm4
  1041          vroundps        $8, %ymm4, %ymm4
  1042          vfmadd231ps     %ymm10, %ymm4, %ymm0    # ymm0 = (ymm4 * ymm10) + ymm0
  1043          vmulps  %ymm0, %ymm0, %ymm5
  1044          vmulps  %ymm5, %ymm5, %ymm10
  1045          vmovups 64(%rsp), %ymm13                # 32-byte Reload
  1046          vfmadd213ps     96(%rsp), %ymm0, %ymm13 # 32-byte Folded Reload
  1047          vmovups 32(%rsp), %ymm14                # 32-byte Reload
  1048          vfmadd213ps     %ymm12, %ymm0, %ymm14   # ymm14 = (ymm0 * ymm14) + ymm12
  1049          vfmadd231ps     %ymm13, %ymm10, %ymm14  # ymm14 = (ymm10 * ymm13) + ymm14
  1050          vmovups 128(%rsp), %ymm10               # 32-byte Reload
  1051          vfmadd213ps     160(%rsp), %ymm0, %ymm10 # 32-byte Folded Reload
  1052          vfmadd231ps     %ymm10, %ymm5, %ymm14   # ymm14 = (ymm5 * ymm10) + ymm14
  1053          vaddps  %ymm3, %ymm0, %ymm10
  1054          vfmadd231ps     %ymm14, %ymm5, %ymm10   # ymm10 = (ymm5 * ymm14) + ymm10
  1055          vaddps  %ymm1, %ymm6, %ymm0
  1056          vaddps  %ymm4, %ymm0, %ymm14
  1057          vcvtps2dq       %ymm14, %ymm4
  1058          vpsrld  $23, %ymm10, %ymm0
  1059          vpand   .LCPI9_28(%rip), %ymm0, %ymm0
  1060          vpaddd  %ymm4, %ymm0, %ymm0
  1061          vpcmpgtd        (%rsp), %ymm0, %ymm1            # 32-byte Folded Reload
  1062          vmovups -32(%rsp), %ymm3                # 32-byte Reload
  1063          vcmpltps        %ymm14, %ymm3, %ymm5
  1064          vpor    %ymm5, %ymm1, %ymm1
  1065          vmovdqu -64(%rsp), %ymm3                # 32-byte Reload
  1066          vpcmpgtd        %ymm0, %ymm3, %ymm0
  1067          vcmpltps        -96(%rsp), %ymm14, %ymm5        # 32-byte Folded Reload
  1068          vpor    %ymm5, %ymm0, %ymm0
  1069          vpslld  $23, %ymm4, %ymm4
  1070          vpaddd  %ymm4, %ymm10, %ymm6
  1071          vpor    %ymm1, %ymm0, %ymm4
  1072          vtestps %ymm4, %ymm4
  1073          jne     .LBB9_3
  1074          vpcmpeqd        %ymm15, %ymm15, %ymm15
  1075          vmovaps %ymm2, %ymm12
  1076          jmp     .LBB9_5
  1077  .LBB9_3:                                #   in Loop: Header=BB9_2 Depth=1
  1078          vpandn  %ymm6, %ymm0, %ymm0
  1079          vmovaps %ymm2, %ymm12
  1080          vblendvps       %ymm1, %ymm2, %ymm0, %ymm6
  1081          vpcmpeqd        %ymm15, %ymm15, %ymm15
  1082  .LBB9_5:                                #   in Loop: Header=BB9_2 Depth=1
  1083          vpand   %ymm8, %ymm9, %ymm5
  1084          vpcmpeqd        %ymm7, %ymm5, %ymm4
  1085          vcmpltps        %ymm7, %ymm11, %ymm13
  1086          vcmpeqps        %ymm7, %ymm11, %ymm1
  1087          vandps  -128(%rsp), %ymm1, %ymm0        # 32-byte Folded Reload
  1088          vblendvps       %ymm13, %ymm12, %ymm0, %ymm0
  1089          vblendvps       %ymm4, %ymm0, %ymm6, %ymm6
  1090          vmovmskps       %ymm9, %ecx
  1091          testl   %ecx, %ecx
  1092          jne     .LBB9_7
  1093          vxorps  %xmm10, %xmm10, %xmm10
  1094          jmp     .LBB9_8
  1095  .LBB9_7:                                #   in Loop: Header=BB9_2 Depth=1
  1096          vroundps        $8, %ymm11, %ymm0
  1097          vcmpeqps        %ymm0, %ymm11, %ymm0
  1098          vcvtps2dq       %ymm11, %ymm10
  1099          vpslld  $31, %ymm10, %ymm10
  1100          vpor    %ymm6, %ymm10, %ymm12
  1101          vpxor   %xmm3, %xmm3, %xmm3
  1102          vcmpeqps        %ymm3, %ymm9, %ymm7
  1103          vbroadcastss    .LCPI9_34(%rip), %ymm3  # ymm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
  1104          vblendvps       %ymm7, %ymm6, %ymm3, %ymm3
  1105          vblendvps       %ymm0, %ymm12, %ymm3, %ymm3
  1106          vmovaps %ymm2, %ymm12
  1107          vpsrad  $31, %ymm9, %ymm7
  1108          vblendvps       %ymm7, %ymm3, %ymm6, %ymm6
  1109          vandps  %ymm0, %ymm10, %ymm10
  1110  .LBB9_8:                                #   in Loop: Header=BB9_2 Depth=1
  1111          vpcmpeqd        %ymm5, %ymm8, %ymm0
  1112          vpxor   %ymm0, %ymm15, %ymm5
  1113          vandps  %ymm8, %ymm11, %ymm0
  1114          vandps  %ymm8, %ymm14, %ymm3
  1115          vpcmpeqd        %ymm3, %ymm8, %ymm14
  1116          vpxor   %ymm15, %ymm14, %ymm3
  1117          vpcmpeqd        %ymm0, %ymm8, %ymm0
  1118          vpandn  %ymm5, %ymm0, %ymm7
  1119          vpor    %ymm4, %ymm3, %ymm3
  1120          vpand   %ymm7, %ymm3, %ymm3
  1121          vtestps %ymm15, %ymm3
  1122          jae     .LBB9_10
  1123          vpxor   %xmm7, %xmm7, %xmm7
  1124          vmovups %ymm6, (%rdi,%rax,4)
  1125          addq    $8, %rax
  1126          cmpq    %rdx, %rax
  1127          jb      .LBB9_2
  1128  .LBB9_12:
  1129          addq    $872, %rsp                      # imm = 0x368
  1130          vzeroupper
  1131          retq
  # --- Constant pool for PowNumber_4x_F64_V (defined below). Each scalar is
  # --- broadcast once in the prologue and spilled to the stack frame:
  # ---   _0  log2(e);  _1 exponent-field mask / +Inf bit pattern;  _2 1.0
  # ---   _3  abs-value mask;  _6 sqrt(0.5) (log range-reduction threshold)
  # ---   _8 .. _20  coefficients of the rational log() approximation
  # ---              (presumably a minimax fit - values match no simple series)
  # ---   _21 2^52 and _22 its negative fused with the 0x3ff bias: OR the
  # ---       biased exponent into 2^52, add _22, and the unbiased exponent
  # ---       drops out as a double
  # ---   _25/_26  Cody-Waite hi/lo split of ln2;  _27 ln2
  # ---   _28 .. _38  reciprocal factorials (1/3! through 1/13!, interleaved)
  # ---               for the exp() Taylor series
  # ---   _39 2046 = largest finite biased exponent (overflow check)
  # ---   _40/_42  +/-3000 argument clamp for exp();  _41 1;  _43 quiet NaN
  # ---   _4/_5  16-byte (xmm) constants: 52-bit mantissa mask and the
  # ---          exponent bits of 0.5, used to squash |x| into [0.5, 1)
  1132  .LCPI10_0:
  1133          .quad   0x3ff71547652b82fe              # double 1.4426950408889634
  1134  .LCPI10_1:
  1135          .quad   9218868437227405312             # 0x7ff0000000000000
  1136  .LCPI10_2:
  1137          .quad   0x3ff0000000000000              # double 1
  1138  .LCPI10_3:
  1139          .quad   9223372036854775807             # 0x7fffffffffffffff
  1140  .LCPI10_6:
  1141          .quad   0x3fe6a09e667f3bcd              # double 0.70710678118654757
  1142  .LCPI10_7:
  1143          .quad   0xbff0000000000000              # double -1
  1144  .LCPI10_8:
  1145          .quad   0x401a509f46f4fa53              # double 6.5787325942061043
  1146  .LCPI10_9:
  1147          .quad   0x3fdfe818a0fe1a83              # double 0.49854102823193375
  1148  .LCPI10_10:
  1149          .quad   0x3f07bc0962b395ca              # double 4.5270000862445198E-5
  1150  .LCPI10_11:
  1151          .quad   0x404e798eb86c3351              # double 60.94966798098779
  1152  .LCPI10_12:
  1153          .quad   0x403de9738b8cb9c9              # double 29.911919328553072
  1154  .LCPI10_13:
  1155          .quad   0x40340a202d99830a              # double 20.039553499201283
  1156  .LCPI10_14:
  1157          .quad   0x404c8e7597479a10              # double 57.112963590585537
  1158  .LCPI10_15:
  1159          .quad   0x4054c30b52213498              # double 83.047565967967216
  1160  .LCPI10_16:
  1161          .quad   0x402e20359e903e37              # double 15.062909083469192
  1162  .LCPI10_17:
  1163          .quad   0x407351945dc908a5              # double 309.09872225312057
  1164  .LCPI10_18:
  1165          .quad   0x406bb86590fcfb56              # double 221.76239823732857
  1166  .LCPI10_19:
  1167          .quad   0x404e0f304466448e              # double 60.118660497603841
  1168  .LCPI10_20:
  1169          .quad   0x406b0db13e48e066              # double 216.42788614495947
  1170  .LCPI10_21:
  1171          .quad   4841369599423283200             # 0x4330000000000000
  1172  .LCPI10_22:
  1173          .quad   0xc3300000000003ff              # double -4503599627371519
  1174  .LCPI10_23:
  1175          .quad   0xbfe0000000000000              # double -0.5
  1176  .LCPI10_24:
  1177          .quad   0x3fe0000000000000              # double 0.5
  1178  .LCPI10_25:
  1179          .quad   0xbfe62e4000000000              # double -0.693145751953125
  1180  .LCPI10_26:
  1181          .quad   0x3eb7f7d1cf79abca              # double 1.4286068203094173E-6
  1182  .LCPI10_27:
  1183          .quad   0x3fe62e42fefa39ef              # double 0.69314718055994529
  1184  .LCPI10_28:
  1185          .quad   0x3e21eed8eff8d898              # double 2.08767569878681E-9
  1186  .LCPI10_29:
  1187          .quad   0x3de6124613a86d09              # double 1.6059043836821613E-10
  1188  .LCPI10_30:
  1189          .quad   0x3e927e4fb7789f5c              # double 2.7557319223985888E-7
  1190  .LCPI10_31:
  1191          .quad   0x3e5ae64567f544e4              # double 2.505210838544172E-8
  1192  .LCPI10_32:
  1193          .quad   0x3efa01a01a01a01a              # double 2.4801587301587302E-5
  1194  .LCPI10_33:
  1195          .quad   0x3ec71de3a556c734              # double 2.7557319223985893E-6
  1196  .LCPI10_34:
  1197          .quad   0x3f56c16c16c16c17              # double 0.0013888888888888889
  1198  .LCPI10_35:
  1199          .quad   0x3f2a01a01a01a01a              # double 1.9841269841269841E-4
  1200  .LCPI10_36:
  1201          .quad   0x3fa5555555555555              # double 0.041666666666666664
  1202  .LCPI10_37:
  1203          .quad   0x3f81111111111111              # double 0.0083333333333333332
  1204  .LCPI10_38:
  1205          .quad   0x3fc5555555555555              # double 0.16666666666666666
  1206  .LCPI10_39:
  1207          .quad   2046                            # 0x7fe
  1208  .LCPI10_40:
  1209          .quad   0x40a7700000000000              # double 3000
  1210  .LCPI10_41:
  1211          .quad   1                               # 0x1
  1212  .LCPI10_42:
  1213          .quad   0xc0a7700000000000              # double -3000
  1214  .LCPI10_43:
  1215          .quad   0x7ff8002040000000              # double NaN
  1216  .LCPI10_4:
  1217          .quad   4503599627370495                # 0xfffffffffffff
  1218          .quad   4503599627370495                # 0xfffffffffffff
  1219  .LCPI10_5:
  1220          .quad   4602678819172646912             # 0x3fe0000000000000
  1221          .quad   4602678819172646912             # 0x3fe0000000000000
  1222  PowNumber_4x_F64_V(double*, double, unsigned long):             # @PowNumber_4x_F64_V(double*, double, unsigned long)
  # ---------------------------------------------------------------------------
  # void PowNumber_4x_F64_V(double *x, double c, unsigned long n)
  # Compiler-generated (LLVM) AVX2 kernel, System V AMD64 ABI:
  #   rdi  = x  (array of doubles, updated in place)
  #   xmm0 = c  (scalar exponent, broadcast to all four lanes)
  #   rsi  = n  (element count; truncated to a multiple of 4 below)
  # Computes x[i] = x[i]^c for i in [0, n & ~3), four lanes per iteration,
  # via exp(c * log(x)) with hi/lo extended-precision argument handling.
  # NOTE(review): the "4x" variant leaves the last n % 4 elements untouched;
  # the caller presumably handles the tail - confirm against the Go wrapper.
  # Do not hand-reorder instructions: the FMA/round sequencing encodes the
  # error-compensation scheme and is order-sensitive.
  # Clobbers: rax, rcx, rdx, r8, ymm0-ymm15, flags.  Stack: 1352-byte frame.
  # ---------------------------------------------------------------------------
  1223          subq    $1352, %rsp                     # imm = 0x548
  # n &= ~3; nothing to do when fewer than 4 elements remain.
  1224          andq    $-4, %rsi
  1225          je      .LBB10_10
  # Prologue: broadcast c, precompute c-dependent masks, and spill every
  # loop-invariant constant - the loop body needs more than 16 ymm registers.
  1226          vbroadcastsd    %xmm0, %ymm0
  1227          vbroadcastsd    .LCPI10_0(%rip), %ymm1  # ymm1 = [1.4426950408889634E+0,1.4426950408889634E+0,1.4426950408889634E+0,1.4426950408889634E+0]
  1228          vbroadcastsd    .LCPI10_1(%rip), %ymm2  # ymm2 = [9218868437227405312,9218868437227405312,9218868437227405312,9218868437227405312]
  1229          vmovupd %ymm1, 1312(%rsp)               # 32-byte Spill
  1230          vmulpd  %ymm1, %ymm0, %ymm1
  1231          vmovupd %ymm1, 1280(%rsp)               # 32-byte Spill
  1232          vandpd  %ymm2, %ymm0, %ymm1
  1233          vmovupd %ymm1, 1248(%rsp)               # 32-byte Spill
  # Classify c once: masks for c < 0 and c == 0, and a precomputed result for
  # zero bases (1.0 when c == 0, +Inf bit pattern when c < 0) at 1216(%rsp).
  1234          vxorpd  %xmm1, %xmm1, %xmm1
  1235          vcmpltpd        %ymm1, %ymm0, %ymm3
  1236          vcmpeqpd        %ymm1, %ymm0, %ymm4
  1237          vbroadcastsd    .LCPI10_2(%rip), %ymm1  # ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
  1238          vmovupd %ymm4, -64(%rsp)                # 32-byte Spill
  1239          vandpd  %ymm1, %ymm4, %ymm1
  1240          vbroadcastsd    .LCPI10_1(%rip), %ymm4  # ymm4 = [9218868437227405312,9218868437227405312,9218868437227405312,9218868437227405312]
  1241          vmovupd %ymm3, -32(%rsp)                # 32-byte Spill
  1242          vmovupd %ymm4, -128(%rsp)               # 32-byte Spill
  1243          vblendvpd       %ymm3, %ymm4, %ymm1, %ymm1
  1244          vmovupd %ymm1, 1216(%rsp)               # 32-byte Spill
  # Sign mask of c replicated into both 32-bit halves of each 64-bit lane.
  1245          vpsrad  $31, %ymm0, %ymm1
  1246          vpshufd $245, %ymm1, %ymm1              # ymm1 = ymm1[1,1,3,3,5,5,7,7]
  1247          vmovdqu %ymm1, -96(%rsp)                # 32-byte Spill
  # r8 = element index.
  1248          xorl    %r8d, %r8d
  # Broadcast the .LCPI10_* pool (see comments above the pool for meanings)
  # and spill each value to a fixed frame slot for reloads inside the loop.
  1249          vbroadcastsd    .LCPI10_3(%rip), %ymm1  # ymm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
  1250          vmovups %ymm1, 1184(%rsp)               # 32-byte Spill
  1251          vbroadcastsd    .LCPI10_6(%rip), %ymm1  # ymm1 = [7.0710678118654757E-1,7.0710678118654757E-1,7.0710678118654757E-1,7.0710678118654757E-1]
  1252          vmovups %ymm1, 1152(%rsp)               # 32-byte Spill
  1253          vbroadcastsd    .LCPI10_7(%rip), %ymm1  # ymm1 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
  1254          vmovups %ymm1, 1120(%rsp)               # 32-byte Spill
  1255          vbroadcastsd    .LCPI10_8(%rip), %ymm1  # ymm1 = [6.5787325942061043E+0,6.5787325942061043E+0,6.5787325942061043E+0,6.5787325942061043E+0]
  1256          vmovups %ymm1, 1088(%rsp)               # 32-byte Spill
  1257          vbroadcastsd    .LCPI10_9(%rip), %ymm1  # ymm1 = [4.9854102823193375E-1,4.9854102823193375E-1,4.9854102823193375E-1,4.9854102823193375E-1]
  1258          vmovups %ymm1, 1056(%rsp)               # 32-byte Spill
  1259          vbroadcastsd    .LCPI10_10(%rip), %ymm1 # ymm1 = [4.5270000862445198E-5,4.5270000862445198E-5,4.5270000862445198E-5,4.5270000862445198E-5]
  1260          vmovups %ymm1, 1024(%rsp)               # 32-byte Spill
  1261          vbroadcastsd    .LCPI10_11(%rip), %ymm1 # ymm1 = [6.094966798098779E+1,6.094966798098779E+1,6.094966798098779E+1,6.094966798098779E+1]
  1262          vmovups %ymm1, 992(%rsp)                # 32-byte Spill
  1263          vbroadcastsd    .LCPI10_12(%rip), %ymm1 # ymm1 = [2.9911919328553072E+1,2.9911919328553072E+1,2.9911919328553072E+1,2.9911919328553072E+1]
  1264          vmovups %ymm1, 960(%rsp)                # 32-byte Spill
  1265          vbroadcastsd    .LCPI10_13(%rip), %ymm1 # ymm1 = [2.0039553499201283E+1,2.0039553499201283E+1,2.0039553499201283E+1,2.0039553499201283E+1]
  1266          vmovups %ymm1, 928(%rsp)                # 32-byte Spill
  1267          vbroadcastsd    .LCPI10_14(%rip), %ymm1 # ymm1 = [5.7112963590585537E+1,5.7112963590585537E+1,5.7112963590585537E+1,5.7112963590585537E+1]
  1268          vmovups %ymm1, 896(%rsp)                # 32-byte Spill
  1269          vbroadcastsd    .LCPI10_15(%rip), %ymm1 # ymm1 = [8.3047565967967216E+1,8.3047565967967216E+1,8.3047565967967216E+1,8.3047565967967216E+1]
  1270          vmovups %ymm1, 864(%rsp)                # 32-byte Spill
  1271          vbroadcastsd    .LCPI10_16(%rip), %ymm1 # ymm1 = [1.5062909083469192E+1,1.5062909083469192E+1,1.5062909083469192E+1,1.5062909083469192E+1]
  1272          vmovups %ymm1, 832(%rsp)                # 32-byte Spill
  1273          vbroadcastsd    .LCPI10_17(%rip), %ymm1 # ymm1 = [3.0909872225312057E+2,3.0909872225312057E+2,3.0909872225312057E+2,3.0909872225312057E+2]
  1274          vmovups %ymm1, 800(%rsp)                # 32-byte Spill
  1275          vbroadcastsd    .LCPI10_18(%rip), %ymm1 # ymm1 = [2.2176239823732857E+2,2.2176239823732857E+2,2.2176239823732857E+2,2.2176239823732857E+2]
  1276          vmovups %ymm1, 768(%rsp)                # 32-byte Spill
  1277          vbroadcastsd    .LCPI10_19(%rip), %ymm1 # ymm1 = [6.0118660497603841E+1,6.0118660497603841E+1,6.0118660497603841E+1,6.0118660497603841E+1]
  1278          vmovups %ymm1, 736(%rsp)                # 32-byte Spill
  1279          vbroadcastsd    .LCPI10_20(%rip), %ymm1 # ymm1 = [2.1642788614495947E+2,2.1642788614495947E+2,2.1642788614495947E+2,2.1642788614495947E+2]
  1280          vmovups %ymm1, 704(%rsp)                # 32-byte Spill
  1281          vbroadcastsd    .LCPI10_21(%rip), %ymm1 # ymm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
  1282          vmovups %ymm1, 672(%rsp)                # 32-byte Spill
  1283          vbroadcastsd    .LCPI10_22(%rip), %ymm1 # ymm1 = [-4.503599627371519E+15,-4.503599627371519E+15,-4.503599627371519E+15,-4.503599627371519E+15]
  1284          vmovups %ymm1, 640(%rsp)                # 32-byte Spill
  # ymm13 keeps 1.0 live in a register for the whole loop.
  1285          vbroadcastsd    .LCPI10_2(%rip), %ymm13 # ymm13 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
  1286          vbroadcastsd    .LCPI10_23(%rip), %ymm1 # ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
  1287          vmovups %ymm1, 608(%rsp)                # 32-byte Spill
  1288          vbroadcastsd    .LCPI10_24(%rip), %ymm1 # ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
  1289          vmovups %ymm1, 576(%rsp)                # 32-byte Spill
  1290          vbroadcastsd    .LCPI10_25(%rip), %ymm1 # ymm1 = [-6.93145751953125E-1,-6.93145751953125E-1,-6.93145751953125E-1,-6.93145751953125E-1]
  1291          vmovups %ymm1, 544(%rsp)                # 32-byte Spill
  1292          vbroadcastsd    .LCPI10_26(%rip), %ymm1 # ymm1 = [1.4286068203094173E-6,1.4286068203094173E-6,1.4286068203094173E-6,1.4286068203094173E-6]
  1293          vmovups %ymm1, 512(%rsp)                # 32-byte Spill
  1294          vbroadcastsd    .LCPI10_27(%rip), %ymm1 # ymm1 = [6.9314718055994529E-1,6.9314718055994529E-1,6.9314718055994529E-1,6.9314718055994529E-1]
  1295          vmovups %ymm1, 480(%rsp)                # 32-byte Spill
  1296          vbroadcastsd    .LCPI10_28(%rip), %ymm1 # ymm1 = [2.08767569878681E-9,2.08767569878681E-9,2.08767569878681E-9,2.08767569878681E-9]
  1297          vmovups %ymm1, 448(%rsp)                # 32-byte Spill
  1298          vbroadcastsd    .LCPI10_29(%rip), %ymm1 # ymm1 = [1.6059043836821613E-10,1.6059043836821613E-10,1.6059043836821613E-10,1.6059043836821613E-10]
  1299          vmovups %ymm1, 416(%rsp)                # 32-byte Spill
  1300          vbroadcastsd    .LCPI10_30(%rip), %ymm1 # ymm1 = [2.7557319223985888E-7,2.7557319223985888E-7,2.7557319223985888E-7,2.7557319223985888E-7]
  1301          vmovups %ymm1, 384(%rsp)                # 32-byte Spill
  1302          vbroadcastsd    .LCPI10_31(%rip), %ymm1 # ymm1 = [2.505210838544172E-8,2.505210838544172E-8,2.505210838544172E-8,2.505210838544172E-8]
  1303          vmovups %ymm1, 352(%rsp)                # 32-byte Spill
  1304          vbroadcastsd    .LCPI10_32(%rip), %ymm1 # ymm1 = [2.4801587301587302E-5,2.4801587301587302E-5,2.4801587301587302E-5,2.4801587301587302E-5]
  1305          vmovups %ymm1, 320(%rsp)                # 32-byte Spill
  1306          vbroadcastsd    .LCPI10_33(%rip), %ymm1 # ymm1 = [2.7557319223985893E-6,2.7557319223985893E-6,2.7557319223985893E-6,2.7557319223985893E-6]
  1307          vmovups %ymm1, 288(%rsp)                # 32-byte Spill
  1308          vbroadcastsd    .LCPI10_34(%rip), %ymm1 # ymm1 = [1.3888888888888889E-3,1.3888888888888889E-3,1.3888888888888889E-3,1.3888888888888889E-3]
  1309          vmovups %ymm1, 256(%rsp)                # 32-byte Spill
  1310          vbroadcastsd    .LCPI10_35(%rip), %ymm1 # ymm1 = [1.9841269841269841E-4,1.9841269841269841E-4,1.9841269841269841E-4,1.9841269841269841E-4]
  1311          vmovups %ymm1, 224(%rsp)                # 32-byte Spill
  1312          vbroadcastsd    .LCPI10_36(%rip), %ymm1 # ymm1 = [4.1666666666666664E-2,4.1666666666666664E-2,4.1666666666666664E-2,4.1666666666666664E-2]
  1313          vmovups %ymm1, 192(%rsp)                # 32-byte Spill
  1314          vbroadcastsd    .LCPI10_37(%rip), %ymm1 # ymm1 = [8.3333333333333332E-3,8.3333333333333332E-3,8.3333333333333332E-3,8.3333333333333332E-3]
  1315          vmovups %ymm1, 160(%rsp)                # 32-byte Spill
  1316          vbroadcastsd    .LCPI10_38(%rip), %ymm1 # ymm1 = [1.6666666666666666E-1,1.6666666666666666E-1,1.6666666666666666E-1,1.6666666666666666E-1]
  1317          vmovups %ymm1, 128(%rsp)                # 32-byte Spill
  1318          vbroadcastsd    .LCPI10_39(%rip), %ymm1 # ymm1 = [2046,2046,2046,2046]
  1319          vmovups %ymm1, 96(%rsp)                 # 32-byte Spill
  1320          vbroadcastsd    .LCPI10_40(%rip), %ymm1 # ymm1 = [3.0E+3,3.0E+3,3.0E+3,3.0E+3]
  1321          vmovups %ymm1, 64(%rsp)                 # 32-byte Spill
  1322          vbroadcastsd    .LCPI10_41(%rip), %ymm1 # ymm1 = [1,1,1,1]
  1323          vmovups %ymm1, 32(%rsp)                 # 32-byte Spill
  1324          vbroadcastsd    .LCPI10_42(%rip), %ymm1 # ymm1 = [-3.0E+3,-3.0E+3,-3.0E+3,-3.0E+3]
  1325          vmovupd %ymm1, (%rsp)                   # 32-byte Spill
  1326          jmp     .LBB10_2
  # Loop latch: store the four finished results (ymm6) and advance.
  1327  .LBB10_9:                               #   in Loop: Header=BB10_2 Depth=1
  1328          vmovupd %ymm6, (%rdi,%r8,8)
  1329          addq    $4, %r8
  1330          cmpq    %rsi, %r8
  1331          jae     .LBB10_10
  # Main loop: process x[r8 .. r8+3] (loaded into ymm15).
  1332  .LBB10_2:                               # =>This Inner Loop Header: Depth=1
  1333          vmovdqu (%rdi,%r8,8), %ymm15
  # Squash |x| into m in [0.5, 1): keep the 52 mantissa bits (.LCPI10_4) and
  # OR in the exponent bits of 0.5 (.LCPI10_5); done per xmm half, re-joined.
  1334          vmovapd .LCPI10_4(%rip), %xmm1          # xmm1 = [4503599627370495,4503599627370495]
  1335          vandpd  (%rdi,%r8,8), %xmm1, %xmm3
  1336          vmovapd .LCPI10_5(%rip), %xmm5          # xmm5 = [4602678819172646912,4602678819172646912]
  1337          vorpd   %xmm5, %xmm3, %xmm3
  1338          vandpd  16(%rdi,%r8,8), %xmm1, %xmm4
  1339          vorpd   %xmm5, %xmm4, %xmm4
  1340          vinsertf128     $1, %xmm4, %ymm3, %ymm3
  # log() range reduction around 1.0: ymm6 flags lanes with m > sqrt(0.5);
  # t = m - 1 (+ m again on the low lanes), so t stays small near 1.
  1341          vmovupd 1152(%rsp), %ymm1               # 32-byte Reload
  1342          vcmpltpd        %ymm3, %ymm1, %ymm6
  1343          vandnpd %ymm3, %ymm6, %ymm4
  1344          vaddpd  1120(%rsp), %ymm3, %ymm3        # 32-byte Folded Reload
  1345          vaddpd  %ymm4, %ymm3, %ymm8
  # Rational approximation of log(1+t): numerator (ymm7) and denominator
  # (ymm11) polynomials in t = ymm8, combined by the vdivpd below.
  1346          vmulpd  %ymm8, %ymm8, %ymm4
  1347          vmulpd  %ymm4, %ymm4, %ymm3
  1348          vmovupd 1056(%rsp), %ymm7               # 32-byte Reload
  1349          vfmadd213pd     1088(%rsp), %ymm8, %ymm7 # 32-byte Folded Reload
  1350          vfmadd231pd     1024(%rsp), %ymm4, %ymm7 # 32-byte Folded Reload
  1351          vmovupd 960(%rsp), %ymm9                # 32-byte Reload
  1352          vfmadd213pd     992(%rsp), %ymm8, %ymm9 # 32-byte Folded Reload
  1353          vmovupd 896(%rsp), %ymm14               # 32-byte Reload
  1354          vfmadd213pd     928(%rsp), %ymm8, %ymm14 # 32-byte Folded Reload
  1355          vfmadd231pd     %ymm9, %ymm4, %ymm14    # ymm14 = (ymm4 * ymm9) + ymm14
  1356          vfmadd231pd     %ymm7, %ymm3, %ymm14    # ymm14 = (ymm3 * ymm7) + ymm14
  1357          vmulpd  %ymm4, %ymm8, %ymm7
  1358          vmulpd  %ymm7, %ymm14, %ymm7
  1359          vaddpd  864(%rsp), %ymm4, %ymm9         # 32-byte Folded Reload
  1360          vfmadd231pd     832(%rsp), %ymm8, %ymm9 # 32-byte Folded Reload
  1361          vmovupd 768(%rsp), %ymm14               # 32-byte Reload
  1362          vfmadd213pd     800(%rsp), %ymm8, %ymm14 # 32-byte Folded Reload
  1363          vmovupd 704(%rsp), %ymm11               # 32-byte Reload
  1364          vfmadd213pd     736(%rsp), %ymm8, %ymm11 # 32-byte Folded Reload
  1365          vfmadd231pd     %ymm14, %ymm4, %ymm11   # ymm11 = (ymm4 * ymm14) + ymm11
  1366          vfmadd231pd     %ymm9, %ymm3, %ymm11    # ymm11 = (ymm3 * ymm9) + ymm11
  1367          vdivpd  %ymm11, %ymm7, %ymm7
  # Extract the unbiased exponent e of x as a double: |x| >> 52, OR into 2^52
  # (.LCPI10_21), then add the magic -2^52-with-bias constant (.LCPI10_22).
  1368          vpand   1184(%rsp), %ymm15, %ymm12      # 32-byte Folded Reload
  1369          vpsrlq  $52, %ymm12, %ymm9
  1370          vpor    672(%rsp), %ymm9, %ymm9         # 32-byte Folded Reload
  1371          vaddpd  640(%rsp), %ymm9, %ymm9         # 32-byte Folded Reload
  # e += 1 on the lanes flagged by the range-reduction compare (ymm6 & 1.0).
  1372          vandpd  %ymm6, %ymm13, %ymm6
  1373          vaddpd  %ymm6, %ymm9, %ymm9
  # Split c*e into a rounded hi part (ymm6) and an exact residual (ymm9).
  1374          vmulpd  %ymm0, %ymm9, %ymm6
  1375          vroundpd        $8, %ymm6, %ymm6
  1376          vfnmadd213pd    %ymm6, %ymm0, %ymm9     # ymm9 = -(ymm0 * ymm9) + ymm6
  # Fold the -0.5*t^2 correction into the log result, tracking the rounding
  # error in a second (lo) accumulator - hi/lo double-double arithmetic.
  1377          vmovupd 608(%rsp), %ymm1                # 32-byte Reload
  1378          vmovapd %ymm1, %ymm11
  1379          vfmadd213pd     %ymm8, %ymm4, %ymm11    # ymm11 = (ymm4 * ymm11) + ymm8
  1380          vaddpd  %ymm7, %ymm11, %ymm11
  1381          vmovupd 576(%rsp), %ymm3                # 32-byte Reload
  1382          vmulpd  %ymm3, %ymm8, %ymm14
  1383          vmulpd  %ymm1, %ymm4, %ymm10
  1384          vfmadd231pd     %ymm14, %ymm8, %ymm10   # ymm10 = (ymm8 * ymm14) + ymm10
  1385          vsubpd  %ymm8, %ymm11, %ymm8
  1386          vfmadd231pd     %ymm4, %ymm3, %ymm8     # ymm8 = (ymm3 * ymm4) + ymm8
  # exp argument reduction, pass 1: k = round(arg * c*log2(e)), then subtract
  # k*ln2 using the Cody-Waite hi/lo split (.LCPI10_25/_26) to keep the
  # residual exact; ln2 (.LCPI10_27) folds the earlier exponent residual in.
  1387          vmulpd  1280(%rsp), %ymm11, %ymm4       # 32-byte Folded Reload
  1388          vroundpd        $8, %ymm4, %ymm4
  1389          vmulpd  544(%rsp), %ymm4, %ymm14        # 32-byte Folded Reload
  1390          vfmadd231pd     %ymm11, %ymm0, %ymm14   # ymm14 = (ymm0 * ymm11) + ymm14
  1391          vfmsub231pd     512(%rsp), %ymm4, %ymm14 # 32-byte Folded Reload
  1392          vmovupd 480(%rsp), %ymm1                # 32-byte Reload
  1393          vfmadd231pd     %ymm9, %ymm1, %ymm14    # ymm14 = (ymm1 * ymm9) + ymm14
  1394          vsubpd  %ymm7, %ymm10, %ymm7
  1395          vaddpd  %ymm7, %ymm8, %ymm7
  1396          vfnmsub213pd    %ymm14, %ymm0, %ymm7    # ymm7 = -(ymm0 * ymm7) - ymm14
  # Pass 2: a second round(r * log2(e)) / subtract-k*ln2 step on the residual.
  1397          vmulpd  1312(%rsp), %ymm7, %ymm8        # 32-byte Folded Reload
  1398          vroundpd        $8, %ymm8, %ymm8
  1399          vfnmadd231pd    %ymm1, %ymm8, %ymm7     # ymm7 = -(ymm8 * ymm1) + ymm7
  # exp(r) Taylor series with reciprocal-factorial coefficients (1/3!..1/13!),
  # evaluated as interleaved sub-polynomials in r, r^2, r^4, r^8 (Estrin-like).
  1400          vmulpd  %ymm7, %ymm7, %ymm9
  1401          vmovupd 416(%rsp), %ymm10               # 32-byte Reload
  1402          vfmadd213pd     448(%rsp), %ymm7, %ymm10 # 32-byte Folded Reload
  1403          vmovupd 352(%rsp), %ymm11               # 32-byte Reload
  1404          vfmadd213pd     384(%rsp), %ymm7, %ymm11 # 32-byte Folded Reload
  1405          vmovupd 288(%rsp), %ymm14               # 32-byte Reload
  1406          vfmadd213pd     320(%rsp), %ymm7, %ymm14 # 32-byte Folded Reload
  1407          vfmadd231pd     %ymm11, %ymm9, %ymm14   # ymm14 = (ymm9 * ymm11) + ymm14
  1408          vmovupd 224(%rsp), %ymm11               # 32-byte Reload
  1409          vfmadd213pd     256(%rsp), %ymm7, %ymm11 # 32-byte Folded Reload
  1410          vmovupd 160(%rsp), %ymm5                # 32-byte Reload
  1411          vfmadd213pd     192(%rsp), %ymm7, %ymm5 # 32-byte Folded Reload
  1412          vfmadd231pd     %ymm11, %ymm9, %ymm5    # ymm5 = (ymm9 * ymm11) + ymm5
  1413          vmovupd 128(%rsp), %ymm11               # 32-byte Reload
  1414          vfmadd213pd     %ymm3, %ymm7, %ymm11    # ymm11 = (ymm7 * ymm11) + ymm3
  1415          vfmadd213pd     %ymm7, %ymm9, %ymm11    # ymm11 = (ymm9 * ymm11) + ymm7
  1416          vmulpd  %ymm9, %ymm9, %ymm7
  1417          vfmadd231pd     %ymm10, %ymm7, %ymm14   # ymm14 = (ymm7 * ymm10) + ymm14
  1418          vfmadd231pd     %ymm5, %ymm7, %ymm11    # ymm11 = (ymm7 * ymm5) + ymm11
  1419          vmulpd  %ymm7, %ymm7, %ymm5
  1420          vfmadd231pd     %ymm14, %ymm5, %ymm11   # ymm11 = (ymm5 * ymm14) + ymm11
  # p = 1 + series; accumulate all scaling exponents: k-total = ymm6+ymm4+ymm8.
  1421          vaddpd  %ymm13, %ymm11, %ymm5
  1422          vaddpd  %ymm6, %ymm4, %ymm4
  1423          vaddpd  %ymm4, %ymm8, %ymm14
  # Convert each lane of round(k) to int64; AVX2 has no packed double->i64
  # conversion, hence the four scalar vcvttsd2si + repack.
  1424          vroundpd        $8, %ymm14, %ymm4
  1425          vcvttsd2si      %xmm4, %rcx
  1426          vpermilpd       $1, %xmm4, %xmm6        # xmm6 = xmm4[1,0]
  1427          vcvttsd2si      %xmm6, %rdx
  1428          vextractf128    $1, %ymm4, %xmm4
  1429          vcvttsd2si      %xmm4, %rax
  1430          vmovq   %rax, %xmm6
  1431          vpermilpd       $1, %xmm4, %xmm4        # xmm4 = xmm4[1,0]
  1432          vcvttsd2si      %xmm4, %rax
  1433          vmovq   %rax, %xmm4
  1434          vpunpcklqdq     %xmm4, %xmm6, %xmm4     # xmm4 = xmm6[0],xmm4[0]
  1435          vmovq   %rcx, %xmm6
  1436          vmovq   %rdx, %xmm7
  1437          vpunpcklqdq     %xmm7, %xmm6, %xmm6     # xmm6 = xmm6[0],xmm7[0]
  1438          vinserti128     $1, %xmm4, %ymm6, %ymm6
  # Combined biased exponent of p + k, built with sign-preserving shifts.
  1439          vpsrad  $31, %ymm5, %ymm4
  1440          vpsrad  $20, %ymm5, %ymm7
  1441          vpsrlq  $32, %ymm7, %ymm7
  1442          vpblendd        $170, %ymm4, %ymm7, %ymm4       # ymm4 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4],ymm4[5],ymm7[6],ymm4[7]
  1443          vpaddq  %ymm4, %ymm6, %ymm7
  # Overflow mask ymm4: biased exponent > 2046 or arg > 3000 (+Inf result).
  # Underflow mask ymm7: exponent < 1 or arg < -3000 (flush to zero).
  # +/-3000 is a safe cutoff, well past where exp() over/underflows.
  1444          vpcmpgtq        96(%rsp), %ymm7, %ymm4          # 32-byte Folded Reload
  1445          vmovupd 64(%rsp), %ymm1                 # 32-byte Reload
  1446          vcmpltpd        %ymm14, %ymm1, %ymm8
  1447          vpor    %ymm4, %ymm8, %ymm4
  1448          vmovdqu 32(%rsp), %ymm1                 # 32-byte Reload
  1449          vpcmpgtq        %ymm7, %ymm1, %ymm7
  1450          vcmpltpd        (%rsp), %ymm14, %ymm8           # 32-byte Folded Reload
  1451          vpor    %ymm7, %ymm8, %ymm7
  # Scale: result = p * 2^k by adding k directly into the exponent field.
  1452          vpsllq  $52, %ymm6, %ymm6
  1453          vpaddq  %ymm5, %ymm6, %ymm6
  1454          vpor    %ymm4, %ymm7, %ymm5
  1455          vptest  %ymm5, %ymm5
  1456          je      .LBB10_4
  # Slow path: zero the underflowed lanes, force overflowed lanes to +Inf.
  1457          vpandn  %ymm6, %ymm7, %ymm5
  1458          vblendvpd       %ymm4, -128(%rsp), %ymm5, %ymm6 # 32-byte Folded Reload
  # Zero-exponent bases (x == +/-0 and, apparently, denormals - confirm):
  # substitute the precomputed 0^c result (1 if c == 0, +Inf if c < 0).
  1459  .LBB10_4:                               #   in Loop: Header=BB10_2 Depth=1
  1460          vxorpd  %xmm11, %xmm11, %xmm11
  1461          vpand   %ymm2, %ymm15, %ymm4
  1462          vpcmpeqq        %ymm11, %ymm4, %ymm8
  # ymm9 = per-lane sign mask of x; negative bases need the sign fix-up below.
  1463          vpsrad  $31, %ymm15, %ymm5
  1464          vpshufd $245, %ymm5, %ymm9              # ymm9 = ymm5[1,1,3,3,5,5,7,7]
  1465          vblendvpd       %ymm8, 1216(%rsp), %ymm6, %ymm6 # 32-byte Folded Reload
  1466          vptest  %ymm9, %ymm9
  1467          jne     .LBB10_6
  1468          vpxor   %xmm9, %xmm9, %xmm9
  1469          jmp     .LBB10_7
  # Negative-base path: if c is an integer (round(c) == c), the result sign is
  # (-1)^c - the parity bit of truncated c shifted into bit 63 (vpsllq $63).
  # Non-integer c with x < 0 yields NaN (.LCPI10_43); x == 0 lanes keep the
  # result chosen above.
  1470  .LBB10_6:                               #   in Loop: Header=BB10_2 Depth=1
  1471          vroundpd        $8, %ymm0, %ymm5
  1472          vcmpeqpd        %ymm0, %ymm5, %ymm7
  1473          vcvttsd2si      %xmm5, %rax
  1474          vpermilpd       $1, %xmm5, %xmm1        # xmm1 = xmm5[1,0]
  1475          vcvttsd2si      %xmm1, %rcx
  1476          vextractf128    $1, %ymm5, %xmm1
  1477          vcvttsd2si      %xmm1, %rdx
  1478          vmovq   %rdx, %xmm5
  1479          vpermilpd       $1, %xmm1, %xmm1        # xmm1 = xmm1[1,0]
  1480          vcvttsd2si      %xmm1, %rdx
  1481          vmovq   %rdx, %xmm1
  1482          vpunpcklqdq     %xmm1, %xmm5, %xmm1     # xmm1 = xmm5[0],xmm1[0]
  1483          vmovq   %rax, %xmm5
  1484          vmovq   %rcx, %xmm3
  1485          vpunpcklqdq     %xmm3, %xmm5, %xmm3     # xmm3 = xmm5[0],xmm3[0]
  1486          vinserti128     $1, %xmm1, %ymm3, %ymm1
  1487          vpsllq  $63, %ymm1, %ymm1
  1488          vpor    %ymm6, %ymm1, %ymm3
  1489          vcmpeqpd        %ymm11, %ymm15, %ymm5
  1490          vbroadcastsd    .LCPI10_43(%rip), %ymm10 # ymm10 = [NaN,NaN,NaN,NaN]
  1491          vblendvpd       %ymm5, %ymm6, %ymm10, %ymm5
  1492          vblendvpd       %ymm7, %ymm3, %ymm5, %ymm3
  1493          vblendvpd       %ymm9, %ymm3, %ymm6, %ymm6
  1494          vandpd  %ymm1, %ymm7, %ymm9
  # Merge point: handle the remaining IEEE edge lanes. Exponent-field compares
  # against ymm2 (0x7ff0...) detect |x| == Inf and an Inf intermediate; the
  # vptest fast-exits straight to the store when no lane needs fixing.
  1495  .LBB10_7:                               #   in Loop: Header=BB10_2 Depth=1
  1496          vandpd  %ymm2, %ymm14, %ymm1
  1497          vpcmpeqq        %ymm2, %ymm1, %ymm14
  1498          vpcmpeqd        %ymm5, %ymm5, %ymm5
  1499          vpxor   %ymm5, %ymm14, %ymm1
  1500          vpcmpeqq        %ymm2, %ymm4, %ymm4
  1501          vpcmpeqq        1248(%rsp), %ymm2, %ymm3        # 32-byte Folded Reload
  1502          vpxor   %ymm5, %ymm3, %ymm7
  1503          vpandn  %ymm7, %ymm4, %ymm3
  1504          vpor    %ymm1, %ymm8, %ymm1
  1505          vpand   %ymm3, %ymm1, %ymm1
  1506          vptest  %ymm5, %ymm1
  1507          jb      .LBB10_9
  # Inf-base / Inf-exponent lanes: pick 1, 0, or +/-Inf per the c<0 / |x|<1
  # classification; finally vcmpunordpd + vaddpd propagates NaN from either
  # input (NaN in x or c makes x + c a NaN, blended in last).
  1508          vpandn  %ymm7, %ymm14, %ymm1
  1509          vcmpeqpd        %ymm13, %ymm12, %ymm3
  1510          vcmpltpd        %ymm12, %ymm13, %ymm5
  1511          vxorpd  -96(%rsp), %ymm5, %ymm5         # 32-byte Folded Reload
  1512          vblendvpd       %ymm5, -128(%rsp), %ymm11, %ymm5 # 32-byte Folded Reload
  1513          vblendvpd       %ymm3, %ymm13, %ymm5, %ymm3
  1514          vblendvpd       %ymm1, %ymm6, %ymm3, %ymm1
  1515          vandpd  %ymm6, %ymm9, %ymm3
  1516          vandpd  %ymm15, %ymm9, %ymm5
  1517          vorpd   %ymm5, %ymm12, %ymm5
  1518          vmovupd -32(%rsp), %ymm6                # 32-byte Reload
  1519          vblendvpd       %ymm6, %ymm3, %ymm5, %ymm3
  1520          vmovupd -64(%rsp), %ymm5                # 32-byte Reload
  1521          vblendvpd       %ymm5, %ymm13, %ymm3, %ymm3
  1522          vblendvpd       %ymm4, %ymm3, %ymm1, %ymm1
  1523          vcmpunordpd     %ymm15, %ymm15, %ymm3
  1524          vcmpunordpd     %ymm0, %ymm0, %ymm4
  1525          vorpd   %ymm3, %ymm4, %ymm3
  1526          vaddpd  %ymm0, %ymm15, %ymm4
  1527          vblendvpd       %ymm3, %ymm4, %ymm1, %ymm6
  1528          jmp     .LBB10_9
  # Epilogue: release the spill frame; vzeroupper avoids AVX->SSE transition
  # penalties when returning to non-AVX code (SysV ABI convention).
  1529  .LBB10_10:
  1530          addq    $1352, %rsp                     # imm = 0x548
  1531          vzeroupper
  1532          retq
  # --- Constant pool for the single-precision pow kernel that follows this
  # --- chunk (function body not visible here). Float analogues of the
  # --- .LCPI10_* doubles above:
  # ---   _0 log2(e);  _1 exponent mask / +Inf;  _2 1.0f;  _3 abs mask
  # ---   _6 sqrt(0.5);  _8.._16 logf polynomial coefficients
  # ---   _18 0xcb00007f = -(2^23 + 127): exponent-extraction magic
  # ---   _21/_22 Cody-Waite hi/lo split of ln2;  _23 -ln2
  # ---   _24.._28 reciprocal factorials for the expf series
  # ---   _30 254 = largest finite biased exponent;  _31/_33 +/-300 exp clamp
  # ---   _34 quiet NaN;  _4/_5/_17 packed mantissa mask, 0.5f exponent
  # ---   bits, and 2^23;  _29 a single byte mask
  1533  .LCPI11_0:
  1534          .long   0x3fb8aa3b                      # float 1.44269502
  1535  .LCPI11_1:
  1536          .long   2139095040                      # 0x7f800000
  1537  .LCPI11_2:
  1538          .long   0x3f800000                      # float 1
  1539  .LCPI11_3:
  1540          .long   2147483647                      # 0x7fffffff
  1541  .LCPI11_6:
  1542          .long   0x3f3504f3                      # float 0.707106769
  1543  .LCPI11_7:
  1544          .long   0xbf800000                      # float -1
  1545  .LCPI11_8:
  1546          .long   0x3def251a                      # float 0.116769984
  1547  .LCPI11_9:
  1548          .long   0xbdebd1b8                      # float -0.115146101
  1549  .LCPI11_10:
  1550          .long   0x3e11e9bf                      # float 0.142493233
  1551  .LCPI11_11:
  1552          .long   0xbdfe5d4f                      # float -0.12420141
  1553  .LCPI11_12:
  1554          .long   0x3e4cceac                      # float 0.200007141
  1555  .LCPI11_13:
  1556          .long   0xbe2aae50                      # float -0.166680574
  1557  .LCPI11_14:
  1558          .long   0x3eaaaaaa                      # float 0.333333313
  1559  .LCPI11_15:
  1560          .long   0xbe7ffffc                      # float -0.24999994
  1561  .LCPI11_16:
  1562          .long   0x3d9021bb                      # float 0.0703768358
  1563  .LCPI11_18:
  1564          .long   0xcb00007f                      # float -8388735
  1565  .LCPI11_19:
  1566          .long   0xbf000000                      # float -0.5
  1567  .LCPI11_20:
  1568          .long   0x3f000000                      # float 0.5
  1569  .LCPI11_21:
  1570          .long   0xbf318000                      # float -0.693359375
  1571  .LCPI11_22:
  1572          .long   0xb95e8083                      # float -2.12194442E-4
  1573  .LCPI11_23:
  1574          .long   0xbf317218                      # float -0.693147182
  1575  .LCPI11_24:
  1576          .long   0x3d2aaaab                      # float 0.0416666679
  1577  .LCPI11_25:
  1578          .long   0x3c088889                      # float 0.00833333377
  1579  .LCPI11_26:
  1580          .long   0x3ab60b61                      # float 0.00138888892
  1581  .LCPI11_27:
  1582          .long   0x39500d01                      # float 1.98412701E-4
  1583  .LCPI11_28:
  1584          .long   0x3e2aaaab                      # float 0.166666672
  1585  .LCPI11_30:
  1586          .long   254                             # 0xfe
  1587  .LCPI11_31:
  1588          .long   0x43960000                      # float 300
  1589  .LCPI11_32:
  1590          .long   1                               # 0x1
  1591  .LCPI11_33:
  1592          .long   0xc3960000                      # float -300
  1593  .LCPI11_34:
  1594          .long   0x7fc00102                      # float NaN
  1595  .LCPI11_4:
  1596          .quad   36028792732385279               # 0x7fffff007fffff
  1597          .quad   36028792732385279               # 0x7fffff007fffff
  1598  .LCPI11_5:
  1599          .quad   4539628425446424576             # 0x3f0000003f000000
  1600          .quad   4539628425446424576             # 0x3f0000003f000000
  1601  .LCPI11_17:
  1602          .quad   5404319554102886400             # 0x4b0000004b000000
  1603  .LCPI11_29:
  1604          .byte   255                             # 0xff
  1605          .byte   0                               # 0x0
  1606          .byte   0                               # 0x0
  1607          .byte   0                               # 0x0
  1608          .byte   255                             # 0xff
  1609          .byte   0                               # 0x0
  1610          .byte   0                               # 0x0
  1611          .byte   0                               # 0x0
  1612          .byte   255                             # 0xff
  1613          .byte   0                               # 0x0
  1614          .byte   0                               # 0x0
  1615          .byte   0                               # 0x0
  1616          .byte   255                             # 0xff
  1617          .byte   0                               # 0x0
  1618          .byte   0                               # 0x0
  1619          .byte   0                               # 0x0
  1620          .byte   255                             # 0xff
  1621          .byte   0                               # 0x0
  1622          .byte   0                               # 0x0
  1623          .byte   0                               # 0x0
  1624          .byte   255                             # 0xff
  1625          .byte   0                               # 0x0
  1626          .byte   0                               # 0x0
  1627          .byte   0                               # 0x0
  1628          .byte   255                             # 0xff
  1629          .byte   0                               # 0x0
  1630          .byte   0                               # 0x0
  1631          .byte   0                               # 0x0
  1632          .byte   255                             # 0xff
  1633          .byte   0                               # 0x0
  1634          .byte   0                               # 0x0
  1635          .byte   0                               # 0x0
  1636  PowNumber_8x_F32_V(float*, float, unsigned long):             # @PowNumber_8x_F32_V(float*, float, unsigned long)
  1637          subq    $1000, %rsp                     # imm = 0x3E8
  1638          andq    $-8, %rsi
  1639          je      .LBB11_11
  1640          vbroadcastss    %xmm0, %ymm0
  1641          vbroadcastss    .LCPI11_0(%rip), %ymm14 # ymm14 = [1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0]
  1642          vmulps  %ymm0, %ymm14, %ymm1
  1643          vmovups %ymm1, 384(%rsp)                # 32-byte Spill
  1644          vbroadcastss    .LCPI11_1(%rip), %ymm3  # ymm3 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040]
  1645          vxorps  %xmm15, %xmm15, %xmm15
  1646          vcmpltps        %ymm15, %ymm0, %ymm2
  1647          vcmpeqps        %ymm0, %ymm15, %ymm4
  1648          vbroadcastss    .LCPI11_2(%rip), %ymm1  # ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
  1649          vmovups %ymm4, -64(%rsp)                # 32-byte Spill
  1650          vandps  %ymm1, %ymm4, %ymm1
  1651          vbroadcastss    .LCPI11_1(%rip), %ymm4  # ymm4 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040]
  1652          vmovups %ymm2, -32(%rsp)                # 32-byte Spill
  1653          vmovups %ymm4, -128(%rsp)               # 32-byte Spill
  1654          vblendvps       %ymm2, %ymm4, %ymm1, %ymm1
  1655          vmovups %ymm1, 960(%rsp)                # 32-byte Spill
  1656          vandps  %ymm3, %ymm0, %ymm1
  1657          vmovups %ymm1, 928(%rsp)                # 32-byte Spill
  1658          vpsrad  $31, %ymm0, %ymm1
  1659          vmovdqu %ymm1, -96(%rsp)                # 32-byte Spill
  1660          vbroadcastss    .LCPI11_3(%rip), %ymm1  # ymm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
  1661          vmovups %ymm1, 896(%rsp)                # 32-byte Spill
  1662          xorl    %eax, %eax
  1663          vbroadcastss    .LCPI11_6(%rip), %ymm1  # ymm1 = [7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1]
  1664          vmovups %ymm1, 864(%rsp)                # 32-byte Spill
  1665          vbroadcastss    .LCPI11_7(%rip), %ymm1  # ymm1 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
  1666          vmovups %ymm1, 832(%rsp)                # 32-byte Spill
  1667          vbroadcastss    .LCPI11_8(%rip), %ymm1  # ymm1 = [1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1]
  1668          vmovups %ymm1, 800(%rsp)                # 32-byte Spill
  1669          vbroadcastss    .LCPI11_9(%rip), %ymm1  # ymm1 = [-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1]
  1670          vmovups %ymm1, 768(%rsp)                # 32-byte Spill
  1671          vbroadcastss    .LCPI11_10(%rip), %ymm1 # ymm1 = [1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1]
  1672          vmovups %ymm1, 736(%rsp)                # 32-byte Spill
  1673          vbroadcastss    .LCPI11_11(%rip), %ymm1 # ymm1 = [-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1]
  1674          vmovups %ymm1, 704(%rsp)                # 32-byte Spill
  1675          vbroadcastss    .LCPI11_12(%rip), %ymm1 # ymm1 = [2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1]
  1676          vmovups %ymm1, 672(%rsp)                # 32-byte Spill
  1677          vbroadcastss    .LCPI11_13(%rip), %ymm1 # ymm1 = [-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1]
  1678          vmovups %ymm1, 640(%rsp)                # 32-byte Spill
  1679          vbroadcastss    .LCPI11_14(%rip), %ymm1 # ymm1 = [3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1]
  1680          vmovups %ymm1, 608(%rsp)                # 32-byte Spill
  1681          vbroadcastss    .LCPI11_15(%rip), %ymm1 # ymm1 = [-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1]
  1682          vmovups %ymm1, 576(%rsp)                # 32-byte Spill
  1683          vbroadcastss    .LCPI11_16(%rip), %ymm1 # ymm1 = [7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2]
  1684          vmovups %ymm1, 544(%rsp)                # 32-byte Spill
  1685          vbroadcastsd    .LCPI11_17(%rip), %ymm1 # ymm1 = [5404319554102886400,5404319554102886400,5404319554102886400,5404319554102886400]
  1686          vmovups %ymm1, 512(%rsp)                # 32-byte Spill
  1687          vbroadcastss    .LCPI11_18(%rip), %ymm1 # ymm1 = [-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6]
  1688          vmovups %ymm1, 480(%rsp)                # 32-byte Spill
  1689          vbroadcastss    .LCPI11_2(%rip), %ymm11 # ymm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
  1690          vbroadcastss    .LCPI11_19(%rip), %ymm1 # ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
  1691          vmovups %ymm1, 448(%rsp)                # 32-byte Spill
  1692          vbroadcastss    .LCPI11_20(%rip), %ymm1 # ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1]
  1693          vmovups %ymm1, 416(%rsp)                # 32-byte Spill
  1694          vbroadcastss    .LCPI11_21(%rip), %ymm1 # ymm1 = [-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1]
  1695          vmovups %ymm1, 352(%rsp)                # 32-byte Spill
  1696          vbroadcastss    .LCPI11_22(%rip), %ymm1 # ymm1 = [-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4]
  1697          vmovups %ymm1, 320(%rsp)                # 32-byte Spill
  1698          vbroadcastss    .LCPI11_23(%rip), %ymm1 # ymm1 = [-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1]
  1699          vmovups %ymm1, 288(%rsp)                # 32-byte Spill
  1700          vbroadcastss    .LCPI11_24(%rip), %ymm1 # ymm1 = [4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2]
  1701          vmovups %ymm1, 256(%rsp)                # 32-byte Spill
  1702          vbroadcastss    .LCPI11_25(%rip), %ymm1 # ymm1 = [8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3]
  1703          vmovups %ymm1, 224(%rsp)                # 32-byte Spill
  1704          vbroadcastss    .LCPI11_26(%rip), %ymm1 # ymm1 = [1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3]
  1705          vmovups %ymm1, 192(%rsp)                # 32-byte Spill
  1706          vbroadcastss    .LCPI11_27(%rip), %ymm1 # ymm1 = [1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4]
  1707          vmovups %ymm1, 160(%rsp)                # 32-byte Spill
  1708          vbroadcastss    .LCPI11_28(%rip), %ymm1 # ymm1 = [1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1]
  1709          vmovups %ymm1, 128(%rsp)                # 32-byte Spill
  1710          vbroadcastss    .LCPI11_30(%rip), %ymm1 # ymm1 = [254,254,254,254,254,254,254,254]
  1711          vmovups %ymm1, 96(%rsp)                 # 32-byte Spill
  1712          vbroadcastss    .LCPI11_31(%rip), %ymm1 # ymm1 = [3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2]
  1713          vmovups %ymm1, 64(%rsp)                 # 32-byte Spill
  1714          vbroadcastss    .LCPI11_32(%rip), %ymm1 # ymm1 = [1,1,1,1,1,1,1,1]
  1715          vmovups %ymm1, 32(%rsp)                 # 32-byte Spill
  1716          vbroadcastss    .LCPI11_33(%rip), %ymm1 # ymm1 = [-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2]
  1717          vmovups %ymm1, (%rsp)                   # 32-byte Spill
  1718          jmp     .LBB11_2
  1719  .LBB11_9:                               #   in Loop: Header=BB11_2 Depth=1
  1720          vpxor   %ymm6, %ymm12, %ymm5
  1721          vpandn  %ymm5, %ymm4, %ymm4
  1722          vcmpeqps        %ymm1, %ymm11, %ymm5
  1723          vcmpltps        %ymm1, %ymm11, %ymm6
  1724          vxorps  -96(%rsp), %ymm6, %ymm6         # 32-byte Folded Reload
  1725          vxorps  %xmm15, %xmm15, %xmm15
  1726          vblendvps       %ymm6, -128(%rsp), %ymm15, %ymm6 # 32-byte Folded Reload
  1727          vblendvps       %ymm5, %ymm11, %ymm6, %ymm5
  1728          vblendvps       %ymm4, %ymm2, %ymm5, %ymm4
  1729          vandps  %ymm2, %ymm9, %ymm2
  1730          vandps  %ymm7, %ymm9, %ymm5
  1731          vorps   %ymm1, %ymm5, %ymm1
  1732          vmovups -32(%rsp), %ymm5                # 32-byte Reload
  1733          vblendvps       %ymm5, %ymm2, %ymm1, %ymm1
  1734          vmovups -64(%rsp), %ymm2                # 32-byte Reload
  1735          vblendvps       %ymm2, %ymm11, %ymm1, %ymm1
  1736          vblendvps       %ymm8, %ymm4, %ymm1, %ymm1
  1737          vcmpunordps     %ymm7, %ymm7, %ymm2
  1738          vcmpunordps     %ymm0, %ymm0, %ymm4
  1739          vorps   %ymm2, %ymm4, %ymm2
  1740          vaddps  %ymm0, %ymm7, %ymm4
  1741          vblendvps       %ymm2, %ymm4, %ymm1, %ymm2
  1742          vmovups %ymm2, (%rdi,%rax,4)
  1743          addq    $8, %rax
  1744          cmpq    %rsi, %rax
  1745          jae     .LBB11_11
  1746  .LBB11_2:                               # =>This Inner Loop Header: Depth=1
  1747          vmovdqu (%rdi,%rax,4), %ymm7
  1748          vmovaps .LCPI11_4(%rip), %xmm4          # xmm4 = [36028792732385279,36028792732385279]
  1749          vandps  (%rdi,%rax,4), %xmm4, %xmm2
  1750          vpand   896(%rsp), %ymm7, %ymm1         # 32-byte Folded Reload
  1751          vmovaps .LCPI11_5(%rip), %xmm5          # xmm5 = [4539628425446424576,4539628425446424576]
  1752          vorps   %xmm5, %xmm2, %xmm2
  1753          vandps  16(%rdi,%rax,4), %xmm4, %xmm4
  1754          vorps   %xmm5, %xmm4, %xmm4
  1755          vinsertf128     $1, %xmm4, %ymm2, %ymm2
  1756          vmovups 864(%rsp), %ymm4                # 32-byte Reload
  1757          vcmpltps        %ymm2, %ymm4, %ymm6
  1758          vandnps %ymm2, %ymm6, %ymm4
  1759          vaddps  832(%rsp), %ymm2, %ymm2         # 32-byte Folded Reload
  1760          vaddps  %ymm4, %ymm2, %ymm5
  1761          vmulps  %ymm5, %ymm5, %ymm4
  1762          vmulps  %ymm4, %ymm4, %ymm2
  1763          vmovups 768(%rsp), %ymm8                # 32-byte Reload
  1764          vfmadd213ps     800(%rsp), %ymm5, %ymm8 # 32-byte Folded Reload
  1765          vmovups 704(%rsp), %ymm9                # 32-byte Reload
  1766          vfmadd213ps     736(%rsp), %ymm5, %ymm9 # 32-byte Folded Reload
  1767          vfmadd231ps     %ymm8, %ymm4, %ymm9     # ymm9 = (ymm4 * ymm8) + ymm9
  1768          vmovups 640(%rsp), %ymm8                # 32-byte Reload
  1769          vfmadd213ps     672(%rsp), %ymm5, %ymm8 # 32-byte Folded Reload
  1770          vmovups 576(%rsp), %ymm10               # 32-byte Reload
  1771          vfmadd213ps     608(%rsp), %ymm5, %ymm10 # 32-byte Folded Reload
  1772          vmulps  %ymm2, %ymm2, %ymm13
  1773          vfmadd132ps     544(%rsp), %ymm10, %ymm13 # 32-byte Folded Reload
  1774          vfmadd231ps     %ymm8, %ymm4, %ymm13    # ymm13 = (ymm4 * ymm8) + ymm13
  1775          vfmadd231ps     %ymm9, %ymm2, %ymm13    # ymm13 = (ymm2 * ymm9) + ymm13
  1776          vmulps  %ymm5, %ymm4, %ymm2
  1777          vmulps  %ymm2, %ymm13, %ymm8
  1778          vpsrld  $23, %ymm1, %ymm2
  1779          vpor    512(%rsp), %ymm2, %ymm2         # 32-byte Folded Reload
  1780          vaddps  480(%rsp), %ymm2, %ymm2         # 32-byte Folded Reload
  1781          vandps  %ymm6, %ymm11, %ymm6
  1782          vaddps  %ymm6, %ymm2, %ymm6
  1783          vmulps  %ymm0, %ymm6, %ymm2
  1784          vroundps        $8, %ymm2, %ymm2
  1785          vfnmadd213ps    %ymm2, %ymm0, %ymm6     # ymm6 = -(ymm0 * ymm6) + ymm2
  1786          vmovups 448(%rsp), %ymm13               # 32-byte Reload
  1787          vmovaps %ymm13, %ymm9
  1788          vfmadd213ps     %ymm5, %ymm4, %ymm9     # ymm9 = (ymm4 * ymm9) + ymm5
  1789          vaddps  %ymm9, %ymm8, %ymm9
  1790          vmovups 416(%rsp), %ymm12               # 32-byte Reload
  1791          vmulps  %ymm5, %ymm12, %ymm10
  1792          vmulps  %ymm4, %ymm13, %ymm13
  1793          vfmadd231ps     %ymm10, %ymm5, %ymm13   # ymm13 = (ymm5 * ymm10) + ymm13
  1794          vsubps  %ymm5, %ymm9, %ymm5
  1795          vfmadd231ps     %ymm4, %ymm12, %ymm5    # ymm5 = (ymm12 * ymm4) + ymm5
  1796          vmulps  384(%rsp), %ymm9, %ymm4         # 32-byte Folded Reload
  1797          vroundps        $8, %ymm4, %ymm4
  1798          vmulps  352(%rsp), %ymm4, %ymm10        # 32-byte Folded Reload
  1799          vfmadd231ps     %ymm9, %ymm0, %ymm10    # ymm10 = (ymm0 * ymm9) + ymm10
  1800          vfnmadd231ps    320(%rsp), %ymm4, %ymm10 # 32-byte Folded Reload
  1801          vsubps  %ymm8, %ymm13, %ymm8
  1802          vaddps  %ymm5, %ymm8, %ymm5
  1803          vmovups 288(%rsp), %ymm8                # 32-byte Reload
  1804          vmulps  %ymm6, %ymm8, %ymm6
  1805          vfnmadd231ps    %ymm5, %ymm0, %ymm6     # ymm6 = -(ymm0 * ymm5) + ymm6
  1806          vaddps  %ymm6, %ymm10, %ymm5
  1807          vmulps  %ymm5, %ymm14, %ymm6
  1808          vroundps        $8, %ymm6, %ymm6
  1809          vfmadd231ps     %ymm8, %ymm6, %ymm5     # ymm5 = (ymm6 * ymm8) + ymm5
  1810          vmulps  %ymm5, %ymm5, %ymm8
  1811          vmulps  %ymm8, %ymm8, %ymm9
  1812          vmovups 160(%rsp), %ymm10               # 32-byte Reload
  1813          vfmadd213ps     192(%rsp), %ymm5, %ymm10 # 32-byte Folded Reload
  1814          vmovups 128(%rsp), %ymm13               # 32-byte Reload
  1815          vfmadd213ps     %ymm12, %ymm5, %ymm13   # ymm13 = (ymm5 * ymm13) + ymm12
  1816          vfmadd231ps     %ymm10, %ymm9, %ymm13   # ymm13 = (ymm9 * ymm10) + ymm13
  1817          vmovups 224(%rsp), %ymm9                # 32-byte Reload
  1818          vfmadd213ps     256(%rsp), %ymm5, %ymm9 # 32-byte Folded Reload
  1819          vfmadd231ps     %ymm9, %ymm8, %ymm13    # ymm13 = (ymm8 * ymm9) + ymm13
  1820          vaddps  %ymm5, %ymm11, %ymm9
  1821          vfmadd231ps     %ymm13, %ymm8, %ymm9    # ymm9 = (ymm8 * ymm13) + ymm9
  1822          vaddps  %ymm2, %ymm4, %ymm2
  1823          vaddps  %ymm6, %ymm2, %ymm4
  1824          vcvtps2dq       %ymm4, %ymm2
  1825          vpsrld  $23, %ymm9, %ymm5
  1826          vpand   .LCPI11_29(%rip), %ymm5, %ymm5
  1827          vpaddd  %ymm2, %ymm5, %ymm6
  1828          vpcmpgtd        96(%rsp), %ymm6, %ymm5          # 32-byte Folded Reload
  1829          vmovups 64(%rsp), %ymm8                 # 32-byte Reload
  1830          vcmpltps        %ymm4, %ymm8, %ymm8
  1831          vpor    %ymm5, %ymm8, %ymm5
  1832          vmovdqu 32(%rsp), %ymm8                 # 32-byte Reload
  1833          vpcmpgtd        %ymm6, %ymm8, %ymm6
  1834          vcmpltps        (%rsp), %ymm4, %ymm8            # 32-byte Folded Reload
  1835          vpor    %ymm6, %ymm8, %ymm6
  1836          vpslld  $23, %ymm2, %ymm2
  1837          vpaddd  %ymm2, %ymm9, %ymm2
  1838          vpor    %ymm5, %ymm6, %ymm8
  1839          vtestps %ymm8, %ymm8
  1840          je      .LBB11_4
  1841          vpandn  %ymm2, %ymm6, %ymm2
  1842          vblendvps       %ymm5, -128(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
  1843  .LBB11_4:                               #   in Loop: Header=BB11_2 Depth=1
  1844          vpand   %ymm3, %ymm7, %ymm8
  1845          vpcmpeqd        %ymm15, %ymm8, %ymm5
  1846          vblendvps       %ymm5, 960(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
  1847          vmovmskps       %ymm7, %ecx
  1848          testl   %ecx, %ecx
  1849          jne     .LBB11_6
  1850          vxorps  %xmm9, %xmm9, %xmm9
  1851          jmp     .LBB11_7
  1852  .LBB11_6:                               #   in Loop: Header=BB11_2 Depth=1
  1853          vroundps        $8, %ymm0, %ymm6
  1854          vcmpeqps        %ymm0, %ymm6, %ymm6
  1855          vcvtps2dq       %ymm0, %ymm9
  1856          vpslld  $31, %ymm9, %ymm9
  1857          vpor    %ymm2, %ymm9, %ymm10
  1858          vcmpeqps        %ymm7, %ymm15, %ymm13
  1859          vmovaps %ymm14, %ymm12
  1860          vbroadcastss    .LCPI11_34(%rip), %ymm14 # ymm14 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
  1861          vblendvps       %ymm13, %ymm2, %ymm14, %ymm13
  1862          vmovaps %ymm12, %ymm14
  1863          vblendvps       %ymm6, %ymm10, %ymm13, %ymm10
  1864          vpsrad  $31, %ymm7, %ymm13
  1865          vblendvps       %ymm13, %ymm10, %ymm2, %ymm2
  1866          vandps  %ymm6, %ymm9, %ymm9
  1867  .LBB11_7:                               #   in Loop: Header=BB11_2 Depth=1
  1868          vpcmpeqd        %ymm12, %ymm12, %ymm12
  1869          vpcmpeqd        %ymm3, %ymm8, %ymm6
  1870          vpxor   %ymm6, %ymm12, %ymm8
  1871          vandps  %ymm3, %ymm4, %ymm4
  1872          vpcmpeqd        %ymm3, %ymm4, %ymm4
  1873          vpxor   %ymm4, %ymm12, %ymm10
  1874          vpcmpeqd        928(%rsp), %ymm3, %ymm6         # 32-byte Folded Reload
  1875          vpandn  %ymm8, %ymm6, %ymm13
  1876          vpor    %ymm5, %ymm10, %ymm5
  1877          vpand   %ymm5, %ymm13, %ymm5
  1878          vtestps %ymm12, %ymm5
  1879          jae     .LBB11_9
  1880          vxorps  %xmm15, %xmm15, %xmm15
  1881          vmovups %ymm2, (%rdi,%rax,4)
  1882          addq    $8, %rax
  1883          cmpq    %rsi, %rax
  1884          jb      .LBB11_2
  1885  .LBB11_11:
  1886          addq    $1000, %rsp                     # imm = 0x3E8
  1887          vzeroupper
  1888          retq
  1889  .LCPI12_0:
  1890          .long   0x00800000                      # float 1.17549435E-38
  1891  .LCPI12_1:
  1892          .long   2155872255                      # 0x807fffff
  1893  .LCPI12_2:
  1894          .long   1056964608                      # 0x3f000000
  1895  .LCPI12_3:
  1896          .long   4294967169                      # 0xffffff81
  1897  .LCPI12_4:
  1898          .long   0x3f800000                      # float 1
  1899  .LCPI12_5:
  1900          .long   0x3f3504f3                      # float 0.707106769
  1901  .LCPI12_6:
  1902          .long   0xbf800000                      # float -1
  1903  .LCPI12_7:
  1904          .long   0x3d9021bb                      # float 0.0703768358
  1905  .LCPI12_8:
  1906          .long   0xbdebd1b8                      # float -0.115146101
  1907  .LCPI12_9:
  1908          .long   0x3def251a                      # float 0.116769984
  1909  .LCPI12_10:
  1910          .long   0xbdfe5d4f                      # float -0.12420141
  1911  .LCPI12_11:
  1912          .long   0x3e11e9bf                      # float 0.142493233
  1913  .LCPI12_12:
  1914          .long   0xbe2aae50                      # float -0.166680574
  1915  .LCPI12_13:
  1916          .long   0x3e4cceac                      # float 0.200007141
  1917  .LCPI12_14:
  1918          .long   0xbe7ffffc                      # float -0.24999994
  1919  .LCPI12_15:
  1920          .long   0x3eaaaaaa                      # float 0.333333313
  1921  .LCPI12_16:
  1922          .long   0x3f317218                      # float 0.693147182
  1923  .LCPI12_17:
  1924          .long   0xbf000000                      # float -0.5
  1925  .LCPI12_18:
  1926          .long   0x3ede5bd9                      # float 0.434294492
  1927  .LCPI12_19:
  1928          .zero   32
  1929  Log10_Len8x_F32_V(float*, unsigned long):               # @Log10_Len8x_F32_V(float*, unsigned long)
  1930          subq    $136, %rsp
  1931          testq   %rsi, %rsi
  1932          je      .LBB12_3
  1933          xorl    %eax, %eax
  1934          vbroadcastss    .LCPI12_1(%rip), %ymm0  # ymm0 = [2155872255,2155872255,2155872255,2155872255,2155872255,2155872255,2155872255,2155872255]
  1935          vmovups %ymm0, 96(%rsp)                 # 32-byte Spill
  1936          vbroadcastss    .LCPI12_2(%rip), %ymm0  # ymm0 = [1056964608,1056964608,1056964608,1056964608,1056964608,1056964608,1056964608,1056964608]
  1937          vmovups %ymm0, 64(%rsp)                 # 32-byte Spill
  1938          vbroadcastss    .LCPI12_3(%rip), %ymm0  # ymm0 = [4294967169,4294967169,4294967169,4294967169,4294967169,4294967169,4294967169,4294967169]
  1939          vmovups %ymm0, 32(%rsp)                 # 32-byte Spill
  1940          vbroadcastss    .LCPI12_0(%rip), %ymm0  # ymm0 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
  1941          vmovups %ymm0, (%rsp)                   # 32-byte Spill
  1942          vbroadcastss    .LCPI12_4(%rip), %ymm0  # ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
  1943          vmovups %ymm0, -32(%rsp)                # 32-byte Spill
  1944          vbroadcastss    .LCPI12_5(%rip), %ymm0  # ymm0 = [7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1]
  1945          vmovups %ymm0, -64(%rsp)                # 32-byte Spill
  1946          vbroadcastss    .LCPI12_6(%rip), %ymm0  # ymm0 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
  1947          vmovups %ymm0, -96(%rsp)                # 32-byte Spill
  1948          vbroadcastss    .LCPI12_7(%rip), %ymm0  # ymm0 = [7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2]
  1949          vmovups %ymm0, -128(%rsp)               # 32-byte Spill
  1950          vbroadcastss    .LCPI12_8(%rip), %ymm9  # ymm9 = [-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1]
  1951          vbroadcastss    .LCPI12_9(%rip), %ymm10 # ymm10 = [1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1]
  1952          vbroadcastss    .LCPI12_10(%rip), %ymm11 # ymm11 = [-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1]
  1953          vbroadcastss    .LCPI12_11(%rip), %ymm12 # ymm12 = [1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1]
  1954          vbroadcastss    .LCPI12_12(%rip), %ymm13 # ymm13 = [-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1]
  1955          vbroadcastss    .LCPI12_13(%rip), %ymm14 # ymm14 = [2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1]
  1956          vbroadcastss    .LCPI12_14(%rip), %ymm15 # ymm15 = [-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1]
  1957          vbroadcastss    .LCPI12_15(%rip), %ymm0 # ymm0 = [3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1]
  1958          vbroadcastss    .LCPI12_16(%rip), %ymm1 # ymm1 = [6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1]
  1959          vbroadcastss    .LCPI12_17(%rip), %ymm2 # ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
  1960          vbroadcastss    .LCPI12_18(%rip), %ymm3 # ymm3 = [4.34294492E-1,4.34294492E-1,4.34294492E-1,4.34294492E-1,4.34294492E-1,4.34294492E-1,4.34294492E-1,4.34294492E-1]
  1961  .LBB12_2:                               # =>This Inner Loop Header: Depth=1
  1962          vmovups (%rdi,%rax,4), %ymm4
  1963          vmaxps  (%rsp), %ymm4, %ymm5            # 32-byte Folded Reload
  1964          vpsrld  $23, %ymm5, %ymm6
  1965          vpaddd  32(%rsp), %ymm6, %ymm6          # 32-byte Folded Reload
  1966          vandps  96(%rsp), %ymm5, %ymm5          # 32-byte Folded Reload
  1967          vorps   64(%rsp), %ymm5, %ymm5          # 32-byte Folded Reload
  1968          vcvtdq2ps       %ymm6, %ymm6
  1969          vaddps  -32(%rsp), %ymm6, %ymm7         # 32-byte Folded Reload
  1970          vcmpltps        -64(%rsp), %ymm5, %ymm8         # 32-byte Folded Reload
  1971          vblendvps       %ymm8, %ymm6, %ymm7, %ymm6
  1972          vandps  %ymm5, %ymm8, %ymm7
  1973          vaddps  -96(%rsp), %ymm5, %ymm5         # 32-byte Folded Reload
  1974          vaddps  %ymm7, %ymm5, %ymm5
  1975          vmovups -128(%rsp), %ymm7               # 32-byte Reload
  1976          vfmadd213ps     %ymm9, %ymm5, %ymm7     # ymm7 = (ymm5 * ymm7) + ymm9
  1977          vfmadd213ps     %ymm10, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm10
  1978          vfmadd213ps     %ymm11, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm11
  1979          vfmadd213ps     %ymm12, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm12
  1980          vfmadd213ps     %ymm13, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm13
  1981          vfmadd213ps     %ymm14, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm14
  1982          vfmadd213ps     %ymm15, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm15
  1983          vfmadd213ps     %ymm0, %ymm5, %ymm7     # ymm7 = (ymm5 * ymm7) + ymm0
  1984          vfmadd213ps     %ymm2, %ymm5, %ymm7     # ymm7 = (ymm5 * ymm7) + ymm2
  1985          vfmadd213ps     %ymm5, %ymm1, %ymm6     # ymm6 = (ymm1 * ymm6) + ymm5
  1986          vmulps  %ymm5, %ymm5, %ymm5
  1987          vfmadd231ps     %ymm7, %ymm5, %ymm6     # ymm6 = (ymm5 * ymm7) + ymm6
  1988          vcmpleps        .LCPI12_19(%rip), %ymm4, %ymm4
  1989          vmulps  %ymm3, %ymm6, %ymm5
  1990          vorps   %ymm5, %ymm4, %ymm4
  1991          vmovups %ymm4, (%rdi,%rax,4)
  1992          addq    $8, %rax
  1993          cmpq    %rsi, %rax
  1994          jb      .LBB12_2
  1995  .LBB12_3:
  1996          addq    $136, %rsp
  1997          vzeroupper
  1998          retq
  1999  .LCPI13_0:
        # Constant pool for Log2_Len8x_F32_V (below).  One 32-bit word each,
        # broadcast to all 8 ymm lanes at function entry.
        # FLT_MIN (smallest normal float): lower clamp for the log argument.
  2000          .long   0x00800000                      # float 1.17549435E-38
  2001  .LCPI13_1:
        # 0x807fffff: keeps sign + mantissa, clears the exponent field.
  2002          .long   2155872255                      # 0x807fffff
  2003  .LCPI13_2:
        # 0.5f bit pattern: OR'd into the mantissa to place it in [0.5, 1).
  2004          .long   1056964608                      # 0x3f000000
  2005  .LCPI13_3:
        # -127: removes the IEEE-754 exponent bias after the >> 23.
  2006          .long   4294967169                      # 0xffffff81
  2007  .LCPI13_4:
  2008          .long   0x3f800000                      # float 1
  2009  .LCPI13_5:
        # sqrt(0.5): threshold for the mantissa renormalisation step.
  2010          .long   0x3f3504f3                      # float 0.707106769
  2011  .LCPI13_6:
  2012          .long   0xbf800000                      # float -1
  2013  .LCPI13_7:
        # _7 .. _15: log(1+m) polynomial coefficients, highest degree first
        # (values match the Cephes logf kernel).
  2014          .long   0x3d9021bb                      # float 0.0703768358
  2015  .LCPI13_8:
  2016          .long   0xbdebd1b8                      # float -0.115146101
  2017  .LCPI13_9:
  2018          .long   0x3def251a                      # float 0.116769984
  2019  .LCPI13_10:
  2020          .long   0xbdfe5d4f                      # float -0.12420141
  2021  .LCPI13_11:
  2022          .long   0x3e11e9bf                      # float 0.142493233
  2023  .LCPI13_12:
  2024          .long   0xbe2aae50                      # float -0.166680574
  2025  .LCPI13_13:
  2026          .long   0x3e4cceac                      # float 0.200007141
  2027  .LCPI13_14:
  2028          .long   0xbe7ffffc                      # float -0.24999994
  2029  .LCPI13_15:
  2030          .long   0x3eaaaaaa                      # float 0.333333313
  2031  .LCPI13_16:
        # ln(2): weights the exponent contribution e*ln(2).
  2032          .long   0x3f317218                      # float 0.693147182
  2033  .LCPI13_17:
  2034          .long   0xbf000000                      # float -0.5
  2035  .LCPI13_18:
        # log2(e): converts the natural log to base 2 at the very end.
  2036          .long   0x3fb8aa3b                      # float 1.44269502
  2037  .LCPI13_19:
        # 32 bytes of +0.0f: right-hand side of the per-lane x <= 0 test.
  2038          .zero   32
  2039  Log2_Len8x_F32_V(float*, unsigned long):                # @Log2_Len8x_F32_V(float*, unsigned long)
        #-----------------------------------------------------------------------
        # void Log2_Len8x_F32_V(float *x, size_t n)
        # ABI:  SysV AMD64 (rdi = x, rsi = n); clobbers rax, ymm0-ymm15, flags.
        # In-place base-2 logarithm of n floats.  "Len8x": the loop consumes 8
        # lanes per iteration with no scalar tail, so n is expected to be a
        # multiple of 8 (otherwise the last iteration reads/writes past n).
        # Method (coefficients match the Cephes logf kernel): clamp x up to
        # FLT_MIN, split x = m * 2^e via the exponent field, renormalise m into
        # [sqrt(0.5), sqrt(2)), evaluate a polynomial for log(m), add e*ln(2),
        # then scale by log2(e).  Lanes with x <= 0 are overwritten with an
        # all-ones pattern (a NaN bit pattern) by the vcmpleps/vorps at the end.
        # NOTE(review): spills at negative rsp offsets use the 128-byte SysV
        # red zone; valid only because this is a leaf function (no calls).
        #-----------------------------------------------------------------------
  2040          subq    $136, %rsp
  2041          testq   %rsi, %rsi
  2042          je      .LBB13_3
  2043          xorl    %eax, %eax
        # Hoist all loop constants: each .LCPI13_* word is broadcast to 8 lanes.
        # There are more constants than free ymm regs, so eight live in stack
        # slots (96(%rsp) down to -128(%rsp)) and are folded into ops below.
  2044          vbroadcastss    .LCPI13_1(%rip), %ymm0  # ymm0 = [2155872255,2155872255,2155872255,2155872255,2155872255,2155872255,2155872255,2155872255]
  2045          vmovups %ymm0, 96(%rsp)                 # 32-byte Spill
  2046          vbroadcastss    .LCPI13_2(%rip), %ymm0  # ymm0 = [1056964608,1056964608,1056964608,1056964608,1056964608,1056964608,1056964608,1056964608]
  2047          vmovups %ymm0, 64(%rsp)                 # 32-byte Spill
  2048          vbroadcastss    .LCPI13_3(%rip), %ymm0  # ymm0 = [4294967169,4294967169,4294967169,4294967169,4294967169,4294967169,4294967169,4294967169]
  2049          vmovups %ymm0, 32(%rsp)                 # 32-byte Spill
  2050          vbroadcastss    .LCPI13_0(%rip), %ymm0  # ymm0 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
  2051          vmovups %ymm0, (%rsp)                   # 32-byte Spill
  2052          vbroadcastss    .LCPI13_4(%rip), %ymm0  # ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
  2053          vmovups %ymm0, -32(%rsp)                # 32-byte Spill
  2054          vbroadcastss    .LCPI13_5(%rip), %ymm0  # ymm0 = [7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1]
  2055          vmovups %ymm0, -64(%rsp)                # 32-byte Spill
  2056          vbroadcastss    .LCPI13_6(%rip), %ymm0  # ymm0 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
  2057          vmovups %ymm0, -96(%rsp)                # 32-byte Spill
  2058          vbroadcastss    .LCPI13_7(%rip), %ymm0  # ymm0 = [7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2]
  2059          vmovups %ymm0, -128(%rsp)               # 32-byte Spill
  2060          vbroadcastss    .LCPI13_8(%rip), %ymm9  # ymm9 = [-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1]
  2061          vbroadcastss    .LCPI13_9(%rip), %ymm10 # ymm10 = [1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1]
  2062          vbroadcastss    .LCPI13_10(%rip), %ymm11 # ymm11 = [-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1]
  2063          vbroadcastss    .LCPI13_11(%rip), %ymm12 # ymm12 = [1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1]
  2064          vbroadcastss    .LCPI13_12(%rip), %ymm13 # ymm13 = [-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1]
  2065          vbroadcastss    .LCPI13_13(%rip), %ymm14 # ymm14 = [2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1]
  2066          vbroadcastss    .LCPI13_14(%rip), %ymm15 # ymm15 = [-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1]
  2067          vbroadcastss    .LCPI13_15(%rip), %ymm0 # ymm0 = [3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1]
  2068          vbroadcastss    .LCPI13_16(%rip), %ymm1 # ymm1 = [6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1]
  2069          vbroadcastss    .LCPI13_17(%rip), %ymm2 # ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
  2070          vbroadcastss    .LCPI13_18(%rip), %ymm3 # ymm3 = [1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0]
  2071  .LBB13_2:                               # =>This Inner Loop Header: Depth=1
        # ymm4 = x[i..i+7], kept untouched for the x <= 0 test at the bottom.
  2072          vmovups (%rdi,%rax,4), %ymm4
        # Clamp to FLT_MIN, then decompose: e = (bits >> 23) - 127,
        # m = (mantissa bits) | 0.5f  ->  m in [0.5, 1).
  2073          vmaxps  (%rsp), %ymm4, %ymm5            # 32-byte Folded Reload
  2074          vpsrld  $23, %ymm5, %ymm6
  2075          vpaddd  32(%rsp), %ymm6, %ymm6          # 32-byte Folded Reload
  2076          vandps  96(%rsp), %ymm5, %ymm5          # 32-byte Folded Reload
  2077          vorps   64(%rsp), %ymm5, %ymm5          # 32-byte Folded Reload
  2078          vcvtdq2ps       %ymm6, %ymm6
        # Renormalise: if m < sqrt(0.5) keep e and use m = 2m - 1,
        # else use e + 1 and m = m - 1 (blend selects per lane).
  2079          vaddps  -32(%rsp), %ymm6, %ymm7         # 32-byte Folded Reload
  2080          vcmpltps        -64(%rsp), %ymm5, %ymm8         # 32-byte Folded Reload
  2081          vblendvps       %ymm8, %ymm6, %ymm7, %ymm6
  2082          vandps  %ymm5, %ymm8, %ymm7
  2083          vaddps  -96(%rsp), %ymm5, %ymm5         # 32-byte Folded Reload
  2084          vaddps  %ymm7, %ymm5, %ymm5
        # Horner evaluation of the degree-8 polynomial P(m) (ends with -0.5).
  2085          vmovups -128(%rsp), %ymm7               # 32-byte Reload
  2086          vfmadd213ps     %ymm9, %ymm5, %ymm7     # ymm7 = (ymm5 * ymm7) + ymm9
  2087          vfmadd213ps     %ymm10, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm10
  2088          vfmadd213ps     %ymm11, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm11
  2089          vfmadd213ps     %ymm12, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm12
  2090          vfmadd213ps     %ymm13, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm13
  2091          vfmadd213ps     %ymm14, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm14
  2092          vfmadd213ps     %ymm15, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm15
  2093          vfmadd213ps     %ymm0, %ymm5, %ymm7     # ymm7 = (ymm5 * ymm7) + ymm0
  2094          vfmadd213ps     %ymm2, %ymm5, %ymm7     # ymm7 = (ymm5 * ymm7) + ymm2
        # ln(x) ~= e*ln(2) + m + m^2 * P(m)
  2095          vfmadd213ps     %ymm5, %ymm1, %ymm6     # ymm6 = (ymm1 * ymm6) + ymm5
  2096          vmulps  %ymm5, %ymm5, %ymm5
  2097          vfmadd231ps     %ymm7, %ymm5, %ymm6     # ymm6 = (ymm5 * ymm7) + ymm6
        # log2(x) = ln(x) * log2(e); force all-ones (NaN pattern) where x <= 0.
  2098          vcmpleps        .LCPI13_19(%rip), %ymm4, %ymm4
  2099          vmulps  %ymm3, %ymm6, %ymm5
  2100          vorps   %ymm5, %ymm4, %ymm4
  2101          vmovups %ymm4, (%rdi,%rax,4)
  2102          addq    $8, %rax
  2103          cmpq    %rsi, %rax
  2104          jb      .LBB13_2
  2105  .LBB13_3:
  2106          addq    $136, %rsp
  2107          vzeroupper
  2108          retq
  2109  .LCPI14_0:
        # Constant pool for Log_Len8x_F32_V (below): identical to the
        # .LCPI13_* pool except there is no log2(e) entry (natural log needs
        # no base conversion).
        # FLT_MIN: lower clamp for the log argument.
  2110          .long   0x00800000                      # float 1.17549435E-38
  2111  .LCPI14_1:
        # 0x807fffff: keeps sign + mantissa, clears the exponent field.
  2112          .long   2155872255                      # 0x807fffff
  2113  .LCPI14_2:
        # 0.5f bit pattern: OR'd into the mantissa to place it in [0.5, 1).
  2114          .long   1056964608                      # 0x3f000000
  2115  .LCPI14_3:
        # -127: removes the IEEE-754 exponent bias after the >> 23.
  2116          .long   4294967169                      # 0xffffff81
  2117  .LCPI14_4:
  2118          .long   0x3f800000                      # float 1
  2119  .LCPI14_5:
        # sqrt(0.5): threshold for the mantissa renormalisation step.
  2120          .long   0x3f3504f3                      # float 0.707106769
  2121  .LCPI14_6:
  2122          .long   0xbf800000                      # float -1
  2123  .LCPI14_7:
        # _7 .. _15: log(1+m) polynomial coefficients, highest degree first
        # (values match the Cephes logf kernel).
  2124          .long   0x3d9021bb                      # float 0.0703768358
  2125  .LCPI14_8:
  2126          .long   0xbdebd1b8                      # float -0.115146101
  2127  .LCPI14_9:
  2128          .long   0x3def251a                      # float 0.116769984
  2129  .LCPI14_10:
  2130          .long   0xbdfe5d4f                      # float -0.12420141
  2131  .LCPI14_11:
  2132          .long   0x3e11e9bf                      # float 0.142493233
  2133  .LCPI14_12:
  2134          .long   0xbe2aae50                      # float -0.166680574
  2135  .LCPI14_13:
  2136          .long   0x3e4cceac                      # float 0.200007141
  2137  .LCPI14_14:
  2138          .long   0xbe7ffffc                      # float -0.24999994
  2139  .LCPI14_15:
  2140          .long   0x3eaaaaaa                      # float 0.333333313
  2141  .LCPI14_16:
        # ln(2): weights the exponent contribution e*ln(2).
  2142          .long   0x3f317218                      # float 0.693147182
  2143  .LCPI14_17:
  2144          .long   0xbf000000                      # float -0.5
  2145  .LCPI14_18:
        # 32 bytes of +0.0f: right-hand side of the per-lane x <= 0 test.
  2146          .zero   32
  2147  Log_Len8x_F32_V(float*, unsigned long):                 # @Log_Len8x_F32_V(float*, unsigned long)
        #-----------------------------------------------------------------------
        # void Log_Len8x_F32_V(float *x, size_t n)
        # ABI:  SysV AMD64 (rdi = x, rsi = n); clobbers rax, ymm0-ymm15, flags.
        # In-place natural logarithm of n floats, 8 lanes per iteration with no
        # scalar tail, so n is expected to be a multiple of 8.
        # Same kernel as Log2_Len8x_F32_V above (Cephes-logf-style polynomial)
        # except the result is NOT scaled by log2(e) at the end.
        # Lanes with x <= 0 are overwritten with an all-ones pattern (a NaN
        # bit pattern) by the vcmpleps/vorps at the bottom of the loop.
        # NOTE(review): negative rsp offsets rely on the SysV red zone; valid
        # only because this is a leaf function (no calls).
        #-----------------------------------------------------------------------
  2148          subq    $104, %rsp
  2149          testq   %rsi, %rsi
  2150          je      .LBB14_3
  2151          xorl    %eax, %eax
        # Hoist loop constants; seven live in stack slots, the rest in regs.
  2152          vbroadcastss    .LCPI14_0(%rip), %ymm0  # ymm0 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
  2153          vmovups %ymm0, 64(%rsp)                 # 32-byte Spill
  2154          vbroadcastss    .LCPI14_1(%rip), %ymm0  # ymm0 = [2155872255,2155872255,2155872255,2155872255,2155872255,2155872255,2155872255,2155872255]
  2155          vmovups %ymm0, 32(%rsp)                 # 32-byte Spill
  2156          vbroadcastss    .LCPI14_2(%rip), %ymm0  # ymm0 = [1056964608,1056964608,1056964608,1056964608,1056964608,1056964608,1056964608,1056964608]
  2157          vmovups %ymm0, (%rsp)                   # 32-byte Spill
  2158          vbroadcastss    .LCPI14_3(%rip), %ymm0  # ymm0 = [4294967169,4294967169,4294967169,4294967169,4294967169,4294967169,4294967169,4294967169]
  2159          vmovups %ymm0, -32(%rsp)                # 32-byte Spill
  2160          vbroadcastss    .LCPI14_4(%rip), %ymm0  # ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
  2161          vmovups %ymm0, -64(%rsp)                # 32-byte Spill
  2162          vbroadcastss    .LCPI14_5(%rip), %ymm0  # ymm0 = [7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1]
  2163          vmovups %ymm0, -96(%rsp)                # 32-byte Spill
  2164          vbroadcastss    .LCPI14_6(%rip), %ymm0  # ymm0 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
  2165          vmovups %ymm0, -128(%rsp)               # 32-byte Spill
  2166          vbroadcastss    .LCPI14_7(%rip), %ymm8  # ymm8 = [7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2]
  2167          vbroadcastss    .LCPI14_8(%rip), %ymm9  # ymm9 = [-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1]
  2168          vbroadcastss    .LCPI14_9(%rip), %ymm10 # ymm10 = [1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1]
  2169          vbroadcastss    .LCPI14_10(%rip), %ymm11 # ymm11 = [-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1]
  2170          vbroadcastss    .LCPI14_11(%rip), %ymm12 # ymm12 = [1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1]
  2171          vbroadcastss    .LCPI14_12(%rip), %ymm13 # ymm13 = [-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1]
  2172          vbroadcastss    .LCPI14_13(%rip), %ymm14 # ymm14 = [2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1]
  2173          vbroadcastss    .LCPI14_14(%rip), %ymm15 # ymm15 = [-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1]
  2174          vbroadcastss    .LCPI14_15(%rip), %ymm0 # ymm0 = [3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1]
  2175          vbroadcastss    .LCPI14_16(%rip), %ymm1 # ymm1 = [6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1,6.93147182E-1]
  2176          vbroadcastss    .LCPI14_17(%rip), %ymm2 # ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
  2177  .LBB14_2:                               # =>This Inner Loop Header: Depth=1
        # ymm3 = x[i..i+7], kept untouched for the x <= 0 test at the bottom.
  2178          vmovups (%rdi,%rax,4), %ymm3
        # Clamp to FLT_MIN; e = (bits >> 23) - 127; m = mantissa | 0.5f.
  2179          vmaxps  64(%rsp), %ymm3, %ymm4          # 32-byte Folded Reload
  2180          vpsrld  $23, %ymm4, %ymm5
  2181          vpaddd  -32(%rsp), %ymm5, %ymm5         # 32-byte Folded Reload
  2182          vandps  32(%rsp), %ymm4, %ymm4          # 32-byte Folded Reload
  2183          vorps   (%rsp), %ymm4, %ymm4            # 32-byte Folded Reload
  2184          vcvtdq2ps       %ymm5, %ymm5
        # Renormalise: if m < sqrt(0.5) keep e and use m = 2m - 1,
        # else use e + 1 and m = m - 1 (blend selects per lane).
  2185          vaddps  -64(%rsp), %ymm5, %ymm6         # 32-byte Folded Reload
  2186          vcmpltps        -96(%rsp), %ymm4, %ymm7         # 32-byte Folded Reload
  2187          vblendvps       %ymm7, %ymm5, %ymm6, %ymm5
  2188          vandps  %ymm4, %ymm7, %ymm6
  2189          vaddps  -128(%rsp), %ymm4, %ymm4        # 32-byte Folded Reload
  2190          vaddps  %ymm6, %ymm4, %ymm4
        # Horner evaluation of the degree-8 polynomial P(m) (ends with -0.5).
  2191          vmovaps %ymm8, %ymm6
  2192          vfmadd213ps     %ymm9, %ymm4, %ymm6     # ymm6 = (ymm4 * ymm6) + ymm9
  2193          vfmadd213ps     %ymm10, %ymm4, %ymm6    # ymm6 = (ymm4 * ymm6) + ymm10
  2194          vfmadd213ps     %ymm11, %ymm4, %ymm6    # ymm6 = (ymm4 * ymm6) + ymm11
  2195          vfmadd213ps     %ymm12, %ymm4, %ymm6    # ymm6 = (ymm4 * ymm6) + ymm12
  2196          vfmadd213ps     %ymm13, %ymm4, %ymm6    # ymm6 = (ymm4 * ymm6) + ymm13
  2197          vfmadd213ps     %ymm14, %ymm4, %ymm6    # ymm6 = (ymm4 * ymm6) + ymm14
  2198          vfmadd213ps     %ymm15, %ymm4, %ymm6    # ymm6 = (ymm4 * ymm6) + ymm15
  2199          vfmadd213ps     %ymm0, %ymm4, %ymm6     # ymm6 = (ymm4 * ymm6) + ymm0
  2200          vfmadd213ps     %ymm2, %ymm4, %ymm6     # ymm6 = (ymm4 * ymm6) + ymm2
        # ln(x) ~= e*ln(2) + m + m^2 * P(m)
  2201          vfmadd213ps     %ymm4, %ymm1, %ymm5     # ymm5 = (ymm1 * ymm5) + ymm4
  2202          vmulps  %ymm4, %ymm4, %ymm4
  2203          vfmadd231ps     %ymm6, %ymm4, %ymm5     # ymm5 = (ymm4 * ymm6) + ymm5
        # Force all-ones (NaN pattern) where x <= 0, then store.
  2204          vcmpleps        .LCPI14_18(%rip), %ymm3, %ymm3
  2205          vorps   %ymm5, %ymm3, %ymm3
  2206          vmovups %ymm3, (%rdi,%rax,4)
  2207          addq    $8, %rax
  2208          cmpq    %rsi, %rax
  2209          jb      .LBB14_2
  2210  .LBB14_3:
  2211          addq    $104, %rsp
  2212          vzeroupper
  2213          retq
  2214  .LCPI15_0:
        # Constant pool for Exp_Len8x_F32_V (below).
        # Overflow threshold ~ ln(FLT_MAX): inputs above saturate to FLT_MAX.
  2215          .long   0x42b17218                      # float 88.7228394
  2216  .LCPI15_1:
        # Underflow threshold: inputs below flush the result to zero.
  2217          .long   0xc2ce8ed0                      # float -103.278931
  2218  .LCPI15_2:
  2219          .long   0x3f000000                      # float 0.5
  2220  .LCPI15_3:
        # log2(e): used to compute k = round(x * log2(e)).
  2221          .long   0x3fb8aa3b                      # float 1.44269502
  2222  .LCPI15_4:
        # High part of -ln(2); paired with _5 below for an extended-precision
        # reduction r = x - k*ln(2).
  2223          .long   0xbf318000                      # float -0.693359375
  2224  .LCPI15_5:
        # Low-order correction: -0.693359375 + 2.12194442e-4 = -ln(2).
  2225          .long   0x395e8083                      # float 2.12194442E-4
  2226  .LCPI15_6:
        # 1.0f bit pattern (exponent bias << 23): added after k << 23 to build 2^k.
  2227          .long   1065353216                      # 0x3f800000
  2228  .LCPI15_7:
        # _7 .. _11: exp(r) polynomial coefficients, highest degree first.
  2229          .long   0x3ab743ce                      # float 0.00139819994
  2230  .LCPI15_8:
  2231          .long   0x39506967                      # float 1.98756912E-4
  2232  .LCPI15_9:
  2233          .long   0x3c088908                      # float 0.00833345205
  2234  .LCPI15_10:
  2235          .long   0x3d2aa9c1                      # float 0.0416657962
  2236  .LCPI15_11:
  2237          .long   0x3e2aaaaa                      # float 0.166666657
  2238  .LCPI15_12:
        # FLT_MAX: saturation value on overflow.
  2239          .long   0x7f7fffff                      # float 3.40282347E+38
  2240  Exp_Len8x_F32_V(float*, unsigned long):                 # @Exp_Len8x_F32_V(float*, unsigned long)
        #-----------------------------------------------------------------------
        # void Exp_Len8x_F32_V(float *x, size_t n)
        # ABI:  SysV AMD64 (rdi = x, rsi = n); clobbers rax, ymm0-ymm15, flags.
        # In-place expf over n floats, 8 lanes per iteration with no scalar
        # tail, so n is expected to be a multiple of 8.
        # Method (Cephes-expf-style): k = floor(x*log2(e) + 0.5);
        # r = x - k*ln(2) with ln(2) split into a high word and a small
        # correction for extra precision; exp(r) ~= 1 + r + r^2*P(r) via a
        # Horner polynomial; scale by 2^k built directly in the exponent field.
        # Range handling: x > 88.7228 -> FLT_MAX, x < -103.2789 -> 0.
        # NOTE(review): the two spilled constants live at -40/-72(%rsp) in the
        # SysV red zone; valid only because this is a leaf function.
        #-----------------------------------------------------------------------
  2241          testq   %rsi, %rsi
  2242          je      .LBB15_3
  2243          xorl    %eax, %eax
        # Hoist loop constants (the two range limits spill to the red zone).
  2244          vbroadcastss    .LCPI15_0(%rip), %ymm0  # ymm0 = [8.87228394E+1,8.87228394E+1,8.87228394E+1,8.87228394E+1,8.87228394E+1,8.87228394E+1,8.87228394E+1,8.87228394E+1]
  2245          vmovups %ymm0, -40(%rsp)                # 32-byte Spill
  2246          vbroadcastss    .LCPI15_1(%rip), %ymm0  # ymm0 = [-1.03278931E+2,-1.03278931E+2,-1.03278931E+2,-1.03278931E+2,-1.03278931E+2,-1.03278931E+2,-1.03278931E+2,-1.03278931E+2]
  2247          vmovups %ymm0, -72(%rsp)                # 32-byte Spill
  2248          vbroadcastss    .LCPI15_2(%rip), %ymm2  # ymm2 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1]
  2249          vbroadcastss    .LCPI15_3(%rip), %ymm3  # ymm3 = [1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0]
  2250          vbroadcastss    .LCPI15_4(%rip), %ymm4  # ymm4 = [-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1]
  2251          vbroadcastss    .LCPI15_5(%rip), %ymm5  # ymm5 = [2.12194442E-4,2.12194442E-4,2.12194442E-4,2.12194442E-4,2.12194442E-4,2.12194442E-4,2.12194442E-4,2.12194442E-4]
  2252          vpbroadcastd    .LCPI15_6(%rip), %ymm6  # ymm6 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216]
  2253          vbroadcastss    .LCPI15_7(%rip), %ymm7  # ymm7 = [1.39819994E-3,1.39819994E-3,1.39819994E-3,1.39819994E-3,1.39819994E-3,1.39819994E-3,1.39819994E-3,1.39819994E-3]
  2254          vbroadcastss    .LCPI15_8(%rip), %ymm1  # ymm1 = [1.98756912E-4,1.98756912E-4,1.98756912E-4,1.98756912E-4,1.98756912E-4,1.98756912E-4,1.98756912E-4,1.98756912E-4]
  2255          vbroadcastss    .LCPI15_9(%rip), %ymm9  # ymm9 = [8.33345205E-3,8.33345205E-3,8.33345205E-3,8.33345205E-3,8.33345205E-3,8.33345205E-3,8.33345205E-3,8.33345205E-3]
  2256          vbroadcastss    .LCPI15_10(%rip), %ymm10 # ymm10 = [4.16657962E-2,4.16657962E-2,4.16657962E-2,4.16657962E-2,4.16657962E-2,4.16657962E-2,4.16657962E-2,4.16657962E-2]
  2257          vbroadcastss    .LCPI15_11(%rip), %ymm11 # ymm11 = [1.66666657E-1,1.66666657E-1,1.66666657E-1,1.66666657E-1,1.66666657E-1,1.66666657E-1,1.66666657E-1,1.66666657E-1]
  2258          vbroadcastss    .LCPI15_12(%rip), %ymm12 # ymm12 = [3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38]
  2259  .LBB15_2:                               # =>This Inner Loop Header: Depth=1
        # ymm13 = x[i..i+7], kept for the two range tests at the bottom.
  2260          vmovups (%rdi,%rax,4), %ymm13
        # k = floor(x*log2(e) + 0.5)  (vroundps imm=1 rounds toward -inf).
  2261          vmovaps %ymm3, %ymm14
  2262          vfmadd213ps     %ymm2, %ymm13, %ymm14   # ymm14 = (ymm13 * ymm14) + ymm2
  2263          vroundps        $1, %ymm14, %ymm14
        # r = x - k*ln(2): high word first, then the low-order correction.
  2264          vmovaps %ymm4, %ymm15
  2265          vfmadd213ps     %ymm13, %ymm14, %ymm15  # ymm15 = (ymm14 * ymm15) + ymm13
  2266          vfmadd231ps     %ymm5, %ymm14, %ymm15   # ymm15 = (ymm14 * ymm5) + ymm15
        # p = r + r^2 * P(r): Horner chain ending with the + r term.
  2267          vmulps  %ymm15, %ymm15, %ymm0
  2268          vmovaps %ymm1, %ymm8
  2269          vfmadd213ps     %ymm7, %ymm15, %ymm8    # ymm8 = (ymm15 * ymm8) + ymm7
  2270          vfmadd213ps     %ymm9, %ymm15, %ymm8    # ymm8 = (ymm15 * ymm8) + ymm9
  2271          vfmadd213ps     %ymm10, %ymm15, %ymm8   # ymm8 = (ymm15 * ymm8) + ymm10
  2272          vfmadd213ps     %ymm11, %ymm15, %ymm8   # ymm8 = (ymm15 * ymm8) + ymm11
  2273          vfmadd213ps     %ymm2, %ymm15, %ymm8    # ymm8 = (ymm15 * ymm8) + ymm2
  2274          vfmadd213ps     %ymm15, %ymm0, %ymm8    # ymm8 = (ymm0 * ymm8) + ymm15
        # Build 2^k: shift k into the exponent field and add the 1.0f bias.
  2275          vcvttps2dq      %ymm14, %ymm0
  2276          vpslld  $23, %ymm0, %ymm0
  2277          vpaddd  %ymm6, %ymm0, %ymm0
        # result = 2^k * (1 + p)
  2278          vfmadd213ps     %ymm0, %ymm0, %ymm8     # ymm8 = (ymm0 * ymm8) + ymm0
        # Overflow: x > 88.72 -> FLT_MAX; underflow: x < -103.28 -> 0.
  2279          vmovups -40(%rsp), %ymm0                # 32-byte Reload
  2280          vcmpltps        %ymm13, %ymm0, %ymm0
  2281          vblendvps       %ymm0, %ymm12, %ymm8, %ymm0
  2282          vmovups -72(%rsp), %ymm8                # 32-byte Reload
  2283          vcmpleps        %ymm13, %ymm8, %ymm8
  2284          vandps  %ymm0, %ymm8, %ymm0
  2285          vmovups %ymm0, (%rdi,%rax,4)
  2286          addq    $8, %rax
  2287          cmpq    %rsi, %rax
  2288          jb      .LBB15_2
  2289  .LBB15_3:
  2290          vzeroupper
  2291          retq
  2292  .LCPI16_0:
        # Constant pool for Sin_F32_V (below).
        # 0x7fffffff: abs mask (clears the sign bit).
  2293          .long   2147483647                      # 0x7fffffff
  2294  .LCPI16_1:
        # 4/pi: scales x into octants for the argument reduction.
  2295          .long   0x3fa2f983                      # float 1.27323949
  2296  .LCPI16_2:
        # ~1: clears bit 0 so the octant index j becomes even ((j+1) & ~1).
  2297          .long   4294967294                      # 0xfffffffe
  2298  .LCPI16_3:
        # Octant bit 1: selects the cosine polynomial when set.
  2299          .long   2                               # 0x2
  2300  .LCPI16_4:
        # -pi/4 (single word): reduction r = |x| - j*(pi/4).
  2301          .long   0xbf490fdb                      # float -0.785398185
  2302  .LCPI16_5:
        # 0x80000000: IEEE-754 sign bit.
  2303          .long   2147483648                      # 0x80000000
  2304  .LCPI16_6:
        # _6 .. _10: cosine polynomial coefficients (highest degree first,
        # ending with -0.5 and the constant term 1).
  2305          .long   0x37ccf5ce                      # float 2.44331568E-5
  2306  .LCPI16_7:
  2307          .long   0xbab6061a                      # float -0.00138873165
  2308  .LCPI16_8:
  2309          .long   0x3d2aaaa5                      # float 0.0416666456
  2310  .LCPI16_9:
  2311          .long   0xbf000000                      # float -0.5
  2312  .LCPI16_10:
  2313          .long   0x3f800000                      # float 1
  2314  .LCPI16_11:
        # _11 .. _13: sine polynomial coefficients (highest degree first).
  2315          .long   0xb94ca1f9                      # float -1.95152956E-4
  2316  .LCPI16_12:
  2317          .long   0x3c08839e                      # float 0.00833216123
  2318  .LCPI16_13:
  2319          .long   0xbe2aaaa3                      # float -0.166666552
  2320  .LCPI16_14:
        # 16777215 = 2^24 - 1: range limit used by the scalar tail's guard.
  2321          .long   0x4b7fffff                      # float 16777215
  2322  .LCPI16_15:
  2323          .long   0x00000000                      # float 0
  2324  .LCPI16_16:
        # 32 bytes of 0xff (-1 per lane): vpsubd by -1 implements j + 1.
  2325          .zero   32,255
  2326  .LCPI16_17:
        # 32 zero bytes: compared against (j & 2) to select the sine branch.
  2327          .zero   32
  2328  Sin_F32_V(float*, unsigned long):                        # @Sin_F32_V(float*, unsigned long)
        #-----------------------------------------------------------------------
        # void Sin_F32_V(float *x, size_t n)
        # ABI:  SysV AMD64 (rdi = x, rsi = n); clobbers rax, rcx, rdx, r8,
        #       xmm/ymm0-15, flags.
        # In-place sinf: the vector loop handles rax = n & ~7 elements (8 per
        # iteration); the scalar loop finishes the remaining 0..7.
        # Method (Cephes-sinf-style reduction): j = (int)(|x| * 4/pi) rounded
        # up to even, r = |x| - j*(pi/4); even (j & 2) == 0 octant pairs use the
        # sine polynomial, the others the cosine polynomial; the final sign is
        # sign(x) XOR octant bit 2.
        # NOTE(review): the reduction uses a single pi/4 word (no split-word
        # correction), so accuracy degrades as |x| grows.  The scalar tail
        # skips elements with |x| > 16777215 (left unchanged) but the vector
        # loop has no such guard -- confirm this asymmetry is intended.
        # The pushq/popq %rax pair only adjusts the frame by 8 bytes; the slot
        # itself is never stored through (leaf function, red-zone spills).
        #-----------------------------------------------------------------------
  2329          pushq   %rax
  2330          movq    %rsi, %rax
  2331          andq    $-8, %rax               # rax = n & ~7 (vector-loop bound)
  2332          je      .LBB16_3
  2333          xorl    %ecx, %ecx
        # Hoist vector-loop constants; four live in red-zone stack slots.
  2334          vbroadcastss    .LCPI16_0(%rip), %ymm0  # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
  2335          vmovups %ymm0, -32(%rsp)                # 32-byte Spill
  2336          vbroadcastss    .LCPI16_1(%rip), %ymm0  # ymm0 = [1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0]
  2337          vmovups %ymm0, -64(%rsp)                # 32-byte Spill
  2338          vbroadcastss    .LCPI16_2(%rip), %ymm0  # ymm0 = [4294967294,4294967294,4294967294,4294967294,4294967294,4294967294,4294967294,4294967294]
  2339          vmovups %ymm0, -96(%rsp)                # 32-byte Spill
  2340          vpbroadcastd    .LCPI16_3(%rip), %ymm4  # ymm4 = [2,2,2,2,2,2,2,2]
  2341          vpbroadcastd    .LCPI16_4(%rip), %ymm0  # ymm0 = [-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1]
  2342          vmovdqu %ymm0, -128(%rsp)               # 32-byte Spill
  2343          vpbroadcastd    .LCPI16_5(%rip), %ymm7  # ymm7 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
  2344          vbroadcastss    .LCPI16_6(%rip), %ymm8  # ymm8 = [2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5]
  2345          vbroadcastss    .LCPI16_7(%rip), %ymm9  # ymm9 = [-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3]
  2346          vbroadcastss    .LCPI16_8(%rip), %ymm10 # ymm10 = [4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2]
  2347          vbroadcastss    .LCPI16_9(%rip), %ymm11 # ymm11 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
  2348          vbroadcastss    .LCPI16_10(%rip), %ymm12 # ymm12 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
  2349          vbroadcastss    .LCPI16_11(%rip), %ymm3 # ymm3 = [-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4]
  2350          vbroadcastss    .LCPI16_12(%rip), %ymm14 # ymm14 = [8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3]
  2351          vbroadcastss    .LCPI16_13(%rip), %ymm15 # ymm15 = [-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1]
  2352  .LBB16_2:                               # =>This Inner Loop Header: Depth=1
        # ymm2 = x[i..i+7]; its sign bits feed the final sign fixup.
  2353          vmovups (%rdi,%rcx,4), %ymm2
        # j = (int)(|x| * 4/pi); j = (j + 1) & ~1  (subtracting -1 adds 1).
  2354          vandps  -32(%rsp), %ymm2, %ymm5         # 32-byte Folded Reload
  2355          vmulps  -64(%rsp), %ymm5, %ymm0         # 32-byte Folded Reload
  2356          vcvttps2dq      %ymm0, %ymm0
  2357          vpsubd  .LCPI16_16(%rip), %ymm0, %ymm0
  2358          vpand   -96(%rsp), %ymm0, %ymm1         # 32-byte Folded Reload
        # r = |x| + j * (-pi/4); ymm5 = r^2.
  2359          vcvtdq2ps       %ymm1, %ymm1
  2360          vfmadd132ps     -128(%rsp), %ymm5, %ymm1 # 32-byte Folded Reload
  2361          vmulps  %ymm1, %ymm1, %ymm5
        # Sine branch: s = r + r^3 * P(r^2).
  2362          vmovaps %ymm3, %ymm13
  2363          vfmadd213ps     %ymm14, %ymm5, %ymm13   # ymm13 = (ymm5 * ymm13) + ymm14
  2364          vfmadd213ps     %ymm15, %ymm5, %ymm13   # ymm13 = (ymm5 * ymm13) + ymm15
  2365          vmulps  %ymm1, %ymm5, %ymm6
  2366          vfmadd213ps     %ymm1, %ymm13, %ymm6    # ymm6 = (ymm13 * ymm6) + ymm1
        # Sign word: octant bit 2 (j & 4) shifted into bit 31, XORed with x.
  2367          vpslld  $29, %ymm0, %ymm1
  2368          vpand   %ymm4, %ymm0, %ymm0             # ymm0 = j & 2 (branch select)
  2369          vpxor   %ymm2, %ymm1, %ymm1
        # Cosine branch: c = 1 - r^2/2 + r^4 * Q(r^2).
  2370          vmovaps %ymm8, %ymm2
  2371          vfmadd213ps     %ymm9, %ymm5, %ymm2     # ymm2 = (ymm5 * ymm2) + ymm9
  2372          vfmadd213ps     %ymm10, %ymm5, %ymm2    # ymm2 = (ymm5 * ymm2) + ymm10
  2373          vfmadd213ps     %ymm11, %ymm5, %ymm2    # ymm2 = (ymm5 * ymm2) + ymm11
  2374          vfmadd213ps     %ymm12, %ymm5, %ymm2    # ymm2 = (ymm5 * ymm2) + ymm12
        # Merge: (j & 2) == 2 lanes take cos, (j & 2) == 0 lanes take sin;
        # the two masks are disjoint so the vaddps is a select-or.
  2375          vpcmpeqd        %ymm4, %ymm0, %ymm5
  2376          vandps  %ymm5, %ymm2, %ymm2
  2377          vpcmpeqd        .LCPI16_17(%rip), %ymm0, %ymm0
  2378          vandps  %ymm0, %ymm6, %ymm0
  2379          vaddps  %ymm2, %ymm0, %ymm0
        # Apply the combined sign bit and store.
  2380          vpand   %ymm7, %ymm1, %ymm1
  2381          vpxor   %ymm0, %ymm1, %ymm0
  2382          vmovdqu %ymm0, (%rdi,%rcx,4)
  2383          addq    $8, %rcx
  2384          cmpq    %rax, %rcx
  2385          jb      .LBB16_2
  2386  .LBB16_3:
        # Scalar tail: elements rax .. n-1.
  2387          cmpq    %rsi, %rax
  2388          jae     .LBB16_14
  2389          vbroadcastss    .LCPI16_5(%rip), %xmm0  # xmm0 = [2147483648,2147483648,2147483648,2147483648]
  2390          vmovss  .LCPI16_14(%rip), %xmm1         # xmm1 = mem[0],zero,zero,zero
  2391          vmovss  .LCPI16_1(%rip), %xmm9          # xmm9 = mem[0],zero,zero,zero
  2392          vmovss  .LCPI16_10(%rip), %xmm10        # xmm10 = mem[0],zero,zero,zero
  2393          vmovss  .LCPI16_4(%rip), %xmm11         # xmm11 = mem[0],zero,zero,zero
  2394          vmovss  .LCPI16_6(%rip), %xmm13         # xmm13 = mem[0],zero,zero,zero
  2395          vmovss  .LCPI16_7(%rip), %xmm12         # xmm12 = mem[0],zero,zero,zero
  2396          vmovss  .LCPI16_8(%rip), %xmm14         # xmm14 = mem[0],zero,zero,zero
  2397          vmovss  .LCPI16_9(%rip), %xmm15         # xmm15 = mem[0],zero,zero,zero
  2398          vmovss  .LCPI16_11(%rip), %xmm8         # xmm8 = mem[0],zero,zero,zero
  2399          vmovss  .LCPI16_12(%rip), %xmm5         # xmm5 = mem[0],zero,zero,zero
  2400          vmovss  .LCPI16_13(%rip), %xmm7         # xmm7 = mem[0],zero,zero,zero
  2401          jmp     .LBB16_5
  2402  .LBB16_13:                              #   in Loop: Header=BB16_5 Depth=1
  2403          incq    %rax
  2404          cmpq    %rsi, %rax
  2405          jae     .LBB16_14
  2406  .LBB16_5:                               # =>This Inner Loop Header: Depth=1
        # xmm6 = |x| via max(x, -x); skip (leave x[i] as-is) when |x| > 2^24-1.
  2407          vmovss  (%rdi,%rax,4), %xmm2            # xmm2 = mem[0],zero,zero,zero
  2408          vxorps  %xmm0, %xmm2, %xmm3
  2409          vmaxss  %xmm2, %xmm3, %xmm6
  2410          vucomiss        %xmm1, %xmm6
  2411          ja      .LBB16_13
        # r8b = (x < 0): CF from the compare against 0.0 (the later vmulss /
        # vcvttss2si do not touch EFLAGS, so setb still sees it).
  2412          vucomiss        .LCPI16_15(%rip), %xmm2
  2413          vmulss  %xmm6, %xmm9, %xmm2
  2414          vcvttss2si      %xmm2, %ecx
  2415          setb    %r8b
  2416          vroundss        $11, %xmm2, %xmm2, %xmm2
        # Round j up to even: if j is odd, j += 1 (and bump the float copy).
  2417          movl    %ecx, %edx
  2418          andl    $1, %edx
  2419          je      .LBB16_8
  2420          vaddss  %xmm2, %xmm10, %xmm2
  2421  .LBB16_8:                               #   in Loop: Header=BB16_5 Depth=1
        # quadrant = j mod 8 folded into 0..3; dl = (j mod 8 >= 4) sign flag.
  2422          addl    %ecx, %edx
  2423          andl    $7, %edx
  2424          leal    -4(%rdx), %ecx
  2425          cmpl    $4, %edx
  2426          cmovbl  %edx, %ecx
  2427          setae   %dl
        # r = j*(-pi/4) + |x|; xmm2 = r^2; xmm3 = cos poly tail, xmm4 = sin tail.
  2428          vfmadd231ss     %xmm11, %xmm2, %xmm6    # xmm6 = (xmm2 * xmm11) + xmm6
  2429          vmulss  %xmm6, %xmm6, %xmm2
  2430          vmovaps %xmm13, %xmm3
  2431          vfmadd213ss     %xmm12, %xmm2, %xmm3    # xmm3 = (xmm2 * xmm3) + xmm12
  2432          vfmadd213ss     %xmm14, %xmm2, %xmm3    # xmm3 = (xmm2 * xmm3) + xmm14
  2433          vfmadd213ss     %xmm15, %xmm2, %xmm3    # xmm3 = (xmm2 * xmm3) + xmm15
  2434          vmovaps %xmm8, %xmm4
  2435          vfmadd213ss     %xmm5, %xmm2, %xmm4     # xmm4 = (xmm2 * xmm4) + xmm5
  2436          vfmadd213ss     %xmm7, %xmm2, %xmm4     # xmm4 = (xmm2 * xmm4) + xmm7
        # quadrant in {1,2} -> cosine branch (.LBB16_9); {0,3} -> sine branch.
  2437          decl    %ecx
  2438          cmpl    $2, %ecx
  2439          jb      .LBB16_9
        # Sine: result = r + r^3 * P(r^2).
  2440          vmulss  %xmm6, %xmm2, %xmm2
  2441          vfmadd213ss     %xmm6, %xmm2, %xmm4     # xmm4 = (xmm2 * xmm4) + xmm6
  2442          vmovaps %xmm4, %xmm2
  2443          vmovss  %xmm2, (%rdi,%rax,4)
        # Negate iff sign(x) differs from the quadrant>=4 flag.
  2444          cmpb    %dl, %r8b
  2445          je      .LBB16_13
  2446          jmp     .LBB16_12
  2447  .LBB16_9:                               #   in Loop: Header=BB16_5 Depth=1
        # Cosine: result = 1 + r^2 * Q(r^2).
  2448          vfmadd213ss     %xmm10, %xmm3, %xmm2    # xmm2 = (xmm3 * xmm2) + xmm10
  2449          vmovss  %xmm2, (%rdi,%rax,4)
  2450          cmpb    %dl, %r8b
  2451          je      .LBB16_13
  2452  .LBB16_12:                              #   in Loop: Header=BB16_5 Depth=1
        # Sign flip: rewrite the element with its sign bit toggled.
  2453          vxorps  %xmm0, %xmm2, %xmm2
  2454          vmovss  %xmm2, (%rdi,%rax,4)
  2455          jmp     .LBB16_13
  2456  .LBB16_14:
  2457          popq    %rax
  2458          vzeroupper
  2459          retq
  2460  .LCPI17_0:
        # Constant pool for Cos_F32_V (function continues past this excerpt).
        # Mirrors the Sin_F32_V pool except for _5 (0xc0000000, presumably the
        # quadrant offset cos needs -- TODO confirm against the full function).
        # 0x7fffffff: abs mask (clears the sign bit).
  2461          .long   2147483647                      # 0x7fffffff
  2462  .LCPI17_1:
        # 4/pi: scales x into octants for the argument reduction.
  2463          .long   0x3fa2f983                      # float 1.27323949
  2464  .LCPI17_2:
        # ~1: clears bit 0 so the octant index becomes even.
  2465          .long   4294967294                      # 0xfffffffe
  2466  .LCPI17_3:
        # Octant bit 1: polynomial-branch selector.
  2467          .long   2                               # 0x2
  2468  .LCPI17_4:
        # -pi/4 (single word): reduction step.
  2469          .long   0xbf490fdb                      # float -0.785398185
  2470  .LCPI17_5:
  2471          .long   3221225472                      # 0xc0000000
  2472  .LCPI17_6:
        # _6 .. _10: cosine polynomial coefficients (as in the sin pool).
  2473          .long   0x37ccf5ce                      # float 2.44331568E-5
  2474  .LCPI17_7:
  2475          .long   0xbab6061a                      # float -0.00138873165
  2476  .LCPI17_8:
  2477          .long   0x3d2aaaa5                      # float 0.0416666456
  2478  .LCPI17_9:
  2479          .long   0xbf000000                      # float -0.5
  2480  .LCPI17_10:
  2481          .long   0x3f800000                      # float 1
  2482  .LCPI17_11:
        # _11 .. _13: sine polynomial coefficients.
  2483          .long   0xb94ca1f9                      # float -1.95152956E-4
  2484  .LCPI17_12:
  2485          .long   0x3c08839e                      # float 0.00833216123
  2486  .LCPI17_13:
  2487          .long   0xbe2aaaa3                      # float -0.166666552
  2488  .LCPI17_14:
        # 0x80000000: IEEE-754 sign bit.
  2489          .long   2147483648                      # 0x80000000
  2490  .LCPI17_15:
        # 16777215 = 2^24 - 1: range-guard limit.
  2491          .long   0x4b7fffff                      # float 16777215
  2492  .LCPI17_16:
        # 32 bytes of 0xff (-1 per lane).
  2493          .zero   32,255
  2494  .LCPI17_17:
        # 32 zero bytes: comparison source for branch selection.
  2495          .zero   32
  2496  Cos_F32_V(float*, unsigned long):                        # @Cos_F32_V(float*, unsigned long)
  2497          subq    $72, %rsp
  2498          movq    %rsi, %rax
  2499          andq    $-8, %rax
  2500          je      .LBB17_3
  2501          xorl    %ecx, %ecx
  2502          vbroadcastss    .LCPI17_0(%rip), %ymm0  # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
  2503          vmovups %ymm0, 32(%rsp)                 # 32-byte Spill
  2504          vbroadcastss    .LCPI17_1(%rip), %ymm0  # ymm0 = [1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0]
  2505          vmovups %ymm0, (%rsp)                   # 32-byte Spill
  2506          vbroadcastss    .LCPI17_2(%rip), %ymm0  # ymm0 = [4294967294,4294967294,4294967294,4294967294,4294967294,4294967294,4294967294,4294967294]
  2507          vmovups %ymm0, -32(%rsp)                # 32-byte Spill
  2508          vpbroadcastd    .LCPI17_3(%rip), %ymm4  # ymm4 = [2,2,2,2,2,2,2,2]
  2509          vbroadcastss    .LCPI17_4(%rip), %ymm0  # ymm0 = [-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1]
  2510          vmovups %ymm0, -64(%rsp)                # 32-byte Spill
  2511          vbroadcastss    .LCPI17_5(%rip), %ymm0  # ymm0 = [3221225472,3221225472,3221225472,3221225472,3221225472,3221225472,3221225472,3221225472]
  2512          vmovups %ymm0, -96(%rsp)                # 32-byte Spill
  2513          vbroadcastss    .LCPI17_6(%rip), %ymm0  # ymm0 = [2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5]
  2514          vmovups %ymm0, -128(%rsp)               # 32-byte Spill
  2515          vbroadcastss    .LCPI17_7(%rip), %ymm9  # ymm9 = [-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3]
  2516          vbroadcastss    .LCPI17_8(%rip), %ymm10 # ymm10 = [4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2]
  2517          vbroadcastss    .LCPI17_9(%rip), %ymm6  # ymm6 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
  2518          vbroadcastss    .LCPI17_10(%rip), %ymm12 # ymm12 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
  2519          vbroadcastss    .LCPI17_11(%rip), %ymm13 # ymm13 = [-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4]
  2520          vbroadcastss    .LCPI17_12(%rip), %ymm14 # ymm14 = [8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3]
  2521          vbroadcastss    .LCPI17_13(%rip), %ymm15 # ymm15 = [-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1]
  2522          vpbroadcastd    .LCPI17_14(%rip), %ymm2 # ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
  2523  .LBB17_2:                               # =>This Inner Loop Header: Depth=1
  2524          vmovups 32(%rsp), %ymm0                 # 32-byte Reload
  2525          vandps  (%rdi,%rcx,4), %ymm0, %ymm5
  2526          vmulps  (%rsp), %ymm5, %ymm0            # 32-byte Folded Reload
  2527          vcvttps2dq      %ymm0, %ymm0
  2528          vpsubd  .LCPI17_16(%rip), %ymm0, %ymm0
  2529          vpand   -32(%rsp), %ymm0, %ymm1         # 32-byte Folded Reload
  2530          vcvtdq2ps       %ymm1, %ymm3
  2531          vfmadd132ps     -64(%rsp), %ymm5, %ymm3 # 32-byte Folded Reload
  2532          vmulps  %ymm3, %ymm3, %ymm5
  2533          vmovups -128(%rsp), %ymm8               # 32-byte Reload
  2534          vfmadd213ps     %ymm9, %ymm5, %ymm8     # ymm8 = (ymm5 * ymm8) + ymm9
  2535          vfmadd213ps     %ymm10, %ymm5, %ymm8    # ymm8 = (ymm5 * ymm8) + ymm10
  2536          vmulps  %ymm5, %ymm5, %ymm7
  2537          vmovaps %ymm6, %ymm11
  2538          vfmadd213ps     %ymm12, %ymm5, %ymm11   # ymm11 = (ymm5 * ymm11) + ymm12
  2539          vfmadd231ps     %ymm7, %ymm8, %ymm11    # ymm11 = (ymm8 * ymm7) + ymm11
  2540          vmovaps %ymm13, %ymm7
  2541          vfmadd213ps     %ymm14, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm14
  2542          vfmadd213ps     %ymm15, %ymm5, %ymm7    # ymm7 = (ymm5 * ymm7) + ymm15
  2543          vmulps  %ymm3, %ymm5, %ymm5
  2544          vfmadd213ps     %ymm3, %ymm7, %ymm5     # ymm5 = (ymm7 * ymm5) + ymm3
  2545          vpand   %ymm4, %ymm0, %ymm0
  2546          vpcmpeqd        %ymm4, %ymm0, %ymm3
  2547          vpcmpeqd        .LCPI17_17(%rip), %ymm0, %ymm0
  2548          vandps  %ymm0, %ymm5, %ymm0
  2549          vandps  %ymm3, %ymm11, %ymm3
  2550          vaddps  %ymm3, %ymm0, %ymm0
  2551          vaddps  %ymm5, %ymm11, %ymm3
  2552          vsubps  %ymm0, %ymm3, %ymm0
  2553          vpslld  $29, %ymm1, %ymm1
  2554          vpaddd  -96(%rsp), %ymm1, %ymm1         # 32-byte Folded Reload
  2555          vpand   %ymm2, %ymm1, %ymm1
  2556          vpxor   %ymm2, %ymm1, %ymm1
  2557          vxorps  %ymm1, %ymm0, %ymm0
  2558          vmovups %ymm0, (%rdi,%rcx,4)
  2559          addq    $8, %rcx
  2560          cmpq    %rax, %rcx
  2561          jb      .LBB17_2
  2562  .LBB17_3:
  2563          cmpq    %rsi, %rax
  2564          jae     .LBB17_14
  2565          vbroadcastss    .LCPI17_14(%rip), %xmm0 # xmm0 = [2147483648,2147483648,2147483648,2147483648]
  2566          vmovss  .LCPI17_15(%rip), %xmm1         # xmm1 = mem[0],zero,zero,zero
  2567          vmovss  .LCPI17_1(%rip), %xmm8          # xmm8 = mem[0],zero,zero,zero
  2568          vmovss  .LCPI17_10(%rip), %xmm9         # xmm9 = mem[0],zero,zero,zero
  2569          vmovss  .LCPI17_4(%rip), %xmm10         # xmm10 = mem[0],zero,zero,zero
  2570          vmovss  .LCPI17_6(%rip), %xmm12         # xmm12 = mem[0],zero,zero,zero
  2571          vmovss  .LCPI17_7(%rip), %xmm11         # xmm11 = mem[0],zero,zero,zero
  2572          vmovss  .LCPI17_8(%rip), %xmm13         # xmm13 = mem[0],zero,zero,zero
  2573          vmovss  .LCPI17_9(%rip), %xmm14         # xmm14 = mem[0],zero,zero,zero
  2574          vmovss  .LCPI17_11(%rip), %xmm2         # xmm2 = mem[0],zero,zero,zero
  2575          vmovss  .LCPI17_12(%rip), %xmm15        # xmm15 = mem[0],zero,zero,zero
  2576          vmovss  .LCPI17_13(%rip), %xmm6         # xmm6 = mem[0],zero,zero,zero
  2577          jmp     .LBB17_5
  2578  .LBB17_13:                              #   in Loop: Header=BB17_5 Depth=1
  2579          incq    %rax
  2580          cmpq    %rsi, %rax
  2581          jae     .LBB17_14
  2582  .LBB17_5:                               # =>This Inner Loop Header: Depth=1
  2583          vmovss  (%rdi,%rax,4), %xmm3            # xmm3 = mem[0],zero,zero,zero
  2584          vxorps  %xmm0, %xmm3, %xmm4
  2585          vmaxss  %xmm3, %xmm4, %xmm5
  2586          vucomiss        %xmm1, %xmm5
  2587          ja      .LBB17_13
  2588          vmulss  %xmm5, %xmm8, %xmm3
  2589          vcvttss2si      %xmm3, %ecx
  2590          vroundss        $11, %xmm3, %xmm3, %xmm7
  2591          movl    %ecx, %edx
  2592          andl    $1, %edx
  2593          je      .LBB17_8
  2594          vaddss  %xmm7, %xmm9, %xmm7
  2595  .LBB17_8:                               #   in Loop: Header=BB17_5 Depth=1
  2596          addl    %ecx, %edx
  2597          andl    $7, %edx
  2598          leal    -4(%rdx), %ecx
  2599          cmpl    $4, %edx
  2600          setae   %r8b
  2601          cmovbl  %edx, %ecx
  2602          cmpl    $2, %ecx
  2603          setae   %dl
  2604          vfmadd231ss     %xmm10, %xmm7, %xmm5    # xmm5 = (xmm7 * xmm10) + xmm5
  2605          vmulss  %xmm5, %xmm5, %xmm7
  2606          vmovaps %xmm12, %xmm4
  2607          vfmadd213ss     %xmm11, %xmm7, %xmm4    # xmm4 = (xmm7 * xmm4) + xmm11
  2608          vfmadd213ss     %xmm13, %xmm7, %xmm4    # xmm4 = (xmm7 * xmm4) + xmm13
  2609          vfmadd213ss     %xmm14, %xmm7, %xmm4    # xmm4 = (xmm7 * xmm4) + xmm14
  2610          vmovaps %xmm2, %xmm3
  2611          vfmadd213ss     %xmm15, %xmm7, %xmm3    # xmm3 = (xmm7 * xmm3) + xmm15
  2612          vfmadd213ss     %xmm6, %xmm7, %xmm3     # xmm3 = (xmm7 * xmm3) + xmm6
  2613          decl    %ecx
  2614          cmpl    $2, %ecx
  2615          jb      .LBB17_9
  2616          vfmadd213ss     %xmm9, %xmm4, %xmm7     # xmm7 = (xmm4 * xmm7) + xmm9
  2617          vmovaps %xmm7, %xmm3
  2618          vmovss  %xmm3, (%rdi,%rax,4)
  2619          cmpb    %dl, %r8b
  2620          je      .LBB17_13
  2621          jmp     .LBB17_12
  2622  .LBB17_9:                               #   in Loop: Header=BB17_5 Depth=1
  2623          vmulss  %xmm5, %xmm7, %xmm4
  2624          vfmadd213ss     %xmm5, %xmm4, %xmm3     # xmm3 = (xmm4 * xmm3) + xmm5
  2625          vmovss  %xmm3, (%rdi,%rax,4)
  2626          cmpb    %dl, %r8b
  2627          je      .LBB17_13
  2628  .LBB17_12:                              #   in Loop: Header=BB17_5 Depth=1
  2629          vxorps  %xmm0, %xmm3, %xmm3
  2630          vmovss  %xmm3, (%rdi,%rax,4)
  2631          jmp     .LBB17_13
  2632  .LBB17_14:
  2633          addq    $72, %rsp
  2634          vzeroupper
  2635          retq
  2636  .LCPI18_0:
  2637          .long   2147483647                      # 0x7fffffff
        # abs mask: ANDing clears the IEEE-754 sign bit -> |x|
  2638  .LCPI18_1:
  2639          .long   0x3fa2f983                      # float 1.27323949
        # ~= 4/pi: converts |x| into pi/4-quadrant units
  2640  .LCPI18_2:
  2641          .long   4294967294                      # 0xfffffffe
        # ~1 mask: forces the quadrant index even, j = (j+1) & ~1
  2642  .LCPI18_3:
  2643          .long   2                               # 0x2
        # quadrant bit tested with vpcmpeqd in the select below
  2644  .LCPI18_4:
  2645          .long   0xbf490fdb                      # float -0.785398185
        # -pi/4: argument reduction z = |x| + j*(-pi/4)
  2646  .LCPI18_5:
  2647          .long   3221225472                      # 0xc0000000
        # bias added to j<<29 when deriving the second output's sign bit
  2648  .LCPI18_6:
  2649          .long   2147483648                      # 0x80000000
        # sign-bit mask
  2650  .LCPI18_7:
  2651          .long   0x37ccf5ce                      # float 2.44331568E-5
  2652  .LCPI18_8:
  2653          .long   0xbab6061a                      # float -0.00138873165
  2654  .LCPI18_9:
  2655          .long   0x3d2aaaa5                      # float 0.0416666456
  2656  .LCPI18_10:
  2657          .long   0xbf000000                      # float -0.5
        # LCPI18_7..10: polynomial coefficients for cos(z) on [-pi/4, pi/4]
        # (values match the classic Cephes cosf minimax coefficients)
  2658  .LCPI18_11:
  2659          .long   0x3f800000                      # float 1
  2660  .LCPI18_12:
  2661          .long   0xb94ca1f9                      # float -1.95152956E-4
  2662  .LCPI18_13:
  2663          .long   0x3c08839e                      # float 0.00833216123
  2664  .LCPI18_14:
  2665          .long   0xbe2aaaa3                      # float -0.166666552
        # LCPI18_12..14: polynomial coefficients for the sin(z) branch
  2666  .LCPI18_15:
  2667          .long   0x4b7fffff                      # float 16777215
        # 2^24 - 1: scalar-tail cutoff; larger |x| elements are skipped
  2668  .LCPI18_16:
  2669          .long   0x00000000                      # float 0
        # 0.0f: scalar tail captures "input < 0" via ucomiss against this
  2670  .LCPI18_17:
  2671          .zero   32,255
        # ymm of all-ones dwords (-1 each); vpsubd by -1 == j + 1
  2672  .LCPI18_18:
  2673          .zero   32
        # ymm of zero dwords, for the (j & 2) == 0 compare
        #----------------------------------------------------------------------
        # void SinCos_F32_V(float *out1, float *out2, float *x, size_t n)
        #                                            System V AMD64, AVX2 + FMA
        # In:  rdi = out1, rsi = out2, rdx = x (input), rcx = n.
        # Computes sin/cos of each x[i] in one pass, sharing the argument
        # reduction and both polynomials.  The rdi stream gets the result
        # whose sign depends on the input's sign bit (odd function), i.e.
        # presumably sin; rsi then receives cos -- matches the SinCos name,
        # TODO(review): confirm against the Go wrapper's declaration.
        # NOTE(review): as in Cos_F32_V, only the scalar tail skips elements
        # with |x| > 2^24-1; the vector loop processes all lanes.
        # Clobbers: rax, r8-r11, rbx (saved), xmm/ymm0-15, flags; 96B scratch.
        #----------------------------------------------------------------------
  2674  SinCos_F32_V(float*, float*, float*, unsigned long):                # @SinCos_F32_V(float*, float*, float*, unsigned long)
  2675          pushq   %rbx
        # rbx is callee-saved; the scalar tail uses it as scratch
  2676          subq    $96, %rsp
  2677          movq    %rcx, %r8
  2678          andq    $-8, %r8
        # r8 = n rounded down to a multiple of 8 (vector-loop trip count)
  2679          je      .LBB18_3
  2680          xorl    %eax, %eax
        # Hoist loop constants; several spill to the stack (register pressure).
  2681          vbroadcastss    .LCPI18_0(%rip), %ymm0  # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
  2682          vmovups %ymm0, 64(%rsp)                 # 32-byte Spill
  2683          vbroadcastss    .LCPI18_1(%rip), %ymm0  # ymm0 = [1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0]
  2684          vmovups %ymm0, 32(%rsp)                 # 32-byte Spill
  2685          vbroadcastss    .LCPI18_2(%rip), %ymm0  # ymm0 = [4294967294,4294967294,4294967294,4294967294,4294967294,4294967294,4294967294,4294967294]
  2686          vmovups %ymm0, (%rsp)                   # 32-byte Spill
  2687          vpbroadcastd    .LCPI18_3(%rip), %ymm4  # ymm4 = [2,2,2,2,2,2,2,2]
  2688          vbroadcastss    .LCPI18_4(%rip), %ymm0  # ymm0 = [-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1]
  2689          vmovups %ymm0, -32(%rsp)                # 32-byte Spill
  2690          vbroadcastss    .LCPI18_5(%rip), %ymm0  # ymm0 = [3221225472,3221225472,3221225472,3221225472,3221225472,3221225472,3221225472,3221225472]
  2691          vmovups %ymm0, -64(%rsp)                # 32-byte Spill
  2692          vpbroadcastd    .LCPI18_6(%rip), %ymm8  # ymm8 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
  2693          vbroadcastss    .LCPI18_7(%rip), %ymm0  # ymm0 = [2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5]
  2694          vmovups %ymm0, -96(%rsp)                # 32-byte Spill
  2695          vbroadcastss    .LCPI18_8(%rip), %ymm0  # ymm0 = [-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3]
  2696          vmovups %ymm0, -128(%rsp)               # 32-byte Spill
  2697          vbroadcastss    .LCPI18_9(%rip), %ymm11 # ymm11 = [4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2]
  2698          vbroadcastss    .LCPI18_10(%rip), %ymm10 # ymm10 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
  2699          vbroadcastss    .LCPI18_11(%rip), %ymm13 # ymm13 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
  2700          vbroadcastss    .LCPI18_12(%rip), %ymm14 # ymm14 = [-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4]
  2701          vbroadcastss    .LCPI18_13(%rip), %ymm15 # ymm15 = [8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3]
  2702          vbroadcastss    .LCPI18_14(%rip), %ymm2 # ymm2 = [-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1]
  2703  .LBB18_2:                               # =>This Inner Loop Header: Depth=1
        # --- 8-lane iteration: rax = element index, steps by 8 ---
  2704          vmovups (%rdx,%rax,4), %ymm5
        # ymm5 = raw input (sign bits kept alive for the odd-function output)
  2705          vandps  64(%rsp), %ymm5, %ymm1          # 32-byte Folded Reload
        # ymm1 = |x|
  2706          vmulps  32(%rsp), %ymm1, %ymm0          # 32-byte Folded Reload
  2707          vcvttps2dq      %ymm0, %ymm0
        # ymm0 = j = trunc(|x| * 4/pi)
  2708          vpsubd  .LCPI18_17(%rip), %ymm0, %ymm3
        # ymm3 = j + 1 (raw index, before the even rounding)
  2709          vpand   (%rsp), %ymm3, %ymm0            # 32-byte Folded Reload
        # ymm0 = (j+1) & ~1 (even quadrant index)
  2710          vcvtdq2ps       %ymm0, %ymm6
  2711          vfmadd132ps     -32(%rsp), %ymm1, %ymm6 # 32-byte Folded Reload
        # ymm6 = z = |x| - j*(pi/4): reduced argument in [-pi/4, pi/4]
  2712          vmulps  %ymm6, %ymm6, %ymm1
        # ymm1 = z^2
        # cos branch: ymm12 = 1 - z^2/2 + z^4*P(z^2)
  2713          vmovups -96(%rsp), %ymm9                # 32-byte Reload
  2714          vfmadd213ps     -128(%rsp), %ymm1, %ymm9 # 32-byte Folded Reload
  2715          vfmadd213ps     %ymm11, %ymm1, %ymm9    # ymm9 = (ymm1 * ymm9) + ymm11
  2716          vmulps  %ymm1, %ymm1, %ymm7
  2717          vmovaps %ymm10, %ymm12
  2718          vfmadd213ps     %ymm13, %ymm1, %ymm12   # ymm12 = (ymm1 * ymm12) + ymm13
  2719          vfmadd231ps     %ymm7, %ymm9, %ymm12    # ymm12 = (ymm9 * ymm7) + ymm12
        # sin branch: ymm1 = z + z^3*Q(z^2)
  2720          vmovaps %ymm14, %ymm7
  2721          vfmadd213ps     %ymm15, %ymm1, %ymm7    # ymm7 = (ymm1 * ymm7) + ymm15
  2722          vfmadd213ps     %ymm2, %ymm1, %ymm7     # ymm7 = (ymm1 * ymm7) + ymm2
  2723          vmulps  %ymm6, %ymm1, %ymm1
  2724          vfmadd213ps     %ymm6, %ymm7, %ymm1     # ymm1 = (ymm7 * ymm1) + ymm6
        # First output's sign: input sign XOR bit 31 of (raw j)<<29
  2725          vpslld  $29, %ymm3, %ymm6
  2726          vpand   %ymm4, %ymm3, %ymm3
  2727          vpxor   %ymm5, %ymm6, %ymm5
        # Branchless select between the two polynomials by (j & 2)
  2728          vpcmpeqd        %ymm4, %ymm3, %ymm6
  2729          vpcmpeqd        .LCPI18_18(%rip), %ymm3, %ymm3
  2730          vandps  %ymm3, %ymm1, %ymm3
  2731          vandps  %ymm6, %ymm12, %ymm6
  2732          vaddps  %ymm3, %ymm6, %ymm3
  2733          vaddps  %ymm1, %ymm12, %ymm1
  2734          vpand   %ymm5, %ymm8, %ymm5
  2735          vsubps  %ymm3, %ymm1, %ymm1
  2736          vpxor   %ymm3, %ymm5, %ymm3
        # ymm3 = sign-adjusted odd-function result (written to rdi below)
        # Second output's sign: bit 31 of (even j)<<29 + 0xC0000000, inverted
  2737          vpslld  $29, %ymm0, %ymm0
  2738          vpaddd  -64(%rsp), %ymm0, %ymm0         # 32-byte Folded Reload
  2739          vpand   %ymm0, %ymm8, %ymm0
  2740          vpxor   %ymm0, %ymm8, %ymm0
  2741          vxorps  %ymm0, %ymm1, %ymm0
  2742          vmovdqu %ymm3, (%rdi,%rax,4)
  2743          vmovups %ymm0, (%rsi,%rax,4)
  2744          addq    $8, %rax
  2745          cmpq    %r8, %rax
  2746          jb      .LBB18_2
  2747  .LBB18_3:
        # --- scalar tail for the remaining n & 7 elements ---
  2748          cmpq    %rcx, %r8
  2749          jae     .LBB18_16
  2750          vbroadcastss    .LCPI18_6(%rip), %xmm0  # xmm0 = [2147483648,2147483648,2147483648,2147483648]
  2751          vmovss  .LCPI18_15(%rip), %xmm1         # xmm1 = mem[0],zero,zero,zero
  2752          vmovss  .LCPI18_11(%rip), %xmm3         # xmm3 = mem[0],zero,zero,zero
  2753          vmovss  .LCPI18_7(%rip), %xmm8          # xmm8 = mem[0],zero,zero,zero
  2754          vmovss  .LCPI18_8(%rip), %xmm11         # xmm11 = mem[0],zero,zero,zero
  2755          vmovss  .LCPI18_9(%rip), %xmm13         # xmm13 = mem[0],zero,zero,zero
  2756          vmovss  .LCPI18_10(%rip), %xmm14        # xmm14 = mem[0],zero,zero,zero
  2757          vmovss  .LCPI18_12(%rip), %xmm10        # xmm10 = mem[0],zero,zero,zero
  2758          vmovss  .LCPI18_13(%rip), %xmm15        # xmm15 = mem[0],zero,zero,zero
  2759          vmovss  .LCPI18_14(%rip), %xmm6         # xmm6 = mem[0],zero,zero,zero
  2760          jmp     .LBB18_5
  2761  .LBB18_15:                              #   in Loop: Header=BB18_5 Depth=1
        # tail-loop latch: advance element index, exit when r8 == n
  2762          incq    %r8
  2763          cmpq    %rcx, %r8
  2764          jae     .LBB18_16
  2765  .LBB18_5:                               # =>This Inner Loop Header: Depth=1
  2766          vmovss  (%rdx,%r8,4), %xmm4             # xmm4 = mem[0],zero,zero,zero
        # |x| = max(x, x ^ signbit) = max(x, -x)
  2767          vxorps  %xmm0, %xmm4, %xmm2
  2768          vmaxss  %xmm4, %xmm2, %xmm2
  2769          vucomiss        %xmm1, %xmm2
  2770          ja      .LBB18_15
        # |x| > 2^24-1: skip element, leaving both outputs unwritten here
  2771          vucomiss        .LCPI18_16(%rip), %xmm4
        # compare x against 0.0f; setb below latches r9b = (x < 0)
  2772          vmulss  .LCPI18_1(%rip), %xmm2, %xmm4
  2773          vcvttss2si      %xmm4, %r10d
  2774          setb    %r9b
  2775          vroundss        $11, %xmm4, %xmm4, %xmm4
        # imm 11 = round toward zero: xmm4 = float(trunc(|x| * 4/pi))
  2776          movl    %r10d, %eax
  2777          andl    $1, %eax
  2778          je      .LBB18_8
  2779          vaddss  %xmm3, %xmm4, %xmm4
        # odd j: bump the float quadrant count by 1.0 to make it even
  2780  .LBB18_8:                               #   in Loop: Header=BB18_5 Depth=1
        # Fold (j + parity) & 7 into a 0..3 quadrant (r10d) plus a
        # half-circle flip flag (r11b).
  2781          addl    %r10d, %eax
  2782          andl    $7, %eax
  2783          leal    -4(%rax), %r10d
  2784          cmpl    $4, %eax
  2785          setae   %r11b
  2786          cmovbl  %eax, %r10d
  2787          vfmadd231ss     .LCPI18_4(%rip), %xmm4, %xmm2 # xmm2 = (xmm4 * mem) + xmm2
        # xmm2 = z = |x| - j*(pi/4)
  2788          vmulss  %xmm2, %xmm2, %xmm7
        # xmm7 = z^2; evaluate both polynomials (cos -> xmm4, sin -> xmm7)
  2789          vmovaps %xmm8, %xmm12
  2790          vfmadd213ss     %xmm11, %xmm7, %xmm12   # xmm12 = (xmm7 * xmm12) + xmm11
  2791          vfmadd213ss     %xmm13, %xmm7, %xmm12   # xmm12 = (xmm7 * xmm12) + xmm13
  2792          vmulss  %xmm7, %xmm7, %xmm9
  2793          vmovaps %xmm3, %xmm4
  2794          vfmadd231ss     %xmm14, %xmm7, %xmm4    # xmm4 = (xmm7 * xmm14) + xmm4
  2795          vfmadd231ss     %xmm9, %xmm12, %xmm4    # xmm4 = (xmm12 * xmm9) + xmm4
  2796          vmovaps %xmm10, %xmm5
  2797          vfmadd213ss     %xmm15, %xmm7, %xmm5    # xmm5 = (xmm7 * xmm5) + xmm15
  2798          vfmadd213ss     %xmm6, %xmm7, %xmm5     # xmm5 = (xmm7 * xmm5) + xmm6
  2799          vmulss  %xmm2, %xmm7, %xmm7
  2800          vfmadd213ss     %xmm2, %xmm5, %xmm7     # xmm7 = (xmm5 * xmm7) + xmm2
  2801          leal    -1(%r10), %ebx
  2802          cmpl    $2, %ebx
  2803          jb      .LBB18_9
        # quadrant 0 or 3: rdi gets the sin poly, rsi the cos poly
  2804          vmovaps %xmm7, %xmm2
  2805          vmovss  %xmm2, (%rdi,%r8,4)
  2806          vmovss  %xmm4, (%rsi,%r8,4)
  2807          cmpb    %r11b, %r9b
  2808          jne     .LBB18_12
  2809          jmp     .LBB18_13
  2810  .LBB18_9:                               #   in Loop: Header=BB18_5 Depth=1
        # quadrant 1 or 2: the two polynomials swap destinations
  2811          vmovaps %xmm4, %xmm2
  2812          vmovaps %xmm7, %xmm4
  2813          vmovss  %xmm2, (%rdi,%r8,4)
  2814          vmovss  %xmm4, (%rsi,%r8,4)
  2815          cmpb    %r11b, %r9b
  2816          je      .LBB18_13
  2817  .LBB18_12:                              #   in Loop: Header=BB18_5 Depth=1
        # input sign and half-circle flag disagree: negate the rdi result
  2818          vmovss  (%rdi,%r8,4), %xmm2             # xmm2 = mem[0],zero,zero,zero
  2819          vxorps  %xmm0, %xmm2, %xmm2
  2820          vmovss  %xmm2, (%rdi,%r8,4)
  2821  .LBB18_13:                              #   in Loop: Header=BB18_5 Depth=1
        # independent sign decision for the rsi result:
        # negate when (quadrant >= 2) != (j&7 >= 4)
  2822          cmpl    $2, %r10d
  2823          setae   %bl
  2824          cmpl    $4, %eax
  2825          setae   %al
  2826          cmpb    %bl, %al
  2827          je      .LBB18_15
  2828          vmovss  (%rsi,%r8,4), %xmm2             # xmm2 = mem[0],zero,zero,zero
  2829          vxorps  %xmm0, %xmm2, %xmm2
  2830          vmovss  %xmm2, (%rsi,%r8,4)
  2831          jmp     .LBB18_15
  2832  .LBB18_16:
  2833          addq    $96, %rsp
  2834          popq    %rbx
  2835          vzeroupper
        # required before returning to SSE/scalar callers after 256-bit AVX use
  2836          retq