gitee.com/quant1x/gox@v1.7.6/num/asm/_avx2/special.s (about) 1 Sqrt_F64_V(double*, unsigned long): # @Sqrt_F64_V(double*, unsigned long) 2 testq %rsi, %rsi 3 je .LBB0_7 4 cmpq $4, %rsi 5 jae .LBB0_3 6 xorl %eax, %eax 7 jmp .LBB0_6 8 .LBB0_3: 9 movq %rsi, %rax 10 andq $-4, %rax 11 xorl %ecx, %ecx 12 .LBB0_4: # =>This Inner Loop Header: Depth=1 13 vsqrtpd (%rdi,%rcx,8), %ymm0 14 vmovupd %ymm0, (%rdi,%rcx,8) 15 addq $4, %rcx 16 cmpq %rcx, %rax 17 jne .LBB0_4 18 cmpq %rsi, %rax 19 je .LBB0_7 20 .LBB0_6: # =>This Inner Loop Header: Depth=1 21 vmovsd (%rdi,%rax,8), %xmm0 # xmm0 = mem[0],zero 22 vsqrtsd %xmm0, %xmm0, %xmm0 23 vmovsd %xmm0, (%rdi,%rax,8) 24 incq %rax 25 cmpq %rax, %rsi 26 jne .LBB0_6 27 .LBB0_7: 28 vzeroupper 29 retq 30 .LCPI1_0: 31 .long 0xc0400000 # float -3 32 .LCPI1_1: 33 .long 0xbf000000 # float -0.5 34 Sqrt_F32_V(float*, unsigned long): # @Sqrt_F32_V(float*, unsigned long) 35 testq %rsi, %rsi 36 je .LBB1_7 37 cmpq $32, %rsi 38 jae .LBB1_3 39 xorl %eax, %eax 40 jmp .LBB1_6 41 .LBB1_3: 42 movq %rsi, %rax 43 andq $-32, %rax 44 xorl %ecx, %ecx 45 vbroadcastss .LCPI1_0(%rip), %ymm0 # ymm0 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] 46 vbroadcastss .LCPI1_1(%rip), %ymm1 # ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 47 vxorps %xmm2, %xmm2, %xmm2 48 .LBB1_4: # =>This Inner Loop Header: Depth=1 49 vmovups (%rdi,%rcx,4), %ymm3 50 vmovups 32(%rdi,%rcx,4), %ymm4 51 vmovups 64(%rdi,%rcx,4), %ymm5 52 vrsqrtps %ymm3, %ymm6 53 vmovups 96(%rdi,%rcx,4), %ymm7 54 vmulps %ymm6, %ymm3, %ymm8 55 vfmadd213ps %ymm0, %ymm8, %ymm6 # ymm6 = (ymm8 * ymm6) + ymm0 56 vmulps %ymm1, %ymm8, %ymm8 57 vmulps %ymm6, %ymm8, %ymm6 58 vrsqrtps %ymm4, %ymm8 59 vcmpneqps %ymm2, %ymm3, %ymm3 60 vandps %ymm6, %ymm3, %ymm3 61 vmulps %ymm4, %ymm8, %ymm6 62 vfmadd213ps %ymm0, %ymm6, %ymm8 # ymm8 = (ymm6 * ymm8) + ymm0 63 vmulps %ymm1, %ymm6, %ymm6 64 vmulps %ymm6, %ymm8, %ymm6 65 vcmpneqps %ymm2, %ymm4, %ymm4 66 vandps %ymm6, %ymm4, %ymm4 67 
vrsqrtps %ymm5, %ymm6 68 vmulps %ymm6, %ymm5, %ymm8 69 vfmadd213ps %ymm0, %ymm8, %ymm6 # ymm6 = (ymm8 * ymm6) + ymm0 70 vmulps %ymm1, %ymm8, %ymm8 71 vmulps %ymm6, %ymm8, %ymm6 72 vcmpneqps %ymm2, %ymm5, %ymm5 73 vandps %ymm6, %ymm5, %ymm5 74 vrsqrtps %ymm7, %ymm6 75 vmulps %ymm6, %ymm7, %ymm8 76 vfmadd213ps %ymm0, %ymm8, %ymm6 # ymm6 = (ymm8 * ymm6) + ymm0 77 vmulps %ymm1, %ymm8, %ymm8 78 vmulps %ymm6, %ymm8, %ymm6 79 vcmpneqps %ymm2, %ymm7, %ymm7 80 vandps %ymm6, %ymm7, %ymm6 81 vmovups %ymm3, (%rdi,%rcx,4) 82 vmovups %ymm4, 32(%rdi,%rcx,4) 83 vmovups %ymm5, 64(%rdi,%rcx,4) 84 vmovups %ymm6, 96(%rdi,%rcx,4) 85 addq $32, %rcx 86 cmpq %rcx, %rax 87 jne .LBB1_4 88 cmpq %rsi, %rax 89 je .LBB1_7 90 .LBB1_6: # =>This Inner Loop Header: Depth=1 91 vmovss (%rdi,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero 92 vsqrtss %xmm0, %xmm0, %xmm0 93 vmovss %xmm0, (%rdi,%rax,4) 94 incq %rax 95 cmpq %rax, %rsi 96 jne .LBB1_6 97 .LBB1_7: 98 vzeroupper 99 retq 100 .LCPI2_0: 101 .quad 0x8000000000000000 # double -0 102 .LCPI2_1: 103 .quad 0x3fdfffffffffffff # double 0.49999999999999994 104 .LCPI2_2: 105 .quad 0x8000000000000000 # double -0 106 .quad 0x8000000000000000 # double -0 107 Round_F64_V(double*, unsigned long): # @Round_F64_V(double*, unsigned long) 108 testq %rsi, %rsi 109 je .LBB2_8 110 cmpq $16, %rsi 111 jae .LBB2_3 112 xorl %eax, %eax 113 jmp .LBB2_6 114 .LBB2_3: 115 movq %rsi, %rax 116 andq $-16, %rax 117 xorl %ecx, %ecx 118 vbroadcastsd .LCPI2_0(%rip), %ymm0 # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 119 vbroadcastsd .LCPI2_1(%rip), %ymm1 # ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] 120 .LBB2_4: # =>This Inner Loop Header: Depth=1 121 vmovupd (%rdi,%rcx,8), %ymm2 122 vmovupd 32(%rdi,%rcx,8), %ymm3 123 vmovupd 64(%rdi,%rcx,8), %ymm4 124 vmovupd 96(%rdi,%rcx,8), %ymm5 125 vandpd %ymm0, %ymm2, %ymm6 126 vorpd %ymm1, %ymm6, %ymm6 127 vaddpd %ymm6, %ymm2, %ymm2 128 vroundpd $11, %ymm2, %ymm2 129 vandpd %ymm0, %ymm3, %ymm6 
130 vorpd %ymm1, %ymm6, %ymm6 131 vaddpd %ymm6, %ymm3, %ymm3 132 vroundpd $11, %ymm3, %ymm3 133 vandpd %ymm0, %ymm4, %ymm6 134 vorpd %ymm1, %ymm6, %ymm6 135 vaddpd %ymm6, %ymm4, %ymm4 136 vroundpd $11, %ymm4, %ymm4 137 vandpd %ymm0, %ymm5, %ymm6 138 vorpd %ymm1, %ymm6, %ymm6 139 vaddpd %ymm6, %ymm5, %ymm5 140 vroundpd $11, %ymm5, %ymm5 141 vmovupd %ymm2, (%rdi,%rcx,8) 142 vmovupd %ymm3, 32(%rdi,%rcx,8) 143 vmovupd %ymm4, 64(%rdi,%rcx,8) 144 vmovupd %ymm5, 96(%rdi,%rcx,8) 145 addq $16, %rcx 146 cmpq %rcx, %rax 147 jne .LBB2_4 148 cmpq %rsi, %rax 149 je .LBB2_8 150 .LBB2_6: 151 vmovapd .LCPI2_2(%rip), %xmm0 # xmm0 = [-0.0E+0,-0.0E+0] 152 vmovddup .LCPI2_1(%rip), %xmm1 # xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1] 153 .LBB2_7: # =>This Inner Loop Header: Depth=1 154 vmovsd (%rdi,%rax,8), %xmm2 # xmm2 = mem[0],zero 155 vandpd %xmm0, %xmm2, %xmm3 156 vorpd %xmm1, %xmm3, %xmm3 157 vaddsd %xmm3, %xmm2, %xmm2 158 vroundsd $11, %xmm2, %xmm2, %xmm2 159 vmovsd %xmm2, (%rdi,%rax,8) 160 incq %rax 161 cmpq %rax, %rsi 162 jne .LBB2_7 163 .LBB2_8: 164 vzeroupper 165 retq 166 .LCPI3_0: 167 .long 0x80000000 # float -0 168 .LCPI3_1: 169 .long 0x3effffff # float 0.49999997 170 Round_F32_V(float*, unsigned long): # @Round_F32_V(float*, unsigned long) 171 testq %rsi, %rsi 172 je .LBB3_8 173 cmpq $32, %rsi 174 jae .LBB3_3 175 xorl %eax, %eax 176 jmp .LBB3_6 177 .LBB3_3: 178 movq %rsi, %rax 179 andq $-32, %rax 180 xorl %ecx, %ecx 181 vbroadcastss .LCPI3_0(%rip), %ymm0 # ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 182 vbroadcastss .LCPI3_1(%rip), %ymm1 # ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] 183 .LBB3_4: # =>This Inner Loop Header: Depth=1 184 vmovups (%rdi,%rcx,4), %ymm2 185 vmovups 32(%rdi,%rcx,4), %ymm3 186 vmovups 64(%rdi,%rcx,4), %ymm4 187 vmovups 96(%rdi,%rcx,4), %ymm5 188 vandps %ymm0, %ymm2, %ymm6 189 vorps %ymm1, %ymm6, %ymm6 190 vaddps %ymm6, %ymm2, %ymm2 191 vroundps 
$11, %ymm2, %ymm2 192 vandps %ymm0, %ymm3, %ymm6 193 vorps %ymm1, %ymm6, %ymm6 194 vaddps %ymm6, %ymm3, %ymm3 195 vroundps $11, %ymm3, %ymm3 196 vandps %ymm0, %ymm4, %ymm6 197 vorps %ymm1, %ymm6, %ymm6 198 vaddps %ymm6, %ymm4, %ymm4 199 vroundps $11, %ymm4, %ymm4 200 vandps %ymm0, %ymm5, %ymm6 201 vorps %ymm1, %ymm6, %ymm6 202 vaddps %ymm6, %ymm5, %ymm5 203 vroundps $11, %ymm5, %ymm5 204 vmovups %ymm2, (%rdi,%rcx,4) 205 vmovups %ymm3, 32(%rdi,%rcx,4) 206 vmovups %ymm4, 64(%rdi,%rcx,4) 207 vmovups %ymm5, 96(%rdi,%rcx,4) 208 addq $32, %rcx 209 cmpq %rcx, %rax 210 jne .LBB3_4 211 cmpq %rsi, %rax 212 je .LBB3_8 213 .LBB3_6: 214 vbroadcastss .LCPI3_0(%rip), %xmm0 # xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 215 vbroadcastss .LCPI3_1(%rip), %xmm1 # xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] 216 .LBB3_7: # =>This Inner Loop Header: Depth=1 217 vmovss (%rdi,%rax,4), %xmm2 # xmm2 = mem[0],zero,zero,zero 218 vandps %xmm0, %xmm2, %xmm3 219 vorps %xmm1, %xmm3, %xmm3 220 vaddss %xmm3, %xmm2, %xmm2 221 vroundss $11, %xmm2, %xmm2, %xmm2 222 vmovss %xmm2, (%rdi,%rax,4) 223 incq %rax 224 cmpq %rax, %rsi 225 jne .LBB3_7 226 .LBB3_8: 227 vzeroupper 228 retq 229 Floor_F64_V(double*, unsigned long): # @Floor_F64_V(double*, unsigned long) 230 testq %rsi, %rsi 231 je .LBB4_7 232 cmpq $16, %rsi 233 jae .LBB4_3 234 xorl %eax, %eax 235 jmp .LBB4_6 236 .LBB4_3: 237 movq %rsi, %rax 238 andq $-16, %rax 239 xorl %ecx, %ecx 240 .LBB4_4: # =>This Inner Loop Header: Depth=1 241 vroundpd $9, (%rdi,%rcx,8), %ymm0 242 vroundpd $9, 32(%rdi,%rcx,8), %ymm1 243 vroundpd $9, 64(%rdi,%rcx,8), %ymm2 244 vroundpd $9, 96(%rdi,%rcx,8), %ymm3 245 vmovupd %ymm0, (%rdi,%rcx,8) 246 vmovupd %ymm1, 32(%rdi,%rcx,8) 247 vmovupd %ymm2, 64(%rdi,%rcx,8) 248 vmovupd %ymm3, 96(%rdi,%rcx,8) 249 addq $16, %rcx 250 cmpq %rcx, %rax 251 jne .LBB4_4 252 cmpq %rsi, %rax 253 je .LBB4_7 254 .LBB4_6: # =>This Inner Loop Header: Depth=1 255 vmovsd (%rdi,%rax,8), %xmm0 # xmm0 = mem[0],zero 256 vroundsd $9, %xmm0, 
%xmm0, %xmm0 257 vmovsd %xmm0, (%rdi,%rax,8) 258 incq %rax 259 cmpq %rax, %rsi 260 jne .LBB4_6 261 .LBB4_7: 262 vzeroupper 263 retq 264 Floor_F32_V(float*, unsigned long): # @Floor_F32_V(float*, unsigned long) 265 testq %rsi, %rsi 266 je .LBB5_7 267 cmpq $32, %rsi 268 jae .LBB5_3 269 xorl %eax, %eax 270 jmp .LBB5_6 271 .LBB5_3: 272 movq %rsi, %rax 273 andq $-32, %rax 274 xorl %ecx, %ecx 275 .LBB5_4: # =>This Inner Loop Header: Depth=1 276 vroundps $9, (%rdi,%rcx,4), %ymm0 277 vroundps $9, 32(%rdi,%rcx,4), %ymm1 278 vroundps $9, 64(%rdi,%rcx,4), %ymm2 279 vroundps $9, 96(%rdi,%rcx,4), %ymm3 280 vmovups %ymm0, (%rdi,%rcx,4) 281 vmovups %ymm1, 32(%rdi,%rcx,4) 282 vmovups %ymm2, 64(%rdi,%rcx,4) 283 vmovups %ymm3, 96(%rdi,%rcx,4) 284 addq $32, %rcx 285 cmpq %rcx, %rax 286 jne .LBB5_4 287 cmpq %rsi, %rax 288 je .LBB5_7 289 .LBB5_6: # =>This Inner Loop Header: Depth=1 290 vmovss (%rdi,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero 291 vroundss $9, %xmm0, %xmm0, %xmm0 292 vmovss %xmm0, (%rdi,%rax,4) 293 incq %rax 294 cmpq %rax, %rsi 295 jne .LBB5_6 296 .LBB5_7: 297 vzeroupper 298 retq 299 Ceil_F64_V(double*, unsigned long): # @Ceil_F64_V(double*, unsigned long) 300 testq %rsi, %rsi 301 je .LBB6_7 302 cmpq $16, %rsi 303 jae .LBB6_3 304 xorl %eax, %eax 305 jmp .LBB6_6 306 .LBB6_3: 307 movq %rsi, %rax 308 andq $-16, %rax 309 xorl %ecx, %ecx 310 .LBB6_4: # =>This Inner Loop Header: Depth=1 311 vroundpd $10, (%rdi,%rcx,8), %ymm0 312 vroundpd $10, 32(%rdi,%rcx,8), %ymm1 313 vroundpd $10, 64(%rdi,%rcx,8), %ymm2 314 vroundpd $10, 96(%rdi,%rcx,8), %ymm3 315 vmovupd %ymm0, (%rdi,%rcx,8) 316 vmovupd %ymm1, 32(%rdi,%rcx,8) 317 vmovupd %ymm2, 64(%rdi,%rcx,8) 318 vmovupd %ymm3, 96(%rdi,%rcx,8) 319 addq $16, %rcx 320 cmpq %rcx, %rax 321 jne .LBB6_4 322 cmpq %rsi, %rax 323 je .LBB6_7 324 .LBB6_6: # =>This Inner Loop Header: Depth=1 325 vmovsd (%rdi,%rax,8), %xmm0 # xmm0 = mem[0],zero 326 vroundsd $10, %xmm0, %xmm0, %xmm0 327 vmovsd %xmm0, (%rdi,%rax,8) 328 incq %rax 329 cmpq %rax, %rsi 330 
jne .LBB6_6 331 .LBB6_7: 332 vzeroupper 333 retq 334 Ceil_F32_V(float*, unsigned long): # @Ceil_F32_V(float*, unsigned long) 335 testq %rsi, %rsi 336 je .LBB7_7 337 cmpq $32, %rsi 338 jae .LBB7_3 339 xorl %eax, %eax 340 jmp .LBB7_6 341 .LBB7_3: 342 movq %rsi, %rax 343 andq $-32, %rax 344 xorl %ecx, %ecx 345 .LBB7_4: # =>This Inner Loop Header: Depth=1 346 vroundps $10, (%rdi,%rcx,4), %ymm0 347 vroundps $10, 32(%rdi,%rcx,4), %ymm1 348 vroundps $10, 64(%rdi,%rcx,4), %ymm2 349 vroundps $10, 96(%rdi,%rcx,4), %ymm3 350 vmovups %ymm0, (%rdi,%rcx,4) 351 vmovups %ymm1, 32(%rdi,%rcx,4) 352 vmovups %ymm2, 64(%rdi,%rcx,4) 353 vmovups %ymm3, 96(%rdi,%rcx,4) 354 addq $32, %rcx 355 cmpq %rcx, %rax 356 jne .LBB7_4 357 cmpq %rsi, %rax 358 je .LBB7_7 359 .LBB7_6: # =>This Inner Loop Header: Depth=1 360 vmovss (%rdi,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero 361 vroundss $10, %xmm0, %xmm0, %xmm0 362 vmovss %xmm0, (%rdi,%rax,4) 363 incq %rax 364 cmpq %rax, %rsi 365 jne .LBB7_6 366 .LBB7_7: 367 vzeroupper 368 retq 369 .LCPI8_0: 370 .quad 9223372036854775807 # 0x7fffffffffffffff 371 .LCPI8_3: 372 .quad 0x3fe6a09e667f3bcd # double 0.70710678118654757 373 .LCPI8_4: 374 .quad 0xbff0000000000000 # double -1 375 .LCPI8_5: 376 .quad 0x401a509f46f4fa53 # double 6.5787325942061043 377 .LCPI8_6: 378 .quad 0x3fdfe818a0fe1a83 # double 0.49854102823193375 379 .LCPI8_7: 380 .quad 0x3f07bc0962b395ca # double 4.5270000862445198E-5 381 .LCPI8_8: 382 .quad 0x404e798eb86c3351 # double 60.94966798098779 383 .LCPI8_9: 384 .quad 0x403de9738b8cb9c9 # double 29.911919328553072 385 .LCPI8_10: 386 .quad 0x40340a202d99830a # double 20.039553499201283 387 .LCPI8_11: 388 .quad 0x404c8e7597479a10 # double 57.112963590585537 389 .LCPI8_12: 390 .quad 0x4054c30b52213498 # double 83.047565967967216 391 .LCPI8_13: 392 .quad 0x402e20359e903e37 # double 15.062909083469192 393 .LCPI8_14: 394 .quad 0x407351945dc908a5 # double 309.09872225312057 395 .LCPI8_15: 396 .quad 0x406bb86590fcfb56 # double 221.76239823732857 397 
.LCPI8_16: 398 .quad 0x404e0f304466448e # double 60.118660497603841 399 .LCPI8_17: 400 .quad 0x406b0db13e48e066 # double 216.42788614495947 401 .LCPI8_18: 402 .quad 4841369599423283200 # 0x4330000000000000 403 .LCPI8_19: 404 .quad 0xc3300000000003ff # double -4503599627371519 405 .LCPI8_20: 406 .quad 0x3ff0000000000000 # double 1 407 .LCPI8_21: 408 .quad 0xbfe0000000000000 # double -0.5 409 .LCPI8_22: 410 .quad 0x3fe0000000000000 # double 0.5 411 .LCPI8_23: 412 .quad 0x3ff71547652b82fe # double 1.4426950408889634 413 .LCPI8_24: 414 .quad 0xbfe62e4000000000 # double -0.693145751953125 415 .LCPI8_25: 416 .quad 0x3eb7f7d1cf79abca # double 1.4286068203094173E-6 417 .LCPI8_26: 418 .quad 0x3fe62e42fefa39ef # double 0.69314718055994529 419 .LCPI8_27: 420 .quad 0x3e21eed8eff8d898 # double 2.08767569878681E-9 421 .LCPI8_28: 422 .quad 0x3de6124613a86d09 # double 1.6059043836821613E-10 423 .LCPI8_29: 424 .quad 0x3e927e4fb7789f5c # double 2.7557319223985888E-7 425 .LCPI8_30: 426 .quad 0x3e5ae64567f544e4 # double 2.505210838544172E-8 427 .LCPI8_31: 428 .quad 0x3efa01a01a01a01a # double 2.4801587301587302E-5 429 .LCPI8_32: 430 .quad 0x3ec71de3a556c734 # double 2.7557319223985893E-6 431 .LCPI8_33: 432 .quad 0x3f56c16c16c16c17 # double 0.0013888888888888889 433 .LCPI8_34: 434 .quad 0x3f2a01a01a01a01a # double 1.9841269841269841E-4 435 .LCPI8_35: 436 .quad 0x3fa5555555555555 # double 0.041666666666666664 437 .LCPI8_36: 438 .quad 0x3f81111111111111 # double 0.0083333333333333332 439 .LCPI8_37: 440 .quad 0x3fc5555555555555 # double 0.16666666666666666 441 .LCPI8_38: 442 .quad 2046 # 0x7fe 443 .LCPI8_39: 444 .quad 0x40a7700000000000 # double 3000 445 .LCPI8_40: 446 .quad 1 # 0x1 447 .LCPI8_41: 448 .quad 0xc0a7700000000000 # double -3000 449 .LCPI8_42: 450 .quad 9218868437227405312 # 0x7ff0000000000000 451 .LCPI8_43: 452 .quad 0x7ff8002040000000 # double NaN 453 .LCPI8_1: 454 .quad 4503599627370495 # 0xfffffffffffff 455 .quad 4503599627370495 # 0xfffffffffffff 456 .LCPI8_2: 457 .quad 
4602678819172646912 # 0x3fe0000000000000 458 .quad 4602678819172646912 # 0x3fe0000000000000 459 Pow_4x_F64_V(double*, double*, unsigned long): # @Pow_4x_F64_V(double*, double*, unsigned long) 460 subq $1192, %rsp # imm = 0x4A8 461 andq $-4, %rdx 462 je .LBB8_11 463 xorl %r8d, %r8d 464 vbroadcastsd .LCPI8_0(%rip), %ymm0 # ymm0 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] 465 vmovups %ymm0, 512(%rsp) # 32-byte Spill 466 vbroadcastsd .LCPI8_3(%rip), %ymm0 # ymm0 = [7.0710678118654757E-1,7.0710678118654757E-1,7.0710678118654757E-1,7.0710678118654757E-1] 467 vmovups %ymm0, 1120(%rsp) # 32-byte Spill 468 vpxor %xmm6, %xmm6, %xmm6 469 vbroadcastsd .LCPI8_4(%rip), %ymm0 # ymm0 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] 470 vmovups %ymm0, 1088(%rsp) # 32-byte Spill 471 vbroadcastsd .LCPI8_5(%rip), %ymm0 # ymm0 = [6.5787325942061043E+0,6.5787325942061043E+0,6.5787325942061043E+0,6.5787325942061043E+0] 472 vmovups %ymm0, 1056(%rsp) # 32-byte Spill 473 vbroadcastsd .LCPI8_6(%rip), %ymm0 # ymm0 = [4.9854102823193375E-1,4.9854102823193375E-1,4.9854102823193375E-1,4.9854102823193375E-1] 474 vmovups %ymm0, 1024(%rsp) # 32-byte Spill 475 vbroadcastsd .LCPI8_7(%rip), %ymm0 # ymm0 = [4.5270000862445198E-5,4.5270000862445198E-5,4.5270000862445198E-5,4.5270000862445198E-5] 476 vmovups %ymm0, 992(%rsp) # 32-byte Spill 477 vbroadcastsd .LCPI8_8(%rip), %ymm0 # ymm0 = [6.094966798098779E+1,6.094966798098779E+1,6.094966798098779E+1,6.094966798098779E+1] 478 vmovups %ymm0, 960(%rsp) # 32-byte Spill 479 vbroadcastsd .LCPI8_9(%rip), %ymm0 # ymm0 = [2.9911919328553072E+1,2.9911919328553072E+1,2.9911919328553072E+1,2.9911919328553072E+1] 480 vmovups %ymm0, 928(%rsp) # 32-byte Spill 481 vbroadcastsd .LCPI8_10(%rip), %ymm0 # ymm0 = [2.0039553499201283E+1,2.0039553499201283E+1,2.0039553499201283E+1,2.0039553499201283E+1] 482 vmovups %ymm0, 896(%rsp) # 32-byte Spill 483 vbroadcastsd .LCPI8_11(%rip), %ymm0 # ymm0 = 
[5.7112963590585537E+1,5.7112963590585537E+1,5.7112963590585537E+1,5.7112963590585537E+1] 484 vmovups %ymm0, 864(%rsp) # 32-byte Spill 485 vbroadcastsd .LCPI8_12(%rip), %ymm0 # ymm0 = [8.3047565967967216E+1,8.3047565967967216E+1,8.3047565967967216E+1,8.3047565967967216E+1] 486 vmovups %ymm0, 832(%rsp) # 32-byte Spill 487 vbroadcastsd .LCPI8_13(%rip), %ymm0 # ymm0 = [1.5062909083469192E+1,1.5062909083469192E+1,1.5062909083469192E+1,1.5062909083469192E+1] 488 vmovups %ymm0, 800(%rsp) # 32-byte Spill 489 vbroadcastsd .LCPI8_14(%rip), %ymm0 # ymm0 = [3.0909872225312057E+2,3.0909872225312057E+2,3.0909872225312057E+2,3.0909872225312057E+2] 490 vmovups %ymm0, 768(%rsp) # 32-byte Spill 491 vbroadcastsd .LCPI8_15(%rip), %ymm0 # ymm0 = [2.2176239823732857E+2,2.2176239823732857E+2,2.2176239823732857E+2,2.2176239823732857E+2] 492 vmovups %ymm0, 736(%rsp) # 32-byte Spill 493 vbroadcastsd .LCPI8_16(%rip), %ymm0 # ymm0 = [6.0118660497603841E+1,6.0118660497603841E+1,6.0118660497603841E+1,6.0118660497603841E+1] 494 vmovups %ymm0, 704(%rsp) # 32-byte Spill 495 vbroadcastsd .LCPI8_17(%rip), %ymm0 # ymm0 = [2.1642788614495947E+2,2.1642788614495947E+2,2.1642788614495947E+2,2.1642788614495947E+2] 496 vmovups %ymm0, 672(%rsp) # 32-byte Spill 497 vbroadcastsd .LCPI8_18(%rip), %ymm0 # ymm0 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] 498 vmovups %ymm0, 640(%rsp) # 32-byte Spill 499 vbroadcastsd .LCPI8_19(%rip), %ymm0 # ymm0 = [-4.503599627371519E+15,-4.503599627371519E+15,-4.503599627371519E+15,-4.503599627371519E+15] 500 vmovups %ymm0, 608(%rsp) # 32-byte Spill 501 vbroadcastsd .LCPI8_20(%rip), %ymm0 # ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 502 vmovups %ymm0, -128(%rsp) # 32-byte Spill 503 vbroadcastsd .LCPI8_21(%rip), %ymm0 # ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 504 vmovups %ymm0, 576(%rsp) # 32-byte Spill 505 vbroadcastsd .LCPI8_22(%rip), %ymm0 # ymm0 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] 506 vmovups %ymm0, 544(%rsp) # 32-byte Spill 507 vbroadcastsd 
.LCPI8_23(%rip), %ymm0 # ymm0 = [1.4426950408889634E+0,1.4426950408889634E+0,1.4426950408889634E+0,1.4426950408889634E+0] 508 vmovups %ymm0, 480(%rsp) # 32-byte Spill 509 vbroadcastsd .LCPI8_24(%rip), %ymm0 # ymm0 = [-6.93145751953125E-1,-6.93145751953125E-1,-6.93145751953125E-1,-6.93145751953125E-1] 510 vmovups %ymm0, 448(%rsp) # 32-byte Spill 511 vbroadcastsd .LCPI8_25(%rip), %ymm0 # ymm0 = [1.4286068203094173E-6,1.4286068203094173E-6,1.4286068203094173E-6,1.4286068203094173E-6] 512 vmovups %ymm0, 416(%rsp) # 32-byte Spill 513 vbroadcastsd .LCPI8_26(%rip), %ymm0 # ymm0 = [6.9314718055994529E-1,6.9314718055994529E-1,6.9314718055994529E-1,6.9314718055994529E-1] 514 vmovups %ymm0, 384(%rsp) # 32-byte Spill 515 vbroadcastsd .LCPI8_27(%rip), %ymm0 # ymm0 = [2.08767569878681E-9,2.08767569878681E-9,2.08767569878681E-9,2.08767569878681E-9] 516 vmovups %ymm0, 352(%rsp) # 32-byte Spill 517 vbroadcastsd .LCPI8_28(%rip), %ymm0 # ymm0 = [1.6059043836821613E-10,1.6059043836821613E-10,1.6059043836821613E-10,1.6059043836821613E-10] 518 vmovups %ymm0, 320(%rsp) # 32-byte Spill 519 vbroadcastsd .LCPI8_29(%rip), %ymm0 # ymm0 = [2.7557319223985888E-7,2.7557319223985888E-7,2.7557319223985888E-7,2.7557319223985888E-7] 520 vmovups %ymm0, 288(%rsp) # 32-byte Spill 521 vbroadcastsd .LCPI8_30(%rip), %ymm0 # ymm0 = [2.505210838544172E-8,2.505210838544172E-8,2.505210838544172E-8,2.505210838544172E-8] 522 vmovups %ymm0, 256(%rsp) # 32-byte Spill 523 vbroadcastsd .LCPI8_31(%rip), %ymm0 # ymm0 = [2.4801587301587302E-5,2.4801587301587302E-5,2.4801587301587302E-5,2.4801587301587302E-5] 524 vmovups %ymm0, 224(%rsp) # 32-byte Spill 525 vbroadcastsd .LCPI8_32(%rip), %ymm0 # ymm0 = [2.7557319223985893E-6,2.7557319223985893E-6,2.7557319223985893E-6,2.7557319223985893E-6] 526 vmovups %ymm0, 192(%rsp) # 32-byte Spill 527 vbroadcastsd .LCPI8_33(%rip), %ymm0 # ymm0 = [1.3888888888888889E-3,1.3888888888888889E-3,1.3888888888888889E-3,1.3888888888888889E-3] 528 vmovups %ymm0, 160(%rsp) # 32-byte Spill 529 
vbroadcastsd .LCPI8_34(%rip), %ymm0 # ymm0 = [1.9841269841269841E-4,1.9841269841269841E-4,1.9841269841269841E-4,1.9841269841269841E-4] 530 vmovups %ymm0, 128(%rsp) # 32-byte Spill 531 vbroadcastsd .LCPI8_35(%rip), %ymm0 # ymm0 = [4.1666666666666664E-2,4.1666666666666664E-2,4.1666666666666664E-2,4.1666666666666664E-2] 532 vmovups %ymm0, 96(%rsp) # 32-byte Spill 533 vbroadcastsd .LCPI8_36(%rip), %ymm0 # ymm0 = [8.3333333333333332E-3,8.3333333333333332E-3,8.3333333333333332E-3,8.3333333333333332E-3] 534 vmovups %ymm0, 64(%rsp) # 32-byte Spill 535 vbroadcastsd .LCPI8_37(%rip), %ymm0 # ymm0 = [1.6666666666666666E-1,1.6666666666666666E-1,1.6666666666666666E-1,1.6666666666666666E-1] 536 vmovups %ymm0, 32(%rsp) # 32-byte Spill 537 vbroadcastsd .LCPI8_38(%rip), %ymm0 # ymm0 = [2046,2046,2046,2046] 538 vmovups %ymm0, (%rsp) # 32-byte Spill 539 vbroadcastsd .LCPI8_39(%rip), %ymm0 # ymm0 = [3.0E+3,3.0E+3,3.0E+3,3.0E+3] 540 vmovups %ymm0, -32(%rsp) # 32-byte Spill 541 vbroadcastsd .LCPI8_40(%rip), %ymm0 # ymm0 = [1,1,1,1] 542 vmovups %ymm0, -64(%rsp) # 32-byte Spill 543 vbroadcastsd .LCPI8_41(%rip), %ymm0 # ymm0 = [-3.0E+3,-3.0E+3,-3.0E+3,-3.0E+3] 544 vmovupd %ymm0, -96(%rsp) # 32-byte Spill 545 vpbroadcastq .LCPI8_42(%rip), %ymm5 # ymm5 = [9218868437227405312,9218868437227405312,9218868437227405312,9218868437227405312] 546 vbroadcastsd .LCPI8_42(%rip), %ymm10 # ymm10 = [9218868437227405312,9218868437227405312,9218868437227405312,9218868437227405312] 547 jmp .LBB8_2 548 .LBB8_10: # in Loop: Header=BB8_2 Depth=1 549 vmovupd %ymm2, (%rdi,%r8,8) 550 addq $4, %r8 551 cmpq %rdx, %r8 552 jae .LBB8_11 553 .LBB8_2: # =>This Inner Loop Header: Depth=1 554 vmovapd %ymm10, %ymm9 555 vmovdqu (%rdi,%r8,8), %ymm13 556 vmovupd (%rsi,%r8,8), %ymm12 557 vpand 512(%rsp), %ymm13, %ymm10 # 32-byte Folded Reload 558 vmovapd .LCPI8_1(%rip), %xmm1 # xmm1 = [4503599627370495,4503599627370495] 559 vandpd (%rdi,%r8,8), %xmm1, %xmm2 560 vmovapd .LCPI8_2(%rip), %xmm0 # xmm0 = 
[4602678819172646912,4602678819172646912] 561 vorpd %xmm0, %xmm2, %xmm2 562 vandpd 16(%rdi,%r8,8), %xmm1, %xmm3 563 vorpd %xmm0, %xmm3, %xmm3 564 vinsertf128 $1, %xmm3, %ymm2, %ymm3 565 vmovupd 1120(%rsp), %ymm0 # 32-byte Reload 566 vcmpltpd %ymm3, %ymm0, %ymm2 567 vandnpd %ymm3, %ymm2, %ymm4 568 vaddpd 1088(%rsp), %ymm3, %ymm3 # 32-byte Folded Reload 569 vaddpd %ymm4, %ymm3, %ymm4 570 vmulpd %ymm4, %ymm4, %ymm3 571 vmulpd %ymm3, %ymm3, %ymm7 572 vmovupd 1024(%rsp), %ymm8 # 32-byte Reload 573 vfmadd213pd 1056(%rsp), %ymm4, %ymm8 # 32-byte Folded Reload 574 vfmadd231pd 992(%rsp), %ymm3, %ymm8 # 32-byte Folded Reload 575 vmovupd 928(%rsp), %ymm11 # 32-byte Reload 576 vfmadd213pd 960(%rsp), %ymm4, %ymm11 # 32-byte Folded Reload 577 vmovupd 864(%rsp), %ymm14 # 32-byte Reload 578 vfmadd213pd 896(%rsp), %ymm4, %ymm14 # 32-byte Folded Reload 579 vfmadd231pd %ymm11, %ymm3, %ymm14 # ymm14 = (ymm3 * ymm11) + ymm14 580 vfmadd231pd %ymm8, %ymm7, %ymm14 # ymm14 = (ymm7 * ymm8) + ymm14 581 vmulpd %ymm4, %ymm3, %ymm8 582 vmulpd %ymm14, %ymm8, %ymm8 583 vaddpd 832(%rsp), %ymm3, %ymm11 # 32-byte Folded Reload 584 vfmadd231pd 800(%rsp), %ymm4, %ymm11 # 32-byte Folded Reload 585 vmovupd 736(%rsp), %ymm14 # 32-byte Reload 586 vfmadd213pd 768(%rsp), %ymm4, %ymm14 # 32-byte Folded Reload 587 vmovupd 672(%rsp), %ymm15 # 32-byte Reload 588 vfmadd213pd 704(%rsp), %ymm4, %ymm15 # 32-byte Folded Reload 589 vfmadd231pd %ymm14, %ymm3, %ymm15 # ymm15 = (ymm3 * ymm14) + ymm15 590 vfmadd231pd %ymm11, %ymm7, %ymm15 # ymm15 = (ymm7 * ymm11) + ymm15 591 vdivpd %ymm15, %ymm8, %ymm7 592 vmovdqu %ymm10, 1152(%rsp) # 32-byte Spill 593 vpsrlq $52, %ymm10, %ymm8 594 vpor 640(%rsp), %ymm8, %ymm8 # 32-byte Folded Reload 595 vaddpd 608(%rsp), %ymm8, %ymm8 # 32-byte Folded Reload 596 vmovupd -128(%rsp), %ymm0 # 32-byte Reload 597 vandpd %ymm0, %ymm2, %ymm2 598 vaddpd %ymm2, %ymm8, %ymm8 599 vmulpd %ymm12, %ymm8, %ymm2 600 vroundpd $8, %ymm2, %ymm2 601 vfnmadd213pd %ymm2, %ymm12, %ymm8 # ymm8 = -(ymm12 * ymm8) 
+ ymm2 602 vmovupd 576(%rsp), %ymm1 # 32-byte Reload 603 vmovapd %ymm1, %ymm11 604 vfmadd213pd %ymm4, %ymm3, %ymm11 # ymm11 = (ymm3 * ymm11) + ymm4 605 vaddpd %ymm7, %ymm11, %ymm11 606 vmovupd 544(%rsp), %ymm10 # 32-byte Reload 607 vmulpd %ymm4, %ymm10, %ymm14 608 vmulpd %ymm1, %ymm3, %ymm15 609 vfmadd231pd %ymm14, %ymm4, %ymm15 # ymm15 = (ymm4 * ymm14) + ymm15 610 vsubpd %ymm4, %ymm11, %ymm4 611 vfmadd231pd %ymm3, %ymm10, %ymm4 # ymm4 = (ymm10 * ymm3) + ymm4 612 vmovupd 480(%rsp), %ymm1 # 32-byte Reload 613 vmulpd %ymm1, %ymm12, %ymm3 614 vmulpd %ymm3, %ymm11, %ymm3 615 vroundpd $8, %ymm3, %ymm3 616 vmulpd 448(%rsp), %ymm3, %ymm14 # 32-byte Folded Reload 617 vfmadd231pd %ymm11, %ymm12, %ymm14 # ymm14 = (ymm12 * ymm11) + ymm14 618 vfmsub231pd 416(%rsp), %ymm3, %ymm14 # 32-byte Folded Reload 619 vmovupd 384(%rsp), %ymm11 # 32-byte Reload 620 vfmadd231pd %ymm8, %ymm11, %ymm14 # ymm14 = (ymm11 * ymm8) + ymm14 621 vsubpd %ymm7, %ymm15, %ymm7 622 vaddpd %ymm4, %ymm7, %ymm4 623 vfnmsub213pd %ymm14, %ymm12, %ymm4 # ymm4 = -(ymm12 * ymm4) - ymm14 624 vmulpd %ymm1, %ymm4, %ymm7 625 vroundpd $8, %ymm7, %ymm7 626 vfnmadd231pd %ymm11, %ymm7, %ymm4 # ymm4 = -(ymm7 * ymm11) + ymm4 627 vmulpd %ymm4, %ymm4, %ymm8 628 vmovupd 320(%rsp), %ymm11 # 32-byte Reload 629 vfmadd213pd 352(%rsp), %ymm4, %ymm11 # 32-byte Folded Reload 630 vmovupd 256(%rsp), %ymm14 # 32-byte Reload 631 vfmadd213pd 288(%rsp), %ymm4, %ymm14 # 32-byte Folded Reload 632 vmovupd 192(%rsp), %ymm15 # 32-byte Reload 633 vfmadd213pd 224(%rsp), %ymm4, %ymm15 # 32-byte Folded Reload 634 vfmadd231pd %ymm14, %ymm8, %ymm15 # ymm15 = (ymm8 * ymm14) + ymm15 635 vmovupd 128(%rsp), %ymm14 # 32-byte Reload 636 vfmadd213pd 160(%rsp), %ymm4, %ymm14 # 32-byte Folded Reload 637 vmovupd 64(%rsp), %ymm1 # 32-byte Reload 638 vfmadd213pd 96(%rsp), %ymm4, %ymm1 # 32-byte Folded Reload 639 vfmadd231pd %ymm14, %ymm8, %ymm1 # ymm1 = (ymm8 * ymm14) + ymm1 640 vmovupd 32(%rsp), %ymm14 # 32-byte Reload 641 vfmadd213pd %ymm10, %ymm4, %ymm14 # 
ymm14 = (ymm4 * ymm14) + ymm10 642 vfmadd213pd %ymm4, %ymm8, %ymm14 # ymm14 = (ymm8 * ymm14) + ymm4 643 vmulpd %ymm8, %ymm8, %ymm4 644 vfmadd231pd %ymm11, %ymm4, %ymm15 # ymm15 = (ymm4 * ymm11) + ymm15 645 vfmadd231pd %ymm1, %ymm4, %ymm14 # ymm14 = (ymm4 * ymm1) + ymm14 646 vmulpd %ymm4, %ymm4, %ymm1 647 vfmadd231pd %ymm15, %ymm1, %ymm14 # ymm14 = (ymm1 * ymm15) + ymm14 648 vaddpd %ymm0, %ymm14, %ymm1 649 vaddpd %ymm2, %ymm3, %ymm2 650 vaddpd %ymm7, %ymm2, %ymm15 651 vroundpd $8, %ymm15, %ymm2 652 vcvttsd2si %xmm2, %r9 653 vpermilpd $1, %xmm2, %xmm3 # xmm3 = xmm2[1,0] 654 vcvttsd2si %xmm3, %rax 655 vextractf128 $1, %ymm2, %xmm2 656 vcvttsd2si %xmm2, %rcx 657 vmovq %rcx, %xmm3 658 vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0] 659 vcvttsd2si %xmm2, %rcx 660 vmovq %rcx, %xmm2 661 vpunpcklqdq %xmm2, %xmm3, %xmm2 # xmm2 = xmm3[0],xmm2[0] 662 vmovq %r9, %xmm3 663 vmovq %rax, %xmm4 664 vpunpcklqdq %xmm4, %xmm3, %xmm3 # xmm3 = xmm3[0],xmm4[0] 665 vinserti128 $1, %xmm2, %ymm3, %ymm2 666 vpsrad $31, %ymm1, %ymm3 667 vpsrad $20, %ymm1, %ymm4 668 vpsrlq $32, %ymm4, %ymm4 669 vpblendd $170, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] 670 vpaddq %ymm3, %ymm2, %ymm4 671 vpcmpgtq (%rsp), %ymm4, %ymm3 # 32-byte Folded Reload 672 vmovupd -32(%rsp), %ymm0 # 32-byte Reload 673 vcmpltpd %ymm15, %ymm0, %ymm7 674 vpor %ymm7, %ymm3, %ymm3 675 vmovdqu -64(%rsp), %ymm0 # 32-byte Reload 676 vpcmpgtq %ymm4, %ymm0, %ymm4 677 vcmpltpd -96(%rsp), %ymm15, %ymm7 # 32-byte Folded Reload 678 vpor %ymm7, %ymm4, %ymm4 679 vpsllq $52, %ymm2, %ymm2 680 vpaddq %ymm1, %ymm2, %ymm2 681 vpor %ymm3, %ymm4, %ymm1 682 vptest %ymm1, %ymm1 683 jne .LBB8_3 684 vmovapd %ymm9, %ymm10 685 jmp .LBB8_5 686 .LBB8_3: # in Loop: Header=BB8_2 Depth=1 687 vpandn %ymm2, %ymm4, %ymm1 688 vmovapd %ymm9, %ymm10 689 vblendvpd %ymm3, %ymm9, %ymm1, %ymm2 690 .LBB8_5: # in Loop: Header=BB8_2 Depth=1 691 vpand %ymm5, %ymm13, %ymm11 692 vpcmpeqq %ymm6, %ymm11, %ymm4 693 vpsrad $31, 
%ymm13, %ymm1 694 vpshufd $245, %ymm1, %ymm7 # ymm7 = ymm1[1,1,3,3,5,5,7,7] 695 vcmpltpd %ymm6, %ymm12, %ymm14 696 vcmpeqpd %ymm6, %ymm12, %ymm3 697 vandpd -128(%rsp), %ymm3, %ymm1 # 32-byte Folded Reload 698 vblendvpd %ymm14, %ymm10, %ymm1, %ymm1 699 vblendvpd %ymm4, %ymm1, %ymm2, %ymm2 700 vptest %ymm7, %ymm7 701 jne .LBB8_7 702 vpxor %xmm7, %xmm7, %xmm7 703 jmp .LBB8_8 704 .LBB8_7: # in Loop: Header=BB8_2 Depth=1 705 vroundpd $8, %ymm12, %ymm1 706 vcmpeqpd %ymm1, %ymm12, %ymm8 707 vcvttsd2si %xmm1, %r9 708 vpermilpd $1, %xmm1, %xmm10 # xmm10 = xmm1[1,0] 709 vcvttsd2si %xmm10, %rcx 710 vextractf128 $1, %ymm1, %xmm1 711 vcvttsd2si %xmm1, %rax 712 vxorpd %xmm10, %xmm10, %xmm10 713 vmovq %rax, %xmm6 714 vpermilpd $1, %xmm1, %xmm1 # xmm1 = xmm1[1,0] 715 vcvttsd2si %xmm1, %rax 716 vmovq %rax, %xmm1 717 vpunpcklqdq %xmm1, %xmm6, %xmm1 # xmm1 = xmm6[0],xmm1[0] 718 vmovq %r9, %xmm6 719 vmovq %rcx, %xmm0 720 vpunpcklqdq %xmm0, %xmm6, %xmm0 # xmm0 = xmm6[0],xmm0[0] 721 vinserti128 $1, %xmm1, %ymm0, %ymm0 722 vpsllq $63, %ymm0, %ymm0 723 vpor %ymm2, %ymm0, %ymm1 724 vcmpeqpd %ymm10, %ymm13, %ymm6 725 vbroadcastsd .LCPI8_43(%rip), %ymm10 # ymm10 = [NaN,NaN,NaN,NaN] 726 vblendvpd %ymm6, %ymm2, %ymm10, %ymm6 727 vmovapd %ymm9, %ymm10 728 vblendvpd %ymm8, %ymm1, %ymm6, %ymm1 729 vxorpd %xmm6, %xmm6, %xmm6 730 vblendvpd %ymm7, %ymm1, %ymm2, %ymm2 731 vandpd %ymm0, %ymm8, %ymm7 732 .LBB8_8: # in Loop: Header=BB8_2 Depth=1 733 vpcmpeqd %ymm9, %ymm9, %ymm9 734 vandpd %ymm5, %ymm12, %ymm0 735 vandpd %ymm5, %ymm15, %ymm1 736 vpcmpeqq %ymm5, %ymm1, %ymm15 737 vpxor %ymm9, %ymm15, %ymm1 738 vpcmpeqq %ymm5, %ymm0, %ymm8 739 vpcmpeqq %ymm5, %ymm11, %ymm11 740 vpxor %ymm9, %ymm11, %ymm0 741 vpandn %ymm0, %ymm8, %ymm0 742 vpor %ymm4, %ymm1, %ymm1 743 vpand %ymm0, %ymm1, %ymm0 744 vptest %ymm9, %ymm0 745 jb .LBB8_10 746 vpxor %ymm9, %ymm8, %ymm0 747 vpandn %ymm0, %ymm15, %ymm0 748 vmovupd -128(%rsp), %ymm8 # 32-byte Reload 749 vmovupd 1152(%rsp), %ymm9 # 32-byte Reload 750 vcmpeqpd %ymm8, 
%ymm9, %ymm1 751 vcmpltpd %ymm9, %ymm8, %ymm4 752 vpsrad $31, %ymm12, %ymm6 753 vpxor %ymm4, %ymm6, %ymm4 754 vpxor %xmm6, %xmm6, %xmm6 755 vblendvpd %ymm4, %ymm10, %ymm6, %ymm4 756 vblendvpd %ymm1, %ymm8, %ymm4, %ymm1 757 vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 758 vandpd %ymm2, %ymm7, %ymm1 759 vandpd %ymm7, %ymm13, %ymm2 760 vorpd %ymm2, %ymm9, %ymm2 761 vblendvpd %ymm14, %ymm1, %ymm2, %ymm1 762 vblendvpd %ymm3, %ymm8, %ymm1, %ymm1 763 vblendvpd %ymm11, %ymm1, %ymm0, %ymm0 764 vcmpunordpd %ymm13, %ymm13, %ymm1 765 vcmpunordpd %ymm12, %ymm12, %ymm2 766 vorpd %ymm1, %ymm2, %ymm1 767 vaddpd %ymm13, %ymm12, %ymm2 768 vblendvpd %ymm1, %ymm2, %ymm0, %ymm2 769 jmp .LBB8_10 770 .LBB8_11: 771 addq $1192, %rsp # imm = 0x4A8 772 vzeroupper 773 retq 774 .LCPI9_0: 775 .long 2147483647 # 0x7fffffff 776 .LCPI9_3: 777 .long 0x3f3504f3 # float 0.707106769 778 .LCPI9_4: 779 .long 0xbf800000 # float -1 780 .LCPI9_5: 781 .long 0x3def251a # float 0.116769984 782 .LCPI9_6: 783 .long 0xbdebd1b8 # float -0.115146101 784 .LCPI9_7: 785 .long 0x3e11e9bf # float 0.142493233 786 .LCPI9_8: 787 .long 0xbdfe5d4f # float -0.12420141 788 .LCPI9_9: 789 .long 0x3e4cceac # float 0.200007141 790 .LCPI9_10: 791 .long 0xbe2aae50 # float -0.166680574 792 .LCPI9_11: 793 .long 0x3eaaaaaa # float 0.333333313 794 .LCPI9_12: 795 .long 0xbe7ffffc # float -0.24999994 796 .LCPI9_13: 797 .long 0x3d9021bb # float 0.0703768358 798 .LCPI9_15: 799 .long 0xcb00007f # float -8388735 800 .LCPI9_16: 801 .long 0x3f800000 # float 1 802 .LCPI9_17: 803 .long 0xbf000000 # float -0.5 804 .LCPI9_18: 805 .long 0x3f000000 # float 0.5 806 .LCPI9_19: 807 .long 0x3fb8aa3b # float 1.44269502 808 .LCPI9_20: 809 .long 0xbf318000 # float -0.693359375 810 .LCPI9_21: 811 .long 0xb95e8083 # float -2.12194442E-4 812 .LCPI9_22: 813 .long 0xbf317218 # float -0.693147182 814 .LCPI9_23: 815 .long 0x3d2aaaab # float 0.0416666679 816 .LCPI9_24: 817 .long 0x3c088889 # float 0.00833333377 818 .LCPI9_25: 819 .long 0x3ab60b61 # float 0.00138888892 820 
.LCPI9_26: 821 .long 0x39500d01 # float 1.98412701E-4 822 .LCPI9_27: 823 .long 0x3e2aaaab # float 0.166666672 824 .LCPI9_29: 825 .long 254 # 0xfe 826 .LCPI9_30: 827 .long 0x43960000 # float 300 828 .LCPI9_31: 829 .long 1 # 0x1 830 .LCPI9_32: 831 .long 0xc3960000 # float -300 832 .LCPI9_33: 833 .long 2139095040 # 0x7f800000 834 .LCPI9_34: 835 .long 0x7fc00102 # float NaN 836 .LCPI9_1: 837 .quad 36028792732385279 # 0x7fffff007fffff 838 .quad 36028792732385279 # 0x7fffff007fffff 839 .LCPI9_2: 840 .quad 4539628425446424576 # 0x3f0000003f000000 841 .quad 4539628425446424576 # 0x3f0000003f000000 842 .LCPI9_14: 843 .quad 5404319554102886400 # 0x4b0000004b000000 844 .LCPI9_28: 845 .byte 255 # 0xff 846 .byte 0 # 0x0 847 .byte 0 # 0x0 848 .byte 0 # 0x0 849 .byte 255 # 0xff 850 .byte 0 # 0x0 851 .byte 0 # 0x0 852 .byte 0 # 0x0 853 .byte 255 # 0xff 854 .byte 0 # 0x0 855 .byte 0 # 0x0 856 .byte 0 # 0x0 857 .byte 255 # 0xff 858 .byte 0 # 0x0 859 .byte 0 # 0x0 860 .byte 0 # 0x0 861 .byte 255 # 0xff 862 .byte 0 # 0x0 863 .byte 0 # 0x0 864 .byte 0 # 0x0 865 .byte 255 # 0xff 866 .byte 0 # 0x0 867 .byte 0 # 0x0 868 .byte 0 # 0x0 869 .byte 255 # 0xff 870 .byte 0 # 0x0 871 .byte 0 # 0x0 872 .byte 0 # 0x0 873 .byte 255 # 0xff 874 .byte 0 # 0x0 875 .byte 0 # 0x0 876 .byte 0 # 0x0 877 Pow_8x_F32_V(float*, float*, unsigned long): # @Pow_8x_F32_V(float*, float*, unsigned long) 878 subq $872, %rsp # imm = 0x368 879 andq $-8, %rdx 880 je .LBB9_12 881 xorl %eax, %eax 882 vbroadcastss .LCPI9_0(%rip), %ymm0 # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] 883 vmovups %ymm0, 320(%rsp) # 32-byte Spill 884 vbroadcastss .LCPI9_3(%rip), %ymm0 # ymm0 = [7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1] 885 vmovups %ymm0, 800(%rsp) # 32-byte Spill 886 vpxor %xmm7, %xmm7, %xmm7 887 vbroadcastss .LCPI9_4(%rip), %ymm0 # ymm0 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] 
888 vmovups %ymm0, 768(%rsp) # 32-byte Spill 889 vbroadcastss .LCPI9_5(%rip), %ymm0 # ymm0 = [1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1] 890 vmovups %ymm0, 736(%rsp) # 32-byte Spill 891 vbroadcastss .LCPI9_6(%rip), %ymm0 # ymm0 = [-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1] 892 vmovups %ymm0, 704(%rsp) # 32-byte Spill 893 vbroadcastss .LCPI9_7(%rip), %ymm0 # ymm0 = [1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1] 894 vmovups %ymm0, 672(%rsp) # 32-byte Spill 895 vbroadcastss .LCPI9_8(%rip), %ymm0 # ymm0 = [-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1] 896 vmovups %ymm0, 640(%rsp) # 32-byte Spill 897 vbroadcastss .LCPI9_9(%rip), %ymm0 # ymm0 = [2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1] 898 vmovups %ymm0, 608(%rsp) # 32-byte Spill 899 vbroadcastss .LCPI9_10(%rip), %ymm0 # ymm0 = [-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1] 900 vmovups %ymm0, 576(%rsp) # 32-byte Spill 901 vbroadcastss .LCPI9_11(%rip), %ymm0 # ymm0 = [3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1] 902 vmovups %ymm0, 544(%rsp) # 32-byte Spill 903 vbroadcastss .LCPI9_12(%rip), %ymm0 # ymm0 = [-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1] 904 vmovups %ymm0, 512(%rsp) # 32-byte Spill 905 vbroadcastss .LCPI9_13(%rip), %ymm0 # ymm0 = [7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2] 906 vmovups %ymm0, 480(%rsp) # 32-byte Spill 907 vbroadcastsd .LCPI9_14(%rip), %ymm0 # ymm0 = 
[5404319554102886400,5404319554102886400,5404319554102886400,5404319554102886400] 908 vmovups %ymm0, 448(%rsp) # 32-byte Spill 909 vbroadcastss .LCPI9_15(%rip), %ymm0 # ymm0 = [-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6] 910 vmovups %ymm0, 416(%rsp) # 32-byte Spill 911 vbroadcastss .LCPI9_16(%rip), %ymm0 # ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 912 vmovups %ymm0, -128(%rsp) # 32-byte Spill 913 vbroadcastss .LCPI9_17(%rip), %ymm0 # ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 914 vmovups %ymm0, 384(%rsp) # 32-byte Spill 915 vbroadcastss .LCPI9_18(%rip), %ymm0 # ymm0 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1] 916 vmovups %ymm0, 352(%rsp) # 32-byte Spill 917 vbroadcastss .LCPI9_19(%rip), %ymm0 # ymm0 = [1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0] 918 vmovups %ymm0, 288(%rsp) # 32-byte Spill 919 vbroadcastss .LCPI9_20(%rip), %ymm0 # ymm0 = [-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1] 920 vmovups %ymm0, 256(%rsp) # 32-byte Spill 921 vbroadcastss .LCPI9_21(%rip), %ymm0 # ymm0 = [-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4] 922 vmovups %ymm0, 224(%rsp) # 32-byte Spill 923 vbroadcastss .LCPI9_22(%rip), %ymm0 # ymm0 = [-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1] 924 vmovups %ymm0, 192(%rsp) # 32-byte Spill 925 vbroadcastss .LCPI9_23(%rip), %ymm0 # ymm0 = [4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2] 926 vmovups %ymm0, 160(%rsp) # 32-byte Spill 927 vbroadcastss .LCPI9_24(%rip), %ymm0 # ymm0 = 
[8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3] 928 vmovups %ymm0, 128(%rsp) # 32-byte Spill 929 vbroadcastss .LCPI9_25(%rip), %ymm0 # ymm0 = [1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3] 930 vmovups %ymm0, 96(%rsp) # 32-byte Spill 931 vbroadcastss .LCPI9_26(%rip), %ymm0 # ymm0 = [1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4] 932 vmovups %ymm0, 64(%rsp) # 32-byte Spill 933 vbroadcastss .LCPI9_27(%rip), %ymm0 # ymm0 = [1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1] 934 vmovups %ymm0, 32(%rsp) # 32-byte Spill 935 vbroadcastss .LCPI9_29(%rip), %ymm0 # ymm0 = [254,254,254,254,254,254,254,254] 936 vmovups %ymm0, (%rsp) # 32-byte Spill 937 vbroadcastss .LCPI9_30(%rip), %ymm0 # ymm0 = [3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2] 938 vmovups %ymm0, -32(%rsp) # 32-byte Spill 939 vbroadcastss .LCPI9_31(%rip), %ymm0 # ymm0 = [1,1,1,1,1,1,1,1] 940 vmovups %ymm0, -64(%rsp) # 32-byte Spill 941 vpbroadcastd .LCPI9_32(%rip), %ymm0 # ymm0 = [-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2] 942 vmovdqu %ymm0, -96(%rsp) # 32-byte Spill 943 vpbroadcastd .LCPI9_33(%rip), %ymm8 # ymm8 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040] 944 vbroadcastss .LCPI9_33(%rip), %ymm12 # ymm12 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040] 945 jmp .LBB9_2 946 .LBB9_10: # in Loop: Header=BB9_2 Depth=1 947 vpxor %ymm0, %ymm15, %ymm0 948 vpandn %ymm0, %ymm14, %ymm0 949 vmovups -128(%rsp), %ymm14 # 32-byte Reload 950 vmovups 832(%rsp), %ymm2 # 32-byte Reload 951 vcmpeqps %ymm2, %ymm14, %ymm3 952 vcmpltps %ymm2, %ymm14, %ymm4 953 vxorps %ymm4, %ymm11, %ymm4 954 vpxor %xmm7, %xmm7, %xmm7 955 vblendvps %ymm4, %ymm12, %ymm7, %ymm4 
956 vblendvps %ymm3, %ymm14, %ymm4, %ymm3 957 vblendvps %ymm0, %ymm6, %ymm3, %ymm0 958 vandps %ymm6, %ymm10, %ymm3 959 vandps %ymm9, %ymm10, %ymm4 960 vorps %ymm2, %ymm4, %ymm4 961 vblendvps %ymm13, %ymm3, %ymm4, %ymm3 962 vblendvps %ymm1, %ymm14, %ymm3, %ymm1 963 vblendvps %ymm5, %ymm0, %ymm1, %ymm0 964 vcmpunordps %ymm9, %ymm9, %ymm1 965 vcmpunordps %ymm11, %ymm11, %ymm3 966 vorps %ymm1, %ymm3, %ymm1 967 vaddps %ymm9, %ymm11, %ymm3 968 vblendvps %ymm1, %ymm3, %ymm0, %ymm6 969 vmovups %ymm6, (%rdi,%rax,4) 970 addq $8, %rax 971 cmpq %rdx, %rax 972 jae .LBB9_12 973 .LBB9_2: # =>This Inner Loop Header: Depth=1 974 vmovaps %ymm12, %ymm2 975 vmovdqu (%rdi,%rax,4), %ymm9 976 vmovups (%rsi,%rax,4), %ymm11 977 vpand 320(%rsp), %ymm9, %ymm12 # 32-byte Folded Reload 978 vmovaps .LCPI9_1(%rip), %xmm1 # xmm1 = [36028792732385279,36028792732385279] 979 vandps (%rdi,%rax,4), %xmm1, %xmm0 980 vmovaps .LCPI9_2(%rip), %xmm3 # xmm3 = [4539628425446424576,4539628425446424576] 981 vorps %xmm3, %xmm0, %xmm0 982 vandps 16(%rdi,%rax,4), %xmm1, %xmm1 983 vorps %xmm3, %xmm1, %xmm1 984 vinsertf128 $1, %xmm1, %ymm0, %ymm0 985 vmovups 800(%rsp), %ymm1 # 32-byte Reload 986 vcmpltps %ymm0, %ymm1, %ymm1 987 vandnps %ymm0, %ymm1, %ymm4 988 vaddps 768(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 989 vaddps %ymm4, %ymm0, %ymm4 990 vmulps %ymm4, %ymm4, %ymm6 991 vmulps %ymm6, %ymm6, %ymm0 992 vmovups 704(%rsp), %ymm5 # 32-byte Reload 993 vfmadd213ps 736(%rsp), %ymm4, %ymm5 # 32-byte Folded Reload 994 vmovups 640(%rsp), %ymm10 # 32-byte Reload 995 vfmadd213ps 672(%rsp), %ymm4, %ymm10 # 32-byte Folded Reload 996 vfmadd231ps %ymm5, %ymm6, %ymm10 # ymm10 = (ymm6 * ymm5) + ymm10 997 vmovups 576(%rsp), %ymm5 # 32-byte Reload 998 vfmadd213ps 608(%rsp), %ymm4, %ymm5 # 32-byte Folded Reload 999 vmovups 512(%rsp), %ymm13 # 32-byte Reload 1000 vfmadd213ps 544(%rsp), %ymm4, %ymm13 # 32-byte Folded Reload 1001 vmulps %ymm0, %ymm0, %ymm14 1002 vfmadd132ps 480(%rsp), %ymm13, %ymm14 # 32-byte Folded Reload 1003 
vfmadd231ps %ymm5, %ymm6, %ymm14 # ymm14 = (ymm6 * ymm5) + ymm14 1004 vfmadd231ps %ymm10, %ymm0, %ymm14 # ymm14 = (ymm0 * ymm10) + ymm14 1005 vmulps %ymm4, %ymm6, %ymm0 1006 vmulps %ymm0, %ymm14, %ymm0 1007 vmovdqu %ymm12, 832(%rsp) # 32-byte Spill 1008 vpsrld $23, %ymm12, %ymm5 1009 vpor 448(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload 1010 vaddps 416(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload 1011 vmovups -128(%rsp), %ymm3 # 32-byte Reload 1012 vandps %ymm3, %ymm1, %ymm1 1013 vaddps %ymm1, %ymm5, %ymm5 1014 vmulps %ymm5, %ymm11, %ymm1 1015 vroundps $8, %ymm1, %ymm1 1016 vfnmadd213ps %ymm1, %ymm11, %ymm5 # ymm5 = -(ymm11 * ymm5) + ymm1 1017 vmovups 384(%rsp), %ymm14 # 32-byte Reload 1018 vmovaps %ymm14, %ymm10 1019 vfmadd213ps %ymm4, %ymm6, %ymm10 # ymm10 = (ymm6 * ymm10) + ymm4 1020 vaddps %ymm0, %ymm10, %ymm10 1021 vmovups 352(%rsp), %ymm12 # 32-byte Reload 1022 vmulps %ymm4, %ymm12, %ymm13 1023 vmulps %ymm6, %ymm14, %ymm14 1024 vfmadd231ps %ymm13, %ymm4, %ymm14 # ymm14 = (ymm4 * ymm13) + ymm14 1025 vsubps %ymm4, %ymm10, %ymm4 1026 vfmadd231ps %ymm6, %ymm12, %ymm4 # ymm4 = (ymm12 * ymm6) + ymm4 1027 vmovups 288(%rsp), %ymm15 # 32-byte Reload 1028 vmulps %ymm15, %ymm11, %ymm6 1029 vmulps %ymm6, %ymm10, %ymm6 1030 vroundps $8, %ymm6, %ymm6 1031 vmulps 256(%rsp), %ymm6, %ymm13 # 32-byte Folded Reload 1032 vfmadd231ps %ymm10, %ymm11, %ymm13 # ymm13 = (ymm11 * ymm10) + ymm13 1033 vfnmadd231ps 224(%rsp), %ymm6, %ymm13 # 32-byte Folded Reload 1034 vsubps %ymm0, %ymm14, %ymm0 1035 vaddps %ymm4, %ymm0, %ymm0 1036 vmovups 192(%rsp), %ymm10 # 32-byte Reload 1037 vmulps %ymm5, %ymm10, %ymm4 1038 vfnmadd231ps %ymm0, %ymm11, %ymm4 # ymm4 = -(ymm11 * ymm0) + ymm4 1039 vaddps %ymm4, %ymm13, %ymm0 1040 vmulps %ymm0, %ymm15, %ymm4 1041 vroundps $8, %ymm4, %ymm4 1042 vfmadd231ps %ymm10, %ymm4, %ymm0 # ymm0 = (ymm4 * ymm10) + ymm0 1043 vmulps %ymm0, %ymm0, %ymm5 1044 vmulps %ymm5, %ymm5, %ymm10 1045 vmovups 64(%rsp), %ymm13 # 32-byte Reload 1046 vfmadd213ps 96(%rsp), %ymm0, %ymm13 
# 32-byte Folded Reload 1047 vmovups 32(%rsp), %ymm14 # 32-byte Reload 1048 vfmadd213ps %ymm12, %ymm0, %ymm14 # ymm14 = (ymm0 * ymm14) + ymm12 1049 vfmadd231ps %ymm13, %ymm10, %ymm14 # ymm14 = (ymm10 * ymm13) + ymm14 1050 vmovups 128(%rsp), %ymm10 # 32-byte Reload 1051 vfmadd213ps 160(%rsp), %ymm0, %ymm10 # 32-byte Folded Reload 1052 vfmadd231ps %ymm10, %ymm5, %ymm14 # ymm14 = (ymm5 * ymm10) + ymm14 1053 vaddps %ymm3, %ymm0, %ymm10 1054 vfmadd231ps %ymm14, %ymm5, %ymm10 # ymm10 = (ymm5 * ymm14) + ymm10 1055 vaddps %ymm1, %ymm6, %ymm0 1056 vaddps %ymm4, %ymm0, %ymm14 1057 vcvtps2dq %ymm14, %ymm4 1058 vpsrld $23, %ymm10, %ymm0 1059 vpand .LCPI9_28(%rip), %ymm0, %ymm0 1060 vpaddd %ymm4, %ymm0, %ymm0 1061 vpcmpgtd (%rsp), %ymm0, %ymm1 # 32-byte Folded Reload 1062 vmovups -32(%rsp), %ymm3 # 32-byte Reload 1063 vcmpltps %ymm14, %ymm3, %ymm5 1064 vpor %ymm5, %ymm1, %ymm1 1065 vmovdqu -64(%rsp), %ymm3 # 32-byte Reload 1066 vpcmpgtd %ymm0, %ymm3, %ymm0 1067 vcmpltps -96(%rsp), %ymm14, %ymm5 # 32-byte Folded Reload 1068 vpor %ymm5, %ymm0, %ymm0 1069 vpslld $23, %ymm4, %ymm4 1070 vpaddd %ymm4, %ymm10, %ymm6 1071 vpor %ymm1, %ymm0, %ymm4 1072 vtestps %ymm4, %ymm4 1073 jne .LBB9_3 1074 vpcmpeqd %ymm15, %ymm15, %ymm15 1075 vmovaps %ymm2, %ymm12 1076 jmp .LBB9_5 1077 .LBB9_3: # in Loop: Header=BB9_2 Depth=1 1078 vpandn %ymm6, %ymm0, %ymm0 1079 vmovaps %ymm2, %ymm12 1080 vblendvps %ymm1, %ymm2, %ymm0, %ymm6 1081 vpcmpeqd %ymm15, %ymm15, %ymm15 1082 .LBB9_5: # in Loop: Header=BB9_2 Depth=1 1083 vpand %ymm8, %ymm9, %ymm5 1084 vpcmpeqd %ymm7, %ymm5, %ymm4 1085 vcmpltps %ymm7, %ymm11, %ymm13 1086 vcmpeqps %ymm7, %ymm11, %ymm1 1087 vandps -128(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload 1088 vblendvps %ymm13, %ymm12, %ymm0, %ymm0 1089 vblendvps %ymm4, %ymm0, %ymm6, %ymm6 1090 vmovmskps %ymm9, %ecx 1091 testl %ecx, %ecx 1092 jne .LBB9_7 1093 vxorps %xmm10, %xmm10, %xmm10 1094 jmp .LBB9_8 1095 .LBB9_7: # in Loop: Header=BB9_2 Depth=1 1096 vroundps $8, %ymm11, %ymm0 1097 vcmpeqps %ymm0, 
%ymm11, %ymm0 1098 vcvtps2dq %ymm11, %ymm10 1099 vpslld $31, %ymm10, %ymm10 1100 vpor %ymm6, %ymm10, %ymm12 1101 vpxor %xmm3, %xmm3, %xmm3 1102 vcmpeqps %ymm3, %ymm9, %ymm7 1103 vbroadcastss .LCPI9_34(%rip), %ymm3 # ymm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] 1104 vblendvps %ymm7, %ymm6, %ymm3, %ymm3 1105 vblendvps %ymm0, %ymm12, %ymm3, %ymm3 1106 vmovaps %ymm2, %ymm12 1107 vpsrad $31, %ymm9, %ymm7 1108 vblendvps %ymm7, %ymm3, %ymm6, %ymm6 1109 vandps %ymm0, %ymm10, %ymm10 1110 .LBB9_8: # in Loop: Header=BB9_2 Depth=1 1111 vpcmpeqd %ymm5, %ymm8, %ymm0 1112 vpxor %ymm0, %ymm15, %ymm5 1113 vandps %ymm8, %ymm11, %ymm0 1114 vandps %ymm8, %ymm14, %ymm3 1115 vpcmpeqd %ymm3, %ymm8, %ymm14 1116 vpxor %ymm15, %ymm14, %ymm3 1117 vpcmpeqd %ymm0, %ymm8, %ymm0 1118 vpandn %ymm5, %ymm0, %ymm7 1119 vpor %ymm4, %ymm3, %ymm3 1120 vpand %ymm7, %ymm3, %ymm3 1121 vtestps %ymm15, %ymm3 1122 jae .LBB9_10 1123 vpxor %xmm7, %xmm7, %xmm7 1124 vmovups %ymm6, (%rdi,%rax,4) 1125 addq $8, %rax 1126 cmpq %rdx, %rax 1127 jb .LBB9_2 1128 .LBB9_12: 1129 addq $872, %rsp # imm = 0x368 1130 vzeroupper 1131 retq 1132 .LCPI10_0: 1133 .quad 0x3ff71547652b82fe # double 1.4426950408889634 1134 .LCPI10_1: 1135 .quad 9218868437227405312 # 0x7ff0000000000000 1136 .LCPI10_2: 1137 .quad 0x3ff0000000000000 # double 1 1138 .LCPI10_3: 1139 .quad 9223372036854775807 # 0x7fffffffffffffff 1140 .LCPI10_6: 1141 .quad 0x3fe6a09e667f3bcd # double 0.70710678118654757 1142 .LCPI10_7: 1143 .quad 0xbff0000000000000 # double -1 1144 .LCPI10_8: 1145 .quad 0x401a509f46f4fa53 # double 6.5787325942061043 1146 .LCPI10_9: 1147 .quad 0x3fdfe818a0fe1a83 # double 0.49854102823193375 1148 .LCPI10_10: 1149 .quad 0x3f07bc0962b395ca # double 4.5270000862445198E-5 1150 .LCPI10_11: 1151 .quad 0x404e798eb86c3351 # double 60.94966798098779 1152 .LCPI10_12: 1153 .quad 0x403de9738b8cb9c9 # double 29.911919328553072 1154 .LCPI10_13: 1155 .quad 0x40340a202d99830a # double 20.039553499201283 1156 .LCPI10_14: 1157 .quad 0x404c8e7597479a10 # double 
57.112963590585537 1158 .LCPI10_15: 1159 .quad 0x4054c30b52213498 # double 83.047565967967216 1160 .LCPI10_16: 1161 .quad 0x402e20359e903e37 # double 15.062909083469192 1162 .LCPI10_17: 1163 .quad 0x407351945dc908a5 # double 309.09872225312057 1164 .LCPI10_18: 1165 .quad 0x406bb86590fcfb56 # double 221.76239823732857 1166 .LCPI10_19: 1167 .quad 0x404e0f304466448e # double 60.118660497603841 1168 .LCPI10_20: 1169 .quad 0x406b0db13e48e066 # double 216.42788614495947 1170 .LCPI10_21: 1171 .quad 4841369599423283200 # 0x4330000000000000 1172 .LCPI10_22: 1173 .quad 0xc3300000000003ff # double -4503599627371519 1174 .LCPI10_23: 1175 .quad 0xbfe0000000000000 # double -0.5 1176 .LCPI10_24: 1177 .quad 0x3fe0000000000000 # double 0.5 1178 .LCPI10_25: 1179 .quad 0xbfe62e4000000000 # double -0.693145751953125 1180 .LCPI10_26: 1181 .quad 0x3eb7f7d1cf79abca # double 1.4286068203094173E-6 1182 .LCPI10_27: 1183 .quad 0x3fe62e42fefa39ef # double 0.69314718055994529 1184 .LCPI10_28: 1185 .quad 0x3e21eed8eff8d898 # double 2.08767569878681E-9 1186 .LCPI10_29: 1187 .quad 0x3de6124613a86d09 # double 1.6059043836821613E-10 1188 .LCPI10_30: 1189 .quad 0x3e927e4fb7789f5c # double 2.7557319223985888E-7 1190 .LCPI10_31: 1191 .quad 0x3e5ae64567f544e4 # double 2.505210838544172E-8 1192 .LCPI10_32: 1193 .quad 0x3efa01a01a01a01a # double 2.4801587301587302E-5 1194 .LCPI10_33: 1195 .quad 0x3ec71de3a556c734 # double 2.7557319223985893E-6 1196 .LCPI10_34: 1197 .quad 0x3f56c16c16c16c17 # double 0.0013888888888888889 1198 .LCPI10_35: 1199 .quad 0x3f2a01a01a01a01a # double 1.9841269841269841E-4 1200 .LCPI10_36: 1201 .quad 0x3fa5555555555555 # double 0.041666666666666664 1202 .LCPI10_37: 1203 .quad 0x3f81111111111111 # double 0.0083333333333333332 1204 .LCPI10_38: 1205 .quad 0x3fc5555555555555 # double 0.16666666666666666 1206 .LCPI10_39: 1207 .quad 2046 # 0x7fe 1208 .LCPI10_40: 1209 .quad 0x40a7700000000000 # double 3000 1210 .LCPI10_41: 1211 .quad 1 # 0x1 1212 .LCPI10_42: 1213 .quad 0xc0a7700000000000 
# double -3000 1214 .LCPI10_43: 1215 .quad 0x7ff8002040000000 # double NaN 1216 .LCPI10_4: 1217 .quad 4503599627370495 # 0xfffffffffffff 1218 .quad 4503599627370495 # 0xfffffffffffff 1219 .LCPI10_5: 1220 .quad 4602678819172646912 # 0x3fe0000000000000 1221 .quad 4602678819172646912 # 0x3fe0000000000000 1222 PowNumber_4x_F64_V(double*, double, unsigned long): # @PowNumber_4x_F64_V(double*, double, unsigned long) 1223 subq $1352, %rsp # imm = 0x548 1224 andq $-4, %rsi 1225 je .LBB10_10 1226 vbroadcastsd %xmm0, %ymm0 1227 vbroadcastsd .LCPI10_0(%rip), %ymm1 # ymm1 = [1.4426950408889634E+0,1.4426950408889634E+0,1.4426950408889634E+0,1.4426950408889634E+0] 1228 vbroadcastsd .LCPI10_1(%rip), %ymm2 # ymm2 = [9218868437227405312,9218868437227405312,9218868437227405312,9218868437227405312] 1229 vmovupd %ymm1, 1312(%rsp) # 32-byte Spill 1230 vmulpd %ymm1, %ymm0, %ymm1 1231 vmovupd %ymm1, 1280(%rsp) # 32-byte Spill 1232 vandpd %ymm2, %ymm0, %ymm1 1233 vmovupd %ymm1, 1248(%rsp) # 32-byte Spill 1234 vxorpd %xmm1, %xmm1, %xmm1 1235 vcmpltpd %ymm1, %ymm0, %ymm3 1236 vcmpeqpd %ymm1, %ymm0, %ymm4 1237 vbroadcastsd .LCPI10_2(%rip), %ymm1 # ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1238 vmovupd %ymm4, -64(%rsp) # 32-byte Spill 1239 vandpd %ymm1, %ymm4, %ymm1 1240 vbroadcastsd .LCPI10_1(%rip), %ymm4 # ymm4 = [9218868437227405312,9218868437227405312,9218868437227405312,9218868437227405312] 1241 vmovupd %ymm3, -32(%rsp) # 32-byte Spill 1242 vmovupd %ymm4, -128(%rsp) # 32-byte Spill 1243 vblendvpd %ymm3, %ymm4, %ymm1, %ymm1 1244 vmovupd %ymm1, 1216(%rsp) # 32-byte Spill 1245 vpsrad $31, %ymm0, %ymm1 1246 vpshufd $245, %ymm1, %ymm1 # ymm1 = ymm1[1,1,3,3,5,5,7,7] 1247 vmovdqu %ymm1, -96(%rsp) # 32-byte Spill 1248 xorl %r8d, %r8d 1249 vbroadcastsd .LCPI10_3(%rip), %ymm1 # ymm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] 1250 vmovups %ymm1, 1184(%rsp) # 32-byte Spill 1251 vbroadcastsd .LCPI10_6(%rip), %ymm1 # ymm1 = 
[7.0710678118654757E-1,7.0710678118654757E-1,7.0710678118654757E-1,7.0710678118654757E-1] 1252 vmovups %ymm1, 1152(%rsp) # 32-byte Spill 1253 vbroadcastsd .LCPI10_7(%rip), %ymm1 # ymm1 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] 1254 vmovups %ymm1, 1120(%rsp) # 32-byte Spill 1255 vbroadcastsd .LCPI10_8(%rip), %ymm1 # ymm1 = [6.5787325942061043E+0,6.5787325942061043E+0,6.5787325942061043E+0,6.5787325942061043E+0] 1256 vmovups %ymm1, 1088(%rsp) # 32-byte Spill 1257 vbroadcastsd .LCPI10_9(%rip), %ymm1 # ymm1 = [4.9854102823193375E-1,4.9854102823193375E-1,4.9854102823193375E-1,4.9854102823193375E-1] 1258 vmovups %ymm1, 1056(%rsp) # 32-byte Spill 1259 vbroadcastsd .LCPI10_10(%rip), %ymm1 # ymm1 = [4.5270000862445198E-5,4.5270000862445198E-5,4.5270000862445198E-5,4.5270000862445198E-5] 1260 vmovups %ymm1, 1024(%rsp) # 32-byte Spill 1261 vbroadcastsd .LCPI10_11(%rip), %ymm1 # ymm1 = [6.094966798098779E+1,6.094966798098779E+1,6.094966798098779E+1,6.094966798098779E+1] 1262 vmovups %ymm1, 992(%rsp) # 32-byte Spill 1263 vbroadcastsd .LCPI10_12(%rip), %ymm1 # ymm1 = [2.9911919328553072E+1,2.9911919328553072E+1,2.9911919328553072E+1,2.9911919328553072E+1] 1264 vmovups %ymm1, 960(%rsp) # 32-byte Spill 1265 vbroadcastsd .LCPI10_13(%rip), %ymm1 # ymm1 = [2.0039553499201283E+1,2.0039553499201283E+1,2.0039553499201283E+1,2.0039553499201283E+1] 1266 vmovups %ymm1, 928(%rsp) # 32-byte Spill 1267 vbroadcastsd .LCPI10_14(%rip), %ymm1 # ymm1 = [5.7112963590585537E+1,5.7112963590585537E+1,5.7112963590585537E+1,5.7112963590585537E+1] 1268 vmovups %ymm1, 896(%rsp) # 32-byte Spill 1269 vbroadcastsd .LCPI10_15(%rip), %ymm1 # ymm1 = [8.3047565967967216E+1,8.3047565967967216E+1,8.3047565967967216E+1,8.3047565967967216E+1] 1270 vmovups %ymm1, 864(%rsp) # 32-byte Spill 1271 vbroadcastsd .LCPI10_16(%rip), %ymm1 # ymm1 = [1.5062909083469192E+1,1.5062909083469192E+1,1.5062909083469192E+1,1.5062909083469192E+1] 1272 vmovups %ymm1, 832(%rsp) # 32-byte Spill 1273 vbroadcastsd .LCPI10_17(%rip), %ymm1 # ymm1 = 
[3.0909872225312057E+2,3.0909872225312057E+2,3.0909872225312057E+2,3.0909872225312057E+2] 1274 vmovups %ymm1, 800(%rsp) # 32-byte Spill 1275 vbroadcastsd .LCPI10_18(%rip), %ymm1 # ymm1 = [2.2176239823732857E+2,2.2176239823732857E+2,2.2176239823732857E+2,2.2176239823732857E+2] 1276 vmovups %ymm1, 768(%rsp) # 32-byte Spill 1277 vbroadcastsd .LCPI10_19(%rip), %ymm1 # ymm1 = [6.0118660497603841E+1,6.0118660497603841E+1,6.0118660497603841E+1,6.0118660497603841E+1] 1278 vmovups %ymm1, 736(%rsp) # 32-byte Spill 1279 vbroadcastsd .LCPI10_20(%rip), %ymm1 # ymm1 = [2.1642788614495947E+2,2.1642788614495947E+2,2.1642788614495947E+2,2.1642788614495947E+2] 1280 vmovups %ymm1, 704(%rsp) # 32-byte Spill 1281 vbroadcastsd .LCPI10_21(%rip), %ymm1 # ymm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] 1282 vmovups %ymm1, 672(%rsp) # 32-byte Spill 1283 vbroadcastsd .LCPI10_22(%rip), %ymm1 # ymm1 = [-4.503599627371519E+15,-4.503599627371519E+15,-4.503599627371519E+15,-4.503599627371519E+15] 1284 vmovups %ymm1, 640(%rsp) # 32-byte Spill 1285 vbroadcastsd .LCPI10_2(%rip), %ymm13 # ymm13 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1286 vbroadcastsd .LCPI10_23(%rip), %ymm1 # ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 1287 vmovups %ymm1, 608(%rsp) # 32-byte Spill 1288 vbroadcastsd .LCPI10_24(%rip), %ymm1 # ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] 1289 vmovups %ymm1, 576(%rsp) # 32-byte Spill 1290 vbroadcastsd .LCPI10_25(%rip), %ymm1 # ymm1 = [-6.93145751953125E-1,-6.93145751953125E-1,-6.93145751953125E-1,-6.93145751953125E-1] 1291 vmovups %ymm1, 544(%rsp) # 32-byte Spill 1292 vbroadcastsd .LCPI10_26(%rip), %ymm1 # ymm1 = [1.4286068203094173E-6,1.4286068203094173E-6,1.4286068203094173E-6,1.4286068203094173E-6] 1293 vmovups %ymm1, 512(%rsp) # 32-byte Spill 1294 vbroadcastsd .LCPI10_27(%rip), %ymm1 # ymm1 = [6.9314718055994529E-1,6.9314718055994529E-1,6.9314718055994529E-1,6.9314718055994529E-1] 1295 vmovups %ymm1, 480(%rsp) # 32-byte Spill 1296 vbroadcastsd .LCPI10_28(%rip), 
%ymm1 # ymm1 = [2.08767569878681E-9,2.08767569878681E-9,2.08767569878681E-9,2.08767569878681E-9] 1297 vmovups %ymm1, 448(%rsp) # 32-byte Spill 1298 vbroadcastsd .LCPI10_29(%rip), %ymm1 # ymm1 = [1.6059043836821613E-10,1.6059043836821613E-10,1.6059043836821613E-10,1.6059043836821613E-10] 1299 vmovups %ymm1, 416(%rsp) # 32-byte Spill 1300 vbroadcastsd .LCPI10_30(%rip), %ymm1 # ymm1 = [2.7557319223985888E-7,2.7557319223985888E-7,2.7557319223985888E-7,2.7557319223985888E-7] 1301 vmovups %ymm1, 384(%rsp) # 32-byte Spill 1302 vbroadcastsd .LCPI10_31(%rip), %ymm1 # ymm1 = [2.505210838544172E-8,2.505210838544172E-8,2.505210838544172E-8,2.505210838544172E-8] 1303 vmovups %ymm1, 352(%rsp) # 32-byte Spill 1304 vbroadcastsd .LCPI10_32(%rip), %ymm1 # ymm1 = [2.4801587301587302E-5,2.4801587301587302E-5,2.4801587301587302E-5,2.4801587301587302E-5] 1305 vmovups %ymm1, 320(%rsp) # 32-byte Spill 1306 vbroadcastsd .LCPI10_33(%rip), %ymm1 # ymm1 = [2.7557319223985893E-6,2.7557319223985893E-6,2.7557319223985893E-6,2.7557319223985893E-6] 1307 vmovups %ymm1, 288(%rsp) # 32-byte Spill 1308 vbroadcastsd .LCPI10_34(%rip), %ymm1 # ymm1 = [1.3888888888888889E-3,1.3888888888888889E-3,1.3888888888888889E-3,1.3888888888888889E-3] 1309 vmovups %ymm1, 256(%rsp) # 32-byte Spill 1310 vbroadcastsd .LCPI10_35(%rip), %ymm1 # ymm1 = [1.9841269841269841E-4,1.9841269841269841E-4,1.9841269841269841E-4,1.9841269841269841E-4] 1311 vmovups %ymm1, 224(%rsp) # 32-byte Spill 1312 vbroadcastsd .LCPI10_36(%rip), %ymm1 # ymm1 = [4.1666666666666664E-2,4.1666666666666664E-2,4.1666666666666664E-2,4.1666666666666664E-2] 1313 vmovups %ymm1, 192(%rsp) # 32-byte Spill 1314 vbroadcastsd .LCPI10_37(%rip), %ymm1 # ymm1 = [8.3333333333333332E-3,8.3333333333333332E-3,8.3333333333333332E-3,8.3333333333333332E-3] 1315 vmovups %ymm1, 160(%rsp) # 32-byte Spill 1316 vbroadcastsd .LCPI10_38(%rip), %ymm1 # ymm1 = [1.6666666666666666E-1,1.6666666666666666E-1,1.6666666666666666E-1,1.6666666666666666E-1] 1317 vmovups %ymm1, 128(%rsp) # 
32-byte Spill 1318 vbroadcastsd .LCPI10_39(%rip), %ymm1 # ymm1 = [2046,2046,2046,2046] 1319 vmovups %ymm1, 96(%rsp) # 32-byte Spill 1320 vbroadcastsd .LCPI10_40(%rip), %ymm1 # ymm1 = [3.0E+3,3.0E+3,3.0E+3,3.0E+3] 1321 vmovups %ymm1, 64(%rsp) # 32-byte Spill 1322 vbroadcastsd .LCPI10_41(%rip), %ymm1 # ymm1 = [1,1,1,1] 1323 vmovups %ymm1, 32(%rsp) # 32-byte Spill 1324 vbroadcastsd .LCPI10_42(%rip), %ymm1 # ymm1 = [-3.0E+3,-3.0E+3,-3.0E+3,-3.0E+3] 1325 vmovupd %ymm1, (%rsp) # 32-byte Spill 1326 jmp .LBB10_2 1327 .LBB10_9: # in Loop: Header=BB10_2 Depth=1 1328 vmovupd %ymm6, (%rdi,%r8,8) 1329 addq $4, %r8 1330 cmpq %rsi, %r8 1331 jae .LBB10_10 1332 .LBB10_2: # =>This Inner Loop Header: Depth=1 1333 vmovdqu (%rdi,%r8,8), %ymm15 1334 vmovapd .LCPI10_4(%rip), %xmm1 # xmm1 = [4503599627370495,4503599627370495] 1335 vandpd (%rdi,%r8,8), %xmm1, %xmm3 1336 vmovapd .LCPI10_5(%rip), %xmm5 # xmm5 = [4602678819172646912,4602678819172646912] 1337 vorpd %xmm5, %xmm3, %xmm3 1338 vandpd 16(%rdi,%r8,8), %xmm1, %xmm4 1339 vorpd %xmm5, %xmm4, %xmm4 1340 vinsertf128 $1, %xmm4, %ymm3, %ymm3 1341 vmovupd 1152(%rsp), %ymm1 # 32-byte Reload 1342 vcmpltpd %ymm3, %ymm1, %ymm6 1343 vandnpd %ymm3, %ymm6, %ymm4 1344 vaddpd 1120(%rsp), %ymm3, %ymm3 # 32-byte Folded Reload 1345 vaddpd %ymm4, %ymm3, %ymm8 1346 vmulpd %ymm8, %ymm8, %ymm4 1347 vmulpd %ymm4, %ymm4, %ymm3 1348 vmovupd 1056(%rsp), %ymm7 # 32-byte Reload 1349 vfmadd213pd 1088(%rsp), %ymm8, %ymm7 # 32-byte Folded Reload 1350 vfmadd231pd 1024(%rsp), %ymm4, %ymm7 # 32-byte Folded Reload 1351 vmovupd 960(%rsp), %ymm9 # 32-byte Reload 1352 vfmadd213pd 992(%rsp), %ymm8, %ymm9 # 32-byte Folded Reload 1353 vmovupd 896(%rsp), %ymm14 # 32-byte Reload 1354 vfmadd213pd 928(%rsp), %ymm8, %ymm14 # 32-byte Folded Reload 1355 vfmadd231pd %ymm9, %ymm4, %ymm14 # ymm14 = (ymm4 * ymm9) + ymm14 1356 vfmadd231pd %ymm7, %ymm3, %ymm14 # ymm14 = (ymm3 * ymm7) + ymm14 1357 vmulpd %ymm4, %ymm8, %ymm7 1358 vmulpd %ymm7, %ymm14, %ymm7 1359 vaddpd 864(%rsp), %ymm4, 
%ymm9 # 32-byte Folded Reload 1360 vfmadd231pd 832(%rsp), %ymm8, %ymm9 # 32-byte Folded Reload 1361 vmovupd 768(%rsp), %ymm14 # 32-byte Reload 1362 vfmadd213pd 800(%rsp), %ymm8, %ymm14 # 32-byte Folded Reload 1363 vmovupd 704(%rsp), %ymm11 # 32-byte Reload 1364 vfmadd213pd 736(%rsp), %ymm8, %ymm11 # 32-byte Folded Reload 1365 vfmadd231pd %ymm14, %ymm4, %ymm11 # ymm11 = (ymm4 * ymm14) + ymm11 1366 vfmadd231pd %ymm9, %ymm3, %ymm11 # ymm11 = (ymm3 * ymm9) + ymm11 1367 vdivpd %ymm11, %ymm7, %ymm7 1368 vpand 1184(%rsp), %ymm15, %ymm12 # 32-byte Folded Reload 1369 vpsrlq $52, %ymm12, %ymm9 1370 vpor 672(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload 1371 vaddpd 640(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload 1372 vandpd %ymm6, %ymm13, %ymm6 1373 vaddpd %ymm6, %ymm9, %ymm9 1374 vmulpd %ymm0, %ymm9, %ymm6 1375 vroundpd $8, %ymm6, %ymm6 1376 vfnmadd213pd %ymm6, %ymm0, %ymm9 # ymm9 = -(ymm0 * ymm9) + ymm6 1377 vmovupd 608(%rsp), %ymm1 # 32-byte Reload 1378 vmovapd %ymm1, %ymm11 1379 vfmadd213pd %ymm8, %ymm4, %ymm11 # ymm11 = (ymm4 * ymm11) + ymm8 1380 vaddpd %ymm7, %ymm11, %ymm11 1381 vmovupd 576(%rsp), %ymm3 # 32-byte Reload 1382 vmulpd %ymm3, %ymm8, %ymm14 1383 vmulpd %ymm1, %ymm4, %ymm10 1384 vfmadd231pd %ymm14, %ymm8, %ymm10 # ymm10 = (ymm8 * ymm14) + ymm10 1385 vsubpd %ymm8, %ymm11, %ymm8 1386 vfmadd231pd %ymm4, %ymm3, %ymm8 # ymm8 = (ymm3 * ymm4) + ymm8 1387 vmulpd 1280(%rsp), %ymm11, %ymm4 # 32-byte Folded Reload 1388 vroundpd $8, %ymm4, %ymm4 1389 vmulpd 544(%rsp), %ymm4, %ymm14 # 32-byte Folded Reload 1390 vfmadd231pd %ymm11, %ymm0, %ymm14 # ymm14 = (ymm0 * ymm11) + ymm14 1391 vfmsub231pd 512(%rsp), %ymm4, %ymm14 # 32-byte Folded Reload 1392 vmovupd 480(%rsp), %ymm1 # 32-byte Reload 1393 vfmadd231pd %ymm9, %ymm1, %ymm14 # ymm14 = (ymm1 * ymm9) + ymm14 1394 vsubpd %ymm7, %ymm10, %ymm7 1395 vaddpd %ymm7, %ymm8, %ymm7 1396 vfnmsub213pd %ymm14, %ymm0, %ymm7 # ymm7 = -(ymm0 * ymm7) - ymm14 1397 vmulpd 1312(%rsp), %ymm7, %ymm8 # 32-byte Folded Reload 1398 vroundpd $8, %ymm8, 
%ymm8 1399 vfnmadd231pd %ymm1, %ymm8, %ymm7 # ymm7 = -(ymm8 * ymm1) + ymm7 1400 vmulpd %ymm7, %ymm7, %ymm9 1401 vmovupd 416(%rsp), %ymm10 # 32-byte Reload 1402 vfmadd213pd 448(%rsp), %ymm7, %ymm10 # 32-byte Folded Reload 1403 vmovupd 352(%rsp), %ymm11 # 32-byte Reload 1404 vfmadd213pd 384(%rsp), %ymm7, %ymm11 # 32-byte Folded Reload 1405 vmovupd 288(%rsp), %ymm14 # 32-byte Reload 1406 vfmadd213pd 320(%rsp), %ymm7, %ymm14 # 32-byte Folded Reload 1407 vfmadd231pd %ymm11, %ymm9, %ymm14 # ymm14 = (ymm9 * ymm11) + ymm14 1408 vmovupd 224(%rsp), %ymm11 # 32-byte Reload 1409 vfmadd213pd 256(%rsp), %ymm7, %ymm11 # 32-byte Folded Reload 1410 vmovupd 160(%rsp), %ymm5 # 32-byte Reload 1411 vfmadd213pd 192(%rsp), %ymm7, %ymm5 # 32-byte Folded Reload 1412 vfmadd231pd %ymm11, %ymm9, %ymm5 # ymm5 = (ymm9 * ymm11) + ymm5 1413 vmovupd 128(%rsp), %ymm11 # 32-byte Reload 1414 vfmadd213pd %ymm3, %ymm7, %ymm11 # ymm11 = (ymm7 * ymm11) + ymm3 1415 vfmadd213pd %ymm7, %ymm9, %ymm11 # ymm11 = (ymm9 * ymm11) + ymm7 1416 vmulpd %ymm9, %ymm9, %ymm7 1417 vfmadd231pd %ymm10, %ymm7, %ymm14 # ymm14 = (ymm7 * ymm10) + ymm14 1418 vfmadd231pd %ymm5, %ymm7, %ymm11 # ymm11 = (ymm7 * ymm5) + ymm11 1419 vmulpd %ymm7, %ymm7, %ymm5 1420 vfmadd231pd %ymm14, %ymm5, %ymm11 # ymm11 = (ymm5 * ymm14) + ymm11 1421 vaddpd %ymm13, %ymm11, %ymm5 1422 vaddpd %ymm6, %ymm4, %ymm4 1423 vaddpd %ymm4, %ymm8, %ymm14 1424 vroundpd $8, %ymm14, %ymm4 1425 vcvttsd2si %xmm4, %rcx 1426 vpermilpd $1, %xmm4, %xmm6 # xmm6 = xmm4[1,0] 1427 vcvttsd2si %xmm6, %rdx 1428 vextractf128 $1, %ymm4, %xmm4 1429 vcvttsd2si %xmm4, %rax 1430 vmovq %rax, %xmm6 1431 vpermilpd $1, %xmm4, %xmm4 # xmm4 = xmm4[1,0] 1432 vcvttsd2si %xmm4, %rax 1433 vmovq %rax, %xmm4 1434 vpunpcklqdq %xmm4, %xmm6, %xmm4 # xmm4 = xmm6[0],xmm4[0] 1435 vmovq %rcx, %xmm6 1436 vmovq %rdx, %xmm7 1437 vpunpcklqdq %xmm7, %xmm6, %xmm6 # xmm6 = xmm6[0],xmm7[0] 1438 vinserti128 $1, %xmm4, %ymm6, %ymm6 1439 vpsrad $31, %ymm5, %ymm4 1440 vpsrad $20, %ymm5, %ymm7 1441 vpsrlq $32, 
%ymm7, %ymm7 1442 vpblendd $170, %ymm4, %ymm7, %ymm4 # ymm4 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4],ymm4[5],ymm7[6],ymm4[7] 1443 vpaddq %ymm4, %ymm6, %ymm7 1444 vpcmpgtq 96(%rsp), %ymm7, %ymm4 # 32-byte Folded Reload 1445 vmovupd 64(%rsp), %ymm1 # 32-byte Reload 1446 vcmpltpd %ymm14, %ymm1, %ymm8 1447 vpor %ymm4, %ymm8, %ymm4 1448 vmovdqu 32(%rsp), %ymm1 # 32-byte Reload 1449 vpcmpgtq %ymm7, %ymm1, %ymm7 1450 vcmpltpd (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload 1451 vpor %ymm7, %ymm8, %ymm7 1452 vpsllq $52, %ymm6, %ymm6 1453 vpaddq %ymm5, %ymm6, %ymm6 1454 vpor %ymm4, %ymm7, %ymm5 1455 vptest %ymm5, %ymm5 1456 je .LBB10_4 1457 vpandn %ymm6, %ymm7, %ymm5 1458 vblendvpd %ymm4, -128(%rsp), %ymm5, %ymm6 # 32-byte Folded Reload 1459 .LBB10_4: # in Loop: Header=BB10_2 Depth=1 1460 vxorpd %xmm11, %xmm11, %xmm11 1461 vpand %ymm2, %ymm15, %ymm4 1462 vpcmpeqq %ymm11, %ymm4, %ymm8 1463 vpsrad $31, %ymm15, %ymm5 1464 vpshufd $245, %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] 1465 vblendvpd %ymm8, 1216(%rsp), %ymm6, %ymm6 # 32-byte Folded Reload 1466 vptest %ymm9, %ymm9 1467 jne .LBB10_6 1468 vpxor %xmm9, %xmm9, %xmm9 1469 jmp .LBB10_7 1470 .LBB10_6: # in Loop: Header=BB10_2 Depth=1 1471 vroundpd $8, %ymm0, %ymm5 1472 vcmpeqpd %ymm0, %ymm5, %ymm7 1473 vcvttsd2si %xmm5, %rax 1474 vpermilpd $1, %xmm5, %xmm1 # xmm1 = xmm5[1,0] 1475 vcvttsd2si %xmm1, %rcx 1476 vextractf128 $1, %ymm5, %xmm1 1477 vcvttsd2si %xmm1, %rdx 1478 vmovq %rdx, %xmm5 1479 vpermilpd $1, %xmm1, %xmm1 # xmm1 = xmm1[1,0] 1480 vcvttsd2si %xmm1, %rdx 1481 vmovq %rdx, %xmm1 1482 vpunpcklqdq %xmm1, %xmm5, %xmm1 # xmm1 = xmm5[0],xmm1[0] 1483 vmovq %rax, %xmm5 1484 vmovq %rcx, %xmm3 1485 vpunpcklqdq %xmm3, %xmm5, %xmm3 # xmm3 = xmm5[0],xmm3[0] 1486 vinserti128 $1, %xmm1, %ymm3, %ymm1 1487 vpsllq $63, %ymm1, %ymm1 1488 vpor %ymm6, %ymm1, %ymm3 1489 vcmpeqpd %ymm11, %ymm15, %ymm5 1490 vbroadcastsd .LCPI10_43(%rip), %ymm10 # ymm10 = [NaN,NaN,NaN,NaN] 1491 vblendvpd %ymm5, %ymm6, %ymm10, %ymm5 1492 vblendvpd %ymm7, 
%ymm3, %ymm5, %ymm3 1493 vblendvpd %ymm9, %ymm3, %ymm6, %ymm6 1494 vandpd %ymm1, %ymm7, %ymm9 1495 .LBB10_7: # in Loop: Header=BB10_2 Depth=1 1496 vandpd %ymm2, %ymm14, %ymm1 1497 vpcmpeqq %ymm2, %ymm1, %ymm14 1498 vpcmpeqd %ymm5, %ymm5, %ymm5 1499 vpxor %ymm5, %ymm14, %ymm1 1500 vpcmpeqq %ymm2, %ymm4, %ymm4 1501 vpcmpeqq 1248(%rsp), %ymm2, %ymm3 # 32-byte Folded Reload 1502 vpxor %ymm5, %ymm3, %ymm7 1503 vpandn %ymm7, %ymm4, %ymm3 1504 vpor %ymm1, %ymm8, %ymm1 1505 vpand %ymm3, %ymm1, %ymm1 1506 vptest %ymm5, %ymm1 1507 jb .LBB10_9 1508 vpandn %ymm7, %ymm14, %ymm1 1509 vcmpeqpd %ymm13, %ymm12, %ymm3 1510 vcmpltpd %ymm12, %ymm13, %ymm5 1511 vxorpd -96(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload 1512 vblendvpd %ymm5, -128(%rsp), %ymm11, %ymm5 # 32-byte Folded Reload 1513 vblendvpd %ymm3, %ymm13, %ymm5, %ymm3 1514 vblendvpd %ymm1, %ymm6, %ymm3, %ymm1 1515 vandpd %ymm6, %ymm9, %ymm3 1516 vandpd %ymm15, %ymm9, %ymm5 1517 vorpd %ymm5, %ymm12, %ymm5 1518 vmovupd -32(%rsp), %ymm6 # 32-byte Reload 1519 vblendvpd %ymm6, %ymm3, %ymm5, %ymm3 1520 vmovupd -64(%rsp), %ymm5 # 32-byte Reload 1521 vblendvpd %ymm5, %ymm13, %ymm3, %ymm3 1522 vblendvpd %ymm4, %ymm3, %ymm1, %ymm1 1523 vcmpunordpd %ymm15, %ymm15, %ymm3 1524 vcmpunordpd %ymm0, %ymm0, %ymm4 1525 vorpd %ymm3, %ymm4, %ymm3 1526 vaddpd %ymm0, %ymm15, %ymm4 1527 vblendvpd %ymm3, %ymm4, %ymm1, %ymm6 1528 jmp .LBB10_9 1529 .LBB10_10: 1530 addq $1352, %rsp # imm = 0x548 1531 vzeroupper 1532 retq 1533 .LCPI11_0: 1534 .long 0x3fb8aa3b # float 1.44269502 1535 .LCPI11_1: 1536 .long 2139095040 # 0x7f800000 1537 .LCPI11_2: 1538 .long 0x3f800000 # float 1 1539 .LCPI11_3: 1540 .long 2147483647 # 0x7fffffff 1541 .LCPI11_6: 1542 .long 0x3f3504f3 # float 0.707106769 1543 .LCPI11_7: 1544 .long 0xbf800000 # float -1 1545 .LCPI11_8: 1546 .long 0x3def251a # float 0.116769984 1547 .LCPI11_9: 1548 .long 0xbdebd1b8 # float -0.115146101 1549 .LCPI11_10: 1550 .long 0x3e11e9bf # float 0.142493233 1551 .LCPI11_11: 1552 .long 0xbdfe5d4f # float 
-0.12420141 1553 .LCPI11_12: 1554 .long 0x3e4cceac # float 0.200007141 1555 .LCPI11_13: 1556 .long 0xbe2aae50 # float -0.166680574 1557 .LCPI11_14: 1558 .long 0x3eaaaaaa # float 0.333333313 1559 .LCPI11_15: 1560 .long 0xbe7ffffc # float -0.24999994 1561 .LCPI11_16: 1562 .long 0x3d9021bb # float 0.0703768358 1563 .LCPI11_18: 1564 .long 0xcb00007f # float -8388735 1565 .LCPI11_19: 1566 .long 0xbf000000 # float -0.5 1567 .LCPI11_20: 1568 .long 0x3f000000 # float 0.5 1569 .LCPI11_21: 1570 .long 0xbf318000 # float -0.693359375 1571 .LCPI11_22: 1572 .long 0xb95e8083 # float -2.12194442E-4 1573 .LCPI11_23: 1574 .long 0xbf317218 # float -0.693147182 1575 .LCPI11_24: 1576 .long 0x3d2aaaab # float 0.0416666679 1577 .LCPI11_25: 1578 .long 0x3c088889 # float 0.00833333377 1579 .LCPI11_26: 1580 .long 0x3ab60b61 # float 0.00138888892 1581 .LCPI11_27: 1582 .long 0x39500d01 # float 1.98412701E-4 1583 .LCPI11_28: 1584 .long 0x3e2aaaab # float 0.166666672 1585 .LCPI11_30: 1586 .long 254 # 0xfe 1587 .LCPI11_31: 1588 .long 0x43960000 # float 300 1589 .LCPI11_32: 1590 .long 1 # 0x1 1591 .LCPI11_33: 1592 .long 0xc3960000 # float -300 1593 .LCPI11_34: 1594 .long 0x7fc00102 # float NaN 1595 .LCPI11_4: 1596 .quad 36028792732385279 # 0x7fffff007fffff 1597 .quad 36028792732385279 # 0x7fffff007fffff 1598 .LCPI11_5: 1599 .quad 4539628425446424576 # 0x3f0000003f000000 1600 .quad 4539628425446424576 # 0x3f0000003f000000 1601 .LCPI11_17: 1602 .quad 5404319554102886400 # 0x4b0000004b000000 1603 .LCPI11_29: 1604 .byte 255 # 0xff 1605 .byte 0 # 0x0 1606 .byte 0 # 0x0 1607 .byte 0 # 0x0 1608 .byte 255 # 0xff 1609 .byte 0 # 0x0 1610 .byte 0 # 0x0 1611 .byte 0 # 0x0 1612 .byte 255 # 0xff 1613 .byte 0 # 0x0 1614 .byte 0 # 0x0 1615 .byte 0 # 0x0 1616 .byte 255 # 0xff 1617 .byte 0 # 0x0 1618 .byte 0 # 0x0 1619 .byte 0 # 0x0 1620 .byte 255 # 0xff 1621 .byte 0 # 0x0 1622 .byte 0 # 0x0 1623 .byte 0 # 0x0 1624 .byte 255 # 0xff 1625 .byte 0 # 0x0 1626 .byte 0 # 0x0 1627 .byte 0 # 0x0 1628 .byte 255 # 0xff 1629 
.byte 0 # 0x0 1630 .byte 0 # 0x0 1631 .byte 0 # 0x0 1632 .byte 255 # 0xff 1633 .byte 0 # 0x0 1634 .byte 0 # 0x0 1635 .byte 0 # 0x0 1636 PowNumber_8x_F32_V(float*, float, unsigned long): # @PowNumber_8x_F32_V(float*, float, unsigned long) 1637 subq $1000, %rsp # imm = 0x3E8 1638 andq $-8, %rsi 1639 je .LBB11_11 1640 vbroadcastss %xmm0, %ymm0 1641 vbroadcastss .LCPI11_0(%rip), %ymm14 # ymm14 = [1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0] 1642 vmulps %ymm0, %ymm14, %ymm1 1643 vmovups %ymm1, 384(%rsp) # 32-byte Spill 1644 vbroadcastss .LCPI11_1(%rip), %ymm3 # ymm3 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040] 1645 vxorps %xmm15, %xmm15, %xmm15 1646 vcmpltps %ymm15, %ymm0, %ymm2 1647 vcmpeqps %ymm0, %ymm15, %ymm4 1648 vbroadcastss .LCPI11_2(%rip), %ymm1 # ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1649 vmovups %ymm4, -64(%rsp) # 32-byte Spill 1650 vandps %ymm1, %ymm4, %ymm1 1651 vbroadcastss .LCPI11_1(%rip), %ymm4 # ymm4 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040] 1652 vmovups %ymm2, -32(%rsp) # 32-byte Spill 1653 vmovups %ymm4, -128(%rsp) # 32-byte Spill 1654 vblendvps %ymm2, %ymm4, %ymm1, %ymm1 1655 vmovups %ymm1, 960(%rsp) # 32-byte Spill 1656 vandps %ymm3, %ymm0, %ymm1 1657 vmovups %ymm1, 928(%rsp) # 32-byte Spill 1658 vpsrad $31, %ymm0, %ymm1 1659 vmovdqu %ymm1, -96(%rsp) # 32-byte Spill 1660 vbroadcastss .LCPI11_3(%rip), %ymm1 # ymm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] 1661 vmovups %ymm1, 896(%rsp) # 32-byte Spill 1662 xorl %eax, %eax 1663 vbroadcastss .LCPI11_6(%rip), %ymm1 # ymm1 = [7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1,7.07106769E-1] 1664 vmovups %ymm1, 864(%rsp) # 32-byte Spill 1665 vbroadcastss .LCPI11_7(%rip), %ymm1 # ymm1 = 
[-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] 1666 vmovups %ymm1, 832(%rsp) # 32-byte Spill 1667 vbroadcastss .LCPI11_8(%rip), %ymm1 # ymm1 = [1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1,1.16769984E-1] 1668 vmovups %ymm1, 800(%rsp) # 32-byte Spill 1669 vbroadcastss .LCPI11_9(%rip), %ymm1 # ymm1 = [-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1,-1.15146101E-1] 1670 vmovups %ymm1, 768(%rsp) # 32-byte Spill 1671 vbroadcastss .LCPI11_10(%rip), %ymm1 # ymm1 = [1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1,1.42493233E-1] 1672 vmovups %ymm1, 736(%rsp) # 32-byte Spill 1673 vbroadcastss .LCPI11_11(%rip), %ymm1 # ymm1 = [-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1,-1.2420141E-1] 1674 vmovups %ymm1, 704(%rsp) # 32-byte Spill 1675 vbroadcastss .LCPI11_12(%rip), %ymm1 # ymm1 = [2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1,2.00007141E-1] 1676 vmovups %ymm1, 672(%rsp) # 32-byte Spill 1677 vbroadcastss .LCPI11_13(%rip), %ymm1 # ymm1 = [-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1,-1.66680574E-1] 1678 vmovups %ymm1, 640(%rsp) # 32-byte Spill 1679 vbroadcastss .LCPI11_14(%rip), %ymm1 # ymm1 = [3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1,3.33333313E-1] 1680 vmovups %ymm1, 608(%rsp) # 32-byte Spill 1681 vbroadcastss .LCPI11_15(%rip), %ymm1 # ymm1 = [-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1,-2.4999994E-1] 1682 vmovups %ymm1, 576(%rsp) # 32-byte Spill 1683 vbroadcastss .LCPI11_16(%rip), %ymm1 # ymm1 = [7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2,7.03768358E-2] 1684 vmovups %ymm1, 
544(%rsp) # 32-byte Spill 1685 vbroadcastsd .LCPI11_17(%rip), %ymm1 # ymm1 = [5404319554102886400,5404319554102886400,5404319554102886400,5404319554102886400] 1686 vmovups %ymm1, 512(%rsp) # 32-byte Spill 1687 vbroadcastss .LCPI11_18(%rip), %ymm1 # ymm1 = [-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6,-8.388735E+6] 1688 vmovups %ymm1, 480(%rsp) # 32-byte Spill 1689 vbroadcastss .LCPI11_2(%rip), %ymm11 # ymm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1690 vbroadcastss .LCPI11_19(%rip), %ymm1 # ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 1691 vmovups %ymm1, 448(%rsp) # 32-byte Spill 1692 vbroadcastss .LCPI11_20(%rip), %ymm1 # ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1] 1693 vmovups %ymm1, 416(%rsp) # 32-byte Spill 1694 vbroadcastss .LCPI11_21(%rip), %ymm1 # ymm1 = [-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1] 1695 vmovups %ymm1, 352(%rsp) # 32-byte Spill 1696 vbroadcastss .LCPI11_22(%rip), %ymm1 # ymm1 = [-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4,-2.12194442E-4] 1697 vmovups %ymm1, 320(%rsp) # 32-byte Spill 1698 vbroadcastss .LCPI11_23(%rip), %ymm1 # ymm1 = [-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1,-6.93147182E-1] 1699 vmovups %ymm1, 288(%rsp) # 32-byte Spill 1700 vbroadcastss .LCPI11_24(%rip), %ymm1 # ymm1 = [4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2,4.16666679E-2] 1701 vmovups %ymm1, 256(%rsp) # 32-byte Spill 1702 vbroadcastss .LCPI11_25(%rip), %ymm1 # ymm1 = [8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3,8.33333377E-3] 1703 vmovups %ymm1, 224(%rsp) # 32-byte Spill 1704 vbroadcastss .LCPI11_26(%rip), %ymm1 # ymm1 = 
[1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3,1.38888892E-3] 1705 vmovups %ymm1, 192(%rsp) # 32-byte Spill 1706 vbroadcastss .LCPI11_27(%rip), %ymm1 # ymm1 = [1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4,1.98412701E-4] 1707 vmovups %ymm1, 160(%rsp) # 32-byte Spill 1708 vbroadcastss .LCPI11_28(%rip), %ymm1 # ymm1 = [1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1,1.66666672E-1] 1709 vmovups %ymm1, 128(%rsp) # 32-byte Spill 1710 vbroadcastss .LCPI11_30(%rip), %ymm1 # ymm1 = [254,254,254,254,254,254,254,254] 1711 vmovups %ymm1, 96(%rsp) # 32-byte Spill 1712 vbroadcastss .LCPI11_31(%rip), %ymm1 # ymm1 = [3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2,3.0E+2] 1713 vmovups %ymm1, 64(%rsp) # 32-byte Spill 1714 vbroadcastss .LCPI11_32(%rip), %ymm1 # ymm1 = [1,1,1,1,1,1,1,1] 1715 vmovups %ymm1, 32(%rsp) # 32-byte Spill 1716 vbroadcastss .LCPI11_33(%rip), %ymm1 # ymm1 = [-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2,-3.0E+2] 1717 vmovups %ymm1, (%rsp) # 32-byte Spill 1718 jmp .LBB11_2 1719 .LBB11_9: # in Loop: Header=BB11_2 Depth=1 1720 vpxor %ymm6, %ymm12, %ymm5 1721 vpandn %ymm5, %ymm4, %ymm4 1722 vcmpeqps %ymm1, %ymm11, %ymm5 1723 vcmpltps %ymm1, %ymm11, %ymm6 1724 vxorps -96(%rsp), %ymm6, %ymm6 # 32-byte Folded Reload 1725 vxorps %xmm15, %xmm15, %xmm15 1726 vblendvps %ymm6, -128(%rsp), %ymm15, %ymm6 # 32-byte Folded Reload 1727 vblendvps %ymm5, %ymm11, %ymm6, %ymm5 1728 vblendvps %ymm4, %ymm2, %ymm5, %ymm4 1729 vandps %ymm2, %ymm9, %ymm2 1730 vandps %ymm7, %ymm9, %ymm5 1731 vorps %ymm1, %ymm5, %ymm1 1732 vmovups -32(%rsp), %ymm5 # 32-byte Reload 1733 vblendvps %ymm5, %ymm2, %ymm1, %ymm1 1734 vmovups -64(%rsp), %ymm2 # 32-byte Reload 1735 vblendvps %ymm2, %ymm11, %ymm1, %ymm1 1736 vblendvps %ymm8, %ymm4, %ymm1, %ymm1 1737 vcmpunordps %ymm7, %ymm7, %ymm2 1738 vcmpunordps %ymm0, %ymm0, %ymm4 1739 vorps %ymm2, 
%ymm4, %ymm2 1740 vaddps %ymm0, %ymm7, %ymm4 1741 vblendvps %ymm2, %ymm4, %ymm1, %ymm2 1742 vmovups %ymm2, (%rdi,%rax,4) 1743 addq $8, %rax 1744 cmpq %rsi, %rax 1745 jae .LBB11_11 1746 .LBB11_2: # =>This Inner Loop Header: Depth=1 1747 vmovdqu (%rdi,%rax,4), %ymm7 1748 vmovaps .LCPI11_4(%rip), %xmm4 # xmm4 = [36028792732385279,36028792732385279] 1749 vandps (%rdi,%rax,4), %xmm4, %xmm2 1750 vpand 896(%rsp), %ymm7, %ymm1 # 32-byte Folded Reload 1751 vmovaps .LCPI11_5(%rip), %xmm5 # xmm5 = [4539628425446424576,4539628425446424576] 1752 vorps %xmm5, %xmm2, %xmm2 1753 vandps 16(%rdi,%rax,4), %xmm4, %xmm4 1754 vorps %xmm5, %xmm4, %xmm4 1755 vinsertf128 $1, %xmm4, %ymm2, %ymm2 1756 vmovups 864(%rsp), %ymm4 # 32-byte Reload 1757 vcmpltps %ymm2, %ymm4, %ymm6 1758 vandnps %ymm2, %ymm6, %ymm4 1759 vaddps 832(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload 1760 vaddps %ymm4, %ymm2, %ymm5 1761 vmulps %ymm5, %ymm5, %ymm4 1762 vmulps %ymm4, %ymm4, %ymm2 1763 vmovups 768(%rsp), %ymm8 # 32-byte Reload 1764 vfmadd213ps 800(%rsp), %ymm5, %ymm8 # 32-byte Folded Reload 1765 vmovups 704(%rsp), %ymm9 # 32-byte Reload 1766 vfmadd213ps 736(%rsp), %ymm5, %ymm9 # 32-byte Folded Reload 1767 vfmadd231ps %ymm8, %ymm4, %ymm9 # ymm9 = (ymm4 * ymm8) + ymm9 1768 vmovups 640(%rsp), %ymm8 # 32-byte Reload 1769 vfmadd213ps 672(%rsp), %ymm5, %ymm8 # 32-byte Folded Reload 1770 vmovups 576(%rsp), %ymm10 # 32-byte Reload 1771 vfmadd213ps 608(%rsp), %ymm5, %ymm10 # 32-byte Folded Reload 1772 vmulps %ymm2, %ymm2, %ymm13 1773 vfmadd132ps 544(%rsp), %ymm10, %ymm13 # 32-byte Folded Reload 1774 vfmadd231ps %ymm8, %ymm4, %ymm13 # ymm13 = (ymm4 * ymm8) + ymm13 1775 vfmadd231ps %ymm9, %ymm2, %ymm13 # ymm13 = (ymm2 * ymm9) + ymm13 1776 vmulps %ymm5, %ymm4, %ymm2 1777 vmulps %ymm2, %ymm13, %ymm8 1778 vpsrld $23, %ymm1, %ymm2 1779 vpor 512(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload 1780 vaddps 480(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload 1781 vandps %ymm6, %ymm11, %ymm6 1782 vaddps %ymm6, %ymm2, %ymm6 1783 vmulps 
%ymm0, %ymm6, %ymm2 1784 vroundps $8, %ymm2, %ymm2 1785 vfnmadd213ps %ymm2, %ymm0, %ymm6 # ymm6 = -(ymm0 * ymm6) + ymm2 1786 vmovups 448(%rsp), %ymm13 # 32-byte Reload 1787 vmovaps %ymm13, %ymm9 1788 vfmadd213ps %ymm5, %ymm4, %ymm9 # ymm9 = (ymm4 * ymm9) + ymm5 1789 vaddps %ymm9, %ymm8, %ymm9 1790 vmovups 416(%rsp), %ymm12 # 32-byte Reload 1791 vmulps %ymm5, %ymm12, %ymm10 1792 vmulps %ymm4, %ymm13, %ymm13 1793 vfmadd231ps %ymm10, %ymm5, %ymm13 # ymm13 = (ymm5 * ymm10) + ymm13 1794 vsubps %ymm5, %ymm9, %ymm5 1795 vfmadd231ps %ymm4, %ymm12, %ymm5 # ymm5 = (ymm12 * ymm4) + ymm5 1796 vmulps 384(%rsp), %ymm9, %ymm4 # 32-byte Folded Reload 1797 vroundps $8, %ymm4, %ymm4 1798 vmulps 352(%rsp), %ymm4, %ymm10 # 32-byte Folded Reload 1799 vfmadd231ps %ymm9, %ymm0, %ymm10 # ymm10 = (ymm0 * ymm9) + ymm10 1800 vfnmadd231ps 320(%rsp), %ymm4, %ymm10 # 32-byte Folded Reload 1801 vsubps %ymm8, %ymm13, %ymm8 1802 vaddps %ymm5, %ymm8, %ymm5 1803 vmovups 288(%rsp), %ymm8 # 32-byte Reload 1804 vmulps %ymm6, %ymm8, %ymm6 1805 vfnmadd231ps %ymm5, %ymm0, %ymm6 # ymm6 = -(ymm0 * ymm5) + ymm6 1806 vaddps %ymm6, %ymm10, %ymm5 1807 vmulps %ymm5, %ymm14, %ymm6 1808 vroundps $8, %ymm6, %ymm6 1809 vfmadd231ps %ymm8, %ymm6, %ymm5 # ymm5 = (ymm6 * ymm8) + ymm5 1810 vmulps %ymm5, %ymm5, %ymm8 1811 vmulps %ymm8, %ymm8, %ymm9 1812 vmovups 160(%rsp), %ymm10 # 32-byte Reload 1813 vfmadd213ps 192(%rsp), %ymm5, %ymm10 # 32-byte Folded Reload 1814 vmovups 128(%rsp), %ymm13 # 32-byte Reload 1815 vfmadd213ps %ymm12, %ymm5, %ymm13 # ymm13 = (ymm5 * ymm13) + ymm12 1816 vfmadd231ps %ymm10, %ymm9, %ymm13 # ymm13 = (ymm9 * ymm10) + ymm13 1817 vmovups 224(%rsp), %ymm9 # 32-byte Reload 1818 vfmadd213ps 256(%rsp), %ymm5, %ymm9 # 32-byte Folded Reload 1819 vfmadd231ps %ymm9, %ymm8, %ymm13 # ymm13 = (ymm8 * ymm9) + ymm13 1820 vaddps %ymm5, %ymm11, %ymm9 1821 vfmadd231ps %ymm13, %ymm8, %ymm9 # ymm9 = (ymm8 * ymm13) + ymm9 1822 vaddps %ymm2, %ymm4, %ymm2 1823 vaddps %ymm6, %ymm2, %ymm4 1824 vcvtps2dq %ymm4, %ymm2 1825 
vpsrld $23, %ymm9, %ymm5 1826 vpand .LCPI11_29(%rip), %ymm5, %ymm5 1827 vpaddd %ymm2, %ymm5, %ymm6 1828 vpcmpgtd 96(%rsp), %ymm6, %ymm5 # 32-byte Folded Reload 1829 vmovups 64(%rsp), %ymm8 # 32-byte Reload 1830 vcmpltps %ymm4, %ymm8, %ymm8 1831 vpor %ymm5, %ymm8, %ymm5 1832 vmovdqu 32(%rsp), %ymm8 # 32-byte Reload 1833 vpcmpgtd %ymm6, %ymm8, %ymm6 1834 vcmpltps (%rsp), %ymm4, %ymm8 # 32-byte Folded Reload 1835 vpor %ymm6, %ymm8, %ymm6 1836 vpslld $23, %ymm2, %ymm2 1837 vpaddd %ymm2, %ymm9, %ymm2 1838 vpor %ymm5, %ymm6, %ymm8 1839 vtestps %ymm8, %ymm8 1840 je .LBB11_4 1841 vpandn %ymm2, %ymm6, %ymm2 1842 vblendvps %ymm5, -128(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload 1843 .LBB11_4: # in Loop: Header=BB11_2 Depth=1 1844 vpand %ymm3, %ymm7, %ymm8 1845 vpcmpeqd %ymm15, %ymm8, %ymm5 1846 vblendvps %ymm5, 960(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload 1847 vmovmskps %ymm7, %ecx 1848 testl %ecx, %ecx 1849 jne .LBB11_6 1850 vxorps %xmm9, %xmm9, %xmm9 1851 jmp .LBB11_7 1852 .LBB11_6: # in Loop: Header=BB11_2 Depth=1 1853 vroundps $8, %ymm0, %ymm6 1854 vcmpeqps %ymm0, %ymm6, %ymm6 1855 vcvtps2dq %ymm0, %ymm9 1856 vpslld $31, %ymm9, %ymm9 1857 vpor %ymm2, %ymm9, %ymm10 1858 vcmpeqps %ymm7, %ymm15, %ymm13 1859 vmovaps %ymm14, %ymm12 1860 vbroadcastss .LCPI11_34(%rip), %ymm14 # ymm14 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] 1861 vblendvps %ymm13, %ymm2, %ymm14, %ymm13 1862 vmovaps %ymm12, %ymm14 1863 vblendvps %ymm6, %ymm10, %ymm13, %ymm10 1864 vpsrad $31, %ymm7, %ymm13 1865 vblendvps %ymm13, %ymm10, %ymm2, %ymm2 1866 vandps %ymm6, %ymm9, %ymm9 1867 .LBB11_7: # in Loop: Header=BB11_2 Depth=1 1868 vpcmpeqd %ymm12, %ymm12, %ymm12 1869 vpcmpeqd %ymm3, %ymm8, %ymm6 1870 vpxor %ymm6, %ymm12, %ymm8 1871 vandps %ymm3, %ymm4, %ymm4 1872 vpcmpeqd %ymm3, %ymm4, %ymm4 1873 vpxor %ymm4, %ymm12, %ymm10 1874 vpcmpeqd 928(%rsp), %ymm3, %ymm6 # 32-byte Folded Reload 1875 vpandn %ymm8, %ymm6, %ymm13 1876 vpor %ymm5, %ymm10, %ymm5 1877 vpand %ymm5, %ymm13, %ymm5 1878 vtestps %ymm12, %ymm5 1879 jae 
.LBB11_9 1880 vxorps %xmm15, %xmm15, %xmm15 1881 vmovups %ymm2, (%rdi,%rax,4) 1882 addq $8, %rax 1883 cmpq %rsi, %rax 1884 jb .LBB11_2 1885 .LBB11_11: 1886 addq $1000, %rsp # imm = 0x3E8 1887 vzeroupper 1888 retq 1889 .LCPI12_0: 1890 .long 0x00800000 # float 1.17549435E-38 1891 .LCPI12_1: 1892 .long 2155872255 # 0x807fffff 1893 .LCPI12_2: 1894 .long 1056964608 # 0x3f000000 1895 .LCPI12_3: 1896 .long 4294967169 # 0xffffff81 1897 .LCPI12_4: 1898 .long 0x3f800000 # float 1 1899 .LCPI12_5: 1900 .long 0x3f3504f3 # float 0.707106769 1901 .LCPI12_6: 1902 .long 0xbf800000 # float -1 1903 .LCPI12_7: 1904 .long 0x3d9021bb # float 0.0703768358 1905 .LCPI12_8: 1906 .long 0xbdebd1b8 # float -0.115146101 1907 .LCPI12_9: 1908 .long 0x3def251a # float 0.116769984 1909 .LCPI12_10: 1910 .long 0xbdfe5d4f # float -0.12420141 1911 .LCPI12_11: 1912 .long 0x3e11e9bf # float 0.142493233 1913 .LCPI12_12: 1914 .long 0xbe2aae50 # float -0.166680574 1915 .LCPI12_13: 1916 .long 0x3e4cceac # float 0.200007141 1917 .LCPI12_14: 1918 .long 0xbe7ffffc # float -0.24999994 1919 .LCPI12_15: 1920 .long 0x3eaaaaaa # float 0.333333313 1921 .LCPI12_16: 1922 .long 0x3f317218 # float 0.693147182 1923 .LCPI12_17: 1924 .long 0xbf000000 # float -0.5 1925 .LCPI12_18: 1926 .long 0x3ede5bd9 # float 0.434294492 1927 .LCPI12_19: 1928 .zero 32 1929 Log10_Len8x_F32_V(float*, unsigned long): # @Log10_Len8x_F32_V(float*, unsigned long) 1930 subq $136, %rsp 1931 testq %rsi, %rsi 1932 je .LBB12_3 1933 xorl %eax, %eax 1934 vbroadcastss .LCPI12_1(%rip), %ymm0 # ymm0 = [2155872255,2155872255,2155872255,2155872255,2155872255,2155872255,2155872255,2155872255] 1935 vmovups %ymm0, 96(%rsp) # 32-byte Spill 1936 vbroadcastss .LCPI12_2(%rip), %ymm0 # ymm0 = [1056964608,1056964608,1056964608,1056964608,1056964608,1056964608,1056964608,1056964608] 1937 vmovups %ymm0, 64(%rsp) # 32-byte Spill 1938 vbroadcastss .LCPI12_3(%rip), %ymm0 # ymm0 = [4294967169,4294967169,4294967169,4294967169,4294967169,4294967169,4294967169,4294967169] 
        # (continues the Log10_Len8x_F32_V constant-hoisting prologue)
        vmovups %ymm0, 32(%rsp)
        vbroadcastss    .LCPI12_0(%rip), %ymm0  # smallest normal float (input clamp)
        vmovups %ymm0, (%rsp)
        vbroadcastss    .LCPI12_4(%rip), %ymm0  # 1.0
        vmovups %ymm0, -32(%rsp)
        vbroadcastss    .LCPI12_5(%rip), %ymm0  # sqrt(0.5)
        vmovups %ymm0, -64(%rsp)
        vbroadcastss    .LCPI12_6(%rip), %ymm0  # -1.0
        vmovups %ymm0, -96(%rsp)
        vbroadcastss    .LCPI12_7(%rip), %ymm0  # 0.0703768358 (highest-degree coeff)
        vmovups %ymm0, -128(%rsp)
        # remaining logf polynomial coefficients stay in registers for the loop
        vbroadcastss    .LCPI12_8(%rip), %ymm9  # -0.115146101
        vbroadcastss    .LCPI12_9(%rip), %ymm10 # 0.116769984
        vbroadcastss    .LCPI12_10(%rip), %ymm11        # -0.12420141
        vbroadcastss    .LCPI12_11(%rip), %ymm12        # 0.142493233
        vbroadcastss    .LCPI12_12(%rip), %ymm13        # -0.166680574
        vbroadcastss    .LCPI12_13(%rip), %ymm14        # 0.200007141
        vbroadcastss    .LCPI12_14(%rip), %ymm15        # -0.24999994
        vbroadcastss    .LCPI12_15(%rip), %ymm0 # 0.333333313
        vbroadcastss    .LCPI12_16(%rip), %ymm1 # ln 2
        vbroadcastss    .LCPI12_17(%rip), %ymm2 # -0.5
        vbroadcastss    .LCPI12_18(%rip), %ymm3 # log10(e) final scale
.LBB12_2:                               # =>This Inner Loop Header: Depth=1
        vmovups (%rdi,%rax,4), %ymm4    # ymm4 = x[i..i+7] (original values kept for <=0 test)
        vmaxps  (%rsp), %ymm4, %ymm5    # clamp to smallest normal (avoids denormal/zero in log)
        # decompose: exponent e (as float) and mantissa m in [0.5,1)
        vpsrld  $23, %ymm5, %ymm6
        vpaddd  32(%rsp), %ymm6, %ymm6  # e = biased exponent - 127
        vandps  96(%rsp), %ymm5, %ymm5
        vorps   64(%rsp), %ymm5, %ymm5  # m = mantissa | 0.5 exponent
        vcvtdq2ps       %ymm6, %ymm6
        vaddps  -32(%rsp), %ymm6, %ymm7 # e + 1
        vcmpltps        -64(%rsp), %ymm5, %ymm8        # mask: m < sqrt(0.5)
        vblendvps       %ymm8, %ymm6, %ymm7, %ymm6     # adjust e for range reduction
        vandps  %ymm5, %ymm8, %ymm7
        vaddps  -96(%rsp), %ymm5, %ymm5 # m - 1
        vaddps  %ymm7, %ymm5, %ymm5     # ymm5 = reduced argument t in [sqrt(0.5)-1, sqrt(2)-1)
        # Horner evaluation of the logf minimax polynomial in t
        vmovups -128(%rsp), %ymm7
        vfmadd213ps     %ymm9, %ymm5, %ymm7     # ymm7 = (t * c9) + c8
        vfmadd213ps     %ymm10, %ymm5, %ymm7
        vfmadd213ps     %ymm11, %ymm5, %ymm7
        vfmadd213ps     %ymm12, %ymm5, %ymm7
        vfmadd213ps     %ymm13, %ymm5, %ymm7
        vfmadd213ps     %ymm14, %ymm5, %ymm7
        vfmadd213ps     %ymm15, %ymm5, %ymm7
        vfmadd213ps     %ymm0, %ymm5, %ymm7
        vfmadd213ps     %ymm2, %ymm5, %ymm7     # ... + (-0.5)
        vfmadd213ps     %ymm5, %ymm1, %ymm6     # ymm6 = e*ln2 + t
        vmulps  %ymm5, %ymm5, %ymm5             # t^2
        vfmadd231ps     %ymm7, %ymm5, %ymm6     # ymm6 = ln(x)
        vcmpleps        .LCPI12_19(%rip), %ymm4, %ymm4 # x <= 0 -> all-ones lane (NaN bits)
        vmulps  %ymm3, %ymm6, %ymm5             # scale by log10(e)
        vorps   %ymm5, %ymm4, %ymm4             # force NaN where x <= 0
        vmovups %ymm4, (%rdi,%rax,4)
        addq    $8, %rax
        cmpq    %rsi, %rax
        jb      .LBB12_2
.LBB12_3:
        addq    $136, %rsp
        vzeroupper                      # required before returning to SSE/scalar code
        retq
# --- constant pool for Log2_Len8x_F32_V (same logf coefficients; scale = log2(e)) ---
.LCPI13_0:
        .long   0x00800000              # float 1.17549435E-38 (smallest normal)
.LCPI13_1:
        .long   2155872255              # 0x807fffff
.LCPI13_2:
        .long   1056964608              # 0x3f000000
.LCPI13_3:
        .long   4294967169              # 0xffffff81 (-127)
.LCPI13_4:
        .long   0x3f800000              # float 1
.LCPI13_5:
        .long   0x3f3504f3              # float 0.707106769 (sqrt(0.5))
.LCPI13_6:
        .long   0xbf800000              # float -1
.LCPI13_7:
        .long   0x3d9021bb              # float 0.0703768358
.LCPI13_8:
        .long   0xbdebd1b8              # float -0.115146101
.LCPI13_9:
        .long   0x3def251a              # float 0.116769984
.LCPI13_10:
        .long   0xbdfe5d4f              # float -0.12420141
.LCPI13_11:
        .long   0x3e11e9bf              # float 0.142493233
.LCPI13_12:
        .long   0xbe2aae50              # float -0.166680574
.LCPI13_13:
        .long   0x3e4cceac              # float 0.200007141
.LCPI13_14:
        .long   0xbe7ffffc              # float -0.24999994
.LCPI13_15:
        .long   0x3eaaaaaa              # float 0.333333313
.LCPI13_16:
        .long   0x3f317218              # float 0.693147182 (ln 2)
.LCPI13_17:
        .long   0xbf000000              # float -0.5
.LCPI13_18:
        .long   0x3fb8aa3b              # float 1.44269502 (log2(e): converts ln -> log2)
.LCPI13_19:
        .zero   32                      # 8 x 0.0f for the x <= 0 test
#-----------------------------------------------------------------------
# void Log2_Len8x_F32_V(float *x, unsigned long n)
# ABI:   SysV AMD64 (rdi = x, rsi = n; n assumed a multiple of 8)
# In-place x[i] = log2(x[i]); identical structure to Log10_Len8x_F32_V
# except the final scale constant is log2(e).  Generated code.
#-----------------------------------------------------------------------
Log2_Len8x_F32_V(float*, unsigned long): # @Log2_Len8x_F32_V(float*, unsigned long)
        subq    $136, %rsp
        testq   %rsi, %rsi
        je      .LBB13_3
        xorl    %eax, %eax
        vbroadcastss    .LCPI13_1(%rip), %ymm0  # sign+mantissa mask
        # (continues the Log2_Len8x_F32_V constant-hoisting prologue)
        vmovups %ymm0, 96(%rsp)
        vbroadcastss    .LCPI13_2(%rip), %ymm0  # 0x3f000000
        vmovups %ymm0, 64(%rsp)
        vbroadcastss    .LCPI13_3(%rip), %ymm0  # -127
        vmovups %ymm0, 32(%rsp)
        vbroadcastss    .LCPI13_0(%rip), %ymm0  # smallest normal float (input clamp)
        vmovups %ymm0, (%rsp)
        vbroadcastss    .LCPI13_4(%rip), %ymm0  # 1.0
        vmovups %ymm0, -32(%rsp)
        vbroadcastss    .LCPI13_5(%rip), %ymm0  # sqrt(0.5)
        vmovups %ymm0, -64(%rsp)
        vbroadcastss    .LCPI13_6(%rip), %ymm0  # -1.0
        vmovups %ymm0, -96(%rsp)
        vbroadcastss    .LCPI13_7(%rip), %ymm0  # 0.0703768358 (highest-degree coeff)
        vmovups %ymm0, -128(%rsp)
        # remaining logf polynomial coefficients live in registers
        vbroadcastss    .LCPI13_8(%rip), %ymm9  # -0.115146101
        vbroadcastss    .LCPI13_9(%rip), %ymm10 # 0.116769984
        vbroadcastss    .LCPI13_10(%rip), %ymm11        # -0.12420141
        vbroadcastss    .LCPI13_11(%rip), %ymm12        # 0.142493233
        vbroadcastss    .LCPI13_12(%rip), %ymm13        # -0.166680574
        vbroadcastss    .LCPI13_13(%rip), %ymm14        # 0.200007141
        vbroadcastss    .LCPI13_14(%rip), %ymm15        # -0.24999994
        vbroadcastss    .LCPI13_15(%rip), %ymm0 # 0.333333313
        vbroadcastss    .LCPI13_16(%rip), %ymm1 # ln 2
        vbroadcastss    .LCPI13_17(%rip), %ymm2 # -0.5
        vbroadcastss    .LCPI13_18(%rip), %ymm3 # log2(e) final scale
.LBB13_2:                               # =>This Inner Loop Header: Depth=1
        vmovups (%rdi,%rax,4), %ymm4    # ymm4 = x[i..i+7]
        vmaxps  (%rsp), %ymm4, %ymm5    # clamp to smallest normal
        # split into exponent e and mantissa m in [0.5,1)
        vpsrld  $23, %ymm5, %ymm6
        vpaddd  32(%rsp), %ymm6, %ymm6  # e = biased exponent - 127
        vandps  96(%rsp), %ymm5, %ymm5
        vorps   64(%rsp), %ymm5, %ymm5
        vcvtdq2ps       %ymm6, %ymm6
        vaddps  -32(%rsp), %ymm6, %ymm7 # e + 1
        vcmpltps        -64(%rsp), %ymm5, %ymm8        # mask: m < sqrt(0.5)
        vblendvps       %ymm8, %ymm6, %ymm7, %ymm6
        vandps  %ymm5, %ymm8, %ymm7
        vaddps  -96(%rsp), %ymm5, %ymm5 # m - 1
        vaddps  %ymm7, %ymm5, %ymm5     # ymm5 = reduced argument t
        # Horner evaluation of the logf polynomial in t
        vmovups -128(%rsp), %ymm7
        vfmadd213ps     %ymm9, %ymm5, %ymm7     # ymm7 = (t * c9) + c8
        vfmadd213ps     %ymm10, %ymm5, %ymm7
        vfmadd213ps     %ymm11, %ymm5, %ymm7
        vfmadd213ps     %ymm12, %ymm5, %ymm7
        vfmadd213ps     %ymm13, %ymm5, %ymm7
        vfmadd213ps     %ymm14, %ymm5, %ymm7
        vfmadd213ps     %ymm15, %ymm5, %ymm7
        vfmadd213ps     %ymm0, %ymm5, %ymm7
        vfmadd213ps     %ymm2, %ymm5, %ymm7     # ... + (-0.5)
        vfmadd213ps     %ymm5, %ymm1, %ymm6     # ymm6 = e*ln2 + t
        vmulps  %ymm5, %ymm5, %ymm5             # t^2
        vfmadd231ps     %ymm7, %ymm5, %ymm6     # ymm6 = ln(x)
        vcmpleps        .LCPI13_19(%rip), %ymm4, %ymm4 # x <= 0 -> all-ones (NaN bits)
        vmulps  %ymm3, %ymm6, %ymm5             # scale by log2(e)
        vorps   %ymm5, %ymm4, %ymm4
        vmovups %ymm4, (%rdi,%rax,4)
        addq    $8, %rax
        cmpq    %rsi, %rax
        jb      .LBB13_2
.LBB13_3:
        addq    $136, %rsp
        vzeroupper                      # required before returning to SSE/scalar code
        retq
# --- constant pool for Log_Len8x_F32_V (natural log; no final scale constant) ---
.LCPI14_0:
        .long   0x00800000              # float 1.17549435E-38 (smallest normal)
.LCPI14_1:
        .long   2155872255              # 0x807fffff
.LCPI14_2:
        .long   1056964608              # 0x3f000000
.LCPI14_3:
        .long   4294967169              # 0xffffff81 (-127)
.LCPI14_4:
        .long   0x3f800000              # float 1
.LCPI14_5:
        .long   0x3f3504f3              # float 0.707106769 (sqrt(0.5))
.LCPI14_6:
        .long   0xbf800000              # float -1
.LCPI14_7:
        .long   0x3d9021bb              # float 0.0703768358
.LCPI14_8:
        .long   0xbdebd1b8              # float -0.115146101
.LCPI14_9:
        .long   0x3def251a              # float 0.116769984
.LCPI14_10:
        .long   0xbdfe5d4f              # float -0.12420141
.LCPI14_11:
        .long   0x3e11e9bf              # float 0.142493233
.LCPI14_12:
        .long   0xbe2aae50              # float -0.166680574
.LCPI14_13:
        .long   0x3e4cceac              # float 0.200007141
.LCPI14_14:
        .long   0xbe7ffffc              # float -0.24999994
.LCPI14_15:
        .long   0x3eaaaaaa              # float 0.333333313
.LCPI14_16:
        .long   0x3f317218              # float 0.693147182 (ln 2)
.LCPI14_17:
        .long
        0xbf000000                      # float -0.5 (value of .LCPI14_17, split by the extraction)
.LCPI14_18:
        .zero   32                      # 8 x 0.0f for the x <= 0 test
#-----------------------------------------------------------------------
# void Log_Len8x_F32_V(float *x, unsigned long n)
# ABI:   SysV AMD64 (rdi = x, rsi = n; n assumed a multiple of 8)
# In-place x[i] = ln(x[i]); same kernel as Log10/Log2 above but without
# the final change-of-base multiply.  Generated code — comments only.
#-----------------------------------------------------------------------
Log_Len8x_F32_V(float*, unsigned long): # @Log_Len8x_F32_V(float*, unsigned long)
        subq    $104, %rsp
        testq   %rsi, %rsi
        je      .LBB14_3
        xorl    %eax, %eax
        # hoist constants: spill the less-frequent ones, keep coefficients in ymm8-ymm15
        vbroadcastss    .LCPI14_0(%rip), %ymm0  # smallest normal float (input clamp)
        vmovups %ymm0, 64(%rsp)
        vbroadcastss    .LCPI14_1(%rip), %ymm0  # sign+mantissa mask
        vmovups %ymm0, 32(%rsp)
        vbroadcastss    .LCPI14_2(%rip), %ymm0  # 0x3f000000
        vmovups %ymm0, (%rsp)
        vbroadcastss    .LCPI14_3(%rip), %ymm0  # -127
        vmovups %ymm0, -32(%rsp)
        vbroadcastss    .LCPI14_4(%rip), %ymm0  # 1.0
        vmovups %ymm0, -64(%rsp)
        vbroadcastss    .LCPI14_5(%rip), %ymm0  # sqrt(0.5)
        vmovups %ymm0, -96(%rsp)
        vbroadcastss    .LCPI14_6(%rip), %ymm0  # -1.0
        vmovups %ymm0, -128(%rsp)
        vbroadcastss    .LCPI14_7(%rip), %ymm8  # 0.0703768358 (highest-degree coeff)
        vbroadcastss    .LCPI14_8(%rip), %ymm9  # -0.115146101
        vbroadcastss    .LCPI14_9(%rip), %ymm10 # 0.116769984
        vbroadcastss    .LCPI14_10(%rip), %ymm11        # -0.12420141
        vbroadcastss    .LCPI14_11(%rip), %ymm12        # 0.142493233
        vbroadcastss    .LCPI14_12(%rip), %ymm13        # -0.166680574
        vbroadcastss    .LCPI14_13(%rip), %ymm14        # 0.200007141
        vbroadcastss    .LCPI14_14(%rip), %ymm15        # -0.24999994
        vbroadcastss    .LCPI14_15(%rip), %ymm0 # 0.333333313
        vbroadcastss    .LCPI14_16(%rip), %ymm1 # ln 2
        vbroadcastss    .LCPI14_17(%rip), %ymm2 # -0.5
.LBB14_2:                               # =>This Inner Loop Header: Depth=1
        vmovups (%rdi,%rax,4), %ymm3    # ymm3 = x[i..i+7]
        vmaxps  64(%rsp), %ymm3, %ymm4  # clamp to smallest normal
        # split into exponent e and mantissa m in [0.5,1)
        vpsrld  $23, %ymm4, %ymm5
        vpaddd  -32(%rsp), %ymm5, %ymm5 # e = biased exponent - 127
        vandps  32(%rsp), %ymm4, %ymm4
        vorps   (%rsp), %ymm4, %ymm4
        vcvtdq2ps       %ymm5, %ymm5
        vaddps  -64(%rsp), %ymm5, %ymm6 # e + 1
        vcmpltps        -96(%rsp), %ymm4, %ymm7        # mask: m < sqrt(0.5)
        vblendvps       %ymm7, %ymm5, %ymm6, %ymm5
        vandps  %ymm4, %ymm7, %ymm6
        vaddps  -128(%rsp), %ymm4, %ymm4        # m - 1
        vaddps  %ymm6, %ymm4, %ymm4     # ymm4 = reduced argument t
        # Horner evaluation of the logf polynomial in t
        vmovaps %ymm8, %ymm6
        vfmadd213ps     %ymm9, %ymm4, %ymm6     # ymm6 = (t * c9) + c8
        vfmadd213ps     %ymm10, %ymm4, %ymm6
        vfmadd213ps     %ymm11, %ymm4, %ymm6
        vfmadd213ps     %ymm12, %ymm4, %ymm6
        vfmadd213ps     %ymm13, %ymm4, %ymm6
        vfmadd213ps     %ymm14, %ymm4, %ymm6
        vfmadd213ps     %ymm15, %ymm4, %ymm6
        vfmadd213ps     %ymm0, %ymm4, %ymm6
        vfmadd213ps     %ymm2, %ymm4, %ymm6     # ... + (-0.5)
        vfmadd213ps     %ymm4, %ymm1, %ymm5     # ymm5 = e*ln2 + t
        vmulps  %ymm4, %ymm4, %ymm4             # t^2
        vfmadd231ps     %ymm6, %ymm4, %ymm5     # ymm5 = ln(x)
        vcmpleps        .LCPI14_18(%rip), %ymm3, %ymm3 # x <= 0 -> all-ones (NaN bits)
        vorps   %ymm5, %ymm3, %ymm3
        vmovups %ymm3, (%rdi,%rax,4)
        addq    $8, %rax
        cmpq    %rsi, %rax
        jb      .LBB14_2
.LBB14_3:
        addq    $104, %rsp
        vzeroupper                      # required before returning to SSE/scalar code
        retq
# --- constant pool for Exp_Len8x_F32_V (Cephes-style expf coefficients) ---
.LCPI15_0:
        .long   0x42b17218              # float 88.7228394 (expf overflow threshold, ln(FLT_MAX))
.LCPI15_1:
        .long   0xc2ce8ed0              # float -103.278931 (underflow threshold)
.LCPI15_2:
        .long   0x3f000000              # float 0.5
.LCPI15_3:
        .long   0x3fb8aa3b              # float 1.44269502 (log2(e))
.LCPI15_4:
        .long   0xbf318000              # float -0.693359375 (Cody-Waite ln2 high part)
.LCPI15_5:
        .long   0x395e8083              # float 2.12194442E-4 (ln2 low part)
.LCPI15_6:
        .long   1065353216              # 0x3f800000 (float 1.0 bits)
.LCPI15_7:
        .long   0x3ab743ce              # float 0.00139819994
.LCPI15_8:
        .long   0x39506967              # float 1.98756912E-4
.LCPI15_9:
        .long   0x3c088908              # float 0.00833345205
.LCPI15_10:
        .long   0x3d2aa9c1              # float 0.0416657962
.LCPI15_11:
        .long   0x3e2aaaaa              # float 0.166666657
.LCPI15_12:
        .long   0x7f7fffff              # float 3.40282347E+38 (FLT_MAX, overflow result clamp)
# void Exp_Len8x_F32_V(float *x, unsigned long n) — SysV AMD64 (rdi = x, rsi = n)
# NOTE(review): function body continues past the end of this chunk.
Exp_Len8x_F32_V(float*, unsigned long): # @Exp_Len8x_F32_V(float*, unsigned long)
        testq   %rsi, %rsi
        je      .LBB15_3
2243 xorl %eax, %eax 2244 vbroadcastss .LCPI15_0(%rip), %ymm0 # ymm0 = [8.87228394E+1,8.87228394E+1,8.87228394E+1,8.87228394E+1,8.87228394E+1,8.87228394E+1,8.87228394E+1,8.87228394E+1] 2245 vmovups %ymm0, -40(%rsp) # 32-byte Spill 2246 vbroadcastss .LCPI15_1(%rip), %ymm0 # ymm0 = [-1.03278931E+2,-1.03278931E+2,-1.03278931E+2,-1.03278931E+2,-1.03278931E+2,-1.03278931E+2,-1.03278931E+2,-1.03278931E+2] 2247 vmovups %ymm0, -72(%rsp) # 32-byte Spill 2248 vbroadcastss .LCPI15_2(%rip), %ymm2 # ymm2 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1,5.0E-1] 2249 vbroadcastss .LCPI15_3(%rip), %ymm3 # ymm3 = [1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0,1.44269502E+0] 2250 vbroadcastss .LCPI15_4(%rip), %ymm4 # ymm4 = [-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1,-6.93359375E-1] 2251 vbroadcastss .LCPI15_5(%rip), %ymm5 # ymm5 = [2.12194442E-4,2.12194442E-4,2.12194442E-4,2.12194442E-4,2.12194442E-4,2.12194442E-4,2.12194442E-4,2.12194442E-4] 2252 vpbroadcastd .LCPI15_6(%rip), %ymm6 # ymm6 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216] 2253 vbroadcastss .LCPI15_7(%rip), %ymm7 # ymm7 = [1.39819994E-3,1.39819994E-3,1.39819994E-3,1.39819994E-3,1.39819994E-3,1.39819994E-3,1.39819994E-3,1.39819994E-3] 2254 vbroadcastss .LCPI15_8(%rip), %ymm1 # ymm1 = [1.98756912E-4,1.98756912E-4,1.98756912E-4,1.98756912E-4,1.98756912E-4,1.98756912E-4,1.98756912E-4,1.98756912E-4] 2255 vbroadcastss .LCPI15_9(%rip), %ymm9 # ymm9 = [8.33345205E-3,8.33345205E-3,8.33345205E-3,8.33345205E-3,8.33345205E-3,8.33345205E-3,8.33345205E-3,8.33345205E-3] 2256 vbroadcastss .LCPI15_10(%rip), %ymm10 # ymm10 = [4.16657962E-2,4.16657962E-2,4.16657962E-2,4.16657962E-2,4.16657962E-2,4.16657962E-2,4.16657962E-2,4.16657962E-2] 2257 vbroadcastss .LCPI15_11(%rip), %ymm11 # ymm11 = 
[1.66666657E-1,1.66666657E-1,1.66666657E-1,1.66666657E-1,1.66666657E-1,1.66666657E-1,1.66666657E-1,1.66666657E-1] 2258 vbroadcastss .LCPI15_12(%rip), %ymm12 # ymm12 = [3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38,3.40282347E+38] 2259 .LBB15_2: # =>This Inner Loop Header: Depth=1 2260 vmovups (%rdi,%rax,4), %ymm13 2261 vmovaps %ymm3, %ymm14 2262 vfmadd213ps %ymm2, %ymm13, %ymm14 # ymm14 = (ymm13 * ymm14) + ymm2 2263 vroundps $1, %ymm14, %ymm14 2264 vmovaps %ymm4, %ymm15 2265 vfmadd213ps %ymm13, %ymm14, %ymm15 # ymm15 = (ymm14 * ymm15) + ymm13 2266 vfmadd231ps %ymm5, %ymm14, %ymm15 # ymm15 = (ymm14 * ymm5) + ymm15 2267 vmulps %ymm15, %ymm15, %ymm0 2268 vmovaps %ymm1, %ymm8 2269 vfmadd213ps %ymm7, %ymm15, %ymm8 # ymm8 = (ymm15 * ymm8) + ymm7 2270 vfmadd213ps %ymm9, %ymm15, %ymm8 # ymm8 = (ymm15 * ymm8) + ymm9 2271 vfmadd213ps %ymm10, %ymm15, %ymm8 # ymm8 = (ymm15 * ymm8) + ymm10 2272 vfmadd213ps %ymm11, %ymm15, %ymm8 # ymm8 = (ymm15 * ymm8) + ymm11 2273 vfmadd213ps %ymm2, %ymm15, %ymm8 # ymm8 = (ymm15 * ymm8) + ymm2 2274 vfmadd213ps %ymm15, %ymm0, %ymm8 # ymm8 = (ymm0 * ymm8) + ymm15 2275 vcvttps2dq %ymm14, %ymm0 2276 vpslld $23, %ymm0, %ymm0 2277 vpaddd %ymm6, %ymm0, %ymm0 2278 vfmadd213ps %ymm0, %ymm0, %ymm8 # ymm8 = (ymm0 * ymm8) + ymm0 2279 vmovups -40(%rsp), %ymm0 # 32-byte Reload 2280 vcmpltps %ymm13, %ymm0, %ymm0 2281 vblendvps %ymm0, %ymm12, %ymm8, %ymm0 2282 vmovups -72(%rsp), %ymm8 # 32-byte Reload 2283 vcmpleps %ymm13, %ymm8, %ymm8 2284 vandps %ymm0, %ymm8, %ymm0 2285 vmovups %ymm0, (%rdi,%rax,4) 2286 addq $8, %rax 2287 cmpq %rsi, %rax 2288 jb .LBB15_2 2289 .LBB15_3: 2290 vzeroupper 2291 retq 2292 .LCPI16_0: 2293 .long 2147483647 # 0x7fffffff 2294 .LCPI16_1: 2295 .long 0x3fa2f983 # float 1.27323949 2296 .LCPI16_2: 2297 .long 4294967294 # 0xfffffffe 2298 .LCPI16_3: 2299 .long 2 # 0x2 2300 .LCPI16_4: 2301 .long 0xbf490fdb # float -0.785398185 2302 .LCPI16_5: 2303 .long 2147483648 # 0x80000000 2304 
.LCPI16_6: 2305 .long 0x37ccf5ce # float 2.44331568E-5 2306 .LCPI16_7: 2307 .long 0xbab6061a # float -0.00138873165 2308 .LCPI16_8: 2309 .long 0x3d2aaaa5 # float 0.0416666456 2310 .LCPI16_9: 2311 .long 0xbf000000 # float -0.5 2312 .LCPI16_10: 2313 .long 0x3f800000 # float 1 2314 .LCPI16_11: 2315 .long 0xb94ca1f9 # float -1.95152956E-4 2316 .LCPI16_12: 2317 .long 0x3c08839e # float 0.00833216123 2318 .LCPI16_13: 2319 .long 0xbe2aaaa3 # float -0.166666552 2320 .LCPI16_14: 2321 .long 0x4b7fffff # float 16777215 2322 .LCPI16_15: 2323 .long 0x00000000 # float 0 2324 .LCPI16_16: 2325 .zero 32,255 2326 .LCPI16_17: 2327 .zero 32 2328 Sin_F32_V(float*, unsigned long): # @Sin_F32_V(float*, unsigned long) 2329 pushq %rax 2330 movq %rsi, %rax 2331 andq $-8, %rax 2332 je .LBB16_3 2333 xorl %ecx, %ecx 2334 vbroadcastss .LCPI16_0(%rip), %ymm0 # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] 2335 vmovups %ymm0, -32(%rsp) # 32-byte Spill 2336 vbroadcastss .LCPI16_1(%rip), %ymm0 # ymm0 = [1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0,1.27323949E+0] 2337 vmovups %ymm0, -64(%rsp) # 32-byte Spill 2338 vbroadcastss .LCPI16_2(%rip), %ymm0 # ymm0 = [4294967294,4294967294,4294967294,4294967294,4294967294,4294967294,4294967294,4294967294] 2339 vmovups %ymm0, -96(%rsp) # 32-byte Spill 2340 vpbroadcastd .LCPI16_3(%rip), %ymm4 # ymm4 = [2,2,2,2,2,2,2,2] 2341 vpbroadcastd .LCPI16_4(%rip), %ymm0 # ymm0 = [-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1,-7.85398185E-1] 2342 vmovdqu %ymm0, -128(%rsp) # 32-byte Spill 2343 vpbroadcastd .LCPI16_5(%rip), %ymm7 # ymm7 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] 2344 vbroadcastss .LCPI16_6(%rip), %ymm8 # ymm8 = [2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5,2.44331568E-5] 2345 vbroadcastss 
.LCPI16_7(%rip), %ymm9 # ymm9 = [-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3,-1.38873165E-3] 2346 vbroadcastss .LCPI16_8(%rip), %ymm10 # ymm10 = [4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2,4.16666456E-2] 2347 vbroadcastss .LCPI16_9(%rip), %ymm11 # ymm11 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] 2348 vbroadcastss .LCPI16_10(%rip), %ymm12 # ymm12 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 2349 vbroadcastss .LCPI16_11(%rip), %ymm3 # ymm3 = [-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4,-1.95152956E-4] 2350 vbroadcastss .LCPI16_12(%rip), %ymm14 # ymm14 = [8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3,8.33216123E-3] 2351 vbroadcastss .LCPI16_13(%rip), %ymm15 # ymm15 = [-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1,-1.66666552E-1] 2352 .LBB16_2: # =>This Inner Loop Header: Depth=1 2353 vmovups (%rdi,%rcx,4), %ymm2 2354 vandps -32(%rsp), %ymm2, %ymm5 # 32-byte Folded Reload 2355 vmulps -64(%rsp), %ymm5, %ymm0 # 32-byte Folded Reload 2356 vcvttps2dq %ymm0, %ymm0 2357 vpsubd .LCPI16_16(%rip), %ymm0, %ymm0 2358 vpand -96(%rsp), %ymm0, %ymm1 # 32-byte Folded Reload 2359 vcvtdq2ps %ymm1, %ymm1 2360 vfmadd132ps -128(%rsp), %ymm5, %ymm1 # 32-byte Folded Reload 2361 vmulps %ymm1, %ymm1, %ymm5 2362 vmovaps %ymm3, %ymm13 2363 vfmadd213ps %ymm14, %ymm5, %ymm13 # ymm13 = (ymm5 * ymm13) + ymm14 2364 vfmadd213ps %ymm15, %ymm5, %ymm13 # ymm13 = (ymm5 * ymm13) + ymm15 2365 vmulps %ymm1, %ymm5, %ymm6 2366 vfmadd213ps %ymm1, %ymm13, %ymm6 # ymm6 = (ymm13 * ymm6) + ymm1 2367 vpslld $29, %ymm0, %ymm1 2368 vpand %ymm4, %ymm0, %ymm0 2369 vpxor %ymm2, %ymm1, %ymm1 2370 vmovaps %ymm8, %ymm2 2371 vfmadd213ps %ymm9, %ymm5, %ymm2 # ymm2 = (ymm5 * ymm2) + ymm9 2372 vfmadd213ps 
%ymm10, %ymm5, %ymm2 # ymm2 = (ymm5 * ymm2) + ymm10 2373 vfmadd213ps %ymm11, %ymm5, %ymm2 # ymm2 = (ymm5 * ymm2) + ymm11 2374 vfmadd213ps %ymm12, %ymm5, %ymm2 # ymm2 = (ymm5 * ymm2) + ymm12 2375 vpcmpeqd %ymm4, %ymm0, %ymm5 2376 vandps %ymm5, %ymm2, %ymm2 2377 vpcmpeqd .LCPI16_17(%rip), %ymm0, %ymm0 2378 vandps %ymm0, %ymm6, %ymm0 2379 vaddps %ymm2, %ymm0, %ymm0 2380 vpand %ymm7, %ymm1, %ymm1 2381 vpxor %ymm0, %ymm1, %ymm0 2382 vmovdqu %ymm0, (%rdi,%rcx,4) 2383 addq $8, %rcx 2384 cmpq %rax, %rcx 2385 jb .LBB16_2 2386 .LBB16_3: 2387 cmpq %rsi, %rax 2388 jae .LBB16_14 2389 vbroadcastss .LCPI16_5(%rip), %xmm0 # xmm0 = [2147483648,2147483648,2147483648,2147483648] 2390 vmovss .LCPI16_14(%rip), %xmm1 # xmm1 = mem[0],zero,zero,zero 2391 vmovss .LCPI16_1(%rip), %xmm9 # xmm9 = mem[0],zero,zero,zero 2392 vmovss .LCPI16_10(%rip), %xmm10 # xmm10 = mem[0],zero,zero,zero 2393 vmovss .LCPI16_4(%rip), %xmm11 # xmm11 = mem[0],zero,zero,zero 2394 vmovss .LCPI16_6(%rip), %xmm13 # xmm13 = mem[0],zero,zero,zero 2395 vmovss .LCPI16_7(%rip), %xmm12 # xmm12 = mem[0],zero,zero,zero 2396 vmovss .LCPI16_8(%rip), %xmm14 # xmm14 = mem[0],zero,zero,zero 2397 vmovss .LCPI16_9(%rip), %xmm15 # xmm15 = mem[0],zero,zero,zero 2398 vmovss .LCPI16_11(%rip), %xmm8 # xmm8 = mem[0],zero,zero,zero 2399 vmovss .LCPI16_12(%rip), %xmm5 # xmm5 = mem[0],zero,zero,zero 2400 vmovss .LCPI16_13(%rip), %xmm7 # xmm7 = mem[0],zero,zero,zero 2401 jmp .LBB16_5 2402 .LBB16_13: # in Loop: Header=BB16_5 Depth=1 2403 incq %rax 2404 cmpq %rsi, %rax 2405 jae .LBB16_14 2406 .LBB16_5: # =>This Inner Loop Header: Depth=1 2407 vmovss (%rdi,%rax,4), %xmm2 # xmm2 = mem[0],zero,zero,zero 2408 vxorps %xmm0, %xmm2, %xmm3 2409 vmaxss %xmm2, %xmm3, %xmm6 2410 vucomiss %xmm1, %xmm6 2411 ja .LBB16_13 2412 vucomiss .LCPI16_15(%rip), %xmm2 2413 vmulss %xmm6, %xmm9, %xmm2 2414 vcvttss2si %xmm2, %ecx 2415 setb %r8b 2416 vroundss $11, %xmm2, %xmm2, %xmm2 2417 movl %ecx, %edx 2418 andl $1, %edx 2419 je .LBB16_8 2420 vaddss %xmm2, %xmm10, %xmm2 
2421 .LBB16_8: # in Loop: Header=BB16_5 Depth=1 2422 addl %ecx, %edx 2423 andl $7, %edx 2424 leal -4(%rdx), %ecx 2425 cmpl $4, %edx 2426 cmovbl %edx, %ecx 2427 setae %dl 2428 vfmadd231ss %xmm11, %xmm2, %xmm6 # xmm6 = (xmm2 * xmm11) + xmm6 2429 vmulss %xmm6, %xmm6, %xmm2 2430 vmovaps %xmm13, %xmm3 2431 vfmadd213ss %xmm12, %xmm2, %xmm3 # xmm3 = (xmm2 * xmm3) + xmm12 2432 vfmadd213ss %xmm14, %xmm2, %xmm3 # xmm3 = (xmm2 * xmm3) + xmm14 2433 vfmadd213ss %xmm15, %xmm2, %xmm3 # xmm3 = (xmm2 * xmm3) + xmm15 2434 vmovaps %xmm8, %xmm4 2435 vfmadd213ss %xmm5, %xmm2, %xmm4 # xmm4 = (xmm2 * xmm4) + xmm5 2436 vfmadd213ss %xmm7, %xmm2, %xmm4 # xmm4 = (xmm2 * xmm4) + xmm7 2437 decl %ecx 2438 cmpl $2, %ecx 2439 jb .LBB16_9 2440 vmulss %xmm6, %xmm2, %xmm2 2441 vfmadd213ss %xmm6, %xmm2, %xmm4 # xmm4 = (xmm2 * xmm4) + xmm6 2442 vmovaps %xmm4, %xmm2 2443 vmovss %xmm2, (%rdi,%rax,4) 2444 cmpb %dl, %r8b 2445 je .LBB16_13 2446 jmp .LBB16_12 2447 .LBB16_9: # in Loop: Header=BB16_5 Depth=1 2448 vfmadd213ss %xmm10, %xmm3, %xmm2 # xmm2 = (xmm3 * xmm2) + xmm10 2449 vmovss %xmm2, (%rdi,%rax,4) 2450 cmpb %dl, %r8b 2451 je .LBB16_13 2452 .LBB16_12: # in Loop: Header=BB16_5 Depth=1 2453 vxorps %xmm0, %xmm2, %xmm2 2454 vmovss %xmm2, (%rdi,%rax,4) 2455 jmp .LBB16_13 2456 .LBB16_14: 2457 popq %rax 2458 vzeroupper 2459 retq 2460 .LCPI17_0: 2461 .long 2147483647 # 0x7fffffff 2462 .LCPI17_1: 2463 .long 0x3fa2f983 # float 1.27323949 2464 .LCPI17_2: 2465 .long 4294967294 # 0xfffffffe 2466 .LCPI17_3: 2467 .long 2 # 0x2 2468 .LCPI17_4: 2469 .long 0xbf490fdb # float -0.785398185 2470 .LCPI17_5: 2471 .long 3221225472 # 0xc0000000 2472 .LCPI17_6: 2473 .long 0x37ccf5ce # float 2.44331568E-5 2474 .LCPI17_7: 2475 .long 0xbab6061a # float -0.00138873165 2476 .LCPI17_8: 2477 .long 0x3d2aaaa5 # float 0.0416666456 2478 .LCPI17_9: 2479 .long 0xbf000000 # float -0.5 2480 .LCPI17_10: 2481 .long 0x3f800000 # float 1 2482 .LCPI17_11: 2483 .long 0xb94ca1f9 # float -1.95152956E-4 2484 .LCPI17_12: 2485 .long 0x3c08839e # 
.LCPI17_13:
        .long   0xbe2aaaa3              # float -0.166666552
.LCPI17_14:
        .long   2147483648              # 0x80000000 — sign bit
.LCPI17_15:
        .long   0x4b7fffff              # float 16777215 = 2^24-1 — scalar-loop range cutoff
.LCPI17_16:
        .zero   32,255                  # eight packed int32 -1
.LCPI17_17:
        .zero   32                      # eight packed int32 0

#-----------------------------------------------------------------------
# Cos_F32_V(float *x, unsigned long n)            — SysV AMD64, AVX2+FMA
# In-place cos(x): vector loop over n & ~7 (8 lanes/iter), scalar loop
# for the remainder.  Same Cephes-style octant reduction as Sin_F32_V
# (j = |x|*4/pi rounded to even, r = |x| - j*pi/4, sin and cos
# polynomials in z = r^2 selected by octant); only the quadrant/sign
# bookkeeping differs (cos leads sin by pi/2, hence the 0xc0000000 bias).
# The scalar tail skips elements with |x| > 2^24-1, leaving them as-is.
#-----------------------------------------------------------------------
Cos_F32_V(float*, unsigned long):       # @Cos_F32_V(float*, unsigned long)
        subq    $72, %rsp               # constant spill area
        movq    %rsi, %rax
        andq    $-8, %rax               # rax = n & ~7
        je      .LBB17_3
        xorl    %ecx, %ecx              # rcx = vector element index
        vbroadcastss .LCPI17_0(%rip), %ymm0   # abs mask
        vmovups %ymm0, 32(%rsp)
        vbroadcastss .LCPI17_1(%rip), %ymm0   # 4/pi
        vmovups %ymm0, (%rsp)
        vbroadcastss .LCPI17_2(%rip), %ymm0   # ~1 mask
        vmovups %ymm0, -32(%rsp)
        vpbroadcastd .LCPI17_3(%rip), %ymm4   # 2
        vbroadcastss .LCPI17_4(%rip), %ymm0   # -pi/4
        vmovups %ymm0, -64(%rsp)
        vbroadcastss .LCPI17_5(%rip), %ymm0   # 0xc0000000 quadrant bias
        vmovups %ymm0, -96(%rsp)
        vbroadcastss .LCPI17_6(%rip), %ymm0   # cos c0
        vmovups %ymm0, -128(%rsp)
        vbroadcastss .LCPI17_7(%rip), %ymm9   # cos c1
        vbroadcastss .LCPI17_8(%rip), %ymm10  # cos c2
        vbroadcastss .LCPI17_9(%rip), %ymm6   # -0.5
        vbroadcastss .LCPI17_10(%rip), %ymm12 # 1.0
        vbroadcastss .LCPI17_11(%rip), %ymm13 # sin c0
        vbroadcastss .LCPI17_12(%rip), %ymm14 # sin c1
        vbroadcastss .LCPI17_13(%rip), %ymm15 # sin c2
        vpbroadcastd .LCPI17_14(%rip), %ymm2  # sign bit
.LBB17_2:                               # =>This Inner Loop Header: Depth=1 — 8 floats/iter
        vmovups 32(%rsp), %ymm0
        vandps  (%rdi,%rcx,4), %ymm0, %ymm5 # a = |x|
        vmulps  (%rsp), %ymm5, %ymm0    # a * 4/pi
        vcvttps2dq %ymm0, %ymm0         # j = trunc(a * 4/pi)
        vpsubd  .LCPI17_16(%rip), %ymm0, %ymm0 # j += 1
        vpand   -32(%rsp), %ymm0, %ymm1 # j & ~1 — even octant base
        vcvtdq2ps %ymm1, %ymm3
        vfmadd132ps -64(%rsp), %ymm5, %ymm3 # r = a - (j & ~1) * pi/4
        vmulps  %ymm3, %ymm3, %ymm5     # z = r^2
        vmovups -128(%rsp), %ymm8       # cos polynomial in z
        vfmadd213ps %ymm9, %ymm5, %ymm8
        vfmadd213ps %ymm10, %ymm5, %ymm8
        vmulps  %ymm5, %ymm5, %ymm7     # z^2
        vmovaps %ymm6, %ymm11
        vfmadd213ps %ymm12, %ymm5, %ymm11 # 1 - z/2
        vfmadd231ps %ymm7, %ymm8, %ymm11  # cos(r) = 1 - z/2 + z^2 * poly
        vmovaps %ymm13, %ymm7           # sin polynomial in z
        vfmadd213ps %ymm14, %ymm5, %ymm7
        vfmadd213ps %ymm15, %ymm5, %ymm7
        vmulps  %ymm3, %ymm5, %ymm5     # r^3
        vfmadd213ps %ymm3, %ymm7, %ymm5 # sin(r) = r + r^3 * poly
        vpand   %ymm4, %ymm0, %ymm0     # j & 2
        vpcmpeqd %ymm4, %ymm0, %ymm3    # mask: (j & 2) == 2
        vpcmpeqd .LCPI17_17(%rip), %ymm0, %ymm0 # mask: (j & 2) == 0
        vandps  %ymm0, %ymm5, %ymm0     # sin(r) where (j & 2) == 0
        vandps  %ymm3, %ymm11, %ymm3    # cos(r) where (j & 2) == 2
        vaddps  %ymm3, %ymm0, %ymm0     # value the SIN selection would pick
        vaddps  %ymm5, %ymm11, %ymm3    # sin(r) + cos(r)
        vsubps  %ymm0, %ymm3, %ymm0     # the other branch — the cos selection
        vpslld  $29, %ymm1, %ymm1       # even octant -> sign-bit position
        vpaddd  -96(%rsp), %ymm1, %ymm1 # bias by 0xc0000000 (cos leads sin by pi/2)
        vpand   %ymm2, %ymm1, %ymm1     # isolate sign bit
        vpxor   %ymm2, %ymm1, %ymm1     # invert — cos sign convention
        vxorps  %ymm1, %ymm0, %ymm0     # apply sign
        vmovups %ymm0, (%rdi,%rcx,4)    # store in place
        addq    $8, %rcx
        cmpq    %rax, %rcx
        jb      .LBB17_2
.LBB17_3:                               # scalar remainder loop setup
        cmpq    %rsi, %rax
        jae     .LBB17_14
        vbroadcastss .LCPI17_14(%rip), %xmm0  # sign bit
        vmovss  .LCPI17_15(%rip), %xmm1       # 2^24-1 range cutoff
        vmovss  .LCPI17_1(%rip), %xmm8        # 4/pi
        vmovss  .LCPI17_10(%rip), %xmm9       # 1.0
        vmovss  .LCPI17_4(%rip), %xmm10       # -pi/4
        vmovss  .LCPI17_6(%rip), %xmm12       # cos c0
        vmovss  .LCPI17_7(%rip), %xmm11       # cos c1
        vmovss  .LCPI17_8(%rip), %xmm13       # cos c2
        vmovss  .LCPI17_9(%rip), %xmm14       # -0.5
        vmovss  .LCPI17_11(%rip), %xmm2       # sin c0
        vmovss  .LCPI17_12(%rip), %xmm15      # sin c1
        vmovss  .LCPI17_13(%rip), %xmm6       # sin c2
        jmp     .LBB17_5
.LBB17_13:                              # in Loop: Header=BB17_5 — loop footer
        incq    %rax
        cmpq    %rsi, %rax
        jae     .LBB17_14
.LBB17_5:                               # =>This Inner Loop Header: Depth=1 — one float/iter
        vmovss  (%rdi,%rax,4), %xmm3    # x
        vxorps  %xmm0, %xmm3, %xmm4     # -x
        vmaxss  %xmm3, %xmm4, %xmm5     # |x|
        vucomiss %xmm1, %xmm5
        ja      .LBB17_13               # |x| > 2^24-1: skip element
        vmulss  %xmm5, %xmm8, %xmm3     # |x| * 4/pi
        vcvttss2si %xmm3, %ecx          # j
        vroundss $11, %xmm3, %xmm3, %xmm7 # trunc as float
        movl    %ecx, %edx
        andl    $1, %edx
        je      .LBB17_8
        vaddss  %xmm7, %xmm9, %xmm7     # odd j: bump float j by 1
.LBB17_8:                               # in Loop: Header=BB17_5
        addl    %ecx, %edx              # integer j, now even
        andl    $7, %edx                # octant 0..7
        leal    -4(%rdx), %ecx
        cmpl    $4, %edx
        setae   %r8b                    # r8b = (octant >= 4)
        cmovbl  %edx, %ecx              # ecx = octant mod 4
        cmpl    $2, %ecx
        setae   %dl                     # dl = (octant mod 4 >= 2)
        vfmadd231ss %xmm10, %xmm7, %xmm5 # r = |x| - j*pi/4
        vmulss  %xmm5, %xmm5, %xmm7     # z = r^2
        vmovaps %xmm12, %xmm4           # cos polynomial in z
        vfmadd213ss %xmm11, %xmm7, %xmm4
        vfmadd213ss %xmm13, %xmm7, %xmm4
        vfmadd213ss %xmm14, %xmm7, %xmm4
        vmovaps %xmm2, %xmm3            # sin polynomial in z
        vfmadd213ss %xmm15, %xmm7, %xmm3
        vfmadd213ss %xmm6, %xmm7, %xmm3
        decl    %ecx
        cmpl    $2, %ecx
        jb      .LBB17_9                # octant mod 4 in {1,2}: sin branch
        vfmadd213ss %xmm9, %xmm4, %xmm7 # cos(r) = 1 + z * poly
        vmovaps %xmm7, %xmm3
        vmovss  %xmm3, (%rdi,%rax,4)
        cmpb    %dl, %r8b               # sign flags agree: keep value
        je      .LBB17_13
        jmp     .LBB17_12
.LBB17_9:                               # in Loop: Header=BB17_5 — sin branch
        vmulss  %xmm5, %xmm7, %xmm4     # r^3
        vfmadd213ss %xmm5, %xmm4, %xmm3 # sin(r) = r + r^3 * poly
        vmovss  %xmm3, (%rdi,%rax,4)
        cmpb    %dl, %r8b
        je      .LBB17_13
.LBB17_12:                              # in Loop: Header=BB17_5 — flip the stored sign
        vxorps  %xmm0, %xmm3, %xmm3
        vmovss  %xmm3, (%rdi,%rax,4)
        jmp     .LBB17_13
.LBB17_14:
        addq    $72, %rsp
        vzeroupper
        retq

        # ---- SinCos_F32_V constant pool ----
.LCPI18_0:
        .long   2147483647              # 0x7fffffff — abs mask
.LCPI18_1:
        .long   0x3fa2f983              # float 1.27323949 = 4/pi
.LCPI18_2:
        .long   4294967294              # 0xfffffffe — clears bit 0
.LCPI18_3:
        .long   2                       # 0x2
.LCPI18_4:
        .long   0xbf490fdb              # float -0.785398185 = -pi/4
.LCPI18_5:
        .long   3221225472              # 0xc0000000 — quadrant bias for the cos sign
.LCPI18_6:
        .long   2147483648              # 0x80000000 — sign bit
.LCPI18_7:
        .long   0x37ccf5ce              # float 2.44331568e-5 — cos poly leading coeff
.LCPI18_8:
        .long   0xbab6061a              # float -0.00138873165
.LCPI18_9:
        .long   0x3d2aaaa5              # float 0.0416666456
.LCPI18_10:
        .long                           # value 0xbf000000 (float -0.5) continues on the next source line
0xbf000000                              # float -0.5 — operand of the ".long" (.LCPI18_10) ending the previous source line
.LCPI18_11:
        .long   0x3f800000              # float 1
.LCPI18_12:
        .long   0xb94ca1f9              # float -1.95152956e-4 — sin poly leading coeff
.LCPI18_13:
        .long   0x3c08839e              # float 0.00833216123
.LCPI18_14:
        .long   0xbe2aaaa3              # float -0.166666552
.LCPI18_15:
        .long   0x4b7fffff              # float 16777215 = 2^24-1 — scalar-loop range cutoff
.LCPI18_16:
        .long   0x00000000              # float 0
.LCPI18_17:
        .zero   32,255                  # eight packed int32 -1
.LCPI18_18:
        .zero   32                      # eight packed int32 0

#-----------------------------------------------------------------------
# SinCos_F32_V(float *sin_out, float *cos_out, float *x, unsigned long n)
# SysV AMD64 (rdi = sin_out, rsi = cos_out, rdx = x, rcx = n), AVX2+FMA.
# Computes sin(x[i]) and cos(x[i]) in one pass: a single octant reduction
# and one sin/cos polynomial pair serve both outputs.  Vector loop over
# n & ~7 (8 lanes/iter), scalar loop for the rest; the scalar tail skips
# |x| > 2^24-1, leaving BOTH outputs unchanged for those elements.
# NOTE(review): output roles inferred from the stores (first pointer gets
# the sin-branch result, second the cos-branch) — confirm vs. Go wrapper.
#-----------------------------------------------------------------------
SinCos_F32_V(float*, float*, float*, unsigned long): # @SinCos_F32_V(float*, float*, float*, unsigned long)
        pushq   %rbx                    # rbx (callee-saved) used in the scalar loop
        subq    $96, %rsp               # constant spill area
        movq    %rcx, %r8
        andq    $-8, %r8                # r8 = n & ~7
        je      .LBB18_3
        xorl    %eax, %eax              # rax = vector element index
        vbroadcastss .LCPI18_0(%rip), %ymm0   # abs mask
        vmovups %ymm0, 64(%rsp)
        vbroadcastss .LCPI18_1(%rip), %ymm0   # 4/pi
        vmovups %ymm0, 32(%rsp)
        vbroadcastss .LCPI18_2(%rip), %ymm0   # ~1 mask
        vmovups %ymm0, (%rsp)
        vpbroadcastd .LCPI18_3(%rip), %ymm4   # 2
        vbroadcastss .LCPI18_4(%rip), %ymm0   # -pi/4
        vmovups %ymm0, -32(%rsp)
        vbroadcastss .LCPI18_5(%rip), %ymm0   # 0xc0000000 quadrant bias
        vmovups %ymm0, -64(%rsp)
        vpbroadcastd .LCPI18_6(%rip), %ymm8   # sign bit
        vbroadcastss .LCPI18_7(%rip), %ymm0   # cos c0
        vmovups %ymm0, -96(%rsp)
        vbroadcastss .LCPI18_8(%rip), %ymm0   # cos c1
        vmovups %ymm0, -128(%rsp)
        vbroadcastss .LCPI18_9(%rip), %ymm11  # cos c2
        vbroadcastss .LCPI18_10(%rip), %ymm10 # -0.5
        vbroadcastss .LCPI18_11(%rip), %ymm13 # 1.0
        vbroadcastss .LCPI18_12(%rip), %ymm14 # sin c0
        vbroadcastss .LCPI18_13(%rip), %ymm15 # sin c1
        vbroadcastss .LCPI18_14(%rip), %ymm2  # sin c2
.LBB18_2:                               # =>This Inner Loop Header: Depth=1 — 8 floats/iter
        vmovups (%rdx,%rax,4), %ymm5    # ymm5 = x
        vandps  64(%rsp), %ymm5, %ymm1  # a = |x|
        vmulps  32(%rsp), %ymm1, %ymm0  # a * 4/pi
        vcvttps2dq %ymm0, %ymm0         # j = trunc(a * 4/pi)
        vpsubd  .LCPI18_17(%rip), %ymm0, %ymm3 # j += 1
        vpand   (%rsp), %ymm3, %ymm0    # j & ~1 — even octant base
        vcvtdq2ps %ymm0, %ymm6
        vfmadd132ps -32(%rsp), %ymm1, %ymm6 # r = a - (j & ~1) * pi/4
        vmulps  %ymm6, %ymm6, %ymm1     # z = r^2
        vmovups -96(%rsp), %ymm9        # cos polynomial in z
        vfmadd213ps -128(%rsp), %ymm1, %ymm9
        vfmadd213ps %ymm11, %ymm1, %ymm9
        vmulps  %ymm1, %ymm1, %ymm7     # z^2
        vmovaps %ymm10, %ymm12
        vfmadd213ps %ymm13, %ymm1, %ymm12 # 1 - z/2
        vfmadd231ps %ymm7, %ymm9, %ymm12  # cos(r) = 1 - z/2 + z^2 * poly
        vmovaps %ymm14, %ymm7           # sin polynomial in z
        vfmadd213ps %ymm15, %ymm1, %ymm7
        vfmadd213ps %ymm2, %ymm1, %ymm7
        vmulps  %ymm6, %ymm1, %ymm1     # r^3
        vfmadd213ps %ymm6, %ymm7, %ymm1 # sin(r) = r + r^3 * poly
        vpslld  $29, %ymm3, %ymm6       # raw octant -> sign-bit position (sin sign)
        vpand   %ymm4, %ymm3, %ymm3     # j & 2 — sin/cos branch selector
        vpxor   %ymm5, %ymm6, %ymm5     # fold in the sign of x
        vpcmpeqd %ymm4, %ymm3, %ymm6    # mask: (j & 2) == 2
        vpcmpeqd .LCPI18_18(%rip), %ymm3, %ymm3 # mask: (j & 2) == 0
        vandps  %ymm3, %ymm1, %ymm3     # sin(r) where (j & 2) == 0
        vandps  %ymm6, %ymm12, %ymm6    # cos(r) where (j & 2) == 2
        vaddps  %ymm3, %ymm6, %ymm3     # sin result, pre-sign
        vaddps  %ymm1, %ymm12, %ymm1    # sin(r) + cos(r)
        vpand   %ymm5, %ymm8, %ymm5     # sin sign bit
        vsubps  %ymm3, %ymm1, %ymm1     # the other branch = cos result, pre-sign
        vpxor   %ymm3, %ymm5, %ymm3     # signed sin
        vpslld  $29, %ymm0, %ymm0       # even octant -> sign-bit position
        vpaddd  -64(%rsp), %ymm0, %ymm0 # bias by 0xc0000000 (cos leads sin by pi/2)
        vpand   %ymm0, %ymm8, %ymm0
        vpxor   %ymm0, %ymm8, %ymm0     # cos sign bit (inverted convention)
        vxorps  %ymm0, %ymm1, %ymm0     # signed cos
        vmovdqu %ymm3, (%rdi,%rax,4)    # sin_out[i .. i+7]
        vmovups %ymm0, (%rsi,%rax,4)    # cos_out[i .. i+7]
        addq    $8, %rax
        cmpq    %r8, %rax
        jb      .LBB18_2
.LBB18_3:                               # scalar remainder loop setup
        cmpq    %rcx, %r8
        jae     .LBB18_16
        vbroadcastss .LCPI18_6(%rip), %xmm0   # sign bit
        vmovss  .LCPI18_15(%rip), %xmm1       # 2^24-1 range cutoff
        vmovss  .LCPI18_11(%rip), %xmm3       # 1.0
        vmovss  .LCPI18_7(%rip), %xmm8        # cos c0
        vmovss  .LCPI18_8(%rip), %xmm11       # cos c1
        vmovss  .LCPI18_9(%rip), %xmm13       # cos c2
        vmovss  .LCPI18_10(%rip), %xmm14      # -0.5
        vmovss  .LCPI18_12(%rip), %xmm10      # sin c0
        vmovss  .LCPI18_13(%rip), %xmm15      # sin c1
        vmovss  .LCPI18_14(%rip), %xmm6       # sin c2
        jmp     .LBB18_5
.LBB18_15:                              # in Loop: Header=BB18_5 — loop footer
        incq    %r8
        cmpq    %rcx, %r8
        jae     .LBB18_16
.LBB18_5:                               # =>This Inner Loop Header: Depth=1 — one float/iter
        vmovss  (%rdx,%r8,4), %xmm4     # x
        vxorps  %xmm0, %xmm4, %xmm2     # -x
        vmaxss  %xmm4, %xmm2, %xmm2     # |x|
        vucomiss %xmm1, %xmm2
        ja      .LBB18_15               # |x| > 2^24-1: skip element entirely
        vucomiss .LCPI18_16(%rip), %xmm4 # compare x with 0 ...
        vmulss  .LCPI18_1(%rip), %xmm2, %xmm4 # |x| * 4/pi
        vcvttss2si %xmm4, %r10d         # j
        setb    %r9b                    # ... r9b = (x < 0), captured before flags die
        vroundss $11, %xmm4, %xmm4, %xmm4 # trunc as float
        movl    %r10d, %eax
        andl    $1, %eax
        je      .LBB18_8
        vaddss  %xmm3, %xmm4, %xmm4     # odd j: bump float j by 1
.LBB18_8:                               # in Loop: Header=BB18_5
        addl    %r10d, %eax             # integer j, now even
        andl    $7, %eax                # octant 0..7
        leal    -4(%rax), %r10d
        cmpl    $4, %eax
        setae   %r11b                   # r11b = (octant >= 4)
        cmovbl  %eax, %r10d             # r10d = octant mod 4
        vfmadd231ss .LCPI18_4(%rip), %xmm4, %xmm2 # r = |x| - j*pi/4
        vmulss  %xmm2, %xmm2, %xmm7     # z = r^2
        vmovaps %xmm8, %xmm12           # cos polynomial in z
        vfmadd213ss %xmm11, %xmm7, %xmm12
        vfmadd213ss %xmm13, %xmm7, %xmm12
        vmulss  %xmm7, %xmm7, %xmm9     # z^2
        vmovaps %xmm3, %xmm4
        vfmadd231ss %xmm14, %xmm7, %xmm4  # 1 - z/2
        vfmadd231ss %xmm9, %xmm12, %xmm4  # xmm4 = cos(r)
        vmovaps %xmm10, %xmm5           # sin polynomial in z
        vfmadd213ss %xmm15, %xmm7, %xmm5
        vfmadd213ss %xmm6, %xmm7, %xmm5
        vmulss  %xmm2, %xmm7, %xmm7     # r^3
        vfmadd213ss %xmm2, %xmm5, %xmm7 # xmm7 = sin(r)
        leal    -1(%r10), %ebx
        cmpl    $2, %ebx
        jb      .LBB18_9                # octant mod 4 in {1,2}: swap the roles
        vmovaps %xmm7, %xmm2            # octants {0,3}: sin(r) -> sin slot, cos(r) -> cos slot
        vmovss  %xmm2, (%rdi,%r8,4)
        vmovss  %xmm4, (%rsi,%r8,4)
        cmpb    %r11b, %r9b             # sign(x) vs half-circle flag
        jne     .LBB18_12               # differ: sin output needs negating
        jmp     .LBB18_13
.LBB18_9:                               # in Loop: Header=BB18_5 — swapped branch
        vmovaps %xmm4, %xmm2            # cos(r) -> sin slot
        vmovaps %xmm7, %xmm4            # sin(r) -> cos slot
        vmovss  %xmm2, (%rdi,%r8,4)
        vmovss  %xmm4, (%rsi,%r8,4)
        cmpb    %r11b, %r9b
        je      .LBB18_13
.LBB18_12:                              # in Loop: Header=BB18_5 — negate the sin output
        vmovss  (%rdi,%r8,4), %xmm2
        vxorps  %xmm0, %xmm2, %xmm2
        vmovss  %xmm2, (%rdi,%r8,4)
.LBB18_13:                              # in Loop: Header=BB18_5 — cos sign fix-up:
        cmpl    $2, %r10d               # compare (octant mod 4 >= 2) ...
        setae   %bl
        cmpl    $4, %eax                # ... with (octant >= 4)
        setae   %al
        cmpb    %bl, %al
        je      .LBB18_15               # equal: cos sign already correct
        vmovss  (%rsi,%r8,4), %xmm2     # negate the cos output
        vxorps  %xmm0, %xmm2, %xmm2
        vmovss  %xmm2, (%rsi,%r8,4)
        jmp     .LBB18_15
.LBB18_16:
        addq    $96, %rsp
        popq    %rbx
        vzeroupper
        retq