gitee.com/quant1x/gox@v1.7.6/num/asm/special.go (about) 1 package main 2 3 import ( 4 . "github.com/mmcloughlin/avo/build" 5 . "github.com/mmcloughlin/avo/operand" 6 . "github.com/mmcloughlin/avo/reg" 7 ) 8 9 func genSqrt_F64() { 10 11 TEXT("Sqrt_AVX2_F64", NOSPLIT, "func(x []float64) float64") 12 Pragma("noescape") 13 Load(Param("x").Base(), RDI) 14 Load(Param("x").Len(), RSI) 15 16 TESTQ(RSI, RSI) 17 JE(LabelRef("LBB0_7")) 18 CMPQ(RSI, Imm(4)) 19 JAE(LabelRef("LBB0_3")) 20 XORL(EAX, EAX) 21 JMP(LabelRef("LBB0_6")) 22 23 Label("LBB0_3") 24 { 25 MOVQ(RSI, RAX) 26 ANDQ(I32(-4), RAX) 27 XORL(ECX, ECX) 28 } 29 30 Label("LBB0_4") 31 { 32 VSQRTPD(Mem{Base: RDI}.Idx(RCX, 8), Y0) 33 VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8)) 34 ADDQ(Imm(4), RCX) 35 CMPQ(RAX, RCX) 36 JNE(LabelRef("LBB0_4")) 37 CMPQ(RAX, RSI) 38 JE(LabelRef("LBB0_7")) 39 } 40 41 Label("LBB0_6") 42 { 43 VMOVSD(Mem{Base: RDI}.Idx(RAX, 8), X0) 44 VSQRTSD(X0, X0, X0) 45 VMOVSD(X0, Mem{Base: RDI}.Idx(RAX, 8)) 46 ADDQ(Imm(1), RAX) 47 CMPQ(RSI, RAX) 48 JNE(LabelRef("LBB0_6")) 49 } 50 51 Label("LBB0_7") 52 { 53 VZEROUPPER() 54 Store(X0, ReturnIndex(0)) 55 RET() 56 } 57 } 58 59 func genSqrt_F32() { 60 data := GLOBL("dataSqrtF32", RODATA|NOPTR) 61 DATA(0, U32(0xc0400000)) 62 DATA(4, U32(0xbf000000)) 63 DATA(8, U32(0x7fffffff)) 64 DATA(12, U32(0x00800000)) 65 66 TEXT("Sqrt_AVX2_F32", NOSPLIT, "func(x []float32) float32") 67 Pragma("noescape") 68 Load(Param("x").Base(), RDI) 69 Load(Param("x").Len(), RSI) 70 71 TESTQ(RSI, RSI) 72 JE(LabelRef("LBB1_8")) 73 CMPQ(RSI, Imm(32)) 74 JAE(LabelRef("LBB1_3")) 75 XORL(EAX, EAX) 76 JMP(LabelRef("LBB1_6")) 77 78 Label("LBB1_3") 79 { 80 MOVQ(RSI, RAX) 81 ANDQ(I32(-32), RAX) 82 XORL(ECX, ECX) 83 VBROADCASTSS(data.Offset(0), Y0) 84 VBROADCASTSS(data.Offset(4), Y1) 85 VBROADCASTSS(data.Offset(8), Y2) 86 VBROADCASTSS(data.Offset(12), Y3) 87 } 88 89 Label("LBB1_4") 90 { 91 VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4), Y4) 92 VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y5) 93 VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y6) 94 VRSQRTPS(Y4, Y7) 95 VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y8) 96 VMULPS(Y7, Y4, Y9) 97 VFMADD213PS(Y0, Y9, Y7) 98 VMULPS(Y1, Y9, Y9) 99 VMULPS(Y7, Y9, Y7) 100 VANDPS(Y2, Y4, Y4) 101 VCMPPS(Imm(2), Y4, Y3, Y4) 102 VANDPS(Y7, Y4, Y4) 103 VRSQRTPS(Y5, Y7) 104 VMULPS(Y7, Y5, Y9) 105 VFMADD213PS(Y0, Y9, Y7) 106 VMULPS(Y1, Y9, Y9) 107 VMULPS(Y7, Y9, Y7) 108 VANDPS(Y2, Y5, Y5) 109 VCMPPS(Imm(2), Y5, Y3, Y5) 110 VRSQRTPS(Y6, Y9) 111 VANDPS(Y7, Y5, Y5) 112 VMULPS(Y6, Y9, Y7) 113 VFMADD213PS(Y0, Y7, Y9) 114 VMULPS(Y1, Y7, Y7) 115 VMULPS(Y7, Y9, Y7) 116 VANDPS(Y2, Y6, Y6) 117 VCMPPS(Imm(2), Y6, Y3, Y6) 118 VANDPS(Y7, Y6, Y6) 119 VRSQRTPS(Y8, Y7) 120 VMULPS(Y7, Y8, Y9) 121 VFMADD213PS(Y0, Y9, Y7) 122 VMULPS(Y1, Y9, Y9) 123 VMULPS(Y7, Y9, Y7) 124 VANDPS(Y2, Y8, Y8) 125 VCMPPS(Imm(2), Y8, Y3, Y8) 126 VANDPS(Y7, Y8, Y7) 127 VMOVUPS(Y4, Mem{Base: RDI}.Idx(RCX, 4)) 128 VMOVUPS(Y5, Mem{Base: RDI}.Idx(RCX, 4).Offset(32)) 129 VMOVUPS(Y6, Mem{Base: RDI}.Idx(RCX, 4).Offset(64)) 130 VMOVUPS(Y7, Mem{Base: RDI}.Idx(RCX, 4).Offset(96)) 131 ADDQ(Imm(32), RCX) 132 CMPQ(RAX, RCX) 133 JNE(LabelRef("LBB1_4")) 134 CMPQ(RAX, RSI) 135 JE(LabelRef("LBB1_8")) 136 } 137 138 Label("LBB1_6") 139 { 140 VMOVSS(data.Offset(0), X0) 141 VMOVSS(data.Offset(4), X1) 142 VBROADCASTSS(data.Offset(8), X2) 143 VMOVSS(data.Offset(12), X3) 144 } 145 146 Label("LBB1_7") 147 { 148 VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X4) 149 VRSQRTSS(X4, X4, X5) 150 VMULSS(X5, X4, X6) 151 VFMADD213SS(X0, X6, X5) 152 VMULSS(X1, X6, X6) 153 VMULSS(X5, X6, X5) 154 VANDPS(X2, X4, X4) 155 VCMPSS(Imm(1), X3, X4, X4) 156 VANDNPS(X5, X4, X4) 157 VMOVSS(X4, Mem{Base: RDI}.Idx(RAX, 4)) 158 ADDQ(Imm(1), RAX) 159 CMPQ(RSI, RAX) 160 JNE(LabelRef("LBB1_7")) 161 } 162 163 Label("LBB1_8") 164 { 165 VZEROUPPER() 166 Store(X0, ReturnIndex(0)) 167 RET() 168 } 169 } 170 171 func genRound_F64() { 172 173 data := GLOBL("dataRoundF64", RODATA|NOPTR) 174 DATA(0, U64(0x8000000000000000)) 175 DATA(8, U64(0x3fdfffffffffffff)) 176 DATA(16, U64(0x8000000000000000)) 177 DATA(24, U64(0x8000000000000000)) 178 179 TEXT("Round_AVX2_F64", NOSPLIT, "func(x []float64) float64") 180 Pragma("noescape") 181 Load(Param("x").Base(), RDI) 182 Load(Param("x").Len(), RSI) 183 184 TESTQ(RSI, RSI) 185 JE(LabelRef("LBB2_8")) 186 CMPQ(RSI, Imm(16)) 187 JAE(LabelRef("LBB2_3")) 188 XORL(EAX, EAX) 189 JMP(LabelRef("LBB2_6")) 190 191 Label("LBB2_3") 192 { 193 MOVQ(RSI, RAX) 194 ANDQ(I32(-16), RAX) 195 XORL(ECX, ECX) 196 VBROADCASTSD(data.Offset(0), Y0) 197 VBROADCASTSD(data.Offset(8), Y1) 198 } 199 200 Label("LBB2_4") 201 { 202 VMOVUPD(Mem{Base: RDI}.Idx(RCX, 8), Y2) 203 VMOVUPD(Mem{Base: RDI}.Idx(RCX, 8).Offset(32), Y3) 204 VMOVUPD(Mem{Base: RDI}.Idx(RCX, 8).Offset(64), Y4) 205 VMOVUPD(Mem{Base: RDI}.Idx(RCX, 8).Offset(96), Y5) 206 VANDPD(Y0, Y2, Y6) 207 VORPD(Y1, Y6, Y6) 208 VADDPD(Y6, Y2, Y2) 209 VROUNDPD(Imm(11), Y2, Y2) 210 VANDPD(Y0, Y3, Y6) 211 VORPD(Y1, Y6, Y6) 212 VADDPD(Y6, Y3, Y3) 213 VROUNDPD(Imm(11), Y3, Y3) 214 VANDPD(Y0, Y4, Y6) 215 VORPD(Y1, Y6, Y6) 216 VADDPD(Y6, Y4, Y4) 217 VROUNDPD(Imm(11), Y4, Y4) 218 VANDPD(Y0, Y5, Y6) 219 VORPD(Y1, Y6, Y6) 220 VADDPD(Y6, Y5, Y5) 221 VROUNDPD(Imm(11), Y5, Y5) 222 VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8)) 223 VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(32)) 224 VMOVUPD(Y4, Mem{Base: RDI}.Idx(RCX, 8).Offset(64)) 225 VMOVUPD(Y5, Mem{Base: RDI}.Idx(RCX, 8).Offset(96)) 226 ADDQ(Imm(16), RCX) 227 CMPQ(RAX, RCX) 228 JNE(LabelRef("LBB2_4")) 229 CMPQ(RAX, RSI) 230 JE(LabelRef("LBB2_8")) 231 } 232 233 Label("LBB2_6") 234 { 235 VMOVUPD(data.Offset(16), X0) 236 VMOVDDUP(data.Offset(8), X1) 237 } 238 239 Label("LBB2_7") 240 { 241 VMOVSD(Mem{Base: RDI}.Idx(RAX, 8), X2) 242 VANDPD(X0, X2, X3) 243 VORPD(X1, X3, X3) 244 VADDSD(X3, X2, X2) 245 VROUNDSD(Imm(11), X2, X2, X2) 246 VMOVSD(X2, Mem{Base: RDI}.Idx(RAX, 8)) 247 ADDQ(Imm(1), RAX) 248 CMPQ(RSI, RAX) 249 JNE(LabelRef("LBB2_7")) 250 } 251 252 Label("LBB2_8") 253 { 254 VZEROUPPER() 255 Store(X0, ReturnIndex(0)) 256 RET() 257 } 258 } 259 260 func genRound_F32() { 261 262 data := GLOBL("dataRoundF32", RODATA|NOPTR) 263 DATA(0, U32(0x80000000)) 264 DATA(4, U32(0x3effffff)) 265 266 TEXT("Round_AVX2_F32", NOSPLIT, "func(x []float32) float32") 267 Pragma("noescape") 268 Load(Param("x").Base(), RDI) 269 Load(Param("x").Len(), RSI) 270 271 TESTQ(RSI, RSI) 272 JE(LabelRef("LBB3_8")) 273 CMPQ(RSI, Imm(32)) 274 JAE(LabelRef("LBB3_3")) 275 XORL(EAX, EAX) 276 JMP(LabelRef("LBB3_6")) 277 278 Label("LBB3_3") 279 { 280 MOVQ(RSI, RAX) 281 ANDQ(I32(-32), RAX) 282 XORL(ECX, ECX) 283 VBROADCASTSS(data.Offset(0), Y0) 284 VBROADCASTSS(data.Offset(4), Y1) 285 } 286 287 Label("LBB3_4") 288 { 289 VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4), Y2) 290 VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y3) 291 VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y4) 292 VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y5) 293 VANDPS(Y0, Y2, Y6) 294 VORPS(Y1, Y6, Y6) 295 VADDPS(Y6, Y2, Y2) 296 VROUNDPS(Imm(11), Y2, Y2) 297 VANDPS(Y0, Y3, Y6) 298 VORPS(Y1, Y6, Y6) 299 VADDPS(Y6, Y3, Y3) 300 VROUNDPS(Imm(11), Y3, Y3) 301 VANDPS(Y0, Y4, Y6) 302 VORPS(Y1, Y6, Y6) 303 VADDPS(Y6, Y4, Y4) 304 VROUNDPS(Imm(11), Y4, Y4) 305 VANDPS(Y0, Y5, Y6) 306 VORPS(Y1, Y6, Y6) 307 VADDPS(Y6, Y5, Y5) 308 VROUNDPS(Imm(11), Y5, Y5) 309 VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4)) 310 VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(32)) 311 VMOVUPS(Y4, Mem{Base: RDI}.Idx(RCX, 4).Offset(64)) 312 VMOVUPS(Y5, Mem{Base: RDI}.Idx(RCX, 4).Offset(96)) 313 ADDQ(Imm(32), RCX) 314 CMPQ(RAX, RCX) 315 JNE(LabelRef("LBB3_4")) 316 CMPQ(RAX, RSI) 317 JE(LabelRef("LBB3_8")) 318 } 319 320 Label("LBB3_6") 321 { 322 VBROADCASTSS(data.Offset(0), X0) 323 VBROADCASTSS(data.Offset(4), X1) 324 } 325 326 Label("LBB3_7") 327 { 328 VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X2) 329 VANDPS(X0, X2, X3) 330 VORPS(X1, X3, X3) 331 VADDSS(X3, X2, X2) 332 VROUNDSS(Imm(11), X2, X2, X2) 333 VMOVSS(X2, Mem{Base: RDI}.Idx(RAX, 4)) 334 ADDQ(Imm(1), RAX) 335 CMPQ(RSI, RAX) 336 JNE(LabelRef("LBB3_7")) 337 } 338 339 Label("LBB3_8") 340 { 341 VZEROUPPER() 342 Store(X0, ReturnIndex(0)) 343 RET() 344 } 345 } 346 347 func genFloor_F64() { 348 349 TEXT("Floor_AVX2_F64", NOSPLIT, "func(x []float64) float64") 350 Pragma("noescape") 351 Load(Param("x").Base(), RDI) 352 Load(Param("x").Len(), RSI) 353 354 TESTQ(RSI, RSI) 355 JE(LabelRef("LBB4_11")) 356 CMPQ(RSI, Imm(16)) 357 JAE(LabelRef("LBB4_3")) 358 XORL(EAX, EAX) 359 JMP(LabelRef("LBB4_10")) 360 361 Label("LBB4_3") 362 { 363 MOVQ(RSI, RAX) 364 ANDQ(I32(-16), RAX) 365 LEAQ(Mem{Base: RAX}.Offset(-16), RCX) 366 MOVQ(RCX, R8) 367 SHRQ(Imm(4), R8) 368 ADDQ(Imm(1), R8) 369 TESTQ(RCX, RCX) 370 JE(LabelRef("LBB4_4")) 371 MOVQ(R8, RDX) 372 ANDQ(I32(-2), RDX) 373 XORL(ECX, ECX) 374 } 375 376 Label("LBB4_6") 377 { 378 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8), Y0) 379 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(32), Y1) 380 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(64), Y2) 381 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(96), Y3) 382 VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8)) 383 VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(32)) 384 VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(64)) 385 VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(96)) 386 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(128), Y0) 387 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(160), Y1) 388 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(192), Y2) 389 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(224), Y3) 390 VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8).Offset(128)) 391 VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(160)) 392 VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(192)) 393 VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(224)) 394 ADDQ(Imm(32), RCX) 395 ADDQ(I32(-2), RDX) 396 JNE(LabelRef("LBB4_6")) 397 TESTB(Imm(1), R8B) 398 JE(LabelRef("LBB4_9")) 399 } 400 401 Label("LBB4_8") 402 { 403 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8), Y0) 404 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(32), Y1) 405 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(64), Y2) 406 VROUNDPD(Imm(9), Mem{Base: RDI}.Idx(RCX, 8).Offset(96), Y3) 407 VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8)) 408 VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(32)) 409 VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(64)) 410 VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(96)) 411 } 412 413 Label("LBB4_9") 414 { 415 CMPQ(RAX, RSI) 416 JE(LabelRef("LBB4_11")) 417 } 418 419 Label("LBB4_10") 420 { 421 VMOVSD(Mem{Base: RDI}.Idx(RAX, 8), X0) 422 VROUNDSD(Imm(9), X0, X0, X0) 423 VMOVSD(X0, Mem{Base: RDI}.Idx(RAX, 8)) 424 ADDQ(Imm(1), RAX) 425 CMPQ(RSI, RAX) 426 JNE(LabelRef("LBB4_10")) 427 } 428 429 Label("LBB4_11") 430 { 431 VZEROUPPER() 432 Store(X0, ReturnIndex(0)) 433 RET() 434 } 435 436 Label("LBB4_4") 437 { 438 XORL(ECX, ECX) 439 TESTB(Imm(1), R8B) 440 JNE(LabelRef("LBB4_8")) 441 JMP(LabelRef("LBB4_9")) 442 } 443 } 444 445 func genFloor_F32() { 446 447 TEXT("Floor_AVX2_F32", NOSPLIT, "func(x []float32) float32") 448 Pragma("noescape") 449 Load(Param("x").Base(), RDI) 450 Load(Param("x").Len(), RSI) 451 452 TESTQ(RSI, RSI) 453 JE(LabelRef("LBB5_11")) 454 CMPQ(RSI, Imm(32)) 455 JAE(LabelRef("LBB5_3")) 456 XORL(EAX, EAX) 457 JMP(LabelRef("LBB5_10")) 458 459 Label("LBB5_3") 460 { 461 MOVQ(RSI, RAX) 462 ANDQ(I32(-32), RAX) 463 LEAQ(Mem{Base: RAX}.Offset(-32), RCX) 464 MOVQ(RCX, R8) 465 SHRQ(Imm(5), R8) 466 ADDQ(Imm(1), R8) 467 TESTQ(RCX, RCX) 468 JE(LabelRef("LBB5_4")) 469 MOVQ(R8, RDX) 470 ANDQ(I32(-2), RDX) 471 XORL(ECX, ECX) 472 } 473 474 Label("LBB5_6") 475 { 476 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4), Y0) 477 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y1) 478 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y2) 479 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y3) 480 VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4)) 481 VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(32)) 482 VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(64)) 483 VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(96)) 484 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(128), Y0) 485 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(160), Y1) 486 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(192), Y2) 487 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(224), Y3) 488 VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4).Offset(128)) 489 VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(160)) 490 VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(192)) 491 VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(224)) 492 ADDQ(Imm(64), RCX) 493 ADDQ(I32(-2), RDX) 494 JNE(LabelRef("LBB5_6")) 495 TESTB(Imm(1), R8B) 496 JE(LabelRef("LBB5_9")) 497 } 498 499 Label("LBB5_8") 500 { 501 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4), Y0) 502 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y1) 503 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y2) 504 VROUNDPS(Imm(9), Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y3) 505 VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4)) 506 VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(32)) 507 VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(64)) 508 VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(96)) 509 } 510 511 Label("LBB5_9") 512 { 513 CMPQ(RAX, RSI) 514 JE(LabelRef("LBB5_11")) 515 } 516 517 Label("LBB5_10") 518 { 519 VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X0) 520 VROUNDSS(Imm(9), X0, X0, X0) 521 VMOVSS(X0, Mem{Base: RDI}.Idx(RAX, 4)) 522 ADDQ(Imm(1), RAX) 523 CMPQ(RSI, RAX) 524 JNE(LabelRef("LBB5_10")) 525 } 526 527 Label("LBB5_11") 528 { 529 VZEROUPPER() 530 Store(X0, ReturnIndex(0)) 531 RET() 532 } 533 534 Label("LBB5_4") 535 { 536 XORL(ECX, ECX) 537 TESTB(Imm(1), R8B) 538 JNE(LabelRef("LBB5_8")) 539 JMP(LabelRef("LBB5_9")) 540 } 541 } 542 543 func genCeil_F64() { 544 545 TEXT("Ceil_AVX2_F64", NOSPLIT, "func(x []float64) float64") 546 Pragma("noescape") 547 Load(Param("x").Base(), RDI) 548 Load(Param("x").Len(), RSI) 549 550 TESTQ(RSI, RSI) 551 JE(LabelRef("LBB6_11")) 552 CMPQ(RSI, Imm(16)) 553 JAE(LabelRef("LBB6_3")) 554 XORL(EAX, EAX) 555 JMP(LabelRef("LBB6_10")) 556 557 Label("LBB6_3") 558 { 559 MOVQ(RSI, RAX) 560 ANDQ(I32(-16), RAX) 561 LEAQ(Mem{Base: RAX}.Offset(-16), RCX) 562 MOVQ(RCX, R8) 563 SHRQ(Imm(4), R8) 564 ADDQ(Imm(1), R8) 565 TESTQ(RCX, RCX) 566 JE(LabelRef("LBB6_4")) 567 MOVQ(R8, RDX) 568 ANDQ(I32(-2), RDX) 569 XORL(ECX, ECX) 570 } 571 572 Label("LBB6_6") 573 { 574 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8), Y0) 575 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(32), Y1) 576 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(64), Y2) 577 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(96), Y3) 578 VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8)) 579 VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(32)) 580 VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(64)) 581 VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(96)) 582 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(128), Y0) 583 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(160), Y1) 584 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(192), Y2) 585 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(224), Y3) 586 VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8).Offset(128)) 587 VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(160)) 588 VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(192)) 589 VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(224)) 590 ADDQ(Imm(32), RCX) 591 ADDQ(I32(-2), RDX) 592 JNE(LabelRef("LBB6_6")) 593 TESTB(Imm(1), R8B) 594 JE(LabelRef("LBB6_9")) 595 } 596 597 Label("LBB6_8") 598 { 599 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8), Y0) 600 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(32), Y1) 601 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(64), Y2) 602 VROUNDPD(Imm(10), Mem{Base: RDI}.Idx(RCX, 8).Offset(96), Y3) 603 VMOVUPD(Y0, Mem{Base: RDI}.Idx(RCX, 8)) 604 VMOVUPD(Y1, Mem{Base: RDI}.Idx(RCX, 8).Offset(32)) 605 VMOVUPD(Y2, Mem{Base: RDI}.Idx(RCX, 8).Offset(64)) 606 VMOVUPD(Y3, Mem{Base: RDI}.Idx(RCX, 8).Offset(96)) 607 } 608 609 Label("LBB6_9") 610 { 611 CMPQ(RAX, RSI) 612 JE(LabelRef("LBB6_11")) 613 } 614 615 Label("LBB6_10") 616 { 617 VMOVSD(Mem{Base: RDI}.Idx(RAX, 8), X0) 618 VROUNDSD(Imm(10), X0, X0, X0) 619 VMOVSD(X0, Mem{Base: RDI}.Idx(RAX, 8)) 620 ADDQ(Imm(1), RAX) 621 CMPQ(RSI, RAX) 622 JNE(LabelRef("LBB6_10")) 623 } 624 625 Label("LBB6_11") 626 { 627 VZEROUPPER() 628 Store(X0, ReturnIndex(0)) 629 RET() 630 } 631 632 Label("LBB6_4") 633 { 634 XORL(ECX, ECX) 635 TESTB(Imm(1), R8B) 636 JNE(LabelRef("LBB6_8")) 637 JMP(LabelRef("LBB6_9")) 638 } 639 } 640 641 func genCeil_F32() { 642 643 TEXT("Ceil_AVX2_F32", NOSPLIT, "func(x []float32) float32") 644 Pragma("noescape") 645 Load(Param("x").Base(), RDI) 646 Load(Param("x").Len(), RSI) 647 648 TESTQ(RSI, RSI) 649 JE(LabelRef("LBB7_11")) 650 CMPQ(RSI, Imm(32)) 651 JAE(LabelRef("LBB7_3")) 652 XORL(EAX, EAX) 653 JMP(LabelRef("LBB7_10")) 654 655 Label("LBB7_3") 656 { 657 MOVQ(RSI, RAX) 658 ANDQ(I32(-32), RAX) 659 LEAQ(Mem{Base: RAX}.Offset(-32), RCX) 660 MOVQ(RCX, R8) 661 SHRQ(Imm(5), R8) 662 ADDQ(Imm(1), R8) 663 TESTQ(RCX, RCX) 664 JE(LabelRef("LBB7_4")) 665 MOVQ(R8, RDX) 666 ANDQ(I32(-2), RDX) 667 XORL(ECX, ECX) 668 } 669 670 Label("LBB7_6") 671 { 672 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4), Y0) 673 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y1) 674 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y2) 675 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y3) 676 VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4)) 677 VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(32)) 678 VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(64)) 679 VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(96)) 680 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(128), Y0) 681 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(160), Y1) 682 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(192), Y2) 683 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(224), Y3) 684 VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4).Offset(128)) 685 VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(160)) 686 VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(192)) 687 VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(224)) 688 ADDQ(Imm(64), RCX) 689 ADDQ(I32(-2), RDX) 690 JNE(LabelRef("LBB7_6")) 691 TESTB(Imm(1), R8B) 692 JE(LabelRef("LBB7_9")) 693 } 694 695 Label("LBB7_8") 696 { 697 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4), Y0) 698 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(32), Y1) 699 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(64), Y2) 700 VROUNDPS(Imm(10), Mem{Base: RDI}.Idx(RCX, 4).Offset(96), Y3) 701 VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4)) 702 VMOVUPS(Y1, Mem{Base: RDI}.Idx(RCX, 4).Offset(32)) 703 VMOVUPS(Y2, Mem{Base: RDI}.Idx(RCX, 4).Offset(64)) 704 VMOVUPS(Y3, Mem{Base: RDI}.Idx(RCX, 4).Offset(96)) 705 } 706 707 Label("LBB7_9") 708 { 709 CMPQ(RAX, RSI) 710 JE(LabelRef("LBB7_11")) 711 } 712 713 Label("LBB7_10") 714 { 715 VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X0) 716 VROUNDSS(Imm(10), X0, X0, X0) 717 VMOVSS(X0, Mem{Base: RDI}.Idx(RAX, 4)) 718 ADDQ(Imm(1), RAX) 719 CMPQ(RSI, RAX) 720 JNE(LabelRef("LBB7_10")) 721 } 722 723 Label("LBB7_11") 724 { 725 VZEROUPPER() 726 Store(X0, ReturnIndex(0)) 727 RET() 728 } 729 730 Label("LBB7_4") 731 { 732 XORL(ECX, ECX) 733 TESTB(Imm(1), R8B) 734 JNE(LabelRef("LBB7_8")) 735 JMP(LabelRef("LBB7_9")) 736 } 737 } 738 739 func genPow_4x_F64() { 740 741 data := GLOBL("dataPowF64", RODATA|NOPTR) 742 DATA(0, U64(9223372036854775807)) // Label("LCPI9_0") 743 DATA(8, U64(0x3fe6a09e667f3bcd)) // Label("LCPI9_3") 744 DATA(16, U64(0xbff0000000000000)) // Label("LCPI9_4") 745 DATA(24, U64(0x401a509f46f4fa53)) // Label("LCPI9_5") 746 DATA(32, U64(0x3fdfe818a0fe1a83)) // Label("LCPI9_6") 747 DATA(40, U64(0x3f07bc0962b395ca)) // Label("LCPI9_7") 748 DATA(48, U64(0x404e798eb86c3351)) // Label("LCPI9_8") 749 DATA(56, U64(0x403de9738b8cb9c9)) // Label("LCPI9_9") 750 DATA(64, U64(0x40340a202d99830a)) // Label("LCPI9_10") 751 DATA(72, U64(0x404c8e7597479a10)) // Label("LCPI9_11") 752 DATA(80, U64(0x4054c30b52213498)) // Label("LCPI9_12") 753 DATA(88, U64(0x402e20359e903e37)) // Label("LCPI9_13") 754 DATA(96, U64(0x407351945dc908a5)) // Label("LCPI9_14") 755 DATA(104, U64(0x406bb86590fcfb56)) // Label("LCPI9_15") 756 DATA(112, U64(0x404e0f304466448e)) // Label("LCPI9_16") 757 DATA(120, U64(0x406b0db13e48e066)) // Label("LCPI9_17") 758 DATA(128, U64(4841369599423283200)) // Label("LCPI9_18") 759 DATA(136, U64(0xc3300000000003ff)) // Label("LCPI9_19") 760 DATA(144, U64(0x3ff0000000000000)) // Label("LCPI9_20") 761 DATA(152, U64(0xbfe0000000000000)) // Label("LCPI9_21") 762 DATA(160, U64(0x3fe0000000000000)) // Label("LCPI9_22") 763 DATA(168, U64(0x3ff71547652b82fe)) // Label("LCPI9_23") 764 DATA(176, U64(0xbfe62e4000000000)) // Label("LCPI9_24") 765 DATA(184, U64(0x3eb7f7d1cf79abca)) // Label("LCPI9_25") 766 DATA(192, U64(0x3fe62e42fefa39ef)) // Label("LCPI9_26") 767 DATA(200, U64(0x3e21eed8eff8d898)) // Label("LCPI9_27") 768 DATA(208, U64(0x3de6124613a86d09)) // Label("LCPI9_28") 769 DATA(216, U64(0x3e927e4fb7789f5c)) // Label("LCPI9_29") 770 DATA(224, U64(0x3e5ae64567f544e4)) // Label("LCPI9_30") 771 DATA(232, U64(0x3efa01a01a01a01a)) // Label("LCPI9_31") 772 DATA(240, U64(0x3ec71de3a556c734)) // Label("LCPI9_32") 773 DATA(248, U64(0x3f56c16c16c16c17)) // Label("LCPI9_33") 774 DATA(256, U64(0x3f2a01a01a01a01a)) // Label("LCPI9_34") 775 DATA(264, U64(0x3fa5555555555555)) // Label("LCPI9_35") 776 DATA(272, U64(0x3f81111111111111)) // Label("LCPI9_36") 777 DATA(280, U64(0x3fc5555555555555)) // Label("LCPI9_37") 778 DATA(288, U64(2046)) // Label("LCPI9_38") 779 DATA(296, U64(0x40a7700000000000)) // Label("LCPI9_39") 780 DATA(304, U64(1)) // Label("LCPI9_40") 781 DATA(312, U64(0xc0a7700000000000)) // Label("LCPI9_41") 782 DATA(320, U64(9218868437227405312)) // Label("LCPI9_42") 783 DATA(328, U64(0x7ff8002040000000)) // Label("LCPI9_43") 784 DATA(336, U64(4503599627370495)) // Label("LCPI9_1") 785 DATA(344, U64(4503599627370495)) 786 DATA(352, U64(4602678819172646912)) // Label("LCPI9_2") 787 DATA(360, U64(4602678819172646912)) 788 789 TEXT("Pow_4x_AVX2_F64", NOSPLIT, "func(x, y []float64)") 790 Pragma("noescape") 791 Load(Param("x").Base(), RDI) 792 Load(Param("y").Base(), RSI) 793 Load(Param("x").Len(), RDX) 794 795 SUBQ(I32(1192), RSP) 796 ANDQ(I32(-4), RDX) 797 JE(LabelRef("LBB9_11")) 798 XORL(R8L, R8L) 799 VBROADCASTSD(data.Offset(0), Y0) 800 VMOVUPS(Y0, Mem{Base: RSP}.Offset(512)) 801 VBROADCASTSD(data.Offset(8), Y0) 802 VMOVUPS(Y0, Mem{Base: RSP}.Offset(1120)) 803 VPXOR(X6, X6, X6) 804 VBROADCASTSD(data.Offset(16), Y0) 805 VMOVUPS(Y0, Mem{Base: RSP}.Offset(1088)) 806 VBROADCASTSD(data.Offset(24), Y0) 807 VMOVUPS(Y0, Mem{Base: RSP}.Offset(1056)) 808 VBROADCASTSD(data.Offset(32), Y0) 809 VMOVUPS(Y0, Mem{Base: RSP}.Offset(1024)) 810 VBROADCASTSD(data.Offset(40), Y0) 811 VMOVUPS(Y0, Mem{Base: RSP}.Offset(992)) 812 VBROADCASTSD(data.Offset(48), Y0) 813 VMOVUPS(Y0, Mem{Base: RSP}.Offset(960)) 814 VBROADCASTSD(data.Offset(56), Y0) 815 VMOVUPS(Y0, Mem{Base: RSP}.Offset(928)) 816 VBROADCASTSD(data.Offset(64), Y0) 817 VMOVUPS(Y0, Mem{Base: RSP}.Offset(896)) 818 VBROADCASTSD(data.Offset(72), Y0) 819 VMOVUPS(Y0, Mem{Base: RSP}.Offset(864)) 820 VBROADCASTSD(data.Offset(80), Y0) 821 VMOVUPS(Y0, Mem{Base: RSP}.Offset(832)) 822 VBROADCASTSD(data.Offset(88), Y0) 823 VMOVUPS(Y0, Mem{Base: RSP}.Offset(800)) 824 VBROADCASTSD(data.Offset(96), Y0) 825 VMOVUPS(Y0, Mem{Base: RSP}.Offset(768)) 826 VBROADCASTSD(data.Offset(104), Y0) 827 VMOVUPS(Y0, Mem{Base: RSP}.Offset(736)) 828 VBROADCASTSD(data.Offset(112), Y0) 829 VMOVUPS(Y0, Mem{Base: RSP}.Offset(704)) 830 VBROADCASTSD(data.Offset(120), Y0) 831 VMOVUPS(Y0, Mem{Base: RSP}.Offset(672)) 832 VBROADCASTSD(data.Offset(128), Y0) 833 VMOVUPS(Y0, Mem{Base: RSP}.Offset(640)) 834 VBROADCASTSD(data.Offset(136), Y0) 835 VMOVUPS(Y0, Mem{Base: RSP}.Offset(608)) 836 VBROADCASTSD(data.Offset(144), Y0) 837 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128)) 838 VBROADCASTSD(data.Offset(152), Y0) 839 VMOVUPS(Y0, Mem{Base: RSP}.Offset(576)) 840 VBROADCASTSD(data.Offset(160), Y0) 841 VMOVUPS(Y0, Mem{Base: RSP}.Offset(544)) 842 VBROADCASTSD(data.Offset(168), Y0) 843 VMOVUPS(Y0, Mem{Base: RSP}.Offset(480)) 844 VBROADCASTSD(data.Offset(176), Y0) 845 VMOVUPS(Y0, Mem{Base: RSP}.Offset(448)) 846 VBROADCASTSD(data.Offset(184), Y0) 847 VMOVUPS(Y0, Mem{Base: RSP}.Offset(416)) 848 VBROADCASTSD(data.Offset(192), Y0) 849 VMOVUPS(Y0, Mem{Base: RSP}.Offset(384)) 850 VBROADCASTSD(data.Offset(200), Y0) 851 VMOVUPS(Y0, Mem{Base: RSP}.Offset(352)) 852 VBROADCASTSD(data.Offset(208), Y0) 853 VMOVUPS(Y0, Mem{Base: RSP}.Offset(320)) 854 VBROADCASTSD(data.Offset(216), Y0) 855 VMOVUPS(Y0, Mem{Base: RSP}.Offset(288)) 856 VBROADCASTSD(data.Offset(224), Y0) 857 VMOVUPS(Y0, Mem{Base: RSP}.Offset(256)) 858 VBROADCASTSD(data.Offset(232), Y0) 859 VMOVUPS(Y0, Mem{Base: RSP}.Offset(224)) 860 VBROADCASTSD(data.Offset(240), Y0) 861 VMOVUPS(Y0, Mem{Base: RSP}.Offset(192)) 862 VBROADCASTSD(data.Offset(248), Y0) 863 VMOVUPS(Y0, Mem{Base: RSP}.Offset(160)) 864 VBROADCASTSD(data.Offset(256), Y0) 865 VMOVUPS(Y0, Mem{Base: RSP}.Offset(128)) 866 VBROADCASTSD(data.Offset(264), Y0) 867 VMOVUPS(Y0, Mem{Base: RSP}.Offset(96)) 868 VBROADCASTSD(data.Offset(272), Y0) 869 VMOVUPS(Y0, Mem{Base: RSP}.Offset(64)) 870 VBROADCASTSD(data.Offset(280), Y0) 871 VMOVUPS(Y0, Mem{Base: RSP}.Offset(32)) 872 VBROADCASTSD(data.Offset(288), Y0) 873 VMOVUPS(Y0, Mem{Base: RSP}) 874 VBROADCASTSD(data.Offset(296), Y0) 875 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32)) 876 VBROADCASTSD(data.Offset(304), Y0) 877 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64)) 878 VBROADCASTSD(data.Offset(312), Y0) 879 VMOVUPD(Y0, Mem{Base: RSP}.Offset(-96)) 880 VPBROADCASTQ(data.Offset(320), Y5) 881 VBROADCASTSD(data.Offset(320), Y10) 882 JMP(LabelRef("LBB9_2")) 883 884 Label("LBB9_10") 885 { 886 VMOVUPD(Y2, Mem{Base: RDI}.Idx(R8, 8)) 887 ADDQ(Imm(4), R8) 888 CMPQ(R8, RDX) 889 JAE(LabelRef("LBB9_11")) 890 } 891 892 Label("LBB9_2") 893 { 894 VMOVAPD(Y10, Y9) 895 VMOVDQU(Mem{Base: RDI}.Idx(R8, 8), Y13) 896 VMOVUPD(Mem{Base: RSI}.Idx(R8, 8), Y12) 897 VPAND(Mem{Base: RSP}.Offset(512), Y13, Y10) 898 VMOVUPD(data.Offset(336), X1) 899 VANDPD(Mem{Base: RDI}.Idx(R8, 8), X1, X2) 900 VMOVUPD(data.Offset(352), X0) 901 VORPD(X0, X2, X2) 902 VANDPD(Mem{Base: RDI}.Idx(R8, 8).Offset(16), X1, X3) 903 VORPD(X0, X3, X3) 904 VINSERTF128(Imm(1), X3, Y2, Y3) 905 VMOVUPD(Mem{Base: RSP}.Offset(1120), Y0) 906 VCMPPD(Imm(1), Y3, Y0, Y2) 907 VANDNPD(Y3, Y2, Y4) 908 VADDPD(Mem{Base: RSP}.Offset(1088), Y3, Y3) 909 VADDPD(Y4, Y3, Y4) 910 VMULPD(Y4, Y4, Y3) 911 VMULPD(Y3, Y3, Y7) 912 VMOVUPD(Mem{Base: RSP}.Offset(1024), Y8) 913 VFMADD213PD(Mem{Base: RSP}.Offset(1056), Y4, Y8) 914 VFMADD231PD(Mem{Base: RSP}.Offset(992), Y3, Y8) 915 VMOVUPD(Mem{Base: RSP}.Offset(928), Y11) 916 VFMADD213PD(Mem{Base: RSP}.Offset(960), Y4, Y11) 917 VMOVUPD(Mem{Base: RSP}.Offset(864), Y14) 918 VFMADD213PD(Mem{Base: RSP}.Offset(896), Y4, Y14) 919 VFMADD231PD(Y11, Y3, Y14) 920 VFMADD231PD(Y8, Y7, Y14) 921 VMULPD(Y4, Y3, Y8) 922 VMULPD(Y14, Y8, Y8) 923 VADDPD(Mem{Base: RSP}.Offset(832), Y3, Y11) 924 VFMADD231PD(Mem{Base: RSP}.Offset(800), Y4, Y11) 925 VMOVUPD(Mem{Base: RSP}.Offset(736), Y14) 926 VFMADD213PD(Mem{Base: RSP}.Offset(768), Y4, Y14) 927 VMOVUPD(Mem{Base: RSP}.Offset(672), Y15) 928 VFMADD213PD(Mem{Base: RSP}.Offset(704), Y4, Y15) 929 VFMADD231PD(Y14, Y3, Y15) 930 VFMADD231PD(Y11, Y7, Y15) 931 VDIVPD(Y15, Y8, Y7) 932 VMOVDQU(Y10, Mem{Base: RSP}.Offset(1152)) 933 VPSRLQ(Imm(52), Y10, Y8) 934 VPOR(Mem{Base: RSP}.Offset(640), Y8, Y8) 935 VADDPD(Mem{Base: RSP}.Offset(608), Y8, Y8) 936 VMOVUPD(Mem{Base: RSP}.Offset(-128), Y0) 937 VANDPD(Y0, Y2, Y2) 938 VADDPD(Y2, Y8, Y8) 939 VMULPD(Y12, Y8, Y2) 940 VROUNDPD(Imm(8), Y2, Y2) 941 VFNMADD213PD(Y2, Y12, Y8) 942 VMOVUPD(Mem{Base: RSP}.Offset(576), Y1) 943 VMOVAPD(Y1, Y11) 944 VFMADD213PD(Y4, Y3, Y11) 945 VADDPD(Y7, Y11, Y11) 946 VMOVUPD(Mem{Base: RSP}.Offset(544), Y10) 947 VMULPD(Y4, Y10, Y14) 948 VMULPD(Y1, Y3, Y15) 949 VFMADD231PD(Y14, Y4, Y15) 950 VSUBPD(Y4, Y11, Y4) 951 VFMADD231PD(Y3, Y10, Y4) 952 VMOVUPD(Mem{Base: RSP}.Offset(480), Y1) 953 VMULPD(Y1, Y12, Y3) 954 VMULPD(Y3, Y11, Y3) 955 VROUNDPD(Imm(8), Y3, Y3) 956 VMULPD(Mem{Base: RSP}.Offset(448), Y3, Y14) 957 VFMADD231PD(Y11, Y12, Y14) 958 VFMSUB231PD(Mem{Base: RSP}.Offset(416), Y3, Y14) 959 VMOVUPD(Mem{Base: RSP}.Offset(384), Y11) 960 VFMADD231PD(Y8, Y11, Y14) 961 VSUBPD(Y7, Y15, Y7) 962 VADDPD(Y4, Y7, Y4) 963 VFNMSUB213PD(Y14, Y12, Y4) 964 VMULPD(Y1, Y4, Y7) 965 VROUNDPD(Imm(8), Y7, Y7) 966 VFNMADD231PD(Y11, Y7, Y4) 967 VMULPD(Y4, Y4, Y8) 968 VMOVUPD(Mem{Base: RSP}.Offset(320), Y11) 969 VFMADD213PD(Mem{Base: RSP}.Offset(352), Y4, Y11) 970 VMOVUPD(Mem{Base: RSP}.Offset(256), Y14) 971 VFMADD213PD(Mem{Base: RSP}.Offset(288), Y4, Y14) 972 VMOVUPD(Mem{Base: RSP}.Offset(192), Y15) 973 VFMADD213PD(Mem{Base: RSP}.Offset(224), Y4, Y15) 974 VFMADD231PD(Y14, Y8, Y15) 975 VMOVUPD(Mem{Base: RSP}.Offset(128), Y14) 976 VFMADD213PD(Mem{Base: RSP}.Offset(160), Y4, Y14) 977 VMOVUPD(Mem{Base: RSP}.Offset(64), Y1) 978 VFMADD213PD(Mem{Base: RSP}.Offset(96), Y4, Y1) 979 VFMADD231PD(Y14, Y8, Y1) 980 VMOVUPD(Mem{Base: RSP}.Offset(32), Y14) 981 VFMADD213PD(Y10, Y4, Y14) 982 VFMADD213PD(Y4, Y8, Y14) 983 VMULPD(Y8, Y8, Y4) 984 VFMADD231PD(Y11, Y4, Y15) 985 VFMADD231PD(Y1, Y4, Y14) 986 VMULPD(Y4, Y4, Y1) 987 VFMADD231PD(Y15, Y1, Y14) 988 VADDPD(Y0, Y14, Y1) 989 VADDPD(Y2, Y3, Y2) 990 VADDPD(Y7, Y2, Y15) 991 VROUNDPD(Imm(8), Y15, Y2) 992 VCVTTSD2SIQ(X2, R9) 993 VPERMILPD(Imm(1), X2, X3) 994 VCVTTSD2SIQ(X3, RAX) 995 VEXTRACTF128(Imm(1), Y2, X2) 996 VCVTTSD2SIQ(X2, RCX) 997 VMOVQ(RCX, X3) 998 VPERMILPD(Imm(1), X2, X2) 999 VCVTTSD2SIQ(X2, RCX) 1000 VMOVQ(RCX, X2) 1001 VPUNPCKLQDQ(X2, X3, X2) 1002 VMOVQ(R9, X3) 1003 VMOVQ(RAX, X4) 1004 VPUNPCKLQDQ(X4, X3, X3) 1005 VINSERTI128(Imm(1), X2, Y3, Y2) 1006 VPSRAD(Imm(31), Y1, Y3) 1007 VPSRAD(Imm(20), Y1, Y4) 1008 VPSRLQ(Imm(32), Y4, Y4) 1009 VPBLENDD(Imm(170), Y3, Y4, Y3) 1010 VPADDQ(Y3, Y2, Y4) 1011 VPCMPGTQ(Mem{Base: RSP}, Y4, Y3) 1012 VMOVUPD(Mem{Base: RSP}.Offset(-32), Y0) 1013 VCMPPD(Imm(1), Y15, Y0, Y7) 1014 VPOR(Y7, Y3, Y3) 1015 VMOVDQU(Mem{Base: RSP}.Offset(-64), Y0) 1016 VPCMPGTQ(Y4, Y0, Y4) 1017 VCMPPD(Imm(1), Mem{Base: RSP}.Offset(-96), Y15, Y7) 1018 VPOR(Y7, Y4, Y4) 1019 VPSLLQ(Imm(52), Y2, Y2) 1020 VPADDQ(Y1, Y2, Y2) 1021 VPOR(Y3, Y4, Y1) 1022 VPTEST(Y1, Y1) 1023 JNE(LabelRef("LBB9_3")) 1024 VMOVAPD(Y9, Y10) 1025 JMP(LabelRef("LBB9_5")) 1026 } 1027 1028 Label("LBB9_3") 1029 { 1030 VPANDN(Y2, Y4, Y1) 1031 VMOVAPD(Y9, Y10) 1032 VBLENDVPD(Y3, Y9, Y1, Y2) 1033 } 1034 1035 Label("LBB9_5") 1036 { 1037 VPAND(Y5, Y13, Y11) 1038 VPCMPEQQ(Y6, Y11, Y4) 1039 VPSRAD(Imm(31), Y13, Y1) 1040 VPSHUFD(Imm(245), Y1, Y7) 1041 VCMPPD(Imm(1), Y6, Y12, Y14) 1042 VCMPPD(Imm(0), Y6, Y12, Y3) 1043 VANDPD(Mem{Base: RSP}.Offset(-128), Y3, Y1) 1044 VBLENDVPD(Y14, Y10, Y1, Y1) 1045 VBLENDVPD(Y4, Y1, Y2, Y2) 1046 VPTEST(Y7, Y7) 1047 JNE(LabelRef("LBB9_7")) 1048 VPXOR(X7, X7, X7) 1049 JMP(LabelRef("LBB9_8")) 1050 } 1051 1052 Label("LBB9_7") 1053 { 1054 VROUNDPD(Imm(8), Y12, Y1) 1055 VCMPPD(Imm(0), Y1, Y12, Y8) 1056 VCVTTSD2SIQ(X1, R9) 1057 VPERMILPD(Imm(1), X1, X10) 1058 VCVTTSD2SIQ(X10, RCX) 1059 VEXTRACTF128(Imm(1), Y1, X1) 1060 VCVTTSD2SIQ(X1, RAX) 1061 VXORPD(X10, X10, X10) 1062 VMOVQ(RAX, X6) 1063 VPERMILPD(Imm(1), X1, X1) 1064 VCVTTSD2SIQ(X1, RAX) 1065 VMOVQ(RAX, X1) 1066 VPUNPCKLQDQ(X1, X6, X1) 1067 VMOVQ(R9, X6) 1068 VMOVQ(RCX, X0) 1069 VPUNPCKLQDQ(X0, X6, X0) 1070 VINSERTI128(Imm(1), X1, Y0, Y0) 1071 VPSLLQ(Imm(63), Y0, Y0) 1072 VPOR(Y2, Y0, Y1) 1073 VCMPPD(Imm(0), Y10, Y13, Y6) 1074 VBROADCASTSD(data.Offset(328), Y10) 1075 VBLENDVPD(Y6, Y2, Y10, Y6) 1076 VMOVAPD(Y9, Y10) 1077 VBLENDVPD(Y8, Y1, Y6, Y1) 1078 VXORPD(X6, X6, X6) 1079 VBLENDVPD(Y7, Y1, Y2, Y2) 1080 VANDPD(Y0, Y8, Y7) 1081 } 1082 1083 Label("LBB9_8") 1084 { 1085 VPCMPEQD(Y9, Y9, Y9) 1086 VANDPD(Y5, Y12, Y0) 1087 VANDPD(Y5, Y15, Y1) 1088 VPCMPEQQ(Y5, Y1, Y15) 1089 VPXOR(Y9, Y15, Y1) 1090 VPCMPEQQ(Y5, Y0, Y8) 1091 VPCMPEQQ(Y5, Y11, Y11) 1092 VPXOR(Y9, Y11, Y0) 1093 VPANDN(Y0, Y8, Y0) 1094 VPOR(Y4, Y1, Y1) 1095 VPAND(Y0, Y1, Y0) 1096 VPTEST(Y9, Y0) 1097 JB(LabelRef("LBB9_10")) 1098 VPXOR(Y9, Y8, Y0) 1099 VPANDN(Y0, Y15, Y0) 1100 VMOVUPD(Mem{Base: RSP}.Offset(-128), Y8) 1101 VMOVUPD(Mem{Base: RSP}.Offset(1152), Y9) 1102 VCMPPD(Imm(0), Y8, Y9, Y1) 1103 VCMPPD(Imm(1), Y9, Y8, Y4) 1104 VPSRAD(Imm(31), Y12, Y6) 1105 VPXOR(Y4, Y6, Y4) 1106 VPXOR(X6, X6, X6) 1107 VBLENDVPD(Y4, Y10, Y6, Y4) 1108 VBLENDVPD(Y1, Y8, Y4, Y1) 1109 VBLENDVPD(Y0, Y2, Y1, Y0) 1110 VANDPD(Y2, Y7, Y1) 1111 VANDPD(Y7, Y13, Y2) 1112 VORPD(Y2, Y9, Y2) 1113 VBLENDVPD(Y14, Y1, Y2, Y1) 1114 VBLENDVPD(Y3, Y8, Y1, Y1) 1115 VBLENDVPD(Y11, Y1, Y0, Y0) 1116 VCMPPD(Imm(3), Y13, Y13, Y1) 1117 VCMPPD(Imm(3), Y12, Y12, Y2) 1118 VORPD(Y1, Y2, Y1) 1119 VADDPD(Y13, Y12, Y2) 1120 VBLENDVPD(Y1, Y2, Y0, Y2) 1121 JMP(LabelRef("LBB9_10")) 1122 } 1123 1124 Label("LBB9_11") 1125 { 1126 ADDQ(I32(1192), RSP) 1127 VZEROUPPER() 1128 RET() 1129 } 1130 } 1131 1132 func genPow_8x_F32() { 1133 1134 data := GLOBL("genPowF32", RODATA|NOPTR) 1135 1136 DATA(0, U32(2147483647)) // Label("LCPI8_0") 1137 DATA(4, U32(0x3f3504f3)) // Label("LCPI8_3") 1138 DATA(8, U32(0xbf800000)) // Label("LCPI8_4") 1139 DATA(12, U32(0x3def251a)) // Label("LCPI8_5") 1140 DATA(16, U32(0xbdebd1b8)) // Label("LCPI8_6") 1141 DATA(20, U32(0x3e11e9bf)) // Label("LCPI8_7") 1142 DATA(24, U32(0xbdfe5d4f)) // Label("LCPI8_8") 1143 DATA(28, U32(0x3e4cceac)) // Label("LCPI8_9") 1144 DATA(32, U32(0xbe2aae50)) // Label("LCPI8_10") 1145 DATA(36, U32(0x3eaaaaaa)) // Label("LCPI8_11") 1146 DATA(40, U32(0xbe7ffffc)) // Label("LCPI8_12") 1147 DATA(44, U32(0x3d9021bb)) // Label("LCPI8_13") 1148 DATA(48, U32(0xcb00007f)) // Label("LCPI8_15") 1149 DATA(52, U32(0x3f800000)) // Label("LCPI8_16") 1150 DATA(56, U32(0xbf000000)) // Label("LCPI8_17") 1151 DATA(60, U32(0x3f000000)) // Label("LCPI8_18") 1152 DATA(64, U32(0x3fb8aa3b)) // Label("LCPI8_19") 1153 DATA(68, U32(0xbf318000)) // Label("LCPI8_20") 1154 DATA(72, U32(0xb95e8083)) // Label("LCPI8_21") 1155 DATA(76, U32(0xbf317218)) // Label("LCPI8_22") 1156 DATA(80, U32(0x3d2aaaab)) // Label("LCPI8_23") 1157 DATA(84, U32(0x3c088889)) // Label("LCPI8_24") 1158 DATA(88, U32(0x3ab60b61)) // Label("LCPI8_25") 1159 DATA(92, U32(0x39500d01)) // Label("LCPI8_26") 1160 DATA(96, U32(0x3e2aaaab)) // Label("LCPI8_27") 1161 DATA(100, U32(254)) // Label("LCPI8_29") 1162 DATA(104, U32(0x43960000)) // Label("LCPI8_30") 1163 DATA(108, U32(1)) // Label("LCPI8_31") 1164 DATA(112, U32(0xc3960000)) // Label("LCPI8_32") 1165 DATA(116, U32(2139095040)) // Label("LCPI8_33") 1166 DATA(120, U32(0x7fc00102)) // Label("LCPI8_34") 1167 1168 DATA(124, U64(36028792732385279)) // Label("LCPI8_1") 1169 DATA(132, U64(36028792732385279)) 1170 1171 DATA(140, U64(4539628425446424576)) // Label("LCPI8_2") 1172 DATA(148, U64(4539628425446424576)) 1173 1174 DATA(156, U64(5404319554102886400)) // Label("LCPI8_14") 1175 1176 DATA(164, U8(255)) // Label("LCPI8_28") 1177 DATA(165, U8(0)) 1178 DATA(166, U8(0)) 1179 DATA(167, U8(0)) 1180 DATA(168, U8(255)) 1181 DATA(169, U8(0)) 1182 DATA(170, U8(0)) 1183 DATA(171, U8(0)) 1184 DATA(172, U8(255)) 1185 DATA(173, U8(0)) 1186 DATA(174, U8(0)) 1187 DATA(175, U8(0)) 1188 DATA(176, U8(255)) 1189 DATA(177, U8(0)) 1190 DATA(178, U8(0)) 1191 DATA(179, U8(0)) 1192 DATA(180, U8(255)) 1193 DATA(181, U8(0)) 1194 DATA(182, U8(0)) 1195 DATA(183, U8(0)) 1196 DATA(184, U8(255)) 1197 DATA(185, U8(0)) 1198 DATA(186, U8(0)) 1199 DATA(187, U8(0)) 1200 DATA(188, U8(255)) 1201 DATA(189, U8(0)) 1202 DATA(190, U8(0)) 1203 DATA(191, U8(0)) 1204 DATA(192, U8(255)) 1205 DATA(193, U8(0)) 1206 DATA(194, U8(0)) 1207 DATA(195, U8(0)) 1208 1209 TEXT("Pow_8x_AVX2_F32", NOSPLIT, "func(x, y []float32)") 1210 Pragma("noescape") 1211 Load(Param("x").Base(), RDI) 1212 Load(Param("y").Base(), RSI) 1213 Load(Param("x").Len(), RDX) 1214 1215 SUBQ(I32(872), RSP) 1216 ANDQ(I32(-8), RDX) 1217 JE(LabelRef("LBB8_12")) 1218 XORL(EAX, EAX) 1219 VBROADCASTSS(data.Offset(0), Y0) 1220 VMOVUPS(Y0, Mem{Base: RSP}.Offset(320)) 1221 VBROADCASTSS(data.Offset(4), Y0) 1222 VMOVUPS(Y0, Mem{Base: RSP}.Offset(800)) 1223 VPXOR(X7, X7, X7) 1224 VBROADCASTSS(data.Offset(8), Y0) 1225 VMOVUPS(Y0, Mem{Base: RSP}.Offset(768)) 1226 VBROADCASTSS(data.Offset(12), Y0) 1227 VMOVUPS(Y0, Mem{Base: RSP}.Offset(736)) 1228 VBROADCASTSS(data.Offset(16), Y0) 1229 VMOVUPS(Y0, Mem{Base: RSP}.Offset(704)) 1230 VBROADCASTSS(data.Offset(20), Y0) 1231 VMOVUPS(Y0, Mem{Base: RSP}.Offset(672)) 1232 VBROADCASTSS(data.Offset(24), Y0) 1233 VMOVUPS(Y0, Mem{Base: RSP}.Offset(640)) 1234 VBROADCASTSS(data.Offset(28), Y0) 1235 VMOVUPS(Y0, Mem{Base: RSP}.Offset(608)) 1236 VBROADCASTSS(data.Offset(32), Y0) 1237 VMOVUPS(Y0, Mem{Base: RSP}.Offset(576)) 1238 VBROADCASTSS(data.Offset(36), Y0) 1239 VMOVUPS(Y0, Mem{Base: RSP}.Offset(544)) 1240 VBROADCASTSS(data.Offset(40), Y0) 1241 VMOVUPS(Y0, Mem{Base: RSP}.Offset(512)) 1242 VBROADCASTSS(data.Offset(44), Y0) 1243 VMOVUPS(Y0, Mem{Base: RSP}.Offset(480)) 1244 VBROADCASTSD(data.Offset(156), Y0) 1245 VMOVUPS(Y0, Mem{Base: RSP}.Offset(448)) 1246 VBROADCASTSS(data.Offset(48), Y0) 1247 VMOVUPS(Y0, Mem{Base: RSP}.Offset(416)) 1248 VBROADCASTSS(data.Offset(52), Y0) 1249 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128)) 1250 VBROADCASTSS(data.Offset(56), Y0) 1251 VMOVUPS(Y0, Mem{Base: RSP}.Offset(384)) 1252 VBROADCASTSS(data.Offset(60), Y0) 1253 VMOVUPS(Y0, Mem{Base: RSP}.Offset(352)) 1254 VBROADCASTSS(data.Offset(64), Y0) 1255 VMOVUPS(Y0, Mem{Base: RSP}.Offset(288)) 1256 VBROADCASTSS(data.Offset(68), Y0) 1257 VMOVUPS(Y0, Mem{Base: RSP}.Offset(256)) 1258 VBROADCASTSS(data.Offset(72), Y0) 1259 VMOVUPS(Y0, Mem{Base: RSP}.Offset(224)) 1260 VBROADCASTSS(data.Offset(76), Y0) 1261 VMOVUPS(Y0, Mem{Base: RSP}.Offset(192)) 1262 VBROADCASTSS(data.Offset(80), Y0) 1263 VMOVUPS(Y0, Mem{Base: RSP}.Offset(160)) 1264 VBROADCASTSS(data.Offset(84), Y0) 1265 VMOVUPS(Y0, Mem{Base: RSP}.Offset(128)) 1266 VBROADCASTSS(data.Offset(88), Y0) 1267 VMOVUPS(Y0, Mem{Base: RSP}.Offset(96)) 1268 VBROADCASTSS(data.Offset(92), Y0) 1269 VMOVUPS(Y0, Mem{Base: RSP}.Offset(64)) 1270 VBROADCASTSS(data.Offset(96), Y0) 1271 VMOVUPS(Y0, Mem{Base: RSP}.Offset(32)) 1272 VBROADCASTSS(data.Offset(100), Y0) 1273 VMOVUPS(Y0, Mem{Base: RSP}) 1274 VBROADCASTSS(data.Offset(104), Y0) 1275 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32)) 1276 VBROADCASTSS(data.Offset(108), Y0) 1277 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64)) 1278 VPBROADCASTD(data.Offset(112), Y0) 1279 VMOVDQU(Y0, Mem{Base: RSP}.Offset(-96)) 1280 VPBROADCASTD(data.Offset(116), Y8) 1281 VBROADCASTSS(data.Offset(116), Y12) 1282 JMP(LabelRef("LBB8_2")) 1283 1284 Label("LBB8_10") 1285 { 1286 VPXOR(Y0, Y15, Y0) 1287 VPANDN(Y0, Y14, Y0) 1288 VMOVUPS(Mem{Base: RSP}.Offset(-128), Y14) 1289 VMOVUPS(Mem{Base: RSP}.Offset(832), Y2) 1290 VCMPPS(Imm(0), Y2, Y14, Y3) 1291 VCMPPS(Imm(1), Y2, Y14, Y4) 1292 VXORPS(Y4, Y11, Y4) 1293 VPXOR(X7, X7, X7) 1294 VBLENDVPS(Y4, Y12, Y7, Y4) 1295 VBLENDVPS(Y3, Y14, Y4, Y3) 1296 VBLENDVPS(Y0, Y6, Y3, Y0) 1297 VANDPS(Y6, Y10, Y3) 1298 VANDPS(Y9, Y10, Y4) 1299 VORPS(Y2, Y4, Y4) 1300 VBLENDVPS(Y13, Y3, Y4, Y3) 1301 VBLENDVPS(Y1, Y14, Y3, Y1) 1302 VBLENDVPS(Y5, Y0, Y1, Y0) 1303 VCMPPS(Imm(3), Y9, Y9, Y1) 1304 VCMPPS(Imm(3), Y11, Y11, Y3) 1305 VORPS(Y1, Y3, Y1) 1306 VADDPS(Y9, Y11, Y3) 1307 VBLENDVPS(Y1, Y3, Y0, Y6) 1308 VMOVUPS(Y6, Mem{Base: RDI}.Idx(RAX, 4)) 1309 ADDQ(Imm(8), RAX) 1310 CMPQ(RAX, RDX) 1311 JAE(LabelRef("LBB8_12")) 1312 } 1313 1314 Label("LBB8_2") 1315 { 1316 VMOVAPS(Y12, Y2) 1317 VMOVDQU(Mem{Base: RDI}.Idx(RAX, 4), Y9) 1318 VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4), Y11) 1319 VPAND(Mem{Base: RSP}.Offset(320), Y9, Y12) 1320 VMOVUPS(data.Offset(124), X1) 1321 VANDPS(Mem{Base: RDI}.Idx(RAX, 4), X1, X0) 1322 VMOVUPS(data.Offset(140), X3) 1323 VORPS(X3, X0, X0) 1324 VANDPS(Mem{Base: RDI}.Idx(RAX, 4).Offset(16), X1, X1) 1325 VORPS(X3, X1, X1) 1326 VINSERTF128(Imm(1), X1, Y0, Y0) 1327 VMOVUPS(Mem{Base: RSP}.Offset(800), Y1) 1328 VCMPPS(Imm(1), Y0, Y1, Y1) 1329 VANDNPS(Y0, Y1, Y4) 1330 VADDPS(Mem{Base: RSP}.Offset(768), Y0, Y0) 1331 VADDPS(Y4, Y0, Y4) 1332 VMULPS(Y4, Y4, Y6) 1333 VMULPS(Y6, Y6, Y0) 1334 VMOVUPS(Mem{Base: RSP}.Offset(704), Y5) 1335 VFMADD213PS(Mem{Base: RSP}.Offset(736), Y4, Y5) 1336 VMOVUPS(Mem{Base: RSP}.Offset(640), Y10) 1337 VFMADD213PS(Mem{Base: RSP}.Offset(672), Y4, Y10) 1338 VFMADD231PS(Y5, Y6, Y10) 1339 VMOVUPS(Mem{Base: RSP}.Offset(576), Y5) 1340 VFMADD213PS(Mem{Base: RSP}.Offset(608), Y4, Y5) 1341 VMOVUPS(Mem{Base: RSP}.Offset(512), Y13) 1342 VFMADD213PS(Mem{Base: RSP}.Offset(544), Y4, Y13) 1343 VMULPS(Y0, Y0, Y14) 1344 VFMADD132PS(Mem{Base: RSP}.Offset(480), Y13, Y14) 1345 VFMADD231PS(Y5, Y6, Y14) 1346 VFMADD231PS(Y10, Y0, Y14) 1347 VMULPS(Y4, Y6, Y0) 1348 VMULPS(Y0, Y14, Y0) 1349 VMOVDQU(Y12, Mem{Base: RSP}.Offset(832)) 1350 VPSRLD(Imm(23), Y12, Y5) 1351 VPOR(Mem{Base: RSP}.Offset(448), Y5, Y5) 1352 VADDPS(Mem{Base: RSP}.Offset(416), Y5, Y5) 1353 VMOVUPS(Mem{Base: RSP}.Offset(-128), Y3) 1354 VANDPS(Y3, Y1, Y1) 1355 VADDPS(Y1, Y5, Y5) 1356 VMULPS(Y5, Y11, Y1) 1357 VROUNDPS(Imm(8), Y1, Y1) 1358 VFNMADD213PS(Y1, Y11, Y5) 1359 VMOVUPS(Mem{Base: RSP}.Offset(384), Y14) 1360 VMOVAPS(Y14, Y10) 1361 VFMADD213PS(Y4, Y6, Y10) 1362 VADDPS(Y0, Y10, Y10) 1363 VMOVUPS(Mem{Base: RSP}.Offset(352), Y12) 1364 VMULPS(Y4, Y12, Y13) 1365 VMULPS(Y6, Y14, Y14) 1366 VFMADD231PS(Y13, Y4, Y14) 1367 VSUBPS(Y4, Y10, Y4) 1368 VFMADD231PS(Y6, Y12, Y4) 1369 VMOVUPS(Mem{Base: RSP}.Offset(288), Y15) 1370 VMULPS(Y15, Y11, Y6) 1371 VMULPS(Y6, Y10, Y6) 1372 VROUNDPS(Imm(8), Y6, Y6) 1373 VMULPS(Mem{Base: RSP}.Offset(256), Y6, Y13) 1374 VFMADD231PS(Y10, Y11, Y13) 1375 VFNMADD231PS(Mem{Base: RSP}.Offset(224), Y6, Y13) 1376 VSUBPS(Y0, Y14, Y0) 1377 VADDPS(Y4, Y0, Y0) 1378 VMOVUPS(Mem{Base: RSP}.Offset(192), Y10) 1379 VMULPS(Y5, Y10, Y4) 1380 VFNMADD231PS(Y0, Y11, Y4) 1381 VADDPS(Y4, Y13, Y0) 1382 VMULPS(Y0, Y15, Y4) 1383 VROUNDPS(Imm(8), Y4, Y4) 1384 VFMADD231PS(Y10, Y4, Y0) 1385 VMULPS(Y0, Y0, Y5) 1386 VMULPS(Y5, Y5, Y10) 1387 VMOVUPS(Mem{Base: RSP}.Offset(64), Y13) 1388 VFMADD213PS(Mem{Base: RSP}.Offset(96), Y0, Y13) 1389 VMOVUPS(Mem{Base: RSP}.Offset(32), Y14) 1390 VFMADD213PS(Y12, Y0, Y14) 1391 VFMADD231PS(Y13, Y10, Y14) 1392 VMOVUPS(Mem{Base: RSP}.Offset(128), Y10) 1393 VFMADD213PS(Mem{Base: RSP}.Offset(160), Y0, Y10) 1394 VFMADD231PS(Y10, Y5, Y14) 1395 VADDPS(Y3, Y0, Y10) 1396 VFMADD231PS(Y14, Y5, Y10) 1397 VADDPS(Y1, Y6, Y0) 1398 VADDPS(Y4, Y0, Y14) 1399 VCVTPS2DQ(Y14, Y4) 1400 VPSRLD(Imm(23), Y10, Y0) 1401 VPAND(data.Offset(164), Y0, Y0) 1402 VPADDD(Y4, Y0, Y0) 1403 VPCMPGTD(Mem{Base: RSP}, Y0, Y1) 1404 VMOVUPS(Mem{Base: RSP}.Offset(-32), Y3) 1405 VCMPPS(Imm(1), Y14, Y3, Y5) 1406 VPOR(Y5, Y1, Y1) 1407 VMOVDQU(Mem{Base: RSP}.Offset(-64), Y3) 1408 VPCMPGTD(Y0, Y3, Y0) 1409 VCMPPS(Imm(1), Mem{Base: RSP}.Offset(-96), Y14, Y5) 1410 VPOR(Y5, Y0, Y0) 1411 VPSLLD(Imm(23), Y4, Y4) 1412 VPADDD(Y4, Y10, Y6) 1413 VPOR(Y1, Y0, Y4) 1414 VTESTPS(Y4, Y4) 1415 JNE(LabelRef("LBB8_3")) 1416 VPCMPEQD(Y15, Y15, Y15) 1417 VMOVAPS(Y2, Y12) 1418 JMP(LabelRef("LBB8_5")) 1419 } 1420 1421 Label("LBB8_3") 1422 { 1423 VPANDN(Y6, Y0, Y0) 1424 VMOVAPS(Y2, Y12) 1425 VBLENDVPS(Y1, Y2, Y0, Y6) 1426 VPCMPEQD(Y15, Y15, Y15) 1427 } 1428 1429 Label("LBB8_5") 1430 { 1431 VPAND(Y8, Y9, Y5) 1432 VPCMPEQD(Y7, Y5, Y4) 1433 VCMPPS(Imm(1), Y7, Y11, Y13) 1434 VCMPPS(Imm(0), Y7, Y11, Y1) 1435 VANDPS(Mem{Base: RSP}.Offset(-128), Y1, Y0) 1436 VBLENDVPS(Y13, Y12, Y0, Y0) 1437 VBLENDVPS(Y4, Y0, Y6, Y6) 1438 VMOVMSKPS(Y9, ECX) 1439 TESTL(ECX, ECX) 1440 JNE(LabelRef("LBB8_7")) 1441 VXORPS(X10, X10, X10) 1442 JMP(LabelRef("LBB8_8")) 1443 } 1444 1445 Label("LBB8_7") 1446 { 1447 VROUNDPS(Imm(8), Y11, Y0) 1448 VCMPPS(Imm(0), Y0, Y11, Y0) 1449 VCVTPS2DQ(Y11, Y10) 1450 VPSLLD(Imm(31), Y10, Y10) 1451 VPOR(Y6, Y10, Y12) 1452 VPXOR(X3, X3, X3) 1453 VCMPPS(Imm(0), Y3, Y9, Y7) 1454 VBROADCASTSS(data.Offset(120), Y3) 1455 VBLENDVPS(Y7, Y6, Y3, Y3) 1456 VBLENDVPS(Y0, Y12, Y3, Y3) 1457 VMOVAPS(Y2, Y12) 1458 VPSRAD(Imm(31), Y9, Y7) 1459 VBLENDVPS(Y7, Y3, Y6, Y6) 1460 VANDPS(Y0, Y10, Y10) 1461 } 1462 1463 Label("LBB8_8") 1464 { 1465 VPCMPEQD(Y5, Y8, Y0) 1466 VPXOR(Y0, Y15, Y5) 1467 VANDPS(Y8, Y11, Y0) 1468 VANDPS(Y8, Y14, Y3) 1469 VPCMPEQD(Y3, Y8, Y14) 1470 VPXOR(Y15, Y14, Y3) 1471 VPCMPEQD(Y0, Y8, Y0) 1472 VPANDN(Y5, Y0, Y7) 1473 VPOR(Y4, Y3, Y3) 1474 VPAND(Y7, Y3, Y3) 1475 VTESTPS(Y15, Y3) 1476 JAE(LabelRef("LBB8_10")) 1477 VPXOR(X7, X7, X7) 1478 VMOVUPS(Y6, Mem{Base: RDI}.Idx(RAX, 4)) 1479 ADDQ(Imm(8), RAX) 1480 CMPQ(RAX, RDX) 1481 JB(LabelRef("LBB8_2")) 1482 } 1483 1484 Label("LBB8_12") 1485 { 1486 ADDQ(I32(872), RSP) 1487 VZEROUPPER() 1488 RET() 1489 } 1490 } 1491 1492 func genLog10_Len8x_F32() { 1493 1494 data := GLOBL("dataLog10Len8xF32", RODATA|NOPTR) 1495 DATA(0, U32(0x00800000)) 1496 DATA(4, U32(2155872255)) 1497 DATA(8, U32(1056964608)) 1498 DATA(12, U32(4294967169)) 1499 DATA(16, U32(0x3f800000)) 1500 DATA(20, U32(0x3f3504f3)) 1501 DATA(24, U32(0xbf800000)) 1502 DATA(28, U32(0x3d9021bb)) 1503 DATA(32, U32(0xbdebd1b8)) 1504 DATA(36, U32(0x3def251a)) 1505 DATA(40, U32(0xbdfe5d4f)) 1506 DATA(44, U32(0x3e11e9bf)) 1507 DATA(48, U32(0xbe2aae50)) 1508 DATA(52, U32(0x3e4cceac)) 1509 DATA(56, U32(0xbe7ffffc)) 1510 DATA(60, U32(0x3eaaaaaa)) 1511 DATA(64, U32(0x3f317218)) 1512 DATA(68, U32(0xbf000000)) 1513 DATA(72, U32(0x3ede5bd9)) 1514 DATA(76, U64(0x0)) 1515 DATA(84, U64(0x0)) 1516 DATA(92, U64(0x0)) 1517 DATA(100, U64(0x0)) 1518 1519 TEXT("Log10_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)") 1520 Pragma("noescape") 1521 Load(Param("x").Base(), RDI) 1522 Load(Param("x").Len(), RSI) 1523 1524 SUBQ(Imm(136), RSP) 1525 TESTQ(RSI, RSI) 1526 JE(LabelRef("LBB8_3")) 1527 XORL(EAX, EAX) 1528 VBROADCASTSS(data.Offset(4), Y0) 1529 VMOVUPS(Y0, Mem{Base: RSP}.Offset(96)) 1530 VBROADCASTSS(data.Offset(8), Y0) 1531 VMOVUPS(Y0, Mem{Base: RSP}.Offset(64)) 1532 VBROADCASTSS(data.Offset(12), Y0) 1533 VMOVUPS(Y0, Mem{Base: RSP}.Offset(32)) 1534 VBROADCASTSS(data.Offset(0), Y0) 1535 VMOVUPS(Y0, Mem{Base: RSP}) 1536 VBROADCASTSS(data.Offset(16), Y0) 1537 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32)) 1538 VBROADCASTSS(data.Offset(20), Y0) 1539 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64)) 1540 VBROADCASTSS(data.Offset(24), Y0) 1541 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96)) 1542 VBROADCASTSS(data.Offset(28), Y0) 1543 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128)) 1544 VBROADCASTSS(data.Offset(32), Y9) 1545 VBROADCASTSS(data.Offset(36), Y10) 1546 VBROADCASTSS(data.Offset(40), Y11) 1547 VBROADCASTSS(data.Offset(44), Y12) 1548 VBROADCASTSS(data.Offset(48), Y13) 1549 VBROADCASTSS(data.Offset(52), Y14) 1550 VBROADCASTSS(data.Offset(56), Y15) 1551 VBROADCASTSS(data.Offset(60), Y0) 1552 VBROADCASTSS(data.Offset(64), Y1) 1553 VBROADCASTSS(data.Offset(68), Y2) 1554 VBROADCASTSS(data.Offset(72), Y3) 1555 1556 Label("LBB8_2") 1557 { 1558 VMOVUPS(Mem{Base: RDI}.Idx(RAX, 4), Y4) 1559 VMAXPS(Mem{Base: RSP}, Y4, Y5) 1560 VPSRLD(Imm(23), Y5, Y6) 1561 VPADDD(Mem{Base: RSP}.Offset(32), Y6, Y6) 1562 VANDPS(Mem{Base: RSP}.Offset(96), Y5, Y5) 1563 VORPS(Mem{Base: RSP}.Offset(64), Y5, Y5) 1564 VCVTDQ2PS(Y6, Y6) 1565 VADDPS(Mem{Base: RSP}.Offset(-32), Y6, Y7) 1566 VCMPPS(Imm(1), Mem{Base: RSP}.Offset(-64), Y5, Y8) 1567 VBLENDVPS(Y8, Y6, Y7, Y6) 1568 VANDPS(Y5, Y8, Y7) 1569 VADDPS(Mem{Base: RSP}.Offset(-96), Y5, Y5) 1570 VADDPS(Y7, Y5, Y5) 1571 VMOVUPS(Mem{Base: RSP}.Offset(-128), Y7) 1572 VFMADD213PS(Y9, Y5, Y7) 1573 VFMADD213PS(Y10, Y5, Y7) 1574 VFMADD213PS(Y11, Y5, Y7) 1575 VFMADD213PS(Y12, Y5, Y7) 1576 VFMADD213PS(Y13, Y5, Y7) 1577 VFMADD213PS(Y14, Y5, Y7) 1578 VFMADD213PS(Y15, Y5, Y7) 1579 VFMADD213PS(Y0, Y5, Y7) 1580 VFMADD213PS(Y2, Y5, Y7) 1581 VFMADD213PS(Y5, Y1, Y6) 1582 VMULPS(Y5, Y5, Y5) 1583 VFMADD231PS(Y7, Y5, Y6) 1584 VCMPPS(Imm(2), data.Offset(76), Y4, Y4) 1585 VMULPS(Y3, Y6, Y5) 1586 VORPS(Y5, Y4, Y4) 1587 VMOVUPS(Y4, Mem{Base: RDI}.Idx(RAX, 4)) 1588 ADDQ(Imm(8), RAX) 1589 CMPQ(RAX, RSI) 1590 JB(LabelRef("LBB8_2")) 1591 } 1592 1593 Label("LBB8_3") 1594 { 1595 ADDQ(Imm(136), RSP) 1596 VZEROUPPER() 1597 RET() 1598 } 1599 } 1600 1601 func genLog2_Len8x_F32() { 1602 1603 data := GLOBL("dataLog2Len8xF32", RODATA|NOPTR) 1604 DATA(0, U32(0x00800000)) 1605 DATA(4, U32(2155872255)) 1606 DATA(8, U32(1056964608)) 1607 DATA(12, U32(4294967169)) 1608 DATA(16, U32(0x3f800000)) 1609 DATA(20, U32(0x3f3504f3)) 1610 DATA(24, U32(0xbf800000)) 1611 DATA(28, U32(0x3d9021bb)) 1612 DATA(32, U32(0xbdebd1b8)) 1613 DATA(36, U32(0x3def251a)) 1614 DATA(40, U32(0xbdfe5d4f)) 1615 DATA(44, U32(0x3e11e9bf)) 1616 DATA(48, U32(0xbe2aae50)) 1617 DATA(52, U32(0x3e4cceac)) 1618 DATA(56, U32(0xbe7ffffc)) 1619 DATA(60, U32(0x3eaaaaaa)) 1620 DATA(64, U32(0x3f317218)) 1621 DATA(68, U32(0xbf000000)) 1622 DATA(72, U32(0x3fb8aa3b)) 1623 DATA(76, U64(0x0)) 1624 DATA(84, U64(0x0)) 1625 DATA(92, U64(0x0)) 1626 DATA(100, U64(0x0)) 1627 1628 TEXT("Log2_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)") 1629 Pragma("noescape") 1630 Load(Param("x").Base(), RDI) 1631 Load(Param("x").Len(), RSI) 1632 1633 SUBQ(Imm(136), RSP) 1634 TESTQ(RSI, RSI) 1635 JE(LabelRef("LBB9_3")) 1636 XORL(EAX, EAX) 1637 VBROADCASTSS(data.Offset(4), Y0) 1638 VMOVUPS(Y0, Mem{Base: RSP}.Offset(96)) 1639 VBROADCASTSS(data.Offset(8), Y0) 1640 VMOVUPS(Y0, Mem{Base: RSP}.Offset(64)) 1641 VBROADCASTSS(data.Offset(12), Y0) 1642 VMOVUPS(Y0, Mem{Base: RSP}.Offset(32)) 1643 VBROADCASTSS(data.Offset(0), Y0) 1644 VMOVUPS(Y0, Mem{Base: RSP}) 1645 VBROADCASTSS(data.Offset(16), Y0) 1646 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32)) 1647 VBROADCASTSS(data.Offset(20), Y0) 1648 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64)) 1649 VBROADCASTSS(data.Offset(24), Y0) 1650 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96)) 1651 VBROADCASTSS(data.Offset(28), Y0) 1652 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128)) 1653 VBROADCASTSS(data.Offset(32), Y9) 1654 VBROADCASTSS(data.Offset(36), Y10) 1655 VBROADCASTSS(data.Offset(40), Y11) 1656 VBROADCASTSS(data.Offset(44), Y12) 1657 VBROADCASTSS(data.Offset(48), Y13) 1658 VBROADCASTSS(data.Offset(52), Y14) 1659 VBROADCASTSS(data.Offset(56), Y15) 1660 VBROADCASTSS(data.Offset(60), Y0) 1661 VBROADCASTSS(data.Offset(64), Y1) 1662 VBROADCASTSS(data.Offset(68), Y2) 1663 VBROADCASTSS(data.Offset(72), Y3) 1664 1665 Label("LBB9_2") 1666 { 1667 VMOVUPS(Mem{Base: RDI}.Idx(RAX, 4), Y4) 1668 VMAXPS(Mem{Base: RSP}, Y4, Y5) 1669 VPSRLD(Imm(23), Y5, Y6) 1670 VPADDD(Mem{Base: RSP}.Offset(32), Y6, Y6) 1671 VANDPS(Mem{Base: RSP}.Offset(96), Y5, Y5) 1672 VORPS(Mem{Base: RSP}.Offset(64), Y5, Y5) 1673 VCVTDQ2PS(Y6, Y6) 1674 VADDPS(Mem{Base: RSP}.Offset(-32), Y6, Y7) 1675 VCMPPS(Imm(1), Mem{Base: RSP}.Offset(-64), Y5, Y8) 1676 VBLENDVPS(Y8, Y6, Y7, Y6) 1677 VANDPS(Y5, Y8, Y7) 1678 VADDPS(Mem{Base: RSP}.Offset(-96), Y5, Y5) 1679 VADDPS(Y7, Y5, Y5) 1680 VMOVUPS(Mem{Base: RSP}.Offset(-128), Y7) 1681 VFMADD213PS(Y9, Y5, Y7) 1682 VFMADD213PS(Y10, Y5, Y7) 1683 VFMADD213PS(Y11, Y5, Y7) 1684 VFMADD213PS(Y12, Y5, Y7) 1685 VFMADD213PS(Y13, Y5, Y7) 1686 VFMADD213PS(Y14, Y5, Y7) 1687 VFMADD213PS(Y15, Y5, Y7) 1688 VFMADD213PS(Y0, Y5, Y7) 1689 VFMADD213PS(Y2, Y5, Y7) 1690 VFMADD213PS(Y5, Y1, Y6) 1691 VMULPS(Y5, Y5, Y5) 1692 VFMADD231PS(Y7, Y5, Y6) 1693 VCMPPS(Imm(2), data.Offset(76), Y4, Y4) 1694 VMULPS(Y3, Y6, Y5) 1695 VORPS(Y5, Y4, Y4) 1696 VMOVUPS(Y4, Mem{Base: RDI}.Idx(RAX, 4)) 1697 ADDQ(Imm(8), RAX) 1698 CMPQ(RAX, RSI) 1699 JB(LabelRef("LBB9_2")) 1700 } 1701 1702 Label("LBB9_3") 1703 { 1704 ADDQ(Imm(136), RSP) 1705 VZEROUPPER() 1706 RET() 1707 } 1708 } 1709 1710 func genLog_Len8x_F32() { 1711 1712 data := GLOBL("dataLogLen8xF32", RODATA|NOPTR) 1713 DATA(0, U32(0x00800000)) 1714 DATA(4, U32(2155872255)) 1715 DATA(8, U32(1056964608)) 1716 DATA(12, U32(4294967169)) 1717 DATA(16, U32(0x3f800000)) 1718 DATA(20, U32(0x3f3504f3)) 1719 DATA(24, U32(0xbf800000)) 1720 DATA(28, U32(0x3d9021bb)) 1721 DATA(32, U32(0xbdebd1b8)) 1722 DATA(36, U32(0x3def251a)) 1723 DATA(40, U32(0xbdfe5d4f)) 1724 DATA(44, U32(0x3e11e9bf)) 1725 DATA(48, U32(0xbe2aae50)) 1726 DATA(52, U32(0x3e4cceac)) 1727 DATA(56, U32(0xbe7ffffc)) 1728 DATA(60, U32(0x3eaaaaaa)) 1729 DATA(64, U32(0x3f317218)) 1730 DATA(68, U32(0xbf000000)) 1731 DATA(72, U64(0x0)) 1732 DATA(80, U64(0x0)) 1733 DATA(88, U64(0x0)) 1734 DATA(96, U64(0x0)) 1735 1736 TEXT("Log_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)") 1737 Pragma("noescape") 1738 Load(Param("x").Base(), RDI) 1739 Load(Param("x").Len(), RSI) 1740 1741 SUBQ(Imm(104), RSP) 1742 TESTQ(RSI, RSI) 1743 JE(LabelRef("LBB10_3")) 1744 XORL(EAX, EAX) 1745 VBROADCASTSS(data.Offset(0), Y0) 1746 VMOVUPS(Y0, Mem{Base: RSP}.Offset(64)) 1747 VBROADCASTSS(data.Offset(4), Y0) 1748 VMOVUPS(Y0, Mem{Base: RSP}.Offset(32)) 1749 VBROADCASTSS(data.Offset(8), Y0) 1750 VMOVUPS(Y0, Mem{Base: RSP}) 1751 VBROADCASTSS(data.Offset(12), Y0) 1752 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32)) 1753 VBROADCASTSS(data.Offset(16), Y0) 1754 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64)) 1755 VBROADCASTSS(data.Offset(20), Y0) 1756 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96)) 1757 VBROADCASTSS(data.Offset(24), Y0) 1758 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128)) 1759 VBROADCASTSS(data.Offset(28), Y8) 1760 VBROADCASTSS(data.Offset(32), Y9) 1761 VBROADCASTSS(data.Offset(36), Y10) 1762 VBROADCASTSS(data.Offset(40), Y11) 1763 VBROADCASTSS(data.Offset(44), Y12) 1764 VBROADCASTSS(data.Offset(48), Y13) 1765 VBROADCASTSS(data.Offset(52), Y14) 1766 VBROADCASTSS(data.Offset(56), Y15) 1767 VBROADCASTSS(data.Offset(60), Y0) 1768 VBROADCASTSS(data.Offset(64), Y1) 1769 VBROADCASTSS(data.Offset(68), Y2) 1770 1771 Label("LBB10_2") 1772 { 1773 VMOVUPS(Mem{Base: RDI}.Idx(RAX, 4), Y3) 1774 VMAXPS(Mem{Base: RSP}.Offset(64), Y3, Y4) 1775 VPSRLD(Imm(23), Y4, Y5) 1776 VPADDD(Mem{Base: RSP}.Offset(-32), Y5, Y5) 1777 VANDPS(Mem{Base: RSP}.Offset(32), Y4, Y4) 1778 VORPS(Mem{Base: RSP}, Y4, Y4) 1779 VCVTDQ2PS(Y5, Y5) 1780 VADDPS(Mem{Base: RSP}.Offset(-64), Y5, Y6) 1781 VCMPPS(Imm(1), Mem{Base: RSP}.Offset(-96), Y4, Y7) 1782 VBLENDVPS(Y7, Y5, Y6, Y5) 1783 VANDPS(Y4, Y7, Y6) 1784 VADDPS(Mem{Base: RSP}.Offset(-128), Y4, Y4) 1785 VADDPS(Y6, Y4, Y4) 1786 VMOVAPS(Y8, Y6) 1787 VFMADD213PS(Y9, Y4, Y6) 1788 VFMADD213PS(Y10, Y4, Y6) 1789 VFMADD213PS(Y11, Y4, Y6) 1790 VFMADD213PS(Y12, Y4, Y6) 1791 VFMADD213PS(Y13, Y4, Y6) 1792 VFMADD213PS(Y14, Y4, Y6) 1793 VFMADD213PS(Y15, Y4, Y6) 1794 VFMADD213PS(Y0, Y4, Y6) 1795 VFMADD213PS(Y2, Y4, Y6) 1796 VFMADD213PS(Y4, Y1, Y5) 1797 VMULPS(Y4, Y4, Y4) 1798 VFMADD231PS(Y6, Y4, Y5) 1799 VCMPPS(Imm(2), data.Offset(72), Y3, Y3) 1800 VORPS(Y5, Y3, Y3) 1801 VMOVUPS(Y3, Mem{Base: RDI}.Idx(RAX, 4)) 1802 ADDQ(Imm(8), RAX) 1803 CMPQ(RAX, RSI) 1804 JB(LabelRef("LBB10_2")) 1805 } 1806 1807 Label("LBB10_3") 1808 { 1809 ADDQ(Imm(104), RSP) 1810 VZEROUPPER() 1811 RET() 1812 } 1813 } 1814 1815 func genExp_Len8x_F32() { 1816 1817 data := GLOBL("dataExpLen8xF32", RODATA|NOPTR) 1818 DATA(0, U32(0x42b17218)) 1819 DATA(4, U32(0xc2ce8ed0)) 1820 DATA(8, U32(0x3f000000)) 1821 DATA(12, U32(0x3fb8aa3b)) 1822 DATA(16, U32(0xbf318000)) 1823 DATA(20, U32(0x395e8083)) 1824 DATA(24, U32(1065353216)) 1825 DATA(28, U32(0x3ab743ce)) 1826 DATA(32, U32(0x39506967)) 1827 DATA(36, U32(0x3c088908)) 1828 DATA(40, U32(0x3d2aa9c1)) 1829 DATA(44, U32(0x3e2aaaaa)) 1830 DATA(48, U32(0x7f7fffff)) 1831 1832 TEXT("Exp_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)") 1833 Pragma("noescape") 1834 Load(Param("x").Base(), RDI) 1835 Load(Param("x").Len(), RSI) 1836 1837 TESTQ(RSI, RSI) 1838 JE(LabelRef("LBB11_3")) 1839 XORL(EAX, EAX) 1840 VBROADCASTSS(data.Offset(0), Y0) 1841 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-40)) 1842 VBROADCASTSS(data.Offset(4), Y0) 1843 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-72)) 1844 VBROADCASTSS(data.Offset(8), Y2) 1845 VBROADCASTSS(data.Offset(12), Y3) 1846 VBROADCASTSS(data.Offset(16), Y4) 1847 VBROADCASTSS(data.Offset(20), Y5) 1848 VPBROADCASTD(data.Offset(24), Y6) 1849 VBROADCASTSS(data.Offset(28), Y7) 1850 VBROADCASTSS(data.Offset(32), Y1) 1851 VBROADCASTSS(data.Offset(36), Y9) 1852 VBROADCASTSS(data.Offset(40), Y10) 1853 VBROADCASTSS(data.Offset(44), Y11) 1854 VBROADCASTSS(data.Offset(48), Y12) 1855 1856 Label("LBB11_2") 1857 { 1858 VMOVUPS(Mem{Base: RDI}.Idx(RAX, 4), Y13) 1859 VMOVAPS(Y3, Y14) 1860 VFMADD213PS(Y2, Y13, Y14) 1861 VROUNDPS(Imm(1), Y14, Y14) 1862 VMOVAPS(Y4, Y15) 1863 VFMADD213PS(Y13, Y14, Y15) 1864 VFMADD231PS(Y5, Y14, Y15) 1865 VMULPS(Y15, Y15, Y0) 1866 VMOVAPS(Y1, Y8) 1867 VFMADD213PS(Y7, Y15, Y8) 1868 VFMADD213PS(Y9, Y15, Y8) 1869 VFMADD213PS(Y10, Y15, Y8) 1870 VFMADD213PS(Y11, Y15, Y8) 1871 VFMADD213PS(Y2, Y15, Y8) 1872 VFMADD213PS(Y15, Y0, Y8) 1873 VCVTTPS2DQ(Y14, Y0) 1874 VPSLLD(Imm(23), Y0, Y0) 1875 VPADDD(Y6, Y0, Y0) 1876 VFMADD213PS(Y0, Y0, Y8) 1877 VMOVUPS(Mem{Base: RSP}.Offset(-40), Y0) 1878 VCMPPS(Imm(1), Y13, Y0, Y0) 1879 VBLENDVPS(Y0, Y12, Y8, Y0) 1880 VMOVUPS(Mem{Base: RSP}.Offset(-72), Y8) 1881 VCMPPS(Imm(2), Y13, Y8, Y8) 1882 VANDPS(Y0, Y8, Y0) 1883 VMOVUPS(Y0, Mem{Base: RDI}.Idx(RAX, 4)) 1884 ADDQ(Imm(8), RAX) 1885 CMPQ(RAX, RSI) 1886 JB(LabelRef("LBB11_2")) 1887 } 1888 1889 Label("LBB11_3") 1890 { 1891 VZEROUPPER() 1892 RET() 1893 } 1894 } 1895 1896 func genSin_F32() { 1897 1898 data := GLOBL("dataSinF32", RODATA|NOPTR) 1899 DATA(0, U32(2147483647)) 1900 DATA(4, U32(0x3fa2f983)) 1901 DATA(8, U32(4294967294)) 1902 DATA(12, U32(2)) 1903 DATA(16, U32(0xbf490fdb)) 1904 DATA(20, U32(2147483648)) 1905 DATA(24, U32(0x37ccf5ce)) 1906 DATA(28, U32(0xbab6061a)) 1907 DATA(32, U32(0x3d2aaaa5)) 1908 DATA(36, U32(0xbf000000)) 1909 DATA(40, U32(0x3f800000)) 1910 DATA(44, U32(0xb94ca1f9)) 1911 DATA(48, U32(0x3c08839e)) 1912 DATA(52, U32(0xbe2aaaa3)) 1913 DATA(56, U32(0x4b7fffff)) 1914 DATA(60, U64(0xffffffffffffffff)) 1915 DATA(68, U64(0xffffffffffffffff)) 1916 DATA(76, U64(0xffffffffffffffff)) 1917 DATA(84, U64(0xffffffffffffffff)) 1918 DATA(92, U64(0x0)) 1919 DATA(100, U64(0x0)) 1920 DATA(108, U64(0x0)) 1921 DATA(116, U64(0x0)) 1922 1923 TEXT("Sin_AVX2_F32", 0, "func(x []float32)") 1924 Pragma("noescape") 1925 Load(Param("x").Base(), RDI) 1926 Load(Param("x").Len(), RSI) 1927 1928 PUSHQ(RAX) 1929 MOVQ(RSI, RAX) 1930 ANDQ(I32(-8), RAX) 1931 JE(LabelRef("LBB12_3")) 1932 XORL(ECX, ECX) 1933 VBROADCASTSS(data.Offset(0), Y0) 1934 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32)) 1935 VBROADCASTSS(data.Offset(4), Y0) 1936 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64)) 1937 VBROADCASTSS(data.Offset(8), Y0) 1938 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96)) 1939 VPBROADCASTD(data.Offset(12), Y4) 1940 VPBROADCASTD(data.Offset(16), Y0) 1941 VMOVDQU(Y0, Mem{Base: RSP}.Offset(-128)) 1942 VPBROADCASTD(data.Offset(20), Y7) 1943 VBROADCASTSS(data.Offset(24), Y8) 1944 VBROADCASTSS(data.Offset(28), Y9) 1945 VBROADCASTSS(data.Offset(32), Y10) 1946 VBROADCASTSS(data.Offset(36), Y11) 1947 VBROADCASTSS(data.Offset(40), Y12) 1948 VBROADCASTSS(data.Offset(44), Y3) 1949 VBROADCASTSS(data.Offset(48), Y14) 1950 VBROADCASTSS(data.Offset(52), Y15) 1951 1952 Label("LBB12_2") 1953 { 1954 VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4), Y2) 1955 VANDPS(Mem{Base: RSP}.Offset(-32), Y2, Y5) 1956 VMULPS(Mem{Base: RSP}.Offset(-64), Y5, Y0) 1957 VCVTTPS2DQ(Y0, Y0) 1958 VPSUBD(data.Offset(60), Y0, Y0) 1959 VPAND(Mem{Base: RSP}.Offset(-96), Y0, Y1) 1960 VCVTDQ2PS(Y1, Y1) 1961 VFMADD132PS(Mem{Base: RSP}.Offset(-128), Y5, Y1) 1962 VMULPS(Y1, Y1, Y5) 1963 VMOVAPS(Y3, Y13) 1964 VFMADD213PS(Y14, Y5, Y13) 1965 VFMADD213PS(Y15, Y5, Y13) 1966 VMULPS(Y1, Y5, Y6) 1967 VFMADD213PS(Y1, Y13, Y6) 1968 VPSLLD(Imm(29), Y0, Y1) 1969 VPAND(Y4, Y0, Y0) 1970 VPXOR(Y2, Y1, Y1) 1971 VMOVAPS(Y8, Y2) 1972 VFMADD213PS(Y9, Y5, Y2) 1973 VFMADD213PS(Y10, Y5, Y2) 1974 VFMADD213PS(Y11, Y5, Y2) 1975 VFMADD213PS(Y12, Y5, Y2) 1976 VPCMPEQD(Y4, Y0, Y5) 1977 VANDPS(Y5, Y2, Y2) 1978 VPCMPEQD(data.Offset(92), Y0, Y0) 1979 VANDPS(Y0, Y6, Y0) 1980 VADDPS(Y2, Y0, Y0) 1981 VPAND(Y7, Y1, Y1) 1982 VPXOR(Y0, Y1, Y0) 1983 VMOVDQU(Y0, Mem{Base: RDI}.Idx(RCX, 4)) 1984 ADDQ(Imm(8), RCX) 1985 CMPQ(RCX, RAX) 1986 JB(LabelRef("LBB12_2")) 1987 } 1988 1989 Label("LBB12_3") 1990 { 1991 CMPQ(RAX, RSI) 1992 JAE(LabelRef("LBB12_14")) 1993 VBROADCASTSS(data.Offset(20), X0) 1994 VPXOR(X1, X1, X1) 1995 VMOVSS(data.Offset(56), X2) 1996 VMOVSS(data.Offset(40), X9) 1997 VMOVSS(data.Offset(16), X10) 1998 VMOVSS(data.Offset(24), X12) 1999 VMOVSS(data.Offset(28), X11) 2000 VMOVSS(data.Offset(32), X13) 2001 VMOVSS(data.Offset(36), X14) 2002 VMOVSS(data.Offset(44), X8) 2003 VMOVSS(data.Offset(48), X15) 2004 VMOVSS(data.Offset(52), X6) 2005 JMP(LabelRef("LBB12_5")) 2006 } 2007 2008 Label("LBB12_13") 2009 { 2010 ADDQ(Imm(1), RAX) 2011 CMPQ(RAX, RSI) 2012 JAE(LabelRef("LBB12_14")) 2013 } 2014 2015 Label("LBB12_5") 2016 { 2017 VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X4) 2018 VXORPS(X0, X4, X3) 2019 VCMPSS(Imm(1), X1, X4, X5) 2020 VBLENDVPS(X5, X3, X4, X3) 2021 VUCOMISS(X2, X3) 2022 JA(LabelRef("LBB12_13")) 2023 VUCOMISS(X1, X4) 2024 SETCS(R8B) 2025 VMULSS(data.Offset(4), X3, X4) 2026 VCVTTSS2SI(X4, EDX) 2027 VROUNDSS(Imm(11), X4, X4, X4) 2028 MOVL(EDX, ECX) 2029 ANDL(Imm(1), ECX) 2030 JE(LabelRef("LBB12_8")) 2031 VADDSS(X4, X9, X4) 2032 } 2033 2034 Label("LBB12_8") 2035 { 2036 ADDL(EDX, ECX) 2037 ANDL(Imm(7), ECX) 2038 LEAL(Mem{Base: RCX}.Offset(-4), EDX) 2039 CMPL(ECX, Imm(4)) 2040 SETCC(R9B) 2041 CMOVLLT(ECX, EDX) 2042 VFMADD231SS(X10, X4, X3) 2043 VMULSS(X3, X3, X4) 2044 VMOVAPS(X12, X7) 2045 VFMADD213SS(X11, X4, X7) 2046 VFMADD213SS(X13, X4, X7) 2047 VFMADD213SS(X14, X4, X7) 2048 VMOVAPS(X8, X5) 2049 VFMADD213SS(X15, X4, X5) 2050 VFMADD213SS(X6, X4, X5) 2051 ADDL(I32(-1), EDX) 2052 CMPL(EDX, Imm(2)) 2053 JB(LabelRef("LBB12_9")) 2054 VMULSS(X3, X4, X4) 2055 VFMADD213SS(X3, X4, X5) 2056 VMOVAPS(X5, X4) 2057 VMOVSS(X4, Mem{Base: RDI}.Idx(RAX, 4)) 2058 CMPB(R8B, R9B) 2059 JE(LabelRef("LBB12_13")) 2060 JMP(LabelRef("LBB12_12")) 2061 } 2062 2063 Label("LBB12_9") 2064 { 2065 VFMADD213SS(X9, X7, X4) 2066 VMOVSS(X4, Mem{Base: RDI}.Idx(RAX, 4)) 2067 CMPB(R8B, R9B) 2068 JE(LabelRef("LBB12_13")) 2069 } 2070 2071 Label("LBB12_12") 2072 { 2073 VXORPS(X0, X4, X3) 2074 VMOVSS(X3, Mem{Base: RDI}.Idx(RAX, 4)) 2075 JMP(LabelRef("LBB12_13")) 2076 } 2077 2078 Label("LBB12_14") 2079 { 2080 POPQ(RAX) 2081 VZEROUPPER() 2082 RET() 2083 } 2084 } 2085 2086 func genCos_F32() { 2087 2088 data := GLOBL("dataCosF32", RODATA|NOPTR) 2089 DATA(0, U32(2147483647)) 2090 DATA(4, U32(0x3fa2f983)) 2091 DATA(8, U32(4294967294)) 2092 DATA(12, U32(2)) 2093 DATA(16, U32(0xbf490fdb)) 2094 DATA(20, U32(3221225472)) 2095 DATA(24, U32(0x37ccf5ce)) 2096 DATA(28, U32(0xbab6061a)) 2097 DATA(32, U32(0x3d2aaaa5)) 2098 DATA(36, U32(0xbf000000)) 2099 DATA(40, U32(0x3f800000)) 2100 DATA(44, U32(0xb94ca1f9)) 2101 DATA(48, U32(0x3c08839e)) 2102 DATA(52, U32(0xbe2aaaa3)) 2103 DATA(56, U32(2147483648)) 2104 DATA(60, U32(0x4b7fffff)) 2105 DATA(64, U64(0xffffffffffffffff)) 2106 DATA(72, U64(0xffffffffffffffff)) 2107 DATA(80, U64(0xffffffffffffffff)) 2108 DATA(88, U64(0xffffffffffffffff)) 2109 DATA(96, U64(0x0)) 2110 DATA(104, U64(0x0)) 2111 DATA(112, U64(0x0)) 2112 DATA(120, U64(0x0)) 2113 2114 TEXT("Cos_AVX2_F32", NOSPLIT, "func(x []float32)") 2115 Pragma("noescape") 2116 Load(Param("x").Base(), RDI) 2117 Load(Param("x").Len(), RSI) 2118 2119 SUBQ(Imm(72), RSP) 2120 MOVQ(RSI, RAX) 2121 ANDQ(I32(-8), RAX) 2122 JE(LabelRef("LBB13_3")) 2123 XORL(ECX, ECX) 2124 VBROADCASTSS(data.Offset(0), Y0) 2125 VMOVUPS(Y0, Mem{Base: RSP}.Offset(32)) 2126 VBROADCASTSS(data.Offset(4), Y0) 2127 VMOVUPS(Y0, Mem{Base: RSP}) 2128 VBROADCASTSS(data.Offset(8), Y0) 2129 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32)) 2130 VPBROADCASTD(data.Offset(12), Y4) 2131 VBROADCASTSS(data.Offset(16), Y0) 2132 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64)) 2133 VBROADCASTSS(data.Offset(20), Y0) 2134 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96)) 2135 VBROADCASTSS(data.Offset(24), Y0) 2136 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128)) 2137 VBROADCASTSS(data.Offset(28), Y9) 2138 VBROADCASTSS(data.Offset(32), Y10) 2139 VBROADCASTSS(data.Offset(36), Y6) 2140 VBROADCASTSS(data.Offset(40), Y12) 2141 VBROADCASTSS(data.Offset(44), Y13) 2142 VBROADCASTSS(data.Offset(48), Y14) 2143 VBROADCASTSS(data.Offset(52), Y15) 2144 VPBROADCASTD(data.Offset(56), Y2) 2145 2146 Label("LBB13_2") 2147 { 2148 VMOVUPS(Mem{Base: RSP}.Offset(32), Y0) 2149 VANDPS(Mem{Base: RDI}.Idx(RCX, 4), Y0, Y5) 2150 VMULPS(Mem{Base: RSP}, Y5, Y0) 2151 VCVTTPS2DQ(Y0, Y0) 2152 VPSUBD(data.Offset(64), Y0, Y0) 2153 VPAND(Mem{Base: RSP}.Offset(-32), Y0, Y1) 2154 VCVTDQ2PS(Y1, Y3) 2155 VFMADD132PS(Mem{Base: RSP}.Offset(-64), Y5, Y3) 2156 VMULPS(Y3, Y3, Y5) 2157 VMOVUPS(Mem{Base: RSP}.Offset(-128), Y8) 2158 VFMADD213PS(Y9, Y5, Y8) 2159 VFMADD213PS(Y10, Y5, Y8) 2160 VMULPS(Y5, Y5, Y7) 2161 VMOVAPS(Y6, Y11) 2162 VFMADD213PS(Y12, Y5, Y11) 2163 VFMADD231PS(Y7, Y8, Y11) 2164 VMOVAPS(Y13, Y7) 2165 VFMADD213PS(Y14, Y5, Y7) 2166 VFMADD213PS(Y15, Y5, Y7) 2167 VMULPS(Y3, Y5, Y5) 2168 VFMADD213PS(Y3, Y7, Y5) 2169 VPAND(Y4, Y0, Y0) 2170 VPCMPEQD(Y4, Y0, Y3) 2171 VPCMPEQD(data.Offset(96), Y0, Y0) 2172 VANDPS(Y0, Y5, Y0) 2173 VANDPS(Y3, Y11, Y3) 2174 VADDPS(Y3, Y0, Y0) 2175 VADDPS(Y5, Y11, Y3) 2176 VSUBPS(Y0, Y3, Y0) 2177 VPSLLD(Imm(29), Y1, Y1) 2178 VPADDD(Mem{Base: RSP}.Offset(-96), Y1, Y1) 2179 VPAND(Y2, Y1, Y1) 2180 VPXOR(Y2, Y1, Y1) 2181 VXORPS(Y1, Y0, Y0) 2182 VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4)) 2183 ADDQ(Imm(8), RCX) 2184 CMPQ(RCX, RAX) 2185 JB(LabelRef("LBB13_2")) 2186 } 2187 2188 Label("LBB13_3") 2189 { 2190 CMPQ(RAX, RSI) 2191 JAE(LabelRef("LBB13_14")) 2192 VBROADCASTSS(data.Offset(56), X0) 2193 VXORPS(X1, X1, X1) 2194 VMOVSS(data.Offset(60), X2) 2195 VMOVSS(data.Offset(40), X9) 2196 VMOVSS(data.Offset(16), X10) 2197 VMOVSS(data.Offset(24), X8) 2198 VMOVSS(data.Offset(28), X11) 2199 VMOVSS(data.Offset(32), X13) 2200 VMOVSS(data.Offset(36), X14) 2201 VMOVSS(data.Offset(44), X7) 2202 VMOVSS(data.Offset(48), X15) 2203 VMOVSS(data.Offset(52), X6) 2204 JMP(LabelRef("LBB13_5")) 2205 } 2206 2207 Label("LBB13_13") 2208 { 2209 ADDQ(Imm(1), RAX) 2210 CMPQ(RAX, RSI) 2211 JAE(LabelRef("LBB13_14")) 2212 } 2213 2214 Label("LBB13_5") 2215 { 2216 VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X3) 2217 VXORPS(X0, X3, X4) 2218 VCMPSS(Imm(1), X1, X3, X5) 2219 VBLENDVPS(X5, X4, X3, X3) 2220 VUCOMISS(X2, X3) 2221 JA(LabelRef("LBB13_13")) 2222 VMULSS(data.Offset(4), X3, X4) 2223 VCVTTSS2SI(X4, EDX) 2224 VROUNDSS(Imm(11), X4, X4, X4) 2225 MOVL(EDX, ECX) 2226 ANDL(Imm(1), ECX) 2227 JE(LabelRef("LBB13_8")) 2228 VADDSS(X4, X9, X4) 2229 } 2230 2231 Label("LBB13_8") 2232 { 2233 ADDL(EDX, ECX) 2234 ANDL(Imm(7), ECX) 2235 LEAL(Mem{Base: RCX}.Offset(-4), EDX) 2236 CMPL(ECX, Imm(4)) 2237 CMOVLLT(ECX, EDX) 2238 SETCC(R8B) 2239 CMPL(EDX, Imm(2)) 2240 SETCC(CL) 2241 VFMADD231SS(X10, X4, X3) 2242 VMULSS(X3, X3, X4) 2243 VMOVAPS(X8, X12) 2244 VFMADD213SS(X11, X4, X12) 2245 VFMADD213SS(X13, X4, X12) 2246 VFMADD213SS(X14, X4, X12) 2247 VMOVAPS(X7, X5) 2248 VFMADD213SS(X15, X4, X5) 2249 VFMADD213SS(X6, X4, X5) 2250 ADDL(I32(-1), EDX) 2251 CMPL(EDX, Imm(2)) 2252 JB(LabelRef("LBB13_9")) 2253 VFMADD213SS(X9, X12, X4) 2254 VMOVAPS(X4, X5) 2255 VMOVSS(X5, Mem{Base: RDI}.Idx(RAX, 4)) 2256 CMPB(R8B, CL) 2257 JE(LabelRef("LBB13_13")) 2258 JMP(LabelRef("LBB13_12")) 2259 } 2260 2261 Label("LBB13_9") 2262 { 2263 VMULSS(X3, X4, X4) 2264 VFMADD213SS(X3, X4, X5) 2265 VMOVSS(X5, Mem{Base: RDI}.Idx(RAX, 4)) 2266 CMPB(R8B, CL) 2267 JE(LabelRef("LBB13_13")) 2268 } 2269 2270 Label("LBB13_12") 2271 { 2272 VXORPS(X0, X5, X3) 2273 VMOVSS(X3, Mem{Base: RDI}.Idx(RAX, 4)) 2274 JMP(LabelRef("LBB13_13")) 2275 } 2276 2277 Label("LBB13_14") 2278 { 2279 ADDQ(Imm(72), RSP) 2280 VZEROUPPER() 2281 RET() 2282 } 2283 } 2284 2285 func genSinCos_F32() { 2286 2287 data := GLOBL("dataSinCosF32", RODATA|NOPTR) 2288 DATA(0, U32(2147483647)) 2289 DATA(4, U32(0x3fa2f983)) 2290 DATA(8, U32(4294967294)) 2291 DATA(12, U32(2)) 2292 DATA(16, U32(0xbf490fdb)) 2293 DATA(20, U32(3221225472)) 2294 DATA(24, U32(2147483648)) 2295 DATA(28, U32(0x37ccf5ce)) 2296 DATA(32, U32(0xbab6061a)) 2297 DATA(36, U32(0x3d2aaaa5)) 2298 DATA(40, U32(0xbf000000)) 2299 DATA(44, U32(0x3f800000)) 2300 DATA(48, U32(0xb94ca1f9)) 2301 DATA(52, U32(0x3c08839e)) 2302 DATA(56, U32(0xbe2aaaa3)) 2303 DATA(60, U32(0x4b7fffff)) 2304 DATA(64, U64(0xffffffffffffffff)) 2305 DATA(72, U64(0xffffffffffffffff)) 2306 DATA(80, U64(0xffffffffffffffff)) 2307 DATA(88, U64(0xffffffffffffffff)) 2308 DATA(96, U64(0x0)) 2309 DATA(104, U64(0x0)) 2310 DATA(112, U64(0x0)) 2311 DATA(120, U64(0x0)) 2312 2313 TEXT("SinCos_AVX2_F32", 0, "func(x, y, z []float32)") 2314 Pragma("noescape") 2315 Load(Param("x").Base(), RDI) 2316 Load(Param("y").Base(), RSI) 2317 Load(Param("z").Base(), RDX) 2318 Load(Param("x").Len(), RCX) 2319 2320 PUSHQ(RBX) 2321 SUBQ(Imm(96), RSP) 2322 MOVQ(RCX, R8) 2323 ANDQ(I32(-8), R8) 2324 JE(LabelRef("LBB14_3")) 2325 XORL(EAX, EAX) 2326 VBROADCASTSS(data.Offset(0), Y0) 2327 VMOVUPS(Y0, Mem{Base: RSP}.Offset(64)) 2328 VBROADCASTSS(data.Offset(4), Y0) 2329 VMOVUPS(Y0, Mem{Base: RSP}.Offset(32)) 2330 VBROADCASTSS(data.Offset(8), Y0) 2331 VMOVUPS(Y0, Mem{Base: RSP}) 2332 VPBROADCASTD(data.Offset(12), Y4) 2333 VBROADCASTSS(data.Offset(16), Y0) 2334 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32)) 2335 VBROADCASTSS(data.Offset(20), Y0) 2336 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64)) 2337 VPBROADCASTD(data.Offset(24), Y8) 2338 VBROADCASTSS(data.Offset(28), Y0) 2339 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96)) 2340 VBROADCASTSS(data.Offset(32), Y0) 2341 VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128)) 2342 VBROADCASTSS(data.Offset(36), Y11) 2343 VBROADCASTSS(data.Offset(40), Y10) 2344 VBROADCASTSS(data.Offset(44), Y13) 2345 VBROADCASTSS(data.Offset(48), Y14) 2346 VBROADCASTSS(data.Offset(52), Y15) 2347 VBROADCASTSS(data.Offset(56), Y2) 2348 2349 Label("LBB14_2") 2350 { 2351 VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4), Y5) 2352 VANDPS(Mem{Base: RSP}.Offset(64), Y5, Y1) 2353 VMULPS(Mem{Base: RSP}.Offset(32), Y1, Y0) 2354 VCVTTPS2DQ(Y0, Y0) 2355 VPSUBD(data.Offset(64), Y0, Y3) 2356 VPAND(Mem{Base: RSP}, Y3, Y0) 2357 VCVTDQ2PS(Y0, Y6) 2358 VFMADD132PS(Mem{Base: RSP}.Offset(-32), Y1, Y6) 2359 VMULPS(Y6, Y6, Y1) 2360 VMOVUPS(Mem{Base: RSP}.Offset(-96), Y9) 2361 VFMADD213PS(Mem{Base: RSP}.Offset(-128), Y1, Y9) 2362 VFMADD213PS(Y11, Y1, Y9) 2363 VMULPS(Y1, Y1, Y7) 2364 VMOVAPS(Y10, Y12) 2365 VFMADD213PS(Y13, Y1, Y12) 2366 VFMADD231PS(Y7, Y9, Y12) 2367 VMOVAPS(Y14, Y7) 2368 VFMADD213PS(Y15, Y1, Y7) 2369 VFMADD213PS(Y2, Y1, Y7) 2370 VMULPS(Y6, Y1, Y1) 2371 VFMADD213PS(Y6, Y7, Y1) 2372 VPSLLD(Imm(29), Y3, Y6) 2373 VPAND(Y4, Y3, Y3) 2374 VPXOR(Y5, Y6, Y5) 2375 VPCMPEQD(Y4, Y3, Y6) 2376 VPCMPEQD(data.Offset(96), Y3, Y3) 2377 VANDPS(Y3, Y1, Y3) 2378 VANDPS(Y6, Y12, Y6) 2379 VADDPS(Y3, Y6, Y3) 2380 VADDPS(Y1, Y12, Y1) 2381 VPAND(Y5, Y8, Y5) 2382 VSUBPS(Y3, Y1, Y1) 2383 VPXOR(Y3, Y5, Y3) 2384 VPSLLD(Imm(29), Y0, Y0) 2385 VPADDD(Mem{Base: RSP}.Offset(-64), Y0, Y0) 2386 VPAND(Y0, Y8, Y0) 2387 VPXOR(Y0, Y8, Y0) 2388 VXORPS(Y0, Y1, Y0) 2389 VMOVDQU(Y3, Mem{Base: RDI}.Idx(RAX, 4)) 2390 VMOVUPS(Y0, Mem{Base: RSI}.Idx(RAX, 4)) 2391 ADDQ(Imm(8), RAX) 2392 CMPQ(RAX, R8) 2393 JB(LabelRef("LBB14_2")) 2394 } 2395 2396 Label("LBB14_3") 2397 { 2398 CMPQ(R8, RCX) 2399 JAE(LabelRef("LBB14_16")) 2400 VBROADCASTSS(data.Offset(24), X0) 2401 VXORPS(X1, X1, X1) 2402 VMOVSS(data.Offset(60), X2) 2403 VMOVSS(data.Offset(44), X6) 2404 VMOVSS(data.Offset(28), X8) 2405 VMOVSS(data.Offset(36), X12) 2406 VMOVSS(data.Offset(40), X13) 2407 VMOVSS(data.Offset(48), X15) 2408 VMOVSS(data.Offset(52), X14) 2409 VMOVSS(data.Offset(56), X10) 2410 JMP(LabelRef("LBB14_5")) 2411 } 2412 2413 Label("LBB14_15") 2414 { 2415 ADDQ(Imm(1), R8) 2416 CMPQ(R8, RCX) 2417 JAE(LabelRef("LBB14_16")) 2418 } 2419 2420 Label("LBB14_5") 2421 { 2422 VMOVSS(Mem{Base: RDX}.Idx(R8, 4), X4) 2423 VXORPS(X0, X4, X5) 2424 VCMPSS(Imm(1), X1, X4, X7) 2425 VBLENDVPS(X7, X5, X4, X5) 2426 VUCOMISS(X2, X5) 2427 JA(LabelRef("LBB14_15")) 2428 VUCOMISS(X1, X4) 2429 SETCS(R9B) 2430 VMULSS(data.Offset(4), X5, X4) 2431 VCVTTSS2SI(X4, R10L) 2432 VROUNDSS(Imm(11), X4, X4, X4) 2433 MOVL(R10L, EAX) 2434 ANDL(Imm(1), EAX) 2435 JE(LabelRef("LBB14_8")) 2436 VADDSS(X6, X4, X4) 2437 } 2438 2439 Label("LBB14_8") 2440 { 2441 ADDL(R10L, EAX) 2442 ANDL(Imm(7), EAX) 2443 LEAL(Mem{Base: RAX}.Offset(-4), R10L) 2444 CMPL(EAX, Imm(4)) 2445 SETCC(R11B) 2446 CMOVLLT(EAX, R10L) 2447 VFMADD231SS(data.Offset(16), X4, X5) 2448 VMULSS(X5, X5, X7) 2449 VMOVAPS(X8, X11) 2450 VFMADD213SS(data.Offset(32), X7, X11) 2451 VFMADD213SS(X12, X7, X11) 2452 VMULSS(X7, X7, X9) 2453 VMOVAPS(X6, X4) 2454 VFMADD231SS(X13, X7, X4) 2455 VFMADD231SS(X9, X11, X4) 2456 VMOVAPS(X15, X3) 2457 VFMADD213SS(X14, X7, X3) 2458 VFMADD213SS(X10, X7, X3) 2459 VMULSS(X5, X7, X7) 2460 VFMADD213SS(X5, X3, X7) 2461 LEAL(Mem{Base: R10}.Offset(-1), EBX) 2462 CMPL(EBX, Imm(2)) 2463 JB(LabelRef("LBB14_9")) 2464 VMOVAPS(X7, X5) 2465 VMOVSS(X5, Mem{Base: RDI}.Idx(R8, 4)) 2466 VMOVSS(X4, Mem{Base: RSI}.Idx(R8, 4)) 2467 CMPB(R9B, R11B) 2468 JNE(LabelRef("LBB14_12")) 2469 JMP(LabelRef("LBB14_13")) 2470 } 2471 2472 Label("LBB14_9") 2473 { 2474 VMOVAPS(X4, X5) 2475 VMOVAPS(X7, X4) 2476 VMOVSS(X5, Mem{Base: RDI}.Idx(R8, 4)) 2477 VMOVSS(X4, Mem{Base: RSI}.Idx(R8, 4)) 2478 CMPB(R9B, R11B) 2479 JE(LabelRef("LBB14_13")) 2480 } 2481 2482 Label("LBB14_12") 2483 { 2484 VMOVSS(Mem{Base: RDI}.Idx(R8, 4), X3) 2485 VXORPS(X0, X3, X3) 2486 VMOVSS(X3, Mem{Base: RDI}.Idx(R8, 4)) 2487 } 2488 2489 Label("LBB14_13") 2490 { 2491 CMPL(R10L, Imm(2)) 2492 SETCC(BL) 2493 CMPL(EAX, Imm(4)) 2494 SETCC(AL) 2495 CMPB(AL, BL) 2496 JE(LabelRef("LBB14_15")) 2497 VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X3) 2498 VXORPS(X0, X3, X3) 2499 VMOVSS(X3, Mem{Base: RSI}.Idx(R8, 4)) 2500 JMP(LabelRef("LBB14_15")) 2501 } 2502 2503 Label("LBB14_16") 2504 { 2505 ADDQ(Imm(96), RSP) 2506 POPQ(RBX) 2507 VZEROUPPER() 2508 RET() 2509 } 2510 }