github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/engine/compiler/impl_vec_amd64.go (about) 1 package compiler 2 3 import ( 4 "errors" 5 6 "github.com/bananabytelabs/wazero/internal/asm" 7 "github.com/bananabytelabs/wazero/internal/asm/amd64" 8 "github.com/bananabytelabs/wazero/internal/wazeroir" 9 ) 10 11 // compileV128Const implements compiler.compileV128Const for amd64 architecture. 12 func (c *amd64Compiler) compileV128Const(o *wazeroir.UnionOperation) error { 13 if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { 14 return err 15 } 16 17 lo, hi := o.U1, o.U2 18 19 result, err := c.allocateRegister(registerTypeVector) 20 if err != nil { 21 return err 22 } 23 24 // We cannot directly load the value from memory to float regs, 25 // so we move it to int reg temporarily. 26 tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose) 27 if err != nil { 28 return err 29 } 30 31 // Move the lower 64-bits. 32 if lo == 0 { 33 c.assembler.CompileRegisterToRegister(amd64.XORQ, tmpReg, tmpReg) 34 } else { 35 c.assembler.CompileConstToRegister(amd64.MOVQ, int64(lo), tmpReg) 36 } 37 c.assembler.CompileRegisterToRegister(amd64.MOVQ, tmpReg, result) 38 39 if lo != 0 && hi == 0 { 40 c.assembler.CompileRegisterToRegister(amd64.XORQ, tmpReg, tmpReg) 41 } else if hi != 0 { 42 c.assembler.CompileConstToRegister(amd64.MOVQ, int64(hi), tmpReg) 43 } 44 // Move the higher 64-bits with PINSRQ at the second element of 64x2 vector. 45 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmpReg, result, 1) 46 47 c.pushVectorRuntimeValueLocationOnRegister(result) 48 return nil 49 } 50 51 // compileV128Add implements compiler.compileV128Add for amd64 architecture. 52 func (c *amd64Compiler) compileV128Add(o *wazeroir.UnionOperation) error { 53 x2 := c.locationStack.popV128() 54 if err := c.compileEnsureOnRegister(x2); err != nil { 55 return err 56 } 57 58 x1 := c.locationStack.popV128() 59 if err := c.compileEnsureOnRegister(x1); err != nil { 60 return err 61 } 62 var inst asm.Instruction 63 shape := o.B1 64 switch shape { 65 case wazeroir.ShapeI8x16: 66 inst = amd64.PADDB 67 case wazeroir.ShapeI16x8: 68 inst = amd64.PADDW 69 case wazeroir.ShapeI32x4: 70 inst = amd64.PADDD 71 case wazeroir.ShapeI64x2: 72 inst = amd64.PADDQ 73 case wazeroir.ShapeF32x4: 74 inst = amd64.ADDPS 75 case wazeroir.ShapeF64x2: 76 inst = amd64.ADDPD 77 } 78 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 79 80 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 81 c.locationStack.markRegisterUnused(x2.register) 82 return nil 83 } 84 85 // compileV128Sub implements compiler.compileV128Sub for amd64 architecture. 86 func (c *amd64Compiler) compileV128Sub(o *wazeroir.UnionOperation) error { 87 x2 := c.locationStack.popV128() 88 if err := c.compileEnsureOnRegister(x2); err != nil { 89 return err 90 } 91 92 x1 := c.locationStack.popV128() 93 if err := c.compileEnsureOnRegister(x1); err != nil { 94 return err 95 } 96 var inst asm.Instruction 97 shape := o.B1 98 switch shape { 99 case wazeroir.ShapeI8x16: 100 inst = amd64.PSUBB 101 case wazeroir.ShapeI16x8: 102 inst = amd64.PSUBW 103 case wazeroir.ShapeI32x4: 104 inst = amd64.PSUBD 105 case wazeroir.ShapeI64x2: 106 inst = amd64.PSUBQ 107 case wazeroir.ShapeF32x4: 108 inst = amd64.SUBPS 109 case wazeroir.ShapeF64x2: 110 inst = amd64.SUBPD 111 } 112 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 113 114 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 115 c.locationStack.markRegisterUnused(x2.register) 116 return nil 117 } 118 119 // compileV128Load implements compiler.compileV128Load for amd64 architecture. 120 func (c *amd64Compiler) compileV128Load(o *wazeroir.UnionOperation) error { 121 result, err := c.allocateRegister(registerTypeVector) 122 if err != nil { 123 return err 124 } 125 126 offset := uint32(o.U2) 127 loadType := wazeroir.V128LoadType(o.B1) 128 129 switch loadType { 130 case wazeroir.V128LoadType128: 131 err = c.compileV128LoadImpl(amd64.MOVDQU, offset, 16, result) 132 case wazeroir.V128LoadType8x8s: 133 err = c.compileV128LoadImpl(amd64.PMOVSXBW, offset, 8, result) 134 case wazeroir.V128LoadType8x8u: 135 err = c.compileV128LoadImpl(amd64.PMOVZXBW, offset, 8, result) 136 case wazeroir.V128LoadType16x4s: 137 err = c.compileV128LoadImpl(amd64.PMOVSXWD, offset, 8, result) 138 case wazeroir.V128LoadType16x4u: 139 err = c.compileV128LoadImpl(amd64.PMOVZXWD, offset, 8, result) 140 case wazeroir.V128LoadType32x2s: 141 err = c.compileV128LoadImpl(amd64.PMOVSXDQ, offset, 8, result) 142 case wazeroir.V128LoadType32x2u: 143 err = c.compileV128LoadImpl(amd64.PMOVZXDQ, offset, 8, result) 144 case wazeroir.V128LoadType8Splat: 145 reg, err := c.compileMemoryAccessCeilSetup(offset, 1) 146 if err != nil { 147 return err 148 } 149 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVBQZX, amd64ReservedRegisterForMemory, -1, 150 reg, 1, reg) 151 // pinsrb $0, reg, result 152 // pxor tmpVReg, tmpVReg 153 // pshufb tmpVReg, result 154 c.locationStack.markRegisterUsed(result) 155 tmpVReg, err := c.allocateRegister(registerTypeVector) 156 if err != nil { 157 return err 158 } 159 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, reg, result, 0) 160 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmpVReg, tmpVReg) 161 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmpVReg, result) 162 case wazeroir.V128LoadType16Splat: 163 reg, err := c.compileMemoryAccessCeilSetup(offset, 2) 164 if err != nil { 165 return err 166 } 167 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVWQZX, amd64ReservedRegisterForMemory, -2, 168 reg, 1, reg) 169 // pinsrw $0, reg, result 170 // pinsrw $1, reg, result 171 // pshufd $0, result, result (result = result[0,0,0,0]) 172 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 0) 173 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 1) 174 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) 175 case wazeroir.V128LoadType32Splat: 176 reg, err := c.compileMemoryAccessCeilSetup(offset, 4) 177 if err != nil { 178 return err 179 } 180 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVLQZX, amd64ReservedRegisterForMemory, -4, 181 reg, 1, reg) 182 // pinsrd $0, reg, result 183 // pshufd $0, result, result (result = result[0,0,0,0]) 184 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, reg, result, 0) 185 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) 186 case wazeroir.V128LoadType64Splat: 187 reg, err := c.compileMemoryAccessCeilSetup(offset, 8) 188 if err != nil { 189 return err 190 } 191 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVQ, amd64ReservedRegisterForMemory, -8, 192 reg, 1, reg) 193 // pinsrq $0, reg, result 194 // pinsrq $1, reg, result 195 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 0) 196 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 1) 197 case wazeroir.V128LoadType32zero: 198 err = c.compileV128LoadImpl(amd64.MOVL, offset, 4, result) 199 case wazeroir.V128LoadType64zero: 200 err = c.compileV128LoadImpl(amd64.MOVQ, offset, 8, result) 201 } 202 203 if err != nil { 204 return err 205 } 206 207 c.pushVectorRuntimeValueLocationOnRegister(result) 208 return nil 209 } 210 211 func (c *amd64Compiler) compileV128LoadImpl(inst asm.Instruction, offset uint32, targetSizeInBytes int64, dst asm.Register) error { 212 offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes) 213 if err != nil { 214 return err 215 } 216 c.assembler.CompileMemoryWithIndexToRegister(inst, amd64ReservedRegisterForMemory, -targetSizeInBytes, 217 offsetReg, 1, dst) 218 return nil 219 } 220 221 // compileV128LoadLane implements compiler.compileV128LoadLane for amd64. 222 func (c *amd64Compiler) compileV128LoadLane(o *wazeroir.UnionOperation) error { 223 targetVector := c.locationStack.popV128() 224 if err := c.compileEnsureOnRegister(targetVector); err != nil { 225 return err 226 } 227 228 laneSize, laneIndex := o.B1, o.B2 229 offset := uint32(o.U2) 230 231 var insertInst asm.Instruction 232 switch laneSize { 233 case 8: 234 insertInst = amd64.PINSRB 235 case 16: 236 insertInst = amd64.PINSRW 237 case 32: 238 insertInst = amd64.PINSRD 239 case 64: 240 insertInst = amd64.PINSRQ 241 } 242 243 targetSizeInBytes := int64(laneSize / 8) 244 offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes) 245 if err != nil { 246 return err 247 } 248 c.assembler.CompileMemoryWithIndexAndArgToRegister(insertInst, amd64ReservedRegisterForMemory, -targetSizeInBytes, 249 offsetReg, 1, targetVector.register, laneIndex) 250 251 c.pushVectorRuntimeValueLocationOnRegister(targetVector.register) 252 return nil 253 } 254 255 // compileV128Store implements compiler.compileV128Store for amd64. 256 func (c *amd64Compiler) compileV128Store(o *wazeroir.UnionOperation) error { 257 val := c.locationStack.popV128() 258 if err := c.compileEnsureOnRegister(val); err != nil { 259 return err 260 } 261 262 const targetSizeInBytes = 16 263 offset := uint32(o.U2) 264 offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes) 265 if err != nil { 266 return err 267 } 268 269 c.assembler.CompileRegisterToMemoryWithIndex(amd64.MOVDQU, val.register, 270 amd64ReservedRegisterForMemory, -targetSizeInBytes, offsetReg, 1) 271 272 c.locationStack.markRegisterUnused(val.register, offsetReg) 273 return nil 274 } 275 276 // compileV128StoreLane implements compiler.compileV128StoreLane for amd64. 277 func (c *amd64Compiler) compileV128StoreLane(o *wazeroir.UnionOperation) error { 278 var storeInst asm.Instruction 279 laneSize := o.B1 280 laneIndex := o.B2 281 offset := uint32(o.U2) 282 switch laneSize { 283 case 8: 284 storeInst = amd64.PEXTRB 285 case 16: 286 storeInst = amd64.PEXTRW 287 case 32: 288 storeInst = amd64.PEXTRD 289 case 64: 290 storeInst = amd64.PEXTRQ 291 } 292 293 val := c.locationStack.popV128() 294 if err := c.compileEnsureOnRegister(val); err != nil { 295 return err 296 } 297 298 targetSizeInBytes := int64(laneSize / 8) 299 offsetReg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes) 300 if err != nil { 301 return err 302 } 303 304 c.assembler.CompileRegisterToMemoryWithIndexAndArg(storeInst, val.register, 305 amd64ReservedRegisterForMemory, -targetSizeInBytes, offsetReg, 1, laneIndex) 306 307 c.locationStack.markRegisterUnused(val.register, offsetReg) 308 return nil 309 } 310 311 // compileV128ExtractLane implements compiler.compileV128ExtractLane for amd64. 312 func (c *amd64Compiler) compileV128ExtractLane(o *wazeroir.UnionOperation) error { 313 v := c.locationStack.popV128() 314 if err := c.compileEnsureOnRegister(v); err != nil { 315 return err 316 } 317 vreg := v.register 318 shape := o.B1 319 laneIndex := o.B2 320 signed := o.B3 321 switch shape { 322 case wazeroir.ShapeI8x16: 323 result, err := c.allocateRegister(registerTypeGeneralPurpose) 324 if err != nil { 325 return err 326 } 327 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRB, vreg, result, laneIndex) 328 if signed { 329 c.assembler.CompileRegisterToRegister(amd64.MOVBLSX, result, result) 330 } else { 331 c.assembler.CompileRegisterToRegister(amd64.MOVBLZX, result, result) 332 } 333 c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32) 334 c.locationStack.markRegisterUnused(vreg) 335 case wazeroir.ShapeI16x8: 336 result, err := c.allocateRegister(registerTypeGeneralPurpose) 337 if err != nil { 338 return err 339 } 340 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRW, vreg, result, laneIndex) 341 if signed { 342 c.assembler.CompileRegisterToRegister(amd64.MOVWLSX, result, result) 343 } else { 344 c.assembler.CompileRegisterToRegister(amd64.MOVWLZX, result, result) 345 } 346 c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32) 347 c.locationStack.markRegisterUnused(vreg) 348 case wazeroir.ShapeI32x4: 349 result, err := c.allocateRegister(registerTypeGeneralPurpose) 350 if err != nil { 351 return err 352 } 353 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRD, vreg, result, laneIndex) 354 c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32) 355 c.locationStack.markRegisterUnused(vreg) 356 case wazeroir.ShapeI64x2: 357 result, err := c.allocateRegister(registerTypeGeneralPurpose) 358 if err != nil { 359 return err 360 } 361 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, vreg, result, laneIndex) 362 c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64) 363 c.locationStack.markRegisterUnused(vreg) 364 case wazeroir.ShapeF32x4: 365 if laneIndex != 0 { 366 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, vreg, vreg, laneIndex) 367 } 368 c.pushRuntimeValueLocationOnRegister(vreg, runtimeValueTypeF32) 369 case wazeroir.ShapeF64x2: 370 if laneIndex != 0 { 371 // This case we can assume LaneIndex == 1. 372 // We have to modify the val.register as, for example: 373 // 0b11 0b10 0b01 0b00 374 // | | | | 375 // [x3, x2, x1, x0] -> [x0, x0, x3, x2] 376 // where val.register = [x3, x2, x1, x0] and each xN = 32bits. 377 // Then, we interpret the register as float64, therefore, the float64 value is obtained as [x3, x2]. 378 arg := byte(0b00_00_11_10) 379 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, vreg, vreg, arg) 380 } 381 c.pushRuntimeValueLocationOnRegister(vreg, runtimeValueTypeF64) 382 } 383 384 return nil 385 } 386 387 // compileV128ReplaceLane implements compiler.compileV128ReplaceLane for amd64. 388 func (c *amd64Compiler) compileV128ReplaceLane(o *wazeroir.UnionOperation) error { 389 origin := c.locationStack.pop() 390 if err := c.compileEnsureOnRegister(origin); err != nil { 391 return err 392 } 393 394 vector := c.locationStack.popV128() 395 if err := c.compileEnsureOnRegister(vector); err != nil { 396 return err 397 } 398 399 shape := o.B1 400 laneIndex := o.B2 401 switch shape { 402 case wazeroir.ShapeI8x16: 403 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, origin.register, vector.register, laneIndex) 404 case wazeroir.ShapeI16x8: 405 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, vector.register, laneIndex) 406 case wazeroir.ShapeI32x4: 407 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, origin.register, vector.register, laneIndex) 408 case wazeroir.ShapeI64x2: 409 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, vector.register, laneIndex) 410 case wazeroir.ShapeF32x4: 411 c.assembler.CompileRegisterToRegisterWithArg(amd64.INSERTPS, origin.register, vector.register, 412 // In INSERTPS instruction, the destination index is encoded at 4 and 5 bits of the argument. 413 // See https://www.felixcloutier.com/x86/insertps 414 laneIndex<<4, 415 ) 416 case wazeroir.ShapeF64x2: 417 if laneIndex == 0 { 418 c.assembler.CompileRegisterToRegister(amd64.MOVSD, origin.register, vector.register) 419 } else { 420 c.assembler.CompileRegisterToRegister(amd64.MOVLHPS, origin.register, vector.register) 421 } 422 } 423 424 c.pushVectorRuntimeValueLocationOnRegister(vector.register) 425 c.locationStack.markRegisterUnused(origin.register) 426 return nil 427 } 428 429 // compileV128Splat implements compiler.compileV128Splat for amd64. 430 func (c *amd64Compiler) compileV128Splat(o *wazeroir.UnionOperation) (err error) { 431 origin := c.locationStack.pop() 432 if err = c.compileEnsureOnRegister(origin); err != nil { 433 return 434 } 435 436 var result asm.Register 437 shape := o.B1 438 switch shape { 439 case wazeroir.ShapeI8x16: 440 result, err = c.allocateRegister(registerTypeVector) 441 if err != nil { 442 return err 443 } 444 c.locationStack.markRegisterUsed(result) 445 446 tmp, err := c.allocateRegister(registerTypeVector) 447 if err != nil { 448 return err 449 } 450 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, origin.register, result, 0) 451 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp) 452 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, result) 453 case wazeroir.ShapeI16x8: 454 result, err = c.allocateRegister(registerTypeVector) 455 if err != nil { 456 return err 457 } 458 c.locationStack.markRegisterUsed(result) 459 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, result, 0) 460 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, origin.register, result, 1) 461 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) 462 case wazeroir.ShapeI32x4: 463 result, err = c.allocateRegister(registerTypeVector) 464 if err != nil { 465 return err 466 } 467 c.locationStack.markRegisterUsed(result) 468 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, origin.register, result, 0) 469 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) 470 case wazeroir.ShapeI64x2: 471 result, err = c.allocateRegister(registerTypeVector) 472 if err != nil { 473 return err 474 } 475 c.locationStack.markRegisterUsed(result) 476 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, result, 0) 477 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, origin.register, result, 1) 478 case wazeroir.ShapeF32x4: 479 result = origin.register 480 c.assembler.CompileRegisterToRegisterWithArg(amd64.INSERTPS, origin.register, result, 0) 481 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) 482 case wazeroir.ShapeF64x2: 483 result = origin.register 484 c.assembler.CompileRegisterToRegister(amd64.MOVQ, origin.register, result) 485 c.assembler.CompileRegisterToRegister(amd64.MOVLHPS, origin.register, result) 486 } 487 488 c.locationStack.markRegisterUnused(origin.register) 489 c.pushVectorRuntimeValueLocationOnRegister(result) 490 return nil 491 } 492 493 // compileV128Shuffle implements compiler.compileV128Shuffle for amd64. 494 func (c *amd64Compiler) compileV128Shuffle(o *wazeroir.UnionOperation) error { 495 w := c.locationStack.popV128() 496 if err := c.compileEnsureOnRegister(w); err != nil { 497 return err 498 } 499 500 v := c.locationStack.popV128() 501 if err := c.compileEnsureOnRegister(v); err != nil { 502 return err 503 } 504 505 wr, vr := w.register, v.register 506 507 tmp, err := c.allocateRegister(registerTypeVector) 508 if err != nil { 509 return err 510 } 511 512 consts := [32]byte{} 513 lanes := o.Us 514 for i, unsignedLane := range lanes { 515 lane := byte(unsignedLane) 516 if lane < 16 { 517 consts[i+16] = 0x80 518 consts[i] = lane 519 } else { 520 consts[i+16] = lane - 16 521 consts[i] = 0x80 522 } 523 } 524 525 err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(consts[:16]), tmp) 526 if err != nil { 527 return err 528 } 529 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, vr) 530 err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(consts[16:]), tmp) 531 if err != nil { 532 return err 533 } 534 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp, wr) 535 c.assembler.CompileRegisterToRegister(amd64.ORPS, vr, wr) 536 537 c.pushVectorRuntimeValueLocationOnRegister(wr) 538 c.locationStack.markRegisterUnused(vr) 539 return nil 540 } 541 542 var swizzleConst = [16]byte{ 543 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 544 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 545 } 546 547 // compileV128Swizzle implements compiler.compileV128Swizzle for amd64. 548 func (c *amd64Compiler) compileV128Swizzle(*wazeroir.UnionOperation) error { 549 index := c.locationStack.popV128() 550 if err := c.compileEnsureOnRegister(index); err != nil { 551 return err 552 } 553 554 base := c.locationStack.popV128() 555 if err := c.compileEnsureOnRegister(base); err != nil { 556 return err 557 } 558 559 idxReg, baseReg := index.register, base.register 560 561 tmp, err := c.allocateRegister(registerTypeVector) 562 if err != nil { 563 return err 564 } 565 566 err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(swizzleConst[:]), tmp) 567 if err != nil { 568 return err 569 } 570 571 c.assembler.CompileRegisterToRegister(amd64.PADDUSB, tmp, idxReg) 572 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, idxReg, baseReg) 573 574 c.pushVectorRuntimeValueLocationOnRegister(baseReg) 575 c.locationStack.markRegisterUnused(idxReg) 576 return nil 577 } 578 579 // compileV128AnyTrue implements compiler.compileV128AnyTrue for amd64. 580 func (c *amd64Compiler) compileV128AnyTrue(*wazeroir.UnionOperation) error { 581 v := c.locationStack.popV128() 582 if err := c.compileEnsureOnRegister(v); err != nil { 583 return err 584 } 585 vreg := v.register 586 587 c.assembler.CompileRegisterToRegister(amd64.PTEST, vreg, vreg) 588 589 c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateNE) 590 c.locationStack.markRegisterUnused(vreg) 591 return nil 592 } 593 594 // compileV128AllTrue implements compiler.compileV128AllTrue for amd64. 595 func (c *amd64Compiler) compileV128AllTrue(o *wazeroir.UnionOperation) error { 596 v := c.locationStack.popV128() 597 if err := c.compileEnsureOnRegister(v); err != nil { 598 return err 599 } 600 601 tmp, err := c.allocateRegister(registerTypeVector) 602 if err != nil { 603 return err 604 } 605 606 var cmpInst asm.Instruction 607 shape := o.B1 608 switch shape { 609 case wazeroir.ShapeI8x16: 610 cmpInst = amd64.PCMPEQB 611 case wazeroir.ShapeI16x8: 612 cmpInst = amd64.PCMPEQW 613 case wazeroir.ShapeI32x4: 614 cmpInst = amd64.PCMPEQD 615 case wazeroir.ShapeI64x2: 616 cmpInst = amd64.PCMPEQQ 617 } 618 619 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp) 620 c.assembler.CompileRegisterToRegister(cmpInst, v.register, tmp) 621 c.assembler.CompileRegisterToRegister(amd64.PTEST, tmp, tmp) 622 c.locationStack.markRegisterUnused(v.register, tmp) 623 c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateE) 624 return nil 625 } 626 627 // compileV128BitMask implements compiler.compileV128BitMask for amd64. 628 func (c *amd64Compiler) compileV128BitMask(o *wazeroir.UnionOperation) error { 629 v := c.locationStack.popV128() 630 if err := c.compileEnsureOnRegister(v); err != nil { 631 return err 632 } 633 634 result, err := c.allocateRegister(registerTypeGeneralPurpose) 635 if err != nil { 636 return err 637 } 638 639 shape := o.B1 640 switch shape { 641 case wazeroir.ShapeI8x16: 642 c.assembler.CompileRegisterToRegister(amd64.PMOVMSKB, v.register, result) 643 case wazeroir.ShapeI16x8: 644 // When we have: 645 // R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(v8)] 646 // R2 = [R2(w1), R2(w2), R2(w3), R2(v4), R2(w5), R2(w6), R2(w7), R2(v8)] 647 // where RX(wn) is n-th signed word (16-bit) of RX register, 648 // 649 // "PACKSSWB R1, R2" produces 650 // R1 = [ 651 // byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)), 652 // byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)), 653 // byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)), 654 // byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)), 655 // ] 656 // where R1 is the destination register, and 657 // byte_sat(w) = int8(w) if w fits as signed 8-bit, 658 // 0x80 if w is less than 0x80 659 // 0x7F if w is greater than 0x7f 660 // 661 // See https://www.felixcloutier.com/x86/packsswb:packssdw for detail. 662 // 663 // Therefore, v.register ends up having i-th and (i+8)-th bit set if i-th lane is negative (for i in 0..8). 664 c.assembler.CompileRegisterToRegister(amd64.PACKSSWB, v.register, v.register) 665 c.assembler.CompileRegisterToRegister(amd64.PMOVMSKB, v.register, result) 666 // Clear the higher bits than 8. 667 c.assembler.CompileConstToRegister(amd64.SHRQ, 8, result) 668 case wazeroir.ShapeI32x4: 669 c.assembler.CompileRegisterToRegister(amd64.MOVMSKPS, v.register, result) 670 case wazeroir.ShapeI64x2: 671 c.assembler.CompileRegisterToRegister(amd64.MOVMSKPD, v.register, result) 672 } 673 674 c.locationStack.markRegisterUnused(v.register) 675 c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32) 676 return nil 677 } 678 679 // compileV128And implements compiler.compileV128And for amd64. 680 func (c *amd64Compiler) compileV128And(*wazeroir.UnionOperation) error { 681 x2 := c.locationStack.popV128() 682 if err := c.compileEnsureOnRegister(x2); err != nil { 683 return err 684 } 685 686 x1 := c.locationStack.popV128() 687 if err := c.compileEnsureOnRegister(x1); err != nil { 688 return err 689 } 690 691 c.assembler.CompileRegisterToRegister(amd64.PAND, x2.register, x1.register) 692 693 c.locationStack.markRegisterUnused(x2.register) 694 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 695 return nil 696 } 697 698 // compileV128Not implements compiler.compileV128Not for amd64. 699 func (c *amd64Compiler) compileV128Not(*wazeroir.UnionOperation) error { 700 v := c.locationStack.popV128() 701 if err := c.compileEnsureOnRegister(v); err != nil { 702 return err 703 } 704 705 tmp, err := c.allocateRegister(registerTypeVector) 706 if err != nil { 707 return err 708 } 709 710 // Set all bits on tmp register. 711 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp) 712 // Then XOR with tmp to reverse all bits on v.register. 713 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, v.register) 714 c.pushVectorRuntimeValueLocationOnRegister(v.register) 715 return nil 716 } 717 718 // compileV128Or implements compiler.compileV128Or for amd64. 719 func (c *amd64Compiler) compileV128Or(*wazeroir.UnionOperation) error { 720 x2 := c.locationStack.popV128() 721 if err := c.compileEnsureOnRegister(x2); err != nil { 722 return err 723 } 724 725 x1 := c.locationStack.popV128() 726 if err := c.compileEnsureOnRegister(x1); err != nil { 727 return err 728 } 729 730 c.assembler.CompileRegisterToRegister(amd64.POR, x2.register, x1.register) 731 732 c.locationStack.markRegisterUnused(x2.register) 733 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 734 return nil 735 } 736 737 // compileV128Xor implements compiler.compileV128Xor for amd64. 738 func (c *amd64Compiler) compileV128Xor(*wazeroir.UnionOperation) error { 739 x2 := c.locationStack.popV128() 740 if err := c.compileEnsureOnRegister(x2); err != nil { 741 return err 742 } 743 744 x1 := c.locationStack.popV128() 745 if err := c.compileEnsureOnRegister(x1); err != nil { 746 return err 747 } 748 749 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2.register, x1.register) 750 751 c.locationStack.markRegisterUnused(x2.register) 752 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 753 return nil 754 } 755 756 // compileV128Bitselect implements compiler.compileV128Bitselect for amd64. 757 func (c *amd64Compiler) compileV128Bitselect(*wazeroir.UnionOperation) error { 758 selector := c.locationStack.popV128() 759 if err := c.compileEnsureOnRegister(selector); err != nil { 760 return err 761 } 762 763 x2 := c.locationStack.popV128() 764 if err := c.compileEnsureOnRegister(x2); err != nil { 765 return err 766 } 767 768 x1 := c.locationStack.popV128() 769 if err := c.compileEnsureOnRegister(x1); err != nil { 770 return err 771 } 772 773 // The following logic is equivalent to v128.or(v128.and(v1, selector), v128.and(v2, v128.not(selector))) 774 // See https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#bitwise-select 775 c.assembler.CompileRegisterToRegister(amd64.PAND, selector.register, x1.register) 776 c.assembler.CompileRegisterToRegister(amd64.PANDN, x2.register, selector.register) 777 c.assembler.CompileRegisterToRegister(amd64.POR, selector.register, x1.register) 778 779 c.locationStack.markRegisterUnused(x2.register, selector.register) 780 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 781 return nil 782 } 783 784 // compileV128AndNot implements compiler.compileV128AndNot for amd64. 785 func (c *amd64Compiler) compileV128AndNot(*wazeroir.UnionOperation) error { 786 x2 := c.locationStack.popV128() 787 if err := c.compileEnsureOnRegister(x2); err != nil { 788 return err 789 } 790 791 x1 := c.locationStack.popV128() 792 if err := c.compileEnsureOnRegister(x1); err != nil { 793 return err 794 } 795 796 c.assembler.CompileRegisterToRegister(amd64.PANDN, x1.register, x2.register) 797 798 c.locationStack.markRegisterUnused(x1.register) 799 c.pushVectorRuntimeValueLocationOnRegister(x2.register) 800 return nil 801 } 802 803 // compileV128Shr implements compiler.compileV128Shr for amd64. 804 func (c *amd64Compiler) compileV128Shr(o *wazeroir.UnionOperation) error { 805 // https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity 806 shape := o.B1 807 signed := o.B3 808 if shape == wazeroir.ShapeI8x16 { 809 return c.compileV128ShrI8x16Impl(signed) 810 } else if shape == wazeroir.ShapeI64x2 && signed { 811 return c.compileV128ShrI64x2SignedImpl() 812 } else { 813 return c.compileV128ShrImpl(o) 814 } 815 } 816 817 // compileV128ShrImpl implements shift right instructions except for i8x16 (logical/arithmetic) and i64x2 (arithmetic). 818 func (c *amd64Compiler) compileV128ShrImpl(o *wazeroir.UnionOperation) error { 819 s := c.locationStack.pop() 820 if err := c.compileEnsureOnRegister(s); err != nil { 821 return err 822 } 823 824 x1 := c.locationStack.popV128() 825 if err := c.compileEnsureOnRegister(x1); err != nil { 826 return err 827 } 828 829 vecTmp, err := c.allocateRegister(registerTypeVector) 830 if err != nil { 831 return err 832 } 833 834 var moduleConst int64 835 var shift asm.Instruction 836 shape := o.B1 837 signed := o.B3 838 switch shape { 839 case wazeroir.ShapeI16x8: 840 moduleConst = 0xf // modulo 16. 841 if signed { 842 shift = amd64.PSRAW 843 } else { 844 shift = amd64.PSRLW 845 } 846 case wazeroir.ShapeI32x4: 847 moduleConst = 0x1f // modulo 32. 848 if signed { 849 shift = amd64.PSRAD 850 } else { 851 shift = amd64.PSRLD 852 } 853 case wazeroir.ShapeI64x2: 854 moduleConst = 0x3f // modulo 64. 855 shift = amd64.PSRLQ 856 } 857 858 gpShiftAmount := s.register 859 c.assembler.CompileConstToRegister(amd64.ANDQ, moduleConst, gpShiftAmount) 860 c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp) 861 c.assembler.CompileRegisterToRegister(shift, vecTmp, x1.register) 862 863 c.locationStack.markRegisterUnused(gpShiftAmount) 864 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 865 return nil 866 } 867 868 // compileV128ShrI64x2SignedImpl implements compiler.compileV128Shr for i64x2 signed (arithmetic) shift. 869 // PSRAQ instruction requires AVX, so we emulate it without AVX instructions. https://www.felixcloutier.com/x86/psraw:psrad:psraq 870 func (c *amd64Compiler) compileV128ShrI64x2SignedImpl() error { 871 const shiftCountRegister = amd64.RegCX 872 873 s := c.locationStack.pop() 874 if s.register != shiftCountRegister { 875 // If another value lives on the CX register, we release it to the stack. 876 c.onValueReleaseRegisterToStack(shiftCountRegister) 877 if s.onStack() { 878 s.setRegister(shiftCountRegister) 879 c.compileLoadValueOnStackToRegister(s) 880 } else if s.onConditionalRegister() { 881 c.compileMoveConditionalToGeneralPurposeRegister(s, shiftCountRegister) 882 } else { // already on register. 883 old := s.register 884 c.assembler.CompileRegisterToRegister(amd64.MOVL, old, shiftCountRegister) 885 s.setRegister(shiftCountRegister) 886 c.locationStack.markRegisterUnused(old) 887 } 888 } 889 890 c.locationStack.markRegisterUsed(shiftCountRegister) 891 tmp, err := c.allocateRegister(registerTypeGeneralPurpose) 892 if err != nil { 893 return err 894 } 895 896 x1 := c.locationStack.popV128() 897 if err := c.compileEnsureOnRegister(x1); err != nil { 898 return err 899 } 900 901 // Extract each lane into tmp, execute SHR on tmp, and write it back to the lane. 902 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, x1.register, tmp, 0) 903 c.assembler.CompileRegisterToRegister(amd64.SARQ, shiftCountRegister, tmp) 904 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmp, x1.register, 0) 905 c.assembler.CompileRegisterToRegisterWithArg(amd64.PEXTRQ, x1.register, tmp, 1) 906 c.assembler.CompileRegisterToRegister(amd64.SARQ, shiftCountRegister, tmp) 907 c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, tmp, x1.register, 1) 908 909 c.locationStack.markRegisterUnused(shiftCountRegister) 910 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 911 return nil 912 } 913 914 // i8x16LogicalSHRMaskTable is necessary for emulating non-existent packed bytes logical right shifts on amd64. 915 // The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. 916 var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. 917 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift 918 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift 919 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift 920 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift 921 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift 922 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift 923 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift 924 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift 925 } 926 927 // compileV128ShrI64x2SignedImpl implements compiler.compileV128Shr for i8x16 signed logical/arithmetic shifts. 928 // amd64 doesn't have packed byte shifts, so we need this special casing. 929 // See https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity 930 func (c *amd64Compiler) compileV128ShrI8x16Impl(signed bool) error { 931 s := c.locationStack.pop() 932 if err := c.compileEnsureOnRegister(s); err != nil { 933 return err 934 } 935 936 v := c.locationStack.popV128() 937 if err := c.compileEnsureOnRegister(v); err != nil { 938 return err 939 } 940 941 vecTmp, err := c.allocateRegister(registerTypeVector) 942 if err != nil { 943 return err 944 } 945 946 gpShiftAmount := s.register 947 c.assembler.CompileConstToRegister(amd64.ANDQ, 0x7, gpShiftAmount) // mod 8. 948 949 if signed { 950 c.locationStack.markRegisterUsed(vecTmp) 951 vecTmp2, err := c.allocateRegister(registerTypeVector) 952 if err != nil { 953 return err 954 } 955 956 vreg := v.register 957 958 // Copy the value from v.register to vecTmp. 959 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vreg, vecTmp) 960 961 // Assuming that we have 962 // vreg = [b1, ..., b16] 963 // vecTmp = [b1, ..., b16] 964 // at this point, then we use PUNPCKLBW and PUNPCKHBW to produce: 965 // vreg = [b1, b1, b2, b2, ..., b8, b8] 966 // vecTmp = [b9, b9, b10, b10, ..., b16, b16] 967 c.assembler.CompileRegisterToRegister(amd64.PUNPCKLBW, vreg, vreg) 968 c.assembler.CompileRegisterToRegister(amd64.PUNPCKHBW, vecTmp, vecTmp) 969 970 // Adding 8 to the shift amount, and then move the amount to vecTmp2. 971 c.assembler.CompileConstToRegister(amd64.ADDQ, 0x8, gpShiftAmount) 972 c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp2) 973 974 // Perform the word packed arithmetic right shifts on vreg and vecTmp. 975 // This changes these two registers as: 976 // vreg = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s] 977 // vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s] 978 // where xxx is 1 or 0 depending on each byte's sign, and ">>" is the arithmetic shift on a byte. 979 c.assembler.CompileRegisterToRegister(amd64.PSRAW, vecTmp2, vreg) 980 c.assembler.CompileRegisterToRegister(amd64.PSRAW, vecTmp2, vecTmp) 981 982 // Finally, we can get the result by packing these two word vectors. 983 c.assembler.CompileRegisterToRegister(amd64.PACKSSWB, vecTmp, vreg) 984 985 c.locationStack.markRegisterUnused(gpShiftAmount, vecTmp) 986 c.pushVectorRuntimeValueLocationOnRegister(vreg) 987 } else { 988 c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp) 989 // amd64 doesn't have packed byte shifts, so we packed word shift here, and then mark-out 990 // the unnecessary bits below. 991 c.assembler.CompileRegisterToRegister(amd64.PSRLW, vecTmp, v.register) 992 993 gpTmp, err := c.allocateRegister(registerTypeGeneralPurpose) 994 if err != nil { 995 return err 996 } 997 998 // Read the initial address of the mask table into gpTmp register. 999 err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, asm.NewStaticConst(i8x16LogicalSHRMaskTable[:]), gpTmp) 1000 if err != nil { 1001 return err 1002 } 1003 1004 // We have to get the mask according to the shift amount, so we first have to do 1005 // gpShiftAmount << 4 = gpShiftAmount*16 to get the initial offset of the mask (16 is the size of each mask in bytes). 1006 c.assembler.CompileConstToRegister(amd64.SHLQ, 4, gpShiftAmount) 1007 1008 // Now ready to read the content of the mask into the vecTmp. 1009 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVDQU, 1010 gpTmp, 0, gpShiftAmount, 1, 1011 vecTmp, 1012 ) 1013 1014 // Finally, clear out the unnecessary 1015 c.assembler.CompileRegisterToRegister(amd64.PAND, vecTmp, v.register) 1016 1017 c.locationStack.markRegisterUnused(gpShiftAmount) 1018 c.pushVectorRuntimeValueLocationOnRegister(v.register) 1019 } 1020 return nil 1021 } 1022 1023 // i8x16SHLMaskTable is necessary for emulating non-existent packed bytes left shifts on amd64. 1024 // The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. 1025 var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. 1026 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift 1027 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift 1028 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift 1029 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift 1030 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift 1031 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift 1032 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift 1033 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift 1034 } 1035 1036 // compileV128Shl implements compiler.compileV128Shl for amd64. 1037 func (c *amd64Compiler) compileV128Shl(o *wazeroir.UnionOperation) error { 1038 s := c.locationStack.pop() 1039 if err := c.compileEnsureOnRegister(s); err != nil { 1040 return err 1041 } 1042 1043 x1 := c.locationStack.popV128() 1044 if err := c.compileEnsureOnRegister(x1); err != nil { 1045 return err 1046 } 1047 1048 vecTmp, err := c.allocateRegister(registerTypeVector) 1049 if err != nil { 1050 return err 1051 } 1052 1053 var modulo int64 1054 var shift asm.Instruction 1055 shape := o.B1 1056 switch shape { 1057 case wazeroir.ShapeI8x16: 1058 modulo = 0x7 // modulo 8. 1059 // x86 doesn't have packed bytes shift, so we use PSLLW and mask-out the redundant bits. 1060 // See https://stackoverflow.com/questions/35002937/sse-simd-shift-with-one-byte-element-size-granularity 1061 shift = amd64.PSLLW 1062 case wazeroir.ShapeI16x8: 1063 modulo = 0xf // modulo 16. 1064 shift = amd64.PSLLW 1065 case wazeroir.ShapeI32x4: 1066 modulo = 0x1f // modulo 32. 1067 shift = amd64.PSLLD 1068 case wazeroir.ShapeI64x2: 1069 modulo = 0x3f // modulo 64. 1070 shift = amd64.PSLLQ 1071 } 1072 1073 gpShiftAmount := s.register 1074 c.assembler.CompileConstToRegister(amd64.ANDQ, modulo, gpShiftAmount) 1075 c.assembler.CompileRegisterToRegister(amd64.MOVL, gpShiftAmount, vecTmp) 1076 c.assembler.CompileRegisterToRegister(shift, vecTmp, x1.register) 1077 1078 if shape == wazeroir.ShapeI8x16 { 1079 gpTmp, err := c.allocateRegister(registerTypeGeneralPurpose) 1080 if err != nil { 1081 return err 1082 } 1083 1084 // Read the initial address of the mask table into gpTmp register. 1085 err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, asm.NewStaticConst(i8x16SHLMaskTable[:]), gpTmp) 1086 if err != nil { 1087 return err 1088 } 1089 1090 // We have to get the mask according to the shift amount, so we first have to do 1091 // gpShiftAmount << 4 = gpShiftAmount*16 to get the initial offset of the mask (16 is the size of each mask in bytes). 1092 c.assembler.CompileConstToRegister(amd64.SHLQ, 4, gpShiftAmount) 1093 1094 // Now ready to read the content of the mask into the vecTmp. 1095 c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVDQU, 1096 gpTmp, 0, gpShiftAmount, 1, 1097 vecTmp, 1098 ) 1099 1100 // Finally, clear out the unnecessary 1101 c.assembler.CompileRegisterToRegister(amd64.PAND, vecTmp, x1.register) 1102 } 1103 1104 c.locationStack.markRegisterUnused(gpShiftAmount) 1105 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1106 return nil 1107 } 1108 1109 // compileV128Cmp implements compiler.compileV128Cmp for amd64. 1110 func (c *amd64Compiler) compileV128Cmp(o *wazeroir.UnionOperation) error { 1111 x2 := c.locationStack.popV128() 1112 if err := c.compileEnsureOnRegister(x2); err != nil { 1113 return err 1114 } 1115 1116 x1 := c.locationStack.popV128() 1117 if err := c.compileEnsureOnRegister(x1); err != nil { 1118 return err 1119 } 1120 1121 const ( 1122 // See https://www.felixcloutier.com/x86/cmppd and https://www.felixcloutier.com/x86/cmpps 1123 floatEqualArg = 0 1124 floatLessThanArg = 1 1125 floatLessThanOrEqualArg = 2 1126 floatNotEqualARg = 4 1127 ) 1128 1129 x1Reg, x2Reg, result := x1.register, x2.register, asm.NilRegister 1130 v128CmpType := o.B1 1131 switch v128CmpType { 1132 case wazeroir.V128CmpTypeF32x4Eq: 1133 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatEqualArg) 1134 result = x1Reg 1135 case wazeroir.V128CmpTypeF32x4Ne: 1136 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatNotEqualARg) 1137 result = x1Reg 1138 case wazeroir.V128CmpTypeF32x4Lt: 1139 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatLessThanArg) 1140 result = x1Reg 1141 case wazeroir.V128CmpTypeF32x4Gt: 1142 // Without AVX, there's no float Gt instruction, so we swap the register and use Lt instead. 1143 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x1Reg, x2Reg, floatLessThanArg) 1144 result = x2Reg 1145 case wazeroir.V128CmpTypeF32x4Le: 1146 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x2Reg, x1Reg, floatLessThanOrEqualArg) 1147 result = x1Reg 1148 case wazeroir.V128CmpTypeF32x4Ge: 1149 // Without AVX, there's no float Ge instruction, so we swap the register and use Le instead. 1150 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, x1Reg, x2Reg, floatLessThanOrEqualArg) 1151 result = x2Reg 1152 case wazeroir.V128CmpTypeF64x2Eq: 1153 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatEqualArg) 1154 result = x1Reg 1155 case wazeroir.V128CmpTypeF64x2Ne: 1156 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatNotEqualARg) 1157 result = x1Reg 1158 case wazeroir.V128CmpTypeF64x2Lt: 1159 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatLessThanArg) 1160 result = x1Reg 1161 case wazeroir.V128CmpTypeF64x2Gt: 1162 // Without AVX, there's no float Gt instruction, so we swap the register and use Lt instead. 1163 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x1Reg, x2Reg, floatLessThanArg) 1164 result = x2Reg 1165 case wazeroir.V128CmpTypeF64x2Le: 1166 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x2Reg, x1Reg, floatLessThanOrEqualArg) 1167 result = x1Reg 1168 case wazeroir.V128CmpTypeF64x2Ge: 1169 // Without AVX, there's no float Ge instruction, so we swap the register and use Le instead. 1170 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, x1Reg, x2Reg, floatLessThanOrEqualArg) 1171 result = x2Reg 1172 case wazeroir.V128CmpTypeI8x16Eq: 1173 c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg) 1174 result = x1Reg 1175 case wazeroir.V128CmpTypeI8x16Ne: 1176 c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg) 1177 // Set all bits on x2Reg register. 1178 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1179 // Swap the bits on x1Reg register. 1180 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1181 result = x1Reg 1182 case wazeroir.V128CmpTypeI8x16LtS: 1183 c.assembler.CompileRegisterToRegister(amd64.PCMPGTB, x1Reg, x2Reg) 1184 result = x2Reg 1185 case wazeroir.V128CmpTypeI8x16LtU, wazeroir.V128CmpTypeI8x16GtU: 1186 // Take the unsigned min/max values on each byte on x1 and x2 onto x1Reg. 1187 if v128CmpType == wazeroir.V128CmpTypeI8x16LtU { 1188 c.assembler.CompileRegisterToRegister(amd64.PMINUB, x2Reg, x1Reg) 1189 } else { 1190 c.assembler.CompileRegisterToRegister(amd64.PMAXUB, x2Reg, x1Reg) 1191 } 1192 c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, x2Reg, x1Reg) 1193 // Set all bits on x2Reg register. 1194 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1195 // Swap the bits on x2Reg register. 1196 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1197 result = x1Reg 1198 case wazeroir.V128CmpTypeI8x16GtS: 1199 c.assembler.CompileRegisterToRegister(amd64.PCMPGTB, x2Reg, x1Reg) 1200 result = x1Reg 1201 case wazeroir.V128CmpTypeI8x16LeS, wazeroir.V128CmpTypeI8x16LeU: 1202 tmp, err := c.allocateRegister(registerTypeVector) 1203 if err != nil { 1204 return err 1205 } 1206 // Copy the value on the src to tmp. 1207 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1208 if v128CmpType == wazeroir.V128CmpTypeI8x16LeS { 1209 c.assembler.CompileRegisterToRegister(amd64.PMINSB, x2Reg, tmp) 1210 } else { 1211 c.assembler.CompileRegisterToRegister(amd64.PMINUB, x2Reg, tmp) 1212 } 1213 c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, tmp, x1Reg) 1214 result = x1Reg 1215 case wazeroir.V128CmpTypeI8x16GeS, wazeroir.V128CmpTypeI8x16GeU: 1216 tmp, err := c.allocateRegister(registerTypeVector) 1217 if err != nil { 1218 return err 1219 } 1220 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1221 if v128CmpType == wazeroir.V128CmpTypeI8x16GeS { 1222 c.assembler.CompileRegisterToRegister(amd64.PMAXSB, x2Reg, tmp) 1223 } else { 1224 c.assembler.CompileRegisterToRegister(amd64.PMAXUB, x2Reg, tmp) 1225 } 1226 c.assembler.CompileRegisterToRegister(amd64.PCMPEQB, tmp, x1Reg) 1227 result = x1Reg 1228 case wazeroir.V128CmpTypeI16x8Eq: 1229 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg) 1230 result = x1Reg 1231 case wazeroir.V128CmpTypeI16x8Ne: 1232 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg) 1233 // Set all bits on x2Reg register. 1234 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1235 // Swap the bits on x1Reg register. 1236 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1237 result = x1Reg 1238 case wazeroir.V128CmpTypeI16x8LtS: 1239 c.assembler.CompileRegisterToRegister(amd64.PCMPGTW, x1Reg, x2Reg) 1240 result = x2Reg 1241 case wazeroir.V128CmpTypeI16x8LtU, wazeroir.V128CmpTypeI16x8GtU: 1242 // Take the unsigned min/max values on each byte on x1 and x2 onto x1Reg. 1243 if v128CmpType == wazeroir.V128CmpTypeI16x8LtU { 1244 c.assembler.CompileRegisterToRegister(amd64.PMINUW, x2Reg, x1Reg) 1245 } else { 1246 c.assembler.CompileRegisterToRegister(amd64.PMAXUW, x2Reg, x1Reg) 1247 } 1248 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x2Reg, x1Reg) 1249 // Set all bits on x2Reg register. 1250 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1251 // Swap the bits on x2Reg register. 1252 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1253 result = x1Reg 1254 case wazeroir.V128CmpTypeI16x8GtS: 1255 c.assembler.CompileRegisterToRegister(amd64.PCMPGTW, x2Reg, x1Reg) 1256 result = x1Reg 1257 case wazeroir.V128CmpTypeI16x8LeS, wazeroir.V128CmpTypeI16x8LeU: 1258 tmp, err := c.allocateRegister(registerTypeVector) 1259 if err != nil { 1260 return err 1261 } 1262 // Copy the value on the src to tmp. 1263 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1264 if v128CmpType == wazeroir.V128CmpTypeI16x8LeS { 1265 c.assembler.CompileRegisterToRegister(amd64.PMINSW, x2Reg, tmp) 1266 } else { 1267 c.assembler.CompileRegisterToRegister(amd64.PMINUW, x2Reg, tmp) 1268 } 1269 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, tmp, x1Reg) 1270 result = x1Reg 1271 case wazeroir.V128CmpTypeI16x8GeS, wazeroir.V128CmpTypeI16x8GeU: 1272 tmp, err := c.allocateRegister(registerTypeVector) 1273 if err != nil { 1274 return err 1275 } 1276 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1277 if v128CmpType == wazeroir.V128CmpTypeI16x8GeS { 1278 c.assembler.CompileRegisterToRegister(amd64.PMAXSW, x2Reg, tmp) 1279 } else { 1280 c.assembler.CompileRegisterToRegister(amd64.PMAXUW, x2Reg, tmp) 1281 } 1282 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, tmp, x1Reg) 1283 result = x1Reg 1284 case wazeroir.V128CmpTypeI32x4Eq: 1285 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg) 1286 result = x1Reg 1287 case wazeroir.V128CmpTypeI32x4Ne: 1288 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg) 1289 // Set all bits on x2Reg register. 1290 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1291 // Swap the bits on x1Reg register. 1292 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1293 result = x1Reg 1294 case wazeroir.V128CmpTypeI32x4LtS: 1295 c.assembler.CompileRegisterToRegister(amd64.PCMPGTD, x1Reg, x2Reg) 1296 result = x2Reg 1297 case wazeroir.V128CmpTypeI32x4LtU, wazeroir.V128CmpTypeI32x4GtU: 1298 // Take the unsigned min/max values on each byte on x1 and x2 onto x1Reg. 1299 if v128CmpType == wazeroir.V128CmpTypeI32x4LtU { 1300 c.assembler.CompileRegisterToRegister(amd64.PMINUD, x2Reg, x1Reg) 1301 } else { 1302 c.assembler.CompileRegisterToRegister(amd64.PMAXUD, x2Reg, x1Reg) 1303 } 1304 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x1Reg) 1305 // Set all bits on x2Reg register. 1306 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1307 // Swap the bits on x2Reg register. 1308 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1309 result = x1Reg 1310 case wazeroir.V128CmpTypeI32x4GtS: 1311 c.assembler.CompileRegisterToRegister(amd64.PCMPGTD, x2Reg, x1Reg) 1312 result = x1Reg 1313 case wazeroir.V128CmpTypeI32x4LeS, wazeroir.V128CmpTypeI32x4LeU: 1314 tmp, err := c.allocateRegister(registerTypeVector) 1315 if err != nil { 1316 return err 1317 } 1318 // Copy the value on the src to tmp. 1319 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1320 if v128CmpType == wazeroir.V128CmpTypeI32x4LeS { 1321 c.assembler.CompileRegisterToRegister(amd64.PMINSD, x2Reg, tmp) 1322 } else { 1323 c.assembler.CompileRegisterToRegister(amd64.PMINUD, x2Reg, tmp) 1324 } 1325 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, x1Reg) 1326 result = x1Reg 1327 case wazeroir.V128CmpTypeI32x4GeS, wazeroir.V128CmpTypeI32x4GeU: 1328 tmp, err := c.allocateRegister(registerTypeVector) 1329 if err != nil { 1330 return err 1331 } 1332 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1Reg, tmp) 1333 if v128CmpType == wazeroir.V128CmpTypeI32x4GeS { 1334 c.assembler.CompileRegisterToRegister(amd64.PMAXSD, x2Reg, tmp) 1335 } else { 1336 c.assembler.CompileRegisterToRegister(amd64.PMAXUD, x2Reg, tmp) 1337 } 1338 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, x1Reg) 1339 result = x1Reg 1340 case wazeroir.V128CmpTypeI64x2Eq: 1341 c.assembler.CompileRegisterToRegister(amd64.PCMPEQQ, x2Reg, x1Reg) 1342 result = x1Reg 1343 case wazeroir.V128CmpTypeI64x2Ne: 1344 c.assembler.CompileRegisterToRegister(amd64.PCMPEQQ, x2Reg, x1Reg) 1345 // Set all bits on x2Reg register. 1346 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1347 // Swap the bits on x1Reg register. 1348 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1349 result = x1Reg 1350 case wazeroir.V128CmpTypeI64x2LtS: 1351 c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x1Reg, x2Reg) 1352 result = x2Reg 1353 case wazeroir.V128CmpTypeI64x2GtS: 1354 c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x2Reg, x1Reg) 1355 result = x1Reg 1356 case wazeroir.V128CmpTypeI64x2LeS: 1357 c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x2Reg, x1Reg) 1358 // Set all bits on x2Reg register. 1359 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x2Reg, x2Reg) 1360 // Swap the bits on x1Reg register. 1361 c.assembler.CompileRegisterToRegister(amd64.PXOR, x2Reg, x1Reg) 1362 result = x1Reg 1363 case wazeroir.V128CmpTypeI64x2GeS: 1364 c.assembler.CompileRegisterToRegister(amd64.PCMPGTQ, x1Reg, x2Reg) 1365 // Set all bits on x1Reg register. 1366 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, x1Reg, x1Reg) 1367 // Swap the bits on x2Reg register. 1368 c.assembler.CompileRegisterToRegister(amd64.PXOR, x1Reg, x2Reg) 1369 result = x2Reg 1370 } 1371 1372 c.locationStack.markRegisterUnused(x1Reg, x2Reg) 1373 c.pushVectorRuntimeValueLocationOnRegister(result) 1374 return nil 1375 } 1376 1377 // compileV128AddSat implements compiler.compileV128AddSat for amd64. 1378 func (c *amd64Compiler) compileV128AddSat(o *wazeroir.UnionOperation) error { 1379 var inst asm.Instruction 1380 shape := o.B1 1381 signed := o.B3 1382 switch shape { 1383 case wazeroir.ShapeI8x16: 1384 if signed { 1385 inst = amd64.PADDSB 1386 } else { 1387 inst = amd64.PADDUSB 1388 } 1389 case wazeroir.ShapeI16x8: 1390 if signed { 1391 inst = amd64.PADDSW 1392 } else { 1393 inst = amd64.PADDUSW 1394 } 1395 } 1396 1397 x2 := c.locationStack.popV128() 1398 if err := c.compileEnsureOnRegister(x2); err != nil { 1399 return err 1400 } 1401 1402 x1 := c.locationStack.popV128() 1403 if err := c.compileEnsureOnRegister(x1); err != nil { 1404 return err 1405 } 1406 1407 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1408 1409 c.locationStack.markRegisterUnused(x2.register) 1410 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1411 return nil 1412 } 1413 1414 // compileV128SubSat implements compiler.compileV128SubSat for amd64. 1415 func (c *amd64Compiler) compileV128SubSat(o *wazeroir.UnionOperation) error { 1416 var inst asm.Instruction 1417 shape := o.B1 1418 signed := o.B3 1419 switch shape { 1420 case wazeroir.ShapeI8x16: 1421 if signed { 1422 inst = amd64.PSUBSB 1423 } else { 1424 inst = amd64.PSUBUSB 1425 } 1426 case wazeroir.ShapeI16x8: 1427 if signed { 1428 inst = amd64.PSUBSW 1429 } else { 1430 inst = amd64.PSUBUSW 1431 } 1432 } 1433 1434 x2 := c.locationStack.popV128() 1435 if err := c.compileEnsureOnRegister(x2); err != nil { 1436 return err 1437 } 1438 1439 x1 := c.locationStack.popV128() 1440 if err := c.compileEnsureOnRegister(x1); err != nil { 1441 return err 1442 } 1443 1444 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1445 1446 c.locationStack.markRegisterUnused(x2.register) 1447 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1448 return nil 1449 } 1450 1451 // compileV128Mul implements compiler.compileV128Mul for amd64. 1452 func (c *amd64Compiler) compileV128Mul(o *wazeroir.UnionOperation) error { 1453 var inst asm.Instruction 1454 shape := o.B1 1455 switch shape { 1456 case wazeroir.ShapeI16x8: 1457 inst = amd64.PMULLW 1458 case wazeroir.ShapeI32x4: 1459 inst = amd64.PMULLD 1460 case wazeroir.ShapeI64x2: 1461 return c.compileV128MulI64x2() 1462 case wazeroir.ShapeF32x4: 1463 inst = amd64.MULPS 1464 case wazeroir.ShapeF64x2: 1465 inst = amd64.MULPD 1466 } 1467 1468 x2 := c.locationStack.popV128() 1469 if err := c.compileEnsureOnRegister(x2); err != nil { 1470 return err 1471 } 1472 1473 x1 := c.locationStack.popV128() 1474 if err := c.compileEnsureOnRegister(x1); err != nil { 1475 return err 1476 } 1477 1478 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1479 1480 c.locationStack.markRegisterUnused(x2.register) 1481 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1482 return nil 1483 } 1484 1485 // compileV128MulI64x2 implements V128Mul for i64x2. 1486 func (c *amd64Compiler) compileV128MulI64x2() error { 1487 x2 := c.locationStack.popV128() 1488 if err := c.compileEnsureOnRegister(x2); err != nil { 1489 return err 1490 } 1491 1492 x1 := c.locationStack.popV128() 1493 if err := c.compileEnsureOnRegister(x1); err != nil { 1494 return err 1495 } 1496 1497 x1r, x2r := x1.register, x2.register 1498 1499 tmp1, err := c.allocateRegister(registerTypeVector) 1500 if err != nil { 1501 return err 1502 } 1503 1504 c.locationStack.markRegisterUsed(tmp1) 1505 1506 tmp2, err := c.allocateRegister(registerTypeVector) 1507 if err != nil { 1508 return err 1509 } 1510 1511 // Assuming that we have 1512 // x1r = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_high] 1513 // x2r = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_high] 1514 // where pN and qN are 64-bit (quad word) lane, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lane. 1515 1516 // Copy x1's value into tmp1. 1517 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp1) 1518 // And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [0, p1_high, 0, p2_high] 1519 c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp1) 1520 1521 // Execute "pmuludq x2r,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit. 1522 c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, tmp1) 1523 1524 // Copy x2's value into tmp2. 1525 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, tmp2) 1526 // And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [0, q1_high, 0, q2_high] 1527 c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp2) 1528 1529 // Execute "pmuludq x1r,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit. 1530 c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x1r, tmp2) 1531 1532 // Adds tmp1 and tmp2 and do the logical left shift by 32-bit, 1533 // which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32, (p2_lo*q2_high+p2_high*q2_lo)<<32] 1534 c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp2, tmp1) 1535 c.assembler.CompileConstToRegister(amd64.PSLLQ, 32, tmp1) 1536 1537 // Execute "pmuludq x2r,x1r", which makes x1r = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit. 1538 c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, x1r) 1539 1540 // Finally, we get the result by adding x1r and tmp1, 1541 // which makes x1r = [(p1_lo*q1_high+p1_high*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_high+p2_high*q2_lo)<<32+p2_lo*q2_lo] 1542 c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp1, x1r) 1543 1544 c.locationStack.markRegisterUnused(x2r, tmp1) 1545 c.pushVectorRuntimeValueLocationOnRegister(x1r) 1546 return nil 1547 } 1548 1549 // compileV128Div implements compiler.compileV128Div for amd64. 1550 func (c *amd64Compiler) compileV128Div(o *wazeroir.UnionOperation) error { 1551 x2 := c.locationStack.popV128() 1552 if err := c.compileEnsureOnRegister(x2); err != nil { 1553 return err 1554 } 1555 1556 x1 := c.locationStack.popV128() 1557 if err := c.compileEnsureOnRegister(x1); err != nil { 1558 return err 1559 } 1560 1561 var inst asm.Instruction 1562 shape := o.B1 1563 switch shape { 1564 case wazeroir.ShapeF32x4: 1565 inst = amd64.DIVPS 1566 case wazeroir.ShapeF64x2: 1567 inst = amd64.DIVPD 1568 } 1569 1570 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1571 1572 c.locationStack.markRegisterUnused(x2.register) 1573 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1574 return nil 1575 } 1576 1577 // compileV128Neg implements compiler.compileV128Neg for amd64. 1578 func (c *amd64Compiler) compileV128Neg(o *wazeroir.UnionOperation) error { 1579 shape := o.B1 1580 if shape <= wazeroir.ShapeI64x2 { 1581 return c.compileV128NegInt(shape) 1582 } else { 1583 return c.compileV128NegFloat(shape) 1584 } 1585 } 1586 1587 // compileV128NegInt implements compiler.compileV128Neg for integer lanes. 1588 func (c *amd64Compiler) compileV128NegInt(s wazeroir.Shape) error { 1589 v := c.locationStack.popV128() 1590 if err := c.compileEnsureOnRegister(v); err != nil { 1591 return err 1592 } 1593 1594 result, err := c.allocateRegister(registerTypeVector) 1595 if err != nil { 1596 return err 1597 } 1598 1599 var subInst asm.Instruction 1600 switch s { 1601 case wazeroir.ShapeI8x16: 1602 subInst = amd64.PSUBB 1603 case wazeroir.ShapeI16x8: 1604 subInst = amd64.PSUBW 1605 case wazeroir.ShapeI32x4: 1606 subInst = amd64.PSUBD 1607 case wazeroir.ShapeI64x2: 1608 subInst = amd64.PSUBQ 1609 } 1610 1611 c.assembler.CompileRegisterToRegister(amd64.PXOR, result, result) 1612 c.assembler.CompileRegisterToRegister(subInst, v.register, result) 1613 1614 c.locationStack.markRegisterUnused(v.register) 1615 c.pushVectorRuntimeValueLocationOnRegister(result) 1616 return nil 1617 } 1618 1619 // compileV128NegInt implements compiler.compileV128Neg for float lanes. 1620 func (c *amd64Compiler) compileV128NegFloat(s wazeroir.Shape) error { 1621 v := c.locationStack.popV128() 1622 if err := c.compileEnsureOnRegister(v); err != nil { 1623 return err 1624 } 1625 1626 tmp, err := c.allocateRegister(registerTypeVector) 1627 if err != nil { 1628 return err 1629 } 1630 1631 var leftShiftInst, xorInst asm.Instruction 1632 var leftShiftAmount asm.ConstantValue 1633 if s == wazeroir.ShapeF32x4 { 1634 leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLD, 31, amd64.XORPS 1635 } else { 1636 leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLQ, 63, amd64.XORPD 1637 } 1638 1639 // Clear all bits on tmp. 1640 c.assembler.CompileRegisterToRegister(amd64.XORPS, tmp, tmp) 1641 // Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction). 1642 // See https://www.felixcloutier.com/x86/cmpps 1643 // 1644 // Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane 1645 // if the lane is NaN. 1646 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, tmp, tmp, 0x8) 1647 // Do the left shift on each lane to set only the most significant bit in each. 1648 c.assembler.CompileConstToRegister(leftShiftInst, leftShiftAmount, tmp) 1649 // Get the negated result by XOR on each lane with tmp. 1650 c.assembler.CompileRegisterToRegister(xorInst, tmp, v.register) 1651 1652 c.pushVectorRuntimeValueLocationOnRegister(v.register) 1653 return nil 1654 } 1655 1656 // compileV128Sqrt implements compiler.compileV128Sqrt for amd64. 1657 func (c *amd64Compiler) compileV128Sqrt(o *wazeroir.UnionOperation) error { 1658 v := c.locationStack.popV128() 1659 if err := c.compileEnsureOnRegister(v); err != nil { 1660 return err 1661 } 1662 1663 var inst asm.Instruction 1664 shape := o.B1 1665 switch shape { 1666 case wazeroir.ShapeF64x2: 1667 inst = amd64.SQRTPD 1668 case wazeroir.ShapeF32x4: 1669 inst = amd64.SQRTPS 1670 } 1671 1672 c.assembler.CompileRegisterToRegister(inst, v.register, v.register) 1673 c.pushVectorRuntimeValueLocationOnRegister(v.register) 1674 return nil 1675 } 1676 1677 // compileV128Abs implements compiler.compileV128Abs for amd64. 1678 func (c *amd64Compiler) compileV128Abs(o *wazeroir.UnionOperation) error { 1679 shape := o.B1 1680 if shape == wazeroir.ShapeI64x2 { 1681 return c.compileV128AbsI64x2() 1682 } 1683 1684 v := c.locationStack.popV128() 1685 if err := c.compileEnsureOnRegister(v); err != nil { 1686 return err 1687 } 1688 1689 result := v.register 1690 switch shape { 1691 case wazeroir.ShapeI8x16: 1692 c.assembler.CompileRegisterToRegister(amd64.PABSB, result, result) 1693 case wazeroir.ShapeI16x8: 1694 c.assembler.CompileRegisterToRegister(amd64.PABSW, result, result) 1695 case wazeroir.ShapeI32x4: 1696 c.assembler.CompileRegisterToRegister(amd64.PABSD, result, result) 1697 case wazeroir.ShapeF32x4: 1698 tmp, err := c.allocateRegister(registerTypeVector) 1699 if err != nil { 1700 return err 1701 } 1702 // Set all bits on tmp. 1703 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp) 1704 // Shift right packed single floats by 1 to clear the sign bits. 1705 c.assembler.CompileConstToRegister(amd64.PSRLD, 1, tmp) 1706 // Clear the sign bit of vr. 1707 c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, result) 1708 case wazeroir.ShapeF64x2: 1709 tmp, err := c.allocateRegister(registerTypeVector) 1710 if err != nil { 1711 return err 1712 } 1713 // Set all bits on tmp. 1714 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp) 1715 // Shift right packed single floats by 1 to clear the sign bits. 1716 c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, tmp) 1717 // Clear the sign bit of vr. 1718 c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmp, result) 1719 } 1720 1721 c.pushVectorRuntimeValueLocationOnRegister(result) 1722 return nil 1723 } 1724 1725 // compileV128AbsI64x2 implements compileV128Abs for i64x2 lanes. 1726 func (c *amd64Compiler) compileV128AbsI64x2() error { 1727 // See https://www.felixcloutier.com/x86/blendvpd 1728 const blendMaskReg = amd64.RegX0 1729 c.onValueReleaseRegisterToStack(blendMaskReg) 1730 c.locationStack.markRegisterUsed(blendMaskReg) 1731 1732 v := c.locationStack.popV128() 1733 if err := c.compileEnsureOnRegister(v); err != nil { 1734 return err 1735 } 1736 vr := v.register 1737 1738 if vr == blendMaskReg { 1739 return errors.New("BUG: X0 must not be used") 1740 } 1741 1742 tmp, err := c.allocateRegister(registerTypeVector) 1743 if err != nil { 1744 return err 1745 } 1746 c.locationStack.markRegisterUsed(tmp) 1747 1748 // Copy the value to tmp. 1749 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp) 1750 1751 // Clear all bits on blendMaskReg. 1752 c.assembler.CompileRegisterToRegister(amd64.PXOR, blendMaskReg, blendMaskReg) 1753 // Subtract vr from blendMaskReg. 1754 c.assembler.CompileRegisterToRegister(amd64.PSUBQ, vr, blendMaskReg) 1755 // Copy the subtracted value ^^ back into vr. 1756 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, blendMaskReg, vr) 1757 1758 c.assembler.CompileRegisterToRegister(amd64.BLENDVPD, tmp, vr) 1759 1760 c.locationStack.markRegisterUnused(blendMaskReg, tmp) 1761 c.pushVectorRuntimeValueLocationOnRegister(vr) 1762 return nil 1763 } 1764 1765 var ( 1766 popcntMask = [16]byte{ 1767 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 1768 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 1769 } 1770 // popcntTable holds each index's Popcnt, for example popcntTable[5] holds popcnt(0x05). 1771 popcntTable = [16]byte{ 1772 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 1773 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 1774 } 1775 ) 1776 1777 // compileV128Popcnt implements compiler.compileV128Popcnt for amd64. 1778 func (c *amd64Compiler) compileV128Popcnt(operation *wazeroir.UnionOperation) error { 1779 v := c.locationStack.popV128() 1780 if err := c.compileEnsureOnRegister(v); err != nil { 1781 return err 1782 } 1783 vr := v.register 1784 1785 tmp1, err := c.allocateRegister(registerTypeVector) 1786 if err != nil { 1787 return err 1788 } 1789 1790 c.locationStack.markRegisterUsed(tmp1) 1791 1792 tmp2, err := c.allocateRegister(registerTypeVector) 1793 if err != nil { 1794 return err 1795 } 1796 1797 c.locationStack.markRegisterUsed(tmp2) 1798 1799 tmp3, err := c.allocateRegister(registerTypeVector) 1800 if err != nil { 1801 return err 1802 } 1803 1804 // Read the popcntMask into tmp1, and we have 1805 // tmp1 = [0xf, ..., 0xf] 1806 if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(popcntMask[:]), tmp1); err != nil { 1807 return err 1808 } 1809 1810 // Copy the original value into tmp2. 1811 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2) 1812 1813 // Given that we have: 1814 // v = [b1, ..., b16] where bn = hn:ln and hn and ln are higher and lower 4-bits of bn. 1815 // 1816 // Take PAND on tmp1 and tmp2, and we have 1817 // tmp2 = [l1, ..., l16]. 1818 c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, tmp2) 1819 1820 // Do logical (packed word) right shift by 4 on vr and PAND with vr and tmp1, meaning that we have 1821 // vr = [h1, ...., h16]. 1822 c.assembler.CompileConstToRegister(amd64.PSRLW, 4, vr) 1823 c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, vr) 1824 1825 // Read the popcntTable into tmp1, and we have 1826 // tmp1 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04] 1827 if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(popcntTable[:]), tmp1); err != nil { 1828 return err 1829 } 1830 1831 // Copy the tmp1 into tmp3, and we have 1832 // tmp3 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04] 1833 c.assembler.CompileRegisterToRegister(amd64.MOVDQU, tmp1, tmp3) 1834 1835 // tmp3 = [popcnt(l1), ..., popcnt(l16)]. 1836 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp2, tmp3) 1837 1838 // tmp1 = [popcnt(h1), ..., popcnt(h16)]. 1839 c.assembler.CompileRegisterToRegister(amd64.PSHUFB, vr, tmp1) 1840 1841 // vr = tmp1 = [popcnt(h1), ..., popcnt(h16)]. 1842 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, tmp1, vr) 1843 1844 // vr += tmp3 = [popcnt(h1)+popcnt(l1), ..., popcnt(h16)+popcnt(l16)] = [popcnt(b1), ..., popcnt(b16)]. 1845 c.assembler.CompileRegisterToRegister(amd64.PADDB, tmp3, vr) 1846 1847 c.locationStack.markRegisterUnused(tmp1, tmp2) 1848 c.pushVectorRuntimeValueLocationOnRegister(vr) 1849 return nil 1850 } 1851 1852 // compileV128Min implements compiler.compileV128Min for amd64. 1853 func (c *amd64Compiler) compileV128Min(o *wazeroir.UnionOperation) error { 1854 x2 := c.locationStack.popV128() 1855 if err := c.compileEnsureOnRegister(x2); err != nil { 1856 return err 1857 } 1858 1859 x1 := c.locationStack.popV128() 1860 if err := c.compileEnsureOnRegister(x1); err != nil { 1861 return err 1862 } 1863 1864 shape := o.B1 1865 if shape >= wazeroir.ShapeF32x4 { 1866 return c.compileV128FloatMinImpl(shape == wazeroir.ShapeF32x4, x1.register, x2.register) 1867 } 1868 1869 signed := o.B3 1870 var inst asm.Instruction 1871 switch shape { 1872 case wazeroir.ShapeI8x16: 1873 if signed { 1874 inst = amd64.PMINSB 1875 } else { 1876 inst = amd64.PMINUB 1877 } 1878 case wazeroir.ShapeI16x8: 1879 if signed { 1880 inst = amd64.PMINSW 1881 } else { 1882 inst = amd64.PMINUW 1883 } 1884 case wazeroir.ShapeI32x4: 1885 if signed { 1886 inst = amd64.PMINSD 1887 } else { 1888 inst = amd64.PMINUD 1889 } 1890 } 1891 1892 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1893 1894 c.locationStack.markRegisterUnused(x2.register) 1895 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1896 return nil 1897 } 1898 1899 // compileV128FloatMinImpl implements compiler.compileV128Min for float lanes. 1900 func (c *amd64Compiler) compileV128FloatMinImpl(is32bit bool, x1r, x2r asm.Register) error { 1901 tmp, err := c.allocateRegister(registerTypeVector) 1902 if err != nil { 1903 return err 1904 } 1905 1906 var min, cmp, andn, or, srl /* shit right logical */ asm.Instruction 1907 var shiftNumToInverseNaN asm.ConstantValue 1908 if is32bit { 1909 min, cmp, andn, or, srl, shiftNumToInverseNaN = amd64.MINPS, amd64.CMPPS, amd64.ANDNPS, amd64.ORPS, amd64.PSRLD, 0xa 1910 } else { 1911 min, cmp, andn, or, srl, shiftNumToInverseNaN = amd64.MINPD, amd64.CMPPD, amd64.ANDNPD, amd64.ORPD, amd64.PSRLQ, 0xd 1912 } 1913 1914 // Let v1 and v2 be the operand values on x1r and x2r at this point. 1915 1916 // Copy the value into tmp: tmp=v1 1917 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp) 1918 // tmp=min(v1, v2) 1919 c.assembler.CompileRegisterToRegister(min, x2r, tmp) 1920 // x2r=min(v2, v1) 1921 c.assembler.CompileRegisterToRegister(min, x1r, x2r) 1922 // x1r=min(v2, v1) 1923 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, x1r) 1924 1925 // x2r = -0 if (v1 == -0 || x2 == -0) && v1 != NaN && v2 !=NaN 1926 // NaN if v1 == NaN || v2 == NaN 1927 // min(v1, v2) otherwise 1928 c.assembler.CompileRegisterToRegister(or, tmp, x2r) 1929 // x1r = 0^ (set all bits) if v1 == NaN || v2 == NaN 1930 // 0 otherwise 1931 c.assembler.CompileRegisterToRegisterWithArg(cmp, tmp, x1r, 3) 1932 // x2r = -0 if (v1 == -0 || x2 == -0) && v1 != NaN && v2 !=NaN 1933 // ^0 if v1 == NaN || v2 == NaN 1934 // min(v1, v2) otherwise 1935 c.assembler.CompileRegisterToRegister(or, x1r, x2r) 1936 // x1r = set all bits on the mantissa bits 1937 // 0 otherwise 1938 c.assembler.CompileConstToRegister(srl, shiftNumToInverseNaN, x1r) 1939 // x1r = x2r and !x1r 1940 // = -0 if (v1 == -0 || x2 == -0) && v1 != NaN && v2 !=NaN 1941 // set all bits on exponential and sign bit (== NaN) if v1 == NaN || v2 == NaN 1942 // min(v1, v2) otherwise 1943 c.assembler.CompileRegisterToRegister(andn, x2r, x1r) 1944 1945 c.locationStack.markRegisterUnused(x2r) 1946 c.pushVectorRuntimeValueLocationOnRegister(x1r) 1947 return nil 1948 } 1949 1950 // compileV128Max implements compiler.compileV128Max for amd64. 1951 func (c *amd64Compiler) compileV128Max(o *wazeroir.UnionOperation) error { 1952 x2 := c.locationStack.popV128() 1953 if err := c.compileEnsureOnRegister(x2); err != nil { 1954 return err 1955 } 1956 1957 x1 := c.locationStack.popV128() 1958 if err := c.compileEnsureOnRegister(x1); err != nil { 1959 return err 1960 } 1961 1962 shape := o.B1 1963 if shape >= wazeroir.ShapeF32x4 { 1964 return c.compileV128FloatMaxImpl(shape == wazeroir.ShapeF32x4, x1.register, x2.register) 1965 } 1966 1967 signed := o.B3 1968 var inst asm.Instruction 1969 switch shape { 1970 case wazeroir.ShapeI8x16: 1971 if signed { 1972 inst = amd64.PMAXSB 1973 } else { 1974 inst = amd64.PMAXUB 1975 } 1976 case wazeroir.ShapeI16x8: 1977 if signed { 1978 inst = amd64.PMAXSW 1979 } else { 1980 inst = amd64.PMAXUW 1981 } 1982 case wazeroir.ShapeI32x4: 1983 if signed { 1984 inst = amd64.PMAXSD 1985 } else { 1986 inst = amd64.PMAXUD 1987 } 1988 } 1989 1990 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 1991 1992 c.locationStack.markRegisterUnused(x2.register) 1993 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 1994 return nil 1995 } 1996 1997 // compileV128FloatMaxImpl implements compiler.compileV128Max for float lanes. 1998 func (c *amd64Compiler) compileV128FloatMaxImpl(is32bit bool, x1r, x2r asm.Register) error { 1999 tmp, err := c.allocateRegister(registerTypeVector) 2000 if err != nil { 2001 return err 2002 } 2003 2004 var max, cmp, andn, or, xor, sub, srl /* shit right logical */ asm.Instruction 2005 var shiftNumToInverseNaN asm.ConstantValue 2006 if is32bit { 2007 max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = amd64.MAXPS, amd64.CMPPS, amd64.ANDNPS, amd64.ORPS, amd64.XORPS, amd64.SUBPS, amd64.PSRLD, 0xa 2008 } else { 2009 max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = amd64.MAXPD, amd64.CMPPD, amd64.ANDNPD, amd64.ORPD, amd64.XORPD, amd64.SUBPD, amd64.PSRLQ, 0xd 2010 } 2011 2012 // Let v1 and v2 be the operand values on x1r and x2r at this point. 2013 2014 // Copy the value into tmp: tmp=v2 2015 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, tmp) 2016 // tmp=max(v2, v1) 2017 c.assembler.CompileRegisterToRegister(max, x1r, tmp) 2018 // x1r=max(v1, v2) 2019 c.assembler.CompileRegisterToRegister(max, x2r, x1r) 2020 // x2r=max(v1, v2) 2021 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, x2r) 2022 2023 // x2r = -0 if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0) 2024 // 0 if (v1 == 0 && v2 == 0) 2025 // -0 if (v1 == -0 && v2 == -0) 2026 // v1^v2 if v1 == NaN || v2 == NaN 2027 // 0 otherwise 2028 c.assembler.CompileRegisterToRegister(xor, tmp, x2r) 2029 // x1r = -0 if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0) 2030 // 0 if (v1 == 0 && v2 == 0) 2031 // -0 if (v1 == -0 && v2 == -0) 2032 // NaN if v1 == NaN || v2 == NaN 2033 // max(v1, v2) otherwise 2034 c.assembler.CompileRegisterToRegister(or, x2r, x1r) 2035 // Copy x1r into tmp. 2036 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp) 2037 // tmp = 0 if (v1 == -0 && v2 == 0) || (v1 == 0 && v2 == -0) || (v1 == 0 && v2 == 0) 2038 // -0 if (v1 == -0 && v2 == -0) 2039 // NaN if v1 == NaN || v2 == NaN 2040 // max(v1, v2) otherwise 2041 // 2042 // Note: -0 - (-0) = 0 (!= -0) in floating point operation. 2043 c.assembler.CompileRegisterToRegister(sub, x2r, tmp) 2044 // x1r = 0^ if v1 == NaN || v2 == NaN 2045 c.assembler.CompileRegisterToRegisterWithArg(cmp, x1r, x1r, 3) 2046 // x1r = set all bits on the mantissa bits 2047 // 0 otherwise 2048 c.assembler.CompileConstToRegister(srl, shiftNumToInverseNaN, x1r) 2049 c.assembler.CompileRegisterToRegister(andn, tmp, x1r) 2050 2051 c.locationStack.markRegisterUnused(x2r) 2052 c.pushVectorRuntimeValueLocationOnRegister(x1r) 2053 return nil 2054 } 2055 2056 // compileV128AvgrU implements compiler.compileV128AvgrU for amd64. 2057 func (c *amd64Compiler) compileV128AvgrU(o *wazeroir.UnionOperation) error { 2058 x2 := c.locationStack.popV128() 2059 if err := c.compileEnsureOnRegister(x2); err != nil { 2060 return err 2061 } 2062 2063 x1 := c.locationStack.popV128() 2064 if err := c.compileEnsureOnRegister(x1); err != nil { 2065 return err 2066 } 2067 2068 var inst asm.Instruction 2069 shape := o.B1 2070 switch shape { 2071 case wazeroir.ShapeI8x16: 2072 inst = amd64.PAVGB 2073 case wazeroir.ShapeI16x8: 2074 inst = amd64.PAVGW 2075 } 2076 2077 c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) 2078 2079 c.locationStack.markRegisterUnused(x2.register) 2080 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 2081 return nil 2082 } 2083 2084 // compileV128Pmin implements compiler.compileV128Pmin for amd64. 2085 func (c *amd64Compiler) compileV128Pmin(o *wazeroir.UnionOperation) error { 2086 x2 := c.locationStack.popV128() 2087 if err := c.compileEnsureOnRegister(x2); err != nil { 2088 return err 2089 } 2090 2091 x1 := c.locationStack.popV128() 2092 if err := c.compileEnsureOnRegister(x1); err != nil { 2093 return err 2094 } 2095 2096 var min asm.Instruction 2097 if o.B1 == wazeroir.ShapeF32x4 { 2098 min = amd64.MINPS 2099 } else { 2100 min = amd64.MINPD 2101 } 2102 2103 x1r, v2r := x1.register, x2.register 2104 2105 c.assembler.CompileRegisterToRegister(min, x1r, v2r) 2106 2107 c.locationStack.markRegisterUnused(x1r) 2108 c.pushVectorRuntimeValueLocationOnRegister(v2r) 2109 return nil 2110 } 2111 2112 // compileV128Pmax implements compiler.compileV128Pmax for amd64. 2113 func (c *amd64Compiler) compileV128Pmax(o *wazeroir.UnionOperation) error { 2114 x2 := c.locationStack.popV128() 2115 if err := c.compileEnsureOnRegister(x2); err != nil { 2116 return err 2117 } 2118 2119 x1 := c.locationStack.popV128() 2120 if err := c.compileEnsureOnRegister(x1); err != nil { 2121 return err 2122 } 2123 2124 var min asm.Instruction 2125 if o.B1 == wazeroir.ShapeF32x4 { 2126 min = amd64.MAXPS 2127 } else { 2128 min = amd64.MAXPD 2129 } 2130 2131 x1r, v2r := x1.register, x2.register 2132 2133 c.assembler.CompileRegisterToRegister(min, x1r, v2r) 2134 2135 c.locationStack.markRegisterUnused(x1r) 2136 c.pushVectorRuntimeValueLocationOnRegister(v2r) 2137 return nil 2138 } 2139 2140 // compileV128Ceil implements compiler.compileV128Ceil for amd64. 2141 func (c *amd64Compiler) compileV128Ceil(o *wazeroir.UnionOperation) error { 2142 // See https://www.felixcloutier.com/x86/roundpd 2143 const roundModeCeil = 0x2 2144 return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeCeil) 2145 } 2146 2147 // compileV128Floor implements compiler.compileV128Floor for amd64. 2148 func (c *amd64Compiler) compileV128Floor(o *wazeroir.UnionOperation) error { 2149 // See https://www.felixcloutier.com/x86/roundpd 2150 const roundModeFloor = 0x1 2151 return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeFloor) 2152 } 2153 2154 // compileV128Trunc implements compiler.compileV128Trunc for amd64. 2155 func (c *amd64Compiler) compileV128Trunc(o *wazeroir.UnionOperation) error { 2156 // See https://www.felixcloutier.com/x86/roundpd 2157 const roundModeTrunc = 0x3 2158 return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeTrunc) 2159 } 2160 2161 // compileV128Nearest implements compiler.compileV128Nearest for amd64. 2162 func (c *amd64Compiler) compileV128Nearest(o *wazeroir.UnionOperation) error { 2163 // See https://www.felixcloutier.com/x86/roundpd 2164 const roundModeNearest = 0x0 2165 return c.compileV128RoundImpl(o.B1 == wazeroir.ShapeF32x4, roundModeNearest) 2166 } 2167 2168 // compileV128RoundImpl implements compileV128Nearest compileV128Trunc compileV128Floor and compileV128Ceil 2169 // with ROUNDPS (32-bit lane) and ROUNDPD (64-bit lane). 2170 func (c *amd64Compiler) compileV128RoundImpl(is32bit bool, mode byte) error { 2171 v := c.locationStack.popV128() 2172 if err := c.compileEnsureOnRegister(v); err != nil { 2173 return err 2174 } 2175 vr := v.register 2176 2177 var round asm.Instruction 2178 if is32bit { 2179 round = amd64.ROUNDPS 2180 } else { 2181 round = amd64.ROUNDPD 2182 } 2183 2184 c.assembler.CompileRegisterToRegisterWithArg(round, vr, vr, mode) 2185 c.pushVectorRuntimeValueLocationOnRegister(vr) 2186 return nil 2187 } 2188 2189 // compileV128Extend implements compiler.compileV128Extend for amd64. 2190 func (c *amd64Compiler) compileV128Extend(o *wazeroir.UnionOperation) error { 2191 v := c.locationStack.popV128() 2192 if err := c.compileEnsureOnRegister(v); err != nil { 2193 return err 2194 } 2195 vr := v.register 2196 2197 originShape := o.B1 2198 signed := o.B2 == 1 2199 useLow := o.B3 2200 if !useLow { 2201 // We have to shift the higher 64-bits into the lower ones before the actual extending instruction. 2202 // Shifting right by 0x8 * 8 = 64bits and concatenate itself. 2203 // See https://www.felixcloutier.com/x86/palignr 2204 c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, v.register, v.register, 0x8) 2205 } 2206 2207 var extend asm.Instruction 2208 switch originShape { 2209 case wazeroir.ShapeI8x16: 2210 if signed { 2211 extend = amd64.PMOVSXBW 2212 } else { 2213 extend = amd64.PMOVZXBW 2214 } 2215 case wazeroir.ShapeI16x8: 2216 if signed { 2217 extend = amd64.PMOVSXWD 2218 } else { 2219 extend = amd64.PMOVZXWD 2220 } 2221 case wazeroir.ShapeI32x4: 2222 if signed { 2223 extend = amd64.PMOVSXDQ 2224 } else { 2225 extend = amd64.PMOVZXDQ 2226 } 2227 } 2228 2229 c.assembler.CompileRegisterToRegister(extend, vr, vr) 2230 c.pushVectorRuntimeValueLocationOnRegister(vr) 2231 return nil 2232 } 2233 2234 // compileV128ExtMul implements compiler.compileV128ExtMul for amd64. 2235 func (c *amd64Compiler) compileV128ExtMul(o *wazeroir.UnionOperation) error { 2236 x2 := c.locationStack.popV128() 2237 if err := c.compileEnsureOnRegister(x2); err != nil { 2238 return err 2239 } 2240 2241 x1 := c.locationStack.popV128() 2242 if err := c.compileEnsureOnRegister(x1); err != nil { 2243 return err 2244 } 2245 2246 x1r, x2r := x1.register, x2.register 2247 2248 originShape := o.B1 2249 signed := o.B2 == 1 2250 useLow := o.B3 2251 switch originShape { 2252 case wazeroir.ShapeI8x16: 2253 if !useLow { 2254 // We have to shift the higher 64-bits into the lower ones before the actual extending instruction. 2255 // Shifting right by 0x8 * 8 = 64bits and concatenate itself. 2256 // See https://www.felixcloutier.com/x86/palignr 2257 c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x1r, x1r, 0x8) 2258 c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x2r, x2r, 0x8) 2259 } 2260 2261 var ext asm.Instruction 2262 if signed { 2263 ext = amd64.PMOVSXBW 2264 } else { 2265 ext = amd64.PMOVZXBW 2266 } 2267 2268 // Signed or Zero extend lower half packed bytes to packed words. 2269 c.assembler.CompileRegisterToRegister(ext, x1r, x1r) 2270 c.assembler.CompileRegisterToRegister(ext, x2r, x2r) 2271 2272 c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r) 2273 case wazeroir.ShapeI16x8: 2274 tmp, err := c.allocateRegister(registerTypeVector) 2275 if err != nil { 2276 return err 2277 } 2278 2279 // Copy the value on x1r to tmp. 2280 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp) 2281 2282 // Multiply the values and store the lower 16-bits into x1r. 2283 c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r) 2284 if signed { 2285 // Signed multiply the values and store the higher 16-bits into tmp. 2286 c.assembler.CompileRegisterToRegister(amd64.PMULHW, x2r, tmp) 2287 } else { 2288 // Unsigned multiply the values and store the higher 16-bits into tmp. 2289 c.assembler.CompileRegisterToRegister(amd64.PMULHUW, x2r, tmp) 2290 } 2291 2292 // Unpack lower or higher half of vectors (tmp and x1r) and concatenate them. 2293 if useLow { 2294 c.assembler.CompileRegisterToRegister(amd64.PUNPCKLWD, tmp, x1r) 2295 } else { 2296 c.assembler.CompileRegisterToRegister(amd64.PUNPCKHWD, tmp, x1r) 2297 } 2298 case wazeroir.ShapeI32x4: 2299 var shuffleOrder byte 2300 // Given that the original state of the register is as [v1, v2, v3, v4] where vN = a word, 2301 if useLow { 2302 // This makes the register as [v1, v1, v2, v2] 2303 shuffleOrder = 0b01010000 2304 } else { 2305 // This makes the register as [v3, v3, v4, v4] 2306 shuffleOrder = 0b11111010 2307 } 2308 // See https://www.felixcloutier.com/x86/pshufd 2309 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x1r, x1r, shuffleOrder) 2310 c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x2r, x2r, shuffleOrder) 2311 2312 var mul asm.Instruction 2313 if signed { 2314 mul = amd64.PMULDQ 2315 } else { 2316 mul = amd64.PMULUDQ 2317 } 2318 c.assembler.CompileRegisterToRegister(mul, x2r, x1r) 2319 } 2320 2321 c.locationStack.markRegisterUnused(x2r) 2322 c.pushVectorRuntimeValueLocationOnRegister(x1r) 2323 return nil 2324 } 2325 2326 var q15mulrSatSMask = [16]byte{ 2327 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 2328 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 2329 } 2330 2331 // compileV128Q15mulrSatS implements compiler.compileV128Q15mulrSatS for amd64. 2332 func (c *amd64Compiler) compileV128Q15mulrSatS(*wazeroir.UnionOperation) error { 2333 x2 := c.locationStack.popV128() 2334 if err := c.compileEnsureOnRegister(x2); err != nil { 2335 return err 2336 } 2337 2338 x1 := c.locationStack.popV128() 2339 if err := c.compileEnsureOnRegister(x1); err != nil { 2340 return err 2341 } 2342 2343 tmp, err := c.allocateRegister(registerTypeVector) 2344 if err != nil { 2345 return err 2346 } 2347 2348 x1r, x2r := x1.register, x2.register 2349 2350 // See https://github.com/WebAssembly/simd/pull/365 for the following logic. 2351 if err := c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(q15mulrSatSMask[:]), tmp); err != nil { 2352 return err 2353 } 2354 2355 c.assembler.CompileRegisterToRegister(amd64.PMULHRSW, x2r, x1r) 2356 c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x1r, tmp) 2357 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, x1r) 2358 2359 c.locationStack.markRegisterUnused(x2r) 2360 c.pushVectorRuntimeValueLocationOnRegister(x1r) 2361 return nil 2362 } 2363 2364 var ( 2365 allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1} 2366 allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0} 2367 2368 extAddPairwiseI16x8uMask = [16 * 2]byte{ 2369 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 2370 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 2371 } 2372 ) 2373 2374 // compileV128ExtAddPairwise implements compiler.compileV128ExtAddPairwise for amd64. 2375 func (c *amd64Compiler) compileV128ExtAddPairwise(o *wazeroir.UnionOperation) error { 2376 v := c.locationStack.popV128() 2377 if err := c.compileEnsureOnRegister(v); err != nil { 2378 return err 2379 } 2380 vr := v.register 2381 2382 originShape := o.B1 2383 signed := o.B3 2384 switch originShape { 2385 case wazeroir.ShapeI8x16: 2386 allOnesReg, err := c.allocateRegister(registerTypeVector) 2387 if err != nil { 2388 return err 2389 } 2390 2391 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2392 asm.NewStaticConst(allOnesI8x16[:]), allOnesReg); err != nil { 2393 return err 2394 } 2395 2396 var result asm.Register 2397 // See https://www.felixcloutier.com/x86/pmaddubsw for detail. 2398 if signed { 2399 // Interpret vr's value as signed byte and multiply with one and add pairwise, which results in pairwise 2400 // signed extadd. 2401 c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, vr, allOnesReg) 2402 result = allOnesReg 2403 } else { 2404 // Interpreter tmp (all ones) as signed byte meaning that all the multiply-add is unsigned. 2405 c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, allOnesReg, vr) 2406 result = vr 2407 } 2408 2409 if result != vr { 2410 c.locationStack.markRegisterUnused(vr) 2411 } 2412 c.pushVectorRuntimeValueLocationOnRegister(result) 2413 case wazeroir.ShapeI16x8: 2414 tmp, err := c.allocateRegister(registerTypeVector) 2415 if err != nil { 2416 return err 2417 } 2418 2419 if signed { 2420 // See https://www.felixcloutier.com/x86/pmaddwd 2421 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2422 asm.NewStaticConst(allOnesI16x8[:]), tmp); err != nil { 2423 return err 2424 } 2425 2426 c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr) 2427 c.pushVectorRuntimeValueLocationOnRegister(vr) 2428 } else { 2429 2430 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2431 asm.NewStaticConst(extAddPairwiseI16x8uMask[:16]), tmp); err != nil { 2432 return err 2433 } 2434 2435 // Flip the sign bits on vr. 2436 // 2437 // Assuming that vr = [w1, ..., w8], now we have, 2438 // vr[i] = int8(-w1) for i = 0...8 2439 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr) 2440 2441 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2442 asm.NewStaticConst(allOnesI16x8[:]), tmp); err != nil { 2443 return err 2444 } 2445 2446 // For i = 0,..4 (as this results in i32x4 lanes), now we have 2447 // vr[i] = int32(-wn + -w(n+1)) = int32(-(wn + w(n+1))) 2448 c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr) 2449 2450 // tmp[i] = [0, 0, 1, 0] = int32(math.MaxInt16+1) 2451 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2452 asm.NewStaticConst(extAddPairwiseI16x8uMask[16:]), tmp); err != nil { 2453 return err 2454 } 2455 2456 // vr[i] = int32(-(wn + w(n+1))) + int32(math.MaxInt16+1) = int32((wn + w(n+1))) = uint32(wn + w(n+1)). 2457 c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr) 2458 c.pushVectorRuntimeValueLocationOnRegister(vr) 2459 } 2460 } 2461 return nil 2462 } 2463 2464 // compileV128FloatPromote implements compiler.compileV128FloatPromote for amd64. 2465 func (c *amd64Compiler) compileV128FloatPromote(*wazeroir.UnionOperation) error { 2466 v := c.locationStack.popV128() 2467 if err := c.compileEnsureOnRegister(v); err != nil { 2468 return err 2469 } 2470 vr := v.register 2471 2472 c.assembler.CompileRegisterToRegister(amd64.CVTPS2PD, vr, vr) 2473 c.pushVectorRuntimeValueLocationOnRegister(vr) 2474 return nil 2475 } 2476 2477 // compileV128FloatDemote implements compiler.compileV128FloatDemote for amd64. 2478 func (c *amd64Compiler) compileV128FloatDemote(*wazeroir.UnionOperation) error { 2479 v := c.locationStack.popV128() 2480 if err := c.compileEnsureOnRegister(v); err != nil { 2481 return err 2482 } 2483 vr := v.register 2484 2485 c.assembler.CompileRegisterToRegister(amd64.CVTPD2PS, vr, vr) 2486 c.pushVectorRuntimeValueLocationOnRegister(vr) 2487 return nil 2488 } 2489 2490 // compileV128Dot implements compiler.compileV128Dot for amd64. 2491 func (c *amd64Compiler) compileV128Dot(*wazeroir.UnionOperation) error { 2492 x2 := c.locationStack.popV128() 2493 if err := c.compileEnsureOnRegister(x2); err != nil { 2494 return err 2495 } 2496 2497 x1 := c.locationStack.popV128() 2498 if err := c.compileEnsureOnRegister(x1); err != nil { 2499 return err 2500 } 2501 2502 c.assembler.CompileRegisterToRegister(amd64.PMADDWD, x2.register, x1.register) 2503 2504 c.locationStack.markRegisterUnused(x2.register) 2505 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 2506 return nil 2507 } 2508 2509 var fConvertFromIMask = [16]byte{ 2510 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 2511 } 2512 2513 // compileV128FConvertFromI implements compiler.compileV128FConvertFromI for amd64. 2514 func (c *amd64Compiler) compileV128FConvertFromI(o *wazeroir.UnionOperation) error { 2515 v := c.locationStack.popV128() 2516 if err := c.compileEnsureOnRegister(v); err != nil { 2517 return err 2518 } 2519 vr := v.register 2520 2521 destinationShape := o.B1 2522 signed := o.B3 2523 2524 switch destinationShape { 2525 case wazeroir.ShapeF32x4: 2526 if signed { 2527 c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr) 2528 } else { 2529 tmp, err := c.allocateRegister(registerTypeVector) 2530 if err != nil { 2531 return err 2532 } 2533 2534 // Copy the value into tmp. 2535 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp) 2536 2537 // Clear the higher 16-bits of tmp. 2538 c.assembler.CompileConstToRegister(amd64.PSLLD, 0xa, tmp) 2539 c.assembler.CompileConstToRegister(amd64.PSRLD, 0xa, tmp) 2540 2541 // Subtract the higher 16-bits from vr == clear the lower 16-bits of vr. 2542 c.assembler.CompileRegisterToRegister(amd64.PSUBD, tmp, vr) 2543 2544 // Convert the lower 16-bits in tmp. 2545 c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp) 2546 2547 // Left shift by one and convert vr, meaning that halved conversion result of higher 16-bits in vr. 2548 c.assembler.CompileConstToRegister(amd64.PSRLD, 1, vr) 2549 c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr) 2550 2551 // Double the converted halved higher 16bits. 2552 c.assembler.CompileRegisterToRegister(amd64.ADDPS, vr, vr) 2553 2554 // Get the conversion result by add tmp (holding lower 16-bit conversion) into vr. 2555 c.assembler.CompileRegisterToRegister(amd64.ADDPS, tmp, vr) 2556 } 2557 case wazeroir.ShapeF64x2: 2558 if signed { 2559 c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PD, vr, vr) 2560 } else { 2561 tmp, err := c.allocateRegister(registerTypeVector) 2562 if err != nil { 2563 return err 2564 } 2565 2566 // tmp = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00] 2567 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, asm.NewStaticConst(fConvertFromIMask[:16]), tmp); err != nil { 2568 return err 2569 } 2570 2571 // Given that we have vr = [d1, d2, d3, d4], this results in 2572 // vr = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]] 2573 // = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52] 2574 // ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double 2575 c.assembler.CompileRegisterToRegister(amd64.UNPCKLPS, tmp, vr) 2576 2577 // tmp = [float64(0x1.0p52), float64(0x1.0p52)] 2578 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVDQU, 2579 asm.NewStaticConst(twop52[:]), tmp); err != nil { 2580 return err 2581 } 2582 2583 // Now, we get the result as 2584 // vr = [float64(uint32(d1)), float64(uint32(d2))] 2585 // because the following equality always satisfies: 2586 // float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52 + float64(uint32(y))) = float64(uint32(x)) - float64(uint32(y)) 2587 c.assembler.CompileRegisterToRegister(amd64.SUBPD, tmp, vr) 2588 } 2589 } 2590 2591 c.pushVectorRuntimeValueLocationOnRegister(vr) 2592 return nil 2593 } 2594 2595 // compileV128Narrow implements compiler.compileV128Narrow for amd64. 2596 func (c *amd64Compiler) compileV128Narrow(o *wazeroir.UnionOperation) error { 2597 x2 := c.locationStack.popV128() 2598 if err := c.compileEnsureOnRegister(x2); err != nil { 2599 return err 2600 } 2601 2602 x1 := c.locationStack.popV128() 2603 if err := c.compileEnsureOnRegister(x1); err != nil { 2604 return err 2605 } 2606 2607 var narrow asm.Instruction 2608 originShape := o.B1 2609 signed := o.B3 2610 switch originShape { 2611 case wazeroir.ShapeI16x8: 2612 if signed { 2613 narrow = amd64.PACKSSWB 2614 } else { 2615 narrow = amd64.PACKUSWB 2616 } 2617 case wazeroir.ShapeI32x4: 2618 if signed { 2619 narrow = amd64.PACKSSDW 2620 } else { 2621 narrow = amd64.PACKUSDW 2622 } 2623 } 2624 c.assembler.CompileRegisterToRegister(narrow, x2.register, x1.register) 2625 2626 c.locationStack.markRegisterUnused(x2.register) 2627 c.pushVectorRuntimeValueLocationOnRegister(x1.register) 2628 return nil 2629 } 2630 2631 var ( 2632 // i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes. 2633 i32sMaxOnF64x2 = [16]byte{ 2634 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0) 2635 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0) 2636 } 2637 2638 // i32sMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes. 2639 i32uMaxOnF64x2 = [16]byte{ 2640 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0) 2641 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0) 2642 } 2643 2644 // twop52 holds two float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that 2645 // with this exponent, the mantissa represents a corresponding uint32 number, and arithmetics, 2646 // like addition or subtraction, the resulted floating point holds exactly the same 2647 // bit representations in 32-bit integer on its mantissa. 2648 // 2649 // Note: the name twop52 is common across various compiler ecosystem. 2650 // E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28 2651 // E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html 2652 twop52 = [16]byte{ 2653 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52) 2654 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52) 2655 } 2656 ) 2657 2658 // compileV128ITruncSatFromF implements compiler.compileV128ITruncSatFromF for amd64. 2659 func (c *amd64Compiler) compileV128ITruncSatFromF(o *wazeroir.UnionOperation) error { 2660 v := c.locationStack.popV128() 2661 if err := c.compileEnsureOnRegister(v); err != nil { 2662 return err 2663 } 2664 vr := v.register 2665 2666 tmp, err := c.allocateRegister(registerTypeVector) 2667 if err != nil { 2668 return err 2669 } 2670 2671 c.locationStack.markRegisterUsed(tmp) 2672 2673 originShape := o.B1 2674 signed := o.B3 2675 switch originShape { 2676 case wazeroir.ShapeF32x4: 2677 if signed { 2678 // Copy the value into tmp. 2679 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp) 2680 2681 // Assuming we have vr = [v1, v2, v3, v4]. 2682 // 2683 // Set all bits if lane is not NaN on tmp. 2684 // tmp[i] = 0xffffffff if vi != NaN 2685 // = 0 if vi == NaN 2686 c.assembler.CompileRegisterToRegister(amd64.CMPEQPS, tmp, tmp) 2687 2688 // Clear NaN lanes on vr, meaning that 2689 // vr[i] = vi if vi != NaN 2690 // 0 if vi == NaN 2691 c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, vr) 2692 2693 // tmp[i] = ^vi if vi != NaN 2694 // = 0xffffffff if vi == NaN 2695 // which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative. 2696 c.assembler.CompileRegisterToRegister(amd64.PXOR, vr, tmp) 2697 2698 // vr[i] = int32(vi) if vi != NaN and vr is not overflowing. 2699 // = 0x80000000 if vi != NaN and vr is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq) 2700 // = 0 if vi == NaN 2701 c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr) 2702 2703 // Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lane. 2704 // 2705 // tmp[i] = 0x80000000 if vi is positive 2706 // = any satisfying any&0x80000000 = 0 if vi is negative or zero. 2707 c.assembler.CompileRegisterToRegister(amd64.PAND, vr, tmp) 2708 2709 // Arithmetic right shifting tmp by 31, meaning that we have 2710 // tmp[i] = 0xffffffff if vi is positive, 0 otherwise. 2711 c.assembler.CompileConstToRegister(amd64.PSRAD, 0x1f, tmp) 2712 2713 // Flipping 0x80000000 if vi is positive, otherwise keep intact. 2714 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr) 2715 } else { 2716 tmp2, err := c.allocateRegister(registerTypeVector) 2717 if err != nil { 2718 return err 2719 } 2720 2721 // See https://github.com/bytecodealliance/wasmtime/pull/2440 2722 // Note: even v8 doesn't seem to have support for this i32x4.tranc_sat_f32x4_u. 2723 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp) 2724 c.assembler.CompileRegisterToRegister(amd64.MAXPS, tmp, vr) 2725 c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp) 2726 c.assembler.CompileConstToRegister(amd64.PSRLD, 0x1, tmp) 2727 c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp) 2728 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2) 2729 c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr) 2730 c.assembler.CompileRegisterToRegister(amd64.SUBPS, tmp, tmp2) 2731 c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, tmp2, tmp, 0x2) // == CMPLEPS 2732 c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, tmp2, tmp2) 2733 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp2) 2734 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp) 2735 c.assembler.CompileRegisterToRegister(amd64.PMAXSD, tmp, tmp2) 2736 c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp2, vr) 2737 } 2738 case wazeroir.ShapeF64x2: 2739 tmp2, err := c.allocateRegister(registerTypeVector) 2740 if err != nil { 2741 return err 2742 } 2743 2744 if signed { 2745 // Copy the value into tmp. 2746 c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp) 2747 2748 // Set all bits for non-NaN lanes, zeros otherwise. 2749 // I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise. 2750 c.assembler.CompileRegisterToRegister(amd64.CMPEQPD, tmp, tmp) 2751 2752 // Load the 2147483647 into tmp2's each lane. 2753 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(i32sMaxOnF64x2[:]), tmp2); err != nil { 2754 return err 2755 } 2756 2757 // tmp[i] = 2147483647 if vi != NaN, 0 otherwise. 2758 c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp2, tmp) 2759 2760 // MINPD returns the source register's value as-is, so we have 2761 // vr[i] = vi if vi != NaN 2762 // = 0 if vi == NaN 2763 c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp, vr) 2764 2765 c.assembler.CompileRegisterToRegister(amd64.CVTTPD2DQ, vr, vr) 2766 } else { 2767 // Clears all bits on tmp. 2768 c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp) 2769 2770 // vr[i] = vi if vi != NaN && vi > 0 2771 // = 0 if vi == NaN || vi <= 0 2772 c.assembler.CompileRegisterToRegister(amd64.MAXPD, tmp, vr) 2773 2774 // tmp2[i] = float64(math.MaxUint32) = math.MaxUint32 2775 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(i32uMaxOnF64x2[:]), tmp2); err != nil { 2776 return err 2777 } 2778 2779 // vr[i] = vi if vi != NaN && vi > 0 && vi <= math.MaxUint32 2780 // = 0 otherwise 2781 c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp2, vr) 2782 2783 // Round the floating points into integer. 2784 c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDPD, vr, vr, 0x3) 2785 2786 // tmp2[i] = float64(0x1.0p52) 2787 if err = c.assembler.CompileStaticConstToRegister(amd64.MOVUPD, asm.NewStaticConst(twop52[:]), tmp2); err != nil { 2788 return err 2789 } 2790 2791 // vr[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32 2792 // = 0 otherwise 2793 // 2794 // This means that vr[i] holds exactly the same bit of uint32(vi) in its lower 32-bits. 2795 c.assembler.CompileRegisterToRegister(amd64.ADDPD, tmp2, vr) 2796 2797 // At this point, we have 2798 // vr = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)] 2799 // tmp = [0, 0, 0, 0] 2800 // as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in 2801 // vr = [vr[00], vr[10], tmp[00], tmp[00]] = [vr[00], vr[10], 0, 0] 2802 // meaning that for i = 0 and 1, we have 2803 // vr[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32 2804 // = 0 otherwise. 2805 c.assembler.CompileRegisterToRegisterWithArg(amd64.SHUFPS, tmp, vr, 0b00_00_10_00) 2806 } 2807 } 2808 2809 c.locationStack.markRegisterUnused(tmp) 2810 c.pushVectorRuntimeValueLocationOnRegister(vr) 2811 return nil 2812 }