github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/amd64/machine.go (about) 1 package amd64 2 3 import ( 4 "context" 5 "encoding/binary" 6 "fmt" 7 "math" 8 "strings" 9 10 "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" 11 "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" 12 "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" 13 "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" 14 "github.com/tetratelabs/wazero/internal/platform" 15 ) 16 17 // NewBackend returns a new backend for amd64. 18 func NewBackend() backend.Machine { 19 ectx := backend.NewExecutableContextT[instruction]( 20 resetInstruction, 21 setNext, 22 setPrev, 23 asNop, 24 ) 25 return &machine{ 26 ectx: ectx, 27 cpuFeatures: platform.CpuFeatures, 28 regAlloc: regalloc.NewAllocator(regInfo), 29 spillSlots: map[regalloc.VRegID]int64{}, 30 amodePool: wazevoapi.NewPool[amode](nil), 31 constSwizzleMaskConstIndex: -1, 32 constSqmulRoundSatIndex: -1, 33 constI8x16SHLMaskTableIndex: -1, 34 constI8x16LogicalSHRMaskTableIndex: -1, 35 constF64x2CvtFromIMaskIndex: -1, 36 constTwop52Index: -1, 37 constI32sMaxOnF64x2Index: -1, 38 constI32uMaxOnF64x2Index: -1, 39 constAllOnesI8x16Index: -1, 40 constAllOnesI16x8Index: -1, 41 constExtAddPairwiseI16x8uMask1Index: -1, 42 constExtAddPairwiseI16x8uMask2Index: -1, 43 } 44 } 45 46 type ( 47 // machine implements backend.Machine for amd64. 48 machine struct { 49 c backend.Compiler 50 ectx *backend.ExecutableContextT[instruction] 51 stackBoundsCheckDisabled bool 52 53 amodePool wazevoapi.Pool[amode] 54 55 cpuFeatures platform.CpuFeatureFlags 56 57 regAlloc regalloc.Allocator 58 regAllocFn *backend.RegAllocFunction[*instruction, *machine] 59 regAllocStarted bool 60 61 spillSlotSize int64 62 spillSlots map[regalloc.VRegID]int64 63 currentABI *backend.FunctionABI 64 clobberedRegs []regalloc.VReg 65 66 maxRequiredStackSizeForCalls int64 67 68 labelResolutionPends []labelResolutionPend 69 70 jmpTableTargets [][]uint32 71 consts []_const 72 73 constSwizzleMaskConstIndex, constSqmulRoundSatIndex, 74 constI8x16SHLMaskTableIndex, constI8x16LogicalSHRMaskTableIndex, 75 constF64x2CvtFromIMaskIndex, constTwop52Index, 76 constI32sMaxOnF64x2Index, constI32uMaxOnF64x2Index, 77 constAllOnesI8x16Index, constAllOnesI16x8Index, 78 constExtAddPairwiseI16x8uMask1Index, constExtAddPairwiseI16x8uMask2Index int 79 } 80 81 _const struct { 82 lo, hi uint64 83 _var []byte 84 label *labelPosition 85 } 86 87 labelResolutionPend struct { 88 instr *instruction 89 instrOffset int64 90 // imm32Offset is the offset of the last 4 bytes of the instruction. 91 imm32Offset int64 92 } 93 94 labelPosition = backend.LabelPosition[instruction] 95 ) 96 97 func (m *machine) getOrAllocateConstLabel(i *int, _var []byte) backend.Label { 98 index := *i 99 if index == -1 { 100 label := m.allocateLabel() 101 index = len(m.consts) 102 m.consts = append(m.consts, _const{ 103 _var: _var, 104 label: label, 105 }) 106 *i = index 107 } 108 return m.consts[index].label.L 109 } 110 111 // Reset implements backend.Machine.
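// Reset clears all the per-function state (pending instructions, consts, spill slots, register allocation state, jump table targets, and the cached constant label indices) so that the machine can be reused to compile the next function.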
112 func (m *machine) Reset() { 113 m.consts = m.consts[:0] 114 m.clobberedRegs = m.clobberedRegs[:0] 115 for key := range m.spillSlots { 116 m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key)) 117 } 118 for _, key := range m.clobberedRegs { 119 delete(m.spillSlots, regalloc.VRegID(key)) 120 } 121 122 m.stackBoundsCheckDisabled = false 123 m.ectx.Reset() 124 125 m.regAllocFn.Reset() 126 m.regAlloc.Reset() 127 m.regAllocStarted = false 128 m.clobberedRegs = m.clobberedRegs[:0] 129 130 m.spillSlotSize = 0 131 m.maxRequiredStackSizeForCalls = 0 132 133 m.amodePool.Reset() 134 m.jmpTableTargets = m.jmpTableTargets[:0] 135 m.constSwizzleMaskConstIndex = -1 136 m.constSqmulRoundSatIndex = -1 137 m.constI8x16SHLMaskTableIndex = -1 138 m.constI8x16LogicalSHRMaskTableIndex = -1 139 m.constF64x2CvtFromIMaskIndex = -1 140 m.constTwop52Index = -1 141 m.constI32sMaxOnF64x2Index = -1 142 m.constI32uMaxOnF64x2Index = -1 143 m.constAllOnesI8x16Index = -1 144 m.constAllOnesI16x8Index = -1 145 m.constExtAddPairwiseI16x8uMask1Index = -1 146 m.constExtAddPairwiseI16x8uMask2Index = -1 147 } 148 149 // ExecutableContext implements backend.Machine. 150 func (m *machine) ExecutableContext() backend.ExecutableContext { return m.ectx } 151 152 // DisableStackCheck implements backend.Machine. 153 func (m *machine) DisableStackCheck() { m.stackBoundsCheckDisabled = true } 154 155 // SetCompiler implements backend.Machine. 156 func (m *machine) SetCompiler(c backend.Compiler) { 157 m.c = c 158 m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, c.SSABuilder(), c) 159 } 160 161 // SetCurrentABI implements backend.Machine. 162 func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { 163 m.currentABI = abi 164 } 165 166 // RegAlloc implements backend.Machine. 167 func (m *machine) RegAlloc() { 168 rf := m.regAllocFn 169 for _, pos := range m.ectx.OrderedBlockLabels { 170 rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End) 171 } 172 173 m.regAllocStarted = true 174 m.regAlloc.DoAllocation(rf) 175 // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes. 176 m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 177 } 178 179 // InsertReturn implements backend.Machine. 180 func (m *machine) InsertReturn() { 181 i := m.allocateInstr().asRet() 182 m.insert(i) 183 } 184 185 // LowerSingleBranch implements backend.Machine. 186 func (m *machine) LowerSingleBranch(b *ssa.Instruction) { 187 ectx := m.ectx 188 switch b.Opcode() { 189 case ssa.OpcodeJump: 190 _, _, targetBlk := b.BranchData() 191 if b.IsFallthroughJump() { 192 return 193 } 194 jmp := m.allocateInstr() 195 target := ectx.GetOrAllocateSSABlockLabel(targetBlk) 196 if target == backend.LabelReturn { 197 jmp.asRet() 198 } else { 199 jmp.asJmp(newOperandLabel(target)) 200 } 201 m.insert(jmp) 202 case ssa.OpcodeBrTable: 203 index, target := b.BrTableData() 204 m.lowerBrTable(index, target) 205 default: 206 panic("BUG: unexpected branch opcode" + b.Opcode().String()) 207 } 208 } 209 210 func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) { 211 // TODO: reuse the slice! 
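// addJmpTableTarget records the labels of the given successor blocks in m.jmpTableTargets and returns the index of the new entry, which lowerBrTable below embeds into the jmpTableSequence instruction.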
212 labels := make([]uint32, len(targets)) 213 for j, target := range targets { 214 labels[j] = uint32(m.ectx.GetOrAllocateSSABlockLabel(target)) 215 } 216 index = len(m.jmpTableTargets) 217 m.jmpTableTargets = append(m.jmpTableTargets, labels) 218 return 219 } 220 221 var condBranchMatches = [...]ssa.Opcode{ssa.OpcodeIcmp, ssa.OpcodeFcmp} 222 223 func (m *machine) lowerBrTable(index ssa.Value, targets []ssa.BasicBlock) { 224 _v := m.getOperand_Reg(m.c.ValueDefinition(index)) 225 v := m.copyToTmp(_v.reg()) 226 227 // First, we need to do the bounds check. 228 maxIndex := m.c.AllocateVReg(ssa.TypeI32) 229 m.lowerIconst(maxIndex, uint64(len(targets)-1), false) 230 cmp := m.allocateInstr().asCmpRmiR(true, newOperandReg(maxIndex), v, false) 231 m.insert(cmp) 232 233 // Then conditionally move maxIndex into v if v > maxIndex. 234 cmov := m.allocateInstr().asCmove(condNB, newOperandReg(maxIndex), v, false) 235 m.insert(cmov) 236 237 // Now that v has the correct index, load the address of the jump table into addr. 238 addr := m.c.AllocateVReg(ssa.TypeI64) 239 leaJmpTableAddr := m.allocateInstr() 240 m.insert(leaJmpTableAddr) 241 242 // Then add the target's offset into jmpTableAddr. 243 loadTargetOffsetFromJmpTable := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, 244 // Shift by 3 because each entry is 8 bytes. 245 newOperandMem(m.newAmodeRegRegShift(0, addr, v, 3)), addr, true) 246 m.insert(loadTargetOffsetFromJmpTable) 247 248 // Now ready to jump. 249 jmp := m.allocateInstr().asJmp(newOperandReg(addr)) 250 m.insert(jmp) 251 252 jmpTableBegin, jmpTableBeginLabel := m.allocateBrTarget() 253 m.insert(jmpTableBegin) 254 leaJmpTableAddr.asLEA(newOperandLabel(jmpTableBeginLabel), addr) 255 256 jmpTable := m.allocateInstr() 257 targetSliceIndex := m.addJmpTableTarget(targets) 258 jmpTable.asJmpTableSequence(targetSliceIndex, len(targets)) 259 m.insert(jmpTable) 260 } 261 262 // LowerConditionalBranch implements backend.Machine. 263 func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { 264 exctx := m.ectx 265 cval, args, targetBlk := b.BranchData() 266 if len(args) > 0 { 267 panic(fmt.Sprintf( 268 "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", 269 exctx.CurrentSSABlk, 270 targetBlk, 271 )) 272 } 273 274 target := exctx.GetOrAllocateSSABlockLabel(targetBlk) 275 cvalDef := m.c.ValueDefinition(cval) 276 277 switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { 278 case ssa.OpcodeIcmp: 279 cvalInstr := cvalDef.Instr 280 x, y, c := cvalInstr.IcmpData() 281 282 cc := condFromSSAIntCmpCond(c) 283 if b.Opcode() == ssa.OpcodeBrz { 284 cc = cc.invert() 285 } 286 287 // First, perform the comparison and set the flag. 288 xd, yd := m.c.ValueDefinition(x), m.c.ValueDefinition(y) 289 if !m.tryLowerBandToFlag(xd, yd) { 290 m.lowerIcmpToFlag(xd, yd, x.Type() == ssa.TypeI64) 291 } 292 293 // Then perform the conditional branch.
294 m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) 295 cvalDef.Instr.MarkLowered() 296 case ssa.OpcodeFcmp: 297 cvalInstr := cvalDef.Instr 298 299 f1, f2, and := m.lowerFcmpToFlags(cvalInstr) 300 isBrz := b.Opcode() == ssa.OpcodeBrz 301 if isBrz { 302 f1 = f1.invert() 303 } 304 if f2 == condInvalid { 305 m.insert(m.allocateInstr().asJmpIf(f1, newOperandLabel(target))) 306 } else { 307 if isBrz { 308 f2 = f2.invert() 309 and = !and 310 } 311 jmp1, jmp2 := m.allocateInstr(), m.allocateInstr() 312 m.insert(jmp1) 313 m.insert(jmp2) 314 notTaken, notTakenLabel := m.allocateBrTarget() 315 m.insert(notTaken) 316 if and { 317 jmp1.asJmpIf(f1.invert(), newOperandLabel(notTakenLabel)) 318 jmp2.asJmpIf(f2, newOperandLabel(target)) 319 } else { 320 jmp1.asJmpIf(f1, newOperandLabel(target)) 321 jmp2.asJmpIf(f2, newOperandLabel(target)) 322 } 323 } 324 325 cvalDef.Instr.MarkLowered() 326 default: 327 v := m.getOperand_Reg(cvalDef) 328 329 var cc cond 330 if b.Opcode() == ssa.OpcodeBrz { 331 cc = condZ 332 } else { 333 cc = condNZ 334 } 335 336 // Perform test %v, %v to set the flag. 337 cmp := m.allocateInstr().asCmpRmiR(false, v, v.reg(), false) 338 m.insert(cmp) 339 m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) 340 } 341 } 342 343 // LowerInstr implements backend.Machine. 344 func (m *machine) LowerInstr(instr *ssa.Instruction) { 345 if l := instr.SourceOffset(); l.Valid() { 346 info := m.allocateInstr().asEmitSourceOffsetInfo(l) 347 m.insert(info) 348 } 349 350 switch op := instr.Opcode(); op { 351 case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: 352 panic("BUG: branching instructions are handled by LowerBranches") 353 case ssa.OpcodeReturn: 354 panic("BUG: return must be handled by backend.Compiler") 355 case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. 
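// (No code is emitted for constant definitions here; constants are materialized at each use site instead, see lowerIconst and the constant handling in callerGenVRegToFunctionArg.)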
356 case ssa.OpcodeCall, ssa.OpcodeCallIndirect: 357 m.lowerCall(instr) 358 case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: 359 m.lowerStore(instr) 360 case ssa.OpcodeIadd: 361 m.lowerAluRmiROp(instr, aluRmiROpcodeAdd) 362 case ssa.OpcodeIsub: 363 m.lowerAluRmiROp(instr, aluRmiROpcodeSub) 364 case ssa.OpcodeImul: 365 m.lowerAluRmiROp(instr, aluRmiROpcodeMul) 366 case ssa.OpcodeSdiv, ssa.OpcodeUdiv, ssa.OpcodeSrem, ssa.OpcodeUrem: 367 isDiv := op == ssa.OpcodeSdiv || op == ssa.OpcodeUdiv 368 isSigned := op == ssa.OpcodeSdiv || op == ssa.OpcodeSrem 369 m.lowerIDivRem(instr, isDiv, isSigned) 370 case ssa.OpcodeBand: 371 m.lowerAluRmiROp(instr, aluRmiROpcodeAnd) 372 case ssa.OpcodeBor: 373 m.lowerAluRmiROp(instr, aluRmiROpcodeOr) 374 case ssa.OpcodeBxor: 375 m.lowerAluRmiROp(instr, aluRmiROpcodeXor) 376 case ssa.OpcodeIshl: 377 m.lowerShiftR(instr, shiftROpShiftLeft) 378 case ssa.OpcodeSshr: 379 m.lowerShiftR(instr, shiftROpShiftRightArithmetic) 380 case ssa.OpcodeUshr: 381 m.lowerShiftR(instr, shiftROpShiftRightLogical) 382 case ssa.OpcodeRotl: 383 m.lowerShiftR(instr, shiftROpRotateLeft) 384 case ssa.OpcodeRotr: 385 m.lowerShiftR(instr, shiftROpRotateRight) 386 case ssa.OpcodeClz: 387 m.lowerClz(instr) 388 case ssa.OpcodeCtz: 389 m.lowerCtz(instr) 390 case ssa.OpcodePopcnt: 391 m.lowerUnaryRmR(instr, unaryRmROpcodePopcnt) 392 case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv: 393 m.lowerXmmRmR(instr) 394 case ssa.OpcodeFabs: 395 m.lowerFabsFneg(instr) 396 case ssa.OpcodeFneg: 397 m.lowerFabsFneg(instr) 398 case ssa.OpcodeCeil: 399 m.lowerRound(instr, roundingModeUp) 400 case ssa.OpcodeFloor: 401 m.lowerRound(instr, roundingModeDown) 402 case ssa.OpcodeTrunc: 403 m.lowerRound(instr, roundingModeZero) 404 case ssa.OpcodeNearest: 405 m.lowerRound(instr, roundingModeNearest) 406 case ssa.OpcodeFmin, ssa.OpcodeFmax: 407 m.lowerFminFmax(instr) 408 case ssa.OpcodeFcopysign: 409 m.lowerFcopysign(instr) 410 case ssa.OpcodeBitcast: 411 m.lowerBitcast(instr) 412 case ssa.OpcodeSqrt: 413 m.lowerSqrt(instr) 414 case ssa.OpcodeFpromote: 415 v := instr.Arg() 416 rn := m.getOperand_Reg(m.c.ValueDefinition(v)) 417 rd := m.c.VRegOf(instr.Return()) 418 cnt := m.allocateInstr() 419 cnt.asXmmUnaryRmR(sseOpcodeCvtss2sd, rn, rd) 420 m.insert(cnt) 421 case ssa.OpcodeFdemote: 422 v := instr.Arg() 423 rn := m.getOperand_Reg(m.c.ValueDefinition(v)) 424 rd := m.c.VRegOf(instr.Return()) 425 cnt := m.allocateInstr() 426 cnt.asXmmUnaryRmR(sseOpcodeCvtsd2ss, rn, rd) 427 m.insert(cnt) 428 case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: 429 x, ctx := instr.Arg2() 430 rn := m.getOperand_Reg(m.c.ValueDefinition(x)) 431 rd := m.c.VRegOf(instr.Return()) 432 ctxVReg := m.c.VRegOf(ctx) 433 m.lowerFcvtToSint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, 434 instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) 435 case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: 436 x, ctx := instr.Arg2() 437 rn := m.getOperand_Reg(m.c.ValueDefinition(x)) 438 rd := m.c.VRegOf(instr.Return()) 439 ctxVReg := m.c.VRegOf(ctx) 440 m.lowerFcvtToUint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, 441 instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) 442 case ssa.OpcodeFcvtFromSint: 443 x := instr.Arg() 444 rn := m.getOperand_Reg(m.c.ValueDefinition(x)) 445 rd := newOperandReg(m.c.VRegOf(instr.Return())) 446 m.lowerFcvtFromSint(rn, rd, 447 x.Type() == ssa.TypeI64, instr.Return().Type().Bits() == 64) 448 case ssa.OpcodeFcvtFromUint: 449 x := instr.Arg() 450 rn := 
m.getOperand_Reg(m.c.ValueDefinition(x)) 451 rd := newOperandReg(m.c.VRegOf(instr.Return())) 452 m.lowerFcvtFromUint(rn, rd, x.Type() == ssa.TypeI64, 453 instr.Return().Type().Bits() == 64) 454 case ssa.OpcodeVanyTrue: 455 m.lowerVanyTrue(instr) 456 case ssa.OpcodeVallTrue: 457 m.lowerVallTrue(instr) 458 case ssa.OpcodeVhighBits: 459 m.lowerVhighBits(instr) 460 case ssa.OpcodeVbnot: 461 m.lowerVbnot(instr) 462 case ssa.OpcodeVband: 463 x, y := instr.Arg2() 464 m.lowerVbBinOp(sseOpcodePand, x, y, instr.Return()) 465 case ssa.OpcodeVbor: 466 x, y := instr.Arg2() 467 m.lowerVbBinOp(sseOpcodePor, x, y, instr.Return()) 468 case ssa.OpcodeVbxor: 469 x, y := instr.Arg2() 470 m.lowerVbBinOp(sseOpcodePxor, x, y, instr.Return()) 471 case ssa.OpcodeVbandnot: 472 m.lowerVbandnot(instr, sseOpcodePandn) 473 case ssa.OpcodeVbitselect: 474 m.lowerVbitselect(instr) 475 case ssa.OpcodeVIadd: 476 x, y, lane := instr.Arg2WithLane() 477 var vecOp sseOpcode 478 switch lane { 479 case ssa.VecLaneI8x16: 480 vecOp = sseOpcodePaddb 481 case ssa.VecLaneI16x8: 482 vecOp = sseOpcodePaddw 483 case ssa.VecLaneI32x4: 484 vecOp = sseOpcodePaddd 485 case ssa.VecLaneI64x2: 486 vecOp = sseOpcodePaddq 487 } 488 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 489 490 case ssa.OpcodeVSaddSat: 491 x, y, lane := instr.Arg2WithLane() 492 var vecOp sseOpcode 493 switch lane { 494 case ssa.VecLaneI8x16: 495 vecOp = sseOpcodePaddsb 496 case ssa.VecLaneI16x8: 497 vecOp = sseOpcodePaddsw 498 } 499 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 500 501 case ssa.OpcodeVUaddSat: 502 x, y, lane := instr.Arg2WithLane() 503 var vecOp sseOpcode 504 switch lane { 505 case ssa.VecLaneI8x16: 506 vecOp = sseOpcodePaddusb 507 case ssa.VecLaneI16x8: 508 vecOp = sseOpcodePaddusw 509 } 510 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 511 512 case ssa.OpcodeVIsub: 513 x, y, lane := instr.Arg2WithLane() 514 var vecOp sseOpcode 515 switch lane { 516 case ssa.VecLaneI8x16: 517 vecOp = sseOpcodePsubb 518 case ssa.VecLaneI16x8: 519 vecOp = sseOpcodePsubw 520 case ssa.VecLaneI32x4: 521 vecOp = sseOpcodePsubd 522 case ssa.VecLaneI64x2: 523 vecOp = sseOpcodePsubq 524 } 525 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 526 527 case ssa.OpcodeVSsubSat: 528 x, y, lane := instr.Arg2WithLane() 529 var vecOp sseOpcode 530 switch lane { 531 case ssa.VecLaneI8x16: 532 vecOp = sseOpcodePsubsb 533 case ssa.VecLaneI16x8: 534 vecOp = sseOpcodePsubsw 535 } 536 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 537 538 case ssa.OpcodeVUsubSat: 539 x, y, lane := instr.Arg2WithLane() 540 var vecOp sseOpcode 541 switch lane { 542 case ssa.VecLaneI8x16: 543 vecOp = sseOpcodePsubusb 544 case ssa.VecLaneI16x8: 545 vecOp = sseOpcodePsubusw 546 } 547 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 548 549 case ssa.OpcodeVImul: 550 m.lowerVImul(instr) 551 case ssa.OpcodeVIneg: 552 x, lane := instr.ArgWithLane() 553 rn := m.getOperand_Reg(m.c.ValueDefinition(x)) 554 rd := m.c.VRegOf(instr.Return()) 555 var vecOp sseOpcode 556 switch lane { 557 case ssa.VecLaneI8x16: 558 vecOp = sseOpcodePsubb 559 case ssa.VecLaneI16x8: 560 vecOp = sseOpcodePsubw 561 case ssa.VecLaneI32x4: 562 vecOp = sseOpcodePsubd 563 case ssa.VecLaneI64x2: 564 vecOp = sseOpcodePsubq 565 default: 566 panic("BUG") 567 } 568 569 tmp := m.c.AllocateVReg(ssa.TypeV128) 570 m.insert(m.allocateInstr().asZeros(tmp)) 571 572 i := m.allocateInstr() 573 i.asXmmRmR(vecOp, rn, tmp) 574 m.insert(i) 575 576 m.copyTo(tmp, rd) 577 case ssa.OpcodeVFadd: 578 x, y, lane := instr.Arg2WithLane() 579 var vecOp sseOpcode 580 switch lane { 581 case 
ssa.VecLaneF32x4: 582 vecOp = sseOpcodeAddps 583 case ssa.VecLaneF64x2: 584 vecOp = sseOpcodeAddpd 585 } 586 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 587 588 case ssa.OpcodeVFsub: 589 x, y, lane := instr.Arg2WithLane() 590 var vecOp sseOpcode 591 switch lane { 592 case ssa.VecLaneF32x4: 593 vecOp = sseOpcodeSubps 594 case ssa.VecLaneF64x2: 595 vecOp = sseOpcodeSubpd 596 } 597 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 598 599 case ssa.OpcodeVFdiv: 600 x, y, lane := instr.Arg2WithLane() 601 var vecOp sseOpcode 602 switch lane { 603 case ssa.VecLaneF32x4: 604 vecOp = sseOpcodeDivps 605 case ssa.VecLaneF64x2: 606 vecOp = sseOpcodeDivpd 607 } 608 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 609 610 case ssa.OpcodeVFmul: 611 x, y, lane := instr.Arg2WithLane() 612 var vecOp sseOpcode 613 switch lane { 614 case ssa.VecLaneF32x4: 615 vecOp = sseOpcodeMulps 616 case ssa.VecLaneF64x2: 617 vecOp = sseOpcodeMulpd 618 } 619 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 620 621 case ssa.OpcodeVFneg: 622 x, lane := instr.ArgWithLane() 623 rn := m.getOperand_Reg(m.c.ValueDefinition(x)) 624 rd := m.c.VRegOf(instr.Return()) 625 626 tmp := m.c.AllocateVReg(ssa.TypeV128) 627 628 var shiftOp, xorOp sseOpcode 629 var shiftAmt uint32 630 switch lane { 631 case ssa.VecLaneF32x4: 632 shiftOp, shiftAmt, xorOp = sseOpcodePslld, 31, sseOpcodeXorps 633 case ssa.VecLaneF64x2: 634 shiftOp, shiftAmt, xorOp = sseOpcodePsllq, 63, sseOpcodeXorpd 635 } 636 637 zero := m.allocateInstr() 638 zero.asZeros(tmp) 639 m.insert(zero) 640 641 // Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction). 642 // See https://www.felixcloutier.com/x86/cmpps 643 // 644 // Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane 645 // if the lane is NaN. 646 cmp := m.allocateInstr() 647 cmp.asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_UQ), newOperandReg(tmp), tmp) 648 m.insert(cmp) 649 650 // Do the left shift on each lane to set only the most significant bit in each. 651 i := m.allocateInstr() 652 i.asXmmRmiReg(shiftOp, newOperandImm32(shiftAmt), tmp) 653 m.insert(i) 654 655 // Get the negated result by XOR on each lane with tmp. 
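// (For F32x4, the all-ones lanes shifted left by 31 become 0x80000000, and for F64x2 the shift by 63 gives 0x8000000000000000, so the XOR flips exactly the sign bit of each lane.)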
656 i = m.allocateInstr() 657 i.asXmmRmR(xorOp, rn, tmp) 658 m.insert(i) 659 660 m.copyTo(tmp, rd) 661 662 case ssa.OpcodeVSqrt: 663 x, lane := instr.ArgWithLane() 664 rn := m.getOperand_Reg(m.c.ValueDefinition(x)) 665 rd := m.c.VRegOf(instr.Return()) 666 667 var vecOp sseOpcode 668 switch lane { 669 case ssa.VecLaneF32x4: 670 vecOp = sseOpcodeSqrtps 671 case ssa.VecLaneF64x2: 672 vecOp = sseOpcodeSqrtpd 673 } 674 i := m.allocateInstr() 675 i.asXmmUnaryRmR(vecOp, rn, rd) 676 m.insert(i) 677 678 case ssa.OpcodeVImin: 679 x, y, lane := instr.Arg2WithLane() 680 var vecOp sseOpcode 681 switch lane { 682 case ssa.VecLaneI8x16: 683 vecOp = sseOpcodePminsb 684 case ssa.VecLaneI16x8: 685 vecOp = sseOpcodePminsw 686 case ssa.VecLaneI32x4: 687 vecOp = sseOpcodePminsd 688 } 689 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 690 691 case ssa.OpcodeVUmin: 692 x, y, lane := instr.Arg2WithLane() 693 var vecOp sseOpcode 694 switch lane { 695 case ssa.VecLaneI8x16: 696 vecOp = sseOpcodePminub 697 case ssa.VecLaneI16x8: 698 vecOp = sseOpcodePminuw 699 case ssa.VecLaneI32x4: 700 vecOp = sseOpcodePminud 701 } 702 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 703 704 case ssa.OpcodeVImax: 705 x, y, lane := instr.Arg2WithLane() 706 var vecOp sseOpcode 707 switch lane { 708 case ssa.VecLaneI8x16: 709 vecOp = sseOpcodePmaxsb 710 case ssa.VecLaneI16x8: 711 vecOp = sseOpcodePmaxsw 712 case ssa.VecLaneI32x4: 713 vecOp = sseOpcodePmaxsd 714 } 715 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 716 717 case ssa.OpcodeVUmax: 718 x, y, lane := instr.Arg2WithLane() 719 var vecOp sseOpcode 720 switch lane { 721 case ssa.VecLaneI8x16: 722 vecOp = sseOpcodePmaxub 723 case ssa.VecLaneI16x8: 724 vecOp = sseOpcodePmaxuw 725 case ssa.VecLaneI32x4: 726 vecOp = sseOpcodePmaxud 727 } 728 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 729 730 case ssa.OpcodeVAvgRound: 731 x, y, lane := instr.Arg2WithLane() 732 var vecOp sseOpcode 733 switch lane { 734 case ssa.VecLaneI8x16: 735 vecOp = sseOpcodePavgb 736 case ssa.VecLaneI16x8: 737 vecOp = sseOpcodePavgw 738 } 739 m.lowerVbBinOp(vecOp, x, y, instr.Return()) 740 741 case ssa.OpcodeVIcmp: 742 x, y, c, lane := instr.VIcmpData() 743 m.lowerVIcmp(x, y, c, instr.Return(), lane) 744 745 case ssa.OpcodeVFcmp: 746 x, y, c, lane := instr.VFcmpData() 747 m.lowerVFcmp(x, y, c, instr.Return(), lane) 748 749 case ssa.OpcodeExtractlane: 750 x, index, signed, lane := instr.ExtractlaneData() 751 m.lowerExtractLane(x, index, signed, instr.Return(), lane) 752 753 case ssa.OpcodeInsertlane: 754 x, y, index, lane := instr.InsertlaneData() 755 m.lowerInsertLane(x, y, index, instr.Return(), lane) 756 757 case ssa.OpcodeSwizzle: 758 x, y, _ := instr.Arg2WithLane() 759 m.lowerSwizzle(x, y, instr.Return()) 760 761 case ssa.OpcodeShuffle: 762 x, y, lo, hi := instr.ShuffleData() 763 m.lowerShuffle(x, y, lo, hi, instr.Return()) 764 765 case ssa.OpcodeSplat: 766 x, lane := instr.ArgWithLane() 767 m.lowerSplat(x, instr.Return(), lane) 768 769 case ssa.OpcodeSqmulRoundSat: 770 x, y := instr.Arg2() 771 m.lowerSqmulRoundSat(x, y, instr.Return()) 772 773 case ssa.OpcodeVZeroExtLoad: 774 ptr, offset, typ := instr.VZeroExtLoadData() 775 var sseOp sseOpcode 776 // Both movss and movsd clear the higher bits of the destination register up to 128 bits.
777 // https://www.felixcloutier.com/x86/movss 778 // https://www.felixcloutier.com/x86/movsd 779 if typ == ssa.TypeF32 { 780 sseOp = sseOpcodeMovss 781 } else { 782 sseOp = sseOpcodeMovsd 783 } 784 mem := m.lowerToAddressMode(ptr, offset) 785 dst := m.c.VRegOf(instr.Return()) 786 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandMem(mem), dst)) 787 788 case ssa.OpcodeVMinPseudo: 789 x, y, lane := instr.Arg2WithLane() 790 var vecOp sseOpcode 791 switch lane { 792 case ssa.VecLaneF32x4: 793 vecOp = sseOpcodeMinps 794 case ssa.VecLaneF64x2: 795 vecOp = sseOpcodeMinpd 796 default: 797 panic("BUG: unexpected lane type") 798 } 799 m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) 800 801 case ssa.OpcodeVMaxPseudo: 802 x, y, lane := instr.Arg2WithLane() 803 var vecOp sseOpcode 804 switch lane { 805 case ssa.VecLaneF32x4: 806 vecOp = sseOpcodeMaxps 807 case ssa.VecLaneF64x2: 808 vecOp = sseOpcodeMaxpd 809 default: 810 panic("BUG: unexpected lane type") 811 } 812 m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) 813 814 case ssa.OpcodeVIshl: 815 x, y, lane := instr.Arg2WithLane() 816 m.lowerVIshl(x, y, instr.Return(), lane) 817 818 case ssa.OpcodeVSshr: 819 x, y, lane := instr.Arg2WithLane() 820 m.lowerVSshr(x, y, instr.Return(), lane) 821 822 case ssa.OpcodeVUshr: 823 x, y, lane := instr.Arg2WithLane() 824 m.lowerVUshr(x, y, instr.Return(), lane) 825 826 case ssa.OpcodeVCeil: 827 x, lane := instr.ArgWithLane() 828 m.lowerVRound(x, instr.Return(), 0x2, lane == ssa.VecLaneF64x2) 829 830 case ssa.OpcodeVFloor: 831 x, lane := instr.ArgWithLane() 832 m.lowerVRound(x, instr.Return(), 0x1, lane == ssa.VecLaneF64x2) 833 834 case ssa.OpcodeVTrunc: 835 x, lane := instr.ArgWithLane() 836 m.lowerVRound(x, instr.Return(), 0x3, lane == ssa.VecLaneF64x2) 837 838 case ssa.OpcodeVNearest: 839 x, lane := instr.ArgWithLane() 840 m.lowerVRound(x, instr.Return(), 0x0, lane == ssa.VecLaneF64x2) 841 842 case ssa.OpcodeExtIaddPairwise: 843 x, lane, signed := instr.ExtIaddPairwiseData() 844 m.lowerExtIaddPairwise(x, instr.Return(), lane, signed) 845 846 case ssa.OpcodeUwidenLow, ssa.OpcodeSwidenLow: 847 x, lane := instr.ArgWithLane() 848 m.lowerWidenLow(x, instr.Return(), lane, op == ssa.OpcodeSwidenLow) 849 850 case ssa.OpcodeUwidenHigh, ssa.OpcodeSwidenHigh: 851 x, lane := instr.ArgWithLane() 852 m.lowerWidenHigh(x, instr.Return(), lane, op == ssa.OpcodeSwidenHigh) 853 854 case ssa.OpcodeLoadSplat: 855 ptr, offset, lane := instr.LoadSplatData() 856 m.lowerLoadSplat(ptr, offset, instr.Return(), lane) 857 858 case ssa.OpcodeVFcvtFromUint, ssa.OpcodeVFcvtFromSint: 859 x, lane := instr.ArgWithLane() 860 m.lowerVFcvtFromInt(x, instr.Return(), lane, op == ssa.OpcodeVFcvtFromSint) 861 862 case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: 863 x, lane := instr.ArgWithLane() 864 m.lowerVFcvtToIntSat(x, instr.Return(), lane, op == ssa.OpcodeVFcvtToSintSat) 865 866 case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: 867 x, y, lane := instr.Arg2WithLane() 868 m.lowerNarrow(x, y, instr.Return(), lane, op == ssa.OpcodeSnarrow) 869 870 case ssa.OpcodeFvpromoteLow: 871 x := instr.Arg() 872 src := m.getOperand_Reg(m.c.ValueDefinition(x)) 873 dst := m.c.VRegOf(instr.Return()) 874 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtps2pd, src, dst)) 875 876 case ssa.OpcodeFvdemote: 877 x := instr.Arg() 878 src := m.getOperand_Reg(m.c.ValueDefinition(x)) 879 dst := m.c.VRegOf(instr.Return()) 880 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtpd2ps, src, dst)) 881 882 case ssa.OpcodeWideningPairwiseDotProductS: 883 x, y 
:= instr.Arg2() 884 m.lowerWideningPairwiseDotProductS(x, y, instr.Return()) 885 886 case ssa.OpcodeVIabs: 887 m.lowerVIabs(instr) 888 case ssa.OpcodeVIpopcnt: 889 m.lowerVIpopcnt(instr) 890 case ssa.OpcodeVFmin: 891 m.lowerVFmin(instr) 892 case ssa.OpcodeVFmax: 893 m.lowerVFmax(instr) 894 case ssa.OpcodeVFabs: 895 m.lowerVFabs(instr) 896 case ssa.OpcodeUndefined: 897 m.insert(m.allocateInstr().asUD2()) 898 case ssa.OpcodeExitWithCode: 899 execCtx, code := instr.ExitWithCodeData() 900 m.lowerExitWithCode(m.c.VRegOf(execCtx), code) 901 case ssa.OpcodeExitIfTrueWithCode: 902 execCtx, c, code := instr.ExitIfTrueWithCodeData() 903 m.lowerExitIfTrueWithCode(m.c.VRegOf(execCtx), c, code) 904 case ssa.OpcodeLoad: 905 ptr, offset, typ := instr.LoadData() 906 dst := m.c.VRegOf(instr.Return()) 907 m.lowerLoad(ptr, offset, typ, dst) 908 case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: 909 ptr, offset, _ := instr.LoadData() 910 ret := m.c.VRegOf(instr.Return()) 911 m.lowerExtLoad(op, ptr, offset, ret) 912 case ssa.OpcodeVconst: 913 result := m.c.VRegOf(instr.Return()) 914 lo, hi := instr.VconstData() 915 m.lowerVconst(result, lo, hi) 916 case ssa.OpcodeSExtend, ssa.OpcodeUExtend: 917 from, to, signed := instr.ExtendData() 918 m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) 919 case ssa.OpcodeIcmp: 920 m.lowerIcmp(instr) 921 case ssa.OpcodeFcmp: 922 m.lowerFcmp(instr) 923 case ssa.OpcodeSelect: 924 cval, x, y := instr.SelectData() 925 m.lowerSelect(x, y, cval, instr.Return()) 926 case ssa.OpcodeIreduce: 927 rn := m.getOperand_Mem_Reg(m.c.ValueDefinition(instr.Arg())) 928 retVal := instr.Return() 929 rd := m.c.VRegOf(retVal) 930 931 if retVal.Type() != ssa.TypeI32 { 932 panic("TODO?: Ireduce to non-i32") 933 } 934 m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, rd)) 935 936 case ssa.OpcodeAtomicLoad: 937 ptr := instr.Arg() 938 size := instr.AtomicTargetSize() 939 dst := m.c.VRegOf(instr.Return()) 940 941 // At this point, the ptr is ensured to be aligned, so using a normal load is atomic. 942 // https://github.com/golang/go/blob/adead1a93f472affa97c494ef19f2f492ee6f34a/src/runtime/internal/atomic/atomic_amd64.go#L30 943 mem := newOperandMem(m.lowerToAddressMode(ptr, 0)) 944 load := m.allocateInstr() 945 switch size { 946 case 8: 947 load.asMov64MR(mem, dst) 948 case 4: 949 load.asMovzxRmR(extModeLQ, mem, dst) 950 case 2: 951 load.asMovzxRmR(extModeWQ, mem, dst) 952 case 1: 953 load.asMovzxRmR(extModeBQ, mem, dst) 954 default: 955 panic("BUG") 956 } 957 m.insert(load) 958 959 case ssa.OpcodeFence: 960 m.insert(m.allocateInstr().asMFence()) 961 962 case ssa.OpcodeAtomicStore: 963 ptr, _val := instr.Arg2() 964 size := instr.AtomicTargetSize() 965 966 val := m.getOperand_Reg(m.c.ValueDefinition(_val)) 967 // The content on the val register will be overwritten by xchg, so we need to copy it to a temporary register. 
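// Note that XCHG with a memory operand implicitly asserts the LOCK prefix, so this single instruction serves as both the store and the full memory barrier expected of an atomic store.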
968 copied := m.copyToTmp(val.reg()) 969 970 mem := newOperandMem(m.lowerToAddressMode(ptr, 0)) 971 store := m.allocateInstr().asXCHG(copied, mem, byte(size)) 972 m.insert(store) 973 974 case ssa.OpcodeAtomicCas: 975 addr, exp, repl := instr.Arg3() 976 size := instr.AtomicTargetSize() 977 m.lowerAtomicCas(addr, exp, repl, size, instr.Return()) 978 979 case ssa.OpcodeAtomicRmw: 980 addr, val := instr.Arg2() 981 atomicOp, size := instr.AtomicRmwData() 982 m.lowerAtomicRmw(atomicOp, addr, val, size, instr.Return()) 983 984 default: 985 panic("TODO: lowering " + op.String()) 986 } 987 } 988 989 func (m *machine) lowerAtomicRmw(op ssa.AtomicRmwOp, addr, val ssa.Value, size uint64, ret ssa.Value) { 990 mem := m.lowerToAddressMode(addr, 0) 991 _val := m.getOperand_Reg(m.c.ValueDefinition(val)) 992 993 switch op { 994 case ssa.AtomicRmwOpAdd, ssa.AtomicRmwOpSub: 995 valCopied := m.copyToTmp(_val.reg()) 996 if op == ssa.AtomicRmwOpSub { 997 // Negate the value. 998 m.insert(m.allocateInstr().asNeg(newOperandReg(valCopied), true)) 999 } 1000 m.insert(m.allocateInstr().asLockXAdd(valCopied, mem, byte(size))) 1001 m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) 1002 m.copyTo(valCopied, m.c.VRegOf(ret)) 1003 1004 case ssa.AtomicRmwOpAnd, ssa.AtomicRmwOpOr, ssa.AtomicRmwOpXor: 1005 accumulator := raxVReg 1006 // Reserve rax for the accumulator to make regalloc happy. 1007 // Note: do this initialization before defining valCopied, because it might be the same register and 1008 // if that happens, the unnecessary load/store will be performed inside the loop. 1009 // This can be mitigated in any way once the register allocator is clever enough. 1010 m.insert(m.allocateInstr().asDefineUninitializedReg(accumulator)) 1011 1012 // Copy the value to a temporary register. 1013 valCopied := m.copyToTmp(_val.reg()) 1014 m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) 1015 1016 memOp := newOperandMem(mem) 1017 tmp := m.c.AllocateVReg(ssa.TypeI64) 1018 beginLoop, beginLoopLabel := m.allocateBrTarget() 1019 { 1020 m.insert(beginLoop) 1021 // Reset the value on tmp by the original value. 1022 m.copyTo(valCopied, tmp) 1023 // Load the current value at the memory location into accumulator. 1024 switch size { 1025 case 1: 1026 m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, memOp, accumulator)) 1027 case 2: 1028 m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, memOp, accumulator)) 1029 case 4: 1030 m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, memOp, accumulator)) 1031 case 8: 1032 m.insert(m.allocateInstr().asMov64MR(memOp, accumulator)) 1033 default: 1034 panic("BUG") 1035 } 1036 // Then perform the logical operation on the accumulator and the value on tmp. 1037 switch op { 1038 case ssa.AtomicRmwOpAnd: 1039 m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, newOperandReg(accumulator), tmp, true)) 1040 case ssa.AtomicRmwOpOr: 1041 m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeOr, newOperandReg(accumulator), tmp, true)) 1042 case ssa.AtomicRmwOpXor: 1043 m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(accumulator), tmp, true)) 1044 default: 1045 panic("BUG") 1046 } 1047 // Finally, try compare-exchange the value at the memory location with the tmp. 1048 m.insert(m.allocateInstr().asLockCmpXCHG(tmp, memOp.addressMode(), byte(size))) 1049 // If it succeeds, ZF will be set, and we can break the loop. 1050 m.insert(m.allocateInstr().asJmpIf(condNZ, newOperandLabel(beginLoopLabel))) 1051 } 1052 1053 // valCopied must be alive at the end of the loop. 
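// The nopUseReg below is a pseudo instruction that only marks valCopied as still used at this point, so the register allocator keeps it live across the compare-exchange loop above.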
1054 m.insert(m.allocateInstr().asNopUseReg(valCopied)) 1055 1056 // At this point, accumulator contains the result. 1057 m.clearHigherBitsForAtomic(accumulator, size, ret.Type()) 1058 m.copyTo(accumulator, m.c.VRegOf(ret)) 1059 1060 case ssa.AtomicRmwOpXchg: 1061 valCopied := m.copyToTmp(_val.reg()) 1062 1063 m.insert(m.allocateInstr().asXCHG(valCopied, newOperandMem(mem), byte(size))) 1064 m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) 1065 m.copyTo(valCopied, m.c.VRegOf(ret)) 1066 1067 default: 1068 panic("BUG") 1069 } 1070 } 1071 1072 func (m *machine) lowerAtomicCas(addr, exp, repl ssa.Value, size uint64, ret ssa.Value) { 1073 mem := m.lowerToAddressMode(addr, 0) 1074 expOp := m.getOperand_Reg(m.c.ValueDefinition(exp)) 1075 replOp := m.getOperand_Reg(m.c.ValueDefinition(repl)) 1076 1077 accumulator := raxVReg 1078 m.copyTo(expOp.reg(), accumulator) 1079 m.insert(m.allocateInstr().asLockCmpXCHG(replOp.reg(), mem, byte(size))) 1080 m.clearHigherBitsForAtomic(accumulator, size, ret.Type()) 1081 m.copyTo(accumulator, m.c.VRegOf(ret)) 1082 } 1083 1084 func (m *machine) clearHigherBitsForAtomic(r regalloc.VReg, valSize uint64, resultType ssa.Type) { 1085 switch resultType { 1086 case ssa.TypeI32: 1087 switch valSize { 1088 case 1: 1089 m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(r), r)) 1090 case 2: 1091 m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(r), r)) 1092 } 1093 case ssa.TypeI64: 1094 switch valSize { 1095 case 1: 1096 m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(r), r)) 1097 case 2: 1098 m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, newOperandReg(r), r)) 1099 case 4: 1100 m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, newOperandReg(r), r)) 1101 } 1102 } 1103 } 1104 1105 func (m *machine) lowerFcmp(instr *ssa.Instruction) { 1106 f1, f2, and := m.lowerFcmpToFlags(instr) 1107 rd := m.c.VRegOf(instr.Return()) 1108 if f2 == condInvalid { 1109 tmp := m.c.AllocateVReg(ssa.TypeI32) 1110 m.insert(m.allocateInstr().asSetcc(f1, tmp)) 1111 // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match 1112 // the semantics of Icmp that sets either 0 or 1. 1113 m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) 1114 } else { 1115 tmp1, tmp2 := m.c.AllocateVReg(ssa.TypeI32), m.c.AllocateVReg(ssa.TypeI32) 1116 m.insert(m.allocateInstr().asSetcc(f1, tmp1)) 1117 m.insert(m.allocateInstr().asSetcc(f2, tmp2)) 1118 var op aluRmiROpcode 1119 if and { 1120 op = aluRmiROpcodeAnd 1121 } else { 1122 op = aluRmiROpcodeOr 1123 } 1124 m.insert(m.allocateInstr().asAluRmiR(op, newOperandReg(tmp1), tmp2, false)) 1125 m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp2), rd)) 1126 } 1127 } 1128 1129 func (m *machine) lowerIcmp(instr *ssa.Instruction) { 1130 x, y, c := instr.IcmpData() 1131 m.lowerIcmpToFlag(m.c.ValueDefinition(x), m.c.ValueDefinition(y), x.Type() == ssa.TypeI64) 1132 rd := m.c.VRegOf(instr.Return()) 1133 tmp := m.c.AllocateVReg(ssa.TypeI32) 1134 m.insert(m.allocateInstr().asSetcc(condFromSSAIntCmpCond(c), tmp)) 1135 // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match 1136 // the semantics of Icmp that sets either 0 or 1. 
1137 m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) 1138 } 1139 1140 func (m *machine) lowerSelect(x, y, cval, ret ssa.Value) { 1141 xo, yo := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) 1142 rd := m.c.VRegOf(ret) 1143 1144 var cond cond 1145 cvalDef := m.c.ValueDefinition(cval) 1146 switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { 1147 case ssa.OpcodeIcmp: 1148 icmp := cvalDef.Instr 1149 xc, yc, cc := icmp.IcmpData() 1150 m.lowerIcmpToFlag(m.c.ValueDefinition(xc), m.c.ValueDefinition(yc), xc.Type() == ssa.TypeI64) 1151 cond = condFromSSAIntCmpCond(cc) 1152 icmp.Lowered() 1153 default: // TODO: match ssa.OpcodeFcmp for optimization, but seems a bit complex. 1154 cv := m.getOperand_Reg(cvalDef) 1155 test := m.allocateInstr().asCmpRmiR(false, cv, cv.reg(), false) 1156 m.insert(test) 1157 cond = condNZ 1158 } 1159 1160 if typ := x.Type(); typ.IsInt() { 1161 _64 := typ.Bits() == 64 1162 mov := m.allocateInstr() 1163 tmp := m.c.AllocateVReg(typ) 1164 switch yo.kind { 1165 case operandKindReg: 1166 mov.asMovRR(yo.reg(), tmp, _64) 1167 case operandKindMem: 1168 if _64 { 1169 mov.asMov64MR(yo, tmp) 1170 } else { 1171 mov.asMovzxRmR(extModeLQ, yo, tmp) 1172 } 1173 default: 1174 panic("BUG") 1175 } 1176 m.insert(mov) 1177 cmov := m.allocateInstr().asCmove(cond, xo, tmp, _64) 1178 m.insert(cmov) 1179 m.insert(m.allocateInstr().asMovRR(tmp, rd, _64)) 1180 } else { 1181 mov := m.allocateInstr() 1182 tmp := m.c.AllocateVReg(typ) 1183 switch typ { 1184 case ssa.TypeF32: 1185 mov.asXmmUnaryRmR(sseOpcodeMovss, yo, tmp) 1186 case ssa.TypeF64: 1187 mov.asXmmUnaryRmR(sseOpcodeMovsd, yo, tmp) 1188 case ssa.TypeV128: 1189 mov.asXmmUnaryRmR(sseOpcodeMovdqu, yo, tmp) 1190 default: 1191 panic("BUG") 1192 } 1193 m.insert(mov) 1194 1195 cmov := m.allocateInstr().asXmmCMov(cond, xo, tmp, typ.Size()) 1196 m.insert(cmov) 1197 1198 m.copyTo(tmp, rd) 1199 } 1200 } 1201 1202 func (m *machine) lowerXmmCmovAfterRegAlloc(i *instruction) { 1203 x := i.op1 1204 rd := i.op2.reg() 1205 cond := cond(i.u1) 1206 1207 jcc := m.allocateInstr() 1208 m.insert(jcc) 1209 1210 mov := m.allocateInstr() 1211 switch i.u2 { 1212 case 4: 1213 mov.asXmmUnaryRmR(sseOpcodeMovss, x, rd) 1214 case 8: 1215 mov.asXmmUnaryRmR(sseOpcodeMovsd, x, rd) 1216 case 16: 1217 mov.asXmmUnaryRmR(sseOpcodeMovdqu, x, rd) 1218 default: 1219 panic("BUG") 1220 } 1221 m.insert(mov) 1222 1223 nop, end := m.allocateBrTarget() 1224 m.insert(nop) 1225 jcc.asJmpIf(cond.invert(), newOperandLabel(end)) 1226 } 1227 1228 func (m *machine) lowerExtend(_arg, ret ssa.Value, from, to byte, signed bool) { 1229 rd0 := m.c.VRegOf(ret) 1230 arg := m.getOperand_Mem_Reg(m.c.ValueDefinition(_arg)) 1231 1232 rd := m.c.AllocateVReg(ret.Type()) 1233 1234 ext := m.allocateInstr() 1235 switch { 1236 case from == 8 && to == 16 && signed: 1237 ext.asMovsxRmR(extModeBQ, arg, rd) 1238 case from == 8 && to == 16 && !signed: 1239 ext.asMovzxRmR(extModeBL, arg, rd) 1240 case from == 8 && to == 32 && signed: 1241 ext.asMovsxRmR(extModeBL, arg, rd) 1242 case from == 8 && to == 32 && !signed: 1243 ext.asMovzxRmR(extModeBQ, arg, rd) 1244 case from == 8 && to == 64 && signed: 1245 ext.asMovsxRmR(extModeBQ, arg, rd) 1246 case from == 8 && to == 64 && !signed: 1247 ext.asMovzxRmR(extModeBQ, arg, rd) 1248 case from == 16 && to == 32 && signed: 1249 ext.asMovsxRmR(extModeWL, arg, rd) 1250 case from == 16 && to == 32 && !signed: 1251 ext.asMovzxRmR(extModeWL, arg, rd) 1252 case from == 16 && to == 64 && signed: 1253 
ext.asMovsxRmR(extModeWQ, arg, rd) 1254 case from == 16 && to == 64 && !signed: 1255 ext.asMovzxRmR(extModeWQ, arg, rd) 1256 case from == 32 && to == 64 && signed: 1257 ext.asMovsxRmR(extModeLQ, arg, rd) 1258 case from == 32 && to == 64 && !signed: 1259 ext.asMovzxRmR(extModeLQ, arg, rd) 1260 default: 1261 panic(fmt.Sprintf("BUG: unhandled extend: from=%d, to=%d, signed=%t", from, to, signed)) 1262 } 1263 m.insert(ext) 1264 1265 m.copyTo(rd, rd0) 1266 } 1267 1268 func (m *machine) lowerVconst(dst regalloc.VReg, lo, hi uint64) { 1269 if lo == 0 && hi == 0 { 1270 m.insert(m.allocateInstr().asZeros(dst)) 1271 return 1272 } 1273 1274 load := m.allocateInstr() 1275 constLabel := m.allocateLabel() 1276 m.consts = append(m.consts, _const{label: constLabel, lo: lo, hi: hi}) 1277 load.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(constLabel.L)), dst) 1278 m.insert(load) 1279 } 1280 1281 func (m *machine) lowerCtz(instr *ssa.Instruction) { 1282 if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { 1283 m.lowerUnaryRmR(instr, unaryRmROpcodeTzcnt) 1284 } else { 1285 // On processors that do not support TZCNT, the BSF instruction is 1286 used instead. The key difference between the TZCNT and BSF 1287 instructions is that if the source operand is zero, the content of the 1288 destination operand is undefined. 1289 https://www.felixcloutier.com/x86/tzcnt.html 1290 1291 x := instr.Arg() 1292 if !x.Type().IsInt() { 1293 panic("BUG?") 1294 } 1295 _64 := x.Type().Bits() == 64 1296 1297 xDef := m.c.ValueDefinition(x) 1298 tmp := m.c.AllocateVReg(x.Type()) 1299 rm := m.getOperand_Reg(xDef) 1300 1301 // First, we have to check if the target is non-zero. 1302 test := m.allocateInstr() 1303 test.asCmpRmiR(false, rm, rm.reg(), _64) 1304 m.insert(test) 1305 1306 jmpNz := m.allocateInstr() 1307 m.insert(jmpNz) 1308 1309 // If the value is zero, we just push the const value. 1310 m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) 1311 1312 // Now jump right after the non-zero case. 1313 jmpAtEnd := m.allocateInstr() 1314 m.insert(jmpAtEnd) 1315 1316 // jmpNz target label is set here. 1317 nop, nz := m.allocateBrTarget() 1318 jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) 1319 m.insert(nop) 1320 1321 // Emit the non-zero case. 1322 bsr := m.allocateInstr() 1323 bsr.asUnaryRmR(unaryRmROpcodeBsf, rm, tmp, _64) 1324 m.insert(bsr) 1325 1326 // jmpAtEnd target label is set here. 1327 nopEnd, end := m.allocateBrTarget() 1328 jmpAtEnd.asJmp(newOperandLabel(end)) 1329 m.insert(nopEnd) 1330 1331 m.copyTo(tmp, m.c.VRegOf(instr.Return())) 1332 } 1333 } 1334 1335 func (m *machine) lowerClz(instr *ssa.Instruction) { 1336 if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { 1337 m.lowerUnaryRmR(instr, unaryRmROpcodeLzcnt) 1338 } else { 1339 // On processors that do not support LZCNT, we combine BSR (calculating 1340 the most significant set bit) with XOR. This logic is described in the 1341 "Replace Raw Assembly Code with Builtin Intrinsics" section of: 1342 https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code. 1343 1344 x := instr.Arg() 1345 if !x.Type().IsInt() { 1346 panic("BUG?") 1347 } 1348 _64 := x.Type().Bits() == 64 1349 1350 xDef := m.c.ValueDefinition(x) 1351 rm := m.getOperand_Reg(xDef) 1352 tmp := m.c.AllocateVReg(x.Type()) 1353 1354 // First, we have to check if the rm is non-zero as BSR is undefined 1355 on zero. See https://www.felixcloutier.com/x86/bsr.
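// For a non-zero input, BSR returns the index p of the most significant set bit; since 0 <= p <= bits-1, the leading zero count equals (bits-1)-p, which is the same as (bits-1) XOR p, and that is exactly what the XOR emitted below computes.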
1356 test := m.allocateInstr() 1357 test.asCmpRmiR(false, rm, rm.reg(), _64) 1358 m.insert(test) 1359 1360 jmpNz := m.allocateInstr() 1361 m.insert(jmpNz) 1362 1363 // If the value is zero, we just push the const value. 1364 m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) 1365 1366 // Now jump right after the non-zero case. 1367 jmpAtEnd := m.allocateInstr() 1368 m.insert(jmpAtEnd) 1369 1370 // jmpNz target label is set here. 1371 nop, nz := m.allocateBrTarget() 1372 jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) 1373 m.insert(nop) 1374 1375 // Emit the non-zero case. 1376 bsr := m.allocateInstr() 1377 bsr.asUnaryRmR(unaryRmROpcodeBsr, rm, tmp, _64) 1378 m.insert(bsr) 1379 1380 // Now we XOR the value with the bit length minus one. 1381 xor := m.allocateInstr() 1382 xor.asAluRmiR(aluRmiROpcodeXor, newOperandImm32(uint32(x.Type().Bits()-1)), tmp, _64) 1383 m.insert(xor) 1384 1385 // jmpAtEnd target label is set here. 1386 nopEnd, end := m.allocateBrTarget() 1387 jmpAtEnd.asJmp(newOperandLabel(end)) 1388 m.insert(nopEnd) 1389 1390 m.copyTo(tmp, m.c.VRegOf(instr.Return())) 1391 } 1392 } 1393 1394 func (m *machine) lowerUnaryRmR(si *ssa.Instruction, op unaryRmROpcode) { 1395 x := si.Arg() 1396 if !x.Type().IsInt() { 1397 panic("BUG?") 1398 } 1399 _64 := x.Type().Bits() == 64 1400 1401 xDef := m.c.ValueDefinition(x) 1402 rm := m.getOperand_Mem_Reg(xDef) 1403 rd := m.c.VRegOf(si.Return()) 1404 1405 instr := m.allocateInstr() 1406 instr.asUnaryRmR(op, rm, rd, _64) 1407 m.insert(instr) 1408 } 1409 1410 func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, dst regalloc.VReg) { 1411 mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) 1412 load := m.allocateInstr() 1413 switch typ { 1414 case ssa.TypeI32: 1415 load.asMovzxRmR(extModeLQ, mem, dst) 1416 case ssa.TypeI64: 1417 load.asMov64MR(mem, dst) 1418 case ssa.TypeF32: 1419 load.asXmmUnaryRmR(sseOpcodeMovss, mem, dst) 1420 case ssa.TypeF64: 1421 load.asXmmUnaryRmR(sseOpcodeMovsd, mem, dst) 1422 case ssa.TypeV128: 1423 load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, dst) 1424 default: 1425 panic("BUG") 1426 } 1427 m.insert(load) 1428 } 1429 1430 func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, dst regalloc.VReg) { 1431 mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) 1432 load := m.allocateInstr() 1433 switch op { 1434 case ssa.OpcodeUload8: 1435 load.asMovzxRmR(extModeBQ, mem, dst) 1436 case ssa.OpcodeUload16: 1437 load.asMovzxRmR(extModeWQ, mem, dst) 1438 case ssa.OpcodeUload32: 1439 load.asMovzxRmR(extModeLQ, mem, dst) 1440 case ssa.OpcodeSload8: 1441 load.asMovsxRmR(extModeBQ, mem, dst) 1442 case ssa.OpcodeSload16: 1443 load.asMovsxRmR(extModeWQ, mem, dst) 1444 case ssa.OpcodeSload32: 1445 load.asMovsxRmR(extModeLQ, mem, dst) 1446 default: 1447 panic("BUG") 1448 } 1449 m.insert(load) 1450 } 1451 1452 func (m *machine) lowerExitIfTrueWithCode(execCtx regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) { 1453 condDef := m.c.ValueDefinition(cond) 1454 if !m.c.MatchInstr(condDef, ssa.OpcodeIcmp) { 1455 panic("TODO: ExitIfTrue must come after Icmp at the moment: " + condDef.Instr.Opcode().String()) 1456 } 1457 cvalInstr := condDef.Instr 1458 cvalInstr.MarkLowered() 1459 1460 // We need to copy the execution context to a temp register, because if it's spilled, 1461 // it might end up being reloaded inside the exiting branch. 
1462 execCtxTmp := m.copyToTmp(execCtx) 1463 1464 x, y, c := cvalInstr.IcmpData() 1465 xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) 1466 if !m.tryLowerBandToFlag(xx, yy) { 1467 m.lowerIcmpToFlag(xx, yy, x.Type() == ssa.TypeI64) 1468 } 1469 1470 jmpIf := m.allocateInstr() 1471 m.insert(jmpIf) 1472 l := m.lowerExitWithCode(execCtxTmp, code) 1473 jmpIf.asJmpIf(condFromSSAIntCmpCond(c).invert(), newOperandLabel(l)) 1474 } 1475 1476 func (m *machine) tryLowerBandToFlag(x, y *backend.SSAValueDefinition) (ok bool) { 1477 var target *backend.SSAValueDefinition 1478 if x.IsFromInstr() && x.Instr.Constant() && x.Instr.ConstantVal() == 0 { 1479 if m.c.MatchInstr(y, ssa.OpcodeBand) { 1480 target = y 1481 } 1482 } 1483 1484 if y.IsFromInstr() && y.Instr.Constant() && y.Instr.ConstantVal() == 0 { 1485 if m.c.MatchInstr(x, ssa.OpcodeBand) { 1486 target = x 1487 } 1488 } 1489 1490 if target == nil { 1491 return false 1492 } 1493 1494 bandInstr := target.Instr 1495 bandX, bandY := bandInstr.Arg2() 1496 1497 xx := m.getOperand_Reg(m.c.ValueDefinition(bandX)) 1498 yy := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(bandY)) 1499 test := m.allocateInstr().asCmpRmiR(false, yy, xx.reg(), bandX.Type() == ssa.TypeI64) 1500 m.insert(test) 1501 bandInstr.MarkLowered() 1502 return true 1503 } 1504 1505 func (m *machine) allocateExitInstructions(execCtx, exitCodeReg regalloc.VReg) (saveRsp, saveRbp, setExitCode *instruction) { 1506 saveRsp = m.allocateInstr().asMovRM( 1507 rspVReg, 1508 newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.U32(), execCtx)), 1509 8, 1510 ) 1511 1512 saveRbp = m.allocateInstr().asMovRM( 1513 rbpVReg, 1514 newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetFramePointerBeforeGoCall.U32(), execCtx)), 1515 8, 1516 ) 1517 setExitCode = m.allocateInstr().asMovRM( 1518 exitCodeReg, 1519 newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetExitCodeOffset.U32(), execCtx)), 1520 4, 1521 ) 1522 return 1523 } 1524 1525 func (m *machine) lowerExitWithCode(execCtx regalloc.VReg, code wazevoapi.ExitCode) (afterLabel backend.Label) { 1526 exitCodeReg := rbpVReg 1527 saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtx, exitCodeReg) 1528 1529 // Set save RSP, RBP, and write exit code. 1530 m.insert(saveRsp) 1531 m.insert(saveRbp) 1532 m.lowerIconst(exitCodeReg, uint64(code), false) 1533 m.insert(setExitCode) 1534 1535 ripReg := rbpVReg 1536 1537 // Next is to save the current address for stack unwinding. 1538 nop, currentAddrLabel := m.allocateBrTarget() 1539 m.insert(nop) 1540 readRip := m.allocateInstr().asLEA(newOperandLabel(currentAddrLabel), ripReg) 1541 m.insert(readRip) 1542 saveRip := m.allocateInstr().asMovRM( 1543 ripReg, 1544 newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)), 1545 8, 1546 ) 1547 m.insert(saveRip) 1548 1549 // Finally exit. 1550 exitSq := m.allocateExitSeq(execCtx) 1551 m.insert(exitSq) 1552 1553 // Return the label for continuation. 1554 continuation, afterLabel := m.allocateBrTarget() 1555 m.insert(continuation) 1556 return afterLabel 1557 } 1558 1559 func (m *machine) lowerAluRmiROp(si *ssa.Instruction, op aluRmiROpcode) { 1560 x, y := si.Arg2() 1561 if !x.Type().IsInt() { 1562 panic("BUG?") 1563 } 1564 1565 _64 := x.Type().Bits() == 64 1566 1567 xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) 1568 1569 // TODO: commutative args can be swapped if one of them is an immediate. 
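// (Swapping would allow a constant x to be encoded directly as the imm32 operand rm below, instead of being materialized into the register operand rn first.)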
1570 rn := m.getOperand_Reg(xDef) 1571 rm := m.getOperand_Mem_Imm32_Reg(yDef) 1572 rd := m.c.VRegOf(si.Return()) 1573 1574 // rn is being overwritten, so we first copy its value to a temp register, 1575 // in case it is referenced again later. 1576 tmp := m.copyToTmp(rn.reg()) 1577 1578 alu := m.allocateInstr() 1579 alu.asAluRmiR(op, rm, tmp, _64) 1580 m.insert(alu) 1581 1582 // tmp now contains the result, we copy it to the dest register. 1583 m.copyTo(tmp, rd) 1584 } 1585 1586 func (m *machine) lowerShiftR(si *ssa.Instruction, op shiftROp) { 1587 x, amt := si.Arg2() 1588 if !x.Type().IsInt() { 1589 panic("BUG?") 1590 } 1591 _64 := x.Type().Bits() == 64 1592 1593 xDef, amtDef := m.c.ValueDefinition(x), m.c.ValueDefinition(amt) 1594 1595 opAmt := m.getOperand_Imm32_Reg(amtDef) 1596 rx := m.getOperand_Reg(xDef) 1597 rd := m.c.VRegOf(si.Return()) 1598 1599 // rx is being overwritten, so we first copy its value to a temp register, 1600 // in case it is referenced again later. 1601 tmpDst := m.copyToTmp(rx.reg()) 1602 1603 if opAmt.kind == operandKindReg { 1604 // If opAmt is a register we must copy its value to rcx, 1605 // because shiftR encoding mandates that the shift amount is in rcx. 1606 m.copyTo(opAmt.reg(), rcxVReg) 1607 1608 alu := m.allocateInstr() 1609 alu.asShiftR(op, newOperandReg(rcxVReg), tmpDst, _64) 1610 m.insert(alu) 1611 1612 } else { 1613 alu := m.allocateInstr() 1614 alu.asShiftR(op, opAmt, tmpDst, _64) 1615 m.insert(alu) 1616 } 1617 1618 // tmp now contains the result, we copy it to the dest register. 1619 m.copyTo(tmpDst, rd) 1620 } 1621 1622 func (m *machine) lowerXmmRmR(instr *ssa.Instruction) { 1623 x, y := instr.Arg2() 1624 if !x.Type().IsFloat() { 1625 panic("BUG?") 1626 } 1627 _64 := x.Type().Bits() == 64 1628 1629 var op sseOpcode 1630 if _64 { 1631 switch instr.Opcode() { 1632 case ssa.OpcodeFadd: 1633 op = sseOpcodeAddsd 1634 case ssa.OpcodeFsub: 1635 op = sseOpcodeSubsd 1636 case ssa.OpcodeFmul: 1637 op = sseOpcodeMulsd 1638 case ssa.OpcodeFdiv: 1639 op = sseOpcodeDivsd 1640 default: 1641 panic("BUG") 1642 } 1643 } else { 1644 switch instr.Opcode() { 1645 case ssa.OpcodeFadd: 1646 op = sseOpcodeAddss 1647 case ssa.OpcodeFsub: 1648 op = sseOpcodeSubss 1649 case ssa.OpcodeFmul: 1650 op = sseOpcodeMulss 1651 case ssa.OpcodeFdiv: 1652 op = sseOpcodeDivss 1653 default: 1654 panic("BUG") 1655 } 1656 } 1657 1658 xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) 1659 rn := m.getOperand_Reg(yDef) 1660 rm := m.getOperand_Reg(xDef) 1661 rd := m.c.VRegOf(instr.Return()) 1662 1663 // rm is being overwritten, so we first copy its value to a temp register, 1664 // in case it is referenced again later. 
1665 tmp := m.copyToTmp(rm.reg()) 1666 1667 xmm := m.allocateInstr().asXmmRmR(op, rn, tmp) 1668 m.insert(xmm) 1669 1670 m.copyTo(tmp, rd) 1671 } 1672 1673 func (m *machine) lowerSqrt(instr *ssa.Instruction) { 1674 x := instr.Arg() 1675 if !x.Type().IsFloat() { 1676 panic("BUG") 1677 } 1678 _64 := x.Type().Bits() == 64 1679 var op sseOpcode 1680 if _64 { 1681 op = sseOpcodeSqrtsd 1682 } else { 1683 op = sseOpcodeSqrtss 1684 } 1685 1686 xDef := m.c.ValueDefinition(x) 1687 rm := m.getOperand_Mem_Reg(xDef) 1688 rd := m.c.VRegOf(instr.Return()) 1689 1690 xmm := m.allocateInstr().asXmmUnaryRmR(op, rm, rd) 1691 m.insert(xmm) 1692 } 1693 1694 func (m *machine) lowerFabsFneg(instr *ssa.Instruction) { 1695 x := instr.Arg() 1696 if !x.Type().IsFloat() { 1697 panic("BUG") 1698 } 1699 _64 := x.Type().Bits() == 64 1700 var op sseOpcode 1701 var mask uint64 1702 if _64 { 1703 switch instr.Opcode() { 1704 case ssa.OpcodeFabs: 1705 mask, op = 0x7fffffffffffffff, sseOpcodeAndpd 1706 case ssa.OpcodeFneg: 1707 mask, op = 0x8000000000000000, sseOpcodeXorpd 1708 } 1709 } else { 1710 switch instr.Opcode() { 1711 case ssa.OpcodeFabs: 1712 mask, op = 0x7fffffff, sseOpcodeAndps 1713 case ssa.OpcodeFneg: 1714 mask, op = 0x80000000, sseOpcodeXorps 1715 } 1716 } 1717 1718 tmp := m.c.AllocateVReg(x.Type()) 1719 1720 xDef := m.c.ValueDefinition(x) 1721 rm := m.getOperand_Reg(xDef) 1722 rd := m.c.VRegOf(instr.Return()) 1723 1724 m.lowerFconst(tmp, mask, _64) 1725 1726 xmm := m.allocateInstr().asXmmRmR(op, rm, tmp) 1727 m.insert(xmm) 1728 1729 m.copyTo(tmp, rd) 1730 } 1731 1732 func (m *machine) lowerStore(si *ssa.Instruction) { 1733 value, ptr, offset, storeSizeInBits := si.StoreData() 1734 rm := m.getOperand_Reg(m.c.ValueDefinition(value)) 1735 mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) 1736 1737 store := m.allocateInstr() 1738 switch value.Type() { 1739 case ssa.TypeI32: 1740 store.asMovRM(rm.reg(), mem, storeSizeInBits/8) 1741 case ssa.TypeI64: 1742 store.asMovRM(rm.reg(), mem, storeSizeInBits/8) 1743 case ssa.TypeF32: 1744 store.asXmmMovRM(sseOpcodeMovss, rm.reg(), mem) 1745 case ssa.TypeF64: 1746 store.asXmmMovRM(sseOpcodeMovsd, rm.reg(), mem) 1747 case ssa.TypeV128: 1748 store.asXmmMovRM(sseOpcodeMovdqu, rm.reg(), mem) 1749 default: 1750 panic("BUG") 1751 } 1752 m.insert(store) 1753 } 1754 1755 func (m *machine) lowerCall(si *ssa.Instruction) { 1756 isDirectCall := si.Opcode() == ssa.OpcodeCall 1757 var indirectCalleePtr ssa.Value 1758 var directCallee ssa.FuncRef 1759 var sigID ssa.SignatureID 1760 var args []ssa.Value 1761 var isMemmove bool 1762 if isDirectCall { 1763 directCallee, sigID, args = si.CallData() 1764 } else { 1765 indirectCalleePtr, sigID, args, isMemmove = si.CallIndirectData() 1766 } 1767 calleeABI := m.c.GetFunctionABI(m.c.SSABuilder().ResolveSignature(sigID)) 1768 1769 stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize()) 1770 if m.maxRequiredStackSizeForCalls < stackSlotSize+16 { 1771 m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // 16 == return address + RBP. 1772 } 1773 1774 // Note: See machine.SetupPrologue for the stack layout. 1775 // The stack pointer decrease/increase will be inserted later in the compilation. 1776 1777 for i, arg := range args { 1778 reg := m.c.VRegOf(arg) 1779 def := m.c.ValueDefinition(arg) 1780 m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize) 1781 } 1782 1783 if isMemmove { 1784 // Go's memmove *might* use all xmm0-xmm15, so we need to release them. 
1785 // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#architecture-specifics 1786 // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/runtime/memmove_amd64.s#L271-L286 1787 for i := regalloc.RealReg(0); i < 16; i++ { 1788 m.insert(m.allocateInstr().asDefineUninitializedReg(regInfo.RealRegToVReg[xmm0+i])) 1789 } 1790 } 1791 1792 if isDirectCall { 1793 call := m.allocateInstr().asCall(directCallee, calleeABI) 1794 m.insert(call) 1795 } else { 1796 ptrOp := m.getOperand_Mem_Reg(m.c.ValueDefinition(indirectCalleePtr)) 1797 callInd := m.allocateInstr().asCallIndirect(ptrOp, calleeABI) 1798 m.insert(callInd) 1799 } 1800 1801 if isMemmove { 1802 for i := regalloc.RealReg(0); i < 16; i++ { 1803 m.insert(m.allocateInstr().asNopUseReg(regInfo.RealRegToVReg[xmm0+i])) 1804 } 1805 } 1806 1807 var index int 1808 r1, rs := si.Returns() 1809 if r1.Valid() { 1810 m.callerGenFunctionReturnVReg(calleeABI, 0, m.c.VRegOf(r1), stackSlotSize) 1811 index++ 1812 } 1813 1814 for _, r := range rs { 1815 m.callerGenFunctionReturnVReg(calleeABI, index, m.c.VRegOf(r), stackSlotSize) 1816 index++ 1817 } 1818 } 1819 1820 // callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the 1821 // caller side of the function call. 1822 func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, stackSlotSize int64) { 1823 arg := &a.Args[argIndex] 1824 if def != nil && def.IsFromInstr() { 1825 // Constant instructions are inlined. 1826 if inst := def.Instr; inst.Constant() { 1827 m.insertLoadConstant(inst, reg) 1828 } 1829 } 1830 if arg.Kind == backend.ABIArgKindReg { 1831 m.InsertMove(arg.Reg, reg, arg.Type) 1832 } else { 1833 store := m.allocateInstr() 1834 mem := newOperandMem(m.newAmodeImmReg( 1835 // -stackSlotSize because the stack pointer is not yet decreased. 1836 uint32(arg.Offset-stackSlotSize), rspVReg)) 1837 switch arg.Type { 1838 case ssa.TypeI32: 1839 store.asMovRM(reg, mem, 4) 1840 case ssa.TypeI64: 1841 store.asMovRM(reg, mem, 8) 1842 case ssa.TypeF32: 1843 store.asXmmMovRM(sseOpcodeMovss, reg, mem) 1844 case ssa.TypeF64: 1845 store.asXmmMovRM(sseOpcodeMovsd, reg, mem) 1846 case ssa.TypeV128: 1847 store.asXmmMovRM(sseOpcodeMovdqu, reg, mem) 1848 default: 1849 panic("BUG") 1850 } 1851 m.insert(store) 1852 } 1853 } 1854 1855 func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, stackSlotSize int64) { 1856 r := &a.Rets[retIndex] 1857 if r.Kind == backend.ABIArgKindReg { 1858 m.InsertMove(reg, r.Reg, r.Type) 1859 } else { 1860 load := m.allocateInstr() 1861 mem := newOperandMem(m.newAmodeImmReg( 1862 // -stackSlotSize because the stack pointer is not yet decreased. 1863 uint32(a.ArgStackSize+r.Offset-stackSlotSize), rspVReg)) 1864 switch r.Type { 1865 case ssa.TypeI32: 1866 load.asMovzxRmR(extModeLQ, mem, reg) 1867 case ssa.TypeI64: 1868 load.asMov64MR(mem, reg) 1869 case ssa.TypeF32: 1870 load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg) 1871 case ssa.TypeF64: 1872 load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg) 1873 case ssa.TypeV128: 1874 load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg) 1875 default: 1876 panic("BUG") 1877 } 1878 m.insert(load) 1879 } 1880 } 1881 1882 // InsertMove implements backend.Machine. 
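// Integer values are moved with a 32- or 64-bit register-to-register MOV depending on the type,
// while F32/F64/V128 values use MOVSS/MOVSD/MOVDQA respectively. For example (illustrative only):
//
//	m.InsertMove(dst, src, ssa.TypeI64) // a 64-bit reg-to-reg mov of src into dst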
1883 func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { 1884 switch typ { 1885 case ssa.TypeI32, ssa.TypeI64: 1886 i := m.allocateInstr().asMovRR(src, dst, typ.Bits() == 64) 1887 m.insert(i) 1888 case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: 1889 var op sseOpcode 1890 switch typ { 1891 case ssa.TypeF32: 1892 op = sseOpcodeMovss 1893 case ssa.TypeF64: 1894 op = sseOpcodeMovsd 1895 case ssa.TypeV128: 1896 op = sseOpcodeMovdqa 1897 } 1898 i := m.allocateInstr().asXmmUnaryRmR(op, newOperandReg(src), dst) 1899 m.insert(i) 1900 default: 1901 panic("BUG") 1902 } 1903 } 1904 1905 // Format implements backend.Machine. 1906 func (m *machine) Format() string { 1907 ectx := m.ectx 1908 begins := map[*instruction]backend.Label{} 1909 for l, pos := range ectx.LabelPositions { 1910 begins[pos.Begin] = l 1911 } 1912 1913 irBlocks := map[backend.Label]ssa.BasicBlockID{} 1914 for i, l := range ectx.SsaBlockIDToLabels { 1915 irBlocks[l] = ssa.BasicBlockID(i) 1916 } 1917 1918 var lines []string 1919 for cur := ectx.RootInstr; cur != nil; cur = cur.next { 1920 if l, ok := begins[cur]; ok { 1921 var labelStr string 1922 if blkID, ok := irBlocks[l]; ok { 1923 labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID) 1924 } else { 1925 labelStr = fmt.Sprintf("%s:", l) 1926 } 1927 lines = append(lines, labelStr) 1928 } 1929 if cur.kind == nop0 { 1930 continue 1931 } 1932 lines = append(lines, "\t"+cur.String()) 1933 } 1934 for _, vc := range m.consts { 1935 if vc._var == nil { 1936 lines = append(lines, fmt.Sprintf("%s: const [%d %d]", vc.label.L, vc.lo, vc.hi)) 1937 } else { 1938 lines = append(lines, fmt.Sprintf("%s: const %#x", vc.label.L, vc._var)) 1939 } 1940 } 1941 return "\n" + strings.Join(lines, "\n") + "\n" 1942 } 1943 1944 func (m *machine) encodeWithoutSSA(root *instruction) { 1945 m.labelResolutionPends = m.labelResolutionPends[:0] 1946 ectx := m.ectx 1947 1948 bufPtr := m.c.BufPtr() 1949 for cur := root; cur != nil; cur = cur.next { 1950 offset := int64(len(*bufPtr)) 1951 if cur.kind == nop0 { 1952 l := cur.nop0Label() 1953 if pos, ok := ectx.LabelPositions[l]; ok { 1954 pos.BinaryOffset = offset 1955 } 1956 } 1957 1958 needLabelResolution := cur.encode(m.c) 1959 if needLabelResolution { 1960 m.labelResolutionPends = append(m.labelResolutionPends, 1961 labelResolutionPend{instr: cur, imm32Offset: int64(len(*bufPtr)) - 4}, 1962 ) 1963 } 1964 } 1965 1966 for i := range m.labelResolutionPends { 1967 p := &m.labelResolutionPends[i] 1968 switch p.instr.kind { 1969 case jmp, jmpIf, lea: 1970 target := p.instr.jmpLabel() 1971 targetOffset := ectx.LabelPositions[target].BinaryOffset 1972 imm32Offset := p.imm32Offset 1973 jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. 1974 binary.LittleEndian.PutUint32((*bufPtr)[imm32Offset:], uint32(jmpOffset)) 1975 default: 1976 panic("BUG") 1977 } 1978 } 1979 } 1980 1981 // Encode implements backend.Machine Encode. 
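// Encoding proceeds in three steps: (1) walk the ordered block labels and encode every
// instruction, recording the binary offset of each label as it is reached; (2) append the
// constant pool entries gathered during lowering (either a 16-byte lo/hi literal or a raw byte
// slice); (3) patch all pending label references. Jump-like references receive a RIP-relative
// imm32, e.g. (illustrative numbers only) a target at offset 0x40 referenced by an imm32 field
// ending at offset 0x14 is patched with 0x40 - 0x14 = 0x2c, while jump-table island entries are
// 8-byte offsets relative to the start of the table.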
1982 func (m *machine) Encode(ctx context.Context) (err error) { 1983 ectx := m.ectx 1984 bufPtr := m.c.BufPtr() 1985 1986 var fn string 1987 var fnIndex int 1988 var labelToSSABlockID map[backend.Label]ssa.BasicBlockID 1989 if wazevoapi.PerfMapEnabled { 1990 fn = wazevoapi.GetCurrentFunctionName(ctx) 1991 labelToSSABlockID = make(map[backend.Label]ssa.BasicBlockID) 1992 for i, l := range ectx.SsaBlockIDToLabels { 1993 labelToSSABlockID[l] = ssa.BasicBlockID(i) 1994 } 1995 fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) 1996 } 1997 1998 m.labelResolutionPends = m.labelResolutionPends[:0] 1999 for _, pos := range ectx.OrderedBlockLabels { 2000 offset := int64(len(*bufPtr)) 2001 pos.BinaryOffset = offset 2002 for cur := pos.Begin; cur != pos.End.next; cur = cur.next { 2003 offset := int64(len(*bufPtr)) 2004 2005 switch cur.kind { 2006 case nop0: 2007 l := cur.nop0Label() 2008 if pos, ok := ectx.LabelPositions[l]; ok { 2009 pos.BinaryOffset = offset 2010 } 2011 case sourceOffsetInfo: 2012 m.c.AddSourceOffsetInfo(offset, cur.sourceOffsetInfo()) 2013 } 2014 2015 needLabelResolution := cur.encode(m.c) 2016 if needLabelResolution { 2017 m.labelResolutionPends = append(m.labelResolutionPends, 2018 labelResolutionPend{instr: cur, instrOffset: offset, imm32Offset: int64(len(*bufPtr)) - 4}, 2019 ) 2020 } 2021 } 2022 2023 if wazevoapi.PerfMapEnabled { 2024 l := pos.L 2025 var labelStr string 2026 if blkID, ok := labelToSSABlockID[l]; ok { 2027 labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID) 2028 } else { 2029 labelStr = l.String() 2030 } 2031 size := int64(len(*bufPtr)) - offset 2032 wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr)) 2033 } 2034 } 2035 2036 for i := range m.consts { 2037 offset := int64(len(*bufPtr)) 2038 vc := &m.consts[i] 2039 vc.label.BinaryOffset = offset 2040 if vc._var == nil { 2041 lo, hi := vc.lo, vc.hi 2042 m.c.Emit8Bytes(lo) 2043 m.c.Emit8Bytes(hi) 2044 } else { 2045 for _, b := range vc._var { 2046 m.c.EmitByte(b) 2047 } 2048 } 2049 } 2050 2051 buf := *bufPtr 2052 for i := range m.labelResolutionPends { 2053 p := &m.labelResolutionPends[i] 2054 switch p.instr.kind { 2055 case jmp, jmpIf, lea, xmmUnaryRmR: 2056 target := p.instr.jmpLabel() 2057 targetOffset := ectx.LabelPositions[target].BinaryOffset 2058 imm32Offset := p.imm32Offset 2059 jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. 2060 binary.LittleEndian.PutUint32(buf[imm32Offset:], uint32(jmpOffset)) 2061 case jmpTableIsland: 2062 tableBegin := p.instrOffset 2063 // Each entry is the offset from the beginning of the jmpTableIsland instruction in 8 bytes. 2064 targets := m.jmpTableTargets[p.instr.u1] 2065 for i, l := range targets { 2066 targetOffset := ectx.LabelPositions[backend.Label(l)].BinaryOffset 2067 jmpOffset := targetOffset - tableBegin 2068 binary.LittleEndian.PutUint64(buf[tableBegin+int64(i)*8:], uint64(jmpOffset)) 2069 } 2070 default: 2071 panic("BUG") 2072 } 2073 } 2074 return 2075 } 2076 2077 // ResolveRelocations implements backend.Machine. 2078 func (m *machine) ResolveRelocations(refToBinaryOffset []int, binary []byte, relocations []backend.RelocationInfo, _ []int) { 2079 for _, r := range relocations { 2080 offset := r.Offset 2081 calleeFnOffset := refToBinaryOffset[r.FuncRef] 2082 // offset is the offset of the last 4 bytes of the call instruction. 
2083 callInstrOffsetBytes := binary[offset : offset+4] 2084 diff := int64(calleeFnOffset) - (offset + 4) // +4 because we want the offset of the next instruction (In x64, RIP always points to the next instruction). 2085 callInstrOffsetBytes[0] = byte(diff) 2086 callInstrOffsetBytes[1] = byte(diff >> 8) 2087 callInstrOffsetBytes[2] = byte(diff >> 16) 2088 callInstrOffsetBytes[3] = byte(diff >> 24) 2089 } 2090 } 2091 2092 // CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo. 2093 func (m *machine) CallTrampolineIslandInfo(_ int) (_, _ int, _ error) { return } 2094 2095 func (m *machine) lowerIcmpToFlag(xd, yd *backend.SSAValueDefinition, _64 bool) { 2096 x := m.getOperand_Reg(xd) 2097 y := m.getOperand_Mem_Imm32_Reg(yd) 2098 cmp := m.allocateInstr().asCmpRmiR(true, y, x.reg(), _64) 2099 m.insert(cmp) 2100 } 2101 2102 func (m *machine) lowerFcmpToFlags(instr *ssa.Instruction) (f1, f2 cond, and bool) { 2103 x, y, c := instr.FcmpData() 2104 switch c { 2105 case ssa.FloatCmpCondEqual: 2106 f1, f2 = condNP, condZ 2107 and = true 2108 case ssa.FloatCmpCondNotEqual: 2109 f1, f2 = condP, condNZ 2110 case ssa.FloatCmpCondLessThan: 2111 f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThan) 2112 f2 = condInvalid 2113 x, y = y, x 2114 case ssa.FloatCmpCondLessThanOrEqual: 2115 f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThanOrEqual) 2116 f2 = condInvalid 2117 x, y = y, x 2118 default: 2119 f1 = condFromSSAFloatCmpCond(c) 2120 f2 = condInvalid 2121 } 2122 2123 var opc sseOpcode 2124 if x.Type() == ssa.TypeF32 { 2125 opc = sseOpcodeUcomiss 2126 } else { 2127 opc = sseOpcodeUcomisd 2128 } 2129 2130 xr := m.getOperand_Reg(m.c.ValueDefinition(x)) 2131 yr := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) 2132 m.insert(m.allocateInstr().asXmmCmpRmR(opc, yr, xr.reg())) 2133 return 2134 } 2135 2136 // allocateInstr allocates an instruction. 
2137 func (m *machine) allocateInstr() *instruction { 2138 instr := m.ectx.InstructionPool.Allocate() 2139 if !m.regAllocStarted { 2140 instr.addedBeforeRegAlloc = true 2141 } 2142 return instr 2143 } 2144 2145 func (m *machine) allocateNop() *instruction { 2146 instr := m.allocateInstr() 2147 instr.kind = nop0 2148 return instr 2149 } 2150 2151 func (m *machine) insert(i *instruction) { 2152 ectx := m.ectx 2153 ectx.PendingInstructions = append(ectx.PendingInstructions, i) 2154 } 2155 2156 func (m *machine) allocateBrTarget() (nop *instruction, l backend.Label) { //nolint 2157 pos := m.allocateLabel() 2158 l = pos.L 2159 nop = m.allocateInstr() 2160 nop.asNop0WithLabel(l) 2161 pos.Begin, pos.End = nop, nop 2162 return 2163 } 2164 2165 func (m *machine) allocateLabel() *labelPosition { 2166 ectx := m.ectx 2167 l := ectx.AllocateLabel() 2168 pos := ectx.AllocateLabelPosition(l) 2169 ectx.LabelPositions[l] = pos 2170 return pos 2171 } 2172 2173 func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { 2174 offset, ok := m.spillSlots[id] 2175 if !ok { 2176 offset = m.spillSlotSize 2177 m.spillSlots[id] = offset 2178 m.spillSlotSize += int64(size) 2179 } 2180 return offset 2181 } 2182 2183 func (m *machine) copyTo(src regalloc.VReg, dst regalloc.VReg) { 2184 mov := m.allocateInstr() 2185 if src.RegType() == regalloc.RegTypeInt { 2186 mov.asMovRR(src, dst, true) 2187 } else { 2188 mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst) 2189 } 2190 m.insert(mov) 2191 } 2192 2193 func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg { 2194 typ := m.c.TypeOf(v) 2195 tmp := m.c.AllocateVReg(typ) 2196 m.copyTo(v, tmp) 2197 return tmp 2198 } 2199 2200 func (m *machine) requiredStackSize() int64 { 2201 return m.maxRequiredStackSizeForCalls + 2202 m.frameSize() + 2203 16 + // Need for stack checking. 2204 16 // return address and the caller RBP. 2205 } 2206 2207 func (m *machine) frameSize() int64 { 2208 s := m.clobberedRegSlotSize() + m.spillSlotSize 2209 if s&0xf != 0 { 2210 panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s)) 2211 } 2212 return s 2213 } 2214 2215 func (m *machine) clobberedRegSlotSize() int64 { 2216 return int64(len(m.clobberedRegs) * 16) 2217 } 2218 2219 func (m *machine) lowerIDivRem(si *ssa.Instruction, isDiv bool, signed bool) { 2220 x, y, execCtx := si.Arg3() 2221 2222 dividend := m.getOperand_Reg(m.c.ValueDefinition(x)) 2223 divisor := m.getOperand_Reg(m.c.ValueDefinition(y)) 2224 ctxVReg := m.c.VRegOf(execCtx) 2225 tmpGp := m.c.AllocateVReg(si.Return().Type()) 2226 2227 m.copyTo(dividend.reg(), raxVReg) 2228 m.insert(m.allocateInstr().asDefineUninitializedReg(rdxVReg)) 2229 m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) 2230 seq := m.allocateInstr().asIdivRemSequence(ctxVReg, divisor.reg(), tmpGp, isDiv, signed, x.Type().Bits() == 64) 2231 m.insert(seq) 2232 rd := m.c.VRegOf(si.Return()) 2233 if isDiv { 2234 m.copyTo(raxVReg, rd) 2235 } else { 2236 m.copyTo(rdxVReg, rd) 2237 } 2238 } 2239 2240 func (m *machine) lowerIDivRemSequenceAfterRegAlloc(i *instruction) { 2241 execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData() 2242 2243 dividend := raxVReg 2244 2245 // Ensure yr is not zero. 2246 test := m.allocateInstr() 2247 test.asCmpRmiR(false, newOperandReg(divisor), divisor, _64) 2248 m.insert(test) 2249 2250 jnz := m.allocateInstr() 2251 m.insert(jnz) 2252 2253 nz := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerDivisionByZero) 2254 2255 // If not zero, we can proceed with the division. 
2256 jnz.asJmpIf(condNZ, newOperandLabel(nz)) 2257 2258 var ifRemNeg1 *instruction 2259 if signed { 2260 var neg1 uint64 2261 if _64 { 2262 neg1 = 0xffffffffffffffff 2263 } else { 2264 neg1 = 0xffffffff 2265 } 2266 m.lowerIconst(tmpGp, neg1, _64) 2267 2268 if isDiv { 2269 // For signed division, we have to have branches for "math.MinInt{32,64} / -1" 2270 // case which results in the floating point exception via division error as 2271 // the resulting value exceeds the maximum of signed int. 2272 2273 // First, we check if the divisor is -1. 2274 cmp := m.allocateInstr() 2275 cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) 2276 m.insert(cmp) 2277 2278 ifNotNeg1 := m.allocateInstr() 2279 m.insert(ifNotNeg1) 2280 2281 var minInt uint64 2282 if _64 { 2283 minInt = 0x8000000000000000 2284 } else { 2285 minInt = 0x80000000 2286 } 2287 m.lowerIconst(tmpGp, minInt, _64) 2288 2289 // Next we check if the quotient is the most negative value for the signed integer, i.e. 2290 // if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively. 2291 cmp2 := m.allocateInstr() 2292 cmp2.asCmpRmiR(true, newOperandReg(tmpGp), dividend, _64) 2293 m.insert(cmp2) 2294 2295 ifNotMinInt := m.allocateInstr() 2296 m.insert(ifNotMinInt) 2297 2298 // Trap if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1), 2299 // as that is the overflow in division as the result becomes 2^31 which is larger than 2300 // the maximum of signed 32-bit int (2^31-1). 2301 end := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) 2302 ifNotNeg1.asJmpIf(condNZ, newOperandLabel(end)) 2303 ifNotMinInt.asJmpIf(condNZ, newOperandLabel(end)) 2304 } else { 2305 // If it is remainder, zeros DX register and compare the divisor to -1. 2306 xor := m.allocateInstr().asZeros(rdxVReg) 2307 m.insert(xor) 2308 2309 // We check if the divisor is -1. 2310 cmp := m.allocateInstr() 2311 cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) 2312 m.insert(cmp) 2313 2314 ifRemNeg1 = m.allocateInstr() 2315 m.insert(ifRemNeg1) 2316 } 2317 2318 // Sign-extend DX register to have 2*x.Type().Bits() dividend over DX and AX registers. 2319 sed := m.allocateInstr() 2320 sed.asSignExtendData(_64) 2321 m.insert(sed) 2322 } else { 2323 // Zeros DX register to have 2*x.Type().Bits() dividend over DX and AX registers. 2324 zeros := m.allocateInstr().asZeros(rdxVReg) 2325 m.insert(zeros) 2326 } 2327 2328 div := m.allocateInstr() 2329 div.asDiv(newOperandReg(divisor), signed, _64) 2330 m.insert(div) 2331 2332 nop, end := m.allocateBrTarget() 2333 m.insert(nop) 2334 // If we are compiling a Rem instruction, when the divisor is -1 we land at the end of the function. 
2335 if ifRemNeg1 != nil { 2336 ifRemNeg1.asJmpIf(condZ, newOperandLabel(end)) 2337 } 2338 } 2339 2340 func (m *machine) lowerRound(instr *ssa.Instruction, imm roundingMode) { 2341 x := instr.Arg() 2342 if !x.Type().IsFloat() { 2343 panic("BUG?") 2344 } 2345 var op sseOpcode 2346 if x.Type().Bits() == 64 { 2347 op = sseOpcodeRoundsd 2348 } else { 2349 op = sseOpcodeRoundss 2350 } 2351 2352 xDef := m.c.ValueDefinition(x) 2353 rm := m.getOperand_Mem_Reg(xDef) 2354 rd := m.c.VRegOf(instr.Return()) 2355 2356 xmm := m.allocateInstr().asXmmUnaryRmRImm(op, uint8(imm), rm, rd) 2357 m.insert(xmm) 2358 } 2359 2360 func (m *machine) lowerFminFmax(instr *ssa.Instruction) { 2361 x, y := instr.Arg2() 2362 if !x.Type().IsFloat() { 2363 panic("BUG?") 2364 } 2365 2366 _64 := x.Type().Bits() == 64 2367 isMin := instr.Opcode() == ssa.OpcodeFmin 2368 var minMaxOp sseOpcode 2369 2370 switch { 2371 case _64 && isMin: 2372 minMaxOp = sseOpcodeMinpd 2373 case _64 && !isMin: 2374 minMaxOp = sseOpcodeMaxpd 2375 case !_64 && isMin: 2376 minMaxOp = sseOpcodeMinps 2377 case !_64 && !isMin: 2378 minMaxOp = sseOpcodeMaxps 2379 } 2380 2381 xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) 2382 rm := m.getOperand_Reg(xDef) 2383 // We cannot ensure that y is aligned to 16 bytes, so we have to use it on reg. 2384 rn := m.getOperand_Reg(yDef) 2385 rd := m.c.VRegOf(instr.Return()) 2386 2387 tmp := m.copyToTmp(rm.reg()) 2388 2389 // Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case. 2390 cmp := m.allocateInstr() 2391 if _64 { 2392 cmp.asXmmCmpRmR(sseOpcodeUcomisd, rn, tmp) 2393 } else { 2394 cmp.asXmmCmpRmR(sseOpcodeUcomiss, rn, tmp) 2395 } 2396 m.insert(cmp) 2397 2398 // At this point, we have the three cases of conditional flags below 2399 // (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.) 2400 // 2401 // 1) Two values are NaN-free and different: All flags are cleared. 2402 // 2) Two values are NaN-free and equal: Only ZF flags is set. 2403 // 3) One of Two values is NaN: ZF, PF and CF flags are set. 2404 2405 // Jump instruction to handle 1) case by checking the ZF flag 2406 // as ZF is only set for 2) and 3) cases. 2407 nanFreeOrDiffJump := m.allocateInstr() 2408 m.insert(nanFreeOrDiffJump) 2409 2410 // Start handling 2) and 3). 2411 2412 // Jump if one of two values is NaN by checking the parity flag (PF). 2413 ifIsNan := m.allocateInstr() 2414 m.insert(ifIsNan) 2415 2416 // Start handling 2) NaN-free and equal. 2417 2418 // Before we exit this case, we have to ensure that positive zero (or negative zero for min instruction) is 2419 // returned if two values are positive and negative zeros. 2420 var op sseOpcode 2421 switch { 2422 case !_64 && isMin: 2423 op = sseOpcodeOrps 2424 case _64 && isMin: 2425 op = sseOpcodeOrpd 2426 case !_64 && !isMin: 2427 op = sseOpcodeAndps 2428 case _64 && !isMin: 2429 op = sseOpcodeAndpd 2430 } 2431 orAnd := m.allocateInstr() 2432 orAnd.asXmmRmR(op, rn, tmp) 2433 m.insert(orAnd) 2434 2435 // Done, jump to end. 2436 sameExitJump := m.allocateInstr() 2437 m.insert(sameExitJump) 2438 2439 // Start handling 3) either is NaN. 2440 isNanTarget, isNan := m.allocateBrTarget() 2441 m.insert(isNanTarget) 2442 ifIsNan.asJmpIf(condP, newOperandLabel(isNan)) 2443 2444 // We emit the ADD instruction to produce the NaN in tmp. 2445 add := m.allocateInstr() 2446 if _64 { 2447 add.asXmmRmR(sseOpcodeAddsd, rn, tmp) 2448 } else { 2449 add.asXmmRmR(sseOpcodeAddss, rn, tmp) 2450 } 2451 m.insert(add) 2452 2453 // Exit from the NaN case branch. 
2454 nanExitJmp := m.allocateInstr() 2455 m.insert(nanExitJmp) 2456 2457 // Start handling 1). 2458 doMinMaxTarget, doMinMax := m.allocateBrTarget() 2459 m.insert(doMinMaxTarget) 2460 nanFreeOrDiffJump.asJmpIf(condNZ, newOperandLabel(doMinMax)) 2461 2462 // Now handle the NaN-free and different values case. 2463 minMax := m.allocateInstr() 2464 minMax.asXmmRmR(minMaxOp, rn, tmp) 2465 m.insert(minMax) 2466 2467 endNop, end := m.allocateBrTarget() 2468 m.insert(endNop) 2469 nanExitJmp.asJmp(newOperandLabel(end)) 2470 sameExitJump.asJmp(newOperandLabel(end)) 2471 2472 m.copyTo(tmp, rd) 2473 } 2474 2475 func (m *machine) lowerFcopysign(instr *ssa.Instruction) { 2476 x, y := instr.Arg2() 2477 if !x.Type().IsFloat() { 2478 panic("BUG") 2479 } 2480 2481 _64 := x.Type().Bits() == 64 2482 2483 xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) 2484 rm := m.getOperand_Reg(xDef) 2485 rn := m.getOperand_Reg(yDef) 2486 rd := m.c.VRegOf(instr.Return()) 2487 2488 // Clear the non-sign bits of src via AND with the mask. 2489 var opAnd, opOr sseOpcode 2490 var signMask uint64 2491 if _64 { 2492 signMask, opAnd, opOr = 0x8000000000000000, sseOpcodeAndpd, sseOpcodeOrpd 2493 } else { 2494 signMask, opAnd, opOr = 0x80000000, sseOpcodeAndps, sseOpcodeOrps 2495 } 2496 2497 signBitReg := m.c.AllocateVReg(x.Type()) 2498 m.lowerFconst(signBitReg, signMask, _64) 2499 nonSignBitReg := m.c.AllocateVReg(x.Type()) 2500 m.lowerFconst(nonSignBitReg, ^signMask, _64) 2501 2502 // Extract the sign bits of rn. 2503 and := m.allocateInstr().asXmmRmR(opAnd, rn, signBitReg) 2504 m.insert(and) 2505 2506 // Clear the sign bit of dst via AND with the non-sign bit mask. 2507 xor := m.allocateInstr().asXmmRmR(opAnd, rm, nonSignBitReg) 2508 m.insert(xor) 2509 2510 // Copy the sign bits of src to dst via OR. 
2511 or := m.allocateInstr().asXmmRmR(opOr, newOperandReg(signBitReg), nonSignBitReg) 2512 m.insert(or) 2513 2514 m.copyTo(nonSignBitReg, rd) 2515 } 2516 2517 func (m *machine) lowerBitcast(instr *ssa.Instruction) { 2518 x, dstTyp := instr.BitcastData() 2519 srcTyp := x.Type() 2520 rn := m.getOperand_Reg(m.c.ValueDefinition(x)) 2521 rd := m.c.VRegOf(instr.Return()) 2522 switch { 2523 case srcTyp == ssa.TypeF32 && dstTyp == ssa.TypeI32: 2524 cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovd, rn.reg(), rd, false) 2525 m.insert(cvt) 2526 case srcTyp == ssa.TypeI32 && dstTyp == ssa.TypeF32: 2527 cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovd, rn, rd, false) 2528 m.insert(cvt) 2529 case srcTyp == ssa.TypeF64 && dstTyp == ssa.TypeI64: 2530 cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovq, rn.reg(), rd, true) 2531 m.insert(cvt) 2532 case srcTyp == ssa.TypeI64 && dstTyp == ssa.TypeF64: 2533 cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovq, rn, rd, true) 2534 m.insert(cvt) 2535 default: 2536 panic(fmt.Sprintf("invalid bitcast from %s to %s", srcTyp, dstTyp)) 2537 } 2538 } 2539 2540 func (m *machine) lowerFcvtToSint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { 2541 var tmpXmm regalloc.VReg 2542 if dst64 { 2543 tmpXmm = m.c.AllocateVReg(ssa.TypeF64) 2544 } else { 2545 tmpXmm = m.c.AllocateVReg(ssa.TypeF32) 2546 } 2547 2548 m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) 2549 tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) 2550 m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) 2551 m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) 2552 2553 m.insert(m.allocateFcvtToSintSequence(ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat)) 2554 m.copyTo(tmpGp, rd) 2555 } 2556 2557 func (m *machine) lowerFcvtToSintSequenceAfterRegalloc(i *instruction) { 2558 execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData() 2559 var cmpOp, truncOp sseOpcode 2560 if src64 { 2561 cmpOp, truncOp = sseOpcodeUcomisd, sseOpcodeCvttsd2si 2562 } else { 2563 cmpOp, truncOp = sseOpcodeUcomiss, sseOpcodeCvttss2si 2564 } 2565 2566 trunc := m.allocateInstr() 2567 trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) 2568 m.insert(trunc) 2569 2570 // Check if the dst operand was INT_MIN, by checking it against 1. 2571 cmp1 := m.allocateInstr() 2572 cmp1.asCmpRmiR(true, newOperandImm32(1), tmpGp, dst64) 2573 m.insert(cmp1) 2574 2575 // If no overflow, then we are done. 2576 doneTarget, done := m.allocateBrTarget() 2577 ifNoOverflow := m.allocateInstr() 2578 ifNoOverflow.asJmpIf(condNO, newOperandLabel(done)) 2579 m.insert(ifNoOverflow) 2580 2581 // Now, check for NaN. 2582 cmpNan := m.allocateInstr() 2583 cmpNan.asXmmCmpRmR(cmpOp, newOperandReg(src), src) 2584 m.insert(cmpNan) 2585 2586 // We allocate the "non-nan target" here, but we will insert it later. 2587 notNanTarget, notNaN := m.allocateBrTarget() 2588 ifNotNan := m.allocateInstr() 2589 ifNotNan.asJmpIf(condNP, newOperandLabel(notNaN)) 2590 m.insert(ifNotNan) 2591 2592 if sat { 2593 // If NaN and saturating, return 0. 2594 zeroDst := m.allocateInstr().asZeros(tmpGp) 2595 m.insert(zeroDst) 2596 2597 jmpEnd := m.allocateInstr() 2598 jmpEnd.asJmp(newOperandLabel(done)) 2599 m.insert(jmpEnd) 2600 2601 // Otherwise: 2602 m.insert(notNanTarget) 2603 2604 // Zero-out the tmp register. 
2605 zero := m.allocateInstr().asZeros(tmpXmm) 2606 m.insert(zero) 2607 2608 cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) 2609 m.insert(cmpXmm) 2610 2611 // if >= jump to end. 2612 jmpEnd2 := m.allocateInstr() 2613 jmpEnd2.asJmpIf(condB, newOperandLabel(done)) 2614 m.insert(jmpEnd2) 2615 2616 // Otherwise, saturate to INT_MAX. 2617 if dst64 { 2618 m.lowerIconst(tmpGp, math.MaxInt64, dst64) 2619 } else { 2620 m.lowerIconst(tmpGp, math.MaxInt32, dst64) 2621 } 2622 2623 } else { 2624 2625 // If non-sat, NaN, trap. 2626 m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) 2627 2628 // Otherwise, we will jump here. 2629 m.insert(notNanTarget) 2630 2631 // jump over trap if src larger than threshold 2632 condAboveThreshold := condNB 2633 2634 // The magic constants are various combination of minInt for int[32|64] represented as float[32|64]. 2635 var minInt uint64 2636 switch { 2637 case src64 && dst64: 2638 minInt = 0xc3e0000000000000 2639 case src64 && !dst64: 2640 condAboveThreshold = condNBE 2641 minInt = 0xC1E0_0000_0020_0000 2642 case !src64 && dst64: 2643 minInt = 0xDF00_0000 2644 case !src64 && !dst64: 2645 minInt = 0xCF00_0000 2646 } 2647 2648 loadToGP := m.allocateInstr().asImm(tmpGp2, minInt, src64) 2649 m.insert(loadToGP) 2650 2651 movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp2), tmpXmm, src64) 2652 m.insert(movToXmm) 2653 2654 cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) 2655 m.insert(cmpXmm) 2656 2657 jmpIfLarger := m.allocateInstr() 2658 checkPositiveTarget, checkPositive := m.allocateBrTarget() 2659 jmpIfLarger.asJmpIf(condAboveThreshold, newOperandLabel(checkPositive)) 2660 m.insert(jmpIfLarger) 2661 2662 m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) 2663 2664 // If positive, it was a real overflow. 2665 m.insert(checkPositiveTarget) 2666 2667 // Zero out the temp register. 2668 xorpd := m.allocateInstr() 2669 xorpd.asXmmRmR(sseOpcodeXorpd, newOperandReg(tmpXmm), tmpXmm) 2670 m.insert(xorpd) 2671 2672 pos := m.allocateInstr() 2673 pos.asXmmCmpRmR(cmpOp, newOperandReg(src), tmpXmm) 2674 m.insert(pos) 2675 2676 // If >= jump to end. 
2677 jmp := m.allocateInstr().asJmpIf(condNB, newOperandLabel(done)) 2678 m.insert(jmp) 2679 m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) 2680 } 2681 2682 m.insert(doneTarget) 2683 } 2684 2685 func (m *machine) lowerFcvtToUint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { 2686 tmpXmm, tmpXmm2 := m.c.AllocateVReg(ssa.TypeF64), m.c.AllocateVReg(ssa.TypeF64) 2687 m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) 2688 m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm2)) 2689 tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) 2690 m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) 2691 m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) 2692 2693 m.insert(m.allocateFcvtToUintSequence( 2694 ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat, 2695 )) 2696 m.copyTo(tmpGp, rd) 2697 } 2698 2699 func (m *machine) lowerFcvtToUintSequenceAfterRegalloc(i *instruction) { 2700 execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData() 2701 2702 var subOp, cmpOp, truncOp sseOpcode 2703 if src64 { 2704 subOp, cmpOp, truncOp = sseOpcodeSubsd, sseOpcodeUcomisd, sseOpcodeCvttsd2si 2705 } else { 2706 subOp, cmpOp, truncOp = sseOpcodeSubss, sseOpcodeUcomiss, sseOpcodeCvttss2si 2707 } 2708 2709 doneTarget, done := m.allocateBrTarget() 2710 2711 switch { 2712 case src64 && dst64: 2713 loadToGP := m.allocateInstr().asImm(tmpGp, 0x43e0000000000000, true) 2714 m.insert(loadToGP) 2715 movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) 2716 m.insert(movToXmm) 2717 case src64 && !dst64: 2718 loadToGP := m.allocateInstr().asImm(tmpGp, 0x41e0000000000000, true) 2719 m.insert(loadToGP) 2720 movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) 2721 m.insert(movToXmm) 2722 case !src64 && dst64: 2723 loadToGP := m.allocateInstr().asImm(tmpGp, 0x5f000000, false) 2724 m.insert(loadToGP) 2725 movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) 2726 m.insert(movToXmm) 2727 case !src64 && !dst64: 2728 loadToGP := m.allocateInstr().asImm(tmpGp, 0x4f000000, false) 2729 m.insert(loadToGP) 2730 movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) 2731 m.insert(movToXmm) 2732 } 2733 2734 cmp := m.allocateInstr() 2735 cmp.asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) 2736 m.insert(cmp) 2737 2738 // If above `tmp` ("large threshold"), jump to `ifAboveThreshold` 2739 ifAboveThresholdTarget, ifAboveThreshold := m.allocateBrTarget() 2740 jmpIfAboveThreshold := m.allocateInstr() 2741 jmpIfAboveThreshold.asJmpIf(condNB, newOperandLabel(ifAboveThreshold)) 2742 m.insert(jmpIfAboveThreshold) 2743 2744 ifNotNaNTarget, ifNotNaN := m.allocateBrTarget() 2745 jmpIfNotNaN := m.allocateInstr() 2746 jmpIfNotNaN.asJmpIf(condNP, newOperandLabel(ifNotNaN)) 2747 m.insert(jmpIfNotNaN) 2748 2749 // If NaN, handle the error condition. 2750 if sat { 2751 // On NaN, saturating, we just return 0. 2752 zeros := m.allocateInstr().asZeros(tmpGp) 2753 m.insert(zeros) 2754 2755 jmpEnd := m.allocateInstr() 2756 jmpEnd.asJmp(newOperandLabel(done)) 2757 m.insert(jmpEnd) 2758 } else { 2759 // On NaN, non-saturating, we trap. 2760 m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) 2761 } 2762 2763 // If not NaN, land here. 2764 m.insert(ifNotNaNTarget) 2765 2766 // Truncation happens here. 
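	// Strategy recap: CVTTSS2SI/CVTTSD2SI only produce signed results, so inputs below the
	// 2^(N-1) threshold loaded above are truncated directly here, while inputs at or above the
	// threshold are first reduced by it, truncated, and then have 2^(N-1) added back at the end
	// of this sequence.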
2767 2768 trunc := m.allocateInstr() 2769 trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) 2770 m.insert(trunc) 2771 2772 // Check if the result is negative. 2773 cmpNeg := m.allocateInstr() 2774 cmpNeg.asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) 2775 m.insert(cmpNeg) 2776 2777 // If non-neg, jump to end. 2778 jmpIfNonNeg := m.allocateInstr() 2779 jmpIfNonNeg.asJmpIf(condNL, newOperandLabel(done)) 2780 m.insert(jmpIfNonNeg) 2781 2782 if sat { 2783 // If the input was "small" (< 2**(width -1)), the only way to get an integer 2784 // overflow is because the input was too small: saturate to the min value, i.e. 0. 2785 zeros := m.allocateInstr().asZeros(tmpGp) 2786 m.insert(zeros) 2787 2788 jmpEnd := m.allocateInstr() 2789 jmpEnd.asJmp(newOperandLabel(done)) 2790 m.insert(jmpEnd) 2791 } else { 2792 // If not saturating, trap. 2793 m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) 2794 } 2795 2796 // If above the threshold, land here. 2797 m.insert(ifAboveThresholdTarget) 2798 2799 // tmpDiff := threshold - rn. 2800 copySrc := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), tmpXmm2) 2801 m.insert(copySrc) 2802 2803 sub := m.allocateInstr() 2804 sub.asXmmRmR(subOp, newOperandReg(tmpXmm), tmpXmm2) // must be -0x8000000000000000 2805 m.insert(sub) 2806 2807 trunc2 := m.allocateInstr() 2808 trunc2.asXmmToGpr(truncOp, tmpXmm2, tmpGp, dst64) 2809 m.insert(trunc2) 2810 2811 // Check if the result is negative. 2812 cmpNeg2 := m.allocateInstr().asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) 2813 m.insert(cmpNeg2) 2814 2815 ifNextLargeTarget, ifNextLarge := m.allocateBrTarget() 2816 jmpIfNextLarge := m.allocateInstr() 2817 jmpIfNextLarge.asJmpIf(condNL, newOperandLabel(ifNextLarge)) 2818 m.insert(jmpIfNextLarge) 2819 2820 if sat { 2821 // The input was "large" (>= maxInt), so the only way to get an integer 2822 // overflow is because the input was too large: saturate to the max value. 2823 var maxInt uint64 2824 if dst64 { 2825 maxInt = math.MaxUint64 2826 } else { 2827 maxInt = math.MaxUint32 2828 } 2829 m.lowerIconst(tmpGp, maxInt, dst64) 2830 2831 jmpToEnd := m.allocateInstr() 2832 jmpToEnd.asJmp(newOperandLabel(done)) 2833 m.insert(jmpToEnd) 2834 } else { 2835 // If not saturating, trap. 2836 m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) 2837 } 2838 2839 m.insert(ifNextLargeTarget) 2840 2841 var op operand 2842 if dst64 { 2843 m.lowerIconst(tmpGp2, 0x8000000000000000, true) 2844 op = newOperandReg(tmpGp2) 2845 } else { 2846 op = newOperandImm32(0x80000000) 2847 } 2848 2849 add := m.allocateInstr() 2850 add.asAluRmiR(aluRmiROpcodeAdd, op, tmpGp, dst64) 2851 m.insert(add) 2852 2853 m.insert(doneTarget) 2854 } 2855 2856 func (m *machine) lowerFcvtFromSint(rn, rd operand, src64, dst64 bool) { 2857 var op sseOpcode 2858 if dst64 { 2859 op = sseOpcodeCvtsi2sd 2860 } else { 2861 op = sseOpcodeCvtsi2ss 2862 } 2863 2864 trunc := m.allocateInstr() 2865 trunc.asGprToXmm(op, rn, rd.reg(), src64) 2866 m.insert(trunc) 2867 } 2868 2869 func (m *machine) lowerFcvtFromUint(rn, rd operand, src64, dst64 bool) { 2870 var op sseOpcode 2871 if dst64 { 2872 op = sseOpcodeCvtsi2sd 2873 } else { 2874 op = sseOpcodeCvtsi2ss 2875 } 2876 2877 // Src is 32 bit, then we just perform the conversion with 64 bit width. 2878 // 2879 // See the following link for why we use 64bit conversion for unsigned 32bit integer sources: 2880 // https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float. 
2881 // 2882 // Here's the summary: 2883 // >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float, 2884 // >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide 2885 // >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values, 2886 // >> which allows CVTSI2SS to be used after all. 2887 // 2888 if !src64 { 2889 // Before we convert, we have to clear the higher 32-bits of the 64-bit register 2890 // to get the correct result. 2891 tmp := m.c.AllocateVReg(ssa.TypeI32) 2892 m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, tmp)) 2893 m.insert(m.allocateInstr().asGprToXmm(op, newOperandReg(tmp), rd.reg(), true)) 2894 return 2895 } 2896 2897 // If uint64, we have to do a bit more work. 2898 endTarget, end := m.allocateBrTarget() 2899 2900 var tmpXmm regalloc.VReg 2901 if dst64 { 2902 tmpXmm = m.c.AllocateVReg(ssa.TypeF64) 2903 } else { 2904 tmpXmm = m.c.AllocateVReg(ssa.TypeF32) 2905 } 2906 2907 // Check if the most significant bit (sign bit) is set. 2908 test := m.allocateInstr() 2909 test.asCmpRmiR(false, rn, rn.reg(), src64) 2910 m.insert(test) 2911 2912 // Jump if the sign bit is set. 2913 ifSignTarget, ifSign := m.allocateBrTarget() 2914 jmpIfNeg := m.allocateInstr() 2915 jmpIfNeg.asJmpIf(condS, newOperandLabel(ifSign)) 2916 m.insert(jmpIfNeg) 2917 2918 // If the sign bit is not set, we could fit the unsigned int into float32/float64. 2919 // So, we convert it to float and emit jump instruction to exit from this branch. 2920 cvt := m.allocateInstr() 2921 cvt.asGprToXmm(op, rn, tmpXmm, src64) 2922 m.insert(cvt) 2923 2924 // We are done, jump to end. 2925 jmpEnd := m.allocateInstr() 2926 jmpEnd.asJmp(newOperandLabel(end)) 2927 m.insert(jmpEnd) 2928 2929 // Now handling the case where sign-bit is set. 2930 // We emit the following sequences: 2931 // mov %rn, %tmp 2932 // shr 1, %tmp 2933 // mov %rn, %tmp2 2934 // and 1, %tmp2 2935 // or %tmp2, %tmp 2936 // cvtsi2ss %tmp, %xmm0 2937 // addsd %xmm0, %xmm0 2938 m.insert(ifSignTarget) 2939 2940 tmp := m.copyToTmp(rn.reg()) 2941 shr := m.allocateInstr() 2942 shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(1), tmp, src64) 2943 m.insert(shr) 2944 2945 tmp2 := m.copyToTmp(rn.reg()) 2946 and := m.allocateInstr() 2947 and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, src64) 2948 m.insert(and) 2949 2950 or := m.allocateInstr() 2951 or.asAluRmiR(aluRmiROpcodeOr, newOperandReg(tmp2), tmp, src64) 2952 m.insert(or) 2953 2954 cvt2 := m.allocateInstr() 2955 cvt2.asGprToXmm(op, newOperandReg(tmp), tmpXmm, src64) 2956 m.insert(cvt2) 2957 2958 addsd := m.allocateInstr() 2959 if dst64 { 2960 addsd.asXmmRmR(sseOpcodeAddsd, newOperandReg(tmpXmm), tmpXmm) 2961 } else { 2962 addsd.asXmmRmR(sseOpcodeAddss, newOperandReg(tmpXmm), tmpXmm) 2963 } 2964 m.insert(addsd) 2965 2966 m.insert(endTarget) 2967 m.copyTo(tmpXmm, rd.reg()) 2968 } 2969 2970 func (m *machine) lowerVanyTrue(instr *ssa.Instruction) { 2971 x := instr.Arg() 2972 rm := m.getOperand_Reg(m.c.ValueDefinition(x)) 2973 rd := m.c.VRegOf(instr.Return()) 2974 2975 tmp := m.c.AllocateVReg(ssa.TypeI32) 2976 2977 cmp := m.allocateInstr() 2978 cmp.asXmmCmpRmR(sseOpcodePtest, rm, rm.reg()) 2979 m.insert(cmp) 2980 2981 setcc := m.allocateInstr() 2982 setcc.asSetcc(condNZ, tmp) 2983 m.insert(setcc) 2984 2985 // Clear the irrelevant bits. 
2986 and := m.allocateInstr() 2987 and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp, false) 2988 m.insert(and) 2989 2990 m.copyTo(tmp, rd) 2991 } 2992 2993 func (m *machine) lowerVallTrue(instr *ssa.Instruction) { 2994 x, lane := instr.ArgWithLane() 2995 var op sseOpcode 2996 switch lane { 2997 case ssa.VecLaneI8x16: 2998 op = sseOpcodePcmpeqb 2999 case ssa.VecLaneI16x8: 3000 op = sseOpcodePcmpeqw 3001 case ssa.VecLaneI32x4: 3002 op = sseOpcodePcmpeqd 3003 case ssa.VecLaneI64x2: 3004 op = sseOpcodePcmpeqq 3005 } 3006 rm := m.getOperand_Reg(m.c.ValueDefinition(x)) 3007 rd := m.c.VRegOf(instr.Return()) 3008 3009 tmp := m.c.AllocateVReg(ssa.TypeV128) 3010 3011 zeros := m.allocateInstr() 3012 zeros.asZeros(tmp) 3013 m.insert(zeros) 3014 3015 pcmp := m.allocateInstr() 3016 pcmp.asXmmRmR(op, rm, tmp) 3017 m.insert(pcmp) 3018 3019 test := m.allocateInstr() 3020 test.asXmmCmpRmR(sseOpcodePtest, newOperandReg(tmp), tmp) 3021 m.insert(test) 3022 3023 tmp2 := m.c.AllocateVReg(ssa.TypeI32) 3024 3025 setcc := m.allocateInstr() 3026 setcc.asSetcc(condZ, tmp2) 3027 m.insert(setcc) 3028 3029 // Clear the irrelevant bits. 3030 and := m.allocateInstr() 3031 and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, false) 3032 m.insert(and) 3033 3034 m.copyTo(tmp2, rd) 3035 } 3036 3037 func (m *machine) lowerVhighBits(instr *ssa.Instruction) { 3038 x, lane := instr.ArgWithLane() 3039 rm := m.getOperand_Reg(m.c.ValueDefinition(x)) 3040 rd := m.c.VRegOf(instr.Return()) 3041 switch lane { 3042 case ssa.VecLaneI8x16: 3043 mov := m.allocateInstr() 3044 mov.asXmmToGpr(sseOpcodePmovmskb, rm.reg(), rd, false) 3045 m.insert(mov) 3046 3047 case ssa.VecLaneI16x8: 3048 // When we have: 3049 // R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(v8)] 3050 // R2 = [R2(w1), R2(w2), R2(w3), R2(v4), R2(w5), R2(w6), R2(w7), R2(v8)] 3051 // where RX(wn) is n-th signed word (16-bit) of RX register, 3052 // 3053 // "PACKSSWB R1, R2" produces 3054 // R1 = [ 3055 // byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)), 3056 // byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)), 3057 // byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)), 3058 // byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)), 3059 // ] 3060 // where R1 is the destination register, and 3061 // byte_sat(w) = int8(w) if w fits as signed 8-bit, 3062 // 0x80 if w is less than 0x80 3063 // 0x7F if w is greater than 0x7f 3064 // 3065 // See https://www.felixcloutier.com/x86/packsswb:packssdw for detail. 3066 // 3067 // Therefore, v.register ends up having i-th and (i+8)-th bit set if i-th lane is negative (for i in 0..8). 3068 tmp := m.copyToTmp(rm.reg()) 3069 res := m.c.AllocateVReg(ssa.TypeI32) 3070 3071 pak := m.allocateInstr() 3072 pak.asXmmRmR(sseOpcodePacksswb, rm, tmp) 3073 m.insert(pak) 3074 3075 mov := m.allocateInstr() 3076 mov.asXmmToGpr(sseOpcodePmovmskb, tmp, res, false) 3077 m.insert(mov) 3078 3079 // Clear the higher bits than 8. 
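	// pmovmskb yielded a 16-bit mask in which the i-th and (i+8)-th bits are duplicates, so
	// shifting right by 8 collapses it into the 8-bit per-lane mask. For example (illustrative
	// only), lanes [-1, 0, 0, 0, 0, 0, 0, -1] give 0b10000001_10000001, which becomes
	// 0b10000001 after the shift.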
3080 shr := m.allocateInstr() 3081 shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(8), res, false) 3082 m.insert(shr) 3083 3084 m.copyTo(res, rd) 3085 3086 case ssa.VecLaneI32x4: 3087 mov := m.allocateInstr() 3088 mov.asXmmToGpr(sseOpcodeMovmskps, rm.reg(), rd, true) 3089 m.insert(mov) 3090 3091 case ssa.VecLaneI64x2: 3092 mov := m.allocateInstr() 3093 mov.asXmmToGpr(sseOpcodeMovmskpd, rm.reg(), rd, true) 3094 m.insert(mov) 3095 } 3096 } 3097 3098 func (m *machine) lowerVbnot(instr *ssa.Instruction) { 3099 x := instr.Arg() 3100 xDef := m.c.ValueDefinition(x) 3101 rm := m.getOperand_Reg(xDef) 3102 rd := m.c.VRegOf(instr.Return()) 3103 3104 tmp := m.copyToTmp(rm.reg()) 3105 tmp2 := m.c.AllocateVReg(ssa.TypeV128) 3106 3107 // Ensure tmp2 is considered defined by regalloc. 3108 m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) 3109 3110 // Set all bits on tmp register. 3111 pak := m.allocateInstr() 3112 pak.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp2), tmp2) 3113 m.insert(pak) 3114 3115 // Then XOR with tmp to reverse all bits on v.register. 3116 xor := m.allocateInstr() 3117 xor.asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp) 3118 m.insert(xor) 3119 3120 m.copyTo(tmp, rd) 3121 } 3122 3123 func (m *machine) lowerSplat(x, ret ssa.Value, lane ssa.VecLane) { 3124 tmpDst := m.c.AllocateVReg(ssa.TypeV128) 3125 m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) 3126 3127 switch lane { 3128 case ssa.VecLaneI8x16: 3129 tmp := m.c.AllocateVReg(ssa.TypeV128) 3130 m.insert(m.allocateInstr().asDefineUninitializedReg(tmp)) 3131 xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) 3132 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, xx, tmpDst)) 3133 m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp)) 3134 m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpDst)) 3135 case ssa.VecLaneI16x8: 3136 xx := m.getOperand_Reg(m.c.ValueDefinition(x)) 3137 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, xx, tmpDst)) 3138 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, xx, tmpDst)) 3139 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) 3140 case ssa.VecLaneI32x4: 3141 xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) 3142 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, xx, tmpDst)) 3143 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) 3144 case ssa.VecLaneI64x2: 3145 xx := m.getOperand_Reg(m.c.ValueDefinition(x)) 3146 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, xx, tmpDst)) 3147 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, xx, tmpDst)) 3148 case ssa.VecLaneF32x4: 3149 xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) 3150 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, 0, xx, tmpDst)) 3151 m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) 3152 case ssa.VecLaneF64x2: 3153 xx := m.getOperand_Reg(m.c.ValueDefinition(x)) 3154 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst)) 3155 m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, xx, tmpDst)) 3156 default: 3157 panic(fmt.Sprintf("invalid lane type: %s", lane)) 3158 } 3159 3160 m.copyTo(tmpDst, m.c.VRegOf(ret)) 3161 } 3162 3163 func (m *machine) lowerShuffle(x, y ssa.Value, lo, hi uint64, ret ssa.Value) { 3164 var xMask, yMask [2]uint64 3165 for i := 0; i < 8; i++ { 3166 loLane := byte(lo >> (i * 8)) 3167 if loLane < 16 { 3168 xMask[0] |= uint64(loLane) << (i * 8) 3169 
yMask[0] |= uint64(0x80) << (i * 8) 3170 } else { 3171 xMask[0] |= uint64(0x80) << (i * 8) 3172 yMask[0] |= uint64(loLane-16) << (i * 8) 3173 } 3174 hiLane := byte(hi >> (i * 8)) 3175 if hiLane < 16 { 3176 xMask[1] |= uint64(hiLane) << (i * 8) 3177 yMask[1] |= uint64(0x80) << (i * 8) 3178 } else { 3179 xMask[1] |= uint64(0x80) << (i * 8) 3180 yMask[1] |= uint64(hiLane-16) << (i * 8) 3181 } 3182 } 3183 3184 xmaskLabel := m.allocateLabel() 3185 m.consts = append(m.consts, _const{lo: xMask[0], hi: xMask[1], label: xmaskLabel}) 3186 ymaskLabel := m.allocateLabel() 3187 m.consts = append(m.consts, _const{lo: yMask[0], hi: yMask[1], label: ymaskLabel}) 3188 3189 xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) 3190 tmpX, tmpY := m.copyToTmp(xx.reg()), m.copyToTmp(yy.reg()) 3191 3192 // Apply mask to X. 3193 tmp := m.c.AllocateVReg(ssa.TypeV128) 3194 loadMaskLo := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(xmaskLabel.L)), tmp) 3195 m.insert(loadMaskLo) 3196 m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpX)) 3197 3198 // Apply mask to Y. 3199 loadMaskHi := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(ymaskLabel.L)), tmp) 3200 m.insert(loadMaskHi) 3201 m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpY)) 3202 3203 // Combine the results. 3204 m.insert(m.allocateInstr().asXmmRmR(sseOpcodeOrps, newOperandReg(tmpX), tmpY)) 3205 3206 m.copyTo(tmpY, m.c.VRegOf(ret)) 3207 } 3208 3209 func (m *machine) lowerVbBinOpUnaligned(op sseOpcode, x, y, ret ssa.Value) { 3210 rn := m.getOperand_Reg(m.c.ValueDefinition(x)) 3211 rm := m.getOperand_Reg(m.c.ValueDefinition(y)) 3212 rd := m.c.VRegOf(ret) 3213 3214 tmp := m.copyToTmp(rn.reg()) 3215 3216 binOp := m.allocateInstr() 3217 binOp.asXmmRmR(op, rm, tmp) 3218 m.insert(binOp) 3219 3220 m.copyTo(tmp, rd) 3221 } 3222 3223 func (m *machine) lowerVbBinOp(op sseOpcode, x, y, ret ssa.Value) { 3224 rn := m.getOperand_Reg(m.c.ValueDefinition(x)) 3225 rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) 3226 rd := m.c.VRegOf(ret) 3227 3228 tmp := m.copyToTmp(rn.reg()) 3229 3230 binOp := m.allocateInstr() 3231 binOp.asXmmRmR(op, rm, tmp) 3232 m.insert(binOp) 3233 3234 m.copyTo(tmp, rd) 3235 } 3236 3237 func (m *machine) lowerVFcmp(x, y ssa.Value, c ssa.FloatCmpCond, ret ssa.Value, lane ssa.VecLane) { 3238 var cmpOp sseOpcode 3239 switch lane { 3240 case ssa.VecLaneF32x4: 3241 cmpOp = sseOpcodeCmpps 3242 case ssa.VecLaneF64x2: 3243 cmpOp = sseOpcodeCmppd 3244 default: 3245 panic(fmt.Sprintf("invalid lane type: %s", lane)) 3246 } 3247 3248 xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) 3249 var cmpImm cmpPred 3250 switch c { 3251 case ssa.FloatCmpCondGreaterThan: 3252 yy, xx = xx, yy 3253 cmpImm = cmpPredLT_OS 3254 case ssa.FloatCmpCondGreaterThanOrEqual: 3255 yy, xx = xx, yy 3256 cmpImm = cmpPredLE_OS 3257 case ssa.FloatCmpCondEqual: 3258 cmpImm = cmpPredEQ_OQ 3259 case ssa.FloatCmpCondNotEqual: 3260 cmpImm = cmpPredNEQ_UQ 3261 case ssa.FloatCmpCondLessThan: 3262 cmpImm = cmpPredLT_OS 3263 case ssa.FloatCmpCondLessThanOrEqual: 3264 cmpImm = cmpPredLE_OS 3265 default: 3266 panic(fmt.Sprintf("invalid float comparison condition: %s", c)) 3267 } 3268 3269 tmp := m.c.AllocateVReg(ssa.TypeV128) 3270 xxx := m.getOperand_Mem_Reg(xx) 3271 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, xxx, tmp)) 3272 3273 rm := m.getOperand_Mem_Reg(yy) 3274 m.insert(m.allocateInstr().asXmmRmRImm(cmpOp, 
byte(cmpImm), rm, tmp)) 3275 3276 m.copyTo(tmp, m.c.VRegOf(ret)) 3277 } 3278 3279 func (m *machine) lowerVIcmp(x, y ssa.Value, c ssa.IntegerCmpCond, ret ssa.Value, lane ssa.VecLane) { 3280 var eq, gt, maxu, minu, mins sseOpcode 3281 switch lane { 3282 case ssa.VecLaneI8x16: 3283 eq, gt, maxu, minu, mins = sseOpcodePcmpeqb, sseOpcodePcmpgtb, sseOpcodePmaxub, sseOpcodePminub, sseOpcodePminsb 3284 case ssa.VecLaneI16x8: 3285 eq, gt, maxu, minu, mins = sseOpcodePcmpeqw, sseOpcodePcmpgtw, sseOpcodePmaxuw, sseOpcodePminuw, sseOpcodePminsw 3286 case ssa.VecLaneI32x4: 3287 eq, gt, maxu, minu, mins = sseOpcodePcmpeqd, sseOpcodePcmpgtd, sseOpcodePmaxud, sseOpcodePminud, sseOpcodePminsd 3288 case ssa.VecLaneI64x2: 3289 eq, gt = sseOpcodePcmpeqq, sseOpcodePcmpgtq 3290 default: 3291 panic(fmt.Sprintf("invalid lane type: %s", lane)) 3292 } 3293 3294 tmp := m.c.AllocateVReg(ssa.TypeV128) 3295 var op operand 3296 switch c { 3297 case ssa.IntegerCmpCondSignedLessThanOrEqual: 3298 if lane == ssa.VecLaneI64x2 { 3299 x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) 3300 // Copy x to tmp. 3301 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) 3302 op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) 3303 } else { 3304 y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) 3305 // Copy y to tmp. 3306 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) 3307 op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) 3308 } 3309 case ssa.IntegerCmpCondSignedGreaterThanOrEqual: 3310 if lane == ssa.VecLaneI64x2 { 3311 y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) 3312 // Copy y to tmp. 3313 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) 3314 op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) 3315 } else { 3316 x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) 3317 // Copy x to tmp. 3318 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) 3319 op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) 3320 } 3321 case ssa.IntegerCmpCondSignedLessThan, ssa.IntegerCmpCondUnsignedLessThan, ssa.IntegerCmpCondUnsignedLessThanOrEqual: 3322 y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) 3323 // Copy y to tmp. 3324 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) 3325 op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) 3326 default: 3327 x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) 3328 // Copy x to tmp. 3329 m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) 3330 op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) 3331 } 3332 3333 switch c { 3334 case ssa.IntegerCmpCondEqual: 3335 m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) 3336 case ssa.IntegerCmpCondNotEqual: 3337 // First we compare for equality. 3338 m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) 3339 // Then flip the bits. To do so, we set all bits on tmp2. 3340 tmp2 := m.c.AllocateVReg(ssa.TypeV128) 3341 m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) 3342 m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) 3343 // And then xor with tmp. 3344 m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) 3345 case ssa.IntegerCmpCondSignedGreaterThan, ssa.IntegerCmpCondSignedLessThan: 3346 m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) 3347 case ssa.IntegerCmpCondSignedGreaterThanOrEqual, ssa.IntegerCmpCondSignedLessThanOrEqual: 3348 if lane == ssa.VecLaneI64x2 { 3349 m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) 3350 // Then flip the bits. To do so, we set all bits on tmp2. 
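	// (Depending on which operand was copied into tmp above, pcmpgtq computed either x > y
	// (for the <= case) or y > x (for the >= case); inverting that result yields the inclusive
	// comparison, since x <= y == !(x > y) and x >= y == !(y > x).)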
3351 tmp2 := m.c.AllocateVReg(ssa.TypeV128) 3352 m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) 3353 m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) 3354 // And then xor with tmp. 3355 m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) 3356 } else { 3357 // First take min of x and y. 3358 m.insert(m.allocateInstr().asXmmRmR(mins, op, tmp)) 3359 // Then compare for equality. 3360 m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) 3361 } 3362 case ssa.IntegerCmpCondUnsignedGreaterThan, ssa.IntegerCmpCondUnsignedLessThan: 3363 // First maxu of x and y. 3364 m.insert(m.allocateInstr().asXmmRmR(maxu, op, tmp)) 3365 // Then compare for equality. 3366 m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) 3367 // Then flip the bits. To do so, we set all bits on tmp2. 3368 tmp2 := m.c.AllocateVReg(ssa.TypeV128) 3369 m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) 3370 m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) 3371 // And then xor with tmp. 3372 m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) 3373 case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual, ssa.IntegerCmpCondUnsignedLessThanOrEqual: 3374 m.insert(m.allocateInstr().asXmmRmR(minu, op, tmp)) 3375 m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) 3376 default: 3377 panic("BUG") 3378 } 3379 3380 m.copyTo(tmp, m.c.VRegOf(ret)) 3381 } 3382 3383 func (m *machine) lowerVbandnot(instr *ssa.Instruction, op sseOpcode) { 3384 x, y := instr.Arg2() 3385 xDef := m.c.ValueDefinition(x) 3386 yDef := m.c.ValueDefinition(y) 3387 rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef) 3388 rd := m.c.VRegOf(instr.Return()) 3389 3390 tmp := m.copyToTmp(rn.reg()) 3391 3392 // pandn between rn, rm. 3393 pand := m.allocateInstr() 3394 pand.asXmmRmR(sseOpcodePandn, rm, tmp) 3395 m.insert(pand) 3396 3397 m.copyTo(tmp, rd) 3398 } 3399 3400 func (m *machine) lowerVbitselect(instr *ssa.Instruction) { 3401 c, x, y := instr.SelectData() 3402 xDef := m.c.ValueDefinition(x) 3403 yDef := m.c.ValueDefinition(y) 3404 rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef) 3405 creg := m.getOperand_Reg(m.c.ValueDefinition(c)) 3406 rd := m.c.VRegOf(instr.Return()) 3407 3408 tmpC := m.copyToTmp(creg.reg()) 3409 tmpX := m.copyToTmp(rm.reg()) 3410 3411 // And between c, x (overwrites x). 3412 pand := m.allocateInstr() 3413 pand.asXmmRmR(sseOpcodePand, creg, tmpX) 3414 m.insert(pand) 3415 3416 // Andn between y, c (overwrites c). 
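	// Combined with the preceding PAND and the POR below, this computes (c & x) | (^c & y):
	// bits where the mask c is set come from x, the rest from y, which is the bitselect
	// semantics.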
3417 pandn := m.allocateInstr() 3418 pandn.asXmmRmR(sseOpcodePandn, rn, tmpC) 3419 m.insert(pandn) 3420 3421 por := m.allocateInstr() 3422 por.asXmmRmR(sseOpcodePor, newOperandReg(tmpC), tmpX) 3423 m.insert(por) 3424 3425 m.copyTo(tmpX, rd) 3426 } 3427 3428 func (m *machine) lowerVFmin(instr *ssa.Instruction) { 3429 x, y, lane := instr.Arg2WithLane() 3430 rn := m.getOperand_Reg(m.c.ValueDefinition(x)) 3431 rm := m.getOperand_Reg(m.c.ValueDefinition(y)) 3432 rd := m.c.VRegOf(instr.Return()) 3433 3434 var min, cmp, andn, or, srl /* shift right logical */ sseOpcode 3435 var shiftNumToInverseNaN uint32 3436 if lane == ssa.VecLaneF32x4 { 3437 min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodePsrld, 0xa 3438 } else { 3439 min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodePsrlq, 0xd 3440 } 3441 3442 tmp1 := m.copyToTmp(rn.reg()) 3443 tmp2 := m.copyToTmp(rm.reg()) 3444 3445 // tmp1=min(rn, rm) 3446 minIns1 := m.allocateInstr() 3447 minIns1.asXmmRmR(min, rn, tmp2) 3448 m.insert(minIns1) 3449 3450 // tmp2=min(rm, rn) 3451 minIns2 := m.allocateInstr() 3452 minIns2.asXmmRmR(min, rm, tmp1) 3453 m.insert(minIns2) 3454 3455 // tmp3:=tmp1=min(rn, rm) 3456 tmp3 := m.copyToTmp(tmp1) 3457 3458 // tmp1 = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN 3459 // NaN if rn == NaN || rm == NaN 3460 // min(rm, rm) otherwise 3461 orIns := m.allocateInstr() 3462 orIns.asXmmRmR(or, newOperandReg(tmp2), tmp1) 3463 m.insert(orIns) 3464 3465 // tmp3 is originally min(rn,rm). 3466 // tmp3 = 0^ (set all bits) if rn == NaN || rm == NaN 3467 // 0 otherwise 3468 cmpIns := m.allocateInstr() 3469 cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp2), tmp3) 3470 m.insert(cmpIns) 3471 3472 // tmp1 = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN 3473 // ^0 if rn == NaN || rm == NaN 3474 // min(v1, v2) otherwise 3475 orIns2 := m.allocateInstr() 3476 orIns2.asXmmRmR(or, newOperandReg(tmp3), tmp1) 3477 m.insert(orIns2) 3478 3479 // tmp3 = set all bits on the mantissa bits 3480 // 0 otherwise 3481 shift := m.allocateInstr() 3482 shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp3) 3483 m.insert(shift) 3484 3485 // tmp3 = tmp1 and !tmp3 3486 // = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN 3487 // set all bits on exponential and sign bit (== NaN) if rn == NaN || rm == NaN 3488 // min(rn, rm) otherwise 3489 andnIns := m.allocateInstr() 3490 andnIns.asXmmRmR(andn, newOperandReg(tmp1), tmp3) 3491 m.insert(andnIns) 3492 3493 m.copyTo(tmp3, rd) 3494 } 3495 3496 func (m *machine) lowerVFmax(instr *ssa.Instruction) { 3497 x, y, lane := instr.Arg2WithLane() 3498 rn := m.getOperand_Reg(m.c.ValueDefinition(x)) 3499 rm := m.getOperand_Reg(m.c.ValueDefinition(y)) 3500 rd := m.c.VRegOf(instr.Return()) 3501 3502 var max, cmp, andn, or, xor, sub, srl /* shift right logical */ sseOpcode 3503 var shiftNumToInverseNaN uint32 3504 if lane == ssa.VecLaneF32x4 { 3505 max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodeXorps, sseOpcodeSubps, sseOpcodePsrld, 0xa 3506 } else { 3507 max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodeXorpd, sseOpcodeSubpd, sseOpcodePsrlq, 0xd 3508 } 3509 3510 tmp0 := m.copyToTmp(rm.reg()) 3511 tmp1 := m.copyToTmp(rn.reg()) 3512 3513 // tmp0=max(rn, rm) 3514 maxIns1 := 
m.allocateInstr() 3515 maxIns1.asXmmRmR(max, rn, tmp0) 3516 m.insert(maxIns1) 3517 3518 // tmp1=max(rm, rn) 3519 maxIns2 := m.allocateInstr() 3520 maxIns2.asXmmRmR(max, rm, tmp1) 3521 m.insert(maxIns2) 3522 3523 // tmp2=max(rm, rn) 3524 tmp2 := m.copyToTmp(tmp1) 3525 3526 // tmp2 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) 3527 // 0 if (rn == 0 && rm == 0) 3528 // -0 if (rn == -0 && rm == -0) 3529 // v1^v2 if rn == NaN || rm == NaN 3530 // 0 otherwise 3531 xorInstr := m.allocateInstr() 3532 xorInstr.asXmmRmR(xor, newOperandReg(tmp0), tmp2) 3533 m.insert(xorInstr) 3534 // tmp1 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) 3535 // 0 if (rn == 0 && rm == 0) 3536 // -0 if (rn == -0 && rm == -0) 3537 // NaN if rn == NaN || rm == NaN 3538 // max(v1, v2) otherwise 3539 orInstr := m.allocateInstr() 3540 orInstr.asXmmRmR(or, newOperandReg(tmp2), tmp1) 3541 m.insert(orInstr) 3542 3543 tmp3 := m.copyToTmp(tmp1) 3544 3545 // tmp3 = 0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) || (rn == 0 && rm == 0) 3546 // -0 if (rn == -0 && rm == -0) 3547 // NaN if rn == NaN || rm == NaN 3548 // max(v1, v2) otherwise 3549 // 3550 // Note: -0 - (-0) = 0 (!= -0) in floating point operation. 3551 subIns := m.allocateInstr() 3552 subIns.asXmmRmR(sub, newOperandReg(tmp2), tmp3) 3553 m.insert(subIns) 3554 3555 // tmp1 = 0^ if rn == NaN || rm == NaN 3556 cmpIns := m.allocateInstr() 3557 cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp1), tmp1) 3558 m.insert(cmpIns) 3559 3560 // tmp1 = set all bits on the mantissa bits 3561 // 0 otherwise 3562 shift := m.allocateInstr() 3563 shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp1) 3564 m.insert(shift) 3565 3566 andnIns := m.allocateInstr() 3567 andnIns.asXmmRmR(andn, newOperandReg(tmp3), tmp1) 3568 m.insert(andnIns) 3569 3570 m.copyTo(tmp1, rd) 3571 } 3572 3573 func (m *machine) lowerVFabs(instr *ssa.Instruction) { 3574 x, lane := instr.ArgWithLane() 3575 rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) 3576 rd := m.c.VRegOf(instr.Return()) 3577 3578 tmp := m.c.AllocateVReg(ssa.TypeV128) 3579 3580 def := m.allocateInstr() 3581 def.asDefineUninitializedReg(tmp) 3582 m.insert(def) 3583 3584 // Set all bits on tmp. 3585 pcmp := m.allocateInstr() 3586 pcmp.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp) 3587 m.insert(pcmp) 3588 3589 switch lane { 3590 case ssa.VecLaneF32x4: 3591 // Shift right packed single floats by 1 to clear the sign bits. 3592 shift := m.allocateInstr() 3593 shift.asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp) 3594 m.insert(shift) 3595 // Clear the sign bit of rm. 3596 andp := m.allocateInstr() 3597 andp.asXmmRmR(sseOpcodeAndpd, rm, tmp) 3598 m.insert(andp) 3599 case ssa.VecLaneF64x2: 3600 // Shift right packed single floats by 1 to clear the sign bits. 3601 shift := m.allocateInstr() 3602 shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(1), tmp) 3603 m.insert(shift) 3604 // Clear the sign bit of rm. 3605 andp := m.allocateInstr() 3606 andp.asXmmRmR(sseOpcodeAndps, rm, tmp) 3607 m.insert(andp) 3608 } 3609 3610 m.copyTo(tmp, rd) 3611 }
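// The following standalone helper is an illustrative sketch only (it is not part of the
// original file, and the name shuffleMasks is made up): it mirrors the mask construction in
// lowerShuffle above. Selector lanes that refer to the first vector keep their index in the
// x-mask and get 0x80 (which makes PSHUFB zero that byte) in the y-mask, and vice versa for
// lanes referring to the second vector, so OR-ing the two PSHUFB results reassembles the
// shuffled vector.
func shuffleMasks(lo, hi uint64) (xMask, yMask [2]uint64) {
	for i := 0; i < 8; i++ {
		loLane, hiLane := byte(lo>>(i*8)), byte(hi>>(i*8))
		if loLane < 16 {
			xMask[0] |= uint64(loLane) << (i * 8)
			yMask[0] |= uint64(0x80) << (i * 8)
		} else {
			xMask[0] |= uint64(0x80) << (i * 8)
			yMask[0] |= uint64(loLane-16) << (i * 8)
		}
		if hiLane < 16 {
			xMask[1] |= uint64(hiLane) << (i * 8)
			yMask[1] |= uint64(0x80) << (i * 8)
		} else {
			xMask[1] |= uint64(0x80) << (i * 8)
			yMask[1] |= uint64(hiLane-16) << (i * 8)
		}
	}
	return
}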