github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/amd64/machine.go

github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/amd64/machine.go (about)

     1  package amd64
     2  
     3  import (
     4  	"context"
     5  	"encoding/binary"
     6  	"fmt"
     7  	"math"
     8  	"strings"
     9  
    10  	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
    11  	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
    12  	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
    13  	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
    14  	"github.com/tetratelabs/wazero/internal/platform"
    15  )
    16  
    17  // NewBackend returns a new backend for arm64.
    18  func NewBackend() backend.Machine {
    19  	ectx := backend.NewExecutableContextT[instruction](
    20  		resetInstruction,
    21  		setNext,
    22  		setPrev,
    23  		asNop,
    24  	)
    25  	return &machine{
    26  		ectx:                                ectx,
    27  		cpuFeatures:                         platform.CpuFeatures,
    28  		regAlloc:                            regalloc.NewAllocator(regInfo),
    29  		spillSlots:                          map[regalloc.VRegID]int64{},
    30  		amodePool:                           wazevoapi.NewPool[amode](nil),
    31  		constSwizzleMaskConstIndex:          -1,
    32  		constSqmulRoundSatIndex:             -1,
    33  		constI8x16SHLMaskTableIndex:         -1,
    34  		constI8x16LogicalSHRMaskTableIndex:  -1,
    35  		constF64x2CvtFromIMaskIndex:         -1,
    36  		constTwop52Index:                    -1,
    37  		constI32sMaxOnF64x2Index:            -1,
    38  		constI32uMaxOnF64x2Index:            -1,
    39  		constAllOnesI8x16Index:              -1,
    40  		constAllOnesI16x8Index:              -1,
    41  		constExtAddPairwiseI16x8uMask1Index: -1,
    42  		constExtAddPairwiseI16x8uMask2Index: -1,
    43  	}
    44  }
    45  
    46  type (
    47  	// machine implements backend.Machine for amd64.
    48  	machine struct {
    49  		c                        backend.Compiler
    50  		ectx                     *backend.ExecutableContextT[instruction]
    51  		stackBoundsCheckDisabled bool
    52  
    53  		amodePool wazevoapi.Pool[amode]
    54  
    55  		cpuFeatures platform.CpuFeatureFlags
    56  
    57  		regAlloc        regalloc.Allocator
    58  		regAllocFn      *backend.RegAllocFunction[*instruction, *machine]
    59  		regAllocStarted bool
    60  
    61  		spillSlotSize int64
    62  		spillSlots    map[regalloc.VRegID]int64
    63  		currentABI    *backend.FunctionABI
    64  		clobberedRegs []regalloc.VReg
    65  
    66  		maxRequiredStackSizeForCalls int64
    67  
    68  		labelResolutionPends []labelResolutionPend
    69  
    70  		jmpTableTargets [][]uint32
    71  		consts          []_const
    72  
    73  		constSwizzleMaskConstIndex, constSqmulRoundSatIndex,
    74  		constI8x16SHLMaskTableIndex, constI8x16LogicalSHRMaskTableIndex,
    75  		constF64x2CvtFromIMaskIndex, constTwop52Index,
    76  		constI32sMaxOnF64x2Index, constI32uMaxOnF64x2Index,
    77  		constAllOnesI8x16Index, constAllOnesI16x8Index,
    78  		constExtAddPairwiseI16x8uMask1Index, constExtAddPairwiseI16x8uMask2Index int
    79  	}
    80  
    81  	_const struct {
    82  		lo, hi uint64
    83  		_var   []byte
    84  		label  *labelPosition
    85  	}
    86  
    87  	labelResolutionPend struct {
    88  		instr       *instruction
    89  		instrOffset int64
    90  		// imm32Offset is the offset of the last 4 bytes of the instruction.
    91  		imm32Offset int64
    92  	}
    93  
    94  	labelPosition = backend.LabelPosition[instruction]
    95  )
    96  
    97  func (m *machine) getOrAllocateConstLabel(i *int, _var []byte) backend.Label {
    98  	index := *i
    99  	if index == -1 {
   100  		label := m.allocateLabel()
   101  		index = len(m.consts)
   102  		m.consts = append(m.consts, _const{
   103  			_var:  _var,
   104  			label: label,
   105  		})
   106  		*i = index
   107  	}
   108  	return m.consts[index].label.L
   109  }
   110  
   111  // Reset implements backend.Machine.
   112  func (m *machine) Reset() {
   113  	m.consts = m.consts[:0]
   114  	m.clobberedRegs = m.clobberedRegs[:0]
   115  	for key := range m.spillSlots {
   116  		m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key))
   117  	}
   118  	for _, key := range m.clobberedRegs {
   119  		delete(m.spillSlots, regalloc.VRegID(key))
   120  	}
   121  
   122  	m.stackBoundsCheckDisabled = false
   123  	m.ectx.Reset()
   124  
   125  	m.regAllocFn.Reset()
   126  	m.regAlloc.Reset()
   127  	m.regAllocStarted = false
   128  	m.clobberedRegs = m.clobberedRegs[:0]
   129  
   130  	m.spillSlotSize = 0
   131  	m.maxRequiredStackSizeForCalls = 0
   132  
   133  	m.amodePool.Reset()
   134  	m.jmpTableTargets = m.jmpTableTargets[:0]
   135  	m.constSwizzleMaskConstIndex = -1
   136  	m.constSqmulRoundSatIndex = -1
   137  	m.constI8x16SHLMaskTableIndex = -1
   138  	m.constI8x16LogicalSHRMaskTableIndex = -1
   139  	m.constF64x2CvtFromIMaskIndex = -1
   140  	m.constTwop52Index = -1
   141  	m.constI32sMaxOnF64x2Index = -1
   142  	m.constI32uMaxOnF64x2Index = -1
   143  	m.constAllOnesI8x16Index = -1
   144  	m.constAllOnesI16x8Index = -1
   145  	m.constExtAddPairwiseI16x8uMask1Index = -1
   146  	m.constExtAddPairwiseI16x8uMask2Index = -1
   147  }
   148  
   149  // ExecutableContext implements backend.Machine.
   150  func (m *machine) ExecutableContext() backend.ExecutableContext { return m.ectx }
   151  
   152  // DisableStackCheck implements backend.Machine.
   153  func (m *machine) DisableStackCheck() { m.stackBoundsCheckDisabled = true }
   154  
   155  // SetCompiler implements backend.Machine.
   156  func (m *machine) SetCompiler(c backend.Compiler) {
   157  	m.c = c
   158  	m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, c.SSABuilder(), c)
   159  }
   160  
   161  // SetCurrentABI implements backend.Machine.
   162  func (m *machine) SetCurrentABI(abi *backend.FunctionABI) {
   163  	m.currentABI = abi
   164  }
   165  
   166  // RegAlloc implements backend.Machine.
   167  func (m *machine) RegAlloc() {
   168  	rf := m.regAllocFn
   169  	for _, pos := range m.ectx.OrderedBlockLabels {
   170  		rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End)
   171  	}
   172  
   173  	m.regAllocStarted = true
   174  	m.regAlloc.DoAllocation(rf)
   175  	// Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes.
   176  	m.spillSlotSize = (m.spillSlotSize + 15) &^ 15
   177  }
   178  
   179  // InsertReturn implements backend.Machine.
   180  func (m *machine) InsertReturn() {
   181  	i := m.allocateInstr().asRet()
   182  	m.insert(i)
   183  }
   184  
   185  // LowerSingleBranch implements backend.Machine.
   186  func (m *machine) LowerSingleBranch(b *ssa.Instruction) {
   187  	ectx := m.ectx
   188  	switch b.Opcode() {
   189  	case ssa.OpcodeJump:
   190  		_, _, targetBlk := b.BranchData()
   191  		if b.IsFallthroughJump() {
   192  			return
   193  		}
   194  		jmp := m.allocateInstr()
   195  		target := ectx.GetOrAllocateSSABlockLabel(targetBlk)
   196  		if target == backend.LabelReturn {
   197  			jmp.asRet()
   198  		} else {
   199  			jmp.asJmp(newOperandLabel(target))
   200  		}
   201  		m.insert(jmp)
   202  	case ssa.OpcodeBrTable:
   203  		index, target := b.BrTableData()
   204  		m.lowerBrTable(index, target)
   205  	default:
   206  		panic("BUG: unexpected branch opcode" + b.Opcode().String())
   207  	}
   208  }
   209  
   210  func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) {
   211  	// TODO: reuse the slice!
   212  	labels := make([]uint32, len(targets))
   213  	for j, target := range targets {
   214  		labels[j] = uint32(m.ectx.GetOrAllocateSSABlockLabel(target))
   215  	}
   216  	index = len(m.jmpTableTargets)
   217  	m.jmpTableTargets = append(m.jmpTableTargets, labels)
   218  	return
   219  }
   220  
   221  var condBranchMatches = [...]ssa.Opcode{ssa.OpcodeIcmp, ssa.OpcodeFcmp}
   222  
   223  func (m *machine) lowerBrTable(index ssa.Value, targets []ssa.BasicBlock) {
   224  	_v := m.getOperand_Reg(m.c.ValueDefinition(index))
   225  	v := m.copyToTmp(_v.reg())
   226  
   227  	// First, we need to do the bounds check.
   228  	maxIndex := m.c.AllocateVReg(ssa.TypeI32)
   229  	m.lowerIconst(maxIndex, uint64(len(targets)-1), false)
   230  	cmp := m.allocateInstr().asCmpRmiR(true, newOperandReg(maxIndex), v, false)
   231  	m.insert(cmp)
   232  
   233  	// Then do the conditional move maxIndex to v if v > maxIndex.
   234  	cmov := m.allocateInstr().asCmove(condNB, newOperandReg(maxIndex), v, false)
   235  	m.insert(cmov)
   236  
   237  	// Now that v has the correct index. Load the address of the jump table into the addr.
   238  	addr := m.c.AllocateVReg(ssa.TypeI64)
   239  	leaJmpTableAddr := m.allocateInstr()
   240  	m.insert(leaJmpTableAddr)
   241  
   242  	// Then add the target's offset into jmpTableAddr.
   243  	loadTargetOffsetFromJmpTable := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd,
   244  		// Shift by 3 because each entry is 8 bytes.
   245  		newOperandMem(m.newAmodeRegRegShift(0, addr, v, 3)), addr, true)
   246  	m.insert(loadTargetOffsetFromJmpTable)
   247  
   248  	// Now ready to jump.
   249  	jmp := m.allocateInstr().asJmp(newOperandReg(addr))
   250  	m.insert(jmp)
   251  
   252  	jmpTableBegin, jmpTableBeginLabel := m.allocateBrTarget()
   253  	m.insert(jmpTableBegin)
   254  	leaJmpTableAddr.asLEA(newOperandLabel(jmpTableBeginLabel), addr)
   255  
   256  	jmpTable := m.allocateInstr()
   257  	targetSliceIndex := m.addJmpTableTarget(targets)
   258  	jmpTable.asJmpTableSequence(targetSliceIndex, len(targets))
   259  	m.insert(jmpTable)
   260  }
   261  
   262  // LowerConditionalBranch implements backend.Machine.
   263  func (m *machine) LowerConditionalBranch(b *ssa.Instruction) {
   264  	exctx := m.ectx
   265  	cval, args, targetBlk := b.BranchData()
   266  	if len(args) > 0 {
   267  		panic(fmt.Sprintf(
   268  			"conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s",
   269  			exctx.CurrentSSABlk,
   270  			targetBlk,
   271  		))
   272  	}
   273  
   274  	target := exctx.GetOrAllocateSSABlockLabel(targetBlk)
   275  	cvalDef := m.c.ValueDefinition(cval)
   276  
   277  	switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) {
   278  	case ssa.OpcodeIcmp:
   279  		cvalInstr := cvalDef.Instr
   280  		x, y, c := cvalInstr.IcmpData()
   281  
   282  		cc := condFromSSAIntCmpCond(c)
   283  		if b.Opcode() == ssa.OpcodeBrz {
   284  			cc = cc.invert()
   285  		}
   286  
   287  		// First, perform the comparison and set the flag.
   288  		xd, yd := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
   289  		if !m.tryLowerBandToFlag(xd, yd) {
   290  			m.lowerIcmpToFlag(xd, yd, x.Type() == ssa.TypeI64)
   291  		}
   292  
   293  		// Then perform the conditional branch.
   294  		m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target)))
   295  		cvalDef.Instr.MarkLowered()
   296  	case ssa.OpcodeFcmp:
   297  		cvalInstr := cvalDef.Instr
   298  
   299  		f1, f2, and := m.lowerFcmpToFlags(cvalInstr)
   300  		isBrz := b.Opcode() == ssa.OpcodeBrz
   301  		if isBrz {
   302  			f1 = f1.invert()
   303  		}
   304  		if f2 == condInvalid {
   305  			m.insert(m.allocateInstr().asJmpIf(f1, newOperandLabel(target)))
   306  		} else {
   307  			if isBrz {
   308  				f2 = f2.invert()
   309  				and = !and
   310  			}
   311  			jmp1, jmp2 := m.allocateInstr(), m.allocateInstr()
   312  			m.insert(jmp1)
   313  			m.insert(jmp2)
   314  			notTaken, notTakenLabel := m.allocateBrTarget()
   315  			m.insert(notTaken)
   316  			if and {
   317  				jmp1.asJmpIf(f1.invert(), newOperandLabel(notTakenLabel))
   318  				jmp2.asJmpIf(f2, newOperandLabel(target))
   319  			} else {
   320  				jmp1.asJmpIf(f1, newOperandLabel(target))
   321  				jmp2.asJmpIf(f2, newOperandLabel(target))
   322  			}
   323  		}
   324  
   325  		cvalDef.Instr.MarkLowered()
   326  	default:
   327  		v := m.getOperand_Reg(cvalDef)
   328  
   329  		var cc cond
   330  		if b.Opcode() == ssa.OpcodeBrz {
   331  			cc = condZ
   332  		} else {
   333  			cc = condNZ
   334  		}
   335  
   336  		// Perform test %v, %v to set the flag.
   337  		cmp := m.allocateInstr().asCmpRmiR(false, v, v.reg(), false)
   338  		m.insert(cmp)
   339  		m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target)))
   340  	}
   341  }
   342  
   343  // LowerInstr implements backend.Machine.
   344  func (m *machine) LowerInstr(instr *ssa.Instruction) {
   345  	if l := instr.SourceOffset(); l.Valid() {
   346  		info := m.allocateInstr().asEmitSourceOffsetInfo(l)
   347  		m.insert(info)
   348  	}
   349  
   350  	switch op := instr.Opcode(); op {
   351  	case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable:
   352  		panic("BUG: branching instructions are handled by LowerBranches")
   353  	case ssa.OpcodeReturn:
   354  		panic("BUG: return must be handled by backend.Compiler")
   355  	case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined.
   356  	case ssa.OpcodeCall, ssa.OpcodeCallIndirect:
   357  		m.lowerCall(instr)
   358  	case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32:
   359  		m.lowerStore(instr)
   360  	case ssa.OpcodeIadd:
   361  		m.lowerAluRmiROp(instr, aluRmiROpcodeAdd)
   362  	case ssa.OpcodeIsub:
   363  		m.lowerAluRmiROp(instr, aluRmiROpcodeSub)
   364  	case ssa.OpcodeImul:
   365  		m.lowerAluRmiROp(instr, aluRmiROpcodeMul)
   366  	case ssa.OpcodeSdiv, ssa.OpcodeUdiv, ssa.OpcodeSrem, ssa.OpcodeUrem:
   367  		isDiv := op == ssa.OpcodeSdiv || op == ssa.OpcodeUdiv
   368  		isSigned := op == ssa.OpcodeSdiv || op == ssa.OpcodeSrem
   369  		m.lowerIDivRem(instr, isDiv, isSigned)
   370  	case ssa.OpcodeBand:
   371  		m.lowerAluRmiROp(instr, aluRmiROpcodeAnd)
   372  	case ssa.OpcodeBor:
   373  		m.lowerAluRmiROp(instr, aluRmiROpcodeOr)
   374  	case ssa.OpcodeBxor:
   375  		m.lowerAluRmiROp(instr, aluRmiROpcodeXor)
   376  	case ssa.OpcodeIshl:
   377  		m.lowerShiftR(instr, shiftROpShiftLeft)
   378  	case ssa.OpcodeSshr:
   379  		m.lowerShiftR(instr, shiftROpShiftRightArithmetic)
   380  	case ssa.OpcodeUshr:
   381  		m.lowerShiftR(instr, shiftROpShiftRightLogical)
   382  	case ssa.OpcodeRotl:
   383  		m.lowerShiftR(instr, shiftROpRotateLeft)
   384  	case ssa.OpcodeRotr:
   385  		m.lowerShiftR(instr, shiftROpRotateRight)
   386  	case ssa.OpcodeClz:
   387  		m.lowerClz(instr)
   388  	case ssa.OpcodeCtz:
   389  		m.lowerCtz(instr)
   390  	case ssa.OpcodePopcnt:
   391  		m.lowerUnaryRmR(instr, unaryRmROpcodePopcnt)
   392  	case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv:
   393  		m.lowerXmmRmR(instr)
   394  	case ssa.OpcodeFabs:
   395  		m.lowerFabsFneg(instr)
   396  	case ssa.OpcodeFneg:
   397  		m.lowerFabsFneg(instr)
   398  	case ssa.OpcodeCeil:
   399  		m.lowerRound(instr, roundingModeUp)
   400  	case ssa.OpcodeFloor:
   401  		m.lowerRound(instr, roundingModeDown)
   402  	case ssa.OpcodeTrunc:
   403  		m.lowerRound(instr, roundingModeZero)
   404  	case ssa.OpcodeNearest:
   405  		m.lowerRound(instr, roundingModeNearest)
   406  	case ssa.OpcodeFmin, ssa.OpcodeFmax:
   407  		m.lowerFminFmax(instr)
   408  	case ssa.OpcodeFcopysign:
   409  		m.lowerFcopysign(instr)
   410  	case ssa.OpcodeBitcast:
   411  		m.lowerBitcast(instr)
   412  	case ssa.OpcodeSqrt:
   413  		m.lowerSqrt(instr)
   414  	case ssa.OpcodeFpromote:
   415  		v := instr.Arg()
   416  		rn := m.getOperand_Reg(m.c.ValueDefinition(v))
   417  		rd := m.c.VRegOf(instr.Return())
   418  		cnt := m.allocateInstr()
   419  		cnt.asXmmUnaryRmR(sseOpcodeCvtss2sd, rn, rd)
   420  		m.insert(cnt)
   421  	case ssa.OpcodeFdemote:
   422  		v := instr.Arg()
   423  		rn := m.getOperand_Reg(m.c.ValueDefinition(v))
   424  		rd := m.c.VRegOf(instr.Return())
   425  		cnt := m.allocateInstr()
   426  		cnt.asXmmUnaryRmR(sseOpcodeCvtsd2ss, rn, rd)
   427  		m.insert(cnt)
   428  	case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat:
   429  		x, ctx := instr.Arg2()
   430  		rn := m.getOperand_Reg(m.c.ValueDefinition(x))
   431  		rd := m.c.VRegOf(instr.Return())
   432  		ctxVReg := m.c.VRegOf(ctx)
   433  		m.lowerFcvtToSint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64,
   434  			instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat)
   435  	case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat:
   436  		x, ctx := instr.Arg2()
   437  		rn := m.getOperand_Reg(m.c.ValueDefinition(x))
   438  		rd := m.c.VRegOf(instr.Return())
   439  		ctxVReg := m.c.VRegOf(ctx)
   440  		m.lowerFcvtToUint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64,
   441  			instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat)
   442  	case ssa.OpcodeFcvtFromSint:
   443  		x := instr.Arg()
   444  		rn := m.getOperand_Reg(m.c.ValueDefinition(x))
   445  		rd := newOperandReg(m.c.VRegOf(instr.Return()))
   446  		m.lowerFcvtFromSint(rn, rd,
   447  			x.Type() == ssa.TypeI64, instr.Return().Type().Bits() == 64)
   448  	case ssa.OpcodeFcvtFromUint:
   449  		x := instr.Arg()
   450  		rn := m.getOperand_Reg(m.c.ValueDefinition(x))
   451  		rd := newOperandReg(m.c.VRegOf(instr.Return()))
   452  		m.lowerFcvtFromUint(rn, rd, x.Type() == ssa.TypeI64,
   453  			instr.Return().Type().Bits() == 64)
   454  	case ssa.OpcodeVanyTrue:
   455  		m.lowerVanyTrue(instr)
   456  	case ssa.OpcodeVallTrue:
   457  		m.lowerVallTrue(instr)
   458  	case ssa.OpcodeVhighBits:
   459  		m.lowerVhighBits(instr)
   460  	case ssa.OpcodeVbnot:
   461  		m.lowerVbnot(instr)
   462  	case ssa.OpcodeVband:
   463  		x, y := instr.Arg2()
   464  		m.lowerVbBinOp(sseOpcodePand, x, y, instr.Return())
   465  	case ssa.OpcodeVbor:
   466  		x, y := instr.Arg2()
   467  		m.lowerVbBinOp(sseOpcodePor, x, y, instr.Return())
   468  	case ssa.OpcodeVbxor:
   469  		x, y := instr.Arg2()
   470  		m.lowerVbBinOp(sseOpcodePxor, x, y, instr.Return())
   471  	case ssa.OpcodeVbandnot:
   472  		m.lowerVbandnot(instr, sseOpcodePandn)
   473  	case ssa.OpcodeVbitselect:
   474  		m.lowerVbitselect(instr)
   475  	case ssa.OpcodeVIadd:
   476  		x, y, lane := instr.Arg2WithLane()
   477  		var vecOp sseOpcode
   478  		switch lane {
   479  		case ssa.VecLaneI8x16:
   480  			vecOp = sseOpcodePaddb
   481  		case ssa.VecLaneI16x8:
   482  			vecOp = sseOpcodePaddw
   483  		case ssa.VecLaneI32x4:
   484  			vecOp = sseOpcodePaddd
   485  		case ssa.VecLaneI64x2:
   486  			vecOp = sseOpcodePaddq
   487  		}
   488  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   489  
   490  	case ssa.OpcodeVSaddSat:
   491  		x, y, lane := instr.Arg2WithLane()
   492  		var vecOp sseOpcode
   493  		switch lane {
   494  		case ssa.VecLaneI8x16:
   495  			vecOp = sseOpcodePaddsb
   496  		case ssa.VecLaneI16x8:
   497  			vecOp = sseOpcodePaddsw
   498  		}
   499  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   500  
   501  	case ssa.OpcodeVUaddSat:
   502  		x, y, lane := instr.Arg2WithLane()
   503  		var vecOp sseOpcode
   504  		switch lane {
   505  		case ssa.VecLaneI8x16:
   506  			vecOp = sseOpcodePaddusb
   507  		case ssa.VecLaneI16x8:
   508  			vecOp = sseOpcodePaddusw
   509  		}
   510  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   511  
   512  	case ssa.OpcodeVIsub:
   513  		x, y, lane := instr.Arg2WithLane()
   514  		var vecOp sseOpcode
   515  		switch lane {
   516  		case ssa.VecLaneI8x16:
   517  			vecOp = sseOpcodePsubb
   518  		case ssa.VecLaneI16x8:
   519  			vecOp = sseOpcodePsubw
   520  		case ssa.VecLaneI32x4:
   521  			vecOp = sseOpcodePsubd
   522  		case ssa.VecLaneI64x2:
   523  			vecOp = sseOpcodePsubq
   524  		}
   525  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   526  
   527  	case ssa.OpcodeVSsubSat:
   528  		x, y, lane := instr.Arg2WithLane()
   529  		var vecOp sseOpcode
   530  		switch lane {
   531  		case ssa.VecLaneI8x16:
   532  			vecOp = sseOpcodePsubsb
   533  		case ssa.VecLaneI16x8:
   534  			vecOp = sseOpcodePsubsw
   535  		}
   536  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   537  
   538  	case ssa.OpcodeVUsubSat:
   539  		x, y, lane := instr.Arg2WithLane()
   540  		var vecOp sseOpcode
   541  		switch lane {
   542  		case ssa.VecLaneI8x16:
   543  			vecOp = sseOpcodePsubusb
   544  		case ssa.VecLaneI16x8:
   545  			vecOp = sseOpcodePsubusw
   546  		}
   547  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   548  
   549  	case ssa.OpcodeVImul:
   550  		m.lowerVImul(instr)
   551  	case ssa.OpcodeVIneg:
   552  		x, lane := instr.ArgWithLane()
   553  		rn := m.getOperand_Reg(m.c.ValueDefinition(x))
   554  		rd := m.c.VRegOf(instr.Return())
   555  		var vecOp sseOpcode
   556  		switch lane {
   557  		case ssa.VecLaneI8x16:
   558  			vecOp = sseOpcodePsubb
   559  		case ssa.VecLaneI16x8:
   560  			vecOp = sseOpcodePsubw
   561  		case ssa.VecLaneI32x4:
   562  			vecOp = sseOpcodePsubd
   563  		case ssa.VecLaneI64x2:
   564  			vecOp = sseOpcodePsubq
   565  		default:
   566  			panic("BUG")
   567  		}
   568  
   569  		tmp := m.c.AllocateVReg(ssa.TypeV128)
   570  		m.insert(m.allocateInstr().asZeros(tmp))
   571  
   572  		i := m.allocateInstr()
   573  		i.asXmmRmR(vecOp, rn, tmp)
   574  		m.insert(i)
   575  
   576  		m.copyTo(tmp, rd)
   577  	case ssa.OpcodeVFadd:
   578  		x, y, lane := instr.Arg2WithLane()
   579  		var vecOp sseOpcode
   580  		switch lane {
   581  		case ssa.VecLaneF32x4:
   582  			vecOp = sseOpcodeAddps
   583  		case ssa.VecLaneF64x2:
   584  			vecOp = sseOpcodeAddpd
   585  		}
   586  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   587  
   588  	case ssa.OpcodeVFsub:
   589  		x, y, lane := instr.Arg2WithLane()
   590  		var vecOp sseOpcode
   591  		switch lane {
   592  		case ssa.VecLaneF32x4:
   593  			vecOp = sseOpcodeSubps
   594  		case ssa.VecLaneF64x2:
   595  			vecOp = sseOpcodeSubpd
   596  		}
   597  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   598  
   599  	case ssa.OpcodeVFdiv:
   600  		x, y, lane := instr.Arg2WithLane()
   601  		var vecOp sseOpcode
   602  		switch lane {
   603  		case ssa.VecLaneF32x4:
   604  			vecOp = sseOpcodeDivps
   605  		case ssa.VecLaneF64x2:
   606  			vecOp = sseOpcodeDivpd
   607  		}
   608  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   609  
   610  	case ssa.OpcodeVFmul:
   611  		x, y, lane := instr.Arg2WithLane()
   612  		var vecOp sseOpcode
   613  		switch lane {
   614  		case ssa.VecLaneF32x4:
   615  			vecOp = sseOpcodeMulps
   616  		case ssa.VecLaneF64x2:
   617  			vecOp = sseOpcodeMulpd
   618  		}
   619  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   620  
   621  	case ssa.OpcodeVFneg:
   622  		x, lane := instr.ArgWithLane()
   623  		rn := m.getOperand_Reg(m.c.ValueDefinition(x))
   624  		rd := m.c.VRegOf(instr.Return())
   625  
   626  		tmp := m.c.AllocateVReg(ssa.TypeV128)
   627  
   628  		var shiftOp, xorOp sseOpcode
   629  		var shiftAmt uint32
   630  		switch lane {
   631  		case ssa.VecLaneF32x4:
   632  			shiftOp, shiftAmt, xorOp = sseOpcodePslld, 31, sseOpcodeXorps
   633  		case ssa.VecLaneF64x2:
   634  			shiftOp, shiftAmt, xorOp = sseOpcodePsllq, 63, sseOpcodeXorpd
   635  		}
   636  
   637  		zero := m.allocateInstr()
   638  		zero.asZeros(tmp)
   639  		m.insert(zero)
   640  
   641  		// Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction).
   642  		// See https://www.felixcloutier.com/x86/cmpps
   643  		//
   644  		// Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane
   645  		// if the lane is NaN.
   646  		cmp := m.allocateInstr()
   647  		cmp.asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_UQ), newOperandReg(tmp), tmp)
   648  		m.insert(cmp)
   649  
   650  		// Do the left shift on each lane to set only the most significant bit in each.
   651  		i := m.allocateInstr()
   652  		i.asXmmRmiReg(shiftOp, newOperandImm32(shiftAmt), tmp)
   653  		m.insert(i)
   654  
   655  		// Get the negated result by XOR on each lane with tmp.
   656  		i = m.allocateInstr()
   657  		i.asXmmRmR(xorOp, rn, tmp)
   658  		m.insert(i)
   659  
   660  		m.copyTo(tmp, rd)
   661  
   662  	case ssa.OpcodeVSqrt:
   663  		x, lane := instr.ArgWithLane()
   664  		rn := m.getOperand_Reg(m.c.ValueDefinition(x))
   665  		rd := m.c.VRegOf(instr.Return())
   666  
   667  		var vecOp sseOpcode
   668  		switch lane {
   669  		case ssa.VecLaneF32x4:
   670  			vecOp = sseOpcodeSqrtps
   671  		case ssa.VecLaneF64x2:
   672  			vecOp = sseOpcodeSqrtpd
   673  		}
   674  		i := m.allocateInstr()
   675  		i.asXmmUnaryRmR(vecOp, rn, rd)
   676  		m.insert(i)
   677  
   678  	case ssa.OpcodeVImin:
   679  		x, y, lane := instr.Arg2WithLane()
   680  		var vecOp sseOpcode
   681  		switch lane {
   682  		case ssa.VecLaneI8x16:
   683  			vecOp = sseOpcodePminsb
   684  		case ssa.VecLaneI16x8:
   685  			vecOp = sseOpcodePminsw
   686  		case ssa.VecLaneI32x4:
   687  			vecOp = sseOpcodePminsd
   688  		}
   689  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   690  
   691  	case ssa.OpcodeVUmin:
   692  		x, y, lane := instr.Arg2WithLane()
   693  		var vecOp sseOpcode
   694  		switch lane {
   695  		case ssa.VecLaneI8x16:
   696  			vecOp = sseOpcodePminub
   697  		case ssa.VecLaneI16x8:
   698  			vecOp = sseOpcodePminuw
   699  		case ssa.VecLaneI32x4:
   700  			vecOp = sseOpcodePminud
   701  		}
   702  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   703  
   704  	case ssa.OpcodeVImax:
   705  		x, y, lane := instr.Arg2WithLane()
   706  		var vecOp sseOpcode
   707  		switch lane {
   708  		case ssa.VecLaneI8x16:
   709  			vecOp = sseOpcodePmaxsb
   710  		case ssa.VecLaneI16x8:
   711  			vecOp = sseOpcodePmaxsw
   712  		case ssa.VecLaneI32x4:
   713  			vecOp = sseOpcodePmaxsd
   714  		}
   715  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   716  
   717  	case ssa.OpcodeVUmax:
   718  		x, y, lane := instr.Arg2WithLane()
   719  		var vecOp sseOpcode
   720  		switch lane {
   721  		case ssa.VecLaneI8x16:
   722  			vecOp = sseOpcodePmaxub
   723  		case ssa.VecLaneI16x8:
   724  			vecOp = sseOpcodePmaxuw
   725  		case ssa.VecLaneI32x4:
   726  			vecOp = sseOpcodePmaxud
   727  		}
   728  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   729  
   730  	case ssa.OpcodeVAvgRound:
   731  		x, y, lane := instr.Arg2WithLane()
   732  		var vecOp sseOpcode
   733  		switch lane {
   734  		case ssa.VecLaneI8x16:
   735  			vecOp = sseOpcodePavgb
   736  		case ssa.VecLaneI16x8:
   737  			vecOp = sseOpcodePavgw
   738  		}
   739  		m.lowerVbBinOp(vecOp, x, y, instr.Return())
   740  
   741  	case ssa.OpcodeVIcmp:
   742  		x, y, c, lane := instr.VIcmpData()
   743  		m.lowerVIcmp(x, y, c, instr.Return(), lane)
   744  
   745  	case ssa.OpcodeVFcmp:
   746  		x, y, c, lane := instr.VFcmpData()
   747  		m.lowerVFcmp(x, y, c, instr.Return(), lane)
   748  
   749  	case ssa.OpcodeExtractlane:
   750  		x, index, signed, lane := instr.ExtractlaneData()
   751  		m.lowerExtractLane(x, index, signed, instr.Return(), lane)
   752  
   753  	case ssa.OpcodeInsertlane:
   754  		x, y, index, lane := instr.InsertlaneData()
   755  		m.lowerInsertLane(x, y, index, instr.Return(), lane)
   756  
   757  	case ssa.OpcodeSwizzle:
   758  		x, y, _ := instr.Arg2WithLane()
   759  		m.lowerSwizzle(x, y, instr.Return())
   760  
   761  	case ssa.OpcodeShuffle:
   762  		x, y, lo, hi := instr.ShuffleData()
   763  		m.lowerShuffle(x, y, lo, hi, instr.Return())
   764  
   765  	case ssa.OpcodeSplat:
   766  		x, lane := instr.ArgWithLane()
   767  		m.lowerSplat(x, instr.Return(), lane)
   768  
   769  	case ssa.OpcodeSqmulRoundSat:
   770  		x, y := instr.Arg2()
   771  		m.lowerSqmulRoundSat(x, y, instr.Return())
   772  
   773  	case ssa.OpcodeVZeroExtLoad:
   774  		ptr, offset, typ := instr.VZeroExtLoadData()
   775  		var sseOp sseOpcode
   776  		// Both movss and movsd clears the higher bits of the destination register upt 128 bits.
   777  		// https://www.felixcloutier.com/x86/movss
   778  		// https://www.felixcloutier.com/x86/movsd
   779  		if typ == ssa.TypeF32 {
   780  			sseOp = sseOpcodeMovss
   781  		} else {
   782  			sseOp = sseOpcodeMovsd
   783  		}
   784  		mem := m.lowerToAddressMode(ptr, offset)
   785  		dst := m.c.VRegOf(instr.Return())
   786  		m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandMem(mem), dst))
   787  
   788  	case ssa.OpcodeVMinPseudo:
   789  		x, y, lane := instr.Arg2WithLane()
   790  		var vecOp sseOpcode
   791  		switch lane {
   792  		case ssa.VecLaneF32x4:
   793  			vecOp = sseOpcodeMinps
   794  		case ssa.VecLaneF64x2:
   795  			vecOp = sseOpcodeMinpd
   796  		default:
   797  			panic("BUG: unexpected lane type")
   798  		}
   799  		m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return())
   800  
   801  	case ssa.OpcodeVMaxPseudo:
   802  		x, y, lane := instr.Arg2WithLane()
   803  		var vecOp sseOpcode
   804  		switch lane {
   805  		case ssa.VecLaneF32x4:
   806  			vecOp = sseOpcodeMaxps
   807  		case ssa.VecLaneF64x2:
   808  			vecOp = sseOpcodeMaxpd
   809  		default:
   810  			panic("BUG: unexpected lane type")
   811  		}
   812  		m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return())
   813  
   814  	case ssa.OpcodeVIshl:
   815  		x, y, lane := instr.Arg2WithLane()
   816  		m.lowerVIshl(x, y, instr.Return(), lane)
   817  
   818  	case ssa.OpcodeVSshr:
   819  		x, y, lane := instr.Arg2WithLane()
   820  		m.lowerVSshr(x, y, instr.Return(), lane)
   821  
   822  	case ssa.OpcodeVUshr:
   823  		x, y, lane := instr.Arg2WithLane()
   824  		m.lowerVUshr(x, y, instr.Return(), lane)
   825  
   826  	case ssa.OpcodeVCeil:
   827  		x, lane := instr.ArgWithLane()
   828  		m.lowerVRound(x, instr.Return(), 0x2, lane == ssa.VecLaneF64x2)
   829  
   830  	case ssa.OpcodeVFloor:
   831  		x, lane := instr.ArgWithLane()
   832  		m.lowerVRound(x, instr.Return(), 0x1, lane == ssa.VecLaneF64x2)
   833  
   834  	case ssa.OpcodeVTrunc:
   835  		x, lane := instr.ArgWithLane()
   836  		m.lowerVRound(x, instr.Return(), 0x3, lane == ssa.VecLaneF64x2)
   837  
   838  	case ssa.OpcodeVNearest:
   839  		x, lane := instr.ArgWithLane()
   840  		m.lowerVRound(x, instr.Return(), 0x0, lane == ssa.VecLaneF64x2)
   841  
   842  	case ssa.OpcodeExtIaddPairwise:
   843  		x, lane, signed := instr.ExtIaddPairwiseData()
   844  		m.lowerExtIaddPairwise(x, instr.Return(), lane, signed)
   845  
   846  	case ssa.OpcodeUwidenLow, ssa.OpcodeSwidenLow:
   847  		x, lane := instr.ArgWithLane()
   848  		m.lowerWidenLow(x, instr.Return(), lane, op == ssa.OpcodeSwidenLow)
   849  
   850  	case ssa.OpcodeUwidenHigh, ssa.OpcodeSwidenHigh:
   851  		x, lane := instr.ArgWithLane()
   852  		m.lowerWidenHigh(x, instr.Return(), lane, op == ssa.OpcodeSwidenHigh)
   853  
   854  	case ssa.OpcodeLoadSplat:
   855  		ptr, offset, lane := instr.LoadSplatData()
   856  		m.lowerLoadSplat(ptr, offset, instr.Return(), lane)
   857  
   858  	case ssa.OpcodeVFcvtFromUint, ssa.OpcodeVFcvtFromSint:
   859  		x, lane := instr.ArgWithLane()
   860  		m.lowerVFcvtFromInt(x, instr.Return(), lane, op == ssa.OpcodeVFcvtFromSint)
   861  
   862  	case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat:
   863  		x, lane := instr.ArgWithLane()
   864  		m.lowerVFcvtToIntSat(x, instr.Return(), lane, op == ssa.OpcodeVFcvtToSintSat)
   865  
   866  	case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow:
   867  		x, y, lane := instr.Arg2WithLane()
   868  		m.lowerNarrow(x, y, instr.Return(), lane, op == ssa.OpcodeSnarrow)
   869  
   870  	case ssa.OpcodeFvpromoteLow:
   871  		x := instr.Arg()
   872  		src := m.getOperand_Reg(m.c.ValueDefinition(x))
   873  		dst := m.c.VRegOf(instr.Return())
   874  		m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtps2pd, src, dst))
   875  
   876  	case ssa.OpcodeFvdemote:
   877  		x := instr.Arg()
   878  		src := m.getOperand_Reg(m.c.ValueDefinition(x))
   879  		dst := m.c.VRegOf(instr.Return())
   880  		m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtpd2ps, src, dst))
   881  
   882  	case ssa.OpcodeWideningPairwiseDotProductS:
   883  		x, y := instr.Arg2()
   884  		m.lowerWideningPairwiseDotProductS(x, y, instr.Return())
   885  
   886  	case ssa.OpcodeVIabs:
   887  		m.lowerVIabs(instr)
   888  	case ssa.OpcodeVIpopcnt:
   889  		m.lowerVIpopcnt(instr)
   890  	case ssa.OpcodeVFmin:
   891  		m.lowerVFmin(instr)
   892  	case ssa.OpcodeVFmax:
   893  		m.lowerVFmax(instr)
   894  	case ssa.OpcodeVFabs:
   895  		m.lowerVFabs(instr)
   896  	case ssa.OpcodeUndefined:
   897  		m.insert(m.allocateInstr().asUD2())
   898  	case ssa.OpcodeExitWithCode:
   899  		execCtx, code := instr.ExitWithCodeData()
   900  		m.lowerExitWithCode(m.c.VRegOf(execCtx), code)
   901  	case ssa.OpcodeExitIfTrueWithCode:
   902  		execCtx, c, code := instr.ExitIfTrueWithCodeData()
   903  		m.lowerExitIfTrueWithCode(m.c.VRegOf(execCtx), c, code)
   904  	case ssa.OpcodeLoad:
   905  		ptr, offset, typ := instr.LoadData()
   906  		dst := m.c.VRegOf(instr.Return())
   907  		m.lowerLoad(ptr, offset, typ, dst)
   908  	case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32:
   909  		ptr, offset, _ := instr.LoadData()
   910  		ret := m.c.VRegOf(instr.Return())
   911  		m.lowerExtLoad(op, ptr, offset, ret)
   912  	case ssa.OpcodeVconst:
   913  		result := m.c.VRegOf(instr.Return())
   914  		lo, hi := instr.VconstData()
   915  		m.lowerVconst(result, lo, hi)
   916  	case ssa.OpcodeSExtend, ssa.OpcodeUExtend:
   917  		from, to, signed := instr.ExtendData()
   918  		m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed)
   919  	case ssa.OpcodeIcmp:
   920  		m.lowerIcmp(instr)
   921  	case ssa.OpcodeFcmp:
   922  		m.lowerFcmp(instr)
   923  	case ssa.OpcodeSelect:
   924  		cval, x, y := instr.SelectData()
   925  		m.lowerSelect(x, y, cval, instr.Return())
   926  	case ssa.OpcodeIreduce:
   927  		rn := m.getOperand_Mem_Reg(m.c.ValueDefinition(instr.Arg()))
   928  		retVal := instr.Return()
   929  		rd := m.c.VRegOf(retVal)
   930  
   931  		if retVal.Type() != ssa.TypeI32 {
   932  			panic("TODO?: Ireduce to non-i32")
   933  		}
   934  		m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, rd))
   935  
   936  	case ssa.OpcodeAtomicLoad:
   937  		ptr := instr.Arg()
   938  		size := instr.AtomicTargetSize()
   939  		dst := m.c.VRegOf(instr.Return())
   940  
   941  		// At this point, the ptr is ensured to be aligned, so using a normal load is atomic.
   942  		// https://github.com/golang/go/blob/adead1a93f472affa97c494ef19f2f492ee6f34a/src/runtime/internal/atomic/atomic_amd64.go#L30
   943  		mem := newOperandMem(m.lowerToAddressMode(ptr, 0))
   944  		load := m.allocateInstr()
   945  		switch size {
   946  		case 8:
   947  			load.asMov64MR(mem, dst)
   948  		case 4:
   949  			load.asMovzxRmR(extModeLQ, mem, dst)
   950  		case 2:
   951  			load.asMovzxRmR(extModeWQ, mem, dst)
   952  		case 1:
   953  			load.asMovzxRmR(extModeBQ, mem, dst)
   954  		default:
   955  			panic("BUG")
   956  		}
   957  		m.insert(load)
   958  
   959  	case ssa.OpcodeFence:
   960  		m.insert(m.allocateInstr().asMFence())
   961  
   962  	case ssa.OpcodeAtomicStore:
   963  		ptr, _val := instr.Arg2()
   964  		size := instr.AtomicTargetSize()
   965  
   966  		val := m.getOperand_Reg(m.c.ValueDefinition(_val))
   967  		// The content on the val register will be overwritten by xchg, so we need to copy it to a temporary register.
   968  		copied := m.copyToTmp(val.reg())
   969  
   970  		mem := newOperandMem(m.lowerToAddressMode(ptr, 0))
   971  		store := m.allocateInstr().asXCHG(copied, mem, byte(size))
   972  		m.insert(store)
   973  
   974  	case ssa.OpcodeAtomicCas:
   975  		addr, exp, repl := instr.Arg3()
   976  		size := instr.AtomicTargetSize()
   977  		m.lowerAtomicCas(addr, exp, repl, size, instr.Return())
   978  
   979  	case ssa.OpcodeAtomicRmw:
   980  		addr, val := instr.Arg2()
   981  		atomicOp, size := instr.AtomicRmwData()
   982  		m.lowerAtomicRmw(atomicOp, addr, val, size, instr.Return())
   983  
   984  	default:
   985  		panic("TODO: lowering " + op.String())
   986  	}
   987  }
   988  
   989  func (m *machine) lowerAtomicRmw(op ssa.AtomicRmwOp, addr, val ssa.Value, size uint64, ret ssa.Value) {
   990  	mem := m.lowerToAddressMode(addr, 0)
   991  	_val := m.getOperand_Reg(m.c.ValueDefinition(val))
   992  
   993  	switch op {
   994  	case ssa.AtomicRmwOpAdd, ssa.AtomicRmwOpSub:
   995  		valCopied := m.copyToTmp(_val.reg())
   996  		if op == ssa.AtomicRmwOpSub {
   997  			// Negate the value.
   998  			m.insert(m.allocateInstr().asNeg(newOperandReg(valCopied), true))
   999  		}
  1000  		m.insert(m.allocateInstr().asLockXAdd(valCopied, mem, byte(size)))
  1001  		m.clearHigherBitsForAtomic(valCopied, size, ret.Type())
  1002  		m.copyTo(valCopied, m.c.VRegOf(ret))
  1003  
  1004  	case ssa.AtomicRmwOpAnd, ssa.AtomicRmwOpOr, ssa.AtomicRmwOpXor:
  1005  		accumulator := raxVReg
  1006  		// Reserve rax for the accumulator to make regalloc happy.
  1007  		// Note: do this initialization before defining valCopied, because it might be the same register and
  1008  		// if that happens, the unnecessary load/store will be performed inside the loop.
  1009  		// This can be mitigated in any way once the register allocator is clever enough.
  1010  		m.insert(m.allocateInstr().asDefineUninitializedReg(accumulator))
  1011  
  1012  		// Copy the value to a temporary register.
  1013  		valCopied := m.copyToTmp(_val.reg())
  1014  		m.clearHigherBitsForAtomic(valCopied, size, ret.Type())
  1015  
  1016  		memOp := newOperandMem(mem)
  1017  		tmp := m.c.AllocateVReg(ssa.TypeI64)
  1018  		beginLoop, beginLoopLabel := m.allocateBrTarget()
  1019  		{
  1020  			m.insert(beginLoop)
  1021  			// Reset the value on tmp by the original value.
  1022  			m.copyTo(valCopied, tmp)
  1023  			// Load the current value at the memory location into accumulator.
  1024  			switch size {
  1025  			case 1:
  1026  				m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, memOp, accumulator))
  1027  			case 2:
  1028  				m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, memOp, accumulator))
  1029  			case 4:
  1030  				m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, memOp, accumulator))
  1031  			case 8:
  1032  				m.insert(m.allocateInstr().asMov64MR(memOp, accumulator))
  1033  			default:
  1034  				panic("BUG")
  1035  			}
  1036  			// Then perform the logical operation on the accumulator and the value on tmp.
  1037  			switch op {
  1038  			case ssa.AtomicRmwOpAnd:
  1039  				m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, newOperandReg(accumulator), tmp, true))
  1040  			case ssa.AtomicRmwOpOr:
  1041  				m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeOr, newOperandReg(accumulator), tmp, true))
  1042  			case ssa.AtomicRmwOpXor:
  1043  				m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(accumulator), tmp, true))
  1044  			default:
  1045  				panic("BUG")
  1046  			}
  1047  			// Finally, try compare-exchange the value at the memory location with the tmp.
  1048  			m.insert(m.allocateInstr().asLockCmpXCHG(tmp, memOp.addressMode(), byte(size)))
  1049  			// If it succeeds, ZF will be set, and we can break the loop.
  1050  			m.insert(m.allocateInstr().asJmpIf(condNZ, newOperandLabel(beginLoopLabel)))
  1051  		}
  1052  
  1053  		// valCopied must be alive at the end of the loop.
  1054  		m.insert(m.allocateInstr().asNopUseReg(valCopied))
  1055  
  1056  		// At this point, accumulator contains the result.
  1057  		m.clearHigherBitsForAtomic(accumulator, size, ret.Type())
  1058  		m.copyTo(accumulator, m.c.VRegOf(ret))
  1059  
  1060  	case ssa.AtomicRmwOpXchg:
  1061  		valCopied := m.copyToTmp(_val.reg())
  1062  
  1063  		m.insert(m.allocateInstr().asXCHG(valCopied, newOperandMem(mem), byte(size)))
  1064  		m.clearHigherBitsForAtomic(valCopied, size, ret.Type())
  1065  		m.copyTo(valCopied, m.c.VRegOf(ret))
  1066  
  1067  	default:
  1068  		panic("BUG")
  1069  	}
  1070  }
  1071  
  1072  func (m *machine) lowerAtomicCas(addr, exp, repl ssa.Value, size uint64, ret ssa.Value) {
  1073  	mem := m.lowerToAddressMode(addr, 0)
  1074  	expOp := m.getOperand_Reg(m.c.ValueDefinition(exp))
  1075  	replOp := m.getOperand_Reg(m.c.ValueDefinition(repl))
  1076  
  1077  	accumulator := raxVReg
  1078  	m.copyTo(expOp.reg(), accumulator)
  1079  	m.insert(m.allocateInstr().asLockCmpXCHG(replOp.reg(), mem, byte(size)))
  1080  	m.clearHigherBitsForAtomic(accumulator, size, ret.Type())
  1081  	m.copyTo(accumulator, m.c.VRegOf(ret))
  1082  }
  1083  
  1084  func (m *machine) clearHigherBitsForAtomic(r regalloc.VReg, valSize uint64, resultType ssa.Type) {
  1085  	switch resultType {
  1086  	case ssa.TypeI32:
  1087  		switch valSize {
  1088  		case 1:
  1089  			m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(r), r))
  1090  		case 2:
  1091  			m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(r), r))
  1092  		}
  1093  	case ssa.TypeI64:
  1094  		switch valSize {
  1095  		case 1:
  1096  			m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(r), r))
  1097  		case 2:
  1098  			m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, newOperandReg(r), r))
  1099  		case 4:
  1100  			m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, newOperandReg(r), r))
  1101  		}
  1102  	}
  1103  }
  1104  
  1105  func (m *machine) lowerFcmp(instr *ssa.Instruction) {
  1106  	f1, f2, and := m.lowerFcmpToFlags(instr)
  1107  	rd := m.c.VRegOf(instr.Return())
  1108  	if f2 == condInvalid {
  1109  		tmp := m.c.AllocateVReg(ssa.TypeI32)
  1110  		m.insert(m.allocateInstr().asSetcc(f1, tmp))
  1111  		// On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match
  1112  		// the semantics of Icmp that sets either 0 or 1.
  1113  		m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd))
  1114  	} else {
  1115  		tmp1, tmp2 := m.c.AllocateVReg(ssa.TypeI32), m.c.AllocateVReg(ssa.TypeI32)
  1116  		m.insert(m.allocateInstr().asSetcc(f1, tmp1))
  1117  		m.insert(m.allocateInstr().asSetcc(f2, tmp2))
  1118  		var op aluRmiROpcode
  1119  		if and {
  1120  			op = aluRmiROpcodeAnd
  1121  		} else {
  1122  			op = aluRmiROpcodeOr
  1123  		}
  1124  		m.insert(m.allocateInstr().asAluRmiR(op, newOperandReg(tmp1), tmp2, false))
  1125  		m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp2), rd))
  1126  	}
  1127  }
  1128  
  1129  func (m *machine) lowerIcmp(instr *ssa.Instruction) {
  1130  	x, y, c := instr.IcmpData()
  1131  	m.lowerIcmpToFlag(m.c.ValueDefinition(x), m.c.ValueDefinition(y), x.Type() == ssa.TypeI64)
  1132  	rd := m.c.VRegOf(instr.Return())
  1133  	tmp := m.c.AllocateVReg(ssa.TypeI32)
  1134  	m.insert(m.allocateInstr().asSetcc(condFromSSAIntCmpCond(c), tmp))
  1135  	// On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match
  1136  	// the semantics of Icmp that sets either 0 or 1.
  1137  	m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd))
  1138  }
  1139  
  1140  func (m *machine) lowerSelect(x, y, cval, ret ssa.Value) {
  1141  	xo, yo := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y))
  1142  	rd := m.c.VRegOf(ret)
  1143  
  1144  	var cond cond
  1145  	cvalDef := m.c.ValueDefinition(cval)
  1146  	switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) {
  1147  	case ssa.OpcodeIcmp:
  1148  		icmp := cvalDef.Instr
  1149  		xc, yc, cc := icmp.IcmpData()
  1150  		m.lowerIcmpToFlag(m.c.ValueDefinition(xc), m.c.ValueDefinition(yc), xc.Type() == ssa.TypeI64)
  1151  		cond = condFromSSAIntCmpCond(cc)
  1152  		icmp.Lowered()
  1153  	default: // TODO: match ssa.OpcodeFcmp for optimization, but seems a bit complex.
  1154  		cv := m.getOperand_Reg(cvalDef)
  1155  		test := m.allocateInstr().asCmpRmiR(false, cv, cv.reg(), false)
  1156  		m.insert(test)
  1157  		cond = condNZ
  1158  	}
  1159  
  1160  	if typ := x.Type(); typ.IsInt() {
  1161  		_64 := typ.Bits() == 64
  1162  		mov := m.allocateInstr()
  1163  		tmp := m.c.AllocateVReg(typ)
  1164  		switch yo.kind {
  1165  		case operandKindReg:
  1166  			mov.asMovRR(yo.reg(), tmp, _64)
  1167  		case operandKindMem:
  1168  			if _64 {
  1169  				mov.asMov64MR(yo, tmp)
  1170  			} else {
  1171  				mov.asMovzxRmR(extModeLQ, yo, tmp)
  1172  			}
  1173  		default:
  1174  			panic("BUG")
  1175  		}
  1176  		m.insert(mov)
  1177  		cmov := m.allocateInstr().asCmove(cond, xo, tmp, _64)
  1178  		m.insert(cmov)
  1179  		m.insert(m.allocateInstr().asMovRR(tmp, rd, _64))
  1180  	} else {
  1181  		mov := m.allocateInstr()
  1182  		tmp := m.c.AllocateVReg(typ)
  1183  		switch typ {
  1184  		case ssa.TypeF32:
  1185  			mov.asXmmUnaryRmR(sseOpcodeMovss, yo, tmp)
  1186  		case ssa.TypeF64:
  1187  			mov.asXmmUnaryRmR(sseOpcodeMovsd, yo, tmp)
  1188  		case ssa.TypeV128:
  1189  			mov.asXmmUnaryRmR(sseOpcodeMovdqu, yo, tmp)
  1190  		default:
  1191  			panic("BUG")
  1192  		}
  1193  		m.insert(mov)
  1194  
  1195  		cmov := m.allocateInstr().asXmmCMov(cond, xo, tmp, typ.Size())
  1196  		m.insert(cmov)
  1197  
  1198  		m.copyTo(tmp, rd)
  1199  	}
  1200  }
  1201  
  1202  func (m *machine) lowerXmmCmovAfterRegAlloc(i *instruction) {
  1203  	x := i.op1
  1204  	rd := i.op2.reg()
  1205  	cond := cond(i.u1)
  1206  
  1207  	jcc := m.allocateInstr()
  1208  	m.insert(jcc)
  1209  
  1210  	mov := m.allocateInstr()
  1211  	switch i.u2 {
  1212  	case 4:
  1213  		mov.asXmmUnaryRmR(sseOpcodeMovss, x, rd)
  1214  	case 8:
  1215  		mov.asXmmUnaryRmR(sseOpcodeMovsd, x, rd)
  1216  	case 16:
  1217  		mov.asXmmUnaryRmR(sseOpcodeMovdqu, x, rd)
  1218  	default:
  1219  		panic("BUG")
  1220  	}
  1221  	m.insert(mov)
  1222  
  1223  	nop, end := m.allocateBrTarget()
  1224  	m.insert(nop)
  1225  	jcc.asJmpIf(cond.invert(), newOperandLabel(end))
  1226  }
  1227  
  1228  func (m *machine) lowerExtend(_arg, ret ssa.Value, from, to byte, signed bool) {
  1229  	rd0 := m.c.VRegOf(ret)
  1230  	arg := m.getOperand_Mem_Reg(m.c.ValueDefinition(_arg))
  1231  
  1232  	rd := m.c.AllocateVReg(ret.Type())
  1233  
  1234  	ext := m.allocateInstr()
  1235  	switch {
  1236  	case from == 8 && to == 16 && signed:
  1237  		ext.asMovsxRmR(extModeBQ, arg, rd)
  1238  	case from == 8 && to == 16 && !signed:
  1239  		ext.asMovzxRmR(extModeBL, arg, rd)
  1240  	case from == 8 && to == 32 && signed:
  1241  		ext.asMovsxRmR(extModeBL, arg, rd)
  1242  	case from == 8 && to == 32 && !signed:
  1243  		ext.asMovzxRmR(extModeBQ, arg, rd)
  1244  	case from == 8 && to == 64 && signed:
  1245  		ext.asMovsxRmR(extModeBQ, arg, rd)
  1246  	case from == 8 && to == 64 && !signed:
  1247  		ext.asMovzxRmR(extModeBQ, arg, rd)
  1248  	case from == 16 && to == 32 && signed:
  1249  		ext.asMovsxRmR(extModeWL, arg, rd)
  1250  	case from == 16 && to == 32 && !signed:
  1251  		ext.asMovzxRmR(extModeWL, arg, rd)
  1252  	case from == 16 && to == 64 && signed:
  1253  		ext.asMovsxRmR(extModeWQ, arg, rd)
  1254  	case from == 16 && to == 64 && !signed:
  1255  		ext.asMovzxRmR(extModeWQ, arg, rd)
  1256  	case from == 32 && to == 64 && signed:
  1257  		ext.asMovsxRmR(extModeLQ, arg, rd)
  1258  	case from == 32 && to == 64 && !signed:
  1259  		ext.asMovzxRmR(extModeLQ, arg, rd)
  1260  	default:
  1261  		panic(fmt.Sprintf("BUG: unhandled extend: from=%d, to=%d, signed=%t", from, to, signed))
  1262  	}
  1263  	m.insert(ext)
  1264  
  1265  	m.copyTo(rd, rd0)
  1266  }
  1267  
  1268  func (m *machine) lowerVconst(dst regalloc.VReg, lo, hi uint64) {
  1269  	if lo == 0 && hi == 0 {
  1270  		m.insert(m.allocateInstr().asZeros(dst))
  1271  		return
  1272  	}
  1273  
  1274  	load := m.allocateInstr()
  1275  	constLabel := m.allocateLabel()
  1276  	m.consts = append(m.consts, _const{label: constLabel, lo: lo, hi: hi})
  1277  	load.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(constLabel.L)), dst)
  1278  	m.insert(load)
  1279  }
  1280  
  1281  func (m *machine) lowerCtz(instr *ssa.Instruction) {
  1282  	if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) {
  1283  		m.lowerUnaryRmR(instr, unaryRmROpcodeTzcnt)
  1284  	} else {
  1285  		// On processors that do not support TZCNT, the BSF instruction is
  1286  		// executed instead. The key difference between TZCNT and BSF
  1287  		// instruction is that if source operand is zero, the content of
  1288  		// destination operand is undefined.
  1289  		// https://www.felixcloutier.com/x86/tzcnt.html
  1290  
  1291  		x := instr.Arg()
  1292  		if !x.Type().IsInt() {
  1293  			panic("BUG?")
  1294  		}
  1295  		_64 := x.Type().Bits() == 64
  1296  
  1297  		xDef := m.c.ValueDefinition(x)
  1298  		tmp := m.c.AllocateVReg(x.Type())
  1299  		rm := m.getOperand_Reg(xDef)
  1300  
  1301  		// First, we have to check if the target is non-zero.
  1302  		test := m.allocateInstr()
  1303  		test.asCmpRmiR(false, rm, rm.reg(), _64)
  1304  		m.insert(test)
  1305  
  1306  		jmpNz := m.allocateInstr()
  1307  		m.insert(jmpNz)
  1308  
  1309  		// If the value is zero, we just push the const value.
  1310  		m.lowerIconst(tmp, uint64(x.Type().Bits()), _64)
  1311  
  1312  		// Now jump right after the non-zero case.
  1313  		jmpAtEnd := m.allocateInstr()
  1314  		m.insert(jmpAtEnd)
  1315  
  1316  		// jmpNz target label is set here.
  1317  		nop, nz := m.allocateBrTarget()
  1318  		jmpNz.asJmpIf(condNZ, newOperandLabel(nz))
  1319  		m.insert(nop)
  1320  
  1321  		// Emit the non-zero case.
  1322  		bsr := m.allocateInstr()
  1323  		bsr.asUnaryRmR(unaryRmROpcodeBsf, rm, tmp, _64)
  1324  		m.insert(bsr)
  1325  
  1326  		// jmpAtEnd target label is set here.
  1327  		nopEnd, end := m.allocateBrTarget()
  1328  		jmpAtEnd.asJmp(newOperandLabel(end))
  1329  		m.insert(nopEnd)
  1330  
  1331  		m.copyTo(tmp, m.c.VRegOf(instr.Return()))
  1332  	}
  1333  }
  1334  
  1335  func (m *machine) lowerClz(instr *ssa.Instruction) {
  1336  	if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) {
  1337  		m.lowerUnaryRmR(instr, unaryRmROpcodeLzcnt)
  1338  	} else {
  1339  		// On processors that do not support LZCNT, we combine BSR (calculating
  1340  		// most significant set bit) with XOR. This logic is described in
  1341  		// "Replace Raw Assembly Code with Builtin Intrinsics" section in:
  1342  		// https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code.
  1343  
  1344  		x := instr.Arg()
  1345  		if !x.Type().IsInt() {
  1346  			panic("BUG?")
  1347  		}
  1348  		_64 := x.Type().Bits() == 64
  1349  
  1350  		xDef := m.c.ValueDefinition(x)
  1351  		rm := m.getOperand_Reg(xDef)
  1352  		tmp := m.c.AllocateVReg(x.Type())
  1353  
  1354  		// First, we have to check if the rm is non-zero as BSR is undefined
  1355  		// on zero. See https://www.felixcloutier.com/x86/bsr.
  1356  		test := m.allocateInstr()
  1357  		test.asCmpRmiR(false, rm, rm.reg(), _64)
  1358  		m.insert(test)
  1359  
  1360  		jmpNz := m.allocateInstr()
  1361  		m.insert(jmpNz)
  1362  
  1363  		// If the value is zero, we just push the const value.
  1364  		m.lowerIconst(tmp, uint64(x.Type().Bits()), _64)
  1365  
  1366  		// Now jump right after the non-zero case.
  1367  		jmpAtEnd := m.allocateInstr()
  1368  		m.insert(jmpAtEnd)
  1369  
  1370  		// jmpNz target label is set here.
  1371  		nop, nz := m.allocateBrTarget()
  1372  		jmpNz.asJmpIf(condNZ, newOperandLabel(nz))
  1373  		m.insert(nop)
  1374  
  1375  		// Emit the non-zero case.
  1376  		bsr := m.allocateInstr()
  1377  		bsr.asUnaryRmR(unaryRmROpcodeBsr, rm, tmp, _64)
  1378  		m.insert(bsr)
  1379  
  1380  		// Now we XOR the value with the bit length minus one.
  1381  		xor := m.allocateInstr()
  1382  		xor.asAluRmiR(aluRmiROpcodeXor, newOperandImm32(uint32(x.Type().Bits()-1)), tmp, _64)
  1383  		m.insert(xor)
  1384  
  1385  		// jmpAtEnd target label is set here.
  1386  		nopEnd, end := m.allocateBrTarget()
  1387  		jmpAtEnd.asJmp(newOperandLabel(end))
  1388  		m.insert(nopEnd)
  1389  
  1390  		m.copyTo(tmp, m.c.VRegOf(instr.Return()))
  1391  	}
  1392  }
  1393  
  1394  func (m *machine) lowerUnaryRmR(si *ssa.Instruction, op unaryRmROpcode) {
  1395  	x := si.Arg()
  1396  	if !x.Type().IsInt() {
  1397  		panic("BUG?")
  1398  	}
  1399  	_64 := x.Type().Bits() == 64
  1400  
  1401  	xDef := m.c.ValueDefinition(x)
  1402  	rm := m.getOperand_Mem_Reg(xDef)
  1403  	rd := m.c.VRegOf(si.Return())
  1404  
  1405  	instr := m.allocateInstr()
  1406  	instr.asUnaryRmR(op, rm, rd, _64)
  1407  	m.insert(instr)
  1408  }
  1409  
  1410  func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, dst regalloc.VReg) {
  1411  	mem := newOperandMem(m.lowerToAddressMode(ptr, offset))
  1412  	load := m.allocateInstr()
  1413  	switch typ {
  1414  	case ssa.TypeI32:
  1415  		load.asMovzxRmR(extModeLQ, mem, dst)
  1416  	case ssa.TypeI64:
  1417  		load.asMov64MR(mem, dst)
  1418  	case ssa.TypeF32:
  1419  		load.asXmmUnaryRmR(sseOpcodeMovss, mem, dst)
  1420  	case ssa.TypeF64:
  1421  		load.asXmmUnaryRmR(sseOpcodeMovsd, mem, dst)
  1422  	case ssa.TypeV128:
  1423  		load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, dst)
  1424  	default:
  1425  		panic("BUG")
  1426  	}
  1427  	m.insert(load)
  1428  }
  1429  
  1430  func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, dst regalloc.VReg) {
  1431  	mem := newOperandMem(m.lowerToAddressMode(ptr, offset))
  1432  	load := m.allocateInstr()
  1433  	switch op {
  1434  	case ssa.OpcodeUload8:
  1435  		load.asMovzxRmR(extModeBQ, mem, dst)
  1436  	case ssa.OpcodeUload16:
  1437  		load.asMovzxRmR(extModeWQ, mem, dst)
  1438  	case ssa.OpcodeUload32:
  1439  		load.asMovzxRmR(extModeLQ, mem, dst)
  1440  	case ssa.OpcodeSload8:
  1441  		load.asMovsxRmR(extModeBQ, mem, dst)
  1442  	case ssa.OpcodeSload16:
  1443  		load.asMovsxRmR(extModeWQ, mem, dst)
  1444  	case ssa.OpcodeSload32:
  1445  		load.asMovsxRmR(extModeLQ, mem, dst)
  1446  	default:
  1447  		panic("BUG")
  1448  	}
  1449  	m.insert(load)
  1450  }
  1451  
  1452  func (m *machine) lowerExitIfTrueWithCode(execCtx regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) {
  1453  	condDef := m.c.ValueDefinition(cond)
  1454  	if !m.c.MatchInstr(condDef, ssa.OpcodeIcmp) {
  1455  		panic("TODO: ExitIfTrue must come after Icmp at the moment: " + condDef.Instr.Opcode().String())
  1456  	}
  1457  	cvalInstr := condDef.Instr
  1458  	cvalInstr.MarkLowered()
  1459  
  1460  	// We need to copy the execution context to a temp register, because if it's spilled,
  1461  	// it might end up being reloaded inside the exiting branch.
  1462  	execCtxTmp := m.copyToTmp(execCtx)
  1463  
  1464  	x, y, c := cvalInstr.IcmpData()
  1465  	xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
  1466  	if !m.tryLowerBandToFlag(xx, yy) {
  1467  		m.lowerIcmpToFlag(xx, yy, x.Type() == ssa.TypeI64)
  1468  	}
  1469  
  1470  	jmpIf := m.allocateInstr()
  1471  	m.insert(jmpIf)
  1472  	l := m.lowerExitWithCode(execCtxTmp, code)
  1473  	jmpIf.asJmpIf(condFromSSAIntCmpCond(c).invert(), newOperandLabel(l))
  1474  }
  1475  
  1476  func (m *machine) tryLowerBandToFlag(x, y *backend.SSAValueDefinition) (ok bool) {
  1477  	var target *backend.SSAValueDefinition
  1478  	if x.IsFromInstr() && x.Instr.Constant() && x.Instr.ConstantVal() == 0 {
  1479  		if m.c.MatchInstr(y, ssa.OpcodeBand) {
  1480  			target = y
  1481  		}
  1482  	}
  1483  
  1484  	if y.IsFromInstr() && y.Instr.Constant() && y.Instr.ConstantVal() == 0 {
  1485  		if m.c.MatchInstr(x, ssa.OpcodeBand) {
  1486  			target = x
  1487  		}
  1488  	}
  1489  
  1490  	if target == nil {
  1491  		return false
  1492  	}
  1493  
  1494  	bandInstr := target.Instr
  1495  	bandX, bandY := bandInstr.Arg2()
  1496  
  1497  	xx := m.getOperand_Reg(m.c.ValueDefinition(bandX))
  1498  	yy := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(bandY))
  1499  	test := m.allocateInstr().asCmpRmiR(false, yy, xx.reg(), bandX.Type() == ssa.TypeI64)
  1500  	m.insert(test)
  1501  	bandInstr.MarkLowered()
  1502  	return true
  1503  }
  1504  
  1505  func (m *machine) allocateExitInstructions(execCtx, exitCodeReg regalloc.VReg) (saveRsp, saveRbp, setExitCode *instruction) {
  1506  	saveRsp = m.allocateInstr().asMovRM(
  1507  		rspVReg,
  1508  		newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.U32(), execCtx)),
  1509  		8,
  1510  	)
  1511  
  1512  	saveRbp = m.allocateInstr().asMovRM(
  1513  		rbpVReg,
  1514  		newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetFramePointerBeforeGoCall.U32(), execCtx)),
  1515  		8,
  1516  	)
  1517  	setExitCode = m.allocateInstr().asMovRM(
  1518  		exitCodeReg,
  1519  		newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetExitCodeOffset.U32(), execCtx)),
  1520  		4,
  1521  	)
  1522  	return
  1523  }
  1524  
  1525  func (m *machine) lowerExitWithCode(execCtx regalloc.VReg, code wazevoapi.ExitCode) (afterLabel backend.Label) {
  1526  	exitCodeReg := rbpVReg
  1527  	saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtx, exitCodeReg)
  1528  
  1529  	// Set save RSP, RBP, and write exit code.
  1530  	m.insert(saveRsp)
  1531  	m.insert(saveRbp)
  1532  	m.lowerIconst(exitCodeReg, uint64(code), false)
  1533  	m.insert(setExitCode)
  1534  
  1535  	ripReg := rbpVReg
  1536  
  1537  	// Next is to save the current address for stack unwinding.
  1538  	nop, currentAddrLabel := m.allocateBrTarget()
  1539  	m.insert(nop)
  1540  	readRip := m.allocateInstr().asLEA(newOperandLabel(currentAddrLabel), ripReg)
  1541  	m.insert(readRip)
  1542  	saveRip := m.allocateInstr().asMovRM(
  1543  		ripReg,
  1544  		newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)),
  1545  		8,
  1546  	)
  1547  	m.insert(saveRip)
  1548  
  1549  	// Finally exit.
  1550  	exitSq := m.allocateExitSeq(execCtx)
  1551  	m.insert(exitSq)
  1552  
  1553  	// Return the label for continuation.
  1554  	continuation, afterLabel := m.allocateBrTarget()
  1555  	m.insert(continuation)
  1556  	return afterLabel
  1557  }
  1558  
  1559  func (m *machine) lowerAluRmiROp(si *ssa.Instruction, op aluRmiROpcode) {
  1560  	x, y := si.Arg2()
  1561  	if !x.Type().IsInt() {
  1562  		panic("BUG?")
  1563  	}
  1564  
  1565  	_64 := x.Type().Bits() == 64
  1566  
  1567  	xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
  1568  
  1569  	// TODO: commutative args can be swapped if one of them is an immediate.
  1570  	rn := m.getOperand_Reg(xDef)
  1571  	rm := m.getOperand_Mem_Imm32_Reg(yDef)
  1572  	rd := m.c.VRegOf(si.Return())
  1573  
  1574  	// rn is being overwritten, so we first copy its value to a temp register,
  1575  	// in case it is referenced again later.
  1576  	tmp := m.copyToTmp(rn.reg())
  1577  
  1578  	alu := m.allocateInstr()
  1579  	alu.asAluRmiR(op, rm, tmp, _64)
  1580  	m.insert(alu)
  1581  
  1582  	// tmp now contains the result, we copy it to the dest register.
  1583  	m.copyTo(tmp, rd)
  1584  }
  1585  
  1586  func (m *machine) lowerShiftR(si *ssa.Instruction, op shiftROp) {
  1587  	x, amt := si.Arg2()
  1588  	if !x.Type().IsInt() {
  1589  		panic("BUG?")
  1590  	}
  1591  	_64 := x.Type().Bits() == 64
  1592  
  1593  	xDef, amtDef := m.c.ValueDefinition(x), m.c.ValueDefinition(amt)
  1594  
  1595  	opAmt := m.getOperand_Imm32_Reg(amtDef)
  1596  	rx := m.getOperand_Reg(xDef)
  1597  	rd := m.c.VRegOf(si.Return())
  1598  
  1599  	// rx is being overwritten, so we first copy its value to a temp register,
  1600  	// in case it is referenced again later.
  1601  	tmpDst := m.copyToTmp(rx.reg())
  1602  
  1603  	if opAmt.kind == operandKindReg {
  1604  		// If opAmt is a register we must copy its value to rcx,
  1605  		// because shiftR encoding mandates that the shift amount is in rcx.
  1606  		m.copyTo(opAmt.reg(), rcxVReg)
  1607  
  1608  		alu := m.allocateInstr()
  1609  		alu.asShiftR(op, newOperandReg(rcxVReg), tmpDst, _64)
  1610  		m.insert(alu)
  1611  
  1612  	} else {
  1613  		alu := m.allocateInstr()
  1614  		alu.asShiftR(op, opAmt, tmpDst, _64)
  1615  		m.insert(alu)
  1616  	}
  1617  
  1618  	// tmp now contains the result, we copy it to the dest register.
  1619  	m.copyTo(tmpDst, rd)
  1620  }
  1621  
  1622  func (m *machine) lowerXmmRmR(instr *ssa.Instruction) {
  1623  	x, y := instr.Arg2()
  1624  	if !x.Type().IsFloat() {
  1625  		panic("BUG?")
  1626  	}
  1627  	_64 := x.Type().Bits() == 64
  1628  
  1629  	var op sseOpcode
  1630  	if _64 {
  1631  		switch instr.Opcode() {
  1632  		case ssa.OpcodeFadd:
  1633  			op = sseOpcodeAddsd
  1634  		case ssa.OpcodeFsub:
  1635  			op = sseOpcodeSubsd
  1636  		case ssa.OpcodeFmul:
  1637  			op = sseOpcodeMulsd
  1638  		case ssa.OpcodeFdiv:
  1639  			op = sseOpcodeDivsd
  1640  		default:
  1641  			panic("BUG")
  1642  		}
  1643  	} else {
  1644  		switch instr.Opcode() {
  1645  		case ssa.OpcodeFadd:
  1646  			op = sseOpcodeAddss
  1647  		case ssa.OpcodeFsub:
  1648  			op = sseOpcodeSubss
  1649  		case ssa.OpcodeFmul:
  1650  			op = sseOpcodeMulss
  1651  		case ssa.OpcodeFdiv:
  1652  			op = sseOpcodeDivss
  1653  		default:
  1654  			panic("BUG")
  1655  		}
  1656  	}
  1657  
  1658  	xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
  1659  	rn := m.getOperand_Reg(yDef)
  1660  	rm := m.getOperand_Reg(xDef)
  1661  	rd := m.c.VRegOf(instr.Return())
  1662  
  1663  	// rm is being overwritten, so we first copy its value to a temp register,
  1664  	// in case it is referenced again later.
  1665  	tmp := m.copyToTmp(rm.reg())
  1666  
  1667  	xmm := m.allocateInstr().asXmmRmR(op, rn, tmp)
  1668  	m.insert(xmm)
  1669  
  1670  	m.copyTo(tmp, rd)
  1671  }
  1672  
  1673  func (m *machine) lowerSqrt(instr *ssa.Instruction) {
  1674  	x := instr.Arg()
  1675  	if !x.Type().IsFloat() {
  1676  		panic("BUG")
  1677  	}
  1678  	_64 := x.Type().Bits() == 64
  1679  	var op sseOpcode
  1680  	if _64 {
  1681  		op = sseOpcodeSqrtsd
  1682  	} else {
  1683  		op = sseOpcodeSqrtss
  1684  	}
  1685  
  1686  	xDef := m.c.ValueDefinition(x)
  1687  	rm := m.getOperand_Mem_Reg(xDef)
  1688  	rd := m.c.VRegOf(instr.Return())
  1689  
  1690  	xmm := m.allocateInstr().asXmmUnaryRmR(op, rm, rd)
  1691  	m.insert(xmm)
  1692  }
  1693  
  1694  func (m *machine) lowerFabsFneg(instr *ssa.Instruction) {
  1695  	x := instr.Arg()
  1696  	if !x.Type().IsFloat() {
  1697  		panic("BUG")
  1698  	}
  1699  	_64 := x.Type().Bits() == 64
  1700  	var op sseOpcode
  1701  	var mask uint64
  1702  	if _64 {
  1703  		switch instr.Opcode() {
  1704  		case ssa.OpcodeFabs:
  1705  			mask, op = 0x7fffffffffffffff, sseOpcodeAndpd
  1706  		case ssa.OpcodeFneg:
  1707  			mask, op = 0x8000000000000000, sseOpcodeXorpd
  1708  		}
  1709  	} else {
  1710  		switch instr.Opcode() {
  1711  		case ssa.OpcodeFabs:
  1712  			mask, op = 0x7fffffff, sseOpcodeAndps
  1713  		case ssa.OpcodeFneg:
  1714  			mask, op = 0x80000000, sseOpcodeXorps
  1715  		}
  1716  	}
  1717  
  1718  	tmp := m.c.AllocateVReg(x.Type())
  1719  
  1720  	xDef := m.c.ValueDefinition(x)
  1721  	rm := m.getOperand_Reg(xDef)
  1722  	rd := m.c.VRegOf(instr.Return())
  1723  
  1724  	m.lowerFconst(tmp, mask, _64)
  1725  
  1726  	xmm := m.allocateInstr().asXmmRmR(op, rm, tmp)
  1727  	m.insert(xmm)
  1728  
  1729  	m.copyTo(tmp, rd)
  1730  }
  1731  
  1732  func (m *machine) lowerStore(si *ssa.Instruction) {
  1733  	value, ptr, offset, storeSizeInBits := si.StoreData()
  1734  	rm := m.getOperand_Reg(m.c.ValueDefinition(value))
  1735  	mem := newOperandMem(m.lowerToAddressMode(ptr, offset))
  1736  
  1737  	store := m.allocateInstr()
  1738  	switch value.Type() {
  1739  	case ssa.TypeI32:
  1740  		store.asMovRM(rm.reg(), mem, storeSizeInBits/8)
  1741  	case ssa.TypeI64:
  1742  		store.asMovRM(rm.reg(), mem, storeSizeInBits/8)
  1743  	case ssa.TypeF32:
  1744  		store.asXmmMovRM(sseOpcodeMovss, rm.reg(), mem)
  1745  	case ssa.TypeF64:
  1746  		store.asXmmMovRM(sseOpcodeMovsd, rm.reg(), mem)
  1747  	case ssa.TypeV128:
  1748  		store.asXmmMovRM(sseOpcodeMovdqu, rm.reg(), mem)
  1749  	default:
  1750  		panic("BUG")
  1751  	}
  1752  	m.insert(store)
  1753  }
  1754  
  1755  func (m *machine) lowerCall(si *ssa.Instruction) {
  1756  	isDirectCall := si.Opcode() == ssa.OpcodeCall
  1757  	var indirectCalleePtr ssa.Value
  1758  	var directCallee ssa.FuncRef
  1759  	var sigID ssa.SignatureID
  1760  	var args []ssa.Value
  1761  	var isMemmove bool
  1762  	if isDirectCall {
  1763  		directCallee, sigID, args = si.CallData()
  1764  	} else {
  1765  		indirectCalleePtr, sigID, args, isMemmove = si.CallIndirectData()
  1766  	}
  1767  	calleeABI := m.c.GetFunctionABI(m.c.SSABuilder().ResolveSignature(sigID))
  1768  
  1769  	stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize())
  1770  	if m.maxRequiredStackSizeForCalls < stackSlotSize+16 {
  1771  		m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // 16 == return address + RBP.
  1772  	}
  1773  
  1774  	// Note: See machine.SetupPrologue for the stack layout.
  1775  	// The stack pointer decrease/increase will be inserted later in the compilation.
  1776  
  1777  	for i, arg := range args {
  1778  		reg := m.c.VRegOf(arg)
  1779  		def := m.c.ValueDefinition(arg)
  1780  		m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize)
  1781  	}
  1782  
  1783  	if isMemmove {
  1784  		// Go's memmove *might* use all xmm0-xmm15, so we need to release them.
  1785  		// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#architecture-specifics
  1786  		// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/runtime/memmove_amd64.s#L271-L286
  1787  		for i := regalloc.RealReg(0); i < 16; i++ {
  1788  			m.insert(m.allocateInstr().asDefineUninitializedReg(regInfo.RealRegToVReg[xmm0+i]))
  1789  		}
  1790  	}
  1791  
  1792  	if isDirectCall {
  1793  		call := m.allocateInstr().asCall(directCallee, calleeABI)
  1794  		m.insert(call)
  1795  	} else {
  1796  		ptrOp := m.getOperand_Mem_Reg(m.c.ValueDefinition(indirectCalleePtr))
  1797  		callInd := m.allocateInstr().asCallIndirect(ptrOp, calleeABI)
  1798  		m.insert(callInd)
  1799  	}
  1800  
  1801  	if isMemmove {
  1802  		for i := regalloc.RealReg(0); i < 16; i++ {
  1803  			m.insert(m.allocateInstr().asNopUseReg(regInfo.RealRegToVReg[xmm0+i]))
  1804  		}
  1805  	}
  1806  
  1807  	var index int
  1808  	r1, rs := si.Returns()
  1809  	if r1.Valid() {
  1810  		m.callerGenFunctionReturnVReg(calleeABI, 0, m.c.VRegOf(r1), stackSlotSize)
  1811  		index++
  1812  	}
  1813  
  1814  	for _, r := range rs {
  1815  		m.callerGenFunctionReturnVReg(calleeABI, index, m.c.VRegOf(r), stackSlotSize)
  1816  		index++
  1817  	}
  1818  }
  1819  
  1820  // callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the
  1821  // caller side of the function call.
  1822  func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, stackSlotSize int64) {
  1823  	arg := &a.Args[argIndex]
  1824  	if def != nil && def.IsFromInstr() {
  1825  		// Constant instructions are inlined.
  1826  		if inst := def.Instr; inst.Constant() {
  1827  			m.insertLoadConstant(inst, reg)
  1828  		}
  1829  	}
  1830  	if arg.Kind == backend.ABIArgKindReg {
  1831  		m.InsertMove(arg.Reg, reg, arg.Type)
  1832  	} else {
  1833  		store := m.allocateInstr()
  1834  		mem := newOperandMem(m.newAmodeImmReg(
  1835  			// -stackSlotSize because the stack pointer is not yet decreased.
  1836  			uint32(arg.Offset-stackSlotSize), rspVReg))
  1837  		switch arg.Type {
  1838  		case ssa.TypeI32:
  1839  			store.asMovRM(reg, mem, 4)
  1840  		case ssa.TypeI64:
  1841  			store.asMovRM(reg, mem, 8)
  1842  		case ssa.TypeF32:
  1843  			store.asXmmMovRM(sseOpcodeMovss, reg, mem)
  1844  		case ssa.TypeF64:
  1845  			store.asXmmMovRM(sseOpcodeMovsd, reg, mem)
  1846  		case ssa.TypeV128:
  1847  			store.asXmmMovRM(sseOpcodeMovdqu, reg, mem)
  1848  		default:
  1849  			panic("BUG")
  1850  		}
  1851  		m.insert(store)
  1852  	}
  1853  }
  1854  
  1855  func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, stackSlotSize int64) {
  1856  	r := &a.Rets[retIndex]
  1857  	if r.Kind == backend.ABIArgKindReg {
  1858  		m.InsertMove(reg, r.Reg, r.Type)
  1859  	} else {
  1860  		load := m.allocateInstr()
  1861  		mem := newOperandMem(m.newAmodeImmReg(
  1862  			// -stackSlotSize because the stack pointer is not yet decreased.
  1863  			uint32(a.ArgStackSize+r.Offset-stackSlotSize), rspVReg))
  1864  		switch r.Type {
  1865  		case ssa.TypeI32:
  1866  			load.asMovzxRmR(extModeLQ, mem, reg)
  1867  		case ssa.TypeI64:
  1868  			load.asMov64MR(mem, reg)
  1869  		case ssa.TypeF32:
  1870  			load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg)
  1871  		case ssa.TypeF64:
  1872  			load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg)
  1873  		case ssa.TypeV128:
  1874  			load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg)
  1875  		default:
  1876  			panic("BUG")
  1877  		}
  1878  		m.insert(load)
  1879  	}
  1880  }
  1881  
  1882  // InsertMove implements backend.Machine.
  1883  func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) {
  1884  	switch typ {
  1885  	case ssa.TypeI32, ssa.TypeI64:
  1886  		i := m.allocateInstr().asMovRR(src, dst, typ.Bits() == 64)
  1887  		m.insert(i)
  1888  	case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
  1889  		var op sseOpcode
  1890  		switch typ {
  1891  		case ssa.TypeF32:
  1892  			op = sseOpcodeMovss
  1893  		case ssa.TypeF64:
  1894  			op = sseOpcodeMovsd
  1895  		case ssa.TypeV128:
  1896  			op = sseOpcodeMovdqa
  1897  		}
  1898  		i := m.allocateInstr().asXmmUnaryRmR(op, newOperandReg(src), dst)
  1899  		m.insert(i)
  1900  	default:
  1901  		panic("BUG")
  1902  	}
  1903  }
  1904  
  1905  // Format implements backend.Machine.
  1906  func (m *machine) Format() string {
  1907  	ectx := m.ectx
  1908  	begins := map[*instruction]backend.Label{}
  1909  	for l, pos := range ectx.LabelPositions {
  1910  		begins[pos.Begin] = l
  1911  	}
  1912  
  1913  	irBlocks := map[backend.Label]ssa.BasicBlockID{}
  1914  	for i, l := range ectx.SsaBlockIDToLabels {
  1915  		irBlocks[l] = ssa.BasicBlockID(i)
  1916  	}
  1917  
  1918  	var lines []string
  1919  	for cur := ectx.RootInstr; cur != nil; cur = cur.next {
  1920  		if l, ok := begins[cur]; ok {
  1921  			var labelStr string
  1922  			if blkID, ok := irBlocks[l]; ok {
  1923  				labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID)
  1924  			} else {
  1925  				labelStr = fmt.Sprintf("%s:", l)
  1926  			}
  1927  			lines = append(lines, labelStr)
  1928  		}
  1929  		if cur.kind == nop0 {
  1930  			continue
  1931  		}
  1932  		lines = append(lines, "\t"+cur.String())
  1933  	}
  1934  	for _, vc := range m.consts {
  1935  		if vc._var == nil {
  1936  			lines = append(lines, fmt.Sprintf("%s: const [%d %d]", vc.label.L, vc.lo, vc.hi))
  1937  		} else {
  1938  			lines = append(lines, fmt.Sprintf("%s: const %#x", vc.label.L, vc._var))
  1939  		}
  1940  	}
  1941  	return "\n" + strings.Join(lines, "\n") + "\n"
  1942  }
  1943  
  1944  func (m *machine) encodeWithoutSSA(root *instruction) {
  1945  	m.labelResolutionPends = m.labelResolutionPends[:0]
  1946  	ectx := m.ectx
  1947  
  1948  	bufPtr := m.c.BufPtr()
  1949  	for cur := root; cur != nil; cur = cur.next {
  1950  		offset := int64(len(*bufPtr))
  1951  		if cur.kind == nop0 {
  1952  			l := cur.nop0Label()
  1953  			if pos, ok := ectx.LabelPositions[l]; ok {
  1954  				pos.BinaryOffset = offset
  1955  			}
  1956  		}
  1957  
  1958  		needLabelResolution := cur.encode(m.c)
  1959  		if needLabelResolution {
  1960  			m.labelResolutionPends = append(m.labelResolutionPends,
  1961  				labelResolutionPend{instr: cur, imm32Offset: int64(len(*bufPtr)) - 4},
  1962  			)
  1963  		}
  1964  	}
  1965  
  1966  	for i := range m.labelResolutionPends {
  1967  		p := &m.labelResolutionPends[i]
  1968  		switch p.instr.kind {
  1969  		case jmp, jmpIf, lea:
  1970  			target := p.instr.jmpLabel()
  1971  			targetOffset := ectx.LabelPositions[target].BinaryOffset
  1972  			imm32Offset := p.imm32Offset
  1973  			jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction.
  1974  			binary.LittleEndian.PutUint32((*bufPtr)[imm32Offset:], uint32(jmpOffset))
  1975  		default:
  1976  			panic("BUG")
  1977  		}
  1978  	}
  1979  }
  1980  
  1981  // Encode implements backend.Machine Encode.
  1982  func (m *machine) Encode(ctx context.Context) (err error) {
  1983  	ectx := m.ectx
  1984  	bufPtr := m.c.BufPtr()
  1985  
  1986  	var fn string
  1987  	var fnIndex int
  1988  	var labelToSSABlockID map[backend.Label]ssa.BasicBlockID
  1989  	if wazevoapi.PerfMapEnabled {
  1990  		fn = wazevoapi.GetCurrentFunctionName(ctx)
  1991  		labelToSSABlockID = make(map[backend.Label]ssa.BasicBlockID)
  1992  		for i, l := range ectx.SsaBlockIDToLabels {
  1993  			labelToSSABlockID[l] = ssa.BasicBlockID(i)
  1994  		}
  1995  		fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx)
  1996  	}
  1997  
  1998  	m.labelResolutionPends = m.labelResolutionPends[:0]
  1999  	for _, pos := range ectx.OrderedBlockLabels {
  2000  		offset := int64(len(*bufPtr))
  2001  		pos.BinaryOffset = offset
  2002  		for cur := pos.Begin; cur != pos.End.next; cur = cur.next {
  2003  			offset := int64(len(*bufPtr))
  2004  
  2005  			switch cur.kind {
  2006  			case nop0:
  2007  				l := cur.nop0Label()
  2008  				if pos, ok := ectx.LabelPositions[l]; ok {
  2009  					pos.BinaryOffset = offset
  2010  				}
  2011  			case sourceOffsetInfo:
  2012  				m.c.AddSourceOffsetInfo(offset, cur.sourceOffsetInfo())
  2013  			}
  2014  
  2015  			needLabelResolution := cur.encode(m.c)
  2016  			if needLabelResolution {
  2017  				m.labelResolutionPends = append(m.labelResolutionPends,
  2018  					labelResolutionPend{instr: cur, instrOffset: offset, imm32Offset: int64(len(*bufPtr)) - 4},
  2019  				)
  2020  			}
  2021  		}
  2022  
  2023  		if wazevoapi.PerfMapEnabled {
  2024  			l := pos.L
  2025  			var labelStr string
  2026  			if blkID, ok := labelToSSABlockID[l]; ok {
  2027  				labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID)
  2028  			} else {
  2029  				labelStr = l.String()
  2030  			}
  2031  			size := int64(len(*bufPtr)) - offset
  2032  			wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr))
  2033  		}
  2034  	}
  2035  
  2036  	for i := range m.consts {
  2037  		offset := int64(len(*bufPtr))
  2038  		vc := &m.consts[i]
  2039  		vc.label.BinaryOffset = offset
  2040  		if vc._var == nil {
  2041  			lo, hi := vc.lo, vc.hi
  2042  			m.c.Emit8Bytes(lo)
  2043  			m.c.Emit8Bytes(hi)
  2044  		} else {
  2045  			for _, b := range vc._var {
  2046  				m.c.EmitByte(b)
  2047  			}
  2048  		}
  2049  	}
  2050  
  2051  	buf := *bufPtr
  2052  	for i := range m.labelResolutionPends {
  2053  		p := &m.labelResolutionPends[i]
  2054  		switch p.instr.kind {
  2055  		case jmp, jmpIf, lea, xmmUnaryRmR:
  2056  			target := p.instr.jmpLabel()
  2057  			targetOffset := ectx.LabelPositions[target].BinaryOffset
  2058  			imm32Offset := p.imm32Offset
  2059  			jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction.
  2060  			binary.LittleEndian.PutUint32(buf[imm32Offset:], uint32(jmpOffset))
  2061  		case jmpTableIsland:
  2062  			tableBegin := p.instrOffset
  2063  			// Each entry is the offset from the beginning of the jmpTableIsland instruction in 8 bytes.
  2064  			targets := m.jmpTableTargets[p.instr.u1]
  2065  			for i, l := range targets {
  2066  				targetOffset := ectx.LabelPositions[backend.Label(l)].BinaryOffset
  2067  				jmpOffset := targetOffset - tableBegin
  2068  				binary.LittleEndian.PutUint64(buf[tableBegin+int64(i)*8:], uint64(jmpOffset))
  2069  			}
  2070  		default:
  2071  			panic("BUG")
  2072  		}
  2073  	}
  2074  	return
  2075  }
  2076  
  2077  // ResolveRelocations implements backend.Machine.
  2078  func (m *machine) ResolveRelocations(refToBinaryOffset []int, binary []byte, relocations []backend.RelocationInfo, _ []int) {
  2079  	for _, r := range relocations {
  2080  		offset := r.Offset
  2081  		calleeFnOffset := refToBinaryOffset[r.FuncRef]
  2082  		// offset is the offset of the last 4 bytes of the call instruction.
  2083  		callInstrOffsetBytes := binary[offset : offset+4]
  2084  		diff := int64(calleeFnOffset) - (offset + 4) // +4 because we want the offset of the next instruction (In x64, RIP always points to the next instruction).
  2085  		callInstrOffsetBytes[0] = byte(diff)
  2086  		callInstrOffsetBytes[1] = byte(diff >> 8)
  2087  		callInstrOffsetBytes[2] = byte(diff >> 16)
  2088  		callInstrOffsetBytes[3] = byte(diff >> 24)
  2089  	}
  2090  }
  2091  
  2092  // CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo.
  2093  func (m *machine) CallTrampolineIslandInfo(_ int) (_, _ int, _ error) { return }
  2094  
  2095  func (m *machine) lowerIcmpToFlag(xd, yd *backend.SSAValueDefinition, _64 bool) {
  2096  	x := m.getOperand_Reg(xd)
  2097  	y := m.getOperand_Mem_Imm32_Reg(yd)
  2098  	cmp := m.allocateInstr().asCmpRmiR(true, y, x.reg(), _64)
  2099  	m.insert(cmp)
  2100  }
  2101  
  2102  func (m *machine) lowerFcmpToFlags(instr *ssa.Instruction) (f1, f2 cond, and bool) {
  2103  	x, y, c := instr.FcmpData()
  2104  	switch c {
  2105  	case ssa.FloatCmpCondEqual:
  2106  		f1, f2 = condNP, condZ
  2107  		and = true
  2108  	case ssa.FloatCmpCondNotEqual:
  2109  		f1, f2 = condP, condNZ
  2110  	case ssa.FloatCmpCondLessThan:
  2111  		f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThan)
  2112  		f2 = condInvalid
  2113  		x, y = y, x
  2114  	case ssa.FloatCmpCondLessThanOrEqual:
  2115  		f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThanOrEqual)
  2116  		f2 = condInvalid
  2117  		x, y = y, x
  2118  	default:
  2119  		f1 = condFromSSAFloatCmpCond(c)
  2120  		f2 = condInvalid
  2121  	}
  2122  
  2123  	var opc sseOpcode
  2124  	if x.Type() == ssa.TypeF32 {
  2125  		opc = sseOpcodeUcomiss
  2126  	} else {
  2127  		opc = sseOpcodeUcomisd
  2128  	}
  2129  
  2130  	xr := m.getOperand_Reg(m.c.ValueDefinition(x))
  2131  	yr := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
  2132  	m.insert(m.allocateInstr().asXmmCmpRmR(opc, yr, xr.reg()))
  2133  	return
  2134  }
  2135  
  2136  // allocateInstr allocates an instruction.
  2137  func (m *machine) allocateInstr() *instruction {
  2138  	instr := m.ectx.InstructionPool.Allocate()
  2139  	if !m.regAllocStarted {
  2140  		instr.addedBeforeRegAlloc = true
  2141  	}
  2142  	return instr
  2143  }
  2144  
  2145  func (m *machine) allocateNop() *instruction {
  2146  	instr := m.allocateInstr()
  2147  	instr.kind = nop0
  2148  	return instr
  2149  }
  2150  
  2151  func (m *machine) insert(i *instruction) {
  2152  	ectx := m.ectx
  2153  	ectx.PendingInstructions = append(ectx.PendingInstructions, i)
  2154  }
  2155  
  2156  func (m *machine) allocateBrTarget() (nop *instruction, l backend.Label) { //nolint
  2157  	pos := m.allocateLabel()
  2158  	l = pos.L
  2159  	nop = m.allocateInstr()
  2160  	nop.asNop0WithLabel(l)
  2161  	pos.Begin, pos.End = nop, nop
  2162  	return
  2163  }
  2164  
  2165  func (m *machine) allocateLabel() *labelPosition {
  2166  	ectx := m.ectx
  2167  	l := ectx.AllocateLabel()
  2168  	pos := ectx.AllocateLabelPosition(l)
  2169  	ectx.LabelPositions[l] = pos
  2170  	return pos
  2171  }
  2172  
  2173  func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 {
  2174  	offset, ok := m.spillSlots[id]
  2175  	if !ok {
  2176  		offset = m.spillSlotSize
  2177  		m.spillSlots[id] = offset
  2178  		m.spillSlotSize += int64(size)
  2179  	}
  2180  	return offset
  2181  }
  2182  
  2183  func (m *machine) copyTo(src regalloc.VReg, dst regalloc.VReg) {
  2184  	mov := m.allocateInstr()
  2185  	if src.RegType() == regalloc.RegTypeInt {
  2186  		mov.asMovRR(src, dst, true)
  2187  	} else {
  2188  		mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst)
  2189  	}
  2190  	m.insert(mov)
  2191  }
  2192  
  2193  func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg {
  2194  	typ := m.c.TypeOf(v)
  2195  	tmp := m.c.AllocateVReg(typ)
  2196  	m.copyTo(v, tmp)
  2197  	return tmp
  2198  }
  2199  
  2200  func (m *machine) requiredStackSize() int64 {
  2201  	return m.maxRequiredStackSizeForCalls +
  2202  		m.frameSize() +
  2203  		16 + // Need for stack checking.
  2204  		16 // return address and the caller RBP.
  2205  }
  2206  
  2207  func (m *machine) frameSize() int64 {
  2208  	s := m.clobberedRegSlotSize() + m.spillSlotSize
  2209  	if s&0xf != 0 {
  2210  		panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s))
  2211  	}
  2212  	return s
  2213  }
  2214  
  2215  func (m *machine) clobberedRegSlotSize() int64 {
  2216  	return int64(len(m.clobberedRegs) * 16)
  2217  }
  2218  
  2219  func (m *machine) lowerIDivRem(si *ssa.Instruction, isDiv bool, signed bool) {
  2220  	x, y, execCtx := si.Arg3()
  2221  
  2222  	dividend := m.getOperand_Reg(m.c.ValueDefinition(x))
  2223  	divisor := m.getOperand_Reg(m.c.ValueDefinition(y))
  2224  	ctxVReg := m.c.VRegOf(execCtx)
  2225  	tmpGp := m.c.AllocateVReg(si.Return().Type())
  2226  
  2227  	m.copyTo(dividend.reg(), raxVReg)
  2228  	m.insert(m.allocateInstr().asDefineUninitializedReg(rdxVReg))
  2229  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
  2230  	seq := m.allocateInstr().asIdivRemSequence(ctxVReg, divisor.reg(), tmpGp, isDiv, signed, x.Type().Bits() == 64)
  2231  	m.insert(seq)
  2232  	rd := m.c.VRegOf(si.Return())
  2233  	if isDiv {
  2234  		m.copyTo(raxVReg, rd)
  2235  	} else {
  2236  		m.copyTo(rdxVReg, rd)
  2237  	}
  2238  }
  2239  
  2240  func (m *machine) lowerIDivRemSequenceAfterRegAlloc(i *instruction) {
  2241  	execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData()
  2242  
  2243  	dividend := raxVReg
  2244  
  2245  	// Ensure yr is not zero.
  2246  	test := m.allocateInstr()
  2247  	test.asCmpRmiR(false, newOperandReg(divisor), divisor, _64)
  2248  	m.insert(test)
  2249  
  2250  	jnz := m.allocateInstr()
  2251  	m.insert(jnz)
  2252  
  2253  	nz := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerDivisionByZero)
  2254  
  2255  	// If not zero, we can proceed with the division.
  2256  	jnz.asJmpIf(condNZ, newOperandLabel(nz))
  2257  
  2258  	var ifRemNeg1 *instruction
  2259  	if signed {
  2260  		var neg1 uint64
  2261  		if _64 {
  2262  			neg1 = 0xffffffffffffffff
  2263  		} else {
  2264  			neg1 = 0xffffffff
  2265  		}
  2266  		m.lowerIconst(tmpGp, neg1, _64)
  2267  
  2268  		if isDiv {
  2269  			// For signed division, we have to have branches for "math.MinInt{32,64} / -1"
  2270  			// case which results in the floating point exception via division error as
  2271  			// the resulting value exceeds the maximum of signed int.
  2272  
  2273  			// First, we check if the divisor is -1.
  2274  			cmp := m.allocateInstr()
  2275  			cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64)
  2276  			m.insert(cmp)
  2277  
  2278  			ifNotNeg1 := m.allocateInstr()
  2279  			m.insert(ifNotNeg1)
  2280  
  2281  			var minInt uint64
  2282  			if _64 {
  2283  				minInt = 0x8000000000000000
  2284  			} else {
  2285  				minInt = 0x80000000
  2286  			}
  2287  			m.lowerIconst(tmpGp, minInt, _64)
  2288  
  2289  			// Next we check if the quotient is the most negative value for the signed integer, i.e.
  2290  			// if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively.
  2291  			cmp2 := m.allocateInstr()
  2292  			cmp2.asCmpRmiR(true, newOperandReg(tmpGp), dividend, _64)
  2293  			m.insert(cmp2)
  2294  
  2295  			ifNotMinInt := m.allocateInstr()
  2296  			m.insert(ifNotMinInt)
  2297  
  2298  			// Trap if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1),
  2299  			// as that is the overflow in division as the result becomes 2^31 which is larger than
  2300  			// the maximum of signed 32-bit int (2^31-1).
  2301  			end := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
  2302  			ifNotNeg1.asJmpIf(condNZ, newOperandLabel(end))
  2303  			ifNotMinInt.asJmpIf(condNZ, newOperandLabel(end))
  2304  		} else {
  2305  			// If it is remainder, zeros DX register and compare the divisor to -1.
  2306  			xor := m.allocateInstr().asZeros(rdxVReg)
  2307  			m.insert(xor)
  2308  
  2309  			// We check if the divisor is -1.
  2310  			cmp := m.allocateInstr()
  2311  			cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64)
  2312  			m.insert(cmp)
  2313  
  2314  			ifRemNeg1 = m.allocateInstr()
  2315  			m.insert(ifRemNeg1)
  2316  		}
  2317  
  2318  		// Sign-extend DX register to have 2*x.Type().Bits() dividend over DX and AX registers.
  2319  		sed := m.allocateInstr()
  2320  		sed.asSignExtendData(_64)
  2321  		m.insert(sed)
  2322  	} else {
  2323  		// Zeros DX register to have 2*x.Type().Bits() dividend over DX and AX registers.
  2324  		zeros := m.allocateInstr().asZeros(rdxVReg)
  2325  		m.insert(zeros)
  2326  	}
  2327  
  2328  	div := m.allocateInstr()
  2329  	div.asDiv(newOperandReg(divisor), signed, _64)
  2330  	m.insert(div)
  2331  
  2332  	nop, end := m.allocateBrTarget()
  2333  	m.insert(nop)
  2334  	// If we are compiling a Rem instruction, when the divisor is -1 we land at the end of the function.
  2335  	if ifRemNeg1 != nil {
  2336  		ifRemNeg1.asJmpIf(condZ, newOperandLabel(end))
  2337  	}
  2338  }
  2339  
  2340  func (m *machine) lowerRound(instr *ssa.Instruction, imm roundingMode) {
  2341  	x := instr.Arg()
  2342  	if !x.Type().IsFloat() {
  2343  		panic("BUG?")
  2344  	}
  2345  	var op sseOpcode
  2346  	if x.Type().Bits() == 64 {
  2347  		op = sseOpcodeRoundsd
  2348  	} else {
  2349  		op = sseOpcodeRoundss
  2350  	}
  2351  
  2352  	xDef := m.c.ValueDefinition(x)
  2353  	rm := m.getOperand_Mem_Reg(xDef)
  2354  	rd := m.c.VRegOf(instr.Return())
  2355  
  2356  	xmm := m.allocateInstr().asXmmUnaryRmRImm(op, uint8(imm), rm, rd)
  2357  	m.insert(xmm)
  2358  }
  2359  
  2360  func (m *machine) lowerFminFmax(instr *ssa.Instruction) {
  2361  	x, y := instr.Arg2()
  2362  	if !x.Type().IsFloat() {
  2363  		panic("BUG?")
  2364  	}
  2365  
  2366  	_64 := x.Type().Bits() == 64
  2367  	isMin := instr.Opcode() == ssa.OpcodeFmin
  2368  	var minMaxOp sseOpcode
  2369  
  2370  	switch {
  2371  	case _64 && isMin:
  2372  		minMaxOp = sseOpcodeMinpd
  2373  	case _64 && !isMin:
  2374  		minMaxOp = sseOpcodeMaxpd
  2375  	case !_64 && isMin:
  2376  		minMaxOp = sseOpcodeMinps
  2377  	case !_64 && !isMin:
  2378  		minMaxOp = sseOpcodeMaxps
  2379  	}
  2380  
  2381  	xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
  2382  	rm := m.getOperand_Reg(xDef)
  2383  	// We cannot ensure that y is aligned to 16 bytes, so we have to use it on reg.
  2384  	rn := m.getOperand_Reg(yDef)
  2385  	rd := m.c.VRegOf(instr.Return())
  2386  
  2387  	tmp := m.copyToTmp(rm.reg())
  2388  
  2389  	// Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case.
  2390  	cmp := m.allocateInstr()
  2391  	if _64 {
  2392  		cmp.asXmmCmpRmR(sseOpcodeUcomisd, rn, tmp)
  2393  	} else {
  2394  		cmp.asXmmCmpRmR(sseOpcodeUcomiss, rn, tmp)
  2395  	}
  2396  	m.insert(cmp)
  2397  
  2398  	// At this point, we have the three cases of conditional flags below
  2399  	// (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.)
  2400  	//
  2401  	// 1) Two values are NaN-free and different: All flags are cleared.
  2402  	// 2) Two values are NaN-free and equal: Only ZF flags is set.
  2403  	// 3) One of Two values is NaN: ZF, PF and CF flags are set.
  2404  
  2405  	// Jump instruction to handle 1) case by checking the ZF flag
  2406  	// as ZF is only set for 2) and 3) cases.
  2407  	nanFreeOrDiffJump := m.allocateInstr()
  2408  	m.insert(nanFreeOrDiffJump)
  2409  
  2410  	// Start handling 2) and 3).
  2411  
  2412  	// Jump if one of two values is NaN by checking the parity flag (PF).
  2413  	ifIsNan := m.allocateInstr()
  2414  	m.insert(ifIsNan)
  2415  
  2416  	// Start handling 2) NaN-free and equal.
  2417  
  2418  	// Before we exit this case, we have to ensure that positive zero (or negative zero for min instruction) is
  2419  	// returned if two values are positive and negative zeros.
  2420  	var op sseOpcode
  2421  	switch {
  2422  	case !_64 && isMin:
  2423  		op = sseOpcodeOrps
  2424  	case _64 && isMin:
  2425  		op = sseOpcodeOrpd
  2426  	case !_64 && !isMin:
  2427  		op = sseOpcodeAndps
  2428  	case _64 && !isMin:
  2429  		op = sseOpcodeAndpd
  2430  	}
  2431  	orAnd := m.allocateInstr()
  2432  	orAnd.asXmmRmR(op, rn, tmp)
  2433  	m.insert(orAnd)
  2434  
  2435  	// Done, jump to end.
  2436  	sameExitJump := m.allocateInstr()
  2437  	m.insert(sameExitJump)
  2438  
  2439  	// Start handling 3) either is NaN.
  2440  	isNanTarget, isNan := m.allocateBrTarget()
  2441  	m.insert(isNanTarget)
  2442  	ifIsNan.asJmpIf(condP, newOperandLabel(isNan))
  2443  
  2444  	// We emit the ADD instruction to produce the NaN in tmp.
  2445  	add := m.allocateInstr()
  2446  	if _64 {
  2447  		add.asXmmRmR(sseOpcodeAddsd, rn, tmp)
  2448  	} else {
  2449  		add.asXmmRmR(sseOpcodeAddss, rn, tmp)
  2450  	}
  2451  	m.insert(add)
  2452  
  2453  	// Exit from the NaN case branch.
  2454  	nanExitJmp := m.allocateInstr()
  2455  	m.insert(nanExitJmp)
  2456  
  2457  	// Start handling 1).
  2458  	doMinMaxTarget, doMinMax := m.allocateBrTarget()
  2459  	m.insert(doMinMaxTarget)
  2460  	nanFreeOrDiffJump.asJmpIf(condNZ, newOperandLabel(doMinMax))
  2461  
  2462  	// Now handle the NaN-free and different values case.
  2463  	minMax := m.allocateInstr()
  2464  	minMax.asXmmRmR(minMaxOp, rn, tmp)
  2465  	m.insert(minMax)
  2466  
  2467  	endNop, end := m.allocateBrTarget()
  2468  	m.insert(endNop)
  2469  	nanExitJmp.asJmp(newOperandLabel(end))
  2470  	sameExitJump.asJmp(newOperandLabel(end))
  2471  
  2472  	m.copyTo(tmp, rd)
  2473  }
  2474  
  2475  func (m *machine) lowerFcopysign(instr *ssa.Instruction) {
  2476  	x, y := instr.Arg2()
  2477  	if !x.Type().IsFloat() {
  2478  		panic("BUG")
  2479  	}
  2480  
  2481  	_64 := x.Type().Bits() == 64
  2482  
  2483  	xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
  2484  	rm := m.getOperand_Reg(xDef)
  2485  	rn := m.getOperand_Reg(yDef)
  2486  	rd := m.c.VRegOf(instr.Return())
  2487  
  2488  	// Clear the non-sign bits of src via AND with the mask.
  2489  	var opAnd, opOr sseOpcode
  2490  	var signMask uint64
  2491  	if _64 {
  2492  		signMask, opAnd, opOr = 0x8000000000000000, sseOpcodeAndpd, sseOpcodeOrpd
  2493  	} else {
  2494  		signMask, opAnd, opOr = 0x80000000, sseOpcodeAndps, sseOpcodeOrps
  2495  	}
  2496  
  2497  	signBitReg := m.c.AllocateVReg(x.Type())
  2498  	m.lowerFconst(signBitReg, signMask, _64)
  2499  	nonSignBitReg := m.c.AllocateVReg(x.Type())
  2500  	m.lowerFconst(nonSignBitReg, ^signMask, _64)
  2501  
  2502  	// Extract the sign bits of rn.
  2503  	and := m.allocateInstr().asXmmRmR(opAnd, rn, signBitReg)
  2504  	m.insert(and)
  2505  
  2506  	// Clear the sign bit of dst via AND with the non-sign bit mask.
  2507  	xor := m.allocateInstr().asXmmRmR(opAnd, rm, nonSignBitReg)
  2508  	m.insert(xor)
  2509  
  2510  	// Copy the sign bits of src to dst via OR.
  2511  	or := m.allocateInstr().asXmmRmR(opOr, newOperandReg(signBitReg), nonSignBitReg)
  2512  	m.insert(or)
  2513  
  2514  	m.copyTo(nonSignBitReg, rd)
  2515  }
  2516  
  2517  func (m *machine) lowerBitcast(instr *ssa.Instruction) {
  2518  	x, dstTyp := instr.BitcastData()
  2519  	srcTyp := x.Type()
  2520  	rn := m.getOperand_Reg(m.c.ValueDefinition(x))
  2521  	rd := m.c.VRegOf(instr.Return())
  2522  	switch {
  2523  	case srcTyp == ssa.TypeF32 && dstTyp == ssa.TypeI32:
  2524  		cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovd, rn.reg(), rd, false)
  2525  		m.insert(cvt)
  2526  	case srcTyp == ssa.TypeI32 && dstTyp == ssa.TypeF32:
  2527  		cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovd, rn, rd, false)
  2528  		m.insert(cvt)
  2529  	case srcTyp == ssa.TypeF64 && dstTyp == ssa.TypeI64:
  2530  		cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovq, rn.reg(), rd, true)
  2531  		m.insert(cvt)
  2532  	case srcTyp == ssa.TypeI64 && dstTyp == ssa.TypeF64:
  2533  		cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovq, rn, rd, true)
  2534  		m.insert(cvt)
  2535  	default:
  2536  		panic(fmt.Sprintf("invalid bitcast from %s to %s", srcTyp, dstTyp))
  2537  	}
  2538  }
  2539  
  2540  func (m *machine) lowerFcvtToSint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) {
  2541  	var tmpXmm regalloc.VReg
  2542  	if dst64 {
  2543  		tmpXmm = m.c.AllocateVReg(ssa.TypeF64)
  2544  	} else {
  2545  		tmpXmm = m.c.AllocateVReg(ssa.TypeF32)
  2546  	}
  2547  
  2548  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm))
  2549  	tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64)
  2550  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
  2551  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2))
  2552  
  2553  	m.insert(m.allocateFcvtToSintSequence(ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat))
  2554  	m.copyTo(tmpGp, rd)
  2555  }
  2556  
  2557  func (m *machine) lowerFcvtToSintSequenceAfterRegalloc(i *instruction) {
  2558  	execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData()
  2559  	var cmpOp, truncOp sseOpcode
  2560  	if src64 {
  2561  		cmpOp, truncOp = sseOpcodeUcomisd, sseOpcodeCvttsd2si
  2562  	} else {
  2563  		cmpOp, truncOp = sseOpcodeUcomiss, sseOpcodeCvttss2si
  2564  	}
  2565  
  2566  	trunc := m.allocateInstr()
  2567  	trunc.asXmmToGpr(truncOp, src, tmpGp, dst64)
  2568  	m.insert(trunc)
  2569  
  2570  	// Check if the dst operand was INT_MIN, by checking it against 1.
  2571  	cmp1 := m.allocateInstr()
  2572  	cmp1.asCmpRmiR(true, newOperandImm32(1), tmpGp, dst64)
  2573  	m.insert(cmp1)
  2574  
  2575  	// If no overflow, then we are done.
  2576  	doneTarget, done := m.allocateBrTarget()
  2577  	ifNoOverflow := m.allocateInstr()
  2578  	ifNoOverflow.asJmpIf(condNO, newOperandLabel(done))
  2579  	m.insert(ifNoOverflow)
  2580  
  2581  	// Now, check for NaN.
  2582  	cmpNan := m.allocateInstr()
  2583  	cmpNan.asXmmCmpRmR(cmpOp, newOperandReg(src), src)
  2584  	m.insert(cmpNan)
  2585  
  2586  	// We allocate the "non-nan target" here, but we will insert it later.
  2587  	notNanTarget, notNaN := m.allocateBrTarget()
  2588  	ifNotNan := m.allocateInstr()
  2589  	ifNotNan.asJmpIf(condNP, newOperandLabel(notNaN))
  2590  	m.insert(ifNotNan)
  2591  
  2592  	if sat {
  2593  		// If NaN and saturating, return 0.
  2594  		zeroDst := m.allocateInstr().asZeros(tmpGp)
  2595  		m.insert(zeroDst)
  2596  
  2597  		jmpEnd := m.allocateInstr()
  2598  		jmpEnd.asJmp(newOperandLabel(done))
  2599  		m.insert(jmpEnd)
  2600  
  2601  		// Otherwise:
  2602  		m.insert(notNanTarget)
  2603  
  2604  		// Zero-out the tmp register.
  2605  		zero := m.allocateInstr().asZeros(tmpXmm)
  2606  		m.insert(zero)
  2607  
  2608  		cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src)
  2609  		m.insert(cmpXmm)
  2610  
  2611  		// if >= jump to end.
  2612  		jmpEnd2 := m.allocateInstr()
  2613  		jmpEnd2.asJmpIf(condB, newOperandLabel(done))
  2614  		m.insert(jmpEnd2)
  2615  
  2616  		// Otherwise, saturate to INT_MAX.
  2617  		if dst64 {
  2618  			m.lowerIconst(tmpGp, math.MaxInt64, dst64)
  2619  		} else {
  2620  			m.lowerIconst(tmpGp, math.MaxInt32, dst64)
  2621  		}
  2622  
  2623  	} else {
  2624  
  2625  		// If non-sat, NaN, trap.
  2626  		m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger)
  2627  
  2628  		// Otherwise, we will jump here.
  2629  		m.insert(notNanTarget)
  2630  
  2631  		// jump over trap if src larger than threshold
  2632  		condAboveThreshold := condNB
  2633  
  2634  		// The magic constants are various combination of minInt for int[32|64] represented as float[32|64].
  2635  		var minInt uint64
  2636  		switch {
  2637  		case src64 && dst64:
  2638  			minInt = 0xc3e0000000000000
  2639  		case src64 && !dst64:
  2640  			condAboveThreshold = condNBE
  2641  			minInt = 0xC1E0_0000_0020_0000
  2642  		case !src64 && dst64:
  2643  			minInt = 0xDF00_0000
  2644  		case !src64 && !dst64:
  2645  			minInt = 0xCF00_0000
  2646  		}
  2647  
  2648  		loadToGP := m.allocateInstr().asImm(tmpGp2, minInt, src64)
  2649  		m.insert(loadToGP)
  2650  
  2651  		movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp2), tmpXmm, src64)
  2652  		m.insert(movToXmm)
  2653  
  2654  		cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src)
  2655  		m.insert(cmpXmm)
  2656  
  2657  		jmpIfLarger := m.allocateInstr()
  2658  		checkPositiveTarget, checkPositive := m.allocateBrTarget()
  2659  		jmpIfLarger.asJmpIf(condAboveThreshold, newOperandLabel(checkPositive))
  2660  		m.insert(jmpIfLarger)
  2661  
  2662  		m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
  2663  
  2664  		// If positive, it was a real overflow.
  2665  		m.insert(checkPositiveTarget)
  2666  
  2667  		// Zero out the temp register.
  2668  		xorpd := m.allocateInstr()
  2669  		xorpd.asXmmRmR(sseOpcodeXorpd, newOperandReg(tmpXmm), tmpXmm)
  2670  		m.insert(xorpd)
  2671  
  2672  		pos := m.allocateInstr()
  2673  		pos.asXmmCmpRmR(cmpOp, newOperandReg(src), tmpXmm)
  2674  		m.insert(pos)
  2675  
  2676  		// If >= jump to end.
  2677  		jmp := m.allocateInstr().asJmpIf(condNB, newOperandLabel(done))
  2678  		m.insert(jmp)
  2679  		m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
  2680  	}
  2681  
  2682  	m.insert(doneTarget)
  2683  }
  2684  
  2685  func (m *machine) lowerFcvtToUint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) {
  2686  	tmpXmm, tmpXmm2 := m.c.AllocateVReg(ssa.TypeF64), m.c.AllocateVReg(ssa.TypeF64)
  2687  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm))
  2688  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm2))
  2689  	tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64)
  2690  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
  2691  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2))
  2692  
  2693  	m.insert(m.allocateFcvtToUintSequence(
  2694  		ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat,
  2695  	))
  2696  	m.copyTo(tmpGp, rd)
  2697  }
  2698  
  2699  func (m *machine) lowerFcvtToUintSequenceAfterRegalloc(i *instruction) {
  2700  	execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData()
  2701  
  2702  	var subOp, cmpOp, truncOp sseOpcode
  2703  	if src64 {
  2704  		subOp, cmpOp, truncOp = sseOpcodeSubsd, sseOpcodeUcomisd, sseOpcodeCvttsd2si
  2705  	} else {
  2706  		subOp, cmpOp, truncOp = sseOpcodeSubss, sseOpcodeUcomiss, sseOpcodeCvttss2si
  2707  	}
  2708  
  2709  	doneTarget, done := m.allocateBrTarget()
  2710  
  2711  	switch {
  2712  	case src64 && dst64:
  2713  		loadToGP := m.allocateInstr().asImm(tmpGp, 0x43e0000000000000, true)
  2714  		m.insert(loadToGP)
  2715  		movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true)
  2716  		m.insert(movToXmm)
  2717  	case src64 && !dst64:
  2718  		loadToGP := m.allocateInstr().asImm(tmpGp, 0x41e0000000000000, true)
  2719  		m.insert(loadToGP)
  2720  		movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true)
  2721  		m.insert(movToXmm)
  2722  	case !src64 && dst64:
  2723  		loadToGP := m.allocateInstr().asImm(tmpGp, 0x5f000000, false)
  2724  		m.insert(loadToGP)
  2725  		movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false)
  2726  		m.insert(movToXmm)
  2727  	case !src64 && !dst64:
  2728  		loadToGP := m.allocateInstr().asImm(tmpGp, 0x4f000000, false)
  2729  		m.insert(loadToGP)
  2730  		movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false)
  2731  		m.insert(movToXmm)
  2732  	}
  2733  
  2734  	cmp := m.allocateInstr()
  2735  	cmp.asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src)
  2736  	m.insert(cmp)
  2737  
  2738  	// If above `tmp` ("large threshold"), jump to `ifAboveThreshold`
  2739  	ifAboveThresholdTarget, ifAboveThreshold := m.allocateBrTarget()
  2740  	jmpIfAboveThreshold := m.allocateInstr()
  2741  	jmpIfAboveThreshold.asJmpIf(condNB, newOperandLabel(ifAboveThreshold))
  2742  	m.insert(jmpIfAboveThreshold)
  2743  
  2744  	ifNotNaNTarget, ifNotNaN := m.allocateBrTarget()
  2745  	jmpIfNotNaN := m.allocateInstr()
  2746  	jmpIfNotNaN.asJmpIf(condNP, newOperandLabel(ifNotNaN))
  2747  	m.insert(jmpIfNotNaN)
  2748  
  2749  	// If NaN, handle the error condition.
  2750  	if sat {
  2751  		// On NaN, saturating, we just return 0.
  2752  		zeros := m.allocateInstr().asZeros(tmpGp)
  2753  		m.insert(zeros)
  2754  
  2755  		jmpEnd := m.allocateInstr()
  2756  		jmpEnd.asJmp(newOperandLabel(done))
  2757  		m.insert(jmpEnd)
  2758  	} else {
  2759  		// On NaN, non-saturating, we trap.
  2760  		m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger)
  2761  	}
  2762  
  2763  	// If not NaN, land here.
  2764  	m.insert(ifNotNaNTarget)
  2765  
  2766  	// Truncation happens here.
  2767  
  2768  	trunc := m.allocateInstr()
  2769  	trunc.asXmmToGpr(truncOp, src, tmpGp, dst64)
  2770  	m.insert(trunc)
  2771  
  2772  	// Check if the result is negative.
  2773  	cmpNeg := m.allocateInstr()
  2774  	cmpNeg.asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64)
  2775  	m.insert(cmpNeg)
  2776  
  2777  	// If non-neg, jump to end.
  2778  	jmpIfNonNeg := m.allocateInstr()
  2779  	jmpIfNonNeg.asJmpIf(condNL, newOperandLabel(done))
  2780  	m.insert(jmpIfNonNeg)
  2781  
  2782  	if sat {
  2783  		// If the input was "small" (< 2**(width -1)), the only way to get an integer
  2784  		// overflow is because the input was too small: saturate to the min value, i.e. 0.
  2785  		zeros := m.allocateInstr().asZeros(tmpGp)
  2786  		m.insert(zeros)
  2787  
  2788  		jmpEnd := m.allocateInstr()
  2789  		jmpEnd.asJmp(newOperandLabel(done))
  2790  		m.insert(jmpEnd)
  2791  	} else {
  2792  		// If not saturating, trap.
  2793  		m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
  2794  	}
  2795  
  2796  	// If above the threshold, land here.
  2797  	m.insert(ifAboveThresholdTarget)
  2798  
  2799  	// tmpDiff := threshold - rn.
  2800  	copySrc := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), tmpXmm2)
  2801  	m.insert(copySrc)
  2802  
  2803  	sub := m.allocateInstr()
  2804  	sub.asXmmRmR(subOp, newOperandReg(tmpXmm), tmpXmm2) // must be -0x8000000000000000
  2805  	m.insert(sub)
  2806  
  2807  	trunc2 := m.allocateInstr()
  2808  	trunc2.asXmmToGpr(truncOp, tmpXmm2, tmpGp, dst64)
  2809  	m.insert(trunc2)
  2810  
  2811  	// Check if the result is negative.
  2812  	cmpNeg2 := m.allocateInstr().asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64)
  2813  	m.insert(cmpNeg2)
  2814  
  2815  	ifNextLargeTarget, ifNextLarge := m.allocateBrTarget()
  2816  	jmpIfNextLarge := m.allocateInstr()
  2817  	jmpIfNextLarge.asJmpIf(condNL, newOperandLabel(ifNextLarge))
  2818  	m.insert(jmpIfNextLarge)
  2819  
  2820  	if sat {
  2821  		// The input was "large" (>= maxInt), so the only way to get an integer
  2822  		// overflow is because the input was too large: saturate to the max value.
  2823  		var maxInt uint64
  2824  		if dst64 {
  2825  			maxInt = math.MaxUint64
  2826  		} else {
  2827  			maxInt = math.MaxUint32
  2828  		}
  2829  		m.lowerIconst(tmpGp, maxInt, dst64)
  2830  
  2831  		jmpToEnd := m.allocateInstr()
  2832  		jmpToEnd.asJmp(newOperandLabel(done))
  2833  		m.insert(jmpToEnd)
  2834  	} else {
  2835  		// If not saturating, trap.
  2836  		m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
  2837  	}
  2838  
  2839  	m.insert(ifNextLargeTarget)
  2840  
  2841  	var op operand
  2842  	if dst64 {
  2843  		m.lowerIconst(tmpGp2, 0x8000000000000000, true)
  2844  		op = newOperandReg(tmpGp2)
  2845  	} else {
  2846  		op = newOperandImm32(0x80000000)
  2847  	}
  2848  
  2849  	add := m.allocateInstr()
  2850  	add.asAluRmiR(aluRmiROpcodeAdd, op, tmpGp, dst64)
  2851  	m.insert(add)
  2852  
  2853  	m.insert(doneTarget)
  2854  }
  2855  
  2856  func (m *machine) lowerFcvtFromSint(rn, rd operand, src64, dst64 bool) {
  2857  	var op sseOpcode
  2858  	if dst64 {
  2859  		op = sseOpcodeCvtsi2sd
  2860  	} else {
  2861  		op = sseOpcodeCvtsi2ss
  2862  	}
  2863  
  2864  	trunc := m.allocateInstr()
  2865  	trunc.asGprToXmm(op, rn, rd.reg(), src64)
  2866  	m.insert(trunc)
  2867  }
  2868  
  2869  func (m *machine) lowerFcvtFromUint(rn, rd operand, src64, dst64 bool) {
  2870  	var op sseOpcode
  2871  	if dst64 {
  2872  		op = sseOpcodeCvtsi2sd
  2873  	} else {
  2874  		op = sseOpcodeCvtsi2ss
  2875  	}
  2876  
  2877  	// Src is 32 bit, then we just perform the conversion with 64 bit width.
  2878  	//
  2879  	// See the following link for why we use 64bit conversion for unsigned 32bit integer sources:
  2880  	// https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float.
  2881  	//
  2882  	// Here's the summary:
  2883  	// >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float,
  2884  	// >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide
  2885  	// >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values,
  2886  	// >> which allows CVTSI2SS to be used after all.
  2887  	//
  2888  	if !src64 {
  2889  		// Before we convert, we have to clear the higher 32-bits of the 64-bit register
  2890  		// to get the correct result.
  2891  		tmp := m.c.AllocateVReg(ssa.TypeI32)
  2892  		m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, tmp))
  2893  		m.insert(m.allocateInstr().asGprToXmm(op, newOperandReg(tmp), rd.reg(), true))
  2894  		return
  2895  	}
  2896  
  2897  	// If uint64, we have to do a bit more work.
  2898  	endTarget, end := m.allocateBrTarget()
  2899  
  2900  	var tmpXmm regalloc.VReg
  2901  	if dst64 {
  2902  		tmpXmm = m.c.AllocateVReg(ssa.TypeF64)
  2903  	} else {
  2904  		tmpXmm = m.c.AllocateVReg(ssa.TypeF32)
  2905  	}
  2906  
  2907  	// Check if the most significant bit (sign bit) is set.
  2908  	test := m.allocateInstr()
  2909  	test.asCmpRmiR(false, rn, rn.reg(), src64)
  2910  	m.insert(test)
  2911  
  2912  	// Jump if the sign bit is set.
  2913  	ifSignTarget, ifSign := m.allocateBrTarget()
  2914  	jmpIfNeg := m.allocateInstr()
  2915  	jmpIfNeg.asJmpIf(condS, newOperandLabel(ifSign))
  2916  	m.insert(jmpIfNeg)
  2917  
  2918  	// If the sign bit is not set, we could fit the unsigned int into float32/float64.
  2919  	// So, we convert it to float and emit jump instruction to exit from this branch.
  2920  	cvt := m.allocateInstr()
  2921  	cvt.asGprToXmm(op, rn, tmpXmm, src64)
  2922  	m.insert(cvt)
  2923  
  2924  	// We are done, jump to end.
  2925  	jmpEnd := m.allocateInstr()
  2926  	jmpEnd.asJmp(newOperandLabel(end))
  2927  	m.insert(jmpEnd)
  2928  
  2929  	// Now handling the case where sign-bit is set.
  2930  	// We emit the following sequences:
  2931  	// 	   mov      %rn, %tmp
  2932  	// 	   shr      1, %tmp
  2933  	// 	   mov      %rn, %tmp2
  2934  	// 	   and      1, %tmp2
  2935  	// 	   or       %tmp2, %tmp
  2936  	// 	   cvtsi2ss %tmp, %xmm0
  2937  	// 	   addsd    %xmm0, %xmm0
  2938  	m.insert(ifSignTarget)
  2939  
  2940  	tmp := m.copyToTmp(rn.reg())
  2941  	shr := m.allocateInstr()
  2942  	shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(1), tmp, src64)
  2943  	m.insert(shr)
  2944  
  2945  	tmp2 := m.copyToTmp(rn.reg())
  2946  	and := m.allocateInstr()
  2947  	and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, src64)
  2948  	m.insert(and)
  2949  
  2950  	or := m.allocateInstr()
  2951  	or.asAluRmiR(aluRmiROpcodeOr, newOperandReg(tmp2), tmp, src64)
  2952  	m.insert(or)
  2953  
  2954  	cvt2 := m.allocateInstr()
  2955  	cvt2.asGprToXmm(op, newOperandReg(tmp), tmpXmm, src64)
  2956  	m.insert(cvt2)
  2957  
  2958  	addsd := m.allocateInstr()
  2959  	if dst64 {
  2960  		addsd.asXmmRmR(sseOpcodeAddsd, newOperandReg(tmpXmm), tmpXmm)
  2961  	} else {
  2962  		addsd.asXmmRmR(sseOpcodeAddss, newOperandReg(tmpXmm), tmpXmm)
  2963  	}
  2964  	m.insert(addsd)
  2965  
  2966  	m.insert(endTarget)
  2967  	m.copyTo(tmpXmm, rd.reg())
  2968  }
  2969  
  2970  func (m *machine) lowerVanyTrue(instr *ssa.Instruction) {
  2971  	x := instr.Arg()
  2972  	rm := m.getOperand_Reg(m.c.ValueDefinition(x))
  2973  	rd := m.c.VRegOf(instr.Return())
  2974  
  2975  	tmp := m.c.AllocateVReg(ssa.TypeI32)
  2976  
  2977  	cmp := m.allocateInstr()
  2978  	cmp.asXmmCmpRmR(sseOpcodePtest, rm, rm.reg())
  2979  	m.insert(cmp)
  2980  
  2981  	setcc := m.allocateInstr()
  2982  	setcc.asSetcc(condNZ, tmp)
  2983  	m.insert(setcc)
  2984  
  2985  	// Clear the irrelevant bits.
  2986  	and := m.allocateInstr()
  2987  	and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp, false)
  2988  	m.insert(and)
  2989  
  2990  	m.copyTo(tmp, rd)
  2991  }
  2992  
  2993  func (m *machine) lowerVallTrue(instr *ssa.Instruction) {
  2994  	x, lane := instr.ArgWithLane()
  2995  	var op sseOpcode
  2996  	switch lane {
  2997  	case ssa.VecLaneI8x16:
  2998  		op = sseOpcodePcmpeqb
  2999  	case ssa.VecLaneI16x8:
  3000  		op = sseOpcodePcmpeqw
  3001  	case ssa.VecLaneI32x4:
  3002  		op = sseOpcodePcmpeqd
  3003  	case ssa.VecLaneI64x2:
  3004  		op = sseOpcodePcmpeqq
  3005  	}
  3006  	rm := m.getOperand_Reg(m.c.ValueDefinition(x))
  3007  	rd := m.c.VRegOf(instr.Return())
  3008  
  3009  	tmp := m.c.AllocateVReg(ssa.TypeV128)
  3010  
  3011  	zeros := m.allocateInstr()
  3012  	zeros.asZeros(tmp)
  3013  	m.insert(zeros)
  3014  
  3015  	pcmp := m.allocateInstr()
  3016  	pcmp.asXmmRmR(op, rm, tmp)
  3017  	m.insert(pcmp)
  3018  
  3019  	test := m.allocateInstr()
  3020  	test.asXmmCmpRmR(sseOpcodePtest, newOperandReg(tmp), tmp)
  3021  	m.insert(test)
  3022  
  3023  	tmp2 := m.c.AllocateVReg(ssa.TypeI32)
  3024  
  3025  	setcc := m.allocateInstr()
  3026  	setcc.asSetcc(condZ, tmp2)
  3027  	m.insert(setcc)
  3028  
  3029  	// Clear the irrelevant bits.
  3030  	and := m.allocateInstr()
  3031  	and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, false)
  3032  	m.insert(and)
  3033  
  3034  	m.copyTo(tmp2, rd)
  3035  }
  3036  
  3037  func (m *machine) lowerVhighBits(instr *ssa.Instruction) {
  3038  	x, lane := instr.ArgWithLane()
  3039  	rm := m.getOperand_Reg(m.c.ValueDefinition(x))
  3040  	rd := m.c.VRegOf(instr.Return())
  3041  	switch lane {
  3042  	case ssa.VecLaneI8x16:
  3043  		mov := m.allocateInstr()
  3044  		mov.asXmmToGpr(sseOpcodePmovmskb, rm.reg(), rd, false)
  3045  		m.insert(mov)
  3046  
  3047  	case ssa.VecLaneI16x8:
  3048  		// When we have:
  3049  		// 	R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(v8)]
  3050  		// 	R2 = [R2(w1), R2(w2), R2(w3), R2(v4), R2(w5), R2(w6), R2(w7), R2(v8)]
  3051  		//	where RX(wn) is n-th signed word (16-bit) of RX register,
  3052  		//
  3053  		// "PACKSSWB R1, R2" produces
  3054  		//  R1 = [
  3055  		// 		byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)),
  3056  		// 		byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)),
  3057  		// 		byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)),
  3058  		// 		byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)),
  3059  		//  ]
  3060  		//  where R1 is the destination register, and
  3061  		// 	byte_sat(w) = int8(w) if w fits as signed 8-bit,
  3062  		//                0x80 if w is less than 0x80
  3063  		//                0x7F if w is greater than 0x7f
  3064  		//
  3065  		// See https://www.felixcloutier.com/x86/packsswb:packssdw for detail.
  3066  		//
  3067  		// Therefore, v.register ends up having i-th and (i+8)-th bit set if i-th lane is negative (for i in 0..8).
  3068  		tmp := m.copyToTmp(rm.reg())
  3069  		res := m.c.AllocateVReg(ssa.TypeI32)
  3070  
  3071  		pak := m.allocateInstr()
  3072  		pak.asXmmRmR(sseOpcodePacksswb, rm, tmp)
  3073  		m.insert(pak)
  3074  
  3075  		mov := m.allocateInstr()
  3076  		mov.asXmmToGpr(sseOpcodePmovmskb, tmp, res, false)
  3077  		m.insert(mov)
  3078  
  3079  		// Clear the higher bits than 8.
  3080  		shr := m.allocateInstr()
  3081  		shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(8), res, false)
  3082  		m.insert(shr)
  3083  
  3084  		m.copyTo(res, rd)
  3085  
  3086  	case ssa.VecLaneI32x4:
  3087  		mov := m.allocateInstr()
  3088  		mov.asXmmToGpr(sseOpcodeMovmskps, rm.reg(), rd, true)
  3089  		m.insert(mov)
  3090  
  3091  	case ssa.VecLaneI64x2:
  3092  		mov := m.allocateInstr()
  3093  		mov.asXmmToGpr(sseOpcodeMovmskpd, rm.reg(), rd, true)
  3094  		m.insert(mov)
  3095  	}
  3096  }
  3097  
  3098  func (m *machine) lowerVbnot(instr *ssa.Instruction) {
  3099  	x := instr.Arg()
  3100  	xDef := m.c.ValueDefinition(x)
  3101  	rm := m.getOperand_Reg(xDef)
  3102  	rd := m.c.VRegOf(instr.Return())
  3103  
  3104  	tmp := m.copyToTmp(rm.reg())
  3105  	tmp2 := m.c.AllocateVReg(ssa.TypeV128)
  3106  
  3107  	// Ensure tmp2 is considered defined by regalloc.
  3108  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2))
  3109  
  3110  	// Set all bits on tmp register.
  3111  	pak := m.allocateInstr()
  3112  	pak.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp2), tmp2)
  3113  	m.insert(pak)
  3114  
  3115  	// Then XOR with tmp to reverse all bits on v.register.
  3116  	xor := m.allocateInstr()
  3117  	xor.asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)
  3118  	m.insert(xor)
  3119  
  3120  	m.copyTo(tmp, rd)
  3121  }
  3122  
  3123  func (m *machine) lowerSplat(x, ret ssa.Value, lane ssa.VecLane) {
  3124  	tmpDst := m.c.AllocateVReg(ssa.TypeV128)
  3125  	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
  3126  
  3127  	switch lane {
  3128  	case ssa.VecLaneI8x16:
  3129  		tmp := m.c.AllocateVReg(ssa.TypeV128)
  3130  		m.insert(m.allocateInstr().asDefineUninitializedReg(tmp))
  3131  		xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
  3132  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, xx, tmpDst))
  3133  		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp))
  3134  		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpDst))
  3135  	case ssa.VecLaneI16x8:
  3136  		xx := m.getOperand_Reg(m.c.ValueDefinition(x))
  3137  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, xx, tmpDst))
  3138  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, xx, tmpDst))
  3139  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
  3140  	case ssa.VecLaneI32x4:
  3141  		xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
  3142  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, xx, tmpDst))
  3143  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
  3144  	case ssa.VecLaneI64x2:
  3145  		xx := m.getOperand_Reg(m.c.ValueDefinition(x))
  3146  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, xx, tmpDst))
  3147  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, xx, tmpDst))
  3148  	case ssa.VecLaneF32x4:
  3149  		xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
  3150  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, 0, xx, tmpDst))
  3151  		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
  3152  	case ssa.VecLaneF64x2:
  3153  		xx := m.getOperand_Reg(m.c.ValueDefinition(x))
  3154  		m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst))
  3155  		m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, xx, tmpDst))
  3156  	default:
  3157  		panic(fmt.Sprintf("invalid lane type: %s", lane))
  3158  	}
  3159  
  3160  	m.copyTo(tmpDst, m.c.VRegOf(ret))
  3161  }
  3162  
  3163  func (m *machine) lowerShuffle(x, y ssa.Value, lo, hi uint64, ret ssa.Value) {
  3164  	var xMask, yMask [2]uint64
  3165  	for i := 0; i < 8; i++ {
  3166  		loLane := byte(lo >> (i * 8))
  3167  		if loLane < 16 {
  3168  			xMask[0] |= uint64(loLane) << (i * 8)
  3169  			yMask[0] |= uint64(0x80) << (i * 8)
  3170  		} else {
  3171  			xMask[0] |= uint64(0x80) << (i * 8)
  3172  			yMask[0] |= uint64(loLane-16) << (i * 8)
  3173  		}
  3174  		hiLane := byte(hi >> (i * 8))
  3175  		if hiLane < 16 {
  3176  			xMask[1] |= uint64(hiLane) << (i * 8)
  3177  			yMask[1] |= uint64(0x80) << (i * 8)
  3178  		} else {
  3179  			xMask[1] |= uint64(0x80) << (i * 8)
  3180  			yMask[1] |= uint64(hiLane-16) << (i * 8)
  3181  		}
  3182  	}
  3183  
  3184  	xmaskLabel := m.allocateLabel()
  3185  	m.consts = append(m.consts, _const{lo: xMask[0], hi: xMask[1], label: xmaskLabel})
  3186  	ymaskLabel := m.allocateLabel()
  3187  	m.consts = append(m.consts, _const{lo: yMask[0], hi: yMask[1], label: ymaskLabel})
  3188  
  3189  	xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y))
  3190  	tmpX, tmpY := m.copyToTmp(xx.reg()), m.copyToTmp(yy.reg())
  3191  
  3192  	// Apply mask to X.
  3193  	tmp := m.c.AllocateVReg(ssa.TypeV128)
  3194  	loadMaskLo := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(xmaskLabel.L)), tmp)
  3195  	m.insert(loadMaskLo)
  3196  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpX))
  3197  
  3198  	// Apply mask to Y.
  3199  	loadMaskHi := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(ymaskLabel.L)), tmp)
  3200  	m.insert(loadMaskHi)
  3201  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpY))
  3202  
  3203  	// Combine the results.
  3204  	m.insert(m.allocateInstr().asXmmRmR(sseOpcodeOrps, newOperandReg(tmpX), tmpY))
  3205  
  3206  	m.copyTo(tmpY, m.c.VRegOf(ret))
  3207  }
  3208  
  3209  func (m *machine) lowerVbBinOpUnaligned(op sseOpcode, x, y, ret ssa.Value) {
  3210  	rn := m.getOperand_Reg(m.c.ValueDefinition(x))
  3211  	rm := m.getOperand_Reg(m.c.ValueDefinition(y))
  3212  	rd := m.c.VRegOf(ret)
  3213  
  3214  	tmp := m.copyToTmp(rn.reg())
  3215  
  3216  	binOp := m.allocateInstr()
  3217  	binOp.asXmmRmR(op, rm, tmp)
  3218  	m.insert(binOp)
  3219  
  3220  	m.copyTo(tmp, rd)
  3221  }
  3222  
  3223  func (m *machine) lowerVbBinOp(op sseOpcode, x, y, ret ssa.Value) {
  3224  	rn := m.getOperand_Reg(m.c.ValueDefinition(x))
  3225  	rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
  3226  	rd := m.c.VRegOf(ret)
  3227  
  3228  	tmp := m.copyToTmp(rn.reg())
  3229  
  3230  	binOp := m.allocateInstr()
  3231  	binOp.asXmmRmR(op, rm, tmp)
  3232  	m.insert(binOp)
  3233  
  3234  	m.copyTo(tmp, rd)
  3235  }
  3236  
  3237  func (m *machine) lowerVFcmp(x, y ssa.Value, c ssa.FloatCmpCond, ret ssa.Value, lane ssa.VecLane) {
  3238  	var cmpOp sseOpcode
  3239  	switch lane {
  3240  	case ssa.VecLaneF32x4:
  3241  		cmpOp = sseOpcodeCmpps
  3242  	case ssa.VecLaneF64x2:
  3243  		cmpOp = sseOpcodeCmppd
  3244  	default:
  3245  		panic(fmt.Sprintf("invalid lane type: %s", lane))
  3246  	}
  3247  
  3248  	xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
  3249  	var cmpImm cmpPred
  3250  	switch c {
  3251  	case ssa.FloatCmpCondGreaterThan:
  3252  		yy, xx = xx, yy
  3253  		cmpImm = cmpPredLT_OS
  3254  	case ssa.FloatCmpCondGreaterThanOrEqual:
  3255  		yy, xx = xx, yy
  3256  		cmpImm = cmpPredLE_OS
  3257  	case ssa.FloatCmpCondEqual:
  3258  		cmpImm = cmpPredEQ_OQ
  3259  	case ssa.FloatCmpCondNotEqual:
  3260  		cmpImm = cmpPredNEQ_UQ
  3261  	case ssa.FloatCmpCondLessThan:
  3262  		cmpImm = cmpPredLT_OS
  3263  	case ssa.FloatCmpCondLessThanOrEqual:
  3264  		cmpImm = cmpPredLE_OS
  3265  	default:
  3266  		panic(fmt.Sprintf("invalid float comparison condition: %s", c))
  3267  	}
  3268  
  3269  	tmp := m.c.AllocateVReg(ssa.TypeV128)
  3270  	xxx := m.getOperand_Mem_Reg(xx)
  3271  	m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, xxx, tmp))
  3272  
  3273  	rm := m.getOperand_Mem_Reg(yy)
  3274  	m.insert(m.allocateInstr().asXmmRmRImm(cmpOp, byte(cmpImm), rm, tmp))
  3275  
  3276  	m.copyTo(tmp, m.c.VRegOf(ret))
  3277  }
  3278  
  3279  func (m *machine) lowerVIcmp(x, y ssa.Value, c ssa.IntegerCmpCond, ret ssa.Value, lane ssa.VecLane) {
  3280  	var eq, gt, maxu, minu, mins sseOpcode
  3281  	switch lane {
  3282  	case ssa.VecLaneI8x16:
  3283  		eq, gt, maxu, minu, mins = sseOpcodePcmpeqb, sseOpcodePcmpgtb, sseOpcodePmaxub, sseOpcodePminub, sseOpcodePminsb
  3284  	case ssa.VecLaneI16x8:
  3285  		eq, gt, maxu, minu, mins = sseOpcodePcmpeqw, sseOpcodePcmpgtw, sseOpcodePmaxuw, sseOpcodePminuw, sseOpcodePminsw
  3286  	case ssa.VecLaneI32x4:
  3287  		eq, gt, maxu, minu, mins = sseOpcodePcmpeqd, sseOpcodePcmpgtd, sseOpcodePmaxud, sseOpcodePminud, sseOpcodePminsd
  3288  	case ssa.VecLaneI64x2:
  3289  		eq, gt = sseOpcodePcmpeqq, sseOpcodePcmpgtq
  3290  	default:
  3291  		panic(fmt.Sprintf("invalid lane type: %s", lane))
  3292  	}
  3293  
  3294  	tmp := m.c.AllocateVReg(ssa.TypeV128)
  3295  	var op operand
  3296  	switch c {
  3297  	case ssa.IntegerCmpCondSignedLessThanOrEqual:
  3298  		if lane == ssa.VecLaneI64x2 {
  3299  			x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
  3300  			// Copy x to tmp.
  3301  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp))
  3302  			op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
  3303  		} else {
  3304  			y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
  3305  			// Copy y to tmp.
  3306  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp))
  3307  			op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
  3308  		}
  3309  	case ssa.IntegerCmpCondSignedGreaterThanOrEqual:
  3310  		if lane == ssa.VecLaneI64x2 {
  3311  			y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
  3312  			// Copy y to tmp.
  3313  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp))
  3314  			op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
  3315  		} else {
  3316  			x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
  3317  			// Copy x to tmp.
  3318  			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp))
  3319  			op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
  3320  		}
  3321  	case ssa.IntegerCmpCondSignedLessThan, ssa.IntegerCmpCondUnsignedLessThan, ssa.IntegerCmpCondUnsignedLessThanOrEqual:
  3322  		y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
  3323  		// Copy y to tmp.
  3324  		m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp))
  3325  		op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
  3326  	default:
  3327  		x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
  3328  		// Copy x to tmp.
  3329  		m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp))
  3330  		op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
  3331  	}
  3332  
  3333  	switch c {
  3334  	case ssa.IntegerCmpCondEqual:
  3335  		m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp))
  3336  	case ssa.IntegerCmpCondNotEqual:
  3337  		// First we compare for equality.
  3338  		m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp))
  3339  		// Then flip the bits. To do so, we set all bits on tmp2.
  3340  		tmp2 := m.c.AllocateVReg(ssa.TypeV128)
  3341  		m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2))
  3342  		m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2))
  3343  		// And then xor with tmp.
  3344  		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp))
  3345  	case ssa.IntegerCmpCondSignedGreaterThan, ssa.IntegerCmpCondSignedLessThan:
  3346  		m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp))
  3347  	case ssa.IntegerCmpCondSignedGreaterThanOrEqual, ssa.IntegerCmpCondSignedLessThanOrEqual:
  3348  		if lane == ssa.VecLaneI64x2 {
  3349  			m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp))
  3350  			// Then flip the bits. To do so, we set all bits on tmp2.
  3351  			tmp2 := m.c.AllocateVReg(ssa.TypeV128)
  3352  			m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2))
  3353  			m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2))
  3354  			// And then xor with tmp.
  3355  			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp))
  3356  		} else {
  3357  			// First take min of x and y.
  3358  			m.insert(m.allocateInstr().asXmmRmR(mins, op, tmp))
  3359  			// Then compare for equality.
  3360  			m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp))
  3361  		}
  3362  	case ssa.IntegerCmpCondUnsignedGreaterThan, ssa.IntegerCmpCondUnsignedLessThan:
  3363  		// First maxu of x and y.
  3364  		m.insert(m.allocateInstr().asXmmRmR(maxu, op, tmp))
  3365  		// Then compare for equality.
  3366  		m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp))
  3367  		// Then flip the bits. To do so, we set all bits on tmp2.
  3368  		tmp2 := m.c.AllocateVReg(ssa.TypeV128)
  3369  		m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2))
  3370  		m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2))
  3371  		// And then xor with tmp.
  3372  		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp))
  3373  	case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual, ssa.IntegerCmpCondUnsignedLessThanOrEqual:
  3374  		m.insert(m.allocateInstr().asXmmRmR(minu, op, tmp))
  3375  		m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp))
  3376  	default:
  3377  		panic("BUG")
  3378  	}
  3379  
  3380  	m.copyTo(tmp, m.c.VRegOf(ret))
  3381  }
  3382  
  3383  func (m *machine) lowerVbandnot(instr *ssa.Instruction, op sseOpcode) {
  3384  	x, y := instr.Arg2()
  3385  	xDef := m.c.ValueDefinition(x)
  3386  	yDef := m.c.ValueDefinition(y)
  3387  	rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef)
  3388  	rd := m.c.VRegOf(instr.Return())
  3389  
  3390  	tmp := m.copyToTmp(rn.reg())
  3391  
  3392  	// pandn between rn, rm.
  3393  	pand := m.allocateInstr()
  3394  	pand.asXmmRmR(sseOpcodePandn, rm, tmp)
  3395  	m.insert(pand)
  3396  
  3397  	m.copyTo(tmp, rd)
  3398  }
  3399  
  3400  func (m *machine) lowerVbitselect(instr *ssa.Instruction) {
  3401  	c, x, y := instr.SelectData()
  3402  	xDef := m.c.ValueDefinition(x)
  3403  	yDef := m.c.ValueDefinition(y)
  3404  	rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef)
  3405  	creg := m.getOperand_Reg(m.c.ValueDefinition(c))
  3406  	rd := m.c.VRegOf(instr.Return())
  3407  
  3408  	tmpC := m.copyToTmp(creg.reg())
  3409  	tmpX := m.copyToTmp(rm.reg())
  3410  
  3411  	// And between c, x (overwrites x).
  3412  	pand := m.allocateInstr()
  3413  	pand.asXmmRmR(sseOpcodePand, creg, tmpX)
  3414  	m.insert(pand)
  3415  
  3416  	// Andn between y, c (overwrites c).
  3417  	pandn := m.allocateInstr()
  3418  	pandn.asXmmRmR(sseOpcodePandn, rn, tmpC)
  3419  	m.insert(pandn)
  3420  
  3421  	por := m.allocateInstr()
  3422  	por.asXmmRmR(sseOpcodePor, newOperandReg(tmpC), tmpX)
  3423  	m.insert(por)
  3424  
  3425  	m.copyTo(tmpX, rd)
  3426  }
  3427  
  3428  func (m *machine) lowerVFmin(instr *ssa.Instruction) {
  3429  	x, y, lane := instr.Arg2WithLane()
  3430  	rn := m.getOperand_Reg(m.c.ValueDefinition(x))
  3431  	rm := m.getOperand_Reg(m.c.ValueDefinition(y))
  3432  	rd := m.c.VRegOf(instr.Return())
  3433  
  3434  	var min, cmp, andn, or, srl /* shift right logical */ sseOpcode
  3435  	var shiftNumToInverseNaN uint32
  3436  	if lane == ssa.VecLaneF32x4 {
  3437  		min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodePsrld, 0xa
  3438  	} else {
  3439  		min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodePsrlq, 0xd
  3440  	}
  3441  
  3442  	tmp1 := m.copyToTmp(rn.reg())
  3443  	tmp2 := m.copyToTmp(rm.reg())
  3444  
  3445  	// tmp1=min(rn, rm)
  3446  	minIns1 := m.allocateInstr()
  3447  	minIns1.asXmmRmR(min, rn, tmp2)
  3448  	m.insert(minIns1)
  3449  
  3450  	// tmp2=min(rm, rn)
  3451  	minIns2 := m.allocateInstr()
  3452  	minIns2.asXmmRmR(min, rm, tmp1)
  3453  	m.insert(minIns2)
  3454  
  3455  	// tmp3:=tmp1=min(rn, rm)
  3456  	tmp3 := m.copyToTmp(tmp1)
  3457  
  3458  	// tmp1 = -0         if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN
  3459  	//       NaN         if rn == NaN || rm == NaN
  3460  	//       min(rm, rm) otherwise
  3461  	orIns := m.allocateInstr()
  3462  	orIns.asXmmRmR(or, newOperandReg(tmp2), tmp1)
  3463  	m.insert(orIns)
  3464  
  3465  	// tmp3 is originally min(rn,rm).
  3466  	// tmp3 = 0^ (set all bits) if rn == NaN || rm == NaN
  3467  	//        0 otherwise
  3468  	cmpIns := m.allocateInstr()
  3469  	cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp2), tmp3)
  3470  	m.insert(cmpIns)
  3471  
  3472  	// tmp1 = -0          if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN
  3473  	//        ^0          if rn == NaN || rm == NaN
  3474  	//        min(v1, v2) otherwise
  3475  	orIns2 := m.allocateInstr()
  3476  	orIns2.asXmmRmR(or, newOperandReg(tmp3), tmp1)
  3477  	m.insert(orIns2)
  3478  
  3479  	// tmp3 = set all bits on the mantissa bits
  3480  	//        0 otherwise
  3481  	shift := m.allocateInstr()
  3482  	shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp3)
  3483  	m.insert(shift)
  3484  
  3485  	// tmp3 = tmp1 and !tmp3
  3486  	//     = -0                                                   if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN
  3487  	//       set all bits on exponential and sign bit (== NaN)    if rn == NaN || rm == NaN
  3488  	//       min(rn, rm)                                          otherwise
  3489  	andnIns := m.allocateInstr()
  3490  	andnIns.asXmmRmR(andn, newOperandReg(tmp1), tmp3)
  3491  	m.insert(andnIns)
  3492  
  3493  	m.copyTo(tmp3, rd)
  3494  }
  3495  
  3496  func (m *machine) lowerVFmax(instr *ssa.Instruction) {
  3497  	x, y, lane := instr.Arg2WithLane()
  3498  	rn := m.getOperand_Reg(m.c.ValueDefinition(x))
  3499  	rm := m.getOperand_Reg(m.c.ValueDefinition(y))
  3500  	rd := m.c.VRegOf(instr.Return())
  3501  
  3502  	var max, cmp, andn, or, xor, sub, srl /* shift right logical */ sseOpcode
  3503  	var shiftNumToInverseNaN uint32
  3504  	if lane == ssa.VecLaneF32x4 {
  3505  		max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodeXorps, sseOpcodeSubps, sseOpcodePsrld, 0xa
  3506  	} else {
  3507  		max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodeXorpd, sseOpcodeSubpd, sseOpcodePsrlq, 0xd
  3508  	}
  3509  
  3510  	tmp0 := m.copyToTmp(rm.reg())
  3511  	tmp1 := m.copyToTmp(rn.reg())
  3512  
  3513  	// tmp0=max(rn, rm)
  3514  	maxIns1 := m.allocateInstr()
  3515  	maxIns1.asXmmRmR(max, rn, tmp0)
  3516  	m.insert(maxIns1)
  3517  
  3518  	// tmp1=max(rm, rn)
  3519  	maxIns2 := m.allocateInstr()
  3520  	maxIns2.asXmmRmR(max, rm, tmp1)
  3521  	m.insert(maxIns2)
  3522  
  3523  	// tmp2=max(rm, rn)
  3524  	tmp2 := m.copyToTmp(tmp1)
  3525  
  3526  	// tmp2 = -0       if (rn == -0 && rm == 0) || (rn == 0 && rm == -0)
  3527  	//         0       if (rn == 0 && rm ==  0)
  3528  	//        -0       if (rn == -0 && rm == -0)
  3529  	//       v1^v2     if rn == NaN || rm == NaN
  3530  	//         0       otherwise
  3531  	xorInstr := m.allocateInstr()
  3532  	xorInstr.asXmmRmR(xor, newOperandReg(tmp0), tmp2)
  3533  	m.insert(xorInstr)
  3534  	// tmp1 = -0           if (rn == -0 && rm == 0) || (rn == 0 && rm == -0)
  3535  	//         0           if (rn == 0 && rm ==  0)
  3536  	//        -0           if (rn == -0 && rm == -0)
  3537  	//        NaN          if rn == NaN || rm == NaN
  3538  	//        max(v1, v2)  otherwise
  3539  	orInstr := m.allocateInstr()
  3540  	orInstr.asXmmRmR(or, newOperandReg(tmp2), tmp1)
  3541  	m.insert(orInstr)
  3542  
  3543  	tmp3 := m.copyToTmp(tmp1)
  3544  
  3545  	// tmp3 = 0           if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) || (rn == 0 && rm ==  0)
  3546  	//       -0           if (rn == -0 && rm == -0)
  3547  	//       NaN          if rn == NaN || rm == NaN
  3548  	//       max(v1, v2)  otherwise
  3549  	//
  3550  	// Note: -0 - (-0) = 0 (!= -0) in floating point operation.
  3551  	subIns := m.allocateInstr()
  3552  	subIns.asXmmRmR(sub, newOperandReg(tmp2), tmp3)
  3553  	m.insert(subIns)
  3554  
  3555  	// tmp1 = 0^ if rn == NaN || rm == NaN
  3556  	cmpIns := m.allocateInstr()
  3557  	cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp1), tmp1)
  3558  	m.insert(cmpIns)
  3559  
  3560  	// tmp1 = set all bits on the mantissa bits
  3561  	//        0 otherwise
  3562  	shift := m.allocateInstr()
  3563  	shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp1)
  3564  	m.insert(shift)
  3565  
  3566  	andnIns := m.allocateInstr()
  3567  	andnIns.asXmmRmR(andn, newOperandReg(tmp3), tmp1)
  3568  	m.insert(andnIns)
  3569  
  3570  	m.copyTo(tmp1, rd)
  3571  }
  3572  
  3573  func (m *machine) lowerVFabs(instr *ssa.Instruction) {
  3574  	x, lane := instr.ArgWithLane()
  3575  	rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
  3576  	rd := m.c.VRegOf(instr.Return())
  3577  
  3578  	tmp := m.c.AllocateVReg(ssa.TypeV128)
  3579  
  3580  	def := m.allocateInstr()
  3581  	def.asDefineUninitializedReg(tmp)
  3582  	m.insert(def)
  3583  
  3584  	// Set all bits on tmp.
  3585  	pcmp := m.allocateInstr()
  3586  	pcmp.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp)
  3587  	m.insert(pcmp)
  3588  
  3589  	switch lane {
  3590  	case ssa.VecLaneF32x4:
  3591  		// Shift right packed single floats by 1 to clear the sign bits.
  3592  		shift := m.allocateInstr()
  3593  		shift.asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp)
  3594  		m.insert(shift)
  3595  		// Clear the sign bit of rm.
  3596  		andp := m.allocateInstr()
  3597  		andp.asXmmRmR(sseOpcodeAndpd, rm, tmp)
  3598  		m.insert(andp)
  3599  	case ssa.VecLaneF64x2:
  3600  		// Shift right packed single floats by 1 to clear the sign bits.
  3601  		shift := m.allocateInstr()
  3602  		shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(1), tmp)
  3603  		m.insert(shift)
  3604  		// Clear the sign bit of rm.
  3605  		andp := m.allocateInstr()
  3606  		andp.asXmmRmR(sseOpcodeAndps, rm, tmp)
  3607  		m.insert(andp)
  3608  	}
  3609  
  3610  	m.copyTo(tmp, rd)
  3611  }