github.com/wasilibs/wazerox@v0.0.0-20240124024944-4923be63ab5f/internal/engine/wazevo/backend/isa/arm64/lower_instr.go

     1  package arm64
     2  
     3  // Files prefixed with lower_instr** perform instruction selection, i.e. lowering SSA-level instructions
     4  // into machine-specific instructions.
     5  //
     6  // Importantly, the lower** functions also perform tree-matching: they find patterns in the given
     7  // instruction tree and merge multiple instructions where possible. This can be considered "N:1" instruction selection.
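        //
        // For example, when an integer compare (Icmp) is used only as the condition of a conditional
        // branch (Brz/Brnz), LowerConditionalBranch below merges the comparison and the branch into a
        // flags-setting compare followed by a single conditional branch, instead of materializing a
        // boolean value.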
     8  
     9  import (
    10  	"fmt"
    11  	"math"
    12  
    13  	"github.com/wasilibs/wazerox/internal/engine/wazevo/backend/regalloc"
    14  	"github.com/wasilibs/wazerox/internal/engine/wazevo/ssa"
    15  	"github.com/wasilibs/wazerox/internal/engine/wazevo/wazevoapi"
    16  )
    17  
    18  // LowerSingleBranch implements backend.Machine.
    19  func (m *machine) LowerSingleBranch(br *ssa.Instruction) {
    20  	switch br.Opcode() {
    21  	case ssa.OpcodeJump:
    22  		_, _, targetBlk := br.BranchData()
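        		// A fallthrough jump targets the block laid out immediately after this one,
        		// so no branch instruction needs to be emitted.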
    23  		if br.IsFallthroughJump() {
    24  			return
    25  		}
    26  		b := m.allocateInstr()
    27  		target := m.getOrAllocateSSABlockLabel(targetBlk)
    28  		if target == returnLabel {
    29  			b.asRet(m.currentABI)
    30  		} else {
    31  			b.asBr(target)
    32  		}
    33  		m.insert(b)
    34  	case ssa.OpcodeBrTable:
    35  		m.lowerBrTable(br)
    36  	default:
    37  		panic("BUG: unexpected branch opcode: " + br.Opcode().String())
    38  	}
    39  }
    40  
    41  func (m *machine) lowerBrTable(i *ssa.Instruction) {
    42  	index, targets := i.BrTableData()
    43  	indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone)
    44  
    45  	// First, we bounds-check the index and, if it's out of bounds, clamp it so that it selects
    46  	// the default target (which sits at the end of the target list).
    47  
    48  	// mov  maxIndexReg #maximum_index
    49  	// subs wzr, index, maxIndexReg
    50  	// csel adjustedIndex, maxIndexReg, index, hs ;; select maxIndexReg if index is higher than or equal to maxIndexReg.
    51  	maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32)
    52  	m.lowerConstantI32(maxIndexReg, int32(len(targets)-1))
    53  	subs := m.allocateInstr()
    54  	subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false)
    55  	m.insert(subs)
    56  	csel := m.allocateInstr()
    57  	adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32)
    58  	csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false)
    59  	m.insert(csel)
    60  
    61  	brSequence := m.allocateInstr()
    62  
    63  	// TODO: reuse the slice!
    64  	labels := make([]uint32, len(targets))
    65  	for j, target := range targets {
    66  		labels[j] = uint32(m.getOrAllocateSSABlockLabel(target))
    67  	}
    68  
    69  	brSequence.asBrTableSequence(adjustedIndex, labels)
    70  	m.insert(brSequence)
    71  }
    72  
    73  // LowerConditionalBranch implements backend.Machine.
    74  func (m *machine) LowerConditionalBranch(b *ssa.Instruction) {
    75  	cval, args, targetBlk := b.BranchData()
    76  	if len(args) > 0 {
    77  		panic(fmt.Sprintf(
    78  			"conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s",
    79  			m.currentSSABlk,
    80  			targetBlk,
    81  		))
    82  	}
    83  
    84  	target := m.getOrAllocateSSABlockLabel(targetBlk)
    85  	cvalDef := m.compiler.ValueDefinition(cval)
    86  
    87  	switch {
    88  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // In this case, we can use the ALU flags set by the SUBS instruction.
    89  		cvalInstr := cvalDef.Instr
    90  		x, y, c := cvalInstr.IcmpData()
    91  		cc, signed := condFlagFromSSAIntegerCmpCond(c), c.Signed()
    92  		if b.Opcode() == ssa.OpcodeBrz {
    93  			cc = cc.invert()
    94  		}
    95  
    96  		m.lowerIcmpToFlag(x, y, signed)
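        		// lowerIcmpToFlag left the comparison result in the condition flags, so a single
        		// conditional branch on cc suffices; no boolean value is materialized for the icmp.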
    97  		cbr := m.allocateInstr()
    98  		cbr.asCondBr(cc.asCond(), target, false /* ignored */)
    99  		m.insert(cbr)
   100  		cvalDef.Instr.MarkLowered()
   101  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // In this case, we can use the FPU flags directly.
   102  		cvalInstr := cvalDef.Instr
   103  		x, y, c := cvalInstr.FcmpData()
   104  		cc := condFlagFromSSAFloatCmpCond(c)
   105  		if b.Opcode() == ssa.OpcodeBrz {
   106  			cc = cc.invert()
   107  		}
   108  		m.lowerFcmpToFlag(x, y)
   109  		cbr := m.allocateInstr()
   110  		cbr.asCondBr(cc.asCond(), target, false /* ignored */)
   111  		m.insert(cbr)
   112  		cvalDef.Instr.MarkLowered()
   113  	default:
   114  		rn := m.getOperand_NR(cvalDef, extModeNone)
   115  		var c cond
   116  		if b.Opcode() == ssa.OpcodeBrz {
   117  			c = registerAsRegZeroCond(rn.nr())
   118  		} else {
   119  			c = registerAsRegNotZeroCond(rn.nr())
   120  		}
   121  		cbr := m.allocateInstr()
   122  		cbr.asCondBr(c, target, false)
   123  		m.insert(cbr)
   124  	}
   125  }
   126  
   127  // LowerInstr implements backend.Machine.
   128  func (m *machine) LowerInstr(instr *ssa.Instruction) {
   129  	if l := instr.SourceOffset(); l.Valid() {
   130  		info := m.allocateInstr().asEmitSourceOffsetInfo(l)
   131  		m.insert(info)
   132  	}
   133  
   134  	switch op := instr.Opcode(); op {
   135  	case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable:
   136  		panic("BUG: branching instructions are handled by LowerBranches")
   137  	case ssa.OpcodeReturn:
   138  		panic("BUG: return must be handled by backend.Compiler")
   139  	case ssa.OpcodeIadd, ssa.OpcodeIsub:
   140  		m.lowerSubOrAdd(instr, op == ssa.OpcodeIadd)
   141  	case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv, ssa.OpcodeFmax, ssa.OpcodeFmin:
   142  		m.lowerFpuBinOp(instr)
   143  	case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined.
   144  	case ssa.OpcodeExitWithCode:
   145  		execCtx, code := instr.ExitWithCodeData()
   146  		m.lowerExitWithCode(m.compiler.VRegOf(execCtx), code)
   147  	case ssa.OpcodeExitIfTrueWithCode:
   148  		execCtx, c, code := instr.ExitIfTrueWithCodeData()
   149  		m.lowerExitIfTrueWithCode(m.compiler.VRegOf(execCtx), c, code)
   150  	case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32:
   151  		m.lowerStore(instr)
   152  	case ssa.OpcodeLoad:
   153  		dst := instr.Return()
   154  		ptr, offset, typ := instr.LoadData()
   155  		m.lowerLoad(ptr, offset, typ, dst)
   156  	case ssa.OpcodeVZeroExtLoad:
   157  		dst := instr.Return()
   158  		ptr, offset, typ := instr.VZeroExtLoadData()
   159  		m.lowerLoad(ptr, offset, typ, dst)
   160  	case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32:
   161  		ptr, offset, _ := instr.LoadData()
   162  		ret := m.compiler.VRegOf(instr.Return())
   163  		m.lowerExtLoad(op, ptr, offset, ret)
   164  	case ssa.OpcodeCall, ssa.OpcodeCallIndirect:
   165  		m.lowerCall(instr)
   166  	case ssa.OpcodeIcmp:
   167  		m.lowerIcmp(instr)
   168  	case ssa.OpcodeVIcmp:
   169  		m.lowerVIcmp(instr)
   170  	case ssa.OpcodeVFcmp:
   171  		m.lowerVFcmp(instr)
   172  	case ssa.OpcodeVCeil:
   173  		m.lowerVecMisc(vecOpFrintp, instr)
   174  	case ssa.OpcodeVFloor:
   175  		m.lowerVecMisc(vecOpFrintm, instr)
   176  	case ssa.OpcodeVTrunc:
   177  		m.lowerVecMisc(vecOpFrintz, instr)
   178  	case ssa.OpcodeVNearest:
   179  		m.lowerVecMisc(vecOpFrintn, instr)
   180  	case ssa.OpcodeVMaxPseudo:
   181  		m.lowerVMinMaxPseudo(instr, true)
   182  	case ssa.OpcodeVMinPseudo:
   183  		m.lowerVMinMaxPseudo(instr, false)
   184  	case ssa.OpcodeBand:
   185  		m.lowerBitwiseAluOp(instr, aluOpAnd)
   186  	case ssa.OpcodeBor:
   187  		m.lowerBitwiseAluOp(instr, aluOpOrr)
   188  	case ssa.OpcodeBxor:
   189  		m.lowerBitwiseAluOp(instr, aluOpEor)
   190  	case ssa.OpcodeIshl:
   191  		m.lowerShifts(instr, extModeNone, aluOpLsl)
   192  	case ssa.OpcodeSshr:
   193  		if instr.Return().Type().Bits() == 64 {
   194  			m.lowerShifts(instr, extModeSignExtend64, aluOpAsr)
   195  		} else {
   196  			m.lowerShifts(instr, extModeSignExtend32, aluOpAsr)
   197  		}
   198  	case ssa.OpcodeUshr:
   199  		if instr.Return().Type().Bits() == 64 {
   200  			m.lowerShifts(instr, extModeZeroExtend64, aluOpLsr)
   201  		} else {
   202  			m.lowerShifts(instr, extModeZeroExtend32, aluOpLsr)
   203  		}
   204  	case ssa.OpcodeRotl:
   205  		m.lowerRotl(instr)
   206  	case ssa.OpcodeRotr:
   207  		m.lowerRotr(instr)
   208  	case ssa.OpcodeSExtend, ssa.OpcodeUExtend:
   209  		from, to, signed := instr.ExtendData()
   210  		m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed)
   211  	case ssa.OpcodeFcmp:
   212  		x, y, c := instr.FcmpData()
   213  		m.lowerFcmp(x, y, instr.Return(), c)
   214  	case ssa.OpcodeImul:
   215  		x, y := instr.Arg2()
   216  		result := instr.Return()
   217  		m.lowerImul(x, y, result)
   218  	case ssa.OpcodeUndefined:
   219  		undef := m.allocateInstr()
   220  		undef.asUDF()
   221  		m.insert(undef)
   222  	case ssa.OpcodeSelect:
   223  		c, x, y := instr.SelectData()
   224  		if x.Type() == ssa.TypeV128 {
   225  			rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
   226  			rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   227  			rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   228  			rd := operandNR(m.compiler.VRegOf(instr.Return()))
   229  			m.lowerSelectVec(rc, rn, rm, rd)
   230  		} else {
   231  			m.lowerSelect(c, x, y, instr.Return())
   232  		}
   233  	case ssa.OpcodeClz:
   234  		x := instr.Arg()
   235  		result := instr.Return()
   236  		m.lowerClz(x, result)
   237  	case ssa.OpcodeCtz:
   238  		x := instr.Arg()
   239  		result := instr.Return()
   240  		m.lowerCtz(x, result)
   241  	case ssa.OpcodePopcnt:
   242  		x := instr.Arg()
   243  		result := instr.Return()
   244  		m.lowerPopcnt(x, result)
   245  	case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat:
   246  		x, ctx := instr.Arg2()
   247  		result := instr.Return()
   248  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   249  		rd := operandNR(m.compiler.VRegOf(result))
   250  		ctxVReg := m.compiler.VRegOf(ctx)
   251  		m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64,
   252  			result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat)
   253  	case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat:
   254  		x, ctx := instr.Arg2()
   255  		result := instr.Return()
   256  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   257  		rd := operandNR(m.compiler.VRegOf(result))
   258  		ctxVReg := m.compiler.VRegOf(ctx)
   259  		m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64,
   260  			result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat)
   261  	case ssa.OpcodeFcvtFromSint:
   262  		x := instr.Arg()
   263  		result := instr.Return()
   264  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   265  		rd := operandNR(m.compiler.VRegOf(result))
   266  		m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
   267  	case ssa.OpcodeFcvtFromUint:
   268  		x := instr.Arg()
   269  		result := instr.Return()
   270  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   271  		rd := operandNR(m.compiler.VRegOf(result))
   272  		m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
   273  	case ssa.OpcodeFdemote:
   274  		v := instr.Arg()
   275  		rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
   276  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   277  		cnt := m.allocateInstr()
   278  		cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false)
   279  		m.insert(cnt)
   280  	case ssa.OpcodeFpromote:
   281  		v := instr.Arg()
   282  		rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
   283  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   284  		cnt := m.allocateInstr()
   285  		cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true)
   286  		m.insert(cnt)
   287  	case ssa.OpcodeIreduce:
   288  		rn := m.getOperand_NR(m.compiler.ValueDefinition(instr.Arg()), extModeNone)
   289  		retVal := instr.Return()
   290  		rd := m.compiler.VRegOf(retVal)
   291  
   292  		if retVal.Type() != ssa.TypeI32 {
   293  			panic("TODO?: Ireduce to non-i32")
   294  		}
   295  		mov := m.allocateInstr()
   296  		mov.asMove32(rd, rn.reg())
   297  		m.insert(mov)
   298  	case ssa.OpcodeFneg:
   299  		m.lowerFpuUniOp(fpuUniOpNeg, instr.Arg(), instr.Return())
   300  	case ssa.OpcodeSqrt:
   301  		m.lowerFpuUniOp(fpuUniOpSqrt, instr.Arg(), instr.Return())
   302  	case ssa.OpcodeCeil:
   303  		m.lowerFpuUniOp(fpuUniOpRoundPlus, instr.Arg(), instr.Return())
   304  	case ssa.OpcodeFloor:
   305  		m.lowerFpuUniOp(fpuUniOpRoundMinus, instr.Arg(), instr.Return())
   306  	case ssa.OpcodeTrunc:
   307  		m.lowerFpuUniOp(fpuUniOpRoundZero, instr.Arg(), instr.Return())
   308  	case ssa.OpcodeNearest:
   309  		m.lowerFpuUniOp(fpuUniOpRoundNearest, instr.Arg(), instr.Return())
   310  	case ssa.OpcodeFabs:
   311  		m.lowerFpuUniOp(fpuUniOpAbs, instr.Arg(), instr.Return())
   312  	case ssa.OpcodeBitcast:
   313  		m.lowerBitcast(instr)
   314  	case ssa.OpcodeFcopysign:
   315  		x, y := instr.Arg2()
   316  		m.lowerFcopysign(x, y, instr.Return())
   317  	case ssa.OpcodeSdiv, ssa.OpcodeUdiv:
   318  		x, y, ctx := instr.Arg3()
   319  		ctxVReg := m.compiler.VRegOf(ctx)
   320  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   321  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   322  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   323  		m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv)
   324  	case ssa.OpcodeSrem, ssa.OpcodeUrem:
   325  		x, y, ctx := instr.Arg3()
   326  		ctxVReg := m.compiler.VRegOf(ctx)
   327  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   328  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   329  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   330  		m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem)
   331  	case ssa.OpcodeVconst:
   332  		result := m.compiler.VRegOf(instr.Return())
   333  		lo, hi := instr.VconstData()
   334  		v := m.allocateInstr()
   335  		v.asLoadFpuConst128(result, lo, hi)
   336  		m.insert(v)
   337  	case ssa.OpcodeVbnot:
   338  		x := instr.Arg()
   339  		ins := m.allocateInstr()
   340  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   341  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   342  		ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B)
   343  		m.insert(ins)
   344  	case ssa.OpcodeVbxor:
   345  		x, y := instr.Arg2()
   346  		m.lowerVecRRR(vecOpEOR, x, y, instr.Return(), vecArrangement16B)
   347  	case ssa.OpcodeVbor:
   348  		x, y := instr.Arg2()
   349  		m.lowerVecRRR(vecOpOrr, x, y, instr.Return(), vecArrangement16B)
   350  	case ssa.OpcodeVband:
   351  		x, y := instr.Arg2()
   352  		m.lowerVecRRR(vecOpAnd, x, y, instr.Return(), vecArrangement16B)
   353  	case ssa.OpcodeVbandnot:
   354  		x, y := instr.Arg2()
   355  		m.lowerVecRRR(vecOpBic, x, y, instr.Return(), vecArrangement16B)
   356  	case ssa.OpcodeVbitselect:
   357  		c, x, y := instr.SelectData()
   358  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   359  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   360  		creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
   361  		tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   362  
   363  		// creg is overwritten by BSL, so we first copy it into a temporary register
   364  		// in case it is used somewhere else.
   365  		mov := m.allocateInstr()
   366  		mov.asFpuMov128(tmp.nr(), creg.nr())
   367  		m.insert(mov)
   368  
   369  		ins := m.allocateInstr()
   370  		ins.asVecRRRRewrite(vecOpBsl, tmp, rn, rm, vecArrangement16B)
   371  		m.insert(ins)
   372  
   373  		mov2 := m.allocateInstr()
   374  		rd := m.compiler.VRegOf(instr.Return())
   375  		mov2.asFpuMov128(rd, tmp.nr())
   376  		m.insert(mov2)
   377  	case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue:
   378  		x, lane := instr.ArgWithLane()
   379  		var arr vecArrangement
   380  		if op == ssa.OpcodeVallTrue {
   381  			arr = ssaLaneToArrangement(lane)
   382  		}
   383  		rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   384  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   385  		m.lowerVcheckTrue(op, rm, rd, arr)
   386  	case ssa.OpcodeVhighBits:
   387  		x, lane := instr.ArgWithLane()
   388  		rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   389  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   390  		arr := ssaLaneToArrangement(lane)
   391  		m.lowerVhighBits(rm, rd, arr)
   392  	case ssa.OpcodeVIadd:
   393  		x, y, lane := instr.Arg2WithLane()
   394  		arr := ssaLaneToArrangement(lane)
   395  		m.lowerVecRRR(vecOpAdd, x, y, instr.Return(), arr)
   396  	case ssa.OpcodeIaddPairwise:
   397  		x, y, lane := instr.Arg2WithLane()
   398  		arr := ssaLaneToArrangement(lane)
   399  		m.lowerVecRRR(vecOpAddp, x, y, instr.Return(), arr)
   400  	case ssa.OpcodeVSaddSat:
   401  		x, y, lane := instr.Arg2WithLane()
   402  		arr := ssaLaneToArrangement(lane)
   403  		m.lowerVecRRR(vecOpSqadd, x, y, instr.Return(), arr)
   404  	case ssa.OpcodeVUaddSat:
   405  		x, y, lane := instr.Arg2WithLane()
   406  		arr := ssaLaneToArrangement(lane)
   407  		m.lowerVecRRR(vecOpUqadd, x, y, instr.Return(), arr)
   408  	case ssa.OpcodeVIsub:
   409  		x, y, lane := instr.Arg2WithLane()
   410  		arr := ssaLaneToArrangement(lane)
   411  		m.lowerVecRRR(vecOpSub, x, y, instr.Return(), arr)
   412  	case ssa.OpcodeVSsubSat:
   413  		x, y, lane := instr.Arg2WithLane()
   414  		arr := ssaLaneToArrangement(lane)
   415  		m.lowerVecRRR(vecOpSqsub, x, y, instr.Return(), arr)
   416  	case ssa.OpcodeVUsubSat:
   417  		x, y, lane := instr.Arg2WithLane()
   418  		arr := ssaLaneToArrangement(lane)
   419  		m.lowerVecRRR(vecOpUqsub, x, y, instr.Return(), arr)
   420  	case ssa.OpcodeVImin:
   421  		x, y, lane := instr.Arg2WithLane()
   422  		arr := ssaLaneToArrangement(lane)
   423  		m.lowerVecRRR(vecOpSmin, x, y, instr.Return(), arr)
   424  	case ssa.OpcodeVUmin:
   425  		x, y, lane := instr.Arg2WithLane()
   426  		arr := ssaLaneToArrangement(lane)
   427  		m.lowerVecRRR(vecOpUmin, x, y, instr.Return(), arr)
   428  	case ssa.OpcodeVImax:
   429  		x, y, lane := instr.Arg2WithLane()
   430  		arr := ssaLaneToArrangement(lane)
   431  		m.lowerVecRRR(vecOpSmax, x, y, instr.Return(), arr)
   432  	case ssa.OpcodeVUmax:
   433  		x, y, lane := instr.Arg2WithLane()
   434  		arr := ssaLaneToArrangement(lane)
   435  		m.lowerVecRRR(vecOpUmax, x, y, instr.Return(), arr)
   436  	case ssa.OpcodeVAvgRound:
   437  		x, y, lane := instr.Arg2WithLane()
   438  		arr := ssaLaneToArrangement(lane)
   439  		m.lowerVecRRR(vecOpUrhadd, x, y, instr.Return(), arr)
   440  	case ssa.OpcodeVImul:
   441  		x, y, lane := instr.Arg2WithLane()
   442  		arr := ssaLaneToArrangement(lane)
   443  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   444  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   445  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   446  		m.lowerVIMul(rd, rn, rm, arr)
   447  	case ssa.OpcodeVIabs:
   448  		m.lowerVecMisc(vecOpAbs, instr)
   449  	case ssa.OpcodeVIneg:
   450  		m.lowerVecMisc(vecOpNeg, instr)
   451  	case ssa.OpcodeVIpopcnt:
   452  		m.lowerVecMisc(vecOpCnt, instr)
   453  	case ssa.OpcodeVIshl,
   454  		ssa.OpcodeVSshr, ssa.OpcodeVUshr:
   455  		x, y, lane := instr.Arg2WithLane()
   456  		arr := ssaLaneToArrangement(lane)
   457  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   458  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   459  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   460  		m.lowerVShift(op, rd, rn, rm, arr)
   461  	case ssa.OpcodeVSqrt:
   462  		m.lowerVecMisc(vecOpFsqrt, instr)
   463  	case ssa.OpcodeVFabs:
   464  		m.lowerVecMisc(vecOpFabs, instr)
   465  	case ssa.OpcodeVFneg:
   466  		m.lowerVecMisc(vecOpFneg, instr)
   467  	case ssa.OpcodeVFmin:
   468  		x, y, lane := instr.Arg2WithLane()
   469  		arr := ssaLaneToArrangement(lane)
   470  		m.lowerVecRRR(vecOpFmin, x, y, instr.Return(), arr)
   471  	case ssa.OpcodeVFmax:
   472  		x, y, lane := instr.Arg2WithLane()
   473  		arr := ssaLaneToArrangement(lane)
   474  		m.lowerVecRRR(vecOpFmax, x, y, instr.Return(), arr)
   475  	case ssa.OpcodeVFadd:
   476  		x, y, lane := instr.Arg2WithLane()
   477  		arr := ssaLaneToArrangement(lane)
   478  		m.lowerVecRRR(vecOpFadd, x, y, instr.Return(), arr)
   479  	case ssa.OpcodeVFsub:
   480  		x, y, lane := instr.Arg2WithLane()
   481  		arr := ssaLaneToArrangement(lane)
   482  		m.lowerVecRRR(vecOpFsub, x, y, instr.Return(), arr)
   483  	case ssa.OpcodeVFmul:
   484  		x, y, lane := instr.Arg2WithLane()
   485  		arr := ssaLaneToArrangement(lane)
   486  		m.lowerVecRRR(vecOpFmul, x, y, instr.Return(), arr)
   487  	case ssa.OpcodeSqmulRoundSat:
   488  		x, y, lane := instr.Arg2WithLane()
   489  		arr := ssaLaneToArrangement(lane)
   490  		m.lowerVecRRR(vecOpSqrdmulh, x, y, instr.Return(), arr)
   491  	case ssa.OpcodeVFdiv:
   492  		x, y, lane := instr.Arg2WithLane()
   493  		arr := ssaLaneToArrangement(lane)
   494  		m.lowerVecRRR(vecOpFdiv, x, y, instr.Return(), arr)
   495  	case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat:
   496  		x, lane := instr.ArgWithLane()
   497  		arr := ssaLaneToArrangement(lane)
   498  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   499  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   500  		m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat)
   501  	case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint:
   502  		x, lane := instr.ArgWithLane()
   503  		arr := ssaLaneToArrangement(lane)
   504  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   505  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   506  		m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint)
   507  	case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow:
   508  		x, lane := instr.ArgWithLane()
   509  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   510  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   511  
   512  		var arr vecArrangement
   513  		switch lane {
   514  		case ssa.VecLaneI8x16:
   515  			arr = vecArrangement8B
   516  		case ssa.VecLaneI16x8:
   517  			arr = vecArrangement4H
   518  		case ssa.VecLaneI32x4:
   519  			arr = vecArrangement2S
   520  		}
   521  
   522  		shll := m.allocateInstr()
   523  		if signed := op == ssa.OpcodeSwidenLow; signed {
   524  			shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr)
   525  		} else {
   526  			shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr)
   527  		}
   528  		m.insert(shll)
   529  	case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh:
   530  		x, lane := instr.ArgWithLane()
   531  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   532  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   533  
   534  		arr := ssaLaneToArrangement(lane)
   535  
   536  		shll := m.allocateInstr()
   537  		if signed := op == ssa.OpcodeSwidenHigh; signed {
   538  			shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr)
   539  		} else {
   540  			shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr)
   541  		}
   542  		m.insert(shll)
   543  
   544  	case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow:
   545  		x, y, lane := instr.Arg2WithLane()
   546  		var arr, arr2 vecArrangement
   547  		switch lane {
   548  		case ssa.VecLaneI16x8: // I16x8
   549  			arr = vecArrangement8B
   550  			arr2 = vecArrangement16B // Implies sqxtn2.
   551  		case ssa.VecLaneI32x4:
   552  			arr = vecArrangement4H
   553  			arr2 = vecArrangement8H // Implies sqxtn2.
   554  		default:
   555  			panic("unsupported lane " + lane.String())
   556  		}
   557  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   558  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   559  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   560  
   561  		tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   562  
   563  		loQxtn := m.allocateInstr()
   564  		hiQxtn := m.allocateInstr()
   565  		if signed := op == ssa.OpcodeSnarrow; signed {
   566  			// Narrow the lanes of rn and write them into the lower half of tmp.
   567  			loQxtn.asVecMisc(vecOpSqxtn, tmp, rn, arr) // low (sqxtn)
   568  			// Narrow the lanes of rm and write them into the upper half of tmp.
   569  			hiQxtn.asVecMisc(vecOpSqxtn, tmp, rm, arr2) // high (sqxtn2)
   570  		} else {
   571  			// Narrow the lanes of rn and write them into the lower half of tmp.
   572  			loQxtn.asVecMisc(vecOpSqxtun, tmp, rn, arr) // low (sqxtun)
   573  			// Narrow the lanes of rm and write them into the upper half of tmp.
   574  			hiQxtn.asVecMisc(vecOpSqxtun, tmp, rm, arr2) // high (sqxtun2)
   575  		}
   576  		m.insert(loQxtn)
   577  		m.insert(hiQxtn)
   578  
   579  		mov := m.allocateInstr()
   580  		mov.asFpuMov128(rd.nr(), tmp.nr())
   581  		m.insert(mov)
   582  	case ssa.OpcodeFvpromoteLow:
   583  		x, lane := instr.ArgWithLane()
   584  		if lane != ssa.VecLaneF32x4 {
   585  			panic("unsupported lane type " + lane.String())
   586  		}
   587  		ins := m.allocateInstr()
   588  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   589  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   590  		ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S)
   591  		m.insert(ins)
   592  	case ssa.OpcodeFvdemote:
   593  		x, lane := instr.ArgWithLane()
   594  		if lane != ssa.VecLaneF64x2 {
   595  			panic("unsupported lane type " + lane.String())
   596  		}
   597  		ins := m.allocateInstr()
   598  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   599  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   600  		ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S)
   601  		m.insert(ins)
   602  	case ssa.OpcodeExtractlane:
   603  		x, index, signed, lane := instr.ExtractlaneData()
   604  
   605  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   606  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   607  
   608  		mov := m.allocateInstr()
   609  		switch lane {
   610  		case ssa.VecLaneI8x16:
   611  			mov.asMovFromVec(rd, rn, vecArrangementB, vecIndex(index), signed)
   612  		case ssa.VecLaneI16x8:
   613  			mov.asMovFromVec(rd, rn, vecArrangementH, vecIndex(index), signed)
   614  		case ssa.VecLaneI32x4:
   615  			mov.asMovFromVec(rd, rn, vecArrangementS, vecIndex(index), signed)
   616  		case ssa.VecLaneI64x2:
   617  			mov.asMovFromVec(rd, rn, vecArrangementD, vecIndex(index), signed)
   618  		case ssa.VecLaneF32x4:
   619  			mov.asVecMovElement(rd, rn, vecArrangementS, vecIndex(0), vecIndex(index))
   620  		case ssa.VecLaneF64x2:
   621  			mov.asVecMovElement(rd, rn, vecArrangementD, vecIndex(0), vecIndex(index))
   622  		default:
   623  			panic("unsupported lane: " + lane.String())
   624  		}
   625  
   626  		m.insert(mov)
   627  
   628  	case ssa.OpcodeInsertlane:
   629  		x, y, index, lane := instr.InsertlaneData()
   630  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   631  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   632  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   633  		tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   634  
   635  		// Initially mov rn to tmp.
   636  		mov1 := m.allocateInstr()
   637  		mov1.asFpuMov128(tmpReg.nr(), rn.nr())
   638  		m.insert(mov1)
   639  
   640  		// movToVec and vecMovElement do not clear the remaining bits to zero;
   641  		// thus, we can mov rm in-place into tmp.
   642  		mov2 := m.allocateInstr()
   643  		switch lane {
   644  		case ssa.VecLaneI8x16:
   645  			mov2.asMovToVec(tmpReg, rm, vecArrangementB, vecIndex(index))
   646  		case ssa.VecLaneI16x8:
   647  			mov2.asMovToVec(tmpReg, rm, vecArrangementH, vecIndex(index))
   648  		case ssa.VecLaneI32x4:
   649  			mov2.asMovToVec(tmpReg, rm, vecArrangementS, vecIndex(index))
   650  		case ssa.VecLaneI64x2:
   651  			mov2.asMovToVec(tmpReg, rm, vecArrangementD, vecIndex(index))
   652  		case ssa.VecLaneF32x4:
   653  			mov2.asVecMovElement(tmpReg, rm, vecArrangementS, vecIndex(index), vecIndex(0))
   654  		case ssa.VecLaneF64x2:
   655  			mov2.asVecMovElement(tmpReg, rm, vecArrangementD, vecIndex(index), vecIndex(0))
   656  		}
   657  		m.insert(mov2)
   658  
   659  		// Finally mov tmp to rd.
   660  		mov3 := m.allocateInstr()
   661  		mov3.asFpuMov128(rd.nr(), tmpReg.nr())
   662  		m.insert(mov3)
   663  
   664  	case ssa.OpcodeSwizzle:
   665  		x, y, lane := instr.Arg2WithLane()
   666  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   667  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   668  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   669  
   670  		arr := ssaLaneToArrangement(lane)
   671  
   672  		// tbl <rd>.<arr>, { <rn>.<arr> }, <rm>.<arr>
   673  		tbl1 := m.allocateInstr()
   674  		tbl1.asVecTbl(1, rd, rn, rm, arr)
   675  		m.insert(tbl1)
   676  
   677  	case ssa.OpcodeShuffle:
   678  		x, y, lane1, lane2 := instr.ShuffleData()
   679  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   680  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   681  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   682  
   683  		m.lowerShuffle(rd, rn, rm, lane1, lane2)
   684  
   685  	case ssa.OpcodeSplat:
   686  		x, lane := instr.ArgWithLane()
   687  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   688  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   689  
   690  		dup := m.allocateInstr()
   691  		switch lane {
   692  		case ssa.VecLaneI8x16:
   693  			dup.asVecDup(rd, rn, vecArrangement16B)
   694  		case ssa.VecLaneI16x8:
   695  			dup.asVecDup(rd, rn, vecArrangement8H)
   696  		case ssa.VecLaneI32x4:
   697  			dup.asVecDup(rd, rn, vecArrangement4S)
   698  		case ssa.VecLaneI64x2:
   699  			dup.asVecDup(rd, rn, vecArrangement2D)
   700  		case ssa.VecLaneF32x4:
   701  			dup.asVecDupElement(rd, rn, vecArrangementS, vecIndex(0))
   702  		case ssa.VecLaneF64x2:
   703  			dup.asVecDupElement(rd, rn, vecArrangementD, vecIndex(0))
   704  		}
   705  		m.insert(dup)
   706  
   707  	case ssa.OpcodeLoadSplat:
   708  		ptr, offset, lane := instr.LoadSplatData()
   709  		m.lowerLoadSplat(ptr, offset, lane, instr.Return())
   710  	default:
   711  		panic("TODO: lowering " + op.String())
   712  	}
   713  	m.FlushPendingInstructions()
   714  }
   715  
   716  func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) {
   717  	// `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30.
   718  	vReg, wReg := v29VReg, v30VReg
   719  
   720  	// Initialize v29, v30 to rn, rm.
   721  	movv := m.allocateInstr()
   722  	movv.asFpuMov128(vReg, rn.nr())
   723  	m.insert(movv)
   724  
   725  	movw := m.allocateInstr()
   726  	movw.asFpuMov128(wReg, rm.nr())
   727  	m.insert(movw)
   728  
   729  	// `lane1`, `lane2` are already encoded as two u64s with the right layout:
   730  	//     lane1 := lane[7]<<56 | ... | lane[1]<<8 | lane[0]
   731  	//     lane2 := lane[15]<<56 | ... | lane[9]<<8 | lane[8]
   732  	// Thus, we can use loadFpuConst128.
   733  	tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   734  	lfc := m.allocateInstr()
   735  	lfc.asLoadFpuConst128(tmp.nr(), lane1, lane2)
   736  	m.insert(lfc)
   737  
   738  	// tbl <rd>.16b, { <vReg>.16b, <wReg>.16b }, <tmp>.16b
   739  	tbl2 := m.allocateInstr()
   740  	tbl2.asVecTbl(2, rd, operandNR(vReg), tmp, vecArrangement16B)
   741  	m.insert(tbl2)
   742  }
   743  
   744  func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) {
   745  	var modulo byte
   746  	switch arr {
   747  	case vecArrangement16B:
   748  		modulo = 0x7 // Modulo 8.
   749  	case vecArrangement8H:
   750  		modulo = 0xf // Modulo 16.
   751  	case vecArrangement4S:
   752  		modulo = 0x1f // Modulo 32.
   753  	case vecArrangement2D:
   754  		modulo = 0x3f // Modulo 64.
   755  	default:
   756  		panic("unsupported arrangement " + arr.String())
   757  	}
   758  
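        	// Mask the shift amount first: Wasm defines vector shift amounts modulo the lane width,
        	// which the AND with (lane bits - 1) below implements.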
   759  	rtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
   760  	vtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   761  
   762  	and := m.allocateInstr()
   763  	and.asALUBitmaskImm(aluOpAnd, rtmp.nr(), rm.nr(), uint64(modulo), true)
   764  	m.insert(and)
   765  
   766  	if op != ssa.OpcodeVIshl {
   767  		// Negate the amount to turn this into a right shift.
   768  		neg := m.allocateInstr()
   769  		neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true)
   770  		m.insert(neg)
   771  	}
   772  
   773  	// Copy the shift amount into a vector register, as sshl/ushl require it to be there.
   774  	dup := m.allocateInstr()
   775  	dup.asVecDup(vtmp, rtmp, arr)
   776  	m.insert(dup)
   777  
   778  	if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr {
   779  		sshl := m.allocateInstr()
   780  		sshl.asVecRRR(vecOpSshl, rd, rn, vtmp, arr)
   781  		m.insert(sshl)
   782  	} else {
   783  		ushl := m.allocateInstr()
   784  		ushl.asVecRRR(vecOpUshl, rd, rn, vtmp, arr)
   785  		m.insert(ushl)
   786  	}
   787  }
   788  
   789  func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) {
   790  	tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   791  
   792  	// Special case VallTrue for i64x2.
   793  	if op == ssa.OpcodeVallTrue && arr == vecArrangement2D {
   794  		// 	cmeq v3?.2d, v2?.2d, #0
   795  		//	addp v3?.2d, v3?.2d, v3?.2d
   796  		//	fcmp v3?, v3?
   797  		//	cset dst, eq
   798  
   799  		ins := m.allocateInstr()
   800  		ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D)
   801  		m.insert(ins)
   802  
   803  		addp := m.allocateInstr()
   804  		addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D)
   805  		m.insert(addp)
   806  
   807  		fcmp := m.allocateInstr()
   808  		fcmp.asFpuCmp(tmp, tmp, true)
   809  		m.insert(fcmp)
   810  
   811  		cset := m.allocateInstr()
   812  		cset.asCSet(rd.nr(), false, eq)
   813  		m.insert(cset)
   814  
   815  		return
   816  	}
   817  
   818  	// Create a scalar value with umaxp or uminv, then compare it against zero.
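        	//   - VanyTrue: umaxp pairwise-maxes the lanes, so the resulting low 64 bits are non-zero
        	//     iff at least one byte of the input is non-zero.
        	//   - VallTrue: uminv takes the unsigned minimum across all lanes, which is non-zero iff
        	//     every lane is non-zero.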
   819  	ins := m.allocateInstr()
   820  	if op == ssa.OpcodeVanyTrue {
   821  		// 	umaxp v4?.16b, v2?.16b, v2?.16b
   822  		ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B)
   823  	} else {
   824  		// 	uminv d4?, v2?.4s
   825  		ins.asVecLanes(vecOpUminv, tmp, rm, arr)
   826  	}
   827  	m.insert(ins)
   828  
   829  	//	mov x3?, v4?.d[0]
   830  	//	ccmp x3?, #0x0, #0x0, al
   831  	//	cset x3?, ne
   832  	//	mov x0, x3?
   833  
   834  	movv := m.allocateInstr()
   835  	movv.asMovFromVec(rd, tmp, vecArrangementD, vecIndex(0), false)
   836  	m.insert(movv)
   837  
   838  	fc := m.allocateInstr()
   839  	fc.asCCmpImm(rd, uint64(0), al, 0, true)
   840  	m.insert(fc)
   841  
   842  	cset := m.allocateInstr()
   843  	cset.asCSet(rd.nr(), false, ne)
   844  	m.insert(cset)
   845  }
   846  
   847  func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
   848  	r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
   849  	v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   850  	v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   851  
   852  	switch arr {
   853  	case vecArrangement16B:
   854  		//	sshr v6?.16b, v2?.16b, #7
   855  		//	movz x4?, #0x201, lsl 0
   856  		//	movk x4?, #0x804, lsl 16
   857  		//	movk x4?, #0x2010, lsl 32
   858  		//	movk x4?, #0x8040, lsl 48
   859  		//	dup v5?.2d, x4?
   860  		//	and v6?.16b, v6?.16b, v5?.16b
   861  		//	ext v5?.16b, v6?.16b, v6?.16b, #8
   862  		//	zip1 v5?.16b, v6?.16b, v5?.16b
   863  		//	addv s5?, v5?.8h
   864  		//	umov s3?, v5?.h[0]
   865  
   866  		// Right arithmetic shift on the original vector and store the result into v1. So we have:
   867  		// v1[i] = 0xff if vi<0, 0 otherwise.
   868  		sshr := m.allocateInstr()
   869  		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B)
   870  		m.insert(sshr)
   871  
   872  		// Load the bit mask into r0.
   873  		m.insertMOVZ(r0.nr(), 0x0201, 0, true)
   874  		m.insertMOVK(r0.nr(), 0x0804, 1, true)
   875  		m.insertMOVK(r0.nr(), 0x2010, 2, true)
   876  		m.insertMOVK(r0.nr(), 0x8040, 3, true)
   877  
   878  		// dup r0 to v0.
   879  		dup := m.allocateInstr()
   880  		dup.asVecDup(v0, r0, vecArrangement2D)
   881  		m.insert(dup)
   882  
   883  		// Lane-wise logical AND with the bit mask, meaning that we have
   884  		// v[i] = (1 << i) if vi<0, 0 otherwise.
   885  		//
   886  		// Below, we use the following notation:
   887  		// wi := (1 << i) if vi<0, 0 otherwise.
   888  		and := m.allocateInstr()
   889  		and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B)
   890  		m.insert(and)
   891  
   892  		// Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have
   893  		// v0[i] = w(i+8) if i < 8, w(i-8) otherwise.
   894  		ext := m.allocateInstr()
   895  		ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8))
   896  		m.insert(ext)
   897  
   898  		// v = [w0, w8, ..., w7, w15]
   899  		zip1 := m.allocateInstr()
   900  		zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B)
   901  		m.insert(zip1)
   902  
   903  		// v.h[0] = w0 + ... + w15
   904  		addv := m.allocateInstr()
   905  		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
   906  		m.insert(addv)
   907  
   908  		// Extract the v.h[0] as the result.
   909  		movfv := m.allocateInstr()
   910  		movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false)
   911  		m.insert(movfv)
   912  	case vecArrangement8H:
   913  		//	sshr v6?.8h, v2?.8h, #15
   914  		//	movz x4?, #0x1, lsl 0
   915  		//	movk x4?, #0x2, lsl 16
   916  		//	movk x4?, #0x4, lsl 32
   917  		//	movk x4?, #0x8, lsl 48
   918  		//	dup v5?.2d, x4?
   919  		//	lsl x4?, x4?, 0x4
   920  		//	ins v5?.d[1], x4?
   921  		//	and v5?.16b, v6?.16b, v5?.16b
   922  		//	addv s5?, v5?.8h
   923  		//	umov s3?, v5?.h[0]
   924  
   925  		// Right arithmetic shift on the original vector and store the result into v1. So we have:
   926  		// v[i] = 0xffff if vi<0, 0 otherwise.
   927  		sshr := m.allocateInstr()
   928  		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H)
   929  		m.insert(sshr)
   930  
   931  		// Load the bit mask into r0.
   932  		m.lowerConstantI64(r0.nr(), 0x0008000400020001)
   933  
   934  		// dup r0 to vector v0.
   935  		dup := m.allocateInstr()
   936  		dup.asVecDup(v0, r0, vecArrangement2D)
   937  		m.insert(dup)
   938  
   939  		lsl := m.allocateInstr()
   940  		lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true)
   941  		m.insert(lsl)
   942  
   943  		movv := m.allocateInstr()
   944  		movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
   945  		m.insert(movv)
   946  
   947  		// Lane-wise logical AND with the bitmask, meaning that we have
   948  		// v[i] = (1 << i) if vi<0, 0 otherwise, for i = 0..7
   949  		// (the mask for the upper four lanes comes from the copy shifted left by 4).
   950  		and := m.allocateInstr()
   951  		and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
   952  		m.insert(and)
   953  
   954  		addv := m.allocateInstr()
   955  		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
   956  		m.insert(addv)
   957  
   958  		movfv := m.allocateInstr()
   959  		movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false)
   960  		m.insert(movfv)
   961  	case vecArrangement4S:
   962  		// 	sshr v6?.4s, v2?.4s, #31
   963  		//	movz x4?, #0x1, lsl 0
   964  		//	movk x4?, #0x2, lsl 32
   965  		//	dup v5?.2d, x4?
   966  		//	lsl x4?, x4?, 0x2
   967  		//	ins v5?.d[1], x4?
   968  		//	and v5?.16b, v6?.16b, v5?.16b
   969  		//	addv s5?, v5?.4s
   970  		//	umov s3?, v5?.s[0]
   973  
   974  		// Right arithmetic shift on the original vector and store the result into v1. So we have:
   975  		// v[i] = 0xffffffff if vi<0, 0 otherwise.
   976  		sshr := m.allocateInstr()
   977  		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S)
   978  		m.insert(sshr)
   979  
   980  		// Load the bit mask into r0.
   981  		m.lowerConstantI64(r0.nr(), 0x0000000200000001)
   982  
   983  		// dup r0 to vector v0.
   984  		dup := m.allocateInstr()
   985  		dup.asVecDup(v0, r0, vecArrangement2D)
   986  		m.insert(dup)
   987  
   988  		lsl := m.allocateInstr()
   989  		lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true)
   990  		m.insert(lsl)
   991  
   992  		movv := m.allocateInstr()
   993  		movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
   994  		m.insert(movv)
   995  
   996  		// Lane-wise logical AND with the bitmask, meaning that we have
   997  		// v[i] = (1 << i) if vi<0, 0 otherwise, for i = 0..3
   998  		// (the mask for the upper two lanes comes from the copy shifted left by 2).
   999  		and := m.allocateInstr()
  1000  		and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
  1001  		m.insert(and)
  1002  
  1003  		addv := m.allocateInstr()
  1004  		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S)
  1005  		m.insert(addv)
  1006  
  1007  		movfv := m.allocateInstr()
  1008  		movfv.asMovFromVec(rd, v0, vecArrangementS, vecIndex(0), false)
  1009  		m.insert(movfv)
  1010  	case vecArrangement2D:
  1011  		// 	mov d3?, v2?.d[0]
  1012  		//	mov x4?, v2?.d[1]
  1013  		//	lsr x4?, x4?, 0x3f
  1014  		//	lsr d3?, d3?, 0x3f
  1015  		//	add s3?, s3?, w4?, lsl #1
  1016  
  1017  		// Move the lower 64-bit int into result.
  1018  		movv0 := m.allocateInstr()
  1019  		movv0.asMovFromVec(rd, rm, vecArrangementD, vecIndex(0), false)
  1020  		m.insert(movv0)
  1021  
  1022  		// Move the higher 64-bit int into r0.
  1023  		movv1 := m.allocateInstr()
  1024  		movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false)
  1025  		m.insert(movv1)
  1026  
  1027  		// Move the sign bit into the least significant bit.
  1028  		lsr1 := m.allocateInstr()
  1029  		lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true)
  1030  		m.insert(lsr1)
  1031  
  1032  		lsr2 := m.allocateInstr()
  1033  		lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true)
  1034  		m.insert(lsr2)
  1035  
  1036  		// rd = (r0<<1) | rd
  1037  		lsl := m.allocateInstr()
  1038  		lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false)
  1039  		m.insert(lsl)
  1040  	default:
  1041  		panic("Unsupported " + arr.String())
  1042  	}
  1043  }
  1044  
  1045  func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) {
  1046  	x, lane := instr.ArgWithLane()
  1047  	arr := ssaLaneToArrangement(lane)
  1048  	ins := m.allocateInstr()
  1049  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1050  	rd := operandNR(m.compiler.VRegOf(instr.Return()))
  1051  	ins.asVecMisc(op, rd, rn, arr)
  1052  	m.insert(ins)
  1053  }
  1054  
  1055  func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) {
  1056  	ins := m.allocateInstr()
  1057  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1058  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1059  	rd := operandNR(m.compiler.VRegOf(ret))
  1060  	ins.asVecRRR(op, rd, rn, rm, arr)
  1061  	m.insert(ins)
  1062  }
  1063  
  1064  func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
  1065  	if arr != vecArrangement2D {
  1066  		mul := m.allocateInstr()
  1067  		mul.asVecRRR(vecOpMul, rd, rn, rm, arr)
  1068  		m.insert(mul)
  1069  	} else {
  1070  		tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1071  		tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1072  		tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1073  
  1074  		tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1075  
  1076  		// Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696
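        		// Roughly: with each 64-bit lane split as a = aHi<<32 | aLo and b = bHi<<32 | bLo,
        		//   a*b (mod 2^64) = aLo*bLo + ((aHi*bLo + aLo*bHi) << 32)
        		// rev64+mul computes the cross products, addp sums each pair, shll widens the sums while
        		// shifting them left by 32, and umlal accumulates the low*low products on top.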
  1077  		rev64 := m.allocateInstr()
  1078  		rev64.asVecMisc(vecOpRev64, tmp2, rm, vecArrangement4S)
  1079  		m.insert(rev64)
  1080  
  1081  		mul := m.allocateInstr()
  1082  		mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S)
  1083  		m.insert(mul)
  1084  
  1085  		xtn1 := m.allocateInstr()
  1086  		xtn1.asVecMisc(vecOpXtn, tmp1, rn, vecArrangement2S)
  1087  		m.insert(xtn1)
  1088  
  1089  		addp := m.allocateInstr()
  1090  		addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S)
  1091  		m.insert(addp)
  1092  
  1093  		xtn2 := m.allocateInstr()
  1094  		xtn2.asVecMisc(vecOpXtn, tmp3, rm, vecArrangement2S)
  1095  		m.insert(xtn2)
  1096  
  1097  		// Note: do not write into the result register directly yet, for the same reason as with BSL:
  1098  		// the UMLAL instruction's destination register is also one of its source registers, so the
  1099  		// value already in the destination register is significant.
  1100  		shll := m.allocateInstr()
  1101  		shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S)
  1102  		m.insert(shll)
  1103  
  1104  		umlal := m.allocateInstr()
  1105  		umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S)
  1106  		m.insert(umlal)
  1107  
  1108  		mov := m.allocateInstr()
  1109  		mov.asFpuMov128(rd.nr(), tmpRes.nr())
  1110  		m.insert(mov)
  1111  	}
  1112  }
  1113  
  1114  func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) {
  1115  	x, y, lane := instr.Arg2WithLane()
  1116  	arr := ssaLaneToArrangement(lane)
  1117  
  1118  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1119  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1120  
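        	// The pseudo-min/max follow Wasm's pmin/pmax semantics:
        	//   pmax(x, y) = (y > x) ? y : x, pmin(x, y) = (x > y) ? y : x,
        	// which is exactly what the fcmgt + bsl sequence below selects.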
  1121  	// Note: this usage of tmp is important.
  1122  	// BSL modifies the destination register, so we need to use a temporary register so that
  1123  	// the actual definition of the destination register happens *after* the BSL instruction.
  1124  	// That way, we can force the spill instruction to be inserted after the BSL instruction.
  1125  	tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1126  
  1127  	fcmgt := m.allocateInstr()
  1128  	if max {
  1129  		fcmgt.asVecRRR(vecOpFcmgt, tmp, rm, rn, arr)
  1130  	} else {
  1131  		// If min, swap the args.
  1132  		fcmgt.asVecRRR(vecOpFcmgt, tmp, rn, rm, arr)
  1133  	}
  1134  	m.insert(fcmgt)
  1135  
  1136  	bsl := m.allocateInstr()
  1137  	bsl.asVecRRRRewrite(vecOpBsl, tmp, rm, rn, vecArrangement16B)
  1138  	m.insert(bsl)
  1139  
  1140  	res := operandNR(m.compiler.VRegOf(instr.Return()))
  1141  	mov2 := m.allocateInstr()
  1142  	mov2.asFpuMov128(res.nr(), tmp.nr())
  1143  	m.insert(mov2)
  1144  }
  1145  
  1146  func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
  1147  	div := m.allocateInstr()
  1148  
  1149  	if signed {
  1150  		div.asALU(aluOpSDiv, rd, rn, rm, _64bit)
  1151  	} else {
  1152  		div.asALU(aluOpUDiv, rd, rn, rm, _64bit)
  1153  	}
  1154  	m.insert(div)
  1155  
  1156  	// Exit with ExitCodeIntegerDivisionByZero if the divisor rm is zero.
  1157  	m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero)
  1158  
  1159  	// Compute the remainder rd = rn - rd*rm (i.e. rn - (rn/rm)*rm) with the MSUB instruction.
  1160  	msub := m.allocateInstr()
  1161  	msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit)
  1162  	m.insert(msub)
  1163  }
  1164  
  1165  func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
  1166  	div := m.allocateInstr()
  1167  
  1168  	if signed {
  1169  		div.asALU(aluOpSDiv, rd, rn, rm, _64bit)
  1170  	} else {
  1171  		div.asALU(aluOpUDiv, rd, rn, rm, _64bit)
  1172  	}
  1173  	m.insert(div)
  1174  
  1175  	// Exit with ExitCodeIntegerDivisionByZero if the divisor rm is zero.
  1176  	m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero)
  1177  
  1178  	if signed {
  1179  		// We need to check for signed overflow, which happens iff the operation is "math.MinInt{32,64} / -1".
  1180  		minusOneCheck := m.allocateInstr()
  1181  		// Sets eq condition if rm == -1.
  1182  		minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit)
  1183  		m.insert(minusOneCheck)
  1184  
  1185  		ccmp := m.allocateInstr()
  1186  		// If the eq condition holds (rm == -1), set the flags from "rn - 1" (overflow iff rn == math.MinInt); otherwise, clear the flags.
  1187  		ccmp.asCCmpImm(rn, 1, eq, 0, _64bit)
  1188  		m.insert(ccmp)
  1189  
  1190  		// Check the overflow flag.
  1191  		m.exitIfNot(execCtxVReg, vs.invert().asCond(), false, wazevoapi.ExitCodeIntegerOverflow)
  1192  	}
  1193  }
  1194  
  1195  // exitIfNot emits a conditional branch to exit if the condition is not met.
  1196  // If `c` (cond type) is a register, `cond64bit` must be chosen to indicate whether the register is 32-bit or 64-bit.
  1197  // Otherwise, `cond64bit` is ignored.
  1198  func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, code wazevoapi.ExitCode) {
  1199  	execCtxTmp := m.copyToTmp(execCtxVReg)
  1200  
  1201  	cbr := m.allocateInstr()
  1202  	m.insert(cbr)
  1203  	m.lowerExitWithCode(execCtxTmp, code)
  1204  	// Conditional branch target is after exit.
  1205  	l := m.insertBrTargetLabel()
  1206  	cbr.asCondBr(c, l, cond64bit)
  1207  }
  1208  
  1209  func (m *machine) lowerFcopysign(x, y, ret ssa.Value) {
  1210  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1211  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1212  	var tmpI, tmpF operand
  1213  	_64 := x.Type() == ssa.TypeF64
  1214  	if _64 {
  1215  		tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1216  		tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
  1217  	} else {
  1218  		tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32))
  1219  		tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
  1220  	}
  1221  	rd := m.compiler.VRegOf(ret)
  1222  	m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64)
  1223  }
  1224  
  1225  func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) {
  1226  	// This is exactly the same code emitted by GCC for "__builtin_copysign":
  1227  	//
  1228  	//    mov     x0, -9223372036854775808
  1229  	//    fmov    d2, x0
  1230  	//    vbit    v0.8b, v1.8b, v2.8b
  1231  	//
  1232  
  1233  	setMSB := m.allocateInstr()
  1234  	if _64bit {
  1235  		m.lowerConstantI64(tmpI.nr(), math.MinInt64)
  1236  		setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0))
  1237  	} else {
  1238  		m.lowerConstantI32(tmpI.nr(), math.MinInt32)
  1239  		setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0))
  1240  	}
  1241  	m.insert(setMSB)
  1242  
  1243  	tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1244  
  1245  	mov := m.allocateInstr()
  1246  	mov.asFpuMov64(tmpReg.nr(), rn.nr())
  1247  	m.insert(mov)
  1248  
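        	// BIT copies each bit of rm into tmpReg only where the mask (tmpF, sign bit only) is set,
        	// so the result takes its sign bit from rm (y) and all remaining bits from rn (x).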
  1249  	vbit := m.allocateInstr()
  1250  	vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B)
  1251  	m.insert(vbit)
  1252  
  1253  	movDst := m.allocateInstr()
  1254  	movDst.asFpuMov64(rd.nr(), tmpReg.nr())
  1255  	m.insert(movDst)
  1256  }
  1257  
  1258  func (m *machine) lowerBitcast(instr *ssa.Instruction) {
  1259  	v, dstType := instr.BitcastData()
  1260  	srcType := v.Type()
  1261  	rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
  1262  	rd := operandNR(m.compiler.VRegOf(instr.Return()))
  1263  	srcInt := srcType.IsInt()
  1264  	dstInt := dstType.IsInt()
  1265  	switch {
  1266  	case srcInt && !dstInt: // Int to Float:
  1267  		mov := m.allocateInstr()
  1268  		var arr vecArrangement
  1269  		if srcType.Bits() == 64 {
  1270  			arr = vecArrangementD
  1271  		} else {
  1272  			arr = vecArrangementS
  1273  		}
  1274  		mov.asMovToVec(rd, rn, arr, vecIndex(0))
  1275  		m.insert(mov)
  1276  	case !srcInt && dstInt: // Float to Int:
  1277  		mov := m.allocateInstr()
  1278  		var arr vecArrangement
  1279  		if dstType.Bits() == 64 {
  1280  			arr = vecArrangementD
  1281  		} else {
  1282  			arr = vecArrangementS
  1283  		}
  1284  		mov.asMovFromVec(rd, rn, arr, vecIndex(0), false)
  1285  		m.insert(mov)
  1286  	default:
  1287  		panic("TODO?BUG?")
  1288  	}
  1289  }
  1290  
  1291  func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) {
  1292  	rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone)
  1293  	rd := operandNR(m.compiler.VRegOf(out))
  1294  
  1295  	neg := m.allocateInstr()
  1296  	neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64)
  1297  	m.insert(neg)
  1298  }
  1299  
  1300  func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) {
  1301  	if !nonTrapping {
  1302  		// First of all, we have to clear the FPU flags.
  1303  		flagClear := m.allocateInstr()
  1304  		flagClear.asMovToFPSR(xzrVReg)
  1305  		m.insert(flagClear)
  1306  	}
  1307  
  1308  	// Then, do the conversion which doesn't trap inherently.
  1309  	cvt := m.allocateInstr()
  1310  	cvt.asFpuToInt(rd, rn, signed, src64bit, dst64bit)
  1311  	m.insert(cvt)
  1312  
  1313  	if !nonTrapping {
  1314  		tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
  1315  
  1316  		// After the conversion, check the FPU flags.
  1317  		getFlag := m.allocateInstr()
  1318  		getFlag.asMovFromFPSR(tmpReg)
  1319  		m.insert(getFlag)
  1320  
  1321  		execCtx := m.copyToTmp(ctx)
  1322  		_rn := operandNR(m.copyToTmp(rn.nr()))
  1323  
  1324  		// Check if the conversion was undefined by comparing the status with 1.
  1325  		// See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register
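        		// Bit 0 of FPSR is IOC (Invalid Operation), which the conversion sets when the input
        		// is a NaN or lies outside the range of the destination integer type.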
  1326  		alu := m.allocateInstr()
  1327  		alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true)
  1328  		m.insert(alu)
  1329  
  1330  		// If it is not undefined, we can return the result.
  1331  		ok := m.allocateInstr()
  1332  		m.insert(ok)
  1333  
  1334  		// Otherwise, we have to choose the exit status depending on whether it is an overflow or a NaN conversion.
  1335  
  1336  		// Compare the input with itself to check if it is a NaN.
  1337  		fpuCmp := m.allocateInstr()
  1338  		fpuCmp.asFpuCmp(_rn, _rn, src64bit)
  1339  		m.insert(fpuCmp)
  1340  		// If the VC condition does not hold (i.e. VS holds), it is a NaN.
  1341  		m.exitIfNot(execCtx, vc.asCond(), false, wazevoapi.ExitCodeInvalidConversionToInteger)
  1342  		// Otherwise, it is an overflow.
  1343  		m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
  1344  
  1345  		// Conditional branch target is after exit.
  1346  		l := m.insertBrTargetLabel()
  1347  		ok.asCondBr(ne.asCond(), l, false /* ignored */)
  1348  	}
  1349  }
  1350  
  1351  func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) {
  1352  	cvt := m.allocateInstr()
  1353  	cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit)
  1354  	m.insert(cvt)
  1355  }
  1356  
  1357  func (m *machine) lowerFpuBinOp(si *ssa.Instruction) {
  1358  	instr := m.allocateInstr()
  1359  	var op fpuBinOp
  1360  	switch si.Opcode() {
  1361  	case ssa.OpcodeFadd:
  1362  		op = fpuBinOpAdd
  1363  	case ssa.OpcodeFsub:
  1364  		op = fpuBinOpSub
  1365  	case ssa.OpcodeFmul:
  1366  		op = fpuBinOpMul
  1367  	case ssa.OpcodeFdiv:
  1368  		op = fpuBinOpDiv
  1369  	case ssa.OpcodeFmax:
  1370  		op = fpuBinOpMax
  1371  	case ssa.OpcodeFmin:
  1372  		op = fpuBinOpMin
  1373  	}
  1374  	x, y := si.Arg2()
  1375  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1376  	rn := m.getOperand_NR(xDef, extModeNone)
  1377  	rm := m.getOperand_NR(yDef, extModeNone)
  1378  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1379  	instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64)
  1380  	m.insert(instr)
  1381  }
  1382  
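// lowerSubOrAdd lowers an integer add or sub. As an illustration (the exact patterns handled depend on
// getOperand_MaybeNegatedImm12_ER_SR_NR): when y is a small negative constant, the operand getter may return
// |y| together with yNegated=true, so that e.g. "x + (-8)" can be encoded as a single "sub rd, rn, #8"
// instead of materializing the constant into a register first.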
  1383  func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) {
  1384  	x, y := si.Arg2()
  1385  	if !x.Type().IsInt() {
  1386  		panic(fmt.Sprintf("BUG: expected an integer type for add/sub, but got %s", x.Type()))
  1387  	}
  1388  
  1389  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1390  	rn := m.getOperand_NR(xDef, extModeNone)
  1391  	rm, yNegated := m.getOperand_MaybeNegatedImm12_ER_SR_NR(yDef, extModeNone)
  1392  
  1393  	var aop aluOp
  1394  	switch {
  1395  	case add && !yNegated: // rn+rm = x+y
  1396  		aop = aluOpAdd
  1397  	case add && yNegated: // rn-rm = x-(-y) = x+y
  1398  		aop = aluOpSub
  1399  	case !add && !yNegated: // rn-rm = x-y
  1400  		aop = aluOpSub
  1401  	case !add && yNegated: // rn+rm = x+(-y) = x-y
  1402  		aop = aluOpAdd
  1403  	}
  1404  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1405  	alu := m.allocateInstr()
  1406  	alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64)
  1407  	m.insert(alu)
  1408  }
  1409  
  1410  // InsertMove implements backend.Machine.
  1411  func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) {
  1412  	instr := m.allocateInstr()
  1413  	switch typ {
  1414  	case ssa.TypeI32, ssa.TypeI64:
  1415  		instr.asMove64(dst, src)
  1416  	case ssa.TypeF32, ssa.TypeF64:
  1417  		instr.asFpuMov64(dst, src)
  1418  	case ssa.TypeV128:
  1419  		instr.asFpuMov128(dst, src)
  1420  	default:
  1421  		panic("TODO")
  1422  	}
  1423  	m.insert(instr)
  1424  }
  1425  
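// lowerIcmp lowers a scalar integer comparison producing a 0/1 result. The emitted pair is roughly:
//
//	subs xzr, rn, rm   ;; compare by subtracting into the zero register (flags only)
//	cset rd, <cond>    ;; materialize the condition flag as 0 or 1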
  1426  func (m *machine) lowerIcmp(si *ssa.Instruction) {
  1427  	x, y, c := si.IcmpData()
  1428  	flag := condFlagFromSSAIntegerCmpCond(c)
  1429  
  1430  	in64bit := x.Type().Bits() == 64
  1431  	var ext extMode
  1432  	if in64bit {
  1433  		if c.Signed() {
  1434  			ext = extModeSignExtend64
  1435  		} else {
  1436  			ext = extModeZeroExtend64
  1437  		}
  1438  	} else {
  1439  		if c.Signed() {
  1440  			ext = extModeSignExtend32
  1441  		} else {
  1442  			ext = extModeZeroExtend32
  1443  		}
  1444  	}
  1445  
  1446  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
  1447  	rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext)
  1448  	alu := m.allocateInstr()
  1449  	alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit)
  1450  	m.insert(alu)
  1451  
  1452  	cset := m.allocateInstr()
  1453  	cset.asCSet(m.compiler.VRegOf(si.Return()), false, flag)
  1454  	m.insert(cset)
  1455  }
  1456  
  1457  func (m *machine) lowerVIcmp(si *ssa.Instruction) {
  1458  	x, y, c, lane := si.VIcmpData()
  1459  	flag := condFlagFromSSAIntegerCmpCond(c)
  1460  	arr := ssaLaneToArrangement(lane)
  1461  
  1462  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1463  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1464  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1465  
  1466  	switch flag {
  1467  	case eq:
  1468  		cmp := m.allocateInstr()
  1469  		cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr)
  1470  		m.insert(cmp)
  1471  	case ne:
  1472  		cmp := m.allocateInstr()
  1473  		cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr)
  1474  		m.insert(cmp)
  1475  		not := m.allocateInstr()
  1476  		not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
  1477  		m.insert(not)
  1478  	case ge:
  1479  		cmp := m.allocateInstr()
  1480  		cmp.asVecRRR(vecOpCmge, rd, rn, rm, arr)
  1481  		m.insert(cmp)
  1482  	case gt:
  1483  		cmp := m.allocateInstr()
  1484  		cmp.asVecRRR(vecOpCmgt, rd, rn, rm, arr)
  1485  		m.insert(cmp)
  1486  	case le:
  1487  		cmp := m.allocateInstr()
  1488  		cmp.asVecRRR(vecOpCmge, rd, rm, rn, arr) // rm, rn are swapped
  1489  		m.insert(cmp)
  1490  	case lt:
  1491  		cmp := m.allocateInstr()
  1492  		cmp.asVecRRR(vecOpCmgt, rd, rm, rn, arr) // rm, rn are swapped
  1493  		m.insert(cmp)
  1494  	case hs:
  1495  		cmp := m.allocateInstr()
  1496  		cmp.asVecRRR(vecOpCmhs, rd, rn, rm, arr)
  1497  		m.insert(cmp)
  1498  	case hi:
  1499  		cmp := m.allocateInstr()
  1500  		cmp.asVecRRR(vecOpCmhi, rd, rn, rm, arr)
  1501  		m.insert(cmp)
  1502  	case ls:
  1503  		cmp := m.allocateInstr()
  1504  		cmp.asVecRRR(vecOpCmhs, rd, rm, rn, arr) // rm, rn are swapped
  1505  		m.insert(cmp)
  1506  	case lo:
  1507  		cmp := m.allocateInstr()
  1508  		cmp.asVecRRR(vecOpCmhi, rd, rm, rn, arr) // rm, rn are swapped
  1509  		m.insert(cmp)
  1510  	}
  1511  }
  1512  
  1513  func (m *machine) lowerVFcmp(si *ssa.Instruction) {
  1514  	x, y, c, lane := si.VFcmpData()
  1515  	flag := condFlagFromSSAFloatCmpCond(c)
  1516  	arr := ssaLaneToArrangement(lane)
  1517  
  1518  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1519  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1520  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1521  
  1522  	switch flag {
  1523  	case eq:
  1524  		cmp := m.allocateInstr()
  1525  		cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr)
  1526  		m.insert(cmp)
  1527  	case ne:
  1528  		cmp := m.allocateInstr()
  1529  		cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr)
  1530  		m.insert(cmp)
  1531  		not := m.allocateInstr()
  1532  		not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
  1533  		m.insert(not)
  1534  	case ge:
  1535  		cmp := m.allocateInstr()
  1536  		cmp.asVecRRR(vecOpFcmge, rd, rn, rm, arr)
  1537  		m.insert(cmp)
  1538  	case gt:
  1539  		cmp := m.allocateInstr()
  1540  		cmp.asVecRRR(vecOpFcmgt, rd, rn, rm, arr)
  1541  		m.insert(cmp)
  1542  	case mi:
  1543  		cmp := m.allocateInstr()
  1544  		cmp.asVecRRR(vecOpFcmgt, rd, rm, rn, arr) // rm, rn are swapped
  1545  		m.insert(cmp)
  1546  	case ls:
  1547  		cmp := m.allocateInstr()
  1548  		cmp.asVecRRR(vecOpFcmge, rd, rm, rn, arr) // rm, rn are swapped
  1549  		m.insert(cmp)
  1550  	}
  1551  }
  1552  
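// lowerVfpuToInt lowers a vector float-to-integer truncation. Roughly:
//
//	fcvtzs/fcvtzu vd.<arr>, vn.<arr>  ;; signed/unsigned truncating conversion
//	sqxtn/uqxtn   vd.2s, vd.2d        ;; only for the 2D case: narrow the 64-bit lanes to 32-bit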
  1553  func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) {
  1554  	cvt := m.allocateInstr()
  1555  	if signed {
  1556  		cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr)
  1557  	} else {
  1558  		cvt.asVecMisc(vecOpFcvtzu, rd, rn, arr)
  1559  	}
  1560  	m.insert(cvt)
  1561  
  1562  	if arr == vecArrangement2D {
  1563  		narrow := m.allocateInstr()
  1564  		if signed {
  1565  			narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S)
  1566  		} else {
  1567  			narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S)
  1568  		}
  1569  		m.insert(narrow)
  1570  	}
  1571  }
  1572  
  1573  func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) {
  1574  	cvt := m.allocateInstr()
  1575  	if signed {
  1576  		cvt.asVecMisc(vecOpScvtf, rd, rn, arr)
  1577  	} else {
  1578  		cvt.asVecMisc(vecOpUcvtf, rd, rn, arr)
  1579  	}
  1580  	m.insert(cvt)
  1581  }
  1582  
  1583  func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) {
  1584  	x, amount := si.Arg2()
  1585  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
  1586  	rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits())
  1587  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1588  
  1589  	alu := m.allocateInstr()
  1590  	alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64)
  1591  	m.insert(alu)
  1592  }
  1593  
  1594  func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp) {
  1595  	x, y := si.Arg2()
  1596  	if !x.Type().IsInt() {
  1597  		panic(fmt.Sprintf("BUG: expected an integer type for a bitwise op, but got %s", x.Type()))
  1598  	}
  1599  
  1600  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1601  	rn := m.getOperand_NR(xDef, extModeNone)
  1602  	rm := m.getOperand_SR_NR(yDef, extModeNone)
  1603  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1604  
  1605  	alu := m.allocateInstr()
  1606  	alu.asALU(op, rd, rn, rm, si.Return().Type().Bits() == 64)
  1607  	m.insert(alu)
  1608  }
  1609  
  1610  func (m *machine) lowerRotl(si *ssa.Instruction) {
  1611  	x, y := si.Arg2()
  1612  	r := si.Return()
  1613  	_64 := r.Type().Bits() == 64
  1614  
  1615  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1616  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1617  	var tmp operand
  1618  	if _64 {
  1619  		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
  1620  	} else {
  1621  		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
  1622  	}
  1623  	rd := operandNR(m.compiler.VRegOf(r))
  1624  
  1625  	// rotl is implemented as neg + rotr; see lowerRotlImpl.
  1626  	m.lowerRotlImpl(rd, rn, rm, tmp, _64)
  1627  }
  1628  
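// lowerRotlImpl emits a rotate-left in terms of neg + rotr, roughly:
//
//	neg tmp, rm      ;; i.e. sub tmp, xzr, rm
//	ror rd, rn, tmp  ;; rotating right by -rm is rotating left by rm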
  1629  func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) {
  1630  	// Encode rotl as neg + rotr: neg is a sub against the zero-reg.
  1631  	neg := m.allocateInstr()
  1632  	neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit)
  1633  	m.insert(neg)
  1634  	alu := m.allocateInstr()
  1635  	alu.asALU(aluOpRotR, rd, rn, tmp, is64bit)
  1636  	m.insert(alu)
  1637  }
  1638  
  1639  func (m *machine) lowerRotr(si *ssa.Instruction) {
  1640  	x, y := si.Arg2()
  1641  
  1642  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1643  	rn := m.getOperand_NR(xDef, extModeNone)
  1644  	rm := m.getOperand_NR(yDef, extModeNone)
  1645  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1646  
  1647  	alu := m.allocateInstr()
  1648  	alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64)
  1649  	m.insert(alu)
  1650  }
  1651  
  1652  func (m *machine) lowerExtend(arg, ret ssa.Value, from, to byte, signed bool) {
  1653  	rd := m.compiler.VRegOf(ret)
  1654  	rn := m.getOperand_NR(m.compiler.ValueDefinition(arg), extModeNone)
  1655  
  1656  	ext := m.allocateInstr()
  1657  	ext.asExtend(rd, rn.nr(), from, to, signed)
  1658  	m.insert(ext)
  1659  }
  1660  
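// lowerFcmp lowers a scalar float comparison producing a 0/1 result, roughly:
//
//	fcmp rn, rm
//	cset rd, <cond>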
  1661  func (m *machine) lowerFcmp(x, y, result ssa.Value, c ssa.FloatCmpCond) {
  1662  	rn, rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1663  
  1664  	fc := m.allocateInstr()
  1665  	fc.asFpuCmp(rn, rm, x.Type().Bits() == 64)
  1666  	m.insert(fc)
  1667  
  1668  	cset := m.allocateInstr()
  1669  	cset.asCSet(m.compiler.VRegOf(result), false, condFlagFromSSAFloatCmpCond(c))
  1670  	m.insert(cset)
  1671  }
  1672  
  1673  func (m *machine) lowerImul(x, y, result ssa.Value) {
  1674  	rd := m.compiler.VRegOf(result)
  1675  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1676  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1677  
  1678  	// TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg.
  1679  
  1680  	mul := m.allocateInstr()
  1681  	mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64)
  1682  	m.insert(mul)
  1683  }
  1684  
  1685  func (m *machine) lowerClz(x, result ssa.Value) {
  1686  	rd := m.compiler.VRegOf(result)
  1687  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1688  	clz := m.allocateInstr()
  1689  	clz.asBitRR(bitOpClz, rd, rn.nr(), x.Type().Bits() == 64)
  1690  	m.insert(clz)
  1691  }
  1692  
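// lowerCtz lowers a count-trailing-zeros. arm64 has no dedicated ctz instruction, so the bits are
// reversed first, roughly:
//
//	rbit tmp, rn
//	clz  rd, tmp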
  1693  func (m *machine) lowerCtz(x, result ssa.Value) {
  1694  	rd := m.compiler.VRegOf(result)
  1695  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1696  	rbit := m.allocateInstr()
  1697  	_64 := x.Type().Bits() == 64
  1698  	var tmpReg regalloc.VReg
  1699  	if _64 {
  1700  		tmpReg = m.compiler.AllocateVReg(ssa.TypeI64)
  1701  	} else {
  1702  		tmpReg = m.compiler.AllocateVReg(ssa.TypeI32)
  1703  	}
  1704  	rbit.asBitRR(bitOpRbit, tmpReg, rn.nr(), _64)
  1705  	m.insert(rbit)
  1706  
  1707  	clz := m.allocateInstr()
  1708  	clz.asBitRR(bitOpClz, rd, tmpReg, _64)
  1709  	m.insert(clz)
  1710  }
  1711  
  1712  func (m *machine) lowerPopcnt(x, result ssa.Value) {
  1713  	// arm64 doesn't have an instruction for population count on a scalar register,
  1714  	// so we use the vector instruction `cnt`.
  1715  	// This is exactly how the official Go compiler implements math/bits.OnesCount.
  1716  	// For example, "func() int { return bits.OnesCount(10) }" is compiled as
  1717  	//
  1718  	//    MOVD    $10, R0 ;; Load 10.
  1719  	//    FMOVD   R0, F0
  1720  	//    VCNT    V0.B8, V0.B8
  1721  	//    UADDLV  V0.B8, V0
  1722  	//
  1723  	// In AArch64 assembly, the FMOVD above is encoded as `ins` and VCNT as `cnt`,
  1724  	// and the register names may differ. In our encoding we use the following
  1725  	// instructions:
  1726  	//
  1727  	//    ins v0.d[0], x0     ;; mov from GPR to vec (FMOV above) is encoded as INS
  1728  	//    cnt v0.16b, v0.16b  ;; we use vec arrangement 16b
  1729  	//    uaddlv h0, v0.8b    ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b
  1730  	//    mov x5, v0.d[0]     ;; finally we mov the result back to a GPR
  1731  	//
  1732  
  1733  	rd := operandNR(m.compiler.VRegOf(result))
  1734  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1735  
  1736  	rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1737  	ins := m.allocateInstr()
  1738  	ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0))
  1739  	m.insert(ins)
  1740  
  1741  	rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1742  	cnt := m.allocateInstr()
  1743  	cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B)
  1744  	m.insert(cnt)
  1745  
  1746  	rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1747  	uaddlv := m.allocateInstr()
  1748  	uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B)
  1749  	m.insert(uaddlv)
  1750  
  1751  	mov := m.allocateInstr()
  1752  	mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0), false)
  1753  	m.insert(mov)
  1754  }
  1755  
  1756  // lowerExitWithCode emits the sequence that stores the given exit code into the execution context,
// saves the current stack pointer and the address of this exit, and then exits the machine code.
// execCtxVReg must hold the pointer to the execution context.
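//
// The emitted sequence is roughly the following (the offsets refer to fields of the execution context):
//
//	movz tmp1, #code
//	str  tmp1, [execCtx, #ExitCodeOffset]
//	mov  tmp2, sp
//	str  tmp2, [execCtx, #StackPointerBeforeGoCall]
//	adr  tmp3, #0
//	str  tmp3, [execCtx, #GoCallReturnAddress]
//	(exit sequence)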
  1757  func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.ExitCode) {
  1758  	tmpReg1 := m.compiler.AllocateVReg(ssa.TypeI32)
  1759  	loadExitCodeConst := m.allocateInstr()
  1760  	loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true)
  1761  
  1762  	setExitCode := m.allocateInstr()
  1763  	setExitCode.asStore(operandNR(tmpReg1),
  1764  		addressMode{
  1765  			kind: addressModeKindRegUnsignedImm12,
  1766  			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
  1767  		}, 32)
  1768  
  1769  	// In order to unwind the stack, we also need to save the current stack pointer into the execution context:
  1770  	tmp2 := m.compiler.AllocateVReg(ssa.TypeI64)
  1771  	movSpToTmp := m.allocateInstr()
  1772  	movSpToTmp.asMove64(tmp2, spVReg)
  1773  	strSpToExecCtx := m.allocateInstr()
  1774  	strSpToExecCtx.asStore(operandNR(tmp2),
  1775  		addressMode{
  1776  			kind: addressModeKindRegUnsignedImm12,
  1777  			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
  1778  		}, 64)
  1779  	// Also save the address of this exit into the execution context.
  1780  	tmp3 := m.compiler.AllocateVReg(ssa.TypeI64)
  1781  	currentAddrToTmp := m.allocateInstr()
  1782  	currentAddrToTmp.asAdr(tmp3, 0)
  1783  	storeCurrentAddrToExecCtx := m.allocateInstr()
  1784  	storeCurrentAddrToExecCtx.asStore(operandNR(tmp3),
  1785  		addressMode{
  1786  			kind: addressModeKindRegUnsignedImm12,
  1787  			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
  1788  		}, 64)
  1789  
  1790  	exitSeq := m.allocateInstr()
  1791  	exitSeq.asExitSequence(execCtxVReg)
  1792  
  1793  	m.insert(loadExitCodeConst)
  1794  	m.insert(setExitCode)
  1795  	m.insert(movSpToTmp)
  1796  	m.insert(strSpToExecCtx)
  1797  	m.insert(currentAddrToTmp)
  1798  	m.insert(storeCurrentAddrToExecCtx)
  1799  	m.insert(exitSeq)
  1800  }
  1801  
  1802  func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) {
  1803  	if x.Type() != y.Type() {
  1804  		panic(
  1805  			fmt.Sprintf("TODO(maybe): support icmp with different types: v%d=%s != v%d=%s",
  1806  				x.ID(), x.Type(), y.ID(), y.Type()))
  1807  	}
  1808  
  1809  	extMod := extModeOf(x.Type(), signed)
  1810  
  1811  	// First operand must be in pure register form.
  1812  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extMod)
  1813  	// Second operand can be in any of Imm12, ER, SR, or NR form supported by the SUBS instructions.
  1814  	rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), extMod)
  1815  
  1816  	alu := m.allocateInstr()
  1817  	// subs zr, rn, rm
  1818  	alu.asALU(
  1819  		aluOpSubS,
  1820  		// We don't need the result, just need to set flags.
  1821  		operandNR(xzrVReg),
  1822  		rn,
  1823  		rm,
  1824  		x.Type().Bits() == 64,
  1825  	)
  1826  	m.insert(alu)
  1827  }
  1828  
  1829  func (m *machine) lowerFcmpToFlag(x, y ssa.Value) {
  1830  	if x.Type() != y.Type() {
  1831  		panic("TODO(maybe): support fcmp with different types")
  1832  	}
  1833  
  1834  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1835  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1836  	cmp := m.allocateInstr()
  1837  	cmp.asFpuCmp(rn, rm, x.Type().Bits() == 64)
  1838  	m.insert(cmp)
  1839  }
  1840  
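// lowerExitIfTrueWithCode lowers a conditional exit. The Icmp feeding the condition is lowered into a
// flag-setting compare, and the exit sequence is skipped with an inverted conditional branch, roughly:
//
//	subs xzr, rn, rm            ;; from the preceding Icmp
//	b.<inverted cond> continue  ;; skip the exit when the condition is false
//	(exit sequence with the given code)
//	continue: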
  1841  func (m *machine) lowerExitIfTrueWithCode(execCtxVReg regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) {
  1842  	condDef := m.compiler.ValueDefinition(cond)
  1843  	if !m.compiler.MatchInstr(condDef, ssa.OpcodeIcmp) {
  1844  		panic("TODO: OpcodeExitIfTrueWithCode must come after Icmp at the moment: " + condDef.Instr.Opcode().String())
  1845  	}
  1846  	condDef.Instr.MarkLowered()
  1847  
  1848  	cvalInstr := condDef.Instr
  1849  	x, y, c := cvalInstr.IcmpData()
  1850  	signed := c.Signed()
  1851  	m.lowerIcmpToFlag(x, y, signed)
  1852  
  1853  	execCtxTmp := m.copyToTmp(execCtxVReg)
  1854  
  1855  	// We have to skip the entire exit sequence if the condition is false.
  1856  	cbr := m.allocateInstr()
  1857  	m.insert(cbr)
  1858  	m.lowerExitWithCode(execCtxTmp, code)
  1859  	// The conditional branch target is placed right after the exit sequence.
  1860  	l := m.insertBrTargetLabel()
  1861  	cbr.asCondBr(condFlagFromSSAIntegerCmpCond(c).invert().asCond(), l, false /* ignored */)
  1862  }
  1863  
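// lowerSelect lowers a select. When the condition comes from an Icmp/Fcmp, the flag-setting compare is
// emitted directly; otherwise the condition value is compared against zero. The selection itself then
// becomes, roughly:
//
//	csel  rd, rn, rm, <cond>   ;; integer operands
//	fcsel rd, rn, rm, <cond>   ;; float operands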
  1864  func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
  1865  	cvalDef := m.compiler.ValueDefinition(c)
  1866  
  1867  	var cc condFlag
  1868  	switch {
  1869  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // This case, we can use the ALU flag set by SUBS instruction.
  1870  		cvalInstr := cvalDef.Instr
  1871  		x, y, c := cvalInstr.IcmpData()
  1872  		cc = condFlagFromSSAIntegerCmpCond(c)
  1873  		m.lowerIcmpToFlag(x, y, c.Signed())
  1874  		cvalDef.Instr.MarkLowered()
  1875  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly.
  1876  		cvalInstr := cvalDef.Instr
  1877  		x, y, c := cvalInstr.FcmpData()
  1878  		cc = condFlagFromSSAFloatCmpCond(c)
  1879  		m.lowerFcmpToFlag(x, y)
  1880  		cvalDef.Instr.MarkLowered()
  1881  	default:
  1882  		rn := m.getOperand_NR(cvalDef, extModeNone)
  1883  		if c.Type() != ssa.TypeI32 && c.Type() != ssa.TypeI64 {
  1884  			panic("TODO?BUG?: support select with non-integer condition")
  1885  		}
  1886  		alu := m.allocateInstr()
  1887  		// subs zr, rn, zr
  1888  		alu.asALU(
  1889  			aluOpSubS,
  1890  			// We don't need the result, just need to set flags.
  1891  			operandNR(xzrVReg),
  1892  			rn,
  1893  			operandNR(xzrVReg),
  1894  			c.Type().Bits() == 64,
  1895  		)
  1896  		m.insert(alu)
  1897  		cc = ne
  1898  	}
  1899  
  1900  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1901  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1902  
  1903  	rd := operandNR(m.compiler.VRegOf(result))
  1904  	switch x.Type() {
  1905  	case ssa.TypeI32, ssa.TypeI64:
  1906  		// csel rd, rn, rm, cc
  1907  		csel := m.allocateInstr()
  1908  		csel.asCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
  1909  		m.insert(csel)
  1910  	case ssa.TypeF32, ssa.TypeF64:
  1911  		// fcsel rd, rn, rm, cc
  1912  		fcsel := m.allocateInstr()
  1913  		fcsel.asFpuCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
  1914  		m.insert(fcsel)
  1915  	default:
  1916  		panic("BUG")
  1917  	}
  1918  }
  1919  
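// lowerSelectVec lowers a select on 128-bit vector operands by building an all-ones/all-zeros mask from
// the condition and then using BSL, roughly:
//
//	subs  xzr, rc, xzr      ;; set flags from the condition
//	csetm tmp, ne           ;; tmp = all ones if rc != 0, else zero
//	dup   vtmp.2d, tmp      ;; broadcast the mask into a vector register
//	bsl   vtmp.16b, rn, rm  ;; bitwise select between rn and rm
//	mov   rd, vtmp          ;; move the result into the destination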
  1920  func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
  1921  	// First check if `rc` is zero or not.
  1922  	checkZero := m.allocateInstr()
  1923  	checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false)
  1924  	m.insert(checkZero)
  1925  
  1926  	// Then use CSETM to set all bits of `allOnesOrZero` to one if `rc` is non-zero, and to zero otherwise.
  1927  	allOnesOrZero := m.compiler.AllocateVReg(ssa.TypeI64)
  1928  	cset := m.allocateInstr()
  1929  	cset.asCSet(allOnesOrZero, true, ne)
  1930  	m.insert(cset)
  1931  
  1932  	// Then move the bits to the result vector register.
  1933  	tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1934  	dup := m.allocateInstr()
  1935  	dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D)
  1936  	m.insert(dup)
  1937  
  1938  	// Now that every bit of `tmp2` is either one (rc != 0) or zero (rc == 0),
  1939  	// we can use bsl to select between `rn` and `rm`.
  1940  	ins := m.allocateInstr()
  1941  	ins.asVecRRRRewrite(vecOpBsl, tmp2, rn, rm, vecArrangement16B)
  1942  	m.insert(ins)
  1943  
  1944  	// Finally, move the result to the destination register.
  1945  	mov2 := m.allocateInstr()
  1946  	mov2.asFpuMov128(rd.nr(), tmp2.nr())
  1947  	m.insert(mov2)
  1948  }
  1949  
  1950  // copyToTmp copies the given regalloc.VReg to a temporary register. This is called before a conditional branch
  1951  // into an exit sequence to avoid register-allocation issues, e.g. a reload being inserted in the middle of the
// exit sequence, which is off the normal execution path.
  1952  func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg {
  1953  	typ := m.compiler.TypeOf(v)
  1954  	mov := m.allocateInstr()
  1955  	tmp := m.compiler.AllocateVReg(typ)
  1956  	if typ.IsInt() {
  1957  		mov.asMove64(tmp, v)
  1958  	} else {
  1959  		mov.asFpuMov128(tmp, v)
  1960  	}
  1961  	m.insert(mov)
  1962  	return tmp
  1963  }