github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/engine/wazevo/backend/isa/arm64/lower_instr.go (about)

     1  package arm64
     2  
      3  // Files prefixed with lower_instr** implement instruction selection: lowering SSA-level instructions
      4  // into machine-specific instructions.
      5  //
      6  // Importantly, the lower** functions also perform tree matching: they find patterns in the given instruction tree
      7  // and merge multiple instructions where possible. This can be considered "N:1" instruction selection.
     8  
     9  import (
    10  	"fmt"
    11  	"math"
    12  
    13  	"github.com/bananabytelabs/wazero/internal/engine/wazevo/backend/regalloc"
    14  	"github.com/bananabytelabs/wazero/internal/engine/wazevo/ssa"
    15  	"github.com/bananabytelabs/wazero/internal/engine/wazevo/wazevoapi"
    16  )
    17  
    18  // LowerSingleBranch implements backend.Machine.
    19  func (m *machine) LowerSingleBranch(br *ssa.Instruction) {
    20  	ectx := m.executableContext
    21  	switch br.Opcode() {
    22  	case ssa.OpcodeJump:
    23  		_, _, targetBlk := br.BranchData()
    24  		if br.IsFallthroughJump() {
    25  			return
    26  		}
    27  		b := m.allocateInstr()
    28  		target := ectx.GetOrAllocateSSABlockLabel(targetBlk)
    29  		if target == labelReturn {
    30  			b.asRet(m.currentABI)
    31  		} else {
    32  			b.asBr(target)
    33  		}
    34  		m.insert(b)
    35  	case ssa.OpcodeBrTable:
    36  		m.lowerBrTable(br)
    37  	default:
     38  		panic("BUG: unexpected branch opcode: " + br.Opcode().String())
    39  	}
    40  }
    41  
    42  func (m *machine) lowerBrTable(i *ssa.Instruction) {
    43  	index, targets := i.BrTableData()
    44  	indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone)
    45  
     46  	// First, we have to bounds-check the index, and clamp it to the default target
     47  	// (sitting at the end of the list) if it is out of bounds.
    48  
    49  	// mov  maxIndexReg #maximum_index
    50  	// subs wzr, index, maxIndexReg
     51  	// csel adjustedIndex, maxIndexReg, index, hs ;; if index is higher than or equal to maxIndexReg.
    52  	maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32)
    53  	m.lowerConstantI32(maxIndexReg, int32(len(targets)-1))
    54  	subs := m.allocateInstr()
    55  	subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false)
    56  	m.insert(subs)
    57  	csel := m.allocateInstr()
    58  	adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32)
    59  	csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false)
    60  	m.insert(csel)
    61  
    62  	brSequence := m.allocateInstr()
    63  
    64  	// TODO: reuse the slice!
    65  	labels := make([]uint32, len(targets))
    66  	for j, target := range targets {
    67  		labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target))
    68  	}
    69  
    70  	brSequence.asBrTableSequence(adjustedIndex, labels)
    71  	m.insert(brSequence)
    72  }
    73  
    74  // LowerConditionalBranch implements backend.Machine.
    75  func (m *machine) LowerConditionalBranch(b *ssa.Instruction) {
    76  	exctx := m.executableContext
    77  	cval, args, targetBlk := b.BranchData()
    78  	if len(args) > 0 {
    79  		panic(fmt.Sprintf(
    80  			"conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s",
    81  			exctx.CurrentSSABlk,
    82  			targetBlk,
    83  		))
    84  	}
    85  
    86  	target := exctx.GetOrAllocateSSABlockLabel(targetBlk)
    87  	cvalDef := m.compiler.ValueDefinition(cval)
    88  
    89  	switch {
     90  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // In this case, we can use the ALU flags set by the SUBS instruction.
    91  		cvalInstr := cvalDef.Instr
    92  		x, y, c := cvalInstr.IcmpData()
    93  		cc, signed := condFlagFromSSAIntegerCmpCond(c), c.Signed()
    94  		if b.Opcode() == ssa.OpcodeBrz {
    95  			cc = cc.invert()
    96  		}
    97  
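         		// e.g. brnz (icmp slt x, y), L is lowered to "subs xzr/wzr, x, y; b.lt L",
         		// and brz inverts the condition ("b.ge L").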
    98  		m.lowerIcmpToFlag(x, y, signed)
    99  		cbr := m.allocateInstr()
   100  		cbr.asCondBr(cc.asCond(), target, false /* ignored */)
   101  		m.insert(cbr)
   102  		cvalDef.Instr.MarkLowered()
    103  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // In this case, we can use the FPU flags directly.
   104  		cvalInstr := cvalDef.Instr
   105  		x, y, c := cvalInstr.FcmpData()
   106  		cc := condFlagFromSSAFloatCmpCond(c)
   107  		if b.Opcode() == ssa.OpcodeBrz {
   108  			cc = cc.invert()
   109  		}
   110  		m.lowerFcmpToFlag(x, y)
   111  		cbr := m.allocateInstr()
   112  		cbr.asCondBr(cc.asCond(), target, false /* ignored */)
   113  		m.insert(cbr)
   114  		cvalDef.Instr.MarkLowered()
   115  	default:
   116  		rn := m.getOperand_NR(cvalDef, extModeNone)
   117  		var c cond
   118  		if b.Opcode() == ssa.OpcodeBrz {
   119  			c = registerAsRegZeroCond(rn.nr())
   120  		} else {
   121  			c = registerAsRegNotZeroCond(rn.nr())
   122  		}
   123  		cbr := m.allocateInstr()
   124  		cbr.asCondBr(c, target, false)
   125  		m.insert(cbr)
   126  	}
   127  }
   128  
   129  // LowerInstr implements backend.Machine.
   130  func (m *machine) LowerInstr(instr *ssa.Instruction) {
   131  	if l := instr.SourceOffset(); l.Valid() {
   132  		info := m.allocateInstr().asEmitSourceOffsetInfo(l)
   133  		m.insert(info)
   134  	}
   135  
   136  	switch op := instr.Opcode(); op {
   137  	case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable:
   138  		panic("BUG: branching instructions are handled by LowerBranches")
   139  	case ssa.OpcodeReturn:
   140  		panic("BUG: return must be handled by backend.Compiler")
   141  	case ssa.OpcodeIadd, ssa.OpcodeIsub:
   142  		m.lowerSubOrAdd(instr, op == ssa.OpcodeIadd)
   143  	case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv, ssa.OpcodeFmax, ssa.OpcodeFmin:
   144  		m.lowerFpuBinOp(instr)
   145  	case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined.
   146  	case ssa.OpcodeExitWithCode:
   147  		execCtx, code := instr.ExitWithCodeData()
   148  		m.lowerExitWithCode(m.compiler.VRegOf(execCtx), code)
   149  	case ssa.OpcodeExitIfTrueWithCode:
   150  		execCtx, c, code := instr.ExitIfTrueWithCodeData()
   151  		m.lowerExitIfTrueWithCode(m.compiler.VRegOf(execCtx), c, code)
   152  	case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32:
   153  		m.lowerStore(instr)
   154  	case ssa.OpcodeLoad:
   155  		dst := instr.Return()
   156  		ptr, offset, typ := instr.LoadData()
   157  		m.lowerLoad(ptr, offset, typ, dst)
   158  	case ssa.OpcodeVZeroExtLoad:
   159  		dst := instr.Return()
   160  		ptr, offset, typ := instr.VZeroExtLoadData()
   161  		m.lowerLoad(ptr, offset, typ, dst)
   162  	case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32:
   163  		ptr, offset, _ := instr.LoadData()
   164  		ret := m.compiler.VRegOf(instr.Return())
   165  		m.lowerExtLoad(op, ptr, offset, ret)
   166  	case ssa.OpcodeCall, ssa.OpcodeCallIndirect:
   167  		m.lowerCall(instr)
   168  	case ssa.OpcodeIcmp:
   169  		m.lowerIcmp(instr)
   170  	case ssa.OpcodeVIcmp:
   171  		m.lowerVIcmp(instr)
   172  	case ssa.OpcodeVFcmp:
   173  		m.lowerVFcmp(instr)
   174  	case ssa.OpcodeVCeil:
   175  		m.lowerVecMisc(vecOpFrintp, instr)
   176  	case ssa.OpcodeVFloor:
   177  		m.lowerVecMisc(vecOpFrintm, instr)
   178  	case ssa.OpcodeVTrunc:
   179  		m.lowerVecMisc(vecOpFrintz, instr)
   180  	case ssa.OpcodeVNearest:
   181  		m.lowerVecMisc(vecOpFrintn, instr)
   182  	case ssa.OpcodeVMaxPseudo:
   183  		m.lowerVMinMaxPseudo(instr, true)
   184  	case ssa.OpcodeVMinPseudo:
   185  		m.lowerVMinMaxPseudo(instr, false)
   186  	case ssa.OpcodeBand:
   187  		m.lowerBitwiseAluOp(instr, aluOpAnd)
   188  	case ssa.OpcodeBor:
   189  		m.lowerBitwiseAluOp(instr, aluOpOrr)
   190  	case ssa.OpcodeBxor:
   191  		m.lowerBitwiseAluOp(instr, aluOpEor)
   192  	case ssa.OpcodeIshl:
   193  		m.lowerShifts(instr, extModeNone, aluOpLsl)
   194  	case ssa.OpcodeSshr:
   195  		if instr.Return().Type().Bits() == 64 {
   196  			m.lowerShifts(instr, extModeSignExtend64, aluOpAsr)
   197  		} else {
   198  			m.lowerShifts(instr, extModeSignExtend32, aluOpAsr)
   199  		}
   200  	case ssa.OpcodeUshr:
   201  		if instr.Return().Type().Bits() == 64 {
   202  			m.lowerShifts(instr, extModeZeroExtend64, aluOpLsr)
   203  		} else {
   204  			m.lowerShifts(instr, extModeZeroExtend32, aluOpLsr)
   205  		}
   206  	case ssa.OpcodeRotl:
   207  		m.lowerRotl(instr)
   208  	case ssa.OpcodeRotr:
   209  		m.lowerRotr(instr)
   210  	case ssa.OpcodeSExtend, ssa.OpcodeUExtend:
   211  		from, to, signed := instr.ExtendData()
   212  		m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed)
   213  	case ssa.OpcodeFcmp:
   214  		x, y, c := instr.FcmpData()
   215  		m.lowerFcmp(x, y, instr.Return(), c)
   216  	case ssa.OpcodeImul:
   217  		x, y := instr.Arg2()
   218  		result := instr.Return()
   219  		m.lowerImul(x, y, result)
   220  	case ssa.OpcodeUndefined:
   221  		undef := m.allocateInstr()
   222  		undef.asUDF()
   223  		m.insert(undef)
   224  	case ssa.OpcodeSelect:
   225  		c, x, y := instr.SelectData()
   226  		if x.Type() == ssa.TypeV128 {
   227  			rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
   228  			rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   229  			rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   230  			rd := operandNR(m.compiler.VRegOf(instr.Return()))
   231  			m.lowerSelectVec(rc, rn, rm, rd)
   232  		} else {
   233  			m.lowerSelect(c, x, y, instr.Return())
   234  		}
   235  	case ssa.OpcodeClz:
   236  		x := instr.Arg()
   237  		result := instr.Return()
   238  		m.lowerClz(x, result)
   239  	case ssa.OpcodeCtz:
   240  		x := instr.Arg()
   241  		result := instr.Return()
   242  		m.lowerCtz(x, result)
   243  	case ssa.OpcodePopcnt:
   244  		x := instr.Arg()
   245  		result := instr.Return()
   246  		m.lowerPopcnt(x, result)
   247  	case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat:
   248  		x, ctx := instr.Arg2()
   249  		result := instr.Return()
   250  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   251  		rd := operandNR(m.compiler.VRegOf(result))
   252  		ctxVReg := m.compiler.VRegOf(ctx)
   253  		m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64,
   254  			result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat)
   255  	case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat:
   256  		x, ctx := instr.Arg2()
   257  		result := instr.Return()
   258  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   259  		rd := operandNR(m.compiler.VRegOf(result))
   260  		ctxVReg := m.compiler.VRegOf(ctx)
   261  		m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64,
   262  			result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat)
   263  	case ssa.OpcodeFcvtFromSint:
   264  		x := instr.Arg()
   265  		result := instr.Return()
   266  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   267  		rd := operandNR(m.compiler.VRegOf(result))
   268  		m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
   269  	case ssa.OpcodeFcvtFromUint:
   270  		x := instr.Arg()
   271  		result := instr.Return()
   272  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   273  		rd := operandNR(m.compiler.VRegOf(result))
   274  		m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
   275  	case ssa.OpcodeFdemote:
   276  		v := instr.Arg()
   277  		rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
   278  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   279  		cnt := m.allocateInstr()
   280  		cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false)
   281  		m.insert(cnt)
   282  	case ssa.OpcodeFpromote:
   283  		v := instr.Arg()
   284  		rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
   285  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   286  		cnt := m.allocateInstr()
   287  		cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true)
   288  		m.insert(cnt)
   289  	case ssa.OpcodeIreduce:
   290  		rn := m.getOperand_NR(m.compiler.ValueDefinition(instr.Arg()), extModeNone)
   291  		retVal := instr.Return()
   292  		rd := m.compiler.VRegOf(retVal)
   293  
   294  		if retVal.Type() != ssa.TypeI32 {
   295  			panic("TODO?: Ireduce to non-i32")
   296  		}
   297  		mov := m.allocateInstr()
   298  		mov.asMove32(rd, rn.reg())
   299  		m.insert(mov)
   300  	case ssa.OpcodeFneg:
   301  		m.lowerFpuUniOp(fpuUniOpNeg, instr.Arg(), instr.Return())
   302  	case ssa.OpcodeSqrt:
   303  		m.lowerFpuUniOp(fpuUniOpSqrt, instr.Arg(), instr.Return())
   304  	case ssa.OpcodeCeil:
   305  		m.lowerFpuUniOp(fpuUniOpRoundPlus, instr.Arg(), instr.Return())
   306  	case ssa.OpcodeFloor:
   307  		m.lowerFpuUniOp(fpuUniOpRoundMinus, instr.Arg(), instr.Return())
   308  	case ssa.OpcodeTrunc:
   309  		m.lowerFpuUniOp(fpuUniOpRoundZero, instr.Arg(), instr.Return())
   310  	case ssa.OpcodeNearest:
   311  		m.lowerFpuUniOp(fpuUniOpRoundNearest, instr.Arg(), instr.Return())
   312  	case ssa.OpcodeFabs:
   313  		m.lowerFpuUniOp(fpuUniOpAbs, instr.Arg(), instr.Return())
   314  	case ssa.OpcodeBitcast:
   315  		m.lowerBitcast(instr)
   316  	case ssa.OpcodeFcopysign:
   317  		x, y := instr.Arg2()
   318  		m.lowerFcopysign(x, y, instr.Return())
   319  	case ssa.OpcodeSdiv, ssa.OpcodeUdiv:
   320  		x, y, ctx := instr.Arg3()
   321  		ctxVReg := m.compiler.VRegOf(ctx)
   322  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   323  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   324  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   325  		m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv)
   326  	case ssa.OpcodeSrem, ssa.OpcodeUrem:
   327  		x, y, ctx := instr.Arg3()
   328  		ctxVReg := m.compiler.VRegOf(ctx)
   329  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   330  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   331  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   332  		m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem)
   333  	case ssa.OpcodeVconst:
   334  		result := m.compiler.VRegOf(instr.Return())
   335  		lo, hi := instr.VconstData()
   336  		v := m.allocateInstr()
   337  		v.asLoadFpuConst128(result, lo, hi)
   338  		m.insert(v)
   339  	case ssa.OpcodeVbnot:
   340  		x := instr.Arg()
   341  		ins := m.allocateInstr()
   342  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   343  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   344  		ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B)
   345  		m.insert(ins)
   346  	case ssa.OpcodeVbxor:
   347  		x, y := instr.Arg2()
   348  		m.lowerVecRRR(vecOpEOR, x, y, instr.Return(), vecArrangement16B)
   349  	case ssa.OpcodeVbor:
   350  		x, y := instr.Arg2()
   351  		m.lowerVecRRR(vecOpOrr, x, y, instr.Return(), vecArrangement16B)
   352  	case ssa.OpcodeVband:
   353  		x, y := instr.Arg2()
   354  		m.lowerVecRRR(vecOpAnd, x, y, instr.Return(), vecArrangement16B)
   355  	case ssa.OpcodeVbandnot:
   356  		x, y := instr.Arg2()
   357  		m.lowerVecRRR(vecOpBic, x, y, instr.Return(), vecArrangement16B)
   358  	case ssa.OpcodeVbitselect:
   359  		c, x, y := instr.SelectData()
   360  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   361  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   362  		creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
   363  		tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   364  
    365  		// creg is overwritten by BSL, so we need to copy it into a temporary register before the instruction
    366  		// in case it is used somewhere else.
   367  		mov := m.allocateInstr()
   368  		mov.asFpuMov128(tmp.nr(), creg.nr())
   369  		m.insert(mov)
   370  
   371  		ins := m.allocateInstr()
   372  		ins.asVecRRRRewrite(vecOpBsl, tmp, rn, rm, vecArrangement16B)
   373  		m.insert(ins)
   374  
   375  		mov2 := m.allocateInstr()
   376  		rd := m.compiler.VRegOf(instr.Return())
   377  		mov2.asFpuMov128(rd, tmp.nr())
   378  		m.insert(mov2)
   379  	case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue:
   380  		x, lane := instr.ArgWithLane()
   381  		var arr vecArrangement
   382  		if op == ssa.OpcodeVallTrue {
   383  			arr = ssaLaneToArrangement(lane)
   384  		}
   385  		rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   386  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   387  		m.lowerVcheckTrue(op, rm, rd, arr)
   388  	case ssa.OpcodeVhighBits:
   389  		x, lane := instr.ArgWithLane()
   390  		rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   391  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   392  		arr := ssaLaneToArrangement(lane)
   393  		m.lowerVhighBits(rm, rd, arr)
   394  	case ssa.OpcodeVIadd:
   395  		x, y, lane := instr.Arg2WithLane()
   396  		arr := ssaLaneToArrangement(lane)
   397  		m.lowerVecRRR(vecOpAdd, x, y, instr.Return(), arr)
   398  	case ssa.OpcodeIaddPairwise:
   399  		x, y, lane := instr.Arg2WithLane()
   400  		arr := ssaLaneToArrangement(lane)
   401  		m.lowerVecRRR(vecOpAddp, x, y, instr.Return(), arr)
   402  	case ssa.OpcodeVSaddSat:
   403  		x, y, lane := instr.Arg2WithLane()
   404  		arr := ssaLaneToArrangement(lane)
   405  		m.lowerVecRRR(vecOpSqadd, x, y, instr.Return(), arr)
   406  	case ssa.OpcodeVUaddSat:
   407  		x, y, lane := instr.Arg2WithLane()
   408  		arr := ssaLaneToArrangement(lane)
   409  		m.lowerVecRRR(vecOpUqadd, x, y, instr.Return(), arr)
   410  	case ssa.OpcodeVIsub:
   411  		x, y, lane := instr.Arg2WithLane()
   412  		arr := ssaLaneToArrangement(lane)
   413  		m.lowerVecRRR(vecOpSub, x, y, instr.Return(), arr)
   414  	case ssa.OpcodeVSsubSat:
   415  		x, y, lane := instr.Arg2WithLane()
   416  		arr := ssaLaneToArrangement(lane)
   417  		m.lowerVecRRR(vecOpSqsub, x, y, instr.Return(), arr)
   418  	case ssa.OpcodeVUsubSat:
   419  		x, y, lane := instr.Arg2WithLane()
   420  		arr := ssaLaneToArrangement(lane)
   421  		m.lowerVecRRR(vecOpUqsub, x, y, instr.Return(), arr)
   422  	case ssa.OpcodeVImin:
   423  		x, y, lane := instr.Arg2WithLane()
   424  		arr := ssaLaneToArrangement(lane)
   425  		m.lowerVecRRR(vecOpSmin, x, y, instr.Return(), arr)
   426  	case ssa.OpcodeVUmin:
   427  		x, y, lane := instr.Arg2WithLane()
   428  		arr := ssaLaneToArrangement(lane)
   429  		m.lowerVecRRR(vecOpUmin, x, y, instr.Return(), arr)
   430  	case ssa.OpcodeVImax:
   431  		x, y, lane := instr.Arg2WithLane()
   432  		arr := ssaLaneToArrangement(lane)
   433  		m.lowerVecRRR(vecOpSmax, x, y, instr.Return(), arr)
   434  	case ssa.OpcodeVUmax:
   435  		x, y, lane := instr.Arg2WithLane()
   436  		arr := ssaLaneToArrangement(lane)
   437  		m.lowerVecRRR(vecOpUmax, x, y, instr.Return(), arr)
   438  	case ssa.OpcodeVAvgRound:
   439  		x, y, lane := instr.Arg2WithLane()
   440  		arr := ssaLaneToArrangement(lane)
   441  		m.lowerVecRRR(vecOpUrhadd, x, y, instr.Return(), arr)
   442  	case ssa.OpcodeVImul:
   443  		x, y, lane := instr.Arg2WithLane()
   444  		arr := ssaLaneToArrangement(lane)
   445  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   446  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   447  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   448  		m.lowerVIMul(rd, rn, rm, arr)
   449  	case ssa.OpcodeVIabs:
   450  		m.lowerVecMisc(vecOpAbs, instr)
   451  	case ssa.OpcodeVIneg:
   452  		m.lowerVecMisc(vecOpNeg, instr)
   453  	case ssa.OpcodeVIpopcnt:
   454  		m.lowerVecMisc(vecOpCnt, instr)
   455  	case ssa.OpcodeVIshl,
   456  		ssa.OpcodeVSshr, ssa.OpcodeVUshr:
   457  		x, y, lane := instr.Arg2WithLane()
   458  		arr := ssaLaneToArrangement(lane)
   459  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   460  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   461  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   462  		m.lowerVShift(op, rd, rn, rm, arr)
   463  	case ssa.OpcodeVSqrt:
   464  		m.lowerVecMisc(vecOpFsqrt, instr)
   465  	case ssa.OpcodeVFabs:
   466  		m.lowerVecMisc(vecOpFabs, instr)
   467  	case ssa.OpcodeVFneg:
   468  		m.lowerVecMisc(vecOpFneg, instr)
   469  	case ssa.OpcodeVFmin:
   470  		x, y, lane := instr.Arg2WithLane()
   471  		arr := ssaLaneToArrangement(lane)
   472  		m.lowerVecRRR(vecOpFmin, x, y, instr.Return(), arr)
   473  	case ssa.OpcodeVFmax:
   474  		x, y, lane := instr.Arg2WithLane()
   475  		arr := ssaLaneToArrangement(lane)
   476  		m.lowerVecRRR(vecOpFmax, x, y, instr.Return(), arr)
   477  	case ssa.OpcodeVFadd:
   478  		x, y, lane := instr.Arg2WithLane()
   479  		arr := ssaLaneToArrangement(lane)
   480  		m.lowerVecRRR(vecOpFadd, x, y, instr.Return(), arr)
   481  	case ssa.OpcodeVFsub:
   482  		x, y, lane := instr.Arg2WithLane()
   483  		arr := ssaLaneToArrangement(lane)
   484  		m.lowerVecRRR(vecOpFsub, x, y, instr.Return(), arr)
   485  	case ssa.OpcodeVFmul:
   486  		x, y, lane := instr.Arg2WithLane()
   487  		arr := ssaLaneToArrangement(lane)
   488  		m.lowerVecRRR(vecOpFmul, x, y, instr.Return(), arr)
   489  	case ssa.OpcodeSqmulRoundSat:
   490  		x, y, lane := instr.Arg2WithLane()
   491  		arr := ssaLaneToArrangement(lane)
   492  		m.lowerVecRRR(vecOpSqrdmulh, x, y, instr.Return(), arr)
   493  	case ssa.OpcodeVFdiv:
   494  		x, y, lane := instr.Arg2WithLane()
   495  		arr := ssaLaneToArrangement(lane)
   496  		m.lowerVecRRR(vecOpFdiv, x, y, instr.Return(), arr)
   497  	case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat:
   498  		x, lane := instr.ArgWithLane()
   499  		arr := ssaLaneToArrangement(lane)
   500  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   501  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   502  		m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat)
   503  	case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint:
   504  		x, lane := instr.ArgWithLane()
   505  		arr := ssaLaneToArrangement(lane)
   506  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   507  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   508  		m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint)
   509  	case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow:
   510  		x, lane := instr.ArgWithLane()
   511  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   512  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   513  
   514  		var arr vecArrangement
   515  		switch lane {
   516  		case ssa.VecLaneI8x16:
   517  			arr = vecArrangement8B
   518  		case ssa.VecLaneI16x8:
   519  			arr = vecArrangement4H
   520  		case ssa.VecLaneI32x4:
   521  			arr = vecArrangement2S
   522  		}
   523  
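         		// sshll/ushll with a zero shift amount acts as a widening move: it widens the
         		// lower-half lanes (e.g. 8B -> 8H) while sign/zero-extending the values.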
   524  		shll := m.allocateInstr()
   525  		if signed := op == ssa.OpcodeSwidenLow; signed {
   526  			shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr)
   527  		} else {
   528  			shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr)
   529  		}
   530  		m.insert(shll)
   531  	case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh:
   532  		x, lane := instr.ArgWithLane()
   533  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   534  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   535  
   536  		arr := ssaLaneToArrangement(lane)
   537  
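         		// Here the full arrangement (e.g. 16B rather than 8B) implies the sshll2/ushll2 form,
         		// which widens the upper-half lanes instead.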
   538  		shll := m.allocateInstr()
   539  		if signed := op == ssa.OpcodeSwidenHigh; signed {
   540  			shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr)
   541  		} else {
   542  			shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr)
   543  		}
   544  		m.insert(shll)
   545  
   546  	case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow:
   547  		x, y, lane := instr.Arg2WithLane()
   548  		var arr, arr2 vecArrangement
   549  		switch lane {
   550  		case ssa.VecLaneI16x8: // I16x8
   551  			arr = vecArrangement8B
   552  			arr2 = vecArrangement16B // Implies sqxtn2.
   553  		case ssa.VecLaneI32x4:
   554  			arr = vecArrangement4H
   555  			arr2 = vecArrangement8H // Implies sqxtn2.
   556  		default:
   557  			panic("unsupported lane " + lane.String())
   558  		}
   559  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   560  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   561  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   562  
   563  		tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   564  
   565  		loQxtn := m.allocateInstr()
   566  		hiQxtn := m.allocateInstr()
   567  		if signed := op == ssa.OpcodeSnarrow; signed {
    568  			// Narrow the lanes of rn and write them into the lower half of tmp.
    569  			loQxtn.asVecMisc(vecOpSqxtn, tmp, rn, arr) // low
    570  			// Narrow the lanes of rm and write them into the upper half of tmp.
    571  			hiQxtn.asVecMisc(vecOpSqxtn, tmp, rm, arr2) // high (sqxtn2)
    572  		} else {
    573  			// Narrow the lanes of rn and write them into the lower half of tmp.
    574  			loQxtn.asVecMisc(vecOpSqxtun, tmp, rn, arr) // low
    575  			// Narrow the lanes of rm and write them into the upper half of tmp.
    576  			hiQxtn.asVecMisc(vecOpSqxtun, tmp, rm, arr2) // high (sqxtun2)
   577  		}
   578  		m.insert(loQxtn)
   579  		m.insert(hiQxtn)
   580  
   581  		mov := m.allocateInstr()
   582  		mov.asFpuMov128(rd.nr(), tmp.nr())
   583  		m.insert(mov)
   584  	case ssa.OpcodeFvpromoteLow:
   585  		x, lane := instr.ArgWithLane()
   586  		if lane != ssa.VecLaneF32x4 {
   587  			panic("unsupported lane type " + lane.String())
   588  		}
   589  		ins := m.allocateInstr()
   590  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   591  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   592  		ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S)
   593  		m.insert(ins)
   594  	case ssa.OpcodeFvdemote:
   595  		x, lane := instr.ArgWithLane()
   596  		if lane != ssa.VecLaneF64x2 {
   597  			panic("unsupported lane type " + lane.String())
   598  		}
   599  		ins := m.allocateInstr()
   600  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   601  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   602  		ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S)
   603  		m.insert(ins)
   604  	case ssa.OpcodeExtractlane:
   605  		x, index, signed, lane := instr.ExtractlaneData()
   606  
   607  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   608  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   609  
   610  		mov := m.allocateInstr()
   611  		switch lane {
   612  		case ssa.VecLaneI8x16:
   613  			mov.asMovFromVec(rd, rn, vecArrangementB, vecIndex(index), signed)
   614  		case ssa.VecLaneI16x8:
   615  			mov.asMovFromVec(rd, rn, vecArrangementH, vecIndex(index), signed)
   616  		case ssa.VecLaneI32x4:
   617  			mov.asMovFromVec(rd, rn, vecArrangementS, vecIndex(index), signed)
   618  		case ssa.VecLaneI64x2:
   619  			mov.asMovFromVec(rd, rn, vecArrangementD, vecIndex(index), signed)
   620  		case ssa.VecLaneF32x4:
   621  			mov.asVecMovElement(rd, rn, vecArrangementS, vecIndex(0), vecIndex(index))
   622  		case ssa.VecLaneF64x2:
   623  			mov.asVecMovElement(rd, rn, vecArrangementD, vecIndex(0), vecIndex(index))
   624  		default:
   625  			panic("unsupported lane: " + lane.String())
   626  		}
   627  
   628  		m.insert(mov)
   629  
   630  	case ssa.OpcodeInsertlane:
   631  		x, y, index, lane := instr.InsertlaneData()
   632  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   633  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   634  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   635  		tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   636  
   637  		// Initially mov rn to tmp.
   638  		mov1 := m.allocateInstr()
   639  		mov1.asFpuMov128(tmpReg.nr(), rn.nr())
   640  		m.insert(mov1)
   641  
    642  		// movToVec and vecMovElement do not clear the remaining bits to zero,
    643  		// thus we can insert rm into tmp in place.
   644  		mov2 := m.allocateInstr()
   645  		switch lane {
   646  		case ssa.VecLaneI8x16:
   647  			mov2.asMovToVec(tmpReg, rm, vecArrangementB, vecIndex(index))
   648  		case ssa.VecLaneI16x8:
   649  			mov2.asMovToVec(tmpReg, rm, vecArrangementH, vecIndex(index))
   650  		case ssa.VecLaneI32x4:
   651  			mov2.asMovToVec(tmpReg, rm, vecArrangementS, vecIndex(index))
   652  		case ssa.VecLaneI64x2:
   653  			mov2.asMovToVec(tmpReg, rm, vecArrangementD, vecIndex(index))
   654  		case ssa.VecLaneF32x4:
   655  			mov2.asVecMovElement(tmpReg, rm, vecArrangementS, vecIndex(index), vecIndex(0))
   656  		case ssa.VecLaneF64x2:
   657  			mov2.asVecMovElement(tmpReg, rm, vecArrangementD, vecIndex(index), vecIndex(0))
   658  		}
   659  		m.insert(mov2)
   660  
   661  		// Finally mov tmp to rd.
   662  		mov3 := m.allocateInstr()
   663  		mov3.asFpuMov128(rd.nr(), tmpReg.nr())
   664  		m.insert(mov3)
   665  
   666  	case ssa.OpcodeSwizzle:
   667  		x, y, lane := instr.Arg2WithLane()
   668  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   669  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   670  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   671  
   672  		arr := ssaLaneToArrangement(lane)
   673  
   674  		// tbl <rd>.<arr>, { <rn>.<arr> }, <rm>.<arr>
   675  		tbl1 := m.allocateInstr()
   676  		tbl1.asVecTbl(1, rd, rn, rm, arr)
   677  		m.insert(tbl1)
   678  
   679  	case ssa.OpcodeShuffle:
   680  		x, y, lane1, lane2 := instr.ShuffleData()
   681  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   682  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   683  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   684  
   685  		m.lowerShuffle(rd, rn, rm, lane1, lane2)
   686  
   687  	case ssa.OpcodeSplat:
   688  		x, lane := instr.ArgWithLane()
   689  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   690  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   691  
   692  		dup := m.allocateInstr()
   693  		switch lane {
   694  		case ssa.VecLaneI8x16:
   695  			dup.asVecDup(rd, rn, vecArrangement16B)
   696  		case ssa.VecLaneI16x8:
   697  			dup.asVecDup(rd, rn, vecArrangement8H)
   698  		case ssa.VecLaneI32x4:
   699  			dup.asVecDup(rd, rn, vecArrangement4S)
   700  		case ssa.VecLaneI64x2:
   701  			dup.asVecDup(rd, rn, vecArrangement2D)
   702  		case ssa.VecLaneF32x4:
   703  			dup.asVecDupElement(rd, rn, vecArrangementS, vecIndex(0))
   704  		case ssa.VecLaneF64x2:
   705  			dup.asVecDupElement(rd, rn, vecArrangementD, vecIndex(0))
   706  		}
   707  		m.insert(dup)
   708  
   709  	case ssa.OpcodeLoadSplat:
   710  		ptr, offset, lane := instr.LoadSplatData()
   711  		m.lowerLoadSplat(ptr, offset, lane, instr.Return())
   712  	default:
   713  		panic("TODO: lowering " + op.String())
   714  	}
   715  	m.executableContext.FlushPendingInstructions()
   716  }
   717  
   718  func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) {
   719  	// `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30.
   720  	vReg, wReg := v29VReg, v30VReg
   721  
   722  	// Initialize v29, v30 to rn, rm.
   723  	movv := m.allocateInstr()
   724  	movv.asFpuMov128(vReg, rn.nr())
   725  	m.insert(movv)
   726  
   727  	movw := m.allocateInstr()
   728  	movw.asFpuMov128(wReg, rm.nr())
   729  	m.insert(movw)
   730  
   731  	// `lane1`, `lane2` are already encoded as two u64s with the right layout:
   732  	//     lane1 := lane[7]<<56 | ... | lane[1]<<8 | lane[0]
   733  	//     lane2 := lane[15]<<56 | ... | lane[9]<<8 | lane[8]
   734  	// Thus, we can use loadFpuConst128.
   735  	tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   736  	lfc := m.allocateInstr()
   737  	lfc.asLoadFpuConst128(tmp.nr(), lane1, lane2)
   738  	m.insert(lfc)
   739  
    740  	// tbl <rd>.16b, { <vReg>.16b, <wReg>.16b }, <tmp>.16b
   741  	tbl2 := m.allocateInstr()
   742  	tbl2.asVecTbl(2, rd, operandNR(vReg), tmp, vecArrangement16B)
   743  	m.insert(tbl2)
   744  }
   745  
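         // lowerVShift lowers the vector shift instructions (VIshl, VSshr, VUshr). The scalar shift
         // amount is masked modulo the lane width, negated for right shifts (sshl/ushl by a negative
         // amount shift right), and duplicated into a vector register. Roughly:
         //
         //	and  rtmp, rm, #(lane_bits - 1)
         //	neg  rtmp, rtmp                     ;; right shifts only
         //	dup  vtmp.<arr>, rtmp
         //	sshl rd.<arr>, rn.<arr>, vtmp.<arr> ;; ushl for VUshr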
   746  func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) {
   747  	var modulo byte
   748  	switch arr {
   749  	case vecArrangement16B:
   750  		modulo = 0x7 // Modulo 8.
   751  	case vecArrangement8H:
   752  		modulo = 0xf // Modulo 16.
   753  	case vecArrangement4S:
   754  		modulo = 0x1f // Modulo 32.
   755  	case vecArrangement2D:
   756  		modulo = 0x3f // Modulo 64.
   757  	default:
    758  		panic("unsupported arrangement " + arr.String())
   759  	}
   760  
   761  	rtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
   762  	vtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   763  
   764  	and := m.allocateInstr()
   765  	and.asALUBitmaskImm(aluOpAnd, rtmp.nr(), rm.nr(), uint64(modulo), true)
   766  	m.insert(and)
   767  
   768  	if op != ssa.OpcodeVIshl {
    769  		// Negate the amount to turn this into a right shift.
   770  		neg := m.allocateInstr()
   771  		neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true)
   772  		m.insert(neg)
   773  	}
   774  
   775  	// Copy the shift amount into a vector register as sshl/ushl requires it to be there.
   776  	dup := m.allocateInstr()
   777  	dup.asVecDup(vtmp, rtmp, arr)
   778  	m.insert(dup)
   779  
   780  	if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr {
   781  		sshl := m.allocateInstr()
   782  		sshl.asVecRRR(vecOpSshl, rd, rn, vtmp, arr)
   783  		m.insert(sshl)
   784  	} else {
   785  		ushl := m.allocateInstr()
   786  		ushl.asVecRRR(vecOpUshl, rd, rn, vtmp, arr)
   787  		m.insert(ushl)
   788  	}
   789  }
   790  
   791  func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) {
   792  	tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   793  
   794  	// Special case VallTrue for i64x2.
   795  	if op == ssa.OpcodeVallTrue && arr == vecArrangement2D {
   796  		// 	cmeq v3?.2d, v2?.2d, #0
   797  		//	addp v3?.2d, v3?.2d, v3?.2d
   798  		//	fcmp v3?, v3?
   799  		//	cset dst, eq
   800  
   801  		ins := m.allocateInstr()
   802  		ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D)
   803  		m.insert(ins)
   804  
   805  		addp := m.allocateInstr()
   806  		addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D)
   807  		m.insert(addp)
   808  
   809  		fcmp := m.allocateInstr()
   810  		fcmp.asFpuCmp(tmp, tmp, true)
   811  		m.insert(fcmp)
   812  
   813  		cset := m.allocateInstr()
   814  		cset.asCSet(rd.nr(), false, eq)
   815  		m.insert(cset)
   816  
   817  		return
   818  	}
   819  
   820  	// Create a scalar value with umaxp or uminv, then compare it against zero.
   821  	ins := m.allocateInstr()
   822  	if op == ssa.OpcodeVanyTrue {
   823  		// 	umaxp v4?.16b, v2?.16b, v2?.16b
   824  		ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B)
   825  	} else {
   826  		// 	uminv d4?, v2?.4s
   827  		ins.asVecLanes(vecOpUminv, tmp, rm, arr)
   828  	}
   829  	m.insert(ins)
   830  
   831  	//	mov x3?, v4?.d[0]
   832  	//	ccmp x3?, #0x0, #0x0, al
   833  	//	cset x3?, ne
   834  	//	mov x0, x3?
   835  
   836  	movv := m.allocateInstr()
   837  	movv.asMovFromVec(rd, tmp, vecArrangementD, vecIndex(0), false)
   838  	m.insert(movv)
   839  
   840  	fc := m.allocateInstr()
   841  	fc.asCCmpImm(rd, uint64(0), al, 0, true)
   842  	m.insert(fc)
   843  
   844  	cset := m.allocateInstr()
   845  	cset.asCSet(rd.nr(), false, ne)
   846  	m.insert(cset)
   847  }
   848  
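         // lowerVhighBits lowers VhighBits (e.g. Wasm's i8x16.bitmask family of instructions):
         // the sign bit of each lane of rm is collected into the low bits of the scalar destination rd.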
   849  func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
   850  	r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
   851  	v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   852  	v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   853  
   854  	switch arr {
   855  	case vecArrangement16B:
   856  		//	sshr v6?.16b, v2?.16b, #7
   857  		//	movz x4?, #0x201, lsl 0
   858  		//	movk x4?, #0x804, lsl 16
   859  		//	movk x4?, #0x2010, lsl 32
   860  		//	movk x4?, #0x8040, lsl 48
   861  		//	dup v5?.2d, x4?
   862  		//	and v6?.16b, v6?.16b, v5?.16b
   863  		//	ext v5?.16b, v6?.16b, v6?.16b, #8
   864  		//	zip1 v5?.16b, v6?.16b, v5?.16b
   865  		//	addv s5?, v5?.8h
   866  		//	umov s3?, v5?.h[0]
   867  
   868  		// Right arithmetic shift on the original vector and store the result into v1. So we have:
   869  		// v1[i] = 0xff if vi<0, 0 otherwise.
   870  		sshr := m.allocateInstr()
   871  		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B)
   872  		m.insert(sshr)
   873  
   874  		// Load the bit mask into r0.
   875  		m.insertMOVZ(r0.nr(), 0x0201, 0, true)
   876  		m.insertMOVK(r0.nr(), 0x0804, 1, true)
   877  		m.insertMOVK(r0.nr(), 0x2010, 2, true)
   878  		m.insertMOVK(r0.nr(), 0x8040, 3, true)
   879  
   880  		// dup r0 to v0.
   881  		dup := m.allocateInstr()
   882  		dup.asVecDup(v0, r0, vecArrangement2D)
   883  		m.insert(dup)
   884  
   885  		// Lane-wise logical AND with the bit mask, meaning that we have
   886  		// v[i] = (1 << i) if vi<0, 0 otherwise.
   887  		//
   888  		// Below, we use the following notation:
   889  		// wi := (1 << i) if vi<0, 0 otherwise.
   890  		and := m.allocateInstr()
   891  		and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B)
   892  		m.insert(and)
   893  
   894  		// Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have
   895  		// v0[i] = w(i+8) if i < 8, w(i-8) otherwise.
   896  		ext := m.allocateInstr()
   897  		ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8))
   898  		m.insert(ext)
   899  
   900  		// v = [w0, w8, ..., w7, w15]
   901  		zip1 := m.allocateInstr()
   902  		zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B)
   903  		m.insert(zip1)
   904  
   905  		// v.h[0] = w0 + ... + w15
   906  		addv := m.allocateInstr()
   907  		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
   908  		m.insert(addv)
   909  
   910  		// Extract the v.h[0] as the result.
   911  		movfv := m.allocateInstr()
   912  		movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false)
   913  		m.insert(movfv)
   914  	case vecArrangement8H:
   915  		//	sshr v6?.8h, v2?.8h, #15
   916  		//	movz x4?, #0x1, lsl 0
   917  		//	movk x4?, #0x2, lsl 16
   918  		//	movk x4?, #0x4, lsl 32
   919  		//	movk x4?, #0x8, lsl 48
   920  		//	dup v5?.2d, x4?
   921  		//	lsl x4?, x4?, 0x4
   922  		//	ins v5?.d[1], x4?
   923  		//	and v5?.16b, v6?.16b, v5?.16b
   924  		//	addv s5?, v5?.8h
   925  		//	umov s3?, v5?.h[0]
   926  
   927  		// Right arithmetic shift on the original vector and store the result into v1. So we have:
   928  		// v[i] = 0xffff if vi<0, 0 otherwise.
   929  		sshr := m.allocateInstr()
   930  		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H)
   931  		m.insert(sshr)
   932  
   933  		// Load the bit mask into r0.
   934  		m.lowerConstantI64(r0.nr(), 0x0008000400020001)
   935  
   936  		// dup r0 to vector v0.
   937  		dup := m.allocateInstr()
   938  		dup.asVecDup(v0, r0, vecArrangement2D)
   939  		m.insert(dup)
   940  
   941  		lsl := m.allocateInstr()
   942  		lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true)
   943  		m.insert(lsl)
   944  
   945  		movv := m.allocateInstr()
   946  		movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
   947  		m.insert(movv)
   948  
    949  		// Lane-wise logical AND with the bitmask, meaning that we have
    950  		// v[i] = (1 << i) if vi<0, 0 otherwise, for i = 0..7
    951  		// (the upper half of the mask was shifted left by 4 above).
   952  		and := m.allocateInstr()
   953  		and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
   954  		m.insert(and)
   955  
   956  		addv := m.allocateInstr()
   957  		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
   958  		m.insert(addv)
   959  
   960  		movfv := m.allocateInstr()
   961  		movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false)
   962  		m.insert(movfv)
   963  	case vecArrangement4S:
    964  		// For the 4S arrangement, the emitted sequence is roughly:
    965  		//
    966  		// 	sshr v6?.4s, v2?.4s, #31
    967  		//	movz x4?, #0x1, lsl 0
    968  		//	movk x4?, #0x2, lsl 32
    969  		//	dup v5?.2d, x4?
    970  		//	lsl x4?, x4?, 0x2
    971  		//	ins v5?.d[1], x4?
    972  		//	and v5?.16b, v6?.16b, v5?.16b
    973  		//	addv s5?, v5?.4s
    974  		//	umov s3?, v5?.s[0]
   975  
   976  		// Right arithmetic shift on the original vector and store the result into v1. So we have:
   977  		// v[i] = 0xffffffff if vi<0, 0 otherwise.
   978  		sshr := m.allocateInstr()
   979  		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S)
   980  		m.insert(sshr)
   981  
   982  		// Load the bit mask into r0.
   983  		m.lowerConstantI64(r0.nr(), 0x0000000200000001)
   984  
   985  		// dup r0 to vector v0.
   986  		dup := m.allocateInstr()
   987  		dup.asVecDup(v0, r0, vecArrangement2D)
   988  		m.insert(dup)
   989  
   990  		lsl := m.allocateInstr()
   991  		lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true)
   992  		m.insert(lsl)
   993  
   994  		movv := m.allocateInstr()
   995  		movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
   996  		m.insert(movv)
   997  
   998  		// Lane-wise logical AND with the bitmask, meaning that we have
    999  		// v[i] = (1 << i) if vi<0, 0 otherwise, for i = 0..3
   1000  		// (the upper half of the mask was shifted left by 2 above).
  1001  		and := m.allocateInstr()
  1002  		and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
  1003  		m.insert(and)
  1004  
  1005  		addv := m.allocateInstr()
  1006  		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S)
  1007  		m.insert(addv)
  1008  
  1009  		movfv := m.allocateInstr()
  1010  		movfv.asMovFromVec(rd, v0, vecArrangementS, vecIndex(0), false)
  1011  		m.insert(movfv)
  1012  	case vecArrangement2D:
  1013  		// 	mov d3?, v2?.d[0]
  1014  		//	mov x4?, v2?.d[1]
  1015  		//	lsr x4?, x4?, 0x3f
  1016  		//	lsr d3?, d3?, 0x3f
  1017  		//	add s3?, s3?, w4?, lsl #1
  1018  
  1019  		// Move the lower 64-bit int into result.
  1020  		movv0 := m.allocateInstr()
  1021  		movv0.asMovFromVec(rd, rm, vecArrangementD, vecIndex(0), false)
  1022  		m.insert(movv0)
  1023  
  1024  		// Move the higher 64-bit int into r0.
  1025  		movv1 := m.allocateInstr()
  1026  		movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false)
  1027  		m.insert(movv1)
  1028  
  1029  		// Move the sign bit into the least significant bit.
  1030  		lsr1 := m.allocateInstr()
  1031  		lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true)
  1032  		m.insert(lsr1)
  1033  
  1034  		lsr2 := m.allocateInstr()
  1035  		lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true)
  1036  		m.insert(lsr2)
  1037  
  1038  		// rd = (r0<<1) | rd
  1039  		lsl := m.allocateInstr()
  1040  		lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false)
  1041  		m.insert(lsl)
  1042  	default:
  1043  		panic("Unsupported " + arr.String())
  1044  	}
  1045  }
  1046  
  1047  func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) {
  1048  	x, lane := instr.ArgWithLane()
  1049  	arr := ssaLaneToArrangement(lane)
  1050  	ins := m.allocateInstr()
  1051  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1052  	rd := operandNR(m.compiler.VRegOf(instr.Return()))
  1053  	ins.asVecMisc(op, rd, rn, arr)
  1054  	m.insert(ins)
  1055  }
  1056  
  1057  func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) {
  1058  	ins := m.allocateInstr()
  1059  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1060  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1061  	rd := operandNR(m.compiler.VRegOf(ret))
  1062  	ins.asVecRRR(op, rd, rn, rm, arr)
  1063  	m.insert(ins)
  1064  }
  1065  
  1066  func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
  1067  	if arr != vecArrangement2D {
  1068  		mul := m.allocateInstr()
  1069  		mul.asVecRRR(vecOpMul, rd, rn, rm, arr)
  1070  		m.insert(mul)
  1071  	} else {
  1072  		tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1073  		tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1074  		tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1075  
  1076  		tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1077  
  1078  		// Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696
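         		// Writing each 64-bit lane as x = x_hi<<32 | x_lo and y = y_hi<<32 | y_lo, we have
         		//	x*y (mod 2^64) = x_lo*y_lo + ((x_lo*y_hi + x_hi*y_lo) << 32).
         		// rev64+mul+addp compute the cross terms, xtn extracts the low 32-bit halves,
         		// and shll+umlal assemble the final result.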
  1079  		rev64 := m.allocateInstr()
  1080  		rev64.asVecMisc(vecOpRev64, tmp2, rm, vecArrangement4S)
  1081  		m.insert(rev64)
  1082  
  1083  		mul := m.allocateInstr()
  1084  		mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S)
  1085  		m.insert(mul)
  1086  
  1087  		xtn1 := m.allocateInstr()
  1088  		xtn1.asVecMisc(vecOpXtn, tmp1, rn, vecArrangement2S)
  1089  		m.insert(xtn1)
  1090  
  1091  		addp := m.allocateInstr()
  1092  		addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S)
  1093  		m.insert(addp)
  1094  
  1095  		xtn2 := m.allocateInstr()
  1096  		xtn2.asVecMisc(vecOpXtn, tmp3, rm, vecArrangement2S)
  1097  		m.insert(xtn2)
  1098  
   1099  		// Note: do not write directly into the result register yet, for the same reason as with BSL above.
   1100  		// In short, UMLAL uses its destination register as a source as well, so the value already
   1101  		// in the destination register is significant.
  1102  		shll := m.allocateInstr()
  1103  		shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S)
  1104  		m.insert(shll)
  1105  
  1106  		umlal := m.allocateInstr()
  1107  		umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S)
  1108  		m.insert(umlal)
  1109  
  1110  		mov := m.allocateInstr()
  1111  		mov.asFpuMov128(rd.nr(), tmpRes.nr())
  1112  		m.insert(mov)
  1113  	}
  1114  }
  1115  
  1116  func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) {
  1117  	x, y, lane := instr.Arg2WithLane()
  1118  	arr := ssaLaneToArrangement(lane)
  1119  
  1120  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1121  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1122  
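         	// Pseudo-max selects the lane from rm (y) when y > x, and pseudo-min selects it when x > y;
         	// hence the fcmgt mask computed below, followed by bsl to pick each lane from rm or rn.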
  1123  	// Note: this usage of tmp is important.
  1124  	// BSL modifies the destination register, so we need to use a temporary register so that
  1125  	// the actual definition of the destination register happens *after* the BSL instruction.
  1126  	// That way, we can force the spill instruction to be inserted after the BSL instruction.
  1127  	tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1128  
  1129  	fcmgt := m.allocateInstr()
  1130  	if max {
  1131  		fcmgt.asVecRRR(vecOpFcmgt, tmp, rm, rn, arr)
  1132  	} else {
  1133  		// If min, swap the args.
  1134  		fcmgt.asVecRRR(vecOpFcmgt, tmp, rn, rm, arr)
  1135  	}
  1136  	m.insert(fcmgt)
  1137  
  1138  	bsl := m.allocateInstr()
  1139  	bsl.asVecRRRRewrite(vecOpBsl, tmp, rm, rn, vecArrangement16B)
  1140  	m.insert(bsl)
  1141  
  1142  	res := operandNR(m.compiler.VRegOf(instr.Return()))
  1143  	mov2 := m.allocateInstr()
  1144  	mov2.asFpuMov128(res.nr(), tmp.nr())
  1145  	m.insert(mov2)
  1146  }
  1147  
  1148  func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
  1149  	div := m.allocateInstr()
  1150  
  1151  	if signed {
  1152  		div.asALU(aluOpSDiv, rd, rn, rm, _64bit)
  1153  	} else {
  1154  		div.asALU(aluOpUDiv, rd, rn, rm, _64bit)
  1155  	}
  1156  	m.insert(div)
  1157  
  1158  	// Check if rm is zero:
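         	// (On arm64, sdiv/udiv by zero does not trap but yields zero, so it is fine to do this check after the division.)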
  1159  	m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero)
  1160  
  1161  	// rd = rn-rd*rm by MSUB instruction.
  1162  	msub := m.allocateInstr()
  1163  	msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit)
  1164  	m.insert(msub)
  1165  }
  1166  
  1167  func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
  1168  	div := m.allocateInstr()
  1169  
  1170  	if signed {
  1171  		div.asALU(aluOpSDiv, rd, rn, rm, _64bit)
  1172  	} else {
  1173  		div.asALU(aluOpUDiv, rd, rn, rm, _64bit)
  1174  	}
  1175  	m.insert(div)
  1176  
  1177  	// Check if rm is zero:
  1178  	m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero)
  1179  
  1180  	if signed {
   1181  		// We need to check for signed overflow, which happens iff the operation is "math.MinInt{32,64} / -1".
  1182  		minusOneCheck := m.allocateInstr()
  1183  		// Sets eq condition if rm == -1.
  1184  		minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit)
  1185  		m.insert(minusOneCheck)
  1186  
  1187  		ccmp := m.allocateInstr()
   1188  		// If the eq condition holds, set the flags from the result of "rn - 1"; otherwise set them to the #0 nzcv value (all clear).
  1189  		ccmp.asCCmpImm(rn, 1, eq, 0, _64bit)
  1190  		m.insert(ccmp)
  1191  
  1192  		// Check the overflow flag.
  1193  		m.exitIfNot(execCtxVReg, vs.invert().asCond(), false, wazevoapi.ExitCodeIntegerOverflow)
  1194  	}
  1195  }
  1196  
   1197  // exitIfNot emits a sequence that exits the execution with the given code if the condition `c` is not met.
  1198  // If `c` (cond type) is a register, `cond64bit` must be chosen to indicate whether the register is 32-bit or 64-bit.
  1199  // Otherwise, `cond64bit` is ignored.
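         //
         // Roughly, the emitted code is:
         //
         //	b.<c> continue
         //	<exit sequence with `code`>
         //	continue: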
  1200  func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, code wazevoapi.ExitCode) {
  1201  	execCtxTmp := m.copyToTmp(execCtxVReg)
  1202  
  1203  	cbr := m.allocateInstr()
  1204  	m.insert(cbr)
  1205  	m.lowerExitWithCode(execCtxTmp, code)
  1206  	// Conditional branch target is after exit.
  1207  	l := m.insertBrTargetLabel()
  1208  	cbr.asCondBr(c, l, cond64bit)
  1209  }
  1210  
  1211  func (m *machine) lowerFcopysign(x, y, ret ssa.Value) {
  1212  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1213  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1214  	var tmpI, tmpF operand
  1215  	_64 := x.Type() == ssa.TypeF64
  1216  	if _64 {
  1217  		tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1218  		tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
  1219  	} else {
  1220  		tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32))
  1221  		tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
  1222  	}
  1223  	rd := m.compiler.VRegOf(ret)
  1224  	m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64)
  1225  }
  1226  
  1227  func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) {
   1228  	// This is essentially the same code emitted by GCC for "__builtin_copysign":
  1229  	//
  1230  	//    mov     x0, -9223372036854775808
  1231  	//    fmov    d2, x0
  1232  	//    vbit    v0.8b, v1.8b, v2.8b
  1233  	//
  1234  
  1235  	setMSB := m.allocateInstr()
  1236  	if _64bit {
  1237  		m.lowerConstantI64(tmpI.nr(), math.MinInt64)
  1238  		setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0))
  1239  	} else {
  1240  		m.lowerConstantI32(tmpI.nr(), math.MinInt32)
  1241  		setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0))
  1242  	}
  1243  	m.insert(setMSB)
  1244  
  1245  	tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1246  
  1247  	mov := m.allocateInstr()
  1248  	mov.asFpuMov64(tmpReg.nr(), rn.nr())
  1249  	m.insert(mov)
  1250  
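         	// BIT inserts, into the copy of rn, the bits of rm selected by the mask in tmpF, i.e. just the sign bit.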
  1251  	vbit := m.allocateInstr()
  1252  	vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B)
  1253  	m.insert(vbit)
  1254  
  1255  	movDst := m.allocateInstr()
  1256  	movDst.asFpuMov64(rd.nr(), tmpReg.nr())
  1257  	m.insert(movDst)
  1258  }
  1259  
  1260  func (m *machine) lowerBitcast(instr *ssa.Instruction) {
  1261  	v, dstType := instr.BitcastData()
  1262  	srcType := v.Type()
  1263  	rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
  1264  	rd := operandNR(m.compiler.VRegOf(instr.Return()))
  1265  	srcInt := srcType.IsInt()
  1266  	dstInt := dstType.IsInt()
  1267  	switch {
  1268  	case srcInt && !dstInt: // Int to Float:
  1269  		mov := m.allocateInstr()
  1270  		var arr vecArrangement
  1271  		if srcType.Bits() == 64 {
  1272  			arr = vecArrangementD
  1273  		} else {
  1274  			arr = vecArrangementS
  1275  		}
  1276  		mov.asMovToVec(rd, rn, arr, vecIndex(0))
  1277  		m.insert(mov)
  1278  	case !srcInt && dstInt: // Float to Int:
  1279  		mov := m.allocateInstr()
  1280  		var arr vecArrangement
  1281  		if dstType.Bits() == 64 {
  1282  			arr = vecArrangementD
  1283  		} else {
  1284  			arr = vecArrangementS
  1285  		}
  1286  		mov.asMovFromVec(rd, rn, arr, vecIndex(0), false)
  1287  		m.insert(mov)
  1288  	default:
  1289  		panic("TODO?BUG?")
  1290  	}
  1291  }
  1292  
  1293  func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) {
  1294  	rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone)
  1295  	rd := operandNR(m.compiler.VRegOf(out))
  1296  
  1297  	neg := m.allocateInstr()
  1298  	neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64)
  1299  	m.insert(neg)
  1300  }
  1301  
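         // lowerFpuToInt lowers Fcvt{ToSint,ToUint} and their saturating (nonTrapping) variants.
         // For the trapping variants, FPSR is cleared before the conversion and inspected afterwards:
         // if the invalid-operation flag was raised, the input is compared with itself to distinguish
         // a NaN (ExitCodeInvalidConversionToInteger) from an out-of-range value (ExitCodeIntegerOverflow).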
  1302  func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) {
  1303  	if !nonTrapping {
  1304  		// First of all, we have to clear the FPU flags.
  1305  		flagClear := m.allocateInstr()
  1306  		flagClear.asMovToFPSR(xzrVReg)
  1307  		m.insert(flagClear)
  1308  	}
  1309  
   1310  	// Then, do the conversion, which does not trap by itself.
  1311  	cvt := m.allocateInstr()
  1312  	cvt.asFpuToInt(rd, rn, signed, src64bit, dst64bit)
  1313  	m.insert(cvt)
  1314  
  1315  	if !nonTrapping {
  1316  		tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
  1317  
  1318  		// After the conversion, check the FPU flags.
  1319  		getFlag := m.allocateInstr()
  1320  		getFlag.asMovFromFPSR(tmpReg)
  1321  		m.insert(getFlag)
  1322  
  1323  		execCtx := m.copyToTmp(ctx)
  1324  		_rn := operandNR(m.copyToTmp(rn.nr()))
  1325  
   1326  		// Check whether the conversion raised the invalid-operation flag by comparing the (previously cleared) status with 1.
   1327  		// See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register
  1328  		alu := m.allocateInstr()
  1329  		alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true)
  1330  		m.insert(alu)
  1331  
   1332  		// If the conversion was not invalid, we can keep the result as-is.
  1333  		ok := m.allocateInstr()
  1334  		m.insert(ok)
  1335  
   1336  		// Otherwise, we have to choose the exit code depending on whether this was an overflow or a NaN conversion.
  1337  
   1338  		// Compare the input with itself to check whether it is a NaN.
  1339  		fpuCmp := m.allocateInstr()
  1340  		fpuCmp.asFpuCmp(_rn, _rn, src64bit)
  1341  		m.insert(fpuCmp)
   1342  		// If the VC condition does not hold (i.e. VS holds), it is a NaN.
  1343  		m.exitIfNot(execCtx, vc.asCond(), false, wazevoapi.ExitCodeInvalidConversionToInteger)
  1344  		// Otherwise, it is an overflow.
  1345  		m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
  1346  
  1347  		// Conditional branch target is after exit.
  1348  		l := m.insertBrTargetLabel()
  1349  		ok.asCondBr(ne.asCond(), l, false /* ignored */)
  1350  	}
  1351  }
  1352  
  1353  func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) {
  1354  	cvt := m.allocateInstr()
  1355  	cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit)
  1356  	m.insert(cvt)
  1357  }
  1358  
  1359  func (m *machine) lowerFpuBinOp(si *ssa.Instruction) {
  1360  	instr := m.allocateInstr()
  1361  	var op fpuBinOp
  1362  	switch si.Opcode() {
  1363  	case ssa.OpcodeFadd:
  1364  		op = fpuBinOpAdd
  1365  	case ssa.OpcodeFsub:
  1366  		op = fpuBinOpSub
  1367  	case ssa.OpcodeFmul:
  1368  		op = fpuBinOpMul
  1369  	case ssa.OpcodeFdiv:
  1370  		op = fpuBinOpDiv
  1371  	case ssa.OpcodeFmax:
  1372  		op = fpuBinOpMax
  1373  	case ssa.OpcodeFmin:
  1374  		op = fpuBinOpMin
  1375  	}
  1376  	x, y := si.Arg2()
  1377  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1378  	rn := m.getOperand_NR(xDef, extModeNone)
  1379  	rm := m.getOperand_NR(yDef, extModeNone)
  1380  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1381  	instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64)
  1382  	m.insert(instr)
  1383  }
  1384  
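        // lowerSubOrAdd lowers ssa.OpcodeIadd (add=true) and ssa.OpcodeIsub (add=false). When the
        // constant operand only fits an imm12 after negation, the opposite ALU op is used so that a
        // single instruction still suffices; e.g. (illustrative) x + (-1) becomes `sub rd, rn, #1`,
        // and x - (-1) becomes `add rd, rn, #1`.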
  1385  func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) {
  1386  	x, y := si.Arg2()
  1387  	if !x.Type().IsInt() {
  1388  		panic(fmt.Sprintf("BUG?: expected integer type for iadd/isub, got %s", x.Type()))
  1389  	}
  1390  
  1391  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1392  	rn := m.getOperand_NR(xDef, extModeNone)
  1393  	rm, yNegated := m.getOperand_MaybeNegatedImm12_ER_SR_NR(yDef, extModeNone)
  1394  
  1395  	var aop aluOp
  1396  	switch {
  1397  	case add && !yNegated: // rn+rm = x+y
  1398  		aop = aluOpAdd
  1399  	case add && yNegated: // rn-rm = x-(-y) = x+y
  1400  		aop = aluOpSub
  1401  	case !add && !yNegated: // rn-rm = x-y
  1402  		aop = aluOpSub
  1403  	case !add && yNegated: // rn+rm = x+(-y) = x-y
  1404  		aop = aluOpAdd
  1405  	}
  1406  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1407  	alu := m.allocateInstr()
  1408  	alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64)
  1409  	m.insert(alu)
  1410  }
  1411  
  1412  // InsertMove implements backend.Machine.
  1413  func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) {
  1414  	instr := m.allocateInstr()
  1415  	switch typ {
  1416  	case ssa.TypeI32, ssa.TypeI64:
  1417  		instr.asMove64(dst, src)
  1418  	case ssa.TypeF32, ssa.TypeF64:
  1419  		instr.asFpuMov64(dst, src)
  1420  	case ssa.TypeV128:
  1421  		instr.asFpuMov128(dst, src)
  1422  	default:
  1423  		panic("TODO")
  1424  	}
  1425  	m.insert(instr)
  1426  }
  1427  
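        // lowerIcmp lowers a scalar integer comparison into a flag-setting subtract followed by a
        // conditional set, roughly (illustrative):
        //
        //	subs xzr, rn, rm
        //	cset rd, <cond>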
  1428  func (m *machine) lowerIcmp(si *ssa.Instruction) {
  1429  	x, y, c := si.IcmpData()
  1430  	flag := condFlagFromSSAIntegerCmpCond(c)
  1431  
  1432  	in64bit := x.Type().Bits() == 64
  1433  	var ext extMode
  1434  	if in64bit {
  1435  		if c.Signed() {
  1436  			ext = extModeSignExtend64
  1437  		} else {
  1438  			ext = extModeZeroExtend64
  1439  		}
  1440  	} else {
  1441  		if c.Signed() {
  1442  			ext = extModeSignExtend32
  1443  		} else {
  1444  			ext = extModeZeroExtend32
  1445  		}
  1446  	}
  1447  
  1448  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
  1449  	rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext)
  1450  	alu := m.allocateInstr()
  1451  	alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit)
  1452  	m.insert(alu)
  1453  
  1454  	cset := m.allocateInstr()
  1455  	cset.asCSet(m.compiler.VRegOf(si.Return()), false, flag)
  1456  	m.insert(cset)
  1457  }
  1458  
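        // lowerVIcmp lowers a vector integer comparison. AArch64 SIMD only provides the eq/ge/gt/hs/hi
        // register-register compares, so lt/le/lo/ls are obtained by swapping the operands, and ne by
        // negating the result of cmeq.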
  1459  func (m *machine) lowerVIcmp(si *ssa.Instruction) {
  1460  	x, y, c, lane := si.VIcmpData()
  1461  	flag := condFlagFromSSAIntegerCmpCond(c)
  1462  	arr := ssaLaneToArrangement(lane)
  1463  
  1464  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1465  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1466  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1467  
  1468  	switch flag {
  1469  	case eq:
  1470  		cmp := m.allocateInstr()
  1471  		cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr)
  1472  		m.insert(cmp)
  1473  	case ne:
  1474  		cmp := m.allocateInstr()
  1475  		cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr)
  1476  		m.insert(cmp)
  1477  		not := m.allocateInstr()
  1478  		not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
  1479  		m.insert(not)
  1480  	case ge:
  1481  		cmp := m.allocateInstr()
  1482  		cmp.asVecRRR(vecOpCmge, rd, rn, rm, arr)
  1483  		m.insert(cmp)
  1484  	case gt:
  1485  		cmp := m.allocateInstr()
  1486  		cmp.asVecRRR(vecOpCmgt, rd, rn, rm, arr)
  1487  		m.insert(cmp)
  1488  	case le:
  1489  		cmp := m.allocateInstr()
  1490  		cmp.asVecRRR(vecOpCmge, rd, rm, rn, arr) // rm, rn are swapped
  1491  		m.insert(cmp)
  1492  	case lt:
  1493  		cmp := m.allocateInstr()
  1494  		cmp.asVecRRR(vecOpCmgt, rd, rm, rn, arr) // rm, rn are swapped
  1495  		m.insert(cmp)
  1496  	case hs:
  1497  		cmp := m.allocateInstr()
  1498  		cmp.asVecRRR(vecOpCmhs, rd, rn, rm, arr)
  1499  		m.insert(cmp)
  1500  	case hi:
  1501  		cmp := m.allocateInstr()
  1502  		cmp.asVecRRR(vecOpCmhi, rd, rn, rm, arr)
  1503  		m.insert(cmp)
  1504  	case ls:
  1505  		cmp := m.allocateInstr()
  1506  		cmp.asVecRRR(vecOpCmhs, rd, rm, rn, arr) // rm, rn are swapped
  1507  		m.insert(cmp)
  1508  	case lo:
  1509  		cmp := m.allocateInstr()
  1510  		cmp.asVecRRR(vecOpCmhi, rd, rm, rn, arr) // rm, rn are swapped
  1511  		m.insert(cmp)
  1512  	}
  1513  }
  1514  
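        // lowerVFcmp lowers a vector floating-point comparison. As with the integer case, only the
        // eq/ge/gt forms exist as register-register instructions (fcmeq/fcmge/fcmgt), so lt/le are
        // obtained by swapping the operands, and ne by negating the result of fcmeq.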
  1515  func (m *machine) lowerVFcmp(si *ssa.Instruction) {
  1516  	x, y, c, lane := si.VFcmpData()
  1517  	flag := condFlagFromSSAFloatCmpCond(c)
  1518  	arr := ssaLaneToArrangement(lane)
  1519  
  1520  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1521  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1522  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1523  
  1524  	switch flag {
  1525  	case eq:
  1526  		cmp := m.allocateInstr()
  1527  		cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr)
  1528  		m.insert(cmp)
  1529  	case ne:
  1530  		cmp := m.allocateInstr()
  1531  		cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr)
  1532  		m.insert(cmp)
  1533  		not := m.allocateInstr()
  1534  		not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
  1535  		m.insert(not)
  1536  	case ge:
  1537  		cmp := m.allocateInstr()
  1538  		cmp.asVecRRR(vecOpFcmge, rd, rn, rm, arr)
  1539  		m.insert(cmp)
  1540  	case gt:
  1541  		cmp := m.allocateInstr()
  1542  		cmp.asVecRRR(vecOpFcmgt, rd, rn, rm, arr)
  1543  		m.insert(cmp)
  1544  	case mi:
  1545  		cmp := m.allocateInstr()
  1546  		cmp.asVecRRR(vecOpFcmgt, rd, rm, rn, arr) // rm, rn are swapped
  1547  		m.insert(cmp)
  1548  	case ls:
  1549  		cmp := m.allocateInstr()
  1550  		cmp.asVecRRR(vecOpFcmge, rd, rm, rn, arr) // rm, rn are swapped
  1551  		m.insert(cmp)
  1552  	}
  1553  }
  1554  
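        // lowerVfpuToInt lowers a vector float-to-integer truncation. For the 2D arrangement
        // (presumably the f64x2 -> i32x4 saturating truncation), the two 64-bit results are
        // additionally narrowed into 32-bit lanes with sqxtn/uqxtn.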
  1555  func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) {
  1556  	cvt := m.allocateInstr()
  1557  	if signed {
  1558  		cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr)
  1559  	} else {
  1560  		cvt.asVecMisc(vecOpFcvtzu, rd, rn, arr)
  1561  	}
  1562  	m.insert(cvt)
  1563  
  1564  	if arr == vecArrangement2D {
  1565  		narrow := m.allocateInstr()
  1566  		if signed {
  1567  			narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S)
  1568  		} else {
  1569  			narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S)
  1570  		}
  1571  		m.insert(narrow)
  1572  	}
  1573  }
  1574  
  1575  func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) {
  1576  	cvt := m.allocateInstr()
  1577  	if signed {
  1578  		cvt.asVecMisc(vecOpScvtf, rd, rn, arr)
  1579  	} else {
  1580  		cvt.asVecMisc(vecOpUcvtf, rd, rn, arr)
  1581  	}
  1582  	m.insert(cvt)
  1583  }
  1584  
  1585  func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) {
  1586  	x, amount := si.Arg2()
  1587  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
  1588  	rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits())
  1589  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1590  
  1591  	alu := m.allocateInstr()
  1592  	alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64)
  1593  	m.insert(alu)
  1594  }
  1595  
  1596  func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp) {
  1597  	x, y := si.Arg2()
  1598  
  1599  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1600  	rn := m.getOperand_NR(xDef, extModeNone)
  1601  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1602  
  1603  	_64 := x.Type().Bits() == 64
  1604  	alu := m.allocateInstr()
  1605  	if instr := yDef.Instr; instr != nil && instr.Constant() {
  1606  		c := instr.ConstantVal()
  1607  		if isBitMaskImmediate(c, _64) {
  1608  			// Bitwise operations with a constant that encodes as a bitmask immediate can be lowered to a single instruction.
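        			// e.g. (band x 0xff) becomes a single `and rd, rn, #0xff` (illustrative).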
  1609  			alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64)
  1610  			m.insert(alu)
  1611  			return
  1612  		}
  1613  	}
  1614  
  1615  	rm := m.getOperand_SR_NR(yDef, extModeNone)
  1616  	alu.asALU(op, rd, rn, rm, _64)
  1617  	m.insert(alu)
  1618  }
  1619  
  1620  func (m *machine) lowerRotl(si *ssa.Instruction) {
  1621  	x, y := si.Arg2()
  1622  	r := si.Return()
  1623  	_64 := r.Type().Bits() == 64
  1624  
  1625  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1626  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1627  	var tmp operand
  1628  	if _64 {
  1629  		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
  1630  	} else {
  1631  		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
  1632  	}
  1633  	rd := operandNR(m.compiler.VRegOf(r))
  1634  
  1635  	// Encode rotl as neg + rotr: neg is a sub against the zero-reg.
  1636  	m.lowerRotlImpl(rd, rn, rm, tmp, _64)
  1637  }
  1638  
  1639  func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) {
  1640  	// Encode rotl as neg + rotr: neg is a sub against the zero-reg.
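        	// This relies on the identity rotl(x, n) == rotr(x, -n), since the rotate amount is taken modulo the bit width.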
  1641  	neg := m.allocateInstr()
  1642  	neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit)
  1643  	m.insert(neg)
  1644  	alu := m.allocateInstr()
  1645  	alu.asALU(aluOpRotR, rd, rn, tmp, is64bit)
  1646  	m.insert(alu)
  1647  }
  1648  
  1649  func (m *machine) lowerRotr(si *ssa.Instruction) {
  1650  	x, y := si.Arg2()
  1651  
  1652  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1653  	rn := m.getOperand_NR(xDef, extModeNone)
  1654  	rm := m.getOperand_NR(yDef, extModeNone)
  1655  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1656  
  1657  	alu := m.allocateInstr()
  1658  	alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64)
  1659  	m.insert(alu)
  1660  }
  1661  
  1662  func (m *machine) lowerExtend(arg, ret ssa.Value, from, to byte, signed bool) {
  1663  	rd := m.compiler.VRegOf(ret)
  1664  	def := m.compiler.ValueDefinition(arg)
  1665  
  1666  	if instr := def.Instr; !signed && from == 32 && instr != nil {
  1667  		// We can optimize out the unsigned extend because:
  1668  		// 	Writes to the W register set bits [63:32] of the X register to zero
  1669  		//  https://developer.arm.com/documentation/den0024/a/An-Introduction-to-the-ARMv8-Instruction-Sets/The-ARMv8-instruction-sets/Distinguishing-between-32-bit-and-64-bit-A64-instructions
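        		// e.g. the result of a 32-bit add already has its upper 32 bits cleared, so a plain
        		// register-to-register copy is enough for the zero-extension (illustrative).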
  1670  		switch instr.Opcode() {
  1671  		case
  1672  			ssa.OpcodeIadd, ssa.OpcodeIsub, ssa.OpcodeLoad,
  1673  			ssa.OpcodeBand, ssa.OpcodeBor, ssa.OpcodeBnot,
  1674  			ssa.OpcodeIshl, ssa.OpcodeUshr, ssa.OpcodeSshr,
  1675  			ssa.OpcodeRotl, ssa.OpcodeRotr,
  1676  			ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32:
  1677  			// So, if the argument is the result of a 32-bit operation, we can just copy the register.
  1678  			// It is highly likely that this copy will be optimized out after register allocation.
  1679  			rn := m.compiler.VRegOf(arg)
  1680  			mov := m.allocateInstr()
  1681  			// Note: do not use move32 here, as it is lowered to a 32-bit move rather than a plain copy (a 32-bit move is in fact how UExtend itself is implemented).
  1682  			mov.asMove64(rd, rn)
  1683  			m.insert(mov)
  1684  			return
  1685  		default:
  1686  		}
  1687  	}
  1688  	rn := m.getOperand_NR(def, extModeNone)
  1689  
  1690  	ext := m.allocateInstr()
  1691  	ext.asExtend(rd, rn.nr(), from, to, signed)
  1692  	m.insert(ext)
  1693  }
  1694  
  1695  func (m *machine) lowerFcmp(x, y, result ssa.Value, c ssa.FloatCmpCond) {
  1696  	rn, rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1697  
  1698  	fc := m.allocateInstr()
  1699  	fc.asFpuCmp(rn, rm, x.Type().Bits() == 64)
  1700  	m.insert(fc)
  1701  
  1702  	cset := m.allocateInstr()
  1703  	cset.asCSet(m.compiler.VRegOf(result), false, condFlagFromSSAFloatCmpCond(c))
  1704  	m.insert(cset)
  1705  }
  1706  
  1707  func (m *machine) lowerImul(x, y, result ssa.Value) {
  1708  	rd := m.compiler.VRegOf(result)
  1709  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1710  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1711  
  1712  	// TODO: if the result of this multiplication feeds an Add/Sub, we could fuse them into a single madd/msub by putting the addend in place of xzrVReg.
  1713  
  1714  	mul := m.allocateInstr()
  1715  	mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64)
  1716  	m.insert(mul)
  1717  }
  1718  
  1719  func (m *machine) lowerClz(x, result ssa.Value) {
  1720  	rd := m.compiler.VRegOf(result)
  1721  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1722  	clz := m.allocateInstr()
  1723  	clz.asBitRR(bitOpClz, rd, rn.nr(), x.Type().Bits() == 64)
  1724  	m.insert(clz)
  1725  }
  1726  
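        // lowerCtz lowers a count-trailing-zeros operation. AArch64 has no dedicated ctz instruction,
        // so it is computed as clz(rbit(x)): reverse the bits, then count the leading zeros.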
  1727  func (m *machine) lowerCtz(x, result ssa.Value) {
  1728  	rd := m.compiler.VRegOf(result)
  1729  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1730  	rbit := m.allocateInstr()
  1731  	_64 := x.Type().Bits() == 64
  1732  	var tmpReg regalloc.VReg
  1733  	if _64 {
  1734  		tmpReg = m.compiler.AllocateVReg(ssa.TypeI64)
  1735  	} else {
  1736  		tmpReg = m.compiler.AllocateVReg(ssa.TypeI32)
  1737  	}
  1738  	rbit.asBitRR(bitOpRbit, tmpReg, rn.nr(), _64)
  1739  	m.insert(rbit)
  1740  
  1741  	clz := m.allocateInstr()
  1742  	clz.asBitRR(bitOpClz, rd, tmpReg, _64)
  1743  	m.insert(clz)
  1744  }
  1745  
  1746  func (m *machine) lowerPopcnt(x, result ssa.Value) {
  1747  	// arm64 doesn't have an instruction for population count on scalar registers,
  1748  	// so we use the vector instruction `cnt`.
  1749  	// This is exactly how the official Go compiler implements bits.OnesCount.
  1750  	// For example, "func() int { return bits.OnesCount(10) }" is compiled as
  1751  	//
  1752  	//    MOVD    $10, R0 ;; Load 10.
  1753  	//    FMOVD   R0, F0
  1754  	//    VCNT    V0.B8, V0.B8
  1755  	//    UADDLV  V0.B8, V0
  1756  	//
  1757  	// In aarch64 asm, FMOVD is encoded as `ins`, VCNT is `cnt`,
  1758  	// and the registers may use different names. In our encoding we use the following
  1759  	// instructions:
  1760  	//
  1761  	//    ins v0.d[0], x0     ;; mov from GPR to vec (FMOV above) is encoded as INS
  1762  	//    cnt v0.16b, v0.16b  ;; we use vec arrangement 16b
  1763  	//    uaddlv h0, v0.8b    ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b
  1764  	//    mov x5, v0.d[0]     ;; finally we mov the result back to a GPR
  1765  	//
  1766  
  1767  	rd := operandNR(m.compiler.VRegOf(result))
  1768  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1769  
  1770  	rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1771  	ins := m.allocateInstr()
  1772  	ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0))
  1773  	m.insert(ins)
  1774  
  1775  	rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1776  	cnt := m.allocateInstr()
  1777  	cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B)
  1778  	m.insert(cnt)
  1779  
  1780  	rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1781  	uaddlv := m.allocateInstr()
  1782  	uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B)
  1783  	m.insert(uaddlv)
  1784  
  1785  	mov := m.allocateInstr()
  1786  	mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0), false)
  1787  	m.insert(mov)
  1788  }
  1789  
  1790  // lowerExitWithCode lowers the exit-with-code sequence; execCtxVReg holds the pointer to the execution context.
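        //
        // The emitted sequence is roughly the following (an illustrative sketch; offsets refer to the
        // execution context and register names are assigned by the register allocator):
        //
        //	movz w_tmp, #code
        //	str  w_tmp, [execCtx, #ExitCodeOffset]
        //	mov  x_tmp2, sp
        //	str  x_tmp2, [execCtx, #StackPointerBeforeGoCall]
        //	adr  x_tmp3, #0
        //	str  x_tmp3, [execCtx, #GoCallReturnAddress]
        //	exit_sequence execCtx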
  1791  func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.ExitCode) {
  1792  	tmpReg1 := m.compiler.AllocateVReg(ssa.TypeI32)
  1793  	loadExitCodeConst := m.allocateInstr()
  1794  	loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true)
  1795  
  1796  	setExitCode := m.allocateInstr()
  1797  	setExitCode.asStore(operandNR(tmpReg1),
  1798  		addressMode{
  1799  			kind: addressModeKindRegUnsignedImm12,
  1800  			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
  1801  		}, 32)
  1802  
  1803  	// In order to unwind the stack, we also need to save the current stack pointer into the execution context:
  1804  	tmp2 := m.compiler.AllocateVReg(ssa.TypeI64)
  1805  	movSpToTmp := m.allocateInstr()
  1806  	movSpToTmp.asMove64(tmp2, spVReg)
  1807  	strSpToExecCtx := m.allocateInstr()
  1808  	strSpToExecCtx.asStore(operandNR(tmp2),
  1809  		addressMode{
  1810  			kind: addressModeKindRegUnsignedImm12,
  1811  			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
  1812  		}, 64)
  1813  	// We also save the address of this exit point:
  1814  	tmp3 := m.compiler.AllocateVReg(ssa.TypeI64)
  1815  	currentAddrToTmp := m.allocateInstr()
  1816  	currentAddrToTmp.asAdr(tmp3, 0)
  1817  	storeCurrentAddrToExecCtx := m.allocateInstr()
  1818  	storeCurrentAddrToExecCtx.asStore(operandNR(tmp3),
  1819  		addressMode{
  1820  			kind: addressModeKindRegUnsignedImm12,
  1821  			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
  1822  		}, 64)
  1823  
  1824  	exitSeq := m.allocateInstr()
  1825  	exitSeq.asExitSequence(execCtxVReg)
  1826  
  1827  	m.insert(loadExitCodeConst)
  1828  	m.insert(setExitCode)
  1829  	m.insert(movSpToTmp)
  1830  	m.insert(strSpToExecCtx)
  1831  	m.insert(currentAddrToTmp)
  1832  	m.insert(storeCurrentAddrToExecCtx)
  1833  	m.insert(exitSeq)
  1834  }
  1835  
  1836  func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) {
  1837  	if x.Type() != y.Type() {
  1838  		panic(
  1839  			fmt.Sprintf("TODO(maybe): support icmp with different types: v%d=%s != v%d=%s",
  1840  				x.ID(), x.Type(), y.ID(), y.Type()))
  1841  	}
  1842  
  1843  	extMod := extModeOf(x.Type(), signed)
  1844  
  1845  	// First operand must be in pure register form.
  1846  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extMod)
  1847  	// Second operand can be in any of Imm12, ER, SR, or NR form supported by the SUBS instructions.
  1848  	rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), extMod)
  1849  
  1850  	alu := m.allocateInstr()
  1851  	// subs zr, rn, rm
  1852  	alu.asALU(
  1853  		aluOpSubS,
  1854  		// We don't need the result, just need to set flags.
  1855  		operandNR(xzrVReg),
  1856  		rn,
  1857  		rm,
  1858  		x.Type().Bits() == 64,
  1859  	)
  1860  	m.insert(alu)
  1861  }
  1862  
  1863  func (m *machine) lowerFcmpToFlag(x, y ssa.Value) {
  1864  	if x.Type() != y.Type() {
  1865  		panic("TODO(maybe): support fcmp with different types")
  1866  	}
  1867  
  1868  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1869  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1870  	cmp := m.allocateInstr()
  1871  	cmp.asFpuCmp(rn, rm, x.Type().Bits() == 64)
  1872  	m.insert(cmp)
  1873  }
  1874  
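        // lowerExitIfTrueWithCode exits the execution with the given code when the condition holds.
        // The exit sequence is skipped via a conditional branch on the inverted condition, roughly
        // (illustrative):
        //
        //	subs xzr, rn, rm          ;; flags from the icmp operands
        //	b.<inverted cond> after
        //	(exit with the given code)
        //
        // after: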
  1875  func (m *machine) lowerExitIfTrueWithCode(execCtxVReg regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) {
  1876  	condDef := m.compiler.ValueDefinition(cond)
  1877  	if !m.compiler.MatchInstr(condDef, ssa.OpcodeIcmp) {
  1878  		panic("TODO: OpcodeExitIfTrueWithCode must come after Icmp at the moment: " + condDef.Instr.Opcode().String())
  1879  	}
  1880  	condDef.Instr.MarkLowered()
  1881  
  1882  	cvalInstr := condDef.Instr
  1883  	x, y, c := cvalInstr.IcmpData()
  1884  	signed := c.Signed()
  1885  	m.lowerIcmpToFlag(x, y, signed)
  1886  
  1887  	execCtxTmp := m.copyToTmp(execCtxVReg)
  1888  
  1889  	// We have to skip the entire exit sequence if the condition is false.
  1890  	cbr := m.allocateInstr()
  1891  	m.insert(cbr)
  1892  	m.lowerExitWithCode(execCtxTmp, code)
  1893  	// conditional branch target is after exit.
  1894  	l := m.insertBrTargetLabel()
  1895  	cbr.asCondBr(condFlagFromSSAIntegerCmpCond(c).invert().asCond(), l, false /* ignored */)
  1896  }
  1897  
  1898  func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
  1899  	cvalDef := m.compiler.ValueDefinition(c)
  1900  
  1901  	var cc condFlag
  1902  	switch {
  1903  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // In this case, we can use the ALU flags set by the SUBS instruction.
  1904  		cvalInstr := cvalDef.Instr
  1905  		x, y, c := cvalInstr.IcmpData()
  1906  		cc = condFlagFromSSAIntegerCmpCond(c)
  1907  		m.lowerIcmpToFlag(x, y, c.Signed())
  1908  		cvalDef.Instr.MarkLowered()
  1909  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // In this case, we can use the FPU flags set by the FCMP instruction directly.
  1910  		cvalInstr := cvalDef.Instr
  1911  		x, y, c := cvalInstr.FcmpData()
  1912  		cc = condFlagFromSSAFloatCmpCond(c)
  1913  		m.lowerFcmpToFlag(x, y)
  1914  		cvalDef.Instr.MarkLowered()
  1915  	default:
  1916  		rn := m.getOperand_NR(cvalDef, extModeNone)
  1917  		if c.Type() != ssa.TypeI32 && c.Type() != ssa.TypeI64 {
  1918  			panic("TODO?BUG?: support select with non-integer condition")
  1919  		}
  1920  		alu := m.allocateInstr()
  1921  		// subs zr, rn, zr
  1922  		alu.asALU(
  1923  			aluOpSubS,
  1924  			// We don't need the result, just need to set flags.
  1925  			operandNR(xzrVReg),
  1926  			rn,
  1927  			operandNR(xzrVReg),
  1928  			c.Type().Bits() == 64,
  1929  		)
  1930  		m.insert(alu)
  1931  		cc = ne
  1932  	}
  1933  
  1934  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1935  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1936  
  1937  	rd := operandNR(m.compiler.VRegOf(result))
  1938  	switch x.Type() {
  1939  	case ssa.TypeI32, ssa.TypeI64:
  1940  		// csel rd, rn, rm, cc
  1941  		csel := m.allocateInstr()
  1942  		csel.asCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
  1943  		m.insert(csel)
  1944  	case ssa.TypeF32, ssa.TypeF64:
  1945  		// fcsel rd, rn, rm, cc
  1946  		fcsel := m.allocateInstr()
  1947  		fcsel.asFpuCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
  1948  		m.insert(fcsel)
  1949  	default:
  1950  		panic("BUG")
  1951  	}
  1952  }
  1953  
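        // lowerSelectVec lowers a vector select on a scalar condition `rc`, roughly (an illustrative
        // sketch; register names are assigned by the register allocator):
        //
        //	subs  xzr, rc, xzr         ;; set flags from the condition
        //	csetm tmp, ne              ;; all ones if rc != 0, all zeros otherwise
        //	dup   v_tmp.2d, tmp
        //	bsl   v_tmp.16b, rn.16b, rm.16b
        //	mov   rd, v_tmp            ;; 128-bit move to the destination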
  1954  func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
  1955  	// First check if `rc` is zero or not.
  1956  	checkZero := m.allocateInstr()
  1957  	checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false)
  1958  	m.insert(checkZero)
  1959  
  1960  	// Then use CSETM to set all bits of `allOnesOrZero` to one if `rc` is non-zero, and to zero otherwise.
  1961  	allOnesOrZero := m.compiler.AllocateVReg(ssa.TypeI64)
  1962  	cset := m.allocateInstr()
  1963  	cset.asCSet(allOnesOrZero, true, ne)
  1964  	m.insert(cset)
  1965  
  1966  	// Then move the bits to the result vector register.
  1967  	tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1968  	dup := m.allocateInstr()
  1969  	dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D)
  1970  	m.insert(dup)
  1971  
  1972  	// Now that `tmp2` has either all bits one or zero depending on `rc`,
  1973  	// we can use bsl to select between `rn` and `rm`.
  1974  	ins := m.allocateInstr()
  1975  	ins.asVecRRRRewrite(vecOpBsl, tmp2, rn, rm, vecArrangement16B)
  1976  	m.insert(ins)
  1977  
  1978  	// Finally, move the result to the destination register.
  1979  	mov2 := m.allocateInstr()
  1980  	mov2.asFpuMov128(rd.nr(), tmp2.nr())
  1981  	m.insert(mov2)
  1982  }
  1983  
  1984  // copyToTmp copies the given regalloc.VReg to a temporary register. This is called before emitting a conditional branch
  1985  // so that the register allocator does not insert spill reloads in the middle of an exit sequence, which is off the normal execution path.
  1986  func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg {
  1987  	typ := m.compiler.TypeOf(v)
  1988  	mov := m.allocateInstr()
  1989  	tmp := m.compiler.AllocateVReg(typ)
  1990  	if typ.IsInt() {
  1991  		mov.asMove64(tmp, v)
  1992  	} else {
  1993  		mov.asFpuMov128(tmp, v)
  1994  	}
  1995  	m.insert(mov)
  1996  	return tmp
  1997  }