github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/arm64/lower_instr.go

     1  package arm64
     2  
     3  // Files prefixed with lower_instr** do the instruction selection, i.e. lowering SSA-level instructions
     4  // into machine-specific instructions.
     5  //
     6  // Importantly, what the lower** functions do includes tree-matching: finding a pattern in the given instruction tree
     7  // and merging multiple instructions where possible. It can be considered "N:1" instruction selection.
     8  
     9  import (
    10  	"fmt"
    11  	"math"
    12  
    13  	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
    14  	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
    15  	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
    16  )
    17  
    18  // LowerSingleBranch implements backend.Machine.
    19  func (m *machine) LowerSingleBranch(br *ssa.Instruction) {
    20  	ectx := m.executableContext
    21  	switch br.Opcode() {
    22  	case ssa.OpcodeJump:
    23  		_, _, targetBlk := br.BranchData()
    24  		if br.IsFallthroughJump() {
    25  			return
    26  		}
    27  		b := m.allocateInstr()
    28  		target := ectx.GetOrAllocateSSABlockLabel(targetBlk)
    29  		if target == labelReturn {
    30  			b.asRet()
    31  		} else {
    32  			b.asBr(target)
    33  		}
    34  		m.insert(b)
    35  	case ssa.OpcodeBrTable:
    36  		m.lowerBrTable(br)
    37  	default:
    38  		panic("BUG: unexpected branch opcode: " + br.Opcode().String())
    39  	}
    40  }
    41  
    42  func (m *machine) lowerBrTable(i *ssa.Instruction) {
    43  	index, targets := i.BrTableData()
    44  	indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone)
    45  
    46  	// First, we have to bounds-check the index, and clamp it to the default target
    47  	// (sitting at the end of the list) if it's out of bounds.
    48  
    49  	// mov  maxIndexReg #maximum_index
    50  	// subs wzr, index, maxIndexReg
    51  	// csel adjustedIndex, maxIndexReg, index, hs ;; if index is higher than or equal to maxIndexReg.
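        	// For example, with 4 targets (the default sitting at index 3), an out-of-bounds
        	// index such as 7 is clamped to 3 by the csel, so the dispatch below never reads
        	// past the end of the jump table.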
    52  	maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32)
    53  	m.lowerConstantI32(maxIndexReg, int32(len(targets)-1))
    54  	subs := m.allocateInstr()
    55  	subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false)
    56  	m.insert(subs)
    57  	csel := m.allocateInstr()
    58  	adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32)
    59  	csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false)
    60  	m.insert(csel)
    61  
    62  	brSequence := m.allocateInstr()
    63  
    64  	tableIndex := m.addJmpTableTarget(targets)
    65  	brSequence.asBrTableSequence(adjustedIndex, tableIndex, len(targets))
    66  	m.insert(brSequence)
    67  }
    68  
    69  // LowerConditionalBranch implements backend.Machine.
    70  func (m *machine) LowerConditionalBranch(b *ssa.Instruction) {
    71  	exctx := m.executableContext
    72  	cval, args, targetBlk := b.BranchData()
    73  	if len(args) > 0 {
    74  		panic(fmt.Sprintf(
    75  			"conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s",
    76  			exctx.CurrentSSABlk,
    77  			targetBlk,
    78  		))
    79  	}
    80  
    81  	target := exctx.GetOrAllocateSSABlockLabel(targetBlk)
    82  	cvalDef := m.compiler.ValueDefinition(cval)
    83  
    84  	switch {
    85  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // In this case, we can use the ALU flags set by the SUBS instruction.
    86  		cvalInstr := cvalDef.Instr
    87  		x, y, c := cvalInstr.IcmpData()
    88  		cc, signed := condFlagFromSSAIntegerCmpCond(c), c.Signed()
    89  		if b.Opcode() == ssa.OpcodeBrz {
    90  			cc = cc.invert()
    91  		}
    92  
    93  		if !m.tryLowerBandToFlag(x, y) {
    94  			m.lowerIcmpToFlag(x, y, signed)
    95  		}
    96  		cbr := m.allocateInstr()
    97  		cbr.asCondBr(cc.asCond(), target, false /* ignored */)
    98  		m.insert(cbr)
    99  		cvalDef.Instr.MarkLowered()
   100  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // In this case, we can use the FPU flags directly.
   101  		cvalInstr := cvalDef.Instr
   102  		x, y, c := cvalInstr.FcmpData()
   103  		cc := condFlagFromSSAFloatCmpCond(c)
   104  		if b.Opcode() == ssa.OpcodeBrz {
   105  			cc = cc.invert()
   106  		}
   107  		m.lowerFcmpToFlag(x, y)
   108  		cbr := m.allocateInstr()
   109  		cbr.asCondBr(cc.asCond(), target, false /* ignored */)
   110  		m.insert(cbr)
   111  		cvalDef.Instr.MarkLowered()
   112  	default:
   113  		rn := m.getOperand_NR(cvalDef, extModeNone)
   114  		var c cond
   115  		if b.Opcode() == ssa.OpcodeBrz {
   116  			c = registerAsRegZeroCond(rn.nr())
   117  		} else {
   118  			c = registerAsRegNotZeroCond(rn.nr())
   119  		}
   120  		cbr := m.allocateInstr()
   121  		cbr.asCondBr(c, target, false)
   122  		m.insert(cbr)
   123  	}
   124  }
   125  
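        // tryLowerBandToFlag pattern-matches an integer comparison against the constant zero whose other
        // operand is a `band`, and lowers that band as a flag-setting AND (roughly, an ands/tst) so that
        // the conditional branch above can consume the resulting flags without a separate compare.
        // It reports whether the pattern matched and the band was lowered.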
   126  func (m *machine) tryLowerBandToFlag(x, y ssa.Value) (ok bool) {
   127  	xx := m.compiler.ValueDefinition(x)
   128  	yy := m.compiler.ValueDefinition(y)
   129  	if xx.IsFromInstr() && xx.Instr.Constant() && xx.Instr.ConstantVal() == 0 {
   130  		if m.compiler.MatchInstr(yy, ssa.OpcodeBand) {
   131  			bandInstr := yy.Instr
   132  			m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true)
   133  			ok = true
   134  			bandInstr.MarkLowered()
   135  			return
   136  		}
   137  	}
   138  
   139  	if yy.IsFromInstr() && yy.Instr.Constant() && yy.Instr.ConstantVal() == 0 {
   140  		if m.compiler.MatchInstr(xx, ssa.OpcodeBand) {
   141  			bandInstr := xx.Instr
   142  			m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true)
   143  			ok = true
   144  			bandInstr.MarkLowered()
   145  			return
   146  		}
   147  	}
   148  	return
   149  }
   150  
   151  // LowerInstr implements backend.Machine.
   152  func (m *machine) LowerInstr(instr *ssa.Instruction) {
   153  	if l := instr.SourceOffset(); l.Valid() {
   154  		info := m.allocateInstr().asEmitSourceOffsetInfo(l)
   155  		m.insert(info)
   156  	}
   157  
   158  	switch op := instr.Opcode(); op {
   159  	case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable:
   160  		panic("BUG: branching instructions are handled by LowerBranches")
   161  	case ssa.OpcodeReturn:
   162  		panic("BUG: return must be handled by backend.Compiler")
   163  	case ssa.OpcodeIadd, ssa.OpcodeIsub:
   164  		m.lowerSubOrAdd(instr, op == ssa.OpcodeIadd)
   165  	case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv, ssa.OpcodeFmax, ssa.OpcodeFmin:
   166  		m.lowerFpuBinOp(instr)
   167  	case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined.
   168  	case ssa.OpcodeExitWithCode:
   169  		execCtx, code := instr.ExitWithCodeData()
   170  		m.lowerExitWithCode(m.compiler.VRegOf(execCtx), code)
   171  	case ssa.OpcodeExitIfTrueWithCode:
   172  		execCtx, c, code := instr.ExitIfTrueWithCodeData()
   173  		m.lowerExitIfTrueWithCode(m.compiler.VRegOf(execCtx), c, code)
   174  	case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32:
   175  		m.lowerStore(instr)
   176  	case ssa.OpcodeLoad:
   177  		dst := instr.Return()
   178  		ptr, offset, typ := instr.LoadData()
   179  		m.lowerLoad(ptr, offset, typ, dst)
   180  	case ssa.OpcodeVZeroExtLoad:
   181  		dst := instr.Return()
   182  		ptr, offset, typ := instr.VZeroExtLoadData()
   183  		m.lowerLoad(ptr, offset, typ, dst)
   184  	case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32:
   185  		ptr, offset, _ := instr.LoadData()
   186  		ret := m.compiler.VRegOf(instr.Return())
   187  		m.lowerExtLoad(op, ptr, offset, ret)
   188  	case ssa.OpcodeCall, ssa.OpcodeCallIndirect:
   189  		m.lowerCall(instr)
   190  	case ssa.OpcodeIcmp:
   191  		m.lowerIcmp(instr)
   192  	case ssa.OpcodeVIcmp:
   193  		m.lowerVIcmp(instr)
   194  	case ssa.OpcodeVFcmp:
   195  		m.lowerVFcmp(instr)
   196  	case ssa.OpcodeVCeil:
   197  		m.lowerVecMisc(vecOpFrintp, instr)
   198  	case ssa.OpcodeVFloor:
   199  		m.lowerVecMisc(vecOpFrintm, instr)
   200  	case ssa.OpcodeVTrunc:
   201  		m.lowerVecMisc(vecOpFrintz, instr)
   202  	case ssa.OpcodeVNearest:
   203  		m.lowerVecMisc(vecOpFrintn, instr)
   204  	case ssa.OpcodeVMaxPseudo:
   205  		m.lowerVMinMaxPseudo(instr, true)
   206  	case ssa.OpcodeVMinPseudo:
   207  		m.lowerVMinMaxPseudo(instr, false)
   208  	case ssa.OpcodeBand:
   209  		m.lowerBitwiseAluOp(instr, aluOpAnd, false)
   210  	case ssa.OpcodeBor:
   211  		m.lowerBitwiseAluOp(instr, aluOpOrr, false)
   212  	case ssa.OpcodeBxor:
   213  		m.lowerBitwiseAluOp(instr, aluOpEor, false)
   214  	case ssa.OpcodeIshl:
   215  		m.lowerShifts(instr, extModeNone, aluOpLsl)
   216  	case ssa.OpcodeSshr:
   217  		if instr.Return().Type().Bits() == 64 {
   218  			m.lowerShifts(instr, extModeSignExtend64, aluOpAsr)
   219  		} else {
   220  			m.lowerShifts(instr, extModeSignExtend32, aluOpAsr)
   221  		}
   222  	case ssa.OpcodeUshr:
   223  		if instr.Return().Type().Bits() == 64 {
   224  			m.lowerShifts(instr, extModeZeroExtend64, aluOpLsr)
   225  		} else {
   226  			m.lowerShifts(instr, extModeZeroExtend32, aluOpLsr)
   227  		}
   228  	case ssa.OpcodeRotl:
   229  		m.lowerRotl(instr)
   230  	case ssa.OpcodeRotr:
   231  		m.lowerRotr(instr)
   232  	case ssa.OpcodeSExtend, ssa.OpcodeUExtend:
   233  		from, to, signed := instr.ExtendData()
   234  		m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed)
   235  	case ssa.OpcodeFcmp:
   236  		x, y, c := instr.FcmpData()
   237  		m.lowerFcmp(x, y, instr.Return(), c)
   238  	case ssa.OpcodeImul:
   239  		x, y := instr.Arg2()
   240  		result := instr.Return()
   241  		m.lowerImul(x, y, result)
   242  	case ssa.OpcodeUndefined:
   243  		undef := m.allocateInstr()
   244  		undef.asUDF()
   245  		m.insert(undef)
   246  	case ssa.OpcodeSelect:
   247  		c, x, y := instr.SelectData()
   248  		if x.Type() == ssa.TypeV128 {
   249  			rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
   250  			rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   251  			rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   252  			rd := operandNR(m.compiler.VRegOf(instr.Return()))
   253  			m.lowerSelectVec(rc, rn, rm, rd)
   254  		} else {
   255  			m.lowerSelect(c, x, y, instr.Return())
   256  		}
   257  	case ssa.OpcodeClz:
   258  		x := instr.Arg()
   259  		result := instr.Return()
   260  		m.lowerClz(x, result)
   261  	case ssa.OpcodeCtz:
   262  		x := instr.Arg()
   263  		result := instr.Return()
   264  		m.lowerCtz(x, result)
   265  	case ssa.OpcodePopcnt:
   266  		x := instr.Arg()
   267  		result := instr.Return()
   268  		m.lowerPopcnt(x, result)
   269  	case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat:
   270  		x, ctx := instr.Arg2()
   271  		result := instr.Return()
   272  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   273  		rd := operandNR(m.compiler.VRegOf(result))
   274  		ctxVReg := m.compiler.VRegOf(ctx)
   275  		m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64,
   276  			result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat)
   277  	case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat:
   278  		x, ctx := instr.Arg2()
   279  		result := instr.Return()
   280  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   281  		rd := operandNR(m.compiler.VRegOf(result))
   282  		ctxVReg := m.compiler.VRegOf(ctx)
   283  		m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64,
   284  			result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat)
   285  	case ssa.OpcodeFcvtFromSint:
   286  		x := instr.Arg()
   287  		result := instr.Return()
   288  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   289  		rd := operandNR(m.compiler.VRegOf(result))
   290  		m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
   291  	case ssa.OpcodeFcvtFromUint:
   292  		x := instr.Arg()
   293  		result := instr.Return()
   294  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   295  		rd := operandNR(m.compiler.VRegOf(result))
   296  		m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
   297  	case ssa.OpcodeFdemote:
   298  		v := instr.Arg()
   299  		rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
   300  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   301  		cnt := m.allocateInstr()
   302  		cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false)
   303  		m.insert(cnt)
   304  	case ssa.OpcodeFpromote:
   305  		v := instr.Arg()
   306  		rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
   307  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   308  		cnt := m.allocateInstr()
   309  		cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true)
   310  		m.insert(cnt)
   311  	case ssa.OpcodeIreduce:
   312  		rn := m.getOperand_NR(m.compiler.ValueDefinition(instr.Arg()), extModeNone)
   313  		retVal := instr.Return()
   314  		rd := m.compiler.VRegOf(retVal)
   315  
   316  		if retVal.Type() != ssa.TypeI32 {
   317  			panic("TODO?: Ireduce to non-i32")
   318  		}
   319  		mov := m.allocateInstr()
   320  		mov.asMove32(rd, rn.reg())
   321  		m.insert(mov)
   322  	case ssa.OpcodeFneg:
   323  		m.lowerFpuUniOp(fpuUniOpNeg, instr.Arg(), instr.Return())
   324  	case ssa.OpcodeSqrt:
   325  		m.lowerFpuUniOp(fpuUniOpSqrt, instr.Arg(), instr.Return())
   326  	case ssa.OpcodeCeil:
   327  		m.lowerFpuUniOp(fpuUniOpRoundPlus, instr.Arg(), instr.Return())
   328  	case ssa.OpcodeFloor:
   329  		m.lowerFpuUniOp(fpuUniOpRoundMinus, instr.Arg(), instr.Return())
   330  	case ssa.OpcodeTrunc:
   331  		m.lowerFpuUniOp(fpuUniOpRoundZero, instr.Arg(), instr.Return())
   332  	case ssa.OpcodeNearest:
   333  		m.lowerFpuUniOp(fpuUniOpRoundNearest, instr.Arg(), instr.Return())
   334  	case ssa.OpcodeFabs:
   335  		m.lowerFpuUniOp(fpuUniOpAbs, instr.Arg(), instr.Return())
   336  	case ssa.OpcodeBitcast:
   337  		m.lowerBitcast(instr)
   338  	case ssa.OpcodeFcopysign:
   339  		x, y := instr.Arg2()
   340  		m.lowerFcopysign(x, y, instr.Return())
   341  	case ssa.OpcodeSdiv, ssa.OpcodeUdiv:
   342  		x, y, ctx := instr.Arg3()
   343  		ctxVReg := m.compiler.VRegOf(ctx)
   344  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   345  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   346  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   347  		m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv)
   348  	case ssa.OpcodeSrem, ssa.OpcodeUrem:
   349  		x, y, ctx := instr.Arg3()
   350  		ctxVReg := m.compiler.VRegOf(ctx)
   351  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   352  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   353  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   354  		m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem)
   355  	case ssa.OpcodeVconst:
   356  		result := m.compiler.VRegOf(instr.Return())
   357  		lo, hi := instr.VconstData()
   358  		v := m.allocateInstr()
   359  		v.asLoadFpuConst128(result, lo, hi)
   360  		m.insert(v)
   361  	case ssa.OpcodeVbnot:
   362  		x := instr.Arg()
   363  		ins := m.allocateInstr()
   364  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   365  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   366  		ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B)
   367  		m.insert(ins)
   368  	case ssa.OpcodeVbxor:
   369  		x, y := instr.Arg2()
   370  		m.lowerVecRRR(vecOpEOR, x, y, instr.Return(), vecArrangement16B)
   371  	case ssa.OpcodeVbor:
   372  		x, y := instr.Arg2()
   373  		m.lowerVecRRR(vecOpOrr, x, y, instr.Return(), vecArrangement16B)
   374  	case ssa.OpcodeVband:
   375  		x, y := instr.Arg2()
   376  		m.lowerVecRRR(vecOpAnd, x, y, instr.Return(), vecArrangement16B)
   377  	case ssa.OpcodeVbandnot:
   378  		x, y := instr.Arg2()
   379  		m.lowerVecRRR(vecOpBic, x, y, instr.Return(), vecArrangement16B)
   380  	case ssa.OpcodeVbitselect:
   381  		c, x, y := instr.SelectData()
   382  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   383  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   384  		creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
   385  		tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   386  
   387  		// creg is overwritten by BSL, so we first move it into a temporary register
   388  		// in case it is used somewhere else.
   389  		mov := m.allocateInstr()
   390  		mov.asFpuMov128(tmp.nr(), creg.nr())
   391  		m.insert(mov)
   392  
   393  		ins := m.allocateInstr()
   394  		ins.asVecRRRRewrite(vecOpBsl, tmp, rn, rm, vecArrangement16B)
   395  		m.insert(ins)
   396  
   397  		mov2 := m.allocateInstr()
   398  		rd := m.compiler.VRegOf(instr.Return())
   399  		mov2.asFpuMov128(rd, tmp.nr())
   400  		m.insert(mov2)
   401  	case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue:
   402  		x, lane := instr.ArgWithLane()
   403  		var arr vecArrangement
   404  		if op == ssa.OpcodeVallTrue {
   405  			arr = ssaLaneToArrangement(lane)
   406  		}
   407  		rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   408  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   409  		m.lowerVcheckTrue(op, rm, rd, arr)
   410  	case ssa.OpcodeVhighBits:
   411  		x, lane := instr.ArgWithLane()
   412  		rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   413  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   414  		arr := ssaLaneToArrangement(lane)
   415  		m.lowerVhighBits(rm, rd, arr)
   416  	case ssa.OpcodeVIadd:
   417  		x, y, lane := instr.Arg2WithLane()
   418  		arr := ssaLaneToArrangement(lane)
   419  		m.lowerVecRRR(vecOpAdd, x, y, instr.Return(), arr)
   420  	case ssa.OpcodeExtIaddPairwise:
   421  		v, lane, signed := instr.ExtIaddPairwiseData()
   422  		vv := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
   423  
   424  		tmpLo, tmpHi := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   425  		var widen vecOp
   426  		if signed {
   427  			widen = vecOpSshll
   428  		} else {
   429  			widen = vecOpUshll
   430  		}
   431  
   432  		var loArr, hiArr, dstArr vecArrangement
   433  		switch lane {
   434  		case ssa.VecLaneI8x16:
   435  			loArr, hiArr, dstArr = vecArrangement8B, vecArrangement16B, vecArrangement8H
   436  		case ssa.VecLaneI16x8:
   437  			loArr, hiArr, dstArr = vecArrangement4H, vecArrangement8H, vecArrangement4S
   438  		case ssa.VecLaneI32x4:
   439  			loArr, hiArr, dstArr = vecArrangement2S, vecArrangement4S, vecArrangement2D
   440  		default:
   441  			panic("unsupported lane " + lane.String())
   442  		}
   443  
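        		// For example, for an I16x8 input [a0, ..., a7], the two widened halves are
        		// [a0..a3] and [a4..a7] (now 32-bit each), and the final ADDP yields
        		// [a0+a1, a2+a3, a4+a5, a6+a7].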
   444  		widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo, vv, operandShiftImm(0), loArr)
   445  		widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi, vv, operandShiftImm(0), hiArr)
   446  		addp := m.allocateInstr().asVecRRR(vecOpAddp, operandNR(m.compiler.VRegOf(instr.Return())), tmpLo, tmpHi, dstArr)
   447  		m.insert(widenLo)
   448  		m.insert(widenHi)
   449  		m.insert(addp)
   450  
   451  	case ssa.OpcodeVSaddSat:
   452  		x, y, lane := instr.Arg2WithLane()
   453  		arr := ssaLaneToArrangement(lane)
   454  		m.lowerVecRRR(vecOpSqadd, x, y, instr.Return(), arr)
   455  	case ssa.OpcodeVUaddSat:
   456  		x, y, lane := instr.Arg2WithLane()
   457  		arr := ssaLaneToArrangement(lane)
   458  		m.lowerVecRRR(vecOpUqadd, x, y, instr.Return(), arr)
   459  	case ssa.OpcodeVIsub:
   460  		x, y, lane := instr.Arg2WithLane()
   461  		arr := ssaLaneToArrangement(lane)
   462  		m.lowerVecRRR(vecOpSub, x, y, instr.Return(), arr)
   463  	case ssa.OpcodeVSsubSat:
   464  		x, y, lane := instr.Arg2WithLane()
   465  		arr := ssaLaneToArrangement(lane)
   466  		m.lowerVecRRR(vecOpSqsub, x, y, instr.Return(), arr)
   467  	case ssa.OpcodeVUsubSat:
   468  		x, y, lane := instr.Arg2WithLane()
   469  		arr := ssaLaneToArrangement(lane)
   470  		m.lowerVecRRR(vecOpUqsub, x, y, instr.Return(), arr)
   471  	case ssa.OpcodeVImin:
   472  		x, y, lane := instr.Arg2WithLane()
   473  		arr := ssaLaneToArrangement(lane)
   474  		m.lowerVecRRR(vecOpSmin, x, y, instr.Return(), arr)
   475  	case ssa.OpcodeVUmin:
   476  		x, y, lane := instr.Arg2WithLane()
   477  		arr := ssaLaneToArrangement(lane)
   478  		m.lowerVecRRR(vecOpUmin, x, y, instr.Return(), arr)
   479  	case ssa.OpcodeVImax:
   480  		x, y, lane := instr.Arg2WithLane()
   481  		arr := ssaLaneToArrangement(lane)
   482  		m.lowerVecRRR(vecOpSmax, x, y, instr.Return(), arr)
   483  	case ssa.OpcodeVUmax:
   484  		x, y, lane := instr.Arg2WithLane()
   485  		arr := ssaLaneToArrangement(lane)
   486  		m.lowerVecRRR(vecOpUmax, x, y, instr.Return(), arr)
   487  	case ssa.OpcodeVAvgRound:
   488  		x, y, lane := instr.Arg2WithLane()
   489  		arr := ssaLaneToArrangement(lane)
   490  		m.lowerVecRRR(vecOpUrhadd, x, y, instr.Return(), arr)
   491  	case ssa.OpcodeVImul:
   492  		x, y, lane := instr.Arg2WithLane()
   493  		arr := ssaLaneToArrangement(lane)
   494  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   495  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   496  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   497  		m.lowerVIMul(rd, rn, rm, arr)
   498  	case ssa.OpcodeVIabs:
   499  		m.lowerVecMisc(vecOpAbs, instr)
   500  	case ssa.OpcodeVIneg:
   501  		m.lowerVecMisc(vecOpNeg, instr)
   502  	case ssa.OpcodeVIpopcnt:
   503  		m.lowerVecMisc(vecOpCnt, instr)
   504  	case ssa.OpcodeVIshl,
   505  		ssa.OpcodeVSshr, ssa.OpcodeVUshr:
   506  		x, y, lane := instr.Arg2WithLane()
   507  		arr := ssaLaneToArrangement(lane)
   508  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   509  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   510  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   511  		m.lowerVShift(op, rd, rn, rm, arr)
   512  	case ssa.OpcodeVSqrt:
   513  		m.lowerVecMisc(vecOpFsqrt, instr)
   514  	case ssa.OpcodeVFabs:
   515  		m.lowerVecMisc(vecOpFabs, instr)
   516  	case ssa.OpcodeVFneg:
   517  		m.lowerVecMisc(vecOpFneg, instr)
   518  	case ssa.OpcodeVFmin:
   519  		x, y, lane := instr.Arg2WithLane()
   520  		arr := ssaLaneToArrangement(lane)
   521  		m.lowerVecRRR(vecOpFmin, x, y, instr.Return(), arr)
   522  	case ssa.OpcodeVFmax:
   523  		x, y, lane := instr.Arg2WithLane()
   524  		arr := ssaLaneToArrangement(lane)
   525  		m.lowerVecRRR(vecOpFmax, x, y, instr.Return(), arr)
   526  	case ssa.OpcodeVFadd:
   527  		x, y, lane := instr.Arg2WithLane()
   528  		arr := ssaLaneToArrangement(lane)
   529  		m.lowerVecRRR(vecOpFadd, x, y, instr.Return(), arr)
   530  	case ssa.OpcodeVFsub:
   531  		x, y, lane := instr.Arg2WithLane()
   532  		arr := ssaLaneToArrangement(lane)
   533  		m.lowerVecRRR(vecOpFsub, x, y, instr.Return(), arr)
   534  	case ssa.OpcodeVFmul:
   535  		x, y, lane := instr.Arg2WithLane()
   536  		arr := ssaLaneToArrangement(lane)
   537  		m.lowerVecRRR(vecOpFmul, x, y, instr.Return(), arr)
   538  	case ssa.OpcodeSqmulRoundSat:
   539  		x, y, lane := instr.Arg2WithLane()
   540  		arr := ssaLaneToArrangement(lane)
   541  		m.lowerVecRRR(vecOpSqrdmulh, x, y, instr.Return(), arr)
   542  	case ssa.OpcodeVFdiv:
   543  		x, y, lane := instr.Arg2WithLane()
   544  		arr := ssaLaneToArrangement(lane)
   545  		m.lowerVecRRR(vecOpFdiv, x, y, instr.Return(), arr)
   546  	case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat:
   547  		x, lane := instr.ArgWithLane()
   548  		arr := ssaLaneToArrangement(lane)
   549  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   550  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   551  		m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat)
   552  	case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint:
   553  		x, lane := instr.ArgWithLane()
   554  		arr := ssaLaneToArrangement(lane)
   555  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   556  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   557  		m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint)
   558  	case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow:
   559  		x, lane := instr.ArgWithLane()
   560  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   561  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   562  
   563  		var arr vecArrangement
   564  		switch lane {
   565  		case ssa.VecLaneI8x16:
   566  			arr = vecArrangement8B
   567  		case ssa.VecLaneI16x8:
   568  			arr = vecArrangement4H
   569  		case ssa.VecLaneI32x4:
   570  			arr = vecArrangement2S
   571  		}
   572  
   573  		shll := m.allocateInstr()
   574  		if signed := op == ssa.OpcodeSwidenLow; signed {
   575  			shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr)
   576  		} else {
   577  			shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr)
   578  		}
   579  		m.insert(shll)
   580  	case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh:
   581  		x, lane := instr.ArgWithLane()
   582  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   583  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   584  
   585  		arr := ssaLaneToArrangement(lane)
   586  
   587  		shll := m.allocateInstr()
   588  		if signed := op == ssa.OpcodeSwidenHigh; signed {
   589  			shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr)
   590  		} else {
   591  			shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr)
   592  		}
   593  		m.insert(shll)
   594  
   595  	case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow:
   596  		x, y, lane := instr.Arg2WithLane()
   597  		var arr, arr2 vecArrangement
   598  		switch lane {
   599  		case ssa.VecLaneI16x8: // I16x8
   600  			arr = vecArrangement8B
   601  			arr2 = vecArrangement16B // Implies sqxtn2.
   602  		case ssa.VecLaneI32x4:
   603  			arr = vecArrangement4H
   604  			arr2 = vecArrangement8H // Implies sqxtn2.
   605  		default:
   606  			panic("unsupported lane " + lane.String())
   607  		}
   608  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   609  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   610  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   611  
   612  		tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   613  
   614  		loQxtn := m.allocateInstr()
   615  		hiQxtn := m.allocateInstr()
   616  		if signed := op == ssa.OpcodeSnarrow; signed {
   617  			// Narrow lanes on rn and write them into the lower half of tmp.
   618  			loQxtn.asVecMisc(vecOpSqxtn, tmp, rn, arr) // low
   619  			// Narrow lanes on rm and write them into the higher half of tmp.
   620  			hiQxtn.asVecMisc(vecOpSqxtn, tmp, rm, arr2) // high (sqxtn2)
   621  		} else {
   622  			// Narrow lanes on rn and write them into the lower half of tmp.
   623  			loQxtn.asVecMisc(vecOpSqxtun, tmp, rn, arr) // low
   624  			// Narrow lanes on rm and write them into the higher half of tmp.
   625  			hiQxtn.asVecMisc(vecOpSqxtun, tmp, rm, arr2) // high (sqxtun2)
   626  		}
   627  		m.insert(loQxtn)
   628  		m.insert(hiQxtn)
   629  
   630  		mov := m.allocateInstr()
   631  		mov.asFpuMov128(rd.nr(), tmp.nr())
   632  		m.insert(mov)
   633  	case ssa.OpcodeFvpromoteLow:
   634  		x, lane := instr.ArgWithLane()
   635  		if lane != ssa.VecLaneF32x4 {
   636  			panic("unsupported lane type " + lane.String())
   637  		}
   638  		ins := m.allocateInstr()
   639  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   640  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   641  		ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S)
   642  		m.insert(ins)
   643  	case ssa.OpcodeFvdemote:
   644  		x, lane := instr.ArgWithLane()
   645  		if lane != ssa.VecLaneF64x2 {
   646  			panic("unsupported lane type " + lane.String())
   647  		}
   648  		ins := m.allocateInstr()
   649  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   650  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   651  		ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S)
   652  		m.insert(ins)
   653  	case ssa.OpcodeExtractlane:
   654  		x, index, signed, lane := instr.ExtractlaneData()
   655  
   656  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   657  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   658  
   659  		mov := m.allocateInstr()
   660  		switch lane {
   661  		case ssa.VecLaneI8x16:
   662  			mov.asMovFromVec(rd, rn, vecArrangementB, vecIndex(index), signed)
   663  		case ssa.VecLaneI16x8:
   664  			mov.asMovFromVec(rd, rn, vecArrangementH, vecIndex(index), signed)
   665  		case ssa.VecLaneI32x4:
   666  			mov.asMovFromVec(rd, rn, vecArrangementS, vecIndex(index), signed)
   667  		case ssa.VecLaneI64x2:
   668  			mov.asMovFromVec(rd, rn, vecArrangementD, vecIndex(index), signed)
   669  		case ssa.VecLaneF32x4:
   670  			mov.asVecMovElement(rd, rn, vecArrangementS, vecIndex(0), vecIndex(index))
   671  		case ssa.VecLaneF64x2:
   672  			mov.asVecMovElement(rd, rn, vecArrangementD, vecIndex(0), vecIndex(index))
   673  		default:
   674  			panic("unsupported lane: " + lane.String())
   675  		}
   676  
   677  		m.insert(mov)
   678  
   679  	case ssa.OpcodeInsertlane:
   680  		x, y, index, lane := instr.InsertlaneData()
   681  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   682  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   683  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   684  		tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   685  
   686  		// Initially mov rn to tmp.
   687  		mov1 := m.allocateInstr()
   688  		mov1.asFpuMov128(tmpReg.nr(), rn.nr())
   689  		m.insert(mov1)
   690  
   691  		// movToVec and vecMovElement do not zero the remaining bits, so we can insert rm
   692  		// directly into the desired lane of tmp in place.
   693  		mov2 := m.allocateInstr()
   694  		switch lane {
   695  		case ssa.VecLaneI8x16:
   696  			mov2.asMovToVec(tmpReg, rm, vecArrangementB, vecIndex(index))
   697  		case ssa.VecLaneI16x8:
   698  			mov2.asMovToVec(tmpReg, rm, vecArrangementH, vecIndex(index))
   699  		case ssa.VecLaneI32x4:
   700  			mov2.asMovToVec(tmpReg, rm, vecArrangementS, vecIndex(index))
   701  		case ssa.VecLaneI64x2:
   702  			mov2.asMovToVec(tmpReg, rm, vecArrangementD, vecIndex(index))
   703  		case ssa.VecLaneF32x4:
   704  			mov2.asVecMovElement(tmpReg, rm, vecArrangementS, vecIndex(index), vecIndex(0))
   705  		case ssa.VecLaneF64x2:
   706  			mov2.asVecMovElement(tmpReg, rm, vecArrangementD, vecIndex(index), vecIndex(0))
   707  		}
   708  		m.insert(mov2)
   709  
   710  		// Finally mov tmp to rd.
   711  		mov3 := m.allocateInstr()
   712  		mov3.asFpuMov128(rd.nr(), tmpReg.nr())
   713  		m.insert(mov3)
   714  
   715  	case ssa.OpcodeSwizzle:
   716  		x, y, lane := instr.Arg2WithLane()
   717  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   718  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   719  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   720  
   721  		arr := ssaLaneToArrangement(lane)
   722  
   723  		// tbl <rd>.<arr>, { <rn>.<arr> }, <rm>.<arr>
   724  		tbl1 := m.allocateInstr()
   725  		tbl1.asVecTbl(1, rd, rn, rm, arr)
   726  		m.insert(tbl1)
   727  
   728  	case ssa.OpcodeShuffle:
   729  		x, y, lane1, lane2 := instr.ShuffleData()
   730  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   731  		rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   732  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   733  
   734  		m.lowerShuffle(rd, rn, rm, lane1, lane2)
   735  
   736  	case ssa.OpcodeSplat:
   737  		x, lane := instr.ArgWithLane()
   738  		rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
   739  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   740  
   741  		dup := m.allocateInstr()
   742  		switch lane {
   743  		case ssa.VecLaneI8x16:
   744  			dup.asVecDup(rd, rn, vecArrangement16B)
   745  		case ssa.VecLaneI16x8:
   746  			dup.asVecDup(rd, rn, vecArrangement8H)
   747  		case ssa.VecLaneI32x4:
   748  			dup.asVecDup(rd, rn, vecArrangement4S)
   749  		case ssa.VecLaneI64x2:
   750  			dup.asVecDup(rd, rn, vecArrangement2D)
   751  		case ssa.VecLaneF32x4:
   752  			dup.asVecDupElement(rd, rn, vecArrangementS, vecIndex(0))
   753  		case ssa.VecLaneF64x2:
   754  			dup.asVecDupElement(rd, rn, vecArrangementD, vecIndex(0))
   755  		}
   756  		m.insert(dup)
   757  
   758  	case ssa.OpcodeWideningPairwiseDotProductS:
   759  		x, y := instr.Arg2()
   760  		xx, yy := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone),
   761  			m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
   762  		tmp, tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   763  		m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp, xx, yy, vecArrangement8H))
   764  		m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2, xx, yy, vecArrangement8H))
   765  		m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp, tmp, tmp2, vecArrangement4S))
   766  
   767  		rd := operandNR(m.compiler.VRegOf(instr.Return()))
   768  		m.insert(m.allocateInstr().asFpuMov128(rd.nr(), tmp.nr()))
   769  
   770  	case ssa.OpcodeLoadSplat:
   771  		ptr, offset, lane := instr.LoadSplatData()
   772  		m.lowerLoadSplat(ptr, offset, lane, instr.Return())
   773  
   774  	case ssa.OpcodeAtomicRmw:
   775  		m.lowerAtomicRmw(instr)
   776  
   777  	case ssa.OpcodeAtomicCas:
   778  		m.lowerAtomicCas(instr)
   779  
   780  	case ssa.OpcodeAtomicLoad:
   781  		m.lowerAtomicLoad(instr)
   782  
   783  	case ssa.OpcodeAtomicStore:
   784  		m.lowerAtomicStore(instr)
   785  
   786  	case ssa.OpcodeFence:
   787  		instr := m.allocateInstr()
   788  		instr.asDMB()
   789  		m.insert(instr)
   790  
   791  	default:
   792  		panic("TODO: lowering " + op.String())
   793  	}
   794  	m.executableContext.FlushPendingInstructions()
   795  }
   796  
   797  func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) {
   798  	// `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30.
   799  	vReg, wReg := v29VReg, v30VReg
   800  
   801  	// Initialize v29, v30 to rn, rm.
   802  	movv := m.allocateInstr()
   803  	movv.asFpuMov128(vReg, rn.nr())
   804  	m.insert(movv)
   805  
   806  	movw := m.allocateInstr()
   807  	movw.asFpuMov128(wReg, rm.nr())
   808  	m.insert(movw)
   809  
   810  	// `lane1`, `lane2` are already encoded as two u64s with the right layout:
   811  	//     lane1 := lane[7]<<56 | ... | lane[1]<<8 | lane[0]
   812  	//     lane2 := lane[15]<<56 | ... | lane[9]<<8 | lane[8]
   813  	// Thus, we can use loadFpuConst128.
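        	// For example, the identity shuffle lane[i] = i gives lane1 = 0x0706050403020100 and
        	// lane2 = 0x0f0e0d0c0b0a0908, in which case the tbl below simply copies rn (via vReg)
        	// into rd.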
   814  	tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   815  	lfc := m.allocateInstr()
   816  	lfc.asLoadFpuConst128(tmp.nr(), lane1, lane2)
   817  	m.insert(lfc)
   818  
   819  	// tbl <rd>.16b, { <vReg>.16b, <wReg>.16b }, <tmp>.16b
   820  	tbl2 := m.allocateInstr()
   821  	tbl2.asVecTbl(2, rd, operandNR(vReg), tmp, vecArrangement16B)
   822  	m.insert(tbl2)
   823  }
   824  
   825  func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) {
   826  	var modulo byte
   827  	switch arr {
   828  	case vecArrangement16B:
   829  		modulo = 0x7 // Modulo 8.
   830  	case vecArrangement8H:
   831  		modulo = 0xf // Modulo 16.
   832  	case vecArrangement4S:
   833  		modulo = 0x1f // Modulo 32.
   834  	case vecArrangement2D:
   835  		modulo = 0x3f // Modulo 64.
   836  	default:
   837  		panic("unsupported arrangement " + arr.String())
   838  	}
   839  
   840  	rtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
   841  	vtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   842  
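        	// Per the Wasm spec, the shift amount is taken modulo the lane width; e.g. for 4S
        	// (i32x4) an amount of 35 becomes 35 & 0x1f = 3, which the AND below implements.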
   843  	and := m.allocateInstr()
   844  	and.asALUBitmaskImm(aluOpAnd, rtmp.nr(), rm.nr(), uint64(modulo), true)
   845  	m.insert(and)
   846  
   847  	if op != ssa.OpcodeVIshl {
   848  		// Negate the amount to turn this into a right shift.
   849  		neg := m.allocateInstr()
   850  		neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true)
   851  		m.insert(neg)
   852  	}
   853  
   854  	// Copy the shift amount into a vector register, as sshl/ushl require the amount to be there.
   855  	dup := m.allocateInstr()
   856  	dup.asVecDup(vtmp, rtmp, arr)
   857  	m.insert(dup)
   858  
   859  	if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr {
   860  		sshl := m.allocateInstr()
   861  		sshl.asVecRRR(vecOpSshl, rd, rn, vtmp, arr)
   862  		m.insert(sshl)
   863  	} else {
   864  		ushl := m.allocateInstr()
   865  		ushl.asVecRRR(vecOpUshl, rd, rn, vtmp, arr)
   866  		m.insert(ushl)
   867  	}
   868  }
   869  
   870  func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) {
   871  	tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   872  
   873  	// Special case VallTrue for i64x2.
   874  	if op == ssa.OpcodeVallTrue && arr == vecArrangement2D {
   875  		// 	cmeq v3?.2d, v2?.2d, #0
   876  		//	addp v3?.2d, v3?.2d, v3?.2d
   877  		//	fcmp v3?, v3?
   878  		//	cset dst, eq
   879  
   880  		ins := m.allocateInstr()
   881  		ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D)
   882  		m.insert(ins)
   883  
   884  		addp := m.allocateInstr()
   885  		addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D)
   886  		m.insert(addp)
   887  
   888  		fcmp := m.allocateInstr()
   889  		fcmp.asFpuCmp(tmp, tmp, true)
   890  		m.insert(fcmp)
   891  
   892  		cset := m.allocateInstr()
   893  		cset.asCSet(rd.nr(), false, eq)
   894  		m.insert(cset)
   895  
   896  		return
   897  	}
   898  
   899  	// Create a scalar value with umaxp or uminv, then compare it against zero.
   900  	ins := m.allocateInstr()
   901  	if op == ssa.OpcodeVanyTrue {
   902  		// 	umaxp v4?.16b, v2?.16b, v2?.16b
   903  		ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B)
   904  	} else {
   905  		// 	uminv d4?, v2?.4s
   906  		ins.asVecLanes(vecOpUminv, tmp, rm, arr)
   907  	}
   908  	m.insert(ins)
   909  
   910  	//	mov x3?, v4?.d[0]
   911  	//	ccmp x3?, #0x0, #0x0, al
   912  	//	cset x3?, ne
   913  	//	mov x0, x3?
   914  
   915  	movv := m.allocateInstr()
   916  	movv.asMovFromVec(rd, tmp, vecArrangementD, vecIndex(0), false)
   917  	m.insert(movv)
   918  
   919  	fc := m.allocateInstr()
   920  	fc.asCCmpImm(rd, uint64(0), al, 0, true)
   921  	m.insert(fc)
   922  
   923  	cset := m.allocateInstr()
   924  	cset.asCSet(rd.nr(), false, ne)
   925  	m.insert(cset)
   926  }
   927  
   928  func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
   929  	r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
   930  	v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   931  	v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
   932  
   933  	switch arr {
   934  	case vecArrangement16B:
   935  		//	sshr v6?.16b, v2?.16b, #7
   936  		//	movz x4?, #0x201, lsl 0
   937  		//	movk x4?, #0x804, lsl 16
   938  		//	movk x4?, #0x2010, lsl 32
   939  		//	movk x4?, #0x8040, lsl 48
   940  		//	dup v5?.2d, x4?
   941  		//	and v6?.16b, v6?.16b, v5?.16b
   942  		//	ext v5?.16b, v6?.16b, v6?.16b, #8
   943  		//	zip1 v5?.16b, v6?.16b, v5?.16b
   944  		//	addv s5?, v5?.8h
   945  		//	umov s3?, v5?.h[0]
   946  
   947  		// Right arithmetic shift on the original vector and store the result into v1. So we have:
   948  		// v1[i] = 0xff if vi<0, 0 otherwise.
   949  		sshr := m.allocateInstr()
   950  		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B)
   951  		m.insert(sshr)
   952  
   953  		// Load the bit mask into r0.
   954  		m.insertMOVZ(r0.nr(), 0x0201, 0, true)
   955  		m.insertMOVK(r0.nr(), 0x0804, 1, true)
   956  		m.insertMOVK(r0.nr(), 0x2010, 2, true)
   957  		m.insertMOVK(r0.nr(), 0x8040, 3, true)
   958  
   959  		// dup r0 to v0.
   960  		dup := m.allocateInstr()
   961  		dup.asVecDup(v0, r0, vecArrangement2D)
   962  		m.insert(dup)
   963  
   964  		// Lane-wise logical AND with the bit mask, meaning that we have
   965  		// v[i] = (1 << i) if vi<0, 0 otherwise.
   966  		//
   967  		// Below, we use the following notation:
   968  		// wi := (1 << i) if vi<0, 0 otherwise.
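        		// For example, if only lanes 0 and 15 of the input are negative, then after the
        		// AND below v1's byte lanes are [1, 0, ..., 0, 0x80], and the following ext/zip1/addv
        		// collapse them into the scalar 0x8001 (bit i set iff lane i was negative).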
   969  		and := m.allocateInstr()
   970  		and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B)
   971  		m.insert(and)
   972  
   973  		// Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have
   974  		// v0[i] = w(i+8) if i < 8, w(i-8) otherwise.
   975  		ext := m.allocateInstr()
   976  		ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8))
   977  		m.insert(ext)
   978  
   979  		// v = [w0, w8, ..., w7, w15]
   980  		zip1 := m.allocateInstr()
   981  		zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B)
   982  		m.insert(zip1)
   983  
   984  		// v.h[0] = w0 + ... + w15
   985  		addv := m.allocateInstr()
   986  		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
   987  		m.insert(addv)
   988  
   989  		// Extract the v.h[0] as the result.
   990  		movfv := m.allocateInstr()
   991  		movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false)
   992  		m.insert(movfv)
   993  	case vecArrangement8H:
   994  		//	sshr v6?.8h, v2?.8h, #15
   995  		//	movz x4?, #0x1, lsl 0
   996  		//	movk x4?, #0x2, lsl 16
   997  		//	movk x4?, #0x4, lsl 32
   998  		//	movk x4?, #0x8, lsl 48
   999  		//	dup v5?.2d, x4?
  1000  		//	lsl x4?, x4?, 0x4
  1001  		//	ins v5?.d[1], x4?
  1002  		//	and v5?.16b, v6?.16b, v5?.16b
  1003  		//	addv s5?, v5?.8h
  1004  		//	umov s3?, v5?.h[0]
  1005  
  1006  		// Right arithmetic shift on the original vector and store the result into v1. So we have:
  1007  		// v[i] = 0xffff if vi<0, 0 otherwise.
  1008  		sshr := m.allocateInstr()
  1009  		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H)
  1010  		m.insert(sshr)
  1011  
  1012  		// Load the bit mask into r0.
  1013  		m.lowerConstantI64(r0.nr(), 0x0008000400020001)
  1014  
  1015  		// dup r0 to vector v0.
  1016  		dup := m.allocateInstr()
  1017  		dup.asVecDup(v0, r0, vecArrangement2D)
  1018  		m.insert(dup)
  1019  
  1020  		lsl := m.allocateInstr()
  1021  		lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true)
  1022  		m.insert(lsl)
  1023  
  1024  		movv := m.allocateInstr()
  1025  		movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
  1026  		m.insert(movv)
  1027  
  1028  		// Lane-wise logical AND with the bitmask, meaning that we have
  1029  		// v[i] = (1 << i) if vi<0, 0 otherwise,
  1030  		//        for each lane i=0..7.
  1031  		and := m.allocateInstr()
  1032  		and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
  1033  		m.insert(and)
  1034  
  1035  		addv := m.allocateInstr()
  1036  		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
  1037  		m.insert(addv)
  1038  
  1039  		movfv := m.allocateInstr()
  1040  		movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false)
  1041  		m.insert(movfv)
  1042  	case vecArrangement4S:
  1043  		// 	sshr v6?.4s, v2?.4s, #31
  1044  		//	movz x4?, #0x1, lsl 0
  1045  		//	movk x4?, #0x2, lsl 32
  1046  		//	dup v5?.2d, x4?
  1047  		//	lsl x4?, x4?, 0x2
  1048  		//	ins v5?.d[1], x4?
  1049  		//	and v5?.16b, v6?.16b, v5?.16b
  1050  		//	addv s5?, v5?.4s
  1051  		//	umov s3?, v5?.s[0]
  1054  
  1055  		// Right arithmetic shift on the original vector and store the result into v1. So we have:
  1056  		// v[i] = 0xffffffff if vi<0, 0 otherwise.
  1057  		sshr := m.allocateInstr()
  1058  		sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S)
  1059  		m.insert(sshr)
  1060  
  1061  		// Load the bit mask into r0.
  1062  		m.lowerConstantI64(r0.nr(), 0x0000000200000001)
  1063  
  1064  		// dup r0 to vector v0.
  1065  		dup := m.allocateInstr()
  1066  		dup.asVecDup(v0, r0, vecArrangement2D)
  1067  		m.insert(dup)
  1068  
  1069  		lsl := m.allocateInstr()
  1070  		lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true)
  1071  		m.insert(lsl)
  1072  
  1073  		movv := m.allocateInstr()
  1074  		movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
  1075  		m.insert(movv)
  1076  
  1077  		// Lane-wise logical AND with the bitmask, meaning that we have
  1078  		// v[i] = (1 << i) if vi<0, 0 otherwise,
  1079  		//        for each lane i=0..3.
  1080  		and := m.allocateInstr()
  1081  		and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
  1082  		m.insert(and)
  1083  
  1084  		addv := m.allocateInstr()
  1085  		addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S)
  1086  		m.insert(addv)
  1087  
  1088  		movfv := m.allocateInstr()
  1089  		movfv.asMovFromVec(rd, v0, vecArrangementS, vecIndex(0), false)
  1090  		m.insert(movfv)
  1091  	case vecArrangement2D:
  1092  		// 	mov x3?, v2?.d[0]
  1093  		//	mov x4?, v2?.d[1]
  1094  		//	lsr x4?, x4?, 0x3f
  1095  		//	lsr x3?, x3?, 0x3f
  1096  		//	add w3?, w3?, w4?, lsl #1
  1097  
  1098  		// Move the lower 64-bit int into result.
  1099  		movv0 := m.allocateInstr()
  1100  		movv0.asMovFromVec(rd, rm, vecArrangementD, vecIndex(0), false)
  1101  		m.insert(movv0)
  1102  
  1103  		// Move the higher 64-bit int into r0.
  1104  		movv1 := m.allocateInstr()
  1105  		movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false)
  1106  		m.insert(movv1)
  1107  
  1108  		// Move the sign bit into the least significant bit.
  1109  		lsr1 := m.allocateInstr()
  1110  		lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true)
  1111  		m.insert(lsr1)
  1112  
  1113  		lsr2 := m.allocateInstr()
  1114  		lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true)
  1115  		m.insert(lsr2)
  1116  
  1117  		// rd = (r0<<1) | rd
  1118  		lsl := m.allocateInstr()
  1119  		lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false)
  1120  		m.insert(lsl)
  1121  	default:
  1122  		panic("Unsupported " + arr.String())
  1123  	}
  1124  }
  1125  
  1126  func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) {
  1127  	x, lane := instr.ArgWithLane()
  1128  	arr := ssaLaneToArrangement(lane)
  1129  	ins := m.allocateInstr()
  1130  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1131  	rd := operandNR(m.compiler.VRegOf(instr.Return()))
  1132  	ins.asVecMisc(op, rd, rn, arr)
  1133  	m.insert(ins)
  1134  }
  1135  
  1136  func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) {
  1137  	ins := m.allocateInstr()
  1138  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1139  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1140  	rd := operandNR(m.compiler.VRegOf(ret))
  1141  	ins.asVecRRR(op, rd, rn, rm, arr)
  1142  	m.insert(ins)
  1143  }
  1144  
  1145  func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
  1146  	if arr != vecArrangement2D {
  1147  		mul := m.allocateInstr()
  1148  		mul.asVecRRR(vecOpMul, rd, rn, rm, arr)
  1149  		m.insert(mul)
  1150  	} else {
  1151  		tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1152  		tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1153  		tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1154  
  1155  		tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1156  
  1157  		// Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696
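        		// With x = xHi<<32 + xLo and y = yHi<<32 + yLo in each 64-bit lane, the low 64 bits
        		// of x*y equal xLo*yLo + ((xLo*yHi + xHi*yLo) << 32): the rev64/mul/addp below compute
        		// the cross-term sum, shll shifts it into the upper 32 bits, and umlal accumulates the
        		// unsigned xLo*yLo product.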
  1158  		rev64 := m.allocateInstr()
  1159  		rev64.asVecMisc(vecOpRev64, tmp2, rm, vecArrangement4S)
  1160  		m.insert(rev64)
  1161  
  1162  		mul := m.allocateInstr()
  1163  		mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S)
  1164  		m.insert(mul)
  1165  
  1166  		xtn1 := m.allocateInstr()
  1167  		xtn1.asVecMisc(vecOpXtn, tmp1, rn, vecArrangement2S)
  1168  		m.insert(xtn1)
  1169  
  1170  		addp := m.allocateInstr()
  1171  		addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S)
  1172  		m.insert(addp)
  1173  
  1174  		xtn2 := m.allocateInstr()
  1175  		xtn2.asVecMisc(vecOpXtn, tmp3, rm, vecArrangement2S)
  1176  		m.insert(xtn2)
  1177  
  1178  		// Note: do not write directly into the result register yet, for the same reason as with BSL:
  1179  		// the UMLAL instruction uses its destination register as one of its sources, so the value
  1180  		// already in the destination register is significant.
  1181  		shll := m.allocateInstr()
  1182  		shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S)
  1183  		m.insert(shll)
  1184  
  1185  		umlal := m.allocateInstr()
  1186  		umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S)
  1187  		m.insert(umlal)
  1188  
  1189  		mov := m.allocateInstr()
  1190  		mov.asFpuMov128(rd.nr(), tmpRes.nr())
  1191  		m.insert(mov)
  1192  	}
  1193  }
  1194  
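        // lowerVMinMaxPseudo lowers the floating-point pseudo min/max (f32x4/f64x2 pmin/pmax), defined as
        // pmax(x, y) = (x < y) ? y : x and pmin(x, y) = (y < x) ? y : x. Unlike fmin/fmax, these do not
        // propagate NaNs specially, so a single lane-wise FCMGT followed by BSL suffices.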
  1195  func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) {
  1196  	x, y, lane := instr.Arg2WithLane()
  1197  	arr := ssaLaneToArrangement(lane)
  1198  
  1199  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1200  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1201  
  1202  	// Note: this usage of tmp is important.
  1203  	// BSL modifies the destination register, so we need to use a temporary register so that
  1204  	// the actual definition of the destination register happens *after* the BSL instruction.
  1205  	// That way, we can force the spill instruction to be inserted after the BSL instruction.
  1206  	tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  1207  
  1208  	fcmgt := m.allocateInstr()
  1209  	if max {
  1210  		fcmgt.asVecRRR(vecOpFcmgt, tmp, rm, rn, arr)
  1211  	} else {
  1212  		// If min, swap the args.
  1213  		fcmgt.asVecRRR(vecOpFcmgt, tmp, rn, rm, arr)
  1214  	}
  1215  	m.insert(fcmgt)
  1216  
  1217  	bsl := m.allocateInstr()
  1218  	bsl.asVecRRRRewrite(vecOpBsl, tmp, rm, rn, vecArrangement16B)
  1219  	m.insert(bsl)
  1220  
  1221  	res := operandNR(m.compiler.VRegOf(instr.Return()))
  1222  	mov2 := m.allocateInstr()
  1223  	mov2.asFpuMov128(res.nr(), tmp.nr())
  1224  	m.insert(mov2)
  1225  }
  1226  
  1227  func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
  1228  	div := m.allocateInstr()
  1229  
  1230  	if signed {
  1231  		div.asALU(aluOpSDiv, rd, rn, rm, _64bit)
  1232  	} else {
  1233  		div.asALU(aluOpUDiv, rd, rn, rm, _64bit)
  1234  	}
  1235  	m.insert(div)
  1236  
  1237  	// Check if rm is zero:
  1238  	m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero)
  1239  
  1240  	// rd = rn-rd*rm by MSUB instruction.
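        	// For example, i32.rem_s(7, -3): the division above yields rd = -2 (truncated), and
        	// MSUB computes 7 - (-2 * -3) = 1, matching Wasm's truncated-remainder semantics.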
  1241  	msub := m.allocateInstr()
  1242  	msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit)
  1243  	m.insert(msub)
  1244  }
  1245  
  1246  func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
  1247  	div := m.allocateInstr()
  1248  
  1249  	if signed {
  1250  		div.asALU(aluOpSDiv, rd, rn, rm, _64bit)
  1251  	} else {
  1252  		div.asALU(aluOpUDiv, rd, rn, rm, _64bit)
  1253  	}
  1254  	m.insert(div)
  1255  
  1256  	// Check if rm is zero:
  1257  	m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero)
  1258  
  1259  	if signed {
  1260  		// We need to check for signed overflow, which happens iff the operation is "math.MinInt{32,64} / -1".
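        		// E.g. for i32, math.MinInt32 / -1 would be +2^31, which is not representable in
        		// 32 bits, so Wasm mandates a trap (integer overflow) in that case.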
  1261  		minusOneCheck := m.allocateInstr()
  1262  		// Sets eq condition if rm == -1.
  1263  		minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit)
  1264  		m.insert(minusOneCheck)
  1265  
  1266  		ccmp := m.allocateInstr()
  1267  		// If the eq condition is set, set the flags based on the result of "rn - 1"; otherwise, clear the flags.
  1268  		ccmp.asCCmpImm(rn, 1, eq, 0, _64bit)
  1269  		m.insert(ccmp)
  1270  
  1271  		// Check the overflow flag.
  1272  		m.exitIfNot(execCtxVReg, vs.invert().asCond(), false, wazevoapi.ExitCodeIntegerOverflow)
  1273  	}
  1274  }
  1275  
  1276  // exitIfNot emits a conditional branch to exit if the condition is not met.
  1277  // If `c` (cond type) is a register condition, `cond64bit` must be chosen to indicate whether the register is 32-bit or 64-bit.
  1278  // Otherwise, `cond64bit` is ignored.
  1279  func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, code wazevoapi.ExitCode) {
  1280  	execCtxTmp := m.copyToTmp(execCtxVReg)
  1281  
  1282  	cbr := m.allocateInstr()
  1283  	m.insert(cbr)
  1284  	m.lowerExitWithCode(execCtxTmp, code)
  1285  	// Conditional branch target is after exit.
  1286  	l := m.insertBrTargetLabel()
  1287  	cbr.asCondBr(c, l, cond64bit)
  1288  }
  1289  
  1290  func (m *machine) lowerFcopysign(x, y, ret ssa.Value) {
  1291  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1292  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1293  	var tmpI, tmpF operand
  1294  	_64 := x.Type() == ssa.TypeF64
  1295  	if _64 {
  1296  		tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1297  		tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
  1298  	} else {
  1299  		tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32))
  1300  		tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
  1301  	}
  1302  	rd := m.compiler.VRegOf(ret)
  1303  	m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64)
  1304  }
  1305  
  1306  func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) {
  1307  	// This is exactly the same code emitted by GCC for "__builtin_copysign":
  1308  	//
  1309  	//    mov     x0, -9223372036854775808
  1310  	//    fmov    d2, x0
  1311  	//    vbit    v0.8b, v1.8b, v2.8b
  1312  	//
  1313  
  1314  	setMSB := m.allocateInstr()
  1315  	if _64bit {
  1316  		m.lowerConstantI64(tmpI.nr(), math.MinInt64)
  1317  		setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0))
  1318  	} else {
  1319  		m.lowerConstantI32(tmpI.nr(), math.MinInt32)
  1320  		setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0))
  1321  	}
  1322  	m.insert(setMSB)
  1323  
  1324  	tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1325  
  1326  	mov := m.allocateInstr()
  1327  	mov.asFpuMov64(tmpReg.nr(), rn.nr())
  1328  	m.insert(mov)
  1329  
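        	// VBIT copies, at each bit position where the mask (tmpF, with only the sign bit set) is 1,
        	// the corresponding bit of rm into the destination, leaving rn's magnitude bits untouched;
        	// e.g. copysign(-1.5, +2.0) yields +1.5.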
  1330  	vbit := m.allocateInstr()
  1331  	vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B)
  1332  	m.insert(vbit)
  1333  
  1334  	movDst := m.allocateInstr()
  1335  	movDst.asFpuMov64(rd.nr(), tmpReg.nr())
  1336  	m.insert(movDst)
  1337  }
  1338  
  1339  func (m *machine) lowerBitcast(instr *ssa.Instruction) {
  1340  	v, dstType := instr.BitcastData()
  1341  	srcType := v.Type()
  1342  	rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
  1343  	rd := operandNR(m.compiler.VRegOf(instr.Return()))
  1344  	srcInt := srcType.IsInt()
  1345  	dstInt := dstType.IsInt()
  1346  	switch {
  1347  	case srcInt && !dstInt: // Int to Float:
  1348  		mov := m.allocateInstr()
  1349  		var arr vecArrangement
  1350  		if srcType.Bits() == 64 {
  1351  			arr = vecArrangementD
  1352  		} else {
  1353  			arr = vecArrangementS
  1354  		}
  1355  		mov.asMovToVec(rd, rn, arr, vecIndex(0))
  1356  		m.insert(mov)
  1357  	case !srcInt && dstInt: // Float to Int:
  1358  		mov := m.allocateInstr()
  1359  		var arr vecArrangement
  1360  		if dstType.Bits() == 64 {
  1361  			arr = vecArrangementD
  1362  		} else {
  1363  			arr = vecArrangementS
  1364  		}
  1365  		mov.asMovFromVec(rd, rn, arr, vecIndex(0), false)
  1366  		m.insert(mov)
  1367  	default:
  1368  		panic("TODO?BUG?")
  1369  	}
  1370  }
  1371  
  1372  func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) {
  1373  	rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone)
  1374  	rd := operandNR(m.compiler.VRegOf(out))
  1375  
  1376  	neg := m.allocateInstr()
  1377  	neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64)
  1378  	m.insert(neg)
  1379  }
  1380  
  1381  func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) {
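        	// For the trapping case, the emitted sequence looks roughly like this (registers and labels illustrative):
        	//
        	//    msr  fpsr, xzr          ;; clear the FPU status flags
        	//    fcvtz{s,u} <rd>, <rn>   ;; the conversion itself never traps
        	//    mrs  x27, fpsr          ;; read the status back
        	//    subs xzr, x27, #1       ;; was the conversion invalid?
        	//    b.ne L_ok
        	//    fcmp <rn>, <rn>         ;; NaN is the only value unordered with itself
        	//    b.vc L_not_nan
        	//    <exit with ExitCodeInvalidConversionToInteger>
        	//  L_not_nan:
        	//    <exit with ExitCodeIntegerOverflow>
        	//  L_ok: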
  1382  	if !nonTrapping {
  1383  		// First of all, we have to clear the FPU flags.
  1384  		flagClear := m.allocateInstr()
  1385  		flagClear.asMovToFPSR(xzrVReg)
  1386  		m.insert(flagClear)
  1387  	}
  1388  
  1389  	// Then, do the conversion which doesn't trap inherently.
  1390  	cvt := m.allocateInstr()
  1391  	cvt.asFpuToInt(rd, rn, signed, src64bit, dst64bit)
  1392  	m.insert(cvt)
  1393  
  1394  	if !nonTrapping {
  1395  		tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
  1396  
  1397  		// After the conversion, check the FPU flags.
  1398  		getFlag := m.allocateInstr()
  1399  		getFlag.asMovFromFPSR(tmpReg)
  1400  		m.insert(getFlag)
  1401  
  1402  		execCtx := m.copyToTmp(ctx)
  1403  		_rn := operandNR(m.copyToTmp(rn.nr()))
  1404  
  1405  		// Check if the conversion was undefined by comparing the status with 1.
  1406  		// See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register
  1407  		alu := m.allocateInstr()
  1408  		alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true)
  1409  		m.insert(alu)
  1410  
  1411  		// If it is not undefined, we can return the result.
  1412  		ok := m.allocateInstr()
  1413  		m.insert(ok)
  1414  
  1415  		// Otherwise, we have to choose the exit code depending on whether it is an overflow or an invalid (NaN) conversion.
  1416  
  1417  		// Compare the value with itself to check if it is a NaN (NaN is never equal to itself).
  1418  		fpuCmp := m.allocateInstr()
  1419  		fpuCmp.asFpuCmp(_rn, _rn, src64bit)
  1420  		m.insert(fpuCmp)
  1421  		// If the VC flag is not set (== VS flag is set), it is a NaN.
  1422  		m.exitIfNot(execCtx, vc.asCond(), false, wazevoapi.ExitCodeInvalidConversionToInteger)
  1423  		// Otherwise, it is an overflow.
  1424  		m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
  1425  
  1426  		// Conditional branch target is after exit.
  1427  		l := m.insertBrTargetLabel()
  1428  		ok.asCondBr(ne.asCond(), l, false /* ignored */)
  1429  	}
  1430  }
  1431  
  1432  func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) {
  1433  	cvt := m.allocateInstr()
  1434  	cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit)
  1435  	m.insert(cvt)
  1436  }
  1437  
  1438  func (m *machine) lowerFpuBinOp(si *ssa.Instruction) {
  1439  	instr := m.allocateInstr()
  1440  	var op fpuBinOp
  1441  	switch si.Opcode() {
  1442  	case ssa.OpcodeFadd:
  1443  		op = fpuBinOpAdd
  1444  	case ssa.OpcodeFsub:
  1445  		op = fpuBinOpSub
  1446  	case ssa.OpcodeFmul:
  1447  		op = fpuBinOpMul
  1448  	case ssa.OpcodeFdiv:
  1449  		op = fpuBinOpDiv
  1450  	case ssa.OpcodeFmax:
  1451  		op = fpuBinOpMax
  1452  	case ssa.OpcodeFmin:
  1453  		op = fpuBinOpMin
  1454  	}
  1455  	x, y := si.Arg2()
  1456  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1457  	rn := m.getOperand_NR(xDef, extModeNone)
  1458  	rm := m.getOperand_NR(yDef, extModeNone)
  1459  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1460  	instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64)
  1461  	m.insert(instr)
  1462  }
  1463  
  1464  func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) {
  1465  	x, y := si.Arg2()
  1466  	if !x.Type().IsInt() {
  1467  		panic("BUG?")
  1468  	}
  1469  
  1470  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1471  	rn := m.getOperand_NR(xDef, extModeNone)
  1472  	rm, yNegated := m.getOperand_MaybeNegatedImm12_ER_SR_NR(yDef, extModeNone)
  1473  
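        	// For example, when y is the constant -1, the operand getter can return the immediate #1 with
        	// yNegated=true, so x+(-1) ends up encoded as `sub rd, rn, #1`.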
  1474  	var aop aluOp
  1475  	switch {
  1476  	case add && !yNegated: // rn+rm = x+y
  1477  		aop = aluOpAdd
  1478  	case add && yNegated: // rn-rm = x-(-y) = x+y
  1479  		aop = aluOpSub
  1480  	case !add && !yNegated: // rn-rm = x-y
  1481  		aop = aluOpSub
  1482  	case !add && yNegated: // rn+rm = x-(-y) = x-y
  1483  		aop = aluOpAdd
  1484  	}
  1485  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1486  	alu := m.allocateInstr()
  1487  	alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64)
  1488  	m.insert(alu)
  1489  }
  1490  
  1491  // InsertMove implements backend.Machine.
  1492  func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) {
  1493  	instr := m.allocateInstr()
  1494  	switch typ {
  1495  	case ssa.TypeI32, ssa.TypeI64:
  1496  		instr.asMove64(dst, src)
  1497  	case ssa.TypeF32, ssa.TypeF64:
  1498  		instr.asFpuMov64(dst, src)
  1499  	case ssa.TypeV128:
  1500  		instr.asFpuMov128(dst, src)
  1501  	default:
  1502  		panic("TODO")
  1503  	}
  1504  	m.insert(instr)
  1505  }
  1506  
  1507  func (m *machine) lowerIcmp(si *ssa.Instruction) {
  1508  	x, y, c := si.IcmpData()
  1509  	flag := condFlagFromSSAIntegerCmpCond(c)
  1510  
  1511  	in64bit := x.Type().Bits() == 64
  1512  	var ext extMode
  1513  	if in64bit {
  1514  		if c.Signed() {
  1515  			ext = extModeSignExtend64
  1516  		} else {
  1517  			ext = extModeZeroExtend64
  1518  		}
  1519  	} else {
  1520  		if c.Signed() {
  1521  			ext = extModeSignExtend32
  1522  		} else {
  1523  			ext = extModeZeroExtend32
  1524  		}
  1525  	}
  1526  
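        	// The comparison is a flag-setting subtraction against the zero register followed by cset; e.g. a
        	// signed 64-bit "less than" becomes roughly `subs xzr, x0, x1` + `cset x2, lt` (registers illustrative).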
  1527  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
  1528  	rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext)
  1529  	alu := m.allocateInstr()
  1530  	alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit)
  1531  	m.insert(alu)
  1532  
  1533  	cset := m.allocateInstr()
  1534  	cset.asCSet(m.compiler.VRegOf(si.Return()), false, flag)
  1535  	m.insert(cset)
  1536  }
  1537  
  1538  func (m *machine) lowerVIcmp(si *ssa.Instruction) {
  1539  	x, y, c, lane := si.VIcmpData()
  1540  	flag := condFlagFromSSAIntegerCmpCond(c)
  1541  	arr := ssaLaneToArrangement(lane)
  1542  
  1543  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1544  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1545  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1546  
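        	// There are no two-register cmlt/cmle/cmlo/cmls instructions, so those conditions are realized
        	// below by swapping rn and rm and using cmgt/cmge/cmhi/cmhs instead.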
  1547  	switch flag {
  1548  	case eq:
  1549  		cmp := m.allocateInstr()
  1550  		cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr)
  1551  		m.insert(cmp)
  1552  	case ne:
  1553  		cmp := m.allocateInstr()
  1554  		cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr)
  1555  		m.insert(cmp)
  1556  		not := m.allocateInstr()
  1557  		not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
  1558  		m.insert(not)
  1559  	case ge:
  1560  		cmp := m.allocateInstr()
  1561  		cmp.asVecRRR(vecOpCmge, rd, rn, rm, arr)
  1562  		m.insert(cmp)
  1563  	case gt:
  1564  		cmp := m.allocateInstr()
  1565  		cmp.asVecRRR(vecOpCmgt, rd, rn, rm, arr)
  1566  		m.insert(cmp)
  1567  	case le:
  1568  		cmp := m.allocateInstr()
  1569  		cmp.asVecRRR(vecOpCmge, rd, rm, rn, arr) // rm, rn are swapped
  1570  		m.insert(cmp)
  1571  	case lt:
  1572  		cmp := m.allocateInstr()
  1573  		cmp.asVecRRR(vecOpCmgt, rd, rm, rn, arr) // rm, rn are swapped
  1574  		m.insert(cmp)
  1575  	case hs:
  1576  		cmp := m.allocateInstr()
  1577  		cmp.asVecRRR(vecOpCmhs, rd, rn, rm, arr)
  1578  		m.insert(cmp)
  1579  	case hi:
  1580  		cmp := m.allocateInstr()
  1581  		cmp.asVecRRR(vecOpCmhi, rd, rn, rm, arr)
  1582  		m.insert(cmp)
  1583  	case ls:
  1584  		cmp := m.allocateInstr()
  1585  		cmp.asVecRRR(vecOpCmhs, rd, rm, rn, arr) // rm, rn are swapped
  1586  		m.insert(cmp)
  1587  	case lo:
  1588  		cmp := m.allocateInstr()
  1589  		cmp.asVecRRR(vecOpCmhi, rd, rm, rn, arr) // rm, rn are swapped
  1590  		m.insert(cmp)
  1591  	}
  1592  }
  1593  
  1594  func (m *machine) lowerVFcmp(si *ssa.Instruction) {
  1595  	x, y, c, lane := si.VFcmpData()
  1596  	flag := condFlagFromSSAFloatCmpCond(c)
  1597  	arr := ssaLaneToArrangement(lane)
  1598  
  1599  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1600  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1601  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1602  
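        	// fcmlt/fcmle only exist in the compare-against-zero form, so the `mi` (less than) and `ls`
        	// (less than or equal) cases below swap rn and rm and use fcmgt/fcmge instead.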
  1603  	switch flag {
  1604  	case eq:
  1605  		cmp := m.allocateInstr()
  1606  		cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr)
  1607  		m.insert(cmp)
  1608  	case ne:
  1609  		cmp := m.allocateInstr()
  1610  		cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr)
  1611  		m.insert(cmp)
  1612  		not := m.allocateInstr()
  1613  		not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
  1614  		m.insert(not)
  1615  	case ge:
  1616  		cmp := m.allocateInstr()
  1617  		cmp.asVecRRR(vecOpFcmge, rd, rn, rm, arr)
  1618  		m.insert(cmp)
  1619  	case gt:
  1620  		cmp := m.allocateInstr()
  1621  		cmp.asVecRRR(vecOpFcmgt, rd, rn, rm, arr)
  1622  		m.insert(cmp)
  1623  	case mi:
  1624  		cmp := m.allocateInstr()
  1625  		cmp.asVecRRR(vecOpFcmgt, rd, rm, rn, arr) // rm, rn are swapped
  1626  		m.insert(cmp)
  1627  	case ls:
  1628  		cmp := m.allocateInstr()
  1629  		cmp.asVecRRR(vecOpFcmge, rd, rm, rn, arr) // rm, rn are swapped
  1630  		m.insert(cmp)
  1631  	}
  1632  }
  1633  
  1634  func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) {
  1635  	cvt := m.allocateInstr()
  1636  	if signed {
  1637  		cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr)
  1638  	} else {
  1639  		cvt.asVecMisc(vecOpFcvtzu, rd, rn, arr)
  1640  	}
  1641  	m.insert(cvt)
  1642  
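        	// For the 2D (f64x2) case, the two 64-bit results are then narrowed into the low two 32-bit lanes
        	// with a saturating narrow; sqxtn/uqxtn clear the upper 64 bits of the destination, which matches
        	// the zeroing behavior expected from i32x4.trunc_sat_f64x2_{s,u}_zero.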
  1643  	if arr == vecArrangement2D {
  1644  		narrow := m.allocateInstr()
  1645  		if signed {
  1646  			narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S)
  1647  		} else {
  1648  			narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S)
  1649  		}
  1650  		m.insert(narrow)
  1651  	}
  1652  }
  1653  
  1654  func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) {
  1655  	cvt := m.allocateInstr()
  1656  	if signed {
  1657  		cvt.asVecMisc(vecOpScvtf, rd, rn, arr)
  1658  	} else {
  1659  		cvt.asVecMisc(vecOpUcvtf, rd, rn, arr)
  1660  	}
  1661  	m.insert(cvt)
  1662  }
  1663  
  1664  func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) {
  1665  	x, amount := si.Arg2()
  1666  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
  1667  	rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits())
  1668  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1669  
  1670  	alu := m.allocateInstr()
  1671  	alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64)
  1672  	m.insert(alu)
  1673  }
  1674  
  1675  func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult bool) {
  1676  	x, y := si.Arg2()
  1677  
  1678  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1679  	rn := m.getOperand_NR(xDef, extModeNone)
  1680  
  1681  	var rd operand
  1682  	if ignoreResult {
  1683  		rd = operandNR(xzrVReg)
  1684  	} else {
  1685  		rd = operandNR(m.compiler.VRegOf(si.Return()))
  1686  	}
  1687  
  1688  	_64 := x.Type().Bits() == 64
  1689  	alu := m.allocateInstr()
  1690  	if instr := yDef.Instr; instr != nil && instr.Constant() {
  1691  		c := instr.ConstantVal()
  1692  		if isBitMaskImmediate(c, _64) {
  1693  			// Constant bitwise operations can be lowered to a single instruction.
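        			// For example, `band x, 0xff` becomes a single `and rd, rn, #0xff`: AArch64 logical immediates
        			// are rotated, repeated runs of contiguous ones, which is what isBitMaskImmediate checks for.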
  1694  			alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64)
  1695  			m.insert(alu)
  1696  			return
  1697  		}
  1698  	}
  1699  
  1700  	rm := m.getOperand_SR_NR(yDef, extModeNone)
  1701  	alu.asALU(op, rd, rn, rm, _64)
  1702  	m.insert(alu)
  1703  }
  1704  
  1705  func (m *machine) lowerRotl(si *ssa.Instruction) {
  1706  	x, y := si.Arg2()
  1707  	r := si.Return()
  1708  	_64 := r.Type().Bits() == 64
  1709  
  1710  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1711  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1712  	var tmp operand
  1713  	if _64 {
  1714  		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
  1715  	} else {
  1716  		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
  1717  	}
  1718  	rd := operandNR(m.compiler.VRegOf(r))
  1719  
  1720  	// rotl is encoded as neg + rotr; see lowerRotlImpl.
  1721  	m.lowerRotlImpl(rd, rn, rm, tmp, _64)
  1722  }
  1723  
  1724  func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) {
  1725  	// Encode rotl as neg + rotr: neg is a sub against the zero-reg.
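        	// For example, for 32 bits this emits roughly (registers illustrative):
        	//
        	//    neg w2, w1        ;; an alias of `sub w2, wzr, w1`
        	//    ror w0, w0, w2    ;; rotating right by -amount == rotating left by amount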
  1726  	neg := m.allocateInstr()
  1727  	neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit)
  1728  	m.insert(neg)
  1729  	alu := m.allocateInstr()
  1730  	alu.asALU(aluOpRotR, rd, rn, tmp, is64bit)
  1731  	m.insert(alu)
  1732  }
  1733  
  1734  func (m *machine) lowerRotr(si *ssa.Instruction) {
  1735  	x, y := si.Arg2()
  1736  
  1737  	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
  1738  	rn := m.getOperand_NR(xDef, extModeNone)
  1739  	rm := m.getOperand_NR(yDef, extModeNone)
  1740  	rd := operandNR(m.compiler.VRegOf(si.Return()))
  1741  
  1742  	alu := m.allocateInstr()
  1743  	alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64)
  1744  	m.insert(alu)
  1745  }
  1746  
  1747  func (m *machine) lowerExtend(arg, ret ssa.Value, from, to byte, signed bool) {
  1748  	rd := m.compiler.VRegOf(ret)
  1749  	def := m.compiler.ValueDefinition(arg)
  1750  
  1751  	if instr := def.Instr; !signed && from == 32 && instr != nil {
  1752  		// We can optimize out the unsigned extend because:
  1753  		// 	Writes to the W register set bits [63:32] of the X register to zero
  1754  		//  https://developer.arm.com/documentation/den0024/a/An-Introduction-to-the-ARMv8-Instruction-Sets/The-ARMv8-instruction-sets/Distinguishing-between-32-bit-and-64-bit-A64-instructions
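        		// For example, an i64 uextend of a 32-bit iadd result needs no extra instruction: the 32-bit
        		// add has already zeroed bits [63:32], so a plain register copy suffices.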
  1755  		switch instr.Opcode() {
  1756  		case
  1757  			ssa.OpcodeIadd, ssa.OpcodeIsub, ssa.OpcodeLoad,
  1758  			ssa.OpcodeBand, ssa.OpcodeBor, ssa.OpcodeBnot,
  1759  			ssa.OpcodeIshl, ssa.OpcodeUshr, ssa.OpcodeSshr,
  1760  			ssa.OpcodeRotl, ssa.OpcodeRotr,
  1761  			ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32:
  1762  			// So, if the argument is the result of a 32-bit operation, we can just copy the register.
  1763  			// It is highly likely that this copy will be optimized out after register allocation.
  1764  			rn := m.compiler.VRegOf(arg)
  1765  			mov := m.allocateInstr()
  1766  			// Note: do not use move32 here, as it would be lowered to a 32-bit move, which is not a plain copy (that is actually how UExtend itself is implemented).
  1767  			mov.asMove64(rd, rn)
  1768  			m.insert(mov)
  1769  			return
  1770  		default:
  1771  		}
  1772  	}
  1773  	rn := m.getOperand_NR(def, extModeNone)
  1774  
  1775  	ext := m.allocateInstr()
  1776  	ext.asExtend(rd, rn.nr(), from, to, signed)
  1777  	m.insert(ext)
  1778  }
  1779  
  1780  func (m *machine) lowerFcmp(x, y, result ssa.Value, c ssa.FloatCmpCond) {
  1781  	rn, rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1782  
  1783  	fc := m.allocateInstr()
  1784  	fc.asFpuCmp(rn, rm, x.Type().Bits() == 64)
  1785  	m.insert(fc)
  1786  
  1787  	cset := m.allocateInstr()
  1788  	cset.asCSet(m.compiler.VRegOf(result), false, condFlagFromSSAFloatCmpCond(c))
  1789  	m.insert(cset)
  1790  }
  1791  
  1792  func (m *machine) lowerImul(x, y, result ssa.Value) {
  1793  	rd := m.compiler.VRegOf(result)
  1794  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1795  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1796  
  1797  	// TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg.
  1798  
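        	// mul is an alias of madd with the zero register as the addend: `madd rd, rn, rm, xzr` == `mul rd, rn, rm`.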
  1799  	mul := m.allocateInstr()
  1800  	mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64)
  1801  	m.insert(mul)
  1802  }
  1803  
  1804  func (m *machine) lowerClz(x, result ssa.Value) {
  1805  	rd := m.compiler.VRegOf(result)
  1806  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1807  	clz := m.allocateInstr()
  1808  	clz.asBitRR(bitOpClz, rd, rn.nr(), x.Type().Bits() == 64)
  1809  	m.insert(clz)
  1810  }
  1811  
  1812  func (m *machine) lowerCtz(x, result ssa.Value) {
  1813  	rd := m.compiler.VRegOf(result)
  1814  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
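        	// ctz(x) is computed as clz(rbit(x)): reverse the bit order, then count the leading zeros,
        	// since base AArch64 has no scalar count-trailing-zeros instruction.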
  1815  	rbit := m.allocateInstr()
  1816  	_64 := x.Type().Bits() == 64
  1817  	var tmpReg regalloc.VReg
  1818  	if _64 {
  1819  		tmpReg = m.compiler.AllocateVReg(ssa.TypeI64)
  1820  	} else {
  1821  		tmpReg = m.compiler.AllocateVReg(ssa.TypeI32)
  1822  	}
  1823  	rbit.asBitRR(bitOpRbit, tmpReg, rn.nr(), _64)
  1824  	m.insert(rbit)
  1825  
  1826  	clz := m.allocateInstr()
  1827  	clz.asBitRR(bitOpClz, rd, tmpReg, _64)
  1828  	m.insert(clz)
  1829  }
  1830  
  1831  func (m *machine) lowerPopcnt(x, result ssa.Value) {
  1832  	// arm64 doesn't have an instruction for population count on scalar registers,
  1833  	// so we use the vector instruction `cnt`.
  1834  	// This is exactly how the official Go compiler implements bits.OnesCount.
  1835  	// For example, "func() int { return bits.OnesCount(10) }" is compiled as
  1836  	//
  1837  	//    MOVD    $10, R0 ;; Load 10.
  1838  	//    FMOVD   R0, F0
  1839  	//    VCNT    V0.B8, V0.B8
  1840  	//    UADDLV  V0.B8, V0
  1841  	//
  1842  	// In aarch64 asm, FMOVD is encoded as `ins`, VCNT is `cnt`,
  1843  	// and the registers may use different names. In our encoding we use the following
  1844  	// instructions:
  1845  	//
  1846  	//    ins v0.d[0], x0     ;; mov from GPR to vec (FMOV above) is encoded as INS
  1847  	//    cnt v0.16b, v0.16b  ;; we use vec arrangement 16b
  1848  	//    uaddlv h0, v0.8b    ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b
  1849  	//    mov x5, v0.d[0]     ;; finally we mov the result back to a GPR
  1850  	//
  1851  
  1852  	rd := operandNR(m.compiler.VRegOf(result))
  1853  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1854  
  1855  	rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1856  	ins := m.allocateInstr()
  1857  	ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0))
  1858  	m.insert(ins)
  1859  
  1860  	rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1861  	cnt := m.allocateInstr()
  1862  	cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B)
  1863  	m.insert(cnt)
  1864  
  1865  	rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
  1866  	uaddlv := m.allocateInstr()
  1867  	uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B)
  1868  	m.insert(uaddlv)
  1869  
  1870  	mov := m.allocateInstr()
  1871  	mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0), false)
  1872  	m.insert(mov)
  1873  }
  1874  
  1875  // lowerExitWithCode emits the sequence that exits the execution with the given code, taking the execution context pointer as an argument.
  1876  func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.ExitCode) {
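        	// The sequence below stores three things into the execution context struct pointed to by execCtxVReg:
        	// the 32-bit exit code, the current stack pointer, and the address of this exit point; the trailing
        	// exit sequence then transfers control out of the generated code back to the Go side.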
  1877  	tmpReg1 := m.compiler.AllocateVReg(ssa.TypeI32)
  1878  	loadExitCodeConst := m.allocateInstr()
  1879  	loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true)
  1880  
  1881  	setExitCode := m.allocateInstr()
  1882  	setExitCode.asStore(operandNR(tmpReg1),
  1883  		addressMode{
  1884  			kind: addressModeKindRegUnsignedImm12,
  1885  			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
  1886  		}, 32)
  1887  
  1888  	// In order to unwind the stack, we also need to save the current stack pointer:
  1889  	tmp2 := m.compiler.AllocateVReg(ssa.TypeI64)
  1890  	movSpToTmp := m.allocateInstr()
  1891  	movSpToTmp.asMove64(tmp2, spVReg)
  1892  	strSpToExecCtx := m.allocateInstr()
  1893  	strSpToExecCtx.asStore(operandNR(tmp2),
  1894  		addressMode{
  1895  			kind: addressModeKindRegUnsignedImm12,
  1896  			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
  1897  		}, 64)
  1898  	// Also save the address of this exit point.
  1899  	tmp3 := m.compiler.AllocateVReg(ssa.TypeI64)
  1900  	currentAddrToTmp := m.allocateInstr()
  1901  	currentAddrToTmp.asAdr(tmp3, 0)
  1902  	storeCurrentAddrToExecCtx := m.allocateInstr()
  1903  	storeCurrentAddrToExecCtx.asStore(operandNR(tmp3),
  1904  		addressMode{
  1905  			kind: addressModeKindRegUnsignedImm12,
  1906  			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
  1907  		}, 64)
  1908  
  1909  	exitSeq := m.allocateInstr()
  1910  	exitSeq.asExitSequence(execCtxVReg)
  1911  
  1912  	m.insert(loadExitCodeConst)
  1913  	m.insert(setExitCode)
  1914  	m.insert(movSpToTmp)
  1915  	m.insert(strSpToExecCtx)
  1916  	m.insert(currentAddrToTmp)
  1917  	m.insert(storeCurrentAddrToExecCtx)
  1918  	m.insert(exitSeq)
  1919  }
  1920  
  1921  func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) {
  1922  	if x.Type() != y.Type() {
  1923  		panic(
  1924  			fmt.Sprintf("TODO(maybe): support icmp with different types: v%d=%s != v%d=%s",
  1925  				x.ID(), x.Type(), y.ID(), y.Type()))
  1926  	}
  1927  
  1928  	extMod := extModeOf(x.Type(), signed)
  1929  
  1930  	// First operand must be in pure register form.
  1931  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extMod)
  1932  	// Second operand can be in any of Imm12, ER, SR, or NR form supported by the SUBS instructions.
  1933  	rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), extMod)
  1934  
  1935  	alu := m.allocateInstr()
  1936  	// subs zr, rn, rm
  1937  	alu.asALU(
  1938  		aluOpSubS,
  1939  		// We don't need the result, just need to set flags.
  1940  		operandNR(xzrVReg),
  1941  		rn,
  1942  		rm,
  1943  		x.Type().Bits() == 64,
  1944  	)
  1945  	m.insert(alu)
  1946  }
  1947  
  1948  func (m *machine) lowerFcmpToFlag(x, y ssa.Value) {
  1949  	if x.Type() != y.Type() {
  1950  		panic("TODO(maybe): support fcmp with different types")
  1951  	}
  1952  
  1953  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  1954  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  1955  	cmp := m.allocateInstr()
  1956  	cmp.asFpuCmp(rn, rm, x.Type().Bits() == 64)
  1957  	m.insert(cmp)
  1958  }
  1959  
  1960  func (m *machine) lowerExitIfTrueWithCode(execCtxVReg regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) {
  1961  	condDef := m.compiler.ValueDefinition(cond)
  1962  	if !m.compiler.MatchInstr(condDef, ssa.OpcodeIcmp) {
  1963  		panic("TODO: OpcodeExitIfTrueWithCode must come after Icmp at the moment: " + condDef.Instr.Opcode().String())
  1964  	}
  1965  	condDef.Instr.MarkLowered()
  1966  
  1967  	cvalInstr := condDef.Instr
  1968  	x, y, c := cvalInstr.IcmpData()
  1969  	signed := c.Signed()
  1970  
  1971  	if !m.tryLowerBandToFlag(x, y) {
  1972  		m.lowerIcmpToFlag(x, y, signed)
  1973  	}
  1974  
  1975  	// We need to copy the execution context to a temp register, because if it's spilled,
  1976  	// it might end up being reloaded inside the exiting branch.
  1977  	execCtxTmp := m.copyToTmp(execCtxVReg)
  1978  
  1979  	// We have to skip the entire exit sequence if the condition is false.
  1980  	cbr := m.allocateInstr()
  1981  	m.insert(cbr)
  1982  	m.lowerExitWithCode(execCtxTmp, code)
  1983  	// Conditional branch target is after exit.
  1984  	l := m.insertBrTargetLabel()
  1985  	cbr.asCondBr(condFlagFromSSAIntegerCmpCond(c).invert().asCond(), l, false /* ignored */)
  1986  }
  1987  
  1988  func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
  1989  	cvalDef := m.compiler.ValueDefinition(c)
  1990  
  1991  	var cc condFlag
  1992  	switch {
  1993  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // This case, we can use the ALU flag set by SUBS instruction.
  1994  		cvalInstr := cvalDef.Instr
  1995  		x, y, c := cvalInstr.IcmpData()
  1996  		cc = condFlagFromSSAIntegerCmpCond(c)
  1997  		m.lowerIcmpToFlag(x, y, c.Signed())
  1998  		cvalDef.Instr.MarkLowered()
  1999  	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly.
  2000  		cvalInstr := cvalDef.Instr
  2001  		x, y, c := cvalInstr.FcmpData()
  2002  		cc = condFlagFromSSAFloatCmpCond(c)
  2003  		m.lowerFcmpToFlag(x, y)
  2004  		cvalDef.Instr.MarkLowered()
  2005  	default:
  2006  		rn := m.getOperand_NR(cvalDef, extModeNone)
  2007  		if c.Type() != ssa.TypeI32 && c.Type() != ssa.TypeI64 {
  2008  			panic("TODO?BUG?: support select with non-integer condition")
  2009  		}
  2010  		alu := m.allocateInstr()
  2011  		// subs zr, rn, zr
  2012  		alu.asALU(
  2013  			aluOpSubS,
  2014  			// We don't need the result, just need to set flags.
  2015  			operandNR(xzrVReg),
  2016  			rn,
  2017  			operandNR(xzrVReg),
  2018  			c.Type().Bits() == 64,
  2019  		)
  2020  		m.insert(alu)
  2021  		cc = ne
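        		// i.e. the flags now encode "condition value != 0", so `ne` below selects x.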
  2022  	}
  2023  
  2024  	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
  2025  	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
  2026  
  2027  	rd := operandNR(m.compiler.VRegOf(result))
  2028  	switch x.Type() {
  2029  	case ssa.TypeI32, ssa.TypeI64:
  2030  		// csel rd, rn, rm, cc
  2031  		csel := m.allocateInstr()
  2032  		csel.asCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
  2033  		m.insert(csel)
  2034  	case ssa.TypeF32, ssa.TypeF64:
  2035  		// fcsel rd, rn, rm, cc
  2036  		fcsel := m.allocateInstr()
  2037  		fcsel.asFpuCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
  2038  		m.insert(fcsel)
  2039  	default:
  2040  		panic("BUG")
  2041  	}
  2042  }
  2043  
  2044  func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
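        	// The emitted sequence is roughly (registers illustrative):
        	//
        	//    subs  wzr, w0, wzr             ;; set flags from the condition value
        	//    csetm x1, ne                   ;; x1 = all ones if the condition is non-zero, else zero
        	//    dup   v0.2d, x1                ;; broadcast the mask into a vector register
        	//    bsl   v0.16b, v1.16b, v2.16b   ;; bitwise select between the two inputs
        	//    mov   v3.16b, v0.16b           ;; move the result into the destination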
  2045  	// First check if `rc` is zero or not.
  2046  	checkZero := m.allocateInstr()
  2047  	checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false)
  2048  	m.insert(checkZero)
  2049  
  2050  	// Then use CSETM to set all bits to one if `rc` is non-zero, and to zero otherwise.
  2051  	allOnesOrZero := m.compiler.AllocateVReg(ssa.TypeI64)
  2052  	cset := m.allocateInstr()
  2053  	cset.asCSet(allOnesOrZero, true, ne)
  2054  	m.insert(cset)
  2055  
  2056  	// Then move the bits to the result vector register.
  2057  	tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
  2058  	dup := m.allocateInstr()
  2059  	dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D)
  2060  	m.insert(dup)
  2061  
  2062  	// Now that `tmp2` has either all bits one or zero depending on `rc`,
  2063  	// we can use bsl to select between `rn` and `rm`.
  2064  	ins := m.allocateInstr()
  2065  	ins.asVecRRRRewrite(vecOpBsl, tmp2, rn, rm, vecArrangement16B)
  2066  	m.insert(ins)
  2067  
  2068  	// Finally, move the result to the destination register.
  2069  	mov2 := m.allocateInstr()
  2070  	mov2.asFpuMov128(rd.nr(), tmp2.nr())
  2071  	m.insert(mov2)
  2072  }
  2073  
  2074  func (m *machine) lowerAtomicRmw(si *ssa.Instruction) {
  2075  	ssaOp, size := si.AtomicRmwData()
  2076  
  2077  	var op atomicRmwOp
  2078  	var negateArg bool
  2079  	var flipArg bool
  2080  	switch ssaOp {
  2081  	case ssa.AtomicRmwOpAdd:
  2082  		op = atomicRmwOpAdd
  2083  	case ssa.AtomicRmwOpSub:
  2084  		op = atomicRmwOpAdd
  2085  		negateArg = true
  2086  	case ssa.AtomicRmwOpAnd:
  2087  		op = atomicRmwOpClr
  2088  		flipArg = true
  2089  	case ssa.AtomicRmwOpOr:
  2090  		op = atomicRmwOpSet
  2091  	case ssa.AtomicRmwOpXor:
  2092  		op = atomicRmwOpEor
  2093  	case ssa.AtomicRmwOpXchg:
  2094  		op = atomicRmwOpSwp
  2095  	default:
  2096  		panic(fmt.Sprintf("unknown ssa atomic rmw op: %s", ssaOp))
  2097  	}
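        	// For example, atomic sub is realized as the atomic add operation with a negated argument
        	// (x - y == x + (-y)), and atomic and as the atomic "clear" operation with a flipped argument
        	// (old & ^(^y) == old & y), since the hardware provides add/clr/set/eor/swp primitives.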
  2098  
  2099  	addr, val := si.Arg2()
  2100  	addrDef, valDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(val)
  2101  	rn := m.getOperand_NR(addrDef, extModeNone)
  2102  	rt := operandNR(m.compiler.VRegOf(si.Return()))
  2103  	rs := m.getOperand_NR(valDef, extModeNone)
  2104  
  2105  	_64 := si.Return().Type().Bits() == 64
  2106  	var tmp operand
  2107  	if _64 {
  2108  		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
  2109  	} else {
  2110  		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
  2111  	}
  2112  	m.lowerAtomicRmwImpl(op, rn, rs, rt, tmp, size, negateArg, flipArg, _64)
  2113  }
  2114  
  2115  func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp operand, size uint64, negateArg, flipArg, dst64bit bool) {
  2116  	switch {
  2117  	case negateArg:
  2118  		neg := m.allocateInstr()
  2119  		neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rs, dst64bit)
  2120  		m.insert(neg)
  2121  	case flipArg:
  2122  		flip := m.allocateInstr()
  2123  		flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), rs, dst64bit)
  2124  		m.insert(flip)
  2125  	default:
  2126  		tmp = rs
  2127  	}
  2128  
  2129  	rmw := m.allocateInstr()
  2130  	rmw.asAtomicRmw(op, rn, tmp, rt, size)
  2131  	m.insert(rmw)
  2132  }
  2133  
  2134  func (m *machine) lowerAtomicCas(si *ssa.Instruction) {
  2135  	addr, exp, repl := si.Arg3()
  2136  	size := si.AtomicTargetSize()
  2137  
  2138  	addrDef, expDef, replDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(exp), m.compiler.ValueDefinition(repl)
  2139  	rn := m.getOperand_NR(addrDef, extModeNone)
  2140  	rt := m.getOperand_NR(replDef, extModeNone)
  2141  	rs := m.getOperand_NR(expDef, extModeNone)
  2142  	tmp := operandNR(m.compiler.AllocateVReg(si.Return().Type()))
  2143  
  2144  	_64 := si.Return().Type().Bits() == 64
  2145  	// rs (the expected value) is overwritten by CAS with the value loaded from memory, so we first copy it
  2146  	// into a temporary register in case the original value is used somewhere else.
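        	// (CAS compares the value at [rn] with rs and, if they are equal, stores rt there; in either case
        	// the value read from memory is written back into rs, which is why tmp ends up holding the result.)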
  2147  	mov := m.allocateInstr()
  2148  	if _64 {
  2149  		mov.asMove64(tmp.nr(), rs.nr())
  2150  	} else {
  2151  		mov.asMove32(tmp.nr(), rs.nr())
  2152  	}
  2153  	m.insert(mov)
  2154  
  2155  	m.lowerAtomicCasImpl(rn, tmp, rt, size)
  2156  
  2157  	mov2 := m.allocateInstr()
  2158  	rd := m.compiler.VRegOf(si.Return())
  2159  	if _64 {
  2160  		mov2.asMove64(rd, tmp.nr())
  2161  	} else {
  2162  		mov2.asMove32(rd, tmp.nr())
  2163  	}
  2164  	m.insert(mov2)
  2165  }
  2166  
  2167  func (m *machine) lowerAtomicCasImpl(rn, rs, rt operand, size uint64) {
  2168  	cas := m.allocateInstr()
  2169  	cas.asAtomicCas(rn, rs, rt, size)
  2170  	m.insert(cas)
  2171  }
  2172  
  2173  func (m *machine) lowerAtomicLoad(si *ssa.Instruction) {
  2174  	addr := si.Arg()
  2175  	size := si.AtomicTargetSize()
  2176  
  2177  	addrDef := m.compiler.ValueDefinition(addr)
  2178  	rn := m.getOperand_NR(addrDef, extModeNone)
  2179  	rt := operandNR(m.compiler.VRegOf(si.Return()))
  2180  
  2181  	m.lowerAtomicLoadImpl(rn, rt, size)
  2182  }
  2183  
  2184  func (m *machine) lowerAtomicLoadImpl(rn, rt operand, size uint64) {
  2185  	ld := m.allocateInstr()
  2186  	ld.asAtomicLoad(rn, rt, size)
  2187  	m.insert(ld)
  2188  }
  2189  
  2190  func (m *machine) lowerAtomicStore(si *ssa.Instruction) {
  2191  	addr, val := si.Arg2()
  2192  	size := si.AtomicTargetSize()
  2193  
  2194  	addrDef := m.compiler.ValueDefinition(addr)
  2195  	valDef := m.compiler.ValueDefinition(val)
  2196  	rn := m.getOperand_NR(addrDef, extModeNone)
  2197  	rt := m.getOperand_NR(valDef, extModeNone)
  2198  
  2199  	m.lowerAtomicStoreImpl(rn, rt, size)
  2200  }
  2201  
  2202  func (m *machine) lowerAtomicStoreImpl(rn, rt operand, size uint64) {
  2203  	ld := m.allocateInstr()
  2204  	ld.asAtomicStore(rn, rt, size)
  2205  	m.insert(ld)
  2206  }
  2207  
  2208  // copyToTmp copies the given regalloc.VReg to a temporary register. This is called before a conditional branch to avoid
  2209  // register allocation issues, e.g. a reload happening in the middle of the exit sequence, which is not on the normal execution path.
  2210  func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg {
  2211  	typ := m.compiler.TypeOf(v)
  2212  	mov := m.allocateInstr()
  2213  	tmp := m.compiler.AllocateVReg(typ)
  2214  	if typ.IsInt() {
  2215  		mov.asMove64(tmp, v)
  2216  	} else {
  2217  		mov.asFpuMov128(tmp, v)
  2218  	}
  2219  	m.insert(mov)
  2220  	return tmp
  2221  }