github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/arm64/lower_instr.go 1 package arm64 2 3 // Files prefixed with lower_instr** implement instruction selection, i.e. lowering SSA-level instructions 4 // into machine-specific instructions. 5 // 6 // Importantly, the lower** functions also perform tree-matching: they find a pattern in the given instruction tree, 7 // and merge multiple instructions where possible. This can be considered "N:1" instruction selection. 8 9 import ( 10 "fmt" 11 "math" 12 13 "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" 14 "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" 15 "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" 16 ) 17 18 // LowerSingleBranch implements backend.Machine. 19 func (m *machine) LowerSingleBranch(br *ssa.Instruction) { 20 ectx := m.executableContext 21 switch br.Opcode() { 22 case ssa.OpcodeJump: 23 _, _, targetBlk := br.BranchData() 24 if br.IsFallthroughJump() { 25 return 26 } 27 b := m.allocateInstr() 28 target := ectx.GetOrAllocateSSABlockLabel(targetBlk) 29 if target == labelReturn { 30 b.asRet() 31 } else { 32 b.asBr(target) 33 } 34 m.insert(b) 35 case ssa.OpcodeBrTable: 36 m.lowerBrTable(br) 37 default: 38 panic("BUG: unexpected branch opcode: " + br.Opcode().String()) 39 } 40 } 41 42 func (m *machine) lowerBrTable(i *ssa.Instruction) { 43 index, targets := i.BrTableData() 44 indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone) 45 46 // First, we have to bounds-check the index, and 47 // set it to the default target (sitting at the end of the list) if it's out of bounds. 48 49 // mov maxIndexReg #maximum_index 50 // subs wzr, index, maxIndexReg 51 // csel adjustedIndex, maxIndexReg, index, hs ;; if index is higher than or equal to maxIndexReg. 52 maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32) 53 m.lowerConstantI32(maxIndexReg, int32(len(targets)-1)) 54 subs := m.allocateInstr() 55 subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false) 56 m.insert(subs) 57 csel := m.allocateInstr() 58 adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32) 59 csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false) 60 m.insert(csel) 61 62 brSequence := m.allocateInstr() 63 64 tableIndex := m.addJmpTableTarget(targets) 65 brSequence.asBrTableSequence(adjustedIndex, tableIndex, len(targets)) 66 m.insert(brSequence) 67 } 68 69 // LowerConditionalBranch implements backend.Machine. 70 func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { 71 exctx := m.executableContext 72 cval, args, targetBlk := b.BranchData() 73 if len(args) > 0 { 74 panic(fmt.Sprintf( 75 "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", 76 exctx.CurrentSSABlk, 77 targetBlk, 78 )) 79 } 80 81 target := exctx.GetOrAllocateSSABlockLabel(targetBlk) 82 cvalDef := m.compiler.ValueDefinition(cval) 83 84 switch { 85 case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // In this case, we can use the ALU flags set by the SUBS instruction.
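		// As an illustrative sketch (not emitted verbatim), `brnz (icmp slt x, y), L` is fused into roughly:
		//
		//   subs xzr, x, y   ;; set NZCV and discard the result
		//   b.lt L
		//
		// and for brz the condition is simply inverted (b.ge L here).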
86 cvalInstr := cvalDef.Instr 87 x, y, c := cvalInstr.IcmpData() 88 cc, signed := condFlagFromSSAIntegerCmpCond(c), c.Signed() 89 if b.Opcode() == ssa.OpcodeBrz { 90 cc = cc.invert() 91 } 92 93 if !m.tryLowerBandToFlag(x, y) { 94 m.lowerIcmpToFlag(x, y, signed) 95 } 96 cbr := m.allocateInstr() 97 cbr.asCondBr(cc.asCond(), target, false /* ignored */) 98 m.insert(cbr) 99 cvalDef.Instr.MarkLowered() 100 case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly. 101 cvalInstr := cvalDef.Instr 102 x, y, c := cvalInstr.FcmpData() 103 cc := condFlagFromSSAFloatCmpCond(c) 104 if b.Opcode() == ssa.OpcodeBrz { 105 cc = cc.invert() 106 } 107 m.lowerFcmpToFlag(x, y) 108 cbr := m.allocateInstr() 109 cbr.asCondBr(cc.asCond(), target, false /* ignored */) 110 m.insert(cbr) 111 cvalDef.Instr.MarkLowered() 112 default: 113 rn := m.getOperand_NR(cvalDef, extModeNone) 114 var c cond 115 if b.Opcode() == ssa.OpcodeBrz { 116 c = registerAsRegZeroCond(rn.nr()) 117 } else { 118 c = registerAsRegNotZeroCond(rn.nr()) 119 } 120 cbr := m.allocateInstr() 121 cbr.asCondBr(c, target, false) 122 m.insert(cbr) 123 } 124 } 125 126 func (m *machine) tryLowerBandToFlag(x, y ssa.Value) (ok bool) { 127 xx := m.compiler.ValueDefinition(x) 128 yy := m.compiler.ValueDefinition(y) 129 if xx.IsFromInstr() && xx.Instr.Constant() && xx.Instr.ConstantVal() == 0 { 130 if m.compiler.MatchInstr(yy, ssa.OpcodeBand) { 131 bandInstr := yy.Instr 132 m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true) 133 ok = true 134 bandInstr.MarkLowered() 135 return 136 } 137 } 138 139 if yy.IsFromInstr() && yy.Instr.Constant() && yy.Instr.ConstantVal() == 0 { 140 if m.compiler.MatchInstr(xx, ssa.OpcodeBand) { 141 bandInstr := xx.Instr 142 m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true) 143 ok = true 144 bandInstr.MarkLowered() 145 return 146 } 147 } 148 return 149 } 150 151 // LowerInstr implements backend.Machine. 152 func (m *machine) LowerInstr(instr *ssa.Instruction) { 153 if l := instr.SourceOffset(); l.Valid() { 154 info := m.allocateInstr().asEmitSourceOffsetInfo(l) 155 m.insert(info) 156 } 157 158 switch op := instr.Opcode(); op { 159 case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: 160 panic("BUG: branching instructions are handled by LowerBranches") 161 case ssa.OpcodeReturn: 162 panic("BUG: return must be handled by backend.Compiler") 163 case ssa.OpcodeIadd, ssa.OpcodeIsub: 164 m.lowerSubOrAdd(instr, op == ssa.OpcodeIadd) 165 case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv, ssa.OpcodeFmax, ssa.OpcodeFmin: 166 m.lowerFpuBinOp(instr) 167 case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. 
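		// Note: nothing is emitted for constants here; their values are materialized lazily at each use site
		// (e.g. by the getOperand_* helpers or lowerConstantI32/lowerConstantI64), so unused or folded
		// constants do not occupy registers.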
168 case ssa.OpcodeExitWithCode: 169 execCtx, code := instr.ExitWithCodeData() 170 m.lowerExitWithCode(m.compiler.VRegOf(execCtx), code) 171 case ssa.OpcodeExitIfTrueWithCode: 172 execCtx, c, code := instr.ExitIfTrueWithCodeData() 173 m.lowerExitIfTrueWithCode(m.compiler.VRegOf(execCtx), c, code) 174 case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: 175 m.lowerStore(instr) 176 case ssa.OpcodeLoad: 177 dst := instr.Return() 178 ptr, offset, typ := instr.LoadData() 179 m.lowerLoad(ptr, offset, typ, dst) 180 case ssa.OpcodeVZeroExtLoad: 181 dst := instr.Return() 182 ptr, offset, typ := instr.VZeroExtLoadData() 183 m.lowerLoad(ptr, offset, typ, dst) 184 case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: 185 ptr, offset, _ := instr.LoadData() 186 ret := m.compiler.VRegOf(instr.Return()) 187 m.lowerExtLoad(op, ptr, offset, ret) 188 case ssa.OpcodeCall, ssa.OpcodeCallIndirect: 189 m.lowerCall(instr) 190 case ssa.OpcodeIcmp: 191 m.lowerIcmp(instr) 192 case ssa.OpcodeVIcmp: 193 m.lowerVIcmp(instr) 194 case ssa.OpcodeVFcmp: 195 m.lowerVFcmp(instr) 196 case ssa.OpcodeVCeil: 197 m.lowerVecMisc(vecOpFrintp, instr) 198 case ssa.OpcodeVFloor: 199 m.lowerVecMisc(vecOpFrintm, instr) 200 case ssa.OpcodeVTrunc: 201 m.lowerVecMisc(vecOpFrintz, instr) 202 case ssa.OpcodeVNearest: 203 m.lowerVecMisc(vecOpFrintn, instr) 204 case ssa.OpcodeVMaxPseudo: 205 m.lowerVMinMaxPseudo(instr, true) 206 case ssa.OpcodeVMinPseudo: 207 m.lowerVMinMaxPseudo(instr, false) 208 case ssa.OpcodeBand: 209 m.lowerBitwiseAluOp(instr, aluOpAnd, false) 210 case ssa.OpcodeBor: 211 m.lowerBitwiseAluOp(instr, aluOpOrr, false) 212 case ssa.OpcodeBxor: 213 m.lowerBitwiseAluOp(instr, aluOpEor, false) 214 case ssa.OpcodeIshl: 215 m.lowerShifts(instr, extModeNone, aluOpLsl) 216 case ssa.OpcodeSshr: 217 if instr.Return().Type().Bits() == 64 { 218 m.lowerShifts(instr, extModeSignExtend64, aluOpAsr) 219 } else { 220 m.lowerShifts(instr, extModeSignExtend32, aluOpAsr) 221 } 222 case ssa.OpcodeUshr: 223 if instr.Return().Type().Bits() == 64 { 224 m.lowerShifts(instr, extModeZeroExtend64, aluOpLsr) 225 } else { 226 m.lowerShifts(instr, extModeZeroExtend32, aluOpLsr) 227 } 228 case ssa.OpcodeRotl: 229 m.lowerRotl(instr) 230 case ssa.OpcodeRotr: 231 m.lowerRotr(instr) 232 case ssa.OpcodeSExtend, ssa.OpcodeUExtend: 233 from, to, signed := instr.ExtendData() 234 m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) 235 case ssa.OpcodeFcmp: 236 x, y, c := instr.FcmpData() 237 m.lowerFcmp(x, y, instr.Return(), c) 238 case ssa.OpcodeImul: 239 x, y := instr.Arg2() 240 result := instr.Return() 241 m.lowerImul(x, y, result) 242 case ssa.OpcodeUndefined: 243 undef := m.allocateInstr() 244 undef.asUDF() 245 m.insert(undef) 246 case ssa.OpcodeSelect: 247 c, x, y := instr.SelectData() 248 if x.Type() == ssa.TypeV128 { 249 rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) 250 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 251 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 252 rd := operandNR(m.compiler.VRegOf(instr.Return())) 253 m.lowerSelectVec(rc, rn, rm, rd) 254 } else { 255 m.lowerSelect(c, x, y, instr.Return()) 256 } 257 case ssa.OpcodeClz: 258 x := instr.Arg() 259 result := instr.Return() 260 m.lowerClz(x, result) 261 case ssa.OpcodeCtz: 262 x := instr.Arg() 263 result := instr.Return() 264 m.lowerCtz(x, result) 265 case ssa.OpcodePopcnt: 266 x := instr.Arg() 267 result := instr.Return() 268 
m.lowerPopcnt(x, result) 269 case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: 270 x, ctx := instr.Arg2() 271 result := instr.Return() 272 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 273 rd := operandNR(m.compiler.VRegOf(result)) 274 ctxVReg := m.compiler.VRegOf(ctx) 275 m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64, 276 result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) 277 case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: 278 x, ctx := instr.Arg2() 279 result := instr.Return() 280 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 281 rd := operandNR(m.compiler.VRegOf(result)) 282 ctxVReg := m.compiler.VRegOf(ctx) 283 m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64, 284 result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) 285 case ssa.OpcodeFcvtFromSint: 286 x := instr.Arg() 287 result := instr.Return() 288 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 289 rd := operandNR(m.compiler.VRegOf(result)) 290 m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) 291 case ssa.OpcodeFcvtFromUint: 292 x := instr.Arg() 293 result := instr.Return() 294 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 295 rd := operandNR(m.compiler.VRegOf(result)) 296 m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) 297 case ssa.OpcodeFdemote: 298 v := instr.Arg() 299 rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) 300 rd := operandNR(m.compiler.VRegOf(instr.Return())) 301 cnt := m.allocateInstr() 302 cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false) 303 m.insert(cnt) 304 case ssa.OpcodeFpromote: 305 v := instr.Arg() 306 rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) 307 rd := operandNR(m.compiler.VRegOf(instr.Return())) 308 cnt := m.allocateInstr() 309 cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true) 310 m.insert(cnt) 311 case ssa.OpcodeIreduce: 312 rn := m.getOperand_NR(m.compiler.ValueDefinition(instr.Arg()), extModeNone) 313 retVal := instr.Return() 314 rd := m.compiler.VRegOf(retVal) 315 316 if retVal.Type() != ssa.TypeI32 { 317 panic("TODO?: Ireduce to non-i32") 318 } 319 mov := m.allocateInstr() 320 mov.asMove32(rd, rn.reg()) 321 m.insert(mov) 322 case ssa.OpcodeFneg: 323 m.lowerFpuUniOp(fpuUniOpNeg, instr.Arg(), instr.Return()) 324 case ssa.OpcodeSqrt: 325 m.lowerFpuUniOp(fpuUniOpSqrt, instr.Arg(), instr.Return()) 326 case ssa.OpcodeCeil: 327 m.lowerFpuUniOp(fpuUniOpRoundPlus, instr.Arg(), instr.Return()) 328 case ssa.OpcodeFloor: 329 m.lowerFpuUniOp(fpuUniOpRoundMinus, instr.Arg(), instr.Return()) 330 case ssa.OpcodeTrunc: 331 m.lowerFpuUniOp(fpuUniOpRoundZero, instr.Arg(), instr.Return()) 332 case ssa.OpcodeNearest: 333 m.lowerFpuUniOp(fpuUniOpRoundNearest, instr.Arg(), instr.Return()) 334 case ssa.OpcodeFabs: 335 m.lowerFpuUniOp(fpuUniOpAbs, instr.Arg(), instr.Return()) 336 case ssa.OpcodeBitcast: 337 m.lowerBitcast(instr) 338 case ssa.OpcodeFcopysign: 339 x, y := instr.Arg2() 340 m.lowerFcopysign(x, y, instr.Return()) 341 case ssa.OpcodeSdiv, ssa.OpcodeUdiv: 342 x, y, ctx := instr.Arg3() 343 ctxVReg := m.compiler.VRegOf(ctx) 344 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 345 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 346 rd := operandNR(m.compiler.VRegOf(instr.Return())) 347 m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv) 348 case ssa.OpcodeSrem, ssa.OpcodeUrem: 349 x, y, ctx := instr.Arg3() 350 ctxVReg := 
m.compiler.VRegOf(ctx) 351 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 352 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 353 rd := operandNR(m.compiler.VRegOf(instr.Return())) 354 m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem) 355 case ssa.OpcodeVconst: 356 result := m.compiler.VRegOf(instr.Return()) 357 lo, hi := instr.VconstData() 358 v := m.allocateInstr() 359 v.asLoadFpuConst128(result, lo, hi) 360 m.insert(v) 361 case ssa.OpcodeVbnot: 362 x := instr.Arg() 363 ins := m.allocateInstr() 364 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 365 rd := operandNR(m.compiler.VRegOf(instr.Return())) 366 ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B) 367 m.insert(ins) 368 case ssa.OpcodeVbxor: 369 x, y := instr.Arg2() 370 m.lowerVecRRR(vecOpEOR, x, y, instr.Return(), vecArrangement16B) 371 case ssa.OpcodeVbor: 372 x, y := instr.Arg2() 373 m.lowerVecRRR(vecOpOrr, x, y, instr.Return(), vecArrangement16B) 374 case ssa.OpcodeVband: 375 x, y := instr.Arg2() 376 m.lowerVecRRR(vecOpAnd, x, y, instr.Return(), vecArrangement16B) 377 case ssa.OpcodeVbandnot: 378 x, y := instr.Arg2() 379 m.lowerVecRRR(vecOpBic, x, y, instr.Return(), vecArrangement16B) 380 case ssa.OpcodeVbitselect: 381 c, x, y := instr.SelectData() 382 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 383 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 384 creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) 385 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 386 387 // creg is overwritten by BSL, so we need to move it to the result register before the instruction 388 // in case when it is used somewhere else. 389 mov := m.allocateInstr() 390 mov.asFpuMov128(tmp.nr(), creg.nr()) 391 m.insert(mov) 392 393 ins := m.allocateInstr() 394 ins.asVecRRRRewrite(vecOpBsl, tmp, rn, rm, vecArrangement16B) 395 m.insert(ins) 396 397 mov2 := m.allocateInstr() 398 rd := m.compiler.VRegOf(instr.Return()) 399 mov2.asFpuMov128(rd, tmp.nr()) 400 m.insert(mov2) 401 case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue: 402 x, lane := instr.ArgWithLane() 403 var arr vecArrangement 404 if op == ssa.OpcodeVallTrue { 405 arr = ssaLaneToArrangement(lane) 406 } 407 rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 408 rd := operandNR(m.compiler.VRegOf(instr.Return())) 409 m.lowerVcheckTrue(op, rm, rd, arr) 410 case ssa.OpcodeVhighBits: 411 x, lane := instr.ArgWithLane() 412 rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 413 rd := operandNR(m.compiler.VRegOf(instr.Return())) 414 arr := ssaLaneToArrangement(lane) 415 m.lowerVhighBits(rm, rd, arr) 416 case ssa.OpcodeVIadd: 417 x, y, lane := instr.Arg2WithLane() 418 arr := ssaLaneToArrangement(lane) 419 m.lowerVecRRR(vecOpAdd, x, y, instr.Return(), arr) 420 case ssa.OpcodeExtIaddPairwise: 421 v, lane, signed := instr.ExtIaddPairwiseData() 422 vv := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) 423 424 tmpLo, tmpHi := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 425 var widen vecOp 426 if signed { 427 widen = vecOpSshll 428 } else { 429 widen = vecOpUshll 430 } 431 432 var loArr, hiArr, dstArr vecArrangement 433 switch lane { 434 case ssa.VecLaneI8x16: 435 loArr, hiArr, dstArr = vecArrangement8B, vecArrangement16B, vecArrangement8H 436 case ssa.VecLaneI16x8: 437 loArr, hiArr, dstArr = vecArrangement4H, vecArrangement8H, vecArrangement4S 438 case ssa.VecLaneI32x4: 439 loArr, 
hiArr, dstArr = vecArrangement2S, vecArrangement4S, vecArrangement2D 440 default: 441 panic("unsupported lane " + lane.String()) 442 } 443 444 widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo, vv, operandShiftImm(0), loArr) 445 widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi, vv, operandShiftImm(0), hiArr) 446 addp := m.allocateInstr().asVecRRR(vecOpAddp, operandNR(m.compiler.VRegOf(instr.Return())), tmpLo, tmpHi, dstArr) 447 m.insert(widenLo) 448 m.insert(widenHi) 449 m.insert(addp) 450 451 case ssa.OpcodeVSaddSat: 452 x, y, lane := instr.Arg2WithLane() 453 arr := ssaLaneToArrangement(lane) 454 m.lowerVecRRR(vecOpSqadd, x, y, instr.Return(), arr) 455 case ssa.OpcodeVUaddSat: 456 x, y, lane := instr.Arg2WithLane() 457 arr := ssaLaneToArrangement(lane) 458 m.lowerVecRRR(vecOpUqadd, x, y, instr.Return(), arr) 459 case ssa.OpcodeVIsub: 460 x, y, lane := instr.Arg2WithLane() 461 arr := ssaLaneToArrangement(lane) 462 m.lowerVecRRR(vecOpSub, x, y, instr.Return(), arr) 463 case ssa.OpcodeVSsubSat: 464 x, y, lane := instr.Arg2WithLane() 465 arr := ssaLaneToArrangement(lane) 466 m.lowerVecRRR(vecOpSqsub, x, y, instr.Return(), arr) 467 case ssa.OpcodeVUsubSat: 468 x, y, lane := instr.Arg2WithLane() 469 arr := ssaLaneToArrangement(lane) 470 m.lowerVecRRR(vecOpUqsub, x, y, instr.Return(), arr) 471 case ssa.OpcodeVImin: 472 x, y, lane := instr.Arg2WithLane() 473 arr := ssaLaneToArrangement(lane) 474 m.lowerVecRRR(vecOpSmin, x, y, instr.Return(), arr) 475 case ssa.OpcodeVUmin: 476 x, y, lane := instr.Arg2WithLane() 477 arr := ssaLaneToArrangement(lane) 478 m.lowerVecRRR(vecOpUmin, x, y, instr.Return(), arr) 479 case ssa.OpcodeVImax: 480 x, y, lane := instr.Arg2WithLane() 481 arr := ssaLaneToArrangement(lane) 482 m.lowerVecRRR(vecOpSmax, x, y, instr.Return(), arr) 483 case ssa.OpcodeVUmax: 484 x, y, lane := instr.Arg2WithLane() 485 arr := ssaLaneToArrangement(lane) 486 m.lowerVecRRR(vecOpUmax, x, y, instr.Return(), arr) 487 case ssa.OpcodeVAvgRound: 488 x, y, lane := instr.Arg2WithLane() 489 arr := ssaLaneToArrangement(lane) 490 m.lowerVecRRR(vecOpUrhadd, x, y, instr.Return(), arr) 491 case ssa.OpcodeVImul: 492 x, y, lane := instr.Arg2WithLane() 493 arr := ssaLaneToArrangement(lane) 494 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 495 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 496 rd := operandNR(m.compiler.VRegOf(instr.Return())) 497 m.lowerVIMul(rd, rn, rm, arr) 498 case ssa.OpcodeVIabs: 499 m.lowerVecMisc(vecOpAbs, instr) 500 case ssa.OpcodeVIneg: 501 m.lowerVecMisc(vecOpNeg, instr) 502 case ssa.OpcodeVIpopcnt: 503 m.lowerVecMisc(vecOpCnt, instr) 504 case ssa.OpcodeVIshl, 505 ssa.OpcodeVSshr, ssa.OpcodeVUshr: 506 x, y, lane := instr.Arg2WithLane() 507 arr := ssaLaneToArrangement(lane) 508 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 509 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 510 rd := operandNR(m.compiler.VRegOf(instr.Return())) 511 m.lowerVShift(op, rd, rn, rm, arr) 512 case ssa.OpcodeVSqrt: 513 m.lowerVecMisc(vecOpFsqrt, instr) 514 case ssa.OpcodeVFabs: 515 m.lowerVecMisc(vecOpFabs, instr) 516 case ssa.OpcodeVFneg: 517 m.lowerVecMisc(vecOpFneg, instr) 518 case ssa.OpcodeVFmin: 519 x, y, lane := instr.Arg2WithLane() 520 arr := ssaLaneToArrangement(lane) 521 m.lowerVecRRR(vecOpFmin, x, y, instr.Return(), arr) 522 case ssa.OpcodeVFmax: 523 x, y, lane := instr.Arg2WithLane() 524 arr := ssaLaneToArrangement(lane) 525 m.lowerVecRRR(vecOpFmax, x, y, instr.Return(), arr) 526 case ssa.OpcodeVFadd: 527 
x, y, lane := instr.Arg2WithLane() 528 arr := ssaLaneToArrangement(lane) 529 m.lowerVecRRR(vecOpFadd, x, y, instr.Return(), arr) 530 case ssa.OpcodeVFsub: 531 x, y, lane := instr.Arg2WithLane() 532 arr := ssaLaneToArrangement(lane) 533 m.lowerVecRRR(vecOpFsub, x, y, instr.Return(), arr) 534 case ssa.OpcodeVFmul: 535 x, y, lane := instr.Arg2WithLane() 536 arr := ssaLaneToArrangement(lane) 537 m.lowerVecRRR(vecOpFmul, x, y, instr.Return(), arr) 538 case ssa.OpcodeSqmulRoundSat: 539 x, y, lane := instr.Arg2WithLane() 540 arr := ssaLaneToArrangement(lane) 541 m.lowerVecRRR(vecOpSqrdmulh, x, y, instr.Return(), arr) 542 case ssa.OpcodeVFdiv: 543 x, y, lane := instr.Arg2WithLane() 544 arr := ssaLaneToArrangement(lane) 545 m.lowerVecRRR(vecOpFdiv, x, y, instr.Return(), arr) 546 case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: 547 x, lane := instr.ArgWithLane() 548 arr := ssaLaneToArrangement(lane) 549 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 550 rd := operandNR(m.compiler.VRegOf(instr.Return())) 551 m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat) 552 case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint: 553 x, lane := instr.ArgWithLane() 554 arr := ssaLaneToArrangement(lane) 555 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 556 rd := operandNR(m.compiler.VRegOf(instr.Return())) 557 m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint) 558 case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow: 559 x, lane := instr.ArgWithLane() 560 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 561 rd := operandNR(m.compiler.VRegOf(instr.Return())) 562 563 var arr vecArrangement 564 switch lane { 565 case ssa.VecLaneI8x16: 566 arr = vecArrangement8B 567 case ssa.VecLaneI16x8: 568 arr = vecArrangement4H 569 case ssa.VecLaneI32x4: 570 arr = vecArrangement2S 571 } 572 573 shll := m.allocateInstr() 574 if signed := op == ssa.OpcodeSwidenLow; signed { 575 shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) 576 } else { 577 shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) 578 } 579 m.insert(shll) 580 case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh: 581 x, lane := instr.ArgWithLane() 582 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 583 rd := operandNR(m.compiler.VRegOf(instr.Return())) 584 585 arr := ssaLaneToArrangement(lane) 586 587 shll := m.allocateInstr() 588 if signed := op == ssa.OpcodeSwidenHigh; signed { 589 shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) 590 } else { 591 shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) 592 } 593 m.insert(shll) 594 595 case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: 596 x, y, lane := instr.Arg2WithLane() 597 var arr, arr2 vecArrangement 598 switch lane { 599 case ssa.VecLaneI16x8: // I16x8 600 arr = vecArrangement8B 601 arr2 = vecArrangement16B // Implies sqxtn2. 602 case ssa.VecLaneI32x4: 603 arr = vecArrangement4H 604 arr2 = vecArrangement8H // Implies sqxtn2. 605 default: 606 panic("unsupported lane " + lane.String()) 607 } 608 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 609 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 610 rd := operandNR(m.compiler.VRegOf(instr.Return())) 611 612 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 613 614 loQxtn := m.allocateInstr() 615 hiQxtn := m.allocateInstr() 616 if signed := op == ssa.OpcodeSnarrow; signed { 617 // Narrow lanes on rn and write them into lower-half of rd. 
618 loQxtn.asVecMisc(vecOpSqxtn, tmp, rn, arr) // low 619 // Narrow lanes on rm and write them into higher-half of rd. 620 hiQxtn.asVecMisc(vecOpSqxtn, tmp, rm, arr2) // high (sqxtn2) 621 } else { 622 // Narrow lanes on rn and write them into lower-half of rd. 623 loQxtn.asVecMisc(vecOpSqxtun, tmp, rn, arr) // low 624 // Narrow lanes on rm and write them into higher-half of rd. 625 hiQxtn.asVecMisc(vecOpSqxtun, tmp, rm, arr2) // high (sqxtn2) 626 } 627 m.insert(loQxtn) 628 m.insert(hiQxtn) 629 630 mov := m.allocateInstr() 631 mov.asFpuMov128(rd.nr(), tmp.nr()) 632 m.insert(mov) 633 case ssa.OpcodeFvpromoteLow: 634 x, lane := instr.ArgWithLane() 635 if lane != ssa.VecLaneF32x4 { 636 panic("unsupported lane type " + lane.String()) 637 } 638 ins := m.allocateInstr() 639 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 640 rd := operandNR(m.compiler.VRegOf(instr.Return())) 641 ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S) 642 m.insert(ins) 643 case ssa.OpcodeFvdemote: 644 x, lane := instr.ArgWithLane() 645 if lane != ssa.VecLaneF64x2 { 646 panic("unsupported lane type " + lane.String()) 647 } 648 ins := m.allocateInstr() 649 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 650 rd := operandNR(m.compiler.VRegOf(instr.Return())) 651 ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S) 652 m.insert(ins) 653 case ssa.OpcodeExtractlane: 654 x, index, signed, lane := instr.ExtractlaneData() 655 656 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 657 rd := operandNR(m.compiler.VRegOf(instr.Return())) 658 659 mov := m.allocateInstr() 660 switch lane { 661 case ssa.VecLaneI8x16: 662 mov.asMovFromVec(rd, rn, vecArrangementB, vecIndex(index), signed) 663 case ssa.VecLaneI16x8: 664 mov.asMovFromVec(rd, rn, vecArrangementH, vecIndex(index), signed) 665 case ssa.VecLaneI32x4: 666 mov.asMovFromVec(rd, rn, vecArrangementS, vecIndex(index), signed) 667 case ssa.VecLaneI64x2: 668 mov.asMovFromVec(rd, rn, vecArrangementD, vecIndex(index), signed) 669 case ssa.VecLaneF32x4: 670 mov.asVecMovElement(rd, rn, vecArrangementS, vecIndex(0), vecIndex(index)) 671 case ssa.VecLaneF64x2: 672 mov.asVecMovElement(rd, rn, vecArrangementD, vecIndex(0), vecIndex(index)) 673 default: 674 panic("unsupported lane: " + lane.String()) 675 } 676 677 m.insert(mov) 678 679 case ssa.OpcodeInsertlane: 680 x, y, index, lane := instr.InsertlaneData() 681 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 682 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 683 rd := operandNR(m.compiler.VRegOf(instr.Return())) 684 tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 685 686 // Initially mov rn to tmp. 687 mov1 := m.allocateInstr() 688 mov1.asFpuMov128(tmpReg.nr(), rn.nr()) 689 m.insert(mov1) 690 691 // movToVec and vecMovElement do not clear the remaining bits to zero, 692 // thus, we can mov rm in-place to tmp. 
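		// For example (illustrative pseudo-asm), inserting a 32-bit lane at index 2 becomes roughly:
		//
		//   mov v_tmp.16b, v_x.16b   ;; copy the original vector
		//   mov v_tmp.s[2], w_y      ;; overwrite only lane 2
		//   mov v_rd.16b, v_tmp.16b  ;; define the result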
693 mov2 := m.allocateInstr() 694 switch lane { 695 case ssa.VecLaneI8x16: 696 mov2.asMovToVec(tmpReg, rm, vecArrangementB, vecIndex(index)) 697 case ssa.VecLaneI16x8: 698 mov2.asMovToVec(tmpReg, rm, vecArrangementH, vecIndex(index)) 699 case ssa.VecLaneI32x4: 700 mov2.asMovToVec(tmpReg, rm, vecArrangementS, vecIndex(index)) 701 case ssa.VecLaneI64x2: 702 mov2.asMovToVec(tmpReg, rm, vecArrangementD, vecIndex(index)) 703 case ssa.VecLaneF32x4: 704 mov2.asVecMovElement(tmpReg, rm, vecArrangementS, vecIndex(index), vecIndex(0)) 705 case ssa.VecLaneF64x2: 706 mov2.asVecMovElement(tmpReg, rm, vecArrangementD, vecIndex(index), vecIndex(0)) 707 } 708 m.insert(mov2) 709 710 // Finally mov tmp to rd. 711 mov3 := m.allocateInstr() 712 mov3.asFpuMov128(rd.nr(), tmpReg.nr()) 713 m.insert(mov3) 714 715 case ssa.OpcodeSwizzle: 716 x, y, lane := instr.Arg2WithLane() 717 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 718 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 719 rd := operandNR(m.compiler.VRegOf(instr.Return())) 720 721 arr := ssaLaneToArrangement(lane) 722 723 // tbl <rd>.<arr>, { <rn>.<arr> }, <rm>.<arr> 724 tbl1 := m.allocateInstr() 725 tbl1.asVecTbl(1, rd, rn, rm, arr) 726 m.insert(tbl1) 727 728 case ssa.OpcodeShuffle: 729 x, y, lane1, lane2 := instr.ShuffleData() 730 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 731 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 732 rd := operandNR(m.compiler.VRegOf(instr.Return())) 733 734 m.lowerShuffle(rd, rn, rm, lane1, lane2) 735 736 case ssa.OpcodeSplat: 737 x, lane := instr.ArgWithLane() 738 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 739 rd := operandNR(m.compiler.VRegOf(instr.Return())) 740 741 dup := m.allocateInstr() 742 switch lane { 743 case ssa.VecLaneI8x16: 744 dup.asVecDup(rd, rn, vecArrangement16B) 745 case ssa.VecLaneI16x8: 746 dup.asVecDup(rd, rn, vecArrangement8H) 747 case ssa.VecLaneI32x4: 748 dup.asVecDup(rd, rn, vecArrangement4S) 749 case ssa.VecLaneI64x2: 750 dup.asVecDup(rd, rn, vecArrangement2D) 751 case ssa.VecLaneF32x4: 752 dup.asVecDupElement(rd, rn, vecArrangementS, vecIndex(0)) 753 case ssa.VecLaneF64x2: 754 dup.asVecDupElement(rd, rn, vecArrangementD, vecIndex(0)) 755 } 756 m.insert(dup) 757 758 case ssa.OpcodeWideningPairwiseDotProductS: 759 x, y := instr.Arg2() 760 xx, yy := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), 761 m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 762 tmp, tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 763 m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp, xx, yy, vecArrangement8H)) 764 m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2, xx, yy, vecArrangement8H)) 765 m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp, tmp, tmp2, vecArrangement4S)) 766 767 rd := operandNR(m.compiler.VRegOf(instr.Return())) 768 m.insert(m.allocateInstr().asFpuMov128(rd.nr(), tmp.nr())) 769 770 case ssa.OpcodeLoadSplat: 771 ptr, offset, lane := instr.LoadSplatData() 772 m.lowerLoadSplat(ptr, offset, lane, instr.Return()) 773 774 case ssa.OpcodeAtomicRmw: 775 m.lowerAtomicRmw(instr) 776 777 case ssa.OpcodeAtomicCas: 778 m.lowerAtomicCas(instr) 779 780 case ssa.OpcodeAtomicLoad: 781 m.lowerAtomicLoad(instr) 782 783 case ssa.OpcodeAtomicStore: 784 m.lowerAtomicStore(instr) 785 786 case ssa.OpcodeFence: 787 instr := m.allocateInstr() 788 instr.asDMB() 789 m.insert(instr) 790 791 default: 792 panic("TODO: lowering " + op.String()) 793 } 
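	// The machine instructions inserted above are buffered as "pending" first; flushing here commits
	// the lowering of this SSA instruction as one unit.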
794 m.executableContext.FlushPendingInstructions() 795 } 796 797 func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) { 798 // `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30. 799 vReg, wReg := v29VReg, v30VReg 800 801 // Initialize v29, v30 to rn, rm. 802 movv := m.allocateInstr() 803 movv.asFpuMov128(vReg, rn.nr()) 804 m.insert(movv) 805 806 movw := m.allocateInstr() 807 movw.asFpuMov128(wReg, rm.nr()) 808 m.insert(movw) 809 810 // `lane1`, `lane2` are already encoded as two u64s with the right layout: 811 // lane1 := lane[7]<<56 | ... | lane[1]<<8 | lane[0] 812 // lane2 := lane[15]<<56 | ... | lane[9]<<8 | lane[8] 813 // Thus, we can use loadFpuConst128. 814 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 815 lfc := m.allocateInstr() 816 lfc.asLoadFpuConst128(tmp.nr(), lane1, lane2) 817 m.insert(lfc) 818 819 // tbl <rd>.16b, { <vReg>.16B, <wReg>.16b }, <tmp>.16b 820 tbl2 := m.allocateInstr() 821 tbl2.asVecTbl(2, rd, operandNR(vReg), tmp, vecArrangement16B) 822 m.insert(tbl2) 823 } 824 825 func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) { 826 var modulo byte 827 switch arr { 828 case vecArrangement16B: 829 modulo = 0x7 // Modulo 8. 830 case vecArrangement8H: 831 modulo = 0xf // Modulo 16. 832 case vecArrangement4S: 833 modulo = 0x1f // Modulo 32. 834 case vecArrangement2D: 835 modulo = 0x3f // Modulo 64. 836 default: 837 panic("unsupported arrangment " + arr.String()) 838 } 839 840 rtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 841 vtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 842 843 and := m.allocateInstr() 844 and.asALUBitmaskImm(aluOpAnd, rtmp.nr(), rm.nr(), uint64(modulo), true) 845 m.insert(and) 846 847 if op != ssa.OpcodeVIshl { 848 // Negate the amount to make this as right shift. 849 neg := m.allocateInstr() 850 neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true) 851 m.insert(neg) 852 } 853 854 // Copy the shift amount into a vector register as sshl/ushl requires it to be there. 855 dup := m.allocateInstr() 856 dup.asVecDup(vtmp, rtmp, arr) 857 m.insert(dup) 858 859 if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr { 860 sshl := m.allocateInstr() 861 sshl.asVecRRR(vecOpSshl, rd, rn, vtmp, arr) 862 m.insert(sshl) 863 } else { 864 ushl := m.allocateInstr() 865 ushl.asVecRRR(vecOpUshl, rd, rn, vtmp, arr) 866 m.insert(ushl) 867 } 868 } 869 870 func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) { 871 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 872 873 // Special case VallTrue for i64x2. 874 if op == ssa.OpcodeVallTrue && arr == vecArrangement2D { 875 // cmeq v3?.2d, v2?.2d, #0 876 // addp v3?.2d, v3?.2d, v3?.2d 877 // fcmp v3?, v3? 878 // cset dst, eq 879 880 ins := m.allocateInstr() 881 ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D) 882 m.insert(ins) 883 884 addp := m.allocateInstr() 885 addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D) 886 m.insert(addp) 887 888 fcmp := m.allocateInstr() 889 fcmp.asFpuCmp(tmp, tmp, true) 890 m.insert(fcmp) 891 892 cset := m.allocateInstr() 893 cset.asCSet(rd.nr(), false, eq) 894 m.insert(cset) 895 896 return 897 } 898 899 // Create a scalar value with umaxp or uminv, then compare it against zero. 
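	// umaxp is chosen for VanyTrue because the pairwise unsigned max is non-zero iff any byte is non-zero,
	// while uminv is chosen for VallTrue because the unsigned minimum across lanes is non-zero iff every
	// lane is non-zero.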
900 ins := m.allocateInstr() 901 if op == ssa.OpcodeVanyTrue { 902 // umaxp v4?.16b, v2?.16b, v2?.16b 903 ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B) 904 } else { 905 // uminv d4?, v2?.4s 906 ins.asVecLanes(vecOpUminv, tmp, rm, arr) 907 } 908 m.insert(ins) 909 910 // mov x3?, v4?.d[0] 911 // ccmp x3?, #0x0, #0x0, al 912 // cset x3?, ne 913 // mov x0, x3? 914 915 movv := m.allocateInstr() 916 movv.asMovFromVec(rd, tmp, vecArrangementD, vecIndex(0), false) 917 m.insert(movv) 918 919 fc := m.allocateInstr() 920 fc.asCCmpImm(rd, uint64(0), al, 0, true) 921 m.insert(fc) 922 923 cset := m.allocateInstr() 924 cset.asCSet(rd.nr(), false, ne) 925 m.insert(cset) 926 } 927 928 func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { 929 r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 930 v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 931 v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 932 933 switch arr { 934 case vecArrangement16B: 935 // sshr v6?.16b, v2?.16b, #7 936 // movz x4?, #0x201, lsl 0 937 // movk x4?, #0x804, lsl 16 938 // movk x4?, #0x2010, lsl 32 939 // movk x4?, #0x8040, lsl 48 940 // dup v5?.2d, x4? 941 // and v6?.16b, v6?.16b, v5?.16b 942 // ext v5?.16b, v6?.16b, v6?.16b, #8 943 // zip1 v5?.16b, v6?.16b, v5?.16b 944 // addv s5?, v5?.8h 945 // umov s3?, v5?.h[0] 946 947 // Right arithmetic shift on the original vector and store the result into v1. So we have: 948 // v1[i] = 0xff if vi<0, 0 otherwise. 949 sshr := m.allocateInstr() 950 sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B) 951 m.insert(sshr) 952 953 // Load the bit mask into r0. 954 m.insertMOVZ(r0.nr(), 0x0201, 0, true) 955 m.insertMOVK(r0.nr(), 0x0804, 1, true) 956 m.insertMOVK(r0.nr(), 0x2010, 2, true) 957 m.insertMOVK(r0.nr(), 0x8040, 3, true) 958 959 // dup r0 to v0. 960 dup := m.allocateInstr() 961 dup.asVecDup(v0, r0, vecArrangement2D) 962 m.insert(dup) 963 964 // Lane-wise logical AND with the bit mask, meaning that we have 965 // v[i] = (1 << i) if vi<0, 0 otherwise. 966 // 967 // Below, we use the following notation: 968 // wi := (1 << i) if vi<0, 0 otherwise. 969 and := m.allocateInstr() 970 and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B) 971 m.insert(and) 972 973 // Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have 974 // v0[i] = w(i+8) if i < 8, w(i-8) otherwise. 975 ext := m.allocateInstr() 976 ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8)) 977 m.insert(ext) 978 979 // v = [w0, w8, ..., w7, w15] 980 zip1 := m.allocateInstr() 981 zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B) 982 m.insert(zip1) 983 984 // v.h[0] = w0 + ... + w15 985 addv := m.allocateInstr() 986 addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) 987 m.insert(addv) 988 989 // Extract the v.h[0] as the result. 990 movfv := m.allocateInstr() 991 movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) 992 m.insert(movfv) 993 case vecArrangement8H: 994 // sshr v6?.8h, v2?.8h, #15 995 // movz x4?, #0x1, lsl 0 996 // movk x4?, #0x2, lsl 16 997 // movk x4?, #0x4, lsl 32 998 // movk x4?, #0x8, lsl 48 999 // dup v5?.2d, x4? 1000 // lsl x4?, x4?, 0x4 1001 // ins v5?.d[1], x4? 1002 // and v5?.16b, v6?.16b, v5?.16b 1003 // addv s5?, v5?.8h 1004 // umov s3?, v5?.h[0] 1005 1006 // Right arithmetic shift on the original vector and store the result into v1. So we have: 1007 // v[i] = 0xffff if vi<0, 0 otherwise. 
1008 sshr := m.allocateInstr() 1009 sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H) 1010 m.insert(sshr) 1011 1012 // Load the bit mask into r0. 1013 m.lowerConstantI64(r0.nr(), 0x0008000400020001) 1014 1015 // dup r0 to vector v0. 1016 dup := m.allocateInstr() 1017 dup.asVecDup(v0, r0, vecArrangement2D) 1018 m.insert(dup) 1019 1020 lsl := m.allocateInstr() 1021 lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true) 1022 m.insert(lsl) 1023 1024 movv := m.allocateInstr() 1025 movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) 1026 m.insert(movv) 1027 1028 // Lane-wise logical AND with the bitmask, meaning that we have 1029 // v[i] = (1 << i) if vi<0, 0 otherwise for i=0..3 1030 // = (1 << (i+4)) if vi<0, 0 otherwise for i=3..7 1031 and := m.allocateInstr() 1032 and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) 1033 m.insert(and) 1034 1035 addv := m.allocateInstr() 1036 addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) 1037 m.insert(addv) 1038 1039 movfv := m.allocateInstr() 1040 movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) 1041 m.insert(movfv) 1042 case vecArrangement4S: 1043 // sshr v6?.8h, v2?.8h, #15 1044 // movz x4?, #0x1, lsl 0 1045 // movk x4?, #0x2, lsl 16 1046 // movk x4?, #0x4, lsl 32 1047 // movk x4?, #0x8, lsl 48 1048 // dup v5?.2d, x4? 1049 // lsl x4?, x4?, 0x4 1050 // ins v5?.d[1], x4? 1051 // and v5?.16b, v6?.16b, v5?.16b 1052 // addv s5?, v5?.8h 1053 // umov s3?, v5?.h[0] 1054 1055 // Right arithmetic shift on the original vector and store the result into v1. So we have: 1056 // v[i] = 0xffffffff if vi<0, 0 otherwise. 1057 sshr := m.allocateInstr() 1058 sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S) 1059 m.insert(sshr) 1060 1061 // Load the bit mask into r0. 1062 m.lowerConstantI64(r0.nr(), 0x0000000200000001) 1063 1064 // dup r0 to vector v0. 1065 dup := m.allocateInstr() 1066 dup.asVecDup(v0, r0, vecArrangement2D) 1067 m.insert(dup) 1068 1069 lsl := m.allocateInstr() 1070 lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true) 1071 m.insert(lsl) 1072 1073 movv := m.allocateInstr() 1074 movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) 1075 m.insert(movv) 1076 1077 // Lane-wise logical AND with the bitmask, meaning that we have 1078 // v[i] = (1 << i) if vi<0, 0 otherwise for i in [0, 1] 1079 // = (1 << (i+4)) if vi<0, 0 otherwise for i in [2, 3] 1080 and := m.allocateInstr() 1081 and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) 1082 m.insert(and) 1083 1084 addv := m.allocateInstr() 1085 addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S) 1086 m.insert(addv) 1087 1088 movfv := m.allocateInstr() 1089 movfv.asMovFromVec(rd, v0, vecArrangementS, vecIndex(0), false) 1090 m.insert(movfv) 1091 case vecArrangement2D: 1092 // mov d3?, v2?.d[0] 1093 // mov x4?, v2?.d[1] 1094 // lsr x4?, x4?, 0x3f 1095 // lsr d3?, d3?, 0x3f 1096 // add s3?, s3?, w4?, lsl #1 1097 1098 // Move the lower 64-bit int into result. 1099 movv0 := m.allocateInstr() 1100 movv0.asMovFromVec(rd, rm, vecArrangementD, vecIndex(0), false) 1101 m.insert(movv0) 1102 1103 // Move the higher 64-bit int into r0. 1104 movv1 := m.allocateInstr() 1105 movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false) 1106 m.insert(movv1) 1107 1108 // Move the sign bit into the least significant bit. 
1109 lsr1 := m.allocateInstr() 1110 lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true) 1111 m.insert(lsr1) 1112 1113 lsr2 := m.allocateInstr() 1114 lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true) 1115 m.insert(lsr2) 1116 1117 // rd = (r0<<1) | rd 1118 lsl := m.allocateInstr() 1119 lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false) 1120 m.insert(lsl) 1121 default: 1122 panic("Unsupported " + arr.String()) 1123 } 1124 } 1125 1126 func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) { 1127 x, lane := instr.ArgWithLane() 1128 arr := ssaLaneToArrangement(lane) 1129 ins := m.allocateInstr() 1130 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1131 rd := operandNR(m.compiler.VRegOf(instr.Return())) 1132 ins.asVecMisc(op, rd, rn, arr) 1133 m.insert(ins) 1134 } 1135 1136 func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) { 1137 ins := m.allocateInstr() 1138 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1139 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1140 rd := operandNR(m.compiler.VRegOf(ret)) 1141 ins.asVecRRR(op, rd, rn, rm, arr) 1142 m.insert(ins) 1143 } 1144 1145 func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { 1146 if arr != vecArrangement2D { 1147 mul := m.allocateInstr() 1148 mul.asVecRRR(vecOpMul, rd, rn, rm, arr) 1149 m.insert(mul) 1150 } else { 1151 tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1152 tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1153 tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1154 1155 tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1156 1157 // Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696 1158 rev64 := m.allocateInstr() 1159 rev64.asVecMisc(vecOpRev64, tmp2, rm, vecArrangement4S) 1160 m.insert(rev64) 1161 1162 mul := m.allocateInstr() 1163 mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S) 1164 m.insert(mul) 1165 1166 xtn1 := m.allocateInstr() 1167 xtn1.asVecMisc(vecOpXtn, tmp1, rn, vecArrangement2S) 1168 m.insert(xtn1) 1169 1170 addp := m.allocateInstr() 1171 addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S) 1172 m.insert(addp) 1173 1174 xtn2 := m.allocateInstr() 1175 xtn2.asVecMisc(vecOpXtn, tmp3, rm, vecArrangement2S) 1176 m.insert(xtn2) 1177 1178 // Note: do not write the result directly into result yet. This is the same reason as in bsl. 1179 // In short, in UMLAL instruction, the result register is also one of the source register, and 1180 // the value on the result register is significant. 1181 shll := m.allocateInstr() 1182 shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S) 1183 m.insert(shll) 1184 1185 umlal := m.allocateInstr() 1186 umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S) 1187 m.insert(umlal) 1188 1189 mov := m.allocateInstr() 1190 mov.asFpuMov128(rd.nr(), tmpRes.nr()) 1191 m.insert(mov) 1192 } 1193 } 1194 1195 func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) { 1196 x, y, lane := instr.Arg2WithLane() 1197 arr := ssaLaneToArrangement(lane) 1198 1199 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1200 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1201 1202 // Note: this usage of tmp is important. 1203 // BSL modifies the destination register, so we need to use a temporary register so that 1204 // the actual definition of the destination register happens *after* the BSL instruction. 
1205 // That way, we can force the spill instruction to be inserted after the BSL instruction. 1206 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1207 1208 fcmgt := m.allocateInstr() 1209 if max { 1210 fcmgt.asVecRRR(vecOpFcmgt, tmp, rm, rn, arr) 1211 } else { 1212 // If min, swap the args. 1213 fcmgt.asVecRRR(vecOpFcmgt, tmp, rn, rm, arr) 1214 } 1215 m.insert(fcmgt) 1216 1217 bsl := m.allocateInstr() 1218 bsl.asVecRRRRewrite(vecOpBsl, tmp, rm, rn, vecArrangement16B) 1219 m.insert(bsl) 1220 1221 res := operandNR(m.compiler.VRegOf(instr.Return())) 1222 mov2 := m.allocateInstr() 1223 mov2.asFpuMov128(res.nr(), tmp.nr()) 1224 m.insert(mov2) 1225 } 1226 1227 func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { 1228 div := m.allocateInstr() 1229 1230 if signed { 1231 div.asALU(aluOpSDiv, rd, rn, rm, _64bit) 1232 } else { 1233 div.asALU(aluOpUDiv, rd, rn, rm, _64bit) 1234 } 1235 m.insert(div) 1236 1237 // Check if rm is zero: 1238 m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) 1239 1240 // rd = rn-rd*rm by MSUB instruction. 1241 msub := m.allocateInstr() 1242 msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit) 1243 m.insert(msub) 1244 } 1245 1246 func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { 1247 div := m.allocateInstr() 1248 1249 if signed { 1250 div.asALU(aluOpSDiv, rd, rn, rm, _64bit) 1251 } else { 1252 div.asALU(aluOpUDiv, rd, rn, rm, _64bit) 1253 } 1254 m.insert(div) 1255 1256 // Check if rm is zero: 1257 m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) 1258 1259 if signed { 1260 // We need to check the signed overflow which happens iff "math.MinInt{32,64} / -1" 1261 minusOneCheck := m.allocateInstr() 1262 // Sets eq condition if rm == -1. 1263 minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit) 1264 m.insert(minusOneCheck) 1265 1266 ccmp := m.allocateInstr() 1267 // If eq condition is set, sets the flag by the result based on "rn - 1", otherwise clears the flag. 1268 ccmp.asCCmpImm(rn, 1, eq, 0, _64bit) 1269 m.insert(ccmp) 1270 1271 // Check the overflow flag. 1272 m.exitIfNot(execCtxVReg, vs.invert().asCond(), false, wazevoapi.ExitCodeIntegerOverflow) 1273 } 1274 } 1275 1276 // exitIfNot emits a conditional branch to exit if the condition is not met. 1277 // If `c` (cond type) is a register, `cond64bit` must be chosen to indicate whether the register is 32-bit or 64-bit. 1278 // Otherwise, `cond64bit` is ignored. 1279 func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, code wazevoapi.ExitCode) { 1280 execCtxTmp := m.copyToTmp(execCtxVReg) 1281 1282 cbr := m.allocateInstr() 1283 m.insert(cbr) 1284 m.lowerExitWithCode(execCtxTmp, code) 1285 // Conditional branch target is after exit. 
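	// The emitted shape is roughly (labels are illustrative):
	//
	//   b.<c> .continue      ;; skip the exit sequence when the condition holds
	//   <exit with the given code>
	// .continue: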
1286 l := m.insertBrTargetLabel() 1287 cbr.asCondBr(c, l, cond64bit) 1288 } 1289 1290 func (m *machine) lowerFcopysign(x, y, ret ssa.Value) { 1291 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1292 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1293 var tmpI, tmpF operand 1294 _64 := x.Type() == ssa.TypeF64 1295 if _64 { 1296 tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) 1297 tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 1298 } else { 1299 tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32)) 1300 tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) 1301 } 1302 rd := m.compiler.VRegOf(ret) 1303 m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64) 1304 } 1305 1306 func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) { 1307 // This is exactly the same code emitted by GCC for "__builtin_copysign": 1308 // 1309 // mov x0, -9223372036854775808 1310 // fmov d2, x0 1311 // vbit v0.8b, v1.8b, v2.8b 1312 // 1313 1314 setMSB := m.allocateInstr() 1315 if _64bit { 1316 m.lowerConstantI64(tmpI.nr(), math.MinInt64) 1317 setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0)) 1318 } else { 1319 m.lowerConstantI32(tmpI.nr(), math.MinInt32) 1320 setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0)) 1321 } 1322 m.insert(setMSB) 1323 1324 tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) 1325 1326 mov := m.allocateInstr() 1327 mov.asFpuMov64(tmpReg.nr(), rn.nr()) 1328 m.insert(mov) 1329 1330 vbit := m.allocateInstr() 1331 vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B) 1332 m.insert(vbit) 1333 1334 movDst := m.allocateInstr() 1335 movDst.asFpuMov64(rd.nr(), tmpReg.nr()) 1336 m.insert(movDst) 1337 } 1338 1339 func (m *machine) lowerBitcast(instr *ssa.Instruction) { 1340 v, dstType := instr.BitcastData() 1341 srcType := v.Type() 1342 rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) 1343 rd := operandNR(m.compiler.VRegOf(instr.Return())) 1344 srcInt := srcType.IsInt() 1345 dstInt := dstType.IsInt() 1346 switch { 1347 case srcInt && !dstInt: // Int to Float: 1348 mov := m.allocateInstr() 1349 var arr vecArrangement 1350 if srcType.Bits() == 64 { 1351 arr = vecArrangementD 1352 } else { 1353 arr = vecArrangementS 1354 } 1355 mov.asMovToVec(rd, rn, arr, vecIndex(0)) 1356 m.insert(mov) 1357 case !srcInt && dstInt: // Float to Int: 1358 mov := m.allocateInstr() 1359 var arr vecArrangement 1360 if dstType.Bits() == 64 { 1361 arr = vecArrangementD 1362 } else { 1363 arr = vecArrangementS 1364 } 1365 mov.asMovFromVec(rd, rn, arr, vecIndex(0), false) 1366 m.insert(mov) 1367 default: 1368 panic("TODO?BUG?") 1369 } 1370 } 1371 1372 func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) { 1373 rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone) 1374 rd := operandNR(m.compiler.VRegOf(out)) 1375 1376 neg := m.allocateInstr() 1377 neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64) 1378 m.insert(neg) 1379 } 1380 1381 func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) { 1382 if !nonTrapping { 1383 // First of all, we have to clear the FPU flags. 1384 flagClear := m.allocateInstr() 1385 flagClear.asMovToFPSR(xzrVReg) 1386 m.insert(flagClear) 1387 } 1388 1389 // Then, do the conversion which doesn't trap inherently. 
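	// For reference, the whole trapping path looks roughly like (illustrative):
	//
	//   msr fpsr, xzr             ;; clear the cumulative status flags (done above)
	//   fcvtz{s,u} <rd>, <rn>     ;; the conversion itself
	//   mrs <tmp>, fpsr
	//   subs xzr, <tmp>, #1       ;; was the invalid-operation flag set?
	//   b.ne .ok
	//   fcmp <rn>, <rn>           ;; NaN input? -> InvalidConversionToInteger, otherwise IntegerOverflow
	// .ok: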
1390 cvt := m.allocateInstr() 1391 cvt.asFpuToInt(rd, rn, signed, src64bit, dst64bit) 1392 m.insert(cvt) 1393 1394 if !nonTrapping { 1395 tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) 1396 1397 // After the conversion, check the FPU flags. 1398 getFlag := m.allocateInstr() 1399 getFlag.asMovFromFPSR(tmpReg) 1400 m.insert(getFlag) 1401 1402 execCtx := m.copyToTmp(ctx) 1403 _rn := operandNR(m.copyToTmp(rn.nr())) 1404 1405 // Check if the conversion was undefined by comparing the status with 1. 1406 // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register 1407 alu := m.allocateInstr() 1408 alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true) 1409 m.insert(alu) 1410 1411 // If it is not undefined, we can return the result. 1412 ok := m.allocateInstr() 1413 m.insert(ok) 1414 1415 // Otherwise, we have to choose the status depending on it is overflow or NaN conversion. 1416 1417 // Comparing itself to check if it is a NaN. 1418 fpuCmp := m.allocateInstr() 1419 fpuCmp.asFpuCmp(_rn, _rn, src64bit) 1420 m.insert(fpuCmp) 1421 // If the VC flag is not set (== VS flag is set), it is a NaN. 1422 m.exitIfNot(execCtx, vc.asCond(), false, wazevoapi.ExitCodeInvalidConversionToInteger) 1423 // Otherwise, it is an overflow. 1424 m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) 1425 1426 // Conditional branch target is after exit. 1427 l := m.insertBrTargetLabel() 1428 ok.asCondBr(ne.asCond(), l, false /* ignored */) 1429 } 1430 } 1431 1432 func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) { 1433 cvt := m.allocateInstr() 1434 cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit) 1435 m.insert(cvt) 1436 } 1437 1438 func (m *machine) lowerFpuBinOp(si *ssa.Instruction) { 1439 instr := m.allocateInstr() 1440 var op fpuBinOp 1441 switch si.Opcode() { 1442 case ssa.OpcodeFadd: 1443 op = fpuBinOpAdd 1444 case ssa.OpcodeFsub: 1445 op = fpuBinOpSub 1446 case ssa.OpcodeFmul: 1447 op = fpuBinOpMul 1448 case ssa.OpcodeFdiv: 1449 op = fpuBinOpDiv 1450 case ssa.OpcodeFmax: 1451 op = fpuBinOpMax 1452 case ssa.OpcodeFmin: 1453 op = fpuBinOpMin 1454 } 1455 x, y := si.Arg2() 1456 xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) 1457 rn := m.getOperand_NR(xDef, extModeNone) 1458 rm := m.getOperand_NR(yDef, extModeNone) 1459 rd := operandNR(m.compiler.VRegOf(si.Return())) 1460 instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64) 1461 m.insert(instr) 1462 } 1463 1464 func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) { 1465 x, y := si.Arg2() 1466 if !x.Type().IsInt() { 1467 panic("BUG?") 1468 } 1469 1470 xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) 1471 rn := m.getOperand_NR(xDef, extModeNone) 1472 rm, yNegated := m.getOperand_MaybeNegatedImm12_ER_SR_NR(yDef, extModeNone) 1473 1474 var aop aluOp 1475 switch { 1476 case add && !yNegated: // rn+rm = x+y 1477 aop = aluOpAdd 1478 case add && yNegated: // rn-rm = x-(-y) = x+y 1479 aop = aluOpSub 1480 case !add && !yNegated: // rn-rm = x-y 1481 aop = aluOpSub 1482 case !add && yNegated: // rn+rm = x-(-y) = x-y 1483 aop = aluOpAdd 1484 } 1485 rd := operandNR(m.compiler.VRegOf(si.Return())) 1486 alu := m.allocateInstr() 1487 alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64) 1488 m.insert(alu) 1489 } 1490 1491 // InsertMove implements backend.Machine. 
1492 func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { 1493 instr := m.allocateInstr() 1494 switch typ { 1495 case ssa.TypeI32, ssa.TypeI64: 1496 instr.asMove64(dst, src) 1497 case ssa.TypeF32, ssa.TypeF64: 1498 instr.asFpuMov64(dst, src) 1499 case ssa.TypeV128: 1500 instr.asFpuMov128(dst, src) 1501 default: 1502 panic("TODO") 1503 } 1504 m.insert(instr) 1505 } 1506 1507 func (m *machine) lowerIcmp(si *ssa.Instruction) { 1508 x, y, c := si.IcmpData() 1509 flag := condFlagFromSSAIntegerCmpCond(c) 1510 1511 in64bit := x.Type().Bits() == 64 1512 var ext extMode 1513 if in64bit { 1514 if c.Signed() { 1515 ext = extModeSignExtend64 1516 } else { 1517 ext = extModeZeroExtend64 1518 } 1519 } else { 1520 if c.Signed() { 1521 ext = extModeSignExtend32 1522 } else { 1523 ext = extModeZeroExtend32 1524 } 1525 } 1526 1527 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) 1528 rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext) 1529 alu := m.allocateInstr() 1530 alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit) 1531 m.insert(alu) 1532 1533 cset := m.allocateInstr() 1534 cset.asCSet(m.compiler.VRegOf(si.Return()), false, flag) 1535 m.insert(cset) 1536 } 1537 1538 func (m *machine) lowerVIcmp(si *ssa.Instruction) { 1539 x, y, c, lane := si.VIcmpData() 1540 flag := condFlagFromSSAIntegerCmpCond(c) 1541 arr := ssaLaneToArrangement(lane) 1542 1543 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1544 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1545 rd := operandNR(m.compiler.VRegOf(si.Return())) 1546 1547 switch flag { 1548 case eq: 1549 cmp := m.allocateInstr() 1550 cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) 1551 m.insert(cmp) 1552 case ne: 1553 cmp := m.allocateInstr() 1554 cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) 1555 m.insert(cmp) 1556 not := m.allocateInstr() 1557 not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) 1558 m.insert(not) 1559 case ge: 1560 cmp := m.allocateInstr() 1561 cmp.asVecRRR(vecOpCmge, rd, rn, rm, arr) 1562 m.insert(cmp) 1563 case gt: 1564 cmp := m.allocateInstr() 1565 cmp.asVecRRR(vecOpCmgt, rd, rn, rm, arr) 1566 m.insert(cmp) 1567 case le: 1568 cmp := m.allocateInstr() 1569 cmp.asVecRRR(vecOpCmge, rd, rm, rn, arr) // rm, rn are swapped 1570 m.insert(cmp) 1571 case lt: 1572 cmp := m.allocateInstr() 1573 cmp.asVecRRR(vecOpCmgt, rd, rm, rn, arr) // rm, rn are swapped 1574 m.insert(cmp) 1575 case hs: 1576 cmp := m.allocateInstr() 1577 cmp.asVecRRR(vecOpCmhs, rd, rn, rm, arr) 1578 m.insert(cmp) 1579 case hi: 1580 cmp := m.allocateInstr() 1581 cmp.asVecRRR(vecOpCmhi, rd, rn, rm, arr) 1582 m.insert(cmp) 1583 case ls: 1584 cmp := m.allocateInstr() 1585 cmp.asVecRRR(vecOpCmhs, rd, rm, rn, arr) // rm, rn are swapped 1586 m.insert(cmp) 1587 case lo: 1588 cmp := m.allocateInstr() 1589 cmp.asVecRRR(vecOpCmhi, rd, rm, rn, arr) // rm, rn are swapped 1590 m.insert(cmp) 1591 } 1592 } 1593 1594 func (m *machine) lowerVFcmp(si *ssa.Instruction) { 1595 x, y, c, lane := si.VFcmpData() 1596 flag := condFlagFromSSAFloatCmpCond(c) 1597 arr := ssaLaneToArrangement(lane) 1598 1599 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1600 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1601 rd := operandNR(m.compiler.VRegOf(si.Return())) 1602 1603 switch flag { 1604 case eq: 1605 cmp := m.allocateInstr() 1606 cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) 1607 m.insert(cmp) 1608 case ne: 1609 cmp := m.allocateInstr() 1610 cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) 1611 m.insert(cmp) 1612 not := 
m.allocateInstr() 1613 not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) 1614 m.insert(not) 1615 case ge: 1616 cmp := m.allocateInstr() 1617 cmp.asVecRRR(vecOpFcmge, rd, rn, rm, arr) 1618 m.insert(cmp) 1619 case gt: 1620 cmp := m.allocateInstr() 1621 cmp.asVecRRR(vecOpFcmgt, rd, rn, rm, arr) 1622 m.insert(cmp) 1623 case mi: 1624 cmp := m.allocateInstr() 1625 cmp.asVecRRR(vecOpFcmgt, rd, rm, rn, arr) // rm, rn are swapped 1626 m.insert(cmp) 1627 case ls: 1628 cmp := m.allocateInstr() 1629 cmp.asVecRRR(vecOpFcmge, rd, rm, rn, arr) // rm, rn are swapped 1630 m.insert(cmp) 1631 } 1632 } 1633 1634 func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) { 1635 cvt := m.allocateInstr() 1636 if signed { 1637 cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr) 1638 } else { 1639 cvt.asVecMisc(vecOpFcvtzu, rd, rn, arr) 1640 } 1641 m.insert(cvt) 1642 1643 if arr == vecArrangement2D { 1644 narrow := m.allocateInstr() 1645 if signed { 1646 narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S) 1647 } else { 1648 narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S) 1649 } 1650 m.insert(narrow) 1651 } 1652 } 1653 1654 func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) { 1655 cvt := m.allocateInstr() 1656 if signed { 1657 cvt.asVecMisc(vecOpScvtf, rd, rn, arr) 1658 } else { 1659 cvt.asVecMisc(vecOpUcvtf, rd, rn, arr) 1660 } 1661 m.insert(cvt) 1662 } 1663 1664 func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) { 1665 x, amount := si.Arg2() 1666 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) 1667 rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits()) 1668 rd := operandNR(m.compiler.VRegOf(si.Return())) 1669 1670 alu := m.allocateInstr() 1671 alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64) 1672 m.insert(alu) 1673 } 1674 1675 func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult bool) { 1676 x, y := si.Arg2() 1677 1678 xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) 1679 rn := m.getOperand_NR(xDef, extModeNone) 1680 1681 var rd operand 1682 if ignoreResult { 1683 rd = operandNR(xzrVReg) 1684 } else { 1685 rd = operandNR(m.compiler.VRegOf(si.Return())) 1686 } 1687 1688 _64 := x.Type().Bits() == 64 1689 alu := m.allocateInstr() 1690 if instr := yDef.Instr; instr != nil && instr.Constant() { 1691 c := instr.ConstantVal() 1692 if isBitMaskImmediate(c, _64) { 1693 // Constant bit wise operations can be lowered to a single instruction. 1694 alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64) 1695 m.insert(alu) 1696 return 1697 } 1698 } 1699 1700 rm := m.getOperand_SR_NR(yDef, extModeNone) 1701 alu.asALU(op, rd, rn, rm, _64) 1702 m.insert(alu) 1703 } 1704 1705 func (m *machine) lowerRotl(si *ssa.Instruction) { 1706 x, y := si.Arg2() 1707 r := si.Return() 1708 _64 := r.Type().Bits() == 64 1709 1710 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1711 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1712 var tmp operand 1713 if _64 { 1714 tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 1715 } else { 1716 tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) 1717 } 1718 rd := operandNR(m.compiler.VRegOf(r)) 1719 1720 // Encode rotl as neg + rotr: neg is a sub against the zero-reg. 1721 m.lowerRotlImpl(rd, rn, rm, tmp, _64) 1722 } 1723 1724 func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) { 1725 // Encode rotl as neg + rotr: neg is a sub against the zero-reg. 
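	// That is, with 64-bit operands for example (illustrative):
	//
	//   sub tmp, xzr, rm   ;; tmp = -rm
	//   ror rd, rn, tmp    ;; rotl(rn, rm) == rotr(rn, -rm mod 64)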

func (m *machine) lowerRotr(si *ssa.Instruction) {
	x, y := si.Arg2()

	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
	rn := m.getOperand_NR(xDef, extModeNone)
	rm := m.getOperand_NR(yDef, extModeNone)
	rd := operandNR(m.compiler.VRegOf(si.Return()))

	alu := m.allocateInstr()
	alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64)
	m.insert(alu)
}

func (m *machine) lowerExtend(arg, ret ssa.Value, from, to byte, signed bool) {
	rd := m.compiler.VRegOf(ret)
	def := m.compiler.ValueDefinition(arg)

	if instr := def.Instr; !signed && from == 32 && instr != nil {
		// We can optimize out the unsigned extend because:
		//	"Writes to the W register set bits [63:32] of the X register to zero."
		//	https://developer.arm.com/documentation/den0024/a/An-Introduction-to-the-ARMv8-Instruction-Sets/The-ARMv8-instruction-sets/Distinguishing-between-32-bit-and-64-bit-A64-instructions
		switch instr.Opcode() {
		case
			ssa.OpcodeIadd, ssa.OpcodeIsub, ssa.OpcodeLoad,
			ssa.OpcodeBand, ssa.OpcodeBor, ssa.OpcodeBnot,
			ssa.OpcodeIshl, ssa.OpcodeUshr, ssa.OpcodeSshr,
			ssa.OpcodeRotl, ssa.OpcodeRotr,
			ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32:
			// So, if the argument is the result of a 32-bit operation, we can just copy the register.
			// It is highly likely that this copy will be optimized out after register allocation.
			rn := m.compiler.VRegOf(arg)
			mov := m.allocateInstr()
			// Note: do not use move32 as it will be lowered to a 32-bit move, which is not a plain copy
			// (that is actually how UExtend itself is implemented).
			mov.asMove64(rd, rn)
			m.insert(mov)
			return
		default:
		}
	}
	rn := m.getOperand_NR(def, extModeNone)

	ext := m.allocateInstr()
	ext.asExtend(rd, rn.nr(), from, to, signed)
	m.insert(ext)
}

func (m *machine) lowerFcmp(x, y, result ssa.Value, c ssa.FloatCmpCond) {
	rn, rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)

	fc := m.allocateInstr()
	fc.asFpuCmp(rn, rm, x.Type().Bits() == 64)
	m.insert(fc)

	cset := m.allocateInstr()
	cset.asCSet(m.compiler.VRegOf(result), false, condFlagFromSSAFloatCmpCond(c))
	m.insert(cset)
}

func (m *machine) lowerImul(x, y, result ssa.Value) {
	rd := m.compiler.VRegOf(result)
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)

	// TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg.

	mul := m.allocateInstr()
	mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64)
	m.insert(mul)
}
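
// For reference, lowerImul materializes a plain multiply as MADD with the zero register as the addend; the
// TODO above is about fusing a neighbouring add by substituting its operand for xzr. A minimal Go sketch of
// the MADD semantics (helper names illustrative only, not part of this package):
//
//	// madd mirrors `madd rd, rn, rm, ra`: rd = ra + rn*rm.
//	func madd(ra, rn, rm uint64) uint64 { return ra + rn*rm }
//
//	// A plain multiply is madd with the zero register as the addend.
//	func mul(rn, rm uint64) uint64 { return madd(0, rn, rm) }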

func (m *machine) lowerClz(x, result ssa.Value) {
	rd := m.compiler.VRegOf(result)
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	clz := m.allocateInstr()
	clz.asBitRR(bitOpClz, rd, rn.nr(), x.Type().Bits() == 64)
	m.insert(clz)
}

func (m *machine) lowerCtz(x, result ssa.Value) {
	rd := m.compiler.VRegOf(result)
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rbit := m.allocateInstr()
	_64 := x.Type().Bits() == 64
	var tmpReg regalloc.VReg
	if _64 {
		tmpReg = m.compiler.AllocateVReg(ssa.TypeI64)
	} else {
		tmpReg = m.compiler.AllocateVReg(ssa.TypeI32)
	}
	rbit.asBitRR(bitOpRbit, tmpReg, rn.nr(), _64)
	m.insert(rbit)

	clz := m.allocateInstr()
	clz.asBitRR(bitOpClz, rd, tmpReg, _64)
	m.insert(clz)
}

func (m *machine) lowerPopcnt(x, result ssa.Value) {
	// arm64 doesn't have an instruction for population count on a scalar register,
	// so we use the vector instruction `cnt`.
	// This is exactly how the official Go compiler implements bits.OnesCount.
	// For example, "func () int { return bits.OnesCount(10) }" is compiled as
	//
	//	MOVD   $10, R0 ;; Load 10.
	//	FMOVD  R0, F0
	//	VCNT   V0.B8, V0.B8
	//	UADDLV V0.B8, V0
	//
	// In aarch64 asm, FMOVD is encoded as `ins`, VCNT is `cnt`,
	// and the registers may use different names. In our encoding we use the following
	// instructions:
	//
	//	ins v0.d[0], x0     ;; mov from GPR to vec (FMOV above) is encoded as INS
	//	cnt v0.16b, v0.16b  ;; we use vec arrangement 16b
	//	uaddlv h0, v0.8b    ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b
	//	mov x5, v0.d[0]     ;; finally we mov the result back to a GPR
	//

	rd := operandNR(m.compiler.VRegOf(result))
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)

	rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
	ins := m.allocateInstr()
	ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0))
	m.insert(ins)

	rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
	cnt := m.allocateInstr()
	cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B)
	m.insert(cnt)

	rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
	uaddlv := m.allocateInstr()
	uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B)
	m.insert(uaddlv)

	mov := m.allocateInstr()
	mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0), false)
	m.insert(mov)
}
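
// For reference, a small sketch (plain Go with "math/bits", illustrative only and not part of this package)
// of the two scalar tricks above: lowerCtz computes trailing zeros as CLZ of the bit-reversed input, and
// lowerPopcnt relies on the fact that the popcount of a 64-bit value is the sum of the per-byte popcounts
// that CNT produces and UADDLV adds up:
//
//	// ctzViaRbitClz mirrors lowerCtz: trailing zeros == leading zeros of the bit-reversed value.
//	func ctzViaRbitClz(x uint64) int {
//		return bits.LeadingZeros64(bits.Reverse64(x))
//	}
//
//	// popcntViaBytes mirrors cnt (per-byte popcount) followed by uaddlv (sum across lanes).
//	func popcntViaBytes(x uint64) int {
//		sum := 0
//		for i := 0; i < 8; i++ {
//			sum += bits.OnesCount8(uint8(x >> (8 * i)))
//		}
//		return sum // == bits.OnesCount64(x)
//	}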

// lowerExitWithCode lowers the exit-with-code sequence, which takes the execution context pointer as its argument.
func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.ExitCode) {
	tmpReg1 := m.compiler.AllocateVReg(ssa.TypeI32)
	loadExitCodeConst := m.allocateInstr()
	loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true)

	setExitCode := m.allocateInstr()
	setExitCode.asStore(operandNR(tmpReg1),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
		}, 32)

	// In order to unwind the stack, we also need to push the current stack pointer:
	tmp2 := m.compiler.AllocateVReg(ssa.TypeI64)
	movSpToTmp := m.allocateInstr()
	movSpToTmp.asMove64(tmp2, spVReg)
	strSpToExecCtx := m.allocateInstr()
	strSpToExecCtx.asStore(operandNR(tmp2),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
		}, 64)
	// Also the address of this exit.
	tmp3 := m.compiler.AllocateVReg(ssa.TypeI64)
	currentAddrToTmp := m.allocateInstr()
	currentAddrToTmp.asAdr(tmp3, 0)
	storeCurrentAddrToExecCtx := m.allocateInstr()
	storeCurrentAddrToExecCtx.asStore(operandNR(tmp3),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
		}, 64)

	exitSeq := m.allocateInstr()
	exitSeq.asExitSequence(execCtxVReg)

	m.insert(loadExitCodeConst)
	m.insert(setExitCode)
	m.insert(movSpToTmp)
	m.insert(strSpToExecCtx)
	m.insert(currentAddrToTmp)
	m.insert(storeCurrentAddrToExecCtx)
	m.insert(exitSeq)
}
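
// The net effect of the sequence above, written as a minimal Go sketch (the struct and field names below are
// illustrative only and do not reflect the actual wazevoapi execution context layout): before running the
// exit sequence, the machine code records the exit code, the current stack pointer, and the address of the
// exit point so that the Go side can unwind the stack and resume here.
//
//	type execContextSketch struct {
//		exitCode                 uint32
//		stackPointerBeforeGoCall uintptr
//		goCallReturnAddress      uintptr
//	}
//
//	func exitWithCodeSketch(ctx *execContextSketch, code uint32, sp, exitAddr uintptr) {
//		ctx.exitCode = code                // MOVZ + 32-bit STR
//		ctx.stackPointerBeforeGoCall = sp  // MOV from SP + 64-bit STR
//		ctx.goCallReturnAddress = exitAddr // ADR + 64-bit STR
//		// ...then the exit sequence transfers control back to the Go runtime side.
//	}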

func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) {
	if x.Type() != y.Type() {
		panic(
			fmt.Sprintf("TODO(maybe): support icmp with different types: v%d=%s != v%d=%s",
				x.ID(), x.Type(), y.ID(), y.Type()))
	}

	extMod := extModeOf(x.Type(), signed)

	// First operand must be in pure register form.
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extMod)
	// Second operand can be in any of the Imm12, ER, SR, or NR forms supported by the SUBS instruction.
	rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), extMod)

	alu := m.allocateInstr()
	// subs zr, rn, rm
	alu.asALU(
		aluOpSubS,
		// We don't need the result, just need to set flags.
		operandNR(xzrVReg),
		rn,
		rm,
		x.Type().Bits() == 64,
	)
	m.insert(alu)
}

func (m *machine) lowerFcmpToFlag(x, y ssa.Value) {
	if x.Type() != y.Type() {
		panic("TODO(maybe): support fcmp with different types")
	}

	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
	cmp := m.allocateInstr()
	cmp.asFpuCmp(rn, rm, x.Type().Bits() == 64)
	m.insert(cmp)
}

func (m *machine) lowerExitIfTrueWithCode(execCtxVReg regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) {
	condDef := m.compiler.ValueDefinition(cond)
	if !m.compiler.MatchInstr(condDef, ssa.OpcodeIcmp) {
		panic("TODO: OpcodeExitIfTrueWithCode must come after Icmp at the moment: " + condDef.Instr.Opcode().String())
	}
	condDef.Instr.MarkLowered()

	cvalInstr := condDef.Instr
	x, y, c := cvalInstr.IcmpData()
	signed := c.Signed()

	if !m.tryLowerBandToFlag(x, y) {
		m.lowerIcmpToFlag(x, y, signed)
	}

	// We need to copy the execution context to a temp register, because if it's spilled,
	// it might end up being reloaded inside the exiting branch.
	execCtxTmp := m.copyToTmp(execCtxVReg)

	// We have to skip the entire exit sequence if the condition is false.
	cbr := m.allocateInstr()
	m.insert(cbr)
	m.lowerExitWithCode(execCtxTmp, code)
	// The conditional branch target is right after the exit sequence.
	l := m.insertBrTargetLabel()
	cbr.asCondBr(condFlagFromSSAIntegerCmpCond(c).invert().asCond(), l, false /* ignored */)
}

func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
	cvalDef := m.compiler.ValueDefinition(c)

	var cc condFlag
	switch {
	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // In this case, we can use the ALU flags set by the SUBS instruction.
		cvalInstr := cvalDef.Instr
		x, y, c := cvalInstr.IcmpData()
		cc = condFlagFromSSAIntegerCmpCond(c)
		m.lowerIcmpToFlag(x, y, c.Signed())
		cvalDef.Instr.MarkLowered()
	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // In this case, we can use the FPU flags directly.
		cvalInstr := cvalDef.Instr
		x, y, c := cvalInstr.FcmpData()
		cc = condFlagFromSSAFloatCmpCond(c)
		m.lowerFcmpToFlag(x, y)
		cvalDef.Instr.MarkLowered()
	default:
		rn := m.getOperand_NR(cvalDef, extModeNone)
		if c.Type() != ssa.TypeI32 && c.Type() != ssa.TypeI64 {
			panic("TODO?BUG?: support select with non-integer condition")
		}
		alu := m.allocateInstr()
		// subs zr, rn, zr
		alu.asALU(
			aluOpSubS,
			// We don't need the result, just need to set flags.
			operandNR(xzrVReg),
			rn,
			operandNR(xzrVReg),
			c.Type().Bits() == 64,
		)
		m.insert(alu)
		cc = ne
	}

	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)

	rd := operandNR(m.compiler.VRegOf(result))
	switch x.Type() {
	case ssa.TypeI32, ssa.TypeI64:
		// csel rd, rn, rm, cc
		csel := m.allocateInstr()
		csel.asCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
		m.insert(csel)
	case ssa.TypeF32, ssa.TypeF64:
		// fcsel rd, rn, rm, cc
		fcsel := m.allocateInstr()
		fcsel.asFpuCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
		m.insert(fcsel)
	default:
		panic("BUG")
	}
}
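
// For reference, a minimal Go sketch (illustrative only, not part of this package) of the conditional-select
// semantics used above: CSEL/FCSEL write the first source register when the condition flags hold and the
// second one otherwise, so no branch is needed for `select`:
//
//	// csel mirrors `csel rd, rn, rm, cc`: rd = rn if cc holds, else rm.
//	func csel(cc bool, rn, rm uint64) uint64 {
//		if cc {
//			return rn
//		}
//		return rm
//	}
//
// In the default (non-icmp/fcmp) case above, the condition value itself is compared against zero with SUBS
// and the flag used is `ne`, i.e. the select takes `x` whenever the condition value is non-zero.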

func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
	// First check if `rc` is zero or not.
	checkZero := m.allocateInstr()
	checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false)
	m.insert(checkZero)

	// Then use CSETM to set all bits to one if `rc` is non-zero.
	allOnesOrZero := m.compiler.AllocateVReg(ssa.TypeI64)
	cset := m.allocateInstr()
	cset.asCSet(allOnesOrZero, true, ne)
	m.insert(cset)

	// Then move the bits to the result vector register.
	tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
	dup := m.allocateInstr()
	dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D)
	m.insert(dup)

	// Now that `tmp2` has either all bits one or zero depending on `rc`,
	// we can use bsl to select between `rn` and `rm`.
	ins := m.allocateInstr()
	ins.asVecRRRRewrite(vecOpBsl, tmp2, rn, rm, vecArrangement16B)
	m.insert(ins)

	// Finally, move the result to the destination register.
	mov2 := m.allocateInstr()
	mov2.asFpuMov128(rd.nr(), tmp2.nr())
	m.insert(mov2)
}
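
// For reference, a minimal Go sketch (illustrative only, not part of this package) of the BSL ("bit select")
// step above, shown on a single 64-bit lane: the destination mask picks bits from the first source where the
// mask bit is one and from the second source otherwise, so an all-ones/all-zeros mask implements a
// lane-wide select:
//
//	// bsl mirrors `bsl mask, rn, rm`: (rn & mask) | (rm &^ mask).
//	func bsl(mask, rn, rm uint64) uint64 {
//		return (rn & mask) | (rm &^ mask)
//	}
//
// With mask == ^uint64(0) (the CSETM result when `rc` is non-zero) this yields rn, and with mask == 0 it
// yields rm.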

func (m *machine) lowerAtomicRmw(si *ssa.Instruction) {
	ssaOp, size := si.AtomicRmwData()

	var op atomicRmwOp
	var negateArg bool
	var flipArg bool
	switch ssaOp {
	case ssa.AtomicRmwOpAdd:
		op = atomicRmwOpAdd
	case ssa.AtomicRmwOpSub:
		op = atomicRmwOpAdd
		negateArg = true
	case ssa.AtomicRmwOpAnd:
		op = atomicRmwOpClr
		flipArg = true
	case ssa.AtomicRmwOpOr:
		op = atomicRmwOpSet
	case ssa.AtomicRmwOpXor:
		op = atomicRmwOpEor
	case ssa.AtomicRmwOpXchg:
		op = atomicRmwOpSwp
	default:
		panic(fmt.Sprintf("unknown ssa atomic rmw op: %s", ssaOp))
	}

	addr, val := si.Arg2()
	addrDef, valDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(val)
	rn := m.getOperand_NR(addrDef, extModeNone)
	rt := operandNR(m.compiler.VRegOf(si.Return()))
	rs := m.getOperand_NR(valDef, extModeNone)

	_64 := si.Return().Type().Bits() == 64
	var tmp operand
	if _64 {
		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
	} else {
		tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
	}
	m.lowerAtomicRmwImpl(op, rn, rs, rt, tmp, size, negateArg, flipArg, _64)
}

func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp operand, size uint64, negateArg, flipArg, dst64bit bool) {
	switch {
	case negateArg:
		neg := m.allocateInstr()
		neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rs, dst64bit)
		m.insert(neg)
	case flipArg:
		flip := m.allocateInstr()
		flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), rs, dst64bit)
		m.insert(flip)
	default:
		tmp = rs
	}

	rmw := m.allocateInstr()
	rmw.asAtomicRmw(op, rn, tmp, rt, size)
	m.insert(rmw)
}
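
// For reference, the atomic read-modify-write ops available here are add, bit-clear, bit-set, exclusive-or
// and swap, so "sub" is lowered as an atomic add of the negated argument and "and" as an atomic bit-clear of
// the flipped argument (ORN against the zero register computes the complement). A small Go sketch of the two
// identities this relies on (helper names illustrative only, not part of this package):
//
//	// subViaAdd mirrors the negateArg path: old - v == old + (-v) (mod 2^64).
//	func subViaAdd(old, v uint64) uint64 { return old + (-v) }
//
//	// andViaClr mirrors the flipArg path: clearing the bits of ^v leaves exactly old & v.
//	func andViaClr(old, v uint64) uint64 { return old &^ (^v) }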

func (m *machine) lowerAtomicCas(si *ssa.Instruction) {
	addr, exp, repl := si.Arg3()
	size := si.AtomicTargetSize()

	addrDef, expDef, replDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(exp), m.compiler.ValueDefinition(repl)
	rn := m.getOperand_NR(addrDef, extModeNone)
	rt := m.getOperand_NR(replDef, extModeNone)
	rs := m.getOperand_NR(expDef, extModeNone)
	tmp := operandNR(m.compiler.AllocateVReg(si.Return().Type()))

	_64 := si.Return().Type().Bits() == 64
	// rs is overwritten by CAS, so we copy its value into a temporary register before the instruction,
	// in case it is used elsewhere.
	mov := m.allocateInstr()
	if _64 {
		mov.asMove64(tmp.nr(), rs.nr())
	} else {
		mov.asMove32(tmp.nr(), rs.nr())
	}
	m.insert(mov)

	m.lowerAtomicCasImpl(rn, tmp, rt, size)

	mov2 := m.allocateInstr()
	rd := m.compiler.VRegOf(si.Return())
	if _64 {
		mov2.asMove64(rd, tmp.nr())
	} else {
		mov2.asMove32(rd, tmp.nr())
	}
	m.insert(mov2)
}

func (m *machine) lowerAtomicCasImpl(rn, rs, rt operand, size uint64) {
	cas := m.allocateInstr()
	cas.asAtomicCas(rn, rs, rt, size)
	m.insert(cas)
}

func (m *machine) lowerAtomicLoad(si *ssa.Instruction) {
	addr := si.Arg()
	size := si.AtomicTargetSize()

	addrDef := m.compiler.ValueDefinition(addr)
	rn := m.getOperand_NR(addrDef, extModeNone)
	rt := operandNR(m.compiler.VRegOf(si.Return()))

	m.lowerAtomicLoadImpl(rn, rt, size)
}

func (m *machine) lowerAtomicLoadImpl(rn, rt operand, size uint64) {
	ld := m.allocateInstr()
	ld.asAtomicLoad(rn, rt, size)
	m.insert(ld)
}

func (m *machine) lowerAtomicStore(si *ssa.Instruction) {
	addr, val := si.Arg2()
	size := si.AtomicTargetSize()

	addrDef := m.compiler.ValueDefinition(addr)
	valDef := m.compiler.ValueDefinition(val)
	rn := m.getOperand_NR(addrDef, extModeNone)
	rt := m.getOperand_NR(valDef, extModeNone)

	m.lowerAtomicStoreImpl(rn, rt, size)
}

func (m *machine) lowerAtomicStoreImpl(rn, rt operand, size uint64) {
	st := m.allocateInstr()
	st.asAtomicStore(rn, rt, size)
	m.insert(st)
}

// copyToTmp copies the given regalloc.VReg to a temporary register. This is called before cbr to avoid a
// register allocation issue, e.g. a reload happening in the middle of the exit sequence, which is not on the
// path the normal execution takes.
func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg {
	typ := m.compiler.TypeOf(v)
	mov := m.allocateInstr()
	tmp := m.compiler.AllocateVReg(typ)
	if typ.IsInt() {
		mov.asMove64(tmp, v)
	} else {
		mov.asFpuMov128(tmp, v)
	}
	m.insert(mov)
	return tmp
}
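
// For reference, a minimal Go sketch (illustrative only, not atomic, and not part of this package) of the
// CAS register behavior that lowerAtomicCas above works around: the instruction writes the old memory value
// back into the "expected" register, which is why the lowering copies the expected value into a scratch
// register before the CAS and then moves that scratch register into the SSA result afterwards.
//
//	// casSketch models `cas rs, rt, [rn]` on a 64-bit location: it compares *mem with expected,
//	// stores replacement on a match, and always returns the old value (which the real instruction
//	// writes back into rs).
//	func casSketch(mem *uint64, expected, replacement uint64) (old uint64) {
//		old = *mem
//		if old == expected {
//			*mem = replacement
//		}
//		return old
//	}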