github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/engine/wazevo/backend/isa/arm64/lower_instr.go

package arm64

// Files prefixed with lower_instr** implement instruction selection, i.e. lowering SSA-level instructions
// into machine-specific instructions.
//
// Importantly, the lower** functions also perform tree-matching: they find patterns in the given instruction
// tree and merge multiple SSA instructions into one machine instruction where possible. This can be
// considered "N:1" instruction selection.

import (
	"fmt"
	"math"

	"github.com/bananabytelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/bananabytelabs/wazero/internal/engine/wazevo/ssa"
	"github.com/bananabytelabs/wazero/internal/engine/wazevo/wazevoapi"
)

// LowerSingleBranch implements backend.Machine.
func (m *machine) LowerSingleBranch(br *ssa.Instruction) {
	ectx := m.executableContext
	switch br.Opcode() {
	case ssa.OpcodeJump:
		_, _, targetBlk := br.BranchData()
		if br.IsFallthroughJump() {
			return
		}
		b := m.allocateInstr()
		target := ectx.GetOrAllocateSSABlockLabel(targetBlk)
		if target == labelReturn {
			b.asRet(m.currentABI)
		} else {
			b.asBr(target)
		}
		m.insert(b)
	case ssa.OpcodeBrTable:
		m.lowerBrTable(br)
	default:
		panic("BUG: unexpected branch opcode: " + br.Opcode().String())
	}
}

func (m *machine) lowerBrTable(i *ssa.Instruction) {
	index, targets := i.BrTableData()
	indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone)

	// First, we have to bounds-check the index, and
	// clamp it to the default target (sitting at the end of the list) if it's out of bounds.

	// mov maxIndexReg #maximum_index
	// subs wzr, index, maxIndexReg
	// csel adjustedIndex, maxIndexReg, index, hs ;; if index is higher than or equal to maxIndexReg.
	maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32)
	m.lowerConstantI32(maxIndexReg, int32(len(targets)-1))
	subs := m.allocateInstr()
	subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false)
	m.insert(subs)
	csel := m.allocateInstr()
	adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32)
	csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false)
	m.insert(csel)

	brSequence := m.allocateInstr()

	// TODO: reuse the slice!
	labels := make([]uint32, len(targets))
	for j, target := range targets {
		labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target))
	}

	brSequence.asBrTableSequence(adjustedIndex, labels)
	m.insert(brSequence)
}

// LowerConditionalBranch implements backend.Machine.
func (m *machine) LowerConditionalBranch(b *ssa.Instruction) {
	exctx := m.executableContext
	cval, args, targetBlk := b.BranchData()
	if len(args) > 0 {
		panic(fmt.Sprintf(
			"conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s",
			exctx.CurrentSSABlk,
			targetBlk,
		))
	}

	target := exctx.GetOrAllocateSSABlockLabel(targetBlk)
	cvalDef := m.compiler.ValueDefinition(cval)

	switch {
	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // In this case, we can use the ALU flags set by the SUBS instruction.
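	// When MatchInstr succeeds here, the compare and the branch are fused into a flag-setting SUBS
	// followed by a single B.cond instead of materializing a boolean first; this is the "N:1"
	// selection mentioned in the file comment. Illustrative output for `brnz (icmp lt x, y)`
	// (register names are placeholders):
	//
	//	subs wzr, w0, w1
	//	b.lt L(target)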
91 cvalInstr := cvalDef.Instr 92 x, y, c := cvalInstr.IcmpData() 93 cc, signed := condFlagFromSSAIntegerCmpCond(c), c.Signed() 94 if b.Opcode() == ssa.OpcodeBrz { 95 cc = cc.invert() 96 } 97 98 m.lowerIcmpToFlag(x, y, signed) 99 cbr := m.allocateInstr() 100 cbr.asCondBr(cc.asCond(), target, false /* ignored */) 101 m.insert(cbr) 102 cvalDef.Instr.MarkLowered() 103 case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly. 104 cvalInstr := cvalDef.Instr 105 x, y, c := cvalInstr.FcmpData() 106 cc := condFlagFromSSAFloatCmpCond(c) 107 if b.Opcode() == ssa.OpcodeBrz { 108 cc = cc.invert() 109 } 110 m.lowerFcmpToFlag(x, y) 111 cbr := m.allocateInstr() 112 cbr.asCondBr(cc.asCond(), target, false /* ignored */) 113 m.insert(cbr) 114 cvalDef.Instr.MarkLowered() 115 default: 116 rn := m.getOperand_NR(cvalDef, extModeNone) 117 var c cond 118 if b.Opcode() == ssa.OpcodeBrz { 119 c = registerAsRegZeroCond(rn.nr()) 120 } else { 121 c = registerAsRegNotZeroCond(rn.nr()) 122 } 123 cbr := m.allocateInstr() 124 cbr.asCondBr(c, target, false) 125 m.insert(cbr) 126 } 127 } 128 129 // LowerInstr implements backend.Machine. 130 func (m *machine) LowerInstr(instr *ssa.Instruction) { 131 if l := instr.SourceOffset(); l.Valid() { 132 info := m.allocateInstr().asEmitSourceOffsetInfo(l) 133 m.insert(info) 134 } 135 136 switch op := instr.Opcode(); op { 137 case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: 138 panic("BUG: branching instructions are handled by LowerBranches") 139 case ssa.OpcodeReturn: 140 panic("BUG: return must be handled by backend.Compiler") 141 case ssa.OpcodeIadd, ssa.OpcodeIsub: 142 m.lowerSubOrAdd(instr, op == ssa.OpcodeIadd) 143 case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv, ssa.OpcodeFmax, ssa.OpcodeFmin: 144 m.lowerFpuBinOp(instr) 145 case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. 
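	// Nothing is emitted for constants here: they are materialized lazily at each use site
	// (e.g. via lowerConstantI32/lowerConstantI64 or the getOperand_* helpers), typically as a
	// movz/movk sequence or an FPU literal load. Illustrative materialization of iconst 0x12345678:
	//
	//	movz w8, #0x5678
	//	movk w8, #0x1234, lsl 16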
146 case ssa.OpcodeExitWithCode: 147 execCtx, code := instr.ExitWithCodeData() 148 m.lowerExitWithCode(m.compiler.VRegOf(execCtx), code) 149 case ssa.OpcodeExitIfTrueWithCode: 150 execCtx, c, code := instr.ExitIfTrueWithCodeData() 151 m.lowerExitIfTrueWithCode(m.compiler.VRegOf(execCtx), c, code) 152 case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: 153 m.lowerStore(instr) 154 case ssa.OpcodeLoad: 155 dst := instr.Return() 156 ptr, offset, typ := instr.LoadData() 157 m.lowerLoad(ptr, offset, typ, dst) 158 case ssa.OpcodeVZeroExtLoad: 159 dst := instr.Return() 160 ptr, offset, typ := instr.VZeroExtLoadData() 161 m.lowerLoad(ptr, offset, typ, dst) 162 case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: 163 ptr, offset, _ := instr.LoadData() 164 ret := m.compiler.VRegOf(instr.Return()) 165 m.lowerExtLoad(op, ptr, offset, ret) 166 case ssa.OpcodeCall, ssa.OpcodeCallIndirect: 167 m.lowerCall(instr) 168 case ssa.OpcodeIcmp: 169 m.lowerIcmp(instr) 170 case ssa.OpcodeVIcmp: 171 m.lowerVIcmp(instr) 172 case ssa.OpcodeVFcmp: 173 m.lowerVFcmp(instr) 174 case ssa.OpcodeVCeil: 175 m.lowerVecMisc(vecOpFrintp, instr) 176 case ssa.OpcodeVFloor: 177 m.lowerVecMisc(vecOpFrintm, instr) 178 case ssa.OpcodeVTrunc: 179 m.lowerVecMisc(vecOpFrintz, instr) 180 case ssa.OpcodeVNearest: 181 m.lowerVecMisc(vecOpFrintn, instr) 182 case ssa.OpcodeVMaxPseudo: 183 m.lowerVMinMaxPseudo(instr, true) 184 case ssa.OpcodeVMinPseudo: 185 m.lowerVMinMaxPseudo(instr, false) 186 case ssa.OpcodeBand: 187 m.lowerBitwiseAluOp(instr, aluOpAnd) 188 case ssa.OpcodeBor: 189 m.lowerBitwiseAluOp(instr, aluOpOrr) 190 case ssa.OpcodeBxor: 191 m.lowerBitwiseAluOp(instr, aluOpEor) 192 case ssa.OpcodeIshl: 193 m.lowerShifts(instr, extModeNone, aluOpLsl) 194 case ssa.OpcodeSshr: 195 if instr.Return().Type().Bits() == 64 { 196 m.lowerShifts(instr, extModeSignExtend64, aluOpAsr) 197 } else { 198 m.lowerShifts(instr, extModeSignExtend32, aluOpAsr) 199 } 200 case ssa.OpcodeUshr: 201 if instr.Return().Type().Bits() == 64 { 202 m.lowerShifts(instr, extModeZeroExtend64, aluOpLsr) 203 } else { 204 m.lowerShifts(instr, extModeZeroExtend32, aluOpLsr) 205 } 206 case ssa.OpcodeRotl: 207 m.lowerRotl(instr) 208 case ssa.OpcodeRotr: 209 m.lowerRotr(instr) 210 case ssa.OpcodeSExtend, ssa.OpcodeUExtend: 211 from, to, signed := instr.ExtendData() 212 m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) 213 case ssa.OpcodeFcmp: 214 x, y, c := instr.FcmpData() 215 m.lowerFcmp(x, y, instr.Return(), c) 216 case ssa.OpcodeImul: 217 x, y := instr.Arg2() 218 result := instr.Return() 219 m.lowerImul(x, y, result) 220 case ssa.OpcodeUndefined: 221 undef := m.allocateInstr() 222 undef.asUDF() 223 m.insert(undef) 224 case ssa.OpcodeSelect: 225 c, x, y := instr.SelectData() 226 if x.Type() == ssa.TypeV128 { 227 rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) 228 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 229 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 230 rd := operandNR(m.compiler.VRegOf(instr.Return())) 231 m.lowerSelectVec(rc, rn, rm, rd) 232 } else { 233 m.lowerSelect(c, x, y, instr.Return()) 234 } 235 case ssa.OpcodeClz: 236 x := instr.Arg() 237 result := instr.Return() 238 m.lowerClz(x, result) 239 case ssa.OpcodeCtz: 240 x := instr.Arg() 241 result := instr.Return() 242 m.lowerCtz(x, result) 243 case ssa.OpcodePopcnt: 244 x := instr.Arg() 245 result := instr.Return() 246 m.lowerPopcnt(x, result) 247 
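	// The trapping float-to-int conversions and their saturating variants share the same lowering:
	// fcvtzs/fcvtzu perform the conversion, and for the trapping flavor lowerFpuToInt additionally
	// inspects the FPSR afterwards to raise an invalid-conversion or integer-overflow exit.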
case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: 248 x, ctx := instr.Arg2() 249 result := instr.Return() 250 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 251 rd := operandNR(m.compiler.VRegOf(result)) 252 ctxVReg := m.compiler.VRegOf(ctx) 253 m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64, 254 result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) 255 case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: 256 x, ctx := instr.Arg2() 257 result := instr.Return() 258 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 259 rd := operandNR(m.compiler.VRegOf(result)) 260 ctxVReg := m.compiler.VRegOf(ctx) 261 m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64, 262 result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) 263 case ssa.OpcodeFcvtFromSint: 264 x := instr.Arg() 265 result := instr.Return() 266 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 267 rd := operandNR(m.compiler.VRegOf(result)) 268 m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) 269 case ssa.OpcodeFcvtFromUint: 270 x := instr.Arg() 271 result := instr.Return() 272 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 273 rd := operandNR(m.compiler.VRegOf(result)) 274 m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) 275 case ssa.OpcodeFdemote: 276 v := instr.Arg() 277 rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) 278 rd := operandNR(m.compiler.VRegOf(instr.Return())) 279 cnt := m.allocateInstr() 280 cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false) 281 m.insert(cnt) 282 case ssa.OpcodeFpromote: 283 v := instr.Arg() 284 rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) 285 rd := operandNR(m.compiler.VRegOf(instr.Return())) 286 cnt := m.allocateInstr() 287 cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true) 288 m.insert(cnt) 289 case ssa.OpcodeIreduce: 290 rn := m.getOperand_NR(m.compiler.ValueDefinition(instr.Arg()), extModeNone) 291 retVal := instr.Return() 292 rd := m.compiler.VRegOf(retVal) 293 294 if retVal.Type() != ssa.TypeI32 { 295 panic("TODO?: Ireduce to non-i32") 296 } 297 mov := m.allocateInstr() 298 mov.asMove32(rd, rn.reg()) 299 m.insert(mov) 300 case ssa.OpcodeFneg: 301 m.lowerFpuUniOp(fpuUniOpNeg, instr.Arg(), instr.Return()) 302 case ssa.OpcodeSqrt: 303 m.lowerFpuUniOp(fpuUniOpSqrt, instr.Arg(), instr.Return()) 304 case ssa.OpcodeCeil: 305 m.lowerFpuUniOp(fpuUniOpRoundPlus, instr.Arg(), instr.Return()) 306 case ssa.OpcodeFloor: 307 m.lowerFpuUniOp(fpuUniOpRoundMinus, instr.Arg(), instr.Return()) 308 case ssa.OpcodeTrunc: 309 m.lowerFpuUniOp(fpuUniOpRoundZero, instr.Arg(), instr.Return()) 310 case ssa.OpcodeNearest: 311 m.lowerFpuUniOp(fpuUniOpRoundNearest, instr.Arg(), instr.Return()) 312 case ssa.OpcodeFabs: 313 m.lowerFpuUniOp(fpuUniOpAbs, instr.Arg(), instr.Return()) 314 case ssa.OpcodeBitcast: 315 m.lowerBitcast(instr) 316 case ssa.OpcodeFcopysign: 317 x, y := instr.Arg2() 318 m.lowerFcopysign(x, y, instr.Return()) 319 case ssa.OpcodeSdiv, ssa.OpcodeUdiv: 320 x, y, ctx := instr.Arg3() 321 ctxVReg := m.compiler.VRegOf(ctx) 322 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 323 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 324 rd := operandNR(m.compiler.VRegOf(instr.Return())) 325 m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv) 326 case ssa.OpcodeSrem, ssa.OpcodeUrem: 327 x, y, ctx := instr.Arg3() 328 ctxVReg := m.compiler.VRegOf(ctx) 329 rn := 
m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 330 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 331 rd := operandNR(m.compiler.VRegOf(instr.Return())) 332 m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem) 333 case ssa.OpcodeVconst: 334 result := m.compiler.VRegOf(instr.Return()) 335 lo, hi := instr.VconstData() 336 v := m.allocateInstr() 337 v.asLoadFpuConst128(result, lo, hi) 338 m.insert(v) 339 case ssa.OpcodeVbnot: 340 x := instr.Arg() 341 ins := m.allocateInstr() 342 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 343 rd := operandNR(m.compiler.VRegOf(instr.Return())) 344 ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B) 345 m.insert(ins) 346 case ssa.OpcodeVbxor: 347 x, y := instr.Arg2() 348 m.lowerVecRRR(vecOpEOR, x, y, instr.Return(), vecArrangement16B) 349 case ssa.OpcodeVbor: 350 x, y := instr.Arg2() 351 m.lowerVecRRR(vecOpOrr, x, y, instr.Return(), vecArrangement16B) 352 case ssa.OpcodeVband: 353 x, y := instr.Arg2() 354 m.lowerVecRRR(vecOpAnd, x, y, instr.Return(), vecArrangement16B) 355 case ssa.OpcodeVbandnot: 356 x, y := instr.Arg2() 357 m.lowerVecRRR(vecOpBic, x, y, instr.Return(), vecArrangement16B) 358 case ssa.OpcodeVbitselect: 359 c, x, y := instr.SelectData() 360 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 361 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 362 creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) 363 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 364 365 // creg is overwritten by BSL, so we need to move it to the result register before the instruction 366 // in case when it is used somewhere else. 367 mov := m.allocateInstr() 368 mov.asFpuMov128(tmp.nr(), creg.nr()) 369 m.insert(mov) 370 371 ins := m.allocateInstr() 372 ins.asVecRRRRewrite(vecOpBsl, tmp, rn, rm, vecArrangement16B) 373 m.insert(ins) 374 375 mov2 := m.allocateInstr() 376 rd := m.compiler.VRegOf(instr.Return()) 377 mov2.asFpuMov128(rd, tmp.nr()) 378 m.insert(mov2) 379 case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue: 380 x, lane := instr.ArgWithLane() 381 var arr vecArrangement 382 if op == ssa.OpcodeVallTrue { 383 arr = ssaLaneToArrangement(lane) 384 } 385 rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 386 rd := operandNR(m.compiler.VRegOf(instr.Return())) 387 m.lowerVcheckTrue(op, rm, rd, arr) 388 case ssa.OpcodeVhighBits: 389 x, lane := instr.ArgWithLane() 390 rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 391 rd := operandNR(m.compiler.VRegOf(instr.Return())) 392 arr := ssaLaneToArrangement(lane) 393 m.lowerVhighBits(rm, rd, arr) 394 case ssa.OpcodeVIadd: 395 x, y, lane := instr.Arg2WithLane() 396 arr := ssaLaneToArrangement(lane) 397 m.lowerVecRRR(vecOpAdd, x, y, instr.Return(), arr) 398 case ssa.OpcodeIaddPairwise: 399 x, y, lane := instr.Arg2WithLane() 400 arr := ssaLaneToArrangement(lane) 401 m.lowerVecRRR(vecOpAddp, x, y, instr.Return(), arr) 402 case ssa.OpcodeVSaddSat: 403 x, y, lane := instr.Arg2WithLane() 404 arr := ssaLaneToArrangement(lane) 405 m.lowerVecRRR(vecOpSqadd, x, y, instr.Return(), arr) 406 case ssa.OpcodeVUaddSat: 407 x, y, lane := instr.Arg2WithLane() 408 arr := ssaLaneToArrangement(lane) 409 m.lowerVecRRR(vecOpUqadd, x, y, instr.Return(), arr) 410 case ssa.OpcodeVIsub: 411 x, y, lane := instr.Arg2WithLane() 412 arr := ssaLaneToArrangement(lane) 413 m.lowerVecRRR(vecOpSub, x, y, instr.Return(), arr) 414 case ssa.OpcodeVSsubSat: 415 x, y, lane := instr.Arg2WithLane() 416 arr := 
ssaLaneToArrangement(lane) 417 m.lowerVecRRR(vecOpSqsub, x, y, instr.Return(), arr) 418 case ssa.OpcodeVUsubSat: 419 x, y, lane := instr.Arg2WithLane() 420 arr := ssaLaneToArrangement(lane) 421 m.lowerVecRRR(vecOpUqsub, x, y, instr.Return(), arr) 422 case ssa.OpcodeVImin: 423 x, y, lane := instr.Arg2WithLane() 424 arr := ssaLaneToArrangement(lane) 425 m.lowerVecRRR(vecOpSmin, x, y, instr.Return(), arr) 426 case ssa.OpcodeVUmin: 427 x, y, lane := instr.Arg2WithLane() 428 arr := ssaLaneToArrangement(lane) 429 m.lowerVecRRR(vecOpUmin, x, y, instr.Return(), arr) 430 case ssa.OpcodeVImax: 431 x, y, lane := instr.Arg2WithLane() 432 arr := ssaLaneToArrangement(lane) 433 m.lowerVecRRR(vecOpSmax, x, y, instr.Return(), arr) 434 case ssa.OpcodeVUmax: 435 x, y, lane := instr.Arg2WithLane() 436 arr := ssaLaneToArrangement(lane) 437 m.lowerVecRRR(vecOpUmax, x, y, instr.Return(), arr) 438 case ssa.OpcodeVAvgRound: 439 x, y, lane := instr.Arg2WithLane() 440 arr := ssaLaneToArrangement(lane) 441 m.lowerVecRRR(vecOpUrhadd, x, y, instr.Return(), arr) 442 case ssa.OpcodeVImul: 443 x, y, lane := instr.Arg2WithLane() 444 arr := ssaLaneToArrangement(lane) 445 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 446 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 447 rd := operandNR(m.compiler.VRegOf(instr.Return())) 448 m.lowerVIMul(rd, rn, rm, arr) 449 case ssa.OpcodeVIabs: 450 m.lowerVecMisc(vecOpAbs, instr) 451 case ssa.OpcodeVIneg: 452 m.lowerVecMisc(vecOpNeg, instr) 453 case ssa.OpcodeVIpopcnt: 454 m.lowerVecMisc(vecOpCnt, instr) 455 case ssa.OpcodeVIshl, 456 ssa.OpcodeVSshr, ssa.OpcodeVUshr: 457 x, y, lane := instr.Arg2WithLane() 458 arr := ssaLaneToArrangement(lane) 459 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 460 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 461 rd := operandNR(m.compiler.VRegOf(instr.Return())) 462 m.lowerVShift(op, rd, rn, rm, arr) 463 case ssa.OpcodeVSqrt: 464 m.lowerVecMisc(vecOpFsqrt, instr) 465 case ssa.OpcodeVFabs: 466 m.lowerVecMisc(vecOpFabs, instr) 467 case ssa.OpcodeVFneg: 468 m.lowerVecMisc(vecOpFneg, instr) 469 case ssa.OpcodeVFmin: 470 x, y, lane := instr.Arg2WithLane() 471 arr := ssaLaneToArrangement(lane) 472 m.lowerVecRRR(vecOpFmin, x, y, instr.Return(), arr) 473 case ssa.OpcodeVFmax: 474 x, y, lane := instr.Arg2WithLane() 475 arr := ssaLaneToArrangement(lane) 476 m.lowerVecRRR(vecOpFmax, x, y, instr.Return(), arr) 477 case ssa.OpcodeVFadd: 478 x, y, lane := instr.Arg2WithLane() 479 arr := ssaLaneToArrangement(lane) 480 m.lowerVecRRR(vecOpFadd, x, y, instr.Return(), arr) 481 case ssa.OpcodeVFsub: 482 x, y, lane := instr.Arg2WithLane() 483 arr := ssaLaneToArrangement(lane) 484 m.lowerVecRRR(vecOpFsub, x, y, instr.Return(), arr) 485 case ssa.OpcodeVFmul: 486 x, y, lane := instr.Arg2WithLane() 487 arr := ssaLaneToArrangement(lane) 488 m.lowerVecRRR(vecOpFmul, x, y, instr.Return(), arr) 489 case ssa.OpcodeSqmulRoundSat: 490 x, y, lane := instr.Arg2WithLane() 491 arr := ssaLaneToArrangement(lane) 492 m.lowerVecRRR(vecOpSqrdmulh, x, y, instr.Return(), arr) 493 case ssa.OpcodeVFdiv: 494 x, y, lane := instr.Arg2WithLane() 495 arr := ssaLaneToArrangement(lane) 496 m.lowerVecRRR(vecOpFdiv, x, y, instr.Return(), arr) 497 case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: 498 x, lane := instr.ArgWithLane() 499 arr := ssaLaneToArrangement(lane) 500 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 501 rd := operandNR(m.compiler.VRegOf(instr.Return())) 502 m.lowerVfpuToInt(rd, rn, arr, op 
== ssa.OpcodeVFcvtToSintSat) 503 case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint: 504 x, lane := instr.ArgWithLane() 505 arr := ssaLaneToArrangement(lane) 506 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 507 rd := operandNR(m.compiler.VRegOf(instr.Return())) 508 m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint) 509 case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow: 510 x, lane := instr.ArgWithLane() 511 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 512 rd := operandNR(m.compiler.VRegOf(instr.Return())) 513 514 var arr vecArrangement 515 switch lane { 516 case ssa.VecLaneI8x16: 517 arr = vecArrangement8B 518 case ssa.VecLaneI16x8: 519 arr = vecArrangement4H 520 case ssa.VecLaneI32x4: 521 arr = vecArrangement2S 522 } 523 524 shll := m.allocateInstr() 525 if signed := op == ssa.OpcodeSwidenLow; signed { 526 shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) 527 } else { 528 shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) 529 } 530 m.insert(shll) 531 case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh: 532 x, lane := instr.ArgWithLane() 533 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 534 rd := operandNR(m.compiler.VRegOf(instr.Return())) 535 536 arr := ssaLaneToArrangement(lane) 537 538 shll := m.allocateInstr() 539 if signed := op == ssa.OpcodeSwidenHigh; signed { 540 shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) 541 } else { 542 shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) 543 } 544 m.insert(shll) 545 546 case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: 547 x, y, lane := instr.Arg2WithLane() 548 var arr, arr2 vecArrangement 549 switch lane { 550 case ssa.VecLaneI16x8: // I16x8 551 arr = vecArrangement8B 552 arr2 = vecArrangement16B // Implies sqxtn2. 553 case ssa.VecLaneI32x4: 554 arr = vecArrangement4H 555 arr2 = vecArrangement8H // Implies sqxtn2. 556 default: 557 panic("unsupported lane " + lane.String()) 558 } 559 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 560 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 561 rd := operandNR(m.compiler.VRegOf(instr.Return())) 562 563 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 564 565 loQxtn := m.allocateInstr() 566 hiQxtn := m.allocateInstr() 567 if signed := op == ssa.OpcodeSnarrow; signed { 568 // Narrow lanes on rn and write them into lower-half of rd. 569 loQxtn.asVecMisc(vecOpSqxtn, tmp, rn, arr) // low 570 // Narrow lanes on rm and write them into higher-half of rd. 571 hiQxtn.asVecMisc(vecOpSqxtn, tmp, rm, arr2) // high (sqxtn2) 572 } else { 573 // Narrow lanes on rn and write them into lower-half of rd. 574 loQxtn.asVecMisc(vecOpSqxtun, tmp, rn, arr) // low 575 // Narrow lanes on rm and write them into higher-half of rd. 
576 hiQxtn.asVecMisc(vecOpSqxtun, tmp, rm, arr2) // high (sqxtn2) 577 } 578 m.insert(loQxtn) 579 m.insert(hiQxtn) 580 581 mov := m.allocateInstr() 582 mov.asFpuMov128(rd.nr(), tmp.nr()) 583 m.insert(mov) 584 case ssa.OpcodeFvpromoteLow: 585 x, lane := instr.ArgWithLane() 586 if lane != ssa.VecLaneF32x4 { 587 panic("unsupported lane type " + lane.String()) 588 } 589 ins := m.allocateInstr() 590 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 591 rd := operandNR(m.compiler.VRegOf(instr.Return())) 592 ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S) 593 m.insert(ins) 594 case ssa.OpcodeFvdemote: 595 x, lane := instr.ArgWithLane() 596 if lane != ssa.VecLaneF64x2 { 597 panic("unsupported lane type " + lane.String()) 598 } 599 ins := m.allocateInstr() 600 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 601 rd := operandNR(m.compiler.VRegOf(instr.Return())) 602 ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S) 603 m.insert(ins) 604 case ssa.OpcodeExtractlane: 605 x, index, signed, lane := instr.ExtractlaneData() 606 607 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 608 rd := operandNR(m.compiler.VRegOf(instr.Return())) 609 610 mov := m.allocateInstr() 611 switch lane { 612 case ssa.VecLaneI8x16: 613 mov.asMovFromVec(rd, rn, vecArrangementB, vecIndex(index), signed) 614 case ssa.VecLaneI16x8: 615 mov.asMovFromVec(rd, rn, vecArrangementH, vecIndex(index), signed) 616 case ssa.VecLaneI32x4: 617 mov.asMovFromVec(rd, rn, vecArrangementS, vecIndex(index), signed) 618 case ssa.VecLaneI64x2: 619 mov.asMovFromVec(rd, rn, vecArrangementD, vecIndex(index), signed) 620 case ssa.VecLaneF32x4: 621 mov.asVecMovElement(rd, rn, vecArrangementS, vecIndex(0), vecIndex(index)) 622 case ssa.VecLaneF64x2: 623 mov.asVecMovElement(rd, rn, vecArrangementD, vecIndex(0), vecIndex(index)) 624 default: 625 panic("unsupported lane: " + lane.String()) 626 } 627 628 m.insert(mov) 629 630 case ssa.OpcodeInsertlane: 631 x, y, index, lane := instr.InsertlaneData() 632 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 633 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 634 rd := operandNR(m.compiler.VRegOf(instr.Return())) 635 tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 636 637 // Initially mov rn to tmp. 638 mov1 := m.allocateInstr() 639 mov1.asFpuMov128(tmpReg.nr(), rn.nr()) 640 m.insert(mov1) 641 642 // movToVec and vecMovElement do not clear the remaining bits to zero, 643 // thus, we can mov rm in-place to tmp. 644 mov2 := m.allocateInstr() 645 switch lane { 646 case ssa.VecLaneI8x16: 647 mov2.asMovToVec(tmpReg, rm, vecArrangementB, vecIndex(index)) 648 case ssa.VecLaneI16x8: 649 mov2.asMovToVec(tmpReg, rm, vecArrangementH, vecIndex(index)) 650 case ssa.VecLaneI32x4: 651 mov2.asMovToVec(tmpReg, rm, vecArrangementS, vecIndex(index)) 652 case ssa.VecLaneI64x2: 653 mov2.asMovToVec(tmpReg, rm, vecArrangementD, vecIndex(index)) 654 case ssa.VecLaneF32x4: 655 mov2.asVecMovElement(tmpReg, rm, vecArrangementS, vecIndex(index), vecIndex(0)) 656 case ssa.VecLaneF64x2: 657 mov2.asVecMovElement(tmpReg, rm, vecArrangementD, vecIndex(index), vecIndex(0)) 658 } 659 m.insert(mov2) 660 661 // Finally mov tmp to rd. 
662 mov3 := m.allocateInstr() 663 mov3.asFpuMov128(rd.nr(), tmpReg.nr()) 664 m.insert(mov3) 665 666 case ssa.OpcodeSwizzle: 667 x, y, lane := instr.Arg2WithLane() 668 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 669 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 670 rd := operandNR(m.compiler.VRegOf(instr.Return())) 671 672 arr := ssaLaneToArrangement(lane) 673 674 // tbl <rd>.<arr>, { <rn>.<arr> }, <rm>.<arr> 675 tbl1 := m.allocateInstr() 676 tbl1.asVecTbl(1, rd, rn, rm, arr) 677 m.insert(tbl1) 678 679 case ssa.OpcodeShuffle: 680 x, y, lane1, lane2 := instr.ShuffleData() 681 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 682 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 683 rd := operandNR(m.compiler.VRegOf(instr.Return())) 684 685 m.lowerShuffle(rd, rn, rm, lane1, lane2) 686 687 case ssa.OpcodeSplat: 688 x, lane := instr.ArgWithLane() 689 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 690 rd := operandNR(m.compiler.VRegOf(instr.Return())) 691 692 dup := m.allocateInstr() 693 switch lane { 694 case ssa.VecLaneI8x16: 695 dup.asVecDup(rd, rn, vecArrangement16B) 696 case ssa.VecLaneI16x8: 697 dup.asVecDup(rd, rn, vecArrangement8H) 698 case ssa.VecLaneI32x4: 699 dup.asVecDup(rd, rn, vecArrangement4S) 700 case ssa.VecLaneI64x2: 701 dup.asVecDup(rd, rn, vecArrangement2D) 702 case ssa.VecLaneF32x4: 703 dup.asVecDupElement(rd, rn, vecArrangementS, vecIndex(0)) 704 case ssa.VecLaneF64x2: 705 dup.asVecDupElement(rd, rn, vecArrangementD, vecIndex(0)) 706 } 707 m.insert(dup) 708 709 case ssa.OpcodeLoadSplat: 710 ptr, offset, lane := instr.LoadSplatData() 711 m.lowerLoadSplat(ptr, offset, lane, instr.Return()) 712 default: 713 panic("TODO: lowering " + op.String()) 714 } 715 m.executableContext.FlushPendingInstructions() 716 } 717 718 func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) { 719 // `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30. 720 vReg, wReg := v29VReg, v30VReg 721 722 // Initialize v29, v30 to rn, rm. 723 movv := m.allocateInstr() 724 movv.asFpuMov128(vReg, rn.nr()) 725 m.insert(movv) 726 727 movw := m.allocateInstr() 728 movw.asFpuMov128(wReg, rm.nr()) 729 m.insert(movw) 730 731 // `lane1`, `lane2` are already encoded as two u64s with the right layout: 732 // lane1 := lane[7]<<56 | ... | lane[1]<<8 | lane[0] 733 // lane2 := lane[15]<<56 | ... | lane[9]<<8 | lane[8] 734 // Thus, we can use loadFpuConst128. 735 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 736 lfc := m.allocateInstr() 737 lfc.asLoadFpuConst128(tmp.nr(), lane1, lane2) 738 m.insert(lfc) 739 740 // tbl <rd>.16b, { <vReg>.16B, <wReg>.16b }, <tmp>.16b 741 tbl2 := m.allocateInstr() 742 tbl2.asVecTbl(2, rd, operandNR(vReg), tmp, vecArrangement16B) 743 m.insert(tbl2) 744 } 745 746 func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) { 747 var modulo byte 748 switch arr { 749 case vecArrangement16B: 750 modulo = 0x7 // Modulo 8. 751 case vecArrangement8H: 752 modulo = 0xf // Modulo 16. 753 case vecArrangement4S: 754 modulo = 0x1f // Modulo 32. 755 case vecArrangement2D: 756 modulo = 0x3f // Modulo 64. 
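	// The masks above implement Wasm's rule that a vector shift amount is taken modulo the lane
	// width: e.g. an i32x4 shift by 35 behaves like a shift by 3. The masked amount is negated
	// further below for right shifts, because sshl/ushl treat a negative per-lane amount as a
	// right shift.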
757 default: 758 panic("unsupported arrangment " + arr.String()) 759 } 760 761 rtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 762 vtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 763 764 and := m.allocateInstr() 765 and.asALUBitmaskImm(aluOpAnd, rtmp.nr(), rm.nr(), uint64(modulo), true) 766 m.insert(and) 767 768 if op != ssa.OpcodeVIshl { 769 // Negate the amount to make this as right shift. 770 neg := m.allocateInstr() 771 neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true) 772 m.insert(neg) 773 } 774 775 // Copy the shift amount into a vector register as sshl/ushl requires it to be there. 776 dup := m.allocateInstr() 777 dup.asVecDup(vtmp, rtmp, arr) 778 m.insert(dup) 779 780 if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr { 781 sshl := m.allocateInstr() 782 sshl.asVecRRR(vecOpSshl, rd, rn, vtmp, arr) 783 m.insert(sshl) 784 } else { 785 ushl := m.allocateInstr() 786 ushl.asVecRRR(vecOpUshl, rd, rn, vtmp, arr) 787 m.insert(ushl) 788 } 789 } 790 791 func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) { 792 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 793 794 // Special case VallTrue for i64x2. 795 if op == ssa.OpcodeVallTrue && arr == vecArrangement2D { 796 // cmeq v3?.2d, v2?.2d, #0 797 // addp v3?.2d, v3?.2d, v3?.2d 798 // fcmp v3?, v3? 799 // cset dst, eq 800 801 ins := m.allocateInstr() 802 ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D) 803 m.insert(ins) 804 805 addp := m.allocateInstr() 806 addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D) 807 m.insert(addp) 808 809 fcmp := m.allocateInstr() 810 fcmp.asFpuCmp(tmp, tmp, true) 811 m.insert(fcmp) 812 813 cset := m.allocateInstr() 814 cset.asCSet(rd.nr(), false, eq) 815 m.insert(cset) 816 817 return 818 } 819 820 // Create a scalar value with umaxp or uminv, then compare it against zero. 821 ins := m.allocateInstr() 822 if op == ssa.OpcodeVanyTrue { 823 // umaxp v4?.16b, v2?.16b, v2?.16b 824 ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B) 825 } else { 826 // uminv d4?, v2?.4s 827 ins.asVecLanes(vecOpUminv, tmp, rm, arr) 828 } 829 m.insert(ins) 830 831 // mov x3?, v4?.d[0] 832 // ccmp x3?, #0x0, #0x0, al 833 // cset x3?, ne 834 // mov x0, x3? 835 836 movv := m.allocateInstr() 837 movv.asMovFromVec(rd, tmp, vecArrangementD, vecIndex(0), false) 838 m.insert(movv) 839 840 fc := m.allocateInstr() 841 fc.asCCmpImm(rd, uint64(0), al, 0, true) 842 m.insert(fc) 843 844 cset := m.allocateInstr() 845 cset.asCSet(rd.nr(), false, ne) 846 m.insert(cset) 847 } 848 849 func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { 850 r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 851 v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 852 v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 853 854 switch arr { 855 case vecArrangement16B: 856 // sshr v6?.16b, v2?.16b, #7 857 // movz x4?, #0x201, lsl 0 858 // movk x4?, #0x804, lsl 16 859 // movk x4?, #0x2010, lsl 32 860 // movk x4?, #0x8040, lsl 48 861 // dup v5?.2d, x4? 862 // and v6?.16b, v6?.16b, v5?.16b 863 // ext v5?.16b, v6?.16b, v6?.16b, #8 864 // zip1 v5?.16b, v6?.16b, v5?.16b 865 // addv s5?, v5?.8h 866 // umov s3?, v5?.h[0] 867 868 // Right arithmetic shift on the original vector and store the result into v1. So we have: 869 // v1[i] = 0xff if vi<0, 0 otherwise. 870 sshr := m.allocateInstr() 871 sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B) 872 m.insert(sshr) 873 874 // Load the bit mask into r0. 
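	// The movz/movk sequence below builds the constant 0x8040201008040201, i.e. byte i holds 1<<i.
	// After `dup .2d` both 64-bit halves of v0 contain that pattern, so the AND leaves exactly one
	// distinct bit per byte lane, which the ext/zip1/addv steps then collapse into the 16-bit mask.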
875 m.insertMOVZ(r0.nr(), 0x0201, 0, true) 876 m.insertMOVK(r0.nr(), 0x0804, 1, true) 877 m.insertMOVK(r0.nr(), 0x2010, 2, true) 878 m.insertMOVK(r0.nr(), 0x8040, 3, true) 879 880 // dup r0 to v0. 881 dup := m.allocateInstr() 882 dup.asVecDup(v0, r0, vecArrangement2D) 883 m.insert(dup) 884 885 // Lane-wise logical AND with the bit mask, meaning that we have 886 // v[i] = (1 << i) if vi<0, 0 otherwise. 887 // 888 // Below, we use the following notation: 889 // wi := (1 << i) if vi<0, 0 otherwise. 890 and := m.allocateInstr() 891 and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B) 892 m.insert(and) 893 894 // Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have 895 // v0[i] = w(i+8) if i < 8, w(i-8) otherwise. 896 ext := m.allocateInstr() 897 ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8)) 898 m.insert(ext) 899 900 // v = [w0, w8, ..., w7, w15] 901 zip1 := m.allocateInstr() 902 zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B) 903 m.insert(zip1) 904 905 // v.h[0] = w0 + ... + w15 906 addv := m.allocateInstr() 907 addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) 908 m.insert(addv) 909 910 // Extract the v.h[0] as the result. 911 movfv := m.allocateInstr() 912 movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) 913 m.insert(movfv) 914 case vecArrangement8H: 915 // sshr v6?.8h, v2?.8h, #15 916 // movz x4?, #0x1, lsl 0 917 // movk x4?, #0x2, lsl 16 918 // movk x4?, #0x4, lsl 32 919 // movk x4?, #0x8, lsl 48 920 // dup v5?.2d, x4? 921 // lsl x4?, x4?, 0x4 922 // ins v5?.d[1], x4? 923 // and v5?.16b, v6?.16b, v5?.16b 924 // addv s5?, v5?.8h 925 // umov s3?, v5?.h[0] 926 927 // Right arithmetic shift on the original vector and store the result into v1. So we have: 928 // v[i] = 0xffff if vi<0, 0 otherwise. 929 sshr := m.allocateInstr() 930 sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H) 931 m.insert(sshr) 932 933 // Load the bit mask into r0. 934 m.lowerConstantI64(r0.nr(), 0x0008000400020001) 935 936 // dup r0 to vector v0. 937 dup := m.allocateInstr() 938 dup.asVecDup(v0, r0, vecArrangement2D) 939 m.insert(dup) 940 941 lsl := m.allocateInstr() 942 lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true) 943 m.insert(lsl) 944 945 movv := m.allocateInstr() 946 movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) 947 m.insert(movv) 948 949 // Lane-wise logical AND with the bitmask, meaning that we have 950 // v[i] = (1 << i) if vi<0, 0 otherwise for i=0..3 951 // = (1 << (i+4)) if vi<0, 0 otherwise for i=3..7 952 and := m.allocateInstr() 953 and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) 954 m.insert(and) 955 956 addv := m.allocateInstr() 957 addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) 958 m.insert(addv) 959 960 movfv := m.allocateInstr() 961 movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) 962 m.insert(movfv) 963 case vecArrangement4S: 964 // sshr v6?.8h, v2?.8h, #15 965 // movz x4?, #0x1, lsl 0 966 // movk x4?, #0x2, lsl 16 967 // movk x4?, #0x4, lsl 32 968 // movk x4?, #0x8, lsl 48 969 // dup v5?.2d, x4? 970 // lsl x4?, x4?, 0x4 971 // ins v5?.d[1], x4? 972 // and v5?.16b, v6?.16b, v5?.16b 973 // addv s5?, v5?.8h 974 // umov s3?, v5?.h[0] 975 976 // Right arithmetic shift on the original vector and store the result into v1. So we have: 977 // v[i] = 0xffffffff if vi<0, 0 otherwise. 978 sshr := m.allocateInstr() 979 sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S) 980 m.insert(sshr) 981 982 // Load the bit mask into r0. 
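	// 0x0000000200000001 places 1 and 2 in the two lower 32-bit lanes; shifting it left by #2 and
	// inserting the result into d[1] extends that to {1, 2, 4, 8}, one distinct bit per 32-bit lane,
	// so the AND plus `addv .4s` below produce the 4-bit mask directly.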
983 m.lowerConstantI64(r0.nr(), 0x0000000200000001) 984 985 // dup r0 to vector v0. 986 dup := m.allocateInstr() 987 dup.asVecDup(v0, r0, vecArrangement2D) 988 m.insert(dup) 989 990 lsl := m.allocateInstr() 991 lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true) 992 m.insert(lsl) 993 994 movv := m.allocateInstr() 995 movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) 996 m.insert(movv) 997 998 // Lane-wise logical AND with the bitmask, meaning that we have 999 // v[i] = (1 << i) if vi<0, 0 otherwise for i in [0, 1] 1000 // = (1 << (i+4)) if vi<0, 0 otherwise for i in [2, 3] 1001 and := m.allocateInstr() 1002 and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) 1003 m.insert(and) 1004 1005 addv := m.allocateInstr() 1006 addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S) 1007 m.insert(addv) 1008 1009 movfv := m.allocateInstr() 1010 movfv.asMovFromVec(rd, v0, vecArrangementS, vecIndex(0), false) 1011 m.insert(movfv) 1012 case vecArrangement2D: 1013 // mov d3?, v2?.d[0] 1014 // mov x4?, v2?.d[1] 1015 // lsr x4?, x4?, 0x3f 1016 // lsr d3?, d3?, 0x3f 1017 // add s3?, s3?, w4?, lsl #1 1018 1019 // Move the lower 64-bit int into result. 1020 movv0 := m.allocateInstr() 1021 movv0.asMovFromVec(rd, rm, vecArrangementD, vecIndex(0), false) 1022 m.insert(movv0) 1023 1024 // Move the higher 64-bit int into r0. 1025 movv1 := m.allocateInstr() 1026 movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false) 1027 m.insert(movv1) 1028 1029 // Move the sign bit into the least significant bit. 1030 lsr1 := m.allocateInstr() 1031 lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true) 1032 m.insert(lsr1) 1033 1034 lsr2 := m.allocateInstr() 1035 lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true) 1036 m.insert(lsr2) 1037 1038 // rd = (r0<<1) | rd 1039 lsl := m.allocateInstr() 1040 lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false) 1041 m.insert(lsl) 1042 default: 1043 panic("Unsupported " + arr.String()) 1044 } 1045 } 1046 1047 func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) { 1048 x, lane := instr.ArgWithLane() 1049 arr := ssaLaneToArrangement(lane) 1050 ins := m.allocateInstr() 1051 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1052 rd := operandNR(m.compiler.VRegOf(instr.Return())) 1053 ins.asVecMisc(op, rd, rn, arr) 1054 m.insert(ins) 1055 } 1056 1057 func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) { 1058 ins := m.allocateInstr() 1059 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1060 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1061 rd := operandNR(m.compiler.VRegOf(ret)) 1062 ins.asVecRRR(op, rd, rn, rm, arr) 1063 m.insert(ins) 1064 } 1065 1066 func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { 1067 if arr != vecArrangement2D { 1068 mul := m.allocateInstr() 1069 mul.asVecRRR(vecOpMul, rd, rn, rm, arr) 1070 m.insert(mul) 1071 } else { 1072 tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1073 tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1074 tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1075 1076 tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1077 1078 // Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696 1079 rev64 := m.allocateInstr() 1080 rev64.asVecMisc(vecOpRev64, tmp2, rm, vecArrangement4S) 1081 m.insert(rev64) 1082 1083 mul := m.allocateInstr() 1084 mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S) 1085 m.insert(mul) 1086 1087 xtn1 := 
m.allocateInstr() 1088 xtn1.asVecMisc(vecOpXtn, tmp1, rn, vecArrangement2S) 1089 m.insert(xtn1) 1090 1091 addp := m.allocateInstr() 1092 addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S) 1093 m.insert(addp) 1094 1095 xtn2 := m.allocateInstr() 1096 xtn2.asVecMisc(vecOpXtn, tmp3, rm, vecArrangement2S) 1097 m.insert(xtn2) 1098 1099 // Note: do not write the result directly into result yet. This is the same reason as in bsl. 1100 // In short, in UMLAL instruction, the result register is also one of the source register, and 1101 // the value on the result register is significant. 1102 shll := m.allocateInstr() 1103 shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S) 1104 m.insert(shll) 1105 1106 umlal := m.allocateInstr() 1107 umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S) 1108 m.insert(umlal) 1109 1110 mov := m.allocateInstr() 1111 mov.asFpuMov128(rd.nr(), tmpRes.nr()) 1112 m.insert(mov) 1113 } 1114 } 1115 1116 func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) { 1117 x, y, lane := instr.Arg2WithLane() 1118 arr := ssaLaneToArrangement(lane) 1119 1120 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1121 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1122 1123 // Note: this usage of tmp is important. 1124 // BSL modifies the destination register, so we need to use a temporary register so that 1125 // the actual definition of the destination register happens *after* the BSL instruction. 1126 // That way, we can force the spill instruction to be inserted after the BSL instruction. 1127 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1128 1129 fcmgt := m.allocateInstr() 1130 if max { 1131 fcmgt.asVecRRR(vecOpFcmgt, tmp, rm, rn, arr) 1132 } else { 1133 // If min, swap the args. 1134 fcmgt.asVecRRR(vecOpFcmgt, tmp, rn, rm, arr) 1135 } 1136 m.insert(fcmgt) 1137 1138 bsl := m.allocateInstr() 1139 bsl.asVecRRRRewrite(vecOpBsl, tmp, rm, rn, vecArrangement16B) 1140 m.insert(bsl) 1141 1142 res := operandNR(m.compiler.VRegOf(instr.Return())) 1143 mov2 := m.allocateInstr() 1144 mov2.asFpuMov128(res.nr(), tmp.nr()) 1145 m.insert(mov2) 1146 } 1147 1148 func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { 1149 div := m.allocateInstr() 1150 1151 if signed { 1152 div.asALU(aluOpSDiv, rd, rn, rm, _64bit) 1153 } else { 1154 div.asALU(aluOpUDiv, rd, rn, rm, _64bit) 1155 } 1156 m.insert(div) 1157 1158 // Check if rm is zero: 1159 m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) 1160 1161 // rd = rn-rd*rm by MSUB instruction. 1162 msub := m.allocateInstr() 1163 msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit) 1164 m.insert(msub) 1165 } 1166 1167 func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { 1168 div := m.allocateInstr() 1169 1170 if signed { 1171 div.asALU(aluOpSDiv, rd, rn, rm, _64bit) 1172 } else { 1173 div.asALU(aluOpUDiv, rd, rn, rm, _64bit) 1174 } 1175 m.insert(div) 1176 1177 // Check if rm is zero: 1178 m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) 1179 1180 if signed { 1181 // We need to check the signed overflow which happens iff "math.MinInt{32,64} / -1" 1182 minusOneCheck := m.allocateInstr() 1183 // Sets eq condition if rm == -1. 
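	// (adds xzr, rm, #1 sets the Z flag exactly when rm == -1, since -1 + 1 wraps to zero. The
	// conditional compare below then evaluates rn - 1 only in that case, which sets the V flag
	// precisely when rn == math.MinInt{32,64}, i.e. the single overflowing combination.)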
1184 minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit) 1185 m.insert(minusOneCheck) 1186 1187 ccmp := m.allocateInstr() 1188 // If eq condition is set, sets the flag by the result based on "rn - 1", otherwise clears the flag. 1189 ccmp.asCCmpImm(rn, 1, eq, 0, _64bit) 1190 m.insert(ccmp) 1191 1192 // Check the overflow flag. 1193 m.exitIfNot(execCtxVReg, vs.invert().asCond(), false, wazevoapi.ExitCodeIntegerOverflow) 1194 } 1195 } 1196 1197 // exitIfNot emits a conditional branch to exit if the condition is not met. 1198 // If `c` (cond type) is a register, `cond64bit` must be chosen to indicate whether the register is 32-bit or 64-bit. 1199 // Otherwise, `cond64bit` is ignored. 1200 func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, code wazevoapi.ExitCode) { 1201 execCtxTmp := m.copyToTmp(execCtxVReg) 1202 1203 cbr := m.allocateInstr() 1204 m.insert(cbr) 1205 m.lowerExitWithCode(execCtxTmp, code) 1206 // Conditional branch target is after exit. 1207 l := m.insertBrTargetLabel() 1208 cbr.asCondBr(c, l, cond64bit) 1209 } 1210 1211 func (m *machine) lowerFcopysign(x, y, ret ssa.Value) { 1212 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1213 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1214 var tmpI, tmpF operand 1215 _64 := x.Type() == ssa.TypeF64 1216 if _64 { 1217 tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) 1218 tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 1219 } else { 1220 tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32)) 1221 tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) 1222 } 1223 rd := m.compiler.VRegOf(ret) 1224 m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64) 1225 } 1226 1227 func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) { 1228 // This is exactly the same code emitted by GCC for "__builtin_copysign": 1229 // 1230 // mov x0, -9223372036854775808 1231 // fmov d2, x0 1232 // vbit v0.8b, v1.8b, v2.8b 1233 // 1234 1235 setMSB := m.allocateInstr() 1236 if _64bit { 1237 m.lowerConstantI64(tmpI.nr(), math.MinInt64) 1238 setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0)) 1239 } else { 1240 m.lowerConstantI32(tmpI.nr(), math.MinInt32) 1241 setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0)) 1242 } 1243 m.insert(setMSB) 1244 1245 tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) 1246 1247 mov := m.allocateInstr() 1248 mov.asFpuMov64(tmpReg.nr(), rn.nr()) 1249 m.insert(mov) 1250 1251 vbit := m.allocateInstr() 1252 vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B) 1253 m.insert(vbit) 1254 1255 movDst := m.allocateInstr() 1256 movDst.asFpuMov64(rd.nr(), tmpReg.nr()) 1257 m.insert(movDst) 1258 } 1259 1260 func (m *machine) lowerBitcast(instr *ssa.Instruction) { 1261 v, dstType := instr.BitcastData() 1262 srcType := v.Type() 1263 rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) 1264 rd := operandNR(m.compiler.VRegOf(instr.Return())) 1265 srcInt := srcType.IsInt() 1266 dstInt := dstType.IsInt() 1267 switch { 1268 case srcInt && !dstInt: // Int to Float: 1269 mov := m.allocateInstr() 1270 var arr vecArrangement 1271 if srcType.Bits() == 64 { 1272 arr = vecArrangementD 1273 } else { 1274 arr = vecArrangementS 1275 } 1276 mov.asMovToVec(rd, rn, arr, vecIndex(0)) 1277 m.insert(mov) 1278 case !srcInt && dstInt: // Float to Int: 1279 mov := m.allocateInstr() 1280 var arr vecArrangement 1281 if dstType.Bits() == 64 { 1282 arr = vecArrangementD 1283 } else { 1284 
arr = vecArrangementS 1285 } 1286 mov.asMovFromVec(rd, rn, arr, vecIndex(0), false) 1287 m.insert(mov) 1288 default: 1289 panic("TODO?BUG?") 1290 } 1291 } 1292 1293 func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) { 1294 rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone) 1295 rd := operandNR(m.compiler.VRegOf(out)) 1296 1297 neg := m.allocateInstr() 1298 neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64) 1299 m.insert(neg) 1300 } 1301 1302 func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) { 1303 if !nonTrapping { 1304 // First of all, we have to clear the FPU flags. 1305 flagClear := m.allocateInstr() 1306 flagClear.asMovToFPSR(xzrVReg) 1307 m.insert(flagClear) 1308 } 1309 1310 // Then, do the conversion which doesn't trap inherently. 1311 cvt := m.allocateInstr() 1312 cvt.asFpuToInt(rd, rn, signed, src64bit, dst64bit) 1313 m.insert(cvt) 1314 1315 if !nonTrapping { 1316 tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) 1317 1318 // After the conversion, check the FPU flags. 1319 getFlag := m.allocateInstr() 1320 getFlag.asMovFromFPSR(tmpReg) 1321 m.insert(getFlag) 1322 1323 execCtx := m.copyToTmp(ctx) 1324 _rn := operandNR(m.copyToTmp(rn.nr())) 1325 1326 // Check if the conversion was undefined by comparing the status with 1. 1327 // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register 1328 alu := m.allocateInstr() 1329 alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true) 1330 m.insert(alu) 1331 1332 // If it is not undefined, we can return the result. 1333 ok := m.allocateInstr() 1334 m.insert(ok) 1335 1336 // Otherwise, we have to choose the status depending on it is overflow or NaN conversion. 1337 1338 // Comparing itself to check if it is a NaN. 1339 fpuCmp := m.allocateInstr() 1340 fpuCmp.asFpuCmp(_rn, _rn, src64bit) 1341 m.insert(fpuCmp) 1342 // If the VC flag is not set (== VS flag is set), it is a NaN. 1343 m.exitIfNot(execCtx, vc.asCond(), false, wazevoapi.ExitCodeInvalidConversionToInteger) 1344 // Otherwise, it is an overflow. 1345 m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) 1346 1347 // Conditional branch target is after exit. 
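	// (The `ok` branch allocated earlier is only filled in here, once the label placed after the
	// NaN/overflow exit sequences is known; exitIfNot uses the same allocate-early, patch-late
	// pattern.)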
1348 l := m.insertBrTargetLabel() 1349 ok.asCondBr(ne.asCond(), l, false /* ignored */) 1350 } 1351 } 1352 1353 func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) { 1354 cvt := m.allocateInstr() 1355 cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit) 1356 m.insert(cvt) 1357 } 1358 1359 func (m *machine) lowerFpuBinOp(si *ssa.Instruction) { 1360 instr := m.allocateInstr() 1361 var op fpuBinOp 1362 switch si.Opcode() { 1363 case ssa.OpcodeFadd: 1364 op = fpuBinOpAdd 1365 case ssa.OpcodeFsub: 1366 op = fpuBinOpSub 1367 case ssa.OpcodeFmul: 1368 op = fpuBinOpMul 1369 case ssa.OpcodeFdiv: 1370 op = fpuBinOpDiv 1371 case ssa.OpcodeFmax: 1372 op = fpuBinOpMax 1373 case ssa.OpcodeFmin: 1374 op = fpuBinOpMin 1375 } 1376 x, y := si.Arg2() 1377 xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) 1378 rn := m.getOperand_NR(xDef, extModeNone) 1379 rm := m.getOperand_NR(yDef, extModeNone) 1380 rd := operandNR(m.compiler.VRegOf(si.Return())) 1381 instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64) 1382 m.insert(instr) 1383 } 1384 1385 func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) { 1386 x, y := si.Arg2() 1387 if !x.Type().IsInt() { 1388 panic("BUG?") 1389 } 1390 1391 xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) 1392 rn := m.getOperand_NR(xDef, extModeNone) 1393 rm, yNegated := m.getOperand_MaybeNegatedImm12_ER_SR_NR(yDef, extModeNone) 1394 1395 var aop aluOp 1396 switch { 1397 case add && !yNegated: // rn+rm = x+y 1398 aop = aluOpAdd 1399 case add && yNegated: // rn-rm = x-(-y) = x+y 1400 aop = aluOpSub 1401 case !add && !yNegated: // rn-rm = x-y 1402 aop = aluOpSub 1403 case !add && yNegated: // rn+rm = x-(-y) = x-y 1404 aop = aluOpAdd 1405 } 1406 rd := operandNR(m.compiler.VRegOf(si.Return())) 1407 alu := m.allocateInstr() 1408 alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64) 1409 m.insert(alu) 1410 } 1411 1412 // InsertMove implements backend.Machine. 
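// It is typically invoked by the register allocator machinery when it needs an extra copy (e.g. to
// resolve moves between virtual registers), so it only has to pick a move flavor per type: a 64-bit
// GPR move covers both i32 and i64, fpuMov64 covers f32/f64, and fpuMov128 covers v128.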
1413 func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { 1414 instr := m.allocateInstr() 1415 switch typ { 1416 case ssa.TypeI32, ssa.TypeI64: 1417 instr.asMove64(dst, src) 1418 case ssa.TypeF32, ssa.TypeF64: 1419 instr.asFpuMov64(dst, src) 1420 case ssa.TypeV128: 1421 instr.asFpuMov128(dst, src) 1422 default: 1423 panic("TODO") 1424 } 1425 m.insert(instr) 1426 } 1427 1428 func (m *machine) lowerIcmp(si *ssa.Instruction) { 1429 x, y, c := si.IcmpData() 1430 flag := condFlagFromSSAIntegerCmpCond(c) 1431 1432 in64bit := x.Type().Bits() == 64 1433 var ext extMode 1434 if in64bit { 1435 if c.Signed() { 1436 ext = extModeSignExtend64 1437 } else { 1438 ext = extModeZeroExtend64 1439 } 1440 } else { 1441 if c.Signed() { 1442 ext = extModeSignExtend32 1443 } else { 1444 ext = extModeZeroExtend32 1445 } 1446 } 1447 1448 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) 1449 rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext) 1450 alu := m.allocateInstr() 1451 alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit) 1452 m.insert(alu) 1453 1454 cset := m.allocateInstr() 1455 cset.asCSet(m.compiler.VRegOf(si.Return()), false, flag) 1456 m.insert(cset) 1457 } 1458 1459 func (m *machine) lowerVIcmp(si *ssa.Instruction) { 1460 x, y, c, lane := si.VIcmpData() 1461 flag := condFlagFromSSAIntegerCmpCond(c) 1462 arr := ssaLaneToArrangement(lane) 1463 1464 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1465 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1466 rd := operandNR(m.compiler.VRegOf(si.Return())) 1467 1468 switch flag { 1469 case eq: 1470 cmp := m.allocateInstr() 1471 cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) 1472 m.insert(cmp) 1473 case ne: 1474 cmp := m.allocateInstr() 1475 cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) 1476 m.insert(cmp) 1477 not := m.allocateInstr() 1478 not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) 1479 m.insert(not) 1480 case ge: 1481 cmp := m.allocateInstr() 1482 cmp.asVecRRR(vecOpCmge, rd, rn, rm, arr) 1483 m.insert(cmp) 1484 case gt: 1485 cmp := m.allocateInstr() 1486 cmp.asVecRRR(vecOpCmgt, rd, rn, rm, arr) 1487 m.insert(cmp) 1488 case le: 1489 cmp := m.allocateInstr() 1490 cmp.asVecRRR(vecOpCmge, rd, rm, rn, arr) // rm, rn are swapped 1491 m.insert(cmp) 1492 case lt: 1493 cmp := m.allocateInstr() 1494 cmp.asVecRRR(vecOpCmgt, rd, rm, rn, arr) // rm, rn are swapped 1495 m.insert(cmp) 1496 case hs: 1497 cmp := m.allocateInstr() 1498 cmp.asVecRRR(vecOpCmhs, rd, rn, rm, arr) 1499 m.insert(cmp) 1500 case hi: 1501 cmp := m.allocateInstr() 1502 cmp.asVecRRR(vecOpCmhi, rd, rn, rm, arr) 1503 m.insert(cmp) 1504 case ls: 1505 cmp := m.allocateInstr() 1506 cmp.asVecRRR(vecOpCmhs, rd, rm, rn, arr) // rm, rn are swapped 1507 m.insert(cmp) 1508 case lo: 1509 cmp := m.allocateInstr() 1510 cmp.asVecRRR(vecOpCmhi, rd, rm, rn, arr) // rm, rn are swapped 1511 m.insert(cmp) 1512 } 1513 } 1514 1515 func (m *machine) lowerVFcmp(si *ssa.Instruction) { 1516 x, y, c, lane := si.VFcmpData() 1517 flag := condFlagFromSSAFloatCmpCond(c) 1518 arr := ssaLaneToArrangement(lane) 1519 1520 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1521 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1522 rd := operandNR(m.compiler.VRegOf(si.Return())) 1523 1524 switch flag { 1525 case eq: 1526 cmp := m.allocateInstr() 1527 cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) 1528 m.insert(cmp) 1529 case ne: 1530 cmp := m.allocateInstr() 1531 cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) 1532 m.insert(cmp) 1533 not := 
m.allocateInstr() 1534 not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) 1535 m.insert(not) 1536 case ge: 1537 cmp := m.allocateInstr() 1538 cmp.asVecRRR(vecOpFcmge, rd, rn, rm, arr) 1539 m.insert(cmp) 1540 case gt: 1541 cmp := m.allocateInstr() 1542 cmp.asVecRRR(vecOpFcmgt, rd, rn, rm, arr) 1543 m.insert(cmp) 1544 case mi: 1545 cmp := m.allocateInstr() 1546 cmp.asVecRRR(vecOpFcmgt, rd, rm, rn, arr) // rm, rn are swapped 1547 m.insert(cmp) 1548 case ls: 1549 cmp := m.allocateInstr() 1550 cmp.asVecRRR(vecOpFcmge, rd, rm, rn, arr) // rm, rn are swapped 1551 m.insert(cmp) 1552 } 1553 } 1554 1555 func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) { 1556 cvt := m.allocateInstr() 1557 if signed { 1558 cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr) 1559 } else { 1560 cvt.asVecMisc(vecOpFcvtzu, rd, rn, arr) 1561 } 1562 m.insert(cvt) 1563 1564 if arr == vecArrangement2D { 1565 narrow := m.allocateInstr() 1566 if signed { 1567 narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S) 1568 } else { 1569 narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S) 1570 } 1571 m.insert(narrow) 1572 } 1573 } 1574 1575 func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) { 1576 cvt := m.allocateInstr() 1577 if signed { 1578 cvt.asVecMisc(vecOpScvtf, rd, rn, arr) 1579 } else { 1580 cvt.asVecMisc(vecOpUcvtf, rd, rn, arr) 1581 } 1582 m.insert(cvt) 1583 } 1584 1585 func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) { 1586 x, amount := si.Arg2() 1587 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) 1588 rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits()) 1589 rd := operandNR(m.compiler.VRegOf(si.Return())) 1590 1591 alu := m.allocateInstr() 1592 alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64) 1593 m.insert(alu) 1594 } 1595 1596 func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp) { 1597 x, y := si.Arg2() 1598 1599 xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) 1600 rn := m.getOperand_NR(xDef, extModeNone) 1601 rd := operandNR(m.compiler.VRegOf(si.Return())) 1602 1603 _64 := x.Type().Bits() == 64 1604 alu := m.allocateInstr() 1605 if instr := yDef.Instr; instr != nil && instr.Constant() { 1606 c := instr.ConstantVal() 1607 if isBitMaskImmediate(c, _64) { 1608 // Constant bit wise operations can be lowered to a single instruction. 1609 alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64) 1610 m.insert(alu) 1611 return 1612 } 1613 } 1614 1615 rm := m.getOperand_SR_NR(yDef, extModeNone) 1616 alu.asALU(op, rd, rn, rm, _64) 1617 m.insert(alu) 1618 } 1619 1620 func (m *machine) lowerRotl(si *ssa.Instruction) { 1621 x, y := si.Arg2() 1622 r := si.Return() 1623 _64 := r.Type().Bits() == 64 1624 1625 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1626 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1627 var tmp operand 1628 if _64 { 1629 tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 1630 } else { 1631 tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) 1632 } 1633 rd := operandNR(m.compiler.VRegOf(r)) 1634 1635 // Encode rotl as neg + rotr: neg is a sub against the zero-reg. 1636 m.lowerRotlImpl(rd, rn, rm, tmp, _64) 1637 } 1638 1639 func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) { 1640 // Encode rotl as neg + rotr: neg is a sub against the zero-reg. 
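	// AArch64 has no rotate-left instruction, but rotl(x, n) == rotr(x, width-n), and the negation
	// computes exactly that modulo the width (ror only uses the low log2(width) bits of the amount).
	// Illustrative 32-bit output:
	//
	//	neg w9, w1
	//	ror w0, w0, w9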
func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) {
	// Encode rotl as neg + rotr: neg is a sub against the zero-reg.
	neg := m.allocateInstr()
	neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit)
	m.insert(neg)
	alu := m.allocateInstr()
	alu.asALU(aluOpRotR, rd, rn, tmp, is64bit)
	m.insert(alu)
}

func (m *machine) lowerRotr(si *ssa.Instruction) {
	x, y := si.Arg2()

	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
	rn := m.getOperand_NR(xDef, extModeNone)
	rm := m.getOperand_NR(yDef, extModeNone)
	rd := operandNR(m.compiler.VRegOf(si.Return()))

	alu := m.allocateInstr()
	alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64)
	m.insert(alu)
}

func (m *machine) lowerExtend(arg, ret ssa.Value, from, to byte, signed bool) {
	rd := m.compiler.VRegOf(ret)
	def := m.compiler.ValueDefinition(arg)

	if instr := def.Instr; !signed && from == 32 && instr != nil {
		// We can optimize out the unsigned extend because:
		//	"Writes to the W register set bits [63:32] of the X register to zero."
		//	https://developer.arm.com/documentation/den0024/a/An-Introduction-to-the-ARMv8-Instruction-Sets/The-ARMv8-instruction-sets/Distinguishing-between-32-bit-and-64-bit-A64-instructions
		switch instr.Opcode() {
		case
			ssa.OpcodeIadd, ssa.OpcodeIsub, ssa.OpcodeLoad,
			ssa.OpcodeBand, ssa.OpcodeBor, ssa.OpcodeBnot,
			ssa.OpcodeIshl, ssa.OpcodeUshr, ssa.OpcodeSshr,
			ssa.OpcodeRotl, ssa.OpcodeRotr,
			ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32:
			// So, if the argument is the result of a 32-bit operation, we can just copy the register.
			// It is highly likely that this copy will be optimized out after register allocation.
			rn := m.compiler.VRegOf(arg)
			mov := m.allocateInstr()
			// Note: do not use move32 here, as it would be lowered to a 32-bit move,
			// which is not a plain copy (that is actually how UExtend itself is implemented).
			mov.asMove64(rd, rn)
			m.insert(mov)
			return
		default:
		}
	}
	rn := m.getOperand_NR(def, extModeNone)

	ext := m.allocateInstr()
	ext.asExtend(rd, rn.nr(), from, to, signed)
	m.insert(ext)
}

func (m *machine) lowerFcmp(x, y, result ssa.Value, c ssa.FloatCmpCond) {
	rn, rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)

	fc := m.allocateInstr()
	fc.asFpuCmp(rn, rm, x.Type().Bits() == 64)
	m.insert(fc)

	cset := m.allocateInstr()
	cset.asCSet(m.compiler.VRegOf(result), false, condFlagFromSSAFloatCmpCond(c))
	m.insert(cset)
}
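// lowerImul (below) lowers an integer multiplication by emitting MADD with the zero
// register as the addend, which is equivalent to MUL:
//
//	mul rd, rn, rm  ==  madd rd, rn, rm, xzr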
func (m *machine) lowerImul(x, y, result ssa.Value) {
	rd := m.compiler.VRegOf(result)
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)

	// TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg.

	mul := m.allocateInstr()
	mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64)
	m.insert(mul)
}

func (m *machine) lowerClz(x, result ssa.Value) {
	rd := m.compiler.VRegOf(result)
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	clz := m.allocateInstr()
	clz.asBitRR(bitOpClz, rd, rn.nr(), x.Type().Bits() == 64)
	m.insert(clz)
}

func (m *machine) lowerCtz(x, result ssa.Value) {
	rd := m.compiler.VRegOf(result)
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rbit := m.allocateInstr()
	_64 := x.Type().Bits() == 64
	var tmpReg regalloc.VReg
	if _64 {
		tmpReg = m.compiler.AllocateVReg(ssa.TypeI64)
	} else {
		tmpReg = m.compiler.AllocateVReg(ssa.TypeI32)
	}
	rbit.asBitRR(bitOpRbit, tmpReg, rn.nr(), _64)
	m.insert(rbit)

	clz := m.allocateInstr()
	clz.asBitRR(bitOpClz, rd, tmpReg, _64)
	m.insert(clz)
}

func (m *machine) lowerPopcnt(x, result ssa.Value) {
	// arm64 doesn't have an instruction for population count on a scalar register,
	// so we use the vector instruction `cnt`.
	// This is exactly how the official Go toolchain implements bits.OnesCount.
	// For example, "func() int { return bits.OnesCount(10) }" is compiled as:
	//
	//	MOVD   $10, R0 ;; Load 10.
	//	FMOVD  R0, F0
	//	VCNT   V0.B8, V0.B8
	//	UADDLV V0.B8, V0
	//
	// In aarch64 asm, FMOVD is encoded as `ins`, VCNT is `cnt`,
	// and the registers may use different names. In our encoding we use the following
	// instructions:
	//
	//	ins v0.d[0], x0     ;; mov from GPR to vec (FMOV above) is encoded as INS
	//	cnt v0.16b, v0.16b  ;; we use vec arrangement 16b
	//	uaddlv h0, v0.8b    ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b
	//	mov x5, v0.d[0]     ;; finally we mov the result back to a GPR
	//

	rd := operandNR(m.compiler.VRegOf(result))
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)

	rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
	ins := m.allocateInstr()
	ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0))
	m.insert(ins)

	rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
	cnt := m.allocateInstr()
	cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B)
	m.insert(cnt)

	rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
	uaddlv := m.allocateInstr()
	uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B)
	m.insert(uaddlv)

	mov := m.allocateInstr()
	mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0), false)
	m.insert(mov)
}

// lowerExitWithCode lowers an exit with the given code; it takes the execution
// context pointer (execCtxVReg) and the exit code as arguments.
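// The emitted sequence is, conceptually (field names refer to the execution context
// layout in wazevoapi; register names and immediate syntax are illustrative):
//
//	movz x_tmp1, #code
//	str  w_tmp1, [x_ctx, #ExitCodeOffset]           ;; 32-bit store of the exit code
//	mov  x_tmp2, sp
//	str  x_tmp2, [x_ctx, #StackPointerBeforeGoCall] ;; save sp so the stack can be unwound
//	adr  x_tmp3, #0
//	str  x_tmp3, [x_ctx, #GoCallReturnAddress]      ;; save the address of this exit point
//	exit_sequence x_ctx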
func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.ExitCode) {
	tmpReg1 := m.compiler.AllocateVReg(ssa.TypeI32)
	loadExitCodeConst := m.allocateInstr()
	loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true)

	setExitCode := m.allocateInstr()
	setExitCode.asStore(operandNR(tmpReg1),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
		}, 32)

	// In order to unwind the stack, we also need to store the current stack pointer:
	tmp2 := m.compiler.AllocateVReg(ssa.TypeI64)
	movSpToTmp := m.allocateInstr()
	movSpToTmp.asMove64(tmp2, spVReg)
	strSpToExecCtx := m.allocateInstr()
	strSpToExecCtx.asStore(operandNR(tmp2),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
		}, 64)
	// ...as well as the address of this exit point.
	tmp3 := m.compiler.AllocateVReg(ssa.TypeI64)
	currentAddrToTmp := m.allocateInstr()
	currentAddrToTmp.asAdr(tmp3, 0)
	storeCurrentAddrToExecCtx := m.allocateInstr()
	storeCurrentAddrToExecCtx.asStore(operandNR(tmp3),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
		}, 64)

	exitSeq := m.allocateInstr()
	exitSeq.asExitSequence(execCtxVReg)

	m.insert(loadExitCodeConst)
	m.insert(setExitCode)
	m.insert(movSpToTmp)
	m.insert(strSpToExecCtx)
	m.insert(currentAddrToTmp)
	m.insert(storeCurrentAddrToExecCtx)
	m.insert(exitSeq)
}
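// lowerIcmpToFlag (below) materializes an integer comparison into the NZCV flags only
// (via SUBS against the zero register) without producing a value, so that a following
// conditional instruction (b.cond, csel, cset, ...) can consume the flags directly.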
func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) {
	if x.Type() != y.Type() {
		panic(
			fmt.Sprintf("TODO(maybe): support icmp with different types: v%d=%s != v%d=%s",
				x.ID(), x.Type(), y.ID(), y.Type()))
	}

	extMod := extModeOf(x.Type(), signed)

	// First operand must be in pure register form.
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extMod)
	// Second operand can be in any of the Imm12, ER, SR, or NR forms supported by the SUBS instruction.
	rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), extMod)

	alu := m.allocateInstr()
	// subs zr, rn, rm
	alu.asALU(
		aluOpSubS,
		// We don't need the result, just need to set flags.
		operandNR(xzrVReg),
		rn,
		rm,
		x.Type().Bits() == 64,
	)
	m.insert(alu)
}

func (m *machine) lowerFcmpToFlag(x, y ssa.Value) {
	if x.Type() != y.Type() {
		panic("TODO(maybe): support fcmp with different types")
	}

	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
	cmp := m.allocateInstr()
	cmp.asFpuCmp(rn, rm, x.Type().Bits() == 64)
	m.insert(cmp)
}

func (m *machine) lowerExitIfTrueWithCode(execCtxVReg regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) {
	condDef := m.compiler.ValueDefinition(cond)
	if !m.compiler.MatchInstr(condDef, ssa.OpcodeIcmp) {
		panic("TODO: OpcodeExitIfTrueWithCode must come after Icmp at the moment: " + condDef.Instr.Opcode().String())
	}
	condDef.Instr.MarkLowered()

	cvalInstr := condDef.Instr
	x, y, c := cvalInstr.IcmpData()
	signed := c.Signed()
	m.lowerIcmpToFlag(x, y, signed)

	execCtxTmp := m.copyToTmp(execCtxVReg)

	// We have to skip the entire exit sequence if the condition is false.
	cbr := m.allocateInstr()
	m.insert(cbr)
	m.lowerExitWithCode(execCtxTmp, code)
	// The conditional branch targets the label right after the exit sequence.
	l := m.insertBrTargetLabel()
	cbr.asCondBr(condFlagFromSSAIntegerCmpCond(c).invert().asCond(), l, false /* ignored */)
}

func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
	cvalDef := m.compiler.ValueDefinition(c)

	var cc condFlag
	switch {
	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // In this case, we can use the ALU flags set by the SUBS instruction.
		cvalInstr := cvalDef.Instr
		x, y, c := cvalInstr.IcmpData()
		cc = condFlagFromSSAIntegerCmpCond(c)
		m.lowerIcmpToFlag(x, y, c.Signed())
		cvalDef.Instr.MarkLowered()
	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // In this case, we can use the FPU flags directly.
		cvalInstr := cvalDef.Instr
		x, y, c := cvalInstr.FcmpData()
		cc = condFlagFromSSAFloatCmpCond(c)
		m.lowerFcmpToFlag(x, y)
		cvalDef.Instr.MarkLowered()
	default:
		rn := m.getOperand_NR(cvalDef, extModeNone)
		if c.Type() != ssa.TypeI32 && c.Type() != ssa.TypeI64 {
			panic("TODO?BUG?: support select with non-integer condition")
		}
		alu := m.allocateInstr()
		// subs zr, rn, zr
		alu.asALU(
			aluOpSubS,
			// We don't need the result, just need to set flags.
			operandNR(xzrVReg),
			rn,
			operandNR(xzrVReg),
			c.Type().Bits() == 64,
		)
		m.insert(alu)
		cc = ne
	}

	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)

	rd := operandNR(m.compiler.VRegOf(result))
	switch x.Type() {
	case ssa.TypeI32, ssa.TypeI64:
		// csel rd, rn, rm, cc
		csel := m.allocateInstr()
		csel.asCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
		m.insert(csel)
	case ssa.TypeF32, ssa.TypeF64:
		// fcsel rd, rn, rm, cc
		fcsel := m.allocateInstr()
		fcsel.asFpuCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
		m.insert(fcsel)
	default:
		panic("BUG")
	}
}
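// lowerSelectVec (below) lowers a vector select: rd = rc != 0 ? rn : rm. The emitted
// sequence is, conceptually (register names illustrative):
//
//	subs  wzr, w_rc, wzr                  ;; set flags from rc
//	csetm x_mask, ne                      ;; x_mask = all ones if rc != 0, else zero
//	dup   v_mask.2d, x_mask               ;; broadcast the mask into a vector register
//	bsl   v_mask.16b, v_rn.16b, v_rm.16b  ;; bitwise select between rn and rm by the mask
//	mov   v_rd.16b, v_mask.16b            ;; move the result to the destination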
func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
	// First check whether `rc` is zero or not.
	checkZero := m.allocateInstr()
	checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false)
	m.insert(checkZero)

	// Then use CSETM to set all bits of the temporary to one if `rc` is not zero, and to zero otherwise.
	allOnesOrZero := m.compiler.AllocateVReg(ssa.TypeI64)
	cset := m.allocateInstr()
	cset.asCSet(allOnesOrZero, true, ne)
	m.insert(cset)

	// Then duplicate the mask into a temporary vector register.
	tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
	dup := m.allocateInstr()
	dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D)
	m.insert(dup)

	// Now that `tmp2` has either all bits set or none depending on `rc`,
	// we can use bsl to select between `rn` and `rm`.
	ins := m.allocateInstr()
	ins.asVecRRRRewrite(vecOpBsl, tmp2, rn, rm, vecArrangement16B)
	m.insert(ins)

	// Finally, move the result to the destination register.
	mov2 := m.allocateInstr()
	mov2.asFpuMov128(rd.nr(), tmp2.nr())
	m.insert(mov2)
}

// copyToTmp copies the given regalloc.VReg to a freshly allocated temporary register.
// This is called before a cbr to avoid register allocation issues, e.g. a reload being
// inserted in the middle of the exit sequence, which is not on the normal execution path.
func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg {
	typ := m.compiler.TypeOf(v)
	mov := m.allocateInstr()
	tmp := m.compiler.AllocateVReg(typ)
	if typ.IsInt() {
		mov.asMove64(tmp, v)
	} else {
		mov.asFpuMov128(tmp, v)
	}
	m.insert(mov)
	return tmp
}