github.com/wasilibs/wazerox@v0.0.0-20240124024944-4923be63ab5f/internal/engine/wazevo/backend/isa/arm64/lower_instr.go

package arm64

// Files prefixed with lower_instr* implement instruction selection, i.e. lowering SSA-level
// instructions into machine-specific instructions.
//
// Importantly, the lower* functions also perform tree matching: they find patterns in the given
// instruction tree and merge multiple instructions where possible. This can be considered "N:1"
// instruction selection.

import (
	"fmt"
	"math"

	"github.com/wasilibs/wazerox/internal/engine/wazevo/backend/regalloc"
	"github.com/wasilibs/wazerox/internal/engine/wazevo/ssa"
	"github.com/wasilibs/wazerox/internal/engine/wazevo/wazevoapi"
)

// LowerSingleBranch implements backend.Machine.
func (m *machine) LowerSingleBranch(br *ssa.Instruction) {
	switch br.Opcode() {
	case ssa.OpcodeJump:
		_, _, targetBlk := br.BranchData()
		if br.IsFallthroughJump() {
			return
		}
		b := m.allocateInstr()
		target := m.getOrAllocateSSABlockLabel(targetBlk)
		if target == returnLabel {
			b.asRet(m.currentABI)
		} else {
			b.asBr(target)
		}
		m.insert(b)
	case ssa.OpcodeBrTable:
		m.lowerBrTable(br)
	default:
		panic("BUG: unexpected branch opcode: " + br.Opcode().String())
	}
}

func (m *machine) lowerBrTable(i *ssa.Instruction) {
	index, targets := i.BrTableData()
	indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone)

	// First, we have to bounds-check the index, and clamp it to the default target
	// (sitting at the end of the list) if it is out of bounds:
	//
	//	mov  maxIndexReg #maximum_index
	//	subs wzr, index, maxIndexReg
	//	csel adjustedIndex, maxIndexReg, index, hs ;; if index is higher than or equal to maxIndexReg.
	maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32)
	m.lowerConstantI32(maxIndexReg, int32(len(targets)-1))
	subs := m.allocateInstr()
	subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false)
	m.insert(subs)
	csel := m.allocateInstr()
	adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32)
	csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false)
	m.insert(csel)

	brSequence := m.allocateInstr()

	// TODO: reuse the slice!
	labels := make([]uint32, len(targets))
	for j, target := range targets {
		labels[j] = uint32(m.getOrAllocateSSABlockLabel(target))
	}

	brSequence.asBrTableSequence(adjustedIndex, labels)
	m.insert(brSequence)
}

// LowerConditionalBranch implements backend.Machine.
func (m *machine) LowerConditionalBranch(b *ssa.Instruction) {
	cval, args, targetBlk := b.BranchData()
	if len(args) > 0 {
		panic(fmt.Sprintf(
			"conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s",
			m.currentSSABlk,
			targetBlk,
		))
	}

	target := m.getOrAllocateSSABlockLabel(targetBlk)
	cvalDef := m.compiler.ValueDefinition(cval)

	switch {
	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // In this case, we can use the ALU flags set by the SUBS instruction.
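		// Illustrative sketch only (register names are placeholders): fusing the icmp
		// into the branch means that an `icmp slt x, y` feeding a brnz becomes a
		// compare plus conditional branch, instead of materializing the boolean first:
		//
		//	subs xzr, x?, x?   ;; emitted by lowerIcmpToFlag below
		//	b.lt L(target)     ;; condition taken from the icmp, inverted for brz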
89 cvalInstr := cvalDef.Instr 90 x, y, c := cvalInstr.IcmpData() 91 cc, signed := condFlagFromSSAIntegerCmpCond(c), c.Signed() 92 if b.Opcode() == ssa.OpcodeBrz { 93 cc = cc.invert() 94 } 95 96 m.lowerIcmpToFlag(x, y, signed) 97 cbr := m.allocateInstr() 98 cbr.asCondBr(cc.asCond(), target, false /* ignored */) 99 m.insert(cbr) 100 cvalDef.Instr.MarkLowered() 101 case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly. 102 cvalInstr := cvalDef.Instr 103 x, y, c := cvalInstr.FcmpData() 104 cc := condFlagFromSSAFloatCmpCond(c) 105 if b.Opcode() == ssa.OpcodeBrz { 106 cc = cc.invert() 107 } 108 m.lowerFcmpToFlag(x, y) 109 cbr := m.allocateInstr() 110 cbr.asCondBr(cc.asCond(), target, false /* ignored */) 111 m.insert(cbr) 112 cvalDef.Instr.MarkLowered() 113 default: 114 rn := m.getOperand_NR(cvalDef, extModeNone) 115 var c cond 116 if b.Opcode() == ssa.OpcodeBrz { 117 c = registerAsRegZeroCond(rn.nr()) 118 } else { 119 c = registerAsRegNotZeroCond(rn.nr()) 120 } 121 cbr := m.allocateInstr() 122 cbr.asCondBr(c, target, false) 123 m.insert(cbr) 124 } 125 } 126 127 // LowerInstr implements backend.Machine. 128 func (m *machine) LowerInstr(instr *ssa.Instruction) { 129 if l := instr.SourceOffset(); l.Valid() { 130 info := m.allocateInstr().asEmitSourceOffsetInfo(l) 131 m.insert(info) 132 } 133 134 switch op := instr.Opcode(); op { 135 case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: 136 panic("BUG: branching instructions are handled by LowerBranches") 137 case ssa.OpcodeReturn: 138 panic("BUG: return must be handled by backend.Compiler") 139 case ssa.OpcodeIadd, ssa.OpcodeIsub: 140 m.lowerSubOrAdd(instr, op == ssa.OpcodeIadd) 141 case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv, ssa.OpcodeFmax, ssa.OpcodeFmin: 142 m.lowerFpuBinOp(instr) 143 case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. 
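		// Nothing is emitted here on purpose: constant values are not lowered at their
		// definition but are materialized at their use sites by the getOperand_* helpers
		// (via lowerConstantI32/I64 and related helpers), which lets a small integer
		// constant fold straight into the user's operand. A rough sketch with
		// placeholder registers, not the literal output:
		//
		//	(iadd v1, (iconst 16))  =>  add x?, x?, #0x10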
144 case ssa.OpcodeExitWithCode: 145 execCtx, code := instr.ExitWithCodeData() 146 m.lowerExitWithCode(m.compiler.VRegOf(execCtx), code) 147 case ssa.OpcodeExitIfTrueWithCode: 148 execCtx, c, code := instr.ExitIfTrueWithCodeData() 149 m.lowerExitIfTrueWithCode(m.compiler.VRegOf(execCtx), c, code) 150 case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: 151 m.lowerStore(instr) 152 case ssa.OpcodeLoad: 153 dst := instr.Return() 154 ptr, offset, typ := instr.LoadData() 155 m.lowerLoad(ptr, offset, typ, dst) 156 case ssa.OpcodeVZeroExtLoad: 157 dst := instr.Return() 158 ptr, offset, typ := instr.VZeroExtLoadData() 159 m.lowerLoad(ptr, offset, typ, dst) 160 case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: 161 ptr, offset, _ := instr.LoadData() 162 ret := m.compiler.VRegOf(instr.Return()) 163 m.lowerExtLoad(op, ptr, offset, ret) 164 case ssa.OpcodeCall, ssa.OpcodeCallIndirect: 165 m.lowerCall(instr) 166 case ssa.OpcodeIcmp: 167 m.lowerIcmp(instr) 168 case ssa.OpcodeVIcmp: 169 m.lowerVIcmp(instr) 170 case ssa.OpcodeVFcmp: 171 m.lowerVFcmp(instr) 172 case ssa.OpcodeVCeil: 173 m.lowerVecMisc(vecOpFrintp, instr) 174 case ssa.OpcodeVFloor: 175 m.lowerVecMisc(vecOpFrintm, instr) 176 case ssa.OpcodeVTrunc: 177 m.lowerVecMisc(vecOpFrintz, instr) 178 case ssa.OpcodeVNearest: 179 m.lowerVecMisc(vecOpFrintn, instr) 180 case ssa.OpcodeVMaxPseudo: 181 m.lowerVMinMaxPseudo(instr, true) 182 case ssa.OpcodeVMinPseudo: 183 m.lowerVMinMaxPseudo(instr, false) 184 case ssa.OpcodeBand: 185 m.lowerBitwiseAluOp(instr, aluOpAnd) 186 case ssa.OpcodeBor: 187 m.lowerBitwiseAluOp(instr, aluOpOrr) 188 case ssa.OpcodeBxor: 189 m.lowerBitwiseAluOp(instr, aluOpEor) 190 case ssa.OpcodeIshl: 191 m.lowerShifts(instr, extModeNone, aluOpLsl) 192 case ssa.OpcodeSshr: 193 if instr.Return().Type().Bits() == 64 { 194 m.lowerShifts(instr, extModeSignExtend64, aluOpAsr) 195 } else { 196 m.lowerShifts(instr, extModeSignExtend32, aluOpAsr) 197 } 198 case ssa.OpcodeUshr: 199 if instr.Return().Type().Bits() == 64 { 200 m.lowerShifts(instr, extModeZeroExtend64, aluOpLsr) 201 } else { 202 m.lowerShifts(instr, extModeZeroExtend32, aluOpLsr) 203 } 204 case ssa.OpcodeRotl: 205 m.lowerRotl(instr) 206 case ssa.OpcodeRotr: 207 m.lowerRotr(instr) 208 case ssa.OpcodeSExtend, ssa.OpcodeUExtend: 209 from, to, signed := instr.ExtendData() 210 m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) 211 case ssa.OpcodeFcmp: 212 x, y, c := instr.FcmpData() 213 m.lowerFcmp(x, y, instr.Return(), c) 214 case ssa.OpcodeImul: 215 x, y := instr.Arg2() 216 result := instr.Return() 217 m.lowerImul(x, y, result) 218 case ssa.OpcodeUndefined: 219 undef := m.allocateInstr() 220 undef.asUDF() 221 m.insert(undef) 222 case ssa.OpcodeSelect: 223 c, x, y := instr.SelectData() 224 if x.Type() == ssa.TypeV128 { 225 rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) 226 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 227 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 228 rd := operandNR(m.compiler.VRegOf(instr.Return())) 229 m.lowerSelectVec(rc, rn, rm, rd) 230 } else { 231 m.lowerSelect(c, x, y, instr.Return()) 232 } 233 case ssa.OpcodeClz: 234 x := instr.Arg() 235 result := instr.Return() 236 m.lowerClz(x, result) 237 case ssa.OpcodeCtz: 238 x := instr.Arg() 239 result := instr.Return() 240 m.lowerCtz(x, result) 241 case ssa.OpcodePopcnt: 242 x := instr.Arg() 243 result := instr.Return() 244 m.lowerPopcnt(x, result) 245 
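	// The scalar float-to-int conversions below come in a trapping and a saturating
	// ("...Sat") flavor. Both are lowered around fcvtz{s,u}; the trapping flavor
	// additionally clears FPSR before the conversion and inspects it afterwards so
	// that an invalid conversion exits with the proper code (see lowerFpuToInt).
	// Rough shape of the trapping path (a sketch, not the literal output):
	//
	//	msr  fpsr, xzr   ;; clear the cumulative exception flags
	//	fcvtzs w?, s?    ;; the conversion itself never traps
	//	mrs  x?, fpsr    ;; IOC (bit 0) set => NaN or out-of-range => exit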
case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: 246 x, ctx := instr.Arg2() 247 result := instr.Return() 248 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 249 rd := operandNR(m.compiler.VRegOf(result)) 250 ctxVReg := m.compiler.VRegOf(ctx) 251 m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64, 252 result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) 253 case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: 254 x, ctx := instr.Arg2() 255 result := instr.Return() 256 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 257 rd := operandNR(m.compiler.VRegOf(result)) 258 ctxVReg := m.compiler.VRegOf(ctx) 259 m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64, 260 result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) 261 case ssa.OpcodeFcvtFromSint: 262 x := instr.Arg() 263 result := instr.Return() 264 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 265 rd := operandNR(m.compiler.VRegOf(result)) 266 m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) 267 case ssa.OpcodeFcvtFromUint: 268 x := instr.Arg() 269 result := instr.Return() 270 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 271 rd := operandNR(m.compiler.VRegOf(result)) 272 m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) 273 case ssa.OpcodeFdemote: 274 v := instr.Arg() 275 rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) 276 rd := operandNR(m.compiler.VRegOf(instr.Return())) 277 cnt := m.allocateInstr() 278 cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false) 279 m.insert(cnt) 280 case ssa.OpcodeFpromote: 281 v := instr.Arg() 282 rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) 283 rd := operandNR(m.compiler.VRegOf(instr.Return())) 284 cnt := m.allocateInstr() 285 cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true) 286 m.insert(cnt) 287 case ssa.OpcodeIreduce: 288 rn := m.getOperand_NR(m.compiler.ValueDefinition(instr.Arg()), extModeNone) 289 retVal := instr.Return() 290 rd := m.compiler.VRegOf(retVal) 291 292 if retVal.Type() != ssa.TypeI32 { 293 panic("TODO?: Ireduce to non-i32") 294 } 295 mov := m.allocateInstr() 296 mov.asMove32(rd, rn.reg()) 297 m.insert(mov) 298 case ssa.OpcodeFneg: 299 m.lowerFpuUniOp(fpuUniOpNeg, instr.Arg(), instr.Return()) 300 case ssa.OpcodeSqrt: 301 m.lowerFpuUniOp(fpuUniOpSqrt, instr.Arg(), instr.Return()) 302 case ssa.OpcodeCeil: 303 m.lowerFpuUniOp(fpuUniOpRoundPlus, instr.Arg(), instr.Return()) 304 case ssa.OpcodeFloor: 305 m.lowerFpuUniOp(fpuUniOpRoundMinus, instr.Arg(), instr.Return()) 306 case ssa.OpcodeTrunc: 307 m.lowerFpuUniOp(fpuUniOpRoundZero, instr.Arg(), instr.Return()) 308 case ssa.OpcodeNearest: 309 m.lowerFpuUniOp(fpuUniOpRoundNearest, instr.Arg(), instr.Return()) 310 case ssa.OpcodeFabs: 311 m.lowerFpuUniOp(fpuUniOpAbs, instr.Arg(), instr.Return()) 312 case ssa.OpcodeBitcast: 313 m.lowerBitcast(instr) 314 case ssa.OpcodeFcopysign: 315 x, y := instr.Arg2() 316 m.lowerFcopysign(x, y, instr.Return()) 317 case ssa.OpcodeSdiv, ssa.OpcodeUdiv: 318 x, y, ctx := instr.Arg3() 319 ctxVReg := m.compiler.VRegOf(ctx) 320 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 321 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 322 rd := operandNR(m.compiler.VRegOf(instr.Return())) 323 m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv) 324 case ssa.OpcodeSrem, ssa.OpcodeUrem: 325 x, y, ctx := instr.Arg3() 326 ctxVReg := m.compiler.VRegOf(ctx) 327 rn := 
m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 328 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 329 rd := operandNR(m.compiler.VRegOf(instr.Return())) 330 m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem) 331 case ssa.OpcodeVconst: 332 result := m.compiler.VRegOf(instr.Return()) 333 lo, hi := instr.VconstData() 334 v := m.allocateInstr() 335 v.asLoadFpuConst128(result, lo, hi) 336 m.insert(v) 337 case ssa.OpcodeVbnot: 338 x := instr.Arg() 339 ins := m.allocateInstr() 340 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 341 rd := operandNR(m.compiler.VRegOf(instr.Return())) 342 ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B) 343 m.insert(ins) 344 case ssa.OpcodeVbxor: 345 x, y := instr.Arg2() 346 m.lowerVecRRR(vecOpEOR, x, y, instr.Return(), vecArrangement16B) 347 case ssa.OpcodeVbor: 348 x, y := instr.Arg2() 349 m.lowerVecRRR(vecOpOrr, x, y, instr.Return(), vecArrangement16B) 350 case ssa.OpcodeVband: 351 x, y := instr.Arg2() 352 m.lowerVecRRR(vecOpAnd, x, y, instr.Return(), vecArrangement16B) 353 case ssa.OpcodeVbandnot: 354 x, y := instr.Arg2() 355 m.lowerVecRRR(vecOpBic, x, y, instr.Return(), vecArrangement16B) 356 case ssa.OpcodeVbitselect: 357 c, x, y := instr.SelectData() 358 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 359 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 360 creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) 361 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 362 363 // creg is overwritten by BSL, so we need to move it to the result register before the instruction 364 // in case when it is used somewhere else. 365 mov := m.allocateInstr() 366 mov.asFpuMov128(tmp.nr(), creg.nr()) 367 m.insert(mov) 368 369 ins := m.allocateInstr() 370 ins.asVecRRRRewrite(vecOpBsl, tmp, rn, rm, vecArrangement16B) 371 m.insert(ins) 372 373 mov2 := m.allocateInstr() 374 rd := m.compiler.VRegOf(instr.Return()) 375 mov2.asFpuMov128(rd, tmp.nr()) 376 m.insert(mov2) 377 case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue: 378 x, lane := instr.ArgWithLane() 379 var arr vecArrangement 380 if op == ssa.OpcodeVallTrue { 381 arr = ssaLaneToArrangement(lane) 382 } 383 rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 384 rd := operandNR(m.compiler.VRegOf(instr.Return())) 385 m.lowerVcheckTrue(op, rm, rd, arr) 386 case ssa.OpcodeVhighBits: 387 x, lane := instr.ArgWithLane() 388 rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 389 rd := operandNR(m.compiler.VRegOf(instr.Return())) 390 arr := ssaLaneToArrangement(lane) 391 m.lowerVhighBits(rm, rd, arr) 392 case ssa.OpcodeVIadd: 393 x, y, lane := instr.Arg2WithLane() 394 arr := ssaLaneToArrangement(lane) 395 m.lowerVecRRR(vecOpAdd, x, y, instr.Return(), arr) 396 case ssa.OpcodeIaddPairwise: 397 x, y, lane := instr.Arg2WithLane() 398 arr := ssaLaneToArrangement(lane) 399 m.lowerVecRRR(vecOpAddp, x, y, instr.Return(), arr) 400 case ssa.OpcodeVSaddSat: 401 x, y, lane := instr.Arg2WithLane() 402 arr := ssaLaneToArrangement(lane) 403 m.lowerVecRRR(vecOpSqadd, x, y, instr.Return(), arr) 404 case ssa.OpcodeVUaddSat: 405 x, y, lane := instr.Arg2WithLane() 406 arr := ssaLaneToArrangement(lane) 407 m.lowerVecRRR(vecOpUqadd, x, y, instr.Return(), arr) 408 case ssa.OpcodeVIsub: 409 x, y, lane := instr.Arg2WithLane() 410 arr := ssaLaneToArrangement(lane) 411 m.lowerVecRRR(vecOpSub, x, y, instr.Return(), arr) 412 case ssa.OpcodeVSsubSat: 413 x, y, lane := instr.Arg2WithLane() 414 arr := 
ssaLaneToArrangement(lane) 415 m.lowerVecRRR(vecOpSqsub, x, y, instr.Return(), arr) 416 case ssa.OpcodeVUsubSat: 417 x, y, lane := instr.Arg2WithLane() 418 arr := ssaLaneToArrangement(lane) 419 m.lowerVecRRR(vecOpUqsub, x, y, instr.Return(), arr) 420 case ssa.OpcodeVImin: 421 x, y, lane := instr.Arg2WithLane() 422 arr := ssaLaneToArrangement(lane) 423 m.lowerVecRRR(vecOpSmin, x, y, instr.Return(), arr) 424 case ssa.OpcodeVUmin: 425 x, y, lane := instr.Arg2WithLane() 426 arr := ssaLaneToArrangement(lane) 427 m.lowerVecRRR(vecOpUmin, x, y, instr.Return(), arr) 428 case ssa.OpcodeVImax: 429 x, y, lane := instr.Arg2WithLane() 430 arr := ssaLaneToArrangement(lane) 431 m.lowerVecRRR(vecOpSmax, x, y, instr.Return(), arr) 432 case ssa.OpcodeVUmax: 433 x, y, lane := instr.Arg2WithLane() 434 arr := ssaLaneToArrangement(lane) 435 m.lowerVecRRR(vecOpUmax, x, y, instr.Return(), arr) 436 case ssa.OpcodeVAvgRound: 437 x, y, lane := instr.Arg2WithLane() 438 arr := ssaLaneToArrangement(lane) 439 m.lowerVecRRR(vecOpUrhadd, x, y, instr.Return(), arr) 440 case ssa.OpcodeVImul: 441 x, y, lane := instr.Arg2WithLane() 442 arr := ssaLaneToArrangement(lane) 443 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 444 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 445 rd := operandNR(m.compiler.VRegOf(instr.Return())) 446 m.lowerVIMul(rd, rn, rm, arr) 447 case ssa.OpcodeVIabs: 448 m.lowerVecMisc(vecOpAbs, instr) 449 case ssa.OpcodeVIneg: 450 m.lowerVecMisc(vecOpNeg, instr) 451 case ssa.OpcodeVIpopcnt: 452 m.lowerVecMisc(vecOpCnt, instr) 453 case ssa.OpcodeVIshl, 454 ssa.OpcodeVSshr, ssa.OpcodeVUshr: 455 x, y, lane := instr.Arg2WithLane() 456 arr := ssaLaneToArrangement(lane) 457 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 458 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 459 rd := operandNR(m.compiler.VRegOf(instr.Return())) 460 m.lowerVShift(op, rd, rn, rm, arr) 461 case ssa.OpcodeVSqrt: 462 m.lowerVecMisc(vecOpFsqrt, instr) 463 case ssa.OpcodeVFabs: 464 m.lowerVecMisc(vecOpFabs, instr) 465 case ssa.OpcodeVFneg: 466 m.lowerVecMisc(vecOpFneg, instr) 467 case ssa.OpcodeVFmin: 468 x, y, lane := instr.Arg2WithLane() 469 arr := ssaLaneToArrangement(lane) 470 m.lowerVecRRR(vecOpFmin, x, y, instr.Return(), arr) 471 case ssa.OpcodeVFmax: 472 x, y, lane := instr.Arg2WithLane() 473 arr := ssaLaneToArrangement(lane) 474 m.lowerVecRRR(vecOpFmax, x, y, instr.Return(), arr) 475 case ssa.OpcodeVFadd: 476 x, y, lane := instr.Arg2WithLane() 477 arr := ssaLaneToArrangement(lane) 478 m.lowerVecRRR(vecOpFadd, x, y, instr.Return(), arr) 479 case ssa.OpcodeVFsub: 480 x, y, lane := instr.Arg2WithLane() 481 arr := ssaLaneToArrangement(lane) 482 m.lowerVecRRR(vecOpFsub, x, y, instr.Return(), arr) 483 case ssa.OpcodeVFmul: 484 x, y, lane := instr.Arg2WithLane() 485 arr := ssaLaneToArrangement(lane) 486 m.lowerVecRRR(vecOpFmul, x, y, instr.Return(), arr) 487 case ssa.OpcodeSqmulRoundSat: 488 x, y, lane := instr.Arg2WithLane() 489 arr := ssaLaneToArrangement(lane) 490 m.lowerVecRRR(vecOpSqrdmulh, x, y, instr.Return(), arr) 491 case ssa.OpcodeVFdiv: 492 x, y, lane := instr.Arg2WithLane() 493 arr := ssaLaneToArrangement(lane) 494 m.lowerVecRRR(vecOpFdiv, x, y, instr.Return(), arr) 495 case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: 496 x, lane := instr.ArgWithLane() 497 arr := ssaLaneToArrangement(lane) 498 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 499 rd := operandNR(m.compiler.VRegOf(instr.Return())) 500 m.lowerVfpuToInt(rd, rn, arr, op 
== ssa.OpcodeVFcvtToSintSat) 501 case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint: 502 x, lane := instr.ArgWithLane() 503 arr := ssaLaneToArrangement(lane) 504 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 505 rd := operandNR(m.compiler.VRegOf(instr.Return())) 506 m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint) 507 case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow: 508 x, lane := instr.ArgWithLane() 509 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 510 rd := operandNR(m.compiler.VRegOf(instr.Return())) 511 512 var arr vecArrangement 513 switch lane { 514 case ssa.VecLaneI8x16: 515 arr = vecArrangement8B 516 case ssa.VecLaneI16x8: 517 arr = vecArrangement4H 518 case ssa.VecLaneI32x4: 519 arr = vecArrangement2S 520 } 521 522 shll := m.allocateInstr() 523 if signed := op == ssa.OpcodeSwidenLow; signed { 524 shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) 525 } else { 526 shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) 527 } 528 m.insert(shll) 529 case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh: 530 x, lane := instr.ArgWithLane() 531 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 532 rd := operandNR(m.compiler.VRegOf(instr.Return())) 533 534 arr := ssaLaneToArrangement(lane) 535 536 shll := m.allocateInstr() 537 if signed := op == ssa.OpcodeSwidenHigh; signed { 538 shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) 539 } else { 540 shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) 541 } 542 m.insert(shll) 543 544 case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: 545 x, y, lane := instr.Arg2WithLane() 546 var arr, arr2 vecArrangement 547 switch lane { 548 case ssa.VecLaneI16x8: // I16x8 549 arr = vecArrangement8B 550 arr2 = vecArrangement16B // Implies sqxtn2. 551 case ssa.VecLaneI32x4: 552 arr = vecArrangement4H 553 arr2 = vecArrangement8H // Implies sqxtn2. 554 default: 555 panic("unsupported lane " + lane.String()) 556 } 557 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 558 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 559 rd := operandNR(m.compiler.VRegOf(instr.Return())) 560 561 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 562 563 loQxtn := m.allocateInstr() 564 hiQxtn := m.allocateInstr() 565 if signed := op == ssa.OpcodeSnarrow; signed { 566 // Narrow lanes on rn and write them into lower-half of rd. 567 loQxtn.asVecMisc(vecOpSqxtn, tmp, rn, arr) // low 568 // Narrow lanes on rm and write them into higher-half of rd. 569 hiQxtn.asVecMisc(vecOpSqxtn, tmp, rm, arr2) // high (sqxtn2) 570 } else { 571 // Narrow lanes on rn and write them into lower-half of rd. 572 loQxtn.asVecMisc(vecOpSqxtun, tmp, rn, arr) // low 573 // Narrow lanes on rm and write them into higher-half of rd. 
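			// Passing the wider arrangement arr2 is what selects the "2" variant
			// (sqxtun2 here, sqxtn2 in the signed branch above), which writes the
			// upper half of tmp and leaves the lower half just written intact.
			// Sketch for the i16x8 lane type:
			//
			//	sqxtun  v_tmp.8b,  v_rn.8h   ;; low half
			//	sqxtun2 v_tmp.16b, v_rm.8h   ;; high half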
574 hiQxtn.asVecMisc(vecOpSqxtun, tmp, rm, arr2) // high (sqxtn2) 575 } 576 m.insert(loQxtn) 577 m.insert(hiQxtn) 578 579 mov := m.allocateInstr() 580 mov.asFpuMov128(rd.nr(), tmp.nr()) 581 m.insert(mov) 582 case ssa.OpcodeFvpromoteLow: 583 x, lane := instr.ArgWithLane() 584 if lane != ssa.VecLaneF32x4 { 585 panic("unsupported lane type " + lane.String()) 586 } 587 ins := m.allocateInstr() 588 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 589 rd := operandNR(m.compiler.VRegOf(instr.Return())) 590 ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S) 591 m.insert(ins) 592 case ssa.OpcodeFvdemote: 593 x, lane := instr.ArgWithLane() 594 if lane != ssa.VecLaneF64x2 { 595 panic("unsupported lane type " + lane.String()) 596 } 597 ins := m.allocateInstr() 598 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 599 rd := operandNR(m.compiler.VRegOf(instr.Return())) 600 ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S) 601 m.insert(ins) 602 case ssa.OpcodeExtractlane: 603 x, index, signed, lane := instr.ExtractlaneData() 604 605 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 606 rd := operandNR(m.compiler.VRegOf(instr.Return())) 607 608 mov := m.allocateInstr() 609 switch lane { 610 case ssa.VecLaneI8x16: 611 mov.asMovFromVec(rd, rn, vecArrangementB, vecIndex(index), signed) 612 case ssa.VecLaneI16x8: 613 mov.asMovFromVec(rd, rn, vecArrangementH, vecIndex(index), signed) 614 case ssa.VecLaneI32x4: 615 mov.asMovFromVec(rd, rn, vecArrangementS, vecIndex(index), signed) 616 case ssa.VecLaneI64x2: 617 mov.asMovFromVec(rd, rn, vecArrangementD, vecIndex(index), signed) 618 case ssa.VecLaneF32x4: 619 mov.asVecMovElement(rd, rn, vecArrangementS, vecIndex(0), vecIndex(index)) 620 case ssa.VecLaneF64x2: 621 mov.asVecMovElement(rd, rn, vecArrangementD, vecIndex(0), vecIndex(index)) 622 default: 623 panic("unsupported lane: " + lane.String()) 624 } 625 626 m.insert(mov) 627 628 case ssa.OpcodeInsertlane: 629 x, y, index, lane := instr.InsertlaneData() 630 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 631 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 632 rd := operandNR(m.compiler.VRegOf(instr.Return())) 633 tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 634 635 // Initially mov rn to tmp. 636 mov1 := m.allocateInstr() 637 mov1.asFpuMov128(tmpReg.nr(), rn.nr()) 638 m.insert(mov1) 639 640 // movToVec and vecMovElement do not clear the remaining bits to zero, 641 // thus, we can mov rm in-place to tmp. 642 mov2 := m.allocateInstr() 643 switch lane { 644 case ssa.VecLaneI8x16: 645 mov2.asMovToVec(tmpReg, rm, vecArrangementB, vecIndex(index)) 646 case ssa.VecLaneI16x8: 647 mov2.asMovToVec(tmpReg, rm, vecArrangementH, vecIndex(index)) 648 case ssa.VecLaneI32x4: 649 mov2.asMovToVec(tmpReg, rm, vecArrangementS, vecIndex(index)) 650 case ssa.VecLaneI64x2: 651 mov2.asMovToVec(tmpReg, rm, vecArrangementD, vecIndex(index)) 652 case ssa.VecLaneF32x4: 653 mov2.asVecMovElement(tmpReg, rm, vecArrangementS, vecIndex(index), vecIndex(0)) 654 case ssa.VecLaneF64x2: 655 mov2.asVecMovElement(tmpReg, rm, vecArrangementD, vecIndex(index), vecIndex(0)) 656 } 657 m.insert(mov2) 658 659 // Finally mov tmp to rd. 
660 mov3 := m.allocateInstr() 661 mov3.asFpuMov128(rd.nr(), tmpReg.nr()) 662 m.insert(mov3) 663 664 case ssa.OpcodeSwizzle: 665 x, y, lane := instr.Arg2WithLane() 666 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 667 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 668 rd := operandNR(m.compiler.VRegOf(instr.Return())) 669 670 arr := ssaLaneToArrangement(lane) 671 672 // tbl <rd>.<arr>, { <rn>.<arr> }, <rm>.<arr> 673 tbl1 := m.allocateInstr() 674 tbl1.asVecTbl(1, rd, rn, rm, arr) 675 m.insert(tbl1) 676 677 case ssa.OpcodeShuffle: 678 x, y, lane1, lane2 := instr.ShuffleData() 679 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 680 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 681 rd := operandNR(m.compiler.VRegOf(instr.Return())) 682 683 m.lowerShuffle(rd, rn, rm, lane1, lane2) 684 685 case ssa.OpcodeSplat: 686 x, lane := instr.ArgWithLane() 687 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 688 rd := operandNR(m.compiler.VRegOf(instr.Return())) 689 690 dup := m.allocateInstr() 691 switch lane { 692 case ssa.VecLaneI8x16: 693 dup.asVecDup(rd, rn, vecArrangement16B) 694 case ssa.VecLaneI16x8: 695 dup.asVecDup(rd, rn, vecArrangement8H) 696 case ssa.VecLaneI32x4: 697 dup.asVecDup(rd, rn, vecArrangement4S) 698 case ssa.VecLaneI64x2: 699 dup.asVecDup(rd, rn, vecArrangement2D) 700 case ssa.VecLaneF32x4: 701 dup.asVecDupElement(rd, rn, vecArrangementS, vecIndex(0)) 702 case ssa.VecLaneF64x2: 703 dup.asVecDupElement(rd, rn, vecArrangementD, vecIndex(0)) 704 } 705 m.insert(dup) 706 707 case ssa.OpcodeLoadSplat: 708 ptr, offset, lane := instr.LoadSplatData() 709 m.lowerLoadSplat(ptr, offset, lane, instr.Return()) 710 default: 711 panic("TODO: lowering " + op.String()) 712 } 713 m.FlushPendingInstructions() 714 } 715 716 func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) { 717 // `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30. 718 vReg, wReg := v29VReg, v30VReg 719 720 // Initialize v29, v30 to rn, rm. 721 movv := m.allocateInstr() 722 movv.asFpuMov128(vReg, rn.nr()) 723 m.insert(movv) 724 725 movw := m.allocateInstr() 726 movw.asFpuMov128(wReg, rm.nr()) 727 m.insert(movw) 728 729 // `lane1`, `lane2` are already encoded as two u64s with the right layout: 730 // lane1 := lane[7]<<56 | ... | lane[1]<<8 | lane[0] 731 // lane2 := lane[15]<<56 | ... | lane[9]<<8 | lane[8] 732 // Thus, we can use loadFpuConst128. 733 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 734 lfc := m.allocateInstr() 735 lfc.asLoadFpuConst128(tmp.nr(), lane1, lane2) 736 m.insert(lfc) 737 738 // tbl <rd>.16b, { <vReg>.16B, <wReg>.16b }, <tmp>.16b 739 tbl2 := m.allocateInstr() 740 tbl2.asVecTbl(2, rd, operandNR(vReg), tmp, vecArrangement16B) 741 m.insert(tbl2) 742 } 743 744 func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) { 745 var modulo byte 746 switch arr { 747 case vecArrangement16B: 748 modulo = 0x7 // Modulo 8. 749 case vecArrangement8H: 750 modulo = 0xf // Modulo 16. 751 case vecArrangement4S: 752 modulo = 0x1f // Modulo 32. 753 case vecArrangement2D: 754 modulo = 0x3f // Modulo 64. 
755 default: 756 panic("unsupported arrangment " + arr.String()) 757 } 758 759 rtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 760 vtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 761 762 and := m.allocateInstr() 763 and.asALUBitmaskImm(aluOpAnd, rtmp.nr(), rm.nr(), uint64(modulo), true) 764 m.insert(and) 765 766 if op != ssa.OpcodeVIshl { 767 // Negate the amount to make this as right shift. 768 neg := m.allocateInstr() 769 neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true) 770 m.insert(neg) 771 } 772 773 // Copy the shift amount into a vector register as sshl/ushl requires it to be there. 774 dup := m.allocateInstr() 775 dup.asVecDup(vtmp, rtmp, arr) 776 m.insert(dup) 777 778 if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr { 779 sshl := m.allocateInstr() 780 sshl.asVecRRR(vecOpSshl, rd, rn, vtmp, arr) 781 m.insert(sshl) 782 } else { 783 ushl := m.allocateInstr() 784 ushl.asVecRRR(vecOpUshl, rd, rn, vtmp, arr) 785 m.insert(ushl) 786 } 787 } 788 789 func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) { 790 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 791 792 // Special case VallTrue for i64x2. 793 if op == ssa.OpcodeVallTrue && arr == vecArrangement2D { 794 // cmeq v3?.2d, v2?.2d, #0 795 // addp v3?.2d, v3?.2d, v3?.2d 796 // fcmp v3?, v3? 797 // cset dst, eq 798 799 ins := m.allocateInstr() 800 ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D) 801 m.insert(ins) 802 803 addp := m.allocateInstr() 804 addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D) 805 m.insert(addp) 806 807 fcmp := m.allocateInstr() 808 fcmp.asFpuCmp(tmp, tmp, true) 809 m.insert(fcmp) 810 811 cset := m.allocateInstr() 812 cset.asCSet(rd.nr(), false, eq) 813 m.insert(cset) 814 815 return 816 } 817 818 // Create a scalar value with umaxp or uminv, then compare it against zero. 819 ins := m.allocateInstr() 820 if op == ssa.OpcodeVanyTrue { 821 // umaxp v4?.16b, v2?.16b, v2?.16b 822 ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B) 823 } else { 824 // uminv d4?, v2?.4s 825 ins.asVecLanes(vecOpUminv, tmp, rm, arr) 826 } 827 m.insert(ins) 828 829 // mov x3?, v4?.d[0] 830 // ccmp x3?, #0x0, #0x0, al 831 // cset x3?, ne 832 // mov x0, x3? 833 834 movv := m.allocateInstr() 835 movv.asMovFromVec(rd, tmp, vecArrangementD, vecIndex(0), false) 836 m.insert(movv) 837 838 fc := m.allocateInstr() 839 fc.asCCmpImm(rd, uint64(0), al, 0, true) 840 m.insert(fc) 841 842 cset := m.allocateInstr() 843 cset.asCSet(rd.nr(), false, ne) 844 m.insert(cset) 845 } 846 847 func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { 848 r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 849 v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 850 v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 851 852 switch arr { 853 case vecArrangement16B: 854 // sshr v6?.16b, v2?.16b, #7 855 // movz x4?, #0x201, lsl 0 856 // movk x4?, #0x804, lsl 16 857 // movk x4?, #0x2010, lsl 32 858 // movk x4?, #0x8040, lsl 48 859 // dup v5?.2d, x4? 860 // and v6?.16b, v6?.16b, v5?.16b 861 // ext v5?.16b, v6?.16b, v6?.16b, #8 862 // zip1 v5?.16b, v6?.16b, v5?.16b 863 // addv s5?, v5?.8h 864 // umov s3?, v5?.h[0] 865 866 // Right arithmetic shift on the original vector and store the result into v1. So we have: 867 // v1[i] = 0xff if vi<0, 0 otherwise. 868 sshr := m.allocateInstr() 869 sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B) 870 m.insert(sshr) 871 872 // Load the bit mask into r0. 
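		// The constant assembled below is 0x8040_2010_0804_0201: byte i of each 64-bit
		// half equals 1<<(i%8), so after the dup into both halves of v0, the AND below
		// leaves lane i holding exactly "its" bit whenever the lane's sign bit was set.
		// It is built 16 bits at a time with movz/movk.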
873 m.insertMOVZ(r0.nr(), 0x0201, 0, true) 874 m.insertMOVK(r0.nr(), 0x0804, 1, true) 875 m.insertMOVK(r0.nr(), 0x2010, 2, true) 876 m.insertMOVK(r0.nr(), 0x8040, 3, true) 877 878 // dup r0 to v0. 879 dup := m.allocateInstr() 880 dup.asVecDup(v0, r0, vecArrangement2D) 881 m.insert(dup) 882 883 // Lane-wise logical AND with the bit mask, meaning that we have 884 // v[i] = (1 << i) if vi<0, 0 otherwise. 885 // 886 // Below, we use the following notation: 887 // wi := (1 << i) if vi<0, 0 otherwise. 888 and := m.allocateInstr() 889 and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B) 890 m.insert(and) 891 892 // Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have 893 // v0[i] = w(i+8) if i < 8, w(i-8) otherwise. 894 ext := m.allocateInstr() 895 ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8)) 896 m.insert(ext) 897 898 // v = [w0, w8, ..., w7, w15] 899 zip1 := m.allocateInstr() 900 zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B) 901 m.insert(zip1) 902 903 // v.h[0] = w0 + ... + w15 904 addv := m.allocateInstr() 905 addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) 906 m.insert(addv) 907 908 // Extract the v.h[0] as the result. 909 movfv := m.allocateInstr() 910 movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) 911 m.insert(movfv) 912 case vecArrangement8H: 913 // sshr v6?.8h, v2?.8h, #15 914 // movz x4?, #0x1, lsl 0 915 // movk x4?, #0x2, lsl 16 916 // movk x4?, #0x4, lsl 32 917 // movk x4?, #0x8, lsl 48 918 // dup v5?.2d, x4? 919 // lsl x4?, x4?, 0x4 920 // ins v5?.d[1], x4? 921 // and v5?.16b, v6?.16b, v5?.16b 922 // addv s5?, v5?.8h 923 // umov s3?, v5?.h[0] 924 925 // Right arithmetic shift on the original vector and store the result into v1. So we have: 926 // v[i] = 0xffff if vi<0, 0 otherwise. 927 sshr := m.allocateInstr() 928 sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H) 929 m.insert(sshr) 930 931 // Load the bit mask into r0. 932 m.lowerConstantI64(r0.nr(), 0x0008000400020001) 933 934 // dup r0 to vector v0. 935 dup := m.allocateInstr() 936 dup.asVecDup(v0, r0, vecArrangement2D) 937 m.insert(dup) 938 939 lsl := m.allocateInstr() 940 lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true) 941 m.insert(lsl) 942 943 movv := m.allocateInstr() 944 movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) 945 m.insert(movv) 946 947 // Lane-wise logical AND with the bitmask, meaning that we have 948 // v[i] = (1 << i) if vi<0, 0 otherwise for i=0..3 949 // = (1 << (i+4)) if vi<0, 0 otherwise for i=3..7 950 and := m.allocateInstr() 951 and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) 952 m.insert(and) 953 954 addv := m.allocateInstr() 955 addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) 956 m.insert(addv) 957 958 movfv := m.allocateInstr() 959 movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) 960 m.insert(movfv) 961 case vecArrangement4S: 962 // sshr v6?.8h, v2?.8h, #15 963 // movz x4?, #0x1, lsl 0 964 // movk x4?, #0x2, lsl 16 965 // movk x4?, #0x4, lsl 32 966 // movk x4?, #0x8, lsl 48 967 // dup v5?.2d, x4? 968 // lsl x4?, x4?, 0x4 969 // ins v5?.d[1], x4? 970 // and v5?.16b, v6?.16b, v5?.16b 971 // addv s5?, v5?.8h 972 // umov s3?, v5?.h[0] 973 974 // Right arithmetic shift on the original vector and store the result into v1. So we have: 975 // v[i] = 0xffffffff if vi<0, 0 otherwise. 976 sshr := m.allocateInstr() 977 sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S) 978 m.insert(sshr) 979 980 // Load the bit mask into r0. 
981 m.lowerConstantI64(r0.nr(), 0x0000000200000001) 982 983 // dup r0 to vector v0. 984 dup := m.allocateInstr() 985 dup.asVecDup(v0, r0, vecArrangement2D) 986 m.insert(dup) 987 988 lsl := m.allocateInstr() 989 lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true) 990 m.insert(lsl) 991 992 movv := m.allocateInstr() 993 movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) 994 m.insert(movv) 995 996 // Lane-wise logical AND with the bitmask, meaning that we have 997 // v[i] = (1 << i) if vi<0, 0 otherwise for i in [0, 1] 998 // = (1 << (i+4)) if vi<0, 0 otherwise for i in [2, 3] 999 and := m.allocateInstr() 1000 and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) 1001 m.insert(and) 1002 1003 addv := m.allocateInstr() 1004 addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S) 1005 m.insert(addv) 1006 1007 movfv := m.allocateInstr() 1008 movfv.asMovFromVec(rd, v0, vecArrangementS, vecIndex(0), false) 1009 m.insert(movfv) 1010 case vecArrangement2D: 1011 // mov d3?, v2?.d[0] 1012 // mov x4?, v2?.d[1] 1013 // lsr x4?, x4?, 0x3f 1014 // lsr d3?, d3?, 0x3f 1015 // add s3?, s3?, w4?, lsl #1 1016 1017 // Move the lower 64-bit int into result. 1018 movv0 := m.allocateInstr() 1019 movv0.asMovFromVec(rd, rm, vecArrangementD, vecIndex(0), false) 1020 m.insert(movv0) 1021 1022 // Move the higher 64-bit int into r0. 1023 movv1 := m.allocateInstr() 1024 movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false) 1025 m.insert(movv1) 1026 1027 // Move the sign bit into the least significant bit. 1028 lsr1 := m.allocateInstr() 1029 lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true) 1030 m.insert(lsr1) 1031 1032 lsr2 := m.allocateInstr() 1033 lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true) 1034 m.insert(lsr2) 1035 1036 // rd = (r0<<1) | rd 1037 lsl := m.allocateInstr() 1038 lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false) 1039 m.insert(lsl) 1040 default: 1041 panic("Unsupported " + arr.String()) 1042 } 1043 } 1044 1045 func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) { 1046 x, lane := instr.ArgWithLane() 1047 arr := ssaLaneToArrangement(lane) 1048 ins := m.allocateInstr() 1049 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1050 rd := operandNR(m.compiler.VRegOf(instr.Return())) 1051 ins.asVecMisc(op, rd, rn, arr) 1052 m.insert(ins) 1053 } 1054 1055 func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) { 1056 ins := m.allocateInstr() 1057 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1058 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1059 rd := operandNR(m.compiler.VRegOf(ret)) 1060 ins.asVecRRR(op, rd, rn, rm, arr) 1061 m.insert(ins) 1062 } 1063 1064 func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { 1065 if arr != vecArrangement2D { 1066 mul := m.allocateInstr() 1067 mul.asVecRRR(vecOpMul, rd, rn, rm, arr) 1068 m.insert(mul) 1069 } else { 1070 tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1071 tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1072 tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1073 1074 tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1075 1076 // Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696 1077 rev64 := m.allocateInstr() 1078 rev64.asVecMisc(vecOpRev64, tmp2, rm, vecArrangement4S) 1079 m.insert(rev64) 1080 1081 mul := m.allocateInstr() 1082 mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S) 1083 m.insert(mul) 1084 1085 xtn1 := 
m.allocateInstr() 1086 xtn1.asVecMisc(vecOpXtn, tmp1, rn, vecArrangement2S) 1087 m.insert(xtn1) 1088 1089 addp := m.allocateInstr() 1090 addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S) 1091 m.insert(addp) 1092 1093 xtn2 := m.allocateInstr() 1094 xtn2.asVecMisc(vecOpXtn, tmp3, rm, vecArrangement2S) 1095 m.insert(xtn2) 1096 1097 // Note: do not write the result directly into result yet. This is the same reason as in bsl. 1098 // In short, in UMLAL instruction, the result register is also one of the source register, and 1099 // the value on the result register is significant. 1100 shll := m.allocateInstr() 1101 shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S) 1102 m.insert(shll) 1103 1104 umlal := m.allocateInstr() 1105 umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S) 1106 m.insert(umlal) 1107 1108 mov := m.allocateInstr() 1109 mov.asFpuMov128(rd.nr(), tmpRes.nr()) 1110 m.insert(mov) 1111 } 1112 } 1113 1114 func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) { 1115 x, y, lane := instr.Arg2WithLane() 1116 arr := ssaLaneToArrangement(lane) 1117 1118 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1119 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1120 1121 // Note: this usage of tmp is important. 1122 // BSL modifies the destination register, so we need to use a temporary register so that 1123 // the actual definition of the destination register happens *after* the BSL instruction. 1124 // That way, we can force the spill instruction to be inserted after the BSL instruction. 1125 tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) 1126 1127 fcmgt := m.allocateInstr() 1128 if max { 1129 fcmgt.asVecRRR(vecOpFcmgt, tmp, rm, rn, arr) 1130 } else { 1131 // If min, swap the args. 1132 fcmgt.asVecRRR(vecOpFcmgt, tmp, rn, rm, arr) 1133 } 1134 m.insert(fcmgt) 1135 1136 bsl := m.allocateInstr() 1137 bsl.asVecRRRRewrite(vecOpBsl, tmp, rm, rn, vecArrangement16B) 1138 m.insert(bsl) 1139 1140 res := operandNR(m.compiler.VRegOf(instr.Return())) 1141 mov2 := m.allocateInstr() 1142 mov2.asFpuMov128(res.nr(), tmp.nr()) 1143 m.insert(mov2) 1144 } 1145 1146 func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { 1147 div := m.allocateInstr() 1148 1149 if signed { 1150 div.asALU(aluOpSDiv, rd, rn, rm, _64bit) 1151 } else { 1152 div.asALU(aluOpUDiv, rd, rn, rm, _64bit) 1153 } 1154 m.insert(div) 1155 1156 // Check if rm is zero: 1157 m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) 1158 1159 // rd = rn-rd*rm by MSUB instruction. 1160 msub := m.allocateInstr() 1161 msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit) 1162 m.insert(msub) 1163 } 1164 1165 func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { 1166 div := m.allocateInstr() 1167 1168 if signed { 1169 div.asALU(aluOpSDiv, rd, rn, rm, _64bit) 1170 } else { 1171 div.asALU(aluOpUDiv, rd, rn, rm, _64bit) 1172 } 1173 m.insert(div) 1174 1175 // Check if rm is zero: 1176 m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) 1177 1178 if signed { 1179 // We need to check the signed overflow which happens iff "math.MinInt{32,64} / -1" 1180 minusOneCheck := m.allocateInstr() 1181 // Sets eq condition if rm == -1. 
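		// (adds xzr, rm, #1 sets the eq/Z flag exactly when rm == -1, since rm+1 == 0.)
		// The whole signed-overflow guard then looks roughly like this (sketch with
		// placeholder registers):
		//
		//	adds xzr, x_rm, #1      ;; Z := (rm == -1)
		//	ccmp x_rn, #1, #0, eq   ;; if Z: flags from rn-1 (V set iff rn == MinInt); else flags := 0
		//	b.vc ok                 ;; no overflow, continue
		//	;; exit with ExitCodeIntegerOverflow
		// ok: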
1182 minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit) 1183 m.insert(minusOneCheck) 1184 1185 ccmp := m.allocateInstr() 1186 // If eq condition is set, sets the flag by the result based on "rn - 1", otherwise clears the flag. 1187 ccmp.asCCmpImm(rn, 1, eq, 0, _64bit) 1188 m.insert(ccmp) 1189 1190 // Check the overflow flag. 1191 m.exitIfNot(execCtxVReg, vs.invert().asCond(), false, wazevoapi.ExitCodeIntegerOverflow) 1192 } 1193 } 1194 1195 // exitIfNot emits a conditional branch to exit if the condition is not met. 1196 // If `c` (cond type) is a register, `cond64bit` must be chosen to indicate whether the register is 32-bit or 64-bit. 1197 // Otherwise, `cond64bit` is ignored. 1198 func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, code wazevoapi.ExitCode) { 1199 execCtxTmp := m.copyToTmp(execCtxVReg) 1200 1201 cbr := m.allocateInstr() 1202 m.insert(cbr) 1203 m.lowerExitWithCode(execCtxTmp, code) 1204 // Conditional branch target is after exit. 1205 l := m.insertBrTargetLabel() 1206 cbr.asCondBr(c, l, cond64bit) 1207 } 1208 1209 func (m *machine) lowerFcopysign(x, y, ret ssa.Value) { 1210 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1211 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1212 var tmpI, tmpF operand 1213 _64 := x.Type() == ssa.TypeF64 1214 if _64 { 1215 tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) 1216 tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 1217 } else { 1218 tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32)) 1219 tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) 1220 } 1221 rd := m.compiler.VRegOf(ret) 1222 m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64) 1223 } 1224 1225 func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) { 1226 // This is exactly the same code emitted by GCC for "__builtin_copysign": 1227 // 1228 // mov x0, -9223372036854775808 1229 // fmov d2, x0 1230 // vbit v0.8b, v1.8b, v2.8b 1231 // 1232 1233 setMSB := m.allocateInstr() 1234 if _64bit { 1235 m.lowerConstantI64(tmpI.nr(), math.MinInt64) 1236 setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0)) 1237 } else { 1238 m.lowerConstantI32(tmpI.nr(), math.MinInt32) 1239 setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0)) 1240 } 1241 m.insert(setMSB) 1242 1243 tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) 1244 1245 mov := m.allocateInstr() 1246 mov.asFpuMov64(tmpReg.nr(), rn.nr()) 1247 m.insert(mov) 1248 1249 vbit := m.allocateInstr() 1250 vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B) 1251 m.insert(vbit) 1252 1253 movDst := m.allocateInstr() 1254 movDst.asFpuMov64(rd.nr(), tmpReg.nr()) 1255 m.insert(movDst) 1256 } 1257 1258 func (m *machine) lowerBitcast(instr *ssa.Instruction) { 1259 v, dstType := instr.BitcastData() 1260 srcType := v.Type() 1261 rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) 1262 rd := operandNR(m.compiler.VRegOf(instr.Return())) 1263 srcInt := srcType.IsInt() 1264 dstInt := dstType.IsInt() 1265 switch { 1266 case srcInt && !dstInt: // Int to Float: 1267 mov := m.allocateInstr() 1268 var arr vecArrangement 1269 if srcType.Bits() == 64 { 1270 arr = vecArrangementD 1271 } else { 1272 arr = vecArrangementS 1273 } 1274 mov.asMovToVec(rd, rn, arr, vecIndex(0)) 1275 m.insert(mov) 1276 case !srcInt && dstInt: // Float to Int: 1277 mov := m.allocateInstr() 1278 var arr vecArrangement 1279 if dstType.Bits() == 64 { 1280 arr = vecArrangementD 1281 } else { 1282 
arr = vecArrangementS 1283 } 1284 mov.asMovFromVec(rd, rn, arr, vecIndex(0), false) 1285 m.insert(mov) 1286 default: 1287 panic("TODO?BUG?") 1288 } 1289 } 1290 1291 func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) { 1292 rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone) 1293 rd := operandNR(m.compiler.VRegOf(out)) 1294 1295 neg := m.allocateInstr() 1296 neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64) 1297 m.insert(neg) 1298 } 1299 1300 func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) { 1301 if !nonTrapping { 1302 // First of all, we have to clear the FPU flags. 1303 flagClear := m.allocateInstr() 1304 flagClear.asMovToFPSR(xzrVReg) 1305 m.insert(flagClear) 1306 } 1307 1308 // Then, do the conversion which doesn't trap inherently. 1309 cvt := m.allocateInstr() 1310 cvt.asFpuToInt(rd, rn, signed, src64bit, dst64bit) 1311 m.insert(cvt) 1312 1313 if !nonTrapping { 1314 tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) 1315 1316 // After the conversion, check the FPU flags. 1317 getFlag := m.allocateInstr() 1318 getFlag.asMovFromFPSR(tmpReg) 1319 m.insert(getFlag) 1320 1321 execCtx := m.copyToTmp(ctx) 1322 _rn := operandNR(m.copyToTmp(rn.nr())) 1323 1324 // Check if the conversion was undefined by comparing the status with 1. 1325 // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register 1326 alu := m.allocateInstr() 1327 alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true) 1328 m.insert(alu) 1329 1330 // If it is not undefined, we can return the result. 1331 ok := m.allocateInstr() 1332 m.insert(ok) 1333 1334 // Otherwise, we have to choose the status depending on it is overflow or NaN conversion. 1335 1336 // Comparing itself to check if it is a NaN. 1337 fpuCmp := m.allocateInstr() 1338 fpuCmp.asFpuCmp(_rn, _rn, src64bit) 1339 m.insert(fpuCmp) 1340 // If the VC flag is not set (== VS flag is set), it is a NaN. 1341 m.exitIfNot(execCtx, vc.asCond(), false, wazevoapi.ExitCodeInvalidConversionToInteger) 1342 // Otherwise, it is an overflow. 1343 m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) 1344 1345 // Conditional branch target is after exit. 
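		// (The `ok` branch above was allocated and inserted before the exit code was
		// emitted; only now that the exit paths are in place can the fall-through label
		// be created and patched into it via asCondBr. exitIfNot uses the same
		// allocate-first, patch-later pattern.)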
1346 l := m.insertBrTargetLabel() 1347 ok.asCondBr(ne.asCond(), l, false /* ignored */) 1348 } 1349 } 1350 1351 func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) { 1352 cvt := m.allocateInstr() 1353 cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit) 1354 m.insert(cvt) 1355 } 1356 1357 func (m *machine) lowerFpuBinOp(si *ssa.Instruction) { 1358 instr := m.allocateInstr() 1359 var op fpuBinOp 1360 switch si.Opcode() { 1361 case ssa.OpcodeFadd: 1362 op = fpuBinOpAdd 1363 case ssa.OpcodeFsub: 1364 op = fpuBinOpSub 1365 case ssa.OpcodeFmul: 1366 op = fpuBinOpMul 1367 case ssa.OpcodeFdiv: 1368 op = fpuBinOpDiv 1369 case ssa.OpcodeFmax: 1370 op = fpuBinOpMax 1371 case ssa.OpcodeFmin: 1372 op = fpuBinOpMin 1373 } 1374 x, y := si.Arg2() 1375 xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) 1376 rn := m.getOperand_NR(xDef, extModeNone) 1377 rm := m.getOperand_NR(yDef, extModeNone) 1378 rd := operandNR(m.compiler.VRegOf(si.Return())) 1379 instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64) 1380 m.insert(instr) 1381 } 1382 1383 func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) { 1384 x, y := si.Arg2() 1385 if !x.Type().IsInt() { 1386 panic("BUG?") 1387 } 1388 1389 xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) 1390 rn := m.getOperand_NR(xDef, extModeNone) 1391 rm, yNegated := m.getOperand_MaybeNegatedImm12_ER_SR_NR(yDef, extModeNone) 1392 1393 var aop aluOp 1394 switch { 1395 case add && !yNegated: // rn+rm = x+y 1396 aop = aluOpAdd 1397 case add && yNegated: // rn-rm = x-(-y) = x+y 1398 aop = aluOpSub 1399 case !add && !yNegated: // rn-rm = x-y 1400 aop = aluOpSub 1401 case !add && yNegated: // rn+rm = x-(-y) = x-y 1402 aop = aluOpAdd 1403 } 1404 rd := operandNR(m.compiler.VRegOf(si.Return())) 1405 alu := m.allocateInstr() 1406 alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64) 1407 m.insert(alu) 1408 } 1409 1410 // InsertMove implements backend.Machine. 
1411 func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { 1412 instr := m.allocateInstr() 1413 switch typ { 1414 case ssa.TypeI32, ssa.TypeI64: 1415 instr.asMove64(dst, src) 1416 case ssa.TypeF32, ssa.TypeF64: 1417 instr.asFpuMov64(dst, src) 1418 case ssa.TypeV128: 1419 instr.asFpuMov128(dst, src) 1420 default: 1421 panic("TODO") 1422 } 1423 m.insert(instr) 1424 } 1425 1426 func (m *machine) lowerIcmp(si *ssa.Instruction) { 1427 x, y, c := si.IcmpData() 1428 flag := condFlagFromSSAIntegerCmpCond(c) 1429 1430 in64bit := x.Type().Bits() == 64 1431 var ext extMode 1432 if in64bit { 1433 if c.Signed() { 1434 ext = extModeSignExtend64 1435 } else { 1436 ext = extModeZeroExtend64 1437 } 1438 } else { 1439 if c.Signed() { 1440 ext = extModeSignExtend32 1441 } else { 1442 ext = extModeZeroExtend32 1443 } 1444 } 1445 1446 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) 1447 rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext) 1448 alu := m.allocateInstr() 1449 alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit) 1450 m.insert(alu) 1451 1452 cset := m.allocateInstr() 1453 cset.asCSet(m.compiler.VRegOf(si.Return()), false, flag) 1454 m.insert(cset) 1455 } 1456 1457 func (m *machine) lowerVIcmp(si *ssa.Instruction) { 1458 x, y, c, lane := si.VIcmpData() 1459 flag := condFlagFromSSAIntegerCmpCond(c) 1460 arr := ssaLaneToArrangement(lane) 1461 1462 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1463 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1464 rd := operandNR(m.compiler.VRegOf(si.Return())) 1465 1466 switch flag { 1467 case eq: 1468 cmp := m.allocateInstr() 1469 cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) 1470 m.insert(cmp) 1471 case ne: 1472 cmp := m.allocateInstr() 1473 cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) 1474 m.insert(cmp) 1475 not := m.allocateInstr() 1476 not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) 1477 m.insert(not) 1478 case ge: 1479 cmp := m.allocateInstr() 1480 cmp.asVecRRR(vecOpCmge, rd, rn, rm, arr) 1481 m.insert(cmp) 1482 case gt: 1483 cmp := m.allocateInstr() 1484 cmp.asVecRRR(vecOpCmgt, rd, rn, rm, arr) 1485 m.insert(cmp) 1486 case le: 1487 cmp := m.allocateInstr() 1488 cmp.asVecRRR(vecOpCmge, rd, rm, rn, arr) // rm, rn are swapped 1489 m.insert(cmp) 1490 case lt: 1491 cmp := m.allocateInstr() 1492 cmp.asVecRRR(vecOpCmgt, rd, rm, rn, arr) // rm, rn are swapped 1493 m.insert(cmp) 1494 case hs: 1495 cmp := m.allocateInstr() 1496 cmp.asVecRRR(vecOpCmhs, rd, rn, rm, arr) 1497 m.insert(cmp) 1498 case hi: 1499 cmp := m.allocateInstr() 1500 cmp.asVecRRR(vecOpCmhi, rd, rn, rm, arr) 1501 m.insert(cmp) 1502 case ls: 1503 cmp := m.allocateInstr() 1504 cmp.asVecRRR(vecOpCmhs, rd, rm, rn, arr) // rm, rn are swapped 1505 m.insert(cmp) 1506 case lo: 1507 cmp := m.allocateInstr() 1508 cmp.asVecRRR(vecOpCmhi, rd, rm, rn, arr) // rm, rn are swapped 1509 m.insert(cmp) 1510 } 1511 } 1512 1513 func (m *machine) lowerVFcmp(si *ssa.Instruction) { 1514 x, y, c, lane := si.VFcmpData() 1515 flag := condFlagFromSSAFloatCmpCond(c) 1516 arr := ssaLaneToArrangement(lane) 1517 1518 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1519 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1520 rd := operandNR(m.compiler.VRegOf(si.Return())) 1521 1522 switch flag { 1523 case eq: 1524 cmp := m.allocateInstr() 1525 cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) 1526 m.insert(cmp) 1527 case ne: 1528 cmp := m.allocateInstr() 1529 cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) 1530 m.insert(cmp) 1531 not := 
m.allocateInstr() 1532 not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) 1533 m.insert(not) 1534 case ge: 1535 cmp := m.allocateInstr() 1536 cmp.asVecRRR(vecOpFcmge, rd, rn, rm, arr) 1537 m.insert(cmp) 1538 case gt: 1539 cmp := m.allocateInstr() 1540 cmp.asVecRRR(vecOpFcmgt, rd, rn, rm, arr) 1541 m.insert(cmp) 1542 case mi: 1543 cmp := m.allocateInstr() 1544 cmp.asVecRRR(vecOpFcmgt, rd, rm, rn, arr) // rm, rn are swapped 1545 m.insert(cmp) 1546 case ls: 1547 cmp := m.allocateInstr() 1548 cmp.asVecRRR(vecOpFcmge, rd, rm, rn, arr) // rm, rn are swapped 1549 m.insert(cmp) 1550 } 1551 } 1552 1553 func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) { 1554 cvt := m.allocateInstr() 1555 if signed { 1556 cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr) 1557 } else { 1558 cvt.asVecMisc(vecOpFcvtzu, rd, rn, arr) 1559 } 1560 m.insert(cvt) 1561 1562 if arr == vecArrangement2D { 1563 narrow := m.allocateInstr() 1564 if signed { 1565 narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S) 1566 } else { 1567 narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S) 1568 } 1569 m.insert(narrow) 1570 } 1571 } 1572 1573 func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) { 1574 cvt := m.allocateInstr() 1575 if signed { 1576 cvt.asVecMisc(vecOpScvtf, rd, rn, arr) 1577 } else { 1578 cvt.asVecMisc(vecOpUcvtf, rd, rn, arr) 1579 } 1580 m.insert(cvt) 1581 } 1582 1583 func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) { 1584 x, amount := si.Arg2() 1585 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) 1586 rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits()) 1587 rd := operandNR(m.compiler.VRegOf(si.Return())) 1588 1589 alu := m.allocateInstr() 1590 alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64) 1591 m.insert(alu) 1592 } 1593 1594 func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp) { 1595 x, y := si.Arg2() 1596 if !x.Type().IsInt() { 1597 panic("BUG?") 1598 } 1599 1600 xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) 1601 rn := m.getOperand_NR(xDef, extModeNone) 1602 rm := m.getOperand_SR_NR(yDef, extModeNone) 1603 rd := operandNR(m.compiler.VRegOf(si.Return())) 1604 1605 alu := m.allocateInstr() 1606 alu.asALU(op, rd, rn, rm, si.Return().Type().Bits() == 64) 1607 m.insert(alu) 1608 } 1609 1610 func (m *machine) lowerRotl(si *ssa.Instruction) { 1611 x, y := si.Arg2() 1612 r := si.Return() 1613 _64 := r.Type().Bits() == 64 1614 1615 rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) 1616 rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) 1617 var tmp operand 1618 if _64 { 1619 tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) 1620 } else { 1621 tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) 1622 } 1623 rd := operandNR(m.compiler.VRegOf(r)) 1624 1625 // Encode rotl as neg + rotr: neg is a sub against the zero-reg. 1626 m.lowerRotlImpl(rd, rn, rm, tmp, _64) 1627 } 1628 1629 func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) { 1630 // Encode rotl as neg + rotr: neg is a sub against the zero-reg. 
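	// rotl(x, n) == rotr(x, -n mod width), and ROR (register) only uses the shift
	// amount modulo the register width, so a plain negation is enough. Sketch of the
	// emitted pair (placeholder registers):
	//
	//	neg x_tmp, x_rm   ;; i.e. sub x_tmp, xzr, x_rm
	//	ror x_rd, x_rn, x_tmp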
func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) {
	// Encode rotl as neg + rotr: neg is a sub against the zero-reg.
	neg := m.allocateInstr()
	neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit)
	m.insert(neg)
	alu := m.allocateInstr()
	alu.asALU(aluOpRotR, rd, rn, tmp, is64bit)
	m.insert(alu)
}

func (m *machine) lowerRotr(si *ssa.Instruction) {
	x, y := si.Arg2()

	xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
	rn := m.getOperand_NR(xDef, extModeNone)
	rm := m.getOperand_NR(yDef, extModeNone)
	rd := operandNR(m.compiler.VRegOf(si.Return()))

	alu := m.allocateInstr()
	alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64)
	m.insert(alu)
}

func (m *machine) lowerExtend(arg, ret ssa.Value, from, to byte, signed bool) {
	rd := m.compiler.VRegOf(ret)
	rn := m.getOperand_NR(m.compiler.ValueDefinition(arg), extModeNone)

	ext := m.allocateInstr()
	ext.asExtend(rd, rn.nr(), from, to, signed)
	m.insert(ext)
}

func (m *machine) lowerFcmp(x, y, result ssa.Value, c ssa.FloatCmpCond) {
	rn, rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)

	fc := m.allocateInstr()
	fc.asFpuCmp(rn, rm, x.Type().Bits() == 64)
	m.insert(fc)

	cset := m.allocateInstr()
	cset.asCSet(m.compiler.VRegOf(result), false, condFlagFromSSAFloatCmpCond(c))
	m.insert(cset)
}

func (m *machine) lowerImul(x, y, result ssa.Value) {
	rd := m.compiler.VRegOf(result)
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)

	// TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg.

	mul := m.allocateInstr()
	mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64)
	m.insert(mul)
}

func (m *machine) lowerClz(x, result ssa.Value) {
	rd := m.compiler.VRegOf(result)
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	clz := m.allocateInstr()
	clz.asBitRR(bitOpClz, rd, rn.nr(), x.Type().Bits() == 64)
	m.insert(clz)
}

func (m *machine) lowerCtz(x, result ssa.Value) {
	rd := m.compiler.VRegOf(result)
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rbit := m.allocateInstr()
	_64 := x.Type().Bits() == 64
	var tmpReg regalloc.VReg
	if _64 {
		tmpReg = m.compiler.AllocateVReg(ssa.TypeI64)
	} else {
		tmpReg = m.compiler.AllocateVReg(ssa.TypeI32)
	}
	rbit.asBitRR(bitOpRbit, tmpReg, rn.nr(), _64)
	m.insert(rbit)

	clz := m.allocateInstr()
	clz.asBitRR(bitOpClz, rd, tmpReg, _64)
	m.insert(clz)
}
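// Note (illustrative only): AArch64 has no direct count-trailing-zeros instruction, so lowerCtz
// above bit-reverses the input and then counts leading zeros, roughly:
//
//	rbit w_tmp, w_x
//	clz  w_rd, w_tmp
//
// Register names are placeholders for exposition only.
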
func (m *machine) lowerPopcnt(x, result ssa.Value) {
	// arm64 doesn't have an instruction for population count on a scalar register,
	// so we use the vector instruction `cnt`.
	// This is exactly how the official Go compiler implements bits.OnesCount.
	// For example, "func() int { return bits.OnesCount(10) }" is compiled as
	//
	//	MOVD $10, R0 ;; Load 10.
	//	FMOVD R0, F0
	//	VCNT V0.B8, V0.B8
	//	UADDLV V0.B8, V0
	//
	// In aarch64 asm, FMOVD is encoded as `ins`, VCNT is `cnt`,
	// and the registers may use different names. In our encoding we use the following
	// instructions:
	//
	//	ins v0.d[0], x0     ;; mov from GPR to vec (FMOV above) is encoded as INS
	//	cnt v0.16b, v0.16b  ;; we use vec arrangement 16b
	//	uaddlv h0, v0.8b    ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b
	//	mov x5, v0.d[0]     ;; finally we mov the result back to a GPR
	//

	rd := operandNR(m.compiler.VRegOf(result))
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)

	rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
	ins := m.allocateInstr()
	ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0))
	m.insert(ins)

	rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
	cnt := m.allocateInstr()
	cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B)
	m.insert(cnt)

	rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
	uaddlv := m.allocateInstr()
	uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B)
	m.insert(uaddlv)

	mov := m.allocateInstr()
	mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0), false)
	m.insert(mov)
}

// lowerExitWithCode lowers the ExitWithCode instruction, which takes the execution context pointer as its argument.
func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.ExitCode) {
	tmpReg1 := m.compiler.AllocateVReg(ssa.TypeI32)
	loadExitCodeConst := m.allocateInstr()
	loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true)

	setExitCode := m.allocateInstr()
	setExitCode.asStore(operandNR(tmpReg1),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
		}, 32)

	// In order to unwind the stack, we also need to push the current stack pointer:
	tmp2 := m.compiler.AllocateVReg(ssa.TypeI64)
	movSpToTmp := m.allocateInstr()
	movSpToTmp.asMove64(tmp2, spVReg)
	strSpToExecCtx := m.allocateInstr()
	strSpToExecCtx.asStore(operandNR(tmp2),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
		}, 64)
	// Also the address of this exit.
	tmp3 := m.compiler.AllocateVReg(ssa.TypeI64)
	currentAddrToTmp := m.allocateInstr()
	currentAddrToTmp.asAdr(tmp3, 0)
	storeCurrentAddrToExecCtx := m.allocateInstr()
	storeCurrentAddrToExecCtx.asStore(operandNR(tmp3),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
		}, 64)

	exitSeq := m.allocateInstr()
	exitSeq.asExitSequence(execCtxVReg)

	m.insert(loadExitCodeConst)
	m.insert(setExitCode)
	m.insert(movSpToTmp)
	m.insert(strSpToExecCtx)
	m.insert(currentAddrToTmp)
	m.insert(storeCurrentAddrToExecCtx)
	m.insert(exitSeq)
}
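// Note (illustrative only): the instructions built by lowerExitWithCode above amount to three stores
// into the execution context followed by the exit sequence, conceptually:
//
//	movz w_tmp1, #code
//	str  w_tmp1, [x_execCtx, #ExitCodeOffset]           ;; record why we are exiting
//	mov  x_tmp2, sp
//	str  x_tmp2, [x_execCtx, #StackPointerBeforeGoCall] ;; allow the stack to be unwound
//	adr  x_tmp3, #0
//	str  x_tmp3, [x_execCtx, #GoCallReturnAddress]      ;; record the address of this exit
//	;; ... exit sequence ...
//
// Register names and offset labels are placeholders for exposition; the real offsets come from the
// wazevoapi.ExecutionContextOffset* constants used above.
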
func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) {
	if x.Type() != y.Type() {
		panic(
			fmt.Sprintf("TODO(maybe): support icmp with different types: v%d=%s != v%d=%s",
				x.ID(), x.Type(), y.ID(), y.Type()))
	}

	extMod := extModeOf(x.Type(), signed)

	// The first operand must be in pure register form.
	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extMod)
	// The second operand can be in any of the Imm12, ER, SR, or NR forms supported by the SUBS instruction.
	rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), extMod)

	alu := m.allocateInstr()
	// subs zr, rn, rm
	alu.asALU(
		aluOpSubS,
		// We don't need the result, just need to set flags.
		operandNR(xzrVReg),
		rn,
		rm,
		x.Type().Bits() == 64,
	)
	m.insert(alu)
}

func (m *machine) lowerFcmpToFlag(x, y ssa.Value) {
	if x.Type() != y.Type() {
		panic("TODO(maybe): support fcmp with different types")
	}

	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
	cmp := m.allocateInstr()
	cmp.asFpuCmp(rn, rm, x.Type().Bits() == 64)
	m.insert(cmp)
}

func (m *machine) lowerExitIfTrueWithCode(execCtxVReg regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) {
	condDef := m.compiler.ValueDefinition(cond)
	if !m.compiler.MatchInstr(condDef, ssa.OpcodeIcmp) {
		panic("TODO: OpcodeExitIfTrueWithCode must come after Icmp at the moment: " + condDef.Instr.Opcode().String())
	}
	condDef.Instr.MarkLowered()

	cvalInstr := condDef.Instr
	x, y, c := cvalInstr.IcmpData()
	signed := c.Signed()
	m.lowerIcmpToFlag(x, y, signed)

	execCtxTmp := m.copyToTmp(execCtxVReg)

	// We have to skip the entire exit sequence if the condition is false.
	cbr := m.allocateInstr()
	m.insert(cbr)
	m.lowerExitWithCode(execCtxTmp, code)
	// The conditional branch target is right after the exit sequence.
	l := m.insertBrTargetLabel()
	cbr.asCondBr(condFlagFromSSAIntegerCmpCond(c).invert().asCond(), l, false /* ignored */)
}
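// Note (illustrative only): lowerExitIfTrueWithCode above guards the exit sequence with a branch on
// the inverted condition, so the normal path falls through past it, conceptually:
//
//	subs wzr, w_x, w_y      ;; set flags for the Icmp condition
//	b.<inverted cond> after ;; skip the exit if the condition is false
//	;; ... exit sequence (lowerExitWithCode) ...
//	after:
//
// The label name and registers are placeholders; the branch is allocated before its target label
// exists, and asCondBr is filled in once the label after the exit sequence has been inserted.
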
func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
	cvalDef := m.compiler.ValueDefinition(c)

	var cc condFlag
	switch {
	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // In this case, we can use the ALU flag set by the SUBS instruction.
		cvalInstr := cvalDef.Instr
		x, y, c := cvalInstr.IcmpData()
		cc = condFlagFromSSAIntegerCmpCond(c)
		m.lowerIcmpToFlag(x, y, c.Signed())
		cvalDef.Instr.MarkLowered()
	case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // In this case, we can use the FPU flag directly.
		cvalInstr := cvalDef.Instr
		x, y, c := cvalInstr.FcmpData()
		cc = condFlagFromSSAFloatCmpCond(c)
		m.lowerFcmpToFlag(x, y)
		cvalDef.Instr.MarkLowered()
	default:
		rn := m.getOperand_NR(cvalDef, extModeNone)
		if c.Type() != ssa.TypeI32 && c.Type() != ssa.TypeI64 {
			panic("TODO?BUG?: support select with non-integer condition")
		}
		alu := m.allocateInstr()
		// subs zr, rn, zr
		alu.asALU(
			aluOpSubS,
			// We don't need the result, just need to set flags.
			operandNR(xzrVReg),
			rn,
			operandNR(xzrVReg),
			c.Type().Bits() == 64,
		)
		m.insert(alu)
		cc = ne
	}

	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
	rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)

	rd := operandNR(m.compiler.VRegOf(result))
	switch x.Type() {
	case ssa.TypeI32, ssa.TypeI64:
		// csel rd, rn, rm, cc
		csel := m.allocateInstr()
		csel.asCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
		m.insert(csel)
	case ssa.TypeF32, ssa.TypeF64:
		// fcsel rd, rn, rm, cc
		fcsel := m.allocateInstr()
		fcsel.asFpuCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
		m.insert(fcsel)
	default:
		panic("BUG")
	}
}

func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
	// First check if `rc` is zero or not.
	checkZero := m.allocateInstr()
	checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false)
	m.insert(checkZero)

	// Then use CSETM to set all bits to one if `rc` is non-zero, and to zero otherwise.
	allOnesOrZero := m.compiler.AllocateVReg(ssa.TypeI64)
	cset := m.allocateInstr()
	cset.asCSet(allOnesOrZero, true, ne)
	m.insert(cset)

	// Then broadcast that mask into a temporary vector register.
	tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
	dup := m.allocateInstr()
	dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D)
	m.insert(dup)

	// Now that `tmp2` has either all bits set or all bits clear depending on `rc`,
	// we can use bsl to select between `rn` and `rm`.
	ins := m.allocateInstr()
	ins.asVecRRRRewrite(vecOpBsl, tmp2, rn, rm, vecArrangement16B)
	m.insert(ins)

	// Finally, move the result to the destination register.
	mov2 := m.allocateInstr()
	mov2.asFpuMov128(rd.nr(), tmp2.nr())
	m.insert(mov2)
}

// copyToTmp copies the given regalloc.VReg to a temporary register. This is called before cbr to avoid
// register-allocation issues such as a reload happening in the middle of the exit sequence, which is not
// on the normal execution path.
func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg {
	typ := m.compiler.TypeOf(v)
	mov := m.allocateInstr()
	tmp := m.compiler.AllocateVReg(typ)
	if typ.IsInt() {
		mov.asMove64(tmp, v)
	} else {
		mov.asFpuMov128(tmp, v)
	}
	m.insert(mov)
	return tmp
}
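// Note (illustrative only): the vector select built by lowerSelectVec above expands to a compare,
// a mask materialization, and a bitwise select, conceptually:
//
//	subs  wzr, w_rc, wzr        ;; test whether the condition is zero
//	csetm x_mask, ne            ;; all ones if rc != 0, all zeros otherwise
//	dup   v_tmp.2d, x_mask      ;; broadcast the mask to a 128-bit vector
//	bsl   v_tmp.16b, v_rn, v_rm ;; take bits from rn where the mask is set, rm elsewhere
//	mov   v_rd.16b, v_tmp.16b   ;; move the result into the destination
//
// Register names are placeholders for exposition only.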