github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/arm64/machine.go

package arm64

import (
	"context"
	"fmt"
	"strings"

	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)

type (
	// machine implements backend.Machine.
	machine struct {
		compiler          backend.Compiler
		executableContext *backend.ExecutableContextT[instruction]
		currentABI        *backend.FunctionABI

		regAlloc   regalloc.Allocator
		regAllocFn *backend.RegAllocFunction[*instruction, *machine]

		// addendsWorkQueue is used during address lowering, defined here for reuse.
		addendsWorkQueue wazevoapi.Queue[ssa.Value]
		addends32        wazevoapi.Queue[addend32]
		// addends64 is used during address lowering, defined here for reuse.
		addends64              wazevoapi.Queue[regalloc.VReg]
		unresolvedAddressModes []*instruction

		// condBrRelocs holds the conditional branches which need offset relocation.
		condBrRelocs []condBrReloc

		// jmpTableTargets holds the labels of the jump table targets.
		jmpTableTargets [][]uint32

		// spillSlotSize is the size in bytes of the stack region used for spilling registers.
		// During the execution of the function, the stack looks like:
		//
		//            (high address)
		//          +-----------------+
		//          |     .......     |
		//          |      ret Y      |
		//          |     .......     |
		//          |      ret 0      |
		//          |      arg X      |
		//          |     .......     |
		//          |      arg 1      |
		//          |      arg 0      |
		//          |      xxxxx      |
		//          |  ReturnAddress  |
		//          +-----------------+ <<-|
		//          |   ...........   |    |
		//          |   spill slot M  |    | <--- spillSlotSize
		//          |   ............  |    |
		//          |   spill slot 2  |    |
		//          |   spill slot 1  | <<-+
		//          |   clobbered N   |
		//          |   ...........   |
		//          |   clobbered 1   |
		//          |   clobbered 0   |
		//  SP ---> +-----------------+
		//             (low address)
		//
		// i.e. spillSlotSize is the total size of the spill slot region shown above, between the saved
		// ReturnAddress and the clobbered registers. This must be a multiple of 16.
		// Also note that this is only known after register allocation.
		spillSlotSize int64
		spillSlots    map[regalloc.VRegID]int64 // regalloc.VRegID to offset.
		// clobberedRegs holds real-register-backed VRegs saved in the function prologue and restored in the epilogue.
		clobberedRegs []regalloc.VReg

		maxRequiredStackSizeForCalls int64
		stackBoundsCheckDisabled     bool

		regAllocStarted bool
	}

	addend32 struct {
		r   regalloc.VReg
		ext extendOp
	}

	condBrReloc struct {
		cbr *instruction
		// currentLabelPos is the labelPosition within which condBr is defined.
		currentLabelPos *labelPosition
		// Next block's labelPosition.
		nextLabel label
		offset    int64
	}

	labelPosition = backend.LabelPosition[instruction]
	label         = backend.Label
)

const (
	labelReturn  = backend.LabelReturn
	labelInvalid = backend.LabelInvalid
)

// NewBackend returns a new backend for arm64.
func NewBackend() backend.Machine {
	m := &machine{
		spillSlots:        make(map[regalloc.VRegID]int64),
		executableContext: newExecutableContext(),
		regAlloc:          regalloc.NewAllocator(regInfo),
	}
	return m
}

func newExecutableContext() *backend.ExecutableContextT[instruction] {
	return backend.NewExecutableContextT[instruction](resetInstruction, setNext, setPrev, asNop0)
}

// ExecutableContext implements backend.Machine.
func (m *machine) ExecutableContext() backend.ExecutableContext {
	return m.executableContext
}

// RegAlloc implements backend.Machine RegAlloc.
func (m *machine) RegAlloc() {
	rf := m.regAllocFn
	for _, pos := range m.executableContext.OrderedBlockLabels {
		rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End)
	}

	m.regAllocStarted = true
	m.regAlloc.DoAllocation(rf)
	// Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes.
	m.spillSlotSize = (m.spillSlotSize + 15) &^ 15
}

// Reset implements backend.Machine.
func (m *machine) Reset() {
	m.clobberedRegs = m.clobberedRegs[:0]
	for key := range m.spillSlots {
		m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key))
	}
	for _, key := range m.clobberedRegs {
		delete(m.spillSlots, regalloc.VRegID(key))
	}
	m.clobberedRegs = m.clobberedRegs[:0]
	m.regAllocStarted = false
	m.regAlloc.Reset()
	m.regAllocFn.Reset()
	m.spillSlotSize = 0
	m.unresolvedAddressModes = m.unresolvedAddressModes[:0]
	m.maxRequiredStackSizeForCalls = 0
	m.executableContext.Reset()
	m.jmpTableTargets = m.jmpTableTargets[:0]
}

// SetCurrentABI implements backend.Machine SetCurrentABI.
func (m *machine) SetCurrentABI(abi *backend.FunctionABI) {
	m.currentABI = abi
}

// DisableStackCheck implements backend.Machine DisableStackCheck.
func (m *machine) DisableStackCheck() {
	m.stackBoundsCheckDisabled = true
}

// SetCompiler implements backend.Machine.
func (m *machine) SetCompiler(ctx backend.Compiler) {
	m.compiler = ctx
	m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, ctx.SSABuilder(), ctx)
}

func (m *machine) insert(i *instruction) {
	ectx := m.executableContext
	ectx.PendingInstructions = append(ectx.PendingInstructions, i)
}

func (m *machine) insertBrTargetLabel() label {
	nop, l := m.allocateBrTarget()
	m.insert(nop)
	return l
}

func (m *machine) allocateBrTarget() (nop *instruction, l label) {
	ectx := m.executableContext
	l = ectx.AllocateLabel()
	nop = m.allocateInstr()
	nop.asNop0WithLabel(l)
	pos := ectx.AllocateLabelPosition(l)
	pos.Begin, pos.End = nop, nop
	ectx.LabelPositions[l] = pos
	return
}

// allocateInstr allocates an instruction.
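// The instruction is taken from the executable context's instruction pool; if register allocation
// has not started yet, it is flagged with addedBeforeRegAlloc, which distinguishes it from
// instructions allocated after register allocation has begun.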
func (m *machine) allocateInstr() *instruction {
	instr := m.executableContext.InstructionPool.Allocate()
	if !m.regAllocStarted {
		instr.addedBeforeRegAlloc = true
	}
	return instr
}

func resetInstruction(i *instruction) {
	*i = instruction{}
}

func (m *machine) allocateNop() *instruction {
	instr := m.allocateInstr()
	instr.asNop0()
	return instr
}

func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) {
	amode := &i.amode
	switch amode.kind {
	case addressModeKindResultStackSpace:
		amode.imm += ret0offset
	case addressModeKindArgStackSpace:
		amode.imm += arg0offset
	default:
		panic("BUG")
	}

	var sizeInBits byte
	switch i.kind {
	case store8, uLoad8:
		sizeInBits = 8
	case store16, uLoad16:
		sizeInBits = 16
	case store32, fpuStore32, uLoad32, fpuLoad32:
		sizeInBits = 32
	case store64, fpuStore64, uLoad64, fpuLoad64:
		sizeInBits = 64
	case fpuStore128, fpuLoad128:
		sizeInBits = 128
	default:
		panic("BUG")
	}

	if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) {
		amode.kind = addressModeKindRegUnsignedImm12
	} else {
		// In this case, we load the offset into the temporary register,
		// and then use it as the index register.
		newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm)
		linkInstr(newPrev, i)
		*amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */}
	}
}

// resolveRelativeAddresses resolves the relative addresses before encoding.
func (m *machine) resolveRelativeAddresses(ctx context.Context) {
	ectx := m.executableContext
	for {
		if len(m.unresolvedAddressModes) > 0 {
			arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP()
			for _, i := range m.unresolvedAddressModes {
				m.resolveAddressingMode(arg0offset, ret0offset, i)
			}
		}

		// Reuse the slice to gather the unresolved conditional branches.
		m.condBrRelocs = m.condBrRelocs[:0]

		var fn string
		var fnIndex int
		var labelToSSABlockID map[label]ssa.BasicBlockID
		if wazevoapi.PerfMapEnabled {
			fn = wazevoapi.GetCurrentFunctionName(ctx)
			labelToSSABlockID = make(map[label]ssa.BasicBlockID)
			for i, l := range ectx.SsaBlockIDToLabels {
				labelToSSABlockID[l] = ssa.BasicBlockID(i)
			}
			fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx)
		}

		// Next, in order to determine the offsets of relative jumps, we have to calculate the size of the code at each label.
		var offset int64
		for i, pos := range ectx.OrderedBlockLabels {
			pos.BinaryOffset = offset
			var size int64
			for cur := pos.Begin; ; cur = cur.next {
				switch cur.kind {
				case nop0:
					l := cur.nop0Label()
					if pos, ok := ectx.LabelPositions[l]; ok {
						pos.BinaryOffset = offset + size
					}
				case condBr:
					if !cur.condBrOffsetResolved() {
						var nextLabel label
						if i < len(ectx.OrderedBlockLabels)-1 {
							// Note: this is only used when the block ends with fallthrough,
							// so it can be safely assumed that the next block exists whenever it is needed.
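							// nextLabel is the fallthrough target: if this conditional branch later turns out
							// to be out of range and the block falls through, insertConditionalJumpTrampoline
							// emits a branch to nextLabel so the normal path jumps over the inserted trampoline.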
							nextLabel = ectx.OrderedBlockLabels[i+1].L
						}
						m.condBrRelocs = append(m.condBrRelocs, condBrReloc{
							cbr: cur, currentLabelPos: pos, offset: offset + size,
							nextLabel: nextLabel,
						})
					}
				}
				size += cur.size()
				if cur == pos.End {
					break
				}
			}

			if wazevoapi.PerfMapEnabled {
				if size > 0 {
					l := pos.L
					var labelStr string
					if blkID, ok := labelToSSABlockID[l]; ok {
						labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID)
					} else {
						labelStr = l.String()
					}
					wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr))
				}
			}
			offset += size
		}

		// Before resolving any offsets, we need to check if all the conditional branches can be resolved.
		var needRerun bool
		for i := range m.condBrRelocs {
			reloc := &m.condBrRelocs[i]
			cbr := reloc.cbr
			offset := reloc.offset

			target := cbr.condBrLabel()
			offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
			diff := offsetOfTarget - offset
			if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
				// In this case, the target of the conditional branch is too far away. We place the trampoline
				// instructions at the end of the current block, and jump to them instead.
				m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel)
				// Then we need to rerun the offset calculation above, since the label offsets
				// change once the trampoline is inserted.
				needRerun = true
			}
		}
		if needRerun {
			if wazevoapi.PerfMapEnabled {
				wazevoapi.PerfMap.Clear()
			}
		} else {
			break
		}
	}

	var currentOffset int64
	for cur := ectx.RootInstr; cur != nil; cur = cur.next {
		switch cur.kind {
		case br:
			target := cur.brLabel()
			offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
			diff := offsetOfTarget - currentOffset
			divided := diff >> 2
			if divided < minSignedInt26 || divided > maxSignedInt26 {
				// This means the function currently being compiled is extremely large.
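				// An unconditional B instruction on arm64 encodes a 26-bit signed offset in units of
				// 4-byte instructions (roughly +/-128MiB), and no trampoline is emitted for it here,
				// so such a function cannot be compiled.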
				panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range")
			}
			cur.brOffsetResolve(diff)
		case condBr:
			if !cur.condBrOffsetResolved() {
				target := cur.condBrLabel()
				offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
				diff := offsetOfTarget - currentOffset
				if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
					panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly")
				}
				cur.condBrOffsetResolve(diff)
			}
		case brTableSequence:
			tableIndex := cur.u1
			targets := m.jmpTableTargets[tableIndex]
			for i := range targets {
				l := label(targets[i])
				offsetOfTarget := ectx.LabelPositions[l].BinaryOffset
				diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin)
				targets[i] = uint32(diff)
			}
			cur.brTableSequenceOffsetsResolved()
		case emitSourceOffsetInfo:
			m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo())
		}
		currentOffset += cur.size()
	}
}

const (
	maxSignedInt26 = 1<<25 - 1
	minSignedInt26 = -(1 << 25)

	maxSignedInt19 = 1<<18 - 1
	minSignedInt19 = -(1 << 18)
)

func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) {
	cur := currentBlk.End
	originalTarget := cbr.condBrLabel()
	endNext := cur.next

	if cur.kind != br {
		// If the current block ends with an unconditional branch, we can simply place the trampoline
		// after it. Otherwise the block falls through, so we insert a "skip" branch to the next block
		// that jumps over the trampoline instructions.
		skip := m.allocateInstr()
		skip.asBr(nextLabel)
		cur = linkInstr(cur, skip)
	}

	cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget()
	cbr.setCondBrTargets(cbrNewTargetLabel)
	cur = linkInstr(cur, cbrNewTargetInstr)

	// Then insert the unconditional branch to the original target, which should always be encodable
	// since the 26-bit offset is enough for any practical function.
	br := m.allocateInstr()
	br.asBr(originalTarget)
	cur = linkInstr(cur, br)

	// Update the end of the current block.
	currentBlk.End = cur

	linkInstr(cur, endNext)
}

// Format implements backend.Machine.
func (m *machine) Format() string {
	ectx := m.executableContext
	begins := map[*instruction]label{}
	for l, pos := range ectx.LabelPositions {
		begins[pos.Begin] = l
	}

	irBlocks := map[label]ssa.BasicBlockID{}
	for i, l := range ectx.SsaBlockIDToLabels {
		irBlocks[l] = ssa.BasicBlockID(i)
	}

	var lines []string
	for cur := ectx.RootInstr; cur != nil; cur = cur.next {
		if l, ok := begins[cur]; ok {
			var labelStr string
			if blkID, ok := irBlocks[l]; ok {
				labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID)
			} else {
				labelStr = fmt.Sprintf("%s:", l)
			}
			lines = append(lines, labelStr)
		}
		if cur.kind == nop0 {
			continue
		}
		lines = append(lines, "\t"+cur.String())
	}
	return "\n" + strings.Join(lines, "\n") + "\n"
}

// InsertReturn implements backend.Machine.
func (m *machine) InsertReturn() {
	i := m.allocateInstr()
	i.asRet()
	m.insert(i)
}

func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 {
	offset, ok := m.spillSlots[id]
	if !ok {
		offset = m.spillSlotSize
		// TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible.
		m.spillSlots[id] = offset
		m.spillSlotSize += int64(size)
	}
	return offset + 16 // spill slot starts above the clobbered registers and the frame size.
}

func (m *machine) clobberedRegSlotSize() int64 {
	return int64(len(m.clobberedRegs) * 16)
}

func (m *machine) arg0OffsetFromSP() int64 {
	return m.frameSize() +
		16 + // 16-byte aligned return address
		16 // frame size saved below the clobbered registers.
}

func (m *machine) ret0OffsetFromSP() int64 {
	return m.arg0OffsetFromSP() + m.currentABI.ArgStackSize
}

func (m *machine) requiredStackSize() int64 {
	return m.maxRequiredStackSizeForCalls +
		m.frameSize() +
		16 + // 16-byte aligned return address.
		16 // frame size saved below the clobbered registers.
}

func (m *machine) frameSize() int64 {
	s := m.clobberedRegSlotSize() + m.spillSlotSize
	if s&0xf != 0 {
		panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s))
	}
	return s
}

func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) {
	// TODO: reuse the slice!
	labels := make([]uint32, len(targets))
	for j, target := range targets {
		labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target))
	}
	index = len(m.jmpTableTargets)
	m.jmpTableTargets = append(m.jmpTableTargets, labels)
	return
}
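// Frame-layout arithmetic, for illustration only (the numbers below are hypothetical and not taken
// from any particular module): suppose register allocation ends up clobbering two callee-saved
// registers and requesting 40 bytes of spill space. Then, per the functions above:
//
//	clobberedRegSlotSize() = 2 * 16          = 32
//	spillSlotSize          = (40 + 15) &^ 15 = 48  (aligned to 16 bytes in RegAlloc)
//	frameSize()            = 32 + 48         = 80
//	arg0OffsetFromSP()     = 80 + 16 + 16    = 112
//	ret0OffsetFromSP()     = 112 + currentABI.ArgStackSize
//	requiredStackSize()    = maxRequiredStackSizeForCalls + 80 + 16 + 16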