github.com/AR1011/wazero@v1.0.5/internal/engine/wazevo/backend/isa/arm64/machine.go (about) 1 package arm64 2 3 import ( 4 "context" 5 "fmt" 6 "math" 7 "strings" 8 9 "github.com/AR1011/wazero/internal/engine/wazevo/backend" 10 "github.com/AR1011/wazero/internal/engine/wazevo/backend/regalloc" 11 "github.com/AR1011/wazero/internal/engine/wazevo/ssa" 12 "github.com/AR1011/wazero/internal/engine/wazevo/wazevoapi" 13 ) 14 15 type ( 16 // machine implements backend.Machine. 17 machine struct { 18 compiler backend.Compiler 19 currentABI *abiImpl 20 currentSSABlk ssa.BasicBlock 21 // abis maps ssa.SignatureID to the ABI implementation. 22 abis []abiImpl 23 instrPool wazevoapi.Pool[instruction] 24 // rootInstr is the root instruction of the currently-compiled function. 25 rootInstr *instruction 26 // perBlockHead and perBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock. 27 perBlockHead, perBlockEnd *instruction 28 // pendingInstructions are the instructions which are not yet emitted into the instruction list. 29 pendingInstructions []*instruction 30 regAllocFn regAllocFunctionImpl 31 nextLabel label 32 33 // ssaBlockIDToLabels maps an SSA block ID to the label. 34 ssaBlockIDToLabels []label 35 // labelToInstructions maps a label to the instructions of the region which the label represents. 36 labelPositions map[label]*labelPosition 37 orderedBlockLabels []*labelPosition 38 labelPositionPool wazevoapi.Pool[labelPosition] 39 40 // addendsWorkQueue is used during address lowering, defined here for reuse. 41 addendsWorkQueue queue[ssa.Value] 42 addends32 queue[addend32] 43 // addends64 is used during address lowering, defined here for reuse. 44 addends64 queue[regalloc.VReg] 45 unresolvedAddressModes []*instruction 46 47 // condBrRelocs holds the conditional branches which need offset relocation. 48 condBrRelocs []condBrReloc 49 50 // spillSlotSize is the size of the stack slot in bytes used for spilling registers. 51 // During the execution of the function, the stack looks like: 52 // 53 // 54 // (high address) 55 // +-----------------+ 56 // | ....... | 57 // | ret Y | 58 // | ....... | 59 // | ret 0 | 60 // | arg X | 61 // | ....... | 62 // | arg 1 | 63 // | arg 0 | 64 // | xxxxx | 65 // | ReturnAddress | 66 // +-----------------+ <<-| 67 // | ........... | | 68 // | spill slot M | | <--- spillSlotSize 69 // | ............ | | 70 // | spill slot 2 | | 71 // | spill slot 1 | <<-+ 72 // | clobbered N | 73 // | ........... | 74 // | clobbered 1 | 75 // | clobbered 0 | 76 // SP---> +-----------------+ 77 // (low address) 78 // 79 // and it represents the size of the space between FP and the first spilled slot. This must be a multiple of 16. 80 // Also note that this is only known after register allocation. 81 spillSlotSize int64 82 spillSlots map[regalloc.VRegID]int64 // regalloc.VRegID to offset. 83 // clobberedRegs holds real-register backed VRegs saved at the function prologue, and restored at the epilogue. 84 clobberedRegs []regalloc.VReg 85 86 maxRequiredStackSizeForCalls int64 87 stackBoundsCheckDisabled bool 88 89 regAllocStarted bool 90 } 91 92 addend32 struct { 93 r regalloc.VReg 94 ext extendOp 95 } 96 97 // label represents a position in the generated code which is either 98 // a real instruction or the constant pool (e.g. jump tables). 99 // 100 // This is exactly the same as the traditional "label" in assembly code. 101 label uint32 102 103 // labelPosition represents the regions of the generated code which the label represents. 104 labelPosition struct { 105 l label 106 begin, end *instruction 107 binarySize int64 108 binaryOffset int64 109 } 110 111 condBrReloc struct { 112 cbr *instruction 113 // currentLabelPos is the labelPosition within which condBr is defined. 114 currentLabelPos *labelPosition 115 // Next block's labelPosition. 116 nextLabel label 117 offset int64 118 } 119 ) 120 121 const ( 122 invalidLabel = 0 123 returnLabel = math.MaxUint32 124 ) 125 126 // NewBackend returns a new backend for arm64. 127 func NewBackend() backend.Machine { 128 m := &machine{ 129 instrPool: wazevoapi.NewPool[instruction](resetInstruction), 130 labelPositionPool: wazevoapi.NewPool[labelPosition](resetLabelPosition), 131 labelPositions: make(map[label]*labelPosition), 132 spillSlots: make(map[regalloc.VRegID]int64), 133 nextLabel: invalidLabel, 134 } 135 m.regAllocFn.m = m 136 m.regAllocFn.labelToRegAllocBlockIndex = make(map[label]int) 137 return m 138 } 139 140 // Reset implements backend.Machine. 141 func (m *machine) Reset() { 142 m.regAllocStarted = false 143 m.instrPool.Reset() 144 m.labelPositionPool.Reset() 145 m.currentSSABlk = nil 146 for l := label(0); l <= m.nextLabel; l++ { 147 delete(m.labelPositions, l) 148 } 149 m.pendingInstructions = m.pendingInstructions[:0] 150 m.clobberedRegs = m.clobberedRegs[:0] 151 for key := range m.spillSlots { 152 m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key)) 153 } 154 for _, key := range m.clobberedRegs { 155 delete(m.spillSlots, regalloc.VRegID(key)) 156 } 157 m.clobberedRegs = m.clobberedRegs[:0] 158 m.orderedBlockLabels = m.orderedBlockLabels[:0] 159 m.regAllocFn.reset() 160 m.spillSlotSize = 0 161 m.unresolvedAddressModes = m.unresolvedAddressModes[:0] 162 m.rootInstr = nil 163 m.ssaBlockIDToLabels = m.ssaBlockIDToLabels[:0] 164 m.perBlockHead, m.perBlockEnd = nil, nil 165 m.maxRequiredStackSizeForCalls = 0 166 m.nextLabel = invalidLabel 167 } 168 169 // InitializeABI implements backend.Machine InitializeABI. 170 func (m *machine) InitializeABI(sig *ssa.Signature) { 171 m.currentABI = m.getOrCreateABIImpl(sig) 172 } 173 174 // DisableStackCheck implements backend.Machine DisableStackCheck. 175 func (m *machine) DisableStackCheck() { 176 m.stackBoundsCheckDisabled = true 177 } 178 179 // ABI implements backend.Machine. 180 func (m *machine) ABI() backend.FunctionABI { 181 return m.currentABI 182 } 183 184 // allocateLabel allocates an unused label. 185 func (m *machine) allocateLabel() label { 186 m.nextLabel++ 187 return m.nextLabel 188 } 189 190 // SetCompiler implements backend.Machine. 191 func (m *machine) SetCompiler(ctx backend.Compiler) { 192 m.compiler = ctx 193 } 194 195 // StartLoweringFunction implements backend.Machine. 196 func (m *machine) StartLoweringFunction(max ssa.BasicBlockID) { 197 imax := int(max) 198 if len(m.ssaBlockIDToLabels) <= imax { 199 // Eagerly allocate labels for the blocks since the underlying slice will be used for the next iteration. 200 m.ssaBlockIDToLabels = append(m.ssaBlockIDToLabels, make([]label, imax+1)...) 201 } 202 } 203 204 // EndLoweringFunction implements backend.Machine. 205 func (m *machine) EndLoweringFunction() {} 206 207 // StartBlock implements backend.Machine. 208 func (m *machine) StartBlock(blk ssa.BasicBlock) { 209 m.currentSSABlk = blk 210 211 l := m.ssaBlockIDToLabels[m.currentSSABlk.ID()] 212 if l == invalidLabel { 213 l = m.allocateLabel() 214 m.ssaBlockIDToLabels[blk.ID()] = l 215 } 216 217 end := m.allocateNop() 218 m.perBlockHead, m.perBlockEnd = end, end 219 220 labelPos, ok := m.labelPositions[l] 221 if !ok { 222 labelPos = m.allocateLabelPosition(l) 223 m.labelPositions[l] = labelPos 224 } 225 m.orderedBlockLabels = append(m.orderedBlockLabels, labelPos) 226 labelPos.begin, labelPos.end = end, end 227 m.regAllocFn.addBlock(blk, l, labelPos) 228 } 229 230 // EndBlock implements backend.Machine. 231 func (m *machine) EndBlock() { 232 // Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions. 233 m.insertAtPerBlockHead(m.allocateNop()) 234 235 l := m.ssaBlockIDToLabels[m.currentSSABlk.ID()] 236 m.labelPositions[l].begin = m.perBlockHead 237 238 if m.currentSSABlk.EntryBlock() { 239 m.rootInstr = m.perBlockHead 240 } 241 } 242 243 func (m *machine) insert(i *instruction) { 244 m.pendingInstructions = append(m.pendingInstructions, i) 245 } 246 247 func (m *machine) insertBrTargetLabel() label { 248 nop, l := m.allocateBrTarget() 249 m.insert(nop) 250 return l 251 } 252 253 func (m *machine) allocateBrTarget() (nop *instruction, l label) { 254 l = m.allocateLabel() 255 nop = m.allocateInstr() 256 nop.asNop0WithLabel(l) 257 pos := m.allocateLabelPosition(l) 258 pos.begin, pos.end = nop, nop 259 m.labelPositions[l] = pos 260 return 261 } 262 263 func (m *machine) allocateLabelPosition(la label) *labelPosition { 264 l := m.labelPositionPool.Allocate() 265 l.l = la 266 return l 267 } 268 269 func resetLabelPosition(l *labelPosition) { 270 *l = labelPosition{} 271 } 272 273 // FlushPendingInstructions implements backend.Machine. 274 func (m *machine) FlushPendingInstructions() { 275 l := len(m.pendingInstructions) 276 if l == 0 { 277 return 278 } 279 for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order. 280 m.insertAtPerBlockHead(m.pendingInstructions[i]) 281 } 282 m.pendingInstructions = m.pendingInstructions[:0] 283 } 284 285 func (m *machine) insertAtPerBlockHead(i *instruction) { 286 if m.perBlockHead == nil { 287 m.perBlockHead = i 288 m.perBlockEnd = i 289 return 290 } 291 i.next = m.perBlockHead 292 m.perBlockHead.prev = i 293 m.perBlockHead = i 294 } 295 296 // String implements backend.Machine. 297 func (l label) String() string { 298 return fmt.Sprintf("L%d", l) 299 } 300 301 // allocateInstr allocates an instruction. 302 func (m *machine) allocateInstr() *instruction { 303 instr := m.instrPool.Allocate() 304 if !m.regAllocStarted { 305 instr.addedBeforeRegAlloc = true 306 } 307 return instr 308 } 309 310 func resetInstruction(i *instruction) { 311 *i = instruction{} 312 } 313 314 func (m *machine) allocateNop() *instruction { 315 instr := m.allocateInstr() 316 instr.asNop0() 317 return instr 318 } 319 320 func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) { 321 amode := &i.amode 322 switch amode.kind { 323 case addressModeKindResultStackSpace: 324 amode.imm += ret0offset 325 case addressModeKindArgStackSpace: 326 amode.imm += arg0offset 327 default: 328 panic("BUG") 329 } 330 331 var sizeInBits byte 332 switch i.kind { 333 case store8, uLoad8: 334 sizeInBits = 8 335 case store16, uLoad16: 336 sizeInBits = 16 337 case store32, fpuStore32, uLoad32, fpuLoad32: 338 sizeInBits = 32 339 case store64, fpuStore64, uLoad64, fpuLoad64: 340 sizeInBits = 64 341 case fpuStore128, fpuLoad128: 342 sizeInBits = 128 343 default: 344 panic("BUG") 345 } 346 347 if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) { 348 amode.kind = addressModeKindRegUnsignedImm12 349 } else { 350 // This case, we load the offset into the temporary register, 351 // and then use it as the index register. 352 newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm) 353 linkInstr(newPrev, i) 354 *amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */} 355 } 356 } 357 358 // ResolveRelativeAddresses implements backend.Machine. 359 func (m *machine) ResolveRelativeAddresses(ctx context.Context) { 360 if len(m.unresolvedAddressModes) > 0 { 361 arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP() 362 for _, i := range m.unresolvedAddressModes { 363 m.resolveAddressingMode(arg0offset, ret0offset, i) 364 } 365 } 366 367 // Reuse the slice to gather the unresolved conditional branches. 368 cbrs := m.condBrRelocs[:0] 369 370 var fn string 371 var fnIndex int 372 var labelToSSABlockID map[label]ssa.BasicBlockID 373 if wazevoapi.PerfMapEnabled { 374 fn = wazevoapi.GetCurrentFunctionName(ctx) 375 labelToSSABlockID = make(map[label]ssa.BasicBlockID) 376 for i, l := range m.ssaBlockIDToLabels { 377 labelToSSABlockID[l] = ssa.BasicBlockID(i) 378 } 379 fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) 380 } 381 382 // Next, in order to determine the offsets of relative jumps, we have to calculate the size of each label. 383 var offset int64 384 for i, pos := range m.orderedBlockLabels { 385 pos.binaryOffset = offset 386 var size int64 387 for cur := pos.begin; ; cur = cur.next { 388 switch cur.kind { 389 case nop0: 390 l := cur.nop0Label() 391 if pos, ok := m.labelPositions[l]; ok { 392 pos.binaryOffset = offset + size 393 } 394 case condBr: 395 if !cur.condBrOffsetResolved() { 396 var nextLabel label 397 if i < len(m.orderedBlockLabels)-1 { 398 // Note: this is only used when the block ends with fallthrough, 399 // therefore can be safely assumed that the next block exists when it's needed. 400 nextLabel = m.orderedBlockLabels[i+1].l 401 } 402 cbrs = append(cbrs, condBrReloc{ 403 cbr: cur, currentLabelPos: pos, offset: offset + size, 404 nextLabel: nextLabel, 405 }) 406 } 407 } 408 size += cur.size() 409 if cur == pos.end { 410 break 411 } 412 } 413 414 if wazevoapi.PerfMapEnabled { 415 if size > 0 { 416 l := pos.l 417 var labelStr string 418 if blkID, ok := labelToSSABlockID[l]; ok { 419 labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID) 420 } else { 421 labelStr = l.String() 422 } 423 wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr)) 424 } 425 } 426 427 pos.binarySize = size 428 offset += size 429 } 430 431 // Before resolving any offsets, we need to check if all the conditional branches can be resolved. 432 var needRerun bool 433 for i := range cbrs { 434 reloc := &cbrs[i] 435 cbr := reloc.cbr 436 offset := reloc.offset 437 438 target := cbr.condBrLabel() 439 offsetOfTarget := m.labelPositions[target].binaryOffset 440 diff := offsetOfTarget - offset 441 if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { 442 // This case the conditional branch is too huge. We place the trampoline instructions at the end of the current block, 443 // and jump to it. 444 m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel) 445 // Then, we need to recall this function to fix up the label offsets 446 // as they have changed after the trampoline is inserted. 447 needRerun = true 448 } 449 } 450 if needRerun { 451 m.ResolveRelativeAddresses(ctx) 452 if wazevoapi.PerfMapEnabled { 453 wazevoapi.PerfMap.Clear() 454 } 455 return 456 } 457 458 var currentOffset int64 459 for cur := m.rootInstr; cur != nil; cur = cur.next { 460 switch cur.kind { 461 case br: 462 target := cur.brLabel() 463 offsetOfTarget := m.labelPositions[target].binaryOffset 464 diff := offsetOfTarget - currentOffset 465 divided := diff >> 2 466 if divided < minSignedInt26 || divided > maxSignedInt26 { 467 // This means the currently compiled single function is extremely large. 468 panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range") 469 } 470 cur.brOffsetResolve(diff) 471 case condBr: 472 if !cur.condBrOffsetResolved() { 473 target := cur.condBrLabel() 474 offsetOfTarget := m.labelPositions[target].binaryOffset 475 diff := offsetOfTarget - currentOffset 476 if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { 477 panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly") 478 } 479 cur.condBrOffsetResolve(diff) 480 } 481 case brTableSequence: 482 for i := range cur.targets { 483 l := label(cur.targets[i]) 484 offsetOfTarget := m.labelPositions[l].binaryOffset 485 diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin) 486 cur.targets[i] = uint32(diff) 487 } 488 cur.brTableSequenceOffsetsResolved() 489 case emitSourceOffsetInfo: 490 m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo()) 491 } 492 currentOffset += cur.size() 493 } 494 } 495 496 const ( 497 maxSignedInt26 int64 = 1<<25 - 1 498 minSignedInt26 int64 = -(1 << 25) 499 500 maxSignedInt19 int64 = 1<<19 - 1 501 minSignedInt19 int64 = -(1 << 19) 502 ) 503 504 func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) { 505 cur := currentBlk.end 506 originalTarget := cbr.condBrLabel() 507 endNext := cur.next 508 509 if cur.kind != br { 510 // If the current block ends with a conditional branch, we can just insert the trampoline after it. 511 // Otherwise, we need to insert "skip" instruction to skip the trampoline instructions. 512 skip := m.allocateInstr() 513 skip.asBr(nextLabel) 514 cur = linkInstr(cur, skip) 515 } 516 517 cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget() 518 cbr.setCondBrTargets(cbrNewTargetLabel) 519 cur = linkInstr(cur, cbrNewTargetInstr) 520 521 // Then insert the unconditional branch to the original, which should be possible to get encoded 522 // as 26-bit offset should be enough for any practical application. 523 br := m.allocateInstr() 524 br.asBr(originalTarget) 525 cur = linkInstr(cur, br) 526 527 // Update the end of the current block. 528 currentBlk.end = cur 529 530 linkInstr(cur, endNext) 531 } 532 533 func (m *machine) getOrAllocateSSABlockLabel(blk ssa.BasicBlock) label { 534 if blk.ReturnBlock() { 535 return returnLabel 536 } 537 l := m.ssaBlockIDToLabels[blk.ID()] 538 if l == invalidLabel { 539 l = m.allocateLabel() 540 m.ssaBlockIDToLabels[blk.ID()] = l 541 } 542 return l 543 } 544 545 // LinkAdjacentBlocks implements backend.Machine. 546 func (m *machine) LinkAdjacentBlocks(prev, next ssa.BasicBlock) { 547 prevLabelPos := m.labelPositions[m.getOrAllocateSSABlockLabel(prev)] 548 nextLabelPos := m.labelPositions[m.getOrAllocateSSABlockLabel(next)] 549 prevLabelPos.end.next = nextLabelPos.begin 550 } 551 552 // Format implements backend.Machine. 553 func (m *machine) Format() string { 554 begins := map[*instruction]label{} 555 for l, pos := range m.labelPositions { 556 begins[pos.begin] = l 557 } 558 559 irBlocks := map[label]ssa.BasicBlockID{} 560 for i, l := range m.ssaBlockIDToLabels { 561 irBlocks[l] = ssa.BasicBlockID(i) 562 } 563 564 var lines []string 565 for cur := m.rootInstr; cur != nil; cur = cur.next { 566 if l, ok := begins[cur]; ok { 567 var labelStr string 568 if blkID, ok := irBlocks[l]; ok { 569 labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID) 570 } else { 571 labelStr = fmt.Sprintf("%s:", l) 572 } 573 lines = append(lines, labelStr) 574 } 575 if cur.kind == nop0 { 576 continue 577 } 578 lines = append(lines, "\t"+cur.String()) 579 } 580 return "\n" + strings.Join(lines, "\n") + "\n" 581 } 582 583 // InsertReturn implements backend.Machine. 584 func (m *machine) InsertReturn() { 585 i := m.allocateInstr() 586 i.asRet(m.currentABI) 587 m.insert(i) 588 } 589 590 func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { 591 offset, ok := m.spillSlots[id] 592 if !ok { 593 offset = m.spillSlotSize 594 // TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible. 595 m.spillSlots[id] = offset 596 m.spillSlotSize += int64(size) 597 } 598 return offset + 16 // spill slot starts above the clobbered registers and the frame size. 599 } 600 601 func (m *machine) clobberedRegSlotSize() int64 { 602 return int64(len(m.clobberedRegs) * 16) 603 } 604 605 func (m *machine) arg0OffsetFromSP() int64 { 606 return m.frameSize() + 607 16 + // 16-byte aligned return address 608 16 // frame size saved below the clobbered registers. 609 } 610 611 func (m *machine) ret0OffsetFromSP() int64 { 612 return m.arg0OffsetFromSP() + m.currentABI.argStackSize 613 } 614 615 func (m *machine) requiredStackSize() int64 { 616 return m.maxRequiredStackSizeForCalls + 617 m.frameSize() + 618 16 + // 16-byte aligned return address. 619 16 // frame size saved below the clobbered registers. 620 } 621 622 func (m *machine) frameSize() int64 { 623 s := m.clobberedRegSlotSize() + m.spillSlotSize 624 if s&0xf != 0 { 625 panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s)) 626 } 627 return s 628 }