github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/engine/wazevo/backend/isa/arm64/machine.go

package arm64

import (
	"context"
	"fmt"
	"strings"

	"github.com/bananabytelabs/wazero/internal/engine/wazevo/backend"
	"github.com/bananabytelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/bananabytelabs/wazero/internal/engine/wazevo/ssa"
	"github.com/bananabytelabs/wazero/internal/engine/wazevo/wazevoapi"
)

type (
	// machine implements backend.Machine.
	machine struct {
		compiler          backend.Compiler
		executableContext *backend.ExecutableContextT[instruction]
		currentABI        *abiImpl
		// abis maps ssa.SignatureID to the ABI implementation.
		abis []abiImpl

		regAllocFn regAllocFunctionImpl

		// addendsWorkQueue is used during address lowering, defined here for reuse.
		addendsWorkQueue queue[ssa.Value]
		addends32        queue[addend32]
		// addends64 is used during address lowering, defined here for reuse.
		addends64              queue[regalloc.VReg]
		unresolvedAddressModes []*instruction

		// condBrRelocs holds the conditional branches which need offset relocation.
		condBrRelocs []condBrReloc

		// spillSlotSize is the total size in bytes of the stack region used for spilling registers.
		// During the execution of the function, the stack looks like:
		//
		//            (high address)
		//          +-----------------+
		//          |     .......     |
		//          |      ret Y      |
		//          |     .......     |
		//          |      ret 0      |
		//          |      arg X      |
		//          |     .......     |
		//          |      arg 1      |
		//          |      arg 0      |
		//          |      xxxxx      |
		//          |   ReturnAddress |
		//          +-----------------+   <<-|
		//          |   ...........   |      |
		//          |   spill slot M  |      | <--- spillSlotSize
		//          |   ............  |      |
		//          |   spill slot 2  |      |
		//          |   spill slot 1  |   <<-+
		//          |   clobbered N   |
		//          |   ...........   |
		//          |   clobbered 1   |
		//          |   clobbered 0   |
		//   SP---> +-----------------+
		//             (low address)
		//
		// and it represents the size of the space between FP and the first spilled slot. This must be a multiple of 16.
		// Also note that this is only known after register allocation.
		spillSlotSize int64
		spillSlots    map[regalloc.VRegID]int64 // regalloc.VRegID to offset.
		// clobberedRegs holds the real-register-backed VRegs saved at the function prologue and restored at the epilogue.
		clobberedRegs []regalloc.VReg

		maxRequiredStackSizeForCalls int64
		stackBoundsCheckDisabled     bool

		regAllocStarted bool
	}

	addend32 struct {
		r   regalloc.VReg
		ext extendOp
	}

	condBrReloc struct {
		cbr *instruction
		// currentLabelPos is the labelPosition within which cbr is defined.
		currentLabelPos *labelPosition
		// nextLabel is the label of the next block in layout order, used as the fallthrough target.
		nextLabel label
		offset    int64
	}

	labelPosition = backend.LabelPosition[instruction]
	label         = backend.Label
)

const (
	labelReturn  = backend.LabelReturn
	labelInvalid = backend.LabelInvalid
)

// NewBackend returns a new backend for arm64.
func NewBackend() backend.Machine {
	m := &machine{
		spillSlots:        make(map[regalloc.VRegID]int64),
		executableContext: newExecutableContext(),
	}
	m.regAllocFn.m = m
	m.regAllocFn.labelToRegAllocBlockIndex = make(map[label]int)
	return m
}
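
// As a rough wiring sketch (hypothetical; not lifted from this repository's
// tests), a caller drives the machine like so, where compiler is a
// backend.Compiler and sig is the *ssa.Signature of the function being compiled:
//
//	m := NewBackend()
//	m.SetCompiler(compiler)
//	m.InitializeABI(sig)
//	// ... lower the SSA, run register allocation, resolve addresses ...
//	m.Reset() // recycle the machine for the next function.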

func newExecutableContext() *backend.ExecutableContextT[instruction] {
	return backend.NewExecutableContextT[instruction](resetInstruction, setNext, setPrev, asNop0)
}

// ExecutableContext implements backend.Machine.
func (m *machine) ExecutableContext() backend.ExecutableContext {
	return m.executableContext
}

// Reset implements backend.Machine.
func (m *machine) Reset() {
	m.regAllocStarted = false
	// Drain m.spillSlots while keeping the map itself for reuse, temporarily
	// borrowing clobberedRegs as scratch space for the keys. (On Go 1.21+ this
	// is equivalent to clear(m.spillSlots).)
	m.clobberedRegs = m.clobberedRegs[:0]
	for key := range m.spillSlots {
		m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key))
	}
	for _, key := range m.clobberedRegs {
		delete(m.spillSlots, regalloc.VRegID(key))
	}
	m.clobberedRegs = m.clobberedRegs[:0]
	m.regAllocFn.reset()
	m.spillSlotSize = 0
	m.unresolvedAddressModes = m.unresolvedAddressModes[:0]
	m.maxRequiredStackSizeForCalls = 0
	m.executableContext.Reset()
}

// InitializeABI implements backend.Machine InitializeABI.
func (m *machine) InitializeABI(sig *ssa.Signature) {
	m.currentABI = m.getOrCreateABIImpl(sig)
}

// DisableStackCheck implements backend.Machine DisableStackCheck.
func (m *machine) DisableStackCheck() {
	m.stackBoundsCheckDisabled = true
}

// ABI implements backend.Machine.
func (m *machine) ABI() backend.FunctionABI {
	return m.currentABI
}

// SetCompiler implements backend.Machine.
func (m *machine) SetCompiler(ctx backend.Compiler) {
	m.compiler = ctx
}

// StartBlock implements backend.Machine.
func (m *machine) StartBlock(blk ssa.BasicBlock) {
	l, pos := m.executableContext.StartBlock(blk)
	m.regAllocFn.addBlock(blk, l, pos)
}

func (m *machine) insert(i *instruction) {
	ectx := m.executableContext
	ectx.PendingInstructions = append(ectx.PendingInstructions, i)
}

func (m *machine) insertBrTargetLabel() label {
	nop, l := m.allocateBrTarget()
	m.insert(nop)
	return l
}

func (m *machine) allocateBrTarget() (nop *instruction, l label) {
	ectx := m.executableContext
	l = ectx.AllocateLabel()
	nop = m.allocateInstr()
	nop.asNop0WithLabel(l)
	pos := ectx.AllocateLabelPosition(l)
	pos.Begin, pos.End = nop, nop
	ectx.LabelPositions[l] = pos
	return
}

// allocateInstr allocates an instruction.
func (m *machine) allocateInstr() *instruction {
	instr := m.executableContext.InstructionPool.Allocate()
	if !m.regAllocStarted {
		instr.addedBeforeRegAlloc = true
	}
	return instr
}

func resetInstruction(i *instruction) {
	*i = instruction{}
}

func (m *machine) allocateNop() *instruction {
	instr := m.allocateInstr()
	instr.asNop0()
	return instr
}

func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) {
	amode := &i.amode
	switch amode.kind {
	case addressModeKindResultStackSpace:
		amode.imm += ret0offset
	case addressModeKindArgStackSpace:
		amode.imm += arg0offset
	default:
		panic("BUG")
	}

	var sizeInBits byte
	switch i.kind {
	case store8, uLoad8:
		sizeInBits = 8
	case store16, uLoad16:
		sizeInBits = 16
	case store32, fpuStore32, uLoad32, fpuLoad32:
		sizeInBits = 32
	case store64, fpuStore64, uLoad64, fpuLoad64:
		sizeInBits = 64
	case fpuStore128, fpuLoad128:
		sizeInBits = 128
	default:
		panic("BUG")
	}

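	// An unsigned imm12 offset is scaled by the access size: e.g. a 64-bit
	// load/store can encode byte offsets 0..32760 (4095*8) as long as they are
	// 8-byte aligned. That is the check performed below for the given sizeInBits.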
	if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) {
		amode.kind = addressModeKindRegUnsignedImm12
	} else {
		// In this case, we load the offset into the temporary register and then
		// use it as the index register.
		newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm)
		linkInstr(newPrev, i)
		*amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */}
	}
}

// ResolveRelativeAddresses implements backend.Machine.
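// It proceeds in passes: first it assigns a binary offset to every label, then
// it checks each unresolved conditional branch against the encodable range and
// inserts trampolines for the out-of-range ones (re-running itself when that
// happens), and finally it patches the offsets of all br, condBr, and
// brTableSequence instructions.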
func (m *machine) ResolveRelativeAddresses(ctx context.Context) {
	if len(m.unresolvedAddressModes) > 0 {
		arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP()
		for _, i := range m.unresolvedAddressModes {
			m.resolveAddressingMode(arg0offset, ret0offset, i)
		}
	}

	// Reuse the slice to gather the unresolved conditional branches.
	cbrs := m.condBrRelocs[:0]
	ectx := m.executableContext

	var fn string
	var fnIndex int
	var labelToSSABlockID map[label]ssa.BasicBlockID
	if wazevoapi.PerfMapEnabled {
		fn = wazevoapi.GetCurrentFunctionName(ctx)
		labelToSSABlockID = make(map[label]ssa.BasicBlockID)
		for i, l := range ectx.SsaBlockIDToLabels {
			labelToSSABlockID[l] = ssa.BasicBlockID(i)
		}
		fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx)
	}

	// Next, in order to determine the offsets of relative jumps, we compute the binary offset and size of each label position.
	var offset int64
	for i, pos := range ectx.OrderedBlockLabels {
		pos.BinaryOffset = offset
		var size int64
		for cur := pos.Begin; ; cur = cur.next {
			switch cur.kind {
			case nop0:
				l := cur.nop0Label()
				if pos, ok := ectx.LabelPositions[l]; ok {
					pos.BinaryOffset = offset + size
				}
			case condBr:
				if !cur.condBrOffsetResolved() {
					var nextLabel label
					if i < len(ectx.OrderedBlockLabels)-1 {
						// Note: this is only used when the block ends with fallthrough,
						// so it can safely be assumed that the next block exists whenever it's needed.
						nextLabel = ectx.OrderedBlockLabels[i+1].L
					}
					cbrs = append(cbrs, condBrReloc{
						cbr: cur, currentLabelPos: pos, offset: offset + size,
						nextLabel: nextLabel,
					})
				}
			}
			size += cur.size()
			if cur == pos.End {
				break
			}
		}

		if wazevoapi.PerfMapEnabled {
			if size > 0 {
				l := pos.L
				var labelStr string
				if blkID, ok := labelToSSABlockID[l]; ok {
					labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID)
				} else {
					labelStr = l.String()
				}
				wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr))
			}
		}

		pos.BinarySize = size
		offset += size
	}

	// Before resolving any offsets, check that every unresolved conditional branch can actually reach its target.
	var needRerun bool
	for i := range cbrs {
		reloc := &cbrs[i]
		cbr := reloc.cbr
		offset := reloc.offset

		target := cbr.condBrLabel()
		offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
		diff := offsetOfTarget - offset
		// diff is in bytes; arm64 instructions are 4 bytes each, so >>2 yields the word offset the instruction encodes.
		if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
			// In this case the target is out of the conditional branch's range. We place trampoline
			// instructions at the end of the current block and jump to them instead.
			m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel)
			// Then we need to re-run this function to fix up the label offsets,
			// as they have changed after the trampoline was inserted.
			needRerun = true
		}
	}
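	// Inserting a trampoline shifts every subsequent label, hence the rerun below.
	// In practice this converges quickly: a trampolined branch now targets the end
	// of its own block, which is almost certainly within the 19-bit range.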
	if needRerun {
		m.ResolveRelativeAddresses(ctx)
		if wazevoapi.PerfMapEnabled {
			wazevoapi.PerfMap.Clear()
		}
		return
	}

	var currentOffset int64
	for cur := ectx.RootInstr; cur != nil; cur = cur.next {
		switch cur.kind {
		case br:
			target := cur.brLabel()
			offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
			diff := offsetOfTarget - currentOffset
			divided := diff >> 2
			if divided < minSignedInt26 || divided > maxSignedInt26 {
				// This means the function currently being compiled is extremely large.
				panic("too large function: unconditional branch target exceeds the 26-bit range")
			}
			cur.brOffsetResolve(diff)
		case condBr:
			if !cur.condBrOffsetResolved() {
				target := cur.condBrLabel()
				offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
				diff := offsetOfTarget - currentOffset
				if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
					panic("BUG: a conditional branch out of the 19-bit range must have been handled by a trampoline above")
				}
				cur.condBrOffsetResolve(diff)
			}
		case brTableSequence:
			for i := range cur.targets {
				l := label(cur.targets[i])
				offsetOfTarget := ectx.LabelPositions[l].BinaryOffset
				diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin)
				cur.targets[i] = uint32(diff)
			}
			cur.brTableSequenceOffsetsResolved()
		case emitSourceOffsetInfo:
			m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo())
		}
		currentOffset += cur.size()
	}
}

const (
	// An unconditional branch (b) encodes a 26-bit signed word offset, i.e. ±128MiB of code.
	maxSignedInt26 int64 = 1<<25 - 1
	minSignedInt26 int64 = -(1 << 25)

	// A conditional branch encodes a 19-bit signed word offset, i.e. ±1MiB of code.
	// Note: a 19-bit two's-complement value ranges over [-(1<<18), 1<<18-1].
	maxSignedInt19 int64 = 1<<18 - 1
	minSignedInt19 int64 = -(1 << 18)
)

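// insertConditionalJumpTrampoline handles a conditional branch cbr whose target
// is out of the 19-bit range: cbr is retargeted to a fresh label at the end of
// currentBlk, where an unconditional branch (with its 26-bit range) jumps to
// the original target. The resulting instruction stream looks roughly like:
//
//	cbr <cond>, trampoline   ; retargeted conditional branch
//	...
//	b nextLabel              ; skip; inserted unless the block already ends with b
//	trampoline:
//	b originalTarget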
func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) {
	cur := currentBlk.End
	originalTarget := cbr.condBrLabel()
	endNext := cur.next

	if cur.kind != br {
		// If the current block already ends with an unconditional branch, we can just insert the trampoline after it.
		// Otherwise, we need to insert a "skip" branch so that fallthrough execution jumps over the trampoline instructions.
		skip := m.allocateInstr()
		skip.asBr(nextLabel)
		cur = linkInstr(cur, skip)
	}

	cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget()
	cbr.setCondBrTargets(cbrNewTargetLabel)
	cur = linkInstr(cur, cbrNewTargetInstr)

	// Then insert the unconditional branch to the original target; its 26-bit
	// offset range should be enough for any practical function.
	br := m.allocateInstr()
	br.asBr(originalTarget)
	cur = linkInstr(cur, br)

	// Update the end of the current block.
	currentBlk.End = cur

	linkInstr(cur, endNext)
}

// Format implements backend.Machine.
func (m *machine) Format() string {
	ectx := m.executableContext
	begins := map[*instruction]label{}
	for l, pos := range ectx.LabelPositions {
		begins[pos.Begin] = l
	}

	irBlocks := map[label]ssa.BasicBlockID{}
	for i, l := range ectx.SsaBlockIDToLabels {
		irBlocks[l] = ssa.BasicBlockID(i)
	}

	var lines []string
	for cur := ectx.RootInstr; cur != nil; cur = cur.next {
		if l, ok := begins[cur]; ok {
			var labelStr string
			if blkID, ok := irBlocks[l]; ok {
				labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID)
			} else {
				labelStr = fmt.Sprintf("%s:", l)
			}
			lines = append(lines, labelStr)
		}
		if cur.kind == nop0 {
			// nop0 is only a label placeholder, so omit it from the listing.
			continue
		}
		lines = append(lines, "\t"+cur.String())
	}
	return "\n" + strings.Join(lines, "\n") + "\n"
}

// InsertReturn implements backend.Machine.
func (m *machine) InsertReturn() {
	i := m.allocateInstr()
	i.asRet(m.currentABI)
	m.insert(i)
}

func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 {
	offset, ok := m.spillSlots[id]
	if !ok {
		offset = m.spillSlotSize
		// TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible.
		m.spillSlots[id] = offset
		m.spillSlotSize += int64(size)
	}
	return offset + 16 // spill slot starts above the clobbered registers and the frame size.
}
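
// For example, the first request for an 8-byte slot records offset 0 in
// m.spillSlots, bumps spillSlotSize to 8, and returns 16; a second 8-byte
// request for a different VRegID then returns 24.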

func (m *machine) clobberedRegSlotSize() int64 {
	return int64(len(m.clobberedRegs) * 16)
}

func (m *machine) arg0OffsetFromSP() int64 {
	return m.frameSize() +
		16 + // 16-byte aligned return address
		16 // frame size saved below the clobbered registers.
}

func (m *machine) ret0OffsetFromSP() int64 {
	return m.arg0OffsetFromSP() + m.currentABI.argStackSize
}
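
// As a worked example of the two offsets above: with frameSize() == 32 and an
// ABI whose argStackSize is 16, arg0 lives at SP+64 (32+16+16) and ret0 at SP+80.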

func (m *machine) requiredStackSize() int64 {
	return m.maxRequiredStackSizeForCalls +
		m.frameSize() +
		16 + // 16-byte aligned return address.
		16 // frame size saved below the clobbered registers.
}

func (m *machine) frameSize() int64 {
	s := m.clobberedRegSlotSize() + m.spillSlotSize
	if s&0xf != 0 {
		panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s))
	}
	return s
}