github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/arm64/machine.go

package arm64

import (
	"context"
	"fmt"
	"strings"

	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)

type (
	// machine implements backend.Machine.
	machine struct {
		compiler          backend.Compiler
		executableContext *backend.ExecutableContextT[instruction]
		currentABI        *backend.FunctionABI

		regAlloc   regalloc.Allocator
		regAllocFn *backend.RegAllocFunction[*instruction, *machine]

		// addendsWorkQueue is used during address lowering, defined here for reuse.
		addendsWorkQueue wazevoapi.Queue[ssa.Value]
		addends32        wazevoapi.Queue[addend32]
		// addends64 is used during address lowering, defined here for reuse.
		addends64              wazevoapi.Queue[regalloc.VReg]
		unresolvedAddressModes []*instruction

		// condBrRelocs holds the conditional branches which need offset relocation.
		condBrRelocs []condBrReloc

		// jmpTableTargets holds the labels of the jump table targets.
		jmpTableTargets [][]uint32

		// spillSlotSize is the size of the stack slot in bytes used for spilling registers.
		// During the execution of the function, the stack looks like:
		//
		//
		//            (high address)
		//          +-----------------+
		//          |     .......     |
		//          |      ret Y      |
		//          |     .......     |
		//          |      ret 0      |
		//          |      arg X      |
		//          |     .......     |
		//          |      arg 1      |
		//          |      arg 0      |
		//          |      xxxxx      |
		//          |   ReturnAddress |
		//          +-----------------+   <<-|
		//          |   ...........   |      |
		//          |   spill slot M  |      | <--- spillSlotSize
		//          |   ............  |      |
		//          |   spill slot 2  |      |
		//          |   spill slot 1  |   <<-+
		//          |   clobbered N   |
		//          |   ...........   |
		//          |   clobbered 1   |
		//          |   clobbered 0   |
		//   SP---> +-----------------+
		//             (low address)
		//
		// and it represents the size of the space between FP and the first spilled slot. This must be a multiple of 16.
		// Also note that this is only known after register allocation.
		spillSlotSize int64
		spillSlots    map[regalloc.VRegID]int64 // regalloc.VRegID to offset.
		// clobberedRegs holds real-register backed VRegs saved at the function prologue, and restored at the epilogue.
		clobberedRegs []regalloc.VReg

		maxRequiredStackSizeForCalls int64
		stackBoundsCheckDisabled     bool

		regAllocStarted bool
	}

	addend32 struct {
		r   regalloc.VReg
		ext extendOp
	}

	condBrReloc struct {
		cbr *instruction
		// currentLabelPos is the labelPosition within which condBr is defined.
		currentLabelPos *labelPosition
		// nextLabel is the label of the block that immediately follows this one in the layout order.
		nextLabel label
		offset    int64
	}

	labelPosition = backend.LabelPosition[instruction]
	label         = backend.Label
)

const (
	labelReturn  = backend.LabelReturn
	labelInvalid = backend.LabelInvalid
)

// NewBackend returns a new backend for arm64.
func NewBackend() backend.Machine {
	m := &machine{
		spillSlots:        make(map[regalloc.VRegID]int64),
		executableContext: newExecutableContext(),
		regAlloc:          regalloc.NewAllocator(regInfo),
	}
	return m
}
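
// For reference, the surrounding backend package drives this machine roughly as
// follows (a sketch using only methods defined in this file; the exact call
// sequence lives in the backend package and may differ):
//
//	m := NewBackend()
//	m.SetCompiler(compiler)  // compiler is a backend.Compiler provided by the caller.
//	m.SetCurrentABI(abi)     // abi is the *backend.FunctionABI of the function being compiled.
//	// ... SSA is lowered and the resulting instructions are inserted ...
//	m.RegAlloc()             // assigns real registers and finalizes spillSlotSize.
//	// resolveRelativeAddresses is then called to fix up branch offsets before encoding.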

func newExecutableContext() *backend.ExecutableContextT[instruction] {
	return backend.NewExecutableContextT[instruction](resetInstruction, setNext, setPrev, asNop0)
}

// ExecutableContext implements backend.Machine.
func (m *machine) ExecutableContext() backend.ExecutableContext {
	return m.executableContext
}

// RegAlloc implements backend.Machine.
func (m *machine) RegAlloc() {
	rf := m.regAllocFn
	for _, pos := range m.executableContext.OrderedBlockLabels {
		rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End)
	}

	m.regAllocStarted = true
	m.regAlloc.DoAllocation(rf)
	// Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes.
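	// For example, if the allocator ended up using 40 bytes of spill slots
	// (a hypothetical figure), the rounding below yields 48.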
	m.spillSlotSize = (m.spillSlotSize + 15) &^ 15
}

// Reset implements backend.Machine.
func (m *machine) Reset() {
	m.clobberedRegs = m.clobberedRegs[:0]
	// Reuse clobberedRegs as a scratch buffer to collect the keys of spillSlots,
	// so that the map can be cleared without any extra allocation.
	for key := range m.spillSlots {
		m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key))
	}
	for _, key := range m.clobberedRegs {
		delete(m.spillSlots, regalloc.VRegID(key))
	}
	m.clobberedRegs = m.clobberedRegs[:0]
	m.regAllocStarted = false
	m.regAlloc.Reset()
	m.regAllocFn.Reset()
	m.spillSlotSize = 0
	m.unresolvedAddressModes = m.unresolvedAddressModes[:0]
	m.maxRequiredStackSizeForCalls = 0
	m.executableContext.Reset()
	m.jmpTableTargets = m.jmpTableTargets[:0]
}

// SetCurrentABI implements backend.Machine SetCurrentABI.
func (m *machine) SetCurrentABI(abi *backend.FunctionABI) {
	m.currentABI = abi
}

// DisableStackCheck implements backend.Machine DisableStackCheck.
func (m *machine) DisableStackCheck() {
	m.stackBoundsCheckDisabled = true
}

// SetCompiler implements backend.Machine.
func (m *machine) SetCompiler(ctx backend.Compiler) {
	m.compiler = ctx
	m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, ctx.SSABuilder(), ctx)
}

func (m *machine) insert(i *instruction) {
	ectx := m.executableContext
	ectx.PendingInstructions = append(ectx.PendingInstructions, i)
}

func (m *machine) insertBrTargetLabel() label {
	nop, l := m.allocateBrTarget()
	m.insert(nop)
	return l
}

func (m *machine) allocateBrTarget() (nop *instruction, l label) {
	ectx := m.executableContext
	l = ectx.AllocateLabel()
	nop = m.allocateInstr()
	nop.asNop0WithLabel(l)
	pos := ectx.AllocateLabelPosition(l)
	pos.Begin, pos.End = nop, nop
	ectx.LabelPositions[l] = pos
	return
}
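
// For illustration, a typical (hypothetical) use of the two helpers above when lowering
// a backward branch might look like:
//
//	l := m.insertBrTargetLabel() // emits a nop0 carrying a fresh label at the current position
//	// ... more instructions ...
//	b := m.allocateInstr()
//	b.asBr(l) // branch back to the label created above
//	m.insert(b)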

// allocateInstr allocates an instruction.
func (m *machine) allocateInstr() *instruction {
	instr := m.executableContext.InstructionPool.Allocate()
	if !m.regAllocStarted {
		instr.addedBeforeRegAlloc = true
	}
	return instr
}

func resetInstruction(i *instruction) {
	*i = instruction{}
}

func (m *machine) allocateNop() *instruction {
	instr := m.allocateInstr()
	instr.asNop0()
	return instr
}

func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) {
	amode := &i.amode
	switch amode.kind {
	case addressModeKindResultStackSpace:
		amode.imm += ret0offset
	case addressModeKindArgStackSpace:
		amode.imm += arg0offset
	default:
		panic("BUG")
	}

	var sizeInBits byte
	switch i.kind {
	case store8, uLoad8:
		sizeInBits = 8
	case store16, uLoad16:
		sizeInBits = 16
	case store32, fpuStore32, uLoad32, fpuLoad32:
		sizeInBits = 32
	case store64, fpuStore64, uLoad64, fpuLoad64:
		sizeInBits = 64
	case fpuStore128, fpuLoad128:
		sizeInBits = 128
	default:
		panic("BUG")
	}

	if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) {
		amode.kind = addressModeKindRegUnsignedImm12
	} else {
		// In this case, we load the offset into the temporary register and then
		// use it as the index register for a register-register addressing mode.
		newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm)
		linkInstr(newPrev, i)
		*amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */}
	}
}
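
// For illustration, when the resolved offset does not fit the unsigned scaled 12-bit
// immediate form, the rewrite above effectively turns (hypothetical offset and registers)
//
//	ldr x1, [sp, #72000]          // not encodable as a scaled imm12
//
// into
//
//	<load 72000 into the tmp reg> // lowerConstantI64AndInsert
//	ldr x1, [sp, tmp, uxtx]       // addressModeKindRegReg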

// resolveRelativeAddresses resolves the relative addresses before encoding.
func (m *machine) resolveRelativeAddresses(ctx context.Context) {
	ectx := m.executableContext
	for {
		if len(m.unresolvedAddressModes) > 0 {
			arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP()
			for _, i := range m.unresolvedAddressModes {
				m.resolveAddressingMode(arg0offset, ret0offset, i)
			}
		}

		// Reuse the slice to gather the unresolved conditional branches.
		m.condBrRelocs = m.condBrRelocs[:0]

		var fn string
		var fnIndex int
		var labelToSSABlockID map[label]ssa.BasicBlockID
		if wazevoapi.PerfMapEnabled {
			fn = wazevoapi.GetCurrentFunctionName(ctx)
			labelToSSABlockID = make(map[label]ssa.BasicBlockID)
			for i, l := range ectx.SsaBlockIDToLabels {
				labelToSSABlockID[l] = ssa.BasicBlockID(i)
			}
			fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx)
		}

		// Next, in order to determine the offsets of relative jumps, we have to calculate the size of each block (label region).
		var offset int64
		for i, pos := range ectx.OrderedBlockLabels {
			pos.BinaryOffset = offset
			var size int64
			for cur := pos.Begin; ; cur = cur.next {
				switch cur.kind {
				case nop0:
					l := cur.nop0Label()
					if pos, ok := ectx.LabelPositions[l]; ok {
						pos.BinaryOffset = offset + size
					}
				case condBr:
					if !cur.condBrOffsetResolved() {
						var nextLabel label
						if i < len(ectx.OrderedBlockLabels)-1 {
							// Note: this is only used when the block ends with fallthrough,
							// so it can safely be assumed that the next block exists whenever it's needed.
							nextLabel = ectx.OrderedBlockLabels[i+1].L
						}
						m.condBrRelocs = append(m.condBrRelocs, condBrReloc{
							cbr: cur, currentLabelPos: pos, offset: offset + size,
							nextLabel: nextLabel,
						})
					}
				}
				size += cur.size()
				if cur == pos.End {
					break
				}
			}

			if wazevoapi.PerfMapEnabled {
				if size > 0 {
					l := pos.L
					var labelStr string
					if blkID, ok := labelToSSABlockID[l]; ok {
						labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID)
					} else {
						labelStr = l.String()
					}
					wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr))
				}
			}
			offset += size
		}

		// Before resolving any offsets, we need to check if all the conditional branches can be resolved.
		var needRerun bool
		for i := range m.condBrRelocs {
			reloc := &m.condBrRelocs[i]
			cbr := reloc.cbr
			offset := reloc.offset

			target := cbr.condBrLabel()
			offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
			diff := offsetOfTarget - offset
			if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
				// In this case, the target of the conditional branch is too far away to encode directly.
				// We place trampoline instructions at the end of the current block and jump to them instead.
				m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel)
				// Then, we need to re-run the offset calculation above to fix up the label offsets,
				// as they have changed after the trampoline was inserted.
				needRerun = true
			}
		}
		if needRerun {
			if wazevoapi.PerfMapEnabled {
				wazevoapi.PerfMap.Clear()
			}
		} else {
			break
		}
	}

	var currentOffset int64
	for cur := ectx.RootInstr; cur != nil; cur = cur.next {
		switch cur.kind {
		case br:
			target := cur.brLabel()
			offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
			diff := offsetOfTarget - currentOffset
			divided := diff >> 2
			if divided < minSignedInt26 || divided > maxSignedInt26 {
				// This means the currently compiled single function is extremely large.
				panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range")
			}
			cur.brOffsetResolve(diff)
		case condBr:
			if !cur.condBrOffsetResolved() {
				target := cur.condBrLabel()
				offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
				diff := offsetOfTarget - currentOffset
				if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
					panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly")
				}
				cur.condBrOffsetResolve(diff)
			}
		case brTableSequence:
			tableIndex := cur.u1
			targets := m.jmpTableTargets[tableIndex]
			for i := range targets {
				l := label(targets[i])
				offsetOfTarget := ectx.LabelPositions[l].BinaryOffset
				diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin)
				targets[i] = uint32(diff)
			}
			cur.brTableSequenceOffsetsResolved()
		case emitSourceOffsetInfo:
			m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo())
		}
		currentOffset += cur.size()
	}
}

const (
	maxSignedInt26 = 1<<25 - 1
	minSignedInt26 = -(1 << 25)

	maxSignedInt19 = 1<<18 - 1
	minSignedInt19 = -(1 << 18)
)
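
// These bounds follow from the AArch64 branch encodings: conditional branches (B.cond,
// CBZ/CBNZ, etc.) carry a 19-bit signed word offset, i.e. ±2^18 instructions = ±1MiB,
// while the unconditional B carries a 26-bit signed word offset, i.e. ±2^25 instructions
// = ±128MiB. The `>> 2` in the checks above converts a byte offset into a word offset.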

func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) {
	cur := currentBlk.End
	originalTarget := cbr.condBrLabel()
	endNext := cur.next

	if cur.kind != br {
		// If the current block already ends with an unconditional br, we can simply place the trampoline
		// right after it. Otherwise, we need to insert a "skip" branch over the trampoline instructions
		// so that the fallthrough path is not affected.
		skip := m.allocateInstr()
		skip.asBr(nextLabel)
		cur = linkInstr(cur, skip)
	}

	cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget()
	cbr.setCondBrTargets(cbrNewTargetLabel)
	cur = linkInstr(cur, cbrNewTargetInstr)

	// Then insert the unconditional branch back to the original target. This should always be
	// encodable, since the 26-bit branch range is more than enough for any practical function.
	br := m.allocateInstr()
	br.asBr(originalTarget)
	cur = linkInstr(cur, br)

	// Update the end of the current block.
	currentBlk.End = cur

	linkInstr(cur, endNext)
}
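
// For illustration, the rewrite above effectively turns (labels are hypothetical)
//
//	b.cond L_far        ; target beyond the ±1MiB conditional range
//	... next block ...
//
// into
//
//	b.cond L_tramp      ; nearby target, encodable
//	b L_next            ; "skip" branch, inserted only if the block does not already end in b
//	L_tramp:
//	b L_far             ; unconditional branch with a ±128MiB range
//	L_next:
//	... next block ...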

// Format implements backend.Machine.
func (m *machine) Format() string {
	ectx := m.executableContext
	begins := map[*instruction]label{}
	for l, pos := range ectx.LabelPositions {
		begins[pos.Begin] = l
	}

	irBlocks := map[label]ssa.BasicBlockID{}
	for i, l := range ectx.SsaBlockIDToLabels {
		irBlocks[l] = ssa.BasicBlockID(i)
	}

	var lines []string
	for cur := ectx.RootInstr; cur != nil; cur = cur.next {
		if l, ok := begins[cur]; ok {
			var labelStr string
			if blkID, ok := irBlocks[l]; ok {
				labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID)
			} else {
				labelStr = fmt.Sprintf("%s:", l)
			}
			lines = append(lines, labelStr)
		}
		if cur.kind == nop0 {
			continue
		}
		lines = append(lines, "\t"+cur.String())
	}
	return "\n" + strings.Join(lines, "\n") + "\n"
}

// InsertReturn implements backend.Machine.
func (m *machine) InsertReturn() {
	i := m.allocateInstr()
	i.asRet()
	m.insert(i)
}

func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 {
	offset, ok := m.spillSlots[id]
	if !ok {
		offset = m.spillSlotSize
		// TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible.
		m.spillSlots[id] = offset
		m.spillSlotSize += int64(size)
	}
	return offset + 16 // spill slot starts above the clobbered registers and the frame size.
}
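
// For illustration: with an empty spillSlots map, the first spilled 8-byte value is
// assigned offset 0 and this returns 16; a second 8-byte value is assigned offset 8
// and returns 24, leaving spillSlotSize at 16 (before the final 16-byte alignment
// applied in RegAlloc).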

func (m *machine) clobberedRegSlotSize() int64 {
	return int64(len(m.clobberedRegs) * 16)
}

func (m *machine) arg0OffsetFromSP() int64 {
	return m.frameSize() +
		16 + // 16-byte aligned return address
		16 // frame size saved below the clobbered registers.
}

func (m *machine) ret0OffsetFromSP() int64 {
	return m.arg0OffsetFromSP() + m.currentABI.ArgStackSize
}

func (m *machine) requiredStackSize() int64 {
	return m.maxRequiredStackSizeForCalls +
		m.frameSize() +
		16 + // 16-byte aligned return address.
		16 // frame size saved below the clobbered registers.
}

func (m *machine) frameSize() int64 {
	s := m.clobberedRegSlotSize() + m.spillSlotSize
	if s&0xf != 0 {
		panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s))
	}
	return s
}
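
// For illustration, with two clobbered registers (2 * 16 = 32 bytes) and a spillSlotSize
// of 16 (hypothetical figures), frameSize() is 48, arg0OffsetFromSP() is 48 + 16 + 16 = 80,
// ret0OffsetFromSP() is 80 plus the ABI's ArgStackSize, and requiredStackSize() is
// 80 plus maxRequiredStackSizeForCalls.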

func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) {
	// TODO: reuse the slice!
	labels := make([]uint32, len(targets))
	for j, target := range targets {
		labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target))
	}
	index = len(m.jmpTableTargets)
	m.jmpTableTargets = append(m.jmpTableTargets, labels)
	return
}
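
// For illustration, a jump table with three targets records their labels here, e.g.
// [L3, L5, L8] (hypothetical). resolveRelativeAddresses later rewrites each entry in
// place into the 32-bit offset of that label relative to the start of the table
// (currentOffset + brTableSequenceOffsetTableBegin).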