github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go (about)

     1  package arm64
     2  
     3  import (
     4  	"fmt"
     5  
     6  	"github.com/bananabytelabs/wazero/internal/engine/wazevo/backend/regalloc"
     7  	"github.com/bananabytelabs/wazero/internal/engine/wazevo/wazevoapi"
     8  )
     9  
// SetupPrologue implements backend.Machine.
//
// It emits the function prologue between the root instruction and the first
// "real" instruction: it saves lr and size_of_arg_ret below SP, optionally
// inserts a stack-bounds check, stores the clobbered (callee-saved) registers,
// reserves the spill-slot area, and finally pushes the frame size for the
// stack unwinder. The matching teardown is emitted by SetupEpilogue.
func (m *machine) SetupPrologue() {
	ectx := m.executableContext

	cur := ectx.RootInstr
	// The prologue is spliced in between the root instruction and its current
	// successor; prevInitInst is re-linked at the end.
	prevInitInst := cur.next

	//
	//                   (high address)                    (high address)
	//         SP----> +-----------------+               +------------------+ <----+
	//                 |     .......     |               |     .......      |      |
	//                 |      ret Y      |               |      ret Y       |      |
	//                 |     .......     |               |     .......      |      |
	//                 |      ret 0      |               |      ret 0       |      |
	//                 |      arg X      |               |      arg X       |      |  size_of_arg_ret.
	//                 |     .......     |     ====>     |     .......      |      |
	//                 |      arg 1      |               |      arg 1       |      |
	//                 |      arg 0      |               |      arg 0       | <----+
	//                 |-----------------|               |  size_of_arg_ret |
	//                                                   |  return address  |
	//                                                   +------------------+ <---- SP
	//                    (low address)                     (low address)

	// Saves the return address (lr) and the size_of_arg_ret below the SP.
	// size_of_arg_ret is used for stack unwinding.
	cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)

	// The bounds check may call into the Go runtime to grow the stack; it must
	// happen before SP is moved for clobbered registers / spill slots.
	if !m.stackBoundsCheckDisabled {
		cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
	}

	// Sanity check: if any spill slots exist, spillSlotSize must account for them.
	if m.spillSlotSize == 0 && len(m.spillSlots) != 0 {
		panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots))
	}

	if regs := m.clobberedRegs; len(regs) > 0 {
		//
		//            (high address)                  (high address)
		//          +-----------------+             +-----------------+
		//          |     .......     |             |     .......     |
		//          |      ret Y      |             |      ret Y      |
		//          |     .......     |             |     .......     |
		//          |      ret 0      |             |      ret 0      |
		//          |      arg X      |             |      arg X      |
		//          |     .......     |             |     .......     |
		//          |      arg 1      |             |      arg 1      |
		//          |      arg 0      |             |      arg 0      |
		//          | size_of_arg_ret |             | size_of_arg_ret |
		//          |   ReturnAddress |             |  ReturnAddress  |
		//  SP----> +-----------------+    ====>    +-----------------+
		//             (low address)                |   clobbered M   |
		//                                          |   ............  |
		//                                          |   clobbered 0   |
		//                                          +-----------------+ <----- SP
		//                                             (low address)
		//
		_amode := addressModePreOrPostIndex(spVReg,
			-16,  // stack pointer must be 16-byte aligned.
			true, // Decrement before store.
		)
		for _, vr := range regs {
			// TODO: pair stores to reduce the number of instructions.
			store := m.allocateInstr()
			store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType()))
			cur = linkInstr(cur, store)
		}
	}

	// Decrement SP to reserve the spill-slot area, if any.
	if size := m.spillSlotSize; size > 0 {
		// Check if size is 16-byte aligned.
		if size&0xf != 0 {
			panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size))
		}

		cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false)

		// At this point, the stack looks like:
		//
		//            (high address)
		//          +------------------+
		//          |     .......      |
		//          |      ret Y       |
		//          |     .......      |
		//          |      ret 0       |
		//          |      arg X       |
		//          |     .......      |
		//          |      arg 1       |
		//          |      arg 0       |
		//          |  size_of_arg_ret |
		//          |   ReturnAddress  |
		//          +------------------+
		//          |    clobbered M   |
		//          |   ............   |
		//          |    clobbered 0   |
		//          |   spill slot N   |
		//          |   ............   |
		//          |   spill slot 2   |
		//          |   spill slot 0   |
		//  SP----> +------------------+
		//             (low address)
	}

	// We push the frame size into the stack to make it possible to unwind stack:
	//
	//
	//            (high address)                  (high address)
	//         +-----------------+                +-----------------+
	//         |     .......     |                |     .......     |
	//         |      ret Y      |                |      ret Y      |
	//         |     .......     |                |     .......     |
	//         |      ret 0      |                |      ret 0      |
	//         |      arg X      |                |      arg X      |
	//         |     .......     |                |     .......     |
	//         |      arg 1      |                |      arg 1      |
	//         |      arg 0      |                |      arg 0      |
	//         | size_of_arg_ret |                | size_of_arg_ret |
	//         |  ReturnAddress  |                |  ReturnAddress  |
	//         +-----------------+      ==>       +-----------------+ <----+
	//         |   clobbered  M  |                |   clobbered  M  |      |
	//         |   ............  |                |   ............  |      |
	//         |   clobbered  2  |                |   clobbered  2  |      |
	//         |   clobbered  1  |                |   clobbered  1  |      | frame size
	//         |   clobbered  0  |                |   clobbered  0  |      |
	//         |   spill slot N  |                |   spill slot N  |      |
	//         |   ............  |                |   ............  |      |
	//         |   spill slot 0  |                |   spill slot 0  | <----+
	// SP--->  +-----------------+                |     xxxxxx      |  ;; unused space to make it 16-byte aligned.
	//                                            |   frame_size    |
	//                                            +-----------------+ <---- SP
	//            (low address)
	//
	cur = m.createFrameSizeSlot(cur, m.frameSize())

	// Re-attach the original first instruction after the emitted prologue.
	linkInstr(cur, prevInitInst)
}
   146  
   147  func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction {
   148  	// First we decrement the stack pointer to point the arg0 slot.
   149  	var sizeOfArgRetReg regalloc.VReg
   150  	s := m.currentABI.alignedArgResultStackSlotSize()
   151  	if s > 0 {
   152  		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
   153  		sizeOfArgRetReg = tmpRegVReg
   154  
   155  		subSp := m.allocateInstr()
   156  		subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
   157  		cur = linkInstr(cur, subSp)
   158  	} else {
   159  		sizeOfArgRetReg = xzrVReg
   160  	}
   161  
   162  	// Saves the return address (lr) and the size_of_arg_ret below the SP.
   163  	// size_of_arg_ret is used for stack unwinding.
   164  	pstr := m.allocateInstr()
   165  	amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
   166  	pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
   167  	cur = linkInstr(cur, pstr)
   168  	return cur
   169  }
   170  
   171  func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
   172  	var frameSizeReg regalloc.VReg
   173  	if s > 0 {
   174  		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
   175  		frameSizeReg = tmpRegVReg
   176  	} else {
   177  		frameSizeReg = xzrVReg
   178  	}
   179  	_amode := addressModePreOrPostIndex(spVReg,
   180  		-16,  // stack pointer must be 16-byte aligned.
   181  		true, // Decrement before store.
   182  	)
   183  	store := m.allocateInstr()
   184  	store.asStore(operandNR(frameSizeReg), _amode, 64)
   185  	cur = linkInstr(cur, store)
   186  	return cur
   187  }
   188  
   189  // SetupEpilogue implements backend.Machine.
   190  func (m *machine) SetupEpilogue() {
   191  	ectx := m.executableContext
   192  	for cur := ectx.RootInstr; cur != nil; cur = cur.next {
   193  		if cur.kind == ret {
   194  			m.setupEpilogueAfter(cur.prev)
   195  			continue
   196  		}
   197  
   198  		// Removes the redundant copy instruction.
   199  		// TODO: doing this in `SetupEpilogue` seems weird. Find a better home.
   200  		if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() {
   201  			prev, next := cur.prev, cur.next
   202  			// Remove the copy instruction.
   203  			prev.next = next
   204  			if next != nil {
   205  				next.prev = prev
   206  			}
   207  		}
   208  	}
   209  }
   210  
   211  func (m *machine) setupEpilogueAfter(cur *instruction) {
   212  	prevNext := cur.next
   213  
   214  	// We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore.
   215  	cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true)
   216  
   217  	if s := m.spillSlotSize; s > 0 {
   218  		// Adjust SP to the original value:
   219  		//
   220  		//            (high address)                        (high address)
   221  		//          +-----------------+                  +-----------------+
   222  		//          |     .......     |                  |     .......     |
   223  		//          |      ret Y      |                  |      ret Y      |
   224  		//          |     .......     |                  |     .......     |
   225  		//          |      ret 0      |                  |      ret 0      |
   226  		//          |      arg X      |                  |      arg X      |
   227  		//          |     .......     |                  |     .......     |
   228  		//          |      arg 1      |                  |      arg 1      |
   229  		//          |      arg 0      |                  |      arg 0      |
   230  		//          |      xxxxx      |                  |      xxxxx      |
   231  		//          |   ReturnAddress |                  |   ReturnAddress |
   232  		//          +-----------------+      ====>       +-----------------+
   233  		//          |    clobbered M  |                  |    clobbered M  |
   234  		//          |   ............  |                  |   ............  |
   235  		//          |    clobbered 1  |                  |    clobbered 1  |
   236  		//          |    clobbered 0  |                  |    clobbered 0  |
   237  		//          |   spill slot N  |                  +-----------------+ <---- SP
   238  		//          |   ............  |
   239  		//          |   spill slot 0  |
   240  		//   SP---> +-----------------+
   241  		//             (low address)
   242  		//
   243  		cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
   244  	}
   245  
   246  	// First we need to restore the clobbered registers.
   247  	if len(m.clobberedRegs) > 0 {
   248  		//            (high address)
   249  		//          +-----------------+                      +-----------------+
   250  		//          |     .......     |                      |     .......     |
   251  		//          |      ret Y      |                      |      ret Y      |
   252  		//          |     .......     |                      |     .......     |
   253  		//          |      ret 0      |                      |      ret 0      |
   254  		//          |      arg X      |                      |      arg X      |
   255  		//          |     .......     |                      |     .......     |
   256  		//          |      arg 1      |                      |      arg 1      |
   257  		//          |      arg 0      |                      |      arg 0      |
   258  		//          |      xxxxx      |                      |      xxxxx      |
   259  		//          |   ReturnAddress |                      |   ReturnAddress |
   260  		//          +-----------------+      ========>       +-----------------+ <---- SP
   261  		//          |   clobbered M   |
   262  		//          |   clobbered 1   |
   263  		//          |   ...........   |
   264  		//          |   clobbered 0   |
   265  		//   SP---> +-----------------+
   266  		//             (low address)
   267  
   268  		l := len(m.clobberedRegs) - 1
   269  		for i := range m.clobberedRegs {
   270  			vr := m.clobberedRegs[l-i] // reverse order to restore.
   271  			load := m.allocateInstr()
   272  			amode := addressModePreOrPostIndex(spVReg,
   273  				16,    // stack pointer must be 16-byte aligned.
   274  				false, // Increment after store.
   275  			)
   276  			// TODO: pair loads to reduce the number of instructions.
   277  			switch regTypeToRegisterSizeInBits(vr.RegType()) {
   278  			case 64: // save int reg.
   279  				load.asULoad(operandNR(vr), amode, 64)
   280  			case 128: // save vector reg.
   281  				load.asFpuLoad(operandNR(vr), amode, 128)
   282  			}
   283  			cur = linkInstr(cur, load)
   284  		}
   285  	}
   286  
   287  	// Reload the return address (lr).
   288  	//
   289  	//            +-----------------+          +-----------------+
   290  	//            |     .......     |          |     .......     |
   291  	//            |      ret Y      |          |      ret Y      |
   292  	//            |     .......     |          |     .......     |
   293  	//            |      ret 0      |          |      ret 0      |
   294  	//            |      arg X      |          |      arg X      |
   295  	//            |     .......     |   ===>   |     .......     |
   296  	//            |      arg 1      |          |      arg 1      |
   297  	//            |      arg 0      |          |      arg 0      |
   298  	//            |      xxxxx      |          +-----------------+ <---- SP
   299  	//            |  ReturnAddress  |
   300  	//    SP----> +-----------------+
   301  
   302  	ldr := m.allocateInstr()
   303  	ldr.asULoad(operandNR(lrVReg),
   304  		addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
   305  	cur = linkInstr(cur, ldr)
   306  
   307  	if s := m.currentABI.alignedArgResultStackSlotSize(); s > 0 {
   308  		cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
   309  	}
   310  
   311  	linkInstr(cur, prevNext)
   312  }
   313  
// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient
// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers except for x0,
// which always points to the execution context whenever the native code is entered from Go.
var saveRequiredRegs = []regalloc.VReg{
	x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg,
	x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg,
	v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg,
	v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
}
   323  
   324  // insertStackBoundsCheck will insert the instructions after `cur` to check the
   325  // stack bounds, and if there's no sufficient spaces required for the function,
   326  // exit the execution and try growing it in Go world.
   327  //
   328  // TODO: we should be able to share the instructions across all the functions to reduce the size of compiled executable.
   329  func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
   330  	if requiredStackSize%16 != 0 {
   331  		panic("BUG")
   332  	}
   333  
   334  	if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok {
   335  		// sub tmp, sp, #requiredStackSize
   336  		sub := m.allocateInstr()
   337  		sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true)
   338  		cur = linkInstr(cur, sub)
   339  	} else {
   340  		// This case, we first load the requiredStackSize into the temporary register,
   341  		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
   342  		// Then subtract it.
   343  		sub := m.allocateInstr()
   344  		sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true)
   345  		cur = linkInstr(cur, sub)
   346  	}
   347  
   348  	tmp2 := x11VReg // Callee save, so it is safe to use it here in the prologue.
   349  
   350  	// ldr tmp2, [executionContext #StackBottomPtr]
   351  	ldr := m.allocateInstr()
   352  	ldr.asULoad(operandNR(tmp2), addressMode{
   353  		kind: addressModeKindRegUnsignedImm12,
   354  		rn:   x0VReg, // execution context is always the first argument.
   355  		imm:  wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(),
   356  	}, 64)
   357  	cur = linkInstr(cur, ldr)
   358  
   359  	// subs xzr, tmp, tmp2
   360  	subs := m.allocateInstr()
   361  	subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true)
   362  	cur = linkInstr(cur, subs)
   363  
   364  	// b.ge #imm
   365  	cbr := m.allocateInstr()
   366  	cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */)
   367  	cur = linkInstr(cur, cbr)
   368  
   369  	// Set the required stack size and set it to the exec context.
   370  	{
   371  		// First load the requiredStackSize into the temporary register,
   372  		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
   373  		setRequiredStackSize := m.allocateInstr()
   374  		setRequiredStackSize.asStore(operandNR(tmpRegVReg),
   375  			addressMode{
   376  				kind: addressModeKindRegUnsignedImm12,
   377  				// Execution context is always the first argument.
   378  				rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
   379  			}, 64)
   380  
   381  		cur = linkInstr(cur, setRequiredStackSize)
   382  	}
   383  
   384  	ldrAddress := m.allocateInstr()
   385  	ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{
   386  		kind: addressModeKindRegUnsignedImm12,
   387  		rn:   x0VReg, // execution context is always the first argument
   388  		imm:  wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(),
   389  	}, 64)
   390  	cur = linkInstr(cur, ldrAddress)
   391  
   392  	// Then jumps to the stack grow call sequence's address, meaning
   393  	// transferring the control to the code compiled by CompileStackGrowCallSequence.
   394  	bl := m.allocateInstr()
   395  	bl.asCallIndirect(tmpRegVReg, nil)
   396  	cur = linkInstr(cur, bl)
   397  
   398  	// Now that we know the entire code, we can finalize how many bytes
   399  	// we have to skip when the stack size is sufficient.
   400  	var cbrOffset int64
   401  	for _cur := cbr; ; _cur = _cur.next {
   402  		cbrOffset += _cur.size()
   403  		if _cur == cur {
   404  			break
   405  		}
   406  	}
   407  	cbr.condBrOffsetResolve(cbrOffset)
   408  	return cur
   409  }
   410  
// CompileStackGrowCallSequence implements backend.Machine.
//
// It builds and encodes the shared trampoline that insertStackBoundsCheck
// branches to when a function needs more stack: the trampoline saves all
// registers that Go code might clobber into the execution context, records
// the current SP, exits to Go with ExitCodeGrowStack so the stack can be
// grown, and on re-entry restores the registers and returns to the caller.
// It returns the encoded machine code bytes.
func (m *machine) CompileStackGrowCallSequence() []byte {
	ectx := m.executableContext

	// Root nop so that subsequent instructions have a predecessor to link to.
	cur := m.allocateInstr()
	cur.asNop0()
	ectx.RootInstr = cur

	// Save the callee saved and argument registers.
	cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs)

	// Save the current stack pointer.
	cur = m.saveCurrentStackPointer(cur, x0VReg)

	// Set the exit status on the execution context.
	cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack)

	// Exit the execution (back to Go, which grows the stack and re-enters here).
	cur = m.storeReturnAddressAndExit(cur)

	// After the exit, restore the saved registers.
	cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs)

	// Then goes back the original address of this stack grow call.
	ret := m.allocateInstr()
	ret.asRet(nil)
	linkInstr(cur, ret)

	m.encode(ectx.RootInstr)
	return m.compiler.Buf()
}
   442  
   443  func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction {
   444  	ectx := m.executableContext
   445  
   446  	ectx.PendingInstructions = ectx.PendingInstructions[:0]
   447  	m.insertAddOrSubStackPointer(rd, diff, add)
   448  	for _, inserted := range ectx.PendingInstructions {
   449  		cur = linkInstr(cur, inserted)
   450  	}
   451  	return cur
   452  }