github.com/AR1011/wazero@v1.0.5/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go (about)

     1  package arm64
     2  
     3  import (
     4  	"fmt"
     5  
     6  	"github.com/AR1011/wazero/internal/engine/wazevo/backend/regalloc"
     7  	"github.com/AR1011/wazero/internal/engine/wazevo/wazevoapi"
     8  )
     9  
// SetupPrologue implements backend.Machine.
//
// It splices the prologue between the root (nop) instruction and the first
// real instruction: save lr and size_of_arg_ret, check stack bounds, save
// clobbered registers, allocate spill slots, and push the frame size.
func (m *machine) SetupPrologue() {
	cur := m.rootInstr
	prevInitInst := cur.next // First real instruction; relinked after the prologue below.

	//
	//                   (high address)                    (high address)
	//         SP----> +-----------------+               +------------------+ <----+
	//                 |     .......     |               |     .......      |      |
	//                 |      ret Y      |               |      ret Y       |      |
	//                 |     .......     |               |     .......      |      |
	//                 |      ret 0      |               |      ret 0       |      |
	//                 |      arg X      |               |      arg X       |      |  size_of_arg_ret.
	//                 |     .......     |     ====>     |     .......      |      |
	//                 |      arg 1      |               |      arg 1       |      |
	//                 |      arg 0      |               |      arg 0       | <----+
	//                 |-----------------|               |  size_of_arg_ret |
	//                                                   |  return address  |
	//                                                   +------------------+ <---- SP
	//                    (low address)                     (low address)

	// Saves the return address (lr) and the size_of_arg_ret below the SP.
	// size_of_arg_ret is used for stack unwinding.
	cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)

	if !m.stackBoundsCheckDisabled {
		cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
	}

	// Decrement SP if spillSlotSize > 0.
	if m.spillSlotSize == 0 && len(m.spillSlots) != 0 {
		// Sanity check: spill slots must have been accounted for in spillSlotSize.
		panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots))
	}

	if regs := m.clobberedRegs; len(regs) > 0 {
		//
		//            (high address)                  (high address)
		//          +-----------------+             +-----------------+
		//          |     .......     |             |     .......     |
		//          |      ret Y      |             |      ret Y      |
		//          |     .......     |             |     .......     |
		//          |      ret 0      |             |      ret 0      |
		//          |      arg X      |             |      arg X      |
		//          |     .......     |             |     .......     |
		//          |      arg 1      |             |      arg 1      |
		//          |      arg 0      |             |      arg 0      |
		//          | size_of_arg_ret |             | size_of_arg_ret |
		//          |   ReturnAddress |             |  ReturnAddress  |
		//  SP----> +-----------------+    ====>    +-----------------+
		//             (low address)                |   clobbered M   |
		//                                          |   ............  |
		//                                          |   clobbered 0   |
		//                                          +-----------------+ <----- SP
		//                                             (low address)
		//
		_amode := addressModePreOrPostIndex(spVReg,
			-16,  // stack pointer must be 16-byte aligned.
			true, // Decrement before store.
		)
		for _, vr := range regs {
			// TODO: pair stores to reduce the number of instructions.
			store := m.allocateInstr()
			store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType()))
			cur = linkInstr(cur, store)
		}
	}

	if size := m.spillSlotSize; size > 0 {
		// Check if size is 16-byte aligned.
		if size&0xf != 0 {
			panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size))
		}

		cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false)

		// At this point, the stack looks like:
		//
		//            (high address)
		//          +------------------+
		//          |     .......      |
		//          |      ret Y       |
		//          |     .......      |
		//          |      ret 0       |
		//          |      arg X       |
		//          |     .......      |
		//          |      arg 1       |
		//          |      arg 0       |
		//          |  size_of_arg_ret |
		//          |   ReturnAddress  |
		//          +------------------+
		//          |    clobbered M   |
		//          |   ............   |
		//          |    clobbered 0   |
		//          |   spill slot N   |
		//          |   ............   |
		//          |   spill slot 2   |
		//          |   spill slot 0   |
		//  SP----> +------------------+
		//             (low address)
	}

	// We push the frame size into the stack to make it possible to unwind stack:
	//
	//
	//            (high address)                  (high address)
	//         +-----------------+                +-----------------+
	//         |     .......     |                |     .......     |
	//         |      ret Y      |                |      ret Y      |
	//         |     .......     |                |     .......     |
	//         |      ret 0      |                |      ret 0      |
	//         |      arg X      |                |      arg X      |
	//         |     .......     |                |     .......     |
	//         |      arg 1      |                |      arg 1      |
	//         |      arg 0      |                |      arg 0      |
	//         | size_of_arg_ret |                | size_of_arg_ret |
	//         |  ReturnAddress  |                |  ReturnAddress  |
	//         +-----------------+      ==>       +-----------------+ <----+
	//         |   clobbered  M  |                |   clobbered  M  |      |
	//         |   ............  |                |   ............  |      |
	//         |   clobbered  2  |                |   clobbered  2  |      |
	//         |   clobbered  1  |                |   clobbered  1  |      | frame size
	//         |   clobbered  0  |                |   clobbered  0  |      |
	//         |   spill slot N  |                |   spill slot N  |      |
	//         |   ............  |                |   ............  |      |
	//         |   spill slot 0  |                |   spill slot 0  | <----+
	// SP--->  +-----------------+                |     xxxxxx      |  ;; unused space to make it 16-byte aligned.
	//                                            |   frame_size    |
	//                                            +-----------------+ <---- SP
	//            (low address)
	//
	cur = m.createFrameSizeSlot(cur, m.frameSize())

	// Reattach the original first instruction after the prologue sequence.
	linkInstr(cur, prevInitInst)
}
   144  
   145  func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction {
   146  	// First we decrement the stack pointer to point the arg0 slot.
   147  	var sizeOfArgRetReg regalloc.VReg
   148  	s := m.currentABI.alignedArgResultStackSlotSize()
   149  	if s > 0 {
   150  		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
   151  		sizeOfArgRetReg = tmpRegVReg
   152  
   153  		subSp := m.allocateInstr()
   154  		subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
   155  		cur = linkInstr(cur, subSp)
   156  	} else {
   157  		sizeOfArgRetReg = xzrVReg
   158  	}
   159  
   160  	// Saves the return address (lr) and the size_of_arg_ret below the SP.
   161  	// size_of_arg_ret is used for stack unwinding.
   162  	pstr := m.allocateInstr()
   163  	amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
   164  	pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
   165  	cur = linkInstr(cur, pstr)
   166  	return cur
   167  }
   168  
   169  func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
   170  	var frameSizeReg regalloc.VReg
   171  	if s > 0 {
   172  		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
   173  		frameSizeReg = tmpRegVReg
   174  	} else {
   175  		frameSizeReg = xzrVReg
   176  	}
   177  	_amode := addressModePreOrPostIndex(spVReg,
   178  		-16,  // stack pointer must be 16-byte aligned.
   179  		true, // Decrement before store.
   180  	)
   181  	store := m.allocateInstr()
   182  	store.asStore(operandNR(frameSizeReg), _amode, 64)
   183  	cur = linkInstr(cur, store)
   184  	return cur
   185  }
   186  
   187  // SetupEpilogue implements backend.Machine.
   188  func (m *machine) SetupEpilogue() {
   189  	for cur := m.rootInstr; cur != nil; cur = cur.next {
   190  		if cur.kind == ret {
   191  			m.setupEpilogueAfter(cur.prev)
   192  			continue
   193  		}
   194  
   195  		// Removes the redundant copy instruction.
   196  		// TODO: doing this in `SetupEpilogue` seems weird. Find a better home.
   197  		if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() {
   198  			prev, next := cur.prev, cur.next
   199  			// Remove the copy instruction.
   200  			prev.next = next
   201  			if next != nil {
   202  				next.prev = prev
   203  			}
   204  		}
   205  	}
   206  }
   207  
// setupEpilogueAfter inserts the epilogue sequence right after cur (the
// instruction preceding a return): pop the frame_size slot, free the spill
// slots, restore the clobbered registers, reload the return address, and
// finally free the argument/result area — undoing SetupPrologue in reverse.
func (m *machine) setupEpilogueAfter(cur *instruction) {
	prevNext := cur.next // Relinked after the epilogue sequence below.

	// We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore.
	cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true)

	if s := m.spillSlotSize; s > 0 {
		// Adjust SP to the original value:
		//
		//            (high address)                        (high address)
		//          +-----------------+                  +-----------------+
		//          |     .......     |                  |     .......     |
		//          |      ret Y      |                  |      ret Y      |
		//          |     .......     |                  |     .......     |
		//          |      ret 0      |                  |      ret 0      |
		//          |      arg X      |                  |      arg X      |
		//          |     .......     |                  |     .......     |
		//          |      arg 1      |                  |      arg 1      |
		//          |      arg 0      |                  |      arg 0      |
		//          |      xxxxx      |                  |      xxxxx      |
		//          |   ReturnAddress |                  |   ReturnAddress |
		//          +-----------------+      ====>       +-----------------+
		//          |    clobbered M  |                  |    clobbered M  |
		//          |   ............  |                  |   ............  |
		//          |    clobbered 1  |                  |    clobbered 1  |
		//          |    clobbered 0  |                  |    clobbered 0  |
		//          |   spill slot N  |                  +-----------------+ <---- SP
		//          |   ............  |
		//          |   spill slot 0  |
		//   SP---> +-----------------+
		//             (low address)
		//
		cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
	}

	// First we need to restore the clobbered registers.
	if len(m.clobberedRegs) > 0 {
		//            (high address)
		//          +-----------------+                      +-----------------+
		//          |     .......     |                      |     .......     |
		//          |      ret Y      |                      |      ret Y      |
		//          |     .......     |                      |     .......     |
		//          |      ret 0      |                      |      ret 0      |
		//          |      arg X      |                      |      arg X      |
		//          |     .......     |                      |     .......     |
		//          |      arg 1      |                      |      arg 1      |
		//          |      arg 0      |                      |      arg 0      |
		//          |      xxxxx      |                      |      xxxxx      |
		//          |   ReturnAddress |                      |   ReturnAddress |
		//          +-----------------+      ========>       +-----------------+ <---- SP
		//          |   clobbered M   |
		//          |   clobbered 1   |
		//          |   ...........   |
		//          |   clobbered 0   |
		//   SP---> +-----------------+
		//             (low address)

		l := len(m.clobberedRegs) - 1
		for i := range m.clobberedRegs {
			vr := m.clobberedRegs[l-i] // reverse order to restore.
			load := m.allocateInstr()
			amode := addressModePreOrPostIndex(spVReg,
				16,    // stack pointer must be 16-byte aligned.
				false, // Increment after store.
			)
			// TODO: pair loads to reduce the number of instructions.
			switch regTypeToRegisterSizeInBits(vr.RegType()) {
			case 64: // save int reg.
				load.asULoad(operandNR(vr), amode, 64)
			case 128: // save vector reg.
				load.asFpuLoad(operandNR(vr), amode, 128)
			}
			cur = linkInstr(cur, load)
		}
	}

	// Reload the return address (lr).
	//
	//            +-----------------+          +-----------------+
	//            |     .......     |          |     .......     |
	//            |      ret Y      |          |      ret Y      |
	//            |     .......     |          |     .......     |
	//            |      ret 0      |          |      ret 0      |
	//            |      arg X      |          |      arg X      |
	//            |     .......     |   ===>   |     .......     |
	//            |      arg 1      |          |      arg 1      |
	//            |      arg 0      |          |      arg 0      |
	//            |      xxxxx      |          +-----------------+ <---- SP
	//            |  ReturnAddress  |
	//    SP----> +-----------------+

	ldr := m.allocateInstr()
	ldr.asULoad(operandNR(lrVReg),
		addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
	cur = linkInstr(cur, ldr)

	// Free the argument/result area allocated in the prologue, if any.
	if s := m.currentABI.alignedArgResultStackSlotSize(); s > 0 {
		cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
	}

	// Reattach the return instruction after the epilogue sequence.
	linkInstr(cur, prevNext)
}
   310  
// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient
// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers except for x0,
// which always points to the execution context whenever the native code is entered from Go.
var saveRequiredRegs = []regalloc.VReg{
	x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg,
	x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg,
	v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg,
	v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
}
   320  
// insertStackBoundsCheck will insert the instructions after `cur` to check the
// stack bounds, and if there's no sufficient spaces required for the function,
// exit the execution and try growing it in Go world.
//
// TODO: we should be able to share the instructions across all the functions to reduce the size of compiled executable.
func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
	// The prologue keeps SP 16-byte aligned, so the requirement must be too.
	if requiredStackSize%16 != 0 {
		panic("BUG")
	}

	if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok {
		// sub tmp, sp, #requiredStackSize
		sub := m.allocateInstr()
		sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true)
		cur = linkInstr(cur, sub)
	} else {
		// This case, we first load the requiredStackSize into the temporary register,
		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
		// Then subtract it.
		sub := m.allocateInstr()
		sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true)
		cur = linkInstr(cur, sub)
	}

	tmp2 := x11VReg // Callee save, so it is safe to use it here in the prologue.

	// ldr tmp2, [executionContext #StackBottomPtr]
	ldr := m.allocateInstr()
	ldr.asULoad(operandNR(tmp2), addressMode{
		kind: addressModeKindRegUnsignedImm12,
		rn:   x0VReg, // execution context is always the first argument.
		imm:  wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(),
	}, 64)
	cur = linkInstr(cur, ldr)

	// subs xzr, tmp, tmp2
	subs := m.allocateInstr()
	subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true)
	cur = linkInstr(cur, subs)

	// b.ge #imm — skips the grow sequence when sp-requiredStackSize >= bottom.
	// The branch offset is resolved at the end of this function.
	cbr := m.allocateInstr()
	cbr.asCondBr(ge.asCond(), invalidLabel, false /* ignored */)
	cur = linkInstr(cur, cbr)

	// Set the required stack size and set it to the exec context.
	{
		// First load the requiredStackSize into the temporary register,
		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
		setRequiredStackSize := m.allocateInstr()
		setRequiredStackSize.asStore(operandNR(tmpRegVReg),
			addressMode{
				kind: addressModeKindRegUnsignedImm12,
				// Execution context is always the first argument.
				rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
			}, 64)

		cur = linkInstr(cur, setRequiredStackSize)
	}

	// Load the address of the stack-grow trampoline from the execution context.
	ldrAddress := m.allocateInstr()
	ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{
		kind: addressModeKindRegUnsignedImm12,
		rn:   x0VReg, // execution context is always the first argument
		imm:  wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(),
	}, 64)
	cur = linkInstr(cur, ldrAddress)

	// Then jumps to the stack grow call sequence's address, meaning
	// transferring the control to the code compiled by CompileStackGrowCallSequence.
	bl := m.allocateInstr()
	bl.asCallIndirect(tmpRegVReg, nil)
	cur = linkInstr(cur, bl)

	// Now that we know the entire code, we can finalize how many bytes
	// we have to skip when the stack size is sufficient.
	var cbrOffset int64
	for _cur := cbr; ; _cur = _cur.next {
		cbrOffset += _cur.size()
		if _cur == cur {
			break
		}
	}
	cbr.condBrOffsetResolve(cbrOffset)
	return cur
}
   407  
   408  // CompileStackGrowCallSequence implements backend.Machine.
   409  func (m *machine) CompileStackGrowCallSequence() []byte {
   410  	cur := m.allocateInstr()
   411  	cur.asNop0()
   412  	m.rootInstr = cur
   413  
   414  	// Save the callee saved and argument registers.
   415  	cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs)
   416  
   417  	// Save the current stack pointer.
   418  	cur = m.saveCurrentStackPointer(cur, x0VReg)
   419  
   420  	// Set the exit status on the execution context.
   421  	cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack)
   422  
   423  	// Exit the execution.
   424  	cur = m.storeReturnAddressAndExit(cur)
   425  
   426  	// After the exit, restore the saved registers.
   427  	cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs)
   428  
   429  	// Then goes back the original address of this stack grow call.
   430  	ret := m.allocateInstr()
   431  	ret.asRet(nil)
   432  	linkInstr(cur, ret)
   433  
   434  	m.encode(m.rootInstr)
   435  	return m.compiler.Buf()
   436  }
   437  
   438  func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction {
   439  	m.pendingInstructions = m.pendingInstructions[:0]
   440  	m.insertAddOrSubStackPointer(rd, diff, add)
   441  	for _, inserted := range m.pendingInstructions {
   442  		cur = linkInstr(cur, inserted)
   443  	}
   444  	return cur
   445  }