github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go (about)

     1  package arm64
     2  
     3  import (
     4  	"fmt"
     5  
     6  	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
     7  	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
     8  )
     9  
// PostRegAlloc implements backend.Machine.
//
// It runs after register allocation has assigned real registers: first the
// prologue is inserted at the function entry (setupPrologue), then the whole
// instruction list is walked to insert epilogues before returns and to clean
// up redundant copies (postRegAlloc). The order matters: the epilogue
// insertion relies on the final spill-slot/clobbered-register layout that the
// prologue establishes.
func (m *machine) PostRegAlloc() {
	m.setupPrologue()
	m.postRegAlloc()
}
    15  
// setupPrologue initializes the prologue of the function: it saves the return
// address and size_of_arg_ret below SP, optionally inserts a stack bounds
// check, pushes the clobbered registers, reserves the spill-slot area, and
// finally pushes the frame size so the stack can be unwound.
func (m *machine) setupPrologue() {
	ectx := m.executableContext

	cur := ectx.RootInstr
	// Remember the instruction that originally followed the root so the
	// prologue can be spliced in front of it at the end of this function.
	prevInitInst := cur.next

	//
	//                   (high address)                    (high address)
	//         SP----> +-----------------+               +------------------+ <----+
	//                 |     .......     |               |     .......      |      |
	//                 |      ret Y      |               |      ret Y       |      |
	//                 |     .......     |               |     .......      |      |
	//                 |      ret 0      |               |      ret 0       |      |
	//                 |      arg X      |               |      arg X       |      |  size_of_arg_ret.
	//                 |     .......     |     ====>     |     .......      |      |
	//                 |      arg 1      |               |      arg 1       |      |
	//                 |      arg 0      |               |      arg 0       | <----+
	//                 |-----------------|               |  size_of_arg_ret |
	//                                                   |  return address  |
	//                                                   +------------------+ <---- SP
	//                    (low address)                     (low address)

	// Saves the return address (lr) and the size_of_arg_ret below the SP.
	// size_of_arg_ret is used for stack unwinding.
	cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)

	if !m.stackBoundsCheckDisabled {
		cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
	}

	// Sanity check: if spill slots exist, their total size must have been
	// computed by now; a zero size with non-empty slots is a compiler bug.
	if m.spillSlotSize == 0 && len(m.spillSlots) != 0 {
		panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots))
	}

	if regs := m.clobberedRegs; len(regs) > 0 {
		//
		//            (high address)                  (high address)
		//          +-----------------+             +-----------------+
		//          |     .......     |             |     .......     |
		//          |      ret Y      |             |      ret Y      |
		//          |     .......     |             |     .......     |
		//          |      ret 0      |             |      ret 0      |
		//          |      arg X      |             |      arg X      |
		//          |     .......     |             |     .......     |
		//          |      arg 1      |             |      arg 1      |
		//          |      arg 0      |             |      arg 0      |
		//          | size_of_arg_ret |             | size_of_arg_ret |
		//          |   ReturnAddress |             |  ReturnAddress  |
		//  SP----> +-----------------+    ====>    +-----------------+
		//             (low address)                |   clobbered M   |
		//                                          |   ............  |
		//                                          |   clobbered 0   |
		//                                          +-----------------+ <----- SP
		//                                             (low address)
		//
		// Each register gets its own 16-byte pre-index push so SP stays
		// 16-byte aligned after every store.
		_amode := addressModePreOrPostIndex(spVReg,
			-16,  // stack pointer must be 16-byte aligned.
			true, // Decrement before store.
		)
		for _, vr := range regs {
			// TODO: pair stores to reduce the number of instructions.
			store := m.allocateInstr()
			store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType()))
			cur = linkInstr(cur, store)
		}
	}

	// Decrement SP to reserve the spill-slot area, if any.
	if size := m.spillSlotSize; size > 0 {
		// Check if size is 16-byte aligned.
		if size&0xf != 0 {
			panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size))
		}

		cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false)

		// At this point, the stack looks like:
		//
		//            (high address)
		//          +------------------+
		//          |     .......      |
		//          |      ret Y       |
		//          |     .......      |
		//          |      ret 0       |
		//          |      arg X       |
		//          |     .......      |
		//          |      arg 1       |
		//          |      arg 0       |
		//          |  size_of_arg_ret |
		//          |   ReturnAddress  |
		//          +------------------+
		//          |    clobbered M   |
		//          |   ............   |
		//          |    clobbered 0   |
		//          |   spill slot N   |
		//          |   ............   |
		//          |   spill slot 2   |
		//          |   spill slot 0   |
		//  SP----> +------------------+
		//             (low address)
	}

	// We push the frame size into the stack to make it possible to unwind stack:
	//
	//
	//            (high address)                  (high address)
	//         +-----------------+                +-----------------+
	//         |     .......     |                |     .......     |
	//         |      ret Y      |                |      ret Y      |
	//         |     .......     |                |     .......     |
	//         |      ret 0      |                |      ret 0      |
	//         |      arg X      |                |      arg X      |
	//         |     .......     |                |     .......     |
	//         |      arg 1      |                |      arg 1      |
	//         |      arg 0      |                |      arg 0      |
	//         | size_of_arg_ret |                | size_of_arg_ret |
	//         |  ReturnAddress  |                |  ReturnAddress  |
	//         +-----------------+      ==>       +-----------------+ <----+
	//         |   clobbered  M  |                |   clobbered  M  |      |
	//         |   ............  |                |   ............  |      |
	//         |   clobbered  2  |                |   clobbered  2  |      |
	//         |   clobbered  1  |                |   clobbered  1  |      | frame size
	//         |   clobbered  0  |                |   clobbered  0  |      |
	//         |   spill slot N  |                |   spill slot N  |      |
	//         |   ............  |                |   ............  |      |
	//         |   spill slot 0  |                |   spill slot 0  | <----+
	// SP--->  +-----------------+                |     xxxxxx      |  ;; unused space to make it 16-byte aligned.
	//                                            |   frame_size    |
	//                                            +-----------------+ <---- SP
	//            (low address)
	//
	cur = m.createFrameSizeSlot(cur, m.frameSize())

	// Re-attach the original instruction stream after the prologue.
	linkInstr(cur, prevInitInst)
}
   152  
   153  func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction {
   154  	// First we decrement the stack pointer to point the arg0 slot.
   155  	var sizeOfArgRetReg regalloc.VReg
   156  	s := int64(m.currentABI.AlignedArgResultStackSlotSize())
   157  	if s > 0 {
   158  		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
   159  		sizeOfArgRetReg = tmpRegVReg
   160  
   161  		subSp := m.allocateInstr()
   162  		subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
   163  		cur = linkInstr(cur, subSp)
   164  	} else {
   165  		sizeOfArgRetReg = xzrVReg
   166  	}
   167  
   168  	// Saves the return address (lr) and the size_of_arg_ret below the SP.
   169  	// size_of_arg_ret is used for stack unwinding.
   170  	pstr := m.allocateInstr()
   171  	amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
   172  	pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
   173  	cur = linkInstr(cur, pstr)
   174  	return cur
   175  }
   176  
   177  func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
   178  	var frameSizeReg regalloc.VReg
   179  	if s > 0 {
   180  		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
   181  		frameSizeReg = tmpRegVReg
   182  	} else {
   183  		frameSizeReg = xzrVReg
   184  	}
   185  	_amode := addressModePreOrPostIndex(spVReg,
   186  		-16,  // stack pointer must be 16-byte aligned.
   187  		true, // Decrement before store.
   188  	)
   189  	store := m.allocateInstr()
   190  	store.asStore(operandNR(frameSizeReg), _amode, 64)
   191  	cur = linkInstr(cur, store)
   192  	return cur
   193  }
   194  
// postRegAlloc does multiple things while walking through the instructions:
// 1. Removes the redundant copy instruction.
// 2. Inserts the epilogue before each return.
// 3. Lowers loadConstBlockArg instructions whose lowering was deferred until
//    after register allocation.
func (m *machine) postRegAlloc() {
	ectx := m.executableContext
	for cur := ectx.RootInstr; cur != nil; cur = cur.next {
		switch cur.kind {
		case ret:
			// Splice the epilogue in just before the return instruction.
			m.setupEpilogueAfter(cur.prev)
		case loadConstBlockArg:
			// Re-lower the deferred constant load; the lowered instructions
			// are collected into PendingInstructions and linked in between
			// this instruction and its original successor.
			lc := cur
			next := lc.next
			m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
			m.lowerLoadConstantBlockArgAfterRegAlloc(lc)
			for _, instr := range m.executableContext.PendingInstructions {
				cur = linkInstr(cur, instr)
			}
			linkInstr(cur, next)
			m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
		default:
			// Removes the redundant copy instruction (src and dst were
			// allocated to the same real register).
			if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() {
				prev, next := cur.prev, cur.next
				// Remove the copy instruction by splicing it out of the
				// doubly-linked list; iteration continues via cur.next which
				// still points at `next`.
				// NOTE(review): assumes prev != nil, i.e. the root
				// instruction is never a self-copy — confirm.
				prev.next = next
				if next != nil {
					next.prev = prev
				}
			}
		}
	}
}
   227  
// setupEpilogueAfter inserts the epilogue instructions right after cur (the
// instruction immediately preceding a ret). It undoes the prologue in reverse
// order: pops the frame_size slot, frees the spill-slot area, restores the
// clobbered registers, reloads the return address (lr), and finally frees the
// argument/result area so SP is back to its value at function entry.
func (m *machine) setupEpilogueAfter(cur *instruction) {
	// Remember the original successor (the ret) to re-link at the end.
	prevNext := cur.next

	// We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore.
	cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true)

	if s := m.spillSlotSize; s > 0 {
		// Adjust SP to the original value:
		//
		//            (high address)                        (high address)
		//          +-----------------+                  +-----------------+
		//          |     .......     |                  |     .......     |
		//          |      ret Y      |                  |      ret Y      |
		//          |     .......     |                  |     .......     |
		//          |      ret 0      |                  |      ret 0      |
		//          |      arg X      |                  |      arg X      |
		//          |     .......     |                  |     .......     |
		//          |      arg 1      |                  |      arg 1      |
		//          |      arg 0      |                  |      arg 0      |
		//          |      xxxxx      |                  |      xxxxx      |
		//          |   ReturnAddress |                  |   ReturnAddress |
		//          +-----------------+      ====>       +-----------------+
		//          |    clobbered M  |                  |    clobbered M  |
		//          |   ............  |                  |   ............  |
		//          |    clobbered 1  |                  |    clobbered 1  |
		//          |    clobbered 0  |                  |    clobbered 0  |
		//          |   spill slot N  |                  +-----------------+ <---- SP
		//          |   ............  |
		//          |   spill slot 0  |
		//   SP---> +-----------------+
		//             (low address)
		//
		cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
	}

	// First we need to restore the clobbered registers.
	if len(m.clobberedRegs) > 0 {
		//            (high address)
		//          +-----------------+                      +-----------------+
		//          |     .......     |                      |     .......     |
		//          |      ret Y      |                      |      ret Y      |
		//          |     .......     |                      |     .......     |
		//          |      ret 0      |                      |      ret 0      |
		//          |      arg X      |                      |      arg X      |
		//          |     .......     |                      |     .......     |
		//          |      arg 1      |                      |      arg 1      |
		//          |      arg 0      |                      |      arg 0      |
		//          |      xxxxx      |                      |      xxxxx      |
		//          |   ReturnAddress |                      |   ReturnAddress |
		//          +-----------------+      ========>       +-----------------+ <---- SP
		//          |   clobbered M   |
		//          |   ...........   |
		//          |   clobbered 1   |
		//          |   clobbered 0   |
		//   SP---> +-----------------+
		//             (low address)

		// Pop in the reverse order of the prologue's pushes so each register
		// is reloaded from the slot it was saved to.
		l := len(m.clobberedRegs) - 1
		for i := range m.clobberedRegs {
			vr := m.clobberedRegs[l-i] // reverse order to restore.
			load := m.allocateInstr()
			amode := addressModePreOrPostIndex(spVReg,
				16,    // stack pointer must be 16-byte aligned.
				false, // Increment after store.
			)
			// TODO: pair loads to reduce the number of instructions.
			switch regTypeToRegisterSizeInBits(vr.RegType()) {
			case 64: // save int reg.
				load.asULoad(operandNR(vr), amode, 64)
			case 128: // save vector reg.
				load.asFpuLoad(operandNR(vr), amode, 128)
			}
			cur = linkInstr(cur, load)
		}
	}

	// Reload the return address (lr).
	//
	//            +-----------------+          +-----------------+
	//            |     .......     |          |     .......     |
	//            |      ret Y      |          |      ret Y      |
	//            |     .......     |          |     .......     |
	//            |      ret 0      |          |      ret 0      |
	//            |      arg X      |          |      arg X      |
	//            |     .......     |   ===>   |     .......     |
	//            |      arg 1      |          |      arg 1      |
	//            |      arg 0      |          |      arg 0      |
	//            |      xxxxx      |          +-----------------+ <---- SP
	//            |  ReturnAddress  |
	//    SP----> +-----------------+

	ldr := m.allocateInstr()
	ldr.asULoad(operandNR(lrVReg),
		addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
	cur = linkInstr(cur, ldr)

	// Free the argument/result area allocated by the prologue, if any.
	if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
		cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
	}

	// Re-attach the ret instruction after the epilogue.
	linkInstr(cur, prevNext)
}
   330  
// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient
// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers except for x0,
// which always points to the execution context whenever the native code is entered from Go.
var saveRequiredRegs = []regalloc.VReg{
	x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg,
	x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg,
	v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg,
	v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
}
   340  
// insertStackBoundsCheck will insert the instructions after `cur` to check the
// stack bounds, and if there's no sufficient spaces required for the function,
// exit the execution and try growing it in Go world.
//
// TODO: we should be able to share the instructions across all the functions to reduce the size of compiled executable.
func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
	// SP must stay 16-byte aligned, so the required size must be a multiple of 16.
	if requiredStackSize%16 != 0 {
		panic("BUG")
	}

	// Compute tmp = sp - requiredStackSize, using an immediate form when the
	// size fits in an imm12 operand.
	if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok {
		// sub tmp, sp, #requiredStackSize
		sub := m.allocateInstr()
		sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true)
		cur = linkInstr(cur, sub)
	} else {
		// This case, we first load the requiredStackSize into the temporary register,
		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
		// Then subtract it.
		sub := m.allocateInstr()
		sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true)
		cur = linkInstr(cur, sub)
	}

	tmp2 := x11VReg // Caller save, so it is safe to use it here in the prologue.

	// ldr tmp2, [executionContext #StackBottomPtr]
	ldr := m.allocateInstr()
	ldr.asULoad(operandNR(tmp2), addressMode{
		kind: addressModeKindRegUnsignedImm12,
		rn:   x0VReg, // execution context is always the first argument.
		imm:  wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(),
	}, 64)
	cur = linkInstr(cur, ldr)

	// subs xzr, tmp, tmp2 -- compare (sp - requiredStackSize) with the stack bottom.
	subs := m.allocateInstr()
	subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true)
	cur = linkInstr(cur, subs)

	// b.ge #imm -- skip the grow-call sequence below when there is enough
	// space. The branch offset is resolved at the end of this function once
	// the total length of the sequence is known.
	cbr := m.allocateInstr()
	cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */)
	cur = linkInstr(cur, cbr)

	// Set the required stack size and set it to the exec context.
	{
		// First load the requiredStackSize into the temporary register,
		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
		setRequiredStackSize := m.allocateInstr()
		setRequiredStackSize.asStore(operandNR(tmpRegVReg),
			addressMode{
				kind: addressModeKindRegUnsignedImm12,
				// Execution context is always the first argument.
				rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
			}, 64)

		cur = linkInstr(cur, setRequiredStackSize)
	}

	// Load the address of the stack-grow trampoline from the execution context.
	ldrAddress := m.allocateInstr()
	ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{
		kind: addressModeKindRegUnsignedImm12,
		rn:   x0VReg, // execution context is always the first argument
		imm:  wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(),
	}, 64)
	cur = linkInstr(cur, ldrAddress)

	// Then jumps to the stack grow call sequence's address, meaning
	// transferring the control to the code compiled by CompileStackGrowCallSequence.
	bl := m.allocateInstr()
	bl.asCallIndirect(tmpRegVReg, nil)
	cur = linkInstr(cur, bl)

	// Now that we know the entire code, we can finalize how many bytes
	// we have to skip when the stack size is sufficient.
	var cbrOffset int64
	for _cur := cbr; ; _cur = _cur.next {
		cbrOffset += _cur.size()
		if _cur == cur {
			break
		}
	}
	cbr.condBrOffsetResolve(cbrOffset)
	return cur
}
   427  
// CompileStackGrowCallSequence implements backend.Machine.
//
// It builds and encodes the shared instruction sequence that the stack bounds
// check branches to when there is insufficient stack space: it saves the
// callee-saved and argument registers into the execution context, records the
// current stack pointer, sets the exit status to ExitCodeGrowStack, and exits
// to the Go world; once execution resumes here, it restores the saved
// registers and returns to the caller of this sequence. The encoded machine
// code bytes are returned.
func (m *machine) CompileStackGrowCallSequence() []byte {
	ectx := m.executableContext

	// Start the instruction list with a nop as the root.
	cur := m.allocateInstr()
	cur.asNop0()
	ectx.RootInstr = cur

	// Save the callee saved and argument registers.
	cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs)

	// Save the current stack pointer.
	cur = m.saveCurrentStackPointer(cur, x0VReg)

	// Set the exit status on the execution context.
	cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack)

	// Exit the execution.
	cur = m.storeReturnAddressAndExit(cur)

	// After the exit, restore the saved registers.
	cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs)

	// Then goes back the original address of this stack grow call.
	ret := m.allocateInstr()
	ret.asRet()
	linkInstr(cur, ret)

	m.encode(ectx.RootInstr)
	return m.compiler.Buf()
}
   459  
   460  func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction {
   461  	ectx := m.executableContext
   462  
   463  	ectx.PendingInstructions = ectx.PendingInstructions[:0]
   464  	m.insertAddOrSubStackPointer(rd, diff, add)
   465  	for _, inserted := range ectx.PendingInstructions {
   466  		cur = linkInstr(cur, inserted)
   467  	}
   468  	return cur
   469  }