github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go

package amd64

import (
	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)

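// calleeSavedVRegs is the set of registers saved into, and later restored from,
// the execution context around the Go function call in CompileGoFunctionTrampoline below.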
var calleeSavedVRegs = []regalloc.VReg{
	rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg,
	xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg,
}

// CompileGoFunctionTrampoline implements backend.Machine.
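//
// The returned trampoline bridges from machine code into a Go-implemented function:
// it copies the Wasm-level arguments into a stack region laid out like a Go []uint64,
// exits the execution with the given exitCode so that the Go side can invoke the
// function, and copies the results back into the return-value registers and stack
// slots once execution resumes.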
func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte {
	ectx := m.ectx
	argBegin := 1 // Skips the execution context by default.
	if needModuleContextPtr {
		argBegin++
	}

	abi := &backend.FunctionABI{}
	abi.Init(sig, intArgResultRegs, floatArgResultRegs)
	m.currentABI = abi

	cur := m.allocateNop()
	ectx.RootInstr = cur

	// Execution context is always the first argument.
	execCtxPtr := raxVReg

	// First we update RBP and RSP just like the normal prologue.
	//
	//                   (high address)                     (high address)
	//       RBP ----> +-----------------+                +-----------------+
	//                 |     .......     |                |     .......     |
	//                 |      ret Y      |                |      ret Y      |
	//                 |     .......     |                |     .......     |
	//                 |      ret 0      |                |      ret 0      |
	//                 |      arg X      |                |      arg X      |
	//                 |     .......     |     ====>      |     .......     |
	//                 |      arg 1      |                |      arg 1      |
	//                 |      arg 0      |                |      arg 0      |
	//                 |   Return Addr   |                |   Return Addr   |
	//       RSP ----> +-----------------+                |    Caller_RBP   |
	//                    (low address)                   +-----------------+ <----- RSP, RBP
	//
	cur = m.setupRBPRSP(cur)

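	// goSliceSizeAligned is the size of the arg/ret region rounded up for stack
	// alignment; goSliceSizeUnaligned is the exact size of that region, and is what
	// gets pushed below as the slice size.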
	goSliceSizeAligned, goSliceSizeUnaligned := backend.GoFunctionCallRequiredStackSize(sig, argBegin)
	cur = m.insertStackBoundsCheck(goSliceSizeAligned+8 /* +8 for the slice size slot */, cur)

	// Save the callee-saved registers.
	cur = m.saveRegistersInExecutionContext(cur, execCtxPtr, calleeSavedVRegs)

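	// Stash the module context pointer into the execution context, so that the
	// Go-side handler can identify which module's function is being called.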
	if needModuleContextPtr {
		moduleCtxPtr := rbxVReg // Module context is always the second argument.
		mem := m.newAmodeImmReg(
			wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.U32(),
			execCtxPtr)
		store := m.allocateInstr().asMovRM(moduleCtxPtr, newOperandMem(mem), 8)
		cur = linkInstr(cur, store)
	}

	// Now let's advance the RSP to the stack slot for the arguments.
	//
	//                (high address)                     (high address)
	//              +-----------------+               +-----------------+
	//              |     .......     |               |     .......     |
	//              |      ret Y      |               |      ret Y      |
	//              |     .......     |               |     .......     |
	//              |      ret 0      |               |      ret 0      |
	//              |      arg X      |               |      arg X      |
	//              |     .......     |   =======>    |     .......     |
	//              |      arg 1      |               |      arg 1      |
	//              |      arg 0      |               |      arg 0      |
	//              |   Return Addr   |               |   Return Addr   |
	//              |    Caller_RBP   |               |    Caller_RBP   |
	//  RBP,RSP --> +-----------------+               +-----------------+ <----- RBP
	//                 (low address)                  |  arg[N]/ret[M]  |
	//                                                |    ..........   |
	//                                                |  arg[1]/ret[1]  |
	//                                                |  arg[0]/ret[0]  |
	//                                                +-----------------+ <----- RSP
	//                                                   (low address)
	//
	// where the "arg[0]/ret[0] ... arg[N]/ret[M]" region is the stack used by the Go
	// function, accessed as an ordinary []uint64. That is where we pass the arguments
	// to, and receive the return values from, the Go function.
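	// This is equivalent to `sub $goSliceSizeAligned, %rsp`.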
	cur = m.addRSP(-int32(goSliceSizeAligned), cur)

	// Next, we store all the arguments into the stack region, in the typical Wasm stack style.
	var offsetInGoSlice int32
	for i := range abi.Args[argBegin:] {
		arg := &abi.Args[argBegin+i]
		var v regalloc.VReg
		if arg.Kind == backend.ABIArgKindReg {
			v = arg.Reg
		} else {
			// The callee-saved registers are already saved above, so we can use them as scratch.
			if arg.Type.IsInt() {
				v = r15VReg
			} else {
				v = xmm15VReg
			}
			mem := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg))
			load := m.allocateInstr()
			switch arg.Type {
			case ssa.TypeI32:
				load.asMovzxRmR(extModeLQ, mem, v)
			case ssa.TypeI64:
				load.asMov64MR(mem, v)
			case ssa.TypeF32:
				load.asXmmUnaryRmR(sseOpcodeMovss, mem, v)
			case ssa.TypeF64:
				load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v)
			case ssa.TypeV128:
				load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
			default:
				panic("BUG")
			}
			cur = linkInstr(cur, load)
		}

		store := m.allocateInstr()
		mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg))
		switch arg.Type {
		case ssa.TypeI32:
			store.asMovRM(v, mem, 4)
			offsetInGoSlice += 8 // always uint64 rep.
		case ssa.TypeI64:
			store.asMovRM(v, mem, 8)
			offsetInGoSlice += 8
		case ssa.TypeF32:
			store.asXmmMovRM(sseOpcodeMovss, v, mem)
			offsetInGoSlice += 8 // always uint64 rep.
		case ssa.TypeF64:
			store.asXmmMovRM(sseOpcodeMovsd, v, mem)
			offsetInGoSlice += 8
		case ssa.TypeV128:
			store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
			offsetInGoSlice += 16
		default:
			panic("BUG")
		}
		cur = linkInstr(cur, store)
	}
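	// For example, a Wasm signature (param i32 f64 v128) yields a region laid out as
	// [arg0: 8 bytes][arg1: 8 bytes][arg2: 16 bytes]: every scalar occupies a full
	// uint64 slot regardless of its type, and only v128 values take two slots.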

	// Finally we push the size of the slice to the stack so the stack looks like:
	//
	//          (high address)
	//       +-----------------+
	//       |     .......     |
	//       |      ret Y      |
	//       |     .......     |
	//       |      ret 0      |
	//       |      arg X      |
	//       |     .......     |
	//       |      arg 1      |
	//       |      arg 0      |
	//       |   Return Addr   |
	//       |    Caller_RBP   |
	//       +-----------------+ <----- RBP
	//       |  arg[N]/ret[M]  |
	//       |    ..........   |
	//       |  arg[1]/ret[1]  |
	//       |  arg[0]/ret[0]  |
	//       |    slice size   |
	//       +-----------------+ <----- RSP
	//         (low address)
	//
	//	push $sliceSize
	cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandImm32(uint32(goSliceSizeUnaligned))))
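	// The Go side reads this word to size the []uint64 view over the arg/ret region above.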

	// Load the exitCode to the register.
	exitCodeReg := r12VReg // Callee-saved, and already saved above.
	cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(exitCode), false))

	saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtxPtr, exitCodeReg)
	cur = linkInstr(cur, setExitCode)
	cur = linkInstr(cur, saveRsp)
	cur = linkInstr(cur, saveRbp)
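	// The RSP/RBP values recorded here let the Go side locate this frame (and the
	// arg/ret region) while the machine code is suspended.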

	// Ready to exit the execution.
	cur = m.storeReturnAddressAndExit(cur, execCtxPtr)

	// We don't need the slice size anymore, so pop it.
	cur = m.addRSP(8, cur)

	// Ready to set up the results.
	offsetInGoSlice = 0
	// To avoid the execution context pointer (RAX) being overwritten by a result
	// before we are done with it, we remember that result's offset here and defer
	// loading it until the end of this function.
	var argOverlapWithExecCtxOffset int32 = -1
	for i := range abi.Rets {
		r := &abi.Rets[i]
		var v regalloc.VReg
		isRegResult := r.Kind == backend.ABIArgKindReg
		if isRegResult {
			v = r.Reg
			if v.RealReg() == execCtxPtr.RealReg() {
				argOverlapWithExecCtxOffset = offsetInGoSlice
				offsetInGoSlice += 8 // always uint64 rep.
				continue
			}
		} else {
			if r.Type.IsInt() {
				v = r15VReg
			} else {
				v = xmm15VReg
			}
		}

		load := m.allocateInstr()
		mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg))
		switch r.Type {
		case ssa.TypeI32:
			load.asMovzxRmR(extModeLQ, mem, v)
			offsetInGoSlice += 8 // always uint64 rep.
		case ssa.TypeI64:
			load.asMov64MR(mem, v)
			offsetInGoSlice += 8
		case ssa.TypeF32:
			load.asXmmUnaryRmR(sseOpcodeMovss, mem, v)
			offsetInGoSlice += 8 // always uint64 rep.
		case ssa.TypeF64:
			load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v)
			offsetInGoSlice += 8
		case ssa.TypeV128:
			load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
			offsetInGoSlice += 16
		default:
			panic("BUG")
		}
		cur = linkInstr(cur, load)

		if !isRegResult {
			// We need to store it back into the result slot above rbp.
			store := m.allocateInstr()
			mem := newOperandMem(m.newAmodeImmReg(uint32(abi.ArgStackSize+r.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg))
			switch r.Type {
			case ssa.TypeI32:
				store.asMovRM(v, mem, 4)
			case ssa.TypeI64:
				store.asMovRM(v, mem, 8)
			case ssa.TypeF32:
				store.asXmmMovRM(sseOpcodeMovss, v, mem)
			case ssa.TypeF64:
				store.asXmmMovRM(sseOpcodeMovsd, v, mem)
			case ssa.TypeV128:
				store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
			default:
				panic("BUG")
			}
			cur = linkInstr(cur, store)
		}
	}

	// Before returning, restore the callee-saved registers.
	cur = m.restoreRegistersInExecutionContext(cur, execCtxPtr, calleeSavedVRegs)

	if argOverlapWithExecCtxOffset >= 0 {
		// At this point, execCtxPtr is no longer used, so we can finally load the
		// deferred result into the register that overlaps with the execution context pointer.
		mem := newOperandMem(m.newAmodeImmReg(uint32(argOverlapWithExecCtxOffset), rspVReg))
		load := m.allocateInstr().asMov64MR(mem, execCtxPtr)
		cur = linkInstr(cur, load)
	}

	// Finally ready to return.
	cur = m.revertRBPRSP(cur)
	linkInstr(cur, m.allocateInstr().asRet())

	m.encodeWithoutSSA(ectx.RootInstr)
	return m.c.Buf()
}

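// saveRegistersInExecutionContext emits a store of each register in regs into the
// saved-registers area of the execution context pointed to by execCtx.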
func (m *machine) saveRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction {
	offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
	for _, v := range regs {
		store := m.allocateInstr()
		mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx))
		switch v.RegType() {
		case regalloc.RegTypeInt:
			store.asMovRM(v, mem, 8)
		case regalloc.RegTypeFloat:
			store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
		default:
			panic("BUG")
		}
		cur = linkInstr(cur, store)
		offset += 16 // See the execution context struct: each register slot is unconditionally 16-byte aligned.
	}
	return cur
}

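// restoreRegistersInExecutionContext is the inverse of saveRegistersInExecutionContext:
// it reloads each register in regs from the saved-registers area of the execution context.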
func (m *machine) restoreRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction {
	offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
	for _, v := range regs {
		load := m.allocateInstr()
		mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx))
		switch v.RegType() {
		case regalloc.RegTypeInt:
			load.asMov64MR(mem, v)
		case regalloc.RegTypeFloat:
			load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
		default:
			panic("BUG")
		}
		cur = linkInstr(cur, load)
		offset += 16 // See the execution context struct: each register slot is unconditionally 16-byte aligned.
	}
	return cur
}

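// storeReturnAddressAndExit emits code that records the address of the instruction
// following the exit sequence (via LEA on a label) as the Go-call return address in
// the execution context, and then emits the exit sequence itself; execution resumes
// at the recorded address once the Go side returns.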
func (m *machine) storeReturnAddressAndExit(cur *instruction, execCtx regalloc.VReg) *instruction {
	readRip := m.allocateInstr()
	cur = linkInstr(cur, readRip)

	ripReg := r12VReg // Callee-saved, and already saved above.
	saveRip := m.allocateInstr().asMovRM(
		ripReg,
		newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)),
		8,
	)
	cur = linkInstr(cur, saveRip)

	exit := m.allocateExitSeq(execCtx)
	cur = linkInstr(cur, exit)

	nop, l := m.allocateBrTarget()
	cur = linkInstr(cur, nop)
	readRip.asLEA(newOperandLabel(l), ripReg)
	return cur
}

// stackGrowSaveVRegs is the set of registers that must be saved and restored while
// growing the stack when there is insufficient stack space left. This is essentially
// all the allocatable registers except for RSP, RBP, and RAX: RAX holds the execution
// context pointer, which is always the first argument, so it does not need saving.
var stackGrowSaveVRegs = []regalloc.VReg{
	rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg,
	rcxVReg, rbxVReg, rsiVReg, rdiVReg, r8VReg, r9VReg, r10VReg, r11VReg,
	xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg,
	xmm0VReg, xmm1VReg, xmm2VReg, xmm3VReg, xmm4VReg, xmm5VReg, xmm6VReg, xmm7VReg,
}

// CompileStackGrowCallSequence implements backend.Machine.
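//
// The returned sequence saves all the allocatable registers into the execution
// context, exits to Go with ExitCodeGrowStack so that the Go side can grow the
// stack, and restores the registers before returning to the caller.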
func (m *machine) CompileStackGrowCallSequence() []byte {
	ectx := m.ectx

	cur := m.allocateNop()
	ectx.RootInstr = cur

	cur = m.setupRBPRSP(cur)

	// Execution context is always the first argument.
	execCtxPtr := raxVReg

	// Save the callee-saved and argument registers.
	cur = m.saveRegistersInExecutionContext(cur, execCtxPtr, stackGrowSaveVRegs)

	// Load the exitCode to the register.
	exitCodeReg := r12VReg // Already saved.
	cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(wazevoapi.ExitCodeGrowStack), false))

	saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtxPtr, exitCodeReg)
	cur = linkInstr(cur, setExitCode)
	cur = linkInstr(cur, saveRsp)
	cur = linkInstr(cur, saveRbp)

	// Ready to exit the execution.
	cur = m.storeReturnAddressAndExit(cur, execCtxPtr)

	// After the exit, restore the saved registers.
	cur = m.restoreRegistersInExecutionContext(cur, execCtxPtr, stackGrowSaveVRegs)

	// Finally ready to return.
	cur = m.revertRBPRSP(cur)
	linkInstr(cur, m.allocateInstr().asRet())

	m.encodeWithoutSSA(ectx.RootInstr)
	return m.c.Buf()
}

// insertStackBoundsCheck inserts instructions after `cur` that check the stack
// bounds; if there is not enough stack space for the function, they exit the
// execution so that the stack can be grown in the Go world.
func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
	//	sub $requiredStackSize, %rsp ;; Temporarily update the sp.
	//	cmp ExecutionContextOffsetStackBottomPtr(%rax), %rsp ;; Compare the stack bottom and the sp.
	//	ja .ok
	//	add $requiredStackSize, %rsp ;; Reverse the temporary update.
	//	pushq %r15 ;; Save the temporary.
	//	mov $requiredStackSize, %r15
	//	mov %r15, ExecutionContextOffsetStackGrowRequiredSize(%rax) ;; Set the required size in the execution context.
	//	popq %r15 ;; Restore the temporary.
	//	callq *ExecutionContextOffsetStackGrowCallTrampolineAddress(%rax) ;; Call the Go function to grow the stack.
	//	jmp .cont
	// .ok:
	//	add $requiredStackSize, %rsp ;; Reverse the temporary update.
	// .cont:
	cur = m.addRSP(-int32(requiredStackSize), cur)
	cur = linkInstr(cur, m.allocateInstr().asCmpRmiR(true,
		newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackBottomPtr.U32(), raxVReg)),
		rspVReg, true))

	ja := m.allocateInstr()
	cur = linkInstr(cur, ja)

	cur = m.addRSP(int32(requiredStackSize), cur)

	// Save the temporary.
	cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r15VReg)))
	// Load the required size into the temporary.
	cur = linkInstr(cur, m.allocateInstr().asImm(r15VReg, uint64(requiredStackSize), true))
	// Set the required size in the execution context.
	cur = linkInstr(cur, m.allocateInstr().asMovRM(r15VReg,
		newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.U32(), raxVReg)), 8))
	// Restore the temporary.
	cur = linkInstr(cur, m.allocateInstr().asPop64(r15VReg))
	// Call the Go function to grow the stack.
	cur = linkInstr(cur, m.allocateInstr().asCallIndirect(newOperandMem(m.newAmodeImmReg(
		wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.U32(), raxVReg)), nil))
	// Jump to the continuation.
	jmpToCont := m.allocateInstr()
	cur = linkInstr(cur, jmpToCont)

	// .ok:
	okInstr, ok := m.allocateBrTarget()
	cur = linkInstr(cur, okInstr)
	ja.asJmpIf(condNBE, newOperandLabel(ok))
	// On the ok path, we only need to reverse the temporary update.
	cur = m.addRSP(int32(requiredStackSize), cur)

	// .cont:
	contInstr, cont := m.allocateBrTarget()
	cur = linkInstr(cur, contInstr)
	jmpToCont.asJmp(newOperandLabel(cont))

	return cur
}