github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go (about)

     1  package amd64
     2  
     3  import (
     4  	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
     5  	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
     6  )
     7  
// PostRegAlloc implements backend.Machine.
//
// It first inserts the function prologue (setupPrologue) and then walks the
// whole instruction list (postRegAlloc) to insert the epilogues and perform the
// remaining lowering that is done after register allocation.
func (m *machine) PostRegAlloc() {
	m.setupPrologue()
	m.postRegAlloc()
}
    13  
    14  func (m *machine) setupPrologue() {
    15  	cur := m.ectx.RootInstr
    16  	prevInitInst := cur.next
    17  
    18  	// At this point, we have the stack layout as follows:
    19  	//
    20  	//                   (high address)
    21  	//                 +-----------------+ <----- RBP (somewhere in the middle of the stack)
    22  	//                 |     .......     |
    23  	//                 |      ret Y      |
    24  	//                 |     .......     |
    25  	//                 |      ret 0      |
    26  	//                 |      arg X      |
    27  	//                 |     .......     |
    28  	//                 |      arg 1      |
    29  	//                 |      arg 0      |
    30  	//                 |   Return Addr   |
    31  	//       RSP ----> +-----------------+
    32  	//                    (low address)
    33  
    34  	// First, we push the RBP, and update the RBP to the current RSP.
    35  	//
    36  	//                   (high address)                     (high address)
    37  	//       RBP ----> +-----------------+                +-----------------+
    38  	//                 |     .......     |                |     .......     |
    39  	//                 |      ret Y      |                |      ret Y      |
    40  	//                 |     .......     |                |     .......     |
    41  	//                 |      ret 0      |                |      ret 0      |
    42  	//                 |      arg X      |                |      arg X      |
    43  	//                 |     .......     |     ====>      |     .......     |
    44  	//                 |      arg 1      |                |      arg 1      |
    45  	//                 |      arg 0      |                |      arg 0      |
    46  	//                 |   Return Addr   |                |   Return Addr   |
    47  	//       RSP ----> +-----------------+                |    Caller_RBP   |
    48  	//                    (low address)                   +-----------------+ <----- RSP, RBP
    49  	//
    50  	cur = m.setupRBPRSP(cur)
    51  
    52  	if !m.stackBoundsCheckDisabled {
    53  		cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
    54  	}
    55  
    56  	//
    57  	//            (high address)
    58  	//          +-----------------+                  +-----------------+
    59  	//          |     .......     |                  |     .......     |
    60  	//          |      ret Y      |                  |      ret Y      |
    61  	//          |     .......     |                  |     .......     |
    62  	//          |      ret 0      |                  |      ret 0      |
    63  	//          |      arg X      |                  |      arg X      |
    64  	//          |     .......     |                  |     .......     |
    65  	//          |      arg 1      |                  |      arg 1      |
    66  	//          |      arg 0      |                  |      arg 0      |
    67  	//          |      xxxxx      |                  |      xxxxx      |
    68  	//          |   Return Addr   |                  |   Return Addr   |
    69  	//          |    Caller_RBP   |      ====>       |    Caller_RBP   |
    70  	// RBP,RSP->+-----------------+                  +-----------------+ <----- RBP
    71  	//             (low address)                     |   clobbered M   |
    72  	//                                               |   clobbered 1   |
    73  	//                                               |   ...........   |
    74  	//                                               |   clobbered 0   |
    75  	//                                               +-----------------+ <----- RSP
    76  	//
    77  	if regs := m.clobberedRegs; len(regs) > 0 {
    78  		for i := range regs {
    79  			r := regs[len(regs)-1-i] // Reverse order.
    80  			if r.RegType() == regalloc.RegTypeInt {
    81  				cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r)))
    82  			} else {
    83  				// Push the XMM register is not supported by the PUSH instruction.
    84  				cur = m.addRSP(-16, cur)
    85  				push := m.allocateInstr().asXmmMovRM(
    86  					sseOpcodeMovdqu, r, newOperandMem(m.newAmodeImmReg(0, rspVReg)),
    87  				)
    88  				cur = linkInstr(cur, push)
    89  			}
    90  		}
    91  	}
    92  
    93  	if size := m.spillSlotSize; size > 0 {
    94  		// Simply decrease the RSP to allocate the spill slots.
    95  		// 		sub $size, %rsp
    96  		cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(size)), rspVReg, true))
    97  
    98  		// At this point, we have the stack layout as follows:
    99  		//
   100  		//            (high address)
   101  		//          +-----------------+
   102  		//          |     .......     |
   103  		//          |      ret Y      |
   104  		//          |     .......     |
   105  		//          |      ret 0      |
   106  		//          |      arg X      |
   107  		//          |     .......     |
   108  		//          |      arg 1      |
   109  		//          |      arg 0      |
   110  		//          |   ReturnAddress |
   111  		//          |   Caller_RBP    |
   112  		//          +-----------------+ <--- RBP
   113  		//          |    clobbered M  |
   114  		//          |   ............  |
   115  		//          |    clobbered 1  |
   116  		//          |    clobbered 0  |
   117  		//          |   spill slot N  |
   118  		//          |   ............  |
   119  		//          |   spill slot 0  |
   120  		//          +-----------------+ <--- RSP
   121  		//             (low address)
   122  	}
   123  
   124  	linkInstr(cur, prevInitInst)
   125  }
   126  
   127  // postRegAlloc does multiple things while walking through the instructions:
   128  // 1. Inserts the epilogue code.
   129  // 2. Removes the redundant copy instruction.
   130  // 3. Inserts the dec/inc RSP instruction right before/after the call instruction.
   131  // 4. Lowering that is supposed to be done after regalloc.
   132  func (m *machine) postRegAlloc() {
   133  	ectx := m.ectx
   134  	for cur := ectx.RootInstr; cur != nil; cur = cur.next {
   135  		switch k := cur.kind; k {
   136  		case ret:
   137  			m.setupEpilogueAfter(cur.prev)
   138  			continue
   139  		case fcvtToSintSequence, fcvtToUintSequence:
   140  			m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
   141  			if k == fcvtToSintSequence {
   142  				m.lowerFcvtToSintSequenceAfterRegalloc(cur)
   143  			} else {
   144  				m.lowerFcvtToUintSequenceAfterRegalloc(cur)
   145  			}
   146  			prev := cur.prev
   147  			next := cur.next
   148  			cur := prev
   149  			for _, instr := range m.ectx.PendingInstructions {
   150  				cur = linkInstr(cur, instr)
   151  			}
   152  			linkInstr(cur, next)
   153  			continue
   154  		case xmmCMov:
   155  			m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
   156  			m.lowerXmmCmovAfterRegAlloc(cur)
   157  			prev := cur.prev
   158  			next := cur.next
   159  			cur := prev
   160  			for _, instr := range m.ectx.PendingInstructions {
   161  				cur = linkInstr(cur, instr)
   162  			}
   163  			linkInstr(cur, next)
   164  			continue
   165  		case idivRemSequence:
   166  			m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
   167  			m.lowerIDivRemSequenceAfterRegAlloc(cur)
   168  			prev := cur.prev
   169  			next := cur.next
   170  			cur := prev
   171  			for _, instr := range m.ectx.PendingInstructions {
   172  				cur = linkInstr(cur, instr)
   173  			}
   174  			linkInstr(cur, next)
   175  			continue
   176  		case call, callIndirect:
   177  			// At this point, reg alloc is done, therefore we can safely insert dec/inc RPS instruction
   178  			// right before/after the call instruction. If this is done before reg alloc, the stack slot
   179  			// can point to the wrong location and therefore results in a wrong value.
   180  			call := cur
   181  			next := call.next
   182  			_, _, _, _, size := backend.ABIInfoFromUint64(call.u2)
   183  			if size > 0 {
   184  				dec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(size), rspVReg, true)
   185  				linkInstr(call.prev, dec)
   186  				linkInstr(dec, call)
   187  				inc := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(size), rspVReg, true)
   188  				linkInstr(call, inc)
   189  				linkInstr(inc, next)
   190  			}
   191  			continue
   192  		}
   193  
   194  		// Removes the redundant copy instruction.
   195  		if cur.IsCopy() && cur.op1.reg().RealReg() == cur.op2.reg().RealReg() {
   196  			prev, next := cur.prev, cur.next
   197  			// Remove the copy instruction.
   198  			prev.next = next
   199  			if next != nil {
   200  				next.prev = prev
   201  			}
   202  		}
   203  	}
   204  }
   205  
   206  func (m *machine) setupEpilogueAfter(cur *instruction) {
   207  	prevNext := cur.next
   208  
   209  	// At this point, we have the stack layout as follows:
   210  	//
   211  	//            (high address)
   212  	//          +-----------------+
   213  	//          |     .......     |
   214  	//          |      ret Y      |
   215  	//          |     .......     |
   216  	//          |      ret 0      |
   217  	//          |      arg X      |
   218  	//          |     .......     |
   219  	//          |      arg 1      |
   220  	//          |      arg 0      |
   221  	//          |   ReturnAddress |
   222  	//          |   Caller_RBP    |
   223  	//          +-----------------+ <--- RBP
   224  	//          |    clobbered M  |
   225  	//          |   ............  |
   226  	//          |    clobbered 1  |
   227  	//          |    clobbered 0  |
   228  	//          |   spill slot N  |
   229  	//          |   ............  |
   230  	//          |   spill slot 0  |
   231  	//          +-----------------+ <--- RSP
   232  	//             (low address)
   233  
   234  	if size := m.spillSlotSize; size > 0 {
   235  		// Simply increase the RSP to free the spill slots.
   236  		// 		add $size, %rsp
   237  		cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(uint32(size)), rspVReg, true))
   238  	}
   239  
   240  	//
   241  	//             (high address)
   242  	//            +-----------------+                     +-----------------+
   243  	//            |     .......     |                     |     .......     |
   244  	//            |      ret Y      |                     |      ret Y      |
   245  	//            |     .......     |                     |     .......     |
   246  	//            |      ret 0      |                     |      ret 0      |
   247  	//            |      arg X      |                     |      arg X      |
   248  	//            |     .......     |                     |     .......     |
   249  	//            |      arg 1      |                     |      arg 1      |
   250  	//            |      arg 0      |                     |      arg 0      |
   251  	//            |   ReturnAddress |                     |   ReturnAddress |
   252  	//            |    Caller_RBP   |                     |    Caller_RBP   |
   253  	//   RBP ---> +-----------------+      ========>      +-----------------+ <---- RSP, RBP
   254  	//            |    clobbered M  |
   255  	//            |   ............  |
   256  	//            |    clobbered 1  |
   257  	//            |    clobbered 0  |
   258  	//   RSP ---> +-----------------+
   259  	//               (low address)
   260  	//
   261  	if regs := m.clobberedRegs; len(regs) > 0 {
   262  		for _, r := range regs {
   263  			if r.RegType() == regalloc.RegTypeInt {
   264  				cur = linkInstr(cur, m.allocateInstr().asPop64(r))
   265  			} else {
   266  				// Pop the XMM register is not supported by the POP instruction.
   267  				pop := m.allocateInstr().asXmmUnaryRmR(
   268  					sseOpcodeMovdqu, newOperandMem(m.newAmodeImmReg(0, rspVReg)), r,
   269  				)
   270  				cur = linkInstr(cur, pop)
   271  				cur = m.addRSP(16, cur)
   272  			}
   273  		}
   274  	}
   275  
   276  	// Now roll back the RSP to RBP, and pop the caller's RBP.
   277  	cur = m.revertRBPRSP(cur)
   278  
   279  	linkInstr(cur, prevNext)
   280  }
   281  
   282  func (m *machine) addRSP(offset int32, cur *instruction) *instruction {
   283  	if offset == 0 {
   284  		return cur
   285  	}
   286  	opcode := aluRmiROpcodeAdd
   287  	if offset < 0 {
   288  		opcode = aluRmiROpcodeSub
   289  		offset = -offset
   290  	}
   291  	return linkInstr(cur, m.allocateInstr().asAluRmiR(opcode, newOperandImm32(uint32(offset)), rspVReg, true))
   292  }
   293  
   294  func (m *machine) setupRBPRSP(cur *instruction) *instruction {
   295  	cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(rbpVReg)))
   296  	cur = linkInstr(cur, m.allocateInstr().asMovRR(rspVReg, rbpVReg, true))
   297  	return cur
   298  }
   299  
   300  func (m *machine) revertRBPRSP(cur *instruction) *instruction {
   301  	cur = linkInstr(cur, m.allocateInstr().asMovRR(rbpVReg, rspVReg, true))
   302  	cur = linkInstr(cur, m.allocateInstr().asPop64(rbpVReg))
   303  	return cur
   304  }