github.com/tetratelabs/wazero@v1.7.3-0.20240513003603-48f702e154b5/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go

package arm64

import (
    "fmt"

    "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
    "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)

// PostRegAlloc implements backend.Machine.
func (m *machine) PostRegAlloc() {
    m.setupPrologue()
    m.postRegAlloc()
}

// setupPrologue initializes the prologue of the function.
func (m *machine) setupPrologue() {
    ectx := m.executableContext

    cur := ectx.RootInstr
    prevInitInst := cur.next

    //
    //                 (high address)                     (high address)
    //       SP----> +-----------------+              +------------------+ <----+
    //               |     .......     |              |     .......      |      |
    //               |      ret Y      |              |      ret Y       |      |
    //               |     .......     |              |     .......      |      |
    //               |      ret 0      |              |      ret 0       |      |
    //               |      arg X      |              |      arg X       |      | size_of_arg_ret.
    //               |     .......     |    ====>     |     .......      |      |
    //               |      arg 1      |              |      arg 1       |      |
    //               |      arg 0      |              |      arg 0       | <----+
    //               |-----------------|              | size_of_arg_ret  |
    //                                                |  return address  |
    //                                                +------------------+ <---- SP
    //                 (low address)                      (low address)

    // Saves the return address (lr) and the size_of_arg_ret below the SP.
    // size_of_arg_ret is used for stack unwinding.
    cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)

    if !m.stackBoundsCheckDisabled {
        cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
    }

    // SP is decremented by spillSlotSize further below, so sanity-check that
    // a zero size is consistent with having no spill slots.
    if m.spillSlotSize == 0 && len(m.spillSlots) != 0 {
        panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots))
    }

    if regs := m.clobberedRegs; len(regs) > 0 {
        //
        //            (high address)                 (high address)
        //          +-----------------+           +-----------------+
        //          |     .......     |           |     .......     |
        //          |      ret Y      |           |      ret Y      |
        //          |     .......     |           |     .......     |
        //          |      ret 0      |           |      ret 0      |
        //          |      arg X      |           |      arg X      |
        //          |     .......     |           |     .......     |
        //          |      arg 1      |           |      arg 1      |
        //          |      arg 0      |           |      arg 0      |
        //          | size_of_arg_ret |           | size_of_arg_ret |
        //          |  ReturnAddress  |           |  ReturnAddress  |
        // SP---->  +-----------------+   ====>   +-----------------+
        //             (low address)              |   clobbered M   |
        //                                        |   ............  |
        //                                        |   clobbered 0   |
        //                                        +-----------------+ <----- SP
        //                                           (low address)
        //
        _amode := addressModePreOrPostIndex(spVReg,
            -16,  // stack pointer must be 16-byte aligned.
            true, // Decrement before store.
        )
        for _, vr := range regs {
            // TODO: pair stores to reduce the number of instructions.
            store := m.allocateInstr()
            store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType()))
            cur = linkInstr(cur, store)
        }
    }

    if size := m.spillSlotSize; size > 0 {
        // Check if size is 16-byte aligned.
        if size&0xf != 0 {
            panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size))
        }

        cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false)

        // At this point, the stack looks like:
        //
        //            (high address)
        //          +------------------+
        //          |     .......      |
        //          |      ret Y       |
        //          |     .......      |
        //          |      ret 0       |
        //          |      arg X       |
        //          |     .......      |
        //          |      arg 1       |
        //          |      arg 0       |
        //          | size_of_arg_ret  |
        //          |  ReturnAddress   |
        //          +------------------+
        //          |   clobbered M    |
        //          |   ............   |
        //          |   clobbered 0    |
        //          |   spill slot N   |
        //          |   ............   |
        //          |   spill slot 2   |
        //          |   spill slot 0   |
        //  SP----> +------------------+
        //             (low address)
    }
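
    // To illustrate, for a function with one clobbered integer register and
    // 32 bytes of spill slots, the prologue built so far corresponds roughly
    // to the following sequence (a sketch only; the bounds-check and constant
    // materialization instructions vary per function):
    //
    //	sub  sp, sp, #size_of_arg_ret   ;; only emitted when size_of_arg_ret > 0.
    //	stp  x30, xn, [sp, #-16]!       ;; save lr and size_of_arg_ret (xzr when it is zero).
    //	;; ... stack bounds check ...
    //	str  x19, [sp, #-16]!           ;; save the clobbered register.
    //	sub  sp, sp, #32                ;; allocate the spill slots.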

    // We push the frame size into the stack to make it possible to unwind the stack:
    //
    //
    //            (high address)                 (high address)
    //          +-----------------+           +-----------------+
    //          |     .......     |           |     .......     |
    //          |      ret Y      |           |      ret Y      |
    //          |     .......     |           |     .......     |
    //          |      ret 0      |           |      ret 0      |
    //          |      arg X      |           |      arg X      |
    //          |     .......     |           |     .......     |
    //          |      arg 1      |           |      arg 1      |
    //          |      arg 0      |           |      arg 0      |
    //          | size_of_arg_ret |           | size_of_arg_ret |
    //          |  ReturnAddress  |           |  ReturnAddress  |
    //          +-----------------+    ==>    +-----------------+ <----+
    //          |   clobbered M   |           |   clobbered M   |      |
    //          |   ............  |           |   ............  |      |
    //          |   clobbered 2   |           |   clobbered 2   |      |
    //          |   clobbered 1   |           |   clobbered 1   |      | frame size
    //          |   clobbered 0   |           |   clobbered 0   |      |
    //          |   spill slot N  |           |   spill slot N  |      |
    //          |   ............  |           |   ............  |      |
    //          |   spill slot 0  |           |   spill slot 0  | <----+
    //  SP--->  +-----------------+           |      xxxxxx     | ;; unused space to make it 16-byte aligned.
    //                                        |    frame_size   |
    //                                        +-----------------+ <---- SP
    //             (low address)
    //
    cur = m.createFrameSizeSlot(cur, m.frameSize())

    linkInstr(cur, prevInitInst)
}

func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction {
    // First we decrement the stack pointer to point to the arg0 slot.
    var sizeOfArgRetReg regalloc.VReg
    s := int64(m.currentABI.AlignedArgResultStackSlotSize())
    if s > 0 {
        cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
        sizeOfArgRetReg = tmpRegVReg

        subSp := m.allocateInstr()
        subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
        cur = linkInstr(cur, subSp)
    } else {
        sizeOfArgRetReg = xzrVReg
    }

    // Saves the return address (lr) and the size_of_arg_ret below the SP.
    // size_of_arg_ret is used for stack unwinding.
    pstr := m.allocateInstr()
    amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
    pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
    cur = linkInstr(cur, pstr)
    return cur
}

func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
    var frameSizeReg regalloc.VReg
    if s > 0 {
        cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
        frameSizeReg = tmpRegVReg
    } else {
        frameSizeReg = xzrVReg
    }
    _amode := addressModePreOrPostIndex(spVReg,
        -16,  // stack pointer must be 16-byte aligned.
        true, // Decrement before store.
    )
    store := m.allocateInstr()
    store.asStore(operandNR(frameSizeReg), _amode, 64)
    cur = linkInstr(cur, store)
    return cur
}

// postRegAlloc does multiple things while walking through the instructions:
//  1. Removes redundant copy instructions.
//  2. Inserts the epilogue.
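//
// For example, if the register allocator happens to assign the same real
// register to both the source and destination of a copy (making it a no-op
// such as a move from x5 to x5), the instruction is unlinked from the list.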
func (m *machine) postRegAlloc() {
    ectx := m.executableContext
    for cur := ectx.RootInstr; cur != nil; cur = cur.next {
        switch cur.kind {
        case ret:
            m.setupEpilogueAfter(cur.prev)
        case loadConstBlockArg:
            lc := cur
            next := lc.next
            m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
            m.lowerLoadConstantBlockArgAfterRegAlloc(lc)
            for _, instr := range m.executableContext.PendingInstructions {
                cur = linkInstr(cur, instr)
            }
            linkInstr(cur, next)
            m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
        default:
            // Removes the redundant copy instruction.
            if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() {
                prev, next := cur.prev, cur.next
                // Remove the copy instruction.
                prev.next = next
                if next != nil {
                    next.prev = prev
                }
            }
        }
    }
}

func (m *machine) setupEpilogueAfter(cur *instruction) {
    prevNext := cur.next

    // We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore.
    cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true)

    if s := m.spillSlotSize; s > 0 {
        // Adjust SP to the original value:
        //
        //            (high address)                 (high address)
        //          +-----------------+           +-----------------+
        //          |     .......     |           |     .......     |
        //          |      ret Y      |           |      ret Y      |
        //          |     .......     |           |     .......     |
        //          |      ret 0      |           |      ret 0      |
        //          |      arg X      |           |      arg X      |
        //          |     .......     |           |     .......     |
        //          |      arg 1      |           |      arg 1      |
        //          |      arg 0      |           |      arg 0      |
        //          |      xxxxx      |           |      xxxxx      |
        //          |  ReturnAddress  |           |  ReturnAddress  |
        //          +-----------------+   ====>   +-----------------+
        //          |   clobbered M   |           |   clobbered M   |
        //          |   ............  |           |   ............  |
        //          |   clobbered 1   |           |   clobbered 1   |
        //          |   clobbered 0   |           |   clobbered 0   |
        //          |   spill slot N  |           +-----------------+ <---- SP
        //          |   ............  |
        //          |   spill slot 0  |
        //  SP--->  +-----------------+
        //             (low address)
        //
        cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
    }

    // First we need to restore the clobbered registers.
    if len(m.clobberedRegs) > 0 {
        //            (high address)
        //          +-----------------+           +-----------------+
        //          |     .......     |           |     .......     |
        //          |      ret Y      |           |      ret Y      |
        //          |     .......     |           |     .......     |
        //          |      ret 0      |           |      ret 0      |
        //          |      arg X      |           |      arg X      |
        //          |     .......     |           |     .......     |
        //          |      arg 1      |           |      arg 1      |
        //          |      arg 0      |           |      arg 0      |
        //          |      xxxxx      |           |      xxxxx      |
        //          |  ReturnAddress  |           |  ReturnAddress  |
        //          +-----------------+  ======>  +-----------------+ <---- SP
        //          |   clobbered M   |
        //          |   ...........   |
        //          |   clobbered 1   |
        //          |   clobbered 0   |
        //  SP--->  +-----------------+
        //             (low address)

        l := len(m.clobberedRegs) - 1
        for i := range m.clobberedRegs {
            vr := m.clobberedRegs[l-i] // reverse order to restore.
            load := m.allocateInstr()
            amode := addressModePreOrPostIndex(spVReg,
                16,    // stack pointer must be 16-byte aligned.
                false, // Increment after load.
            )
            // TODO: pair loads to reduce the number of instructions.
            switch regTypeToRegisterSizeInBits(vr.RegType()) {
            case 64: // restore int reg.
                load.asULoad(operandNR(vr), amode, 64)
            case 128: // restore vector reg.
                load.asFpuLoad(operandNR(vr), amode, 128)
            }
            cur = linkInstr(cur, load)
        }
    }
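
    // Note: as the TODO above says, adjacent 64-bit restores could in
    // principle be fused into ldp (load-pair) instructions, e.g.
    //
    //	ldp x19, x20, [sp], #16
    //
    // halving the number of loads for integer registers, though that would
    // also pack two registers into each 16-byte slot and so would have to be
    // mirrored by paired stores in setupPrologue. (Illustrative only; not
    // what is currently emitted.)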

    // Reload the return address (lr).
    //
    //          +-----------------+           +-----------------+
    //          |     .......     |           |     .......     |
    //          |      ret Y      |           |      ret Y      |
    //          |     .......     |           |     .......     |
    //          |      ret 0      |           |      ret 0      |
    //          |      arg X      |           |      arg X      |
    //          |     .......     |    ===>   |     .......     |
    //          |      arg 1      |           |      arg 1      |
    //          |      arg 0      |           |      arg 0      |
    //          |      xxxxx      |           +-----------------+ <---- SP
    //          |  ReturnAddress  |
    //  SP----> +-----------------+

    ldr := m.allocateInstr()
    ldr.asULoad(operandNR(lrVReg),
        addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after load */), 64)
    cur = linkInstr(cur, ldr)

    if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
        cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
    }

    linkInstr(cur, prevNext)
}

// saveRequiredRegs is the set of registers that must be saved/restored when growing the stack because there is
// insufficient stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers,
// except for x0, which always points to the execution context whenever the native code is entered from Go.
var saveRequiredRegs = []regalloc.VReg{
    x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg,
    x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg,
    v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg,
    v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
}

// insertStackBoundsCheck inserts the instructions after `cur` to check the stack bounds, and if there is not
// enough space left for the function, exits the execution and tries growing the stack in the Go world.
//
// TODO: we should be able to share the instructions across all the functions to reduce the size of compiled executable.
func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
    if requiredStackSize%16 != 0 {
        panic("BUG")
    }

    if imm12Op, ok := asImm12Operand(uint64(requiredStackSize)); ok {
        // sub tmp, sp, #requiredStackSize
        sub := m.allocateInstr()
        sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), imm12Op, true)
        cur = linkInstr(cur, sub)
    } else {
        // In this case, we first load the requiredStackSize into the temporary register,
        cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
        // then subtract it.
        sub := m.allocateInstr()
        sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true)
        cur = linkInstr(cur, sub)
    }

    tmp2 := x11VReg // Caller-saved, so it is safe to use here in the prologue.

    // ldr tmp2, [executionContext, #StackBottomPtr]
    ldr := m.allocateInstr()
    ldr.asULoad(operandNR(tmp2), addressMode{
        kind: addressModeKindRegUnsignedImm12,
        rn:   x0VReg, // execution context is always the first argument.
        imm:  wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(),
    }, 64)
    cur = linkInstr(cur, ldr)

    // subs xzr, tmp, tmp2
    subs := m.allocateInstr()
    subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true)
    cur = linkInstr(cur, subs)

    // b.ge #imm
    cbr := m.allocateInstr()
    cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */)
    cur = linkInstr(cur, cbr)
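
    // At this point, the fast path emitted above is roughly (a sketch; #imm
    // is resolved at the end of this function once the slow-path length is
    // known):
    //
    //	sub  tmp, sp, #requiredStackSize
    //	ldr  tmp2, [x0, #StackBottomPtr]
    //	subs xzr, tmp, tmp2
    //	b.ge #imm      ;; skip the slow path if sp-requiredStackSize >= stack bottom.
    //
    // What follows is the slow path that hands control to Go to grow the stack.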
387 { 388 // First load the requiredStackSize into the temporary register, 389 cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) 390 setRequiredStackSize := m.allocateInstr() 391 setRequiredStackSize.asStore(operandNR(tmpRegVReg), 392 addressMode{ 393 kind: addressModeKindRegUnsignedImm12, 394 // Execution context is always the first argument. 395 rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(), 396 }, 64) 397 398 cur = linkInstr(cur, setRequiredStackSize) 399 } 400 401 ldrAddress := m.allocateInstr() 402 ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{ 403 kind: addressModeKindRegUnsignedImm12, 404 rn: x0VReg, // execution context is always the first argument 405 imm: wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(), 406 }, 64) 407 cur = linkInstr(cur, ldrAddress) 408 409 // Then jumps to the stack grow call sequence's address, meaning 410 // transferring the control to the code compiled by CompileStackGrowCallSequence. 411 bl := m.allocateInstr() 412 bl.asCallIndirect(tmpRegVReg, nil) 413 cur = linkInstr(cur, bl) 414 415 // Now that we know the entire code, we can finalize how many bytes 416 // we have to skip when the stack size is sufficient. 417 var cbrOffset int64 418 for _cur := cbr; ; _cur = _cur.next { 419 cbrOffset += _cur.size() 420 if _cur == cur { 421 break 422 } 423 } 424 cbr.condBrOffsetResolve(cbrOffset) 425 return cur 426 } 427 428 // CompileStackGrowCallSequence implements backend.Machine. 429 func (m *machine) CompileStackGrowCallSequence() []byte { 430 ectx := m.executableContext 431 432 cur := m.allocateInstr() 433 cur.asNop0() 434 ectx.RootInstr = cur 435 436 // Save the callee saved and argument registers. 437 cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs) 438 439 // Save the current stack pointer. 440 cur = m.saveCurrentStackPointer(cur, x0VReg) 441 442 // Set the exit status on the execution context. 443 cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack) 444 445 // Exit the execution. 446 cur = m.storeReturnAddressAndExit(cur) 447 448 // After the exit, restore the saved registers. 449 cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs) 450 451 // Then goes back the original address of this stack grow call. 452 ret := m.allocateInstr() 453 ret.asRet() 454 linkInstr(cur, ret) 455 456 m.encode(ectx.RootInstr) 457 return m.compiler.Buf() 458 } 459 460 func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction { 461 ectx := m.executableContext 462 463 ectx.PendingInstructions = ectx.PendingInstructions[:0] 464 m.insertAddOrSubStackPointer(rd, diff, add) 465 for _, inserted := range ectx.PendingInstructions { 466 cur = linkInstr(cur, inserted) 467 } 468 return cur 469 }