github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go

package arm64

import (
	"fmt"

	"github.com/bananabytelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/bananabytelabs/wazero/internal/engine/wazevo/wazevoapi"
)

// SetupPrologue implements backend.Machine.
func (m *machine) SetupPrologue() {
	ectx := m.executableContext

	cur := ectx.RootInstr
	prevInitInst := cur.next

	//
	//            (high address)                    (high address)
	//   SP----> +-----------------+              +------------------+ <----+
	//           |     .......     |              |     .......      |      |
	//           |      ret Y      |              |      ret Y       |      |
	//           |     .......     |              |     .......      |      |
	//           |      ret 0      |              |      ret 0       |      |
	//           |      arg X      |              |      arg X       |      | size_of_arg_ret.
	//           |     .......     |    ====>     |     .......      |      |
	//           |      arg 1      |              |      arg 1       |      |
	//           |      arg 0      |              |      arg 0       | <----+
	//           |-----------------|              | size_of_arg_ret  |
	//                                            |  return address  |
	//                                            +------------------+ <---- SP
	//            (low address)                    (low address)

	// Saves the return address (lr) and the size_of_arg_ret below the SP.
	// size_of_arg_ret is used for stack unwinding.
	cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)

	if !m.stackBoundsCheckDisabled {
		cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
	}

	// Sanity check: if there are spill slots, spillSlotSize must be non-zero.
	if m.spillSlotSize == 0 && len(m.spillSlots) != 0 {
		panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots))
	}

	if regs := m.clobberedRegs; len(regs) > 0 {
		//
		//            (high address)                  (high address)
		//           +-----------------+             +-----------------+
		//           |     .......     |             |     .......     |
		//           |      ret Y      |             |      ret Y      |
		//           |     .......     |             |     .......     |
		//           |      ret 0      |             |      ret 0      |
		//           |      arg X      |             |      arg X      |
		//           |     .......     |             |     .......     |
		//           |      arg 1      |             |      arg 1      |
		//           |      arg 0      |             |      arg 0      |
		//           | size_of_arg_ret |             | size_of_arg_ret |
		//           |  ReturnAddress  |             |  ReturnAddress  |
		//  SP---->  +-----------------+    ====>    +-----------------+
		//            (low address)                  |   clobbered M   |
		//                                           |   ............  |
		//                                           |   clobbered 0   |
		//                                           +-----------------+ <----- SP
		//                                            (low address)
		//
		_amode := addressModePreOrPostIndex(spVReg,
			-16,  // stack pointer must be 16-byte aligned.
			true, // Decrement before store.
		)
		for _, vr := range regs {
			// TODO: pair stores to reduce the number of instructions.
			store := m.allocateInstr()
			store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType()))
			cur = linkInstr(cur, store)
		}
	}

	if size := m.spillSlotSize; size > 0 {
		// Check if size is 16-byte aligned.
		if size&0xf != 0 {
			panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size))
		}

		cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false)

		// At this point, the stack looks like:
		//
		//             (high address)
		//           +------------------+
		//           |     .......      |
		//           |      ret Y       |
		//           |     .......      |
		//           |      ret 0       |
		//           |      arg X       |
		//           |     .......      |
		//           |      arg 1       |
		//           |      arg 0       |
		//           | size_of_arg_ret  |
		//           |  ReturnAddress   |
		//           +------------------+
		//           |    clobbered M   |
		//           |   ............   |
		//           |    clobbered 0   |
		//           |   spill slot N   |
		//           |   ............   |
		//           |   spill slot 1   |
		//           |   spill slot 0   |
		//  SP---->  +------------------+
		//             (low address)
	}

	// We push the frame size onto the stack to make stack unwinding possible:
	//
	//
	//            (high address)                  (high address)
	//           +-----------------+             +-----------------+
	//           |     .......     |             |     .......     |
	//           |      ret Y      |             |      ret Y      |
	//           |     .......     |             |     .......     |
	//           |      ret 0      |             |      ret 0      |
	//           |      arg X      |             |      arg X      |
	//           |     .......     |             |     .......     |
	//           |      arg 1      |             |      arg 1      |
	//           |      arg 0      |             |      arg 0      |
	//           | size_of_arg_ret |             | size_of_arg_ret |
	//           |  ReturnAddress  |             |  ReturnAddress  |
	//           +-----------------+     ==>     +-----------------+ <----+
	//           |   clobbered M   |             |   clobbered M   |      |
	//           |   ............  |             |   ............  |      |
	//           |   clobbered 2   |             |   clobbered 2   |      |
	//           |   clobbered 1   |             |   clobbered 1   |      | frame size
	//           |   clobbered 0   |             |   clobbered 0   |      |
	//           |   spill slot N  |             |   spill slot N  |      |
	//           |   ............  |             |   ............  |      |
	//           |   spill slot 0  |             |   spill slot 0  | <----+
	//  SP--->   +-----------------+             |      xxxxxx     |  ;; unused space to make it 16-byte aligned.
	//                                           |    frame_size   |
	//                                           +-----------------+ <---- SP
	//            (low address)
	//
	cur = m.createFrameSizeSlot(cur, m.frameSize())

	linkInstr(cur, prevInitInst)
}
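// For reference, the prologue above conceptually emits an aarch64 sequence
// along these lines. This is a sketch only: xTmp stands for tmpRegVReg,
// x19 is just an example clobbered register (vector registers are stored the
// same way), and each movz may in practice be a movz/movk sequence emitted by
// lowerConstantI64AndInsert.
//
//	movz xTmp, #argRetSize         ; alignedArgResultStackSlotSize(); skipped when zero
//	sub  sp, sp, xTmp
//	stp  lr, xTmp, [sp, #-16]!     ; save the return address and size_of_arg_ret
//	; ... the stack bounds check is inserted here unless disabled ...
//	str  x19, [sp, #-16]!          ; one pre-indexed store per clobbered register
//	; ...
//	sub  sp, sp, #spillSlotSize    ; reserve the 16-byte aligned spill area; skipped when zero
//	movz xTmp, #frameSize          ; frameSize(); xzr is stored instead when zero
//	str  xTmp, [sp, #-16]!         ; push frame_size for the stack unwinder
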
func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction {
	// First we decrement the stack pointer to point to the arg0 slot.
	var sizeOfArgRetReg regalloc.VReg
	s := m.currentABI.alignedArgResultStackSlotSize()
	if s > 0 {
		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
		sizeOfArgRetReg = tmpRegVReg

		subSp := m.allocateInstr()
		subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
		cur = linkInstr(cur, subSp)
	} else {
		sizeOfArgRetReg = xzrVReg
	}

	// Saves the return address (lr) and the size_of_arg_ret below the SP.
	// size_of_arg_ret is used for stack unwinding.
	pstr := m.allocateInstr()
	amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
	pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
	cur = linkInstr(cur, pstr)
	return cur
}

func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
	var frameSizeReg regalloc.VReg
	if s > 0 {
		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
		frameSizeReg = tmpRegVReg
	} else {
		frameSizeReg = xzrVReg
	}
	_amode := addressModePreOrPostIndex(spVReg,
		-16,  // stack pointer must be 16-byte aligned.
		true, // Decrement before store.
	)
	store := m.allocateInstr()
	store.asStore(operandNR(frameSizeReg), _amode, 64)
	cur = linkInstr(cur, store)
	return cur
}

// SetupEpilogue implements backend.Machine.
func (m *machine) SetupEpilogue() {
	ectx := m.executableContext
	for cur := ectx.RootInstr; cur != nil; cur = cur.next {
		if cur.kind == ret {
			m.setupEpilogueAfter(cur.prev)
			continue
		}

		// Removes a copy instruction that is redundant because its source and
		// destination resolved to the same real register.
		// TODO: doing this in `SetupEpilogue` seems weird. Find a better home.
		if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() {
			prev, next := cur.prev, cur.next
			// Remove the copy instruction.
			prev.next = next
			if next != nil {
				next.prev = prev
			}
		}
	}
}
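// As an illustration of the copy elimination in SetupEpilogue: register
// allocation can leave behind a self-copy such as `mov x3, x3` once virtual
// registers are assigned to real ones (a hypothetical allocation result, used
// only for illustration). Such an instruction has no effect, so it is simply
// unlinked from the doubly-linked instruction list:
//
//	prev.next = next // skip cur in the forward direction,
//	next.prev = prev // and in the backward direction (when next != nil).
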
func (m *machine) setupEpilogueAfter(cur *instruction) {
	prevNext := cur.next

	// We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore.
	cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true)

	if s := m.spillSlotSize; s > 0 {
		// Adjust SP to the original value:
		//
		//            (high address)                  (high address)
		//           +-----------------+             +-----------------+
		//           |     .......     |             |     .......     |
		//           |      ret Y      |             |      ret Y      |
		//           |     .......     |             |     .......     |
		//           |      ret 0      |             |      ret 0      |
		//           |      arg X      |             |      arg X      |
		//           |     .......     |             |     .......     |
		//           |      arg 1      |             |      arg 1      |
		//           |      arg 0      |             |      arg 0      |
		//           |      xxxxx      |             |      xxxxx      |
		//           |  ReturnAddress  |             |  ReturnAddress  |
		//           +-----------------+    ====>    +-----------------+
		//           |   clobbered M   |             |   clobbered M   |
		//           |   ............  |             |   ............  |
		//           |   clobbered 1   |             |   clobbered 1   |
		//           |   clobbered 0   |             |   clobbered 0   |
		//           |   spill slot N  |             +-----------------+ <---- SP
		//           |   ............  |
		//           |   spill slot 0  |
		//  SP--->   +-----------------+
		//            (low address)
		//
		cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
	}

	// First we need to restore the clobbered registers.
	if len(m.clobberedRegs) > 0 {
		//            (high address)
		//           +-----------------+               +-----------------+
		//           |     .......     |               |     .......     |
		//           |      ret Y      |               |      ret Y      |
		//           |     .......     |               |     .......     |
		//           |      ret 0      |               |      ret 0      |
		//           |      arg X      |               |      arg X      |
		//           |     .......     |               |     .......     |
		//           |      arg 1      |               |      arg 1      |
		//           |      arg 0      |               |      arg 0      |
		//           |      xxxxx      |               |      xxxxx      |
		//           |  ReturnAddress  |               |  ReturnAddress  |
		//           +-----------------+    ========>  +-----------------+ <---- SP
		//           |   clobbered M   |
		//           |   clobbered 1   |
		//           |   ...........   |
		//           |   clobbered 0   |
		//  SP--->   +-----------------+
		//            (low address)

		l := len(m.clobberedRegs) - 1
		for i := range m.clobberedRegs {
			vr := m.clobberedRegs[l-i] // reverse order to restore.
			load := m.allocateInstr()
			amode := addressModePreOrPostIndex(spVReg,
				16,    // stack pointer must be 16-byte aligned.
				false, // Increment after load.
			)
			// TODO: pair loads to reduce the number of instructions.
			switch regTypeToRegisterSizeInBits(vr.RegType()) {
			case 64: // restore int reg.
				load.asULoad(operandNR(vr), amode, 64)
			case 128: // restore vector reg.
				load.asFpuLoad(operandNR(vr), amode, 128)
			}
			cur = linkInstr(cur, load)
		}
	}

	// Reload the return address (lr).
	//
	//           +-----------------+          +-----------------+
	//           |     .......     |          |     .......     |
	//           |      ret Y      |          |      ret Y      |
	//           |     .......     |          |     .......     |
	//           |      ret 0      |          |      ret 0      |
	//           |      arg X      |          |      arg X      |
	//           |     .......     |   ===>   |     .......     |
	//           |      arg 1      |          |      arg 1      |
	//           |      arg 0      |          |      arg 0      |
	//           |      xxxxx      |          +-----------------+ <---- SP
	//           |  ReturnAddress  |
	//  SP---->  +-----------------+

	ldr := m.allocateInstr()
	ldr.asULoad(operandNR(lrVReg),
		addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after load */), 64)
	cur = linkInstr(cur, ldr)

	if s := m.currentABI.alignedArgResultStackSlotSize(); s > 0 {
		cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
	}

	linkInstr(cur, prevNext)
}
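// Mirroring the prologue, the epilogue above conceptually emits the following
// sequence right before the `ret` (a sketch only; x19 is just an example
// clobbered register, and the SP adjustments may take more than one
// instruction when an immediate does not fit):
//
//	add  sp, sp, #16               ; discard the frame_size slot
//	add  sp, sp, #spillSlotSize    ; release the spill area; skipped when zero
//	ldr  x19, [sp], #16            ; one post-indexed load per clobbered register, in reverse order
//	; ...
//	ldr  lr, [sp], #16             ; reload the return address; size_of_arg_ret is discarded
//	add  sp, sp, #argRetSize       ; pop the arg/result area; skipped when zero
//	ret
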
// saveRequiredRegs is the set of registers that must be saved/restored while growing the stack when there's insufficient
// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers except for x0,
// which always points to the execution context whenever the native code is entered from Go.
var saveRequiredRegs = []regalloc.VReg{
	x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg,
	x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg,
	v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg,
	v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
}

// insertStackBoundsCheck inserts instructions after `cur` that check the
// stack bounds and, if there is not sufficient space for the function,
// exit the execution and try growing the stack in the Go world.
//
// TODO: we should be able to share the instructions across all the functions to reduce the size of the compiled executable.
func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
	if requiredStackSize%16 != 0 {
		panic("BUG")
	}

	if imm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok {
		// sub tmp, sp, #requiredStackSize
		sub := m.allocateInstr()
		sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), imm12op, true)
		cur = linkInstr(cur, sub)
	} else {
		// In this case, we first load requiredStackSize into the temporary register,
		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
		// then subtract it.
		sub := m.allocateInstr()
		sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true)
		cur = linkInstr(cur, sub)
	}

	tmp2 := x11VReg // Caller-saved, so it is safe to clobber it here in the prologue.

	// ldr tmp2, [executionContext #StackBottomPtr]
	ldr := m.allocateInstr()
	ldr.asULoad(operandNR(tmp2), addressMode{
		kind: addressModeKindRegUnsignedImm12,
		rn:   x0VReg, // execution context is always the first argument.
		imm:  wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(),
	}, 64)
	cur = linkInstr(cur, ldr)

	// subs xzr, tmp, tmp2
	subs := m.allocateInstr()
	subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true)
	cur = linkInstr(cur, subs)

	// b.ge #imm
	cbr := m.allocateInstr()
	cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */)
	cur = linkInstr(cur, cbr)

	// Set the required stack size and store it into the exec context.
	{
		// First load requiredStackSize into the temporary register,
		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
		setRequiredStackSize := m.allocateInstr()
		setRequiredStackSize.asStore(operandNR(tmpRegVReg),
			addressMode{
				kind: addressModeKindRegUnsignedImm12,
				// Execution context is always the first argument.
				rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
			}, 64)

		cur = linkInstr(cur, setRequiredStackSize)
	}

	ldrAddress := m.allocateInstr()
	ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{
		kind: addressModeKindRegUnsignedImm12,
		rn:   x0VReg, // execution context is always the first argument.
		imm:  wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(),
	}, 64)
	cur = linkInstr(cur, ldrAddress)

	// Then jump to the stack grow call sequence's address, i.e.
	// transfer control to the code compiled by CompileStackGrowCallSequence.
	bl := m.allocateInstr()
	bl.asCallIndirect(tmpRegVReg, nil)
	cur = linkInstr(cur, bl)

	// Now that we know the entire code, we can finalize how many bytes
	// we have to skip when the stack size is sufficient.
	var cbrOffset int64
	for _cur := cbr; ; _cur = _cur.next {
		cbrOffset += _cur.size()
		if _cur == cur {
			break
		}
	}
	cbr.condBrOffsetResolve(cbrOffset)
	return cur
}
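// Putting insertStackBoundsCheck together, the emitted check is roughly the
// following (a sketch; xTmp/xTmp2 stand for tmpRegVReg/x11, the offsets come
// from wazevoapi, and the movz may be a multi-instruction constant load):
//
//	sub  xTmp, sp, #requiredStackSize        ; or materialize the constant first when it doesn't fit in imm12
//	ldr  xTmp2, [x0, #StackBottomPtr]        ; x0 always holds the execution context
//	subs xzr, xTmp, xTmp2                    ; i.e. cmp xTmp, xTmp2
//	b.ge #resolved_offset                    ; enough stack: skip over the grow call below
//	movz xTmp, #requiredStackSize            ; record how much stack is needed,
//	str  xTmp, [x0, #StackGrowRequiredSize]  ; so Go knows how far to grow
//	ldr  xTmp, [x0, #StackGrowCallTrampolineAddress]
//	blr  xTmp                                ; enter the shared stack grow sequence
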
// CompileStackGrowCallSequence implements backend.Machine.
func (m *machine) CompileStackGrowCallSequence() []byte {
	ectx := m.executableContext

	cur := m.allocateInstr()
	cur.asNop0()
	ectx.RootInstr = cur

	// Save the callee-saved and argument registers.
	cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs)

	// Save the current stack pointer.
	cur = m.saveCurrentStackPointer(cur, x0VReg)

	// Set the exit status on the execution context.
	cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack)

	// Exit the execution.
	cur = m.storeReturnAddressAndExit(cur)

	// After the exit, restore the saved registers.
	cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs)

	// Then go back to the original address of this stack grow call.
	ret := m.allocateInstr()
	ret.asRet(nil)
	linkInstr(cur, ret)

	m.encode(ectx.RootInstr)
	return m.compiler.Buf()
}

func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction {
	ectx := m.executableContext

	ectx.PendingInstructions = ectx.PendingInstructions[:0]
	m.insertAddOrSubStackPointer(rd, diff, add)
	for _, inserted := range ectx.PendingInstructions {
		cur = linkInstr(cur, inserted)
	}
	return cur
}
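// For reference, the sequence compiled by CompileStackGrowCallSequence is
// shared by all functions and conceptually does the following (a sketch of
// the control flow, not of the exact instructions):
//
//	1. spill saveRequiredRegs into the execution context (x0),
//	2. record the current SP in the execution context,
//	3. set ExitCodeGrowStack and exit to the Go runtime, which grows the stack,
//	4. on re-entry, reload saveRequiredRegs from the execution context,
//	5. ret, returning to just after the `blr` in insertStackBoundsCheck.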