github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/engine/wazevo/backend/isa/arm64/lower_mem.go

package arm64

import (
	"fmt"

	"github.com/bananabytelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/bananabytelabs/wazero/internal/engine/wazevo/ssa"
)

type (
	// addressMode represents an ARM64 addressing mode.
	//
	// https://developer.arm.com/documentation/102374/0101/Loads-and-stores---addressing
	// TODO: use the bit-packed layout like the operand struct.
	addressMode struct {
		kind   addressModeKind
		rn, rm regalloc.VReg
		extOp  extendOp
		imm    int64
	}

	// addressModeKind represents the kind of ARM64 addressing mode.
	addressModeKind byte
)

const (
	// addressModeKindRegScaledExtended takes a base register and an index register. The index register is sign/zero-extended,
	// and then scaled by bits(type)/8.
	//
	// e.g.
	//	- ldrh w1, [x2, w3, SXTW #1] ;; sign-extended and scaled by 2 (== LSL #1)
	//	- strh w1, [x2, w3, UXTW #1] ;; zero-extended and scaled by 2 (== LSL #1)
	//	- ldr w1, [x2, w3, SXTW #2]  ;; sign-extended and scaled by 4 (== LSL #2)
	//	- str x1, [x2, w3, UXTW #3]  ;; zero-extended and scaled by 8 (== LSL #3)
	//
	// See the following pages:
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--register---Load-Register-Halfword--register--
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register--
	addressModeKindRegScaledExtended addressModeKind = iota

	// addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without the extension factor.
	addressModeKindRegScaled

	// addressModeKindRegExtended is the same as addressModeKindRegScaledExtended, but without the scale factor.
	addressModeKindRegExtended

	// addressModeKindRegReg takes a base register and an index register. The index register is neither scaled nor extended.
	addressModeKindRegReg

	// addressModeKindRegSignedImm9 takes a base register and a 9-bit "signed" immediate offset (-256 to 255).
	// The immediate will be sign-extended and added to the base register.
	// This is a.k.a. "unscaled" since the immediate is not scaled.
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled--
	addressModeKindRegSignedImm9

	// addressModeKindRegUnsignedImm12 takes a base register and a 12-bit "unsigned" immediate offset scaled by
	// the size of the type. In other words, the actual offset will be imm12 * bits(type)/8.
	// See "Unsigned offset" in the following pages:
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
	addressModeKindRegUnsignedImm12

	// addressModeKindPostIndex takes a base register and a 9-bit "signed" immediate offset.
	// After the load/store, the base register will be updated by the offset.
	//
	// Note that when this is used for pair load/store, the offset will be a 7-bit "signed" immediate offset.
	//
	// See "Post-index" in the following pages for examples:
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
	addressModeKindPostIndex

	// addressModeKindPreIndex takes a base register and a 9-bit "signed" immediate offset.
	// Before the load/store, the base register will be updated by the offset.
	//
	// Note that when this is used for pair load/store, the offset will be a 7-bit "signed" immediate offset.
	//
	// See "Pre-index" in the following pages for examples:
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
	addressModeKindPreIndex

	// addressModeKindArgStackSpace is used to resolve the address of the argument stack space
	// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
	// at this compilation phase, this is used as a placeholder and further lowered to a real addressing mode like the ones above.
	addressModeKindArgStackSpace

	// addressModeKindResultStackSpace is used to resolve the address of the result stack space
	// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
	// at this compilation phase, this is used as a placeholder and further lowered to a real addressing mode like the ones above.
	addressModeKindResultStackSpace
)
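
// format returns a human-readable string representation of the address mode for an access of
// dstSizeBits. For illustration only (assuming hypothetical physical registers x0/x1 and imm=0x10;
// actual virtual registers print differently), some kinds render as follows:
//
//	addressModeKindRegReg           -> [x0, x1]
//	addressModeKindRegSignedImm9    -> [x0, #0x10]
//	addressModeKindRegUnsignedImm12 -> [x0, #0x10]
//	addressModeKindPostIndex        -> [x0], #0x10
//	addressModeKindPreIndex         -> [x0, #0x10]!
//	addressModeKindArgStackSpace    -> [#arg_space, #0x10]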
func (a addressMode) format(dstSizeBits byte) (ret string) {
	base := formatVRegSized(a.rn, 64)
	if rn := a.rn; rn.RegType() != regalloc.RegTypeInt {
		panic("invalid base register type: " + a.rn.RegType().String())
	} else if rn.IsRealReg() && v0 <= a.rn.RealReg() && a.rn.RealReg() <= v30 {
		panic("BUG: likely a bug in reg alloc or reset behavior")
	}

	switch a.kind {
	case addressModeKindRegScaledExtended:
		amount := a.sizeInBitsToShiftAmount(dstSizeBits)
		ret = fmt.Sprintf("[%s, %s, %s #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp, amount)
	case addressModeKindRegScaled:
		amount := a.sizeInBitsToShiftAmount(dstSizeBits)
		ret = fmt.Sprintf("[%s, %s, lsl #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), amount)
	case addressModeKindRegExtended:
		ret = fmt.Sprintf("[%s, %s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp)
	case addressModeKindRegReg:
		ret = fmt.Sprintf("[%s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()))
	case addressModeKindRegSignedImm9:
		if a.imm != 0 {
			ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
		} else {
			ret = fmt.Sprintf("[%s]", base)
		}
	case addressModeKindRegUnsignedImm12:
		if a.imm != 0 {
			ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
		} else {
			ret = fmt.Sprintf("[%s]", base)
		}
	case addressModeKindPostIndex:
		ret = fmt.Sprintf("[%s], #%#x", base, a.imm)
	case addressModeKindPreIndex:
		ret = fmt.Sprintf("[%s, #%#x]!", base, a.imm)
	case addressModeKindArgStackSpace:
		ret = fmt.Sprintf("[#arg_space, #%#x]", a.imm)
	case addressModeKindResultStackSpace:
		ret = fmt.Sprintf("[#ret_space, #%#x]", a.imm)
	}
	return
}

func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode {
	if !offsetFitsInAddressModeKindRegSignedImm9(imm) {
		panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm))
	}
	if preIndex {
		return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
	} else {
		return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
	}
}
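
// offsetFitsInAddressModeKindRegUnsignedImm12 reports whether offset can be encoded as the scaled
// 12-bit unsigned immediate of addressModeKindRegUnsignedImm12 for an access of dstSizeInBits.
// For example, with dstSizeInBits=64 the divisor is 8, so offsets such as 8, 16, or 32760 (= 4095*8)
// fit, while 12 (not a multiple of 8) or 32768 (quotient 4096, out of range) do not.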
func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool {
	divisor := int64(dstSizeInBits) / 8
	return 0 < offset && offset%divisor == 0 && offset/divisor < 4096
}

func offsetFitsInAddressModeKindRegSignedImm9(offset int64) bool {
	return -256 <= offset && offset <= 255
}

func (a addressMode) indexRegBits() byte {
	bits := a.extOp.srcBits()
	if bits != 32 && bits != 64 {
		panic("invalid index register for address mode. it must be either 32 or 64 bits")
	}
	return bits
}

func (a addressMode) sizeInBitsToShiftAmount(sizeInBits byte) (lsl byte) {
	switch sizeInBits {
	case 8:
		lsl = 0
	case 16:
		lsl = 1
	case 32:
		lsl = 2
	case 64:
		lsl = 3
	}
	return
}
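
// extLoadSignSize returns the source size in bits and the signedness of the given extending-load
// opcode, e.g. ssa.OpcodeSload8 yields (8, true) and ssa.OpcodeUload32 yields (32, false).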
func extLoadSignSize(op ssa.Opcode) (size byte, signed bool) {
	switch op {
	case ssa.OpcodeUload8:
		size, signed = 8, false
	case ssa.OpcodeUload16:
		size, signed = 16, false
	case ssa.OpcodeUload32:
		size, signed = 32, false
	case ssa.OpcodeSload8:
		size, signed = 8, true
	case ssa.OpcodeSload16:
		size, signed = 16, true
	case ssa.OpcodeSload32:
		size, signed = 32, true
	default:
		panic("BUG")
	}
	return
}

func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret regalloc.VReg) {
	size, signed := extLoadSignSize(op)
	amode := m.lowerToAddressMode(ptr, offset, size)
	load := m.allocateInstr()
	if signed {
		load.asSLoad(operandNR(ret), amode, size)
	} else {
		load.asULoad(operandNR(ret), amode, size)
	}
	m.insert(load)
}

func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.Value) {
	amode := m.lowerToAddressMode(ptr, offset, typ.Bits())

	dst := m.compiler.VRegOf(ret)
	load := m.allocateInstr()
	switch typ {
	case ssa.TypeI32, ssa.TypeI64:
		load.asULoad(operandNR(dst), amode, typ.Bits())
	case ssa.TypeF32, ssa.TypeF64:
		load.asFpuLoad(operandNR(dst), amode, typ.Bits())
	case ssa.TypeV128:
		load.asFpuLoad(operandNR(dst), amode, 128)
	default:
		panic("TODO")
	}
	m.insert(load)
}
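
// lowerLoadSplat lowers a load-splat: the element at ptr+offset (8, 16, 32, or 64 bits depending on
// the integer lane kind) is loaded and replicated into every lane of the result vector via LD1R in
// lowerLoadSplatFromAddressMode below.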
func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane, ret ssa.Value) {
	var opSize byte
	switch lane {
	case ssa.VecLaneI8x16:
		opSize = 8
	case ssa.VecLaneI16x8:
		opSize = 16
	case ssa.VecLaneI32x4:
		opSize = 32
	case ssa.VecLaneI64x2:
		opSize = 64
	}
	amode := m.lowerToAddressMode(ptr, offset, opSize)
	rd := operandNR(m.compiler.VRegOf(ret))
	m.lowerLoadSplatFromAddressMode(rd, amode, lane)
}

// lowerLoadSplatFromAddressMode is extracted from lowerLoadSplat for testing.
func (m *machine) lowerLoadSplatFromAddressMode(rd operand, amode addressMode, lane ssa.VecLane) {
	tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))

	// vecLoad1R has an offset address mode (base+imm) only for post-index, so the only addressing mode
	// we can use here is the "no-offset" register addressing mode, i.e. `addressModeKindRegReg`.
	// Therefore, we compute the final address into tmpReg first.
	switch amode.kind {
	case addressModeKindRegReg:
		add := m.allocateInstr()
		add.asALU(aluOpAdd, tmpReg, operandNR(amode.rn), operandNR(amode.rm), true)
		m.insert(add)
	case addressModeKindRegSignedImm9:
		add := m.allocateInstr()
		add.asALU(aluOpAdd, tmpReg, operandNR(amode.rn), operandImm12(uint16(amode.imm), 0), true)
		m.insert(add)
	case addressModeKindRegUnsignedImm12:
		if amode.imm != 0 {
			offsetReg := m.compiler.AllocateVReg(ssa.TypeI64)
			m.load64bitConst(amode.imm, offsetReg)
			add := m.allocateInstr()
			m.insert(add)
			add.asALU(aluOpAdd, tmpReg, operandNR(amode.rn), operandNR(offsetReg), true)
		} else {
			tmpReg = operandNR(amode.rn)
		}
	default:
		panic("unsupported address mode for LoadSplat")
	}

	arr := ssaLaneToArrangement(lane)

	ld1r := m.allocateInstr()
	ld1r.asVecLoad1R(rd, tmpReg, arr)
	m.insert(ld1r)
}

func (m *machine) lowerStore(si *ssa.Instruction) {
	// TODO: merge consecutive stores into a single pair store instruction.
	value, ptr, offset, storeSizeInBits := si.StoreData()
	amode := m.lowerToAddressMode(ptr, offset, storeSizeInBits)

	valueOp := m.getOperand_NR(m.compiler.ValueDefinition(value), extModeNone)
	store := m.allocateInstr()
	store.asStore(valueOp, amode, storeSizeInBits)
	m.insert(store)
}

// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) {
	// TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and
	// addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed
	// to support more efficient address resolution.

	a32s, a64s, offset := m.collectAddends(ptr)
	offset += int64(offsetBase)
	return m.lowerToAddressModeFromAddends(a32s, a64s, size, offset)
}

// lowerToAddressModeFromAddends creates an addressMode from the lists of addends collected by collectAddends.
// During the construction, this might emit additional instructions.
//
// Extracted as a separate function for easy testing.
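//
// For illustration (hypothetical addends): a pointer computed as base64 + sext(idx32) + 0x10 arrives
// here as a64s=[base64], a32s=[{idx32, SXTW}], offset=0x10. The first case below folds the pair into
// an addressModeKindRegExtended mode, and the leftover 0x10 is then added into the base register by
// the addConstToReg64 call at the end of this function.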
func (m *machine) lowerToAddressModeFromAddends(a32s *queue[addend32], a64s *queue[regalloc.VReg], size byte, offset int64) (amode addressMode) {
	switch a64sExist, a32sExist := !a64s.empty(), !a32s.empty(); {
	case a64sExist && a32sExist:
		var base regalloc.VReg
		base = a64s.dequeue()
		var a32 addend32
		a32 = a32s.dequeue()
		amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
	case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset):
		var base regalloc.VReg
		base = a64s.dequeue()
		amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
		offset = 0
	case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset):
		var base regalloc.VReg
		base = a64s.dequeue()
		amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
		offset = 0
	case a64sExist:
		var base regalloc.VReg
		base = a64s.dequeue()
		if !a64s.empty() {
			index := a64s.dequeue()
			amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
		} else {
			amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
		}
	case a32sExist:
		base32 := a32s.dequeue()

		// First we need a 64-bit base.
		base := m.compiler.AllocateVReg(ssa.TypeI64)
		baseExt := m.allocateInstr()
		var signed bool
		if base32.ext == extendOpSXTW {
			signed = true
		}
		baseExt.asExtend(base, base32.r, 32, 64, signed)
		m.insert(baseExt)

		if !a32s.empty() {
			index := a32s.dequeue()
			amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
		} else {
			amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
		}
	default: // Only static offsets.
		tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
		m.lowerConstantI64(tmpReg, offset)
		amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0}
		offset = 0
	}

	baseReg := amode.rn
	if offset > 0 {
		baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset
	}

	for !a64s.empty() {
		a64 := a64s.dequeue()
		baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64
	}

	for !a32s.empty() {
		a32 := a32s.dequeue()
		baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit)
	}
	amode.rn = baseReg
	return
}

var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst}

func (m *machine) collectAddends(ptr ssa.Value) (addends32 *queue[addend32], addends64 *queue[regalloc.VReg], offset int64) {
	m.addendsWorkQueue.reset()
	m.addends32.reset()
	m.addends64.reset()
	m.addendsWorkQueue.enqueue(ptr)

	for !m.addendsWorkQueue.empty() {
		v := m.addendsWorkQueue.dequeue()

		def := m.compiler.ValueDefinition(v)
		switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op {
		case ssa.OpcodeIadd:
			// If the addend is an add, we recursively collect its operands.
			x, y := def.Instr.Arg2()
			m.addendsWorkQueue.enqueue(x)
			m.addendsWorkQueue.enqueue(y)
			def.Instr.MarkLowered()
		case ssa.OpcodeIconst:
			// If the addend is a constant, we just statically merge it into the offset.
			ic := def.Instr
			u64 := ic.ConstantVal()
			if ic.Return().Type().Bits() == 32 {
				offset += int64(int32(u64)) // sign-extend.
			} else {
				offset += int64(u64)
			}
			def.Instr.MarkLowered()
		case ssa.OpcodeUExtend, ssa.OpcodeSExtend:
			switch input := def.Instr.Arg(); input.Type().Bits() {
			case 64:
				// If the input is already 64-bit, this extend is a no-op. TODO: shouldn't this be optimized out at a much earlier stage? no?
				m.addends64.enqueue(m.getOperand_NR(m.compiler.ValueDefinition(input), extModeNone).nr())
				def.Instr.MarkLowered()
				continue
			case 32:
				var ext extendOp
				if op == ssa.OpcodeUExtend {
					ext = extendOpUXTW
				} else {
					ext = extendOpSXTW
				}

				inputDef := m.compiler.ValueDefinition(input)
				constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant()
				switch {
				case constInst && ext == extendOpUXTW:
					// Zero-extension of a 32-bit constant can be merged into the offset.
					offset += int64(uint32(inputDef.Instr.ConstantVal()))
				case constInst && ext == extendOpSXTW:
					// Sign-extension of a 32-bit constant can be merged into the offset.
					offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend!
				default:
					m.addends32.enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext})
				}
				def.Instr.MarkLowered()
				continue
			}
			// If the extension is smaller than 32 bits, it cannot be merged into the addressing mode since
			// arm64 requires index registers to be at least 32 bits (extension modes can only be applied to 32-bit registers).
			// fallthrough
			panic("TODO: add tests")
		default:
			// If the addend is not one of the above, we simply use it as-is (without merging!), optionally zero-extending it.
			m.addends64.enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr())
		}
	}
	return &m.addends32, &m.addends64, offset
}
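
// addConstToReg64 emits instructions that materialize r + c into a freshly allocated 64-bit register
// and returns that register. If c (or -c) fits in a 12-bit immediate, a single add (or sub) is used;
// otherwise c is first loaded into a temporary register.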
func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
	rd = m.compiler.AllocateVReg(ssa.TypeI64)
	alu := m.allocateInstr()
	if imm12Op, ok := asImm12Operand(uint64(c)); ok {
		alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true)
	} else if imm12Op, ok = asImm12Operand(uint64(-c)); ok {
		alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true)
	} else {
		tmp := m.compiler.AllocateVReg(ssa.TypeI64)
		m.load64bitConst(c, tmp)
		alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true)
	}
	m.insert(alu)
	return
}

func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) {
	rd = m.compiler.AllocateVReg(ssa.TypeI64)
	alu := m.allocateInstr()
	alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true)
	m.insert(alu)
	return
}

func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) {
	rd = m.compiler.AllocateVReg(ssa.TypeI64)
	alu := m.allocateInstr()
	alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true)
	m.insert(alu)
	return
}

// queue is the resettable queue where the underlying slice is reused.
type queue[T any] struct {
	index int
	data  []T
}

func (q *queue[T]) enqueue(v T) {
	q.data = append(q.data, v)
}

func (q *queue[T]) dequeue() (ret T) {
	ret = q.data[q.index]
	q.index++
	return
}

func (q *queue[T]) empty() bool {
	return q.index >= len(q.data)
}

func (q *queue[T]) reset() {
	q.index = 0
	q.data = q.data[:0]
}
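
// For illustration only (this snippet is not part of the package API): the queue is index-based
// rather than slice-popping, so dequeued elements stay in the underlying array until reset truncates
// it for reuse without reallocation, e.g.:
//
//	var q queue[int]
//	q.enqueue(1)
//	q.enqueue(2)
//	_ = q.dequeue() // returns 1; q.index is now 1 and q.data still holds [1, 2].
//	q.reset()       // q.index = 0, q.data = q.data[:0]; the backing array is reused.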