github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/engine/wazevo/backend/isa/arm64/lower_mem.go

     1  package arm64
     2  
     3  import (
     4  	"fmt"
     5  
     6  	"github.com/bananabytelabs/wazero/internal/engine/wazevo/backend/regalloc"
     7  	"github.com/bananabytelabs/wazero/internal/engine/wazevo/ssa"
     8  )
     9  
    10  type (
    11  	// addressMode represents an ARM64 addressing mode.
    12  	//
    13  	// https://developer.arm.com/documentation/102374/0101/Loads-and-stores---addressing
    14  	// TODO: use the bit-packed layout like operand struct.
    15  	addressMode struct {
    16  		kind   addressModeKind // which addressing mode this is; one of the addressModeKind* constants below.
    17  		rn, rm regalloc.VReg   // rn is the base register; rm is the index register for the register-offset kinds.
    18  		extOp  extendOp        // extension applied to rm; also determines whether rm is treated as 32- or 64-bit.
    19  		imm    int64           // immediate offset, or the pre/post-index increment.
    20  	}
    21  
    22  	// addressModeKind represents the kind of ARM64 addressing mode.
    23  	addressModeKind byte
    24  )
    25  
    26  const (
    27  	// addressModeKindRegExtended takes a base register and an index register. The index register is sign/zero-extended,
    28  	// and then scaled by bits(type)/8.
    29  	//
    30  	// e.g.
    31  	// 	- ldrh w1, [x2, w3, SXTW #1] ;; sign-extended and scaled by 2 (== LSL #1)
    32  	// 	- strh w1, [x2, w3, UXTW #1] ;; zero-extended and scaled by 2 (== LSL #1)
    33  	// 	- ldr w1, [x2, w3, SXTW #2] ;; sign-extended and scaled by 4 (== LSL #2)
    34  	// 	- str x1, [x2, w3, UXTW #3] ;; zero-extended and scaled by 8 (== LSL #3)
    35  	//
    36  	// See the following pages:
    37  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--register---Load-Register-Halfword--register--
    38  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register--
    39  	addressModeKindRegScaledExtended addressModeKind = iota
    40  
    41  	// addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without the extension.
    42  	addressModeKindRegScaled
    43  
    44  	// addressModeKindRegExtended is the same as addressModeKindRegScaledExtended, but without the scale factor.
    45  	addressModeKindRegExtended
    46  
    47  	// addressModeKindRegReg takes a base register and an index register. The index register is neither scaled nor extended.
    48  	addressModeKindRegReg
    49  
    50  	// addressModeKindRegSignedImm9 takes a base register and a 9-bit "signed" immediate offset (-256 to 255).
    51  	// The immediate is sign-extended and added to the base register.
    52  	// This is a.k.a. "unscaled" since the immediate is not scaled.
    53  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled--
    54  	addressModeKindRegSignedImm9
    55  
    56  	// addressModeKindRegUnsignedImm12 takes a base register and a 12-bit "unsigned" immediate offset, scaled by
    57  	// the size of the type. In other words, the actual offset is imm12 * bits(type)/8.
    58  	// See "Unsigned offset" in the following pages:
    59  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
    60  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
    61  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
    62  	addressModeKindRegUnsignedImm12
    63  
    64  	// addressModeKindPostIndex takes a base register and a 9-bit "signed" immediate offset.
    65  	// After the load/store, the base register will be updated by the offset.
    66  	//
    67  	// Note that when this is used for a pair load/store, the offset is a 7-bit "signed" immediate.
    68  	//
    69  	// See "Post-index" in the following pages for examples:
    70  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
    71  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
    72  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
    73  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
    74  	addressModeKindPostIndex
    75  
    76  	// addressModeKindPreIndex takes a base register and a 9-bit "signed" immediate offset.
    77  	// Before the load/store, the base register will be updated by the offset.
    78  	//
    79  	// Note that when this is used for a pair load/store, the offset is a 7-bit "signed" immediate.
    80  	//
    81  	// See "Pre-index" in the following pages for examples:
    82  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
    83  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
    84  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
    85  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
    86  	addressModeKindPreIndex
    87  
    88  	// addressModeKindArgStackSpace is used to resolve the address of the argument stack space
    89  	// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
    90  	// at this phase of compilation, this is used as a placeholder and is later lowered to one of the real addressing modes above.
    91  	addressModeKindArgStackSpace
    92  
    93  	// addressModeKindResultStackSpace is used to resolve the address of the result stack space
    94  	// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
    95  	// at this phase of compilation, this is used as a placeholder and is later lowered to one of the real addressing modes above.
    96  	addressModeKindResultStackSpace
    97  )
    98  
    99  func (a addressMode) format(dstSizeBits byte) (ret string) {
   100  	base := formatVRegSized(a.rn, 64)
   101  	if rn := a.rn; rn.RegType() != regalloc.RegTypeInt {
   102  		panic("invalid base register type: " + rn.RegType().String())
   103  	} else if rn.IsRealReg() && v0 <= rn.RealReg() && rn.RealReg() <= v30 {
   104  		panic("BUG: likely a bug in reg alloc or reset behavior")
   105  	}
   106  
   107  	switch a.kind {
   108  	case addressModeKindRegScaledExtended:
   109  		amount := a.sizeInBitsToShiftAmount(dstSizeBits)
   110  		ret = fmt.Sprintf("[%s, %s, %s #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp, amount)
   111  	case addressModeKindRegScaled:
   112  		amount := a.sizeInBitsToShiftAmount(dstSizeBits)
   113  		ret = fmt.Sprintf("[%s, %s, lsl #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), amount)
   114  	case addressModeKindRegExtended:
   115  		ret = fmt.Sprintf("[%s, %s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp)
   116  	case addressModeKindRegReg:
   117  		ret = fmt.Sprintf("[%s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()))
   118  	case addressModeKindRegSignedImm9:
   119  		if a.imm != 0 {
   120  			ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
   121  		} else {
   122  			ret = fmt.Sprintf("[%s]", base)
   123  		}
   124  	case addressModeKindRegUnsignedImm12:
   125  		if a.imm != 0 {
   126  			ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
   127  		} else {
   128  			ret = fmt.Sprintf("[%s]", base)
   129  		}
   130  	case addressModeKindPostIndex:
   131  		ret = fmt.Sprintf("[%s], #%#x", base, a.imm)
   132  	case addressModeKindPreIndex:
   133  		ret = fmt.Sprintf("[%s, #%#x]!", base, a.imm)
   134  	case addressModeKindArgStackSpace:
   135  		ret = fmt.Sprintf("[#arg_space, #%#x]", a.imm)
   136  	case addressModeKindResultStackSpace:
   137  		ret = fmt.Sprintf("[#ret_space, #%#x]", a.imm)
   138  	}
   139  	return
   140  }
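
        // To make the format strings above concrete, here is a rough sketch of what format produces,
        // assuming formatVRegSized renders the base/index registers as "x2"/"w3" and extendOp prints as
        // "sxtw" (both are defined elsewhere in this package):
        //
        //	{kind: addressModeKindRegUnsignedImm12, rn: x2, imm: 0x10}                  => "[x2, #0x10]"
        //	{kind: addressModeKindPreIndex, rn: x2, imm: -0x10}                         => "[x2, #-0x10]!"
        //	{kind: addressModeKindRegScaledExtended, rn: x2, rm: x3, extOp: sxtw} (32b) => "[x2, w3, sxtw #0x2]"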
   141  
   142  func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode {
   143  	if !offsetFitsInAddressModeKindRegSignedImm9(imm) {
   144  		panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm))
   145  	}
   146  	if preIndex {
   147  		return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
   148  	} else {
   149  		return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
   150  	}
   151  }
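
        // Usage sketch (hypothetical registers; "sp" stands in for whichever base VReg is used):
        //
        //	addressModePreOrPostIndex(sp, 16, true)  // pre-index:  "[sp, #0x10]!"
        //	addressModePreOrPostIndex(sp, 16, false) // post-index: "[sp], #0x10"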
   152  
   153  func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool {
   154  	divisor := int64(dstSizeInBits) / 8
   155  	return 0 < offset && offset%divisor == 0 && offset/divisor < 4096
   156  }
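
        // Worked example: with dstSizeInBits=64 the divisor is 8, so offset 32 fits (32%8 == 0 and
        // 32/8 = 4 < 4096), offset 12 does not (not a multiple of 8), and offset 32768 does not
        // (32768/8 = 4096 is outside the 12-bit range). Note that a zero offset also returns false here,
        // so zero offsets fall through to the other addressing-mode cases.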
   157  
   158  func offsetFitsInAddressModeKindRegSignedImm9(offset int64) bool {
   159  	return -256 <= offset && offset <= 255
   160  }
   161  
   162  func (a addressMode) indexRegBits() byte {
   163  	bits := a.extOp.srcBits()
   164  	if bits != 32 && bits != 64 {
   165  		panic("invalid index register for address mode. it must be either 32 or 64 bits")
   166  	}
   167  	return bits
   168  }
   169  
   170  func (a addressMode) sizeInBitsToShiftAmount(sizeInBits byte) (lsl byte) {
   171  	switch sizeInBits {
   172  	case 8:
   173  		lsl = 0
   174  	case 16:
   175  		lsl = 1
   176  	case 32:
   177  		lsl = 2
   178  	case 64:
   179  		lsl = 3
   180  	}
   181  	return
   182  }
   183  
   184  func extLoadSignSize(op ssa.Opcode) (size byte, signed bool) {
   185  	switch op {
   186  	case ssa.OpcodeUload8:
   187  		size, signed = 8, false
   188  	case ssa.OpcodeUload16:
   189  		size, signed = 16, false
   190  	case ssa.OpcodeUload32:
   191  		size, signed = 32, false
   192  	case ssa.OpcodeSload8:
   193  		size, signed = 8, true
   194  	case ssa.OpcodeSload16:
   195  		size, signed = 16, true
   196  	case ssa.OpcodeSload32:
   197  		size, signed = 32, true
   198  	default:
   199  		panic("BUG")
   200  	}
   201  	return
   202  }
   203  
   204  func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret regalloc.VReg) {
   205  	size, signed := extLoadSignSize(op)
   206  	amode := m.lowerToAddressMode(ptr, offset, size)
   207  	load := m.allocateInstr()
   208  	if signed {
   209  		load.asSLoad(operandNR(ret), amode, size)
   210  	} else {
   211  		load.asULoad(operandNR(ret), amode, size)
   212  	}
   213  	m.insert(load)
   214  }
   215  
   216  func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.Value) {
   217  	amode := m.lowerToAddressMode(ptr, offset, typ.Bits())
   218  
   219  	dst := m.compiler.VRegOf(ret)
   220  	load := m.allocateInstr()
   221  	switch typ {
   222  	case ssa.TypeI32, ssa.TypeI64:
   223  		load.asULoad(operandNR(dst), amode, typ.Bits())
   224  	case ssa.TypeF32, ssa.TypeF64:
   225  		load.asFpuLoad(operandNR(dst), amode, typ.Bits())
   226  	case ssa.TypeV128:
   227  		load.asFpuLoad(operandNR(dst), amode, 128)
   228  	default:
   229  		panic("TODO")
   230  	}
   231  	m.insert(load)
   232  }
   233  
   234  func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane, ret ssa.Value) {
   235  	var opSize byte
   236  	switch lane {
   237  	case ssa.VecLaneI8x16:
   238  		opSize = 8
   239  	case ssa.VecLaneI16x8:
   240  		opSize = 16
   241  	case ssa.VecLaneI32x4:
   242  		opSize = 32
   243  	case ssa.VecLaneI64x2:
   244  		opSize = 64
   245  	}
   246  	amode := m.lowerToAddressMode(ptr, offset, opSize)
   247  	rd := operandNR(m.compiler.VRegOf(ret))
   248  	m.lowerLoadSplatFromAddressMode(rd, amode, lane)
   249  }
   250  
   251  // lowerLoadSplatFromAddressMode is extracted from lowerLoadSplat for testing.
   252  func (m *machine) lowerLoadSplatFromAddressMode(rd operand, amode addressMode, lane ssa.VecLane) {
   253  	tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
   254  
   255  	// vecLoad1R only supports an offset (base+imm) as a post-index addressing mode, so the only mode we can
   256  	// use here is the "no-offset" register form; any offset or index is first folded into a temporary base register.
   257  	switch amode.kind {
   258  	case addressModeKindRegReg:
   259  		add := m.allocateInstr()
   260  		add.asALU(aluOpAdd, tmpReg, operandNR(amode.rn), operandNR(amode.rm), true)
   261  		m.insert(add)
   262  	case addressModeKindRegSignedImm9:
   263  		add := m.allocateInstr()
   264  		add.asALU(aluOpAdd, tmpReg, operandNR(amode.rn), operandImm12(uint16(amode.imm), 0), true)
   265  		m.insert(add)
   266  	case addressModeKindRegUnsignedImm12:
   267  		if amode.imm != 0 {
   268  			offsetReg := m.compiler.AllocateVReg(ssa.TypeI64)
   269  			m.load64bitConst(amode.imm, offsetReg)
   270  			add := m.allocateInstr()
   271  			m.insert(add)
   272  			add.asALU(aluOpAdd, tmpReg, operandNR(amode.rn), operandNR(offsetReg), true)
   273  		} else {
   274  			tmpReg = operandNR(amode.rn)
   275  		}
   276  	default:
   277  		panic("unsupported address mode for LoadSplat")
   278  	}
   279  
   280  	arr := ssaLaneToArrangement(lane)
   281  
   282  	ld1r := m.allocateInstr()
   283  	ld1r.asVecLoad1R(rd, tmpReg, arr)
   284  	m.insert(ld1r)
   285  }
   286  
   287  func (m *machine) lowerStore(si *ssa.Instruction) {
   288  	// TODO: merge consecutive stores into a single pair store instruction.
   289  	value, ptr, offset, storeSizeInBits := si.StoreData()
   290  	amode := m.lowerToAddressMode(ptr, offset, storeSizeInBits)
   291  
   292  	valueOp := m.getOperand_NR(m.compiler.ValueDefinition(value), extModeNone)
   293  	store := m.allocateInstr()
   294  	store.asStore(valueOp, amode, storeSizeInBits)
   295  	m.insert(store)
   296  }
   297  
   298  // lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
   299  func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) {
   300  	// TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and
   301  	// addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed
   302  	// to support more efficient address resolution.
   303  
   304  	a32s, a64s, offset := m.collectAddends(ptr)
   305  	offset += int64(offsetBase)
   306  	return m.lowerToAddressModeFromAddends(a32s, a64s, size, offset)
   307  }
   308  
   309  // lowerToAddressModeFromAddends creates an addressMode from a list of addends collected by collectAddends.
   310  // During the construction, this might emit additional instructions.
   311  //
   312  // Extracted as a separate function for easy testing.
   313  func (m *machine) lowerToAddressModeFromAddends(a32s *queue[addend32], a64s *queue[regalloc.VReg], size byte, offset int64) (amode addressMode) {
   314  	switch a64sExist, a32sExist := !a64s.empty(), !a32s.empty(); {
   315  	case a64sExist && a32sExist:
   316  		var base regalloc.VReg
   317  		base = a64s.dequeue()
   318  		var a32 addend32
   319  		a32 = a32s.dequeue()
   320  		amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
   321  	case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset):
   322  		var base regalloc.VReg
   323  		base = a64s.dequeue()
   324  		amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
   325  		offset = 0
   326  	case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset):
   327  		var base regalloc.VReg
   328  		base = a64s.dequeue()
   329  		amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
   330  		offset = 0
   331  	case a64sExist:
   332  		var base regalloc.VReg
   333  		base = a64s.dequeue()
   334  		if !a64s.empty() {
   335  			index := a64s.dequeue()
   336  			amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
   337  		} else {
   338  			amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
   339  		}
   340  	case a32sExist:
   341  		base32 := a32s.dequeue()
   342  
   343  		// First we need a 64-bit base.
   344  		base := m.compiler.AllocateVReg(ssa.TypeI64)
   345  		baseExt := m.allocateInstr()
   346  		var signed bool
   347  		if base32.ext == extendOpSXTW {
   348  			signed = true
   349  		}
   350  		baseExt.asExtend(base, base32.r, 32, 64, signed)
   351  		m.insert(baseExt)
   352  
   353  		if !a32s.empty() {
   354  			index := a32s.dequeue()
   355  			amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
   356  		} else {
   357  			amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
   358  		}
   359  	default: // Only static offsets.
   360  		tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
   361  		m.lowerConstantI64(tmpReg, offset)
   362  		amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0}
   363  		offset = 0
   364  	}
   365  
   366  	baseReg := amode.rn
   367  	if offset > 0 {
   368  		baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset
   369  	}
   370  
   371  	for !a64s.empty() {
   372  		a64 := a64s.dequeue()
   373  		baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64
   374  	}
   375  
   376  	for !a32s.empty() {
   377  		a32 := a32s.dequeue()
   378  		baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit)
   379  	}
   380  	amode.rn = baseReg
   381  	return
   382  }
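
        // Selection sketch: for example, given a64s = [x10] (hypothetical register), no 32-bit addends,
        // offset = 8 and size = 64, the second case above yields addressModeKindRegUnsignedImm12 with
        // imm = 8 and clears the offset. If the offset were 1<<20 instead, no immediate form fits, so the
        // base-register mode with a zero immediate is chosen and the leftover offset is added into the
        // base register via addConstToReg64 after the switch.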
   383  
   384  var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst}
   385  
   386  func (m *machine) collectAddends(ptr ssa.Value) (addends32 *queue[addend32], addends64 *queue[regalloc.VReg], offset int64) {
   387  	m.addendsWorkQueue.reset()
   388  	m.addends32.reset()
   389  	m.addends64.reset()
   390  	m.addendsWorkQueue.enqueue(ptr)
   391  
   392  	for !m.addendsWorkQueue.empty() {
   393  		v := m.addendsWorkQueue.dequeue()
   394  
   395  		def := m.compiler.ValueDefinition(v)
   396  		switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op {
   397  		case ssa.OpcodeIadd:
   398  			// If the addend is an add, we recursively collect its operands.
   399  			x, y := def.Instr.Arg2()
   400  			m.addendsWorkQueue.enqueue(x)
   401  			m.addendsWorkQueue.enqueue(y)
   402  			def.Instr.MarkLowered()
   403  		case ssa.OpcodeIconst:
   404  			// If the addend is constant, we just statically merge it into the offset.
   405  			ic := def.Instr
   406  			u64 := ic.ConstantVal()
   407  			if ic.Return().Type().Bits() == 32 {
   408  				offset += int64(int32(u64)) // sign-extend.
   409  			} else {
   410  				offset += int64(u64)
   411  			}
   412  			def.Instr.MarkLowered()
   413  		case ssa.OpcodeUExtend, ssa.OpcodeSExtend:
   414  			switch input := def.Instr.Arg(); input.Type().Bits() {
   415  			case 64:
   416  			// If the input is already 64-bit, this extend is a no-op. TODO: shouldn't this be optimized out at a much earlier stage?
   417  				m.addends64.enqueue(m.getOperand_NR(m.compiler.ValueDefinition(input), extModeNone).nr())
   418  				def.Instr.MarkLowered()
   419  				continue
   420  			case 32:
   421  				var ext extendOp
   422  				if op == ssa.OpcodeUExtend {
   423  					ext = extendOpUXTW
   424  				} else {
   425  					ext = extendOpSXTW
   426  				}
   427  
   428  				inputDef := m.compiler.ValueDefinition(input)
   429  				constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant()
   430  				switch {
   431  				case constInst && ext == extendOpUXTW:
   432  					// Zero-extension of a 32-bit constant can be merged into the offset.
   433  					offset += int64(uint32(inputDef.Instr.ConstantVal()))
   434  				case constInst && ext == extendOpSXTW:
   435  					// Sign-extension of a 32-bit constant can be merged into the offset.
   436  					offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend!
   437  				default:
   438  					m.addends32.enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext})
   439  				}
   440  				def.Instr.MarkLowered()
   441  				continue
   442  			}
   443  			// If the extension is from a source smaller than 32 bits, it cannot be merged into the addressing mode,
   444  			// since arm64 requires index registers to be at least 32 bits (the extend options apply only to 32-bit or 64-bit registers).
   445  			// fallthrough
   446  			panic("TODO: add tests")
   447  		default:
   448  			// If the addend is not any of the opcodes above, we use it as-is (without merging), zero-extending it if necessary.
   449  			m.addends64.enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr())
   450  		}
   451  	}
   452  	return &m.addends32, &m.addends64, offset
   453  }
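
        // Decomposition sketch: for a pointer computed as iadd(base, iadd(uextend(idx32), iconst 0x20))
        // (hypothetical SSA values), the work queue peels the iadds apart so that base ends up in
        // addends64, idx32 ends up in addends32 tagged with UXTW, and 0x20 is folded into the returned
        // offset. Values defined by any other opcode are not traversed further and are queued as 64-bit
        // addends as-is.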
   454  
   455  func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
   456  	rd = m.compiler.AllocateVReg(ssa.TypeI64)
   457  	alu := m.allocateInstr()
   458  	if imm12Op, ok := asImm12Operand(uint64(c)); ok {
   459  		alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true)
   460  	} else if imm12Op, ok = asImm12Operand(uint64(-c)); ok {
   461  		alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true)
   462  	} else {
   463  		tmp := m.compiler.AllocateVReg(ssa.TypeI64)
   464  		m.load64bitConst(c, tmp)
   465  		alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true)
   466  	}
   467  	m.insert(alu)
   468  	return
   469  }
   470  
   471  func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) {
   472  	rd = m.compiler.AllocateVReg(ssa.TypeI64)
   473  	alu := m.allocateInstr()
   474  	alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true)
   475  	m.insert(alu)
   476  	return
   477  }
   478  
   479  func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) {
   480  	rd = m.compiler.AllocateVReg(ssa.TypeI64)
   481  	alu := m.allocateInstr()
   482  	alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true)
   483  	m.insert(alu)
   484  	return
   485  }
   486  
   487  // queue is a resettable FIFO queue whose underlying slice is reused across resets.
   488  type queue[T any] struct {
   489  	index int
   490  	data  []T
   491  }
   492  
   493  func (q *queue[T]) enqueue(v T) {
   494  	q.data = append(q.data, v)
   495  }
   496  
   497  func (q *queue[T]) dequeue() (ret T) {
   498  	ret = q.data[q.index]
   499  	q.index++
   500  	return
   501  }
   502  
   503  func (q *queue[T]) empty() bool {
   504  	return q.index >= len(q.data)
   505  }
   506  
   507  func (q *queue[T]) reset() {
   508  	q.index = 0
   509  	q.data = q.data[:0]
   510  }
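
        // Usage sketch: queue is a plain FIFO over a reusable backing slice, e.g.
        //
        //	var q queue[int]
        //	q.enqueue(1)
        //	q.enqueue(2)
        //	_ = q.dequeue() // 1
        //	q.reset()       // empties the queue but keeps the backing array for reuse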