github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/engine/compiler/impl_arm64.go

     1  // This file implements the compiler for arm64 target.
     2  // Please refer to https://developer.arm.com/documentation/102374/latest/
     3  // if unfamiliar with arm64 instructions and semantics.
     4  package compiler
     5  
     6  import (
     7  	"bytes"
     8  	"errors"
     9  	"fmt"
    10  	"math"
    11  
    12  	"github.com/bananabytelabs/wazero/internal/asm"
    13  	"github.com/bananabytelabs/wazero/internal/asm/arm64"
    14  	"github.com/bananabytelabs/wazero/internal/wasm"
    15  	"github.com/bananabytelabs/wazero/internal/wazeroir"
    16  )
    17  
    18  type arm64Compiler struct {
    19  	assembler arm64.Assembler
    20  	ir        *wazeroir.CompilationResult
     21  	// locationStack holds the state of the wazeroir virtual stack,
     22  	// where each item is either placed in a register or on the actual memory stack.
    23  	locationStack *runtimeValueLocationStack
    24  	// labels maps a label (e.g. ".L1_then") to *arm64LabelInfo.
    25  	labels [wazeroir.LabelKindNum][]arm64LabelInfo
    26  	// stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation.
    27  	stackPointerCeil uint64
     28  	// assignStackPointerCeilNeeded holds an asm.Node whose AssignSourceConstant must be called with the determined stack pointer ceiling.
    29  	assignStackPointerCeilNeeded asm.Node
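         	// compiledTrapTargets caches, per status code, the first-emitted exit sequence so that
         	// later exits with the same status can simply branch into it (see compileExitFromNativeCode).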
    30  	compiledTrapTargets          [nativeCallStatusModuleClosed]asm.Node
    31  	withListener                 bool
    32  	typ                          *wasm.FunctionType
    33  	br                           *bytes.Reader
    34  	// locationStackForEntrypoint is the initial location stack for all functions. To reuse the allocated stack,
    35  	// we cache it here, and reset and set to .locationStack in the Init method.
    36  	locationStackForEntrypoint runtimeValueLocationStack
    37  	// frameIDMax tracks the maximum value of frame id per function.
    38  	frameIDMax int
    39  	brTableTmp []runtimeValueLocation
    40  }
    41  
    42  func newArm64Compiler() compiler {
    43  	return &arm64Compiler{
    44  		assembler:                  arm64.NewAssembler(arm64ReservedRegisterForTemporary),
    45  		locationStackForEntrypoint: newRuntimeValueLocationStack(),
    46  		br:                         bytes.NewReader(nil),
    47  	}
    48  }
    49  
    50  // Init implements compiler.Init.
    51  func (c *arm64Compiler) Init(typ *wasm.FunctionType, ir *wazeroir.CompilationResult, withListener bool) {
    52  	c.assembler.Reset()
    53  	c.locationStackForEntrypoint.reset()
    54  	c.resetLabels()
    55  
    56  	*c = arm64Compiler{
    57  		ir:                         ir,
    58  		withListener:               withListener,
    59  		typ:                        typ,
    60  		assembler:                  c.assembler,
    61  		labels:                     c.labels,
    62  		br:                         c.br,
    63  		brTableTmp:                 c.brTableTmp,
    64  		locationStackForEntrypoint: c.locationStackForEntrypoint,
    65  	}
    66  
    67  	// Reuses the initial location stack for the compilation of subsequent functions.
    68  	c.locationStack = &c.locationStackForEntrypoint
    69  }
    70  
     71  // resetLabels resets the existing content in arm64Compiler.labels so that
     72  // we can reuse the allocated slices and stacks in subsequent compilations.
    73  func (c *arm64Compiler) resetLabels() {
    74  	for i := range c.labels {
    75  		for j := range c.labels[i] {
    76  			if j > c.frameIDMax {
     77  				// Only need to reset up to the maximum frame id. This makes the compilation faster for large binaries.
    78  				break
    79  			}
    80  			l := &c.labels[i][j]
    81  			l.initialInstruction = nil
    82  			l.stackInitialized = false
    83  			l.initialStack.reset()
    84  		}
    85  	}
    86  }
    87  
    88  var (
    89  	arm64UnreservedVectorRegisters = []asm.Register{
    90  		arm64.RegV0, arm64.RegV1, arm64.RegV2, arm64.RegV3,
    91  		arm64.RegV4, arm64.RegV5, arm64.RegV6, arm64.RegV7, arm64.RegV8,
    92  		arm64.RegV9, arm64.RegV10, arm64.RegV11, arm64.RegV12, arm64.RegV13,
    93  		arm64.RegV14, arm64.RegV15, arm64.RegV16, arm64.RegV17, arm64.RegV18,
    94  		arm64.RegV19, arm64.RegV20, arm64.RegV21, arm64.RegV22, arm64.RegV23,
    95  		arm64.RegV24, arm64.RegV25, arm64.RegV26, arm64.RegV27, arm64.RegV28,
    96  		arm64.RegV29, arm64.RegV30, arm64.RegV31,
    97  	}
    98  
    99  	// Note (see arm64 section in https://go.dev/doc/asm):
   100  	// * RegR18 is reserved as a platform register, and we don't use it in Compiler.
    101  	// * RegR28 is reserved for the goroutine by the Go runtime, and we don't use it in Compiler.
   102  	arm64UnreservedGeneralPurposeRegisters = []asm.Register{ //nolint
   103  		arm64.RegR3, arm64.RegR4, arm64.RegR5, arm64.RegR6, arm64.RegR7, arm64.RegR8,
   104  		arm64.RegR9, arm64.RegR10, arm64.RegR11, arm64.RegR12, arm64.RegR13,
   105  		arm64.RegR14, arm64.RegR15, arm64.RegR16, arm64.RegR17, arm64.RegR19,
   106  		arm64.RegR20, arm64.RegR21, arm64.RegR22, arm64.RegR23, arm64.RegR24,
   107  		arm64.RegR25, arm64.RegR26, arm64.RegR29, arm64.RegR30,
   108  	}
   109  )
   110  
   111  const (
   112  	// arm64ReservedRegisterForCallEngine holds the pointer to callEngine instance (i.e. *callEngine as uintptr)
   113  	arm64ReservedRegisterForCallEngine = arm64.RegR0
   114  	// arm64ReservedRegisterForStackBasePointerAddress holds stack base pointer's address (callEngine.stackBasePointer) in the current function call.
   115  	arm64ReservedRegisterForStackBasePointerAddress = arm64.RegR1
   116  	// arm64ReservedRegisterForMemory holds the pointer to the memory slice's data (i.e. &memory.Buffer[0] as uintptr).
   117  	arm64ReservedRegisterForMemory = arm64.RegR2
    118  	// arm64ReservedRegisterForTemporary is the temporary register which is available at any point of execution, but its contents must not be assumed to live beyond a single operation.
   119  	// Note: we choose R27 as that is the temporary register used in Go's assembler.
   120  	arm64ReservedRegisterForTemporary = arm64.RegR27
   121  )
   122  
   123  var arm64CallingConventionModuleInstanceAddressRegister = arm64.RegR29
   124  
   125  const (
    126  	// arm64CallEngineArchContextCompilerCallReturnAddressOffset is the offset of archContext.compilerCallReturnAddress in callEngine.
   127  	arm64CallEngineArchContextCompilerCallReturnAddressOffset = 144
    128  	// arm64CallEngineArchContextMinimum32BitSignedIntOffset is the offset of archContext.minimum32BitSignedInt in callEngine.
   129  	arm64CallEngineArchContextMinimum32BitSignedIntOffset = 152
    130  	// arm64CallEngineArchContextMinimum64BitSignedIntOffset is the offset of archContext.minimum64BitSignedInt in callEngine.
   131  	arm64CallEngineArchContextMinimum64BitSignedIntOffset = 160
   132  )
   133  
   134  func isZeroRegister(r asm.Register) bool {
   135  	return r == arm64.RegRZR
   136  }
   137  
   138  // compileNOP implements compiler.compileNOP for the arm64 architecture.
   139  func (c *arm64Compiler) compileNOP() asm.Node {
   140  	return c.assembler.CompileStandAlone(arm64.NOP)
   141  }
   142  
   143  // compile implements compiler.compile for the arm64 architecture.
   144  func (c *arm64Compiler) compile(buf asm.Buffer) (stackPointerCeil uint64, err error) {
   145  	// c.stackPointerCeil tracks the stack pointer ceiling (max seen) value across all runtimeValueLocationStack(s)
   146  	// used for all labels (via setLocationStack), excluding the current one.
    147  	// Hence, we check here whether the final block's ceiling exceeds the current c.stackPointerCeil.
   148  	stackPointerCeil = c.stackPointerCeil
   149  	if stackPointerCeil < c.locationStack.stackPointerCeil {
   150  		stackPointerCeil = c.locationStack.stackPointerCeil
   151  	}
   152  
   153  	// Now that the ceil of stack pointer is determined, we are invoking the callback.
   154  	// Note: this must be called before Assemble() below.
   155  	c.assignStackPointerCeil(stackPointerCeil)
   156  
   157  	err = c.assembler.Assemble(buf)
   158  	return
   159  }
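         // The stack pointer ceiling is only known after the whole function body has been
         // compiled, so compileMaybeGrowStack emits a placeholder constant (math.MaxInt32)
         // and records the node in assignStackPointerCeilNeeded; compile then patches it via
         // assignStackPointerCeil before Assemble. A minimal sketch of this two-phase pattern
         // (illustrative only, reusing the assembler API of this file):
         //
         //	node := c.assembler.CompileConstToRegister(arm64.MOVD, math.MaxInt32, reg) // placeholder
         //	// ... compile the rest of the function, tracking the ceiling ...
         //	node.AssignSourceConstant(int64(ceil) << 3) // patch the real value; 8 bytes per value slot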
   160  
   161  // arm64LabelInfo holds a wazeroir label specific information in this function.
   162  type arm64LabelInfo struct {
    163  	// initialInstruction is the initial instruction for this label so that other blocks can branch into it.
   164  	initialInstruction asm.Node
   165  	// initialStack is the initial value location stack from which we start compiling this label.
   166  	initialStack     runtimeValueLocationStack
   167  	stackInitialized bool
   168  }
   169  
   170  // assignStackPointerCeil implements compilerImpl.assignStackPointerCeil for the arm64 architecture.
   171  func (c *arm64Compiler) assignStackPointerCeil(ceil uint64) {
   172  	if c.assignStackPointerCeilNeeded != nil {
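         		// The ceiling is in value slots; <<3 converts it to bytes (each slot is 8 bytes wide).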
   173  		c.assignStackPointerCeilNeeded.AssignSourceConstant(int64(ceil) << 3)
   174  	}
   175  }
   176  
   177  func (c *arm64Compiler) label(label wazeroir.Label) *arm64LabelInfo {
   178  	kind := label.Kind()
   179  	frames := c.labels[kind]
   180  	frameID := label.FrameID()
   181  	if c.frameIDMax < frameID {
   182  		c.frameIDMax = frameID
   183  	}
    184  	// If the frameID is not allocated yet, expand the slice to cover it,
    185  	// so that we can reduce allocations in subsequent compilations.
   186  	if diff := frameID - len(frames) + 1; diff > 0 {
   187  		for i := 0; i < diff; i++ {
   188  			frames = append(frames, arm64LabelInfo{initialStack: newRuntimeValueLocationStack()})
   189  		}
   190  		c.labels[kind] = frames
   191  	}
   192  	return &frames[frameID]
   193  }
   194  
    195  // runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the arm64 architecture.
   196  func (c *arm64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
   197  	return c.locationStack
   198  }
   199  
   200  // pushRuntimeValueLocationOnRegister implements compiler.pushRuntimeValueLocationOnRegister for arm64.
   201  func (c *arm64Compiler) pushRuntimeValueLocationOnRegister(reg asm.Register, vt runtimeValueType) (ret *runtimeValueLocation) {
   202  	ret = c.locationStack.pushRuntimeValueLocationOnRegister(reg, vt)
   203  	c.markRegisterUsed(reg)
   204  	return
   205  }
   206  
   207  // pushVectorRuntimeValueLocationOnRegister implements compiler.pushVectorRuntimeValueLocationOnRegister for arm64.
   208  func (c *arm64Compiler) pushVectorRuntimeValueLocationOnRegister(reg asm.Register) (lowerBitsLocation *runtimeValueLocation) {
   209  	lowerBitsLocation = c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Lo)
   210  	c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Hi)
   211  	c.markRegisterUsed(reg)
   212  	return
   213  }
   214  
   215  func (c *arm64Compiler) markRegisterUsed(regs ...asm.Register) {
   216  	for _, reg := range regs {
   217  		if !isZeroRegister(reg) && reg != asm.NilRegister {
   218  			c.locationStack.markRegisterUsed(reg)
   219  		}
   220  	}
   221  }
   222  
   223  func (c *arm64Compiler) markRegisterUnused(regs ...asm.Register) {
   224  	for _, reg := range regs {
   225  		if !isZeroRegister(reg) && reg != asm.NilRegister {
   226  			c.locationStack.markRegisterUnused(reg)
   227  		}
   228  	}
   229  }
   230  
   231  func (c *arm64Compiler) String() (ret string) { return c.locationStack.String() }
   232  
   233  // compilePreamble implements compiler.compilePreamble for the arm64 architecture.
   234  func (c *arm64Compiler) compilePreamble() error {
   235  	c.markRegisterUsed(arm64CallingConventionModuleInstanceAddressRegister)
   236  	defer c.markRegisterUnused(arm64CallingConventionModuleInstanceAddressRegister)
   237  
   238  	c.locationStack.init(c.typ)
   239  
   240  	// Check if it's necessary to grow the value stack before entering function body.
   241  	if err := c.compileMaybeGrowStack(); err != nil {
   242  		return err
   243  	}
   244  
   245  	if err := c.compileModuleContextInitialization(); err != nil {
   246  		return err
   247  	}
   248  
   249  	if c.withListener {
   250  		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexFunctionListenerBefore); err != nil {
   251  			return err
   252  		}
   253  	}
   254  
   255  	// We must initialize the stack base pointer register so that we can manipulate the stack properly.
   256  	c.compileReservedStackBasePointerRegisterInitialization()
   257  
   258  	c.compileReservedMemoryRegisterInitialization()
   259  
   260  	return nil
   261  }
   262  
   263  // compileMaybeGrowStack adds instructions to check the necessity to grow the value stack,
   264  // and if so, make the builtin function call to do so. These instructions are called in the function's
   265  // preamble.
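         //
         // In Go terms, the emitted check is roughly the following (an illustrative sketch;
         // stackLenInBytes and stackBasePointerInBytes name the callEngine fields behind the
         // offsets used below):
         //
         //	if ce.stackLenInBytes-ce.stackBasePointerInBytes < stackPointerCeilInBytes {
         //		// call the builtin Go function to grow ce.stack
         //	}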
   266  func (c *arm64Compiler) compileMaybeGrowStack() error {
   267  	tmpX, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
   268  	if !found {
   269  		panic("BUG: all the registers should be free at this point")
   270  	}
   271  	c.markRegisterUsed(tmpX)
   272  	tmpY, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
   273  	if !found {
   274  		panic("BUG: all the registers should be free at this point")
   275  	}
   276  	c.markRegisterUsed(tmpY)
   277  
   278  	// "tmpX = len(ce.stack)"
   279  	c.assembler.CompileMemoryToRegister(
   280  		arm64.LDRD,
   281  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackLenInBytesOffset,
   282  		tmpX,
   283  	)
   284  
   285  	// "tmpY = ce.stackBasePointer"
   286  	c.assembler.CompileMemoryToRegister(
   287  		arm64.LDRD,
   288  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
   289  		tmpY,
   290  	)
   291  
   292  	// "tmpX = tmpX - tmpY", in other words "tmpX = len(ce.stack) - ce.stackBasePointer"
   293  	c.assembler.CompileRegisterToRegister(
   294  		arm64.SUB,
   295  		tmpY,
   296  		tmpX,
   297  	)
   298  
   299  	// "tmpY = stackPointerCeil"
   300  	loadStackPointerCeil := c.assembler.CompileConstToRegister(
   301  		arm64.MOVD,
   302  		math.MaxInt32,
   303  		tmpY,
   304  	)
    305  	// At this point of compilation, we don't know the value of the stack pointer ceiling,
    306  	// so we lazily resolve the value later.
   307  	c.assignStackPointerCeilNeeded = loadStackPointerCeil
   308  
   309  	// Compare tmpX (len(ce.stack) - ce.stackBasePointer) and tmpY (ce.stackPointerCeil)
   310  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmpX, tmpY)
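         	// Note: given this assembler's (from, to) operand order, the flags reflect "tmpY - tmpX",
         	// so the BCONDLS below is taken when ceil <= len(ce.stack) - ce.stackBasePointer,
         	// i.e. when no grow is needed.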
   311  
   312  	// If ceil > stackLen - stack base pointer, we need to grow the stack by calling builtin Go function.
   313  	brIfStackOK := c.assembler.CompileJump(arm64.BCONDLS)
   314  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexGrowStack); err != nil {
   315  		return err
   316  	}
   317  
   318  	// Otherwise, skip calling it.
   319  	c.assembler.SetJumpTargetOnNext(brIfStackOK)
   320  
   321  	c.markRegisterUnused(tmpX, tmpY)
   322  	return nil
   323  }
   324  
   325  // returnFunction emits instructions to return from the current function frame.
   326  // If the current frame is the bottom, the code goes back to the Go code with nativeCallStatusCodeReturned status.
   327  // Otherwise, we branch into the caller's return address.
   328  func (c *arm64Compiler) compileReturnFunction() error {
   329  	// Release all the registers as our calling convention requires the caller-save.
   330  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
   331  		return err
   332  	}
   333  
   334  	if c.withListener {
   335  		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexFunctionListenerAfter); err != nil {
   336  			return err
   337  		}
   338  		// After return, we re-initialize the stack base pointer as that is used to return to the caller below.
   339  		c.compileReservedStackBasePointerRegisterInitialization()
   340  	}
   341  
    342  	// arm64CallingConventionModuleInstanceAddressRegister holds the module instance's address,
    343  	// so we mark it used so that it won't be allocated as a free register.
   344  	c.locationStack.markRegisterUsed(arm64CallingConventionModuleInstanceAddressRegister)
   345  	defer c.locationStack.markRegisterUnused(arm64CallingConventionModuleInstanceAddressRegister)
   346  
   347  	returnAddress, callerStackBasePointerInBytes, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
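         	// These three locations correspond to the callFrame slots pushed by compileCallImpl
         	// (see the stack layout diagram there).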
   348  
    349  	// If the return address is zero, we return from the execution (this is the bottom frame).
   350  	returnAddress.setRegister(arm64ReservedRegisterForTemporary)
   351  	c.compileLoadValueOnStackToRegister(returnAddress)
   352  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, arm64.RegRZR)
   353  
    354  	// If the address doesn't equal zero, skip the exit and br into the caller's return address below;
    355  	// otherwise, exit with nativeCallStatusCodeReturned.
   356  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusCodeReturned)
   357  
   358  	// Alias for readability.
   359  	tmp := arm64CallingConventionModuleInstanceAddressRegister
   360  
   361  	// First, restore the stackContext.stackBasePointerInBytesOffset from callerStackBasePointerInBytes.
   362  	callerStackBasePointerInBytes.setRegister(tmp)
   363  	c.compileLoadValueOnStackToRegister(callerStackBasePointerInBytes)
   364  	c.assembler.CompileRegisterToMemory(arm64.STRD, tmp,
   365  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
   366  
   367  	// Next, restore moduleContext.fn from callerFunction.
   368  	callerFunction.setRegister(tmp)
   369  	c.compileLoadValueOnStackToRegister(callerFunction)
   370  	c.assembler.CompileRegisterToMemory(arm64.STRD, tmp,
   371  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
   372  
   373  	// Also, we have to put the target function's *wasm.ModuleInstance into arm64CallingConventionModuleInstanceAddressRegister.
   374  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
   375  		tmp, functionModuleInstanceOffset,
   376  		arm64CallingConventionModuleInstanceAddressRegister)
   377  
   378  	c.assembler.CompileJumpToRegister(arm64.B, returnAddress.register)
   379  	return nil
   380  }
   381  
   382  func (c *arm64Compiler) compileMaybeExitFromNativeCode(skipCondition asm.Instruction, status nativeCallStatusCode) {
   383  	skip := c.assembler.CompileJump(skipCondition)
   384  	c.compileExitFromNativeCode(status)
   385  	c.assembler.SetJumpTargetOnNext(skip)
   386  }
   387  
   388  // compileExitFromNativeCode adds instructions to give the control back to ce.exec with the given status code.
   389  func (c *arm64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) {
   390  	if target := c.compiledTrapTargets[status]; target != nil {
   391  		c.assembler.CompileJump(arm64.B).AssignJumpTarget(target)
   392  		return
   393  	}
   394  
   395  	switch status {
   396  	case nativeCallStatusCodeReturned:
   397  		// Save the target for reuse.
   398  		c.compiledTrapTargets[status] = c.compileNOP()
   399  	case nativeCallStatusCodeCallGoHostFunction, nativeCallStatusCodeCallBuiltInFunction:
   400  		// Read the return address, and write it to callEngine.exitContext.returnAddress.
   401  		c.assembler.CompileReadInstructionAddress(arm64ReservedRegisterForTemporary, arm64.RET)
   402  		c.assembler.CompileRegisterToMemory(
   403  			arm64.STRD, arm64ReservedRegisterForTemporary,
   404  			arm64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset,
   405  		)
   406  	default:
   407  		if c.ir.IROperationSourceOffsetsInWasmBinary != nil {
    408  			// In this case, the execution traps, and we want the top frame's source position in the stack trace.
    409  			// We store the instruction address into callEngine.exitContext.returnAddress.
   410  			c.assembler.CompileReadInstructionAddress(arm64ReservedRegisterForTemporary, arm64.STRD)
   411  			c.assembler.CompileRegisterToMemory(
   412  				arm64.STRD, arm64ReservedRegisterForTemporary,
   413  				arm64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset,
   414  			)
   415  		} else {
   416  			// We won't use the source position, so just save the target for reuse.
   417  			c.compiledTrapTargets[status] = c.compileNOP()
   418  		}
   419  	}
   420  
    421  	// Write the current stack pointer to ce.stackContext.stackPointer.
   422  	c.assembler.CompileConstToRegister(arm64.MOVD, int64(c.locationStack.sp), arm64ReservedRegisterForTemporary)
   423  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64ReservedRegisterForTemporary, arm64ReservedRegisterForCallEngine,
   424  		callEngineStackContextStackPointerOffset)
   425  
   426  	// Write the status to callEngine.exitContext.statusCode.
   427  	if status != 0 {
   428  		c.assembler.CompileConstToRegister(arm64.MOVW, int64(status), arm64ReservedRegisterForTemporary)
   429  		c.assembler.CompileRegisterToMemory(arm64.STRW, arm64ReservedRegisterForTemporary,
   430  			arm64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
   431  	} else {
   432  		// If the status == 0, we use zero register to store zero.
   433  		c.assembler.CompileRegisterToMemory(arm64.STRW, arm64.RegRZR,
   434  			arm64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
   435  	}
   436  
    437  	// The return address to the Go code is stored in archContext.compilerCallReturnAddress which
    438  	// is embedded in ce. We load the value into the temporary register, and then
    439  	// invoke RET with that register.
   440  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
   441  		arm64ReservedRegisterForCallEngine, arm64CallEngineArchContextCompilerCallReturnAddressOffset,
   442  		arm64ReservedRegisterForTemporary)
   443  
   444  	c.assembler.CompileJumpToRegister(arm64.RET, arm64ReservedRegisterForTemporary)
   445  }
   446  
    447  // compileGoDefinedHostFunction implements compiler.compileGoDefinedHostFunction for the arm64 architecture.
   448  func (c *arm64Compiler) compileGoDefinedHostFunction() error {
   449  	// First we must update the location stack to reflect the number of host function inputs.
   450  	c.locationStack.init(c.typ)
   451  
   452  	if c.withListener {
   453  		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction,
   454  			builtinFunctionIndexFunctionListenerBefore); err != nil {
   455  			return err
   456  		}
   457  	}
   458  
    459  	// The host function needs access to the caller's module instance, and the caller's information is stored
    460  	// on the stack (as described in the doc of callEngine.stack). Here, we get the caller's *function from the
    461  	// stack, load its *wasm.ModuleInstance, and save it in callEngine.exitContext.callerModuleInstance so we can
    462  	// pass it to the host function without sacrificing performance.
   463  	c.compileReservedStackBasePointerRegisterInitialization()
   464  	// Alias for readability.
   465  	tmp := arm64CallingConventionModuleInstanceAddressRegister
   466  	// Get the location of the callerFunction (*function) in the stack, which depends on the signature.
   467  	_, _, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
   468  	// Load the value into the tmp register: tmp = &function{..}
   469  	callerFunction.setRegister(tmp)
   470  	c.compileLoadValueOnStackToRegister(callerFunction)
   471  	// tmp = *(tmp+functionModuleInstanceOffset) = &wasm.ModuleInstance{...}
   472  	c.assembler.CompileMemoryToRegister(arm64.LDRD, tmp, functionModuleInstanceOffset, tmp)
   473  	// Load it onto callEngine.exitContext.callerModuleInstance.
   474  	c.assembler.CompileRegisterToMemory(arm64.STRD,
   475  		tmp,
   476  		arm64ReservedRegisterForCallEngine, callEngineExitContextCallerModuleInstanceOffset)
   477  	// Reset the state of callerFunction value location so that we won't mess up subsequent code generation below.
   478  	c.locationStack.releaseRegister(callerFunction)
   479  
   480  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallGoHostFunction, 0); err != nil {
   481  		return err
   482  	}
   483  
   484  	// Initializes the reserved stack base pointer which is used to retrieve the call frame stack.
   485  	c.compileReservedStackBasePointerRegisterInitialization()
   486  
    487  	// Go function can change the module state in arbitrary ways, so we have to force
   488  	// the callEngine.moduleContext initialization on the function return. To do so,
   489  	// we zero-out callEngine.moduleInstance.
   490  	c.assembler.CompileRegisterToMemory(arm64.STRD,
   491  		arm64.RegRZR,
   492  		arm64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)
   493  
   494  	return c.compileReturnFunction()
   495  }
   496  
   497  // setLocationStack sets the given runtimeValueLocationStack to .locationStack field,
   498  // while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
    499  // This is called when we branch into a different block.
   500  func (c *arm64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
   501  	if c.stackPointerCeil < c.locationStack.stackPointerCeil {
   502  		c.stackPointerCeil = c.locationStack.stackPointerCeil
   503  	}
   504  	c.locationStack = newStack
   505  }
   506  
   507  // compileBuiltinFunctionCheckExitCode implements compiler.compileBuiltinFunctionCheckExitCode for the arm64 architecture.
   508  func (c *arm64Compiler) compileBuiltinFunctionCheckExitCode() error {
   509  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexCheckExitCode); err != nil {
   510  		return err
   511  	}
   512  
   513  	// After return, we re-initialize reserved registers just like preamble of functions.
   514  	c.compileReservedStackBasePointerRegisterInitialization()
   515  	c.compileReservedMemoryRegisterInitialization()
   516  	return nil
   517  }
   518  
   519  // compileLabel implements compiler.compileLabel for the arm64 architecture.
   520  func (c *arm64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipThisLabel bool) {
   521  	labelKey := wazeroir.Label(o.U1)
   522  	labelInfo := c.label(labelKey)
   523  
   524  	// If initialStack is not set, that means this label has never been reached.
   525  	if !labelInfo.stackInitialized {
   526  		skipThisLabel = true
   527  		return
   528  	}
   529  
   530  	if labelBegin := labelInfo.initialInstruction; labelBegin == nil {
    531  		// We use NOP as the beginning of instructions in a label.
    532  		// This should eventually be optimized out by the assembler.
   533  		labelInfo.initialInstruction = c.assembler.CompileStandAlone(arm64.NOP)
   534  	} else {
   535  		c.assembler.Add(labelBegin)
   536  	}
   537  
   538  	// Set the initial stack.
   539  	c.setLocationStack(&labelInfo.initialStack)
   540  	return false
   541  }
   542  
   543  // compileUnreachable implements compiler.compileUnreachable for the arm64 architecture.
   544  func (c *arm64Compiler) compileUnreachable() error {
   545  	c.compileExitFromNativeCode(nativeCallStatusCodeUnreachable)
   546  	return nil
   547  }
   548  
   549  // compileSet implements compiler.compileSet for the arm64 architecture.
   550  func (c *arm64Compiler) compileSet(o *wazeroir.UnionOperation) error {
   551  	depth := int(o.U1)
   552  	isTargetVector := o.B3
   553  
   554  	setTargetIndex := int(c.locationStack.sp) - 1 - depth
   555  
   556  	if isTargetVector {
   557  		_ = c.locationStack.pop()
   558  	}
   559  	v := c.locationStack.pop()
   560  	if err := c.compileEnsureOnRegister(v); err != nil {
   561  		return err
   562  	}
   563  
   564  	targetLocation := &c.locationStack.stack[setTargetIndex]
   565  	if targetLocation.onRegister() {
   566  		// We no longer need the register previously used by the target location.
   567  		c.markRegisterUnused(targetLocation.register)
   568  	}
   569  
   570  	reg := v.register
   571  	targetLocation.setRegister(reg)
   572  	targetLocation.valueType = v.valueType
   573  	if isTargetVector {
   574  		hi := &c.locationStack.stack[setTargetIndex+1]
   575  		hi.setRegister(reg)
   576  	}
   577  	return nil
   578  }
   579  
   580  // compileGlobalGet implements compiler.compileGlobalGet for the arm64 architecture.
   581  func (c *arm64Compiler) compileGlobalGet(o *wazeroir.UnionOperation) error {
   582  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   583  		return err
   584  	}
   585  
   586  	index := uint32(o.U1)
   587  
   588  	wasmValueType := c.ir.Globals[index].ValType
   589  	isV128 := wasmValueType == wasm.ValueTypeV128
   590  	// Get the address of globals[index] into globalAddressReg.
   591  	globalAddressReg, err := c.compileReadGlobalAddress(index)
   592  	if err != nil {
   593  		return err
   594  	}
   595  
   596  	if isV128 {
   597  		resultReg, err := c.allocateRegister(registerTypeVector)
   598  		if err != nil {
   599  			return err
   600  		}
   601  		c.assembler.CompileConstToRegister(arm64.ADD, globalInstanceValueOffset, globalAddressReg)
   602  		c.assembler.CompileMemoryToVectorRegister(arm64.VMOV, globalAddressReg, 0,
   603  			resultReg, arm64.VectorArrangementQ)
   604  
   605  		c.pushVectorRuntimeValueLocationOnRegister(resultReg)
   606  	} else {
   607  		ldr := arm64.NOP
   608  		var result asm.Register
   609  		var vt runtimeValueType
   610  		switch wasmValueType {
   611  		case wasm.ValueTypeI32:
   612  			ldr = arm64.LDRW
   613  			vt = runtimeValueTypeI32
   614  			result = globalAddressReg
   615  		case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
   616  			ldr = arm64.LDRD
   617  			vt = runtimeValueTypeI64
   618  			result = globalAddressReg
   619  		case wasm.ValueTypeF32:
   620  			result, err = c.allocateRegister(registerTypeVector)
   621  			if err != nil {
   622  				return err
   623  			}
   624  			ldr = arm64.FLDRS
   625  			vt = runtimeValueTypeF32
   626  		case wasm.ValueTypeF64:
   627  			result, err = c.allocateRegister(registerTypeVector)
   628  			if err != nil {
   629  				return err
   630  			}
   631  			ldr = arm64.FLDRD
   632  			vt = runtimeValueTypeF64
   633  		}
   634  
   635  		// "result = [globalAddressReg + globalInstanceValueOffset] (== globals[index].Val)"
   636  		c.assembler.CompileMemoryToRegister(
   637  			ldr,
   638  			globalAddressReg, globalInstanceValueOffset,
   639  			result,
   640  		)
   641  
   642  		c.pushRuntimeValueLocationOnRegister(result, vt)
   643  	}
   644  	return nil
   645  }
   646  
   647  // compileGlobalSet implements compiler.compileGlobalSet for the arm64 architecture.
   648  func (c *arm64Compiler) compileGlobalSet(o *wazeroir.UnionOperation) error {
   649  	index := uint32(o.U1)
   650  
   651  	wasmValueType := c.ir.Globals[index].ValType
   652  	isV128 := wasmValueType == wasm.ValueTypeV128
   653  
   654  	var val *runtimeValueLocation
   655  	if isV128 {
   656  		val = c.locationStack.popV128()
   657  	} else {
   658  		val = c.locationStack.pop()
   659  	}
   660  	if err := c.compileEnsureOnRegister(val); err != nil {
   661  		return err
   662  	}
   663  
   664  	globalInstanceAddressRegister, err := c.compileReadGlobalAddress(index)
   665  	if err != nil {
   666  		return err
   667  	}
   668  
   669  	if isV128 {
   670  		c.assembler.CompileVectorRegisterToMemory(arm64.VMOV,
   671  			val.register, globalInstanceAddressRegister, globalInstanceValueOffset,
   672  			arm64.VectorArrangementQ)
   673  	} else {
   674  		var str asm.Instruction
   675  		switch c.ir.Globals[index].ValType {
   676  		case wasm.ValueTypeI32:
   677  			str = arm64.STRW
   678  		case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
   679  			str = arm64.STRD
   680  		case wasm.ValueTypeF32:
   681  			str = arm64.FSTRS
   682  		case wasm.ValueTypeF64:
   683  			str = arm64.FSTRD
   684  		}
   685  
    686  		// At this point "globalInstanceAddressRegister = globals[index]",
    687  		// so this stores "globals[index].Val = val.register".
   688  		c.assembler.CompileRegisterToMemory(
   689  			str,
   690  			val.register,
   691  			globalInstanceAddressRegister, globalInstanceValueOffset,
   692  		)
   693  	}
   694  
   695  	c.markRegisterUnused(val.register)
   696  	return nil
   697  }
   698  
    699  // compileReadGlobalAddress adds instructions to store the absolute address of the global instance at globalIndex into a register.
   700  func (c *arm64Compiler) compileReadGlobalAddress(globalIndex uint32) (destinationRegister asm.Register, err error) {
   701  	// TODO: rethink about the type used in store `globals []*GlobalInstance`.
   702  	// If we use `[]GlobalInstance` instead, we could reduce one MOV instruction here.
   703  
   704  	destinationRegister, err = c.allocateRegister(registerTypeGeneralPurpose)
   705  	if err != nil {
   706  		return
   707  	}
   708  
   709  	// "destinationRegister = globalIndex * 8"
   710  	c.assembler.CompileConstToRegister(
   711  		// globalIndex is an index to []*GlobalInstance, therefore
   712  		// we have to multiply it by the size of *GlobalInstance == the pointer size == 8.
   713  		arm64.MOVD, int64(globalIndex)*8, destinationRegister,
   714  	)
   715  
   716  	// "arm64ReservedRegisterForTemporary = &globals[0]"
   717  	c.assembler.CompileMemoryToRegister(
   718  		arm64.LDRD,
   719  		arm64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset,
   720  		arm64ReservedRegisterForTemporary,
   721  	)
   722  
   723  	// "destinationRegister = [arm64ReservedRegisterForTemporary + destinationRegister] (== globals[globalIndex])".
   724  	c.assembler.CompileMemoryWithRegisterOffsetToRegister(
   725  		arm64.LDRD,
   726  		arm64ReservedRegisterForTemporary, destinationRegister,
   727  		destinationRegister,
   728  	)
   729  	return
   730  }
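         // In Go terms, compileReadGlobalAddress computes roughly the following (an illustrative
         // sketch; globals names the []*GlobalInstance behind the element-0 address read above):
         //
         //	addr := globalsElement0Address + uintptr(globalIndex)*8 // == &globals[globalIndex]
         //	dst := *(*uintptr)(unsafe.Pointer(addr))                 // == globals[globalIndex]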
   731  
   732  // compileBr implements compiler.compileBr for the arm64 architecture.
   733  func (c *arm64Compiler) compileBr(o *wazeroir.UnionOperation) error {
   734  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   735  		return err
   736  	}
   737  	return c.compileBranchInto(wazeroir.Label(o.U1))
   738  }
   739  
   740  // compileBrIf implements compiler.compileBrIf for the arm64 architecture.
   741  func (c *arm64Compiler) compileBrIf(o *wazeroir.UnionOperation) error {
   742  	cond := c.locationStack.pop()
   743  
   744  	var conditionalBR asm.Node
   745  	if cond.onConditionalRegister() {
   746  		// If the cond is on a conditional register, it corresponds to one of "conditional codes"
   747  		// https://developer.arm.com/documentation/dui0801/a/Condition-Codes/Condition-code-suffixes
   748  		// Here we represent the conditional codes by using arm64.COND_** registers, and that means the
   749  		// conditional jump can be performed if we use arm64.B**.
   750  		// For example, if we have arm64.CondEQ on cond, that means we performed compileEq right before
   751  		// this compileBrIf and BrIf can be achieved by arm64.BCONDEQ.
   752  		var brInst asm.Instruction
   753  		switch cond.conditionalRegister {
   754  		case arm64.CondEQ:
   755  			brInst = arm64.BCONDEQ
   756  		case arm64.CondNE:
   757  			brInst = arm64.BCONDNE
   758  		case arm64.CondHS:
   759  			brInst = arm64.BCONDHS
   760  		case arm64.CondLO:
   761  			brInst = arm64.BCONDLO
   762  		case arm64.CondMI:
   763  			brInst = arm64.BCONDMI
   764  		case arm64.CondHI:
   765  			brInst = arm64.BCONDHI
   766  		case arm64.CondLS:
   767  			brInst = arm64.BCONDLS
   768  		case arm64.CondGE:
   769  			brInst = arm64.BCONDGE
   770  		case arm64.CondLT:
   771  			brInst = arm64.BCONDLT
   772  		case arm64.CondGT:
   773  			brInst = arm64.BCONDGT
   774  		case arm64.CondLE:
   775  			brInst = arm64.BCONDLE
   776  		default:
    777  			// BUG: This means that cond.conditionalRegister was set somewhere in this file,
    778  			// but is not covered by the switch above. That shouldn't happen.
   779  			return fmt.Errorf("unsupported condition for br_if: %v", cond.conditionalRegister)
   780  		}
   781  		conditionalBR = c.assembler.CompileJump(brInst)
   782  	} else {
   783  		// If the value is not on the conditional register, we compare the value with the zero register,
   784  		// and then do the conditional BR if the value doesn't equal zero.
   785  		if err := c.compileEnsureOnRegister(cond); err != nil {
   786  			return err
   787  		}
   788  		// Compare the value with zero register. Note that the value is ensured to be i32 by function validation phase,
   789  		// so we use CMPW (32-bit compare) here.
   790  		c.assembler.CompileTwoRegistersToNone(arm64.CMPW, cond.register, arm64.RegRZR)
   791  
   792  		conditionalBR = c.assembler.CompileJump(arm64.BCONDNE)
   793  
   794  		c.markRegisterUnused(cond.register)
   795  	}
   796  
    797  	// Emit the code for branching into the else branch.
   798  	elseTarget := wazeroir.Label(o.U2)
   799  	if err := c.compileBranchInto(elseTarget); err != nil {
   800  		return err
   801  	}
   802  	// We branch into here from the original conditional BR (conditionalBR).
   803  	c.assembler.SetJumpTargetOnNext(conditionalBR)
   804  	thenTarget := wazeroir.Label(o.U1)
   805  	if err := compileDropRange(c, o.U3); err != nil {
   806  		return err
   807  	}
   808  	return c.compileBranchInto(thenTarget)
   809  }
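         // For example, a wasm i32.eq immediately followed by br_if leaves arm64.CondEQ on the
         // conditional register (see the comment in compileBrIf above), so the branch compiles
         // down to a single BCONDEQ.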
   810  
   811  func (c *arm64Compiler) compileBranchInto(target wazeroir.Label) error {
   812  	if target.IsReturnTarget() {
   813  		return c.compileReturnFunction()
   814  	} else {
   815  		if c.ir.LabelCallers[target] > 1 {
    816  			// We can only reuse the register state when there's a single call-site.
    817  			// If there are multiple call-sites, release the existing values on registers to the stack
    818  			// to have a consistent value location state at the beginning of the label.
   819  			if err := c.compileReleaseAllRegistersToStack(); err != nil {
   820  				return err
   821  			}
   822  		}
   823  		// Set the initial stack of the target label, so we can start compiling the label
    824  		// with the appropriate value locations. Note that we clone the stack here as we may
    825  		// manipulate the stack before the compiler reaches the label.
   826  		targetLabel := c.label(target)
   827  		if !targetLabel.stackInitialized {
   828  			targetLabel.initialStack.cloneFrom(*c.locationStack)
   829  			targetLabel.stackInitialized = true
   830  		}
   831  
   832  		br := c.assembler.CompileJump(arm64.B)
   833  		c.assignBranchTarget(target, br)
   834  		return nil
   835  	}
   836  }
   837  
   838  // assignBranchTarget assigns the given label's initial instruction to the destination of br.
   839  func (c *arm64Compiler) assignBranchTarget(label wazeroir.Label, br asm.Node) {
   840  	target := c.label(label)
   841  
   842  	targetInst := target.initialInstruction
   843  	if targetInst == nil {
   844  		// If the label isn't compiled yet, allocate the NOP node, and set as the initial instruction.
   845  		targetInst = c.assembler.AllocateNOP()
   846  		target.initialInstruction = targetInst
   847  	}
   848  
   849  	br.AssignJumpTarget(targetInst)
   850  }
   851  
   852  // compileBrTable implements compiler.compileBrTable for the arm64 architecture.
   853  func (c *arm64Compiler) compileBrTable(o *wazeroir.UnionOperation) error {
   854  	// If the operation only consists of the default target, we branch into it and return early.
   855  	if len(o.Us) == 2 {
   856  		loc := c.locationStack.pop()
   857  		if loc.onRegister() {
   858  			c.markRegisterUnused(loc.register)
   859  		}
   860  		if err := compileDropRange(c, o.Us[1]); err != nil {
   861  			return err
   862  		}
   863  		return c.compileBranchInto(wazeroir.Label(o.Us[0]))
   864  	}
   865  
   866  	index := c.locationStack.pop()
   867  	if err := c.compileEnsureOnRegister(index); err != nil {
   868  		return err
   869  	}
   870  
   871  	if isZeroRegister(index.register) {
   872  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
   873  		if err != nil {
   874  			return err
   875  		}
   876  		index.setRegister(reg)
   877  		c.markRegisterUsed(reg)
   878  
   879  		// Zero the value on a picked register.
   880  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, reg)
   881  	}
   882  
   883  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
   884  	if err != nil {
   885  		return err
   886  	}
   887  
    888  	// Load the number of branch table targets (excluding the default).
    889  	// "tmpReg = len(o.Us)/2 - 1"
   890  	c.assembler.CompileConstToRegister(arm64.MOVW, int64(len(o.Us)/2-1), tmpReg)
    891  	// Compare the index with the number of targets.
   892  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmpReg, index.register)
    893  	// If the index is out of range, clamp it to the number of targets, which corresponds to the default target's entry.
   894  	brDefaultIndex := c.assembler.CompileJump(arm64.BCONDLO)
   895  	c.assembler.CompileRegisterToRegister(arm64.MOVW, tmpReg, index.register)
   896  	c.assembler.SetJumpTargetOnNext(brDefaultIndex)
   897  
   898  	// We prepare the asm.StaticConst which holds the offset of
   899  	// each target's first instruction (incl. default)
   900  	// relative to the beginning of label tables.
   901  	//
   902  	// For example, if we have targets=[L0, L1] and default=L_DEFAULT,
   903  	// we emit the code like this at [Emit the code for each target and default branch] below.
   904  	//
   905  	// L0:
   906  	//  0x123001: XXXX, ...
   907  	//  .....
   908  	// L1:
   909  	//  0x123005: YYY, ...
   910  	//  .....
   911  	// L_DEFAULT:
   912  	//  0x123009: ZZZ, ...
   913  	//
    914  	// then offsetData becomes like [0x0, 0x4, 0x8].
   915  	// By using this offset list, we could jump into the label for the index by
   916  	// "jmp offsetData[index]+0x123001" and "0x123001" can be acquired by ADR instruction.
   917  	//
    918  	// Note: We store each offset of 32-bit unsigned integer as 4 consecutive bytes. So more precisely,
    919  	// the above example's offsetData would be [0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0].
   920  	//
   921  	// Note: this is similar to how GCC implements Switch statements in C.
   922  	offsetData := asm.NewStaticConst(make([]byte, 4*(len(o.Us)/2)))
   923  
   924  	// "tmpReg = &offsetData[0]"
   925  	c.assembler.CompileStaticConstToRegister(arm64.ADR, offsetData, tmpReg)
   926  
   927  	// "index.register = tmpReg + (index.register << 2) (== &offsetData[offset])"
   928  	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD, index.register, 2, tmpReg, index.register)
   929  
   930  	// "index.register = *index.register (== offsetData[offset])"
   931  	c.assembler.CompileMemoryToRegister(arm64.LDRW, index.register, 0, index.register)
   932  
   933  	// Now we read the address of the beginning of the jump table.
   934  	// In the above example, this corresponds to reading the address of 0x123001.
   935  	c.assembler.CompileReadInstructionAddress(tmpReg, arm64.B)
   936  
   937  	// Now we have the address of L0 in tmp register, and the offset to the target label in the index.register.
   938  	// So we could achieve the br_table jump by adding them and jump into the resulting address.
   939  	c.assembler.CompileRegisterToRegister(arm64.ADD, tmpReg, index.register)
   940  
   941  	c.assembler.CompileJumpToRegister(arm64.B, index.register)
   942  
   943  	// We no longer need the index's register, so mark it unused.
   944  	c.markRegisterUnused(index.register)
   945  
    946  	// [Emit the code for each target and default branch]
   947  	labelInitialInstructions := make([]asm.Node, len(o.Us)/2)
   948  
    949  	// Since we might end up with a different stack state in each branch,
    950  	// we need to save the initial stack state here, and use the same initial state
    951  	// for each iteration.
   952  	initialLocationStack := c.getSavedTemporaryLocationStack()
   953  
   954  	for i := range labelInitialInstructions {
   955  		// Emit the initial instruction of each target where
   956  		// we use NOP as we don't yet know the next instruction in each label.
   957  		init := c.assembler.CompileStandAlone(arm64.NOP)
   958  		labelInitialInstructions[i] = init
   959  
   960  		targetLabel := wazeroir.Label(o.Us[i*2])
   961  		targetToDrop := o.Us[i*2+1]
   962  		if err = compileDropRange(c, targetToDrop); err != nil {
   963  			return err
   964  		}
   965  		if err = c.compileBranchInto(targetLabel); err != nil {
   966  			return err
   967  		}
   968  		// After the iteration, reset the stack's state with initialLocationStack.
   969  		c.locationStack.cloneFrom(initialLocationStack)
   970  	}
   971  
   972  	c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
   973  	return nil
   974  }
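         // In Go terms, the dispatch emitted above behaves roughly like the following
         // (an illustrative sketch; numTargets excludes the default entry):
         //
         //	if index >= numTargets {
         //		index = numTargets // the default entry is stored last in offsetData
         //	}
         //	target := jumpTableBase + uintptr(offsetData[index]) // first instruction of the label
         //	// branch into target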
   975  
   976  func (c *arm64Compiler) getSavedTemporaryLocationStack() runtimeValueLocationStack {
   977  	initialLocationStack := *c.locationStack // Take copy!
    978  	// Use c.brTableTmp for the underlying stack so that we can reduce allocations.
   979  	if diff := int(initialLocationStack.sp) - len(c.brTableTmp); diff > 0 {
   980  		c.brTableTmp = append(c.brTableTmp, make([]runtimeValueLocation, diff)...)
   981  	}
   982  	copy(c.brTableTmp, initialLocationStack.stack[:initialLocationStack.sp])
   983  	initialLocationStack.stack = c.brTableTmp
   984  	return initialLocationStack
   985  }
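         // The copy into c.brTableTmp is needed because each compileDropRange/compileBranchInto
         // iteration in compileBrTable mutates the contents of c.locationStack's underlying slice;
         // snapshotting into a separate backing array lets every iteration restart from the same
         // state without allocating a fresh stack per table.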
   986  
   987  // compileCall implements compiler.compileCall for the arm64 architecture.
   988  func (c *arm64Compiler) compileCall(o *wazeroir.UnionOperation) error {
   989  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   990  		return err
   991  	}
   992  
   993  	functionIndex := o.U1
   994  
   995  	tp := &c.ir.Types[c.ir.Functions[functionIndex]]
   996  
   997  	targetFunctionAddressReg, err := c.allocateRegister(registerTypeGeneralPurpose)
   998  	if err != nil {
   999  		return err
  1000  	}
  1001  	c.markRegisterUsed(targetFunctionAddressReg)
  1002  	defer c.markRegisterUnused(targetFunctionAddressReg)
  1003  
   1004  	// Compute the address of the target function (i.e. &ce.functions[functionIndex]).
   1005  	//
   1006  	// First, we read the address of the first item of the ce.functions slice (= &ce.functions[0])
   1007  	// into targetFunctionAddressReg.
  1008  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1009  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  1010  		targetFunctionAddressReg)
  1011  
  1012  	c.assembler.CompileConstToRegister(
  1013  		arm64.ADD,
   1014  		int64(functionIndex)*functionSize, // functionSize is the size of each function struct in ce.functions.
  1015  		targetFunctionAddressReg)
  1016  
  1017  	return c.compileCallImpl(targetFunctionAddressReg, tp)
  1018  }
  1019  
  1020  // compileCallImpl implements compiler.compileCall and compiler.compileCallIndirect for the arm64 architecture.
  1021  func (c *arm64Compiler) compileCallImpl(targetFunctionAddressRegister asm.Register, functype *wasm.FunctionType) error {
  1022  	// Release all the registers as our calling convention requires the caller-save.
  1023  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  1024  		return err
  1025  	}
  1026  
  1027  	tmp, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  1028  	if !ok {
  1029  		panic("BUG: cannot take a free register")
  1030  	}
  1031  
  1032  	// The stack should look like:
  1033  	//
  1034  	//               reserved slots for results (if len(results) > len(args))
  1035  	//                      |     |
  1036  	//    ,arg0, ..., argN, ..., _, .returnAddress, .returnStackBasePointerInBytes, .function, ....
  1037  	//      |                       |                                                        |
  1038  	//      |             callFrame{^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^}
  1039  	//      |
  1040  	// nextStackBasePointerOffset
  1041  	//
  1042  	// where callFrame is used to return to this currently executed function.
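         	//
         	// Viewed as a Go struct, the callFrame portion above corresponds roughly to the
         	// following (an illustrative sketch of the three slots, not a type defined in this file):
         	//
         	//	type callFrame struct {
         	//		returnAddress                 uintptr // native address to br back into
         	//		returnStackBasePointerInBytes uint64  // caller's stack base pointer to restore
         	//		function                      uintptr // caller's *function to restore into ce.moduleContext.fn
         	//	}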
  1043  
  1044  	nextStackBasePointerOffset := int64(c.locationStack.sp) - int64(functype.ParamNumInUint64)
  1045  
  1046  	callFrameReturnAddressLoc, callFrameStackBasePointerInBytesLoc, callFrameFunctionLoc := c.locationStack.pushCallFrame(functype)
  1047  
  1048  	// Save the current stack base pointer at callFrameStackBasePointerInBytesLoc.
  1049  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1050  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  1051  		tmp)
  1052  	callFrameStackBasePointerInBytesLoc.setRegister(tmp)
  1053  	c.compileReleaseRegisterToStack(callFrameStackBasePointerInBytesLoc)
  1054  
  1055  	// Set callEngine.stackContext.stackBasePointer for the next function.
  1056  	c.assembler.CompileConstToRegister(arm64.ADD, nextStackBasePointerOffset<<3, tmp)
  1057  	c.assembler.CompileRegisterToMemory(arm64.STRD,
  1058  		tmp,
  1059  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
  1060  
  1061  	// Save the currently executed *function (placed at callEngine.moduleContext.fn) into callFrameFunctionLoc.
  1062  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1063  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset,
  1064  		tmp)
  1065  	callFrameFunctionLoc.setRegister(tmp)
  1066  	c.compileReleaseRegisterToStack(callFrameFunctionLoc)
  1067  
  1068  	// Set callEngine.moduleContext.fn to the next *function.
  1069  	c.assembler.CompileRegisterToMemory(arm64.STRD,
  1070  		targetFunctionAddressRegister,
  1071  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
  1072  
  1073  	// Write the return address into callFrameReturnAddressLoc.
  1074  	c.assembler.CompileReadInstructionAddress(tmp, arm64.B)
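         	// (CompileReadInstructionAddress obtains the address of the instruction that follows
         	// the next arm64.B, i.e. the instruction the callee will return into.)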
  1075  	callFrameReturnAddressLoc.setRegister(tmp)
  1076  	c.compileReleaseRegisterToStack(callFrameReturnAddressLoc)
  1077  
  1078  	if targetFunctionAddressRegister == arm64CallingConventionModuleInstanceAddressRegister {
   1079  		// In this case, we must move the value on targetFunctionAddressRegister to another register; otherwise
   1080  		// the address (the jump target below) would be modified and result in a segfault.
   1081  		// See #526.
  1082  		c.assembler.CompileRegisterToRegister(arm64.MOVD, targetFunctionAddressRegister, tmp)
  1083  		targetFunctionAddressRegister = tmp
  1084  	}
  1085  
   1086  	// Also, we have to put the target function's *wasm.ModuleInstance into arm64CallingConventionModuleInstanceAddressRegister.
  1087  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1088  		targetFunctionAddressRegister, functionModuleInstanceOffset,
  1089  		arm64CallingConventionModuleInstanceAddressRegister,
  1090  	)
  1091  
  1092  	// Then, br into the target function's initial address.
  1093  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1094  		targetFunctionAddressRegister, functionCodeInitialAddressOffset,
  1095  		targetFunctionAddressRegister)
  1096  
  1097  	c.assembler.CompileJumpToRegister(arm64.B, targetFunctionAddressRegister)
  1098  
  1099  	// We consumed the function parameters, the call frame stack and reserved slots during the call.
  1100  	c.locationStack.sp = uint64(nextStackBasePointerOffset)
  1101  
  1102  	// Also, the function results were pushed by the call.
  1103  	for _, t := range functype.Results {
  1104  		loc := c.locationStack.pushRuntimeValueLocationOnStack()
  1105  		switch t {
  1106  		case wasm.ValueTypeI32:
  1107  			loc.valueType = runtimeValueTypeI32
  1108  		case wasm.ValueTypeI64, wasm.ValueTypeFuncref, wasm.ValueTypeExternref:
  1109  			loc.valueType = runtimeValueTypeI64
  1110  		case wasm.ValueTypeF32:
  1111  			loc.valueType = runtimeValueTypeF32
  1112  		case wasm.ValueTypeF64:
  1113  			loc.valueType = runtimeValueTypeF64
  1114  		case wasm.ValueTypeV128:
  1115  			loc.valueType = runtimeValueTypeV128Lo
  1116  			hi := c.locationStack.pushRuntimeValueLocationOnStack()
  1117  			hi.valueType = runtimeValueTypeV128Hi
  1118  		}
  1119  	}
  1120  
  1121  	if err := c.compileModuleContextInitialization(); err != nil {
  1122  		return err
  1123  	}
  1124  
  1125  	// On the function return, we initialize the state for this function.
  1126  	c.compileReservedStackBasePointerRegisterInitialization()
  1127  
  1128  	c.compileReservedMemoryRegisterInitialization()
  1129  	return nil
  1130  }
  1131  
  1132  // compileCallIndirect implements compiler.compileCallIndirect for the arm64 architecture.
  1133  func (c *arm64Compiler) compileCallIndirect(o *wazeroir.UnionOperation) (err error) {
  1134  	offset := c.locationStack.pop()
  1135  	if err = c.compileEnsureOnRegister(offset); err != nil {
  1136  		return err
  1137  	}
  1138  	typeIndex := o.U1
  1139  	tableIndex := o.U2
  1140  
  1141  	offsetReg := offset.register
  1142  	if isZeroRegister(offsetReg) {
  1143  		offsetReg, err = c.allocateRegister(registerTypeGeneralPurpose)
  1144  		if err != nil {
  1145  			return err
  1146  		}
  1147  		c.markRegisterUsed(offsetReg)
  1148  
  1149  		// Zero the value on a picked register.
  1150  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, offsetReg)
  1151  	}
  1152  
  1153  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  1154  	if err != nil {
  1155  		return err
  1156  	}
  1157  	c.markRegisterUsed(tmp)
  1158  
  1159  	tmp2, err := c.allocateRegister(registerTypeGeneralPurpose)
  1160  	if err != nil {
  1161  		return err
  1162  	}
  1163  	c.markRegisterUsed(tmp2)
  1164  
   1165  	// First, we need to check that the offset doesn't exceed the length of the table.
  1166  	// "tmp = &Tables[0]"
  1167  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1168  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  1169  		tmp,
  1170  	)
  1171  	// tmp = [tmp + TableIndex*8] = [&Tables[0] + TableIndex*sizeOf(*tableInstance)] = Tables[tableIndex]
  1172  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1173  		tmp, int64(tableIndex)*8,
  1174  		tmp,
  1175  	)
  1176  	// tmp2 = [tmp + tableInstanceTableLenOffset] = len(Tables[tableIndex])
  1177  	c.assembler.CompileMemoryToRegister(arm64.LDRD, tmp, tableInstanceTableLenOffset, tmp2)
  1178  
  1179  	// "cmp tmp2, offset"
  1180  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp2, offsetReg)
  1181  
  1182  	// If it exceeds len(table), we trap.
  1183  	c.compileMaybeExitFromNativeCode(arm64.BCONDLO, nativeCallStatusCodeInvalidTableAccess)
  1184  	// Otherwise, we proceed to do function type check.
  1185  
  1186  	// We need to obtain the absolute address of table element.
  1187  	// "tmp = &Tables[tableIndex].table[0]"
  1188  	c.assembler.CompileMemoryToRegister(
  1189  		arm64.LDRD,
  1190  		tmp, tableInstanceTableOffset,
  1191  		tmp,
  1192  	)
  1193  	// "offset = tmp + (offset << pointerSizeLog2) (== &table[offset])"
   1194  	// Here we left-shift by 3 in order to get the offset in bytes,
   1195  	// as the table element type is uintptr, which is 8 bytes.
  1196  	c.assembler.CompileLeftShiftedRegisterToRegister(
  1197  		arm64.ADD,
  1198  		offsetReg, pointerSizeLog2,
  1199  		tmp,
  1200  		offsetReg,
  1201  	)
  1202  
  1203  	// "offset = (*offset) (== table[offset])"
  1204  	c.assembler.CompileMemoryToRegister(arm64.LDRD, offsetReg, 0, offsetReg)
  1205  
  1206  	// Check if the value of table[offset] equals zero, meaning that the target element is uninitialized.
  1207  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, offsetReg)
  1208  
  1209  	// Skipped if the target is initialized.
  1210  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusCodeInvalidTableAccess)
  1211  
   1212  	// Next, we check that the type matches, i.e. table[offset].typeID == moduleInstance.TypeIDs[typeIndex].
  1213  	// "tmp = table[offset].typeID"
  1214  	c.assembler.CompileMemoryToRegister(
  1215  		arm64.LDRD,
  1216  		offsetReg, functionTypeIDOffset,
  1217  		tmp,
  1218  	)
  1219  	// "tmp2 = ModuleInstance.TypeIDs[typeIndex]"
  1220  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1221  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset,
  1222  		tmp2)
  1223  	c.assembler.CompileMemoryToRegister(arm64.LDRW, tmp2, int64(typeIndex)*4, tmp2)
  1224  
  1225  	// Compare these two values, and if they are equal, we are ready to make the function call.
  1226  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmp, tmp2)
  1227  	// Skipped if the type matches.
  1228  	c.compileMaybeExitFromNativeCode(arm64.BCONDEQ, nativeCallStatusCodeTypeMismatchOnIndirectCall)
  1229  
  1230  	targetFunctionType := &c.ir.Types[typeIndex]
  1231  	if err := c.compileCallImpl(offsetReg, targetFunctionType); err != nil {
  1232  		return err
  1233  	}
  1234  
  1235  	// The offset register should be marked as unused as we consumed it in the function call.
  1236  	c.markRegisterUnused(offsetReg, tmp, tmp2)
  1237  	return nil
  1238  }
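
        // For reference, the call_indirect sequence emitted above corresponds roughly to
        // the following Go-like pseudocode. This is only an illustrative sketch: the names
        // "tables", "typeIDs", "trap" and "call" are placeholders, not identifiers from
        // this package.
        //
        //    if offset >= len(tables[tableIndex].table) {
        //        trap(nativeCallStatusCodeInvalidTableAccess) // out-of-bounds access.
        //    }
        //    fn := tables[tableIndex].table[offset]
        //    if fn == 0 {
        //        trap(nativeCallStatusCodeInvalidTableAccess) // uninitialized element.
        //    }
        //    if fn.typeID != typeIDs[typeIndex] {
        //        trap(nativeCallStatusCodeTypeMismatchOnIndirectCall)
        //    }
        //    call(fn)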
  1239  
  1240  // compileDrop implements compiler.compileDrop for the arm64 architecture.
  1241  func (c *arm64Compiler) compileDrop(o *wazeroir.UnionOperation) error {
  1242  	return compileDropRange(c, o.U1)
  1243  }
  1244  
  1245  func (c *arm64Compiler) compileSelectV128Impl(selectorRegister asm.Register) error {
  1246  	x2 := c.locationStack.popV128()
  1247  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1248  		return err
  1249  	}
  1250  
  1251  	x1 := c.locationStack.popV128()
  1252  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1253  		return err
  1254  	}
  1255  
  1256  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, arm64.RegRZR, selectorRegister)
  1257  	brIfNotZero := c.assembler.CompileJump(arm64.BCONDNE)
  1258  
  1259  	// In this branch (the selector is zero), we select the value of x2, so we move it into
  1260  	// x1.register so that the result ends up in x1.register regardless of the selection.
  1261  	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR,
  1262  		x2.register, x2.register, x1.register, arm64.VectorArrangement16B)
  1263  
  1264  	c.assembler.SetJumpTargetOnNext(brIfNotZero)
  1265  
  1266  	// As noted, the result exists in x1.register regardless of the selector.
  1267  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1268  	// Plus, x2.register is no longer used.
  1269  	c.markRegisterUnused(x2.register)
  1270  	return nil
  1271  }
  1272  
  1273  // compileSelect implements compiler.compileSelect for the arm64 architecture.
  1274  func (c *arm64Compiler) compileSelect(o *wazeroir.UnionOperation) error {
  1275  	cv, err := c.popValueOnRegister()
  1276  	if err != nil {
  1277  		return err
  1278  	}
  1279  
  1280  	isTargetVector := o.B3
  1281  	if isTargetVector {
  1282  		return c.compileSelectV128Impl(cv.register)
  1283  	}
  1284  
  1285  	c.markRegisterUsed(cv.register)
  1286  
  1287  	x1, x2, err := c.popTwoValuesOnRegisters()
  1288  	if err != nil {
  1289  		return err
  1290  	}
  1291  
  1292  	if isZeroRegister(x1.register) && isZeroRegister(x2.register) {
  1293  		// If both values are zero, the result is always zero.
  1294  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1295  		c.markRegisterUnused(cv.register)
  1296  		return nil
  1297  	}
  1298  
  1299  	// In the following, we emit the code so that x1's register contains the chosen value
  1300  	// no matter which of original x1 or x2 is selected.
  1301  	//
  1302  	// If x1 is currently on the zero register, we cannot place the result there because
  1303  	// arm64.RegRZR always reads as zero and writes to it are discarded.
  1304  	// So we explicitly assign a general purpose register to x1 here.
  1305  	if isZeroRegister(x1.register) {
  1306  		// Mark x2's register as used so that it won't be chosen below (cv's register is already marked used).
  1307  		c.markRegisterUsed(x2.register)
  1308  		// Pick the non-zero register for x1.
  1309  		x1Reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1310  		if err != nil {
  1311  			return err
  1312  		}
  1313  		x1.setRegister(x1Reg)
  1314  		// And zero out the picked register.
  1315  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, x1Reg)
  1316  	}
  1317  
  1318  	// At this point, x1 is on a non-zero register, and x2 is on either a general purpose or the zero register.
  1319  
  1320  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, arm64.RegRZR, cv.register)
  1321  	brIfNotZero := c.assembler.CompileJump(arm64.BCONDNE)
  1322  
  1323  	// If cv == 0, we move the value of x2 to the x1.register.
  1324  
  1325  	switch x1.valueType {
  1326  	case runtimeValueTypeI32:
  1327  		// TODO: use 32-bit mov
  1328  		c.assembler.CompileRegisterToRegister(arm64.MOVD, x2.register, x1.register)
  1329  	case runtimeValueTypeI64:
  1330  		c.assembler.CompileRegisterToRegister(arm64.MOVD, x2.register, x1.register)
  1331  	case runtimeValueTypeF32:
  1332  		// TODO: use 32-bit mov
  1333  		c.assembler.CompileRegisterToRegister(arm64.FMOVD, x2.register, x1.register)
  1334  	case runtimeValueTypeF64:
  1335  		c.assembler.CompileRegisterToRegister(arm64.FMOVD, x2.register, x1.register)
  1336  	default:
  1337  		return errors.New("TODO: implement vector type select")
  1338  	}
  1339  
  1340  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1341  
  1342  	// Otherwise (cv != 0), x1 already holds the result, so there is nothing to do.
  1343  	c.assembler.SetJumpTargetOnNext(brIfNotZero)
  1344  
  1345  	// Only x1.register is reused.
  1346  	c.markRegisterUnused(cv.register, x2.register)
  1347  	return nil
  1348  }
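
        // For the scalar cases above, the emitted sequence is conceptually the following
        // (a sketch; register names are illustrative):
        //
        //    CMPW WZR, cv    ;; compare the selector with zero.
        //    B.NE done       ;; cv != 0: keep x1 as the result.
        //    MOV  x1, x2     ;; cv == 0: select x2 by moving it into x1's register.
        //  done:
        //                    ;; the result is in x1's register either way.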
  1349  
  1350  // compilePick implements compiler.compilePick for the arm64 architecture.
  1351  func (c *arm64Compiler) compilePick(o *wazeroir.UnionOperation) error {
  1352  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1353  		return err
  1354  	}
  1355  	depth := o.U1
  1356  	isTargetVector := o.B3
  1357  
  1358  	pickTarget := &c.locationStack.stack[c.locationStack.sp-1-uint64(depth)]
  1359  	pickedRegister, err := c.allocateRegister(pickTarget.getRegisterType())
  1360  	if err != nil {
  1361  		return err
  1362  	}
  1363  
  1364  	if pickTarget.onRegister() { // Copy the value to the pickedRegister.
  1365  		switch pickTarget.valueType {
  1366  		case runtimeValueTypeI32:
  1367  			c.assembler.CompileRegisterToRegister(arm64.MOVW, pickTarget.register, pickedRegister)
  1368  		case runtimeValueTypeI64:
  1369  			c.assembler.CompileRegisterToRegister(arm64.MOVD, pickTarget.register, pickedRegister)
  1370  		case runtimeValueTypeF32:
  1371  			c.assembler.CompileRegisterToRegister(arm64.FMOVS, pickTarget.register, pickedRegister)
  1372  		case runtimeValueTypeF64:
  1373  			c.assembler.CompileRegisterToRegister(arm64.FMOVD, pickTarget.register, pickedRegister)
  1374  		case runtimeValueTypeV128Lo:
  1375  			c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR,
  1376  				pickTarget.register, pickTarget.register, pickedRegister, arm64.VectorArrangement16B)
  1377  		case runtimeValueTypeV128Hi:
  1378  			panic("BUG") // since pick target must point to the lower 64-bits of vectors.
  1379  		}
  1380  	} else if pickTarget.onStack() {
  1381  		// Temporarily assign a register to the pick target, and then load the value.
  1382  		pickTarget.setRegister(pickedRegister)
  1383  		c.compileLoadValueOnStackToRegister(pickTarget)
  1384  
  1385  		// After the load, we revert the register assignment to the pick target.
  1386  		pickTarget.setRegister(asm.NilRegister)
  1387  		if isTargetVector {
  1388  			hi := &c.locationStack.stack[pickTarget.stackPointer+1]
  1389  			hi.setRegister(asm.NilRegister)
  1390  		}
  1391  	}
  1392  
  1393  	// Now we have the value of the target on the pickedRegister,
  1394  	// so push the location.
  1395  	c.pushRuntimeValueLocationOnRegister(pickedRegister, pickTarget.valueType)
  1396  	if isTargetVector {
  1397  		c.pushRuntimeValueLocationOnRegister(pickedRegister, runtimeValueTypeV128Hi)
  1398  	}
  1399  	return nil
  1400  }
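
        // As a concrete example of pick semantics: with a virtual stack [a, b, c]
        // (c on top) and depth=1, the pick target is b, so after compilePick the
        // stack becomes [a, b, c, b].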
  1401  
  1402  // compileAdd implements compiler.compileAdd for the arm64 architecture.
  1403  func (c *arm64Compiler) compileAdd(o *wazeroir.UnionOperation) error {
  1404  	x1, x2, err := c.popTwoValuesOnRegisters()
  1405  	if err != nil {
  1406  		return err
  1407  	}
  1408  
  1409  	// Addition can be a nop if one of the operands is zero.
  1410  	if isZeroRegister(x1.register) {
  1411  		c.pushRuntimeValueLocationOnRegister(x2.register, x1.valueType)
  1412  		return nil
  1413  	} else if isZeroRegister(x2.register) {
  1414  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1415  		return nil
  1416  	}
  1417  
  1418  	var inst asm.Instruction
  1419  	unsignedType := wazeroir.UnsignedType(o.B1)
  1420  	switch unsignedType {
  1421  	case wazeroir.UnsignedTypeI32:
  1422  		inst = arm64.ADDW
  1423  	case wazeroir.UnsignedTypeI64:
  1424  		inst = arm64.ADD
  1425  	case wazeroir.UnsignedTypeF32:
  1426  		inst = arm64.FADDS
  1427  	case wazeroir.UnsignedTypeF64:
  1428  		inst = arm64.FADDD
  1429  	}
  1430  
  1431  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1432  	// The result is placed in x1's register, so record it.
  1433  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1434  	return nil
  1435  }
  1436  
  1437  // compileSub implements compiler.compileSub for the arm64 architecture.
  1438  func (c *arm64Compiler) compileSub(o *wazeroir.UnionOperation) error {
  1439  	x1, x2, err := c.popTwoValuesOnRegisters()
  1440  	if err != nil {
  1441  		return err
  1442  	}
  1443  
  1444  	// If both registers are zero, this is a nop, so we just push the zero register.
  1445  	if isZeroRegister(x1.register) && isZeroRegister(x2.register) {
  1446  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1447  		return nil
  1448  	}
  1449  
  1450  	// At this point, at least one of the x1 or x2 registers is non-zero.
  1451  	// Choose the non-zero register as the destination.
  1452  	destinationReg := x1.register
  1453  	if isZeroRegister(x1.register) {
  1454  		destinationReg = x2.register
  1455  	}
  1456  
  1457  	var inst asm.Instruction
  1458  	var vt runtimeValueType
  1459  	unsignedType := wazeroir.UnsignedType(o.B1)
  1460  	switch unsignedType {
  1461  	case wazeroir.UnsignedTypeI32:
  1462  		inst = arm64.SUBW
  1463  		vt = runtimeValueTypeI32
  1464  	case wazeroir.UnsignedTypeI64:
  1465  		inst = arm64.SUB
  1466  		vt = runtimeValueTypeI64
  1467  	case wazeroir.UnsignedTypeF32:
  1468  		inst = arm64.FSUBS
  1469  		vt = runtimeValueTypeF32
  1470  	case wazeroir.UnsignedTypeF64:
  1471  		inst = arm64.FSUBD
  1472  		vt = runtimeValueTypeF64
  1473  	}
  1474  
  1475  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1476  	c.pushRuntimeValueLocationOnRegister(destinationReg, vt)
  1477  	return nil
  1478  }
  1479  
  1480  // compileMul implements compiler.compileMul for the arm64 architecture.
  1481  func (c *arm64Compiler) compileMul(o *wazeroir.UnionOperation) error {
  1482  	x1, x2, err := c.popTwoValuesOnRegisters()
  1483  	if err != nil {
  1484  		return err
  1485  	}
  1486  
  1487  	// Multiplication results in zero if either operand is zero, so we just push the zero register.
  1488  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1489  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1490  		return nil
  1491  	}
  1492  
  1493  	var inst asm.Instruction
  1494  	var vt runtimeValueType
  1495  	unsignedType := wazeroir.UnsignedType(o.B1)
  1496  	switch unsignedType {
  1497  	case wazeroir.UnsignedTypeI32:
  1498  		inst = arm64.MULW
  1499  		vt = runtimeValueTypeI32
  1500  	case wazeroir.UnsignedTypeI64:
  1501  		inst = arm64.MUL
  1502  		vt = runtimeValueTypeI64
  1503  	case wazeroir.UnsignedTypeF32:
  1504  		inst = arm64.FMULS
  1505  		vt = runtimeValueTypeF32
  1506  	case wazeroir.UnsignedTypeF64:
  1507  		inst = arm64.FMULD
  1508  		vt = runtimeValueTypeF64
  1509  	}
  1510  
  1511  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1512  	// The result is placed in x1's register, so record it.
  1513  	c.pushRuntimeValueLocationOnRegister(x1.register, vt)
  1514  	return nil
  1515  }
  1516  
  1517  // compileClz implements compiler.compileClz for the arm64 architecture.
  1518  func (c *arm64Compiler) compileClz(o *wazeroir.UnionOperation) error {
  1519  	v, err := c.popValueOnRegister()
  1520  	if err != nil {
  1521  		return err
  1522  	}
  1523  
  1524  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1525  	if isZeroRegister(v.register) {
  1526  		// If the target is the zero register, the result is always 32 (or 64 for the 64-bit case),
  1527  		// so we allocate a register and put the constant in it.
  1528  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1529  		if err != nil {
  1530  			return err
  1531  		}
  1532  		var vt runtimeValueType
  1533  		if unsignedInt == wazeroir.UnsignedInt32 {
  1534  			vt = runtimeValueTypeI32
  1535  			c.assembler.CompileConstToRegister(arm64.MOVW, 32, reg)
  1536  		} else {
  1537  			vt = runtimeValueTypeI64
  1538  			c.assembler.CompileConstToRegister(arm64.MOVD, 64, reg)
  1539  		}
  1540  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  1541  		return nil
  1542  	}
  1543  
  1544  	reg := v.register
  1545  	var vt runtimeValueType
  1546  	if unsignedInt == wazeroir.UnsignedInt32 {
  1547  		vt = runtimeValueTypeI32
  1548  		c.assembler.CompileRegisterToRegister(arm64.CLZW, reg, reg)
  1549  	} else {
  1550  		vt = runtimeValueTypeI64
  1551  		c.assembler.CompileRegisterToRegister(arm64.CLZ, reg, reg)
  1552  	}
  1553  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  1554  	return nil
  1555  }
  1556  
  1557  // compileCtz implements compiler.compileCtz for the arm64 architecture.
  1558  func (c *arm64Compiler) compileCtz(o *wazeroir.UnionOperation) error {
  1559  	v, err := c.popValueOnRegister()
  1560  	if err != nil {
  1561  		return err
  1562  	}
  1563  
  1564  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1565  	reg := v.register
  1566  	if isZeroRegister(reg) {
  1567  		// If the target is the zero register, the result is always 32 (or 64 for the 64-bit case),
  1568  		// so we allocate a register and put the constant in it.
  1569  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1570  		if err != nil {
  1571  			return err
  1572  		}
  1573  		var vt runtimeValueType
  1574  		if unsignedInt == wazeroir.UnsignedInt32 {
  1575  			vt = runtimeValueTypeI32
  1576  			c.assembler.CompileConstToRegister(arm64.MOVW, 32, reg)
  1577  		} else {
  1578  			vt = runtimeValueTypeI64
  1579  			c.assembler.CompileConstToRegister(arm64.MOVD, 64, reg)
  1580  		}
  1581  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  1582  		return nil
  1583  	}
  1584  
  1585  	// Since arm64 doesn't have an instruction directly counting trailing zeros,
  1586  	// we reverse the bits first, and then do CLZ, which is exactly how
  1587  	// GCC implements __builtin_ctz for arm64.
  1588  	var vt runtimeValueType
  1589  	if unsignedInt == wazeroir.UnsignedInt32 {
  1590  		vt = runtimeValueTypeI32
  1591  		c.assembler.CompileRegisterToRegister(arm64.RBITW, reg, reg)
  1592  		c.assembler.CompileRegisterToRegister(arm64.CLZW, reg, reg)
  1593  	} else {
  1594  		vt = runtimeValueTypeI64
  1595  		c.assembler.CompileRegisterToRegister(arm64.RBIT, reg, reg)
  1596  		c.assembler.CompileRegisterToRegister(arm64.CLZ, reg, reg)
  1597  	}
  1598  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  1599  	return nil
  1600  }
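
        // For instance, a 32-bit ctz on w0 is emitted as the pair (sketch):
        //
        //    RBITW w0, w0 ;; reverse the bit order.
        //    CLZW  w0, w0 ;; leading zeros of the reversed value == trailing zeros of the original.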
  1601  
  1602  // compilePopcnt implements compiler.compilePopcnt for the arm64 architecture.
  1603  func (c *arm64Compiler) compilePopcnt(o *wazeroir.UnionOperation) error {
  1604  	v, err := c.popValueOnRegister()
  1605  	if err != nil {
  1606  		return err
  1607  	}
  1608  
  1609  	reg := v.register
  1610  	if isZeroRegister(reg) {
  1611  		c.pushRuntimeValueLocationOnRegister(reg, v.valueType)
  1612  		return nil
  1613  	}
  1614  
  1615  	freg, err := c.allocateRegister(registerTypeVector)
  1616  	if err != nil {
  1617  		return err
  1618  	}
  1619  
  1620  	// arm64 doesn't have an instruction for population count on scalar registers,
  1621  	// so we use the vector one (VCNT).
  1622  	// This is exactly how the official Go compiler implements bits.OnesCount.
  1623  	// For example, "func() int { return bits.OnesCount(10) }" is compiled as
  1624  	//
  1625  	//    MOVD    $10, R0 ;; Load 10.
  1626  	//    FMOVD   R0, F0
  1627  	//    VCNT    V0.B8, V0.B8
  1628  	//    UADDLV  V0.B8, V0
  1629  	//
  1630  	var movInst asm.Instruction
  1631  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1632  	if unsignedInt == wazeroir.UnsignedInt32 {
  1633  		movInst = arm64.FMOVS
  1634  	} else {
  1635  		movInst = arm64.FMOVD
  1636  	}
  1637  	c.assembler.CompileRegisterToRegister(movInst, reg, freg)
  1638  	c.assembler.CompileVectorRegisterToVectorRegister(arm64.VCNT, freg, freg,
  1639  		arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone)
  1640  	c.assembler.CompileVectorRegisterToVectorRegister(arm64.UADDLV, freg, freg, arm64.VectorArrangement8B,
  1641  		arm64.VectorIndexNone, arm64.VectorIndexNone)
  1642  
  1643  	c.assembler.CompileRegisterToRegister(movInst, freg, reg)
  1644  
  1645  	c.pushRuntimeValueLocationOnRegister(reg, v.valueType)
  1646  	return nil
  1647  }
  1648  
  1649  // compileDiv implements compiler.compileDiv for the arm64 architecture.
  1650  func (c *arm64Compiler) compileDiv(o *wazeroir.UnionOperation) error {
  1651  	dividend, divisor, err := c.popTwoValuesOnRegisters()
  1652  	if err != nil {
  1653  		return err
  1654  	}
  1655  
  1656  	signedType := wazeroir.SignedType(o.B1)
  1657  
  1658  	// If the divisor is on the zero register, exit from the function deterministically.
  1659  	if isZeroRegister(divisor.register) {
  1660  		// Push any value so that the subsequent instruction can have a consistent location stack state.
  1661  		v := c.locationStack.pushRuntimeValueLocationOnStack()
  1662  		switch signedType {
  1663  		case wazeroir.SignedTypeInt32, wazeroir.SignedTypeUint32:
  1664  			v.valueType = runtimeValueTypeI32
  1665  		case wazeroir.SignedTypeUint64, wazeroir.SignedTypeInt64:
  1666  			v.valueType = runtimeValueTypeI64
  1667  		}
  1668  		c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero)
  1669  		return nil
  1670  	}
  1671  
  1672  	var inst asm.Instruction
  1673  	var vt runtimeValueType
  1674  	switch signedType {
  1675  	case wazeroir.SignedTypeUint32:
  1676  		inst = arm64.UDIVW
  1677  		if err := c.compileIntegerDivPrecheck(true, false, dividend.register, divisor.register); err != nil {
  1678  			return err
  1679  		}
  1680  		vt = runtimeValueTypeI32
  1681  	case wazeroir.SignedTypeUint64:
  1682  		if err := c.compileIntegerDivPrecheck(false, false, dividend.register, divisor.register); err != nil {
  1683  			return err
  1684  		}
  1685  		inst = arm64.UDIV
  1686  		vt = runtimeValueTypeI64
  1687  	case wazeroir.SignedTypeInt32:
  1688  		if err := c.compileIntegerDivPrecheck(true, true, dividend.register, divisor.register); err != nil {
  1689  			return err
  1690  		}
  1691  		inst = arm64.SDIVW
  1692  		vt = runtimeValueTypeI32
  1693  	case wazeroir.SignedTypeInt64:
  1694  		if err := c.compileIntegerDivPrecheck(false, true, dividend.register, divisor.register); err != nil {
  1695  			return err
  1696  		}
  1697  		inst = arm64.SDIV
  1698  		vt = runtimeValueTypeI64
  1699  	case wazeroir.SignedTypeFloat32:
  1700  		inst = arm64.FDIVS
  1701  		vt = runtimeValueTypeF32
  1702  	case wazeroir.SignedTypeFloat64:
  1703  		inst = arm64.FDIVD
  1704  		vt = runtimeValueTypeF64
  1705  	}
  1706  
  1707  	c.assembler.CompileRegisterToRegister(inst, divisor.register, dividend.register)
  1708  
  1709  	c.pushRuntimeValueLocationOnRegister(dividend.register, vt)
  1710  	return nil
  1711  }
  1712  
  1713  // compileIntegerDivPrecheck adds instructions to check that the divisor and dividend are sound for the division operation.
  1714  // First, this adds instructions to check if the divisor equals zero, and if so, exits the function.
  1715  // Additionally, for signed divisions, it checks whether the operation would overflow.
  1716  func (c *arm64Compiler) compileIntegerDivPrecheck(is32Bit, isSigned bool, dividend, divisor asm.Register) error {
  1717  	// We check whether the divisor equals zero.
  1718  	var cmpInst, movInst, loadInst asm.Instruction
  1719  	var minValueOffsetInVM int64
  1720  	if is32Bit {
  1721  		cmpInst = arm64.CMPW
  1722  		movInst = arm64.MOVW
  1723  		loadInst = arm64.LDRW
  1724  		minValueOffsetInVM = arm64CallEngineArchContextMinimum32BitSignedIntOffset
  1725  	} else {
  1726  		cmpInst = arm64.CMP
  1727  		movInst = arm64.MOVD
  1728  		loadInst = arm64.LDRD
  1729  		minValueOffsetInVM = arm64CallEngineArchContextMinimum64BitSignedIntOffset
  1730  	}
  1731  	c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisor)
  1732  
  1733  	// If it is zero, we exit with nativeCallStatusIntegerDivisionByZero.
  1734  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusIntegerDivisionByZero)
  1735  	// Otherwise, we proceed.
  1736  
  1737  	// If the operation is a signed integer div, we have to do an additional check on overflow.
  1738  	if isSigned {
  1739  		// For signed division, we have to have branches for the "math.MinInt{32,64} / -1"
  1740  		// case, which results in overflow.
  1741  
  1742  		// First, we compare the divisor with -1.
  1743  		c.assembler.CompileConstToRegister(movInst, -1, arm64ReservedRegisterForTemporary)
  1744  		c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, divisor)
  1745  
  1746  		// If they are not equal, we skip the following check.
  1747  		brIfDivisorNonMinusOne := c.assembler.CompileJump(arm64.BCONDNE)
  1748  
  1749  		// Otherwise, we further check if the dividend equals math.MinInt32 or MinInt64.
  1750  		c.assembler.CompileMemoryToRegister(
  1751  			loadInst,
  1752  			arm64ReservedRegisterForCallEngine, minValueOffsetInVM,
  1753  			arm64ReservedRegisterForTemporary,
  1754  		)
  1755  		c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, dividend)
  1756  
  1757  		// If they are not equal, we are safe to execute the division.
  1758  		// Otherwise, we raise an overflow error.
  1759  		c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusIntegerOverflow)
  1760  
  1761  		c.assembler.SetJumpTargetOnNext(brIfDivisorNonMinusOne)
  1762  	}
  1763  	return nil
  1764  }
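
        // Putting it together, the precheck for the signed 32-bit case is conceptually
        // the following (a sketch; labels are illustrative and "exit" stands for the
        // status-setting exit sequence):
        //
        //    CMPW WZR, divisor
        //    B.NE nonzero
        //    exit(nativeCallStatusIntegerDivisionByZero)
        //  nonzero:
        //    MOVW $-1, tmp
        //    CMPW tmp, divisor
        //    B.NE safe                           ;; divisor != -1: overflow is impossible.
        //    LDRW ce.minimum32BitSignedInt, tmp
        //    CMPW tmp, dividend
        //    B.NE safe                           ;; dividend != math.MinInt32: no overflow.
        //    exit(nativeCallStatusIntegerOverflow)
        //  safe: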
  1765  
  1766  // compileRem implements compiler.compileRem for the arm64 architecture.
  1767  func (c *arm64Compiler) compileRem(o *wazeroir.UnionOperation) error {
  1768  	dividend, divisor, err := c.popTwoValuesOnRegisters()
  1769  	if err != nil {
  1770  		return err
  1771  	}
  1772  
  1773  	dividendReg := dividend.register
  1774  	divisorReg := divisor.register
  1775  
  1776  	// If the divisor is on the zero register, exit from the function deterministically.
  1777  	if isZeroRegister(divisor.register) {
  1778  		// Push any value so that the subsequent instruction can have a consistent location stack state.
  1779  		v := c.locationStack.pushRuntimeValueLocationOnStack()
  1780  		v.valueType = runtimeValueTypeI32
  1781  		c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero)
  1782  		return nil
  1783  	}
  1784  
  1785  	var divInst, msubInst, cmpInst asm.Instruction
  1786  	signedInt := wazeroir.SignedInt(o.B1)
  1787  	switch signedInt {
  1788  	case wazeroir.SignedUint32:
  1789  		divInst = arm64.UDIVW
  1790  		msubInst = arm64.MSUBW
  1791  		cmpInst = arm64.CMPW
  1792  	case wazeroir.SignedUint64:
  1793  		divInst = arm64.UDIV
  1794  		msubInst = arm64.MSUB
  1795  		cmpInst = arm64.CMP
  1796  	case wazeroir.SignedInt32:
  1797  		divInst = arm64.SDIVW
  1798  		msubInst = arm64.MSUBW
  1799  		cmpInst = arm64.CMPW
  1800  	case wazeroir.SignedInt64:
  1801  		divInst = arm64.SDIV
  1802  		msubInst = arm64.MSUB
  1803  		cmpInst = arm64.CMP
  1804  	}
  1805  
  1806  	// We check whether the divisor equals zero.
  1807  	c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisorReg)
  1808  
  1809  	// If it is zero, we exit with nativeCallStatusIntegerDivisionByZero.
  1810  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusIntegerDivisionByZero)
  1811  	// Otherwise, we proceed.
  1812  
  1813  	// Temporarily mark them as used so that allocating the result register doesn't clobber these values.
  1814  	c.markRegisterUsed(dividend.register, divisor.register)
  1815  
  1816  	resultReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1817  	if err != nil {
  1818  		return err
  1819  	}
  1820  
  1821  	// arm64 doesn't have an instruction for rem, so we calculate it with two instructions: UDIV (SDIV for signed) and MSUB.
  1822  	// This is exactly the same code that Clang emits.
  1823  	// [input: x0=dividend, x1=divisor]
  1824  	// >> UDIV x2, x0, x1
  1825  	// >> MSUB x3, x2, x1, x0
  1826  	// [result: x2=quotient, x3=remainder]
  1827  	//
  1828  	c.assembler.CompileTwoRegistersToRegister(divInst, divisorReg, dividendReg, resultReg)
  1829  	// ResultReg = dividendReg - (divisorReg * resultReg)
  1830  	c.assembler.CompileThreeRegistersToRegister(msubInst, divisorReg, dividendReg, resultReg, resultReg)
  1831  
  1832  	c.markRegisterUnused(dividend.register, divisor.register)
  1833  	c.pushRuntimeValueLocationOnRegister(resultReg, dividend.valueType)
  1834  	return nil
  1835  }
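
        // In Go terms, the two instructions above compute
        //
        //    quotient := dividend / divisor           // UDIV (or SDIV)
        //    remainder := dividend - quotient*divisor // MSUB
        //
        // leaving the remainder in resultReg.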
  1836  
  1837  // compileAnd implements compiler.compileAnd for the arm64 architecture.
  1838  func (c *arm64Compiler) compileAnd(o *wazeroir.UnionOperation) error {
  1839  	x1, x2, err := c.popTwoValuesOnRegisters()
  1840  	if err != nil {
  1841  		return err
  1842  	}
  1843  
  1844  	// If either of the registers x1 or x2 is zero,
  1845  	// the result will always be zero.
  1846  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1847  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1848  		return nil
  1849  	}
  1850  
  1851  	// At this point, at least one of the x1 or x2 registers is non-zero.
  1852  	// Choose the non-zero register as the destination.
  1853  	destinationReg := x1.register
  1854  	if isZeroRegister(x1.register) {
  1855  		destinationReg = x2.register
  1856  	}
  1857  
  1858  	var inst asm.Instruction
  1859  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1860  	switch unsignedInt {
  1861  	case wazeroir.UnsignedInt32:
  1862  		inst = arm64.ANDW
  1863  	case wazeroir.UnsignedInt64:
  1864  		inst = arm64.AND
  1865  	}
  1866  
  1867  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1868  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1869  	return nil
  1870  }
  1871  
  1872  // compileOr implements compiler.compileOr for the arm64 architecture.
  1873  func (c *arm64Compiler) compileOr(o *wazeroir.UnionOperation) error {
  1874  	x1, x2, err := c.popTwoValuesOnRegisters()
  1875  	if err != nil {
  1876  		return err
  1877  	}
  1878  
  1879  	if isZeroRegister(x1.register) {
  1880  		c.pushRuntimeValueLocationOnRegister(x2.register, x2.valueType)
  1881  		return nil
  1882  	}
  1883  	if isZeroRegister(x2.register) {
  1884  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1885  		return nil
  1886  	}
  1887  
  1888  	var inst asm.Instruction
  1889  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1890  	switch unsignedInt {
  1891  	case wazeroir.UnsignedInt32:
  1892  		inst = arm64.ORRW
  1893  	case wazeroir.UnsignedInt64:
  1894  		inst = arm64.ORR
  1895  	}
  1896  
  1897  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1898  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1899  	return nil
  1900  }
  1901  
  1902  // compileXor implements compiler.compileXor for the arm64 architecture.
  1903  func (c *arm64Compiler) compileXor(o *wazeroir.UnionOperation) error {
  1904  	x1, x2, err := c.popTwoValuesOnRegisters()
  1905  	if err != nil {
  1906  		return err
  1907  	}
  1908  
  1909  	// Choose a non-zero register as the destination if possible; even if both x1 and x2
  1910  	// are on the zero register, pushing the zero register is correct since 0 ^ 0 == 0.
  1911  	destinationReg := x1.register
  1912  	if isZeroRegister(x1.register) {
  1913  		destinationReg = x2.register
  1914  	}
  1915  
  1916  	var inst asm.Instruction
  1917  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1918  	switch unsignedInt {
  1919  	case wazeroir.UnsignedInt32:
  1920  		inst = arm64.EORW
  1921  	case wazeroir.UnsignedInt64:
  1922  		inst = arm64.EOR
  1923  	}
  1924  
  1925  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1926  	c.pushRuntimeValueLocationOnRegister(destinationReg, x1.valueType)
  1927  	return nil
  1928  }
  1929  
  1930  // compileShl implements compiler.compileShl for the arm64 architecture.
  1931  func (c *arm64Compiler) compileShl(o *wazeroir.UnionOperation) error {
  1932  	x1, x2, err := c.popTwoValuesOnRegisters()
  1933  	if err != nil {
  1934  		return err
  1935  	}
  1936  
  1937  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1938  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1939  		return nil
  1940  	}
  1941  
  1942  	var inst asm.Instruction
  1943  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1944  	switch unsignedInt {
  1945  	case wazeroir.UnsignedInt32:
  1946  		inst = arm64.LSLW
  1947  	case wazeroir.UnsignedInt64:
  1948  		inst = arm64.LSL
  1949  	}
  1950  
  1951  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1952  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1953  	return nil
  1954  }
  1955  
  1956  // compileShr implements compiler.compileShr for the arm64 architecture.
  1957  func (c *arm64Compiler) compileShr(o *wazeroir.UnionOperation) error {
  1958  	x1, x2, err := c.popTwoValuesOnRegisters()
  1959  	if err != nil {
  1960  		return err
  1961  	}
  1962  
  1963  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1964  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1965  		return nil
  1966  	}
  1967  
  1968  	var inst asm.Instruction
  1969  	signedInt := wazeroir.SignedInt(o.B1)
  1970  	switch signedInt {
  1971  	case wazeroir.SignedInt32:
  1972  		inst = arm64.ASRW
  1973  	case wazeroir.SignedInt64:
  1974  		inst = arm64.ASR
  1975  	case wazeroir.SignedUint32:
  1976  		inst = arm64.LSRW
  1977  	case wazeroir.SignedUint64:
  1978  		inst = arm64.LSR
  1979  	}
  1980  
  1981  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1982  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1983  	return nil
  1984  }
  1985  
  1986  // compileRotl implements compiler.compileRotl for the arm64 architecture.
  1987  func (c *arm64Compiler) compileRotl(o *wazeroir.UnionOperation) error {
  1988  	x1, x2, err := c.popTwoValuesOnRegisters()
  1989  	if err != nil {
  1990  		return err
  1991  	}
  1992  
  1993  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1994  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1995  		return nil
  1996  	}
  1997  
  1998  	var inst, neginst asm.Instruction
  1999  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  2000  	switch unsignedInt {
  2001  	case wazeroir.UnsignedInt32:
  2002  		inst = arm64.RORW
  2003  		neginst = arm64.NEGW
  2004  	case wazeroir.UnsignedInt64:
  2005  		inst = arm64.ROR
  2006  		neginst = arm64.NEG
  2007  	}
  2008  
  2009  	// arm64 doesn't have a rotate-left instruction.
  2010  	// The shift amount needs to be negated, similar to the assembly output of bits.RotateLeft.
  2011  	c.assembler.CompileRegisterToRegister(neginst, x2.register, x2.register)
  2012  
  2013  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  2014  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2015  	return nil
  2016  }
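
        // In other words, rotl(x, n) is computed as rotr(x, -n). For 32-bit values the
        // emitted pair is conceptually (sketch):
        //
        //    NEGW w1, w1      ;; negate the shift amount.
        //    RORW w0, w0, w1  ;; rotate right by the negated amount.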
  2017  
  2018  // compileRotr implements compiler.compileRotr for the arm64 architecture.
  2019  func (c *arm64Compiler) compileRotr(o *wazeroir.UnionOperation) error {
  2020  	x1, x2, err := c.popTwoValuesOnRegisters()
  2021  	if err != nil {
  2022  		return err
  2023  	}
  2024  
  2025  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  2026  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2027  		return nil
  2028  	}
  2029  
  2030  	var inst asm.Instruction
  2031  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  2032  	switch unsignedInt {
  2033  	case wazeroir.UnsignedInt32:
  2034  		inst = arm64.RORW
  2035  	case wazeroir.UnsignedInt64:
  2036  		inst = arm64.ROR
  2037  	}
  2038  
  2039  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  2040  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2041  	return nil
  2042  }
  2043  
  2044  // compileAbs implements compiler.compileAbs for the arm64 architecture.
  2045  func (c *arm64Compiler) compileAbs(o *wazeroir.UnionOperation) error {
  2046  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2047  		return c.compileSimpleUnop(arm64.FABSS, runtimeValueTypeF32)
  2048  	} else {
  2049  		return c.compileSimpleUnop(arm64.FABSD, runtimeValueTypeF64)
  2050  	}
  2051  }
  2052  
  2053  // compileNeg implements compiler.compileNeg for the arm64 architecture.
  2054  func (c *arm64Compiler) compileNeg(o *wazeroir.UnionOperation) error {
  2055  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2056  		return c.compileSimpleUnop(arm64.FNEGS, runtimeValueTypeF32)
  2057  	} else {
  2058  		return c.compileSimpleUnop(arm64.FNEGD, runtimeValueTypeF64)
  2059  	}
  2060  }
  2061  
  2062  // compileCeil implements compiler.compileCeil for the arm64 architecture.
  2063  func (c *arm64Compiler) compileCeil(o *wazeroir.UnionOperation) error {
  2064  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2065  		return c.compileSimpleUnop(arm64.FRINTPS, runtimeValueTypeF32)
  2066  	} else {
  2067  		return c.compileSimpleUnop(arm64.FRINTPD, runtimeValueTypeF64)
  2068  	}
  2069  }
  2070  
  2071  // compileFloor implements compiler.compileFloor for the arm64 architecture.
  2072  func (c *arm64Compiler) compileFloor(o *wazeroir.UnionOperation) error {
  2073  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2074  		return c.compileSimpleUnop(arm64.FRINTMS, runtimeValueTypeF32)
  2075  	} else {
  2076  		return c.compileSimpleUnop(arm64.FRINTMD, runtimeValueTypeF64)
  2077  	}
  2078  }
  2079  
  2080  // compileTrunc implements compiler.compileTrunc for the arm64 architecture.
  2081  func (c *arm64Compiler) compileTrunc(o *wazeroir.UnionOperation) error {
  2082  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2083  		return c.compileSimpleUnop(arm64.FRINTZS, runtimeValueTypeF32)
  2084  	} else {
  2085  		return c.compileSimpleUnop(arm64.FRINTZD, runtimeValueTypeF64)
  2086  	}
  2087  }
  2088  
  2089  // compileNearest implements compiler.compileNearest for the arm64 architecture.
  2090  func (c *arm64Compiler) compileNearest(o *wazeroir.UnionOperation) error {
  2091  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2092  		return c.compileSimpleUnop(arm64.FRINTNS, runtimeValueTypeF32)
  2093  	} else {
  2094  		return c.compileSimpleUnop(arm64.FRINTND, runtimeValueTypeF64)
  2095  	}
  2096  }
  2097  
  2098  // compileSqrt implements compiler.compileSqrt for the arm64 architecture.
  2099  func (c *arm64Compiler) compileSqrt(o *wazeroir.UnionOperation) error {
  2100  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2101  		return c.compileSimpleUnop(arm64.FSQRTS, runtimeValueTypeF32)
  2102  	} else {
  2103  		return c.compileSimpleUnop(arm64.FSQRTD, runtimeValueTypeF64)
  2104  	}
  2105  }
  2106  
  2107  // compileMin implements compiler.compileMin for the arm64 architecture.
  2108  func (c *arm64Compiler) compileMin(o *wazeroir.UnionOperation) error {
  2109  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2110  		return c.compileSimpleFloatBinop(arm64.FMINS)
  2111  	} else {
  2112  		return c.compileSimpleFloatBinop(arm64.FMIND)
  2113  	}
  2114  }
  2115  
  2116  // compileMax implements compiler.compileMax for the arm64 architecture.
  2117  func (c *arm64Compiler) compileMax(o *wazeroir.UnionOperation) error {
  2118  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2119  		return c.compileSimpleFloatBinop(arm64.FMAXS)
  2120  	} else {
  2121  		return c.compileSimpleFloatBinop(arm64.FMAXD)
  2122  	}
  2123  }
  2124  
  2125  func (c *arm64Compiler) compileSimpleFloatBinop(inst asm.Instruction) error {
  2126  	x1, x2, err := c.popTwoValuesOnRegisters()
  2127  	if err != nil {
  2128  		return err
  2129  	}
  2130  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  2131  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2132  	return nil
  2133  }
  2134  
  2135  // compileCopysign implements compiler.compileCopysign for the arm64 architecture.
  2136  func (c *arm64Compiler) compileCopysign(o *wazeroir.UnionOperation) error {
  2137  	x1, x2, err := c.popTwoValuesOnRegisters()
  2138  	if err != nil {
  2139  		return err
  2140  	}
  2141  
  2142  	var ldr asm.Instruction
  2143  	var minValueOffsetInVM int64
  2144  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2145  		ldr = arm64.FLDRS
  2146  		minValueOffsetInVM = arm64CallEngineArchContextMinimum32BitSignedIntOffset
  2147  	} else {
  2148  		ldr = arm64.FLDRD
  2149  		minValueOffsetInVM = arm64CallEngineArchContextMinimum64BitSignedIntOffset
  2150  	}
  2151  
  2152  	c.markRegisterUsed(x1.register, x2.register)
  2153  	freg, err := c.allocateRegister(registerTypeVector)
  2154  	if err != nil {
  2155  		return err
  2156  	}
  2157  
  2158  	// This is exactly the same code emitted by GCC for "__builtin_copysign":
  2159  	//
  2160  	//    mov     x0, -9223372036854775808
  2161  	//    fmov    d2, x0
  2162  	//    vbit    v0.8b, v1.8b, v2.8b
  2163  	//
  2164  	// "mov freg, -9223372036854775808 (stored at ce.minimum64BitSignedInt)"
  2165  	c.assembler.CompileMemoryToRegister(
  2166  		ldr,
  2167  		arm64ReservedRegisterForCallEngine, minValueOffsetInVM,
  2168  		freg,
  2169  	)
  2170  
  2171  	// VBIT inserts each bit from the first operand into the destination if the corresponding bit of the second operand is 1,
  2172  	// otherwise it leaves the destination bit unchanged.
  2173  	// See https://developer.arm.com/documentation/dui0801/g/Advanced-SIMD-Instructions--32-bit-/VBIT
  2174  	//
  2175  	// "vbit vreg.8b, x2vreg.8b, x1vreg.8b" == "inserting the 64th (sign) bit of x2 into x1".
  2176  	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VBIT,
  2177  		freg, x2.register, x1.register, arm64.VectorArrangement16B)
  2178  
  2179  	c.markRegisterUnused(x2.register)
  2180  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2181  	return nil
  2182  }
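
        // In bitwise terms, with mask being the sign-bit constant loaded into freg above
        // (the bit pattern of math.MinInt32 for float32, math.MinInt64 for float64), the
        // VBIT computes
        //
        //    x1 = (x2 & mask) | (x1 &^ mask)
        //
        // i.e. the sign bit comes from x2 and all other bits come from x1, which is
        // exactly copysign.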
  2183  
  2184  // compileI32WrapFromI64 implements compiler.compileI32WrapFromI64 for the arm64 architecture.
  2185  func (c *arm64Compiler) compileI32WrapFromI64() error {
  2186  	return c.compileSimpleUnop(arm64.MOVW, runtimeValueTypeI32)
  2187  }
  2188  
  2189  // compileITruncFromF implements compiler.compileITruncFromF for the arm64 architecture.
  2190  func (c *arm64Compiler) compileITruncFromF(o *wazeroir.UnionOperation) error {
  2191  	// Clear the floating point status register (FPSR).
  2192  	c.assembler.CompileRegisterToRegister(arm64.MSR, arm64.RegRZR, arm64.RegFPSR)
  2193  
  2194  	var vt runtimeValueType
  2195  	var convinst asm.Instruction
  2196  	inputType := wazeroir.Float(o.B1)
  2197  	outputType := wazeroir.SignedInt(o.B2)
  2198  	nonTrapping := o.B3
  2199  
  2200  	is32bitFloat := inputType == wazeroir.Float32
  2201  	if is32bitFloat && outputType == wazeroir.SignedInt32 {
  2202  		convinst = arm64.FCVTZSSW
  2203  		vt = runtimeValueTypeI32
  2204  	} else if is32bitFloat && outputType == wazeroir.SignedInt64 {
  2205  		convinst = arm64.FCVTZSS
  2206  		vt = runtimeValueTypeI64
  2207  	} else if !is32bitFloat && outputType == wazeroir.SignedInt32 {
  2208  		convinst = arm64.FCVTZSDW
  2209  		vt = runtimeValueTypeI32
  2210  	} else if !is32bitFloat && outputType == wazeroir.SignedInt64 {
  2211  		convinst = arm64.FCVTZSD
  2212  		vt = runtimeValueTypeI64
  2213  	} else if is32bitFloat && outputType == wazeroir.SignedUint32 {
  2214  		convinst = arm64.FCVTZUSW
  2215  		vt = runtimeValueTypeI32
  2216  	} else if is32bitFloat && outputType == wazeroir.SignedUint64 {
  2217  		convinst = arm64.FCVTZUS
  2218  		vt = runtimeValueTypeI64
  2219  	} else if !is32bitFloat && outputType == wazeroir.SignedUint32 {
  2220  		convinst = arm64.FCVTZUDW
  2221  		vt = runtimeValueTypeI32
  2222  	} else if !is32bitFloat && outputType == wazeroir.SignedUint64 {
  2223  		convinst = arm64.FCVTZUD
  2224  		vt = runtimeValueTypeI64
  2225  	}
  2226  
  2227  	source, err := c.popValueOnRegister()
  2228  	if err != nil {
  2229  		return err
  2230  	}
  2231  	sourceReg := source.register
  2232  
  2233  	destinationReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2234  	if err != nil {
  2235  		return err
  2236  	}
  2237  
  2238  	c.assembler.CompileRegisterToRegister(convinst, sourceReg, destinationReg)
  2239  	c.pushRuntimeValueLocationOnRegister(destinationReg, vt)
  2240  
  2241  	if !nonTrapping {
  2242  		// Load the floating point status register value into a general purpose register,
  2243  		// so that we can check if the conversion resulted in undefined behavior.
  2244  		c.assembler.CompileRegisterToRegister(arm64.MRS, arm64.RegFPSR, arm64ReservedRegisterForTemporary)
  2245  		// Check if the conversion was undefined by comparing the status with 1.
  2246  		// See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register
  2247  		c.assembler.CompileRegisterAndConstToNone(arm64.CMP, arm64ReservedRegisterForTemporary, 1)
  2248  
  2249  		brOK := c.assembler.CompileJump(arm64.BCONDNE)
  2250  
  2251  		// If so, exit the execution with an error that depends on whether the source value is NaN.
  2252  		var floatcmp asm.Instruction
  2253  		if is32bitFloat {
  2254  			floatcmp = arm64.FCMPS
  2255  		} else {
  2256  			floatcmp = arm64.FCMPD
  2257  		}
  2258  		c.assembler.CompileTwoRegistersToNone(floatcmp, sourceReg, sourceReg)
  2259  		// The VS flag is set if at least one of the values for FCMP is NaN.
  2260  		// https://developer.arm.com/documentation/dui0801/g/Condition-Codes/Comparison-of-condition-code-meanings-in-integer-and-floating-point-code
  2261  		// If the source value is not NaN, the operation was an overflow.
  2262  		c.compileMaybeExitFromNativeCode(arm64.BCONDVS, nativeCallStatusIntegerOverflow)
  2263  
  2264  		// Otherwise, the operation was invalid as it tried to convert NaN to an integer.
  2265  		c.compileExitFromNativeCode(nativeCallStatusCodeInvalidFloatToIntConversion)
  2266  
  2267  		// Otherwise (no error), we fall through to the next instruction.
  2268  		c.assembler.SetJumpTargetOnNext(brOK)
  2269  	}
  2270  	return nil
  2271  }
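
        // The trapping path above is conceptually the following (a sketch; "FPSR.IOC",
        // "isNaN" and "exit" are illustrative names):
        //
        //    result = fcvtz(source)
        //    if FPSR.IOC { // the conversion was not exactly representable.
        //        if isNaN(source) {
        //            exit(nativeCallStatusCodeInvalidFloatToIntConversion)
        //        }
        //        exit(nativeCallStatusIntegerOverflow)
        //    }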
  2272  
  2273  // compileFConvertFromI implements compiler.compileFConvertFromI for the arm64 architecture.
  2274  func (c *arm64Compiler) compileFConvertFromI(o *wazeroir.UnionOperation) error {
  2275  	var convinst asm.Instruction
  2276  	inputType := wazeroir.SignedInt(o.B1)
  2277  	outputType := wazeroir.Float(o.B2)
  2278  
  2279  	if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt32 {
  2280  		convinst = arm64.SCVTFWS
  2281  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt64 {
  2282  		convinst = arm64.SCVTFS
  2283  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt32 {
  2284  		convinst = arm64.SCVTFWD
  2285  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt64 {
  2286  		convinst = arm64.SCVTFD
  2287  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint32 {
  2288  		convinst = arm64.UCVTFWS
  2289  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint64 {
  2290  		convinst = arm64.UCVTFS
  2291  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint32 {
  2292  		convinst = arm64.UCVTFWD
  2293  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint64 {
  2294  		convinst = arm64.UCVTFD
  2295  	}
  2296  
  2297  	var vt runtimeValueType
  2298  	if outputType == wazeroir.Float32 {
  2299  		vt = runtimeValueTypeF32
  2300  	} else {
  2301  		vt = runtimeValueTypeF64
  2302  	}
  2303  	return c.compileSimpleConversion(convinst, registerTypeVector, vt)
  2304  }
  2305  
  2306  // compileF32DemoteFromF64 implements compiler.compileF32DemoteFromF64 for the arm64 architecture.
  2307  func (c *arm64Compiler) compileF32DemoteFromF64() error {
  2308  	return c.compileSimpleUnop(arm64.FCVTDS, runtimeValueTypeF32)
  2309  }
  2310  
  2311  // compileF64PromoteFromF32 implements compiler.compileF64PromoteFromF32 for the arm64 architecture.
  2312  func (c *arm64Compiler) compileF64PromoteFromF32() error {
  2313  	return c.compileSimpleUnop(arm64.FCVTSD, runtimeValueTypeF64)
  2314  }
  2315  
  2316  // compileI32ReinterpretFromF32 implements compiler.compileI32ReinterpretFromF32 for the arm64 architecture.
  2317  func (c *arm64Compiler) compileI32ReinterpretFromF32() error {
  2318  	if peek := c.locationStack.peek(); peek.onStack() {
  2319  		// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
  2320  		peek.valueType = runtimeValueTypeI32
  2321  		return nil
  2322  	}
  2323  	return c.compileSimpleConversion(arm64.FMOVS, registerTypeGeneralPurpose, runtimeValueTypeI32)
  2324  }
  2325  
  2326  // compileI64ReinterpretFromF64 implements compiler.compileI64ReinterpretFromF64 for the arm64 architecture.
  2327  func (c *arm64Compiler) compileI64ReinterpretFromF64() error {
  2328  	if peek := c.locationStack.peek(); peek.onStack() {
  2329  		// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
  2330  		peek.valueType = runtimeValueTypeI64
  2331  		return nil
  2332  	}
  2333  	return c.compileSimpleConversion(arm64.FMOVD, registerTypeGeneralPurpose, runtimeValueTypeI64)
  2334  }
  2335  
  2336  // compileF32ReinterpretFromI32 implements compiler.compileF32ReinterpretFromI32 for the arm64 architecture.
  2337  func (c *arm64Compiler) compileF32ReinterpretFromI32() error {
  2338  	if peek := c.locationStack.peek(); peek.onStack() {
  2339  		// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
  2340  		peek.valueType = runtimeValueTypeF32
  2341  		return nil
  2342  	}
  2343  	return c.compileSimpleConversion(arm64.FMOVS, registerTypeVector, runtimeValueTypeF32)
  2344  }
  2345  
  2346  // compileF64ReinterpretFromI64 implements compiler.compileF64ReinterpretFromI64 for the arm64 architecture.
  2347  func (c *arm64Compiler) compileF64ReinterpretFromI64() error {
  2348  	if peek := c.locationStack.peek(); peek.onStack() {
  2349  		// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
  2350  		peek.valueType = runtimeValueTypeF64
  2351  		return nil
  2352  	}
  2353  	return c.compileSimpleConversion(arm64.FMOVD, registerTypeVector, runtimeValueTypeF64)
  2354  }
  2355  
  2356  func (c *arm64Compiler) compileSimpleConversion(inst asm.Instruction, destinationRegType registerType, resultRuntimeValueType runtimeValueType) error {
  2357  	source, err := c.popValueOnRegister()
  2358  	if err != nil {
  2359  		return err
  2360  	}
  2361  
  2362  	destinationReg, err := c.allocateRegister(destinationRegType)
  2363  	if err != nil {
  2364  		return err
  2365  	}
  2366  
  2367  	c.assembler.CompileRegisterToRegister(inst, source.register, destinationReg)
  2368  	c.pushRuntimeValueLocationOnRegister(destinationReg, resultRuntimeValueType)
  2369  	return nil
  2370  }
  2371  
  2372  // compileExtend implements compiler.compileExtend for the arm64 architecture.
  2373  func (c *arm64Compiler) compileExtend(o *wazeroir.UnionOperation) error {
  2374  	signed := o.B1 != 0
  2375  	if signed {
  2376  		return c.compileSimpleUnop(arm64.SXTW, runtimeValueTypeI64)
  2377  	} else {
  2378  		return c.compileSimpleUnop(arm64.MOVW, runtimeValueTypeI64)
  2379  	}
  2380  }
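
        // Note that a plain 32-bit move suffices for the unsigned case: on arm64, an
        // instruction writing a W (32-bit) register zeroes the upper 32 bits of the
        // corresponding X register, so MOVW performs the zero-extension implicitly.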
  2381  
  2382  // compileSignExtend32From8 implements compiler.compileSignExtend32From8 for the arm64 architecture.
  2383  func (c *arm64Compiler) compileSignExtend32From8() error {
  2384  	return c.compileSimpleUnop(arm64.SXTBW, runtimeValueTypeI32)
  2385  }
  2386  
  2387  // compileSignExtend32From16 implements compiler.compileSignExtend32From16 for the arm64 architecture.
  2388  func (c *arm64Compiler) compileSignExtend32From16() error {
  2389  	return c.compileSimpleUnop(arm64.SXTHW, runtimeValueTypeI32)
  2390  }
  2391  
  2392  // compileSignExtend64From8 implements compiler.compileSignExtend64From8 for the arm64 architecture.
  2393  func (c *arm64Compiler) compileSignExtend64From8() error {
  2394  	return c.compileSimpleUnop(arm64.SXTB, runtimeValueTypeI64)
  2395  }
  2396  
  2397  // compileSignExtend64From16 implements compiler.compileSignExtend64From16 for the arm64 architecture.
  2398  func (c *arm64Compiler) compileSignExtend64From16() error {
  2399  	return c.compileSimpleUnop(arm64.SXTH, runtimeValueTypeI64)
  2400  }
  2401  
  2402  // compileSignExtend64From32 implements compiler.compileSignExtend64From32 for the arm64 architecture.
  2403  func (c *arm64Compiler) compileSignExtend64From32() error {
  2404  	return c.compileSimpleUnop(arm64.SXTW, runtimeValueTypeI64)
  2405  }
  2406  
  2407  func (c *arm64Compiler) compileSimpleUnop(inst asm.Instruction, resultRuntimeValueType runtimeValueType) error {
  2408  	v, err := c.popValueOnRegister()
  2409  	if err != nil {
  2410  		return err
  2411  	}
  2412  	reg := v.register
  2413  	c.assembler.CompileRegisterToRegister(inst, reg, reg)
  2414  	c.pushRuntimeValueLocationOnRegister(reg, resultRuntimeValueType)
  2415  	return nil
  2416  }
  2417  
  2418  // compileEq implements compiler.compileEq for the arm64 architecture.
  2419  func (c *arm64Compiler) compileEq(o *wazeroir.UnionOperation) error {
  2420  	return c.emitEqOrNe(true, wazeroir.UnsignedType(o.B1))
  2421  }
  2422  
  2423  // compileNe implements compiler.compileNe for the arm64 architecture.
  2424  func (c *arm64Compiler) compileNe(o *wazeroir.UnionOperation) error {
  2425  	return c.emitEqOrNe(false, wazeroir.UnsignedType(o.B1))
  2426  }
  2427  
  2428  // emitEqOrNe implements compiler.compileEq and compiler.compileNe for the arm64 architecture.
  2429  func (c *arm64Compiler) emitEqOrNe(isEq bool, unsignedType wazeroir.UnsignedType) error {
  2430  	x1, x2, err := c.popTwoValuesOnRegisters()
  2431  	if err != nil {
  2432  		return err
  2433  	}
  2434  
  2435  	var inst asm.Instruction
  2436  	switch unsignedType {
  2437  	case wazeroir.UnsignedTypeI32:
  2438  		inst = arm64.CMPW
  2439  	case wazeroir.UnsignedTypeI64:
  2440  		inst = arm64.CMP
  2441  	case wazeroir.UnsignedTypeF32:
  2442  		inst = arm64.FCMPS
  2443  	case wazeroir.UnsignedTypeF64:
  2444  		inst = arm64.FCMPD
  2445  	}
  2446  
  2447  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2448  
  2449  	// Push the comparison result as a conditional register value.
  2450  	cond := arm64.CondNE
  2451  	if isEq {
  2452  		cond = arm64.CondEQ
  2453  	}
  2454  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(cond)
  2455  	return nil
  2456  }
  2457  
  2458  // compileEqz implements compiler.compileEqz for the arm64 architecture.
  2459  func (c *arm64Compiler) compileEqz(o *wazeroir.UnionOperation) error {
  2460  	x1, err := c.popValueOnRegister()
  2461  	if err != nil {
  2462  		return err
  2463  	}
  2464  
  2465  	var inst asm.Instruction
  2466  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  2467  	switch unsignedInt {
  2468  	case wazeroir.UnsignedInt32:
  2469  		inst = arm64.CMPW
  2470  	case wazeroir.UnsignedInt64:
  2471  		inst = arm64.CMP
  2472  	}
  2473  
  2474  	c.assembler.CompileTwoRegistersToNone(inst, arm64.RegRZR, x1.register)
  2475  
  2476  	// Push the comparison result as a conditional register value.
  2477  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(arm64.CondEQ)
  2478  	return nil
  2479  }
  2480  
  2481  // compileLt implements compiler.compileLt for the arm64 architecture.
  2482  func (c *arm64Compiler) compileLt(o *wazeroir.UnionOperation) error {
  2483  	x1, x2, err := c.popTwoValuesOnRegisters()
  2484  	if err != nil {
  2485  		return err
  2486  	}
  2487  
  2488  	var inst asm.Instruction
  2489  	var conditionalRegister asm.ConditionalRegisterState
  2490  	signedType := wazeroir.SignedType(o.B1)
  2491  	switch signedType {
  2492  	case wazeroir.SignedTypeUint32:
  2493  		inst = arm64.CMPW
  2494  		conditionalRegister = arm64.CondLO
  2495  	case wazeroir.SignedTypeUint64:
  2496  		inst = arm64.CMP
  2497  		conditionalRegister = arm64.CondLO
  2498  	case wazeroir.SignedTypeInt32:
  2499  		inst = arm64.CMPW
  2500  		conditionalRegister = arm64.CondLT
  2501  	case wazeroir.SignedTypeInt64:
  2502  		inst = arm64.CMP
  2503  		conditionalRegister = arm64.CondLT
  2504  	case wazeroir.SignedTypeFloat32:
  2505  		inst = arm64.FCMPS
  2506  		conditionalRegister = arm64.CondMI
  2507  	case wazeroir.SignedTypeFloat64:
  2508  		inst = arm64.FCMPD
  2509  		conditionalRegister = arm64.CondMI
  2510  	}
  2511  
  2512  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2513  
  2514  	// Push the comparison result as a conditional register value.
  2515  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2516  	return nil
  2517  }
  2518  
  2519  // compileGt implements compiler.compileGt for the arm64 architecture.
  2520  func (c *arm64Compiler) compileGt(o *wazeroir.UnionOperation) error {
  2521  	x1, x2, err := c.popTwoValuesOnRegisters()
  2522  	if err != nil {
  2523  		return err
  2524  	}
  2525  
  2526  	var inst asm.Instruction
  2527  	var conditionalRegister asm.ConditionalRegisterState
  2528  	signedType := wazeroir.SignedType(o.B1)
  2529  	switch signedType {
  2530  	case wazeroir.SignedTypeUint32:
  2531  		inst = arm64.CMPW
  2532  		conditionalRegister = arm64.CondHI
  2533  	case wazeroir.SignedTypeUint64:
  2534  		inst = arm64.CMP
  2535  		conditionalRegister = arm64.CondHI
  2536  	case wazeroir.SignedTypeInt32:
  2537  		inst = arm64.CMPW
  2538  		conditionalRegister = arm64.CondGT
  2539  	case wazeroir.SignedTypeInt64:
  2540  		inst = arm64.CMP
  2541  		conditionalRegister = arm64.CondGT
  2542  	case wazeroir.SignedTypeFloat32:
  2543  		inst = arm64.FCMPS
  2544  		conditionalRegister = arm64.CondGT
  2545  	case wazeroir.SignedTypeFloat64:
  2546  		inst = arm64.FCMPD
  2547  		conditionalRegister = arm64.CondGT
  2548  	}
  2549  
  2550  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2551  
  2552  	// Push the comparison result as a conditional register value.
  2553  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2554  	return nil
  2555  }
  2556  
  2557  // compileLe implements compiler.compileLe for the arm64 architecture.
  2558  func (c *arm64Compiler) compileLe(o *wazeroir.UnionOperation) error {
  2559  	x1, x2, err := c.popTwoValuesOnRegisters()
  2560  	if err != nil {
  2561  		return err
  2562  	}
  2563  
  2564  	var inst asm.Instruction
  2565  	var conditionalRegister asm.ConditionalRegisterState
  2566  	signedType := wazeroir.SignedType(o.B1)
  2567  	switch signedType {
  2568  	case wazeroir.SignedTypeUint32:
  2569  		inst = arm64.CMPW
  2570  		conditionalRegister = arm64.CondLS
  2571  	case wazeroir.SignedTypeUint64:
  2572  		inst = arm64.CMP
  2573  		conditionalRegister = arm64.CondLS
  2574  	case wazeroir.SignedTypeInt32:
  2575  		inst = arm64.CMPW
  2576  		conditionalRegister = arm64.CondLE
  2577  	case wazeroir.SignedTypeInt64:
  2578  		inst = arm64.CMP
  2579  		conditionalRegister = arm64.CondLE
  2580  	case wazeroir.SignedTypeFloat32:
  2581  		inst = arm64.FCMPS
  2582  		conditionalRegister = arm64.CondLS
  2583  	case wazeroir.SignedTypeFloat64:
  2584  		inst = arm64.FCMPD
  2585  		conditionalRegister = arm64.CondLS
  2586  	}
  2587  
  2588  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2589  
  2590  	// Push the comparison result as a conditional register value.
  2591  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2592  	return nil
  2593  }
  2594  
  2595  // compileGe implements compiler.compileGe for the arm64 architecture.
  2596  func (c *arm64Compiler) compileGe(o *wazeroir.UnionOperation) error {
  2597  	x1, x2, err := c.popTwoValuesOnRegisters()
  2598  	if err != nil {
  2599  		return err
  2600  	}
  2601  
  2602  	var inst asm.Instruction
  2603  	var conditionalRegister asm.ConditionalRegisterState
  2604  	signedType := wazeroir.SignedType(o.B1)
  2605  	switch signedType {
  2606  	case wazeroir.SignedTypeUint32:
  2607  		inst = arm64.CMPW
  2608  		conditionalRegister = arm64.CondHS
  2609  	case wazeroir.SignedTypeUint64:
  2610  		inst = arm64.CMP
  2611  		conditionalRegister = arm64.CondHS
  2612  	case wazeroir.SignedTypeInt32:
  2613  		inst = arm64.CMPW
  2614  		conditionalRegister = arm64.CondGE
  2615  	case wazeroir.SignedTypeInt64:
  2616  		inst = arm64.CMP
  2617  		conditionalRegister = arm64.CondGE
  2618  	case wazeroir.SignedTypeFloat32:
  2619  		inst = arm64.FCMPS
  2620  		conditionalRegister = arm64.CondGE
  2621  	case wazeroir.SignedTypeFloat64:
  2622  		inst = arm64.FCMPD
  2623  		conditionalRegister = arm64.CondGE
  2624  	}
  2625  
  2626  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2627  
  2628  	// Push the comparison result as a conditional register value.
  2629  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2630  	return nil
  2631  }
  2632  
  2633  // compileLoad implements compiler.compileLoad for the arm64 architecture.
  2634  func (c *arm64Compiler) compileLoad(o *wazeroir.UnionOperation) error {
  2635  	var (
  2636  		isFloat           bool
  2637  		loadInst          asm.Instruction
  2638  		targetSizeInBytes int64
  2639  		vt                runtimeValueType
  2640  	)
  2641  
  2642  	unsignedType := wazeroir.UnsignedType(o.B1)
  2643  	offset := uint32(o.U2)
  2644  
  2645  	switch unsignedType {
  2646  	case wazeroir.UnsignedTypeI32:
  2647  		loadInst = arm64.LDRW
  2648  		targetSizeInBytes = 32 / 8
  2649  		vt = runtimeValueTypeI32
  2650  	case wazeroir.UnsignedTypeI64:
  2651  		loadInst = arm64.LDRD
  2652  		targetSizeInBytes = 64 / 8
  2653  		vt = runtimeValueTypeI64
  2654  	case wazeroir.UnsignedTypeF32:
  2655  		loadInst = arm64.FLDRS
  2656  		isFloat = true
  2657  		targetSizeInBytes = 32 / 8
  2658  		vt = runtimeValueTypeF32
  2659  	case wazeroir.UnsignedTypeF64:
  2660  		loadInst = arm64.FLDRD
  2661  		isFloat = true
  2662  		targetSizeInBytes = 64 / 8
  2663  		vt = runtimeValueTypeF64
  2664  	}
  2665  	return c.compileLoadImpl(offset, loadInst, targetSizeInBytes, isFloat, vt)
  2666  }
  2667  
  2668  // compileLoad8 implements compiler.compileLoad8 for the arm64 architecture.
  2669  func (c *arm64Compiler) compileLoad8(o *wazeroir.UnionOperation) error {
  2670  	var loadInst asm.Instruction
  2671  	var vt runtimeValueType
  2672  
  2673  	signedInt := wazeroir.SignedInt(o.B1)
  2674  	offset := uint32(o.U2)
  2675  
  2676  	switch signedInt {
  2677  	case wazeroir.SignedInt32:
  2678  		loadInst = arm64.LDRSBW
  2679  		vt = runtimeValueTypeI32
  2680  	case wazeroir.SignedInt64:
  2681  		loadInst = arm64.LDRSBD
  2682  		vt = runtimeValueTypeI64
  2683  	case wazeroir.SignedUint32:
  2684  		loadInst = arm64.LDRB
  2685  		vt = runtimeValueTypeI32
  2686  	case wazeroir.SignedUint64:
  2687  		loadInst = arm64.LDRB
  2688  		vt = runtimeValueTypeI64
  2689  	}
  2690  	return c.compileLoadImpl(offset, loadInst, 1, false, vt)
  2691  }
  2692  
  2693  // compileLoad16 implements compiler.compileLoad16 for the arm64 architecture.
  2694  func (c *arm64Compiler) compileLoad16(o *wazeroir.UnionOperation) error {
  2695  	var loadInst asm.Instruction
  2696  	var vt runtimeValueType
  2697  
  2698  	signedInt := wazeroir.SignedInt(o.B1)
  2699  	offset := uint32(o.U2)
  2700  
  2701  	switch signedInt {
  2702  	case wazeroir.SignedInt32:
  2703  		loadInst = arm64.LDRSHW
  2704  		vt = runtimeValueTypeI32
  2705  	case wazeroir.SignedInt64:
  2706  		loadInst = arm64.LDRSHD
  2707  		vt = runtimeValueTypeI64
  2708  	case wazeroir.SignedUint32:
  2709  		loadInst = arm64.LDRH
  2710  		vt = runtimeValueTypeI32
  2711  	case wazeroir.SignedUint64:
  2712  		loadInst = arm64.LDRH
  2713  		vt = runtimeValueTypeI64
  2714  	}
  2715  	return c.compileLoadImpl(offset, loadInst, 16/8, false, vt)
  2716  }
  2717  
  2718  // compileLoad32 implements compiler.compileLoad32 for the arm64 architecture.
  2719  func (c *arm64Compiler) compileLoad32(o *wazeroir.UnionOperation) error {
  2720  	var loadInst asm.Instruction
  2721  	signed := o.B1 == 1
  2722  	offset := uint32(o.U2)
  2723  
  2724  	if signed {
  2725  		loadInst = arm64.LDRSW
  2726  	} else {
  2727  		loadInst = arm64.LDRW
  2728  	}
  2729  	return c.compileLoadImpl(offset, loadInst, 32/8, false, runtimeValueTypeI64)
  2730  }
  2731  
  2732  // compileLoadImpl implements the compileLoad* variants for the arm64 architecture.
  2733  func (c *arm64Compiler) compileLoadImpl(offsetArg uint32, loadInst asm.Instruction,
  2734  	targetSizeInBytes int64, isFloat bool, resultRuntimeValueType runtimeValueType,
  2735  ) error {
  2736  	offsetReg, err := c.compileMemoryAccessOffsetSetup(offsetArg, targetSizeInBytes)
  2737  	if err != nil {
  2738  		return err
  2739  	}
  2740  
  2741  	resultRegister := offsetReg
  2742  	if isFloat {
  2743  		resultRegister, err = c.allocateRegister(registerTypeVector)
  2744  		if err != nil {
  2745  			return err
  2746  		}
  2747  	}
  2748  
  2749  	// "resultRegister = [arm64ReservedRegisterForMemory + offsetReg]"
  2750  	// In other words, "resultRegister = memory.Buffer[offset: offset+targetSizeInBytes]"
  2751  	c.assembler.CompileMemoryWithRegisterOffsetToRegister(
  2752  		loadInst,
  2753  		arm64ReservedRegisterForMemory, offsetReg,
  2754  		resultRegister,
  2755  	)
  2756  
  2757  	c.pushRuntimeValueLocationOnRegister(resultRegister, resultRuntimeValueType)
  2758  	return nil
  2759  }
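
        // A minimal sketch (editorial) of the code shape this produces for "i32.load offset=8",
        // with illustrative register names (ce = call engine register, memBase = memory register):
        //
        //	ADD  offsetReg, base, #12      ; base + 8 (static offset) + 4 (access size)
        //	LDR  tmp, [ce, #memLen]        ; tmp = len(memory.Buffer)
        //	CMP  offsetReg, tmp
        //	B.HI trap                      ; out of bounds -> nativeCallStatusCodeMemoryOutOfBounds
        //	SUB  offsetReg, offsetReg, #4  ; rewind to base + 8
        //	LDR  W(result), [memBase, offsetReg]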
  2760  
  2761  // compileStore implements compiler.compileStore for the arm64 architecture.
  2762  func (c *arm64Compiler) compileStore(o *wazeroir.UnionOperation) error {
  2763  	var movInst asm.Instruction
  2764  	var targetSizeInBytes int64
  2765  	unsignedType := wazeroir.UnsignedType(o.B1)
  2766  	offset := uint32(o.U2)
  2767  	switch unsignedType {
  2768  	case wazeroir.UnsignedTypeI32:
  2769  		movInst = arm64.STRW
  2770  		targetSizeInBytes = 32 / 8
  2771  	case wazeroir.UnsignedTypeI64:
  2772  		movInst = arm64.STRD
  2773  		targetSizeInBytes = 64 / 8
  2774  	case wazeroir.UnsignedTypeF32:
  2775  		movInst = arm64.FSTRS
  2776  		targetSizeInBytes = 32 / 8
  2777  	case wazeroir.UnsignedTypeF64:
  2778  		movInst = arm64.FSTRD
  2779  		targetSizeInBytes = 64 / 8
  2780  	}
  2781  	return c.compileStoreImpl(offset, movInst, targetSizeInBytes)
  2782  }
  2783  
  2784  // compileStore8 implements compiler.compileStore8 for the arm64 architecture.
  2785  func (c *arm64Compiler) compileStore8(o *wazeroir.UnionOperation) error {
  2786  	return c.compileStoreImpl(uint32(o.U2), arm64.STRB, 1)
  2787  }
  2788  
  2789  // compileStore16 implements compiler.compileStore16 for the arm64 architecture.
  2790  func (c *arm64Compiler) compileStore16(o *wazeroir.UnionOperation) error {
  2791  	return c.compileStoreImpl(uint32(o.U2), arm64.STRH, 16/8)
  2792  }
  2793  
  2794  // compileStore32 implements compiler.compileStore32 for the arm64 architecture.
  2795  func (c *arm64Compiler) compileStore32(o *wazeroir.UnionOperation) error {
  2796  	return c.compileStoreImpl(uint32(o.U2), arm64.STRW, 32/8)
  2797  }
  2798  
  2799  // compileStoreImpl implements the compileStore* variants for the arm64 architecture.
  2800  func (c *arm64Compiler) compileStoreImpl(offsetArg uint32, storeInst asm.Instruction, targetSizeInBytes int64) error {
  2801  	val, err := c.popValueOnRegister()
  2802  	if err != nil {
  2803  		return err
  2804  	}
  2805  	// Mark the register as used temporarily, since compileMemoryAccessOffsetSetup below might allocate a register.
  2806  	c.markRegisterUsed(val.register)
  2807  
  2808  	offsetReg, err := c.compileMemoryAccessOffsetSetup(offsetArg, targetSizeInBytes)
  2809  	if err != nil {
  2810  		return err
  2811  	}
  2812  
  2813  	// "[arm64ReservedRegisterForMemory + offsetReg] = val.register"
  2814  	// In other words, "memory.Buffer[offset: offset+targetSizeInBytes] = val.register"
  2815  	c.assembler.CompileRegisterToMemoryWithRegisterOffset(
  2816  		storeInst, val.register,
  2817  		arm64ReservedRegisterForMemory, offsetReg,
  2818  	)
  2819  
  2820  	c.markRegisterUnused(val.register)
  2821  	return nil
  2822  }
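
        // The store path mirrors the load path above; a sketch with the same illustrative
        // register names would end in "STR W(val), [memBase, offsetReg]" instead of a load.
        // Marking val.register as used before the bounds check matters: the check may
        // allocate a register, and an unmarked val.register could be chosen and clobbered.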
  2823  
  2824  // compileMemoryAccessOffsetSetup pops the top value from the stack (called "base"), stores "base + offsetArg"
  2825  // into a register, and returns the stored register. We call the result "offset" because we access the memory
  2826  // as memory.Buffer[offset: offset+targetSizeInBytes].
  2827  //
  2828  // Note: this also emits the instructions to check for out-of-bounds memory access.
  2829  // In other words, if offset+targetSizeInBytes exceeds the memory size, the code exits with the nativeCallStatusCodeMemoryOutOfBounds status.
  2830  func (c *arm64Compiler) compileMemoryAccessOffsetSetup(offsetArg uint32, targetSizeInBytes int64) (offsetRegister asm.Register, err error) {
  2831  	base, err := c.popValueOnRegister()
  2832  	if err != nil {
  2833  		return 0, err
  2834  	}
  2835  
  2836  	offsetRegister = base.register
  2837  	if isZeroRegister(base.register) {
  2838  		offsetRegister, err = c.allocateRegister(registerTypeGeneralPurpose)
  2839  		if err != nil {
  2840  			return
  2841  		}
  2842  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, offsetRegister)
  2843  	}
  2844  
  2845  	if offsetConst := int64(offsetArg) + targetSizeInBytes; offsetConst <= math.MaxUint32 {
  2846  		// "offsetRegister = base + offsetArg + targetSizeInBytes"
  2847  		c.assembler.CompileConstToRegister(arm64.ADD, offsetConst, offsetRegister)
  2848  	} else {
  2849  		// If the offset const is too large, we exit with nativeCallStatusCodeMemoryOutOfBounds.
  2850  		c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
  2851  		return
  2852  	}
  2853  
  2854  	// "arm64ReservedRegisterForTemporary = len(memory.Buffer)"
  2855  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  2856  		arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  2857  		arm64ReservedRegisterForTemporary)
  2858  
  2859  	// Check if offsetRegister(= base+offsetArg+targetSizeInBytes) > len(memory.Buffer).
  2860  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, offsetRegister)
  2861  
  2862  	// If offsetRegister(= base+offsetArg+targetSizeInBytes) exceeds the memory length,
  2863  	//  we exit the function with nativeCallStatusCodeMemoryOutOfBounds.
  2864  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, nativeCallStatusCodeMemoryOutOfBounds)
  2865  
  2866  	// Otherwise, we subtract targetSizeInBytes from offsetRegister.
  2867  	c.assembler.CompileConstToRegister(arm64.SUB, targetSizeInBytes, offsetRegister)
  2868  	return offsetRegister, nil
  2869  }
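
        // Worked example (editorial): for "i64.load offset=16", targetSizeInBytes is 8, so
        // offsetRegister becomes base+24. With len(memory.Buffer) == 65536, any base <= 65512
        // passes the CMP/branch pair above, after which the SUB rewinds offsetRegister to
        // base+16, the first byte of the access.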
  2870  
  2871  // compileMemoryGrow implements compileMemoryGrow variants for arm64 architecture.
  2872  func (c *arm64Compiler) compileMemoryGrow() error {
  2873  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  2874  		return err
  2875  	}
  2876  
  2877  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexMemoryGrow); err != nil {
  2878  		return err
  2879  	}
  2880  
  2881  	// After return, we re-initialize the reserved registers just as the function preamble does.
  2882  	c.compileReservedStackBasePointerRegisterInitialization()
  2883  	c.compileReservedMemoryRegisterInitialization()
  2884  	return nil
  2885  }
  2886  
  2887  // compileMemorySize implements compileMemorySize variants for arm64 architecture.
  2888  func (c *arm64Compiler) compileMemorySize() error {
  2889  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  2890  		return err
  2891  	}
  2892  
  2893  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2894  	if err != nil {
  2895  		return err
  2896  	}
  2897  
  2898  	// "reg = len(memory.Buffer)"
  2899  	c.assembler.CompileMemoryToRegister(
  2900  		arm64.LDRD,
  2901  		arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  2902  		reg,
  2903  	)
  2904  
  2905  	// memory.size returns the size in pages, so we divide the byte length by the page size.
  2906  	// "reg = reg >> wasm.MemoryPageSizeInBits (== reg / wasm.MemoryPageSize)"
  2907  	c.assembler.CompileConstToRegister(
  2908  		arm64.LSR,
  2909  		wasm.MemoryPageSizeInBits,
  2910  		reg,
  2911  	)
  2912  
  2913  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
  2914  	return nil
  2915  }
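
        // For example, a two-page memory has len(memory.Buffer) == 131072, and the logical
        // shift right by wasm.MemoryPageSizeInBits (16) yields 131072 >> 16 == 2, the page
        // count that memory.size must push.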
  2916  
  2917  // compileCallGoFunction adds instructions to call a Go function; for builtin functions,
  2918  // the target index (builtinFunction) is recorded on the call engine first. compilerStatus is set
  2919  // before making the call, and it should be either nativeCallStatusCodeCallBuiltInFunction or nativeCallStatusCodeCallGoHostFunction.
  2920  func (c *arm64Compiler) compileCallGoFunction(compilerStatus nativeCallStatusCode, builtinFunction wasm.Index) error {
  2921  	// Release all the registers, as our calling convention requires the caller to save them (caller-saved registers).
  2922  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  2923  		return err
  2924  	}
  2925  
  2926  	if compilerStatus == nativeCallStatusCodeCallBuiltInFunction {
  2927  		// Set the target builtin function index on the call engine.
  2928  		// "tmp = $index"
  2929  		c.assembler.CompileConstToRegister(
  2930  			arm64.MOVD,
  2931  			int64(builtinFunction),
  2932  			arm64ReservedRegisterForTemporary,
  2933  		)
  2934  		// "[arm64ReservedRegisterForCallEngine + callEngineExitContextFunctionCallAddressOffset] = tmp"
  2935  		// In other words, "ce.functionCallAddress = tmp (== $addr)"
  2936  		c.assembler.CompileRegisterToMemory(
  2937  			arm64.STRW,
  2938  			arm64ReservedRegisterForTemporary,
  2939  			arm64ReservedRegisterForCallEngine, callEngineExitContextBuiltinFunctionCallIndexOffset,
  2940  		)
  2941  	}
  2942  
  2943  	c.compileExitFromNativeCode(compilerStatus)
  2944  	return nil
  2945  }
  2946  
  2947  // compileConstI32 implements compiler.compileConstI32 for the arm64 architecture.
  2948  func (c *arm64Compiler) compileConstI32(o *wazeroir.UnionOperation) error {
  2949  	return c.compileIntConstant(true, o.U1)
  2950  }
  2951  
  2952  // compileConstI64 implements compiler.compileConstI64 for the arm64 architecture.
  2953  func (c *arm64Compiler) compileConstI64(o *wazeroir.UnionOperation) error {
  2954  	return c.compileIntConstant(false, o.U1)
  2955  }
  2956  
  2957  // compileIntConstant adds instructions to load an integer constant.
  2958  // is32bit is true if the target value is originally a 32-bit const, false otherwise.
  2959  // value holds the target constant (zero-extended in the 32-bit case).
  2960  func (c *arm64Compiler) compileIntConstant(is32bit bool, value uint64) error {
  2961  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  2962  		return err
  2963  	}
  2964  
  2965  	var inst asm.Instruction
  2966  	var vt runtimeValueType
  2967  	if is32bit {
  2968  		inst = arm64.MOVW
  2969  		vt = runtimeValueTypeI32
  2970  	} else {
  2971  		inst = arm64.MOVD
  2972  		vt = runtimeValueTypeI64
  2973  	}
  2974  
  2975  	if value == 0 {
  2976  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, vt)
  2977  	} else {
  2978  		// Take a register to load the value.
  2979  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2980  		if err != nil {
  2981  			return err
  2982  		}
  2983  
  2984  		c.assembler.CompileConstToRegister(inst, int64(value), reg)
  2985  
  2986  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  2987  	}
  2988  	return nil
  2989  }
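
        // Editorial sketch of the two shapes: "i32.const 0" costs no instruction at all, as
        // the location stack simply records the zero register; a non-zero constant such as
        // "i64.const 0x1234" becomes roughly "MOVD x?, #0x1234" (the assembler is free to
        // split large constants into multiple moves, e.g. MOVZ/MOVK-style sequences).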
  2990  
  2991  // compileConstF32 implements compiler.compileConstF32 for the arm64 architecture.
  2992  func (c *arm64Compiler) compileConstF32(o *wazeroir.UnionOperation) error {
  2993  	return c.compileFloatConstant(true, o.U1 /*uint64(math.Float32bits(o.Value))*/)
  2994  }
  2995  
  2996  // compileConstF64 implements compiler.compileConstF64 for the arm64 architecture.
  2997  func (c *arm64Compiler) compileConstF64(o *wazeroir.UnionOperation) error {
  2998  	return c.compileFloatConstant(false, o.U1 /*math.Float64bits(o.Value)*/)
  2999  }
  3000  
  3001  // compileFloatConstant adds instructions to load a float constant.
  3002  // is32bit is true if the target value is originally a 32-bit const, false otherwise.
  3003  // value holds the bit representation of the target float constant (zero-extended in the 32-bit case).
  3004  func (c *arm64Compiler) compileFloatConstant(is32bit bool, value uint64) error {
  3005  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3006  		return err
  3007  	}
  3008  
  3009  	// Take a register to load the value.
  3010  	reg, err := c.allocateRegister(registerTypeVector)
  3011  	if err != nil {
  3012  		return err
  3013  	}
  3014  
  3015  	tmpReg := arm64.RegRZR
  3016  	if value != 0 {
  3017  		tmpReg = arm64ReservedRegisterForTemporary
  3018  		var inst asm.Instruction
  3019  		if is32bit {
  3020  			inst = arm64.MOVW
  3021  		} else {
  3022  			inst = arm64.MOVD
  3023  		}
  3024  		c.assembler.CompileConstToRegister(inst, int64(value), tmpReg)
  3025  	}
  3026  
  3027  	// Use the FMOV instruction to move the value from the integer register into the float one.
  3028  	var inst asm.Instruction
  3029  	var vt runtimeValueType
  3030  	if is32bit {
  3031  		vt = runtimeValueTypeF32
  3032  		inst = arm64.FMOVS
  3033  	} else {
  3034  		vt = runtimeValueTypeF64
  3035  		inst = arm64.FMOVD
  3036  	}
  3037  	c.assembler.CompileRegisterToRegister(inst, tmpReg, reg)
  3038  
  3039  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  3040  	return nil
  3041  }
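
        // Editorial sketch: "f32.const 1.0" has the bit pattern 0x3f800000, so the sequence
        // is roughly "MOVW tmp, #0x3f800000; FMOVS s?, tmp", while "f32.const 0.0" skips the
        // MOV entirely and FMOVs straight from the zero register.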
  3042  
  3043  // compileMemoryInit implements compiler.compileMemoryInit for the arm64 architecture.
  3044  func (c *arm64Compiler) compileMemoryInit(o *wazeroir.UnionOperation) error {
  3045  	dataIndex := uint32(o.U1)
  3046  	return c.compileInitImpl(false, dataIndex, 0)
  3047  }
  3048  
  3049  // compileInitImpl implements compileTableInit and compileMemoryInit.
  3050  //
  3051  // TODO: the compiled code in this function should be reused and compiled only once,
  3052  // as the code is independent of any module.
  3053  func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) error {
  3054  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3055  	if isTable {
  3056  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3057  	}
  3058  
  3059  	copySize, err := c.popValueOnRegister()
  3060  	if err != nil {
  3061  		return err
  3062  	}
  3063  	c.markRegisterUsed(copySize.register)
  3064  
  3065  	sourceOffset, err := c.popValueOnRegister()
  3066  	if err != nil {
  3067  		return err
  3068  	}
  3069  	if isZeroRegister(sourceOffset.register) {
  3070  		sourceOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3071  		if err != nil {
  3072  			return err
  3073  		}
  3074  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, sourceOffset.register)
  3075  	}
  3076  	c.markRegisterUsed(sourceOffset.register)
  3077  
  3078  	destinationOffset, err := c.popValueOnRegister()
  3079  	if err != nil {
  3080  		return err
  3081  	}
  3082  	if isZeroRegister(destinationOffset.register) {
  3083  		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3084  		if err != nil {
  3085  			return err
  3086  		}
  3087  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
  3088  	}
  3089  	c.markRegisterUsed(destinationOffset.register)
  3090  
  3091  	tableInstanceAddressReg := asm.NilRegister
  3092  	if isTable {
  3093  		tableInstanceAddressReg, err = c.allocateRegister(registerTypeGeneralPurpose)
  3094  		if err != nil {
  3095  			return err
  3096  		}
  3097  		c.markRegisterUsed(tableInstanceAddressReg)
  3098  	}
  3099  
  3100  	if !isZeroRegister(copySize.register) {
  3101  		// sourceOffset += size.
  3102  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, sourceOffset.register)
  3103  		// destinationOffset += size.
  3104  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, destinationOffset.register)
  3105  	}
  3106  
  3107  	instanceAddr, err := c.allocateRegister(registerTypeGeneralPurpose)
  3108  	if err != nil {
  3109  		return err
  3110  	}
  3111  
  3112  	if isTable {
  3113  		c.compileLoadElemInstanceAddress(index, instanceAddr)
  3114  	} else {
  3115  		c.compileLoadDataInstanceAddress(index, instanceAddr)
  3116  	}
  3117  
  3118  	// Check the data/element instance bounds.
  3119  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3120  		instanceAddr, 8, // Both DataInstance and ElementInstance store their length at offset 8.
  3121  		arm64ReservedRegisterForTemporary)
  3122  
  3123  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register)
  3124  	// If not, raise an out-of-bounds access error.
  3125  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3126  
  3127  	// Otherwise, the source access is in bounds.
  3128  	// Check the destination bounds.
  3129  	if isTable {
  3130  		// arm64ReservedRegisterForTemporary = &tables[0]
  3131  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3132  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3133  			arm64ReservedRegisterForTemporary)
  3134  		// tableInstanceAddressReg = [arm64ReservedRegisterForTemporary + tableIndex*8]
  3135  		//                         = [&tables[0] + tableIndex*sizeOf(*tableInstance)]
  3136  		//                         = tables[tableIndex]
  3137  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3138  			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
  3139  			tableInstanceAddressReg)
  3140  		// arm64ReservedRegisterForTemporary = [tableInstanceAddressReg+tableInstanceTableLenOffset] = len(tables[tableIndex])
  3141  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3142  			tableInstanceAddressReg, tableInstanceTableLenOffset,
  3143  			arm64ReservedRegisterForTemporary)
  3144  	} else {
  3145  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3146  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  3147  			arm64ReservedRegisterForTemporary)
  3148  	}
  3149  
  3150  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
  3151  	// If not, raise an out-of-bounds access error.
  3152  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3153  
  3154  	// Otherwise, ready to copy the value from source to destination.
  3155  	if !isZeroRegister(copySize.register) {
  3156  		// If the size equals zero, we can skip the instructions below entirely.
  3157  		c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register)
  3158  		skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)
  3159  
  3160  		var ldr, str asm.Instruction
  3161  		var movSize int64
  3162  		if isTable {
  3163  			ldr, str = arm64.LDRD, arm64.STRD
  3164  			movSize = 8
  3165  
  3166  			// arm64ReservedRegisterForTemporary = &Table[0]
  3167  			c.assembler.CompileMemoryToRegister(arm64.LDRD, tableInstanceAddressReg,
  3168  				tableInstanceTableOffset, arm64ReservedRegisterForTemporary)
  3169  			// destinationOffset = (destinationOffset<< pointerSizeLog2) + arm64ReservedRegisterForTemporary
  3170  			c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3171  				destinationOffset.register, pointerSizeLog2,
  3172  				arm64ReservedRegisterForTemporary, destinationOffset.register)
  3173  
  3174  			// arm64ReservedRegisterForTemporary = &ElementInstance.References[0]
  3175  			c.assembler.CompileMemoryToRegister(arm64.LDRD, instanceAddr, 0, arm64ReservedRegisterForTemporary)
  3176  			// sourceOffset = (sourceOffset<< pointerSizeLog2) + arm64ReservedRegisterForTemporary
  3177  			c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3178  				sourceOffset.register, pointerSizeLog2,
  3179  				arm64ReservedRegisterForTemporary, sourceOffset.register)
  3180  
  3181  			// copySize = copySize << pointerSizeLog2
  3182  			c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
  3183  		} else {
  3184  			ldr, str = arm64.LDRB, arm64.STRB
  3185  			movSize = 1
  3186  
  3187  			// destinationOffset += memory buffer's absolute address.
  3188  			c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3189  
  3190  			// sourceOffset += data buffer's absolute address.
  3191  			c.assembler.CompileMemoryToRegister(arm64.LDRD, instanceAddr, 0, arm64ReservedRegisterForTemporary)
  3192  			c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, sourceOffset.register)
  3193  
  3194  		}
  3195  
  3196  		// Negate the counter.
  3197  		c.assembler.CompileRegisterToRegister(arm64.NEG, copySize.register, copySize.register)
  3198  
  3199  		beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3200  
  3201  		// arm64ReservedRegisterForTemporary = [sourceOffset + (size.register)]
  3202  		c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
  3203  			sourceOffset.register, copySize.register,
  3204  			arm64ReservedRegisterForTemporary)
  3205  		// [destinationOffset + (size.register)] = arm64ReservedRegisterForTemporary.
  3206  		c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3207  			arm64ReservedRegisterForTemporary,
  3208  			destinationOffset.register, copySize.register,
  3209  		)
  3210  
  3211  		// Decrement the size counter and if the value is still negative, continue the loop.
  3212  		c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register)
  3213  		c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop)
  3214  
  3215  		c.assembler.SetJumpTargetOnNext(skipCopyJump)
  3216  	}
  3217  
  3218  	c.markRegisterUnused(copySize.register, sourceOffset.register,
  3219  		destinationOffset.register, instanceAddr, tableInstanceAddressReg)
  3220  	return nil
  3221  }
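
        // Worked example of the copy loop above (editorial): with copySize == 3 in the memory
        // case (movSize == 1), both offsets are first advanced past the end of their regions,
        // then the counter is negated to -3. Each iteration accesses index end-3, end-2, end-1
        // while ADDS walks the counter -3 -> -2 -> -1 -> 0, and B.MI loops only while the
        // counter is still negative, so exactly three bytes are copied.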
  3222  
  3223  // compileDataDrop implements compiler.compileDataDrop for the arm64 architecture.
  3224  func (c *arm64Compiler) compileDataDrop(o *wazeroir.UnionOperation) error {
  3225  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3226  		return err
  3227  	}
  3228  
  3229  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3230  	if err != nil {
  3231  		return err
  3232  	}
  3233  
  3234  	dataIndex := uint32(o.U1)
  3235  	c.compileLoadDataInstanceAddress(dataIndex, tmp)
  3236  
  3237  	// Clears the content of DataInstances[dataIndex] (== []byte type), i.e. zeroes the slice header (data pointer, len, cap).
  3238  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 0)
  3239  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 8)
  3240  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 16)
  3241  	return nil
  3242  }
  3243  
  3244  func (c *arm64Compiler) compileLoadDataInstanceAddress(dataIndex uint32, dst asm.Register) {
  3245  	// dst = dataIndex * dataInstanceStructSize
  3246  	c.assembler.CompileConstToRegister(arm64.MOVD, int64(dataIndex)*dataInstanceStructSize, dst)
  3247  
  3248  	// arm64ReservedRegisterForTemporary = &moduleInstance.DataInstances[0]
  3249  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3250  		arm64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  3251  		arm64ReservedRegisterForTemporary,
  3252  	)
  3253  
  3254  	// dst = arm64ReservedRegisterForTemporary + dst
  3255  	//     = &moduleInstance.DataInstances[0] + dataIndex*dataInstanceStructSize
  3256  	//     = &moduleInstance.DataInstances[dataIndex]
  3257  	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, dst)
  3258  }
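
        // For instance (editorial, assuming dataInstanceStructSize is the 24-byte Go slice
        // header, which the three 8-byte stores in compileDataDrop suggest): dataIndex == 2
        // yields dst == 48, and the final ADD turns it into &DataInstances[0] + 48, the
        // address of the third instance's header.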
  3259  
  3260  // compileMemoryCopy implements compiler.compileMemoryCopy for the arm64 architecture.
  3261  func (c *arm64Compiler) compileMemoryCopy() error {
  3262  	return c.compileCopyImpl(false, 0, 0)
  3263  }
  3264  
  3265  // compileCopyImpl implements compileTableCopy and compileMemoryCopy.
  3266  //
  3267  // TODO: the compiled code in this function should be reused and compiled only once,
  3268  // as the code is independent of any module.
  3269  func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableIndex uint32) error {
  3270  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3271  	if isTable {
  3272  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3273  	}
  3274  
  3275  	copySize, err := c.popValueOnRegister()
  3276  	if err != nil {
  3277  		return err
  3278  	}
  3279  	c.markRegisterUsed(copySize.register)
  3280  
  3281  	sourceOffset, err := c.popValueOnRegister()
  3282  	if err != nil {
  3283  		return err
  3284  	}
  3285  	if isZeroRegister(sourceOffset.register) {
  3286  		sourceOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3287  		if err != nil {
  3288  			return err
  3289  		}
  3290  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, sourceOffset.register)
  3291  	}
  3292  	c.markRegisterUsed(sourceOffset.register)
  3293  
  3294  	destinationOffset, err := c.popValueOnRegister()
  3295  	if err != nil {
  3296  		return err
  3297  	}
  3298  	if isZeroRegister(destinationOffset.register) {
  3299  		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3300  		if err != nil {
  3301  			return err
  3302  		}
  3303  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
  3304  	}
  3305  	c.markRegisterUsed(destinationOffset.register)
  3306  
  3307  	if !isZeroRegister(copySize.register) {
  3308  		// sourceOffset += size.
  3309  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, sourceOffset.register)
  3310  		// destinationOffset += size.
  3311  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, destinationOffset.register)
  3312  	}
  3313  
  3314  	if isTable {
  3315  		// arm64ReservedRegisterForTemporary = &tables[0]
  3316  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3317  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3318  			arm64ReservedRegisterForTemporary)
  3319  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + srcTableIndex*8]
  3320  		//                                   = [&tables[0] + srcTableIndex*sizeOf(*tableInstance)]
  3321  		//                                   = tables[srcTableIndex]
  3322  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3323  			arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
  3324  			arm64ReservedRegisterForTemporary)
  3325  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[srcTableIndex])
  3326  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3327  			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3328  			arm64ReservedRegisterForTemporary)
  3329  	} else {
  3330  		// arm64ReservedRegisterForTemporary = len(memoryInst.Buffer).
  3331  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3332  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  3333  			arm64ReservedRegisterForTemporary)
  3334  	}
  3335  
  3336  	// Check len >= sourceOffset.
  3337  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register)
  3338  	// If not, raise an out-of-bounds access error.
  3339  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3340  
  3341  	// Otherwise, check len >= destinationOffset.
  3342  	if isTable {
  3343  		// arm64ReservedRegisterForTemporary = &tables[0]
  3344  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3345  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3346  			arm64ReservedRegisterForTemporary)
  3347  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + dstTableIndex*8]
  3348  		//                                   = [&tables[0] + dstTableIndex*sizeOf(*tableInstance)]
  3349  		//                                   = tables[dstTableIndex]
  3350  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3351  			arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
  3352  			arm64ReservedRegisterForTemporary)
  3353  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[dstTableIndex])
  3354  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3355  			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3356  			arm64ReservedRegisterForTemporary)
  3357  	}
  3358  
  3359  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
  3360  	// If not, raise an out-of-bounds access error.
  3361  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3362  
  3363  	// Otherwise, ready to copy the value from source to destination.
  3364  	var ldr, str asm.Instruction
  3365  	var movSize int64
  3366  	if isTable {
  3367  		ldr, str = arm64.LDRD, arm64.STRD
  3368  		movSize = 8
  3369  	} else {
  3370  		ldr, str = arm64.LDRB, arm64.STRB
  3371  		movSize = 1
  3372  	}
  3373  
  3374  	// If the size equals zero, we can skip the instructions below entirely.
  3375  	if !isZeroRegister(copySize.register) {
  3376  		c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register)
  3377  		skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)
  3378  
  3379  		// If source offset < destination offset: for (i = size-1; i >= 0; i--) dst[i] = src[i];
  3380  		c.assembler.CompileTwoRegistersToNone(arm64.CMP, sourceOffset.register, destinationOffset.register)
  3381  		destLowerThanSourceJump := c.assembler.CompileJump(arm64.BCONDLS)
  3382  		var endJump asm.Node
  3383  		{
  3384  			// sourceOffset -= size.
  3385  			c.assembler.CompileRegisterToRegister(arm64.SUB, copySize.register, sourceOffset.register)
  3386  			// destinationOffset -= size.
  3387  			c.assembler.CompileRegisterToRegister(arm64.SUB, copySize.register, destinationOffset.register)
  3388  
  3389  			if isTable {
  3390  				// arm64ReservedRegisterForTemporary = &Tables[dstTableIndex].Table[0]
  3391  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3392  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3393  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3394  					arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
  3395  					arm64ReservedRegisterForTemporary)
  3396  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3397  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3398  					arm64ReservedRegisterForTemporary)
  3399  				// destinationOffset = (destinationOffset<< pointerSizeLog2) + &Table[dstTableIndex].Table[0]
  3400  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3401  					destinationOffset.register, pointerSizeLog2,
  3402  					arm64ReservedRegisterForTemporary, destinationOffset.register)
  3403  
  3404  				// arm64ReservedRegisterForTemporary = &Tables[srcTableIndex]
  3405  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3406  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3407  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3408  					arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
  3409  					arm64ReservedRegisterForTemporary)
  3410  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3411  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3412  					arm64ReservedRegisterForTemporary)
  3413  				// sourceOffset = (sourceOffset << pointerSizeLog2) + &Tables[srcTableIndex].Table[0]
  3414  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3415  					sourceOffset.register, pointerSizeLog2,
  3416  					arm64ReservedRegisterForTemporary, sourceOffset.register)
  3417  
  3418  				// copySize = copySize << pointerSizeLog2 as each element has 8 bytes and we copy one by one.
  3419  				c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
  3420  			} else {
  3421  				// sourceOffset += memory buffer's absolute address.
  3422  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, sourceOffset.register)
  3423  				// destinationOffset += memory buffer's absolute address.
  3424  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3425  			}
  3426  
  3427  			beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3428  
  3429  			// size -= movSize
  3430  			c.assembler.CompileConstToRegister(arm64.SUBS, movSize, copySize.register)
  3431  
  3432  			// arm64ReservedRegisterForTemporary = [sourceOffset + (size.register)]
  3433  			c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
  3434  				sourceOffset.register, copySize.register,
  3435  				arm64ReservedRegisterForTemporary)
  3436  			// [destinationOffset + (size.register)] = arm64ReservedRegisterForTemporary.
  3437  			c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3438  				arm64ReservedRegisterForTemporary,
  3439  				destinationOffset.register, copySize.register,
  3440  			)
  3441  
  3442  			// If the value in copySize.register has not reached zero, continue the loop.
  3443  			c.assembler.CompileJump(arm64.BCONDNE).AssignJumpTarget(beginCopyLoop)
  3444  
  3445  			// Otherwise, exit the loop.
  3446  			endJump = c.assembler.CompileJump(arm64.B)
  3447  		}
  3448  
  3449  		// Else (destination offset <= source offset): for (i = 0; i < size; i++) dst[i] = src[i];
  3450  		c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
  3451  		{
  3452  
  3453  			if isTable {
  3454  				// arm64ReservedRegisterForTemporary = &Tables[dstTableIndex].Table[0]
  3455  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3456  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3457  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3458  					arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
  3459  					arm64ReservedRegisterForTemporary)
  3460  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3461  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3462  					arm64ReservedRegisterForTemporary)
  3463  				// destinationOffset = (destinationOffset << pointerSizeLog2) + &Table[dstTableIndex].Table[0]
  3464  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3465  					destinationOffset.register, pointerSizeLog2,
  3466  					arm64ReservedRegisterForTemporary, destinationOffset.register)
  3467  
  3468  				// arm64ReservedRegisterForTemporary = &Tables[srcTableIndex]
  3469  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3470  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3471  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3472  					arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
  3473  					arm64ReservedRegisterForTemporary)
  3474  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3475  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3476  					arm64ReservedRegisterForTemporary)
  3477  				// sourceOffset = (sourceOffset << pointerSizeLog2) + &Tables[srcTableIndex].Table[0]
  3478  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3479  					sourceOffset.register, pointerSizeLog2,
  3480  					arm64ReservedRegisterForTemporary, sourceOffset.register)
  3481  
  3482  				// copySize = copySize << pointerSizeLog2 as each element has 8 bytes and we copy one by one.
  3483  				c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
  3484  			} else {
  3485  				// sourceOffset += memory buffer's absolute address.
  3486  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, sourceOffset.register)
  3487  				// destinationOffset += memory buffer's absolute address.
  3488  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3489  			}
  3490  
  3491  			// Negate the counter.
  3492  			c.assembler.CompileRegisterToRegister(arm64.NEG, copySize.register, copySize.register)
  3493  
  3494  			beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3495  
  3496  			// arm64ReservedRegisterForTemporary = [sourceOffset + (size.register)]
  3497  			c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
  3498  				sourceOffset.register, copySize.register,
  3499  				arm64ReservedRegisterForTemporary)
  3500  			// [destinationOffset + (size.register)] = arm64ReservedRegisterForTemporary.
  3501  			c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3502  				arm64ReservedRegisterForTemporary,
  3503  				destinationOffset.register, copySize.register,
  3504  			)
  3505  
  3506  			// size += movSize (the counter is negative and moves toward zero)
  3507  			c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register)
  3508  			c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop)
  3509  		}
  3510  		c.assembler.SetJumpTargetOnNext(skipCopyJump)
  3511  		c.assembler.SetJumpTargetOnNext(endJump)
  3512  	}
  3513  
  3514  	// Mark all of the operand registers as unused.
  3515  	c.markRegisterUnused(copySize.register, sourceOffset.register, destinationOffset.register)
  3516  
  3517  	return nil
  3518  }
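
        // Why the direction choice matters (editorial): for memory.copy with dst=10, src=8,
        // n=4 the regions overlap and src < dst, so the first branch copies backward
        // (indices 3,2,1,0) and dst[10..13] receives a correct copy of src[8..11]; a forward
        // copy would overwrite src[10..11] before reading them. When dst <= src, the second
        // branch copies forward, which is equally safe for that overlap direction.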
  3519  
  3520  // compileMemoryFill implements compiler.compileMemoryFill for the arm64 architecture.
  3521  func (c *arm64Compiler) compileMemoryFill() error {
  3522  	return c.compileFillImpl(false, 0)
  3523  }
  3524  
  3525  // compileFillImpl implements compileTableFill and compileMemoryFill.
  3526  //
  3527  // TODO: the compiled code in this function should be reused and compiled only once,
  3528  // as the code is independent of any module.
  3529  func (c *arm64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error {
  3530  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3531  	if isTable {
  3532  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3533  	}
  3534  
  3535  	fillSize, err := c.popValueOnRegister()
  3536  	if err != nil {
  3537  		return err
  3538  	}
  3539  	c.markRegisterUsed(fillSize.register)
  3540  
  3541  	value, err := c.popValueOnRegister()
  3542  	if err != nil {
  3543  		return err
  3544  	}
  3545  	c.markRegisterUsed(value.register)
  3546  
  3547  	destinationOffset, err := c.popValueOnRegister()
  3548  	if err != nil {
  3549  		return err
  3550  	}
  3551  	if isZeroRegister(destinationOffset.register) {
  3552  		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3553  		if err != nil {
  3554  			return err
  3555  		}
  3556  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
  3557  	}
  3558  	c.markRegisterUsed(destinationOffset.register)
  3559  
  3560  	// destinationOffset += size.
  3561  	c.assembler.CompileRegisterToRegister(arm64.ADD, fillSize.register, destinationOffset.register)
  3562  
  3563  	if isTable {
  3564  		// arm64ReservedRegisterForTemporary = &tables[0]
  3565  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3566  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3567  			arm64ReservedRegisterForTemporary)
  3568  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + tableIndex*8]
  3569  		//                                   = [&tables[0] + tableIndex*sizeOf(*tableInstance)]
  3570  		//                                   = tables[tableIndex]
  3571  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3572  			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
  3573  			arm64ReservedRegisterForTemporary)
  3574  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[tableIndex])
  3575  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3576  			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3577  			arm64ReservedRegisterForTemporary)
  3578  	} else {
  3579  		// arm64ReservedRegisterForTemporary = len(memoryInst.Buffer).
  3580  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3581  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  3582  			arm64ReservedRegisterForTemporary)
  3583  	}
  3584  
  3585  	// Check len >= destinationOffset.
  3586  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
  3587  
  3588  	// If not, raise the runtime error.
  3589  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3590  
  3591  	// Otherwise, ready to fill the region with the value.
  3592  	// If the size equals zero, we can skip the instructions below entirely.
  3593  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, fillSize.register)
  3594  	skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)
  3595  
  3596  	// destinationOffset -= size.
  3597  	c.assembler.CompileRegisterToRegister(arm64.SUB, fillSize.register, destinationOffset.register)
  3598  
  3599  	var str asm.Instruction
  3600  	var movSize int64
  3601  	if isTable {
  3602  		str = arm64.STRD
  3603  		movSize = 8
  3604  
  3605  		// arm64ReservedRegisterForTemporary = &Tables[tableIndex].Table[0]
  3606  		c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3607  			callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3608  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3609  			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
  3610  			arm64ReservedRegisterForTemporary)
  3611  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3612  			arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3613  			arm64ReservedRegisterForTemporary)
  3614  		// destinationOffset = (destinationOffset << pointerSizeLog2) + &Tables[tableIndex].Table[0]
  3615  		c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3616  			destinationOffset.register, pointerSizeLog2,
  3617  			arm64ReservedRegisterForTemporary, destinationOffset.register)
  3618  
  3619  		// fillSize = fillSize << pointerSizeLog2 as each element has 8 bytes and we fill them one by one.
  3620  		c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, fillSize.register)
  3621  	} else {
  3622  		str = arm64.STRB
  3623  		movSize = 1
  3624  
  3625  		// destinationOffset += memory buffer's absolute address.
  3626  		c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3627  	}
  3628  
  3629  	// Naively implement the fill with a for-loop, storing one element at a time.
  3630  	beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3631  
  3632  	// size -= movSize
  3633  	c.assembler.CompileConstToRegister(arm64.SUBS, movSize, fillSize.register)
  3634  
  3635  	// [destinationOffset + (fillSize.register)] = value.register.
  3636  	c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3637  		value.register,
  3638  		destinationOffset.register, fillSize.register,
  3639  	)
  3640  
  3641  	// If the value in fillSize.register has not reached zero, continue the loop.
  3642  	continueJump := c.assembler.CompileJump(arm64.BCONDNE)
  3643  	continueJump.AssignJumpTarget(beginCopyLoop)
  3644  
  3645  	// Mark all of the operand registers as unused.
  3646  	c.markRegisterUnused(fillSize.register, value.register, destinationOffset.register)
  3647  
  3648  	c.assembler.SetJumpTargetOnNext(skipCopyJump)
  3649  	return nil
  3650  }
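
        // Editorial sketch of the emitted fill loop (memory case, illustrative names), where
        // dstOff has already been rewound to the absolute start of the region:
        //
        //	loop:
        //	SUBS n, n, #1
        //	STRB W(val), [dstOff, n]  ; dst[n] = value
        //	B.NE loop
        //
        // SUBS decrements before each store, so indices n-1 down to 0 are written exactly once.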
  3651  
  3652  // compileTableInit implements compiler.compileTableInit for the arm64 architecture.
  3653  func (c *arm64Compiler) compileTableInit(o *wazeroir.UnionOperation) error {
  3654  	elemIndex := uint32(o.U1)
  3655  	tableIndex := uint32(o.U2)
  3656  	return c.compileInitImpl(true, elemIndex, tableIndex)
  3657  }
  3658  
  3659  // compileTableCopy implements compiler.compileTableCopy for the arm64 architecture.
  3660  func (c *arm64Compiler) compileTableCopy(o *wazeroir.UnionOperation) error {
  3661  	return c.compileCopyImpl(true, uint32(o.U1), uint32(o.U2))
  3662  }
  3663  
  3664  // compileElemDrop implements compiler.compileElemDrop for the arm64 architecture.
  3665  func (c *arm64Compiler) compileElemDrop(o *wazeroir.UnionOperation) error {
  3666  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3667  		return err
  3668  	}
  3669  
  3670  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3671  	if err != nil {
  3672  		return err
  3673  	}
  3674  
  3675  	elemIndex := uint32(o.U1)
  3676  	c.compileLoadElemInstanceAddress(elemIndex, tmp)
  3677  
  3678  	// Clears the content of ElementInstances[elemIndex] (== []interface{} type), i.e. zeroes the slice header (data pointer, len, cap).
  3679  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 0)
  3680  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 8)
  3681  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 16)
  3682  	return nil
  3683  }
  3684  
  3685  func (c *arm64Compiler) compileLoadElemInstanceAddress(elemIndex uint32, dst asm.Register) {
  3686  	// dst = elemIndex * elementInstanceStructSize
  3687  	c.assembler.CompileConstToRegister(arm64.MOVD, int64(elemIndex)*elementInstanceStructSize, dst)
  3688  
  3689  	// arm64ReservedRegisterForTemporary = &moduleInstance.ElementInstances[0]
  3690  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3691  		arm64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  3692  		arm64ReservedRegisterForTemporary,
  3693  	)
  3694  
  3695  	// dst = arm64ReservedRegisterForTemporary + dst
  3696  	//     = &moduleInstance.ElementInstances[0] + elemIndex*elementInstanceStructSize
  3697  	//     = &moduleInstance.ElementInstances[elemIndex]
  3698  	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, dst)
  3699  }
  3700  
  3701  // compileRefFunc implements compiler.compileRefFunc for the arm64 architecture.
  3702  func (c *arm64Compiler) compileRefFunc(o *wazeroir.UnionOperation) error {
  3703  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3704  		return err
  3705  	}
  3706  
  3707  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  3708  	if err != nil {
  3709  		return err
  3710  	}
  3711  	// ref = [arm64ReservedRegisterForCallEngine + callEngineModuleContextFunctionsElement0AddressOffset]
  3712  	//     = &moduleEngine.functions[0]
  3713  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3714  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  3715  		ref)
  3716  
  3717  	// ref = ref + functionIndex*sizeOf(function)
  3718  	//     = &moduleEngine.functions[index]
  3719  	functionIndex := int64(o.U1)
  3720  	c.assembler.CompileConstToRegister(arm64.ADD,
  3721  		functionIndex*functionSize,
  3722  		ref,
  3723  	)
  3724  
  3725  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64)
  3726  	return nil
  3727  }
  3728  
  3729  // compileTableGet implements compiler.compileTableGet for the arm64 architecture.
  3730  func (c *arm64Compiler) compileTableGet(o *wazeroir.UnionOperation) error {
  3731  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  3732  	if err != nil {
  3733  		return err
  3734  	}
  3735  	c.markRegisterUsed(ref)
  3736  
  3737  	offset, err := c.popValueOnRegister()
  3738  	if err != nil {
  3739  		return err
  3740  	}
  3741  
  3742  	// arm64ReservedRegisterForTemporary = &tables[0]
  3743  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3744  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3745  		arm64ReservedRegisterForTemporary)
  3746  	// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + TableIndex*8]
  3747  	//                                   = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  3748  	//                                   = [&tables[TableIndex]] = tables[TableIndex].
  3749  	tableIndex := int64(o.U1)
  3750  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3751  		arm64ReservedRegisterForTemporary, tableIndex*8,
  3752  		arm64ReservedRegisterForTemporary)
  3753  
  3754  	// Out of bounds check.
  3755  	// ref = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
  3756  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3757  		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3758  		ref,
  3759  	)
  3760  	// "cmp ref, offset"
  3761  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, ref, offset.register)
  3762  
  3763  	// If offset exceeds len(table), we exit the execution.
  3764  	c.compileMaybeExitFromNativeCode(arm64.BCONDLO, nativeCallStatusCodeInvalidTableAccess)
  3765  
  3766  	// ref = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  3767  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3768  		arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3769  		ref,
  3770  	)
  3771  
  3772  	// ref = (offset << pointerSizeLog2) + ref
  3773  	//     = &tables[TableIndex].References[0] + sizeOf(uintptr) * offset
  3774  	//     = &tables[TableIndex].References[offset]
  3775  	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3776  		offset.register, pointerSizeLog2, ref, ref)
  3777  
  3778  	// ref = [&tables[TableIndex].References[offset]] = the reference itself, loaded as uint64.
  3779  	c.assembler.CompileMemoryToRegister(arm64.LDRD, ref, 0, ref)
  3780  
  3781  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64) // table elements are opaque 64-bit at runtime.
  3782  	return nil
  3783  }
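
        // Address arithmetic recap for table.get (editorial): with offset == 5, the final
        // reference address is &tables[TableIndex].References[0] + (5 << pointerSizeLog2),
        // i.e. 40 bytes past the first element on a 64-bit target, and the closing LDRD
        // dereferences it to push the opaque 64-bit reference.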
  3784  
  3785  // compileTableSet implements compiler.compileTableSet for the arm64 architecture.
  3786  func (c *arm64Compiler) compileTableSet(o *wazeroir.UnionOperation) error {
  3787  	ref := c.locationStack.pop()
  3788  	if err := c.compileEnsureOnRegister(ref); err != nil {
  3789  		return err
  3790  	}
  3791  
  3792  	offset := c.locationStack.pop()
  3793  	if err := c.compileEnsureOnRegister(offset); err != nil {
  3794  		return err
  3795  	}
  3796  
  3797  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3798  	if err != nil {
  3799  		return err
  3800  	}
  3801  
  3802  	// arm64ReservedRegisterForTemporary = &tables[0]
  3803  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3804  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3805  		arm64ReservedRegisterForTemporary)
  3806  	// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + TableIndex*8]
  3807  	//                                   = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  3808  	//                                   = [&tables[TableIndex]] = tables[TableIndex].
  3809  	tableIndex := int64(o.U1)
  3810  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3811  		arm64ReservedRegisterForTemporary, tableIndex*8,
  3812  		arm64ReservedRegisterForTemporary)
  3813  
  3814  	// Out of bounds check.
  3815  	// tmp = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
  3816  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3817  		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3818  		tmp,
  3819  	)
  3820  	// "cmp tmp, offset"
  3821  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp, offset.register)
  3822  
  3823  	// If the offset equals or exceeds len(table), we exit the execution.
  3824  	c.compileMaybeExitFromNativeCode(arm64.BCONDLO, nativeCallStatusCodeInvalidTableAccess)
  3825  
  3826  	// tmp = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  3827  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3828  		arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3829  		tmp,
  3830  	)
  3831  
  3832  	// tmp = (offset << pointerSizeLog2) + tmp
  3833  	//     = &tables[TableIndex].References[0] + sizeOf(uintptr) * offset
  3834  	//     = &tables[TableIndex].References[offset]
  3835  	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD, offset.register, pointerSizeLog2, tmp, tmp)
  3836  
  3837  	// Set the reference's raw pointer.
  3838  	c.assembler.CompileRegisterToMemory(arm64.STRD, ref.register, tmp, 0)
  3839  
  3840  	c.markRegisterUnused(offset.register, ref.register, tmp)
  3841  	return nil
  3842  }
  3843  
  3844  // compileTableGrow implements compiler.compileTableGrow for the arm64 architecture.
  3845  func (c *arm64Compiler) compileTableGrow(o *wazeroir.UnionOperation) error {
  3846  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3847  		return err
  3848  	}
  3849  
  3850  	// Pushes the table index.
  3851  	tableIndex := o.U1
  3852  	if err := c.compileIntConstant(true, tableIndex); err != nil {
  3853  		return err
  3854  	}
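        	// The table index pushed above becomes the top of the value stack; the Go builtin
        	// reads its three arguments (the init value, the entry count, and that index) from there.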
  3855  
  3856  	// Just like memory.grow, table.grow cannot be done in assembly because it involves allocation in Go.
  3857  	// Therefore, we call out to the built-in function for this purpose.
  3858  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexTableGrow); err != nil {
  3859  		return err
  3860  	}
  3861  
  3862  	// TableGrow consumes three values (table index, number of items, initial value).
  3863  	for i := 0; i < 3; i++ {
  3864  		c.locationStack.pop()
  3865  	}
  3866  
  3867  	// Then, the previous length is pushed as the result.
  3868  	v := c.locationStack.pushRuntimeValueLocationOnStack()
  3869  	v.valueType = runtimeValueTypeI32
  3870  
  3871  	// After return, we re-initialize the reserved registers just like in the function preamble.
  3872  	c.compileReservedStackBasePointerRegisterInitialization()
  3873  	c.compileReservedMemoryRegisterInitialization()
  3874  	return nil
  3875  }
  3876  
  3877  // compileTableSize implements compiler.compileTableSize for the arm64 architecture.
  3878  func (c *arm64Compiler) compileTableSize(o *wazeroir.UnionOperation) error {
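        	// Stack effect: [...] -> [..., size]; table.size pops nothing and pushes len(tables[o.U1]) as an i32.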
  3879  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3880  		return err
  3881  	}
  3882  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  3883  	if err != nil {
  3884  		return err
  3885  	}
  3886  	c.markRegisterUsed(result)
  3887  
  3888  	// arm64ReservedRegisterForTemporary = &tables[0]
  3889  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3890  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3891  		arm64ReservedRegisterForTemporary)
  3892  	// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + TableIndex*8]
  3893  	//                                   = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  3894  	//                                   = [&tables[TableIndex]] = tables[TableIndex].
  3895  	tableIndex := int64(o.U1)
  3896  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3897  		arm64ReservedRegisterForTemporary, tableIndex*8,
  3898  		arm64ReservedRegisterForTemporary)
  3899  
  3900  	// result = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
  3901  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3902  		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3903  		result,
  3904  	)
  3905  
  3906  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  3907  	return nil
  3908  }
  3909  
  3910  // compileTableFill implements compiler.compileTableFill for the arm64 architecture.
  3911  func (c *arm64Compiler) compileTableFill(o *wazeroir.UnionOperation) error {
  3912  	tableIndex := uint32(o.U1)
  3913  	return c.compileFillImpl(true, tableIndex)
  3914  }
  3915  
  3916  // popTwoValuesOnRegisters pops two values from the location stack, ensures
  3917  // these two values are located on registers, and marks them unused.
  3918  //
  3919  // TODO: we’d usually prefix this with compileXXX as this might end up emitting instructions,
  3920  // but the name seems awkward.
  3921  func (c *arm64Compiler) popTwoValuesOnRegisters() (x1, x2 *runtimeValueLocation, err error) {
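        	// x2 is popped first because it sits above x1 on the virtual stack.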
  3922  	x2 = c.locationStack.pop()
  3923  	if err = c.compileEnsureOnRegister(x2); err != nil {
  3924  		return
  3925  	}
  3926  
  3927  	x1 = c.locationStack.pop()
  3928  	if err = c.compileEnsureOnRegister(x1); err != nil {
  3929  		return
  3930  	}
  3931  
  3932  	c.markRegisterUnused(x2.register)
  3933  	c.markRegisterUnused(x1.register)
  3934  	return
  3935  }
  3936  
  3937  // popValueOnRegister pops one value from the location stack, ensures
  3938  // that it is located on a register, and marks it unused.
  3939  //
  3940  // TODO: we’d usually prefix this with compileXXX as this might end up emitting instructions,
  3941  // but the name seems awkward.
  3942  func (c *arm64Compiler) popValueOnRegister() (v *runtimeValueLocation, err error) {
  3943  	v = c.locationStack.pop()
  3944  	if err = c.compileEnsureOnRegister(v); err != nil {
  3945  		return
  3946  	}
  3947  
  3948  	c.markRegisterUnused(v.register)
  3949  	return
  3950  }
  3951  
  3952  // compileEnsureOnRegister emits instructions to ensure that a value is located on a register.
  3953  func (c *arm64Compiler) compileEnsureOnRegister(loc *runtimeValueLocation) (err error) {
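        	// A value not already in a register lives either on the memory stack or in the
        	// condition flags; in both cases we materialize it into a register below.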
  3954  	if loc.onStack() {
  3955  		reg, err := c.allocateRegister(loc.getRegisterType())
  3956  		if err != nil {
  3957  			return err
  3958  		}
  3959  
  3960  		// Record that the value holds the register and the register is marked used.
  3961  		loc.setRegister(reg)
  3962  		c.markRegisterUsed(reg)
  3963  
  3964  		c.compileLoadValueOnStackToRegister(loc)
  3965  	} else if loc.onConditionalRegister() {
  3966  		err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
  3967  	}
  3968  	return
  3969  }
  3970  
  3971  // maybeCompileMoveTopConditionalToGeneralPurposeRegister moves the top value on the stack
  3972  // if the value is located on a conditional register.
  3973  //
  3974  // This is usually called at the beginning of methods on the compiler interface that might
  3975  // compile instructions which clobber the condition flags without saving the conditional
  3976  // register value first. compile* functions that don't call this must instead save the
  3977  // conditional value to the stack or a register via compileEnsureOnRegister on the top value.
  3978  func (c *arm64Compiler) maybeCompileMoveTopConditionalToGeneralPurposeRegister() (err error) {
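        	// Only the top value can reside on a conditional register, so peeking once suffices.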
  3979  	if c.locationStack.sp > 0 {
  3980  		if loc := c.locationStack.peek(); loc.onConditionalRegister() {
  3981  			err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
  3982  		}
  3983  	}
  3984  	return
  3985  }
  3986  
  3987  // compileLoadConditionalRegisterToGeneralPurposeRegister saves the conditional register value
  3988  // to a general purpose register.
  3989  func (c *arm64Compiler) compileLoadConditionalRegisterToGeneralPurposeRegister(loc *runtimeValueLocation) error {
  3990  	reg, err := c.allocateRegister(loc.getRegisterType())
  3991  	if err != nil {
  3992  		return err
  3993  	}
  3994  
  3995  	c.markRegisterUsed(reg)
  3996  	c.assembler.CompileConditionalRegisterSet(loc.conditionalRegister, reg)
  3997  
  3998  	// Record that now the value is located on a general purpose register.
  3999  	loc.setRegister(reg)
  4000  	return nil
  4001  }
  4002  
  4003  // compileLoadValueOnStackToRegister implements compiler.compileLoadValueOnStackToRegister for arm64.
  4004  func (c *arm64Compiler) compileLoadValueOnStackToRegister(loc *runtimeValueLocation) {
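        	// Each value occupies one 8-byte stack slot, hence the *8 scaling of stackPointer below.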
  4005  	switch loc.valueType {
  4006  	case runtimeValueTypeI32:
  4007  		c.assembler.CompileMemoryToRegister(arm64.LDRW, arm64ReservedRegisterForStackBasePointerAddress,
  4008  			int64(loc.stackPointer)*8, loc.register)
  4009  	case runtimeValueTypeI64:
  4010  		c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForStackBasePointerAddress,
  4011  			int64(loc.stackPointer)*8, loc.register)
  4012  	case runtimeValueTypeF32:
  4013  		c.assembler.CompileMemoryToRegister(arm64.FLDRS, arm64ReservedRegisterForStackBasePointerAddress,
  4014  			int64(loc.stackPointer)*8, loc.register)
  4015  	case runtimeValueTypeF64:
  4016  		c.assembler.CompileMemoryToRegister(arm64.FLDRD, arm64ReservedRegisterForStackBasePointerAddress,
  4017  			int64(loc.stackPointer)*8, loc.register)
  4018  	case runtimeValueTypeV128Lo:
  4019  		c.assembler.CompileMemoryToVectorRegister(arm64.VMOV,
  4020  			arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8, loc.register,
  4021  			arm64.VectorArrangementQ)
  4022  		// The higher 64 bits were loaded by the 128-bit (Q) load above as well.
  4023  		hi := &c.locationStack.stack[loc.stackPointer+1]
  4024  		hi.setRegister(loc.register)
  4025  	case runtimeValueTypeV128Hi:
  4026  		panic("BUG: V128Hi must be loaded to a register along with V128Lo")
  4027  	}
  4028  }
  4029  
  4030  // allocateRegister implements compiler.allocateRegister for arm64.
  4031  func (c *arm64Compiler) allocateRegister(t registerType) (reg asm.Register, err error) {
  4032  	var ok bool
  4033  	// Try to get an unused register.
  4034  	reg, ok = c.locationStack.takeFreeRegister(t)
  4035  	if ok {
  4036  		return
  4037  	}
  4038  
  4039  	// If not found, we have to steal a used register.
  4040  	stealTarget, ok := c.locationStack.takeStealTargetFromUsedRegister(t)
  4041  	if !ok {
  4042  		err = errors.New("cannot steal register")
  4043  		return
  4044  	}
  4045  
  4046  	// Release the steal target register value onto stack location.
  4047  	reg = stealTarget.register
  4048  	c.compileReleaseRegisterToStack(stealTarget)
  4049  	return
  4050  }
  4051  
  4052  // compileReleaseAllRegistersToStack adds instructions to store all the values located on
  4053  // either general purpose or conditional registers onto the memory stack.
  4054  // See compileReleaseRegisterToStack.
  4055  func (c *arm64Compiler) compileReleaseAllRegistersToStack() (err error) {
  4056  	for i := uint64(0); i < c.locationStack.sp; i++ {
  4057  		if loc := &c.locationStack.stack[i]; loc.onRegister() {
  4058  			c.compileReleaseRegisterToStack(loc)
  4059  		} else if loc.onConditionalRegister() {
  4060  			if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
  4061  				return
  4062  			}
  4063  			c.compileReleaseRegisterToStack(loc)
  4064  		}
  4065  	}
  4066  	return
  4067  }
  4068  
  4069  // compileReleaseRegisterToStack adds instructions to write the value on a register back to the memory stack region.
  4070  func (c *arm64Compiler) compileReleaseRegisterToStack(loc *runtimeValueLocation) {
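        	// Mirror of compileLoadValueOnStackToRegister: each slot is 8 bytes, and a V128 value
        	// spans two consecutive slots written back by a single 128-bit (Q) store.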
  4071  	switch loc.valueType {
  4072  	case runtimeValueTypeI32:
  4073  		c.assembler.CompileRegisterToMemory(arm64.STRW, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4074  	case runtimeValueTypeI64:
  4075  		c.assembler.CompileRegisterToMemory(arm64.STRD, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4076  	case runtimeValueTypeF32:
  4077  		c.assembler.CompileRegisterToMemory(arm64.FSTRS, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4078  	case runtimeValueTypeF64:
  4079  		c.assembler.CompileRegisterToMemory(arm64.FSTRD, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4080  	case runtimeValueTypeV128Lo:
  4081  		c.assembler.CompileVectorRegisterToMemory(arm64.VMOV,
  4082  			loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8,
  4083  			arm64.VectorArrangementQ)
  4084  		// The higher 64 bits were stored by the 128-bit (Q) store above as well.
  4085  		hi := &c.locationStack.stack[loc.stackPointer+1]
  4086  		c.locationStack.releaseRegister(hi)
  4087  	case runtimeValueTypeV128Hi:
  4088  		panic("BUG: V128Hi must be released to the stack along with V128Lo")
  4089  	default:
  4090  		panic("BUG")
  4091  	}
  4092  
  4093  	// Mark the register as free.
  4094  	c.locationStack.releaseRegister(loc)
  4095  }
  4096  
  4097  // compileReservedStackBasePointerRegisterInitialization adds instructions to initialize arm64ReservedRegisterForStackBasePointerAddress
  4098  // so that it points to the absolute address of the stack base for this function.
  4099  func (c *arm64Compiler) compileReservedStackBasePointerRegisterInitialization() {
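        	// In short: arm64ReservedRegisterForStackBasePointerAddress = &ce.stack[0] + ce.stackBasePointerInBytes.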
  4100  	// First, load the address of the first element in the value stack into arm64ReservedRegisterForStackBasePointerAddress temporarily.
  4101  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4102  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackElement0AddressOffset,
  4103  		arm64ReservedRegisterForStackBasePointerAddress)
  4104  
  4105  	// Next, we load the stack base pointer in bytes (ce.stackBasePointerInBytes) into arm64ReservedRegisterForTemporary.
  4106  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4107  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  4108  		arm64ReservedRegisterForTemporary)
  4109  
  4110  	// Finally, we add them: "arm64ReservedRegisterForStackBasePointerAddress += arm64ReservedRegisterForTemporary", i.e. &stack[0] + stackBasePointerInBytes.
  4111  	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, arm64ReservedRegisterForStackBasePointerAddress)
  4112  }
  4113  
  4114  func (c *arm64Compiler) compileReservedMemoryRegisterInitialization() {
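        	// arm64ReservedRegisterForMemory is only read by memory instructions, so its
        	// initialization can be skipped when the module neither has nor uses a memory.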
  4115  	if c.ir.HasMemory || c.ir.UsesMemory {
  4116  		// "arm64ReservedRegisterForMemory = ce.MemoryElement0Address"
  4117  		c.assembler.CompileMemoryToRegister(
  4118  			arm64.LDRD,
  4119  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
  4120  			arm64ReservedRegisterForMemory,
  4121  		)
  4122  	}
  4123  }
  4124  
  4125  // compileModuleContextInitialization adds instructions to initialize ce.moduleContext's fields based on
  4126  // ce.moduleContext.ModuleInstanceAddress.
  4127  // This is called in two cases: in function preamble, and on the return from (non-Go) function calls.
  4128  func (c *arm64Compiler) compileModuleContextInitialization() error {
  4129  	tmpX, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4130  	if !found {
  4131  		panic("BUG: all the registers should be free at this point")
  4132  	}
  4133  	c.markRegisterUsed(tmpX)
  4134  	tmpY, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4135  	if !found {
  4136  		panic("BUG: all the registers should be free at this point")
  4137  	}
  4138  	c.markRegisterUsed(tmpY)
  4139  
  4140  	// "tmpX = ce.ModuleInstanceAddress"
  4141  	c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset, tmpX)
  4142  
  4143  	// If the module instance address stays the same, we can skip the entire code below.
  4144  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64CallingConventionModuleInstanceAddressRegister, tmpX)
  4145  	brIfModuleUnchanged := c.assembler.CompileJump(arm64.BCONDEQ)
  4146  
  4147  	// Otherwise, update the moduleEngine.moduleContext.ModuleInstanceAddress.
  4148  	c.assembler.CompileRegisterToMemory(arm64.STRD,
  4149  		arm64CallingConventionModuleInstanceAddressRegister,
  4150  		arm64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset,
  4151  	)
  4152  
  4153  	// Also, we have to update the following fields:
  4154  	// * callEngine.moduleContext.globalElement0Address
  4155  	// * callEngine.moduleContext.memoryElement0Address
  4156  	// * callEngine.moduleContext.memorySliceLen
  4157  	// * callEngine.moduleContext.memoryInstance
  4158  	// * callEngine.moduleContext.tableElement0Address
  4159  	// * callEngine.moduleContext.tableSliceLen
  4160  	// * callEngine.moduleContext.functionsElement0Address
  4161  	// * callEngine.moduleContext.typeIDsElement0Address
  4162  	// * callEngine.moduleContext.dataInstancesElement0Address
  4163  	// * callEngine.moduleContext.elementInstancesElement0Address
  4164  
  4165  	// Update globalElement0Address.
  4166  	//
  4167  	// Note: if there's a global.get or global.set instruction in the function, the existence of the
  4168  	// globals is ensured by function validation at the module instantiation phase, and that's why it
  4169  	// is ok to skip the initialization if the module's globals slice is empty.
  4170  	if len(c.ir.Globals) > 0 {
  4171  		// "tmpX = &moduleInstance.Globals[0]"
  4172  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4173  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceGlobalsOffset,
  4174  			tmpX,
  4175  		)
  4176  
  4177  		// "ce.GlobalElement0Address = tmpX (== &moduleInstance.Globals[0])"
  4178  		c.assembler.CompileRegisterToMemory(
  4179  			arm64.STRD, tmpX,
  4180  			arm64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset,
  4181  		)
  4182  	}
  4183  
  4184  	// Update memoryElement0Address and memorySliceLen.
  4185  	//
  4186  	// Note: if there's a memory instruction in the function, the memory instance must be non-nil.
  4187  	// That is ensured by function validation at the module instantiation phase, and that's
  4188  	// why it is ok to skip the initialization if the module's memory instance is nil.
  4189  	if c.ir.HasMemory {
  4190  		// "tmpX = moduleInstance.Memory"
  4191  		c.assembler.CompileMemoryToRegister(
  4192  			arm64.LDRD,
  4193  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceMemoryOffset,
  4194  			tmpX,
  4195  		)
  4196  
  4197  		// First, set ce.memoryInstance
  4198  		c.assembler.CompileRegisterToMemory(
  4199  			arm64.STRD,
  4200  			tmpX,
  4201  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset,
  4202  		)
  4203  
  4204  		// Next, we write the memory length into ce.MemorySliceLen.
  4205  		//
  4206  		// "tmpY = [tmpX + memoryInstanceBufferLenOffset] (== len(memory.Buffer))"
  4207  		c.assembler.CompileMemoryToRegister(
  4208  			arm64.LDRD,
  4209  			tmpX, memoryInstanceBufferLenOffset,
  4210  			tmpY,
  4211  		)
  4212  		// "ce.MemorySliceLen = tmpY".
  4213  		c.assembler.CompileRegisterToMemory(
  4214  			arm64.STRD,
  4215  			tmpY,
  4216  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  4217  		)
  4218  
  4219  		// Finally, we write ce.memoryElement0Address.
  4220  		//
  4221  		// "tmpY = *tmpX (== &memory.Buffer[0])"
  4222  		c.assembler.CompileMemoryToRegister(
  4223  			arm64.LDRD,
  4224  			tmpX, memoryInstanceBufferOffset,
  4225  			tmpY,
  4226  		)
  4227  		// "ce.memoryElement0Address = tmpY".
  4228  		c.assembler.CompileRegisterToMemory(
  4229  			arm64.STRD,
  4230  			tmpY,
  4231  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
  4232  		)
  4233  	}
  4234  
  4235  	// Update tableElement0Address, tableSliceLen and typeIDsElement0Address.
  4236  	//
  4237  	// Note: if there's a table instruction in the function, the existence of the table
  4238  	// is ensured by function validation at the module instantiation phase, and that's
  4239  	// why it is ok to skip the initialization if the module's table doesn't exist.
  4240  	if c.ir.HasTable {
  4241  		// "tmpX = &tables[0] (type **wasm.TableInstance)"
  4242  		c.assembler.CompileMemoryToRegister(
  4243  			arm64.LDRD,
  4244  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceTablesOffset,
  4245  			tmpX,
  4246  		)
  4247  
  4248  		// Update ce.tableElement0Address.
  4249  		// "ce.tableElement0Address = tmpX".
  4250  		c.assembler.CompileRegisterToMemory(
  4251  			arm64.STRD,
  4252  			tmpX,
  4253  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4254  		)
  4255  
  4256  		// Finally, we put &ModuleInstance.TypeIDs[0] into moduleContext.typeIDsElement0Address.
  4257  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4258  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceTypeIDsOffset, tmpX)
  4259  		c.assembler.CompileRegisterToMemory(arm64.STRD,
  4260  			tmpX, arm64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset)
  4261  	}
  4262  
  4263  	// Update callEngine.moduleContext.functionsElement0Address
  4264  	{
  4265  		// "tmpX = [moduleInstanceAddressRegister + moduleInstanceEngineOffset + interfaceDataOffset] (== *moduleEngine)"
  4266  		//
  4267  		// Go's interface is laid out in memory as two quad words, i.e. struct{tab, data uintptr},
  4268  		// where tab points to the interface table, and data points to the actual
  4269  		// implementation of the interface. In this case, we extract the "data" pointer as *moduleEngine.
  4270  		// See the following references for detail:
  4271  		// * https://research.swtch.com/interfaces
  4272  		// * https://github.com/golang/go/blob/release-branch.go1.20/src/runtime/runtime2.go#L207-L210
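        		// As a sketch (types assumed from the references above, not defined in this package):
        		//
        		//	type iface struct {
        		//		tab  uintptr // *itab
        		//		data uintptr // pointer to the concrete value, here *moduleEngine
        		//	}
        		//
        		// so moduleInstanceEngineOffset+interfaceDataOffset selects the second word.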
  4273  		c.assembler.CompileMemoryToRegister(
  4274  			arm64.LDRD,
  4275  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceEngineOffset+interfaceDataOffset,
  4276  			tmpX,
  4277  		)
  4278  
  4279  		// "tmpY = [tmpX + moduleEngineFunctionsOffset] (== &moduleEngine.functions[0])"
  4280  		c.assembler.CompileMemoryToRegister(
  4281  			arm64.LDRD,
  4282  			tmpX, moduleEngineFunctionsOffset,
  4283  			tmpY,
  4284  		)
  4285  
  4286  		// "callEngine.moduleContext.functionsElement0Address = tmpY".
  4287  		c.assembler.CompileRegisterToMemory(
  4288  			arm64.STRD,
  4289  			tmpY,
  4290  			arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  4291  		)
  4292  	}
  4293  
  4294  	// Update dataInstancesElement0Address.
  4295  	if c.ir.HasDataInstances {
  4296  		// "tmpX = &moduleInstance.DataInstances[0]"
  4297  		c.assembler.CompileMemoryToRegister(
  4298  			arm64.LDRD,
  4299  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceDataInstancesOffset,
  4300  			tmpX,
  4301  		)
  4302  		// "callEngine.moduleContext.dataInstancesElement0Address = tmpX".
  4303  		c.assembler.CompileRegisterToMemory(
  4304  			arm64.STRD,
  4305  			tmpX,
  4306  			arm64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  4307  		)
  4308  	}
  4309  
  4310  	// Update callEngine.moduleContext.elementInstancesElement0Address
  4311  	if c.ir.HasElementInstances {
  4312  		// "tmpX = &moduleInstance.ElementInstances[0]"
  4313  		c.assembler.CompileMemoryToRegister(
  4314  			arm64.LDRD,
  4315  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceElementInstancesOffset,
  4316  			tmpX,
  4317  		)
  4318  		// "callEngine.moduleContext.elementInstancesElement0Address = tmpX".
  4319  		c.assembler.CompileRegisterToMemory(
  4320  			arm64.STRD,
  4321  			tmpX,
  4322  			arm64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  4323  		)
  4324  	}
  4325  
  4326  	c.assembler.SetJumpTargetOnNext(brIfModuleUnchanged)
  4327  	c.markRegisterUnused(tmpX, tmpY)
  4328  	return nil
  4329  }