github.com/wasilibs/wazerox@v0.0.0-20240124024944-4923be63ab5f/internal/engine/compiler/impl_arm64.go

     1  // This file implements the compiler for arm64 target.
     2  // Please refer to https://developer.arm.com/documentation/102374/latest/
     3  // if unfamiliar with arm64 instructions and semantics.
     4  package compiler
     5  
     6  import (
     7  	"bytes"
     8  	"errors"
     9  	"fmt"
    10  	"math"
    11  
    12  	"github.com/wasilibs/wazerox/internal/asm"
    13  	"github.com/wasilibs/wazerox/internal/asm/arm64"
    14  	"github.com/wasilibs/wazerox/internal/wasm"
    15  	"github.com/wasilibs/wazerox/internal/wazeroir"
    16  )
    17  
    18  type arm64Compiler struct {
    19  	assembler arm64.Assembler
    20  	ir        *wazeroir.CompilationResult
     21  	// locationStack holds the state of the wazeroir virtual stack,
     22  	// where each item is either placed in a register or on the actual memory stack.
    23  	locationStack *runtimeValueLocationStack
    24  	// labels maps a label (e.g. ".L1_then") to *arm64LabelInfo.
    25  	labels [wazeroir.LabelKindNum][]arm64LabelInfo
    26  	// stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation.
    27  	stackPointerCeil uint64
     28  	// assignStackPointerCeilNeeded holds an asm.Node whose AssignSourceConstant must be called with the determined stack pointer ceiling.
    29  	assignStackPointerCeilNeeded asm.Node
    30  	compiledTrapTargets          [nativeCallStatusModuleClosed]asm.Node
    31  	withListener                 bool
    32  	typ                          *wasm.FunctionType
    33  	br                           *bytes.Reader
    34  	// locationStackForEntrypoint is the initial location stack for all functions. To reuse the allocated stack,
    35  	// we cache it here, and reset and set to .locationStack in the Init method.
    36  	locationStackForEntrypoint runtimeValueLocationStack
    37  	// frameIDMax tracks the maximum value of frame id per function.
    38  	frameIDMax int
    39  	brTableTmp []runtimeValueLocation
    40  }
    41  
    42  func newArm64Compiler() compiler {
    43  	return &arm64Compiler{
    44  		assembler:                  arm64.NewAssembler(arm64ReservedRegisterForTemporary),
    45  		locationStackForEntrypoint: newRuntimeValueLocationStack(),
    46  		br:                         bytes.NewReader(nil),
    47  	}
    48  }
    49  
    50  // Init implements compiler.Init.
    51  func (c *arm64Compiler) Init(typ *wasm.FunctionType, ir *wazeroir.CompilationResult, withListener bool) {
    52  	c.assembler.Reset()
    53  	c.locationStackForEntrypoint.reset()
    54  	c.resetLabels()
    55  
    56  	*c = arm64Compiler{
    57  		ir:                         ir,
    58  		withListener:               withListener,
    59  		typ:                        typ,
    60  		assembler:                  c.assembler,
    61  		labels:                     c.labels,
    62  		br:                         c.br,
    63  		brTableTmp:                 c.brTableTmp,
    64  		locationStackForEntrypoint: c.locationStackForEntrypoint,
    65  	}
    66  
    67  	// Reuses the initial location stack for the compilation of subsequent functions.
    68  	c.locationStack = &c.locationStackForEntrypoint
    69  }
    70  
     71  // resetLabels resets the existing content in arm64Compiler.labels so that
     72  // we can reuse the allocated slices and stacks in the subsequent compilations.
    73  func (c *arm64Compiler) resetLabels() {
    74  	for i := range c.labels {
    75  		for j := range c.labels[i] {
    76  			if j > c.frameIDMax {
     77  				// Only need to reset up to the maximum frame id. This makes the compilation faster for large binaries.
    78  				break
    79  			}
    80  			l := &c.labels[i][j]
    81  			l.initialInstruction = nil
    82  			l.stackInitialized = false
    83  			l.initialStack.reset()
    84  		}
    85  	}
    86  }
    87  
    88  var (
    89  	arm64UnreservedVectorRegisters = []asm.Register{
    90  		arm64.RegV0, arm64.RegV1, arm64.RegV2, arm64.RegV3,
    91  		arm64.RegV4, arm64.RegV5, arm64.RegV6, arm64.RegV7, arm64.RegV8,
    92  		arm64.RegV9, arm64.RegV10, arm64.RegV11, arm64.RegV12, arm64.RegV13,
    93  		arm64.RegV14, arm64.RegV15, arm64.RegV16, arm64.RegV17, arm64.RegV18,
    94  		arm64.RegV19, arm64.RegV20, arm64.RegV21, arm64.RegV22, arm64.RegV23,
    95  		arm64.RegV24, arm64.RegV25, arm64.RegV26, arm64.RegV27, arm64.RegV28,
    96  		arm64.RegV29, arm64.RegV30, arm64.RegV31,
    97  	}
    98  
    99  	// Note (see arm64 section in https://go.dev/doc/asm):
    100  	// * RegR18 is reserved as a platform register, and we don't use it in the Compiler.
    101  	// * RegR28 is reserved for the goroutine by the Go runtime, and we don't use it in the Compiler.
   102  	arm64UnreservedGeneralPurposeRegisters = []asm.Register{ //nolint
   103  		arm64.RegR3, arm64.RegR4, arm64.RegR5, arm64.RegR6, arm64.RegR7, arm64.RegR8,
   104  		arm64.RegR9, arm64.RegR10, arm64.RegR11, arm64.RegR12, arm64.RegR13,
   105  		arm64.RegR14, arm64.RegR15, arm64.RegR16, arm64.RegR17, arm64.RegR19,
   106  		arm64.RegR20, arm64.RegR21, arm64.RegR22, arm64.RegR23, arm64.RegR24,
   107  		arm64.RegR25, arm64.RegR26, arm64.RegR29, arm64.RegR30,
   108  	}
   109  )
   110  
   111  const (
   112  	// arm64ReservedRegisterForCallEngine holds the pointer to callEngine instance (i.e. *callEngine as uintptr)
   113  	arm64ReservedRegisterForCallEngine = arm64.RegR0
   114  	// arm64ReservedRegisterForStackBasePointerAddress holds stack base pointer's address (callEngine.stackBasePointer) in the current function call.
   115  	arm64ReservedRegisterForStackBasePointerAddress = arm64.RegR1
   116  	// arm64ReservedRegisterForMemory holds the pointer to the memory slice's data (i.e. &memory.Buffer[0] as uintptr).
   117  	arm64ReservedRegisterForMemory = arm64.RegR2
    118  	// arm64ReservedRegisterForTemporary is the temporary register which is available at any point of execution, but its contents must not be assumed to live beyond a single operation.
   119  	// Note: we choose R27 as that is the temporary register used in Go's assembler.
   120  	arm64ReservedRegisterForTemporary = arm64.RegR27
   121  )
   122  
   123  var arm64CallingConventionModuleInstanceAddressRegister = arm64.RegR29
   124  
   125  const (
    126  	// arm64CallEngineArchContextCompilerCallReturnAddressOffset is the offset of archContext.compilerCallReturnAddress in callEngine.
   127  	arm64CallEngineArchContextCompilerCallReturnAddressOffset = 144
    128  	// arm64CallEngineArchContextMinimum32BitSignedIntOffset is the offset of archContext.minimum32BitSignedInt in callEngine.
    129  	arm64CallEngineArchContextMinimum32BitSignedIntOffset = 152
    130  	// arm64CallEngineArchContextMinimum64BitSignedIntOffset is the offset of archContext.minimum64BitSignedInt in callEngine.
    131  	arm64CallEngineArchContextMinimum64BitSignedIntOffset = 160
   132  )
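
         // For reference, a sketch of the archContext fields that the offsets above point at
         // (a reading aid, not the authoritative definition; archContext is embedded in callEngine):
         //
         //	compilerCallReturnAddress uint64 // callEngine offset 144
         //	minimum32BitSignedInt     int32  // callEngine offset 152
         //	minimum64BitSignedInt     int64  // callEngine offset 160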
   133  
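         // isZeroRegister reports whether the given register is the zero register (RZR, aka XZR),
         // which always reads as zero and discards writes.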
   134  func isZeroRegister(r asm.Register) bool {
   135  	return r == arm64.RegRZR
   136  }
   137  
   138  // compileNOP implements compiler.compileNOP for the arm64 architecture.
   139  func (c *arm64Compiler) compileNOP() asm.Node {
   140  	return c.assembler.CompileStandAlone(arm64.NOP)
   141  }
   142  
   143  // compile implements compiler.compile for the arm64 architecture.
   144  func (c *arm64Compiler) compile(buf asm.Buffer) (stackPointerCeil uint64, err error) {
   145  	// c.stackPointerCeil tracks the stack pointer ceiling (max seen) value across all runtimeValueLocationStack(s)
   146  	// used for all labels (via setLocationStack), excluding the current one.
    147  	// Hence, we check here whether the final block's ceiling exceeds the current c.stackPointerCeil.
   148  	stackPointerCeil = c.stackPointerCeil
   149  	if stackPointerCeil < c.locationStack.stackPointerCeil {
   150  		stackPointerCeil = c.locationStack.stackPointerCeil
   151  	}
   152  
    153  	// Now that the stack pointer ceiling is determined, we invoke the callback.
   154  	// Note: this must be called before Assemble() below.
   155  	c.assignStackPointerCeil(stackPointerCeil)
   156  
   157  	err = c.assembler.Assemble(buf)
   158  	return
   159  }
   160  
    161  // arm64LabelInfo holds the wazeroir label-specific information in this function.
    162  type arm64LabelInfo struct {
    163  	// initialInstruction is the initial instruction for this label so that other blocks can branch into it.
   164  	initialInstruction asm.Node
   165  	// initialStack is the initial value location stack from which we start compiling this label.
   166  	initialStack     runtimeValueLocationStack
   167  	stackInitialized bool
   168  }
   169  
   170  // assignStackPointerCeil implements compilerImpl.assignStackPointerCeil for the arm64 architecture.
   171  func (c *arm64Compiler) assignStackPointerCeil(ceil uint64) {
   172  	if c.assignStackPointerCeilNeeded != nil {
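         		// Each value stack slot is 8 bytes, so shift the ceiling (in slots) left by 3 to convert it to bytes.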
   173  		c.assignStackPointerCeilNeeded.AssignSourceConstant(int64(ceil) << 3)
   174  	}
   175  }
   176  
   177  func (c *arm64Compiler) label(label wazeroir.Label) *arm64LabelInfo {
   178  	kind := label.Kind()
   179  	frames := c.labels[kind]
   180  	frameID := label.FrameID()
   181  	if c.frameIDMax < frameID {
   182  		c.frameIDMax = frameID
   183  	}
    184  	// If the frameID is not allocated yet, expand the slice up to the frameID,
    185  	// so that we can reduce the allocations in the subsequent compilations.
   186  	if diff := frameID - len(frames) + 1; diff > 0 {
   187  		for i := 0; i < diff; i++ {
   188  			frames = append(frames, arm64LabelInfo{initialStack: newRuntimeValueLocationStack()})
   189  		}
   190  		c.labels[kind] = frames
   191  	}
   192  	return &frames[frameID]
   193  }
   194  
    195  // runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the arm64 architecture.
   196  func (c *arm64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
   197  	return c.locationStack
   198  }
   199  
   200  // pushRuntimeValueLocationOnRegister implements compiler.pushRuntimeValueLocationOnRegister for arm64.
   201  func (c *arm64Compiler) pushRuntimeValueLocationOnRegister(reg asm.Register, vt runtimeValueType) (ret *runtimeValueLocation) {
   202  	ret = c.locationStack.pushRuntimeValueLocationOnRegister(reg, vt)
   203  	c.markRegisterUsed(reg)
   204  	return
   205  }
   206  
   207  // pushVectorRuntimeValueLocationOnRegister implements compiler.pushVectorRuntimeValueLocationOnRegister for arm64.
   208  func (c *arm64Compiler) pushVectorRuntimeValueLocationOnRegister(reg asm.Register) (lowerBitsLocation *runtimeValueLocation) {
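         	// A v128 value occupies two 64-bit stack slots (lower and upper halves) backed by the same vector register.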
   209  	lowerBitsLocation = c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Lo)
   210  	c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Hi)
   211  	c.markRegisterUsed(reg)
   212  	return
   213  }
   214  
   215  func (c *arm64Compiler) markRegisterUsed(regs ...asm.Register) {
   216  	for _, reg := range regs {
   217  		if !isZeroRegister(reg) && reg != asm.NilRegister {
   218  			c.locationStack.markRegisterUsed(reg)
   219  		}
   220  	}
   221  }
   222  
   223  func (c *arm64Compiler) markRegisterUnused(regs ...asm.Register) {
   224  	for _, reg := range regs {
   225  		if !isZeroRegister(reg) && reg != asm.NilRegister {
   226  			c.locationStack.markRegisterUnused(reg)
   227  		}
   228  	}
   229  }
   230  
   231  func (c *arm64Compiler) String() (ret string) { return c.locationStack.String() }
   232  
   233  // compilePreamble implements compiler.compilePreamble for the arm64 architecture.
   234  func (c *arm64Compiler) compilePreamble() error {
   235  	c.markRegisterUsed(arm64CallingConventionModuleInstanceAddressRegister)
   236  	defer c.markRegisterUnused(arm64CallingConventionModuleInstanceAddressRegister)
   237  
   238  	c.locationStack.init(c.typ)
   239  
   240  	// Check if it's necessary to grow the value stack before entering function body.
   241  	if err := c.compileMaybeGrowStack(); err != nil {
   242  		return err
   243  	}
   244  
   245  	if err := c.compileModuleContextInitialization(); err != nil {
   246  		return err
   247  	}
   248  
   249  	if c.withListener {
   250  		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexFunctionListenerBefore); err != nil {
   251  			return err
   252  		}
   253  	}
   254  
   255  	// We must initialize the stack base pointer register so that we can manipulate the stack properly.
   256  	c.compileReservedStackBasePointerRegisterInitialization()
   257  
   258  	c.compileReservedMemoryRegisterInitialization()
   259  
   260  	return nil
   261  }
   262  
    263  // compileMaybeGrowStack adds instructions to check whether the value stack needs to grow,
    264  // and if so, makes the builtin function call to do so. These instructions are emitted in the
    265  // function's preamble.
   266  func (c *arm64Compiler) compileMaybeGrowStack() error {
   267  	tmpX, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
   268  	if !found {
   269  		panic("BUG: all the registers should be free at this point")
   270  	}
   271  	c.markRegisterUsed(tmpX)
   272  	tmpY, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
   273  	if !found {
   274  		panic("BUG: all the registers should be free at this point")
   275  	}
   276  	c.markRegisterUsed(tmpY)
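
         	// The code below emits, roughly, the native equivalent of:
         	//
         	//	if stackPointerCeil > len(ce.stack)-ce.stackBasePointer {
         	//		/* call the builtin stack-grow function */
         	//	}
         	//
         	// where stackPointerCeil is a placeholder patched in later via assignStackPointerCeil.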
   277  
   278  	// "tmpX = len(ce.stack)"
   279  	c.assembler.CompileMemoryToRegister(
   280  		arm64.LDRD,
   281  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackLenInBytesOffset,
   282  		tmpX,
   283  	)
   284  
   285  	// "tmpY = ce.stackBasePointer"
   286  	c.assembler.CompileMemoryToRegister(
   287  		arm64.LDRD,
   288  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
   289  		tmpY,
   290  	)
   291  
   292  	// "tmpX = tmpX - tmpY", in other words "tmpX = len(ce.stack) - ce.stackBasePointer"
   293  	c.assembler.CompileRegisterToRegister(
   294  		arm64.SUB,
   295  		tmpY,
   296  		tmpX,
   297  	)
   298  
   299  	// "tmpY = stackPointerCeil"
   300  	loadStackPointerCeil := c.assembler.CompileConstToRegister(
   301  		arm64.MOVD,
   302  		math.MaxInt32,
   303  		tmpY,
   304  	)
    305  	// At this point of compilation, we don't know the value of the stack pointer ceiling,
    306  	// so we lazily resolve the value later.
   307  	c.assignStackPointerCeilNeeded = loadStackPointerCeil
   308  
    309  	// Compare tmpX (len(ce.stack) - ce.stackBasePointer) and tmpY (stackPointerCeil).
   310  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmpX, tmpY)
   311  
    312  	// If ceil > len(ce.stack) - ce.stackBasePointer, we need to grow the stack by calling the builtin Go function.
   313  	brIfStackOK := c.assembler.CompileJump(arm64.BCONDLS)
   314  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexGrowStack); err != nil {
   315  		return err
   316  	}
   317  
   318  	// Otherwise, skip calling it.
   319  	c.assembler.SetJumpTargetOnNext(brIfStackOK)
   320  
   321  	c.markRegisterUnused(tmpX, tmpY)
   322  	return nil
   323  }
   324  
    325  // compileReturnFunction emits instructions to return from the current function frame.
   326  // If the current frame is the bottom, the code goes back to the Go code with nativeCallStatusCodeReturned status.
   327  // Otherwise, we branch into the caller's return address.
   328  func (c *arm64Compiler) compileReturnFunction() error {
    329  	// Release all the registers as our calling convention is caller-save.
   330  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
   331  		return err
   332  	}
   333  
   334  	if c.withListener {
   335  		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexFunctionListenerAfter); err != nil {
   336  			return err
   337  		}
   338  		// After return, we re-initialize the stack base pointer as that is used to return to the caller below.
   339  		c.compileReservedStackBasePointerRegisterInitialization()
   340  	}
   341  
    342  	// arm64CallingConventionModuleInstanceAddressRegister holds the module instance's address,
    343  	// so mark it as used so that it won't be chosen as a free register.
   344  	c.locationStack.markRegisterUsed(arm64CallingConventionModuleInstanceAddressRegister)
   345  	defer c.locationStack.markRegisterUnused(arm64CallingConventionModuleInstanceAddressRegister)
   346  
   347  	returnAddress, callerStackBasePointerInBytes, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
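         	// (See the stack layout diagram in compileCallImpl for where these three call frame locations live.)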
   348  
    349  	// If the return address is zero, we are returning from the entire execution (i.e. this is the bottom frame).
   350  	returnAddress.setRegister(arm64ReservedRegisterForTemporary)
   351  	c.compileLoadValueOnStackToRegister(returnAddress)
   352  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, arm64.RegRZR)
   353  
    354  	// If the address doesn't equal zero, skip the exit and branch into it (the caller's return address) below.
    355  	// Otherwise, this is the bottom frame, so exit back to the Go code with the "returned" status.
   356  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusCodeReturned)
   357  
   358  	// Alias for readability.
   359  	tmp := arm64CallingConventionModuleInstanceAddressRegister
   360  
   361  	// First, restore the stackContext.stackBasePointerInBytesOffset from callerStackBasePointerInBytes.
   362  	callerStackBasePointerInBytes.setRegister(tmp)
   363  	c.compileLoadValueOnStackToRegister(callerStackBasePointerInBytes)
   364  	c.assembler.CompileRegisterToMemory(arm64.STRD, tmp,
   365  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
   366  
   367  	// Next, restore moduleContext.fn from callerFunction.
   368  	callerFunction.setRegister(tmp)
   369  	c.compileLoadValueOnStackToRegister(callerFunction)
   370  	c.assembler.CompileRegisterToMemory(arm64.STRD, tmp,
   371  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
   372  
   373  	// Also, we have to put the target function's *wasm.ModuleInstance into arm64CallingConventionModuleInstanceAddressRegister.
   374  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
   375  		tmp, functionModuleInstanceOffset,
   376  		arm64CallingConventionModuleInstanceAddressRegister)
   377  
   378  	c.assembler.CompileJumpToRegister(arm64.B, returnAddress.register)
   379  	return nil
   380  }
   381  
   382  func (c *arm64Compiler) compileMaybeExitFromNativeCode(skipCondition asm.Instruction, status nativeCallStatusCode) {
   383  	skip := c.assembler.CompileJump(skipCondition)
   384  	c.compileExitFromNativeCode(status)
   385  	c.assembler.SetJumpTargetOnNext(skip)
   386  }
   387  
   388  // compileExitFromNativeCode adds instructions to give the control back to ce.exec with the given status code.
   389  func (c *arm64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) {
   390  	if target := c.compiledTrapTargets[status]; target != nil {
   391  		c.assembler.CompileJump(arm64.B).AssignJumpTarget(target)
   392  		return
   393  	}
   394  
   395  	switch status {
   396  	case nativeCallStatusCodeReturned:
   397  		// Save the target for reuse.
   398  		c.compiledTrapTargets[status] = c.compileNOP()
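         		// Later exits with the same status will jump back to this NOP via the compiledTrapTargets
         		// lookup at the top of this function, so the exit sequence below is emitted only once.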
   399  	case nativeCallStatusCodeCallGoHostFunction, nativeCallStatusCodeCallBuiltInFunction:
   400  		// Read the return address, and write it to callEngine.exitContext.returnAddress.
   401  		c.assembler.CompileReadInstructionAddress(arm64ReservedRegisterForTemporary, arm64.RET)
   402  		c.assembler.CompileRegisterToMemory(
   403  			arm64.STRD, arm64ReservedRegisterForTemporary,
   404  			arm64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset,
   405  		)
   406  	default:
   407  		if c.ir.IROperationSourceOffsetsInWasmBinary != nil {
    408  			// In this case, the execution traps, and we want the top frame's source position in the stack trace.
   409  			// We store the instruction address onto callEngine.returnAddress.
   410  			c.assembler.CompileReadInstructionAddress(arm64ReservedRegisterForTemporary, arm64.STRD)
   411  			c.assembler.CompileRegisterToMemory(
   412  				arm64.STRD, arm64ReservedRegisterForTemporary,
   413  				arm64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset,
   414  			)
   415  		} else {
   416  			// We won't use the source position, so just save the target for reuse.
   417  			c.compiledTrapTargets[status] = c.compileNOP()
   418  		}
   419  	}
   420  
    421  	// Write the current stack pointer to ce.stackPointer.
   422  	c.assembler.CompileConstToRegister(arm64.MOVD, int64(c.locationStack.sp), arm64ReservedRegisterForTemporary)
   423  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64ReservedRegisterForTemporary, arm64ReservedRegisterForCallEngine,
   424  		callEngineStackContextStackPointerOffset)
   425  
   426  	// Write the status to callEngine.exitContext.statusCode.
   427  	if status != 0 {
   428  		c.assembler.CompileConstToRegister(arm64.MOVW, int64(status), arm64ReservedRegisterForTemporary)
   429  		c.assembler.CompileRegisterToMemory(arm64.STRW, arm64ReservedRegisterForTemporary,
   430  			arm64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
   431  	} else {
    432  		// If status == 0, we use the zero register to store zero.
   433  		c.assembler.CompileRegisterToMemory(arm64.STRW, arm64.RegRZR,
   434  			arm64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
   435  	}
   436  
    437  	// The return address to the Go code is stored in archContext.compilerCallReturnAddress which
    438  	// is embedded in ce. We load the value into the temporary register, and then
    439  	// invoke RET with that register.
   440  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
   441  		arm64ReservedRegisterForCallEngine, arm64CallEngineArchContextCompilerCallReturnAddressOffset,
   442  		arm64ReservedRegisterForTemporary)
   443  
   444  	c.assembler.CompileJumpToRegister(arm64.RET, arm64ReservedRegisterForTemporary)
   445  }
   446  
    447  // compileGoDefinedHostFunction implements compiler.compileGoDefinedHostFunction for the arm64 architecture.
   448  func (c *arm64Compiler) compileGoDefinedHostFunction() error {
   449  	// First we must update the location stack to reflect the number of host function inputs.
   450  	c.locationStack.init(c.typ)
   451  
   452  	if c.withListener {
   453  		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction,
   454  			builtinFunctionIndexFunctionListenerBefore); err != nil {
   455  			return err
   456  		}
   457  	}
   458  
    459  	// The host function needs access to the caller's module instance, and the caller's information is stored in the stack
    460  	// (as described in the doc of callEngine.stack). Here, we get the caller's *function from the stack,
    461  	// and save its *wasm.ModuleInstance in callEngine.exitContext.callerModuleInstance so we can pass it to the host function
    462  	// without sacrificing the performance.
   463  	c.compileReservedStackBasePointerRegisterInitialization()
   464  	// Alias for readability.
   465  	tmp := arm64CallingConventionModuleInstanceAddressRegister
   466  	// Get the location of the callerFunction (*function) in the stack, which depends on the signature.
   467  	_, _, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
   468  	// Load the value into the tmp register: tmp = &function{..}
   469  	callerFunction.setRegister(tmp)
   470  	c.compileLoadValueOnStackToRegister(callerFunction)
   471  	// tmp = *(tmp+functionModuleInstanceOffset) = &wasm.ModuleInstance{...}
   472  	c.assembler.CompileMemoryToRegister(arm64.LDRD, tmp, functionModuleInstanceOffset, tmp)
   473  	// Load it onto callEngine.exitContext.callerModuleInstance.
   474  	c.assembler.CompileRegisterToMemory(arm64.STRD,
   475  		tmp,
   476  		arm64ReservedRegisterForCallEngine, callEngineExitContextCallerModuleInstanceOffset)
   477  	// Reset the state of callerFunction value location so that we won't mess up subsequent code generation below.
   478  	c.locationStack.releaseRegister(callerFunction)
   479  
   480  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallGoHostFunction, 0); err != nil {
   481  		return err
   482  	}
   483  
    484  	// Initialize the reserved stack base pointer register, which is used to retrieve the call frame stack.
   485  	c.compileReservedStackBasePointerRegisterInitialization()
   486  
    487  	// A Go function can change the module state in arbitrary ways, so we have to force
   488  	// the callEngine.moduleContext initialization on the function return. To do so,
   489  	// we zero-out callEngine.moduleInstance.
   490  	c.assembler.CompileRegisterToMemory(arm64.STRD,
   491  		arm64.RegRZR,
   492  		arm64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)
   493  
   494  	return c.compileReturnFunction()
   495  }
   496  
    497  // setLocationStack sets the given runtimeValueLocationStack to the .locationStack field,
    498  // while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
    499  // This is called when we branch into a different block.
   500  func (c *arm64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
   501  	if c.stackPointerCeil < c.locationStack.stackPointerCeil {
   502  		c.stackPointerCeil = c.locationStack.stackPointerCeil
   503  	}
   504  	c.locationStack = newStack
   505  }
   506  
   507  // compileBuiltinFunctionCheckExitCode implements compiler.compileBuiltinFunctionCheckExitCode for the arm64 architecture.
   508  func (c *arm64Compiler) compileBuiltinFunctionCheckExitCode() error {
   509  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexCheckExitCode); err != nil {
   510  		return err
   511  	}
   512  
   513  	// After return, we re-initialize reserved registers just like preamble of functions.
   514  	c.compileReservedStackBasePointerRegisterInitialization()
   515  	c.compileReservedMemoryRegisterInitialization()
   516  	return nil
   517  }
   518  
   519  // compileLabel implements compiler.compileLabel for the arm64 architecture.
   520  func (c *arm64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipThisLabel bool) {
   521  	labelKey := wazeroir.Label(o.U1)
   522  	labelInfo := c.label(labelKey)
   523  
   524  	// If initialStack is not set, that means this label has never been reached.
   525  	if !labelInfo.stackInitialized {
   526  		skipThisLabel = true
   527  		return
   528  	}
   529  
   530  	if labelBegin := labelInfo.initialInstruction; labelBegin == nil {
    531  		// We use NOP as the beginning of instructions in a label.
    532  		// This should eventually be optimized out by the assembler.
   533  		labelInfo.initialInstruction = c.assembler.CompileStandAlone(arm64.NOP)
   534  	} else {
   535  		c.assembler.Add(labelBegin)
   536  	}
   537  
   538  	// Set the initial stack.
   539  	c.setLocationStack(&labelInfo.initialStack)
   540  	return false
   541  }
   542  
   543  // compileUnreachable implements compiler.compileUnreachable for the arm64 architecture.
   544  func (c *arm64Compiler) compileUnreachable() error {
   545  	c.compileExitFromNativeCode(nativeCallStatusCodeUnreachable)
   546  	return nil
   547  }
   548  
   549  // compileSet implements compiler.compileSet for the arm64 architecture.
   550  func (c *arm64Compiler) compileSet(o *wazeroir.UnionOperation) error {
   551  	depth := int(o.U1)
   552  	isTargetVector := o.B3
   553  
   554  	setTargetIndex := int(c.locationStack.sp) - 1 - depth
   555  
   556  	if isTargetVector {
   557  		_ = c.locationStack.pop()
   558  	}
   559  	v := c.locationStack.pop()
   560  	if err := c.compileEnsureOnRegister(v); err != nil {
   561  		return err
   562  	}
   563  
   564  	targetLocation := &c.locationStack.stack[setTargetIndex]
   565  	if targetLocation.onRegister() {
   566  		// We no longer need the register previously used by the target location.
   567  		c.markRegisterUnused(targetLocation.register)
   568  	}
   569  
   570  	reg := v.register
   571  	targetLocation.setRegister(reg)
   572  	targetLocation.valueType = v.valueType
   573  	if isTargetVector {
   574  		hi := &c.locationStack.stack[setTargetIndex+1]
   575  		hi.setRegister(reg)
   576  	}
   577  	return nil
   578  }
   579  
   580  // compileGlobalGet implements compiler.compileGlobalGet for the arm64 architecture.
   581  func (c *arm64Compiler) compileGlobalGet(o *wazeroir.UnionOperation) error {
   582  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   583  		return err
   584  	}
   585  
   586  	index := uint32(o.U1)
   587  
   588  	wasmValueType := c.ir.Globals[index].ValType
   589  	isV128 := wasmValueType == wasm.ValueTypeV128
   590  	// Get the address of globals[index] into globalAddressReg.
   591  	globalAddressReg, err := c.compileReadGlobalAddress(index)
   592  	if err != nil {
   593  		return err
   594  	}
   595  
   596  	if isV128 {
   597  		resultReg, err := c.allocateRegister(registerTypeVector)
   598  		if err != nil {
   599  			return err
   600  		}
   601  		c.assembler.CompileConstToRegister(arm64.ADD, globalInstanceValueOffset, globalAddressReg)
   602  		c.assembler.CompileMemoryToVectorRegister(arm64.VMOV, globalAddressReg, 0,
   603  			resultReg, arm64.VectorArrangementQ)
   604  
   605  		c.pushVectorRuntimeValueLocationOnRegister(resultReg)
   606  	} else {
   607  		ldr := arm64.NOP
   608  		var result asm.Register
   609  		var vt runtimeValueType
   610  		switch wasmValueType {
   611  		case wasm.ValueTypeI32:
   612  			ldr = arm64.LDRW
   613  			vt = runtimeValueTypeI32
   614  			result = globalAddressReg
   615  		case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
   616  			ldr = arm64.LDRD
   617  			vt = runtimeValueTypeI64
   618  			result = globalAddressReg
   619  		case wasm.ValueTypeF32:
   620  			result, err = c.allocateRegister(registerTypeVector)
   621  			if err != nil {
   622  				return err
   623  			}
   624  			ldr = arm64.FLDRS
   625  			vt = runtimeValueTypeF32
   626  		case wasm.ValueTypeF64:
   627  			result, err = c.allocateRegister(registerTypeVector)
   628  			if err != nil {
   629  				return err
   630  			}
   631  			ldr = arm64.FLDRD
   632  			vt = runtimeValueTypeF64
   633  		}
   634  
   635  		// "result = [globalAddressReg + globalInstanceValueOffset] (== globals[index].Val)"
   636  		c.assembler.CompileMemoryToRegister(
   637  			ldr,
   638  			globalAddressReg, globalInstanceValueOffset,
   639  			result,
   640  		)
   641  
   642  		c.pushRuntimeValueLocationOnRegister(result, vt)
   643  	}
   644  	return nil
   645  }
   646  
   647  // compileGlobalSet implements compiler.compileGlobalSet for the arm64 architecture.
   648  func (c *arm64Compiler) compileGlobalSet(o *wazeroir.UnionOperation) error {
   649  	index := uint32(o.U1)
   650  
   651  	wasmValueType := c.ir.Globals[index].ValType
   652  	isV128 := wasmValueType == wasm.ValueTypeV128
   653  
   654  	var val *runtimeValueLocation
   655  	if isV128 {
   656  		val = c.locationStack.popV128()
   657  	} else {
   658  		val = c.locationStack.pop()
   659  	}
   660  	if err := c.compileEnsureOnRegister(val); err != nil {
   661  		return err
   662  	}
   663  
   664  	globalInstanceAddressRegister, err := c.compileReadGlobalAddress(index)
   665  	if err != nil {
   666  		return err
   667  	}
   668  
   669  	if isV128 {
   670  		c.assembler.CompileVectorRegisterToMemory(arm64.VMOV,
   671  			val.register, globalInstanceAddressRegister, globalInstanceValueOffset,
   672  			arm64.VectorArrangementQ)
   673  	} else {
   674  		var str asm.Instruction
   675  		switch c.ir.Globals[index].ValType {
   676  		case wasm.ValueTypeI32:
   677  			str = arm64.STRW
   678  		case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
   679  			str = arm64.STRD
   680  		case wasm.ValueTypeF32:
   681  			str = arm64.FSTRS
   682  		case wasm.ValueTypeF64:
   683  			str = arm64.FSTRD
   684  		}
   685  
   686  		// At this point "globalInstanceAddressRegister = globals[index]".
   687  		// Therefore, this means "globals[index].Val = val.register"
   688  		c.assembler.CompileRegisterToMemory(
   689  			str,
   690  			val.register,
   691  			globalInstanceAddressRegister, globalInstanceValueOffset,
   692  		)
   693  	}
   694  
   695  	c.markRegisterUnused(val.register)
   696  	return nil
   697  }
   698  
    699  // compileReadGlobalAddress adds instructions to store the absolute address of the global instance at globalIndex into a register.
   700  func (c *arm64Compiler) compileReadGlobalAddress(globalIndex uint32) (destinationRegister asm.Register, err error) {
    701  	// TODO: rethink the type used to store globals (`globals []*GlobalInstance`).
    702  	// If we used `[]GlobalInstance` instead, we could save one load instruction here.
   703  
   704  	destinationRegister, err = c.allocateRegister(registerTypeGeneralPurpose)
   705  	if err != nil {
   706  		return
   707  	}
   708  
   709  	// "destinationRegister = globalIndex * 8"
   710  	c.assembler.CompileConstToRegister(
   711  		// globalIndex is an index to []*GlobalInstance, therefore
   712  		// we have to multiply it by the size of *GlobalInstance == the pointer size == 8.
   713  		arm64.MOVD, int64(globalIndex)*8, destinationRegister,
   714  	)
   715  
   716  	// "arm64ReservedRegisterForTemporary = &globals[0]"
   717  	c.assembler.CompileMemoryToRegister(
   718  		arm64.LDRD,
   719  		arm64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset,
   720  		arm64ReservedRegisterForTemporary,
   721  	)
   722  
   723  	// "destinationRegister = [arm64ReservedRegisterForTemporary + destinationRegister] (== globals[globalIndex])".
   724  	c.assembler.CompileMemoryWithRegisterOffsetToRegister(
   725  		arm64.LDRD,
   726  		arm64ReservedRegisterForTemporary, destinationRegister,
   727  		destinationRegister,
   728  	)
   729  	return
   730  }
   731  
   732  // compileBr implements compiler.compileBr for the arm64 architecture.
   733  func (c *arm64Compiler) compileBr(o *wazeroir.UnionOperation) error {
   734  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   735  		return err
   736  	}
   737  	return c.compileBranchInto(wazeroir.Label(o.U1))
   738  }
   739  
   740  // compileBrIf implements compiler.compileBrIf for the arm64 architecture.
   741  func (c *arm64Compiler) compileBrIf(o *wazeroir.UnionOperation) error {
   742  	cond := c.locationStack.pop()
   743  
   744  	var conditionalBR asm.Node
   745  	if cond.onConditionalRegister() {
   746  		// If the cond is on a conditional register, it corresponds to one of "conditional codes"
   747  		// https://developer.arm.com/documentation/dui0801/a/Condition-Codes/Condition-code-suffixes
   748  		// Here we represent the conditional codes by using arm64.COND_** registers, and that means the
   749  		// conditional jump can be performed if we use arm64.B**.
   750  		// For example, if we have arm64.CondEQ on cond, that means we performed compileEq right before
   751  		// this compileBrIf and BrIf can be achieved by arm64.BCONDEQ.
   752  		var brInst asm.Instruction
   753  		switch cond.conditionalRegister {
   754  		case arm64.CondEQ:
   755  			brInst = arm64.BCONDEQ
   756  		case arm64.CondNE:
   757  			brInst = arm64.BCONDNE
   758  		case arm64.CondHS:
   759  			brInst = arm64.BCONDHS
   760  		case arm64.CondLO:
   761  			brInst = arm64.BCONDLO
   762  		case arm64.CondMI:
   763  			brInst = arm64.BCONDMI
   764  		case arm64.CondHI:
   765  			brInst = arm64.BCONDHI
   766  		case arm64.CondLS:
   767  			brInst = arm64.BCONDLS
   768  		case arm64.CondGE:
   769  			brInst = arm64.BCONDGE
   770  		case arm64.CondLT:
   771  			brInst = arm64.BCONDLT
   772  		case arm64.CondGT:
   773  			brInst = arm64.BCONDGT
   774  		case arm64.CondLE:
   775  			brInst = arm64.BCONDLE
   776  		default:
    777  			// BUG: This means that cond.conditionalRegister was set somewhere in this file
    778  			// to a value that is not covered by the switch above. That shouldn't happen.
   779  			return fmt.Errorf("unsupported condition for br_if: %v", cond.conditionalRegister)
   780  		}
   781  		conditionalBR = c.assembler.CompileJump(brInst)
   782  	} else {
   783  		// If the value is not on the conditional register, we compare the value with the zero register,
   784  		// and then do the conditional BR if the value doesn't equal zero.
   785  		if err := c.compileEnsureOnRegister(cond); err != nil {
   786  			return err
   787  		}
   788  		// Compare the value with zero register. Note that the value is ensured to be i32 by function validation phase,
   789  		// so we use CMPW (32-bit compare) here.
   790  		c.assembler.CompileTwoRegistersToNone(arm64.CMPW, cond.register, arm64.RegRZR)
   791  
   792  		conditionalBR = c.assembler.CompileJump(arm64.BCONDNE)
   793  
   794  		c.markRegisterUnused(cond.register)
   795  	}
   796  
   797  	// Emit the code for branching into else branch.
   798  	elseTarget := wazeroir.Label(o.U2)
   799  	if err := c.compileBranchInto(elseTarget); err != nil {
   800  		return err
   801  	}
   802  	// We branch into here from the original conditional BR (conditionalBR).
   803  	c.assembler.SetJumpTargetOnNext(conditionalBR)
   804  	thenTarget := wazeroir.Label(o.U1)
   805  	if err := compileDropRange(c, o.U3); err != nil {
   806  		return err
   807  	}
   808  	return c.compileBranchInto(thenTarget)
   809  }
   810  
   811  func (c *arm64Compiler) compileBranchInto(target wazeroir.Label) error {
   812  	if target.IsReturnTarget() {
   813  		return c.compileReturnFunction()
   814  	} else {
   815  		if c.ir.LabelCallers[target] > 1 {
    816  			// We can only reuse the register state when there's a single call-site.
    817  			// If there are multiple call-sites, release the existing values on registers to the stack
    818  			// so that the value location state is consistent at the beginning of the label.
   819  			if err := c.compileReleaseAllRegistersToStack(); err != nil {
   820  				return err
   821  			}
   822  		}
   823  		// Set the initial stack of the target label, so we can start compiling the label
    824  		// with the appropriate value locations. Note we clone the stack here as we may
    825  		// manipulate the stack before the compiler reaches the label.
   826  		targetLabel := c.label(target)
   827  		if !targetLabel.stackInitialized {
   828  			targetLabel.initialStack.cloneFrom(*c.locationStack)
   829  			targetLabel.stackInitialized = true
   830  		}
   831  
   832  		br := c.assembler.CompileJump(arm64.B)
   833  		c.assignBranchTarget(target, br)
   834  		return nil
   835  	}
   836  }
   837  
   838  // assignBranchTarget assigns the given label's initial instruction to the destination of br.
   839  func (c *arm64Compiler) assignBranchTarget(label wazeroir.Label, br asm.Node) {
   840  	target := c.label(label)
   841  
   842  	targetInst := target.initialInstruction
   843  	if targetInst == nil {
   844  		// If the label isn't compiled yet, allocate the NOP node, and set as the initial instruction.
   845  		targetInst = c.assembler.AllocateNOP()
   846  		target.initialInstruction = targetInst
   847  	}
   848  
   849  	br.AssignJumpTarget(targetInst)
   850  }
   851  
   852  // compileBrTable implements compiler.compileBrTable for the arm64 architecture.
   853  func (c *arm64Compiler) compileBrTable(o *wazeroir.UnionOperation) error {
   854  	// If the operation only consists of the default target, we branch into it and return early.
   855  	if len(o.Us) == 2 {
   856  		loc := c.locationStack.pop()
   857  		if loc.onRegister() {
   858  			c.markRegisterUnused(loc.register)
   859  		}
   860  		if err := compileDropRange(c, o.Us[1]); err != nil {
   861  			return err
   862  		}
   863  		return c.compileBranchInto(wazeroir.Label(o.Us[0]))
   864  	}
   865  
   866  	index := c.locationStack.pop()
   867  	if err := c.compileEnsureOnRegister(index); err != nil {
   868  		return err
   869  	}
   870  
   871  	if isZeroRegister(index.register) {
   872  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
   873  		if err != nil {
   874  			return err
   875  		}
   876  		index.setRegister(reg)
   877  		c.markRegisterUsed(reg)
   878  
   879  		// Zero the value on a picked register.
   880  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, reg)
   881  	}
   882  
   883  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
   884  	if err != nil {
   885  		return err
   886  	}
   887  
    888  	// Load the number of non-default targets.
    889  	// "tmpReg = len(o.Us)/2 - 1"
   890  	c.assembler.CompileConstToRegister(arm64.MOVW, int64(len(o.Us)/2-1), tmpReg)
    891  	// Compare the index with the number of non-default targets.
   892  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmpReg, index.register)
    893  	// If the index is in range, skip the next instruction; otherwise, clamp it to the default target's index.
   894  	brDefaultIndex := c.assembler.CompileJump(arm64.BCONDLO)
   895  	c.assembler.CompileRegisterToRegister(arm64.MOVW, tmpReg, index.register)
   896  	c.assembler.SetJumpTargetOnNext(brDefaultIndex)
   897  
   898  	// We prepare the asm.StaticConst which holds the offset of
   899  	// each target's first instruction (incl. default)
   900  	// relative to the beginning of label tables.
   901  	//
   902  	// For example, if we have targets=[L0, L1] and default=L_DEFAULT,
   903  	// we emit the code like this at [Emit the code for each target and default branch] below.
   904  	//
   905  	// L0:
   906  	//  0x123001: XXXX, ...
   907  	//  .....
   908  	// L1:
   909  	//  0x123005: YYY, ...
   910  	//  .....
   911  	// L_DEFAULT:
   912  	//  0x123009: ZZZ, ...
   913  	//
    914  	// then offsetData becomes like [0x0, 0x4, 0x8].
   915  	// By using this offset list, we could jump into the label for the index by
   916  	// "jmp offsetData[index]+0x123001" and "0x123001" can be acquired by ADR instruction.
   917  	//
   918  	// Note: We store each offset of 32-bit unsigned integer as 4 consecutive bytes. So more precisely,
    919  	// the above example's offsetData would be [0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0].
   920  	//
   921  	// Note: this is similar to how GCC implements Switch statements in C.
   922  	offsetData := asm.NewStaticConst(make([]byte, 4*(len(o.Us)/2)))
   923  
   924  	// "tmpReg = &offsetData[0]"
   925  	c.assembler.CompileStaticConstToRegister(arm64.ADR, offsetData, tmpReg)
   926  
    927  	// "index.register = tmpReg + (index.register << 2) (== &offsetData[index])"
   928  	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD, index.register, 2, tmpReg, index.register)
   929  
    930  	// "index.register = *index.register (== offsetData[index])"
   931  	c.assembler.CompileMemoryToRegister(arm64.LDRW, index.register, 0, index.register)
   932  
   933  	// Now we read the address of the beginning of the jump table.
   934  	// In the above example, this corresponds to reading the address of 0x123001.
   935  	c.assembler.CompileReadInstructionAddress(tmpReg, arm64.B)
   936  
   937  	// Now we have the address of L0 in tmp register, and the offset to the target label in the index.register.
   938  	// So we could achieve the br_table jump by adding them and jump into the resulting address.
   939  	c.assembler.CompileRegisterToRegister(arm64.ADD, tmpReg, index.register)
   940  
   941  	c.assembler.CompileJumpToRegister(arm64.B, index.register)
   942  
   943  	// We no longer need the index's register, so mark it unused.
   944  	c.markRegisterUnused(index.register)
   945  
    946  	// [Emit the code for each target and default branch]
   947  	labelInitialInstructions := make([]asm.Node, len(o.Us)/2)
   948  
   949  	// Since we might end up having the different stack state in each branch,
   950  	// we need to save the initial stack state here, and use the same initial state
   951  	// for each iteration.
   952  	initialLocationStack := c.getSavedTemporaryLocationStack()
   953  
   954  	for i := range labelInitialInstructions {
   955  		// Emit the initial instruction of each target where
   956  		// we use NOP as we don't yet know the next instruction in each label.
   957  		init := c.assembler.CompileStandAlone(arm64.NOP)
   958  		labelInitialInstructions[i] = init
   959  
   960  		targetLabel := wazeroir.Label(o.Us[i*2])
   961  		targetToDrop := o.Us[i*2+1]
   962  		if err = compileDropRange(c, targetToDrop); err != nil {
   963  			return err
   964  		}
   965  		if err = c.compileBranchInto(targetLabel); err != nil {
   966  			return err
   967  		}
   968  		// After the iteration, reset the stack's state with initialLocationStack.
   969  		c.locationStack.cloneFrom(initialLocationStack)
   970  	}
   971  
   972  	c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
   973  	return nil
   974  }
   975  
   976  func (c *arm64Compiler) getSavedTemporaryLocationStack() runtimeValueLocationStack {
   977  	initialLocationStack := *c.locationStack // Take copy!
   978  	// Use c.brTableTmp for the underlying stack so that we could reduce the allocations.
   979  	if diff := int(initialLocationStack.sp) - len(c.brTableTmp); diff > 0 {
   980  		c.brTableTmp = append(c.brTableTmp, make([]runtimeValueLocation, diff)...)
   981  	}
   982  	copy(c.brTableTmp, initialLocationStack.stack[:initialLocationStack.sp])
   983  	initialLocationStack.stack = c.brTableTmp
   984  	return initialLocationStack
   985  }
   986  
   987  // compileCall implements compiler.compileCall for the arm64 architecture.
   988  func (c *arm64Compiler) compileCall(o *wazeroir.UnionOperation) error {
   989  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   990  		return err
   991  	}
   992  
   993  	functionIndex := o.U1
   994  
   995  	tp := &c.ir.Types[c.ir.Functions[functionIndex]]
   996  
   997  	targetFunctionAddressReg, err := c.allocateRegister(registerTypeGeneralPurpose)
   998  	if err != nil {
   999  		return err
  1000  	}
  1001  	c.markRegisterUsed(targetFunctionAddressReg)
  1002  	defer c.markRegisterUnused(targetFunctionAddressReg)
  1003  
   1004  	// Compute the address of the target function, i.e. &ce.functions[functionIndex].
   1005  	//
   1006  	// First, we read the address of the first item of the ce.functions slice (= &ce.functions[0])
   1007  	// into targetFunctionAddressReg.
  1008  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1009  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  1010  		targetFunctionAddressReg)
  1011  
  1012  	c.assembler.CompileConstToRegister(
  1013  		arm64.ADD,
   1014  		int64(functionIndex)*functionSize, // multiply by functionSize since ce.functions is a slice of function values.
  1015  		targetFunctionAddressReg)
  1016  
  1017  	return c.compileCallImpl(targetFunctionAddressReg, tp)
  1018  }
  1019  
  1020  // compileCallImpl implements compiler.compileCall and compiler.compileCallIndirect for the arm64 architecture.
  1021  func (c *arm64Compiler) compileCallImpl(targetFunctionAddressRegister asm.Register, functype *wasm.FunctionType) error {
   1022  	// Release all the registers as our calling convention is caller-save.
  1023  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  1024  		return err
  1025  	}
  1026  
  1027  	tmp, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  1028  	if !ok {
  1029  		panic("BUG: cannot take a free register")
  1030  	}
  1031  
  1032  	// The stack should look like:
  1033  	//
  1034  	//               reserved slots for results (if len(results) > len(args))
  1035  	//                      |     |
  1036  	//    ,arg0, ..., argN, ..., _, .returnAddress, .returnStackBasePointerInBytes, .function, ....
  1037  	//      |                       |                                                        |
  1038  	//      |             callFrame{^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^}
  1039  	//      |
  1040  	// nextStackBasePointerOffset
  1041  	//
  1042  	// where callFrame is used to return to this currently executed function.
  1043  
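         	// The callee's stack base pointer will sit where its arguments begin, so the new base
         	// offset is the current sp minus the number of the parameters' value slots.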
  1044  	nextStackBasePointerOffset := int64(c.locationStack.sp) - int64(functype.ParamNumInUint64)
  1045  
  1046  	callFrameReturnAddressLoc, callFrameStackBasePointerInBytesLoc, callFrameFunctionLoc := c.locationStack.pushCallFrame(functype)
  1047  
  1048  	// Save the current stack base pointer at callFrameStackBasePointerInBytesLoc.
  1049  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1050  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  1051  		tmp)
  1052  	callFrameStackBasePointerInBytesLoc.setRegister(tmp)
  1053  	c.compileReleaseRegisterToStack(callFrameStackBasePointerInBytesLoc)
  1054  
  1055  	// Set callEngine.stackContext.stackBasePointer for the next function.
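         	// (The base pointer is tracked in bytes, hence the slot offset is shifted left by 3.)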
  1056  	c.assembler.CompileConstToRegister(arm64.ADD, nextStackBasePointerOffset<<3, tmp)
  1057  	c.assembler.CompileRegisterToMemory(arm64.STRD,
  1058  		tmp,
  1059  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
  1060  
  1061  	// Save the currently executed *function (placed at callEngine.moduleContext.fn) into callFrameFunctionLoc.
  1062  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1063  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset,
  1064  		tmp)
  1065  	callFrameFunctionLoc.setRegister(tmp)
  1066  	c.compileReleaseRegisterToStack(callFrameFunctionLoc)
  1067  
  1068  	// Set callEngine.moduleContext.fn to the next *function.
  1069  	c.assembler.CompileRegisterToMemory(arm64.STRD,
  1070  		targetFunctionAddressRegister,
  1071  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
  1072  
  1073  	// Write the return address into callFrameReturnAddressLoc.
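         	// CompileReadInstructionAddress captures the address of the instruction right after the
         	// B (branch) emitted below, which is exactly where the callee's return branch lands.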
  1074  	c.assembler.CompileReadInstructionAddress(tmp, arm64.B)
  1075  	callFrameReturnAddressLoc.setRegister(tmp)
  1076  	c.compileReleaseRegisterToStack(callFrameReturnAddressLoc)
  1077  
  1078  	if targetFunctionAddressRegister == arm64CallingConventionModuleInstanceAddressRegister {
   1079  		// In this case, we must move the value on targetFunctionAddressRegister to another register; otherwise
   1080  		// the address (the jump target below) would be clobbered by the module instance load and result in a segfault.
   1081  		// See #526.
  1082  		c.assembler.CompileRegisterToRegister(arm64.MOVD, targetFunctionAddressRegister, tmp)
  1083  		targetFunctionAddressRegister = tmp
  1084  	}
  1085  
   1086  	// Also, we have to put the target function's *wasm.ModuleInstance into arm64CallingConventionModuleInstanceAddressRegister.
  1087  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1088  		targetFunctionAddressRegister, functionModuleInstanceOffset,
  1089  		arm64CallingConventionModuleInstanceAddressRegister,
  1090  	)
  1091  
  1092  	// Then, br into the target function's initial address.
  1093  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1094  		targetFunctionAddressRegister, functionCodeInitialAddressOffset,
  1095  		targetFunctionAddressRegister)
  1096  
  1097  	c.assembler.CompileJumpToRegister(arm64.B, targetFunctionAddressRegister)
  1098  
  1099  	// We consumed the function parameters, the call frame stack and reserved slots during the call.
  1100  	c.locationStack.sp = uint64(nextStackBasePointerOffset)
  1101  
  1102  	// Also, the function results were pushed by the call.
  1103  	for _, t := range functype.Results {
  1104  		loc := c.locationStack.pushRuntimeValueLocationOnStack()
  1105  		switch t {
  1106  		case wasm.ValueTypeI32:
  1107  			loc.valueType = runtimeValueTypeI32
  1108  		case wasm.ValueTypeI64, wasm.ValueTypeFuncref, wasm.ValueTypeExternref:
  1109  			loc.valueType = runtimeValueTypeI64
  1110  		case wasm.ValueTypeF32:
  1111  			loc.valueType = runtimeValueTypeF32
  1112  		case wasm.ValueTypeF64:
  1113  			loc.valueType = runtimeValueTypeF64
  1114  		case wasm.ValueTypeV128:
  1115  			loc.valueType = runtimeValueTypeV128Lo
  1116  			hi := c.locationStack.pushRuntimeValueLocationOnStack()
  1117  			hi.valueType = runtimeValueTypeV128Hi
  1118  		}
  1119  	}
  1120  
  1121  	if err := c.compileModuleContextInitialization(); err != nil {
  1122  		return err
  1123  	}
  1124  
  1125  	// On the function return, we initialize the state for this function.
  1126  	c.compileReservedStackBasePointerRegisterInitialization()
  1127  
  1128  	c.compileReservedMemoryRegisterInitialization()
  1129  	return nil
  1130  }
  1131  
  1132  // compileCallIndirect implements compiler.compileCallIndirect for the arm64 architecture.
  1133  func (c *arm64Compiler) compileCallIndirect(o *wazeroir.UnionOperation) (err error) {
  1134  	offset := c.locationStack.pop()
  1135  	if err = c.compileEnsureOnRegister(offset); err != nil {
  1136  		return err
  1137  	}
  1138  	typeIndex := o.U1
  1139  	tableIndex := o.U2
  1140  
  1141  	offsetReg := offset.register
  1142  	if isZeroRegister(offsetReg) {
  1143  		offsetReg, err = c.allocateRegister(registerTypeGeneralPurpose)
  1144  		if err != nil {
  1145  			return err
  1146  		}
  1147  		c.markRegisterUsed(offsetReg)
  1148  
  1149  		// Zero the value on a picked register.
  1150  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, offsetReg)
  1151  	}
  1152  
  1153  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  1154  	if err != nil {
  1155  		return err
  1156  	}
  1157  	c.markRegisterUsed(tmp)
  1158  
  1159  	tmp2, err := c.allocateRegister(registerTypeGeneralPurpose)
  1160  	if err != nil {
  1161  		return err
  1162  	}
  1163  	c.markRegisterUsed(tmp2)
  1164  
   1165  	// First, we need to check that the offset doesn't exceed the length of the table.
  1166  	// "tmp = &Tables[0]"
  1167  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1168  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  1169  		tmp,
  1170  	)
  1171  	// tmp = [tmp + TableIndex*8] = [&Tables[0] + TableIndex*sizeOf(*tableInstance)] = Tables[tableIndex]
  1172  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1173  		tmp, int64(tableIndex)*8,
  1174  		tmp,
  1175  	)
  1176  	// tmp2 = [tmp + tableInstanceTableLenOffset] = len(Tables[tableIndex])
  1177  	c.assembler.CompileMemoryToRegister(arm64.LDRD, tmp, tableInstanceTableLenOffset, tmp2)
  1178  
  1179  	// "cmp tmp2, offset"
  1180  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp2, offsetReg)
  1181  
  1182  	// If it exceeds len(table), we trap.
  1183  	c.compileMaybeExitFromNativeCode(arm64.BCONDLO, nativeCallStatusCodeInvalidTableAccess)
  1184  	// Otherwise, we proceed to do function type check.
  1185  
  1186  	// We need to obtain the absolute address of table element.
  1187  	// "tmp = &Tables[tableIndex].table[0]"
  1188  	c.assembler.CompileMemoryToRegister(
  1189  		arm64.LDRD,
  1190  		tmp, tableInstanceTableOffset,
  1191  		tmp,
  1192  	)
  1193  	// "offset = tmp + (offset << pointerSizeLog2) (== &table[offset])"
   1194  	// Here we left-shift by pointerSizeLog2 (3) to get the offset in bytes,
   1195  	// as the table element type is uintptr, which is 8 bytes.
  1196  	c.assembler.CompileLeftShiftedRegisterToRegister(
  1197  		arm64.ADD,
  1198  		offsetReg, pointerSizeLog2,
  1199  		tmp,
  1200  		offsetReg,
  1201  	)
  1202  
  1203  	// "offset = (*offset) (== table[offset])"
  1204  	c.assembler.CompileMemoryToRegister(arm64.LDRD, offsetReg, 0, offsetReg)
  1205  
  1206  	// Check if the value of table[offset] equals zero, meaning that the target element is uninitialized.
  1207  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, offsetReg)
  1208  
  1209  	// Skipped if the target is initialized.
  1210  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusCodeInvalidTableAccess)
  1211  
   1212  	// Next, we check that the type matches, i.e. table[offset].typeID == the type ID for typeIndex.
  1213  	// "tmp = table[offset].typeID"
  1214  	c.assembler.CompileMemoryToRegister(
  1215  		arm64.LDRD,
  1216  		offsetReg, functionTypeIDOffset,
  1217  		tmp,
  1218  	)
  1219  	// "tmp2 = ModuleInstance.TypeIDs[index]"
  1220  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1221  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset,
  1222  		tmp2)
  1223  	c.assembler.CompileMemoryToRegister(arm64.LDRW, tmp2, int64(typeIndex)*4, tmp2)
  1224  
  1225  	// Compare these two values, and if they are equal, we are ready to make the function call.
  1226  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmp, tmp2)
  1227  	// The exit is skipped if the type matches.
  1228  	c.compileMaybeExitFromNativeCode(arm64.BCONDEQ, nativeCallStatusCodeTypeMismatchOnIndirectCall)
  1229  
  1230  	targetFunctionType := &c.ir.Types[typeIndex]
  1231  	if err := c.compileCallImpl(offsetReg, targetFunctionType); err != nil {
  1232  		return err
  1233  	}
  1234  
  1235  	// The offset register should be marked as unused as we consumed it in the function call.
  1236  	c.markRegisterUnused(offsetReg, tmp, tmp2)
  1237  	return nil
  1238  }
  1239  
  1240  // compileDrop implements compiler.compileDrop for the arm64 architecture.
  1241  func (c *arm64Compiler) compileDrop(o *wazeroir.UnionOperation) error {
  1242  	return compileDropRange(c, o.U1)
  1243  }
  1244  
  1245  func (c *arm64Compiler) compileSelectV128Impl(selectorRegister asm.Register) error {
  1246  	x2 := c.locationStack.popV128()
  1247  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1248  		return err
  1249  	}
  1250  
  1251  	x1 := c.locationStack.popV128()
  1252  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1253  		return err
  1254  	}
  1255  
  1256  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, arm64.RegRZR, selectorRegister)
  1257  	brIfNotZero := c.assembler.CompileJump(arm64.BCONDNE)
  1258  
  1259  	// In this branch, x2 is selected, so we move its value into x1.register so that
  1260  	// the result ends up in x1.register regardless of the selection.
  1261  	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR,
  1262  		x2.register, x2.register, x1.register, arm64.VectorArrangement16B)
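        	// (ORR with two identical source registers is arm64's canonical vector MOV
        	// alias, so this simply copies x2's vector value into x1's register.)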
  1263  
  1264  	c.assembler.SetJumpTargetOnNext(brIfNotZero)
  1265  
  1266  	// As noted, the result exists in x1.register regardless of the selector.
  1267  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1268  	// Plus, x2.register is no longer used.
  1269  	c.markRegisterUnused(x2.register)
  1270  	return nil
  1271  }
  1272  
  1273  // compileSelect implements compiler.compileSelect for the arm64 architecture.
  1274  func (c *arm64Compiler) compileSelect(o *wazeroir.UnionOperation) error {
  1275  	cv, err := c.popValueOnRegister()
  1276  	if err != nil {
  1277  		return err
  1278  	}
  1279  
  1280  	isTargetVector := o.B3
  1281  	if isTargetVector {
  1282  		return c.compileSelectV128Impl(cv.register)
  1283  	}
  1284  
  1285  	c.markRegisterUsed(cv.register)
  1286  
  1287  	x1, x2, err := c.popTwoValuesOnRegisters()
  1288  	if err != nil {
  1289  		return err
  1290  	}
  1291  
  1292  	if isZeroRegister(x1.register) && isZeroRegister(x2.register) {
  1293  		// If both values are zero, the result is always zero.
  1294  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1295  		c.markRegisterUnused(cv.register)
  1296  		return nil
  1297  	}
  1298  
  1299  	// In the following, we emit the code so that x1's register contains the chosen value
  1300  	// no matter which of original x1 or x2 is selected.
  1301  	//
  1302  	// If x1 is currently on the zero register, we cannot place the result there because
  1303  	// writes to arm64.RegRZR are discarded, i.e. moving x2 into it leaves it zero regardless of the value.
  1304  	// So we explicitly assign a general purpose register to x1 here.
  1305  	if isZeroRegister(x1.register) {
  1306  		// Mark x2's register as used so it won't be chosen (cv's register is already marked used above).
  1307  		c.markRegisterUsed(x2.register)
  1308  		// Allocate a fresh (non-zero) register for x1.
  1309  		x1Reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1310  		if err != nil {
  1311  			return err
  1312  		}
  1313  		x1.setRegister(x1Reg)
  1314  		// And zero out the picked register.
  1315  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, x1Reg)
  1316  	}
  1317  
  1318  	// At this point, x1 is on a non-zero register, and x2 is on either a general purpose or the zero register.
  1319  
  1320  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, arm64.RegRZR, cv.register)
  1321  	brIfNotZero := c.assembler.CompileJump(arm64.BCONDNE)
  1322  
  1323  	// If cv == 0, we move the value of x2 to the x1.register.
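        	//
        	// A rough sketch of the branch emitted here (register names illustrative):
        	//
        	//	CMPW wzr, cv
        	//	B.NE done    ;; cv != 0: keep x1 as-is.
        	//	MOV  x1, x2  ;; cv == 0: select x2.
        	// done: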
  1324  
  1325  	switch x1.valueType {
  1326  	case runtimeValueTypeI32:
  1327  		// TODO: use 32-bit mov
  1328  		c.assembler.CompileRegisterToRegister(arm64.MOVD, x2.register, x1.register)
  1329  	case runtimeValueTypeI64:
  1330  		c.assembler.CompileRegisterToRegister(arm64.MOVD, x2.register, x1.register)
  1331  	case runtimeValueTypeF32:
  1332  		// TODO: use 32-bit mov
  1333  		c.assembler.CompileRegisterToRegister(arm64.FMOVD, x2.register, x1.register)
  1334  	case runtimeValueTypeF64:
  1335  		c.assembler.CompileRegisterToRegister(arm64.FMOVD, x2.register, x1.register)
  1336  	default:
  1337  		return errors.New("TODO: implement vector type select")
  1338  	}
  1339  
  1340  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1341  
  1342  	// Otherwise (cv != 0), there is nothing to do since x1 already holds the selected value.
  1343  	c.assembler.SetJumpTargetOnNext(brIfNotZero)
  1344  
  1345  	// Only x1.register is reused.
  1346  	c.markRegisterUnused(cv.register, x2.register)
  1347  	return nil
  1348  }
  1349  
  1350  // compilePick implements compiler.compilePick for the arm64 architecture.
  1351  func (c *arm64Compiler) compilePick(o *wazeroir.UnionOperation) error {
  1352  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1353  		return err
  1354  	}
  1355  	depth := o.U1
  1356  	isTargetVector := o.B3
  1357  
  1358  	pickTarget := &c.locationStack.stack[c.locationStack.sp-1-uint64(depth)]
  1359  	pickedRegister, err := c.allocateRegister(pickTarget.getRegisterType())
  1360  	if err != nil {
  1361  		return err
  1362  	}
  1363  
  1364  	if pickTarget.onRegister() { // Copy the value to the pickedRegister.
  1365  		switch pickTarget.valueType {
  1366  		case runtimeValueTypeI32:
  1367  			c.assembler.CompileRegisterToRegister(arm64.MOVW, pickTarget.register, pickedRegister)
  1368  		case runtimeValueTypeI64:
  1369  			c.assembler.CompileRegisterToRegister(arm64.MOVD, pickTarget.register, pickedRegister)
  1370  		case runtimeValueTypeF32:
  1371  			c.assembler.CompileRegisterToRegister(arm64.FMOVS, pickTarget.register, pickedRegister)
  1372  		case runtimeValueTypeF64:
  1373  			c.assembler.CompileRegisterToRegister(arm64.FMOVD, pickTarget.register, pickedRegister)
  1374  		case runtimeValueTypeV128Lo:
  1375  			c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR,
  1376  				pickTarget.register, pickTarget.register, pickedRegister, arm64.VectorArrangement16B)
  1377  		case runtimeValueTypeV128Hi:
  1378  			panic("BUG") // since pick target must point to the lower 64-bits of vectors.
  1379  		}
  1380  	} else if pickTarget.onStack() {
  1381  		// Temporarily assign a register to the pick target, and then load the value.
  1382  		pickTarget.setRegister(pickedRegister)
  1383  		c.compileLoadValueOnStackToRegister(pickTarget)
  1384  
  1385  		// After the load, we revert the register assignment to the pick target.
  1386  		pickTarget.setRegister(asm.NilRegister)
  1387  		if isTargetVector {
  1388  			hi := &c.locationStack.stack[pickTarget.stackPointer+1]
  1389  			hi.setRegister(asm.NilRegister)
  1390  		}
  1391  	}
  1392  
  1393  	// Now we have the value of the target on the pickedRegister,
  1394  	// so push the location.
  1395  	c.pushRuntimeValueLocationOnRegister(pickedRegister, pickTarget.valueType)
  1396  	if isTargetVector {
  1397  		c.pushRuntimeValueLocationOnRegister(pickedRegister, runtimeValueTypeV128Hi)
  1398  	}
  1399  	return nil
  1400  }
  1401  
  1402  // compileAdd implements compiler.compileAdd for the arm64 architecture.
  1403  func (c *arm64Compiler) compileAdd(o *wazeroir.UnionOperation) error {
  1404  	x1, x2, err := c.popTwoValuesOnRegisters()
  1405  	if err != nil {
  1406  		return err
  1407  	}
  1408  
  1409  	// Addition can be a nop if one of the operands is zero.
  1410  	if isZeroRegister(x1.register) {
  1411  		c.pushRuntimeValueLocationOnRegister(x2.register, x1.valueType)
  1412  		return nil
  1413  	} else if isZeroRegister(x2.register) {
  1414  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1415  		return nil
  1416  	}
  1417  
  1418  	var inst asm.Instruction
  1419  	unsignedType := wazeroir.UnsignedType(o.B1)
  1420  	switch unsignedType {
  1421  	case wazeroir.UnsignedTypeI32:
  1422  		inst = arm64.ADDW
  1423  	case wazeroir.UnsignedTypeI64:
  1424  		inst = arm64.ADD
  1425  	case wazeroir.UnsignedTypeF32:
  1426  		inst = arm64.FADDS
  1427  	case wazeroir.UnsignedTypeF64:
  1428  		inst = arm64.FADDD
  1429  	}
  1430  
  1431  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1432  	// The result is placed on a register for x1, so record it.
  1433  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1434  	return nil
  1435  }
  1436  
  1437  // compileSub implements compiler.compileSub for the arm64 architecture.
  1438  func (c *arm64Compiler) compileSub(o *wazeroir.UnionOperation) error {
  1439  	x1, x2, err := c.popTwoValuesOnRegisters()
  1440  	if err != nil {
  1441  		return err
  1442  	}
  1443  
  1444  	// If both registers are zero, this is a nop and we just push the zero register.
  1445  	if isZeroRegister(x1.register) && isZeroRegister(x2.register) {
  1446  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1447  		return nil
  1448  	}
  1449  
  1450  	// At this point, at least one of the x1 or x2 registers is non-zero.
  1451  	// Choose the non-zero register as the destination.
  1452  	destinationReg := x1.register
  1453  	if isZeroRegister(x1.register) {
  1454  		destinationReg = x2.register
  1455  	}
  1456  
  1457  	var inst asm.Instruction
  1458  	var vt runtimeValueType
  1459  	unsignedType := wazeroir.UnsignedType(o.B1)
  1460  	switch unsignedType {
  1461  	case wazeroir.UnsignedTypeI32:
  1462  		inst = arm64.SUBW
  1463  		vt = runtimeValueTypeI32
  1464  	case wazeroir.UnsignedTypeI64:
  1465  		inst = arm64.SUB
  1466  		vt = runtimeValueTypeI64
  1467  	case wazeroir.UnsignedTypeF32:
  1468  		inst = arm64.FSUBS
  1469  		vt = runtimeValueTypeF32
  1470  	case wazeroir.UnsignedTypeF64:
  1471  		inst = arm64.FSUBD
  1472  		vt = runtimeValueTypeF64
  1473  	}
  1474  
  1475  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1476  	c.pushRuntimeValueLocationOnRegister(destinationReg, vt)
  1477  	return nil
  1478  }
  1479  
  1480  // compileMul implements compiler.compileMul for the arm64 architecture.
  1481  func (c *arm64Compiler) compileMul(o *wazeroir.UnionOperation) error {
  1482  	x1, x2, err := c.popTwoValuesOnRegisters()
  1483  	if err != nil {
  1484  		return err
  1485  	}
  1486  
  1487  	// Multiplication always yields zero if either operand is zero, so we just push the zero register.
  1488  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1489  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1490  		return nil
  1491  	}
  1492  
  1493  	var inst asm.Instruction
  1494  	var vt runtimeValueType
  1495  	unsignedType := wazeroir.UnsignedType(o.B1)
  1496  	switch unsignedType {
  1497  	case wazeroir.UnsignedTypeI32:
  1498  		inst = arm64.MULW
  1499  		vt = runtimeValueTypeI32
  1500  	case wazeroir.UnsignedTypeI64:
  1501  		inst = arm64.MUL
  1502  		vt = runtimeValueTypeI64
  1503  	case wazeroir.UnsignedTypeF32:
  1504  		inst = arm64.FMULS
  1505  		vt = runtimeValueTypeF32
  1506  	case wazeroir.UnsignedTypeF64:
  1507  		inst = arm64.FMULD
  1508  		vt = runtimeValueTypeF64
  1509  	}
  1510  
  1511  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1512  	// The result is placed on a register for x1, so record it.
  1513  	c.pushRuntimeValueLocationOnRegister(x1.register, vt)
  1514  	return nil
  1515  }
  1516  
  1517  // compileClz implements compiler.compileClz for the arm64 architecture.
  1518  func (c *arm64Compiler) compileClz(o *wazeroir.UnionOperation) error {
  1519  	v, err := c.popValueOnRegister()
  1520  	if err != nil {
  1521  		return err
  1522  	}
  1523  
  1524  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1525  	if isZeroRegister(v.register) {
  1526  		// If the operand is the zero register, the result is always 32 (or 64 for the 64-bit case),
  1527  		// so we allocate a register and load the constant into it.
  1528  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1529  		if err != nil {
  1530  			return err
  1531  		}
  1532  		var vt runtimeValueType
  1533  		if unsignedInt == wazeroir.UnsignedInt32 {
  1534  			vt = runtimeValueTypeI32
  1535  			c.assembler.CompileConstToRegister(arm64.MOVW, 32, reg)
  1536  		} else {
  1537  			vt = runtimeValueTypeI64
  1538  			c.assembler.CompileConstToRegister(arm64.MOVD, 64, reg)
  1539  		}
  1540  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  1541  		return nil
  1542  	}
  1543  
  1544  	reg := v.register
  1545  	var vt runtimeValueType
  1546  	if unsignedInt == wazeroir.UnsignedInt32 {
  1547  		vt = runtimeValueTypeI32
  1548  		c.assembler.CompileRegisterToRegister(arm64.CLZW, reg, reg)
  1549  	} else {
  1550  		vt = runtimeValueTypeI64
  1551  		c.assembler.CompileRegisterToRegister(arm64.CLZ, reg, reg)
  1552  	}
  1553  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  1554  	return nil
  1555  }
  1556  
  1557  // compileCtz implements compiler.compileCtz for the arm64 architecture.
  1558  func (c *arm64Compiler) compileCtz(o *wazeroir.UnionOperation) error {
  1559  	v, err := c.popValueOnRegister()
  1560  	if err != nil {
  1561  		return err
  1562  	}
  1563  
  1564  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1565  	reg := v.register
  1566  	if isZeroRegister(reg) {
  1567  		// If the operand is the zero register, the result is always 32 (or 64 for the 64-bit case),
  1568  		// so we allocate a register and load the constant into it.
  1569  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1570  		if err != nil {
  1571  			return err
  1572  		}
  1573  		var vt runtimeValueType
  1574  		if unsignedInt == wazeroir.UnsignedInt32 {
  1575  			vt = runtimeValueTypeI32
  1576  			c.assembler.CompileConstToRegister(arm64.MOVW, 32, reg)
  1577  		} else {
  1578  			vt = runtimeValueTypeI64
  1579  			c.assembler.CompileConstToRegister(arm64.MOVD, 64, reg)
  1580  		}
  1581  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  1582  		return nil
  1583  	}
  1584  
  1585  	// Since arm64 doesn't have an instruction that directly counts trailing zeros,
  1586  	// we reverse the bits first and then do CLZ, which is exactly how
  1587  	// gcc implements __builtin_ctz for arm64.
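        	// A rough sketch of the emitted sequence for the 32-bit case
        	// (register names illustrative):
        	//
        	//	RBIT w0, w0 ;; reverse the bit order
        	//	CLZ  w0, w0 ;; leading zeros of the reversed value == trailing zeros of the original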
  1588  	var vt runtimeValueType
  1589  	if unsignedInt == wazeroir.UnsignedInt32 {
  1590  		vt = runtimeValueTypeI32
  1591  		c.assembler.CompileRegisterToRegister(arm64.RBITW, reg, reg)
  1592  		c.assembler.CompileRegisterToRegister(arm64.CLZW, reg, reg)
  1593  	} else {
  1594  		vt = runtimeValueTypeI64
  1595  		c.assembler.CompileRegisterToRegister(arm64.RBIT, reg, reg)
  1596  		c.assembler.CompileRegisterToRegister(arm64.CLZ, reg, reg)
  1597  	}
  1598  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  1599  	return nil
  1600  }
  1601  
  1602  // compilePopcnt implements compiler.compilePopcnt for the arm64 architecture.
  1603  func (c *arm64Compiler) compilePopcnt(o *wazeroir.UnionOperation) error {
  1604  	v, err := c.popValueOnRegister()
  1605  	if err != nil {
  1606  		return err
  1607  	}
  1608  
  1609  	reg := v.register
  1610  	if isZeroRegister(reg) {
  1611  		c.pushRuntimeValueLocationOnRegister(reg, v.valueType)
  1612  		return nil
  1613  	}
  1614  
  1615  	freg, err := c.allocateRegister(registerTypeVector)
  1616  	if err != nil {
  1617  		return err
  1618  	}
  1619  
  1620  	// arm64 doesn't have an instruction for population count on a scalar register,
  1621  	// so we use the vector one (VCNT).
  1622  	// This is exactly what the official Go compiler emits for bits.OnesCount.
  1623  	// For example, "func() int { return bits.OnesCount(10) }" is compiled as
  1624  	//
  1625  	//    MOVD    $10, R0 ;; Load 10.
  1626  	//    FMOVD   R0, F0
  1627  	//    VCNT    V0.B8, V0.B8
  1628  	//    UADDLV  V0.B8, V0
  1629  	//
  1630  	var movInst asm.Instruction
  1631  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1632  	if unsignedInt == wazeroir.UnsignedInt32 {
  1633  		movInst = arm64.FMOVS
  1634  	} else {
  1635  		movInst = arm64.FMOVD
  1636  	}
  1637  	c.assembler.CompileRegisterToRegister(movInst, reg, freg)
  1638  	c.assembler.CompileVectorRegisterToVectorRegister(arm64.VCNT, freg, freg,
  1639  		arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone)
  1640  	c.assembler.CompileVectorRegisterToVectorRegister(arm64.UADDLV, freg, freg, arm64.VectorArrangement8B,
  1641  		arm64.VectorIndexNone, arm64.VectorIndexNone)
  1642  
  1643  	c.assembler.CompileRegisterToRegister(movInst, freg, reg)
  1644  
  1645  	c.pushRuntimeValueLocationOnRegister(reg, v.valueType)
  1646  	return nil
  1647  }
  1648  
  1649  // compileDiv implements compiler.compileDiv for the arm64 architecture.
  1650  func (c *arm64Compiler) compileDiv(o *wazeroir.UnionOperation) error {
  1651  	dividend, divisor, err := c.popTwoValuesOnRegisters()
  1652  	if err != nil {
  1653  		return err
  1654  	}
  1655  
  1656  	signedType := wazeroir.SignedType(o.B1)
  1657  
  1658  	// If the divisor is on the zero register, the division always traps, so we exit from the function unconditionally.
  1659  	if isZeroRegister(divisor.register) {
  1660  		// Push any value so that the subsequent instruction can have a consistent location stack state.
  1661  		v := c.locationStack.pushRuntimeValueLocationOnStack()
  1662  		switch signedType {
  1663  		case wazeroir.SignedTypeInt32, wazeroir.SignedTypeUint32:
  1664  			v.valueType = runtimeValueTypeI32
  1665  		case wazeroir.SignedTypeUint64, wazeroir.SignedTypeInt64:
  1666  			v.valueType = runtimeValueTypeI64
  1667  		}
  1668  		c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero)
  1669  		return nil
  1670  	}
  1671  
  1672  	var inst asm.Instruction
  1673  	var vt runtimeValueType
  1674  	switch signedType {
  1675  	case wazeroir.SignedTypeUint32:
  1676  		inst = arm64.UDIVW
  1677  		if err := c.compileIntegerDivPrecheck(true, false, dividend.register, divisor.register); err != nil {
  1678  			return err
  1679  		}
  1680  		vt = runtimeValueTypeI32
  1681  	case wazeroir.SignedTypeUint64:
  1682  		if err := c.compileIntegerDivPrecheck(false, false, dividend.register, divisor.register); err != nil {
  1683  			return err
  1684  		}
  1685  		inst = arm64.UDIV
  1686  		vt = runtimeValueTypeI64
  1687  	case wazeroir.SignedTypeInt32:
  1688  		if err := c.compileIntegerDivPrecheck(true, true, dividend.register, divisor.register); err != nil {
  1689  			return err
  1690  		}
  1691  		inst = arm64.SDIVW
  1692  		vt = runtimeValueTypeI32
  1693  	case wazeroir.SignedTypeInt64:
  1694  		if err := c.compileIntegerDivPrecheck(false, true, dividend.register, divisor.register); err != nil {
  1695  			return err
  1696  		}
  1697  		inst = arm64.SDIV
  1698  		vt = runtimeValueTypeI64
  1699  	case wazeroir.SignedTypeFloat32:
  1700  		inst = arm64.FDIVS
  1701  		vt = runtimeValueTypeF32
  1702  	case wazeroir.SignedTypeFloat64:
  1703  		inst = arm64.FDIVD
  1704  		vt = runtimeValueTypeF64
  1705  	}
  1706  
  1707  	c.assembler.CompileRegisterToRegister(inst, divisor.register, dividend.register)
  1708  
  1709  	c.pushRuntimeValueLocationOnRegister(dividend.register, vt)
  1710  	return nil
  1711  }
  1712  
  1713  // compileIntegerDivPrecheck adds instructions to check that the divisor and dividend are sound for the division operation.
  1714  // First, it checks whether the divisor equals zero, and if so, exits the function.
  1715  // Additionally, for signed divisions, it checks whether the operation would overflow.
  1716  func (c *arm64Compiler) compileIntegerDivPrecheck(is32Bit, isSigned bool, dividend, divisor asm.Register) error {
  1717  	// We check whether the divisor equals zero.
  1718  	var cmpInst, movInst, loadInst asm.Instruction
  1719  	var minValueOffsetInVM int64
  1720  	if is32Bit {
  1721  		cmpInst = arm64.CMPW
  1722  		movInst = arm64.MOVW
  1723  		loadInst = arm64.LDRW
  1724  		minValueOffsetInVM = arm64CallEngineArchContextMinimum32BitSignedIntOffset
  1725  	} else {
  1726  		cmpInst = arm64.CMP
  1727  		movInst = arm64.MOVD
  1728  		loadInst = arm64.LDRD
  1729  		minValueOffsetInVM = arm64CallEngineArchContextMinimum64BitSignedIntOffset
  1730  	}
  1731  	c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisor)
  1732  
  1733  	// If it is zero, we exit with nativeCallStatusIntegerDivisionByZero.
  1734  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusIntegerDivisionByZero)
  1735  	// Otherwise, we proceed.
  1736  
  1737  	// If the operation is a signed integer div, we have to do an additional check on overflow.
  1738  	if isSigned {
  1739  		// For signed division, we need a branch for the "math.MinInt{32,64} / -1"
  1740  		// case, which overflows.
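        		// For example, the int32 division math.MinInt32 / -1 would yield 2147483648,
        		// which does not fit in int32, so Wasm requires a trap in this case.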
  1741  
  1742  		// First, we compare the divisor with -1.
  1743  		c.assembler.CompileConstToRegister(movInst, -1, arm64ReservedRegisterForTemporary)
  1744  		c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, divisor)
  1745  
  1746  		// If they are not equal, we skip the following check.
  1747  		brIfDivisorNonMinusOne := c.assembler.CompileJump(arm64.BCONDNE)
  1748  
  1749  		// Otherwise, we further check if the dividend equals math.MinInt32 or MinInt64.
  1750  		c.assembler.CompileMemoryToRegister(
  1751  			loadInst,
  1752  			arm64ReservedRegisterForCallEngine, minValueOffsetInVM,
  1753  			arm64ReservedRegisterForTemporary,
  1754  		)
  1755  		c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, dividend)
  1756  
  1757  		// If they are not equal, we are safe to execute the division.
  1758  		// Otherwise, we raise an overflow error.
  1759  		c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusIntegerOverflow)
  1760  
  1761  		c.assembler.SetJumpTargetOnNext(brIfDivisorNonMinusOne)
  1762  	}
  1763  	return nil
  1764  }
  1765  
  1766  // compileRem implements compiler.compileRem for the arm64 architecture.
  1767  func (c *arm64Compiler) compileRem(o *wazeroir.UnionOperation) error {
  1768  	dividend, divisor, err := c.popTwoValuesOnRegisters()
  1769  	if err != nil {
  1770  		return err
  1771  	}
  1772  
  1773  	dividendReg := dividend.register
  1774  	divisorReg := divisor.register
  1775  
  1776  	// If the divisor is on the zero register, the division always traps, so we exit from the function unconditionally.
  1777  	if isZeroRegister(divisor.register) {
  1778  		// Push any value so that the subsequent instruction can have a consistent location stack state.
  1779  		v := c.locationStack.pushRuntimeValueLocationOnStack()
  1780  		v.valueType = runtimeValueTypeI32
  1781  		c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero)
  1782  		return nil
  1783  	}
  1784  
  1785  	var divInst, msubInst, cmpInst asm.Instruction
  1786  	signedInt := wazeroir.SignedInt(o.B1)
  1787  	switch signedInt {
  1788  	case wazeroir.SignedUint32:
  1789  		divInst = arm64.UDIVW
  1790  		msubInst = arm64.MSUBW
  1791  		cmpInst = arm64.CMPW
  1792  	case wazeroir.SignedUint64:
  1793  		divInst = arm64.UDIV
  1794  		msubInst = arm64.MSUB
  1795  		cmpInst = arm64.CMP
  1796  	case wazeroir.SignedInt32:
  1797  		divInst = arm64.SDIVW
  1798  		msubInst = arm64.MSUBW
  1799  		cmpInst = arm64.CMPW
  1800  	case wazeroir.SignedInt64:
  1801  		divInst = arm64.SDIV
  1802  		msubInst = arm64.MSUB
  1803  		cmpInst = arm64.CMP
  1804  	}
  1805  
  1806  	// We check whether the divisor equals zero.
  1807  	c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisorReg)
  1808  
  1809  	// If it is zero, we exit with nativeCallStatusIntegerDivisionByZero.
  1810  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusIntegerDivisionByZero)
  1811  	// Otherwise, we proceed.
  1812  
  1813  	// Temporarily mark them as used so that allocating the result register won't clobber these values.
  1814  	c.markRegisterUsed(dividend.register, divisor.register)
  1815  
  1816  	resultReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1817  	if err != nil {
  1818  		return err
  1819  	}
  1820  
  1821  	// arm64 doesn't have an instruction for rem, so we calculate it with two instructions: UDIV (SDIV for signed) and MSUB.
  1822  	// This is exactly the code that Clang emits.
  1823  	// [input: x0=dividend, x1=divisor]
  1824  	// >> UDIV x2, x0, x1
  1825  	// >> MSUB x3, x2, x1, x0
  1826  	// [result: x2=quotient, x3=remainder]
  1827  	//
  1828  	c.assembler.CompileTwoRegistersToRegister(divInst, divisorReg, dividendReg, resultReg)
  1829  	// ResultReg = dividendReg - (divisorReg * resultReg)
  1830  	c.assembler.CompileThreeRegistersToRegister(msubInst, divisorReg, dividendReg, resultReg, resultReg)
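        	// I.e. the result satisfies the identity: remainder == dividend - divisor*(dividend/divisor).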
  1831  
  1832  	c.markRegisterUnused(dividend.register, divisor.register)
  1833  	c.pushRuntimeValueLocationOnRegister(resultReg, dividend.valueType)
  1834  	return nil
  1835  }
  1836  
  1837  // compileAnd implements compiler.compileAnd for the arm64 architecture.
  1838  func (c *arm64Compiler) compileAnd(o *wazeroir.UnionOperation) error {
  1839  	x1, x2, err := c.popTwoValuesOnRegisters()
  1840  	if err != nil {
  1841  		return err
  1842  	}
  1843  
  1844  	// If either of the registers x1 or x2 is zero,
  1845  	// the result will always be zero.
  1846  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1847  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1848  		return nil
  1849  	}
  1850  
  1851  	// At this point, both x1 and x2 registers are non-zero
  1852  	// (the zero cases returned early above), so we use x1's
  1853  	// register as the destination.
  1854  	destinationReg := x1.register
  1857  
  1858  	var inst asm.Instruction
  1859  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1860  	switch unsignedInt {
  1861  	case wazeroir.UnsignedInt32:
  1862  		inst = arm64.ANDW
  1863  	case wazeroir.UnsignedInt64:
  1864  		inst = arm64.AND
  1865  	}
  1866  
  1867  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1868  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1869  	return nil
  1870  }
  1871  
  1872  // compileOr implements compiler.compileOr for the arm64 architecture.
  1873  func (c *arm64Compiler) compileOr(o *wazeroir.UnionOperation) error {
  1874  	x1, x2, err := c.popTwoValuesOnRegisters()
  1875  	if err != nil {
  1876  		return err
  1877  	}
  1878  
  1879  	if isZeroRegister(x1.register) {
  1880  		c.pushRuntimeValueLocationOnRegister(x2.register, x2.valueType)
  1881  		return nil
  1882  	}
  1883  	if isZeroRegister(x2.register) {
  1884  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1885  		return nil
  1886  	}
  1887  
  1888  	var inst asm.Instruction
  1889  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1890  	switch unsignedInt {
  1891  	case wazeroir.UnsignedInt32:
  1892  		inst = arm64.ORRW
  1893  	case wazeroir.UnsignedInt64:
  1894  		inst = arm64.ORR
  1895  	}
  1896  
  1897  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1898  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1899  	return nil
  1900  }
  1901  
  1902  // compileXor implements compiler.compileXor for the arm64 architecture.
  1903  func (c *arm64Compiler) compileXor(o *wazeroir.UnionOperation) error {
  1904  	x1, x2, err := c.popTwoValuesOnRegisters()
  1905  	if err != nil {
  1906  		return err
  1907  	}
  1908  
  1909  	// If x1 is on the zero register, use x2's register as the destination
  1910  	// (if both are zero, the result is zero anyway and the write is discarded harmlessly).
  1911  	destinationReg := x1.register
  1912  	if isZeroRegister(x1.register) {
  1913  		destinationReg = x2.register
  1914  	}
  1915  
  1916  	var inst asm.Instruction
  1917  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1918  	switch unsignedInt {
  1919  	case wazeroir.UnsignedInt32:
  1920  		inst = arm64.EORW
  1921  	case wazeroir.UnsignedInt64:
  1922  		inst = arm64.EOR
  1923  	}
  1924  
  1925  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1926  	c.pushRuntimeValueLocationOnRegister(destinationReg, x1.valueType)
  1927  	return nil
  1928  }
  1929  
  1930  // compileShl implements compiler.compileShl for the arm64 architecture.
  1931  func (c *arm64Compiler) compileShl(o *wazeroir.UnionOperation) error {
  1932  	x1, x2, err := c.popTwoValuesOnRegisters()
  1933  	if err != nil {
  1934  		return err
  1935  	}
  1936  
  1937  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1938  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1939  		return nil
  1940  	}
  1941  
  1942  	var inst asm.Instruction
  1943  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1944  	switch unsignedInt {
  1945  	case wazeroir.UnsignedInt32:
  1946  		inst = arm64.LSLW
  1947  	case wazeroir.UnsignedInt64:
  1948  		inst = arm64.LSL
  1949  	}
  1950  
  1951  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1952  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1953  	return nil
  1954  }
  1955  
  1956  // compileShr implements compiler.compileShr for the arm64 architecture.
  1957  func (c *arm64Compiler) compileShr(o *wazeroir.UnionOperation) error {
  1958  	x1, x2, err := c.popTwoValuesOnRegisters()
  1959  	if err != nil {
  1960  		return err
  1961  	}
  1962  
  1963  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1964  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1965  		return nil
  1966  	}
  1967  
  1968  	var inst asm.Instruction
  1969  	signedInt := wazeroir.SignedInt(o.B1)
  1970  	switch signedInt {
  1971  	case wazeroir.SignedInt32:
  1972  		inst = arm64.ASRW
  1973  	case wazeroir.SignedInt64:
  1974  		inst = arm64.ASR
  1975  	case wazeroir.SignedUint32:
  1976  		inst = arm64.LSRW
  1977  	case wazeroir.SignedUint64:
  1978  		inst = arm64.LSR
  1979  	}
  1980  
  1981  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1982  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1983  	return nil
  1984  }
  1985  
  1986  // compileRotl implements compiler.compileRotl for the arm64 architecture.
  1987  func (c *arm64Compiler) compileRotl(o *wazeroir.UnionOperation) error {
  1988  	x1, x2, err := c.popTwoValuesOnRegisters()
  1989  	if err != nil {
  1990  		return err
  1991  	}
  1992  
  1993  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1994  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1995  		return nil
  1996  	}
  1997  
  1998  	var inst, neginst asm.Instruction
  1999  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  2000  	switch unsignedInt {
  2001  	case wazeroir.UnsignedInt32:
  2002  		inst = arm64.RORW
  2003  		neginst = arm64.NEGW
  2004  	case wazeroir.UnsignedInt64:
  2005  		inst = arm64.ROR
  2006  		neginst = arm64.NEG
  2007  	}
  2008  
  2009  	// arm64 doesn't have a rotate-left instruction, so we negate the shift amount
  2010  	// and rotate right instead, matching the assembly Go emits for bits.RotateLeft.
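        	// A rough sketch for the 32-bit case (register names illustrative):
        	//
        	//	NEG w1, w1       ;; shift = -shift
        	//	ROR w0, w0, w1   ;; rotl(x, n) == rotr(x, -n mod 32)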
  2011  	c.assembler.CompileRegisterToRegister(neginst, x2.register, x2.register)
  2012  
  2013  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  2014  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2015  	return nil
  2016  }
  2017  
  2018  // compileRotr implements compiler.compileRotr for the arm64 architecture.
  2019  func (c *arm64Compiler) compileRotr(o *wazeroir.UnionOperation) error {
  2020  	x1, x2, err := c.popTwoValuesOnRegisters()
  2021  	if err != nil {
  2022  		return err
  2023  	}
  2024  
  2025  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  2026  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2027  		return nil
  2028  	}
  2029  
  2030  	var inst asm.Instruction
  2031  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  2032  	switch unsignedInt {
  2033  	case wazeroir.UnsignedInt32:
  2034  		inst = arm64.RORW
  2035  	case wazeroir.UnsignedInt64:
  2036  		inst = arm64.ROR
  2037  	}
  2038  
  2039  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  2040  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2041  	return nil
  2042  }
  2043  
  2044  // compileAbs implements compiler.compileAbs for the arm64 architecture.
  2045  func (c *arm64Compiler) compileAbs(o *wazeroir.UnionOperation) error {
  2046  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2047  		return c.compileSimpleUnop(arm64.FABSS, runtimeValueTypeF32)
  2048  	} else {
  2049  		return c.compileSimpleUnop(arm64.FABSD, runtimeValueTypeF64)
  2050  	}
  2051  }
  2052  
  2053  // compileNeg implements compiler.compileNeg for the arm64 architecture.
  2054  func (c *arm64Compiler) compileNeg(o *wazeroir.UnionOperation) error {
  2055  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2056  		return c.compileSimpleUnop(arm64.FNEGS, runtimeValueTypeF32)
  2057  	} else {
  2058  		return c.compileSimpleUnop(arm64.FNEGD, runtimeValueTypeF64)
  2059  	}
  2060  }
  2061  
  2062  // compileCeil implements compiler.compileCeil for the arm64 architecture.
  2063  func (c *arm64Compiler) compileCeil(o *wazeroir.UnionOperation) error {
  2064  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2065  		return c.compileSimpleUnop(arm64.FRINTPS, runtimeValueTypeF32)
  2066  	} else {
  2067  		return c.compileSimpleUnop(arm64.FRINTPD, runtimeValueTypeF64)
  2068  	}
  2069  }
  2070  
  2071  // compileFloor implements compiler.compileFloor for the arm64 architecture.
  2072  func (c *arm64Compiler) compileFloor(o *wazeroir.UnionOperation) error {
  2073  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2074  		return c.compileSimpleUnop(arm64.FRINTMS, runtimeValueTypeF32)
  2075  	} else {
  2076  		return c.compileSimpleUnop(arm64.FRINTMD, runtimeValueTypeF64)
  2077  	}
  2078  }
  2079  
  2080  // compileTrunc implements compiler.compileTrunc for the arm64 architecture.
  2081  func (c *arm64Compiler) compileTrunc(o *wazeroir.UnionOperation) error {
  2082  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2083  		return c.compileSimpleUnop(arm64.FRINTZS, runtimeValueTypeF32)
  2084  	} else {
  2085  		return c.compileSimpleUnop(arm64.FRINTZD, runtimeValueTypeF64)
  2086  	}
  2087  }
  2088  
  2089  // compileNearest implements compiler.compileNearest for the arm64 architecture.
  2090  func (c *arm64Compiler) compileNearest(o *wazeroir.UnionOperation) error {
  2091  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2092  		return c.compileSimpleUnop(arm64.FRINTNS, runtimeValueTypeF32)
  2093  	} else {
  2094  		return c.compileSimpleUnop(arm64.FRINTND, runtimeValueTypeF64)
  2095  	}
  2096  }
  2097  
  2098  // compileSqrt implements compiler.compileSqrt for the arm64 architecture.
  2099  func (c *arm64Compiler) compileSqrt(o *wazeroir.UnionOperation) error {
  2100  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2101  		return c.compileSimpleUnop(arm64.FSQRTS, runtimeValueTypeF32)
  2102  	} else {
  2103  		return c.compileSimpleUnop(arm64.FSQRTD, runtimeValueTypeF64)
  2104  	}
  2105  }
  2106  
  2107  // compileMin implements compiler.compileMin for the arm64 architecture.
  2108  func (c *arm64Compiler) compileMin(o *wazeroir.UnionOperation) error {
  2109  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2110  		return c.compileSimpleFloatBinop(arm64.FMINS)
  2111  	} else {
  2112  		return c.compileSimpleFloatBinop(arm64.FMIND)
  2113  	}
  2114  }
  2115  
  2116  // compileMax implements compiler.compileMax for the arm64 architecture.
  2117  func (c *arm64Compiler) compileMax(o *wazeroir.UnionOperation) error {
  2118  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2119  		return c.compileSimpleFloatBinop(arm64.FMAXS)
  2120  	} else {
  2121  		return c.compileSimpleFloatBinop(arm64.FMAXD)
  2122  	}
  2123  }
  2124  
  2125  func (c *arm64Compiler) compileSimpleFloatBinop(inst asm.Instruction) error {
  2126  	x1, x2, err := c.popTwoValuesOnRegisters()
  2127  	if err != nil {
  2128  		return err
  2129  	}
  2130  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  2131  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2132  	return nil
  2133  }
  2134  
  2135  // compileCopysign implements compiler.compileCopysign for the arm64 architecture.
  2136  func (c *arm64Compiler) compileCopysign(o *wazeroir.UnionOperation) error {
  2137  	x1, x2, err := c.popTwoValuesOnRegisters()
  2138  	if err != nil {
  2139  		return err
  2140  	}
  2141  
  2142  	var ldr asm.Instruction
  2143  	var minValueOffsetInVM int64
  2144  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2145  		ldr = arm64.FLDRS
  2146  		minValueOffsetInVM = arm64CallEngineArchContextMinimum32BitSignedIntOffset
  2147  	} else {
  2148  		ldr = arm64.FLDRD
  2149  		minValueOffsetInVM = arm64CallEngineArchContextMinimum64BitSignedIntOffset
  2150  	}
  2151  
  2152  	c.markRegisterUsed(x1.register, x2.register)
  2153  	freg, err := c.allocateRegister(registerTypeVector)
  2154  	if err != nil {
  2155  		return err
  2156  	}
  2157  
  2158  	// This is exactly the same code emitted by GCC for "__builtin_copysign":
  2159  	//
  2160  	//    mov     x0, -9223372036854775808
  2161  	//    fmov    d2, x0
  2162  	//    vbit    v0.8b, v1.8b, v2.8b
  2163  	//
  2164  	// "mov freg, -9223372036854775808 (stored at ce.minimum64BitSignedInt)"
  2165  	c.assembler.CompileMemoryToRegister(
  2166  		ldr,
  2167  		arm64ReservedRegisterForCallEngine, minValueOffsetInVM,
  2168  		freg,
  2169  	)
  2170  
  2171  	// VBIT inserts each bit from the first operand into the destination if the corresponding bit of the second operand is 1,
  2172  	// otherwise it leaves the destination bit unchanged.
  2173  	// See https://developer.arm.com/documentation/dui0801/g/Advanced-SIMD-Instructions--32-bit-/VBIT
  2174  	//
  2175  	// "vbit vreg.8b, x2vreg.8b, x1vreg.8b" == "insert the sign bit (the 64th bit) of x2 into x1".
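        	//
        	// For example, copysign(1.5, -2.0) == -1.5: the mask in freg selects only the
        	// sign bit, so x2's sign bit is copied into x1 while all other bits of x1 are
        	// left unchanged.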
  2176  	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VBIT,
  2177  		freg, x2.register, x1.register, arm64.VectorArrangement16B)
  2178  
  2179  	c.markRegisterUnused(x2.register)
  2180  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2181  	return nil
  2182  }
  2183  
  2184  // compileI32WrapFromI64 implements compiler.compileI32WrapFromI64 for the arm64 architecture.
  2185  func (c *arm64Compiler) compileI32WrapFromI64() error {
  2186  	return c.compileSimpleUnop(arm64.MOVW, runtimeValueTypeI32)
  2187  }
  2188  
  2189  // compileITruncFromF implements compiler.compileITruncFromF for the arm64 architecture.
  2190  func (c *arm64Compiler) compileITruncFromF(o *wazeroir.UnionOperation) error {
  2191  	// Clear the floating point status register (FPSR).
  2192  	c.assembler.CompileRegisterToRegister(arm64.MSR, arm64.RegRZR, arm64.RegFPSR)
  2193  
  2194  	var vt runtimeValueType
  2195  	var convinst asm.Instruction
  2196  	inputType := wazeroir.Float(o.B1)
  2197  	outputType := wazeroir.SignedInt(o.B2)
  2198  	nonTrapping := o.B3
  2199  
  2200  	is32bitFloat := inputType == wazeroir.Float32
  2201  	if is32bitFloat && outputType == wazeroir.SignedInt32 {
  2202  		convinst = arm64.FCVTZSSW
  2203  		vt = runtimeValueTypeI32
  2204  	} else if is32bitFloat && outputType == wazeroir.SignedInt64 {
  2205  		convinst = arm64.FCVTZSS
  2206  		vt = runtimeValueTypeI64
  2207  	} else if !is32bitFloat && outputType == wazeroir.SignedInt32 {
  2208  		convinst = arm64.FCVTZSDW
  2209  		vt = runtimeValueTypeI32
  2210  	} else if !is32bitFloat && outputType == wazeroir.SignedInt64 {
  2211  		convinst = arm64.FCVTZSD
  2212  		vt = runtimeValueTypeI64
  2213  	} else if is32bitFloat && outputType == wazeroir.SignedUint32 {
  2214  		convinst = arm64.FCVTZUSW
  2215  		vt = runtimeValueTypeI32
  2216  	} else if is32bitFloat && outputType == wazeroir.SignedUint64 {
  2217  		convinst = arm64.FCVTZUS
  2218  		vt = runtimeValueTypeI64
  2219  	} else if !is32bitFloat && outputType == wazeroir.SignedUint32 {
  2220  		convinst = arm64.FCVTZUDW
  2221  		vt = runtimeValueTypeI32
  2222  	} else if !is32bitFloat && outputType == wazeroir.SignedUint64 {
  2223  		convinst = arm64.FCVTZUD
  2224  		vt = runtimeValueTypeI64
  2225  	}
  2226  
  2227  	source, err := c.popValueOnRegister()
  2228  	if err != nil {
  2229  		return err
  2230  	}
  2231  	sourceReg := source.register
  2232  
  2233  	destinationReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2234  	if err != nil {
  2235  		return err
  2236  	}
  2237  
  2238  	c.assembler.CompileRegisterToRegister(convinst, sourceReg, destinationReg)
  2239  	c.pushRuntimeValueLocationOnRegister(destinationReg, vt)
  2240  
  2241  	if !nonTrapping {
  2242  		// Load the floating point status register (FPSR) value into the temporary register
  2243  		// so that we can check whether the conversion resulted in undefined behavior.
  2244  		c.assembler.CompileRegisterToRegister(arm64.MRS, arm64.RegFPSR, arm64ReservedRegisterForTemporary)
  2245  		// Check if the conversion raised the invalid operation exception by comparing the status with 1 (the IOC flag, FPSR bit 0).
  2246  		// See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register
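        		// For example, converting NaN or an out-of-range value (such as converting
        		// float32(3.4e38) to i32) raises the invalid operation exception, which sets
        		// the cumulative IOC flag.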
  2247  		c.assembler.CompileRegisterAndConstToNone(arm64.CMP, arm64ReservedRegisterForTemporary, 1)
  2248  
  2249  		brOK := c.assembler.CompileJump(arm64.BCONDNE)
  2250  
  2251  		// If so, exit the execution with an error that depends on whether the source value is NaN.
  2252  		var floatcmp asm.Instruction
  2253  		if is32bitFloat {
  2254  			floatcmp = arm64.FCMPS
  2255  		} else {
  2256  			floatcmp = arm64.FCMPD
  2257  		}
  2258  		c.assembler.CompileTwoRegistersToNone(floatcmp, sourceReg, sourceReg)
  2259  		// The VS condition holds (V flag set) if at least one of the FCMP operands is NaN.
  2260  		// https://developer.arm.com/documentation/dui0801/g/Condition-Codes/Comparison-of-condition-code-meanings-in-integer-and-floating-point-code
  2261  		// If the source value is not NaN, the failure was an overflow.
  2262  		c.compileMaybeExitFromNativeCode(arm64.BCONDVS, nativeCallStatusIntegerOverflow)
  2263  
  2264  		// Otherwise, the operation was invalid as it attempted to convert NaN to an integer.
  2265  		c.compileExitFromNativeCode(nativeCallStatusCodeInvalidFloatToIntConversion)
  2266  
  2267  		// The successful conversion path branches here and continues with the next instruction.
  2268  		c.assembler.SetJumpTargetOnNext(brOK)
  2269  	}
  2270  	return nil
  2271  }
  2272  
  2273  // compileFConvertFromI implements compiler.compileFConvertFromI for the arm64 architecture.
  2274  func (c *arm64Compiler) compileFConvertFromI(o *wazeroir.UnionOperation) error {
  2275  	var convinst asm.Instruction
  2276  	inputType := wazeroir.SignedInt(o.B1)
  2277  	outputType := wazeroir.Float(o.B2)
  2278  
  2279  	if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt32 {
  2280  		convinst = arm64.SCVTFWS
  2281  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt64 {
  2282  		convinst = arm64.SCVTFS
  2283  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt32 {
  2284  		convinst = arm64.SCVTFWD
  2285  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt64 {
  2286  		convinst = arm64.SCVTFD
  2287  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint32 {
  2288  		convinst = arm64.UCVTFWS
  2289  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint64 {
  2290  		convinst = arm64.UCVTFS
  2291  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint32 {
  2292  		convinst = arm64.UCVTFWD
  2293  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint64 {
  2294  		convinst = arm64.UCVTFD
  2295  	}
  2296  
  2297  	var vt runtimeValueType
  2298  	if outputType == wazeroir.Float32 {
  2299  		vt = runtimeValueTypeF32
  2300  	} else {
  2301  		vt = runtimeValueTypeF64
  2302  	}
  2303  	return c.compileSimpleConversion(convinst, registerTypeVector, vt)
  2304  }
  2305  
  2306  // compileF32DemoteFromF64 implements compiler.compileF32DemoteFromF64 for the arm64 architecture.
  2307  func (c *arm64Compiler) compileF32DemoteFromF64() error {
  2308  	return c.compileSimpleUnop(arm64.FCVTDS, runtimeValueTypeF32)
  2309  }
  2310  
  2311  // compileF64PromoteFromF32 implements compiler.compileF64PromoteFromF32 for the arm64 architecture.
  2312  func (c *arm64Compiler) compileF64PromoteFromF32() error {
  2313  	return c.compileSimpleUnop(arm64.FCVTSD, runtimeValueTypeF64)
  2314  }
  2315  
  2316  // compileI32ReinterpretFromF32 implements compiler.compileI32ReinterpretFromF32 for the arm64 architecture.
  2317  func (c *arm64Compiler) compileI32ReinterpretFromF32() error {
  2318  	if peek := c.locationStack.peek(); peek.onStack() {
  2319  		// If the value is on the stack, this is a no-op since reinterpretation doesn't change the bit pattern; only the value type changes.
  2320  		peek.valueType = runtimeValueTypeI32
  2321  		return nil
  2322  	}
  2323  	return c.compileSimpleConversion(arm64.FMOVS, registerTypeGeneralPurpose, runtimeValueTypeI32)
  2324  }
  2325  
  2326  // compileI64ReinterpretFromF64 implements compiler.compileI64ReinterpretFromF64 for the arm64 architecture.
  2327  func (c *arm64Compiler) compileI64ReinterpretFromF64() error {
  2328  	if peek := c.locationStack.peek(); peek.onStack() {
  2329  		// If the value is on the stack, this is a no-op since reinterpretation doesn't change the bit pattern; only the value type changes.
  2330  		peek.valueType = runtimeValueTypeI64
  2331  		return nil
  2332  	}
  2333  	return c.compileSimpleConversion(arm64.FMOVD, registerTypeGeneralPurpose, runtimeValueTypeI64)
  2334  }
  2335  
  2336  // compileF32ReinterpretFromI32 implements compiler.compileF32ReinterpretFromI32 for the arm64 architecture.
  2337  func (c *arm64Compiler) compileF32ReinterpretFromI32() error {
  2338  	if peek := c.locationStack.peek(); peek.onStack() {
  2339  		// If the value is on the stack, this is a no-op since reinterpretation doesn't change the bit pattern; only the value type changes.
  2340  		peek.valueType = runtimeValueTypeF32
  2341  		return nil
  2342  	}
  2343  	return c.compileSimpleConversion(arm64.FMOVS, registerTypeVector, runtimeValueTypeF32)
  2344  }
  2345  
  2346  // compileF64ReinterpretFromI64 implements compiler.compileF64ReinterpretFromI64 for the arm64 architecture.
  2347  func (c *arm64Compiler) compileF64ReinterpretFromI64() error {
  2348  	if peek := c.locationStack.peek(); peek.onStack() {
  2349  		// If the value is on the stack, this is a no-op since reinterpretation doesn't change the bit pattern; only the value type changes.
  2350  		peek.valueType = runtimeValueTypeF64
  2351  		return nil
  2352  	}
  2353  	return c.compileSimpleConversion(arm64.FMOVD, registerTypeVector, runtimeValueTypeF64)
  2354  }
  2355  
  2356  func (c *arm64Compiler) compileSimpleConversion(inst asm.Instruction, destinationRegType registerType, resultRuntimeValueType runtimeValueType) error {
  2357  	source, err := c.popValueOnRegister()
  2358  	if err != nil {
  2359  		return err
  2360  	}
  2361  
  2362  	destinationReg, err := c.allocateRegister(destinationRegType)
  2363  	if err != nil {
  2364  		return err
  2365  	}
  2366  
  2367  	c.assembler.CompileRegisterToRegister(inst, source.register, destinationReg)
  2368  	c.pushRuntimeValueLocationOnRegister(destinationReg, resultRuntimeValueType)
  2369  	return nil
  2370  }
  2371  
  2372  // compileExtend implements compiler.compileExtend for the arm64 architecture.
  2373  func (c *arm64Compiler) compileExtend(o *wazeroir.UnionOperation) error {
  2374  	signed := o.B1 != 0
  2375  	if signed {
  2376  		return c.compileSimpleUnop(arm64.SXTW, runtimeValueTypeI64)
  2377  	} else {
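        		// Note: writing to a 32-bit (W) register on arm64 implicitly zeroes the upper
        		// 32 bits, so MOVW alone performs the unsigned (zero) extension.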
  2378  		return c.compileSimpleUnop(arm64.MOVW, runtimeValueTypeI64)
  2379  	}
  2380  }
  2381  
  2382  // compileSignExtend32From8 implements compiler.compileSignExtend32From8 for the arm64 architecture.
  2383  func (c *arm64Compiler) compileSignExtend32From8() error {
  2384  	return c.compileSimpleUnop(arm64.SXTBW, runtimeValueTypeI32)
  2385  }
  2386  
  2387  // compileSignExtend32From16 implements compiler.compileSignExtend32From16 for the arm64 architecture.
  2388  func (c *arm64Compiler) compileSignExtend32From16() error {
  2389  	return c.compileSimpleUnop(arm64.SXTHW, runtimeValueTypeI32)
  2390  }
  2391  
  2392  // compileSignExtend64From8 implements compiler.compileSignExtend64From8 for the arm64 architecture.
  2393  func (c *arm64Compiler) compileSignExtend64From8() error {
  2394  	return c.compileSimpleUnop(arm64.SXTB, runtimeValueTypeI64)
  2395  }
  2396  
  2397  // compileSignExtend64From16 implements compiler.compileSignExtend64From16 for the arm64 architecture.
  2398  func (c *arm64Compiler) compileSignExtend64From16() error {
  2399  	return c.compileSimpleUnop(arm64.SXTH, runtimeValueTypeI64)
  2400  }
  2401  
  2402  // compileSignExtend64From32 implements compiler.compileSignExtend64From32 for the arm64 architecture.
  2403  func (c *arm64Compiler) compileSignExtend64From32() error {
  2404  	return c.compileSimpleUnop(arm64.SXTW, runtimeValueTypeI64)
  2405  }
  2406  
  2407  func (c *arm64Compiler) compileSimpleUnop(inst asm.Instruction, resultRuntimeValueType runtimeValueType) error {
  2408  	v, err := c.popValueOnRegister()
  2409  	if err != nil {
  2410  		return err
  2411  	}
  2412  	reg := v.register
  2413  	c.assembler.CompileRegisterToRegister(inst, reg, reg)
  2414  	c.pushRuntimeValueLocationOnRegister(reg, resultRuntimeValueType)
  2415  	return nil
  2416  }
  2417  
  2418  // compileEq implements compiler.compileEq for the arm64 architecture.
  2419  func (c *arm64Compiler) compileEq(o *wazeroir.UnionOperation) error {
  2420  	return c.emitEqOrNe(true, wazeroir.UnsignedType(o.B1))
  2421  }
  2422  
  2423  // compileNe implements compiler.compileNe for the arm64 architecture.
  2424  func (c *arm64Compiler) compileNe(o *wazeroir.UnionOperation) error {
  2425  	return c.emitEqOrNe(false, wazeroir.UnsignedType(o.B1))
  2426  }
  2427  
  2428  // emitEqOrNe implements compiler.compileEq and compiler.compileNe for the arm64 architecture.
  2429  func (c *arm64Compiler) emitEqOrNe(isEq bool, unsignedType wazeroir.UnsignedType) error {
  2430  	x1, x2, err := c.popTwoValuesOnRegisters()
  2431  	if err != nil {
  2432  		return err
  2433  	}
  2434  
  2435  	var inst asm.Instruction
  2436  	switch unsignedType {
  2437  	case wazeroir.UnsignedTypeI32:
  2438  		inst = arm64.CMPW
  2439  	case wazeroir.UnsignedTypeI64:
  2440  		inst = arm64.CMP
  2441  	case wazeroir.UnsignedTypeF32:
  2442  		inst = arm64.FCMPS
  2443  	case wazeroir.UnsignedTypeF64:
  2444  		inst = arm64.FCMPD
  2445  	}
  2446  
  2447  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2448  
  2449  	// Push the comparison result as a conditional register value.
  2450  	cond := arm64.CondNE
  2451  	if isEq {
  2452  		cond = arm64.CondEQ
  2453  	}
  2454  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(cond)
  2455  	return nil
  2456  }
  2457  
  2458  // compileEqz implements compiler.compileEqz for the arm64 architecture.
  2459  func (c *arm64Compiler) compileEqz(o *wazeroir.UnionOperation) error {
  2460  	x1, err := c.popValueOnRegister()
  2461  	if err != nil {
  2462  		return err
  2463  	}
  2464  
  2465  	var inst asm.Instruction
  2466  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  2467  	switch unsignedInt {
  2468  	case wazeroir.UnsignedInt32:
  2469  		inst = arm64.CMPW
  2470  	case wazeroir.UnsignedInt64:
  2471  		inst = arm64.CMP
  2472  	}
  2473  
  2474  	c.assembler.CompileTwoRegistersToNone(inst, arm64.RegRZR, x1.register)
  2475  
  2476  	// Push the comparison result as a conditional register value.
  2477  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(arm64.CondEQ)
  2478  	return nil
  2479  }
  2480  
  2481  // compileLt implements compiler.compileLt for the arm64 architecture.
  2482  func (c *arm64Compiler) compileLt(o *wazeroir.UnionOperation) error {
  2483  	x1, x2, err := c.popTwoValuesOnRegisters()
  2484  	if err != nil {
  2485  		return err
  2486  	}
  2487  
  2488  	var inst asm.Instruction
  2489  	var conditionalRegister asm.ConditionalRegisterState
  2490  	signedType := wazeroir.SignedType(o.B1)
  2491  	switch signedType {
  2492  	case wazeroir.SignedTypeUint32:
  2493  		inst = arm64.CMPW
  2494  		conditionalRegister = arm64.CondLO
  2495  	case wazeroir.SignedTypeUint64:
  2496  		inst = arm64.CMP
  2497  		conditionalRegister = arm64.CondLO
  2498  	case wazeroir.SignedTypeInt32:
  2499  		inst = arm64.CMPW
  2500  		conditionalRegister = arm64.CondLT
  2501  	case wazeroir.SignedTypeInt64:
  2502  		inst = arm64.CMP
  2503  		conditionalRegister = arm64.CondLT
  2504  	case wazeroir.SignedTypeFloat32:
  2505  		inst = arm64.FCMPS
  2506  		conditionalRegister = arm64.CondMI
  2507  	case wazeroir.SignedTypeFloat64:
  2508  		inst = arm64.FCMPD
  2509  		conditionalRegister = arm64.CondMI
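        		// Note: CondMI (N flag set) is used rather than CondLT so that comparisons
        		// involving NaN evaluate to false: an unordered FCMP sets N=0, Z=0, C=1, V=1,
        		// which fails MI, matching Wasm's float comparison semantics.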
  2510  	}
  2511  
  2512  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2513  
  2514  	// Push the comparison result as a conditional register value.
  2515  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2516  	return nil
  2517  }
  2518  
  2519  // compileGt implements compiler.compileGt for the arm64 architecture.
  2520  func (c *arm64Compiler) compileGt(o *wazeroir.UnionOperation) error {
  2521  	x1, x2, err := c.popTwoValuesOnRegisters()
  2522  	if err != nil {
  2523  		return err
  2524  	}
  2525  
  2526  	var inst asm.Instruction
  2527  	var conditionalRegister asm.ConditionalRegisterState
  2528  	signedType := wazeroir.SignedType(o.B1)
  2529  	switch signedType {
  2530  	case wazeroir.SignedTypeUint32:
  2531  		inst = arm64.CMPW
  2532  		conditionalRegister = arm64.CondHI
  2533  	case wazeroir.SignedTypeUint64:
  2534  		inst = arm64.CMP
  2535  		conditionalRegister = arm64.CondHI
  2536  	case wazeroir.SignedTypeInt32:
  2537  		inst = arm64.CMPW
  2538  		conditionalRegister = arm64.CondGT
  2539  	case wazeroir.SignedTypeInt64:
  2540  		inst = arm64.CMP
  2541  		conditionalRegister = arm64.CondGT
  2542  	case wazeroir.SignedTypeFloat32:
  2543  		inst = arm64.FCMPS
  2544  		conditionalRegister = arm64.CondGT
  2545  	case wazeroir.SignedTypeFloat64:
  2546  		inst = arm64.FCMPD
  2547  		conditionalRegister = arm64.CondGT
  2548  	}
  2549  
  2550  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2551  
  2552  	// Push the comparison result as a conditional register value.
  2553  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2554  	return nil
  2555  }
  2556  
  2557  // compileLe implements compiler.compileLe for the arm64 architecture.
  2558  func (c *arm64Compiler) compileLe(o *wazeroir.UnionOperation) error {
  2559  	x1, x2, err := c.popTwoValuesOnRegisters()
  2560  	if err != nil {
  2561  		return err
  2562  	}
  2563  
  2564  	var inst asm.Instruction
  2565  	var conditionalRegister asm.ConditionalRegisterState
  2566  	signedType := wazeroir.SignedType(o.B1)
  2567  	switch signedType {
  2568  	case wazeroir.SignedTypeUint32:
  2569  		inst = arm64.CMPW
  2570  		conditionalRegister = arm64.CondLS
  2571  	case wazeroir.SignedTypeUint64:
  2572  		inst = arm64.CMP
  2573  		conditionalRegister = arm64.CondLS
  2574  	case wazeroir.SignedTypeInt32:
  2575  		inst = arm64.CMPW
  2576  		conditionalRegister = arm64.CondLE
  2577  	case wazeroir.SignedTypeInt64:
  2578  		inst = arm64.CMP
  2579  		conditionalRegister = arm64.CondLE
  2580  	case wazeroir.SignedTypeFloat32:
  2581  		inst = arm64.FCMPS
  2582  		conditionalRegister = arm64.CondLS
  2583  	case wazeroir.SignedTypeFloat64:
  2584  		inst = arm64.FCMPD
  2585  		conditionalRegister = arm64.CondLS
  2586  	}
  2587  
  2588  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2589  
  2590  	// Push the comparison result as a conditional register value.
  2591  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2592  	return nil
  2593  }
  2594  
  2595  // compileGe implements compiler.compileGe for the arm64 architecture.
  2596  func (c *arm64Compiler) compileGe(o *wazeroir.UnionOperation) error {
  2597  	x1, x2, err := c.popTwoValuesOnRegisters()
  2598  	if err != nil {
  2599  		return err
  2600  	}
  2601  
  2602  	var inst asm.Instruction
  2603  	var conditionalRegister asm.ConditionalRegisterState
  2604  	signedType := wazeroir.SignedType(o.B1)
  2605  	switch signedType {
  2606  	case wazeroir.SignedTypeUint32:
  2607  		inst = arm64.CMPW
  2608  		conditionalRegister = arm64.CondHS
  2609  	case wazeroir.SignedTypeUint64:
  2610  		inst = arm64.CMP
  2611  		conditionalRegister = arm64.CondHS
  2612  	case wazeroir.SignedTypeInt32:
  2613  		inst = arm64.CMPW
  2614  		conditionalRegister = arm64.CondGE
  2615  	case wazeroir.SignedTypeInt64:
  2616  		inst = arm64.CMP
  2617  		conditionalRegister = arm64.CondGE
  2618  	case wazeroir.SignedTypeFloat32:
  2619  		inst = arm64.FCMPS
  2620  		conditionalRegister = arm64.CondGE
  2621  	case wazeroir.SignedTypeFloat64:
  2622  		inst = arm64.FCMPD
  2623  		conditionalRegister = arm64.CondGE
  2624  	}
  2625  
  2626  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2627  
  2628  	// Push the comparison result as a conditional register value.
  2629  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2630  	return nil
  2631  }
  2632  
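// The four comparison compilers above (compileLt, compileGt, compileLe, compileGe) all follow
// the same shape: emit one CMP/CMPW/FCMPS/FCMPD on the operands, then push the result as a
// conditional register value. As an illustrative sketch (a hypothetical helper, not used by
// the compiler), the condition-code selection for "less than" can be restated as a pure
// function of the wazeroir signedness:
func lessThanCondSketch(signedType wazeroir.SignedType) asm.ConditionalRegisterState {
	switch signedType {
	case wazeroir.SignedTypeUint32, wazeroir.SignedTypeUint64:
		return arm64.CondLO // unsigned "lower".
	case wazeroir.SignedTypeInt32, wazeroir.SignedTypeInt64:
		return arm64.CondLT // signed "less than".
	default: // SignedTypeFloat32, SignedTypeFloat64.
		return arm64.CondMI // FCMP sets N only for an ordered "less than", so NaN yields false.
	}
}
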
  2633  // compileLoad implements compiler.compileLoad for the arm64 architecture.
  2634  func (c *arm64Compiler) compileLoad(o *wazeroir.UnionOperation) error {
  2635  	var (
  2636  		isFloat           bool
  2637  		loadInst          asm.Instruction
  2638  		targetSizeInBytes int64
  2639  		vt                runtimeValueType
  2640  	)
  2641  
  2642  	unsignedType := wazeroir.UnsignedType(o.B1)
  2643  	offset := uint32(o.U2)
  2644  
  2645  	switch unsignedType {
  2646  	case wazeroir.UnsignedTypeI32:
  2647  		loadInst = arm64.LDRW
  2648  		targetSizeInBytes = 32 / 8
  2649  		vt = runtimeValueTypeI32
  2650  	case wazeroir.UnsignedTypeI64:
  2651  		loadInst = arm64.LDRD
  2652  		targetSizeInBytes = 64 / 8
  2653  		vt = runtimeValueTypeI64
  2654  	case wazeroir.UnsignedTypeF32:
  2655  		loadInst = arm64.FLDRS
  2656  		isFloat = true
  2657  		targetSizeInBytes = 32 / 8
  2658  		vt = runtimeValueTypeF32
  2659  	case wazeroir.UnsignedTypeF64:
  2660  		loadInst = arm64.FLDRD
  2661  		isFloat = true
  2662  		targetSizeInBytes = 64 / 8
  2663  		vt = runtimeValueTypeF64
  2664  	}
  2665  	return c.compileLoadImpl(offset, loadInst, targetSizeInBytes, isFloat, vt)
  2666  }
  2667  
  2668  // compileLoad8 implements compiler.compileLoad8 for the arm64 architecture.
  2669  func (c *arm64Compiler) compileLoad8(o *wazeroir.UnionOperation) error {
  2670  	var loadInst asm.Instruction
  2671  	var vt runtimeValueType
  2672  
  2673  	signedInt := wazeroir.SignedInt(o.B1)
  2674  	offset := uint32(o.U2)
  2675  
  2676  	switch signedInt {
  2677  	case wazeroir.SignedInt32:
  2678  		loadInst = arm64.LDRSBW
  2679  		vt = runtimeValueTypeI32
  2680  	case wazeroir.SignedInt64:
  2681  		loadInst = arm64.LDRSBD
  2682  		vt = runtimeValueTypeI64
  2683  	case wazeroir.SignedUint32:
  2684  		loadInst = arm64.LDRB
  2685  		vt = runtimeValueTypeI32
  2686  	case wazeroir.SignedUint64:
  2687  		loadInst = arm64.LDRB
  2688  		vt = runtimeValueTypeI64
  2689  	}
  2690  	return c.compileLoadImpl(offset, loadInst, 1, false, vt)
  2691  }
  2692  
  2693  // compileLoad16 implements compiler.compileLoad16 for the arm64 architecture.
  2694  func (c *arm64Compiler) compileLoad16(o *wazeroir.UnionOperation) error {
  2695  	var loadInst asm.Instruction
  2696  	var vt runtimeValueType
  2697  
  2698  	signedInt := wazeroir.SignedInt(o.B1)
  2699  	offset := uint32(o.U2)
  2700  
  2701  	switch signedInt {
  2702  	case wazeroir.SignedInt32:
  2703  		loadInst = arm64.LDRSHW
  2704  		vt = runtimeValueTypeI32
  2705  	case wazeroir.SignedInt64:
  2706  		loadInst = arm64.LDRSHD
  2707  		vt = runtimeValueTypeI64
  2708  	case wazeroir.SignedUint32:
  2709  		loadInst = arm64.LDRH
  2710  		vt = runtimeValueTypeI32
  2711  	case wazeroir.SignedUint64:
  2712  		loadInst = arm64.LDRH
  2713  		vt = runtimeValueTypeI64
  2714  	}
  2715  	return c.compileLoadImpl(offset, loadInst, 16/8, false, vt)
  2716  }
  2717  
  2718  // compileLoad32 implements compiler.compileLoad32 for the arm64 architecture.
  2719  func (c *arm64Compiler) compileLoad32(o *wazeroir.UnionOperation) error {
  2720  	var loadInst asm.Instruction
  2721  	signed := o.B1 == 1
  2722  	offset := uint32(o.U2)
  2723  
  2724  	if signed {
  2725  		loadInst = arm64.LDRSW
  2726  	} else {
  2727  		loadInst = arm64.LDRW
  2728  	}
  2729  	return c.compileLoadImpl(offset, loadInst, 32/8, false, runtimeValueTypeI64)
  2730  }
  2731  
  2732  // compileLoadImpl implements the compileLoad* variants for the arm64 architecture.
  2733  func (c *arm64Compiler) compileLoadImpl(offsetArg uint32, loadInst asm.Instruction,
  2734  	targetSizeInBytes int64, isFloat bool, resultRuntimeValueType runtimeValueType,
  2735  ) error {
  2736  	offsetReg, err := c.compileMemoryAccessOffsetSetup(offsetArg, targetSizeInBytes)
  2737  	if err != nil {
  2738  		return err
  2739  	}
  2740  
  2741  	resultRegister := offsetReg
  2742  	if isFloat {
  2743  		resultRegister, err = c.allocateRegister(registerTypeVector)
  2744  		if err != nil {
  2745  			return err
  2746  		}
  2747  	}
  2748  
  2749  	// "resultRegister = [arm64ReservedRegisterForMemory + offsetReg]"
  2750  	// In other words, "resultRegister = memory.Buffer[offset: offset+targetSizeInBytes]"
  2751  	c.assembler.CompileMemoryWithRegisterOffsetToRegister(
  2752  		loadInst,
  2753  		arm64ReservedRegisterForMemory, offsetReg,
  2754  		resultRegister,
  2755  	)
  2756  
  2757  	c.pushRuntimeValueLocationOnRegister(resultRegister, resultRuntimeValueType)
  2758  	return nil
  2759  }
  2760  
  2761  // compileStore implements compiler.compileStore for the arm64 architecture.
  2762  func (c *arm64Compiler) compileStore(o *wazeroir.UnionOperation) error {
  2763  	var movInst asm.Instruction
  2764  	var targetSizeInBytes int64
  2765  	unsignedType := wazeroir.UnsignedType(o.B1)
  2766  	offset := uint32(o.U2)
  2767  	switch unsignedType {
  2768  	case wazeroir.UnsignedTypeI32:
  2769  		movInst = arm64.STRW
  2770  		targetSizeInBytes = 32 / 8
  2771  	case wazeroir.UnsignedTypeI64:
  2772  		movInst = arm64.STRD
  2773  		targetSizeInBytes = 64 / 8
  2774  	case wazeroir.UnsignedTypeF32:
  2775  		movInst = arm64.FSTRS
  2776  		targetSizeInBytes = 32 / 8
  2777  	case wazeroir.UnsignedTypeF64:
  2778  		movInst = arm64.FSTRD
  2779  		targetSizeInBytes = 64 / 8
  2780  	}
  2781  	return c.compileStoreImpl(offset, movInst, targetSizeInBytes)
  2782  }
  2783  
  2784  // compileStore8 implements compiler.compileStore8 for the arm64 architecture.
  2785  func (c *arm64Compiler) compileStore8(o *wazeroir.UnionOperation) error {
  2786  	return c.compileStoreImpl(uint32(o.U2), arm64.STRB, 1)
  2787  }
  2788  
  2789  // compileStore16 implements compiler.compileStore16 for the arm64 architecture.
  2790  func (c *arm64Compiler) compileStore16(o *wazeroir.UnionOperation) error {
  2791  	return c.compileStoreImpl(uint32(o.U2), arm64.STRH, 16/8)
  2792  }
  2793  
  2794  // compileStore32 implements compiler.compileStore32 for the arm64 architecture.
  2795  func (c *arm64Compiler) compileStore32(o *wazeroir.UnionOperation) error {
  2796  	return c.compileStoreImpl(uint32(o.U2), arm64.STRW, 32/8)
  2797  }
  2798  
  2799  // compileStoreImpl implements the compileStore* variants for the arm64 architecture.
  2800  func (c *arm64Compiler) compileStoreImpl(offsetArg uint32, storeInst asm.Instruction, targetSizeInBytes int64) error {
  2801  	val, err := c.popValueOnRegister()
  2802  	if err != nil {
  2803  		return err
  2804  	}
  2805  	// Mark the register as used temporarily, since compileMemoryAccessOffsetSetup might allocate a register.
  2806  	c.markRegisterUsed(val.register)
  2807  
  2808  	offsetReg, err := c.compileMemoryAccessOffsetSetup(offsetArg, targetSizeInBytes)
  2809  	if err != nil {
  2810  		return err
  2811  	}
  2812  
  2813  	// "[arm64ReservedRegisterForMemory + offsetReg] = val.register"
  2814  	// In other words, "memory.Buffer[offset: offset+targetSizeInBytes] = val.register"
  2815  	c.assembler.CompileRegisterToMemoryWithRegisterOffset(
  2816  		storeInst, val.register,
  2817  		arm64ReservedRegisterForMemory, offsetReg,
  2818  	)
  2819  
  2820  	c.markRegisterUnused(val.register)
  2821  	return nil
  2822  }
  2823  
  2824  // compileMemoryAccessOffsetSetup pops the top value from the stack (called "base"), stores "base + offsetArg"
  2825  // into a register, and returns the stored register. We call the result "offset" because we access the memory
  2826  // as memory.Buffer[offset: offset+targetSizeInBytes].
  2827  //
  2828  // Note: this also emits the instructions that check for out-of-bounds memory access.
  2829  // In other words, if offset+targetSizeInBytes exceeds the memory size, the code exits with the nativeCallStatusCodeMemoryOutOfBounds status.
  2830  func (c *arm64Compiler) compileMemoryAccessOffsetSetup(offsetArg uint32, targetSizeInBytes int64) (offsetRegister asm.Register, err error) {
  2831  	base, err := c.popValueOnRegister()
  2832  	if err != nil {
  2833  		return 0, err
  2834  	}
  2835  
  2836  	offsetRegister = base.register
  2837  	if isZeroRegister(base.register) {
  2838  		offsetRegister, err = c.allocateRegister(registerTypeGeneralPurpose)
  2839  		if err != nil {
  2840  			return
  2841  		}
  2842  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, offsetRegister)
  2843  	}
  2844  
  2845  	if offsetConst := int64(offsetArg) + targetSizeInBytes; offsetConst <= math.MaxUint32 {
  2846  		// "offsetRegister = base + offsetArg + targetSizeInBytes"
  2847  		c.assembler.CompileConstToRegister(arm64.ADD, offsetConst, offsetRegister)
  2848  	} else {
  2849  		// If the offset const is too large, we exit with nativeCallStatusCodeMemoryOutOfBounds.
  2850  		c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
  2851  		return
  2852  	}
  2853  
  2854  	// "arm64ReservedRegisterForTemporary = len(memory.Buffer)"
  2855  	c.compileLoadMemoryBufferLen(arm64ReservedRegisterForTemporary)
  2856  
  2857  	// Check if offsetRegister(= base+offsetArg+targetSizeInBytes) > len(memory.Buffer).
  2858  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, offsetRegister)
  2859  
  2860  	// If offsetRegister(= base+offsetArg+targetSizeInBytes) exceeds the memory length,
  2861  	//  we exit the function with nativeCallStatusCodeMemoryOutOfBounds.
  2862  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, nativeCallStatusCodeMemoryOutOfBounds)
  2863  
  2864  	// Otherwise, we subtract targetSizeInBytes from offsetRegister.
  2865  	c.assembler.CompileConstToRegister(arm64.SUB, targetSizeInBytes, offsetRegister)
  2866  	return offsetRegister, nil
  2867  }
  2868  
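// A minimal Go restatement of the bounds check emitted above (an illustrative sketch, not
// used by the compiler): the ceiling base+offsetArg+targetSizeInBytes is compared against
// len(mem), and the size is subtracted again afterwards so that the returned offset points
// at the first accessed byte.
func memoryAccessOffsetSketch(mem []byte, base, offsetArg uint32, targetSizeInBytes int64) (offset uint64, ok bool) {
	ceil := uint64(base) + uint64(offsetArg) + uint64(targetSizeInBytes)
	if ceil > uint64(len(mem)) {
		return 0, false // Corresponds to exiting with nativeCallStatusCodeMemoryOutOfBounds.
	}
	return ceil - uint64(targetSizeInBytes), true
}
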
  2869  // compileMemoryAccessBaseSetup pops the top value from the stack (called "base"), stores "memoryBufferStart + base + offsetArg"
  2870  // into a register, and returns the stored register. We call the result "base" because it refers to "base addressing" as
  2871  // per the arm docs, i.e. a read from an address without an offset. The result is equivalent to &memory.Buffer[offset].
  2872  //
  2873  // Note: this also emits the instructions that check for out-of-bounds memory access.
  2874  // In other words, if offset+targetSizeInBytes exceeds the memory size, the code exits with the nativeCallStatusCodeMemoryOutOfBounds status.
  2875  func (c *arm64Compiler) compileMemoryAccessBaseSetup(offsetArg uint32, targetSizeInBytes int64) (baseRegister asm.Register, err error) {
  2876  	offsetReg, err := c.compileMemoryAccessOffsetSetup(offsetArg, targetSizeInBytes)
  2877  	if err != nil {
  2878  		return
  2879  	}
  2880  
  2881  	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, offsetReg)
  2882  	baseRegister = offsetReg
  2883  	return
  2884  }
  2885  
  2886  func (c *arm64Compiler) compileMemoryAlignmentCheck(baseRegister asm.Register, targetSizeInBytes int64) {
  2887  	if targetSizeInBytes == 1 {
  2888  		return // No alignment restrictions when accessing a byte
  2889  	}
  2890  	var checkBits asm.ConstantValue
  2891  	switch targetSizeInBytes {
  2892  	case 2:
  2893  		checkBits = 0b1
  2894  	case 4:
  2895  		checkBits = 0b11
  2896  	case 8:
  2897  		checkBits = 0b111
  2898  	}
  2899  	c.assembler.CompileRegisterAndConstToRegister(arm64.ANDS, baseRegister, checkBits, arm64.RegRZR)
  2900  	c.compileMaybeExitFromNativeCode(arm64.BCONDEQ, nativeCallStatusUnalignedAtomic)
  2901  }
  2902  
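// Sketch of the natural-alignment test above (hypothetical helper, not used by the compiler):
// for a 2/4/8-byte access, the low log2(size) bits of the address must be clear, and the
// checkBits mask is exactly targetSizeInBytes-1.
func isNaturallyAlignedSketch(addr uint64, targetSizeInBytes int64) bool {
	return addr&uint64(targetSizeInBytes-1) == 0
}
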
  2903  // compileMemoryGrow implements compileMemoryGrow variants for arm64 architecture.
  2904  func (c *arm64Compiler) compileMemoryGrow() error {
  2905  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  2906  		return err
  2907  	}
  2908  
  2909  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexMemoryGrow); err != nil {
  2910  		return err
  2911  	}
  2912  
  2913  	// After the call returns, we re-initialize the reserved registers just like in the function preamble, since memory.Grow may have reallocated the underlying buffer.
  2914  	c.compileReservedStackBasePointerRegisterInitialization()
  2915  	c.compileReservedMemoryRegisterInitialization()
  2916  	return nil
  2917  }
  2918  
  2919  // compileMemorySize implements compileMemorySize variants for arm64 architecture.
  2920  func (c *arm64Compiler) compileMemorySize() error {
  2921  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  2922  		return err
  2923  	}
  2924  
  2925  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2926  	if err != nil {
  2927  		return err
  2928  	}
  2929  
  2930  	// "reg = len(memory.Buffer)"
  2931  	c.compileLoadMemoryBufferLen(reg)
  2932  
  2933  	// memory.size returns the size in pages, so we divide the byte length by the page size.
  2934  	// "reg = reg >> wasm.MemoryPageSizeInBits (== reg / wasm.MemoryPageSize) "
  2935  	c.assembler.CompileConstToRegister(
  2936  		arm64.LSR,
  2937  		wasm.MemoryPageSizeInBits,
  2938  		reg,
  2939  	)
  2940  
  2941  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
  2942  	return nil
  2943  }
  2944  
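// Equivalent Go arithmetic for the memory.size result computed above (illustrative only):
// the byte length of the buffer shifted right by wasm.MemoryPageSizeInBits is the size in pages.
func memorySizeInPagesSketch(bufferLen uint64) uint32 {
	return uint32(bufferLen >> wasm.MemoryPageSizeInBits)
}
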
  2945  // compileCallGoFunction adds instructions to call a Go function. When compilerStatus is
  2946  // nativeCallStatusCodeCallBuiltInFunction, builtinFunction holds the index of the builtin to invoke;
  2947  // otherwise compilerStatus should be nativeCallStatusCodeCallGoHostFunction.
  2948  func (c *arm64Compiler) compileCallGoFunction(compilerStatus nativeCallStatusCode, builtinFunction wasm.Index) error {
  2949  	// Release all the registers as our calling convention requires the caller-save.
  2950  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  2951  		return err
  2952  	}
  2953  
  2954  	if compilerStatus == nativeCallStatusCodeCallBuiltInFunction {
  2955  		// Set the index of the target builtin function on the call engine.
  2956  		// "tmp = $index"
  2957  		c.assembler.CompileConstToRegister(
  2958  			arm64.MOVD,
  2959  			int64(builtinFunction),
  2960  			arm64ReservedRegisterForTemporary,
  2961  		)
  2962  		// "[arm64ReservedRegisterForCallEngine + callEngineExitContextBuiltinFunctionCallIndexOffset] = tmp"
  2963  		// In other words, "ce.builtinFunctionCallIndex = tmp (== $index)"
  2964  		c.assembler.CompileRegisterToMemory(
  2965  			arm64.STRW,
  2966  			arm64ReservedRegisterForTemporary,
  2967  			arm64ReservedRegisterForCallEngine, callEngineExitContextBuiltinFunctionCallIndexOffset,
  2968  		)
  2969  	}
  2970  
  2971  	c.compileExitFromNativeCode(compilerStatus)
  2972  	return nil
  2973  }
  2974  
  2975  // compileConstI32 implements compiler.compileConstI32 for the arm64 architecture.
  2976  func (c *arm64Compiler) compileConstI32(o *wazeroir.UnionOperation) error {
  2977  	return c.compileIntConstant(true, o.U1)
  2978  }
  2979  
  2980  // compileConstI64 implements compiler.compileConstI64 for the arm64 architecture.
  2981  func (c *arm64Compiler) compileConstI64(o *wazeroir.UnionOperation) error {
  2982  	return c.compileIntConstant(false, o.U1)
  2983  }
  2984  
  2985  // compileIntConstant adds instructions to load an integer constant.
  2986  // is32bit is true if the target value is originally 32-bit const, false otherwise.
  2987  // value holds the (zero-extended for 32-bit case) load target constant.
  2988  func (c *arm64Compiler) compileIntConstant(is32bit bool, value uint64) error {
  2989  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  2990  		return err
  2991  	}
  2992  
  2993  	var inst asm.Instruction
  2994  	var vt runtimeValueType
  2995  	if is32bit {
  2996  		inst = arm64.MOVW
  2997  		vt = runtimeValueTypeI32
  2998  	} else {
  2999  		inst = arm64.MOVD
  3000  		vt = runtimeValueTypeI64
  3001  	}
  3002  
  3003  	if value == 0 {
  3004  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, vt)
  3005  	} else {
  3006  		// Take a register to load the value.
  3007  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  3008  		if err != nil {
  3009  			return err
  3010  		}
  3011  
  3012  		c.assembler.CompileConstToRegister(inst, int64(value), reg)
  3013  
  3014  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  3015  	}
  3016  	return nil
  3017  }
  3018  
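// Note that for a zero constant no MOV is emitted at all above: the value location simply
// aliases the zero register RZR, which always reads as zero.
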
  3019  // compileConstF32 implements compiler.compileConstF32 for the arm64 architecture.
  3020  func (c *arm64Compiler) compileConstF32(o *wazeroir.UnionOperation) error {
  3021  	return c.compileFloatConstant(true, o.U1 /*uint64(math.Float32bits(o.Value))*/)
  3022  }
  3023  
  3024  // compileConstF64 implements compiler.compileConstF64 for the arm64 architecture.
  3025  func (c *arm64Compiler) compileConstF64(o *wazeroir.UnionOperation) error {
  3026  	return c.compileFloatConstant(false, o.U1 /*math.Float64bits(o.Value)*/)
  3027  }
  3028  
  3029  // compileFloatConstant adds instructions to load a float constant.
  3030  // is32bit is true if the target value is originally 32-bit const, false otherwise.
  3031  // value holds the (zero-extended for 32-bit case) bit representation of load target float constant.
  3032  func (c *arm64Compiler) compileFloatConstant(is32bit bool, value uint64) error {
  3033  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3034  		return err
  3035  	}
  3036  
  3037  	// Take a register to load the value.
  3038  	reg, err := c.allocateRegister(registerTypeVector)
  3039  	if err != nil {
  3040  		return err
  3041  	}
  3042  
  3043  	tmpReg := arm64.RegRZR
  3044  	if value != 0 {
  3045  		tmpReg = arm64ReservedRegisterForTemporary
  3046  		var inst asm.Instruction
  3047  		if is32bit {
  3048  			inst = arm64.MOVW
  3049  		} else {
  3050  			inst = arm64.MOVD
  3051  		}
  3052  		c.assembler.CompileConstToRegister(inst, int64(value), tmpReg)
  3053  	}
  3054  
  3055  	// Use the FMOV instruction to move the value from the integer register into the float one.
  3056  	var inst asm.Instruction
  3057  	var vt runtimeValueType
  3058  	if is32bit {
  3059  		vt = runtimeValueTypeF32
  3060  		inst = arm64.FMOVS
  3061  	} else {
  3062  		vt = runtimeValueTypeF64
  3063  		inst = arm64.FMOVD
  3064  	}
  3065  	c.assembler.CompileRegisterToRegister(inst, tmpReg, reg)
  3066  
  3067  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  3068  	return nil
  3069  }
  3070  
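// The FMOV above transfers a raw bit pattern from an integer register, so the emitted
// sequence has the same semantics as Go's math.Float32frombits/math.Float64frombits.
// Illustrative sketch (hypothetical helper, assuming value is the zero-extended bit
// representation as documented above):
func floatConstSketch(is32bit bool, value uint64) float64 {
	if is32bit {
		return float64(math.Float32frombits(uint32(value)))
	}
	return math.Float64frombits(value)
}
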
  3071  // compileMemoryInit implements compiler.compileMemoryInit for the arm64 architecture.
  3072  func (c *arm64Compiler) compileMemoryInit(o *wazeroir.UnionOperation) error {
  3073  	dataIndex := uint32(o.U1)
  3074  	return c.compileInitImpl(false, dataIndex, 0)
  3075  }
  3076  
  3077  // compileInitImpl implements compileTableInit and compileMemoryInit.
  3078  //
  3079  // TODO: the compiled code in this function should be compiled once and reused since
  3080  // the code is independent of any module.
  3081  func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) error {
  3082  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3083  	if isTable {
  3084  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3085  	}
  3086  
  3087  	copySize, err := c.popValueOnRegister()
  3088  	if err != nil {
  3089  		return err
  3090  	}
  3091  	c.markRegisterUsed(copySize.register)
  3092  
  3093  	sourceOffset, err := c.popValueOnRegister()
  3094  	if err != nil {
  3095  		return err
  3096  	}
  3097  	if isZeroRegister(sourceOffset.register) {
  3098  		sourceOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3099  		if err != nil {
  3100  			return err
  3101  		}
  3102  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, sourceOffset.register)
  3103  	}
  3104  	c.markRegisterUsed(sourceOffset.register)
  3105  
  3106  	destinationOffset, err := c.popValueOnRegister()
  3107  	if err != nil {
  3108  		return err
  3109  	}
  3110  	if isZeroRegister(destinationOffset.register) {
  3111  		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3112  		if err != nil {
  3113  			return err
  3114  		}
  3115  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
  3116  	}
  3117  	c.markRegisterUsed(destinationOffset.register)
  3118  
  3119  	tableInstanceAddressReg := asm.NilRegister
  3120  	if isTable {
  3121  		tableInstanceAddressReg, err = c.allocateRegister(registerTypeGeneralPurpose)
  3122  		if err != nil {
  3123  			return err
  3124  		}
  3125  		c.markRegisterUsed(tableInstanceAddressReg)
  3126  	}
  3127  
  3128  	if !isZeroRegister(copySize.register) {
  3129  		// sourceOffset += size.
  3130  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, sourceOffset.register)
  3131  		// destinationOffset += size.
  3132  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, destinationOffset.register)
  3133  	}
  3134  
  3135  	instanceAddr, err := c.allocateRegister(registerTypeGeneralPurpose)
  3136  	if err != nil {
  3137  		return err
  3138  	}
  3139  
  3140  	if isTable {
  3141  		c.compileLoadElemInstanceAddress(index, instanceAddr)
  3142  	} else {
  3143  		c.compileLoadDataInstanceAddress(index, instanceAddr)
  3144  	}
  3145  
  3146  	// Check the data/element instance bounds.
  3147  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3148  		instanceAddr, 8, // Both DataInstance and ElementInstance store their length at offset 8.
  3149  		arm64ReservedRegisterForTemporary)
  3150  
  3151  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register)
  3152  	// If not, raise out of bounds memory access error.
  3153  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3154  
  3155  	// Otherwise, check the destination bounds before copying.
  3157  	if isTable {
  3158  		// arm64ReservedRegisterForTemporary = &tables[0]
  3159  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3160  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3161  			arm64ReservedRegisterForTemporary)
  3162  		// tableInstanceAddressReg = [arm64ReservedRegisterForTemporary + tableIndex*8]
  3163  		//                         = [&tables[0] + tableIndex*sizeOf(*tableInstance)]
  3164  		//                         = tables[tableIndex]
  3165  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3166  			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
  3167  			tableInstanceAddressReg)
  3168  		// arm64ReservedRegisterForTemporary = [tableInstanceAddressReg+tableInstanceTableLenOffset] = len(tables[tableIndex])
  3169  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3170  			tableInstanceAddressReg, tableInstanceTableLenOffset,
  3171  			arm64ReservedRegisterForTemporary)
  3172  	} else {
  3173  		c.compileLoadMemoryBufferLen(arm64ReservedRegisterForTemporary)
  3174  	}
  3175  
  3176  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
  3177  	// If not, raise out of bounds memory access error.
  3178  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3179  
  3180  	// Otherwise, ready to copy the value from source to destination.
  3181  	if !isZeroRegister(copySize.register) {
  3182  		// If the size equals zero, we can skip the instructions below entirely.
  3183  		c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register)
  3184  		skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)
  3185  
  3186  		var ldr, str asm.Instruction
  3187  		var movSize int64
  3188  		if isTable {
  3189  			ldr, str = arm64.LDRD, arm64.STRD
  3190  			movSize = 8
  3191  
  3192  			// arm64ReservedRegisterForTemporary = &Table[0]
  3193  			c.assembler.CompileMemoryToRegister(arm64.LDRD, tableInstanceAddressReg,
  3194  				tableInstanceTableOffset, arm64ReservedRegisterForTemporary)
  3195  			// destinationOffset = (destinationOffset<< pointerSizeLog2) + arm64ReservedRegisterForTemporary
  3196  			c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3197  				destinationOffset.register, pointerSizeLog2,
  3198  				arm64ReservedRegisterForTemporary, destinationOffset.register)
  3199  
  3200  			// arm64ReservedRegisterForTemporary = &ElementInstance.References[0]
  3201  			c.assembler.CompileMemoryToRegister(arm64.LDRD, instanceAddr, 0, arm64ReservedRegisterForTemporary)
  3202  			// sourceOffset = (sourceOffset<< pointerSizeLog2) + arm64ReservedRegisterForTemporary
  3203  			c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3204  				sourceOffset.register, pointerSizeLog2,
  3205  				arm64ReservedRegisterForTemporary, sourceOffset.register)
  3206  
  3207  			// copySize = copySize << pointerSizeLog2
  3208  			c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
  3209  		} else {
  3210  			ldr, str = arm64.LDRB, arm64.STRB
  3211  			movSize = 1
  3212  
  3213  			// destinationOffset += memory buffer's absolute address.
  3214  			c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3215  
  3216  			// sourceOffset += data buffer's absolute address.
  3217  			c.assembler.CompileMemoryToRegister(arm64.LDRD, instanceAddr, 0, arm64ReservedRegisterForTemporary)
  3218  			c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, sourceOffset.register)
  3219  
  3220  		}
  3221  
  3222  		// Negate the counter.
  3223  		c.assembler.CompileRegisterToRegister(arm64.NEG, copySize.register, copySize.register)
  3224  
  3225  		beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3226  
  3227  		// arm64ReservedRegisterForTemporary = [sourceOffset + copySize.register]
  3228  		c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
  3229  			sourceOffset.register, copySize.register,
  3230  			arm64ReservedRegisterForTemporary)
  3231  		// [destinationOffset + copySize.register] = arm64ReservedRegisterForTemporary.
  3232  		c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3233  			arm64ReservedRegisterForTemporary,
  3234  			destinationOffset.register, copySize.register,
  3235  		)
  3236  
  3237  		// Advance the negated counter by movSize; while it is still negative, continue the loop.
  3238  		c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register)
  3239  		c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop)
  3240  
  3241  		c.assembler.SetJumpTargetOnNext(skipCopyJump)
  3242  	}
  3243  
  3244  	c.markRegisterUnused(copySize.register, sourceOffset.register,
  3245  		destinationOffset.register, instanceAddr, tableInstanceAddressReg)
  3246  	return nil
  3247  }
  3248  
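// The copy loop above negates the counter so that the loop can advance it with ADDS and
// terminate on the sign flag (BCONDMI). A Go sketch of the memory (1-byte element) case,
// with dstEnd/srcEnd already advanced past the last element as in the emitted code
// (hypothetical helper, not used by the compiler):
func initCopySketch(dst, src []byte, dstEnd, srcEnd, size int) {
	for n := -size; n < 0; n++ {
		dst[dstEnd+n] = src[srcEnd+n]
	}
}
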
  3249  // compileDataDrop implements compiler.compileDataDrop for the arm64 architecture.
  3250  func (c *arm64Compiler) compileDataDrop(o *wazeroir.UnionOperation) error {
  3251  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3252  		return err
  3253  	}
  3254  
  3255  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3256  	if err != nil {
  3257  		return err
  3258  	}
  3259  
  3260  	dataIndex := uint32(o.U1)
  3261  	c.compileLoadDataInstanceAddress(dataIndex, tmp)
  3262  
  3263  	// Clear the content of DataInstances[dataIndex] (a []byte) by zeroing the three 8-byte words of its slice header.
  3264  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 0)
  3265  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 8)
  3266  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 16)
  3267  	return nil
  3268  }
  3269  
  3270  func (c *arm64Compiler) compileLoadDataInstanceAddress(dataIndex uint32, dst asm.Register) {
  3271  	// dst = dataIndex * dataInstanceStructSize
  3272  	c.assembler.CompileConstToRegister(arm64.MOVD, int64(dataIndex)*dataInstanceStructSize, dst)
  3273  
  3274  	// arm64ReservedRegisterForTemporary = &moduleInstance.DataInstances[0]
  3275  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3276  		arm64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  3277  		arm64ReservedRegisterForTemporary,
  3278  	)
  3279  
  3280  	// dst = arm64ReservedRegisterForTemporary + dst
  3281  	//     = &moduleInstance.DataInstances[0] + dataIndex*dataInstanceStructSize
  3282  	//     = &moduleInstance.DataInstances[dataIndex]
  3283  	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, dst)
  3284  }
  3285  
  3286  // compileMemoryCopy implements compiler.compileMemoryCopy for the arm64 architecture.
  3287  func (c *arm64Compiler) compileMemoryCopy() error {
  3288  	return c.compileCopyImpl(false, 0, 0)
  3289  }
  3290  
  3291  // compileCopyImpl implements compileTableCopy and compileMemoryCopy.
  3292  //
  3293  // TODO: the compiled code in this function should be compiled once and reused since
  3294  // the code is independent of any module.
  3295  func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableIndex uint32) error {
  3296  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3297  	if isTable {
  3298  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3299  	}
  3300  
  3301  	copySize, err := c.popValueOnRegister()
  3302  	if err != nil {
  3303  		return err
  3304  	}
  3305  	c.markRegisterUsed(copySize.register)
  3306  
  3307  	sourceOffset, err := c.popValueOnRegister()
  3308  	if err != nil {
  3309  		return err
  3310  	}
  3311  	if isZeroRegister(sourceOffset.register) {
  3312  		sourceOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3313  		if err != nil {
  3314  			return err
  3315  		}
  3316  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, sourceOffset.register)
  3317  	}
  3318  	c.markRegisterUsed(sourceOffset.register)
  3319  
  3320  	destinationOffset, err := c.popValueOnRegister()
  3321  	if err != nil {
  3322  		return err
  3323  	}
  3324  	if isZeroRegister(destinationOffset.register) {
  3325  		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3326  		if err != nil {
  3327  			return err
  3328  		}
  3329  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
  3330  	}
  3331  	c.markRegisterUsed(destinationOffset.register)
  3332  
  3333  	if !isZeroRegister(copySize.register) {
  3334  		// sourceOffset += size.
  3335  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, sourceOffset.register)
  3336  		// destinationOffset += size.
  3337  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, destinationOffset.register)
  3338  	}
  3339  
  3340  	if isTable {
  3341  		// arm64ReservedRegisterForTemporary = &tables[0]
  3342  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3343  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3344  			arm64ReservedRegisterForTemporary)
  3345  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + srcTableIndex*8]
  3346  		//                                   = [&tables[0] + srcTableIndex*sizeOf(*tableInstance)]
  3347  		//                                   = tables[srcTableIndex]
  3348  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3349  			arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
  3350  			arm64ReservedRegisterForTemporary)
  3351  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[srcTableIndex])
  3352  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3353  			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3354  			arm64ReservedRegisterForTemporary)
  3355  	} else {
  3356  		// arm64ReservedRegisterForTemporary = len(memoryInst.Buffer).
  3357  		c.compileLoadMemoryBufferLen(arm64ReservedRegisterForTemporary)
  3358  	}
  3359  
  3360  	// Check memory (or table) length >= sourceOffset.
  3361  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register)
  3362  	// If not, raise out of bounds memory access error.
  3363  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3364  
  3365  	// Otherwise, check memory (or table) length >= destinationOffset.
  3366  	if isTable {
  3367  		// arm64ReservedRegisterForTemporary = &tables[0]
  3368  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3369  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3370  			arm64ReservedRegisterForTemporary)
  3371  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + dstTableIndex*8]
  3372  		//                                   = [&tables[0] + dstTableIndex*sizeOf(*tableInstance)]
  3373  		//                                   = tables[dstTableIndex]
  3374  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3375  			arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
  3376  			arm64ReservedRegisterForTemporary)
  3377  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[dstTableIndex])
  3378  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3379  			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3380  			arm64ReservedRegisterForTemporary)
  3381  	}
  3382  
  3383  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
  3384  	// If not, raise out of bounds memory access error.
  3385  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3386  
  3387  	// Otherwise, ready to copy the value from source to destination.
  3388  	var ldr, str asm.Instruction
  3389  	var movSize int64
  3390  	if isTable {
  3391  		ldr, str = arm64.LDRD, arm64.STRD
  3392  		movSize = 8
  3393  	} else {
  3394  		ldr, str = arm64.LDRB, arm64.STRB
  3395  		movSize = 1
  3396  	}
  3397  
  3398  	// If the size equals zero, we can skip the instructions below entirely.
  3399  	if !isZeroRegister(copySize.register) {
  3400  		c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register)
  3401  		skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)
  3402  
  3403  		// If source offset < destination offset: for (i = size-1; i >= 0; i--) dst[i] = src[i];
  3404  		c.assembler.CompileTwoRegistersToNone(arm64.CMP, sourceOffset.register, destinationOffset.register)
  3405  		destLowerThanSourceJump := c.assembler.CompileJump(arm64.BCONDLS)
  3406  		var endJump asm.Node
  3407  		{
  3408  			// sourceOffset -= size.
  3409  			c.assembler.CompileRegisterToRegister(arm64.SUB, copySize.register, sourceOffset.register)
  3410  			// destinationOffset -= size.
  3411  			c.assembler.CompileRegisterToRegister(arm64.SUB, copySize.register, destinationOffset.register)
  3412  
  3413  			if isTable {
  3414  				// arm64ReservedRegisterForTemporary = &Tables[dstTableIndex].Table[0]
  3415  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3416  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3417  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3418  					arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
  3419  					arm64ReservedRegisterForTemporary)
  3420  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3421  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3422  					arm64ReservedRegisterForTemporary)
  3423  				// destinationOffset = (destinationOffset<< pointerSizeLog2) + &Table[dstTableIndex].Table[0]
  3424  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3425  					destinationOffset.register, pointerSizeLog2,
  3426  					arm64ReservedRegisterForTemporary, destinationOffset.register)
  3427  
  3428  				// arm64ReservedRegisterForTemporary = &Tables[srcTableIndex]
  3429  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3430  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3431  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3432  					arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
  3433  					arm64ReservedRegisterForTemporary)
  3434  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3435  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3436  					arm64ReservedRegisterForTemporary)
  3437  				// sourceOffset = (sourceOffset << pointerSizeLog2) + &Tables[srcTableIndex].Table[0]
  3438  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3439  					sourceOffset.register, pointerSizeLog2,
  3440  					arm64ReservedRegisterForTemporary, sourceOffset.register)
  3441  
  3442  				// copySize = copySize << pointerSizeLog2 as each element has 8 bytes and we copy one by one.
  3443  				c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
  3444  			} else {
  3445  				// sourceOffset += memory buffer's absolute address.
  3446  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, sourceOffset.register)
  3447  				// destinationOffset += memory buffer's absolute address.
  3448  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3449  			}
  3450  
  3451  			beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3452  
  3453  			// size -= movSize
  3454  			c.assembler.CompileConstToRegister(arm64.SUBS, movSize, copySize.register)
  3455  
  3456  			// arm64ReservedRegisterForTemporary = [sourceOffset + copySize.register]
  3457  			c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
  3458  				sourceOffset.register, copySize.register,
  3459  				arm64ReservedRegisterForTemporary)
  3460  			// [destinationOffset + copySize.register] = arm64ReservedRegisterForTemporary.
  3461  			c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3462  				arm64ReservedRegisterForTemporary,
  3463  				destinationOffset.register, copySize.register,
  3464  			)
  3465  
  3466  			// If the value in copySize.register is not yet zero, continue the loop.
  3467  			c.assembler.CompileJump(arm64.BCONDNE).AssignJumpTarget(beginCopyLoop)
  3468  
  3469  			// Otherwise, exit the loop.
  3470  			endJump = c.assembler.CompileJump(arm64.B)
  3471  		}
  3472  
  3473  		// Else (destination offset <= source offset): for (i = 0; i < size; i++) dst[i] = src[i];
  3474  		c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
  3475  		{
  3476  
  3477  			if isTable {
  3478  				// arm64ReservedRegisterForTemporary = &Tables[dstTableIndex].Table[0]
  3479  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3480  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3481  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3482  					arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
  3483  					arm64ReservedRegisterForTemporary)
  3484  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3485  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3486  					arm64ReservedRegisterForTemporary)
  3487  				// destinationOffset = (destinationOffset << pointerSizeLog2) + &Table[dstTableIndex].Table[0]
  3488  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3489  					destinationOffset.register, pointerSizeLog2,
  3490  					arm64ReservedRegisterForTemporary, destinationOffset.register)
  3491  
  3492  				// arm64ReservedRegisterForTemporary = &Tables[srcTableIndex]
  3493  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3494  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3495  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3496  					arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
  3497  					arm64ReservedRegisterForTemporary)
  3498  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3499  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3500  					arm64ReservedRegisterForTemporary)
  3501  				// sourceOffset = (sourceOffset << pointerSizeLog2) + &Tables[srcTableIndex].Table[0]
  3502  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3503  					sourceOffset.register, pointerSizeLog2,
  3504  					arm64ReservedRegisterForTemporary, sourceOffset.register)
  3505  
  3506  				// copySize = copySize << pointerSizeLog2 as each element has 8 bytes and we copy one by one.
  3507  				c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
  3508  			} else {
  3509  				// sourceOffset += memory buffer's absolute address.
  3510  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, sourceOffset.register)
  3511  				// destinationOffset += memory buffer's absolute address.
  3512  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3513  			}
  3514  
  3515  			// Negate the counter.
  3516  			c.assembler.CompileRegisterToRegister(arm64.NEG, copySize.register, copySize.register)
  3517  
  3518  			beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3519  
  3520  			// arm64ReservedRegisterForTemporary = [sourceOffset + copySize.register]
  3521  			c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
  3522  				sourceOffset.register, copySize.register,
  3523  				arm64ReservedRegisterForTemporary)
  3524  			// [destinationOffset + copySize.register] = arm64ReservedRegisterForTemporary.
  3525  			c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3526  				arm64ReservedRegisterForTemporary,
  3527  				destinationOffset.register, copySize.register,
  3528  			)
  3529  
  3530  			// size += movSize
  3531  			c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register)
  3532  			c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop)
  3533  		}
  3534  		c.assembler.SetJumpTargetOnNext(skipCopyJump)
  3535  		c.assembler.SetJumpTargetOnNext(endJump)
  3536  	}
  3537  
  3538  	// Mark all of the operand registers as unused.
  3539  	c.markRegisterUnused(copySize.register, sourceOffset.register, destinationOffset.register)
  3540  
  3541  	return nil
  3542  }
  3543  
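// The direction selection above, restated in Go (memmove-style sketch for the 1-byte memory
// case; hypothetical helper, not used by the compiler): when the source precedes the
// destination, a forward copy could clobber bytes before they are read, so the copy runs
// backwards; otherwise it runs forwards.
func overlapSafeCopySketch(buf []byte, dst, src, n int) {
	if src < dst {
		for i := n - 1; i >= 0; i-- { // Backward copy.
			buf[dst+i] = buf[src+i]
		}
	} else {
		for i := 0; i < n; i++ { // Forward copy.
			buf[dst+i] = buf[src+i]
		}
	}
}
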
  3544  // compileMemoryFill implements compiler.compileMemoryFill for the arm64 architecture.
  3545  func (c *arm64Compiler) compileMemoryFill() error {
  3546  	return c.compileFillImpl(false, 0)
  3547  }
  3548  
  3549  // compileFillImpl implements TableFill and MemoryFill.
  3550  //
  3551  // TODO: the compiled code in this function should be compiled once and reused since
  3552  // the code is independent of any module.
  3553  func (c *arm64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error {
  3554  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3555  	if isTable {
  3556  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3557  	}
  3558  
  3559  	fillSize, err := c.popValueOnRegister()
  3560  	if err != nil {
  3561  		return err
  3562  	}
  3563  	c.markRegisterUsed(fillSize.register)
  3564  
  3565  	value, err := c.popValueOnRegister()
  3566  	if err != nil {
  3567  		return err
  3568  	}
  3569  	c.markRegisterUsed(value.register)
  3570  
  3571  	destinationOffset, err := c.popValueOnRegister()
  3572  	if err != nil {
  3573  		return err
  3574  	}
  3575  	if isZeroRegister(destinationOffset.register) {
  3576  		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3577  		if err != nil {
  3578  			return err
  3579  		}
  3580  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
  3581  	}
  3582  	c.markRegisterUsed(destinationOffset.register)
  3583  
  3584  	// destinationOffset += size.
  3585  	c.assembler.CompileRegisterToRegister(arm64.ADD, fillSize.register, destinationOffset.register)
  3586  
  3587  	if isTable {
  3588  		// arm64ReservedRegisterForTemporary = &tables[0]
  3589  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3590  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3591  			arm64ReservedRegisterForTemporary)
  3592  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + tableIndex*8]
  3593  		//                                   = [&tables[0] + tableIndex*sizeOf(*tableInstance)]
  3594  		//                                   = tables[tableIndex]
  3595  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3596  			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
  3597  			arm64ReservedRegisterForTemporary)
  3598  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[tableIndex])
  3599  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3600  			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3601  			arm64ReservedRegisterForTemporary)
  3602  	} else {
  3603  		// arm64ReservedRegisterForTemporary = len(memoryInst.Buffer).
  3604  		c.compileLoadMemoryBufferLen(arm64ReservedRegisterForTemporary)
  3605  	}
  3606  
  3607  	// Check len >= destinationOffset.
  3608  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
  3609  
  3610  	// If not, raise the runtime error.
  3611  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3612  
  3613  	// Otherwise, we are ready to fill the destination with the value.
  3614  	// If the size equals zero, we can skip the entire instructions below.
  3615  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, fillSize.register)
  3616  	skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)
  3617  
  3618  	// destinationOffset -= size.
  3619  	c.assembler.CompileRegisterToRegister(arm64.SUB, fillSize.register, destinationOffset.register)
  3620  
  3621  	var str asm.Instruction
  3622  	var movSize int64
  3623  	if isTable {
  3624  		str = arm64.STRD
  3625  		movSize = 8
  3626  
  3627  		// arm64ReservedRegisterForTemporary = &Tables[tableIndex].Table[0]
  3628  		c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3629  			callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3630  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3631  			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
  3632  			arm64ReservedRegisterForTemporary)
  3633  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3634  			arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3635  			arm64ReservedRegisterForTemporary)
  3636  		// destinationOffset = (destinationOffset << pointerSizeLog2) + &Tables[tableIndex].Table[0]
  3637  		c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3638  			destinationOffset.register, pointerSizeLog2,
  3639  			arm64ReservedRegisterForTemporary, destinationOffset.register)
  3640  
  3641  		// fillSize = fillSize << pointerSizeLog2, as each element is 8 bytes and we fill them one by one.
  3642  		c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, fillSize.register)
  3643  	} else {
  3644  		str = arm64.STRB
  3645  		movSize = 1
  3646  
  3647  		// destinationOffset += memory buffer's absolute address.
  3648  		c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3649  	}
  3650  
  3651  	// Naively implement the fill with a loop, storing one element at a time.
  3652  	beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3653  
  3654  	// fillSize -= movSize
  3655  	c.assembler.CompileConstToRegister(arm64.SUBS, movSize, fillSize.register)
  3656  
  3657  	// [destinationOffset + fillSize.register] = value.register.
  3658  	c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3659  		value.register,
  3660  		destinationOffset.register, fillSize.register,
  3661  	)
  3662  
  3663  	// If the value in fillSize.register is not yet zero, continue the loop.
  3664  	continueJump := c.assembler.CompileJump(arm64.BCONDNE)
  3665  	continueJump.AssignJumpTarget(beginCopyLoop)
  3666  
  3667  	// Mark all of the operand registers as unused.
  3668  	c.markRegisterUnused(fillSize.register, value.register, destinationOffset.register)
  3669  
  3670  	c.assembler.SetJumpTargetOnNext(skipCopyJump)
  3671  	return nil
  3672  }
  3673  
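// A Go sketch of the fill loop above for the memory case (hypothetical helper, not used by
// the compiler): the counter runs from size down to zero, storing the value at each index.
func fillSketch(buf []byte, destinationOffset int, value byte, size int) {
	for i := size - 1; i >= 0; i-- {
		buf[destinationOffset+i] = value
	}
}
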
  3674  // compileTableInit implements compiler.compileTableInit for the arm64 architecture.
  3675  func (c *arm64Compiler) compileTableInit(o *wazeroir.UnionOperation) error {
  3676  	elemIndex := uint32(o.U1)
  3677  	tableIndex := uint32(o.U2)
  3678  	return c.compileInitImpl(true, elemIndex, tableIndex)
  3679  }
  3680  
  3681  // compileTableCopy implements compiler.compileTableCopy for the arm64 architecture.
  3682  func (c *arm64Compiler) compileTableCopy(o *wazeroir.UnionOperation) error {
  3683  	return c.compileCopyImpl(true, uint32(o.U1), uint32(o.U2))
  3684  }
  3685  
  3686  // compileElemDrop implements compiler.compileElemDrop for the arm64 architecture.
  3687  func (c *arm64Compiler) compileElemDrop(o *wazeroir.UnionOperation) error {
  3688  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3689  		return err
  3690  	}
  3691  
  3692  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3693  	if err != nil {
  3694  		return err
  3695  	}
  3696  
  3697  	elemIndex := uint32(o.U1)
  3698  	c.compileLoadElemInstanceAddress(elemIndex, tmp)
  3699  
  3700  	// Clear the content of ElementInstances[elemIndex] (a slice of references) by zeroing the three 8-byte words of its slice header.
  3701  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 0)
  3702  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 8)
  3703  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 16)
  3704  	return nil
  3705  }
  3706  
  3707  func (c *arm64Compiler) compileLoadElemInstanceAddress(elemIndex uint32, dst asm.Register) {
  3708  	// dst = elemIndex * elementInstanceStructSize
  3709  	c.assembler.CompileConstToRegister(arm64.MOVD, int64(elemIndex)*elementInstanceStructSize, dst)
  3710  
  3711  	// arm64ReservedRegisterForTemporary = &moduleInstance.ElementInstances[0]
  3712  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3713  		arm64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  3714  		arm64ReservedRegisterForTemporary,
  3715  	)
  3716  
  3717  	// dst = arm64ReservedRegisterForTemporary + dst
  3718  	//     = &moduleInstance.ElementInstances[0] + elemIndex*elementInstanceStructSize
  3719  	//     = &moduleInstance.ElementInstances[elemIndex]
  3720  	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, dst)
  3721  }
  3722  
  3723  // compileRefFunc implements compiler.compileRefFunc for the arm64 architecture.
  3724  func (c *arm64Compiler) compileRefFunc(o *wazeroir.UnionOperation) error {
  3725  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3726  		return err
  3727  	}
  3728  
  3729  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  3730  	if err != nil {
  3731  		return err
  3732  	}
  3733  	// ref = [arm64ReservedRegisterForCallEngine + callEngineModuleContextFunctionsElement0AddressOffset]
  3734  	//     = &moduleEngine.functions[0]
  3735  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3736  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  3737  		ref)
  3738  
  3739  	// ref = ref + int64(o.FunctionIndex)*sizeOf(function)
  3740  	//     = &moduleEngine.functions[index]
  3741  	functionIndex := int64(o.U1)
  3742  	c.assembler.CompileConstToRegister(arm64.ADD,
  3743  		functionIndex*functionSize,
  3744  		ref,
  3745  	)
  3746  
  3747  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64)
  3748  	return nil
  3749  }
  3750  
  3751  // compileTableGet implements compiler.compileTableGet for the arm64 architecture.
  3752  func (c *arm64Compiler) compileTableGet(o *wazeroir.UnionOperation) error {
  3753  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  3754  	if err != nil {
  3755  		return err
  3756  	}
  3757  	c.markRegisterUsed(ref)
  3758  
  3759  	offset, err := c.popValueOnRegister()
  3760  	if err != nil {
  3761  		return err
  3762  	}
  3763  
  3764  	// arm64ReservedRegisterForTemporary = &tables[0]
  3765  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3766  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3767  		arm64ReservedRegisterForTemporary)
  3768  	// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + TableIndex*8]
  3769  	//                                   = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  3770  	//                                   = [&tables[TableIndex]] = tables[TableIndex].
  3771  	tableIndex := int64(o.U1)
  3772  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3773  		arm64ReservedRegisterForTemporary, tableIndex*8,
  3774  		arm64ReservedRegisterForTemporary)
  3775  
  3776  	// Out of bounds check.
  3777  	// ref = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
  3778  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3779  		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3780  		ref,
  3781  	)
  3782  	// "cmp ref, offset"
  3783  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, ref, offset.register)
  3784  
  3785  	// If it exceeds len(table), we exit the execution.
  3786  	c.compileMaybeExitFromNativeCode(arm64.BCONDLO, nativeCallStatusCodeInvalidTableAccess)
  3787  
  3788  	// ref = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  3789  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3790  		arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3791  		ref,
  3792  	)
  3793  
  3794  	// ref = (offset << pointerSizeLog2) + ref
  3795  	//     = &tables[TableIndex].References[0] + sizeOf(uintptr) * offset
  3796  	//     = &tables[TableIndex].References[offset]
  3797  	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3798  		offset.register, pointerSizeLog2, ref, ref)
  3799  
  3800  	// ref = [ref] = [&tables[TableIndex].References[offset]], i.e. load the Reference's pointer as uint64.
  3801  	c.assembler.CompileMemoryToRegister(arm64.LDRD, ref, 0, ref)
  3802  
  3803  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64) // table elements are opaque 64-bit at runtime.
  3804  	return nil
  3805  }
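        // The sequence above is equivalent to the following Go sketch (illustrative names only;
        // the bounds check and load are what the instructions above actually emit):
        //
        //	t := tables[tableIndex]
        //	if offset >= uint32(len(t.References)) {
        //		// exit with nativeCallStatusCodeInvalidTableAccess
        //	}
        //	ref := t.References[offset] // loaded as an opaque 64-bit pointer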
  3806  
  3807  // compileTableSet implements compiler.compileTableSet for the arm64 architecture.
  3808  func (c *arm64Compiler) compileTableSet(o *wazeroir.UnionOperation) error {
  3809  	ref := c.locationStack.pop()
  3810  	if err := c.compileEnsureOnRegister(ref); err != nil {
  3811  		return err
  3812  	}
  3813  
  3814  	offset := c.locationStack.pop()
  3815  	if err := c.compileEnsureOnRegister(offset); err != nil {
  3816  		return err
  3817  	}
  3818  
  3819  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3820  	if err != nil {
  3821  		return err
  3822  	}
  3823  
  3824  	// arm64ReservedRegisterForTemporary = &tables[0]
  3825  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3826  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3827  		arm64ReservedRegisterForTemporary)
  3828  	// arm64ReservedRegisterForTemporary = arm64ReservedRegisterForTemporary + TableIndex*8
  3829  	//                                   = &tables[0] + TableIndex*sizeOf(*tableInstance)
  3830  	//                                   = &tables[TableIndex]
  3831  	tableIndex := int64(o.U1)
  3832  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3833  		arm64ReservedRegisterForTemporary, tableIndex*8,
  3834  		arm64ReservedRegisterForTemporary)
  3835  
  3836  	// Out of bounds check.
  3837  	// tmp = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
  3838  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3839  		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3840  		tmp,
  3841  	)
  3842  	// "cmp tmp, offset"
  3843  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp, offset.register)
  3844  
  3845  	// If offset >= len(table), we exit the execution with the invalid table access status.
  3846  	c.compileMaybeExitFromNativeCode(arm64.BCONDLO, nativeCallStatusCodeInvalidTableAccess)
  3847  
  3848  	// tmp = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  3849  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3850  		arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3851  		tmp,
  3852  	)
  3853  
  3854  	// tmp = (offset << pointerSizeLog2) + tmp
  3855  	//     = &tables[TableIndex].References[0] + sizeOf(uintptr) * offset
  3856  	//     = &tables[TableIndex].References[offset]
  3857  	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD, offset.register, pointerSizeLog2, tmp, tmp)
  3858  
  3859  	// Set the reference's raw pointer.
  3860  	c.assembler.CompileRegisterToMemory(arm64.STRD, ref.register, tmp, 0)
  3861  
  3862  	c.markRegisterUnused(offset.register, ref.register, tmp)
  3863  	return nil
  3864  }
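        // Mirroring compileTableGet, the store above behaves like this Go sketch:
        //
        //	t := tables[tableIndex]
        //	if offset >= uint32(len(t.References)) {
        //		// exit with nativeCallStatusCodeInvalidTableAccess
        //	}
        //	t.References[offset] = ref // store the raw reference pointer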
  3865  
  3866  // compileTableGrow implements compiler.compileTableGrow for the arm64 architecture.
  3867  func (c *arm64Compiler) compileTableGrow(o *wazeroir.UnionOperation) error {
  3868  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3869  		return err
  3870  	}
  3871  
  3872  	// Pushes the table index.
  3873  	tableIndex := o.U1
  3874  	if err := c.compileIntConstant(true, tableIndex); err != nil {
  3875  		return err
  3876  	}
  3877  
  3878  	// Like memory.grow, table.grow cannot be implemented in assembly, as it involves allocation in Go.
  3879  	// Therefore, we call out to the built-in function for this purpose.
  3880  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexTableGrow); err != nil {
  3881  		return err
  3882  	}
  3883  
  3884  	// TableGrow consumes three values (table index, number of items, initial value).
  3885  	for i := 0; i < 3; i++ {
  3886  		c.locationStack.pop()
  3887  	}
  3888  
  3889  	// Then, the previous length was pushed as the result.
  3890  	v := c.locationStack.pushRuntimeValueLocationOnStack()
  3891  	v.valueType = runtimeValueTypeI32
  3892  
  3893  	// After return, we re-initialize reserved registers just like the preamble of functions.
  3894  	c.compileReservedStackBasePointerRegisterInitialization()
  3895  	c.compileReservedMemoryRegisterInitialization()
  3896  	return nil
  3897  }
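        // Sketch of the stack contract assumed here: table.grow receives the initial value and
        // the number of items from the wasm operand stack, we push the table index on top, and
        // the Go built-in pops all three and pushes the previous length:
        //
        //	before the call: ..., init, n, tableIndex   (top on the right)
        //	after the call:  ..., prevLen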
  3898  
  3899  // compileTableSize implements compiler.compileTableSize for the arm64 architecture.
  3900  func (c *arm64Compiler) compileTableSize(o *wazeroir.UnionOperation) error {
  3901  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3902  		return err
  3903  	}
  3904  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  3905  	if err != nil {
  3906  		return err
  3907  	}
  3908  	c.markRegisterUsed(result)
  3909  
  3910  	// arm64ReservedRegisterForTemporary = &tables[0]
  3911  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3912  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3913  		arm64ReservedRegisterForTemporary)
  3914  	// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + TableIndex*8]
  3915  	//                                   = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  3916  	//                                   = [&tables[TableIndex]] = tables[TableIndex].
  3917  	tableIndex := int64(o.U1)
  3918  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3919  		arm64ReservedRegisterForTemporary, tableIndex*8,
  3920  		arm64ReservedRegisterForTemporary)
  3921  
  3922  	// result = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
  3923  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3924  		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3925  		result,
  3926  	)
  3927  
  3928  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  3929  	return nil
  3930  }
  3931  
  3932  // compileTableFill implements compiler.compileTableFill for the arm64 architecture.
  3933  func (c *arm64Compiler) compileTableFill(o *wazeroir.UnionOperation) error {
  3934  	tableIndex := uint32(o.U1)
  3935  	return c.compileFillImpl(true, tableIndex)
  3936  }
  3937  
  3938  // popTwoValuesOnRegisters pops two values from the location stack, ensures
  3939  // these two values are located on registers, and marks them as unused.
  3940  //
  3941  // TODO: we’d usually prefix this with compileXXX as this might end up emitting instructions,
  3942  // but the name seems awkward.
  3943  func (c *arm64Compiler) popTwoValuesOnRegisters() (x1, x2 *runtimeValueLocation, err error) {
  3944  	x2 = c.locationStack.pop()
  3945  	if err = c.compileEnsureOnRegister(x2); err != nil {
  3946  		return
  3947  	}
  3948  
  3949  	x1 = c.locationStack.pop()
  3950  	if err = c.compileEnsureOnRegister(x1); err != nil {
  3951  		return
  3952  	}
  3953  
  3954  	c.markRegisterUnused(x2.register)
  3955  	c.markRegisterUnused(x1.register)
  3956  	return
  3957  }
  3958  
  3959  // popValueOnRegister pops one value from the location stack, ensures
  3960  // that it is located on a register, and marks it as unused.
  3961  //
  3962  // TODO: we’d usually prefix this with compileXXX as this might end up emitting instructions,
  3963  // but the name seems awkward.
  3964  func (c *arm64Compiler) popValueOnRegister() (v *runtimeValueLocation, err error) {
  3965  	v = c.locationStack.pop()
  3966  	if err = c.compileEnsureOnRegister(v); err != nil {
  3967  		return
  3968  	}
  3969  
  3970  	c.markRegisterUnused(v.register)
  3971  	return
  3972  }
  3973  
  3974  // compileEnsureOnRegister emits instructions to ensure that a value is located on a register.
  3975  func (c *arm64Compiler) compileEnsureOnRegister(loc *runtimeValueLocation) (err error) {
  3976  	if loc.onStack() {
  3977  		reg, err := c.allocateRegister(loc.getRegisterType())
  3978  		if err != nil {
  3979  			return err
  3980  		}
  3981  
  3982  		// Record that the value holds the register and the register is marked used.
  3983  		loc.setRegister(reg)
  3984  		c.markRegisterUsed(reg)
  3985  
  3986  		c.compileLoadValueOnStackToRegister(loc)
  3987  	} else if loc.onConditionalRegister() {
  3988  		err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
  3989  	}
  3990  	return
  3991  }
  3992  
  3993  // maybeCompileMoveTopConditionalToGeneralPurposeRegister moves the top value on the stack
  3994  // to a general purpose register if the value is located on a conditional register.
  3995  //
  3996  // This is usually called at the beginning of methods on the compiler interface where we might
  3997  // compile instructions that clobber the condition flags before the value is saved.
  3998  // Any compile* function that does not call this must instead save the conditional
  3999  // value to the stack or a register by invoking ensureOnGeneralPurposeRegister on the top value.
  4000  func (c *arm64Compiler) maybeCompileMoveTopConditionalToGeneralPurposeRegister() (err error) {
  4001  	if c.locationStack.sp > 0 {
  4002  		if loc := c.locationStack.peek(); loc.onConditionalRegister() {
  4003  			err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
  4004  		}
  4005  	}
  4006  	return
  4007  }
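        // Example of the hazard this avoids (illustrative arm64): after e.g. i32.eq, the result
        // may live only in the NZCV flags, so it must be materialized with CSET before the next
        // flag-clobbering instruction is emitted:
        //
        //	cset w8, eq   // save the previous comparison result first...
        //	cmp  w9, w10  // ...then the flags are free to be clobbered again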
  4008  
  4009  // compileLoadConditionalRegisterToGeneralPurposeRegister saves the conditional register value
  4010  // to a general purpose register.
  4011  func (c *arm64Compiler) compileLoadConditionalRegisterToGeneralPurposeRegister(loc *runtimeValueLocation) error {
  4012  	reg, err := c.allocateRegister(loc.getRegisterType())
  4013  	if err != nil {
  4014  		return err
  4015  	}
  4016  
  4017  	c.markRegisterUsed(reg)
  4018  	c.assembler.CompileConditionalRegisterSet(loc.conditionalRegister, reg)
  4019  
  4020  	// Record that now the value is located on a general purpose register.
  4021  	loc.setRegister(reg)
  4022  	return nil
  4023  }
  4024  
  4025  // compileLoadValueOnStackToRegister implements compiler.compileLoadValueOnStackToRegister for arm64.
  4026  func (c *arm64Compiler) compileLoadValueOnStackToRegister(loc *runtimeValueLocation) {
  4027  	switch loc.valueType {
  4028  	case runtimeValueTypeI32:
  4029  		c.assembler.CompileMemoryToRegister(arm64.LDRW, arm64ReservedRegisterForStackBasePointerAddress,
  4030  			int64(loc.stackPointer)*8, loc.register)
  4031  	case runtimeValueTypeI64:
  4032  		c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForStackBasePointerAddress,
  4033  			int64(loc.stackPointer)*8, loc.register)
  4034  	case runtimeValueTypeF32:
  4035  		c.assembler.CompileMemoryToRegister(arm64.FLDRS, arm64ReservedRegisterForStackBasePointerAddress,
  4036  			int64(loc.stackPointer)*8, loc.register)
  4037  	case runtimeValueTypeF64:
  4038  		c.assembler.CompileMemoryToRegister(arm64.FLDRD, arm64ReservedRegisterForStackBasePointerAddress,
  4039  			int64(loc.stackPointer)*8, loc.register)
  4040  	case runtimeValueTypeV128Lo:
  4041  		c.assembler.CompileMemoryToVectorRegister(arm64.VMOV,
  4042  			arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8, loc.register,
  4043  			arm64.VectorArrangementQ)
  4044  		// The higher 64 bits are loaded by the 128-bit load above as well.
  4045  		hi := &c.locationStack.stack[loc.stackPointer+1]
  4046  		hi.setRegister(loc.register)
  4047  	case runtimeValueTypeV128Hi:
  4048  		panic("BUG: V128Hi must be be loaded to a register along with V128Lo")
  4049  	}
  4050  }
  4051  
  4052  // allocateRegister implements compiler.allocateRegister for arm64.
  4053  func (c *arm64Compiler) allocateRegister(t registerType) (reg asm.Register, err error) {
  4054  	var ok bool
  4055  	// Try to get an unused register.
  4056  	reg, ok = c.locationStack.takeFreeRegister(t)
  4057  	if ok {
  4058  		return
  4059  	}
  4060  
  4061  	// If not found, we have to steal the register.
  4062  	stealTarget, ok := c.locationStack.takeStealTargetFromUsedRegister(t)
  4063  	if !ok {
  4064  		err = fmt.Errorf("cannot steal register")
  4065  		return
  4066  	}
  4067  
  4068  	// Release the steal target register value onto stack location.
  4069  	reg = stealTarget.register
  4070  	c.compileReleaseRegisterToStack(stealTarget)
  4071  	return
  4072  }
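        // The allocation strategy is a simple free-list with spilling, roughly (a sketch with
        // hypothetical helper names):
        //
        //	if reg, ok := freeRegister(t); ok {
        //		return reg
        //	}
        //	victim := pickUsedRegister(t) // steal target
        //	spillToStack(victim)          // its value now lives on the memory stack
        //	return victim.register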
  4073  
  4074  func (c *arm64Compiler) compileLoadMemoryBufferLen(destReg asm.Register) {
  4075  	// destReg = ce.moduleContext.MemoryInstance (pointer)
  4076  	c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset, destReg)
  4077  	// destReg = destReg + memoryInstanceBufferLenOffset = &len(mem.Buffer)
  4078  	c.assembler.CompileConstToRegister(arm64.ADD, memoryInstanceBufferLenOffset, destReg)
  4079  	// destReg = [destReg] = len(mem.Buffer), loaded atomically with acquire ordering.
  4080  	c.assembler.CompileMemoryWithRegisterSourceToRegister(arm64.LDARD, destReg, destReg)
  4080  }
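        // Go-equivalent of the three instructions above (a sketch using unsafe and sync/atomic):
        //
        //	mem := ce.moduleContext.memoryInstance                               // LDRD
        //	p := unsafe.Add(unsafe.Pointer(mem), memoryInstanceBufferLenOffset)  // ADD
        //	length := atomic.LoadUint64((*uint64)(p))                            // LDARD (acquire load)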
  4081  
  4082  // compileReleaseAllRegistersToStack adds instructions to store all the values located on
  4083  // either general purpose or conditional registers onto the memory stack.
  4084  // See releaseRegisterToStack.
  4085  func (c *arm64Compiler) compileReleaseAllRegistersToStack() (err error) {
  4086  	for i := uint64(0); i < c.locationStack.sp; i++ {
  4087  		if loc := &c.locationStack.stack[i]; loc.onRegister() {
  4088  			c.compileReleaseRegisterToStack(loc)
  4089  		} else if loc.onConditionalRegister() {
  4090  			if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
  4091  				return
  4092  			}
  4093  			c.compileReleaseRegisterToStack(loc)
  4094  		}
  4095  	}
  4096  	return
  4097  }
  4098  
  4099  // releaseRegisterToStack adds an instruction to write the value on a register back to memory stack region.
  4100  func (c *arm64Compiler) compileReleaseRegisterToStack(loc *runtimeValueLocation) {
  4101  	switch loc.valueType {
  4102  	case runtimeValueTypeI32:
  4103  		c.assembler.CompileRegisterToMemory(arm64.STRW, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4104  	case runtimeValueTypeI64:
  4105  		c.assembler.CompileRegisterToMemory(arm64.STRD, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4106  	case runtimeValueTypeF32:
  4107  		c.assembler.CompileRegisterToMemory(arm64.FSTRS, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4108  	case runtimeValueTypeF64:
  4109  		c.assembler.CompileRegisterToMemory(arm64.FSTRD, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4110  	case runtimeValueTypeV128Lo:
  4111  		c.assembler.CompileVectorRegisterToMemory(arm64.VMOV,
  4112  			loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8,
  4113  			arm64.VectorArrangementQ)
  4114  		// The higher 64 bits are released by the 128-bit store above as well.
  4115  		hi := &c.locationStack.stack[loc.stackPointer+1]
  4116  		c.locationStack.releaseRegister(hi)
  4117  	case runtimeValueTypeV128Hi:
  4118  		panic("BUG: V128Hi must be released to the stack along with V128Lo")
  4119  	default:
  4120  		panic("BUG")
  4121  	}
  4122  
  4123  	// Mark the register as free.
  4124  	c.locationStack.releaseRegister(loc)
  4125  }
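        // Note on v128 values: a vector occupies two consecutive 8-byte stack slots (Lo at
        // stackPointer, Hi at stackPointer+1) but is written with a single 128-bit (arrangement Q)
        // store, so only the register bookkeeping for the Hi half is released separately above.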
  4126  
  4127  // compileReservedStackBasePointerRegisterInitialization adds instructions to initialize arm64ReservedRegisterForStackBasePointerAddress
  4128  // so that it points to the absolute address of the stack base for this function.
  4129  func (c *arm64Compiler) compileReservedStackBasePointerRegisterInitialization() {
  4130  	// First, load the address of the first element in the value stack into arm64ReservedRegisterForStackBasePointerAddress temporarily.
  4131  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4132  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackElement0AddressOffset,
  4133  		arm64ReservedRegisterForStackBasePointerAddress)
  4134  
  4135  	// Next, we load the stack base pointer in bytes (ce.stackContext.stackBasePointerInBytes) into arm64ReservedRegisterForTemporary.
  4136  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4137  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  4138  		arm64ReservedRegisterForTemporary)
  4139  
  4140  	// Finally, we calculate "arm64ReservedRegisterForStackBasePointerAddress + arm64ReservedRegisterForTemporary", i.e. &ce.stack[0] plus the base pointer offset in bytes.
  4141  	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, arm64ReservedRegisterForStackBasePointerAddress)
  4142  }
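        // In Go terms, the three instructions above compute (a sketch; field names approximate):
        //
        //	base := uintptr(unsafe.Pointer(&ce.stack[0]))  // first LDRD
        //	off := ce.stackContext.stackBasePointerInBytes // second LDRD
        //	stackBasePointerAddress := base + uintptr(off) // ADD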
  4143  
  4144  func (c *arm64Compiler) compileReservedMemoryRegisterInitialization() {
  4145  	if c.ir.HasMemory || c.ir.UsesMemory {
  4146  		// "arm64ReservedRegisterForMemory = ce.MemoryElement0Address"
  4147  		c.assembler.CompileMemoryToRegister(
  4148  			arm64.LDRD,
  4149  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
  4150  			arm64ReservedRegisterForMemory,
  4151  		)
  4152  	}
  4153  }
  4154  
  4155  // compileModuleContextInitialization adds instructions to initialize ce.moduleContext's fields based on
  4156  // ce.moduleContext.ModuleInstanceAddress.
  4157  // This is called in two cases: in function preamble, and on the return from (non-Go) function calls.
  4158  func (c *arm64Compiler) compileModuleContextInitialization() error {
  4159  	tmpX, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4160  	if !found {
  4161  		panic("BUG: all the registers should be free at this point")
  4162  	}
  4163  	c.markRegisterUsed(tmpX)
  4164  	tmpY, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4165  	if !found {
  4166  		panic("BUG: all the registers should be free at this point")
  4167  	}
  4168  	c.markRegisterUsed(tmpY)
  4169  
  4170  	// "tmpX = ce.ModuleInstanceAddress"
  4171  	c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset, tmpX)
  4172  
  4173  	// If the module instance address stays the same, we can skip the entire code below.
  4174  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64CallingConventionModuleInstanceAddressRegister, tmpX)
  4175  	brIfModuleUnchanged := c.assembler.CompileJump(arm64.BCONDEQ)
  4176  
  4177  	// Otherwise, update the moduleEngine.moduleContext.ModuleInstanceAddress.
  4178  	c.assembler.CompileRegisterToMemory(arm64.STRD,
  4179  		arm64CallingConventionModuleInstanceAddressRegister,
  4180  		arm64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset,
  4181  	)
  4182  
  4183  	// Also, we have to update the following fields:
  4184  	// * callEngine.moduleContext.globalElement0Address
  4185  	// * callEngine.moduleContext.memoryElement0Address
  4186  	// * callEngine.moduleContext.memorySliceLen
  4187  	// * callEngine.moduleContext.memoryInstance
  4188  	// * callEngine.moduleContext.tableElement0Address
  4189  	// * callEngine.moduleContext.tableSliceLen
  4190  	// * callEngine.moduleContext.functionsElement0Address
  4191  	// * callEngine.moduleContext.typeIDsElement0Address
  4192  	// * callEngine.moduleContext.dataInstancesElement0Address
  4193  	// * callEngine.moduleContext.elementInstancesElement0Address
  4194  
  4195  	// Update globalElement0Address.
  4196  	//
  4197  	// Note: if there's a global.get or global.set instruction in the function, the existence of the globals
  4198  	// is ensured by function validation at the module instantiation phase, and that's why it is ok to
  4199  	// skip the initialization if the module's globals slice is empty.
  4200  	if len(c.ir.Globals) > 0 {
  4201  		// "tmpX = &moduleInstance.Globals[0]"
  4202  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4203  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceGlobalsOffset,
  4204  			tmpX,
  4205  		)
  4206  
  4207  		// "ce.GlobalElement0Address = tmpX (== &moduleInstance.Globals[0])"
  4208  		c.assembler.CompileRegisterToMemory(
  4209  			arm64.STRD, tmpX,
  4210  			arm64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset,
  4211  		)
  4212  	}
  4213  
  4214  	// Update memoryElement0Address and memorySliceLen.
  4215  	//
  4216  	// Note: if there's a memory instruction in the function, the memory instance must be non-nil.
  4217  	// That is ensured by function validation at the module instantiation phase, and that's
  4218  	// why it is ok to skip the initialization if the module's memory instance is nil.
  4219  	if c.ir.HasMemory {
  4220  		// "tmpX = moduleInstance.Memory"
  4221  		c.assembler.CompileMemoryToRegister(
  4222  			arm64.LDRD,
  4223  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceMemoryOffset,
  4224  			tmpX,
  4225  		)
  4226  
  4227  		// First, set ce.memoryInstance
  4228  		c.assembler.CompileRegisterToMemory(
  4229  			arm64.STRD,
  4230  			tmpX,
  4231  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset,
  4232  		)
  4233  
  4234  		// Then, we write ce.memoryElement0Address.
  4235  		//
  4236  		// "tmpY = *tmpX (== &memory.Buffer[0])"
  4237  		c.assembler.CompileMemoryToRegister(
  4238  			arm64.LDRD,
  4239  			tmpX, memoryInstanceBufferOffset,
  4240  			tmpY,
  4241  		)
  4242  		// "ce.memoryElement0Address = tmpY".
  4243  		c.assembler.CompileRegisterToMemory(
  4244  			arm64.STRD,
  4245  			tmpY,
  4246  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
  4247  		)
  4248  	}
  4249  
  4250  	// Update tableElement0Address, tableSliceLen and typeIDsElement0Address.
  4251  	//
  4252  	// Note: if there's a table instruction in the function, the existence of the table
  4253  	// is ensured by function validation at the module instantiation phase, and that's
  4254  	// why it is ok to skip the initialization if the module's table doesn't exist.
  4255  	if c.ir.HasTable {
  4256  		// "tmpX = &tables[0] (type of **wasm.Table)"
  4257  		c.assembler.CompileMemoryToRegister(
  4258  			arm64.LDRD,
  4259  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceTablesOffset,
  4260  			tmpX,
  4261  		)
  4262  
  4263  		// Update ce.tableElement0Address.
  4264  		// "ce.tableElement0Address = tmpX".
  4265  		c.assembler.CompileRegisterToMemory(
  4266  			arm64.STRD,
  4267  			tmpX,
  4268  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4269  		)
  4270  
  4271  		// Finally, we put &ModuleInstance.TypeIDs[0] into moduleContext.typeIDsElement0Address.
  4272  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4273  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceTypeIDsOffset, tmpX)
  4274  		c.assembler.CompileRegisterToMemory(arm64.STRD,
  4275  			tmpX, arm64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset)
  4276  	}
  4277  
  4278  	// Update callEngine.moduleContext.functionsElement0Address
  4279  	{
  4280  		// "tmpX = [moduleInstanceAddressRegister + moduleInstanceEngineOffset + interfaceDataOffset] (== *moduleEngine)"
  4281  		//
  4282  		// Go's interface is laid out in memory as two quad words, as in struct {tab, data uintptr},
  4283  		// where tab points to the interface table, and data points to the actual
  4284  		// implementation of the interface. In this case, we extract the "data" pointer as *moduleEngine.
  4285  		// See the following references for detail:
  4286  		// * https://research.swtch.com/interfaces
  4287  		// * https://github.com/golang/go/blob/release-branch.go1.20/src/runtime/runtime2.go#L207-L210
  4288  		c.assembler.CompileMemoryToRegister(
  4289  			arm64.LDRD,
  4290  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceEngineOffset+interfaceDataOffset,
  4291  			tmpX,
  4292  		)
  4293  
  4294  		// "tmpY = [tmpX + moduleEngineFunctionsOffset] (== &moduleEngine.functions[0])"
  4295  		c.assembler.CompileMemoryToRegister(
  4296  			arm64.LDRD,
  4297  			tmpX, moduleEngineFunctionsOffset,
  4298  			tmpY,
  4299  		)
  4300  
  4301  		// "callEngine.moduleContext.functionsElement0Address = tmpY".
  4302  		c.assembler.CompileRegisterToMemory(
  4303  			arm64.STRD,
  4304  			tmpY,
  4305  			arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  4306  		)
  4307  	}
  4308  
  4309  	// Update dataInstancesElement0Address.
  4310  	if c.ir.HasDataInstances {
  4311  		// "tmpX = &moduleInstance.DataInstances[0]"
  4312  		c.assembler.CompileMemoryToRegister(
  4313  			arm64.LDRD,
  4314  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceDataInstancesOffset,
  4315  			tmpX,
  4316  		)
  4317  		// "callEngine.moduleContext.dataInstancesElement0Address = tmpX".
  4318  		c.assembler.CompileRegisterToMemory(
  4319  			arm64.STRD,
  4320  			tmpX,
  4321  			arm64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  4322  		)
  4323  	}
  4324  
  4325  	// Update callEngine.moduleContext.elementInstancesElement0Address
  4326  	if c.ir.HasElementInstances {
  4327  		// "tmpX = &moduleInstance.DataInstances[0]"
  4328  		c.assembler.CompileMemoryToRegister(
  4329  			arm64.LDRD,
  4330  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceElementInstancesOffset,
  4331  			tmpX,
  4332  		)
  4333  		// "callEngine.moduleContext.dataInstancesElement0Address = tmpX".
  4334  		c.assembler.CompileRegisterToMemory(
  4335  			arm64.STRD,
  4336  			tmpX,
  4337  			arm64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  4338  		)
  4339  	}
  4340  
  4341  	c.assembler.SetJumpTargetOnNext(brIfModuleUnchanged)
  4342  	c.markRegisterUnused(tmpX, tmpY)
  4343  	return nil
  4344  }
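        // The interface extraction above depends on the Go runtime's two-word interface
        // representation. A sketch of the equivalent unsafe Go (an implementation detail of the
        // runtime, not a stable API):
        //
        //	type iface struct{ tab, data unsafe.Pointer }
        //	me := (*moduleEngine)((*iface)(unsafe.Pointer(&moduleInstance.Engine)).data)
        //	functionsElement0Address := unsafe.Pointer(&me.functions[0])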
  4345  
        // compileAtomicLoad implements compiler.compileAtomicLoad for the arm64 architecture.
  4346  func (c *arm64Compiler) compileAtomicLoad(o *wazeroir.UnionOperation) error {
  4347  	var (
  4348  		loadInst          asm.Instruction
  4349  		targetSizeInBytes int64
  4350  		vt                runtimeValueType
  4351  	)
  4352  
  4353  	unsignedType := wazeroir.UnsignedType(o.B1)
  4354  	offset := uint32(o.U2)
  4355  
  4356  	switch unsignedType {
  4357  	case wazeroir.UnsignedTypeI32:
  4358  		loadInst = arm64.LDARW
  4359  		targetSizeInBytes = 32 / 8
  4360  		vt = runtimeValueTypeI32
  4361  	case wazeroir.UnsignedTypeI64:
  4362  		loadInst = arm64.LDARD
  4363  		targetSizeInBytes = 64 / 8
  4364  		vt = runtimeValueTypeI64
  4365  	}
  4366  	return c.compileAtomicLoadImpl(offset, loadInst, targetSizeInBytes, vt)
  4367  }
  4368  
  4369  // compileAtomicLoad8 implements compiler.compileAtomicLoad8 for the arm64 architecture.
  4370  func (c *arm64Compiler) compileAtomicLoad8(o *wazeroir.UnionOperation) error {
  4371  	var vt runtimeValueType
  4372  
  4373  	unsignedType := wazeroir.UnsignedType(o.B1)
  4374  	offset := uint32(o.U2)
  4375  
  4376  	switch unsignedType {
  4377  	case wazeroir.UnsignedTypeI32:
  4378  		vt = runtimeValueTypeI32
  4379  	case wazeroir.UnsignedTypeI64:
  4380  		vt = runtimeValueTypeI64
  4381  	}
  4382  	return c.compileAtomicLoadImpl(offset, arm64.LDARB, 1, vt)
  4383  }
  4384  
  4385  // compileAtomicLoad16 implements compiler.compileAtomicLoad16 for the arm64 architecture.
  4386  func (c *arm64Compiler) compileAtomicLoad16(o *wazeroir.UnionOperation) error {
  4387  	var vt runtimeValueType
  4388  
  4389  	unsignedType := wazeroir.UnsignedType(o.B1)
  4390  	offset := uint32(o.U2)
  4391  
  4392  	switch unsignedType {
  4393  	case wazeroir.UnsignedTypeI32:
  4394  		vt = runtimeValueTypeI32
  4395  	case wazeroir.UnsignedTypeI64:
  4396  		vt = runtimeValueTypeI64
  4397  	}
  4398  	return c.compileAtomicLoadImpl(offset, arm64.LDARH, 16/8, vt)
  4399  }
  4400  
  4401  func (c *arm64Compiler) compileAtomicLoadImpl(offsetArg uint32, loadInst asm.Instruction,
  4402  	targetSizeInBytes int64, resultRuntimeValueType runtimeValueType,
  4403  ) error {
  4404  	baseReg, err := c.compileMemoryAccessBaseSetup(offsetArg, targetSizeInBytes)
  4405  	if err != nil {
  4406  		return err
  4407  	}
  4408  
  4409  	c.compileMemoryAlignmentCheck(baseReg, targetSizeInBytes)
  4410  
  4411  	resultRegister := baseReg
  4412  	c.assembler.CompileMemoryWithRegisterSourceToRegister(loadInst, baseReg, resultRegister)
  4413  
  4414  	c.pushRuntimeValueLocationOnRegister(resultRegister, resultRuntimeValueType)
  4415  	return nil
  4416  }
  4417  
        // compileAtomicStore implements compiler.compileAtomicStore for the arm64 architecture.
  4418  func (c *arm64Compiler) compileAtomicStore(o *wazeroir.UnionOperation) error {
  4419  	var (
  4420  		storeInst         asm.Instruction
  4421  		targetSizeInBytes int64
  4422  	)
  4423  
  4424  	unsignedType := wazeroir.UnsignedType(o.B1)
  4425  	offset := uint32(o.U2)
  4426  
  4427  	switch unsignedType {
  4428  	case wazeroir.UnsignedTypeI32:
  4429  		storeInst = arm64.STLRW
  4430  		targetSizeInBytes = 32 / 8
  4431  	case wazeroir.UnsignedTypeI64:
  4432  		storeInst = arm64.STLRD
  4433  		targetSizeInBytes = 64 / 8
  4434  	}
  4435  	return c.compileAtomicStoreImpl(offset, storeInst, targetSizeInBytes)
  4436  }
  4437  
  4438  // compileAtomicStore8 implements compiler.compileAtomicStore8 for the arm64 architecture.
  4439  func (c *arm64Compiler) compileAtomicStore8(o *wazeroir.UnionOperation) error {
  4440  	offset := uint32(o.U2)
  4441  	return c.compileAtomicStoreImpl(offset, arm64.STLRB, 1)
  4442  }
  4443  
  4444  // compileAtomicStore16 implements compiler.compileAtomicStore16 for the arm64 architecture.
  4445  func (c *arm64Compiler) compileAtomicStore16(o *wazeroir.UnionOperation) error {
  4446  	offset := uint32(o.U2)
  4447  	return c.compileAtomicStoreImpl(offset, arm64.STLRH, 16/8)
  4448  }
  4449  
  4450  func (c *arm64Compiler) compileAtomicStoreImpl(offsetArg uint32, storeInst asm.Instruction, targetSizeInBytes int64) error {
  4451  	val, err := c.popValueOnRegister()
  4452  	if err != nil {
  4453  		return err
  4454  	}
  4455  	// Mark temporarily used as compileMemoryAccessBaseSetup might try allocating a register.
  4456  	c.markRegisterUsed(val.register)
  4457  
  4458  	baseReg, err := c.compileMemoryAccessBaseSetup(offsetArg, targetSizeInBytes)
  4459  	if err != nil {
  4460  		return err
  4461  	}
  4462  	c.compileMemoryAlignmentCheck(baseReg, targetSizeInBytes)
  4463  
  4464  	c.assembler.CompileRegisterToMemoryWithRegisterDest(
  4465  		storeInst,
  4466  		val.register,
  4467  		baseReg,
  4468  	)
  4469  
  4470  	c.markRegisterUnused(val.register)
  4471  	return nil
  4472  }
  4473  
  4474  func (c *arm64Compiler) compileAtomicRMW(o *wazeroir.UnionOperation) error {
  4475  	var (
  4476  		inst              asm.Instruction
  4477  		targetSizeInBytes int64
  4478  		vt                runtimeValueType
  4479  		negateArg         bool
  4480  		flipArg           bool
  4481  	)
  4482  
  4483  	unsignedType := wazeroir.UnsignedType(o.B1)
  4484  	op := wazeroir.AtomicArithmeticOp(o.B2)
  4485  	offset := uint32(o.U2)
  4486  
  4487  	switch unsignedType {
  4488  	case wazeroir.UnsignedTypeI32:
  4489  		targetSizeInBytes = 32 / 8
  4490  		vt = runtimeValueTypeI32
  4491  		switch op {
  4492  		case wazeroir.AtomicArithmeticOpAdd:
  4493  			inst = arm64.LDADDALW
  4494  		case wazeroir.AtomicArithmeticOpSub:
  4495  			inst = arm64.LDADDALW
  4496  			negateArg = true
  4497  		case wazeroir.AtomicArithmeticOpAnd:
  4498  			inst = arm64.LDCLRALW
  4499  			flipArg = true
  4500  		case wazeroir.AtomicArithmeticOpOr:
  4501  			inst = arm64.LDSETALW
  4502  		case wazeroir.AtomicArithmeticOpXor:
  4503  			inst = arm64.LDEORALW
  4504  		case wazeroir.AtomicArithmeticOpNop:
  4505  			inst = arm64.SWPALW
  4506  		}
  4507  	case wazeroir.UnsignedTypeI64:
  4508  		targetSizeInBytes = 64 / 8
  4509  		vt = runtimeValueTypeI64
  4510  		switch op {
  4511  		case wazeroir.AtomicArithmeticOpAdd:
  4512  			inst = arm64.LDADDALD
  4513  		case wazeroir.AtomicArithmeticOpSub:
  4514  			inst = arm64.LDADDALD
  4515  			negateArg = true
  4516  		case wazeroir.AtomicArithmeticOpAnd:
  4517  			inst = arm64.LDCLRALD
  4518  			flipArg = true
  4519  		case wazeroir.AtomicArithmeticOpOr:
  4520  			inst = arm64.LDSETALD
  4521  		case wazeroir.AtomicArithmeticOpXor:
  4522  			inst = arm64.LDEORALD
  4523  		case wazeroir.AtomicArithmeticOpNop:
  4524  			inst = arm64.SWPALD
  4525  		}
  4526  	}
  4527  	return c.compileAtomicRMWImpl(inst, offset, negateArg, flipArg, targetSizeInBytes, vt)
  4528  }
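        // arm64 LSE atomics have no direct subtract or and forms, so the dispatch above rewrites
        // them in terms of the available instructions (sketch):
        //
        //	sub(v)  => LDADDAL(-v)  // negate the argument, then atomic add
        //	and(v)  => LDCLRAL(^v)  // flip the bits, then atomic bit-clear
        //	xchg(v) => SWPAL(v)     // the "Nop" arithmetic op denotes a plain exchange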
  4529  
  4530  func (c *arm64Compiler) compileAtomicRMW8(o *wazeroir.UnionOperation) error {
  4531  	var (
  4532  		inst      asm.Instruction
  4533  		vt        runtimeValueType
  4534  		negateArg bool
  4535  		flipArg   bool
  4536  	)
  4537  
  4538  	unsignedType := wazeroir.UnsignedType(o.B1)
  4539  	op := wazeroir.AtomicArithmeticOp(o.B2)
  4540  	offset := uint32(o.U2)
  4541  
  4542  	switch op {
  4543  	case wazeroir.AtomicArithmeticOpAdd:
  4544  		inst = arm64.LDADDALB
  4545  	case wazeroir.AtomicArithmeticOpSub:
  4546  		inst = arm64.LDADDALB
  4547  		negateArg = true
  4548  	case wazeroir.AtomicArithmeticOpAnd:
  4549  		inst = arm64.LDCLRALB
  4550  		flipArg = true
  4551  	case wazeroir.AtomicArithmeticOpOr:
  4552  		inst = arm64.LDSETALB
  4553  	case wazeroir.AtomicArithmeticOpXor:
  4554  		inst = arm64.LDEORALB
  4555  	case wazeroir.AtomicArithmeticOpNop:
  4556  		inst = arm64.SWPALB
  4557  	}
  4558  
  4559  	switch unsignedType {
  4560  	case wazeroir.UnsignedTypeI32:
  4561  		vt = runtimeValueTypeI32
  4562  	case wazeroir.UnsignedTypeI64:
  4563  		vt = runtimeValueTypeI64
  4564  	}
  4565  	return c.compileAtomicRMWImpl(inst, offset, negateArg, flipArg, 1, vt)
  4566  }
  4567  
  4568  func (c *arm64Compiler) compileAtomicRMW16(o *wazeroir.UnionOperation) error {
  4569  	var (
  4570  		inst      asm.Instruction
  4571  		vt        runtimeValueType
  4572  		negateArg bool
  4573  		flipArg   bool
  4574  	)
  4575  
  4576  	unsignedType := wazeroir.UnsignedType(o.B1)
  4577  	op := wazeroir.AtomicArithmeticOp(o.B2)
  4578  	offset := uint32(o.U2)
  4579  
  4580  	switch op {
  4581  	case wazeroir.AtomicArithmeticOpAdd:
  4582  		inst = arm64.LDADDALH
  4583  	case wazeroir.AtomicArithmeticOpSub:
  4584  		inst = arm64.LDADDALH
  4585  		negateArg = true
  4586  	case wazeroir.AtomicArithmeticOpAnd:
  4587  		inst = arm64.LDCLRALH
  4588  		flipArg = true
  4589  	case wazeroir.AtomicArithmeticOpOr:
  4590  		inst = arm64.LDSETALH
  4591  	case wazeroir.AtomicArithmeticOpXor:
  4592  		inst = arm64.LDEORALH
  4593  	case wazeroir.AtomicArithmeticOpNop:
  4594  		inst = arm64.SWPALH
  4595  	}
  4596  
  4597  	switch unsignedType {
  4598  	case wazeroir.UnsignedTypeI32:
  4599  		vt = runtimeValueTypeI32
  4600  	case wazeroir.UnsignedTypeI64:
  4601  		vt = runtimeValueTypeI64
  4602  	}
  4603  	return c.compileAtomicRMWImpl(inst, offset, negateArg, flipArg, 16/8, vt)
  4604  }
  4605  
  4606  func (c *arm64Compiler) compileAtomicRMWImpl(inst asm.Instruction, offsetArg uint32, negateArg bool, flipArg bool,
  4607  	targetSizeInBytes int64, resultRuntimeValueType runtimeValueType,
  4608  ) error {
  4609  	val, err := c.popValueOnRegister()
  4610  	if err != nil {
  4611  		return err
  4612  	}
  4613  	// Mark temporarily used as compileMemoryAccessBaseSetup might try allocating a register.
  4614  	c.markRegisterUsed(val.register)
  4615  
  4616  	if negateArg {
  4617  		switch resultRuntimeValueType {
  4618  		case runtimeValueTypeI32:
  4619  			c.assembler.CompileRegisterToRegister(arm64.NEGW, val.register, val.register)
  4620  		case runtimeValueTypeI64:
  4621  			c.assembler.CompileRegisterToRegister(arm64.NEG, val.register, val.register)
  4622  		}
  4623  	}
  4624  
  4625  	if flipArg {
  4626  		switch resultRuntimeValueType {
  4627  		case runtimeValueTypeI32:
  4628  			c.assembler.CompileTwoRegistersToRegister(arm64.ORNW, val.register, arm64.RegRZR, val.register)
  4629  		case runtimeValueTypeI64:
  4630  			c.assembler.CompileTwoRegistersToRegister(arm64.ORN, val.register, arm64.RegRZR, val.register)
  4631  		}
  4632  	}
  4633  
  4634  	addrReg, err := c.compileMemoryAccessBaseSetup(offsetArg, targetSizeInBytes)
  4635  	if err != nil {
  4636  		return err
  4637  	}
  4638  	c.compileMemoryAlignmentCheck(addrReg, targetSizeInBytes)
  4639  
  4640  	resultRegister := addrReg
  4641  	c.assembler.CompileTwoRegistersToRegister(inst, val.register, addrReg, resultRegister)
  4642  
  4643  	c.markRegisterUnused(val.register)
  4644  
  4645  	c.pushRuntimeValueLocationOnRegister(resultRegister, resultRuntimeValueType)
  4646  	return nil
  4647  }
  4648  
  4649  func (c *arm64Compiler) compileAtomicRMWCmpxchg(o *wazeroir.UnionOperation) error {
  4650  	var (
  4651  		casInst           asm.Instruction
  4652  		targetSizeInBytes int64
  4653  		vt                runtimeValueType
  4654  	)
  4655  
  4656  	unsignedType := wazeroir.UnsignedType(o.B1)
  4657  	offset := uint32(o.U2)
  4658  
  4659  	switch unsignedType {
  4660  	case wazeroir.UnsignedTypeI32:
  4661  		casInst = arm64.CASALW
  4662  		targetSizeInBytes = 32 / 8
  4663  		vt = runtimeValueTypeI32
  4664  	case wazeroir.UnsignedTypeI64:
  4665  		casInst = arm64.CASALD
  4666  		targetSizeInBytes = 64 / 8
  4667  		vt = runtimeValueTypeI64
  4668  	}
  4669  	return c.compileAtomicRMWCmpxchgImpl(casInst, offset, targetSizeInBytes, vt)
  4670  }
  4671  
  4672  func (c *arm64Compiler) compileAtomicRMW8Cmpxchg(o *wazeroir.UnionOperation) error {
  4673  	var vt runtimeValueType
  4674  
  4675  	unsignedType := wazeroir.UnsignedType(o.B1)
  4676  	offset := uint32(o.U2)
  4677  
  4678  	switch unsignedType {
  4679  	case wazeroir.UnsignedTypeI32:
  4680  		vt = runtimeValueTypeI32
  4681  	case wazeroir.UnsignedTypeI64:
  4682  		vt = runtimeValueTypeI64
  4683  	}
  4684  	return c.compileAtomicRMWCmpxchgImpl(arm64.CASALB, offset, 1, vt)
  4685  }
  4686  
  4687  func (c *arm64Compiler) compileAtomicRMW16Cmpxchg(o *wazeroir.UnionOperation) error {
  4688  	var vt runtimeValueType
  4689  
  4690  	unsignedType := wazeroir.UnsignedType(o.B1)
  4691  	offset := uint32(o.U2)
  4692  
  4693  	switch unsignedType {
  4694  	case wazeroir.UnsignedTypeI32:
  4695  		vt = runtimeValueTypeI32
  4696  	case wazeroir.UnsignedTypeI64:
  4697  		vt = runtimeValueTypeI64
  4698  	}
  4699  	return c.compileAtomicRMWCmpxchgImpl(arm64.CASALH, offset, 16/8, vt)
  4700  }
  4701  
  4702  func (c *arm64Compiler) compileAtomicRMWCmpxchgImpl(inst asm.Instruction, offsetArg uint32, targetSizeInBytes int64, resultRuntimeValueType runtimeValueType) error {
  4703  	repl, err := c.popValueOnRegister()
  4704  	if err != nil {
  4705  		return err
  4706  	}
  4707  	c.markRegisterUsed(repl.register)
  4708  	// The CAS instruction loads the old value into the register that holds the comparison value.
  4709  	exp, err := c.popValueOnRegister()
  4710  	if err != nil {
  4711  		return err
  4712  	}
  4713  	if isZeroRegister(exp.register) {
  4714  		// exp also receives the loaded (old) value, so if it's set to the zero register we
  4715  		// need to move it to a writable register.
  4716  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4717  		if err != nil {
  4718  			return err
  4719  		}
  4720  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, reg)
  4721  		exp.register = reg
  4722  	}
  4723  	// Mark temporarily used as compileMemoryAccessBaseSetup might try allocating a register.
  4724  	c.markRegisterUsed(exp.register)
  4725  
  4726  	addrReg, err := c.compileMemoryAccessBaseSetup(offsetArg, targetSizeInBytes)
  4727  	if err != nil {
  4728  		return err
  4729  	}
  4730  	c.compileMemoryAlignmentCheck(addrReg, targetSizeInBytes)
  4731  
  4732  	c.assembler.CompileTwoRegistersToRegister(inst, exp.register, addrReg, repl.register)
  4733  
  4734  	c.markRegisterUnused(repl.register)
  4735  	c.pushRuntimeValueLocationOnRegister(exp.register, resultRuntimeValueType)
  4736  	return nil
  4737  }
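        // CASAL semantics in Go terms (sketch): the expected-value register is both an input and
        // an output, which is why exp.register is pushed as the result above:
        //
        //	old := *addr
        //	if old == exp {
        //		*addr = repl
        //	}
        //	exp = old // always receives the observed value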
  4738  
  4739  func (c *arm64Compiler) compileAtomicMemoryWait(o *wazeroir.UnionOperation) error {
  4740  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4741  		return err
  4742  	}
  4743  
  4744  	var (
  4745  		vt                runtimeValueType
  4746  		targetSizeInBytes int64
  4747  		waitFunc          wasm.Index
  4748  	)
  4749  
  4750  	unsignedType := wazeroir.UnsignedType(o.B1)
  4751  	offset := uint32(o.U2)
  4752  
  4753  	switch unsignedType {
  4754  	case wazeroir.UnsignedTypeI32:
  4755  		vt = runtimeValueTypeI32
  4756  		targetSizeInBytes = 32 / 8
  4757  		waitFunc = builtinFunctionMemoryWait32
  4758  	case wazeroir.UnsignedTypeI64:
  4759  		vt = runtimeValueTypeI64
  4760  		targetSizeInBytes = 64 / 8
  4761  		waitFunc = builtinFunctionMemoryWait64
  4762  	}
  4763  
  4764  	timeout, err := c.popValueOnRegister()
  4765  	if err != nil {
  4766  		return err
  4767  	}
  4768  	c.markRegisterUsed(timeout.register)
  4769  	exp, err := c.popValueOnRegister()
  4770  	if err != nil {
  4771  		return err
  4772  	}
  4773  	c.markRegisterUsed(exp.register)
  4774  
  4775  	baseReg, err := c.compileMemoryAccessBaseSetup(offset, targetSizeInBytes)
  4776  	if err != nil {
  4777  		return err
  4778  	}
  4779  	c.markRegisterUsed(baseReg)
  4780  	c.compileMemoryAlignmentCheck(baseReg, targetSizeInBytes)
  4781  
  4782  	// Push the address, expected value, and timeout back so they can be read in Go.
  4783  	c.pushRuntimeValueLocationOnRegister(baseReg, runtimeValueTypeI64)
  4784  	c.pushRuntimeValueLocationOnRegister(exp.register, vt)
  4785  	c.pushRuntimeValueLocationOnRegister(timeout.register, runtimeValueTypeI64)
  4786  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, waitFunc); err != nil {
  4787  		return err
  4788  	}
  4789  	// The address, expected value, and timeout are consumed in Go.
  4790  	c.locationStack.pop()
  4791  	c.locationStack.pop()
  4792  	c.locationStack.pop()
  4793  
  4794  	// Then, the result was pushed.
  4795  	v := c.locationStack.pushRuntimeValueLocationOnStack()
  4796  	v.valueType = runtimeValueTypeI32
  4797  
  4798  	c.markRegisterUnused(baseReg)
  4799  	c.markRegisterUnused(exp.register)
  4800  	c.markRegisterUnused(timeout.register)
  4801  
  4802  	// After return, we re-initialize reserved registers just like the preamble of functions.
  4803  	c.compileReservedStackBasePointerRegisterInitialization()
  4804  	c.compileReservedMemoryRegisterInitialization()
  4805  
  4806  	return nil
  4807  }
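        // Calling convention sketch for the built-in: the address, expected value, and timeout are
        // spilled to the value stack so the Go side can read them, and a single i32 result replaces
        // them (0: woken, 1: value mismatch, 2: timed out, per the wasm threads proposal).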
  4808  
  4809  func (c *arm64Compiler) compileAtomicMemoryNotify(o *wazeroir.UnionOperation) error {
  4810  	offset := uint32(o.U2)
  4811  
  4812  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4813  		return err
  4814  	}
  4815  
  4816  	count, err := c.popValueOnRegister()
  4817  	if err != nil {
  4818  		return err
  4819  	}
  4820  	c.markRegisterUsed(count.register)
  4821  
  4822  	baseReg, err := c.compileMemoryAccessBaseSetup(offset, 4)
  4823  	if err != nil {
  4824  		return err
  4825  	}
  4826  	c.compileMemoryAlignmentCheck(baseReg, 4)
  4827  
  4828  	// Push the address and count back so they can be read in Go.
  4829  	c.pushRuntimeValueLocationOnRegister(baseReg, runtimeValueTypeI64)
  4830  	c.pushRuntimeValueLocationOnRegister(count.register, runtimeValueTypeI32)
  4831  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionMemoryNotify); err != nil {
  4832  		return err
  4833  	}
  4834  
  4835  	// The address and count are consumed by Go.
  4836  	c.locationStack.pop()
  4837  	c.locationStack.pop()
  4838  
  4839  	// Then, the result was pushed.
  4840  	v := c.locationStack.pushRuntimeValueLocationOnStack()
  4841  	v.valueType = runtimeValueTypeI32
  4842  
  4843  	c.markRegisterUnused(count.register)
  4844  
  4845  	// After return, we re-initialize reserved registers just like the preamble of functions.
  4846  	c.compileReservedStackBasePointerRegisterInitialization()
  4847  	c.compileReservedMemoryRegisterInitialization()
  4848  	return nil
  4849  }
  4850  
        // compileAtomicFence implements compiler.compileAtomicFence for the arm64 architecture.
  4851  func (c *arm64Compiler) compileAtomicFence(_ *wazeroir.UnionOperation) error {
  4852  	c.assembler.CompileStandAlone(arm64.DMB)
  4853  	return nil
  4854  }