github.com/tetratelabs/wazero@v1.2.1/internal/engine/compiler/impl_arm64.go

     1  // This file implements the compiler for arm64 target.
     2  // Please refer to https://developer.arm.com/documentation/102374/latest/
     3  // if unfamiliar with arm64 instructions and semantics.
     4  package compiler
     5  
     6  import (
     7  	"bytes"
     8  	"errors"
     9  	"fmt"
    10  	"math"
    11  
    12  	"github.com/tetratelabs/wazero/internal/asm"
    13  	"github.com/tetratelabs/wazero/internal/asm/arm64"
    14  	"github.com/tetratelabs/wazero/internal/wasm"
    15  	"github.com/tetratelabs/wazero/internal/wazeroir"
    16  )
    17  
    18  type arm64Compiler struct {
    19  	assembler arm64.Assembler
    20  	ir        *wazeroir.CompilationResult
    21  	// locationStack holds the state of the wazeroir virtual stack,
    22  	// and each item is either placed in a register or on the actual memory stack.
    23  	locationStack *runtimeValueLocationStack
    24  	// labels maps a label (e.g. ".L1_then") to *arm64LabelInfo.
    25  	labels [wazeroir.LabelKindNum][]arm64LabelInfo
    26  	// stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation.
    27  	stackPointerCeil uint64
    28  	// assignStackPointerCeilNeeded holds an asm.Node whose AssignSourceConstant must be called with the determined stack pointer ceiling.
    29  	assignStackPointerCeilNeeded asm.Node
    30  	compiledTrapTargets          [nativeCallStatusModuleClosed]asm.Node
    31  	withListener                 bool
    32  	typ                          *wasm.FunctionType
    33  	br                           *bytes.Reader
    34  	// locationStackForEntrypoint is the initial location stack for all functions. To reuse the allocated stack,
    35  	// we cache it here, then reset it and assign it to .locationStack in the Init method.
    36  	locationStackForEntrypoint runtimeValueLocationStack
    37  	// frameIDMax tracks the maximum value of frame id per function.
    38  	frameIDMax int
    39  	brTableTmp []runtimeValueLocation
    40  }
    41  
    42  func newArm64Compiler() compiler {
    43  	return &arm64Compiler{
    44  		assembler:                  arm64.NewAssembler(arm64ReservedRegisterForTemporary),
    45  		locationStackForEntrypoint: newRuntimeValueLocationStack(),
    46  		br:                         bytes.NewReader(nil),
    47  	}
    48  }
    49  
    50  // Init implements compiler.Init.
    51  func (c *arm64Compiler) Init(typ *wasm.FunctionType, ir *wazeroir.CompilationResult, withListener bool) {
    52  	c.assembler.Reset()
    53  	c.locationStackForEntrypoint.reset()
    54  	c.resetLabels()
    55  
    56  	*c = arm64Compiler{
    57  		ir:                         ir,
    58  		withListener:               withListener,
    59  		typ:                        typ,
    60  		assembler:                  c.assembler,
    61  		labels:                     c.labels,
    62  		br:                         c.br,
    63  		brTableTmp:                 c.brTableTmp,
    64  		locationStackForEntrypoint: c.locationStackForEntrypoint,
    65  	}
    66  
    67  	// Reuses the initial location stack for the compilation of subsequent functions.
    68  	c.locationStack = &c.locationStackForEntrypoint
    69  }
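
        // The Init pattern above resets the reusable allocations and then overwrites the
        // whole struct while carrying those allocations over, avoiding reallocation per
        // function. A minimal standalone sketch of the idiom (hypothetical type, not
        // wazero's):
        //
        //	type compilerState struct {
        //		buf   []byte // reusable allocation.
        //		input string // per-compilation state.
        //	}
        //
        //	func (s *compilerState) Init(input string) {
        //		s.buf = s.buf[:0] // keep capacity, drop contents.
        //		*s = compilerState{buf: s.buf, input: input}
        //	}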
    70  
    71  // resetLabels resets the existing content in arm64Compiler.labels so that
    72  // we can reuse the allocated slices and stacks in subsequent compilations.
    73  func (c *arm64Compiler) resetLabels() {
    74  	for i := range c.labels {
    75  		for j := range c.labels[i] {
    76  			if j > c.frameIDMax {
    77  				// We only need to reset up to the maximum frame id. This makes the compilation faster for large binaries.
    78  				break
    79  			}
    80  			l := &c.labels[i][j]
    81  			l.initialInstruction = nil
    82  			l.stackInitialized = false
    83  			l.initialStack.reset()
    84  		}
    85  	}
    86  }
    87  
    88  var (
    89  	arm64UnreservedVectorRegisters = []asm.Register{
    90  		arm64.RegV0, arm64.RegV1, arm64.RegV2, arm64.RegV3,
    91  		arm64.RegV4, arm64.RegV5, arm64.RegV6, arm64.RegV7, arm64.RegV8,
    92  		arm64.RegV9, arm64.RegV10, arm64.RegV11, arm64.RegV12, arm64.RegV13,
    93  		arm64.RegV14, arm64.RegV15, arm64.RegV16, arm64.RegV17, arm64.RegV18,
    94  		arm64.RegV19, arm64.RegV20, arm64.RegV21, arm64.RegV22, arm64.RegV23,
    95  		arm64.RegV24, arm64.RegV25, arm64.RegV26, arm64.RegV27, arm64.RegV28,
    96  		arm64.RegV29, arm64.RegV30, arm64.RegV31,
    97  	}
    98  
    99  	// Note (see arm64 section in https://go.dev/doc/asm):
   100  	// * RegR18 is reserved as the platform register, and we don't use it in the Compiler.
   101  	// * RegR28 is reserved by the Go runtime for the current goroutine (g), and we don't use it in the Compiler.
   102  	arm64UnreservedGeneralPurposeRegisters = []asm.Register{ //nolint
   103  		arm64.RegR3, arm64.RegR4, arm64.RegR5, arm64.RegR6, arm64.RegR7, arm64.RegR8,
   104  		arm64.RegR9, arm64.RegR10, arm64.RegR11, arm64.RegR12, arm64.RegR13,
   105  		arm64.RegR14, arm64.RegR15, arm64.RegR16, arm64.RegR17, arm64.RegR19,
   106  		arm64.RegR20, arm64.RegR21, arm64.RegR22, arm64.RegR23, arm64.RegR24,
   107  		arm64.RegR25, arm64.RegR26, arm64.RegR29, arm64.RegR30,
   108  	}
   109  )
   110  
   111  const (
   112  	// arm64ReservedRegisterForCallEngine holds the pointer to callEngine instance (i.e. *callEngine as uintptr)
   113  	arm64ReservedRegisterForCallEngine = arm64.RegR0
   114  	// arm64ReservedRegisterForStackBasePointerAddress holds stack base pointer's address (callEngine.stackBasePointer) in the current function call.
   115  	arm64ReservedRegisterForStackBasePointerAddress = arm64.RegR1
   116  	// arm64ReservedRegisterForMemory holds the pointer to the memory slice's data (i.e. &memory.Buffer[0] as uintptr).
   117  	arm64ReservedRegisterForMemory = arm64.RegR2
   118  	// arm64ReservedRegisterForTemporary is the temporary register which is available at any point of execution, but its content must not be assumed to live beyond a single operation.
   119  	// Note: we choose R27 as that is the temporary register used in Go's assembler.
   120  	arm64ReservedRegisterForTemporary = arm64.RegR27
   121  )
   122  
   123  var arm64CallingConventionModuleInstanceAddressRegister = arm64.RegR29
   124  
   125  const (
   126  	// arm64CallEngineArchContextCompilerCallReturnAddressOffset is the offset of archContext.compilerCallReturnAddress in callEngine.
   127  	arm64CallEngineArchContextCompilerCallReturnAddressOffset = 144
   128  	// arm64CallEngineArchContextMinimum32BitSignedIntOffset is the offset of archContext.minimum32BitSignedInt in callEngine.
   129  	arm64CallEngineArchContextMinimum32BitSignedIntOffset = 152
   130  	// arm64CallEngineArchContextMinimum64BitSignedIntOffset is the offset of archContext.minimum64BitSignedInt in callEngine.
   131  	arm64CallEngineArchContextMinimum64BitSignedIntOffset = 160
   132  )
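
        // These constants must agree with the memory layout of callEngine and its embedded
        // archContext. As an illustrative sketch (hypothetical stand-in types, not the
        // actual wazero definitions), such offsets can be verified with unsafe.Offsetof:
        //
        //	type archContext struct{ compilerCallReturnAddress uint64 }
        //
        //	type callEngine struct {
        //		_ [144]byte // fields preceding archContext (size is hypothetical).
        //		archContext
        //	}
        //
        //	func offsetOK() bool {
        //		var ce callEngine
        //		// The promoted field is reachable without indirection, so Offsetof
        //		// reports its offset from the start of callEngine.
        //		return unsafe.Offsetof(ce.compilerCallReturnAddress) == 144
        //	}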
   133  
   134  func isZeroRegister(r asm.Register) bool {
   135  	return r == arm64.RegRZR
   136  }
   137  
   138  // compileNOP implements compiler.compileNOP for the arm64 architecture.
   139  func (c *arm64Compiler) compileNOP() asm.Node {
   140  	return c.assembler.CompileStandAlone(arm64.NOP)
   141  }
   142  
   143  // compile implements compiler.compile for the arm64 architecture.
   144  func (c *arm64Compiler) compile(buf asm.Buffer) (stackPointerCeil uint64, err error) {
   145  	// c.stackPointerCeil tracks the stack pointer ceiling (max seen) value across all runtimeValueLocationStack(s)
   146  	// used for all labels (via setLocationStack), excluding the current one.
   147  	// Hence, we check here whether the final block's ceiling exceeds the current c.stackPointerCeil.
   148  	stackPointerCeil = c.stackPointerCeil
   149  	if stackPointerCeil < c.locationStack.stackPointerCeil {
   150  		stackPointerCeil = c.locationStack.stackPointerCeil
   151  	}
   152  
   153  	// Now that the ceil of stack pointer is determined, we are invoking the callback.
   154  	// Note: this must be called before Assemble() below.
   155  	c.assignStackPointerCeil(stackPointerCeil)
   156  
   157  	err = c.assembler.Assemble(buf)
   158  	return
   159  }
   160  
   161  // arm64LabelInfo holds the wazeroir-label-specific information within this function.
   162  type arm64LabelInfo struct {
   163  	// initialInstruction is the initial instruction for this label so that other blocks can branch into it.
   164  	initialInstruction asm.Node
   165  	// initialStack is the initial value location stack from which we start compiling this label.
   166  	initialStack     runtimeValueLocationStack
   167  	stackInitialized bool
   168  }
   169  
   170  // assignStackPointerCeil implements compilerImpl.assignStackPointerCeil for the arm64 architecture.
   171  func (c *arm64Compiler) assignStackPointerCeil(ceil uint64) {
   172  	if c.assignStackPointerCeilNeeded != nil {
   173  		c.assignStackPointerCeilNeeded.AssignSourceConstant(int64(ceil) << 3)
   174  	}
   175  }
   176  
   177  func (c *arm64Compiler) label(label wazeroir.Label) *arm64LabelInfo {
   178  	kind := label.Kind()
   179  	frames := c.labels[kind]
   180  	frameID := label.FrameID()
   181  	if c.frameIDMax < frameID {
   182  		c.frameIDMax = frameID
   183  	}
   184  	// If the frameID is not allocated yet, expand the slice so that it can hold frameID,
   185  	// which reduces allocations in subsequent compilations.
   186  	if diff := frameID - len(frames) + 1; diff > 0 {
   187  		for i := 0; i < diff; i++ {
   188  			frames = append(frames, arm64LabelInfo{initialStack: newRuntimeValueLocationStack()})
   189  		}
   190  		c.labels[kind] = frames
   191  	}
   192  	return &frames[frameID]
   193  }
   194  
   195  // runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the arm64 architecture.
   196  func (c *arm64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
   197  	return c.locationStack
   198  }
   199  
   200  // pushRuntimeValueLocationOnRegister implements compiler.pushRuntimeValueLocationOnRegister for arm64.
   201  func (c *arm64Compiler) pushRuntimeValueLocationOnRegister(reg asm.Register, vt runtimeValueType) (ret *runtimeValueLocation) {
   202  	ret = c.locationStack.pushRuntimeValueLocationOnRegister(reg, vt)
   203  	c.markRegisterUsed(reg)
   204  	return
   205  }
   206  
   207  // pushVectorRuntimeValueLocationOnRegister implements compiler.pushVectorRuntimeValueLocationOnRegister for arm64.
   208  func (c *arm64Compiler) pushVectorRuntimeValueLocationOnRegister(reg asm.Register) (lowerBitsLocation *runtimeValueLocation) {
   209  	lowerBitsLocation = c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Lo)
   210  	c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Hi)
   211  	c.markRegisterUsed(reg)
   212  	return
   213  }
   214  
   215  func (c *arm64Compiler) markRegisterUsed(regs ...asm.Register) {
   216  	for _, reg := range regs {
   217  		if !isZeroRegister(reg) && reg != asm.NilRegister {
   218  			c.locationStack.markRegisterUsed(reg)
   219  		}
   220  	}
   221  }
   222  
   223  func (c *arm64Compiler) markRegisterUnused(regs ...asm.Register) {
   224  	for _, reg := range regs {
   225  		if !isZeroRegister(reg) && reg != asm.NilRegister {
   226  			c.locationStack.markRegisterUnused(reg)
   227  		}
   228  	}
   229  }
   230  
   231  func (c *arm64Compiler) String() (ret string) { return c.locationStack.String() }
   232  
   233  // compilePreamble implements compiler.compilePreamble for the arm64 architecture.
   234  func (c *arm64Compiler) compilePreamble() error {
   235  	c.markRegisterUsed(arm64CallingConventionModuleInstanceAddressRegister)
   236  	defer c.markRegisterUnused(arm64CallingConventionModuleInstanceAddressRegister)
   237  
   238  	c.locationStack.init(c.typ)
   239  
   240  	// Check if it's necessary to grow the value stack before entering function body.
   241  	if err := c.compileMaybeGrowStack(); err != nil {
   242  		return err
   243  	}
   244  
   245  	if err := c.compileModuleContextInitialization(); err != nil {
   246  		return err
   247  	}
   248  
   249  	if c.withListener {
   250  		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexFunctionListenerBefore); err != nil {
   251  			return err
   252  		}
   253  	}
   254  
   255  	// We must initialize the stack base pointer register so that we can manipulate the stack properly.
   256  	c.compileReservedStackBasePointerRegisterInitialization()
   257  
   258  	c.compileReservedMemoryRegisterInitialization()
   259  
   260  	return nil
   261  }
   262  
   263  // compileMaybeGrowStack adds instructions to check whether the value stack needs to grow,
   264  // and if so, to call the builtin function that grows it. These instructions are emitted in the function's
   265  // preamble.
   266  func (c *arm64Compiler) compileMaybeGrowStack() error {
   267  	tmpX, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
   268  	if !found {
   269  		panic("BUG: all the registers should be free at this point")
   270  	}
   271  	c.markRegisterUsed(tmpX)
   272  	tmpY, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
   273  	if !found {
   274  		panic("BUG: all the registers should be free at this point")
   275  	}
   276  	c.markRegisterUsed(tmpY)
   277  
   278  	// "tmpX = len(ce.stack)"
   279  	c.assembler.CompileMemoryToRegister(
   280  		arm64.LDRD,
   281  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackLenInBytesOffset,
   282  		tmpX,
   283  	)
   284  
   285  	// "tmpY = ce.stackBasePointer"
   286  	c.assembler.CompileMemoryToRegister(
   287  		arm64.LDRD,
   288  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
   289  		tmpY,
   290  	)
   291  
   292  	// "tmpX = tmpX - tmpY", in other words "tmpX = len(ce.stack) - ce.stackBasePointer"
   293  	c.assembler.CompileRegisterToRegister(
   294  		arm64.SUB,
   295  		tmpY,
   296  		tmpX,
   297  	)
   298  
   299  	// "tmpY = stackPointerCeil"
   300  	loadStackPointerCeil := c.assembler.CompileConstToRegister(
   301  		arm64.MOVD,
   302  		math.MaxInt32,
   303  		tmpY,
   304  	)
   305  	// At this point of compilation, we don't know the value of the stack pointer ceil,
   306  	// so we lazily resolve the value later.
   307  	c.assignStackPointerCeilNeeded = loadStackPointerCeil
   308  
   309  	// Compare tmpX (len(ce.stack) - ce.stackBasePointer) and tmpY (ce.stackPointerCeil)
   310  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmpX, tmpY)
   311  
   312  	// If ceil > stackLen - stack base pointer, we need to grow the stack by calling the builtin Go function.
   313  	brIfStackOK := c.assembler.CompileJump(arm64.BCONDLS)
   314  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexGrowStack); err != nil {
   315  		return err
   316  	}
   317  
   318  	// Otherwise, skip calling it.
   319  	c.assembler.SetJumpTargetOnNext(brIfStackOK)
   320  
   321  	c.markRegisterUnused(tmpX, tmpY)
   322  	return nil
   323  }
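
        // For illustration: the sequence emitted above implements the following Go-like
        // pseudo-code (a sketch; the field names mirror the offset constants used above
        // rather than the actual engine fields):
        //
        //	available := ce.stackContext.stackLenInBytes - ce.stackContext.stackBasePointerInBytes
        //	if available < stackPointerCeil<<3 { // the MOVD const is patched to ceil<<3 later.
        //		builtinFunctionGrowStack(ce) // via compileCallGoFunction above.
        //	}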
   324  
   325  // compileReturnFunction emits instructions to return from the current function frame.
   326  // If the current frame is the bottom, the code goes back to the Go code with nativeCallStatusCodeReturned status.
   327  // Otherwise, we branch into the caller's return address.
   328  func (c *arm64Compiler) compileReturnFunction() error {
   329  	// Release all the registers to the stack, as our calling convention is caller-save.
   330  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
   331  		return err
   332  	}
   333  
   334  	if c.withListener {
   335  		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexFunctionListenerAfter); err != nil {
   336  			return err
   337  		}
   338  		// After return, we re-initialize the stack base pointer as that is used to return to the caller below.
   339  		c.compileReservedStackBasePointerRegisterInitialization()
   340  	}
   341  
   342  	// arm64CallingConventionModuleInstanceAddressRegister holds the module instance's address,
   343  	// so we mark it used so that it won't be picked as a free register.
   344  	c.locationStack.markRegisterUsed(arm64CallingConventionModuleInstanceAddressRegister)
   345  	defer c.locationStack.markRegisterUnused(arm64CallingConventionModuleInstanceAddressRegister)
   346  
   347  	returnAddress, callerStackBasePointerInBytes, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
   348  
   349  	// If the return address is zero, we return from the execution (i.e. the current frame is the bottom).
   350  	returnAddress.setRegister(arm64ReservedRegisterForTemporary)
   351  	c.compileLoadValueOnStackToRegister(returnAddress)
   352  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, arm64.RegRZR)
   353  
   354  	// Branch if the address does not equal zero; otherwise, exit from the native code.
   355  	// If the address doesn't equal zero, we later br into returnAddress.register (the caller's return address).
   356  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusCodeReturned)
   357  
   358  	// Alias for readability.
   359  	tmp := arm64CallingConventionModuleInstanceAddressRegister
   360  
   361  	// First, restore the stackContext.stackBasePointerInBytesOffset from callerStackBasePointerInBytes.
   362  	callerStackBasePointerInBytes.setRegister(tmp)
   363  	c.compileLoadValueOnStackToRegister(callerStackBasePointerInBytes)
   364  	c.assembler.CompileRegisterToMemory(arm64.STRD, tmp,
   365  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
   366  
   367  	// Next, restore moduleContext.fn from callerFunction.
   368  	callerFunction.setRegister(tmp)
   369  	c.compileLoadValueOnStackToRegister(callerFunction)
   370  	c.assembler.CompileRegisterToMemory(arm64.STRD, tmp,
   371  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
   372  
   373  	// Also, we have to put the target function's *wasm.ModuleInstance into arm64CallingConventionModuleInstanceAddressRegister.
   374  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
   375  		tmp, functionModuleInstanceOffset,
   376  		arm64CallingConventionModuleInstanceAddressRegister)
   377  
   378  	c.assembler.CompileJumpToRegister(arm64.B, returnAddress.register)
   379  	return nil
   380  }
   381  
   382  func (c *arm64Compiler) compileMaybeExitFromNativeCode(skipCondition asm.Instruction, status nativeCallStatusCode) {
   383  	if target := c.compiledTrapTargets[status]; target != nil {
   384  		// We've already compiled this.
   385  		// Invert the condition to jump into the appropriate target.
   386  		var trapCondition asm.Instruction
   387  		switch skipCondition {
   388  		case arm64.BCONDEQ:
   389  			trapCondition = arm64.BCONDNE
   390  		case arm64.BCONDNE:
   391  			trapCondition = arm64.BCONDEQ
   392  		case arm64.BCONDLO:
   393  			trapCondition = arm64.BCONDHS
   394  		case arm64.BCONDHS:
   395  			trapCondition = arm64.BCONDLO
   396  		case arm64.BCONDLS:
   397  			trapCondition = arm64.BCONDHI
   398  		case arm64.BCONDHI:
   399  			trapCondition = arm64.BCONDLS
   400  		case arm64.BCONDVS:
   401  			trapCondition = arm64.BCONDVC
   402  		case arm64.BCONDVC:
   403  			trapCondition = arm64.BCONDVS
   404  		default:
   405  			panic("BUG: couldn't invert condition")
   406  		}
   407  		c.assembler.CompileJump(trapCondition).AssignJumpTarget(target)
   408  	} else {
   409  		skip := c.assembler.CompileJump(skipCondition)
   410  		c.compileExitFromNativeCode(status)
   411  		c.assembler.SetJumpTargetOnNext(skip)
   412  	}
   413  }
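
        // For illustration: the inversion above pairs each condition with its logical
        // complement (EQ<->NE, LO<->HS, LS<->HI, VS<->VC). The same mapping, written as a
        // table-driven sketch (a hypothetical helper, not part of the assembler API):
        //
        //	var invertedCond = map[asm.Instruction]asm.Instruction{
        //		arm64.BCONDEQ: arm64.BCONDNE, arm64.BCONDNE: arm64.BCONDEQ,
        //		arm64.BCONDLO: arm64.BCONDHS, arm64.BCONDHS: arm64.BCONDLO,
        //		arm64.BCONDLS: arm64.BCONDHI, arm64.BCONDHI: arm64.BCONDLS,
        //		arm64.BCONDVS: arm64.BCONDVC, arm64.BCONDVC: arm64.BCONDVS,
        //	}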
   414  
   415  // compileExitFromNativeCode adds instructions to give control back to ce.exec with the given status code.
   416  func (c *arm64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) {
   417  	if target := c.compiledTrapTargets[status]; target != nil {
   418  		c.assembler.CompileJump(arm64.B).AssignJumpTarget(target)
        		// The exit sequence for this status is already emitted, so we reuse it.
        		return
   419  	}
   420  
   421  	switch status {
   422  	case nativeCallStatusCodeReturned:
   423  		// Save the target for reuse.
   424  		c.compiledTrapTargets[status] = c.compileNOP()
   425  	case nativeCallStatusCodeCallGoHostFunction, nativeCallStatusCodeCallBuiltInFunction:
   426  		// Read the return address, and write it to callEngine.exitContext.returnAddress.
   427  		c.assembler.CompileReadInstructionAddress(arm64ReservedRegisterForTemporary, arm64.RET)
   428  		c.assembler.CompileRegisterToMemory(
   429  			arm64.STRD, arm64ReservedRegisterForTemporary,
   430  			arm64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset,
   431  		)
   432  	default:
   433  		if c.ir.IROperationSourceOffsetsInWasmBinary != nil {
   434  			// In this case, the execution traps, and we want the top frame's source position in the stack trace.
   435  			// We store the instruction address onto callEngine.returnAddress.
   436  			c.assembler.CompileReadInstructionAddress(arm64ReservedRegisterForTemporary, arm64.STRD)
   437  			c.assembler.CompileRegisterToMemory(
   438  				arm64.STRD, arm64ReservedRegisterForTemporary,
   439  				arm64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset,
   440  			)
   441  		} else {
   442  			// We won't use the source position, so just save the target for reuse.
   443  			c.compiledTrapTargets[status] = c.compileNOP()
   444  		}
   445  	}
   446  
   447  	// Write the current stack pointer to the ce.stackPointer.
   448  	c.assembler.CompileConstToRegister(arm64.MOVD, int64(c.locationStack.sp), arm64ReservedRegisterForTemporary)
   449  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64ReservedRegisterForTemporary, arm64ReservedRegisterForCallEngine,
   450  		callEngineStackContextStackPointerOffset)
   451  
   452  	// Write the status to callEngine.exitContext.statusCode.
   453  	if status != 0 {
   454  		c.assembler.CompileConstToRegister(arm64.MOVW, int64(status), arm64ReservedRegisterForTemporary)
   455  		c.assembler.CompileRegisterToMemory(arm64.STRW, arm64ReservedRegisterForTemporary,
   456  			arm64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
   457  	} else {
   458  		// If the status == 0, we use zero register to store zero.
   459  		c.assembler.CompileRegisterToMemory(arm64.STRW, arm64.RegRZR,
   460  			arm64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
   461  	}
   462  
   463  	// The return address to the Go code is stored in archContext.compilerCallReturnAddress which
   464  	// is embedded in ce. We load the value into the temporary register, and then
   465  	// invoke RET with that register.
   466  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
   467  		arm64ReservedRegisterForCallEngine, arm64CallEngineArchContextCompilerCallReturnAddressOffset,
   468  		arm64ReservedRegisterForTemporary)
   469  
   470  	c.assembler.CompileJumpToRegister(arm64.RET, arm64ReservedRegisterForTemporary)
   471  }
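
        // For illustration: viewed from the Go side, the exit sequence above behaves
        // roughly like the following pseudo-code (a sketch with simplified field names):
        //
        //	ce.stackContext.stackPointer = sp  // snapshot of the virtual stack pointer.
        //	ce.exitContext.statusCode = status // why native code is exiting.
        //	// RET to archContext.compilerCallReturnAddress, resuming the Go code
        //	// that entered the native code.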
   472  
   473  // compileGoDefinedHostFunction implements compiler.compileGoDefinedHostFunction for the arm64 architecture.
   474  func (c *arm64Compiler) compileGoDefinedHostFunction() error {
   475  	// First we must update the location stack to reflect the number of host function inputs.
   476  	c.locationStack.init(c.typ)
   477  
   478  	if c.withListener {
   479  		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction,
   480  			builtinFunctionIndexFunctionListenerBefore); err != nil {
   481  			return err
   482  		}
   483  	}
   484  
   485  	// The host function needs access to the caller's module instance, and the caller's information is stored in the stack
   486  	// (as described in the doc of callEngine.stack). Here, we get the caller's *wasm.ModuleInstance from the stack,
   487  	// and save it in callEngine.exitContext.callerModuleInstance so we can pass it to the host function
   488  	// without sacrificing performance.
   489  	c.compileReservedStackBasePointerRegisterInitialization()
   490  	// Alias for readability.
   491  	tmp := arm64CallingConventionModuleInstanceAddressRegister
   492  	// Get the location of the callerFunction (*function) in the stack, which depends on the signature.
   493  	_, _, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
   494  	// Load the value into the tmp register: tmp = &function{..}
   495  	callerFunction.setRegister(tmp)
   496  	c.compileLoadValueOnStackToRegister(callerFunction)
   497  	// tmp = *(tmp+functionModuleInstanceOffset) = &wasm.ModuleInstance{...}
   498  	c.assembler.CompileMemoryToRegister(arm64.LDRD, tmp, functionModuleInstanceOffset, tmp)
   499  	// Load it onto callEngine.exitContext.callerModuleInstance.
   500  	c.assembler.CompileRegisterToMemory(arm64.STRD,
   501  		tmp,
   502  		arm64ReservedRegisterForCallEngine, callEngineExitContextCallerModuleInstanceOffset)
   503  	// Reset the state of callerFunction value location so that we won't mess up subsequent code generation below.
   504  	c.locationStack.releaseRegister(callerFunction)
   505  
   506  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallGoHostFunction, 0); err != nil {
   507  		return err
   508  	}
   509  
   510  	// Initializes the reserved stack base pointer which is used to retrieve the call frame stack.
   511  	c.compileReservedStackBasePointerRegisterInitialization()
   512  
   513  	// Go function can change the module state in arbitrary ways, so we have to force
   514  	// the callEngine.moduleContext initialization on the function return. To do so,
   515  	// we zero-out callEngine.moduleInstance.
   516  	c.assembler.CompileRegisterToMemory(arm64.STRD,
   517  		arm64.RegRZR,
   518  		arm64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)
   519  
   520  	return c.compileReturnFunction()
   521  }
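
        // For illustration: the code emitted above corresponds to the following Go-like
        // pseudo-code (a sketch; the helper names are illustrative):
        //
        //	caller := readCallFrameFunction(stack) // the caller's *function.
        //	ce.exitContext.callerModuleInstance = caller.moduleInstance
        //	exit(nativeCallStatusCodeCallGoHostFunction) // Go runs the host function.
        //	// After re-entry into native code:
        //	ce.moduleContext.moduleInstance = nil // force re-initialization on return.
        //	return                                // via the regular return sequence.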
   522  
   523  // setLocationStack sets the given runtimeValueLocationStack to the .locationStack field,
   524  // while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
   525  // This is called when we branch into a different block.
   526  func (c *arm64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
   527  	if c.stackPointerCeil < c.locationStack.stackPointerCeil {
   528  		c.stackPointerCeil = c.locationStack.stackPointerCeil
   529  	}
   530  	c.locationStack = newStack
   531  }
   532  
   533  // compileBuiltinFunctionCheckExitCode implements compiler.compileBuiltinFunctionCheckExitCode for the arm64 architecture.
   534  func (c *arm64Compiler) compileBuiltinFunctionCheckExitCode() error {
   535  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexCheckExitCode); err != nil {
   536  		return err
   537  	}
   538  
   539  	// After return, we re-initialize reserved registers just like preamble of functions.
   540  	c.compileReservedStackBasePointerRegisterInitialization()
   541  	c.compileReservedMemoryRegisterInitialization()
   542  	return nil
   543  }
   544  
   545  // compileLabel implements compiler.compileLabel for the arm64 architecture.
   546  func (c *arm64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipThisLabel bool) {
   547  	labelKey := wazeroir.Label(o.U1)
   548  	labelInfo := c.label(labelKey)
   549  
   550  	// If initialStack is not set, that means this label has never been reached.
   551  	if !labelInfo.stackInitialized {
   552  		skipThisLabel = true
   553  		return
   554  	}
   555  
   556  	if labelBegin := labelInfo.initialInstruction; labelBegin == nil {
   557  		// We use NOP as the beginning of instructions in a label.
   558  		// This should eventually be optimized out by the assembler.
   559  		labelInfo.initialInstruction = c.assembler.CompileStandAlone(arm64.NOP)
   560  	} else {
   561  		c.assembler.Add(labelBegin)
   562  	}
   563  
   564  	// Set the initial stack.
   565  	c.setLocationStack(&labelInfo.initialStack)
   566  	return false
   567  }
   568  
   569  // compileUnreachable implements compiler.compileUnreachable for the arm64 architecture.
   570  func (c *arm64Compiler) compileUnreachable() error {
   571  	c.compileExitFromNativeCode(nativeCallStatusCodeUnreachable)
   572  	return nil
   573  }
   574  
   575  // compileSet implements compiler.compileSet for the arm64 architecture.
   576  func (c *arm64Compiler) compileSet(o *wazeroir.UnionOperation) error {
   577  	depth := int(o.U1)
   578  	isTargetVector := o.B3
   579  
   580  	setTargetIndex := int(c.locationStack.sp) - 1 - depth
   581  
   582  	if isTargetVector {
   583  		_ = c.locationStack.pop()
   584  	}
   585  	v := c.locationStack.pop()
   586  	if err := c.compileEnsureOnRegister(v); err != nil {
   587  		return err
   588  	}
   589  
   590  	targetLocation := &c.locationStack.stack[setTargetIndex]
   591  	if targetLocation.onRegister() {
   592  		// We no longer need the register previously used by the target location.
   593  		c.markRegisterUnused(targetLocation.register)
   594  	}
   595  
   596  	reg := v.register
   597  	targetLocation.setRegister(reg)
   598  	targetLocation.valueType = v.valueType
   599  	if isTargetVector {
   600  		hi := &c.locationStack.stack[setTargetIndex+1]
   601  		hi.setRegister(reg)
   602  	}
   603  	return nil
   604  }
   605  
   606  // compileGlobalGet implements compiler.compileGlobalGet for the arm64 architecture.
   607  func (c *arm64Compiler) compileGlobalGet(o *wazeroir.UnionOperation) error {
   608  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   609  		return err
   610  	}
   611  
   612  	index := uint32(o.U1)
   613  
   614  	wasmValueType := c.ir.Globals[index].ValType
   615  	isV128 := wasmValueType == wasm.ValueTypeV128
   616  	// Get the address of globals[index] into globalAddressReg.
   617  	globalAddressReg, err := c.compileReadGlobalAddress(index)
   618  	if err != nil {
   619  		return err
   620  	}
   621  
   622  	if isV128 {
   623  		resultReg, err := c.allocateRegister(registerTypeVector)
   624  		if err != nil {
   625  			return err
   626  		}
   627  		c.assembler.CompileConstToRegister(arm64.ADD, globalInstanceValueOffset, globalAddressReg)
   628  		c.assembler.CompileMemoryToVectorRegister(arm64.VMOV, globalAddressReg, 0,
   629  			resultReg, arm64.VectorArrangementQ)
   630  
   631  		c.pushVectorRuntimeValueLocationOnRegister(resultReg)
   632  	} else {
   633  		ldr := arm64.NOP
   634  		var result asm.Register
   635  		var vt runtimeValueType
   636  		switch wasmValueType {
   637  		case wasm.ValueTypeI32:
   638  			ldr = arm64.LDRW
   639  			vt = runtimeValueTypeI32
   640  			result = globalAddressReg
   641  		case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
   642  			ldr = arm64.LDRD
   643  			vt = runtimeValueTypeI64
   644  			result = globalAddressReg
   645  		case wasm.ValueTypeF32:
   646  			result, err = c.allocateRegister(registerTypeVector)
   647  			if err != nil {
   648  				return err
   649  			}
   650  			ldr = arm64.FLDRS
   651  			vt = runtimeValueTypeF32
   652  		case wasm.ValueTypeF64:
   653  			result, err = c.allocateRegister(registerTypeVector)
   654  			if err != nil {
   655  				return err
   656  			}
   657  			ldr = arm64.FLDRD
   658  			vt = runtimeValueTypeF64
   659  		}
   660  
   661  		// "result = [globalAddressReg + globalInstanceValueOffset] (== globals[index].Val)"
   662  		c.assembler.CompileMemoryToRegister(
   663  			ldr,
   664  			globalAddressReg, globalInstanceValueOffset,
   665  			result,
   666  		)
   667  
   668  		c.pushRuntimeValueLocationOnRegister(result, vt)
   669  	}
   670  	return nil
   671  }
   672  
   673  // compileGlobalSet implements compiler.compileGlobalSet for the arm64 architecture.
   674  func (c *arm64Compiler) compileGlobalSet(o *wazeroir.UnionOperation) error {
   675  	index := uint32(o.U1)
   676  
   677  	wasmValueType := c.ir.Globals[index].ValType
   678  	isV128 := wasmValueType == wasm.ValueTypeV128
   679  
   680  	var val *runtimeValueLocation
   681  	if isV128 {
   682  		val = c.locationStack.popV128()
   683  	} else {
   684  		val = c.locationStack.pop()
   685  	}
   686  	if err := c.compileEnsureOnRegister(val); err != nil {
   687  		return err
   688  	}
   689  
   690  	globalInstanceAddressRegister, err := c.compileReadGlobalAddress(index)
   691  	if err != nil {
   692  		return err
   693  	}
   694  
   695  	if isV128 {
   696  		c.assembler.CompileVectorRegisterToMemory(arm64.VMOV,
   697  			val.register, globalInstanceAddressRegister, globalInstanceValueOffset,
   698  			arm64.VectorArrangementQ)
   699  	} else {
   700  		var str asm.Instruction
   701  		switch c.ir.Globals[index].ValType {
   702  		case wasm.ValueTypeI32:
   703  			str = arm64.STRW
   704  		case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
   705  			str = arm64.STRD
   706  		case wasm.ValueTypeF32:
   707  			str = arm64.FSTRS
   708  		case wasm.ValueTypeF64:
   709  			str = arm64.FSTRD
   710  		}
   711  
   712  		// At this point "globalInstanceAddressRegister = globals[index]".
   713  		// Therefore, this means "globals[index].Val = val.register"
   714  		c.assembler.CompileRegisterToMemory(
   715  			str,
   716  			val.register,
   717  			globalInstanceAddressRegister, globalInstanceValueOffset,
   718  		)
   719  	}
   720  
   721  	c.markRegisterUnused(val.register)
   722  	return nil
   723  }
   724  
   725  // compileReadGlobalAddress adds instructions to store the absolute address of the global instance at globalIndex into a register
   726  func (c *arm64Compiler) compileReadGlobalAddress(globalIndex uint32) (destinationRegister asm.Register, err error) {
   727  	// TODO: rethink the type used to store globals (`[]*GlobalInstance`).
   728  	// If we used `[]GlobalInstance` instead, we could save one load instruction here.
   729  
   730  	destinationRegister, err = c.allocateRegister(registerTypeGeneralPurpose)
   731  	if err != nil {
   732  		return
   733  	}
   734  
   735  	// "destinationRegister = globalIndex * 8"
   736  	c.assembler.CompileConstToRegister(
   737  		// globalIndex is an index to []*GlobalInstance, therefore
   738  		// we have to multiply it by the size of *GlobalInstance == the pointer size == 8.
   739  		arm64.MOVD, int64(globalIndex)*8, destinationRegister,
   740  	)
   741  
   742  	// "arm64ReservedRegisterForTemporary = &globals[0]"
   743  	c.assembler.CompileMemoryToRegister(
   744  		arm64.LDRD,
   745  		arm64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset,
   746  		arm64ReservedRegisterForTemporary,
   747  	)
   748  
   749  	// "destinationRegister = [arm64ReservedRegisterForTemporary + destinationRegister] (== globals[globalIndex])".
   750  	c.assembler.CompileMemoryWithRegisterOffsetToRegister(
   751  		arm64.LDRD,
   752  		arm64ReservedRegisterForTemporary, destinationRegister,
   753  		destinationRegister,
   754  	)
   755  	return
   756  }
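
        // For illustration: because globals are stored as []*GlobalInstance, reading one
        // takes two dependent loads, as in this Go-like pseudo-code (a sketch):
        //
        //	base := ce.moduleContext.globalElement0Address // == &globals[0].
        //	global := *(**GlobalInstance)(unsafe.Add(base, uintptr(globalIndex)*8))
        //	// global now points at the GlobalInstance; its value lives at
        //	// global + globalInstanceValueOffset.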
   757  
   758  // compileBr implements compiler.compileBr for the arm64 architecture.
   759  func (c *arm64Compiler) compileBr(o *wazeroir.UnionOperation) error {
   760  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   761  		return err
   762  	}
   763  	return c.compileBranchInto(wazeroir.Label(o.U1))
   764  }
   765  
   766  // compileBrIf implements compiler.compileBrIf for the arm64 architecture.
   767  func (c *arm64Compiler) compileBrIf(o *wazeroir.UnionOperation) error {
   768  	cond := c.locationStack.pop()
   769  
   770  	var conditionalBR asm.Node
   771  	if cond.onConditionalRegister() {
   772  		// If the cond is on a conditional register, it corresponds to one of "conditional codes"
   773  		// https://developer.arm.com/documentation/dui0801/a/Condition-Codes/Condition-code-suffixes
   774  		// Here we represent the conditional codes by using arm64.COND_** registers, and that means the
   775  		// conditional jump can be performed if we use arm64.B**.
   776  		// For example, if we have arm64.CondEQ on cond, that means we performed compileEq right before
   777  		// this compileBrIf and BrIf can be achieved by arm64.BCONDEQ.
   778  		var brInst asm.Instruction
   779  		switch cond.conditionalRegister {
   780  		case arm64.CondEQ:
   781  			brInst = arm64.BCONDEQ
   782  		case arm64.CondNE:
   783  			brInst = arm64.BCONDNE
   784  		case arm64.CondHS:
   785  			brInst = arm64.BCONDHS
   786  		case arm64.CondLO:
   787  			brInst = arm64.BCONDLO
   788  		case arm64.CondMI:
   789  			brInst = arm64.BCONDMI
   790  		case arm64.CondHI:
   791  			brInst = arm64.BCONDHI
   792  		case arm64.CondLS:
   793  			brInst = arm64.BCONDLS
   794  		case arm64.CondGE:
   795  			brInst = arm64.BCONDGE
   796  		case arm64.CondLT:
   797  			brInst = arm64.BCONDLT
   798  		case arm64.CondGT:
   799  			brInst = arm64.BCONDGT
   800  		case arm64.CondLE:
   801  			brInst = arm64.BCONDLE
   802  		default:
   803  			// BUG: This means that we use the cond.conditionalRegister somewhere in this file,
   804  			// but not covered in switch ^. That shouldn't happen.
   805  			return fmt.Errorf("unsupported condition for br_if: %v", cond.conditionalRegister)
   806  		}
   807  		conditionalBR = c.assembler.CompileJump(brInst)
   808  	} else {
   809  		// If the value is not on the conditional register, we compare the value with the zero register,
   810  		// and then do the conditional BR if the value doesn't equal zero.
   811  		if err := c.compileEnsureOnRegister(cond); err != nil {
   812  			return err
   813  		}
   814  		// Compare the value with zero register. Note that the value is ensured to be i32 by function validation phase,
   815  		// so we use CMPW (32-bit compare) here.
   816  		c.assembler.CompileTwoRegistersToNone(arm64.CMPW, cond.register, arm64.RegRZR)
   817  
   818  		conditionalBR = c.assembler.CompileJump(arm64.BCONDNE)
   819  
   820  		c.markRegisterUnused(cond.register)
   821  	}
   822  
   823  	// Emit the code for branching into else branch.
   824  	elseTarget := wazeroir.Label(o.U2)
   825  	if err := c.compileBranchInto(elseTarget); err != nil {
   826  		return err
   827  	}
   828  	// We branch into here from the original conditional BR (conditionalBR).
   829  	c.assembler.SetJumpTargetOnNext(conditionalBR)
   830  	thenTarget := wazeroir.Label(o.U1)
   831  	if err := compileDropRange(c, o.U3); err != nil {
   832  		return err
   833  	}
   834  	return c.compileBranchInto(thenTarget)
   835  }
   836  
   837  func (c *arm64Compiler) compileBranchInto(target wazeroir.Label) error {
   838  	if target.IsReturnTarget() {
   839  		return c.compileReturnFunction()
   840  	} else {
   841  		if c.ir.LabelCallers[target] > 1 {
   842  			// We can only reuse the register state when there's a single call-site.
   843  			// If there are multiple call-sites, release the values on registers to the stack
   844  			// so that we have a consistent value location state at the beginning of the label.
   845  			if err := c.compileReleaseAllRegistersToStack(); err != nil {
   846  				return err
   847  			}
   848  		}
   849  		// Set the initial stack of the target label, so we can start compiling the label
   850  		// with the appropriate value locations. Note we clone the stack here as we may
   851  		// manipulate the stack before the compiler reaches the label.
   852  		targetLabel := c.label(target)
   853  		if !targetLabel.stackInitialized {
   854  			targetLabel.initialStack.cloneFrom(*c.locationStack)
   855  			targetLabel.stackInitialized = true
   856  		}
   857  
   858  		br := c.assembler.CompileJump(arm64.B)
   859  		c.assignBranchTarget(target, br)
   860  		return nil
   861  	}
   862  }
   863  
   864  // assignBranchTarget assigns the given label's initial instruction to the destination of br.
   865  func (c *arm64Compiler) assignBranchTarget(label wazeroir.Label, br asm.Node) {
   866  	target := c.label(label)
   867  
   868  	targetInst := target.initialInstruction
   869  	if targetInst == nil {
   870  		// If the label isn't compiled yet, allocate the NOP node, and set as the initial instruction.
   871  		targetInst = c.assembler.AllocateNOP()
   872  		target.initialInstruction = targetInst
   873  	}
   874  
   875  	br.AssignJumpTarget(targetInst)
   876  }
   877  
   878  // compileBrTable implements compiler.compileBrTable for the arm64 architecture.
   879  func (c *arm64Compiler) compileBrTable(o *wazeroir.UnionOperation) error {
   880  	// If the operation only consists of the default target, we branch into it and return early.
   881  	if len(o.Us) == 2 {
   882  		loc := c.locationStack.pop()
   883  		if loc.onRegister() {
   884  			c.markRegisterUnused(loc.register)
   885  		}
   886  		if err := compileDropRange(c, o.Us[1]); err != nil {
   887  			return err
   888  		}
   889  		return c.compileBranchInto(wazeroir.Label(o.Us[0]))
   890  	}
   891  
   892  	index := c.locationStack.pop()
   893  	if err := c.compileEnsureOnRegister(index); err != nil {
   894  		return err
   895  	}
   896  
   897  	if isZeroRegister(index.register) {
   898  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
   899  		if err != nil {
   900  			return err
   901  		}
   902  		index.setRegister(reg)
   903  		c.markRegisterUsed(reg)
   904  
   905  		// Zero the value on a picked register.
   906  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, reg)
   907  	}
   908  
   909  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
   910  	if err != nil {
   911  		return err
   912  	}
   913  
   914  	// Load the branch table's length.
   915  	// "tmpReg = number of non-default targets (== len(o.Us)/2 - 1)"
   916  	c.assembler.CompileConstToRegister(arm64.MOVW, int64(len(o.Us)/2-1), tmpReg)
   917  	// Compare the number of targets with the index.
   918  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmpReg, index.register)
   919  	// If the index is out of range, the BCONDLO below is not taken and the MOVW clamps it to the default target's entry (the last one).
   920  	brDefaultIndex := c.assembler.CompileJump(arm64.BCONDLO)
   921  	c.assembler.CompileRegisterToRegister(arm64.MOVW, tmpReg, index.register)
   922  	c.assembler.SetJumpTargetOnNext(brDefaultIndex)
   923  
   924  	// We prepare the asm.StaticConst which holds the offset of
   925  	// each target's first instruction (incl. default)
   926  	// relative to the beginning of label tables.
   927  	//
   928  	// For example, if we have targets=[L0, L1] and default=L_DEFAULT,
   929  	// we emit the code like this at [Emit the code for each target and default branch] below.
   930  	//
   931  	// L0:
   932  	//  0x123001: XXXX, ...
   933  	//  .....
   934  	// L1:
   935  	//  0x123005: YYY, ...
   936  	//  .....
   937  	// L_DEFAULT:
   938  	//  0x123009: ZZZ, ...
   939  	//
   940  	// then offsetData becomes like [0x0, 0x5, 0x8].
   941  	// By using this offset list, we could jump into the label for the index by
   942  	// "jmp offsetData[index]+0x123001" and "0x123001" can be acquired by ADR instruction.
   943  	//
   944  	// Note: We store each offset of 32-bit unsigned integer as 4 consecutive bytes. So more precisely,
   945  	// the above example's offsetData would be [0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0].
   946  	//
   947  	// Note: this is similar to how GCC implements Switch statements in C.
   948  	offsetData := asm.NewStaticConst(make([]byte, 4*(len(o.Us)/2)))
   949  
   950  	// "tmpReg = &offsetData[0]"
   951  	c.assembler.CompileStaticConstToRegister(arm64.ADR, offsetData, tmpReg)
   952  
   953  	// "index.register = tmpReg + (index.register << 2) (== &offsetData[offset])"
   954  	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD, index.register, 2, tmpReg, index.register)
   955  
   956  	// "index.register = *index.register (== offsetData[offset])"
   957  	c.assembler.CompileMemoryToRegister(arm64.LDRW, index.register, 0, index.register)
   958  
   959  	// Now we read the address of the beginning of the jump table.
   960  	// In the above example, this corresponds to reading the address of 0x123001.
   961  	c.assembler.CompileReadInstructionAddress(tmpReg, arm64.B)
   962  
   963  	// Now we have the address of L0 in tmp register, and the offset to the target label in the index.register.
   964  	// So we could achieve the br_table jump by adding them and jump into the resulting address.
   965  	c.assembler.CompileRegisterToRegister(arm64.ADD, tmpReg, index.register)
   966  
   967  	c.assembler.CompileJumpToRegister(arm64.B, index.register)
   968  
   969  	// We no longer need the index's register, so mark it unused.
   970  	c.markRegisterUnused(index.register)
   971  
   972  	// [Emit the code for each target and default branch]
   973  	labelInitialInstructions := make([]asm.Node, len(o.Us)/2)
   974  
   975  	// Since we might end up having different stack states in each branch,
   976  	// we need to save the initial stack state here, and use the same initial state
   977  	// for each iteration.
   978  	initialLocationStack := c.getSavedTemporaryLocationStack()
   979  
   980  	for i := range labelInitialInstructions {
   981  		// Emit the initial instruction of each target where
   982  		// we use NOP as we don't yet know the next instruction in each label.
   983  		init := c.assembler.CompileStandAlone(arm64.NOP)
   984  		labelInitialInstructions[i] = init
   985  
   986  		targetLabel := wazeroir.Label(o.Us[i*2])
   987  		targetToDrop := o.Us[i*2+1]
   988  		if err = compileDropRange(c, targetToDrop); err != nil {
   989  			return err
   990  		}
   991  		if err = c.compileBranchInto(targetLabel); err != nil {
   992  			return err
   993  		}
   994  		// After the iteration, reset the stack's state with initialLocationStack.
   995  		c.locationStack.cloneFrom(initialLocationStack)
   996  	}
   997  
   998  	c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
   999  	return nil
  1000  }
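
        // For illustration: the dispatch emitted above is equivalent to the following
        // Go-like pseudo-code over the 32-bit offset table (a sketch):
        //
        //	numTargets := len(o.Us)/2 - 1 // the last entry is the default target.
        //	if index >= numTargets {
        //		index = numTargets // clamp to the default target.
        //	}
        //	offset := binary.LittleEndian.Uint32(offsetData[index*4:])
        //	// goto tableBegin + offset, where tableBegin is acquired via ADR above.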
  1001  
  1002  func (c *arm64Compiler) getSavedTemporaryLocationStack() runtimeValueLocationStack {
  1003  	initialLocationStack := *c.locationStack // Take copy!
  1004  	// Use c.brTableTmp for the underlying stack so that we can reduce allocations.
  1005  	if diff := int(initialLocationStack.sp) - len(c.brTableTmp); diff > 0 {
  1006  		c.brTableTmp = append(c.brTableTmp, make([]runtimeValueLocation, diff)...)
  1007  	}
  1008  	copy(c.brTableTmp, initialLocationStack.stack[:initialLocationStack.sp])
  1009  	initialLocationStack.stack = c.brTableTmp
  1010  	return initialLocationStack
  1011  }
  1012  
  1013  // compileCall implements compiler.compileCall for the arm64 architecture.
  1014  func (c *arm64Compiler) compileCall(o *wazeroir.UnionOperation) error {
  1015  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1016  		return err
  1017  	}
  1018  
  1019  	functionIndex := o.U1
  1020  
  1021  	tp := &c.ir.Types[c.ir.Functions[functionIndex]]
  1022  
  1023  	targetFunctionAddressReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1024  	if err != nil {
  1025  		return err
  1026  	}
  1027  	c.markRegisterUsed(targetFunctionAddressReg)
  1028  	defer c.markRegisterUnused(targetFunctionAddressReg)
  1029  
  1030  	// Set callEngine.moduleContext.fn to specify which function is executed on the current call frame.
  1031  	//
  1032  	// First, we read the address of the first item of the ce.functions slice (= &ce.functions[0])
  1033  	// into targetFunctionAddressReg.
  1034  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1035  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  1036  		targetFunctionAddressReg)
  1037  
  1038  	c.assembler.CompileConstToRegister(
  1039  		arm64.ADD,
  1040  		int64(functionIndex)*functionSize, // Multiplied by functionSize since ce.functions is a slice of function structs.
  1041  		targetFunctionAddressReg)
  1042  
  1043  	return c.compileCallImpl(targetFunctionAddressReg, tp)
  1044  }
  1045  
  1046  // compileCallImpl implements compiler.compileCall and compiler.compileCallIndirect for the arm64 architecture.
  1047  func (c *arm64Compiler) compileCallImpl(targetFunctionAddressRegister asm.Register, functype *wasm.FunctionType) error {
  1048  	// Release all the registers to the stack, as our calling convention is caller-save.
  1049  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  1050  		return err
  1051  	}
  1052  
  1053  	tmp, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  1054  	if !ok {
  1055  		panic("BUG: cannot take a free register")
  1056  	}
  1057  
  1058  	// The stack should look like:
  1059  	//
  1060  	//               reserved slots for results (if len(results) > len(args))
  1061  	//                      |     |
  1062  	//    ,arg0, ..., argN, ..., _, .returnAddress, .returnStackBasePointerInBytes, .function, ....
  1063  	//      |                       |                                                        |
  1064  	//      |             callFrame{^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^}
  1065  	//      |
  1066  	// nextStackBasePointerOffset
  1067  	//
  1068  	// where callFrame is used to return to this currently executed function.
  1069  
  1070  	nextStackBasePointerOffset := int64(c.locationStack.sp) - int64(functype.ParamNumInUint64)
  1071  
  1072  	callFrameReturnAddressLoc, callFrameStackBasePointerInBytesLoc, callFrameFunctionLoc := c.locationStack.pushCallFrame(functype)
  1073  
  1074  	// Save the current stack base pointer at callFrameStackBasePointerInBytesLoc.
  1075  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1076  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  1077  		tmp)
  1078  	callFrameStackBasePointerInBytesLoc.setRegister(tmp)
  1079  	c.compileReleaseRegisterToStack(callFrameStackBasePointerInBytesLoc)
  1080  
  1081  	// Set callEngine.stackContext.stackBasePointer for the next function.
  1082  	c.assembler.CompileConstToRegister(arm64.ADD, nextStackBasePointerOffset<<3, tmp)
  1083  	c.assembler.CompileRegisterToMemory(arm64.STRD,
  1084  		tmp,
  1085  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
  1086  
  1087  	// Save the currently executed *function (placed at callEngine.moduleContext.fn) into callFrameFunctionLoc.
  1088  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1089  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset,
  1090  		tmp)
  1091  	callFrameFunctionLoc.setRegister(tmp)
  1092  	c.compileReleaseRegisterToStack(callFrameFunctionLoc)
  1093  
  1094  	// Set callEngine.moduleContext.fn to the next *function.
  1095  	c.assembler.CompileRegisterToMemory(arm64.STRD,
  1096  		targetFunctionAddressRegister,
  1097  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
  1098  
  1099  	// Write the return address into callFrameReturnAddressLoc.
  1100  	c.assembler.CompileReadInstructionAddress(tmp, arm64.B)
  1101  	callFrameReturnAddressLoc.setRegister(tmp)
  1102  	c.compileReleaseRegisterToStack(callFrameReturnAddressLoc)
  1103  
  1104  	if targetFunctionAddressRegister == arm64CallingConventionModuleInstanceAddressRegister {
  1105  		// In this case, we must move the value in targetFunctionAddressRegister to another register, otherwise
  1106  		// the address (jump target below) will be modified and result in segfault.
  1107  		// See #526.
  1108  		c.assembler.CompileRegisterToRegister(arm64.MOVD, targetFunctionAddressRegister, tmp)
  1109  		targetFunctionAddressRegister = tmp
  1110  	}
  1111  
  1112  	// Also, we have to put the target function's *wasm.ModuleInstance address into arm64CallingConventionModuleInstanceAddressRegister.
  1113  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1114  		targetFunctionAddressRegister, functionModuleInstanceOffset,
  1115  		arm64CallingConventionModuleInstanceAddressRegister,
  1116  	)
  1117  
  1118  	// Then, br into the target function's initial address.
  1119  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1120  		targetFunctionAddressRegister, functionCodeInitialAddressOffset,
  1121  		targetFunctionAddressRegister)
  1122  
  1123  	c.assembler.CompileJumpToRegister(arm64.B, targetFunctionAddressRegister)
  1124  
  1125  	// We consumed the function parameters, the call frame stack and reserved slots during the call.
  1126  	c.locationStack.sp = uint64(nextStackBasePointerOffset)
  1127  
  1128  	// Also, the function results were pushed by the call.
  1129  	for _, t := range functype.Results {
  1130  		loc := c.locationStack.pushRuntimeValueLocationOnStack()
  1131  		switch t {
  1132  		case wasm.ValueTypeI32:
  1133  			loc.valueType = runtimeValueTypeI32
  1134  		case wasm.ValueTypeI64, wasm.ValueTypeFuncref, wasm.ValueTypeExternref:
  1135  			loc.valueType = runtimeValueTypeI64
  1136  		case wasm.ValueTypeF32:
  1137  			loc.valueType = runtimeValueTypeF32
  1138  		case wasm.ValueTypeF64:
  1139  			loc.valueType = runtimeValueTypeF64
  1140  		case wasm.ValueTypeV128:
  1141  			loc.valueType = runtimeValueTypeV128Lo
  1142  			hi := c.locationStack.pushRuntimeValueLocationOnStack()
  1143  			hi.valueType = runtimeValueTypeV128Hi
  1144  		}
  1145  	}
  1146  
  1147  	if err := c.compileModuleContextInitialization(); err != nil {
  1148  		return err
  1149  	}
  1150  
  1151  	// On the function return, we initialize the state for this function.
  1152  	c.compileReservedStackBasePointerRegisterInitialization()
  1153  
  1154  	c.compileReservedMemoryRegisterInitialization()
  1155  	return nil
  1156  }
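
        // For illustration: given the stack layout drawn above, the slot indices involved
        // in a call are computed roughly as follows (Go-like pseudo-code; a sketch):
        //
        //	nextBase := sp - params                  // the args become the callee's bottom slots.
        //	frame := nextBase + max(params, results) // results may need extra reserved slots.
        //	stack[frame+0] = returnAddress           // read via ReadInstructionAddress above.
        //	stack[frame+1] = callerStackBasePointerInBytes
        //	stack[frame+2] = callerFunction          // the currently executing *function.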
  1157  
  1158  // compileCallIndirect implements compiler.compileCallIndirect for the arm64 architecture.
  1159  func (c *arm64Compiler) compileCallIndirect(o *wazeroir.UnionOperation) (err error) {
  1160  	offset := c.locationStack.pop()
  1161  	if err = c.compileEnsureOnRegister(offset); err != nil {
  1162  		return err
  1163  	}
  1164  	typeIndex := o.U1
  1165  	tableIndex := o.U2
  1166  
  1167  	offsetReg := offset.register
  1168  	if isZeroRegister(offsetReg) {
  1169  		offsetReg, err = c.allocateRegister(registerTypeGeneralPurpose)
  1170  		if err != nil {
  1171  			return err
  1172  		}
  1173  		c.markRegisterUsed(offsetReg)
  1174  
  1175  		// Zero the value on a picked register.
  1176  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, offsetReg)
  1177  	}
  1178  
  1179  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  1180  	if err != nil {
  1181  		return err
  1182  	}
  1183  	c.markRegisterUsed(tmp)
  1184  
  1185  	tmp2, err := c.allocateRegister(registerTypeGeneralPurpose)
  1186  	if err != nil {
  1187  		return err
  1188  	}
  1189  	c.markRegisterUsed(tmp2)
  1190  
  1191  	// First, we need to check that the offset doesn't exceed the length of the table.
  1192  	// "tmp = &Tables[0]"
  1193  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1194  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  1195  		tmp,
  1196  	)
  1197  	// tmp = [tmp + TableIndex*8] = [&Tables[0] + TableIndex*sizeOf(*tableInstance)] = Tables[tableIndex]
  1198  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1199  		tmp, int64(tableIndex)*8,
  1200  		tmp,
  1201  	)
  1202  	// tmp2 = [tmp + tableInstanceTableLenOffset] = len(Tables[tableIndex])
  1203  	c.assembler.CompileMemoryToRegister(arm64.LDRD, tmp, tableInstanceTableLenOffset, tmp2)
  1204  
  1205  	// "cmp tmp2, offset"
  1206  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp2, offsetReg)
  1207  
  1208  	// If it exceeds len(table), we trap.
  1209  	c.compileMaybeExitFromNativeCode(arm64.BCONDLO, nativeCallStatusCodeInvalidTableAccess)
  1210  	// Otherwise, we proceed to do function type check.
  1211  
  1212  	// We need to obtain the absolute address of table element.
  1213  	// "tmp = &Tables[tableIndex].table[0]"
  1214  	c.assembler.CompileMemoryToRegister(
  1215  		arm64.LDRD,
  1216  		tmp, tableInstanceTableOffset,
  1217  		tmp,
  1218  	)
  1219  	// "offset = tmp + (offset << pointerSizeLog2) (== &table[offset])"
  1220  	// Here we shift left by 3 (pointerSizeLog2) to convert the index into a byte offset,
  1221  	// since the table element type is uintptr, which is 8 bytes.
  1222  	c.assembler.CompileLeftShiftedRegisterToRegister(
  1223  		arm64.ADD,
  1224  		offsetReg, pointerSizeLog2,
  1225  		tmp,
  1226  		offsetReg,
  1227  	)
  1228  
  1229  	// "offset = (*offset) (== table[offset])"
  1230  	c.assembler.CompileMemoryToRegister(arm64.LDRD, offsetReg, 0, offsetReg)
  1231  
  1232  	// Check if the value of table[offset] equals zero, meaning that the target element is uninitialized.
  1233  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, offsetReg)
  1234  
  1235  	// Skipped if the target is initialized.
  1236  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusCodeInvalidTableAccess)
  1237  
  1238  	// Next, we check that the type matches, i.e. table[offset].source.TypeID == targetFunctionType.
  1239  	// "tmp = table[offset].typeID"
  1240  	c.assembler.CompileMemoryToRegister(
  1241  		arm64.LDRD,
  1242  		offsetReg, functionTypeIDOffset,
  1243  		tmp,
  1244  	)
  1245  	// "tmp2 = ModuleInstance.TypeIDs[index]"
  1246  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  1247  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset,
  1248  		tmp2)
  1249  	c.assembler.CompileMemoryToRegister(arm64.LDRW, tmp2, int64(typeIndex)*4, tmp2)
  1250  
  1251  	// Compare these two values, and if they are equal, we are ready to make the function call.
  1252  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmp, tmp2)
  1253  	// Skipped if the type matches.
  1254  	c.compileMaybeExitFromNativeCode(arm64.BCONDEQ, nativeCallStatusCodeTypeMismatchOnIndirectCall)
  1255  
  1256  	targetFunctionType := &c.ir.Types[typeIndex]
  1257  	if err := c.compileCallImpl(offsetReg, targetFunctionType); err != nil {
  1258  		return err
  1259  	}
  1260  
  1261  	// The offset register should be marked as unused since we consumed it in the function call.
  1262  	c.markRegisterUnused(offsetReg, tmp, tmp2)
  1263  	return nil
  1264  }
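
        // A sketch of the bounds-check part of the sequence emitted above, in pseudo
        // assembly (register names are illustrative only, not the actual allocation):
        //
        //    LDR  tmp,  [ce, #TablesElement0Offset]   ;; tmp  = &Tables[0]
        //    LDR  tmp,  [tmp, #tableIndex*8]          ;; tmp  = Tables[tableIndex]
        //    LDR  tmp2, [tmp, #tableLenOffset]        ;; tmp2 = len(table)
        //    CMP  offset, tmp2
        //    B.HS trap                                ;; offset >= len(table): invalid table access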
  1265  
  1266  // compileDrop implements compiler.compileDrop for the arm64 architecture.
  1267  func (c *arm64Compiler) compileDrop(o *wazeroir.UnionOperation) error {
  1268  	return compileDropRange(c, o.U1)
  1269  }
  1270  
  1271  func (c *arm64Compiler) compileSelectV128Impl(selectorRegister asm.Register) error {
  1272  	x2 := c.locationStack.popV128()
  1273  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1274  		return err
  1275  	}
  1276  
  1277  	x1 := c.locationStack.popV128()
  1278  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1279  		return err
  1280  	}
  1281  
  1282  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, arm64.RegRZR, selectorRegister)
  1283  	brIfNotZero := c.assembler.CompileJump(arm64.BCONDNE)
  1284  
  1285  	// In this branch, x2 is selected, so we move its value into x1.register.
  1286  	// That way, the result ends up in x1.register regardless of the selection.
  1287  	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR,
  1288  		x2.register, x2.register, x1.register, arm64.VectorArrangement16B)
  1289  
  1290  	c.assembler.SetJumpTargetOnNext(brIfNotZero)
  1291  
  1292  	// As noted, the result exists in x1.register regardless of the selector.
  1293  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
  1294  	// Plus, x2.register is no longer used.
  1295  	c.markRegisterUnused(x2.register)
  1296  	return nil
  1297  }
  1298  
  1299  // compileSelect implements compiler.compileSelect for the arm64 architecture.
  1300  func (c *arm64Compiler) compileSelect(o *wazeroir.UnionOperation) error {
  1301  	cv, err := c.popValueOnRegister()
  1302  	if err != nil {
  1303  		return err
  1304  	}
  1305  
  1306  	isTargetVector := o.B3
  1307  	if isTargetVector {
  1308  		return c.compileSelectV128Impl(cv.register)
  1309  	}
  1310  
  1311  	c.markRegisterUsed(cv.register)
  1312  
  1313  	x1, x2, err := c.popTwoValuesOnRegisters()
  1314  	if err != nil {
  1315  		return err
  1316  	}
  1317  
  1318  	if isZeroRegister(x1.register) && isZeroRegister(x2.register) {
  1319  		// If both values are zero, the result is always zero.
  1320  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1321  		c.markRegisterUnused(cv.register)
  1322  		return nil
  1323  	}
  1324  
  1325  	// In the following, we emit the code so that x1's register contains the chosen value
  1326  	// no matter which of original x1 or x2 is selected.
  1327  	//
  1328  	// If x1 is currently on the zero register, we cannot place the result there because
  1329  	// writes to arm64.RegRZR are discarded: it always reads as zero regardless of the value.
  1330  	// So we explicitly assign a general purpose register to x1 here.
  1331  	if isZeroRegister(x1.register) {
  1332  		// Mark x2's register as used so that it won't be chosen (cv's register is already marked above).
  1333  		c.markRegisterUsed(x2.register)
  1334  		// Pick the non-zero register for x1.
  1335  		x1Reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1336  		if err != nil {
  1337  			return err
  1338  		}
  1339  		x1.setRegister(x1Reg)
  1340  		// And zero out the picked register.
  1341  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, x1Reg)
  1342  	}
  1343  
  1344  	// At this point, x1 is in a non-zero register, and x2 is in either a general purpose or the zero register.
  1345  
  1346  	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, arm64.RegRZR, cv.register)
  1347  	brIfNotZero := c.assembler.CompileJump(arm64.BCONDNE)
  1348  
  1349  	// If cv == 0, we move the value of x2 into x1.register.
  1350  
  1351  	switch x1.valueType {
  1352  	case runtimeValueTypeI32:
  1353  		// TODO: use 32-bit mov
  1354  		c.assembler.CompileRegisterToRegister(arm64.MOVD, x2.register, x1.register)
  1355  	case runtimeValueTypeI64:
  1356  		c.assembler.CompileRegisterToRegister(arm64.MOVD, x2.register, x1.register)
  1357  	case runtimeValueTypeF32:
  1358  		// TODO: use 32-bit mov
  1359  		c.assembler.CompileRegisterToRegister(arm64.FMOVD, x2.register, x1.register)
  1360  	case runtimeValueTypeF64:
  1361  		c.assembler.CompileRegisterToRegister(arm64.FMOVD, x2.register, x1.register)
  1362  	default:
  1363  		return errors.New("TODO: implement vector type select")
  1364  	}
  1365  
  1366  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1367  
  1368  	// Otherwise (cv != 0), there is nothing to do: x1.register already holds the result.
  1369  	c.assembler.SetJumpTargetOnNext(brIfNotZero)
  1370  
  1371  	// Only x1.register is reused.
  1372  	c.markRegisterUnused(cv.register, x2.register)
  1373  	return nil
  1374  }
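
        // A sketch of the code emitted above for a scalar select, in pseudo assembly
        // (register names are illustrative only):
        //
        //    CMPW w_cv, #0      ;; compare the selector cv with zero
        //    B.NE done          ;; cv != 0: the result is already in x1's register
        //    MOV  x_x1, x_x2    ;; cv == 0: move x2's value into x1's register
        //  done: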
  1375  
  1376  // compilePick implements compiler.compilePick for the arm64 architecture.
  1377  func (c *arm64Compiler) compilePick(o *wazeroir.UnionOperation) error {
  1378  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1379  		return err
  1380  	}
  1381  	depth := o.U1
  1382  	isTargetVector := o.B3
  1383  
  1384  	pickTarget := &c.locationStack.stack[c.locationStack.sp-1-uint64(depth)]
  1385  	pickedRegister, err := c.allocateRegister(pickTarget.getRegisterType())
  1386  	if err != nil {
  1387  		return err
  1388  	}
  1389  
  1390  	if pickTarget.onRegister() { // Copy the value to the pickedRegister.
  1391  		switch pickTarget.valueType {
  1392  		case runtimeValueTypeI32:
  1393  			c.assembler.CompileRegisterToRegister(arm64.MOVW, pickTarget.register, pickedRegister)
  1394  		case runtimeValueTypeI64:
  1395  			c.assembler.CompileRegisterToRegister(arm64.MOVD, pickTarget.register, pickedRegister)
  1396  		case runtimeValueTypeF32:
  1397  			c.assembler.CompileRegisterToRegister(arm64.FMOVS, pickTarget.register, pickedRegister)
  1398  		case runtimeValueTypeF64:
  1399  			c.assembler.CompileRegisterToRegister(arm64.FMOVD, pickTarget.register, pickedRegister)
  1400  		case runtimeValueTypeV128Lo:
  1401  			c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR,
  1402  				pickTarget.register, pickTarget.register, pickedRegister, arm64.VectorArrangement16B)
  1403  		case runtimeValueTypeV128Hi:
  1404  			panic("BUG") // since pick target must point to the lower 64-bits of vectors.
  1405  		}
  1406  	} else if pickTarget.onStack() {
  1407  		// Temporarily assign a register to the pick target, and then load the value.
  1408  		pickTarget.setRegister(pickedRegister)
  1409  		c.compileLoadValueOnStackToRegister(pickTarget)
  1410  
  1411  		// After the load, we revert the register assignment to the pick target.
  1412  		pickTarget.setRegister(asm.NilRegister)
  1413  		if isTargetVector {
  1414  			hi := &c.locationStack.stack[pickTarget.stackPointer+1]
  1415  			hi.setRegister(asm.NilRegister)
  1416  		}
  1417  	}
  1418  
  1419  	// Now we have the value of the target on the pickedRegister,
  1420  	// so push the location.
  1421  	c.pushRuntimeValueLocationOnRegister(pickedRegister, pickTarget.valueType)
  1422  	if isTargetVector {
  1423  		c.pushRuntimeValueLocationOnRegister(pickedRegister, runtimeValueTypeV128Hi)
  1424  	}
  1425  	return nil
  1426  }
  1427  
  1428  // compileAdd implements compiler.compileAdd for the arm64 architecture.
  1429  func (c *arm64Compiler) compileAdd(o *wazeroir.UnionOperation) error {
  1430  	x1, x2, err := c.popTwoValuesOnRegisters()
  1431  	if err != nil {
  1432  		return err
  1433  	}
  1434  
  1435  	// Addition can be a nop if one of the operands is zero.
  1436  	if isZeroRegister(x1.register) {
  1437  		c.pushRuntimeValueLocationOnRegister(x2.register, x1.valueType)
  1438  		return nil
  1439  	} else if isZeroRegister(x2.register) {
  1440  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1441  		return nil
  1442  	}
  1443  
  1444  	var inst asm.Instruction
  1445  	unsignedType := wazeroir.UnsignedType(o.B1)
  1446  	switch unsignedType {
  1447  	case wazeroir.UnsignedTypeI32:
  1448  		inst = arm64.ADDW
  1449  	case wazeroir.UnsignedTypeI64:
  1450  		inst = arm64.ADD
  1451  	case wazeroir.UnsignedTypeF32:
  1452  		inst = arm64.FADDS
  1453  	case wazeroir.UnsignedTypeF64:
  1454  		inst = arm64.FADDD
  1455  	}
  1456  
  1457  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1458  	// The result is placed in x1's register, so record it.
  1459  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1460  	return nil
  1461  }
  1462  
  1463  // compileSub implements compiler.compileSub for the arm64 architecture.
  1464  func (c *arm64Compiler) compileSub(o *wazeroir.UnionOperation) error {
  1465  	x1, x2, err := c.popTwoValuesOnRegisters()
  1466  	if err != nil {
  1467  		return err
  1468  	}
  1469  
  1470  	// If both registers are the zero register, this is a nop and we just push the zero register.
  1471  	if isZeroRegister(x1.register) && isZeroRegister(x2.register) {
  1472  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1473  		return nil
  1474  	}
  1475  
  1476  	// At this point, at least one of the x1 or x2 registers is non-zero.
  1477  	// Choose the non-zero register as the destination.
  1478  	destinationReg := x1.register
  1479  	if isZeroRegister(x1.register) {
  1480  		destinationReg = x2.register
  1481  	}
  1482  
  1483  	var inst asm.Instruction
  1484  	var vt runtimeValueType
  1485  	unsignedType := wazeroir.UnsignedType(o.B1)
  1486  	switch unsignedType {
  1487  	case wazeroir.UnsignedTypeI32:
  1488  		inst = arm64.SUBW
  1489  		vt = runtimeValueTypeI32
  1490  	case wazeroir.UnsignedTypeI64:
  1491  		inst = arm64.SUB
  1492  		vt = runtimeValueTypeI64
  1493  	case wazeroir.UnsignedTypeF32:
  1494  		inst = arm64.FSUBS
  1495  		vt = runtimeValueTypeF32
  1496  	case wazeroir.UnsignedTypeF64:
  1497  		inst = arm64.FSUBD
  1498  		vt = runtimeValueTypeF64
  1499  	}
  1500  
  1501  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1502  	c.pushRuntimeValueLocationOnRegister(destinationReg, vt)
  1503  	return nil
  1504  }
  1505  
  1506  // compileMul implements compiler.compileMul for the arm64 architecture.
  1507  func (c *arm64Compiler) compileMul(o *wazeroir.UnionOperation) error {
  1508  	x1, x2, err := c.popTwoValuesOnRegisters()
  1509  	if err != nil {
  1510  		return err
  1511  	}
  1512  
  1513  	// Multiplication always results in zero if either operand is zero, so we just push the zero register.
  1514  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1515  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1516  		return nil
  1517  	}
  1518  
  1519  	var inst asm.Instruction
  1520  	var vt runtimeValueType
  1521  	unsignedType := wazeroir.UnsignedType(o.B1)
  1522  	switch unsignedType {
  1523  	case wazeroir.UnsignedTypeI32:
  1524  		inst = arm64.MULW
  1525  		vt = runtimeValueTypeI32
  1526  	case wazeroir.UnsignedTypeI64:
  1527  		inst = arm64.MUL
  1528  		vt = runtimeValueTypeI64
  1529  	case wazeroir.UnsignedTypeF32:
  1530  		inst = arm64.FMULS
  1531  		vt = runtimeValueTypeF32
  1532  	case wazeroir.UnsignedTypeF64:
  1533  		inst = arm64.FMULD
  1534  		vt = runtimeValueTypeF64
  1535  	}
  1536  
  1537  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1538  	// The result is placed in x1's register, so record it.
  1539  	c.pushRuntimeValueLocationOnRegister(x1.register, vt)
  1540  	return nil
  1541  }
  1542  
  1543  // compileClz implements compiler.compileClz for the arm64 architecture.
  1544  func (c *arm64Compiler) compileClz(o *wazeroir.UnionOperation) error {
  1545  	v, err := c.popValueOnRegister()
  1546  	if err != nil {
  1547  		return err
  1548  	}
  1549  
  1550  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1551  	if isZeroRegister(v.register) {
  1552  		// If the target is the zero register, the result is always 32 (or 64 for the 64-bit variant),
  1553  		// so we allocate a register and put the constant in it.
  1554  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1555  		if err != nil {
  1556  			return err
  1557  		}
  1558  		var vt runtimeValueType
  1559  		if unsignedInt == wazeroir.UnsignedInt32 {
  1560  			vt = runtimeValueTypeI32
  1561  			c.assembler.CompileConstToRegister(arm64.MOVW, 32, reg)
  1562  		} else {
  1563  			vt = runtimeValueTypeI64
  1564  			c.assembler.CompileConstToRegister(arm64.MOVD, 64, reg)
  1565  		}
  1566  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  1567  		return nil
  1568  	}
  1569  
  1570  	reg := v.register
  1571  	var vt runtimeValueType
  1572  	if unsignedInt == wazeroir.UnsignedInt32 {
  1573  		vt = runtimeValueTypeI32
  1574  		c.assembler.CompileRegisterToRegister(arm64.CLZW, reg, reg)
  1575  	} else {
  1576  		vt = runtimeValueTypeI64
  1577  		c.assembler.CompileRegisterToRegister(arm64.CLZ, reg, reg)
  1578  	}
  1579  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  1580  	return nil
  1581  }
  1582  
  1583  // compileCtz implements compiler.compileCtz for the arm64 architecture.
  1584  func (c *arm64Compiler) compileCtz(o *wazeroir.UnionOperation) error {
  1585  	v, err := c.popValueOnRegister()
  1586  	if err != nil {
  1587  		return err
  1588  	}
  1589  
  1590  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1591  	reg := v.register
  1592  	if isZeroRegister(reg) {
  1593  		// If the target is the zero register, the result is always 32 (or 64 for the 64-bit variant),
  1594  		// so we allocate a register and put the constant in it.
  1595  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1596  		if err != nil {
  1597  			return err
  1598  		}
  1599  		var vt runtimeValueType
  1600  		if unsignedInt == wazeroir.UnsignedInt32 {
  1601  			vt = runtimeValueTypeI32
  1602  			c.assembler.CompileConstToRegister(arm64.MOVW, 32, reg)
  1603  		} else {
  1604  			vt = runtimeValueTypeI64
  1605  			c.assembler.CompileConstToRegister(arm64.MOVD, 64, reg)
  1606  		}
  1607  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  1608  		return nil
  1609  	}
  1610  
  1611  	// Since arm64 doesn't have an instruction directly counting trailing zeros,
  1612  	// we reverse the bits first and then do CLZ, which is exactly how
  1613  	// gcc implements __builtin_ctz for arm64.
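        	// A sketch of the resulting sequence for a 32-bit input held in w0
        	// (register chosen for illustration):
        	//
        	//    RBIT w0, w0   ;; reverse the bit order
        	//    CLZ  w0, w0   ;; count leading zeros of the reversed value
        	//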
  1614  	var vt runtimeValueType
  1615  	if unsignedInt == wazeroir.UnsignedInt32 {
  1616  		vt = runtimeValueTypeI32
  1617  		c.assembler.CompileRegisterToRegister(arm64.RBITW, reg, reg)
  1618  		c.assembler.CompileRegisterToRegister(arm64.CLZW, reg, reg)
  1619  	} else {
  1620  		vt = runtimeValueTypeI64
  1621  		c.assembler.CompileRegisterToRegister(arm64.RBIT, reg, reg)
  1622  		c.assembler.CompileRegisterToRegister(arm64.CLZ, reg, reg)
  1623  	}
  1624  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  1625  	return nil
  1626  }
  1627  
  1628  // compilePopcnt implements compiler.compilePopcnt for the arm64 architecture.
  1629  func (c *arm64Compiler) compilePopcnt(o *wazeroir.UnionOperation) error {
  1630  	v, err := c.popValueOnRegister()
  1631  	if err != nil {
  1632  		return err
  1633  	}
  1634  
  1635  	reg := v.register
  1636  	if isZeroRegister(reg) {
  1637  		c.pushRuntimeValueLocationOnRegister(reg, v.valueType)
  1638  		return nil
  1639  	}
  1640  
  1641  	freg, err := c.allocateRegister(registerTypeVector)
  1642  	if err != nil {
  1643  		return err
  1644  	}
  1645  
  1646  	// arm64 doesn't have an instruction for population count on a scalar register,
  1647  	// so we use the vector one (VCNT).
  1648  	// This is exactly what the official Go compiler emits for bits.OnesCount.
  1649  	// For example, "func() int { return bits.OnesCount(10) }" is compiled as
  1650  	//
  1651  	//    MOVD    $10, R0 ;; Load 10.
  1652  	//    FMOVD   R0, F0
  1653  	//    VCNT    V0.B8, V0.B8
  1654  	//    UADDLV  V0.B8, V0
  1655  	//
  1656  	var movInst asm.Instruction
  1657  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1658  	if unsignedInt == wazeroir.UnsignedInt32 {
  1659  		movInst = arm64.FMOVS
  1660  	} else {
  1661  		movInst = arm64.FMOVD
  1662  	}
  1663  	c.assembler.CompileRegisterToRegister(movInst, reg, freg)
  1664  	c.assembler.CompileVectorRegisterToVectorRegister(arm64.VCNT, freg, freg,
  1665  		arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone)
  1666  	c.assembler.CompileVectorRegisterToVectorRegister(arm64.UADDLV, freg, freg, arm64.VectorArrangement8B,
  1667  		arm64.VectorIndexNone, arm64.VectorIndexNone)
  1668  
  1669  	c.assembler.CompileRegisterToRegister(movInst, freg, reg)
  1670  
  1671  	c.pushRuntimeValueLocationOnRegister(reg, v.valueType)
  1672  	return nil
  1673  }
  1674  
  1675  // compileDiv implements compiler.compileDiv for the arm64 architecture.
  1676  func (c *arm64Compiler) compileDiv(o *wazeroir.UnionOperation) error {
  1677  	dividend, divisor, err := c.popTwoValuesOnRegisters()
  1678  	if err != nil {
  1679  		return err
  1680  	}
  1681  
  1682  	signedType := wazeroir.SignedType(o.B1)
  1683  
  1684  	// If the divisor is the zero register, the division unconditionally traps, so we just exit from the function.
  1685  	if isZeroRegister(divisor.register) {
  1686  		// Push any value so that the subsequent instruction can have a consistent location stack state.
  1687  		v := c.locationStack.pushRuntimeValueLocationOnStack()
  1688  		switch signedType {
  1689  		case wazeroir.SignedTypeInt32, wazeroir.SignedTypeUint32:
  1690  			v.valueType = runtimeValueTypeI32
  1691  		case wazeroir.SignedTypeUint64, wazeroir.SignedTypeInt64:
  1692  			v.valueType = runtimeValueTypeI64
  1693  		}
  1694  		c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero)
  1695  		return nil
  1696  	}
  1697  
  1698  	var inst asm.Instruction
  1699  	var vt runtimeValueType
  1700  	switch signedType {
  1701  	case wazeroir.SignedTypeUint32:
  1702  		inst = arm64.UDIVW
  1703  		if err := c.compileIntegerDivPrecheck(true, false, dividend.register, divisor.register); err != nil {
  1704  			return err
  1705  		}
  1706  		vt = runtimeValueTypeI32
  1707  	case wazeroir.SignedTypeUint64:
  1708  		if err := c.compileIntegerDivPrecheck(false, false, dividend.register, divisor.register); err != nil {
  1709  			return err
  1710  		}
  1711  		inst = arm64.UDIV
  1712  		vt = runtimeValueTypeI64
  1713  	case wazeroir.SignedTypeInt32:
  1714  		if err := c.compileIntegerDivPrecheck(true, true, dividend.register, divisor.register); err != nil {
  1715  			return err
  1716  		}
  1717  		inst = arm64.SDIVW
  1718  		vt = runtimeValueTypeI32
  1719  	case wazeroir.SignedTypeInt64:
  1720  		if err := c.compileIntegerDivPrecheck(false, true, dividend.register, divisor.register); err != nil {
  1721  			return err
  1722  		}
  1723  		inst = arm64.SDIV
  1724  		vt = runtimeValueTypeI64
  1725  	case wazeroir.SignedTypeFloat32:
  1726  		inst = arm64.FDIVS
  1727  		vt = runtimeValueTypeF32
  1728  	case wazeroir.SignedTypeFloat64:
  1729  		inst = arm64.FDIVD
  1730  		vt = runtimeValueTypeF64
  1731  	}
  1732  
  1733  	c.assembler.CompileRegisterToRegister(inst, divisor.register, dividend.register)
  1734  
  1735  	c.pushRuntimeValueLocationOnRegister(dividend.register, vt)
  1736  	return nil
  1737  }
  1738  
  1739  // compileIntegerDivPrecheck adds instructions to check that the divisor and dividend are sound for the division operation.
  1740  // First, it adds instructions that check whether the divisor equals zero and, if so, exit the function.
  1741  // In addition, for signed divisions, it checks whether the division would overflow.
  1742  func (c *arm64Compiler) compileIntegerDivPrecheck(is32Bit, isSigned bool, dividend, divisor asm.Register) error {
  1743  	// We check whether the divisor equals zero.
  1744  	var cmpInst, movInst, loadInst asm.Instruction
  1745  	var minValueOffsetInVM int64
  1746  	if is32Bit {
  1747  		cmpInst = arm64.CMPW
  1748  		movInst = arm64.MOVW
  1749  		loadInst = arm64.LDRW
  1750  		minValueOffsetInVM = arm64CallEngineArchContextMinimum32BitSignedIntOffset
  1751  	} else {
  1752  		cmpInst = arm64.CMP
  1753  		movInst = arm64.MOVD
  1754  		loadInst = arm64.LDRD
  1755  		minValueOffsetInVM = arm64CallEngineArchContextMinimum64BitSignedIntOffset
  1756  	}
  1757  	c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisor)
  1758  
  1759  	// If it is zero, we exit with nativeCallStatusIntegerDivisionByZero.
  1760  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusIntegerDivisionByZero)
  1761  	// Otherwise, we proceed.
  1762  
  1763  	// If the operation is a signed integer div, we have to do an additional check on overflow.
  1764  	if isSigned {
  1765  		// For signed division, we need extra branches for the "math.MinInt{32,64} / -1"
  1766  		// case, which results in overflow.
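        		// For example, math.MinInt32 / -1 would be 2147483648 (math.MaxInt32+1),
        		// which is not representable in a 32-bit signed integer.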
  1767  
  1768  		// First, we compare the divisor with -1.
  1769  		c.assembler.CompileConstToRegister(movInst, -1, arm64ReservedRegisterForTemporary)
  1770  		c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, divisor)
  1771  
  1772  		// If they are not equal, we skip the following check.
  1773  		brIfDivisorNonMinusOne := c.assembler.CompileJump(arm64.BCONDNE)
  1774  
  1775  		// Otherwise, we further check if the dividend equals math.MinInt32 or MinInt64.
  1776  		c.assembler.CompileMemoryToRegister(
  1777  			loadInst,
  1778  			arm64ReservedRegisterForCallEngine, minValueOffsetInVM,
  1779  			arm64ReservedRegisterForTemporary,
  1780  		)
  1781  		c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, dividend)
  1782  
  1783  		// If they are not equal, we are safe to execute the division.
  1784  		// Otherwise, we raise an integer overflow error.
  1785  		c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusIntegerOverflow)
  1786  
  1787  		c.assembler.SetJumpTargetOnNext(brIfDivisorNonMinusOne)
  1788  	}
  1789  	return nil
  1790  }
  1791  
  1792  // compileRem implements compiler.compileRem for the arm64 architecture.
  1793  func (c *arm64Compiler) compileRem(o *wazeroir.UnionOperation) error {
  1794  	dividend, divisor, err := c.popTwoValuesOnRegisters()
  1795  	if err != nil {
  1796  		return err
  1797  	}
  1798  
  1799  	dividendReg := dividend.register
  1800  	divisorReg := divisor.register
  1801  
  1802  	// If the divisor is the zero register, the remainder operation unconditionally traps, so we just exit from the function.
  1803  	if isZeroRegister(divisor.register) {
  1804  		// Push any value so that the subsequent instruction can have a consistent location stack state.
  1805  		v := c.locationStack.pushRuntimeValueLocationOnStack()
  1806  		v.valueType = runtimeValueTypeI32
  1807  		c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero)
  1808  		return nil
  1809  	}
  1810  
  1811  	var divInst, msubInst, cmpInst asm.Instruction
  1812  	signedInt := wazeroir.SignedInt(o.B1)
  1813  	switch signedInt {
  1814  	case wazeroir.SignedUint32:
  1815  		divInst = arm64.UDIVW
  1816  		msubInst = arm64.MSUBW
  1817  		cmpInst = arm64.CMPW
  1818  	case wazeroir.SignedUint64:
  1819  		divInst = arm64.UDIV
  1820  		msubInst = arm64.MSUB
  1821  		cmpInst = arm64.CMP
  1822  	case wazeroir.SignedInt32:
  1823  		divInst = arm64.SDIVW
  1824  		msubInst = arm64.MSUBW
  1825  		cmpInst = arm64.CMPW
  1826  	case wazeroir.SignedInt64:
  1827  		divInst = arm64.SDIV
  1828  		msubInst = arm64.MSUB
  1829  		cmpInst = arm64.CMP
  1830  	}
  1831  
  1832  	// We check whether the divisor equals zero.
  1833  	c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisorReg)
  1834  
  1835  	// If it is zero, we exit with nativeCallStatusIntegerDivisionByZero.
  1836  	c.compileMaybeExitFromNativeCode(arm64.BCONDNE, nativeCallStatusIntegerDivisionByZero)
  1837  	// Otherwise, we proceed.
  1838  
  1839  	// Temporarily mark them as used so that the result register allocated below doesn't reuse the registers holding these values.
  1840  	c.markRegisterUsed(dividend.register, divisor.register)
  1841  
  1842  	resultReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1843  	if err != nil {
  1844  		return err
  1845  	}
  1846  
  1847  	// arm64 doesn't have an instruction for rem, so we calculate it with two instructions: UDIV (SDIV for signed) and MSUB.
  1848  	// This is exactly the same code that Clang emits.
  1849  	// [input: x0=dividend, x1=divisor]
  1850  	// >> UDIV x2, x0, x1
  1851  	// >> MSUB x3, x2, x1, x0
  1852  	// [result: x2=quotient, x3=remainder]
  1853  	//
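        	// As a worked example, with dividend=7 and divisor=3:
        	//   quotient  = 7 / 3       = 2
        	//   remainder = 7 - (3 * 2) = 1
        	//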
  1854  	c.assembler.CompileTwoRegistersToRegister(divInst, divisorReg, dividendReg, resultReg)
  1855  	// ResultReg = dividendReg - (divisorReg * resultReg)
  1856  	c.assembler.CompileThreeRegistersToRegister(msubInst, divisorReg, dividendReg, resultReg, resultReg)
  1857  
  1858  	c.markRegisterUnused(dividend.register, divisor.register)
  1859  	c.pushRuntimeValueLocationOnRegister(resultReg, dividend.valueType)
  1860  	return nil
  1861  }
  1862  
  1863  // compileAnd implements compiler.compileAnd for the arm64 architecture.
  1864  func (c *arm64Compiler) compileAnd(o *wazeroir.UnionOperation) error {
  1865  	x1, x2, err := c.popTwoValuesOnRegisters()
  1866  	if err != nil {
  1867  		return err
  1868  	}
  1869  
  1870  	// If either of the registers x1 or x2 is zero,
  1871  	// the result will always be zero.
  1872  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1873  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1874  		return nil
  1875  	}
  1876  
  1877  	// At this point, at least one of the x1 or x2 registers is non-zero.
  1878  	// Choose the non-zero register as the destination.
  1879  	destinationReg := x1.register
  1880  	if isZeroRegister(x1.register) {
  1881  		destinationReg = x2.register
  1882  	}
  1883  
  1884  	var inst asm.Instruction
  1885  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1886  	switch unsignedInt {
  1887  	case wazeroir.UnsignedInt32:
  1888  		inst = arm64.ANDW
  1889  	case wazeroir.UnsignedInt64:
  1890  		inst = arm64.AND
  1891  	}
  1892  
  1893  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1894  	c.pushRuntimeValueLocationOnRegister(destinationReg, x1.valueType)
  1895  	return nil
  1896  }
  1897  
  1898  // compileOr implements compiler.compileOr for the arm64 architecture.
  1899  func (c *arm64Compiler) compileOr(o *wazeroir.UnionOperation) error {
  1900  	x1, x2, err := c.popTwoValuesOnRegisters()
  1901  	if err != nil {
  1902  		return err
  1903  	}
  1904  
  1905  	if isZeroRegister(x1.register) {
  1906  		c.pushRuntimeValueLocationOnRegister(x2.register, x2.valueType)
  1907  		return nil
  1908  	}
  1909  	if isZeroRegister(x2.register) {
  1910  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1911  		return nil
  1912  	}
  1913  
  1914  	var inst asm.Instruction
  1915  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1916  	switch unsignedInt {
  1917  	case wazeroir.UnsignedInt32:
  1918  		inst = arm64.ORRW
  1919  	case wazeroir.UnsignedInt64:
  1920  		inst = arm64.ORR
  1921  	}
  1922  
  1923  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1924  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1925  	return nil
  1926  }
  1927  
  1928  // compileXor implements compiler.compileXor for the arm64 architecture.
  1929  func (c *arm64Compiler) compileXor(o *wazeroir.UnionOperation) error {
  1930  	x1, x2, err := c.popTwoValuesOnRegisters()
  1931  	if err != nil {
  1932  		return err
  1933  	}
  1934  
  1935  	// Choose a non-zero register as the destination if one exists. If both are the zero
  1936  	// register, the result is zero anyway, so the zero register is fine as the destination.
  1937  	destinationReg := x1.register
  1938  	if isZeroRegister(x1.register) {
  1939  		destinationReg = x2.register
  1940  	}
  1941  
  1942  	var inst asm.Instruction
  1943  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1944  	switch unsignedInt {
  1945  	case wazeroir.UnsignedInt32:
  1946  		inst = arm64.EORW
  1947  	case wazeroir.UnsignedInt64:
  1948  		inst = arm64.EOR
  1949  	}
  1950  
  1951  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1952  	c.pushRuntimeValueLocationOnRegister(destinationReg, x1.valueType)
  1953  	return nil
  1954  }
  1955  
  1956  // compileShl implements compiler.compileShl for the arm64 architecture.
  1957  func (c *arm64Compiler) compileShl(o *wazeroir.UnionOperation) error {
  1958  	x1, x2, err := c.popTwoValuesOnRegisters()
  1959  	if err != nil {
  1960  		return err
  1961  	}
  1962  
  1963  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1964  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1965  		return nil
  1966  	}
  1967  
  1968  	var inst asm.Instruction
  1969  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1970  	switch unsignedInt {
  1971  	case wazeroir.UnsignedInt32:
  1972  		inst = arm64.LSLW
  1973  	case wazeroir.UnsignedInt64:
  1974  		inst = arm64.LSL
  1975  	}
  1976  
  1977  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1978  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1979  	return nil
  1980  }
  1981  
  1982  // compileShr implements compiler.compileShr for the arm64 architecture.
  1983  func (c *arm64Compiler) compileShr(o *wazeroir.UnionOperation) error {
  1984  	x1, x2, err := c.popTwoValuesOnRegisters()
  1985  	if err != nil {
  1986  		return err
  1987  	}
  1988  
  1989  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1990  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1991  		return nil
  1992  	}
  1993  
  1994  	var inst asm.Instruction
  1995  	signedInt := wazeroir.SignedInt(o.B1)
  1996  	switch signedInt {
  1997  	case wazeroir.SignedInt32:
  1998  		inst = arm64.ASRW
  1999  	case wazeroir.SignedInt64:
  2000  		inst = arm64.ASR
  2001  	case wazeroir.SignedUint32:
  2002  		inst = arm64.LSRW
  2003  	case wazeroir.SignedUint64:
  2004  		inst = arm64.LSR
  2005  	}
  2006  
  2007  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  2008  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2009  	return nil
  2010  }
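
        // For example, shifting the 32-bit value 0xfffffff8 (-8) right by one: the arithmetic
        // shift (ASRW) keeps the sign bit and yields -4 (0xfffffffc), while the logical shift
        // (LSRW) shifts in a zero and yields 0x7ffffffc.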
  2011  
  2012  // compileRotl implements compiler.compileRotl for the arm64 architecture.
  2013  func (c *arm64Compiler) compileRotl(o *wazeroir.UnionOperation) error {
  2014  	x1, x2, err := c.popTwoValuesOnRegisters()
  2015  	if err != nil {
  2016  		return err
  2017  	}
  2018  
  2019  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  2020  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2021  		return nil
  2022  	}
  2023  
  2024  	var inst, neginst asm.Instruction
  2025  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  2026  	switch unsignedInt {
  2027  	case wazeroir.UnsignedInt32:
  2028  		inst = arm64.RORW
  2029  		neginst = arm64.NEGW
  2030  	case wazeroir.UnsignedInt64:
  2031  		inst = arm64.ROR
  2032  		neginst = arm64.NEG
  2033  	}
  2034  
  2035  	// arm64 doesn't have a rotate-left instruction, so we negate the shift amount and rotate right instead,
  2036  	// similar to the assembly the Go compiler emits for bits.RotateLeft.
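        	// For example, rotl(x, 1) on a 32-bit value is emitted as rotr(x, -1),
        	// which the hardware treats as rotr(x, 31) since the rotate amount is taken
        	// modulo the register width.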
  2037  	c.assembler.CompileRegisterToRegister(neginst, x2.register, x2.register)
  2038  
  2039  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  2040  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2041  	return nil
  2042  }
  2043  
  2044  // compileRotr implements compiler.compileRotr for the arm64 architecture.
  2045  func (c *arm64Compiler) compileRotr(o *wazeroir.UnionOperation) error {
  2046  	x1, x2, err := c.popTwoValuesOnRegisters()
  2047  	if err != nil {
  2048  		return err
  2049  	}
  2050  
  2051  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  2052  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2053  		return nil
  2054  	}
  2055  
  2056  	var inst asm.Instruction
  2057  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  2058  	switch unsignedInt {
  2059  	case wazeroir.UnsignedInt32:
  2060  		inst = arm64.RORW
  2061  	case wazeroir.UnsignedInt64:
  2062  		inst = arm64.ROR
  2063  	}
  2064  
  2065  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  2066  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2067  	return nil
  2068  }
  2069  
  2070  // compileAbs implements compiler.compileAbs for the arm64 architecture.
  2071  func (c *arm64Compiler) compileAbs(o *wazeroir.UnionOperation) error {
  2072  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2073  		return c.compileSimpleUnop(arm64.FABSS, runtimeValueTypeF32)
  2074  	} else {
  2075  		return c.compileSimpleUnop(arm64.FABSD, runtimeValueTypeF64)
  2076  	}
  2077  }
  2078  
  2079  // compileNeg implements compiler.compileNeg for the arm64 architecture.
  2080  func (c *arm64Compiler) compileNeg(o *wazeroir.UnionOperation) error {
  2081  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2082  		return c.compileSimpleUnop(arm64.FNEGS, runtimeValueTypeF32)
  2083  	} else {
  2084  		return c.compileSimpleUnop(arm64.FNEGD, runtimeValueTypeF64)
  2085  	}
  2086  }
  2087  
  2088  // compileCeil implements compiler.compileCeil for the arm64 architecture.
  2089  func (c *arm64Compiler) compileCeil(o *wazeroir.UnionOperation) error {
  2090  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2091  		return c.compileSimpleUnop(arm64.FRINTPS, runtimeValueTypeF32)
  2092  	} else {
  2093  		return c.compileSimpleUnop(arm64.FRINTPD, runtimeValueTypeF64)
  2094  	}
  2095  }
  2096  
  2097  // compileFloor implements compiler.compileFloor for the arm64 architecture.
  2098  func (c *arm64Compiler) compileFloor(o *wazeroir.UnionOperation) error {
  2099  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2100  		return c.compileSimpleUnop(arm64.FRINTMS, runtimeValueTypeF32)
  2101  	} else {
  2102  		return c.compileSimpleUnop(arm64.FRINTMD, runtimeValueTypeF64)
  2103  	}
  2104  }
  2105  
  2106  // compileTrunc implements compiler.compileTrunc for the arm64 architecture.
  2107  func (c *arm64Compiler) compileTrunc(o *wazeroir.UnionOperation) error {
  2108  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2109  		return c.compileSimpleUnop(arm64.FRINTZS, runtimeValueTypeF32)
  2110  	} else {
  2111  		return c.compileSimpleUnop(arm64.FRINTZD, runtimeValueTypeF64)
  2112  	}
  2113  }
  2114  
  2115  // compileNearest implements compiler.compileNearest for the arm64 architecture.
  2116  func (c *arm64Compiler) compileNearest(o *wazeroir.UnionOperation) error {
  2117  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2118  		return c.compileSimpleUnop(arm64.FRINTNS, runtimeValueTypeF32)
  2119  	} else {
  2120  		return c.compileSimpleUnop(arm64.FRINTND, runtimeValueTypeF64)
  2121  	}
  2122  }
  2123  
  2124  // compileSqrt implements compiler.compileSqrt for the arm64 architecture.
  2125  func (c *arm64Compiler) compileSqrt(o *wazeroir.UnionOperation) error {
  2126  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2127  		return c.compileSimpleUnop(arm64.FSQRTS, runtimeValueTypeF32)
  2128  	} else {
  2129  		return c.compileSimpleUnop(arm64.FSQRTD, runtimeValueTypeF64)
  2130  	}
  2131  }
  2132  
  2133  // compileMin implements compiler.compileMin for the arm64 architecture.
  2134  func (c *arm64Compiler) compileMin(o *wazeroir.UnionOperation) error {
  2135  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2136  		return c.compileSimpleFloatBinop(arm64.FMINS)
  2137  	} else {
  2138  		return c.compileSimpleFloatBinop(arm64.FMIND)
  2139  	}
  2140  }
  2141  
  2142  // compileMax implements compiler.compileMax for the arm64 architecture.
  2143  func (c *arm64Compiler) compileMax(o *wazeroir.UnionOperation) error {
  2144  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2145  		return c.compileSimpleFloatBinop(arm64.FMAXS)
  2146  	} else {
  2147  		return c.compileSimpleFloatBinop(arm64.FMAXD)
  2148  	}
  2149  }
  2150  
  2151  func (c *arm64Compiler) compileSimpleFloatBinop(inst asm.Instruction) error {
  2152  	x1, x2, err := c.popTwoValuesOnRegisters()
  2153  	if err != nil {
  2154  		return err
  2155  	}
  2156  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  2157  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2158  	return nil
  2159  }
  2160  
  2161  // compileCopysign implements compiler.compileCopysign for the arm64 architecture.
  2162  func (c *arm64Compiler) compileCopysign(o *wazeroir.UnionOperation) error {
  2163  	x1, x2, err := c.popTwoValuesOnRegisters()
  2164  	if err != nil {
  2165  		return err
  2166  	}
  2167  
  2168  	var ldr asm.Instruction
  2169  	var minValueOffsetInVM int64
  2170  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2171  		ldr = arm64.FLDRS
  2172  		minValueOffsetInVM = arm64CallEngineArchContextMinimum32BitSignedIntOffset
  2173  	} else {
  2174  		ldr = arm64.FLDRD
  2175  		minValueOffsetInVM = arm64CallEngineArchContextMinimum64BitSignedIntOffset
  2176  	}
  2177  
  2178  	c.markRegisterUsed(x1.register, x2.register)
  2179  	freg, err := c.allocateRegister(registerTypeVector)
  2180  	if err != nil {
  2181  		return err
  2182  	}
  2183  
  2184  	// This is exactly the same code emitted by GCC for "__builtin_copysign":
  2185  	//
  2186  	//    mov     x0, -9223372036854775808
  2187  	//    fmov    d2, x0
  2188  	//    vbit    v0.8b, v1.8b, v2.8b
  2189  	//
  2190  	// "mov freg, -9223372036854775808 (stored at ce.minimum64BitSignedInt)"
  2191  	c.assembler.CompileMemoryToRegister(
  2192  		ldr,
  2193  		arm64ReservedRegisterForCallEngine, minValueOffsetInVM,
  2194  		freg,
  2195  	)
  2196  
  2197  	// VBIT inserts each bit from the first operand into the destination if the corresponding bit of the second operand is 1,
  2198  	// otherwise it leaves the destination bit unchanged.
  2199  	// See https://developer.arm.com/documentation/dui0801/g/Advanced-SIMD-Instructions--32-bit-/VBIT
  2200  	//
  2201  	// "vbit vreg.8b, x2vreg.8b, x1vreg.8b" == "insert the 64th bit (the sign bit) of x2 into x1".
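        	//
        	// As a worked example, with the sign-bit mask in freg, copysign(1.5, -2.0)
        	// takes only the sign bit from -2.0 and all remaining bits from 1.5,
        	// producing -1.5.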
  2202  	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VBIT,
  2203  		freg, x2.register, x1.register, arm64.VectorArrangement16B)
  2204  
  2205  	c.markRegisterUnused(x2.register)
  2206  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2207  	return nil
  2208  }
  2209  
  2210  // compileI32WrapFromI64 implements compiler.compileI32WrapFromI64 for the arm64 architecture.
  2211  func (c *arm64Compiler) compileI32WrapFromI64() error {
  2212  	return c.compileSimpleUnop(arm64.MOVW, runtimeValueTypeI32)
  2213  }
  2214  
  2215  // compileITruncFromF implements compiler.compileITruncFromF for the arm64 architecture.
  2216  func (c *arm64Compiler) compileITruncFromF(o *wazeroir.UnionOperation) error {
  2217  	// Clear the floating point status register (FPSR).
  2218  	c.assembler.CompileRegisterToRegister(arm64.MSR, arm64.RegRZR, arm64.RegFPSR)
  2219  
  2220  	var vt runtimeValueType
  2221  	var convinst asm.Instruction
  2222  	inputType := wazeroir.Float(o.B1)
  2223  	outputType := wazeroir.SignedInt(o.B2)
  2224  	nonTrapping := o.B3
  2225  
  2226  	is32bitFloat := inputType == wazeroir.Float32
  2227  	if is32bitFloat && outputType == wazeroir.SignedInt32 {
  2228  		convinst = arm64.FCVTZSSW
  2229  		vt = runtimeValueTypeI32
  2230  	} else if is32bitFloat && outputType == wazeroir.SignedInt64 {
  2231  		convinst = arm64.FCVTZSS
  2232  		vt = runtimeValueTypeI64
  2233  	} else if !is32bitFloat && outputType == wazeroir.SignedInt32 {
  2234  		convinst = arm64.FCVTZSDW
  2235  		vt = runtimeValueTypeI32
  2236  	} else if !is32bitFloat && outputType == wazeroir.SignedInt64 {
  2237  		convinst = arm64.FCVTZSD
  2238  		vt = runtimeValueTypeI64
  2239  	} else if is32bitFloat && outputType == wazeroir.SignedUint32 {
  2240  		convinst = arm64.FCVTZUSW
  2241  		vt = runtimeValueTypeI32
  2242  	} else if is32bitFloat && outputType == wazeroir.SignedUint64 {
  2243  		convinst = arm64.FCVTZUS
  2244  		vt = runtimeValueTypeI64
  2245  	} else if !is32bitFloat && outputType == wazeroir.SignedUint32 {
  2246  		convinst = arm64.FCVTZUDW
  2247  		vt = runtimeValueTypeI32
  2248  	} else if !is32bitFloat && outputType == wazeroir.SignedUint64 {
  2249  		convinst = arm64.FCVTZUD
  2250  		vt = runtimeValueTypeI64
  2251  	}
  2252  
  2253  	source, err := c.popValueOnRegister()
  2254  	if err != nil {
  2255  		return err
  2256  	}
  2257  	sourceReg := source.register
  2258  
  2259  	destinationReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2260  	if err != nil {
  2261  		return err
  2262  	}
  2263  
  2264  	c.assembler.CompileRegisterToRegister(convinst, sourceReg, destinationReg)
  2265  	c.pushRuntimeValueLocationOnRegister(destinationReg, vt)
  2266  
  2267  	if !nonTrapping {
  2268  		// Read the floating point status register value into the temporary general purpose register,
  2269  		// so that we can check whether the conversion resulted in undefined behavior.
  2270  		c.assembler.CompileRegisterToRegister(arm64.MRS, arm64.RegFPSR, arm64ReservedRegisterForTemporary)
  2271  		// Check whether the conversion was invalid by comparing the status with 1, i.e. whether the IOC (invalid operation) flag is set.
  2272  		// See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register
  2273  		c.assembler.CompileRegisterAndConstToNone(arm64.CMP, arm64ReservedRegisterForTemporary, 1)
  2274  
  2275  		brOK := c.assembler.CompileJump(arm64.BCONDNE)
  2276  
  2277  		// If so, exit the execution with errors depending on whether or not the source value is NaN.
  2278  		var floatcmp asm.Instruction
  2279  		if is32bitFloat {
  2280  			floatcmp = arm64.FCMPS
  2281  		} else {
  2282  			floatcmp = arm64.FCMPD
  2283  		}
  2284  		c.assembler.CompileTwoRegistersToNone(floatcmp, sourceReg, sourceReg)
  2285  		// The VS condition holds if at least one of the FCMP operands is NaN.
  2286  		// https://developer.arm.com/documentation/dui0801/g/Condition-Codes/Comparison-of-condition-code-meanings-in-integer-and-floating-point-code
  2287  		// If the source value is not NaN, the failure was an integer overflow.
  2288  		c.compileMaybeExitFromNativeCode(arm64.BCONDVS, nativeCallStatusIntegerOverflow)
  2289  
  2290  		// Otherwise, the operation was invalid, as it attempted to convert NaN to an integer.
  2291  		c.compileExitFromNativeCode(nativeCallStatusCodeInvalidFloatToIntConversion)
  2292  
  2293  		// If no error occurred, we jump here and continue to the next instruction.
  2294  		c.assembler.SetJumpTargetOnNext(brOK)
  2295  	}
  2296  	return nil
  2297  }
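
        // For example, i32.trunc_f32_s on NaN exits with nativeCallStatusCodeInvalidFloatToIntConversion,
        // while on an out-of-range value such as 3e9 (greater than math.MaxInt32) it exits with
        // nativeCallStatusIntegerOverflow; both cases are detected via the FPSR check above.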
  2298  
  2299  // compileFConvertFromI implements compiler.compileFConvertFromI for the arm64 architecture.
  2300  func (c *arm64Compiler) compileFConvertFromI(o *wazeroir.UnionOperation) error {
  2301  	var convinst asm.Instruction
  2302  	inputType := wazeroir.SignedInt(o.B1)
  2303  	outputType := wazeroir.Float(o.B2)
  2304  
  2305  	if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt32 {
  2306  		convinst = arm64.SCVTFWS
  2307  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt64 {
  2308  		convinst = arm64.SCVTFS
  2309  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt32 {
  2310  		convinst = arm64.SCVTFWD
  2311  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt64 {
  2312  		convinst = arm64.SCVTFD
  2313  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint32 {
  2314  		convinst = arm64.UCVTFWS
  2315  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint64 {
  2316  		convinst = arm64.UCVTFS
  2317  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint32 {
  2318  		convinst = arm64.UCVTFWD
  2319  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint64 {
  2320  		convinst = arm64.UCVTFD
  2321  	}
  2322  
  2323  	var vt runtimeValueType
  2324  	if outputType == wazeroir.Float32 {
  2325  		vt = runtimeValueTypeF32
  2326  	} else {
  2327  		vt = runtimeValueTypeF64
  2328  	}
  2329  	return c.compileSimpleConversion(convinst, registerTypeVector, vt)
  2330  }
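
        // For example, f64.convert_i32_u maps to UCVTFWD above: converting the maximum
        // uint32 value 0xffffffff yields exactly 4294967295.0, since every uint32 is
        // representable in a float64.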
  2331  
  2332  // compileF32DemoteFromF64 implements compiler.compileF32DemoteFromF64 for the arm64 architecture.
  2333  func (c *arm64Compiler) compileF32DemoteFromF64() error {
  2334  	return c.compileSimpleUnop(arm64.FCVTDS, runtimeValueTypeF32)
  2335  }
  2336  
  2337  // compileF64PromoteFromF32 implements compiler.compileF64PromoteFromF32 for the arm64 architecture.
  2338  func (c *arm64Compiler) compileF64PromoteFromF32() error {
  2339  	return c.compileSimpleUnop(arm64.FCVTSD, runtimeValueTypeF64)
  2340  }
  2341  
  2342  // compileI32ReinterpretFromF32 implements compiler.compileI32ReinterpretFromF32 for the arm64 architecture.
  2343  func (c *arm64Compiler) compileI32ReinterpretFromF32() error {
  2344  	if peek := c.locationStack.peek(); peek.onStack() {
  2345  		// If the value is on the stack, this is a no-op: reinterpretation keeps the bits as-is and only relabels the value type.
  2346  		peek.valueType = runtimeValueTypeI32
  2347  		return nil
  2348  	}
  2349  	return c.compileSimpleConversion(arm64.FMOVS, registerTypeGeneralPurpose, runtimeValueTypeI32)
  2350  }
  2351  
  2352  // compileI64ReinterpretFromF64 implements compiler.compileI64ReinterpretFromF64 for the arm64 architecture.
  2353  func (c *arm64Compiler) compileI64ReinterpretFromF64() error {
  2354  	if peek := c.locationStack.peek(); peek.onStack() {
  2355  		// If the value is on the stack, this is a no-op: reinterpretation keeps the bits as-is and only relabels the value type.
  2356  		peek.valueType = runtimeValueTypeI64
  2357  		return nil
  2358  	}
  2359  	return c.compileSimpleConversion(arm64.FMOVD, registerTypeGeneralPurpose, runtimeValueTypeI64)
  2360  }
  2361  
  2362  // compileF32ReinterpretFromI32 implements compiler.compileF32ReinterpretFromI32 for the arm64 architecture.
  2363  func (c *arm64Compiler) compileF32ReinterpretFromI32() error {
  2364  	if peek := c.locationStack.peek(); peek.onStack() {
  2365  		// If the value is on the stack, this is a no-op: reinterpretation keeps the bits as-is and only relabels the value type.
  2366  		peek.valueType = runtimeValueTypeF32
  2367  		return nil
  2368  	}
  2369  	return c.compileSimpleConversion(arm64.FMOVS, registerTypeVector, runtimeValueTypeF32)
  2370  }
  2371  
  2372  // compileF64ReinterpretFromI64 implements compiler.compileF64ReinterpretFromI64 for the arm64 architecture.
  2373  func (c *arm64Compiler) compileF64ReinterpretFromI64() error {
  2374  	if peek := c.locationStack.peek(); peek.onStack() {
  2375  		// If the value is on the stack, this is a no-op: reinterpretation keeps the bits as-is and only relabels the value type.
  2376  		peek.valueType = runtimeValueTypeF64
  2377  		return nil
  2378  	}
  2379  	return c.compileSimpleConversion(arm64.FMOVD, registerTypeVector, runtimeValueTypeF64)
  2380  }
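
        // In Go terms, these reinterpret operations behave like math.Float64bits and friends:
        // the raw bits are unchanged, e.g. math.Float64bits(1.0) == 0x3ff0000000000000.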
  2381  
  2382  func (c *arm64Compiler) compileSimpleConversion(inst asm.Instruction, destinationRegType registerType, resultRuntimeValueType runtimeValueType) error {
  2383  	source, err := c.popValueOnRegister()
  2384  	if err != nil {
  2385  		return err
  2386  	}
  2387  
  2388  	destinationReg, err := c.allocateRegister(destinationRegType)
  2389  	if err != nil {
  2390  		return err
  2391  	}
  2392  
  2393  	c.assembler.CompileRegisterToRegister(inst, source.register, destinationReg)
  2394  	c.pushRuntimeValueLocationOnRegister(destinationReg, resultRuntimeValueType)
  2395  	return nil
  2396  }
  2397  
  2398  // compileExtend implements compiler.compileExtend for the arm64 architecture.
  2399  func (c *arm64Compiler) compileExtend(o *wazeroir.UnionOperation) error {
  2400  	signed := o.B1 != 0
  2401  	if signed {
  2402  		return c.compileSimpleUnop(arm64.SXTW, runtimeValueTypeI64)
  2403  	} else {
  2404  		return c.compileSimpleUnop(arm64.MOVW, runtimeValueTypeI64)
  2405  	}
  2406  }
  2407  
  2408  // compileSignExtend32From8 implements compiler.compileSignExtend32From8 for the arm64 architecture.
  2409  func (c *arm64Compiler) compileSignExtend32From8() error {
  2410  	return c.compileSimpleUnop(arm64.SXTBW, runtimeValueTypeI32)
  2411  }
  2412  
  2413  // compileSignExtend32From16 implements compiler.compileSignExtend32From16 for the arm64 architecture.
  2414  func (c *arm64Compiler) compileSignExtend32From16() error {
  2415  	return c.compileSimpleUnop(arm64.SXTHW, runtimeValueTypeI32)
  2416  }
  2417  
  2418  // compileSignExtend64From8 implements compiler.compileSignExtend64From8 for the arm64 architecture.
  2419  func (c *arm64Compiler) compileSignExtend64From8() error {
  2420  	return c.compileSimpleUnop(arm64.SXTB, runtimeValueTypeI64)
  2421  }
  2422  
  2423  // compileSignExtend64From16 implements compiler.compileSignExtend64From16 for the arm64 architecture.
  2424  func (c *arm64Compiler) compileSignExtend64From16() error {
  2425  	return c.compileSimpleUnop(arm64.SXTH, runtimeValueTypeI64)
  2426  }
  2427  
  2428  // compileSignExtend64From32 implements compiler.compileSignExtend64From32 for the arm64 architecture.
  2429  func (c *arm64Compiler) compileSignExtend64From32() error {
  2430  	return c.compileSimpleUnop(arm64.SXTW, runtimeValueTypeI64)
  2431  }
  2432  
  2433  func (c *arm64Compiler) compileSimpleUnop(inst asm.Instruction, resultRuntimeValueType runtimeValueType) error {
  2434  	v, err := c.popValueOnRegister()
  2435  	if err != nil {
  2436  		return err
  2437  	}
  2438  	reg := v.register
  2439  	c.assembler.CompileRegisterToRegister(inst, reg, reg)
  2440  	c.pushRuntimeValueLocationOnRegister(reg, resultRuntimeValueType)
  2441  	return nil
  2442  }
  2443  
  2444  // compileEq implements compiler.compileEq for the arm64 architecture.
  2445  func (c *arm64Compiler) compileEq(o *wazeroir.UnionOperation) error {
  2446  	return c.emitEqOrNe(true, wazeroir.UnsignedType(o.B1))
  2447  }
  2448  
  2449  // compileNe implements compiler.compileNe for the arm64 architecture.
  2450  func (c *arm64Compiler) compileNe(o *wazeroir.UnionOperation) error {
  2451  	return c.emitEqOrNe(false, wazeroir.UnsignedType(o.B1))
  2452  }
  2453  
  2454  // emitEqOrNe implements compiler.compileEq and compiler.compileNe for the arm64 architecture.
  2455  func (c *arm64Compiler) emitEqOrNe(isEq bool, unsignedType wazeroir.UnsignedType) error {
  2456  	x1, x2, err := c.popTwoValuesOnRegisters()
  2457  	if err != nil {
  2458  		return err
  2459  	}
  2460  
  2461  	var inst asm.Instruction
  2462  	switch unsignedType {
  2463  	case wazeroir.UnsignedTypeI32:
  2464  		inst = arm64.CMPW
  2465  	case wazeroir.UnsignedTypeI64:
  2466  		inst = arm64.CMP
  2467  	case wazeroir.UnsignedTypeF32:
  2468  		inst = arm64.FCMPS
  2469  	case wazeroir.UnsignedTypeF64:
  2470  		inst = arm64.FCMPD
  2471  	}
  2472  
  2473  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2474  
  2475  	// Push the comparison result as a conditional register value.
  2476  	cond := arm64.CondNE
  2477  	if isEq {
  2478  		cond = arm64.CondEQ
  2479  	}
  2480  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(cond)
  2481  	return nil
  2482  }
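
        // A sketch of how such a comparison typically reaches machine code, assuming the
        // conditional value is later materialized into a general purpose register
        // (illustrative registers):
        //
        //    CMPW w_x1, w_x2    ;; compare x1 and x2
        //    CSET w_res, EQ     ;; w_res = 1 if equal, 0 otherwise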
  2483  
  2484  // compileEqz implements compiler.compileEqz for the arm64 architecture.
  2485  func (c *arm64Compiler) compileEqz(o *wazeroir.UnionOperation) error {
  2486  	x1, err := c.popValueOnRegister()
  2487  	if err != nil {
  2488  		return err
  2489  	}
  2490  
  2491  	var inst asm.Instruction
  2492  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  2493  	switch unsignedInt {
  2494  	case wazeroir.UnsignedInt32:
  2495  		inst = arm64.CMPW
  2496  	case wazeroir.UnsignedInt64:
  2497  		inst = arm64.CMP
  2498  	}
  2499  
  2500  	c.assembler.CompileTwoRegistersToNone(inst, arm64.RegRZR, x1.register)
  2501  
  2502  	// Push the comparison result as a conditional register value.
  2503  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(arm64.CondEQ)
  2504  	return nil
  2505  }
  2506  
  2507  // compileLt implements compiler.compileLt for the arm64 architecture.
  2508  func (c *arm64Compiler) compileLt(o *wazeroir.UnionOperation) error {
  2509  	x1, x2, err := c.popTwoValuesOnRegisters()
  2510  	if err != nil {
  2511  		return err
  2512  	}
  2513  
  2514  	var inst asm.Instruction
  2515  	var conditionalRegister asm.ConditionalRegisterState
  2516  	signedType := wazeroir.SignedType(o.B1)
  2517  	switch signedType {
  2518  	case wazeroir.SignedTypeUint32:
  2519  		inst = arm64.CMPW
  2520  		conditionalRegister = arm64.CondLO
  2521  	case wazeroir.SignedTypeUint64:
  2522  		inst = arm64.CMP
  2523  		conditionalRegister = arm64.CondLO
  2524  	case wazeroir.SignedTypeInt32:
  2525  		inst = arm64.CMPW
  2526  		conditionalRegister = arm64.CondLT
  2527  	case wazeroir.SignedTypeInt64:
  2528  		inst = arm64.CMP
  2529  		conditionalRegister = arm64.CondLT
  2530  	case wazeroir.SignedTypeFloat32:
  2531  		inst = arm64.FCMPS
  2532  		conditionalRegister = arm64.CondMI
  2533  	case wazeroir.SignedTypeFloat64:
  2534  		inst = arm64.FCMPD
  2535  		conditionalRegister = arm64.CondMI
  2536  	}
  2537  
  2538  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2539  
  2540  	// Push the comparison result as a conditional register value.
  2541  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2542  	return nil
  2543  }
  2544  
  2545  // compileGt implements compiler.compileGt for the arm64 architecture.
  2546  func (c *arm64Compiler) compileGt(o *wazeroir.UnionOperation) error {
  2547  	x1, x2, err := c.popTwoValuesOnRegisters()
  2548  	if err != nil {
  2549  		return err
  2550  	}
  2551  
  2552  	var inst asm.Instruction
  2553  	var conditionalRegister asm.ConditionalRegisterState
  2554  	signedType := wazeroir.SignedType(o.B1)
  2555  	switch signedType {
  2556  	case wazeroir.SignedTypeUint32:
  2557  		inst = arm64.CMPW
  2558  		conditionalRegister = arm64.CondHI
  2559  	case wazeroir.SignedTypeUint64:
  2560  		inst = arm64.CMP
  2561  		conditionalRegister = arm64.CondHI
  2562  	case wazeroir.SignedTypeInt32:
  2563  		inst = arm64.CMPW
  2564  		conditionalRegister = arm64.CondGT
  2565  	case wazeroir.SignedTypeInt64:
  2566  		inst = arm64.CMP
  2567  		conditionalRegister = arm64.CondGT
  2568  	case wazeroir.SignedTypeFloat32:
  2569  		inst = arm64.FCMPS
  2570  		conditionalRegister = arm64.CondGT
  2571  	case wazeroir.SignedTypeFloat64:
  2572  		inst = arm64.FCMPD
  2573  		conditionalRegister = arm64.CondGT
  2574  	}
  2575  
  2576  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2577  
  2578  	// Push the comparison result as a conditional register value.
  2579  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2580  	return nil
  2581  }
  2582  
  2583  // compileLe implements compiler.compileLe for the arm64 architecture.
  2584  func (c *arm64Compiler) compileLe(o *wazeroir.UnionOperation) error {
  2585  	x1, x2, err := c.popTwoValuesOnRegisters()
  2586  	if err != nil {
  2587  		return err
  2588  	}
  2589  
  2590  	var inst asm.Instruction
  2591  	var conditionalRegister asm.ConditionalRegisterState
  2592  	signedType := wazeroir.SignedType(o.B1)
  2593  	switch signedType {
  2594  	case wazeroir.SignedTypeUint32:
  2595  		inst = arm64.CMPW
  2596  		conditionalRegister = arm64.CondLS
  2597  	case wazeroir.SignedTypeUint64:
  2598  		inst = arm64.CMP
  2599  		conditionalRegister = arm64.CondLS
  2600  	case wazeroir.SignedTypeInt32:
  2601  		inst = arm64.CMPW
  2602  		conditionalRegister = arm64.CondLE
  2603  	case wazeroir.SignedTypeInt64:
  2604  		inst = arm64.CMP
  2605  		conditionalRegister = arm64.CondLE
  2606  	case wazeroir.SignedTypeFloat32:
  2607  		inst = arm64.FCMPS
  2608  		conditionalRegister = arm64.CondLS
  2609  	case wazeroir.SignedTypeFloat64:
  2610  		inst = arm64.FCMPD
  2611  		conditionalRegister = arm64.CondLS
  2612  	}
  2613  
  2614  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2615  
  2616  	// Push the comparison result as a conditional register value.
  2617  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2618  	return nil
  2619  }
  2620  
  2621  // compileGe implements compiler.compileGe for the arm64 architecture.
  2622  func (c *arm64Compiler) compileGe(o *wazeroir.UnionOperation) error {
  2623  	x1, x2, err := c.popTwoValuesOnRegisters()
  2624  	if err != nil {
  2625  		return err
  2626  	}
  2627  
  2628  	var inst asm.Instruction
  2629  	var conditionalRegister asm.ConditionalRegisterState
  2630  	signedType := wazeroir.SignedType(o.B1)
  2631  	switch signedType {
  2632  	case wazeroir.SignedTypeUint32:
  2633  		inst = arm64.CMPW
  2634  		conditionalRegister = arm64.CondHS
  2635  	case wazeroir.SignedTypeUint64:
  2636  		inst = arm64.CMP
  2637  		conditionalRegister = arm64.CondHS
  2638  	case wazeroir.SignedTypeInt32:
  2639  		inst = arm64.CMPW
  2640  		conditionalRegister = arm64.CondGE
  2641  	case wazeroir.SignedTypeInt64:
  2642  		inst = arm64.CMP
  2643  		conditionalRegister = arm64.CondGE
  2644  	case wazeroir.SignedTypeFloat32:
  2645  		inst = arm64.FCMPS
  2646  		conditionalRegister = arm64.CondGE
  2647  	case wazeroir.SignedTypeFloat64:
  2648  		inst = arm64.FCMPD
  2649  		conditionalRegister = arm64.CondGE
  2650  	}
  2651  
  2652  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2653  
  2654  	// Push the comparison result as a conditional register value.
  2655  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2656  	return nil
  2657  }
  2658  
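        // The four ordered comparisons above all share one shape and differ only in
        // the condition flag pushed after CMP/FCMP. Roughly, the mapping is:
        //
        //	unsigned ints: LO (<), HI (>), LS (<=), HS (>=)
        //	signed ints:   LT (<), GT (>), LE (<=), GE (>=)
        //	floats:        MI (<), GT (>), LS (<=), GE (>=)
        //
        // Note that for floats, an unordered (NaN) comparison sets the flags so that
        // all four conditions above evaluate to false, matching Wasm semantics.
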
  2659  // compileLoad implements compiler.compileLoad for the arm64 architecture.
  2660  func (c *arm64Compiler) compileLoad(o *wazeroir.UnionOperation) error {
  2661  	var (
  2662  		isFloat           bool
  2663  		loadInst          asm.Instruction
  2664  		targetSizeInBytes int64
  2665  		vt                runtimeValueType
  2666  	)
  2667  
  2668  	unsignedType := wazeroir.UnsignedType(o.B1)
  2669  	offset := uint32(o.U2)
  2670  
  2671  	switch unsignedType {
  2672  	case wazeroir.UnsignedTypeI32:
  2673  		loadInst = arm64.LDRW
  2674  		targetSizeInBytes = 32 / 8
  2675  		vt = runtimeValueTypeI32
  2676  	case wazeroir.UnsignedTypeI64:
  2677  		loadInst = arm64.LDRD
  2678  		targetSizeInBytes = 64 / 8
  2679  		vt = runtimeValueTypeI64
  2680  	case wazeroir.UnsignedTypeF32:
  2681  		loadInst = arm64.FLDRS
  2682  		isFloat = true
  2683  		targetSizeInBytes = 32 / 8
  2684  		vt = runtimeValueTypeF32
  2685  	case wazeroir.UnsignedTypeF64:
  2686  		loadInst = arm64.FLDRD
  2687  		isFloat = true
  2688  		targetSizeInBytes = 64 / 8
  2689  		vt = runtimeValueTypeF64
  2690  	}
  2691  	return c.compileLoadImpl(offset, loadInst, targetSizeInBytes, isFloat, vt)
  2692  }
  2693  
  2694  // compileLoad8 implements compiler.compileLoad8 for the arm64 architecture.
  2695  func (c *arm64Compiler) compileLoad8(o *wazeroir.UnionOperation) error {
  2696  	var loadInst asm.Instruction
  2697  	var vt runtimeValueType
  2698  
  2699  	signedInt := wazeroir.SignedInt(o.B1)
  2700  	offset := uint32(o.U2)
  2701  
  2702  	switch signedInt {
  2703  	case wazeroir.SignedInt32:
  2704  		loadInst = arm64.LDRSBW
  2705  		vt = runtimeValueTypeI32
  2706  	case wazeroir.SignedInt64:
  2707  		loadInst = arm64.LDRSBD
  2708  		vt = runtimeValueTypeI64
  2709  	case wazeroir.SignedUint32:
  2710  		loadInst = arm64.LDRB
  2711  		vt = runtimeValueTypeI32
  2712  	case wazeroir.SignedUint64:
  2713  		loadInst = arm64.LDRB
  2714  		vt = runtimeValueTypeI64
  2715  	}
  2716  	return c.compileLoadImpl(offset, loadInst, 1, false, vt)
  2717  }
  2718  
  2719  // compileLoad16 implements compiler.compileLoad16 for the arm64 architecture.
  2720  func (c *arm64Compiler) compileLoad16(o *wazeroir.UnionOperation) error {
  2721  	var loadInst asm.Instruction
  2722  	var vt runtimeValueType
  2723  
  2724  	signedInt := wazeroir.SignedInt(o.B1)
  2725  	offset := uint32(o.U2)
  2726  
  2727  	switch signedInt {
  2728  	case wazeroir.SignedInt32:
  2729  		loadInst = arm64.LDRSHW
  2730  		vt = runtimeValueTypeI32
  2731  	case wazeroir.SignedInt64:
  2732  		loadInst = arm64.LDRSHD
  2733  		vt = runtimeValueTypeI64
  2734  	case wazeroir.SignedUint32:
  2735  		loadInst = arm64.LDRH
  2736  		vt = runtimeValueTypeI32
  2737  	case wazeroir.SignedUint64:
  2738  		loadInst = arm64.LDRH
  2739  		vt = runtimeValueTypeI64
  2740  	}
  2741  	return c.compileLoadImpl(offset, loadInst, 16/8, false, vt)
  2742  }
  2743  
  2744  // compileLoad32 implements compiler.compileLoad32 for the arm64 architecture.
  2745  func (c *arm64Compiler) compileLoad32(o *wazeroir.UnionOperation) error {
  2746  	var loadInst asm.Instruction
  2747  	signed := o.B1 == 1
  2748  	offset := uint32(o.U2)
  2749  
  2750  	if signed {
  2751  		loadInst = arm64.LDRSW
  2752  	} else {
  2753  		loadInst = arm64.LDRW
  2754  	}
  2755  	return c.compileLoadImpl(offset, loadInst, 32/8, false, runtimeValueTypeI64)
  2756  }
  2757  
  2758  // compileLoadImpl implements the compileLoad* variants for the arm64 architecture.
  2759  func (c *arm64Compiler) compileLoadImpl(offsetArg uint32, loadInst asm.Instruction,
  2760  	targetSizeInBytes int64, isFloat bool, resultRuntimeValueType runtimeValueType,
  2761  ) error {
  2762  	offsetReg, err := c.compileMemoryAccessOffsetSetup(offsetArg, targetSizeInBytes)
  2763  	if err != nil {
  2764  		return err
  2765  	}
  2766  
  2767  	resultRegister := offsetReg
  2768  	if isFloat {
  2769  		resultRegister, err = c.allocateRegister(registerTypeVector)
  2770  		if err != nil {
  2771  			return err
  2772  		}
  2773  	}
  2774  
  2775  	// "resultRegister = [arm64ReservedRegisterForMemory + offsetReg]"
  2776  	// In other words, "resultRegister = memory.Buffer[offset: offset+targetSizeInBytes]"
  2777  	c.assembler.CompileMemoryWithRegisterOffsetToRegister(
  2778  		loadInst,
  2779  		arm64ReservedRegisterForMemory, offsetReg,
  2780  		resultRegister,
  2781  	)
  2782  
  2783  	c.pushRuntimeValueLocationOnRegister(resultRegister, resultRuntimeValueType)
  2784  	return nil
  2785  }
  2786  
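        // In Go terms, the load emitted above behaves roughly like the following
        // sketch, where "memory" stands for the current memory instance and the
        // bounds check has already been performed by compileMemoryAccessOffsetSetup:
        //
        //	// e.g. a 32-bit load (LDRW); other widths are analogous.
        //	result := binary.LittleEndian.Uint32(memory.Buffer[offset : offset+4])
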
  2787  // compileStore implements compiler.compileStore for the arm64 architecture.
  2788  func (c *arm64Compiler) compileStore(o *wazeroir.UnionOperation) error {
  2789  	var movInst asm.Instruction
  2790  	var targetSizeInBytes int64
  2791  	unsignedType := wazeroir.UnsignedType(o.B1)
  2792  	offset := uint32(o.U2)
  2793  	switch unsignedType {
  2794  	case wazeroir.UnsignedTypeI32:
  2795  		movInst = arm64.STRW
  2796  		targetSizeInBytes = 32 / 8
  2797  	case wazeroir.UnsignedTypeI64:
  2798  		movInst = arm64.STRD
  2799  		targetSizeInBytes = 64 / 8
  2800  	case wazeroir.UnsignedTypeF32:
  2801  		movInst = arm64.FSTRS
  2802  		targetSizeInBytes = 32 / 8
  2803  	case wazeroir.UnsignedTypeF64:
  2804  		movInst = arm64.FSTRD
  2805  		targetSizeInBytes = 64 / 8
  2806  	}
  2807  	return c.compileStoreImpl(offset, movInst, targetSizeInBytes)
  2808  }
  2809  
  2810  // compileStore8 implements compiler.compileStore8 for the arm64 architecture.
  2811  func (c *arm64Compiler) compileStore8(o *wazeroir.UnionOperation) error {
  2812  	return c.compileStoreImpl(uint32(o.U2), arm64.STRB, 1)
  2813  }
  2814  
  2815  // compileStore16 implements compiler.compileStore16 for the arm64 architecture.
  2816  func (c *arm64Compiler) compileStore16(o *wazeroir.UnionOperation) error {
  2817  	return c.compileStoreImpl(uint32(o.U2), arm64.STRH, 16/8)
  2818  }
  2819  
  2820  // compileStore32 implements compiler.compileStore32 for the arm64 architecture.
  2821  func (c *arm64Compiler) compileStore32(o *wazeroir.UnionOperation) error {
  2822  	return c.compileStoreImpl(uint32(o.U2), arm64.STRW, 32/8)
  2823  }
  2824  
  2825  // compileStoreImpl implements the compileStore* variants for the arm64 architecture.
  2826  func (c *arm64Compiler) compileStoreImpl(offsetArg uint32, storeInst asm.Instruction, targetSizeInBytes int64) error {
  2827  	val, err := c.popValueOnRegister()
  2828  	if err != nil {
  2829  		return err
  2830  	}
  2831  	// Mark temporarily used as compileMemoryAccessOffsetSetup might try allocating register.
  2832  	c.markRegisterUsed(val.register)
  2833  
  2834  	offsetReg, err := c.compileMemoryAccessOffsetSetup(offsetArg, targetSizeInBytes)
  2835  	if err != nil {
  2836  		return err
  2837  	}
  2838  
  2839  	// "[arm64ReservedRegisterForMemory + offsetReg] = val.register"
  2840  	// In other words, "memory.Buffer[offset: offset+targetSizeInBytes] = val.register"
  2841  	c.assembler.CompileRegisterToMemoryWithRegisterOffset(
  2842  		storeInst, val.register,
  2843  		arm64ReservedRegisterForMemory, offsetReg,
  2844  	)
  2845  
  2846  	c.markRegisterUnused(val.register)
  2847  	return nil
  2848  }
  2849  
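        // The store path mirrors compileLoadImpl: the value is popped first and kept
        // marked as used so that compileMemoryAccessOffsetSetup cannot reallocate its
        // register. Roughly, for a 32-bit store (STRW), the generated code behaves like:
        //
        //	binary.LittleEndian.PutUint32(memory.Buffer[offset:offset+4], uint32(val))
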
  2850  // compileMemoryAccessOffsetSetup pops the top value from the stack (called "base"), stores "base + offsetArg"
  2851  // into a register, and returns the stored register. We call the result "offset" because we access the memory
  2852  // as memory.Buffer[offset: offset+targetSizeInBytes].
  2853  //
  2854  // Note: this also emits the instructions to check for out-of-bounds memory access.
  2855  // In other words, if offset+targetSizeInBytes exceeds the memory size, the code exits with the nativeCallStatusCodeMemoryOutOfBounds status.
  2856  func (c *arm64Compiler) compileMemoryAccessOffsetSetup(offsetArg uint32, targetSizeInBytes int64) (offsetRegister asm.Register, err error) {
  2857  	base, err := c.popValueOnRegister()
  2858  	if err != nil {
  2859  		return 0, err
  2860  	}
  2861  
  2862  	offsetRegister = base.register
  2863  	if isZeroRegister(base.register) {
  2864  		offsetRegister, err = c.allocateRegister(registerTypeGeneralPurpose)
  2865  		if err != nil {
  2866  			return
  2867  		}
  2868  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, offsetRegister)
  2869  	}
  2870  
  2871  	if offsetConst := int64(offsetArg) + targetSizeInBytes; offsetConst <= math.MaxUint32 {
  2872  		// "offsetRegister = base + offsetArg + targetSizeInBytes"
  2873  		c.assembler.CompileConstToRegister(arm64.ADD, offsetConst, offsetRegister)
  2874  	} else {
  2875  		// If the offset const is too large, we exit with nativeCallStatusCodeMemoryOutOfBounds.
  2876  		c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
  2877  		return
  2878  	}
  2879  
  2880  	// "arm64ReservedRegisterForTemporary = len(memory.Buffer)"
  2881  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  2882  		arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  2883  		arm64ReservedRegisterForTemporary)
  2884  
  2885  	// Check if offsetRegister(= base+offsetArg+targetSizeInBytes) > len(memory.Buffer).
  2886  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, offsetRegister)
  2887  
  2888  	// If offsetRegister(= base+offsetArg+targetSizeInBytes) exceeds the memory length,
  2889  	//  we exit the function with nativeCallStatusCodeMemoryOutOfBounds.
  2890  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, nativeCallStatusCodeMemoryOutOfBounds)
  2891  
  2892  	// Otherwise, we subtract targetSizeInBytes from offsetRegister.
  2893  	c.assembler.CompileConstToRegister(arm64.SUB, targetSizeInBytes, offsetRegister)
  2894  	return offsetRegister, nil
  2895  }
  2896  
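        // The emitted bounds check above is equivalent to the following Go logic
        // (an illustrative sketch; the exit helper name is hypothetical):
        //
        //	ceil := uint64(base) + uint64(offsetArg) + uint64(targetSizeInBytes)
        //	if ceil > uint64(len(memory.Buffer)) {
        //		exitWith(nativeCallStatusCodeMemoryOutOfBounds)
        //	}
        //	offset := ceil - uint64(targetSizeInBytes) // value left in offsetRegister
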
  2897  // compileMemoryGrow implements compiler.compileMemoryGrow for the arm64 architecture.
  2898  func (c *arm64Compiler) compileMemoryGrow() error {
  2899  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  2900  		return err
  2901  	}
  2902  
  2903  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexMemoryGrow); err != nil {
  2904  		return err
  2905  	}
  2906  
  2907  	// After return, we re-initialize the reserved registers just like in the function preamble.
  2908  	c.compileReservedStackBasePointerRegisterInitialization()
  2909  	c.compileReservedMemoryRegisterInitialization()
  2910  	return nil
  2911  }
  2912  
  2913  // compileMemorySize implements compiler.compileMemorySize for the arm64 architecture.
  2914  func (c *arm64Compiler) compileMemorySize() error {
  2915  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  2916  		return err
  2917  	}
  2918  
  2919  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2920  	if err != nil {
  2921  		return err
  2922  	}
  2923  
  2924  	// "reg = len(memory.Buffer)"
  2925  	c.assembler.CompileMemoryToRegister(
  2926  		arm64.LDRD,
  2927  		arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  2928  		reg,
  2929  	)
  2930  
  2931  	// memory.size returns the size in pages, so we divide the byte length by the page size.
  2932  	// "reg = reg >> wasm.MemoryPageSizeInBits (== reg / wasm.MemoryPageSize)"
  2933  	c.assembler.CompileConstToRegister(
  2934  		arm64.LSR,
  2935  		wasm.MemoryPageSizeInBits,
  2936  		reg,
  2937  	)
  2938  
  2939  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
  2940  	return nil
  2941  }
  2942  
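        // Since a Wasm page is 64 KiB (1<<16 bytes), the logical right shift above
        // computes the page count without a division, roughly:
        //
        //	pages := uint32(uint64(len(memory.Buffer)) >> wasm.MemoryPageSizeInBits) // == len / 65536
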
  2943  // compileCallGoFunction adds instructions to call a Go function identified by the builtinFunction index parameter.
  2944  // compilerStatus is set before making the call, and it should be either nativeCallStatusCodeCallBuiltInFunction or
  2945  // nativeCallStatusCodeCallGoHostFunction.
  2946  func (c *arm64Compiler) compileCallGoFunction(compilerStatus nativeCallStatusCode, builtinFunction wasm.Index) error {
  2947  	// Release all the registers as our calling convention requires the caller to save them.
  2948  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  2949  		return err
  2950  	}
  2951  
  2952  	if compilerStatus == nativeCallStatusCodeCallBuiltInFunction {
  2953  		// Set the index of the target builtin function to ce.builtinFunctionCallIndex.
  2954  		// "tmp = $builtinFunction"
  2955  		c.assembler.CompileConstToRegister(
  2956  			arm64.MOVD,
  2957  			int64(builtinFunction),
  2958  			arm64ReservedRegisterForTemporary,
  2959  		)
  2960  		// "[arm64ReservedRegisterForCallEngine + callEngineExitContextBuiltinFunctionCallIndexOffset] = tmp"
  2961  		// In other words, "ce.builtinFunctionCallIndex = tmp (== $builtinFunction)"
  2962  		c.assembler.CompileRegisterToMemory(
  2963  			arm64.STRW,
  2964  			arm64ReservedRegisterForTemporary,
  2965  			arm64ReservedRegisterForCallEngine, callEngineExitContextBuiltinFunctionCallIndexOffset,
  2966  		)
  2967  	}
  2968  
  2969  	c.compileExitFromNativeCode(compilerStatus)
  2970  	return nil
  2971  }
  2972  
  2973  // compileConstI32 implements compiler.compileConstI32 for the arm64 architecture.
  2974  func (c *arm64Compiler) compileConstI32(o *wazeroir.UnionOperation) error {
  2975  	return c.compileIntConstant(true, o.U1)
  2976  }
  2977  
  2978  // compileConstI64 implements compiler.compileConstI64 for the arm64 architecture.
  2979  func (c *arm64Compiler) compileConstI64(o *wazeroir.UnionOperation) error {
  2980  	return c.compileIntConstant(false, o.U1)
  2981  }
  2982  
  2983  // compileIntConstant adds instructions to load an integer constant.
  2984  // is32bit is true if the target value is originally 32-bit const, false otherwise.
  2985  // value holds the (zero-extended for 32-bit case) load target constant.
  2986  func (c *arm64Compiler) compileIntConstant(is32bit bool, value uint64) error {
  2987  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  2988  		return err
  2989  	}
  2990  
  2991  	var inst asm.Instruction
  2992  	var vt runtimeValueType
  2993  	if is32bit {
  2994  		inst = arm64.MOVW
  2995  		vt = runtimeValueTypeI32
  2996  	} else {
  2997  		inst = arm64.MOVD
  2998  		vt = runtimeValueTypeI64
  2999  	}
  3000  
  3001  	if value == 0 {
  3002  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, vt)
  3003  	} else {
  3004  		// Take a register to load the value.
  3005  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  3006  		if err != nil {
  3007  			return err
  3008  		}
  3009  
  3010  		c.assembler.CompileConstToRegister(inst, int64(value), reg)
  3011  
  3012  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  3013  	}
  3014  	return nil
  3015  }
  3016  
  3017  // compileConstF32 implements compiler.compileConstF32 for the arm64 architecture.
  3018  func (c *arm64Compiler) compileConstF32(o *wazeroir.UnionOperation) error {
  3019  	return c.compileFloatConstant(true, o.U1 /*uint64(math.Float32bits(o.Value))*/)
  3020  }
  3021  
  3022  // compileConstF64 implements compiler.compileConstF64 for the arm64 architecture.
  3023  func (c *arm64Compiler) compileConstF64(o *wazeroir.UnionOperation) error {
  3024  	return c.compileFloatConstant(false, o.U1 /*math.Float64bits(o.Value)*/)
  3025  }
  3026  
  3027  // compileFloatConstant adds instructions to load a float constant.
  3028  // is32bit is true if the target value is originally 32-bit const, false otherwise.
  3029  // value holds the (zero-extended for 32-bit case) bit representation of load target float constant.
  3030  func (c *arm64Compiler) compileFloatConstant(is32bit bool, value uint64) error {
  3031  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3032  		return err
  3033  	}
  3034  
  3035  	// Take a register to load the value.
  3036  	reg, err := c.allocateRegister(registerTypeVector)
  3037  	if err != nil {
  3038  		return err
  3039  	}
  3040  
  3041  	tmpReg := arm64.RegRZR
  3042  	if value != 0 {
  3043  		tmpReg = arm64ReservedRegisterForTemporary
  3044  		var inst asm.Instruction
  3045  		if is32bit {
  3046  			inst = arm64.MOVW
  3047  		} else {
  3048  			inst = arm64.MOVD
  3049  		}
  3050  		c.assembler.CompileConstToRegister(inst, int64(value), tmpReg)
  3051  	}
  3052  
  3053  	// Use FMOV instruction to move the value on integer register into the float one.
  3054  	var inst asm.Instruction
  3055  	var vt runtimeValueType
  3056  	if is32bit {
  3057  		vt = runtimeValueTypeF32
  3058  		inst = arm64.FMOVS
  3059  	} else {
  3060  		vt = runtimeValueTypeF64
  3061  		inst = arm64.FMOVD
  3062  	}
  3063  	c.assembler.CompileRegisterToRegister(inst, tmpReg, reg)
  3064  
  3065  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  3066  	return nil
  3067  }
  3068  
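        // The FMOV above transfers the raw bit pattern from the integer register into
        // the float register, so the pushed value is effectively (illustrative):
        //
        //	f64 := math.Float64frombits(value)         // 64-bit case
        //	f32 := math.Float32frombits(uint32(value)) // 32-bit case
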
  3069  // compileMemoryInit implements compiler.compileMemoryInit for the arm64 architecture.
  3070  func (c *arm64Compiler) compileMemoryInit(o *wazeroir.UnionOperation) error {
  3071  	dataIndex := uint32(o.U1)
  3072  	return c.compileInitImpl(false, dataIndex, 0)
  3073  }
  3074  
  3075  // compileInitImpl implements compileTableInit and compileMemoryInit.
  3076  //
  3077  // TODO: the compiled code in this function should be reused and compiled at once, as
  3078  // the code is independent of any module.
  3079  func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) error {
  3080  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3081  	if isTable {
  3082  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3083  	}
  3084  
  3085  	copySize, err := c.popValueOnRegister()
  3086  	if err != nil {
  3087  		return err
  3088  	}
  3089  	c.markRegisterUsed(copySize.register)
  3090  
  3091  	sourceOffset, err := c.popValueOnRegister()
  3092  	if err != nil {
  3093  		return err
  3094  	}
  3095  	if isZeroRegister(sourceOffset.register) {
  3096  		sourceOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3097  		if err != nil {
  3098  			return err
  3099  		}
  3100  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, sourceOffset.register)
  3101  	}
  3102  	c.markRegisterUsed(sourceOffset.register)
  3103  
  3104  	destinationOffset, err := c.popValueOnRegister()
  3105  	if err != nil {
  3106  		return err
  3107  	}
  3108  	if isZeroRegister(destinationOffset.register) {
  3109  		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3110  		if err != nil {
  3111  			return err
  3112  		}
  3113  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
  3114  	}
  3115  	c.markRegisterUsed(destinationOffset.register)
  3116  
  3117  	tableInstanceAddressReg := asm.NilRegister
  3118  	if isTable {
  3119  		tableInstanceAddressReg, err = c.allocateRegister(registerTypeGeneralPurpose)
  3120  		if err != nil {
  3121  			return err
  3122  		}
  3123  		c.markRegisterUsed(tableInstanceAddressReg)
  3124  	}
  3125  
  3126  	if !isZeroRegister(copySize.register) {
  3127  		// sourceOffset += size.
  3128  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, sourceOffset.register)
  3129  		// destinationOffset += size.
  3130  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, destinationOffset.register)
  3131  	}
  3132  
  3133  	instanceAddr, err := c.allocateRegister(registerTypeGeneralPurpose)
  3134  	if err != nil {
  3135  		return err
  3136  	}
  3137  
  3138  	if isTable {
  3139  		c.compileLoadElemInstanceAddress(index, instanceAddr)
  3140  	} else {
  3141  		c.compileLoadDataInstanceAddress(index, instanceAddr)
  3142  	}
  3143  
  3144  	// Check the data (or element) instance bounds.
  3145  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3146  		instanceAddr, 8, // Both DataInstance and ElementInstance store their length at offset 8.
  3147  		arm64ReservedRegisterForTemporary)
  3148  
  3149  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register)
  3150  	// If not, raise the out-of-bounds access error.
  3151  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3152  
  3153  	// Otherwise, the source access is in bounds.
  3154  	// Next, check the destination bounds.
  3155  	if isTable {
  3156  		// arm64ReservedRegisterForTemporary = &tables[0]
  3157  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3158  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3159  			arm64ReservedRegisterForTemporary)
  3160  		// tableInstanceAddressReg = [arm64ReservedRegisterForTemporary + tableIndex*8]
  3161  		//                         = [&tables[0] + tableIndex*sizeOf(*tableInstance)]
  3162  		//                         = tables[tableIndex]
  3163  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3164  			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
  3165  			tableInstanceAddressReg)
  3166  		// arm64ReservedRegisterForTemporary = [tableInstanceAddressReg+tableInstanceTableLenOffset] = len(tables[tableIndex])
  3167  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3168  			tableInstanceAddressReg, tableInstanceTableLenOffset,
  3169  			arm64ReservedRegisterForTemporary)
  3170  	} else {
  3171  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3172  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  3173  			arm64ReservedRegisterForTemporary)
  3174  	}
  3175  
  3176  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
  3177  	// If not, raise the out-of-bounds access error.
  3178  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3179  
  3180  	// Otherwise, ready to copy the value from source to destination.
  3181  	if !isZeroRegister(copySize.register) {
  3182  		// If the size equals zero, we can skip the instructions below entirely.
  3183  		c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register)
  3184  		skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)
  3185  
  3186  		var ldr, str asm.Instruction
  3187  		var movSize int64
  3188  		if isTable {
  3189  			ldr, str = arm64.LDRD, arm64.STRD
  3190  			movSize = 8
  3191  
  3192  			// arm64ReservedRegisterForTemporary = &Table[0]
  3193  			c.assembler.CompileMemoryToRegister(arm64.LDRD, tableInstanceAddressReg,
  3194  				tableInstanceTableOffset, arm64ReservedRegisterForTemporary)
  3195  			// destinationOffset = (destinationOffset << pointerSizeLog2) + arm64ReservedRegisterForTemporary
  3196  			c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3197  				destinationOffset.register, pointerSizeLog2,
  3198  				arm64ReservedRegisterForTemporary, destinationOffset.register)
  3199  
  3200  			// arm64ReservedRegisterForTemporary = &ElementInstance.References[0]
  3201  			c.assembler.CompileMemoryToRegister(arm64.LDRD, instanceAddr, 0, arm64ReservedRegisterForTemporary)
  3202  			// sourceOffset = (sourceOffset << pointerSizeLog2) + arm64ReservedRegisterForTemporary
  3203  			c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3204  				sourceOffset.register, pointerSizeLog2,
  3205  				arm64ReservedRegisterForTemporary, sourceOffset.register)
  3206  
  3207  			// copySize = copySize << pointerSizeLog2
  3208  			c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
  3209  		} else {
  3210  			ldr, str = arm64.LDRB, arm64.STRB
  3211  			movSize = 1
  3212  
  3213  			// destinationOffset += memory buffer's absolute address.
  3214  			c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3215  
  3216  			// sourceOffset += data buffer's absolute address.
  3217  			c.assembler.CompileMemoryToRegister(arm64.LDRD, instanceAddr, 0, arm64ReservedRegisterForTemporary)
  3218  			c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, sourceOffset.register)
  3219  
  3220  		}
  3221  
  3222  		// Negate the counter.
  3223  		c.assembler.CompileRegisterToRegister(arm64.NEG, copySize.register, copySize.register)
  3224  
  3225  		beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3226  
  3227  		// arm64ReservedRegisterForTemporary = [sourceOffset + (copySize.register)]
  3228  		c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
  3229  			sourceOffset.register, copySize.register,
  3230  			arm64ReservedRegisterForTemporary)
  3231  		// [destinationOffset + (copySize.register)] = arm64ReservedRegisterForTemporary.
  3232  		c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3233  			arm64ReservedRegisterForTemporary,
  3234  			destinationOffset.register, copySize.register,
  3235  		)
  3236  
  3237  		// Advance the negated counter by movSize; while it is still negative, continue the loop.
  3238  		c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register)
  3239  		c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop)
  3240  
  3241  		c.assembler.SetJumpTargetOnNext(skipCopyJump)
  3242  	}
  3243  
  3244  	c.markRegisterUnused(copySize.register, sourceOffset.register,
  3245  		destinationOffset.register, instanceAddr, tableInstanceAddressReg)
  3246  	return nil
  3247  }
  3248  
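        // For reference, the code generated above implements roughly the following
        // memory.init semantics (table.init is identical modulo the 8-byte element
        // size); this is an illustrative sketch, not the actual runtime path:
        //
        //	if srcOffset+size > len(data) || dstOffset+size > len(memory.Buffer) {
        //		trap(outOfBounds)
        //	}
        //	copy(memory.Buffer[dstOffset:dstOffset+size], data[srcOffset:srcOffset+size])
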
  3249  // compileDataDrop implements compiler.compileDataDrop for the arm64 architecture.
  3250  func (c *arm64Compiler) compileDataDrop(o *wazeroir.UnionOperation) error {
  3251  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3252  		return err
  3253  	}
  3254  
  3255  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3256  	if err != nil {
  3257  		return err
  3258  	}
  3259  
  3260  	dataIndex := uint32(o.U1)
  3261  	c.compileLoadDataInstanceAddress(dataIndex, tmp)
  3262  
  3263  	// Clears the content of DataInstances[dataIndex] (== []byte type).
  3264  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 0)
  3265  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 8)
  3266  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 16)
  3267  	return nil
  3268  }
  3269  
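        // Storing the zero register at offsets 0, 8, and 16 zeroes the data, len, and
        // cap words of the Go slice header backing the data instance, which is
        // conceptually the same as:
        //
        //	dataInstances[dataIndex] = nil
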
  3270  func (c *arm64Compiler) compileLoadDataInstanceAddress(dataIndex uint32, dst asm.Register) {
  3271  	// dst = dataIndex * dataInstanceStructSize
  3272  	c.assembler.CompileConstToRegister(arm64.MOVD, int64(dataIndex)*dataInstanceStructSize, dst)
  3273  
  3274  	// arm64ReservedRegisterForTemporary = &moduleInstance.DataInstances[0]
  3275  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3276  		arm64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  3277  		arm64ReservedRegisterForTemporary,
  3278  	)
  3279  
  3280  	// dst = arm64ReservedRegisterForTemporary + dst
  3281  	//     = &moduleInstance.DataInstances[0] + dataIndex*dataInstanceStructSize
  3282  	//     = &moduleInstance.DataInstances[dataIndex]
  3283  	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, dst)
  3284  }
  3285  
  3286  // compileMemoryCopy implements compiler.compileMemoryCopy for the arm64 architecture.
  3287  func (c *arm64Compiler) compileMemoryCopy() error {
  3288  	return c.compileCopyImpl(false, 0, 0)
  3289  }
  3290  
  3291  // compileCopyImpl implements compileTableCopy and compileMemoryCopy.
  3292  //
  3293  // TODO: the compiled code in this function should be reused and compiled at once, as
  3294  // the code is independent of any module.
  3295  func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableIndex uint32) error {
  3296  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3297  	if isTable {
  3298  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3299  	}
  3300  
  3301  	copySize, err := c.popValueOnRegister()
  3302  	if err != nil {
  3303  		return err
  3304  	}
  3305  	c.markRegisterUsed(copySize.register)
  3306  
  3307  	sourceOffset, err := c.popValueOnRegister()
  3308  	if err != nil {
  3309  		return err
  3310  	}
  3311  	if isZeroRegister(sourceOffset.register) {
  3312  		sourceOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3313  		if err != nil {
  3314  			return err
  3315  		}
  3316  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, sourceOffset.register)
  3317  	}
  3318  	c.markRegisterUsed(sourceOffset.register)
  3319  
  3320  	destinationOffset, err := c.popValueOnRegister()
  3321  	if err != nil {
  3322  		return err
  3323  	}
  3324  	if isZeroRegister(destinationOffset.register) {
  3325  		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3326  		if err != nil {
  3327  			return err
  3328  		}
  3329  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
  3330  	}
  3331  	c.markRegisterUsed(destinationOffset.register)
  3332  
  3333  	if !isZeroRegister(copySize.register) {
  3334  		// sourceOffset += size.
  3335  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, sourceOffset.register)
  3336  		// destinationOffset += size.
  3337  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, destinationOffset.register)
  3338  	}
  3339  
  3340  	if isTable {
  3341  		// arm64ReservedRegisterForTemporary = &tables[0]
  3342  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3343  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3344  			arm64ReservedRegisterForTemporary)
  3345  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + srcTableIndex*8]
  3346  		//                                   = [&tables[0] + srcTableIndex*sizeOf(*tableInstance)]
  3347  		//                                   = tables[srcTableIndex]
  3348  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3349  			arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
  3350  			arm64ReservedRegisterForTemporary)
  3351  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[srcTableIndex])
  3352  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3353  			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3354  			arm64ReservedRegisterForTemporary)
  3355  	} else {
  3356  		// arm64ReservedRegisterForTemporary = len(memoryInst.Buffer).
  3357  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3358  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  3359  			arm64ReservedRegisterForTemporary)
  3360  	}
  3361  
  3362  	// Check len (of the memory buffer or table) >= sourceOffset.
  3363  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register)
  3364  	// If not, raise the out-of-bounds access error.
  3365  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3366  
  3367  	// Otherwise, check len >= destinationOffset.
  3368  	if isTable {
  3369  		// arm64ReservedRegisterForTemporary = &tables[0]
  3370  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3371  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3372  			arm64ReservedRegisterForTemporary)
  3373  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + dstTableIndex*8]
  3374  		//                                   = [&tables[0] + dstTableIndex*sizeOf(*tableInstance)]
  3375  		//                                   = tables[dstTableIndex]
  3376  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3377  			arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
  3378  			arm64ReservedRegisterForTemporary)
  3379  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[dstTableIndex])
  3380  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3381  			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3382  			arm64ReservedRegisterForTemporary)
  3383  	}
  3384  
  3385  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
  3386  	// If not, raise the out-of-bounds access error.
  3387  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3388  
  3389  	// Otherwise, ready to copy the value from source to destination.
  3390  	var ldr, str asm.Instruction
  3391  	var movSize int64
  3392  	if isTable {
  3393  		ldr, str = arm64.LDRD, arm64.STRD
  3394  		movSize = 8
  3395  	} else {
  3396  		ldr, str = arm64.LDRB, arm64.STRB
  3397  		movSize = 1
  3398  	}
  3399  
  3400  	// If the size equals zero, we can skip the instructions below entirely.
  3401  	if !isZeroRegister(copySize.register) {
  3402  		c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register)
  3403  		skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)
  3404  
  3405  		// If source offset < destination offset: for (i = size-1; i >= 0; i--) dst[i] = src[i];
  3406  		c.assembler.CompileTwoRegistersToNone(arm64.CMP, sourceOffset.register, destinationOffset.register)
  3407  		destLowerThanSourceJump := c.assembler.CompileJump(arm64.BCONDLS)
  3408  		var endJump asm.Node
  3409  		{
  3410  			// sourceOffset -= size.
  3411  			c.assembler.CompileRegisterToRegister(arm64.SUB, copySize.register, sourceOffset.register)
  3412  			// destinationOffset -= size.
  3413  			c.assembler.CompileRegisterToRegister(arm64.SUB, copySize.register, destinationOffset.register)
  3414  
  3415  			if isTable {
  3416  				// arm64ReservedRegisterForTemporary = &Tables[dstTableIndex].Table[0]
  3417  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3418  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3419  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3420  					arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
  3421  					arm64ReservedRegisterForTemporary)
  3422  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3423  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3424  					arm64ReservedRegisterForTemporary)
  3425  				// destinationOffset = (destinationOffset << pointerSizeLog2) + &Table[dstTableIndex].Table[0]
  3426  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3427  					destinationOffset.register, pointerSizeLog2,
  3428  					arm64ReservedRegisterForTemporary, destinationOffset.register)
  3429  
  3430  				// arm64ReservedRegisterForTemporary = &Tables[srcTableIndex]
  3431  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3432  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3433  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3434  					arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
  3435  					arm64ReservedRegisterForTemporary)
  3436  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3437  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3438  					arm64ReservedRegisterForTemporary)
  3439  				// sourceOffset = (sourceOffset << pointerSizeLog2) + &Table[srcTableIndex].Table[0]
  3440  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3441  					sourceOffset.register, pointerSizeLog2,
  3442  					arm64ReservedRegisterForTemporary, sourceOffset.register)
  3443  
  3444  				// copySize = copySize << pointerSizeLog2 as each element has 8 bytes and we copy one by one.
  3445  				c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
  3446  			} else {
  3447  				// sourceOffset += memory buffer's absolute address.
  3448  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, sourceOffset.register)
  3449  				// destinationOffset += memory buffer's absolute address.
  3450  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3451  			}
  3452  
  3453  			beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3454  
  3455  			// size -= movSize
  3456  			c.assembler.CompileConstToRegister(arm64.SUBS, movSize, copySize.register)
  3457  
  3458  			// arm64ReservedRegisterForTemporary = [sourceOffset + (copySize.register)]
  3459  			c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
  3460  				sourceOffset.register, copySize.register,
  3461  				arm64ReservedRegisterForTemporary)
  3462  			// [destinationOffset + (copySize.register)] = arm64ReservedRegisterForTemporary.
  3463  			c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3464  				arm64ReservedRegisterForTemporary,
  3465  				destinationOffset.register, copySize.register,
  3466  			)
  3467  
  3468  			// If the value in copySize.register is not zero, continue the loop.
  3469  			c.assembler.CompileJump(arm64.BCONDNE).AssignJumpTarget(beginCopyLoop)
  3470  
  3471  			// Otherwise, exit the loop.
  3472  			endJump = c.assembler.CompileJump(arm64.B)
  3473  		}
  3474  
  3475  		// Else (destination offset <= source offset): for (i = 0; i < size; i++) dst[i] = src[i];
  3476  		c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
  3477  		{
  3478  
  3479  			if isTable {
  3480  				// arm64ReservedRegisterForTemporary = &Tables[dstTableIndex].Table[0]
  3481  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3482  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3483  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3484  					arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
  3485  					arm64ReservedRegisterForTemporary)
  3486  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3487  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3488  					arm64ReservedRegisterForTemporary)
  3489  				// destinationOffset = (destinationOffset << pointerSizeLog2) + &Table[dstTableIndex].Table[0]
  3490  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3491  					destinationOffset.register, pointerSizeLog2,
  3492  					arm64ReservedRegisterForTemporary, destinationOffset.register)
  3493  
  3494  				// arm64ReservedRegisterForTemporary = &Tables[srcTableIndex]
  3495  				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3496  					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3497  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3498  					arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
  3499  					arm64ReservedRegisterForTemporary)
  3500  				c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3501  					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3502  					arm64ReservedRegisterForTemporary)
  3503  				// sourceOffset = (sourceOffset << pointerSizeLog2) + &Table[srcTableIndex].Table[0]
  3504  				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3505  					sourceOffset.register, pointerSizeLog2,
  3506  					arm64ReservedRegisterForTemporary, sourceOffset.register)
  3507  
  3508  				// copySize = copySize << pointerSizeLog2 as each element has 8 bytes and we copy one by one.
  3509  				c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
  3510  			} else {
  3511  				// sourceOffset += memory buffer's absolute address.
  3512  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, sourceOffset.register)
  3513  				// destinationOffset += memory buffer's absolute address.
  3514  				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3515  			}
  3516  
  3517  			// Negate the counter.
  3518  			c.assembler.CompileRegisterToRegister(arm64.NEG, copySize.register, copySize.register)
  3519  
  3520  			beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3521  
  3522  			// arm64ReservedRegisterForTemporary = [sourceOffset + (copySize.register)]
  3523  			c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
  3524  				sourceOffset.register, copySize.register,
  3525  				arm64ReservedRegisterForTemporary)
  3526  			// [destinationOffset + (copySize.register)] = arm64ReservedRegisterForTemporary.
  3527  			c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3528  				arm64ReservedRegisterForTemporary,
  3529  				destinationOffset.register, copySize.register,
  3530  			)
  3531  
  3532  			// size += movSize
  3533  			c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register)
  3534  			c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop)
  3535  		}
  3536  		c.assembler.SetJumpTargetOnNext(skipCopyJump)
  3537  		c.assembler.SetJumpTargetOnNext(endJump)
  3538  	}
  3539  
  3540  	// Mark all of the operand registers as unused.
  3541  	c.markRegisterUnused(copySize.register, sourceOffset.register, destinationOffset.register)
  3542  
  3543  	return nil
  3544  }
  3545  
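        // Like memmove, the copy above chooses the iteration direction based on how
        // the two regions overlap; conceptually:
        //
        //	if srcOffset < dstOffset {
        //		for i := size - 1; i >= 0; i-- { // copy backwards
        //			dst[i] = src[i]
        //		}
        //	} else {
        //		for i := 0; i < size; i++ { // copy forwards
        //			dst[i] = src[i]
        //		}
        //	}
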
  3546  // compileMemoryFill implements compiler.compileMemoryFill for the arm64 architecture.
  3547  func (c *arm64Compiler) compileMemoryFill() error {
  3548  	return c.compileFillImpl(false, 0)
  3549  }
  3550  
  3551  // compileFillImpl implements compileTableFill and compileMemoryFill.
  3552  //
  3553  // TODO: the compiled code in this function should be reused and compiled at once, as
  3554  // the code is independent of any module.
  3555  func (c *arm64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error {
  3556  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3557  	if isTable {
  3558  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3559  	}
  3560  
  3561  	fillSize, err := c.popValueOnRegister()
  3562  	if err != nil {
  3563  		return err
  3564  	}
  3565  	c.markRegisterUsed(fillSize.register)
  3566  
  3567  	value, err := c.popValueOnRegister()
  3568  	if err != nil {
  3569  		return err
  3570  	}
  3571  	c.markRegisterUsed(value.register)
  3572  
  3573  	destinationOffset, err := c.popValueOnRegister()
  3574  	if err != nil {
  3575  		return err
  3576  	}
  3577  	if isZeroRegister(destinationOffset.register) {
  3578  		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  3579  		if err != nil {
  3580  			return err
  3581  		}
  3582  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
  3583  	}
  3584  	c.markRegisterUsed(destinationOffset.register)
  3585  
  3586  	// destinationOffset += size.
  3587  	c.assembler.CompileRegisterToRegister(arm64.ADD, fillSize.register, destinationOffset.register)
  3588  
  3589  	if isTable {
  3590  		// arm64ReservedRegisterForTemporary = &tables[0]
  3591  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3592  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3593  			arm64ReservedRegisterForTemporary)
  3594  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + tableIndex*8]
  3595  		//                                   = [&tables[0] + tableIndex*sizeOf(*tableInstance)]
  3596  		//                                   = tables[tableIndex]
  3597  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3598  			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
  3599  			arm64ReservedRegisterForTemporary)
  3600  		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[tableIndex])
  3601  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3602  			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3603  			arm64ReservedRegisterForTemporary)
  3604  	} else {
  3605  		// arm64ReservedRegisterForTemporary = len(memoryInst.Buffer).
  3606  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3607  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  3608  			arm64ReservedRegisterForTemporary)
  3609  	}
  3610  
  3611  	// Check len >= destinationOffset.
  3612  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
  3613  
  3614  	// If not, raise the runtime error.
  3615  	c.compileMaybeExitFromNativeCode(arm64.BCONDLS, outOfBoundsErrorStatus)
  3616  
  3617  	// Otherwise, ready to fill the destination with the value.
  3618  	// If the size equals zero, we can skip the instructions below entirely.
  3619  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, fillSize.register)
  3620  	skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)
  3621  
  3622  	// destinationOffset -= size.
  3623  	c.assembler.CompileRegisterToRegister(arm64.SUB, fillSize.register, destinationOffset.register)
  3624  
  3625  	var str asm.Instruction
  3626  	var movSize int64
  3627  	if isTable {
  3628  		str = arm64.STRD
  3629  		movSize = 8
  3630  
  3631  		// arm64ReservedRegisterForTemporary = &Tables[tableIndex].Table[0]
  3632  		c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
  3633  			callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
  3634  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3635  			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
  3636  			arm64ReservedRegisterForTemporary)
  3637  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3638  			arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3639  			arm64ReservedRegisterForTemporary)
  3640  		// destinationOffset = (destinationOffset << pointerSizeLog2) + &Table[tableIndex].Table[0]
  3641  		c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3642  			destinationOffset.register, pointerSizeLog2,
  3643  			arm64ReservedRegisterForTemporary, destinationOffset.register)
  3644  
  3645  		// fillSize = fillSize << pointerSizeLog2 as each element has 8 bytes and we fill one by one.
  3646  		c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, fillSize.register)
  3647  	} else {
  3648  		str = arm64.STRB
  3649  		movSize = 1
  3650  
  3651  		// destinationOffset += memory buffer's absolute address.
  3652  		c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3653  	}
  3654  
  3655  	// Naively implement the fill with a "for loop", storing one element per iteration.
  3656  	beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3657  
  3658  	// size -= movSize
  3659  	c.assembler.CompileConstToRegister(arm64.SUBS, movSize, fillSize.register)
  3660  
  3661  	// [destinationOffset + (fillSize.register)] = value.register.
  3662  	c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3663  		value.register,
  3664  		destinationOffset.register, fillSize.register,
  3665  	)
  3666  
  3667  	// If the value in fillSize.register is not zero, continue the loop.
  3668  	continueJump := c.assembler.CompileJump(arm64.BCONDNE)
  3669  	continueJump.AssignJumpTarget(beginCopyLoop)
  3670  
  3671  	// Mark all of the operand registers as unused.
  3672  	c.markRegisterUnused(fillSize.register, value.register, destinationOffset.register)
  3673  
  3674  	c.assembler.SetJumpTargetOnNext(skipCopyJump)
  3675  	return nil
  3676  }
  3677  
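        // The generated loop above implements roughly the following memory.fill /
        // table.fill semantics (an illustrative sketch):
        //
        //	if dstOffset+size > len(buf) {
        //		trap(outOfBounds)
        //	}
        //	for i := 0; i < size; i++ {
        //		buf[dstOffset+i] = value
        //	}
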
  3678  // compileTableInit implements compiler.compileTableInit for the arm64 architecture.
  3679  func (c *arm64Compiler) compileTableInit(o *wazeroir.UnionOperation) error {
  3680  	elemIndex := uint32(o.U1)
  3681  	tableIndex := uint32(o.U2)
  3682  	return c.compileInitImpl(true, elemIndex, tableIndex)
  3683  }
  3684  
  3685  // compileTableCopy implements compiler.compileTableCopy for the arm64 architecture.
  3686  func (c *arm64Compiler) compileTableCopy(o *wazeroir.UnionOperation) error {
  3687  	return c.compileCopyImpl(true, uint32(o.U1), uint32(o.U2))
  3688  }
  3689  
  3690  // compileElemDrop implements compiler.compileElemDrop for the arm64 architecture.
  3691  func (c *arm64Compiler) compileElemDrop(o *wazeroir.UnionOperation) error {
  3692  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3693  		return err
  3694  	}
  3695  
  3696  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3697  	if err != nil {
  3698  		return err
  3699  	}
  3700  
  3701  	elemIndex := uint32(o.U1)
  3702  	c.compileLoadElemInstanceAddress(elemIndex, tmp)
  3703  
  3704  	// Clears the content of ElementInstances[elemIndex] (== []interface{} type).
  3705  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 0)
  3706  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 8)
  3707  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 16)
  3708  	return nil
  3709  }
  3710  
  3711  func (c *arm64Compiler) compileLoadElemInstanceAddress(elemIndex uint32, dst asm.Register) {
  3712  	// dst = elemIndex * elementInstanceStructSize
  3713  	c.assembler.CompileConstToRegister(arm64.MOVD, int64(elemIndex)*elementInstanceStructSize, dst)
  3714  
  3715  	// arm64ReservedRegisterForTemporary = &moduleInstance.ElementInstances[0]
  3716  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3717  		arm64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  3718  		arm64ReservedRegisterForTemporary,
  3719  	)
  3720  
  3721  	// dst = arm64ReservedRegisterForTemporary + dst
  3722  	//     = &moduleInstance.ElementInstances[0] + elemIndex*elementInstanceStructSize
  3723  	//     = &moduleInstance.ElementInstances[elemIndex]
  3724  	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, dst)
  3725  }
  3726  
  3727  // compileRefFunc implements compiler.compileRefFunc for the arm64 architecture.
  3728  func (c *arm64Compiler) compileRefFunc(o *wazeroir.UnionOperation) error {
  3729  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3730  		return err
  3731  	}
  3732  
  3733  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  3734  	if err != nil {
  3735  		return err
  3736  	}
  3737  	// ref = [arm64ReservedRegisterForCallEngine + callEngineModuleContextFunctionsElement0AddressOffset]
  3738  	//     = &moduleEngine.functions[0]
  3739  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3740  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  3741  		ref)
  3742  
  3743  	// ref = ref + functionIndex*sizeOf(function)
  3744  	//     = &moduleEngine.functions[index]
  3745  	functionIndex := int64(o.U1)
  3746  	c.assembler.CompileConstToRegister(arm64.ADD,
  3747  		functionIndex*functionSize,
  3748  		ref,
  3749  	)
  3750  
  3751  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64)
  3752  	return nil
  3753  }
  3754  
  3755  // compileTableGet implements compiler.compileTableGet for the arm64 architecture.
  3756  func (c *arm64Compiler) compileTableGet(o *wazeroir.UnionOperation) error {
  3757  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  3758  	if err != nil {
  3759  		return err
  3760  	}
  3761  	c.markRegisterUsed(ref)
  3762  
  3763  	offset, err := c.popValueOnRegister()
  3764  	if err != nil {
  3765  		return err
  3766  	}
  3767  
  3768  	// arm64ReservedRegisterForTemporary = &tables[0]
  3769  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3770  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3771  		arm64ReservedRegisterForTemporary)
  3772  	// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + TableIndex*8]
  3773  	//                                   = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  3774  	//                                   = [&tables[TableIndex]] = tables[TableIndex].
  3775  	tableIndex := int64(o.U1)
  3776  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3777  		arm64ReservedRegisterForTemporary, tableIndex*8,
  3778  		arm64ReservedRegisterForTemporary)
  3779  
  3780  	// Out of bounds check.
  3781  	// ref = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
  3782  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3783  		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3784  		ref,
  3785  	)
  3786  	// "cmp ref, offset"
  3787  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, ref, offset.register)
  3788  
  3789  	// If it exceeds len(table), we exit the execution.
  3790  	c.compileMaybeExitFromNativeCode(arm64.BCONDLO, nativeCallStatusCodeInvalidTableAccess)
  3791  
  3792  	// ref = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  3793  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3794  		arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3795  		ref,
  3796  	)
  3797  
  3798  	// ref = (offset << pointerSizeLog2) + ref
  3799  	//     = &tables[TableIndex].References[0] + sizeOf(uintptr) * offset
  3800  	//     = &tables[TableIndex].References[offset]
  3801  	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3802  		offset.register, pointerSizeLog2, ref, ref)
  3803  
  3804  	// ref = [ref] = [&tables[TableIndex].References[offset]], i.e. load the Reference's raw pointer as uint64.
  3805  	c.assembler.CompileMemoryToRegister(arm64.LDRD, ref, 0, ref)
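        	// In Go terms, the sequence above is roughly the following sketch, where
        	// References is the table's backing slice of opaque uintptr references:
        	//
        	//	t := tables[tableIndex]
        	//	if offset >= uint32(len(t.References)) {
        	//		exit(nativeCallStatusCodeInvalidTableAccess)
        	//	}
        	//	ref = t.References[offset]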
  3806  
  3807  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64) // table elements are opaque 64-bit at runtime.
  3808  	return nil
  3809  }
  3810  
  3811  // compileTableSet implements compiler.compileTableSet for the arm64 architecture.
  3812  func (c *arm64Compiler) compileTableSet(o *wazeroir.UnionOperation) error {
  3813  	ref := c.locationStack.pop()
  3814  	if err := c.compileEnsureOnRegister(ref); err != nil {
  3815  		return err
  3816  	}
  3817  
  3818  	offset := c.locationStack.pop()
  3819  	if err := c.compileEnsureOnRegister(offset); err != nil {
  3820  		return err
  3821  	}
  3822  
  3823  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3824  	if err != nil {
  3825  		return err
  3826  	}
  3827  
  3828  	// arm64ReservedRegisterForTemporary = &tables[0]
  3829  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3830  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3831  		arm64ReservedRegisterForTemporary)
  3832  	// arm64ReservedRegisterForTemporary = arm64ReservedRegisterForTemporary + TableIndex*8
  3833  	//                                   = &tables[0] + TableIndex*sizeOf(*tableInstance)
  3834  	//                                   = &tables[TableIndex]
  3835  	tableIndex := int64(o.U1)
  3836  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3837  		arm64ReservedRegisterForTemporary, tableIndex*8,
  3838  		arm64ReservedRegisterForTemporary)
  3839  
  3840  	// Out of bounds check.
  3841  	// tmp = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
  3842  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3843  		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3844  		tmp,
  3845  	)
  3846  	// "cmp tmp, offset"
  3847  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp, offset.register)
  3848  
  3849  	// If offset >= len(table), the access is out of bounds, so we exit the execution.
  3850  	c.compileMaybeExitFromNativeCode(arm64.BCONDLO, nativeCallStatusCodeInvalidTableAccess)
  3851  
  3852  	// tmp = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  3853  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3854  		arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
  3855  		tmp,
  3856  	)
  3857  
  3858  	// tmp = (offset << pointerSizeLog2) + tmp
  3859  	//     = &tables[TableIndex].References[0] + sizeOf(uintptr) * offset
  3860  	//     = &tables[TableIndex].References[offset]
  3861  	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD, offset.register, pointerSizeLog2, tmp, tmp)
  3862  
  3863  	// Set the reference's raw pointer.
  3864  	c.assembler.CompileRegisterToMemory(arm64.STRD, ref.register, tmp, 0)
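        	// In Go terms, the store above is roughly "tables[tableIndex].References[offset] = ref"
        	// (the bounds check has already been emitted above).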
  3865  
  3866  	c.markRegisterUnused(offset.register, ref.register, tmp)
  3867  	return nil
  3868  }
  3869  
  3870  // compileTableGrow implements compiler.compileTableGrow for the arm64 architecture.
  3871  func (c *arm64Compiler) compileTableGrow(o *wazeroir.UnionOperation) error {
  3872  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3873  		return err
  3874  	}
  3875  
  3876  	// Pushes the table index.
  3877  	tableIndex := o.U1
  3878  	if err := c.compileIntConstant(true, tableIndex); err != nil {
  3879  		return err
  3880  	}
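        	// The value stack now holds, from the top: the table index just pushed, the
        	// number of items to grow by, and the initial value. The built-in Go function
        	// consumes all three directly from the stack.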
  3881  
  3882  	// Just like memory.grow, table.grow cannot be done in assembly, as it involves allocation in Go.
  3883  	// Therefore, we call out to the built-in Go function for this purpose.
  3884  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexTableGrow); err != nil {
  3885  		return err
  3886  	}
  3887  
  3888  	// TableGrow consumes three values (table index, number of items, initial value).
  3889  	for i := 0; i < 3; i++ {
  3890  		c.locationStack.pop()
  3891  	}
  3892  
  3893  	// Then, the previous table length is pushed back as the result.
  3894  	v := c.locationStack.pushRuntimeValueLocationOnStack()
  3895  	v.valueType = runtimeValueTypeI32
  3896  
  3897  	// After return, we re-initialize the reserved registers just like in the function preamble.
  3898  	c.compileReservedStackBasePointerRegisterInitialization()
  3899  	c.compileReservedMemoryRegisterInitialization()
  3900  	return nil
  3901  }
  3902  
  3903  // compileTableSize implements compiler.compileTableSize for the arm64 architecture.
  3904  func (c *arm64Compiler) compileTableSize(o *wazeroir.UnionOperation) error {
  3905  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3906  		return err
  3907  	}
  3908  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  3909  	if err != nil {
  3910  		return err
  3911  	}
  3912  	c.markRegisterUsed(result)
  3913  
  3914  	// arm64ReservedRegisterForTemporary = &tables[0]
  3915  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3916  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3917  		arm64ReservedRegisterForTemporary)
  3918  	// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + TableIndex*8]
  3919  	//                                   = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  3920  	//                                   = [&tables[TableIndex]] = tables[TableIndex].
  3921  	tableIndex := int64(o.U1)
  3922  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3923  		arm64ReservedRegisterForTemporary, tableIndex*8,
  3924  		arm64ReservedRegisterForTemporary)
  3925  
  3926  	// result = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
  3927  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3928  		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3929  		result,
  3930  	)
  3931  
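        	// table.size returns an i32 in Wasm, hence the length is pushed as a 32-bit value.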
  3932  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  3933  	return nil
  3934  }
  3935  
  3936  // compileTableFill implements compiler.compileTableFill for the arm64 architecture.
  3937  func (c *arm64Compiler) compileTableFill(o *wazeroir.UnionOperation) error {
  3938  	tableIndex := uint32(o.U1)
  3939  	return c.compileFillImpl(true, tableIndex)
  3940  }
  3941  
  3942  // popTwoValuesOnRegisters pops two values from the location stack, ensures
  3943  // these two values are located on registers, and marks them unused.
  3944  //
  3945  // TODO: we’d usually prefix this with compileXXX as this might end up emitting instructions,
  3946  // but the name seems awkward.
  3947  func (c *arm64Compiler) popTwoValuesOnRegisters() (x1, x2 *runtimeValueLocation, err error) {
  3948  	x2 = c.locationStack.pop()
  3949  	if err = c.compileEnsureOnRegister(x2); err != nil {
  3950  		return
  3951  	}
  3952  
  3953  	x1 = c.locationStack.pop()
  3954  	if err = c.compileEnsureOnRegister(x1); err != nil {
  3955  		return
  3956  	}
  3957  
  3958  	c.markRegisterUnused(x2.register)
  3959  	c.markRegisterUnused(x1.register)
  3960  	return
  3961  }
  3962  
  3963  // popValueOnRegister pops one value from the location stack, ensures
  3964  // that it is located on a register, and marks it unused.
  3965  //
  3966  // TODO: we’d usually prefix this with compileXXX as this might end up emitting instructions,
  3967  // but the name seems awkward.
  3968  func (c *arm64Compiler) popValueOnRegister() (v *runtimeValueLocation, err error) {
  3969  	v = c.locationStack.pop()
  3970  	if err = c.compileEnsureOnRegister(v); err != nil {
  3971  		return
  3972  	}
  3973  
  3974  	c.markRegisterUnused(v.register)
  3975  	return
  3976  }
  3977  
  3978  // compileEnsureOnRegister emits instructions to ensure that a value is located on a register.
  3979  func (c *arm64Compiler) compileEnsureOnRegister(loc *runtimeValueLocation) (err error) {
  3980  	if loc.onStack() {
  3981  		reg, err := c.allocateRegister(loc.getRegisterType())
  3982  		if err != nil {
  3983  			return err
  3984  		}
  3985  
  3986  		// Record that the value holds the register and the register is marked used.
  3987  		loc.setRegister(reg)
  3988  		c.markRegisterUsed(reg)
  3989  
  3990  		c.compileLoadValueOnStackToRegister(loc)
  3991  	} else if loc.onConditionalRegister() {
  3992  		err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
  3993  	}
  3994  	return
  3995  }
  3996  
  3997  // maybeCompileMoveTopConditionalToGeneralPurposeRegister moves the top value on the stack
  3998  // into a general purpose register if the value is currently located on a conditional register.
  3999  //
  4000  // This is usually called at the beginning of methods on the compiler interface that might
  4001  // compile instructions without saving the conditional register value.
  4002  // Any compile* function that does not call this must instead save the conditional
  4003  // value to the stack or a register by invoking compileEnsureOnRegister for the top value.
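        //
        // For example, a comparison such as i32.eqz leaves its result on a conditional
        // (NZCV flags) location; if the next instruction may clobber the flags, the result
        // must first be materialized into a general purpose register via this function.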
  4004  func (c *arm64Compiler) maybeCompileMoveTopConditionalToGeneralPurposeRegister() (err error) {
  4005  	if c.locationStack.sp > 0 {
  4006  		if loc := c.locationStack.peek(); loc.onConditionalRegister() {
  4007  			err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
  4008  		}
  4009  	}
  4010  	return
  4011  }
  4012  
  4013  // compileLoadConditionalRegisterToGeneralPurposeRegister saves the conditional register value
  4014  // to a general purpose register.
  4015  func (c *arm64Compiler) compileLoadConditionalRegisterToGeneralPurposeRegister(loc *runtimeValueLocation) error {
  4016  	reg, err := c.allocateRegister(loc.getRegisterType())
  4017  	if err != nil {
  4018  		return err
  4019  	}
  4020  
  4021  	c.markRegisterUsed(reg)
  4022  	c.assembler.CompileConditionalRegisterSet(loc.conditionalRegister, reg)
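        	// This effectively emits a CSET-style instruction: the condition flags are
        	// materialized as 0 or 1 into reg.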
  4023  
  4024  	// Record that now the value is located on a general purpose register.
  4025  	loc.setRegister(reg)
  4026  	return nil
  4027  }
  4028  
  4029  // compileLoadValueOnStackToRegister implements compiler.compileLoadValueOnStackToRegister for arm64.
  4030  func (c *arm64Compiler) compileLoadValueOnStackToRegister(loc *runtimeValueLocation) {
  4031  	switch loc.valueType {
  4032  	case runtimeValueTypeI32:
  4033  		c.assembler.CompileMemoryToRegister(arm64.LDRW, arm64ReservedRegisterForStackBasePointerAddress,
  4034  			int64(loc.stackPointer)*8, loc.register)
  4035  	case runtimeValueTypeI64:
  4036  		c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForStackBasePointerAddress,
  4037  			int64(loc.stackPointer)*8, loc.register)
  4038  	case runtimeValueTypeF32:
  4039  		c.assembler.CompileMemoryToRegister(arm64.FLDRS, arm64ReservedRegisterForStackBasePointerAddress,
  4040  			int64(loc.stackPointer)*8, loc.register)
  4041  	case runtimeValueTypeF64:
  4042  		c.assembler.CompileMemoryToRegister(arm64.FLDRD, arm64ReservedRegisterForStackBasePointerAddress,
  4043  			int64(loc.stackPointer)*8, loc.register)
  4044  	case runtimeValueTypeV128Lo:
  4045  		c.assembler.CompileMemoryToVectorRegister(arm64.VMOV,
  4046  			arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8, loc.register,
  4047  			arm64.VectorArrangementQ)
  4048  		// The higher 64-bits were loaded by the 128-bit vector load above as well.
  4049  		hi := &c.locationStack.stack[loc.stackPointer+1]
  4050  		hi.setRegister(loc.register)
  4051  	case runtimeValueTypeV128Hi:
  4052  		panic("BUG: V128Hi must be loaded to a register along with V128Lo")
  4053  	}
  4054  }
  4055  
  4056  // allocateRegister implements compiler.allocateRegister for arm64.
  4057  func (c *arm64Compiler) allocateRegister(t registerType) (reg asm.Register, err error) {
  4058  	var ok bool
  4059  	// Try to get an unused register.
  4060  	reg, ok = c.locationStack.takeFreeRegister(t)
  4061  	if ok {
  4062  		return
  4063  	}
  4064  
  4065  	// If not found, we have to steal a used register.
  4066  	stealTarget, ok := c.locationStack.takeStealTargetFromUsedRegister(t)
  4067  	if !ok {
  4068  		err = fmt.Errorf("cannot steal register")
  4069  		return
  4070  	}
  4071  
  4072  	// Release the steal target's value onto its stack location.
  4073  	reg = stealTarget.register
  4074  	c.compileReleaseRegisterToStack(stealTarget)
  4075  	return
  4076  }
  4077  
  4078  // compileReleaseAllRegistersToStack adds instructions to store all the values located on
  4079  // either general purpose or conditional registers onto the memory stack.
  4080  // See compileReleaseRegisterToStack.
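        // This is typically needed right before function calls or branches, since the
        // generated code does not preserve register contents across those boundaries.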
  4081  func (c *arm64Compiler) compileReleaseAllRegistersToStack() (err error) {
  4082  	for i := uint64(0); i < c.locationStack.sp; i++ {
  4083  		if loc := &c.locationStack.stack[i]; loc.onRegister() {
  4084  			c.compileReleaseRegisterToStack(loc)
  4085  		} else if loc.onConditionalRegister() {
  4086  			if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
  4087  				return
  4088  			}
  4089  			c.compileReleaseRegisterToStack(loc)
  4090  		}
  4091  	}
  4092  	return
  4093  }
  4094  
  4095  // compileReleaseRegisterToStack adds an instruction to write the value on a register back to the memory stack region.
  4096  func (c *arm64Compiler) compileReleaseRegisterToStack(loc *runtimeValueLocation) {
  4097  	switch loc.valueType {
  4098  	case runtimeValueTypeI32:
  4099  		c.assembler.CompileRegisterToMemory(arm64.STRW, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4100  	case runtimeValueTypeI64:
  4101  		c.assembler.CompileRegisterToMemory(arm64.STRD, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4102  	case runtimeValueTypeF32:
  4103  		c.assembler.CompileRegisterToMemory(arm64.FSTRS, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4104  	case runtimeValueTypeF64:
  4105  		c.assembler.CompileRegisterToMemory(arm64.FSTRD, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4106  	case runtimeValueTypeV128Lo:
  4107  		c.assembler.CompileVectorRegisterToMemory(arm64.VMOV,
  4108  			loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8,
  4109  			arm64.VectorArrangementQ)
  4110  		// The higher 64-bits were stored by the 128-bit vector store above, so release them as well.
  4111  		hi := &c.locationStack.stack[loc.stackPointer+1]
  4112  		c.locationStack.releaseRegister(hi)
  4113  	case runtimeValueTypeV128Hi:
  4114  		panic("BUG: V128Hi must be released to the stack along with V128Lo")
  4115  	default:
  4116  		panic("BUG")
  4117  	}
  4118  
  4119  	// Mark the register as free.
  4120  	c.locationStack.releaseRegister(loc)
  4121  }
  4122  
  4123  // compileReservedStackBasePointerRegisterInitialization adds instructions to initialize arm64ReservedRegisterForStackBasePointerAddress
  4124  // so that it points to the absolute address of the stack base for this function.
  4125  func (c *arm64Compiler) compileReservedStackBasePointerRegisterInitialization() {
  4126  	// First, load the address of the first element in the value stack into arm64ReservedRegisterForStackBasePointerAddress temporarily.
  4127  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4128  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackElement0AddressOffset,
  4129  		arm64ReservedRegisterForStackBasePointerAddress)
  4130  
  4131  	// Next, we load the stack base pointer in bytes (ce.stackContext.stackBasePointerInBytes) into arm64ReservedRegisterForTemporary.
  4132  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4133  		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  4134  		arm64ReservedRegisterForTemporary)
  4135  
  4136  	// Finally, we calculate "arm64ReservedRegisterForStackBasePointerAddress += arm64ReservedRegisterForTemporary", i.e. the absolute address of the stack base.
  4137  	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, arm64ReservedRegisterForStackBasePointerAddress)
  4138  }
  4139  
  4140  func (c *arm64Compiler) compileReservedMemoryRegisterInitialization() {
  4141  	if c.ir.HasMemory || c.ir.UsesMemory {
  4142  		// "arm64ReservedRegisterForMemory = ce.MemoryElement0Address"
  4143  		c.assembler.CompileMemoryToRegister(
  4144  			arm64.LDRD,
  4145  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
  4146  			arm64ReservedRegisterForMemory,
  4147  		)
  4148  	}
  4149  }
  4150  
  4151  // compileModuleContextInitialization adds instructions to initialize ce.moduleContext's fields based on
  4152  // ce.moduleContext.ModuleInstanceAddress.
  4153  // This is called in two cases: in the function preamble, and on return from (non-Go) function calls.
  4154  func (c *arm64Compiler) compileModuleContextInitialization() error {
  4155  	tmpX, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4156  	if !found {
  4157  		panic("BUG: all the registers should be free at this point")
  4158  	}
  4159  	c.markRegisterUsed(tmpX)
  4160  	tmpY, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4161  	if !found {
  4162  		panic("BUG: all the registers should be free at this point")
  4163  	}
  4164  	c.markRegisterUsed(tmpY)
  4165  
  4166  	// "tmpX = ce.ModuleInstanceAddress"
  4167  	c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset, tmpX)
  4168  
  4169  	// If the module instance address stays the same, we can skip the entire code below.
  4170  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64CallingConventionModuleInstanceAddressRegister, tmpX)
  4171  	brIfModuleUnchanged := c.assembler.CompileJump(arm64.BCONDEQ)
  4172  
  4173  	// Otherwise, update ce.moduleContext.ModuleInstanceAddress.
  4174  	c.assembler.CompileRegisterToMemory(arm64.STRD,
  4175  		arm64CallingConventionModuleInstanceAddressRegister,
  4176  		arm64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset,
  4177  	)
  4178  
  4179  	// Also, we have to update the following fields:
  4180  	// * callEngine.moduleContext.globalElement0Address
  4181  	// * callEngine.moduleContext.memoryElement0Address
  4182  	// * callEngine.moduleContext.memorySliceLen
  4183  	// * callEngine.moduleContext.memoryInstance
  4184  	// * callEngine.moduleContext.tableElement0Address
  4185  	// * callEngine.moduleContext.tableSliceLen
  4186  	// * callEngine.moduleContext.functionsElement0Address
  4187  	// * callEngine.moduleContext.typeIDsElement0Address
  4188  	// * callEngine.moduleContext.dataInstancesElement0Address
  4189  	// * callEngine.moduleContext.elementInstancesElement0Address
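        	//
        	// Caching these pointers in ce.moduleContext lets the generated code reach
        	// module state with a single load off arm64ReservedRegisterForCallEngine,
        	// instead of chasing pointers through the module instance on every access.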
  4190  
  4191  	// Update globalElement0Address.
  4192  	//
  4193  	// Note: if there's a global.get or global.set instruction in the function, the existence of the globals
  4194  	// is ensured by function validation at the module instantiation phase, which is why it is ok to
  4195  	// skip the initialization if the module's globals slice is empty.
  4196  	if len(c.ir.Globals) > 0 {
  4197  		// "tmpX = &moduleInstance.Globals[0]"
  4198  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4199  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceGlobalsOffset,
  4200  			tmpX,
  4201  		)
  4202  
  4203  		// "ce.GlobalElement0Address = tmpX (== &moduleInstance.Globals[0])"
  4204  		c.assembler.CompileRegisterToMemory(
  4205  			arm64.STRD, tmpX,
  4206  			arm64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset,
  4207  		)
  4208  	}
  4209  
  4210  	// Update memoryElement0Address and memorySliceLen.
  4211  	//
  4212  	// Note: if there's a memory instruction in the function, the memory instance must be non-nil.
  4213  	// That is ensured by function validation at the module instantiation phase, which is
  4214  	// why it is ok to skip the initialization if the module's memory instance is nil.
  4215  	if c.ir.HasMemory {
  4216  		// "tmpX = moduleInstance.Memory"
  4217  		c.assembler.CompileMemoryToRegister(
  4218  			arm64.LDRD,
  4219  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceMemoryOffset,
  4220  			tmpX,
  4221  		)
  4222  
  4223  		// First, set ce.memoryInstance
  4224  		c.assembler.CompileRegisterToMemory(
  4225  			arm64.STRD,
  4226  			tmpX,
  4227  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset,
  4228  		)
  4229  
  4230  		// Next, we write the memory length into ce.MemorySliceLen.
  4231  		//
  4232  		// "tmpY = [tmpX + memoryInstanceBufferLenOffset] (== len(memory.Buffer))"
  4233  		c.assembler.CompileMemoryToRegister(
  4234  			arm64.LDRD,
  4235  			tmpX, memoryInstanceBufferLenOffset,
  4236  			tmpY,
  4237  		)
  4238  		// "ce.MemorySliceLen = tmpY".
  4239  		c.assembler.CompileRegisterToMemory(
  4240  			arm64.STRD,
  4241  			tmpY,
  4242  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  4243  		)
  4244  
  4245  		// Finally, we write ce.memoryElement0Address.
  4246  		//
  4247  		// "tmpY = *tmpX (== &memory.Buffer[0])"
  4248  		c.assembler.CompileMemoryToRegister(
  4249  			arm64.LDRD,
  4250  			tmpX, memoryInstanceBufferOffset,
  4251  			tmpY,
  4252  		)
  4253  		// "ce.memoryElement0Address = tmpY".
  4254  		c.assembler.CompileRegisterToMemory(
  4255  			arm64.STRD,
  4256  			tmpY,
  4257  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
  4258  		)
  4259  	}
  4260  
  4261  	// Update tableElement0Address, tableSliceLen and typeIDsElement0Address.
  4262  	//
  4263  	// Note: if there's a table instruction in the function, the existence of the table
  4264  	// is ensured by function validation at the module instantiation phase, which is
  4265  	// why it is ok to skip the initialization if the module's table doesn't exist.
  4266  	if c.ir.HasTable {
  4267  		// "tmpX = &tables[0]" (type **wasm.TableInstance)
  4268  		c.assembler.CompileMemoryToRegister(
  4269  			arm64.LDRD,
  4270  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceTablesOffset,
  4271  			tmpX,
  4272  		)
  4273  
  4274  		// Update ce.tableElement0Address.
  4275  		// "ce.tableElement0Address = tmpX".
  4276  		c.assembler.CompileRegisterToMemory(
  4277  			arm64.STRD,
  4278  			tmpX,
  4279  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4280  		)
  4281  
  4282  		// Finally, we put &ModuleInstance.TypeIDs[0] into moduleContext.typeIDsElement0Address.
  4283  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  4284  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceTypeIDsOffset, tmpX)
  4285  		c.assembler.CompileRegisterToMemory(arm64.STRD,
  4286  			tmpX, arm64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset)
  4287  	}
  4288  
  4289  	// Update callEngine.moduleContext.functionsElement0Address
  4290  	{
  4291  		// "tmpX = [moduleInstanceAddressRegister + moduleInstanceEngineOffset + interfaceDataOffset] (== *moduleEngine)"
  4292  		//
  4293  		// Go's interface is laid out in memory as two quad words, struct { tab, data uintptr },
  4294  		// where tab points to the interface table, and data points to the actual
  4295  		// implementation of the interface. In this case, we extract the "data" pointer as *moduleEngine.
  4296  		// See the following references for detail:
  4297  		// * https://research.swtch.com/interfaces
  4298  		// * https://github.com/golang/go/blob/release-branch.go1.20/src/runtime/runtime2.go#L207-L210
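        		//
        		// A sketch of that layout (illustrative only, not a wazero type):
        		//
        		//	type iface struct {
        		//		tab  uintptr // pointer to the interface table
        		//		data uintptr // pointer to the concrete value; here, the *moduleEngine
        		//	}
        		//
        		// Hence moduleInstanceEngineOffset+interfaceDataOffset selects the second word.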
  4299  		c.assembler.CompileMemoryToRegister(
  4300  			arm64.LDRD,
  4301  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceEngineOffset+interfaceDataOffset,
  4302  			tmpX,
  4303  		)
  4304  
  4305  		// "tmpY = [tmpX + moduleEngineFunctionsOffset] (== &moduleEngine.functions[0])"
  4306  		c.assembler.CompileMemoryToRegister(
  4307  			arm64.LDRD,
  4308  			tmpX, moduleEngineFunctionsOffset,
  4309  			tmpY,
  4310  		)
  4311  
  4312  		// "callEngine.moduleContext.functionsElement0Address = tmpY".
  4313  		c.assembler.CompileRegisterToMemory(
  4314  			arm64.STRD,
  4315  			tmpY,
  4316  			arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  4317  		)
  4318  	}
  4319  
  4320  	// Update dataInstancesElement0Address.
  4321  	if c.ir.HasDataInstances {
  4322  		// "tmpX = &moduleInstance.DataInstances[0]"
  4323  		c.assembler.CompileMemoryToRegister(
  4324  			arm64.LDRD,
  4325  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceDataInstancesOffset,
  4326  			tmpX,
  4327  		)
  4328  		// "callEngine.moduleContext.dataInstancesElement0Address = tmpX".
  4329  		c.assembler.CompileRegisterToMemory(
  4330  			arm64.STRD,
  4331  			tmpX,
  4332  			arm64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  4333  		)
  4334  	}
  4335  
  4336  	// Update callEngine.moduleContext.elementInstancesElement0Address
  4337  	if c.ir.HasElementInstances {
  4338  		// "tmpX = &moduleInstance.ElementInstances[0]"
  4339  		c.assembler.CompileMemoryToRegister(
  4340  			arm64.LDRD,
  4341  			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceElementInstancesOffset,
  4342  			tmpX,
  4343  		)
  4344  		// "callEngine.moduleContext.elementInstancesElement0Address = tmpX".
  4345  		c.assembler.CompileRegisterToMemory(
  4346  			arm64.STRD,
  4347  			tmpX,
  4348  			arm64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  4349  		)
  4350  	}
  4351  
  4352  	c.assembler.SetJumpTargetOnNext(brIfModuleUnchanged)
  4353  	c.markRegisterUnused(tmpX, tmpY)
  4354  	return nil
  4355  }