wa-lang.org/wazero@v1.0.2/internal/engine/compiler/impl_arm64.go (about)

     1  // This file implements the compiler for arm64 target.
     2  // Please refer to https://developer.arm.com/documentation/102374/latest/
     3  // if unfamiliar with arm64 instructions and semantics.
     4  package compiler
     5  
     6  import (
     7  	"bytes"
     8  	"errors"
     9  	"fmt"
    10  	"math"
    11  
    12  	"wa-lang.org/wazero/internal/asm"
    13  	"wa-lang.org/wazero/internal/asm/arm64"
    14  	"wa-lang.org/wazero/internal/platform"
    15  	"wa-lang.org/wazero/internal/wasm"
    16  	"wa-lang.org/wazero/internal/wazeroir"
    17  )
    18  
// arm64Compiler is the per-function code generator for the arm64 target.
type arm64Compiler struct {
	// assembler emits and assembles the native arm64 instructions.
	assembler arm64.Assembler
	// ir is the wazeroir compilation result for the function being compiled.
	ir *wazeroir.CompilationResult
	// locationStack holds the state of wazeroir virtual stack,
	// and each item is either placed in register or the actual memory stack.
	locationStack *runtimeValueLocationStack
	// labels maps a label (e.g. ".L1_then") to *arm64LabelInfo.
	labels map[string]*arm64LabelInfo
	// stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation.
	stackPointerCeil uint64
	// onStackPointerCeilDeterminedCallBack holds a callback which is called once the ceiling of the
	// stack pointer is determined, just before generating native code (see compile and compileMaybeGrowStack).
	onStackPointerCeilDeterminedCallBack func(stackPointerCeil uint64)
	// withListener is true when the function must call the listener
	// before/after built-in hooks around its body.
	withListener bool
}
    33  
    34  func newArm64Compiler(ir *wazeroir.CompilationResult, withListener bool) (compiler, error) {
    35  	return &arm64Compiler{
    36  		assembler:     arm64.NewAssembler(arm64ReservedRegisterForTemporary),
    37  		locationStack: newRuntimeValueLocationStack(),
    38  		ir:            ir,
    39  		labels:        map[string]*arm64LabelInfo{},
    40  		withListener:  withListener,
    41  	}, nil
    42  }
    43  
var (
	// arm64UnreservedVectorRegisters is the pool of vector (SIMD&FP) registers
	// the register allocator may hand out; every V register is available.
	arm64UnreservedVectorRegisters = []asm.Register{
		arm64.RegV0, arm64.RegV1, arm64.RegV2, arm64.RegV3,
		arm64.RegV4, arm64.RegV5, arm64.RegV6, arm64.RegV7, arm64.RegV8,
		arm64.RegV9, arm64.RegV10, arm64.RegV11, arm64.RegV12, arm64.RegV13,
		arm64.RegV14, arm64.RegV15, arm64.RegV16, arm64.RegV17, arm64.RegV18,
		arm64.RegV19, arm64.RegV20, arm64.RegV21, arm64.RegV22, arm64.RegV23,
		arm64.RegV24, arm64.RegV25, arm64.RegV26, arm64.RegV27, arm64.RegV28,
		arm64.RegV29, arm64.RegV30, arm64.RegV31,
	}

	// arm64UnreservedGeneralPurposeRegisters is the pool of general purpose
	// registers the allocator may hand out.
	//
	// Note (see arm64 section in https://go.dev/doc/asm):
	// * RegR18 is reserved as a platform register, and we don't use it in Compiler.
	// * RegR28 is reserved for Goroutine by Go runtime, and we don't use it in Compiler.
	// R0-R2 and R27 are also absent: this compiler reserves them (see the const block below).
	arm64UnreservedGeneralPurposeRegisters = []asm.Register{ // nolint
		arm64.RegR3, arm64.RegR4, arm64.RegR5, arm64.RegR6, arm64.RegR7, arm64.RegR8,
		arm64.RegR9, arm64.RegR10, arm64.RegR11, arm64.RegR12, arm64.RegR13,
		arm64.RegR14, arm64.RegR15, arm64.RegR16, arm64.RegR17, arm64.RegR19,
		arm64.RegR20, arm64.RegR21, arm64.RegR22, arm64.RegR23, arm64.RegR24,
		arm64.RegR25, arm64.RegR26, arm64.RegR29, arm64.RegR30,
	}
)
    66  
// Registers permanently reserved by the compiler; they are never given to the
// register allocator.
const (
	// arm64ReservedRegisterForCallEngine holds the pointer to callEngine instance (i.e. *callEngine as uintptr)
	arm64ReservedRegisterForCallEngine = arm64.RegR0
	// arm64ReservedRegisterForStackBasePointerAddress holds stack base pointer's address (callEngine.stackBasePointer) in the current function call.
	arm64ReservedRegisterForStackBasePointerAddress = arm64.RegR1
	// arm64ReservedRegisterForMemory holds the pointer to the memory slice's data (i.e. &memory.Buffer[0] as uintptr).
	arm64ReservedRegisterForMemory = arm64.RegR2
	// arm64ReservedRegisterForTemporary is a scratch register available at any point of
	// execution; its content must not be assumed to live beyond a single operation.
	// Note: we choose R27 as that is the temporary register used in Go's assembler.
	arm64ReservedRegisterForTemporary = arm64.RegR27
)
    78  
// arm64CallingConventionModuleInstanceAddressRegister is the register that, by this
// compiler's internal calling convention, carries the *wasm.ModuleInstance of the
// function being entered (see compilePreamble and compileReturnFunction).
var arm64CallingConventionModuleInstanceAddressRegister = arm64.RegR29
    80  
// Byte offsets of arm64-specific archContext fields embedded in callEngine.
// NOTE(review): these magic numbers must stay in sync with the archContext
// struct layout defined elsewhere in this package — verify when that changes.
const (
	// arm64CallEngineArchContextCompilerCallReturnAddressOffset is the offset of archContext.nativeCallReturnAddress in callEngine.
	arm64CallEngineArchContextCompilerCallReturnAddressOffset = 136
	// arm64CallEngineArchContextMinimum32BitSignedIntOffset is the offset of archContext.minimum32BitSignedIntAddress in callEngine.
	arm64CallEngineArchContextMinimum32BitSignedIntOffset = 144
	// arm64CallEngineArchContextMinimum64BitSignedIntOffset is the offset of archContext.minimum64BitSignedIntAddress in callEngine.
	arm64CallEngineArchContextMinimum64BitSignedIntOffset = 152
)
    89  
// isZeroRegister reports whether r is the arm64 zero register (RZR), which
// reads as zero and discards writes, and therefore is never tracked by the
// register allocator.
func isZeroRegister(r asm.Register) bool {
	return r == arm64.RegRZR
}
    93  
    94  // compile implements compiler.compile for the arm64 architecture.
    95  func (c *arm64Compiler) compile() (code []byte, stackPointerCeil uint64, err error) {
    96  	// c.stackPointerCeil tracks the stack pointer ceiling (max seen) value across all runtimeValueLocationStack(s)
    97  	// used for all labels (via setLocationStack), excluding the current one.
    98  	// Hence, we check here if the final block's max one exceeds the current c.stackPointerCeil.
    99  	stackPointerCeil = c.stackPointerCeil
   100  	if stackPointerCeil < c.locationStack.stackPointerCeil {
   101  		stackPointerCeil = c.locationStack.stackPointerCeil
   102  	}
   103  
   104  	// Now that the ceil of stack pointer is determined, we are invoking the callback.
   105  	// Note: this must be called before Assemble() below.
   106  	if c.onStackPointerCeilDeterminedCallBack != nil {
   107  		c.onStackPointerCeilDeterminedCallBack(stackPointerCeil)
   108  	}
   109  
   110  	var original []byte
   111  	original, err = c.assembler.Assemble()
   112  	if err != nil {
   113  		return
   114  	}
   115  
   116  	code, err = platform.MmapCodeSegment(bytes.NewReader(original), len(original))
   117  	return
   118  }
   119  
// arm64LabelInfo holds a wazeroir label specific information in this function.
type arm64LabelInfo struct {
	// initialInstruction is the initial instruction for this label so other block can branch into it.
	initialInstruction asm.Node
	// initialStack is the initial value location stack from which we start compiling this label.
	initialStack *runtimeValueLocationStack
	// labelBeginningCallbacks holds callbacks to be called with initialInstruction
	// once this label is compiled, so forward branches can resolve their targets.
	labelBeginningCallbacks []func(asm.Node)
}
   129  
   130  func (c *arm64Compiler) label(labelKey string) *arm64LabelInfo {
   131  	ret, ok := c.labels[labelKey]
   132  	if ok {
   133  		return ret
   134  	}
   135  	c.labels[labelKey] = &arm64LabelInfo{}
   136  	return c.labels[labelKey]
   137  }
   138  
// runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the arm64 architecture.
// It exposes the currently active virtual stack for inspection by shared compiler code.
func (c *arm64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
	return c.locationStack
}
   143  
   144  // pushRuntimeValueLocationOnRegister implements compiler.pushRuntimeValueLocationOnRegister for arm64.
   145  func (c *arm64Compiler) pushRuntimeValueLocationOnRegister(reg asm.Register, vt runtimeValueType) (ret *runtimeValueLocation) {
   146  	ret = c.locationStack.pushRuntimeValueLocationOnRegister(reg, vt)
   147  	c.markRegisterUsed(reg)
   148  	return
   149  }
   150  
   151  // pushVectorRuntimeValueLocationOnRegister implements compiler.pushVectorRuntimeValueLocationOnRegister for arm64.
   152  func (c *arm64Compiler) pushVectorRuntimeValueLocationOnRegister(reg asm.Register) (lowerBitsLocation *runtimeValueLocation) {
   153  	lowerBitsLocation = c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Lo)
   154  	c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Hi)
   155  	c.markRegisterUsed(reg)
   156  	return
   157  }
   158  
   159  func (c *arm64Compiler) markRegisterUsed(regs ...asm.Register) {
   160  	for _, reg := range regs {
   161  		if !isZeroRegister(reg) && reg != asm.NilRegister {
   162  			c.locationStack.markRegisterUsed(reg)
   163  		}
   164  	}
   165  }
   166  
   167  func (c *arm64Compiler) markRegisterUnused(regs ...asm.Register) {
   168  	for _, reg := range regs {
   169  		if !isZeroRegister(reg) && reg != asm.NilRegister {
   170  			c.locationStack.markRegisterUnused(reg)
   171  		}
   172  	}
   173  }
   174  
   175  func (c *arm64Compiler) String() (ret string) { return c.locationStack.String() }
   176  
// compilePreamble implements compiler.compilePreamble for the arm64 architecture.
// It emits the function prologue: the stack-growth check, module context setup,
// the optional listener "before" hook, and initialization of the reserved
// stack-base-pointer and memory registers.
func (c *arm64Compiler) compilePreamble() error {
	// The module instance address register carries state per the calling
	// convention, so keep it out of the allocator's hands for the duration.
	c.markRegisterUsed(arm64CallingConventionModuleInstanceAddressRegister)
	defer c.markRegisterUnused(arm64CallingConventionModuleInstanceAddressRegister)

	c.locationStack.init(c.ir.Signature)

	// Check if it's necessary to grow the value stack before entering function body.
	if err := c.compileMaybeGrowStack(); err != nil {
		return err
	}

	if err := c.compileModuleContextInitialization(); err != nil {
		return err
	}

	if c.withListener {
		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexFunctionListenerBefore); err != nil {
			return err
		}
	}

	// We must initialize the stack base pointer register so that we can manipulate the stack properly.
	c.compileReservedStackBasePointerRegisterInitialization()

	c.compileReservedMemoryRegisterInitialization()

	return nil
}
   206  
// compileMaybeGrowStack adds instructions to check the necessity to grow the value stack,
// and if so, make the builtin function call to do so. These instructions are called in the function's
// preamble.
func (c *arm64Compiler) compileMaybeGrowStack() error {
	tmpRegs, found := c.locationStack.takeFreeRegisters(registerTypeGeneralPurpose, 2)
	if !found {
		panic("BUG: all the registers should be free at this point")
	}
	tmpX, tmpY := tmpRegs[0], tmpRegs[1]

	// "tmpX = len(ce.stack)"
	c.assembler.CompileMemoryToRegister(
		arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineStackContextStackLenInBytesOffset,
		tmpX,
	)

	// "tmpY = ce.stackBasePointer"
	c.assembler.CompileMemoryToRegister(
		arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
		tmpY,
	)

	// "tmpX = tmpX - tmpY", in other words "tmpX = len(ce.stack) - ce.stackBasePointer"
	c.assembler.CompileRegisterToRegister(
		arm64.SUB,
		tmpY,
		tmpX,
	)

	// "tmpY = stackPointerCeil". MaxInt32 is only a placeholder constant here;
	// the real value is patched in by the callback below.
	loadStackPointerCeil := c.assembler.CompileConstToRegister(
		arm64.MOVD,
		math.MaxInt32,
		tmpY,
	)
	// At this point of compilation, we don't know the value of stack point ceil,
	// so we lazily resolve the value later. The ceiling counts value slots, hence
	// the <<3 (*8) to convert it into bytes for comparison with the byte lengths above.
	c.onStackPointerCeilDeterminedCallBack = func(stackPointerCeil uint64) {
		loadStackPointerCeil.AssignSourceConstant(int64(stackPointerCeil) << 3)
	}

	// Compare tmpX (len(ce.stack) - ce.stackBasePointer) and tmpY (ce.stackPointerCeil)
	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmpX, tmpY)

	// If ceil > stackLen - stack base pointer, we need to grow the stack by calling builtin Go function.
	brIfStackOK := c.assembler.CompileJump(arm64.BCONDLS)
	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexGrowStack); err != nil {
		return err
	}

	// Otherwise, skip calling it.
	c.assembler.SetJumpTargetOnNext(brIfStackOK)

	c.markRegisterUnused(tmpRegs...)
	return nil
}
   265  
// compileReturnFunction emits instructions to return from the current function frame.
// If the current frame is the bottom, the code goes back to the Go code with nativeCallStatusCodeReturned status.
// Otherwise, we branch into the caller's return address.
func (c *arm64Compiler) compileReturnFunction() error {
	// Release all the registers as our calling convention requires the caller-save.
	if err := c.compileReleaseAllRegistersToStack(); err != nil {
		return err
	}

	if c.withListener {
		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexFunctionListenerAfter); err != nil {
			return err
		}
		// After return, we re-initialize the stack base pointer as that is used to return to the caller below.
		c.compileReservedStackBasePointerRegisterInitialization()
	}

	// arm64CallingConventionModuleInstanceAddressRegister holds the module instance's address
	// so mark it used so that it won't be used as a free register.
	c.locationStack.markRegisterUsed(arm64CallingConventionModuleInstanceAddressRegister)
	defer c.locationStack.markRegisterUnused(arm64CallingConventionModuleInstanceAddressRegister)

	// Locate the three call-frame slots saved below the current frame's values.
	returnAddress, callerStackBasePointerInBytes, callerFunction := c.locationStack.getCallFrameLocations(c.ir.Signature)

	// If the return address is zero, meaning that we return from the execution.
	returnAddress.setRegister(arm64ReservedRegisterForTemporary)
	c.compileLoadValueOnStackToRegister(returnAddress)
	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, arm64.RegRZR)

	// Br if the address does not equal zero.
	brIfNotEqual := c.assembler.CompileJump(arm64.BCONDNE)
	// Otherwise, exit.
	c.compileExitFromNativeCode(nativeCallStatusCodeReturned)

	// If the address doesn't equal zero, return br into returnAddressRegister (caller's return address).
	c.assembler.SetJumpTargetOnNext(brIfNotEqual)

	// Alias for readability.
	tmp := arm64CallingConventionModuleInstanceAddressRegister

	// First, restore the stackContext.stackBasePointerInBytesOffset from callerStackBasePointerInBytes.
	callerStackBasePointerInBytes.setRegister(tmp)
	c.compileLoadValueOnStackToRegister(callerStackBasePointerInBytes)
	c.assembler.CompileRegisterToMemory(arm64.STRD, tmp,
		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)

	// Next, restore moduleContext.fn from callerFunction.
	callerFunction.setRegister(tmp)
	c.compileLoadValueOnStackToRegister(callerFunction)
	c.assembler.CompileRegisterToMemory(arm64.STRD, tmp,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)

	// Also, we have to put the target function's *wasm.ModuleInstance into arm64CallingConventionModuleInstanceAddressRegister.
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		tmp, functionModuleInstanceAddressOffset,
		arm64CallingConventionModuleInstanceAddressRegister)

	c.assembler.CompileJumpToRegister(arm64.B, returnAddress.register)
	return nil
}
   326  
// compileExitFromNativeCode adds instructions to give the control back to ce.exec with the given status code.
// It persists the virtual stack pointer and the status into callEngine, then
// returns to the Go-side return address stored in archContext.
func (c *arm64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) {
	// Write the current stack pointer to the ce.stackPointer.
	c.assembler.CompileConstToRegister(arm64.MOVD, int64(c.locationStack.sp), arm64ReservedRegisterForTemporary)
	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64ReservedRegisterForTemporary, arm64ReservedRegisterForCallEngine,
		callEngineStackContextStackPointerOffset)

	if status != 0 {
		c.assembler.CompileConstToRegister(arm64.MOVW, int64(status), arm64ReservedRegisterForTemporary)
		c.assembler.CompileRegisterToMemory(arm64.STRW, arm64ReservedRegisterForTemporary,
			arm64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
	} else {
		// If the status == 0, we use zero register to store zero: saves the MOVW.
		c.assembler.CompileRegisterToMemory(arm64.STRW, arm64.RegRZR,
			arm64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
	}

	// The return address to the Go code is stored in archContext.compilerReturnAddress which
	// is embedded in ce. We load the value to the tmpRegister, and then
	// invoke RET with that register.
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, arm64CallEngineArchContextCompilerCallReturnAddressOffset,
		arm64ReservedRegisterForTemporary)

	c.assembler.CompileJumpToRegister(arm64.RET, arm64ReservedRegisterForTemporary)
}
   353  
// compileGoDefinedHostFunction implements compiler.compileGoDefinedHostFunction for the arm64 architecture.
// The emitted code is a trampoline: it hands control to the Go runtime (ce.exec)
// with the "call Go host function" status, then returns to the wasm caller.
func (c *arm64Compiler) compileGoDefinedHostFunction() error {
	// First we must update the location stack to reflect the number of host function inputs.
	c.locationStack.init(c.ir.Signature)

	if c.withListener {
		if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction,
			builtinFunctionIndexFunctionListenerBefore); err != nil {
			return err
		}
	}

	if err := c.compileCallGoFunction(nativeCallStatusCodeCallGoHostFunction, 0); err != nil {
		return err
	}

	// Initializes the reserved stack base pointer which is used to retrieve the call frame stack.
	c.compileReservedStackBasePointerRegisterInitialization()
	return c.compileReturnFunction()
}
   374  
   375  // setLocationStack sets the given runtimeValueLocationStack to .locationStack field,
   376  // while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
   377  // This is called when we branch into different block.
   378  func (c *arm64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
   379  	if c.stackPointerCeil < c.locationStack.stackPointerCeil {
   380  		c.stackPointerCeil = c.locationStack.stackPointerCeil
   381  	}
   382  	c.locationStack = newStack
   383  }
   384  
// compileLabel implements compiler.compileLabel for the arm64 architecture.
// It returns skipThisLabel=true when no branch ever targeted the label
// (its initial stack was never recorded), so its body can be skipped.
func (c *arm64Compiler) compileLabel(o *wazeroir.OperationLabel) (skipThisLabel bool) {
	labelKey := o.Label.String()
	arm64LabelInfo := c.label(labelKey)

	// If initialStack is not set, that means this label has never been reached.
	if arm64LabelInfo.initialStack == nil {
		skipThisLabel = true
		return
	}

	// We use NOP as a beginning of instructions in a label.
	// This should be eventually optimized out by assembler.
	labelBegin := c.assembler.CompileStandAlone(arm64.NOP)

	// Save the instructions so that backward branching
	// instructions can branch to this label.
	arm64LabelInfo.initialInstruction = labelBegin

	// Set the initial stack.
	c.setLocationStack(arm64LabelInfo.initialStack)

	// Invoke callbacks to notify the forward branching
	// instructions can properly branch to this label.
	for _, cb := range arm64LabelInfo.labelBeginningCallbacks {
		cb(labelBegin)
	}
	return false
}
   414  
// compileUnreachable implements compiler.compileUnreachable for the arm64 architecture.
// It exits back to the Go runtime with the unreachable trap status.
func (c *arm64Compiler) compileUnreachable() error {
	c.compileExitFromNativeCode(nativeCallStatusCodeUnreachable)
	return nil
}
   420  
// compileSet implements compiler.compileSet for the arm64 architecture.
// It pops the top value (two entries for a vector) and assigns its register
// to the stack entry o.Depth below the original top.
func (c *arm64Compiler) compileSet(o *wazeroir.OperationSet) error {
	setTargetIndex := int(c.locationStack.sp) - 1 - o.Depth

	// A v128 value occupies two stack entries (lo/hi); drop the extra hi entry first.
	if o.IsTargetVector {
		_ = c.locationStack.pop()
	}
	v := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(v); err != nil {
		return err
	}

	targetLocation := c.locationStack.stack[setTargetIndex]
	if targetLocation.onRegister() {
		// We no longer need the register previously used by the target location.
		c.markRegisterUnused(targetLocation.register)
	}

	reg := v.register
	targetLocation.setRegister(reg)
	if o.IsTargetVector {
		// The hi half of the target vector shares the same register as the lo half.
		c.locationStack.stack[setTargetIndex+1].setRegister(reg)
	}
	return nil
}
   446  
// compileGlobalGet implements compiler.compileGlobalGet for the arm64 architecture.
// It loads globals[o.Index].Val onto the virtual stack, choosing the load width
// and destination register class from the global's declared value type.
func (c *arm64Compiler) compileGlobalGet(o *wazeroir.OperationGlobalGet) error {
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}

	wasmValueType := c.ir.Globals[o.Index].ValType
	isV128 := wasmValueType == wasm.ValueTypeV128
	// Get the address of globals[index] into globalAddressReg.
	globalAddressReg, err := c.compileReadGlobalAddress(o.Index)
	if err != nil {
		return err
	}

	if isV128 {
		resultReg, err := c.allocateRegister(registerTypeVector)
		if err != nil {
			return err
		}
		// Advance the address register to &globals[index].Val, then do a 128-bit (Q) load.
		c.assembler.CompileConstToRegister(arm64.ADD, globalInstanceValueOffset, globalAddressReg)
		c.assembler.CompileMemoryToVectorRegister(arm64.VMOV, globalAddressReg, 0,
			resultReg, arm64.VectorArrangementQ)

		c.pushVectorRuntimeValueLocationOnRegister(resultReg)
	} else {
		// ldr starts as a NOP placeholder; for valid modules exactly one case
		// below assigns the real load instruction.
		ldr := arm64.NOP
		var result asm.Register
		var vt runtimeValueType
		switch wasmValueType {
		case wasm.ValueTypeI32:
			ldr = arm64.LDRW
			vt = runtimeValueTypeI32
			// Integer results can reuse the (general purpose) address register itself.
			result = globalAddressReg
		case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
			ldr = arm64.LDRD
			vt = runtimeValueTypeI64
			result = globalAddressReg
		case wasm.ValueTypeF32:
			// Float results need a vector register.
			result, err = c.allocateRegister(registerTypeVector)
			if err != nil {
				return err
			}
			ldr = arm64.FLDRS
			vt = runtimeValueTypeF32
		case wasm.ValueTypeF64:
			result, err = c.allocateRegister(registerTypeVector)
			if err != nil {
				return err
			}
			ldr = arm64.FLDRD
			vt = runtimeValueTypeF64
		}

		// "result = [globalAddressReg + globalInstanceValueOffset] (== globals[index].Val)"
		c.assembler.CompileMemoryToRegister(
			ldr,
			globalAddressReg, globalInstanceValueOffset,
			result,
		)

		c.pushRuntimeValueLocationOnRegister(result, vt)
	}
	return nil
}
   511  
   512  // compileGlobalSet implements compiler.compileGlobalSet for the arm64 architecture.
   513  func (c *arm64Compiler) compileGlobalSet(o *wazeroir.OperationGlobalSet) error {
   514  	wasmValueType := c.ir.Globals[o.Index].ValType
   515  	isV128 := wasmValueType == wasm.ValueTypeV128
   516  
   517  	var val *runtimeValueLocation
   518  	if isV128 {
   519  		val = c.locationStack.popV128()
   520  	} else {
   521  		val = c.locationStack.pop()
   522  	}
   523  	if err := c.compileEnsureOnRegister(val); err != nil {
   524  		return err
   525  	}
   526  
   527  	globalInstanceAddressRegister, err := c.compileReadGlobalAddress(o.Index)
   528  	if err != nil {
   529  		return err
   530  	}
   531  
   532  	if isV128 {
   533  		c.assembler.CompileVectorRegisterToMemory(arm64.VMOV,
   534  			val.register, globalInstanceAddressRegister, globalInstanceValueOffset,
   535  			arm64.VectorArrangementQ)
   536  	} else {
   537  		var str asm.Instruction
   538  		switch c.ir.Globals[o.Index].ValType {
   539  		case wasm.ValueTypeI32:
   540  			str = arm64.STRW
   541  		case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
   542  			str = arm64.STRD
   543  		case wasm.ValueTypeF32:
   544  			str = arm64.FSTRS
   545  		case wasm.ValueTypeF64:
   546  			str = arm64.FSTRD
   547  		}
   548  
   549  		// At this point "globalInstanceAddressRegister = globals[index]".
   550  		// Therefore, this means "globals[index].Val = val.register"
   551  		c.assembler.CompileRegisterToMemory(
   552  			str,
   553  			val.register,
   554  			globalInstanceAddressRegister, globalInstanceValueOffset,
   555  		)
   556  	}
   557  
   558  	c.markRegisterUnused(val.register)
   559  	return nil
   560  }
   561  
// compileReadGlobalAddress adds instructions to store the absolute address of the global instance
// at globalIndex into a register, which is returned as destinationRegister.
func (c *arm64Compiler) compileReadGlobalAddress(globalIndex uint32) (destinationRegister asm.Register, err error) {
	// TODO: rethink about the type used in store `globals []*GlobalInstance`.
	// If we use `[]GlobalInstance` instead, we could reduce one MOV instruction here.

	destinationRegister, err = c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return
	}

	// "destinationRegister = globalIndex * 8"
	c.assembler.CompileConstToRegister(
		// globalIndex is an index to []*GlobalInstance, therefore
		// we have to multiply it by the size of *GlobalInstance == the pointer size == 8.
		arm64.MOVD, int64(globalIndex)*8, destinationRegister,
	)

	// "arm64ReservedRegisterForTemporary = &globals[0]"
	c.assembler.CompileMemoryToRegister(
		arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset,
		arm64ReservedRegisterForTemporary,
	)

	// "destinationRegister = [arm64ReservedRegisterForTemporary + destinationRegister] (== globals[globalIndex])".
	c.assembler.CompileMemoryWithRegisterOffsetToRegister(
		arm64.LDRD,
		arm64ReservedRegisterForTemporary, destinationRegister,
		destinationRegister,
	)
	return
}
   594  
   595  // compileBr implements compiler.compileBr for the arm64 architecture.
   596  func (c *arm64Compiler) compileBr(o *wazeroir.OperationBr) error {
   597  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   598  		return err
   599  	}
   600  	return c.compileBranchInto(o.Target)
   601  }
   602  
// compileBrIf implements compiler.compileBrIf for the arm64 architecture.
// It pops the condition, emits a conditional branch for the Then target, and
// falls through to the Else target.
func (c *arm64Compiler) compileBrIf(o *wazeroir.OperationBrIf) error {
	cond := c.locationStack.pop()

	var conditionalBR asm.Node
	if cond.onConditionalRegister() {
		// If the cond is on a conditional register, it corresponds to one of "conditional codes"
		// https://developer.arm.com/documentation/dui0801/a/Condition-Codes/Condition-code-suffixes
		// Here we represent the conditional codes by using arm64.COND_** registers, and that means the
		// conditional jump can be performed if we use arm64.B**.
		// For example, if we have arm64.CondEQ on cond, that means we performed compileEq right before
		// this compileBrIf and BrIf can be achieved by arm64.BCONDEQ.
		var brInst asm.Instruction
		switch cond.conditionalRegister {
		case arm64.CondEQ:
			brInst = arm64.BCONDEQ
		case arm64.CondNE:
			brInst = arm64.BCONDNE
		case arm64.CondHS:
			brInst = arm64.BCONDHS
		case arm64.CondLO:
			brInst = arm64.BCONDLO
		case arm64.CondMI:
			brInst = arm64.BCONDMI
		case arm64.CondHI:
			brInst = arm64.BCONDHI
		case arm64.CondLS:
			brInst = arm64.BCONDLS
		case arm64.CondGE:
			brInst = arm64.BCONDGE
		case arm64.CondLT:
			brInst = arm64.BCONDLT
		case arm64.CondGT:
			brInst = arm64.BCONDGT
		case arm64.CondLE:
			brInst = arm64.BCONDLE
		default:
			// BUG: This means that we use the cond.conditionalRegister somewhere in this file,
			// but not covered in switch ^. That shouldn't happen.
			return fmt.Errorf("unsupported condition for br_if: %v", cond.conditionalRegister)
		}
		conditionalBR = c.assembler.CompileJump(brInst)
	} else {
		// If the value is not on the conditional register, we compare the value with the zero register,
		// and then do the conditional BR if the value doesn't equal zero.
		if err := c.compileEnsureOnRegister(cond); err != nil {
			return err
		}
		// Compare the value with zero register. Note that the value is ensured to be i32 by function validation phase,
		// so we use CMPW (32-bit compare) here.
		c.assembler.CompileTwoRegistersToNone(arm64.CMPW, cond.register, arm64.RegRZR)

		conditionalBR = c.assembler.CompileJump(arm64.BCONDNE)

		c.markRegisterUnused(cond.register)
	}

	// Emit the code for branching into else branch.
	// We save and clone the location stack because we might end up modifying it inside of branchInto,
	// and we have to avoid affecting the code generation for Then branch afterwards.
	saved := c.locationStack
	c.setLocationStack(saved.clone())
	if err := compileDropRange(c, o.Else.ToDrop); err != nil {
		return err
	}
	if err := c.compileBranchInto(o.Else.Target); err != nil {
		return err
	}

	// Now ready to emit the code for branching into then branch.
	// Retrieve the original value location stack so that the code below won't be affected by the Else branch ^^.
	c.setLocationStack(saved)
	// We branch into here from the original conditional BR (conditionalBR).
	c.assembler.SetJumpTargetOnNext(conditionalBR)
	if err := compileDropRange(c, o.Then.ToDrop); err != nil {
		return err
	}
	return c.compileBranchInto(o.Then.Target)
}
   682  
   683  func (c *arm64Compiler) compileBranchInto(target *wazeroir.BranchTarget) error {
   684  	if target.IsReturnTarget() {
   685  		return c.compileReturnFunction()
   686  	} else {
   687  		labelKey := target.String()
   688  		if c.ir.LabelCallers[labelKey] > 1 {
   689  			// We can only re-use register state if when there's a single call-site.
   690  			// Release existing values on registers to the stack if there's multiple ones to have
   691  			// the consistent value location state at the beginning of label.
   692  			if err := c.compileReleaseAllRegistersToStack(); err != nil {
   693  				return err
   694  			}
   695  		}
   696  		// Set the initial stack of the target label, so we can start compiling the label
   697  		// with the appropriate value locations. Note we clone the stack here as we maybe
   698  		// manipulate the stack before compiler reaches the label.
   699  		targetLabel := c.label(labelKey)
   700  		if targetLabel.initialStack == nil {
   701  			targetLabel.initialStack = c.locationStack.clone()
   702  		}
   703  
   704  		br := c.assembler.CompileJump(arm64.B)
   705  		c.assignBranchTarget(labelKey, br)
   706  		return nil
   707  	}
   708  }
   709  
   710  // assignBranchTarget assigns the given label's initial instruction to the destination of br.
   711  func (c *arm64Compiler) assignBranchTarget(labelKey string, br asm.Node) {
   712  	target := c.label(labelKey)
   713  	if target.initialInstruction != nil {
   714  		br.AssignJumpTarget(target.initialInstruction)
   715  	} else {
   716  		// This case, the target label hasn't been compiled yet, so we append the callback and assign
   717  		// the target instruction when compileLabel is called for the label.
   718  		target.labelBeginningCallbacks = append(target.labelBeginningCallbacks, func(labelInitialInstruction asm.Node) {
   719  			br.AssignJumpTarget(labelInitialInstruction)
   720  		})
   721  	}
   722  }
   723  
// compileBrTable implements compiler.compileBrTable for the arm64 architecture.
//
// The emitted code clamps the i32 index to len(o.Targets) (out-of-range selects
// the default target), loads a 32-bit offset from a static jump table, adds it
// to the address of the first target's code, and branches there indirectly.
func (c *arm64Compiler) compileBrTable(o *wazeroir.OperationBrTable) error {
	// If the operation only consists of the default target, we branch into it and return early.
	if len(o.Targets) == 0 {
		loc := c.locationStack.pop()
		if loc.onRegister() {
			c.markRegisterUnused(loc.register)
		}
		if err := compileDropRange(c, o.Default.ToDrop); err != nil {
			return err
		}
		return c.compileBranchInto(o.Default.Target)
	}

	index := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(index); err != nil {
		return err
	}

	// The index is clamped and shifted below, so it must live on a writable
	// register; if it currently sits on the zero register, move it off.
	if isZeroRegister(index.register) {
		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}
		index.setRegister(reg)
		c.markRegisterUsed(reg)

		// Zero the value on a picked register.
		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, reg)
	}

	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}

	// Load the branch table's length.
	// "tmpReg = len(o.Targets)"
	c.assembler.CompileConstToRegister(arm64.MOVW, int64(len(o.Targets)), tmpReg)
	// Compare the length with offset.
	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmpReg, index.register)
	// If the value exceeds the length, we will branch into the default target (corresponding to len(o.Targets) index).
	brDefaultIndex := c.assembler.CompileJump(arm64.BCONDLO)
	// Out-of-range: clamp the index to len(o.Targets), which selects the default target below.
	c.assembler.CompileRegisterToRegister(arm64.MOVW, tmpReg, index.register)
	c.assembler.SetJumpTargetOnNext(brDefaultIndex)

	// We prepare the asm.StaticConst which holds the offset of
	// each target's first instruction (incl. default)
	// relative to the beginning of label tables.
	//
	// For example, if we have targets=[L0, L1] and default=L_DEFAULT,
	// we emit the code like this at [Emit the code for each target and default branch] below.
	//
	// L0:
	//  0x123001: XXXX, ...
	//  .....
	// L1:
	//  0x123005: YYY, ...
	//  .....
	// L_DEFAULT:
	//  0x123009: ZZZ, ...
	//
	// then offsetData becomes like [0x0, 0x5, 0x8].
	// By using this offset list, we could jump into the label for the index by
	// "jmp offsetData[index]+0x123001" and "0x123001" can be acquired by "LEA"
	// instruction.
	//
	// Note: We store each offset of 32-bit unsigned integer as 4 consecutive bytes. So more precisely,
	// the above example's offsetData would be [0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0].
	//
	// Note: this is similar to how GCC implements Switch statements in C.
	offsetData := asm.NewStaticConst(make([]byte, 4*(len(o.Targets)+1)))

	// "tmpReg = &offsetData[0]"
	c.assembler.CompileStaticConstToRegister(arm64.ADR, offsetData, tmpReg)

	// "index.register = tmpReg + (index.register << 2) (== &offsetData[offset])"
	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD, index.register, 2, tmpReg, index.register)

	// "index.register = *index.register (== offsetData[offset])"
	c.assembler.CompileMemoryToRegister(arm64.LDRW, index.register, 0, index.register)

	// Now we read the address of the beginning of the jump table.
	// In the above example, this corresponds to reading the address of 0x123001.
	c.assembler.CompileReadInstructionAddress(tmpReg, arm64.B)

	// Now we have the address of L0 in tmp register, and the offset to the target label in the index.register.
	// So we could achieve the br_table jump by adding them and jump into the resulting address.
	c.assembler.CompileRegisterToRegister(arm64.ADD, tmpReg, index.register)

	c.assembler.CompileJumpToRegister(arm64.B, index.register)

	// We no longer need the index's register, so mark it unused.
	c.markRegisterUnused(index.register)

	// [Emit the code for each targets and default branch]
	labelInitialInstructions := make([]asm.Node, len(o.Targets)+1)
	saved := c.locationStack
	for i := range labelInitialInstructions {
		// Emit the initial instruction of each target where
		// we use NOP as we don't yet know the next instruction in each label.
		init := c.assembler.CompileStandAlone(arm64.NOP)
		labelInitialInstructions[i] = init

		var locationStack *runtimeValueLocationStack
		var target *wazeroir.BranchTargetDrop
		if i < len(o.Targets) {
			target = o.Targets[i]
			// Clone the location stack so the branch-specific code doesn't
			// affect others.
			locationStack = saved.clone()
		} else {
			target = o.Default
			// If this is the default branch, we use the original one
			// as this is the last code in this block.
			locationStack = saved
		}
		c.setLocationStack(locationStack)
		if err := compileDropRange(c, target.ToDrop); err != nil {
			return err
		}
		if err := c.compileBranchInto(target.Target); err != nil {
			return err
		}
	}

	// Finally, let the assembler fill offsetData with the actual per-target
	// offsets once instruction addresses are resolved.
	c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
	return nil
}
   853  
// compileCall implements compiler.compileCall for the arm64 architecture.
//
// It resolves the target *function from ce.functions by the static function
// index and delegates the actual call sequence to compileCallImpl.
func (c *arm64Compiler) compileCall(o *wazeroir.OperationCall) error {
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}

	// The callee's signature, used by compileCallImpl to size the call frame.
	tp := c.ir.Types[c.ir.Functions[o.FunctionIndex]]

	targetFunctionAddressReg, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}
	c.markRegisterUsed(targetFunctionAddressReg)
	defer c.markRegisterUnused(targetFunctionAddressReg)

	// Resolve the target *function:
	// first, read the address of the first item of ce.functions slice
	// (= &ce.functions[0]) into targetFunctionAddressReg.
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
		targetFunctionAddressReg)

	// Then load ce.functions[o.FunctionIndex] into the same register.
	c.assembler.CompileMemoryToRegister(
		arm64.LDRD,
		targetFunctionAddressReg, int64(o.FunctionIndex)*8, // * 8 because the size of *function equals 8 bytes.
		targetFunctionAddressReg)

	return c.compileCallImpl(targetFunctionAddressReg, tp)
}
   884  
// compileCallImpl implements compiler.compileCall and compiler.compileCallIndirect for the arm64 architecture.
//
// targetFunctionAddressRegister holds the address of the callee's *function.
// The emitted code pushes a call frame (return address, caller's stack base
// pointer in bytes, and the caller's *function) onto the value stack, updates
// callEngine's stack base pointer and current function for the callee, then
// branches into the callee's first instruction. The code emitted after that
// branch is what executes when the callee returns.
func (c *arm64Compiler) compileCallImpl(targetFunctionAddressRegister asm.Register, functype *wasm.FunctionType) error {
	// Release all the registers as our calling convention requires the caller-save.
	if err := c.compileReleaseAllRegistersToStack(); err != nil {
		return err
	}

	tmp, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
	if !ok {
		panic("BUG: cannot take a free register")
	}

	// The stack should look like:
	//
	//               reserved slots for results (if len(results) > len(args))
	//                      |     |
	//    ,arg0, ..., argN, ..., _, .returnAddress, .returnStackBasePointerInBytes, .function, ....
	//      |                       |                                                        |
	//      |             callFrame{^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^}
	//      |
	// nextStackBasePointerOffset
	//
	// where callFrame is used to return to this currently executed function.

	// The callee's stack base starts where its arguments begin.
	nextStackBasePointerOffset := int64(c.locationStack.sp) - int64(functype.ParamNumInUint64)

	callFrameReturnAddressLoc, callFrameStackBasePointerInBytesLoc, callFrameFunctionLoc := c.locationStack.pushCallFrame(functype)

	// Save the current stack base pointer at callFrameStackBasePointerInBytesLoc.
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
		tmp)
	callFrameStackBasePointerInBytesLoc.setRegister(tmp)
	c.compileReleaseRegisterToStack(callFrameStackBasePointerInBytesLoc)

	// Set callEngine.stackContext.stackBasePointer for the next function.
	// nextStackBasePointerOffset is in value slots; << 3 (== *8) converts to bytes.
	c.assembler.CompileConstToRegister(arm64.ADD, nextStackBasePointerOffset<<3, tmp)
	c.assembler.CompileRegisterToMemory(arm64.STRD,
		tmp,
		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)

	// Save the currently executed *function (placed at callEngine.moduleContext.fn) into callFrameFunctionLoc.
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset,
		tmp)
	callFrameFunctionLoc.setRegister(tmp)
	c.compileReleaseRegisterToStack(callFrameFunctionLoc)

	// Set callEngine.moduleContext.fn to the next *function.
	c.assembler.CompileRegisterToMemory(arm64.STRD,
		targetFunctionAddressRegister,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)

	// Write the return address into callFrameReturnAddressLoc.
	// This reads the address of the instruction after the upcoming B (the
	// branch into the callee below), i.e. where execution resumes on return.
	c.assembler.CompileReadInstructionAddress(tmp, arm64.B)
	callFrameReturnAddressLoc.setRegister(tmp)
	c.compileReleaseRegisterToStack(callFrameReturnAddressLoc)

	if targetFunctionAddressRegister == arm64CallingConventionModuleInstanceAddressRegister {
		// This case we must move the value on targetFunctionAddressRegister to another register, otherwise
		// the address (jump target below) will be modified and result in segfault.
		// See #526.
		c.assembler.CompileRegisterToRegister(arm64.MOVD, targetFunctionAddressRegister, tmp)
		targetFunctionAddressRegister = tmp
	}

	// Also, we have to put the code's moduleInstance address into arm64CallingConventionModuleInstanceAddressRegister.
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		targetFunctionAddressRegister, functionModuleInstanceAddressOffset,
		arm64CallingConventionModuleInstanceAddressRegister,
	)

	// Then, br into the target function's initial address.
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		targetFunctionAddressRegister, functionCodeInitialAddressOffset,
		targetFunctionAddressRegister)

	c.assembler.CompileJumpToRegister(arm64.B, targetFunctionAddressRegister)

	// --- Everything below is compiled for the state AFTER the callee returns. ---

	// We consumed the function parameters, the call frame stack and reserved slots during the call.
	c.locationStack.sp = uint64(nextStackBasePointerOffset)

	// Also, the function results were pushed by the call.
	for _, t := range functype.Results {
		loc := c.locationStack.pushRuntimeValueLocationOnStack()
		switch t {
		case wasm.ValueTypeI32:
			loc.valueType = runtimeValueTypeI32
		case wasm.ValueTypeI64, wasm.ValueTypeFuncref, wasm.ValueTypeExternref:
			loc.valueType = runtimeValueTypeI64
		case wasm.ValueTypeF32:
			loc.valueType = runtimeValueTypeF32
		case wasm.ValueTypeF64:
			loc.valueType = runtimeValueTypeF64
		case wasm.ValueTypeV128:
			// A v128 result occupies two 64-bit slots (lo then hi).
			loc.valueType = runtimeValueTypeV128Lo
			hi := c.locationStack.pushRuntimeValueLocationOnStack()
			hi.valueType = runtimeValueTypeV128Hi
		}
	}

	// The callee may have modified module context state (e.g. by calling into
	// another module), so re-initialize it after the return.
	if err := c.compileModuleContextInitialization(); err != nil {
		return err
	}

	// On the function return, we initialize the state for this function.
	c.compileReservedStackBasePointerRegisterInitialization()

	c.compileReservedMemoryRegisterInitialization()
	return nil
}
   996  
// compileCallIndirect implements compiler.compileCallIndirect for the arm64 architecture.
//
// The emitted code performs three runtime checks before the call: the table
// offset is within bounds, the table element is initialized (non-nil), and the
// element's type ID matches the expected function type; on any failure it exits
// native code with the corresponding status. Otherwise it calls the resolved
// *function via compileCallImpl.
func (c *arm64Compiler) compileCallIndirect(o *wazeroir.OperationCallIndirect) error {
	offset := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(offset); err != nil {
		return err
	}

	// The offset register is overwritten below (shifted and dereferenced),
	// so it must not be the zero register.
	if isZeroRegister(offset.register) {
		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}
		offset.setRegister(reg)
		c.markRegisterUsed(reg)

		// Zero the value on a picked register.
		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, reg)
	}

	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}
	c.markRegisterUsed(tmp)

	tmp2, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}
	c.markRegisterUsed(tmp2)

	// First, we need to check if the offset doesn't exceed the length of table.
	// "tmp = &Tables[0]"
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
		tmp,
	)
	// tmp = [tmp + TableIndex*8] = [&Tables[0] + TableIndex*sizeOf(*tableInstance)] = Tables[tableIndex]
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		tmp, int64(o.TableIndex)*8,
		tmp,
	)
	// tmp2 = [tmp + tableInstanceTableLenOffset] = len(Tables[tableIndex])
	c.assembler.CompileMemoryToRegister(arm64.LDRD, tmp, tableInstanceTableLenOffset, tmp2)

	// "cmp tmp2, offset"
	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp2, offset.register)

	// If it exceeds len(table), we exit the execution.
	brIfOffsetOK := c.assembler.CompileJump(arm64.BCONDLO)
	c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)

	// Otherwise, we proceed to do function type check.
	c.assembler.SetJumpTargetOnNext(brIfOffsetOK)

	// We need to obtain the absolute address of the table element.
	// "tmp = &Tables[tableIndex].table[0]"
	c.assembler.CompileMemoryToRegister(
		arm64.LDRD,
		tmp, tableInstanceTableOffset,
		tmp,
	)
	// "offset = tmp + (offset << pointerSizeLog2) (== &table[offset])"
	// Here we left shifting by 3 in order to get the offset in bytes,
	// and the table element type is uintptr which is 8 bytes.
	c.assembler.CompileLeftShiftedRegisterToRegister(
		arm64.ADD,
		offset.register, pointerSizeLog2,
		tmp,
		offset.register,
	)

	// "offset = (*offset) (== table[offset])"
	c.assembler.CompileMemoryToRegister(arm64.LDRD, offset.register, 0, offset.register)

	// Check if the value of table[offset] equals zero, meaning that the target element is uninitialized.
	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, offset.register)
	brIfInitialized := c.assembler.CompileJump(arm64.BCONDNE)
	c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)

	c.assembler.SetJumpTargetOnNext(brIfInitialized)
	// next we check the type matches, i.e. table[offset].source.TypeID == targetFunctionType.
	// "tmp = table[offset].source ( == *FunctionInstance type)"
	c.assembler.CompileMemoryToRegister(
		arm64.LDRD,
		offset.register, functionSourceOffset,
		tmp,
	)
	// "tmp = [tmp + functionInstanceTypeIDOffset] (== table[offset].source.TypeID)"
	c.assembler.CompileMemoryToRegister(
		arm64.LDRW, tmp, functionInstanceTypeIDOffset,
		tmp,
	)
	// "tmp2 = ModuleInstance.TypeIDs[index]"
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset,
		tmp2)
	c.assembler.CompileMemoryToRegister(arm64.LDRW, tmp2, int64(o.TypeIndex)*4, tmp2)

	// Compare these two values, and if they equal, we are ready to make function call.
	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmp, tmp2)
	brIfTypeMatched := c.assembler.CompileJump(arm64.BCONDEQ)
	c.compileExitFromNativeCode(nativeCallStatusCodeTypeMismatchOnIndirectCall)

	c.assembler.SetJumpTargetOnNext(brIfTypeMatched)

	targetFunctionType := c.ir.Types[o.TypeIndex]
	if err := c.compileCallImpl(offset.register, targetFunctionType); err != nil {
		return err
	}

	// The offset register should be marked as un-used as we consumed in the function call.
	c.markRegisterUnused(offset.register, tmp, tmp2)
	return nil
}
  1112  
// compileDrop implements compiler.compileDrop for the arm64 architecture.
//
// It delegates to the architecture-independent compileDropRange helper,
// dropping the range of values designated by o.Depth from the location stack.
func (c *arm64Compiler) compileDrop(o *wazeroir.OperationDrop) error {
	return compileDropRange(c, o.Depth)
}
  1117  
// compileSelectV128Impl emits the code for a v128-typed select: it pops the two
// v128 operands (x2 then x1) and, using the i32 selector already held in
// selectorRegister, leaves x1's value on x1.register when the selector is
// non-zero, and x2's value otherwise.
func (c *arm64Compiler) compileSelectV128Impl(selectorRegister asm.Register) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	// Compare the selector with zero; skip the copy below when it is non-zero
	// (i.e. when x1 is selected).
	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, arm64.RegRZR, selectorRegister)
	brIfNotZero := c.assembler.CompileJump(arm64.BCONDNE)

	// In this branch, we select the value of x2, so we move the value into x1.register so that
	// we can have the result in x1.register regardless of the selection.
	// (VORR of a register with itself into the destination acts as a 128-bit move.)
	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR,
		x2.register, x2.register, x1.register, arm64.VectorArrangement16B)

	c.assembler.SetJumpTargetOnNext(brIfNotZero)

	// As noted, the result exists in x1.register regardless of the selector.
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	// Plus, x2.register is no longer used.
	c.markRegisterUnused(x2.register)
	return nil
}
  1145  
// compileSelect implements compiler.compileSelect for the arm64 architecture.
//
// It pops the i32 selector and the two operands, then arranges the emitted
// code so that the chosen value always ends up on x1's register: x1 when the
// selector is non-zero, x2 otherwise.
func (c *arm64Compiler) compileSelect(o *wazeroir.OperationSelect) error {
	cv, err := c.popValueOnRegister()
	if err != nil {
		return err
	}

	// Vector operands are handled by the dedicated v128 path.
	if o.IsTargetVector {
		return c.compileSelectV128Impl(cv.register)
	}

	c.markRegisterUsed(cv.register)

	x1, x2, err := c.popTwoValuesOnRegisters()
	if err != nil {
		return err
	}

	if isZeroRegister(x1.register) && isZeroRegister(x2.register) {
		// If both values are zero, the result is always zero.
		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
		c.markRegisterUnused(cv.register)
		return nil
	}

	// In the following, we emit the code so that x1's register contains the chosen value
	// no matter which of original x1 or x2 is selected.
	//
	// If x1 is currently on zero register, we cannot place the result because
	// "MOV arm64.RegRZR x2.register" results in arm64.RegRZR regardless of the value.
	// So we explicitly assign a general purpose register to x1 here.
	if isZeroRegister(x1.register) {
		// Mark x2 and cv's registers are used so they won't be chosen.
		c.markRegisterUsed(x2.register)
		// Pick the non-zero register for x1.
		x1Reg, err := c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}
		x1.setRegister(x1Reg)
		// And zero out the picked register.
		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, x1Reg)
	}

	// At this point, x1 is non-zero register, and x2 is either general purpose or zero register.

	c.assembler.CompileTwoRegistersToNone(arm64.CMPW, arm64.RegRZR, cv.register)
	brIfNotZero := c.assembler.CompileJump(arm64.BCONDNE)

	// If cv == 0, we move the value of x2 to the x1.register.

	switch x1.valueType {
	case runtimeValueTypeI32:
		// TODO: use 32-bit mov
		c.assembler.CompileRegisterToRegister(arm64.MOVD, x2.register, x1.register)
	case runtimeValueTypeI64:
		c.assembler.CompileRegisterToRegister(arm64.MOVD, x2.register, x1.register)
	case runtimeValueTypeF32:
		// TODO: use 32-bit mov
		c.assembler.CompileRegisterToRegister(arm64.FMOVD, x2.register, x1.register)
	case runtimeValueTypeF64:
		c.assembler.CompileRegisterToRegister(arm64.FMOVD, x2.register, x1.register)
	default:
		return errors.New("TODO: implement vector type select")
	}

	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)

	// Otherwise, nothing to do for select.
	c.assembler.SetJumpTargetOnNext(brIfNotZero)

	// Only x1.register is reused.
	c.markRegisterUnused(cv.register, x2.register)
	return nil
}
  1221  
// compilePick implements compiler.compilePick for the arm64 architecture.
//
// It duplicates the value at o.Depth below the top of the virtual stack onto a
// newly allocated register and pushes that location, regardless of whether the
// pick target currently lives on a register or on the memory stack.
func (c *arm64Compiler) compilePick(o *wazeroir.OperationPick) error {
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}

	pickTarget := c.locationStack.stack[c.locationStack.sp-1-uint64(o.Depth)]
	pickedRegister, err := c.allocateRegister(pickTarget.getRegisterType())
	if err != nil {
		return err
	}

	if pickTarget.onRegister() { // Copy the value to the pickedRegister.
		switch pickTarget.valueType {
		case runtimeValueTypeI32:
			c.assembler.CompileRegisterToRegister(arm64.MOVW, pickTarget.register, pickedRegister)
		case runtimeValueTypeI64:
			c.assembler.CompileRegisterToRegister(arm64.MOVD, pickTarget.register, pickedRegister)
		case runtimeValueTypeF32:
			c.assembler.CompileRegisterToRegister(arm64.FMOVS, pickTarget.register, pickedRegister)
		case runtimeValueTypeF64:
			c.assembler.CompileRegisterToRegister(arm64.FMOVD, pickTarget.register, pickedRegister)
		case runtimeValueTypeV128Lo:
			// VORR of a register with itself into the destination acts as a 128-bit move.
			c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR,
				pickTarget.register, pickTarget.register, pickedRegister, arm64.VectorArrangement16B)
		case runtimeValueTypeV128Hi:
			panic("BUG") // since pick target must point to the lower 64-bits of vectors.
		}
	} else if pickTarget.onStack() {
		// Temporarily assign a register to the pick target, and then load the value.
		pickTarget.setRegister(pickedRegister)
		c.compileLoadValueOnStackToRegister(pickTarget)

		// After the load, we revert the register assignment to the pick target.
		pickTarget.setRegister(asm.NilRegister)
		if o.IsTargetVector {
			// A vector occupies two stack slots; revert the hi part's assignment too.
			hi := c.locationStack.stack[pickTarget.stackPointer+1]
			hi.setRegister(asm.NilRegister)
		}
	}

	// Now we have the value of the target on the pickedRegister,
	// so push the location.
	c.pushRuntimeValueLocationOnRegister(pickedRegister, pickTarget.valueType)
	if o.IsTargetVector {
		c.pushRuntimeValueLocationOnRegister(pickedRegister, runtimeValueTypeV128Hi)
	}
	return nil
}
  1271  
  1272  // compileAdd implements compiler.compileAdd for the arm64 architecture.
  1273  func (c *arm64Compiler) compileAdd(o *wazeroir.OperationAdd) error {
  1274  	x1, x2, err := c.popTwoValuesOnRegisters()
  1275  	if err != nil {
  1276  		return err
  1277  	}
  1278  
  1279  	// Addition can be nop if one of operands is zero.
  1280  	if isZeroRegister(x1.register) {
  1281  		c.pushRuntimeValueLocationOnRegister(x2.register, x1.valueType)
  1282  		return nil
  1283  	} else if isZeroRegister(x2.register) {
  1284  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1285  		return nil
  1286  	}
  1287  
  1288  	var inst asm.Instruction
  1289  	switch o.Type {
  1290  	case wazeroir.UnsignedTypeI32:
  1291  		inst = arm64.ADDW
  1292  	case wazeroir.UnsignedTypeI64:
  1293  		inst = arm64.ADD
  1294  	case wazeroir.UnsignedTypeF32:
  1295  		inst = arm64.FADDS
  1296  	case wazeroir.UnsignedTypeF64:
  1297  		inst = arm64.FADDD
  1298  	}
  1299  
  1300  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1301  	// The result is placed on a register for x1, so record it.
  1302  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1303  	return nil
  1304  }
  1305  
  1306  // compileSub implements compiler.compileSub for the arm64 architecture.
  1307  func (c *arm64Compiler) compileSub(o *wazeroir.OperationSub) error {
  1308  	x1, x2, err := c.popTwoValuesOnRegisters()
  1309  	if err != nil {
  1310  		return err
  1311  	}
  1312  
  1313  	// If both of registers are zeros, this can be nop and push the zero register.
  1314  	if isZeroRegister(x1.register) && isZeroRegister(x2.register) {
  1315  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1316  		return nil
  1317  	}
  1318  
  1319  	// At this point, at least one of x1 or x2 registers is non zero.
  1320  	// Choose the non-zero register as destination.
  1321  	destinationReg := x1.register
  1322  	if isZeroRegister(x1.register) {
  1323  		destinationReg = x2.register
  1324  	}
  1325  
  1326  	var inst asm.Instruction
  1327  	var vt runtimeValueType
  1328  	switch o.Type {
  1329  	case wazeroir.UnsignedTypeI32:
  1330  		inst = arm64.SUBW
  1331  		vt = runtimeValueTypeI32
  1332  	case wazeroir.UnsignedTypeI64:
  1333  		inst = arm64.SUB
  1334  		vt = runtimeValueTypeI64
  1335  	case wazeroir.UnsignedTypeF32:
  1336  		inst = arm64.FSUBS
  1337  		vt = runtimeValueTypeF32
  1338  	case wazeroir.UnsignedTypeF64:
  1339  		inst = arm64.FSUBD
  1340  		vt = runtimeValueTypeF64
  1341  	}
  1342  
  1343  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1344  	c.pushRuntimeValueLocationOnRegister(destinationReg, vt)
  1345  	return nil
  1346  }
  1347  
  1348  // compileMul implements compiler.compileMul for the arm64 architecture.
  1349  func (c *arm64Compiler) compileMul(o *wazeroir.OperationMul) error {
  1350  	x1, x2, err := c.popTwoValuesOnRegisters()
  1351  	if err != nil {
  1352  		return err
  1353  	}
  1354  
  1355  	// Multiplication can be done by putting a zero register if one of operands is zero.
  1356  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1357  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1358  		return nil
  1359  	}
  1360  
  1361  	var inst asm.Instruction
  1362  	var vt runtimeValueType
  1363  	switch o.Type {
  1364  	case wazeroir.UnsignedTypeI32:
  1365  		inst = arm64.MULW
  1366  		vt = runtimeValueTypeI32
  1367  	case wazeroir.UnsignedTypeI64:
  1368  		inst = arm64.MUL
  1369  		vt = runtimeValueTypeI64
  1370  	case wazeroir.UnsignedTypeF32:
  1371  		inst = arm64.FMULS
  1372  		vt = runtimeValueTypeF32
  1373  	case wazeroir.UnsignedTypeF64:
  1374  		inst = arm64.FMULD
  1375  		vt = runtimeValueTypeF64
  1376  	}
  1377  
  1378  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1379  	// The result is placed on a register for x1, so record it.
  1380  	c.pushRuntimeValueLocationOnRegister(x1.register, vt)
  1381  	return nil
  1382  }
  1383  
  1384  // compileClz implements compiler.compileClz for the arm64 architecture.
  1385  func (c *arm64Compiler) compileClz(o *wazeroir.OperationClz) error {
  1386  	v, err := c.popValueOnRegister()
  1387  	if err != nil {
  1388  		return err
  1389  	}
  1390  
  1391  	if isZeroRegister(v.register) {
  1392  		// If the target is zero register, the result is always 32 (or 64 for 64-bits),
  1393  		// so we allocate a register and put the const on it.
  1394  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1395  		if err != nil {
  1396  			return err
  1397  		}
  1398  		var vt runtimeValueType
  1399  		if o.Type == wazeroir.UnsignedInt32 {
  1400  			vt = runtimeValueTypeI32
  1401  			c.assembler.CompileConstToRegister(arm64.MOVW, 32, reg)
  1402  		} else {
  1403  			vt = runtimeValueTypeI64
  1404  			c.assembler.CompileConstToRegister(arm64.MOVD, 64, reg)
  1405  		}
  1406  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  1407  		return nil
  1408  	}
  1409  
  1410  	reg := v.register
  1411  	var vt runtimeValueType
  1412  	if o.Type == wazeroir.UnsignedInt32 {
  1413  		vt = runtimeValueTypeI32
  1414  		c.assembler.CompileRegisterToRegister(arm64.CLZW, reg, reg)
  1415  	} else {
  1416  		vt = runtimeValueTypeI64
  1417  		c.assembler.CompileRegisterToRegister(arm64.CLZ, reg, reg)
  1418  	}
  1419  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  1420  	return nil
  1421  }
  1422  
  1423  // compileCtz implements compiler.compileCtz for the arm64 architecture.
  1424  func (c *arm64Compiler) compileCtz(o *wazeroir.OperationCtz) error {
  1425  	v, err := c.popValueOnRegister()
  1426  	if err != nil {
  1427  		return err
  1428  	}
  1429  
  1430  	reg := v.register
  1431  	if isZeroRegister(reg) {
  1432  		// If the target is zero register, the result is always 32 (or 64 for 64-bits),
  1433  		// so we allocate a register and put the const on it.
  1434  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1435  		if err != nil {
  1436  			return err
  1437  		}
  1438  		var vt runtimeValueType
  1439  		if o.Type == wazeroir.UnsignedInt32 {
  1440  			vt = runtimeValueTypeI32
  1441  			c.assembler.CompileConstToRegister(arm64.MOVW, 32, reg)
  1442  		} else {
  1443  			vt = runtimeValueTypeI64
  1444  			c.assembler.CompileConstToRegister(arm64.MOVD, 64, reg)
  1445  		}
  1446  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  1447  		return nil
  1448  	}
  1449  
  1450  	// Since arm64 doesn't have an instruction directly counting trailing zeros,
  1451  	// we reverse the bits first, and then do CLZ, which is exactly the same as
  1452  	// gcc implements __builtin_ctz for arm64.
  1453  	var vt runtimeValueType
  1454  	if o.Type == wazeroir.UnsignedInt32 {
  1455  		vt = runtimeValueTypeI32
  1456  		c.assembler.CompileRegisterToRegister(arm64.RBITW, reg, reg)
  1457  		c.assembler.CompileRegisterToRegister(arm64.CLZW, reg, reg)
  1458  	} else {
  1459  		vt = runtimeValueTypeI64
  1460  		c.assembler.CompileRegisterToRegister(arm64.RBIT, reg, reg)
  1461  		c.assembler.CompileRegisterToRegister(arm64.CLZ, reg, reg)
  1462  	}
  1463  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  1464  	return nil
  1465  }
  1466  
  1467  // compilePopcnt implements compiler.compilePopcnt for the arm64 architecture.
  1468  func (c *arm64Compiler) compilePopcnt(o *wazeroir.OperationPopcnt) error {
  1469  	v, err := c.popValueOnRegister()
  1470  	if err != nil {
  1471  		return err
  1472  	}
  1473  
  1474  	reg := v.register
  1475  	if isZeroRegister(reg) {
  1476  		c.pushRuntimeValueLocationOnRegister(reg, v.valueType)
  1477  		return nil
  1478  	}
  1479  
  1480  	freg, err := c.allocateRegister(registerTypeVector)
  1481  	if err != nil {
  1482  		return err
  1483  	}
  1484  
  1485  	// arm64 doesn't have an instruction for population count on scalar register,
  1486  	// so we use the vector one (VCNT).
  1487  	// This exactly what the official Go implements bits.OneCount.
  1488  	// For example, "func () int { return bits.OneCount(10) }" is compiled as
  1489  	//
  1490  	//    MOVD    $10, R0 ;; Load 10.
  1491  	//    FMOVD   R0, F0
  1492  	//    VCNT    V0.B8, V0.B8
  1493  	//    UADDLV  V0.B8, V0
  1494  	//
  1495  	var movInst asm.Instruction
  1496  	if o.Type == wazeroir.UnsignedInt32 {
  1497  		movInst = arm64.FMOVS
  1498  	} else {
  1499  		movInst = arm64.FMOVD
  1500  	}
  1501  	c.assembler.CompileRegisterToRegister(movInst, reg, freg)
  1502  	c.assembler.CompileVectorRegisterToVectorRegister(arm64.VCNT, freg, freg,
  1503  		arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone)
  1504  	c.assembler.CompileVectorRegisterToVectorRegister(arm64.UADDLV, freg, freg, arm64.VectorArrangement8B,
  1505  		arm64.VectorIndexNone, arm64.VectorIndexNone)
  1506  
  1507  	c.assembler.CompileRegisterToRegister(movInst, freg, reg)
  1508  
  1509  	c.pushRuntimeValueLocationOnRegister(reg, v.valueType)
  1510  	return nil
  1511  }
  1512  
  1513  // compileDiv implements compiler.compileDiv for the arm64 architecture.
  1514  func (c *arm64Compiler) compileDiv(o *wazeroir.OperationDiv) error {
  1515  	dividend, divisor, err := c.popTwoValuesOnRegisters()
  1516  	if err != nil {
  1517  		return err
  1518  	}
  1519  
  1520  	// If the divisor is on the zero register, exit from the function deterministically.
  1521  	if isZeroRegister(divisor.register) {
  1522  		// Push any value so that the subsequent instruction can have a consistent location stack state.
  1523  		c.locationStack.pushRuntimeValueLocationOnStack()
  1524  		c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero)
  1525  		return nil
  1526  	}
  1527  
  1528  	var inst asm.Instruction
  1529  	var vt runtimeValueType
  1530  	switch o.Type {
  1531  	case wazeroir.SignedTypeUint32:
  1532  		inst = arm64.UDIVW
  1533  		if err := c.compileIntegerDivPrecheck(true, false, dividend.register, divisor.register); err != nil {
  1534  			return err
  1535  		}
  1536  		vt = runtimeValueTypeI32
  1537  	case wazeroir.SignedTypeUint64:
  1538  		if err := c.compileIntegerDivPrecheck(false, false, dividend.register, divisor.register); err != nil {
  1539  			return err
  1540  		}
  1541  		inst = arm64.UDIV
  1542  		vt = runtimeValueTypeI64
  1543  	case wazeroir.SignedTypeInt32:
  1544  		if err := c.compileIntegerDivPrecheck(true, true, dividend.register, divisor.register); err != nil {
  1545  			return err
  1546  		}
  1547  		inst = arm64.SDIVW
  1548  		vt = runtimeValueTypeI32
  1549  	case wazeroir.SignedTypeInt64:
  1550  		if err := c.compileIntegerDivPrecheck(false, true, dividend.register, divisor.register); err != nil {
  1551  			return err
  1552  		}
  1553  		inst = arm64.SDIV
  1554  		vt = runtimeValueTypeI64
  1555  	case wazeroir.SignedTypeFloat32:
  1556  		inst = arm64.FDIVS
  1557  		vt = runtimeValueTypeF32
  1558  	case wazeroir.SignedTypeFloat64:
  1559  		inst = arm64.FDIVD
  1560  		vt = runtimeValueTypeF64
  1561  	}
  1562  
  1563  	c.assembler.CompileRegisterToRegister(inst, divisor.register, dividend.register)
  1564  
  1565  	c.pushRuntimeValueLocationOnRegister(dividend.register, vt)
  1566  	return nil
  1567  }
  1568  
// compileIntegerDivPrecheck adds instructions to check if the divisor and dividend are sound for division operation.
// First, this adds instructions to check if the divisor equals zero, and if so, exits the function.
// Plus, for signed divisions, check if the result might result in overflow or not.
//
// is32Bit selects the 32-bit (W-register) variants of the compare/move/load
// instructions; isSigned additionally enables the "MinInt / -1" overflow check
// that only signed division requires. On a violated precondition the emitted
// code exits native code with nativeCallStatusIntegerDivisionByZero or
// nativeCallStatusIntegerOverflow respectively.
func (c *arm64Compiler) compileIntegerDivPrecheck(is32Bit, isSigned bool, dividend, divisor asm.Register) error {
	// We check the divisor value equals zero.
	var cmpInst, movInst, loadInst asm.Instruction
	var minValueOffsetInVM int64
	if is32Bit {
		cmpInst = arm64.CMPW
		movInst = arm64.MOVW
		loadInst = arm64.LDRW
		minValueOffsetInVM = arm64CallEngineArchContextMinimum32BitSignedIntOffset
	} else {
		cmpInst = arm64.CMP
		movInst = arm64.MOVD
		loadInst = arm64.LDRD
		minValueOffsetInVM = arm64CallEngineArchContextMinimum64BitSignedIntOffset
	}
	c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisor)

	// If it is zero, we exit with nativeCallStatusIntegerDivisionByZero.
	brIfDivisorNonZero := c.assembler.CompileJump(arm64.BCONDNE)
	c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero)

	// Otherwise, we proceed.
	c.assembler.SetJumpTargetOnNext(brIfDivisorNonZero)

	// If the operation is a signed integer div, we have to do an additional check on overflow.
	if isSigned {
		// For signed division, we have to have branches for "math.MinInt{32,64} / -1"
		// case which results in the overflow.

		// First, we compare the divisor with -1.
		c.assembler.CompileConstToRegister(movInst, -1, arm64ReservedRegisterForTemporary)
		c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, divisor)

		// If they are not equal, overflow is impossible, so we skip the following check.
		brIfDivisorNonMinusOne := c.assembler.CompileJump(arm64.BCONDNE)

		// Otherwise, we further check if the dividend equals math.MinInt32 or MinInt64.
		// The minimum value's bit pattern is pre-stored in the call engine's arch context.
		c.assembler.CompileMemoryToRegister(
			loadInst,
			arm64ReservedRegisterForCallEngine, minValueOffsetInVM,
			arm64ReservedRegisterForTemporary,
		)
		c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, dividend)

		// If they are not equal, we are safe to execute the division.
		brIfDividendNotMinInt := c.assembler.CompileJump(arm64.BCONDNE)

		// Otherwise, we raise overflow error.
		c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow)

		c.assembler.SetJumpTargetOnNext(brIfDivisorNonMinusOne, brIfDividendNotMinInt)
	}
	return nil
}
  1626  
  1627  // compileRem implements compiler.compileRem for the arm64 architecture.
  1628  func (c *arm64Compiler) compileRem(o *wazeroir.OperationRem) error {
  1629  	dividend, divisor, err := c.popTwoValuesOnRegisters()
  1630  	if err != nil {
  1631  		return err
  1632  	}
  1633  
  1634  	dividendReg := dividend.register
  1635  	divisorReg := divisor.register
  1636  
  1637  	// If the divisor is on the zero register, exit from the function deterministically.
  1638  	if isZeroRegister(divisor.register) {
  1639  		// Push any value so that the subsequent instruction can have a consistent location stack state.
  1640  		c.locationStack.pushRuntimeValueLocationOnStack()
  1641  		c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero)
  1642  		return nil
  1643  	}
  1644  
  1645  	var divInst, msubInst, cmpInst asm.Instruction
  1646  	switch o.Type {
  1647  	case wazeroir.SignedUint32:
  1648  		divInst = arm64.UDIVW
  1649  		msubInst = arm64.MSUBW
  1650  		cmpInst = arm64.CMPW
  1651  	case wazeroir.SignedUint64:
  1652  		divInst = arm64.UDIV
  1653  		msubInst = arm64.MSUB
  1654  		cmpInst = arm64.CMP
  1655  	case wazeroir.SignedInt32:
  1656  		divInst = arm64.SDIVW
  1657  		msubInst = arm64.MSUBW
  1658  		cmpInst = arm64.CMPW
  1659  	case wazeroir.SignedInt64:
  1660  		divInst = arm64.SDIV
  1661  		msubInst = arm64.MSUB
  1662  		cmpInst = arm64.CMP
  1663  	}
  1664  
  1665  	// We check the divisor value equals zero.
  1666  	c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisorReg)
  1667  
  1668  	// If it is zero, we exit with nativeCallStatusIntegerDivisionByZero.
  1669  	brIfDivisorNonZero := c.assembler.CompileJump(arm64.BCONDNE)
  1670  	c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero)
  1671  
  1672  	// Otherwise, we proceed.
  1673  	c.assembler.SetJumpTargetOnNext(brIfDivisorNonZero)
  1674  
  1675  	// Temporarily mark them used to allocate a result register while keeping these values.
  1676  	c.markRegisterUsed(dividend.register, divisor.register)
  1677  
  1678  	resultReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  1679  	if err != nil {
  1680  		return err
  1681  	}
  1682  
  1683  	// arm64 doesn't have an instruction for rem, we use calculate it by two instructions: UDIV (SDIV for signed) and MSUB.
  1684  	// This exactly the same code that Clang emits.
  1685  	// [input: x0=dividend, x1=divisor]
  1686  	// >> UDIV x2, x0, x1
  1687  	// >> MSUB x3, x2, x1, x0
  1688  	// [result: x2=quotient, x3=remainder]
  1689  	//
  1690  	c.assembler.CompileTwoRegistersToRegister(divInst, divisorReg, dividendReg, resultReg)
  1691  	// ResultReg = dividendReg - (divisorReg * resultReg)
  1692  	c.assembler.CompileThreeRegistersToRegister(msubInst, divisorReg, dividendReg, resultReg, resultReg)
  1693  
  1694  	c.markRegisterUnused(dividend.register, divisor.register)
  1695  	c.pushRuntimeValueLocationOnRegister(resultReg, dividend.valueType)
  1696  	return nil
  1697  }
  1698  
  1699  // compileAnd implements compiler.compileAnd for the arm64 architecture.
  1700  func (c *arm64Compiler) compileAnd(o *wazeroir.OperationAnd) error {
  1701  	x1, x2, err := c.popTwoValuesOnRegisters()
  1702  	if err != nil {
  1703  		return err
  1704  	}
  1705  
  1706  	// If either of the registers x1 or x2 is zero,
  1707  	// the result will always be zero.
  1708  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1709  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, x1.valueType)
  1710  		return nil
  1711  	}
  1712  
  1713  	// At this point, at least one of x1 or x2 registers is non zero.
  1714  	// Choose the non-zero register as destination.
  1715  	destinationReg := x1.register
  1716  	if isZeroRegister(x1.register) {
  1717  		destinationReg = x2.register
  1718  	}
  1719  
  1720  	var inst asm.Instruction
  1721  	switch o.Type {
  1722  	case wazeroir.UnsignedInt32:
  1723  		inst = arm64.ANDW
  1724  	case wazeroir.UnsignedInt64:
  1725  		inst = arm64.AND
  1726  	}
  1727  
  1728  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1729  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1730  	return nil
  1731  }
  1732  
  1733  // compileOr implements compiler.compileOr for the arm64 architecture.
  1734  func (c *arm64Compiler) compileOr(o *wazeroir.OperationOr) error {
  1735  	x1, x2, err := c.popTwoValuesOnRegisters()
  1736  	if err != nil {
  1737  		return err
  1738  	}
  1739  
  1740  	if isZeroRegister(x1.register) {
  1741  		c.pushRuntimeValueLocationOnRegister(x2.register, x2.valueType)
  1742  		return nil
  1743  	}
  1744  	if isZeroRegister(x2.register) {
  1745  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1746  		return nil
  1747  	}
  1748  
  1749  	var inst asm.Instruction
  1750  	switch o.Type {
  1751  	case wazeroir.UnsignedInt32:
  1752  		inst = arm64.ORRW
  1753  	case wazeroir.UnsignedInt64:
  1754  		inst = arm64.ORR
  1755  	}
  1756  
  1757  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1758  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1759  	return nil
  1760  }
  1761  
  1762  // compileXor implements compiler.compileXor for the arm64 architecture.
  1763  func (c *arm64Compiler) compileXor(o *wazeroir.OperationXor) error {
  1764  	x1, x2, err := c.popTwoValuesOnRegisters()
  1765  	if err != nil {
  1766  		return err
  1767  	}
  1768  
  1769  	// At this point, at least one of x1 or x2 registers is non zero.
  1770  	// Choose the non-zero register as destination.
  1771  	destinationReg := x1.register
  1772  	if isZeroRegister(x1.register) {
  1773  		destinationReg = x2.register
  1774  	}
  1775  
  1776  	var inst asm.Instruction
  1777  	switch o.Type {
  1778  	case wazeroir.UnsignedInt32:
  1779  		inst = arm64.EORW
  1780  	case wazeroir.UnsignedInt64:
  1781  		inst = arm64.EOR
  1782  	}
  1783  
  1784  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, destinationReg)
  1785  	c.pushRuntimeValueLocationOnRegister(destinationReg, x1.valueType)
  1786  	return nil
  1787  }
  1788  
  1789  // compileShl implements compiler.compileShl for the arm64 architecture.
  1790  func (c *arm64Compiler) compileShl(o *wazeroir.OperationShl) error {
  1791  	x1, x2, err := c.popTwoValuesOnRegisters()
  1792  	if err != nil {
  1793  		return err
  1794  	}
  1795  
  1796  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1797  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1798  		return nil
  1799  	}
  1800  
  1801  	var inst asm.Instruction
  1802  	switch o.Type {
  1803  	case wazeroir.UnsignedInt32:
  1804  		inst = arm64.LSLW
  1805  	case wazeroir.UnsignedInt64:
  1806  		inst = arm64.LSL
  1807  	}
  1808  
  1809  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1810  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1811  	return nil
  1812  }
  1813  
  1814  // compileShr implements compiler.compileShr for the arm64 architecture.
  1815  func (c *arm64Compiler) compileShr(o *wazeroir.OperationShr) error {
  1816  	x1, x2, err := c.popTwoValuesOnRegisters()
  1817  	if err != nil {
  1818  		return err
  1819  	}
  1820  
  1821  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1822  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1823  		return nil
  1824  	}
  1825  
  1826  	var inst asm.Instruction
  1827  	switch o.Type {
  1828  	case wazeroir.SignedInt32:
  1829  		inst = arm64.ASRW
  1830  	case wazeroir.SignedInt64:
  1831  		inst = arm64.ASR
  1832  	case wazeroir.SignedUint32:
  1833  		inst = arm64.LSRW
  1834  	case wazeroir.SignedUint64:
  1835  		inst = arm64.LSR
  1836  	}
  1837  
  1838  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1839  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1840  	return nil
  1841  }
  1842  
  1843  // compileRotl implements compiler.compileRotl for the arm64 architecture.
  1844  func (c *arm64Compiler) compileRotl(o *wazeroir.OperationRotl) error {
  1845  	x1, x2, err := c.popTwoValuesOnRegisters()
  1846  	if err != nil {
  1847  		return err
  1848  	}
  1849  
  1850  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1851  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1852  		return nil
  1853  	}
  1854  
  1855  	var inst, neginst asm.Instruction
  1856  	switch o.Type {
  1857  	case wazeroir.UnsignedInt32:
  1858  		inst = arm64.RORW
  1859  		neginst = arm64.NEGW
  1860  	case wazeroir.UnsignedInt64:
  1861  		inst = arm64.ROR
  1862  		neginst = arm64.NEG
  1863  	}
  1864  
  1865  	// Arm64 doesn't have rotate left instruction.
  1866  	// The shift amount needs to be converted to a negative number, similar to assembly output of bits.RotateLeft.
  1867  	c.assembler.CompileRegisterToRegister(neginst, x2.register, x2.register)
  1868  
  1869  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1870  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1871  	return nil
  1872  }
  1873  
  1874  // compileRotr implements compiler.compileRotr for the arm64 architecture.
  1875  func (c *arm64Compiler) compileRotr(o *wazeroir.OperationRotr) error {
  1876  	x1, x2, err := c.popTwoValuesOnRegisters()
  1877  	if err != nil {
  1878  		return err
  1879  	}
  1880  
  1881  	if isZeroRegister(x1.register) || isZeroRegister(x2.register) {
  1882  		c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1883  		return nil
  1884  	}
  1885  
  1886  	var inst asm.Instruction
  1887  	switch o.Type {
  1888  	case wazeroir.UnsignedInt32:
  1889  		inst = arm64.RORW
  1890  	case wazeroir.UnsignedInt64:
  1891  		inst = arm64.ROR
  1892  	}
  1893  
  1894  	c.assembler.CompileTwoRegistersToRegister(inst, x2.register, x1.register, x1.register)
  1895  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1896  	return nil
  1897  }
  1898  
  1899  // compileAbs implements compiler.compileAbs for the arm64 architecture.
  1900  func (c *arm64Compiler) compileAbs(o *wazeroir.OperationAbs) error {
  1901  	if o.Type == wazeroir.Float32 {
  1902  		return c.compileSimpleUnop(arm64.FABSS, runtimeValueTypeF32)
  1903  	} else {
  1904  		return c.compileSimpleUnop(arm64.FABSD, runtimeValueTypeF64)
  1905  	}
  1906  }
  1907  
  1908  // compileNeg implements compiler.compileNeg for the arm64 architecture.
  1909  func (c *arm64Compiler) compileNeg(o *wazeroir.OperationNeg) error {
  1910  	if o.Type == wazeroir.Float32 {
  1911  		return c.compileSimpleUnop(arm64.FNEGS, runtimeValueTypeF32)
  1912  	} else {
  1913  		return c.compileSimpleUnop(arm64.FNEGD, runtimeValueTypeF64)
  1914  	}
  1915  }
  1916  
  1917  // compileCeil implements compiler.compileCeil for the arm64 architecture.
  1918  func (c *arm64Compiler) compileCeil(o *wazeroir.OperationCeil) error {
  1919  	if o.Type == wazeroir.Float32 {
  1920  		return c.compileSimpleUnop(arm64.FRINTPS, runtimeValueTypeF32)
  1921  	} else {
  1922  		return c.compileSimpleUnop(arm64.FRINTPD, runtimeValueTypeF64)
  1923  	}
  1924  }
  1925  
  1926  // compileFloor implements compiler.compileFloor for the arm64 architecture.
  1927  func (c *arm64Compiler) compileFloor(o *wazeroir.OperationFloor) error {
  1928  	if o.Type == wazeroir.Float32 {
  1929  		return c.compileSimpleUnop(arm64.FRINTMS, runtimeValueTypeF32)
  1930  	} else {
  1931  		return c.compileSimpleUnop(arm64.FRINTMD, runtimeValueTypeF64)
  1932  	}
  1933  }
  1934  
  1935  // compileTrunc implements compiler.compileTrunc for the arm64 architecture.
  1936  func (c *arm64Compiler) compileTrunc(o *wazeroir.OperationTrunc) error {
  1937  	if o.Type == wazeroir.Float32 {
  1938  		return c.compileSimpleUnop(arm64.FRINTZS, runtimeValueTypeF32)
  1939  	} else {
  1940  		return c.compileSimpleUnop(arm64.FRINTZD, runtimeValueTypeF64)
  1941  	}
  1942  }
  1943  
  1944  // compileNearest implements compiler.compileNearest for the arm64 architecture.
  1945  func (c *arm64Compiler) compileNearest(o *wazeroir.OperationNearest) error {
  1946  	if o.Type == wazeroir.Float32 {
  1947  		return c.compileSimpleUnop(arm64.FRINTNS, runtimeValueTypeF32)
  1948  	} else {
  1949  		return c.compileSimpleUnop(arm64.FRINTND, runtimeValueTypeF64)
  1950  	}
  1951  }
  1952  
  1953  // compileSqrt implements compiler.compileSqrt for the arm64 architecture.
  1954  func (c *arm64Compiler) compileSqrt(o *wazeroir.OperationSqrt) error {
  1955  	if o.Type == wazeroir.Float32 {
  1956  		return c.compileSimpleUnop(arm64.FSQRTS, runtimeValueTypeF32)
  1957  	} else {
  1958  		return c.compileSimpleUnop(arm64.FSQRTD, runtimeValueTypeF64)
  1959  	}
  1960  }
  1961  
  1962  // compileMin implements compiler.compileMin for the arm64 architecture.
  1963  func (c *arm64Compiler) compileMin(o *wazeroir.OperationMin) error {
  1964  	if o.Type == wazeroir.Float32 {
  1965  		return c.compileSimpleFloatBinop(arm64.FMINS)
  1966  	} else {
  1967  		return c.compileSimpleFloatBinop(arm64.FMIND)
  1968  	}
  1969  }
  1970  
  1971  // compileMax implements compiler.compileMax for the arm64 architecture.
  1972  func (c *arm64Compiler) compileMax(o *wazeroir.OperationMax) error {
  1973  	if o.Type == wazeroir.Float32 {
  1974  		return c.compileSimpleFloatBinop(arm64.FMAXS)
  1975  	} else {
  1976  		return c.compileSimpleFloatBinop(arm64.FMAXD)
  1977  	}
  1978  }
  1979  
  1980  func (c *arm64Compiler) compileSimpleFloatBinop(inst asm.Instruction) error {
  1981  	x1, x2, err := c.popTwoValuesOnRegisters()
  1982  	if err != nil {
  1983  		return err
  1984  	}
  1985  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1986  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1987  	return nil
  1988  }
  1989  
// compileCopysign implements compiler.compileCopysign for the arm64 architecture.
//
// It replaces the sign bit of x1 (second from the stack top) with the sign bit
// of x2 (stack top), and pushes the result.
func (c *arm64Compiler) compileCopysign(o *wazeroir.OperationCopysign) error {
	x1, x2, err := c.popTwoValuesOnRegisters()
	if err != nil {
		return err
	}

	// Choose the float-load width and the offset of the pre-stored sign-bit
	// mask — math.MinInt{32,64}'s bit pattern (only the sign bit set) — kept
	// in the call engine's arch context.
	var ldr asm.Instruction
	var minValueOffsetInVM int64
	if o.Type == wazeroir.Float32 {
		ldr = arm64.FLDRS
		minValueOffsetInVM = arm64CallEngineArchContextMinimum32BitSignedIntOffset
	} else {
		ldr = arm64.FLDRD
		minValueOffsetInVM = arm64CallEngineArchContextMinimum64BitSignedIntOffset
	}

	// Pin x1/x2 so allocating the scratch vector register cannot evict them.
	c.markRegisterUsed(x1.register, x2.register)
	freg, err := c.allocateRegister(registerTypeVector)
	if err != nil {
		return err
	}

	// This is exactly the same code emitted by GCC for "__builtin_copysign":
	//
	//    mov     x0, -9223372036854775808
	//    fmov    d2, x0
	//    vbit    v0.8b, v1.8b, v2.8b
	//
	// "mov freg, -9223372036854775808 (stored at ce.minimum64BitSignedInt)"
	c.assembler.CompileMemoryToRegister(
		ldr,
		arm64ReservedRegisterForCallEngine, minValueOffsetInVM,
		freg,
	)

	// VBIT inserts each bit from the first operand into the destination if the corresponding bit of the second operand is 1,
	// otherwise it leaves the destination bit unchanged.
	// See https://developer.arm.com/documentation/dui0801/g/Advanced-SIMD-Instructions--32-bit-/VBIT
	//
	// Since freg holds the sign-bit mask, this copies exactly x2's sign bit into x1.
	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VBIT,
		freg, x2.register, x1.register, arm64.VectorArrangement16B)

	// x2's register is no longer needed; x1's register now holds the result.
	c.markRegisterUnused(x2.register)
	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
	return nil
}
  2038  
  2039  // compileI32WrapFromI64 implements compiler.compileI32WrapFromI64 for the arm64 architecture.
  2040  func (c *arm64Compiler) compileI32WrapFromI64() error {
  2041  	return c.compileSimpleUnop(arm64.MOVW, runtimeValueTypeI64)
  2042  }
  2043  
// compileITruncFromF implements compiler.compileITruncFromF for the arm64 architecture.
//
// It converts the float (f32/f64) on the stack top to an integer (i32/i64)
// with round-toward-zero (FCVTZ{S,U}*). Unless o.NonTrapping is set, the
// conversion must exit native code on NaN input or out-of-range values; this
// is detected via the invalid-operation flag that the conversion sets in FPSR.
func (c *arm64Compiler) compileITruncFromF(o *wazeroir.OperationITruncFromF) error {
	// Clear the floating point status register (FPSR) so that any flag we
	// observe afterwards is attributable to this conversion.
	c.assembler.CompileRegisterToRegister(arm64.MSR, arm64.RegRZR, arm64.RegFPSR)

	// Select the conversion instruction and result type from the
	// (input float width) x (output integer type/signedness) combination.
	var vt runtimeValueType
	var convinst asm.Instruction
	is32bitFloat := o.InputType == wazeroir.Float32
	if is32bitFloat && o.OutputType == wazeroir.SignedInt32 {
		convinst = arm64.FCVTZSSW
		vt = runtimeValueTypeI32
	} else if is32bitFloat && o.OutputType == wazeroir.SignedInt64 {
		convinst = arm64.FCVTZSS
		vt = runtimeValueTypeI64
	} else if !is32bitFloat && o.OutputType == wazeroir.SignedInt32 {
		convinst = arm64.FCVTZSDW
		vt = runtimeValueTypeI32
	} else if !is32bitFloat && o.OutputType == wazeroir.SignedInt64 {
		convinst = arm64.FCVTZSD
		vt = runtimeValueTypeI64
	} else if is32bitFloat && o.OutputType == wazeroir.SignedUint32 {
		convinst = arm64.FCVTZUSW
		vt = runtimeValueTypeI32
	} else if is32bitFloat && o.OutputType == wazeroir.SignedUint64 {
		convinst = arm64.FCVTZUS
		vt = runtimeValueTypeI64
	} else if !is32bitFloat && o.OutputType == wazeroir.SignedUint32 {
		convinst = arm64.FCVTZUDW
		vt = runtimeValueTypeI32
	} else if !is32bitFloat && o.OutputType == wazeroir.SignedUint64 {
		convinst = arm64.FCVTZUD
		vt = runtimeValueTypeI64
	}

	source, err := c.popValueOnRegister()
	if err != nil {
		return err
	}

	destinationReg, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}

	// Perform the conversion and record the integer result.
	c.assembler.CompileRegisterToRegister(convinst, source.register, destinationReg)
	c.pushRuntimeValueLocationOnRegister(destinationReg, vt)

	if !o.NonTrapping {
		// Obtain the floating point status register value into the general purpose register,
		// so that we can check if the conversion resulted in undefined behavior.
		c.assembler.CompileRegisterToRegister(arm64.MRS, arm64.RegFPSR, arm64ReservedRegisterForTemporary)
		// Check if the conversion was undefined by comparing the status with 1.
		// See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register
		c.assembler.CompileRegisterAndConstToNone(arm64.CMP, arm64ReservedRegisterForTemporary, 1)

		// If no invalid-operation flag was raised, skip the error handling below.
		brOK := c.assembler.CompileJump(arm64.BCONDNE)

		// If so, exit the execution with errors depending on whether or not the source value is NaN.
		var floatcmp asm.Instruction
		if is32bitFloat {
			floatcmp = arm64.FCMPS
		} else {
			floatcmp = arm64.FCMPD
		}
		// Comparing the source against itself distinguishes NaN from overflow.
		c.assembler.CompileTwoRegistersToNone(floatcmp, source.register, source.register)
		// VS flag is set if at least one of values for FCMP is NaN.
		// https://developer.arm.com/documentation/dui0801/g/Condition-Codes/Comparison-of-condition-code-meanings-in-integer-and-floating-point-code
		brIfSourceNaN := c.assembler.CompileJump(arm64.BCONDVS)

		// If the source value is not NaN, the operation was overflow.
		c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow)

		// Otherwise, the operation was invalid as this is trying to convert NaN to integer.
		c.assembler.SetJumpTargetOnNext(brIfSourceNaN)
		c.compileExitFromNativeCode(nativeCallStatusCodeInvalidFloatToIntConversion)

		// Otherwise, we branch into the next instruction.
		c.assembler.SetJumpTargetOnNext(brOK)
	}
	return nil
}
  2125  
  2126  // compileFConvertFromI implements compiler.compileFConvertFromI for the arm64 architecture.
  2127  func (c *arm64Compiler) compileFConvertFromI(o *wazeroir.OperationFConvertFromI) error {
  2128  	var convinst asm.Instruction
  2129  	if o.OutputType == wazeroir.Float32 && o.InputType == wazeroir.SignedInt32 {
  2130  		convinst = arm64.SCVTFWS
  2131  	} else if o.OutputType == wazeroir.Float32 && o.InputType == wazeroir.SignedInt64 {
  2132  		convinst = arm64.SCVTFS
  2133  	} else if o.OutputType == wazeroir.Float64 && o.InputType == wazeroir.SignedInt32 {
  2134  		convinst = arm64.SCVTFWD
  2135  	} else if o.OutputType == wazeroir.Float64 && o.InputType == wazeroir.SignedInt64 {
  2136  		convinst = arm64.SCVTFD
  2137  	} else if o.OutputType == wazeroir.Float32 && o.InputType == wazeroir.SignedUint32 {
  2138  		convinst = arm64.UCVTFWS
  2139  	} else if o.OutputType == wazeroir.Float32 && o.InputType == wazeroir.SignedUint64 {
  2140  		convinst = arm64.UCVTFS
  2141  	} else if o.OutputType == wazeroir.Float64 && o.InputType == wazeroir.SignedUint32 {
  2142  		convinst = arm64.UCVTFWD
  2143  	} else if o.OutputType == wazeroir.Float64 && o.InputType == wazeroir.SignedUint64 {
  2144  		convinst = arm64.UCVTFD
  2145  	}
  2146  
  2147  	var vt runtimeValueType
  2148  	if o.OutputType == wazeroir.Float32 {
  2149  		vt = runtimeValueTypeF32
  2150  	} else {
  2151  		vt = runtimeValueTypeF64
  2152  	}
  2153  	return c.compileSimpleConversion(convinst, registerTypeVector, vt)
  2154  }
  2155  
// compileF32DemoteFromF64 implements compiler.compileF32DemoteFromF64 for the arm64 architecture.
// FCVTDS converts the f64 on the stack top to f32 in place, retagged as F32.
func (c *arm64Compiler) compileF32DemoteFromF64() error {
	return c.compileSimpleUnop(arm64.FCVTDS, runtimeValueTypeF32)
}
  2160  
// compileF64PromoteFromF32 implements compiler.compileF64PromoteFromF32 for the arm64 architecture.
// FCVTSD converts the f32 on the stack top to f64 in place, retagged as F64.
func (c *arm64Compiler) compileF64PromoteFromF32() error {
	return c.compileSimpleUnop(arm64.FCVTSD, runtimeValueTypeF64)
}
  2165  
  2166  // compileI32ReinterpretFromF32 implements compiler.compileI32ReinterpretFromF32 for the arm64 architecture.
  2167  func (c *arm64Compiler) compileI32ReinterpretFromF32() error {
  2168  	if peek := c.locationStack.peek(); peek.onStack() {
  2169  		// If the value is on the stack, this is no-op as there is nothing to do for converting type.
  2170  		peek.valueType = runtimeValueTypeI32
  2171  		return nil
  2172  	}
  2173  	return c.compileSimpleConversion(arm64.FMOVS, registerTypeGeneralPurpose, runtimeValueTypeI32)
  2174  }
  2175  
  2176  // compileI64ReinterpretFromF64 implements compiler.compileI64ReinterpretFromF64 for the arm64 architecture.
  2177  func (c *arm64Compiler) compileI64ReinterpretFromF64() error {
  2178  	if peek := c.locationStack.peek(); peek.onStack() {
  2179  		// If the value is on the stack, this is no-op as there is nothing to do for converting type.
  2180  		peek.valueType = runtimeValueTypeI64
  2181  		return nil
  2182  	}
  2183  	return c.compileSimpleConversion(arm64.FMOVD, registerTypeGeneralPurpose, runtimeValueTypeI64)
  2184  }
  2185  
  2186  // compileF32ReinterpretFromI32 implements compiler.compileF32ReinterpretFromI32 for the arm64 architecture.
  2187  func (c *arm64Compiler) compileF32ReinterpretFromI32() error {
  2188  	if peek := c.locationStack.peek(); peek.onStack() {
  2189  		// If the value is on the stack, this is no-op as there is nothing to do for converting type.
  2190  		peek.valueType = runtimeValueTypeF32
  2191  		return nil
  2192  	}
  2193  	return c.compileSimpleConversion(arm64.FMOVS, registerTypeVector, runtimeValueTypeF32)
  2194  }
  2195  
  2196  // compileF64ReinterpretFromI64 implements compiler.compileF64ReinterpretFromI64 for the arm64 architecture.
  2197  func (c *arm64Compiler) compileF64ReinterpretFromI64() error {
  2198  	if peek := c.locationStack.peek(); peek.onStack() {
  2199  		// If the value is on the stack, this is no-op as there is nothing to do for converting type.
  2200  		peek.valueType = runtimeValueTypeF64
  2201  		return nil
  2202  	}
  2203  	return c.compileSimpleConversion(arm64.FMOVD, registerTypeVector, runtimeValueTypeF64)
  2204  }
  2205  
  2206  func (c *arm64Compiler) compileSimpleConversion(inst asm.Instruction, destinationRegType registerType, resultRuntimeValueType runtimeValueType) error {
  2207  	source, err := c.popValueOnRegister()
  2208  	if err != nil {
  2209  		return err
  2210  	}
  2211  
  2212  	destinationReg, err := c.allocateRegister(destinationRegType)
  2213  	if err != nil {
  2214  		return err
  2215  	}
  2216  
  2217  	c.assembler.CompileRegisterToRegister(inst, source.register, destinationReg)
  2218  	c.pushRuntimeValueLocationOnRegister(destinationReg, resultRuntimeValueType)
  2219  	return nil
  2220  }
  2221  
  2222  // compileExtend implements compiler.compileExtend for the arm64 architecture.
  2223  func (c *arm64Compiler) compileExtend(o *wazeroir.OperationExtend) error {
  2224  	if o.Signed {
  2225  		return c.compileSimpleUnop(arm64.SXTW, runtimeValueTypeI64)
  2226  	} else {
  2227  		return c.compileSimpleUnop(arm64.MOVW, runtimeValueTypeI64)
  2228  	}
  2229  }
  2230  
// compileSignExtend32From8 implements compiler.compileSignExtend32From8 for the arm64 architecture.
//
// Sign-extends the low 8 bits of the stack top into a 32-bit integer (SXTB, W form).
func (c *arm64Compiler) compileSignExtend32From8() error {
	return c.compileSimpleUnop(arm64.SXTBW, runtimeValueTypeI32)
}
  2235  
// compileSignExtend32From16 implements compiler.compileSignExtend32From16 for the arm64 architecture.
//
// Sign-extends the low 16 bits of the stack top into a 32-bit integer (SXTH, W form).
func (c *arm64Compiler) compileSignExtend32From16() error {
	return c.compileSimpleUnop(arm64.SXTHW, runtimeValueTypeI32)
}
  2240  
// compileSignExtend64From8 implements compiler.compileSignExtend64From8 for the arm64 architecture.
//
// Sign-extends the low 8 bits of the stack top into a 64-bit integer (SXTB).
func (c *arm64Compiler) compileSignExtend64From8() error {
	return c.compileSimpleUnop(arm64.SXTB, runtimeValueTypeI64)
}
  2245  
// compileSignExtend64From16 implements compiler.compileSignExtend64From16 for the arm64 architecture.
//
// Sign-extends the low 16 bits of the stack top into a 64-bit integer (SXTH).
func (c *arm64Compiler) compileSignExtend64From16() error {
	return c.compileSimpleUnop(arm64.SXTH, runtimeValueTypeI64)
}
  2250  
// compileSignExtend64From32 implements compiler.compileSignExtend64From32 for the arm64 architecture.
//
// Sign-extends the low 32 bits of the stack top into a 64-bit integer (SXTW).
func (c *arm64Compiler) compileSignExtend64From32() error {
	return c.compileSimpleUnop(arm64.SXTW, runtimeValueTypeI64)
}
  2255  
  2256  func (c *arm64Compiler) compileSimpleUnop(inst asm.Instruction, resultRuntimeValueType runtimeValueType) error {
  2257  	v, err := c.popValueOnRegister()
  2258  	if err != nil {
  2259  		return err
  2260  	}
  2261  	reg := v.register
  2262  	c.assembler.CompileRegisterToRegister(inst, reg, reg)
  2263  	c.pushRuntimeValueLocationOnRegister(reg, resultRuntimeValueType)
  2264  	return nil
  2265  }
  2266  
// compileEq implements compiler.compileEq for the arm64 architecture.
//
// Delegates to emitEqOrNe with isEq=true so the result condition is EQ.
func (c *arm64Compiler) compileEq(o *wazeroir.OperationEq) error {
	return c.emitEqOrNe(true, o.Type)
}
  2271  
// compileNe implements compiler.compileNe for the arm64 architecture.
//
// Delegates to emitEqOrNe with isEq=false so the result condition is NE.
func (c *arm64Compiler) compileNe(o *wazeroir.OperationNe) error {
	return c.emitEqOrNe(false, o.Type)
}
  2276  
  2277  // emitEqOrNe implements compiler.compileEq and compiler.compileNe for the arm64 architecture.
  2278  func (c *arm64Compiler) emitEqOrNe(isEq bool, unsignedType wazeroir.UnsignedType) error {
  2279  	x1, x2, err := c.popTwoValuesOnRegisters()
  2280  	if err != nil {
  2281  		return err
  2282  	}
  2283  
  2284  	var inst asm.Instruction
  2285  	switch unsignedType {
  2286  	case wazeroir.UnsignedTypeI32:
  2287  		inst = arm64.CMPW
  2288  	case wazeroir.UnsignedTypeI64:
  2289  		inst = arm64.CMP
  2290  	case wazeroir.UnsignedTypeF32:
  2291  		inst = arm64.FCMPS
  2292  	case wazeroir.UnsignedTypeF64:
  2293  		inst = arm64.FCMPD
  2294  	}
  2295  
  2296  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2297  
  2298  	// Push the comparison result as a conditional register value.
  2299  	cond := arm64.CondNE
  2300  	if isEq {
  2301  		cond = arm64.CondEQ
  2302  	}
  2303  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(cond)
  2304  	return nil
  2305  }
  2306  
  2307  // compileEqz implements compiler.compileEqz for the arm64 architecture.
  2308  func (c *arm64Compiler) compileEqz(o *wazeroir.OperationEqz) error {
  2309  	x1, err := c.popValueOnRegister()
  2310  	if err != nil {
  2311  		return err
  2312  	}
  2313  
  2314  	var inst asm.Instruction
  2315  	switch o.Type {
  2316  	case wazeroir.UnsignedInt32:
  2317  		inst = arm64.CMPW
  2318  	case wazeroir.UnsignedInt64:
  2319  		inst = arm64.CMP
  2320  	}
  2321  
  2322  	c.assembler.CompileTwoRegistersToNone(inst, arm64.RegRZR, x1.register)
  2323  
  2324  	// Push the comparison result as a conditional register value.
  2325  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(arm64.CondEQ)
  2326  	return nil
  2327  }
  2328  
  2329  // compileLt implements compiler.compileLt for the arm64 architecture.
  2330  func (c *arm64Compiler) compileLt(o *wazeroir.OperationLt) error {
  2331  	x1, x2, err := c.popTwoValuesOnRegisters()
  2332  	if err != nil {
  2333  		return err
  2334  	}
  2335  
  2336  	var inst asm.Instruction
  2337  	var conditionalRegister asm.ConditionalRegisterState
  2338  	switch o.Type {
  2339  	case wazeroir.SignedTypeUint32:
  2340  		inst = arm64.CMPW
  2341  		conditionalRegister = arm64.CondLO
  2342  	case wazeroir.SignedTypeUint64:
  2343  		inst = arm64.CMP
  2344  		conditionalRegister = arm64.CondLO
  2345  	case wazeroir.SignedTypeInt32:
  2346  		inst = arm64.CMPW
  2347  		conditionalRegister = arm64.CondLT
  2348  	case wazeroir.SignedTypeInt64:
  2349  		inst = arm64.CMP
  2350  		conditionalRegister = arm64.CondLT
  2351  	case wazeroir.SignedTypeFloat32:
  2352  		inst = arm64.FCMPS
  2353  		conditionalRegister = arm64.CondMI
  2354  	case wazeroir.SignedTypeFloat64:
  2355  		inst = arm64.FCMPD
  2356  		conditionalRegister = arm64.CondMI
  2357  	}
  2358  
  2359  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2360  
  2361  	// Push the comparison result as a conditional register value.
  2362  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2363  	return nil
  2364  }
  2365  
  2366  // compileGt implements compiler.compileGt for the arm64 architecture.
  2367  func (c *arm64Compiler) compileGt(o *wazeroir.OperationGt) error {
  2368  	x1, x2, err := c.popTwoValuesOnRegisters()
  2369  	if err != nil {
  2370  		return err
  2371  	}
  2372  
  2373  	var inst asm.Instruction
  2374  	var conditionalRegister asm.ConditionalRegisterState
  2375  	switch o.Type {
  2376  	case wazeroir.SignedTypeUint32:
  2377  		inst = arm64.CMPW
  2378  		conditionalRegister = arm64.CondHI
  2379  	case wazeroir.SignedTypeUint64:
  2380  		inst = arm64.CMP
  2381  		conditionalRegister = arm64.CondHI
  2382  	case wazeroir.SignedTypeInt32:
  2383  		inst = arm64.CMPW
  2384  		conditionalRegister = arm64.CondGT
  2385  	case wazeroir.SignedTypeInt64:
  2386  		inst = arm64.CMP
  2387  		conditionalRegister = arm64.CondGT
  2388  	case wazeroir.SignedTypeFloat32:
  2389  		inst = arm64.FCMPS
  2390  		conditionalRegister = arm64.CondGT
  2391  	case wazeroir.SignedTypeFloat64:
  2392  		inst = arm64.FCMPD
  2393  		conditionalRegister = arm64.CondGT
  2394  	}
  2395  
  2396  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2397  
  2398  	// Push the comparison result as a conditional register value.
  2399  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2400  	return nil
  2401  }
  2402  
  2403  // compileLe implements compiler.compileLe for the arm64 architecture.
  2404  func (c *arm64Compiler) compileLe(o *wazeroir.OperationLe) error {
  2405  	x1, x2, err := c.popTwoValuesOnRegisters()
  2406  	if err != nil {
  2407  		return err
  2408  	}
  2409  
  2410  	var inst asm.Instruction
  2411  	var conditionalRegister asm.ConditionalRegisterState
  2412  	switch o.Type {
  2413  	case wazeroir.SignedTypeUint32:
  2414  		inst = arm64.CMPW
  2415  		conditionalRegister = arm64.CondLS
  2416  	case wazeroir.SignedTypeUint64:
  2417  		inst = arm64.CMP
  2418  		conditionalRegister = arm64.CondLS
  2419  	case wazeroir.SignedTypeInt32:
  2420  		inst = arm64.CMPW
  2421  		conditionalRegister = arm64.CondLE
  2422  	case wazeroir.SignedTypeInt64:
  2423  		inst = arm64.CMP
  2424  		conditionalRegister = arm64.CondLE
  2425  	case wazeroir.SignedTypeFloat32:
  2426  		inst = arm64.FCMPS
  2427  		conditionalRegister = arm64.CondLS
  2428  	case wazeroir.SignedTypeFloat64:
  2429  		inst = arm64.FCMPD
  2430  		conditionalRegister = arm64.CondLS
  2431  	}
  2432  
  2433  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2434  
  2435  	// Push the comparison result as a conditional register value.
  2436  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2437  	return nil
  2438  }
  2439  
  2440  // compileGe implements compiler.compileGe for the arm64 architecture.
  2441  func (c *arm64Compiler) compileGe(o *wazeroir.OperationGe) error {
  2442  	x1, x2, err := c.popTwoValuesOnRegisters()
  2443  	if err != nil {
  2444  		return err
  2445  	}
  2446  
  2447  	var inst asm.Instruction
  2448  	var conditionalRegister asm.ConditionalRegisterState
  2449  	switch o.Type {
  2450  	case wazeroir.SignedTypeUint32:
  2451  		inst = arm64.CMPW
  2452  		conditionalRegister = arm64.CondHS
  2453  	case wazeroir.SignedTypeUint64:
  2454  		inst = arm64.CMP
  2455  		conditionalRegister = arm64.CondHS
  2456  	case wazeroir.SignedTypeInt32:
  2457  		inst = arm64.CMPW
  2458  		conditionalRegister = arm64.CondGE
  2459  	case wazeroir.SignedTypeInt64:
  2460  		inst = arm64.CMP
  2461  		conditionalRegister = arm64.CondGE
  2462  	case wazeroir.SignedTypeFloat32:
  2463  		inst = arm64.FCMPS
  2464  		conditionalRegister = arm64.CondGE
  2465  	case wazeroir.SignedTypeFloat64:
  2466  		inst = arm64.FCMPD
  2467  		conditionalRegister = arm64.CondGE
  2468  	}
  2469  
  2470  	c.assembler.CompileTwoRegistersToNone(inst, x2.register, x1.register)
  2471  
  2472  	// Push the comparison result as a conditional register value.
  2473  	c.locationStack.pushRuntimeValueLocationOnConditionalRegister(conditionalRegister)
  2474  	return nil
  2475  }
  2476  
  2477  // compileLoad implements compiler.compileLoad for the arm64 architecture.
  2478  func (c *arm64Compiler) compileLoad(o *wazeroir.OperationLoad) error {
  2479  	var (
  2480  		isFloat           bool
  2481  		loadInst          asm.Instruction
  2482  		targetSizeInBytes int64
  2483  		vt                runtimeValueType
  2484  	)
  2485  
  2486  	switch o.Type {
  2487  	case wazeroir.UnsignedTypeI32:
  2488  		loadInst = arm64.LDRW
  2489  		targetSizeInBytes = 32 / 8
  2490  		vt = runtimeValueTypeI32
  2491  	case wazeroir.UnsignedTypeI64:
  2492  		loadInst = arm64.LDRD
  2493  		targetSizeInBytes = 64 / 8
  2494  		vt = runtimeValueTypeI64
  2495  	case wazeroir.UnsignedTypeF32:
  2496  		loadInst = arm64.FLDRS
  2497  		isFloat = true
  2498  		targetSizeInBytes = 32 / 8
  2499  		vt = runtimeValueTypeF32
  2500  	case wazeroir.UnsignedTypeF64:
  2501  		loadInst = arm64.FLDRD
  2502  		isFloat = true
  2503  		targetSizeInBytes = 64 / 8
  2504  		vt = runtimeValueTypeF64
  2505  	}
  2506  	return c.compileLoadImpl(o.Arg.Offset, loadInst, targetSizeInBytes, isFloat, vt)
  2507  }
  2508  
  2509  // compileLoad8 implements compiler.compileLoad8 for the arm64 architecture.
  2510  func (c *arm64Compiler) compileLoad8(o *wazeroir.OperationLoad8) error {
  2511  	var loadInst asm.Instruction
  2512  	var vt runtimeValueType
  2513  	switch o.Type {
  2514  	case wazeroir.SignedInt32:
  2515  		loadInst = arm64.LDRSBW
  2516  		vt = runtimeValueTypeI32
  2517  	case wazeroir.SignedInt64:
  2518  		loadInst = arm64.LDRSBD
  2519  		vt = runtimeValueTypeI64
  2520  	case wazeroir.SignedUint32:
  2521  		loadInst = arm64.LDRB
  2522  		vt = runtimeValueTypeI32
  2523  	case wazeroir.SignedUint64:
  2524  		loadInst = arm64.LDRB
  2525  		vt = runtimeValueTypeI64
  2526  	}
  2527  	return c.compileLoadImpl(o.Arg.Offset, loadInst, 1, false, vt)
  2528  }
  2529  
  2530  // compileLoad16 implements compiler.compileLoad16 for the arm64 architecture.
  2531  func (c *arm64Compiler) compileLoad16(o *wazeroir.OperationLoad16) error {
  2532  	var loadInst asm.Instruction
  2533  	var vt runtimeValueType
  2534  	switch o.Type {
  2535  	case wazeroir.SignedInt32:
  2536  		loadInst = arm64.LDRSHW
  2537  		vt = runtimeValueTypeI32
  2538  	case wazeroir.SignedInt64:
  2539  		loadInst = arm64.LDRSHD
  2540  		vt = runtimeValueTypeI64
  2541  	case wazeroir.SignedUint32:
  2542  		loadInst = arm64.LDRH
  2543  		vt = runtimeValueTypeI32
  2544  	case wazeroir.SignedUint64:
  2545  		loadInst = arm64.LDRH
  2546  		vt = runtimeValueTypeI64
  2547  	}
  2548  	return c.compileLoadImpl(o.Arg.Offset, loadInst, 16/8, false, vt)
  2549  }
  2550  
  2551  // compileLoad32 implements compiler.compileLoad32 for the arm64 architecture.
  2552  func (c *arm64Compiler) compileLoad32(o *wazeroir.OperationLoad32) error {
  2553  	var loadInst asm.Instruction
  2554  	if o.Signed {
  2555  		loadInst = arm64.LDRSW
  2556  	} else {
  2557  		loadInst = arm64.LDRW
  2558  	}
  2559  	return c.compileLoadImpl(o.Arg.Offset, loadInst, 32/8, false, runtimeValueTypeI64)
  2560  }
  2561  
  2562  // compileLoadImpl implements compileLoadImpl* variants for arm64 architecture.
  2563  func (c *arm64Compiler) compileLoadImpl(offsetArg uint32, loadInst asm.Instruction,
  2564  	targetSizeInBytes int64, isFloat bool, resultRuntimeValueType runtimeValueType,
  2565  ) error {
  2566  	offsetReg, err := c.compileMemoryAccessOffsetSetup(offsetArg, targetSizeInBytes)
  2567  	if err != nil {
  2568  		return err
  2569  	}
  2570  
  2571  	resultRegister := offsetReg
  2572  	if isFloat {
  2573  		resultRegister, err = c.allocateRegister(registerTypeVector)
  2574  		if err != nil {
  2575  			return err
  2576  		}
  2577  	}
  2578  
  2579  	// "resultRegister = [arm64ReservedRegisterForMemory + offsetReg]"
  2580  	// In other words, "resultRegister = memory.Buffer[offset: offset+targetSizeInBytes]"
  2581  	c.assembler.CompileMemoryWithRegisterOffsetToRegister(
  2582  		loadInst,
  2583  		arm64ReservedRegisterForMemory, offsetReg,
  2584  		resultRegister,
  2585  	)
  2586  
  2587  	c.pushRuntimeValueLocationOnRegister(resultRegister, resultRuntimeValueType)
  2588  	return nil
  2589  }
  2590  
  2591  // compileStore implements compiler.compileStore for the arm64 architecture.
  2592  func (c *arm64Compiler) compileStore(o *wazeroir.OperationStore) error {
  2593  	var movInst asm.Instruction
  2594  	var targetSizeInBytes int64
  2595  	switch o.Type {
  2596  	case wazeroir.UnsignedTypeI32:
  2597  		movInst = arm64.STRW
  2598  		targetSizeInBytes = 32 / 8
  2599  	case wazeroir.UnsignedTypeI64:
  2600  		movInst = arm64.STRD
  2601  		targetSizeInBytes = 64 / 8
  2602  	case wazeroir.UnsignedTypeF32:
  2603  		movInst = arm64.FSTRS
  2604  		targetSizeInBytes = 32 / 8
  2605  	case wazeroir.UnsignedTypeF64:
  2606  		movInst = arm64.FSTRD
  2607  		targetSizeInBytes = 64 / 8
  2608  	}
  2609  	return c.compileStoreImpl(o.Arg.Offset, movInst, targetSizeInBytes)
  2610  }
  2611  
// compileStore8 implements compiler.compileStore8 for the arm64 architecture.
//
// Stores the low byte of the value via STRB.
func (c *arm64Compiler) compileStore8(o *wazeroir.OperationStore8) error {
	return c.compileStoreImpl(o.Arg.Offset, arm64.STRB, 1)
}
  2616  
// compileStore16 implements compiler.compileStore16 for the arm64 architecture.
//
// Stores the low half-word (2 bytes) of the value via STRH.
func (c *arm64Compiler) compileStore16(o *wazeroir.OperationStore16) error {
	return c.compileStoreImpl(o.Arg.Offset, arm64.STRH, 16/8)
}
  2621  
// compileStore32 implements compiler.compileStore32 for the arm64 architecture.
//
// Stores the low word (4 bytes) of the value via STRW.
func (c *arm64Compiler) compileStore32(o *wazeroir.OperationStore32) error {
	return c.compileStoreImpl(o.Arg.Offset, arm64.STRW, 32/8)
}
  2626  
// compileStoreImpl implements the compileStore* variants for the arm64 architecture.
//
// It pops the value to store, emits the bounds-checked offset computation for
// offsetArg, and writes targetSizeInBytes bytes of the value register to
// memory.Buffer at that offset with storeInst.
func (c *arm64Compiler) compileStoreImpl(offsetArg uint32, storeInst asm.Instruction, targetSizeInBytes int64) error {
	val, err := c.popValueOnRegister()
	if err != nil {
		return err
	}
	// Mark temporarily used as compileMemoryAccessOffsetSetup might try allocating register.
	c.markRegisterUsed(val.register)

	offsetReg, err := c.compileMemoryAccessOffsetSetup(offsetArg, targetSizeInBytes)
	if err != nil {
		return err
	}

	// "[arm64ReservedRegisterForMemory + offsetReg] = val.register"
	// In other words, "memory.Buffer[offset: offset+targetSizeInBytes] = val.register"
	c.assembler.CompileRegisterToMemoryWithRegisterOffset(
		storeInst, val.register,
		arm64ReservedRegisterForMemory, offsetReg,
	)

	// The stored value is consumed; release its register for reuse.
	c.markRegisterUnused(val.register)
	return nil
}
  2651  
// compileMemoryAccessOffsetSetup pops the top value from the stack (called "base"), stores "base + offsetArg + targetSizeInBytes"
// into a register, and returns the stored register. We call the result "offset" because we access the memory
// as memory.Buffer[offset: offset+targetSizeInBytes].
//
// Note: this also emits the instructions to check the out of bounds memory access.
// In other words, if the offset+targetSizeInBytes exceeds the memory size, the code exits with nativeCallStatusCodeMemoryOutOfBounds status.
func (c *arm64Compiler) compileMemoryAccessOffsetSetup(offsetArg uint32, targetSizeInBytes int64) (offsetRegister asm.Register, err error) {
	base, err := c.popValueOnRegister()
	if err != nil {
		return 0, err
	}

	offsetRegister = base.register
	if isZeroRegister(base.register) {
		// The zero register cannot be written to, so materialize zero in a
		// freshly allocated general-purpose register instead.
		offsetRegister, err = c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return
		}
		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, offsetRegister)
	}

	if offsetConst := int64(offsetArg) + targetSizeInBytes; offsetConst <= math.MaxUint32 {
		// "offsetRegister = base + offsetArg + targetSizeInBytes"
		c.assembler.CompileConstToRegister(arm64.ADD, offsetConst, offsetRegister)
	} else {
		// If the offset const is too large, we exit with nativeCallStatusCodeMemoryOutOfBounds.
		// (offsetArg is at most 2^32-1, so offsetArg+targetSizeInBytes can only
		// exceed MaxUint32 by the small access width; it can never be in bounds.)
		c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
		return
	}

	// "arm64ReservedRegisterForTemporary = len(memory.Buffer)"
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
		arm64ReservedRegisterForTemporary)

	// Check if offsetRegister(= base+offsetArg+targetSizeInBytes) > len(memory.Buffer).
	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, offsetRegister)
	boundsOK := c.assembler.CompileJump(arm64.BCONDLS)

	// If offsetRegister(= base+offsetArg+targetSizeInBytes) exceeds the memory length,
	//  we exit the function with nativeCallStatusCodeMemoryOutOfBounds.
	c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)

	// Otherwise, we subtract targetSizeInBytes from offsetRegister, leaving the
	// start offset of the access in offsetRegister.
	c.assembler.SetJumpTargetOnNext(boundsOK)
	c.assembler.CompileConstToRegister(arm64.SUB, targetSizeInBytes, offsetRegister)
	return offsetRegister, nil
}
  2700  
// compileMemoryGrow implements compileMemoryGrow variants for arm64 architecture.
//
// memory.grow is implemented as a call into the Go runtime via the builtin
// function mechanism; afterwards the reserved registers (stack base pointer
// and memory buffer pointer) must be refreshed because the Go side may have
// reallocated the underlying buffers.
func (c *arm64Compiler) compileMemoryGrow() error {
	// A conditional-register value cannot survive the call; move it to a GPR first.
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}

	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexMemoryGrow); err != nil {
		return err
	}

	// After return, we re-initialize reserved registers just like preamble of functions.
	c.compileReservedStackBasePointerRegisterInitialization()
	c.compileReservedMemoryRegisterInitialization()
	return nil
}
  2716  
// compileMemorySize implements compileMemorySize variants for arm64 architecture.
//
// It loads the memory buffer's byte length from the call engine's module
// context and converts it to a page count, pushing the result as an i32.
func (c *arm64Compiler) compileMemorySize() error {
	// A conditional-register value would be clobbered by the flag-setting
	// instructions below; move it to a GPR first.
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}

	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}

	// "reg = len(memory.Buffer)"
	c.assembler.CompileMemoryToRegister(
		arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
		reg,
	)

	// memory.size loads the page size of memory, so we have to divide by the page size.
	// "reg = reg >> wasm.MemoryPageSizeInBits (== reg / wasm.MemoryPageSize) "
	c.assembler.CompileConstToRegister(
		arm64.LSR,
		wasm.MemoryPageSizeInBits,
		reg,
	)

	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
	return nil
}
  2746  
// compileCallGoFunction adds instructions to call a Go function whose address equals the addr parameter.
// compilerStatus is set before making call, and it should be either nativeCallStatusCodeCallBuiltInFunction or
// nativeCallStatusCodeCallGoHostFunction.
//
// The call is realized by exiting the native code with the given status; the
// Go runtime dispatches on the status, runs the builtin/host function, and
// re-enters native code at the recorded return address.
func (c *arm64Compiler) compileCallGoFunction(compilerStatus nativeCallStatusCode, builtinFunction wasm.Index) error {
	// Release all the registers as our calling convention requires the caller-save.
	if err := c.compileReleaseAllRegistersToStack(); err != nil {
		return err
	}

	if compilerStatus == nativeCallStatusCodeCallBuiltInFunction {
		// Set the target function address to ce.functionCallAddress
		// "tmp = $index"
		c.assembler.CompileConstToRegister(
			arm64.MOVD,
			int64(builtinFunction),
			arm64ReservedRegisterForTemporary,
		)
		// "[arm64ReservedRegisterForCallEngine + callEngineExitContextFunctionCallAddressOffset] = tmp"
		// In other words, "ce.functionCallAddress = tmp (== $addr)"
		// STRW: the builtin-function index field is 32-bit wide.
		c.assembler.CompileRegisterToMemory(
			arm64.STRW,
			arm64ReservedRegisterForTemporary,
			arm64ReservedRegisterForCallEngine, callEngineExitContextBuiltinFunctionCallIndexOffset,
		)
	}

	// Read the return address, and write it to callEngine.exitContext.returnAddress.
	// NOTE(review): CompileReadInstructionAddress appears to resolve to the address
	// just past the next RET, i.e. where execution resumes after the Go call.
	c.assembler.CompileReadInstructionAddress(arm64ReservedRegisterForTemporary, arm64.RET)
	c.assembler.CompileRegisterToMemory(
		arm64.STRD, arm64ReservedRegisterForTemporary,
		arm64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset,
	)

	c.compileExitFromNativeCode(compilerStatus)
	return nil
}
  2783  
// compileConstI32 implements compiler.compileConstI32 for the arm64 architecture.
//
// The 32-bit value is zero-extended and loaded via compileIntConstant.
func (c *arm64Compiler) compileConstI32(o *wazeroir.OperationConstI32) error {
	return c.compileIntConstant(true, uint64(o.Value))
}
  2788  
// compileConstI64 implements compiler.compileConstI64 for the arm64 architecture.
func (c *arm64Compiler) compileConstI64(o *wazeroir.OperationConstI64) error {
	return c.compileIntConstant(false, o.Value)
}
  2793  
  2794  // compileIntConstant adds instructions to load an integer constant.
  2795  // is32bit is true if the target value is originally 32-bit const, false otherwise.
  2796  // value holds the (zero-extended for 32-bit case) load target constant.
  2797  func (c *arm64Compiler) compileIntConstant(is32bit bool, value uint64) error {
  2798  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  2799  		return err
  2800  	}
  2801  
  2802  	var inst asm.Instruction
  2803  	var vt runtimeValueType
  2804  	if is32bit {
  2805  		inst = arm64.MOVW
  2806  		vt = runtimeValueTypeI32
  2807  	} else {
  2808  		inst = arm64.MOVD
  2809  		vt = runtimeValueTypeI64
  2810  	}
  2811  
  2812  	if value == 0 {
  2813  		c.pushRuntimeValueLocationOnRegister(arm64.RegRZR, vt)
  2814  	} else {
  2815  		// Take a register to load the value.
  2816  		reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2817  		if err != nil {
  2818  			return err
  2819  		}
  2820  
  2821  		c.assembler.CompileConstToRegister(inst, int64(value), reg)
  2822  
  2823  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  2824  	}
  2825  	return nil
  2826  }
  2827  
// compileConstF32 implements compiler.compileConstF32 for the arm64 architecture.
//
// The float is passed as its IEEE-754 bit pattern, zero-extended to 64 bits.
func (c *arm64Compiler) compileConstF32(o *wazeroir.OperationConstF32) error {
	return c.compileFloatConstant(true, uint64(math.Float32bits(o.Value)))
}
  2832  
// compileConstF64 implements compiler.compileConstF64 for the arm64 architecture.
//
// The float is passed as its IEEE-754 bit pattern.
func (c *arm64Compiler) compileConstF64(o *wazeroir.OperationConstF64) error {
	return c.compileFloatConstant(false, math.Float64bits(o.Value))
}
  2837  
  2838  // compileFloatConstant adds instructions to load a float constant.
  2839  // is32bit is true if the target value is originally 32-bit const, false otherwise.
  2840  // value holds the (zero-extended for 32-bit case) bit representation of load target float constant.
  2841  func (c *arm64Compiler) compileFloatConstant(is32bit bool, value uint64) error {
  2842  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  2843  		return err
  2844  	}
  2845  
  2846  	// Take a register to load the value.
  2847  	reg, err := c.allocateRegister(registerTypeVector)
  2848  	if err != nil {
  2849  		return err
  2850  	}
  2851  
  2852  	tmpReg := arm64.RegRZR
  2853  	if value != 0 {
  2854  		tmpReg = arm64ReservedRegisterForTemporary
  2855  		var inst asm.Instruction
  2856  		if is32bit {
  2857  			inst = arm64.MOVW
  2858  		} else {
  2859  			inst = arm64.MOVD
  2860  		}
  2861  		c.assembler.CompileConstToRegister(inst, int64(value), tmpReg)
  2862  	}
  2863  
  2864  	// Use FMOV instruction to move the value on integer register into the float one.
  2865  	var inst asm.Instruction
  2866  	var vt runtimeValueType
  2867  	if is32bit {
  2868  		vt = runtimeValueTypeF32
  2869  		inst = arm64.FMOVS
  2870  	} else {
  2871  		vt = runtimeValueTypeF64
  2872  		inst = arm64.FMOVD
  2873  	}
  2874  	c.assembler.CompileRegisterToRegister(inst, tmpReg, reg)
  2875  
  2876  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  2877  	return nil
  2878  }
  2879  
// compileMemoryInit implements compiler.compileMemoryInit for the arm64 architecture.
//
// Delegates to compileInitImpl with isTable=false; tableIndex is unused for
// memory.init.
func (c *arm64Compiler) compileMemoryInit(o *wazeroir.OperationMemoryInit) error {
	return c.compileInitImpl(false, o.DataIndex, 0)
}
  2884  
  2885  // compileInitImpl implements compileTableInit and compileMemoryInit.
  2886  //
  2887  // TODO: the compiled code in this function should be reused and compile at once as
  2888  // the code is independent of any module.
  2889  func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) error {
  2890  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  2891  	if isTable {
  2892  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  2893  	}
  2894  
  2895  	copySize, err := c.popValueOnRegister()
  2896  	if err != nil {
  2897  		return err
  2898  	}
  2899  	c.markRegisterUsed(copySize.register)
  2900  
  2901  	sourceOffset, err := c.popValueOnRegister()
  2902  	if err != nil {
  2903  		return err
  2904  	}
  2905  	if isZeroRegister(sourceOffset.register) {
  2906  		sourceOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  2907  		if err != nil {
  2908  			return err
  2909  		}
  2910  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, sourceOffset.register)
  2911  	}
  2912  	c.markRegisterUsed(sourceOffset.register)
  2913  
  2914  	destinationOffset, err := c.popValueOnRegister()
  2915  	if err != nil {
  2916  		return err
  2917  	}
  2918  	if isZeroRegister(destinationOffset.register) {
  2919  		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
  2920  		if err != nil {
  2921  			return err
  2922  		}
  2923  		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
  2924  	}
  2925  	c.markRegisterUsed(destinationOffset.register)
  2926  
  2927  	tableInstanceAddressReg := asm.NilRegister
  2928  	if isTable {
  2929  		tableInstanceAddressReg, err = c.allocateRegister(registerTypeGeneralPurpose)
  2930  		if err != nil {
  2931  			return err
  2932  		}
  2933  		c.markRegisterUsed(tableInstanceAddressReg)
  2934  	}
  2935  
  2936  	if !isZeroRegister(copySize.register) {
  2937  		// sourceOffset += size.
  2938  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, sourceOffset.register)
  2939  		// destinationOffset += size.
  2940  		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, destinationOffset.register)
  2941  	}
  2942  
  2943  	instanceAddr, err := c.allocateRegister(registerTypeGeneralPurpose)
  2944  	if err != nil {
  2945  		return err
  2946  	}
  2947  
  2948  	if isTable {
  2949  		c.compileLoadElemInstanceAddress(index, instanceAddr)
  2950  	} else {
  2951  		c.compileLoadDataInstanceAddress(index, instanceAddr)
  2952  	}
  2953  
  2954  	// Check data instance bounds.
  2955  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  2956  		instanceAddr, 8, // DataInstance and Element instance holds the length is stored at offset 8.
  2957  		arm64ReservedRegisterForTemporary)
  2958  
  2959  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register)
  2960  	sourceBoundsOK := c.assembler.CompileJump(arm64.BCONDLS)
  2961  
  2962  	// If not, raise out of bounds memory access error.
  2963  	c.compileExitFromNativeCode(outOfBoundsErrorStatus)
  2964  
  2965  	c.assembler.SetJumpTargetOnNext(sourceBoundsOK)
  2966  
  2967  	// Check destination bounds.
  2968  	if isTable {
  2969  		// arm64ReservedRegisterForTemporary = &tables[0]
  2970  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  2971  			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  2972  			arm64ReservedRegisterForTemporary)
  2973  		// tableInstanceAddressReg = arm64ReservedRegisterForTemporary + tableIndex*8
  2974  		//                         = &tables[0] + sizeOf(*tableInstance)*8
  2975  		//                         = &tables[tableIndex]
  2976  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  2977  			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
  2978  			tableInstanceAddressReg)
  2979  		// arm64ReservedRegisterForTemporary = [tableInstanceAddressReg+tableInstanceTableLenOffset] = len(tables[tableIndex])
  2980  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  2981  			tableInstanceAddressReg, tableInstanceTableLenOffset,
  2982  			arm64ReservedRegisterForTemporary)
  2983  	} else {
  2984  		c.assembler.CompileMemoryToRegister(arm64.LDRD,
  2985  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  2986  			arm64ReservedRegisterForTemporary)
  2987  	}
  2988  
  2989  	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
  2990  	destinationBoundsOK := c.assembler.CompileJump(arm64.BCONDLS)
  2991  
  2992  	// If not, raise out of bounds memory access error.
  2993  	c.compileExitFromNativeCode(outOfBoundsErrorStatus)
  2994  
  2995  	// Otherwise, ready to copy the value from source to destination.
  2996  	c.assembler.SetJumpTargetOnNext(destinationBoundsOK)
  2997  
  2998  	if !isZeroRegister(copySize.register) {
  2999  		// If the size equals zero, we can skip the entire instructions beflow.
  3000  		c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register)
  3001  		skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)
  3002  
  3003  		var ldr, str asm.Instruction
  3004  		var movSize int64
  3005  		if isTable {
  3006  			ldr, str = arm64.LDRD, arm64.STRD
  3007  			movSize = 8
  3008  
  3009  			// arm64ReservedRegisterForTemporary = &Table[0]
  3010  			c.assembler.CompileMemoryToRegister(arm64.LDRD, tableInstanceAddressReg,
  3011  				tableInstanceTableOffset, arm64ReservedRegisterForTemporary)
  3012  			// destinationOffset = (destinationOffset<< pointerSizeLog2) + arm64ReservedRegisterForTemporary
  3013  			c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3014  				destinationOffset.register, pointerSizeLog2,
  3015  				arm64ReservedRegisterForTemporary, destinationOffset.register)
  3016  
  3017  			// arm64ReservedRegisterForTemporary = &ElementInstance.References[0]
  3018  			c.assembler.CompileMemoryToRegister(arm64.LDRD, instanceAddr, 0, arm64ReservedRegisterForTemporary)
  3019  			// sourceOffset = (sourceOffset<< pointerSizeLog2) + arm64ReservedRegisterForTemporary
  3020  			c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
  3021  				sourceOffset.register, pointerSizeLog2,
  3022  				arm64ReservedRegisterForTemporary, sourceOffset.register)
  3023  
  3024  			// copySize = copySize << pointerSizeLog2
  3025  			c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
  3026  		} else {
  3027  			ldr, str = arm64.LDRB, arm64.STRB
  3028  			movSize = 1
  3029  
  3030  			// destinationOffset += memory buffer's absolute address.
  3031  			c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
  3032  
  3033  			// sourceOffset += data buffer's absolute address.
  3034  			c.assembler.CompileMemoryToRegister(arm64.LDRD, instanceAddr, 0, arm64ReservedRegisterForTemporary)
  3035  			c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, sourceOffset.register)
  3036  
  3037  		}
  3038  
  3039  		// Negate the counter.
  3040  		c.assembler.CompileRegisterToRegister(arm64.NEG, copySize.register, copySize.register)
  3041  
  3042  		beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)
  3043  
  3044  		// arm64ReservedRegisterForTemporary = [sourceOffset + (size.register)]
  3045  		c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
  3046  			sourceOffset.register, copySize.register,
  3047  			arm64ReservedRegisterForTemporary)
  3048  		// [destinationOffset + (size.register)] = arm64ReservedRegisterForTemporary.
  3049  		c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
  3050  			arm64ReservedRegisterForTemporary,
  3051  			destinationOffset.register, copySize.register,
  3052  		)
  3053  
  3054  		// Decrement the size counter and if the value is still negative, continue the loop.
  3055  		c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register)
  3056  		c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop)
  3057  
  3058  		c.assembler.SetJumpTargetOnNext(skipCopyJump)
  3059  	}
  3060  
  3061  	c.markRegisterUnused(copySize.register, sourceOffset.register,
  3062  		destinationOffset.register, instanceAddr, tableInstanceAddressReg)
  3063  	return nil
  3064  }
  3065  
  3066  // compileDataDrop implements compiler.compileDataDrop for the arm64 architecture.
  3067  func (c *arm64Compiler) compileDataDrop(o *wazeroir.OperationDataDrop) error {
  3068  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3069  		return err
  3070  	}
  3071  
  3072  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3073  	if err != nil {
  3074  		return err
  3075  	}
  3076  
  3077  	c.compileLoadDataInstanceAddress(o.DataIndex, tmp)
  3078  
  3079  	// Clears the content of DataInstance[o.DataIndex] (== []byte type).
  3080  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 0)
  3081  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 8)
  3082  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 16)
  3083  	return nil
  3084  }
  3085  
// compileLoadDataInstanceAddress emits code that loads
// &moduleInstance.DataInstances[dataIndex] into dst.
//
// Note: this clobbers arm64ReservedRegisterForTemporary.
func (c *arm64Compiler) compileLoadDataInstanceAddress(dataIndex uint32, dst asm.Register) {
	// dst = dataIndex * dataInstanceStructSize
	c.assembler.CompileConstToRegister(arm64.MOVD, int64(dataIndex)*dataInstanceStructSize, dst)

	// arm64ReservedRegisterForTemporary = &moduleInstance.DataInstances[0]
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
		arm64ReservedRegisterForTemporary,
	)

	// dst = arm64ReservedRegisterForTemporary + dst
	//     = &moduleInstance.DataInstances[0] + dataIndex*dataInstanceStructSize
	//     = &moduleInstance.DataInstances[dataIndex]
	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, dst)
}
  3101  
// compileMemoryCopy implements compiler.compileMemoryCopy for the arm64 architecture.
// The table indexes of compileCopyImpl are irrelevant for memory.copy, so zeros are passed.
func (c *arm64Compiler) compileMemoryCopy() error {
	return c.compileCopyImpl(false, 0, 0)
}
  3106  
// compileCopyImpl implements compileTableCopy and compileMemoryCopy.
//
// When isTable is true, this compiles table.copy between tables[srcTableIndex]
// and tables[dstTableIndex]; otherwise it compiles memory.copy and both table
// indexes are ignored. It pops copySize, sourceOffset and destinationOffset
// (in that order) from the virtual stack.
//
// TODO: the compiled code in this function should be reused and compile at once as
// the code is independent of any module.
func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableIndex uint32) error {
	// Out-of-bounds table accesses and memory accesses trap with different status codes.
	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
	if isTable {
		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
	}

	copySize, err := c.popValueOnRegister()
	if err != nil {
		return err
	}
	c.markRegisterUsed(copySize.register)

	sourceOffset, err := c.popValueOnRegister()
	if err != nil {
		return err
	}
	// A constant-zero operand may be placed on the zero register, which cannot be
	// written to below, so materialize the zero into an allocatable register.
	if isZeroRegister(sourceOffset.register) {
		sourceOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, sourceOffset.register)
	}
	c.markRegisterUsed(sourceOffset.register)

	destinationOffset, err := c.popValueOnRegister()
	if err != nil {
		return err
	}
	// Same as above: destinationOffset must live in a mutable register.
	if isZeroRegister(destinationOffset.register) {
		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
	}
	c.markRegisterUsed(destinationOffset.register)

	// Turn both offsets into their exclusive upper bounds (offset+size) so that
	// each bounds check below is a single comparison against the length.
	if !isZeroRegister(copySize.register) {
		// sourceOffset += size.
		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, sourceOffset.register)
		// destinationOffset += size.
		c.assembler.CompileRegisterToRegister(arm64.ADD, copySize.register, destinationOffset.register)
	}

	if isTable {
		// arm64ReservedRegisterForTemporary = &tables[0]
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
			arm64ReservedRegisterForTemporary)
		// arm64ReservedRegisterForTemporary = [&tables[0] + srcTableIndex*8]
		//                                   = tables[srcTableIndex]
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
			arm64ReservedRegisterForTemporary)
		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[srcTableIndex])
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
			arm64ReservedRegisterForTemporary)
	} else {
		// arm64ReservedRegisterForTemporary = len(memoryInst.Buffer).
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
			arm64ReservedRegisterForTemporary)
	}

	// Check length >= sourceOffset (sourceOffset now holds sourceOffset+size).
	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register)
	sourceBoundsOK := c.assembler.CompileJump(arm64.BCONDLS)

	// If not, raise out of bounds memory access error.
	c.compileExitFromNativeCode(outOfBoundsErrorStatus)

	c.assembler.SetJumpTargetOnNext(sourceBoundsOK)

	// Otherwise, check length >= destinationOffset in the same way.
	if isTable {
		// arm64ReservedRegisterForTemporary = &tables[0]
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
			arm64ReservedRegisterForTemporary)
		// arm64ReservedRegisterForTemporary = [&tables[0] + dstTableIndex*8]
		//                                   = tables[dstTableIndex]
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
			arm64ReservedRegisterForTemporary)
		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[dstTableIndex])
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
			arm64ReservedRegisterForTemporary)
	}

	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
	destinationBoundsOK := c.assembler.CompileJump(arm64.BCONDLS)

	// If not, raise out of bounds memory access error.
	c.compileExitFromNativeCode(outOfBoundsErrorStatus)

	// Otherwise, ready to copy the value from source to destination.
	c.assembler.SetJumpTargetOnNext(destinationBoundsOK)

	// Tables copy 8-byte references; memories copy single bytes.
	var ldr, str asm.Instruction
	var movSize int64
	if isTable {
		ldr, str = arm64.LDRD, arm64.STRD
		movSize = 8
	} else {
		ldr, str = arm64.LDRB, arm64.STRB
		movSize = 1
	}

	// If the size equals zero, we can skip the entire instructions below.
	if !isZeroRegister(copySize.register) {
		c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register)
		skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)

		// If source offset < destination offset, the regions might overlap such
		// that a forward copy would clobber unread source data, so copy backwards:
		// for (i = size-1; i >= 0; i--) dst[i] = src[i];
		c.assembler.CompileTwoRegistersToNone(arm64.CMP, sourceOffset.register, destinationOffset.register)
		destLowerThanSourceJump := c.assembler.CompileJump(arm64.BCONDLS)
		var endJump asm.Node
		{
			// Both registers currently hold offset+size; restore the original offsets.
			// sourceOffset -= size.
			c.assembler.CompileRegisterToRegister(arm64.SUB, copySize.register, sourceOffset.register)
			// destinationOffset -= size.
			c.assembler.CompileRegisterToRegister(arm64.SUB, copySize.register, destinationOffset.register)

			if isTable {
				// arm64ReservedRegisterForTemporary = &Tables[dstTableIndex].Table[0]
				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
				c.assembler.CompileMemoryToRegister(arm64.LDRD,
					arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
					arm64ReservedRegisterForTemporary)
				c.assembler.CompileMemoryToRegister(arm64.LDRD,
					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
					arm64ReservedRegisterForTemporary)
				// destinationOffset = (destinationOffset << pointerSizeLog2) + &Tables[dstTableIndex].Table[0]
				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
					destinationOffset.register, pointerSizeLog2,
					arm64ReservedRegisterForTemporary, destinationOffset.register)

				// arm64ReservedRegisterForTemporary = &Tables[srcTableIndex].Table[0]
				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
				c.assembler.CompileMemoryToRegister(arm64.LDRD,
					arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
					arm64ReservedRegisterForTemporary)
				c.assembler.CompileMemoryToRegister(arm64.LDRD,
					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
					arm64ReservedRegisterForTemporary)
				// sourceOffset = (sourceOffset << pointerSizeLog2) + &Tables[srcTableIndex].Table[0]
				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
					sourceOffset.register, pointerSizeLog2,
					arm64ReservedRegisterForTemporary, sourceOffset.register)

				// copySize = copySize << pointerSizeLog2 as each element has 8 bytes and we copy one by one.
				c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
			} else {
				// sourceOffset += memory buffer's absolute address.
				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, sourceOffset.register)
				// destinationOffset += memory buffer's absolute address.
				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
			}

			beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)

			// size -= movSize, setting the condition flags. The counter reaches exactly
			// zero because for tables the size was scaled to a multiple of movSize above.
			c.assembler.CompileConstToRegister(arm64.SUBS, movSize, copySize.register)

			// arm64ReservedRegisterForTemporary = [sourceOffset + (size.register)]
			c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
				sourceOffset.register, copySize.register,
				arm64ReservedRegisterForTemporary)
			// [destinationOffset + (size.register)] = arm64ReservedRegisterForTemporary.
			c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
				arm64ReservedRegisterForTemporary,
				destinationOffset.register, copySize.register,
			)

			// If the value on the copySize.register is not equal zero, continue the loop.
			c.assembler.CompileJump(arm64.BCONDNE).AssignJumpTarget(beginCopyLoop)

			// Otherwise, exit the loop.
			endJump = c.assembler.CompileJump(arm64.B)
		}

		// Otherwise (destination offset <= source offset), a forward copy is safe:
		// for (i = 0; i < size; i++) dst[i] = src[i];
		c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
		{

			if isTable {
				// arm64ReservedRegisterForTemporary = &Tables[dstTableIndex].Table[0]
				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
				c.assembler.CompileMemoryToRegister(arm64.LDRD,
					arm64ReservedRegisterForTemporary, int64(dstTableIndex)*8,
					arm64ReservedRegisterForTemporary)
				c.assembler.CompileMemoryToRegister(arm64.LDRD,
					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
					arm64ReservedRegisterForTemporary)
				// destinationOffset = (destinationOffset << pointerSizeLog2) + &Tables[dstTableIndex].Table[0]
				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
					destinationOffset.register, pointerSizeLog2,
					arm64ReservedRegisterForTemporary, destinationOffset.register)

				// arm64ReservedRegisterForTemporary = &Tables[srcTableIndex].Table[0]
				c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
					callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
				c.assembler.CompileMemoryToRegister(arm64.LDRD,
					arm64ReservedRegisterForTemporary, int64(srcTableIndex)*8,
					arm64ReservedRegisterForTemporary)
				c.assembler.CompileMemoryToRegister(arm64.LDRD,
					arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
					arm64ReservedRegisterForTemporary)
				// sourceOffset = (sourceOffset << pointerSizeLog2) + &Tables[srcTableIndex].Table[0]
				c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
					sourceOffset.register, pointerSizeLog2,
					arm64ReservedRegisterForTemporary, sourceOffset.register)

				// copySize = copySize << pointerSizeLog2 as each element has 8 bytes and we copy one by one.
				c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, copySize.register)
			} else {
				// sourceOffset += memory buffer's absolute address.
				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, sourceOffset.register)
				// destinationOffset += memory buffer's absolute address.
				c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
			}

			// Negate the counter: the offset registers hold the exclusive upper bounds
			// here, so [bound + negativeCounter] walks the regions front to back.
			c.assembler.CompileRegisterToRegister(arm64.NEG, copySize.register, copySize.register)

			beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)

			// arm64ReservedRegisterForTemporary = [sourceOffset + (size.register)]
			c.assembler.CompileMemoryWithRegisterOffsetToRegister(ldr,
				sourceOffset.register, copySize.register,
				arm64ReservedRegisterForTemporary)
			// [destinationOffset + (size.register)] = arm64ReservedRegisterForTemporary.
			c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
				arm64ReservedRegisterForTemporary,
				destinationOffset.register, copySize.register,
			)

			// size += movSize; while the counter is still negative, continue the loop.
			c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register)
			c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop)
		}
		c.assembler.SetJumpTargetOnNext(skipCopyJump, endJump)
	}

	// The operand registers are no longer needed.
	c.markRegisterUnused(copySize.register, sourceOffset.register, destinationOffset.register)

	return nil
}
  3368  
// compileMemoryFill implements compiler.compileMemoryFill for the arm64 architecture.
// The table index of compileFillImpl is irrelevant for memory.fill, so zero is passed.
func (c *arm64Compiler) compileMemoryFill() error {
	return c.compileFillImpl(false, 0)
}
  3373  
// compileFillImpl implements the fill operations (table.fill and memory.fill).
//
// When isTable is true, this fills tables[tableIndex]; otherwise it fills linear
// memory and tableIndex is ignored. It pops fillSize, value and destinationOffset
// (in that order) from the virtual stack.
//
// TODO: the compiled code in this function should be reused and compile at once as
// the code is independent of any module.
func (c *arm64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error {
	fillSize, err := c.popValueOnRegister()
	if err != nil {
		return err
	}
	c.markRegisterUsed(fillSize.register)

	value, err := c.popValueOnRegister()
	if err != nil {
		return err
	}
	c.markRegisterUsed(value.register)

	destinationOffset, err := c.popValueOnRegister()
	if err != nil {
		return err
	}
	// A constant-zero offset may be placed on the zero register, which cannot be
	// written to below, so materialize the zero into an allocatable register.
	if isZeroRegister(destinationOffset.register) {
		destinationOffset.register, err = c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return err
		}
		c.assembler.CompileRegisterToRegister(arm64.MOVD, arm64.RegRZR, destinationOffset.register)
	}
	c.markRegisterUsed(destinationOffset.register)

	// destinationOffset += size, so it holds the exclusive upper bound of the fill.
	c.assembler.CompileRegisterToRegister(arm64.ADD, fillSize.register, destinationOffset.register)

	if isTable {
		// arm64ReservedRegisterForTemporary = &tables[0]
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
			arm64ReservedRegisterForTemporary)
		// arm64ReservedRegisterForTemporary = [&tables[0] + tableIndex*8]
		//                                   = tables[tableIndex]
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
			arm64ReservedRegisterForTemporary)
		// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary+tableInstanceTableLenOffset] = len(tables[tableIndex])
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
			arm64ReservedRegisterForTemporary)
	} else {
		// arm64ReservedRegisterForTemporary = len(memoryInst.Buffer).
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
			arm64ReservedRegisterForTemporary)
	}

	// Check length >= destinationOffset (destinationOffset now holds destinationOffset+size).
	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register)
	destinationBoundsOK := c.assembler.CompileJump(arm64.BCONDLS)

	// If not, raise the runtime error.
	if isTable {
		c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)
	} else {
		c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
	}

	// Otherwise, ready to fill the region with the value.
	c.assembler.SetJumpTargetOnNext(destinationBoundsOK)

	// If the size equals zero, we can skip the entire instructions below.
	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, fillSize.register)
	skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ)

	// destinationOffset -= size, restoring the original start offset.
	c.assembler.CompileRegisterToRegister(arm64.SUB, fillSize.register, destinationOffset.register)

	// Tables store 8-byte references; memories store single bytes.
	var str asm.Instruction
	var movSize int64
	if isTable {
		str = arm64.STRD
		movSize = 8

		// arm64ReservedRegisterForTemporary = &Tables[tableIndex].Table[0]
		c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine,
			callEngineModuleContextTablesElement0AddressOffset, arm64ReservedRegisterForTemporary)
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForTemporary, int64(tableIndex)*8,
			arm64ReservedRegisterForTemporary)
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
			arm64ReservedRegisterForTemporary)
		// destinationOffset = (destinationOffset << pointerSizeLog2) + &Tables[tableIndex].Table[0]
		c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
			destinationOffset.register, pointerSizeLog2,
			arm64ReservedRegisterForTemporary, destinationOffset.register)

		// fillSize = fillSize << pointerSizeLog2 as each element has 8 bytes and we fill one by one.
		c.assembler.CompileConstToRegister(arm64.LSL, pointerSizeLog2, fillSize.register)
	} else {
		str = arm64.STRB
		movSize = 1

		// destinationOffset += memory buffer's absolute address.
		c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, destinationOffset.register)
	}

	// Naively implement the fill with a "for loop" storing one element at a time.
	beginCopyLoop := c.assembler.CompileStandAlone(arm64.NOP)

	// size -= movSize, setting the condition flags.
	c.assembler.CompileConstToRegister(arm64.SUBS, movSize, fillSize.register)

	// [destinationOffset + (size.register)] = value.register.
	c.assembler.CompileRegisterToMemoryWithRegisterOffset(str,
		value.register,
		destinationOffset.register, fillSize.register,
	)

	// If the value on fillSize.register has not reached zero, continue the loop.
	continueJump := c.assembler.CompileJump(arm64.BCONDNE)
	continueJump.AssignJumpTarget(beginCopyLoop)

	// The operand registers are no longer needed.
	c.markRegisterUnused(fillSize.register, value.register, destinationOffset.register)

	c.assembler.SetJumpTargetOnNext(skipCopyJump)
	return nil
}
  3502  
// compileTableInit implements compiler.compileTableInit for the arm64 architecture.
// It delegates to compileInitImpl with isTable=true, initializing
// tables[o.TableIndex] from element instance o.ElemIndex.
func (c *arm64Compiler) compileTableInit(o *wazeroir.OperationTableInit) error {
	return c.compileInitImpl(true, o.ElemIndex, o.TableIndex)
}
  3507  
// compileTableCopy implements compiler.compileTableCopy for the arm64 architecture.
// It delegates to compileCopyImpl with isTable=true, copying between
// tables[o.SrcTableIndex] and tables[o.DstTableIndex].
func (c *arm64Compiler) compileTableCopy(o *wazeroir.OperationTableCopy) error {
	return c.compileCopyImpl(true, o.SrcTableIndex, o.DstTableIndex)
}
  3512  
  3513  // compileElemDrop implements compiler.compileElemDrop for the arm64 architecture.
  3514  func (c *arm64Compiler) compileElemDrop(o *wazeroir.OperationElemDrop) error {
  3515  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3516  		return err
  3517  	}
  3518  
  3519  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3520  	if err != nil {
  3521  		return err
  3522  	}
  3523  
  3524  	c.compileLoadElemInstanceAddress(o.ElemIndex, tmp)
  3525  
  3526  	// Clears the content of ElementInstances[o.ElemIndex] (== []interface{} type).
  3527  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 0)
  3528  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 8)
  3529  	c.assembler.CompileRegisterToMemory(arm64.STRD, arm64.RegRZR, tmp, 16)
  3530  	return nil
  3531  }
  3532  
// compileLoadElemInstanceAddress emits code that loads
// &moduleInstance.ElementInstances[elemIndex] into dst.
//
// Note: this clobbers arm64ReservedRegisterForTemporary.
func (c *arm64Compiler) compileLoadElemInstanceAddress(elemIndex uint32, dst asm.Register) {
	// dst = elemIndex * elementInstanceStructSize
	c.assembler.CompileConstToRegister(arm64.MOVD, int64(elemIndex)*elementInstanceStructSize, dst)

	// arm64ReservedRegisterForTemporary = &moduleInstance.ElementInstances[0]
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
		arm64ReservedRegisterForTemporary,
	)

	// dst = arm64ReservedRegisterForTemporary + dst
	//     = &moduleInstance.ElementInstances[0] + elemIndex*elementInstanceStructSize
	//     = &moduleInstance.ElementInstances[elemIndex]
	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, dst)
}
  3548  
  3549  // compileRefFunc implements compiler.compileRefFunc for the arm64 architecture.
  3550  func (c *arm64Compiler) compileRefFunc(o *wazeroir.OperationRefFunc) error {
  3551  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3552  		return err
  3553  	}
  3554  
  3555  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  3556  	if err != nil {
  3557  		return err
  3558  	}
  3559  	// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForCallEngine + callEngineModuleContextFunctionsElement0AddressOffset]
  3560  	//                                   = &moduleEngine.functions[0]
  3561  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3562  		arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  3563  		arm64ReservedRegisterForTemporary)
  3564  
  3565  	// ref = [arm64ReservedRegisterForTemporary +  int64(o.FunctionIndex)*8]
  3566  	//     = [&moduleEngine.functions[0] + sizeOf(*function) * index]
  3567  	//     = moduleEngine.functions[index]
  3568  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3569  		arm64ReservedRegisterForTemporary, int64(o.FunctionIndex)*8, // * 8 because the size of *code equals 8 bytes.
  3570  		ref,
  3571  	)
  3572  
  3573  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64)
  3574  	return nil
  3575  }
  3576  
// compileTableGet implements compiler.compileTableGet for the arm64 architecture.
//
// It pops the element offset from the stack, bounds-checks it against the length of
// tables[o.TableIndex] (exiting with nativeCallStatusCodeInvalidTableAccess on
// failure), then pushes the referenced element as an opaque 64-bit value.
func (c *arm64Compiler) compileTableGet(o *wazeroir.OperationTableGet) error {
	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}
	// Reserve ref so that popValueOnRegister below cannot hand us the same register.
	c.markRegisterUsed(ref)

	offset, err := c.popValueOnRegister()
	if err != nil {
		return err
	}

	// arm64ReservedRegisterForTemporary = &tables[0]
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
		arm64ReservedRegisterForTemporary)
	// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + TableIndex*8]
	//                                   = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
	//                                   = [&tables[TableIndex]] = tables[TableIndex].
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForTemporary, int64(o.TableIndex)*8,
		arm64ReservedRegisterForTemporary)

	// Out of bounds check.
	// ref = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
		ref,
	)
	// "cmp ref, offset"
	c.assembler.CompileTwoRegistersToNone(arm64.CMP, ref, offset.register)

	// If it exceeds len(table), we exit the execution.
	brIfBoundsOK := c.assembler.CompileJump(arm64.BCONDLO)
	c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)
	c.assembler.SetJumpTargetOnNext(brIfBoundsOK)

	// ref = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
		ref,
	)

	// ref = (offset << pointerSizeLog2) + ref
	//     = &tables[TableIndex].References[0] + sizeOf(uintptr) * offset
	//     = &tables[TableIndex].References[offset]
	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
		offset.register, pointerSizeLog2, ref, ref)

	// ref = [&tables[TableIndex].References[offset]] = load the Reference's pointer as uint64.
	c.assembler.CompileMemoryToRegister(arm64.LDRD, ref, 0, ref)

	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64) // table elements are opaque 64-bit at runtime.
	return nil
}
  3633  
// compileTableSet implements compiler.compileTableSet for the arm64 architecture.
//
// It pops the reference value and the element offset from the stack, bounds-checks
// the offset against the length of tables[o.TableIndex] (exiting with
// nativeCallStatusCodeInvalidTableAccess on failure), then stores the reference's
// raw pointer into the table slot at that offset.
func (c *arm64Compiler) compileTableSet(o *wazeroir.OperationTableSet) error {
	ref := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(ref); err != nil {
		return err
	}

	offset := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(offset); err != nil {
		return err
	}

	// tmp holds the table length for the bounds check, then the destination address.
	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}

	// arm64ReservedRegisterForTemporary = &tables[0]
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
		arm64ReservedRegisterForTemporary)
	// arm64ReservedRegisterForTemporary = arm64ReservedRegisterForTemporary + TableIndex*8
	//                                   = &tables[0] + TableIndex*sizeOf(*tableInstance)
	//                                   = &tables[TableIndex]
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForTemporary, int64(o.TableIndex)*8,
		arm64ReservedRegisterForTemporary)

	// Out of bounds check.
	// tmp = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
		tmp,
	)
	// "cmp tmp, offset"
	c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp, offset.register)

	// If it exceeds len(table), we exit the execution.
	brIfBoundsOK := c.assembler.CompileJump(arm64.BCONDLO)
	c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)
	c.assembler.SetJumpTargetOnNext(brIfBoundsOK)

	// tmp = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForTemporary, tableInstanceTableOffset,
		tmp,
	)

	// tmp = (offset << pointerSizeLog2) + tmp
	//     = &tables[TableIndex].References[0] + sizeOf(uintptr) * offset
	//     = &tables[TableIndex].References[offset]
	c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD, offset.register, pointerSizeLog2, tmp, tmp)

	// Set the reference's raw pointer.
	c.assembler.CompileRegisterToMemory(arm64.STRD, ref.register, tmp, 0)

	c.markRegisterUnused(offset.register, ref.register, tmp)
	return nil
}
  3693  
  3694  // compileTableGrow implements compiler.compileTableGrow for the arm64 architecture.
  3695  func (c *arm64Compiler) compileTableGrow(o *wazeroir.OperationTableGrow) error {
  3696  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3697  		return err
  3698  	}
  3699  
  3700  	// Pushes the table index.
  3701  	if err := c.compileConstI32(&wazeroir.OperationConstI32{Value: o.TableIndex}); err != nil {
  3702  		return err
  3703  	}
  3704  
  3705  	// Table grow cannot be done in assembly just like memory grow as it involves with allocation in Go.
  3706  	// Therefore, call out to the built function for this purpose.
  3707  	if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexTableGrow); err != nil {
  3708  		return err
  3709  	}
  3710  
  3711  	// TableGrow consumes three values (table index, number of items, initial value).
  3712  	for i := 0; i < 3; i++ {
  3713  		c.locationStack.pop()
  3714  	}
  3715  
  3716  	// Then, the previous length was pushed as the result.
  3717  	c.locationStack.pushRuntimeValueLocationOnStack()
  3718  
  3719  	// After return, we re-initialize reserved registers just like preamble of functions.
  3720  	c.compileReservedStackBasePointerRegisterInitialization()
  3721  	c.compileReservedMemoryRegisterInitialization()
  3722  	return nil
  3723  }
  3724  
  3725  // compileTableSize implements compiler.compileTableSize for the arm64 architecture.
  3726  func (c *arm64Compiler) compileTableSize(o *wazeroir.OperationTableSize) error {
  3727  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3728  		return err
  3729  	}
  3730  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  3731  	if err != nil {
  3732  		return err
  3733  	}
  3734  	c.markRegisterUsed(result)
  3735  
  3736  	// arm64ReservedRegisterForTemporary = &tables[0]
  3737  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3738  		arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3739  		arm64ReservedRegisterForTemporary)
  3740  	// arm64ReservedRegisterForTemporary = [arm64ReservedRegisterForTemporary + TableIndex*8]
  3741  	//                                   = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  3742  	//                                   = [&tables[TableIndex]] = tables[TableIndex].
  3743  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3744  		arm64ReservedRegisterForTemporary, int64(o.TableIndex)*8,
  3745  		arm64ReservedRegisterForTemporary)
  3746  
  3747  	// result = [&tables[TableIndex] + tableInstanceTableLenOffset] = len(tables[TableIndex])
  3748  	c.assembler.CompileMemoryToRegister(arm64.LDRD,
  3749  		arm64ReservedRegisterForTemporary, tableInstanceTableLenOffset,
  3750  		result,
  3751  	)
  3752  
  3753  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  3754  	return nil
  3755  }
  3756  
  3757  // compileTableFill implements compiler.compileTableFill for the arm64 architecture.
  3758  func (c *arm64Compiler) compileTableFill(o *wazeroir.OperationTableFill) error {
  3759  	return c.compileFillImpl(true, o.TableIndex)
  3760  }
  3761  
  3762  // popTwoValuesOnRegisters pops two values from the location stacks, ensures
  3763  // these two values are located on registers, and mark them unused.
  3764  //
  3765  // TODO: we’d usually prefix this with compileXXX as this might end up emitting instructions,
  3766  // but the name seems awkward.
  3767  func (c *arm64Compiler) popTwoValuesOnRegisters() (x1, x2 *runtimeValueLocation, err error) {
  3768  	x2 = c.locationStack.pop()
  3769  	if err = c.compileEnsureOnRegister(x2); err != nil {
  3770  		return
  3771  	}
  3772  
  3773  	x1 = c.locationStack.pop()
  3774  	if err = c.compileEnsureOnRegister(x1); err != nil {
  3775  		return
  3776  	}
  3777  
  3778  	c.markRegisterUnused(x2.register)
  3779  	c.markRegisterUnused(x1.register)
  3780  	return
  3781  }
  3782  
  3783  // popValueOnRegister pops one value from the location stack, ensures
  3784  // that it is located on a register, and mark it unused.
  3785  //
  3786  // TODO: we’d usually prefix this with compileXXX as this might end up emitting instructions,
  3787  // but the name seems awkward.
  3788  func (c *arm64Compiler) popValueOnRegister() (v *runtimeValueLocation, err error) {
  3789  	v = c.locationStack.pop()
  3790  	if err = c.compileEnsureOnRegister(v); err != nil {
  3791  		return
  3792  	}
  3793  
  3794  	c.markRegisterUnused(v.register)
  3795  	return
  3796  }
  3797  
  3798  // compileEnsureOnRegister emits instructions to ensure that a value is located on a register.
  3799  func (c *arm64Compiler) compileEnsureOnRegister(loc *runtimeValueLocation) (err error) {
  3800  	if loc.onStack() {
  3801  		reg, err := c.allocateRegister(loc.getRegisterType())
  3802  		if err != nil {
  3803  			return err
  3804  		}
  3805  
  3806  		// Record that the value holds the register and the register is marked used.
  3807  		loc.setRegister(reg)
  3808  		c.markRegisterUsed(reg)
  3809  
  3810  		c.compileLoadValueOnStackToRegister(loc)
  3811  	} else if loc.onConditionalRegister() {
  3812  		err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
  3813  	}
  3814  	return
  3815  }
  3816  
  3817  // maybeCompileMoveTopConditionalToGeneralPurposeRegister moves the top value on the stack
  3818  // if the value is located on a conditional register.
  3819  //
  3820  // This is usually called at the beginning of methods on compiler interface where we possibly
  3821  // compile instructions without saving the conditional register value.
  3822  // compile* functions without calling this function is saving the conditional
  3823  // value to the stack or register by invoking ensureOnGeneralPurposeRegister for the top.
  3824  func (c *arm64Compiler) maybeCompileMoveTopConditionalToGeneralPurposeRegister() (err error) {
  3825  	if c.locationStack.sp > 0 {
  3826  		if loc := c.locationStack.peek(); loc.onConditionalRegister() {
  3827  			err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
  3828  		}
  3829  	}
  3830  	return
  3831  }
  3832  
  3833  // loadConditionalRegisterToGeneralPurposeRegister saves the conditional register value
  3834  // to a general purpose register.
  3835  func (c *arm64Compiler) compileLoadConditionalRegisterToGeneralPurposeRegister(loc *runtimeValueLocation) error {
  3836  	reg, err := c.allocateRegister(loc.getRegisterType())
  3837  	if err != nil {
  3838  		return err
  3839  	}
  3840  
  3841  	c.markRegisterUsed(reg)
  3842  	c.assembler.CompileConditionalRegisterSet(loc.conditionalRegister, reg)
  3843  
  3844  	// Record that now the value is located on a general purpose register.
  3845  	loc.setRegister(reg)
  3846  	return nil
  3847  }
  3848  
  3849  // compileLoadValueOnStackToRegister implements compiler.compileLoadValueOnStackToRegister for arm64.
  3850  func (c *arm64Compiler) compileLoadValueOnStackToRegister(loc *runtimeValueLocation) {
  3851  	switch loc.valueType {
  3852  	case runtimeValueTypeI32:
  3853  		c.assembler.CompileMemoryToRegister(arm64.LDRW, arm64ReservedRegisterForStackBasePointerAddress,
  3854  			int64(loc.stackPointer)*8, loc.register)
  3855  	case runtimeValueTypeI64:
  3856  		c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForStackBasePointerAddress,
  3857  			int64(loc.stackPointer)*8, loc.register)
  3858  	case runtimeValueTypeF32:
  3859  		c.assembler.CompileMemoryToRegister(arm64.FLDRS, arm64ReservedRegisterForStackBasePointerAddress,
  3860  			int64(loc.stackPointer)*8, loc.register)
  3861  	case runtimeValueTypeF64:
  3862  		c.assembler.CompileMemoryToRegister(arm64.FLDRD, arm64ReservedRegisterForStackBasePointerAddress,
  3863  			int64(loc.stackPointer)*8, loc.register)
  3864  	case runtimeValueTypeV128Lo:
  3865  		c.assembler.CompileMemoryToVectorRegister(arm64.VMOV,
  3866  			arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8, loc.register,
  3867  			arm64.VectorArrangementQ)
  3868  		// Higher 64-bits are loaded as well ^^.
  3869  		hi := c.locationStack.stack[loc.stackPointer+1]
  3870  		hi.setRegister(loc.register)
  3871  	case runtimeValueTypeV128Hi:
  3872  		panic("BUG: V128Hi must be be loaded to a register along with V128Lo")
  3873  	}
  3874  }
  3875  
  3876  // allocateRegister implements compiler.allocateRegister for arm64.
  3877  func (c *arm64Compiler) allocateRegister(t registerType) (reg asm.Register, err error) {
  3878  	var ok bool
  3879  	// Try to get the unused register.
  3880  	reg, ok = c.locationStack.takeFreeRegister(t)
  3881  	if ok {
  3882  		return
  3883  	}
  3884  
  3885  	// If not found, we have to steal the register.
  3886  	stealTarget, ok := c.locationStack.takeStealTargetFromUsedRegister(t)
  3887  	if !ok {
  3888  		err = fmt.Errorf("cannot steal register")
  3889  		return
  3890  	}
  3891  
  3892  	// Release the steal target register value onto stack location.
  3893  	reg = stealTarget.register
  3894  	c.compileReleaseRegisterToStack(stealTarget)
  3895  	return
  3896  }
  3897  
  3898  // compileReleaseAllRegistersToStack adds instructions to store all the values located on
  3899  // either general purpose or conditional registers onto the memory stack.
  3900  // See releaseRegisterToStack.
  3901  func (c *arm64Compiler) compileReleaseAllRegistersToStack() (err error) {
  3902  	for i := uint64(0); i < c.locationStack.sp; i++ {
  3903  		if loc := c.locationStack.stack[i]; loc.onRegister() {
  3904  			c.compileReleaseRegisterToStack(loc)
  3905  		} else if loc.onConditionalRegister() {
  3906  			if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
  3907  				return
  3908  			}
  3909  			c.compileReleaseRegisterToStack(loc)
  3910  		}
  3911  	}
  3912  	return
  3913  }
  3914  
// compileReleaseRegisterToStack adds an instruction to write the value on a register back to memory stack region.
//
// The store instruction is chosen based on the value's runtime type; for v128
// values the single 128-bit store covers both halves, so the adjacent V128Hi
// entry is released together with V128Lo. The register is marked free afterwards.
func (c *arm64Compiler) compileReleaseRegisterToStack(loc *runtimeValueLocation) {
	switch loc.valueType {
	case runtimeValueTypeI32:
		c.assembler.CompileRegisterToMemory(arm64.STRW, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
	case runtimeValueTypeI64:
		c.assembler.CompileRegisterToMemory(arm64.STRD, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
	case runtimeValueTypeF32:
		c.assembler.CompileRegisterToMemory(arm64.FSTRS, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
	case runtimeValueTypeF64:
		c.assembler.CompileRegisterToMemory(arm64.FSTRD, loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
	case runtimeValueTypeV128Lo:
		c.assembler.CompileVectorRegisterToMemory(arm64.VMOV,
			loc.register, arm64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8,
			arm64.VectorArrangementQ)
		// Higher 64-bits are released as well ^^: the V128Hi location shared
		// this vector register, so free it too.
		hi := c.locationStack.stack[loc.stackPointer+1]
		c.locationStack.releaseRegister(hi)
	case runtimeValueTypeV128Hi:
		panic("BUG: V128Hi must be released to the stack along with V128Lo")
	}

	// Mark the register is free.
	c.locationStack.releaseRegister(loc)
}
  3940  
// compileReservedStackBasePointerRegisterInitialization adds instructions to initialize arm64ReservedRegisterForStackBasePointerAddress
// so that it points to the absolute address of the stack base for this function.
func (c *arm64Compiler) compileReservedStackBasePointerRegisterInitialization() {
	// First, load the address of the first element in the value stack into arm64ReservedRegisterForStackBasePointerAddress temporarily.
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineStackContextStackElement0AddressOffset,
		arm64ReservedRegisterForStackBasePointerAddress)

	// Next we move the stack base pointer offset in bytes (ce.stackBasePointerInBytes) to arm64ReservedRegisterForTemporary.
	c.assembler.CompileMemoryToRegister(arm64.LDRD,
		arm64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
		arm64ReservedRegisterForTemporary)

	// Finally, arm64ReservedRegisterForStackBasePointerAddress += arm64ReservedRegisterForTemporary,
	// i.e. &stack[0] + stackBasePointerInBytes == the absolute address of this function's stack base.
	c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForTemporary, arm64ReservedRegisterForStackBasePointerAddress)
}
  3957  
  3958  func (c *arm64Compiler) compileReservedMemoryRegisterInitialization() {
  3959  	if c.ir.HasMemory || c.ir.UsesMemory {
  3960  		// "arm64ReservedRegisterForMemory = ce.MemoryElement0Address"
  3961  		c.assembler.CompileMemoryToRegister(
  3962  			arm64.LDRD,
  3963  			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
  3964  			arm64ReservedRegisterForMemory,
  3965  		)
  3966  	}
  3967  }
  3968  
// compileModuleContextInitialization adds instructions to initialize ce.moduleContext's fields based on
// ce.moduleContext.ModuleInstanceAddress.
// This is called in two cases: in function preamble, and on the return from (non-Go) function calls.
//
// If the current module instance address already matches the cached one, all of
// the updates below are skipped via a single compare-and-branch.
func (c *arm64Compiler) compileModuleContextInitialization() error {
	regs, found := c.locationStack.takeFreeRegisters(registerTypeGeneralPurpose, 2)
	if !found {
		panic("BUG: all the registers should be free at this point")
	}
	c.markRegisterUsed(regs...)

	// Alias these free registers for readability.
	tmpX, tmpY := regs[0], regs[1]

	// "tmpX = ce.ModuleInstanceAddress"
	c.assembler.CompileMemoryToRegister(arm64.LDRD, arm64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceAddressOffset, tmpX)

	// If the module instance address stays the same, we could skip the entire code below.
	c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64CallingConventionModuleInstanceAddressRegister, tmpX)
	brIfModuleUnchanged := c.assembler.CompileJump(arm64.BCONDEQ)

	// Otherwise, update the moduleEngine.moduleContext.ModuleInstanceAddress.
	c.assembler.CompileRegisterToMemory(arm64.STRD,
		arm64CallingConventionModuleInstanceAddressRegister,
		arm64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceAddressOffset,
	)

	// Also, we have to update the following fields:
	// * callEngine.moduleContext.globalElement0Address
	// * callEngine.moduleContext.memoryElement0Address
	// * callEngine.moduleContext.memorySliceLen
	// * callEngine.moduleContext.memoryInstance
	// * callEngine.moduleContext.tableElement0Address
	// * callEngine.moduleContext.tableSliceLen
	// * callEngine.moduleContext.functionsElement0Address
	// * callEngine.moduleContext.typeIDsElement0Address
	// * callEngine.moduleContext.dataInstancesElement0Address
	// * callEngine.moduleContext.elementInstancesElement0Address

	// Update globalElement0Address.
	//
	// Note: if there's global.get or set instruction in the function, the existence of the globals
	// is ensured by function validation at module instantiation phase, and that's why it is ok to
	// skip the initialization if the module's globals slice is empty.
	if len(c.ir.Globals) > 0 {
		// "tmpX = &moduleInstance.Globals[0]"
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceGlobalsOffset,
			tmpX,
		)

		// "ce.GlobalElement0Address = tmpX (== &moduleInstance.Globals[0])"
		c.assembler.CompileRegisterToMemory(
			arm64.STRD, tmpX,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset,
		)
	}

	// Update memoryElement0Address and memorySliceLen.
	//
	// Note: if there's memory instruction in the function, memory instance must be non-nil.
	// That is ensured by function validation at module instantiation phase, and that's
	// why it is ok to skip the initialization if the module's memory instance is nil.
	if c.ir.HasMemory {
		// "tmpX = moduleInstance.Memory"
		c.assembler.CompileMemoryToRegister(
			arm64.LDRD,
			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceMemoryOffset,
			tmpX,
		)

		// First, set ce.memoryInstance
		c.assembler.CompileRegisterToMemory(
			arm64.STRD,
			tmpX,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset,
		)

		// Next, we write the memory length into ce.MemorySliceLen.
		//
		// "tmpY = [tmpX + memoryInstanceBufferLenOffset] (== len(memory.Buffer))"
		c.assembler.CompileMemoryToRegister(
			arm64.LDRD,
			tmpX, memoryInstanceBufferLenOffset,
			tmpY,
		)
		// "ce.MemorySliceLen = tmpY".
		c.assembler.CompileRegisterToMemory(
			arm64.STRD,
			tmpY,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
		)

		// Finally, we write ce.memoryElement0Address.
		//
		// "tmpY = *tmpX (== &memory.Buffer[0])"
		c.assembler.CompileMemoryToRegister(
			arm64.LDRD,
			tmpX, memoryInstanceBufferOffset,
			tmpY,
		)
		// "ce.memoryElement0Address = tmpY".
		c.assembler.CompileRegisterToMemory(
			arm64.STRD,
			tmpY,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
		)
	}

	// Update tableElement0Address, tableSliceLen and typeIDsElement0Address.
	//
	// Note: if there's table instruction in the function, the existence of the table
	// is ensured by function validation at module instantiation phase, and that's
	// why it is ok to skip the initialization if the module's table doesn't exist.
	if c.ir.HasTable {
		// "tmpX = &tables[0] (type of **wasm.Table)"
		c.assembler.CompileMemoryToRegister(
			arm64.LDRD,
			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceTablesOffset,
			tmpX,
		)

		// Update ce.tableElement0Address.
		// "ce.tableElement0Address = tmpX".
		c.assembler.CompileRegisterToMemory(
			arm64.STRD,
			tmpX,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
		)

		// Finally, we put &ModuleInstance.TypeIDs[0] into moduleContext.typeIDsElement0Address.
		c.assembler.CompileMemoryToRegister(arm64.LDRD,
			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceTypeIDsOffset, tmpX)
		c.assembler.CompileRegisterToMemory(arm64.STRD,
			tmpX, arm64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset)
	}

	// Update callEngine.moduleContext.functionsElement0Address
	{
		// "tmpX = [moduleInstanceAddressRegister + moduleInstanceEngineOffset + interfaceDataOffset] (== *moduleEngine)"
		//
		// Go's interface is laid out on memory as two quad words as struct {tab, data uintptr}
		// where tab points to the interface table, and the latter points to the actual
		// implementation of interface. This case, we extract "data" pointer as *moduleEngine.
		// See the following references for detail:
		// * https://research.swtch.com/interfaces
		// * https://github.com/golang/go/blob/release-branch.go1.17/src/runtime/runtime2.go#L207-L210
		c.assembler.CompileMemoryToRegister(
			arm64.LDRD,
			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceEngineOffset+interfaceDataOffset,
			tmpX,
		)

		// "tmpY = [tmpX + moduleEngineFunctionsOffset] (== &moduleEngine.functions[0])"
		c.assembler.CompileMemoryToRegister(
			arm64.LDRD,
			tmpX, moduleEngineFunctionsOffset,
			tmpY,
		)

		// "callEngine.moduleContext.functionsElement0Address = tmpY".
		c.assembler.CompileRegisterToMemory(
			arm64.STRD,
			tmpY,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
		)
	}

	// Update dataInstancesElement0Address.
	if c.ir.HasDataInstances {
		// "tmpX = &moduleInstance.DataInstances[0]"
		c.assembler.CompileMemoryToRegister(
			arm64.LDRD,
			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceDataInstancesOffset,
			tmpX,
		)
		// "callEngine.moduleContext.dataInstancesElement0Address = tmpX".
		c.assembler.CompileRegisterToMemory(
			arm64.STRD,
			tmpX,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
		)
	}

	// Update callEngine.moduleContext.elementInstancesElement0Address
	if c.ir.HasElementInstances {
		// "tmpX = &moduleInstance.ElementInstances[0]"
		c.assembler.CompileMemoryToRegister(
			arm64.LDRD,
			arm64CallingConventionModuleInstanceAddressRegister, moduleInstanceElementInstancesOffset,
			tmpX,
		)
		// "callEngine.moduleContext.elementInstancesElement0Address = tmpX".
		c.assembler.CompileRegisterToMemory(
			arm64.STRD,
			tmpX,
			arm64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
		)
	}

	c.assembler.SetJumpTargetOnNext(brIfModuleUnchanged)
	c.markRegisterUnused(regs...)
	return nil
}